diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,244842 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15410322498768686, + "eval_steps": 500, + "global_step": 153000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.036053104172773e-06, + "grad_norm": 184.37385834066, + "learning_rate": 5.035956731059767e-09, + "loss": 2.9192, + "mean_token_accuracy": 0.3965517282485962, + "step": 5 + }, + { + "epoch": 1.0072106208345546e-05, + "grad_norm": 115.51465348321524, + "learning_rate": 1.0071913462119533e-08, + "loss": 3.1229, + "mean_token_accuracy": 0.3206896513700485, + "step": 10 + }, + { + "epoch": 1.5108159312518318e-05, + "grad_norm": 128.14684544029438, + "learning_rate": 1.51078701931793e-08, + "loss": 2.8221, + "mean_token_accuracy": 0.42068964838981626, + "step": 15 + }, + { + "epoch": 2.014421241669109e-05, + "grad_norm": 93.98992739112428, + "learning_rate": 2.0143826924239066e-08, + "loss": 2.9034, + "mean_token_accuracy": 0.42413793206214906, + "step": 20 + }, + { + "epoch": 2.5180265520863864e-05, + "grad_norm": 120.66703121286582, + "learning_rate": 2.5179783655298835e-08, + "loss": 2.7341, + "mean_token_accuracy": 0.420689657330513, + "step": 25 + }, + { + "epoch": 3.0216318625036637e-05, + "grad_norm": 108.72624899473382, + "learning_rate": 3.02157403863586e-08, + "loss": 2.9094, + "mean_token_accuracy": 0.3896551728248596, + "step": 30 + }, + { + "epoch": 3.525237172920941e-05, + "grad_norm": 135.17281815093503, + "learning_rate": 3.5251697117418364e-08, + "loss": 3.3604, + "mean_token_accuracy": 0.3733817219734192, + "step": 35 + }, + { + "epoch": 4.028842483338218e-05, + "grad_norm": 121.9632914213751, + "learning_rate": 4.028765384847813e-08, + "loss": 3.2009, + "mean_token_accuracy": 0.3379310369491577, + "step": 40 + }, + { + "epoch": 4.532447793755496e-05, + "grad_norm": 97.98992561195891, + "learning_rate": 4.53236105795379e-08, + "loss": 2.7912, + "mean_token_accuracy": 0.4034482717514038, + "step": 45 + }, + { + "epoch": 5.036053104172773e-05, + "grad_norm": 142.3603859690338, + "learning_rate": 5.035956731059767e-08, + "loss": 2.9498, + "mean_token_accuracy": 0.3827586233615875, + "step": 50 + }, + { + "epoch": 5.5396584145900504e-05, + "grad_norm": 135.23102796458127, + "learning_rate": 5.5395524041657433e-08, + "loss": 2.9546, + "mean_token_accuracy": 0.38620689511299133, + "step": 55 + }, + { + "epoch": 6.043263725007327e-05, + "grad_norm": 114.50257152228085, + "learning_rate": 6.04314807727172e-08, + "loss": 2.4589, + "mean_token_accuracy": 0.46551724076271056, + "step": 60 + }, + { + "epoch": 6.546869035424605e-05, + "grad_norm": 106.87696604255558, + "learning_rate": 6.546743750377697e-08, + "loss": 2.3529, + "mean_token_accuracy": 0.482758617401123, + "step": 65 + }, + { + "epoch": 7.050474345841883e-05, + "grad_norm": 144.7875008882521, + "learning_rate": 7.050339423483673e-08, + "loss": 2.7852, + "mean_token_accuracy": 0.41724138259887694, + "step": 70 + }, + { + "epoch": 7.554079656259159e-05, + "grad_norm": 150.28386750658944, + "learning_rate": 7.553935096589651e-08, + "loss": 2.9964, + "mean_token_accuracy": 0.3990147829055786, + "step": 75 + }, + { + "epoch": 8.057684966676436e-05, + "grad_norm": 128.590739084924, + "learning_rate": 8.057530769695627e-08, + "loss": 2.6091, + "mean_token_accuracy": 0.441379314661026, + "step": 80 + }, + { + "epoch": 8.561290277093714e-05, + "grad_norm": 136.77950873458363, + "learning_rate": 8.561126442801605e-08, + "loss": 3.0939, + "mean_token_accuracy": 0.3793103456497192, + "step": 85 + }, + { + "epoch": 9.064895587510992e-05, + "grad_norm": 154.18804147239845, + "learning_rate": 9.06472211590758e-08, + "loss": 3.4264, + "mean_token_accuracy": 0.346158492565155, + "step": 90 + }, + { + "epoch": 9.568500897928268e-05, + "grad_norm": 116.13046764073876, + "learning_rate": 9.568317789013557e-08, + "loss": 2.7618, + "mean_token_accuracy": 0.36896551251411436, + "step": 95 + }, + { + "epoch": 0.00010072106208345546, + "grad_norm": 110.77374107676607, + "learning_rate": 1.0071913462119534e-07, + "loss": 3.0722, + "mean_token_accuracy": 0.36551723480224607, + "step": 100 + }, + { + "epoch": 0.00010575711518762823, + "grad_norm": 113.01719127713102, + "learning_rate": 1.057550913522551e-07, + "loss": 2.698, + "mean_token_accuracy": 0.3724137842655182, + "step": 105 + }, + { + "epoch": 0.00011079316829180101, + "grad_norm": 142.57442968544424, + "learning_rate": 1.1079104808331487e-07, + "loss": 2.4708, + "mean_token_accuracy": 0.47586206197738645, + "step": 110 + }, + { + "epoch": 0.00011582922139597377, + "grad_norm": 115.01977070578374, + "learning_rate": 1.1582700481437464e-07, + "loss": 2.3308, + "mean_token_accuracy": 0.44137930274009707, + "step": 115 + }, + { + "epoch": 0.00012086527450014655, + "grad_norm": 123.76974341799016, + "learning_rate": 1.208629615454344e-07, + "loss": 2.5578, + "mean_token_accuracy": 0.4724137902259827, + "step": 120 + }, + { + "epoch": 0.0001259013276043193, + "grad_norm": 152.76032381601127, + "learning_rate": 1.258989182764942e-07, + "loss": 2.7359, + "mean_token_accuracy": 0.4068965494632721, + "step": 125 + }, + { + "epoch": 0.0001309373807084921, + "grad_norm": 118.64261820647128, + "learning_rate": 1.3093487500755394e-07, + "loss": 2.7449, + "mean_token_accuracy": 0.44482758045196535, + "step": 130 + }, + { + "epoch": 0.00013597343381266486, + "grad_norm": 151.93036554078935, + "learning_rate": 1.359708317386137e-07, + "loss": 2.8709, + "mean_token_accuracy": 0.3896551728248596, + "step": 135 + }, + { + "epoch": 0.00014100948691683765, + "grad_norm": 101.1616090153162, + "learning_rate": 1.4100678846967345e-07, + "loss": 2.7185, + "mean_token_accuracy": 0.4379310250282288, + "step": 140 + }, + { + "epoch": 0.00014604554002101041, + "grad_norm": 125.01068204666701, + "learning_rate": 1.4604274520073324e-07, + "loss": 2.8559, + "mean_token_accuracy": 0.47241378426551817, + "step": 145 + }, + { + "epoch": 0.00015108159312518318, + "grad_norm": 134.74781524363345, + "learning_rate": 1.5107870193179302e-07, + "loss": 2.6456, + "mean_token_accuracy": 0.4068965554237366, + "step": 150 + }, + { + "epoch": 0.00015611764622935597, + "grad_norm": 153.65307950744636, + "learning_rate": 1.5611465866285278e-07, + "loss": 2.6937, + "mean_token_accuracy": 0.40689656138420105, + "step": 155 + }, + { + "epoch": 0.00016115369933352873, + "grad_norm": 119.37977441865058, + "learning_rate": 1.6115061539391253e-07, + "loss": 2.8349, + "mean_token_accuracy": 0.4, + "step": 160 + }, + { + "epoch": 0.0001661897524377015, + "grad_norm": 158.93124979095967, + "learning_rate": 1.6618657212497231e-07, + "loss": 2.9299, + "mean_token_accuracy": 0.4137930989265442, + "step": 165 + }, + { + "epoch": 0.00017122580554187428, + "grad_norm": 135.82887677202035, + "learning_rate": 1.712225288560321e-07, + "loss": 2.9838, + "mean_token_accuracy": 0.37586206793785093, + "step": 170 + }, + { + "epoch": 0.00017626185864604704, + "grad_norm": 112.26795998385566, + "learning_rate": 1.7625848558709185e-07, + "loss": 2.4926, + "mean_token_accuracy": 0.4586206912994385, + "step": 175 + }, + { + "epoch": 0.00018129791175021983, + "grad_norm": 76.3531078505134, + "learning_rate": 1.812944423181516e-07, + "loss": 2.4358, + "mean_token_accuracy": 0.4931034505367279, + "step": 180 + }, + { + "epoch": 0.0001863339648543926, + "grad_norm": 142.39390058617943, + "learning_rate": 1.8633039904921136e-07, + "loss": 2.6134, + "mean_token_accuracy": 0.4551724076271057, + "step": 185 + }, + { + "epoch": 0.00019137001795856536, + "grad_norm": 114.87053401612461, + "learning_rate": 1.9136635578027115e-07, + "loss": 2.7383, + "mean_token_accuracy": 0.458620685338974, + "step": 190 + }, + { + "epoch": 0.00019640607106273815, + "grad_norm": 174.51701190472923, + "learning_rate": 1.9640231251133093e-07, + "loss": 3.0416, + "mean_token_accuracy": 0.3551724135875702, + "step": 195 + }, + { + "epoch": 0.0002014421241669109, + "grad_norm": 87.92382514620128, + "learning_rate": 2.0143826924239068e-07, + "loss": 2.6535, + "mean_token_accuracy": 0.44827585816383364, + "step": 200 + }, + { + "epoch": 0.0002064781772710837, + "grad_norm": 125.3923304530879, + "learning_rate": 2.0647422597345044e-07, + "loss": 3.395, + "mean_token_accuracy": 0.3034482717514038, + "step": 205 + }, + { + "epoch": 0.00021151423037525646, + "grad_norm": 189.93261250839308, + "learning_rate": 2.115101827045102e-07, + "loss": 2.6287, + "mean_token_accuracy": 0.41379310488700866, + "step": 210 + }, + { + "epoch": 0.00021655028347942923, + "grad_norm": 147.48631644542024, + "learning_rate": 2.1654613943556998e-07, + "loss": 2.9504, + "mean_token_accuracy": 0.3875982999801636, + "step": 215 + }, + { + "epoch": 0.00022158633658360202, + "grad_norm": 176.65054130210027, + "learning_rate": 2.2158209616662973e-07, + "loss": 2.9528, + "mean_token_accuracy": 0.36551723480224607, + "step": 220 + }, + { + "epoch": 0.00022662238968777478, + "grad_norm": 170.4652299488915, + "learning_rate": 2.266180528976895e-07, + "loss": 2.3855, + "mean_token_accuracy": 0.4517241358757019, + "step": 225 + }, + { + "epoch": 0.00023165844279194754, + "grad_norm": 120.54500147452612, + "learning_rate": 2.3165400962874927e-07, + "loss": 2.7001, + "mean_token_accuracy": 0.4000000059604645, + "step": 230 + }, + { + "epoch": 0.00023669449589612033, + "grad_norm": 113.6082665799816, + "learning_rate": 2.3668996635980905e-07, + "loss": 3.0279, + "mean_token_accuracy": 0.3551724135875702, + "step": 235 + }, + { + "epoch": 0.0002417305490002931, + "grad_norm": 93.23066527567585, + "learning_rate": 2.417259230908688e-07, + "loss": 2.7444, + "mean_token_accuracy": 0.4172413766384125, + "step": 240 + }, + { + "epoch": 0.0002467666021044659, + "grad_norm": 170.03465277438232, + "learning_rate": 2.467618798219286e-07, + "loss": 2.5591, + "mean_token_accuracy": 0.42758620977401735, + "step": 245 + }, + { + "epoch": 0.0002518026552086386, + "grad_norm": 97.79355591471369, + "learning_rate": 2.517978365529884e-07, + "loss": 2.6857, + "mean_token_accuracy": 0.4448275864124298, + "step": 250 + }, + { + "epoch": 0.0002568387083128114, + "grad_norm": 110.2269865945556, + "learning_rate": 2.568337932840481e-07, + "loss": 2.745, + "mean_token_accuracy": 0.4310344815254211, + "step": 255 + }, + { + "epoch": 0.0002618747614169842, + "grad_norm": 167.01819819201668, + "learning_rate": 2.618697500151079e-07, + "loss": 2.4903, + "mean_token_accuracy": 0.4448275864124298, + "step": 260 + }, + { + "epoch": 0.000266910814521157, + "grad_norm": 82.02620765743933, + "learning_rate": 2.669057067461676e-07, + "loss": 2.813, + "mean_token_accuracy": 0.36896551251411436, + "step": 265 + }, + { + "epoch": 0.0002719468676253297, + "grad_norm": 114.49835583281566, + "learning_rate": 2.719416634772274e-07, + "loss": 2.8973, + "mean_token_accuracy": 0.4034482777118683, + "step": 270 + }, + { + "epoch": 0.0002769829207295025, + "grad_norm": 143.21545063778842, + "learning_rate": 2.769776202082872e-07, + "loss": 2.9087, + "mean_token_accuracy": 0.44482758045196535, + "step": 275 + }, + { + "epoch": 0.0002820189738336753, + "grad_norm": 131.6447637316192, + "learning_rate": 2.820135769393469e-07, + "loss": 2.38, + "mean_token_accuracy": 0.45517241954803467, + "step": 280 + }, + { + "epoch": 0.00028705502693784804, + "grad_norm": 164.32375103652484, + "learning_rate": 2.8704953367040674e-07, + "loss": 2.6754, + "mean_token_accuracy": 0.4620689690113068, + "step": 285 + }, + { + "epoch": 0.00029209108004202083, + "grad_norm": 127.73209131432515, + "learning_rate": 2.920854904014665e-07, + "loss": 2.767, + "mean_token_accuracy": 0.41379310488700866, + "step": 290 + }, + { + "epoch": 0.0002971271331461936, + "grad_norm": 95.51532568889598, + "learning_rate": 2.9712144713252626e-07, + "loss": 2.7629, + "mean_token_accuracy": 0.41034482717514037, + "step": 295 + }, + { + "epoch": 0.00030216318625036635, + "grad_norm": 142.2580984331193, + "learning_rate": 3.0215740386358604e-07, + "loss": 3.0414, + "mean_token_accuracy": 0.3551724135875702, + "step": 300 + }, + { + "epoch": 0.00030719923935453914, + "grad_norm": 148.80577494007386, + "learning_rate": 3.0719336059464577e-07, + "loss": 2.9303, + "mean_token_accuracy": 0.3931034505367279, + "step": 305 + }, + { + "epoch": 0.00031223529245871193, + "grad_norm": 130.62579712147522, + "learning_rate": 3.1222931732570555e-07, + "loss": 2.6566, + "mean_token_accuracy": 0.42413792610168455, + "step": 310 + }, + { + "epoch": 0.00031727134556288467, + "grad_norm": 147.32065904244521, + "learning_rate": 3.1726527405676533e-07, + "loss": 2.6772, + "mean_token_accuracy": 0.42758620977401735, + "step": 315 + }, + { + "epoch": 0.00032230739866705746, + "grad_norm": 192.00664416812978, + "learning_rate": 3.2230123078782506e-07, + "loss": 2.7212, + "mean_token_accuracy": 0.4255293428897858, + "step": 320 + }, + { + "epoch": 0.00032734345177123025, + "grad_norm": 103.49225237823313, + "learning_rate": 3.2733718751888484e-07, + "loss": 2.926, + "mean_token_accuracy": 0.34827586710453035, + "step": 325 + }, + { + "epoch": 0.000332379504875403, + "grad_norm": 127.22193639149162, + "learning_rate": 3.3237314424994463e-07, + "loss": 2.8522, + "mean_token_accuracy": 0.4, + "step": 330 + }, + { + "epoch": 0.0003374155579795758, + "grad_norm": 116.89075063440451, + "learning_rate": 3.374091009810044e-07, + "loss": 2.603, + "mean_token_accuracy": 0.4413793087005615, + "step": 335 + }, + { + "epoch": 0.00034245161108374856, + "grad_norm": 102.90675650693785, + "learning_rate": 3.424450577120642e-07, + "loss": 2.4348, + "mean_token_accuracy": 0.43793103098869324, + "step": 340 + }, + { + "epoch": 0.00034748766418792135, + "grad_norm": 129.28524192411595, + "learning_rate": 3.474810144431239e-07, + "loss": 2.9245, + "mean_token_accuracy": 0.3896551728248596, + "step": 345 + }, + { + "epoch": 0.0003525237172920941, + "grad_norm": 118.97989649596283, + "learning_rate": 3.525169711741837e-07, + "loss": 2.4452, + "mean_token_accuracy": 0.43448275327682495, + "step": 350 + }, + { + "epoch": 0.0003575597703962669, + "grad_norm": 111.56592749408559, + "learning_rate": 3.5755292790524343e-07, + "loss": 2.9099, + "mean_token_accuracy": 0.41639443039894103, + "step": 355 + }, + { + "epoch": 0.00036259582350043967, + "grad_norm": 144.8197948477708, + "learning_rate": 3.625888846363032e-07, + "loss": 2.9199, + "mean_token_accuracy": 0.3896551728248596, + "step": 360 + }, + { + "epoch": 0.0003676318766046124, + "grad_norm": 95.28332303023947, + "learning_rate": 3.67624841367363e-07, + "loss": 3.0385, + "mean_token_accuracy": 0.37241379618644715, + "step": 365 + }, + { + "epoch": 0.0003726679297087852, + "grad_norm": 130.90212137646938, + "learning_rate": 3.7266079809842273e-07, + "loss": 2.4822, + "mean_token_accuracy": 0.4517241418361664, + "step": 370 + }, + { + "epoch": 0.000377703982812958, + "grad_norm": 84.3170716920256, + "learning_rate": 3.776967548294825e-07, + "loss": 2.4249, + "mean_token_accuracy": 0.5034482657909394, + "step": 375 + }, + { + "epoch": 0.0003827400359171307, + "grad_norm": 115.19065645002041, + "learning_rate": 3.827327115605423e-07, + "loss": 2.8054, + "mean_token_accuracy": 0.4602540791034698, + "step": 380 + }, + { + "epoch": 0.0003877760890213035, + "grad_norm": 171.33198789937248, + "learning_rate": 3.8776866829160207e-07, + "loss": 2.8631, + "mean_token_accuracy": 0.3676108419895172, + "step": 385 + }, + { + "epoch": 0.0003928121421254763, + "grad_norm": 136.27446247446824, + "learning_rate": 3.9280462502266186e-07, + "loss": 2.6387, + "mean_token_accuracy": 0.45517241954803467, + "step": 390 + }, + { + "epoch": 0.00039784819522964903, + "grad_norm": 129.7593181067108, + "learning_rate": 3.978405817537216e-07, + "loss": 2.6709, + "mean_token_accuracy": 0.42413793206214906, + "step": 395 + }, + { + "epoch": 0.0004028842483338218, + "grad_norm": 112.10185767125414, + "learning_rate": 4.0287653848478137e-07, + "loss": 2.7646, + "mean_token_accuracy": 0.37586207389831544, + "step": 400 + }, + { + "epoch": 0.0004079203014379946, + "grad_norm": 95.84334217571804, + "learning_rate": 4.0791249521584115e-07, + "loss": 2.7615, + "mean_token_accuracy": 0.403448274731636, + "step": 405 + }, + { + "epoch": 0.0004129563545421674, + "grad_norm": 106.4905683415504, + "learning_rate": 4.129484519469009e-07, + "loss": 2.7835, + "mean_token_accuracy": 0.41161524653434756, + "step": 410 + }, + { + "epoch": 0.00041799240764634014, + "grad_norm": 82.77934931657926, + "learning_rate": 4.1798440867796066e-07, + "loss": 2.5381, + "mean_token_accuracy": 0.45517241954803467, + "step": 415 + }, + { + "epoch": 0.00042302846075051293, + "grad_norm": 105.1284031779718, + "learning_rate": 4.230203654090204e-07, + "loss": 2.8011, + "mean_token_accuracy": 0.3836055725812912, + "step": 420 + }, + { + "epoch": 0.0004280645138546857, + "grad_norm": 101.16566563275299, + "learning_rate": 4.2805632214008017e-07, + "loss": 2.3296, + "mean_token_accuracy": 0.4620689630508423, + "step": 425 + }, + { + "epoch": 0.00043310056695885845, + "grad_norm": 95.80579434894857, + "learning_rate": 4.3309227887113996e-07, + "loss": 2.3242, + "mean_token_accuracy": 0.46551724076271056, + "step": 430 + }, + { + "epoch": 0.00043813662006303124, + "grad_norm": 141.8439665770817, + "learning_rate": 4.381282356021997e-07, + "loss": 2.6309, + "mean_token_accuracy": 0.38275861740112305, + "step": 435 + }, + { + "epoch": 0.00044317267316720403, + "grad_norm": 81.4978736073144, + "learning_rate": 4.4316419233325947e-07, + "loss": 2.4957, + "mean_token_accuracy": 0.43448275327682495, + "step": 440 + }, + { + "epoch": 0.00044820872627137677, + "grad_norm": 89.41859589037193, + "learning_rate": 4.4820014906431925e-07, + "loss": 2.6591, + "mean_token_accuracy": 0.4379310429096222, + "step": 445 + }, + { + "epoch": 0.00045324477937554956, + "grad_norm": 117.22822735881033, + "learning_rate": 4.53236105795379e-07, + "loss": 2.6265, + "mean_token_accuracy": 0.42413793206214906, + "step": 450 + }, + { + "epoch": 0.00045828083247972235, + "grad_norm": 117.772096619878, + "learning_rate": 4.5827206252643876e-07, + "loss": 2.4679, + "mean_token_accuracy": 0.46896551847457885, + "step": 455 + }, + { + "epoch": 0.0004633168855838951, + "grad_norm": 119.45972976735337, + "learning_rate": 4.6330801925749854e-07, + "loss": 2.7592, + "mean_token_accuracy": 0.42183908224105837, + "step": 460 + }, + { + "epoch": 0.00046835293868806787, + "grad_norm": 101.66925022479907, + "learning_rate": 4.683439759885584e-07, + "loss": 2.5975, + "mean_token_accuracy": 0.44162561297416686, + "step": 465 + }, + { + "epoch": 0.00047338899179224066, + "grad_norm": 74.51822544368427, + "learning_rate": 4.733799327196181e-07, + "loss": 2.2852, + "mean_token_accuracy": 0.46551724672317507, + "step": 470 + }, + { + "epoch": 0.0004784250448964134, + "grad_norm": 104.31649200353398, + "learning_rate": 4.784158894506779e-07, + "loss": 2.7482, + "mean_token_accuracy": 0.3758620619773865, + "step": 475 + }, + { + "epoch": 0.0004834610980005862, + "grad_norm": 70.35465134175018, + "learning_rate": 4.834518461817376e-07, + "loss": 2.4995, + "mean_token_accuracy": 0.4379310369491577, + "step": 480 + }, + { + "epoch": 0.000488497151104759, + "grad_norm": 97.30671099352266, + "learning_rate": 4.884878029127974e-07, + "loss": 2.7713, + "mean_token_accuracy": 0.4172413766384125, + "step": 485 + }, + { + "epoch": 0.0004935332042089318, + "grad_norm": 128.57769163039333, + "learning_rate": 4.935237596438572e-07, + "loss": 2.9798, + "mean_token_accuracy": 0.3931034505367279, + "step": 490 + }, + { + "epoch": 0.0004985692573131046, + "grad_norm": 109.78819543288185, + "learning_rate": 4.98559716374917e-07, + "loss": 2.5184, + "mean_token_accuracy": 0.4206896543502808, + "step": 495 + }, + { + "epoch": 0.0005036053104172772, + "grad_norm": 107.37855232859297, + "learning_rate": 5.035956731059767e-07, + "loss": 2.7295, + "mean_token_accuracy": 0.39310345649719236, + "step": 500 + }, + { + "epoch": 0.00050864136352145, + "grad_norm": 77.71522135810928, + "learning_rate": 5.086316298370364e-07, + "loss": 2.5853, + "mean_token_accuracy": 0.4517241358757019, + "step": 505 + }, + { + "epoch": 0.0005136774166256228, + "grad_norm": 135.20387787149846, + "learning_rate": 5.136675865680962e-07, + "loss": 2.5211, + "mean_token_accuracy": 0.42758620977401735, + "step": 510 + }, + { + "epoch": 0.0005187134697297956, + "grad_norm": 116.21072332002251, + "learning_rate": 5.18703543299156e-07, + "loss": 2.7668, + "mean_token_accuracy": 0.4068965554237366, + "step": 515 + }, + { + "epoch": 0.0005237495228339684, + "grad_norm": 96.95603099242913, + "learning_rate": 5.237395000302158e-07, + "loss": 2.6718, + "mean_token_accuracy": 0.4172413766384125, + "step": 520 + }, + { + "epoch": 0.0005287855759381412, + "grad_norm": 100.72935932881842, + "learning_rate": 5.287754567612756e-07, + "loss": 3.1407, + "mean_token_accuracy": 0.3827586233615875, + "step": 525 + }, + { + "epoch": 0.000533821629042314, + "grad_norm": 125.46755545312327, + "learning_rate": 5.338114134923352e-07, + "loss": 2.5017, + "mean_token_accuracy": 0.4310344815254211, + "step": 530 + }, + { + "epoch": 0.0005388576821464867, + "grad_norm": 110.32098076381747, + "learning_rate": 5.38847370223395e-07, + "loss": 2.5591, + "mean_token_accuracy": 0.4310344815254211, + "step": 535 + }, + { + "epoch": 0.0005438937352506594, + "grad_norm": 98.3318094080258, + "learning_rate": 5.438833269544548e-07, + "loss": 2.7035, + "mean_token_accuracy": 0.4, + "step": 540 + }, + { + "epoch": 0.0005489297883548322, + "grad_norm": 68.26146745661678, + "learning_rate": 5.489192836855146e-07, + "loss": 2.8647, + "mean_token_accuracy": 0.41476104259490965, + "step": 545 + }, + { + "epoch": 0.000553965841459005, + "grad_norm": 78.42414525742541, + "learning_rate": 5.539552404165744e-07, + "loss": 2.635, + "mean_token_accuracy": 0.4310344815254211, + "step": 550 + }, + { + "epoch": 0.0005590018945631778, + "grad_norm": 91.61441788019096, + "learning_rate": 5.589911971476341e-07, + "loss": 2.5186, + "mean_token_accuracy": 0.441379314661026, + "step": 555 + }, + { + "epoch": 0.0005640379476673506, + "grad_norm": 106.20995303402762, + "learning_rate": 5.640271538786938e-07, + "loss": 2.6926, + "mean_token_accuracy": 0.42413793206214906, + "step": 560 + }, + { + "epoch": 0.0005690740007715233, + "grad_norm": 141.215242800728, + "learning_rate": 5.690631106097537e-07, + "loss": 2.7865, + "mean_token_accuracy": 0.4010889232158661, + "step": 565 + }, + { + "epoch": 0.0005741100538756961, + "grad_norm": 85.06108014144998, + "learning_rate": 5.740990673408135e-07, + "loss": 2.7034, + "mean_token_accuracy": 0.4517241358757019, + "step": 570 + }, + { + "epoch": 0.0005791461069798689, + "grad_norm": 108.0575295666833, + "learning_rate": 5.791350240718733e-07, + "loss": 2.6878, + "mean_token_accuracy": 0.4517241358757019, + "step": 575 + }, + { + "epoch": 0.0005841821600840417, + "grad_norm": 75.89057427381948, + "learning_rate": 5.84170980802933e-07, + "loss": 2.3963, + "mean_token_accuracy": 0.4517241358757019, + "step": 580 + }, + { + "epoch": 0.0005892182131882144, + "grad_norm": 101.216650291623, + "learning_rate": 5.892069375339927e-07, + "loss": 2.2118, + "mean_token_accuracy": 0.5068965554237366, + "step": 585 + }, + { + "epoch": 0.0005942542662923872, + "grad_norm": 88.03001336056393, + "learning_rate": 5.942428942650525e-07, + "loss": 2.5335, + "mean_token_accuracy": 0.47404718995094297, + "step": 590 + }, + { + "epoch": 0.00059929031939656, + "grad_norm": 92.11349880726202, + "learning_rate": 5.992788509961123e-07, + "loss": 2.1097, + "mean_token_accuracy": 0.46551724076271056, + "step": 595 + }, + { + "epoch": 0.0006043263725007327, + "grad_norm": 121.245838725664, + "learning_rate": 6.043148077271721e-07, + "loss": 2.3172, + "mean_token_accuracy": 0.46896552443504336, + "step": 600 + }, + { + "epoch": 0.0006093624256049055, + "grad_norm": 79.52855727841695, + "learning_rate": 6.093507644582319e-07, + "loss": 2.3817, + "mean_token_accuracy": 0.46696914434432985, + "step": 605 + }, + { + "epoch": 0.0006143984787090783, + "grad_norm": 67.81969591475001, + "learning_rate": 6.143867211892915e-07, + "loss": 2.4404, + "mean_token_accuracy": 0.46551724076271056, + "step": 610 + }, + { + "epoch": 0.0006194345318132511, + "grad_norm": 92.77669389751524, + "learning_rate": 6.194226779203513e-07, + "loss": 2.5152, + "mean_token_accuracy": 0.4068965494632721, + "step": 615 + }, + { + "epoch": 0.0006244705849174239, + "grad_norm": 80.48286293005252, + "learning_rate": 6.244586346514111e-07, + "loss": 2.4309, + "mean_token_accuracy": 0.47241379618644713, + "step": 620 + }, + { + "epoch": 0.0006295066380215967, + "grad_norm": 61.75806798712446, + "learning_rate": 6.294945913824709e-07, + "loss": 2.5416, + "mean_token_accuracy": 0.4551724135875702, + "step": 625 + }, + { + "epoch": 0.0006345426911257693, + "grad_norm": 74.23387017595724, + "learning_rate": 6.345305481135307e-07, + "loss": 2.6247, + "mean_token_accuracy": 0.4689655125141144, + "step": 630 + }, + { + "epoch": 0.0006395787442299421, + "grad_norm": 76.06760280009239, + "learning_rate": 6.395665048445903e-07, + "loss": 2.8092, + "mean_token_accuracy": 0.4000000059604645, + "step": 635 + }, + { + "epoch": 0.0006446147973341149, + "grad_norm": 85.24232343466204, + "learning_rate": 6.446024615756501e-07, + "loss": 2.8278, + "mean_token_accuracy": 0.3862069010734558, + "step": 640 + }, + { + "epoch": 0.0006496508504382877, + "grad_norm": 86.6529097753573, + "learning_rate": 6.496384183067099e-07, + "loss": 2.5016, + "mean_token_accuracy": 0.4413793206214905, + "step": 645 + }, + { + "epoch": 0.0006546869035424605, + "grad_norm": 104.60639124238119, + "learning_rate": 6.546743750377697e-07, + "loss": 2.4112, + "mean_token_accuracy": 0.4655172348022461, + "step": 650 + }, + { + "epoch": 0.0006597229566466333, + "grad_norm": 93.64425274786298, + "learning_rate": 6.597103317688295e-07, + "loss": 2.3852, + "mean_token_accuracy": 0.4206896543502808, + "step": 655 + }, + { + "epoch": 0.000664759009750806, + "grad_norm": 76.01933409259676, + "learning_rate": 6.647462884998893e-07, + "loss": 2.6791, + "mean_token_accuracy": 0.3999999940395355, + "step": 660 + }, + { + "epoch": 0.0006697950628549788, + "grad_norm": 98.87426350115594, + "learning_rate": 6.69782245230949e-07, + "loss": 2.3945, + "mean_token_accuracy": 0.4448275864124298, + "step": 665 + }, + { + "epoch": 0.0006748311159591515, + "grad_norm": 92.22357358011968, + "learning_rate": 6.748182019620088e-07, + "loss": 2.3475, + "mean_token_accuracy": 0.4620689690113068, + "step": 670 + }, + { + "epoch": 0.0006798671690633243, + "grad_norm": 81.95174569212841, + "learning_rate": 6.798541586930686e-07, + "loss": 2.4461, + "mean_token_accuracy": 0.4413793087005615, + "step": 675 + }, + { + "epoch": 0.0006849032221674971, + "grad_norm": 79.96710306620447, + "learning_rate": 6.848901154241284e-07, + "loss": 2.4317, + "mean_token_accuracy": 0.4517241358757019, + "step": 680 + }, + { + "epoch": 0.0006899392752716699, + "grad_norm": 103.32052424746628, + "learning_rate": 6.899260721551881e-07, + "loss": 2.2136, + "mean_token_accuracy": 0.458620685338974, + "step": 685 + }, + { + "epoch": 0.0006949753283758427, + "grad_norm": 86.42014498307215, + "learning_rate": 6.949620288862478e-07, + "loss": 2.579, + "mean_token_accuracy": 0.43448275327682495, + "step": 690 + }, + { + "epoch": 0.0007000113814800154, + "grad_norm": 74.84087535799382, + "learning_rate": 6.999979856173076e-07, + "loss": 2.6747, + "mean_token_accuracy": 0.38275861740112305, + "step": 695 + }, + { + "epoch": 0.0007050474345841882, + "grad_norm": 116.48024543484408, + "learning_rate": 7.050339423483674e-07, + "loss": 2.5092, + "mean_token_accuracy": 0.4103448331356049, + "step": 700 + }, + { + "epoch": 0.000710083487688361, + "grad_norm": 93.84051280549203, + "learning_rate": 7.100698990794272e-07, + "loss": 2.3428, + "mean_token_accuracy": 0.4689655125141144, + "step": 705 + }, + { + "epoch": 0.0007151195407925338, + "grad_norm": 176.4810165342915, + "learning_rate": 7.151058558104869e-07, + "loss": 2.4544, + "mean_token_accuracy": 0.4620689630508423, + "step": 710 + }, + { + "epoch": 0.0007201555938967065, + "grad_norm": 123.49408801166076, + "learning_rate": 7.201418125415466e-07, + "loss": 2.5885, + "mean_token_accuracy": 0.4413793087005615, + "step": 715 + }, + { + "epoch": 0.0007251916470008793, + "grad_norm": 83.05349987822244, + "learning_rate": 7.251777692726064e-07, + "loss": 2.7422, + "mean_token_accuracy": 0.42413793206214906, + "step": 720 + }, + { + "epoch": 0.000730227700105052, + "grad_norm": 83.15795900002738, + "learning_rate": 7.302137260036662e-07, + "loss": 2.2589, + "mean_token_accuracy": 0.4482758641242981, + "step": 725 + }, + { + "epoch": 0.0007352637532092248, + "grad_norm": 75.12910461884285, + "learning_rate": 7.35249682734726e-07, + "loss": 2.4144, + "mean_token_accuracy": 0.4793103516101837, + "step": 730 + }, + { + "epoch": 0.0007402998063133976, + "grad_norm": 76.6862144802982, + "learning_rate": 7.402856394657858e-07, + "loss": 2.7284, + "mean_token_accuracy": 0.41379310488700866, + "step": 735 + }, + { + "epoch": 0.0007453358594175704, + "grad_norm": 79.02963844112384, + "learning_rate": 7.453215961968455e-07, + "loss": 2.4321, + "mean_token_accuracy": 0.4862068951129913, + "step": 740 + }, + { + "epoch": 0.0007503719125217432, + "grad_norm": 89.21830756888185, + "learning_rate": 7.503575529279052e-07, + "loss": 2.5776, + "mean_token_accuracy": 0.42758620977401735, + "step": 745 + }, + { + "epoch": 0.000755407965625916, + "grad_norm": 119.60579380554891, + "learning_rate": 7.55393509658965e-07, + "loss": 2.3662, + "mean_token_accuracy": 0.44827585220336913, + "step": 750 + }, + { + "epoch": 0.0007604440187300888, + "grad_norm": 120.12614404675867, + "learning_rate": 7.604294663900248e-07, + "loss": 2.609, + "mean_token_accuracy": 0.4068965494632721, + "step": 755 + }, + { + "epoch": 0.0007654800718342614, + "grad_norm": 77.52867628278035, + "learning_rate": 7.654654231210846e-07, + "loss": 2.4938, + "mean_token_accuracy": 0.4137930989265442, + "step": 760 + }, + { + "epoch": 0.0007705161249384342, + "grad_norm": 112.22966822304785, + "learning_rate": 7.705013798521443e-07, + "loss": 2.3313, + "mean_token_accuracy": 0.46551724672317507, + "step": 765 + }, + { + "epoch": 0.000775552178042607, + "grad_norm": 131.61596142722587, + "learning_rate": 7.755373365832041e-07, + "loss": 2.3593, + "mean_token_accuracy": 0.4310344815254211, + "step": 770 + }, + { + "epoch": 0.0007805882311467798, + "grad_norm": 79.68065032041677, + "learning_rate": 7.805732933142639e-07, + "loss": 2.4274, + "mean_token_accuracy": 0.4517241358757019, + "step": 775 + }, + { + "epoch": 0.0007856242842509526, + "grad_norm": 114.9038039336646, + "learning_rate": 7.856092500453237e-07, + "loss": 2.2574, + "mean_token_accuracy": 0.4413793087005615, + "step": 780 + }, + { + "epoch": 0.0007906603373551254, + "grad_norm": 119.73169033830584, + "learning_rate": 7.906452067763835e-07, + "loss": 2.335, + "mean_token_accuracy": 0.47586206197738645, + "step": 785 + }, + { + "epoch": 0.0007956963904592981, + "grad_norm": 92.57948321194462, + "learning_rate": 7.956811635074432e-07, + "loss": 2.7936, + "mean_token_accuracy": 0.4, + "step": 790 + }, + { + "epoch": 0.0008007324435634709, + "grad_norm": 101.27560151020234, + "learning_rate": 8.00717120238503e-07, + "loss": 2.5011, + "mean_token_accuracy": 0.43103447556495667, + "step": 795 + }, + { + "epoch": 0.0008057684966676436, + "grad_norm": 96.97006473089812, + "learning_rate": 8.057530769695627e-07, + "loss": 2.4138, + "mean_token_accuracy": 0.4241379380226135, + "step": 800 + }, + { + "epoch": 0.0008108045497718164, + "grad_norm": 102.94666468476578, + "learning_rate": 8.107890337006225e-07, + "loss": 2.4822, + "mean_token_accuracy": 0.3999999940395355, + "step": 805 + }, + { + "epoch": 0.0008158406028759892, + "grad_norm": 75.38572226991253, + "learning_rate": 8.158249904316823e-07, + "loss": 2.7287, + "mean_token_accuracy": 0.379310342669487, + "step": 810 + }, + { + "epoch": 0.000820876655980162, + "grad_norm": 84.97815107088769, + "learning_rate": 8.20860947162742e-07, + "loss": 2.3224, + "mean_token_accuracy": 0.4517241299152374, + "step": 815 + }, + { + "epoch": 0.0008259127090843348, + "grad_norm": 70.05670008505929, + "learning_rate": 8.258969038938018e-07, + "loss": 2.4871, + "mean_token_accuracy": 0.42758620977401735, + "step": 820 + }, + { + "epoch": 0.0008309487621885075, + "grad_norm": 76.18238155734613, + "learning_rate": 8.309328606248615e-07, + "loss": 2.187, + "mean_token_accuracy": 0.47586206793785096, + "step": 825 + }, + { + "epoch": 0.0008359848152926803, + "grad_norm": 63.141514681604384, + "learning_rate": 8.359688173559213e-07, + "loss": 2.3036, + "mean_token_accuracy": 0.4551724135875702, + "step": 830 + }, + { + "epoch": 0.0008410208683968531, + "grad_norm": 109.9644745805576, + "learning_rate": 8.410047740869811e-07, + "loss": 2.5717, + "mean_token_accuracy": 0.4103448301553726, + "step": 835 + }, + { + "epoch": 0.0008460569215010259, + "grad_norm": 87.77664891234033, + "learning_rate": 8.460407308180408e-07, + "loss": 2.4026, + "mean_token_accuracy": 0.4517241418361664, + "step": 840 + }, + { + "epoch": 0.0008510929746051986, + "grad_norm": 69.01201408024284, + "learning_rate": 8.510766875491006e-07, + "loss": 2.3477, + "mean_token_accuracy": 0.4758620738983154, + "step": 845 + }, + { + "epoch": 0.0008561290277093714, + "grad_norm": 121.60129783717338, + "learning_rate": 8.561126442801603e-07, + "loss": 1.9042, + "mean_token_accuracy": 0.5482758581638336, + "step": 850 + }, + { + "epoch": 0.0008611650808135441, + "grad_norm": 121.32099991086666, + "learning_rate": 8.611486010112201e-07, + "loss": 2.1015, + "mean_token_accuracy": 0.4931034505367279, + "step": 855 + }, + { + "epoch": 0.0008662011339177169, + "grad_norm": 104.05189548327019, + "learning_rate": 8.661845577422799e-07, + "loss": 2.1571, + "mean_token_accuracy": 0.49655172824859617, + "step": 860 + }, + { + "epoch": 0.0008712371870218897, + "grad_norm": 125.16861668453716, + "learning_rate": 8.712205144733397e-07, + "loss": 2.1423, + "mean_token_accuracy": 0.4965517222881317, + "step": 865 + }, + { + "epoch": 0.0008762732401260625, + "grad_norm": 105.3064722604332, + "learning_rate": 8.762564712043994e-07, + "loss": 2.1473, + "mean_token_accuracy": 0.5000000059604645, + "step": 870 + }, + { + "epoch": 0.0008813092932302353, + "grad_norm": 70.464647210329, + "learning_rate": 8.812924279354592e-07, + "loss": 1.8264, + "mean_token_accuracy": 0.5344827532768249, + "step": 875 + }, + { + "epoch": 0.0008863453463344081, + "grad_norm": 77.82747467870017, + "learning_rate": 8.863283846665189e-07, + "loss": 2.22, + "mean_token_accuracy": 0.48965518474578856, + "step": 880 + }, + { + "epoch": 0.0008913813994385809, + "grad_norm": 70.69831654174507, + "learning_rate": 8.913643413975787e-07, + "loss": 2.2467, + "mean_token_accuracy": 0.45862067937850953, + "step": 885 + }, + { + "epoch": 0.0008964174525427535, + "grad_norm": 67.69527822701062, + "learning_rate": 8.964002981286385e-07, + "loss": 2.2231, + "mean_token_accuracy": 0.4551724076271057, + "step": 890 + }, + { + "epoch": 0.0009014535056469263, + "grad_norm": 78.5600702087927, + "learning_rate": 9.014362548596982e-07, + "loss": 2.2696, + "mean_token_accuracy": 0.4379310369491577, + "step": 895 + }, + { + "epoch": 0.0009064895587510991, + "grad_norm": 90.37824983572487, + "learning_rate": 9.06472211590758e-07, + "loss": 2.3338, + "mean_token_accuracy": 0.4396854221820831, + "step": 900 + }, + { + "epoch": 0.0009115256118552719, + "grad_norm": 83.8315104397229, + "learning_rate": 9.115081683218177e-07, + "loss": 2.5784, + "mean_token_accuracy": 0.42068966031074523, + "step": 905 + }, + { + "epoch": 0.0009165616649594447, + "grad_norm": 73.61129192415399, + "learning_rate": 9.165441250528775e-07, + "loss": 2.3417, + "mean_token_accuracy": 0.42413793206214906, + "step": 910 + }, + { + "epoch": 0.0009215977180636175, + "grad_norm": 87.10247355700956, + "learning_rate": 9.215800817839373e-07, + "loss": 2.766, + "mean_token_accuracy": 0.36896551847457887, + "step": 915 + }, + { + "epoch": 0.0009266337711677902, + "grad_norm": 89.69322759711844, + "learning_rate": 9.266160385149971e-07, + "loss": 2.4475, + "mean_token_accuracy": 0.42413792610168455, + "step": 920 + }, + { + "epoch": 0.000931669824271963, + "grad_norm": 78.14797469167598, + "learning_rate": 9.31651995246057e-07, + "loss": 2.4458, + "mean_token_accuracy": 0.4448275864124298, + "step": 925 + }, + { + "epoch": 0.0009367058773761357, + "grad_norm": 85.58270905685941, + "learning_rate": 9.366879519771168e-07, + "loss": 2.8264, + "mean_token_accuracy": 0.3655172437429428, + "step": 930 + }, + { + "epoch": 0.0009417419304803085, + "grad_norm": 80.21212458809858, + "learning_rate": 9.417239087081765e-07, + "loss": 2.2188, + "mean_token_accuracy": 0.4689655125141144, + "step": 935 + }, + { + "epoch": 0.0009467779835844813, + "grad_norm": 80.47822616708802, + "learning_rate": 9.467598654392362e-07, + "loss": 2.4089, + "mean_token_accuracy": 0.4551724135875702, + "step": 940 + }, + { + "epoch": 0.0009518140366886541, + "grad_norm": 69.37473453455668, + "learning_rate": 9.51795822170296e-07, + "loss": 2.1651, + "mean_token_accuracy": 0.4620689630508423, + "step": 945 + }, + { + "epoch": 0.0009568500897928268, + "grad_norm": 64.46855922972864, + "learning_rate": 9.568317789013559e-07, + "loss": 2.3544, + "mean_token_accuracy": 0.4068965494632721, + "step": 950 + }, + { + "epoch": 0.0009618861428969996, + "grad_norm": 97.94578556230171, + "learning_rate": 9.618677356324156e-07, + "loss": 2.1219, + "mean_token_accuracy": 0.4896551728248596, + "step": 955 + }, + { + "epoch": 0.0009669221960011724, + "grad_norm": 82.31751501537289, + "learning_rate": 9.669036923634752e-07, + "loss": 1.9896, + "mean_token_accuracy": 0.5241379261016845, + "step": 960 + }, + { + "epoch": 0.0009719582491053452, + "grad_norm": 90.64651861642238, + "learning_rate": 9.719396490945351e-07, + "loss": 2.3545, + "mean_token_accuracy": 0.4275862157344818, + "step": 965 + }, + { + "epoch": 0.000976994302209518, + "grad_norm": 73.91083962121589, + "learning_rate": 9.769756058255948e-07, + "loss": 2.0212, + "mean_token_accuracy": 0.5068965494632721, + "step": 970 + }, + { + "epoch": 0.0009820303553136906, + "grad_norm": 98.13645531687172, + "learning_rate": 9.820115625566547e-07, + "loss": 2.2457, + "mean_token_accuracy": 0.4586206912994385, + "step": 975 + }, + { + "epoch": 0.0009870664084178635, + "grad_norm": 105.73782207242293, + "learning_rate": 9.870475192877144e-07, + "loss": 2.6174, + "mean_token_accuracy": 0.4448275864124298, + "step": 980 + }, + { + "epoch": 0.0009921024615220362, + "grad_norm": 94.53904273037824, + "learning_rate": 9.92083476018774e-07, + "loss": 2.4942, + "mean_token_accuracy": 0.4379310369491577, + "step": 985 + }, + { + "epoch": 0.0009971385146262091, + "grad_norm": 95.10209112496764, + "learning_rate": 9.97119432749834e-07, + "loss": 2.2514, + "mean_token_accuracy": 0.458620685338974, + "step": 990 + }, + { + "epoch": 0.0010021745677303818, + "grad_norm": 94.55687912818004, + "learning_rate": 1.0021553894808936e-06, + "loss": 2.6231, + "mean_token_accuracy": 0.4068965554237366, + "step": 995 + }, + { + "epoch": 0.0010072106208345545, + "grad_norm": 87.29290646053556, + "learning_rate": 1.0071913462119535e-06, + "loss": 2.1165, + "mean_token_accuracy": 0.4310344696044922, + "step": 1000 + }, + { + "epoch": 0.0010122466739387274, + "grad_norm": 82.80634468832311, + "learning_rate": 1.0122273029430132e-06, + "loss": 2.8514, + "mean_token_accuracy": 0.38620689511299133, + "step": 1005 + }, + { + "epoch": 0.0010172827270429, + "grad_norm": 98.21734279194256, + "learning_rate": 1.0172632596740729e-06, + "loss": 2.1538, + "mean_token_accuracy": 0.46358137130737304, + "step": 1010 + }, + { + "epoch": 0.001022318780147073, + "grad_norm": 81.24226898033034, + "learning_rate": 1.0222992164051327e-06, + "loss": 2.2706, + "mean_token_accuracy": 0.4206896543502808, + "step": 1015 + }, + { + "epoch": 0.0010273548332512456, + "grad_norm": 89.83676190967354, + "learning_rate": 1.0273351731361924e-06, + "loss": 2.2762, + "mean_token_accuracy": 0.4551724076271057, + "step": 1020 + }, + { + "epoch": 0.0010323908863554185, + "grad_norm": 84.52068018737582, + "learning_rate": 1.0323711298672523e-06, + "loss": 2.1873, + "mean_token_accuracy": 0.5087719261646271, + "step": 1025 + }, + { + "epoch": 0.0010374269394595912, + "grad_norm": 75.93525632067038, + "learning_rate": 1.037407086598312e-06, + "loss": 2.3937, + "mean_token_accuracy": 0.4344827592372894, + "step": 1030 + }, + { + "epoch": 0.001042462992563764, + "grad_norm": 89.77920301537786, + "learning_rate": 1.0424430433293717e-06, + "loss": 2.5033, + "mean_token_accuracy": 0.4241379380226135, + "step": 1035 + }, + { + "epoch": 0.0010474990456679368, + "grad_norm": 70.79544822302483, + "learning_rate": 1.0474790000604315e-06, + "loss": 2.2906, + "mean_token_accuracy": 0.38275861740112305, + "step": 1040 + }, + { + "epoch": 0.0010525350987721095, + "grad_norm": 54.96460134605188, + "learning_rate": 1.0525149567914912e-06, + "loss": 2.3459, + "mean_token_accuracy": 0.4310344815254211, + "step": 1045 + }, + { + "epoch": 0.0010575711518762824, + "grad_norm": 125.77867336302904, + "learning_rate": 1.0575509135225511e-06, + "loss": 2.497, + "mean_token_accuracy": 0.43448275327682495, + "step": 1050 + }, + { + "epoch": 0.001062607204980455, + "grad_norm": 64.89975992149849, + "learning_rate": 1.0625868702536108e-06, + "loss": 2.478, + "mean_token_accuracy": 0.441379314661026, + "step": 1055 + }, + { + "epoch": 0.001067643258084628, + "grad_norm": 94.86657410197256, + "learning_rate": 1.0676228269846705e-06, + "loss": 2.6097, + "mean_token_accuracy": 0.4137930989265442, + "step": 1060 + }, + { + "epoch": 0.0010726793111888006, + "grad_norm": 69.48415418506345, + "learning_rate": 1.0726587837157304e-06, + "loss": 2.3105, + "mean_token_accuracy": 0.4379310429096222, + "step": 1065 + }, + { + "epoch": 0.0010777153642929733, + "grad_norm": 77.21060146011098, + "learning_rate": 1.07769474044679e-06, + "loss": 2.0062, + "mean_token_accuracy": 0.482758617401123, + "step": 1070 + }, + { + "epoch": 0.0010827514173971462, + "grad_norm": 116.27805213569609, + "learning_rate": 1.08273069717785e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.44827585816383364, + "step": 1075 + }, + { + "epoch": 0.001087787470501319, + "grad_norm": 88.34489248480183, + "learning_rate": 1.0877666539089096e-06, + "loss": 2.4136, + "mean_token_accuracy": 0.4413793087005615, + "step": 1080 + }, + { + "epoch": 0.0010928235236054918, + "grad_norm": 95.08995116339224, + "learning_rate": 1.0928026106399695e-06, + "loss": 2.1384, + "mean_token_accuracy": 0.4862069010734558, + "step": 1085 + }, + { + "epoch": 0.0010978595767096645, + "grad_norm": 93.39381170238919, + "learning_rate": 1.0978385673710292e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.4655172526836395, + "step": 1090 + }, + { + "epoch": 0.0011028956298138372, + "grad_norm": 89.49662934117966, + "learning_rate": 1.1028745241020888e-06, + "loss": 2.4618, + "mean_token_accuracy": 0.4344827651977539, + "step": 1095 + }, + { + "epoch": 0.00110793168291801, + "grad_norm": 79.1104342235396, + "learning_rate": 1.1079104808331487e-06, + "loss": 2.3571, + "mean_token_accuracy": 0.45517240166664125, + "step": 1100 + }, + { + "epoch": 0.0011129677360221827, + "grad_norm": 98.20149378743784, + "learning_rate": 1.1129464375642084e-06, + "loss": 2.484, + "mean_token_accuracy": 0.4448275864124298, + "step": 1105 + }, + { + "epoch": 0.0011180037891263556, + "grad_norm": 79.82266104144223, + "learning_rate": 1.1179823942952683e-06, + "loss": 2.0848, + "mean_token_accuracy": 0.5068965435028077, + "step": 1110 + }, + { + "epoch": 0.0011230398422305283, + "grad_norm": 85.3229008140252, + "learning_rate": 1.123018351026328e-06, + "loss": 2.3575, + "mean_token_accuracy": 0.43103447556495667, + "step": 1115 + }, + { + "epoch": 0.0011280758953347012, + "grad_norm": 87.4265057563504, + "learning_rate": 1.1280543077573876e-06, + "loss": 2.4454, + "mean_token_accuracy": 0.4206896543502808, + "step": 1120 + }, + { + "epoch": 0.001133111948438874, + "grad_norm": 92.87773421724492, + "learning_rate": 1.1330902644884477e-06, + "loss": 2.0481, + "mean_token_accuracy": 0.4724137902259827, + "step": 1125 + }, + { + "epoch": 0.0011381480015430466, + "grad_norm": 101.75831315850915, + "learning_rate": 1.1381262212195074e-06, + "loss": 2.1305, + "mean_token_accuracy": 0.4896551609039307, + "step": 1130 + }, + { + "epoch": 0.0011431840546472195, + "grad_norm": 70.03599602938513, + "learning_rate": 1.143162177950567e-06, + "loss": 2.0372, + "mean_token_accuracy": 0.5034482717514038, + "step": 1135 + }, + { + "epoch": 0.0011482201077513922, + "grad_norm": 98.2055206900503, + "learning_rate": 1.148198134681627e-06, + "loss": 2.3925, + "mean_token_accuracy": 0.4551724135875702, + "step": 1140 + }, + { + "epoch": 0.001153256160855565, + "grad_norm": 92.20626916972, + "learning_rate": 1.1532340914126867e-06, + "loss": 2.4617, + "mean_token_accuracy": 0.39655172228813174, + "step": 1145 + }, + { + "epoch": 0.0011582922139597377, + "grad_norm": 85.11082646832499, + "learning_rate": 1.1582700481437465e-06, + "loss": 2.0419, + "mean_token_accuracy": 0.510344821214676, + "step": 1150 + }, + { + "epoch": 0.0011633282670639106, + "grad_norm": 63.86010897517878, + "learning_rate": 1.1633060048748062e-06, + "loss": 2.381, + "mean_token_accuracy": 0.41724138259887694, + "step": 1155 + }, + { + "epoch": 0.0011683643201680833, + "grad_norm": 125.54070223178105, + "learning_rate": 1.168341961605866e-06, + "loss": 2.2377, + "mean_token_accuracy": 0.4655172288417816, + "step": 1160 + }, + { + "epoch": 0.001173400373272256, + "grad_norm": 90.16110689488018, + "learning_rate": 1.1733779183369258e-06, + "loss": 2.2894, + "mean_token_accuracy": 0.482758617401123, + "step": 1165 + }, + { + "epoch": 0.001178436426376429, + "grad_norm": 83.87535543555006, + "learning_rate": 1.1784138750679855e-06, + "loss": 2.6102, + "mean_token_accuracy": 0.4121597111225128, + "step": 1170 + }, + { + "epoch": 0.0011834724794806016, + "grad_norm": 100.54991805051277, + "learning_rate": 1.1834498317990453e-06, + "loss": 2.1402, + "mean_token_accuracy": 0.5172413766384125, + "step": 1175 + }, + { + "epoch": 0.0011885085325847745, + "grad_norm": 105.51429755094078, + "learning_rate": 1.188485788530105e-06, + "loss": 2.1268, + "mean_token_accuracy": 0.47931033968925474, + "step": 1180 + }, + { + "epoch": 0.0011935445856889472, + "grad_norm": 81.31925779452442, + "learning_rate": 1.193521745261165e-06, + "loss": 2.3923, + "mean_token_accuracy": 0.43103448748588563, + "step": 1185 + }, + { + "epoch": 0.00119858063879312, + "grad_norm": 75.45651145037678, + "learning_rate": 1.1985577019922246e-06, + "loss": 2.0322, + "mean_token_accuracy": 0.4862069010734558, + "step": 1190 + }, + { + "epoch": 0.0012036166918972927, + "grad_norm": 146.180295202791, + "learning_rate": 1.2035936587232843e-06, + "loss": 2.2846, + "mean_token_accuracy": 0.49679802656173705, + "step": 1195 + }, + { + "epoch": 0.0012086527450014654, + "grad_norm": 90.76549548259376, + "learning_rate": 1.2086296154543442e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.44827585816383364, + "step": 1200 + }, + { + "epoch": 0.0012136887981056383, + "grad_norm": 70.80672854832602, + "learning_rate": 1.2136655721854038e-06, + "loss": 2.0853, + "mean_token_accuracy": 0.46551724076271056, + "step": 1205 + }, + { + "epoch": 0.001218724851209811, + "grad_norm": 75.90940406667318, + "learning_rate": 1.2187015289164637e-06, + "loss": 2.4316, + "mean_token_accuracy": 0.41034482717514037, + "step": 1210 + }, + { + "epoch": 0.0012237609043139839, + "grad_norm": 74.55984293879484, + "learning_rate": 1.2237374856475234e-06, + "loss": 2.2343, + "mean_token_accuracy": 0.4724137902259827, + "step": 1215 + }, + { + "epoch": 0.0012287969574181566, + "grad_norm": 61.851756927426095, + "learning_rate": 1.228773442378583e-06, + "loss": 2.1658, + "mean_token_accuracy": 0.47241380214691164, + "step": 1220 + }, + { + "epoch": 0.0012338330105223293, + "grad_norm": 86.12593022767761, + "learning_rate": 1.233809399109643e-06, + "loss": 2.2726, + "mean_token_accuracy": 0.4482758641242981, + "step": 1225 + }, + { + "epoch": 0.0012388690636265022, + "grad_norm": 80.10808653391787, + "learning_rate": 1.2388453558407026e-06, + "loss": 2.4066, + "mean_token_accuracy": 0.41440886855125425, + "step": 1230 + }, + { + "epoch": 0.0012439051167306748, + "grad_norm": 100.8825454342411, + "learning_rate": 1.2438813125717625e-06, + "loss": 2.357, + "mean_token_accuracy": 0.4417487621307373, + "step": 1235 + }, + { + "epoch": 0.0012489411698348477, + "grad_norm": 98.5626113712527, + "learning_rate": 1.2489172693028222e-06, + "loss": 2.5658, + "mean_token_accuracy": 0.42758620977401735, + "step": 1240 + }, + { + "epoch": 0.0012539772229390204, + "grad_norm": 79.78350555896583, + "learning_rate": 1.2539532260338819e-06, + "loss": 2.3719, + "mean_token_accuracy": 0.42758620977401735, + "step": 1245 + }, + { + "epoch": 0.0012590132760431933, + "grad_norm": 92.1415609730985, + "learning_rate": 1.2589891827649418e-06, + "loss": 2.1144, + "mean_token_accuracy": 0.4724137902259827, + "step": 1250 + }, + { + "epoch": 0.001264049329147366, + "grad_norm": 80.28929122349953, + "learning_rate": 1.2640251394960014e-06, + "loss": 2.1751, + "mean_token_accuracy": 0.45517241954803467, + "step": 1255 + }, + { + "epoch": 0.0012690853822515387, + "grad_norm": 98.53427546574116, + "learning_rate": 1.2690610962270613e-06, + "loss": 2.4729, + "mean_token_accuracy": 0.4344827592372894, + "step": 1260 + }, + { + "epoch": 0.0012741214353557116, + "grad_norm": 89.22919365213846, + "learning_rate": 1.274097052958121e-06, + "loss": 2.3517, + "mean_token_accuracy": 0.42758620381355283, + "step": 1265 + }, + { + "epoch": 0.0012791574884598843, + "grad_norm": 66.32414561947962, + "learning_rate": 1.2791330096891807e-06, + "loss": 2.1336, + "mean_token_accuracy": 0.4379310369491577, + "step": 1270 + }, + { + "epoch": 0.0012841935415640572, + "grad_norm": 79.24123706737915, + "learning_rate": 1.2841689664202406e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.4068965494632721, + "step": 1275 + }, + { + "epoch": 0.0012892295946682298, + "grad_norm": 91.26346669185378, + "learning_rate": 1.2892049231513002e-06, + "loss": 2.5666, + "mean_token_accuracy": 0.4103448331356049, + "step": 1280 + }, + { + "epoch": 0.0012942656477724027, + "grad_norm": 123.23330494528199, + "learning_rate": 1.2942408798823601e-06, + "loss": 2.1011, + "mean_token_accuracy": 0.5297640681266784, + "step": 1285 + }, + { + "epoch": 0.0012993017008765754, + "grad_norm": 57.70042962841069, + "learning_rate": 1.2992768366134198e-06, + "loss": 2.5687, + "mean_token_accuracy": 0.4206896543502808, + "step": 1290 + }, + { + "epoch": 0.001304337753980748, + "grad_norm": 96.58903314675543, + "learning_rate": 1.3043127933444795e-06, + "loss": 2.4582, + "mean_token_accuracy": 0.43448275327682495, + "step": 1295 + }, + { + "epoch": 0.001309373807084921, + "grad_norm": 72.08930163850195, + "learning_rate": 1.3093487500755394e-06, + "loss": 2.2915, + "mean_token_accuracy": 0.4689655125141144, + "step": 1300 + }, + { + "epoch": 0.0013144098601890937, + "grad_norm": 105.52358898170489, + "learning_rate": 1.314384706806599e-06, + "loss": 2.063, + "mean_token_accuracy": 0.5034482777118683, + "step": 1305 + }, + { + "epoch": 0.0013194459132932666, + "grad_norm": 85.14372468467099, + "learning_rate": 1.319420663537659e-06, + "loss": 2.3354, + "mean_token_accuracy": 0.4172413766384125, + "step": 1310 + }, + { + "epoch": 0.0013244819663974393, + "grad_norm": 97.53967066326068, + "learning_rate": 1.3244566202687186e-06, + "loss": 2.4497, + "mean_token_accuracy": 0.4413793087005615, + "step": 1315 + }, + { + "epoch": 0.001329518019501612, + "grad_norm": 86.35446091083718, + "learning_rate": 1.3294925769997785e-06, + "loss": 2.63, + "mean_token_accuracy": 0.3931034505367279, + "step": 1320 + }, + { + "epoch": 0.0013345540726057848, + "grad_norm": 126.78052960926874, + "learning_rate": 1.3345285337308382e-06, + "loss": 2.1806, + "mean_token_accuracy": 0.4620689570903778, + "step": 1325 + }, + { + "epoch": 0.0013395901257099575, + "grad_norm": 94.40356981733358, + "learning_rate": 1.339564490461898e-06, + "loss": 2.175, + "mean_token_accuracy": 0.4846340000629425, + "step": 1330 + }, + { + "epoch": 0.0013446261788141304, + "grad_norm": 80.673031220175, + "learning_rate": 1.344600447192958e-06, + "loss": 2.3372, + "mean_token_accuracy": 0.42758620381355283, + "step": 1335 + }, + { + "epoch": 0.001349662231918303, + "grad_norm": 75.00444691515132, + "learning_rate": 1.3496364039240176e-06, + "loss": 2.0388, + "mean_token_accuracy": 0.517241382598877, + "step": 1340 + }, + { + "epoch": 0.001354698285022476, + "grad_norm": 64.32829062489732, + "learning_rate": 1.3546723606550773e-06, + "loss": 2.2268, + "mean_token_accuracy": 0.41724138259887694, + "step": 1345 + }, + { + "epoch": 0.0013597343381266487, + "grad_norm": 75.68791544527417, + "learning_rate": 1.3597083173861372e-06, + "loss": 2.0087, + "mean_token_accuracy": 0.4620689690113068, + "step": 1350 + }, + { + "epoch": 0.0013647703912308214, + "grad_norm": 82.44388548224241, + "learning_rate": 1.3647442741171969e-06, + "loss": 2.1741, + "mean_token_accuracy": 0.47586206793785096, + "step": 1355 + }, + { + "epoch": 0.0013698064443349943, + "grad_norm": 102.22951578421268, + "learning_rate": 1.3697802308482568e-06, + "loss": 2.2479, + "mean_token_accuracy": 0.441379314661026, + "step": 1360 + }, + { + "epoch": 0.001374842497439167, + "grad_norm": 90.29348760683527, + "learning_rate": 1.3748161875793164e-06, + "loss": 2.2465, + "mean_token_accuracy": 0.4620689630508423, + "step": 1365 + }, + { + "epoch": 0.0013798785505433398, + "grad_norm": 82.6765889619488, + "learning_rate": 1.3798521443103761e-06, + "loss": 2.1641, + "mean_token_accuracy": 0.45862067937850953, + "step": 1370 + }, + { + "epoch": 0.0013849146036475125, + "grad_norm": 79.0657462090166, + "learning_rate": 1.384888101041436e-06, + "loss": 2.502, + "mean_token_accuracy": 0.42758620381355283, + "step": 1375 + }, + { + "epoch": 0.0013899506567516854, + "grad_norm": 72.71052100869652, + "learning_rate": 1.3899240577724957e-06, + "loss": 2.638, + "mean_token_accuracy": 0.39655172228813174, + "step": 1380 + }, + { + "epoch": 0.001394986709855858, + "grad_norm": 87.28279095020855, + "learning_rate": 1.3949600145035556e-06, + "loss": 2.3732, + "mean_token_accuracy": 0.41724138259887694, + "step": 1385 + }, + { + "epoch": 0.0014000227629600308, + "grad_norm": 108.05874347851243, + "learning_rate": 1.3999959712346152e-06, + "loss": 2.283, + "mean_token_accuracy": 0.44482759237289426, + "step": 1390 + }, + { + "epoch": 0.0014050588160642037, + "grad_norm": 91.61751858243969, + "learning_rate": 1.405031927965675e-06, + "loss": 2.5354, + "mean_token_accuracy": 0.38620689511299133, + "step": 1395 + }, + { + "epoch": 0.0014100948691683764, + "grad_norm": 81.13759435226928, + "learning_rate": 1.4100678846967348e-06, + "loss": 2.1121, + "mean_token_accuracy": 0.47586206793785096, + "step": 1400 + }, + { + "epoch": 0.0014151309222725492, + "grad_norm": 67.64855921366826, + "learning_rate": 1.4151038414277945e-06, + "loss": 2.1707, + "mean_token_accuracy": 0.4586206912994385, + "step": 1405 + }, + { + "epoch": 0.001420166975376722, + "grad_norm": 92.82286441771318, + "learning_rate": 1.4201397981588544e-06, + "loss": 2.2161, + "mean_token_accuracy": 0.39655172228813174, + "step": 1410 + }, + { + "epoch": 0.0014252030284808948, + "grad_norm": 80.09402502047881, + "learning_rate": 1.425175754889914e-06, + "loss": 2.3469, + "mean_token_accuracy": 0.417241370677948, + "step": 1415 + }, + { + "epoch": 0.0014302390815850675, + "grad_norm": 96.60022128767807, + "learning_rate": 1.4302117116209737e-06, + "loss": 2.1765, + "mean_token_accuracy": 0.4620689690113068, + "step": 1420 + }, + { + "epoch": 0.0014352751346892402, + "grad_norm": 92.51314836577201, + "learning_rate": 1.4352476683520336e-06, + "loss": 2.3332, + "mean_token_accuracy": 0.4344827651977539, + "step": 1425 + }, + { + "epoch": 0.001440311187793413, + "grad_norm": 76.5267903608808, + "learning_rate": 1.4402836250830933e-06, + "loss": 2.454, + "mean_token_accuracy": 0.40816696882247927, + "step": 1430 + }, + { + "epoch": 0.0014453472408975858, + "grad_norm": 112.99849516326708, + "learning_rate": 1.4453195818141532e-06, + "loss": 2.6772, + "mean_token_accuracy": 0.4, + "step": 1435 + }, + { + "epoch": 0.0014503832940017587, + "grad_norm": 78.57110578607713, + "learning_rate": 1.4503555385452129e-06, + "loss": 2.3715, + "mean_token_accuracy": 0.4586206912994385, + "step": 1440 + }, + { + "epoch": 0.0014554193471059314, + "grad_norm": 92.1189167634099, + "learning_rate": 1.4553914952762727e-06, + "loss": 2.3398, + "mean_token_accuracy": 0.441379314661026, + "step": 1445 + }, + { + "epoch": 0.001460455400210104, + "grad_norm": 85.60064817375593, + "learning_rate": 1.4604274520073324e-06, + "loss": 2.0117, + "mean_token_accuracy": 0.5172413766384125, + "step": 1450 + }, + { + "epoch": 0.001465491453314277, + "grad_norm": 116.31359990676597, + "learning_rate": 1.465463408738392e-06, + "loss": 2.3036, + "mean_token_accuracy": 0.4103448212146759, + "step": 1455 + }, + { + "epoch": 0.0014705275064184496, + "grad_norm": 106.96774675136832, + "learning_rate": 1.470499365469452e-06, + "loss": 2.4191, + "mean_token_accuracy": 0.3896551728248596, + "step": 1460 + }, + { + "epoch": 0.0014755635595226225, + "grad_norm": 89.51959518282786, + "learning_rate": 1.4755353222005117e-06, + "loss": 2.3593, + "mean_token_accuracy": 0.4517241358757019, + "step": 1465 + }, + { + "epoch": 0.0014805996126267952, + "grad_norm": 82.50721156898831, + "learning_rate": 1.4805712789315716e-06, + "loss": 2.279, + "mean_token_accuracy": 0.4655172288417816, + "step": 1470 + }, + { + "epoch": 0.001485635665730968, + "grad_norm": 90.02670440792986, + "learning_rate": 1.4856072356626312e-06, + "loss": 2.0858, + "mean_token_accuracy": 0.5021778523921967, + "step": 1475 + }, + { + "epoch": 0.0014906717188351408, + "grad_norm": 83.33805010048364, + "learning_rate": 1.490643192393691e-06, + "loss": 2.4329, + "mean_token_accuracy": 0.4620689690113068, + "step": 1480 + }, + { + "epoch": 0.0014957077719393135, + "grad_norm": 97.84951238368177, + "learning_rate": 1.4956791491247508e-06, + "loss": 2.1651, + "mean_token_accuracy": 0.4620689690113068, + "step": 1485 + }, + { + "epoch": 0.0015007438250434863, + "grad_norm": 114.56671182964665, + "learning_rate": 1.5007151058558105e-06, + "loss": 2.4274, + "mean_token_accuracy": 0.4655172288417816, + "step": 1490 + }, + { + "epoch": 0.001505779878147659, + "grad_norm": 92.29710095724172, + "learning_rate": 1.5057510625868704e-06, + "loss": 2.0575, + "mean_token_accuracy": 0.4689655125141144, + "step": 1495 + }, + { + "epoch": 0.001510815931251832, + "grad_norm": 81.63875752328688, + "learning_rate": 1.51078701931793e-06, + "loss": 2.228, + "mean_token_accuracy": 0.4655172348022461, + "step": 1500 + }, + { + "epoch": 0.0015158519843560046, + "grad_norm": 95.38223895272955, + "learning_rate": 1.5158229760489897e-06, + "loss": 2.5547, + "mean_token_accuracy": 0.417241370677948, + "step": 1505 + }, + { + "epoch": 0.0015208880374601775, + "grad_norm": 86.54306236708071, + "learning_rate": 1.5208589327800496e-06, + "loss": 2.2331, + "mean_token_accuracy": 0.4689655125141144, + "step": 1510 + }, + { + "epoch": 0.0015259240905643502, + "grad_norm": 105.14984086161786, + "learning_rate": 1.5258948895111093e-06, + "loss": 2.3451, + "mean_token_accuracy": 0.4551724076271057, + "step": 1515 + }, + { + "epoch": 0.0015309601436685229, + "grad_norm": 69.71878729515852, + "learning_rate": 1.5309308462421692e-06, + "loss": 2.0062, + "mean_token_accuracy": 0.5, + "step": 1520 + }, + { + "epoch": 0.0015359961967726958, + "grad_norm": 72.75790316207774, + "learning_rate": 1.5359668029732288e-06, + "loss": 2.2077, + "mean_token_accuracy": 0.4517241358757019, + "step": 1525 + }, + { + "epoch": 0.0015410322498768685, + "grad_norm": 105.32959226149646, + "learning_rate": 1.5410027597042885e-06, + "loss": 2.3033, + "mean_token_accuracy": 0.4517241299152374, + "step": 1530 + }, + { + "epoch": 0.0015460683029810413, + "grad_norm": 81.31609279829296, + "learning_rate": 1.5460387164353486e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.46551724076271056, + "step": 1535 + }, + { + "epoch": 0.001551104356085214, + "grad_norm": 120.60692135671961, + "learning_rate": 1.5510746731664083e-06, + "loss": 2.1975, + "mean_token_accuracy": 0.5068965554237366, + "step": 1540 + }, + { + "epoch": 0.001556140409189387, + "grad_norm": 66.07699669915323, + "learning_rate": 1.556110629897468e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.4068965554237366, + "step": 1545 + }, + { + "epoch": 0.0015611764622935596, + "grad_norm": 94.284505799871, + "learning_rate": 1.5611465866285279e-06, + "loss": 2.2898, + "mean_token_accuracy": 0.4482758641242981, + "step": 1550 + }, + { + "epoch": 0.0015662125153977323, + "grad_norm": 101.87430211498186, + "learning_rate": 1.5661825433595875e-06, + "loss": 2.5544, + "mean_token_accuracy": 0.4068965494632721, + "step": 1555 + }, + { + "epoch": 0.0015712485685019052, + "grad_norm": 64.9453871123965, + "learning_rate": 1.5712185000906474e-06, + "loss": 2.1962, + "mean_token_accuracy": 0.506896561384201, + "step": 1560 + }, + { + "epoch": 0.0015762846216060779, + "grad_norm": 102.87728343758079, + "learning_rate": 1.576254456821707e-06, + "loss": 2.2039, + "mean_token_accuracy": 0.482758617401123, + "step": 1565 + }, + { + "epoch": 0.0015813206747102508, + "grad_norm": 57.98800839039422, + "learning_rate": 1.581290413552767e-06, + "loss": 2.187, + "mean_token_accuracy": 0.45862067937850953, + "step": 1570 + }, + { + "epoch": 0.0015863567278144235, + "grad_norm": 83.2383170154284, + "learning_rate": 1.5863263702838267e-06, + "loss": 2.505, + "mean_token_accuracy": 0.4310344815254211, + "step": 1575 + }, + { + "epoch": 0.0015913927809185961, + "grad_norm": 81.55813491742742, + "learning_rate": 1.5913623270148863e-06, + "loss": 2.3187, + "mean_token_accuracy": 0.4482758641242981, + "step": 1580 + }, + { + "epoch": 0.001596428834022769, + "grad_norm": 79.37737258693716, + "learning_rate": 1.5963982837459462e-06, + "loss": 2.1579, + "mean_token_accuracy": 0.47586206197738645, + "step": 1585 + }, + { + "epoch": 0.0016014648871269417, + "grad_norm": 65.85649458113443, + "learning_rate": 1.601434240477006e-06, + "loss": 2.4545, + "mean_token_accuracy": 0.4462794899940491, + "step": 1590 + }, + { + "epoch": 0.0016065009402311146, + "grad_norm": 86.55837076754545, + "learning_rate": 1.6064701972080658e-06, + "loss": 2.2032, + "mean_token_accuracy": 0.5206896483898162, + "step": 1595 + }, + { + "epoch": 0.0016115369933352873, + "grad_norm": 67.68460300678399, + "learning_rate": 1.6115061539391255e-06, + "loss": 2.3538, + "mean_token_accuracy": 0.43103447556495667, + "step": 1600 + }, + { + "epoch": 0.0016165730464394602, + "grad_norm": 97.73554671726215, + "learning_rate": 1.6165421106701851e-06, + "loss": 2.3681, + "mean_token_accuracy": 0.41379310488700866, + "step": 1605 + }, + { + "epoch": 0.0016216090995436329, + "grad_norm": 89.9015968600415, + "learning_rate": 1.621578067401245e-06, + "loss": 2.1631, + "mean_token_accuracy": 0.4931034505367279, + "step": 1610 + }, + { + "epoch": 0.0016266451526478056, + "grad_norm": 79.46924592264445, + "learning_rate": 1.6266140241323047e-06, + "loss": 2.3175, + "mean_token_accuracy": 0.43103448748588563, + "step": 1615 + }, + { + "epoch": 0.0016316812057519784, + "grad_norm": 73.13835634105239, + "learning_rate": 1.6316499808633646e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.47241380214691164, + "step": 1620 + }, + { + "epoch": 0.0016367172588561511, + "grad_norm": 75.7165498923317, + "learning_rate": 1.6366859375944243e-06, + "loss": 2.2476, + "mean_token_accuracy": 0.47241378426551817, + "step": 1625 + }, + { + "epoch": 0.001641753311960324, + "grad_norm": 99.96607029375616, + "learning_rate": 1.641721894325484e-06, + "loss": 2.0968, + "mean_token_accuracy": 0.5034482717514038, + "step": 1630 + }, + { + "epoch": 0.0016467893650644967, + "grad_norm": 79.96250185977877, + "learning_rate": 1.6467578510565438e-06, + "loss": 2.3436, + "mean_token_accuracy": 0.4344827473163605, + "step": 1635 + }, + { + "epoch": 0.0016518254181686696, + "grad_norm": 95.68949071193957, + "learning_rate": 1.6517938077876035e-06, + "loss": 2.1151, + "mean_token_accuracy": 0.4551724135875702, + "step": 1640 + }, + { + "epoch": 0.0016568614712728423, + "grad_norm": 95.36760843473598, + "learning_rate": 1.6568297645186634e-06, + "loss": 2.5977, + "mean_token_accuracy": 0.4172413766384125, + "step": 1645 + }, + { + "epoch": 0.001661897524377015, + "grad_norm": 86.1813138303892, + "learning_rate": 1.661865721249723e-06, + "loss": 2.0947, + "mean_token_accuracy": 0.4947973370552063, + "step": 1650 + }, + { + "epoch": 0.0016669335774811879, + "grad_norm": 97.68301957398029, + "learning_rate": 1.6669016779807828e-06, + "loss": 2.3633, + "mean_token_accuracy": 0.4258318305015564, + "step": 1655 + }, + { + "epoch": 0.0016719696305853606, + "grad_norm": 90.99186538858493, + "learning_rate": 1.6719376347118426e-06, + "loss": 2.5167, + "mean_token_accuracy": 0.38620689511299133, + "step": 1660 + }, + { + "epoch": 0.0016770056836895334, + "grad_norm": 80.15738544479304, + "learning_rate": 1.6769735914429023e-06, + "loss": 2.1744, + "mean_token_accuracy": 0.46551724076271056, + "step": 1665 + }, + { + "epoch": 0.0016820417367937061, + "grad_norm": 62.455297579323094, + "learning_rate": 1.6820095481739622e-06, + "loss": 1.7986, + "mean_token_accuracy": 0.5172413766384125, + "step": 1670 + }, + { + "epoch": 0.0016870777898978788, + "grad_norm": 93.5258823881037, + "learning_rate": 1.6870455049050219e-06, + "loss": 2.2099, + "mean_token_accuracy": 0.4620689630508423, + "step": 1675 + }, + { + "epoch": 0.0016921138430020517, + "grad_norm": 81.85947049962068, + "learning_rate": 1.6920814616360816e-06, + "loss": 2.2992, + "mean_token_accuracy": 0.4344827592372894, + "step": 1680 + }, + { + "epoch": 0.0016971498961062244, + "grad_norm": 83.11223935967024, + "learning_rate": 1.6971174183671415e-06, + "loss": 2.2199, + "mean_token_accuracy": 0.4517241418361664, + "step": 1685 + }, + { + "epoch": 0.0017021859492103973, + "grad_norm": 72.37325634818649, + "learning_rate": 1.7021533750982011e-06, + "loss": 1.9433, + "mean_token_accuracy": 0.517241370677948, + "step": 1690 + }, + { + "epoch": 0.00170722200231457, + "grad_norm": 62.67619225503342, + "learning_rate": 1.707189331829261e-06, + "loss": 2.4231, + "mean_token_accuracy": 0.41881426572799685, + "step": 1695 + }, + { + "epoch": 0.0017122580554187429, + "grad_norm": 76.68126850923235, + "learning_rate": 1.7122252885603207e-06, + "loss": 2.2686, + "mean_token_accuracy": 0.46206897497177124, + "step": 1700 + }, + { + "epoch": 0.0017172941085229155, + "grad_norm": 117.7889306707762, + "learning_rate": 1.7172612452913806e-06, + "loss": 2.3816, + "mean_token_accuracy": 0.4517241358757019, + "step": 1705 + }, + { + "epoch": 0.0017223301616270882, + "grad_norm": 84.14255536308399, + "learning_rate": 1.7222972020224403e-06, + "loss": 2.6233, + "mean_token_accuracy": 0.4241379380226135, + "step": 1710 + }, + { + "epoch": 0.0017273662147312611, + "grad_norm": 69.09697402332309, + "learning_rate": 1.7273331587535e-06, + "loss": 2.2852, + "mean_token_accuracy": 0.4620689630508423, + "step": 1715 + }, + { + "epoch": 0.0017324022678354338, + "grad_norm": 78.92314362178588, + "learning_rate": 1.7323691154845598e-06, + "loss": 2.3568, + "mean_token_accuracy": 0.4551724076271057, + "step": 1720 + }, + { + "epoch": 0.0017374383209396067, + "grad_norm": 139.17619606608554, + "learning_rate": 1.7374050722156195e-06, + "loss": 2.4275, + "mean_token_accuracy": 0.4620689690113068, + "step": 1725 + }, + { + "epoch": 0.0017424743740437794, + "grad_norm": 90.20116612955967, + "learning_rate": 1.7424410289466794e-06, + "loss": 2.1705, + "mean_token_accuracy": 0.4896551728248596, + "step": 1730 + }, + { + "epoch": 0.0017475104271479523, + "grad_norm": 99.60120723489602, + "learning_rate": 1.747476985677739e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.4413793087005615, + "step": 1735 + }, + { + "epoch": 0.001752546480252125, + "grad_norm": 77.20807801933192, + "learning_rate": 1.7525129424087987e-06, + "loss": 1.9387, + "mean_token_accuracy": 0.5103448331356049, + "step": 1740 + }, + { + "epoch": 0.0017575825333562977, + "grad_norm": 76.41242622887216, + "learning_rate": 1.7575488991398586e-06, + "loss": 2.2007, + "mean_token_accuracy": 0.4482758641242981, + "step": 1745 + }, + { + "epoch": 0.0017626185864604705, + "grad_norm": 80.8139352793975, + "learning_rate": 1.7625848558709183e-06, + "loss": 2.1226, + "mean_token_accuracy": 0.44827587008476255, + "step": 1750 + }, + { + "epoch": 0.0017676546395646432, + "grad_norm": 76.4179824339492, + "learning_rate": 1.7676208126019782e-06, + "loss": 2.0023, + "mean_token_accuracy": 0.5344827592372894, + "step": 1755 + }, + { + "epoch": 0.0017726906926688161, + "grad_norm": 95.62054558276478, + "learning_rate": 1.7726567693330379e-06, + "loss": 1.972, + "mean_token_accuracy": 0.5056261241436004, + "step": 1760 + }, + { + "epoch": 0.0017777267457729888, + "grad_norm": 72.46220370172047, + "learning_rate": 1.7776927260640975e-06, + "loss": 2.3215, + "mean_token_accuracy": 0.4325468838214874, + "step": 1765 + }, + { + "epoch": 0.0017827627988771617, + "grad_norm": 128.46683836513, + "learning_rate": 1.7827286827951574e-06, + "loss": 2.3107, + "mean_token_accuracy": 0.4517241299152374, + "step": 1770 + }, + { + "epoch": 0.0017877988519813344, + "grad_norm": 112.44825568435205, + "learning_rate": 1.7877646395262171e-06, + "loss": 1.9738, + "mean_token_accuracy": 0.5294010818004609, + "step": 1775 + }, + { + "epoch": 0.001792834905085507, + "grad_norm": 105.88635733073585, + "learning_rate": 1.792800596257277e-06, + "loss": 2.4159, + "mean_token_accuracy": 0.42758620977401735, + "step": 1780 + }, + { + "epoch": 0.00179787095818968, + "grad_norm": 76.56275729629095, + "learning_rate": 1.7978365529883367e-06, + "loss": 2.1934, + "mean_token_accuracy": 0.4517241358757019, + "step": 1785 + }, + { + "epoch": 0.0018029070112938526, + "grad_norm": 96.91990849635508, + "learning_rate": 1.8028725097193964e-06, + "loss": 2.3044, + "mean_token_accuracy": 0.47931034564971925, + "step": 1790 + }, + { + "epoch": 0.0018079430643980255, + "grad_norm": 89.37010754407544, + "learning_rate": 1.8079084664504562e-06, + "loss": 2.1361, + "mean_token_accuracy": 0.4965517222881317, + "step": 1795 + }, + { + "epoch": 0.0018129791175021982, + "grad_norm": 73.72513138969829, + "learning_rate": 1.812944423181516e-06, + "loss": 2.2718, + "mean_token_accuracy": 0.4137930989265442, + "step": 1800 + }, + { + "epoch": 0.001818015170606371, + "grad_norm": 92.87287243238539, + "learning_rate": 1.8179803799125758e-06, + "loss": 2.5586, + "mean_token_accuracy": 0.4310344815254211, + "step": 1805 + }, + { + "epoch": 0.0018230512237105438, + "grad_norm": 82.8126099115831, + "learning_rate": 1.8230163366436355e-06, + "loss": 1.9987, + "mean_token_accuracy": 0.47398667931556704, + "step": 1810 + }, + { + "epoch": 0.0018280872768147165, + "grad_norm": 74.56083754345453, + "learning_rate": 1.8280522933746952e-06, + "loss": 2.2487, + "mean_token_accuracy": 0.4551724076271057, + "step": 1815 + }, + { + "epoch": 0.0018331233299188894, + "grad_norm": 77.35069789149611, + "learning_rate": 1.833088250105755e-06, + "loss": 2.3571, + "mean_token_accuracy": 0.41379311084747317, + "step": 1820 + }, + { + "epoch": 0.001838159383023062, + "grad_norm": 74.50495153809906, + "learning_rate": 1.8381242068368147e-06, + "loss": 1.8451, + "mean_token_accuracy": 0.5778325200080872, + "step": 1825 + }, + { + "epoch": 0.001843195436127235, + "grad_norm": 65.97754985371876, + "learning_rate": 1.8431601635678746e-06, + "loss": 2.227, + "mean_token_accuracy": 0.47586206793785096, + "step": 1830 + }, + { + "epoch": 0.0018482314892314076, + "grad_norm": 90.9212629625924, + "learning_rate": 1.8481961202989343e-06, + "loss": 1.9323, + "mean_token_accuracy": 0.5206896543502808, + "step": 1835 + }, + { + "epoch": 0.0018532675423355803, + "grad_norm": 73.71458988607003, + "learning_rate": 1.8532320770299942e-06, + "loss": 2.3401, + "mean_token_accuracy": 0.5034482777118683, + "step": 1840 + }, + { + "epoch": 0.0018583035954397532, + "grad_norm": 67.14808135899057, + "learning_rate": 1.8582680337610543e-06, + "loss": 2.4888, + "mean_token_accuracy": 0.4310344815254211, + "step": 1845 + }, + { + "epoch": 0.001863339648543926, + "grad_norm": 109.35527297957105, + "learning_rate": 1.863303990492114e-06, + "loss": 2.3476, + "mean_token_accuracy": 0.46551724076271056, + "step": 1850 + }, + { + "epoch": 0.0018683757016480988, + "grad_norm": 104.65003734665592, + "learning_rate": 1.8683399472231736e-06, + "loss": 2.1779, + "mean_token_accuracy": 0.4620689630508423, + "step": 1855 + }, + { + "epoch": 0.0018734117547522715, + "grad_norm": 63.45899564844917, + "learning_rate": 1.8733759039542335e-06, + "loss": 2.1909, + "mean_token_accuracy": 0.46896551847457885, + "step": 1860 + }, + { + "epoch": 0.0018784478078564444, + "grad_norm": 98.1692066952957, + "learning_rate": 1.8784118606852932e-06, + "loss": 2.1605, + "mean_token_accuracy": 0.49655172824859617, + "step": 1865 + }, + { + "epoch": 0.001883483860960617, + "grad_norm": 72.12043589392583, + "learning_rate": 1.883447817416353e-06, + "loss": 1.9092, + "mean_token_accuracy": 0.5310344755649566, + "step": 1870 + }, + { + "epoch": 0.0018885199140647897, + "grad_norm": 63.38032620835685, + "learning_rate": 1.8884837741474128e-06, + "loss": 2.307, + "mean_token_accuracy": 0.42413792610168455, + "step": 1875 + }, + { + "epoch": 0.0018935559671689626, + "grad_norm": 97.51558259527754, + "learning_rate": 1.8935197308784724e-06, + "loss": 2.2036, + "mean_token_accuracy": 0.43793103098869324, + "step": 1880 + }, + { + "epoch": 0.0018985920202731353, + "grad_norm": 81.28538443821304, + "learning_rate": 1.8985556876095323e-06, + "loss": 2.4599, + "mean_token_accuracy": 0.42758620977401735, + "step": 1885 + }, + { + "epoch": 0.0019036280733773082, + "grad_norm": 93.11508609155473, + "learning_rate": 1.903591644340592e-06, + "loss": 2.2178, + "mean_token_accuracy": 0.45862067937850953, + "step": 1890 + }, + { + "epoch": 0.001908664126481481, + "grad_norm": 94.4600094375921, + "learning_rate": 1.9086276010716517e-06, + "loss": 2.3936, + "mean_token_accuracy": 0.4172413766384125, + "step": 1895 + }, + { + "epoch": 0.0019137001795856536, + "grad_norm": 107.17710083956324, + "learning_rate": 1.9136635578027118e-06, + "loss": 2.0258, + "mean_token_accuracy": 0.5034482836723327, + "step": 1900 + }, + { + "epoch": 0.0019187362326898265, + "grad_norm": 117.47541846574994, + "learning_rate": 1.9186995145337714e-06, + "loss": 2.5823, + "mean_token_accuracy": 0.42413792610168455, + "step": 1905 + }, + { + "epoch": 0.0019237722857939992, + "grad_norm": 66.79156739266057, + "learning_rate": 1.923735471264831e-06, + "loss": 2.2979, + "mean_token_accuracy": 0.4517241418361664, + "step": 1910 + }, + { + "epoch": 0.001928808338898172, + "grad_norm": 78.40004200556646, + "learning_rate": 1.928771427995891e-06, + "loss": 2.1359, + "mean_token_accuracy": 0.4931034564971924, + "step": 1915 + }, + { + "epoch": 0.0019338443920023447, + "grad_norm": 79.92592154394615, + "learning_rate": 1.9338073847269505e-06, + "loss": 1.9515, + "mean_token_accuracy": 0.5430732011795044, + "step": 1920 + }, + { + "epoch": 0.0019388804451065176, + "grad_norm": 79.20333698917693, + "learning_rate": 1.9388433414580106e-06, + "loss": 2.5004, + "mean_token_accuracy": 0.37241379022598264, + "step": 1925 + }, + { + "epoch": 0.0019439164982106903, + "grad_norm": 99.99097699670727, + "learning_rate": 1.9438792981890703e-06, + "loss": 2.4431, + "mean_token_accuracy": 0.37586206793785093, + "step": 1930 + }, + { + "epoch": 0.001948952551314863, + "grad_norm": 89.71904440600169, + "learning_rate": 1.94891525492013e-06, + "loss": 2.0776, + "mean_token_accuracy": 0.47241379618644713, + "step": 1935 + }, + { + "epoch": 0.001953988604419036, + "grad_norm": 75.47698662083619, + "learning_rate": 1.9539512116511896e-06, + "loss": 2.426, + "mean_token_accuracy": 0.45517241954803467, + "step": 1940 + }, + { + "epoch": 0.001959024657523209, + "grad_norm": 70.85028722040185, + "learning_rate": 1.9589871683822493e-06, + "loss": 2.4601, + "mean_token_accuracy": 0.4379310429096222, + "step": 1945 + }, + { + "epoch": 0.0019640607106273813, + "grad_norm": 93.65941348272948, + "learning_rate": 1.9640231251133094e-06, + "loss": 2.3995, + "mean_token_accuracy": 0.4517241418361664, + "step": 1950 + }, + { + "epoch": 0.001969096763731554, + "grad_norm": 80.73007515789497, + "learning_rate": 1.969059081844369e-06, + "loss": 2.4547, + "mean_token_accuracy": 0.43103447556495667, + "step": 1955 + }, + { + "epoch": 0.001974132816835727, + "grad_norm": 75.54402884891343, + "learning_rate": 1.9740950385754287e-06, + "loss": 2.1322, + "mean_token_accuracy": 0.44137930274009707, + "step": 1960 + }, + { + "epoch": 0.0019791688699398995, + "grad_norm": 112.67510206807452, + "learning_rate": 1.9791309953064884e-06, + "loss": 1.8569, + "mean_token_accuracy": 0.5344827592372894, + "step": 1965 + }, + { + "epoch": 0.0019842049230440724, + "grad_norm": 91.16433235557035, + "learning_rate": 1.984166952037548e-06, + "loss": 2.4891, + "mean_token_accuracy": 0.4172413766384125, + "step": 1970 + }, + { + "epoch": 0.0019892409761482453, + "grad_norm": 83.11854793895924, + "learning_rate": 1.989202908768608e-06, + "loss": 2.2815, + "mean_token_accuracy": 0.4620689690113068, + "step": 1975 + }, + { + "epoch": 0.0019942770292524182, + "grad_norm": 92.17564771902191, + "learning_rate": 1.994238865499668e-06, + "loss": 2.2691, + "mean_token_accuracy": 0.4586206912994385, + "step": 1980 + }, + { + "epoch": 0.0019993130823565907, + "grad_norm": 82.1739264613652, + "learning_rate": 1.9992748222307275e-06, + "loss": 2.4372, + "mean_token_accuracy": 0.40344828069210054, + "step": 1985 + }, + { + "epoch": 0.0020043491354607636, + "grad_norm": 77.13165118044004, + "learning_rate": 2.0043107789617872e-06, + "loss": 2.0082, + "mean_token_accuracy": 0.5137930989265442, + "step": 1990 + }, + { + "epoch": 0.0020093851885649365, + "grad_norm": 84.79058491562802, + "learning_rate": 2.009346735692847e-06, + "loss": 2.3669, + "mean_token_accuracy": 0.45517241954803467, + "step": 1995 + }, + { + "epoch": 0.002014421241669109, + "grad_norm": 80.37046342006643, + "learning_rate": 2.014382692423907e-06, + "loss": 1.8865, + "mean_token_accuracy": 0.5241379320621491, + "step": 2000 + }, + { + "epoch": 0.002019457294773282, + "grad_norm": 81.49479556557111, + "learning_rate": 2.0194186491549667e-06, + "loss": 2.4085, + "mean_token_accuracy": 0.493103438615799, + "step": 2005 + }, + { + "epoch": 0.0020244933478774547, + "grad_norm": 56.22427568650456, + "learning_rate": 2.0244546058860263e-06, + "loss": 2.5069, + "mean_token_accuracy": 0.4034482777118683, + "step": 2010 + }, + { + "epoch": 0.0020295294009816276, + "grad_norm": 94.38835039101788, + "learning_rate": 2.029490562617086e-06, + "loss": 2.2433, + "mean_token_accuracy": 0.43103447556495667, + "step": 2015 + }, + { + "epoch": 0.0020345654540858, + "grad_norm": 85.49904954261659, + "learning_rate": 2.0345265193481457e-06, + "loss": 2.1228, + "mean_token_accuracy": 0.441379314661026, + "step": 2020 + }, + { + "epoch": 0.002039601507189973, + "grad_norm": 90.15610942839515, + "learning_rate": 2.039562476079206e-06, + "loss": 2.217, + "mean_token_accuracy": 0.46896551847457885, + "step": 2025 + }, + { + "epoch": 0.002044637560294146, + "grad_norm": 66.39729329968833, + "learning_rate": 2.0445984328102655e-06, + "loss": 2.2885, + "mean_token_accuracy": 0.4517241358757019, + "step": 2030 + }, + { + "epoch": 0.0020496736133983184, + "grad_norm": 65.84731199639643, + "learning_rate": 2.049634389541325e-06, + "loss": 2.1231, + "mean_token_accuracy": 0.47241378426551817, + "step": 2035 + }, + { + "epoch": 0.0020547096665024913, + "grad_norm": 130.13767097405986, + "learning_rate": 2.054670346272385e-06, + "loss": 2.0289, + "mean_token_accuracy": 0.5103448152542114, + "step": 2040 + }, + { + "epoch": 0.002059745719606664, + "grad_norm": 54.395648527616, + "learning_rate": 2.0597063030034445e-06, + "loss": 2.2106, + "mean_token_accuracy": 0.5137931048870087, + "step": 2045 + }, + { + "epoch": 0.002064781772710837, + "grad_norm": 134.39841460779195, + "learning_rate": 2.0647422597345046e-06, + "loss": 2.413, + "mean_token_accuracy": 0.44313369393348695, + "step": 2050 + }, + { + "epoch": 0.0020698178258150095, + "grad_norm": 120.07876619471952, + "learning_rate": 2.0697782164655643e-06, + "loss": 2.5518, + "mean_token_accuracy": 0.44482758045196535, + "step": 2055 + }, + { + "epoch": 0.0020748538789191824, + "grad_norm": 72.39383002650447, + "learning_rate": 2.074814173196624e-06, + "loss": 2.2809, + "mean_token_accuracy": 0.47931034564971925, + "step": 2060 + }, + { + "epoch": 0.0020798899320233553, + "grad_norm": 97.62233220271177, + "learning_rate": 2.0798501299276836e-06, + "loss": 2.2048, + "mean_token_accuracy": 0.43103448748588563, + "step": 2065 + }, + { + "epoch": 0.002084925985127528, + "grad_norm": 94.22741555340882, + "learning_rate": 2.0848860866587433e-06, + "loss": 2.6581, + "mean_token_accuracy": 0.3773139715194702, + "step": 2070 + }, + { + "epoch": 0.0020899620382317007, + "grad_norm": 72.49275371459426, + "learning_rate": 2.0899220433898034e-06, + "loss": 2.3135, + "mean_token_accuracy": 0.4517241358757019, + "step": 2075 + }, + { + "epoch": 0.0020949980913358736, + "grad_norm": 92.5064895978771, + "learning_rate": 2.094958000120863e-06, + "loss": 1.944, + "mean_token_accuracy": 0.5214285731315613, + "step": 2080 + }, + { + "epoch": 0.0021000341444400465, + "grad_norm": 64.00100532892989, + "learning_rate": 2.0999939568519228e-06, + "loss": 2.033, + "mean_token_accuracy": 0.5137931048870087, + "step": 2085 + }, + { + "epoch": 0.002105070197544219, + "grad_norm": 87.93719120962002, + "learning_rate": 2.1050299135829824e-06, + "loss": 2.0544, + "mean_token_accuracy": 0.5052631616592407, + "step": 2090 + }, + { + "epoch": 0.002110106250648392, + "grad_norm": 73.45534817073921, + "learning_rate": 2.110065870314042e-06, + "loss": 2.2357, + "mean_token_accuracy": 0.46896551847457885, + "step": 2095 + }, + { + "epoch": 0.0021151423037525647, + "grad_norm": 94.14541146787158, + "learning_rate": 2.1151018270451022e-06, + "loss": 2.5262, + "mean_token_accuracy": 0.42758620381355283, + "step": 2100 + }, + { + "epoch": 0.002120178356856737, + "grad_norm": 56.35782310806002, + "learning_rate": 2.120137783776162e-06, + "loss": 2.2755, + "mean_token_accuracy": 0.42758620977401735, + "step": 2105 + }, + { + "epoch": 0.00212521440996091, + "grad_norm": 93.10910782240434, + "learning_rate": 2.1251737405072216e-06, + "loss": 2.3457, + "mean_token_accuracy": 0.46896551847457885, + "step": 2110 + }, + { + "epoch": 0.002130250463065083, + "grad_norm": 89.00218583542788, + "learning_rate": 2.1302096972382812e-06, + "loss": 2.0866, + "mean_token_accuracy": 0.47241379618644713, + "step": 2115 + }, + { + "epoch": 0.002135286516169256, + "grad_norm": 104.72491235268645, + "learning_rate": 2.135245653969341e-06, + "loss": 2.4817, + "mean_token_accuracy": 0.44137930274009707, + "step": 2120 + }, + { + "epoch": 0.0021403225692734284, + "grad_norm": 64.98428934631258, + "learning_rate": 2.140281610700401e-06, + "loss": 2.343, + "mean_token_accuracy": 0.4241379380226135, + "step": 2125 + }, + { + "epoch": 0.0021453586223776013, + "grad_norm": 73.36348079388735, + "learning_rate": 2.1453175674314607e-06, + "loss": 2.1639, + "mean_token_accuracy": 0.482758617401123, + "step": 2130 + }, + { + "epoch": 0.002150394675481774, + "grad_norm": 107.03010254220776, + "learning_rate": 2.1503535241625204e-06, + "loss": 2.5505, + "mean_token_accuracy": 0.4586206912994385, + "step": 2135 + }, + { + "epoch": 0.0021554307285859466, + "grad_norm": 66.03717059466969, + "learning_rate": 2.15538948089358e-06, + "loss": 2.0671, + "mean_token_accuracy": 0.46551724076271056, + "step": 2140 + }, + { + "epoch": 0.0021604667816901195, + "grad_norm": 79.19439165546005, + "learning_rate": 2.16042543762464e-06, + "loss": 2.1992, + "mean_token_accuracy": 0.46896552443504336, + "step": 2145 + }, + { + "epoch": 0.0021655028347942924, + "grad_norm": 82.24561213588349, + "learning_rate": 2.1654613943557e-06, + "loss": 2.1769, + "mean_token_accuracy": 0.4620689630508423, + "step": 2150 + }, + { + "epoch": 0.0021705388878984653, + "grad_norm": 76.36154367829377, + "learning_rate": 2.1704973510867595e-06, + "loss": 2.3223, + "mean_token_accuracy": 0.44652147889137267, + "step": 2155 + }, + { + "epoch": 0.002175574941002638, + "grad_norm": 108.17682750951424, + "learning_rate": 2.175533307817819e-06, + "loss": 2.4235, + "mean_token_accuracy": 0.4551724076271057, + "step": 2160 + }, + { + "epoch": 0.0021806109941068107, + "grad_norm": 77.20581243724298, + "learning_rate": 2.180569264548879e-06, + "loss": 2.1487, + "mean_token_accuracy": 0.43448275327682495, + "step": 2165 + }, + { + "epoch": 0.0021856470472109836, + "grad_norm": 108.63764512169979, + "learning_rate": 2.185605221279939e-06, + "loss": 2.113, + "mean_token_accuracy": 0.47586206793785096, + "step": 2170 + }, + { + "epoch": 0.002190683100315156, + "grad_norm": 127.56322936980933, + "learning_rate": 2.1906411780109986e-06, + "loss": 2.0053, + "mean_token_accuracy": 0.49655172824859617, + "step": 2175 + }, + { + "epoch": 0.002195719153419329, + "grad_norm": 55.45017324660117, + "learning_rate": 2.1956771347420583e-06, + "loss": 2.3794, + "mean_token_accuracy": 0.42068964838981626, + "step": 2180 + }, + { + "epoch": 0.002200755206523502, + "grad_norm": 85.81602701764474, + "learning_rate": 2.200713091473118e-06, + "loss": 2.4024, + "mean_token_accuracy": 0.37586206793785093, + "step": 2185 + }, + { + "epoch": 0.0022057912596276743, + "grad_norm": 88.05309187658263, + "learning_rate": 2.2057490482041777e-06, + "loss": 2.1531, + "mean_token_accuracy": 0.45862067937850953, + "step": 2190 + }, + { + "epoch": 0.002210827312731847, + "grad_norm": 87.3300889846787, + "learning_rate": 2.2107850049352378e-06, + "loss": 2.2589, + "mean_token_accuracy": 0.4517241358757019, + "step": 2195 + }, + { + "epoch": 0.00221586336583602, + "grad_norm": 80.27639809690658, + "learning_rate": 2.2158209616662974e-06, + "loss": 2.5973, + "mean_token_accuracy": 0.3965517282485962, + "step": 2200 + }, + { + "epoch": 0.002220899418940193, + "grad_norm": 78.04712385778724, + "learning_rate": 2.220856918397357e-06, + "loss": 2.4277, + "mean_token_accuracy": 0.46896552443504336, + "step": 2205 + }, + { + "epoch": 0.0022259354720443655, + "grad_norm": 92.64826813063166, + "learning_rate": 2.225892875128417e-06, + "loss": 2.1198, + "mean_token_accuracy": 0.4896551728248596, + "step": 2210 + }, + { + "epoch": 0.0022309715251485384, + "grad_norm": 89.20145562977099, + "learning_rate": 2.2309288318594765e-06, + "loss": 1.9321, + "mean_token_accuracy": 0.4896551728248596, + "step": 2215 + }, + { + "epoch": 0.0022360075782527113, + "grad_norm": 75.11550738769606, + "learning_rate": 2.2359647885905366e-06, + "loss": 2.4328, + "mean_token_accuracy": 0.4379310250282288, + "step": 2220 + }, + { + "epoch": 0.0022410436313568837, + "grad_norm": 83.96441877070305, + "learning_rate": 2.2410007453215962e-06, + "loss": 2.5839, + "mean_token_accuracy": 0.37931033968925476, + "step": 2225 + }, + { + "epoch": 0.0022460796844610566, + "grad_norm": 95.40399944514314, + "learning_rate": 2.246036702052656e-06, + "loss": 2.1122, + "mean_token_accuracy": 0.482758617401123, + "step": 2230 + }, + { + "epoch": 0.0022511157375652295, + "grad_norm": 74.50641584187392, + "learning_rate": 2.2510726587837156e-06, + "loss": 1.904, + "mean_token_accuracy": 0.517241370677948, + "step": 2235 + }, + { + "epoch": 0.0022561517906694024, + "grad_norm": 67.31716477555766, + "learning_rate": 2.2561086155147753e-06, + "loss": 2.0203, + "mean_token_accuracy": 0.5, + "step": 2240 + }, + { + "epoch": 0.002261187843773575, + "grad_norm": 64.81349773051566, + "learning_rate": 2.2611445722458354e-06, + "loss": 2.4687, + "mean_token_accuracy": 0.4551724076271057, + "step": 2245 + }, + { + "epoch": 0.002266223896877748, + "grad_norm": 109.77026567699987, + "learning_rate": 2.2661805289768955e-06, + "loss": 2.349, + "mean_token_accuracy": 0.46551724076271056, + "step": 2250 + }, + { + "epoch": 0.0022712599499819207, + "grad_norm": 120.97306916821232, + "learning_rate": 2.271216485707955e-06, + "loss": 2.5395, + "mean_token_accuracy": 0.3620689630508423, + "step": 2255 + }, + { + "epoch": 0.002276296003086093, + "grad_norm": 94.8291436898578, + "learning_rate": 2.276252442439015e-06, + "loss": 2.6694, + "mean_token_accuracy": 0.41724138259887694, + "step": 2260 + }, + { + "epoch": 0.002281332056190266, + "grad_norm": 86.98101902489451, + "learning_rate": 2.2812883991700745e-06, + "loss": 2.4048, + "mean_token_accuracy": 0.4448275864124298, + "step": 2265 + }, + { + "epoch": 0.002286368109294439, + "grad_norm": 95.56511129735804, + "learning_rate": 2.286324355901134e-06, + "loss": 2.1267, + "mean_token_accuracy": 0.4689655065536499, + "step": 2270 + }, + { + "epoch": 0.002291404162398612, + "grad_norm": 104.90206108988845, + "learning_rate": 2.2913603126321943e-06, + "loss": 2.3055, + "mean_token_accuracy": 0.4206896543502808, + "step": 2275 + }, + { + "epoch": 0.0022964402155027843, + "grad_norm": 74.27571117193528, + "learning_rate": 2.296396269363254e-06, + "loss": 1.9802, + "mean_token_accuracy": 0.5000000059604645, + "step": 2280 + }, + { + "epoch": 0.002301476268606957, + "grad_norm": 72.30702953692304, + "learning_rate": 2.3014322260943136e-06, + "loss": 2.2243, + "mean_token_accuracy": 0.493103438615799, + "step": 2285 + }, + { + "epoch": 0.00230651232171113, + "grad_norm": 84.27252913417315, + "learning_rate": 2.3064681828253733e-06, + "loss": 1.8297, + "mean_token_accuracy": 0.4758620738983154, + "step": 2290 + }, + { + "epoch": 0.0023115483748153026, + "grad_norm": 88.51176814283133, + "learning_rate": 2.311504139556433e-06, + "loss": 2.4405, + "mean_token_accuracy": 0.47586206793785096, + "step": 2295 + }, + { + "epoch": 0.0023165844279194755, + "grad_norm": 82.73835152115163, + "learning_rate": 2.316540096287493e-06, + "loss": 2.5402, + "mean_token_accuracy": 0.4379310369491577, + "step": 2300 + }, + { + "epoch": 0.0023216204810236484, + "grad_norm": 64.3044780421925, + "learning_rate": 2.3215760530185528e-06, + "loss": 2.1602, + "mean_token_accuracy": 0.4448275864124298, + "step": 2305 + }, + { + "epoch": 0.0023266565341278213, + "grad_norm": 74.1918294250614, + "learning_rate": 2.3266120097496124e-06, + "loss": 2.3048, + "mean_token_accuracy": 0.4517241418361664, + "step": 2310 + }, + { + "epoch": 0.0023316925872319937, + "grad_norm": 83.60794216618378, + "learning_rate": 2.331647966480672e-06, + "loss": 1.9677, + "mean_token_accuracy": 0.4965517222881317, + "step": 2315 + }, + { + "epoch": 0.0023367286403361666, + "grad_norm": 89.93528196412021, + "learning_rate": 2.336683923211732e-06, + "loss": 2.0785, + "mean_token_accuracy": 0.47586207985877993, + "step": 2320 + }, + { + "epoch": 0.0023417646934403395, + "grad_norm": 86.46648739040855, + "learning_rate": 2.341719879942792e-06, + "loss": 2.1121, + "mean_token_accuracy": 0.4918330252170563, + "step": 2325 + }, + { + "epoch": 0.002346800746544512, + "grad_norm": 67.5352938098029, + "learning_rate": 2.3467558366738516e-06, + "loss": 2.1478, + "mean_token_accuracy": 0.49655171632766726, + "step": 2330 + }, + { + "epoch": 0.002351836799648685, + "grad_norm": 108.5068336301013, + "learning_rate": 2.3517917934049112e-06, + "loss": 2.2149, + "mean_token_accuracy": 0.4586206912994385, + "step": 2335 + }, + { + "epoch": 0.002356872852752858, + "grad_norm": 103.22246022867125, + "learning_rate": 2.356827750135971e-06, + "loss": 2.0018, + "mean_token_accuracy": 0.5137930989265442, + "step": 2340 + }, + { + "epoch": 0.0023619089058570307, + "grad_norm": 94.32368257825978, + "learning_rate": 2.3618637068670306e-06, + "loss": 2.1515, + "mean_token_accuracy": 0.4620689630508423, + "step": 2345 + }, + { + "epoch": 0.002366944958961203, + "grad_norm": 99.54118280992225, + "learning_rate": 2.3668996635980907e-06, + "loss": 2.44, + "mean_token_accuracy": 0.4, + "step": 2350 + }, + { + "epoch": 0.002371981012065376, + "grad_norm": 70.83937677532117, + "learning_rate": 2.3719356203291504e-06, + "loss": 1.9913, + "mean_token_accuracy": 0.4931034445762634, + "step": 2355 + }, + { + "epoch": 0.002377017065169549, + "grad_norm": 102.43012653014074, + "learning_rate": 2.37697157706021e-06, + "loss": 2.5206, + "mean_token_accuracy": 0.4068965554237366, + "step": 2360 + }, + { + "epoch": 0.0023820531182737214, + "grad_norm": 86.88467641525777, + "learning_rate": 2.3820075337912697e-06, + "loss": 2.4241, + "mean_token_accuracy": 0.43793103098869324, + "step": 2365 + }, + { + "epoch": 0.0023870891713778943, + "grad_norm": 63.118065292996434, + "learning_rate": 2.38704349052233e-06, + "loss": 2.0229, + "mean_token_accuracy": 0.47586206197738645, + "step": 2370 + }, + { + "epoch": 0.002392125224482067, + "grad_norm": 70.01644381681285, + "learning_rate": 2.3920794472533895e-06, + "loss": 2.2602, + "mean_token_accuracy": 0.47586206197738645, + "step": 2375 + }, + { + "epoch": 0.00239716127758624, + "grad_norm": 97.41031614296067, + "learning_rate": 2.397115403984449e-06, + "loss": 2.4941, + "mean_token_accuracy": 0.3896551728248596, + "step": 2380 + }, + { + "epoch": 0.0024021973306904126, + "grad_norm": 75.60346659696802, + "learning_rate": 2.402151360715509e-06, + "loss": 2.0855, + "mean_token_accuracy": 0.47586206793785096, + "step": 2385 + }, + { + "epoch": 0.0024072333837945855, + "grad_norm": 83.90825897672056, + "learning_rate": 2.4071873174465685e-06, + "loss": 2.2922, + "mean_token_accuracy": 0.4482758641242981, + "step": 2390 + }, + { + "epoch": 0.0024122694368987584, + "grad_norm": 100.94223097937659, + "learning_rate": 2.4122232741776286e-06, + "loss": 2.1944, + "mean_token_accuracy": 0.4413793087005615, + "step": 2395 + }, + { + "epoch": 0.002417305490002931, + "grad_norm": 102.77293421672823, + "learning_rate": 2.4172592309086883e-06, + "loss": 2.3001, + "mean_token_accuracy": 0.47241379618644713, + "step": 2400 + }, + { + "epoch": 0.0024223415431071037, + "grad_norm": 86.84525134236557, + "learning_rate": 2.422295187639748e-06, + "loss": 2.4248, + "mean_token_accuracy": 0.44137930274009707, + "step": 2405 + }, + { + "epoch": 0.0024273775962112766, + "grad_norm": 96.97384741714902, + "learning_rate": 2.4273311443708077e-06, + "loss": 2.1911, + "mean_token_accuracy": 0.4430732041597366, + "step": 2410 + }, + { + "epoch": 0.002432413649315449, + "grad_norm": 72.50198571802704, + "learning_rate": 2.4323671011018673e-06, + "loss": 2.4365, + "mean_token_accuracy": 0.4517241418361664, + "step": 2415 + }, + { + "epoch": 0.002437449702419622, + "grad_norm": 70.69782391007874, + "learning_rate": 2.4374030578329274e-06, + "loss": 2.2345, + "mean_token_accuracy": 0.5068965494632721, + "step": 2420 + }, + { + "epoch": 0.002442485755523795, + "grad_norm": 60.86278051650412, + "learning_rate": 2.442439014563987e-06, + "loss": 2.1528, + "mean_token_accuracy": 0.4689655125141144, + "step": 2425 + }, + { + "epoch": 0.0024475218086279678, + "grad_norm": 106.27687998858838, + "learning_rate": 2.447474971295047e-06, + "loss": 2.2131, + "mean_token_accuracy": 0.47241379618644713, + "step": 2430 + }, + { + "epoch": 0.0024525578617321402, + "grad_norm": 77.03583959000423, + "learning_rate": 2.4525109280261065e-06, + "loss": 2.4299, + "mean_token_accuracy": 0.44482758045196535, + "step": 2435 + }, + { + "epoch": 0.002457593914836313, + "grad_norm": 86.48972855633426, + "learning_rate": 2.457546884757166e-06, + "loss": 2.4645, + "mean_token_accuracy": 0.3931034505367279, + "step": 2440 + }, + { + "epoch": 0.002462629967940486, + "grad_norm": 101.74116534462931, + "learning_rate": 2.4625828414882262e-06, + "loss": 2.2693, + "mean_token_accuracy": 0.458620685338974, + "step": 2445 + }, + { + "epoch": 0.0024676660210446585, + "grad_norm": 87.02022023865, + "learning_rate": 2.467618798219286e-06, + "loss": 2.14, + "mean_token_accuracy": 0.4448275864124298, + "step": 2450 + }, + { + "epoch": 0.0024727020741488314, + "grad_norm": 108.44561616595888, + "learning_rate": 2.4726547549503456e-06, + "loss": 2.0587, + "mean_token_accuracy": 0.501875376701355, + "step": 2455 + }, + { + "epoch": 0.0024777381272530043, + "grad_norm": 48.251676953606776, + "learning_rate": 2.4776907116814053e-06, + "loss": 2.0645, + "mean_token_accuracy": 0.48965516686439514, + "step": 2460 + }, + { + "epoch": 0.002482774180357177, + "grad_norm": 70.59690625746232, + "learning_rate": 2.482726668412465e-06, + "loss": 2.1256, + "mean_token_accuracy": 0.4413793206214905, + "step": 2465 + }, + { + "epoch": 0.0024878102334613497, + "grad_norm": 82.42145596417369, + "learning_rate": 2.487762625143525e-06, + "loss": 2.2615, + "mean_token_accuracy": 0.4344827651977539, + "step": 2470 + }, + { + "epoch": 0.0024928462865655226, + "grad_norm": 81.98411365204434, + "learning_rate": 2.4927985818745847e-06, + "loss": 2.2869, + "mean_token_accuracy": 0.4689655065536499, + "step": 2475 + }, + { + "epoch": 0.0024978823396696955, + "grad_norm": 70.04223610163596, + "learning_rate": 2.4978345386056444e-06, + "loss": 2.1946, + "mean_token_accuracy": 0.43103448748588563, + "step": 2480 + }, + { + "epoch": 0.002502918392773868, + "grad_norm": 101.49786400227609, + "learning_rate": 2.502870495336704e-06, + "loss": 2.0097, + "mean_token_accuracy": 0.5137931048870087, + "step": 2485 + }, + { + "epoch": 0.002507954445878041, + "grad_norm": 104.72451071576143, + "learning_rate": 2.5079064520677638e-06, + "loss": 2.4874, + "mean_token_accuracy": 0.41034482717514037, + "step": 2490 + }, + { + "epoch": 0.0025129904989822137, + "grad_norm": 99.01470523307447, + "learning_rate": 2.512942408798824e-06, + "loss": 2.2392, + "mean_token_accuracy": 0.4517241418361664, + "step": 2495 + }, + { + "epoch": 0.0025180265520863866, + "grad_norm": 88.32559689127825, + "learning_rate": 2.5179783655298835e-06, + "loss": 2.3482, + "mean_token_accuracy": 0.4517241358757019, + "step": 2500 + }, + { + "epoch": 0.002523062605190559, + "grad_norm": 89.02802254323824, + "learning_rate": 2.523014322260943e-06, + "loss": 2.2567, + "mean_token_accuracy": 0.4482758641242981, + "step": 2505 + }, + { + "epoch": 0.002528098658294732, + "grad_norm": 86.95813959236109, + "learning_rate": 2.528050278992003e-06, + "loss": 2.246, + "mean_token_accuracy": 0.43103448748588563, + "step": 2510 + }, + { + "epoch": 0.002533134711398905, + "grad_norm": 79.48097965215312, + "learning_rate": 2.5330862357230626e-06, + "loss": 2.2283, + "mean_token_accuracy": 0.4551724135875702, + "step": 2515 + }, + { + "epoch": 0.0025381707645030773, + "grad_norm": 58.9093877425727, + "learning_rate": 2.5381221924541227e-06, + "loss": 2.2861, + "mean_token_accuracy": 0.4310344815254211, + "step": 2520 + }, + { + "epoch": 0.0025432068176072502, + "grad_norm": 84.59294272409089, + "learning_rate": 2.5431581491851823e-06, + "loss": 2.1781, + "mean_token_accuracy": 0.47241380214691164, + "step": 2525 + }, + { + "epoch": 0.002548242870711423, + "grad_norm": 93.01054790297542, + "learning_rate": 2.548194105916242e-06, + "loss": 2.1546, + "mean_token_accuracy": 0.46896552443504336, + "step": 2530 + }, + { + "epoch": 0.002553278923815596, + "grad_norm": 78.08738277996378, + "learning_rate": 2.5532300626473017e-06, + "loss": 1.9295, + "mean_token_accuracy": 0.5344827532768249, + "step": 2535 + }, + { + "epoch": 0.0025583149769197685, + "grad_norm": 67.40543490089269, + "learning_rate": 2.5582660193783614e-06, + "loss": 2.248, + "mean_token_accuracy": 0.47241379618644713, + "step": 2540 + }, + { + "epoch": 0.0025633510300239414, + "grad_norm": 72.29670272302188, + "learning_rate": 2.5633019761094215e-06, + "loss": 2.0328, + "mean_token_accuracy": 0.5241379261016845, + "step": 2545 + }, + { + "epoch": 0.0025683870831281143, + "grad_norm": 58.53504161644367, + "learning_rate": 2.568337932840481e-06, + "loss": 2.1585, + "mean_token_accuracy": 0.47586206793785096, + "step": 2550 + }, + { + "epoch": 0.0025734231362322868, + "grad_norm": 80.11779051597156, + "learning_rate": 2.573373889571541e-06, + "loss": 2.0434, + "mean_token_accuracy": 0.5206896483898162, + "step": 2555 + }, + { + "epoch": 0.0025784591893364597, + "grad_norm": 71.36974839218286, + "learning_rate": 2.5784098463026005e-06, + "loss": 2.1484, + "mean_token_accuracy": 0.4931034445762634, + "step": 2560 + }, + { + "epoch": 0.0025834952424406326, + "grad_norm": 81.40961290959542, + "learning_rate": 2.58344580303366e-06, + "loss": 1.8759, + "mean_token_accuracy": 0.501600980758667, + "step": 2565 + }, + { + "epoch": 0.0025885312955448055, + "grad_norm": 66.22402706092257, + "learning_rate": 2.5884817597647203e-06, + "loss": 2.4082, + "mean_token_accuracy": 0.47241379618644713, + "step": 2570 + }, + { + "epoch": 0.002593567348648978, + "grad_norm": 80.19970229278616, + "learning_rate": 2.59351771649578e-06, + "loss": 2.0116, + "mean_token_accuracy": 0.48965516686439514, + "step": 2575 + }, + { + "epoch": 0.002598603401753151, + "grad_norm": 104.71192991518264, + "learning_rate": 2.5985536732268396e-06, + "loss": 2.1655, + "mean_token_accuracy": 0.5034482657909394, + "step": 2580 + }, + { + "epoch": 0.0026036394548573237, + "grad_norm": 68.27060019618828, + "learning_rate": 2.6035896299578993e-06, + "loss": 2.4255, + "mean_token_accuracy": 0.4344827651977539, + "step": 2585 + }, + { + "epoch": 0.002608675507961496, + "grad_norm": 70.59755907631495, + "learning_rate": 2.608625586688959e-06, + "loss": 2.0864, + "mean_token_accuracy": 0.4896551728248596, + "step": 2590 + }, + { + "epoch": 0.002613711561065669, + "grad_norm": 66.4695034393724, + "learning_rate": 2.613661543420019e-06, + "loss": 1.9887, + "mean_token_accuracy": 0.48965516686439514, + "step": 2595 + }, + { + "epoch": 0.002618747614169842, + "grad_norm": 85.86804752415499, + "learning_rate": 2.6186975001510788e-06, + "loss": 2.3009, + "mean_token_accuracy": 0.4344827651977539, + "step": 2600 + }, + { + "epoch": 0.002623783667274015, + "grad_norm": 58.1255458488033, + "learning_rate": 2.6237334568821384e-06, + "loss": 2.622, + "mean_token_accuracy": 0.39655171930789945, + "step": 2605 + }, + { + "epoch": 0.0026288197203781873, + "grad_norm": 119.02446585580975, + "learning_rate": 2.628769413613198e-06, + "loss": 2.0564, + "mean_token_accuracy": 0.49999999403953554, + "step": 2610 + }, + { + "epoch": 0.0026338557734823602, + "grad_norm": 82.92094424704854, + "learning_rate": 2.6338053703442578e-06, + "loss": 2.0456, + "mean_token_accuracy": 0.4914700508117676, + "step": 2615 + }, + { + "epoch": 0.002638891826586533, + "grad_norm": 55.66215764464568, + "learning_rate": 2.638841327075318e-06, + "loss": 2.072, + "mean_token_accuracy": 0.5124016880989075, + "step": 2620 + }, + { + "epoch": 0.0026439278796907056, + "grad_norm": 87.19697469509939, + "learning_rate": 2.6438772838063776e-06, + "loss": 2.3927, + "mean_token_accuracy": 0.4172413766384125, + "step": 2625 + }, + { + "epoch": 0.0026489639327948785, + "grad_norm": 78.62896459530597, + "learning_rate": 2.6489132405374372e-06, + "loss": 2.4046, + "mean_token_accuracy": 0.4344827473163605, + "step": 2630 + }, + { + "epoch": 0.0026539999858990514, + "grad_norm": 81.7209698382802, + "learning_rate": 2.653949197268497e-06, + "loss": 2.298, + "mean_token_accuracy": 0.44482758045196535, + "step": 2635 + }, + { + "epoch": 0.002659036039003224, + "grad_norm": 73.54194820010473, + "learning_rate": 2.658985153999557e-06, + "loss": 2.3581, + "mean_token_accuracy": 0.42758620977401735, + "step": 2640 + }, + { + "epoch": 0.0026640720921073968, + "grad_norm": 76.53589154258704, + "learning_rate": 2.6640211107306167e-06, + "loss": 1.9931, + "mean_token_accuracy": 0.5103448331356049, + "step": 2645 + }, + { + "epoch": 0.0026691081452115697, + "grad_norm": 70.3446493180254, + "learning_rate": 2.6690570674616764e-06, + "loss": 2.2345, + "mean_token_accuracy": 0.44827585220336913, + "step": 2650 + }, + { + "epoch": 0.0026741441983157426, + "grad_norm": 66.94775145030579, + "learning_rate": 2.674093024192736e-06, + "loss": 2.1199, + "mean_token_accuracy": 0.46896551847457885, + "step": 2655 + }, + { + "epoch": 0.002679180251419915, + "grad_norm": 82.4314149698713, + "learning_rate": 2.679128980923796e-06, + "loss": 2.4063, + "mean_token_accuracy": 0.44482758045196535, + "step": 2660 + }, + { + "epoch": 0.002684216304524088, + "grad_norm": 62.06480470534354, + "learning_rate": 2.684164937654856e-06, + "loss": 2.0691, + "mean_token_accuracy": 0.5137930989265442, + "step": 2665 + }, + { + "epoch": 0.002689252357628261, + "grad_norm": 66.70698711652226, + "learning_rate": 2.689200894385916e-06, + "loss": 2.2688, + "mean_token_accuracy": 0.47586206793785096, + "step": 2670 + }, + { + "epoch": 0.0026942884107324333, + "grad_norm": 92.30191251456944, + "learning_rate": 2.6942368511169756e-06, + "loss": 2.4153, + "mean_token_accuracy": 0.4172413766384125, + "step": 2675 + }, + { + "epoch": 0.002699324463836606, + "grad_norm": 82.37019415130442, + "learning_rate": 2.6992728078480353e-06, + "loss": 2.2139, + "mean_token_accuracy": 0.4551724135875702, + "step": 2680 + }, + { + "epoch": 0.002704360516940779, + "grad_norm": 86.38876965878977, + "learning_rate": 2.704308764579095e-06, + "loss": 1.9345, + "mean_token_accuracy": 0.45728976726531984, + "step": 2685 + }, + { + "epoch": 0.002709396570044952, + "grad_norm": 76.06336349944823, + "learning_rate": 2.7093447213101546e-06, + "loss": 2.243, + "mean_token_accuracy": 0.4620689690113068, + "step": 2690 + }, + { + "epoch": 0.0027144326231491244, + "grad_norm": 75.96416232968137, + "learning_rate": 2.7143806780412147e-06, + "loss": 2.0259, + "mean_token_accuracy": 0.4945812880992889, + "step": 2695 + }, + { + "epoch": 0.0027194686762532973, + "grad_norm": 69.82960493111311, + "learning_rate": 2.7194166347722744e-06, + "loss": 1.9928, + "mean_token_accuracy": 0.5050211727619172, + "step": 2700 + }, + { + "epoch": 0.0027245047293574702, + "grad_norm": 52.23875665766504, + "learning_rate": 2.724452591503334e-06, + "loss": 2.0432, + "mean_token_accuracy": 0.47241380214691164, + "step": 2705 + }, + { + "epoch": 0.0027295407824616427, + "grad_norm": 115.77684114149575, + "learning_rate": 2.7294885482343938e-06, + "loss": 2.3347, + "mean_token_accuracy": 0.40889292359352114, + "step": 2710 + }, + { + "epoch": 0.0027345768355658156, + "grad_norm": 84.14499349996761, + "learning_rate": 2.7345245049654534e-06, + "loss": 2.2263, + "mean_token_accuracy": 0.44482758045196535, + "step": 2715 + }, + { + "epoch": 0.0027396128886699885, + "grad_norm": 56.68820943966481, + "learning_rate": 2.7395604616965135e-06, + "loss": 2.2164, + "mean_token_accuracy": 0.4813067138195038, + "step": 2720 + }, + { + "epoch": 0.0027446489417741614, + "grad_norm": 87.70003380523275, + "learning_rate": 2.744596418427573e-06, + "loss": 2.2856, + "mean_token_accuracy": 0.41034482717514037, + "step": 2725 + }, + { + "epoch": 0.002749684994878334, + "grad_norm": 62.4706991286528, + "learning_rate": 2.749632375158633e-06, + "loss": 2.1508, + "mean_token_accuracy": 0.46551724672317507, + "step": 2730 + }, + { + "epoch": 0.0027547210479825068, + "grad_norm": 99.26586227429233, + "learning_rate": 2.7546683318896926e-06, + "loss": 2.4804, + "mean_token_accuracy": 0.39310344457626345, + "step": 2735 + }, + { + "epoch": 0.0027597571010866797, + "grad_norm": 73.53182483854552, + "learning_rate": 2.7597042886207522e-06, + "loss": 2.4294, + "mean_token_accuracy": 0.4551724135875702, + "step": 2740 + }, + { + "epoch": 0.002764793154190852, + "grad_norm": 82.72995814482483, + "learning_rate": 2.7647402453518123e-06, + "loss": 2.3659, + "mean_token_accuracy": 0.4551724076271057, + "step": 2745 + }, + { + "epoch": 0.002769829207295025, + "grad_norm": 64.62304256081194, + "learning_rate": 2.769776202082872e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.43793103098869324, + "step": 2750 + }, + { + "epoch": 0.002774865260399198, + "grad_norm": 65.3219204617248, + "learning_rate": 2.7748121588139317e-06, + "loss": 2.5977, + "mean_token_accuracy": 0.3758620619773865, + "step": 2755 + }, + { + "epoch": 0.002779901313503371, + "grad_norm": 71.86885898318236, + "learning_rate": 2.7798481155449914e-06, + "loss": 2.1898, + "mean_token_accuracy": 0.482758617401123, + "step": 2760 + }, + { + "epoch": 0.0027849373666075433, + "grad_norm": 69.210397481664, + "learning_rate": 2.784884072276051e-06, + "loss": 2.0809, + "mean_token_accuracy": 0.4551724135875702, + "step": 2765 + }, + { + "epoch": 0.002789973419711716, + "grad_norm": 83.34684729851516, + "learning_rate": 2.789920029007111e-06, + "loss": 2.1819, + "mean_token_accuracy": 0.5137931048870087, + "step": 2770 + }, + { + "epoch": 0.002795009472815889, + "grad_norm": 79.64701795356842, + "learning_rate": 2.794955985738171e-06, + "loss": 1.9888, + "mean_token_accuracy": 0.519177258014679, + "step": 2775 + }, + { + "epoch": 0.0028000455259200615, + "grad_norm": 87.4166099000275, + "learning_rate": 2.7999919424692305e-06, + "loss": 2.2323, + "mean_token_accuracy": 0.4620689630508423, + "step": 2780 + }, + { + "epoch": 0.0028050815790242344, + "grad_norm": 92.27110314428752, + "learning_rate": 2.80502789920029e-06, + "loss": 2.2346, + "mean_token_accuracy": 0.46551724076271056, + "step": 2785 + }, + { + "epoch": 0.0028101176321284073, + "grad_norm": 83.89918878588203, + "learning_rate": 2.81006385593135e-06, + "loss": 2.4355, + "mean_token_accuracy": 0.4586206912994385, + "step": 2790 + }, + { + "epoch": 0.0028151536852325802, + "grad_norm": 93.45920370138626, + "learning_rate": 2.81509981266241e-06, + "loss": 2.1915, + "mean_token_accuracy": 0.4482758641242981, + "step": 2795 + }, + { + "epoch": 0.0028201897383367527, + "grad_norm": 66.17805425813921, + "learning_rate": 2.8201357693934696e-06, + "loss": 1.9405, + "mean_token_accuracy": 0.5172413766384125, + "step": 2800 + }, + { + "epoch": 0.0028252257914409256, + "grad_norm": 80.65609196819172, + "learning_rate": 2.8251717261245293e-06, + "loss": 1.9453, + "mean_token_accuracy": 0.4965517222881317, + "step": 2805 + }, + { + "epoch": 0.0028302618445450985, + "grad_norm": 97.7429973449736, + "learning_rate": 2.830207682855589e-06, + "loss": 2.0581, + "mean_token_accuracy": 0.47931033968925474, + "step": 2810 + }, + { + "epoch": 0.002835297897649271, + "grad_norm": 73.69020724777327, + "learning_rate": 2.8352436395866487e-06, + "loss": 2.06, + "mean_token_accuracy": 0.4862068951129913, + "step": 2815 + }, + { + "epoch": 0.002840333950753444, + "grad_norm": 103.57005210577499, + "learning_rate": 2.8402795963177088e-06, + "loss": 1.9724, + "mean_token_accuracy": 0.5599753677845001, + "step": 2820 + }, + { + "epoch": 0.0028453700038576168, + "grad_norm": 65.42148294671267, + "learning_rate": 2.8453155530487684e-06, + "loss": 2.1466, + "mean_token_accuracy": 0.4862068951129913, + "step": 2825 + }, + { + "epoch": 0.0028504060569617897, + "grad_norm": 80.32749919516732, + "learning_rate": 2.850351509779828e-06, + "loss": 2.3368, + "mean_token_accuracy": 0.4413793087005615, + "step": 2830 + }, + { + "epoch": 0.002855442110065962, + "grad_norm": 63.96996264969172, + "learning_rate": 2.8553874665108878e-06, + "loss": 2.2333, + "mean_token_accuracy": 0.44827585220336913, + "step": 2835 + }, + { + "epoch": 0.002860478163170135, + "grad_norm": 100.35171769164828, + "learning_rate": 2.8604234232419475e-06, + "loss": 2.1746, + "mean_token_accuracy": 0.48965516686439514, + "step": 2840 + }, + { + "epoch": 0.002865514216274308, + "grad_norm": 74.6536346084636, + "learning_rate": 2.8654593799730076e-06, + "loss": 2.4039, + "mean_token_accuracy": 0.43103447556495667, + "step": 2845 + }, + { + "epoch": 0.0028705502693784804, + "grad_norm": 67.03898789894554, + "learning_rate": 2.8704953367040672e-06, + "loss": 1.868, + "mean_token_accuracy": 0.5344827532768249, + "step": 2850 + }, + { + "epoch": 0.0028755863224826533, + "grad_norm": 62.902392482822286, + "learning_rate": 2.875531293435127e-06, + "loss": 2.0197, + "mean_token_accuracy": 0.4896551609039307, + "step": 2855 + }, + { + "epoch": 0.002880622375586826, + "grad_norm": 88.00830680347272, + "learning_rate": 2.8805672501661866e-06, + "loss": 2.1306, + "mean_token_accuracy": 0.46551724672317507, + "step": 2860 + }, + { + "epoch": 0.0028856584286909986, + "grad_norm": 63.916751344187546, + "learning_rate": 2.8856032068972463e-06, + "loss": 2.3754, + "mean_token_accuracy": 0.4137930989265442, + "step": 2865 + }, + { + "epoch": 0.0028906944817951715, + "grad_norm": 103.41070480830378, + "learning_rate": 2.8906391636283064e-06, + "loss": 2.3709, + "mean_token_accuracy": 0.4517241418361664, + "step": 2870 + }, + { + "epoch": 0.0028957305348993444, + "grad_norm": 91.28132935611058, + "learning_rate": 2.895675120359366e-06, + "loss": 2.4747, + "mean_token_accuracy": 0.4284482777118683, + "step": 2875 + }, + { + "epoch": 0.0029007665880035173, + "grad_norm": 65.39938015352139, + "learning_rate": 2.9007110770904257e-06, + "loss": 2.1448, + "mean_token_accuracy": 0.5017587065696716, + "step": 2880 + }, + { + "epoch": 0.00290580264110769, + "grad_norm": 80.84669521191931, + "learning_rate": 2.9057470338214854e-06, + "loss": 2.424, + "mean_token_accuracy": 0.4517241358757019, + "step": 2885 + }, + { + "epoch": 0.0029108386942118627, + "grad_norm": 70.26365698499949, + "learning_rate": 2.9107829905525455e-06, + "loss": 2.0661, + "mean_token_accuracy": 0.5137931048870087, + "step": 2890 + }, + { + "epoch": 0.0029158747473160356, + "grad_norm": 101.26292734443231, + "learning_rate": 2.915818947283605e-06, + "loss": 2.5474, + "mean_token_accuracy": 0.4, + "step": 2895 + }, + { + "epoch": 0.002920910800420208, + "grad_norm": 60.000357169863605, + "learning_rate": 2.920854904014665e-06, + "loss": 2.1052, + "mean_token_accuracy": 0.4631773352622986, + "step": 2900 + }, + { + "epoch": 0.002925946853524381, + "grad_norm": 74.98131419220954, + "learning_rate": 2.9258908607457245e-06, + "loss": 2.1614, + "mean_token_accuracy": 0.4551724135875702, + "step": 2905 + }, + { + "epoch": 0.002930982906628554, + "grad_norm": 68.99947846683835, + "learning_rate": 2.930926817476784e-06, + "loss": 2.3706, + "mean_token_accuracy": 0.4137930989265442, + "step": 2910 + }, + { + "epoch": 0.0029360189597327268, + "grad_norm": 67.03628627387538, + "learning_rate": 2.9359627742078443e-06, + "loss": 2.04, + "mean_token_accuracy": 0.5137931048870087, + "step": 2915 + }, + { + "epoch": 0.0029410550128368992, + "grad_norm": 75.64477076811589, + "learning_rate": 2.940998730938904e-06, + "loss": 2.0422, + "mean_token_accuracy": 0.47931033968925474, + "step": 2920 + }, + { + "epoch": 0.002946091065941072, + "grad_norm": 67.99748776157973, + "learning_rate": 2.9460346876699637e-06, + "loss": 2.1311, + "mean_token_accuracy": 0.4344827592372894, + "step": 2925 + }, + { + "epoch": 0.002951127119045245, + "grad_norm": 89.5573630946037, + "learning_rate": 2.9510706444010233e-06, + "loss": 2.0571, + "mean_token_accuracy": 0.49655171632766726, + "step": 2930 + }, + { + "epoch": 0.0029561631721494175, + "grad_norm": 66.55704747284501, + "learning_rate": 2.956106601132083e-06, + "loss": 2.3881, + "mean_token_accuracy": 0.4620689630508423, + "step": 2935 + }, + { + "epoch": 0.0029611992252535904, + "grad_norm": 87.14039565243294, + "learning_rate": 2.961142557863143e-06, + "loss": 2.1442, + "mean_token_accuracy": 0.49655171632766726, + "step": 2940 + }, + { + "epoch": 0.0029662352783577633, + "grad_norm": 90.25881878937624, + "learning_rate": 2.9661785145942028e-06, + "loss": 2.4009, + "mean_token_accuracy": 0.46551724076271056, + "step": 2945 + }, + { + "epoch": 0.002971271331461936, + "grad_norm": 65.05480310172725, + "learning_rate": 2.9712144713252625e-06, + "loss": 2.2554, + "mean_token_accuracy": 0.4827586203813553, + "step": 2950 + }, + { + "epoch": 0.0029763073845661086, + "grad_norm": 86.4849105855807, + "learning_rate": 2.976250428056322e-06, + "loss": 2.5262, + "mean_token_accuracy": 0.3999999940395355, + "step": 2955 + }, + { + "epoch": 0.0029813434376702815, + "grad_norm": 88.51232309243866, + "learning_rate": 2.981286384787382e-06, + "loss": 2.4157, + "mean_token_accuracy": 0.4482758641242981, + "step": 2960 + }, + { + "epoch": 0.0029863794907744544, + "grad_norm": 91.550109868469, + "learning_rate": 2.986322341518442e-06, + "loss": 2.4113, + "mean_token_accuracy": 0.44990926384925845, + "step": 2965 + }, + { + "epoch": 0.002991415543878627, + "grad_norm": 89.42015661481138, + "learning_rate": 2.9913582982495016e-06, + "loss": 2.4826, + "mean_token_accuracy": 0.4034482777118683, + "step": 2970 + }, + { + "epoch": 0.0029964515969828, + "grad_norm": 88.55892077387162, + "learning_rate": 2.9963942549805613e-06, + "loss": 2.5427, + "mean_token_accuracy": 0.4172413766384125, + "step": 2975 + }, + { + "epoch": 0.0030014876500869727, + "grad_norm": 65.73514147933889, + "learning_rate": 3.001430211711621e-06, + "loss": 1.9901, + "mean_token_accuracy": 0.49999999403953554, + "step": 2980 + }, + { + "epoch": 0.0030065237031911456, + "grad_norm": 72.57805870200298, + "learning_rate": 3.0064661684426806e-06, + "loss": 2.4738, + "mean_token_accuracy": 0.42758620381355283, + "step": 2985 + }, + { + "epoch": 0.003011559756295318, + "grad_norm": 60.010863596474195, + "learning_rate": 3.0115021251737407e-06, + "loss": 2.371, + "mean_token_accuracy": 0.4655172348022461, + "step": 2990 + }, + { + "epoch": 0.003016595809399491, + "grad_norm": 95.48496956376763, + "learning_rate": 3.0165380819048004e-06, + "loss": 1.8541, + "mean_token_accuracy": 0.5298850655555725, + "step": 2995 + }, + { + "epoch": 0.003021631862503664, + "grad_norm": 65.01925032048041, + "learning_rate": 3.02157403863586e-06, + "loss": 2.3843, + "mean_token_accuracy": 0.4381773352622986, + "step": 3000 + }, + { + "epoch": 0.0030266679156078363, + "grad_norm": 94.96498384919472, + "learning_rate": 3.0266099953669197e-06, + "loss": 2.2954, + "mean_token_accuracy": 0.43226600885391236, + "step": 3005 + }, + { + "epoch": 0.0030317039687120092, + "grad_norm": 71.6103947976109, + "learning_rate": 3.0316459520979794e-06, + "loss": 2.2797, + "mean_token_accuracy": 0.4586206912994385, + "step": 3010 + }, + { + "epoch": 0.003036740021816182, + "grad_norm": 49.030310186456774, + "learning_rate": 3.0366819088290395e-06, + "loss": 1.947, + "mean_token_accuracy": 0.5241379320621491, + "step": 3015 + }, + { + "epoch": 0.003041776074920355, + "grad_norm": 59.85878115626745, + "learning_rate": 3.041717865560099e-06, + "loss": 2.1276, + "mean_token_accuracy": 0.4793103516101837, + "step": 3020 + }, + { + "epoch": 0.0030468121280245275, + "grad_norm": 69.13894546432422, + "learning_rate": 3.046753822291159e-06, + "loss": 2.0739, + "mean_token_accuracy": 0.48965516686439514, + "step": 3025 + }, + { + "epoch": 0.0030518481811287004, + "grad_norm": 84.83562826030482, + "learning_rate": 3.0517897790222186e-06, + "loss": 2.2518, + "mean_token_accuracy": 0.4931034505367279, + "step": 3030 + }, + { + "epoch": 0.0030568842342328733, + "grad_norm": 85.33763457035896, + "learning_rate": 3.0568257357532782e-06, + "loss": 2.119, + "mean_token_accuracy": 0.4689655125141144, + "step": 3035 + }, + { + "epoch": 0.0030619202873370457, + "grad_norm": 103.52015647679562, + "learning_rate": 3.0618616924843383e-06, + "loss": 2.4503, + "mean_token_accuracy": 0.41379310488700866, + "step": 3040 + }, + { + "epoch": 0.0030669563404412186, + "grad_norm": 86.49448330285979, + "learning_rate": 3.066897649215398e-06, + "loss": 2.234, + "mean_token_accuracy": 0.45862067937850953, + "step": 3045 + }, + { + "epoch": 0.0030719923935453915, + "grad_norm": 81.0860817025796, + "learning_rate": 3.0719336059464577e-06, + "loss": 2.0751, + "mean_token_accuracy": 0.49872957468032836, + "step": 3050 + }, + { + "epoch": 0.0030770284466495644, + "grad_norm": 72.29159826557503, + "learning_rate": 3.0769695626775174e-06, + "loss": 2.1363, + "mean_token_accuracy": 0.4620689630508423, + "step": 3055 + }, + { + "epoch": 0.003082064499753737, + "grad_norm": 86.2876880027152, + "learning_rate": 3.082005519408577e-06, + "loss": 2.4084, + "mean_token_accuracy": 0.4034482777118683, + "step": 3060 + }, + { + "epoch": 0.00308710055285791, + "grad_norm": 83.45307043452166, + "learning_rate": 3.087041476139637e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.4379310429096222, + "step": 3065 + }, + { + "epoch": 0.0030921366059620827, + "grad_norm": 81.41168850676306, + "learning_rate": 3.0920774328706972e-06, + "loss": 2.1272, + "mean_token_accuracy": 0.4586206912994385, + "step": 3070 + }, + { + "epoch": 0.003097172659066255, + "grad_norm": 93.25355915172533, + "learning_rate": 3.097113389601757e-06, + "loss": 2.4373, + "mean_token_accuracy": 0.41724138855934145, + "step": 3075 + }, + { + "epoch": 0.003102208712170428, + "grad_norm": 104.56491335235592, + "learning_rate": 3.1021493463328166e-06, + "loss": 2.1426, + "mean_token_accuracy": 0.4551724076271057, + "step": 3080 + }, + { + "epoch": 0.003107244765274601, + "grad_norm": 106.61106264671604, + "learning_rate": 3.1071853030638763e-06, + "loss": 2.4611, + "mean_token_accuracy": 0.4206896543502808, + "step": 3085 + }, + { + "epoch": 0.003112280818378774, + "grad_norm": 87.5884365295295, + "learning_rate": 3.112221259794936e-06, + "loss": 2.3855, + "mean_token_accuracy": 0.480762255191803, + "step": 3090 + }, + { + "epoch": 0.0031173168714829463, + "grad_norm": 77.4187048838736, + "learning_rate": 3.117257216525996e-06, + "loss": 2.2813, + "mean_token_accuracy": 0.4379310369491577, + "step": 3095 + }, + { + "epoch": 0.0031223529245871192, + "grad_norm": 96.01644052275523, + "learning_rate": 3.1222931732570557e-06, + "loss": 2.3341, + "mean_token_accuracy": 0.47078040838241575, + "step": 3100 + }, + { + "epoch": 0.003127388977691292, + "grad_norm": 80.92844065704236, + "learning_rate": 3.127329129988115e-06, + "loss": 2.0389, + "mean_token_accuracy": 0.5121597111225128, + "step": 3105 + }, + { + "epoch": 0.0031324250307954646, + "grad_norm": 59.490080733519015, + "learning_rate": 3.132365086719175e-06, + "loss": 2.4377, + "mean_token_accuracy": 0.4137930989265442, + "step": 3110 + }, + { + "epoch": 0.0031374610838996375, + "grad_norm": 66.00689060356684, + "learning_rate": 3.1374010434502347e-06, + "loss": 2.0668, + "mean_token_accuracy": 0.49999999403953554, + "step": 3115 + }, + { + "epoch": 0.0031424971370038104, + "grad_norm": 176.4001965962792, + "learning_rate": 3.142437000181295e-06, + "loss": 2.5429, + "mean_token_accuracy": 0.38620689511299133, + "step": 3120 + }, + { + "epoch": 0.003147533190107983, + "grad_norm": 103.32199386607853, + "learning_rate": 3.147472956912354e-06, + "loss": 2.1981, + "mean_token_accuracy": 0.45517241954803467, + "step": 3125 + }, + { + "epoch": 0.0031525692432121557, + "grad_norm": 61.90781695146732, + "learning_rate": 3.152508913643414e-06, + "loss": 1.9303, + "mean_token_accuracy": 0.5068965435028077, + "step": 3130 + }, + { + "epoch": 0.0031576052963163286, + "grad_norm": 66.86932863486322, + "learning_rate": 3.1575448703744735e-06, + "loss": 2.1472, + "mean_token_accuracy": 0.4655172348022461, + "step": 3135 + }, + { + "epoch": 0.0031626413494205015, + "grad_norm": 56.911438747214845, + "learning_rate": 3.162580827105534e-06, + "loss": 2.1996, + "mean_token_accuracy": 0.4896551609039307, + "step": 3140 + }, + { + "epoch": 0.003167677402524674, + "grad_norm": 65.90434831968797, + "learning_rate": 3.1676167838365932e-06, + "loss": 2.2503, + "mean_token_accuracy": 0.47931033968925474, + "step": 3145 + }, + { + "epoch": 0.003172713455628847, + "grad_norm": 94.15538832461324, + "learning_rate": 3.1726527405676533e-06, + "loss": 2.1304, + "mean_token_accuracy": 0.5172413766384125, + "step": 3150 + }, + { + "epoch": 0.00317774950873302, + "grad_norm": 75.63183227983484, + "learning_rate": 3.1776886972987126e-06, + "loss": 2.1292, + "mean_token_accuracy": 0.4896551728248596, + "step": 3155 + }, + { + "epoch": 0.0031827855618371923, + "grad_norm": 67.18445741960737, + "learning_rate": 3.1827246540297727e-06, + "loss": 2.2006, + "mean_token_accuracy": 0.4827586054801941, + "step": 3160 + }, + { + "epoch": 0.003187821614941365, + "grad_norm": 72.00631080527997, + "learning_rate": 3.1877606107608324e-06, + "loss": 2.1479, + "mean_token_accuracy": 0.4517241358757019, + "step": 3165 + }, + { + "epoch": 0.003192857668045538, + "grad_norm": 69.13595140264927, + "learning_rate": 3.1927965674918925e-06, + "loss": 1.7245, + "mean_token_accuracy": 0.5838669955730438, + "step": 3170 + }, + { + "epoch": 0.003197893721149711, + "grad_norm": 71.10822797753556, + "learning_rate": 3.1978325242229517e-06, + "loss": 2.3047, + "mean_token_accuracy": 0.47241378426551817, + "step": 3175 + }, + { + "epoch": 0.0032029297742538834, + "grad_norm": 93.12915752237711, + "learning_rate": 3.202868480954012e-06, + "loss": 2.4167, + "mean_token_accuracy": 0.3931034505367279, + "step": 3180 + }, + { + "epoch": 0.0032079658273580563, + "grad_norm": 58.75058719579265, + "learning_rate": 3.2079044376850715e-06, + "loss": 2.2029, + "mean_token_accuracy": 0.42068964838981626, + "step": 3185 + }, + { + "epoch": 0.0032130018804622292, + "grad_norm": 78.23709193028031, + "learning_rate": 3.2129403944161316e-06, + "loss": 2.6463, + "mean_token_accuracy": 0.3724137842655182, + "step": 3190 + }, + { + "epoch": 0.0032180379335664017, + "grad_norm": 88.18487494198256, + "learning_rate": 3.217976351147191e-06, + "loss": 2.0882, + "mean_token_accuracy": 0.48275862336158754, + "step": 3195 + }, + { + "epoch": 0.0032230739866705746, + "grad_norm": 80.77723888561393, + "learning_rate": 3.223012307878251e-06, + "loss": 2.1852, + "mean_token_accuracy": 0.4862068951129913, + "step": 3200 + }, + { + "epoch": 0.0032281100397747475, + "grad_norm": 65.69255397408914, + "learning_rate": 3.22804826460931e-06, + "loss": 2.4897, + "mean_token_accuracy": 0.41724138259887694, + "step": 3205 + }, + { + "epoch": 0.0032331460928789204, + "grad_norm": 73.88061599403267, + "learning_rate": 3.2330842213403703e-06, + "loss": 2.2371, + "mean_token_accuracy": 0.47586206197738645, + "step": 3210 + }, + { + "epoch": 0.003238182145983093, + "grad_norm": 91.98232071006288, + "learning_rate": 3.23812017807143e-06, + "loss": 2.1787, + "mean_token_accuracy": 0.48275862336158754, + "step": 3215 + }, + { + "epoch": 0.0032432181990872657, + "grad_norm": 62.72222230252338, + "learning_rate": 3.24315613480249e-06, + "loss": 2.3648, + "mean_token_accuracy": 0.42413793206214906, + "step": 3220 + }, + { + "epoch": 0.0032482542521914386, + "grad_norm": 86.0604729106889, + "learning_rate": 3.2481920915335493e-06, + "loss": 2.5279, + "mean_token_accuracy": 0.38965516686439516, + "step": 3225 + }, + { + "epoch": 0.003253290305295611, + "grad_norm": 69.3939076142181, + "learning_rate": 3.2532280482646094e-06, + "loss": 2.2904, + "mean_token_accuracy": 0.4849364757537842, + "step": 3230 + }, + { + "epoch": 0.003258326358399784, + "grad_norm": 64.12567706361092, + "learning_rate": 3.258264004995669e-06, + "loss": 2.309, + "mean_token_accuracy": 0.44827585816383364, + "step": 3235 + }, + { + "epoch": 0.003263362411503957, + "grad_norm": 69.32636502894857, + "learning_rate": 3.263299961726729e-06, + "loss": 2.0678, + "mean_token_accuracy": 0.4862069010734558, + "step": 3240 + }, + { + "epoch": 0.00326839846460813, + "grad_norm": 79.91087498801657, + "learning_rate": 3.2683359184577885e-06, + "loss": 2.0761, + "mean_token_accuracy": 0.5119782269001008, + "step": 3245 + }, + { + "epoch": 0.0032734345177123023, + "grad_norm": 62.59537746986369, + "learning_rate": 3.2733718751888486e-06, + "loss": 2.4258, + "mean_token_accuracy": 0.42413792610168455, + "step": 3250 + }, + { + "epoch": 0.003278470570816475, + "grad_norm": 76.6221373052642, + "learning_rate": 3.278407831919908e-06, + "loss": 2.3818, + "mean_token_accuracy": 0.42413792610168455, + "step": 3255 + }, + { + "epoch": 0.003283506623920648, + "grad_norm": 102.34594839825111, + "learning_rate": 3.283443788650968e-06, + "loss": 1.9283, + "mean_token_accuracy": 0.5068965435028077, + "step": 3260 + }, + { + "epoch": 0.0032885426770248205, + "grad_norm": 74.47466952805327, + "learning_rate": 3.2884797453820276e-06, + "loss": 2.3048, + "mean_token_accuracy": 0.43103448748588563, + "step": 3265 + }, + { + "epoch": 0.0032935787301289934, + "grad_norm": 85.84855823735926, + "learning_rate": 3.2935157021130877e-06, + "loss": 2.008, + "mean_token_accuracy": 0.47931034564971925, + "step": 3270 + }, + { + "epoch": 0.0032986147832331663, + "grad_norm": 68.32688721306249, + "learning_rate": 3.2985516588441478e-06, + "loss": 2.4571, + "mean_token_accuracy": 0.4344827592372894, + "step": 3275 + }, + { + "epoch": 0.003303650836337339, + "grad_norm": 57.944642708698225, + "learning_rate": 3.303587615575207e-06, + "loss": 2.6626, + "mean_token_accuracy": 0.3999999940395355, + "step": 3280 + }, + { + "epoch": 0.0033086868894415117, + "grad_norm": 75.54629317907177, + "learning_rate": 3.308623572306267e-06, + "loss": 2.3377, + "mean_token_accuracy": 0.4448275864124298, + "step": 3285 + }, + { + "epoch": 0.0033137229425456846, + "grad_norm": 97.91341302005132, + "learning_rate": 3.313659529037327e-06, + "loss": 2.2696, + "mean_token_accuracy": 0.42068966031074523, + "step": 3290 + }, + { + "epoch": 0.0033187589956498575, + "grad_norm": 68.00506859333011, + "learning_rate": 3.318695485768387e-06, + "loss": 2.0894, + "mean_token_accuracy": 0.46896551847457885, + "step": 3295 + }, + { + "epoch": 0.00332379504875403, + "grad_norm": 81.44210177411483, + "learning_rate": 3.323731442499446e-06, + "loss": 2.0202, + "mean_token_accuracy": 0.5103448271751404, + "step": 3300 + }, + { + "epoch": 0.003328831101858203, + "grad_norm": 84.87738445515838, + "learning_rate": 3.3287673992305063e-06, + "loss": 2.0818, + "mean_token_accuracy": 0.5019963622093201, + "step": 3305 + }, + { + "epoch": 0.0033338671549623757, + "grad_norm": 67.4530211237485, + "learning_rate": 3.3338033559615655e-06, + "loss": 2.4397, + "mean_token_accuracy": 0.4379310369491577, + "step": 3310 + }, + { + "epoch": 0.0033389032080665486, + "grad_norm": 69.07746772961687, + "learning_rate": 3.3388393126926256e-06, + "loss": 2.104, + "mean_token_accuracy": 0.46896551847457885, + "step": 3315 + }, + { + "epoch": 0.003343939261170721, + "grad_norm": 101.65889637029338, + "learning_rate": 3.3438752694236853e-06, + "loss": 2.4903, + "mean_token_accuracy": 0.4103448212146759, + "step": 3320 + }, + { + "epoch": 0.003348975314274894, + "grad_norm": 122.39798118649, + "learning_rate": 3.3489112261547454e-06, + "loss": 2.2311, + "mean_token_accuracy": 0.4931034445762634, + "step": 3325 + }, + { + "epoch": 0.003354011367379067, + "grad_norm": 51.86216167236562, + "learning_rate": 3.3539471828858046e-06, + "loss": 1.9066, + "mean_token_accuracy": 0.5310344815254211, + "step": 3330 + }, + { + "epoch": 0.0033590474204832394, + "grad_norm": 69.06014636067414, + "learning_rate": 3.3589831396168647e-06, + "loss": 2.3247, + "mean_token_accuracy": 0.4551724076271057, + "step": 3335 + }, + { + "epoch": 0.0033640834735874123, + "grad_norm": 60.105566055871755, + "learning_rate": 3.3640190963479244e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.417241370677948, + "step": 3340 + }, + { + "epoch": 0.003369119526691585, + "grad_norm": 67.18565868599282, + "learning_rate": 3.3690550530789845e-06, + "loss": 2.3156, + "mean_token_accuracy": 0.3999999940395355, + "step": 3345 + }, + { + "epoch": 0.0033741555797957576, + "grad_norm": 72.41177643014052, + "learning_rate": 3.3740910098100438e-06, + "loss": 2.1682, + "mean_token_accuracy": 0.4862068951129913, + "step": 3350 + }, + { + "epoch": 0.0033791916328999305, + "grad_norm": 71.12155487845098, + "learning_rate": 3.379126966541104e-06, + "loss": 2.3652, + "mean_token_accuracy": 0.4344827473163605, + "step": 3355 + }, + { + "epoch": 0.0033842276860041034, + "grad_norm": 54.85595735011981, + "learning_rate": 3.384162923272163e-06, + "loss": 2.306, + "mean_token_accuracy": 0.42758620977401735, + "step": 3360 + }, + { + "epoch": 0.0033892637391082763, + "grad_norm": 76.61154397660779, + "learning_rate": 3.3891988800032236e-06, + "loss": 2.1391, + "mean_token_accuracy": 0.4675136208534241, + "step": 3365 + }, + { + "epoch": 0.0033942997922124488, + "grad_norm": 68.67037474704775, + "learning_rate": 3.394234836734283e-06, + "loss": 2.2515, + "mean_token_accuracy": 0.47931034564971925, + "step": 3370 + }, + { + "epoch": 0.0033993358453166217, + "grad_norm": 52.73592751943477, + "learning_rate": 3.399270793465343e-06, + "loss": 2.2811, + "mean_token_accuracy": 0.46896551847457885, + "step": 3375 + }, + { + "epoch": 0.0034043718984207946, + "grad_norm": 66.85389688733349, + "learning_rate": 3.4043067501964023e-06, + "loss": 2.001, + "mean_token_accuracy": 0.5034482777118683, + "step": 3380 + }, + { + "epoch": 0.003409407951524967, + "grad_norm": 78.88779690444377, + "learning_rate": 3.4093427069274624e-06, + "loss": 2.4047, + "mean_token_accuracy": 0.4724137902259827, + "step": 3385 + }, + { + "epoch": 0.00341444400462914, + "grad_norm": 101.29439469265937, + "learning_rate": 3.414378663658522e-06, + "loss": 2.0217, + "mean_token_accuracy": 0.5241379320621491, + "step": 3390 + }, + { + "epoch": 0.003419480057733313, + "grad_norm": 106.62456854021953, + "learning_rate": 3.419414620389582e-06, + "loss": 1.9196, + "mean_token_accuracy": 0.4862068831920624, + "step": 3395 + }, + { + "epoch": 0.0034245161108374857, + "grad_norm": 73.26336701367737, + "learning_rate": 3.4244505771206414e-06, + "loss": 2.2365, + "mean_token_accuracy": 0.4551724135875702, + "step": 3400 + }, + { + "epoch": 0.003429552163941658, + "grad_norm": 74.67394340125233, + "learning_rate": 3.4294865338517015e-06, + "loss": 2.3358, + "mean_token_accuracy": 0.42413793206214906, + "step": 3405 + }, + { + "epoch": 0.003434588217045831, + "grad_norm": 68.53252070520398, + "learning_rate": 3.434522490582761e-06, + "loss": 2.3594, + "mean_token_accuracy": 0.42413793206214906, + "step": 3410 + }, + { + "epoch": 0.003439624270150004, + "grad_norm": 68.185013590259, + "learning_rate": 3.4395584473138213e-06, + "loss": 2.1012, + "mean_token_accuracy": 0.48154870271682737, + "step": 3415 + }, + { + "epoch": 0.0034446603232541765, + "grad_norm": 83.30862263246033, + "learning_rate": 3.4445944040448805e-06, + "loss": 2.1941, + "mean_token_accuracy": 0.4620689630508423, + "step": 3420 + }, + { + "epoch": 0.0034496963763583494, + "grad_norm": 104.99516517035869, + "learning_rate": 3.4496303607759406e-06, + "loss": 2.1823, + "mean_token_accuracy": 0.48457351326942444, + "step": 3425 + }, + { + "epoch": 0.0034547324294625223, + "grad_norm": 110.00124048542287, + "learning_rate": 3.454666317507e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.38965516686439516, + "step": 3430 + }, + { + "epoch": 0.003459768482566695, + "grad_norm": 74.82133672955143, + "learning_rate": 3.45970227423806e-06, + "loss": 2.6637, + "mean_token_accuracy": 0.3413793116807938, + "step": 3435 + }, + { + "epoch": 0.0034648045356708676, + "grad_norm": 72.25224979625271, + "learning_rate": 3.4647382309691196e-06, + "loss": 2.1014, + "mean_token_accuracy": 0.4896551609039307, + "step": 3440 + }, + { + "epoch": 0.0034698405887750405, + "grad_norm": 108.94761765016044, + "learning_rate": 3.4697741877001797e-06, + "loss": 2.0139, + "mean_token_accuracy": 0.482758617401123, + "step": 3445 + }, + { + "epoch": 0.0034748766418792134, + "grad_norm": 79.01038491752975, + "learning_rate": 3.474810144431239e-06, + "loss": 1.9689, + "mean_token_accuracy": 0.5068965435028077, + "step": 3450 + }, + { + "epoch": 0.003479912694983386, + "grad_norm": 78.71801163458723, + "learning_rate": 3.479846101162299e-06, + "loss": 2.0985, + "mean_token_accuracy": 0.5034482836723327, + "step": 3455 + }, + { + "epoch": 0.0034849487480875588, + "grad_norm": 63.097794630809865, + "learning_rate": 3.4848820578933588e-06, + "loss": 2.1702, + "mean_token_accuracy": 0.46551724076271056, + "step": 3460 + }, + { + "epoch": 0.0034899848011917317, + "grad_norm": 76.1510617565935, + "learning_rate": 3.489918014624419e-06, + "loss": 2.1298, + "mean_token_accuracy": 0.510344821214676, + "step": 3465 + }, + { + "epoch": 0.0034950208542959046, + "grad_norm": 92.86291262024004, + "learning_rate": 3.494953971355478e-06, + "loss": 2.0496, + "mean_token_accuracy": 0.4862068951129913, + "step": 3470 + }, + { + "epoch": 0.003500056907400077, + "grad_norm": 95.99020443429825, + "learning_rate": 3.4999899280865382e-06, + "loss": 2.1456, + "mean_token_accuracy": 0.4896551609039307, + "step": 3475 + }, + { + "epoch": 0.00350509296050425, + "grad_norm": 82.6329914564027, + "learning_rate": 3.5050258848175975e-06, + "loss": 2.3636, + "mean_token_accuracy": 0.4620689630508423, + "step": 3480 + }, + { + "epoch": 0.003510129013608423, + "grad_norm": 61.11018049383657, + "learning_rate": 3.5100618415486576e-06, + "loss": 2.2933, + "mean_token_accuracy": 0.42413792610168455, + "step": 3485 + }, + { + "epoch": 0.0035151650667125953, + "grad_norm": 119.59082860991383, + "learning_rate": 3.5150977982797173e-06, + "loss": 1.9017, + "mean_token_accuracy": 0.5206896424293518, + "step": 3490 + }, + { + "epoch": 0.003520201119816768, + "grad_norm": 62.215343437407796, + "learning_rate": 3.5201337550107774e-06, + "loss": 2.232, + "mean_token_accuracy": 0.4551724076271057, + "step": 3495 + }, + { + "epoch": 0.003525237172920941, + "grad_norm": 98.74880231547037, + "learning_rate": 3.5251697117418366e-06, + "loss": 2.4658, + "mean_token_accuracy": 0.4275861978530884, + "step": 3500 + }, + { + "epoch": 0.003530273226025114, + "grad_norm": 86.30846794670663, + "learning_rate": 3.5302056684728967e-06, + "loss": 2.5153, + "mean_token_accuracy": 0.40000000298023225, + "step": 3505 + }, + { + "epoch": 0.0035353092791292865, + "grad_norm": 77.02887776073075, + "learning_rate": 3.5352416252039564e-06, + "loss": 2.6338, + "mean_token_accuracy": 0.37586206793785093, + "step": 3510 + }, + { + "epoch": 0.0035403453322334594, + "grad_norm": 91.40390753427455, + "learning_rate": 3.5402775819350165e-06, + "loss": 2.0753, + "mean_token_accuracy": 0.5034482657909394, + "step": 3515 + }, + { + "epoch": 0.0035453813853376323, + "grad_norm": 54.2541043009632, + "learning_rate": 3.5453135386660757e-06, + "loss": 2.1156, + "mean_token_accuracy": 0.4655172348022461, + "step": 3520 + }, + { + "epoch": 0.0035504174384418047, + "grad_norm": 67.84233516952993, + "learning_rate": 3.550349495397136e-06, + "loss": 2.1998, + "mean_token_accuracy": 0.5, + "step": 3525 + }, + { + "epoch": 0.0035554534915459776, + "grad_norm": 76.00835204515016, + "learning_rate": 3.555385452128195e-06, + "loss": 2.2102, + "mean_token_accuracy": 0.47586206793785096, + "step": 3530 + }, + { + "epoch": 0.0035604895446501505, + "grad_norm": 64.18721302787216, + "learning_rate": 3.560421408859255e-06, + "loss": 2.0036, + "mean_token_accuracy": 0.4758620738983154, + "step": 3535 + }, + { + "epoch": 0.0035655255977543234, + "grad_norm": 69.72603043250474, + "learning_rate": 3.565457365590315e-06, + "loss": 2.2701, + "mean_token_accuracy": 0.4413793087005615, + "step": 3540 + }, + { + "epoch": 0.003570561650858496, + "grad_norm": 49.336332319641855, + "learning_rate": 3.570493322321375e-06, + "loss": 2.2199, + "mean_token_accuracy": 0.42413792610168455, + "step": 3545 + }, + { + "epoch": 0.0035755977039626688, + "grad_norm": 102.97511347272919, + "learning_rate": 3.5755292790524342e-06, + "loss": 2.1684, + "mean_token_accuracy": 0.48275861144065857, + "step": 3550 + }, + { + "epoch": 0.0035806337570668417, + "grad_norm": 75.69131383661562, + "learning_rate": 3.5805652357834943e-06, + "loss": 2.1375, + "mean_token_accuracy": 0.48965518474578856, + "step": 3555 + }, + { + "epoch": 0.003585669810171014, + "grad_norm": 85.29764511331626, + "learning_rate": 3.585601192514554e-06, + "loss": 1.9744, + "mean_token_accuracy": 0.5034482777118683, + "step": 3560 + }, + { + "epoch": 0.003590705863275187, + "grad_norm": 63.289273927785565, + "learning_rate": 3.590637149245614e-06, + "loss": 1.9565, + "mean_token_accuracy": 0.47586206197738645, + "step": 3565 + }, + { + "epoch": 0.00359574191637936, + "grad_norm": 80.30134192622461, + "learning_rate": 3.5956731059766733e-06, + "loss": 2.1315, + "mean_token_accuracy": 0.48275862336158754, + "step": 3570 + }, + { + "epoch": 0.0036007779694835324, + "grad_norm": 50.647434265493764, + "learning_rate": 3.6007090627077334e-06, + "loss": 2.1857, + "mean_token_accuracy": 0.4676345944404602, + "step": 3575 + }, + { + "epoch": 0.0036058140225877053, + "grad_norm": 65.72498649601756, + "learning_rate": 3.6057450194387927e-06, + "loss": 2.4623, + "mean_token_accuracy": 0.42510586977005005, + "step": 3580 + }, + { + "epoch": 0.003610850075691878, + "grad_norm": 94.02834505436228, + "learning_rate": 3.610780976169853e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.43793103098869324, + "step": 3585 + }, + { + "epoch": 0.003615886128796051, + "grad_norm": 74.30497161061314, + "learning_rate": 3.6158169329009125e-06, + "loss": 2.0252, + "mean_token_accuracy": 0.49655172824859617, + "step": 3590 + }, + { + "epoch": 0.0036209221819002236, + "grad_norm": 47.520490622559606, + "learning_rate": 3.6208528896319726e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.4241379380226135, + "step": 3595 + }, + { + "epoch": 0.0036259582350043965, + "grad_norm": 63.18103921688582, + "learning_rate": 3.625888846363032e-06, + "loss": 2.6274, + "mean_token_accuracy": 0.4034482777118683, + "step": 3600 + }, + { + "epoch": 0.0036309942881085694, + "grad_norm": 71.20204844074186, + "learning_rate": 3.630924803094092e-06, + "loss": 1.8998, + "mean_token_accuracy": 0.5206896543502808, + "step": 3605 + }, + { + "epoch": 0.003636030341212742, + "grad_norm": 86.46180197318455, + "learning_rate": 3.6359607598251516e-06, + "loss": 2.3773, + "mean_token_accuracy": 0.458620685338974, + "step": 3610 + }, + { + "epoch": 0.0036410663943169147, + "grad_norm": 68.88902774157108, + "learning_rate": 3.6409967165562117e-06, + "loss": 2.54, + "mean_token_accuracy": 0.4034482717514038, + "step": 3615 + }, + { + "epoch": 0.0036461024474210876, + "grad_norm": 84.03757858949756, + "learning_rate": 3.646032673287271e-06, + "loss": 1.9773, + "mean_token_accuracy": 0.5127646744251251, + "step": 3620 + }, + { + "epoch": 0.0036511385005252605, + "grad_norm": 98.45081092487356, + "learning_rate": 3.651068630018331e-06, + "loss": 2.6486, + "mean_token_accuracy": 0.3793103456497192, + "step": 3625 + }, + { + "epoch": 0.003656174553629433, + "grad_norm": 65.57839039921986, + "learning_rate": 3.6561045867493903e-06, + "loss": 2.2012, + "mean_token_accuracy": 0.46376285552978513, + "step": 3630 + }, + { + "epoch": 0.003661210606733606, + "grad_norm": 48.663535680537514, + "learning_rate": 3.661140543480451e-06, + "loss": 1.9311, + "mean_token_accuracy": 0.5689655184745789, + "step": 3635 + }, + { + "epoch": 0.0036662466598377788, + "grad_norm": 71.542719255767, + "learning_rate": 3.66617650021151e-06, + "loss": 1.9894, + "mean_token_accuracy": 0.4896551728248596, + "step": 3640 + }, + { + "epoch": 0.0036712827129419512, + "grad_norm": 80.30064279528494, + "learning_rate": 3.67121245694257e-06, + "loss": 2.3826, + "mean_token_accuracy": 0.4551724076271057, + "step": 3645 + }, + { + "epoch": 0.003676318766046124, + "grad_norm": 58.98396996137691, + "learning_rate": 3.6762484136736294e-06, + "loss": 2.1597, + "mean_token_accuracy": 0.48771928548812865, + "step": 3650 + }, + { + "epoch": 0.003681354819150297, + "grad_norm": 219.89255193600903, + "learning_rate": 3.6812843704046895e-06, + "loss": 2.3411, + "mean_token_accuracy": 0.43103448748588563, + "step": 3655 + }, + { + "epoch": 0.00368639087225447, + "grad_norm": 64.68524497717114, + "learning_rate": 3.6863203271357492e-06, + "loss": 2.1244, + "mean_token_accuracy": 0.4760435581207275, + "step": 3660 + }, + { + "epoch": 0.0036914269253586424, + "grad_norm": 95.24641271260893, + "learning_rate": 3.6913562838668093e-06, + "loss": 2.3517, + "mean_token_accuracy": 0.4551724135875702, + "step": 3665 + }, + { + "epoch": 0.0036964629784628153, + "grad_norm": 62.979719804286965, + "learning_rate": 3.6963922405978686e-06, + "loss": 2.4041, + "mean_token_accuracy": 0.40000000298023225, + "step": 3670 + }, + { + "epoch": 0.003701499031566988, + "grad_norm": 69.66840206666882, + "learning_rate": 3.7014281973289287e-06, + "loss": 2.1918, + "mean_token_accuracy": 0.43793103098869324, + "step": 3675 + }, + { + "epoch": 0.0037065350846711607, + "grad_norm": 66.95460023221484, + "learning_rate": 3.7064641540599883e-06, + "loss": 2.5266, + "mean_token_accuracy": 0.3793103516101837, + "step": 3680 + }, + { + "epoch": 0.0037115711377753336, + "grad_norm": 49.14894012709388, + "learning_rate": 3.7115001107910484e-06, + "loss": 2.2411, + "mean_token_accuracy": 0.4931034564971924, + "step": 3685 + }, + { + "epoch": 0.0037166071908795065, + "grad_norm": 76.64945081703381, + "learning_rate": 3.7165360675221085e-06, + "loss": 2.2674, + "mean_token_accuracy": 0.4620689570903778, + "step": 3690 + }, + { + "epoch": 0.0037216432439836794, + "grad_norm": 72.07156586487014, + "learning_rate": 3.721572024253168e-06, + "loss": 2.1469, + "mean_token_accuracy": 0.4517241418361664, + "step": 3695 + }, + { + "epoch": 0.003726679297087852, + "grad_norm": 55.83660019268889, + "learning_rate": 3.726607980984228e-06, + "loss": 2.0973, + "mean_token_accuracy": 0.47241379618644713, + "step": 3700 + }, + { + "epoch": 0.0037317153501920247, + "grad_norm": 85.50471241152457, + "learning_rate": 3.731643937715287e-06, + "loss": 2.258, + "mean_token_accuracy": 0.4586206912994385, + "step": 3705 + }, + { + "epoch": 0.0037367514032961976, + "grad_norm": 74.0609858646388, + "learning_rate": 3.7366798944463473e-06, + "loss": 2.2093, + "mean_token_accuracy": 0.5068965554237366, + "step": 3710 + }, + { + "epoch": 0.00374178745640037, + "grad_norm": 103.45310432229822, + "learning_rate": 3.741715851177407e-06, + "loss": 2.6597, + "mean_token_accuracy": 0.4068965554237366, + "step": 3715 + }, + { + "epoch": 0.003746823509504543, + "grad_norm": 98.12325393792392, + "learning_rate": 3.746751807908467e-06, + "loss": 2.4466, + "mean_token_accuracy": 0.4448275864124298, + "step": 3720 + }, + { + "epoch": 0.003751859562608716, + "grad_norm": 57.1653678469413, + "learning_rate": 3.7517877646395263e-06, + "loss": 2.0738, + "mean_token_accuracy": 0.4862068951129913, + "step": 3725 + }, + { + "epoch": 0.0037568956157128888, + "grad_norm": 70.8981827875069, + "learning_rate": 3.7568237213705864e-06, + "loss": 1.9772, + "mean_token_accuracy": 0.49655171632766726, + "step": 3730 + }, + { + "epoch": 0.0037619316688170612, + "grad_norm": 61.30132774505522, + "learning_rate": 3.761859678101646e-06, + "loss": 2.6244, + "mean_token_accuracy": 0.4, + "step": 3735 + }, + { + "epoch": 0.003766967721921234, + "grad_norm": 67.09862478096984, + "learning_rate": 3.766895634832706e-06, + "loss": 2.0188, + "mean_token_accuracy": 0.5172413766384125, + "step": 3740 + }, + { + "epoch": 0.003772003775025407, + "grad_norm": 66.351129040528, + "learning_rate": 3.7719315915637654e-06, + "loss": 2.3546, + "mean_token_accuracy": 0.3999999940395355, + "step": 3745 + }, + { + "epoch": 0.0037770398281295795, + "grad_norm": 68.18041714322693, + "learning_rate": 3.7769675482948255e-06, + "loss": 2.1001, + "mean_token_accuracy": 0.4931034445762634, + "step": 3750 + }, + { + "epoch": 0.0037820758812337524, + "grad_norm": 72.55500172162084, + "learning_rate": 3.7820035050258848e-06, + "loss": 2.3197, + "mean_token_accuracy": 0.40689654350280763, + "step": 3755 + }, + { + "epoch": 0.0037871119343379253, + "grad_norm": 62.17451142262177, + "learning_rate": 3.787039461756945e-06, + "loss": 2.176, + "mean_token_accuracy": 0.4689655125141144, + "step": 3760 + }, + { + "epoch": 0.003792147987442098, + "grad_norm": 68.48842259879379, + "learning_rate": 3.7920754184880045e-06, + "loss": 2.1518, + "mean_token_accuracy": 0.4724137902259827, + "step": 3765 + }, + { + "epoch": 0.0037971840405462707, + "grad_norm": 89.95885898653464, + "learning_rate": 3.7971113752190646e-06, + "loss": 2.1153, + "mean_token_accuracy": 0.4706593990325928, + "step": 3770 + }, + { + "epoch": 0.0038022200936504436, + "grad_norm": 52.615998656096316, + "learning_rate": 3.802147331950124e-06, + "loss": 2.5564, + "mean_token_accuracy": 0.4258318245410919, + "step": 3775 + }, + { + "epoch": 0.0038072561467546165, + "grad_norm": 62.07442076895445, + "learning_rate": 3.807183288681184e-06, + "loss": 1.8344, + "mean_token_accuracy": 0.5160919547080993, + "step": 3780 + }, + { + "epoch": 0.003812292199858789, + "grad_norm": 61.03177882638895, + "learning_rate": 3.8122192454122437e-06, + "loss": 2.0359, + "mean_token_accuracy": 0.5, + "step": 3785 + }, + { + "epoch": 0.003817328252962962, + "grad_norm": 85.74588906043324, + "learning_rate": 3.817255202143303e-06, + "loss": 2.3117, + "mean_token_accuracy": 0.45517241954803467, + "step": 3790 + }, + { + "epoch": 0.0038223643060671347, + "grad_norm": 89.95195355878391, + "learning_rate": 3.8222911588743634e-06, + "loss": 2.1961, + "mean_token_accuracy": 0.4448275864124298, + "step": 3795 + }, + { + "epoch": 0.003827400359171307, + "grad_norm": 71.4226892060403, + "learning_rate": 3.8273271156054235e-06, + "loss": 2.1332, + "mean_token_accuracy": 0.48275862336158754, + "step": 3800 + }, + { + "epoch": 0.00383243641227548, + "grad_norm": 75.86783507205627, + "learning_rate": 3.832363072336483e-06, + "loss": 2.4997, + "mean_token_accuracy": 0.41724138259887694, + "step": 3805 + }, + { + "epoch": 0.003837472465379653, + "grad_norm": 74.3842085733325, + "learning_rate": 3.837399029067543e-06, + "loss": 2.2921, + "mean_token_accuracy": 0.44621899724006653, + "step": 3810 + }, + { + "epoch": 0.003842508518483826, + "grad_norm": 59.76921710712986, + "learning_rate": 3.842434985798602e-06, + "loss": 2.0702, + "mean_token_accuracy": 0.5034482777118683, + "step": 3815 + }, + { + "epoch": 0.0038475445715879983, + "grad_norm": 79.43928032912176, + "learning_rate": 3.847470942529662e-06, + "loss": 2.3318, + "mean_token_accuracy": 0.4482758641242981, + "step": 3820 + }, + { + "epoch": 0.0038525806246921712, + "grad_norm": 73.53894426401041, + "learning_rate": 3.8525068992607215e-06, + "loss": 2.0034, + "mean_token_accuracy": 0.4896551728248596, + "step": 3825 + }, + { + "epoch": 0.003857616677796344, + "grad_norm": 90.13419870256526, + "learning_rate": 3.857542855991782e-06, + "loss": 2.5503, + "mean_token_accuracy": 0.3999999940395355, + "step": 3830 + }, + { + "epoch": 0.0038626527309005166, + "grad_norm": 71.5623849820841, + "learning_rate": 3.862578812722841e-06, + "loss": 2.1522, + "mean_token_accuracy": 0.47931033968925474, + "step": 3835 + }, + { + "epoch": 0.0038676887840046895, + "grad_norm": 88.93229072009744, + "learning_rate": 3.867614769453901e-06, + "loss": 2.3312, + "mean_token_accuracy": 0.4482758641242981, + "step": 3840 + }, + { + "epoch": 0.0038727248371088624, + "grad_norm": 95.57253395168001, + "learning_rate": 3.872650726184961e-06, + "loss": 2.3891, + "mean_token_accuracy": 0.4103448212146759, + "step": 3845 + }, + { + "epoch": 0.0038777608902130353, + "grad_norm": 99.53244641082048, + "learning_rate": 3.877686682916021e-06, + "loss": 2.1269, + "mean_token_accuracy": 0.4724137902259827, + "step": 3850 + }, + { + "epoch": 0.0038827969433172078, + "grad_norm": 66.32690974374738, + "learning_rate": 3.88272263964708e-06, + "loss": 2.3929, + "mean_token_accuracy": 0.4068965554237366, + "step": 3855 + }, + { + "epoch": 0.0038878329964213807, + "grad_norm": 72.58499868972568, + "learning_rate": 3.8877585963781405e-06, + "loss": 2.2971, + "mean_token_accuracy": 0.4517241358757019, + "step": 3860 + }, + { + "epoch": 0.0038928690495255536, + "grad_norm": 81.1126693490413, + "learning_rate": 3.8927945531092e-06, + "loss": 2.3377, + "mean_token_accuracy": 0.44343616962432864, + "step": 3865 + }, + { + "epoch": 0.003897905102629726, + "grad_norm": 80.47860055164507, + "learning_rate": 3.89783050984026e-06, + "loss": 2.3569, + "mean_token_accuracy": 0.4310344696044922, + "step": 3870 + }, + { + "epoch": 0.003902941155733899, + "grad_norm": 80.35359476634429, + "learning_rate": 3.902866466571319e-06, + "loss": 2.3744, + "mean_token_accuracy": 0.4344827473163605, + "step": 3875 + }, + { + "epoch": 0.003907977208838072, + "grad_norm": 48.78239038260757, + "learning_rate": 3.907902423302379e-06, + "loss": 1.9804, + "mean_token_accuracy": 0.5151477813720703, + "step": 3880 + }, + { + "epoch": 0.003913013261942245, + "grad_norm": 62.476966785250376, + "learning_rate": 3.9129383800334385e-06, + "loss": 1.729, + "mean_token_accuracy": 0.558620685338974, + "step": 3885 + }, + { + "epoch": 0.003918049315046418, + "grad_norm": 65.41113658727998, + "learning_rate": 3.9179743367644986e-06, + "loss": 2.0854, + "mean_token_accuracy": 0.48965516686439514, + "step": 3890 + }, + { + "epoch": 0.00392308536815059, + "grad_norm": 74.59678459847665, + "learning_rate": 3.923010293495559e-06, + "loss": 2.264, + "mean_token_accuracy": 0.4517241358757019, + "step": 3895 + }, + { + "epoch": 0.0039281214212547625, + "grad_norm": 58.99977571026314, + "learning_rate": 3.928046250226619e-06, + "loss": 2.1787, + "mean_token_accuracy": 0.47586206197738645, + "step": 3900 + }, + { + "epoch": 0.0039331574743589354, + "grad_norm": 48.427653762972085, + "learning_rate": 3.933082206957678e-06, + "loss": 2.2662, + "mean_token_accuracy": 0.482758617401123, + "step": 3905 + }, + { + "epoch": 0.003938193527463108, + "grad_norm": 58.61050058841118, + "learning_rate": 3.938118163688738e-06, + "loss": 2.0359, + "mean_token_accuracy": 0.46896551847457885, + "step": 3910 + }, + { + "epoch": 0.003943229580567281, + "grad_norm": 56.57091735569473, + "learning_rate": 3.943154120419797e-06, + "loss": 2.3083, + "mean_token_accuracy": 0.47586206793785096, + "step": 3915 + }, + { + "epoch": 0.003948265633671454, + "grad_norm": 66.3296435475827, + "learning_rate": 3.9481900771508575e-06, + "loss": 2.4199, + "mean_token_accuracy": 0.4068965554237366, + "step": 3920 + }, + { + "epoch": 0.003953301686775627, + "grad_norm": 54.75842624216521, + "learning_rate": 3.953226033881917e-06, + "loss": 2.2852, + "mean_token_accuracy": 0.4724137902259827, + "step": 3925 + }, + { + "epoch": 0.003958337739879799, + "grad_norm": 87.54273533048352, + "learning_rate": 3.958261990612977e-06, + "loss": 2.2838, + "mean_token_accuracy": 0.42758620381355283, + "step": 3930 + }, + { + "epoch": 0.003963373792983972, + "grad_norm": 65.62266502700778, + "learning_rate": 3.963297947344036e-06, + "loss": 2.2713, + "mean_token_accuracy": 0.4206896543502808, + "step": 3935 + }, + { + "epoch": 0.003968409846088145, + "grad_norm": 53.90634618729618, + "learning_rate": 3.968333904075096e-06, + "loss": 2.1689, + "mean_token_accuracy": 0.48451300859451296, + "step": 3940 + }, + { + "epoch": 0.003973445899192318, + "grad_norm": 99.00851267101648, + "learning_rate": 3.973369860806156e-06, + "loss": 2.1535, + "mean_token_accuracy": 0.47931034564971925, + "step": 3945 + }, + { + "epoch": 0.003978481952296491, + "grad_norm": 84.25507492981015, + "learning_rate": 3.978405817537216e-06, + "loss": 2.2916, + "mean_token_accuracy": 0.4551724135875702, + "step": 3950 + }, + { + "epoch": 0.0039835180054006636, + "grad_norm": 62.56338182840301, + "learning_rate": 3.983441774268276e-06, + "loss": 2.3058, + "mean_token_accuracy": 0.42413792610168455, + "step": 3955 + }, + { + "epoch": 0.0039885540585048364, + "grad_norm": 62.169913336503825, + "learning_rate": 3.988477730999336e-06, + "loss": 2.0497, + "mean_token_accuracy": 0.5034482777118683, + "step": 3960 + }, + { + "epoch": 0.0039935901116090085, + "grad_norm": 76.66567210340112, + "learning_rate": 3.993513687730395e-06, + "loss": 2.5195, + "mean_token_accuracy": 0.4137930989265442, + "step": 3965 + }, + { + "epoch": 0.003998626164713181, + "grad_norm": 62.606570648472726, + "learning_rate": 3.998549644461455e-06, + "loss": 2.1871, + "mean_token_accuracy": 0.4965517222881317, + "step": 3970 + }, + { + "epoch": 0.004003662217817354, + "grad_norm": 86.38547380472534, + "learning_rate": 4.003585601192514e-06, + "loss": 2.3424, + "mean_token_accuracy": 0.4413793087005615, + "step": 3975 + }, + { + "epoch": 0.004008698270921527, + "grad_norm": 117.61303072590046, + "learning_rate": 4.0086215579235744e-06, + "loss": 2.2574, + "mean_token_accuracy": 0.4379310369491577, + "step": 3980 + }, + { + "epoch": 0.0040137343240257, + "grad_norm": 99.93116454487797, + "learning_rate": 4.013657514654634e-06, + "loss": 2.2737, + "mean_token_accuracy": 0.44827585816383364, + "step": 3985 + }, + { + "epoch": 0.004018770377129873, + "grad_norm": 64.85471790719244, + "learning_rate": 4.018693471385694e-06, + "loss": 2.0841, + "mean_token_accuracy": 0.5083743929862976, + "step": 3990 + }, + { + "epoch": 0.004023806430234046, + "grad_norm": 78.67211476879521, + "learning_rate": 4.023729428116754e-06, + "loss": 2.25, + "mean_token_accuracy": 0.44827587008476255, + "step": 3995 + }, + { + "epoch": 0.004028842483338218, + "grad_norm": 77.65505798090179, + "learning_rate": 4.028765384847814e-06, + "loss": 2.373, + "mean_token_accuracy": 0.46896551847457885, + "step": 4000 + }, + { + "epoch": 0.004033878536442391, + "grad_norm": 74.0153516668006, + "learning_rate": 4.033801341578873e-06, + "loss": 1.8359, + "mean_token_accuracy": 0.5482391357421875, + "step": 4005 + }, + { + "epoch": 0.004038914589546564, + "grad_norm": 82.77301271541326, + "learning_rate": 4.038837298309933e-06, + "loss": 2.3718, + "mean_token_accuracy": 0.37586206793785093, + "step": 4010 + }, + { + "epoch": 0.004043950642650737, + "grad_norm": 63.0783373081286, + "learning_rate": 4.043873255040993e-06, + "loss": 2.2047, + "mean_token_accuracy": 0.4774954617023468, + "step": 4015 + }, + { + "epoch": 0.0040489866957549095, + "grad_norm": 62.26969492820148, + "learning_rate": 4.048909211772053e-06, + "loss": 2.2346, + "mean_token_accuracy": 0.47241379618644713, + "step": 4020 + }, + { + "epoch": 0.004054022748859082, + "grad_norm": 95.70289653640307, + "learning_rate": 4.053945168503112e-06, + "loss": 2.2305, + "mean_token_accuracy": 0.4637023627758026, + "step": 4025 + }, + { + "epoch": 0.004059058801963255, + "grad_norm": 88.17929760264013, + "learning_rate": 4.058981125234172e-06, + "loss": 2.1404, + "mean_token_accuracy": 0.46551724672317507, + "step": 4030 + }, + { + "epoch": 0.004064094855067427, + "grad_norm": 72.68733610313816, + "learning_rate": 4.064017081965231e-06, + "loss": 2.1502, + "mean_token_accuracy": 0.46551724076271056, + "step": 4035 + }, + { + "epoch": 0.0040691309081716, + "grad_norm": 49.552751477214244, + "learning_rate": 4.069053038696291e-06, + "loss": 2.2424, + "mean_token_accuracy": 0.441379314661026, + "step": 4040 + }, + { + "epoch": 0.004074166961275773, + "grad_norm": 68.25887709540862, + "learning_rate": 4.0740889954273515e-06, + "loss": 2.311, + "mean_token_accuracy": 0.42758620381355283, + "step": 4045 + }, + { + "epoch": 0.004079203014379946, + "grad_norm": 56.62278606811682, + "learning_rate": 4.079124952158412e-06, + "loss": 2.3819, + "mean_token_accuracy": 0.4551724135875702, + "step": 4050 + }, + { + "epoch": 0.004084239067484119, + "grad_norm": 69.46349583210451, + "learning_rate": 4.084160908889471e-06, + "loss": 2.1256, + "mean_token_accuracy": 0.5000000059604645, + "step": 4055 + }, + { + "epoch": 0.004089275120588292, + "grad_norm": 88.98933765721326, + "learning_rate": 4.089196865620531e-06, + "loss": 2.3185, + "mean_token_accuracy": 0.41724138259887694, + "step": 4060 + }, + { + "epoch": 0.004094311173692465, + "grad_norm": 84.68266311921388, + "learning_rate": 4.09423282235159e-06, + "loss": 2.3945, + "mean_token_accuracy": 0.4344827651977539, + "step": 4065 + }, + { + "epoch": 0.004099347226796637, + "grad_norm": 89.7442286741659, + "learning_rate": 4.09926877908265e-06, + "loss": 2.0471, + "mean_token_accuracy": 0.5310344755649566, + "step": 4070 + }, + { + "epoch": 0.00410438327990081, + "grad_norm": 59.5009210643266, + "learning_rate": 4.1043047358137096e-06, + "loss": 2.3767, + "mean_token_accuracy": 0.4, + "step": 4075 + }, + { + "epoch": 0.0041094193330049825, + "grad_norm": 108.58049568685225, + "learning_rate": 4.10934069254477e-06, + "loss": 2.0846, + "mean_token_accuracy": 0.4517241358757019, + "step": 4080 + }, + { + "epoch": 0.0041144553861091554, + "grad_norm": 108.38210125411696, + "learning_rate": 4.114376649275829e-06, + "loss": 2.4153, + "mean_token_accuracy": 0.4448275864124298, + "step": 4085 + }, + { + "epoch": 0.004119491439213328, + "grad_norm": 67.85716210742117, + "learning_rate": 4.119412606006889e-06, + "loss": 2.326, + "mean_token_accuracy": 0.4517241358757019, + "step": 4090 + }, + { + "epoch": 0.004124527492317501, + "grad_norm": 79.18843431472932, + "learning_rate": 4.124448562737949e-06, + "loss": 2.4928, + "mean_token_accuracy": 0.42413793206214906, + "step": 4095 + }, + { + "epoch": 0.004129563545421674, + "grad_norm": 63.522685747569085, + "learning_rate": 4.129484519469009e-06, + "loss": 2.4672, + "mean_token_accuracy": 0.43103447556495667, + "step": 4100 + }, + { + "epoch": 0.004134599598525846, + "grad_norm": 115.60913548148612, + "learning_rate": 4.134520476200069e-06, + "loss": 2.2976, + "mean_token_accuracy": 0.46551724076271056, + "step": 4105 + }, + { + "epoch": 0.004139635651630019, + "grad_norm": 59.60227736143606, + "learning_rate": 4.1395564329311286e-06, + "loss": 2.0141, + "mean_token_accuracy": 0.5206896543502808, + "step": 4110 + }, + { + "epoch": 0.004144671704734192, + "grad_norm": 58.75462543465369, + "learning_rate": 4.144592389662189e-06, + "loss": 2.1149, + "mean_token_accuracy": 0.46551724076271056, + "step": 4115 + }, + { + "epoch": 0.004149707757838365, + "grad_norm": 86.59490868521627, + "learning_rate": 4.149628346393248e-06, + "loss": 2.4158, + "mean_token_accuracy": 0.46551724672317507, + "step": 4120 + }, + { + "epoch": 0.004154743810942538, + "grad_norm": 78.14889020209608, + "learning_rate": 4.154664303124308e-06, + "loss": 2.3572, + "mean_token_accuracy": 0.4206896543502808, + "step": 4125 + }, + { + "epoch": 0.004159779864046711, + "grad_norm": 71.21794193286092, + "learning_rate": 4.159700259855367e-06, + "loss": 2.23, + "mean_token_accuracy": 0.482758617401123, + "step": 4130 + }, + { + "epoch": 0.0041648159171508835, + "grad_norm": 76.80361300018873, + "learning_rate": 4.164736216586427e-06, + "loss": 2.0961, + "mean_token_accuracy": 0.4931034505367279, + "step": 4135 + }, + { + "epoch": 0.004169851970255056, + "grad_norm": 70.3361635992732, + "learning_rate": 4.169772173317487e-06, + "loss": 2.1367, + "mean_token_accuracy": 0.4931034564971924, + "step": 4140 + }, + { + "epoch": 0.0041748880233592285, + "grad_norm": 79.17235931033579, + "learning_rate": 4.174808130048547e-06, + "loss": 2.1642, + "mean_token_accuracy": 0.4517241358757019, + "step": 4145 + }, + { + "epoch": 0.004179924076463401, + "grad_norm": 56.08195935616169, + "learning_rate": 4.179844086779607e-06, + "loss": 2.3033, + "mean_token_accuracy": 0.42758620381355283, + "step": 4150 + }, + { + "epoch": 0.004184960129567574, + "grad_norm": 65.64879341776756, + "learning_rate": 4.184880043510667e-06, + "loss": 2.2199, + "mean_token_accuracy": 0.482758617401123, + "step": 4155 + }, + { + "epoch": 0.004189996182671747, + "grad_norm": 64.95543229406819, + "learning_rate": 4.189916000241726e-06, + "loss": 2.2161, + "mean_token_accuracy": 0.46896551847457885, + "step": 4160 + }, + { + "epoch": 0.00419503223577592, + "grad_norm": 65.43334328689735, + "learning_rate": 4.194951956972786e-06, + "loss": 2.4811, + "mean_token_accuracy": 0.4610405325889587, + "step": 4165 + }, + { + "epoch": 0.004200068288880093, + "grad_norm": 97.81663041597253, + "learning_rate": 4.1999879137038455e-06, + "loss": 2.3968, + "mean_token_accuracy": 0.44482759237289426, + "step": 4170 + }, + { + "epoch": 0.004205104341984265, + "grad_norm": 57.72628417556213, + "learning_rate": 4.205023870434906e-06, + "loss": 2.3253, + "mean_token_accuracy": 0.4568058133125305, + "step": 4175 + }, + { + "epoch": 0.004210140395088438, + "grad_norm": 49.56207392096942, + "learning_rate": 4.210059827165965e-06, + "loss": 1.9694, + "mean_token_accuracy": 0.5034482777118683, + "step": 4180 + }, + { + "epoch": 0.004215176448192611, + "grad_norm": 62.83170327084586, + "learning_rate": 4.215095783897025e-06, + "loss": 1.9727, + "mean_token_accuracy": 0.534482765197754, + "step": 4185 + }, + { + "epoch": 0.004220212501296784, + "grad_norm": 62.69438576282593, + "learning_rate": 4.220131740628084e-06, + "loss": 2.1881, + "mean_token_accuracy": 0.4896551787853241, + "step": 4190 + }, + { + "epoch": 0.004225248554400957, + "grad_norm": 91.54571706784118, + "learning_rate": 4.225167697359144e-06, + "loss": 2.308, + "mean_token_accuracy": 0.44337567687034607, + "step": 4195 + }, + { + "epoch": 0.0042302846075051295, + "grad_norm": 69.24260088954128, + "learning_rate": 4.2302036540902044e-06, + "loss": 2.0292, + "mean_token_accuracy": 0.4620689630508423, + "step": 4200 + }, + { + "epoch": 0.004235320660609302, + "grad_norm": 57.414343945959466, + "learning_rate": 4.2352396108212645e-06, + "loss": 2.5571, + "mean_token_accuracy": 0.3793103456497192, + "step": 4205 + }, + { + "epoch": 0.004240356713713474, + "grad_norm": 58.793211970168, + "learning_rate": 4.240275567552324e-06, + "loss": 2.2745, + "mean_token_accuracy": 0.47586206793785096, + "step": 4210 + }, + { + "epoch": 0.004245392766817647, + "grad_norm": 65.17152410248484, + "learning_rate": 4.245311524283384e-06, + "loss": 2.4349, + "mean_token_accuracy": 0.4034482777118683, + "step": 4215 + }, + { + "epoch": 0.00425042881992182, + "grad_norm": 72.72687356845236, + "learning_rate": 4.250347481014443e-06, + "loss": 2.1792, + "mean_token_accuracy": 0.4724137902259827, + "step": 4220 + }, + { + "epoch": 0.004255464873025993, + "grad_norm": 62.49065108487903, + "learning_rate": 4.255383437745503e-06, + "loss": 2.3734, + "mean_token_accuracy": 0.441379314661026, + "step": 4225 + }, + { + "epoch": 0.004260500926130166, + "grad_norm": 54.81935974666083, + "learning_rate": 4.2604193944765625e-06, + "loss": 1.9219, + "mean_token_accuracy": 0.48275861144065857, + "step": 4230 + }, + { + "epoch": 0.004265536979234339, + "grad_norm": 68.19630894776515, + "learning_rate": 4.265455351207623e-06, + "loss": 2.3024, + "mean_token_accuracy": 0.4448275864124298, + "step": 4235 + }, + { + "epoch": 0.004270573032338512, + "grad_norm": 59.49425627894097, + "learning_rate": 4.270491307938682e-06, + "loss": 2.2641, + "mean_token_accuracy": 0.42758620381355283, + "step": 4240 + }, + { + "epoch": 0.004275609085442684, + "grad_norm": 76.29664193763357, + "learning_rate": 4.275527264669743e-06, + "loss": 2.3725, + "mean_token_accuracy": 0.4344827592372894, + "step": 4245 + }, + { + "epoch": 0.004280645138546857, + "grad_norm": 57.16522446964406, + "learning_rate": 4.280563221400802e-06, + "loss": 2.2484, + "mean_token_accuracy": 0.42068966031074523, + "step": 4250 + }, + { + "epoch": 0.00428568119165103, + "grad_norm": 48.86394236491711, + "learning_rate": 4.285599178131862e-06, + "loss": 2.2847, + "mean_token_accuracy": 0.44827587008476255, + "step": 4255 + }, + { + "epoch": 0.0042907172447552025, + "grad_norm": 68.95637394188687, + "learning_rate": 4.290635134862921e-06, + "loss": 2.2714, + "mean_token_accuracy": 0.43974592089653014, + "step": 4260 + }, + { + "epoch": 0.004295753297859375, + "grad_norm": 75.88635881864354, + "learning_rate": 4.2956710915939815e-06, + "loss": 2.4482, + "mean_token_accuracy": 0.44137930274009707, + "step": 4265 + }, + { + "epoch": 0.004300789350963548, + "grad_norm": 66.10308244526328, + "learning_rate": 4.300707048325041e-06, + "loss": 2.1913, + "mean_token_accuracy": 0.4448275864124298, + "step": 4270 + }, + { + "epoch": 0.004305825404067721, + "grad_norm": 90.8791867607473, + "learning_rate": 4.305743005056101e-06, + "loss": 2.3027, + "mean_token_accuracy": 0.42758620977401735, + "step": 4275 + }, + { + "epoch": 0.004310861457171893, + "grad_norm": 59.049802787426515, + "learning_rate": 4.31077896178716e-06, + "loss": 2.5168, + "mean_token_accuracy": 0.46551724076271056, + "step": 4280 + }, + { + "epoch": 0.004315897510276066, + "grad_norm": 69.93615957051077, + "learning_rate": 4.31581491851822e-06, + "loss": 1.9633, + "mean_token_accuracy": 0.5223230361938477, + "step": 4285 + }, + { + "epoch": 0.004320933563380239, + "grad_norm": 67.34699780506475, + "learning_rate": 4.32085087524928e-06, + "loss": 2.364, + "mean_token_accuracy": 0.4379310369491577, + "step": 4290 + }, + { + "epoch": 0.004325969616484412, + "grad_norm": 65.20172878460436, + "learning_rate": 4.32588683198034e-06, + "loss": 2.1659, + "mean_token_accuracy": 0.4709618926048279, + "step": 4295 + }, + { + "epoch": 0.004331005669588585, + "grad_norm": 49.160352658060376, + "learning_rate": 4.3309227887114e-06, + "loss": 2.4155, + "mean_token_accuracy": 0.4, + "step": 4300 + }, + { + "epoch": 0.004336041722692758, + "grad_norm": 67.83562963406249, + "learning_rate": 4.33595874544246e-06, + "loss": 2.1095, + "mean_token_accuracy": 0.47931033968925474, + "step": 4305 + }, + { + "epoch": 0.004341077775796931, + "grad_norm": 63.95533170846425, + "learning_rate": 4.340994702173519e-06, + "loss": 1.9979, + "mean_token_accuracy": 0.46896551847457885, + "step": 4310 + }, + { + "epoch": 0.004346113828901103, + "grad_norm": 75.13482461195716, + "learning_rate": 4.346030658904579e-06, + "loss": 2.4099, + "mean_token_accuracy": 0.39655172228813174, + "step": 4315 + }, + { + "epoch": 0.004351149882005276, + "grad_norm": 55.339938507732924, + "learning_rate": 4.351066615635638e-06, + "loss": 2.3664, + "mean_token_accuracy": 0.42758620977401735, + "step": 4320 + }, + { + "epoch": 0.0043561859351094485, + "grad_norm": 55.65167695804991, + "learning_rate": 4.3561025723666985e-06, + "loss": 2.3625, + "mean_token_accuracy": 0.42758620381355283, + "step": 4325 + }, + { + "epoch": 0.004361221988213621, + "grad_norm": 98.10746909641446, + "learning_rate": 4.361138529097758e-06, + "loss": 2.4246, + "mean_token_accuracy": 0.4638838529586792, + "step": 4330 + }, + { + "epoch": 0.004366258041317794, + "grad_norm": 65.04298626807085, + "learning_rate": 4.366174485828818e-06, + "loss": 1.9045, + "mean_token_accuracy": 0.4862068951129913, + "step": 4335 + }, + { + "epoch": 0.004371294094421967, + "grad_norm": 43.4520912677257, + "learning_rate": 4.371210442559878e-06, + "loss": 1.9465, + "mean_token_accuracy": 0.4965517222881317, + "step": 4340 + }, + { + "epoch": 0.00437633014752614, + "grad_norm": 69.93909397068933, + "learning_rate": 4.376246399290938e-06, + "loss": 2.1646, + "mean_token_accuracy": 0.44827585220336913, + "step": 4345 + }, + { + "epoch": 0.004381366200630312, + "grad_norm": 77.74271973392179, + "learning_rate": 4.381282356021997e-06, + "loss": 2.4375, + "mean_token_accuracy": 0.4172413766384125, + "step": 4350 + }, + { + "epoch": 0.004386402253734485, + "grad_norm": 69.83835820188298, + "learning_rate": 4.386318312753057e-06, + "loss": 2.1176, + "mean_token_accuracy": 0.4758620738983154, + "step": 4355 + }, + { + "epoch": 0.004391438306838658, + "grad_norm": 71.74437770669098, + "learning_rate": 4.391354269484117e-06, + "loss": 2.1683, + "mean_token_accuracy": 0.4758620738983154, + "step": 4360 + }, + { + "epoch": 0.004396474359942831, + "grad_norm": 52.51628942437407, + "learning_rate": 4.396390226215177e-06, + "loss": 2.5393, + "mean_token_accuracy": 0.42413792610168455, + "step": 4365 + }, + { + "epoch": 0.004401510413047004, + "grad_norm": 70.59851980920405, + "learning_rate": 4.401426182946236e-06, + "loss": 2.13, + "mean_token_accuracy": 0.4620689690113068, + "step": 4370 + }, + { + "epoch": 0.004406546466151177, + "grad_norm": 61.16601154799414, + "learning_rate": 4.406462139677296e-06, + "loss": 2.1883, + "mean_token_accuracy": 0.4620689630508423, + "step": 4375 + }, + { + "epoch": 0.004411582519255349, + "grad_norm": 61.74240175224402, + "learning_rate": 4.411498096408355e-06, + "loss": 2.0564, + "mean_token_accuracy": 0.46551724076271056, + "step": 4380 + }, + { + "epoch": 0.0044166185723595215, + "grad_norm": 72.26463937268562, + "learning_rate": 4.4165340531394154e-06, + "loss": 2.2968, + "mean_token_accuracy": 0.4620689630508423, + "step": 4385 + }, + { + "epoch": 0.004421654625463694, + "grad_norm": 77.76039206164852, + "learning_rate": 4.4215700098704755e-06, + "loss": 2.2692, + "mean_token_accuracy": 0.458620685338974, + "step": 4390 + }, + { + "epoch": 0.004426690678567867, + "grad_norm": 56.58291496932423, + "learning_rate": 4.426605966601536e-06, + "loss": 2.5289, + "mean_token_accuracy": 0.4344827651977539, + "step": 4395 + }, + { + "epoch": 0.00443172673167204, + "grad_norm": 58.707053632135995, + "learning_rate": 4.431641923332595e-06, + "loss": 1.913, + "mean_token_accuracy": 0.49655172824859617, + "step": 4400 + }, + { + "epoch": 0.004436762784776213, + "grad_norm": 61.3813071039486, + "learning_rate": 4.436677880063655e-06, + "loss": 2.062, + "mean_token_accuracy": 0.5034482836723327, + "step": 4405 + }, + { + "epoch": 0.004441798837880386, + "grad_norm": 66.61686004967721, + "learning_rate": 4.441713836794714e-06, + "loss": 2.1011, + "mean_token_accuracy": 0.43103447556495667, + "step": 4410 + }, + { + "epoch": 0.004446834890984558, + "grad_norm": 59.98302955793069, + "learning_rate": 4.446749793525774e-06, + "loss": 2.3519, + "mean_token_accuracy": 0.41379310488700866, + "step": 4415 + }, + { + "epoch": 0.004451870944088731, + "grad_norm": 61.15411290722074, + "learning_rate": 4.451785750256834e-06, + "loss": 2.2279, + "mean_token_accuracy": 0.42413792610168455, + "step": 4420 + }, + { + "epoch": 0.004456906997192904, + "grad_norm": 51.142757781524864, + "learning_rate": 4.456821706987894e-06, + "loss": 2.2609, + "mean_token_accuracy": 0.44137930274009707, + "step": 4425 + }, + { + "epoch": 0.004461943050297077, + "grad_norm": 109.01904955739985, + "learning_rate": 4.461857663718953e-06, + "loss": 2.1417, + "mean_token_accuracy": 0.46896551847457885, + "step": 4430 + }, + { + "epoch": 0.00446697910340125, + "grad_norm": 48.9991020715633, + "learning_rate": 4.466893620450013e-06, + "loss": 2.3973, + "mean_token_accuracy": 0.44482759237289426, + "step": 4435 + }, + { + "epoch": 0.0044720151565054225, + "grad_norm": 101.26876555247364, + "learning_rate": 4.471929577181073e-06, + "loss": 2.3059, + "mean_token_accuracy": 0.47931033968925474, + "step": 4440 + }, + { + "epoch": 0.004477051209609595, + "grad_norm": 61.605800254304555, + "learning_rate": 4.476965533912133e-06, + "loss": 2.163, + "mean_token_accuracy": 0.5000000059604645, + "step": 4445 + }, + { + "epoch": 0.0044820872627137675, + "grad_norm": 73.55537222440582, + "learning_rate": 4.4820014906431925e-06, + "loss": 2.1569, + "mean_token_accuracy": 0.48275861144065857, + "step": 4450 + }, + { + "epoch": 0.00448712331581794, + "grad_norm": 102.54082419730184, + "learning_rate": 4.487037447374253e-06, + "loss": 2.4404, + "mean_token_accuracy": 0.46896551847457885, + "step": 4455 + }, + { + "epoch": 0.004492159368922113, + "grad_norm": 63.78217305810308, + "learning_rate": 4.492073404105312e-06, + "loss": 2.2506, + "mean_token_accuracy": 0.4586206912994385, + "step": 4460 + }, + { + "epoch": 0.004497195422026286, + "grad_norm": 78.11780626517847, + "learning_rate": 4.497109360836372e-06, + "loss": 2.348, + "mean_token_accuracy": 0.4434361755847931, + "step": 4465 + }, + { + "epoch": 0.004502231475130459, + "grad_norm": 52.63214825163616, + "learning_rate": 4.502145317567431e-06, + "loss": 2.261, + "mean_token_accuracy": 0.42758620977401735, + "step": 4470 + }, + { + "epoch": 0.004507267528234632, + "grad_norm": 76.67504507830839, + "learning_rate": 4.507181274298491e-06, + "loss": 2.3101, + "mean_token_accuracy": 0.45045371651649474, + "step": 4475 + }, + { + "epoch": 0.004512303581338805, + "grad_norm": 60.22942829695503, + "learning_rate": 4.5122172310295506e-06, + "loss": 1.9103, + "mean_token_accuracy": 0.482758617401123, + "step": 4480 + }, + { + "epoch": 0.004517339634442977, + "grad_norm": 82.42019024043736, + "learning_rate": 4.517253187760611e-06, + "loss": 2.1296, + "mean_token_accuracy": 0.49655171632766726, + "step": 4485 + }, + { + "epoch": 0.00452237568754715, + "grad_norm": 95.54582598762356, + "learning_rate": 4.522289144491671e-06, + "loss": 2.274, + "mean_token_accuracy": 0.4413793206214905, + "step": 4490 + }, + { + "epoch": 0.004527411740651323, + "grad_norm": 62.373890540238975, + "learning_rate": 4.527325101222731e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.42413793206214906, + "step": 4495 + }, + { + "epoch": 0.004532447793755496, + "grad_norm": 62.289917758139644, + "learning_rate": 4.532361057953791e-06, + "loss": 1.9978, + "mean_token_accuracy": 0.47931034564971925, + "step": 4500 + }, + { + "epoch": 0.0045374838468596685, + "grad_norm": 50.160683932339474, + "learning_rate": 4.53739701468485e-06, + "loss": 2.2359, + "mean_token_accuracy": 0.46896552443504336, + "step": 4505 + }, + { + "epoch": 0.004542519899963841, + "grad_norm": 65.88185342026964, + "learning_rate": 4.54243297141591e-06, + "loss": 2.1066, + "mean_token_accuracy": 0.4862068951129913, + "step": 4510 + }, + { + "epoch": 0.004547555953068014, + "grad_norm": 100.95707527287281, + "learning_rate": 4.5474689281469696e-06, + "loss": 2.1162, + "mean_token_accuracy": 0.47749546766281126, + "step": 4515 + }, + { + "epoch": 0.004552592006172186, + "grad_norm": 95.95146769243699, + "learning_rate": 4.55250488487803e-06, + "loss": 2.3188, + "mean_token_accuracy": 0.4482758641242981, + "step": 4520 + }, + { + "epoch": 0.004557628059276359, + "grad_norm": 95.50098311464909, + "learning_rate": 4.557540841609089e-06, + "loss": 2.4757, + "mean_token_accuracy": 0.4103448212146759, + "step": 4525 + }, + { + "epoch": 0.004562664112380532, + "grad_norm": 66.92387869361015, + "learning_rate": 4.562576798340149e-06, + "loss": 2.2637, + "mean_token_accuracy": 0.4448275864124298, + "step": 4530 + }, + { + "epoch": 0.004567700165484705, + "grad_norm": 74.14669347554786, + "learning_rate": 4.567612755071208e-06, + "loss": 2.3675, + "mean_token_accuracy": 0.4689655125141144, + "step": 4535 + }, + { + "epoch": 0.004572736218588878, + "grad_norm": 64.59498241691422, + "learning_rate": 4.572648711802268e-06, + "loss": 2.6723, + "mean_token_accuracy": 0.3896551728248596, + "step": 4540 + }, + { + "epoch": 0.004577772271693051, + "grad_norm": 69.61065714540345, + "learning_rate": 4.5776846685333285e-06, + "loss": 2.4983, + "mean_token_accuracy": 0.42758620381355283, + "step": 4545 + }, + { + "epoch": 0.004582808324797224, + "grad_norm": 58.47058691664711, + "learning_rate": 4.5827206252643886e-06, + "loss": 2.0005, + "mean_token_accuracy": 0.5517241418361664, + "step": 4550 + }, + { + "epoch": 0.004587844377901396, + "grad_norm": 56.11088926623105, + "learning_rate": 4.587756581995448e-06, + "loss": 1.9313, + "mean_token_accuracy": 0.46896551847457885, + "step": 4555 + }, + { + "epoch": 0.004592880431005569, + "grad_norm": 86.78285751193991, + "learning_rate": 4.592792538726508e-06, + "loss": 2.377, + "mean_token_accuracy": 0.4482758641242981, + "step": 4560 + }, + { + "epoch": 0.0045979164841097415, + "grad_norm": 79.69445898404734, + "learning_rate": 4.597828495457567e-06, + "loss": 1.9947, + "mean_token_accuracy": 0.5295220911502838, + "step": 4565 + }, + { + "epoch": 0.004602952537213914, + "grad_norm": 116.84324927014865, + "learning_rate": 4.602864452188627e-06, + "loss": 2.4357, + "mean_token_accuracy": 0.4068965494632721, + "step": 4570 + }, + { + "epoch": 0.004607988590318087, + "grad_norm": 52.49389608047635, + "learning_rate": 4.6079004089196865e-06, + "loss": 2.4641, + "mean_token_accuracy": 0.42758620381355283, + "step": 4575 + }, + { + "epoch": 0.00461302464342226, + "grad_norm": 62.34033980519452, + "learning_rate": 4.612936365650747e-06, + "loss": 2.3382, + "mean_token_accuracy": 0.4402298808097839, + "step": 4580 + }, + { + "epoch": 0.004618060696526433, + "grad_norm": 65.55807484979542, + "learning_rate": 4.617972322381806e-06, + "loss": 2.3517, + "mean_token_accuracy": 0.4448275864124298, + "step": 4585 + }, + { + "epoch": 0.004623096749630605, + "grad_norm": 61.214992349349046, + "learning_rate": 4.623008279112866e-06, + "loss": 2.0357, + "mean_token_accuracy": 0.4896551728248596, + "step": 4590 + }, + { + "epoch": 0.004628132802734778, + "grad_norm": 69.57912414030602, + "learning_rate": 4.628044235843926e-06, + "loss": 2.2602, + "mean_token_accuracy": 0.47931033968925474, + "step": 4595 + }, + { + "epoch": 0.004633168855838951, + "grad_norm": 44.03075011529503, + "learning_rate": 4.633080192574986e-06, + "loss": 2.228, + "mean_token_accuracy": 0.46551724672317507, + "step": 4600 + }, + { + "epoch": 0.004638204908943124, + "grad_norm": 58.011647404611644, + "learning_rate": 4.6381161493060454e-06, + "loss": 1.8955, + "mean_token_accuracy": 0.5068965435028077, + "step": 4605 + }, + { + "epoch": 0.004643240962047297, + "grad_norm": 71.89392724736372, + "learning_rate": 4.6431521060371055e-06, + "loss": 2.4062, + "mean_token_accuracy": 0.458620685338974, + "step": 4610 + }, + { + "epoch": 0.00464827701515147, + "grad_norm": 76.05217171654296, + "learning_rate": 4.648188062768165e-06, + "loss": 1.9633, + "mean_token_accuracy": 0.4862069010734558, + "step": 4615 + }, + { + "epoch": 0.0046533130682556425, + "grad_norm": 68.25925693843631, + "learning_rate": 4.653224019499225e-06, + "loss": 2.0736, + "mean_token_accuracy": 0.5034482717514038, + "step": 4620 + }, + { + "epoch": 0.0046583491213598146, + "grad_norm": 54.400617273650944, + "learning_rate": 4.658259976230284e-06, + "loss": 2.3649, + "mean_token_accuracy": 0.42413793206214906, + "step": 4625 + }, + { + "epoch": 0.0046633851744639875, + "grad_norm": 70.30471005276473, + "learning_rate": 4.663295932961344e-06, + "loss": 2.3179, + "mean_token_accuracy": 0.4517241358757019, + "step": 4630 + }, + { + "epoch": 0.00466842122756816, + "grad_norm": 87.76760979261556, + "learning_rate": 4.6683318896924035e-06, + "loss": 2.3422, + "mean_token_accuracy": 0.4620689690113068, + "step": 4635 + }, + { + "epoch": 0.004673457280672333, + "grad_norm": 73.91291213360861, + "learning_rate": 4.673367846423464e-06, + "loss": 2.2407, + "mean_token_accuracy": 0.4862068951129913, + "step": 4640 + }, + { + "epoch": 0.004678493333776506, + "grad_norm": 58.18290091036251, + "learning_rate": 4.678403803154524e-06, + "loss": 2.2202, + "mean_token_accuracy": 0.4620689690113068, + "step": 4645 + }, + { + "epoch": 0.004683529386880679, + "grad_norm": 48.085184516340156, + "learning_rate": 4.683439759885584e-06, + "loss": 2.0219, + "mean_token_accuracy": 0.47586206197738645, + "step": 4650 + }, + { + "epoch": 0.004688565439984852, + "grad_norm": 54.084923900833566, + "learning_rate": 4.688475716616643e-06, + "loss": 2.2256, + "mean_token_accuracy": 0.46551724672317507, + "step": 4655 + }, + { + "epoch": 0.004693601493089024, + "grad_norm": 67.46580634055775, + "learning_rate": 4.693511673347703e-06, + "loss": 2.3484, + "mean_token_accuracy": 0.4206896543502808, + "step": 4660 + }, + { + "epoch": 0.004698637546193197, + "grad_norm": 57.961361849711835, + "learning_rate": 4.698547630078762e-06, + "loss": 2.4713, + "mean_token_accuracy": 0.37586206793785093, + "step": 4665 + }, + { + "epoch": 0.00470367359929737, + "grad_norm": 63.790142384919214, + "learning_rate": 4.7035835868098225e-06, + "loss": 2.2818, + "mean_token_accuracy": 0.4620689630508423, + "step": 4670 + }, + { + "epoch": 0.004708709652401543, + "grad_norm": 65.9742494655388, + "learning_rate": 4.708619543540882e-06, + "loss": 2.1405, + "mean_token_accuracy": 0.5034482777118683, + "step": 4675 + }, + { + "epoch": 0.004713745705505716, + "grad_norm": 67.30241307973168, + "learning_rate": 4.713655500271942e-06, + "loss": 2.2616, + "mean_token_accuracy": 0.4344827592372894, + "step": 4680 + }, + { + "epoch": 0.0047187817586098885, + "grad_norm": 72.50106901007615, + "learning_rate": 4.718691457003001e-06, + "loss": 2.3447, + "mean_token_accuracy": 0.4620689690113068, + "step": 4685 + }, + { + "epoch": 0.004723817811714061, + "grad_norm": 56.62281379038907, + "learning_rate": 4.723727413734061e-06, + "loss": 2.352, + "mean_token_accuracy": 0.4206896543502808, + "step": 4690 + }, + { + "epoch": 0.004728853864818233, + "grad_norm": 79.2218996566941, + "learning_rate": 4.728763370465121e-06, + "loss": 2.1499, + "mean_token_accuracy": 0.4620689630508423, + "step": 4695 + }, + { + "epoch": 0.004733889917922406, + "grad_norm": 62.15045348248068, + "learning_rate": 4.733799327196181e-06, + "loss": 2.2403, + "mean_token_accuracy": 0.4275861978530884, + "step": 4700 + }, + { + "epoch": 0.004738925971026579, + "grad_norm": 57.76997918011777, + "learning_rate": 4.738835283927241e-06, + "loss": 2.3428, + "mean_token_accuracy": 0.4206896543502808, + "step": 4705 + }, + { + "epoch": 0.004743962024130752, + "grad_norm": 64.84623090385051, + "learning_rate": 4.743871240658301e-06, + "loss": 2.4429, + "mean_token_accuracy": 0.3896551787853241, + "step": 4710 + }, + { + "epoch": 0.004748998077234925, + "grad_norm": 96.81953623021111, + "learning_rate": 4.74890719738936e-06, + "loss": 2.3722, + "mean_token_accuracy": 0.4707198977470398, + "step": 4715 + }, + { + "epoch": 0.004754034130339098, + "grad_norm": 74.37955995411201, + "learning_rate": 4.75394315412042e-06, + "loss": 2.4442, + "mean_token_accuracy": 0.43448275327682495, + "step": 4720 + }, + { + "epoch": 0.004759070183443271, + "grad_norm": 51.35350572521384, + "learning_rate": 4.758979110851479e-06, + "loss": 2.3695, + "mean_token_accuracy": 0.42758620381355283, + "step": 4725 + }, + { + "epoch": 0.004764106236547443, + "grad_norm": 62.03456840429953, + "learning_rate": 4.7640150675825395e-06, + "loss": 2.1875, + "mean_token_accuracy": 0.4551724135875702, + "step": 4730 + }, + { + "epoch": 0.004769142289651616, + "grad_norm": 75.5145494584868, + "learning_rate": 4.769051024313599e-06, + "loss": 2.2886, + "mean_token_accuracy": 0.4620689630508423, + "step": 4735 + }, + { + "epoch": 0.004774178342755789, + "grad_norm": 85.27769518322883, + "learning_rate": 4.77408698104466e-06, + "loss": 2.4552, + "mean_token_accuracy": 0.41034482717514037, + "step": 4740 + }, + { + "epoch": 0.0047792143958599615, + "grad_norm": 55.441122368856306, + "learning_rate": 4.779122937775719e-06, + "loss": 2.498, + "mean_token_accuracy": 0.3758620649576187, + "step": 4745 + }, + { + "epoch": 0.004784250448964134, + "grad_norm": 72.08228569372173, + "learning_rate": 4.784158894506779e-06, + "loss": 2.2111, + "mean_token_accuracy": 0.4724137902259827, + "step": 4750 + }, + { + "epoch": 0.004789286502068307, + "grad_norm": 43.94582784000871, + "learning_rate": 4.789194851237838e-06, + "loss": 2.1538, + "mean_token_accuracy": 0.482758629322052, + "step": 4755 + }, + { + "epoch": 0.00479432255517248, + "grad_norm": 73.53525344219976, + "learning_rate": 4.794230807968898e-06, + "loss": 2.3359, + "mean_token_accuracy": 0.482758617401123, + "step": 4760 + }, + { + "epoch": 0.004799358608276652, + "grad_norm": 71.77174418250638, + "learning_rate": 4.799266764699958e-06, + "loss": 2.3293, + "mean_token_accuracy": 0.43793103098869324, + "step": 4765 + }, + { + "epoch": 0.004804394661380825, + "grad_norm": 70.76277547318473, + "learning_rate": 4.804302721431018e-06, + "loss": 2.091, + "mean_token_accuracy": 0.4724137902259827, + "step": 4770 + }, + { + "epoch": 0.004809430714484998, + "grad_norm": 63.171319060344324, + "learning_rate": 4.809338678162077e-06, + "loss": 2.1785, + "mean_token_accuracy": 0.4793103516101837, + "step": 4775 + }, + { + "epoch": 0.004814466767589171, + "grad_norm": 66.72407050626937, + "learning_rate": 4.814374634893137e-06, + "loss": 2.0943, + "mean_token_accuracy": 0.46896552443504336, + "step": 4780 + }, + { + "epoch": 0.004819502820693344, + "grad_norm": 60.33468915114281, + "learning_rate": 4.819410591624197e-06, + "loss": 1.8636, + "mean_token_accuracy": 0.5034482657909394, + "step": 4785 + }, + { + "epoch": 0.004824538873797517, + "grad_norm": 49.81477886862322, + "learning_rate": 4.824446548355257e-06, + "loss": 2.3342, + "mean_token_accuracy": 0.4517241358757019, + "step": 4790 + }, + { + "epoch": 0.00482957492690169, + "grad_norm": 104.55613995004154, + "learning_rate": 4.8294825050863165e-06, + "loss": 2.7633, + "mean_token_accuracy": 0.37586206793785093, + "step": 4795 + }, + { + "epoch": 0.004834610980005862, + "grad_norm": 88.80635937299884, + "learning_rate": 4.834518461817377e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.41724138855934145, + "step": 4800 + }, + { + "epoch": 0.0048396470331100346, + "grad_norm": 55.427107699392195, + "learning_rate": 4.839554418548436e-06, + "loss": 2.0271, + "mean_token_accuracy": 0.4724137902259827, + "step": 4805 + }, + { + "epoch": 0.0048446830862142075, + "grad_norm": 74.33928657449144, + "learning_rate": 4.844590375279496e-06, + "loss": 1.9994, + "mean_token_accuracy": 0.5228070139884948, + "step": 4810 + }, + { + "epoch": 0.00484971913931838, + "grad_norm": 52.11039364281213, + "learning_rate": 4.849626332010555e-06, + "loss": 1.9534, + "mean_token_accuracy": 0.5259528160095215, + "step": 4815 + }, + { + "epoch": 0.004854755192422553, + "grad_norm": 42.4150483663928, + "learning_rate": 4.854662288741615e-06, + "loss": 2.413, + "mean_token_accuracy": 0.4413793087005615, + "step": 4820 + }, + { + "epoch": 0.004859791245526726, + "grad_norm": 68.14359699517831, + "learning_rate": 4.859698245472675e-06, + "loss": 2.1299, + "mean_token_accuracy": 0.4862068951129913, + "step": 4825 + }, + { + "epoch": 0.004864827298630898, + "grad_norm": 55.80214183455283, + "learning_rate": 4.864734202203735e-06, + "loss": 2.3377, + "mean_token_accuracy": 0.4310344815254211, + "step": 4830 + }, + { + "epoch": 0.004869863351735071, + "grad_norm": 53.26715757398765, + "learning_rate": 4.869770158934795e-06, + "loss": 2.3427, + "mean_token_accuracy": 0.4241379380226135, + "step": 4835 + }, + { + "epoch": 0.004874899404839244, + "grad_norm": 67.82974998562977, + "learning_rate": 4.874806115665855e-06, + "loss": 1.937, + "mean_token_accuracy": 0.4793103516101837, + "step": 4840 + }, + { + "epoch": 0.004879935457943417, + "grad_norm": 58.27714151764549, + "learning_rate": 4.879842072396914e-06, + "loss": 2.2357, + "mean_token_accuracy": 0.47684728503227236, + "step": 4845 + }, + { + "epoch": 0.00488497151104759, + "grad_norm": 55.22575011739198, + "learning_rate": 4.884878029127974e-06, + "loss": 2.1676, + "mean_token_accuracy": 0.441379314661026, + "step": 4850 + }, + { + "epoch": 0.004890007564151763, + "grad_norm": 67.18177798528704, + "learning_rate": 4.8899139858590335e-06, + "loss": 2.0052, + "mean_token_accuracy": 0.5448275804519653, + "step": 4855 + }, + { + "epoch": 0.0048950436172559356, + "grad_norm": 64.27041597300736, + "learning_rate": 4.894949942590094e-06, + "loss": 2.5831, + "mean_token_accuracy": 0.3896551728248596, + "step": 4860 + }, + { + "epoch": 0.004900079670360108, + "grad_norm": 70.55649328261276, + "learning_rate": 4.899985899321153e-06, + "loss": 2.4933, + "mean_token_accuracy": 0.39655172228813174, + "step": 4865 + }, + { + "epoch": 0.0049051157234642805, + "grad_norm": 50.00552854251337, + "learning_rate": 4.905021856052213e-06, + "loss": 2.3004, + "mean_token_accuracy": 0.46551724076271056, + "step": 4870 + }, + { + "epoch": 0.004910151776568453, + "grad_norm": 47.03616536333132, + "learning_rate": 4.910057812783272e-06, + "loss": 2.019, + "mean_token_accuracy": 0.4655172348022461, + "step": 4875 + }, + { + "epoch": 0.004915187829672626, + "grad_norm": 59.92274711167819, + "learning_rate": 4.915093769514332e-06, + "loss": 2.1261, + "mean_token_accuracy": 0.4931034445762634, + "step": 4880 + }, + { + "epoch": 0.004920223882776799, + "grad_norm": 110.51028995828258, + "learning_rate": 4.920129726245392e-06, + "loss": 2.384, + "mean_token_accuracy": 0.420689657330513, + "step": 4885 + }, + { + "epoch": 0.004925259935880972, + "grad_norm": 42.23604726138878, + "learning_rate": 4.9251656829764525e-06, + "loss": 2.5604, + "mean_token_accuracy": 0.42413792610168455, + "step": 4890 + }, + { + "epoch": 0.004930295988985145, + "grad_norm": 52.02558027425352, + "learning_rate": 4.930201639707512e-06, + "loss": 2.1597, + "mean_token_accuracy": 0.471082878112793, + "step": 4895 + }, + { + "epoch": 0.004935332042089317, + "grad_norm": 62.29288889140503, + "learning_rate": 4.935237596438572e-06, + "loss": 2.5317, + "mean_token_accuracy": 0.4172413766384125, + "step": 4900 + }, + { + "epoch": 0.00494036809519349, + "grad_norm": 67.17619635608203, + "learning_rate": 4.940273553169631e-06, + "loss": 1.8321, + "mean_token_accuracy": 0.5275862038135528, + "step": 4905 + }, + { + "epoch": 0.004945404148297663, + "grad_norm": 57.84088141004681, + "learning_rate": 4.945309509900691e-06, + "loss": 2.0287, + "mean_token_accuracy": 0.49655172824859617, + "step": 4910 + }, + { + "epoch": 0.004950440201401836, + "grad_norm": 62.455860357304665, + "learning_rate": 4.950345466631751e-06, + "loss": 2.1387, + "mean_token_accuracy": 0.4482758641242981, + "step": 4915 + }, + { + "epoch": 0.004955476254506009, + "grad_norm": 52.760833668853046, + "learning_rate": 4.9553814233628105e-06, + "loss": 2.1012, + "mean_token_accuracy": 0.4675136089324951, + "step": 4920 + }, + { + "epoch": 0.0049605123076101815, + "grad_norm": 75.79250365458297, + "learning_rate": 4.960417380093871e-06, + "loss": 2.1276, + "mean_token_accuracy": 0.4862069010734558, + "step": 4925 + }, + { + "epoch": 0.004965548360714354, + "grad_norm": 66.2174262384771, + "learning_rate": 4.96545333682493e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.41724138259887694, + "step": 4930 + }, + { + "epoch": 0.0049705844138185264, + "grad_norm": 76.75236718541544, + "learning_rate": 4.97048929355599e-06, + "loss": 2.1555, + "mean_token_accuracy": 0.441379314661026, + "step": 4935 + }, + { + "epoch": 0.004975620466922699, + "grad_norm": 49.16557382693346, + "learning_rate": 4.97552525028705e-06, + "loss": 2.5746, + "mean_token_accuracy": 0.43793103098869324, + "step": 4940 + }, + { + "epoch": 0.004980656520026872, + "grad_norm": 70.40210340843936, + "learning_rate": 4.98056120701811e-06, + "loss": 2.2663, + "mean_token_accuracy": 0.45862067937850953, + "step": 4945 + }, + { + "epoch": 0.004985692573131045, + "grad_norm": 56.22664948209718, + "learning_rate": 4.9855971637491695e-06, + "loss": 2.3825, + "mean_token_accuracy": 0.4172413766384125, + "step": 4950 + }, + { + "epoch": 0.004990728626235218, + "grad_norm": 58.640772496492474, + "learning_rate": 4.9906331204802296e-06, + "loss": 2.372, + "mean_token_accuracy": 0.4275861978530884, + "step": 4955 + }, + { + "epoch": 0.004995764679339391, + "grad_norm": 52.597269837763356, + "learning_rate": 4.995669077211289e-06, + "loss": 2.0586, + "mean_token_accuracy": 0.482758617401123, + "step": 4960 + }, + { + "epoch": 0.005000800732443564, + "grad_norm": 61.54477380487586, + "learning_rate": 5.000705033942349e-06, + "loss": 2.2386, + "mean_token_accuracy": 0.5034482717514038, + "step": 4965 + }, + { + "epoch": 0.005005836785547736, + "grad_norm": 57.72962510637242, + "learning_rate": 5.005740990673408e-06, + "loss": 2.2071, + "mean_token_accuracy": 0.482758617401123, + "step": 4970 + }, + { + "epoch": 0.005010872838651909, + "grad_norm": 57.799889722372136, + "learning_rate": 5.010776947404468e-06, + "loss": 2.489, + "mean_token_accuracy": 0.41724138259887694, + "step": 4975 + }, + { + "epoch": 0.005015908891756082, + "grad_norm": 67.77003986715913, + "learning_rate": 5.0158129041355275e-06, + "loss": 2.371, + "mean_token_accuracy": 0.4448275864124298, + "step": 4980 + }, + { + "epoch": 0.0050209449448602545, + "grad_norm": 76.43776800322563, + "learning_rate": 5.020848860866588e-06, + "loss": 2.6248, + "mean_token_accuracy": 0.44482758045196535, + "step": 4985 + }, + { + "epoch": 0.0050259809979644274, + "grad_norm": 44.665451568261545, + "learning_rate": 5.025884817597648e-06, + "loss": 2.4031, + "mean_token_accuracy": 0.44827585816383364, + "step": 4990 + }, + { + "epoch": 0.0050310170510686, + "grad_norm": 60.92144952752837, + "learning_rate": 5.030920774328708e-06, + "loss": 2.2022, + "mean_token_accuracy": 0.42413793206214906, + "step": 4995 + }, + { + "epoch": 0.005036053104172773, + "grad_norm": 79.81800341358644, + "learning_rate": 5.035956731059767e-06, + "loss": 2.0991, + "mean_token_accuracy": 0.5021778583526612, + "step": 5000 + }, + { + "epoch": 0.005041089157276945, + "grad_norm": 57.74455605209466, + "learning_rate": 5.040992687790827e-06, + "loss": 2.2064, + "mean_token_accuracy": 0.4620689630508423, + "step": 5005 + }, + { + "epoch": 0.005046125210381118, + "grad_norm": 44.799091360443555, + "learning_rate": 5.046028644521886e-06, + "loss": 2.1773, + "mean_token_accuracy": 0.4413793087005615, + "step": 5010 + }, + { + "epoch": 0.005051161263485291, + "grad_norm": 46.792375177451206, + "learning_rate": 5.0510646012529465e-06, + "loss": 1.9952, + "mean_token_accuracy": 0.5137931108474731, + "step": 5015 + }, + { + "epoch": 0.005056197316589464, + "grad_norm": 52.24108330229672, + "learning_rate": 5.056100557984006e-06, + "loss": 2.3965, + "mean_token_accuracy": 0.40689654350280763, + "step": 5020 + }, + { + "epoch": 0.005061233369693637, + "grad_norm": 86.39586184224119, + "learning_rate": 5.061136514715066e-06, + "loss": 2.3281, + "mean_token_accuracy": 0.4344827651977539, + "step": 5025 + }, + { + "epoch": 0.00506626942279781, + "grad_norm": 48.203306834958575, + "learning_rate": 5.066172471446125e-06, + "loss": 2.1886, + "mean_token_accuracy": 0.458620685338974, + "step": 5030 + }, + { + "epoch": 0.005071305475901983, + "grad_norm": 57.17687520705525, + "learning_rate": 5.071208428177185e-06, + "loss": 2.3704, + "mean_token_accuracy": 0.44137930274009707, + "step": 5035 + }, + { + "epoch": 0.005076341529006155, + "grad_norm": 62.69867911410207, + "learning_rate": 5.076244384908245e-06, + "loss": 2.1292, + "mean_token_accuracy": 0.4724137902259827, + "step": 5040 + }, + { + "epoch": 0.005081377582110328, + "grad_norm": 45.02012801633202, + "learning_rate": 5.0812803416393054e-06, + "loss": 2.0253, + "mean_token_accuracy": 0.506896561384201, + "step": 5045 + }, + { + "epoch": 0.0050864136352145005, + "grad_norm": 60.94816448716288, + "learning_rate": 5.086316298370365e-06, + "loss": 2.218, + "mean_token_accuracy": 0.46061705350875853, + "step": 5050 + }, + { + "epoch": 0.005091449688318673, + "grad_norm": 77.26236876616662, + "learning_rate": 5.091352255101425e-06, + "loss": 2.2055, + "mean_token_accuracy": 0.4620689570903778, + "step": 5055 + }, + { + "epoch": 0.005096485741422846, + "grad_norm": 56.9512706879724, + "learning_rate": 5.096388211832484e-06, + "loss": 2.2843, + "mean_token_accuracy": 0.4448275864124298, + "step": 5060 + }, + { + "epoch": 0.005101521794527019, + "grad_norm": 59.63238208304837, + "learning_rate": 5.101424168563544e-06, + "loss": 2.3069, + "mean_token_accuracy": 0.4413793087005615, + "step": 5065 + }, + { + "epoch": 0.005106557847631192, + "grad_norm": 52.129514672046895, + "learning_rate": 5.106460125294603e-06, + "loss": 2.5455, + "mean_token_accuracy": 0.458620685338974, + "step": 5070 + }, + { + "epoch": 0.005111593900735364, + "grad_norm": 56.40752774241312, + "learning_rate": 5.1114960820256635e-06, + "loss": 2.003, + "mean_token_accuracy": 0.5275861978530884, + "step": 5075 + }, + { + "epoch": 0.005116629953839537, + "grad_norm": 62.155488816053406, + "learning_rate": 5.116532038756723e-06, + "loss": 1.8662, + "mean_token_accuracy": 0.5172413766384125, + "step": 5080 + }, + { + "epoch": 0.00512166600694371, + "grad_norm": 60.10773183923074, + "learning_rate": 5.121567995487783e-06, + "loss": 2.332, + "mean_token_accuracy": 0.441379314661026, + "step": 5085 + }, + { + "epoch": 0.005126702060047883, + "grad_norm": 45.36708723794407, + "learning_rate": 5.126603952218843e-06, + "loss": 2.0886, + "mean_token_accuracy": 0.4931034505367279, + "step": 5090 + }, + { + "epoch": 0.005131738113152056, + "grad_norm": 54.995604496817265, + "learning_rate": 5.131639908949903e-06, + "loss": 1.975, + "mean_token_accuracy": 0.482758617401123, + "step": 5095 + }, + { + "epoch": 0.005136774166256229, + "grad_norm": 86.00466369830168, + "learning_rate": 5.136675865680962e-06, + "loss": 2.2981, + "mean_token_accuracy": 0.4121597111225128, + "step": 5100 + }, + { + "epoch": 0.0051418102193604015, + "grad_norm": 52.472386114961644, + "learning_rate": 5.141711822412022e-06, + "loss": 2.0993, + "mean_token_accuracy": 0.4655172348022461, + "step": 5105 + }, + { + "epoch": 0.0051468462724645735, + "grad_norm": 40.59334492780365, + "learning_rate": 5.146747779143082e-06, + "loss": 2.2135, + "mean_token_accuracy": 0.43103447556495667, + "step": 5110 + }, + { + "epoch": 0.005151882325568746, + "grad_norm": 57.05439660541193, + "learning_rate": 5.151783735874142e-06, + "loss": 2.2692, + "mean_token_accuracy": 0.4762250483036041, + "step": 5115 + }, + { + "epoch": 0.005156918378672919, + "grad_norm": 62.17435367166306, + "learning_rate": 5.156819692605201e-06, + "loss": 2.2575, + "mean_token_accuracy": 0.46896551847457885, + "step": 5120 + }, + { + "epoch": 0.005161954431777092, + "grad_norm": 81.96978366140067, + "learning_rate": 5.161855649336261e-06, + "loss": 2.1175, + "mean_token_accuracy": 0.4517241299152374, + "step": 5125 + }, + { + "epoch": 0.005166990484881265, + "grad_norm": 56.6224519743192, + "learning_rate": 5.16689160606732e-06, + "loss": 1.9847, + "mean_token_accuracy": 0.5368421018123627, + "step": 5130 + }, + { + "epoch": 0.005172026537985438, + "grad_norm": 54.49737081118318, + "learning_rate": 5.1719275627983804e-06, + "loss": 2.1467, + "mean_token_accuracy": 0.48620688915252686, + "step": 5135 + }, + { + "epoch": 0.005177062591089611, + "grad_norm": 113.21496006124241, + "learning_rate": 5.1769635195294405e-06, + "loss": 2.5677, + "mean_token_accuracy": 0.4310344696044922, + "step": 5140 + }, + { + "epoch": 0.005182098644193783, + "grad_norm": 65.34478537520857, + "learning_rate": 5.181999476260501e-06, + "loss": 2.3153, + "mean_token_accuracy": 0.4551724076271057, + "step": 5145 + }, + { + "epoch": 0.005187134697297956, + "grad_norm": 63.53608539077253, + "learning_rate": 5.18703543299156e-06, + "loss": 2.6021, + "mean_token_accuracy": 0.42068966031074523, + "step": 5150 + }, + { + "epoch": 0.005192170750402129, + "grad_norm": 62.76026244838133, + "learning_rate": 5.19207138972262e-06, + "loss": 2.4379, + "mean_token_accuracy": 0.42413793206214906, + "step": 5155 + }, + { + "epoch": 0.005197206803506302, + "grad_norm": 72.67055814492477, + "learning_rate": 5.197107346453679e-06, + "loss": 1.8438, + "mean_token_accuracy": 0.5551724135875702, + "step": 5160 + }, + { + "epoch": 0.0052022428566104745, + "grad_norm": 71.07748091316792, + "learning_rate": 5.202143303184739e-06, + "loss": 2.2333, + "mean_token_accuracy": 0.43448275327682495, + "step": 5165 + }, + { + "epoch": 0.0052072789097146474, + "grad_norm": 60.47556272487013, + "learning_rate": 5.207179259915799e-06, + "loss": 2.3338, + "mean_token_accuracy": 0.4206896543502808, + "step": 5170 + }, + { + "epoch": 0.00521231496281882, + "grad_norm": 77.68342587456141, + "learning_rate": 5.212215216646859e-06, + "loss": 2.1392, + "mean_token_accuracy": 0.45517241954803467, + "step": 5175 + }, + { + "epoch": 0.005217351015922992, + "grad_norm": 76.44543284207457, + "learning_rate": 5.217251173377918e-06, + "loss": 2.1785, + "mean_token_accuracy": 0.45172414779663084, + "step": 5180 + }, + { + "epoch": 0.005222387069027165, + "grad_norm": 61.890828680579716, + "learning_rate": 5.222287130108978e-06, + "loss": 1.9292, + "mean_token_accuracy": 0.5551724135875702, + "step": 5185 + }, + { + "epoch": 0.005227423122131338, + "grad_norm": 48.671387299657646, + "learning_rate": 5.227323086840038e-06, + "loss": 2.0347, + "mean_token_accuracy": 0.4689655125141144, + "step": 5190 + }, + { + "epoch": 0.005232459175235511, + "grad_norm": 61.79449331491543, + "learning_rate": 5.232359043571098e-06, + "loss": 2.4038, + "mean_token_accuracy": 0.42758620977401735, + "step": 5195 + }, + { + "epoch": 0.005237495228339684, + "grad_norm": 59.436565152531095, + "learning_rate": 5.2373950003021575e-06, + "loss": 2.5369, + "mean_token_accuracy": 0.41724138259887694, + "step": 5200 + }, + { + "epoch": 0.005242531281443857, + "grad_norm": 60.45275169688569, + "learning_rate": 5.242430957033218e-06, + "loss": 2.4996, + "mean_token_accuracy": 0.4482758641242981, + "step": 5205 + }, + { + "epoch": 0.00524756733454803, + "grad_norm": 66.91125060646078, + "learning_rate": 5.247466913764277e-06, + "loss": 2.3523, + "mean_token_accuracy": 0.4310344815254211, + "step": 5210 + }, + { + "epoch": 0.005252603387652202, + "grad_norm": 51.212774272790426, + "learning_rate": 5.252502870495337e-06, + "loss": 2.0909, + "mean_token_accuracy": 0.4586206912994385, + "step": 5215 + }, + { + "epoch": 0.005257639440756375, + "grad_norm": 94.47941921851192, + "learning_rate": 5.257538827226396e-06, + "loss": 2.3875, + "mean_token_accuracy": 0.46551724672317507, + "step": 5220 + }, + { + "epoch": 0.005262675493860548, + "grad_norm": 47.80111854043782, + "learning_rate": 5.262574783957456e-06, + "loss": 2.1418, + "mean_token_accuracy": 0.47586206793785096, + "step": 5225 + }, + { + "epoch": 0.0052677115469647205, + "grad_norm": 48.403905631961834, + "learning_rate": 5.2676107406885156e-06, + "loss": 2.2353, + "mean_token_accuracy": 0.49999998807907103, + "step": 5230 + }, + { + "epoch": 0.005272747600068893, + "grad_norm": 47.84826563045979, + "learning_rate": 5.272646697419576e-06, + "loss": 2.1832, + "mean_token_accuracy": 0.4586206912994385, + "step": 5235 + }, + { + "epoch": 0.005277783653173066, + "grad_norm": 53.5025591865723, + "learning_rate": 5.277682654150636e-06, + "loss": 2.226, + "mean_token_accuracy": 0.4379310369491577, + "step": 5240 + }, + { + "epoch": 0.005282819706277239, + "grad_norm": 68.00684103158602, + "learning_rate": 5.282718610881696e-06, + "loss": 2.6529, + "mean_token_accuracy": 0.41379310488700866, + "step": 5245 + }, + { + "epoch": 0.005287855759381411, + "grad_norm": 54.61815522016781, + "learning_rate": 5.287754567612755e-06, + "loss": 2.4593, + "mean_token_accuracy": 0.42758620977401735, + "step": 5250 + }, + { + "epoch": 0.005292891812485584, + "grad_norm": 55.75107179089296, + "learning_rate": 5.292790524343815e-06, + "loss": 2.1424, + "mean_token_accuracy": 0.45517240166664125, + "step": 5255 + }, + { + "epoch": 0.005297927865589757, + "grad_norm": 44.33943991407288, + "learning_rate": 5.2978264810748745e-06, + "loss": 2.2922, + "mean_token_accuracy": 0.4551724076271057, + "step": 5260 + }, + { + "epoch": 0.00530296391869393, + "grad_norm": 67.01123815884502, + "learning_rate": 5.3028624378059346e-06, + "loss": 2.6027, + "mean_token_accuracy": 0.42758620977401735, + "step": 5265 + }, + { + "epoch": 0.005307999971798103, + "grad_norm": 70.88445439672337, + "learning_rate": 5.307898394536994e-06, + "loss": 2.3783, + "mean_token_accuracy": 0.41379310488700866, + "step": 5270 + }, + { + "epoch": 0.005313036024902276, + "grad_norm": 59.1625159017375, + "learning_rate": 5.312934351268054e-06, + "loss": 1.8583, + "mean_token_accuracy": 0.5068965554237366, + "step": 5275 + }, + { + "epoch": 0.005318072078006448, + "grad_norm": 76.45157150282814, + "learning_rate": 5.317970307999114e-06, + "loss": 2.2655, + "mean_token_accuracy": 0.4517241358757019, + "step": 5280 + }, + { + "epoch": 0.005323108131110621, + "grad_norm": 55.597350436717164, + "learning_rate": 5.323006264730174e-06, + "loss": 2.1003, + "mean_token_accuracy": 0.4482758641242981, + "step": 5285 + }, + { + "epoch": 0.0053281441842147935, + "grad_norm": 60.360247882929805, + "learning_rate": 5.328042221461233e-06, + "loss": 2.2907, + "mean_token_accuracy": 0.40344826579093934, + "step": 5290 + }, + { + "epoch": 0.005333180237318966, + "grad_norm": 53.76963053135653, + "learning_rate": 5.3330781781922935e-06, + "loss": 2.2519, + "mean_token_accuracy": 0.5, + "step": 5295 + }, + { + "epoch": 0.005338216290423139, + "grad_norm": 43.79342476087142, + "learning_rate": 5.338114134923353e-06, + "loss": 2.0367, + "mean_token_accuracy": 0.4793103516101837, + "step": 5300 + }, + { + "epoch": 0.005343252343527312, + "grad_norm": 65.07224569261078, + "learning_rate": 5.343150091654413e-06, + "loss": 2.6087, + "mean_token_accuracy": 0.4344827651977539, + "step": 5305 + }, + { + "epoch": 0.005348288396631485, + "grad_norm": 60.182082038884175, + "learning_rate": 5.348186048385472e-06, + "loss": 2.5583, + "mean_token_accuracy": 0.4517241418361664, + "step": 5310 + }, + { + "epoch": 0.005353324449735657, + "grad_norm": 63.020502953937104, + "learning_rate": 5.353222005116532e-06, + "loss": 2.1694, + "mean_token_accuracy": 0.4586206912994385, + "step": 5315 + }, + { + "epoch": 0.00535836050283983, + "grad_norm": 41.8160444016396, + "learning_rate": 5.358257961847592e-06, + "loss": 2.2148, + "mean_token_accuracy": 0.4551724135875702, + "step": 5320 + }, + { + "epoch": 0.005363396555944003, + "grad_norm": 83.60877480562837, + "learning_rate": 5.3632939185786515e-06, + "loss": 2.2031, + "mean_token_accuracy": 0.44827587008476255, + "step": 5325 + }, + { + "epoch": 0.005368432609048176, + "grad_norm": 57.71490915260869, + "learning_rate": 5.368329875309712e-06, + "loss": 2.2611, + "mean_token_accuracy": 0.4655172348022461, + "step": 5330 + }, + { + "epoch": 0.005373468662152349, + "grad_norm": 76.19323528123365, + "learning_rate": 5.373365832040772e-06, + "loss": 2.387, + "mean_token_accuracy": 0.4482758641242981, + "step": 5335 + }, + { + "epoch": 0.005378504715256522, + "grad_norm": 64.06808216803532, + "learning_rate": 5.378401788771832e-06, + "loss": 2.4848, + "mean_token_accuracy": 0.4344827651977539, + "step": 5340 + }, + { + "epoch": 0.0053835407683606945, + "grad_norm": 55.9399912469596, + "learning_rate": 5.383437745502891e-06, + "loss": 2.1121, + "mean_token_accuracy": 0.5006049573421478, + "step": 5345 + }, + { + "epoch": 0.005388576821464867, + "grad_norm": 113.56564731295282, + "learning_rate": 5.388473702233951e-06, + "loss": 2.0349, + "mean_token_accuracy": 0.4947044312953949, + "step": 5350 + }, + { + "epoch": 0.0053936128745690395, + "grad_norm": 73.21276716942864, + "learning_rate": 5.3935096589650104e-06, + "loss": 1.9133, + "mean_token_accuracy": 0.49709620475769045, + "step": 5355 + }, + { + "epoch": 0.005398648927673212, + "grad_norm": 53.70820082358731, + "learning_rate": 5.3985456156960705e-06, + "loss": 2.2817, + "mean_token_accuracy": 0.4083484590053558, + "step": 5360 + }, + { + "epoch": 0.005403684980777385, + "grad_norm": 53.549812595066996, + "learning_rate": 5.40358157242713e-06, + "loss": 2.15, + "mean_token_accuracy": 0.4620689570903778, + "step": 5365 + }, + { + "epoch": 0.005408721033881558, + "grad_norm": 44.51704364714626, + "learning_rate": 5.40861752915819e-06, + "loss": 2.0451, + "mean_token_accuracy": 0.47931033968925474, + "step": 5370 + }, + { + "epoch": 0.005413757086985731, + "grad_norm": 92.17141296261514, + "learning_rate": 5.413653485889249e-06, + "loss": 2.1481, + "mean_token_accuracy": 0.4517241358757019, + "step": 5375 + }, + { + "epoch": 0.005418793140089904, + "grad_norm": 76.61343739248683, + "learning_rate": 5.418689442620309e-06, + "loss": 2.2007, + "mean_token_accuracy": 0.4862069010734558, + "step": 5380 + }, + { + "epoch": 0.005423829193194076, + "grad_norm": 93.55658908837326, + "learning_rate": 5.423725399351369e-06, + "loss": 2.5759, + "mean_token_accuracy": 0.4068965554237366, + "step": 5385 + }, + { + "epoch": 0.005428865246298249, + "grad_norm": 47.029094729218436, + "learning_rate": 5.4287613560824295e-06, + "loss": 1.9805, + "mean_token_accuracy": 0.5206896483898162, + "step": 5390 + }, + { + "epoch": 0.005433901299402422, + "grad_norm": 83.2217552654011, + "learning_rate": 5.433797312813489e-06, + "loss": 2.3926, + "mean_token_accuracy": 0.4896551728248596, + "step": 5395 + }, + { + "epoch": 0.005438937352506595, + "grad_norm": 70.63687969253428, + "learning_rate": 5.438833269544549e-06, + "loss": 2.0463, + "mean_token_accuracy": 0.46206897497177124, + "step": 5400 + }, + { + "epoch": 0.005443973405610768, + "grad_norm": 65.12417167724513, + "learning_rate": 5.443869226275608e-06, + "loss": 2.1981, + "mean_token_accuracy": 0.5206896543502808, + "step": 5405 + }, + { + "epoch": 0.0054490094587149405, + "grad_norm": 69.81079100440061, + "learning_rate": 5.448905183006668e-06, + "loss": 2.3623, + "mean_token_accuracy": 0.4068965554237366, + "step": 5410 + }, + { + "epoch": 0.005454045511819113, + "grad_norm": 63.16387639721739, + "learning_rate": 5.453941139737727e-06, + "loss": 2.3249, + "mean_token_accuracy": 0.4379310429096222, + "step": 5415 + }, + { + "epoch": 0.005459081564923285, + "grad_norm": 64.81181393568374, + "learning_rate": 5.4589770964687875e-06, + "loss": 2.6455, + "mean_token_accuracy": 0.3965517282485962, + "step": 5420 + }, + { + "epoch": 0.005464117618027458, + "grad_norm": 44.52213605967673, + "learning_rate": 5.464013053199847e-06, + "loss": 2.2336, + "mean_token_accuracy": 0.4620689630508423, + "step": 5425 + }, + { + "epoch": 0.005469153671131631, + "grad_norm": 58.8393114707555, + "learning_rate": 5.469049009930907e-06, + "loss": 2.2412, + "mean_token_accuracy": 0.4431337058544159, + "step": 5430 + }, + { + "epoch": 0.005474189724235804, + "grad_norm": 50.30879302436615, + "learning_rate": 5.474084966661967e-06, + "loss": 2.2877, + "mean_token_accuracy": 0.46551724076271056, + "step": 5435 + }, + { + "epoch": 0.005479225777339977, + "grad_norm": 70.76102328863462, + "learning_rate": 5.479120923393027e-06, + "loss": 2.1271, + "mean_token_accuracy": 0.4620689690113068, + "step": 5440 + }, + { + "epoch": 0.00548426183044415, + "grad_norm": 62.62060847967712, + "learning_rate": 5.484156880124086e-06, + "loss": 2.4177, + "mean_token_accuracy": 0.44827585816383364, + "step": 5445 + }, + { + "epoch": 0.005489297883548323, + "grad_norm": 55.38496834228082, + "learning_rate": 5.489192836855146e-06, + "loss": 2.1094, + "mean_token_accuracy": 0.4620689630508423, + "step": 5450 + }, + { + "epoch": 0.005494333936652495, + "grad_norm": 56.23226674735334, + "learning_rate": 5.494228793586206e-06, + "loss": 2.4609, + "mean_token_accuracy": 0.4310344815254211, + "step": 5455 + }, + { + "epoch": 0.005499369989756668, + "grad_norm": 54.99112966327625, + "learning_rate": 5.499264750317266e-06, + "loss": 2.1683, + "mean_token_accuracy": 0.4896551787853241, + "step": 5460 + }, + { + "epoch": 0.005504406042860841, + "grad_norm": 64.42595924612726, + "learning_rate": 5.504300707048325e-06, + "loss": 2.1856, + "mean_token_accuracy": 0.47586206197738645, + "step": 5465 + }, + { + "epoch": 0.0055094420959650135, + "grad_norm": 59.95598926419033, + "learning_rate": 5.509336663779385e-06, + "loss": 2.2492, + "mean_token_accuracy": 0.4862068951129913, + "step": 5470 + }, + { + "epoch": 0.005514478149069186, + "grad_norm": 44.0925278011909, + "learning_rate": 5.514372620510444e-06, + "loss": 2.2783, + "mean_token_accuracy": 0.4517241358757019, + "step": 5475 + }, + { + "epoch": 0.005519514202173359, + "grad_norm": 64.05006231377183, + "learning_rate": 5.5194085772415045e-06, + "loss": 2.354, + "mean_token_accuracy": 0.43793103098869324, + "step": 5480 + }, + { + "epoch": 0.005524550255277532, + "grad_norm": 52.52561344609644, + "learning_rate": 5.5244445339725646e-06, + "loss": 2.0963, + "mean_token_accuracy": 0.4620689690113068, + "step": 5485 + }, + { + "epoch": 0.005529586308381704, + "grad_norm": 42.114320567333806, + "learning_rate": 5.529480490703625e-06, + "loss": 2.1934, + "mean_token_accuracy": 0.44827585816383364, + "step": 5490 + }, + { + "epoch": 0.005534622361485877, + "grad_norm": 57.01095671829796, + "learning_rate": 5.534516447434684e-06, + "loss": 2.4048, + "mean_token_accuracy": 0.4, + "step": 5495 + }, + { + "epoch": 0.00553965841459005, + "grad_norm": 49.00131848788912, + "learning_rate": 5.539552404165744e-06, + "loss": 2.3366, + "mean_token_accuracy": 0.45027223229408264, + "step": 5500 + }, + { + "epoch": 0.005544694467694223, + "grad_norm": 60.36814182264058, + "learning_rate": 5.544588360896803e-06, + "loss": 2.1626, + "mean_token_accuracy": 0.4862068951129913, + "step": 5505 + }, + { + "epoch": 0.005549730520798396, + "grad_norm": 65.08074346488905, + "learning_rate": 5.549624317627863e-06, + "loss": 2.2481, + "mean_token_accuracy": 0.42413792610168455, + "step": 5510 + }, + { + "epoch": 0.005554766573902569, + "grad_norm": 54.70828816858123, + "learning_rate": 5.554660274358923e-06, + "loss": 2.2637, + "mean_token_accuracy": 0.4448275864124298, + "step": 5515 + }, + { + "epoch": 0.005559802627006742, + "grad_norm": 63.353929079694254, + "learning_rate": 5.559696231089983e-06, + "loss": 2.2661, + "mean_token_accuracy": 0.4896551728248596, + "step": 5520 + }, + { + "epoch": 0.005564838680110914, + "grad_norm": 51.32308740958065, + "learning_rate": 5.564732187821042e-06, + "loss": 2.227, + "mean_token_accuracy": 0.4310344815254211, + "step": 5525 + }, + { + "epoch": 0.005569874733215087, + "grad_norm": 46.60665809118542, + "learning_rate": 5.569768144552102e-06, + "loss": 2.3502, + "mean_token_accuracy": 0.4517241418361664, + "step": 5530 + }, + { + "epoch": 0.0055749107863192595, + "grad_norm": 67.73235174555208, + "learning_rate": 5.574804101283162e-06, + "loss": 2.5164, + "mean_token_accuracy": 0.41034482717514037, + "step": 5535 + }, + { + "epoch": 0.005579946839423432, + "grad_norm": 52.24925622094272, + "learning_rate": 5.579840058014222e-06, + "loss": 2.2935, + "mean_token_accuracy": 0.4689655125141144, + "step": 5540 + }, + { + "epoch": 0.005584982892527605, + "grad_norm": 75.32023535126956, + "learning_rate": 5.5848760147452815e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.44827585816383364, + "step": 5545 + }, + { + "epoch": 0.005590018945631778, + "grad_norm": 66.80441385795447, + "learning_rate": 5.589911971476342e-06, + "loss": 2.1181, + "mean_token_accuracy": 0.48620688915252686, + "step": 5550 + }, + { + "epoch": 0.005595054998735951, + "grad_norm": 59.94957939020645, + "learning_rate": 5.594947928207401e-06, + "loss": 2.4706, + "mean_token_accuracy": 0.41379311084747317, + "step": 5555 + }, + { + "epoch": 0.005600091051840123, + "grad_norm": 50.08012602349299, + "learning_rate": 5.599983884938461e-06, + "loss": 2.0734, + "mean_token_accuracy": 0.4896551787853241, + "step": 5560 + }, + { + "epoch": 0.005605127104944296, + "grad_norm": 65.919910906976, + "learning_rate": 5.60501984166952e-06, + "loss": 2.5222, + "mean_token_accuracy": 0.4215517222881317, + "step": 5565 + }, + { + "epoch": 0.005610163158048469, + "grad_norm": 60.10771322488036, + "learning_rate": 5.61005579840058e-06, + "loss": 2.3789, + "mean_token_accuracy": 0.4344827651977539, + "step": 5570 + }, + { + "epoch": 0.005615199211152642, + "grad_norm": 54.08231618732255, + "learning_rate": 5.61509175513164e-06, + "loss": 2.2147, + "mean_token_accuracy": 0.47586206793785096, + "step": 5575 + }, + { + "epoch": 0.005620235264256815, + "grad_norm": 62.05875026888758, + "learning_rate": 5.6201277118627e-06, + "loss": 2.0597, + "mean_token_accuracy": 0.5034482777118683, + "step": 5580 + }, + { + "epoch": 0.005625271317360988, + "grad_norm": 49.492586784714035, + "learning_rate": 5.62516366859376e-06, + "loss": 2.1203, + "mean_token_accuracy": 0.46551724076271056, + "step": 5585 + }, + { + "epoch": 0.0056303073704651605, + "grad_norm": 58.72449283191179, + "learning_rate": 5.63019962532482e-06, + "loss": 2.291, + "mean_token_accuracy": 0.4206896543502808, + "step": 5590 + }, + { + "epoch": 0.0056353434235693325, + "grad_norm": 55.98856238334883, + "learning_rate": 5.635235582055879e-06, + "loss": 2.3986, + "mean_token_accuracy": 0.42758620381355283, + "step": 5595 + }, + { + "epoch": 0.005640379476673505, + "grad_norm": 74.18381080201779, + "learning_rate": 5.640271538786939e-06, + "loss": 2.247, + "mean_token_accuracy": 0.42413793206214906, + "step": 5600 + }, + { + "epoch": 0.005645415529777678, + "grad_norm": 53.5420684785195, + "learning_rate": 5.6453074955179985e-06, + "loss": 2.5564, + "mean_token_accuracy": 0.4137931078672409, + "step": 5605 + }, + { + "epoch": 0.005650451582881851, + "grad_norm": 56.02088526313848, + "learning_rate": 5.650343452249059e-06, + "loss": 2.4848, + "mean_token_accuracy": 0.4103448331356049, + "step": 5610 + }, + { + "epoch": 0.005655487635986024, + "grad_norm": 49.00342280378441, + "learning_rate": 5.655379408980118e-06, + "loss": 2.4967, + "mean_token_accuracy": 0.42413792610168455, + "step": 5615 + }, + { + "epoch": 0.005660523689090197, + "grad_norm": 44.88229759761831, + "learning_rate": 5.660415365711178e-06, + "loss": 2.056, + "mean_token_accuracy": 0.5000000059604645, + "step": 5620 + }, + { + "epoch": 0.00566555974219437, + "grad_norm": 70.69456880129235, + "learning_rate": 5.665451322442237e-06, + "loss": 2.4881, + "mean_token_accuracy": 0.4047791838645935, + "step": 5625 + }, + { + "epoch": 0.005670595795298542, + "grad_norm": 59.18438327572698, + "learning_rate": 5.670487279173297e-06, + "loss": 2.48, + "mean_token_accuracy": 0.4689655125141144, + "step": 5630 + }, + { + "epoch": 0.005675631848402715, + "grad_norm": 40.75703500824426, + "learning_rate": 5.675523235904357e-06, + "loss": 2.1524, + "mean_token_accuracy": 0.48756158351898193, + "step": 5635 + }, + { + "epoch": 0.005680667901506888, + "grad_norm": 50.08284194333192, + "learning_rate": 5.6805591926354175e-06, + "loss": 2.1987, + "mean_token_accuracy": 0.42758620977401735, + "step": 5640 + }, + { + "epoch": 0.005685703954611061, + "grad_norm": 56.36385170838698, + "learning_rate": 5.685595149366477e-06, + "loss": 2.2666, + "mean_token_accuracy": 0.44827585816383364, + "step": 5645 + }, + { + "epoch": 0.0056907400077152335, + "grad_norm": 51.950733914048236, + "learning_rate": 5.690631106097537e-06, + "loss": 2.1852, + "mean_token_accuracy": 0.43103447556495667, + "step": 5650 + }, + { + "epoch": 0.005695776060819406, + "grad_norm": 60.96397142529166, + "learning_rate": 5.695667062828596e-06, + "loss": 2.3226, + "mean_token_accuracy": 0.4103448212146759, + "step": 5655 + }, + { + "epoch": 0.005700812113923579, + "grad_norm": 79.61105429853812, + "learning_rate": 5.700703019559656e-06, + "loss": 2.1115, + "mean_token_accuracy": 0.47586206197738645, + "step": 5660 + }, + { + "epoch": 0.005705848167027751, + "grad_norm": 75.72032938501376, + "learning_rate": 5.7057389762907155e-06, + "loss": 2.1379, + "mean_token_accuracy": 0.5103448331356049, + "step": 5665 + }, + { + "epoch": 0.005710884220131924, + "grad_norm": 59.13252588476254, + "learning_rate": 5.7107749330217756e-06, + "loss": 2.2078, + "mean_token_accuracy": 0.49655172824859617, + "step": 5670 + }, + { + "epoch": 0.005715920273236097, + "grad_norm": 68.9960825391538, + "learning_rate": 5.715810889752835e-06, + "loss": 2.4626, + "mean_token_accuracy": 0.4551724076271057, + "step": 5675 + }, + { + "epoch": 0.00572095632634027, + "grad_norm": 47.17846463373798, + "learning_rate": 5.720846846483895e-06, + "loss": 2.1256, + "mean_token_accuracy": 0.458620685338974, + "step": 5680 + }, + { + "epoch": 0.005725992379444443, + "grad_norm": 54.73905473454097, + "learning_rate": 5.725882803214955e-06, + "loss": 2.2757, + "mean_token_accuracy": 0.4517241299152374, + "step": 5685 + }, + { + "epoch": 0.005731028432548616, + "grad_norm": 63.42248559250619, + "learning_rate": 5.730918759946015e-06, + "loss": 2.2552, + "mean_token_accuracy": 0.42758620381355283, + "step": 5690 + }, + { + "epoch": 0.005736064485652789, + "grad_norm": 42.5949749046304, + "learning_rate": 5.735954716677074e-06, + "loss": 2.1937, + "mean_token_accuracy": 0.4551724135875702, + "step": 5695 + }, + { + "epoch": 0.005741100538756961, + "grad_norm": 69.13309493901087, + "learning_rate": 5.7409906734081345e-06, + "loss": 2.3679, + "mean_token_accuracy": 0.4379310429096222, + "step": 5700 + }, + { + "epoch": 0.005746136591861134, + "grad_norm": 59.63654313937762, + "learning_rate": 5.746026630139194e-06, + "loss": 2.1821, + "mean_token_accuracy": 0.44301270246505736, + "step": 5705 + }, + { + "epoch": 0.0057511726449653066, + "grad_norm": 61.580943844379135, + "learning_rate": 5.751062586870254e-06, + "loss": 1.983, + "mean_token_accuracy": 0.4965517222881317, + "step": 5710 + }, + { + "epoch": 0.0057562086980694795, + "grad_norm": 63.30812303070324, + "learning_rate": 5.756098543601313e-06, + "loss": 2.2954, + "mean_token_accuracy": 0.47586207985877993, + "step": 5715 + }, + { + "epoch": 0.005761244751173652, + "grad_norm": 62.340322181989, + "learning_rate": 5.761134500332373e-06, + "loss": 2.14, + "mean_token_accuracy": 0.5, + "step": 5720 + }, + { + "epoch": 0.005766280804277825, + "grad_norm": 59.57448420965543, + "learning_rate": 5.766170457063433e-06, + "loss": 2.531, + "mean_token_accuracy": 0.3965517282485962, + "step": 5725 + }, + { + "epoch": 0.005771316857381997, + "grad_norm": 93.63703696832067, + "learning_rate": 5.7712064137944925e-06, + "loss": 2.4451, + "mean_token_accuracy": 0.4379310369491577, + "step": 5730 + }, + { + "epoch": 0.00577635291048617, + "grad_norm": 66.08940271682413, + "learning_rate": 5.7762423705255535e-06, + "loss": 2.3376, + "mean_token_accuracy": 0.4, + "step": 5735 + }, + { + "epoch": 0.005781388963590343, + "grad_norm": 45.83839722180335, + "learning_rate": 5.781278327256613e-06, + "loss": 2.1261, + "mean_token_accuracy": 0.49655171632766726, + "step": 5740 + }, + { + "epoch": 0.005786425016694516, + "grad_norm": 52.48591290460214, + "learning_rate": 5.786314283987673e-06, + "loss": 2.3372, + "mean_token_accuracy": 0.45517241954803467, + "step": 5745 + }, + { + "epoch": 0.005791461069798689, + "grad_norm": 60.91483557097881, + "learning_rate": 5.791350240718732e-06, + "loss": 2.3398, + "mean_token_accuracy": 0.45172412395477296, + "step": 5750 + }, + { + "epoch": 0.005796497122902862, + "grad_norm": 70.67164561781587, + "learning_rate": 5.796386197449792e-06, + "loss": 2.085, + "mean_token_accuracy": 0.49655172824859617, + "step": 5755 + }, + { + "epoch": 0.005801533176007035, + "grad_norm": 50.44081672382117, + "learning_rate": 5.8014221541808514e-06, + "loss": 2.1841, + "mean_token_accuracy": 0.4517241299152374, + "step": 5760 + }, + { + "epoch": 0.005806569229111207, + "grad_norm": 67.56015597622336, + "learning_rate": 5.8064581109119115e-06, + "loss": 2.3871, + "mean_token_accuracy": 0.4448275864124298, + "step": 5765 + }, + { + "epoch": 0.00581160528221538, + "grad_norm": 49.52748932236692, + "learning_rate": 5.811494067642971e-06, + "loss": 2.2865, + "mean_token_accuracy": 0.4586206912994385, + "step": 5770 + }, + { + "epoch": 0.0058166413353195525, + "grad_norm": 67.4483402277736, + "learning_rate": 5.816530024374031e-06, + "loss": 2.3308, + "mean_token_accuracy": 0.45899015069007876, + "step": 5775 + }, + { + "epoch": 0.005821677388423725, + "grad_norm": 47.80006453818044, + "learning_rate": 5.821565981105091e-06, + "loss": 2.1648, + "mean_token_accuracy": 0.4413793087005615, + "step": 5780 + }, + { + "epoch": 0.005826713441527898, + "grad_norm": 47.4724644033891, + "learning_rate": 5.826601937836151e-06, + "loss": 2.1088, + "mean_token_accuracy": 0.47586206197738645, + "step": 5785 + }, + { + "epoch": 0.005831749494632071, + "grad_norm": 49.10590670907701, + "learning_rate": 5.83163789456721e-06, + "loss": 2.3584, + "mean_token_accuracy": 0.42758620977401735, + "step": 5790 + }, + { + "epoch": 0.005836785547736244, + "grad_norm": 51.676391997937756, + "learning_rate": 5.8366738512982704e-06, + "loss": 2.0627, + "mean_token_accuracy": 0.4379310429096222, + "step": 5795 + }, + { + "epoch": 0.005841821600840416, + "grad_norm": 46.147196643559006, + "learning_rate": 5.84170980802933e-06, + "loss": 2.1942, + "mean_token_accuracy": 0.4965517222881317, + "step": 5800 + }, + { + "epoch": 0.005846857653944589, + "grad_norm": 56.03749358458488, + "learning_rate": 5.84674576476039e-06, + "loss": 2.3903, + "mean_token_accuracy": 0.3862068891525269, + "step": 5805 + }, + { + "epoch": 0.005851893707048762, + "grad_norm": 49.44581882563388, + "learning_rate": 5.851781721491449e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.4034482717514038, + "step": 5810 + }, + { + "epoch": 0.005856929760152935, + "grad_norm": 71.63146635800206, + "learning_rate": 5.856817678222509e-06, + "loss": 2.4119, + "mean_token_accuracy": 0.3793103456497192, + "step": 5815 + }, + { + "epoch": 0.005861965813257108, + "grad_norm": 49.778333141589755, + "learning_rate": 5.861853634953568e-06, + "loss": 2.3262, + "mean_token_accuracy": 0.4344827651977539, + "step": 5820 + }, + { + "epoch": 0.005867001866361281, + "grad_norm": 48.245521713618245, + "learning_rate": 5.8668895916846285e-06, + "loss": 2.4417, + "mean_token_accuracy": 0.4034482777118683, + "step": 5825 + }, + { + "epoch": 0.0058720379194654535, + "grad_norm": 41.717443758605434, + "learning_rate": 5.871925548415689e-06, + "loss": 1.977, + "mean_token_accuracy": 0.49231699109077454, + "step": 5830 + }, + { + "epoch": 0.0058770739725696255, + "grad_norm": 62.59403087777202, + "learning_rate": 5.876961505146749e-06, + "loss": 2.2271, + "mean_token_accuracy": 0.4517241358757019, + "step": 5835 + }, + { + "epoch": 0.0058821100256737984, + "grad_norm": 56.003939996292274, + "learning_rate": 5.881997461877808e-06, + "loss": 2.6189, + "mean_token_accuracy": 0.3931034505367279, + "step": 5840 + }, + { + "epoch": 0.005887146078777971, + "grad_norm": 47.416387974853365, + "learning_rate": 5.887033418608868e-06, + "loss": 1.9012, + "mean_token_accuracy": 0.4896551787853241, + "step": 5845 + }, + { + "epoch": 0.005892182131882144, + "grad_norm": 49.586942409837356, + "learning_rate": 5.892069375339927e-06, + "loss": 2.057, + "mean_token_accuracy": 0.5068965494632721, + "step": 5850 + }, + { + "epoch": 0.005897218184986317, + "grad_norm": 59.682775630254525, + "learning_rate": 5.897105332070987e-06, + "loss": 2.1288, + "mean_token_accuracy": 0.5034482777118683, + "step": 5855 + }, + { + "epoch": 0.00590225423809049, + "grad_norm": 56.77091109864296, + "learning_rate": 5.902141288802047e-06, + "loss": 2.8807, + "mean_token_accuracy": 0.37241379022598264, + "step": 5860 + }, + { + "epoch": 0.005907290291194663, + "grad_norm": 46.001687468067615, + "learning_rate": 5.907177245533107e-06, + "loss": 2.4388, + "mean_token_accuracy": 0.4172413766384125, + "step": 5865 + }, + { + "epoch": 0.005912326344298835, + "grad_norm": 66.65662223029962, + "learning_rate": 5.912213202264166e-06, + "loss": 2.2835, + "mean_token_accuracy": 0.42413792610168455, + "step": 5870 + }, + { + "epoch": 0.005917362397403008, + "grad_norm": 55.869435267795275, + "learning_rate": 5.917249158995226e-06, + "loss": 2.2795, + "mean_token_accuracy": 0.4275862157344818, + "step": 5875 + }, + { + "epoch": 0.005922398450507181, + "grad_norm": 59.690548626168386, + "learning_rate": 5.922285115726286e-06, + "loss": 2.0925, + "mean_token_accuracy": 0.475862056016922, + "step": 5880 + }, + { + "epoch": 0.005927434503611354, + "grad_norm": 63.0369648609136, + "learning_rate": 5.927321072457346e-06, + "loss": 2.3474, + "mean_token_accuracy": 0.45172412395477296, + "step": 5885 + }, + { + "epoch": 0.0059324705567155266, + "grad_norm": 44.779249009806435, + "learning_rate": 5.9323570291884056e-06, + "loss": 2.5309, + "mean_token_accuracy": 0.3793103456497192, + "step": 5890 + }, + { + "epoch": 0.0059375066098196995, + "grad_norm": 50.12635118283943, + "learning_rate": 5.937392985919466e-06, + "loss": 2.4249, + "mean_token_accuracy": 0.3758620619773865, + "step": 5895 + }, + { + "epoch": 0.005942542662923872, + "grad_norm": 53.6000511347473, + "learning_rate": 5.942428942650525e-06, + "loss": 2.1204, + "mean_token_accuracy": 0.4379310369491577, + "step": 5900 + }, + { + "epoch": 0.005947578716028044, + "grad_norm": 42.2251064214332, + "learning_rate": 5.947464899381585e-06, + "loss": 2.4249, + "mean_token_accuracy": 0.41724138259887694, + "step": 5905 + }, + { + "epoch": 0.005952614769132217, + "grad_norm": 45.74037917102216, + "learning_rate": 5.952500856112644e-06, + "loss": 2.2228, + "mean_token_accuracy": 0.4517241299152374, + "step": 5910 + }, + { + "epoch": 0.00595765082223639, + "grad_norm": 47.18715188090433, + "learning_rate": 5.957536812843704e-06, + "loss": 2.267, + "mean_token_accuracy": 0.44827587008476255, + "step": 5915 + }, + { + "epoch": 0.005962686875340563, + "grad_norm": 73.07126749022403, + "learning_rate": 5.962572769574764e-06, + "loss": 2.2476, + "mean_token_accuracy": 0.48275861144065857, + "step": 5920 + }, + { + "epoch": 0.005967722928444736, + "grad_norm": 45.65174713110633, + "learning_rate": 5.967608726305824e-06, + "loss": 2.01, + "mean_token_accuracy": 0.5017543852329254, + "step": 5925 + }, + { + "epoch": 0.005972758981548909, + "grad_norm": 73.17486697360341, + "learning_rate": 5.972644683036884e-06, + "loss": 2.2443, + "mean_token_accuracy": 0.4517241299152374, + "step": 5930 + }, + { + "epoch": 0.005977795034653082, + "grad_norm": 50.62187265347177, + "learning_rate": 5.977680639767944e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.4413793087005615, + "step": 5935 + }, + { + "epoch": 0.005982831087757254, + "grad_norm": 71.65213282291283, + "learning_rate": 5.982716596499003e-06, + "loss": 2.0024, + "mean_token_accuracy": 0.5379310309886932, + "step": 5940 + }, + { + "epoch": 0.005987867140861427, + "grad_norm": 45.805198992796576, + "learning_rate": 5.987752553230063e-06, + "loss": 2.2747, + "mean_token_accuracy": 0.4379310369491577, + "step": 5945 + }, + { + "epoch": 0.0059929031939656, + "grad_norm": 63.84750561509887, + "learning_rate": 5.9927885099611225e-06, + "loss": 2.1543, + "mean_token_accuracy": 0.46787658929824827, + "step": 5950 + }, + { + "epoch": 0.0059979392470697725, + "grad_norm": 52.74623210309535, + "learning_rate": 5.997824466692183e-06, + "loss": 2.3777, + "mean_token_accuracy": 0.482758617401123, + "step": 5955 + }, + { + "epoch": 0.006002975300173945, + "grad_norm": 44.82689823442204, + "learning_rate": 6.002860423423242e-06, + "loss": 2.3646, + "mean_token_accuracy": 0.43103448748588563, + "step": 5960 + }, + { + "epoch": 0.006008011353278118, + "grad_norm": 36.79439771031893, + "learning_rate": 6.007896380154302e-06, + "loss": 2.5272, + "mean_token_accuracy": 0.4241379380226135, + "step": 5965 + }, + { + "epoch": 0.006013047406382291, + "grad_norm": 46.76840821428695, + "learning_rate": 6.012932336885361e-06, + "loss": 2.6179, + "mean_token_accuracy": 0.4103448212146759, + "step": 5970 + }, + { + "epoch": 0.006018083459486463, + "grad_norm": 61.742549064534096, + "learning_rate": 6.017968293616421e-06, + "loss": 2.4178, + "mean_token_accuracy": 0.4413793087005615, + "step": 5975 + }, + { + "epoch": 0.006023119512590636, + "grad_norm": 39.96388589238497, + "learning_rate": 6.0230042503474814e-06, + "loss": 2.2839, + "mean_token_accuracy": 0.47586206197738645, + "step": 5980 + }, + { + "epoch": 0.006028155565694809, + "grad_norm": 51.1152972679412, + "learning_rate": 6.0280402070785415e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.46551724076271056, + "step": 5985 + }, + { + "epoch": 0.006033191618798982, + "grad_norm": 41.03389359657289, + "learning_rate": 6.033076163809601e-06, + "loss": 2.2936, + "mean_token_accuracy": 0.4896551787853241, + "step": 5990 + }, + { + "epoch": 0.006038227671903155, + "grad_norm": 52.97017434032984, + "learning_rate": 6.038112120540661e-06, + "loss": 2.6077, + "mean_token_accuracy": 0.4172413766384125, + "step": 5995 + }, + { + "epoch": 0.006043263725007328, + "grad_norm": 54.56388228996818, + "learning_rate": 6.04314807727172e-06, + "loss": 2.688, + "mean_token_accuracy": 0.3551724225282669, + "step": 6000 + }, + { + "epoch": 0.006048299778111501, + "grad_norm": 59.272229527489564, + "learning_rate": 6.04818403400278e-06, + "loss": 2.4984, + "mean_token_accuracy": 0.44482758045196535, + "step": 6005 + }, + { + "epoch": 0.006053335831215673, + "grad_norm": 55.3052764060603, + "learning_rate": 6.0532199907338395e-06, + "loss": 2.3765, + "mean_token_accuracy": 0.4586206912994385, + "step": 6010 + }, + { + "epoch": 0.0060583718843198455, + "grad_norm": 43.43145741753114, + "learning_rate": 6.0582559474649e-06, + "loss": 2.4768, + "mean_token_accuracy": 0.38965516686439516, + "step": 6015 + }, + { + "epoch": 0.0060634079374240184, + "grad_norm": 74.04778846790151, + "learning_rate": 6.063291904195959e-06, + "loss": 2.4068, + "mean_token_accuracy": 0.41034482717514037, + "step": 6020 + }, + { + "epoch": 0.006068443990528191, + "grad_norm": 48.8311733790857, + "learning_rate": 6.068327860927019e-06, + "loss": 2.628, + "mean_token_accuracy": 0.4120992124080658, + "step": 6025 + }, + { + "epoch": 0.006073480043632364, + "grad_norm": 68.89743007492572, + "learning_rate": 6.073363817658079e-06, + "loss": 2.2201, + "mean_token_accuracy": 0.4436781704425812, + "step": 6030 + }, + { + "epoch": 0.006078516096736537, + "grad_norm": 64.98102392847618, + "learning_rate": 6.078399774389139e-06, + "loss": 2.3434, + "mean_token_accuracy": 0.46551724672317507, + "step": 6035 + }, + { + "epoch": 0.00608355214984071, + "grad_norm": 53.137492829913924, + "learning_rate": 6.083435731120198e-06, + "loss": 2.1615, + "mean_token_accuracy": 0.4448275864124298, + "step": 6040 + }, + { + "epoch": 0.006088588202944882, + "grad_norm": 51.69054411972822, + "learning_rate": 6.0884716878512585e-06, + "loss": 2.1671, + "mean_token_accuracy": 0.49999999403953554, + "step": 6045 + }, + { + "epoch": 0.006093624256049055, + "grad_norm": 40.8330215843773, + "learning_rate": 6.093507644582318e-06, + "loss": 2.1323, + "mean_token_accuracy": 0.4517241358757019, + "step": 6050 + }, + { + "epoch": 0.006098660309153228, + "grad_norm": 44.62175430710238, + "learning_rate": 6.098543601313378e-06, + "loss": 2.2325, + "mean_token_accuracy": 0.45862069725990295, + "step": 6055 + }, + { + "epoch": 0.006103696362257401, + "grad_norm": 91.88244434629704, + "learning_rate": 6.103579558044437e-06, + "loss": 2.1949, + "mean_token_accuracy": 0.45862069725990295, + "step": 6060 + }, + { + "epoch": 0.006108732415361574, + "grad_norm": 47.25705097010408, + "learning_rate": 6.108615514775497e-06, + "loss": 2.292, + "mean_token_accuracy": 0.4758620738983154, + "step": 6065 + }, + { + "epoch": 0.0061137684684657466, + "grad_norm": 48.18774763111376, + "learning_rate": 6.1136514715065565e-06, + "loss": 2.3922, + "mean_token_accuracy": 0.42413792610168455, + "step": 6070 + }, + { + "epoch": 0.0061188045215699195, + "grad_norm": 48.67803708131491, + "learning_rate": 6.1186874282376166e-06, + "loss": 2.2161, + "mean_token_accuracy": 0.4950393199920654, + "step": 6075 + }, + { + "epoch": 0.0061238405746740915, + "grad_norm": 54.9652354132669, + "learning_rate": 6.123723384968677e-06, + "loss": 2.3173, + "mean_token_accuracy": 0.4448275864124298, + "step": 6080 + }, + { + "epoch": 0.006128876627778264, + "grad_norm": 67.28103712072297, + "learning_rate": 6.128759341699737e-06, + "loss": 2.2039, + "mean_token_accuracy": 0.4448275864124298, + "step": 6085 + }, + { + "epoch": 0.006133912680882437, + "grad_norm": 46.78691878078224, + "learning_rate": 6.133795298430796e-06, + "loss": 2.366, + "mean_token_accuracy": 0.4344827592372894, + "step": 6090 + }, + { + "epoch": 0.00613894873398661, + "grad_norm": 51.59746353275638, + "learning_rate": 6.138831255161856e-06, + "loss": 2.1121, + "mean_token_accuracy": 0.48275861144065857, + "step": 6095 + }, + { + "epoch": 0.006143984787090783, + "grad_norm": 44.57197378514623, + "learning_rate": 6.143867211892915e-06, + "loss": 2.3257, + "mean_token_accuracy": 0.4551724135875702, + "step": 6100 + }, + { + "epoch": 0.006149020840194956, + "grad_norm": 41.87145916444733, + "learning_rate": 6.1489031686239755e-06, + "loss": 2.2679, + "mean_token_accuracy": 0.47586206197738645, + "step": 6105 + }, + { + "epoch": 0.006154056893299129, + "grad_norm": 50.881408989922846, + "learning_rate": 6.153939125355035e-06, + "loss": 2.09, + "mean_token_accuracy": 0.46896552443504336, + "step": 6110 + }, + { + "epoch": 0.006159092946403301, + "grad_norm": 49.065415270065614, + "learning_rate": 6.158975082086095e-06, + "loss": 2.4182, + "mean_token_accuracy": 0.4068965494632721, + "step": 6115 + }, + { + "epoch": 0.006164128999507474, + "grad_norm": 56.40941295285368, + "learning_rate": 6.164011038817154e-06, + "loss": 2.2607, + "mean_token_accuracy": 0.44827585816383364, + "step": 6120 + }, + { + "epoch": 0.006169165052611647, + "grad_norm": 59.94106134132087, + "learning_rate": 6.169046995548214e-06, + "loss": 2.2057, + "mean_token_accuracy": 0.4620689690113068, + "step": 6125 + }, + { + "epoch": 0.00617420110571582, + "grad_norm": 60.51513615811769, + "learning_rate": 6.174082952279274e-06, + "loss": 2.6637, + "mean_token_accuracy": 0.4034482717514038, + "step": 6130 + }, + { + "epoch": 0.0061792371588199925, + "grad_norm": 45.04109258907642, + "learning_rate": 6.179118909010334e-06, + "loss": 2.3988, + "mean_token_accuracy": 0.4896551728248596, + "step": 6135 + }, + { + "epoch": 0.006184273211924165, + "grad_norm": 42.07406866907482, + "learning_rate": 6.1841548657413945e-06, + "loss": 2.2585, + "mean_token_accuracy": 0.4689655125141144, + "step": 6140 + }, + { + "epoch": 0.006189309265028338, + "grad_norm": 35.23544246771574, + "learning_rate": 6.189190822472454e-06, + "loss": 2.0644, + "mean_token_accuracy": 0.48965516686439514, + "step": 6145 + }, + { + "epoch": 0.00619434531813251, + "grad_norm": 51.947830805105106, + "learning_rate": 6.194226779203514e-06, + "loss": 2.1011, + "mean_token_accuracy": 0.5137930989265442, + "step": 6150 + }, + { + "epoch": 0.006199381371236683, + "grad_norm": 40.92681724617014, + "learning_rate": 6.199262735934573e-06, + "loss": 1.7219, + "mean_token_accuracy": 0.558620685338974, + "step": 6155 + }, + { + "epoch": 0.006204417424340856, + "grad_norm": 72.52788549550016, + "learning_rate": 6.204298692665633e-06, + "loss": 2.3502, + "mean_token_accuracy": 0.44482758045196535, + "step": 6160 + }, + { + "epoch": 0.006209453477445029, + "grad_norm": 40.4514357625008, + "learning_rate": 6.2093346493966924e-06, + "loss": 2.0021, + "mean_token_accuracy": 0.5034482657909394, + "step": 6165 + }, + { + "epoch": 0.006214489530549202, + "grad_norm": 42.23448253442933, + "learning_rate": 6.2143706061277525e-06, + "loss": 2.3192, + "mean_token_accuracy": 0.43793103098869324, + "step": 6170 + }, + { + "epoch": 0.006219525583653375, + "grad_norm": 45.95498196457272, + "learning_rate": 6.219406562858812e-06, + "loss": 2.3219, + "mean_token_accuracy": 0.4482758641242981, + "step": 6175 + }, + { + "epoch": 0.006224561636757548, + "grad_norm": 71.03223809129595, + "learning_rate": 6.224442519589872e-06, + "loss": 2.533, + "mean_token_accuracy": 0.4034482717514038, + "step": 6180 + }, + { + "epoch": 0.00622959768986172, + "grad_norm": 60.59925401084252, + "learning_rate": 6.229478476320932e-06, + "loss": 2.6162, + "mean_token_accuracy": 0.3620689630508423, + "step": 6185 + }, + { + "epoch": 0.006234633742965893, + "grad_norm": 63.38753204190949, + "learning_rate": 6.234514433051992e-06, + "loss": 2.2126, + "mean_token_accuracy": 0.44827585816383364, + "step": 6190 + }, + { + "epoch": 0.0062396697960700655, + "grad_norm": 60.06166266856566, + "learning_rate": 6.239550389783051e-06, + "loss": 2.2572, + "mean_token_accuracy": 0.44827585816383364, + "step": 6195 + }, + { + "epoch": 0.0062447058491742384, + "grad_norm": 71.29171513043804, + "learning_rate": 6.2445863465141114e-06, + "loss": 2.5219, + "mean_token_accuracy": 0.40344826579093934, + "step": 6200 + }, + { + "epoch": 0.006249741902278411, + "grad_norm": 49.1617253919964, + "learning_rate": 6.249622303245171e-06, + "loss": 2.4708, + "mean_token_accuracy": 0.3827586233615875, + "step": 6205 + }, + { + "epoch": 0.006254777955382584, + "grad_norm": 52.14615912760074, + "learning_rate": 6.25465825997623e-06, + "loss": 2.3844, + "mean_token_accuracy": 0.44827585220336913, + "step": 6210 + }, + { + "epoch": 0.006259814008486756, + "grad_norm": 53.628046240845265, + "learning_rate": 6.25969421670729e-06, + "loss": 2.1537, + "mean_token_accuracy": 0.4413792997598648, + "step": 6215 + }, + { + "epoch": 0.006264850061590929, + "grad_norm": 51.347910812131715, + "learning_rate": 6.26473017343835e-06, + "loss": 2.2062, + "mean_token_accuracy": 0.4551724076271057, + "step": 6220 + }, + { + "epoch": 0.006269886114695102, + "grad_norm": 45.999235325877976, + "learning_rate": 6.26976613016941e-06, + "loss": 2.3164, + "mean_token_accuracy": 0.44482759237289426, + "step": 6225 + }, + { + "epoch": 0.006274922167799275, + "grad_norm": 70.87573238831074, + "learning_rate": 6.2748020869004695e-06, + "loss": 2.0772, + "mean_token_accuracy": 0.42758620381355283, + "step": 6230 + }, + { + "epoch": 0.006279958220903448, + "grad_norm": 50.89961608996036, + "learning_rate": 6.27983804363153e-06, + "loss": 2.3694, + "mean_token_accuracy": 0.42068964838981626, + "step": 6235 + }, + { + "epoch": 0.006284994274007621, + "grad_norm": 70.38825319121104, + "learning_rate": 6.28487400036259e-06, + "loss": 2.2528, + "mean_token_accuracy": 0.4413793087005615, + "step": 6240 + }, + { + "epoch": 0.006290030327111794, + "grad_norm": 43.413320890677525, + "learning_rate": 6.28990995709365e-06, + "loss": 2.3949, + "mean_token_accuracy": 0.4034482717514038, + "step": 6245 + }, + { + "epoch": 0.006295066380215966, + "grad_norm": 53.35471033987399, + "learning_rate": 6.294945913824708e-06, + "loss": 2.3449, + "mean_token_accuracy": 0.4068965554237366, + "step": 6250 + }, + { + "epoch": 0.006300102433320139, + "grad_norm": 49.48306681907208, + "learning_rate": 6.299981870555768e-06, + "loss": 2.4828, + "mean_token_accuracy": 0.450030255317688, + "step": 6255 + }, + { + "epoch": 0.0063051384864243115, + "grad_norm": 51.66348090698549, + "learning_rate": 6.305017827286828e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.4172413766384125, + "step": 6260 + }, + { + "epoch": 0.006310174539528484, + "grad_norm": 54.983296847206454, + "learning_rate": 6.3100537840178885e-06, + "loss": 2.5207, + "mean_token_accuracy": 0.3896551728248596, + "step": 6265 + }, + { + "epoch": 0.006315210592632657, + "grad_norm": 57.2159305784855, + "learning_rate": 6.315089740748947e-06, + "loss": 2.1859, + "mean_token_accuracy": 0.5241379261016845, + "step": 6270 + }, + { + "epoch": 0.00632024664573683, + "grad_norm": 49.84401674196088, + "learning_rate": 6.320125697480008e-06, + "loss": 2.2443, + "mean_token_accuracy": 0.45862067937850953, + "step": 6275 + }, + { + "epoch": 0.006325282698841003, + "grad_norm": 50.34434897808259, + "learning_rate": 6.325161654211068e-06, + "loss": 2.3023, + "mean_token_accuracy": 0.4843315064907074, + "step": 6280 + }, + { + "epoch": 0.006330318751945175, + "grad_norm": 37.64893641660931, + "learning_rate": 6.330197610942128e-06, + "loss": 2.2012, + "mean_token_accuracy": 0.47241379618644713, + "step": 6285 + }, + { + "epoch": 0.006335354805049348, + "grad_norm": 54.30004367765904, + "learning_rate": 6.3352335676731865e-06, + "loss": 2.2012, + "mean_token_accuracy": 0.46067755222320556, + "step": 6290 + }, + { + "epoch": 0.006340390858153521, + "grad_norm": 62.30228385139683, + "learning_rate": 6.3402695244042466e-06, + "loss": 2.372, + "mean_token_accuracy": 0.47586206793785096, + "step": 6295 + }, + { + "epoch": 0.006345426911257694, + "grad_norm": 54.45164943957353, + "learning_rate": 6.345305481135307e-06, + "loss": 2.2323, + "mean_token_accuracy": 0.46206897497177124, + "step": 6300 + }, + { + "epoch": 0.006350462964361867, + "grad_norm": 51.94258537664782, + "learning_rate": 6.350341437866367e-06, + "loss": 2.4528, + "mean_token_accuracy": 0.4517241418361664, + "step": 6305 + }, + { + "epoch": 0.00635549901746604, + "grad_norm": 41.92105173730461, + "learning_rate": 6.355377394597425e-06, + "loss": 2.3346, + "mean_token_accuracy": 0.5034482717514038, + "step": 6310 + }, + { + "epoch": 0.0063605350705702125, + "grad_norm": 57.91014312614557, + "learning_rate": 6.360413351328485e-06, + "loss": 2.1116, + "mean_token_accuracy": 0.4931034445762634, + "step": 6315 + }, + { + "epoch": 0.0063655711236743845, + "grad_norm": 57.6689154729563, + "learning_rate": 6.365449308059545e-06, + "loss": 2.1509, + "mean_token_accuracy": 0.4724137902259827, + "step": 6320 + }, + { + "epoch": 0.006370607176778557, + "grad_norm": 49.18580348890661, + "learning_rate": 6.3704852647906055e-06, + "loss": 2.1044, + "mean_token_accuracy": 0.4610405325889587, + "step": 6325 + }, + { + "epoch": 0.00637564322988273, + "grad_norm": 41.5843166932207, + "learning_rate": 6.375521221521665e-06, + "loss": 2.7451, + "mean_token_accuracy": 0.4482758641242981, + "step": 6330 + }, + { + "epoch": 0.006380679282986903, + "grad_norm": 59.449037433241976, + "learning_rate": 6.380557178252725e-06, + "loss": 2.5404, + "mean_token_accuracy": 0.42976405322551725, + "step": 6335 + }, + { + "epoch": 0.006385715336091076, + "grad_norm": 47.41394815150942, + "learning_rate": 6.385593134983785e-06, + "loss": 2.3082, + "mean_token_accuracy": 0.42758620381355283, + "step": 6340 + }, + { + "epoch": 0.006390751389195249, + "grad_norm": 66.59307195900514, + "learning_rate": 6.390629091714845e-06, + "loss": 2.1217, + "mean_token_accuracy": 0.44827585816383364, + "step": 6345 + }, + { + "epoch": 0.006395787442299422, + "grad_norm": 50.20758022153302, + "learning_rate": 6.395665048445903e-06, + "loss": 2.1906, + "mean_token_accuracy": 0.4689655125141144, + "step": 6350 + }, + { + "epoch": 0.006400823495403594, + "grad_norm": 46.68766382287173, + "learning_rate": 6.4007010051769635e-06, + "loss": 2.4806, + "mean_token_accuracy": 0.4172413766384125, + "step": 6355 + }, + { + "epoch": 0.006405859548507767, + "grad_norm": 78.4136007973002, + "learning_rate": 6.405736961908024e-06, + "loss": 2.8195, + "mean_token_accuracy": 0.38965516686439516, + "step": 6360 + }, + { + "epoch": 0.00641089560161194, + "grad_norm": 47.809726494906585, + "learning_rate": 6.410772918639084e-06, + "loss": 2.4458, + "mean_token_accuracy": 0.4428917169570923, + "step": 6365 + }, + { + "epoch": 0.006415931654716113, + "grad_norm": 54.69693504704357, + "learning_rate": 6.415808875370143e-06, + "loss": 2.3096, + "mean_token_accuracy": 0.4068965494632721, + "step": 6370 + }, + { + "epoch": 0.0064209677078202855, + "grad_norm": 45.18477247322321, + "learning_rate": 6.420844832101203e-06, + "loss": 2.2768, + "mean_token_accuracy": 0.4448275864124298, + "step": 6375 + }, + { + "epoch": 0.0064260037609244584, + "grad_norm": 41.75081284043611, + "learning_rate": 6.425880788832263e-06, + "loss": 2.1484, + "mean_token_accuracy": 0.5000000059604645, + "step": 6380 + }, + { + "epoch": 0.006431039814028631, + "grad_norm": 60.48977352776294, + "learning_rate": 6.430916745563323e-06, + "loss": 2.0936, + "mean_token_accuracy": 0.4448275864124298, + "step": 6385 + }, + { + "epoch": 0.006436075867132803, + "grad_norm": 50.43668501080648, + "learning_rate": 6.435952702294382e-06, + "loss": 2.3059, + "mean_token_accuracy": 0.4413793087005615, + "step": 6390 + }, + { + "epoch": 0.006441111920236976, + "grad_norm": 70.28522233717965, + "learning_rate": 6.440988659025442e-06, + "loss": 2.2686, + "mean_token_accuracy": 0.44652147889137267, + "step": 6395 + }, + { + "epoch": 0.006446147973341149, + "grad_norm": 57.66148989977817, + "learning_rate": 6.446024615756502e-06, + "loss": 2.2815, + "mean_token_accuracy": 0.4551724135875702, + "step": 6400 + }, + { + "epoch": 0.006451184026445322, + "grad_norm": 42.62732695699391, + "learning_rate": 6.451060572487562e-06, + "loss": 2.3338, + "mean_token_accuracy": 0.42413792610168455, + "step": 6405 + }, + { + "epoch": 0.006456220079549495, + "grad_norm": 71.03479029588128, + "learning_rate": 6.45609652921862e-06, + "loss": 2.1171, + "mean_token_accuracy": 0.46467028856277465, + "step": 6410 + }, + { + "epoch": 0.006461256132653668, + "grad_norm": 54.16591231991322, + "learning_rate": 6.4611324859496805e-06, + "loss": 1.9617, + "mean_token_accuracy": 0.5052026629447937, + "step": 6415 + }, + { + "epoch": 0.006466292185757841, + "grad_norm": 43.26053484179596, + "learning_rate": 6.466168442680741e-06, + "loss": 2.2872, + "mean_token_accuracy": 0.44827585816383364, + "step": 6420 + }, + { + "epoch": 0.006471328238862013, + "grad_norm": 52.84199867986601, + "learning_rate": 6.471204399411801e-06, + "loss": 2.306, + "mean_token_accuracy": 0.458620685338974, + "step": 6425 + }, + { + "epoch": 0.006476364291966186, + "grad_norm": 46.96134223366744, + "learning_rate": 6.47624035614286e-06, + "loss": 2.3141, + "mean_token_accuracy": 0.4931034445762634, + "step": 6430 + }, + { + "epoch": 0.006481400345070359, + "grad_norm": 73.80457116500342, + "learning_rate": 6.48127631287392e-06, + "loss": 2.5553, + "mean_token_accuracy": 0.39999999701976774, + "step": 6435 + }, + { + "epoch": 0.0064864363981745315, + "grad_norm": 65.85543230417136, + "learning_rate": 6.48631226960498e-06, + "loss": 2.4888, + "mean_token_accuracy": 0.3999999940395355, + "step": 6440 + }, + { + "epoch": 0.006491472451278704, + "grad_norm": 41.42742849971942, + "learning_rate": 6.49134822633604e-06, + "loss": 1.9763, + "mean_token_accuracy": 0.4931034445762634, + "step": 6445 + }, + { + "epoch": 0.006496508504382877, + "grad_norm": 48.89036856678386, + "learning_rate": 6.496384183067099e-06, + "loss": 2.2551, + "mean_token_accuracy": 0.4517241358757019, + "step": 6450 + }, + { + "epoch": 0.00650154455748705, + "grad_norm": 52.465825307129904, + "learning_rate": 6.501420139798159e-06, + "loss": 2.3935, + "mean_token_accuracy": 0.41034482717514037, + "step": 6455 + }, + { + "epoch": 0.006506580610591222, + "grad_norm": 59.072941227879916, + "learning_rate": 6.506456096529219e-06, + "loss": 2.5701, + "mean_token_accuracy": 0.37586207389831544, + "step": 6460 + }, + { + "epoch": 0.006511616663695395, + "grad_norm": 51.59900121801609, + "learning_rate": 6.511492053260279e-06, + "loss": 2.2572, + "mean_token_accuracy": 0.4310344815254211, + "step": 6465 + }, + { + "epoch": 0.006516652716799568, + "grad_norm": 64.40473563043554, + "learning_rate": 6.516528009991338e-06, + "loss": 2.4966, + "mean_token_accuracy": 0.441379314661026, + "step": 6470 + }, + { + "epoch": 0.006521688769903741, + "grad_norm": 53.07618014705688, + "learning_rate": 6.521563966722398e-06, + "loss": 2.0773, + "mean_token_accuracy": 0.5053236544132232, + "step": 6475 + }, + { + "epoch": 0.006526724823007914, + "grad_norm": 49.69534158548225, + "learning_rate": 6.526599923453458e-06, + "loss": 2.2584, + "mean_token_accuracy": 0.43647912740707395, + "step": 6480 + }, + { + "epoch": 0.006531760876112087, + "grad_norm": 36.00189829175751, + "learning_rate": 6.5316358801845185e-06, + "loss": 2.2539, + "mean_token_accuracy": 0.4413793087005615, + "step": 6485 + }, + { + "epoch": 0.00653679692921626, + "grad_norm": 51.085414882729374, + "learning_rate": 6.536671836915577e-06, + "loss": 2.1821, + "mean_token_accuracy": 0.4517241418361664, + "step": 6490 + }, + { + "epoch": 0.006541832982320432, + "grad_norm": 41.33047324342737, + "learning_rate": 6.541707793646637e-06, + "loss": 2.5362, + "mean_token_accuracy": 0.41379310488700866, + "step": 6495 + }, + { + "epoch": 0.0065468690354246045, + "grad_norm": 39.817116760636985, + "learning_rate": 6.546743750377697e-06, + "loss": 2.0292, + "mean_token_accuracy": 0.45517241954803467, + "step": 6500 + }, + { + "epoch": 0.006551905088528777, + "grad_norm": 54.476993781896766, + "learning_rate": 6.551779707108757e-06, + "loss": 2.4226, + "mean_token_accuracy": 0.44137930274009707, + "step": 6505 + }, + { + "epoch": 0.00655694114163295, + "grad_norm": 37.075696447095005, + "learning_rate": 6.556815663839816e-06, + "loss": 2.1614, + "mean_token_accuracy": 0.4724137902259827, + "step": 6510 + }, + { + "epoch": 0.006561977194737123, + "grad_norm": 41.83870390251137, + "learning_rate": 6.561851620570876e-06, + "loss": 2.3574, + "mean_token_accuracy": 0.44827587008476255, + "step": 6515 + }, + { + "epoch": 0.006567013247841296, + "grad_norm": 49.884207237603086, + "learning_rate": 6.566887577301936e-06, + "loss": 1.9869, + "mean_token_accuracy": 0.4689655125141144, + "step": 6520 + }, + { + "epoch": 0.006572049300945469, + "grad_norm": 64.91084528084171, + "learning_rate": 6.571923534032996e-06, + "loss": 1.9082, + "mean_token_accuracy": 0.5275862038135528, + "step": 6525 + }, + { + "epoch": 0.006577085354049641, + "grad_norm": 67.48064653609676, + "learning_rate": 6.576959490764055e-06, + "loss": 2.3476, + "mean_token_accuracy": 0.41034482717514037, + "step": 6530 + }, + { + "epoch": 0.006582121407153814, + "grad_norm": 47.06341963789077, + "learning_rate": 6.581995447495115e-06, + "loss": 2.1041, + "mean_token_accuracy": 0.4896551728248596, + "step": 6535 + }, + { + "epoch": 0.006587157460257987, + "grad_norm": 40.80627504404265, + "learning_rate": 6.587031404226175e-06, + "loss": 2.145, + "mean_token_accuracy": 0.4517241418361664, + "step": 6540 + }, + { + "epoch": 0.00659219351336216, + "grad_norm": 61.507831111427976, + "learning_rate": 6.5920673609572355e-06, + "loss": 2.2986, + "mean_token_accuracy": 0.42413793206214906, + "step": 6545 + }, + { + "epoch": 0.006597229566466333, + "grad_norm": 37.707830868354016, + "learning_rate": 6.5971033176882956e-06, + "loss": 2.1459, + "mean_token_accuracy": 0.47931034564971925, + "step": 6550 + }, + { + "epoch": 0.0066022656195705055, + "grad_norm": 44.45500335259625, + "learning_rate": 6.602139274419354e-06, + "loss": 2.1738, + "mean_token_accuracy": 0.44482759237289426, + "step": 6555 + }, + { + "epoch": 0.006607301672674678, + "grad_norm": 53.06572702603528, + "learning_rate": 6.607175231150414e-06, + "loss": 2.4652, + "mean_token_accuracy": 0.42068964838981626, + "step": 6560 + }, + { + "epoch": 0.0066123377257788505, + "grad_norm": 51.13847730255945, + "learning_rate": 6.612211187881474e-06, + "loss": 2.3187, + "mean_token_accuracy": 0.417241370677948, + "step": 6565 + }, + { + "epoch": 0.006617373778883023, + "grad_norm": 61.94679831656144, + "learning_rate": 6.617247144612534e-06, + "loss": 1.8596, + "mean_token_accuracy": 0.5539624989032745, + "step": 6570 + }, + { + "epoch": 0.006622409831987196, + "grad_norm": 57.21162842111359, + "learning_rate": 6.6222831013435935e-06, + "loss": 2.1794, + "mean_token_accuracy": 0.482758617401123, + "step": 6575 + }, + { + "epoch": 0.006627445885091369, + "grad_norm": 60.123527880645874, + "learning_rate": 6.627319058074654e-06, + "loss": 2.2354, + "mean_token_accuracy": 0.4724137902259827, + "step": 6580 + }, + { + "epoch": 0.006632481938195542, + "grad_norm": 47.63020519891939, + "learning_rate": 6.632355014805714e-06, + "loss": 1.9821, + "mean_token_accuracy": 0.47241380214691164, + "step": 6585 + }, + { + "epoch": 0.006637517991299715, + "grad_norm": 48.66873674193257, + "learning_rate": 6.637390971536774e-06, + "loss": 2.0273, + "mean_token_accuracy": 0.5000000059604645, + "step": 6590 + }, + { + "epoch": 0.006642554044403888, + "grad_norm": 50.00042456961159, + "learning_rate": 6.642426928267832e-06, + "loss": 2.2344, + "mean_token_accuracy": 0.44827585518360136, + "step": 6595 + }, + { + "epoch": 0.00664759009750806, + "grad_norm": 56.230081942508846, + "learning_rate": 6.647462884998892e-06, + "loss": 1.9971, + "mean_token_accuracy": 0.4965517222881317, + "step": 6600 + }, + { + "epoch": 0.006652626150612233, + "grad_norm": 51.54557010375823, + "learning_rate": 6.6524988417299524e-06, + "loss": 2.5054, + "mean_token_accuracy": 0.42068966031074523, + "step": 6605 + }, + { + "epoch": 0.006657662203716406, + "grad_norm": 96.51899146918406, + "learning_rate": 6.6575347984610125e-06, + "loss": 2.3347, + "mean_token_accuracy": 0.44827587008476255, + "step": 6610 + }, + { + "epoch": 0.006662698256820579, + "grad_norm": 48.47226280128962, + "learning_rate": 6.662570755192071e-06, + "loss": 2.3117, + "mean_token_accuracy": 0.4034482777118683, + "step": 6615 + }, + { + "epoch": 0.0066677343099247515, + "grad_norm": 55.760672261985775, + "learning_rate": 6.667606711923131e-06, + "loss": 2.2402, + "mean_token_accuracy": 0.4517241299152374, + "step": 6620 + }, + { + "epoch": 0.006672770363028924, + "grad_norm": 54.06452456481192, + "learning_rate": 6.672642668654191e-06, + "loss": 2.3452, + "mean_token_accuracy": 0.4689655125141144, + "step": 6625 + }, + { + "epoch": 0.006677806416133097, + "grad_norm": 36.6955803829172, + "learning_rate": 6.677678625385251e-06, + "loss": 2.1433, + "mean_token_accuracy": 0.46896551847457885, + "step": 6630 + }, + { + "epoch": 0.006682842469237269, + "grad_norm": 57.01798009939822, + "learning_rate": 6.6827145821163105e-06, + "loss": 2.4756, + "mean_token_accuracy": 0.4172413766384125, + "step": 6635 + }, + { + "epoch": 0.006687878522341442, + "grad_norm": 42.74085015435525, + "learning_rate": 6.687750538847371e-06, + "loss": 2.4783, + "mean_token_accuracy": 0.4310344815254211, + "step": 6640 + }, + { + "epoch": 0.006692914575445615, + "grad_norm": 50.32812591651769, + "learning_rate": 6.692786495578431e-06, + "loss": 2.2453, + "mean_token_accuracy": 0.47586206793785096, + "step": 6645 + }, + { + "epoch": 0.006697950628549788, + "grad_norm": 47.23870937895076, + "learning_rate": 6.697822452309491e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.43793103098869324, + "step": 6650 + }, + { + "epoch": 0.006702986681653961, + "grad_norm": 53.125040717996974, + "learning_rate": 6.702858409040549e-06, + "loss": 2.0319, + "mean_token_accuracy": 0.5248768568038941, + "step": 6655 + }, + { + "epoch": 0.006708022734758134, + "grad_norm": 48.4897488020064, + "learning_rate": 6.707894365771609e-06, + "loss": 2.2106, + "mean_token_accuracy": 0.4793103516101837, + "step": 6660 + }, + { + "epoch": 0.006713058787862306, + "grad_norm": 37.454403300158496, + "learning_rate": 6.712930322502669e-06, + "loss": 2.0216, + "mean_token_accuracy": 0.46551724672317507, + "step": 6665 + }, + { + "epoch": 0.006718094840966479, + "grad_norm": 50.30966431581214, + "learning_rate": 6.7179662792337295e-06, + "loss": 2.2227, + "mean_token_accuracy": 0.47241379618644713, + "step": 6670 + }, + { + "epoch": 0.006723130894070652, + "grad_norm": 63.24348477311662, + "learning_rate": 6.723002235964789e-06, + "loss": 2.2059, + "mean_token_accuracy": 0.4724137902259827, + "step": 6675 + }, + { + "epoch": 0.0067281669471748245, + "grad_norm": 39.8398484643016, + "learning_rate": 6.728038192695849e-06, + "loss": 2.126, + "mean_token_accuracy": 0.49655171632766726, + "step": 6680 + }, + { + "epoch": 0.006733203000278997, + "grad_norm": 47.64301163347034, + "learning_rate": 6.733074149426909e-06, + "loss": 2.2225, + "mean_token_accuracy": 0.4275862008333206, + "step": 6685 + }, + { + "epoch": 0.00673823905338317, + "grad_norm": 45.702792838032366, + "learning_rate": 6.738110106157969e-06, + "loss": 2.2141, + "mean_token_accuracy": 0.42758620381355283, + "step": 6690 + }, + { + "epoch": 0.006743275106487343, + "grad_norm": 57.452127969203346, + "learning_rate": 6.7431460628890274e-06, + "loss": 2.5248, + "mean_token_accuracy": 0.42758620977401735, + "step": 6695 + }, + { + "epoch": 0.006748311159591515, + "grad_norm": 53.024427870925045, + "learning_rate": 6.7481820196200875e-06, + "loss": 2.1566, + "mean_token_accuracy": 0.493103438615799, + "step": 6700 + }, + { + "epoch": 0.006753347212695688, + "grad_norm": 59.97626946191907, + "learning_rate": 6.753217976351148e-06, + "loss": 2.4165, + "mean_token_accuracy": 0.45517241954803467, + "step": 6705 + }, + { + "epoch": 0.006758383265799861, + "grad_norm": 48.20810556569317, + "learning_rate": 6.758253933082208e-06, + "loss": 2.1737, + "mean_token_accuracy": 0.4206896543502808, + "step": 6710 + }, + { + "epoch": 0.006763419318904034, + "grad_norm": 46.63656709914599, + "learning_rate": 6.763289889813266e-06, + "loss": 2.0034, + "mean_token_accuracy": 0.5206896483898162, + "step": 6715 + }, + { + "epoch": 0.006768455372008207, + "grad_norm": 57.40403145219679, + "learning_rate": 6.768325846544326e-06, + "loss": 2.294, + "mean_token_accuracy": 0.38275861740112305, + "step": 6720 + }, + { + "epoch": 0.00677349142511238, + "grad_norm": 37.16886830025921, + "learning_rate": 6.773361803275386e-06, + "loss": 1.9982, + "mean_token_accuracy": 0.46376285552978513, + "step": 6725 + }, + { + "epoch": 0.006778527478216553, + "grad_norm": 53.48813957852404, + "learning_rate": 6.778397760006447e-06, + "loss": 2.5181, + "mean_token_accuracy": 0.4206896543502808, + "step": 6730 + }, + { + "epoch": 0.006783563531320725, + "grad_norm": 65.47277696014064, + "learning_rate": 6.783433716737506e-06, + "loss": 2.091, + "mean_token_accuracy": 0.47241378426551817, + "step": 6735 + }, + { + "epoch": 0.0067885995844248976, + "grad_norm": 56.640810367366534, + "learning_rate": 6.788469673468566e-06, + "loss": 2.2398, + "mean_token_accuracy": 0.4586206912994385, + "step": 6740 + }, + { + "epoch": 0.0067936356375290705, + "grad_norm": 47.60842460570875, + "learning_rate": 6.793505630199626e-06, + "loss": 2.3848, + "mean_token_accuracy": 0.4188747704029083, + "step": 6745 + }, + { + "epoch": 0.006798671690633243, + "grad_norm": 46.917684100079526, + "learning_rate": 6.798541586930686e-06, + "loss": 2.3391, + "mean_token_accuracy": 0.42758620381355283, + "step": 6750 + }, + { + "epoch": 0.006803707743737416, + "grad_norm": 60.644771256802855, + "learning_rate": 6.803577543661744e-06, + "loss": 2.4641, + "mean_token_accuracy": 0.43103447556495667, + "step": 6755 + }, + { + "epoch": 0.006808743796841589, + "grad_norm": 62.82804240875024, + "learning_rate": 6.8086135003928045e-06, + "loss": 2.2765, + "mean_token_accuracy": 0.46551724076271056, + "step": 6760 + }, + { + "epoch": 0.006813779849945762, + "grad_norm": 40.53424666995209, + "learning_rate": 6.813649457123865e-06, + "loss": 2.3878, + "mean_token_accuracy": 0.4344827651977539, + "step": 6765 + }, + { + "epoch": 0.006818815903049934, + "grad_norm": 40.35131518017455, + "learning_rate": 6.818685413854925e-06, + "loss": 1.9425, + "mean_token_accuracy": 0.4862069010734558, + "step": 6770 + }, + { + "epoch": 0.006823851956154107, + "grad_norm": 51.78643046388614, + "learning_rate": 6.823721370585984e-06, + "loss": 1.9515, + "mean_token_accuracy": 0.4980641186237335, + "step": 6775 + }, + { + "epoch": 0.00682888800925828, + "grad_norm": 43.96695647448754, + "learning_rate": 6.828757327317044e-06, + "loss": 2.4561, + "mean_token_accuracy": 0.41379310488700866, + "step": 6780 + }, + { + "epoch": 0.006833924062362453, + "grad_norm": 63.86217062717558, + "learning_rate": 6.833793284048104e-06, + "loss": 2.6298, + "mean_token_accuracy": 0.3620689630508423, + "step": 6785 + }, + { + "epoch": 0.006838960115466626, + "grad_norm": 36.578459286712544, + "learning_rate": 6.838829240779164e-06, + "loss": 2.173, + "mean_token_accuracy": 0.4862068951129913, + "step": 6790 + }, + { + "epoch": 0.006843996168570799, + "grad_norm": 64.08570577875474, + "learning_rate": 6.843865197510223e-06, + "loss": 2.2524, + "mean_token_accuracy": 0.47065940499305725, + "step": 6795 + }, + { + "epoch": 0.0068490322216749715, + "grad_norm": 35.99027574878657, + "learning_rate": 6.848901154241283e-06, + "loss": 1.9677, + "mean_token_accuracy": 0.5103448331356049, + "step": 6800 + }, + { + "epoch": 0.0068540682747791435, + "grad_norm": 50.82366506340916, + "learning_rate": 6.853937110972343e-06, + "loss": 2.6616, + "mean_token_accuracy": 0.4344827651977539, + "step": 6805 + }, + { + "epoch": 0.006859104327883316, + "grad_norm": 56.74084504843212, + "learning_rate": 6.858973067703403e-06, + "loss": 2.5657, + "mean_token_accuracy": 0.4310344815254211, + "step": 6810 + }, + { + "epoch": 0.006864140380987489, + "grad_norm": 54.51289033212468, + "learning_rate": 6.864009024434462e-06, + "loss": 2.3658, + "mean_token_accuracy": 0.4448275864124298, + "step": 6815 + }, + { + "epoch": 0.006869176434091662, + "grad_norm": 46.783396914734425, + "learning_rate": 6.869044981165522e-06, + "loss": 2.5182, + "mean_token_accuracy": 0.41034482717514037, + "step": 6820 + }, + { + "epoch": 0.006874212487195835, + "grad_norm": 79.28297437408888, + "learning_rate": 6.874080937896582e-06, + "loss": 2.2713, + "mean_token_accuracy": 0.4334543228149414, + "step": 6825 + }, + { + "epoch": 0.006879248540300008, + "grad_norm": 45.494765448083164, + "learning_rate": 6.8791168946276425e-06, + "loss": 2.4228, + "mean_token_accuracy": 0.3999999940395355, + "step": 6830 + }, + { + "epoch": 0.006884284593404181, + "grad_norm": 55.301649930743146, + "learning_rate": 6.884152851358701e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.4103448212146759, + "step": 6835 + }, + { + "epoch": 0.006889320646508353, + "grad_norm": 44.22728771278015, + "learning_rate": 6.889188808089761e-06, + "loss": 1.9686, + "mean_token_accuracy": 0.4931034445762634, + "step": 6840 + }, + { + "epoch": 0.006894356699612526, + "grad_norm": 48.12891439657761, + "learning_rate": 6.894224764820821e-06, + "loss": 2.4267, + "mean_token_accuracy": 0.4034482777118683, + "step": 6845 + }, + { + "epoch": 0.006899392752716699, + "grad_norm": 43.82322937757349, + "learning_rate": 6.899260721551881e-06, + "loss": 2.5481, + "mean_token_accuracy": 0.42758620977401735, + "step": 6850 + }, + { + "epoch": 0.006904428805820872, + "grad_norm": 41.232169674165256, + "learning_rate": 6.90429667828294e-06, + "loss": 2.0909, + "mean_token_accuracy": 0.4517241418361664, + "step": 6855 + }, + { + "epoch": 0.0069094648589250445, + "grad_norm": 39.12170262843131, + "learning_rate": 6.909332635014e-06, + "loss": 2.2639, + "mean_token_accuracy": 0.42413793206214906, + "step": 6860 + }, + { + "epoch": 0.006914500912029217, + "grad_norm": 44.177066952309005, + "learning_rate": 6.91436859174506e-06, + "loss": 2.1092, + "mean_token_accuracy": 0.49999999403953554, + "step": 6865 + }, + { + "epoch": 0.00691953696513339, + "grad_norm": 35.88057306583056, + "learning_rate": 6.91940454847612e-06, + "loss": 2.3898, + "mean_token_accuracy": 0.41724138259887694, + "step": 6870 + }, + { + "epoch": 0.006924573018237562, + "grad_norm": 46.648218080915434, + "learning_rate": 6.924440505207179e-06, + "loss": 2.2594, + "mean_token_accuracy": 0.4413793087005615, + "step": 6875 + }, + { + "epoch": 0.006929609071341735, + "grad_norm": 46.64638252103064, + "learning_rate": 6.929476461938239e-06, + "loss": 2.5701, + "mean_token_accuracy": 0.3724137932062149, + "step": 6880 + }, + { + "epoch": 0.006934645124445908, + "grad_norm": 60.967390761094904, + "learning_rate": 6.934512418669299e-06, + "loss": 2.4825, + "mean_token_accuracy": 0.39655172228813174, + "step": 6885 + }, + { + "epoch": 0.006939681177550081, + "grad_norm": 42.448747225099766, + "learning_rate": 6.9395483754003595e-06, + "loss": 2.289, + "mean_token_accuracy": 0.41724138259887694, + "step": 6890 + }, + { + "epoch": 0.006944717230654254, + "grad_norm": 49.65101480148005, + "learning_rate": 6.944584332131418e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.48275861144065857, + "step": 6895 + }, + { + "epoch": 0.006949753283758427, + "grad_norm": 48.59491465201465, + "learning_rate": 6.949620288862478e-06, + "loss": 2.4128, + "mean_token_accuracy": 0.4517241358757019, + "step": 6900 + }, + { + "epoch": 0.0069547893368626, + "grad_norm": 48.388399846741024, + "learning_rate": 6.954656245593538e-06, + "loss": 2.4457, + "mean_token_accuracy": 0.42068964838981626, + "step": 6905 + }, + { + "epoch": 0.006959825389966772, + "grad_norm": 56.41617185193928, + "learning_rate": 6.959692202324598e-06, + "loss": 2.3499, + "mean_token_accuracy": 0.45862069725990295, + "step": 6910 + }, + { + "epoch": 0.006964861443070945, + "grad_norm": 50.51187410921404, + "learning_rate": 6.9647281590556574e-06, + "loss": 2.059, + "mean_token_accuracy": 0.49655171632766726, + "step": 6915 + }, + { + "epoch": 0.0069698974961751176, + "grad_norm": 85.11663788352037, + "learning_rate": 6.9697641157867175e-06, + "loss": 2.2402, + "mean_token_accuracy": 0.42607380747795104, + "step": 6920 + }, + { + "epoch": 0.0069749335492792905, + "grad_norm": 41.13440245696484, + "learning_rate": 6.974800072517778e-06, + "loss": 2.5678, + "mean_token_accuracy": 0.3793103456497192, + "step": 6925 + }, + { + "epoch": 0.006979969602383463, + "grad_norm": 61.38473172336067, + "learning_rate": 6.979836029248838e-06, + "loss": 2.3286, + "mean_token_accuracy": 0.4034482777118683, + "step": 6930 + }, + { + "epoch": 0.006985005655487636, + "grad_norm": 45.999149982488184, + "learning_rate": 6.984871985979896e-06, + "loss": 2.0911, + "mean_token_accuracy": 0.49999999403953554, + "step": 6935 + }, + { + "epoch": 0.006990041708591809, + "grad_norm": 40.14202405659626, + "learning_rate": 6.989907942710956e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.4430732011795044, + "step": 6940 + }, + { + "epoch": 0.006995077761695981, + "grad_norm": 44.87245761526127, + "learning_rate": 6.994943899442016e-06, + "loss": 2.1066, + "mean_token_accuracy": 0.5034482657909394, + "step": 6945 + }, + { + "epoch": 0.007000113814800154, + "grad_norm": 42.42499090282306, + "learning_rate": 6.9999798561730765e-06, + "loss": 2.3274, + "mean_token_accuracy": 0.4068965494632721, + "step": 6950 + }, + { + "epoch": 0.007005149867904327, + "grad_norm": 40.83366280287029, + "learning_rate": 7.0050158129041366e-06, + "loss": 2.5474, + "mean_token_accuracy": 0.4103448212146759, + "step": 6955 + }, + { + "epoch": 0.0070101859210085, + "grad_norm": 57.589357362046215, + "learning_rate": 7.010051769635195e-06, + "loss": 2.3615, + "mean_token_accuracy": 0.4517241358757019, + "step": 6960 + }, + { + "epoch": 0.007015221974112673, + "grad_norm": 40.936388699541595, + "learning_rate": 7.015087726366255e-06, + "loss": 2.4903, + "mean_token_accuracy": 0.4103448331356049, + "step": 6965 + }, + { + "epoch": 0.007020258027216846, + "grad_norm": 40.902305019210914, + "learning_rate": 7.020123683097315e-06, + "loss": 2.3589, + "mean_token_accuracy": 0.3896551698446274, + "step": 6970 + }, + { + "epoch": 0.007025294080321019, + "grad_norm": 42.38641675691526, + "learning_rate": 7.025159639828375e-06, + "loss": 2.0391, + "mean_token_accuracy": 0.5041871905326843, + "step": 6975 + }, + { + "epoch": 0.007030330133425191, + "grad_norm": 41.51045590162831, + "learning_rate": 7.0301955965594345e-06, + "loss": 2.1452, + "mean_token_accuracy": 0.4862068951129913, + "step": 6980 + }, + { + "epoch": 0.0070353661865293635, + "grad_norm": 35.919549653825314, + "learning_rate": 7.035231553290495e-06, + "loss": 2.1058, + "mean_token_accuracy": 0.44482758045196535, + "step": 6985 + }, + { + "epoch": 0.007040402239633536, + "grad_norm": 42.620218788263536, + "learning_rate": 7.040267510021555e-06, + "loss": 2.2592, + "mean_token_accuracy": 0.458620673418045, + "step": 6990 + }, + { + "epoch": 0.007045438292737709, + "grad_norm": 42.64000935045413, + "learning_rate": 7.045303466752615e-06, + "loss": 2.1976, + "mean_token_accuracy": 0.4413793087005615, + "step": 6995 + }, + { + "epoch": 0.007050474345841882, + "grad_norm": 52.48581687340867, + "learning_rate": 7.050339423483673e-06, + "loss": 2.2135, + "mean_token_accuracy": 0.4689655125141144, + "step": 7000 + }, + { + "epoch": 0.007055510398946055, + "grad_norm": 41.59638511429294, + "learning_rate": 7.055375380214733e-06, + "loss": 2.4292, + "mean_token_accuracy": 0.4, + "step": 7005 + }, + { + "epoch": 0.007060546452050228, + "grad_norm": 43.87306165002983, + "learning_rate": 7.060411336945793e-06, + "loss": 2.3456, + "mean_token_accuracy": 0.45329703092575074, + "step": 7010 + }, + { + "epoch": 0.0070655825051544, + "grad_norm": 50.80426034621016, + "learning_rate": 7.0654472936768535e-06, + "loss": 2.154, + "mean_token_accuracy": 0.48620688915252686, + "step": 7015 + }, + { + "epoch": 0.007070618558258573, + "grad_norm": 61.30114396391118, + "learning_rate": 7.070483250407913e-06, + "loss": 2.7253, + "mean_token_accuracy": 0.3827586233615875, + "step": 7020 + }, + { + "epoch": 0.007075654611362746, + "grad_norm": 47.43481055790515, + "learning_rate": 7.075519207138973e-06, + "loss": 2.1718, + "mean_token_accuracy": 0.458620685338974, + "step": 7025 + }, + { + "epoch": 0.007080690664466919, + "grad_norm": 37.75871150231663, + "learning_rate": 7.080555163870033e-06, + "loss": 2.2518, + "mean_token_accuracy": 0.4482758641242981, + "step": 7030 + }, + { + "epoch": 0.007085726717571092, + "grad_norm": 41.71150169854353, + "learning_rate": 7.085591120601093e-06, + "loss": 2.3305, + "mean_token_accuracy": 0.47241378426551817, + "step": 7035 + }, + { + "epoch": 0.0070907627706752645, + "grad_norm": 54.28638817606991, + "learning_rate": 7.0906270773321515e-06, + "loss": 2.3878, + "mean_token_accuracy": 0.4241379380226135, + "step": 7040 + }, + { + "epoch": 0.007095798823779437, + "grad_norm": 54.87041248299273, + "learning_rate": 7.0956630340632116e-06, + "loss": 2.0312, + "mean_token_accuracy": 0.4851179659366608, + "step": 7045 + }, + { + "epoch": 0.0071008348768836094, + "grad_norm": 46.303498100930106, + "learning_rate": 7.100698990794272e-06, + "loss": 2.5216, + "mean_token_accuracy": 0.37241379618644715, + "step": 7050 + }, + { + "epoch": 0.007105870929987782, + "grad_norm": 63.59682745721162, + "learning_rate": 7.105734947525332e-06, + "loss": 2.6238, + "mean_token_accuracy": 0.3793103456497192, + "step": 7055 + }, + { + "epoch": 0.007110906983091955, + "grad_norm": 60.02220891170065, + "learning_rate": 7.11077090425639e-06, + "loss": 2.1847, + "mean_token_accuracy": 0.4344827592372894, + "step": 7060 + }, + { + "epoch": 0.007115943036196128, + "grad_norm": 34.54793210198037, + "learning_rate": 7.11580686098745e-06, + "loss": 2.1355, + "mean_token_accuracy": 0.47416818141937256, + "step": 7065 + }, + { + "epoch": 0.007120979089300301, + "grad_norm": 35.76723916001373, + "learning_rate": 7.12084281771851e-06, + "loss": 2.3813, + "mean_token_accuracy": 0.45517241954803467, + "step": 7070 + }, + { + "epoch": 0.007126015142404474, + "grad_norm": 34.621456064436906, + "learning_rate": 7.1258787744495705e-06, + "loss": 2.2343, + "mean_token_accuracy": 0.4344827592372894, + "step": 7075 + }, + { + "epoch": 0.007131051195508647, + "grad_norm": 55.39100176217363, + "learning_rate": 7.13091473118063e-06, + "loss": 2.5818, + "mean_token_accuracy": 0.44827585816383364, + "step": 7080 + }, + { + "epoch": 0.007136087248612819, + "grad_norm": 38.870365812464414, + "learning_rate": 7.13595068791169e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.43793103098869324, + "step": 7085 + }, + { + "epoch": 0.007141123301716992, + "grad_norm": 50.11195221501107, + "learning_rate": 7.14098664464275e-06, + "loss": 2.0391, + "mean_token_accuracy": 0.48275861144065857, + "step": 7090 + }, + { + "epoch": 0.007146159354821165, + "grad_norm": 42.72960315470217, + "learning_rate": 7.14602260137381e-06, + "loss": 2.4855, + "mean_token_accuracy": 0.42413792610168455, + "step": 7095 + }, + { + "epoch": 0.0071511954079253376, + "grad_norm": 51.15688743407462, + "learning_rate": 7.1510585581048684e-06, + "loss": 2.8293, + "mean_token_accuracy": 0.3689655065536499, + "step": 7100 + }, + { + "epoch": 0.0071562314610295105, + "grad_norm": 36.360669140435384, + "learning_rate": 7.1560945148359285e-06, + "loss": 2.4846, + "mean_token_accuracy": 0.4310344815254211, + "step": 7105 + }, + { + "epoch": 0.007161267514133683, + "grad_norm": 44.930176568604665, + "learning_rate": 7.161130471566989e-06, + "loss": 2.2118, + "mean_token_accuracy": 0.42758620977401735, + "step": 7110 + }, + { + "epoch": 0.007166303567237855, + "grad_norm": 48.940881453163314, + "learning_rate": 7.166166428298049e-06, + "loss": 2.6227, + "mean_token_accuracy": 0.3896551728248596, + "step": 7115 + }, + { + "epoch": 0.007171339620342028, + "grad_norm": 36.751809186726064, + "learning_rate": 7.171202385029108e-06, + "loss": 2.1878, + "mean_token_accuracy": 0.48620688915252686, + "step": 7120 + }, + { + "epoch": 0.007176375673446201, + "grad_norm": 49.89972201755115, + "learning_rate": 7.176238341760168e-06, + "loss": 2.3019, + "mean_token_accuracy": 0.42758620381355283, + "step": 7125 + }, + { + "epoch": 0.007181411726550374, + "grad_norm": 36.71295017293243, + "learning_rate": 7.181274298491228e-06, + "loss": 1.9507, + "mean_token_accuracy": 0.510344821214676, + "step": 7130 + }, + { + "epoch": 0.007186447779654547, + "grad_norm": 69.97056559126018, + "learning_rate": 7.186310255222288e-06, + "loss": 2.3335, + "mean_token_accuracy": 0.4448275864124298, + "step": 7135 + }, + { + "epoch": 0.00719148383275872, + "grad_norm": 63.53009980885941, + "learning_rate": 7.191346211953347e-06, + "loss": 2.3054, + "mean_token_accuracy": 0.43103447556495667, + "step": 7140 + }, + { + "epoch": 0.007196519885862893, + "grad_norm": 35.44089407594087, + "learning_rate": 7.196382168684407e-06, + "loss": 2.3375, + "mean_token_accuracy": 0.4413793087005615, + "step": 7145 + }, + { + "epoch": 0.007201555938967065, + "grad_norm": 41.84611359776772, + "learning_rate": 7.201418125415467e-06, + "loss": 2.3276, + "mean_token_accuracy": 0.4310344815254211, + "step": 7150 + }, + { + "epoch": 0.007206591992071238, + "grad_norm": 44.616005258217434, + "learning_rate": 7.206454082146527e-06, + "loss": 2.3455, + "mean_token_accuracy": 0.43103447556495667, + "step": 7155 + }, + { + "epoch": 0.007211628045175411, + "grad_norm": 39.730617227637744, + "learning_rate": 7.211490038877585e-06, + "loss": 2.1367, + "mean_token_accuracy": 0.4344827592372894, + "step": 7160 + }, + { + "epoch": 0.0072166640982795835, + "grad_norm": 51.97990611504933, + "learning_rate": 7.2165259956086455e-06, + "loss": 2.254, + "mean_token_accuracy": 0.43448275327682495, + "step": 7165 + }, + { + "epoch": 0.007221700151383756, + "grad_norm": 45.32075821540736, + "learning_rate": 7.221561952339706e-06, + "loss": 2.6748, + "mean_token_accuracy": 0.37586206793785093, + "step": 7170 + }, + { + "epoch": 0.007226736204487929, + "grad_norm": 53.216386937354066, + "learning_rate": 7.226597909070766e-06, + "loss": 2.2826, + "mean_token_accuracy": 0.4689655125141144, + "step": 7175 + }, + { + "epoch": 0.007231772257592102, + "grad_norm": 45.460423208274044, + "learning_rate": 7.231633865801825e-06, + "loss": 2.1541, + "mean_token_accuracy": 0.441379314661026, + "step": 7180 + }, + { + "epoch": 0.007236808310696274, + "grad_norm": 42.53052807470696, + "learning_rate": 7.236669822532885e-06, + "loss": 2.208, + "mean_token_accuracy": 0.4758620738983154, + "step": 7185 + }, + { + "epoch": 0.007241844363800447, + "grad_norm": 36.42312803969594, + "learning_rate": 7.241705779263945e-06, + "loss": 2.287, + "mean_token_accuracy": 0.4551724076271057, + "step": 7190 + }, + { + "epoch": 0.00724688041690462, + "grad_norm": 66.7069081742678, + "learning_rate": 7.246741735995005e-06, + "loss": 2.0455, + "mean_token_accuracy": 0.4689655125141144, + "step": 7195 + }, + { + "epoch": 0.007251916470008793, + "grad_norm": 45.309369072802916, + "learning_rate": 7.251777692726064e-06, + "loss": 2.5352, + "mean_token_accuracy": 0.3896551728248596, + "step": 7200 + }, + { + "epoch": 0.007256952523112966, + "grad_norm": 60.952399529324026, + "learning_rate": 7.256813649457124e-06, + "loss": 2.4926, + "mean_token_accuracy": 0.42238354682922363, + "step": 7205 + }, + { + "epoch": 0.007261988576217139, + "grad_norm": 49.81871048491977, + "learning_rate": 7.261849606188184e-06, + "loss": 2.2722, + "mean_token_accuracy": 0.4241379380226135, + "step": 7210 + }, + { + "epoch": 0.007267024629321312, + "grad_norm": 47.851171879008675, + "learning_rate": 7.266885562919244e-06, + "loss": 2.2262, + "mean_token_accuracy": 0.4551724076271057, + "step": 7215 + }, + { + "epoch": 0.007272060682425484, + "grad_norm": 48.03082363527323, + "learning_rate": 7.271921519650303e-06, + "loss": 2.3253, + "mean_token_accuracy": 0.42413793206214906, + "step": 7220 + }, + { + "epoch": 0.0072770967355296565, + "grad_norm": 42.18251540875435, + "learning_rate": 7.276957476381363e-06, + "loss": 2.0987, + "mean_token_accuracy": 0.47586206793785096, + "step": 7225 + }, + { + "epoch": 0.0072821327886338294, + "grad_norm": 51.33179474610389, + "learning_rate": 7.281993433112423e-06, + "loss": 2.4539, + "mean_token_accuracy": 0.4448275864124298, + "step": 7230 + }, + { + "epoch": 0.007287168841738002, + "grad_norm": 49.01537882849991, + "learning_rate": 7.2870293898434835e-06, + "loss": 2.2662, + "mean_token_accuracy": 0.42758620381355283, + "step": 7235 + }, + { + "epoch": 0.007292204894842175, + "grad_norm": 46.04061801252075, + "learning_rate": 7.292065346574542e-06, + "loss": 2.0704, + "mean_token_accuracy": 0.5539279341697693, + "step": 7240 + }, + { + "epoch": 0.007297240947946348, + "grad_norm": 87.56177636085452, + "learning_rate": 7.297101303305602e-06, + "loss": 2.3001, + "mean_token_accuracy": 0.47586206197738645, + "step": 7245 + }, + { + "epoch": 0.007302277001050521, + "grad_norm": 63.621548698017534, + "learning_rate": 7.302137260036662e-06, + "loss": 2.1936, + "mean_token_accuracy": 0.4655172348022461, + "step": 7250 + }, + { + "epoch": 0.007307313054154693, + "grad_norm": 46.414432950132, + "learning_rate": 7.307173216767722e-06, + "loss": 2.5565, + "mean_token_accuracy": 0.37543859481811526, + "step": 7255 + }, + { + "epoch": 0.007312349107258866, + "grad_norm": 38.9196822916388, + "learning_rate": 7.312209173498781e-06, + "loss": 2.3082, + "mean_token_accuracy": 0.417241370677948, + "step": 7260 + }, + { + "epoch": 0.007317385160363039, + "grad_norm": 42.048293480324, + "learning_rate": 7.3172451302298416e-06, + "loss": 2.4473, + "mean_token_accuracy": 0.4172413766384125, + "step": 7265 + }, + { + "epoch": 0.007322421213467212, + "grad_norm": 44.41952114574183, + "learning_rate": 7.322281086960902e-06, + "loss": 2.6062, + "mean_token_accuracy": 0.4103448331356049, + "step": 7270 + }, + { + "epoch": 0.007327457266571385, + "grad_norm": 48.99043258135825, + "learning_rate": 7.327317043691962e-06, + "loss": 2.1611, + "mean_token_accuracy": 0.4758620738983154, + "step": 7275 + }, + { + "epoch": 0.0073324933196755575, + "grad_norm": 35.01901562228969, + "learning_rate": 7.33235300042302e-06, + "loss": 2.153, + "mean_token_accuracy": 0.46551724076271056, + "step": 7280 + }, + { + "epoch": 0.0073375293727797304, + "grad_norm": 29.19362158498308, + "learning_rate": 7.33738895715408e-06, + "loss": 1.9198, + "mean_token_accuracy": 0.5448275744915009, + "step": 7285 + }, + { + "epoch": 0.0073425654258839025, + "grad_norm": 53.34989488039861, + "learning_rate": 7.34242491388514e-06, + "loss": 2.5123, + "mean_token_accuracy": 0.4206896543502808, + "step": 7290 + }, + { + "epoch": 0.007347601478988075, + "grad_norm": 61.67489299187737, + "learning_rate": 7.3474608706162005e-06, + "loss": 2.3752, + "mean_token_accuracy": 0.42758620381355283, + "step": 7295 + }, + { + "epoch": 0.007352637532092248, + "grad_norm": 40.30318813251888, + "learning_rate": 7.352496827347259e-06, + "loss": 2.267, + "mean_token_accuracy": 0.4310344815254211, + "step": 7300 + }, + { + "epoch": 0.007357673585196421, + "grad_norm": 57.08723519385807, + "learning_rate": 7.357532784078319e-06, + "loss": 2.3227, + "mean_token_accuracy": 0.43986691236495973, + "step": 7305 + }, + { + "epoch": 0.007362709638300594, + "grad_norm": 75.57077908588624, + "learning_rate": 7.362568740809379e-06, + "loss": 2.0686, + "mean_token_accuracy": 0.48124622106552123, + "step": 7310 + }, + { + "epoch": 0.007367745691404767, + "grad_norm": 114.78906317035006, + "learning_rate": 7.367604697540439e-06, + "loss": 2.1749, + "mean_token_accuracy": 0.46896551847457885, + "step": 7315 + }, + { + "epoch": 0.00737278174450894, + "grad_norm": 86.06727225037893, + "learning_rate": 7.3726406542714984e-06, + "loss": 2.0564, + "mean_token_accuracy": 0.5137930929660797, + "step": 7320 + }, + { + "epoch": 0.007377817797613112, + "grad_norm": 55.05800807458252, + "learning_rate": 7.3776766110025585e-06, + "loss": 2.2081, + "mean_token_accuracy": 0.44482759237289426, + "step": 7325 + }, + { + "epoch": 0.007382853850717285, + "grad_norm": 41.05143809129591, + "learning_rate": 7.382712567733619e-06, + "loss": 2.4455, + "mean_token_accuracy": 0.43448275327682495, + "step": 7330 + }, + { + "epoch": 0.007387889903821458, + "grad_norm": 58.9581671994853, + "learning_rate": 7.387748524464679e-06, + "loss": 2.3688, + "mean_token_accuracy": 0.4034482717514038, + "step": 7335 + }, + { + "epoch": 0.007392925956925631, + "grad_norm": 54.62953796419114, + "learning_rate": 7.392784481195737e-06, + "loss": 2.2827, + "mean_token_accuracy": 0.4655172348022461, + "step": 7340 + }, + { + "epoch": 0.0073979620100298035, + "grad_norm": 52.97286611481698, + "learning_rate": 7.397820437926797e-06, + "loss": 2.1995, + "mean_token_accuracy": 0.4206896543502808, + "step": 7345 + }, + { + "epoch": 0.007402998063133976, + "grad_norm": 55.79868702350211, + "learning_rate": 7.402856394657857e-06, + "loss": 2.4488, + "mean_token_accuracy": 0.4517241358757019, + "step": 7350 + }, + { + "epoch": 0.007408034116238149, + "grad_norm": 45.48915047364951, + "learning_rate": 7.4078923513889174e-06, + "loss": 2.1298, + "mean_token_accuracy": 0.47586206793785096, + "step": 7355 + }, + { + "epoch": 0.007413070169342321, + "grad_norm": 48.04390928096492, + "learning_rate": 7.412928308119977e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.4206896543502808, + "step": 7360 + }, + { + "epoch": 0.007418106222446494, + "grad_norm": 38.62262242716692, + "learning_rate": 7.417964264851037e-06, + "loss": 2.4254, + "mean_token_accuracy": 0.4517241418361664, + "step": 7365 + }, + { + "epoch": 0.007423142275550667, + "grad_norm": 38.74466594304998, + "learning_rate": 7.423000221582097e-06, + "loss": 2.3334, + "mean_token_accuracy": 0.41724138259887694, + "step": 7370 + }, + { + "epoch": 0.00742817832865484, + "grad_norm": 45.456584143004314, + "learning_rate": 7.428036178313157e-06, + "loss": 2.5147, + "mean_token_accuracy": 0.3896551728248596, + "step": 7375 + }, + { + "epoch": 0.007433214381759013, + "grad_norm": 44.209083090275534, + "learning_rate": 7.433072135044217e-06, + "loss": 2.3097, + "mean_token_accuracy": 0.4448275864124298, + "step": 7380 + }, + { + "epoch": 0.007438250434863186, + "grad_norm": 48.190024803535884, + "learning_rate": 7.4381080917752755e-06, + "loss": 2.134, + "mean_token_accuracy": 0.493103438615799, + "step": 7385 + }, + { + "epoch": 0.007443286487967359, + "grad_norm": 47.13081119078609, + "learning_rate": 7.443144048506336e-06, + "loss": 2.8193, + "mean_token_accuracy": 0.4034482717514038, + "step": 7390 + }, + { + "epoch": 0.007448322541071531, + "grad_norm": 32.16926280846503, + "learning_rate": 7.448180005237396e-06, + "loss": 2.3151, + "mean_token_accuracy": 0.4551724135875702, + "step": 7395 + }, + { + "epoch": 0.007453358594175704, + "grad_norm": 52.59528993764966, + "learning_rate": 7.453215961968456e-06, + "loss": 2.7566, + "mean_token_accuracy": 0.38620689511299133, + "step": 7400 + }, + { + "epoch": 0.0074583946472798765, + "grad_norm": 49.81781580297935, + "learning_rate": 7.458251918699514e-06, + "loss": 2.4321, + "mean_token_accuracy": 0.42758620381355283, + "step": 7405 + }, + { + "epoch": 0.007463430700384049, + "grad_norm": 62.28853719820733, + "learning_rate": 7.463287875430574e-06, + "loss": 2.3217, + "mean_token_accuracy": 0.4379310369491577, + "step": 7410 + }, + { + "epoch": 0.007468466753488222, + "grad_norm": 51.31986401745938, + "learning_rate": 7.468323832161634e-06, + "loss": 2.4849, + "mean_token_accuracy": 0.3896551638841629, + "step": 7415 + }, + { + "epoch": 0.007473502806592395, + "grad_norm": 48.378725380322784, + "learning_rate": 7.4733597888926945e-06, + "loss": 2.0426, + "mean_token_accuracy": 0.47586206197738645, + "step": 7420 + }, + { + "epoch": 0.007478538859696568, + "grad_norm": 53.47077832783121, + "learning_rate": 7.478395745623754e-06, + "loss": 2.244, + "mean_token_accuracy": 0.49655172824859617, + "step": 7425 + }, + { + "epoch": 0.00748357491280074, + "grad_norm": 56.998836287754415, + "learning_rate": 7.483431702354814e-06, + "loss": 2.4785, + "mean_token_accuracy": 0.3862069010734558, + "step": 7430 + }, + { + "epoch": 0.007488610965904913, + "grad_norm": 46.11160333115532, + "learning_rate": 7.488467659085874e-06, + "loss": 2.4127, + "mean_token_accuracy": 0.441379314661026, + "step": 7435 + }, + { + "epoch": 0.007493647019009086, + "grad_norm": 58.27831726409421, + "learning_rate": 7.493503615816934e-06, + "loss": 2.4934, + "mean_token_accuracy": 0.4103448331356049, + "step": 7440 + }, + { + "epoch": 0.007498683072113259, + "grad_norm": 40.930010813898996, + "learning_rate": 7.4985395725479925e-06, + "loss": 2.5607, + "mean_token_accuracy": 0.37241379618644715, + "step": 7445 + }, + { + "epoch": 0.007503719125217432, + "grad_norm": 66.3185144914046, + "learning_rate": 7.5035755292790526e-06, + "loss": 2.4734, + "mean_token_accuracy": 0.4034482777118683, + "step": 7450 + }, + { + "epoch": 0.007508755178321605, + "grad_norm": 33.60927099771563, + "learning_rate": 7.508611486010113e-06, + "loss": 2.2858, + "mean_token_accuracy": 0.4275861978530884, + "step": 7455 + }, + { + "epoch": 0.0075137912314257775, + "grad_norm": 51.45387795202174, + "learning_rate": 7.513647442741173e-06, + "loss": 2.2988, + "mean_token_accuracy": 0.4448275864124298, + "step": 7460 + }, + { + "epoch": 0.00751882728452995, + "grad_norm": 49.88928089858543, + "learning_rate": 7.518683399472232e-06, + "loss": 2.1042, + "mean_token_accuracy": 0.5103448331356049, + "step": 7465 + }, + { + "epoch": 0.0075238633376341225, + "grad_norm": 42.43795352052936, + "learning_rate": 7.523719356203292e-06, + "loss": 2.0689, + "mean_token_accuracy": 0.4724137902259827, + "step": 7470 + }, + { + "epoch": 0.007528899390738295, + "grad_norm": 42.86491293975076, + "learning_rate": 7.528755312934352e-06, + "loss": 2.1156, + "mean_token_accuracy": 0.4862069070339203, + "step": 7475 + }, + { + "epoch": 0.007533935443842468, + "grad_norm": 46.90900950426308, + "learning_rate": 7.533791269665412e-06, + "loss": 2.508, + "mean_token_accuracy": 0.417241370677948, + "step": 7480 + }, + { + "epoch": 0.007538971496946641, + "grad_norm": 49.16215530452209, + "learning_rate": 7.538827226396471e-06, + "loss": 2.3843, + "mean_token_accuracy": 0.458620685338974, + "step": 7485 + }, + { + "epoch": 0.007544007550050814, + "grad_norm": 39.59053964805687, + "learning_rate": 7.543863183127531e-06, + "loss": 2.1863, + "mean_token_accuracy": 0.4758620738983154, + "step": 7490 + }, + { + "epoch": 0.007549043603154987, + "grad_norm": 42.8136710568869, + "learning_rate": 7.548899139858591e-06, + "loss": 2.3064, + "mean_token_accuracy": 0.441379314661026, + "step": 7495 + }, + { + "epoch": 0.007554079656259159, + "grad_norm": 43.91565828053337, + "learning_rate": 7.553935096589651e-06, + "loss": 2.6552, + "mean_token_accuracy": 0.39655172228813174, + "step": 7500 + }, + { + "epoch": 0.007559115709363332, + "grad_norm": 42.645135244946374, + "learning_rate": 7.5589710533207094e-06, + "loss": 2.1104, + "mean_token_accuracy": 0.47931033968925474, + "step": 7505 + }, + { + "epoch": 0.007564151762467505, + "grad_norm": 46.96750923330262, + "learning_rate": 7.5640070100517695e-06, + "loss": 2.41, + "mean_token_accuracy": 0.37586206793785093, + "step": 7510 + }, + { + "epoch": 0.007569187815571678, + "grad_norm": 59.30592769963364, + "learning_rate": 7.56904296678283e-06, + "loss": 2.2797, + "mean_token_accuracy": 0.45517240166664125, + "step": 7515 + }, + { + "epoch": 0.007574223868675851, + "grad_norm": 43.34814182854512, + "learning_rate": 7.57407892351389e-06, + "loss": 2.3341, + "mean_token_accuracy": 0.47241380214691164, + "step": 7520 + }, + { + "epoch": 0.0075792599217800235, + "grad_norm": 39.766194807364954, + "learning_rate": 7.579114880244949e-06, + "loss": 1.9177, + "mean_token_accuracy": 0.5137931048870087, + "step": 7525 + }, + { + "epoch": 0.007584295974884196, + "grad_norm": 47.52299872328629, + "learning_rate": 7.584150836976009e-06, + "loss": 2.5546, + "mean_token_accuracy": 0.3793103456497192, + "step": 7530 + }, + { + "epoch": 0.007589332027988368, + "grad_norm": 42.412062957608335, + "learning_rate": 7.589186793707069e-06, + "loss": 2.3719, + "mean_token_accuracy": 0.4034482777118683, + "step": 7535 + }, + { + "epoch": 0.007594368081092541, + "grad_norm": 58.95088711549987, + "learning_rate": 7.594222750438129e-06, + "loss": 2.3328, + "mean_token_accuracy": 0.4261342942714691, + "step": 7540 + }, + { + "epoch": 0.007599404134196714, + "grad_norm": 50.84044022227517, + "learning_rate": 7.599258707169188e-06, + "loss": 2.2715, + "mean_token_accuracy": 0.4620689570903778, + "step": 7545 + }, + { + "epoch": 0.007604440187300887, + "grad_norm": 32.41353159043417, + "learning_rate": 7.604294663900248e-06, + "loss": 2.0957, + "mean_token_accuracy": 0.47586206793785096, + "step": 7550 + }, + { + "epoch": 0.00760947624040506, + "grad_norm": 53.91798170282348, + "learning_rate": 7.609330620631308e-06, + "loss": 2.2431, + "mean_token_accuracy": 0.4190562665462494, + "step": 7555 + }, + { + "epoch": 0.007614512293509233, + "grad_norm": 41.611142788557196, + "learning_rate": 7.614366577362368e-06, + "loss": 2.2547, + "mean_token_accuracy": 0.4379310369491577, + "step": 7560 + }, + { + "epoch": 0.007619548346613405, + "grad_norm": 51.2639911083754, + "learning_rate": 7.619402534093427e-06, + "loss": 2.3106, + "mean_token_accuracy": 0.44658197164535524, + "step": 7565 + }, + { + "epoch": 0.007624584399717578, + "grad_norm": 52.95813443200162, + "learning_rate": 7.624438490824487e-06, + "loss": 2.633, + "mean_token_accuracy": 0.3931034505367279, + "step": 7570 + }, + { + "epoch": 0.007629620452821751, + "grad_norm": 50.922433698370426, + "learning_rate": 7.629474447555547e-06, + "loss": 2.1826, + "mean_token_accuracy": 0.4448275864124298, + "step": 7575 + }, + { + "epoch": 0.007634656505925924, + "grad_norm": 43.28774317244623, + "learning_rate": 7.634510404286607e-06, + "loss": 2.3969, + "mean_token_accuracy": 0.4431941986083984, + "step": 7580 + }, + { + "epoch": 0.0076396925590300965, + "grad_norm": 70.09333539919614, + "learning_rate": 7.639546361017666e-06, + "loss": 2.1912, + "mean_token_accuracy": 0.46551724672317507, + "step": 7585 + }, + { + "epoch": 0.007644728612134269, + "grad_norm": 35.64048317441706, + "learning_rate": 7.644582317748727e-06, + "loss": 2.0953, + "mean_token_accuracy": 0.45517241954803467, + "step": 7590 + }, + { + "epoch": 0.007649764665238442, + "grad_norm": 38.46646477447764, + "learning_rate": 7.649618274479786e-06, + "loss": 2.1526, + "mean_token_accuracy": 0.5034482717514038, + "step": 7595 + }, + { + "epoch": 0.007654800718342614, + "grad_norm": 41.14308446109098, + "learning_rate": 7.654654231210847e-06, + "loss": 2.4023, + "mean_token_accuracy": 0.4344827592372894, + "step": 7600 + }, + { + "epoch": 0.007659836771446787, + "grad_norm": 50.13062794776937, + "learning_rate": 7.659690187941905e-06, + "loss": 2.3628, + "mean_token_accuracy": 0.4310344815254211, + "step": 7605 + }, + { + "epoch": 0.00766487282455096, + "grad_norm": 49.47233875223298, + "learning_rate": 7.664726144672966e-06, + "loss": 2.2587, + "mean_token_accuracy": 0.493103438615799, + "step": 7610 + }, + { + "epoch": 0.007669908877655133, + "grad_norm": 46.79330785810778, + "learning_rate": 7.669762101404025e-06, + "loss": 2.257, + "mean_token_accuracy": 0.43103448748588563, + "step": 7615 + }, + { + "epoch": 0.007674944930759306, + "grad_norm": 43.624608234233115, + "learning_rate": 7.674798058135086e-06, + "loss": 2.2174, + "mean_token_accuracy": 0.4379310369491577, + "step": 7620 + }, + { + "epoch": 0.007679980983863479, + "grad_norm": 41.3989602121546, + "learning_rate": 7.679834014866143e-06, + "loss": 2.3472, + "mean_token_accuracy": 0.39655172228813174, + "step": 7625 + }, + { + "epoch": 0.007685017036967652, + "grad_norm": 42.18964005080521, + "learning_rate": 7.684869971597204e-06, + "loss": 2.2834, + "mean_token_accuracy": 0.4517241358757019, + "step": 7630 + }, + { + "epoch": 0.007690053090071824, + "grad_norm": 61.18417420388271, + "learning_rate": 7.689905928328264e-06, + "loss": 2.6339, + "mean_token_accuracy": 0.4148820281028748, + "step": 7635 + }, + { + "epoch": 0.007695089143175997, + "grad_norm": 35.60457577088861, + "learning_rate": 7.694941885059325e-06, + "loss": 1.9169, + "mean_token_accuracy": 0.5551724076271057, + "step": 7640 + }, + { + "epoch": 0.00770012519628017, + "grad_norm": 40.23032198226782, + "learning_rate": 7.699977841790384e-06, + "loss": 2.6328, + "mean_token_accuracy": 0.34482758641242983, + "step": 7645 + }, + { + "epoch": 0.0077051612493843425, + "grad_norm": 43.579986828190805, + "learning_rate": 7.705013798521443e-06, + "loss": 2.4993, + "mean_token_accuracy": 0.42413793206214906, + "step": 7650 + }, + { + "epoch": 0.007710197302488515, + "grad_norm": 47.96196094150128, + "learning_rate": 7.710049755252504e-06, + "loss": 2.458, + "mean_token_accuracy": 0.41724138259887694, + "step": 7655 + }, + { + "epoch": 0.007715233355592688, + "grad_norm": 48.11525324326653, + "learning_rate": 7.715085711983563e-06, + "loss": 2.5764, + "mean_token_accuracy": 0.3793103456497192, + "step": 7660 + }, + { + "epoch": 0.007720269408696861, + "grad_norm": 33.9895637232027, + "learning_rate": 7.720121668714622e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.3931034505367279, + "step": 7665 + }, + { + "epoch": 0.007725305461801033, + "grad_norm": 52.86292244157372, + "learning_rate": 7.725157625445682e-06, + "loss": 2.2499, + "mean_token_accuracy": 0.42068966031074523, + "step": 7670 + }, + { + "epoch": 0.007730341514905206, + "grad_norm": 42.653361613734404, + "learning_rate": 7.730193582176743e-06, + "loss": 2.3643, + "mean_token_accuracy": 0.44482758045196535, + "step": 7675 + }, + { + "epoch": 0.007735377568009379, + "grad_norm": 46.33142546120593, + "learning_rate": 7.735229538907802e-06, + "loss": 2.4415, + "mean_token_accuracy": 0.4344827651977539, + "step": 7680 + }, + { + "epoch": 0.007740413621113552, + "grad_norm": 43.52860533957293, + "learning_rate": 7.740265495638861e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.4000000059604645, + "step": 7685 + }, + { + "epoch": 0.007745449674217725, + "grad_norm": 39.834301317700906, + "learning_rate": 7.745301452369922e-06, + "loss": 2.1268, + "mean_token_accuracy": 0.44827585816383364, + "step": 7690 + }, + { + "epoch": 0.007750485727321898, + "grad_norm": 36.51372472704589, + "learning_rate": 7.750337409100981e-06, + "loss": 2.2211, + "mean_token_accuracy": 0.48620688915252686, + "step": 7695 + }, + { + "epoch": 0.007755521780426071, + "grad_norm": 40.87093081805841, + "learning_rate": 7.755373365832042e-06, + "loss": 2.0887, + "mean_token_accuracy": 0.4896551787853241, + "step": 7700 + }, + { + "epoch": 0.007760557833530243, + "grad_norm": 48.91354782935568, + "learning_rate": 7.7604093225631e-06, + "loss": 2.3963, + "mean_token_accuracy": 0.4310344815254211, + "step": 7705 + }, + { + "epoch": 0.0077655938866344155, + "grad_norm": 38.817663032493066, + "learning_rate": 7.76544527929416e-06, + "loss": 2.1348, + "mean_token_accuracy": 0.46896551847457885, + "step": 7710 + }, + { + "epoch": 0.007770629939738588, + "grad_norm": 49.931966801400854, + "learning_rate": 7.77048123602522e-06, + "loss": 2.2143, + "mean_token_accuracy": 0.4482758641242981, + "step": 7715 + }, + { + "epoch": 0.007775665992842761, + "grad_norm": 40.49895562901206, + "learning_rate": 7.775517192756281e-06, + "loss": 2.2857, + "mean_token_accuracy": 0.44827587008476255, + "step": 7720 + }, + { + "epoch": 0.007780702045946934, + "grad_norm": 59.34824961207105, + "learning_rate": 7.780553149487339e-06, + "loss": 2.5745, + "mean_token_accuracy": 0.42758620381355283, + "step": 7725 + }, + { + "epoch": 0.007785738099051107, + "grad_norm": 52.490397455647674, + "learning_rate": 7.7855891062184e-06, + "loss": 2.1525, + "mean_token_accuracy": 0.4620689630508423, + "step": 7730 + }, + { + "epoch": 0.00779077415215528, + "grad_norm": 57.12734894283857, + "learning_rate": 7.790625062949459e-06, + "loss": 2.3675, + "mean_token_accuracy": 0.4358136713504791, + "step": 7735 + }, + { + "epoch": 0.007795810205259452, + "grad_norm": 35.61173530034061, + "learning_rate": 7.79566101968052e-06, + "loss": 2.0446, + "mean_token_accuracy": 0.4931034505367279, + "step": 7740 + }, + { + "epoch": 0.007800846258363625, + "grad_norm": 40.00574990266676, + "learning_rate": 7.800696976411579e-06, + "loss": 2.2376, + "mean_token_accuracy": 0.41379310488700866, + "step": 7745 + }, + { + "epoch": 0.007805882311467798, + "grad_norm": 58.59536698522381, + "learning_rate": 7.805732933142638e-06, + "loss": 2.4005, + "mean_token_accuracy": 0.3965517282485962, + "step": 7750 + }, + { + "epoch": 0.007810918364571971, + "grad_norm": 42.72620314201858, + "learning_rate": 7.8107688898737e-06, + "loss": 2.4971, + "mean_token_accuracy": 0.4344827592372894, + "step": 7755 + }, + { + "epoch": 0.007815954417676144, + "grad_norm": 54.89578534200915, + "learning_rate": 7.815804846604758e-06, + "loss": 2.3608, + "mean_token_accuracy": 0.4206896543502808, + "step": 7760 + }, + { + "epoch": 0.007820990470780316, + "grad_norm": 37.01029195744664, + "learning_rate": 7.820840803335818e-06, + "loss": 2.1321, + "mean_token_accuracy": 0.4896551728248596, + "step": 7765 + }, + { + "epoch": 0.00782602652388449, + "grad_norm": 40.008356401804875, + "learning_rate": 7.825876760066877e-06, + "loss": 2.268, + "mean_token_accuracy": 0.46896551847457885, + "step": 7770 + }, + { + "epoch": 0.007831062576988661, + "grad_norm": 64.97541840339466, + "learning_rate": 7.830912716797938e-06, + "loss": 2.3936, + "mean_token_accuracy": 0.42068966031074523, + "step": 7775 + }, + { + "epoch": 0.007836098630092835, + "grad_norm": 40.205765222638945, + "learning_rate": 7.835948673528997e-06, + "loss": 2.3242, + "mean_token_accuracy": 0.4724137902259827, + "step": 7780 + }, + { + "epoch": 0.007841134683197007, + "grad_norm": 35.61571063010522, + "learning_rate": 7.840984630260058e-06, + "loss": 2.2819, + "mean_token_accuracy": 0.4000000059604645, + "step": 7785 + }, + { + "epoch": 0.00784617073630118, + "grad_norm": 34.920276614563896, + "learning_rate": 7.846020586991117e-06, + "loss": 2.054, + "mean_token_accuracy": 0.47586206793785096, + "step": 7790 + }, + { + "epoch": 0.007851206789405353, + "grad_norm": 37.76935688270458, + "learning_rate": 7.851056543722177e-06, + "loss": 2.4469, + "mean_token_accuracy": 0.4344827592372894, + "step": 7795 + }, + { + "epoch": 0.007856242842509525, + "grad_norm": 51.292214779958435, + "learning_rate": 7.856092500453238e-06, + "loss": 2.5364, + "mean_token_accuracy": 0.4034482777118683, + "step": 7800 + }, + { + "epoch": 0.007861278895613699, + "grad_norm": 57.726874259807644, + "learning_rate": 7.861128457184297e-06, + "loss": 2.4918, + "mean_token_accuracy": 0.4034482777118683, + "step": 7805 + }, + { + "epoch": 0.007866314948717871, + "grad_norm": 42.92663352203394, + "learning_rate": 7.866164413915356e-06, + "loss": 2.2924, + "mean_token_accuracy": 0.43103448748588563, + "step": 7810 + }, + { + "epoch": 0.007871351001822045, + "grad_norm": 58.559171094667974, + "learning_rate": 7.871200370646415e-06, + "loss": 2.3535, + "mean_token_accuracy": 0.43986691236495973, + "step": 7815 + }, + { + "epoch": 0.007876387054926217, + "grad_norm": 37.632915335504485, + "learning_rate": 7.876236327377476e-06, + "loss": 2.2677, + "mean_token_accuracy": 0.42758620381355283, + "step": 7820 + }, + { + "epoch": 0.007881423108030389, + "grad_norm": 44.94353635415, + "learning_rate": 7.881272284108535e-06, + "loss": 2.6584, + "mean_token_accuracy": 0.38965516090393065, + "step": 7825 + }, + { + "epoch": 0.007886459161134562, + "grad_norm": 46.459638036345424, + "learning_rate": 7.886308240839595e-06, + "loss": 2.3074, + "mean_token_accuracy": 0.4068965494632721, + "step": 7830 + }, + { + "epoch": 0.007891495214238734, + "grad_norm": 40.9678738471561, + "learning_rate": 7.891344197570654e-06, + "loss": 1.9877, + "mean_token_accuracy": 0.46896551847457885, + "step": 7835 + }, + { + "epoch": 0.007896531267342908, + "grad_norm": 35.270568273042805, + "learning_rate": 7.896380154301715e-06, + "loss": 2.5115, + "mean_token_accuracy": 0.43448275327682495, + "step": 7840 + }, + { + "epoch": 0.00790156732044708, + "grad_norm": 41.37868217635883, + "learning_rate": 7.901416111032774e-06, + "loss": 2.3873, + "mean_token_accuracy": 0.38620689511299133, + "step": 7845 + }, + { + "epoch": 0.007906603373551254, + "grad_norm": 36.816621183394375, + "learning_rate": 7.906452067763833e-06, + "loss": 2.5478, + "mean_token_accuracy": 0.4034482717514038, + "step": 7850 + }, + { + "epoch": 0.007911639426655426, + "grad_norm": 38.40622947514277, + "learning_rate": 7.911488024494894e-06, + "loss": 2.4048, + "mean_token_accuracy": 0.4103448212146759, + "step": 7855 + }, + { + "epoch": 0.007916675479759598, + "grad_norm": 64.2355979643051, + "learning_rate": 7.916523981225954e-06, + "loss": 1.9694, + "mean_token_accuracy": 0.4689655125141144, + "step": 7860 + }, + { + "epoch": 0.007921711532863772, + "grad_norm": 34.714103206978635, + "learning_rate": 7.921559937957015e-06, + "loss": 2.5681, + "mean_token_accuracy": 0.3793103456497192, + "step": 7865 + }, + { + "epoch": 0.007926747585967944, + "grad_norm": 39.01254189774147, + "learning_rate": 7.926595894688072e-06, + "loss": 2.7166, + "mean_token_accuracy": 0.3482758581638336, + "step": 7870 + }, + { + "epoch": 0.007931783639072118, + "grad_norm": 34.12397176007829, + "learning_rate": 7.931631851419133e-06, + "loss": 2.2516, + "mean_token_accuracy": 0.4413793087005615, + "step": 7875 + }, + { + "epoch": 0.00793681969217629, + "grad_norm": 33.56962315886412, + "learning_rate": 7.936667808150192e-06, + "loss": 2.1755, + "mean_token_accuracy": 0.4241379380226135, + "step": 7880 + }, + { + "epoch": 0.007941855745280463, + "grad_norm": 35.76202580989349, + "learning_rate": 7.941703764881253e-06, + "loss": 2.1647, + "mean_token_accuracy": 0.42413793206214906, + "step": 7885 + }, + { + "epoch": 0.007946891798384636, + "grad_norm": 42.44219298453, + "learning_rate": 7.946739721612313e-06, + "loss": 2.0553, + "mean_token_accuracy": 0.45517241954803467, + "step": 7890 + }, + { + "epoch": 0.007951927851488808, + "grad_norm": 44.999233786125274, + "learning_rate": 7.951775678343372e-06, + "loss": 2.4038, + "mean_token_accuracy": 0.44827585816383364, + "step": 7895 + }, + { + "epoch": 0.007956963904592981, + "grad_norm": 32.965110297352844, + "learning_rate": 7.956811635074433e-06, + "loss": 2.0719, + "mean_token_accuracy": 0.46896551847457885, + "step": 7900 + }, + { + "epoch": 0.007961999957697153, + "grad_norm": 32.48970915329887, + "learning_rate": 7.961847591805492e-06, + "loss": 2.1785, + "mean_token_accuracy": 0.4172413766384125, + "step": 7905 + }, + { + "epoch": 0.007967036010801327, + "grad_norm": 36.76062399210729, + "learning_rate": 7.966883548536551e-06, + "loss": 2.1412, + "mean_token_accuracy": 0.42758620977401735, + "step": 7910 + }, + { + "epoch": 0.0079720720639055, + "grad_norm": 38.249870561280375, + "learning_rate": 7.97191950526761e-06, + "loss": 2.1193, + "mean_token_accuracy": 0.482758629322052, + "step": 7915 + }, + { + "epoch": 0.007977108117009673, + "grad_norm": 32.57216676657864, + "learning_rate": 7.976955461998671e-06, + "loss": 1.993, + "mean_token_accuracy": 0.4793103337287903, + "step": 7920 + }, + { + "epoch": 0.007982144170113845, + "grad_norm": 37.797191997060736, + "learning_rate": 7.98199141872973e-06, + "loss": 2.5122, + "mean_token_accuracy": 0.41379310488700866, + "step": 7925 + }, + { + "epoch": 0.007987180223218017, + "grad_norm": 41.1157806705336, + "learning_rate": 7.98702737546079e-06, + "loss": 2.1286, + "mean_token_accuracy": 0.42758620977401735, + "step": 7930 + }, + { + "epoch": 0.00799221627632219, + "grad_norm": 52.97541033800784, + "learning_rate": 7.99206333219185e-06, + "loss": 2.3396, + "mean_token_accuracy": 0.47931033968925474, + "step": 7935 + }, + { + "epoch": 0.007997252329426363, + "grad_norm": 33.541916680506986, + "learning_rate": 7.99709928892291e-06, + "loss": 2.3101, + "mean_token_accuracy": 0.4413793206214905, + "step": 7940 + }, + { + "epoch": 0.008002288382530537, + "grad_norm": 42.78588823599312, + "learning_rate": 8.00213524565397e-06, + "loss": 2.2737, + "mean_token_accuracy": 0.4379310369491577, + "step": 7945 + }, + { + "epoch": 0.008007324435634709, + "grad_norm": 40.79040540595521, + "learning_rate": 8.007171202385029e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.41034482717514037, + "step": 7950 + }, + { + "epoch": 0.008012360488738882, + "grad_norm": 42.04350609933651, + "learning_rate": 8.01220715911609e-06, + "loss": 2.2567, + "mean_token_accuracy": 0.4206896424293518, + "step": 7955 + }, + { + "epoch": 0.008017396541843054, + "grad_norm": 65.63300468855537, + "learning_rate": 8.017243115847149e-06, + "loss": 2.5179, + "mean_token_accuracy": 0.35862069129943847, + "step": 7960 + }, + { + "epoch": 0.008022432594947226, + "grad_norm": 62.49629976732185, + "learning_rate": 8.02227907257821e-06, + "loss": 2.2414, + "mean_token_accuracy": 0.42758620381355283, + "step": 7965 + }, + { + "epoch": 0.0080274686480514, + "grad_norm": 50.490741559164285, + "learning_rate": 8.027315029309267e-06, + "loss": 2.4121, + "mean_token_accuracy": 0.41724138259887694, + "step": 7970 + }, + { + "epoch": 0.008032504701155572, + "grad_norm": 39.45273352591065, + "learning_rate": 8.032350986040328e-06, + "loss": 2.1944, + "mean_token_accuracy": 0.4206896543502808, + "step": 7975 + }, + { + "epoch": 0.008037540754259746, + "grad_norm": 37.438793730310145, + "learning_rate": 8.037386942771388e-06, + "loss": 2.0942, + "mean_token_accuracy": 0.48620688915252686, + "step": 7980 + }, + { + "epoch": 0.008042576807363918, + "grad_norm": 53.69375202443497, + "learning_rate": 8.042422899502449e-06, + "loss": 2.1803, + "mean_token_accuracy": 0.44827585220336913, + "step": 7985 + }, + { + "epoch": 0.008047612860468092, + "grad_norm": 55.15305514642944, + "learning_rate": 8.047458856233508e-06, + "loss": 2.3959, + "mean_token_accuracy": 0.4379310250282288, + "step": 7990 + }, + { + "epoch": 0.008052648913572264, + "grad_norm": 38.64104129707437, + "learning_rate": 8.052494812964567e-06, + "loss": 2.19, + "mean_token_accuracy": 0.4344827592372894, + "step": 7995 + }, + { + "epoch": 0.008057684966676436, + "grad_norm": 58.24588947930217, + "learning_rate": 8.057530769695628e-06, + "loss": 2.4183, + "mean_token_accuracy": 0.4275861978530884, + "step": 8000 + }, + { + "epoch": 0.00806272101978061, + "grad_norm": 40.03924470610734, + "learning_rate": 8.062566726426687e-06, + "loss": 2.1939, + "mean_token_accuracy": 0.47931033968925474, + "step": 8005 + }, + { + "epoch": 0.008067757072884782, + "grad_norm": 42.81956428420776, + "learning_rate": 8.067602683157746e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.3862068891525269, + "step": 8010 + }, + { + "epoch": 0.008072793125988955, + "grad_norm": 34.87071358648223, + "learning_rate": 8.072638639888806e-06, + "loss": 2.2091, + "mean_token_accuracy": 0.42068966031074523, + "step": 8015 + }, + { + "epoch": 0.008077829179093127, + "grad_norm": 72.49380101156484, + "learning_rate": 8.077674596619867e-06, + "loss": 2.3102, + "mean_token_accuracy": 0.43103448748588563, + "step": 8020 + }, + { + "epoch": 0.008082865232197301, + "grad_norm": 50.21162649746146, + "learning_rate": 8.082710553350926e-06, + "loss": 2.3308, + "mean_token_accuracy": 0.44137930274009707, + "step": 8025 + }, + { + "epoch": 0.008087901285301473, + "grad_norm": 39.84420939519963, + "learning_rate": 8.087746510081985e-06, + "loss": 2.5493, + "mean_token_accuracy": 0.39310343861579894, + "step": 8030 + }, + { + "epoch": 0.008092937338405645, + "grad_norm": 48.5081883347131, + "learning_rate": 8.092782466813046e-06, + "loss": 2.1594, + "mean_token_accuracy": 0.441379314661026, + "step": 8035 + }, + { + "epoch": 0.008097973391509819, + "grad_norm": 53.68934938315932, + "learning_rate": 8.097818423544105e-06, + "loss": 2.3162, + "mean_token_accuracy": 0.4517241358757019, + "step": 8040 + }, + { + "epoch": 0.008103009444613991, + "grad_norm": 41.84541629098553, + "learning_rate": 8.102854380275166e-06, + "loss": 1.9821, + "mean_token_accuracy": 0.5103448271751404, + "step": 8045 + }, + { + "epoch": 0.008108045497718165, + "grad_norm": 66.47421368788807, + "learning_rate": 8.107890337006224e-06, + "loss": 2.4003, + "mean_token_accuracy": 0.42413793206214906, + "step": 8050 + }, + { + "epoch": 0.008113081550822337, + "grad_norm": 36.45912900699369, + "learning_rate": 8.112926293737285e-06, + "loss": 2.1878, + "mean_token_accuracy": 0.46551724076271056, + "step": 8055 + }, + { + "epoch": 0.00811811760392651, + "grad_norm": 39.97560119457867, + "learning_rate": 8.117962250468344e-06, + "loss": 2.1582, + "mean_token_accuracy": 0.4517241358757019, + "step": 8060 + }, + { + "epoch": 0.008123153657030683, + "grad_norm": 41.58880024560286, + "learning_rate": 8.122998207199405e-06, + "loss": 2.3462, + "mean_token_accuracy": 0.4551724135875702, + "step": 8065 + }, + { + "epoch": 0.008128189710134855, + "grad_norm": 44.26450059515507, + "learning_rate": 8.128034163930463e-06, + "loss": 2.0619, + "mean_token_accuracy": 0.45862067937850953, + "step": 8070 + }, + { + "epoch": 0.008133225763239028, + "grad_norm": 37.63761804732457, + "learning_rate": 8.133070120661524e-06, + "loss": 2.2692, + "mean_token_accuracy": 0.47586206197738645, + "step": 8075 + }, + { + "epoch": 0.0081382618163432, + "grad_norm": 40.85379689044504, + "learning_rate": 8.138106077392583e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.4241379201412201, + "step": 8080 + }, + { + "epoch": 0.008143297869447374, + "grad_norm": 34.960317811560124, + "learning_rate": 8.143142034123644e-06, + "loss": 2.0302, + "mean_token_accuracy": 0.4724137902259827, + "step": 8085 + }, + { + "epoch": 0.008148333922551546, + "grad_norm": 41.97406448951179, + "learning_rate": 8.148177990854703e-06, + "loss": 2.2372, + "mean_token_accuracy": 0.4620689630508423, + "step": 8090 + }, + { + "epoch": 0.00815336997565572, + "grad_norm": 50.088763402484254, + "learning_rate": 8.153213947585762e-06, + "loss": 2.0148, + "mean_token_accuracy": 0.5083484530448914, + "step": 8095 + }, + { + "epoch": 0.008158406028759892, + "grad_norm": 41.3082877624813, + "learning_rate": 8.158249904316823e-06, + "loss": 2.121, + "mean_token_accuracy": 0.4635813653469086, + "step": 8100 + }, + { + "epoch": 0.008163442081864064, + "grad_norm": 36.25225758129978, + "learning_rate": 8.163285861047882e-06, + "loss": 2.2037, + "mean_token_accuracy": 0.4711433708667755, + "step": 8105 + }, + { + "epoch": 0.008168478134968238, + "grad_norm": 34.254175564685156, + "learning_rate": 8.168321817778942e-06, + "loss": 2.0332, + "mean_token_accuracy": 0.4413793087005615, + "step": 8110 + }, + { + "epoch": 0.00817351418807241, + "grad_norm": 36.044330006326014, + "learning_rate": 8.173357774510001e-06, + "loss": 2.2798, + "mean_token_accuracy": 0.3965517163276672, + "step": 8115 + }, + { + "epoch": 0.008178550241176584, + "grad_norm": 37.45435926828359, + "learning_rate": 8.178393731241062e-06, + "loss": 2.3105, + "mean_token_accuracy": 0.4379310369491577, + "step": 8120 + }, + { + "epoch": 0.008183586294280756, + "grad_norm": 49.65605341966188, + "learning_rate": 8.183429687972121e-06, + "loss": 2.1947, + "mean_token_accuracy": 0.46551724076271056, + "step": 8125 + }, + { + "epoch": 0.00818862234738493, + "grad_norm": 57.50848782891851, + "learning_rate": 8.18846564470318e-06, + "loss": 2.5328, + "mean_token_accuracy": 0.4103448212146759, + "step": 8130 + }, + { + "epoch": 0.008193658400489101, + "grad_norm": 40.497723755430066, + "learning_rate": 8.193501601434241e-06, + "loss": 2.6684, + "mean_token_accuracy": 0.36206897497177126, + "step": 8135 + }, + { + "epoch": 0.008198694453593273, + "grad_norm": 38.91745328429812, + "learning_rate": 8.1985375581653e-06, + "loss": 2.5117, + "mean_token_accuracy": 0.41724138259887694, + "step": 8140 + }, + { + "epoch": 0.008203730506697447, + "grad_norm": 41.95578759760776, + "learning_rate": 8.203573514896362e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.39655172228813174, + "step": 8145 + }, + { + "epoch": 0.00820876655980162, + "grad_norm": 36.58132849975484, + "learning_rate": 8.208609471627419e-06, + "loss": 2.0842, + "mean_token_accuracy": 0.47931034564971925, + "step": 8150 + }, + { + "epoch": 0.008213802612905793, + "grad_norm": 45.23125145655737, + "learning_rate": 8.21364542835848e-06, + "loss": 2.2524, + "mean_token_accuracy": 0.4689655125141144, + "step": 8155 + }, + { + "epoch": 0.008218838666009965, + "grad_norm": 49.47783473807567, + "learning_rate": 8.21868138508954e-06, + "loss": 2.0822, + "mean_token_accuracy": 0.49879008531570435, + "step": 8160 + }, + { + "epoch": 0.008223874719114139, + "grad_norm": 44.80987018798864, + "learning_rate": 8.2237173418206e-06, + "loss": 2.3988, + "mean_token_accuracy": 0.4620689630508423, + "step": 8165 + }, + { + "epoch": 0.008228910772218311, + "grad_norm": 34.00098919013609, + "learning_rate": 8.228753298551658e-06, + "loss": 2.4857, + "mean_token_accuracy": 0.41379310488700866, + "step": 8170 + }, + { + "epoch": 0.008233946825322483, + "grad_norm": 46.77333255101476, + "learning_rate": 8.233789255282719e-06, + "loss": 2.0845, + "mean_token_accuracy": 0.5172413766384125, + "step": 8175 + }, + { + "epoch": 0.008238982878426657, + "grad_norm": 43.920721387950344, + "learning_rate": 8.238825212013778e-06, + "loss": 2.3801, + "mean_token_accuracy": 0.41724138259887694, + "step": 8180 + }, + { + "epoch": 0.008244018931530829, + "grad_norm": 42.612665764814295, + "learning_rate": 8.243861168744839e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.42413793206214906, + "step": 8185 + }, + { + "epoch": 0.008249054984635002, + "grad_norm": 50.25398520828357, + "learning_rate": 8.248897125475898e-06, + "loss": 2.1631, + "mean_token_accuracy": 0.46896551847457885, + "step": 8190 + }, + { + "epoch": 0.008254091037739174, + "grad_norm": 60.12588568454377, + "learning_rate": 8.253933082206957e-06, + "loss": 2.0079, + "mean_token_accuracy": 0.5068965494632721, + "step": 8195 + }, + { + "epoch": 0.008259127090843348, + "grad_norm": 40.82705027320322, + "learning_rate": 8.258969038938018e-06, + "loss": 2.2173, + "mean_token_accuracy": 0.488384747505188, + "step": 8200 + }, + { + "epoch": 0.00826416314394752, + "grad_norm": 43.393192343596866, + "learning_rate": 8.264004995669078e-06, + "loss": 2.5521, + "mean_token_accuracy": 0.40344826579093934, + "step": 8205 + }, + { + "epoch": 0.008269199197051692, + "grad_norm": 58.900952500629366, + "learning_rate": 8.269040952400139e-06, + "loss": 2.3182, + "mean_token_accuracy": 0.4310344815254211, + "step": 8210 + }, + { + "epoch": 0.008274235250155866, + "grad_norm": 38.14638932995811, + "learning_rate": 8.274076909131196e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.44827585816383364, + "step": 8215 + }, + { + "epoch": 0.008279271303260038, + "grad_norm": 37.38631091056911, + "learning_rate": 8.279112865862257e-06, + "loss": 2.4876, + "mean_token_accuracy": 0.3999999940395355, + "step": 8220 + }, + { + "epoch": 0.008284307356364212, + "grad_norm": 38.72564801450457, + "learning_rate": 8.284148822593316e-06, + "loss": 2.3549, + "mean_token_accuracy": 0.4551724076271057, + "step": 8225 + }, + { + "epoch": 0.008289343409468384, + "grad_norm": 42.6978603522511, + "learning_rate": 8.289184779324377e-06, + "loss": 2.2665, + "mean_token_accuracy": 0.42413793206214906, + "step": 8230 + }, + { + "epoch": 0.008294379462572558, + "grad_norm": 33.87195102632296, + "learning_rate": 8.294220736055437e-06, + "loss": 2.1292, + "mean_token_accuracy": 0.48142769932746887, + "step": 8235 + }, + { + "epoch": 0.00829941551567673, + "grad_norm": 69.36294434779421, + "learning_rate": 8.299256692786496e-06, + "loss": 2.2775, + "mean_token_accuracy": 0.458620685338974, + "step": 8240 + }, + { + "epoch": 0.008304451568780902, + "grad_norm": 40.956191026101365, + "learning_rate": 8.304292649517557e-06, + "loss": 2.0405, + "mean_token_accuracy": 0.4724137902259827, + "step": 8245 + }, + { + "epoch": 0.008309487621885076, + "grad_norm": 41.88479779991668, + "learning_rate": 8.309328606248616e-06, + "loss": 2.2381, + "mean_token_accuracy": 0.4425892233848572, + "step": 8250 + }, + { + "epoch": 0.008314523674989248, + "grad_norm": 52.61882079229251, + "learning_rate": 8.314364562979675e-06, + "loss": 2.1083, + "mean_token_accuracy": 0.4551724135875702, + "step": 8255 + }, + { + "epoch": 0.008319559728093421, + "grad_norm": 65.35804446109087, + "learning_rate": 8.319400519710735e-06, + "loss": 2.3923, + "mean_token_accuracy": 0.48275861144065857, + "step": 8260 + }, + { + "epoch": 0.008324595781197593, + "grad_norm": 57.60275869147242, + "learning_rate": 8.324436476441795e-06, + "loss": 2.5744, + "mean_token_accuracy": 0.3620689660310745, + "step": 8265 + }, + { + "epoch": 0.008329631834301767, + "grad_norm": 44.053857815518576, + "learning_rate": 8.329472433172855e-06, + "loss": 2.4967, + "mean_token_accuracy": 0.3896551728248596, + "step": 8270 + }, + { + "epoch": 0.008334667887405939, + "grad_norm": 42.719321551200395, + "learning_rate": 8.334508389903914e-06, + "loss": 2.3409, + "mean_token_accuracy": 0.46896551847457885, + "step": 8275 + }, + { + "epoch": 0.008339703940510111, + "grad_norm": 43.611556390877425, + "learning_rate": 8.339544346634973e-06, + "loss": 2.3195, + "mean_token_accuracy": 0.4448275834321976, + "step": 8280 + }, + { + "epoch": 0.008344739993614285, + "grad_norm": 50.46139890887202, + "learning_rate": 8.344580303366034e-06, + "loss": 2.046, + "mean_token_accuracy": 0.5034482777118683, + "step": 8285 + }, + { + "epoch": 0.008349776046718457, + "grad_norm": 28.860968470344737, + "learning_rate": 8.349616260097093e-06, + "loss": 2.1862, + "mean_token_accuracy": 0.4379310369491577, + "step": 8290 + }, + { + "epoch": 0.00835481209982263, + "grad_norm": 69.69906829796771, + "learning_rate": 8.354652216828153e-06, + "loss": 2.577, + "mean_token_accuracy": 0.3827586233615875, + "step": 8295 + }, + { + "epoch": 0.008359848152926803, + "grad_norm": 52.572973166661, + "learning_rate": 8.359688173559214e-06, + "loss": 2.8307, + "mean_token_accuracy": 0.38275861740112305, + "step": 8300 + }, + { + "epoch": 0.008364884206030977, + "grad_norm": 31.898356867082015, + "learning_rate": 8.364724130290273e-06, + "loss": 2.2226, + "mean_token_accuracy": 0.4497882664203644, + "step": 8305 + }, + { + "epoch": 0.008369920259135149, + "grad_norm": 39.30706262525915, + "learning_rate": 8.369760087021334e-06, + "loss": 1.8708, + "mean_token_accuracy": 0.5275862038135528, + "step": 8310 + }, + { + "epoch": 0.00837495631223932, + "grad_norm": 30.970151881832347, + "learning_rate": 8.374796043752391e-06, + "loss": 2.3245, + "mean_token_accuracy": 0.42413793206214906, + "step": 8315 + }, + { + "epoch": 0.008379992365343494, + "grad_norm": 33.16500427142407, + "learning_rate": 8.379832000483452e-06, + "loss": 2.3575, + "mean_token_accuracy": 0.41724138259887694, + "step": 8320 + }, + { + "epoch": 0.008385028418447666, + "grad_norm": 41.805727711977745, + "learning_rate": 8.384867957214512e-06, + "loss": 2.568, + "mean_token_accuracy": 0.4689655125141144, + "step": 8325 + }, + { + "epoch": 0.00839006447155184, + "grad_norm": 31.872970009545238, + "learning_rate": 8.389903913945573e-06, + "loss": 2.2111, + "mean_token_accuracy": 0.4413793087005615, + "step": 8330 + }, + { + "epoch": 0.008395100524656012, + "grad_norm": 47.08279578101426, + "learning_rate": 8.394939870676632e-06, + "loss": 2.5163, + "mean_token_accuracy": 0.38620689511299133, + "step": 8335 + }, + { + "epoch": 0.008400136577760186, + "grad_norm": 33.704780663549656, + "learning_rate": 8.399975827407691e-06, + "loss": 2.6529, + "mean_token_accuracy": 0.4103448212146759, + "step": 8340 + }, + { + "epoch": 0.008405172630864358, + "grad_norm": 30.837387375797213, + "learning_rate": 8.405011784138752e-06, + "loss": 2.1594, + "mean_token_accuracy": 0.45335754156112673, + "step": 8345 + }, + { + "epoch": 0.00841020868396853, + "grad_norm": 52.45484445909543, + "learning_rate": 8.410047740869811e-06, + "loss": 2.4392, + "mean_token_accuracy": 0.4172413766384125, + "step": 8350 + }, + { + "epoch": 0.008415244737072704, + "grad_norm": 47.76608733928675, + "learning_rate": 8.41508369760087e-06, + "loss": 2.354, + "mean_token_accuracy": 0.4586206912994385, + "step": 8355 + }, + { + "epoch": 0.008420280790176876, + "grad_norm": 34.27814291661414, + "learning_rate": 8.42011965433193e-06, + "loss": 2.388, + "mean_token_accuracy": 0.4000000059604645, + "step": 8360 + }, + { + "epoch": 0.00842531684328105, + "grad_norm": 49.11198164309136, + "learning_rate": 8.42515561106299e-06, + "loss": 2.322, + "mean_token_accuracy": 0.44137929677963256, + "step": 8365 + }, + { + "epoch": 0.008430352896385222, + "grad_norm": 42.31509318885319, + "learning_rate": 8.43019156779405e-06, + "loss": 2.4799, + "mean_token_accuracy": 0.4310344815254211, + "step": 8370 + }, + { + "epoch": 0.008435388949489395, + "grad_norm": 51.40820805360451, + "learning_rate": 8.43522752452511e-06, + "loss": 2.3267, + "mean_token_accuracy": 0.441379314661026, + "step": 8375 + }, + { + "epoch": 0.008440425002593567, + "grad_norm": 43.753664682766484, + "learning_rate": 8.440263481256168e-06, + "loss": 2.3846, + "mean_token_accuracy": 0.46896551847457885, + "step": 8380 + }, + { + "epoch": 0.00844546105569774, + "grad_norm": 48.13972778588484, + "learning_rate": 8.44529943798723e-06, + "loss": 2.433, + "mean_token_accuracy": 0.4310344815254211, + "step": 8385 + }, + { + "epoch": 0.008450497108801913, + "grad_norm": 41.326123472679015, + "learning_rate": 8.450335394718289e-06, + "loss": 2.4033, + "mean_token_accuracy": 0.4068965554237366, + "step": 8390 + }, + { + "epoch": 0.008455533161906085, + "grad_norm": 50.25378083705675, + "learning_rate": 8.455371351449348e-06, + "loss": 2.535, + "mean_token_accuracy": 0.4172413766384125, + "step": 8395 + }, + { + "epoch": 0.008460569215010259, + "grad_norm": 52.0906988744877, + "learning_rate": 8.460407308180409e-06, + "loss": 2.487, + "mean_token_accuracy": 0.3551724135875702, + "step": 8400 + }, + { + "epoch": 0.008465605268114431, + "grad_norm": 35.796510007826384, + "learning_rate": 8.465443264911468e-06, + "loss": 2.4722, + "mean_token_accuracy": 0.41905626058578493, + "step": 8405 + }, + { + "epoch": 0.008470641321218605, + "grad_norm": 36.42615482899636, + "learning_rate": 8.470479221642529e-06, + "loss": 2.4102, + "mean_token_accuracy": 0.43793103098869324, + "step": 8410 + }, + { + "epoch": 0.008475677374322777, + "grad_norm": 34.04840358539263, + "learning_rate": 8.475515178373587e-06, + "loss": 2.0857, + "mean_token_accuracy": 0.4740471839904785, + "step": 8415 + }, + { + "epoch": 0.008480713427426949, + "grad_norm": 38.36875296363297, + "learning_rate": 8.480551135104648e-06, + "loss": 2.3213, + "mean_token_accuracy": 0.42758620381355283, + "step": 8420 + }, + { + "epoch": 0.008485749480531123, + "grad_norm": 33.45723674163815, + "learning_rate": 8.485587091835707e-06, + "loss": 2.2982, + "mean_token_accuracy": 0.42068966031074523, + "step": 8425 + }, + { + "epoch": 0.008490785533635295, + "grad_norm": 40.84221107925927, + "learning_rate": 8.490623048566768e-06, + "loss": 2.2565, + "mean_token_accuracy": 0.44482759237289426, + "step": 8430 + }, + { + "epoch": 0.008495821586739468, + "grad_norm": 34.69469725065078, + "learning_rate": 8.495659005297827e-06, + "loss": 2.5097, + "mean_token_accuracy": 0.4517241418361664, + "step": 8435 + }, + { + "epoch": 0.00850085763984364, + "grad_norm": 40.34390793703597, + "learning_rate": 8.500694962028886e-06, + "loss": 2.2338, + "mean_token_accuracy": 0.42758620977401735, + "step": 8440 + }, + { + "epoch": 0.008505893692947814, + "grad_norm": 73.85690999251504, + "learning_rate": 8.505730918759947e-06, + "loss": 2.5933, + "mean_token_accuracy": 0.38620689809322356, + "step": 8445 + }, + { + "epoch": 0.008510929746051986, + "grad_norm": 44.998841024004676, + "learning_rate": 8.510766875491006e-06, + "loss": 2.2552, + "mean_token_accuracy": 0.4379310369491577, + "step": 8450 + }, + { + "epoch": 0.008515965799156158, + "grad_norm": 51.3707729358596, + "learning_rate": 8.515802832222066e-06, + "loss": 2.5021, + "mean_token_accuracy": 0.3620689630508423, + "step": 8455 + }, + { + "epoch": 0.008521001852260332, + "grad_norm": 24.101087322421513, + "learning_rate": 8.520838788953125e-06, + "loss": 1.909, + "mean_token_accuracy": 0.5344827532768249, + "step": 8460 + }, + { + "epoch": 0.008526037905364504, + "grad_norm": 47.28834755963221, + "learning_rate": 8.525874745684186e-06, + "loss": 2.4499, + "mean_token_accuracy": 0.3965517282485962, + "step": 8465 + }, + { + "epoch": 0.008531073958468678, + "grad_norm": 44.24203588040872, + "learning_rate": 8.530910702415245e-06, + "loss": 1.9313, + "mean_token_accuracy": 0.5241379320621491, + "step": 8470 + }, + { + "epoch": 0.00853611001157285, + "grad_norm": 48.91440137614651, + "learning_rate": 8.535946659146304e-06, + "loss": 2.441, + "mean_token_accuracy": 0.4, + "step": 8475 + }, + { + "epoch": 0.008541146064677024, + "grad_norm": 33.09101341416672, + "learning_rate": 8.540982615877364e-06, + "loss": 2.4386, + "mean_token_accuracy": 0.4241379380226135, + "step": 8480 + }, + { + "epoch": 0.008546182117781196, + "grad_norm": 49.80674398208219, + "learning_rate": 8.546018572608425e-06, + "loss": 2.4408, + "mean_token_accuracy": 0.4068965494632721, + "step": 8485 + }, + { + "epoch": 0.008551218170885368, + "grad_norm": 33.570541412030465, + "learning_rate": 8.551054529339486e-06, + "loss": 2.3873, + "mean_token_accuracy": 0.41724138855934145, + "step": 8490 + }, + { + "epoch": 0.008556254223989541, + "grad_norm": 41.25732377379406, + "learning_rate": 8.556090486070543e-06, + "loss": 2.118, + "mean_token_accuracy": 0.4655172348022461, + "step": 8495 + }, + { + "epoch": 0.008561290277093713, + "grad_norm": 37.305382099284905, + "learning_rate": 8.561126442801604e-06, + "loss": 2.536, + "mean_token_accuracy": 0.35517241060733795, + "step": 8500 + }, + { + "epoch": 0.008566326330197887, + "grad_norm": 36.75693946224768, + "learning_rate": 8.566162399532663e-06, + "loss": 2.3517, + "mean_token_accuracy": 0.42758620977401735, + "step": 8505 + }, + { + "epoch": 0.00857136238330206, + "grad_norm": 38.92055962350644, + "learning_rate": 8.571198356263724e-06, + "loss": 2.2516, + "mean_token_accuracy": 0.4689655065536499, + "step": 8510 + }, + { + "epoch": 0.008576398436406233, + "grad_norm": 40.83239379741632, + "learning_rate": 8.576234312994782e-06, + "loss": 2.0389, + "mean_token_accuracy": 0.5379310309886932, + "step": 8515 + }, + { + "epoch": 0.008581434489510405, + "grad_norm": 40.174235708758424, + "learning_rate": 8.581270269725843e-06, + "loss": 2.1814, + "mean_token_accuracy": 0.4655172348022461, + "step": 8520 + }, + { + "epoch": 0.008586470542614577, + "grad_norm": 39.61815409953896, + "learning_rate": 8.586306226456902e-06, + "loss": 2.3356, + "mean_token_accuracy": 0.42068966031074523, + "step": 8525 + }, + { + "epoch": 0.00859150659571875, + "grad_norm": 49.367727612798916, + "learning_rate": 8.591342183187963e-06, + "loss": 2.3857, + "mean_token_accuracy": 0.45517241954803467, + "step": 8530 + }, + { + "epoch": 0.008596542648822923, + "grad_norm": 35.705311181018494, + "learning_rate": 8.596378139919022e-06, + "loss": 2.2989, + "mean_token_accuracy": 0.4517241418361664, + "step": 8535 + }, + { + "epoch": 0.008601578701927097, + "grad_norm": 35.31542421008467, + "learning_rate": 8.601414096650082e-06, + "loss": 2.2813, + "mean_token_accuracy": 0.4896551728248596, + "step": 8540 + }, + { + "epoch": 0.008606614755031269, + "grad_norm": 46.1537174592479, + "learning_rate": 8.606450053381142e-06, + "loss": 2.6581, + "mean_token_accuracy": 0.37241379022598264, + "step": 8545 + }, + { + "epoch": 0.008611650808135442, + "grad_norm": 38.98543506655495, + "learning_rate": 8.611486010112202e-06, + "loss": 2.5408, + "mean_token_accuracy": 0.43103447556495667, + "step": 8550 + }, + { + "epoch": 0.008616686861239614, + "grad_norm": 41.395770800519884, + "learning_rate": 8.616521966843261e-06, + "loss": 2.5932, + "mean_token_accuracy": 0.4034482777118683, + "step": 8555 + }, + { + "epoch": 0.008621722914343787, + "grad_norm": 51.973730865817735, + "learning_rate": 8.62155792357432e-06, + "loss": 2.0703, + "mean_token_accuracy": 0.4862068951129913, + "step": 8560 + }, + { + "epoch": 0.00862675896744796, + "grad_norm": 39.53364695474878, + "learning_rate": 8.626593880305381e-06, + "loss": 2.2078, + "mean_token_accuracy": 0.47586206197738645, + "step": 8565 + }, + { + "epoch": 0.008631795020552132, + "grad_norm": 42.99717329018743, + "learning_rate": 8.63162983703644e-06, + "loss": 2.19, + "mean_token_accuracy": 0.482758617401123, + "step": 8570 + }, + { + "epoch": 0.008636831073656306, + "grad_norm": 36.192265195542795, + "learning_rate": 8.6366657937675e-06, + "loss": 2.3426, + "mean_token_accuracy": 0.441379314661026, + "step": 8575 + }, + { + "epoch": 0.008641867126760478, + "grad_norm": 44.509754288641254, + "learning_rate": 8.64170175049856e-06, + "loss": 2.5496, + "mean_token_accuracy": 0.40544464290142057, + "step": 8580 + }, + { + "epoch": 0.008646903179864652, + "grad_norm": 42.55656077506188, + "learning_rate": 8.64673770722962e-06, + "loss": 2.225, + "mean_token_accuracy": 0.45517241954803467, + "step": 8585 + }, + { + "epoch": 0.008651939232968824, + "grad_norm": 43.70543672001637, + "learning_rate": 8.65177366396068e-06, + "loss": 2.3263, + "mean_token_accuracy": 0.45517241954803467, + "step": 8590 + }, + { + "epoch": 0.008656975286072996, + "grad_norm": 38.05152548624296, + "learning_rate": 8.65680962069174e-06, + "loss": 2.3196, + "mean_token_accuracy": 0.4344827592372894, + "step": 8595 + }, + { + "epoch": 0.00866201133917717, + "grad_norm": 31.14734596815842, + "learning_rate": 8.6618455774228e-06, + "loss": 2.3625, + "mean_token_accuracy": 0.44482759237289426, + "step": 8600 + }, + { + "epoch": 0.008667047392281342, + "grad_norm": 37.841178598641775, + "learning_rate": 8.666881534153859e-06, + "loss": 2.2106, + "mean_token_accuracy": 0.43793103098869324, + "step": 8605 + }, + { + "epoch": 0.008672083445385515, + "grad_norm": 40.33737448405054, + "learning_rate": 8.67191749088492e-06, + "loss": 2.0556, + "mean_token_accuracy": 0.45668481588363646, + "step": 8610 + }, + { + "epoch": 0.008677119498489688, + "grad_norm": 35.0094061971413, + "learning_rate": 8.676953447615979e-06, + "loss": 2.312, + "mean_token_accuracy": 0.47114337682724, + "step": 8615 + }, + { + "epoch": 0.008682155551593861, + "grad_norm": 44.92520828183209, + "learning_rate": 8.681989404347038e-06, + "loss": 2.2136, + "mean_token_accuracy": 0.4551724135875702, + "step": 8620 + }, + { + "epoch": 0.008687191604698033, + "grad_norm": 42.8886256403922, + "learning_rate": 8.687025361078097e-06, + "loss": 2.5508, + "mean_token_accuracy": 0.4103448212146759, + "step": 8625 + }, + { + "epoch": 0.008692227657802205, + "grad_norm": 40.88069658026503, + "learning_rate": 8.692061317809158e-06, + "loss": 2.321, + "mean_token_accuracy": 0.42758620381355283, + "step": 8630 + }, + { + "epoch": 0.008697263710906379, + "grad_norm": 40.499993649477204, + "learning_rate": 8.697097274540217e-06, + "loss": 2.2651, + "mean_token_accuracy": 0.4620689690113068, + "step": 8635 + }, + { + "epoch": 0.008702299764010551, + "grad_norm": 37.44539081801667, + "learning_rate": 8.702133231271277e-06, + "loss": 2.4669, + "mean_token_accuracy": 0.3758620709180832, + "step": 8640 + }, + { + "epoch": 0.008707335817114725, + "grad_norm": 43.00615053294406, + "learning_rate": 8.707169188002338e-06, + "loss": 2.1534, + "mean_token_accuracy": 0.4896551728248596, + "step": 8645 + }, + { + "epoch": 0.008712371870218897, + "grad_norm": 36.150761656034426, + "learning_rate": 8.712205144733397e-06, + "loss": 2.4289, + "mean_token_accuracy": 0.4379310369491577, + "step": 8650 + }, + { + "epoch": 0.00871740792332307, + "grad_norm": 32.623081683858004, + "learning_rate": 8.717241101464458e-06, + "loss": 2.2866, + "mean_token_accuracy": 0.44137930274009707, + "step": 8655 + }, + { + "epoch": 0.008722443976427243, + "grad_norm": 37.20720087192561, + "learning_rate": 8.722277058195515e-06, + "loss": 2.5528, + "mean_token_accuracy": 0.37586206793785093, + "step": 8660 + }, + { + "epoch": 0.008727480029531415, + "grad_norm": 47.81739334123764, + "learning_rate": 8.727313014926576e-06, + "loss": 2.3752, + "mean_token_accuracy": 0.47241378426551817, + "step": 8665 + }, + { + "epoch": 0.008732516082635589, + "grad_norm": 29.557989687858534, + "learning_rate": 8.732348971657636e-06, + "loss": 2.1488, + "mean_token_accuracy": 0.4793103337287903, + "step": 8670 + }, + { + "epoch": 0.00873755213573976, + "grad_norm": 37.71365427684128, + "learning_rate": 8.737384928388697e-06, + "loss": 2.2568, + "mean_token_accuracy": 0.47241379618644713, + "step": 8675 + }, + { + "epoch": 0.008742588188843934, + "grad_norm": 70.82503241671895, + "learning_rate": 8.742420885119756e-06, + "loss": 2.4081, + "mean_token_accuracy": 0.38275861740112305, + "step": 8680 + }, + { + "epoch": 0.008747624241948106, + "grad_norm": 29.05161967453014, + "learning_rate": 8.747456841850815e-06, + "loss": 2.1983, + "mean_token_accuracy": 0.4379310369491577, + "step": 8685 + }, + { + "epoch": 0.00875266029505228, + "grad_norm": 43.57380949547287, + "learning_rate": 8.752492798581876e-06, + "loss": 2.1979, + "mean_token_accuracy": 0.4517241418361664, + "step": 8690 + }, + { + "epoch": 0.008757696348156452, + "grad_norm": 38.23286519420096, + "learning_rate": 8.757528755312935e-06, + "loss": 2.1895, + "mean_token_accuracy": 0.4448275864124298, + "step": 8695 + }, + { + "epoch": 0.008762732401260624, + "grad_norm": 37.86838052576704, + "learning_rate": 8.762564712043995e-06, + "loss": 2.0717, + "mean_token_accuracy": 0.4482758641242981, + "step": 8700 + }, + { + "epoch": 0.008767768454364798, + "grad_norm": 27.461225141146343, + "learning_rate": 8.767600668775054e-06, + "loss": 2.3125, + "mean_token_accuracy": 0.4689655125141144, + "step": 8705 + }, + { + "epoch": 0.00877280450746897, + "grad_norm": 49.42405799120369, + "learning_rate": 8.772636625506115e-06, + "loss": 2.4737, + "mean_token_accuracy": 0.4068965494632721, + "step": 8710 + }, + { + "epoch": 0.008777840560573144, + "grad_norm": 29.38572461818643, + "learning_rate": 8.777672582237174e-06, + "loss": 2.3134, + "mean_token_accuracy": 0.4068965554237366, + "step": 8715 + }, + { + "epoch": 0.008782876613677316, + "grad_norm": 33.767494241591706, + "learning_rate": 8.782708538968233e-06, + "loss": 2.572, + "mean_token_accuracy": 0.4379310369491577, + "step": 8720 + }, + { + "epoch": 0.008787912666781488, + "grad_norm": 48.39461483688185, + "learning_rate": 8.787744495699293e-06, + "loss": 2.3966, + "mean_token_accuracy": 0.4275861978530884, + "step": 8725 + }, + { + "epoch": 0.008792948719885662, + "grad_norm": 38.000891790029144, + "learning_rate": 8.792780452430353e-06, + "loss": 2.3606, + "mean_token_accuracy": 0.4034482777118683, + "step": 8730 + }, + { + "epoch": 0.008797984772989834, + "grad_norm": 46.65716281408244, + "learning_rate": 8.797816409161413e-06, + "loss": 2.5848, + "mean_token_accuracy": 0.4103448212146759, + "step": 8735 + }, + { + "epoch": 0.008803020826094007, + "grad_norm": 36.24999275021773, + "learning_rate": 8.802852365892472e-06, + "loss": 2.238, + "mean_token_accuracy": 0.45027223229408264, + "step": 8740 + }, + { + "epoch": 0.00880805687919818, + "grad_norm": 37.003292026075044, + "learning_rate": 8.807888322623533e-06, + "loss": 2.2523, + "mean_token_accuracy": 0.4344827592372894, + "step": 8745 + }, + { + "epoch": 0.008813092932302353, + "grad_norm": 43.76924611698608, + "learning_rate": 8.812924279354592e-06, + "loss": 2.4025, + "mean_token_accuracy": 0.46551724076271056, + "step": 8750 + }, + { + "epoch": 0.008818128985406525, + "grad_norm": 45.49062638467538, + "learning_rate": 8.817960236085653e-06, + "loss": 2.1171, + "mean_token_accuracy": 0.4517241418361664, + "step": 8755 + }, + { + "epoch": 0.008823165038510697, + "grad_norm": 37.34148266789959, + "learning_rate": 8.82299619281671e-06, + "loss": 2.1282, + "mean_token_accuracy": 0.47586206793785096, + "step": 8760 + }, + { + "epoch": 0.008828201091614871, + "grad_norm": 48.44665781185846, + "learning_rate": 8.828032149547772e-06, + "loss": 2.0659, + "mean_token_accuracy": 0.48275862336158754, + "step": 8765 + }, + { + "epoch": 0.008833237144719043, + "grad_norm": 40.81478468434724, + "learning_rate": 8.833068106278831e-06, + "loss": 2.3315, + "mean_token_accuracy": 0.4413793087005615, + "step": 8770 + }, + { + "epoch": 0.008838273197823217, + "grad_norm": 28.433689295949712, + "learning_rate": 8.838104063009892e-06, + "loss": 2.4215, + "mean_token_accuracy": 0.417241370677948, + "step": 8775 + }, + { + "epoch": 0.008843309250927389, + "grad_norm": 51.63645324935579, + "learning_rate": 8.843140019740951e-06, + "loss": 2.3854, + "mean_token_accuracy": 0.39655172228813174, + "step": 8780 + }, + { + "epoch": 0.008848345304031563, + "grad_norm": 44.843455274997986, + "learning_rate": 8.84817597647201e-06, + "loss": 2.002, + "mean_token_accuracy": 0.5137930929660797, + "step": 8785 + }, + { + "epoch": 0.008853381357135735, + "grad_norm": 43.91729593034281, + "learning_rate": 8.853211933203071e-06, + "loss": 2.2936, + "mean_token_accuracy": 0.4517241358757019, + "step": 8790 + }, + { + "epoch": 0.008858417410239907, + "grad_norm": 45.90493445949028, + "learning_rate": 8.85824788993413e-06, + "loss": 1.9773, + "mean_token_accuracy": 0.4931034505367279, + "step": 8795 + }, + { + "epoch": 0.00886345346334408, + "grad_norm": 50.237365248672106, + "learning_rate": 8.86328384666519e-06, + "loss": 2.1663, + "mean_token_accuracy": 0.4344827592372894, + "step": 8800 + }, + { + "epoch": 0.008868489516448252, + "grad_norm": 46.8542522745384, + "learning_rate": 8.868319803396249e-06, + "loss": 2.3889, + "mean_token_accuracy": 0.44325469732284545, + "step": 8805 + }, + { + "epoch": 0.008873525569552426, + "grad_norm": 50.31435705567498, + "learning_rate": 8.87335576012731e-06, + "loss": 2.6584, + "mean_token_accuracy": 0.41379310488700866, + "step": 8810 + }, + { + "epoch": 0.008878561622656598, + "grad_norm": 41.08626928352491, + "learning_rate": 8.87839171685837e-06, + "loss": 2.3161, + "mean_token_accuracy": 0.4517241418361664, + "step": 8815 + }, + { + "epoch": 0.008883597675760772, + "grad_norm": 34.973993618368965, + "learning_rate": 8.883427673589428e-06, + "loss": 2.2866, + "mean_token_accuracy": 0.47586207985877993, + "step": 8820 + }, + { + "epoch": 0.008888633728864944, + "grad_norm": 60.628902434693316, + "learning_rate": 8.888463630320488e-06, + "loss": 2.3726, + "mean_token_accuracy": 0.45517241954803467, + "step": 8825 + }, + { + "epoch": 0.008893669781969116, + "grad_norm": 33.119677580682875, + "learning_rate": 8.893499587051549e-06, + "loss": 2.4343, + "mean_token_accuracy": 0.4172413766384125, + "step": 8830 + }, + { + "epoch": 0.00889870583507329, + "grad_norm": 35.82282077874185, + "learning_rate": 8.898535543782608e-06, + "loss": 2.1443, + "mean_token_accuracy": 0.47931034564971925, + "step": 8835 + }, + { + "epoch": 0.008903741888177462, + "grad_norm": 74.12009728054932, + "learning_rate": 8.903571500513667e-06, + "loss": 2.2997, + "mean_token_accuracy": 0.43448275327682495, + "step": 8840 + }, + { + "epoch": 0.008908777941281636, + "grad_norm": 42.03595518693085, + "learning_rate": 8.908607457244728e-06, + "loss": 2.5405, + "mean_token_accuracy": 0.41034482717514037, + "step": 8845 + }, + { + "epoch": 0.008913813994385808, + "grad_norm": 37.246355210177335, + "learning_rate": 8.913643413975787e-06, + "loss": 2.3698, + "mean_token_accuracy": 0.4310344815254211, + "step": 8850 + }, + { + "epoch": 0.008918850047489981, + "grad_norm": 39.412295209463686, + "learning_rate": 8.918679370706848e-06, + "loss": 1.8509, + "mean_token_accuracy": 0.5310344815254211, + "step": 8855 + }, + { + "epoch": 0.008923886100594153, + "grad_norm": 42.34480877915383, + "learning_rate": 8.923715327437906e-06, + "loss": 2.3356, + "mean_token_accuracy": 0.4103448331356049, + "step": 8860 + }, + { + "epoch": 0.008928922153698325, + "grad_norm": 36.93129834496132, + "learning_rate": 8.928751284168967e-06, + "loss": 1.9451, + "mean_token_accuracy": 0.46551724076271056, + "step": 8865 + }, + { + "epoch": 0.0089339582068025, + "grad_norm": 44.40214287692589, + "learning_rate": 8.933787240900026e-06, + "loss": 2.3087, + "mean_token_accuracy": 0.4689655125141144, + "step": 8870 + }, + { + "epoch": 0.008938994259906671, + "grad_norm": 46.459044601030485, + "learning_rate": 8.938823197631087e-06, + "loss": 2.5888, + "mean_token_accuracy": 0.41724138259887694, + "step": 8875 + }, + { + "epoch": 0.008944030313010845, + "grad_norm": 36.17053422894732, + "learning_rate": 8.943859154362146e-06, + "loss": 2.3261, + "mean_token_accuracy": 0.4379310369491577, + "step": 8880 + }, + { + "epoch": 0.008949066366115017, + "grad_norm": 39.471827120213575, + "learning_rate": 8.948895111093206e-06, + "loss": 2.2563, + "mean_token_accuracy": 0.43448275327682495, + "step": 8885 + }, + { + "epoch": 0.00895410241921919, + "grad_norm": 35.29991544199468, + "learning_rate": 8.953931067824266e-06, + "loss": 2.4227, + "mean_token_accuracy": 0.42758620381355283, + "step": 8890 + }, + { + "epoch": 0.008959138472323363, + "grad_norm": 32.2206525931091, + "learning_rate": 8.958967024555326e-06, + "loss": 2.0856, + "mean_token_accuracy": 0.47241378426551817, + "step": 8895 + }, + { + "epoch": 0.008964174525427535, + "grad_norm": 40.16277530680962, + "learning_rate": 8.964002981286385e-06, + "loss": 2.126, + "mean_token_accuracy": 0.5068965494632721, + "step": 8900 + }, + { + "epoch": 0.008969210578531709, + "grad_norm": 36.27583049175853, + "learning_rate": 8.969038938017444e-06, + "loss": 2.3759, + "mean_token_accuracy": 0.41379311084747317, + "step": 8905 + }, + { + "epoch": 0.00897424663163588, + "grad_norm": 40.78625627856431, + "learning_rate": 8.974074894748505e-06, + "loss": 2.4439, + "mean_token_accuracy": 0.39310344457626345, + "step": 8910 + }, + { + "epoch": 0.008979282684740054, + "grad_norm": 35.61396377356604, + "learning_rate": 8.979110851479564e-06, + "loss": 2.657, + "mean_token_accuracy": 0.37586207389831544, + "step": 8915 + }, + { + "epoch": 0.008984318737844227, + "grad_norm": 41.937265004924406, + "learning_rate": 8.984146808210624e-06, + "loss": 2.4642, + "mean_token_accuracy": 0.4034482777118683, + "step": 8920 + }, + { + "epoch": 0.0089893547909484, + "grad_norm": 46.92405418834552, + "learning_rate": 8.989182764941683e-06, + "loss": 2.1024, + "mean_token_accuracy": 0.482758617401123, + "step": 8925 + }, + { + "epoch": 0.008994390844052572, + "grad_norm": 40.53266462613649, + "learning_rate": 8.994218721672744e-06, + "loss": 2.5797, + "mean_token_accuracy": 0.4137930989265442, + "step": 8930 + }, + { + "epoch": 0.008999426897156744, + "grad_norm": 39.23018318463504, + "learning_rate": 8.999254678403803e-06, + "loss": 2.2317, + "mean_token_accuracy": 0.46551724076271056, + "step": 8935 + }, + { + "epoch": 0.009004462950260918, + "grad_norm": 31.550674691626337, + "learning_rate": 9.004290635134862e-06, + "loss": 2.8168, + "mean_token_accuracy": 0.38620689511299133, + "step": 8940 + }, + { + "epoch": 0.00900949900336509, + "grad_norm": 68.64533245222765, + "learning_rate": 9.009326591865923e-06, + "loss": 2.2131, + "mean_token_accuracy": 0.482758629322052, + "step": 8945 + }, + { + "epoch": 0.009014535056469264, + "grad_norm": 42.42249912773483, + "learning_rate": 9.014362548596983e-06, + "loss": 2.1215, + "mean_token_accuracy": 0.4862068951129913, + "step": 8950 + }, + { + "epoch": 0.009019571109573436, + "grad_norm": 45.18732616195425, + "learning_rate": 9.019398505328044e-06, + "loss": 2.4248, + "mean_token_accuracy": 0.44187192916870116, + "step": 8955 + }, + { + "epoch": 0.00902460716267761, + "grad_norm": 35.63652813760277, + "learning_rate": 9.024434462059101e-06, + "loss": 2.6377, + "mean_token_accuracy": 0.4068965554237366, + "step": 8960 + }, + { + "epoch": 0.009029643215781782, + "grad_norm": 35.049794696554954, + "learning_rate": 9.029470418790162e-06, + "loss": 2.5643, + "mean_token_accuracy": 0.4206896543502808, + "step": 8965 + }, + { + "epoch": 0.009034679268885954, + "grad_norm": 41.191098651544245, + "learning_rate": 9.034506375521221e-06, + "loss": 2.3087, + "mean_token_accuracy": 0.45862069725990295, + "step": 8970 + }, + { + "epoch": 0.009039715321990128, + "grad_norm": 33.91435207143863, + "learning_rate": 9.039542332252282e-06, + "loss": 2.1685, + "mean_token_accuracy": 0.458620685338974, + "step": 8975 + }, + { + "epoch": 0.0090447513750943, + "grad_norm": 31.991502555305818, + "learning_rate": 9.044578288983342e-06, + "loss": 2.2538, + "mean_token_accuracy": 0.44137930274009707, + "step": 8980 + }, + { + "epoch": 0.009049787428198473, + "grad_norm": 31.863882731371987, + "learning_rate": 9.0496142457144e-06, + "loss": 2.5573, + "mean_token_accuracy": 0.38275861740112305, + "step": 8985 + }, + { + "epoch": 0.009054823481302645, + "grad_norm": 36.29538954972781, + "learning_rate": 9.054650202445462e-06, + "loss": 2.1569, + "mean_token_accuracy": 0.4517241358757019, + "step": 8990 + }, + { + "epoch": 0.009059859534406819, + "grad_norm": 35.64888336346998, + "learning_rate": 9.059686159176521e-06, + "loss": 2.1827, + "mean_token_accuracy": 0.4413793087005615, + "step": 8995 + }, + { + "epoch": 0.009064895587510991, + "grad_norm": 35.91379375037939, + "learning_rate": 9.064722115907582e-06, + "loss": 2.3477, + "mean_token_accuracy": 0.4517241299152374, + "step": 9000 + }, + { + "epoch": 0.009069931640615163, + "grad_norm": 36.88970119610507, + "learning_rate": 9.06975807263864e-06, + "loss": 2.6087, + "mean_token_accuracy": 0.3965517163276672, + "step": 9005 + }, + { + "epoch": 0.009074967693719337, + "grad_norm": 41.9216727845431, + "learning_rate": 9.0747940293697e-06, + "loss": 2.3214, + "mean_token_accuracy": 0.458620685338974, + "step": 9010 + }, + { + "epoch": 0.009080003746823509, + "grad_norm": 48.24826254484031, + "learning_rate": 9.07982998610076e-06, + "loss": 2.2357, + "mean_token_accuracy": 0.46551724076271056, + "step": 9015 + }, + { + "epoch": 0.009085039799927683, + "grad_norm": 89.105300244921, + "learning_rate": 9.08486594283182e-06, + "loss": 2.3236, + "mean_token_accuracy": 0.43647912740707395, + "step": 9020 + }, + { + "epoch": 0.009090075853031855, + "grad_norm": 47.32426116857205, + "learning_rate": 9.08990189956288e-06, + "loss": 2.6921, + "mean_token_accuracy": 0.37586206793785093, + "step": 9025 + }, + { + "epoch": 0.009095111906136029, + "grad_norm": 35.637115173034246, + "learning_rate": 9.094937856293939e-06, + "loss": 2.4778, + "mean_token_accuracy": 0.3793103456497192, + "step": 9030 + }, + { + "epoch": 0.0091001479592402, + "grad_norm": 46.00403941566564, + "learning_rate": 9.099973813025e-06, + "loss": 2.4927, + "mean_token_accuracy": 0.39310344457626345, + "step": 9035 + }, + { + "epoch": 0.009105184012344373, + "grad_norm": 38.43241376328721, + "learning_rate": 9.10500976975606e-06, + "loss": 2.2848, + "mean_token_accuracy": 0.42758620381355283, + "step": 9040 + }, + { + "epoch": 0.009110220065448546, + "grad_norm": 35.176183223070055, + "learning_rate": 9.110045726487119e-06, + "loss": 2.4917, + "mean_token_accuracy": 0.38965516686439516, + "step": 9045 + }, + { + "epoch": 0.009115256118552718, + "grad_norm": 38.76078797257197, + "learning_rate": 9.115081683218178e-06, + "loss": 2.5065, + "mean_token_accuracy": 0.37586206793785093, + "step": 9050 + }, + { + "epoch": 0.009120292171656892, + "grad_norm": 39.51896197275594, + "learning_rate": 9.120117639949239e-06, + "loss": 2.1839, + "mean_token_accuracy": 0.48620688915252686, + "step": 9055 + }, + { + "epoch": 0.009125328224761064, + "grad_norm": 52.32388189414563, + "learning_rate": 9.125153596680298e-06, + "loss": 2.3236, + "mean_token_accuracy": 0.4655172348022461, + "step": 9060 + }, + { + "epoch": 0.009130364277865238, + "grad_norm": 36.430747498799896, + "learning_rate": 9.130189553411357e-06, + "loss": 1.9821, + "mean_token_accuracy": 0.4931034445762634, + "step": 9065 + }, + { + "epoch": 0.00913540033096941, + "grad_norm": 40.2020694323548, + "learning_rate": 9.135225510142417e-06, + "loss": 2.2924, + "mean_token_accuracy": 0.5, + "step": 9070 + }, + { + "epoch": 0.009140436384073582, + "grad_norm": 41.64997068405822, + "learning_rate": 9.140261466873477e-06, + "loss": 2.2588, + "mean_token_accuracy": 0.45517241954803467, + "step": 9075 + }, + { + "epoch": 0.009145472437177756, + "grad_norm": 39.939107386196405, + "learning_rate": 9.145297423604537e-06, + "loss": 2.0679, + "mean_token_accuracy": 0.46551724076271056, + "step": 9080 + }, + { + "epoch": 0.009150508490281928, + "grad_norm": 37.80708946159949, + "learning_rate": 9.150333380335596e-06, + "loss": 2.2589, + "mean_token_accuracy": 0.4689655303955078, + "step": 9085 + }, + { + "epoch": 0.009155544543386102, + "grad_norm": 40.82085204326002, + "learning_rate": 9.155369337066657e-06, + "loss": 2.3523, + "mean_token_accuracy": 0.4310344815254211, + "step": 9090 + }, + { + "epoch": 0.009160580596490274, + "grad_norm": 36.91840742315867, + "learning_rate": 9.160405293797716e-06, + "loss": 2.2179, + "mean_token_accuracy": 0.4206896543502808, + "step": 9095 + }, + { + "epoch": 0.009165616649594447, + "grad_norm": 39.089459598305226, + "learning_rate": 9.165441250528777e-06, + "loss": 2.5603, + "mean_token_accuracy": 0.4413793087005615, + "step": 9100 + }, + { + "epoch": 0.00917065270269862, + "grad_norm": 41.483819350705524, + "learning_rate": 9.170477207259835e-06, + "loss": 2.4374, + "mean_token_accuracy": 0.4068965494632721, + "step": 9105 + }, + { + "epoch": 0.009175688755802791, + "grad_norm": 38.93403221599773, + "learning_rate": 9.175513163990896e-06, + "loss": 2.4541, + "mean_token_accuracy": 0.4532970368862152, + "step": 9110 + }, + { + "epoch": 0.009180724808906965, + "grad_norm": 37.15036938373122, + "learning_rate": 9.180549120721955e-06, + "loss": 2.2243, + "mean_token_accuracy": 0.4724137902259827, + "step": 9115 + }, + { + "epoch": 0.009185760862011137, + "grad_norm": 43.307143940722554, + "learning_rate": 9.185585077453016e-06, + "loss": 2.342, + "mean_token_accuracy": 0.4517241418361664, + "step": 9120 + }, + { + "epoch": 0.009190796915115311, + "grad_norm": 36.52593057082021, + "learning_rate": 9.190621034184075e-06, + "loss": 2.4541, + "mean_token_accuracy": 0.3827586233615875, + "step": 9125 + }, + { + "epoch": 0.009195832968219483, + "grad_norm": 31.69491766657416, + "learning_rate": 9.195656990915134e-06, + "loss": 2.2288, + "mean_token_accuracy": 0.4880822718143463, + "step": 9130 + }, + { + "epoch": 0.009200869021323657, + "grad_norm": 33.46994089808851, + "learning_rate": 9.200692947646195e-06, + "loss": 2.5069, + "mean_token_accuracy": 0.4000000059604645, + "step": 9135 + }, + { + "epoch": 0.009205905074427829, + "grad_norm": 47.98058660860216, + "learning_rate": 9.205728904377255e-06, + "loss": 2.0576, + "mean_token_accuracy": 0.47931034564971925, + "step": 9140 + }, + { + "epoch": 0.009210941127532, + "grad_norm": 36.29921682152968, + "learning_rate": 9.210764861108314e-06, + "loss": 2.1427, + "mean_token_accuracy": 0.44827585816383364, + "step": 9145 + }, + { + "epoch": 0.009215977180636175, + "grad_norm": 33.49505729104911, + "learning_rate": 9.215800817839373e-06, + "loss": 2.6732, + "mean_token_accuracy": 0.4034482777118683, + "step": 9150 + }, + { + "epoch": 0.009221013233740347, + "grad_norm": 44.95200390828714, + "learning_rate": 9.220836774570434e-06, + "loss": 2.4702, + "mean_token_accuracy": 0.41379310488700866, + "step": 9155 + }, + { + "epoch": 0.00922604928684452, + "grad_norm": 42.43805088101427, + "learning_rate": 9.225872731301493e-06, + "loss": 2.6847, + "mean_token_accuracy": 0.39655172228813174, + "step": 9160 + }, + { + "epoch": 0.009231085339948692, + "grad_norm": 32.32555586065422, + "learning_rate": 9.230908688032552e-06, + "loss": 1.752, + "mean_token_accuracy": 0.5379310369491577, + "step": 9165 + }, + { + "epoch": 0.009236121393052866, + "grad_norm": 34.91766987949955, + "learning_rate": 9.235944644763612e-06, + "loss": 2.1642, + "mean_token_accuracy": 0.4661329984664917, + "step": 9170 + }, + { + "epoch": 0.009241157446157038, + "grad_norm": 34.98220392415641, + "learning_rate": 9.240980601494673e-06, + "loss": 2.3802, + "mean_token_accuracy": 0.4517241358757019, + "step": 9175 + }, + { + "epoch": 0.00924619349926121, + "grad_norm": 33.05414702913924, + "learning_rate": 9.246016558225732e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.4137930989265442, + "step": 9180 + }, + { + "epoch": 0.009251229552365384, + "grad_norm": 33.42497700812565, + "learning_rate": 9.251052514956791e-06, + "loss": 2.617, + "mean_token_accuracy": 0.382758629322052, + "step": 9185 + }, + { + "epoch": 0.009256265605469556, + "grad_norm": 38.947026588854825, + "learning_rate": 9.256088471687852e-06, + "loss": 2.3767, + "mean_token_accuracy": 0.43448275327682495, + "step": 9190 + }, + { + "epoch": 0.00926130165857373, + "grad_norm": 51.582724085860605, + "learning_rate": 9.261124428418911e-06, + "loss": 2.4202, + "mean_token_accuracy": 0.3827586233615875, + "step": 9195 + }, + { + "epoch": 0.009266337711677902, + "grad_norm": 48.45775234282198, + "learning_rate": 9.266160385149972e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.43793103098869324, + "step": 9200 + }, + { + "epoch": 0.009271373764782076, + "grad_norm": 33.477875381335934, + "learning_rate": 9.27119634188103e-06, + "loss": 2.2288, + "mean_token_accuracy": 0.43793103098869324, + "step": 9205 + }, + { + "epoch": 0.009276409817886248, + "grad_norm": 33.39386192015122, + "learning_rate": 9.276232298612091e-06, + "loss": 2.2179, + "mean_token_accuracy": 0.4931034445762634, + "step": 9210 + }, + { + "epoch": 0.00928144587099042, + "grad_norm": 37.65668756353228, + "learning_rate": 9.28126825534315e-06, + "loss": 2.5813, + "mean_token_accuracy": 0.3965517282485962, + "step": 9215 + }, + { + "epoch": 0.009286481924094593, + "grad_norm": 45.799604301856725, + "learning_rate": 9.286304212074211e-06, + "loss": 2.2835, + "mean_token_accuracy": 0.458620685338974, + "step": 9220 + }, + { + "epoch": 0.009291517977198765, + "grad_norm": 37.43982355107599, + "learning_rate": 9.29134016880527e-06, + "loss": 2.1804, + "mean_token_accuracy": 0.47586206793785096, + "step": 9225 + }, + { + "epoch": 0.00929655403030294, + "grad_norm": 30.442295618337628, + "learning_rate": 9.29637612553633e-06, + "loss": 2.1534, + "mean_token_accuracy": 0.44827585816383364, + "step": 9230 + }, + { + "epoch": 0.009301590083407111, + "grad_norm": 28.580133085103068, + "learning_rate": 9.30141208226739e-06, + "loss": 2.3145, + "mean_token_accuracy": 0.4482758641242981, + "step": 9235 + }, + { + "epoch": 0.009306626136511285, + "grad_norm": 48.86313982794057, + "learning_rate": 9.30644803899845e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.4586206912994385, + "step": 9240 + }, + { + "epoch": 0.009311662189615457, + "grad_norm": 35.62881252197929, + "learning_rate": 9.311483995729509e-06, + "loss": 2.3503, + "mean_token_accuracy": 0.37586206793785093, + "step": 9245 + }, + { + "epoch": 0.009316698242719629, + "grad_norm": 37.529854807030055, + "learning_rate": 9.316519952460568e-06, + "loss": 2.168, + "mean_token_accuracy": 0.4310344815254211, + "step": 9250 + }, + { + "epoch": 0.009321734295823803, + "grad_norm": 30.44971412684755, + "learning_rate": 9.32155590919163e-06, + "loss": 2.2228, + "mean_token_accuracy": 0.46551724076271056, + "step": 9255 + }, + { + "epoch": 0.009326770348927975, + "grad_norm": 40.207080397248326, + "learning_rate": 9.326591865922688e-06, + "loss": 2.4239, + "mean_token_accuracy": 0.4068965554237366, + "step": 9260 + }, + { + "epoch": 0.009331806402032149, + "grad_norm": 37.07009577442753, + "learning_rate": 9.331627822653748e-06, + "loss": 2.6045, + "mean_token_accuracy": 0.38965516686439516, + "step": 9265 + }, + { + "epoch": 0.00933684245513632, + "grad_norm": 37.49203138237693, + "learning_rate": 9.336663779384807e-06, + "loss": 2.3839, + "mean_token_accuracy": 0.41034482717514037, + "step": 9270 + }, + { + "epoch": 0.009341878508240494, + "grad_norm": 37.72476793928882, + "learning_rate": 9.341699736115868e-06, + "loss": 2.733, + "mean_token_accuracy": 0.3551724135875702, + "step": 9275 + }, + { + "epoch": 0.009346914561344666, + "grad_norm": 39.5307546229843, + "learning_rate": 9.346735692846927e-06, + "loss": 2.3011, + "mean_token_accuracy": 0.4436176776885986, + "step": 9280 + }, + { + "epoch": 0.009351950614448839, + "grad_norm": 35.12676499845565, + "learning_rate": 9.351771649577986e-06, + "loss": 2.2473, + "mean_token_accuracy": 0.4551724076271057, + "step": 9285 + }, + { + "epoch": 0.009356986667553012, + "grad_norm": 37.01640778622918, + "learning_rate": 9.356807606309047e-06, + "loss": 2.3446, + "mean_token_accuracy": 0.4344827592372894, + "step": 9290 + }, + { + "epoch": 0.009362022720657184, + "grad_norm": 32.90171139625545, + "learning_rate": 9.361843563040107e-06, + "loss": 2.5519, + "mean_token_accuracy": 0.4620689690113068, + "step": 9295 + }, + { + "epoch": 0.009367058773761358, + "grad_norm": 38.007285404658894, + "learning_rate": 9.366879519771168e-06, + "loss": 2.439, + "mean_token_accuracy": 0.42758620977401735, + "step": 9300 + }, + { + "epoch": 0.00937209482686553, + "grad_norm": 35.19365639538145, + "learning_rate": 9.371915476502225e-06, + "loss": 2.3766, + "mean_token_accuracy": 0.4398064136505127, + "step": 9305 + }, + { + "epoch": 0.009377130879969704, + "grad_norm": 41.130173655164306, + "learning_rate": 9.376951433233286e-06, + "loss": 2.1596, + "mean_token_accuracy": 0.5068965554237366, + "step": 9310 + }, + { + "epoch": 0.009382166933073876, + "grad_norm": 41.138525515556516, + "learning_rate": 9.381987389964345e-06, + "loss": 2.1327, + "mean_token_accuracy": 0.501996374130249, + "step": 9315 + }, + { + "epoch": 0.009387202986178048, + "grad_norm": 39.71963933737859, + "learning_rate": 9.387023346695406e-06, + "loss": 2.2676, + "mean_token_accuracy": 0.4310344815254211, + "step": 9320 + }, + { + "epoch": 0.009392239039282222, + "grad_norm": 29.04877378610156, + "learning_rate": 9.392059303426466e-06, + "loss": 2.3532, + "mean_token_accuracy": 0.4551724135875702, + "step": 9325 + }, + { + "epoch": 0.009397275092386394, + "grad_norm": 47.707668170366624, + "learning_rate": 9.397095260157525e-06, + "loss": 2.2283, + "mean_token_accuracy": 0.46551724076271056, + "step": 9330 + }, + { + "epoch": 0.009402311145490568, + "grad_norm": 39.31902167780123, + "learning_rate": 9.402131216888586e-06, + "loss": 2.3474, + "mean_token_accuracy": 0.42068966031074523, + "step": 9335 + }, + { + "epoch": 0.00940734719859474, + "grad_norm": 35.20443660332652, + "learning_rate": 9.407167173619645e-06, + "loss": 2.3389, + "mean_token_accuracy": 0.36206896007061007, + "step": 9340 + }, + { + "epoch": 0.009412383251698913, + "grad_norm": 28.877074126269388, + "learning_rate": 9.412203130350704e-06, + "loss": 2.0916, + "mean_token_accuracy": 0.4724137902259827, + "step": 9345 + }, + { + "epoch": 0.009417419304803085, + "grad_norm": 62.02005661605764, + "learning_rate": 9.417239087081763e-06, + "loss": 2.4439, + "mean_token_accuracy": 0.4034482777118683, + "step": 9350 + }, + { + "epoch": 0.009422455357907257, + "grad_norm": 54.1560187250223, + "learning_rate": 9.422275043812824e-06, + "loss": 2.24, + "mean_token_accuracy": 0.4448275864124298, + "step": 9355 + }, + { + "epoch": 0.009427491411011431, + "grad_norm": 37.19352151029625, + "learning_rate": 9.427311000543884e-06, + "loss": 2.2809, + "mean_token_accuracy": 0.4448275864124298, + "step": 9360 + }, + { + "epoch": 0.009432527464115603, + "grad_norm": 55.89592120683475, + "learning_rate": 9.432346957274943e-06, + "loss": 2.4476, + "mean_token_accuracy": 0.41724138855934145, + "step": 9365 + }, + { + "epoch": 0.009437563517219777, + "grad_norm": 38.05658085036962, + "learning_rate": 9.437382914006002e-06, + "loss": 2.197, + "mean_token_accuracy": 0.4586206912994385, + "step": 9370 + }, + { + "epoch": 0.009442599570323949, + "grad_norm": 48.367057019437205, + "learning_rate": 9.442418870737063e-06, + "loss": 2.3918, + "mean_token_accuracy": 0.42758620381355283, + "step": 9375 + }, + { + "epoch": 0.009447635623428123, + "grad_norm": 35.34738493121819, + "learning_rate": 9.447454827468122e-06, + "loss": 2.3408, + "mean_token_accuracy": 0.4294615864753723, + "step": 9380 + }, + { + "epoch": 0.009452671676532295, + "grad_norm": 35.47678578946282, + "learning_rate": 9.452490784199182e-06, + "loss": 2.3693, + "mean_token_accuracy": 0.44827585816383364, + "step": 9385 + }, + { + "epoch": 0.009457707729636467, + "grad_norm": 45.824387931172495, + "learning_rate": 9.457526740930243e-06, + "loss": 2.5497, + "mean_token_accuracy": 0.3931034505367279, + "step": 9390 + }, + { + "epoch": 0.00946274378274064, + "grad_norm": 40.298470472182366, + "learning_rate": 9.462562697661302e-06, + "loss": 2.3054, + "mean_token_accuracy": 0.46896551847457885, + "step": 9395 + }, + { + "epoch": 0.009467779835844813, + "grad_norm": 36.99580884410551, + "learning_rate": 9.467598654392363e-06, + "loss": 2.3268, + "mean_token_accuracy": 0.42413792610168455, + "step": 9400 + }, + { + "epoch": 0.009472815888948986, + "grad_norm": 36.53748203376824, + "learning_rate": 9.47263461112342e-06, + "loss": 2.2065, + "mean_token_accuracy": 0.4551724076271057, + "step": 9405 + }, + { + "epoch": 0.009477851942053158, + "grad_norm": 30.8940653222547, + "learning_rate": 9.477670567854481e-06, + "loss": 2.2611, + "mean_token_accuracy": 0.42758620977401735, + "step": 9410 + }, + { + "epoch": 0.009482887995157332, + "grad_norm": 28.734091237414834, + "learning_rate": 9.48270652458554e-06, + "loss": 2.1265, + "mean_token_accuracy": 0.5034482777118683, + "step": 9415 + }, + { + "epoch": 0.009487924048261504, + "grad_norm": 31.1004818606481, + "learning_rate": 9.487742481316602e-06, + "loss": 2.1517, + "mean_token_accuracy": 0.4551724076271057, + "step": 9420 + }, + { + "epoch": 0.009492960101365676, + "grad_norm": 34.473597381292784, + "learning_rate": 9.49277843804766e-06, + "loss": 2.6196, + "mean_token_accuracy": 0.45862067937850953, + "step": 9425 + }, + { + "epoch": 0.00949799615446985, + "grad_norm": 53.67873321079563, + "learning_rate": 9.49781439477872e-06, + "loss": 2.4002, + "mean_token_accuracy": 0.41724138259887694, + "step": 9430 + }, + { + "epoch": 0.009503032207574022, + "grad_norm": 47.26783759849226, + "learning_rate": 9.502850351509781e-06, + "loss": 2.4613, + "mean_token_accuracy": 0.4034482777118683, + "step": 9435 + }, + { + "epoch": 0.009508068260678196, + "grad_norm": 31.863222782905083, + "learning_rate": 9.50788630824084e-06, + "loss": 1.9586, + "mean_token_accuracy": 0.4551724135875702, + "step": 9440 + }, + { + "epoch": 0.009513104313782368, + "grad_norm": 39.919495468484904, + "learning_rate": 9.512922264971901e-06, + "loss": 2.2941, + "mean_token_accuracy": 0.47241380214691164, + "step": 9445 + }, + { + "epoch": 0.009518140366886542, + "grad_norm": 48.282929799846265, + "learning_rate": 9.517958221702959e-06, + "loss": 2.6901, + "mean_token_accuracy": 0.4137930989265442, + "step": 9450 + }, + { + "epoch": 0.009523176419990714, + "grad_norm": 37.14896936637999, + "learning_rate": 9.52299417843402e-06, + "loss": 2.3656, + "mean_token_accuracy": 0.4, + "step": 9455 + }, + { + "epoch": 0.009528212473094886, + "grad_norm": 47.67910216055814, + "learning_rate": 9.528030135165079e-06, + "loss": 2.0631, + "mean_token_accuracy": 0.4689655303955078, + "step": 9460 + }, + { + "epoch": 0.00953324852619906, + "grad_norm": 26.999679700384807, + "learning_rate": 9.53306609189614e-06, + "loss": 2.2564, + "mean_token_accuracy": 0.4517241418361664, + "step": 9465 + }, + { + "epoch": 0.009538284579303231, + "grad_norm": 36.86952532916416, + "learning_rate": 9.538102048627197e-06, + "loss": 2.1999, + "mean_token_accuracy": 0.38620689511299133, + "step": 9470 + }, + { + "epoch": 0.009543320632407405, + "grad_norm": 26.76187454186431, + "learning_rate": 9.543138005358258e-06, + "loss": 2.4029, + "mean_token_accuracy": 0.4517241418361664, + "step": 9475 + }, + { + "epoch": 0.009548356685511577, + "grad_norm": 37.90728106773331, + "learning_rate": 9.54817396208932e-06, + "loss": 2.2918, + "mean_token_accuracy": 0.4517241358757019, + "step": 9480 + }, + { + "epoch": 0.009553392738615751, + "grad_norm": 41.43769948358146, + "learning_rate": 9.553209918820379e-06, + "loss": 2.3432, + "mean_token_accuracy": 0.4172413766384125, + "step": 9485 + }, + { + "epoch": 0.009558428791719923, + "grad_norm": 41.565916707039364, + "learning_rate": 9.558245875551438e-06, + "loss": 2.117, + "mean_token_accuracy": 0.49704434871673586, + "step": 9490 + }, + { + "epoch": 0.009563464844824095, + "grad_norm": 53.22743699044022, + "learning_rate": 9.563281832282497e-06, + "loss": 2.3736, + "mean_token_accuracy": 0.43103448748588563, + "step": 9495 + }, + { + "epoch": 0.009568500897928269, + "grad_norm": 38.46665459041057, + "learning_rate": 9.568317789013558e-06, + "loss": 2.1753, + "mean_token_accuracy": 0.4724137902259827, + "step": 9500 + }, + { + "epoch": 0.00957353695103244, + "grad_norm": 32.935608646062974, + "learning_rate": 9.573353745744617e-06, + "loss": 2.5808, + "mean_token_accuracy": 0.4261947989463806, + "step": 9505 + }, + { + "epoch": 0.009578573004136615, + "grad_norm": 40.2120567077209, + "learning_rate": 9.578389702475677e-06, + "loss": 2.3295, + "mean_token_accuracy": 0.4379310369491577, + "step": 9510 + }, + { + "epoch": 0.009583609057240787, + "grad_norm": 34.18815683141399, + "learning_rate": 9.583425659206736e-06, + "loss": 2.351, + "mean_token_accuracy": 0.46037508845329284, + "step": 9515 + }, + { + "epoch": 0.00958864511034496, + "grad_norm": 54.585054653436785, + "learning_rate": 9.588461615937797e-06, + "loss": 2.3846, + "mean_token_accuracy": 0.4620689690113068, + "step": 9520 + }, + { + "epoch": 0.009593681163449132, + "grad_norm": 43.93374931186114, + "learning_rate": 9.593497572668856e-06, + "loss": 2.5726, + "mean_token_accuracy": 0.4482758641242981, + "step": 9525 + }, + { + "epoch": 0.009598717216553304, + "grad_norm": 44.360102782051726, + "learning_rate": 9.598533529399915e-06, + "loss": 2.4388, + "mean_token_accuracy": 0.4758620738983154, + "step": 9530 + }, + { + "epoch": 0.009603753269657478, + "grad_norm": 46.75849766622284, + "learning_rate": 9.603569486130976e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.42758620381355283, + "step": 9535 + }, + { + "epoch": 0.00960878932276165, + "grad_norm": 53.288768433113255, + "learning_rate": 9.608605442862035e-06, + "loss": 2.4145, + "mean_token_accuracy": 0.45390198826789857, + "step": 9540 + }, + { + "epoch": 0.009613825375865824, + "grad_norm": 41.53323184911322, + "learning_rate": 9.613641399593096e-06, + "loss": 2.8338, + "mean_token_accuracy": 0.3379310369491577, + "step": 9545 + }, + { + "epoch": 0.009618861428969996, + "grad_norm": 57.717260151059946, + "learning_rate": 9.618677356324154e-06, + "loss": 2.2696, + "mean_token_accuracy": 0.5172413945198059, + "step": 9550 + }, + { + "epoch": 0.00962389748207417, + "grad_norm": 50.134721596711685, + "learning_rate": 9.623713313055215e-06, + "loss": 2.3364, + "mean_token_accuracy": 0.3999999940395355, + "step": 9555 + }, + { + "epoch": 0.009628933535178342, + "grad_norm": 40.60547468196455, + "learning_rate": 9.628749269786274e-06, + "loss": 2.3199, + "mean_token_accuracy": 0.4517241299152374, + "step": 9560 + }, + { + "epoch": 0.009633969588282514, + "grad_norm": 39.38986228375763, + "learning_rate": 9.633785226517335e-06, + "loss": 1.9784, + "mean_token_accuracy": 0.4564428269863129, + "step": 9565 + }, + { + "epoch": 0.009639005641386688, + "grad_norm": 35.58370937791689, + "learning_rate": 9.638821183248394e-06, + "loss": 2.2372, + "mean_token_accuracy": 0.4413793087005615, + "step": 9570 + }, + { + "epoch": 0.00964404169449086, + "grad_norm": 41.49612424376096, + "learning_rate": 9.643857139979454e-06, + "loss": 2.4391, + "mean_token_accuracy": 0.4172413766384125, + "step": 9575 + }, + { + "epoch": 0.009649077747595033, + "grad_norm": 29.33403181280721, + "learning_rate": 9.648893096710515e-06, + "loss": 2.1304, + "mean_token_accuracy": 0.44827585816383364, + "step": 9580 + }, + { + "epoch": 0.009654113800699205, + "grad_norm": 38.16799360078175, + "learning_rate": 9.653929053441574e-06, + "loss": 2.4373, + "mean_token_accuracy": 0.4379310369491577, + "step": 9585 + }, + { + "epoch": 0.00965914985380338, + "grad_norm": 37.07512875659534, + "learning_rate": 9.658965010172633e-06, + "loss": 2.3941, + "mean_token_accuracy": 0.4206896543502808, + "step": 9590 + }, + { + "epoch": 0.009664185906907551, + "grad_norm": 44.19484681262662, + "learning_rate": 9.664000966903692e-06, + "loss": 2.506, + "mean_token_accuracy": 0.458620685338974, + "step": 9595 + }, + { + "epoch": 0.009669221960011723, + "grad_norm": 32.4605651046261, + "learning_rate": 9.669036923634753e-06, + "loss": 2.486, + "mean_token_accuracy": 0.42758620977401735, + "step": 9600 + }, + { + "epoch": 0.009674258013115897, + "grad_norm": 40.13781900070692, + "learning_rate": 9.674072880365812e-06, + "loss": 2.4369, + "mean_token_accuracy": 0.4310344815254211, + "step": 9605 + }, + { + "epoch": 0.009679294066220069, + "grad_norm": 40.321007584891326, + "learning_rate": 9.679108837096872e-06, + "loss": 2.2452, + "mean_token_accuracy": 0.4534180223941803, + "step": 9610 + }, + { + "epoch": 0.009684330119324243, + "grad_norm": 46.95991123018105, + "learning_rate": 9.684144793827931e-06, + "loss": 2.23, + "mean_token_accuracy": 0.4482758641242981, + "step": 9615 + }, + { + "epoch": 0.009689366172428415, + "grad_norm": 38.62739915985313, + "learning_rate": 9.689180750558992e-06, + "loss": 2.489, + "mean_token_accuracy": 0.39310344457626345, + "step": 9620 + }, + { + "epoch": 0.009694402225532587, + "grad_norm": 62.271892841552415, + "learning_rate": 9.694216707290051e-06, + "loss": 2.6741, + "mean_token_accuracy": 0.3999999940395355, + "step": 9625 + }, + { + "epoch": 0.00969943827863676, + "grad_norm": 27.64831639695065, + "learning_rate": 9.69925266402111e-06, + "loss": 2.0584, + "mean_token_accuracy": 0.49999999403953554, + "step": 9630 + }, + { + "epoch": 0.009704474331740933, + "grad_norm": 49.90490509813038, + "learning_rate": 9.704288620752171e-06, + "loss": 2.5651, + "mean_token_accuracy": 0.40344828367233276, + "step": 9635 + }, + { + "epoch": 0.009709510384845106, + "grad_norm": 54.015041389139334, + "learning_rate": 9.70932457748323e-06, + "loss": 2.8962, + "mean_token_accuracy": 0.3896551728248596, + "step": 9640 + }, + { + "epoch": 0.009714546437949279, + "grad_norm": 44.89892736778689, + "learning_rate": 9.714360534214292e-06, + "loss": 2.1613, + "mean_token_accuracy": 0.42758620977401735, + "step": 9645 + }, + { + "epoch": 0.009719582491053452, + "grad_norm": 38.46119154950889, + "learning_rate": 9.71939649094535e-06, + "loss": 2.0622, + "mean_token_accuracy": 0.4965517222881317, + "step": 9650 + }, + { + "epoch": 0.009724618544157624, + "grad_norm": 50.21323139349961, + "learning_rate": 9.72443244767641e-06, + "loss": 2.2516, + "mean_token_accuracy": 0.44827585816383364, + "step": 9655 + }, + { + "epoch": 0.009729654597261796, + "grad_norm": 37.73573326108465, + "learning_rate": 9.72946840440747e-06, + "loss": 2.2475, + "mean_token_accuracy": 0.4482758641242981, + "step": 9660 + }, + { + "epoch": 0.00973469065036597, + "grad_norm": 40.6102850334886, + "learning_rate": 9.73450436113853e-06, + "loss": 2.2481, + "mean_token_accuracy": 0.4448275983333588, + "step": 9665 + }, + { + "epoch": 0.009739726703470142, + "grad_norm": 29.76052277546363, + "learning_rate": 9.73954031786959e-06, + "loss": 2.5081, + "mean_token_accuracy": 0.43103447556495667, + "step": 9670 + }, + { + "epoch": 0.009744762756574316, + "grad_norm": 32.381522891329375, + "learning_rate": 9.744576274600649e-06, + "loss": 2.2839, + "mean_token_accuracy": 0.44137930274009707, + "step": 9675 + }, + { + "epoch": 0.009749798809678488, + "grad_norm": 46.2840128628751, + "learning_rate": 9.74961223133171e-06, + "loss": 2.6206, + "mean_token_accuracy": 0.42758620977401735, + "step": 9680 + }, + { + "epoch": 0.009754834862782662, + "grad_norm": 43.417552676867075, + "learning_rate": 9.754648188062769e-06, + "loss": 2.2318, + "mean_token_accuracy": 0.45862067937850953, + "step": 9685 + }, + { + "epoch": 0.009759870915886834, + "grad_norm": 31.91717182901664, + "learning_rate": 9.759684144793828e-06, + "loss": 2.0837, + "mean_token_accuracy": 0.49310343265533446, + "step": 9690 + }, + { + "epoch": 0.009764906968991006, + "grad_norm": 36.06732688179533, + "learning_rate": 9.764720101524888e-06, + "loss": 2.4187, + "mean_token_accuracy": 0.4379310369491577, + "step": 9695 + }, + { + "epoch": 0.00976994302209518, + "grad_norm": 26.48922905878847, + "learning_rate": 9.769756058255948e-06, + "loss": 2.0905, + "mean_token_accuracy": 0.4744101643562317, + "step": 9700 + }, + { + "epoch": 0.009774979075199352, + "grad_norm": 28.02741967196921, + "learning_rate": 9.774792014987008e-06, + "loss": 2.2322, + "mean_token_accuracy": 0.4862068951129913, + "step": 9705 + }, + { + "epoch": 0.009780015128303525, + "grad_norm": 41.51969097502853, + "learning_rate": 9.779827971718067e-06, + "loss": 2.4651, + "mean_token_accuracy": 0.44482758045196535, + "step": 9710 + }, + { + "epoch": 0.009785051181407697, + "grad_norm": 35.66419905736955, + "learning_rate": 9.784863928449126e-06, + "loss": 2.2878, + "mean_token_accuracy": 0.44827585816383364, + "step": 9715 + }, + { + "epoch": 0.009790087234511871, + "grad_norm": 31.90114095618044, + "learning_rate": 9.789899885180187e-06, + "loss": 2.1398, + "mean_token_accuracy": 0.4689655125141144, + "step": 9720 + }, + { + "epoch": 0.009795123287616043, + "grad_norm": 39.50798321565754, + "learning_rate": 9.794935841911246e-06, + "loss": 2.1036, + "mean_token_accuracy": 0.5, + "step": 9725 + }, + { + "epoch": 0.009800159340720215, + "grad_norm": 40.770153479433105, + "learning_rate": 9.799971798642306e-06, + "loss": 2.2716, + "mean_token_accuracy": 0.43448275327682495, + "step": 9730 + }, + { + "epoch": 0.009805195393824389, + "grad_norm": 36.35831334602836, + "learning_rate": 9.805007755373367e-06, + "loss": 2.4703, + "mean_token_accuracy": 0.4172413766384125, + "step": 9735 + }, + { + "epoch": 0.009810231446928561, + "grad_norm": 47.07511466216155, + "learning_rate": 9.810043712104426e-06, + "loss": 2.48, + "mean_token_accuracy": 0.4, + "step": 9740 + }, + { + "epoch": 0.009815267500032735, + "grad_norm": 39.24649678591775, + "learning_rate": 9.815079668835487e-06, + "loss": 2.1266, + "mean_token_accuracy": 0.4482758641242981, + "step": 9745 + }, + { + "epoch": 0.009820303553136907, + "grad_norm": 50.72562269044652, + "learning_rate": 9.820115625566544e-06, + "loss": 2.4771, + "mean_token_accuracy": 0.441379314661026, + "step": 9750 + }, + { + "epoch": 0.00982533960624108, + "grad_norm": 29.216256530995295, + "learning_rate": 9.825151582297605e-06, + "loss": 2.2538, + "mean_token_accuracy": 0.4672111332416534, + "step": 9755 + }, + { + "epoch": 0.009830375659345253, + "grad_norm": 42.76259851964069, + "learning_rate": 9.830187539028665e-06, + "loss": 2.2418, + "mean_token_accuracy": 0.458620685338974, + "step": 9760 + }, + { + "epoch": 0.009835411712449425, + "grad_norm": 39.224557538858306, + "learning_rate": 9.835223495759726e-06, + "loss": 2.3883, + "mean_token_accuracy": 0.4137930989265442, + "step": 9765 + }, + { + "epoch": 0.009840447765553598, + "grad_norm": 48.384732272443365, + "learning_rate": 9.840259452490785e-06, + "loss": 2.2704, + "mean_token_accuracy": 0.4379310369491577, + "step": 9770 + }, + { + "epoch": 0.00984548381865777, + "grad_norm": 49.31576871313363, + "learning_rate": 9.845295409221844e-06, + "loss": 2.1596, + "mean_token_accuracy": 0.4896551787853241, + "step": 9775 + }, + { + "epoch": 0.009850519871761944, + "grad_norm": 51.34938786740068, + "learning_rate": 9.850331365952905e-06, + "loss": 2.389, + "mean_token_accuracy": 0.4517241358757019, + "step": 9780 + }, + { + "epoch": 0.009855555924866116, + "grad_norm": 35.20188050811193, + "learning_rate": 9.855367322683964e-06, + "loss": 2.3697, + "mean_token_accuracy": 0.46551724672317507, + "step": 9785 + }, + { + "epoch": 0.00986059197797029, + "grad_norm": 26.90127488898475, + "learning_rate": 9.860403279415023e-06, + "loss": 2.3977, + "mean_token_accuracy": 0.41863279342651366, + "step": 9790 + }, + { + "epoch": 0.009865628031074462, + "grad_norm": 48.6790640080729, + "learning_rate": 9.865439236146083e-06, + "loss": 2.6652, + "mean_token_accuracy": 0.3965517282485962, + "step": 9795 + }, + { + "epoch": 0.009870664084178634, + "grad_norm": 37.18319928261827, + "learning_rate": 9.870475192877144e-06, + "loss": 2.0745, + "mean_token_accuracy": 0.5068965554237366, + "step": 9800 + }, + { + "epoch": 0.009875700137282808, + "grad_norm": 34.914326881492144, + "learning_rate": 9.875511149608203e-06, + "loss": 2.2877, + "mean_token_accuracy": 0.4551724076271057, + "step": 9805 + }, + { + "epoch": 0.00988073619038698, + "grad_norm": 28.541790634884798, + "learning_rate": 9.880547106339262e-06, + "loss": 2.2875, + "mean_token_accuracy": 0.4586206912994385, + "step": 9810 + }, + { + "epoch": 0.009885772243491154, + "grad_norm": 32.10128822311137, + "learning_rate": 9.885583063070321e-06, + "loss": 2.3616, + "mean_token_accuracy": 0.4379310429096222, + "step": 9815 + }, + { + "epoch": 0.009890808296595326, + "grad_norm": 34.681438426135955, + "learning_rate": 9.890619019801382e-06, + "loss": 2.5218, + "mean_token_accuracy": 0.42758620977401735, + "step": 9820 + }, + { + "epoch": 0.0098958443496995, + "grad_norm": 36.78314398172527, + "learning_rate": 9.895654976532442e-06, + "loss": 2.4562, + "mean_token_accuracy": 0.4068965494632721, + "step": 9825 + }, + { + "epoch": 0.009900880402803671, + "grad_norm": 36.400868026310306, + "learning_rate": 9.900690933263503e-06, + "loss": 2.4623, + "mean_token_accuracy": 0.4551724135875702, + "step": 9830 + }, + { + "epoch": 0.009905916455907843, + "grad_norm": 63.09611463095056, + "learning_rate": 9.905726889994562e-06, + "loss": 2.8517, + "mean_token_accuracy": 0.3862069070339203, + "step": 9835 + }, + { + "epoch": 0.009910952509012017, + "grad_norm": 37.59352311555945, + "learning_rate": 9.910762846725621e-06, + "loss": 2.4938, + "mean_token_accuracy": 0.4344827592372894, + "step": 9840 + }, + { + "epoch": 0.00991598856211619, + "grad_norm": 33.1384832019594, + "learning_rate": 9.915798803456682e-06, + "loss": 2.4016, + "mean_token_accuracy": 0.4068965554237366, + "step": 9845 + }, + { + "epoch": 0.009921024615220363, + "grad_norm": 42.68695036645854, + "learning_rate": 9.920834760187741e-06, + "loss": 2.4635, + "mean_token_accuracy": 0.42758620977401735, + "step": 9850 + }, + { + "epoch": 0.009926060668324535, + "grad_norm": 43.95232104622241, + "learning_rate": 9.9258707169188e-06, + "loss": 2.4681, + "mean_token_accuracy": 0.42413792610168455, + "step": 9855 + }, + { + "epoch": 0.009931096721428709, + "grad_norm": 30.44499917573671, + "learning_rate": 9.93090667364986e-06, + "loss": 2.2056, + "mean_token_accuracy": 0.4103448212146759, + "step": 9860 + }, + { + "epoch": 0.00993613277453288, + "grad_norm": 38.329240619404715, + "learning_rate": 9.93594263038092e-06, + "loss": 2.294, + "mean_token_accuracy": 0.44827585816383364, + "step": 9865 + }, + { + "epoch": 0.009941168827637053, + "grad_norm": 34.0841614870972, + "learning_rate": 9.94097858711198e-06, + "loss": 1.9979, + "mean_token_accuracy": 0.47241378426551817, + "step": 9870 + }, + { + "epoch": 0.009946204880741227, + "grad_norm": 44.164764186961506, + "learning_rate": 9.94601454384304e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.42758620977401735, + "step": 9875 + }, + { + "epoch": 0.009951240933845399, + "grad_norm": 44.39462956801971, + "learning_rate": 9.9510505005741e-06, + "loss": 2.474, + "mean_token_accuracy": 0.3999999940395355, + "step": 9880 + }, + { + "epoch": 0.009956276986949572, + "grad_norm": 55.0620247814656, + "learning_rate": 9.95608645730516e-06, + "loss": 2.2507, + "mean_token_accuracy": 0.4551724076271057, + "step": 9885 + }, + { + "epoch": 0.009961313040053744, + "grad_norm": 36.68055150617545, + "learning_rate": 9.96112241403622e-06, + "loss": 2.3992, + "mean_token_accuracy": 0.4379310250282288, + "step": 9890 + }, + { + "epoch": 0.009966349093157918, + "grad_norm": 39.585869992984804, + "learning_rate": 9.966158370767278e-06, + "loss": 2.7119, + "mean_token_accuracy": 0.37241379022598264, + "step": 9895 + }, + { + "epoch": 0.00997138514626209, + "grad_norm": 40.458863814081724, + "learning_rate": 9.971194327498339e-06, + "loss": 2.4842, + "mean_token_accuracy": 0.41724138259887694, + "step": 9900 + }, + { + "epoch": 0.009976421199366262, + "grad_norm": 53.56858318648054, + "learning_rate": 9.976230284229398e-06, + "loss": 2.416, + "mean_token_accuracy": 0.48620688915252686, + "step": 9905 + }, + { + "epoch": 0.009981457252470436, + "grad_norm": 44.82050561599625, + "learning_rate": 9.981266240960459e-06, + "loss": 2.071, + "mean_token_accuracy": 0.5091954052448273, + "step": 9910 + }, + { + "epoch": 0.009986493305574608, + "grad_norm": 38.30779622627783, + "learning_rate": 9.986302197691517e-06, + "loss": 2.174, + "mean_token_accuracy": 0.4379310369491577, + "step": 9915 + }, + { + "epoch": 0.009991529358678782, + "grad_norm": 67.06462653549127, + "learning_rate": 9.991338154422578e-06, + "loss": 2.2627, + "mean_token_accuracy": 0.4379310369491577, + "step": 9920 + }, + { + "epoch": 0.009996565411782954, + "grad_norm": 28.459262555152275, + "learning_rate": 9.996374111153637e-06, + "loss": 2.3344, + "mean_token_accuracy": 0.44482758045196535, + "step": 9925 + }, + { + "epoch": 0.010001601464887128, + "grad_norm": 46.06415459922314, + "learning_rate": 1.0001410067884698e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.47035692930221557, + "step": 9930 + }, + { + "epoch": 0.0100066375179913, + "grad_norm": 51.086771355234134, + "learning_rate": 1.0006446024615757e-05, + "loss": 2.5218, + "mean_token_accuracy": 0.36896551251411436, + "step": 9935 + }, + { + "epoch": 0.010011673571095472, + "grad_norm": 32.83820277340714, + "learning_rate": 1.0011481981346816e-05, + "loss": 2.3408, + "mean_token_accuracy": 0.4689655125141144, + "step": 9940 + }, + { + "epoch": 0.010016709624199645, + "grad_norm": 48.87697795659866, + "learning_rate": 1.0016517938077877e-05, + "loss": 2.6272, + "mean_token_accuracy": 0.39655172228813174, + "step": 9945 + }, + { + "epoch": 0.010021745677303818, + "grad_norm": 34.953158531330146, + "learning_rate": 1.0021553894808937e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.42758620977401735, + "step": 9950 + }, + { + "epoch": 0.010026781730407991, + "grad_norm": 28.441336427561588, + "learning_rate": 1.0026589851539996e-05, + "loss": 2.4865, + "mean_token_accuracy": 0.3793103456497192, + "step": 9955 + }, + { + "epoch": 0.010031817783512163, + "grad_norm": 42.87131918602361, + "learning_rate": 1.0031625808271055e-05, + "loss": 2.422, + "mean_token_accuracy": 0.4344827473163605, + "step": 9960 + }, + { + "epoch": 0.010036853836616337, + "grad_norm": 43.3397507835919, + "learning_rate": 1.0036661765002116e-05, + "loss": 2.3817, + "mean_token_accuracy": 0.4551724135875702, + "step": 9965 + }, + { + "epoch": 0.010041889889720509, + "grad_norm": 38.42467000016695, + "learning_rate": 1.0041697721733175e-05, + "loss": 2.2616, + "mean_token_accuracy": 0.4413793087005615, + "step": 9970 + }, + { + "epoch": 0.010046925942824681, + "grad_norm": 31.831649271520824, + "learning_rate": 1.0046733678464234e-05, + "loss": 2.1811, + "mean_token_accuracy": 0.47931034564971925, + "step": 9975 + }, + { + "epoch": 0.010051961995928855, + "grad_norm": 43.05992344527729, + "learning_rate": 1.0051769635195295e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.41379310488700866, + "step": 9980 + }, + { + "epoch": 0.010056998049033027, + "grad_norm": 45.65732736647551, + "learning_rate": 1.0056805591926355e-05, + "loss": 2.319, + "mean_token_accuracy": 0.47931033968925474, + "step": 9985 + }, + { + "epoch": 0.0100620341021372, + "grad_norm": 30.152063473395394, + "learning_rate": 1.0061841548657416e-05, + "loss": 2.157, + "mean_token_accuracy": 0.4708409011363983, + "step": 9990 + }, + { + "epoch": 0.010067070155241373, + "grad_norm": 42.11063830354294, + "learning_rate": 1.0066877505388473e-05, + "loss": 2.4317, + "mean_token_accuracy": 0.43629764914512636, + "step": 9995 + }, + { + "epoch": 0.010072106208345546, + "grad_norm": 38.66812569506359, + "learning_rate": 1.0071913462119534e-05, + "loss": 2.1862, + "mean_token_accuracy": 0.47586207985877993, + "step": 10000 + }, + { + "epoch": 0.010077142261449719, + "grad_norm": 46.125095337828306, + "learning_rate": 1.0076949418850593e-05, + "loss": 2.0364, + "mean_token_accuracy": 0.45892317295074464, + "step": 10005 + }, + { + "epoch": 0.01008217831455389, + "grad_norm": 41.57642310009351, + "learning_rate": 1.0081985375581654e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.417241370677948, + "step": 10010 + }, + { + "epoch": 0.010087214367658064, + "grad_norm": 38.99155556204114, + "learning_rate": 1.0087021332312712e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.4482758641242981, + "step": 10015 + }, + { + "epoch": 0.010092250420762236, + "grad_norm": 33.56325731922833, + "learning_rate": 1.0092057289043773e-05, + "loss": 2.4864, + "mean_token_accuracy": 0.42413792908191683, + "step": 10020 + }, + { + "epoch": 0.01009728647386641, + "grad_norm": 33.18879509078343, + "learning_rate": 1.0097093245774834e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4015124022960663, + "step": 10025 + }, + { + "epoch": 0.010102322526970582, + "grad_norm": 34.39943228589022, + "learning_rate": 1.0102129202505893e-05, + "loss": 2.6679, + "mean_token_accuracy": 0.40852994918823243, + "step": 10030 + }, + { + "epoch": 0.010107358580074756, + "grad_norm": 37.828265395747664, + "learning_rate": 1.0107165159236952e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.41379310488700866, + "step": 10035 + }, + { + "epoch": 0.010112394633178928, + "grad_norm": 36.38859441554674, + "learning_rate": 1.0112201115968012e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.4329098641872406, + "step": 10040 + }, + { + "epoch": 0.0101174306862831, + "grad_norm": 31.873835647500325, + "learning_rate": 1.0117237072699072e-05, + "loss": 2.8942, + "mean_token_accuracy": 0.3620689630508423, + "step": 10045 + }, + { + "epoch": 0.010122466739387274, + "grad_norm": 40.698633581325126, + "learning_rate": 1.0122273029430132e-05, + "loss": 2.2452, + "mean_token_accuracy": 0.417241370677948, + "step": 10050 + }, + { + "epoch": 0.010127502792491446, + "grad_norm": 42.06101459085175, + "learning_rate": 1.0127308986161191e-05, + "loss": 2.6023, + "mean_token_accuracy": 0.4137930989265442, + "step": 10055 + }, + { + "epoch": 0.01013253884559562, + "grad_norm": 39.467673678439034, + "learning_rate": 1.013234494289225e-05, + "loss": 2.6424, + "mean_token_accuracy": 0.3965517282485962, + "step": 10060 + }, + { + "epoch": 0.010137574898699792, + "grad_norm": 42.071961100338484, + "learning_rate": 1.0137380899623311e-05, + "loss": 1.959, + "mean_token_accuracy": 0.49458128213882446, + "step": 10065 + }, + { + "epoch": 0.010142610951803965, + "grad_norm": 39.88961383158405, + "learning_rate": 1.014241685635437e-05, + "loss": 2.2529, + "mean_token_accuracy": 0.49655172824859617, + "step": 10070 + }, + { + "epoch": 0.010147647004908137, + "grad_norm": 34.121754964629766, + "learning_rate": 1.014745281308543e-05, + "loss": 2.0943, + "mean_token_accuracy": 0.4620689570903778, + "step": 10075 + }, + { + "epoch": 0.01015268305801231, + "grad_norm": 29.790457740494546, + "learning_rate": 1.015248876981649e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.5000000059604645, + "step": 10080 + }, + { + "epoch": 0.010157719111116483, + "grad_norm": 31.451171586131146, + "learning_rate": 1.015752472654755e-05, + "loss": 2.2829, + "mean_token_accuracy": 0.44137930274009707, + "step": 10085 + }, + { + "epoch": 0.010162755164220655, + "grad_norm": 36.046115622484, + "learning_rate": 1.0162560683278611e-05, + "loss": 2.3188, + "mean_token_accuracy": 0.4310344815254211, + "step": 10090 + }, + { + "epoch": 0.010167791217324829, + "grad_norm": 46.43566816366216, + "learning_rate": 1.0167596640009668e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.39655171930789945, + "step": 10095 + }, + { + "epoch": 0.010172827270429001, + "grad_norm": 31.508864275643315, + "learning_rate": 1.017263259674073e-05, + "loss": 2.4279, + "mean_token_accuracy": 0.44827585816383364, + "step": 10100 + }, + { + "epoch": 0.010177863323533175, + "grad_norm": 53.9199628523731, + "learning_rate": 1.0177668553471789e-05, + "loss": 2.4693, + "mean_token_accuracy": 0.4326073706150055, + "step": 10105 + }, + { + "epoch": 0.010182899376637347, + "grad_norm": 70.96315335275057, + "learning_rate": 1.018270451020285e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.43103448748588563, + "step": 10110 + }, + { + "epoch": 0.010187935429741519, + "grad_norm": 30.03274480049322, + "learning_rate": 1.0187740466933909e-05, + "loss": 2.3719, + "mean_token_accuracy": 0.45862069725990295, + "step": 10115 + }, + { + "epoch": 0.010192971482845693, + "grad_norm": 69.3129552628717, + "learning_rate": 1.0192776423664968e-05, + "loss": 2.3225, + "mean_token_accuracy": 0.4551724135875702, + "step": 10120 + }, + { + "epoch": 0.010198007535949865, + "grad_norm": 41.14229513935154, + "learning_rate": 1.0197812380396029e-05, + "loss": 2.3886, + "mean_token_accuracy": 0.4034482717514038, + "step": 10125 + }, + { + "epoch": 0.010203043589054038, + "grad_norm": 47.73622815749257, + "learning_rate": 1.0202848337127088e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.3655172407627106, + "step": 10130 + }, + { + "epoch": 0.01020807964215821, + "grad_norm": 45.56822357280052, + "learning_rate": 1.0207884293858148e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.42758620977401735, + "step": 10135 + }, + { + "epoch": 0.010213115695262384, + "grad_norm": 37.646686550718584, + "learning_rate": 1.0212920250589207e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.42413793206214906, + "step": 10140 + }, + { + "epoch": 0.010218151748366556, + "grad_norm": 49.39968114323596, + "learning_rate": 1.0217956207320268e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.43793103098869324, + "step": 10145 + }, + { + "epoch": 0.010223187801470728, + "grad_norm": 45.94514004688942, + "learning_rate": 1.0222992164051327e-05, + "loss": 2.2119, + "mean_token_accuracy": 0.4551724135875702, + "step": 10150 + }, + { + "epoch": 0.010228223854574902, + "grad_norm": 37.12571323654598, + "learning_rate": 1.0228028120782386e-05, + "loss": 2.1912, + "mean_token_accuracy": 0.47931033968925474, + "step": 10155 + }, + { + "epoch": 0.010233259907679074, + "grad_norm": 34.323056762948674, + "learning_rate": 1.0233064077513445e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.4068965494632721, + "step": 10160 + }, + { + "epoch": 0.010238295960783248, + "grad_norm": 33.61140332733371, + "learning_rate": 1.0238100034244506e-05, + "loss": 2.4634, + "mean_token_accuracy": 0.403448274731636, + "step": 10165 + }, + { + "epoch": 0.01024333201388742, + "grad_norm": 38.788665882277485, + "learning_rate": 1.0243135990975566e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.38620689511299133, + "step": 10170 + }, + { + "epoch": 0.010248368066991594, + "grad_norm": 25.526287462456242, + "learning_rate": 1.0248171947706625e-05, + "loss": 2.1438, + "mean_token_accuracy": 0.4931034505367279, + "step": 10175 + }, + { + "epoch": 0.010253404120095766, + "grad_norm": 30.326332465652825, + "learning_rate": 1.0253207904437686e-05, + "loss": 2.3796, + "mean_token_accuracy": 0.41724138259887694, + "step": 10180 + }, + { + "epoch": 0.010258440173199938, + "grad_norm": 29.005554764384442, + "learning_rate": 1.0258243861168745e-05, + "loss": 2.545, + "mean_token_accuracy": 0.3724137932062149, + "step": 10185 + }, + { + "epoch": 0.010263476226304111, + "grad_norm": 31.73406266245715, + "learning_rate": 1.0263279817899806e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.39655172228813174, + "step": 10190 + }, + { + "epoch": 0.010268512279408283, + "grad_norm": 34.43022747983628, + "learning_rate": 1.0268315774630864e-05, + "loss": 2.3896, + "mean_token_accuracy": 0.40889292359352114, + "step": 10195 + }, + { + "epoch": 0.010273548332512457, + "grad_norm": 27.99367376866882, + "learning_rate": 1.0273351731361925e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.507758617401123, + "step": 10200 + }, + { + "epoch": 0.01027858438561663, + "grad_norm": 30.967094601001342, + "learning_rate": 1.0278387688092984e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.41379310190677643, + "step": 10205 + }, + { + "epoch": 0.010283620438720803, + "grad_norm": 29.206943835308003, + "learning_rate": 1.0283423644824045e-05, + "loss": 2.462, + "mean_token_accuracy": 0.4206896543502808, + "step": 10210 + }, + { + "epoch": 0.010288656491824975, + "grad_norm": 29.54404023854891, + "learning_rate": 1.0288459601555104e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.458620685338974, + "step": 10215 + }, + { + "epoch": 0.010293692544929147, + "grad_norm": 33.49345790588835, + "learning_rate": 1.0293495558286163e-05, + "loss": 2.152, + "mean_token_accuracy": 0.4675136089324951, + "step": 10220 + }, + { + "epoch": 0.01029872859803332, + "grad_norm": 33.7405598595986, + "learning_rate": 1.0298531515017224e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.4052026689052582, + "step": 10225 + }, + { + "epoch": 0.010303764651137493, + "grad_norm": 36.611624491665, + "learning_rate": 1.0303567471748283e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.41724138259887694, + "step": 10230 + }, + { + "epoch": 0.010308800704241667, + "grad_norm": 37.98533037557905, + "learning_rate": 1.0308603428479344e-05, + "loss": 2.3959, + "mean_token_accuracy": 0.4344827651977539, + "step": 10235 + }, + { + "epoch": 0.010313836757345839, + "grad_norm": 35.07177866820885, + "learning_rate": 1.0313639385210402e-05, + "loss": 2.2599, + "mean_token_accuracy": 0.4517241358757019, + "step": 10240 + }, + { + "epoch": 0.010318872810450012, + "grad_norm": 33.1345513630012, + "learning_rate": 1.0318675341941463e-05, + "loss": 2.3856, + "mean_token_accuracy": 0.45862069725990295, + "step": 10245 + }, + { + "epoch": 0.010323908863554184, + "grad_norm": 40.76178624731318, + "learning_rate": 1.0323711298672522e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.43793103098869324, + "step": 10250 + }, + { + "epoch": 0.010328944916658356, + "grad_norm": 33.80518574326572, + "learning_rate": 1.0328747255403583e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.4482758641242981, + "step": 10255 + }, + { + "epoch": 0.01033398096976253, + "grad_norm": 30.474424754649327, + "learning_rate": 1.033378321213464e-05, + "loss": 2.4128, + "mean_token_accuracy": 0.4172413766384125, + "step": 10260 + }, + { + "epoch": 0.010339017022866702, + "grad_norm": 40.975024126470394, + "learning_rate": 1.0338819168865702e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.4620689690113068, + "step": 10265 + }, + { + "epoch": 0.010344053075970876, + "grad_norm": 37.17312442751016, + "learning_rate": 1.0343855125596761e-05, + "loss": 2.372, + "mean_token_accuracy": 0.4448275864124298, + "step": 10270 + }, + { + "epoch": 0.010349089129075048, + "grad_norm": 32.218541451818474, + "learning_rate": 1.0348891082327822e-05, + "loss": 1.894, + "mean_token_accuracy": 0.510344821214676, + "step": 10275 + }, + { + "epoch": 0.010354125182179222, + "grad_norm": 44.016033789572056, + "learning_rate": 1.0353927039058881e-05, + "loss": 2.5603, + "mean_token_accuracy": 0.4241379380226135, + "step": 10280 + }, + { + "epoch": 0.010359161235283394, + "grad_norm": 32.84931297504163, + "learning_rate": 1.035896299578994e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.3896551728248596, + "step": 10285 + }, + { + "epoch": 0.010364197288387566, + "grad_norm": 60.93161200252047, + "learning_rate": 1.0363998952521001e-05, + "loss": 2.5679, + "mean_token_accuracy": 0.36896551847457887, + "step": 10290 + }, + { + "epoch": 0.01036923334149174, + "grad_norm": 49.15507426802025, + "learning_rate": 1.036903490925206e-05, + "loss": 2.0556, + "mean_token_accuracy": 0.4917725265026093, + "step": 10295 + }, + { + "epoch": 0.010374269394595912, + "grad_norm": 38.44915570473643, + "learning_rate": 1.037407086598312e-05, + "loss": 2.088, + "mean_token_accuracy": 0.4862068951129913, + "step": 10300 + }, + { + "epoch": 0.010379305447700085, + "grad_norm": 44.386929537455174, + "learning_rate": 1.0379106822714179e-05, + "loss": 2.4016, + "mean_token_accuracy": 0.4137930989265442, + "step": 10305 + }, + { + "epoch": 0.010384341500804257, + "grad_norm": 33.66940250326215, + "learning_rate": 1.038414277944524e-05, + "loss": 2.2458, + "mean_token_accuracy": 0.4257108271121979, + "step": 10310 + }, + { + "epoch": 0.010389377553908431, + "grad_norm": 32.54140177408694, + "learning_rate": 1.03891787361763e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.4, + "step": 10315 + }, + { + "epoch": 0.010394413607012603, + "grad_norm": 32.01748553433284, + "learning_rate": 1.0394214692907359e-05, + "loss": 2.553, + "mean_token_accuracy": 0.4000000059604645, + "step": 10320 + }, + { + "epoch": 0.010399449660116775, + "grad_norm": 31.971247633558377, + "learning_rate": 1.039925064963842e-05, + "loss": 2.3284, + "mean_token_accuracy": 0.42758620381355283, + "step": 10325 + }, + { + "epoch": 0.010404485713220949, + "grad_norm": 34.620665921994465, + "learning_rate": 1.0404286606369479e-05, + "loss": 2.159, + "mean_token_accuracy": 0.43793103098869324, + "step": 10330 + }, + { + "epoch": 0.010409521766325121, + "grad_norm": 32.01140298663732, + "learning_rate": 1.040932256310054e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.41379311084747317, + "step": 10335 + }, + { + "epoch": 0.010414557819429295, + "grad_norm": 30.11274705485961, + "learning_rate": 1.0414358519831597e-05, + "loss": 2.687, + "mean_token_accuracy": 0.379310342669487, + "step": 10340 + }, + { + "epoch": 0.010419593872533467, + "grad_norm": 30.338783706914825, + "learning_rate": 1.0419394476562658e-05, + "loss": 2.5365, + "mean_token_accuracy": 0.42068966031074523, + "step": 10345 + }, + { + "epoch": 0.01042462992563764, + "grad_norm": 31.367240557110694, + "learning_rate": 1.0424430433293717e-05, + "loss": 2.1733, + "mean_token_accuracy": 0.46551724076271056, + "step": 10350 + }, + { + "epoch": 0.010429665978741813, + "grad_norm": 29.963685590914608, + "learning_rate": 1.0429466390024778e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.46551724076271056, + "step": 10355 + }, + { + "epoch": 0.010434702031845985, + "grad_norm": 36.380367870754824, + "learning_rate": 1.0434502346755836e-05, + "loss": 2.1971, + "mean_token_accuracy": 0.46896551847457885, + "step": 10360 + }, + { + "epoch": 0.010439738084950159, + "grad_norm": 34.096170672241456, + "learning_rate": 1.0439538303486897e-05, + "loss": 2.1788, + "mean_token_accuracy": 0.4068965494632721, + "step": 10365 + }, + { + "epoch": 0.01044477413805433, + "grad_norm": 42.67944538689265, + "learning_rate": 1.0444574260217956e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.3896551728248596, + "step": 10370 + }, + { + "epoch": 0.010449810191158504, + "grad_norm": 27.381774069460064, + "learning_rate": 1.0449610216949017e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.42068964838981626, + "step": 10375 + }, + { + "epoch": 0.010454846244262676, + "grad_norm": 41.50178999751459, + "learning_rate": 1.0454646173680076e-05, + "loss": 2.6237, + "mean_token_accuracy": 0.4517241418361664, + "step": 10380 + }, + { + "epoch": 0.01045988229736685, + "grad_norm": 38.11152593697709, + "learning_rate": 1.0459682130411136e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.3999999940395355, + "step": 10385 + }, + { + "epoch": 0.010464918350471022, + "grad_norm": 34.56273592883333, + "learning_rate": 1.0464718087142197e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.4517241418361664, + "step": 10390 + }, + { + "epoch": 0.010469954403575194, + "grad_norm": 31.5623634777684, + "learning_rate": 1.0469754043873256e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.4862069010734558, + "step": 10395 + }, + { + "epoch": 0.010474990456679368, + "grad_norm": 36.89789198023529, + "learning_rate": 1.0474790000604315e-05, + "loss": 2.0878, + "mean_token_accuracy": 0.48620688915252686, + "step": 10400 + }, + { + "epoch": 0.01048002650978354, + "grad_norm": 29.092667266763794, + "learning_rate": 1.0479825957335374e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.4517241358757019, + "step": 10405 + }, + { + "epoch": 0.010485062562887714, + "grad_norm": 33.796137606706274, + "learning_rate": 1.0484861914066435e-05, + "loss": 2.3987, + "mean_token_accuracy": 0.4344827651977539, + "step": 10410 + }, + { + "epoch": 0.010490098615991886, + "grad_norm": 41.349286992092715, + "learning_rate": 1.0489897870797494e-05, + "loss": 1.9461, + "mean_token_accuracy": 0.5379310309886932, + "step": 10415 + }, + { + "epoch": 0.01049513466909606, + "grad_norm": 43.83738701693651, + "learning_rate": 1.0494933827528554e-05, + "loss": 2.1715, + "mean_token_accuracy": 0.5, + "step": 10420 + }, + { + "epoch": 0.010500170722200232, + "grad_norm": 48.605883467615634, + "learning_rate": 1.0499969784259615e-05, + "loss": 1.924, + "mean_token_accuracy": 0.4896551609039307, + "step": 10425 + }, + { + "epoch": 0.010505206775304404, + "grad_norm": 43.23936858690715, + "learning_rate": 1.0505005740990674e-05, + "loss": 2.4113, + "mean_token_accuracy": 0.38620689511299133, + "step": 10430 + }, + { + "epoch": 0.010510242828408577, + "grad_norm": 36.5430019894329, + "learning_rate": 1.0510041697721735e-05, + "loss": 2.1961, + "mean_token_accuracy": 0.4517241418361664, + "step": 10435 + }, + { + "epoch": 0.01051527888151275, + "grad_norm": 34.447764293989415, + "learning_rate": 1.0515077654452792e-05, + "loss": 2.362, + "mean_token_accuracy": 0.4448275864124298, + "step": 10440 + }, + { + "epoch": 0.010520314934616923, + "grad_norm": 26.080254247863245, + "learning_rate": 1.0520113611183853e-05, + "loss": 1.9788, + "mean_token_accuracy": 0.5241379320621491, + "step": 10445 + }, + { + "epoch": 0.010525350987721095, + "grad_norm": 36.549197409715205, + "learning_rate": 1.0525149567914913e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.44827585816383364, + "step": 10450 + }, + { + "epoch": 0.010530387040825269, + "grad_norm": 34.923478669428654, + "learning_rate": 1.0530185524645974e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.3896551728248596, + "step": 10455 + }, + { + "epoch": 0.010535423093929441, + "grad_norm": 37.53359655933454, + "learning_rate": 1.0535221481377031e-05, + "loss": 2.6596, + "mean_token_accuracy": 0.39812461733818055, + "step": 10460 + }, + { + "epoch": 0.010540459147033613, + "grad_norm": 36.16227551252876, + "learning_rate": 1.0540257438108092e-05, + "loss": 2.2883, + "mean_token_accuracy": 0.4551724076271057, + "step": 10465 + }, + { + "epoch": 0.010545495200137787, + "grad_norm": 40.87509166642309, + "learning_rate": 1.0545293394839151e-05, + "loss": 2.456, + "mean_token_accuracy": 0.43793103098869324, + "step": 10470 + }, + { + "epoch": 0.010550531253241959, + "grad_norm": 31.590113096171606, + "learning_rate": 1.0550329351570212e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.4517241418361664, + "step": 10475 + }, + { + "epoch": 0.010555567306346133, + "grad_norm": 30.305767696671513, + "learning_rate": 1.0555365308301272e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.41724138259887694, + "step": 10480 + }, + { + "epoch": 0.010560603359450305, + "grad_norm": 32.76867948987744, + "learning_rate": 1.056040126503233e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.42758620977401735, + "step": 10485 + }, + { + "epoch": 0.010565639412554478, + "grad_norm": 40.53187548573647, + "learning_rate": 1.0565437221763392e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.4241379380226135, + "step": 10490 + }, + { + "epoch": 0.01057067546565865, + "grad_norm": 46.384836393572755, + "learning_rate": 1.0570473178494451e-05, + "loss": 2.3364, + "mean_token_accuracy": 0.4379310429096222, + "step": 10495 + }, + { + "epoch": 0.010575711518762822, + "grad_norm": 40.46301443808825, + "learning_rate": 1.057550913522551e-05, + "loss": 2.0937, + "mean_token_accuracy": 0.47241380214691164, + "step": 10500 + }, + { + "epoch": 0.010580747571866996, + "grad_norm": 33.78554610465026, + "learning_rate": 1.058054509195657e-05, + "loss": 2.5259, + "mean_token_accuracy": 0.41034482717514037, + "step": 10505 + }, + { + "epoch": 0.010585783624971168, + "grad_norm": 29.225273971815636, + "learning_rate": 1.058558104868763e-05, + "loss": 2.2701, + "mean_token_accuracy": 0.4448275864124298, + "step": 10510 + }, + { + "epoch": 0.010590819678075342, + "grad_norm": 34.01289709218677, + "learning_rate": 1.059061700541869e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.43103448748588563, + "step": 10515 + }, + { + "epoch": 0.010595855731179514, + "grad_norm": 30.382913515346488, + "learning_rate": 1.0595652962149749e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.4465819835662842, + "step": 10520 + }, + { + "epoch": 0.010600891784283688, + "grad_norm": 33.081721420695494, + "learning_rate": 1.060068891888081e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.458620685338974, + "step": 10525 + }, + { + "epoch": 0.01060592783738786, + "grad_norm": 35.17090716258229, + "learning_rate": 1.0605724875611869e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.3793103456497192, + "step": 10530 + }, + { + "epoch": 0.010610963890492032, + "grad_norm": 32.192309809976365, + "learning_rate": 1.061076083234293e-05, + "loss": 2.5107, + "mean_token_accuracy": 0.38275861740112305, + "step": 10535 + }, + { + "epoch": 0.010615999943596206, + "grad_norm": 31.04615069279317, + "learning_rate": 1.0615796789073988e-05, + "loss": 2.3761, + "mean_token_accuracy": 0.44827587008476255, + "step": 10540 + }, + { + "epoch": 0.010621035996700378, + "grad_norm": 64.55964817708052, + "learning_rate": 1.0620832745805049e-05, + "loss": 2.1871, + "mean_token_accuracy": 0.4689655125141144, + "step": 10545 + }, + { + "epoch": 0.010626072049804551, + "grad_norm": 38.85016308431354, + "learning_rate": 1.0625868702536108e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.41034482717514037, + "step": 10550 + }, + { + "epoch": 0.010631108102908723, + "grad_norm": 26.576059714129702, + "learning_rate": 1.0630904659267169e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.4551724076271057, + "step": 10555 + }, + { + "epoch": 0.010636144156012895, + "grad_norm": 38.01246436180795, + "learning_rate": 1.0635940615998228e-05, + "loss": 2.3082, + "mean_token_accuracy": 0.46061705946922304, + "step": 10560 + }, + { + "epoch": 0.01064118020911707, + "grad_norm": 37.18754122588303, + "learning_rate": 1.0640976572729287e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.42758620977401735, + "step": 10565 + }, + { + "epoch": 0.010646216262221241, + "grad_norm": 42.63476028829352, + "learning_rate": 1.0646012529460348e-05, + "loss": 2.2656, + "mean_token_accuracy": 0.4620689630508423, + "step": 10570 + }, + { + "epoch": 0.010651252315325415, + "grad_norm": 40.975244659439724, + "learning_rate": 1.0651048486191408e-05, + "loss": 2.553, + "mean_token_accuracy": 0.3931034505367279, + "step": 10575 + }, + { + "epoch": 0.010656288368429587, + "grad_norm": 33.23995476948762, + "learning_rate": 1.0656084442922467e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.4517241418361664, + "step": 10580 + }, + { + "epoch": 0.01066132442153376, + "grad_norm": 30.75365679975307, + "learning_rate": 1.0661120399653526e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.4413793087005615, + "step": 10585 + }, + { + "epoch": 0.010666360474637933, + "grad_norm": 27.457811915426404, + "learning_rate": 1.0666156356384587e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.4000000059604645, + "step": 10590 + }, + { + "epoch": 0.010671396527742105, + "grad_norm": 29.16548481748518, + "learning_rate": 1.0671192313115646e-05, + "loss": 2.4316, + "mean_token_accuracy": 0.41379310488700866, + "step": 10595 + }, + { + "epoch": 0.010676432580846279, + "grad_norm": 42.18592461218951, + "learning_rate": 1.0676228269846705e-05, + "loss": 2.5808, + "mean_token_accuracy": 0.45347853302955626, + "step": 10600 + }, + { + "epoch": 0.01068146863395045, + "grad_norm": 27.027727745012783, + "learning_rate": 1.0681264226577765e-05, + "loss": 2.1103, + "mean_token_accuracy": 0.5034482717514038, + "step": 10605 + }, + { + "epoch": 0.010686504687054624, + "grad_norm": 33.77769669905049, + "learning_rate": 1.0686300183308826e-05, + "loss": 2.1884, + "mean_token_accuracy": 0.4482758641242981, + "step": 10610 + }, + { + "epoch": 0.010691540740158796, + "grad_norm": 29.04056765645392, + "learning_rate": 1.0691336140039885e-05, + "loss": 2.5517, + "mean_token_accuracy": 0.4103448212146759, + "step": 10615 + }, + { + "epoch": 0.01069657679326297, + "grad_norm": 31.874408928154775, + "learning_rate": 1.0696372096770944e-05, + "loss": 2.5515, + "mean_token_accuracy": 0.3965517282485962, + "step": 10620 + }, + { + "epoch": 0.010701612846367142, + "grad_norm": 37.52674642679141, + "learning_rate": 1.0701408053502005e-05, + "loss": 2.1416, + "mean_token_accuracy": 0.4275861978530884, + "step": 10625 + }, + { + "epoch": 0.010706648899471314, + "grad_norm": 44.616324128214735, + "learning_rate": 1.0706444010233064e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.4508166968822479, + "step": 10630 + }, + { + "epoch": 0.010711684952575488, + "grad_norm": 30.28047573965209, + "learning_rate": 1.0711479966964125e-05, + "loss": 2.2173, + "mean_token_accuracy": 0.4655172348022461, + "step": 10635 + }, + { + "epoch": 0.01071672100567966, + "grad_norm": 40.30605274727645, + "learning_rate": 1.0716515923695185e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.4137930989265442, + "step": 10640 + }, + { + "epoch": 0.010721757058783834, + "grad_norm": 38.06545042897882, + "learning_rate": 1.0721551880426244e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.38620689511299133, + "step": 10645 + }, + { + "epoch": 0.010726793111888006, + "grad_norm": 33.37743765259418, + "learning_rate": 1.0726587837157303e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.43793103098869324, + "step": 10650 + }, + { + "epoch": 0.01073182916499218, + "grad_norm": 39.03853637308467, + "learning_rate": 1.0731623793888364e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.4206896543502808, + "step": 10655 + }, + { + "epoch": 0.010736865218096352, + "grad_norm": 30.311936754021644, + "learning_rate": 1.0736659750619423e-05, + "loss": 2.2295, + "mean_token_accuracy": 0.42758620381355283, + "step": 10660 + }, + { + "epoch": 0.010741901271200524, + "grad_norm": 30.667808738026263, + "learning_rate": 1.0741695707350483e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.443254691362381, + "step": 10665 + }, + { + "epoch": 0.010746937324304697, + "grad_norm": 37.447558920945205, + "learning_rate": 1.0746731664081543e-05, + "loss": 2.1711, + "mean_token_accuracy": 0.49999999403953554, + "step": 10670 + }, + { + "epoch": 0.01075197337740887, + "grad_norm": 26.372475278157463, + "learning_rate": 1.0751767620812603e-05, + "loss": 2.1953, + "mean_token_accuracy": 0.5068965554237366, + "step": 10675 + }, + { + "epoch": 0.010757009430513043, + "grad_norm": 31.452994819804683, + "learning_rate": 1.0756803577543664e-05, + "loss": 2.2469, + "mean_token_accuracy": 0.4638838529586792, + "step": 10680 + }, + { + "epoch": 0.010762045483617215, + "grad_norm": 45.01702147043336, + "learning_rate": 1.0761839534274721e-05, + "loss": 2.4548, + "mean_token_accuracy": 0.3896551728248596, + "step": 10685 + }, + { + "epoch": 0.010767081536721389, + "grad_norm": 30.509440563135705, + "learning_rate": 1.0766875491005782e-05, + "loss": 2.4362, + "mean_token_accuracy": 0.4068965494632721, + "step": 10690 + }, + { + "epoch": 0.010772117589825561, + "grad_norm": 37.82580881728543, + "learning_rate": 1.0771911447736841e-05, + "loss": 2.4261, + "mean_token_accuracy": 0.4034482717514038, + "step": 10695 + }, + { + "epoch": 0.010777153642929733, + "grad_norm": 30.617201300860028, + "learning_rate": 1.0776947404467902e-05, + "loss": 2.2913, + "mean_token_accuracy": 0.4413793087005615, + "step": 10700 + }, + { + "epoch": 0.010782189696033907, + "grad_norm": 31.574496057490556, + "learning_rate": 1.078198336119896e-05, + "loss": 2.4517, + "mean_token_accuracy": 0.42758620381355283, + "step": 10705 + }, + { + "epoch": 0.010787225749138079, + "grad_norm": 30.274046926342876, + "learning_rate": 1.0787019317930021e-05, + "loss": 2.0461, + "mean_token_accuracy": 0.44827585816383364, + "step": 10710 + }, + { + "epoch": 0.010792261802242253, + "grad_norm": 42.660090113818136, + "learning_rate": 1.079205527466108e-05, + "loss": 2.2115, + "mean_token_accuracy": 0.4630541861057281, + "step": 10715 + }, + { + "epoch": 0.010797297855346425, + "grad_norm": 29.498819295321166, + "learning_rate": 1.0797091231392141e-05, + "loss": 2.2346, + "mean_token_accuracy": 0.4344827592372894, + "step": 10720 + }, + { + "epoch": 0.010802333908450599, + "grad_norm": 33.12173831816271, + "learning_rate": 1.08021271881232e-05, + "loss": 2.533, + "mean_token_accuracy": 0.4172413766384125, + "step": 10725 + }, + { + "epoch": 0.01080736996155477, + "grad_norm": 30.280856042169987, + "learning_rate": 1.080716314485426e-05, + "loss": 2.2034, + "mean_token_accuracy": 0.48275862336158754, + "step": 10730 + }, + { + "epoch": 0.010812406014658943, + "grad_norm": 41.147301396772875, + "learning_rate": 1.081219910158532e-05, + "loss": 2.3396, + "mean_token_accuracy": 0.4482758641242981, + "step": 10735 + }, + { + "epoch": 0.010817442067763116, + "grad_norm": 43.41143222499023, + "learning_rate": 1.081723505831638e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.37586206793785093, + "step": 10740 + }, + { + "epoch": 0.010822478120867288, + "grad_norm": 33.3717507098183, + "learning_rate": 1.0822271015047439e-05, + "loss": 2.4314, + "mean_token_accuracy": 0.4310344815254211, + "step": 10745 + }, + { + "epoch": 0.010827514173971462, + "grad_norm": 31.4728786030362, + "learning_rate": 1.0827306971778498e-05, + "loss": 2.4191, + "mean_token_accuracy": 0.4517241358757019, + "step": 10750 + }, + { + "epoch": 0.010832550227075634, + "grad_norm": 46.142685102008066, + "learning_rate": 1.083234292850956e-05, + "loss": 2.6398, + "mean_token_accuracy": 0.3827586233615875, + "step": 10755 + }, + { + "epoch": 0.010837586280179808, + "grad_norm": 31.440824186910028, + "learning_rate": 1.0837378885240619e-05, + "loss": 2.3034, + "mean_token_accuracy": 0.46896551847457885, + "step": 10760 + }, + { + "epoch": 0.01084262233328398, + "grad_norm": 31.191653278009152, + "learning_rate": 1.0842414841971678e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.4482758641242981, + "step": 10765 + }, + { + "epoch": 0.010847658386388152, + "grad_norm": 34.481908805591246, + "learning_rate": 1.0847450798702739e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.41034482717514037, + "step": 10770 + }, + { + "epoch": 0.010852694439492326, + "grad_norm": 35.177721248481724, + "learning_rate": 1.0852486755433798e-05, + "loss": 2.4809, + "mean_token_accuracy": 0.42413793206214906, + "step": 10775 + }, + { + "epoch": 0.010857730492596498, + "grad_norm": 24.668699965673156, + "learning_rate": 1.0857522712164859e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.4843920111656189, + "step": 10780 + }, + { + "epoch": 0.010862766545700672, + "grad_norm": 38.65989026859781, + "learning_rate": 1.0862558668895916e-05, + "loss": 2.7365, + "mean_token_accuracy": 0.3862068891525269, + "step": 10785 + }, + { + "epoch": 0.010867802598804844, + "grad_norm": 32.6797676511802, + "learning_rate": 1.0867594625626977e-05, + "loss": 2.0282, + "mean_token_accuracy": 0.493103438615799, + "step": 10790 + }, + { + "epoch": 0.010872838651909017, + "grad_norm": 38.27756675451432, + "learning_rate": 1.0872630582358037e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.42068966627120974, + "step": 10795 + }, + { + "epoch": 0.01087787470501319, + "grad_norm": 30.721209632790146, + "learning_rate": 1.0877666539089098e-05, + "loss": 2.3026, + "mean_token_accuracy": 0.4034482777118683, + "step": 10800 + }, + { + "epoch": 0.010882910758117361, + "grad_norm": 28.100437125715874, + "learning_rate": 1.0882702495820155e-05, + "loss": 2.1089, + "mean_token_accuracy": 0.47931034564971925, + "step": 10805 + }, + { + "epoch": 0.010887946811221535, + "grad_norm": 40.72504770972406, + "learning_rate": 1.0887738452551216e-05, + "loss": 2.0119, + "mean_token_accuracy": 0.45960590839385984, + "step": 10810 + }, + { + "epoch": 0.010892982864325707, + "grad_norm": 35.685912974091096, + "learning_rate": 1.0892774409282275e-05, + "loss": 2.4779, + "mean_token_accuracy": 0.4379310369491577, + "step": 10815 + }, + { + "epoch": 0.010898018917429881, + "grad_norm": 29.391599354248623, + "learning_rate": 1.0897810366013336e-05, + "loss": 2.1848, + "mean_token_accuracy": 0.47586206793785096, + "step": 10820 + }, + { + "epoch": 0.010903054970534053, + "grad_norm": 34.55382534828304, + "learning_rate": 1.0902846322744396e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.44676345586776733, + "step": 10825 + }, + { + "epoch": 0.010908091023638227, + "grad_norm": 27.222208406079336, + "learning_rate": 1.0907882279475455e-05, + "loss": 2.114, + "mean_token_accuracy": 0.47931034564971925, + "step": 10830 + }, + { + "epoch": 0.010913127076742399, + "grad_norm": 38.90441483749146, + "learning_rate": 1.0912918236206516e-05, + "loss": 2.1331, + "mean_token_accuracy": 0.4517241358757019, + "step": 10835 + }, + { + "epoch": 0.01091816312984657, + "grad_norm": 27.594837667700784, + "learning_rate": 1.0917954192937575e-05, + "loss": 2.4736, + "mean_token_accuracy": 0.40689656138420105, + "step": 10840 + }, + { + "epoch": 0.010923199182950745, + "grad_norm": 31.031302054703925, + "learning_rate": 1.0922990149668634e-05, + "loss": 2.3521, + "mean_token_accuracy": 0.441379314661026, + "step": 10845 + }, + { + "epoch": 0.010928235236054917, + "grad_norm": 35.17986897043136, + "learning_rate": 1.0928026106399694e-05, + "loss": 2.041, + "mean_token_accuracy": 0.4815486967563629, + "step": 10850 + }, + { + "epoch": 0.01093327128915909, + "grad_norm": 31.86622657961703, + "learning_rate": 1.0933062063130754e-05, + "loss": 1.9663, + "mean_token_accuracy": 0.4862068951129913, + "step": 10855 + }, + { + "epoch": 0.010938307342263262, + "grad_norm": 30.268773930429212, + "learning_rate": 1.0938098019861814e-05, + "loss": 2.5621, + "mean_token_accuracy": 0.4586206912994385, + "step": 10860 + }, + { + "epoch": 0.010943343395367436, + "grad_norm": 45.43403418647319, + "learning_rate": 1.0943133976592873e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.4344827473163605, + "step": 10865 + }, + { + "epoch": 0.010948379448471608, + "grad_norm": 33.582158780226145, + "learning_rate": 1.0948169933323934e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.3876588046550751, + "step": 10870 + }, + { + "epoch": 0.01095341550157578, + "grad_norm": 31.410646610305644, + "learning_rate": 1.0953205890054993e-05, + "loss": 2.0884, + "mean_token_accuracy": 0.48965518474578856, + "step": 10875 + }, + { + "epoch": 0.010958451554679954, + "grad_norm": 32.015748320983846, + "learning_rate": 1.0958241846786054e-05, + "loss": 2.4072, + "mean_token_accuracy": 0.4448275864124298, + "step": 10880 + }, + { + "epoch": 0.010963487607784126, + "grad_norm": 35.290072474386754, + "learning_rate": 1.0963277803517112e-05, + "loss": 2.5729, + "mean_token_accuracy": 0.36551723480224607, + "step": 10885 + }, + { + "epoch": 0.0109685236608883, + "grad_norm": 37.62099214540939, + "learning_rate": 1.0968313760248173e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.48965516686439514, + "step": 10890 + }, + { + "epoch": 0.010973559713992472, + "grad_norm": 39.17360022731531, + "learning_rate": 1.0973349716979232e-05, + "loss": 2.1668, + "mean_token_accuracy": 0.4931034445762634, + "step": 10895 + }, + { + "epoch": 0.010978595767096646, + "grad_norm": 39.20607909256005, + "learning_rate": 1.0978385673710293e-05, + "loss": 2.362, + "mean_token_accuracy": 0.4137930989265442, + "step": 10900 + }, + { + "epoch": 0.010983631820200818, + "grad_norm": 28.8195460410601, + "learning_rate": 1.098342163044135e-05, + "loss": 2.0535, + "mean_token_accuracy": 0.47931034564971925, + "step": 10905 + }, + { + "epoch": 0.01098866787330499, + "grad_norm": 26.96027186118255, + "learning_rate": 1.0988457587172411e-05, + "loss": 2.4447, + "mean_token_accuracy": 0.4310344815254211, + "step": 10910 + }, + { + "epoch": 0.010993703926409163, + "grad_norm": 32.46548692038139, + "learning_rate": 1.099349354390347e-05, + "loss": 2.479, + "mean_token_accuracy": 0.3931034505367279, + "step": 10915 + }, + { + "epoch": 0.010998739979513335, + "grad_norm": 36.583635851619455, + "learning_rate": 1.0998529500634532e-05, + "loss": 2.0519, + "mean_token_accuracy": 0.5073891639709472, + "step": 10920 + }, + { + "epoch": 0.01100377603261751, + "grad_norm": 41.64154573575927, + "learning_rate": 1.100356545736559e-05, + "loss": 2.286, + "mean_token_accuracy": 0.42758620977401735, + "step": 10925 + }, + { + "epoch": 0.011008812085721681, + "grad_norm": 40.41842045859674, + "learning_rate": 1.100860141409665e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.4517241418361664, + "step": 10930 + }, + { + "epoch": 0.011013848138825855, + "grad_norm": 62.12691163358502, + "learning_rate": 1.1013637370827711e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.46551724076271056, + "step": 10935 + }, + { + "epoch": 0.011018884191930027, + "grad_norm": 37.149261798603646, + "learning_rate": 1.101867332755877e-05, + "loss": 2.6742, + "mean_token_accuracy": 0.3862069010734558, + "step": 10940 + }, + { + "epoch": 0.011023920245034199, + "grad_norm": 34.67222367300885, + "learning_rate": 1.102370928428983e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.40689654350280763, + "step": 10945 + }, + { + "epoch": 0.011028956298138373, + "grad_norm": 37.71382518710506, + "learning_rate": 1.1028745241020889e-05, + "loss": 2.623, + "mean_token_accuracy": 0.37586206793785093, + "step": 10950 + }, + { + "epoch": 0.011033992351242545, + "grad_norm": 32.413317647004746, + "learning_rate": 1.103378119775195e-05, + "loss": 2.4341, + "mean_token_accuracy": 0.38620689511299133, + "step": 10955 + }, + { + "epoch": 0.011039028404346719, + "grad_norm": 35.67497569497302, + "learning_rate": 1.1038817154483009e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.4517241358757019, + "step": 10960 + }, + { + "epoch": 0.01104406445745089, + "grad_norm": 30.863900522813342, + "learning_rate": 1.1043853111214068e-05, + "loss": 2.6417, + "mean_token_accuracy": 0.35862069129943847, + "step": 10965 + }, + { + "epoch": 0.011049100510555064, + "grad_norm": 35.02413144173615, + "learning_rate": 1.1048889067945129e-05, + "loss": 2.7192, + "mean_token_accuracy": 0.3482758641242981, + "step": 10970 + }, + { + "epoch": 0.011054136563659236, + "grad_norm": 36.63426614943195, + "learning_rate": 1.1053925024676188e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.43278887271881106, + "step": 10975 + }, + { + "epoch": 0.011059172616763409, + "grad_norm": 30.269112291284028, + "learning_rate": 1.105896098140725e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.510344821214676, + "step": 10980 + }, + { + "epoch": 0.011064208669867582, + "grad_norm": 42.663104483967274, + "learning_rate": 1.1063996938138307e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.41724138259887694, + "step": 10985 + }, + { + "epoch": 0.011069244722971754, + "grad_norm": 27.953517198650847, + "learning_rate": 1.1069032894869368e-05, + "loss": 3.0749, + "mean_token_accuracy": 0.3655172407627106, + "step": 10990 + }, + { + "epoch": 0.011074280776075928, + "grad_norm": 30.661192056723667, + "learning_rate": 1.1074068851600427e-05, + "loss": 2.1688, + "mean_token_accuracy": 0.46733213067054746, + "step": 10995 + }, + { + "epoch": 0.0110793168291801, + "grad_norm": 29.223269218994012, + "learning_rate": 1.1079104808331488e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.40689654350280763, + "step": 11000 + }, + { + "epoch": 0.011084352882284274, + "grad_norm": 43.18024493372328, + "learning_rate": 1.1084140765062546e-05, + "loss": 2.1825, + "mean_token_accuracy": 0.4620689690113068, + "step": 11005 + }, + { + "epoch": 0.011089388935388446, + "grad_norm": 38.806657948380135, + "learning_rate": 1.1089176721793607e-05, + "loss": 2.3115, + "mean_token_accuracy": 0.441379314661026, + "step": 11010 + }, + { + "epoch": 0.011094424988492618, + "grad_norm": 29.00955146269285, + "learning_rate": 1.1094212678524668e-05, + "loss": 2.5242, + "mean_token_accuracy": 0.3827586114406586, + "step": 11015 + }, + { + "epoch": 0.011099461041596792, + "grad_norm": 30.07485564419664, + "learning_rate": 1.1099248635255727e-05, + "loss": 2.8016, + "mean_token_accuracy": 0.4000000059604645, + "step": 11020 + }, + { + "epoch": 0.011104497094700964, + "grad_norm": 28.114295267933155, + "learning_rate": 1.1104284591986786e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4411373257637024, + "step": 11025 + }, + { + "epoch": 0.011109533147805137, + "grad_norm": 26.67568879535564, + "learning_rate": 1.1109320548717845e-05, + "loss": 2.1925, + "mean_token_accuracy": 0.4628554105758667, + "step": 11030 + }, + { + "epoch": 0.01111456920090931, + "grad_norm": 34.98690467561199, + "learning_rate": 1.1114356505448906e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.4229280173778534, + "step": 11035 + }, + { + "epoch": 0.011119605254013483, + "grad_norm": 29.837911136343347, + "learning_rate": 1.1119392462179965e-05, + "loss": 2.2937, + "mean_token_accuracy": 0.42758620381355283, + "step": 11040 + }, + { + "epoch": 0.011124641307117655, + "grad_norm": 31.885912913079434, + "learning_rate": 1.1124428418911026e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.4365396201610565, + "step": 11045 + }, + { + "epoch": 0.011129677360221827, + "grad_norm": 33.496562829065844, + "learning_rate": 1.1129464375642084e-05, + "loss": 2.2347, + "mean_token_accuracy": 0.46551724076271056, + "step": 11050 + }, + { + "epoch": 0.011134713413326001, + "grad_norm": 33.07013525649395, + "learning_rate": 1.1134500332373145e-05, + "loss": 2.4862, + "mean_token_accuracy": 0.37931033968925476, + "step": 11055 + }, + { + "epoch": 0.011139749466430173, + "grad_norm": 37.02871091344708, + "learning_rate": 1.1139536289104204e-05, + "loss": 2.1365, + "mean_token_accuracy": 0.48850575685501096, + "step": 11060 + }, + { + "epoch": 0.011144785519534347, + "grad_norm": 38.73879710788627, + "learning_rate": 1.1144572245835265e-05, + "loss": 2.4776, + "mean_token_accuracy": 0.4137930989265442, + "step": 11065 + }, + { + "epoch": 0.011149821572638519, + "grad_norm": 28.330673424128214, + "learning_rate": 1.1149608202566324e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.4724137902259827, + "step": 11070 + }, + { + "epoch": 0.011154857625742693, + "grad_norm": 37.837426934207514, + "learning_rate": 1.1154644159297384e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.46896551847457885, + "step": 11075 + }, + { + "epoch": 0.011159893678846865, + "grad_norm": 28.955344729503338, + "learning_rate": 1.1159680116028445e-05, + "loss": 2.2519, + "mean_token_accuracy": 0.4655172348022461, + "step": 11080 + }, + { + "epoch": 0.011164929731951037, + "grad_norm": 32.96526797175497, + "learning_rate": 1.1164716072759504e-05, + "loss": 2.1388, + "mean_token_accuracy": 0.46551724672317507, + "step": 11085 + }, + { + "epoch": 0.01116996578505521, + "grad_norm": 27.156821986326424, + "learning_rate": 1.1169752029490563e-05, + "loss": 2.2883, + "mean_token_accuracy": 0.458620685338974, + "step": 11090 + }, + { + "epoch": 0.011175001838159383, + "grad_norm": 32.685599485769295, + "learning_rate": 1.1174787986221622e-05, + "loss": 2.346, + "mean_token_accuracy": 0.44646098017692565, + "step": 11095 + }, + { + "epoch": 0.011180037891263556, + "grad_norm": 24.513999428761405, + "learning_rate": 1.1179823942952683e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.4655172348022461, + "step": 11100 + }, + { + "epoch": 0.011185073944367728, + "grad_norm": 21.7578777902776, + "learning_rate": 1.1184859899683743e-05, + "loss": 2.3718, + "mean_token_accuracy": 0.4379310429096222, + "step": 11105 + }, + { + "epoch": 0.011190109997471902, + "grad_norm": 32.63089585725225, + "learning_rate": 1.1189895856414802e-05, + "loss": 2.4199, + "mean_token_accuracy": 0.41379310488700866, + "step": 11110 + }, + { + "epoch": 0.011195146050576074, + "grad_norm": 46.00322654408078, + "learning_rate": 1.1194931813145863e-05, + "loss": 2.6226, + "mean_token_accuracy": 0.4206896543502808, + "step": 11115 + }, + { + "epoch": 0.011200182103680246, + "grad_norm": 35.00582142668977, + "learning_rate": 1.1199967769876922e-05, + "loss": 2.3573, + "mean_token_accuracy": 0.4551724135875702, + "step": 11120 + }, + { + "epoch": 0.01120521815678442, + "grad_norm": 28.746498102901292, + "learning_rate": 1.1205003726607983e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.4068965494632721, + "step": 11125 + }, + { + "epoch": 0.011210254209888592, + "grad_norm": 34.09145114869498, + "learning_rate": 1.121003968333904e-05, + "loss": 2.4323, + "mean_token_accuracy": 0.441379314661026, + "step": 11130 + }, + { + "epoch": 0.011215290262992766, + "grad_norm": 25.872618432889894, + "learning_rate": 1.1215075640070101e-05, + "loss": 2.2974, + "mean_token_accuracy": 0.41379311084747317, + "step": 11135 + }, + { + "epoch": 0.011220326316096938, + "grad_norm": 28.214100238942155, + "learning_rate": 1.122011159680116e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.39655172526836396, + "step": 11140 + }, + { + "epoch": 0.011225362369201112, + "grad_norm": 35.07036940965686, + "learning_rate": 1.1225147553532222e-05, + "loss": 2.4969, + "mean_token_accuracy": 0.41724138259887694, + "step": 11145 + }, + { + "epoch": 0.011230398422305284, + "grad_norm": 47.16272793714296, + "learning_rate": 1.123018351026328e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.4034482717514038, + "step": 11150 + }, + { + "epoch": 0.011235434475409456, + "grad_norm": 33.1641828555765, + "learning_rate": 1.123521946699434e-05, + "loss": 2.1111, + "mean_token_accuracy": 0.44827585518360136, + "step": 11155 + }, + { + "epoch": 0.01124047052851363, + "grad_norm": 45.461005017927846, + "learning_rate": 1.12402554237254e-05, + "loss": 2.2241, + "mean_token_accuracy": 0.417241370677948, + "step": 11160 + }, + { + "epoch": 0.011245506581617801, + "grad_norm": 32.834775475881635, + "learning_rate": 1.124529138045646e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.42413792610168455, + "step": 11165 + }, + { + "epoch": 0.011250542634721975, + "grad_norm": 30.542638051166918, + "learning_rate": 1.125032733718752e-05, + "loss": 2.2336, + "mean_token_accuracy": 0.4517241418361664, + "step": 11170 + }, + { + "epoch": 0.011255578687826147, + "grad_norm": 37.97004941684028, + "learning_rate": 1.1255363293918579e-05, + "loss": 2.4737, + "mean_token_accuracy": 0.42068964838981626, + "step": 11175 + }, + { + "epoch": 0.011260614740930321, + "grad_norm": 25.21621941964731, + "learning_rate": 1.126039925064964e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.42413792610168455, + "step": 11180 + }, + { + "epoch": 0.011265650794034493, + "grad_norm": 28.696916748890857, + "learning_rate": 1.1265435207380699e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.41724138259887694, + "step": 11185 + }, + { + "epoch": 0.011270686847138665, + "grad_norm": 22.949255114064812, + "learning_rate": 1.1270471164111758e-05, + "loss": 2.2295, + "mean_token_accuracy": 0.48275862336158754, + "step": 11190 + }, + { + "epoch": 0.011275722900242839, + "grad_norm": 34.225415844031936, + "learning_rate": 1.1275507120842818e-05, + "loss": 2.1593, + "mean_token_accuracy": 0.4793103516101837, + "step": 11195 + }, + { + "epoch": 0.01128075895334701, + "grad_norm": 35.40808704076608, + "learning_rate": 1.1280543077573878e-05, + "loss": 2.326, + "mean_token_accuracy": 0.42068964838981626, + "step": 11200 + }, + { + "epoch": 0.011285795006451185, + "grad_norm": 40.77515201218449, + "learning_rate": 1.1285579034304938e-05, + "loss": 2.2034, + "mean_token_accuracy": 0.46896552443504336, + "step": 11205 + }, + { + "epoch": 0.011290831059555357, + "grad_norm": 34.7462626263935, + "learning_rate": 1.1290614991035997e-05, + "loss": 2.6503, + "mean_token_accuracy": 0.3517241418361664, + "step": 11210 + }, + { + "epoch": 0.01129586711265953, + "grad_norm": 32.09408511157453, + "learning_rate": 1.1295650947767058e-05, + "loss": 2.2219, + "mean_token_accuracy": 0.43103448748588563, + "step": 11215 + }, + { + "epoch": 0.011300903165763702, + "grad_norm": 28.95363557734509, + "learning_rate": 1.1300686904498117e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.3896551728248596, + "step": 11220 + }, + { + "epoch": 0.011305939218867874, + "grad_norm": 30.60510430538887, + "learning_rate": 1.1305722861229178e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.4034482777118683, + "step": 11225 + }, + { + "epoch": 0.011310975271972048, + "grad_norm": 31.873080131196023, + "learning_rate": 1.1310758817960236e-05, + "loss": 2.323, + "mean_token_accuracy": 0.43448275327682495, + "step": 11230 + }, + { + "epoch": 0.01131601132507622, + "grad_norm": 28.18307099489073, + "learning_rate": 1.1315794774691297e-05, + "loss": 2.6619, + "mean_token_accuracy": 0.4206896543502808, + "step": 11235 + }, + { + "epoch": 0.011321047378180394, + "grad_norm": 32.97547241319515, + "learning_rate": 1.1320830731422356e-05, + "loss": 2.2373, + "mean_token_accuracy": 0.4310344815254211, + "step": 11240 + }, + { + "epoch": 0.011326083431284566, + "grad_norm": 50.067571260052695, + "learning_rate": 1.1325866688153417e-05, + "loss": 2.6148, + "mean_token_accuracy": 0.35862069129943847, + "step": 11245 + }, + { + "epoch": 0.01133111948438874, + "grad_norm": 32.835463579930064, + "learning_rate": 1.1330902644884474e-05, + "loss": 2.0574, + "mean_token_accuracy": 0.4931034505367279, + "step": 11250 + }, + { + "epoch": 0.011336155537492912, + "grad_norm": 29.09041071054721, + "learning_rate": 1.1335938601615535e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.5102238357067108, + "step": 11255 + }, + { + "epoch": 0.011341191590597084, + "grad_norm": 36.44403016123223, + "learning_rate": 1.1340974558346595e-05, + "loss": 2.338, + "mean_token_accuracy": 0.4310344815254211, + "step": 11260 + }, + { + "epoch": 0.011346227643701258, + "grad_norm": 30.47387174292945, + "learning_rate": 1.1346010515077656e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.43103448748588563, + "step": 11265 + }, + { + "epoch": 0.01135126369680543, + "grad_norm": 43.57486939529165, + "learning_rate": 1.1351046471808715e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.4620689630508423, + "step": 11270 + }, + { + "epoch": 0.011356299749909603, + "grad_norm": 30.80385732606569, + "learning_rate": 1.1356082428539774e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.4517241418361664, + "step": 11275 + }, + { + "epoch": 0.011361335803013775, + "grad_norm": 50.56760163880261, + "learning_rate": 1.1361118385270835e-05, + "loss": 2.2775, + "mean_token_accuracy": 0.4310344815254211, + "step": 11280 + }, + { + "epoch": 0.01136637185611795, + "grad_norm": 31.833970951701428, + "learning_rate": 1.1366154342001894e-05, + "loss": 2.1973, + "mean_token_accuracy": 0.5034482777118683, + "step": 11285 + }, + { + "epoch": 0.011371407909222121, + "grad_norm": 31.02810684886034, + "learning_rate": 1.1371190298732954e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.458620685338974, + "step": 11290 + }, + { + "epoch": 0.011376443962326293, + "grad_norm": 35.70808072877237, + "learning_rate": 1.1376226255464013e-05, + "loss": 2.2424, + "mean_token_accuracy": 0.4551724135875702, + "step": 11295 + }, + { + "epoch": 0.011381480015430467, + "grad_norm": 38.79489333374522, + "learning_rate": 1.1381262212195074e-05, + "loss": 2.258, + "mean_token_accuracy": 0.47241378426551817, + "step": 11300 + }, + { + "epoch": 0.011386516068534639, + "grad_norm": 31.24392550176355, + "learning_rate": 1.1386298168926133e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.3793103456497192, + "step": 11305 + }, + { + "epoch": 0.011391552121638813, + "grad_norm": 31.509881235195156, + "learning_rate": 1.1391334125657192e-05, + "loss": 2.1173, + "mean_token_accuracy": 0.4938423693180084, + "step": 11310 + }, + { + "epoch": 0.011396588174742985, + "grad_norm": 31.499717094399063, + "learning_rate": 1.1396370082388253e-05, + "loss": 2.0832, + "mean_token_accuracy": 0.4862068951129913, + "step": 11315 + }, + { + "epoch": 0.011401624227847159, + "grad_norm": 24.264805572099565, + "learning_rate": 1.1401406039119312e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.46551724076271056, + "step": 11320 + }, + { + "epoch": 0.01140666028095133, + "grad_norm": 27.005003970828003, + "learning_rate": 1.1406441995850373e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.37586207389831544, + "step": 11325 + }, + { + "epoch": 0.011411696334055503, + "grad_norm": 28.823876961086924, + "learning_rate": 1.1411477952581431e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4379310429096222, + "step": 11330 + }, + { + "epoch": 0.011416732387159676, + "grad_norm": 30.45345939695766, + "learning_rate": 1.1416513909312492e-05, + "loss": 2.0434, + "mean_token_accuracy": 0.47586206793785096, + "step": 11335 + }, + { + "epoch": 0.011421768440263848, + "grad_norm": 33.73712091994839, + "learning_rate": 1.1421549866043551e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4620689630508423, + "step": 11340 + }, + { + "epoch": 0.011426804493368022, + "grad_norm": 36.010374141068525, + "learning_rate": 1.1426585822774612e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.44482758045196535, + "step": 11345 + }, + { + "epoch": 0.011431840546472194, + "grad_norm": 29.630348630082686, + "learning_rate": 1.143162177950567e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.4689655125141144, + "step": 11350 + }, + { + "epoch": 0.011436876599576368, + "grad_norm": 29.800977235316534, + "learning_rate": 1.143665773623673e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.4655172348022461, + "step": 11355 + }, + { + "epoch": 0.01144191265268054, + "grad_norm": 27.859238389939275, + "learning_rate": 1.144169369296779e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.48965516686439514, + "step": 11360 + }, + { + "epoch": 0.011446948705784712, + "grad_norm": 35.39450583241857, + "learning_rate": 1.144672964969885e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.41379310488700866, + "step": 11365 + }, + { + "epoch": 0.011451984758888886, + "grad_norm": 33.915949128640754, + "learning_rate": 1.145176560642991e-05, + "loss": 2.217, + "mean_token_accuracy": 0.4275861978530884, + "step": 11370 + }, + { + "epoch": 0.011457020811993058, + "grad_norm": 48.20686091215249, + "learning_rate": 1.145680156316097e-05, + "loss": 2.1881, + "mean_token_accuracy": 0.41724138259887694, + "step": 11375 + }, + { + "epoch": 0.011462056865097232, + "grad_norm": 38.3299278932728, + "learning_rate": 1.146183751989203e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.43103447556495667, + "step": 11380 + }, + { + "epoch": 0.011467092918201404, + "grad_norm": 32.7158403231524, + "learning_rate": 1.146687347662309e-05, + "loss": 2.3428, + "mean_token_accuracy": 0.4379310369491577, + "step": 11385 + }, + { + "epoch": 0.011472128971305577, + "grad_norm": 33.26722294101563, + "learning_rate": 1.1471909433354149e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.4068965494632721, + "step": 11390 + }, + { + "epoch": 0.01147716502440975, + "grad_norm": 25.846843769642586, + "learning_rate": 1.1476945390085208e-05, + "loss": 2.3896, + "mean_token_accuracy": 0.42758620977401735, + "step": 11395 + }, + { + "epoch": 0.011482201077513922, + "grad_norm": 35.05438406558011, + "learning_rate": 1.1481981346816269e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.36896551847457887, + "step": 11400 + }, + { + "epoch": 0.011487237130618095, + "grad_norm": 29.293321031606666, + "learning_rate": 1.1487017303547328e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.37586206793785093, + "step": 11405 + }, + { + "epoch": 0.011492273183722267, + "grad_norm": 28.752858193786885, + "learning_rate": 1.1492053260278387e-05, + "loss": 2.3525, + "mean_token_accuracy": 0.44482758045196535, + "step": 11410 + }, + { + "epoch": 0.011497309236826441, + "grad_norm": 29.837383742888036, + "learning_rate": 1.1497089217009448e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.41034482717514037, + "step": 11415 + }, + { + "epoch": 0.011502345289930613, + "grad_norm": 36.076392292081245, + "learning_rate": 1.1502125173740508e-05, + "loss": 2.2143, + "mean_token_accuracy": 0.4310344815254211, + "step": 11420 + }, + { + "epoch": 0.011507381343034787, + "grad_norm": 30.270360452784974, + "learning_rate": 1.1507161130471569e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.46551724076271056, + "step": 11425 + }, + { + "epoch": 0.011512417396138959, + "grad_norm": 28.41563831409088, + "learning_rate": 1.1512197087202626e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.42413792610168455, + "step": 11430 + }, + { + "epoch": 0.011517453449243131, + "grad_norm": 34.58551614517551, + "learning_rate": 1.1517233043933687e-05, + "loss": 2.4192, + "mean_token_accuracy": 0.4068965494632721, + "step": 11435 + }, + { + "epoch": 0.011522489502347305, + "grad_norm": 29.814524399114482, + "learning_rate": 1.1522269000664746e-05, + "loss": 2.6102, + "mean_token_accuracy": 0.3896551728248596, + "step": 11440 + }, + { + "epoch": 0.011527525555451477, + "grad_norm": 33.218503540841375, + "learning_rate": 1.1527304957395807e-05, + "loss": 2.3605, + "mean_token_accuracy": 0.4310344815254211, + "step": 11445 + }, + { + "epoch": 0.01153256160855565, + "grad_norm": 26.991593312418317, + "learning_rate": 1.1532340914126867e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.4068965554237366, + "step": 11450 + }, + { + "epoch": 0.011537597661659823, + "grad_norm": 31.969388025806072, + "learning_rate": 1.1537376870857926e-05, + "loss": 2.5483, + "mean_token_accuracy": 0.41379310488700866, + "step": 11455 + }, + { + "epoch": 0.011542633714763995, + "grad_norm": 26.521594588707206, + "learning_rate": 1.1542412827588985e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.4562807857990265, + "step": 11460 + }, + { + "epoch": 0.011547669767868168, + "grad_norm": 38.75884281078359, + "learning_rate": 1.1547448784320046e-05, + "loss": 2.2722, + "mean_token_accuracy": 0.4413793087005615, + "step": 11465 + }, + { + "epoch": 0.01155270582097234, + "grad_norm": 26.2340559642593, + "learning_rate": 1.1552484741051107e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.44827585816383364, + "step": 11470 + }, + { + "epoch": 0.011557741874076514, + "grad_norm": 34.17758089850744, + "learning_rate": 1.1557520697782165e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.44827585816383364, + "step": 11475 + }, + { + "epoch": 0.011562777927180686, + "grad_norm": 38.78050550743558, + "learning_rate": 1.1562556654513225e-05, + "loss": 2.2958, + "mean_token_accuracy": 0.4620689570903778, + "step": 11480 + }, + { + "epoch": 0.01156781398028486, + "grad_norm": 39.9482929360104, + "learning_rate": 1.1567592611244285e-05, + "loss": 2.2652, + "mean_token_accuracy": 0.441379314661026, + "step": 11485 + }, + { + "epoch": 0.011572850033389032, + "grad_norm": 34.383272971167514, + "learning_rate": 1.1572628567975346e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.39655172228813174, + "step": 11490 + }, + { + "epoch": 0.011577886086493204, + "grad_norm": 35.92989023232943, + "learning_rate": 1.1577664524706403e-05, + "loss": 2.1399, + "mean_token_accuracy": 0.4862068951129913, + "step": 11495 + }, + { + "epoch": 0.011582922139597378, + "grad_norm": 35.44359713743167, + "learning_rate": 1.1582700481437464e-05, + "loss": 2.5389, + "mean_token_accuracy": 0.4344827473163605, + "step": 11500 + }, + { + "epoch": 0.01158795819270155, + "grad_norm": 30.112689003701576, + "learning_rate": 1.1587736438168523e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.4620689690113068, + "step": 11505 + }, + { + "epoch": 0.011592994245805724, + "grad_norm": 39.33231840187201, + "learning_rate": 1.1592772394899584e-05, + "loss": 2.3532, + "mean_token_accuracy": 0.43448275327682495, + "step": 11510 + }, + { + "epoch": 0.011598030298909896, + "grad_norm": 33.836971216147816, + "learning_rate": 1.1597808351630644e-05, + "loss": 2.2874, + "mean_token_accuracy": 0.4172413766384125, + "step": 11515 + }, + { + "epoch": 0.01160306635201407, + "grad_norm": 31.48522080302191, + "learning_rate": 1.1602844308361703e-05, + "loss": 2.0137, + "mean_token_accuracy": 0.47586206793785096, + "step": 11520 + }, + { + "epoch": 0.011608102405118241, + "grad_norm": 37.354047830631124, + "learning_rate": 1.1607880265092764e-05, + "loss": 2.3898, + "mean_token_accuracy": 0.480762255191803, + "step": 11525 + }, + { + "epoch": 0.011613138458222413, + "grad_norm": 27.73356906619096, + "learning_rate": 1.1612916221823823e-05, + "loss": 2.2007, + "mean_token_accuracy": 0.4517241358757019, + "step": 11530 + }, + { + "epoch": 0.011618174511326587, + "grad_norm": 35.47297794558834, + "learning_rate": 1.1617952178554882e-05, + "loss": 2.1781, + "mean_token_accuracy": 0.47586207985877993, + "step": 11535 + }, + { + "epoch": 0.01162321056443076, + "grad_norm": 42.539437963521095, + "learning_rate": 1.1622988135285942e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.39310343861579894, + "step": 11540 + }, + { + "epoch": 0.011628246617534933, + "grad_norm": 36.40070439042412, + "learning_rate": 1.1628024092017003e-05, + "loss": 2.5816, + "mean_token_accuracy": 0.4206896543502808, + "step": 11545 + }, + { + "epoch": 0.011633282670639105, + "grad_norm": 33.16120407285037, + "learning_rate": 1.1633060048748062e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.4068965494632721, + "step": 11550 + }, + { + "epoch": 0.011638318723743279, + "grad_norm": 29.484771041421798, + "learning_rate": 1.1638096005479121e-05, + "loss": 2.1433, + "mean_token_accuracy": 0.4931034505367279, + "step": 11555 + }, + { + "epoch": 0.01164335477684745, + "grad_norm": 36.40939325677855, + "learning_rate": 1.1643131962210182e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.41379310488700866, + "step": 11560 + }, + { + "epoch": 0.011648390829951623, + "grad_norm": 27.946950946732393, + "learning_rate": 1.1648167918941241e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.47586206793785096, + "step": 11565 + }, + { + "epoch": 0.011653426883055797, + "grad_norm": 38.44211173044791, + "learning_rate": 1.1653203875672302e-05, + "loss": 2.2147, + "mean_token_accuracy": 0.45862067937850953, + "step": 11570 + }, + { + "epoch": 0.011658462936159969, + "grad_norm": 41.09401139542813, + "learning_rate": 1.165823983240336e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4517241358757019, + "step": 11575 + }, + { + "epoch": 0.011663498989264142, + "grad_norm": 36.09791853494381, + "learning_rate": 1.166327578913442e-05, + "loss": 2.3712, + "mean_token_accuracy": 0.3793103456497192, + "step": 11580 + }, + { + "epoch": 0.011668535042368314, + "grad_norm": 28.79494635873865, + "learning_rate": 1.166831174586548e-05, + "loss": 2.2485, + "mean_token_accuracy": 0.4517241358757019, + "step": 11585 + }, + { + "epoch": 0.011673571095472488, + "grad_norm": 28.460158171464016, + "learning_rate": 1.1673347702596541e-05, + "loss": 2.2697, + "mean_token_accuracy": 0.47931034564971925, + "step": 11590 + }, + { + "epoch": 0.01167860714857666, + "grad_norm": 31.95727198297756, + "learning_rate": 1.1678383659327598e-05, + "loss": 2.2268, + "mean_token_accuracy": 0.47241379618644713, + "step": 11595 + }, + { + "epoch": 0.011683643201680832, + "grad_norm": 39.39864172166567, + "learning_rate": 1.168341961605866e-05, + "loss": 2.6456, + "mean_token_accuracy": 0.3599515974521637, + "step": 11600 + }, + { + "epoch": 0.011688679254785006, + "grad_norm": 33.92420170376145, + "learning_rate": 1.1688455572789719e-05, + "loss": 2.5457, + "mean_token_accuracy": 0.3931034475564957, + "step": 11605 + }, + { + "epoch": 0.011693715307889178, + "grad_norm": 35.31032262005259, + "learning_rate": 1.169349152952078e-05, + "loss": 2.7874, + "mean_token_accuracy": 0.3827586233615875, + "step": 11610 + }, + { + "epoch": 0.011698751360993352, + "grad_norm": 36.408309424903294, + "learning_rate": 1.1698527486251839e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.43103447556495667, + "step": 11615 + }, + { + "epoch": 0.011703787414097524, + "grad_norm": 26.883386887357823, + "learning_rate": 1.1703563442982898e-05, + "loss": 2.2353, + "mean_token_accuracy": 0.505807626247406, + "step": 11620 + }, + { + "epoch": 0.011708823467201698, + "grad_norm": 30.3866929815451, + "learning_rate": 1.1708599399713959e-05, + "loss": 2.3994, + "mean_token_accuracy": 0.4137930989265442, + "step": 11625 + }, + { + "epoch": 0.01171385952030587, + "grad_norm": 27.47648489661462, + "learning_rate": 1.1713635356445018e-05, + "loss": 2.1753, + "mean_token_accuracy": 0.4758620738983154, + "step": 11630 + }, + { + "epoch": 0.011718895573410042, + "grad_norm": 28.30304310887975, + "learning_rate": 1.1718671313176078e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.46551724076271056, + "step": 11635 + }, + { + "epoch": 0.011723931626514215, + "grad_norm": 31.20339825793529, + "learning_rate": 1.1723707269907137e-05, + "loss": 2.1507, + "mean_token_accuracy": 0.4586206912994385, + "step": 11640 + }, + { + "epoch": 0.011728967679618387, + "grad_norm": 26.769937723201615, + "learning_rate": 1.1728743226638198e-05, + "loss": 2.1793, + "mean_token_accuracy": 0.4905172407627106, + "step": 11645 + }, + { + "epoch": 0.011734003732722561, + "grad_norm": 47.6107599957727, + "learning_rate": 1.1733779183369257e-05, + "loss": 2.7639, + "mean_token_accuracy": 0.3655172407627106, + "step": 11650 + }, + { + "epoch": 0.011739039785826733, + "grad_norm": 31.107877045445967, + "learning_rate": 1.1738815140100316e-05, + "loss": 2.231, + "mean_token_accuracy": 0.43974592089653014, + "step": 11655 + }, + { + "epoch": 0.011744075838930907, + "grad_norm": 28.79640354428086, + "learning_rate": 1.1743851096831377e-05, + "loss": 2.2575, + "mean_token_accuracy": 0.4379310369491577, + "step": 11660 + }, + { + "epoch": 0.011749111892035079, + "grad_norm": 23.592700094394363, + "learning_rate": 1.1748887053562436e-05, + "loss": 1.9979, + "mean_token_accuracy": 0.5034482657909394, + "step": 11665 + }, + { + "epoch": 0.011754147945139251, + "grad_norm": 30.827337374231064, + "learning_rate": 1.1753923010293497e-05, + "loss": 2.2609, + "mean_token_accuracy": 0.4156079888343811, + "step": 11670 + }, + { + "epoch": 0.011759183998243425, + "grad_norm": 57.831809437527056, + "learning_rate": 1.1758958967024555e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.4275861978530884, + "step": 11675 + }, + { + "epoch": 0.011764220051347597, + "grad_norm": 38.3851403276558, + "learning_rate": 1.1763994923755616e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.4068965494632721, + "step": 11680 + }, + { + "epoch": 0.01176925610445177, + "grad_norm": 33.92180640453206, + "learning_rate": 1.1769030880486675e-05, + "loss": 2.0088, + "mean_token_accuracy": 0.5310344815254211, + "step": 11685 + }, + { + "epoch": 0.011774292157555943, + "grad_norm": 34.4335348158334, + "learning_rate": 1.1774066837217736e-05, + "loss": 2.3242, + "mean_token_accuracy": 0.4448275864124298, + "step": 11690 + }, + { + "epoch": 0.011779328210660116, + "grad_norm": 37.86839475338455, + "learning_rate": 1.1779102793948794e-05, + "loss": 2.3915, + "mean_token_accuracy": 0.45747126936912536, + "step": 11695 + }, + { + "epoch": 0.011784364263764288, + "grad_norm": 29.533663641715155, + "learning_rate": 1.1784138750679855e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.47096188068389894, + "step": 11700 + }, + { + "epoch": 0.01178940031686846, + "grad_norm": 29.147766761954117, + "learning_rate": 1.1789174707410914e-05, + "loss": 2.0756, + "mean_token_accuracy": 0.47586206197738645, + "step": 11705 + }, + { + "epoch": 0.011794436369972634, + "grad_norm": 30.423986876778347, + "learning_rate": 1.1794210664141975e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.4103448212146759, + "step": 11710 + }, + { + "epoch": 0.011799472423076806, + "grad_norm": 48.76108093174837, + "learning_rate": 1.1799246620873034e-05, + "loss": 2.6077, + "mean_token_accuracy": 0.4034482777118683, + "step": 11715 + }, + { + "epoch": 0.01180450847618098, + "grad_norm": 33.45300175282271, + "learning_rate": 1.1804282577604093e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.4379310429096222, + "step": 11720 + }, + { + "epoch": 0.011809544529285152, + "grad_norm": 35.70077979385335, + "learning_rate": 1.1809318534335154e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.4620689690113068, + "step": 11725 + }, + { + "epoch": 0.011814580582389326, + "grad_norm": 31.22088762399235, + "learning_rate": 1.1814354491066214e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.42758620977401735, + "step": 11730 + }, + { + "epoch": 0.011819616635493498, + "grad_norm": 29.091625305492162, + "learning_rate": 1.1819390447797273e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.4620689630508423, + "step": 11735 + }, + { + "epoch": 0.01182465268859767, + "grad_norm": 31.713195667671137, + "learning_rate": 1.1824426404528332e-05, + "loss": 2.1855, + "mean_token_accuracy": 0.4620689690113068, + "step": 11740 + }, + { + "epoch": 0.011829688741701844, + "grad_norm": 41.178585232472834, + "learning_rate": 1.1829462361259393e-05, + "loss": 2.5098, + "mean_token_accuracy": 0.4344827592372894, + "step": 11745 + }, + { + "epoch": 0.011834724794806016, + "grad_norm": 26.55604875629458, + "learning_rate": 1.1834498317990452e-05, + "loss": 2.2414, + "mean_token_accuracy": 0.43793103098869324, + "step": 11750 + }, + { + "epoch": 0.01183976084791019, + "grad_norm": 33.219480125829634, + "learning_rate": 1.1839534274721511e-05, + "loss": 2.341, + "mean_token_accuracy": 0.460556560754776, + "step": 11755 + }, + { + "epoch": 0.011844796901014362, + "grad_norm": 35.91387042737535, + "learning_rate": 1.1844570231452572e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.41185723543167113, + "step": 11760 + }, + { + "epoch": 0.011849832954118535, + "grad_norm": 30.829630617005744, + "learning_rate": 1.1849606188183632e-05, + "loss": 2.2007, + "mean_token_accuracy": 0.4689655065536499, + "step": 11765 + }, + { + "epoch": 0.011854869007222707, + "grad_norm": 26.424355055171844, + "learning_rate": 1.1854642144914693e-05, + "loss": 2.3634, + "mean_token_accuracy": 0.4551724135875702, + "step": 11770 + }, + { + "epoch": 0.01185990506032688, + "grad_norm": 39.74142420053445, + "learning_rate": 1.185967810164575e-05, + "loss": 2.6674, + "mean_token_accuracy": 0.3517241358757019, + "step": 11775 + }, + { + "epoch": 0.011864941113431053, + "grad_norm": 29.204152397600183, + "learning_rate": 1.1864714058376811e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.41724138259887694, + "step": 11780 + }, + { + "epoch": 0.011869977166535225, + "grad_norm": 30.820441868810942, + "learning_rate": 1.186975001510787e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.4379310429096222, + "step": 11785 + }, + { + "epoch": 0.011875013219639399, + "grad_norm": 34.93596359481468, + "learning_rate": 1.1874785971838931e-05, + "loss": 2.2903, + "mean_token_accuracy": 0.41379310488700866, + "step": 11790 + }, + { + "epoch": 0.011880049272743571, + "grad_norm": 35.18496665378755, + "learning_rate": 1.1879821928569989e-05, + "loss": 2.3573, + "mean_token_accuracy": 0.44827585816383364, + "step": 11795 + }, + { + "epoch": 0.011885085325847745, + "grad_norm": 26.243347308215515, + "learning_rate": 1.188485788530105e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.47586206793785096, + "step": 11800 + }, + { + "epoch": 0.011890121378951917, + "grad_norm": 42.77812947857821, + "learning_rate": 1.1889893842032109e-05, + "loss": 2.3755, + "mean_token_accuracy": 0.4620689630508423, + "step": 11805 + }, + { + "epoch": 0.011895157432056089, + "grad_norm": 29.44263931797846, + "learning_rate": 1.189492979876317e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.42413792610168455, + "step": 11810 + }, + { + "epoch": 0.011900193485160263, + "grad_norm": 29.622466696353953, + "learning_rate": 1.189996575549423e-05, + "loss": 2.222, + "mean_token_accuracy": 0.42758620381355283, + "step": 11815 + }, + { + "epoch": 0.011905229538264435, + "grad_norm": 59.869047997826236, + "learning_rate": 1.1905001712225289e-05, + "loss": 2.4852, + "mean_token_accuracy": 0.42068964838981626, + "step": 11820 + }, + { + "epoch": 0.011910265591368608, + "grad_norm": 40.603104169243366, + "learning_rate": 1.191003766895635e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.4034482717514038, + "step": 11825 + }, + { + "epoch": 0.01191530164447278, + "grad_norm": 30.373590109935176, + "learning_rate": 1.1915073625687409e-05, + "loss": 2.3887, + "mean_token_accuracy": 0.4241379380226135, + "step": 11830 + }, + { + "epoch": 0.011920337697576954, + "grad_norm": 27.992116159530656, + "learning_rate": 1.1920109582418468e-05, + "loss": 2.6311, + "mean_token_accuracy": 0.39655173420906065, + "step": 11835 + }, + { + "epoch": 0.011925373750681126, + "grad_norm": 21.99121016474274, + "learning_rate": 1.1925145539149527e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.4586206912994385, + "step": 11840 + }, + { + "epoch": 0.011930409803785298, + "grad_norm": 24.090678675968853, + "learning_rate": 1.1930181495880588e-05, + "loss": 2.096, + "mean_token_accuracy": 0.463520872592926, + "step": 11845 + }, + { + "epoch": 0.011935445856889472, + "grad_norm": 22.389842023634845, + "learning_rate": 1.1935217452611647e-05, + "loss": 2.1761, + "mean_token_accuracy": 0.4344827651977539, + "step": 11850 + }, + { + "epoch": 0.011940481909993644, + "grad_norm": 23.848870125804265, + "learning_rate": 1.1940253409342707e-05, + "loss": 2.626, + "mean_token_accuracy": 0.3670901358127594, + "step": 11855 + }, + { + "epoch": 0.011945517963097818, + "grad_norm": 30.34740952210801, + "learning_rate": 1.1945289366073768e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.42758620381355283, + "step": 11860 + }, + { + "epoch": 0.01195055401620199, + "grad_norm": 29.72381944433842, + "learning_rate": 1.1950325322804827e-05, + "loss": 2.1655, + "mean_token_accuracy": 0.4586206912994385, + "step": 11865 + }, + { + "epoch": 0.011955590069306164, + "grad_norm": 37.395219009976124, + "learning_rate": 1.1955361279535888e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.43448275327682495, + "step": 11870 + }, + { + "epoch": 0.011960626122410336, + "grad_norm": 33.79469696269186, + "learning_rate": 1.1960397236266947e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.4689655065536499, + "step": 11875 + }, + { + "epoch": 0.011965662175514508, + "grad_norm": 57.48276174097468, + "learning_rate": 1.1965433192998006e-05, + "loss": 2.508, + "mean_token_accuracy": 0.42758620381355283, + "step": 11880 + }, + { + "epoch": 0.011970698228618681, + "grad_norm": 23.57946935792069, + "learning_rate": 1.1970469149729066e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.4379310250282288, + "step": 11885 + }, + { + "epoch": 0.011975734281722853, + "grad_norm": 30.65853885506041, + "learning_rate": 1.1975505106460127e-05, + "loss": 2.2331, + "mean_token_accuracy": 0.4517241299152374, + "step": 11890 + }, + { + "epoch": 0.011980770334827027, + "grad_norm": 32.8691052090834, + "learning_rate": 1.1980541063191186e-05, + "loss": 2.0642, + "mean_token_accuracy": 0.4845130145549774, + "step": 11895 + }, + { + "epoch": 0.0119858063879312, + "grad_norm": 30.200057255884282, + "learning_rate": 1.1985577019922245e-05, + "loss": 2.3444, + "mean_token_accuracy": 0.4034482717514038, + "step": 11900 + }, + { + "epoch": 0.011990842441035373, + "grad_norm": 33.21811713029088, + "learning_rate": 1.1990612976653304e-05, + "loss": 2.1185, + "mean_token_accuracy": 0.41379310488700866, + "step": 11905 + }, + { + "epoch": 0.011995878494139545, + "grad_norm": 26.002346921505122, + "learning_rate": 1.1995648933384365e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.4431337058544159, + "step": 11910 + }, + { + "epoch": 0.012000914547243717, + "grad_norm": 28.41105857341897, + "learning_rate": 1.2000684890115425e-05, + "loss": 2.2993, + "mean_token_accuracy": 0.4120387136936188, + "step": 11915 + }, + { + "epoch": 0.01200595060034789, + "grad_norm": 35.13040381814801, + "learning_rate": 1.2005720846846484e-05, + "loss": 2.0992, + "mean_token_accuracy": 0.4344827651977539, + "step": 11920 + }, + { + "epoch": 0.012010986653452063, + "grad_norm": 33.902052562487064, + "learning_rate": 1.2010756803577545e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.42758620977401735, + "step": 11925 + }, + { + "epoch": 0.012016022706556237, + "grad_norm": 30.351260673050795, + "learning_rate": 1.2015792760308604e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.4448275864124298, + "step": 11930 + }, + { + "epoch": 0.012021058759660409, + "grad_norm": 29.727351781076695, + "learning_rate": 1.2020828717039665e-05, + "loss": 2.1047, + "mean_token_accuracy": 0.517241370677948, + "step": 11935 + }, + { + "epoch": 0.012026094812764582, + "grad_norm": 42.319718975242544, + "learning_rate": 1.2025864673770722e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.44827587008476255, + "step": 11940 + }, + { + "epoch": 0.012031130865868754, + "grad_norm": 34.90828718420889, + "learning_rate": 1.2030900630501783e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.4517241358757019, + "step": 11945 + }, + { + "epoch": 0.012036166918972926, + "grad_norm": 36.6431165004841, + "learning_rate": 1.2035936587232843e-05, + "loss": 2.4441, + "mean_token_accuracy": 0.4379310369491577, + "step": 11950 + }, + { + "epoch": 0.0120412029720771, + "grad_norm": 48.10309109629409, + "learning_rate": 1.2040972543963904e-05, + "loss": 2.2942, + "mean_token_accuracy": 0.44137930274009707, + "step": 11955 + }, + { + "epoch": 0.012046239025181272, + "grad_norm": 40.61548526380205, + "learning_rate": 1.2046008500694963e-05, + "loss": 2.7224, + "mean_token_accuracy": 0.3862068891525269, + "step": 11960 + }, + { + "epoch": 0.012051275078285446, + "grad_norm": 35.51999667987734, + "learning_rate": 1.2051044457426022e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.4172413766384125, + "step": 11965 + }, + { + "epoch": 0.012056311131389618, + "grad_norm": 36.76471067656789, + "learning_rate": 1.2056080414157083e-05, + "loss": 2.2225, + "mean_token_accuracy": 0.46896551847457885, + "step": 11970 + }, + { + "epoch": 0.012061347184493792, + "grad_norm": 31.02849146260985, + "learning_rate": 1.2061116370888142e-05, + "loss": 2.2786, + "mean_token_accuracy": 0.4275862157344818, + "step": 11975 + }, + { + "epoch": 0.012066383237597964, + "grad_norm": 25.74508082509686, + "learning_rate": 1.2066152327619202e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.42758620381355283, + "step": 11980 + }, + { + "epoch": 0.012071419290702136, + "grad_norm": 24.714618897499193, + "learning_rate": 1.207118828435026e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.4, + "step": 11985 + }, + { + "epoch": 0.01207645534380631, + "grad_norm": 36.555366923221555, + "learning_rate": 1.2076224241081322e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.44593595564365385, + "step": 11990 + }, + { + "epoch": 0.012081491396910482, + "grad_norm": 34.32218517791795, + "learning_rate": 1.2081260197812381e-05, + "loss": 2.5374, + "mean_token_accuracy": 0.42413792610168455, + "step": 11995 + }, + { + "epoch": 0.012086527450014655, + "grad_norm": 29.016327400206517, + "learning_rate": 1.208629615454344e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.42068966031074523, + "step": 12000 + }, + { + "epoch": 0.012091563503118827, + "grad_norm": 32.278258368869174, + "learning_rate": 1.2091332111274501e-05, + "loss": 2.2051, + "mean_token_accuracy": 0.4344827592372894, + "step": 12005 + }, + { + "epoch": 0.012096599556223001, + "grad_norm": 34.500678726034955, + "learning_rate": 1.209636806800556e-05, + "loss": 2.2132, + "mean_token_accuracy": 0.4746521472930908, + "step": 12010 + }, + { + "epoch": 0.012101635609327173, + "grad_norm": 23.72196587971329, + "learning_rate": 1.2101404024736621e-05, + "loss": 2.254, + "mean_token_accuracy": 0.49310343265533446, + "step": 12015 + }, + { + "epoch": 0.012106671662431345, + "grad_norm": 25.039264292882553, + "learning_rate": 1.2106439981467679e-05, + "loss": 2.551, + "mean_token_accuracy": 0.36896551847457887, + "step": 12020 + }, + { + "epoch": 0.012111707715535519, + "grad_norm": 27.67678840246632, + "learning_rate": 1.211147593819874e-05, + "loss": 2.1059, + "mean_token_accuracy": 0.46896551847457885, + "step": 12025 + }, + { + "epoch": 0.012116743768639691, + "grad_norm": 32.01213154648573, + "learning_rate": 1.21165118949298e-05, + "loss": 2.1221, + "mean_token_accuracy": 0.47241379618644713, + "step": 12030 + }, + { + "epoch": 0.012121779821743865, + "grad_norm": 32.43952805086492, + "learning_rate": 1.212154785166086e-05, + "loss": 2.4842, + "mean_token_accuracy": 0.4310344815254211, + "step": 12035 + }, + { + "epoch": 0.012126815874848037, + "grad_norm": 32.91024604085388, + "learning_rate": 1.2126583808391918e-05, + "loss": 2.5699, + "mean_token_accuracy": 0.3482758581638336, + "step": 12040 + }, + { + "epoch": 0.01213185192795221, + "grad_norm": 40.23602559617418, + "learning_rate": 1.2131619765122979e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.47352216243743894, + "step": 12045 + }, + { + "epoch": 0.012136887981056383, + "grad_norm": 31.90534793888992, + "learning_rate": 1.2136655721854038e-05, + "loss": 2.409, + "mean_token_accuracy": 0.42413792610168455, + "step": 12050 + }, + { + "epoch": 0.012141924034160555, + "grad_norm": 29.161363129473802, + "learning_rate": 1.2141691678585099e-05, + "loss": 2.4105, + "mean_token_accuracy": 0.41724138259887694, + "step": 12055 + }, + { + "epoch": 0.012146960087264728, + "grad_norm": 30.206054380356, + "learning_rate": 1.2146727635316158e-05, + "loss": 2.2371, + "mean_token_accuracy": 0.47434966564178466, + "step": 12060 + }, + { + "epoch": 0.0121519961403689, + "grad_norm": 30.366279329096894, + "learning_rate": 1.2151763592047217e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.47404718995094297, + "step": 12065 + }, + { + "epoch": 0.012157032193473074, + "grad_norm": 32.6056171639377, + "learning_rate": 1.2156799548778278e-05, + "loss": 2.3538, + "mean_token_accuracy": 0.48862674832344055, + "step": 12070 + }, + { + "epoch": 0.012162068246577246, + "grad_norm": 35.79740680820382, + "learning_rate": 1.2161835505509338e-05, + "loss": 2.3017, + "mean_token_accuracy": 0.41724138259887694, + "step": 12075 + }, + { + "epoch": 0.01216710429968142, + "grad_norm": 32.8361135077361, + "learning_rate": 1.2166871462240397e-05, + "loss": 2.052, + "mean_token_accuracy": 0.5068965435028077, + "step": 12080 + }, + { + "epoch": 0.012172140352785592, + "grad_norm": 27.195149448502644, + "learning_rate": 1.2171907418971456e-05, + "loss": 2.0828, + "mean_token_accuracy": 0.44482758045196535, + "step": 12085 + }, + { + "epoch": 0.012177176405889764, + "grad_norm": 35.51346782622862, + "learning_rate": 1.2176943375702517e-05, + "loss": 2.2336, + "mean_token_accuracy": 0.4884452521800995, + "step": 12090 + }, + { + "epoch": 0.012182212458993938, + "grad_norm": 47.91198506078173, + "learning_rate": 1.2181979332433576e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.37586207389831544, + "step": 12095 + }, + { + "epoch": 0.01218724851209811, + "grad_norm": 25.24952257274832, + "learning_rate": 1.2187015289164636e-05, + "loss": 2.2287, + "mean_token_accuracy": 0.44827585816383364, + "step": 12100 + }, + { + "epoch": 0.012192284565202284, + "grad_norm": 41.366680473854885, + "learning_rate": 1.2192051245895696e-05, + "loss": 2.1682, + "mean_token_accuracy": 0.49655171632766726, + "step": 12105 + }, + { + "epoch": 0.012197320618306456, + "grad_norm": 29.580693126967436, + "learning_rate": 1.2197087202626756e-05, + "loss": 2.4111, + "mean_token_accuracy": 0.4103448212146759, + "step": 12110 + }, + { + "epoch": 0.01220235667141063, + "grad_norm": 27.795948916880032, + "learning_rate": 1.2202123159357817e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.47434966564178466, + "step": 12115 + }, + { + "epoch": 0.012207392724514802, + "grad_norm": 24.31278972152385, + "learning_rate": 1.2207159116088874e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.46206897497177124, + "step": 12120 + }, + { + "epoch": 0.012212428777618974, + "grad_norm": 44.73619705892749, + "learning_rate": 1.2212195072819935e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.3907440960407257, + "step": 12125 + }, + { + "epoch": 0.012217464830723147, + "grad_norm": 44.09987761342455, + "learning_rate": 1.2217231029550994e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.41379311084747317, + "step": 12130 + }, + { + "epoch": 0.01222250088382732, + "grad_norm": 31.159134806619846, + "learning_rate": 1.2222266986282055e-05, + "loss": 2.5624, + "mean_token_accuracy": 0.38275861740112305, + "step": 12135 + }, + { + "epoch": 0.012227536936931493, + "grad_norm": 35.7648789972433, + "learning_rate": 1.2227302943013113e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.39655172228813174, + "step": 12140 + }, + { + "epoch": 0.012232572990035665, + "grad_norm": 38.242615424674746, + "learning_rate": 1.2232338899744174e-05, + "loss": 2.6402, + "mean_token_accuracy": 0.39655172228813174, + "step": 12145 + }, + { + "epoch": 0.012237609043139839, + "grad_norm": 40.887422675709274, + "learning_rate": 1.2237374856475233e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.415426504611969, + "step": 12150 + }, + { + "epoch": 0.012242645096244011, + "grad_norm": 31.931748143686473, + "learning_rate": 1.2242410813206294e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.4379310250282288, + "step": 12155 + }, + { + "epoch": 0.012247681149348183, + "grad_norm": 24.02194620414054, + "learning_rate": 1.2247446769937353e-05, + "loss": 2.1376, + "mean_token_accuracy": 0.43103448748588563, + "step": 12160 + }, + { + "epoch": 0.012252717202452357, + "grad_norm": 32.9457221709957, + "learning_rate": 1.2252482726668413e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.39310344457626345, + "step": 12165 + }, + { + "epoch": 0.012257753255556529, + "grad_norm": 33.00860703281463, + "learning_rate": 1.2257518683399474e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4034482777118683, + "step": 12170 + }, + { + "epoch": 0.012262789308660703, + "grad_norm": 44.89004291993204, + "learning_rate": 1.2262554640130533e-05, + "loss": 2.4888, + "mean_token_accuracy": 0.35172412991523744, + "step": 12175 + }, + { + "epoch": 0.012267825361764875, + "grad_norm": 44.11400169077773, + "learning_rate": 1.2267590596861592e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.41034482717514037, + "step": 12180 + }, + { + "epoch": 0.012272861414869048, + "grad_norm": 30.01498010857881, + "learning_rate": 1.2272626553592651e-05, + "loss": 2.3986, + "mean_token_accuracy": 0.4344827473163605, + "step": 12185 + }, + { + "epoch": 0.01227789746797322, + "grad_norm": 21.309668670364317, + "learning_rate": 1.2277662510323712e-05, + "loss": 2.114, + "mean_token_accuracy": 0.48965516686439514, + "step": 12190 + }, + { + "epoch": 0.012282933521077392, + "grad_norm": 26.11629365341752, + "learning_rate": 1.2282698467054771e-05, + "loss": 2.3484, + "mean_token_accuracy": 0.41034482717514037, + "step": 12195 + }, + { + "epoch": 0.012287969574181566, + "grad_norm": 27.875106173708257, + "learning_rate": 1.228773442378583e-05, + "loss": 2.3855, + "mean_token_accuracy": 0.44827585816383364, + "step": 12200 + }, + { + "epoch": 0.012293005627285738, + "grad_norm": 30.164966670858732, + "learning_rate": 1.2292770380516892e-05, + "loss": 2.3645, + "mean_token_accuracy": 0.4482758641242981, + "step": 12205 + }, + { + "epoch": 0.012298041680389912, + "grad_norm": 26.723320574896206, + "learning_rate": 1.2297806337247951e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.458620685338974, + "step": 12210 + }, + { + "epoch": 0.012303077733494084, + "grad_norm": 27.655417095411156, + "learning_rate": 1.2302842293979012e-05, + "loss": 2.1854, + "mean_token_accuracy": 0.42413793206214906, + "step": 12215 + }, + { + "epoch": 0.012308113786598258, + "grad_norm": 25.755773423332897, + "learning_rate": 1.230787825071007e-05, + "loss": 2.3357, + "mean_token_accuracy": 0.4655172348022461, + "step": 12220 + }, + { + "epoch": 0.01231314983970243, + "grad_norm": 31.448424780292072, + "learning_rate": 1.231291420744113e-05, + "loss": 2.2223, + "mean_token_accuracy": 0.4620689630508423, + "step": 12225 + }, + { + "epoch": 0.012318185892806602, + "grad_norm": 24.98279423637835, + "learning_rate": 1.231795016417219e-05, + "loss": 2.3607, + "mean_token_accuracy": 0.4448275864124298, + "step": 12230 + }, + { + "epoch": 0.012323221945910776, + "grad_norm": 35.59861885622897, + "learning_rate": 1.232298612090325e-05, + "loss": 2.7285, + "mean_token_accuracy": 0.3896551728248596, + "step": 12235 + }, + { + "epoch": 0.012328257999014948, + "grad_norm": 20.429017503250947, + "learning_rate": 1.2328022077634308e-05, + "loss": 1.92, + "mean_token_accuracy": 0.5127041757106781, + "step": 12240 + }, + { + "epoch": 0.012333294052119121, + "grad_norm": 30.131159760352496, + "learning_rate": 1.2333058034365369e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.39310343861579894, + "step": 12245 + }, + { + "epoch": 0.012338330105223293, + "grad_norm": 48.5446176386253, + "learning_rate": 1.2338093991096428e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.42068964838981626, + "step": 12250 + }, + { + "epoch": 0.012343366158327467, + "grad_norm": 27.146414488717674, + "learning_rate": 1.234312994782749e-05, + "loss": 2.2559, + "mean_token_accuracy": 0.4620689570903778, + "step": 12255 + }, + { + "epoch": 0.01234840221143164, + "grad_norm": 43.04661433142306, + "learning_rate": 1.2348165904558549e-05, + "loss": 2.6008, + "mean_token_accuracy": 0.3793103456497192, + "step": 12260 + }, + { + "epoch": 0.012353438264535811, + "grad_norm": 25.40896500313097, + "learning_rate": 1.2353201861289608e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.44137930274009707, + "step": 12265 + }, + { + "epoch": 0.012358474317639985, + "grad_norm": 23.37790638152415, + "learning_rate": 1.2358237818020669e-05, + "loss": 2.5951, + "mean_token_accuracy": 0.3655172407627106, + "step": 12270 + }, + { + "epoch": 0.012363510370744157, + "grad_norm": 27.49061747667942, + "learning_rate": 1.2363273774751728e-05, + "loss": 2.3631, + "mean_token_accuracy": 0.4433151841163635, + "step": 12275 + }, + { + "epoch": 0.01236854642384833, + "grad_norm": 35.385313867573075, + "learning_rate": 1.2368309731482789e-05, + "loss": 2.3532, + "mean_token_accuracy": 0.40689654350280763, + "step": 12280 + }, + { + "epoch": 0.012373582476952503, + "grad_norm": 25.341615106386364, + "learning_rate": 1.2373345688213846e-05, + "loss": 2.204, + "mean_token_accuracy": 0.5241379320621491, + "step": 12285 + }, + { + "epoch": 0.012378618530056677, + "grad_norm": 29.179479229905585, + "learning_rate": 1.2378381644944907e-05, + "loss": 2.3344, + "mean_token_accuracy": 0.4310344815254211, + "step": 12290 + }, + { + "epoch": 0.012383654583160849, + "grad_norm": 25.541063248156956, + "learning_rate": 1.2383417601675967e-05, + "loss": 2.7222, + "mean_token_accuracy": 0.36896551251411436, + "step": 12295 + }, + { + "epoch": 0.01238869063626502, + "grad_norm": 35.224607397398145, + "learning_rate": 1.2388453558407028e-05, + "loss": 2.322, + "mean_token_accuracy": 0.43793103098869324, + "step": 12300 + }, + { + "epoch": 0.012393726689369194, + "grad_norm": 22.708548898606317, + "learning_rate": 1.2393489515138087e-05, + "loss": 2.0088, + "mean_token_accuracy": 0.5137931108474731, + "step": 12305 + }, + { + "epoch": 0.012398762742473366, + "grad_norm": 32.69161134529763, + "learning_rate": 1.2398525471869146e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.3862069010734558, + "step": 12310 + }, + { + "epoch": 0.01240379879557754, + "grad_norm": 47.44484033035358, + "learning_rate": 1.2403561428600207e-05, + "loss": 2.5551, + "mean_token_accuracy": 0.4137930989265442, + "step": 12315 + }, + { + "epoch": 0.012408834848681712, + "grad_norm": 38.70807064210927, + "learning_rate": 1.2408597385331266e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.4482758641242981, + "step": 12320 + }, + { + "epoch": 0.012413870901785886, + "grad_norm": 27.9617671840753, + "learning_rate": 1.2413633342062326e-05, + "loss": 2.0856, + "mean_token_accuracy": 0.501875388622284, + "step": 12325 + }, + { + "epoch": 0.012418906954890058, + "grad_norm": 24.76509584209334, + "learning_rate": 1.2418669298793385e-05, + "loss": 2.1031, + "mean_token_accuracy": 0.4950393199920654, + "step": 12330 + }, + { + "epoch": 0.01242394300799423, + "grad_norm": 27.286537860002337, + "learning_rate": 1.2423705255524446e-05, + "loss": 2.344, + "mean_token_accuracy": 0.4344827592372894, + "step": 12335 + }, + { + "epoch": 0.012428979061098404, + "grad_norm": 26.310202965921295, + "learning_rate": 1.2428741212255505e-05, + "loss": 2.6101, + "mean_token_accuracy": 0.3655172407627106, + "step": 12340 + }, + { + "epoch": 0.012434015114202576, + "grad_norm": 29.288826593649755, + "learning_rate": 1.2433777168986564e-05, + "loss": 2.4956, + "mean_token_accuracy": 0.41034482717514037, + "step": 12345 + }, + { + "epoch": 0.01243905116730675, + "grad_norm": 31.112832553224887, + "learning_rate": 1.2438813125717624e-05, + "loss": 2.4809, + "mean_token_accuracy": 0.42068964838981626, + "step": 12350 + }, + { + "epoch": 0.012444087220410922, + "grad_norm": 35.57880917039802, + "learning_rate": 1.2443849082448685e-05, + "loss": 2.421, + "mean_token_accuracy": 0.45517241954803467, + "step": 12355 + }, + { + "epoch": 0.012449123273515095, + "grad_norm": 32.95203755786022, + "learning_rate": 1.2448885039179744e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.4, + "step": 12360 + }, + { + "epoch": 0.012454159326619267, + "grad_norm": 27.700703152268495, + "learning_rate": 1.2453920995910803e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.36896551251411436, + "step": 12365 + }, + { + "epoch": 0.01245919537972344, + "grad_norm": 35.365804558101395, + "learning_rate": 1.2458956952641864e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.324137932062149, + "step": 12370 + }, + { + "epoch": 0.012464231432827613, + "grad_norm": 32.97976107029191, + "learning_rate": 1.2463992909372923e-05, + "loss": 2.1364, + "mean_token_accuracy": 0.47241379618644713, + "step": 12375 + }, + { + "epoch": 0.012469267485931785, + "grad_norm": 32.98809461892356, + "learning_rate": 1.2469028866103984e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.4620689630508423, + "step": 12380 + }, + { + "epoch": 0.012474303539035959, + "grad_norm": 27.020086462010717, + "learning_rate": 1.2474064822835042e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.44827585816383364, + "step": 12385 + }, + { + "epoch": 0.012479339592140131, + "grad_norm": 41.278708666932026, + "learning_rate": 1.2479100779566103e-05, + "loss": 2.394, + "mean_token_accuracy": 0.4172413766384125, + "step": 12390 + }, + { + "epoch": 0.012484375645244303, + "grad_norm": 32.959432220913286, + "learning_rate": 1.2484136736297162e-05, + "loss": 2.0557, + "mean_token_accuracy": 0.5, + "step": 12395 + }, + { + "epoch": 0.012489411698348477, + "grad_norm": 25.54759803744777, + "learning_rate": 1.2489172693028223e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.4137930989265442, + "step": 12400 + }, + { + "epoch": 0.012494447751452649, + "grad_norm": 36.15524711163905, + "learning_rate": 1.2494208649759282e-05, + "loss": 2.7923, + "mean_token_accuracy": 0.37931033968925476, + "step": 12405 + }, + { + "epoch": 0.012499483804556823, + "grad_norm": 24.931286486004183, + "learning_rate": 1.2499244606490341e-05, + "loss": 2.1781, + "mean_token_accuracy": 0.4517241358757019, + "step": 12410 + }, + { + "epoch": 0.012504519857660995, + "grad_norm": 24.71513888532816, + "learning_rate": 1.2504280563221402e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.4379310369491577, + "step": 12415 + }, + { + "epoch": 0.012509555910765168, + "grad_norm": 24.781446474498864, + "learning_rate": 1.250931651995246e-05, + "loss": 2.1278, + "mean_token_accuracy": 0.48620688915252686, + "step": 12420 + }, + { + "epoch": 0.01251459196386934, + "grad_norm": 35.22602686370479, + "learning_rate": 1.2514352476683523e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.40689654350280763, + "step": 12425 + }, + { + "epoch": 0.012519628016973513, + "grad_norm": 35.25326745978308, + "learning_rate": 1.251938843341458e-05, + "loss": 2.0507, + "mean_token_accuracy": 0.4983061134815216, + "step": 12430 + }, + { + "epoch": 0.012524664070077686, + "grad_norm": 41.8082689159271, + "learning_rate": 1.252442439014564e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.3793103456497192, + "step": 12435 + }, + { + "epoch": 0.012529700123181858, + "grad_norm": 32.11253142547804, + "learning_rate": 1.25294603468767e-05, + "loss": 2.6194, + "mean_token_accuracy": 0.42413793206214906, + "step": 12440 + }, + { + "epoch": 0.012534736176286032, + "grad_norm": 29.054580901967682, + "learning_rate": 1.253449630360776e-05, + "loss": 2.2797, + "mean_token_accuracy": 0.4379310250282288, + "step": 12445 + }, + { + "epoch": 0.012539772229390204, + "grad_norm": 25.22875251004564, + "learning_rate": 1.253953226033882e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.4, + "step": 12450 + }, + { + "epoch": 0.012544808282494378, + "grad_norm": 51.59575048924858, + "learning_rate": 1.254456821706988e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.4551724135875702, + "step": 12455 + }, + { + "epoch": 0.01254984433559855, + "grad_norm": 27.2790310159277, + "learning_rate": 1.2549604173800939e-05, + "loss": 2.1536, + "mean_token_accuracy": 0.43793103098869324, + "step": 12460 + }, + { + "epoch": 0.012554880388702722, + "grad_norm": 31.30030855839559, + "learning_rate": 1.2554640130532e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.43103447556495667, + "step": 12465 + }, + { + "epoch": 0.012559916441806896, + "grad_norm": 25.26799897715249, + "learning_rate": 1.255967608726306e-05, + "loss": 2.0066, + "mean_token_accuracy": 0.4433151841163635, + "step": 12470 + }, + { + "epoch": 0.012564952494911068, + "grad_norm": 24.987728592457096, + "learning_rate": 1.2564712043994117e-05, + "loss": 2.1716, + "mean_token_accuracy": 0.4724137902259827, + "step": 12475 + }, + { + "epoch": 0.012569988548015242, + "grad_norm": 35.16232997894578, + "learning_rate": 1.256974800072518e-05, + "loss": 1.9862, + "mean_token_accuracy": 0.5120992124080658, + "step": 12480 + }, + { + "epoch": 0.012575024601119414, + "grad_norm": 24.259143302035962, + "learning_rate": 1.2574783957456237e-05, + "loss": 1.9787, + "mean_token_accuracy": 0.458620685338974, + "step": 12485 + }, + { + "epoch": 0.012580060654223587, + "grad_norm": 31.140449407356805, + "learning_rate": 1.25798199141873e-05, + "loss": 2.7262, + "mean_token_accuracy": 0.35517241060733795, + "step": 12490 + }, + { + "epoch": 0.01258509670732776, + "grad_norm": 29.54485719851589, + "learning_rate": 1.2584855870918357e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4482758641242981, + "step": 12495 + }, + { + "epoch": 0.012590132760431931, + "grad_norm": 28.347802255076466, + "learning_rate": 1.2589891827649416e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.44827585816383364, + "step": 12500 + }, + { + "epoch": 0.012595168813536105, + "grad_norm": 27.13069768874934, + "learning_rate": 1.2594927784380477e-05, + "loss": 2.2474, + "mean_token_accuracy": 0.42413793206214906, + "step": 12505 + }, + { + "epoch": 0.012600204866640277, + "grad_norm": 26.013557359862748, + "learning_rate": 1.2599963741111537e-05, + "loss": 2.0707, + "mean_token_accuracy": 0.4931034445762634, + "step": 12510 + }, + { + "epoch": 0.012605240919744451, + "grad_norm": 23.211625934928932, + "learning_rate": 1.2604999697842596e-05, + "loss": 2.3397, + "mean_token_accuracy": 0.42413792610168455, + "step": 12515 + }, + { + "epoch": 0.012610276972848623, + "grad_norm": 31.799509596239663, + "learning_rate": 1.2610035654573657e-05, + "loss": 2.3452, + "mean_token_accuracy": 0.45656382441520693, + "step": 12520 + }, + { + "epoch": 0.012615313025952797, + "grad_norm": 28.174141859846408, + "learning_rate": 1.2615071611304716e-05, + "loss": 2.5538, + "mean_token_accuracy": 0.38620689511299133, + "step": 12525 + }, + { + "epoch": 0.012620349079056969, + "grad_norm": 31.155542413297635, + "learning_rate": 1.2620107568035777e-05, + "loss": 2.5918, + "mean_token_accuracy": 0.412099215388298, + "step": 12530 + }, + { + "epoch": 0.01262538513216114, + "grad_norm": 36.72536837237263, + "learning_rate": 1.2625143524766836e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.3689655065536499, + "step": 12535 + }, + { + "epoch": 0.012630421185265315, + "grad_norm": 32.56709727600641, + "learning_rate": 1.2630179481497894e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.43448275327682495, + "step": 12540 + }, + { + "epoch": 0.012635457238369487, + "grad_norm": 32.31279525695266, + "learning_rate": 1.2635215438228956e-05, + "loss": 2.2739, + "mean_token_accuracy": 0.4034482777118683, + "step": 12545 + }, + { + "epoch": 0.01264049329147366, + "grad_norm": 47.25737661831153, + "learning_rate": 1.2640251394960016e-05, + "loss": 2.5598, + "mean_token_accuracy": 0.37241379022598264, + "step": 12550 + }, + { + "epoch": 0.012645529344577832, + "grad_norm": 31.864885779048016, + "learning_rate": 1.2645287351691073e-05, + "loss": 2.2602, + "mean_token_accuracy": 0.4655172288417816, + "step": 12555 + }, + { + "epoch": 0.012650565397682006, + "grad_norm": 28.524852632538792, + "learning_rate": 1.2650323308422136e-05, + "loss": 2.3715, + "mean_token_accuracy": 0.41724138259887694, + "step": 12560 + }, + { + "epoch": 0.012655601450786178, + "grad_norm": 31.215855647245988, + "learning_rate": 1.2655359265153193e-05, + "loss": 2.6743, + "mean_token_accuracy": 0.3517241358757019, + "step": 12565 + }, + { + "epoch": 0.01266063750389035, + "grad_norm": 49.518531177360714, + "learning_rate": 1.2660395221884256e-05, + "loss": 2.5935, + "mean_token_accuracy": 0.39999999701976774, + "step": 12570 + }, + { + "epoch": 0.012665673556994524, + "grad_norm": 39.34856259495588, + "learning_rate": 1.2665431178615314e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.441379314661026, + "step": 12575 + }, + { + "epoch": 0.012670709610098696, + "grad_norm": 32.406584460502856, + "learning_rate": 1.2670467135346373e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.37931033968925476, + "step": 12580 + }, + { + "epoch": 0.01267574566320287, + "grad_norm": 24.96458692458947, + "learning_rate": 1.2675503092077434e-05, + "loss": 2.6967, + "mean_token_accuracy": 0.3620689630508423, + "step": 12585 + }, + { + "epoch": 0.012680781716307042, + "grad_norm": 28.173207219771577, + "learning_rate": 1.2680539048808493e-05, + "loss": 2.0474, + "mean_token_accuracy": 0.4896551787853241, + "step": 12590 + }, + { + "epoch": 0.012685817769411216, + "grad_norm": 37.32953389360052, + "learning_rate": 1.2685575005539552e-05, + "loss": 2.3558, + "mean_token_accuracy": 0.38620689511299133, + "step": 12595 + }, + { + "epoch": 0.012690853822515388, + "grad_norm": 30.089706154473134, + "learning_rate": 1.2690610962270613e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.4862068951129913, + "step": 12600 + }, + { + "epoch": 0.01269588987561956, + "grad_norm": 27.694821905567846, + "learning_rate": 1.2695646919001673e-05, + "loss": 2.2743, + "mean_token_accuracy": 0.46551724076271056, + "step": 12605 + }, + { + "epoch": 0.012700925928723733, + "grad_norm": 31.646543090492017, + "learning_rate": 1.2700682875732734e-05, + "loss": 2.1807, + "mean_token_accuracy": 0.42413793206214906, + "step": 12610 + }, + { + "epoch": 0.012705961981827905, + "grad_norm": 36.95474372000229, + "learning_rate": 1.2705718832463793e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.41034482717514037, + "step": 12615 + }, + { + "epoch": 0.01271099803493208, + "grad_norm": 29.810877420075435, + "learning_rate": 1.271075478919485e-05, + "loss": 2.363, + "mean_token_accuracy": 0.4068965494632721, + "step": 12620 + }, + { + "epoch": 0.012716034088036251, + "grad_norm": 31.577223317974738, + "learning_rate": 1.2715790745925913e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.40798547863960266, + "step": 12625 + }, + { + "epoch": 0.012721070141140425, + "grad_norm": 30.18369857092084, + "learning_rate": 1.272082670265697e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.3965517163276672, + "step": 12630 + }, + { + "epoch": 0.012726106194244597, + "grad_norm": 31.931394544713104, + "learning_rate": 1.272586265938803e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.417241370677948, + "step": 12635 + }, + { + "epoch": 0.012731142247348769, + "grad_norm": 31.667023714276244, + "learning_rate": 1.273089861611909e-05, + "loss": 2.3703, + "mean_token_accuracy": 0.3827586233615875, + "step": 12640 + }, + { + "epoch": 0.012736178300452943, + "grad_norm": 33.01904718223163, + "learning_rate": 1.273593457285015e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.43623714447021483, + "step": 12645 + }, + { + "epoch": 0.012741214353557115, + "grad_norm": 28.75349632298836, + "learning_rate": 1.2740970529581211e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4206896543502808, + "step": 12650 + }, + { + "epoch": 0.012746250406661289, + "grad_norm": 43.89957086085222, + "learning_rate": 1.274600648631227e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.38620689511299133, + "step": 12655 + }, + { + "epoch": 0.01275128645976546, + "grad_norm": 35.192670025155444, + "learning_rate": 1.275104244304333e-05, + "loss": 2.4297, + "mean_token_accuracy": 0.43103447556495667, + "step": 12660 + }, + { + "epoch": 0.012756322512869634, + "grad_norm": 30.650579666006582, + "learning_rate": 1.275607839977439e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.42068966031074523, + "step": 12665 + }, + { + "epoch": 0.012761358565973806, + "grad_norm": 31.011786103933645, + "learning_rate": 1.276111435650545e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.4086509466171265, + "step": 12670 + }, + { + "epoch": 0.012766394619077978, + "grad_norm": 33.38479119722718, + "learning_rate": 1.2766150313236507e-05, + "loss": 2.3375, + "mean_token_accuracy": 0.4448275864124298, + "step": 12675 + }, + { + "epoch": 0.012771430672182152, + "grad_norm": 30.475820515807357, + "learning_rate": 1.277118626996757e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.3793103456497192, + "step": 12680 + }, + { + "epoch": 0.012776466725286324, + "grad_norm": 25.73170396499636, + "learning_rate": 1.2776222226698627e-05, + "loss": 2.232, + "mean_token_accuracy": 0.42068966031074523, + "step": 12685 + }, + { + "epoch": 0.012781502778390498, + "grad_norm": 25.757602331231993, + "learning_rate": 1.278125818342969e-05, + "loss": 2.4408, + "mean_token_accuracy": 0.4396249294281006, + "step": 12690 + }, + { + "epoch": 0.01278653883149467, + "grad_norm": 32.45572523514291, + "learning_rate": 1.2786294140160748e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.44827585816383364, + "step": 12695 + }, + { + "epoch": 0.012791574884598844, + "grad_norm": 28.769040816368303, + "learning_rate": 1.2791330096891807e-05, + "loss": 2.4698, + "mean_token_accuracy": 0.42413793206214906, + "step": 12700 + }, + { + "epoch": 0.012796610937703016, + "grad_norm": 37.55962852563274, + "learning_rate": 1.2796366053622868e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.4870689630508423, + "step": 12705 + }, + { + "epoch": 0.012801646990807188, + "grad_norm": 27.589639816641036, + "learning_rate": 1.2801402010353927e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.4551724135875702, + "step": 12710 + }, + { + "epoch": 0.012806683043911362, + "grad_norm": 39.01467674178639, + "learning_rate": 1.2806437967084988e-05, + "loss": 2.6747, + "mean_token_accuracy": 0.38275861740112305, + "step": 12715 + }, + { + "epoch": 0.012811719097015534, + "grad_norm": 31.54599311241794, + "learning_rate": 1.2811473923816047e-05, + "loss": 2.1155, + "mean_token_accuracy": 0.4881427764892578, + "step": 12720 + }, + { + "epoch": 0.012816755150119707, + "grad_norm": 38.41931270092057, + "learning_rate": 1.2816509880547106e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.42758620381355283, + "step": 12725 + }, + { + "epoch": 0.01282179120322388, + "grad_norm": 32.52700224985398, + "learning_rate": 1.2821545837278167e-05, + "loss": 1.7954, + "mean_token_accuracy": 0.5517241477966308, + "step": 12730 + }, + { + "epoch": 0.012826827256328053, + "grad_norm": 30.782415251204366, + "learning_rate": 1.2826581794009227e-05, + "loss": 2.2334, + "mean_token_accuracy": 0.42758620381355283, + "step": 12735 + }, + { + "epoch": 0.012831863309432225, + "grad_norm": 44.07255498344601, + "learning_rate": 1.2831617750740286e-05, + "loss": 2.7697, + "mean_token_accuracy": 0.4344827592372894, + "step": 12740 + }, + { + "epoch": 0.012836899362536397, + "grad_norm": 30.566807063967055, + "learning_rate": 1.2836653707471347e-05, + "loss": 2.0987, + "mean_token_accuracy": 0.458620685338974, + "step": 12745 + }, + { + "epoch": 0.012841935415640571, + "grad_norm": 32.43777592863901, + "learning_rate": 1.2841689664202406e-05, + "loss": 2.3014, + "mean_token_accuracy": 0.40689654350280763, + "step": 12750 + }, + { + "epoch": 0.012846971468744743, + "grad_norm": 29.424119561405323, + "learning_rate": 1.2846725620933467e-05, + "loss": 2.5864, + "mean_token_accuracy": 0.42068964838981626, + "step": 12755 + }, + { + "epoch": 0.012852007521848917, + "grad_norm": 31.273473895404905, + "learning_rate": 1.2851761577664526e-05, + "loss": 2.319, + "mean_token_accuracy": 0.4517241418361664, + "step": 12760 + }, + { + "epoch": 0.012857043574953089, + "grad_norm": 37.668560355649866, + "learning_rate": 1.2856797534395584e-05, + "loss": 2.8225, + "mean_token_accuracy": 0.417241370677948, + "step": 12765 + }, + { + "epoch": 0.012862079628057263, + "grad_norm": 32.0196168050631, + "learning_rate": 1.2861833491126647e-05, + "loss": 2.7624, + "mean_token_accuracy": 0.35862069129943847, + "step": 12770 + }, + { + "epoch": 0.012867115681161435, + "grad_norm": 28.41207023695925, + "learning_rate": 1.2866869447857704e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.4379310369491577, + "step": 12775 + }, + { + "epoch": 0.012872151734265607, + "grad_norm": 36.93349054722976, + "learning_rate": 1.2871905404588763e-05, + "loss": 2.4285, + "mean_token_accuracy": 0.43793103098869324, + "step": 12780 + }, + { + "epoch": 0.01287718778736978, + "grad_norm": 28.567774362389525, + "learning_rate": 1.2876941361319824e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4448275864124298, + "step": 12785 + }, + { + "epoch": 0.012882223840473953, + "grad_norm": 34.28002705071036, + "learning_rate": 1.2881977318050884e-05, + "loss": 2.6179, + "mean_token_accuracy": 0.36896551549434664, + "step": 12790 + }, + { + "epoch": 0.012887259893578126, + "grad_norm": 25.95669245715233, + "learning_rate": 1.2887013274781945e-05, + "loss": 2.2537, + "mean_token_accuracy": 0.4172413766384125, + "step": 12795 + }, + { + "epoch": 0.012892295946682298, + "grad_norm": 32.76418069538328, + "learning_rate": 1.2892049231513004e-05, + "loss": 2.439, + "mean_token_accuracy": 0.4068965494632721, + "step": 12800 + }, + { + "epoch": 0.012897331999786472, + "grad_norm": 30.483698834661528, + "learning_rate": 1.2897085188244063e-05, + "loss": 2.3958, + "mean_token_accuracy": 0.4068965524435043, + "step": 12805 + }, + { + "epoch": 0.012902368052890644, + "grad_norm": 23.111492257569815, + "learning_rate": 1.2902121144975124e-05, + "loss": 2.2623, + "mean_token_accuracy": 0.42413792610168455, + "step": 12810 + }, + { + "epoch": 0.012907404105994816, + "grad_norm": 28.560933429334074, + "learning_rate": 1.2907157101706183e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4137930989265442, + "step": 12815 + }, + { + "epoch": 0.01291244015909899, + "grad_norm": 25.093406735469227, + "learning_rate": 1.291219305843724e-05, + "loss": 2.2457, + "mean_token_accuracy": 0.47586206197738645, + "step": 12820 + }, + { + "epoch": 0.012917476212203162, + "grad_norm": 25.620166034413394, + "learning_rate": 1.2917229015168303e-05, + "loss": 2.0981, + "mean_token_accuracy": 0.4896551609039307, + "step": 12825 + }, + { + "epoch": 0.012922512265307336, + "grad_norm": 23.74110744119613, + "learning_rate": 1.2922264971899361e-05, + "loss": 2.2391, + "mean_token_accuracy": 0.49522080421447756, + "step": 12830 + }, + { + "epoch": 0.012927548318411508, + "grad_norm": 30.55861636771355, + "learning_rate": 1.2927300928630424e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.441379314661026, + "step": 12835 + }, + { + "epoch": 0.012932584371515682, + "grad_norm": 26.0317251444116, + "learning_rate": 1.2932336885361481e-05, + "loss": 2.3061, + "mean_token_accuracy": 0.4551724076271057, + "step": 12840 + }, + { + "epoch": 0.012937620424619854, + "grad_norm": 26.985724445535347, + "learning_rate": 1.293737284209254e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.4344827592372894, + "step": 12845 + }, + { + "epoch": 0.012942656477724026, + "grad_norm": 27.969385054594625, + "learning_rate": 1.2942408798823601e-05, + "loss": 2.5063, + "mean_token_accuracy": 0.40344826579093934, + "step": 12850 + }, + { + "epoch": 0.0129476925308282, + "grad_norm": 30.513456889761024, + "learning_rate": 1.294744475555466e-05, + "loss": 1.9497, + "mean_token_accuracy": 0.4770935893058777, + "step": 12855 + }, + { + "epoch": 0.012952728583932371, + "grad_norm": 46.75875363559398, + "learning_rate": 1.295248071228572e-05, + "loss": 2.7059, + "mean_token_accuracy": 0.42758620381355283, + "step": 12860 + }, + { + "epoch": 0.012957764637036545, + "grad_norm": 26.230855533020616, + "learning_rate": 1.295751666901678e-05, + "loss": 2.2372, + "mean_token_accuracy": 0.47241378426551817, + "step": 12865 + }, + { + "epoch": 0.012962800690140717, + "grad_norm": 28.802748603821968, + "learning_rate": 1.296255262574784e-05, + "loss": 2.1341, + "mean_token_accuracy": 0.4798029541969299, + "step": 12870 + }, + { + "epoch": 0.012967836743244891, + "grad_norm": 33.73548848255155, + "learning_rate": 1.2967588582478901e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.4068965554237366, + "step": 12875 + }, + { + "epoch": 0.012972872796349063, + "grad_norm": 42.73450696683177, + "learning_rate": 1.297262453920996e-05, + "loss": 2.2558, + "mean_token_accuracy": 0.43793103992939, + "step": 12880 + }, + { + "epoch": 0.012977908849453235, + "grad_norm": 35.47754526219574, + "learning_rate": 1.2977660495941018e-05, + "loss": 2.426, + "mean_token_accuracy": 0.42413792610168455, + "step": 12885 + }, + { + "epoch": 0.012982944902557409, + "grad_norm": 38.609434301944674, + "learning_rate": 1.298269645267208e-05, + "loss": 2.3408, + "mean_token_accuracy": 0.43854679465293883, + "step": 12890 + }, + { + "epoch": 0.01298798095566158, + "grad_norm": 35.45261316088407, + "learning_rate": 1.2987732409403138e-05, + "loss": 2.402, + "mean_token_accuracy": 0.4534180223941803, + "step": 12895 + }, + { + "epoch": 0.012993017008765755, + "grad_norm": 33.4224987380504, + "learning_rate": 1.2992768366134197e-05, + "loss": 2.4698, + "mean_token_accuracy": 0.4034482717514038, + "step": 12900 + }, + { + "epoch": 0.012998053061869927, + "grad_norm": 35.609743099720625, + "learning_rate": 1.2997804322865258e-05, + "loss": 2.4653, + "mean_token_accuracy": 0.41379310488700866, + "step": 12905 + }, + { + "epoch": 0.0130030891149741, + "grad_norm": 24.919853466004504, + "learning_rate": 1.3002840279596317e-05, + "loss": 2.3445, + "mean_token_accuracy": 0.43793103098869324, + "step": 12910 + }, + { + "epoch": 0.013008125168078272, + "grad_norm": 25.4773996910296, + "learning_rate": 1.300787623632738e-05, + "loss": 2.3354, + "mean_token_accuracy": 0.46551724672317507, + "step": 12915 + }, + { + "epoch": 0.013013161221182444, + "grad_norm": 29.623283373632454, + "learning_rate": 1.3012912193058438e-05, + "loss": 2.3086, + "mean_token_accuracy": 0.44137930274009707, + "step": 12920 + }, + { + "epoch": 0.013018197274286618, + "grad_norm": 32.02303539201582, + "learning_rate": 1.3017948149789497e-05, + "loss": 2.1336, + "mean_token_accuracy": 0.45517241954803467, + "step": 12925 + }, + { + "epoch": 0.01302323332739079, + "grad_norm": 26.590419690457505, + "learning_rate": 1.3022984106520558e-05, + "loss": 2.286, + "mean_token_accuracy": 0.43103447556495667, + "step": 12930 + }, + { + "epoch": 0.013028269380494964, + "grad_norm": 24.97942720598132, + "learning_rate": 1.3028020063251617e-05, + "loss": 2.3042, + "mean_token_accuracy": 0.44827585220336913, + "step": 12935 + }, + { + "epoch": 0.013033305433599136, + "grad_norm": 30.217694894270398, + "learning_rate": 1.3033056019982676e-05, + "loss": 2.323, + "mean_token_accuracy": 0.43793103098869324, + "step": 12940 + }, + { + "epoch": 0.01303834148670331, + "grad_norm": 38.34318197705085, + "learning_rate": 1.3038091976713737e-05, + "loss": 2.3795, + "mean_token_accuracy": 0.42631577849388125, + "step": 12945 + }, + { + "epoch": 0.013043377539807482, + "grad_norm": 28.4063648755663, + "learning_rate": 1.3043127933444797e-05, + "loss": 2.1181, + "mean_token_accuracy": 0.482758629322052, + "step": 12950 + }, + { + "epoch": 0.013048413592911654, + "grad_norm": 30.205296249243396, + "learning_rate": 1.3048163890175858e-05, + "loss": 2.6092, + "mean_token_accuracy": 0.4206896543502808, + "step": 12955 + }, + { + "epoch": 0.013053449646015828, + "grad_norm": 28.583186441150584, + "learning_rate": 1.3053199846906917e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.44482758045196535, + "step": 12960 + }, + { + "epoch": 0.01305848569912, + "grad_norm": 30.346269622031645, + "learning_rate": 1.3058235803637974e-05, + "loss": 2.3381, + "mean_token_accuracy": 0.42758620381355283, + "step": 12965 + }, + { + "epoch": 0.013063521752224173, + "grad_norm": 28.22640394302225, + "learning_rate": 1.3063271760369037e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.41379310190677643, + "step": 12970 + }, + { + "epoch": 0.013068557805328345, + "grad_norm": 26.82382405332868, + "learning_rate": 1.3068307717100095e-05, + "loss": 2.5297, + "mean_token_accuracy": 0.4517241418361664, + "step": 12975 + }, + { + "epoch": 0.01307359385843252, + "grad_norm": 40.97766546169816, + "learning_rate": 1.3073343673831154e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.36896551847457887, + "step": 12980 + }, + { + "epoch": 0.013078629911536691, + "grad_norm": 28.42873371062202, + "learning_rate": 1.3078379630562215e-05, + "loss": 2.2434, + "mean_token_accuracy": 0.4758620738983154, + "step": 12985 + }, + { + "epoch": 0.013083665964640863, + "grad_norm": 23.98547544977723, + "learning_rate": 1.3083415587293274e-05, + "loss": 2.1944, + "mean_token_accuracy": 0.45517241954803467, + "step": 12990 + }, + { + "epoch": 0.013088702017745037, + "grad_norm": 29.32057895859403, + "learning_rate": 1.3088451544024335e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.3793103456497192, + "step": 12995 + }, + { + "epoch": 0.013093738070849209, + "grad_norm": 35.90334327283973, + "learning_rate": 1.3093487500755394e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.47931033968925474, + "step": 13000 + }, + { + "epoch": 0.013098774123953383, + "grad_norm": 28.007809165784924, + "learning_rate": 1.3098523457486453e-05, + "loss": 2.8044, + "mean_token_accuracy": 0.3758620619773865, + "step": 13005 + }, + { + "epoch": 0.013103810177057555, + "grad_norm": 28.882632142862747, + "learning_rate": 1.3103559414217514e-05, + "loss": 2.2319, + "mean_token_accuracy": 0.47586206197738645, + "step": 13010 + }, + { + "epoch": 0.013108846230161729, + "grad_norm": 39.222945278278296, + "learning_rate": 1.3108595370948574e-05, + "loss": 2.6117, + "mean_token_accuracy": 0.4159104645252228, + "step": 13015 + }, + { + "epoch": 0.0131138822832659, + "grad_norm": 28.066340136477823, + "learning_rate": 1.3113631327679631e-05, + "loss": 2.5679, + "mean_token_accuracy": 0.4344827651977539, + "step": 13020 + }, + { + "epoch": 0.013118918336370073, + "grad_norm": 31.665754869067026, + "learning_rate": 1.3118667284410694e-05, + "loss": 2.0537, + "mean_token_accuracy": 0.4620689690113068, + "step": 13025 + }, + { + "epoch": 0.013123954389474246, + "grad_norm": 37.78623901562369, + "learning_rate": 1.3123703241141751e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.38620689511299133, + "step": 13030 + }, + { + "epoch": 0.013128990442578418, + "grad_norm": 30.237792331384547, + "learning_rate": 1.3128739197872814e-05, + "loss": 2.6079, + "mean_token_accuracy": 0.403448274731636, + "step": 13035 + }, + { + "epoch": 0.013134026495682592, + "grad_norm": 27.212776761823513, + "learning_rate": 1.3133775154603872e-05, + "loss": 2.4359, + "mean_token_accuracy": 0.41034482717514037, + "step": 13040 + }, + { + "epoch": 0.013139062548786764, + "grad_norm": 28.091725473534964, + "learning_rate": 1.3138811111334931e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.40344826579093934, + "step": 13045 + }, + { + "epoch": 0.013144098601890938, + "grad_norm": 25.78388333567489, + "learning_rate": 1.3143847068065992e-05, + "loss": 2.622, + "mean_token_accuracy": 0.38965516686439516, + "step": 13050 + }, + { + "epoch": 0.01314913465499511, + "grad_norm": 27.013001814895404, + "learning_rate": 1.3148883024797051e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.417241370677948, + "step": 13055 + }, + { + "epoch": 0.013154170708099282, + "grad_norm": 24.611167133186306, + "learning_rate": 1.315391898152811e-05, + "loss": 2.2566, + "mean_token_accuracy": 0.458620685338974, + "step": 13060 + }, + { + "epoch": 0.013159206761203456, + "grad_norm": 21.589827906902283, + "learning_rate": 1.3158954938259171e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.4206896543502808, + "step": 13065 + }, + { + "epoch": 0.013164242814307628, + "grad_norm": 27.028339947340495, + "learning_rate": 1.316399089499023e-05, + "loss": 2.491, + "mean_token_accuracy": 0.4482758641242981, + "step": 13070 + }, + { + "epoch": 0.013169278867411802, + "grad_norm": 30.11790167101707, + "learning_rate": 1.3169026851721291e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.4517241418361664, + "step": 13075 + }, + { + "epoch": 0.013174314920515974, + "grad_norm": 36.839886245442564, + "learning_rate": 1.317406280845235e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.46551724076271056, + "step": 13080 + }, + { + "epoch": 0.013179350973620147, + "grad_norm": 27.402049215157966, + "learning_rate": 1.317909876518341e-05, + "loss": 2.0824, + "mean_token_accuracy": 0.4588626742362976, + "step": 13085 + }, + { + "epoch": 0.01318438702672432, + "grad_norm": 32.58936476564454, + "learning_rate": 1.3184134721914471e-05, + "loss": 2.3953, + "mean_token_accuracy": 0.4068965554237366, + "step": 13090 + }, + { + "epoch": 0.013189423079828492, + "grad_norm": 37.75702389372153, + "learning_rate": 1.318917067864553e-05, + "loss": 2.3736, + "mean_token_accuracy": 0.42413793206214906, + "step": 13095 + }, + { + "epoch": 0.013194459132932665, + "grad_norm": 29.830997361558296, + "learning_rate": 1.3194206635376591e-05, + "loss": 2.0218, + "mean_token_accuracy": 0.5251231491565704, + "step": 13100 + }, + { + "epoch": 0.013199495186036837, + "grad_norm": 26.926482307655053, + "learning_rate": 1.319924259210765e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.42758620977401735, + "step": 13105 + }, + { + "epoch": 0.013204531239141011, + "grad_norm": 24.51714622167604, + "learning_rate": 1.3204278548838708e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.3793103456497192, + "step": 13110 + }, + { + "epoch": 0.013209567292245183, + "grad_norm": 24.711564779492285, + "learning_rate": 1.320931450556977e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.4448275864124298, + "step": 13115 + }, + { + "epoch": 0.013214603345349357, + "grad_norm": 31.842543773740047, + "learning_rate": 1.3214350462300828e-05, + "loss": 2.4287, + "mean_token_accuracy": 0.3493647873401642, + "step": 13120 + }, + { + "epoch": 0.013219639398453529, + "grad_norm": 29.806671252443607, + "learning_rate": 1.3219386419031887e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.4620689690113068, + "step": 13125 + }, + { + "epoch": 0.013224675451557701, + "grad_norm": 26.74280108147092, + "learning_rate": 1.3224422375762948e-05, + "loss": 2.2388, + "mean_token_accuracy": 0.4517241358757019, + "step": 13130 + }, + { + "epoch": 0.013229711504661875, + "grad_norm": 26.70603868985356, + "learning_rate": 1.3229458332494008e-05, + "loss": 2.4793, + "mean_token_accuracy": 0.3931034505367279, + "step": 13135 + }, + { + "epoch": 0.013234747557766047, + "grad_norm": 28.695697798680055, + "learning_rate": 1.3234494289225069e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.4551724076271057, + "step": 13140 + }, + { + "epoch": 0.01323978361087022, + "grad_norm": 29.635497332223242, + "learning_rate": 1.3239530245956128e-05, + "loss": 2.7615, + "mean_token_accuracy": 0.3983061194419861, + "step": 13145 + }, + { + "epoch": 0.013244819663974393, + "grad_norm": 28.279503536811465, + "learning_rate": 1.3244566202687187e-05, + "loss": 2.3152, + "mean_token_accuracy": 0.4640653431415558, + "step": 13150 + }, + { + "epoch": 0.013249855717078566, + "grad_norm": 30.117343825351398, + "learning_rate": 1.3249602159418248e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.4068965554237366, + "step": 13155 + }, + { + "epoch": 0.013254891770182738, + "grad_norm": 25.44118815330071, + "learning_rate": 1.3254638116149307e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.4172413766384125, + "step": 13160 + }, + { + "epoch": 0.01325992782328691, + "grad_norm": 26.712037637891942, + "learning_rate": 1.3259674072880365e-05, + "loss": 2.3713, + "mean_token_accuracy": 0.4275861978530884, + "step": 13165 + }, + { + "epoch": 0.013264963876391084, + "grad_norm": 37.18679979972518, + "learning_rate": 1.3264710029611427e-05, + "loss": 2.3854, + "mean_token_accuracy": 0.4517241418361664, + "step": 13170 + }, + { + "epoch": 0.013269999929495256, + "grad_norm": 36.61800566398733, + "learning_rate": 1.3269745986342485e-05, + "loss": 2.5815, + "mean_token_accuracy": 0.42413792610168455, + "step": 13175 + }, + { + "epoch": 0.01327503598259943, + "grad_norm": 30.344365867524466, + "learning_rate": 1.3274781943073548e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.42413792610168455, + "step": 13180 + }, + { + "epoch": 0.013280072035703602, + "grad_norm": 35.59478771723536, + "learning_rate": 1.3279817899804605e-05, + "loss": 2.9194, + "mean_token_accuracy": 0.3586206823587418, + "step": 13185 + }, + { + "epoch": 0.013285108088807776, + "grad_norm": 26.7564241636928, + "learning_rate": 1.3284853856535664e-05, + "loss": 2.6201, + "mean_token_accuracy": 0.3931034505367279, + "step": 13190 + }, + { + "epoch": 0.013290144141911948, + "grad_norm": 27.467643933222462, + "learning_rate": 1.3289889813266725e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.3999999940395355, + "step": 13195 + }, + { + "epoch": 0.01329518019501612, + "grad_norm": 29.147119182875322, + "learning_rate": 1.3294925769997785e-05, + "loss": 2.0661, + "mean_token_accuracy": 0.5000000059604645, + "step": 13200 + }, + { + "epoch": 0.013300216248120294, + "grad_norm": 25.532278933057587, + "learning_rate": 1.3299961726728844e-05, + "loss": 2.3221, + "mean_token_accuracy": 0.42758620381355283, + "step": 13205 + }, + { + "epoch": 0.013305252301224466, + "grad_norm": 32.99655712921853, + "learning_rate": 1.3304997683459905e-05, + "loss": 2.3824, + "mean_token_accuracy": 0.41548699140548706, + "step": 13210 + }, + { + "epoch": 0.01331028835432864, + "grad_norm": 25.898519198369804, + "learning_rate": 1.3310033640190964e-05, + "loss": 2.491, + "mean_token_accuracy": 0.4103448212146759, + "step": 13215 + }, + { + "epoch": 0.013315324407432811, + "grad_norm": 29.607799723185142, + "learning_rate": 1.3315069596922025e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.42068966031074523, + "step": 13220 + }, + { + "epoch": 0.013320360460536985, + "grad_norm": 26.91335893225431, + "learning_rate": 1.3320105553653084e-05, + "loss": 2.4274, + "mean_token_accuracy": 0.42758620381355283, + "step": 13225 + }, + { + "epoch": 0.013325396513641157, + "grad_norm": 37.029474253182556, + "learning_rate": 1.3325141510384142e-05, + "loss": 2.4071, + "mean_token_accuracy": 0.43103448748588563, + "step": 13230 + }, + { + "epoch": 0.01333043256674533, + "grad_norm": 38.49354046059756, + "learning_rate": 1.3330177467115205e-05, + "loss": 2.1807, + "mean_token_accuracy": 0.4517241299152374, + "step": 13235 + }, + { + "epoch": 0.013335468619849503, + "grad_norm": 21.079691866672174, + "learning_rate": 1.3335213423846262e-05, + "loss": 2.0981, + "mean_token_accuracy": 0.5052026569843292, + "step": 13240 + }, + { + "epoch": 0.013340504672953675, + "grad_norm": 31.24127807906519, + "learning_rate": 1.3340249380577321e-05, + "loss": 2.3168, + "mean_token_accuracy": 0.42413792610168455, + "step": 13245 + }, + { + "epoch": 0.013345540726057849, + "grad_norm": 31.35469483163652, + "learning_rate": 1.3345285337308382e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.4206896543502808, + "step": 13250 + }, + { + "epoch": 0.01335057677916202, + "grad_norm": 29.570816697010443, + "learning_rate": 1.3350321294039442e-05, + "loss": 2.445, + "mean_token_accuracy": 0.42068964838981626, + "step": 13255 + }, + { + "epoch": 0.013355612832266195, + "grad_norm": 23.95500614024331, + "learning_rate": 1.3355357250770502e-05, + "loss": 2.3022, + "mean_token_accuracy": 0.458620685338974, + "step": 13260 + }, + { + "epoch": 0.013360648885370367, + "grad_norm": 21.812508768874007, + "learning_rate": 1.3360393207501562e-05, + "loss": 2.1632, + "mean_token_accuracy": 0.4620689690113068, + "step": 13265 + }, + { + "epoch": 0.013365684938474539, + "grad_norm": 31.20300093111897, + "learning_rate": 1.3365429164232621e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.42758620381355283, + "step": 13270 + }, + { + "epoch": 0.013370720991578712, + "grad_norm": 33.92234758081913, + "learning_rate": 1.3370465120963682e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.3862069010734558, + "step": 13275 + }, + { + "epoch": 0.013375757044682884, + "grad_norm": 37.19318737941734, + "learning_rate": 1.3375501077694741e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.3931034475564957, + "step": 13280 + }, + { + "epoch": 0.013380793097787058, + "grad_norm": 26.389424915461618, + "learning_rate": 1.33805370344258e-05, + "loss": 2.0169, + "mean_token_accuracy": 0.49100985527038576, + "step": 13285 + }, + { + "epoch": 0.01338582915089123, + "grad_norm": 25.892190160599146, + "learning_rate": 1.3385572991156861e-05, + "loss": 2.1506, + "mean_token_accuracy": 0.45862069725990295, + "step": 13290 + }, + { + "epoch": 0.013390865203995402, + "grad_norm": 26.778048777594854, + "learning_rate": 1.339060894788792e-05, + "loss": 2.0451, + "mean_token_accuracy": 0.4413793087005615, + "step": 13295 + }, + { + "epoch": 0.013395901257099576, + "grad_norm": 25.86233147889207, + "learning_rate": 1.3395644904618982e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.4310344815254211, + "step": 13300 + }, + { + "epoch": 0.013400937310203748, + "grad_norm": 31.839048903275426, + "learning_rate": 1.340068086135004e-05, + "loss": 1.9876, + "mean_token_accuracy": 0.4863279044628143, + "step": 13305 + }, + { + "epoch": 0.013405973363307922, + "grad_norm": 36.77715308947395, + "learning_rate": 1.3405716818081098e-05, + "loss": 2.0154, + "mean_token_accuracy": 0.5021173536777497, + "step": 13310 + }, + { + "epoch": 0.013411009416412094, + "grad_norm": 29.051248858598928, + "learning_rate": 1.3410752774812161e-05, + "loss": 2.2748, + "mean_token_accuracy": 0.4931034445762634, + "step": 13315 + }, + { + "epoch": 0.013416045469516268, + "grad_norm": 23.82582602335328, + "learning_rate": 1.3415788731543219e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.4310344815254211, + "step": 13320 + }, + { + "epoch": 0.01342108152262044, + "grad_norm": 31.30941445438363, + "learning_rate": 1.3420824688274278e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.37241379022598264, + "step": 13325 + }, + { + "epoch": 0.013426117575724612, + "grad_norm": 33.24636965502477, + "learning_rate": 1.3425860645005339e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.41209921836853025, + "step": 13330 + }, + { + "epoch": 0.013431153628828785, + "grad_norm": 56.4758936308936, + "learning_rate": 1.3430896601736398e-05, + "loss": 2.4223, + "mean_token_accuracy": 0.44482758045196535, + "step": 13335 + }, + { + "epoch": 0.013436189681932957, + "grad_norm": 25.72549086319441, + "learning_rate": 1.3435932558467459e-05, + "loss": 2.61, + "mean_token_accuracy": 0.37586206793785093, + "step": 13340 + }, + { + "epoch": 0.013441225735037131, + "grad_norm": 42.08671318015093, + "learning_rate": 1.3440968515198518e-05, + "loss": 2.3626, + "mean_token_accuracy": 0.4068965494632721, + "step": 13345 + }, + { + "epoch": 0.013446261788141303, + "grad_norm": 43.64949811480978, + "learning_rate": 1.3446004471929577e-05, + "loss": 2.6187, + "mean_token_accuracy": 0.3896551787853241, + "step": 13350 + }, + { + "epoch": 0.013451297841245477, + "grad_norm": 21.56184740972458, + "learning_rate": 1.3451040428660638e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.4344827651977539, + "step": 13355 + }, + { + "epoch": 0.013456333894349649, + "grad_norm": 28.561819188138596, + "learning_rate": 1.3456076385391698e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.49655171036720275, + "step": 13360 + }, + { + "epoch": 0.013461369947453821, + "grad_norm": 24.101584654395488, + "learning_rate": 1.3461112342122755e-05, + "loss": 2.2919, + "mean_token_accuracy": 0.47586206793785096, + "step": 13365 + }, + { + "epoch": 0.013466406000557995, + "grad_norm": 35.34163390202499, + "learning_rate": 1.3466148298853818e-05, + "loss": 2.7604, + "mean_token_accuracy": 0.40344828367233276, + "step": 13370 + }, + { + "epoch": 0.013471442053662167, + "grad_norm": 26.35615471100323, + "learning_rate": 1.3471184255584875e-05, + "loss": 2.777, + "mean_token_accuracy": 0.37241379618644715, + "step": 13375 + }, + { + "epoch": 0.01347647810676634, + "grad_norm": 23.30053669527517, + "learning_rate": 1.3476220212315938e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.3827586203813553, + "step": 13380 + }, + { + "epoch": 0.013481514159870513, + "grad_norm": 32.3574464927237, + "learning_rate": 1.3481256169046996e-05, + "loss": 2.487, + "mean_token_accuracy": 0.42413792610168455, + "step": 13385 + }, + { + "epoch": 0.013486550212974686, + "grad_norm": 29.957176784804318, + "learning_rate": 1.3486292125778055e-05, + "loss": 2.2204, + "mean_token_accuracy": 0.45486992597579956, + "step": 13390 + }, + { + "epoch": 0.013491586266078858, + "grad_norm": 32.34722239570284, + "learning_rate": 1.3491328082509116e-05, + "loss": 2.5156, + "mean_token_accuracy": 0.4068965554237366, + "step": 13395 + }, + { + "epoch": 0.01349662231918303, + "grad_norm": 28.08430323260744, + "learning_rate": 1.3496364039240175e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.42758620381355283, + "step": 13400 + }, + { + "epoch": 0.013501658372287204, + "grad_norm": 32.73009089942878, + "learning_rate": 1.3501399995971234e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.4, + "step": 13405 + }, + { + "epoch": 0.013506694425391376, + "grad_norm": 46.64258261808528, + "learning_rate": 1.3506435952702295e-05, + "loss": 2.3428, + "mean_token_accuracy": 0.4620689630508423, + "step": 13410 + }, + { + "epoch": 0.01351173047849555, + "grad_norm": 28.61483244079587, + "learning_rate": 1.3511471909433355e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.447005432844162, + "step": 13415 + }, + { + "epoch": 0.013516766531599722, + "grad_norm": 40.331135864675595, + "learning_rate": 1.3516507866164415e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.3931034505367279, + "step": 13420 + }, + { + "epoch": 0.013521802584703896, + "grad_norm": 26.52834992994393, + "learning_rate": 1.3521543822895475e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4931034445762634, + "step": 13425 + }, + { + "epoch": 0.013526838637808068, + "grad_norm": 25.29292943626565, + "learning_rate": 1.3526579779626532e-05, + "loss": 1.9428, + "mean_token_accuracy": 0.501875388622284, + "step": 13430 + }, + { + "epoch": 0.01353187469091224, + "grad_norm": 21.22945332741738, + "learning_rate": 1.3531615736357595e-05, + "loss": 2.1188, + "mean_token_accuracy": 0.46896552443504336, + "step": 13435 + }, + { + "epoch": 0.013536910744016414, + "grad_norm": 27.491291043643972, + "learning_rate": 1.3536651693088653e-05, + "loss": 2.1109, + "mean_token_accuracy": 0.4758620738983154, + "step": 13440 + }, + { + "epoch": 0.013541946797120586, + "grad_norm": 34.482057498307704, + "learning_rate": 1.3541687649819712e-05, + "loss": 2.1006, + "mean_token_accuracy": 0.45517240166664125, + "step": 13445 + }, + { + "epoch": 0.01354698285022476, + "grad_norm": 31.40567347381732, + "learning_rate": 1.3546723606550773e-05, + "loss": 2.6787, + "mean_token_accuracy": 0.4137930989265442, + "step": 13450 + }, + { + "epoch": 0.013552018903328931, + "grad_norm": 24.84582439916837, + "learning_rate": 1.3551759563281832e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.4413793087005615, + "step": 13455 + }, + { + "epoch": 0.013557054956433105, + "grad_norm": 24.216317611258546, + "learning_rate": 1.3556795520012895e-05, + "loss": 2.3509, + "mean_token_accuracy": 0.4172413766384125, + "step": 13460 + }, + { + "epoch": 0.013562091009537277, + "grad_norm": 30.41956767424792, + "learning_rate": 1.3561831476743952e-05, + "loss": 2.3585, + "mean_token_accuracy": 0.4159104585647583, + "step": 13465 + }, + { + "epoch": 0.01356712706264145, + "grad_norm": 23.993589383061074, + "learning_rate": 1.3566867433475011e-05, + "loss": 2.1802, + "mean_token_accuracy": 0.46896551847457885, + "step": 13470 + }, + { + "epoch": 0.013572163115745623, + "grad_norm": 27.152218485375624, + "learning_rate": 1.3571903390206072e-05, + "loss": 2.1909, + "mean_token_accuracy": 0.4448275864124298, + "step": 13475 + }, + { + "epoch": 0.013577199168849795, + "grad_norm": 27.541940208110763, + "learning_rate": 1.3576939346937132e-05, + "loss": 2.1498, + "mean_token_accuracy": 0.4586206912994385, + "step": 13480 + }, + { + "epoch": 0.013582235221953969, + "grad_norm": 31.59059505205469, + "learning_rate": 1.3581975303668191e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.4620689630508423, + "step": 13485 + }, + { + "epoch": 0.013587271275058141, + "grad_norm": 33.25011383514295, + "learning_rate": 1.3587011260399252e-05, + "loss": 2.462, + "mean_token_accuracy": 0.44827585816383364, + "step": 13490 + }, + { + "epoch": 0.013592307328162315, + "grad_norm": 30.642490137801477, + "learning_rate": 1.3592047217130311e-05, + "loss": 2.4603, + "mean_token_accuracy": 0.441379314661026, + "step": 13495 + }, + { + "epoch": 0.013597343381266487, + "grad_norm": 26.23554612018405, + "learning_rate": 1.3597083173861372e-05, + "loss": 2.1111, + "mean_token_accuracy": 0.4517241358757019, + "step": 13500 + }, + { + "epoch": 0.013602379434370659, + "grad_norm": 30.53288435029085, + "learning_rate": 1.3602119130592431e-05, + "loss": 2.49, + "mean_token_accuracy": 0.41034482717514037, + "step": 13505 + }, + { + "epoch": 0.013607415487474833, + "grad_norm": 27.50941733450289, + "learning_rate": 1.3607155087323489e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.4620689570903778, + "step": 13510 + }, + { + "epoch": 0.013612451540579005, + "grad_norm": 27.830259864410383, + "learning_rate": 1.3612191044054551e-05, + "loss": 2.2092, + "mean_token_accuracy": 0.46551724076271056, + "step": 13515 + }, + { + "epoch": 0.013617487593683178, + "grad_norm": 23.233848429616888, + "learning_rate": 1.3617227000785609e-05, + "loss": 2.59, + "mean_token_accuracy": 0.43793103098869324, + "step": 13520 + }, + { + "epoch": 0.01362252364678735, + "grad_norm": 34.706070952462554, + "learning_rate": 1.3622262957516672e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.38620689511299133, + "step": 13525 + }, + { + "epoch": 0.013627559699891524, + "grad_norm": 32.16073625179282, + "learning_rate": 1.362729891424773e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.43103448748588563, + "step": 13530 + }, + { + "epoch": 0.013632595752995696, + "grad_norm": 28.321563610821467, + "learning_rate": 1.3632334870978788e-05, + "loss": 2.5101, + "mean_token_accuracy": 0.4379310250282288, + "step": 13535 + }, + { + "epoch": 0.013637631806099868, + "grad_norm": 38.784157737560314, + "learning_rate": 1.363737082770985e-05, + "loss": 2.4589, + "mean_token_accuracy": 0.4379310250282288, + "step": 13540 + }, + { + "epoch": 0.013642667859204042, + "grad_norm": 27.71392350651912, + "learning_rate": 1.3642406784440909e-05, + "loss": 2.4885, + "mean_token_accuracy": 0.4034482717514038, + "step": 13545 + }, + { + "epoch": 0.013647703912308214, + "grad_norm": 23.197070909643145, + "learning_rate": 1.3647442741171968e-05, + "loss": 2.4825, + "mean_token_accuracy": 0.4137930989265442, + "step": 13550 + }, + { + "epoch": 0.013652739965412388, + "grad_norm": 38.68560041527579, + "learning_rate": 1.3652478697903029e-05, + "loss": 2.178, + "mean_token_accuracy": 0.4689655125141144, + "step": 13555 + }, + { + "epoch": 0.01365777601851656, + "grad_norm": 30.43931275360943, + "learning_rate": 1.3657514654634088e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.441379314661026, + "step": 13560 + }, + { + "epoch": 0.013662812071620734, + "grad_norm": 30.67907474851602, + "learning_rate": 1.3662550611365149e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.3965517163276672, + "step": 13565 + }, + { + "epoch": 0.013667848124724906, + "grad_norm": 27.280492970470206, + "learning_rate": 1.3667586568096208e-05, + "loss": 2.1042, + "mean_token_accuracy": 0.4620689690113068, + "step": 13570 + }, + { + "epoch": 0.013672884177829078, + "grad_norm": 30.050951280900833, + "learning_rate": 1.3672622524827266e-05, + "loss": 2.2671, + "mean_token_accuracy": 0.4016333997249603, + "step": 13575 + }, + { + "epoch": 0.013677920230933251, + "grad_norm": 37.713115178803086, + "learning_rate": 1.3677658481558329e-05, + "loss": 2.5251, + "mean_token_accuracy": 0.4, + "step": 13580 + }, + { + "epoch": 0.013682956284037423, + "grad_norm": 33.27826098970429, + "learning_rate": 1.3682694438289386e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.4758620738983154, + "step": 13585 + }, + { + "epoch": 0.013687992337141597, + "grad_norm": 27.662748828303325, + "learning_rate": 1.3687730395020445e-05, + "loss": 2.3947, + "mean_token_accuracy": 0.4517241418361664, + "step": 13590 + }, + { + "epoch": 0.01369302839024577, + "grad_norm": 36.71722543414404, + "learning_rate": 1.3692766351751506e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.4774349570274353, + "step": 13595 + }, + { + "epoch": 0.013698064443349943, + "grad_norm": 34.30343247257018, + "learning_rate": 1.3697802308482566e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.41034482717514037, + "step": 13600 + }, + { + "epoch": 0.013703100496454115, + "grad_norm": 25.973692331354307, + "learning_rate": 1.3702838265213626e-05, + "loss": 2.3587, + "mean_token_accuracy": 0.4517241358757019, + "step": 13605 + }, + { + "epoch": 0.013708136549558287, + "grad_norm": 27.022399535616298, + "learning_rate": 1.3707874221944686e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.4206896543502808, + "step": 13610 + }, + { + "epoch": 0.01371317260266246, + "grad_norm": 26.085525309504455, + "learning_rate": 1.3712910178675745e-05, + "loss": 2.3602, + "mean_token_accuracy": 0.4068965554237366, + "step": 13615 + }, + { + "epoch": 0.013718208655766633, + "grad_norm": 35.99124576343834, + "learning_rate": 1.3717946135406806e-05, + "loss": 2.6556, + "mean_token_accuracy": 0.3793103516101837, + "step": 13620 + }, + { + "epoch": 0.013723244708870807, + "grad_norm": 30.092340639666475, + "learning_rate": 1.3722982092137865e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.37586206793785093, + "step": 13625 + }, + { + "epoch": 0.013728280761974979, + "grad_norm": 29.52150213665843, + "learning_rate": 1.3728018048868924e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.4896551787853241, + "step": 13630 + }, + { + "epoch": 0.013733316815079152, + "grad_norm": 29.549291110669905, + "learning_rate": 1.3733054005599985e-05, + "loss": 2.637, + "mean_token_accuracy": 0.40689654350280763, + "step": 13635 + }, + { + "epoch": 0.013738352868183324, + "grad_norm": 25.207613493411678, + "learning_rate": 1.3738089962331045e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.4724137902259827, + "step": 13640 + }, + { + "epoch": 0.013743388921287496, + "grad_norm": 28.393334540972603, + "learning_rate": 1.3743125919062106e-05, + "loss": 2.0455, + "mean_token_accuracy": 0.47241379618644713, + "step": 13645 + }, + { + "epoch": 0.01374842497439167, + "grad_norm": 27.632355446797295, + "learning_rate": 1.3748161875793165e-05, + "loss": 2.0234, + "mean_token_accuracy": 0.458620685338974, + "step": 13650 + }, + { + "epoch": 0.013753461027495842, + "grad_norm": 26.20244205085071, + "learning_rate": 1.3753197832524222e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.45172414779663084, + "step": 13655 + }, + { + "epoch": 0.013758497080600016, + "grad_norm": 63.60662309744961, + "learning_rate": 1.3758233789255285e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.4810042381286621, + "step": 13660 + }, + { + "epoch": 0.013763533133704188, + "grad_norm": 33.41388476499537, + "learning_rate": 1.3763269745986343e-05, + "loss": 2.1838, + "mean_token_accuracy": 0.4931034505367279, + "step": 13665 + }, + { + "epoch": 0.013768569186808362, + "grad_norm": 33.727315823247686, + "learning_rate": 1.3768305702717402e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.37586206793785093, + "step": 13670 + }, + { + "epoch": 0.013773605239912534, + "grad_norm": 24.28062834079656, + "learning_rate": 1.3773341659448463e-05, + "loss": 2.5132, + "mean_token_accuracy": 0.39310344457626345, + "step": 13675 + }, + { + "epoch": 0.013778641293016706, + "grad_norm": 30.0255864202641, + "learning_rate": 1.3778377616179522e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.4172413766384125, + "step": 13680 + }, + { + "epoch": 0.01378367734612088, + "grad_norm": 28.381195466955294, + "learning_rate": 1.3783413572910583e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.4431336998939514, + "step": 13685 + }, + { + "epoch": 0.013788713399225052, + "grad_norm": 24.081556394380694, + "learning_rate": 1.3788449529641642e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.4275862157344818, + "step": 13690 + }, + { + "epoch": 0.013793749452329225, + "grad_norm": 27.61645154489071, + "learning_rate": 1.3793485486372702e-05, + "loss": 2.6592, + "mean_token_accuracy": 0.36702964305877683, + "step": 13695 + }, + { + "epoch": 0.013798785505433397, + "grad_norm": 21.93220612182478, + "learning_rate": 1.3798521443103762e-05, + "loss": 2.2208, + "mean_token_accuracy": 0.441379314661026, + "step": 13700 + }, + { + "epoch": 0.013803821558537571, + "grad_norm": 23.421907472730563, + "learning_rate": 1.3803557399834822e-05, + "loss": 2.2828, + "mean_token_accuracy": 0.4379310429096222, + "step": 13705 + }, + { + "epoch": 0.013808857611641743, + "grad_norm": 25.106479254007787, + "learning_rate": 1.380859335656588e-05, + "loss": 2.4229, + "mean_token_accuracy": 0.4517241358757019, + "step": 13710 + }, + { + "epoch": 0.013813893664745915, + "grad_norm": 27.098208304017174, + "learning_rate": 1.3813629313296942e-05, + "loss": 2.0094, + "mean_token_accuracy": 0.4979431450366974, + "step": 13715 + }, + { + "epoch": 0.013818929717850089, + "grad_norm": 22.250983551870384, + "learning_rate": 1.3818665270028e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.49866908192634585, + "step": 13720 + }, + { + "epoch": 0.013823965770954261, + "grad_norm": 23.73972392007322, + "learning_rate": 1.3823701226759062e-05, + "loss": 2.4838, + "mean_token_accuracy": 0.3862068891525269, + "step": 13725 + }, + { + "epoch": 0.013829001824058435, + "grad_norm": 38.9274355090047, + "learning_rate": 1.382873718349012e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.3896551728248596, + "step": 13730 + }, + { + "epoch": 0.013834037877162607, + "grad_norm": 23.65292742875833, + "learning_rate": 1.3833773140221179e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.4517241358757019, + "step": 13735 + }, + { + "epoch": 0.01383907393026678, + "grad_norm": 22.436691589144164, + "learning_rate": 1.383880909695224e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.44482759237289426, + "step": 13740 + }, + { + "epoch": 0.013844109983370953, + "grad_norm": 31.56529295102431, + "learning_rate": 1.3843845053683299e-05, + "loss": 2.2573, + "mean_token_accuracy": 0.4620689690113068, + "step": 13745 + }, + { + "epoch": 0.013849146036475125, + "grad_norm": 35.93501081823169, + "learning_rate": 1.3848881010414358e-05, + "loss": 2.265, + "mean_token_accuracy": 0.41947974264621735, + "step": 13750 + }, + { + "epoch": 0.013854182089579298, + "grad_norm": 27.34234921325077, + "learning_rate": 1.385391696714542e-05, + "loss": 2.5391, + "mean_token_accuracy": 0.3896551728248596, + "step": 13755 + }, + { + "epoch": 0.01385921814268347, + "grad_norm": 32.05644762769521, + "learning_rate": 1.3858952923876479e-05, + "loss": 2.3079, + "mean_token_accuracy": 0.39310344457626345, + "step": 13760 + }, + { + "epoch": 0.013864254195787644, + "grad_norm": 25.697503249893554, + "learning_rate": 1.386398888060754e-05, + "loss": 2.6425, + "mean_token_accuracy": 0.41034482717514037, + "step": 13765 + }, + { + "epoch": 0.013869290248891816, + "grad_norm": 31.76197281271624, + "learning_rate": 1.3869024837338599e-05, + "loss": 2.456, + "mean_token_accuracy": 0.42413793206214906, + "step": 13770 + }, + { + "epoch": 0.01387432630199599, + "grad_norm": 35.308250611662054, + "learning_rate": 1.3874060794069656e-05, + "loss": 2.586, + "mean_token_accuracy": 0.3793103456497192, + "step": 13775 + }, + { + "epoch": 0.013879362355100162, + "grad_norm": 50.70823971099878, + "learning_rate": 1.3879096750800719e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.4379310369491577, + "step": 13780 + }, + { + "epoch": 0.013884398408204334, + "grad_norm": 25.038569938292934, + "learning_rate": 1.3884132707531777e-05, + "loss": 2.2659, + "mean_token_accuracy": 0.4816092014312744, + "step": 13785 + }, + { + "epoch": 0.013889434461308508, + "grad_norm": 36.3842874449946, + "learning_rate": 1.3889168664262836e-05, + "loss": 2.2339, + "mean_token_accuracy": 0.4862069010734558, + "step": 13790 + }, + { + "epoch": 0.01389447051441268, + "grad_norm": 25.89752160790185, + "learning_rate": 1.3894204620993897e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.44482758045196535, + "step": 13795 + }, + { + "epoch": 0.013899506567516854, + "grad_norm": 26.974745755744706, + "learning_rate": 1.3899240577724956e-05, + "loss": 2.276, + "mean_token_accuracy": 0.47931033968925474, + "step": 13800 + }, + { + "epoch": 0.013904542620621026, + "grad_norm": 48.42921170210844, + "learning_rate": 1.3904276534456017e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.4880822777748108, + "step": 13805 + }, + { + "epoch": 0.0139095786737252, + "grad_norm": 33.952280450743174, + "learning_rate": 1.3909312491187076e-05, + "loss": 2.1916, + "mean_token_accuracy": 0.4640048384666443, + "step": 13810 + }, + { + "epoch": 0.013914614726829371, + "grad_norm": 41.27832516759408, + "learning_rate": 1.3914348447918135e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.45517241954803467, + "step": 13815 + }, + { + "epoch": 0.013919650779933544, + "grad_norm": 25.632922284676905, + "learning_rate": 1.3919384404649196e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.39310343861579894, + "step": 13820 + }, + { + "epoch": 0.013924686833037717, + "grad_norm": 24.720269311162866, + "learning_rate": 1.3924420361380256e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.4448275864124298, + "step": 13825 + }, + { + "epoch": 0.01392972288614189, + "grad_norm": 50.640605524944306, + "learning_rate": 1.3929456318111315e-05, + "loss": 2.4225, + "mean_token_accuracy": 0.3965517282485962, + "step": 13830 + }, + { + "epoch": 0.013934758939246063, + "grad_norm": 27.139377775582044, + "learning_rate": 1.3934492274842376e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.4034482717514038, + "step": 13835 + }, + { + "epoch": 0.013939794992350235, + "grad_norm": 35.27235715048766, + "learning_rate": 1.3939528231573435e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.47586206197738645, + "step": 13840 + }, + { + "epoch": 0.013944831045454409, + "grad_norm": 26.039475966100603, + "learning_rate": 1.3944564188304496e-05, + "loss": 2.5792, + "mean_token_accuracy": 0.37241379618644715, + "step": 13845 + }, + { + "epoch": 0.013949867098558581, + "grad_norm": 26.66515189012988, + "learning_rate": 1.3949600145035555e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.46896551847457885, + "step": 13850 + }, + { + "epoch": 0.013954903151662753, + "grad_norm": 24.771804674537318, + "learning_rate": 1.3954636101766613e-05, + "loss": 2.1707, + "mean_token_accuracy": 0.47586206793785096, + "step": 13855 + }, + { + "epoch": 0.013959939204766927, + "grad_norm": 36.59570498183246, + "learning_rate": 1.3959672058497675e-05, + "loss": 2.0786, + "mean_token_accuracy": 0.5068965554237366, + "step": 13860 + }, + { + "epoch": 0.013964975257871099, + "grad_norm": 35.36675202574528, + "learning_rate": 1.3964708015228733e-05, + "loss": 2.7755, + "mean_token_accuracy": 0.37586206793785093, + "step": 13865 + }, + { + "epoch": 0.013970011310975273, + "grad_norm": 31.663274052818668, + "learning_rate": 1.3969743971959792e-05, + "loss": 2.4857, + "mean_token_accuracy": 0.4413793087005615, + "step": 13870 + }, + { + "epoch": 0.013975047364079445, + "grad_norm": 26.270126446720514, + "learning_rate": 1.3974779928690853e-05, + "loss": 2.1984, + "mean_token_accuracy": 0.49655172824859617, + "step": 13875 + }, + { + "epoch": 0.013980083417183618, + "grad_norm": 28.711550933625286, + "learning_rate": 1.3979815885421913e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.41034482717514037, + "step": 13880 + }, + { + "epoch": 0.01398511947028779, + "grad_norm": 30.314419676094158, + "learning_rate": 1.3984851842152973e-05, + "loss": 2.335, + "mean_token_accuracy": 0.4517241358757019, + "step": 13885 + }, + { + "epoch": 0.013990155523391962, + "grad_norm": 23.226241161342884, + "learning_rate": 1.3989887798884033e-05, + "loss": 2.3859, + "mean_token_accuracy": 0.46551724672317507, + "step": 13890 + }, + { + "epoch": 0.013995191576496136, + "grad_norm": 23.09558356944471, + "learning_rate": 1.3994923755615092e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.4517241418361664, + "step": 13895 + }, + { + "epoch": 0.014000227629600308, + "grad_norm": 101.74202625802418, + "learning_rate": 1.3999959712346153e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.4517241358757019, + "step": 13900 + }, + { + "epoch": 0.014005263682704482, + "grad_norm": 30.492760460386002, + "learning_rate": 1.4004995669077212e-05, + "loss": 2.7457, + "mean_token_accuracy": 0.3793103456497192, + "step": 13905 + }, + { + "epoch": 0.014010299735808654, + "grad_norm": 34.712605638850874, + "learning_rate": 1.4010031625808273e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.43448275327682495, + "step": 13910 + }, + { + "epoch": 0.014015335788912828, + "grad_norm": 27.227924619512613, + "learning_rate": 1.4015067582539332e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.42758620381355283, + "step": 13915 + }, + { + "epoch": 0.014020371842017, + "grad_norm": 29.891136199318833, + "learning_rate": 1.402010353927039e-05, + "loss": 2.4869, + "mean_token_accuracy": 0.4206896543502808, + "step": 13920 + }, + { + "epoch": 0.014025407895121172, + "grad_norm": 31.899386988634994, + "learning_rate": 1.4025139496001453e-05, + "loss": 2.7688, + "mean_token_accuracy": 0.3620689630508423, + "step": 13925 + }, + { + "epoch": 0.014030443948225346, + "grad_norm": 26.29877775224825, + "learning_rate": 1.403017545273251e-05, + "loss": 2.3842, + "mean_token_accuracy": 0.42413792610168455, + "step": 13930 + }, + { + "epoch": 0.014035480001329518, + "grad_norm": 39.76180304652059, + "learning_rate": 1.403521140946357e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.4172413766384125, + "step": 13935 + }, + { + "epoch": 0.014040516054433691, + "grad_norm": 22.592645527562233, + "learning_rate": 1.404024736619463e-05, + "loss": 2.2495, + "mean_token_accuracy": 0.4379310250282288, + "step": 13940 + }, + { + "epoch": 0.014045552107537863, + "grad_norm": 28.894902637027247, + "learning_rate": 1.404528332292569e-05, + "loss": 2.5621, + "mean_token_accuracy": 0.38275861740112305, + "step": 13945 + }, + { + "epoch": 0.014050588160642037, + "grad_norm": 25.78605113653275, + "learning_rate": 1.405031927965675e-05, + "loss": 2.48, + "mean_token_accuracy": 0.44137930274009707, + "step": 13950 + }, + { + "epoch": 0.01405562421374621, + "grad_norm": 30.14873450824353, + "learning_rate": 1.405535523638781e-05, + "loss": 2.4725, + "mean_token_accuracy": 0.4344827592372894, + "step": 13955 + }, + { + "epoch": 0.014060660266850381, + "grad_norm": 34.26628193605773, + "learning_rate": 1.4060391193118869e-05, + "loss": 2.2456, + "mean_token_accuracy": 0.44482758045196535, + "step": 13960 + }, + { + "epoch": 0.014065696319954555, + "grad_norm": 39.72013121838575, + "learning_rate": 1.406542714984993e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.41034482717514037, + "step": 13965 + }, + { + "epoch": 0.014070732373058727, + "grad_norm": 38.271817513991074, + "learning_rate": 1.407046310658099e-05, + "loss": 2.1637, + "mean_token_accuracy": 0.4776769518852234, + "step": 13970 + }, + { + "epoch": 0.0140757684261629, + "grad_norm": 32.873694259727834, + "learning_rate": 1.4075499063312047e-05, + "loss": 2.4976, + "mean_token_accuracy": 0.42238354682922363, + "step": 13975 + }, + { + "epoch": 0.014080804479267073, + "grad_norm": 29.826439758053755, + "learning_rate": 1.408053502004311e-05, + "loss": 2.642, + "mean_token_accuracy": 0.39310344457626345, + "step": 13980 + }, + { + "epoch": 0.014085840532371247, + "grad_norm": 40.26036309612394, + "learning_rate": 1.4085570976774167e-05, + "loss": 2.4766, + "mean_token_accuracy": 0.40344826579093934, + "step": 13985 + }, + { + "epoch": 0.014090876585475419, + "grad_norm": 27.751350139391278, + "learning_rate": 1.409060693350523e-05, + "loss": 2.7553, + "mean_token_accuracy": 0.35862069129943847, + "step": 13990 + }, + { + "epoch": 0.01409591263857959, + "grad_norm": 23.466008050323317, + "learning_rate": 1.4095642890236289e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4310344815254211, + "step": 13995 + }, + { + "epoch": 0.014100948691683764, + "grad_norm": 35.17414276987657, + "learning_rate": 1.4100678846967346e-05, + "loss": 2.0949, + "mean_token_accuracy": 0.4517241299152374, + "step": 14000 + }, + { + "epoch": 0.014105984744787936, + "grad_norm": 32.25859286777294, + "learning_rate": 1.4105714803698409e-05, + "loss": 2.835, + "mean_token_accuracy": 0.41379310488700866, + "step": 14005 + }, + { + "epoch": 0.01411102079789211, + "grad_norm": 25.007506879940863, + "learning_rate": 1.4110750760429467e-05, + "loss": 2.2616, + "mean_token_accuracy": 0.4517241418361664, + "step": 14010 + }, + { + "epoch": 0.014116056850996282, + "grad_norm": 68.50716236505473, + "learning_rate": 1.4115786717160526e-05, + "loss": 2.2437, + "mean_token_accuracy": 0.4551724135875702, + "step": 14015 + }, + { + "epoch": 0.014121092904100456, + "grad_norm": 36.66102476353269, + "learning_rate": 1.4120822673891587e-05, + "loss": 2.6624, + "mean_token_accuracy": 0.4000000059604645, + "step": 14020 + }, + { + "epoch": 0.014126128957204628, + "grad_norm": 24.2480174756622, + "learning_rate": 1.4125858630622646e-05, + "loss": 2.0828, + "mean_token_accuracy": 0.4814277052879333, + "step": 14025 + }, + { + "epoch": 0.0141311650103088, + "grad_norm": 30.75320405712023, + "learning_rate": 1.4130894587353707e-05, + "loss": 2.1088, + "mean_token_accuracy": 0.471203875541687, + "step": 14030 + }, + { + "epoch": 0.014136201063412974, + "grad_norm": 23.89200080830402, + "learning_rate": 1.4135930544084766e-05, + "loss": 2.2859, + "mean_token_accuracy": 0.4586206912994385, + "step": 14035 + }, + { + "epoch": 0.014141237116517146, + "grad_norm": 29.945100949469442, + "learning_rate": 1.4140966500815826e-05, + "loss": 2.4027, + "mean_token_accuracy": 0.42758620977401735, + "step": 14040 + }, + { + "epoch": 0.01414627316962132, + "grad_norm": 26.65745870542326, + "learning_rate": 1.4146002457546886e-05, + "loss": 2.6715, + "mean_token_accuracy": 0.38620689511299133, + "step": 14045 + }, + { + "epoch": 0.014151309222725492, + "grad_norm": 25.343701912182958, + "learning_rate": 1.4151038414277946e-05, + "loss": 2.0758, + "mean_token_accuracy": 0.4843920111656189, + "step": 14050 + }, + { + "epoch": 0.014156345275829665, + "grad_norm": 28.916872677606936, + "learning_rate": 1.4156074371009003e-05, + "loss": 2.225, + "mean_token_accuracy": 0.45172414779663084, + "step": 14055 + }, + { + "epoch": 0.014161381328933837, + "grad_norm": 22.556662395783103, + "learning_rate": 1.4161110327740066e-05, + "loss": 2.6137, + "mean_token_accuracy": 0.36896551847457887, + "step": 14060 + }, + { + "epoch": 0.01416641738203801, + "grad_norm": 27.375479137320863, + "learning_rate": 1.4166146284471123e-05, + "loss": 2.3445, + "mean_token_accuracy": 0.441379314661026, + "step": 14065 + }, + { + "epoch": 0.014171453435142183, + "grad_norm": 54.781488321326066, + "learning_rate": 1.4171182241202186e-05, + "loss": 2.7439, + "mean_token_accuracy": 0.37241379618644715, + "step": 14070 + }, + { + "epoch": 0.014176489488246355, + "grad_norm": 246.07215370401684, + "learning_rate": 1.4176218197933244e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.44137930274009707, + "step": 14075 + }, + { + "epoch": 0.014181525541350529, + "grad_norm": 27.776921535867178, + "learning_rate": 1.4181254154664303e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.4275861978530884, + "step": 14080 + }, + { + "epoch": 0.014186561594454701, + "grad_norm": 58.933875145998734, + "learning_rate": 1.4186290111395364e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.4137930929660797, + "step": 14085 + }, + { + "epoch": 0.014191597647558875, + "grad_norm": 35.05198756979633, + "learning_rate": 1.4191326068126423e-05, + "loss": 2.4113, + "mean_token_accuracy": 0.3965517282485962, + "step": 14090 + }, + { + "epoch": 0.014196633700663047, + "grad_norm": 26.46254026181598, + "learning_rate": 1.4196362024857482e-05, + "loss": 2.346, + "mean_token_accuracy": 0.42413792610168455, + "step": 14095 + }, + { + "epoch": 0.014201669753767219, + "grad_norm": 31.1211841860338, + "learning_rate": 1.4201397981588543e-05, + "loss": 2.6274, + "mean_token_accuracy": 0.37586206793785093, + "step": 14100 + }, + { + "epoch": 0.014206705806871393, + "grad_norm": 29.253925889450294, + "learning_rate": 1.4206433938319603e-05, + "loss": 2.2547, + "mean_token_accuracy": 0.4896551787853241, + "step": 14105 + }, + { + "epoch": 0.014211741859975565, + "grad_norm": 33.50981125265915, + "learning_rate": 1.4211469895050664e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.4448275864124298, + "step": 14110 + }, + { + "epoch": 0.014216777913079738, + "grad_norm": 27.774604248723605, + "learning_rate": 1.4216505851781723e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.36551723778247835, + "step": 14115 + }, + { + "epoch": 0.01422181396618391, + "grad_norm": 33.25420172122438, + "learning_rate": 1.422154180851278e-05, + "loss": 2.4876, + "mean_token_accuracy": 0.4, + "step": 14120 + }, + { + "epoch": 0.014226850019288084, + "grad_norm": 31.532518030740587, + "learning_rate": 1.4226577765243843e-05, + "loss": 2.6674, + "mean_token_accuracy": 0.3862069010734558, + "step": 14125 + }, + { + "epoch": 0.014231886072392256, + "grad_norm": 33.5155146342872, + "learning_rate": 1.42316137219749e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.4747126460075378, + "step": 14130 + }, + { + "epoch": 0.014236922125496428, + "grad_norm": 28.715544392339666, + "learning_rate": 1.423664967870596e-05, + "loss": 2.233, + "mean_token_accuracy": 0.47931034564971925, + "step": 14135 + }, + { + "epoch": 0.014241958178600602, + "grad_norm": 30.460340240910913, + "learning_rate": 1.424168563543702e-05, + "loss": 2.6734, + "mean_token_accuracy": 0.4000000059604645, + "step": 14140 + }, + { + "epoch": 0.014246994231704774, + "grad_norm": 27.101519865271197, + "learning_rate": 1.424672159216808e-05, + "loss": 2.3561, + "mean_token_accuracy": 0.4068965494632721, + "step": 14145 + }, + { + "epoch": 0.014252030284808948, + "grad_norm": 27.227915795552246, + "learning_rate": 1.4251757548899141e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.358620685338974, + "step": 14150 + }, + { + "epoch": 0.01425706633791312, + "grad_norm": 31.3813151807137, + "learning_rate": 1.42567935056302e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.38620689511299133, + "step": 14155 + }, + { + "epoch": 0.014262102391017294, + "grad_norm": 25.214489497455098, + "learning_rate": 1.426182946236126e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.4137930989265442, + "step": 14160 + }, + { + "epoch": 0.014267138444121466, + "grad_norm": 24.12913450499344, + "learning_rate": 1.426686541909232e-05, + "loss": 2.227, + "mean_token_accuracy": 0.4620689630508423, + "step": 14165 + }, + { + "epoch": 0.014272174497225638, + "grad_norm": 31.44419808272467, + "learning_rate": 1.427190137582338e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.43448275327682495, + "step": 14170 + }, + { + "epoch": 0.014277210550329811, + "grad_norm": 28.22215443729513, + "learning_rate": 1.4276937332554439e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.4517241299152374, + "step": 14175 + }, + { + "epoch": 0.014282246603433984, + "grad_norm": 26.528236688115253, + "learning_rate": 1.42819732892855e-05, + "loss": 2.2097, + "mean_token_accuracy": 0.38620689511299133, + "step": 14180 + }, + { + "epoch": 0.014287282656538157, + "grad_norm": 35.1016338446267, + "learning_rate": 1.4287009246016559e-05, + "loss": 2.5796, + "mean_token_accuracy": 0.42758620381355283, + "step": 14185 + }, + { + "epoch": 0.01429231870964233, + "grad_norm": 27.484766831262114, + "learning_rate": 1.429204520274762e-05, + "loss": 2.7045, + "mean_token_accuracy": 0.39655172228813174, + "step": 14190 + }, + { + "epoch": 0.014297354762746503, + "grad_norm": 32.40012238366505, + "learning_rate": 1.429708115947868e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.42068966031074523, + "step": 14195 + }, + { + "epoch": 0.014302390815850675, + "grad_norm": 35.6998407352267, + "learning_rate": 1.4302117116209737e-05, + "loss": 2.6028, + "mean_token_accuracy": 0.4117362380027771, + "step": 14200 + }, + { + "epoch": 0.014307426868954847, + "grad_norm": 22.914058568561767, + "learning_rate": 1.43071530729408e-05, + "loss": 2.4783, + "mean_token_accuracy": 0.38620689511299133, + "step": 14205 + }, + { + "epoch": 0.014312462922059021, + "grad_norm": 32.16962207546506, + "learning_rate": 1.4312189029671857e-05, + "loss": 2.0697, + "mean_token_accuracy": 0.5021778583526612, + "step": 14210 + }, + { + "epoch": 0.014317498975163193, + "grad_norm": 24.869260572392864, + "learning_rate": 1.4317224986402916e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.4758620738983154, + "step": 14215 + }, + { + "epoch": 0.014322535028267367, + "grad_norm": 22.23016165598873, + "learning_rate": 1.4322260943133977e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.4206896543502808, + "step": 14220 + }, + { + "epoch": 0.014327571081371539, + "grad_norm": 26.071865742208946, + "learning_rate": 1.4327296899865037e-05, + "loss": 1.9892, + "mean_token_accuracy": 0.47586206793785096, + "step": 14225 + }, + { + "epoch": 0.01433260713447571, + "grad_norm": 33.19550897813042, + "learning_rate": 1.4332332856596097e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.4172413766384125, + "step": 14230 + }, + { + "epoch": 0.014337643187579885, + "grad_norm": 21.19220314119713, + "learning_rate": 1.4337368813327157e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.4448275864124298, + "step": 14235 + }, + { + "epoch": 0.014342679240684057, + "grad_norm": 23.877300425917774, + "learning_rate": 1.4342404770058216e-05, + "loss": 1.8484, + "mean_token_accuracy": 0.509513008594513, + "step": 14240 + }, + { + "epoch": 0.01434771529378823, + "grad_norm": 22.32873953528322, + "learning_rate": 1.4347440726789277e-05, + "loss": 2.9369, + "mean_token_accuracy": 0.3620689660310745, + "step": 14245 + }, + { + "epoch": 0.014352751346892402, + "grad_norm": 27.65259544978894, + "learning_rate": 1.4352476683520336e-05, + "loss": 2.5462, + "mean_token_accuracy": 0.382758629322052, + "step": 14250 + }, + { + "epoch": 0.014357787399996576, + "grad_norm": 22.313122434855202, + "learning_rate": 1.4357512640251394e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.3965517163276672, + "step": 14255 + }, + { + "epoch": 0.014362823453100748, + "grad_norm": 29.83000271412299, + "learning_rate": 1.4362548596982456e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.4068965494632721, + "step": 14260 + }, + { + "epoch": 0.01436785950620492, + "grad_norm": 35.86189348186604, + "learning_rate": 1.4367584553713514e-05, + "loss": 2.1064, + "mean_token_accuracy": 0.4862069010734558, + "step": 14265 + }, + { + "epoch": 0.014372895559309094, + "grad_norm": 41.13423038184262, + "learning_rate": 1.4372620510444577e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.39655172228813174, + "step": 14270 + }, + { + "epoch": 0.014377931612413266, + "grad_norm": 33.188887788989035, + "learning_rate": 1.4377656467175634e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.39655172228813174, + "step": 14275 + }, + { + "epoch": 0.01438296766551744, + "grad_norm": 25.35912386879697, + "learning_rate": 1.4382692423906693e-05, + "loss": 2.146, + "mean_token_accuracy": 0.4586206912994385, + "step": 14280 + }, + { + "epoch": 0.014388003718621612, + "grad_norm": 22.906730867776258, + "learning_rate": 1.4387728380637754e-05, + "loss": 2.6516, + "mean_token_accuracy": 0.39310344457626345, + "step": 14285 + }, + { + "epoch": 0.014393039771725786, + "grad_norm": 26.760427296554425, + "learning_rate": 1.4392764337368814e-05, + "loss": 2.6042, + "mean_token_accuracy": 0.41379310488700866, + "step": 14290 + }, + { + "epoch": 0.014398075824829958, + "grad_norm": 36.08217689005033, + "learning_rate": 1.4397800294099873e-05, + "loss": 2.5417, + "mean_token_accuracy": 0.4153055131435394, + "step": 14295 + }, + { + "epoch": 0.01440311187793413, + "grad_norm": 27.360802480662073, + "learning_rate": 1.4402836250830934e-05, + "loss": 2.4354, + "mean_token_accuracy": 0.3793103456497192, + "step": 14300 + }, + { + "epoch": 0.014408147931038303, + "grad_norm": 20.472226319165895, + "learning_rate": 1.4407872207561993e-05, + "loss": 2.636, + "mean_token_accuracy": 0.42758620381355283, + "step": 14305 + }, + { + "epoch": 0.014413183984142475, + "grad_norm": 29.23899429352142, + "learning_rate": 1.4412908164293054e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4413793087005615, + "step": 14310 + }, + { + "epoch": 0.01441822003724665, + "grad_norm": 24.67105306384765, + "learning_rate": 1.4417944121024113e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.44301270246505736, + "step": 14315 + }, + { + "epoch": 0.014423256090350821, + "grad_norm": 34.37186566837699, + "learning_rate": 1.442298007775517e-05, + "loss": 2.6264, + "mean_token_accuracy": 0.3620689630508423, + "step": 14320 + }, + { + "epoch": 0.014428292143454995, + "grad_norm": 25.17078717340894, + "learning_rate": 1.4428016034486233e-05, + "loss": 2.1531, + "mean_token_accuracy": 0.4931034505367279, + "step": 14325 + }, + { + "epoch": 0.014433328196559167, + "grad_norm": 24.18417796685301, + "learning_rate": 1.4433051991217291e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.4068965494632721, + "step": 14330 + }, + { + "epoch": 0.014438364249663339, + "grad_norm": 24.29239509295025, + "learning_rate": 1.4438087947948354e-05, + "loss": 2.4686, + "mean_token_accuracy": 0.4103448331356049, + "step": 14335 + }, + { + "epoch": 0.014443400302767513, + "grad_norm": 35.50455526908005, + "learning_rate": 1.4443123904679411e-05, + "loss": 2.6254, + "mean_token_accuracy": 0.3827586233615875, + "step": 14340 + }, + { + "epoch": 0.014448436355871685, + "grad_norm": 28.65470367774043, + "learning_rate": 1.444815986141047e-05, + "loss": 2.2081, + "mean_token_accuracy": 0.4620689630508423, + "step": 14345 + }, + { + "epoch": 0.014453472408975859, + "grad_norm": 30.071500613430047, + "learning_rate": 1.4453195818141531e-05, + "loss": 2.5225, + "mean_token_accuracy": 0.4137930989265442, + "step": 14350 + }, + { + "epoch": 0.01445850846208003, + "grad_norm": 28.25659719911171, + "learning_rate": 1.445823177487259e-05, + "loss": 2.2636, + "mean_token_accuracy": 0.41379310488700866, + "step": 14355 + }, + { + "epoch": 0.014463544515184204, + "grad_norm": 27.62098847917744, + "learning_rate": 1.446326773160365e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.41379311084747317, + "step": 14360 + }, + { + "epoch": 0.014468580568288376, + "grad_norm": 29.7177333227106, + "learning_rate": 1.4468303688334711e-05, + "loss": 2.497, + "mean_token_accuracy": 0.39655172228813174, + "step": 14365 + }, + { + "epoch": 0.014473616621392548, + "grad_norm": 23.06481319136907, + "learning_rate": 1.447333964506577e-05, + "loss": 2.5153, + "mean_token_accuracy": 0.41034482717514037, + "step": 14370 + }, + { + "epoch": 0.014478652674496722, + "grad_norm": 40.948306461440474, + "learning_rate": 1.4478375601796831e-05, + "loss": 2.3496, + "mean_token_accuracy": 0.42758620381355283, + "step": 14375 + }, + { + "epoch": 0.014483688727600894, + "grad_norm": 26.10986851990998, + "learning_rate": 1.448341155852789e-05, + "loss": 2.129, + "mean_token_accuracy": 0.47241379618644713, + "step": 14380 + }, + { + "epoch": 0.014488724780705068, + "grad_norm": 26.34141056928323, + "learning_rate": 1.448844751525895e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.441379314661026, + "step": 14385 + }, + { + "epoch": 0.01449376083380924, + "grad_norm": 34.06788425758033, + "learning_rate": 1.449348347199001e-05, + "loss": 2.6892, + "mean_token_accuracy": 0.4068965494632721, + "step": 14390 + }, + { + "epoch": 0.014498796886913414, + "grad_norm": 25.74411314356086, + "learning_rate": 1.449851942872107e-05, + "loss": 2.1606, + "mean_token_accuracy": 0.482758617401123, + "step": 14395 + }, + { + "epoch": 0.014503832940017586, + "grad_norm": 32.04970155871049, + "learning_rate": 1.4503555385452127e-05, + "loss": 2.526, + "mean_token_accuracy": 0.4034482777118683, + "step": 14400 + }, + { + "epoch": 0.014508868993121758, + "grad_norm": 40.42955901776003, + "learning_rate": 1.450859134218319e-05, + "loss": 2.3825, + "mean_token_accuracy": 0.44137930274009707, + "step": 14405 + }, + { + "epoch": 0.014513905046225932, + "grad_norm": 26.482696231713515, + "learning_rate": 1.4513627298914248e-05, + "loss": 2.2495, + "mean_token_accuracy": 0.4448275864124298, + "step": 14410 + }, + { + "epoch": 0.014518941099330104, + "grad_norm": 27.877491566202455, + "learning_rate": 1.451866325564531e-05, + "loss": 2.4241, + "mean_token_accuracy": 0.43103448748588563, + "step": 14415 + }, + { + "epoch": 0.014523977152434277, + "grad_norm": 32.811506623502666, + "learning_rate": 1.4523699212376368e-05, + "loss": 2.5715, + "mean_token_accuracy": 0.4344827651977539, + "step": 14420 + }, + { + "epoch": 0.01452901320553845, + "grad_norm": 27.172225710497397, + "learning_rate": 1.4528735169107427e-05, + "loss": 2.5742, + "mean_token_accuracy": 0.4, + "step": 14425 + }, + { + "epoch": 0.014534049258642623, + "grad_norm": 23.964519185889756, + "learning_rate": 1.4533771125838488e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.40689654350280763, + "step": 14430 + }, + { + "epoch": 0.014539085311746795, + "grad_norm": 21.620465590987802, + "learning_rate": 1.4538807082569547e-05, + "loss": 2.2618, + "mean_token_accuracy": 0.4667487680912018, + "step": 14435 + }, + { + "epoch": 0.014544121364850967, + "grad_norm": 34.407022748929776, + "learning_rate": 1.4543843039300606e-05, + "loss": 2.741, + "mean_token_accuracy": 0.358620685338974, + "step": 14440 + }, + { + "epoch": 0.014549157417955141, + "grad_norm": 26.901566633859108, + "learning_rate": 1.4548878996031667e-05, + "loss": 2.5655, + "mean_token_accuracy": 0.3551724165678024, + "step": 14445 + }, + { + "epoch": 0.014554193471059313, + "grad_norm": 30.33823793701239, + "learning_rate": 1.4553914952762727e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.43103447556495667, + "step": 14450 + }, + { + "epoch": 0.014559229524163487, + "grad_norm": 28.041826678771642, + "learning_rate": 1.4558950909493788e-05, + "loss": 2.3527, + "mean_token_accuracy": 0.44506956934928893, + "step": 14455 + }, + { + "epoch": 0.014564265577267659, + "grad_norm": 23.836348893852037, + "learning_rate": 1.4563986866224847e-05, + "loss": 2.5028, + "mean_token_accuracy": 0.39310344457626345, + "step": 14460 + }, + { + "epoch": 0.014569301630371833, + "grad_norm": 23.688286291447078, + "learning_rate": 1.4569022822955904e-05, + "loss": 2.186, + "mean_token_accuracy": 0.4379310369491577, + "step": 14465 + }, + { + "epoch": 0.014574337683476005, + "grad_norm": 25.848618829871278, + "learning_rate": 1.4574058779686967e-05, + "loss": 2.5572, + "mean_token_accuracy": 0.4, + "step": 14470 + }, + { + "epoch": 0.014579373736580177, + "grad_norm": 33.876501866356286, + "learning_rate": 1.4579094736418025e-05, + "loss": 2.3571, + "mean_token_accuracy": 0.4517846405506134, + "step": 14475 + }, + { + "epoch": 0.01458440978968435, + "grad_norm": 24.124623586229266, + "learning_rate": 1.4584130693149084e-05, + "loss": 2.1319, + "mean_token_accuracy": 0.4774349629878998, + "step": 14480 + }, + { + "epoch": 0.014589445842788522, + "grad_norm": 31.687703082018878, + "learning_rate": 1.4589166649880145e-05, + "loss": 2.1639, + "mean_token_accuracy": 0.4620689570903778, + "step": 14485 + }, + { + "epoch": 0.014594481895892696, + "grad_norm": 23.696390897799596, + "learning_rate": 1.4594202606611204e-05, + "loss": 2.2667, + "mean_token_accuracy": 0.4572897732257843, + "step": 14490 + }, + { + "epoch": 0.014599517948996868, + "grad_norm": 34.633908347115025, + "learning_rate": 1.4599238563342265e-05, + "loss": 2.5372, + "mean_token_accuracy": 0.4310344934463501, + "step": 14495 + }, + { + "epoch": 0.014604554002101042, + "grad_norm": 22.485813727402057, + "learning_rate": 1.4604274520073324e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.37931033968925476, + "step": 14500 + }, + { + "epoch": 0.014609590055205214, + "grad_norm": 24.35050591572347, + "learning_rate": 1.4609310476804383e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.3896551787853241, + "step": 14505 + }, + { + "epoch": 0.014614626108309386, + "grad_norm": 25.770059544374785, + "learning_rate": 1.4614346433535444e-05, + "loss": 2.106, + "mean_token_accuracy": 0.47586206197738645, + "step": 14510 + }, + { + "epoch": 0.01461966216141356, + "grad_norm": 26.824760960715643, + "learning_rate": 1.4619382390266504e-05, + "loss": 2.1678, + "mean_token_accuracy": 0.47779794931411745, + "step": 14515 + }, + { + "epoch": 0.014624698214517732, + "grad_norm": 23.679776358811104, + "learning_rate": 1.4624418346997561e-05, + "loss": 2.3399, + "mean_token_accuracy": 0.4620689630508423, + "step": 14520 + }, + { + "epoch": 0.014629734267621906, + "grad_norm": 24.441598109204982, + "learning_rate": 1.4629454303728624e-05, + "loss": 2.1747, + "mean_token_accuracy": 0.4862069010734558, + "step": 14525 + }, + { + "epoch": 0.014634770320726078, + "grad_norm": 25.110411719257563, + "learning_rate": 1.4634490260459683e-05, + "loss": 2.3748, + "mean_token_accuracy": 0.44137930274009707, + "step": 14530 + }, + { + "epoch": 0.014639806373830251, + "grad_norm": 34.40920774889646, + "learning_rate": 1.4639526217190744e-05, + "loss": 2.5975, + "mean_token_accuracy": 0.41379310488700866, + "step": 14535 + }, + { + "epoch": 0.014644842426934424, + "grad_norm": 27.70907043333397, + "learning_rate": 1.4644562173921803e-05, + "loss": 2.1599, + "mean_token_accuracy": 0.458620673418045, + "step": 14540 + }, + { + "epoch": 0.014649878480038596, + "grad_norm": 29.130907292101533, + "learning_rate": 1.4649598130652861e-05, + "loss": 2.0242, + "mean_token_accuracy": 0.4776769518852234, + "step": 14545 + }, + { + "epoch": 0.01465491453314277, + "grad_norm": 30.266866667366006, + "learning_rate": 1.4654634087383924e-05, + "loss": 2.3795, + "mean_token_accuracy": 0.39655172228813174, + "step": 14550 + }, + { + "epoch": 0.014659950586246941, + "grad_norm": 44.05107062953808, + "learning_rate": 1.4659670044114981e-05, + "loss": 2.5076, + "mean_token_accuracy": 0.3896551728248596, + "step": 14555 + }, + { + "epoch": 0.014664986639351115, + "grad_norm": 32.18245345330848, + "learning_rate": 1.466470600084604e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.4361766457557678, + "step": 14560 + }, + { + "epoch": 0.014670022692455287, + "grad_norm": 24.60002828072494, + "learning_rate": 1.4669741957577101e-05, + "loss": 2.1617, + "mean_token_accuracy": 0.4882637619972229, + "step": 14565 + }, + { + "epoch": 0.014675058745559461, + "grad_norm": 21.926331825965136, + "learning_rate": 1.467477791430816e-05, + "loss": 2.2334, + "mean_token_accuracy": 0.4137930989265442, + "step": 14570 + }, + { + "epoch": 0.014680094798663633, + "grad_norm": 24.495448086931294, + "learning_rate": 1.4679813871039222e-05, + "loss": 2.3695, + "mean_token_accuracy": 0.4310344815254211, + "step": 14575 + }, + { + "epoch": 0.014685130851767805, + "grad_norm": 33.86289949768534, + "learning_rate": 1.468484982777028e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.4482758641242981, + "step": 14580 + }, + { + "epoch": 0.014690166904871979, + "grad_norm": 23.648091612720897, + "learning_rate": 1.468988578450134e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.4689655125141144, + "step": 14585 + }, + { + "epoch": 0.01469520295797615, + "grad_norm": 34.577433059607316, + "learning_rate": 1.4694921741232401e-05, + "loss": 2.478, + "mean_token_accuracy": 0.48965516686439514, + "step": 14590 + }, + { + "epoch": 0.014700239011080325, + "grad_norm": 29.463683004904315, + "learning_rate": 1.469995769796346e-05, + "loss": 1.9954, + "mean_token_accuracy": 0.5448275923728942, + "step": 14595 + }, + { + "epoch": 0.014705275064184497, + "grad_norm": 22.86993303535317, + "learning_rate": 1.4704993654694518e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4620689630508423, + "step": 14600 + }, + { + "epoch": 0.01471031111728867, + "grad_norm": 43.657261488453614, + "learning_rate": 1.471002961142558e-05, + "loss": 2.5465, + "mean_token_accuracy": 0.41379310488700866, + "step": 14605 + }, + { + "epoch": 0.014715347170392842, + "grad_norm": 26.46599634655422, + "learning_rate": 1.4715065568156638e-05, + "loss": 2.2547, + "mean_token_accuracy": 0.4206896543502808, + "step": 14610 + }, + { + "epoch": 0.014720383223497014, + "grad_norm": 27.900220529339226, + "learning_rate": 1.47201015248877e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.35862068831920624, + "step": 14615 + }, + { + "epoch": 0.014725419276601188, + "grad_norm": 27.9573882940398, + "learning_rate": 1.4725137481618758e-05, + "loss": 2.3622, + "mean_token_accuracy": 0.4000000059604645, + "step": 14620 + }, + { + "epoch": 0.01473045532970536, + "grad_norm": 26.587399558194228, + "learning_rate": 1.4730173438349817e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.42413792610168455, + "step": 14625 + }, + { + "epoch": 0.014735491382809534, + "grad_norm": 23.155787701986483, + "learning_rate": 1.4735209395080878e-05, + "loss": 2.352, + "mean_token_accuracy": 0.42934058904647826, + "step": 14630 + }, + { + "epoch": 0.014740527435913706, + "grad_norm": 22.477119018359964, + "learning_rate": 1.4740245351811938e-05, + "loss": 2.2796, + "mean_token_accuracy": 0.46551724672317507, + "step": 14635 + }, + { + "epoch": 0.01474556348901788, + "grad_norm": 27.54722202451458, + "learning_rate": 1.4745281308542997e-05, + "loss": 2.4387, + "mean_token_accuracy": 0.482758617401123, + "step": 14640 + }, + { + "epoch": 0.014750599542122052, + "grad_norm": 25.80632190041504, + "learning_rate": 1.4750317265274058e-05, + "loss": 2.6151, + "mean_token_accuracy": 0.43793103098869324, + "step": 14645 + }, + { + "epoch": 0.014755635595226224, + "grad_norm": 23.673409912815078, + "learning_rate": 1.4755353222005117e-05, + "loss": 2.6233, + "mean_token_accuracy": 0.4137930929660797, + "step": 14650 + }, + { + "epoch": 0.014760671648330398, + "grad_norm": 27.939991644356798, + "learning_rate": 1.4760389178736178e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.3793103456497192, + "step": 14655 + }, + { + "epoch": 0.01476570770143457, + "grad_norm": 23.975817338639356, + "learning_rate": 1.4765425135467237e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.4137930989265442, + "step": 14660 + }, + { + "epoch": 0.014770743754538743, + "grad_norm": 28.661051438121323, + "learning_rate": 1.4770461092198295e-05, + "loss": 2.1658, + "mean_token_accuracy": 0.42068966031074523, + "step": 14665 + }, + { + "epoch": 0.014775779807642915, + "grad_norm": 44.83158259002183, + "learning_rate": 1.4775497048929357e-05, + "loss": 2.1556, + "mean_token_accuracy": 0.5137931048870087, + "step": 14670 + }, + { + "epoch": 0.01478081586074709, + "grad_norm": 27.26092397373069, + "learning_rate": 1.4780533005660415e-05, + "loss": 2.7949, + "mean_token_accuracy": 0.35862069129943847, + "step": 14675 + }, + { + "epoch": 0.014785851913851261, + "grad_norm": 26.64347587068617, + "learning_rate": 1.4785568962391474e-05, + "loss": 2.2384, + "mean_token_accuracy": 0.41034482717514037, + "step": 14680 + }, + { + "epoch": 0.014790887966955433, + "grad_norm": 33.48874693375749, + "learning_rate": 1.4790604919122535e-05, + "loss": 2.4473, + "mean_token_accuracy": 0.4620689630508423, + "step": 14685 + }, + { + "epoch": 0.014795924020059607, + "grad_norm": 33.36040429142167, + "learning_rate": 1.4795640875853594e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.40000000298023225, + "step": 14690 + }, + { + "epoch": 0.014800960073163779, + "grad_norm": 47.28622128639422, + "learning_rate": 1.4800676832584655e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.37586206793785093, + "step": 14695 + }, + { + "epoch": 0.014805996126267953, + "grad_norm": 28.21091643531407, + "learning_rate": 1.4805712789315715e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.44137929677963256, + "step": 14700 + }, + { + "epoch": 0.014811032179372125, + "grad_norm": 29.966442030392802, + "learning_rate": 1.4810748746046774e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.42758620381355283, + "step": 14705 + }, + { + "epoch": 0.014816068232476299, + "grad_norm": 30.56615164110763, + "learning_rate": 1.4815784702777835e-05, + "loss": 2.6077, + "mean_token_accuracy": 0.38275861740112305, + "step": 14710 + }, + { + "epoch": 0.01482110428558047, + "grad_norm": 25.104057223093797, + "learning_rate": 1.4820820659508894e-05, + "loss": 2.194, + "mean_token_accuracy": 0.44482758045196535, + "step": 14715 + }, + { + "epoch": 0.014826140338684643, + "grad_norm": 29.51730075187755, + "learning_rate": 1.4825856616239953e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.458620685338974, + "step": 14720 + }, + { + "epoch": 0.014831176391788816, + "grad_norm": 22.664552677527578, + "learning_rate": 1.4830892572971014e-05, + "loss": 1.8991, + "mean_token_accuracy": 0.5137931048870087, + "step": 14725 + }, + { + "epoch": 0.014836212444892988, + "grad_norm": 24.448382876717243, + "learning_rate": 1.4835928529702074e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.47241379618644713, + "step": 14730 + }, + { + "epoch": 0.014841248497997162, + "grad_norm": 24.64328918290692, + "learning_rate": 1.4840964486433135e-05, + "loss": 2.6031, + "mean_token_accuracy": 0.3965517282485962, + "step": 14735 + }, + { + "epoch": 0.014846284551101334, + "grad_norm": 29.602939490499384, + "learning_rate": 1.4846000443164194e-05, + "loss": 2.4319, + "mean_token_accuracy": 0.39655172228813174, + "step": 14740 + }, + { + "epoch": 0.014851320604205508, + "grad_norm": 21.834560427448505, + "learning_rate": 1.4851036399895251e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.42758620381355283, + "step": 14745 + }, + { + "epoch": 0.01485635665730968, + "grad_norm": 27.09831909643038, + "learning_rate": 1.4856072356626314e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4344827592372894, + "step": 14750 + }, + { + "epoch": 0.014861392710413852, + "grad_norm": 27.246073559468776, + "learning_rate": 1.4861108313357372e-05, + "loss": 2.8919, + "mean_token_accuracy": 0.341379314661026, + "step": 14755 + }, + { + "epoch": 0.014866428763518026, + "grad_norm": 32.53746534574877, + "learning_rate": 1.4866144270088434e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.41724138259887694, + "step": 14760 + }, + { + "epoch": 0.014871464816622198, + "grad_norm": 32.11908456690238, + "learning_rate": 1.4871180226819492e-05, + "loss": 2.4317, + "mean_token_accuracy": 0.4535390198230743, + "step": 14765 + }, + { + "epoch": 0.014876500869726372, + "grad_norm": 21.0885052894486, + "learning_rate": 1.4876216183550551e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.4482758641242981, + "step": 14770 + }, + { + "epoch": 0.014881536922830544, + "grad_norm": 31.71635540005267, + "learning_rate": 1.4881252140281612e-05, + "loss": 2.4607, + "mean_token_accuracy": 0.43103448748588563, + "step": 14775 + }, + { + "epoch": 0.014886572975934717, + "grad_norm": 26.551065702831263, + "learning_rate": 1.4886288097012671e-05, + "loss": 2.2315, + "mean_token_accuracy": 0.4931034445762634, + "step": 14780 + }, + { + "epoch": 0.01489160902903889, + "grad_norm": 26.31800167604234, + "learning_rate": 1.489132405374373e-05, + "loss": 2.1847, + "mean_token_accuracy": 0.46551724076271056, + "step": 14785 + }, + { + "epoch": 0.014896645082143061, + "grad_norm": 28.85586010118484, + "learning_rate": 1.4896360010474791e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.43103448748588563, + "step": 14790 + }, + { + "epoch": 0.014901681135247235, + "grad_norm": 24.224798034892938, + "learning_rate": 1.490139596720585e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.44137930274009707, + "step": 14795 + }, + { + "epoch": 0.014906717188351407, + "grad_norm": 25.305991708360004, + "learning_rate": 1.4906431923936912e-05, + "loss": 2.5472, + "mean_token_accuracy": 0.37241379022598264, + "step": 14800 + }, + { + "epoch": 0.014911753241455581, + "grad_norm": 22.11855014596968, + "learning_rate": 1.4911467880667971e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.42068964838981626, + "step": 14805 + }, + { + "epoch": 0.014916789294559753, + "grad_norm": 31.92017127712201, + "learning_rate": 1.4916503837399028e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.37241379022598264, + "step": 14810 + }, + { + "epoch": 0.014921825347663927, + "grad_norm": 22.662170426299145, + "learning_rate": 1.4921539794130091e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.42758620381355283, + "step": 14815 + }, + { + "epoch": 0.014926861400768099, + "grad_norm": 25.223517984678654, + "learning_rate": 1.4926575750861149e-05, + "loss": 1.98, + "mean_token_accuracy": 0.47931033968925474, + "step": 14820 + }, + { + "epoch": 0.014931897453872271, + "grad_norm": 23.5384031661648, + "learning_rate": 1.4931611707592208e-05, + "loss": 2.1661, + "mean_token_accuracy": 0.46551724076271056, + "step": 14825 + }, + { + "epoch": 0.014936933506976445, + "grad_norm": 27.81598026744748, + "learning_rate": 1.4936647664323269e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.41724138259887694, + "step": 14830 + }, + { + "epoch": 0.014941969560080617, + "grad_norm": 23.43711974743467, + "learning_rate": 1.4941683621054328e-05, + "loss": 2.2014, + "mean_token_accuracy": 0.4482758641242981, + "step": 14835 + }, + { + "epoch": 0.01494700561318479, + "grad_norm": 23.275180241035205, + "learning_rate": 1.4946719577785389e-05, + "loss": 2.636, + "mean_token_accuracy": 0.39655172228813174, + "step": 14840 + }, + { + "epoch": 0.014952041666288962, + "grad_norm": 21.630565260730904, + "learning_rate": 1.4951755534516448e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.488324248790741, + "step": 14845 + }, + { + "epoch": 0.014957077719393136, + "grad_norm": 22.65031123196753, + "learning_rate": 1.4956791491247508e-05, + "loss": 2.2864, + "mean_token_accuracy": 0.5344827592372894, + "step": 14850 + }, + { + "epoch": 0.014962113772497308, + "grad_norm": 28.671975200109998, + "learning_rate": 1.4961827447978568e-05, + "loss": 2.8464, + "mean_token_accuracy": 0.3862068921327591, + "step": 14855 + }, + { + "epoch": 0.01496714982560148, + "grad_norm": 26.657981432517147, + "learning_rate": 1.4966863404709628e-05, + "loss": 2.4944, + "mean_token_accuracy": 0.4103448331356049, + "step": 14860 + }, + { + "epoch": 0.014972185878705654, + "grad_norm": 26.683902641254384, + "learning_rate": 1.4971899361440685e-05, + "loss": 2.3469, + "mean_token_accuracy": 0.4344827592372894, + "step": 14865 + }, + { + "epoch": 0.014977221931809826, + "grad_norm": 24.6457348562685, + "learning_rate": 1.4976935318171748e-05, + "loss": 2.5826, + "mean_token_accuracy": 0.38275861740112305, + "step": 14870 + }, + { + "epoch": 0.014982257984914, + "grad_norm": 19.37687188663943, + "learning_rate": 1.4981971274902805e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.43793103098869324, + "step": 14875 + }, + { + "epoch": 0.014987294038018172, + "grad_norm": 29.402498225411847, + "learning_rate": 1.4987007231633868e-05, + "loss": 2.0806, + "mean_token_accuracy": 0.4724137902259827, + "step": 14880 + }, + { + "epoch": 0.014992330091122346, + "grad_norm": 31.0861367592149, + "learning_rate": 1.4992043188364926e-05, + "loss": 2.441, + "mean_token_accuracy": 0.4068965375423431, + "step": 14885 + }, + { + "epoch": 0.014997366144226518, + "grad_norm": 29.60742108065984, + "learning_rate": 1.4997079145095985e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.47931034564971925, + "step": 14890 + }, + { + "epoch": 0.01500240219733069, + "grad_norm": 24.110506300490766, + "learning_rate": 1.5002115101827046e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.42758620977401735, + "step": 14895 + }, + { + "epoch": 0.015007438250434863, + "grad_norm": 36.5036726395976, + "learning_rate": 1.5007151058558105e-05, + "loss": 2.2583, + "mean_token_accuracy": 0.484359610080719, + "step": 14900 + }, + { + "epoch": 0.015012474303539036, + "grad_norm": 22.055851039073005, + "learning_rate": 1.5012187015289164e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.44827585816383364, + "step": 14905 + }, + { + "epoch": 0.01501751035664321, + "grad_norm": 34.09602018492731, + "learning_rate": 1.5017222972020225e-05, + "loss": 2.1159, + "mean_token_accuracy": 0.5070780336856842, + "step": 14910 + }, + { + "epoch": 0.015022546409747381, + "grad_norm": 24.787918165458834, + "learning_rate": 1.5022258928751285e-05, + "loss": 2.2127, + "mean_token_accuracy": 0.4413793087005615, + "step": 14915 + }, + { + "epoch": 0.015027582462851555, + "grad_norm": 26.47279743468467, + "learning_rate": 1.5027294885482346e-05, + "loss": 2.3159, + "mean_token_accuracy": 0.45172412395477296, + "step": 14920 + }, + { + "epoch": 0.015032618515955727, + "grad_norm": 31.033818205756386, + "learning_rate": 1.5032330842213405e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.4275862157344818, + "step": 14925 + }, + { + "epoch": 0.0150376545690599, + "grad_norm": 26.94708550355271, + "learning_rate": 1.5037366798944464e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4413793087005615, + "step": 14930 + }, + { + "epoch": 0.015042690622164073, + "grad_norm": 25.169244647715573, + "learning_rate": 1.5042402755675525e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.4517241299152374, + "step": 14935 + }, + { + "epoch": 0.015047726675268245, + "grad_norm": 40.10087759421944, + "learning_rate": 1.5047438712406584e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.4034482717514038, + "step": 14940 + }, + { + "epoch": 0.015052762728372419, + "grad_norm": 29.313313283164337, + "learning_rate": 1.5052474669137642e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.4413793087005615, + "step": 14945 + }, + { + "epoch": 0.01505779878147659, + "grad_norm": 22.470059260982964, + "learning_rate": 1.5057510625868704e-05, + "loss": 2.577, + "mean_token_accuracy": 0.3862069010734558, + "step": 14950 + }, + { + "epoch": 0.015062834834580765, + "grad_norm": 28.6797743395478, + "learning_rate": 1.5062546582599762e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.3965517163276672, + "step": 14955 + }, + { + "epoch": 0.015067870887684937, + "grad_norm": 33.0424606347182, + "learning_rate": 1.5067582539330825e-05, + "loss": 1.9827, + "mean_token_accuracy": 0.48523896336555483, + "step": 14960 + }, + { + "epoch": 0.015072906940789109, + "grad_norm": 28.14559724508174, + "learning_rate": 1.5072618496061882e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.38275861740112305, + "step": 14965 + }, + { + "epoch": 0.015077942993893282, + "grad_norm": 24.374999285284307, + "learning_rate": 1.5077654452792941e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.41724138259887694, + "step": 14970 + }, + { + "epoch": 0.015082979046997454, + "grad_norm": 25.616286346354993, + "learning_rate": 1.5082690409524002e-05, + "loss": 2.3574, + "mean_token_accuracy": 0.41379310488700866, + "step": 14975 + }, + { + "epoch": 0.015088015100101628, + "grad_norm": 27.237489024038084, + "learning_rate": 1.5087726366255062e-05, + "loss": 2.2576, + "mean_token_accuracy": 0.4551724135875702, + "step": 14980 + }, + { + "epoch": 0.0150930511532058, + "grad_norm": 25.741042246992208, + "learning_rate": 1.5092762322986121e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.42758620977401735, + "step": 14985 + }, + { + "epoch": 0.015098087206309974, + "grad_norm": 29.331359785369592, + "learning_rate": 1.5097798279717182e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.41379310488700866, + "step": 14990 + }, + { + "epoch": 0.015103123259414146, + "grad_norm": 27.882007846329156, + "learning_rate": 1.5102834236448241e-05, + "loss": 2.4753, + "mean_token_accuracy": 0.4275861978530884, + "step": 14995 + }, + { + "epoch": 0.015108159312518318, + "grad_norm": 34.95171478184749, + "learning_rate": 1.5107870193179302e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.42413793206214906, + "step": 15000 + }, + { + "epoch": 0.015113195365622492, + "grad_norm": 32.956074239610174, + "learning_rate": 1.5112906149910361e-05, + "loss": 2.3283, + "mean_token_accuracy": 0.42068964838981626, + "step": 15005 + }, + { + "epoch": 0.015118231418726664, + "grad_norm": 22.777803284158868, + "learning_rate": 1.5117942106641419e-05, + "loss": 2.1476, + "mean_token_accuracy": 0.43793103098869324, + "step": 15010 + }, + { + "epoch": 0.015123267471830838, + "grad_norm": 27.160889631129503, + "learning_rate": 1.5122978063372481e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.4482758641242981, + "step": 15015 + }, + { + "epoch": 0.01512830352493501, + "grad_norm": 30.92468799538444, + "learning_rate": 1.5128014020103539e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.42068966031074523, + "step": 15020 + }, + { + "epoch": 0.015133339578039183, + "grad_norm": 22.7474869503625, + "learning_rate": 1.5133049976834598e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.4137931078672409, + "step": 15025 + }, + { + "epoch": 0.015138375631143355, + "grad_norm": 29.11117019798295, + "learning_rate": 1.513808593356566e-05, + "loss": 2.2306, + "mean_token_accuracy": 0.493103438615799, + "step": 15030 + }, + { + "epoch": 0.015143411684247527, + "grad_norm": 21.12295579576171, + "learning_rate": 1.5143121890296719e-05, + "loss": 2.1813, + "mean_token_accuracy": 0.4689655125141144, + "step": 15035 + }, + { + "epoch": 0.015148447737351701, + "grad_norm": 28.935505185650776, + "learning_rate": 1.514815784702778e-05, + "loss": 2.267, + "mean_token_accuracy": 0.4931034564971924, + "step": 15040 + }, + { + "epoch": 0.015153483790455873, + "grad_norm": 31.40162474672853, + "learning_rate": 1.5153193803758839e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.42413793206214906, + "step": 15045 + }, + { + "epoch": 0.015158519843560047, + "grad_norm": 26.131745620735387, + "learning_rate": 1.5158229760489898e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.42413792610168455, + "step": 15050 + }, + { + "epoch": 0.015163555896664219, + "grad_norm": 28.287149193031876, + "learning_rate": 1.5163265717220959e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.44482759237289426, + "step": 15055 + }, + { + "epoch": 0.015168591949768393, + "grad_norm": 19.64781895533074, + "learning_rate": 1.5168301673952018e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.39655171930789945, + "step": 15060 + }, + { + "epoch": 0.015173628002872565, + "grad_norm": 27.655730291447014, + "learning_rate": 1.5173337630683076e-05, + "loss": 2.504, + "mean_token_accuracy": 0.417241370677948, + "step": 15065 + }, + { + "epoch": 0.015178664055976737, + "grad_norm": 37.69673256513576, + "learning_rate": 1.5178373587414138e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.44827585816383364, + "step": 15070 + }, + { + "epoch": 0.01518370010908091, + "grad_norm": 28.71471968505637, + "learning_rate": 1.5183409544145198e-05, + "loss": 2.2258, + "mean_token_accuracy": 0.4637930989265442, + "step": 15075 + }, + { + "epoch": 0.015188736162185083, + "grad_norm": 27.040804556333185, + "learning_rate": 1.5188445500876259e-05, + "loss": 2.4311, + "mean_token_accuracy": 0.36896551847457887, + "step": 15080 + }, + { + "epoch": 0.015193772215289256, + "grad_norm": 21.01676275131767, + "learning_rate": 1.5193481457607318e-05, + "loss": 2.291, + "mean_token_accuracy": 0.3999999940395355, + "step": 15085 + }, + { + "epoch": 0.015198808268393428, + "grad_norm": 26.14926082172177, + "learning_rate": 1.5198517414338375e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.3620689570903778, + "step": 15090 + }, + { + "epoch": 0.015203844321497602, + "grad_norm": 30.068215386643843, + "learning_rate": 1.5203553371069438e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.44482759237289426, + "step": 15095 + }, + { + "epoch": 0.015208880374601774, + "grad_norm": 30.376764799380407, + "learning_rate": 1.5208589327800496e-05, + "loss": 2.57, + "mean_token_accuracy": 0.4620689690113068, + "step": 15100 + }, + { + "epoch": 0.015213916427705946, + "grad_norm": 27.737970240050405, + "learning_rate": 1.5213625284531555e-05, + "loss": 2.508, + "mean_token_accuracy": 0.44827585220336913, + "step": 15105 + }, + { + "epoch": 0.01521895248081012, + "grad_norm": 31.12720254062167, + "learning_rate": 1.5218661241262616e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.42982456386089324, + "step": 15110 + }, + { + "epoch": 0.015223988533914292, + "grad_norm": 26.92285197046569, + "learning_rate": 1.5223697197993675e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4034482717514038, + "step": 15115 + }, + { + "epoch": 0.015229024587018466, + "grad_norm": 32.305778791098305, + "learning_rate": 1.5228733154724736e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.44827585816383364, + "step": 15120 + }, + { + "epoch": 0.015234060640122638, + "grad_norm": 21.90958700099828, + "learning_rate": 1.5233769111455795e-05, + "loss": 2.1262, + "mean_token_accuracy": 0.4620689690113068, + "step": 15125 + }, + { + "epoch": 0.01523909669322681, + "grad_norm": 28.276987843375537, + "learning_rate": 1.5238805068186854e-05, + "loss": 2.3825, + "mean_token_accuracy": 0.4620689690113068, + "step": 15130 + }, + { + "epoch": 0.015244132746330984, + "grad_norm": 33.500641313669036, + "learning_rate": 1.5243841024917915e-05, + "loss": 2.446, + "mean_token_accuracy": 0.441379314661026, + "step": 15135 + }, + { + "epoch": 0.015249168799435156, + "grad_norm": 26.76904883676346, + "learning_rate": 1.5248876981648975e-05, + "loss": 2.3168, + "mean_token_accuracy": 0.441379314661026, + "step": 15140 + }, + { + "epoch": 0.01525420485253933, + "grad_norm": 23.2680974431548, + "learning_rate": 1.5253912938380036e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.46206897497177124, + "step": 15145 + }, + { + "epoch": 0.015259240905643501, + "grad_norm": 32.07531607208907, + "learning_rate": 1.5258948895111095e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.38965516686439516, + "step": 15150 + }, + { + "epoch": 0.015264276958747675, + "grad_norm": 29.75079183643218, + "learning_rate": 1.5263984851842154e-05, + "loss": 2.6213, + "mean_token_accuracy": 0.42413793206214906, + "step": 15155 + }, + { + "epoch": 0.015269313011851847, + "grad_norm": 28.667975208877643, + "learning_rate": 1.5269020808573213e-05, + "loss": 2.3524, + "mean_token_accuracy": 0.4517241418361664, + "step": 15160 + }, + { + "epoch": 0.01527434906495602, + "grad_norm": 29.495643499924547, + "learning_rate": 1.5274056765304273e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.4413793087005615, + "step": 15165 + }, + { + "epoch": 0.015279385118060193, + "grad_norm": 23.539930596724293, + "learning_rate": 1.5279092722035332e-05, + "loss": 1.9282, + "mean_token_accuracy": 0.5275861978530884, + "step": 15170 + }, + { + "epoch": 0.015284421171164365, + "grad_norm": 27.136937283318872, + "learning_rate": 1.5284128678766395e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.42758620977401735, + "step": 15175 + }, + { + "epoch": 0.015289457224268539, + "grad_norm": 26.40953267002137, + "learning_rate": 1.5289164635497454e-05, + "loss": 2.3817, + "mean_token_accuracy": 0.4597290694713593, + "step": 15180 + }, + { + "epoch": 0.015294493277372711, + "grad_norm": 21.97792386095895, + "learning_rate": 1.5294200592228513e-05, + "loss": 2.4647, + "mean_token_accuracy": 0.3862068891525269, + "step": 15185 + }, + { + "epoch": 0.015299529330476885, + "grad_norm": 28.92035318736106, + "learning_rate": 1.5299236548959572e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.3862068891525269, + "step": 15190 + }, + { + "epoch": 0.015304565383581057, + "grad_norm": 27.038182465843654, + "learning_rate": 1.530427250569063e-05, + "loss": 2.3196, + "mean_token_accuracy": 0.4206896543502808, + "step": 15195 + }, + { + "epoch": 0.015309601436685229, + "grad_norm": 23.57627638728257, + "learning_rate": 1.5309308462421694e-05, + "loss": 2.5406, + "mean_token_accuracy": 0.38620689511299133, + "step": 15200 + }, + { + "epoch": 0.015314637489789402, + "grad_norm": 32.39661561322554, + "learning_rate": 1.531434441915275e-05, + "loss": 2.2925, + "mean_token_accuracy": 0.43793103098869324, + "step": 15205 + }, + { + "epoch": 0.015319673542893575, + "grad_norm": 21.61241132847649, + "learning_rate": 1.531938037588381e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.41724138259887694, + "step": 15210 + }, + { + "epoch": 0.015324709595997748, + "grad_norm": 62.22562315211066, + "learning_rate": 1.5324416332614872e-05, + "loss": 2.1067, + "mean_token_accuracy": 0.4758620738983154, + "step": 15215 + }, + { + "epoch": 0.01532974564910192, + "grad_norm": 33.307248233027074, + "learning_rate": 1.532945228934593e-05, + "loss": 2.9393, + "mean_token_accuracy": 0.31379309892654417, + "step": 15220 + }, + { + "epoch": 0.015334781702206094, + "grad_norm": 22.824698164028327, + "learning_rate": 1.533448824607699e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.42758620977401735, + "step": 15225 + }, + { + "epoch": 0.015339817755310266, + "grad_norm": 39.60586148444596, + "learning_rate": 1.533952420280805e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.4034482717514038, + "step": 15230 + }, + { + "epoch": 0.015344853808414438, + "grad_norm": 21.514431137827334, + "learning_rate": 1.534456015953911e-05, + "loss": 2.54, + "mean_token_accuracy": 0.42068966031074523, + "step": 15235 + }, + { + "epoch": 0.015349889861518612, + "grad_norm": 26.501008741817458, + "learning_rate": 1.534959611627017e-05, + "loss": 2.2307, + "mean_token_accuracy": 0.4655172348022461, + "step": 15240 + }, + { + "epoch": 0.015354925914622784, + "grad_norm": 30.964055113230447, + "learning_rate": 1.535463207300123e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.40689654350280763, + "step": 15245 + }, + { + "epoch": 0.015359961967726958, + "grad_norm": 23.609834060334986, + "learning_rate": 1.5359668029732287e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.42068966031074523, + "step": 15250 + }, + { + "epoch": 0.01536499802083113, + "grad_norm": 31.221222286855603, + "learning_rate": 1.536470398646335e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.4344827651977539, + "step": 15255 + }, + { + "epoch": 0.015370034073935303, + "grad_norm": 26.52871738394261, + "learning_rate": 1.536973994319441e-05, + "loss": 2.5322, + "mean_token_accuracy": 0.4620689690113068, + "step": 15260 + }, + { + "epoch": 0.015375070127039476, + "grad_norm": 27.420888859961313, + "learning_rate": 1.537477589992547e-05, + "loss": 2.5503, + "mean_token_accuracy": 0.41034482717514037, + "step": 15265 + }, + { + "epoch": 0.015380106180143648, + "grad_norm": 27.623311514977516, + "learning_rate": 1.5379811856656527e-05, + "loss": 2.6551, + "mean_token_accuracy": 0.37241379022598264, + "step": 15270 + }, + { + "epoch": 0.015385142233247821, + "grad_norm": 28.179215674514353, + "learning_rate": 1.5384847813387586e-05, + "loss": 2.2328, + "mean_token_accuracy": 0.43932244181632996, + "step": 15275 + }, + { + "epoch": 0.015390178286351993, + "grad_norm": 23.4748892846769, + "learning_rate": 1.538988377011865e-05, + "loss": 2.2666, + "mean_token_accuracy": 0.4517241358757019, + "step": 15280 + }, + { + "epoch": 0.015395214339456167, + "grad_norm": 60.65185448627418, + "learning_rate": 1.5394919726849708e-05, + "loss": 2.6417, + "mean_token_accuracy": 0.3827586233615875, + "step": 15285 + }, + { + "epoch": 0.01540025039256034, + "grad_norm": 25.402593201443093, + "learning_rate": 1.5399955683580768e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.3931034505367279, + "step": 15290 + }, + { + "epoch": 0.015405286445664513, + "grad_norm": 27.022524882760127, + "learning_rate": 1.5404991640311827e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.3931034505367279, + "step": 15295 + }, + { + "epoch": 0.015410322498768685, + "grad_norm": 26.340769506923586, + "learning_rate": 1.5410027597042886e-05, + "loss": 2.478, + "mean_token_accuracy": 0.4068965494632721, + "step": 15300 + }, + { + "epoch": 0.015415358551872857, + "grad_norm": 31.93989389902504, + "learning_rate": 1.541506355377395e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.4931034505367279, + "step": 15305 + }, + { + "epoch": 0.01542039460497703, + "grad_norm": 25.94858099628174, + "learning_rate": 1.5420099510505008e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.4051421582698822, + "step": 15310 + }, + { + "epoch": 0.015425430658081203, + "grad_norm": 38.07859351500177, + "learning_rate": 1.5425135467236064e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.4620689630508423, + "step": 15315 + }, + { + "epoch": 0.015430466711185377, + "grad_norm": 33.9922370450758, + "learning_rate": 1.5430171423967126e-05, + "loss": 2.5306, + "mean_token_accuracy": 0.3965517282485962, + "step": 15320 + }, + { + "epoch": 0.015435502764289549, + "grad_norm": 27.94677952657774, + "learning_rate": 1.5435207380698186e-05, + "loss": 2.7147, + "mean_token_accuracy": 0.320689657330513, + "step": 15325 + }, + { + "epoch": 0.015440538817393722, + "grad_norm": 26.254540596488283, + "learning_rate": 1.5440243337429245e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4344827592372894, + "step": 15330 + }, + { + "epoch": 0.015445574870497894, + "grad_norm": 26.858089823420904, + "learning_rate": 1.5445279294160304e-05, + "loss": 2.5259, + "mean_token_accuracy": 0.3896551728248596, + "step": 15335 + }, + { + "epoch": 0.015450610923602066, + "grad_norm": 25.678576939756653, + "learning_rate": 1.5450315250891363e-05, + "loss": 2.3543, + "mean_token_accuracy": 0.4344827651977539, + "step": 15340 + }, + { + "epoch": 0.01545564697670624, + "grad_norm": 26.635519574655135, + "learning_rate": 1.5455351207622426e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.4172413766384125, + "step": 15345 + }, + { + "epoch": 0.015460683029810412, + "grad_norm": 22.938107403922594, + "learning_rate": 1.5460387164353485e-05, + "loss": 2.3581, + "mean_token_accuracy": 0.4137930989265442, + "step": 15350 + }, + { + "epoch": 0.015465719082914586, + "grad_norm": 30.32331116088612, + "learning_rate": 1.5465423121084545e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.39655172228813174, + "step": 15355 + }, + { + "epoch": 0.015470755136018758, + "grad_norm": 29.976195194096814, + "learning_rate": 1.5470459077815604e-05, + "loss": 2.313, + "mean_token_accuracy": 0.39655172228813174, + "step": 15360 + }, + { + "epoch": 0.015475791189122932, + "grad_norm": 23.365203161684644, + "learning_rate": 1.5475495034546663e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4586206912994385, + "step": 15365 + }, + { + "epoch": 0.015480827242227104, + "grad_norm": 29.183489873934562, + "learning_rate": 1.5480530991277722e-05, + "loss": 2.1398, + "mean_token_accuracy": 0.4310344934463501, + "step": 15370 + }, + { + "epoch": 0.015485863295331276, + "grad_norm": 27.940375902532523, + "learning_rate": 1.5485566948008785e-05, + "loss": 2.1217, + "mean_token_accuracy": 0.4620689630508423, + "step": 15375 + }, + { + "epoch": 0.01549089934843545, + "grad_norm": 32.49007707356087, + "learning_rate": 1.5490602904739844e-05, + "loss": 2.2558, + "mean_token_accuracy": 0.4620689630508423, + "step": 15380 + }, + { + "epoch": 0.015495935401539622, + "grad_norm": 25.252953640610716, + "learning_rate": 1.5495638861470903e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.40689654350280763, + "step": 15385 + }, + { + "epoch": 0.015500971454643795, + "grad_norm": 27.663082739947956, + "learning_rate": 1.5500674818201963e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.4034482777118683, + "step": 15390 + }, + { + "epoch": 0.015506007507747967, + "grad_norm": 43.82563328270075, + "learning_rate": 1.5505710774933022e-05, + "loss": 2.2844, + "mean_token_accuracy": 0.4137930989265442, + "step": 15395 + }, + { + "epoch": 0.015511043560852141, + "grad_norm": 29.145020488591964, + "learning_rate": 1.5510746731664085e-05, + "loss": 2.0937, + "mean_token_accuracy": 0.47241379618644713, + "step": 15400 + }, + { + "epoch": 0.015516079613956313, + "grad_norm": 22.83227662007667, + "learning_rate": 1.551578268839514e-05, + "loss": 2.2476, + "mean_token_accuracy": 0.4517241358757019, + "step": 15405 + }, + { + "epoch": 0.015521115667060485, + "grad_norm": 26.132016761693365, + "learning_rate": 1.55208186451262e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.441379314661026, + "step": 15410 + }, + { + "epoch": 0.015526151720164659, + "grad_norm": 19.341227683769997, + "learning_rate": 1.5525854601857262e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.4931034445762634, + "step": 15415 + }, + { + "epoch": 0.015531187773268831, + "grad_norm": 29.721415781920566, + "learning_rate": 1.553089055858832e-05, + "loss": 2.4372, + "mean_token_accuracy": 0.417241370677948, + "step": 15420 + }, + { + "epoch": 0.015536223826373005, + "grad_norm": 22.50283103005277, + "learning_rate": 1.553592651531938e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4448275864124298, + "step": 15425 + }, + { + "epoch": 0.015541259879477177, + "grad_norm": 23.985986542567865, + "learning_rate": 1.554096247205044e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.46551724076271056, + "step": 15430 + }, + { + "epoch": 0.01554629593258135, + "grad_norm": 25.27321331074455, + "learning_rate": 1.55459984287815e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.47586206793785096, + "step": 15435 + }, + { + "epoch": 0.015551331985685523, + "grad_norm": 29.489918811966035, + "learning_rate": 1.5551034385512562e-05, + "loss": 2.0868, + "mean_token_accuracy": 0.48337438702583313, + "step": 15440 + }, + { + "epoch": 0.015556368038789695, + "grad_norm": 24.33428711106757, + "learning_rate": 1.555607034224362e-05, + "loss": 2.4336, + "mean_token_accuracy": 0.39310343861579894, + "step": 15445 + }, + { + "epoch": 0.015561404091893868, + "grad_norm": 28.539339088876677, + "learning_rate": 1.5561106298974677e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.38620689511299133, + "step": 15450 + }, + { + "epoch": 0.01556644014499804, + "grad_norm": 24.705792318123912, + "learning_rate": 1.556614225570574e-05, + "loss": 2.2383, + "mean_token_accuracy": 0.43103447556495667, + "step": 15455 + }, + { + "epoch": 0.015571476198102214, + "grad_norm": 21.822106373407035, + "learning_rate": 1.55711782124368e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.48965516686439514, + "step": 15460 + }, + { + "epoch": 0.015576512251206386, + "grad_norm": 31.90504845301236, + "learning_rate": 1.5576214169167862e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.4862068951129913, + "step": 15465 + }, + { + "epoch": 0.01558154830431056, + "grad_norm": 27.086377007185842, + "learning_rate": 1.5581250125898918e-05, + "loss": 2.2882, + "mean_token_accuracy": 0.45862067937850953, + "step": 15470 + }, + { + "epoch": 0.015586584357414732, + "grad_norm": 22.299810823910082, + "learning_rate": 1.5586286082629977e-05, + "loss": 2.3129, + "mean_token_accuracy": 0.4758620738983154, + "step": 15475 + }, + { + "epoch": 0.015591620410518904, + "grad_norm": 25.69934904554766, + "learning_rate": 1.559132203936104e-05, + "loss": 2.4365, + "mean_token_accuracy": 0.39310345649719236, + "step": 15480 + }, + { + "epoch": 0.015596656463623078, + "grad_norm": 33.522117494510724, + "learning_rate": 1.55963579960921e-05, + "loss": 2.3073, + "mean_token_accuracy": 0.46896551847457885, + "step": 15485 + }, + { + "epoch": 0.01560169251672725, + "grad_norm": 31.277892949129864, + "learning_rate": 1.5601393952823158e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4275861978530884, + "step": 15490 + }, + { + "epoch": 0.015606728569831424, + "grad_norm": 28.30338192441034, + "learning_rate": 1.5606429909554217e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.39999998807907106, + "step": 15495 + }, + { + "epoch": 0.015611764622935596, + "grad_norm": 26.097227251949477, + "learning_rate": 1.5611465866285276e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.46896552443504336, + "step": 15500 + }, + { + "epoch": 0.01561680067603977, + "grad_norm": 26.142762877336704, + "learning_rate": 1.561650182301634e-05, + "loss": 2.481, + "mean_token_accuracy": 0.4344827592372894, + "step": 15505 + }, + { + "epoch": 0.015621836729143941, + "grad_norm": 34.112400762998185, + "learning_rate": 1.56215377797474e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.38620689511299133, + "step": 15510 + }, + { + "epoch": 0.015626872782248113, + "grad_norm": 35.81910547070459, + "learning_rate": 1.5626573736478454e-05, + "loss": 2.509, + "mean_token_accuracy": 0.4206896543502808, + "step": 15515 + }, + { + "epoch": 0.015631908835352287, + "grad_norm": 26.876770466531983, + "learning_rate": 1.5631609693209517e-05, + "loss": 1.9901, + "mean_token_accuracy": 0.4689655125141144, + "step": 15520 + }, + { + "epoch": 0.01563694488845646, + "grad_norm": 30.964368393603834, + "learning_rate": 1.5636645649940576e-05, + "loss": 2.2874, + "mean_token_accuracy": 0.45517241954803467, + "step": 15525 + }, + { + "epoch": 0.01564198094156063, + "grad_norm": 27.4545831418006, + "learning_rate": 1.5641681606671635e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.4379310250282288, + "step": 15530 + }, + { + "epoch": 0.015647016994664805, + "grad_norm": 24.67646371041598, + "learning_rate": 1.5646717563402698e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.482758617401123, + "step": 15535 + }, + { + "epoch": 0.01565205304776898, + "grad_norm": 33.70781072486016, + "learning_rate": 1.5651753520133754e-05, + "loss": 2.5344, + "mean_token_accuracy": 0.4586206912994385, + "step": 15540 + }, + { + "epoch": 0.015657089100873153, + "grad_norm": 25.208622671873613, + "learning_rate": 1.5656789476864817e-05, + "loss": 2.6548, + "mean_token_accuracy": 0.3896551728248596, + "step": 15545 + }, + { + "epoch": 0.015662125153977323, + "grad_norm": 30.92790838738424, + "learning_rate": 1.5661825433595876e-05, + "loss": 3.016, + "mean_token_accuracy": 0.37241379618644715, + "step": 15550 + }, + { + "epoch": 0.015667161207081497, + "grad_norm": 24.351642725436907, + "learning_rate": 1.5666861390326935e-05, + "loss": 1.849, + "mean_token_accuracy": 0.5224440395832062, + "step": 15555 + }, + { + "epoch": 0.01567219726018567, + "grad_norm": 20.30646079720107, + "learning_rate": 1.5671897347057994e-05, + "loss": 2.2394, + "mean_token_accuracy": 0.42758620381355283, + "step": 15560 + }, + { + "epoch": 0.01567723331328984, + "grad_norm": 28.533673083955637, + "learning_rate": 1.5676933303789054e-05, + "loss": 2.569, + "mean_token_accuracy": 0.41379310488700866, + "step": 15565 + }, + { + "epoch": 0.015682269366394015, + "grad_norm": 24.948961473221317, + "learning_rate": 1.5681969260520116e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4206896543502808, + "step": 15570 + }, + { + "epoch": 0.01568730541949819, + "grad_norm": 24.21568089839342, + "learning_rate": 1.5687005217251175e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.4448275864124298, + "step": 15575 + }, + { + "epoch": 0.01569234147260236, + "grad_norm": 31.979436338504815, + "learning_rate": 1.5692041173982235e-05, + "loss": 2.5227, + "mean_token_accuracy": 0.40344826579093934, + "step": 15580 + }, + { + "epoch": 0.015697377525706532, + "grad_norm": 24.92234295687272, + "learning_rate": 1.5697077130713294e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.39310344457626345, + "step": 15585 + }, + { + "epoch": 0.015702413578810706, + "grad_norm": 20.689721415656155, + "learning_rate": 1.5702113087444353e-05, + "loss": 2.1045, + "mean_token_accuracy": 0.5068965494632721, + "step": 15590 + }, + { + "epoch": 0.01570744963191488, + "grad_norm": 28.801084846283672, + "learning_rate": 1.5707149044175412e-05, + "loss": 2.5776, + "mean_token_accuracy": 0.37931033968925476, + "step": 15595 + }, + { + "epoch": 0.01571248568501905, + "grad_norm": 26.269572739616184, + "learning_rate": 1.5712185000906475e-05, + "loss": 2.2005, + "mean_token_accuracy": 0.45517241954803467, + "step": 15600 + }, + { + "epoch": 0.015717521738123224, + "grad_norm": 29.64383672673831, + "learning_rate": 1.571722095763753e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.43103447556495667, + "step": 15605 + }, + { + "epoch": 0.015722557791227398, + "grad_norm": 21.780053690574633, + "learning_rate": 1.5722256914368594e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.38275861740112305, + "step": 15610 + }, + { + "epoch": 0.015727593844331568, + "grad_norm": 27.077007276609294, + "learning_rate": 1.5727292871099653e-05, + "loss": 2.5873, + "mean_token_accuracy": 0.3724137932062149, + "step": 15615 + }, + { + "epoch": 0.015732629897435742, + "grad_norm": 23.68917863219754, + "learning_rate": 1.5732328827830712e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.4354679822921753, + "step": 15620 + }, + { + "epoch": 0.015737665950539916, + "grad_norm": 18.837769585846583, + "learning_rate": 1.573736478456177e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.43793103098869324, + "step": 15625 + }, + { + "epoch": 0.01574270200364409, + "grad_norm": 29.55895038432199, + "learning_rate": 1.574240074129283e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.41379310488700866, + "step": 15630 + }, + { + "epoch": 0.01574773805674826, + "grad_norm": 23.305301382521773, + "learning_rate": 1.574743669802389e-05, + "loss": 2.2443, + "mean_token_accuracy": 0.5137930929660797, + "step": 15635 + }, + { + "epoch": 0.015752774109852433, + "grad_norm": 22.872085066464045, + "learning_rate": 1.5752472654754952e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.44482759237289426, + "step": 15640 + }, + { + "epoch": 0.015757810162956607, + "grad_norm": 31.678905506034983, + "learning_rate": 1.5757508611486012e-05, + "loss": 2.8223, + "mean_token_accuracy": 0.35983060896396635, + "step": 15645 + }, + { + "epoch": 0.015762846216060777, + "grad_norm": 26.29372465340689, + "learning_rate": 1.576254456821707e-05, + "loss": 2.111, + "mean_token_accuracy": 0.47931034564971925, + "step": 15650 + }, + { + "epoch": 0.01576788226916495, + "grad_norm": 24.979648008473138, + "learning_rate": 1.576758052494813e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.4360556542873383, + "step": 15655 + }, + { + "epoch": 0.015772918322269125, + "grad_norm": 20.083755440355905, + "learning_rate": 1.577261648167919e-05, + "loss": 2.1284, + "mean_token_accuracy": 0.4931034505367279, + "step": 15660 + }, + { + "epoch": 0.0157779543753733, + "grad_norm": 25.446001775037097, + "learning_rate": 1.5777652438410252e-05, + "loss": 2.8251, + "mean_token_accuracy": 0.3655172407627106, + "step": 15665 + }, + { + "epoch": 0.01578299042847747, + "grad_norm": 24.14010394513108, + "learning_rate": 1.5782688395141308e-05, + "loss": 2.1684, + "mean_token_accuracy": 0.4918935298919678, + "step": 15670 + }, + { + "epoch": 0.015788026481581643, + "grad_norm": 40.70765718398589, + "learning_rate": 1.5787724351872367e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.4379310369491577, + "step": 15675 + }, + { + "epoch": 0.015793062534685817, + "grad_norm": 46.79036510826806, + "learning_rate": 1.579276030860343e-05, + "loss": 2.6341, + "mean_token_accuracy": 0.4379310369491577, + "step": 15680 + }, + { + "epoch": 0.015798098587789987, + "grad_norm": 25.23462316227281, + "learning_rate": 1.579779626533449e-05, + "loss": 2.1441, + "mean_token_accuracy": 0.46067755222320556, + "step": 15685 + }, + { + "epoch": 0.01580313464089416, + "grad_norm": 27.96837390920119, + "learning_rate": 1.580283222206555e-05, + "loss": 2.5682, + "mean_token_accuracy": 0.4068965494632721, + "step": 15690 + }, + { + "epoch": 0.015808170693998334, + "grad_norm": 28.13620690300876, + "learning_rate": 1.5807868178796608e-05, + "loss": 2.6107, + "mean_token_accuracy": 0.3931034505367279, + "step": 15695 + }, + { + "epoch": 0.015813206747102508, + "grad_norm": 26.96325482019206, + "learning_rate": 1.5812904135527667e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.4379310220479965, + "step": 15700 + }, + { + "epoch": 0.01581824280020668, + "grad_norm": 30.18764897032944, + "learning_rate": 1.581794009225873e-05, + "loss": 2.728, + "mean_token_accuracy": 0.37931033968925476, + "step": 15705 + }, + { + "epoch": 0.015823278853310852, + "grad_norm": 20.77718634717397, + "learning_rate": 1.582297604898979e-05, + "loss": 2.3238, + "mean_token_accuracy": 0.458620685338974, + "step": 15710 + }, + { + "epoch": 0.015828314906415026, + "grad_norm": 23.93752695363553, + "learning_rate": 1.5828012005720848e-05, + "loss": 2.3833, + "mean_token_accuracy": 0.4758620738983154, + "step": 15715 + }, + { + "epoch": 0.015833350959519196, + "grad_norm": 25.19312957998834, + "learning_rate": 1.5833047962451907e-05, + "loss": 2.05, + "mean_token_accuracy": 0.47410768866539, + "step": 15720 + }, + { + "epoch": 0.01583838701262337, + "grad_norm": 28.501528142587986, + "learning_rate": 1.5838083919182967e-05, + "loss": 2.3783, + "mean_token_accuracy": 0.43103448748588563, + "step": 15725 + }, + { + "epoch": 0.015843423065727544, + "grad_norm": 23.766406540567694, + "learning_rate": 1.584311987591403e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.41899576783180237, + "step": 15730 + }, + { + "epoch": 0.015848459118831718, + "grad_norm": 27.67939898804874, + "learning_rate": 1.584815583264509e-05, + "loss": 2.2029, + "mean_token_accuracy": 0.458620685338974, + "step": 15735 + }, + { + "epoch": 0.015853495171935888, + "grad_norm": 24.19068887361558, + "learning_rate": 1.5853191789376144e-05, + "loss": 2.2707, + "mean_token_accuracy": 0.4172413766384125, + "step": 15740 + }, + { + "epoch": 0.01585853122504006, + "grad_norm": 26.408644458985073, + "learning_rate": 1.5858227746107207e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.35172413289546967, + "step": 15745 + }, + { + "epoch": 0.015863567278144235, + "grad_norm": 18.221737377816677, + "learning_rate": 1.5863263702838266e-05, + "loss": 2.1366, + "mean_token_accuracy": 0.49655171632766726, + "step": 15750 + }, + { + "epoch": 0.015868603331248406, + "grad_norm": 25.412962717365165, + "learning_rate": 1.5868299659569325e-05, + "loss": 2.1603, + "mean_token_accuracy": 0.4724137902259827, + "step": 15755 + }, + { + "epoch": 0.01587363938435258, + "grad_norm": 29.229194550437484, + "learning_rate": 1.5873335616300385e-05, + "loss": 2.4608, + "mean_token_accuracy": 0.4068965494632721, + "step": 15760 + }, + { + "epoch": 0.015878675437456753, + "grad_norm": 24.21488829858154, + "learning_rate": 1.5878371573031444e-05, + "loss": 2.1328, + "mean_token_accuracy": 0.48965516686439514, + "step": 15765 + }, + { + "epoch": 0.015883711490560927, + "grad_norm": 29.291146162527546, + "learning_rate": 1.5883407529762507e-05, + "loss": 2.6086, + "mean_token_accuracy": 0.4068965554237366, + "step": 15770 + }, + { + "epoch": 0.015888747543665097, + "grad_norm": 35.60298936591672, + "learning_rate": 1.5888443486493566e-05, + "loss": 2.7648, + "mean_token_accuracy": 0.4, + "step": 15775 + }, + { + "epoch": 0.01589378359676927, + "grad_norm": 23.381797296031145, + "learning_rate": 1.5893479443224625e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.4655172348022461, + "step": 15780 + }, + { + "epoch": 0.015898819649873445, + "grad_norm": 25.299462976071695, + "learning_rate": 1.5898515399955684e-05, + "loss": 2.4132, + "mean_token_accuracy": 0.4517241358757019, + "step": 15785 + }, + { + "epoch": 0.015903855702977615, + "grad_norm": 25.921225227680083, + "learning_rate": 1.5903551356686744e-05, + "loss": 2.2967, + "mean_token_accuracy": 0.45246304869651793, + "step": 15790 + }, + { + "epoch": 0.01590889175608179, + "grad_norm": 29.139199157012364, + "learning_rate": 1.5908587313417803e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.4083484590053558, + "step": 15795 + }, + { + "epoch": 0.015913927809185963, + "grad_norm": 22.595354253050626, + "learning_rate": 1.5913623270148866e-05, + "loss": 2.4594, + "mean_token_accuracy": 0.4310344934463501, + "step": 15800 + }, + { + "epoch": 0.015918963862290136, + "grad_norm": 26.59467412170808, + "learning_rate": 1.591865922687992e-05, + "loss": 2.273, + "mean_token_accuracy": 0.4221415579319, + "step": 15805 + }, + { + "epoch": 0.015923999915394307, + "grad_norm": 31.676023878385156, + "learning_rate": 1.5923695183610984e-05, + "loss": 2.6697, + "mean_token_accuracy": 0.36896551251411436, + "step": 15810 + }, + { + "epoch": 0.01592903596849848, + "grad_norm": 22.191416333855827, + "learning_rate": 1.5928731140342043e-05, + "loss": 2.0939, + "mean_token_accuracy": 0.46982758641242983, + "step": 15815 + }, + { + "epoch": 0.015934072021602654, + "grad_norm": 23.957835292298817, + "learning_rate": 1.5933767097073103e-05, + "loss": 2.3493, + "mean_token_accuracy": 0.4310344815254211, + "step": 15820 + }, + { + "epoch": 0.015939108074706825, + "grad_norm": 22.71593701763225, + "learning_rate": 1.5938803053804162e-05, + "loss": 2.382, + "mean_token_accuracy": 0.4528325080871582, + "step": 15825 + }, + { + "epoch": 0.015944144127811, + "grad_norm": 24.683184400372657, + "learning_rate": 1.594383901053522e-05, + "loss": 2.4161, + "mean_token_accuracy": 0.3931034475564957, + "step": 15830 + }, + { + "epoch": 0.015949180180915172, + "grad_norm": 32.61330315995067, + "learning_rate": 1.594887496726628e-05, + "loss": 2.2772, + "mean_token_accuracy": 0.4620689630508423, + "step": 15835 + }, + { + "epoch": 0.015954216234019346, + "grad_norm": 20.358648699825046, + "learning_rate": 1.5953910923997343e-05, + "loss": 2.2344, + "mean_token_accuracy": 0.4068965494632721, + "step": 15840 + }, + { + "epoch": 0.015959252287123516, + "grad_norm": 25.81643722807954, + "learning_rate": 1.5958946880728402e-05, + "loss": 2.4167, + "mean_token_accuracy": 0.4344827592372894, + "step": 15845 + }, + { + "epoch": 0.01596428834022769, + "grad_norm": 24.8563934157008, + "learning_rate": 1.596398283745946e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.4344827592372894, + "step": 15850 + }, + { + "epoch": 0.015969324393331864, + "grad_norm": 35.92665779226816, + "learning_rate": 1.596901879419052e-05, + "loss": 2.564, + "mean_token_accuracy": 0.4034482717514038, + "step": 15855 + }, + { + "epoch": 0.015974360446436034, + "grad_norm": 27.996171339057042, + "learning_rate": 1.597405475092158e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.40859044194221494, + "step": 15860 + }, + { + "epoch": 0.015979396499540208, + "grad_norm": 40.1817657857375, + "learning_rate": 1.5979090707652643e-05, + "loss": 2.3577, + "mean_token_accuracy": 0.4379310369491577, + "step": 15865 + }, + { + "epoch": 0.01598443255264438, + "grad_norm": 23.95405503722282, + "learning_rate": 1.59841266643837e-05, + "loss": 2.2048, + "mean_token_accuracy": 0.4551724076271057, + "step": 15870 + }, + { + "epoch": 0.015989468605748555, + "grad_norm": 28.213042977269204, + "learning_rate": 1.5989162621114758e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.45408348441123964, + "step": 15875 + }, + { + "epoch": 0.015994504658852726, + "grad_norm": 26.55424489285476, + "learning_rate": 1.599419857784582e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.42068966031074523, + "step": 15880 + }, + { + "epoch": 0.0159995407119569, + "grad_norm": 22.234568172741824, + "learning_rate": 1.599923453457688e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.41034482717514037, + "step": 15885 + }, + { + "epoch": 0.016004576765061073, + "grad_norm": 23.47598604754119, + "learning_rate": 1.600427049130794e-05, + "loss": 2.1826, + "mean_token_accuracy": 0.4137930929660797, + "step": 15890 + }, + { + "epoch": 0.016009612818165243, + "grad_norm": 24.63537524759791, + "learning_rate": 1.6009306448038998e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.4465819776058197, + "step": 15895 + }, + { + "epoch": 0.016014648871269417, + "grad_norm": 24.126278888168514, + "learning_rate": 1.6014342404770057e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.4053236603736877, + "step": 15900 + }, + { + "epoch": 0.01601968492437359, + "grad_norm": 25.096587721640475, + "learning_rate": 1.601937836150112e-05, + "loss": 2.6339, + "mean_token_accuracy": 0.334482753276825, + "step": 15905 + }, + { + "epoch": 0.016024720977477765, + "grad_norm": 23.967545157840846, + "learning_rate": 1.602441431823218e-05, + "loss": 2.185, + "mean_token_accuracy": 0.48784029483795166, + "step": 15910 + }, + { + "epoch": 0.016029757030581935, + "grad_norm": 35.22146616714449, + "learning_rate": 1.602945027496324e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.40689654350280763, + "step": 15915 + }, + { + "epoch": 0.01603479308368611, + "grad_norm": 27.901641006268484, + "learning_rate": 1.6034486231694298e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.4034482777118683, + "step": 15920 + }, + { + "epoch": 0.016039829136790282, + "grad_norm": 25.70249742300798, + "learning_rate": 1.6039522188425357e-05, + "loss": 2.5246, + "mean_token_accuracy": 0.4310344815254211, + "step": 15925 + }, + { + "epoch": 0.016044865189894453, + "grad_norm": 24.771958903437202, + "learning_rate": 1.604455814515642e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.41724138259887694, + "step": 15930 + }, + { + "epoch": 0.016049901242998627, + "grad_norm": 29.341028345696607, + "learning_rate": 1.604959410188748e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.43333333134651186, + "step": 15935 + }, + { + "epoch": 0.0160549372961028, + "grad_norm": 27.489158950671673, + "learning_rate": 1.6054630058618535e-05, + "loss": 2.6895, + "mean_token_accuracy": 0.37586207389831544, + "step": 15940 + }, + { + "epoch": 0.016059973349206974, + "grad_norm": 26.6104073922447, + "learning_rate": 1.6059666015349597e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.37931033968925476, + "step": 15945 + }, + { + "epoch": 0.016065009402311144, + "grad_norm": 24.3907766332011, + "learning_rate": 1.6064701972080657e-05, + "loss": 2.1662, + "mean_token_accuracy": 0.46896551847457885, + "step": 15950 + }, + { + "epoch": 0.016070045455415318, + "grad_norm": 21.092949351361437, + "learning_rate": 1.606973792881172e-05, + "loss": 2.2405, + "mean_token_accuracy": 0.4379310369491577, + "step": 15955 + }, + { + "epoch": 0.016075081508519492, + "grad_norm": 27.576884189386345, + "learning_rate": 1.6074773885542775e-05, + "loss": 2.513, + "mean_token_accuracy": 0.45359951853752134, + "step": 15960 + }, + { + "epoch": 0.016080117561623662, + "grad_norm": 24.217079033603405, + "learning_rate": 1.6079809842273834e-05, + "loss": 2.132, + "mean_token_accuracy": 0.4724137902259827, + "step": 15965 + }, + { + "epoch": 0.016085153614727836, + "grad_norm": 32.857801870504055, + "learning_rate": 1.6084845799004897e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.43103447556495667, + "step": 15970 + }, + { + "epoch": 0.01609018966783201, + "grad_norm": 26.15977713710439, + "learning_rate": 1.6089881755735956e-05, + "loss": 2.0777, + "mean_token_accuracy": 0.4310344815254211, + "step": 15975 + }, + { + "epoch": 0.016095225720936183, + "grad_norm": 29.713938267381558, + "learning_rate": 1.6094917712467016e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.3931034475564957, + "step": 15980 + }, + { + "epoch": 0.016100261774040354, + "grad_norm": 27.288597723397817, + "learning_rate": 1.6099953669198075e-05, + "loss": 2.1992, + "mean_token_accuracy": 0.4103448331356049, + "step": 15985 + }, + { + "epoch": 0.016105297827144528, + "grad_norm": 25.287181102055378, + "learning_rate": 1.6104989625929134e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4379310369491577, + "step": 15990 + }, + { + "epoch": 0.0161103338802487, + "grad_norm": 26.393151576672576, + "learning_rate": 1.6110025582660197e-05, + "loss": 2.3617, + "mean_token_accuracy": 0.42758620977401735, + "step": 15995 + }, + { + "epoch": 0.01611536993335287, + "grad_norm": 35.85765610785674, + "learning_rate": 1.6115061539391256e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.46551724672317507, + "step": 16000 + }, + { + "epoch": 0.016120405986457045, + "grad_norm": 19.665480402671054, + "learning_rate": 1.6120097496122312e-05, + "loss": 2.3023, + "mean_token_accuracy": 0.41034482717514037, + "step": 16005 + }, + { + "epoch": 0.01612544203956122, + "grad_norm": 31.828767051237428, + "learning_rate": 1.6125133452853374e-05, + "loss": 2.4721, + "mean_token_accuracy": 0.4241379380226135, + "step": 16010 + }, + { + "epoch": 0.016130478092665393, + "grad_norm": 20.483178545507872, + "learning_rate": 1.6130169409584434e-05, + "loss": 2.2288, + "mean_token_accuracy": 0.4689655125141144, + "step": 16015 + }, + { + "epoch": 0.016135514145769563, + "grad_norm": 23.6914917792498, + "learning_rate": 1.6135205366315493e-05, + "loss": 2.6866, + "mean_token_accuracy": 0.3517241418361664, + "step": 16020 + }, + { + "epoch": 0.016140550198873737, + "grad_norm": 23.035257388334024, + "learning_rate": 1.6140241323046552e-05, + "loss": 2.3521, + "mean_token_accuracy": 0.43103448748588563, + "step": 16025 + }, + { + "epoch": 0.01614558625197791, + "grad_norm": 28.04894710546101, + "learning_rate": 1.614527727977761e-05, + "loss": 2.6548, + "mean_token_accuracy": 0.3620689570903778, + "step": 16030 + }, + { + "epoch": 0.01615062230508208, + "grad_norm": 26.162963037248577, + "learning_rate": 1.6150313236508674e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.38620689511299133, + "step": 16035 + }, + { + "epoch": 0.016155658358186255, + "grad_norm": 18.915327482976778, + "learning_rate": 1.6155349193239733e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.43986691236495973, + "step": 16040 + }, + { + "epoch": 0.01616069441129043, + "grad_norm": 20.01961908454445, + "learning_rate": 1.6160385149970793e-05, + "loss": 2.3729, + "mean_token_accuracy": 0.4068965494632721, + "step": 16045 + }, + { + "epoch": 0.016165730464394602, + "grad_norm": 25.79690408161775, + "learning_rate": 1.6165421106701852e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.32413792610168457, + "step": 16050 + }, + { + "epoch": 0.016170766517498773, + "grad_norm": 30.41181267883847, + "learning_rate": 1.617045706343291e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.46551724076271056, + "step": 16055 + }, + { + "epoch": 0.016175802570602946, + "grad_norm": 22.211689171292427, + "learning_rate": 1.617549302016397e-05, + "loss": 2.1925, + "mean_token_accuracy": 0.4413793087005615, + "step": 16060 + }, + { + "epoch": 0.01618083862370712, + "grad_norm": 25.39401378285009, + "learning_rate": 1.6180528976895033e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.3482758581638336, + "step": 16065 + }, + { + "epoch": 0.01618587467681129, + "grad_norm": 26.027902502935834, + "learning_rate": 1.6185564933626092e-05, + "loss": 2.2322, + "mean_token_accuracy": 0.4586206912994385, + "step": 16070 + }, + { + "epoch": 0.016190910729915464, + "grad_norm": 27.80210475261293, + "learning_rate": 1.619060089035715e-05, + "loss": 2.8137, + "mean_token_accuracy": 0.3620689630508423, + "step": 16075 + }, + { + "epoch": 0.016195946783019638, + "grad_norm": 31.22774236205814, + "learning_rate": 1.619563684708821e-05, + "loss": 2.6171, + "mean_token_accuracy": 0.3931034505367279, + "step": 16080 + }, + { + "epoch": 0.016200982836123812, + "grad_norm": 26.954897007995886, + "learning_rate": 1.620067280381927e-05, + "loss": 2.751, + "mean_token_accuracy": 0.4413793087005615, + "step": 16085 + }, + { + "epoch": 0.016206018889227982, + "grad_norm": 19.33062368570597, + "learning_rate": 1.6205708760550333e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.43103448748588563, + "step": 16090 + }, + { + "epoch": 0.016211054942332156, + "grad_norm": 24.381737230547273, + "learning_rate": 1.621074471728139e-05, + "loss": 2.2871, + "mean_token_accuracy": 0.42413793206214906, + "step": 16095 + }, + { + "epoch": 0.01621609099543633, + "grad_norm": 23.581354748241136, + "learning_rate": 1.6215780674012448e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.38620689511299133, + "step": 16100 + }, + { + "epoch": 0.0162211270485405, + "grad_norm": 23.858839747082953, + "learning_rate": 1.622081663074351e-05, + "loss": 2.282, + "mean_token_accuracy": 0.43793103098869324, + "step": 16105 + }, + { + "epoch": 0.016226163101644674, + "grad_norm": 17.098119307804446, + "learning_rate": 1.622585258747457e-05, + "loss": 1.8045, + "mean_token_accuracy": 0.5241379380226135, + "step": 16110 + }, + { + "epoch": 0.016231199154748847, + "grad_norm": 26.61697143943336, + "learning_rate": 1.623088854420563e-05, + "loss": 2.4687, + "mean_token_accuracy": 0.42413793206214906, + "step": 16115 + }, + { + "epoch": 0.01623623520785302, + "grad_norm": 25.163525739575366, + "learning_rate": 1.6235924500936688e-05, + "loss": 2.2006, + "mean_token_accuracy": 0.44289171099662783, + "step": 16120 + }, + { + "epoch": 0.01624127126095719, + "grad_norm": 25.925235005717155, + "learning_rate": 1.6240960457667747e-05, + "loss": 2.3186, + "mean_token_accuracy": 0.4310344815254211, + "step": 16125 + }, + { + "epoch": 0.016246307314061365, + "grad_norm": 35.864643725650254, + "learning_rate": 1.624599641439881e-05, + "loss": 2.6501, + "mean_token_accuracy": 0.4034482777118683, + "step": 16130 + }, + { + "epoch": 0.01625134336716554, + "grad_norm": 20.2869681221461, + "learning_rate": 1.625103237112987e-05, + "loss": 2.1231, + "mean_token_accuracy": 0.44482757449150084, + "step": 16135 + }, + { + "epoch": 0.01625637942026971, + "grad_norm": 27.069352429360414, + "learning_rate": 1.6256068327860925e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.3896551728248596, + "step": 16140 + }, + { + "epoch": 0.016261415473373883, + "grad_norm": 25.99181200103059, + "learning_rate": 1.6261104284591988e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.40689654350280763, + "step": 16145 + }, + { + "epoch": 0.016266451526478057, + "grad_norm": 21.896147953216605, + "learning_rate": 1.6266140241323047e-05, + "loss": 2.2066, + "mean_token_accuracy": 0.4482758641242981, + "step": 16150 + }, + { + "epoch": 0.01627148757958223, + "grad_norm": 32.932150703178834, + "learning_rate": 1.627117619805411e-05, + "loss": 2.3894, + "mean_token_accuracy": 0.4068965554237366, + "step": 16155 + }, + { + "epoch": 0.0162765236326864, + "grad_norm": 26.24690299402618, + "learning_rate": 1.6276212154785166e-05, + "loss": 2.0487, + "mean_token_accuracy": 0.4482758641242981, + "step": 16160 + }, + { + "epoch": 0.016281559685790575, + "grad_norm": 24.052027266493173, + "learning_rate": 1.6281248111516225e-05, + "loss": 2.7884, + "mean_token_accuracy": 0.36896551847457887, + "step": 16165 + }, + { + "epoch": 0.01628659573889475, + "grad_norm": 27.82847029412751, + "learning_rate": 1.6286284068247288e-05, + "loss": 2.378, + "mean_token_accuracy": 0.42068964838981626, + "step": 16170 + }, + { + "epoch": 0.01629163179199892, + "grad_norm": 31.713487553074717, + "learning_rate": 1.6291320024978347e-05, + "loss": 2.7123, + "mean_token_accuracy": 0.3724137842655182, + "step": 16175 + }, + { + "epoch": 0.016296667845103092, + "grad_norm": 22.310470652978424, + "learning_rate": 1.6296355981709406e-05, + "loss": 2.5887, + "mean_token_accuracy": 0.37931033968925476, + "step": 16180 + }, + { + "epoch": 0.016301703898207266, + "grad_norm": 18.85288515149046, + "learning_rate": 1.6301391938440465e-05, + "loss": 2.2143, + "mean_token_accuracy": 0.4344827651977539, + "step": 16185 + }, + { + "epoch": 0.01630673995131144, + "grad_norm": 26.4457753751189, + "learning_rate": 1.6306427895171525e-05, + "loss": 2.6684, + "mean_token_accuracy": 0.39655172228813174, + "step": 16190 + }, + { + "epoch": 0.01631177600441561, + "grad_norm": 21.654323523382995, + "learning_rate": 1.6311463851902587e-05, + "loss": 1.9999, + "mean_token_accuracy": 0.5379310369491577, + "step": 16195 + }, + { + "epoch": 0.016316812057519784, + "grad_norm": 28.013841265705846, + "learning_rate": 1.6316499808633646e-05, + "loss": 2.7621, + "mean_token_accuracy": 0.40689654350280763, + "step": 16200 + }, + { + "epoch": 0.016321848110623958, + "grad_norm": 28.508231338281014, + "learning_rate": 1.6321535765364702e-05, + "loss": 2.5796, + "mean_token_accuracy": 0.37586206793785093, + "step": 16205 + }, + { + "epoch": 0.016326884163728128, + "grad_norm": 27.295748122135567, + "learning_rate": 1.6326571722095765e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.4068965494632721, + "step": 16210 + }, + { + "epoch": 0.016331920216832302, + "grad_norm": 31.794370345097523, + "learning_rate": 1.6331607678826824e-05, + "loss": 2.2109, + "mean_token_accuracy": 0.42758620381355283, + "step": 16215 + }, + { + "epoch": 0.016336956269936476, + "grad_norm": 25.54876472381247, + "learning_rate": 1.6336643635557883e-05, + "loss": 2.585, + "mean_token_accuracy": 0.41578946709632875, + "step": 16220 + }, + { + "epoch": 0.01634199232304065, + "grad_norm": 18.84254917369886, + "learning_rate": 1.6341679592288943e-05, + "loss": 2.2133, + "mean_token_accuracy": 0.441379314661026, + "step": 16225 + }, + { + "epoch": 0.01634702837614482, + "grad_norm": 25.790788684532103, + "learning_rate": 1.6346715549020002e-05, + "loss": 2.177, + "mean_token_accuracy": 0.4896551609039307, + "step": 16230 + }, + { + "epoch": 0.016352064429248993, + "grad_norm": 36.49193846036397, + "learning_rate": 1.6351751505751065e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.4206896543502808, + "step": 16235 + }, + { + "epoch": 0.016357100482353167, + "grad_norm": 29.154792125619448, + "learning_rate": 1.6356787462482124e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.4137931078672409, + "step": 16240 + }, + { + "epoch": 0.016362136535457338, + "grad_norm": 23.948759302242753, + "learning_rate": 1.6361823419213183e-05, + "loss": 2.1343, + "mean_token_accuracy": 0.4982456147670746, + "step": 16245 + }, + { + "epoch": 0.01636717258856151, + "grad_norm": 30.949804075670286, + "learning_rate": 1.6366859375944242e-05, + "loss": 2.2667, + "mean_token_accuracy": 0.482758617401123, + "step": 16250 + }, + { + "epoch": 0.016372208641665685, + "grad_norm": 25.926615500117563, + "learning_rate": 1.63718953326753e-05, + "loss": 2.5455, + "mean_token_accuracy": 0.4137930989265442, + "step": 16255 + }, + { + "epoch": 0.01637724469476986, + "grad_norm": 27.231062987404485, + "learning_rate": 1.637693128940636e-05, + "loss": 2.3902, + "mean_token_accuracy": 0.42758620381355283, + "step": 16260 + }, + { + "epoch": 0.01638228074787403, + "grad_norm": 21.267875106576117, + "learning_rate": 1.6381967246137423e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.4344827651977539, + "step": 16265 + }, + { + "epoch": 0.016387316800978203, + "grad_norm": 23.679544749161433, + "learning_rate": 1.6387003202868483e-05, + "loss": 2.5736, + "mean_token_accuracy": 0.3931034505367279, + "step": 16270 + }, + { + "epoch": 0.016392352854082377, + "grad_norm": 34.90922485444246, + "learning_rate": 1.6392039159599542e-05, + "loss": 2.5651, + "mean_token_accuracy": 0.3655172407627106, + "step": 16275 + }, + { + "epoch": 0.016397388907186547, + "grad_norm": 27.663061049582694, + "learning_rate": 1.63970751163306e-05, + "loss": 2.3567, + "mean_token_accuracy": 0.42758620977401735, + "step": 16280 + }, + { + "epoch": 0.01640242496029072, + "grad_norm": 24.40515639467733, + "learning_rate": 1.640211107306166e-05, + "loss": 2.694, + "mean_token_accuracy": 0.4379310369491577, + "step": 16285 + }, + { + "epoch": 0.016407461013394894, + "grad_norm": 30.254389304531294, + "learning_rate": 1.6407147029792723e-05, + "loss": 2.0815, + "mean_token_accuracy": 0.42068964838981626, + "step": 16290 + }, + { + "epoch": 0.016412497066499068, + "grad_norm": 33.502771661139455, + "learning_rate": 1.641218298652378e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.41379310488700866, + "step": 16295 + }, + { + "epoch": 0.01641753311960324, + "grad_norm": 33.652929006096365, + "learning_rate": 1.6417218943254838e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.44658197164535524, + "step": 16300 + }, + { + "epoch": 0.016422569172707412, + "grad_norm": 18.406245554811697, + "learning_rate": 1.64222548999859e-05, + "loss": 2.2587, + "mean_token_accuracy": 0.42413792610168455, + "step": 16305 + }, + { + "epoch": 0.016427605225811586, + "grad_norm": 28.860191862911947, + "learning_rate": 1.642729085671696e-05, + "loss": 2.2777, + "mean_token_accuracy": 0.44482759237289426, + "step": 16310 + }, + { + "epoch": 0.016432641278915756, + "grad_norm": 24.594454152829588, + "learning_rate": 1.643232681344802e-05, + "loss": 2.499, + "mean_token_accuracy": 0.4034482717514038, + "step": 16315 + }, + { + "epoch": 0.01643767733201993, + "grad_norm": 31.766144385372044, + "learning_rate": 1.643736277017908e-05, + "loss": 2.1385, + "mean_token_accuracy": 0.46896551847457885, + "step": 16320 + }, + { + "epoch": 0.016442713385124104, + "grad_norm": 31.475708951014706, + "learning_rate": 1.6442398726910138e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.4310344815254211, + "step": 16325 + }, + { + "epoch": 0.016447749438228278, + "grad_norm": 22.84072727117869, + "learning_rate": 1.64474346836412e-05, + "loss": 2.1678, + "mean_token_accuracy": 0.4172413766384125, + "step": 16330 + }, + { + "epoch": 0.016452785491332448, + "grad_norm": 22.68620768044315, + "learning_rate": 1.645247064037226e-05, + "loss": 2.2783, + "mean_token_accuracy": 0.41379310488700866, + "step": 16335 + }, + { + "epoch": 0.016457821544436622, + "grad_norm": 25.42061216455319, + "learning_rate": 1.6457506597103316e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.4241379380226135, + "step": 16340 + }, + { + "epoch": 0.016462857597540796, + "grad_norm": 21.347266167333927, + "learning_rate": 1.6462542553834378e-05, + "loss": 2.2483, + "mean_token_accuracy": 0.4724137902259827, + "step": 16345 + }, + { + "epoch": 0.016467893650644966, + "grad_norm": 23.976569145358116, + "learning_rate": 1.6467578510565438e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.41724138557910917, + "step": 16350 + }, + { + "epoch": 0.01647292970374914, + "grad_norm": 22.645448624959368, + "learning_rate": 1.64726144672965e-05, + "loss": 2.3178, + "mean_token_accuracy": 0.4517241358757019, + "step": 16355 + }, + { + "epoch": 0.016477965756853313, + "grad_norm": 33.61058309427705, + "learning_rate": 1.6477650424027556e-05, + "loss": 3.0406, + "mean_token_accuracy": 0.36206896007061007, + "step": 16360 + }, + { + "epoch": 0.016483001809957487, + "grad_norm": 18.299279680385496, + "learning_rate": 1.6482686380758615e-05, + "loss": 2.414, + "mean_token_accuracy": 0.4206896543502808, + "step": 16365 + }, + { + "epoch": 0.016488037863061657, + "grad_norm": 26.716352235897503, + "learning_rate": 1.6487722337489678e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.3482758641242981, + "step": 16370 + }, + { + "epoch": 0.01649307391616583, + "grad_norm": 25.03171105871525, + "learning_rate": 1.6492758294220737e-05, + "loss": 2.444, + "mean_token_accuracy": 0.46551724672317507, + "step": 16375 + }, + { + "epoch": 0.016498109969270005, + "grad_norm": 30.903829283500382, + "learning_rate": 1.6497794250951796e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.4655172348022461, + "step": 16380 + }, + { + "epoch": 0.016503146022374175, + "grad_norm": 31.945931610683598, + "learning_rate": 1.6502830207682856e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.42068965137004855, + "step": 16385 + }, + { + "epoch": 0.01650818207547835, + "grad_norm": 21.596168913162956, + "learning_rate": 1.6507866164413915e-05, + "loss": 2.3104, + "mean_token_accuracy": 0.4517241358757019, + "step": 16390 + }, + { + "epoch": 0.016513218128582523, + "grad_norm": 26.8934510304877, + "learning_rate": 1.6512902121144978e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.42068966031074523, + "step": 16395 + }, + { + "epoch": 0.016518254181686697, + "grad_norm": 27.573908605537593, + "learning_rate": 1.6517938077876037e-05, + "loss": 2.3879, + "mean_token_accuracy": 0.4310344815254211, + "step": 16400 + }, + { + "epoch": 0.016523290234790867, + "grad_norm": 28.291771619159178, + "learning_rate": 1.6522974034607093e-05, + "loss": 2.3594, + "mean_token_accuracy": 0.4620689570903778, + "step": 16405 + }, + { + "epoch": 0.01652832628789504, + "grad_norm": 23.505598476403605, + "learning_rate": 1.6528009991338155e-05, + "loss": 2.569, + "mean_token_accuracy": 0.4172413766384125, + "step": 16410 + }, + { + "epoch": 0.016533362340999214, + "grad_norm": 31.102384970986943, + "learning_rate": 1.6533045948069215e-05, + "loss": 2.132, + "mean_token_accuracy": 0.441379314661026, + "step": 16415 + }, + { + "epoch": 0.016538398394103385, + "grad_norm": 31.089726723547216, + "learning_rate": 1.6538081904800277e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.3931034505367279, + "step": 16420 + }, + { + "epoch": 0.01654343444720756, + "grad_norm": 28.617932566672557, + "learning_rate": 1.6543117861531333e-05, + "loss": 2.804, + "mean_token_accuracy": 0.3724137932062149, + "step": 16425 + }, + { + "epoch": 0.016548470500311732, + "grad_norm": 19.99098296598657, + "learning_rate": 1.6548153818262392e-05, + "loss": 2.158, + "mean_token_accuracy": 0.4931034564971924, + "step": 16430 + }, + { + "epoch": 0.016553506553415906, + "grad_norm": 25.670661093428624, + "learning_rate": 1.6553189774993455e-05, + "loss": 2.518, + "mean_token_accuracy": 0.41379310488700866, + "step": 16435 + }, + { + "epoch": 0.016558542606520076, + "grad_norm": 22.060563784602415, + "learning_rate": 1.6558225731724514e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.39655172228813174, + "step": 16440 + }, + { + "epoch": 0.01656357865962425, + "grad_norm": 18.920481955920355, + "learning_rate": 1.6563261688455574e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.43587417602539064, + "step": 16445 + }, + { + "epoch": 0.016568614712728424, + "grad_norm": 20.766823787273278, + "learning_rate": 1.6568297645186633e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.3896551787853241, + "step": 16450 + }, + { + "epoch": 0.016573650765832594, + "grad_norm": 23.010608740906083, + "learning_rate": 1.6573333601917692e-05, + "loss": 2.6799, + "mean_token_accuracy": 0.37586206793785093, + "step": 16455 + }, + { + "epoch": 0.016578686818936768, + "grad_norm": 20.460391920551803, + "learning_rate": 1.6578369558648755e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.4, + "step": 16460 + }, + { + "epoch": 0.01658372287204094, + "grad_norm": 24.95671465400697, + "learning_rate": 1.6583405515379814e-05, + "loss": 2.273, + "mean_token_accuracy": 0.5018753707408905, + "step": 16465 + }, + { + "epoch": 0.016588758925145115, + "grad_norm": 25.306175338911693, + "learning_rate": 1.6588441472110873e-05, + "loss": 1.933, + "mean_token_accuracy": 0.46551724076271056, + "step": 16470 + }, + { + "epoch": 0.016593794978249286, + "grad_norm": 25.10691877592691, + "learning_rate": 1.6593477428841932e-05, + "loss": 2.341, + "mean_token_accuracy": 0.45172412395477296, + "step": 16475 + }, + { + "epoch": 0.01659883103135346, + "grad_norm": 20.494545115912963, + "learning_rate": 1.659851338557299e-05, + "loss": 2.5433, + "mean_token_accuracy": 0.41034482717514037, + "step": 16480 + }, + { + "epoch": 0.016603867084457633, + "grad_norm": 29.08519778130725, + "learning_rate": 1.660354934230405e-05, + "loss": 2.552, + "mean_token_accuracy": 0.3896551728248596, + "step": 16485 + }, + { + "epoch": 0.016608903137561803, + "grad_norm": 38.19694701792948, + "learning_rate": 1.6608585299035114e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.4172413766384125, + "step": 16490 + }, + { + "epoch": 0.016613939190665977, + "grad_norm": 30.46599049650939, + "learning_rate": 1.661362125576617e-05, + "loss": 2.4659, + "mean_token_accuracy": 0.3965517282485962, + "step": 16495 + }, + { + "epoch": 0.01661897524377015, + "grad_norm": 25.081106031779544, + "learning_rate": 1.6618657212497232e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.4344827592372894, + "step": 16500 + }, + { + "epoch": 0.016624011296874325, + "grad_norm": 22.516074121367467, + "learning_rate": 1.662369316922829e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.441379314661026, + "step": 16505 + }, + { + "epoch": 0.016629047349978495, + "grad_norm": 25.04593411553964, + "learning_rate": 1.662872912595935e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.4689655125141144, + "step": 16510 + }, + { + "epoch": 0.01663408340308267, + "grad_norm": 26.22121289626016, + "learning_rate": 1.663376508269041e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.4551724135875702, + "step": 16515 + }, + { + "epoch": 0.016639119456186843, + "grad_norm": 26.259929173846718, + "learning_rate": 1.663880103942147e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.42758620977401735, + "step": 16520 + }, + { + "epoch": 0.016644155509291013, + "grad_norm": 26.960765378942927, + "learning_rate": 1.664383699615253e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.48275862336158754, + "step": 16525 + }, + { + "epoch": 0.016649191562395187, + "grad_norm": 25.882408101246885, + "learning_rate": 1.664887295288359e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.42758620381355283, + "step": 16530 + }, + { + "epoch": 0.01665422761549936, + "grad_norm": 32.53124626375646, + "learning_rate": 1.665390890961465e-05, + "loss": 2.1684, + "mean_token_accuracy": 0.5344827592372894, + "step": 16535 + }, + { + "epoch": 0.016659263668603534, + "grad_norm": 23.457614859671715, + "learning_rate": 1.665894486634571e-05, + "loss": 2.2992, + "mean_token_accuracy": 0.43103448748588563, + "step": 16540 + }, + { + "epoch": 0.016664299721707704, + "grad_norm": 22.57790746846105, + "learning_rate": 1.666398082307677e-05, + "loss": 2.5344, + "mean_token_accuracy": 0.43793103098869324, + "step": 16545 + }, + { + "epoch": 0.016669335774811878, + "grad_norm": 23.55183037302489, + "learning_rate": 1.6669016779807828e-05, + "loss": 2.1971, + "mean_token_accuracy": 0.4363581418991089, + "step": 16550 + }, + { + "epoch": 0.016674371827916052, + "grad_norm": 31.813446379687424, + "learning_rate": 1.667405273653889e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.4330308437347412, + "step": 16555 + }, + { + "epoch": 0.016679407881020222, + "grad_norm": 28.010272178958477, + "learning_rate": 1.6679088693269947e-05, + "loss": 2.1711, + "mean_token_accuracy": 0.44482758045196535, + "step": 16560 + }, + { + "epoch": 0.016684443934124396, + "grad_norm": 24.766228205349133, + "learning_rate": 1.6684124650001006e-05, + "loss": 2.531, + "mean_token_accuracy": 0.46551724076271056, + "step": 16565 + }, + { + "epoch": 0.01668947998722857, + "grad_norm": 32.626409140469605, + "learning_rate": 1.668916060673207e-05, + "loss": 2.3574, + "mean_token_accuracy": 0.41034482717514037, + "step": 16570 + }, + { + "epoch": 0.016694516040332744, + "grad_norm": 28.68895241745584, + "learning_rate": 1.6694196563463128e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.41379310488700866, + "step": 16575 + }, + { + "epoch": 0.016699552093436914, + "grad_norm": 24.786791611751994, + "learning_rate": 1.6699232520194187e-05, + "loss": 2.2697, + "mean_token_accuracy": 0.4448275864124298, + "step": 16580 + }, + { + "epoch": 0.016704588146541088, + "grad_norm": 22.849364945602453, + "learning_rate": 1.6704268476925246e-05, + "loss": 2.2101, + "mean_token_accuracy": 0.4206896543502808, + "step": 16585 + }, + { + "epoch": 0.01670962419964526, + "grad_norm": 28.815823615461213, + "learning_rate": 1.6709304433656305e-05, + "loss": 2.3716, + "mean_token_accuracy": 0.441379314661026, + "step": 16590 + }, + { + "epoch": 0.016714660252749432, + "grad_norm": 25.67189319869878, + "learning_rate": 1.6714340390387368e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.3827586233615875, + "step": 16595 + }, + { + "epoch": 0.016719696305853606, + "grad_norm": 20.250273980546684, + "learning_rate": 1.6719376347118427e-05, + "loss": 2.1972, + "mean_token_accuracy": 0.4241379201412201, + "step": 16600 + }, + { + "epoch": 0.01672473235895778, + "grad_norm": 27.812835609853447, + "learning_rate": 1.6724412303849487e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.39655171930789945, + "step": 16605 + }, + { + "epoch": 0.016729768412061953, + "grad_norm": 27.02153546562052, + "learning_rate": 1.6729448260580546e-05, + "loss": 2.2211, + "mean_token_accuracy": 0.42068966031074523, + "step": 16610 + }, + { + "epoch": 0.016734804465166123, + "grad_norm": 25.07533165800941, + "learning_rate": 1.6734484217311605e-05, + "loss": 2.691, + "mean_token_accuracy": 0.3862068891525269, + "step": 16615 + }, + { + "epoch": 0.016739840518270297, + "grad_norm": 19.262066873221702, + "learning_rate": 1.6739520174042668e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.4034482777118683, + "step": 16620 + }, + { + "epoch": 0.01674487657137447, + "grad_norm": 27.630239723381568, + "learning_rate": 1.6744556130773727e-05, + "loss": 2.4266, + "mean_token_accuracy": 0.4330308556556702, + "step": 16625 + }, + { + "epoch": 0.01674991262447864, + "grad_norm": 28.69394637716775, + "learning_rate": 1.6749592087504783e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.4034482777118683, + "step": 16630 + }, + { + "epoch": 0.016754948677582815, + "grad_norm": 26.138878231836326, + "learning_rate": 1.6754628044235845e-05, + "loss": 2.3082, + "mean_token_accuracy": 0.4980641186237335, + "step": 16635 + }, + { + "epoch": 0.01675998473068699, + "grad_norm": 24.53681925560233, + "learning_rate": 1.6759664000966905e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.43103448748588563, + "step": 16640 + }, + { + "epoch": 0.016765020783791162, + "grad_norm": 21.98524845882193, + "learning_rate": 1.6764699957697964e-05, + "loss": 2.0146, + "mean_token_accuracy": 0.5160314559936523, + "step": 16645 + }, + { + "epoch": 0.016770056836895333, + "grad_norm": 25.55489798229527, + "learning_rate": 1.6769735914429023e-05, + "loss": 2.0501, + "mean_token_accuracy": 0.5034482717514038, + "step": 16650 + }, + { + "epoch": 0.016775092889999507, + "grad_norm": 29.455896722420146, + "learning_rate": 1.6774771871160082e-05, + "loss": 2.4353, + "mean_token_accuracy": 0.36896551847457887, + "step": 16655 + }, + { + "epoch": 0.01678012894310368, + "grad_norm": 31.003228995006612, + "learning_rate": 1.6779807827891145e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.4034482717514038, + "step": 16660 + }, + { + "epoch": 0.01678516499620785, + "grad_norm": 25.38053425177616, + "learning_rate": 1.6784843784622204e-05, + "loss": 2.3268, + "mean_token_accuracy": 0.4551724135875702, + "step": 16665 + }, + { + "epoch": 0.016790201049312024, + "grad_norm": 21.760982674378607, + "learning_rate": 1.6789879741353264e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.3551724076271057, + "step": 16670 + }, + { + "epoch": 0.016795237102416198, + "grad_norm": 24.404175760634832, + "learning_rate": 1.6794915698084323e-05, + "loss": 2.4728, + "mean_token_accuracy": 0.41724138259887694, + "step": 16675 + }, + { + "epoch": 0.016800273155520372, + "grad_norm": 21.991555110528008, + "learning_rate": 1.6799951654815382e-05, + "loss": 2.2598, + "mean_token_accuracy": 0.4551724135875702, + "step": 16680 + }, + { + "epoch": 0.016805309208624542, + "grad_norm": 22.362492828740233, + "learning_rate": 1.680498761154644e-05, + "loss": 2.3174, + "mean_token_accuracy": 0.4, + "step": 16685 + }, + { + "epoch": 0.016810345261728716, + "grad_norm": 19.596537434292678, + "learning_rate": 1.6810023568277504e-05, + "loss": 2.2956, + "mean_token_accuracy": 0.43793103098869324, + "step": 16690 + }, + { + "epoch": 0.01681538131483289, + "grad_norm": 18.386247426360306, + "learning_rate": 1.681505952500856e-05, + "loss": 2.5653, + "mean_token_accuracy": 0.3551724135875702, + "step": 16695 + }, + { + "epoch": 0.01682041736793706, + "grad_norm": 20.27802176513931, + "learning_rate": 1.6820095481739623e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.41379310488700866, + "step": 16700 + }, + { + "epoch": 0.016825453421041234, + "grad_norm": 23.94985538201386, + "learning_rate": 1.6825131438470682e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4068965554237366, + "step": 16705 + }, + { + "epoch": 0.016830489474145408, + "grad_norm": 23.136199548446854, + "learning_rate": 1.683016739520174e-05, + "loss": 2.1842, + "mean_token_accuracy": 0.4379310369491577, + "step": 16710 + }, + { + "epoch": 0.01683552552724958, + "grad_norm": 22.392366648969645, + "learning_rate": 1.68352033519328e-05, + "loss": 2.3128, + "mean_token_accuracy": 0.44137930274009707, + "step": 16715 + }, + { + "epoch": 0.01684056158035375, + "grad_norm": 20.79343688099604, + "learning_rate": 1.684023930866386e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.4034482777118683, + "step": 16720 + }, + { + "epoch": 0.016845597633457925, + "grad_norm": 23.53411563648727, + "learning_rate": 1.684527526539492e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.4448275864124298, + "step": 16725 + }, + { + "epoch": 0.0168506336865621, + "grad_norm": 23.404719460434745, + "learning_rate": 1.685031122212598e-05, + "loss": 2.2104, + "mean_token_accuracy": 0.47931033968925474, + "step": 16730 + }, + { + "epoch": 0.01685566973966627, + "grad_norm": 23.173083679408165, + "learning_rate": 1.685534717885704e-05, + "loss": 2.2963, + "mean_token_accuracy": 0.41724138259887694, + "step": 16735 + }, + { + "epoch": 0.016860705792770443, + "grad_norm": 30.797005470254117, + "learning_rate": 1.68603831355881e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.43103447556495667, + "step": 16740 + }, + { + "epoch": 0.016865741845874617, + "grad_norm": 24.228966362661172, + "learning_rate": 1.686541909231916e-05, + "loss": 2.5924, + "mean_token_accuracy": 0.36896551847457887, + "step": 16745 + }, + { + "epoch": 0.01687077789897879, + "grad_norm": 29.877716458941702, + "learning_rate": 1.687045504905022e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.4448275864124298, + "step": 16750 + }, + { + "epoch": 0.01687581395208296, + "grad_norm": 23.175622911123604, + "learning_rate": 1.687549100578128e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.4206896543502808, + "step": 16755 + }, + { + "epoch": 0.016880850005187135, + "grad_norm": 29.47379565480641, + "learning_rate": 1.6880526962512337e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.4034482777118683, + "step": 16760 + }, + { + "epoch": 0.01688588605829131, + "grad_norm": 25.63048320923515, + "learning_rate": 1.6885562919243396e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.441379314661026, + "step": 16765 + }, + { + "epoch": 0.01689092211139548, + "grad_norm": 25.083368180138745, + "learning_rate": 1.689059887597446e-05, + "loss": 2.5079, + "mean_token_accuracy": 0.37586206793785093, + "step": 16770 + }, + { + "epoch": 0.016895958164499653, + "grad_norm": 31.985448268481715, + "learning_rate": 1.6895634832705518e-05, + "loss": 2.4864, + "mean_token_accuracy": 0.3965517282485962, + "step": 16775 + }, + { + "epoch": 0.016900994217603826, + "grad_norm": 25.796786094649132, + "learning_rate": 1.6900670789436577e-05, + "loss": 2.6663, + "mean_token_accuracy": 0.3551724076271057, + "step": 16780 + }, + { + "epoch": 0.016906030270708, + "grad_norm": 21.709930426023988, + "learning_rate": 1.6905706746167637e-05, + "loss": 2.7189, + "mean_token_accuracy": 0.3793103456497192, + "step": 16785 + }, + { + "epoch": 0.01691106632381217, + "grad_norm": 19.617598631316774, + "learning_rate": 1.6910742702898696e-05, + "loss": 1.9605, + "mean_token_accuracy": 0.5172413766384125, + "step": 16790 + }, + { + "epoch": 0.016916102376916344, + "grad_norm": 21.846556545721675, + "learning_rate": 1.691577865962976e-05, + "loss": 2.3074, + "mean_token_accuracy": 0.46551724672317507, + "step": 16795 + }, + { + "epoch": 0.016921138430020518, + "grad_norm": 23.557834718940086, + "learning_rate": 1.6920814616360818e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.4620689690113068, + "step": 16800 + }, + { + "epoch": 0.016926174483124688, + "grad_norm": 18.266984969749736, + "learning_rate": 1.6925850573091877e-05, + "loss": 2.2223, + "mean_token_accuracy": 0.42413792610168455, + "step": 16805 + }, + { + "epoch": 0.016931210536228862, + "grad_norm": 28.987995605777982, + "learning_rate": 1.6930886529822936e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.4103448212146759, + "step": 16810 + }, + { + "epoch": 0.016936246589333036, + "grad_norm": 19.09859453193959, + "learning_rate": 1.6935922486553996e-05, + "loss": 2.1327, + "mean_token_accuracy": 0.4931034445762634, + "step": 16815 + }, + { + "epoch": 0.01694128264243721, + "grad_norm": 28.642916482497466, + "learning_rate": 1.6940958443285058e-05, + "loss": 2.7749, + "mean_token_accuracy": 0.3620689630508423, + "step": 16820 + }, + { + "epoch": 0.01694631869554138, + "grad_norm": 28.281008802403854, + "learning_rate": 1.6945994400016117e-05, + "loss": 2.5941, + "mean_token_accuracy": 0.3896551728248596, + "step": 16825 + }, + { + "epoch": 0.016951354748645554, + "grad_norm": 23.616357843621316, + "learning_rate": 1.6951030356747173e-05, + "loss": 2.2922, + "mean_token_accuracy": 0.4344827592372894, + "step": 16830 + }, + { + "epoch": 0.016956390801749727, + "grad_norm": 23.848832177595586, + "learning_rate": 1.6956066313478236e-05, + "loss": 2.6654, + "mean_token_accuracy": 0.38275861740112305, + "step": 16835 + }, + { + "epoch": 0.016961426854853898, + "grad_norm": 22.375460521922786, + "learning_rate": 1.6961102270209295e-05, + "loss": 2.527, + "mean_token_accuracy": 0.39655172228813174, + "step": 16840 + }, + { + "epoch": 0.01696646290795807, + "grad_norm": 22.49239676855279, + "learning_rate": 1.6966138226940358e-05, + "loss": 2.5873, + "mean_token_accuracy": 0.39794312715530394, + "step": 16845 + }, + { + "epoch": 0.016971498961062245, + "grad_norm": 22.330940870463852, + "learning_rate": 1.6971174183671414e-05, + "loss": 2.778, + "mean_token_accuracy": 0.38021778464317324, + "step": 16850 + }, + { + "epoch": 0.01697653501416642, + "grad_norm": 21.86718311836748, + "learning_rate": 1.6976210140402473e-05, + "loss": 2.0557, + "mean_token_accuracy": 0.4517241358757019, + "step": 16855 + }, + { + "epoch": 0.01698157106727059, + "grad_norm": 29.518516609639338, + "learning_rate": 1.6981246097133536e-05, + "loss": 2.4277, + "mean_token_accuracy": 0.37392619252204895, + "step": 16860 + }, + { + "epoch": 0.016986607120374763, + "grad_norm": 22.75331959358362, + "learning_rate": 1.6986282053864595e-05, + "loss": 2.7001, + "mean_token_accuracy": 0.4068965494632721, + "step": 16865 + }, + { + "epoch": 0.016991643173478937, + "grad_norm": 27.48321845168011, + "learning_rate": 1.6991318010595654e-05, + "loss": 2.8149, + "mean_token_accuracy": 0.36551723480224607, + "step": 16870 + }, + { + "epoch": 0.016996679226583107, + "grad_norm": 25.53383939797929, + "learning_rate": 1.6996353967326713e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.3931034505367279, + "step": 16875 + }, + { + "epoch": 0.01700171527968728, + "grad_norm": 23.4979280092709, + "learning_rate": 1.7001389924057773e-05, + "loss": 2.4146, + "mean_token_accuracy": 0.47931033968925474, + "step": 16880 + }, + { + "epoch": 0.017006751332791455, + "grad_norm": 30.884072616511304, + "learning_rate": 1.7006425880788835e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.443254679441452, + "step": 16885 + }, + { + "epoch": 0.01701178738589563, + "grad_norm": 18.901462006544367, + "learning_rate": 1.7011461837519894e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.41724138259887694, + "step": 16890 + }, + { + "epoch": 0.0170168234389998, + "grad_norm": 23.643674359115106, + "learning_rate": 1.701649779425095e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.44827585816383364, + "step": 16895 + }, + { + "epoch": 0.017021859492103972, + "grad_norm": 18.502886715526195, + "learning_rate": 1.7021533750982013e-05, + "loss": 2.1947, + "mean_token_accuracy": 0.4862069010734558, + "step": 16900 + }, + { + "epoch": 0.017026895545208146, + "grad_norm": 23.80808223912398, + "learning_rate": 1.7026569707713072e-05, + "loss": 1.9763, + "mean_token_accuracy": 0.4896551728248596, + "step": 16905 + }, + { + "epoch": 0.017031931598312317, + "grad_norm": 16.292167289333896, + "learning_rate": 1.703160566444413e-05, + "loss": 2.0484, + "mean_token_accuracy": 0.482758617401123, + "step": 16910 + }, + { + "epoch": 0.01703696765141649, + "grad_norm": 24.26451475533685, + "learning_rate": 1.703664162117519e-05, + "loss": 2.4793, + "mean_token_accuracy": 0.41379310488700866, + "step": 16915 + }, + { + "epoch": 0.017042003704520664, + "grad_norm": 26.12217445671135, + "learning_rate": 1.704167757790625e-05, + "loss": 2.8941, + "mean_token_accuracy": 0.3896551728248596, + "step": 16920 + }, + { + "epoch": 0.017047039757624838, + "grad_norm": 28.03840535394834, + "learning_rate": 1.7046713534637313e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.4413793087005615, + "step": 16925 + }, + { + "epoch": 0.017052075810729008, + "grad_norm": 20.946667814989045, + "learning_rate": 1.7051749491368372e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.3896551787853241, + "step": 16930 + }, + { + "epoch": 0.017057111863833182, + "grad_norm": 21.573774119361257, + "learning_rate": 1.705678544809943e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.43103447556495667, + "step": 16935 + }, + { + "epoch": 0.017062147916937356, + "grad_norm": 32.16166612648354, + "learning_rate": 1.706182140483049e-05, + "loss": 2.6069, + "mean_token_accuracy": 0.39655172228813174, + "step": 16940 + }, + { + "epoch": 0.017067183970041526, + "grad_norm": 21.356594774685313, + "learning_rate": 1.706685736156155e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.42820197343826294, + "step": 16945 + }, + { + "epoch": 0.0170722200231457, + "grad_norm": 26.64612832961511, + "learning_rate": 1.707189331829261e-05, + "loss": 2.439, + "mean_token_accuracy": 0.3620689630508423, + "step": 16950 + }, + { + "epoch": 0.017077256076249873, + "grad_norm": 28.609755436671982, + "learning_rate": 1.707692927502367e-05, + "loss": 2.6326, + "mean_token_accuracy": 0.41893526911735535, + "step": 16955 + }, + { + "epoch": 0.017082292129354047, + "grad_norm": 21.62245765998265, + "learning_rate": 1.7081965231754727e-05, + "loss": 2.0171, + "mean_token_accuracy": 0.5110837399959565, + "step": 16960 + }, + { + "epoch": 0.017087328182458218, + "grad_norm": 25.860257412667455, + "learning_rate": 1.708700118848579e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.4137930989265442, + "step": 16965 + }, + { + "epoch": 0.01709236423556239, + "grad_norm": 23.80630529834784, + "learning_rate": 1.709203714521685e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.41379310488700866, + "step": 16970 + }, + { + "epoch": 0.017097400288666565, + "grad_norm": 24.002206389771395, + "learning_rate": 1.709707310194791e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.46896551847457885, + "step": 16975 + }, + { + "epoch": 0.017102436341770735, + "grad_norm": 20.28646346479148, + "learning_rate": 1.710210905867897e-05, + "loss": 2.194, + "mean_token_accuracy": 0.4551724076271057, + "step": 16980 + }, + { + "epoch": 0.01710747239487491, + "grad_norm": 26.826628437996515, + "learning_rate": 1.7107145015410027e-05, + "loss": 2.738, + "mean_token_accuracy": 0.441379314661026, + "step": 16985 + }, + { + "epoch": 0.017112508447979083, + "grad_norm": 25.704531254648305, + "learning_rate": 1.7112180972141086e-05, + "loss": 2.1682, + "mean_token_accuracy": 0.4241379380226135, + "step": 16990 + }, + { + "epoch": 0.017117544501083257, + "grad_norm": 22.8919726188464, + "learning_rate": 1.711721692887215e-05, + "loss": 2.4086, + "mean_token_accuracy": 0.4551724135875702, + "step": 16995 + }, + { + "epoch": 0.017122580554187427, + "grad_norm": 23.351421984432573, + "learning_rate": 1.7122252885603208e-05, + "loss": 2.3203, + "mean_token_accuracy": 0.4482758641242981, + "step": 17000 + }, + { + "epoch": 0.0171276166072916, + "grad_norm": 37.58523548924887, + "learning_rate": 1.7127288842334267e-05, + "loss": 2.1699, + "mean_token_accuracy": 0.4689655065536499, + "step": 17005 + }, + { + "epoch": 0.017132652660395774, + "grad_norm": 19.83621903528445, + "learning_rate": 1.7132324799065327e-05, + "loss": 2.2801, + "mean_token_accuracy": 0.4586206912994385, + "step": 17010 + }, + { + "epoch": 0.017137688713499945, + "grad_norm": 29.990762500032883, + "learning_rate": 1.7137360755796386e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.3724137842655182, + "step": 17015 + }, + { + "epoch": 0.01714272476660412, + "grad_norm": 21.395193560502197, + "learning_rate": 1.714239671252745e-05, + "loss": 2.6039, + "mean_token_accuracy": 0.42068966031074523, + "step": 17020 + }, + { + "epoch": 0.017147760819708292, + "grad_norm": 22.821579564968065, + "learning_rate": 1.7147432669258508e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.4206896543502808, + "step": 17025 + }, + { + "epoch": 0.017152796872812466, + "grad_norm": 26.608042043880413, + "learning_rate": 1.7152468625989564e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.43793103098869324, + "step": 17030 + }, + { + "epoch": 0.017157832925916636, + "grad_norm": 25.625257156422766, + "learning_rate": 1.7157504582720626e-05, + "loss": 2.479, + "mean_token_accuracy": 0.4188747763633728, + "step": 17035 + }, + { + "epoch": 0.01716286897902081, + "grad_norm": 23.757593294716646, + "learning_rate": 1.7162540539451686e-05, + "loss": 2.1516, + "mean_token_accuracy": 0.4517241358757019, + "step": 17040 + }, + { + "epoch": 0.017167905032124984, + "grad_norm": 29.63098332835245, + "learning_rate": 1.7167576496182748e-05, + "loss": 2.7424, + "mean_token_accuracy": 0.3495462715625763, + "step": 17045 + }, + { + "epoch": 0.017172941085229154, + "grad_norm": 24.27220567025843, + "learning_rate": 1.7172612452913804e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.47241379618644713, + "step": 17050 + }, + { + "epoch": 0.017177977138333328, + "grad_norm": 21.539020120510976, + "learning_rate": 1.7177648409644863e-05, + "loss": 2.3071, + "mean_token_accuracy": 0.42758620977401735, + "step": 17055 + }, + { + "epoch": 0.0171830131914375, + "grad_norm": 21.68058802859244, + "learning_rate": 1.7182684366375926e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.46551724672317507, + "step": 17060 + }, + { + "epoch": 0.017188049244541675, + "grad_norm": 28.396285901412426, + "learning_rate": 1.7187720323106985e-05, + "loss": 1.9916, + "mean_token_accuracy": 0.4758620738983154, + "step": 17065 + }, + { + "epoch": 0.017193085297645846, + "grad_norm": 25.752053143382444, + "learning_rate": 1.7192756279838045e-05, + "loss": 2.3177, + "mean_token_accuracy": 0.4586206912994385, + "step": 17070 + }, + { + "epoch": 0.01719812135075002, + "grad_norm": 24.691942460316515, + "learning_rate": 1.7197792236569104e-05, + "loss": 2.1456, + "mean_token_accuracy": 0.46896551847457885, + "step": 17075 + }, + { + "epoch": 0.017203157403854193, + "grad_norm": 16.54078023280488, + "learning_rate": 1.7202828193300163e-05, + "loss": 2.1242, + "mean_token_accuracy": 0.4379310369491577, + "step": 17080 + }, + { + "epoch": 0.017208193456958364, + "grad_norm": 22.855771907560182, + "learning_rate": 1.7207864150031226e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.4517241358757019, + "step": 17085 + }, + { + "epoch": 0.017213229510062537, + "grad_norm": 33.074964765988945, + "learning_rate": 1.7212900106762285e-05, + "loss": 2.4396, + "mean_token_accuracy": 0.4551724076271057, + "step": 17090 + }, + { + "epoch": 0.01721826556316671, + "grad_norm": 22.62862453504766, + "learning_rate": 1.721793606349334e-05, + "loss": 2.0233, + "mean_token_accuracy": 0.48620688915252686, + "step": 17095 + }, + { + "epoch": 0.017223301616270885, + "grad_norm": 20.273131121523456, + "learning_rate": 1.7222972020224403e-05, + "loss": 2.6155, + "mean_token_accuracy": 0.3827586144208908, + "step": 17100 + }, + { + "epoch": 0.017228337669375055, + "grad_norm": 22.51211578881003, + "learning_rate": 1.7228007976955463e-05, + "loss": 2.4178, + "mean_token_accuracy": 0.3965517163276672, + "step": 17105 + }, + { + "epoch": 0.01723337372247923, + "grad_norm": 84.16959554946303, + "learning_rate": 1.7233043933686522e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.4551724135875702, + "step": 17110 + }, + { + "epoch": 0.017238409775583403, + "grad_norm": 20.50101782438021, + "learning_rate": 1.723807989041758e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.3724137842655182, + "step": 17115 + }, + { + "epoch": 0.017243445828687573, + "grad_norm": 27.33558168710615, + "learning_rate": 1.724311584714864e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.4620689630508423, + "step": 17120 + }, + { + "epoch": 0.017248481881791747, + "grad_norm": 27.514305515855735, + "learning_rate": 1.7248151803879703e-05, + "loss": 2.3523, + "mean_token_accuracy": 0.4399273991584778, + "step": 17125 + }, + { + "epoch": 0.01725351793489592, + "grad_norm": 23.18210368859295, + "learning_rate": 1.7253187760610762e-05, + "loss": 2.2599, + "mean_token_accuracy": 0.4034482777118683, + "step": 17130 + }, + { + "epoch": 0.017258553988000094, + "grad_norm": 17.704120273365863, + "learning_rate": 1.725822371734182e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.44137930274009707, + "step": 17135 + }, + { + "epoch": 0.017263590041104265, + "grad_norm": 21.017394238514395, + "learning_rate": 1.726325967407288e-05, + "loss": 2.1386, + "mean_token_accuracy": 0.46551724672317507, + "step": 17140 + }, + { + "epoch": 0.01726862609420844, + "grad_norm": 27.582861840428897, + "learning_rate": 1.726829563080394e-05, + "loss": 2.3705, + "mean_token_accuracy": 0.3965517282485962, + "step": 17145 + }, + { + "epoch": 0.017273662147312612, + "grad_norm": 26.01625521275371, + "learning_rate": 1.7273331587535e-05, + "loss": 2.4477, + "mean_token_accuracy": 0.4068965494632721, + "step": 17150 + }, + { + "epoch": 0.017278698200416782, + "grad_norm": 36.17817015082944, + "learning_rate": 1.7278367544266062e-05, + "loss": 2.3759, + "mean_token_accuracy": 0.4551724135875702, + "step": 17155 + }, + { + "epoch": 0.017283734253520956, + "grad_norm": 21.422984387095664, + "learning_rate": 1.728340350099712e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.46896551847457885, + "step": 17160 + }, + { + "epoch": 0.01728877030662513, + "grad_norm": 20.580260013866525, + "learning_rate": 1.728843945772818e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.4, + "step": 17165 + }, + { + "epoch": 0.017293806359729304, + "grad_norm": 19.256983666487866, + "learning_rate": 1.729347541445924e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.4103448212146759, + "step": 17170 + }, + { + "epoch": 0.017298842412833474, + "grad_norm": 27.353384892700625, + "learning_rate": 1.72985113711903e-05, + "loss": 2.5757, + "mean_token_accuracy": 0.37241379618644715, + "step": 17175 + }, + { + "epoch": 0.017303878465937648, + "grad_norm": 23.615407544767102, + "learning_rate": 1.730354732792136e-05, + "loss": 2.2754, + "mean_token_accuracy": 0.43448275327682495, + "step": 17180 + }, + { + "epoch": 0.01730891451904182, + "grad_norm": 26.244350652956335, + "learning_rate": 1.7308583284652417e-05, + "loss": 2.1651, + "mean_token_accuracy": 0.47586206793785096, + "step": 17185 + }, + { + "epoch": 0.017313950572145992, + "grad_norm": 19.305258312131684, + "learning_rate": 1.731361924138348e-05, + "loss": 2.4327, + "mean_token_accuracy": 0.40086206793785095, + "step": 17190 + }, + { + "epoch": 0.017318986625250166, + "grad_norm": 30.19412163179365, + "learning_rate": 1.731865519811454e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.42758620381355283, + "step": 17195 + }, + { + "epoch": 0.01732402267835434, + "grad_norm": 30.747832867794486, + "learning_rate": 1.73236911548456e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.4172413766384125, + "step": 17200 + }, + { + "epoch": 0.017329058731458513, + "grad_norm": 23.1037889960755, + "learning_rate": 1.7328727111576658e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.3862069010734558, + "step": 17205 + }, + { + "epoch": 0.017334094784562683, + "grad_norm": 25.941128439981238, + "learning_rate": 1.7333763068307717e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4137930989265442, + "step": 17210 + }, + { + "epoch": 0.017339130837666857, + "grad_norm": 23.869162905032915, + "learning_rate": 1.7338799025038776e-05, + "loss": 2.533, + "mean_token_accuracy": 0.3551724135875702, + "step": 17215 + }, + { + "epoch": 0.01734416689077103, + "grad_norm": 26.36224671369415, + "learning_rate": 1.734383498176984e-05, + "loss": 2.7179, + "mean_token_accuracy": 0.36896551251411436, + "step": 17220 + }, + { + "epoch": 0.0173492029438752, + "grad_norm": 21.00237111569138, + "learning_rate": 1.7348870938500898e-05, + "loss": 2.1971, + "mean_token_accuracy": 0.4689655125141144, + "step": 17225 + }, + { + "epoch": 0.017354238996979375, + "grad_norm": 22.121329078886497, + "learning_rate": 1.7353906895231958e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.4482758641242981, + "step": 17230 + }, + { + "epoch": 0.01735927505008355, + "grad_norm": 35.590469996180964, + "learning_rate": 1.7358942851963017e-05, + "loss": 2.6176, + "mean_token_accuracy": 0.4137930989265442, + "step": 17235 + }, + { + "epoch": 0.017364311103187723, + "grad_norm": 27.8567394318579, + "learning_rate": 1.7363978808694076e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.4172413766384125, + "step": 17240 + }, + { + "epoch": 0.017369347156291893, + "grad_norm": 35.18304278404932, + "learning_rate": 1.736901476542514e-05, + "loss": 2.312, + "mean_token_accuracy": 0.4862068951129913, + "step": 17245 + }, + { + "epoch": 0.017374383209396067, + "grad_norm": 103.94002807813789, + "learning_rate": 1.7374050722156195e-05, + "loss": 2.5213, + "mean_token_accuracy": 0.358620685338974, + "step": 17250 + }, + { + "epoch": 0.01737941926250024, + "grad_norm": 26.540669579160614, + "learning_rate": 1.7379086678887254e-05, + "loss": 2.6305, + "mean_token_accuracy": 0.4000000059604645, + "step": 17255 + }, + { + "epoch": 0.01738445531560441, + "grad_norm": 24.15332022067708, + "learning_rate": 1.7384122635618316e-05, + "loss": 2.1583, + "mean_token_accuracy": 0.45710828304290774, + "step": 17260 + }, + { + "epoch": 0.017389491368708584, + "grad_norm": 35.886360997256936, + "learning_rate": 1.7389158592349376e-05, + "loss": 2.6766, + "mean_token_accuracy": 0.38076224327087405, + "step": 17265 + }, + { + "epoch": 0.017394527421812758, + "grad_norm": 23.556475145192334, + "learning_rate": 1.7394194549080435e-05, + "loss": 2.4301, + "mean_token_accuracy": 0.38965516686439516, + "step": 17270 + }, + { + "epoch": 0.017399563474916932, + "grad_norm": 25.85148136055674, + "learning_rate": 1.7399230505811494e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.42758620381355283, + "step": 17275 + }, + { + "epoch": 0.017404599528021102, + "grad_norm": 23.049517198126363, + "learning_rate": 1.7404266462542553e-05, + "loss": 2.1388, + "mean_token_accuracy": 0.49491833448410033, + "step": 17280 + }, + { + "epoch": 0.017409635581125276, + "grad_norm": 18.43971040666176, + "learning_rate": 1.7409302419273616e-05, + "loss": 2.0456, + "mean_token_accuracy": 0.4620689630508423, + "step": 17285 + }, + { + "epoch": 0.01741467163422945, + "grad_norm": 23.837235878500735, + "learning_rate": 1.7414338376004675e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.43793103098869324, + "step": 17290 + }, + { + "epoch": 0.01741970768733362, + "grad_norm": 18.68997577120234, + "learning_rate": 1.741937433273573e-05, + "loss": 2.1335, + "mean_token_accuracy": 0.45517241954803467, + "step": 17295 + }, + { + "epoch": 0.017424743740437794, + "grad_norm": 20.104010768017094, + "learning_rate": 1.7424410289466794e-05, + "loss": 2.116, + "mean_token_accuracy": 0.47931033968925474, + "step": 17300 + }, + { + "epoch": 0.017429779793541968, + "grad_norm": 26.76951987270601, + "learning_rate": 1.7429446246197853e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.43793103098869324, + "step": 17305 + }, + { + "epoch": 0.01743481584664614, + "grad_norm": 25.869988185983974, + "learning_rate": 1.7434482202928916e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.4448275864124298, + "step": 17310 + }, + { + "epoch": 0.01743985189975031, + "grad_norm": 19.647809220096583, + "learning_rate": 1.743951815965997e-05, + "loss": 2.3403, + "mean_token_accuracy": 0.4344827502965927, + "step": 17315 + }, + { + "epoch": 0.017444887952854485, + "grad_norm": 21.341376187528436, + "learning_rate": 1.744455411639103e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.4189352750778198, + "step": 17320 + }, + { + "epoch": 0.01744992400595866, + "grad_norm": 34.67212526756585, + "learning_rate": 1.7449590073122094e-05, + "loss": 2.8367, + "mean_token_accuracy": 0.39310344457626345, + "step": 17325 + }, + { + "epoch": 0.01745496005906283, + "grad_norm": 23.27076907558333, + "learning_rate": 1.7454626029853153e-05, + "loss": 2.2101, + "mean_token_accuracy": 0.4551724076271057, + "step": 17330 + }, + { + "epoch": 0.017459996112167003, + "grad_norm": 23.911707460278514, + "learning_rate": 1.7459661986584212e-05, + "loss": 2.5651, + "mean_token_accuracy": 0.37586207389831544, + "step": 17335 + }, + { + "epoch": 0.017465032165271177, + "grad_norm": 25.603640452219295, + "learning_rate": 1.746469794331527e-05, + "loss": 2.3273, + "mean_token_accuracy": 0.4862069070339203, + "step": 17340 + }, + { + "epoch": 0.01747006821837535, + "grad_norm": 26.569056738424994, + "learning_rate": 1.746973390004633e-05, + "loss": 2.0427, + "mean_token_accuracy": 0.44482758045196535, + "step": 17345 + }, + { + "epoch": 0.01747510427147952, + "grad_norm": 24.032783190066525, + "learning_rate": 1.7474769856777393e-05, + "loss": 2.611, + "mean_token_accuracy": 0.4344827592372894, + "step": 17350 + }, + { + "epoch": 0.017480140324583695, + "grad_norm": 24.773190508745046, + "learning_rate": 1.7479805813508452e-05, + "loss": 2.3345, + "mean_token_accuracy": 0.44482759237289426, + "step": 17355 + }, + { + "epoch": 0.01748517637768787, + "grad_norm": 26.556699719132038, + "learning_rate": 1.748484177023951e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.3896551787853241, + "step": 17360 + }, + { + "epoch": 0.01749021243079204, + "grad_norm": 26.31064089327591, + "learning_rate": 1.748987772697057e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.43103447556495667, + "step": 17365 + }, + { + "epoch": 0.017495248483896213, + "grad_norm": 21.454088842914057, + "learning_rate": 1.749491368370163e-05, + "loss": 1.9923, + "mean_token_accuracy": 0.4712038815021515, + "step": 17370 + }, + { + "epoch": 0.017500284537000386, + "grad_norm": 18.55862458351415, + "learning_rate": 1.749994964043269e-05, + "loss": 2.5636, + "mean_token_accuracy": 0.3827586144208908, + "step": 17375 + }, + { + "epoch": 0.01750532059010456, + "grad_norm": 28.418284016942984, + "learning_rate": 1.7504985597163752e-05, + "loss": 2.0485, + "mean_token_accuracy": 0.47852389216423036, + "step": 17380 + }, + { + "epoch": 0.01751035664320873, + "grad_norm": 27.532273746794147, + "learning_rate": 1.7510021553894808e-05, + "loss": 2.283, + "mean_token_accuracy": 0.47931034564971925, + "step": 17385 + }, + { + "epoch": 0.017515392696312904, + "grad_norm": 23.28607582655699, + "learning_rate": 1.751505751062587e-05, + "loss": 2.6172, + "mean_token_accuracy": 0.4329098641872406, + "step": 17390 + }, + { + "epoch": 0.017520428749417078, + "grad_norm": 31.3212357837227, + "learning_rate": 1.752009346735693e-05, + "loss": 3.0013, + "mean_token_accuracy": 0.32413792610168457, + "step": 17395 + }, + { + "epoch": 0.01752546480252125, + "grad_norm": 30.656970365145632, + "learning_rate": 1.752512942408799e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.4413793087005615, + "step": 17400 + }, + { + "epoch": 0.017530500855625422, + "grad_norm": 28.62431970737396, + "learning_rate": 1.753016538081905e-05, + "loss": 2.3463, + "mean_token_accuracy": 0.4724137902259827, + "step": 17405 + }, + { + "epoch": 0.017535536908729596, + "grad_norm": 25.44910255275347, + "learning_rate": 1.7535201337550108e-05, + "loss": 2.8187, + "mean_token_accuracy": 0.3758620619773865, + "step": 17410 + }, + { + "epoch": 0.017540572961833766, + "grad_norm": 22.715702431006928, + "learning_rate": 1.7540237294281167e-05, + "loss": 2.448, + "mean_token_accuracy": 0.4172413766384125, + "step": 17415 + }, + { + "epoch": 0.01754560901493794, + "grad_norm": 22.497223643230594, + "learning_rate": 1.754527325101223e-05, + "loss": 2.4341, + "mean_token_accuracy": 0.46376285552978513, + "step": 17420 + }, + { + "epoch": 0.017550645068042114, + "grad_norm": 21.959151439427185, + "learning_rate": 1.755030920774329e-05, + "loss": 2.651, + "mean_token_accuracy": 0.37931035459041595, + "step": 17425 + }, + { + "epoch": 0.017555681121146288, + "grad_norm": 40.84754050533766, + "learning_rate": 1.7555345164474348e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.4275862008333206, + "step": 17430 + }, + { + "epoch": 0.017560717174250458, + "grad_norm": 22.6856804952366, + "learning_rate": 1.7560381121205407e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.42413792610168455, + "step": 17435 + }, + { + "epoch": 0.01756575322735463, + "grad_norm": 33.437174409592515, + "learning_rate": 1.7565417077936466e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.4034482717514038, + "step": 17440 + }, + { + "epoch": 0.017570789280458805, + "grad_norm": 30.44284986700725, + "learning_rate": 1.757045303466753e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4068965494632721, + "step": 17445 + }, + { + "epoch": 0.017575825333562976, + "grad_norm": 42.91800692760815, + "learning_rate": 1.7575488991398585e-05, + "loss": 2.7056, + "mean_token_accuracy": 0.4344827592372894, + "step": 17450 + }, + { + "epoch": 0.01758086138666715, + "grad_norm": 24.418330753413446, + "learning_rate": 1.7580524948129644e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.43448275327682495, + "step": 17455 + }, + { + "epoch": 0.017585897439771323, + "grad_norm": 25.518142057620672, + "learning_rate": 1.7585560904860707e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.45517241060733793, + "step": 17460 + }, + { + "epoch": 0.017590933492875497, + "grad_norm": 18.98461253847586, + "learning_rate": 1.7590596861591766e-05, + "loss": 2.337, + "mean_token_accuracy": 0.4241379201412201, + "step": 17465 + }, + { + "epoch": 0.017595969545979667, + "grad_norm": 35.89824019991377, + "learning_rate": 1.7595632818322825e-05, + "loss": 2.2511, + "mean_token_accuracy": 0.42758620381355283, + "step": 17470 + }, + { + "epoch": 0.01760100559908384, + "grad_norm": 22.815412389329182, + "learning_rate": 1.7600668775053885e-05, + "loss": 2.5754, + "mean_token_accuracy": 0.45517241954803467, + "step": 17475 + }, + { + "epoch": 0.017606041652188015, + "grad_norm": 24.80619368605664, + "learning_rate": 1.7605704731784944e-05, + "loss": 2.3215, + "mean_token_accuracy": 0.4572897732257843, + "step": 17480 + }, + { + "epoch": 0.017611077705292185, + "grad_norm": 29.686113140259838, + "learning_rate": 1.7610740688516007e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.4344827592372894, + "step": 17485 + }, + { + "epoch": 0.01761611375839636, + "grad_norm": 27.604209203992585, + "learning_rate": 1.7615776645247066e-05, + "loss": 2.4472, + "mean_token_accuracy": 0.42413793206214906, + "step": 17490 + }, + { + "epoch": 0.017621149811500533, + "grad_norm": 24.714328471228647, + "learning_rate": 1.762081260197812e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.43793103098869324, + "step": 17495 + }, + { + "epoch": 0.017626185864604706, + "grad_norm": 19.88228375594727, + "learning_rate": 1.7625848558709184e-05, + "loss": 2.2342, + "mean_token_accuracy": 0.4551724076271057, + "step": 17500 + }, + { + "epoch": 0.017631221917708877, + "grad_norm": 18.768651688850156, + "learning_rate": 1.7630884515440244e-05, + "loss": 2.1795, + "mean_token_accuracy": 0.4517241418361664, + "step": 17505 + }, + { + "epoch": 0.01763625797081305, + "grad_norm": 25.87447357424474, + "learning_rate": 1.7635920472171306e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.39655172228813174, + "step": 17510 + }, + { + "epoch": 0.017641294023917224, + "grad_norm": 20.611190898242114, + "learning_rate": 1.7640956428902365e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.4088324248790741, + "step": 17515 + }, + { + "epoch": 0.017646330077021394, + "grad_norm": 24.30741609002821, + "learning_rate": 1.764599238563342e-05, + "loss": 1.943, + "mean_token_accuracy": 0.5155172407627105, + "step": 17520 + }, + { + "epoch": 0.017651366130125568, + "grad_norm": 28.197705245983673, + "learning_rate": 1.7651028342364484e-05, + "loss": 2.7748, + "mean_token_accuracy": 0.3620689570903778, + "step": 17525 + }, + { + "epoch": 0.017656402183229742, + "grad_norm": 24.178148949146955, + "learning_rate": 1.7656064299095543e-05, + "loss": 2.417, + "mean_token_accuracy": 0.4103448331356049, + "step": 17530 + }, + { + "epoch": 0.017661438236333916, + "grad_norm": 24.384424738049628, + "learning_rate": 1.7661100255826602e-05, + "loss": 2.2878, + "mean_token_accuracy": 0.44827585816383364, + "step": 17535 + }, + { + "epoch": 0.017666474289438086, + "grad_norm": 32.82125709249734, + "learning_rate": 1.7666136212557662e-05, + "loss": 2.4782, + "mean_token_accuracy": 0.43103448748588563, + "step": 17540 + }, + { + "epoch": 0.01767151034254226, + "grad_norm": 20.304794758538325, + "learning_rate": 1.767117216928872e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.4206896543502808, + "step": 17545 + }, + { + "epoch": 0.017676546395646434, + "grad_norm": 19.461983219121986, + "learning_rate": 1.7676208126019784e-05, + "loss": 2.5778, + "mean_token_accuracy": 0.39310344457626345, + "step": 17550 + }, + { + "epoch": 0.017681582448750604, + "grad_norm": 18.5774299106097, + "learning_rate": 1.7681244082750843e-05, + "loss": 2.1944, + "mean_token_accuracy": 0.42068964838981626, + "step": 17555 + }, + { + "epoch": 0.017686618501854778, + "grad_norm": 30.991594543549862, + "learning_rate": 1.7686280039481902e-05, + "loss": 2.1665, + "mean_token_accuracy": 0.44482759237289426, + "step": 17560 + }, + { + "epoch": 0.01769165455495895, + "grad_norm": 26.435215446921944, + "learning_rate": 1.769131599621296e-05, + "loss": 2.7565, + "mean_token_accuracy": 0.3862068921327591, + "step": 17565 + }, + { + "epoch": 0.017696690608063125, + "grad_norm": 23.76161114727331, + "learning_rate": 1.769635195294402e-05, + "loss": 2.2272, + "mean_token_accuracy": 0.43103448748588563, + "step": 17570 + }, + { + "epoch": 0.017701726661167295, + "grad_norm": 23.746848862187342, + "learning_rate": 1.770138790967508e-05, + "loss": 2.2432, + "mean_token_accuracy": 0.43189655542373656, + "step": 17575 + }, + { + "epoch": 0.01770676271427147, + "grad_norm": 24.675746764587743, + "learning_rate": 1.7706423866406143e-05, + "loss": 2.377, + "mean_token_accuracy": 0.4413793087005615, + "step": 17580 + }, + { + "epoch": 0.017711798767375643, + "grad_norm": 28.94297930018188, + "learning_rate": 1.77114598231372e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4772167444229126, + "step": 17585 + }, + { + "epoch": 0.017716834820479813, + "grad_norm": 22.398562915053283, + "learning_rate": 1.771649577986826e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.47453115582466127, + "step": 17590 + }, + { + "epoch": 0.017721870873583987, + "grad_norm": 31.97321389502253, + "learning_rate": 1.772153173659932e-05, + "loss": 2.4503, + "mean_token_accuracy": 0.463520872592926, + "step": 17595 + }, + { + "epoch": 0.01772690692668816, + "grad_norm": 20.57668206466762, + "learning_rate": 1.772656769333038e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.482758617401123, + "step": 17600 + }, + { + "epoch": 0.017731942979792335, + "grad_norm": 19.505591317719347, + "learning_rate": 1.773160365006144e-05, + "loss": 2.3006, + "mean_token_accuracy": 0.4586206912994385, + "step": 17605 + }, + { + "epoch": 0.017736979032896505, + "grad_norm": 40.07010102083105, + "learning_rate": 1.7736639606792498e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.42928009629249575, + "step": 17610 + }, + { + "epoch": 0.01774201508600068, + "grad_norm": 31.149344924268654, + "learning_rate": 1.774167556352356e-05, + "loss": 2.3403, + "mean_token_accuracy": 0.41034482717514037, + "step": 17615 + }, + { + "epoch": 0.017747051139104852, + "grad_norm": 22.999002334619874, + "learning_rate": 1.774671152025462e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4328493714332581, + "step": 17620 + }, + { + "epoch": 0.017752087192209023, + "grad_norm": 19.465299523840663, + "learning_rate": 1.775174747698568e-05, + "loss": 2.197, + "mean_token_accuracy": 0.4708409011363983, + "step": 17625 + }, + { + "epoch": 0.017757123245313196, + "grad_norm": 27.630794938445447, + "learning_rate": 1.775678343371674e-05, + "loss": 2.2774, + "mean_token_accuracy": 0.4034482717514038, + "step": 17630 + }, + { + "epoch": 0.01776215929841737, + "grad_norm": 26.03809983600311, + "learning_rate": 1.7761819390447798e-05, + "loss": 2.615, + "mean_token_accuracy": 0.4137930989265442, + "step": 17635 + }, + { + "epoch": 0.017767195351521544, + "grad_norm": 30.731470789434216, + "learning_rate": 1.7766855347178857e-05, + "loss": 2.2515, + "mean_token_accuracy": 0.4689655303955078, + "step": 17640 + }, + { + "epoch": 0.017772231404625714, + "grad_norm": 22.528022437516356, + "learning_rate": 1.777189130390992e-05, + "loss": 2.3947, + "mean_token_accuracy": 0.41379310488700866, + "step": 17645 + }, + { + "epoch": 0.017777267457729888, + "grad_norm": 20.055747158686895, + "learning_rate": 1.7776927260640975e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.441379314661026, + "step": 17650 + }, + { + "epoch": 0.017782303510834062, + "grad_norm": 25.88682399334647, + "learning_rate": 1.7781963217372038e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.4068965494632721, + "step": 17655 + }, + { + "epoch": 0.017787339563938232, + "grad_norm": 26.07787959697268, + "learning_rate": 1.7786999174103097e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4034482777118683, + "step": 17660 + }, + { + "epoch": 0.017792375617042406, + "grad_norm": 31.559282927008837, + "learning_rate": 1.7792035130834157e-05, + "loss": 2.1127, + "mean_token_accuracy": 0.4482758641242981, + "step": 17665 + }, + { + "epoch": 0.01779741167014658, + "grad_norm": 28.65068090976905, + "learning_rate": 1.7797071087565216e-05, + "loss": 2.4776, + "mean_token_accuracy": 0.4103448212146759, + "step": 17670 + }, + { + "epoch": 0.017802447723250753, + "grad_norm": 27.93920190299055, + "learning_rate": 1.7802107044296275e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.44664247035980226, + "step": 17675 + }, + { + "epoch": 0.017807483776354924, + "grad_norm": 22.397118637496995, + "learning_rate": 1.7807143001027334e-05, + "loss": 2.0955, + "mean_token_accuracy": 0.4896551787853241, + "step": 17680 + }, + { + "epoch": 0.017812519829459098, + "grad_norm": 22.853642386216666, + "learning_rate": 1.7812178957758397e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.3551724076271057, + "step": 17685 + }, + { + "epoch": 0.01781755588256327, + "grad_norm": 26.36908877903871, + "learning_rate": 1.7817214914489456e-05, + "loss": 2.7587, + "mean_token_accuracy": 0.41724138259887694, + "step": 17690 + }, + { + "epoch": 0.01782259193566744, + "grad_norm": 24.9865067076988, + "learning_rate": 1.7822250871220516e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.4068965554237366, + "step": 17695 + }, + { + "epoch": 0.017827627988771615, + "grad_norm": 24.312096621812806, + "learning_rate": 1.7827286827951575e-05, + "loss": 2.2751, + "mean_token_accuracy": 0.4620689630508423, + "step": 17700 + }, + { + "epoch": 0.01783266404187579, + "grad_norm": 26.887038696163156, + "learning_rate": 1.7832322784682634e-05, + "loss": 2.5744, + "mean_token_accuracy": 0.38275861740112305, + "step": 17705 + }, + { + "epoch": 0.017837700094979963, + "grad_norm": 28.059780291831554, + "learning_rate": 1.7837358741413697e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.45172414779663084, + "step": 17710 + }, + { + "epoch": 0.017842736148084133, + "grad_norm": 21.021218327454225, + "learning_rate": 1.7842394698144756e-05, + "loss": 2.1719, + "mean_token_accuracy": 0.4586206912994385, + "step": 17715 + }, + { + "epoch": 0.017847772201188307, + "grad_norm": 25.718807589788288, + "learning_rate": 1.7847430654875812e-05, + "loss": 2.2782, + "mean_token_accuracy": 0.41917725205421447, + "step": 17720 + }, + { + "epoch": 0.01785280825429248, + "grad_norm": 29.280621724606597, + "learning_rate": 1.7852466611606874e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.44827585816383364, + "step": 17725 + }, + { + "epoch": 0.01785784430739665, + "grad_norm": 26.999977608066605, + "learning_rate": 1.7857502568337934e-05, + "loss": 2.4852, + "mean_token_accuracy": 0.40689654350280763, + "step": 17730 + }, + { + "epoch": 0.017862880360500825, + "grad_norm": 22.89249060597913, + "learning_rate": 1.7862538525068996e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.41034482717514037, + "step": 17735 + }, + { + "epoch": 0.017867916413605, + "grad_norm": 32.99612133088084, + "learning_rate": 1.7867574481800052e-05, + "loss": 2.9538, + "mean_token_accuracy": 0.34482758343219755, + "step": 17740 + }, + { + "epoch": 0.017872952466709172, + "grad_norm": 19.215433956640425, + "learning_rate": 1.787261043853111e-05, + "loss": 2.1701, + "mean_token_accuracy": 0.5084089517593384, + "step": 17745 + }, + { + "epoch": 0.017877988519813343, + "grad_norm": 21.83348193967266, + "learning_rate": 1.7877646395262174e-05, + "loss": 2.4333, + "mean_token_accuracy": 0.41004234552383423, + "step": 17750 + }, + { + "epoch": 0.017883024572917516, + "grad_norm": 20.536419297082414, + "learning_rate": 1.7882682351993233e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4172413766384125, + "step": 17755 + }, + { + "epoch": 0.01788806062602169, + "grad_norm": 22.407979233843204, + "learning_rate": 1.7887718308724293e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.4689655125141144, + "step": 17760 + }, + { + "epoch": 0.01789309667912586, + "grad_norm": 47.956054068011184, + "learning_rate": 1.7892754265455352e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.43103448748588563, + "step": 17765 + }, + { + "epoch": 0.017898132732230034, + "grad_norm": 20.930335234278594, + "learning_rate": 1.789779022218641e-05, + "loss": 2.4341, + "mean_token_accuracy": 0.4189957737922668, + "step": 17770 + }, + { + "epoch": 0.017903168785334208, + "grad_norm": 21.71411443575081, + "learning_rate": 1.7902826178917474e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.43641863465309144, + "step": 17775 + }, + { + "epoch": 0.01790820483843838, + "grad_norm": 28.7102381154154, + "learning_rate": 1.7907862135648533e-05, + "loss": 2.7496, + "mean_token_accuracy": 0.3931034505367279, + "step": 17780 + }, + { + "epoch": 0.017913240891542552, + "grad_norm": 21.781497393774814, + "learning_rate": 1.791289809237959e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.4103448331356049, + "step": 17785 + }, + { + "epoch": 0.017918276944646726, + "grad_norm": 24.781937344331205, + "learning_rate": 1.791793404911065e-05, + "loss": 2.8346, + "mean_token_accuracy": 0.3896551728248596, + "step": 17790 + }, + { + "epoch": 0.0179233129977509, + "grad_norm": 29.059807852533513, + "learning_rate": 1.792297000584171e-05, + "loss": 2.5855, + "mean_token_accuracy": 0.43103447556495667, + "step": 17795 + }, + { + "epoch": 0.01792834905085507, + "grad_norm": 25.407786555534514, + "learning_rate": 1.792800596257277e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.44827587008476255, + "step": 17800 + }, + { + "epoch": 0.017933385103959244, + "grad_norm": 17.973880950806024, + "learning_rate": 1.793304191930383e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.45396249890327456, + "step": 17805 + }, + { + "epoch": 0.017938421157063417, + "grad_norm": 17.283264375485086, + "learning_rate": 1.793807787603489e-05, + "loss": 2.3358, + "mean_token_accuracy": 0.4517241358757019, + "step": 17810 + }, + { + "epoch": 0.01794345721016759, + "grad_norm": 28.150657409318192, + "learning_rate": 1.794311383276595e-05, + "loss": 2.4233, + "mean_token_accuracy": 0.4310344845056534, + "step": 17815 + }, + { + "epoch": 0.01794849326327176, + "grad_norm": 21.39567686871591, + "learning_rate": 1.794814978949701e-05, + "loss": 2.5214, + "mean_token_accuracy": 0.41034482717514037, + "step": 17820 + }, + { + "epoch": 0.017953529316375935, + "grad_norm": 26.30556551873422, + "learning_rate": 1.795318574622807e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.3793103456497192, + "step": 17825 + }, + { + "epoch": 0.01795856536948011, + "grad_norm": 19.253761101351866, + "learning_rate": 1.795822170295913e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.45517241954803467, + "step": 17830 + }, + { + "epoch": 0.01796360142258428, + "grad_norm": 17.509717985428647, + "learning_rate": 1.7963257659690188e-05, + "loss": 2.6476, + "mean_token_accuracy": 0.37241379022598264, + "step": 17835 + }, + { + "epoch": 0.017968637475688453, + "grad_norm": 25.57963889440956, + "learning_rate": 1.7968293616421247e-05, + "loss": 2.4963, + "mean_token_accuracy": 0.43448275327682495, + "step": 17840 + }, + { + "epoch": 0.017973673528792627, + "grad_norm": 21.712329938231694, + "learning_rate": 1.797332957315231e-05, + "loss": 2.4557, + "mean_token_accuracy": 0.4517241299152374, + "step": 17845 + }, + { + "epoch": 0.0179787095818968, + "grad_norm": 22.276125501252693, + "learning_rate": 1.7978365529883366e-05, + "loss": 2.5801, + "mean_token_accuracy": 0.4344827592372894, + "step": 17850 + }, + { + "epoch": 0.01798374563500097, + "grad_norm": 25.400250930627106, + "learning_rate": 1.798340148661443e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.42413793206214906, + "step": 17855 + }, + { + "epoch": 0.017988781688105145, + "grad_norm": 21.432619555026445, + "learning_rate": 1.7988437443345488e-05, + "loss": 2.6549, + "mean_token_accuracy": 0.41379310488700866, + "step": 17860 + }, + { + "epoch": 0.01799381774120932, + "grad_norm": 25.12524210740627, + "learning_rate": 1.7993473400076547e-05, + "loss": 2.5328, + "mean_token_accuracy": 0.41724137365818026, + "step": 17865 + }, + { + "epoch": 0.01799885379431349, + "grad_norm": 23.899818435308884, + "learning_rate": 1.7998509356807606e-05, + "loss": 2.4895, + "mean_token_accuracy": 0.47586206793785096, + "step": 17870 + }, + { + "epoch": 0.018003889847417662, + "grad_norm": 24.689313656253333, + "learning_rate": 1.8003545313538666e-05, + "loss": 2.3117, + "mean_token_accuracy": 0.4448275864124298, + "step": 17875 + }, + { + "epoch": 0.018008925900521836, + "grad_norm": 24.852767428053237, + "learning_rate": 1.8008581270269725e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.44482758045196535, + "step": 17880 + }, + { + "epoch": 0.01801396195362601, + "grad_norm": 25.045935697856397, + "learning_rate": 1.8013617227000787e-05, + "loss": 2.7134, + "mean_token_accuracy": 0.37931033968925476, + "step": 17885 + }, + { + "epoch": 0.01801899800673018, + "grad_norm": 22.09563764144968, + "learning_rate": 1.8018653183731847e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.42758620977401735, + "step": 17890 + }, + { + "epoch": 0.018024034059834354, + "grad_norm": 23.96496201047184, + "learning_rate": 1.8023689140462906e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.43103448748588563, + "step": 17895 + }, + { + "epoch": 0.018029070112938528, + "grad_norm": 22.62156259127355, + "learning_rate": 1.8028725097193965e-05, + "loss": 2.6392, + "mean_token_accuracy": 0.4034482717514038, + "step": 17900 + }, + { + "epoch": 0.018034106166042698, + "grad_norm": 21.977156595515346, + "learning_rate": 1.8033761053925024e-05, + "loss": 2.179, + "mean_token_accuracy": 0.46896552443504336, + "step": 17905 + }, + { + "epoch": 0.018039142219146872, + "grad_norm": 26.5977938469768, + "learning_rate": 1.8038797010656087e-05, + "loss": 2.2387, + "mean_token_accuracy": 0.4551724076271057, + "step": 17910 + }, + { + "epoch": 0.018044178272251046, + "grad_norm": 24.20540638808236, + "learning_rate": 1.8043832967387146e-05, + "loss": 2.3446, + "mean_token_accuracy": 0.4379310369491577, + "step": 17915 + }, + { + "epoch": 0.01804921432535522, + "grad_norm": 19.484822030973742, + "learning_rate": 1.8048868924118202e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.4344827592372894, + "step": 17920 + }, + { + "epoch": 0.01805425037845939, + "grad_norm": 27.28241072551932, + "learning_rate": 1.8053904880849265e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.4517241418361664, + "step": 17925 + }, + { + "epoch": 0.018059286431563563, + "grad_norm": 20.45482198487239, + "learning_rate": 1.8058940837580324e-05, + "loss": 2.4581, + "mean_token_accuracy": 0.42758620977401735, + "step": 17930 + }, + { + "epoch": 0.018064322484667737, + "grad_norm": 26.813351799703437, + "learning_rate": 1.8063976794311387e-05, + "loss": 2.4552, + "mean_token_accuracy": 0.4068965494632721, + "step": 17935 + }, + { + "epoch": 0.018069358537771908, + "grad_norm": 19.369171272788428, + "learning_rate": 1.8069012751042443e-05, + "loss": 2.1522, + "mean_token_accuracy": 0.42413793206214906, + "step": 17940 + }, + { + "epoch": 0.01807439459087608, + "grad_norm": 22.992413625878434, + "learning_rate": 1.8074048707773502e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.4517241299152374, + "step": 17945 + }, + { + "epoch": 0.018079430643980255, + "grad_norm": 22.051474908139603, + "learning_rate": 1.8079084664504565e-05, + "loss": 2.5226, + "mean_token_accuracy": 0.36896551847457887, + "step": 17950 + }, + { + "epoch": 0.01808446669708443, + "grad_norm": 25.30049048940708, + "learning_rate": 1.8084120621235624e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.4379310429096222, + "step": 17955 + }, + { + "epoch": 0.0180895027501886, + "grad_norm": 20.92183462199487, + "learning_rate": 1.8089156577966683e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.4206896543502808, + "step": 17960 + }, + { + "epoch": 0.018094538803292773, + "grad_norm": 24.861238493924773, + "learning_rate": 1.8094192534697742e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4724137902259827, + "step": 17965 + }, + { + "epoch": 0.018099574856396947, + "grad_norm": 26.07495989185928, + "learning_rate": 1.80992284914288e-05, + "loss": 2.5871, + "mean_token_accuracy": 0.41379310488700866, + "step": 17970 + }, + { + "epoch": 0.018104610909501117, + "grad_norm": 87.31115130334746, + "learning_rate": 1.8104264448159864e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.42413792610168455, + "step": 17975 + }, + { + "epoch": 0.01810964696260529, + "grad_norm": 24.69459049240753, + "learning_rate": 1.8109300404890923e-05, + "loss": 2.2178, + "mean_token_accuracy": 0.44827585816383364, + "step": 17980 + }, + { + "epoch": 0.018114683015709464, + "grad_norm": 23.56723405687756, + "learning_rate": 1.811433636162198e-05, + "loss": 2.4496, + "mean_token_accuracy": 0.39655172228813174, + "step": 17985 + }, + { + "epoch": 0.018119719068813638, + "grad_norm": 24.335418420927656, + "learning_rate": 1.8119372318353042e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.4862068951129913, + "step": 17990 + }, + { + "epoch": 0.01812475512191781, + "grad_norm": 22.404727098820032, + "learning_rate": 1.81244082750841e-05, + "loss": 2.0647, + "mean_token_accuracy": 0.4707199037075043, + "step": 17995 + }, + { + "epoch": 0.018129791175021982, + "grad_norm": 21.666225047985428, + "learning_rate": 1.8129444231815164e-05, + "loss": 2.5004, + "mean_token_accuracy": 0.41379310488700866, + "step": 18000 + }, + { + "epoch": 0.018134827228126156, + "grad_norm": 24.688246564524185, + "learning_rate": 1.813448018854622e-05, + "loss": 2.3677, + "mean_token_accuracy": 0.44827585816383364, + "step": 18005 + }, + { + "epoch": 0.018139863281230326, + "grad_norm": 37.11071613560086, + "learning_rate": 1.813951614527728e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.4137930989265442, + "step": 18010 + }, + { + "epoch": 0.0181448993343345, + "grad_norm": 29.42426974889362, + "learning_rate": 1.814455210200834e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.46896551847457885, + "step": 18015 + }, + { + "epoch": 0.018149935387438674, + "grad_norm": 26.212084878253762, + "learning_rate": 1.81495880587394e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.4586206912994385, + "step": 18020 + }, + { + "epoch": 0.018154971440542848, + "grad_norm": 30.990568639230204, + "learning_rate": 1.815462401547046e-05, + "loss": 2.749, + "mean_token_accuracy": 0.4172413766384125, + "step": 18025 + }, + { + "epoch": 0.018160007493647018, + "grad_norm": 31.099745693694484, + "learning_rate": 1.815965997220152e-05, + "loss": 2.2452, + "mean_token_accuracy": 0.47586206793785096, + "step": 18030 + }, + { + "epoch": 0.01816504354675119, + "grad_norm": 22.46318922164754, + "learning_rate": 1.816469592893258e-05, + "loss": 2.3509, + "mean_token_accuracy": 0.5051421642303466, + "step": 18035 + }, + { + "epoch": 0.018170079599855365, + "grad_norm": 19.35005212412641, + "learning_rate": 1.816973188566364e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.4344827592372894, + "step": 18040 + }, + { + "epoch": 0.018175115652959536, + "grad_norm": 21.684310936835846, + "learning_rate": 1.81747678423947e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.4448275864124298, + "step": 18045 + }, + { + "epoch": 0.01818015170606371, + "grad_norm": 33.086371182428195, + "learning_rate": 1.817980379912576e-05, + "loss": 2.3478, + "mean_token_accuracy": 0.4689655125141144, + "step": 18050 + }, + { + "epoch": 0.018185187759167883, + "grad_norm": 19.07403892507646, + "learning_rate": 1.818483975585682e-05, + "loss": 1.9568, + "mean_token_accuracy": 0.4640653312206268, + "step": 18055 + }, + { + "epoch": 0.018190223812272057, + "grad_norm": 34.102307363480314, + "learning_rate": 1.8189875712587878e-05, + "loss": 2.3472, + "mean_token_accuracy": 0.46551724076271056, + "step": 18060 + }, + { + "epoch": 0.018195259865376227, + "grad_norm": 23.171587807785667, + "learning_rate": 1.8194911669318937e-05, + "loss": 2.5169, + "mean_token_accuracy": 0.3931034505367279, + "step": 18065 + }, + { + "epoch": 0.0182002959184804, + "grad_norm": 17.733018611376394, + "learning_rate": 1.819994762605e-05, + "loss": 2.0571, + "mean_token_accuracy": 0.4689655125141144, + "step": 18070 + }, + { + "epoch": 0.018205331971584575, + "grad_norm": 22.644821568901595, + "learning_rate": 1.8204983582781056e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.4068965554237366, + "step": 18075 + }, + { + "epoch": 0.018210368024688745, + "grad_norm": 20.264727304480328, + "learning_rate": 1.821001953951212e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.4517241358757019, + "step": 18080 + }, + { + "epoch": 0.01821540407779292, + "grad_norm": 262.09946279066895, + "learning_rate": 1.8215055496243178e-05, + "loss": 2.2312, + "mean_token_accuracy": 0.47586206793785096, + "step": 18085 + }, + { + "epoch": 0.018220440130897093, + "grad_norm": 31.818670940558857, + "learning_rate": 1.8220091452974237e-05, + "loss": 2.4037, + "mean_token_accuracy": 0.42758620977401735, + "step": 18090 + }, + { + "epoch": 0.018225476184001266, + "grad_norm": 23.209558515353002, + "learning_rate": 1.8225127409705296e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.4344827592372894, + "step": 18095 + }, + { + "epoch": 0.018230512237105437, + "grad_norm": 28.612889002732594, + "learning_rate": 1.8230163366436356e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.37931033968925476, + "step": 18100 + }, + { + "epoch": 0.01823554829020961, + "grad_norm": 23.55292949986719, + "learning_rate": 1.8235199323167415e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.4448275864124298, + "step": 18105 + }, + { + "epoch": 0.018240584343313784, + "grad_norm": 23.948456099648727, + "learning_rate": 1.8240235279898478e-05, + "loss": 2.7445, + "mean_token_accuracy": 0.4103448212146759, + "step": 18110 + }, + { + "epoch": 0.018245620396417955, + "grad_norm": 31.177019492375045, + "learning_rate": 1.8245271236629537e-05, + "loss": 2.2587, + "mean_token_accuracy": 0.49655171632766726, + "step": 18115 + }, + { + "epoch": 0.01825065644952213, + "grad_norm": 20.244547132110107, + "learning_rate": 1.8250307193360596e-05, + "loss": 2.3337, + "mean_token_accuracy": 0.4379310369491577, + "step": 18120 + }, + { + "epoch": 0.018255692502626302, + "grad_norm": 20.66116424283987, + "learning_rate": 1.8255343150091655e-05, + "loss": 2.2243, + "mean_token_accuracy": 0.48275862336158754, + "step": 18125 + }, + { + "epoch": 0.018260728555730476, + "grad_norm": 20.0904786657892, + "learning_rate": 1.8260379106822715e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.3827586233615875, + "step": 18130 + }, + { + "epoch": 0.018265764608834646, + "grad_norm": 22.21856935231415, + "learning_rate": 1.8265415063553777e-05, + "loss": 2.2486, + "mean_token_accuracy": 0.44827585816383364, + "step": 18135 + }, + { + "epoch": 0.01827080066193882, + "grad_norm": 21.22679228510795, + "learning_rate": 1.8270451020284833e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.43103448748588563, + "step": 18140 + }, + { + "epoch": 0.018275836715042994, + "grad_norm": 20.21340258007444, + "learning_rate": 1.8275486977015892e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.4068965494632721, + "step": 18145 + }, + { + "epoch": 0.018280872768147164, + "grad_norm": 24.636832974506714, + "learning_rate": 1.8280522933746955e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.4310344815254211, + "step": 18150 + }, + { + "epoch": 0.018285908821251338, + "grad_norm": 25.73286651674442, + "learning_rate": 1.8285558890478014e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.441379314661026, + "step": 18155 + }, + { + "epoch": 0.01829094487435551, + "grad_norm": 20.64224988299182, + "learning_rate": 1.8290594847209073e-05, + "loss": 2.165, + "mean_token_accuracy": 0.4172413766384125, + "step": 18160 + }, + { + "epoch": 0.018295980927459685, + "grad_norm": 32.5435038140328, + "learning_rate": 1.8295630803940133e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.4379310250282288, + "step": 18165 + }, + { + "epoch": 0.018301016980563856, + "grad_norm": 23.232260552158536, + "learning_rate": 1.8300666760671192e-05, + "loss": 2.7553, + "mean_token_accuracy": 0.38620689511299133, + "step": 18170 + }, + { + "epoch": 0.01830605303366803, + "grad_norm": 21.16191819070584, + "learning_rate": 1.8305702717402255e-05, + "loss": 2.4844, + "mean_token_accuracy": 0.4049606740474701, + "step": 18175 + }, + { + "epoch": 0.018311089086772203, + "grad_norm": 21.791003867464912, + "learning_rate": 1.8310738674133314e-05, + "loss": 2.4461, + "mean_token_accuracy": 0.4676346004009247, + "step": 18180 + }, + { + "epoch": 0.018316125139876373, + "grad_norm": 25.34037013791446, + "learning_rate": 1.831577463086437e-05, + "loss": 2.2647, + "mean_token_accuracy": 0.4655172348022461, + "step": 18185 + }, + { + "epoch": 0.018321161192980547, + "grad_norm": 22.080552835509003, + "learning_rate": 1.8320810587595432e-05, + "loss": 2.0999, + "mean_token_accuracy": 0.441379314661026, + "step": 18190 + }, + { + "epoch": 0.01832619724608472, + "grad_norm": 27.581194693649795, + "learning_rate": 1.832584654432649e-05, + "loss": 2.5972, + "mean_token_accuracy": 0.417241370677948, + "step": 18195 + }, + { + "epoch": 0.018331233299188895, + "grad_norm": 23.409531578584257, + "learning_rate": 1.8330882501057554e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.4931034505367279, + "step": 18200 + }, + { + "epoch": 0.018336269352293065, + "grad_norm": 24.20278832127102, + "learning_rate": 1.833591845778861e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.3896551728248596, + "step": 18205 + }, + { + "epoch": 0.01834130540539724, + "grad_norm": 29.951294493477235, + "learning_rate": 1.834095441451967e-05, + "loss": 2.397, + "mean_token_accuracy": 0.42758620381355283, + "step": 18210 + }, + { + "epoch": 0.018346341458501413, + "grad_norm": 24.22736942119158, + "learning_rate": 1.8345990371250732e-05, + "loss": 2.2892, + "mean_token_accuracy": 0.4517241358757019, + "step": 18215 + }, + { + "epoch": 0.018351377511605583, + "grad_norm": 19.50679996040326, + "learning_rate": 1.835102632798179e-05, + "loss": 2.5672, + "mean_token_accuracy": 0.38275861740112305, + "step": 18220 + }, + { + "epoch": 0.018356413564709757, + "grad_norm": 25.986453028308706, + "learning_rate": 1.835606228471285e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.4344827592372894, + "step": 18225 + }, + { + "epoch": 0.01836144961781393, + "grad_norm": 21.306181038151017, + "learning_rate": 1.836109824144391e-05, + "loss": 2.1886, + "mean_token_accuracy": 0.4551724076271057, + "step": 18230 + }, + { + "epoch": 0.018366485670918104, + "grad_norm": 25.16754083291629, + "learning_rate": 1.836613419817497e-05, + "loss": 2.2746, + "mean_token_accuracy": 0.458620685338974, + "step": 18235 + }, + { + "epoch": 0.018371521724022274, + "grad_norm": 24.206454312674186, + "learning_rate": 1.837117015490603e-05, + "loss": 2.5027, + "mean_token_accuracy": 0.4206896543502808, + "step": 18240 + }, + { + "epoch": 0.018376557777126448, + "grad_norm": 23.248629177404304, + "learning_rate": 1.837620611163709e-05, + "loss": 2.7173, + "mean_token_accuracy": 0.4034482777118683, + "step": 18245 + }, + { + "epoch": 0.018381593830230622, + "grad_norm": 25.15605550080502, + "learning_rate": 1.838124206836815e-05, + "loss": 2.135, + "mean_token_accuracy": 0.4465819776058197, + "step": 18250 + }, + { + "epoch": 0.018386629883334792, + "grad_norm": 26.67414574265421, + "learning_rate": 1.838627802509921e-05, + "loss": 2.5828, + "mean_token_accuracy": 0.41034482717514037, + "step": 18255 + }, + { + "epoch": 0.018391665936438966, + "grad_norm": 36.44143395868395, + "learning_rate": 1.839131398183027e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.4807881832122803, + "step": 18260 + }, + { + "epoch": 0.01839670198954314, + "grad_norm": 21.887312294518598, + "learning_rate": 1.8396349938561328e-05, + "loss": 2.122, + "mean_token_accuracy": 0.4724137902259827, + "step": 18265 + }, + { + "epoch": 0.018401738042647314, + "grad_norm": 24.12697173675327, + "learning_rate": 1.840138589529239e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.4172413766384125, + "step": 18270 + }, + { + "epoch": 0.018406774095751484, + "grad_norm": 20.76582688741353, + "learning_rate": 1.8406421852023446e-05, + "loss": 2.4333, + "mean_token_accuracy": 0.4034482777118683, + "step": 18275 + }, + { + "epoch": 0.018411810148855658, + "grad_norm": 19.30215815795281, + "learning_rate": 1.841145780875451e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.4172413766384125, + "step": 18280 + }, + { + "epoch": 0.01841684620195983, + "grad_norm": 18.933301816078163, + "learning_rate": 1.841649376548557e-05, + "loss": 2.449, + "mean_token_accuracy": 0.4206896543502808, + "step": 18285 + }, + { + "epoch": 0.018421882255064, + "grad_norm": 25.17672256729281, + "learning_rate": 1.8421529722216628e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.39655172228813174, + "step": 18290 + }, + { + "epoch": 0.018426918308168175, + "grad_norm": 20.298991493644095, + "learning_rate": 1.8426565678947687e-05, + "loss": 2.548, + "mean_token_accuracy": 0.4620689570903778, + "step": 18295 + }, + { + "epoch": 0.01843195436127235, + "grad_norm": 33.72948507543238, + "learning_rate": 1.8431601635678746e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.417241370677948, + "step": 18300 + }, + { + "epoch": 0.018436990414376523, + "grad_norm": 24.376558557746463, + "learning_rate": 1.8436637592409805e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.36896551251411436, + "step": 18305 + }, + { + "epoch": 0.018442026467480693, + "grad_norm": 24.105309686406684, + "learning_rate": 1.8441673549140868e-05, + "loss": 2.447, + "mean_token_accuracy": 0.4206896543502808, + "step": 18310 + }, + { + "epoch": 0.018447062520584867, + "grad_norm": 17.683494157745173, + "learning_rate": 1.8446709505871927e-05, + "loss": 2.4752, + "mean_token_accuracy": 0.4172413766384125, + "step": 18315 + }, + { + "epoch": 0.01845209857368904, + "grad_norm": 24.057333381526337, + "learning_rate": 1.8451745462602986e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.41379310488700866, + "step": 18320 + }, + { + "epoch": 0.01845713462679321, + "grad_norm": 17.076969746205037, + "learning_rate": 1.8456781419334046e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.47084090709686277, + "step": 18325 + }, + { + "epoch": 0.018462170679897385, + "grad_norm": 33.926869220634046, + "learning_rate": 1.8461817376065105e-05, + "loss": 2.2014, + "mean_token_accuracy": 0.38620689511299133, + "step": 18330 + }, + { + "epoch": 0.01846720673300156, + "grad_norm": 26.747349952152167, + "learning_rate": 1.8466853332796168e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.3482758641242981, + "step": 18335 + }, + { + "epoch": 0.018472242786105732, + "grad_norm": 19.441859540332494, + "learning_rate": 1.8471889289527224e-05, + "loss": 2.7548, + "mean_token_accuracy": 0.4068965494632721, + "step": 18340 + }, + { + "epoch": 0.018477278839209903, + "grad_norm": 18.43005966056521, + "learning_rate": 1.8476925246258283e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.4206896543502808, + "step": 18345 + }, + { + "epoch": 0.018482314892314076, + "grad_norm": 24.168112688403802, + "learning_rate": 1.8481961202989345e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.4551724135875702, + "step": 18350 + }, + { + "epoch": 0.01848735094541825, + "grad_norm": 35.506522947631794, + "learning_rate": 1.8486997159720405e-05, + "loss": 2.541, + "mean_token_accuracy": 0.4118572294712067, + "step": 18355 + }, + { + "epoch": 0.01849238699852242, + "grad_norm": 16.67603722301561, + "learning_rate": 1.8492033116451464e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.4344827592372894, + "step": 18360 + }, + { + "epoch": 0.018497423051626594, + "grad_norm": 23.02598946078082, + "learning_rate": 1.8497069073182523e-05, + "loss": 2.5876, + "mean_token_accuracy": 0.42068966031074523, + "step": 18365 + }, + { + "epoch": 0.018502459104730768, + "grad_norm": 29.81051655374456, + "learning_rate": 1.8502105029913582e-05, + "loss": 2.3613, + "mean_token_accuracy": 0.4464609742164612, + "step": 18370 + }, + { + "epoch": 0.018507495157834942, + "grad_norm": 24.071169605696266, + "learning_rate": 1.8507140986644645e-05, + "loss": 2.6156, + "mean_token_accuracy": 0.4206896543502808, + "step": 18375 + }, + { + "epoch": 0.018512531210939112, + "grad_norm": 17.924317123757, + "learning_rate": 1.8512176943375704e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.42413792610168455, + "step": 18380 + }, + { + "epoch": 0.018517567264043286, + "grad_norm": 23.6614030272707, + "learning_rate": 1.851721290010676e-05, + "loss": 2.1647, + "mean_token_accuracy": 0.4662561535835266, + "step": 18385 + }, + { + "epoch": 0.01852260331714746, + "grad_norm": 24.344594508760775, + "learning_rate": 1.8522248856837823e-05, + "loss": 2.8046, + "mean_token_accuracy": 0.3965517282485962, + "step": 18390 + }, + { + "epoch": 0.01852763937025163, + "grad_norm": 29.05329447343585, + "learning_rate": 1.8527284813568882e-05, + "loss": 2.795, + "mean_token_accuracy": 0.38965516686439516, + "step": 18395 + }, + { + "epoch": 0.018532675423355804, + "grad_norm": 23.44668463015132, + "learning_rate": 1.8532320770299945e-05, + "loss": 2.6074, + "mean_token_accuracy": 0.4172413766384125, + "step": 18400 + }, + { + "epoch": 0.018537711476459977, + "grad_norm": 25.68964458542458, + "learning_rate": 1.8537356727031e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.3999999940395355, + "step": 18405 + }, + { + "epoch": 0.01854274752956415, + "grad_norm": 19.79096826839757, + "learning_rate": 1.854239268376206e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.4, + "step": 18410 + }, + { + "epoch": 0.01854778358266832, + "grad_norm": 25.116766876971045, + "learning_rate": 1.8547428640493122e-05, + "loss": 2.4208, + "mean_token_accuracy": 0.4655172348022461, + "step": 18415 + }, + { + "epoch": 0.018552819635772495, + "grad_norm": 21.10956035204785, + "learning_rate": 1.8552464597224182e-05, + "loss": 2.7123, + "mean_token_accuracy": 0.39999998807907106, + "step": 18420 + }, + { + "epoch": 0.01855785568887667, + "grad_norm": 28.40286701907159, + "learning_rate": 1.8557500553955244e-05, + "loss": 2.2128, + "mean_token_accuracy": 0.4263157844543457, + "step": 18425 + }, + { + "epoch": 0.01856289174198084, + "grad_norm": 20.320657102430992, + "learning_rate": 1.85625365106863e-05, + "loss": 2.2389, + "mean_token_accuracy": 0.43224440813064574, + "step": 18430 + }, + { + "epoch": 0.018567927795085013, + "grad_norm": 26.43193860882125, + "learning_rate": 1.856757246741736e-05, + "loss": 2.179, + "mean_token_accuracy": 0.4620689690113068, + "step": 18435 + }, + { + "epoch": 0.018572963848189187, + "grad_norm": 26.029482516303847, + "learning_rate": 1.8572608424148422e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.4448275864124298, + "step": 18440 + }, + { + "epoch": 0.01857799990129336, + "grad_norm": 26.36269070776917, + "learning_rate": 1.857764438087948e-05, + "loss": 2.5901, + "mean_token_accuracy": 0.42758620381355283, + "step": 18445 + }, + { + "epoch": 0.01858303595439753, + "grad_norm": 24.25198266824291, + "learning_rate": 1.858268033761054e-05, + "loss": 2.277, + "mean_token_accuracy": 0.4517241299152374, + "step": 18450 + }, + { + "epoch": 0.018588072007501705, + "grad_norm": 20.965138881577037, + "learning_rate": 1.85877162943416e-05, + "loss": 2.1715, + "mean_token_accuracy": 0.4517241418361664, + "step": 18455 + }, + { + "epoch": 0.01859310806060588, + "grad_norm": 27.964006325279833, + "learning_rate": 1.859275225107266e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.3827586233615875, + "step": 18460 + }, + { + "epoch": 0.01859814411371005, + "grad_norm": 19.795094147467587, + "learning_rate": 1.8597788207803722e-05, + "loss": 2.0803, + "mean_token_accuracy": 0.45359951853752134, + "step": 18465 + }, + { + "epoch": 0.018603180166814223, + "grad_norm": 23.69274929922452, + "learning_rate": 1.860282416453478e-05, + "loss": 2.2592, + "mean_token_accuracy": 0.4620689630508423, + "step": 18470 + }, + { + "epoch": 0.018608216219918396, + "grad_norm": 25.75426115793313, + "learning_rate": 1.8607860121265837e-05, + "loss": 2.7545, + "mean_token_accuracy": 0.4034482777118683, + "step": 18475 + }, + { + "epoch": 0.01861325227302257, + "grad_norm": 25.138947021390557, + "learning_rate": 1.86128960779969e-05, + "loss": 2.608, + "mean_token_accuracy": 0.41542649269104004, + "step": 18480 + }, + { + "epoch": 0.01861828832612674, + "grad_norm": 20.300810854767338, + "learning_rate": 1.861793203472796e-05, + "loss": 2.4421, + "mean_token_accuracy": 0.45033273100852966, + "step": 18485 + }, + { + "epoch": 0.018623324379230914, + "grad_norm": 20.708512153951613, + "learning_rate": 1.8622967991459018e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.4137930989265442, + "step": 18490 + }, + { + "epoch": 0.018628360432335088, + "grad_norm": 18.960310294395807, + "learning_rate": 1.8628003948190077e-05, + "loss": 2.3082, + "mean_token_accuracy": 0.47586206197738645, + "step": 18495 + }, + { + "epoch": 0.018633396485439258, + "grad_norm": 28.23100233338147, + "learning_rate": 1.8633039904921137e-05, + "loss": 2.5978, + "mean_token_accuracy": 0.39655172228813174, + "step": 18500 + }, + { + "epoch": 0.018638432538543432, + "grad_norm": 22.786989551854223, + "learning_rate": 1.86380758616522e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.4172413766384125, + "step": 18505 + }, + { + "epoch": 0.018643468591647606, + "grad_norm": 18.117899626868255, + "learning_rate": 1.864311181838326e-05, + "loss": 2.1059, + "mean_token_accuracy": 0.47241379618644713, + "step": 18510 + }, + { + "epoch": 0.01864850464475178, + "grad_norm": 25.602680263941036, + "learning_rate": 1.8648147775114318e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.37586207389831544, + "step": 18515 + }, + { + "epoch": 0.01865354069785595, + "grad_norm": 24.191454159131858, + "learning_rate": 1.8653183731845377e-05, + "loss": 2.5345, + "mean_token_accuracy": 0.3896551728248596, + "step": 18520 + }, + { + "epoch": 0.018658576750960124, + "grad_norm": 23.13403813802116, + "learning_rate": 1.8658219688576436e-05, + "loss": 2.4672, + "mean_token_accuracy": 0.39655172228813174, + "step": 18525 + }, + { + "epoch": 0.018663612804064297, + "grad_norm": 23.663311339393506, + "learning_rate": 1.8663255645307495e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.42068964838981626, + "step": 18530 + }, + { + "epoch": 0.018668648857168468, + "grad_norm": 27.471039374875893, + "learning_rate": 1.8668291602038558e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.4137930989265442, + "step": 18535 + }, + { + "epoch": 0.01867368491027264, + "grad_norm": 23.03935742764747, + "learning_rate": 1.8673327558769614e-05, + "loss": 2.5281, + "mean_token_accuracy": 0.3655172407627106, + "step": 18540 + }, + { + "epoch": 0.018678720963376815, + "grad_norm": 20.12579561621323, + "learning_rate": 1.8678363515500677e-05, + "loss": 2.0713, + "mean_token_accuracy": 0.4571082890033722, + "step": 18545 + }, + { + "epoch": 0.01868375701648099, + "grad_norm": 21.439037772772547, + "learning_rate": 1.8683399472231736e-05, + "loss": 2.3707, + "mean_token_accuracy": 0.4433151841163635, + "step": 18550 + }, + { + "epoch": 0.01868879306958516, + "grad_norm": 30.227392675033723, + "learning_rate": 1.8688435428962795e-05, + "loss": 2.3508, + "mean_token_accuracy": 0.4310344815254211, + "step": 18555 + }, + { + "epoch": 0.018693829122689333, + "grad_norm": 18.40296840728748, + "learning_rate": 1.8693471385693854e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.43774954676628114, + "step": 18560 + }, + { + "epoch": 0.018698865175793507, + "grad_norm": 25.110820555236238, + "learning_rate": 1.8698507342424914e-05, + "loss": 2.7454, + "mean_token_accuracy": 0.39171203672885896, + "step": 18565 + }, + { + "epoch": 0.018703901228897677, + "grad_norm": 19.20230503447168, + "learning_rate": 1.8703543299155973e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.4655172348022461, + "step": 18570 + }, + { + "epoch": 0.01870893728200185, + "grad_norm": 20.745675559486628, + "learning_rate": 1.8708579255887035e-05, + "loss": 2.4282, + "mean_token_accuracy": 0.42613430619239806, + "step": 18575 + }, + { + "epoch": 0.018713973335106025, + "grad_norm": 20.523347873253048, + "learning_rate": 1.8713615212618095e-05, + "loss": 2.6128, + "mean_token_accuracy": 0.41034482717514037, + "step": 18580 + }, + { + "epoch": 0.0187190093882102, + "grad_norm": 26.002638657581258, + "learning_rate": 1.8718651169349154e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.4034482777118683, + "step": 18585 + }, + { + "epoch": 0.01872404544131437, + "grad_norm": 19.715752976807146, + "learning_rate": 1.8723687126080213e-05, + "loss": 2.6948, + "mean_token_accuracy": 0.3807622492313385, + "step": 18590 + }, + { + "epoch": 0.018729081494418542, + "grad_norm": 18.18784159170998, + "learning_rate": 1.8728723082811273e-05, + "loss": 2.6336, + "mean_token_accuracy": 0.441379314661026, + "step": 18595 + }, + { + "epoch": 0.018734117547522716, + "grad_norm": 19.16161870410463, + "learning_rate": 1.8733759039542335e-05, + "loss": 2.2031, + "mean_token_accuracy": 0.4551724135875702, + "step": 18600 + }, + { + "epoch": 0.018739153600626886, + "grad_norm": 23.118294381776135, + "learning_rate": 1.8738794996273394e-05, + "loss": 2.4119, + "mean_token_accuracy": 0.47586206197738645, + "step": 18605 + }, + { + "epoch": 0.01874418965373106, + "grad_norm": 17.85852863510062, + "learning_rate": 1.874383095300445e-05, + "loss": 2.6663, + "mean_token_accuracy": 0.4103448212146759, + "step": 18610 + }, + { + "epoch": 0.018749225706835234, + "grad_norm": 18.90992499439653, + "learning_rate": 1.8748866909735513e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.43103447556495667, + "step": 18615 + }, + { + "epoch": 0.018754261759939408, + "grad_norm": 24.132692004407723, + "learning_rate": 1.8753902866466572e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.42068964838981626, + "step": 18620 + }, + { + "epoch": 0.018759297813043578, + "grad_norm": 24.32289904119032, + "learning_rate": 1.8758938823197635e-05, + "loss": 2.4022, + "mean_token_accuracy": 0.4471264362335205, + "step": 18625 + }, + { + "epoch": 0.018764333866147752, + "grad_norm": 20.352546763306705, + "learning_rate": 1.876397477992869e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4137930989265442, + "step": 18630 + }, + { + "epoch": 0.018769369919251926, + "grad_norm": 20.132813035546505, + "learning_rate": 1.876901073665975e-05, + "loss": 2.2938, + "mean_token_accuracy": 0.47931033968925474, + "step": 18635 + }, + { + "epoch": 0.018774405972356096, + "grad_norm": 20.924008918925406, + "learning_rate": 1.8774046693390813e-05, + "loss": 2.7575, + "mean_token_accuracy": 0.3517241388559341, + "step": 18640 + }, + { + "epoch": 0.01877944202546027, + "grad_norm": 22.971882668830713, + "learning_rate": 1.8779082650121872e-05, + "loss": 2.9397, + "mean_token_accuracy": 0.3655172407627106, + "step": 18645 + }, + { + "epoch": 0.018784478078564443, + "grad_norm": 20.384143348866996, + "learning_rate": 1.878411860685293e-05, + "loss": 2.205, + "mean_token_accuracy": 0.45172412395477296, + "step": 18650 + }, + { + "epoch": 0.018789514131668617, + "grad_norm": 20.90598635261647, + "learning_rate": 1.878915456358399e-05, + "loss": 2.2686, + "mean_token_accuracy": 0.44137930274009707, + "step": 18655 + }, + { + "epoch": 0.018794550184772787, + "grad_norm": 18.391497744588737, + "learning_rate": 1.879419052031505e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.3862069010734558, + "step": 18660 + }, + { + "epoch": 0.01879958623787696, + "grad_norm": 22.68095456033905, + "learning_rate": 1.8799226477046112e-05, + "loss": 2.353, + "mean_token_accuracy": 0.4413793087005615, + "step": 18665 + }, + { + "epoch": 0.018804622290981135, + "grad_norm": 27.357647063369004, + "learning_rate": 1.880426243377717e-05, + "loss": 2.6484, + "mean_token_accuracy": 0.41724138855934145, + "step": 18670 + }, + { + "epoch": 0.018809658344085305, + "grad_norm": 19.81846864631288, + "learning_rate": 1.8809298390508227e-05, + "loss": 2.4199, + "mean_token_accuracy": 0.4448275864124298, + "step": 18675 + }, + { + "epoch": 0.01881469439718948, + "grad_norm": 26.03359990295359, + "learning_rate": 1.881433434723929e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.40344828367233276, + "step": 18680 + }, + { + "epoch": 0.018819730450293653, + "grad_norm": 19.701604642943927, + "learning_rate": 1.881937030397035e-05, + "loss": 2.3354, + "mean_token_accuracy": 0.41724138259887694, + "step": 18685 + }, + { + "epoch": 0.018824766503397827, + "grad_norm": 20.66082461782884, + "learning_rate": 1.882440626070141e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.42413793206214906, + "step": 18690 + }, + { + "epoch": 0.018829802556501997, + "grad_norm": 24.63902668536542, + "learning_rate": 1.8829442217432468e-05, + "loss": 2.2573, + "mean_token_accuracy": 0.4295220851898193, + "step": 18695 + }, + { + "epoch": 0.01883483860960617, + "grad_norm": 30.656894829562354, + "learning_rate": 1.8834478174163527e-05, + "loss": 3.0471, + "mean_token_accuracy": 0.41379310488700866, + "step": 18700 + }, + { + "epoch": 0.018839874662710344, + "grad_norm": 20.04233789310212, + "learning_rate": 1.883951413089459e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.3827586203813553, + "step": 18705 + }, + { + "epoch": 0.018844910715814515, + "grad_norm": 30.93024479269929, + "learning_rate": 1.884455008762565e-05, + "loss": 2.5824, + "mean_token_accuracy": 0.3758620649576187, + "step": 18710 + }, + { + "epoch": 0.01884994676891869, + "grad_norm": 20.213321690357215, + "learning_rate": 1.8849586044356708e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.46745312213897705, + "step": 18715 + }, + { + "epoch": 0.018854982822022862, + "grad_norm": 28.4021018595473, + "learning_rate": 1.8854622001087767e-05, + "loss": 3.0217, + "mean_token_accuracy": 0.38965517580509185, + "step": 18720 + }, + { + "epoch": 0.018860018875127036, + "grad_norm": 30.664398158741548, + "learning_rate": 1.8859657957818827e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.42758620977401735, + "step": 18725 + }, + { + "epoch": 0.018865054928231206, + "grad_norm": 18.888463410119773, + "learning_rate": 1.8864693914549886e-05, + "loss": 2.3902, + "mean_token_accuracy": 0.44827587008476255, + "step": 18730 + }, + { + "epoch": 0.01887009098133538, + "grad_norm": 23.481759982635758, + "learning_rate": 1.886972987128095e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.3862068891525269, + "step": 18735 + }, + { + "epoch": 0.018875127034439554, + "grad_norm": 24.201531574535828, + "learning_rate": 1.8874765828012004e-05, + "loss": 2.4099, + "mean_token_accuracy": 0.43103447556495667, + "step": 18740 + }, + { + "epoch": 0.018880163087543724, + "grad_norm": 17.2269666441305, + "learning_rate": 1.8879801784743067e-05, + "loss": 2.0047, + "mean_token_accuracy": 0.47586206793785096, + "step": 18745 + }, + { + "epoch": 0.018885199140647898, + "grad_norm": 37.174245614286264, + "learning_rate": 1.8884837741474126e-05, + "loss": 2.6555, + "mean_token_accuracy": 0.4050211727619171, + "step": 18750 + }, + { + "epoch": 0.01889023519375207, + "grad_norm": 17.887613203658898, + "learning_rate": 1.8889873698205186e-05, + "loss": 2.462, + "mean_token_accuracy": 0.4517241358757019, + "step": 18755 + }, + { + "epoch": 0.018895271246856245, + "grad_norm": 22.39999922272541, + "learning_rate": 1.8894909654936245e-05, + "loss": 2.1091, + "mean_token_accuracy": 0.5137931108474731, + "step": 18760 + }, + { + "epoch": 0.018900307299960416, + "grad_norm": 29.83249442873979, + "learning_rate": 1.8899945611667304e-05, + "loss": 2.4745, + "mean_token_accuracy": 0.39655172228813174, + "step": 18765 + }, + { + "epoch": 0.01890534335306459, + "grad_norm": 23.89755745016707, + "learning_rate": 1.8904981568398363e-05, + "loss": 2.7878, + "mean_token_accuracy": 0.3655172407627106, + "step": 18770 + }, + { + "epoch": 0.018910379406168763, + "grad_norm": 20.68289422700322, + "learning_rate": 1.8910017525129426e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.3999999940395355, + "step": 18775 + }, + { + "epoch": 0.018915415459272934, + "grad_norm": 18.21338794531444, + "learning_rate": 1.8915053481860485e-05, + "loss": 2.2646, + "mean_token_accuracy": 0.4896551609039307, + "step": 18780 + }, + { + "epoch": 0.018920451512377107, + "grad_norm": 22.000402602411857, + "learning_rate": 1.8920089438591544e-05, + "loss": 2.3004, + "mean_token_accuracy": 0.48275861144065857, + "step": 18785 + }, + { + "epoch": 0.01892548756548128, + "grad_norm": 22.879675315635378, + "learning_rate": 1.8925125395322604e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.41379310488700866, + "step": 18790 + }, + { + "epoch": 0.018930523618585455, + "grad_norm": 23.42500362417901, + "learning_rate": 1.8930161352053663e-05, + "loss": 2.7684, + "mean_token_accuracy": 0.3482758641242981, + "step": 18795 + }, + { + "epoch": 0.018935559671689625, + "grad_norm": 23.188705991115167, + "learning_rate": 1.8935197308784726e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.42068964838981626, + "step": 18800 + }, + { + "epoch": 0.0189405957247938, + "grad_norm": 20.350090717460624, + "learning_rate": 1.8940233265515785e-05, + "loss": 2.4954, + "mean_token_accuracy": 0.43793103098869324, + "step": 18805 + }, + { + "epoch": 0.018945631777897973, + "grad_norm": 20.2595044789961, + "learning_rate": 1.894526922224684e-05, + "loss": 2.5942, + "mean_token_accuracy": 0.400369456410408, + "step": 18810 + }, + { + "epoch": 0.018950667831002143, + "grad_norm": 31.09178734581473, + "learning_rate": 1.8950305178977903e-05, + "loss": 2.6437, + "mean_token_accuracy": 0.33448275923728943, + "step": 18815 + }, + { + "epoch": 0.018955703884106317, + "grad_norm": 24.66167751656284, + "learning_rate": 1.8955341135708963e-05, + "loss": 2.1062, + "mean_token_accuracy": 0.4344827592372894, + "step": 18820 + }, + { + "epoch": 0.01896073993721049, + "grad_norm": 20.15465544497304, + "learning_rate": 1.8960377092440025e-05, + "loss": 2.4724, + "mean_token_accuracy": 0.41034482717514037, + "step": 18825 + }, + { + "epoch": 0.018965775990314664, + "grad_norm": 29.108649941448636, + "learning_rate": 1.896541304917108e-05, + "loss": 2.5572, + "mean_token_accuracy": 0.42413793206214906, + "step": 18830 + }, + { + "epoch": 0.018970812043418835, + "grad_norm": 16.53741994988761, + "learning_rate": 1.897044900590214e-05, + "loss": 2.591, + "mean_token_accuracy": 0.4068965554237366, + "step": 18835 + }, + { + "epoch": 0.01897584809652301, + "grad_norm": 20.388984480647625, + "learning_rate": 1.8975484962633203e-05, + "loss": 2.6244, + "mean_token_accuracy": 0.441379314661026, + "step": 18840 + }, + { + "epoch": 0.018980884149627182, + "grad_norm": 23.652446278266744, + "learning_rate": 1.8980520919364262e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4448275864124298, + "step": 18845 + }, + { + "epoch": 0.018985920202731352, + "grad_norm": 19.742141238148445, + "learning_rate": 1.898555687609532e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.4034482777118683, + "step": 18850 + }, + { + "epoch": 0.018990956255835526, + "grad_norm": 20.60123636793245, + "learning_rate": 1.899059283282638e-05, + "loss": 2.0559, + "mean_token_accuracy": 0.5206896603107453, + "step": 18855 + }, + { + "epoch": 0.0189959923089397, + "grad_norm": 21.468944623244187, + "learning_rate": 1.899562878955744e-05, + "loss": 2.3469, + "mean_token_accuracy": 0.4896551728248596, + "step": 18860 + }, + { + "epoch": 0.019001028362043874, + "grad_norm": 17.353235706003375, + "learning_rate": 1.9000664746288503e-05, + "loss": 2.0922, + "mean_token_accuracy": 0.49655171632766726, + "step": 18865 + }, + { + "epoch": 0.019006064415148044, + "grad_norm": 28.14462167623131, + "learning_rate": 1.9005700703019562e-05, + "loss": 3.0209, + "mean_token_accuracy": 0.32413792610168457, + "step": 18870 + }, + { + "epoch": 0.019011100468252218, + "grad_norm": 18.975555536316858, + "learning_rate": 1.9010736659750618e-05, + "loss": 2.0986, + "mean_token_accuracy": 0.4862069010734558, + "step": 18875 + }, + { + "epoch": 0.01901613652135639, + "grad_norm": 22.76994866800488, + "learning_rate": 1.901577261648168e-05, + "loss": 2.411, + "mean_token_accuracy": 0.4758620738983154, + "step": 18880 + }, + { + "epoch": 0.019021172574460562, + "grad_norm": 20.948865374630078, + "learning_rate": 1.902080857321274e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.4068965494632721, + "step": 18885 + }, + { + "epoch": 0.019026208627564736, + "grad_norm": 30.26253072264103, + "learning_rate": 1.9025844529943802e-05, + "loss": 2.855, + "mean_token_accuracy": 0.379310342669487, + "step": 18890 + }, + { + "epoch": 0.01903124468066891, + "grad_norm": 25.749921093712405, + "learning_rate": 1.9030880486674858e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.4413793087005615, + "step": 18895 + }, + { + "epoch": 0.019036280733773083, + "grad_norm": 24.526414610094402, + "learning_rate": 1.9035916443405917e-05, + "loss": 2.363, + "mean_token_accuracy": 0.41379310488700866, + "step": 18900 + }, + { + "epoch": 0.019041316786877253, + "grad_norm": 25.657819075648135, + "learning_rate": 1.904095240013698e-05, + "loss": 2.2318, + "mean_token_accuracy": 0.4620689690113068, + "step": 18905 + }, + { + "epoch": 0.019046352839981427, + "grad_norm": 28.70816140072034, + "learning_rate": 1.904598835686804e-05, + "loss": 2.6287, + "mean_token_accuracy": 0.3827586233615875, + "step": 18910 + }, + { + "epoch": 0.0190513888930856, + "grad_norm": 34.635211628022624, + "learning_rate": 1.90510243135991e-05, + "loss": 2.5462, + "mean_token_accuracy": 0.4413793087005615, + "step": 18915 + }, + { + "epoch": 0.01905642494618977, + "grad_norm": 18.94225028225776, + "learning_rate": 1.9056060270330158e-05, + "loss": 2.084, + "mean_token_accuracy": 0.4689655125141144, + "step": 18920 + }, + { + "epoch": 0.019061460999293945, + "grad_norm": 18.62035792038576, + "learning_rate": 1.9061096227061217e-05, + "loss": 2.6458, + "mean_token_accuracy": 0.3896551728248596, + "step": 18925 + }, + { + "epoch": 0.01906649705239812, + "grad_norm": 34.229517364856164, + "learning_rate": 1.906613218379228e-05, + "loss": 2.6708, + "mean_token_accuracy": 0.4517241418361664, + "step": 18930 + }, + { + "epoch": 0.019071533105502293, + "grad_norm": 25.769185841767303, + "learning_rate": 1.907116814052334e-05, + "loss": 2.6084, + "mean_token_accuracy": 0.3827586233615875, + "step": 18935 + }, + { + "epoch": 0.019076569158606463, + "grad_norm": 22.22799869039467, + "learning_rate": 1.9076204097254395e-05, + "loss": 2.2299, + "mean_token_accuracy": 0.4034482777118683, + "step": 18940 + }, + { + "epoch": 0.019081605211710637, + "grad_norm": 25.64921687545923, + "learning_rate": 1.9081240053985457e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.42068966031074523, + "step": 18945 + }, + { + "epoch": 0.01908664126481481, + "grad_norm": 28.145675705745326, + "learning_rate": 1.9086276010716517e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.42758620381355283, + "step": 18950 + }, + { + "epoch": 0.01909167731791898, + "grad_norm": 23.058502847850253, + "learning_rate": 1.9091311967447576e-05, + "loss": 2.5672, + "mean_token_accuracy": 0.4344827651977539, + "step": 18955 + }, + { + "epoch": 0.019096713371023154, + "grad_norm": 21.459581212472585, + "learning_rate": 1.909634792417864e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.3965517163276672, + "step": 18960 + }, + { + "epoch": 0.019101749424127328, + "grad_norm": 31.939235127973767, + "learning_rate": 1.9101383880909694e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.4206896543502808, + "step": 18965 + }, + { + "epoch": 0.019106785477231502, + "grad_norm": 20.528203750990738, + "learning_rate": 1.9106419837640757e-05, + "loss": 2.2257, + "mean_token_accuracy": 0.4620689630508423, + "step": 18970 + }, + { + "epoch": 0.019111821530335672, + "grad_norm": 21.453631207307506, + "learning_rate": 1.9111455794371816e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.44482759237289426, + "step": 18975 + }, + { + "epoch": 0.019116857583439846, + "grad_norm": 64.85160328271458, + "learning_rate": 1.9116491751102876e-05, + "loss": 2.497, + "mean_token_accuracy": 0.4448275864124298, + "step": 18980 + }, + { + "epoch": 0.01912189363654402, + "grad_norm": 24.049164964505476, + "learning_rate": 1.9121527707833935e-05, + "loss": 2.3524, + "mean_token_accuracy": 0.441379314661026, + "step": 18985 + }, + { + "epoch": 0.01912692968964819, + "grad_norm": 23.076237318845802, + "learning_rate": 1.9126563664564994e-05, + "loss": 2.6937, + "mean_token_accuracy": 0.3896551728248596, + "step": 18990 + }, + { + "epoch": 0.019131965742752364, + "grad_norm": 24.031773041287398, + "learning_rate": 1.9131599621296053e-05, + "loss": 2.3618, + "mean_token_accuracy": 0.4034482717514038, + "step": 18995 + }, + { + "epoch": 0.019137001795856538, + "grad_norm": 24.385140679476745, + "learning_rate": 1.9136635578027116e-05, + "loss": 2.3517, + "mean_token_accuracy": 0.4275861978530884, + "step": 19000 + }, + { + "epoch": 0.01914203784896071, + "grad_norm": 21.35885024541804, + "learning_rate": 1.9141671534758175e-05, + "loss": 2.2411, + "mean_token_accuracy": 0.38965516686439516, + "step": 19005 + }, + { + "epoch": 0.01914707390206488, + "grad_norm": 21.07186337358999, + "learning_rate": 1.9146707491489235e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.5021173536777497, + "step": 19010 + }, + { + "epoch": 0.019152109955169055, + "grad_norm": 18.488088610319316, + "learning_rate": 1.9151743448220294e-05, + "loss": 2.7663, + "mean_token_accuracy": 0.38620689511299133, + "step": 19015 + }, + { + "epoch": 0.01915714600827323, + "grad_norm": 17.356027015500718, + "learning_rate": 1.9156779404951353e-05, + "loss": 2.0081, + "mean_token_accuracy": 0.48694581389427183, + "step": 19020 + }, + { + "epoch": 0.0191621820613774, + "grad_norm": 20.176422967642363, + "learning_rate": 1.9161815361682416e-05, + "loss": 2.5009, + "mean_token_accuracy": 0.33448275923728943, + "step": 19025 + }, + { + "epoch": 0.019167218114481573, + "grad_norm": 19.321236228404917, + "learning_rate": 1.916685131841347e-05, + "loss": 2.3869, + "mean_token_accuracy": 0.4310344815254211, + "step": 19030 + }, + { + "epoch": 0.019172254167585747, + "grad_norm": 20.58533999038996, + "learning_rate": 1.917188727514453e-05, + "loss": 2.5406, + "mean_token_accuracy": 0.4000000059604645, + "step": 19035 + }, + { + "epoch": 0.01917729022068992, + "grad_norm": 23.345554869767714, + "learning_rate": 1.9176923231875593e-05, + "loss": 2.3113, + "mean_token_accuracy": 0.48275862336158754, + "step": 19040 + }, + { + "epoch": 0.01918232627379409, + "grad_norm": 25.62405401274355, + "learning_rate": 1.9181959188606653e-05, + "loss": 2.6555, + "mean_token_accuracy": 0.382758629322052, + "step": 19045 + }, + { + "epoch": 0.019187362326898265, + "grad_norm": 22.634158233986092, + "learning_rate": 1.9186995145337712e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.44827585816383364, + "step": 19050 + }, + { + "epoch": 0.01919239838000244, + "grad_norm": 19.071023605774617, + "learning_rate": 1.919203110206877e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.4344827592372894, + "step": 19055 + }, + { + "epoch": 0.01919743443310661, + "grad_norm": 30.103983955026525, + "learning_rate": 1.919706705879983e-05, + "loss": 2.465, + "mean_token_accuracy": 0.41034482717514037, + "step": 19060 + }, + { + "epoch": 0.019202470486210783, + "grad_norm": 23.749913222399545, + "learning_rate": 1.9202103015530893e-05, + "loss": 2.7265, + "mean_token_accuracy": 0.4275862157344818, + "step": 19065 + }, + { + "epoch": 0.019207506539314956, + "grad_norm": 28.846360838484934, + "learning_rate": 1.9207138972261952e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.4379310250282288, + "step": 19070 + }, + { + "epoch": 0.01921254259241913, + "grad_norm": 21.115389956683682, + "learning_rate": 1.9212174928993008e-05, + "loss": 2.6631, + "mean_token_accuracy": 0.3827586233615875, + "step": 19075 + }, + { + "epoch": 0.0192175786455233, + "grad_norm": 27.022949371441303, + "learning_rate": 1.921721088572407e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.3793103516101837, + "step": 19080 + }, + { + "epoch": 0.019222614698627474, + "grad_norm": 28.72932577451179, + "learning_rate": 1.922224684245513e-05, + "loss": 2.4192, + "mean_token_accuracy": 0.39310344457626345, + "step": 19085 + }, + { + "epoch": 0.019227650751731648, + "grad_norm": 21.33406037284613, + "learning_rate": 1.9227282799186193e-05, + "loss": 2.7183, + "mean_token_accuracy": 0.334482753276825, + "step": 19090 + }, + { + "epoch": 0.01923268680483582, + "grad_norm": 23.305160789540917, + "learning_rate": 1.923231875591725e-05, + "loss": 2.399, + "mean_token_accuracy": 0.43793103098869324, + "step": 19095 + }, + { + "epoch": 0.019237722857939992, + "grad_norm": 34.73831446218085, + "learning_rate": 1.9237354712648308e-05, + "loss": 2.7472, + "mean_token_accuracy": 0.38620689511299133, + "step": 19100 + }, + { + "epoch": 0.019242758911044166, + "grad_norm": 21.614758806338134, + "learning_rate": 1.924239066937937e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.3931034505367279, + "step": 19105 + }, + { + "epoch": 0.01924779496414834, + "grad_norm": 25.36006241470092, + "learning_rate": 1.924742662611043e-05, + "loss": 2.3311, + "mean_token_accuracy": 0.4344827651977539, + "step": 19110 + }, + { + "epoch": 0.01925283101725251, + "grad_norm": 18.226525431525708, + "learning_rate": 1.925246258284149e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4620689690113068, + "step": 19115 + }, + { + "epoch": 0.019257867070356684, + "grad_norm": 24.492660683601795, + "learning_rate": 1.9257498539572548e-05, + "loss": 2.135, + "mean_token_accuracy": 0.4448275864124298, + "step": 19120 + }, + { + "epoch": 0.019262903123460857, + "grad_norm": 21.68615301058674, + "learning_rate": 1.9262534496303608e-05, + "loss": 2.3263, + "mean_token_accuracy": 0.4586206912994385, + "step": 19125 + }, + { + "epoch": 0.019267939176565028, + "grad_norm": 23.94939291291365, + "learning_rate": 1.926757045303467e-05, + "loss": 2.3115, + "mean_token_accuracy": 0.4517241418361664, + "step": 19130 + }, + { + "epoch": 0.0192729752296692, + "grad_norm": 21.485230371257042, + "learning_rate": 1.927260640976573e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.42068964838981626, + "step": 19135 + }, + { + "epoch": 0.019278011282773375, + "grad_norm": 21.19594214676024, + "learning_rate": 1.927764236649679e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.36896551847457887, + "step": 19140 + }, + { + "epoch": 0.01928304733587755, + "grad_norm": 25.499974404046863, + "learning_rate": 1.9282678323227848e-05, + "loss": 2.8967, + "mean_token_accuracy": 0.3620689630508423, + "step": 19145 + }, + { + "epoch": 0.01928808338898172, + "grad_norm": 20.904588169592742, + "learning_rate": 1.9287714279958907e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.4504537105560303, + "step": 19150 + }, + { + "epoch": 0.019293119442085893, + "grad_norm": 22.25344689644324, + "learning_rate": 1.9292750236689966e-05, + "loss": 3.0055, + "mean_token_accuracy": 0.3482758641242981, + "step": 19155 + }, + { + "epoch": 0.019298155495190067, + "grad_norm": 20.132923013800138, + "learning_rate": 1.929778619342103e-05, + "loss": 2.0817, + "mean_token_accuracy": 0.47931034564971925, + "step": 19160 + }, + { + "epoch": 0.019303191548294237, + "grad_norm": 21.354640319349592, + "learning_rate": 1.9302822150152085e-05, + "loss": 2.1367, + "mean_token_accuracy": 0.4517241299152374, + "step": 19165 + }, + { + "epoch": 0.01930822760139841, + "grad_norm": 19.3046963605477, + "learning_rate": 1.9307858106883148e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.3862068891525269, + "step": 19170 + }, + { + "epoch": 0.019313263654502585, + "grad_norm": 22.097695379788654, + "learning_rate": 1.9312894063614207e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.4724137902259827, + "step": 19175 + }, + { + "epoch": 0.01931829970760676, + "grad_norm": 22.31561560021954, + "learning_rate": 1.9317930020345266e-05, + "loss": 2.7319, + "mean_token_accuracy": 0.41034482717514037, + "step": 19180 + }, + { + "epoch": 0.01932333576071093, + "grad_norm": 26.417886084764117, + "learning_rate": 1.9322965977076325e-05, + "loss": 2.329, + "mean_token_accuracy": 0.46896551847457885, + "step": 19185 + }, + { + "epoch": 0.019328371813815103, + "grad_norm": 19.866357520399784, + "learning_rate": 1.9328001933807385e-05, + "loss": 2.7874, + "mean_token_accuracy": 0.33103448152542114, + "step": 19190 + }, + { + "epoch": 0.019333407866919276, + "grad_norm": 18.360339379830954, + "learning_rate": 1.9333037890538444e-05, + "loss": 2.3254, + "mean_token_accuracy": 0.4448275864124298, + "step": 19195 + }, + { + "epoch": 0.019338443920023447, + "grad_norm": 25.473783650744327, + "learning_rate": 1.9338073847269506e-05, + "loss": 2.3748, + "mean_token_accuracy": 0.43448275327682495, + "step": 19200 + }, + { + "epoch": 0.01934347997312762, + "grad_norm": 22.726667607811105, + "learning_rate": 1.9343109804000566e-05, + "loss": 2.4949, + "mean_token_accuracy": 0.4, + "step": 19205 + }, + { + "epoch": 0.019348516026231794, + "grad_norm": 21.143753892616008, + "learning_rate": 1.9348145760731625e-05, + "loss": 2.4894, + "mean_token_accuracy": 0.38275861740112305, + "step": 19210 + }, + { + "epoch": 0.019353552079335968, + "grad_norm": 24.746164793535748, + "learning_rate": 1.9353181717462684e-05, + "loss": 2.324, + "mean_token_accuracy": 0.43103447556495667, + "step": 19215 + }, + { + "epoch": 0.019358588132440138, + "grad_norm": 21.17050979255156, + "learning_rate": 1.9358217674193743e-05, + "loss": 2.6149, + "mean_token_accuracy": 0.3999999940395355, + "step": 19220 + }, + { + "epoch": 0.019363624185544312, + "grad_norm": 23.853296853406036, + "learning_rate": 1.9363253630924806e-05, + "loss": 2.6862, + "mean_token_accuracy": 0.3862068891525269, + "step": 19225 + }, + { + "epoch": 0.019368660238648486, + "grad_norm": 27.02083369670501, + "learning_rate": 1.9368289587655862e-05, + "loss": 2.5642, + "mean_token_accuracy": 0.3965517282485962, + "step": 19230 + }, + { + "epoch": 0.019373696291752656, + "grad_norm": 22.91106839604228, + "learning_rate": 1.9373325544386925e-05, + "loss": 2.2112, + "mean_token_accuracy": 0.4448275864124298, + "step": 19235 + }, + { + "epoch": 0.01937873234485683, + "grad_norm": 28.464968452712085, + "learning_rate": 1.9378361501117984e-05, + "loss": 2.6346, + "mean_token_accuracy": 0.39655172228813174, + "step": 19240 + }, + { + "epoch": 0.019383768397961004, + "grad_norm": 20.30334656201811, + "learning_rate": 1.9383397457849043e-05, + "loss": 2.3556, + "mean_token_accuracy": 0.41379310488700866, + "step": 19245 + }, + { + "epoch": 0.019388804451065174, + "grad_norm": 23.40595938767292, + "learning_rate": 1.9388433414580102e-05, + "loss": 2.2606, + "mean_token_accuracy": 0.4241379380226135, + "step": 19250 + }, + { + "epoch": 0.019393840504169348, + "grad_norm": 35.80985472161722, + "learning_rate": 1.939346937131116e-05, + "loss": 2.7535, + "mean_token_accuracy": 0.4103448331356049, + "step": 19255 + }, + { + "epoch": 0.01939887655727352, + "grad_norm": 25.64783571019983, + "learning_rate": 1.939850532804222e-05, + "loss": 2.456, + "mean_token_accuracy": 0.40344828367233276, + "step": 19260 + }, + { + "epoch": 0.019403912610377695, + "grad_norm": 22.055052769733614, + "learning_rate": 1.9403541284773284e-05, + "loss": 2.5328, + "mean_token_accuracy": 0.42413793206214906, + "step": 19265 + }, + { + "epoch": 0.019408948663481865, + "grad_norm": 30.061260574863393, + "learning_rate": 1.9408577241504343e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.43103448748588563, + "step": 19270 + }, + { + "epoch": 0.01941398471658604, + "grad_norm": 23.492195740144325, + "learning_rate": 1.9413613198235402e-05, + "loss": 2.418, + "mean_token_accuracy": 0.4571082890033722, + "step": 19275 + }, + { + "epoch": 0.019419020769690213, + "grad_norm": 23.17806247370198, + "learning_rate": 1.941864915496646e-05, + "loss": 2.2688, + "mean_token_accuracy": 0.46551724076271056, + "step": 19280 + }, + { + "epoch": 0.019424056822794383, + "grad_norm": 25.987372865088407, + "learning_rate": 1.942368511169752e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.4395644307136536, + "step": 19285 + }, + { + "epoch": 0.019429092875898557, + "grad_norm": 18.559552602524132, + "learning_rate": 1.9428721068428583e-05, + "loss": 2.6534, + "mean_token_accuracy": 0.3931034505367279, + "step": 19290 + }, + { + "epoch": 0.01943412892900273, + "grad_norm": 23.03042873979043, + "learning_rate": 1.943375702515964e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.4206896424293518, + "step": 19295 + }, + { + "epoch": 0.019439164982106905, + "grad_norm": 24.926087738102524, + "learning_rate": 1.94387929818907e-05, + "loss": 2.3143, + "mean_token_accuracy": 0.40689654350280763, + "step": 19300 + }, + { + "epoch": 0.019444201035211075, + "grad_norm": 21.260535807099927, + "learning_rate": 1.944382893862176e-05, + "loss": 2.5112, + "mean_token_accuracy": 0.4068965554237366, + "step": 19305 + }, + { + "epoch": 0.01944923708831525, + "grad_norm": 20.591896190660602, + "learning_rate": 1.944886489535282e-05, + "loss": 2.328, + "mean_token_accuracy": 0.4482758641242981, + "step": 19310 + }, + { + "epoch": 0.019454273141419422, + "grad_norm": 20.233529813523838, + "learning_rate": 1.945390085208388e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.48523896336555483, + "step": 19315 + }, + { + "epoch": 0.019459309194523593, + "grad_norm": 18.27460801311187, + "learning_rate": 1.945893680881494e-05, + "loss": 2.7165, + "mean_token_accuracy": 0.4034482777118683, + "step": 19320 + }, + { + "epoch": 0.019464345247627766, + "grad_norm": 21.45706905987862, + "learning_rate": 1.9463972765545998e-05, + "loss": 2.7051, + "mean_token_accuracy": 0.3999999940395355, + "step": 19325 + }, + { + "epoch": 0.01946938130073194, + "grad_norm": 22.056454640252664, + "learning_rate": 1.946900872227706e-05, + "loss": 2.4943, + "mean_token_accuracy": 0.44482759237289426, + "step": 19330 + }, + { + "epoch": 0.019474417353836114, + "grad_norm": 26.35043528970658, + "learning_rate": 1.947404467900812e-05, + "loss": 2.816, + "mean_token_accuracy": 0.3862069010734558, + "step": 19335 + }, + { + "epoch": 0.019479453406940284, + "grad_norm": 29.277167580017586, + "learning_rate": 1.947908063573918e-05, + "loss": 2.3923, + "mean_token_accuracy": 0.4068965554237366, + "step": 19340 + }, + { + "epoch": 0.019484489460044458, + "grad_norm": 22.660510664728218, + "learning_rate": 1.948411659247024e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.40344828367233276, + "step": 19345 + }, + { + "epoch": 0.019489525513148632, + "grad_norm": 16.08706833776135, + "learning_rate": 1.9489152549201298e-05, + "loss": 2.2493, + "mean_token_accuracy": 0.4571082890033722, + "step": 19350 + }, + { + "epoch": 0.019494561566252802, + "grad_norm": 26.745840492307664, + "learning_rate": 1.949418850593236e-05, + "loss": 2.763, + "mean_token_accuracy": 0.39999998807907106, + "step": 19355 + }, + { + "epoch": 0.019499597619356976, + "grad_norm": 20.477973285395915, + "learning_rate": 1.949922446266342e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.4, + "step": 19360 + }, + { + "epoch": 0.01950463367246115, + "grad_norm": 17.145100040157523, + "learning_rate": 1.9504260419394475e-05, + "loss": 2.2553, + "mean_token_accuracy": 0.4379310369491577, + "step": 19365 + }, + { + "epoch": 0.019509669725565323, + "grad_norm": 25.0147791770493, + "learning_rate": 1.9509296376125538e-05, + "loss": 2.0779, + "mean_token_accuracy": 0.5367211163043976, + "step": 19370 + }, + { + "epoch": 0.019514705778669494, + "grad_norm": 25.196541997571952, + "learning_rate": 1.9514332332856597e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.4206896543502808, + "step": 19375 + }, + { + "epoch": 0.019519741831773667, + "grad_norm": 24.755288927952, + "learning_rate": 1.9519368289587657e-05, + "loss": 2.3844, + "mean_token_accuracy": 0.42232305407524107, + "step": 19380 + }, + { + "epoch": 0.01952477788487784, + "grad_norm": 19.04459836152872, + "learning_rate": 1.9524404246318716e-05, + "loss": 2.3192, + "mean_token_accuracy": 0.43103448748588563, + "step": 19385 + }, + { + "epoch": 0.01952981393798201, + "grad_norm": 21.558449894243214, + "learning_rate": 1.9529440203049775e-05, + "loss": 2.1703, + "mean_token_accuracy": 0.4413793087005615, + "step": 19390 + }, + { + "epoch": 0.019534849991086185, + "grad_norm": 19.781588443586138, + "learning_rate": 1.9534476159780838e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.43793103098869324, + "step": 19395 + }, + { + "epoch": 0.01953988604419036, + "grad_norm": 23.29396094065663, + "learning_rate": 1.9539512116511897e-05, + "loss": 2.5847, + "mean_token_accuracy": 0.33793103098869326, + "step": 19400 + }, + { + "epoch": 0.019544922097294533, + "grad_norm": 20.581060755594958, + "learning_rate": 1.9544548073242956e-05, + "loss": 2.4807, + "mean_token_accuracy": 0.4344827651977539, + "step": 19405 + }, + { + "epoch": 0.019549958150398703, + "grad_norm": 23.24665183560334, + "learning_rate": 1.9549584029974015e-05, + "loss": 2.7321, + "mean_token_accuracy": 0.35862069129943847, + "step": 19410 + }, + { + "epoch": 0.019554994203502877, + "grad_norm": 23.020710398695247, + "learning_rate": 1.9554619986705075e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.4448275864124298, + "step": 19415 + }, + { + "epoch": 0.01956003025660705, + "grad_norm": 23.059783685177063, + "learning_rate": 1.9559655943436134e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.41034482717514037, + "step": 19420 + }, + { + "epoch": 0.01956506630971122, + "grad_norm": 18.711438378302265, + "learning_rate": 1.9564691900167197e-05, + "loss": 2.2146, + "mean_token_accuracy": 0.44482758045196535, + "step": 19425 + }, + { + "epoch": 0.019570102362815395, + "grad_norm": 22.450052771439, + "learning_rate": 1.9569727856898252e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.42589232325553894, + "step": 19430 + }, + { + "epoch": 0.01957513841591957, + "grad_norm": 22.64552852095471, + "learning_rate": 1.9574763813629315e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.4448275864124298, + "step": 19435 + }, + { + "epoch": 0.019580174469023742, + "grad_norm": 18.9687169723688, + "learning_rate": 1.9579799770360374e-05, + "loss": 3.0165, + "mean_token_accuracy": 0.417241370677948, + "step": 19440 + }, + { + "epoch": 0.019585210522127913, + "grad_norm": 22.678322909084194, + "learning_rate": 1.9584835727091434e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.38965517580509185, + "step": 19445 + }, + { + "epoch": 0.019590246575232086, + "grad_norm": 44.06056001444506, + "learning_rate": 1.9589871683822493e-05, + "loss": 2.4834, + "mean_token_accuracy": 0.3827586233615875, + "step": 19450 + }, + { + "epoch": 0.01959528262833626, + "grad_norm": 25.35535799068325, + "learning_rate": 1.9594907640553552e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.4448275864124298, + "step": 19455 + }, + { + "epoch": 0.01960031868144043, + "grad_norm": 20.428422659854686, + "learning_rate": 1.959994359728461e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.3931034505367279, + "step": 19460 + }, + { + "epoch": 0.019605354734544604, + "grad_norm": 21.926657100495863, + "learning_rate": 1.9604979554015674e-05, + "loss": 2.4568, + "mean_token_accuracy": 0.37241379022598264, + "step": 19465 + }, + { + "epoch": 0.019610390787648778, + "grad_norm": 18.571065640052613, + "learning_rate": 1.9610015510746733e-05, + "loss": 2.1219, + "mean_token_accuracy": 0.5000000119209289, + "step": 19470 + }, + { + "epoch": 0.01961542684075295, + "grad_norm": 19.99694544236865, + "learning_rate": 1.9615051467477792e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.4413793087005615, + "step": 19475 + }, + { + "epoch": 0.019620462893857122, + "grad_norm": 23.29830442478529, + "learning_rate": 1.9620087424208852e-05, + "loss": 2.6918, + "mean_token_accuracy": 0.3758620619773865, + "step": 19480 + }, + { + "epoch": 0.019625498946961296, + "grad_norm": 28.014318131290587, + "learning_rate": 1.962512338093991e-05, + "loss": 2.7607, + "mean_token_accuracy": 0.3965517282485962, + "step": 19485 + }, + { + "epoch": 0.01963053500006547, + "grad_norm": 24.013684902351578, + "learning_rate": 1.9630159337670974e-05, + "loss": 2.4681, + "mean_token_accuracy": 0.38620689511299133, + "step": 19490 + }, + { + "epoch": 0.01963557105316964, + "grad_norm": 19.9055592544922, + "learning_rate": 1.9635195294402033e-05, + "loss": 2.2672, + "mean_token_accuracy": 0.4379310369491577, + "step": 19495 + }, + { + "epoch": 0.019640607106273814, + "grad_norm": 22.794552668528056, + "learning_rate": 1.964023125113309e-05, + "loss": 2.4731, + "mean_token_accuracy": 0.39655172526836396, + "step": 19500 + }, + { + "epoch": 0.019645643159377987, + "grad_norm": 26.673363214235547, + "learning_rate": 1.964526720786415e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.40163338780403135, + "step": 19505 + }, + { + "epoch": 0.01965067921248216, + "grad_norm": 23.1969981139211, + "learning_rate": 1.965030316459521e-05, + "loss": 2.619, + "mean_token_accuracy": 0.35862069129943847, + "step": 19510 + }, + { + "epoch": 0.01965571526558633, + "grad_norm": 20.98372477459728, + "learning_rate": 1.9655339121326273e-05, + "loss": 2.4192, + "mean_token_accuracy": 0.43793103098869324, + "step": 19515 + }, + { + "epoch": 0.019660751318690505, + "grad_norm": 18.775265012060057, + "learning_rate": 1.966037507805733e-05, + "loss": 2.2992, + "mean_token_accuracy": 0.4379310369491577, + "step": 19520 + }, + { + "epoch": 0.01966578737179468, + "grad_norm": 20.382438064492526, + "learning_rate": 1.966541103478839e-05, + "loss": 2.4878, + "mean_token_accuracy": 0.3999999940395355, + "step": 19525 + }, + { + "epoch": 0.01967082342489885, + "grad_norm": 22.513270262911472, + "learning_rate": 1.967044699151945e-05, + "loss": 2.6284, + "mean_token_accuracy": 0.43629764318466185, + "step": 19530 + }, + { + "epoch": 0.019675859478003023, + "grad_norm": 21.774391755695813, + "learning_rate": 1.967548294825051e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4275862157344818, + "step": 19535 + }, + { + "epoch": 0.019680895531107197, + "grad_norm": 22.771118313889335, + "learning_rate": 1.968051890498157e-05, + "loss": 2.654, + "mean_token_accuracy": 0.36896551847457887, + "step": 19540 + }, + { + "epoch": 0.01968593158421137, + "grad_norm": 21.313805036033248, + "learning_rate": 1.968555486171263e-05, + "loss": 2.2671, + "mean_token_accuracy": 0.4896551609039307, + "step": 19545 + }, + { + "epoch": 0.01969096763731554, + "grad_norm": 28.38610331227566, + "learning_rate": 1.9690590818443688e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.3655172407627106, + "step": 19550 + }, + { + "epoch": 0.019696003690419715, + "grad_norm": 21.180729103172418, + "learning_rate": 1.969562677517475e-05, + "loss": 2.2293, + "mean_token_accuracy": 0.4068965494632721, + "step": 19555 + }, + { + "epoch": 0.01970103974352389, + "grad_norm": 24.115690333598778, + "learning_rate": 1.970066273190581e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.43103447556495667, + "step": 19560 + }, + { + "epoch": 0.01970607579662806, + "grad_norm": 25.41951153973096, + "learning_rate": 1.9705698688636866e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.4678161025047302, + "step": 19565 + }, + { + "epoch": 0.019711111849732232, + "grad_norm": 22.571575768541788, + "learning_rate": 1.971073464536793e-05, + "loss": 2.3521, + "mean_token_accuracy": 0.3965517282485962, + "step": 19570 + }, + { + "epoch": 0.019716147902836406, + "grad_norm": 25.122398212189243, + "learning_rate": 1.9715770602098988e-05, + "loss": 2.429, + "mean_token_accuracy": 0.4137930989265442, + "step": 19575 + }, + { + "epoch": 0.01972118395594058, + "grad_norm": 18.59421859271088, + "learning_rate": 1.9720806558830047e-05, + "loss": 2.299, + "mean_token_accuracy": 0.4379310369491577, + "step": 19580 + }, + { + "epoch": 0.01972622000904475, + "grad_norm": 30.89035485234619, + "learning_rate": 1.9725842515561106e-05, + "loss": 2.159, + "mean_token_accuracy": 0.4488916337490082, + "step": 19585 + }, + { + "epoch": 0.019731256062148924, + "grad_norm": 22.616968370618622, + "learning_rate": 1.9730878472292165e-05, + "loss": 2.6236, + "mean_token_accuracy": 0.4206896543502808, + "step": 19590 + }, + { + "epoch": 0.019736292115253098, + "grad_norm": 21.267021295042294, + "learning_rate": 1.9735914429023228e-05, + "loss": 2.2882, + "mean_token_accuracy": 0.43956443667411804, + "step": 19595 + }, + { + "epoch": 0.019741328168357268, + "grad_norm": 19.578745283777145, + "learning_rate": 1.9740950385754287e-05, + "loss": 2.633, + "mean_token_accuracy": 0.38620689511299133, + "step": 19600 + }, + { + "epoch": 0.019746364221461442, + "grad_norm": 25.107282899046538, + "learning_rate": 1.9745986342485347e-05, + "loss": 2.5779, + "mean_token_accuracy": 0.36896551251411436, + "step": 19605 + }, + { + "epoch": 0.019751400274565616, + "grad_norm": 22.184511800268798, + "learning_rate": 1.9751022299216406e-05, + "loss": 2.7231, + "mean_token_accuracy": 0.3827586233615875, + "step": 19610 + }, + { + "epoch": 0.01975643632766979, + "grad_norm": 20.477292563075824, + "learning_rate": 1.9756058255947465e-05, + "loss": 2.7844, + "mean_token_accuracy": 0.38620689511299133, + "step": 19615 + }, + { + "epoch": 0.01976147238077396, + "grad_norm": 22.64414698638484, + "learning_rate": 1.9761094212678524e-05, + "loss": 2.5353, + "mean_token_accuracy": 0.420689657330513, + "step": 19620 + }, + { + "epoch": 0.019766508433878133, + "grad_norm": 28.672234040709238, + "learning_rate": 1.9766130169409587e-05, + "loss": 2.7729, + "mean_token_accuracy": 0.38620689511299133, + "step": 19625 + }, + { + "epoch": 0.019771544486982307, + "grad_norm": 23.000095257899765, + "learning_rate": 1.9771166126140643e-05, + "loss": 2.5256, + "mean_token_accuracy": 0.43793103098869324, + "step": 19630 + }, + { + "epoch": 0.019776580540086477, + "grad_norm": 21.79197923301158, + "learning_rate": 1.9776202082871706e-05, + "loss": 2.1846, + "mean_token_accuracy": 0.40344828367233276, + "step": 19635 + }, + { + "epoch": 0.01978161659319065, + "grad_norm": 32.78900397382135, + "learning_rate": 1.9781238039602765e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.4363581359386444, + "step": 19640 + }, + { + "epoch": 0.019786652646294825, + "grad_norm": 24.728391006801576, + "learning_rate": 1.9786273996333824e-05, + "loss": 2.2754, + "mean_token_accuracy": 0.458620685338974, + "step": 19645 + }, + { + "epoch": 0.019791688699399, + "grad_norm": 28.035853431038824, + "learning_rate": 1.9791309953064883e-05, + "loss": 2.5163, + "mean_token_accuracy": 0.4379310429096222, + "step": 19650 + }, + { + "epoch": 0.01979672475250317, + "grad_norm": 18.822381505457027, + "learning_rate": 1.9796345909795943e-05, + "loss": 2.4849, + "mean_token_accuracy": 0.4344827592372894, + "step": 19655 + }, + { + "epoch": 0.019801760805607343, + "grad_norm": 19.354404502416187, + "learning_rate": 1.9801381866527005e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.42758620381355283, + "step": 19660 + }, + { + "epoch": 0.019806796858711517, + "grad_norm": 22.63016659594927, + "learning_rate": 1.9806417823258064e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.45517241954803467, + "step": 19665 + }, + { + "epoch": 0.019811832911815687, + "grad_norm": 24.21606805861793, + "learning_rate": 1.9811453779989124e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.41724138259887694, + "step": 19670 + }, + { + "epoch": 0.01981686896491986, + "grad_norm": 25.636286240418784, + "learning_rate": 1.9816489736720183e-05, + "loss": 2.6066, + "mean_token_accuracy": 0.4310344815254211, + "step": 19675 + }, + { + "epoch": 0.019821905018024034, + "grad_norm": 19.697027037679458, + "learning_rate": 1.9821525693451242e-05, + "loss": 2.2553, + "mean_token_accuracy": 0.3827586114406586, + "step": 19680 + }, + { + "epoch": 0.019826941071128208, + "grad_norm": 20.22678751788821, + "learning_rate": 1.98265616501823e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.3793103456497192, + "step": 19685 + }, + { + "epoch": 0.01983197712423238, + "grad_norm": 17.132820605132977, + "learning_rate": 1.9831597606913364e-05, + "loss": 2.3199, + "mean_token_accuracy": 0.42758620977401735, + "step": 19690 + }, + { + "epoch": 0.019837013177336552, + "grad_norm": 29.307599096242587, + "learning_rate": 1.9836633563644423e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.4068965494632721, + "step": 19695 + }, + { + "epoch": 0.019842049230440726, + "grad_norm": 31.20471407504354, + "learning_rate": 1.9841669520375483e-05, + "loss": 2.4529, + "mean_token_accuracy": 0.42413793206214906, + "step": 19700 + }, + { + "epoch": 0.019847085283544896, + "grad_norm": 18.829203985718003, + "learning_rate": 1.9846705477106542e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.42068966031074523, + "step": 19705 + }, + { + "epoch": 0.01985212133664907, + "grad_norm": 21.99221809137749, + "learning_rate": 1.98517414338376e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4034482777118683, + "step": 19710 + }, + { + "epoch": 0.019857157389753244, + "grad_norm": 31.271867027813663, + "learning_rate": 1.9856777390568664e-05, + "loss": 2.3963, + "mean_token_accuracy": 0.4413793087005615, + "step": 19715 + }, + { + "epoch": 0.019862193442857418, + "grad_norm": 25.27460252796519, + "learning_rate": 1.986181334729972e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.46206897497177124, + "step": 19720 + }, + { + "epoch": 0.019867229495961588, + "grad_norm": 25.23595392014646, + "learning_rate": 1.986684930403078e-05, + "loss": 2.1099, + "mean_token_accuracy": 0.4448275864124298, + "step": 19725 + }, + { + "epoch": 0.01987226554906576, + "grad_norm": 18.890023522028038, + "learning_rate": 1.987188526076184e-05, + "loss": 2.1577, + "mean_token_accuracy": 0.4482758641242981, + "step": 19730 + }, + { + "epoch": 0.019877301602169935, + "grad_norm": 19.92335395801169, + "learning_rate": 1.98769212174929e-05, + "loss": 2.8164, + "mean_token_accuracy": 0.3551724165678024, + "step": 19735 + }, + { + "epoch": 0.019882337655274106, + "grad_norm": 21.354098540231103, + "learning_rate": 1.988195717422396e-05, + "loss": 2.2178, + "mean_token_accuracy": 0.4862068951129913, + "step": 19740 + }, + { + "epoch": 0.01988737370837828, + "grad_norm": 19.111345276088578, + "learning_rate": 1.988699313095502e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.42413792610168455, + "step": 19745 + }, + { + "epoch": 0.019892409761482453, + "grad_norm": 19.940269924189423, + "learning_rate": 1.989202908768608e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.43448275327682495, + "step": 19750 + }, + { + "epoch": 0.019897445814586627, + "grad_norm": 24.00192066203954, + "learning_rate": 1.989706504441714e-05, + "loss": 2.0404, + "mean_token_accuracy": 0.49999999403953554, + "step": 19755 + }, + { + "epoch": 0.019902481867690797, + "grad_norm": 22.85741513974964, + "learning_rate": 1.99021010011482e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.46551724672317507, + "step": 19760 + }, + { + "epoch": 0.01990751792079497, + "grad_norm": 24.031069465769527, + "learning_rate": 1.9907136957879256e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.441379314661026, + "step": 19765 + }, + { + "epoch": 0.019912553973899145, + "grad_norm": 21.910986262870132, + "learning_rate": 1.991217291461032e-05, + "loss": 2.6481, + "mean_token_accuracy": 0.4034482777118683, + "step": 19770 + }, + { + "epoch": 0.019917590027003315, + "grad_norm": 23.957067736517022, + "learning_rate": 1.9917208871341378e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.38275861740112305, + "step": 19775 + }, + { + "epoch": 0.01992262608010749, + "grad_norm": 23.42427466199256, + "learning_rate": 1.992224482807244e-05, + "loss": 2.4608, + "mean_token_accuracy": 0.4068965554237366, + "step": 19780 + }, + { + "epoch": 0.019927662133211663, + "grad_norm": 27.26501196993155, + "learning_rate": 1.9927280784803497e-05, + "loss": 2.6888, + "mean_token_accuracy": 0.358620685338974, + "step": 19785 + }, + { + "epoch": 0.019932698186315836, + "grad_norm": 27.30677704137981, + "learning_rate": 1.9932316741534556e-05, + "loss": 2.445, + "mean_token_accuracy": 0.4050211668014526, + "step": 19790 + }, + { + "epoch": 0.019937734239420007, + "grad_norm": 23.788990132045406, + "learning_rate": 1.993735269826562e-05, + "loss": 2.5053, + "mean_token_accuracy": 0.4310344815254211, + "step": 19795 + }, + { + "epoch": 0.01994277029252418, + "grad_norm": 19.99802614459865, + "learning_rate": 1.9942388654996678e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4, + "step": 19800 + }, + { + "epoch": 0.019947806345628354, + "grad_norm": 22.36445313189385, + "learning_rate": 1.9947424611727737e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.39479734301567077, + "step": 19805 + }, + { + "epoch": 0.019952842398732525, + "grad_norm": 21.32394750571172, + "learning_rate": 1.9952460568458796e-05, + "loss": 2.0264, + "mean_token_accuracy": 0.4344827592372894, + "step": 19810 + }, + { + "epoch": 0.0199578784518367, + "grad_norm": 22.272840630467616, + "learning_rate": 1.9957496525189856e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.4310344815254211, + "step": 19815 + }, + { + "epoch": 0.019962914504940872, + "grad_norm": 26.755959642632238, + "learning_rate": 1.9962532481920918e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.4328493714332581, + "step": 19820 + }, + { + "epoch": 0.019967950558045046, + "grad_norm": 24.81101266374396, + "learning_rate": 1.9967568438651977e-05, + "loss": 2.4354, + "mean_token_accuracy": 0.4620689690113068, + "step": 19825 + }, + { + "epoch": 0.019972986611149216, + "grad_norm": 30.033593269499878, + "learning_rate": 1.9972604395383033e-05, + "loss": 2.2176, + "mean_token_accuracy": 0.46551724672317507, + "step": 19830 + }, + { + "epoch": 0.01997802266425339, + "grad_norm": 19.281394208094138, + "learning_rate": 1.9977640352114096e-05, + "loss": 2.1782, + "mean_token_accuracy": 0.4689655125141144, + "step": 19835 + }, + { + "epoch": 0.019983058717357564, + "grad_norm": 27.686479717412116, + "learning_rate": 1.9982676308845155e-05, + "loss": 2.6764, + "mean_token_accuracy": 0.4034482777118683, + "step": 19840 + }, + { + "epoch": 0.019988094770461734, + "grad_norm": 21.337179671378916, + "learning_rate": 1.9987712265576214e-05, + "loss": 2.329, + "mean_token_accuracy": 0.4413793087005615, + "step": 19845 + }, + { + "epoch": 0.019993130823565908, + "grad_norm": 21.74332707671995, + "learning_rate": 1.9992748222307274e-05, + "loss": 2.3135, + "mean_token_accuracy": 0.4310344815254211, + "step": 19850 + }, + { + "epoch": 0.01999816687667008, + "grad_norm": 22.31467039186713, + "learning_rate": 1.9997784179038333e-05, + "loss": 2.438, + "mean_token_accuracy": 0.3808832406997681, + "step": 19855 + }, + { + "epoch": 0.020003202929774255, + "grad_norm": 25.04168673486434, + "learning_rate": 2.0002820135769396e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.4034482717514038, + "step": 19860 + }, + { + "epoch": 0.020008238982878426, + "grad_norm": 17.856824867718597, + "learning_rate": 2.0007856092500455e-05, + "loss": 2.2978, + "mean_token_accuracy": 0.46551724076271056, + "step": 19865 + }, + { + "epoch": 0.0200132750359826, + "grad_norm": 16.683259631887143, + "learning_rate": 2.0012892049231514e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.41724138259887694, + "step": 19870 + }, + { + "epoch": 0.020018311089086773, + "grad_norm": 24.762002397014577, + "learning_rate": 2.0017928005962573e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.46896552443504336, + "step": 19875 + }, + { + "epoch": 0.020023347142190943, + "grad_norm": 26.204284897366747, + "learning_rate": 2.0022963962693633e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.37586206793785093, + "step": 19880 + }, + { + "epoch": 0.020028383195295117, + "grad_norm": 19.24675451553648, + "learning_rate": 2.0027999919424692e-05, + "loss": 2.6628, + "mean_token_accuracy": 0.4034482777118683, + "step": 19885 + }, + { + "epoch": 0.02003341924839929, + "grad_norm": 20.522969947818638, + "learning_rate": 2.0033035876155755e-05, + "loss": 2.2211, + "mean_token_accuracy": 0.46896551847457885, + "step": 19890 + }, + { + "epoch": 0.020038455301503465, + "grad_norm": 22.243001438453426, + "learning_rate": 2.0038071832886814e-05, + "loss": 2.6968, + "mean_token_accuracy": 0.41034482717514037, + "step": 19895 + }, + { + "epoch": 0.020043491354607635, + "grad_norm": 20.73232941888443, + "learning_rate": 2.0043107789617873e-05, + "loss": 2.1047, + "mean_token_accuracy": 0.46551724672317507, + "step": 19900 + }, + { + "epoch": 0.02004852740771181, + "grad_norm": 22.283220168231363, + "learning_rate": 2.0048143746348932e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.42413793206214906, + "step": 19905 + }, + { + "epoch": 0.020053563460815983, + "grad_norm": 19.39015337331024, + "learning_rate": 2.005317970307999e-05, + "loss": 2.1464, + "mean_token_accuracy": 0.4551724076271057, + "step": 19910 + }, + { + "epoch": 0.020058599513920153, + "grad_norm": 26.57323772574109, + "learning_rate": 2.0058215659811054e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.4151845097541809, + "step": 19915 + }, + { + "epoch": 0.020063635567024327, + "grad_norm": 20.181797350457796, + "learning_rate": 2.006325161654211e-05, + "loss": 2.4108, + "mean_token_accuracy": 0.4068965494632721, + "step": 19920 + }, + { + "epoch": 0.0200686716201285, + "grad_norm": 19.82957670016997, + "learning_rate": 2.006828757327317e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.4669950783252716, + "step": 19925 + }, + { + "epoch": 0.020073707673232674, + "grad_norm": 22.735002622728427, + "learning_rate": 2.0073323530004232e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4103448212146759, + "step": 19930 + }, + { + "epoch": 0.020078743726336844, + "grad_norm": 29.210612347748405, + "learning_rate": 2.007835948673529e-05, + "loss": 2.582, + "mean_token_accuracy": 0.37931033968925476, + "step": 19935 + }, + { + "epoch": 0.020083779779441018, + "grad_norm": 21.203336744598115, + "learning_rate": 2.008339544346635e-05, + "loss": 2.22, + "mean_token_accuracy": 0.4655172348022461, + "step": 19940 + }, + { + "epoch": 0.020088815832545192, + "grad_norm": 28.502244458367624, + "learning_rate": 2.008843140019741e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.3965517282485962, + "step": 19945 + }, + { + "epoch": 0.020093851885649362, + "grad_norm": 17.4466692547439, + "learning_rate": 2.009346735692847e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.4482758641242981, + "step": 19950 + }, + { + "epoch": 0.020098887938753536, + "grad_norm": 22.91752071433038, + "learning_rate": 2.009850331365953e-05, + "loss": 2.5429, + "mean_token_accuracy": 0.4482758641242981, + "step": 19955 + }, + { + "epoch": 0.02010392399185771, + "grad_norm": 27.732540239252913, + "learning_rate": 2.010353927039059e-05, + "loss": 2.5477, + "mean_token_accuracy": 0.4413793087005615, + "step": 19960 + }, + { + "epoch": 0.020108960044961884, + "grad_norm": 20.60362071143817, + "learning_rate": 2.0108575227121647e-05, + "loss": 2.4125, + "mean_token_accuracy": 0.4758620738983154, + "step": 19965 + }, + { + "epoch": 0.020113996098066054, + "grad_norm": 17.461400081581683, + "learning_rate": 2.011361118385271e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.4275861978530884, + "step": 19970 + }, + { + "epoch": 0.020119032151170228, + "grad_norm": 21.196968315768604, + "learning_rate": 2.011864714058377e-05, + "loss": 2.6275, + "mean_token_accuracy": 0.3758620709180832, + "step": 19975 + }, + { + "epoch": 0.0201240682042744, + "grad_norm": 22.67066390019272, + "learning_rate": 2.012368309731483e-05, + "loss": 2.2504, + "mean_token_accuracy": 0.43284936547279357, + "step": 19980 + }, + { + "epoch": 0.02012910425737857, + "grad_norm": 19.720872828690645, + "learning_rate": 2.0128719054045887e-05, + "loss": 2.0867, + "mean_token_accuracy": 0.45517241954803467, + "step": 19985 + }, + { + "epoch": 0.020134140310482745, + "grad_norm": 21.14831266732385, + "learning_rate": 2.0133755010776946e-05, + "loss": 2.529, + "mean_token_accuracy": 0.42758620381355283, + "step": 19990 + }, + { + "epoch": 0.02013917636358692, + "grad_norm": 29.826248086436237, + "learning_rate": 2.013879096750801e-05, + "loss": 2.7157, + "mean_token_accuracy": 0.3896551728248596, + "step": 19995 + }, + { + "epoch": 0.020144212416691093, + "grad_norm": 22.348135691696463, + "learning_rate": 2.0143826924239068e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.458620685338974, + "step": 20000 + }, + { + "epoch": 0.020149248469795263, + "grad_norm": 20.312473209514764, + "learning_rate": 2.0148862880970128e-05, + "loss": 2.2812, + "mean_token_accuracy": 0.417241370677948, + "step": 20005 + }, + { + "epoch": 0.020154284522899437, + "grad_norm": 18.128967472374892, + "learning_rate": 2.0153898837701187e-05, + "loss": 2.2357, + "mean_token_accuracy": 0.4344827592372894, + "step": 20010 + }, + { + "epoch": 0.02015932057600361, + "grad_norm": 25.972831262973227, + "learning_rate": 2.0158934794432246e-05, + "loss": 2.6056, + "mean_token_accuracy": 0.4068965554237366, + "step": 20015 + }, + { + "epoch": 0.02016435662910778, + "grad_norm": 21.698010994170193, + "learning_rate": 2.016397075116331e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.4222625494003296, + "step": 20020 + }, + { + "epoch": 0.020169392682211955, + "grad_norm": 20.49595400193641, + "learning_rate": 2.0169006707894368e-05, + "loss": 2.3718, + "mean_token_accuracy": 0.458620685338974, + "step": 20025 + }, + { + "epoch": 0.02017442873531613, + "grad_norm": 19.545832859740564, + "learning_rate": 2.0174042664625424e-05, + "loss": 2.2899, + "mean_token_accuracy": 0.44827585220336913, + "step": 20030 + }, + { + "epoch": 0.020179464788420302, + "grad_norm": 25.627378483006126, + "learning_rate": 2.0179078621356486e-05, + "loss": 2.6059, + "mean_token_accuracy": 0.3896551728248596, + "step": 20035 + }, + { + "epoch": 0.020184500841524473, + "grad_norm": 14.454305060601428, + "learning_rate": 2.0184114578087546e-05, + "loss": 2.122, + "mean_token_accuracy": 0.45983060598373415, + "step": 20040 + }, + { + "epoch": 0.020189536894628646, + "grad_norm": 23.900280511559952, + "learning_rate": 2.018915053481861e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.3965517282485962, + "step": 20045 + }, + { + "epoch": 0.02019457294773282, + "grad_norm": 17.19209105736644, + "learning_rate": 2.0194186491549668e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.4931034564971924, + "step": 20050 + }, + { + "epoch": 0.02019960900083699, + "grad_norm": 27.12404573786952, + "learning_rate": 2.0199222448280723e-05, + "loss": 2.5582, + "mean_token_accuracy": 0.39310344457626345, + "step": 20055 + }, + { + "epoch": 0.020204645053941164, + "grad_norm": 29.18327620572081, + "learning_rate": 2.0204258405011786e-05, + "loss": 2.7166, + "mean_token_accuracy": 0.39655172228813174, + "step": 20060 + }, + { + "epoch": 0.020209681107045338, + "grad_norm": 20.9754133748773, + "learning_rate": 2.0209294361742845e-05, + "loss": 2.2913, + "mean_token_accuracy": 0.44137929677963256, + "step": 20065 + }, + { + "epoch": 0.020214717160149512, + "grad_norm": 16.682171951543346, + "learning_rate": 2.0214330318473905e-05, + "loss": 2.1102, + "mean_token_accuracy": 0.46896551847457885, + "step": 20070 + }, + { + "epoch": 0.020219753213253682, + "grad_norm": 20.855804478755104, + "learning_rate": 2.0219366275204964e-05, + "loss": 2.1519, + "mean_token_accuracy": 0.458620685338974, + "step": 20075 + }, + { + "epoch": 0.020224789266357856, + "grad_norm": 19.195625308820834, + "learning_rate": 2.0224402231936023e-05, + "loss": 2.4016, + "mean_token_accuracy": 0.4536600112915039, + "step": 20080 + }, + { + "epoch": 0.02022982531946203, + "grad_norm": 25.41693500861622, + "learning_rate": 2.0229438188667086e-05, + "loss": 2.2716, + "mean_token_accuracy": 0.4292196035385132, + "step": 20085 + }, + { + "epoch": 0.0202348613725662, + "grad_norm": 16.122117355877883, + "learning_rate": 2.0234474145398145e-05, + "loss": 2.0698, + "mean_token_accuracy": 0.49999999403953554, + "step": 20090 + }, + { + "epoch": 0.020239897425670374, + "grad_norm": 24.694727606681806, + "learning_rate": 2.0239510102129204e-05, + "loss": 2.7991, + "mean_token_accuracy": 0.33793103098869326, + "step": 20095 + }, + { + "epoch": 0.020244933478774547, + "grad_norm": 21.772912849525827, + "learning_rate": 2.0244546058860263e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.42068966031074523, + "step": 20100 + }, + { + "epoch": 0.02024996953187872, + "grad_norm": 24.77716630187442, + "learning_rate": 2.0249582015591323e-05, + "loss": 2.4497, + "mean_token_accuracy": 0.42068966031074523, + "step": 20105 + }, + { + "epoch": 0.02025500558498289, + "grad_norm": 25.288169202397047, + "learning_rate": 2.0254617972322382e-05, + "loss": 2.6914, + "mean_token_accuracy": 0.4034482777118683, + "step": 20110 + }, + { + "epoch": 0.020260041638087065, + "grad_norm": 26.315086993608244, + "learning_rate": 2.0259653929053445e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.45529341101646426, + "step": 20115 + }, + { + "epoch": 0.02026507769119124, + "grad_norm": 26.5018000994247, + "learning_rate": 2.02646898857845e-05, + "loss": 2.9612, + "mean_token_accuracy": 0.32758620381355286, + "step": 20120 + }, + { + "epoch": 0.02027011374429541, + "grad_norm": 23.329797054303814, + "learning_rate": 2.0269725842515563e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.4379310369491577, + "step": 20125 + }, + { + "epoch": 0.020275149797399583, + "grad_norm": 20.105331378118663, + "learning_rate": 2.0274761799246622e-05, + "loss": 2.3334, + "mean_token_accuracy": 0.3551724135875702, + "step": 20130 + }, + { + "epoch": 0.020280185850503757, + "grad_norm": 25.72764941586213, + "learning_rate": 2.027979775597768e-05, + "loss": 2.39, + "mean_token_accuracy": 0.42758620381355283, + "step": 20135 + }, + { + "epoch": 0.02028522190360793, + "grad_norm": 25.904955880847428, + "learning_rate": 2.028483371270874e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.36551723480224607, + "step": 20140 + }, + { + "epoch": 0.0202902579567121, + "grad_norm": 19.082821335083086, + "learning_rate": 2.02898696694398e-05, + "loss": 2.172, + "mean_token_accuracy": 0.4448275864124298, + "step": 20145 + }, + { + "epoch": 0.020295294009816275, + "grad_norm": 24.65784182761241, + "learning_rate": 2.029490562617086e-05, + "loss": 2.6988, + "mean_token_accuracy": 0.4068965554237366, + "step": 20150 + }, + { + "epoch": 0.02030033006292045, + "grad_norm": 21.390529974095937, + "learning_rate": 2.0299941582901922e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.4551724135875702, + "step": 20155 + }, + { + "epoch": 0.02030536611602462, + "grad_norm": 23.214021808635774, + "learning_rate": 2.030497753963298e-05, + "loss": 2.3324, + "mean_token_accuracy": 0.39310344457626345, + "step": 20160 + }, + { + "epoch": 0.020310402169128793, + "grad_norm": 24.718262858775972, + "learning_rate": 2.031001349636404e-05, + "loss": 2.2154, + "mean_token_accuracy": 0.46745312213897705, + "step": 20165 + }, + { + "epoch": 0.020315438222232966, + "grad_norm": 26.881671772445134, + "learning_rate": 2.03150494530951e-05, + "loss": 2.6324, + "mean_token_accuracy": 0.3999999940395355, + "step": 20170 + }, + { + "epoch": 0.02032047427533714, + "grad_norm": 26.623659235145304, + "learning_rate": 2.032008540982616e-05, + "loss": 2.6511, + "mean_token_accuracy": 0.42068966031074523, + "step": 20175 + }, + { + "epoch": 0.02032551032844131, + "grad_norm": 23.388010906064196, + "learning_rate": 2.0325121366557222e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.4255898356437683, + "step": 20180 + }, + { + "epoch": 0.020330546381545484, + "grad_norm": 17.768880869214286, + "learning_rate": 2.0330157323288278e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.44482758045196535, + "step": 20185 + }, + { + "epoch": 0.020335582434649658, + "grad_norm": 27.100302762358506, + "learning_rate": 2.0335193280019337e-05, + "loss": 2.6876, + "mean_token_accuracy": 0.3931034505367279, + "step": 20190 + }, + { + "epoch": 0.020340618487753828, + "grad_norm": 22.183201716737187, + "learning_rate": 2.03402292367504e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.41034482717514037, + "step": 20195 + }, + { + "epoch": 0.020345654540858002, + "grad_norm": 19.279018153046746, + "learning_rate": 2.034526519348146e-05, + "loss": 2.402, + "mean_token_accuracy": 0.41379310488700866, + "step": 20200 + }, + { + "epoch": 0.020350690593962176, + "grad_norm": 21.74218347014969, + "learning_rate": 2.0350301150212518e-05, + "loss": 2.459, + "mean_token_accuracy": 0.43067150712013247, + "step": 20205 + }, + { + "epoch": 0.02035572664706635, + "grad_norm": 28.277716378251696, + "learning_rate": 2.0355337106943577e-05, + "loss": 2.2686, + "mean_token_accuracy": 0.4620689630508423, + "step": 20210 + }, + { + "epoch": 0.02036076270017052, + "grad_norm": 26.340418280170695, + "learning_rate": 2.0360373063674636e-05, + "loss": 2.6408, + "mean_token_accuracy": 0.38620689809322356, + "step": 20215 + }, + { + "epoch": 0.020365798753274694, + "grad_norm": 20.68237692618053, + "learning_rate": 2.03654090204057e-05, + "loss": 2.1616, + "mean_token_accuracy": 0.4940108895301819, + "step": 20220 + }, + { + "epoch": 0.020370834806378867, + "grad_norm": 18.233475076546913, + "learning_rate": 2.037044497713676e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.3931034505367279, + "step": 20225 + }, + { + "epoch": 0.020375870859483038, + "grad_norm": 22.631162411150207, + "learning_rate": 2.0375480933867818e-05, + "loss": 2.2392, + "mean_token_accuracy": 0.4620689630508423, + "step": 20230 + }, + { + "epoch": 0.02038090691258721, + "grad_norm": 20.58237313282624, + "learning_rate": 2.0380516890598877e-05, + "loss": 2.3934, + "mean_token_accuracy": 0.4190562665462494, + "step": 20235 + }, + { + "epoch": 0.020385942965691385, + "grad_norm": 19.87636273252363, + "learning_rate": 2.0385552847329936e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4172413766384125, + "step": 20240 + }, + { + "epoch": 0.02039097901879556, + "grad_norm": 22.159772173630532, + "learning_rate": 2.0390588804061e-05, + "loss": 2.2794, + "mean_token_accuracy": 0.43793103098869324, + "step": 20245 + }, + { + "epoch": 0.02039601507189973, + "grad_norm": 20.003481403413694, + "learning_rate": 2.0395624760792058e-05, + "loss": 2.6248, + "mean_token_accuracy": 0.38965516686439516, + "step": 20250 + }, + { + "epoch": 0.020401051125003903, + "grad_norm": 27.02614037097411, + "learning_rate": 2.0400660717523114e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.4413793087005615, + "step": 20255 + }, + { + "epoch": 0.020406087178108077, + "grad_norm": 20.61797433414269, + "learning_rate": 2.0405696674254177e-05, + "loss": 2.5768, + "mean_token_accuracy": 0.4310344815254211, + "step": 20260 + }, + { + "epoch": 0.020411123231212247, + "grad_norm": 27.116937568094198, + "learning_rate": 2.0410732630985236e-05, + "loss": 3.1021, + "mean_token_accuracy": 0.3379310369491577, + "step": 20265 + }, + { + "epoch": 0.02041615928431642, + "grad_norm": 28.933521363285905, + "learning_rate": 2.0415768587716295e-05, + "loss": 2.3532, + "mean_token_accuracy": 0.43448275327682495, + "step": 20270 + }, + { + "epoch": 0.020421195337420595, + "grad_norm": 17.38801833294216, + "learning_rate": 2.0420804544447354e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.4310344934463501, + "step": 20275 + }, + { + "epoch": 0.02042623139052477, + "grad_norm": 20.51271345922397, + "learning_rate": 2.0425840501178414e-05, + "loss": 2.2784, + "mean_token_accuracy": 0.4344827592372894, + "step": 20280 + }, + { + "epoch": 0.02043126744362894, + "grad_norm": 22.602800097757285, + "learning_rate": 2.0430876457909476e-05, + "loss": 2.4874, + "mean_token_accuracy": 0.3482758641242981, + "step": 20285 + }, + { + "epoch": 0.020436303496733112, + "grad_norm": 18.903154608885608, + "learning_rate": 2.0435912414640535e-05, + "loss": 2.2618, + "mean_token_accuracy": 0.4517241418361664, + "step": 20290 + }, + { + "epoch": 0.020441339549837286, + "grad_norm": 20.892725119984043, + "learning_rate": 2.0440948371371595e-05, + "loss": 2.2489, + "mean_token_accuracy": 0.4241379380226135, + "step": 20295 + }, + { + "epoch": 0.020446375602941456, + "grad_norm": 22.96702302124522, + "learning_rate": 2.0445984328102654e-05, + "loss": 2.7542, + "mean_token_accuracy": 0.3882032573223114, + "step": 20300 + }, + { + "epoch": 0.02045141165604563, + "grad_norm": 18.4462427780884, + "learning_rate": 2.0451020284833713e-05, + "loss": 2.6832, + "mean_token_accuracy": 0.41379311084747317, + "step": 20305 + }, + { + "epoch": 0.020456447709149804, + "grad_norm": 24.544093617629084, + "learning_rate": 2.0456056241564772e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.4206896543502808, + "step": 20310 + }, + { + "epoch": 0.020461483762253978, + "grad_norm": 22.46487817436657, + "learning_rate": 2.0461092198295835e-05, + "loss": 2.5283, + "mean_token_accuracy": 0.4034482717514038, + "step": 20315 + }, + { + "epoch": 0.020466519815358148, + "grad_norm": 22.26812629777027, + "learning_rate": 2.046612815502689e-05, + "loss": 2.309, + "mean_token_accuracy": 0.4206896543502808, + "step": 20320 + }, + { + "epoch": 0.020471555868462322, + "grad_norm": 22.410231858747288, + "learning_rate": 2.0471164111757954e-05, + "loss": 2.6072, + "mean_token_accuracy": 0.3862068891525269, + "step": 20325 + }, + { + "epoch": 0.020476591921566496, + "grad_norm": 21.763726941931004, + "learning_rate": 2.0476200068489013e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.4206896543502808, + "step": 20330 + }, + { + "epoch": 0.020481627974670666, + "grad_norm": 25.267250731705264, + "learning_rate": 2.0481236025220072e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.4551724135875702, + "step": 20335 + }, + { + "epoch": 0.02048666402777484, + "grad_norm": 21.002614873531986, + "learning_rate": 2.048627198195113e-05, + "loss": 2.6855, + "mean_token_accuracy": 0.38620689511299133, + "step": 20340 + }, + { + "epoch": 0.020491700080879013, + "grad_norm": 21.750525760714734, + "learning_rate": 2.049130793868219e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4103448212146759, + "step": 20345 + }, + { + "epoch": 0.020496736133983187, + "grad_norm": 17.003556925783922, + "learning_rate": 2.049634389541325e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.43103448748588563, + "step": 20350 + }, + { + "epoch": 0.020501772187087357, + "grad_norm": 23.950653001718205, + "learning_rate": 2.0501379852144312e-05, + "loss": 2.3697, + "mean_token_accuracy": 0.41911675333976744, + "step": 20355 + }, + { + "epoch": 0.02050680824019153, + "grad_norm": 20.740732031594927, + "learning_rate": 2.0506415808875372e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.4551724135875702, + "step": 20360 + }, + { + "epoch": 0.020511844293295705, + "grad_norm": 22.145651452968355, + "learning_rate": 2.051145176560643e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.4068965554237366, + "step": 20365 + }, + { + "epoch": 0.020516880346399875, + "grad_norm": 23.640407822414623, + "learning_rate": 2.051648772233749e-05, + "loss": 2.5363, + "mean_token_accuracy": 0.43103448748588563, + "step": 20370 + }, + { + "epoch": 0.02052191639950405, + "grad_norm": 25.09349317688804, + "learning_rate": 2.052152367906855e-05, + "loss": 2.5742, + "mean_token_accuracy": 0.4172413766384125, + "step": 20375 + }, + { + "epoch": 0.020526952452608223, + "grad_norm": 23.22167483946741, + "learning_rate": 2.0526559635799612e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.40344826579093934, + "step": 20380 + }, + { + "epoch": 0.020531988505712397, + "grad_norm": 20.365829939790775, + "learning_rate": 2.0531595592530668e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.41724138259887694, + "step": 20385 + }, + { + "epoch": 0.020537024558816567, + "grad_norm": 21.19800753381962, + "learning_rate": 2.0536631549261727e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.42758620381355283, + "step": 20390 + }, + { + "epoch": 0.02054206061192074, + "grad_norm": 21.900326403813818, + "learning_rate": 2.054166750599279e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.47931034564971925, + "step": 20395 + }, + { + "epoch": 0.020547096665024914, + "grad_norm": 28.93848201795326, + "learning_rate": 2.054670346272385e-05, + "loss": 2.451, + "mean_token_accuracy": 0.4517241358757019, + "step": 20400 + }, + { + "epoch": 0.020552132718129085, + "grad_norm": 23.891048107482828, + "learning_rate": 2.0551739419454912e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.37586207389831544, + "step": 20405 + }, + { + "epoch": 0.02055716877123326, + "grad_norm": 21.840176469710574, + "learning_rate": 2.0556775376185968e-05, + "loss": 2.627, + "mean_token_accuracy": 0.43103447556495667, + "step": 20410 + }, + { + "epoch": 0.020562204824337432, + "grad_norm": 18.922850606996075, + "learning_rate": 2.0561811332917027e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.42758620381355283, + "step": 20415 + }, + { + "epoch": 0.020567240877441606, + "grad_norm": 18.174470785782198, + "learning_rate": 2.056684728964809e-05, + "loss": 2.2255, + "mean_token_accuracy": 0.4898366630077362, + "step": 20420 + }, + { + "epoch": 0.020572276930545776, + "grad_norm": 27.226391043954685, + "learning_rate": 2.057188324637915e-05, + "loss": 2.8804, + "mean_token_accuracy": 0.36551723480224607, + "step": 20425 + }, + { + "epoch": 0.02057731298364995, + "grad_norm": 17.44236010108542, + "learning_rate": 2.0576919203110208e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.40471869707107544, + "step": 20430 + }, + { + "epoch": 0.020582349036754124, + "grad_norm": 22.27273076169086, + "learning_rate": 2.0581955159841267e-05, + "loss": 2.6766, + "mean_token_accuracy": 0.43103447556495667, + "step": 20435 + }, + { + "epoch": 0.020587385089858294, + "grad_norm": 17.693952333965473, + "learning_rate": 2.0586991116572327e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.4396854221820831, + "step": 20440 + }, + { + "epoch": 0.020592421142962468, + "grad_norm": 18.023586223898967, + "learning_rate": 2.059202707330339e-05, + "loss": 2.5281, + "mean_token_accuracy": 0.4137930989265442, + "step": 20445 + }, + { + "epoch": 0.02059745719606664, + "grad_norm": 25.29147616398832, + "learning_rate": 2.059706303003445e-05, + "loss": 2.645, + "mean_token_accuracy": 0.3965517163276672, + "step": 20450 + }, + { + "epoch": 0.020602493249170815, + "grad_norm": 18.916044560677673, + "learning_rate": 2.0602098986765504e-05, + "loss": 2.4573, + "mean_token_accuracy": 0.4275861978530884, + "step": 20455 + }, + { + "epoch": 0.020607529302274986, + "grad_norm": 19.329200011984057, + "learning_rate": 2.0607134943496567e-05, + "loss": 2.3013, + "mean_token_accuracy": 0.44652147889137267, + "step": 20460 + }, + { + "epoch": 0.02061256535537916, + "grad_norm": 32.913470360769075, + "learning_rate": 2.0612170900227626e-05, + "loss": 2.4724, + "mean_token_accuracy": 0.37701149582862853, + "step": 20465 + }, + { + "epoch": 0.020617601408483333, + "grad_norm": 24.584539090771973, + "learning_rate": 2.061720685695869e-05, + "loss": 2.3344, + "mean_token_accuracy": 0.44349666237831115, + "step": 20470 + }, + { + "epoch": 0.020622637461587504, + "grad_norm": 23.468866691562596, + "learning_rate": 2.0622242813689745e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.42413793206214906, + "step": 20475 + }, + { + "epoch": 0.020627673514691677, + "grad_norm": 25.082168456220924, + "learning_rate": 2.0627278770420804e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.4551724076271057, + "step": 20480 + }, + { + "epoch": 0.02063270956779585, + "grad_norm": 20.65713368804598, + "learning_rate": 2.0632314727151867e-05, + "loss": 1.9743, + "mean_token_accuracy": 0.48965516686439514, + "step": 20485 + }, + { + "epoch": 0.020637745620900025, + "grad_norm": 20.22227763404955, + "learning_rate": 2.0637350683882926e-05, + "loss": 2.4053, + "mean_token_accuracy": 0.41379310488700866, + "step": 20490 + }, + { + "epoch": 0.020642781674004195, + "grad_norm": 22.313221064851298, + "learning_rate": 2.0642386640613985e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.42068964838981626, + "step": 20495 + }, + { + "epoch": 0.02064781772710837, + "grad_norm": 23.042933047901666, + "learning_rate": 2.0647422597345044e-05, + "loss": 2.4391, + "mean_token_accuracy": 0.4206896543502808, + "step": 20500 + }, + { + "epoch": 0.020652853780212543, + "grad_norm": 19.087961230640815, + "learning_rate": 2.0652458554076104e-05, + "loss": 2.2903, + "mean_token_accuracy": 0.4379310250282288, + "step": 20505 + }, + { + "epoch": 0.020657889833316713, + "grad_norm": 17.33476864845292, + "learning_rate": 2.0657494510807166e-05, + "loss": 2.2708, + "mean_token_accuracy": 0.46412583589553835, + "step": 20510 + }, + { + "epoch": 0.020662925886420887, + "grad_norm": 17.156648512367376, + "learning_rate": 2.0662530467538226e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.43103448748588563, + "step": 20515 + }, + { + "epoch": 0.02066796193952506, + "grad_norm": 20.317100966878122, + "learning_rate": 2.066756642426928e-05, + "loss": 2.6848, + "mean_token_accuracy": 0.39655172228813174, + "step": 20520 + }, + { + "epoch": 0.020672997992629234, + "grad_norm": 26.520492216235837, + "learning_rate": 2.0672602381000344e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.46394434571266174, + "step": 20525 + }, + { + "epoch": 0.020678034045733405, + "grad_norm": 20.12590707674502, + "learning_rate": 2.0677638337731403e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4344827651977539, + "step": 20530 + }, + { + "epoch": 0.02068307009883758, + "grad_norm": 28.39348166787303, + "learning_rate": 2.0682674294462463e-05, + "loss": 2.6283, + "mean_token_accuracy": 0.4137930989265442, + "step": 20535 + }, + { + "epoch": 0.020688106151941752, + "grad_norm": 21.616073684971788, + "learning_rate": 2.0687710251193522e-05, + "loss": 2.2901, + "mean_token_accuracy": 0.417241370677948, + "step": 20540 + }, + { + "epoch": 0.020693142205045922, + "grad_norm": 24.01636217342845, + "learning_rate": 2.069274620792458e-05, + "loss": 2.7583, + "mean_token_accuracy": 0.4068965494632721, + "step": 20545 + }, + { + "epoch": 0.020698178258150096, + "grad_norm": 24.980828653272724, + "learning_rate": 2.0697782164655644e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.37931033968925476, + "step": 20550 + }, + { + "epoch": 0.02070321431125427, + "grad_norm": 22.826711065736834, + "learning_rate": 2.0702818121386703e-05, + "loss": 2.3888, + "mean_token_accuracy": 0.4413793087005615, + "step": 20555 + }, + { + "epoch": 0.020708250364358444, + "grad_norm": 18.506444929152813, + "learning_rate": 2.0707854078117762e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.3551724135875702, + "step": 20560 + }, + { + "epoch": 0.020713286417462614, + "grad_norm": 21.239680886285672, + "learning_rate": 2.071289003484882e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.4172413766384125, + "step": 20565 + }, + { + "epoch": 0.020718322470566788, + "grad_norm": 23.471894097945487, + "learning_rate": 2.071792599157988e-05, + "loss": 2.2872, + "mean_token_accuracy": 0.44827585816383364, + "step": 20570 + }, + { + "epoch": 0.02072335852367096, + "grad_norm": 22.43242279166281, + "learning_rate": 2.072296194831094e-05, + "loss": 2.8487, + "mean_token_accuracy": 0.3482758581638336, + "step": 20575 + }, + { + "epoch": 0.020728394576775132, + "grad_norm": 19.04518189750828, + "learning_rate": 2.0727997905042003e-05, + "loss": 2.0623, + "mean_token_accuracy": 0.43103448748588563, + "step": 20580 + }, + { + "epoch": 0.020733430629879306, + "grad_norm": 24.175859135965364, + "learning_rate": 2.0733033861773062e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.4401088893413544, + "step": 20585 + }, + { + "epoch": 0.02073846668298348, + "grad_norm": 28.21371826847323, + "learning_rate": 2.073806981850412e-05, + "loss": 2.8496, + "mean_token_accuracy": 0.3601935803890228, + "step": 20590 + }, + { + "epoch": 0.020743502736087653, + "grad_norm": 24.159271640617533, + "learning_rate": 2.074310577523518e-05, + "loss": 2.4508, + "mean_token_accuracy": 0.4206896543502808, + "step": 20595 + }, + { + "epoch": 0.020748538789191823, + "grad_norm": 21.871630064081565, + "learning_rate": 2.074814173196624e-05, + "loss": 2.5846, + "mean_token_accuracy": 0.3947973370552063, + "step": 20600 + }, + { + "epoch": 0.020753574842295997, + "grad_norm": 21.57414972492069, + "learning_rate": 2.0753177688697302e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.43448275327682495, + "step": 20605 + }, + { + "epoch": 0.02075861089540017, + "grad_norm": 20.689410453152817, + "learning_rate": 2.0758213645428358e-05, + "loss": 2.4268, + "mean_token_accuracy": 0.45027223229408264, + "step": 20610 + }, + { + "epoch": 0.02076364694850434, + "grad_norm": 18.94062976006235, + "learning_rate": 2.0763249602159417e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.42413792610168455, + "step": 20615 + }, + { + "epoch": 0.020768683001608515, + "grad_norm": 21.162725631034164, + "learning_rate": 2.076828555889048e-05, + "loss": 2.0892, + "mean_token_accuracy": 0.4862068951129913, + "step": 20620 + }, + { + "epoch": 0.02077371905471269, + "grad_norm": 19.343405837960226, + "learning_rate": 2.077332151562154e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.4448275864124298, + "step": 20625 + }, + { + "epoch": 0.020778755107816863, + "grad_norm": 26.657340398656682, + "learning_rate": 2.07783574723526e-05, + "loss": 2.459, + "mean_token_accuracy": 0.43236539959907533, + "step": 20630 + }, + { + "epoch": 0.020783791160921033, + "grad_norm": 23.297529860383225, + "learning_rate": 2.0783393429083658e-05, + "loss": 2.5583, + "mean_token_accuracy": 0.4103448182344437, + "step": 20635 + }, + { + "epoch": 0.020788827214025207, + "grad_norm": 24.375311232146665, + "learning_rate": 2.0788429385814717e-05, + "loss": 2.7381, + "mean_token_accuracy": 0.40689654648303986, + "step": 20640 + }, + { + "epoch": 0.02079386326712938, + "grad_norm": 25.910400053753975, + "learning_rate": 2.079346534254578e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.4137930989265442, + "step": 20645 + }, + { + "epoch": 0.02079889932023355, + "grad_norm": 18.392707396541727, + "learning_rate": 2.079850129927684e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.47586207985877993, + "step": 20650 + }, + { + "epoch": 0.020803935373337724, + "grad_norm": 25.560505360356665, + "learning_rate": 2.0803537256007895e-05, + "loss": 2.8649, + "mean_token_accuracy": 0.35862069129943847, + "step": 20655 + }, + { + "epoch": 0.020808971426441898, + "grad_norm": 20.969388023639176, + "learning_rate": 2.0808573212738957e-05, + "loss": 2.6647, + "mean_token_accuracy": 0.37586205899715425, + "step": 20660 + }, + { + "epoch": 0.020814007479546072, + "grad_norm": 21.408324197549252, + "learning_rate": 2.0813609169470017e-05, + "loss": 2.2042, + "mean_token_accuracy": 0.4275861978530884, + "step": 20665 + }, + { + "epoch": 0.020819043532650242, + "grad_norm": 22.89885398256849, + "learning_rate": 2.081864512620108e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.4310344815254211, + "step": 20670 + }, + { + "epoch": 0.020824079585754416, + "grad_norm": 25.6720580402118, + "learning_rate": 2.0823681082932135e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.3896551787853241, + "step": 20675 + }, + { + "epoch": 0.02082911563885859, + "grad_norm": 20.391907160039267, + "learning_rate": 2.0828717039663194e-05, + "loss": 2.6567, + "mean_token_accuracy": 0.4103448212146759, + "step": 20680 + }, + { + "epoch": 0.02083415169196276, + "grad_norm": 19.72446517063746, + "learning_rate": 2.0833752996394257e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.3896551728248596, + "step": 20685 + }, + { + "epoch": 0.020839187745066934, + "grad_norm": 33.62630801668942, + "learning_rate": 2.0838788953125316e-05, + "loss": 2.8378, + "mean_token_accuracy": 0.38965516686439516, + "step": 20690 + }, + { + "epoch": 0.020844223798171108, + "grad_norm": 18.19741363651444, + "learning_rate": 2.0843824909856376e-05, + "loss": 2.3302, + "mean_token_accuracy": 0.44827585816383364, + "step": 20695 + }, + { + "epoch": 0.02084925985127528, + "grad_norm": 26.038475731701535, + "learning_rate": 2.0848860866587435e-05, + "loss": 2.4227, + "mean_token_accuracy": 0.458620685338974, + "step": 20700 + }, + { + "epoch": 0.02085429590437945, + "grad_norm": 20.694083873134705, + "learning_rate": 2.0853896823318494e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.46551724076271056, + "step": 20705 + }, + { + "epoch": 0.020859331957483625, + "grad_norm": 20.517144448968768, + "learning_rate": 2.0858932780049557e-05, + "loss": 2.6501, + "mean_token_accuracy": 0.40859044194221494, + "step": 20710 + }, + { + "epoch": 0.0208643680105878, + "grad_norm": 20.15282612144898, + "learning_rate": 2.0863968736780616e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.43260737657547, + "step": 20715 + }, + { + "epoch": 0.02086940406369197, + "grad_norm": 21.926990376297955, + "learning_rate": 2.0869004693511672e-05, + "loss": 2.7712, + "mean_token_accuracy": 0.3551724076271057, + "step": 20720 + }, + { + "epoch": 0.020874440116796143, + "grad_norm": 32.90672899016541, + "learning_rate": 2.0874040650242734e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.4172413766384125, + "step": 20725 + }, + { + "epoch": 0.020879476169900317, + "grad_norm": 24.75315313920941, + "learning_rate": 2.0879076606973794e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.4896551549434662, + "step": 20730 + }, + { + "epoch": 0.02088451222300449, + "grad_norm": 29.915871870629, + "learning_rate": 2.0884112563704853e-05, + "loss": 2.4106, + "mean_token_accuracy": 0.44827585816383364, + "step": 20735 + }, + { + "epoch": 0.02088954827610866, + "grad_norm": 23.027456020227316, + "learning_rate": 2.0889148520435912e-05, + "loss": 2.4046, + "mean_token_accuracy": 0.41379310488700866, + "step": 20740 + }, + { + "epoch": 0.020894584329212835, + "grad_norm": 21.455565044689756, + "learning_rate": 2.089418447716697e-05, + "loss": 2.303, + "mean_token_accuracy": 0.4620689690113068, + "step": 20745 + }, + { + "epoch": 0.02089962038231701, + "grad_norm": 22.19652830621377, + "learning_rate": 2.0899220433898034e-05, + "loss": 2.3625, + "mean_token_accuracy": 0.42758620381355283, + "step": 20750 + }, + { + "epoch": 0.02090465643542118, + "grad_norm": 19.064880914822027, + "learning_rate": 2.0904256390629093e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.4206896543502808, + "step": 20755 + }, + { + "epoch": 0.020909692488525353, + "grad_norm": 21.06840897155441, + "learning_rate": 2.0909292347360153e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.40689654648303986, + "step": 20760 + }, + { + "epoch": 0.020914728541629526, + "grad_norm": 17.433829716573808, + "learning_rate": 2.0914328304091212e-05, + "loss": 2.3182, + "mean_token_accuracy": 0.47586206793785096, + "step": 20765 + }, + { + "epoch": 0.0209197645947337, + "grad_norm": 18.32584598886545, + "learning_rate": 2.091936426082227e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.41724138259887694, + "step": 20770 + }, + { + "epoch": 0.02092480064783787, + "grad_norm": 20.27329927322556, + "learning_rate": 2.092440021755333e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.4275862157344818, + "step": 20775 + }, + { + "epoch": 0.020929836700942044, + "grad_norm": 19.079933779732045, + "learning_rate": 2.0929436174284393e-05, + "loss": 2.47, + "mean_token_accuracy": 0.3896551728248596, + "step": 20780 + }, + { + "epoch": 0.020934872754046218, + "grad_norm": 24.529186463683658, + "learning_rate": 2.0934472131015452e-05, + "loss": 2.7102, + "mean_token_accuracy": 0.37586206793785093, + "step": 20785 + }, + { + "epoch": 0.02093990880715039, + "grad_norm": 18.22434578596645, + "learning_rate": 2.093950808774651e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.46958128809928895, + "step": 20790 + }, + { + "epoch": 0.020944944860254562, + "grad_norm": 23.97462592483251, + "learning_rate": 2.094454404447757e-05, + "loss": 2.5887, + "mean_token_accuracy": 0.35862068831920624, + "step": 20795 + }, + { + "epoch": 0.020949980913358736, + "grad_norm": 18.229944357014688, + "learning_rate": 2.094958000120863e-05, + "loss": 1.9787, + "mean_token_accuracy": 0.5, + "step": 20800 + }, + { + "epoch": 0.02095501696646291, + "grad_norm": 26.823473252745288, + "learning_rate": 2.0954615957939693e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.4570477843284607, + "step": 20805 + }, + { + "epoch": 0.02096005301956708, + "grad_norm": 24.404108843190482, + "learning_rate": 2.095965191467075e-05, + "loss": 2.6987, + "mean_token_accuracy": 0.38965516686439516, + "step": 20810 + }, + { + "epoch": 0.020965089072671254, + "grad_norm": 15.250059427661341, + "learning_rate": 2.0964687871401808e-05, + "loss": 2.3291, + "mean_token_accuracy": 0.4744101703166962, + "step": 20815 + }, + { + "epoch": 0.020970125125775427, + "grad_norm": 20.08263411313411, + "learning_rate": 2.096972382813287e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.4620689690113068, + "step": 20820 + }, + { + "epoch": 0.020975161178879598, + "grad_norm": 35.39941870480334, + "learning_rate": 2.097475978486393e-05, + "loss": 2.6609, + "mean_token_accuracy": 0.4310344815254211, + "step": 20825 + }, + { + "epoch": 0.02098019723198377, + "grad_norm": 22.391267340765804, + "learning_rate": 2.097979574159499e-05, + "loss": 2.4936, + "mean_token_accuracy": 0.4034482777118683, + "step": 20830 + }, + { + "epoch": 0.020985233285087945, + "grad_norm": 20.109065682505307, + "learning_rate": 2.0984831698326048e-05, + "loss": 2.8539, + "mean_token_accuracy": 0.32758620083332063, + "step": 20835 + }, + { + "epoch": 0.02099026933819212, + "grad_norm": 31.136650218679605, + "learning_rate": 2.0989867655057107e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.3896551698446274, + "step": 20840 + }, + { + "epoch": 0.02099530539129629, + "grad_norm": 28.374319694473026, + "learning_rate": 2.099490361178817e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.4379310369491577, + "step": 20845 + }, + { + "epoch": 0.021000341444400463, + "grad_norm": 19.199579244580416, + "learning_rate": 2.099993956851923e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.4241379380226135, + "step": 20850 + }, + { + "epoch": 0.021005377497504637, + "grad_norm": 20.131444451548532, + "learning_rate": 2.100497552525029e-05, + "loss": 2.1426, + "mean_token_accuracy": 0.4724137902259827, + "step": 20855 + }, + { + "epoch": 0.021010413550608807, + "grad_norm": 23.244707857595394, + "learning_rate": 2.1010011481981348e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4034482777118683, + "step": 20860 + }, + { + "epoch": 0.02101544960371298, + "grad_norm": 132.14670814511433, + "learning_rate": 2.1015047438712407e-05, + "loss": 2.4918, + "mean_token_accuracy": 0.41034482717514037, + "step": 20865 + }, + { + "epoch": 0.021020485656817155, + "grad_norm": 27.211363773870946, + "learning_rate": 2.102008339544347e-05, + "loss": 2.6126, + "mean_token_accuracy": 0.38965516686439516, + "step": 20870 + }, + { + "epoch": 0.02102552170992133, + "grad_norm": 17.86001900291278, + "learning_rate": 2.1025119352174526e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.4517241358757019, + "step": 20875 + }, + { + "epoch": 0.0210305577630255, + "grad_norm": 20.966328468401194, + "learning_rate": 2.1030155308905585e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.38965516686439516, + "step": 20880 + }, + { + "epoch": 0.021035593816129673, + "grad_norm": 22.163251619676686, + "learning_rate": 2.1035191265636648e-05, + "loss": 2.5332, + "mean_token_accuracy": 0.36551723480224607, + "step": 20885 + }, + { + "epoch": 0.021040629869233846, + "grad_norm": 15.45488073309816, + "learning_rate": 2.1040227222367707e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.42068964838981626, + "step": 20890 + }, + { + "epoch": 0.021045665922338017, + "grad_norm": 22.2710628132038, + "learning_rate": 2.1045263179098766e-05, + "loss": 2.4395, + "mean_token_accuracy": 0.4448275864124298, + "step": 20895 + }, + { + "epoch": 0.02105070197544219, + "grad_norm": 30.7929429454058, + "learning_rate": 2.1050299135829825e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.3655172407627106, + "step": 20900 + }, + { + "epoch": 0.021055738028546364, + "grad_norm": 16.014108532331385, + "learning_rate": 2.1055335092560885e-05, + "loss": 2.3556, + "mean_token_accuracy": 0.41724138259887694, + "step": 20905 + }, + { + "epoch": 0.021060774081650538, + "grad_norm": 21.14752554231236, + "learning_rate": 2.1060371049291947e-05, + "loss": 2.4167, + "mean_token_accuracy": 0.41034482717514037, + "step": 20910 + }, + { + "epoch": 0.021065810134754708, + "grad_norm": 19.346628894314094, + "learning_rate": 2.1065407006023006e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.43793103098869324, + "step": 20915 + }, + { + "epoch": 0.021070846187858882, + "grad_norm": 28.703939352884596, + "learning_rate": 2.1070442962754062e-05, + "loss": 2.7912, + "mean_token_accuracy": 0.4068965554237366, + "step": 20920 + }, + { + "epoch": 0.021075882240963056, + "grad_norm": 24.01577725670423, + "learning_rate": 2.1075478919485125e-05, + "loss": 2.6339, + "mean_token_accuracy": 0.3655172437429428, + "step": 20925 + }, + { + "epoch": 0.021080918294067226, + "grad_norm": 36.243478886755774, + "learning_rate": 2.1080514876216184e-05, + "loss": 2.2843, + "mean_token_accuracy": 0.4261947929859161, + "step": 20930 + }, + { + "epoch": 0.0210859543471714, + "grad_norm": 20.60866818430531, + "learning_rate": 2.1085550832947247e-05, + "loss": 2.6057, + "mean_token_accuracy": 0.38620689511299133, + "step": 20935 + }, + { + "epoch": 0.021090990400275574, + "grad_norm": 18.582219388872534, + "learning_rate": 2.1090586789678303e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.4517241418361664, + "step": 20940 + }, + { + "epoch": 0.021096026453379747, + "grad_norm": 27.594770077376182, + "learning_rate": 2.1095622746409362e-05, + "loss": 2.8242, + "mean_token_accuracy": 0.3776164650917053, + "step": 20945 + }, + { + "epoch": 0.021101062506483918, + "grad_norm": 24.24602305086245, + "learning_rate": 2.1100658703140425e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.3517241358757019, + "step": 20950 + }, + { + "epoch": 0.02110609855958809, + "grad_norm": 16.263725098729793, + "learning_rate": 2.1105694659871484e-05, + "loss": 2.5608, + "mean_token_accuracy": 0.401875376701355, + "step": 20955 + }, + { + "epoch": 0.021111134612692265, + "grad_norm": 19.569998824677764, + "learning_rate": 2.1110730616602543e-05, + "loss": 2.477, + "mean_token_accuracy": 0.391349071264267, + "step": 20960 + }, + { + "epoch": 0.021116170665796435, + "grad_norm": 21.781077255744773, + "learning_rate": 2.1115766573333602e-05, + "loss": 2.7433, + "mean_token_accuracy": 0.4000000059604645, + "step": 20965 + }, + { + "epoch": 0.02112120671890061, + "grad_norm": 18.40128037980539, + "learning_rate": 2.112080253006466e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.3655172407627106, + "step": 20970 + }, + { + "epoch": 0.021126242772004783, + "grad_norm": 46.492749617498546, + "learning_rate": 2.1125838486795724e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.4551724135875702, + "step": 20975 + }, + { + "epoch": 0.021131278825108957, + "grad_norm": 19.349739978910076, + "learning_rate": 2.1130874443526783e-05, + "loss": 2.8626, + "mean_token_accuracy": 0.3517241358757019, + "step": 20980 + }, + { + "epoch": 0.021136314878213127, + "grad_norm": 20.97780408660785, + "learning_rate": 2.1135910400257843e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.37931033968925476, + "step": 20985 + }, + { + "epoch": 0.0211413509313173, + "grad_norm": 24.445144217630627, + "learning_rate": 2.1140946356988902e-05, + "loss": 2.2161, + "mean_token_accuracy": 0.4620689690113068, + "step": 20990 + }, + { + "epoch": 0.021146386984421475, + "grad_norm": 21.6366577428185, + "learning_rate": 2.114598231371996e-05, + "loss": 2.1493, + "mean_token_accuracy": 0.4620689630508423, + "step": 20995 + }, + { + "epoch": 0.021151423037525645, + "grad_norm": 24.427408832294397, + "learning_rate": 2.115101827045102e-05, + "loss": 2.3802, + "mean_token_accuracy": 0.40344828367233276, + "step": 21000 + }, + { + "epoch": 0.02115645909062982, + "grad_norm": 23.111088936342323, + "learning_rate": 2.1156054227182083e-05, + "loss": 2.3372, + "mean_token_accuracy": 0.4310344815254211, + "step": 21005 + }, + { + "epoch": 0.021161495143733992, + "grad_norm": 17.82266980970074, + "learning_rate": 2.116109018391314e-05, + "loss": 1.8099, + "mean_token_accuracy": 0.5241379261016845, + "step": 21010 + }, + { + "epoch": 0.021166531196838166, + "grad_norm": 23.38331723872994, + "learning_rate": 2.11661261406442e-05, + "loss": 2.8492, + "mean_token_accuracy": 0.3620689630508423, + "step": 21015 + }, + { + "epoch": 0.021171567249942336, + "grad_norm": 25.22452911055958, + "learning_rate": 2.117116209737526e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.417241370677948, + "step": 21020 + }, + { + "epoch": 0.02117660330304651, + "grad_norm": 19.44492501026059, + "learning_rate": 2.117619805410632e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.4172413766384125, + "step": 21025 + }, + { + "epoch": 0.021181639356150684, + "grad_norm": 26.642112492184182, + "learning_rate": 2.118123401083738e-05, + "loss": 2.9754, + "mean_token_accuracy": 0.33103448152542114, + "step": 21030 + }, + { + "epoch": 0.021186675409254854, + "grad_norm": 18.614317069537332, + "learning_rate": 2.118626996756844e-05, + "loss": 2.4687, + "mean_token_accuracy": 0.41379310488700866, + "step": 21035 + }, + { + "epoch": 0.021191711462359028, + "grad_norm": 22.057150559882654, + "learning_rate": 2.1191305924299498e-05, + "loss": 2.4049, + "mean_token_accuracy": 0.4344827592372894, + "step": 21040 + }, + { + "epoch": 0.021196747515463202, + "grad_norm": 18.71614200054645, + "learning_rate": 2.119634188103056e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.4068965494632721, + "step": 21045 + }, + { + "epoch": 0.021201783568567376, + "grad_norm": 18.585694268094016, + "learning_rate": 2.120137783776162e-05, + "loss": 2.7448, + "mean_token_accuracy": 0.36551724672317504, + "step": 21050 + }, + { + "epoch": 0.021206819621671546, + "grad_norm": 19.608911101528797, + "learning_rate": 2.120641379449268e-05, + "loss": 2.5549, + "mean_token_accuracy": 0.3965517282485962, + "step": 21055 + }, + { + "epoch": 0.02121185567477572, + "grad_norm": 21.560036124037065, + "learning_rate": 2.1211449751223738e-05, + "loss": 2.3331, + "mean_token_accuracy": 0.44482757449150084, + "step": 21060 + }, + { + "epoch": 0.021216891727879893, + "grad_norm": 18.11930895897455, + "learning_rate": 2.1216485707954798e-05, + "loss": 2.4571, + "mean_token_accuracy": 0.46551724672317507, + "step": 21065 + }, + { + "epoch": 0.021221927780984064, + "grad_norm": 25.66411609483518, + "learning_rate": 2.122152166468586e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.39310345649719236, + "step": 21070 + }, + { + "epoch": 0.021226963834088237, + "grad_norm": 23.356947858412035, + "learning_rate": 2.1226557621416916e-05, + "loss": 2.1268, + "mean_token_accuracy": 0.4620689630508423, + "step": 21075 + }, + { + "epoch": 0.02123199988719241, + "grad_norm": 22.557399422631665, + "learning_rate": 2.1231593578147975e-05, + "loss": 2.8912, + "mean_token_accuracy": 0.39655172228813174, + "step": 21080 + }, + { + "epoch": 0.02123703594029658, + "grad_norm": 24.702196168548216, + "learning_rate": 2.1236629534879038e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.4275861978530884, + "step": 21085 + }, + { + "epoch": 0.021242071993400755, + "grad_norm": 20.286725374452587, + "learning_rate": 2.1241665491610097e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.3931034505367279, + "step": 21090 + }, + { + "epoch": 0.02124710804650493, + "grad_norm": 25.638408993938015, + "learning_rate": 2.1246701448341156e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.453901994228363, + "step": 21095 + }, + { + "epoch": 0.021252144099609103, + "grad_norm": 18.671845367902705, + "learning_rate": 2.1251737405072216e-05, + "loss": 2.1867, + "mean_token_accuracy": 0.46551724076271056, + "step": 21100 + }, + { + "epoch": 0.021257180152713273, + "grad_norm": 20.53286730376482, + "learning_rate": 2.1256773361803275e-05, + "loss": 2.5607, + "mean_token_accuracy": 0.38965516686439516, + "step": 21105 + }, + { + "epoch": 0.021262216205817447, + "grad_norm": 21.654851421821583, + "learning_rate": 2.1261809318534338e-05, + "loss": 2.4706, + "mean_token_accuracy": 0.4137930989265442, + "step": 21110 + }, + { + "epoch": 0.02126725225892162, + "grad_norm": 18.0515039288747, + "learning_rate": 2.1266845275265397e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.43103448748588563, + "step": 21115 + }, + { + "epoch": 0.02127228831202579, + "grad_norm": 23.624520030738847, + "learning_rate": 2.1271881231996456e-05, + "loss": 2.2583, + "mean_token_accuracy": 0.44482759237289426, + "step": 21120 + }, + { + "epoch": 0.021277324365129965, + "grad_norm": 21.366795476132026, + "learning_rate": 2.1276917188727515e-05, + "loss": 2.4638, + "mean_token_accuracy": 0.4103448331356049, + "step": 21125 + }, + { + "epoch": 0.02128236041823414, + "grad_norm": 18.204018378083113, + "learning_rate": 2.1281953145458575e-05, + "loss": 2.1241, + "mean_token_accuracy": 0.4344827592372894, + "step": 21130 + }, + { + "epoch": 0.021287396471338312, + "grad_norm": 24.57298413065489, + "learning_rate": 2.1286989102189637e-05, + "loss": 2.8041, + "mean_token_accuracy": 0.3517241418361664, + "step": 21135 + }, + { + "epoch": 0.021292432524442483, + "grad_norm": 20.42255866967111, + "learning_rate": 2.1292025058920697e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.41724138259887694, + "step": 21140 + }, + { + "epoch": 0.021297468577546656, + "grad_norm": 17.821941091188915, + "learning_rate": 2.1297061015651752e-05, + "loss": 2.0925, + "mean_token_accuracy": 0.4603750824928284, + "step": 21145 + }, + { + "epoch": 0.02130250463065083, + "grad_norm": 22.035364753885666, + "learning_rate": 2.1302096972382815e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.4103448212146759, + "step": 21150 + }, + { + "epoch": 0.021307540683755, + "grad_norm": 16.70567385951173, + "learning_rate": 2.1307132929113874e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.4137930989265442, + "step": 21155 + }, + { + "epoch": 0.021312576736859174, + "grad_norm": 17.220590490166508, + "learning_rate": 2.1312168885844934e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.3758620619773865, + "step": 21160 + }, + { + "epoch": 0.021317612789963348, + "grad_norm": 28.45063276946072, + "learning_rate": 2.1317204842575993e-05, + "loss": 2.7205, + "mean_token_accuracy": 0.3517241388559341, + "step": 21165 + }, + { + "epoch": 0.02132264884306752, + "grad_norm": 20.96639430569427, + "learning_rate": 2.1322240799307052e-05, + "loss": 2.5758, + "mean_token_accuracy": 0.4000000059604645, + "step": 21170 + }, + { + "epoch": 0.021327684896171692, + "grad_norm": 20.574555001628905, + "learning_rate": 2.1327276756038115e-05, + "loss": 2.6416, + "mean_token_accuracy": 0.3999999940395355, + "step": 21175 + }, + { + "epoch": 0.021332720949275866, + "grad_norm": 19.359705768956218, + "learning_rate": 2.1332312712769174e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.4034482777118683, + "step": 21180 + }, + { + "epoch": 0.02133775700238004, + "grad_norm": 15.733272651482979, + "learning_rate": 2.1337348669500233e-05, + "loss": 1.912, + "mean_token_accuracy": 0.510344821214676, + "step": 21185 + }, + { + "epoch": 0.02134279305548421, + "grad_norm": 17.0283901025158, + "learning_rate": 2.1342384626231292e-05, + "loss": 2.4239, + "mean_token_accuracy": 0.458620685338974, + "step": 21190 + }, + { + "epoch": 0.021347829108588384, + "grad_norm": 23.272429140930885, + "learning_rate": 2.134742058296235e-05, + "loss": 2.6658, + "mean_token_accuracy": 0.38275861740112305, + "step": 21195 + }, + { + "epoch": 0.021352865161692557, + "grad_norm": 24.685033246849063, + "learning_rate": 2.135245653969341e-05, + "loss": 2.5016, + "mean_token_accuracy": 0.40689654350280763, + "step": 21200 + }, + { + "epoch": 0.02135790121479673, + "grad_norm": 22.868386925848363, + "learning_rate": 2.1357492496424474e-05, + "loss": 2.3335, + "mean_token_accuracy": 0.4551724135875702, + "step": 21205 + }, + { + "epoch": 0.0213629372679009, + "grad_norm": 20.629060435773535, + "learning_rate": 2.136252845315553e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.41034482717514037, + "step": 21210 + }, + { + "epoch": 0.021367973321005075, + "grad_norm": 27.954016138822368, + "learning_rate": 2.1367564409886592e-05, + "loss": 2.7829, + "mean_token_accuracy": 0.3517241418361664, + "step": 21215 + }, + { + "epoch": 0.02137300937410925, + "grad_norm": 17.66840555137636, + "learning_rate": 2.137260036661765e-05, + "loss": 2.3751, + "mean_token_accuracy": 0.4586206912994385, + "step": 21220 + }, + { + "epoch": 0.02137804542721342, + "grad_norm": 22.69578949512599, + "learning_rate": 2.137763632334871e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.38620689511299133, + "step": 21225 + }, + { + "epoch": 0.021383081480317593, + "grad_norm": 25.03333138350183, + "learning_rate": 2.138267228007977e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.4034482717514038, + "step": 21230 + }, + { + "epoch": 0.021388117533421767, + "grad_norm": 22.30316455489453, + "learning_rate": 2.138770823681083e-05, + "loss": 2.6622, + "mean_token_accuracy": 0.39655172228813174, + "step": 21235 + }, + { + "epoch": 0.02139315358652594, + "grad_norm": 25.50510708600825, + "learning_rate": 2.139274419354189e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.4413793087005615, + "step": 21240 + }, + { + "epoch": 0.02139818963963011, + "grad_norm": 18.309690727445744, + "learning_rate": 2.139778015027295e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.42068964838981626, + "step": 21245 + }, + { + "epoch": 0.021403225692734285, + "grad_norm": 18.969194831388506, + "learning_rate": 2.140281610700401e-05, + "loss": 2.2073, + "mean_token_accuracy": 0.45517241656780244, + "step": 21250 + }, + { + "epoch": 0.02140826174583846, + "grad_norm": 22.96546996371774, + "learning_rate": 2.140785206373507e-05, + "loss": 2.7717, + "mean_token_accuracy": 0.37241379022598264, + "step": 21255 + }, + { + "epoch": 0.02141329779894263, + "grad_norm": 20.26171254629993, + "learning_rate": 2.141288802046613e-05, + "loss": 2.606, + "mean_token_accuracy": 0.3896551728248596, + "step": 21260 + }, + { + "epoch": 0.021418333852046802, + "grad_norm": 21.646809421900933, + "learning_rate": 2.1417923977197188e-05, + "loss": 2.5949, + "mean_token_accuracy": 0.36896551251411436, + "step": 21265 + }, + { + "epoch": 0.021423369905150976, + "grad_norm": 16.72230650044111, + "learning_rate": 2.142295993392825e-05, + "loss": 2.2471, + "mean_token_accuracy": 0.4551724076271057, + "step": 21270 + }, + { + "epoch": 0.02142840595825515, + "grad_norm": 18.533552086992632, + "learning_rate": 2.1427995890659307e-05, + "loss": 2.2507, + "mean_token_accuracy": 0.4310344815254211, + "step": 21275 + }, + { + "epoch": 0.02143344201135932, + "grad_norm": 25.12206590002422, + "learning_rate": 2.143303184739037e-05, + "loss": 2.1477, + "mean_token_accuracy": 0.42952207922935487, + "step": 21280 + }, + { + "epoch": 0.021438478064463494, + "grad_norm": 18.46060055551337, + "learning_rate": 2.143806780412143e-05, + "loss": 2.2397, + "mean_token_accuracy": 0.4551724135875702, + "step": 21285 + }, + { + "epoch": 0.021443514117567668, + "grad_norm": 21.09666101634836, + "learning_rate": 2.1443103760852488e-05, + "loss": 2.7549, + "mean_token_accuracy": 0.4018148899078369, + "step": 21290 + }, + { + "epoch": 0.021448550170671838, + "grad_norm": 24.00296093144513, + "learning_rate": 2.1448139717583547e-05, + "loss": 2.66, + "mean_token_accuracy": 0.4172413766384125, + "step": 21295 + }, + { + "epoch": 0.021453586223776012, + "grad_norm": 30.69521282994135, + "learning_rate": 2.1453175674314606e-05, + "loss": 2.8866, + "mean_token_accuracy": 0.42413793206214906, + "step": 21300 + }, + { + "epoch": 0.021458622276880186, + "grad_norm": 20.638833099008483, + "learning_rate": 2.1458211631045665e-05, + "loss": 2.4647, + "mean_token_accuracy": 0.3931034505367279, + "step": 21305 + }, + { + "epoch": 0.02146365832998436, + "grad_norm": 19.200889976637768, + "learning_rate": 2.1463247587776728e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4482758641242981, + "step": 21310 + }, + { + "epoch": 0.02146869438308853, + "grad_norm": 23.3168330177276, + "learning_rate": 2.1468283544507787e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.4517241418361664, + "step": 21315 + }, + { + "epoch": 0.021473730436192703, + "grad_norm": 23.29290192156174, + "learning_rate": 2.1473319501238847e-05, + "loss": 2.4604, + "mean_token_accuracy": 0.37241379618644715, + "step": 21320 + }, + { + "epoch": 0.021478766489296877, + "grad_norm": 23.982081592848296, + "learning_rate": 2.1478355457969906e-05, + "loss": 2.6965, + "mean_token_accuracy": 0.3620689630508423, + "step": 21325 + }, + { + "epoch": 0.021483802542401047, + "grad_norm": 19.886959459342087, + "learning_rate": 2.1483391414700965e-05, + "loss": 2.4501, + "mean_token_accuracy": 0.3827586233615875, + "step": 21330 + }, + { + "epoch": 0.02148883859550522, + "grad_norm": 17.80350165783561, + "learning_rate": 2.1488427371432028e-05, + "loss": 2.2599, + "mean_token_accuracy": 0.4517241358757019, + "step": 21335 + }, + { + "epoch": 0.021493874648609395, + "grad_norm": 58.46189296422362, + "learning_rate": 2.1493463328163087e-05, + "loss": 2.768, + "mean_token_accuracy": 0.38275861740112305, + "step": 21340 + }, + { + "epoch": 0.02149891070171357, + "grad_norm": 19.649963383936225, + "learning_rate": 2.1498499284894143e-05, + "loss": 2.6442, + "mean_token_accuracy": 0.3931034505367279, + "step": 21345 + }, + { + "epoch": 0.02150394675481774, + "grad_norm": 18.758889172396653, + "learning_rate": 2.1503535241625205e-05, + "loss": 2.4415, + "mean_token_accuracy": 0.4448275864124298, + "step": 21350 + }, + { + "epoch": 0.021508982807921913, + "grad_norm": 19.3393762573304, + "learning_rate": 2.1508571198356265e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.4379310369491577, + "step": 21355 + }, + { + "epoch": 0.021514018861026087, + "grad_norm": 18.917696870709353, + "learning_rate": 2.1513607155087327e-05, + "loss": 2.187, + "mean_token_accuracy": 0.4586206912994385, + "step": 21360 + }, + { + "epoch": 0.021519054914130257, + "grad_norm": 26.589235099172644, + "learning_rate": 2.1518643111818383e-05, + "loss": 2.5438, + "mean_token_accuracy": 0.4068965554237366, + "step": 21365 + }, + { + "epoch": 0.02152409096723443, + "grad_norm": 19.88762641238384, + "learning_rate": 2.1523679068549442e-05, + "loss": 2.5633, + "mean_token_accuracy": 0.3758620649576187, + "step": 21370 + }, + { + "epoch": 0.021529127020338604, + "grad_norm": 16.272772106163, + "learning_rate": 2.1528715025280505e-05, + "loss": 2.4822, + "mean_token_accuracy": 0.43793103098869324, + "step": 21375 + }, + { + "epoch": 0.021534163073442778, + "grad_norm": 18.894817428127777, + "learning_rate": 2.1533750982011564e-05, + "loss": 2.739, + "mean_token_accuracy": 0.36206896901130675, + "step": 21380 + }, + { + "epoch": 0.02153919912654695, + "grad_norm": 19.088232725365174, + "learning_rate": 2.1538786938742624e-05, + "loss": 2.1964, + "mean_token_accuracy": 0.4586206912994385, + "step": 21385 + }, + { + "epoch": 0.021544235179651122, + "grad_norm": 14.86055196125715, + "learning_rate": 2.1543822895473683e-05, + "loss": 2.1084, + "mean_token_accuracy": 0.5125831842422486, + "step": 21390 + }, + { + "epoch": 0.021549271232755296, + "grad_norm": 21.15171316757635, + "learning_rate": 2.1548858852204742e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4034482777118683, + "step": 21395 + }, + { + "epoch": 0.021554307285859466, + "grad_norm": 21.614144527541498, + "learning_rate": 2.1553894808935805e-05, + "loss": 2.4896, + "mean_token_accuracy": 0.41929824352264405, + "step": 21400 + }, + { + "epoch": 0.02155934333896364, + "grad_norm": 16.39683654684843, + "learning_rate": 2.1558930765666864e-05, + "loss": 2.4782, + "mean_token_accuracy": 0.4137930989265442, + "step": 21405 + }, + { + "epoch": 0.021564379392067814, + "grad_norm": 20.836267171175415, + "learning_rate": 2.156396672239792e-05, + "loss": 2.6525, + "mean_token_accuracy": 0.4103448152542114, + "step": 21410 + }, + { + "epoch": 0.021569415445171988, + "grad_norm": 19.327925233600205, + "learning_rate": 2.1569002679128983e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.4413793087005615, + "step": 21415 + }, + { + "epoch": 0.021574451498276158, + "grad_norm": 31.673396901473147, + "learning_rate": 2.1574038635860042e-05, + "loss": 2.0451, + "mean_token_accuracy": 0.4793103516101837, + "step": 21420 + }, + { + "epoch": 0.02157948755138033, + "grad_norm": 20.757972626620997, + "learning_rate": 2.15790745925911e-05, + "loss": 2.8699, + "mean_token_accuracy": 0.3482758700847626, + "step": 21425 + }, + { + "epoch": 0.021584523604484505, + "grad_norm": 23.04080107849021, + "learning_rate": 2.158411054932216e-05, + "loss": 2.3406, + "mean_token_accuracy": 0.3896551728248596, + "step": 21430 + }, + { + "epoch": 0.021589559657588676, + "grad_norm": 20.102389032289793, + "learning_rate": 2.158914650605322e-05, + "loss": 2.3703, + "mean_token_accuracy": 0.40852994918823243, + "step": 21435 + }, + { + "epoch": 0.02159459571069285, + "grad_norm": 21.870507736067438, + "learning_rate": 2.1594182462784282e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.4068965494632721, + "step": 21440 + }, + { + "epoch": 0.021599631763797023, + "grad_norm": 17.929142313440906, + "learning_rate": 2.159921841951534e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.41379310488700866, + "step": 21445 + }, + { + "epoch": 0.021604667816901197, + "grad_norm": 23.078209549488363, + "learning_rate": 2.16042543762464e-05, + "loss": 2.4586, + "mean_token_accuracy": 0.3965517282485962, + "step": 21450 + }, + { + "epoch": 0.021609703870005367, + "grad_norm": 23.90567190795958, + "learning_rate": 2.160929033297746e-05, + "loss": 2.5161, + "mean_token_accuracy": 0.3909255862236023, + "step": 21455 + }, + { + "epoch": 0.02161473992310954, + "grad_norm": 17.45807935710331, + "learning_rate": 2.161432628970852e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.4034482717514038, + "step": 21460 + }, + { + "epoch": 0.021619775976213715, + "grad_norm": 26.050217461696022, + "learning_rate": 2.161936224643958e-05, + "loss": 2.7065, + "mean_token_accuracy": 0.39310343861579894, + "step": 21465 + }, + { + "epoch": 0.021624812029317885, + "grad_norm": 23.61665140389976, + "learning_rate": 2.162439820317064e-05, + "loss": 2.7876, + "mean_token_accuracy": 0.3448275804519653, + "step": 21470 + }, + { + "epoch": 0.02162984808242206, + "grad_norm": 16.055468444101866, + "learning_rate": 2.1629434159901697e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.37779794335365297, + "step": 21475 + }, + { + "epoch": 0.021634884135526233, + "grad_norm": 25.499886817554344, + "learning_rate": 2.163447011663276e-05, + "loss": 2.268, + "mean_token_accuracy": 0.40344828367233276, + "step": 21480 + }, + { + "epoch": 0.021639920188630406, + "grad_norm": 28.854832200085404, + "learning_rate": 2.163950607336382e-05, + "loss": 2.1744, + "mean_token_accuracy": 0.4689655065536499, + "step": 21485 + }, + { + "epoch": 0.021644956241734577, + "grad_norm": 20.437794488963352, + "learning_rate": 2.1644542030094878e-05, + "loss": 2.6063, + "mean_token_accuracy": 0.3999999940395355, + "step": 21490 + }, + { + "epoch": 0.02164999229483875, + "grad_norm": 24.872013565093347, + "learning_rate": 2.164957798682594e-05, + "loss": 2.3231, + "mean_token_accuracy": 0.4551724076271057, + "step": 21495 + }, + { + "epoch": 0.021655028347942924, + "grad_norm": 21.057369221037185, + "learning_rate": 2.1654613943556997e-05, + "loss": 2.6737, + "mean_token_accuracy": 0.3620689630508423, + "step": 21500 + }, + { + "epoch": 0.021660064401047095, + "grad_norm": 21.0599149429598, + "learning_rate": 2.1659649900288056e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.4103448212146759, + "step": 21505 + }, + { + "epoch": 0.02166510045415127, + "grad_norm": 25.471535098354508, + "learning_rate": 2.166468585701912e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.4034482717514038, + "step": 21510 + }, + { + "epoch": 0.021670136507255442, + "grad_norm": 23.50442998090923, + "learning_rate": 2.1669721813750178e-05, + "loss": 2.9357, + "mean_token_accuracy": 0.38620689511299133, + "step": 21515 + }, + { + "epoch": 0.021675172560359616, + "grad_norm": 21.35387634333185, + "learning_rate": 2.1674757770481237e-05, + "loss": 2.8811, + "mean_token_accuracy": 0.37241379618644715, + "step": 21520 + }, + { + "epoch": 0.021680208613463786, + "grad_norm": 21.789492178161805, + "learning_rate": 2.1679793727212296e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.42068964838981626, + "step": 21525 + }, + { + "epoch": 0.02168524466656796, + "grad_norm": 14.99269352384579, + "learning_rate": 2.1684829683943356e-05, + "loss": 2.2103, + "mean_token_accuracy": 0.45704780220985414, + "step": 21530 + }, + { + "epoch": 0.021690280719672134, + "grad_norm": 24.334991206304526, + "learning_rate": 2.1689865640674418e-05, + "loss": 2.7528, + "mean_token_accuracy": 0.403448274731636, + "step": 21535 + }, + { + "epoch": 0.021695316772776304, + "grad_norm": 25.147343277361273, + "learning_rate": 2.1694901597405477e-05, + "loss": 2.0583, + "mean_token_accuracy": 0.46206897497177124, + "step": 21540 + }, + { + "epoch": 0.021700352825880478, + "grad_norm": 17.124692390940705, + "learning_rate": 2.1699937554136533e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.4034482777118683, + "step": 21545 + }, + { + "epoch": 0.02170538887898465, + "grad_norm": 21.562574627880416, + "learning_rate": 2.1704973510867596e-05, + "loss": 2.2704, + "mean_token_accuracy": 0.45517241954803467, + "step": 21550 + }, + { + "epoch": 0.021710424932088825, + "grad_norm": 18.064784382956333, + "learning_rate": 2.1710009467598655e-05, + "loss": 2.4777, + "mean_token_accuracy": 0.39655172228813174, + "step": 21555 + }, + { + "epoch": 0.021715460985192996, + "grad_norm": 20.57611940441752, + "learning_rate": 2.1715045424329718e-05, + "loss": 2.246, + "mean_token_accuracy": 0.4379310369491577, + "step": 21560 + }, + { + "epoch": 0.02172049703829717, + "grad_norm": 21.736817655462637, + "learning_rate": 2.1720081381060774e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.4255898356437683, + "step": 21565 + }, + { + "epoch": 0.021725533091401343, + "grad_norm": 20.270649883822532, + "learning_rate": 2.1725117337791833e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.43986691236495973, + "step": 21570 + }, + { + "epoch": 0.021730569144505513, + "grad_norm": 22.29449077866916, + "learning_rate": 2.1730153294522896e-05, + "loss": 2.4522, + "mean_token_accuracy": 0.4172413766384125, + "step": 21575 + }, + { + "epoch": 0.021735605197609687, + "grad_norm": 18.59065764390342, + "learning_rate": 2.1735189251253955e-05, + "loss": 2.642, + "mean_token_accuracy": 0.37241379618644715, + "step": 21580 + }, + { + "epoch": 0.02174064125071386, + "grad_norm": 18.105978132724644, + "learning_rate": 2.1740225207985014e-05, + "loss": 2.4865, + "mean_token_accuracy": 0.39655172228813174, + "step": 21585 + }, + { + "epoch": 0.021745677303818035, + "grad_norm": 19.23912587497405, + "learning_rate": 2.1745261164716073e-05, + "loss": 2.542, + "mean_token_accuracy": 0.40798547863960266, + "step": 21590 + }, + { + "epoch": 0.021750713356922205, + "grad_norm": 16.97105849821303, + "learning_rate": 2.1750297121447133e-05, + "loss": 2.3399, + "mean_token_accuracy": 0.4379310369491577, + "step": 21595 + }, + { + "epoch": 0.02175574941002638, + "grad_norm": 20.906553191127436, + "learning_rate": 2.1755333078178195e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.48275861144065857, + "step": 21600 + }, + { + "epoch": 0.021760785463130553, + "grad_norm": 21.106023720707956, + "learning_rate": 2.1760369034909254e-05, + "loss": 2.4754, + "mean_token_accuracy": 0.42758620977401735, + "step": 21605 + }, + { + "epoch": 0.021765821516234723, + "grad_norm": 20.51090469243702, + "learning_rate": 2.176540499164031e-05, + "loss": 2.5098, + "mean_token_accuracy": 0.4137930989265442, + "step": 21610 + }, + { + "epoch": 0.021770857569338897, + "grad_norm": 22.621381846141933, + "learning_rate": 2.1770440948371373e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.43103447556495667, + "step": 21615 + }, + { + "epoch": 0.02177589362244307, + "grad_norm": 21.716603358908372, + "learning_rate": 2.1775476905102432e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.4068965494632721, + "step": 21620 + }, + { + "epoch": 0.021780929675547244, + "grad_norm": 18.38476529279915, + "learning_rate": 2.178051286183349e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.42262552976608275, + "step": 21625 + }, + { + "epoch": 0.021785965728651414, + "grad_norm": 18.11916614840942, + "learning_rate": 2.178554881856455e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.4379310369491577, + "step": 21630 + }, + { + "epoch": 0.021791001781755588, + "grad_norm": 23.036596259685872, + "learning_rate": 2.179058477529561e-05, + "loss": 2.7539, + "mean_token_accuracy": 0.4103448212146759, + "step": 21635 + }, + { + "epoch": 0.021796037834859762, + "grad_norm": 19.71209653554668, + "learning_rate": 2.1795620732026673e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.4206896543502808, + "step": 21640 + }, + { + "epoch": 0.021801073887963932, + "grad_norm": 17.633921261304454, + "learning_rate": 2.1800656688757732e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.40344826877117157, + "step": 21645 + }, + { + "epoch": 0.021806109941068106, + "grad_norm": 17.009069070450185, + "learning_rate": 2.180569264548879e-05, + "loss": 2.2365, + "mean_token_accuracy": 0.48275862336158754, + "step": 21650 + }, + { + "epoch": 0.02181114599417228, + "grad_norm": 35.62188496942071, + "learning_rate": 2.181072860221985e-05, + "loss": 3.0139, + "mean_token_accuracy": 0.3655172437429428, + "step": 21655 + }, + { + "epoch": 0.021816182047276454, + "grad_norm": 18.39800007551049, + "learning_rate": 2.181576455895091e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.41379310488700866, + "step": 21660 + }, + { + "epoch": 0.021821218100380624, + "grad_norm": 21.737673921005147, + "learning_rate": 2.182080051568197e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.4310344815254211, + "step": 21665 + }, + { + "epoch": 0.021826254153484798, + "grad_norm": 18.96735408373914, + "learning_rate": 2.182583647241303e-05, + "loss": 2.3243, + "mean_token_accuracy": 0.4310344815254211, + "step": 21670 + }, + { + "epoch": 0.02183129020658897, + "grad_norm": 15.139247169467529, + "learning_rate": 2.183087242914409e-05, + "loss": 2.389, + "mean_token_accuracy": 0.4034482777118683, + "step": 21675 + }, + { + "epoch": 0.02183632625969314, + "grad_norm": 19.119105362923744, + "learning_rate": 2.183590838587515e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.40689654350280763, + "step": 21680 + }, + { + "epoch": 0.021841362312797315, + "grad_norm": 18.930010892932486, + "learning_rate": 2.184094434260621e-05, + "loss": 2.3072, + "mean_token_accuracy": 0.44355716109275817, + "step": 21685 + }, + { + "epoch": 0.02184639836590149, + "grad_norm": 24.216667079310714, + "learning_rate": 2.184598029933727e-05, + "loss": 2.6638, + "mean_token_accuracy": 0.39782214164733887, + "step": 21690 + }, + { + "epoch": 0.021851434419005663, + "grad_norm": 21.807882113820927, + "learning_rate": 2.185101625606833e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.4793103516101837, + "step": 21695 + }, + { + "epoch": 0.021856470472109833, + "grad_norm": 16.88438049742662, + "learning_rate": 2.1856052212799387e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.4241379201412201, + "step": 21700 + }, + { + "epoch": 0.021861506525214007, + "grad_norm": 45.5940845574123, + "learning_rate": 2.186108816953045e-05, + "loss": 2.2933, + "mean_token_accuracy": 0.41379310488700866, + "step": 21705 + }, + { + "epoch": 0.02186654257831818, + "grad_norm": 19.19299342144245, + "learning_rate": 2.186612412626151e-05, + "loss": 2.8039, + "mean_token_accuracy": 0.4068965554237366, + "step": 21710 + }, + { + "epoch": 0.02187157863142235, + "grad_norm": 20.60919521921651, + "learning_rate": 2.1871160082992568e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.41034482717514037, + "step": 21715 + }, + { + "epoch": 0.021876614684526525, + "grad_norm": 23.285871346211103, + "learning_rate": 2.1876196039723627e-05, + "loss": 2.6472, + "mean_token_accuracy": 0.403448274731636, + "step": 21720 + }, + { + "epoch": 0.0218816507376307, + "grad_norm": 21.954056115545995, + "learning_rate": 2.1881231996454687e-05, + "loss": 2.6767, + "mean_token_accuracy": 0.3551724076271057, + "step": 21725 + }, + { + "epoch": 0.021886686790734872, + "grad_norm": 19.39028219645686, + "learning_rate": 2.1886267953185746e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.42758620977401735, + "step": 21730 + }, + { + "epoch": 0.021891722843839043, + "grad_norm": 16.332245733504926, + "learning_rate": 2.189130390991681e-05, + "loss": 2.8005, + "mean_token_accuracy": 0.358620685338974, + "step": 21735 + }, + { + "epoch": 0.021896758896943216, + "grad_norm": 19.014127284738546, + "learning_rate": 2.1896339866647868e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.4172413766384125, + "step": 21740 + }, + { + "epoch": 0.02190179495004739, + "grad_norm": 15.392209986702513, + "learning_rate": 2.1901375823378927e-05, + "loss": 2.4275, + "mean_token_accuracy": 0.441379314661026, + "step": 21745 + }, + { + "epoch": 0.02190683100315156, + "grad_norm": 26.426989347706606, + "learning_rate": 2.1906411780109986e-05, + "loss": 2.2238, + "mean_token_accuracy": 0.47586206793785096, + "step": 21750 + }, + { + "epoch": 0.021911867056255734, + "grad_norm": 21.487707141990146, + "learning_rate": 2.1911447736841046e-05, + "loss": 2.4446, + "mean_token_accuracy": 0.4517241358757019, + "step": 21755 + }, + { + "epoch": 0.021916903109359908, + "grad_norm": 23.425239068699245, + "learning_rate": 2.1916483693572108e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.4413793087005615, + "step": 21760 + }, + { + "epoch": 0.021921939162464082, + "grad_norm": 21.06322853220178, + "learning_rate": 2.1921519650303164e-05, + "loss": 2.4168, + "mean_token_accuracy": 0.4379310429096222, + "step": 21765 + }, + { + "epoch": 0.021926975215568252, + "grad_norm": 20.29777047241038, + "learning_rate": 2.1926555607034223e-05, + "loss": 2.1808, + "mean_token_accuracy": 0.4413793087005615, + "step": 21770 + }, + { + "epoch": 0.021932011268672426, + "grad_norm": 21.90389458575001, + "learning_rate": 2.1931591563765286e-05, + "loss": 2.6499, + "mean_token_accuracy": 0.3827586114406586, + "step": 21775 + }, + { + "epoch": 0.0219370473217766, + "grad_norm": 17.848850455719244, + "learning_rate": 2.1936627520496345e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.441379314661026, + "step": 21780 + }, + { + "epoch": 0.02194208337488077, + "grad_norm": 21.07058548206167, + "learning_rate": 2.1941663477227405e-05, + "loss": 2.3136, + "mean_token_accuracy": 0.4620689690113068, + "step": 21785 + }, + { + "epoch": 0.021947119427984944, + "grad_norm": 14.98205951650851, + "learning_rate": 2.1946699433958464e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.4655172348022461, + "step": 21790 + }, + { + "epoch": 0.021952155481089117, + "grad_norm": 19.368701853449807, + "learning_rate": 2.1951735390689523e-05, + "loss": 2.3931, + "mean_token_accuracy": 0.4124016880989075, + "step": 21795 + }, + { + "epoch": 0.02195719153419329, + "grad_norm": 20.403598050296736, + "learning_rate": 2.1956771347420586e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.38965516686439516, + "step": 21800 + }, + { + "epoch": 0.02196222758729746, + "grad_norm": 15.39313184023891, + "learning_rate": 2.1961807304151645e-05, + "loss": 2.0945, + "mean_token_accuracy": 0.458620685338974, + "step": 21805 + }, + { + "epoch": 0.021967263640401635, + "grad_norm": 17.43069099682159, + "learning_rate": 2.19668432608827e-05, + "loss": 2.5244, + "mean_token_accuracy": 0.4689655125141144, + "step": 21810 + }, + { + "epoch": 0.02197229969350581, + "grad_norm": 19.96825753111768, + "learning_rate": 2.1971879217613763e-05, + "loss": 2.6333, + "mean_token_accuracy": 0.41034482717514037, + "step": 21815 + }, + { + "epoch": 0.02197733574660998, + "grad_norm": 21.828421216227504, + "learning_rate": 2.1976915174344823e-05, + "loss": 2.456, + "mean_token_accuracy": 0.4172413766384125, + "step": 21820 + }, + { + "epoch": 0.021982371799714153, + "grad_norm": 22.166922489684964, + "learning_rate": 2.1981951131075885e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.41724138259887694, + "step": 21825 + }, + { + "epoch": 0.021987407852818327, + "grad_norm": 19.418319006373743, + "learning_rate": 2.198698708780694e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.3896551728248596, + "step": 21830 + }, + { + "epoch": 0.0219924439059225, + "grad_norm": 20.83800054741006, + "learning_rate": 2.1992023044538e-05, + "loss": 2.4592, + "mean_token_accuracy": 0.4068965554237366, + "step": 21835 + }, + { + "epoch": 0.02199747995902667, + "grad_norm": 17.23561815004508, + "learning_rate": 2.1997059001269063e-05, + "loss": 2.1408, + "mean_token_accuracy": 0.495160311460495, + "step": 21840 + }, + { + "epoch": 0.022002516012130845, + "grad_norm": 32.72038738751284, + "learning_rate": 2.2002094958000122e-05, + "loss": 2.3529, + "mean_token_accuracy": 0.4620689630508423, + "step": 21845 + }, + { + "epoch": 0.02200755206523502, + "grad_norm": 22.98757668643551, + "learning_rate": 2.200713091473118e-05, + "loss": 2.5392, + "mean_token_accuracy": 0.45862067937850953, + "step": 21850 + }, + { + "epoch": 0.02201258811833919, + "grad_norm": 18.93283329650684, + "learning_rate": 2.201216687146224e-05, + "loss": 2.1277, + "mean_token_accuracy": 0.512522679567337, + "step": 21855 + }, + { + "epoch": 0.022017624171443363, + "grad_norm": 20.262030735855543, + "learning_rate": 2.20172028281933e-05, + "loss": 3.0397, + "mean_token_accuracy": 0.3241379290819168, + "step": 21860 + }, + { + "epoch": 0.022022660224547536, + "grad_norm": 22.240427103536543, + "learning_rate": 2.2022238784924363e-05, + "loss": 2.7564, + "mean_token_accuracy": 0.37931033968925476, + "step": 21865 + }, + { + "epoch": 0.02202769627765171, + "grad_norm": 31.935353878221505, + "learning_rate": 2.2027274741655422e-05, + "loss": 2.6589, + "mean_token_accuracy": 0.37931033968925476, + "step": 21870 + }, + { + "epoch": 0.02203273233075588, + "grad_norm": 18.99255074863125, + "learning_rate": 2.203231069838648e-05, + "loss": 2.8635, + "mean_token_accuracy": 0.3793103516101837, + "step": 21875 + }, + { + "epoch": 0.022037768383860054, + "grad_norm": 18.670657842813824, + "learning_rate": 2.203734665511754e-05, + "loss": 2.1107, + "mean_token_accuracy": 0.47931034564971925, + "step": 21880 + }, + { + "epoch": 0.022042804436964228, + "grad_norm": 20.68548688465189, + "learning_rate": 2.20423826118486e-05, + "loss": 2.2387, + "mean_token_accuracy": 0.46551724076271056, + "step": 21885 + }, + { + "epoch": 0.022047840490068398, + "grad_norm": 21.19548673988787, + "learning_rate": 2.204741856857966e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.3827586233615875, + "step": 21890 + }, + { + "epoch": 0.022052876543172572, + "grad_norm": 23.818325367342048, + "learning_rate": 2.205245452531072e-05, + "loss": 2.6599, + "mean_token_accuracy": 0.41857229471206664, + "step": 21895 + }, + { + "epoch": 0.022057912596276746, + "grad_norm": 21.64332862225286, + "learning_rate": 2.2057490482041777e-05, + "loss": 2.3055, + "mean_token_accuracy": 0.44700543880462645, + "step": 21900 + }, + { + "epoch": 0.02206294864938092, + "grad_norm": 20.0682466915093, + "learning_rate": 2.206252643877284e-05, + "loss": 2.7357, + "mean_token_accuracy": 0.35862069129943847, + "step": 21905 + }, + { + "epoch": 0.02206798470248509, + "grad_norm": 17.813662132891224, + "learning_rate": 2.20675623955039e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.44482758045196535, + "step": 21910 + }, + { + "epoch": 0.022073020755589264, + "grad_norm": 24.433699893250825, + "learning_rate": 2.207259835223496e-05, + "loss": 2.2106, + "mean_token_accuracy": 0.47241378426551817, + "step": 21915 + }, + { + "epoch": 0.022078056808693437, + "grad_norm": 18.850225640797632, + "learning_rate": 2.2077634308966018e-05, + "loss": 2.5472, + "mean_token_accuracy": 0.4413793087005615, + "step": 21920 + }, + { + "epoch": 0.022083092861797608, + "grad_norm": 21.393883458892418, + "learning_rate": 2.2082670265697077e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.4344827592372894, + "step": 21925 + }, + { + "epoch": 0.02208812891490178, + "grad_norm": 22.28777989040416, + "learning_rate": 2.2087706222428136e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.41724138259887694, + "step": 21930 + }, + { + "epoch": 0.022093164968005955, + "grad_norm": 19.131649625652614, + "learning_rate": 2.20927421791592e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.3931034475564957, + "step": 21935 + }, + { + "epoch": 0.02209820102111013, + "grad_norm": 19.46406322167404, + "learning_rate": 2.2097778135890258e-05, + "loss": 2.688, + "mean_token_accuracy": 0.37241379022598264, + "step": 21940 + }, + { + "epoch": 0.0221032370742143, + "grad_norm": 18.292065564127814, + "learning_rate": 2.2102814092621318e-05, + "loss": 2.5956, + "mean_token_accuracy": 0.3965517282485962, + "step": 21945 + }, + { + "epoch": 0.022108273127318473, + "grad_norm": 17.825280635786516, + "learning_rate": 2.2107850049352377e-05, + "loss": 2.7299, + "mean_token_accuracy": 0.38965516686439516, + "step": 21950 + }, + { + "epoch": 0.022113309180422647, + "grad_norm": 19.878855842706756, + "learning_rate": 2.2112886006083436e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.3965517163276672, + "step": 21955 + }, + { + "epoch": 0.022118345233526817, + "grad_norm": 22.842665736173938, + "learning_rate": 2.21179219628145e-05, + "loss": 2.9404, + "mean_token_accuracy": 0.37931033968925476, + "step": 21960 + }, + { + "epoch": 0.02212338128663099, + "grad_norm": 19.60550180726225, + "learning_rate": 2.2122957919545555e-05, + "loss": 2.2325, + "mean_token_accuracy": 0.46896552443504336, + "step": 21965 + }, + { + "epoch": 0.022128417339735165, + "grad_norm": 16.54542227085258, + "learning_rate": 2.2127993876276614e-05, + "loss": 2.3343, + "mean_token_accuracy": 0.4448275864124298, + "step": 21970 + }, + { + "epoch": 0.02213345339283934, + "grad_norm": 20.55499149420374, + "learning_rate": 2.2133029833007676e-05, + "loss": 2.4232, + "mean_token_accuracy": 0.44482759237289426, + "step": 21975 + }, + { + "epoch": 0.02213848944594351, + "grad_norm": 17.144770357985625, + "learning_rate": 2.2138065789738736e-05, + "loss": 2.1678, + "mean_token_accuracy": 0.47586206793785096, + "step": 21980 + }, + { + "epoch": 0.022143525499047682, + "grad_norm": 24.792037772286424, + "learning_rate": 2.2143101746469795e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.3517241358757019, + "step": 21985 + }, + { + "epoch": 0.022148561552151856, + "grad_norm": 18.64800959994105, + "learning_rate": 2.2148137703200854e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.3517241358757019, + "step": 21990 + }, + { + "epoch": 0.022153597605256026, + "grad_norm": 30.721025502671388, + "learning_rate": 2.2153173659931913e-05, + "loss": 2.1328, + "mean_token_accuracy": 0.4620689630508423, + "step": 21995 + }, + { + "epoch": 0.0221586336583602, + "grad_norm": 18.529978820796558, + "learning_rate": 2.2158209616662976e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.40889291763305663, + "step": 22000 + }, + { + "epoch": 0.022163669711464374, + "grad_norm": 26.469480126011934, + "learning_rate": 2.2163245573394035e-05, + "loss": 2.2622, + "mean_token_accuracy": 0.41379311084747317, + "step": 22005 + }, + { + "epoch": 0.022168705764568548, + "grad_norm": 18.16070527807918, + "learning_rate": 2.216828153012509e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.39999999701976774, + "step": 22010 + }, + { + "epoch": 0.022173741817672718, + "grad_norm": 17.24744470420216, + "learning_rate": 2.2173317486856154e-05, + "loss": 2.3641, + "mean_token_accuracy": 0.4344827651977539, + "step": 22015 + }, + { + "epoch": 0.022178777870776892, + "grad_norm": 19.488925858277245, + "learning_rate": 2.2178353443587213e-05, + "loss": 2.1236, + "mean_token_accuracy": 0.46958128213882444, + "step": 22020 + }, + { + "epoch": 0.022183813923881066, + "grad_norm": 27.528580089454017, + "learning_rate": 2.2183389400318276e-05, + "loss": 2.913, + "mean_token_accuracy": 0.34137930870056155, + "step": 22025 + }, + { + "epoch": 0.022188849976985236, + "grad_norm": 33.62422459603598, + "learning_rate": 2.2188425357049335e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.3827586144208908, + "step": 22030 + }, + { + "epoch": 0.02219388603008941, + "grad_norm": 19.474926122141394, + "learning_rate": 2.219346131378039e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.42413793206214906, + "step": 22035 + }, + { + "epoch": 0.022198922083193583, + "grad_norm": 16.585208457436593, + "learning_rate": 2.2198497270511454e-05, + "loss": 2.6301, + "mean_token_accuracy": 0.37586206793785093, + "step": 22040 + }, + { + "epoch": 0.022203958136297757, + "grad_norm": 18.568035252189002, + "learning_rate": 2.2203533227242513e-05, + "loss": 2.6529, + "mean_token_accuracy": 0.3896551728248596, + "step": 22045 + }, + { + "epoch": 0.022208994189401927, + "grad_norm": 19.064448471957252, + "learning_rate": 2.2208569183973572e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.4724137902259827, + "step": 22050 + }, + { + "epoch": 0.0222140302425061, + "grad_norm": 20.336275906284552, + "learning_rate": 2.221360514070463e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.39999998807907106, + "step": 22055 + }, + { + "epoch": 0.022219066295610275, + "grad_norm": 17.74089781034804, + "learning_rate": 2.221864109743569e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.41034482717514037, + "step": 22060 + }, + { + "epoch": 0.022224102348714445, + "grad_norm": 28.9707201207465, + "learning_rate": 2.2223677054166753e-05, + "loss": 2.5623, + "mean_token_accuracy": 0.4172413766384125, + "step": 22065 + }, + { + "epoch": 0.02222913840181862, + "grad_norm": 17.797284401979596, + "learning_rate": 2.2228713010897812e-05, + "loss": 2.0655, + "mean_token_accuracy": 0.5206896543502808, + "step": 22070 + }, + { + "epoch": 0.022234174454922793, + "grad_norm": 17.04126926265857, + "learning_rate": 2.223374896762887e-05, + "loss": 2.5295, + "mean_token_accuracy": 0.39310344457626345, + "step": 22075 + }, + { + "epoch": 0.022239210508026967, + "grad_norm": 18.219008304708492, + "learning_rate": 2.223878492435993e-05, + "loss": 2.0721, + "mean_token_accuracy": 0.4931034445762634, + "step": 22080 + }, + { + "epoch": 0.022244246561131137, + "grad_norm": 19.579876178045556, + "learning_rate": 2.224382088109099e-05, + "loss": 2.6664, + "mean_token_accuracy": 0.39310344457626345, + "step": 22085 + }, + { + "epoch": 0.02224928261423531, + "grad_norm": 20.727652525353168, + "learning_rate": 2.2248856837822053e-05, + "loss": 2.6057, + "mean_token_accuracy": 0.36896551251411436, + "step": 22090 + }, + { + "epoch": 0.022254318667339484, + "grad_norm": 25.71178053912386, + "learning_rate": 2.2253892794553112e-05, + "loss": 2.52, + "mean_token_accuracy": 0.41724138259887694, + "step": 22095 + }, + { + "epoch": 0.022259354720443655, + "grad_norm": 22.19101844185746, + "learning_rate": 2.2258928751284168e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.3999999940395355, + "step": 22100 + }, + { + "epoch": 0.02226439077354783, + "grad_norm": 17.444978177841975, + "learning_rate": 2.226396470801523e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.4896551728248596, + "step": 22105 + }, + { + "epoch": 0.022269426826652002, + "grad_norm": 17.525196428243337, + "learning_rate": 2.226900066474629e-05, + "loss": 2.5023, + "mean_token_accuracy": 0.43103447556495667, + "step": 22110 + }, + { + "epoch": 0.022274462879756176, + "grad_norm": 17.312631765622033, + "learning_rate": 2.227403662147735e-05, + "loss": 2.518, + "mean_token_accuracy": 0.4103448212146759, + "step": 22115 + }, + { + "epoch": 0.022279498932860346, + "grad_norm": 31.832139802670028, + "learning_rate": 2.227907257820841e-05, + "loss": 2.6816, + "mean_token_accuracy": 0.3793103516101837, + "step": 22120 + }, + { + "epoch": 0.02228453498596452, + "grad_norm": 19.066747654896485, + "learning_rate": 2.2284108534939468e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.42475369572639465, + "step": 22125 + }, + { + "epoch": 0.022289571039068694, + "grad_norm": 17.678994499298476, + "learning_rate": 2.228914449167053e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.44482758045196535, + "step": 22130 + }, + { + "epoch": 0.022294607092172864, + "grad_norm": 22.517709464302218, + "learning_rate": 2.229418044840159e-05, + "loss": 2.7515, + "mean_token_accuracy": 0.41724138259887694, + "step": 22135 + }, + { + "epoch": 0.022299643145277038, + "grad_norm": 18.862621011620117, + "learning_rate": 2.229921640513265e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.4, + "step": 22140 + }, + { + "epoch": 0.02230467919838121, + "grad_norm": 28.54303826814318, + "learning_rate": 2.2304252361863708e-05, + "loss": 2.3845, + "mean_token_accuracy": 0.43793103098869324, + "step": 22145 + }, + { + "epoch": 0.022309715251485385, + "grad_norm": 19.720926920423196, + "learning_rate": 2.2309288318594767e-05, + "loss": 2.392, + "mean_token_accuracy": 0.4206896424293518, + "step": 22150 + }, + { + "epoch": 0.022314751304589556, + "grad_norm": 21.147741435462077, + "learning_rate": 2.2314324275325827e-05, + "loss": 2.383, + "mean_token_accuracy": 0.41379310488700866, + "step": 22155 + }, + { + "epoch": 0.02231978735769373, + "grad_norm": 21.84547054587362, + "learning_rate": 2.231936023205689e-05, + "loss": 2.6847, + "mean_token_accuracy": 0.4, + "step": 22160 + }, + { + "epoch": 0.022324823410797903, + "grad_norm": 28.431524954376894, + "learning_rate": 2.2324396188787945e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.42413793206214906, + "step": 22165 + }, + { + "epoch": 0.022329859463902074, + "grad_norm": 19.16903860920344, + "learning_rate": 2.2329432145519008e-05, + "loss": 2.223, + "mean_token_accuracy": 0.458620685338974, + "step": 22170 + }, + { + "epoch": 0.022334895517006247, + "grad_norm": 26.85744625812132, + "learning_rate": 2.2334468102250067e-05, + "loss": 2.5157, + "mean_token_accuracy": 0.3965517282485962, + "step": 22175 + }, + { + "epoch": 0.02233993157011042, + "grad_norm": 22.947922162874136, + "learning_rate": 2.2339504058981126e-05, + "loss": 2.7134, + "mean_token_accuracy": 0.358620685338974, + "step": 22180 + }, + { + "epoch": 0.022344967623214595, + "grad_norm": 21.377917623613296, + "learning_rate": 2.2344540015712185e-05, + "loss": 2.44, + "mean_token_accuracy": 0.44827585816383364, + "step": 22185 + }, + { + "epoch": 0.022350003676318765, + "grad_norm": 17.445912939362415, + "learning_rate": 2.2349575972443245e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.42413792610168455, + "step": 22190 + }, + { + "epoch": 0.02235503972942294, + "grad_norm": 22.54786475522324, + "learning_rate": 2.2354611929174304e-05, + "loss": 2.6845, + "mean_token_accuracy": 0.36896551549434664, + "step": 22195 + }, + { + "epoch": 0.022360075782527113, + "grad_norm": 20.031459047298654, + "learning_rate": 2.2359647885905367e-05, + "loss": 2.3074, + "mean_token_accuracy": 0.4568663060665131, + "step": 22200 + }, + { + "epoch": 0.022365111835631283, + "grad_norm": 23.318258570706305, + "learning_rate": 2.2364683842636426e-05, + "loss": 2.5512, + "mean_token_accuracy": 0.3931034505367279, + "step": 22205 + }, + { + "epoch": 0.022370147888735457, + "grad_norm": 22.834068307645833, + "learning_rate": 2.2369719799367485e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.41379310488700866, + "step": 22210 + }, + { + "epoch": 0.02237518394183963, + "grad_norm": 24.388384134336423, + "learning_rate": 2.2374755756098544e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.46551724672317507, + "step": 22215 + }, + { + "epoch": 0.022380219994943804, + "grad_norm": 23.372507417723465, + "learning_rate": 2.2379791712829604e-05, + "loss": 2.5898, + "mean_token_accuracy": 0.4034482717514038, + "step": 22220 + }, + { + "epoch": 0.022385256048047975, + "grad_norm": 18.871339052692168, + "learning_rate": 2.2384827669560666e-05, + "loss": 2.276, + "mean_token_accuracy": 0.43793103098869324, + "step": 22225 + }, + { + "epoch": 0.02239029210115215, + "grad_norm": 26.062997719553426, + "learning_rate": 2.2389863626291725e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.4358741700649261, + "step": 22230 + }, + { + "epoch": 0.022395328154256322, + "grad_norm": 23.174492575524077, + "learning_rate": 2.239489958302278e-05, + "loss": 2.6006, + "mean_token_accuracy": 0.3931034505367279, + "step": 22235 + }, + { + "epoch": 0.022400364207360492, + "grad_norm": 18.1040954333043, + "learning_rate": 2.2399935539753844e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.41379311084747317, + "step": 22240 + }, + { + "epoch": 0.022405400260464666, + "grad_norm": 24.493134539717133, + "learning_rate": 2.2404971496484903e-05, + "loss": 2.849, + "mean_token_accuracy": 0.36896551251411436, + "step": 22245 + }, + { + "epoch": 0.02241043631356884, + "grad_norm": 19.18978538303809, + "learning_rate": 2.2410007453215966e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.3896551728248596, + "step": 22250 + }, + { + "epoch": 0.022415472366673014, + "grad_norm": 14.265059198158891, + "learning_rate": 2.2415043409947022e-05, + "loss": 2.2997, + "mean_token_accuracy": 0.48747730255126953, + "step": 22255 + }, + { + "epoch": 0.022420508419777184, + "grad_norm": 25.427834720968296, + "learning_rate": 2.242007936667808e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.3758620619773865, + "step": 22260 + }, + { + "epoch": 0.022425544472881358, + "grad_norm": 18.51087331360119, + "learning_rate": 2.2425115323409144e-05, + "loss": 2.59, + "mean_token_accuracy": 0.43103447556495667, + "step": 22265 + }, + { + "epoch": 0.02243058052598553, + "grad_norm": 21.42302683306011, + "learning_rate": 2.2430151280140203e-05, + "loss": 2.5512, + "mean_token_accuracy": 0.3896551787853241, + "step": 22270 + }, + { + "epoch": 0.022435616579089702, + "grad_norm": 23.046029473027314, + "learning_rate": 2.2435187236871262e-05, + "loss": 2.5384, + "mean_token_accuracy": 0.36896551847457887, + "step": 22275 + }, + { + "epoch": 0.022440652632193876, + "grad_norm": 16.057161230476066, + "learning_rate": 2.244022319360232e-05, + "loss": 2.6856, + "mean_token_accuracy": 0.4068965554237366, + "step": 22280 + }, + { + "epoch": 0.02244568868529805, + "grad_norm": 17.639397575770012, + "learning_rate": 2.244525915033338e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.4034482717514038, + "step": 22285 + }, + { + "epoch": 0.022450724738402223, + "grad_norm": 17.893885697651772, + "learning_rate": 2.2450295107064443e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.42413792610168455, + "step": 22290 + }, + { + "epoch": 0.022455760791506393, + "grad_norm": 17.329695781495438, + "learning_rate": 2.2455331063795503e-05, + "loss": 2.1294, + "mean_token_accuracy": 0.4103448331356049, + "step": 22295 + }, + { + "epoch": 0.022460796844610567, + "grad_norm": 18.504104992864324, + "learning_rate": 2.246036702052656e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.37241379618644715, + "step": 22300 + }, + { + "epoch": 0.02246583289771474, + "grad_norm": 22.03805586439012, + "learning_rate": 2.246540297725762e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.4482758641242981, + "step": 22305 + }, + { + "epoch": 0.02247086895081891, + "grad_norm": 21.069023053962944, + "learning_rate": 2.247043893398868e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.4344827592372894, + "step": 22310 + }, + { + "epoch": 0.022475905003923085, + "grad_norm": 18.7543630496956, + "learning_rate": 2.247547489071974e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.45862069725990295, + "step": 22315 + }, + { + "epoch": 0.02248094105702726, + "grad_norm": 18.58035747384884, + "learning_rate": 2.24805108474508e-05, + "loss": 2.5487, + "mean_token_accuracy": 0.4068965554237366, + "step": 22320 + }, + { + "epoch": 0.022485977110131432, + "grad_norm": 29.659721198682824, + "learning_rate": 2.2485546804181858e-05, + "loss": 2.7291, + "mean_token_accuracy": 0.33448275923728943, + "step": 22325 + }, + { + "epoch": 0.022491013163235603, + "grad_norm": 23.104117481218953, + "learning_rate": 2.249058276091292e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.40562612414360044, + "step": 22330 + }, + { + "epoch": 0.022496049216339777, + "grad_norm": 17.66567299953306, + "learning_rate": 2.249561871764398e-05, + "loss": 2.6325, + "mean_token_accuracy": 0.3655172407627106, + "step": 22335 + }, + { + "epoch": 0.02250108526944395, + "grad_norm": 27.871278373827845, + "learning_rate": 2.250065467437504e-05, + "loss": 2.7913, + "mean_token_accuracy": 0.3896551728248596, + "step": 22340 + }, + { + "epoch": 0.02250612132254812, + "grad_norm": 20.518959052869597, + "learning_rate": 2.25056906311061e-05, + "loss": 2.4906, + "mean_token_accuracy": 0.4068965554237366, + "step": 22345 + }, + { + "epoch": 0.022511157375652294, + "grad_norm": 23.563047248173834, + "learning_rate": 2.2510726587837158e-05, + "loss": 2.8235, + "mean_token_accuracy": 0.38275861740112305, + "step": 22350 + }, + { + "epoch": 0.022516193428756468, + "grad_norm": 20.75663922178308, + "learning_rate": 2.2515762544568217e-05, + "loss": 2.316, + "mean_token_accuracy": 0.4517241358757019, + "step": 22355 + }, + { + "epoch": 0.022521229481860642, + "grad_norm": 17.69785708724455, + "learning_rate": 2.252079850129928e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.3862069010734558, + "step": 22360 + }, + { + "epoch": 0.022526265534964812, + "grad_norm": 17.627379486652025, + "learning_rate": 2.2525834458030335e-05, + "loss": 2.2386, + "mean_token_accuracy": 0.458620685338974, + "step": 22365 + }, + { + "epoch": 0.022531301588068986, + "grad_norm": 21.97807014633657, + "learning_rate": 2.2530870414761398e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.42758620977401735, + "step": 22370 + }, + { + "epoch": 0.02253633764117316, + "grad_norm": 17.838855015088015, + "learning_rate": 2.2535906371492457e-05, + "loss": 2.3766, + "mean_token_accuracy": 0.41724138259887694, + "step": 22375 + }, + { + "epoch": 0.02254137369427733, + "grad_norm": 21.8998732395183, + "learning_rate": 2.2540942328223517e-05, + "loss": 2.2293, + "mean_token_accuracy": 0.47241379618644713, + "step": 22380 + }, + { + "epoch": 0.022546409747381504, + "grad_norm": 24.701371474290475, + "learning_rate": 2.2545978284954576e-05, + "loss": 2.5738, + "mean_token_accuracy": 0.4206896543502808, + "step": 22385 + }, + { + "epoch": 0.022551445800485678, + "grad_norm": 21.6951296392118, + "learning_rate": 2.2551014241685635e-05, + "loss": 2.4643, + "mean_token_accuracy": 0.43103448748588563, + "step": 22390 + }, + { + "epoch": 0.02255648185358985, + "grad_norm": 21.552926014920672, + "learning_rate": 2.2556050198416694e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.3825166404247284, + "step": 22395 + }, + { + "epoch": 0.02256151790669402, + "grad_norm": 16.728623960915243, + "learning_rate": 2.2561086155147757e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.4482758641242981, + "step": 22400 + }, + { + "epoch": 0.022566553959798195, + "grad_norm": 21.15696615803908, + "learning_rate": 2.2566122111878816e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.42758620381355283, + "step": 22405 + }, + { + "epoch": 0.02257159001290237, + "grad_norm": 22.60642953967318, + "learning_rate": 2.2571158068609876e-05, + "loss": 2.2346, + "mean_token_accuracy": 0.44482759237289426, + "step": 22410 + }, + { + "epoch": 0.02257662606600654, + "grad_norm": 27.526608502129932, + "learning_rate": 2.2576194025340935e-05, + "loss": 2.7476, + "mean_token_accuracy": 0.3827586233615875, + "step": 22415 + }, + { + "epoch": 0.022581662119110713, + "grad_norm": 17.84235743698005, + "learning_rate": 2.2581229982071994e-05, + "loss": 2.2659, + "mean_token_accuracy": 0.4724137902259827, + "step": 22420 + }, + { + "epoch": 0.022586698172214887, + "grad_norm": 20.25861201645504, + "learning_rate": 2.2586265938803057e-05, + "loss": 2.7866, + "mean_token_accuracy": 0.3827586233615875, + "step": 22425 + }, + { + "epoch": 0.02259173422531906, + "grad_norm": 25.454065220567372, + "learning_rate": 2.2591301895534116e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.38275861740112305, + "step": 22430 + }, + { + "epoch": 0.02259677027842323, + "grad_norm": 15.768697011891009, + "learning_rate": 2.2596337852265172e-05, + "loss": 2.5159, + "mean_token_accuracy": 0.4344827473163605, + "step": 22435 + }, + { + "epoch": 0.022601806331527405, + "grad_norm": 26.235903057070672, + "learning_rate": 2.2601373808996234e-05, + "loss": 2.3687, + "mean_token_accuracy": 0.4379310429096222, + "step": 22440 + }, + { + "epoch": 0.02260684238463158, + "grad_norm": 18.37197651668032, + "learning_rate": 2.2606409765727294e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.4413793087005615, + "step": 22445 + }, + { + "epoch": 0.02261187843773575, + "grad_norm": 20.495894762654945, + "learning_rate": 2.2611445722458356e-05, + "loss": 2.5205, + "mean_token_accuracy": 0.39655172228813174, + "step": 22450 + }, + { + "epoch": 0.022616914490839923, + "grad_norm": 20.432753457155183, + "learning_rate": 2.2616481679189412e-05, + "loss": 2.5514, + "mean_token_accuracy": 0.4172413766384125, + "step": 22455 + }, + { + "epoch": 0.022621950543944096, + "grad_norm": 30.722106421553068, + "learning_rate": 2.262151763592047e-05, + "loss": 2.8931, + "mean_token_accuracy": 0.3517241358757019, + "step": 22460 + }, + { + "epoch": 0.02262698659704827, + "grad_norm": 22.013400065153043, + "learning_rate": 2.2626553592651534e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.42068964838981626, + "step": 22465 + }, + { + "epoch": 0.02263202265015244, + "grad_norm": 23.76465805126878, + "learning_rate": 2.2631589549382593e-05, + "loss": 2.7677, + "mean_token_accuracy": 0.3793103456497192, + "step": 22470 + }, + { + "epoch": 0.022637058703256614, + "grad_norm": 17.779989889542147, + "learning_rate": 2.2636625506113653e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.4517241358757019, + "step": 22475 + }, + { + "epoch": 0.022642094756360788, + "grad_norm": 21.257585924279987, + "learning_rate": 2.2641661462844712e-05, + "loss": 2.2506, + "mean_token_accuracy": 0.42413792610168455, + "step": 22480 + }, + { + "epoch": 0.02264713080946496, + "grad_norm": 16.850431182569892, + "learning_rate": 2.264669741957577e-05, + "loss": 2.2979, + "mean_token_accuracy": 0.43793103098869324, + "step": 22485 + }, + { + "epoch": 0.022652166862569132, + "grad_norm": 24.456366384931854, + "learning_rate": 2.2651733376306834e-05, + "loss": 2.6736, + "mean_token_accuracy": 0.39999999701976774, + "step": 22490 + }, + { + "epoch": 0.022657202915673306, + "grad_norm": 19.1417247541628, + "learning_rate": 2.2656769333037893e-05, + "loss": 2.3048, + "mean_token_accuracy": 0.4206896543502808, + "step": 22495 + }, + { + "epoch": 0.02266223896877748, + "grad_norm": 18.84599896684726, + "learning_rate": 2.266180528976895e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.39310345649719236, + "step": 22500 + }, + { + "epoch": 0.02266727502188165, + "grad_norm": 16.62193286530031, + "learning_rate": 2.266684124650001e-05, + "loss": 2.0632, + "mean_token_accuracy": 0.458620685338974, + "step": 22505 + }, + { + "epoch": 0.022672311074985824, + "grad_norm": 18.591903937220767, + "learning_rate": 2.267187720323107e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.4, + "step": 22510 + }, + { + "epoch": 0.022677347128089997, + "grad_norm": 21.217173761893346, + "learning_rate": 2.2676913159962133e-05, + "loss": 2.8769, + "mean_token_accuracy": 0.3999999940395355, + "step": 22515 + }, + { + "epoch": 0.022682383181194168, + "grad_norm": 17.840197566068447, + "learning_rate": 2.268194911669319e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.3931034505367279, + "step": 22520 + }, + { + "epoch": 0.02268741923429834, + "grad_norm": 18.780295823148354, + "learning_rate": 2.268698507342425e-05, + "loss": 2.5879, + "mean_token_accuracy": 0.39655172228813174, + "step": 22525 + }, + { + "epoch": 0.022692455287402515, + "grad_norm": 19.105647606527235, + "learning_rate": 2.269202103015531e-05, + "loss": 2.0668, + "mean_token_accuracy": 0.44482759237289426, + "step": 22530 + }, + { + "epoch": 0.02269749134050669, + "grad_norm": 20.68306278123323, + "learning_rate": 2.269705698688637e-05, + "loss": 2.503, + "mean_token_accuracy": 0.4103448331356049, + "step": 22535 + }, + { + "epoch": 0.02270252739361086, + "grad_norm": 24.15491603822209, + "learning_rate": 2.270209294361743e-05, + "loss": 2.3817, + "mean_token_accuracy": 0.4068965554237366, + "step": 22540 + }, + { + "epoch": 0.022707563446715033, + "grad_norm": 20.827742247720774, + "learning_rate": 2.270712890034849e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.4, + "step": 22545 + }, + { + "epoch": 0.022712599499819207, + "grad_norm": 21.334346528435063, + "learning_rate": 2.2712164857079548e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.4413793087005615, + "step": 22550 + }, + { + "epoch": 0.022717635552923377, + "grad_norm": 19.185609288663436, + "learning_rate": 2.271720081381061e-05, + "loss": 2.175, + "mean_token_accuracy": 0.48765880465507505, + "step": 22555 + }, + { + "epoch": 0.02272267160602755, + "grad_norm": 23.687491018950126, + "learning_rate": 2.272223677054167e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.45862069725990295, + "step": 22560 + }, + { + "epoch": 0.022727707659131725, + "grad_norm": 16.67345590425749, + "learning_rate": 2.272727272727273e-05, + "loss": 2.1727, + "mean_token_accuracy": 0.4172413766384125, + "step": 22565 + }, + { + "epoch": 0.0227327437122359, + "grad_norm": 19.26383388557665, + "learning_rate": 2.273230868400379e-05, + "loss": 2.7115, + "mean_token_accuracy": 0.358620685338974, + "step": 22570 + }, + { + "epoch": 0.02273777976534007, + "grad_norm": 18.937735556298822, + "learning_rate": 2.2737344640734848e-05, + "loss": 2.0429, + "mean_token_accuracy": 0.5044334948062896, + "step": 22575 + }, + { + "epoch": 0.022742815818444242, + "grad_norm": 19.95636119563846, + "learning_rate": 2.2742380597465907e-05, + "loss": 2.5669, + "mean_token_accuracy": 0.4344827592372894, + "step": 22580 + }, + { + "epoch": 0.022747851871548416, + "grad_norm": 22.079111238407474, + "learning_rate": 2.274741655419697e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.41379310488700866, + "step": 22585 + }, + { + "epoch": 0.022752887924652587, + "grad_norm": 20.731706517261895, + "learning_rate": 2.2752452510928026e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.3793103456497192, + "step": 22590 + }, + { + "epoch": 0.02275792397775676, + "grad_norm": 30.561394062291406, + "learning_rate": 2.2757488467659088e-05, + "loss": 2.2307, + "mean_token_accuracy": 0.41905626058578493, + "step": 22595 + }, + { + "epoch": 0.022762960030860934, + "grad_norm": 21.528989707860283, + "learning_rate": 2.2762524424390147e-05, + "loss": 2.7548, + "mean_token_accuracy": 0.3793103456497192, + "step": 22600 + }, + { + "epoch": 0.022767996083965108, + "grad_norm": 15.878088498627157, + "learning_rate": 2.2767560381121207e-05, + "loss": 2.3395, + "mean_token_accuracy": 0.42758620977401735, + "step": 22605 + }, + { + "epoch": 0.022773032137069278, + "grad_norm": 21.38107964537585, + "learning_rate": 2.2772596337852266e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.45668481588363646, + "step": 22610 + }, + { + "epoch": 0.022778068190173452, + "grad_norm": 18.638422817724088, + "learning_rate": 2.2777632294583325e-05, + "loss": 2.419, + "mean_token_accuracy": 0.4344827592372894, + "step": 22615 + }, + { + "epoch": 0.022783104243277626, + "grad_norm": 25.541632104800644, + "learning_rate": 2.2782668251314384e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.3724137842655182, + "step": 22620 + }, + { + "epoch": 0.022788140296381796, + "grad_norm": 24.870099288549408, + "learning_rate": 2.2787704208045447e-05, + "loss": 2.4821, + "mean_token_accuracy": 0.42758620977401735, + "step": 22625 + }, + { + "epoch": 0.02279317634948597, + "grad_norm": 28.984746085927338, + "learning_rate": 2.2792740164776506e-05, + "loss": 2.5235, + "mean_token_accuracy": 0.42758620977401735, + "step": 22630 + }, + { + "epoch": 0.022798212402590144, + "grad_norm": 17.33730303914686, + "learning_rate": 2.2797776121507566e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.4034482717514038, + "step": 22635 + }, + { + "epoch": 0.022803248455694317, + "grad_norm": 17.429441611093488, + "learning_rate": 2.2802812078238625e-05, + "loss": 2.6754, + "mean_token_accuracy": 0.3517241358757019, + "step": 22640 + }, + { + "epoch": 0.022808284508798488, + "grad_norm": 20.765204328300264, + "learning_rate": 2.2807848034969684e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4517241299152374, + "step": 22645 + }, + { + "epoch": 0.02281332056190266, + "grad_norm": 25.36196991681778, + "learning_rate": 2.2812883991700747e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4327888786792755, + "step": 22650 + }, + { + "epoch": 0.022818356615006835, + "grad_norm": 18.79703684930374, + "learning_rate": 2.2817919948431803e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.43103448748588563, + "step": 22655 + }, + { + "epoch": 0.022823392668111005, + "grad_norm": 17.85611908453297, + "learning_rate": 2.2822955905162862e-05, + "loss": 2.3343, + "mean_token_accuracy": 0.4, + "step": 22660 + }, + { + "epoch": 0.02282842872121518, + "grad_norm": 21.226299837353054, + "learning_rate": 2.2827991861893925e-05, + "loss": 2.5153, + "mean_token_accuracy": 0.458620685338974, + "step": 22665 + }, + { + "epoch": 0.022833464774319353, + "grad_norm": 21.64309500389514, + "learning_rate": 2.2833027818624984e-05, + "loss": 2.5086, + "mean_token_accuracy": 0.3862068891525269, + "step": 22670 + }, + { + "epoch": 0.022838500827423527, + "grad_norm": 21.710984538066924, + "learning_rate": 2.2838063775356043e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4034482717514038, + "step": 22675 + }, + { + "epoch": 0.022843536880527697, + "grad_norm": 21.00478712738089, + "learning_rate": 2.2843099732087102e-05, + "loss": 2.8297, + "mean_token_accuracy": 0.3805202662944794, + "step": 22680 + }, + { + "epoch": 0.02284857293363187, + "grad_norm": 21.870098936627976, + "learning_rate": 2.284813568881816e-05, + "loss": 2.497, + "mean_token_accuracy": 0.4344827651977539, + "step": 22685 + }, + { + "epoch": 0.022853608986736045, + "grad_norm": 17.213851678606343, + "learning_rate": 2.2853171645549224e-05, + "loss": 2.2466, + "mean_token_accuracy": 0.44137930274009707, + "step": 22690 + }, + { + "epoch": 0.022858645039840215, + "grad_norm": 17.763106173996384, + "learning_rate": 2.2858207602280283e-05, + "loss": 2.4115, + "mean_token_accuracy": 0.45862067937850953, + "step": 22695 + }, + { + "epoch": 0.02286368109294439, + "grad_norm": 21.85975830465156, + "learning_rate": 2.286324355901134e-05, + "loss": 2.4841, + "mean_token_accuracy": 0.4344827651977539, + "step": 22700 + }, + { + "epoch": 0.022868717146048562, + "grad_norm": 18.601093321681876, + "learning_rate": 2.2868279515742402e-05, + "loss": 2.353, + "mean_token_accuracy": 0.41034482419490814, + "step": 22705 + }, + { + "epoch": 0.022873753199152736, + "grad_norm": 15.097304122293911, + "learning_rate": 2.287331547247346e-05, + "loss": 1.9461, + "mean_token_accuracy": 0.5310344874858857, + "step": 22710 + }, + { + "epoch": 0.022878789252256906, + "grad_norm": 20.603198245841, + "learning_rate": 2.2878351429204524e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.3896551728248596, + "step": 22715 + }, + { + "epoch": 0.02288382530536108, + "grad_norm": 21.944195006115965, + "learning_rate": 2.288338738593558e-05, + "loss": 2.7959, + "mean_token_accuracy": 0.3586206942796707, + "step": 22720 + }, + { + "epoch": 0.022888861358465254, + "grad_norm": 12.962044530940386, + "learning_rate": 2.288842334266664e-05, + "loss": 2.4928, + "mean_token_accuracy": 0.3655172407627106, + "step": 22725 + }, + { + "epoch": 0.022893897411569424, + "grad_norm": 17.448281235913246, + "learning_rate": 2.28934592993977e-05, + "loss": 2.666, + "mean_token_accuracy": 0.35862069129943847, + "step": 22730 + }, + { + "epoch": 0.022898933464673598, + "grad_norm": 32.96787197038869, + "learning_rate": 2.289849525612876e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.4517241299152374, + "step": 22735 + }, + { + "epoch": 0.022903969517777772, + "grad_norm": 21.302498746653814, + "learning_rate": 2.290353121285982e-05, + "loss": 2.2013, + "mean_token_accuracy": 0.4068965494632721, + "step": 22740 + }, + { + "epoch": 0.022909005570881946, + "grad_norm": 20.115490757384034, + "learning_rate": 2.290856716959088e-05, + "loss": 2.6838, + "mean_token_accuracy": 0.40689654350280763, + "step": 22745 + }, + { + "epoch": 0.022914041623986116, + "grad_norm": 23.267723147162197, + "learning_rate": 2.291360312632194e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.4710936903953552, + "step": 22750 + }, + { + "epoch": 0.02291907767709029, + "grad_norm": 18.638277963020272, + "learning_rate": 2.2918639083053e-05, + "loss": 2.376, + "mean_token_accuracy": 0.3931034505367279, + "step": 22755 + }, + { + "epoch": 0.022924113730194463, + "grad_norm": 18.212097372203548, + "learning_rate": 2.292367503978406e-05, + "loss": 2.3664, + "mean_token_accuracy": 0.4724137902259827, + "step": 22760 + }, + { + "epoch": 0.022929149783298634, + "grad_norm": 19.321665218909793, + "learning_rate": 2.292871099651512e-05, + "loss": 2.8056, + "mean_token_accuracy": 0.37931033968925476, + "step": 22765 + }, + { + "epoch": 0.022934185836402807, + "grad_norm": 17.6201713074256, + "learning_rate": 2.293374695324618e-05, + "loss": 2.5647, + "mean_token_accuracy": 0.43448275327682495, + "step": 22770 + }, + { + "epoch": 0.02293922188950698, + "grad_norm": 17.811160390890254, + "learning_rate": 2.2938782909977238e-05, + "loss": 2.5508, + "mean_token_accuracy": 0.37241379022598264, + "step": 22775 + }, + { + "epoch": 0.022944257942611155, + "grad_norm": 18.261853885826937, + "learning_rate": 2.2943818866708297e-05, + "loss": 2.5657, + "mean_token_accuracy": 0.3137931048870087, + "step": 22780 + }, + { + "epoch": 0.022949293995715325, + "grad_norm": 23.7481239411116, + "learning_rate": 2.294885482343936e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4000000059604645, + "step": 22785 + }, + { + "epoch": 0.0229543300488195, + "grad_norm": 17.042074337437455, + "learning_rate": 2.2953890780170416e-05, + "loss": 2.2672, + "mean_token_accuracy": 0.4379310369491577, + "step": 22790 + }, + { + "epoch": 0.022959366101923673, + "grad_norm": 16.890928928924556, + "learning_rate": 2.295892673690148e-05, + "loss": 2.2507, + "mean_token_accuracy": 0.46896551847457885, + "step": 22795 + }, + { + "epoch": 0.022964402155027843, + "grad_norm": 19.464047203597147, + "learning_rate": 2.2963962693632538e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.41034482717514037, + "step": 22800 + }, + { + "epoch": 0.022969438208132017, + "grad_norm": 18.366188725603518, + "learning_rate": 2.2968998650363597e-05, + "loss": 2.1385, + "mean_token_accuracy": 0.4620689630508423, + "step": 22805 + }, + { + "epoch": 0.02297447426123619, + "grad_norm": 23.101494973332073, + "learning_rate": 2.2974034607094656e-05, + "loss": 2.4148, + "mean_token_accuracy": 0.4103448331356049, + "step": 22810 + }, + { + "epoch": 0.022979510314340364, + "grad_norm": 26.724476072881952, + "learning_rate": 2.2979070563825716e-05, + "loss": 2.7551, + "mean_token_accuracy": 0.3965517163276672, + "step": 22815 + }, + { + "epoch": 0.022984546367444535, + "grad_norm": 19.558034340626925, + "learning_rate": 2.2984106520556775e-05, + "loss": 2.2781, + "mean_token_accuracy": 0.4666256129741669, + "step": 22820 + }, + { + "epoch": 0.02298958242054871, + "grad_norm": 18.958228517274435, + "learning_rate": 2.2989142477287838e-05, + "loss": 2.179, + "mean_token_accuracy": 0.44827585816383364, + "step": 22825 + }, + { + "epoch": 0.022994618473652882, + "grad_norm": 17.580023659461865, + "learning_rate": 2.2994178434018897e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.4348457336425781, + "step": 22830 + }, + { + "epoch": 0.022999654526757052, + "grad_norm": 18.822075228700367, + "learning_rate": 2.2999214390749956e-05, + "loss": 2.6342, + "mean_token_accuracy": 0.4034482777118683, + "step": 22835 + }, + { + "epoch": 0.023004690579861226, + "grad_norm": 21.018624034341308, + "learning_rate": 2.3004250347481015e-05, + "loss": 2.4219, + "mean_token_accuracy": 0.4379310369491577, + "step": 22840 + }, + { + "epoch": 0.0230097266329654, + "grad_norm": 19.91394150717735, + "learning_rate": 2.3009286304212075e-05, + "loss": 2.3617, + "mean_token_accuracy": 0.4689655125141144, + "step": 22845 + }, + { + "epoch": 0.023014762686069574, + "grad_norm": 22.90880935741304, + "learning_rate": 2.3014322260943137e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.3827586233615875, + "step": 22850 + }, + { + "epoch": 0.023019798739173744, + "grad_norm": 18.84600555991698, + "learning_rate": 2.3019358217674193e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.3902709364891052, + "step": 22855 + }, + { + "epoch": 0.023024834792277918, + "grad_norm": 17.557138401404146, + "learning_rate": 2.3024394174405252e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.4, + "step": 22860 + }, + { + "epoch": 0.02302987084538209, + "grad_norm": 19.117916121017814, + "learning_rate": 2.3029430131136315e-05, + "loss": 2.1499, + "mean_token_accuracy": 0.4655172348022461, + "step": 22865 + }, + { + "epoch": 0.023034906898486262, + "grad_norm": 20.41709412811478, + "learning_rate": 2.3034466087867374e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.4620689690113068, + "step": 22870 + }, + { + "epoch": 0.023039942951590436, + "grad_norm": 20.080008126135866, + "learning_rate": 2.3039502044598433e-05, + "loss": 2.0551, + "mean_token_accuracy": 0.47586206793785096, + "step": 22875 + }, + { + "epoch": 0.02304497900469461, + "grad_norm": 20.352513627776563, + "learning_rate": 2.3044538001329493e-05, + "loss": 2.4495, + "mean_token_accuracy": 0.4103448212146759, + "step": 22880 + }, + { + "epoch": 0.023050015057798783, + "grad_norm": 24.65168187970714, + "learning_rate": 2.3049573958060552e-05, + "loss": 2.8885, + "mean_token_accuracy": 0.37586207389831544, + "step": 22885 + }, + { + "epoch": 0.023055051110902954, + "grad_norm": 25.48647988694503, + "learning_rate": 2.3054609914791615e-05, + "loss": 2.8524, + "mean_token_accuracy": 0.3551724135875702, + "step": 22890 + }, + { + "epoch": 0.023060087164007127, + "grad_norm": 17.646917083114957, + "learning_rate": 2.3059645871522674e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.4379310369491577, + "step": 22895 + }, + { + "epoch": 0.0230651232171113, + "grad_norm": 23.14327183533927, + "learning_rate": 2.3064681828253733e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.4137930989265442, + "step": 22900 + }, + { + "epoch": 0.02307015927021547, + "grad_norm": 19.92739523568574, + "learning_rate": 2.3069717784984792e-05, + "loss": 2.8309, + "mean_token_accuracy": 0.39497882723808286, + "step": 22905 + }, + { + "epoch": 0.023075195323319645, + "grad_norm": 18.376262145963032, + "learning_rate": 2.307475374171585e-05, + "loss": 2.7732, + "mean_token_accuracy": 0.3840290427207947, + "step": 22910 + }, + { + "epoch": 0.02308023137642382, + "grad_norm": 20.488677917222653, + "learning_rate": 2.3079789698446914e-05, + "loss": 2.1023, + "mean_token_accuracy": 0.4551724135875702, + "step": 22915 + }, + { + "epoch": 0.02308526742952799, + "grad_norm": 17.03609906560991, + "learning_rate": 2.308482565517797e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.4482758641242981, + "step": 22920 + }, + { + "epoch": 0.023090303482632163, + "grad_norm": 26.112126585190843, + "learning_rate": 2.308986161190903e-05, + "loss": 3.0098, + "mean_token_accuracy": 0.36896551847457887, + "step": 22925 + }, + { + "epoch": 0.023095339535736337, + "grad_norm": 18.985530438448993, + "learning_rate": 2.3094897568640092e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.4379310250282288, + "step": 22930 + }, + { + "epoch": 0.02310037558884051, + "grad_norm": 16.922123042320393, + "learning_rate": 2.309993352537115e-05, + "loss": 2.5926, + "mean_token_accuracy": 0.36896551549434664, + "step": 22935 + }, + { + "epoch": 0.02310541164194468, + "grad_norm": 16.930864720978843, + "learning_rate": 2.3104969482102214e-05, + "loss": 2.5479, + "mean_token_accuracy": 0.42413792610168455, + "step": 22940 + }, + { + "epoch": 0.023110447695048855, + "grad_norm": 18.606395868769965, + "learning_rate": 2.311000543883327e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.48160919547080994, + "step": 22945 + }, + { + "epoch": 0.02311548374815303, + "grad_norm": 21.863191604249504, + "learning_rate": 2.311504139556433e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.4620689690113068, + "step": 22950 + }, + { + "epoch": 0.0231205198012572, + "grad_norm": 16.662726102692137, + "learning_rate": 2.312007735229539e-05, + "loss": 2.7683, + "mean_token_accuracy": 0.36551723778247835, + "step": 22955 + }, + { + "epoch": 0.023125555854361372, + "grad_norm": 19.677123315731578, + "learning_rate": 2.312511330902645e-05, + "loss": 2.3679, + "mean_token_accuracy": 0.4137930989265442, + "step": 22960 + }, + { + "epoch": 0.023130591907465546, + "grad_norm": 20.872678676649432, + "learning_rate": 2.313014926575751e-05, + "loss": 2.758, + "mean_token_accuracy": 0.3793103456497192, + "step": 22965 + }, + { + "epoch": 0.02313562796056972, + "grad_norm": 16.632263227426613, + "learning_rate": 2.313518522248857e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.41034482717514037, + "step": 22970 + }, + { + "epoch": 0.02314066401367389, + "grad_norm": 25.55095335528505, + "learning_rate": 2.314022117921963e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.3931034505367279, + "step": 22975 + }, + { + "epoch": 0.023145700066778064, + "grad_norm": 20.012027080019955, + "learning_rate": 2.314525713595069e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.43103448748588563, + "step": 22980 + }, + { + "epoch": 0.023150736119882238, + "grad_norm": 17.574902582764828, + "learning_rate": 2.315029309268175e-05, + "loss": 2.5635, + "mean_token_accuracy": 0.47428917288780215, + "step": 22985 + }, + { + "epoch": 0.023155772172986408, + "grad_norm": 19.712751364062672, + "learning_rate": 2.3155329049412806e-05, + "loss": 2.3971, + "mean_token_accuracy": 0.4172413766384125, + "step": 22990 + }, + { + "epoch": 0.023160808226090582, + "grad_norm": 16.462634401717388, + "learning_rate": 2.316036500614387e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.44827585816383364, + "step": 22995 + }, + { + "epoch": 0.023165844279194756, + "grad_norm": 18.108234727285744, + "learning_rate": 2.316540096287493e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.47586206793785096, + "step": 23000 + }, + { + "epoch": 0.02317088033229893, + "grad_norm": 20.20866342765393, + "learning_rate": 2.3170436919605988e-05, + "loss": 2.5775, + "mean_token_accuracy": 0.3862069010734558, + "step": 23005 + }, + { + "epoch": 0.0231759163854031, + "grad_norm": 17.034229444595386, + "learning_rate": 2.3175472876337047e-05, + "loss": 2.318, + "mean_token_accuracy": 0.4698275923728943, + "step": 23010 + }, + { + "epoch": 0.023180952438507273, + "grad_norm": 18.786363107672358, + "learning_rate": 2.3180508833068106e-05, + "loss": 2.8486, + "mean_token_accuracy": 0.3530550479888916, + "step": 23015 + }, + { + "epoch": 0.023185988491611447, + "grad_norm": 19.18161811533767, + "learning_rate": 2.318554478979917e-05, + "loss": 2.5546, + "mean_token_accuracy": 0.4, + "step": 23020 + }, + { + "epoch": 0.023191024544715617, + "grad_norm": 21.68999575278217, + "learning_rate": 2.3190580746530228e-05, + "loss": 2.5458, + "mean_token_accuracy": 0.3931034505367279, + "step": 23025 + }, + { + "epoch": 0.02319606059781979, + "grad_norm": 22.92044649834235, + "learning_rate": 2.3195616703261287e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.4206896543502808, + "step": 23030 + }, + { + "epoch": 0.023201096650923965, + "grad_norm": 16.142754652578454, + "learning_rate": 2.3200652659992346e-05, + "loss": 2.4486, + "mean_token_accuracy": 0.4068965554237366, + "step": 23035 + }, + { + "epoch": 0.02320613270402814, + "grad_norm": 21.27352017314491, + "learning_rate": 2.3205688616723406e-05, + "loss": 2.6783, + "mean_token_accuracy": 0.39655172228813174, + "step": 23040 + }, + { + "epoch": 0.02321116875713231, + "grad_norm": 22.180863995376555, + "learning_rate": 2.3210724573454465e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.5344827532768249, + "step": 23045 + }, + { + "epoch": 0.023216204810236483, + "grad_norm": 16.206469249136866, + "learning_rate": 2.3215760530185528e-05, + "loss": 2.443, + "mean_token_accuracy": 0.3793103456497192, + "step": 23050 + }, + { + "epoch": 0.023221240863340657, + "grad_norm": 17.963615643345832, + "learning_rate": 2.3220796486916584e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.5068965494632721, + "step": 23055 + }, + { + "epoch": 0.023226276916444827, + "grad_norm": 24.290381567609824, + "learning_rate": 2.3225832443647646e-05, + "loss": 2.6754, + "mean_token_accuracy": 0.4258923172950745, + "step": 23060 + }, + { + "epoch": 0.023231312969549, + "grad_norm": 20.946573083044743, + "learning_rate": 2.3230868400378705e-05, + "loss": 2.384, + "mean_token_accuracy": 0.4344827651977539, + "step": 23065 + }, + { + "epoch": 0.023236349022653174, + "grad_norm": 18.485546373030807, + "learning_rate": 2.3235904357109765e-05, + "loss": 2.0698, + "mean_token_accuracy": 0.5275862038135528, + "step": 23070 + }, + { + "epoch": 0.023241385075757348, + "grad_norm": 17.033514005794434, + "learning_rate": 2.3240940313840824e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.4448275864124298, + "step": 23075 + }, + { + "epoch": 0.02324642112886152, + "grad_norm": 22.632504364202727, + "learning_rate": 2.3245976270571883e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.39999999701976774, + "step": 23080 + }, + { + "epoch": 0.023251457181965692, + "grad_norm": 19.044371870230986, + "learning_rate": 2.3251012227302942e-05, + "loss": 2.2975, + "mean_token_accuracy": 0.42758620381355283, + "step": 23085 + }, + { + "epoch": 0.023256493235069866, + "grad_norm": 14.956867521627869, + "learning_rate": 2.3256048184034005e-05, + "loss": 2.9365, + "mean_token_accuracy": 0.3758620619773865, + "step": 23090 + }, + { + "epoch": 0.023261529288174036, + "grad_norm": 19.66057404064375, + "learning_rate": 2.3261084140765064e-05, + "loss": 2.7816, + "mean_token_accuracy": 0.3620689630508423, + "step": 23095 + }, + { + "epoch": 0.02326656534127821, + "grad_norm": 14.09174184629034, + "learning_rate": 2.3266120097496124e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.45741077661514284, + "step": 23100 + }, + { + "epoch": 0.023271601394382384, + "grad_norm": 12.128026023680654, + "learning_rate": 2.3271156054227183e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.412583190202713, + "step": 23105 + }, + { + "epoch": 0.023276637447486558, + "grad_norm": 19.176754278836363, + "learning_rate": 2.3276192010958242e-05, + "loss": 2.7116, + "mean_token_accuracy": 0.4310344815254211, + "step": 23110 + }, + { + "epoch": 0.023281673500590728, + "grad_norm": 18.963712150852185, + "learning_rate": 2.3281227967689305e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.43793103098869324, + "step": 23115 + }, + { + "epoch": 0.0232867095536949, + "grad_norm": 20.51487798917281, + "learning_rate": 2.3286263924420364e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.4103448212146759, + "step": 23120 + }, + { + "epoch": 0.023291745606799075, + "grad_norm": 21.575264438222536, + "learning_rate": 2.329129988115142e-05, + "loss": 2.6654, + "mean_token_accuracy": 0.3793103456497192, + "step": 23125 + }, + { + "epoch": 0.023296781659903246, + "grad_norm": 21.305805100371053, + "learning_rate": 2.3296335837882482e-05, + "loss": 2.7653, + "mean_token_accuracy": 0.37931033670902253, + "step": 23130 + }, + { + "epoch": 0.02330181771300742, + "grad_norm": 18.156066236647195, + "learning_rate": 2.3301371794613542e-05, + "loss": 2.6291, + "mean_token_accuracy": 0.38275861740112305, + "step": 23135 + }, + { + "epoch": 0.023306853766111593, + "grad_norm": 24.421629261010917, + "learning_rate": 2.3306407751344604e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.41379310488700866, + "step": 23140 + }, + { + "epoch": 0.023311889819215767, + "grad_norm": 20.71044362521377, + "learning_rate": 2.331144370807566e-05, + "loss": 2.8435, + "mean_token_accuracy": 0.33103448152542114, + "step": 23145 + }, + { + "epoch": 0.023316925872319937, + "grad_norm": 149.8158185969122, + "learning_rate": 2.331647966480672e-05, + "loss": 2.4191, + "mean_token_accuracy": 0.4137930989265442, + "step": 23150 + }, + { + "epoch": 0.02332196192542411, + "grad_norm": 17.125177233582733, + "learning_rate": 2.3321515621537782e-05, + "loss": 2.7084, + "mean_token_accuracy": 0.47241378426551817, + "step": 23155 + }, + { + "epoch": 0.023326997978528285, + "grad_norm": 18.4979615776732, + "learning_rate": 2.332655157826884e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.37586206793785093, + "step": 23160 + }, + { + "epoch": 0.023332034031632455, + "grad_norm": 22.74049703360252, + "learning_rate": 2.33315875349999e-05, + "loss": 2.1325, + "mean_token_accuracy": 0.4655172348022461, + "step": 23165 + }, + { + "epoch": 0.02333707008473663, + "grad_norm": 28.40734587243835, + "learning_rate": 2.333662349173096e-05, + "loss": 2.1769, + "mean_token_accuracy": 0.4448275864124298, + "step": 23170 + }, + { + "epoch": 0.023342106137840803, + "grad_norm": 15.90823415689045, + "learning_rate": 2.334165944846202e-05, + "loss": 2.2317, + "mean_token_accuracy": 0.48275862336158754, + "step": 23175 + }, + { + "epoch": 0.023347142190944976, + "grad_norm": 20.8508626072081, + "learning_rate": 2.3346695405193082e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.45862069725990295, + "step": 23180 + }, + { + "epoch": 0.023352178244049147, + "grad_norm": 19.182113655354367, + "learning_rate": 2.335173136192414e-05, + "loss": 2.6433, + "mean_token_accuracy": 0.417241370677948, + "step": 23185 + }, + { + "epoch": 0.02335721429715332, + "grad_norm": 20.67039475556455, + "learning_rate": 2.3356767318655197e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.42068964838981626, + "step": 23190 + }, + { + "epoch": 0.023362250350257494, + "grad_norm": 20.7741829319452, + "learning_rate": 2.336180327538626e-05, + "loss": 2.2663, + "mean_token_accuracy": 0.49039408564567566, + "step": 23195 + }, + { + "epoch": 0.023367286403361665, + "grad_norm": 29.140731300066438, + "learning_rate": 2.336683923211732e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.441379314661026, + "step": 23200 + }, + { + "epoch": 0.02337232245646584, + "grad_norm": 21.57276173554743, + "learning_rate": 2.3371875188848378e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.4172413766384125, + "step": 23205 + }, + { + "epoch": 0.023377358509570012, + "grad_norm": 16.823665657734125, + "learning_rate": 2.3376911145579437e-05, + "loss": 2.4537, + "mean_token_accuracy": 0.4, + "step": 23210 + }, + { + "epoch": 0.023382394562674186, + "grad_norm": 27.237850246821015, + "learning_rate": 2.3381947102310497e-05, + "loss": 2.461, + "mean_token_accuracy": 0.39655171930789945, + "step": 23215 + }, + { + "epoch": 0.023387430615778356, + "grad_norm": 18.487204871777276, + "learning_rate": 2.338698305904156e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.4137930929660797, + "step": 23220 + }, + { + "epoch": 0.02339246666888253, + "grad_norm": 17.68062594413758, + "learning_rate": 2.339201901577262e-05, + "loss": 2.444, + "mean_token_accuracy": 0.4310344934463501, + "step": 23225 + }, + { + "epoch": 0.023397502721986704, + "grad_norm": 19.572802954405955, + "learning_rate": 2.3397054972503678e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.3793103456497192, + "step": 23230 + }, + { + "epoch": 0.023402538775090874, + "grad_norm": 35.07218573970589, + "learning_rate": 2.3402090929234737e-05, + "loss": 2.9119, + "mean_token_accuracy": 0.3758620709180832, + "step": 23235 + }, + { + "epoch": 0.023407574828195048, + "grad_norm": 29.112515450855707, + "learning_rate": 2.3407126885965796e-05, + "loss": 2.5343, + "mean_token_accuracy": 0.4200738906860352, + "step": 23240 + }, + { + "epoch": 0.02341261088129922, + "grad_norm": 21.15423566602211, + "learning_rate": 2.3412162842696855e-05, + "loss": 2.3895, + "mean_token_accuracy": 0.4413793087005615, + "step": 23245 + }, + { + "epoch": 0.023417646934403395, + "grad_norm": 26.525838790535417, + "learning_rate": 2.3417198799427918e-05, + "loss": 2.6741, + "mean_token_accuracy": 0.38620689511299133, + "step": 23250 + }, + { + "epoch": 0.023422682987507566, + "grad_norm": 17.24454108983069, + "learning_rate": 2.3422234756158974e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.38620689511299133, + "step": 23255 + }, + { + "epoch": 0.02342771904061174, + "grad_norm": 16.945555011004476, + "learning_rate": 2.3427270712890037e-05, + "loss": 2.5, + "mean_token_accuracy": 0.38965516686439516, + "step": 23260 + }, + { + "epoch": 0.023432755093715913, + "grad_norm": 19.777421857287845, + "learning_rate": 2.3432306669621096e-05, + "loss": 2.3755, + "mean_token_accuracy": 0.42758620977401735, + "step": 23265 + }, + { + "epoch": 0.023437791146820083, + "grad_norm": 12.783141533142857, + "learning_rate": 2.3437342626352155e-05, + "loss": 2.3924, + "mean_token_accuracy": 0.4620689630508423, + "step": 23270 + }, + { + "epoch": 0.023442827199924257, + "grad_norm": 18.493512293245807, + "learning_rate": 2.3442378583083214e-05, + "loss": 2.0751, + "mean_token_accuracy": 0.48965516686439514, + "step": 23275 + }, + { + "epoch": 0.02344786325302843, + "grad_norm": 16.915018131501643, + "learning_rate": 2.3447414539814274e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.3981246203184128, + "step": 23280 + }, + { + "epoch": 0.023452899306132605, + "grad_norm": 22.963126691437093, + "learning_rate": 2.3452450496545333e-05, + "loss": 2.7325, + "mean_token_accuracy": 0.3379310339689255, + "step": 23285 + }, + { + "epoch": 0.023457935359236775, + "grad_norm": 14.693629660666877, + "learning_rate": 2.3457486453276395e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.4275861978530884, + "step": 23290 + }, + { + "epoch": 0.02346297141234095, + "grad_norm": 18.11448411514457, + "learning_rate": 2.3462522410007455e-05, + "loss": 2.7619, + "mean_token_accuracy": 0.3551724135875702, + "step": 23295 + }, + { + "epoch": 0.023468007465445122, + "grad_norm": 26.292095807138796, + "learning_rate": 2.3467558366738514e-05, + "loss": 2.7848, + "mean_token_accuracy": 0.3793103516101837, + "step": 23300 + }, + { + "epoch": 0.023473043518549293, + "grad_norm": 18.532510370464035, + "learning_rate": 2.3472594323469573e-05, + "loss": 2.1855, + "mean_token_accuracy": 0.47586206793785096, + "step": 23305 + }, + { + "epoch": 0.023478079571653467, + "grad_norm": 17.28162510098119, + "learning_rate": 2.3477630280200633e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.39999998807907106, + "step": 23310 + }, + { + "epoch": 0.02348311562475764, + "grad_norm": 19.6675761182803, + "learning_rate": 2.3482666236931695e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.37586206793785093, + "step": 23315 + }, + { + "epoch": 0.023488151677861814, + "grad_norm": 27.76457846588624, + "learning_rate": 2.3487702193662754e-05, + "loss": 2.5959, + "mean_token_accuracy": 0.40689656138420105, + "step": 23320 + }, + { + "epoch": 0.023493187730965984, + "grad_norm": 20.64928450570137, + "learning_rate": 2.3492738150393814e-05, + "loss": 3.0885, + "mean_token_accuracy": 0.3172413736581802, + "step": 23325 + }, + { + "epoch": 0.023498223784070158, + "grad_norm": 18.917560597409945, + "learning_rate": 2.3497774107124873e-05, + "loss": 2.4874, + "mean_token_accuracy": 0.4482758641242981, + "step": 23330 + }, + { + "epoch": 0.023503259837174332, + "grad_norm": 15.411758240406288, + "learning_rate": 2.3502810063855932e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.4793103516101837, + "step": 23335 + }, + { + "epoch": 0.023508295890278502, + "grad_norm": 20.015300306624695, + "learning_rate": 2.3507846020586995e-05, + "loss": 2.3879, + "mean_token_accuracy": 0.39310344457626345, + "step": 23340 + }, + { + "epoch": 0.023513331943382676, + "grad_norm": 17.345719924823456, + "learning_rate": 2.351288197731805e-05, + "loss": 2.3847, + "mean_token_accuracy": 0.4034482717514038, + "step": 23345 + }, + { + "epoch": 0.02351836799648685, + "grad_norm": 22.258021269623214, + "learning_rate": 2.351791793404911e-05, + "loss": 2.3043, + "mean_token_accuracy": 0.45517241954803467, + "step": 23350 + }, + { + "epoch": 0.023523404049591023, + "grad_norm": 18.761405749653825, + "learning_rate": 2.3522953890780173e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.42044767141342165, + "step": 23355 + }, + { + "epoch": 0.023528440102695194, + "grad_norm": 20.208979398983107, + "learning_rate": 2.3527989847511232e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.43793103098869324, + "step": 23360 + }, + { + "epoch": 0.023533476155799368, + "grad_norm": 19.523103142859696, + "learning_rate": 2.353302580424229e-05, + "loss": 2.8238, + "mean_token_accuracy": 0.35517241060733795, + "step": 23365 + }, + { + "epoch": 0.02353851220890354, + "grad_norm": 18.31660151996064, + "learning_rate": 2.353806176097335e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.4034482777118683, + "step": 23370 + }, + { + "epoch": 0.02354354826200771, + "grad_norm": 18.759401601924488, + "learning_rate": 2.354309771770441e-05, + "loss": 2.5492, + "mean_token_accuracy": 0.43103447556495667, + "step": 23375 + }, + { + "epoch": 0.023548584315111885, + "grad_norm": 16.21704045480718, + "learning_rate": 2.3548133674435472e-05, + "loss": 2.8948, + "mean_token_accuracy": 0.36896551251411436, + "step": 23380 + }, + { + "epoch": 0.02355362036821606, + "grad_norm": 15.923460784263046, + "learning_rate": 2.355316963116653e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.4137930989265442, + "step": 23385 + }, + { + "epoch": 0.023558656421320233, + "grad_norm": 18.906590448105323, + "learning_rate": 2.3558205587897587e-05, + "loss": 2.3943, + "mean_token_accuracy": 0.39655172228813174, + "step": 23390 + }, + { + "epoch": 0.023563692474424403, + "grad_norm": 30.838227828202086, + "learning_rate": 2.356324154462865e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.4551724135875702, + "step": 23395 + }, + { + "epoch": 0.023568728527528577, + "grad_norm": 20.76235156703052, + "learning_rate": 2.356827750135971e-05, + "loss": 2.5859, + "mean_token_accuracy": 0.3931034505367279, + "step": 23400 + }, + { + "epoch": 0.02357376458063275, + "grad_norm": 20.699009468689052, + "learning_rate": 2.3573313458090772e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.40344828367233276, + "step": 23405 + }, + { + "epoch": 0.02357880063373692, + "grad_norm": 17.94740499286189, + "learning_rate": 2.3578349414821828e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.44827587008476255, + "step": 23410 + }, + { + "epoch": 0.023583836686841095, + "grad_norm": 20.281713312645824, + "learning_rate": 2.3583385371552887e-05, + "loss": 2.421, + "mean_token_accuracy": 0.4379310369491577, + "step": 23415 + }, + { + "epoch": 0.02358887273994527, + "grad_norm": 18.559842139842036, + "learning_rate": 2.358842132828395e-05, + "loss": 2.3308, + "mean_token_accuracy": 0.47586206197738645, + "step": 23420 + }, + { + "epoch": 0.023593908793049442, + "grad_norm": 17.881272738383622, + "learning_rate": 2.359345728501501e-05, + "loss": 2.717, + "mean_token_accuracy": 0.3896551728248596, + "step": 23425 + }, + { + "epoch": 0.023598944846153613, + "grad_norm": 15.785861140237035, + "learning_rate": 2.3598493241746068e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.44482759237289426, + "step": 23430 + }, + { + "epoch": 0.023603980899257786, + "grad_norm": 23.018795432041433, + "learning_rate": 2.3603529198477127e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.43623715043067934, + "step": 23435 + }, + { + "epoch": 0.02360901695236196, + "grad_norm": 24.590844691676757, + "learning_rate": 2.3608565155208187e-05, + "loss": 2.7643, + "mean_token_accuracy": 0.39655172228813174, + "step": 23440 + }, + { + "epoch": 0.02361405300546613, + "grad_norm": 20.011124953553157, + "learning_rate": 2.361360111193925e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.4068965554237366, + "step": 23445 + }, + { + "epoch": 0.023619089058570304, + "grad_norm": 18.4145459224808, + "learning_rate": 2.361863706867031e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.43448275327682495, + "step": 23450 + }, + { + "epoch": 0.023624125111674478, + "grad_norm": 23.77908547581733, + "learning_rate": 2.3623673025401364e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.41379310488700866, + "step": 23455 + }, + { + "epoch": 0.023629161164778652, + "grad_norm": 26.079843414908968, + "learning_rate": 2.3628708982132427e-05, + "loss": 2.851, + "mean_token_accuracy": 0.3793103486299515, + "step": 23460 + }, + { + "epoch": 0.023634197217882822, + "grad_norm": 22.261213018982513, + "learning_rate": 2.3633744938863486e-05, + "loss": 2.4946, + "mean_token_accuracy": 0.40344826877117157, + "step": 23465 + }, + { + "epoch": 0.023639233270986996, + "grad_norm": 19.64210856798974, + "learning_rate": 2.3638780895594546e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.36896551847457887, + "step": 23470 + }, + { + "epoch": 0.02364426932409117, + "grad_norm": 21.561370789108572, + "learning_rate": 2.3643816852325608e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.4017543911933899, + "step": 23475 + }, + { + "epoch": 0.02364930537719534, + "grad_norm": 16.11440392242275, + "learning_rate": 2.3648852809056664e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.46551724672317507, + "step": 23480 + }, + { + "epoch": 0.023654341430299514, + "grad_norm": 22.171707410434447, + "learning_rate": 2.3653888765787727e-05, + "loss": 2.907, + "mean_token_accuracy": 0.4, + "step": 23485 + }, + { + "epoch": 0.023659377483403687, + "grad_norm": 31.560156884595546, + "learning_rate": 2.3658924722518786e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.42413793206214906, + "step": 23490 + }, + { + "epoch": 0.02366441353650786, + "grad_norm": 23.9305298764277, + "learning_rate": 2.3663960679249845e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.39655172228813174, + "step": 23495 + }, + { + "epoch": 0.02366944958961203, + "grad_norm": 19.874659995757643, + "learning_rate": 2.3668996635980904e-05, + "loss": 2.4001, + "mean_token_accuracy": 0.3931034505367279, + "step": 23500 + }, + { + "epoch": 0.023674485642716205, + "grad_norm": 17.544494861672433, + "learning_rate": 2.3674032592711964e-05, + "loss": 2.3061, + "mean_token_accuracy": 0.42758620381355283, + "step": 23505 + }, + { + "epoch": 0.02367952169582038, + "grad_norm": 17.39787595227969, + "learning_rate": 2.3679068549443023e-05, + "loss": 2.8009, + "mean_token_accuracy": 0.35862069129943847, + "step": 23510 + }, + { + "epoch": 0.02368455774892455, + "grad_norm": 19.495897384488753, + "learning_rate": 2.3684104506174086e-05, + "loss": 2.0697, + "mean_token_accuracy": 0.453901994228363, + "step": 23515 + }, + { + "epoch": 0.023689593802028723, + "grad_norm": 17.415919447033218, + "learning_rate": 2.3689140462905145e-05, + "loss": 2.0513, + "mean_token_accuracy": 0.4896551609039307, + "step": 23520 + }, + { + "epoch": 0.023694629855132897, + "grad_norm": 16.298400575046205, + "learning_rate": 2.3694176419636204e-05, + "loss": 2.3334, + "mean_token_accuracy": 0.4620689630508423, + "step": 23525 + }, + { + "epoch": 0.02369966590823707, + "grad_norm": 22.940057601258907, + "learning_rate": 2.3699212376367263e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.3931034505367279, + "step": 23530 + }, + { + "epoch": 0.02370470196134124, + "grad_norm": 22.822351243116703, + "learning_rate": 2.3704248333098323e-05, + "loss": 3.0025, + "mean_token_accuracy": 0.37586207389831544, + "step": 23535 + }, + { + "epoch": 0.023709738014445415, + "grad_norm": 25.967107128512296, + "learning_rate": 2.3709284289829385e-05, + "loss": 2.6307, + "mean_token_accuracy": 0.3827586203813553, + "step": 23540 + }, + { + "epoch": 0.02371477406754959, + "grad_norm": 18.29927294579388, + "learning_rate": 2.371432024656044e-05, + "loss": 2.7508, + "mean_token_accuracy": 0.3758620709180832, + "step": 23545 + }, + { + "epoch": 0.02371981012065376, + "grad_norm": 29.075152195889707, + "learning_rate": 2.37193562032915e-05, + "loss": 3.0583, + "mean_token_accuracy": 0.34482758343219755, + "step": 23550 + }, + { + "epoch": 0.023724846173757932, + "grad_norm": 17.41635731050841, + "learning_rate": 2.3724392160022563e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.4172413766384125, + "step": 23555 + }, + { + "epoch": 0.023729882226862106, + "grad_norm": 20.45651449521775, + "learning_rate": 2.3729428116753622e-05, + "loss": 2.2647, + "mean_token_accuracy": 0.4427102208137512, + "step": 23560 + }, + { + "epoch": 0.02373491827996628, + "grad_norm": 16.140200188330002, + "learning_rate": 2.373446407348468e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.41724138259887694, + "step": 23565 + }, + { + "epoch": 0.02373995433307045, + "grad_norm": 26.286340727833142, + "learning_rate": 2.373950003021574e-05, + "loss": 2.3267, + "mean_token_accuracy": 0.38275861740112305, + "step": 23570 + }, + { + "epoch": 0.023744990386174624, + "grad_norm": 19.212919218146784, + "learning_rate": 2.37445359869468e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.4413793087005615, + "step": 23575 + }, + { + "epoch": 0.023750026439278798, + "grad_norm": 16.64514116275354, + "learning_rate": 2.3749571943677863e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.4586206912994385, + "step": 23580 + }, + { + "epoch": 0.023755062492382968, + "grad_norm": 18.944658690430504, + "learning_rate": 2.3754607900408922e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.38275861740112305, + "step": 23585 + }, + { + "epoch": 0.023760098545487142, + "grad_norm": 17.386407272354063, + "learning_rate": 2.3759643857139978e-05, + "loss": 2.212, + "mean_token_accuracy": 0.43793103098869324, + "step": 23590 + }, + { + "epoch": 0.023765134598591316, + "grad_norm": 17.44829653757147, + "learning_rate": 2.376467981387104e-05, + "loss": 2.6267, + "mean_token_accuracy": 0.4430732011795044, + "step": 23595 + }, + { + "epoch": 0.02377017065169549, + "grad_norm": 21.47115270928177, + "learning_rate": 2.37697157706021e-05, + "loss": 2.4859, + "mean_token_accuracy": 0.43103448748588563, + "step": 23600 + }, + { + "epoch": 0.02377520670479966, + "grad_norm": 17.36558017869824, + "learning_rate": 2.3774751727333162e-05, + "loss": 2.2194, + "mean_token_accuracy": 0.4689655125141144, + "step": 23605 + }, + { + "epoch": 0.023780242757903833, + "grad_norm": 16.234407530308633, + "learning_rate": 2.3779787684064218e-05, + "loss": 2.6729, + "mean_token_accuracy": 0.4, + "step": 23610 + }, + { + "epoch": 0.023785278811008007, + "grad_norm": 15.885473407388053, + "learning_rate": 2.3784823640795277e-05, + "loss": 2.205, + "mean_token_accuracy": 0.4758620738983154, + "step": 23615 + }, + { + "epoch": 0.023790314864112178, + "grad_norm": 17.728453772543524, + "learning_rate": 2.378985959752634e-05, + "loss": 2.7969, + "mean_token_accuracy": 0.3655172407627106, + "step": 23620 + }, + { + "epoch": 0.02379535091721635, + "grad_norm": 18.11546209261276, + "learning_rate": 2.37948955542574e-05, + "loss": 2.7741, + "mean_token_accuracy": 0.3344827562570572, + "step": 23625 + }, + { + "epoch": 0.023800386970320525, + "grad_norm": 16.705661265304805, + "learning_rate": 2.379993151098846e-05, + "loss": 2.2082, + "mean_token_accuracy": 0.44482758045196535, + "step": 23630 + }, + { + "epoch": 0.0238054230234247, + "grad_norm": 15.610559454990113, + "learning_rate": 2.3804967467719518e-05, + "loss": 2.3724, + "mean_token_accuracy": 0.4482758641242981, + "step": 23635 + }, + { + "epoch": 0.02381045907652887, + "grad_norm": 19.851795065165458, + "learning_rate": 2.3810003424450577e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.3965517282485962, + "step": 23640 + }, + { + "epoch": 0.023815495129633043, + "grad_norm": 18.216179167334186, + "learning_rate": 2.381503938118164e-05, + "loss": 2.1583, + "mean_token_accuracy": 0.4655172348022461, + "step": 23645 + }, + { + "epoch": 0.023820531182737217, + "grad_norm": 29.54391006734845, + "learning_rate": 2.38200753379127e-05, + "loss": 2.7016, + "mean_token_accuracy": 0.41379310488700866, + "step": 23650 + }, + { + "epoch": 0.023825567235841387, + "grad_norm": 20.483513820459095, + "learning_rate": 2.3825111294643758e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.37241379022598264, + "step": 23655 + }, + { + "epoch": 0.02383060328894556, + "grad_norm": 20.545656693137648, + "learning_rate": 2.3830147251374817e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.38620689511299133, + "step": 23660 + }, + { + "epoch": 0.023835639342049735, + "grad_norm": 23.215230703990457, + "learning_rate": 2.3835183208105877e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4172413766384125, + "step": 23665 + }, + { + "epoch": 0.02384067539515391, + "grad_norm": 19.119674326637153, + "learning_rate": 2.3840219164836936e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.42413793206214906, + "step": 23670 + }, + { + "epoch": 0.02384571144825808, + "grad_norm": 20.949025855348417, + "learning_rate": 2.3845255121568e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.42758620977401735, + "step": 23675 + }, + { + "epoch": 0.023850747501362252, + "grad_norm": 19.02680851002806, + "learning_rate": 2.3850291078299054e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.458620685338974, + "step": 23680 + }, + { + "epoch": 0.023855783554466426, + "grad_norm": 20.85833422456774, + "learning_rate": 2.3855327035030117e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.37241379618644715, + "step": 23685 + }, + { + "epoch": 0.023860819607570596, + "grad_norm": 23.85156260096753, + "learning_rate": 2.3860362991761176e-05, + "loss": 2.9855, + "mean_token_accuracy": 0.2965517193078995, + "step": 23690 + }, + { + "epoch": 0.02386585566067477, + "grad_norm": 16.55918881731979, + "learning_rate": 2.3865398948492236e-05, + "loss": 2.6395, + "mean_token_accuracy": 0.39310344457626345, + "step": 23695 + }, + { + "epoch": 0.023870891713778944, + "grad_norm": 51.2283543673357, + "learning_rate": 2.3870434905223295e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4241379380226135, + "step": 23700 + }, + { + "epoch": 0.023875927766883118, + "grad_norm": 17.387664292208008, + "learning_rate": 2.3875470861954354e-05, + "loss": 2.3641, + "mean_token_accuracy": 0.45517241954803467, + "step": 23705 + }, + { + "epoch": 0.023880963819987288, + "grad_norm": 20.42409548869287, + "learning_rate": 2.3880506818685413e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.4137930989265442, + "step": 23710 + }, + { + "epoch": 0.023885999873091462, + "grad_norm": 18.3082402997011, + "learning_rate": 2.3885542775416476e-05, + "loss": 2.5601, + "mean_token_accuracy": 0.4413793206214905, + "step": 23715 + }, + { + "epoch": 0.023891035926195636, + "grad_norm": 24.44136640189517, + "learning_rate": 2.3890578732147535e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.47931034564971925, + "step": 23720 + }, + { + "epoch": 0.023896071979299806, + "grad_norm": 17.13760110371456, + "learning_rate": 2.3895614688878595e-05, + "loss": 2.2449, + "mean_token_accuracy": 0.43448275327682495, + "step": 23725 + }, + { + "epoch": 0.02390110803240398, + "grad_norm": 15.073744295334292, + "learning_rate": 2.3900650645609654e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.4034482717514038, + "step": 23730 + }, + { + "epoch": 0.023906144085508153, + "grad_norm": 28.7440150885389, + "learning_rate": 2.3905686602340713e-05, + "loss": 2.6923, + "mean_token_accuracy": 0.3827586203813553, + "step": 23735 + }, + { + "epoch": 0.023911180138612327, + "grad_norm": 20.02097132267851, + "learning_rate": 2.3910722559071776e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.42758620381355283, + "step": 23740 + }, + { + "epoch": 0.023916216191716497, + "grad_norm": 17.307676050640012, + "learning_rate": 2.391575851580283e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.43103448748588563, + "step": 23745 + }, + { + "epoch": 0.02392125224482067, + "grad_norm": 19.388220384863942, + "learning_rate": 2.3920794472533894e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.4517241418361664, + "step": 23750 + }, + { + "epoch": 0.023926288297924845, + "grad_norm": 23.414296383731934, + "learning_rate": 2.3925830429264953e-05, + "loss": 2.8317, + "mean_token_accuracy": 0.39310345351696013, + "step": 23755 + }, + { + "epoch": 0.023931324351029015, + "grad_norm": 22.130689801476226, + "learning_rate": 2.3930866385996013e-05, + "loss": 2.7132, + "mean_token_accuracy": 0.3551724076271057, + "step": 23760 + }, + { + "epoch": 0.02393636040413319, + "grad_norm": 30.67360784502667, + "learning_rate": 2.3935902342727072e-05, + "loss": 2.9197, + "mean_token_accuracy": 0.3620689630508423, + "step": 23765 + }, + { + "epoch": 0.023941396457237363, + "grad_norm": 17.469142261168418, + "learning_rate": 2.394093829945813e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.441379314661026, + "step": 23770 + }, + { + "epoch": 0.023946432510341537, + "grad_norm": 37.6236875387337, + "learning_rate": 2.394597425618919e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.4466424763202667, + "step": 23775 + }, + { + "epoch": 0.023951468563445707, + "grad_norm": 21.66049301080567, + "learning_rate": 2.3951010212920253e-05, + "loss": 2.5332, + "mean_token_accuracy": 0.39310344457626345, + "step": 23780 + }, + { + "epoch": 0.02395650461654988, + "grad_norm": 35.47751975115294, + "learning_rate": 2.3956046169651312e-05, + "loss": 2.7288, + "mean_token_accuracy": 0.3862068891525269, + "step": 23785 + }, + { + "epoch": 0.023961540669654054, + "grad_norm": 19.618442503573856, + "learning_rate": 2.396108212638237e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.4206896543502808, + "step": 23790 + }, + { + "epoch": 0.023966576722758225, + "grad_norm": 20.179552832379752, + "learning_rate": 2.396611808311343e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.441379314661026, + "step": 23795 + }, + { + "epoch": 0.0239716127758624, + "grad_norm": 17.96604188546113, + "learning_rate": 2.397115403984449e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.4344827592372894, + "step": 23800 + }, + { + "epoch": 0.023976648828966572, + "grad_norm": 20.144492100942074, + "learning_rate": 2.3976189996575553e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.43103448748588563, + "step": 23805 + }, + { + "epoch": 0.023981684882070746, + "grad_norm": 25.614852115228516, + "learning_rate": 2.398122595330661e-05, + "loss": 2.4909, + "mean_token_accuracy": 0.39310344457626345, + "step": 23810 + }, + { + "epoch": 0.023986720935174916, + "grad_norm": 18.399126908540016, + "learning_rate": 2.3986261910037668e-05, + "loss": 2.7282, + "mean_token_accuracy": 0.41034482717514037, + "step": 23815 + }, + { + "epoch": 0.02399175698827909, + "grad_norm": 25.47444317057748, + "learning_rate": 2.399129786676873e-05, + "loss": 2.513, + "mean_token_accuracy": 0.4517241418361664, + "step": 23820 + }, + { + "epoch": 0.023996793041383264, + "grad_norm": 15.31360084651321, + "learning_rate": 2.399633382349979e-05, + "loss": 2.1752, + "mean_token_accuracy": 0.43793103098869324, + "step": 23825 + }, + { + "epoch": 0.024001829094487434, + "grad_norm": 22.68012745372398, + "learning_rate": 2.400136978023085e-05, + "loss": 2.6447, + "mean_token_accuracy": 0.42413793206214906, + "step": 23830 + }, + { + "epoch": 0.024006865147591608, + "grad_norm": 18.168727697916264, + "learning_rate": 2.4006405736961908e-05, + "loss": 2.4962, + "mean_token_accuracy": 0.38275861740112305, + "step": 23835 + }, + { + "epoch": 0.02401190120069578, + "grad_norm": 16.786267062300013, + "learning_rate": 2.4011441693692968e-05, + "loss": 2.7204, + "mean_token_accuracy": 0.37586207389831544, + "step": 23840 + }, + { + "epoch": 0.024016937253799955, + "grad_norm": 16.016152976189932, + "learning_rate": 2.401647765042403e-05, + "loss": 2.375, + "mean_token_accuracy": 0.4413793206214905, + "step": 23845 + }, + { + "epoch": 0.024021973306904126, + "grad_norm": 17.92130516304088, + "learning_rate": 2.402151360715509e-05, + "loss": 2.4421, + "mean_token_accuracy": 0.43448275327682495, + "step": 23850 + }, + { + "epoch": 0.0240270093600083, + "grad_norm": 16.675625221126964, + "learning_rate": 2.402654956388615e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.3630369007587433, + "step": 23855 + }, + { + "epoch": 0.024032045413112473, + "grad_norm": 19.952075182854696, + "learning_rate": 2.4031585520617208e-05, + "loss": 2.3776, + "mean_token_accuracy": 0.4068965494632721, + "step": 23860 + }, + { + "epoch": 0.024037081466216643, + "grad_norm": 22.10554465846689, + "learning_rate": 2.4036621477348267e-05, + "loss": 2.374, + "mean_token_accuracy": 0.45015124082565305, + "step": 23865 + }, + { + "epoch": 0.024042117519320817, + "grad_norm": 14.676044338683282, + "learning_rate": 2.404165743407933e-05, + "loss": 2.15, + "mean_token_accuracy": 0.5, + "step": 23870 + }, + { + "epoch": 0.02404715357242499, + "grad_norm": 25.812892098466932, + "learning_rate": 2.404669339081039e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.42068966031074523, + "step": 23875 + }, + { + "epoch": 0.024052189625529165, + "grad_norm": 23.64032767711632, + "learning_rate": 2.4051729347541445e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.44827585816383364, + "step": 23880 + }, + { + "epoch": 0.024057225678633335, + "grad_norm": 17.989986680852034, + "learning_rate": 2.4056765304272508e-05, + "loss": 2.5464, + "mean_token_accuracy": 0.3965517163276672, + "step": 23885 + }, + { + "epoch": 0.02406226173173751, + "grad_norm": 20.044227117834797, + "learning_rate": 2.4061801261003567e-05, + "loss": 2.2554, + "mean_token_accuracy": 0.42413792610168455, + "step": 23890 + }, + { + "epoch": 0.024067297784841683, + "grad_norm": 20.638153058133938, + "learning_rate": 2.4066837217734626e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.42068966031074523, + "step": 23895 + }, + { + "epoch": 0.024072333837945853, + "grad_norm": 18.413459950748987, + "learning_rate": 2.4071873174465685e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.4564428210258484, + "step": 23900 + }, + { + "epoch": 0.024077369891050027, + "grad_norm": 16.828320695018007, + "learning_rate": 2.4076909131196745e-05, + "loss": 2.3119, + "mean_token_accuracy": 0.4206896543502808, + "step": 23905 + }, + { + "epoch": 0.0240824059441542, + "grad_norm": 21.81673854474231, + "learning_rate": 2.4081945087927807e-05, + "loss": 2.4467, + "mean_token_accuracy": 0.4034482777118683, + "step": 23910 + }, + { + "epoch": 0.024087441997258374, + "grad_norm": 23.49287935803324, + "learning_rate": 2.4086981044658866e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.3965517282485962, + "step": 23915 + }, + { + "epoch": 0.024092478050362545, + "grad_norm": 19.30502263654567, + "learning_rate": 2.4092017001389926e-05, + "loss": 2.7033, + "mean_token_accuracy": 0.39310343861579894, + "step": 23920 + }, + { + "epoch": 0.02409751410346672, + "grad_norm": 17.662606857422873, + "learning_rate": 2.4097052958120985e-05, + "loss": 2.7485, + "mean_token_accuracy": 0.4034482777118683, + "step": 23925 + }, + { + "epoch": 0.024102550156570892, + "grad_norm": 16.215784273263488, + "learning_rate": 2.4102088914852044e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.4, + "step": 23930 + }, + { + "epoch": 0.024107586209675062, + "grad_norm": 26.66835260547999, + "learning_rate": 2.4107124871583103e-05, + "loss": 2.7017, + "mean_token_accuracy": 0.42758620381355283, + "step": 23935 + }, + { + "epoch": 0.024112622262779236, + "grad_norm": 19.913434709524605, + "learning_rate": 2.4112160828314166e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.3999999940395355, + "step": 23940 + }, + { + "epoch": 0.02411765831588341, + "grad_norm": 21.170122512768614, + "learning_rate": 2.4117196785045222e-05, + "loss": 2.6768, + "mean_token_accuracy": 0.3517241269350052, + "step": 23945 + }, + { + "epoch": 0.024122694368987584, + "grad_norm": 18.035763102311954, + "learning_rate": 2.4122232741776285e-05, + "loss": 2.5846, + "mean_token_accuracy": 0.42758620977401735, + "step": 23950 + }, + { + "epoch": 0.024127730422091754, + "grad_norm": 19.4718076259686, + "learning_rate": 2.4127268698507344e-05, + "loss": 2.322, + "mean_token_accuracy": 0.44827585816383364, + "step": 23955 + }, + { + "epoch": 0.024132766475195928, + "grad_norm": 21.585899262498586, + "learning_rate": 2.4132304655238403e-05, + "loss": 2.6497, + "mean_token_accuracy": 0.43793103098869324, + "step": 23960 + }, + { + "epoch": 0.0241378025283001, + "grad_norm": 19.850923863495765, + "learning_rate": 2.4137340611969462e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.4075123131275177, + "step": 23965 + }, + { + "epoch": 0.024142838581404272, + "grad_norm": 22.17783684920762, + "learning_rate": 2.414237656870052e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.4600120961666107, + "step": 23970 + }, + { + "epoch": 0.024147874634508446, + "grad_norm": 16.894900167050032, + "learning_rate": 2.414741252543158e-05, + "loss": 2.8468, + "mean_token_accuracy": 0.3379310339689255, + "step": 23975 + }, + { + "epoch": 0.02415291068761262, + "grad_norm": 16.912222409000538, + "learning_rate": 2.4152448482162644e-05, + "loss": 2.8597, + "mean_token_accuracy": 0.3551724076271057, + "step": 23980 + }, + { + "epoch": 0.024157946740716793, + "grad_norm": 16.193736500925475, + "learning_rate": 2.4157484438893703e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.44482757449150084, + "step": 23985 + }, + { + "epoch": 0.024162982793820963, + "grad_norm": 21.53111701601264, + "learning_rate": 2.4162520395624762e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.44827585816383364, + "step": 23990 + }, + { + "epoch": 0.024168018846925137, + "grad_norm": 21.193268423501816, + "learning_rate": 2.416755635235582e-05, + "loss": 2.8239, + "mean_token_accuracy": 0.3551724135875702, + "step": 23995 + }, + { + "epoch": 0.02417305490002931, + "grad_norm": 20.592701760354394, + "learning_rate": 2.417259230908688e-05, + "loss": 2.4138, + "mean_token_accuracy": 0.37241379618644715, + "step": 24000 + }, + { + "epoch": 0.02417809095313348, + "grad_norm": 19.921432117200787, + "learning_rate": 2.4177628265817943e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.4034482717514038, + "step": 24005 + }, + { + "epoch": 0.024183127006237655, + "grad_norm": 21.550864731566712, + "learning_rate": 2.4182664222549002e-05, + "loss": 2.817, + "mean_token_accuracy": 0.4103448331356049, + "step": 24010 + }, + { + "epoch": 0.02418816305934183, + "grad_norm": 17.768554714060226, + "learning_rate": 2.418770017928006e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.44482759237289426, + "step": 24015 + }, + { + "epoch": 0.024193199112446002, + "grad_norm": 18.925638938620864, + "learning_rate": 2.419273613601112e-05, + "loss": 2.7243, + "mean_token_accuracy": 0.36896550953388213, + "step": 24020 + }, + { + "epoch": 0.024198235165550173, + "grad_norm": 24.90100334270838, + "learning_rate": 2.419777209274218e-05, + "loss": 2.8157, + "mean_token_accuracy": 0.4034482777118683, + "step": 24025 + }, + { + "epoch": 0.024203271218654347, + "grad_norm": 17.928949221693, + "learning_rate": 2.4202808049473243e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.42758620381355283, + "step": 24030 + }, + { + "epoch": 0.02420830727175852, + "grad_norm": 19.140987729560628, + "learning_rate": 2.42078440062043e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.4034482777118683, + "step": 24035 + }, + { + "epoch": 0.02421334332486269, + "grad_norm": 20.30586760951174, + "learning_rate": 2.4212879962935358e-05, + "loss": 2.5527, + "mean_token_accuracy": 0.4206896543502808, + "step": 24040 + }, + { + "epoch": 0.024218379377966864, + "grad_norm": 17.551572643904933, + "learning_rate": 2.421791591966642e-05, + "loss": 2.455, + "mean_token_accuracy": 0.42758620381355283, + "step": 24045 + }, + { + "epoch": 0.024223415431071038, + "grad_norm": 16.342841664109613, + "learning_rate": 2.422295187639748e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.4275862157344818, + "step": 24050 + }, + { + "epoch": 0.024228451484175212, + "grad_norm": 17.804604179470658, + "learning_rate": 2.422798783312854e-05, + "loss": 2.6032, + "mean_token_accuracy": 0.4068965494632721, + "step": 24055 + }, + { + "epoch": 0.024233487537279382, + "grad_norm": 17.592264622386562, + "learning_rate": 2.42330237898596e-05, + "loss": 2.7319, + "mean_token_accuracy": 0.37241379022598264, + "step": 24060 + }, + { + "epoch": 0.024238523590383556, + "grad_norm": 18.70524892686871, + "learning_rate": 2.4238059746590658e-05, + "loss": 2.792, + "mean_token_accuracy": 0.33103448152542114, + "step": 24065 + }, + { + "epoch": 0.02424355964348773, + "grad_norm": 18.73021823163781, + "learning_rate": 2.424309570332172e-05, + "loss": 2.5406, + "mean_token_accuracy": 0.38965516686439516, + "step": 24070 + }, + { + "epoch": 0.0242485956965919, + "grad_norm": 16.912862108497624, + "learning_rate": 2.424813166005278e-05, + "loss": 2.2733, + "mean_token_accuracy": 0.4379310369491577, + "step": 24075 + }, + { + "epoch": 0.024253631749696074, + "grad_norm": 22.8795921811821, + "learning_rate": 2.4253167616783835e-05, + "loss": 2.5659, + "mean_token_accuracy": 0.4, + "step": 24080 + }, + { + "epoch": 0.024258667802800248, + "grad_norm": 12.80166004182257, + "learning_rate": 2.4258203573514898e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.458620685338974, + "step": 24085 + }, + { + "epoch": 0.02426370385590442, + "grad_norm": 22.465262436657135, + "learning_rate": 2.4263239530245957e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.4344827592372894, + "step": 24090 + }, + { + "epoch": 0.02426873990900859, + "grad_norm": 15.623299848179487, + "learning_rate": 2.4268275486977017e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.43623715043067934, + "step": 24095 + }, + { + "epoch": 0.024273775962112765, + "grad_norm": 19.558770115813186, + "learning_rate": 2.4273311443708076e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.43103447556495667, + "step": 24100 + }, + { + "epoch": 0.02427881201521694, + "grad_norm": 19.686222048753695, + "learning_rate": 2.4278347400439135e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.4689655125141144, + "step": 24105 + }, + { + "epoch": 0.02428384806832111, + "grad_norm": 19.39144627877769, + "learning_rate": 2.4283383357170198e-05, + "loss": 2.7346, + "mean_token_accuracy": 0.3758620619773865, + "step": 24110 + }, + { + "epoch": 0.024288884121425283, + "grad_norm": 18.151312673542574, + "learning_rate": 2.4288419313901257e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.4379310369491577, + "step": 24115 + }, + { + "epoch": 0.024293920174529457, + "grad_norm": 19.477249769427054, + "learning_rate": 2.4293455270632316e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4379310429096222, + "step": 24120 + }, + { + "epoch": 0.02429895622763363, + "grad_norm": 31.726814248013124, + "learning_rate": 2.4298491227363375e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.41724138259887694, + "step": 24125 + }, + { + "epoch": 0.0243039922807378, + "grad_norm": 18.966070269860957, + "learning_rate": 2.4303527184094435e-05, + "loss": 2.758, + "mean_token_accuracy": 0.37931033968925476, + "step": 24130 + }, + { + "epoch": 0.024309028333841975, + "grad_norm": 17.80509596771346, + "learning_rate": 2.4308563140825497e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.4000000059604645, + "step": 24135 + }, + { + "epoch": 0.02431406438694615, + "grad_norm": 19.937091406991943, + "learning_rate": 2.4313599097556557e-05, + "loss": 2.703, + "mean_token_accuracy": 0.4344827592372894, + "step": 24140 + }, + { + "epoch": 0.02431910044005032, + "grad_norm": 15.898890802254346, + "learning_rate": 2.4318635054287612e-05, + "loss": 2.3847, + "mean_token_accuracy": 0.42413792908191683, + "step": 24145 + }, + { + "epoch": 0.024324136493154493, + "grad_norm": 19.43573129709378, + "learning_rate": 2.4323671011018675e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.44137930274009707, + "step": 24150 + }, + { + "epoch": 0.024329172546258666, + "grad_norm": 20.771969583912146, + "learning_rate": 2.4328706967749734e-05, + "loss": 2.2578, + "mean_token_accuracy": 0.4482758641242981, + "step": 24155 + }, + { + "epoch": 0.02433420859936284, + "grad_norm": 23.651249681451077, + "learning_rate": 2.4333742924480794e-05, + "loss": 2.3961, + "mean_token_accuracy": 0.41724138259887694, + "step": 24160 + }, + { + "epoch": 0.02433924465246701, + "grad_norm": 17.032808415753045, + "learning_rate": 2.4338778881211853e-05, + "loss": 2.236, + "mean_token_accuracy": 0.44827585816383364, + "step": 24165 + }, + { + "epoch": 0.024344280705571184, + "grad_norm": 24.679534951733046, + "learning_rate": 2.4343814837942912e-05, + "loss": 2.2653, + "mean_token_accuracy": 0.42413793206214906, + "step": 24170 + }, + { + "epoch": 0.024349316758675358, + "grad_norm": 20.729235655324032, + "learning_rate": 2.4348850794673975e-05, + "loss": 2.855, + "mean_token_accuracy": 0.4068965494632721, + "step": 24175 + }, + { + "epoch": 0.024354352811779528, + "grad_norm": 20.261565463783676, + "learning_rate": 2.4353886751405034e-05, + "loss": 2.6154, + "mean_token_accuracy": 0.4068965554237366, + "step": 24180 + }, + { + "epoch": 0.024359388864883702, + "grad_norm": 17.39036865028312, + "learning_rate": 2.4358922708136093e-05, + "loss": 2.2315, + "mean_token_accuracy": 0.4551724076271057, + "step": 24185 + }, + { + "epoch": 0.024364424917987876, + "grad_norm": 44.123394292928026, + "learning_rate": 2.4363958664867153e-05, + "loss": 2.3042, + "mean_token_accuracy": 0.43103448748588563, + "step": 24190 + }, + { + "epoch": 0.02436946097109205, + "grad_norm": 16.46897818962927, + "learning_rate": 2.4368994621598212e-05, + "loss": 2.192, + "mean_token_accuracy": 0.5158499777317047, + "step": 24195 + }, + { + "epoch": 0.02437449702419622, + "grad_norm": 28.367890542714264, + "learning_rate": 2.437403057832927e-05, + "loss": 2.8244, + "mean_token_accuracy": 0.3482758551836014, + "step": 24200 + }, + { + "epoch": 0.024379533077300394, + "grad_norm": 20.49783329111513, + "learning_rate": 2.4379066535060334e-05, + "loss": 2.336, + "mean_token_accuracy": 0.4620689570903778, + "step": 24205 + }, + { + "epoch": 0.024384569130404567, + "grad_norm": 15.318189432820885, + "learning_rate": 2.4384102491791393e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.43448275327682495, + "step": 24210 + }, + { + "epoch": 0.024389605183508738, + "grad_norm": 20.647187093968235, + "learning_rate": 2.4389138448522452e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.36206896901130675, + "step": 24215 + }, + { + "epoch": 0.02439464123661291, + "grad_norm": 17.669452701050684, + "learning_rate": 2.439417440525351e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.4241379380226135, + "step": 24220 + }, + { + "epoch": 0.024399677289717085, + "grad_norm": 15.753709691855176, + "learning_rate": 2.439921036198457e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.45517241954803467, + "step": 24225 + }, + { + "epoch": 0.02440471334282126, + "grad_norm": 23.721320606344243, + "learning_rate": 2.4404246318715633e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.4034482777118683, + "step": 24230 + }, + { + "epoch": 0.02440974939592543, + "grad_norm": 22.503598763814765, + "learning_rate": 2.440928227544669e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.4724137902259827, + "step": 24235 + }, + { + "epoch": 0.024414785449029603, + "grad_norm": 19.017136080703306, + "learning_rate": 2.441431823217775e-05, + "loss": 2.7946, + "mean_token_accuracy": 0.3068965494632721, + "step": 24240 + }, + { + "epoch": 0.024419821502133777, + "grad_norm": 23.327749943169568, + "learning_rate": 2.441935418890881e-05, + "loss": 2.7489, + "mean_token_accuracy": 0.37931033968925476, + "step": 24245 + }, + { + "epoch": 0.024424857555237947, + "grad_norm": 15.832258500108582, + "learning_rate": 2.442439014563987e-05, + "loss": 2.436, + "mean_token_accuracy": 0.44652147889137267, + "step": 24250 + }, + { + "epoch": 0.02442989360834212, + "grad_norm": 15.748477578508847, + "learning_rate": 2.442942610237093e-05, + "loss": 2.542, + "mean_token_accuracy": 0.4034482777118683, + "step": 24255 + }, + { + "epoch": 0.024434929661446295, + "grad_norm": 15.790439998427184, + "learning_rate": 2.443446205910199e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.4517241299152374, + "step": 24260 + }, + { + "epoch": 0.02443996571455047, + "grad_norm": 18.772263555079316, + "learning_rate": 2.4439498015833048e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.44827585816383364, + "step": 24265 + }, + { + "epoch": 0.02444500176765464, + "grad_norm": 18.544639463434557, + "learning_rate": 2.444453397256411e-05, + "loss": 2.3275, + "mean_token_accuracy": 0.39310344457626345, + "step": 24270 + }, + { + "epoch": 0.024450037820758812, + "grad_norm": 26.590497230399418, + "learning_rate": 2.444956992929517e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.43448275327682495, + "step": 24275 + }, + { + "epoch": 0.024455073873862986, + "grad_norm": 12.62897062329328, + "learning_rate": 2.4454605886026226e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.44827585816383364, + "step": 24280 + }, + { + "epoch": 0.024460109926967157, + "grad_norm": 18.154489320379174, + "learning_rate": 2.445964184275729e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.41379310488700866, + "step": 24285 + }, + { + "epoch": 0.02446514598007133, + "grad_norm": 20.99429726009109, + "learning_rate": 2.4464677799488348e-05, + "loss": 2.691, + "mean_token_accuracy": 0.41379310488700866, + "step": 24290 + }, + { + "epoch": 0.024470182033175504, + "grad_norm": 20.71018682888791, + "learning_rate": 2.446971375621941e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.37241379022598264, + "step": 24295 + }, + { + "epoch": 0.024475218086279678, + "grad_norm": 19.336803756713824, + "learning_rate": 2.4474749712950466e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.44482758045196535, + "step": 24300 + }, + { + "epoch": 0.024480254139383848, + "grad_norm": 23.159013081144217, + "learning_rate": 2.4479785669681525e-05, + "loss": 2.7001, + "mean_token_accuracy": 0.3758620649576187, + "step": 24305 + }, + { + "epoch": 0.024485290192488022, + "grad_norm": 20.732985893209918, + "learning_rate": 2.4484821626412588e-05, + "loss": 2.4586, + "mean_token_accuracy": 0.39310344457626345, + "step": 24310 + }, + { + "epoch": 0.024490326245592196, + "grad_norm": 23.471164030980802, + "learning_rate": 2.4489857583143647e-05, + "loss": 2.3553, + "mean_token_accuracy": 0.3999999940395355, + "step": 24315 + }, + { + "epoch": 0.024495362298696366, + "grad_norm": 16.595434863592082, + "learning_rate": 2.4494893539874707e-05, + "loss": 1.9999, + "mean_token_accuracy": 0.4896551728248596, + "step": 24320 + }, + { + "epoch": 0.02450039835180054, + "grad_norm": 18.19664889333985, + "learning_rate": 2.4499929496605766e-05, + "loss": 2.4495, + "mean_token_accuracy": 0.420689657330513, + "step": 24325 + }, + { + "epoch": 0.024505434404904713, + "grad_norm": 20.45755013844865, + "learning_rate": 2.4504965453336825e-05, + "loss": 2.6182, + "mean_token_accuracy": 0.4137930989265442, + "step": 24330 + }, + { + "epoch": 0.024510470458008887, + "grad_norm": 20.325710443332817, + "learning_rate": 2.4510001410067888e-05, + "loss": 2.6119, + "mean_token_accuracy": 0.3793103456497192, + "step": 24335 + }, + { + "epoch": 0.024515506511113058, + "grad_norm": 19.34484418395446, + "learning_rate": 2.4515037366798947e-05, + "loss": 2.3206, + "mean_token_accuracy": 0.4553538978099823, + "step": 24340 + }, + { + "epoch": 0.02452054256421723, + "grad_norm": 15.669025808095792, + "learning_rate": 2.4520073323530003e-05, + "loss": 2.2205, + "mean_token_accuracy": 0.46896551847457885, + "step": 24345 + }, + { + "epoch": 0.024525578617321405, + "grad_norm": 13.827188307515428, + "learning_rate": 2.4525109280261066e-05, + "loss": 2.1297, + "mean_token_accuracy": 0.4620689690113068, + "step": 24350 + }, + { + "epoch": 0.024530614670425575, + "grad_norm": 19.620393159668154, + "learning_rate": 2.4530145236992125e-05, + "loss": 2.7106, + "mean_token_accuracy": 0.41034482717514037, + "step": 24355 + }, + { + "epoch": 0.02453565072352975, + "grad_norm": 30.779562881948802, + "learning_rate": 2.4535181193723184e-05, + "loss": 2.4532, + "mean_token_accuracy": 0.4344827592372894, + "step": 24360 + }, + { + "epoch": 0.024540686776633923, + "grad_norm": 18.567332813367724, + "learning_rate": 2.4540217150454243e-05, + "loss": 2.6432, + "mean_token_accuracy": 0.3793103456497192, + "step": 24365 + }, + { + "epoch": 0.024545722829738097, + "grad_norm": 16.927543303379768, + "learning_rate": 2.4545253107185303e-05, + "loss": 2.6049, + "mean_token_accuracy": 0.41379310488700866, + "step": 24370 + }, + { + "epoch": 0.024550758882842267, + "grad_norm": 19.021473216580382, + "learning_rate": 2.4550289063916365e-05, + "loss": 2.2336, + "mean_token_accuracy": 0.4344827562570572, + "step": 24375 + }, + { + "epoch": 0.02455579493594644, + "grad_norm": 19.40924859198866, + "learning_rate": 2.4555325020647424e-05, + "loss": 2.8492, + "mean_token_accuracy": 0.4034482777118683, + "step": 24380 + }, + { + "epoch": 0.024560830989050614, + "grad_norm": 19.304551417622783, + "learning_rate": 2.4560360977378484e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.37586206793785093, + "step": 24385 + }, + { + "epoch": 0.024565867042154785, + "grad_norm": 16.389793628574363, + "learning_rate": 2.4565396934109543e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.42758620977401735, + "step": 24390 + }, + { + "epoch": 0.02457090309525896, + "grad_norm": 19.716214146017972, + "learning_rate": 2.4570432890840602e-05, + "loss": 2.9267, + "mean_token_accuracy": 0.3620689630508423, + "step": 24395 + }, + { + "epoch": 0.024575939148363132, + "grad_norm": 15.230494681223712, + "learning_rate": 2.457546884757166e-05, + "loss": 2.2653, + "mean_token_accuracy": 0.4206896543502808, + "step": 24400 + }, + { + "epoch": 0.024580975201467306, + "grad_norm": 22.30345949185059, + "learning_rate": 2.4580504804302724e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.4206896543502808, + "step": 24405 + }, + { + "epoch": 0.024586011254571476, + "grad_norm": 17.37170640533167, + "learning_rate": 2.4585540761033783e-05, + "loss": 2.0535, + "mean_token_accuracy": 0.4834240794181824, + "step": 24410 + }, + { + "epoch": 0.02459104730767565, + "grad_norm": 24.82227342826864, + "learning_rate": 2.4590576717764843e-05, + "loss": 2.9718, + "mean_token_accuracy": 0.37586207389831544, + "step": 24415 + }, + { + "epoch": 0.024596083360779824, + "grad_norm": 17.769985613307394, + "learning_rate": 2.4595612674495902e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.42758620977401735, + "step": 24420 + }, + { + "epoch": 0.024601119413883994, + "grad_norm": 20.150924313811835, + "learning_rate": 2.460064863122696e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.4396854341030121, + "step": 24425 + }, + { + "epoch": 0.024606155466988168, + "grad_norm": 21.11160829830446, + "learning_rate": 2.4605684587958024e-05, + "loss": 2.787, + "mean_token_accuracy": 0.36896551847457887, + "step": 24430 + }, + { + "epoch": 0.02461119152009234, + "grad_norm": 17.163781991658798, + "learning_rate": 2.461072054468908e-05, + "loss": 2.4174, + "mean_token_accuracy": 0.4555958867073059, + "step": 24435 + }, + { + "epoch": 0.024616227573196515, + "grad_norm": 29.660014470165088, + "learning_rate": 2.461575650142014e-05, + "loss": 2.6861, + "mean_token_accuracy": 0.3655172407627106, + "step": 24440 + }, + { + "epoch": 0.024621263626300686, + "grad_norm": 17.705153596476247, + "learning_rate": 2.46207924581512e-05, + "loss": 2.3078, + "mean_token_accuracy": 0.4517241418361664, + "step": 24445 + }, + { + "epoch": 0.02462629967940486, + "grad_norm": 18.729596378986045, + "learning_rate": 2.462582841488226e-05, + "loss": 2.5169, + "mean_token_accuracy": 0.41911675333976744, + "step": 24450 + }, + { + "epoch": 0.024631335732509033, + "grad_norm": 24.20005484762973, + "learning_rate": 2.463086437161332e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.3965517163276672, + "step": 24455 + }, + { + "epoch": 0.024636371785613204, + "grad_norm": 18.28864537992677, + "learning_rate": 2.463590032834438e-05, + "loss": 2.3671, + "mean_token_accuracy": 0.4620689630508423, + "step": 24460 + }, + { + "epoch": 0.024641407838717377, + "grad_norm": 17.10611002962765, + "learning_rate": 2.464093628507544e-05, + "loss": 2.0954, + "mean_token_accuracy": 0.43103447556495667, + "step": 24465 + }, + { + "epoch": 0.02464644389182155, + "grad_norm": 18.72500807908979, + "learning_rate": 2.46459722418065e-05, + "loss": 2.2785, + "mean_token_accuracy": 0.47241379618644713, + "step": 24470 + }, + { + "epoch": 0.024651479944925725, + "grad_norm": 20.651166475084864, + "learning_rate": 2.465100819853756e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.3862069010734558, + "step": 24475 + }, + { + "epoch": 0.024656515998029895, + "grad_norm": 23.84414566074428, + "learning_rate": 2.4656044155268616e-05, + "loss": 2.5527, + "mean_token_accuracy": 0.4310344845056534, + "step": 24480 + }, + { + "epoch": 0.02466155205113407, + "grad_norm": 19.637821449590685, + "learning_rate": 2.466108011199968e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.48136720061302185, + "step": 24485 + }, + { + "epoch": 0.024666588104238243, + "grad_norm": 13.251230720661296, + "learning_rate": 2.4666116068730738e-05, + "loss": 2.3448, + "mean_token_accuracy": 0.4413793087005615, + "step": 24490 + }, + { + "epoch": 0.024671624157342413, + "grad_norm": 19.02134762061157, + "learning_rate": 2.46711520254618e-05, + "loss": 2.8104, + "mean_token_accuracy": 0.37241379022598264, + "step": 24495 + }, + { + "epoch": 0.024676660210446587, + "grad_norm": 17.52894686642188, + "learning_rate": 2.4676187982192857e-05, + "loss": 2.3716, + "mean_token_accuracy": 0.4275862157344818, + "step": 24500 + }, + { + "epoch": 0.02468169626355076, + "grad_norm": 15.92385736218499, + "learning_rate": 2.4681223938923916e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.4206896543502808, + "step": 24505 + }, + { + "epoch": 0.024686732316654934, + "grad_norm": 16.775405667959248, + "learning_rate": 2.468625989565498e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.3862068891525269, + "step": 24510 + }, + { + "epoch": 0.024691768369759105, + "grad_norm": 15.744594786796725, + "learning_rate": 2.4691295852386038e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.4689655125141144, + "step": 24515 + }, + { + "epoch": 0.02469680442286328, + "grad_norm": 18.92062508377345, + "learning_rate": 2.4696331809117097e-05, + "loss": 2.4478, + "mean_token_accuracy": 0.4379310250282288, + "step": 24520 + }, + { + "epoch": 0.024701840475967452, + "grad_norm": 15.49140789864529, + "learning_rate": 2.4701367765848156e-05, + "loss": 2.332, + "mean_token_accuracy": 0.5088929176330567, + "step": 24525 + }, + { + "epoch": 0.024706876529071622, + "grad_norm": 18.91393168704787, + "learning_rate": 2.4706403722579216e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.3999999940395355, + "step": 24530 + }, + { + "epoch": 0.024711912582175796, + "grad_norm": 23.20342370745326, + "learning_rate": 2.4711439679310278e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.41379311084747317, + "step": 24535 + }, + { + "epoch": 0.02471694863527997, + "grad_norm": 14.75347209913705, + "learning_rate": 2.4716475636041337e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.4448275864124298, + "step": 24540 + }, + { + "epoch": 0.024721984688384144, + "grad_norm": 22.740160004280103, + "learning_rate": 2.4721511592772397e-05, + "loss": 2.7785, + "mean_token_accuracy": 0.41379310488700866, + "step": 24545 + }, + { + "epoch": 0.024727020741488314, + "grad_norm": 19.953504453862625, + "learning_rate": 2.4726547549503456e-05, + "loss": 2.169, + "mean_token_accuracy": 0.4206896543502808, + "step": 24550 + }, + { + "epoch": 0.024732056794592488, + "grad_norm": 18.31913564290947, + "learning_rate": 2.4731583506234515e-05, + "loss": 2.6735, + "mean_token_accuracy": 0.3758620619773865, + "step": 24555 + }, + { + "epoch": 0.02473709284769666, + "grad_norm": 21.626815384586703, + "learning_rate": 2.4736619462965578e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.47586206197738645, + "step": 24560 + }, + { + "epoch": 0.024742128900800832, + "grad_norm": 16.998328177654432, + "learning_rate": 2.4741655419696637e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.45710828304290774, + "step": 24565 + }, + { + "epoch": 0.024747164953905006, + "grad_norm": 18.589248889678103, + "learning_rate": 2.4746691376427693e-05, + "loss": 2.645, + "mean_token_accuracy": 0.39310344457626345, + "step": 24570 + }, + { + "epoch": 0.02475220100700918, + "grad_norm": 16.764084229307176, + "learning_rate": 2.4751727333158756e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.42758620977401735, + "step": 24575 + }, + { + "epoch": 0.024757237060113353, + "grad_norm": 20.164671405794497, + "learning_rate": 2.4756763289889815e-05, + "loss": 2.7357, + "mean_token_accuracy": 0.3551724135875702, + "step": 24580 + }, + { + "epoch": 0.024762273113217523, + "grad_norm": 14.868518052332309, + "learning_rate": 2.4761799246620874e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.38620689511299133, + "step": 24585 + }, + { + "epoch": 0.024767309166321697, + "grad_norm": 13.859672935724632, + "learning_rate": 2.4766835203351933e-05, + "loss": 2.411, + "mean_token_accuracy": 0.4413793087005615, + "step": 24590 + }, + { + "epoch": 0.02477234521942587, + "grad_norm": 15.927321620647696, + "learning_rate": 2.4771871160082993e-05, + "loss": 2.609, + "mean_token_accuracy": 0.40344828367233276, + "step": 24595 + }, + { + "epoch": 0.02477738127253004, + "grad_norm": 16.895244227435985, + "learning_rate": 2.4776907116814055e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.3655172407627106, + "step": 24600 + }, + { + "epoch": 0.024782417325634215, + "grad_norm": 19.04777104774694, + "learning_rate": 2.4781943073545115e-05, + "loss": 2.379, + "mean_token_accuracy": 0.4379310369491577, + "step": 24605 + }, + { + "epoch": 0.02478745337873839, + "grad_norm": 19.831878238366187, + "learning_rate": 2.4786979030276174e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.40344828367233276, + "step": 24610 + }, + { + "epoch": 0.024792489431842563, + "grad_norm": 22.728341081182506, + "learning_rate": 2.4792014987007233e-05, + "loss": 2.7505, + "mean_token_accuracy": 0.4068965494632721, + "step": 24615 + }, + { + "epoch": 0.024797525484946733, + "grad_norm": 16.095819646876706, + "learning_rate": 2.4797050943738292e-05, + "loss": 2.461, + "mean_token_accuracy": 0.4068965494632721, + "step": 24620 + }, + { + "epoch": 0.024802561538050907, + "grad_norm": 16.79054018959549, + "learning_rate": 2.480208690046935e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.45862067937850953, + "step": 24625 + }, + { + "epoch": 0.02480759759115508, + "grad_norm": 18.731074311834142, + "learning_rate": 2.4807122857200414e-05, + "loss": 2.6435, + "mean_token_accuracy": 0.4332123279571533, + "step": 24630 + }, + { + "epoch": 0.02481263364425925, + "grad_norm": 19.504385883659808, + "learning_rate": 2.481215881393147e-05, + "loss": 2.7015, + "mean_token_accuracy": 0.39443435668945315, + "step": 24635 + }, + { + "epoch": 0.024817669697363424, + "grad_norm": 20.773113840314863, + "learning_rate": 2.4817194770662533e-05, + "loss": 2.6341, + "mean_token_accuracy": 0.3793103337287903, + "step": 24640 + }, + { + "epoch": 0.024822705750467598, + "grad_norm": 19.93392308403074, + "learning_rate": 2.4822230727393592e-05, + "loss": 2.2136, + "mean_token_accuracy": 0.4448275864124298, + "step": 24645 + }, + { + "epoch": 0.024827741803571772, + "grad_norm": 20.33925334642438, + "learning_rate": 2.482726668412465e-05, + "loss": 2.3626, + "mean_token_accuracy": 0.43448275327682495, + "step": 24650 + }, + { + "epoch": 0.024832777856675942, + "grad_norm": 16.790907366349888, + "learning_rate": 2.483230264085571e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.43103447556495667, + "step": 24655 + }, + { + "epoch": 0.024837813909780116, + "grad_norm": 20.972465004372598, + "learning_rate": 2.483733859758677e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.3896551728248596, + "step": 24660 + }, + { + "epoch": 0.02484284996288429, + "grad_norm": 14.418340893590205, + "learning_rate": 2.484237455431783e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.39310343861579894, + "step": 24665 + }, + { + "epoch": 0.02484788601598846, + "grad_norm": 18.5742558577864, + "learning_rate": 2.484741051104889e-05, + "loss": 2.7675, + "mean_token_accuracy": 0.39655172228813174, + "step": 24670 + }, + { + "epoch": 0.024852922069092634, + "grad_norm": 21.03926611979944, + "learning_rate": 2.485244646777995e-05, + "loss": 2.6652, + "mean_token_accuracy": 0.43793103098869324, + "step": 24675 + }, + { + "epoch": 0.024857958122196808, + "grad_norm": 29.160457014636684, + "learning_rate": 2.485748242451101e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.36896551847457887, + "step": 24680 + }, + { + "epoch": 0.02486299417530098, + "grad_norm": 18.981458607799254, + "learning_rate": 2.486251838124207e-05, + "loss": 2.5516, + "mean_token_accuracy": 0.3965517282485962, + "step": 24685 + }, + { + "epoch": 0.02486803022840515, + "grad_norm": 82.18648647100278, + "learning_rate": 2.486755433797313e-05, + "loss": 2.5099, + "mean_token_accuracy": 0.42438423037528994, + "step": 24690 + }, + { + "epoch": 0.024873066281509325, + "grad_norm": 21.305590635899502, + "learning_rate": 2.487259029470419e-05, + "loss": 2.4482, + "mean_token_accuracy": 0.36896551251411436, + "step": 24695 + }, + { + "epoch": 0.0248781023346135, + "grad_norm": 19.534029317380895, + "learning_rate": 2.4877626251435247e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.4034482777118683, + "step": 24700 + }, + { + "epoch": 0.02488313838771767, + "grad_norm": 15.089009895598371, + "learning_rate": 2.4882662208166306e-05, + "loss": 2.6547, + "mean_token_accuracy": 0.3517241358757019, + "step": 24705 + }, + { + "epoch": 0.024888174440821843, + "grad_norm": 15.522176264309552, + "learning_rate": 2.488769816489737e-05, + "loss": 2.2925, + "mean_token_accuracy": 0.42413792908191683, + "step": 24710 + }, + { + "epoch": 0.024893210493926017, + "grad_norm": 24.931839419742985, + "learning_rate": 2.4892734121628428e-05, + "loss": 2.8113, + "mean_token_accuracy": 0.37586206793785093, + "step": 24715 + }, + { + "epoch": 0.02489824654703019, + "grad_norm": 22.913966597139822, + "learning_rate": 2.4897770078359488e-05, + "loss": 2.8452, + "mean_token_accuracy": 0.4241379380226135, + "step": 24720 + }, + { + "epoch": 0.02490328260013436, + "grad_norm": 17.916058954501537, + "learning_rate": 2.4902806035090547e-05, + "loss": 2.5551, + "mean_token_accuracy": 0.33793103098869326, + "step": 24725 + }, + { + "epoch": 0.024908318653238535, + "grad_norm": 18.956282188232244, + "learning_rate": 2.4907841991821606e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.4047791838645935, + "step": 24730 + }, + { + "epoch": 0.02491335470634271, + "grad_norm": 22.721342847492643, + "learning_rate": 2.491287794855267e-05, + "loss": 2.49, + "mean_token_accuracy": 0.4379310369491577, + "step": 24735 + }, + { + "epoch": 0.02491839075944688, + "grad_norm": 17.05183906231256, + "learning_rate": 2.4917913905283728e-05, + "loss": 2.5987, + "mean_token_accuracy": 0.39310344457626345, + "step": 24740 + }, + { + "epoch": 0.024923426812551053, + "grad_norm": 189.33649920766237, + "learning_rate": 2.4922949862014787e-05, + "loss": 2.8688, + "mean_token_accuracy": 0.35862069129943847, + "step": 24745 + }, + { + "epoch": 0.024928462865655227, + "grad_norm": 17.581526624657652, + "learning_rate": 2.4927985818745846e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.4068965494632721, + "step": 24750 + }, + { + "epoch": 0.024933498918759397, + "grad_norm": 17.413732844641164, + "learning_rate": 2.4933021775476906e-05, + "loss": 2.7568, + "mean_token_accuracy": 0.34482758641242983, + "step": 24755 + }, + { + "epoch": 0.02493853497186357, + "grad_norm": 18.466674612906953, + "learning_rate": 2.493805773220797e-05, + "loss": 2.5837, + "mean_token_accuracy": 0.4310344815254211, + "step": 24760 + }, + { + "epoch": 0.024943571024967744, + "grad_norm": 16.87559628930719, + "learning_rate": 2.4943093688939028e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.4034482717514038, + "step": 24765 + }, + { + "epoch": 0.024948607078071918, + "grad_norm": 14.209737865021951, + "learning_rate": 2.4948129645670083e-05, + "loss": 2.3501, + "mean_token_accuracy": 0.4000000059604645, + "step": 24770 + }, + { + "epoch": 0.02495364313117609, + "grad_norm": 20.22143574985584, + "learning_rate": 2.4953165602401146e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.36551724970340727, + "step": 24775 + }, + { + "epoch": 0.024958679184280262, + "grad_norm": 18.602972689867542, + "learning_rate": 2.4958201559132205e-05, + "loss": 2.3165, + "mean_token_accuracy": 0.41379310488700866, + "step": 24780 + }, + { + "epoch": 0.024963715237384436, + "grad_norm": 17.35513427665744, + "learning_rate": 2.4963237515863265e-05, + "loss": 2.6037, + "mean_token_accuracy": 0.3827586233615875, + "step": 24785 + }, + { + "epoch": 0.024968751290488606, + "grad_norm": 21.82579321341961, + "learning_rate": 2.4968273472594324e-05, + "loss": 2.518, + "mean_token_accuracy": 0.3965517282485962, + "step": 24790 + }, + { + "epoch": 0.02497378734359278, + "grad_norm": 19.367511163002423, + "learning_rate": 2.4973309429325383e-05, + "loss": 2.7331, + "mean_token_accuracy": 0.38965516686439516, + "step": 24795 + }, + { + "epoch": 0.024978823396696954, + "grad_norm": 18.560369745773816, + "learning_rate": 2.4978345386056446e-05, + "loss": 2.9231, + "mean_token_accuracy": 0.3939503937959671, + "step": 24800 + }, + { + "epoch": 0.024983859449801128, + "grad_norm": 19.018720791785338, + "learning_rate": 2.4983381342787505e-05, + "loss": 2.737, + "mean_token_accuracy": 0.4137930989265442, + "step": 24805 + }, + { + "epoch": 0.024988895502905298, + "grad_norm": 16.156242104946447, + "learning_rate": 2.4988417299518564e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.4551724076271057, + "step": 24810 + }, + { + "epoch": 0.02499393155600947, + "grad_norm": 18.215359716320375, + "learning_rate": 2.4993453256249623e-05, + "loss": 2.4389, + "mean_token_accuracy": 0.41034482717514037, + "step": 24815 + }, + { + "epoch": 0.024998967609113645, + "grad_norm": 15.965827269457066, + "learning_rate": 2.4998489212980683e-05, + "loss": 2.1229, + "mean_token_accuracy": 0.5241379320621491, + "step": 24820 + }, + { + "epoch": 0.025004003662217816, + "grad_norm": 18.017658849569003, + "learning_rate": 2.5003525169711745e-05, + "loss": 2.3537, + "mean_token_accuracy": 0.44827587008476255, + "step": 24825 + }, + { + "epoch": 0.02500903971532199, + "grad_norm": 19.32321763099768, + "learning_rate": 2.5008561126442805e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.42068966031074523, + "step": 24830 + }, + { + "epoch": 0.025014075768426163, + "grad_norm": 16.79237613166486, + "learning_rate": 2.501359708317386e-05, + "loss": 1.9761, + "mean_token_accuracy": 0.4918935239315033, + "step": 24835 + }, + { + "epoch": 0.025019111821530337, + "grad_norm": 18.70889420607284, + "learning_rate": 2.501863303990492e-05, + "loss": 2.5177, + "mean_token_accuracy": 0.37586206793785093, + "step": 24840 + }, + { + "epoch": 0.025024147874634507, + "grad_norm": 19.736300992628863, + "learning_rate": 2.502366899663598e-05, + "loss": 2.64, + "mean_token_accuracy": 0.38620689511299133, + "step": 24845 + }, + { + "epoch": 0.02502918392773868, + "grad_norm": 25.939978522503115, + "learning_rate": 2.5028704953367045e-05, + "loss": 2.6035, + "mean_token_accuracy": 0.3965517282485962, + "step": 24850 + }, + { + "epoch": 0.025034219980842855, + "grad_norm": 16.35035068645743, + "learning_rate": 2.50337409100981e-05, + "loss": 2.733, + "mean_token_accuracy": 0.4034482777118683, + "step": 24855 + }, + { + "epoch": 0.025039256033947025, + "grad_norm": 20.1360113122945, + "learning_rate": 2.503877686682916e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.3655172407627106, + "step": 24860 + }, + { + "epoch": 0.0250442920870512, + "grad_norm": 14.832555202575852, + "learning_rate": 2.504381282356022e-05, + "loss": 2.7804, + "mean_token_accuracy": 0.36206896007061007, + "step": 24865 + }, + { + "epoch": 0.025049328140155373, + "grad_norm": 16.66674451613789, + "learning_rate": 2.504884878029128e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.4517241358757019, + "step": 24870 + }, + { + "epoch": 0.025054364193259546, + "grad_norm": 21.558419006097473, + "learning_rate": 2.505388473702234e-05, + "loss": 2.197, + "mean_token_accuracy": 0.49879008531570435, + "step": 24875 + }, + { + "epoch": 0.025059400246363717, + "grad_norm": 17.697669562281778, + "learning_rate": 2.50589206937534e-05, + "loss": 2.8036, + "mean_token_accuracy": 0.34137930870056155, + "step": 24880 + }, + { + "epoch": 0.02506443629946789, + "grad_norm": 19.8202204158267, + "learning_rate": 2.506395665048446e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.36896551251411436, + "step": 24885 + }, + { + "epoch": 0.025069472352572064, + "grad_norm": 18.872696023625345, + "learning_rate": 2.506899260721552e-05, + "loss": 2.2376, + "mean_token_accuracy": 0.441379314661026, + "step": 24890 + }, + { + "epoch": 0.025074508405676234, + "grad_norm": 14.623671374832881, + "learning_rate": 2.507402856394658e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.5, + "step": 24895 + }, + { + "epoch": 0.025079544458780408, + "grad_norm": 19.188064940481706, + "learning_rate": 2.507906452067764e-05, + "loss": 2.246, + "mean_token_accuracy": 0.4275861978530884, + "step": 24900 + }, + { + "epoch": 0.025084580511884582, + "grad_norm": 18.003936616059182, + "learning_rate": 2.50841004774087e-05, + "loss": 3.018, + "mean_token_accuracy": 0.3620689630508423, + "step": 24905 + }, + { + "epoch": 0.025089616564988756, + "grad_norm": 21.9859501299574, + "learning_rate": 2.508913643413976e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.4103448212146759, + "step": 24910 + }, + { + "epoch": 0.025094652618092926, + "grad_norm": 24.648330978277695, + "learning_rate": 2.509417239087082e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.41379311084747317, + "step": 24915 + }, + { + "epoch": 0.0250996886711971, + "grad_norm": 16.370652129392177, + "learning_rate": 2.5099208347601878e-05, + "loss": 2.2039, + "mean_token_accuracy": 0.48457351326942444, + "step": 24920 + }, + { + "epoch": 0.025104724724301274, + "grad_norm": 19.077952558668244, + "learning_rate": 2.5104244304332937e-05, + "loss": 2.4057, + "mean_token_accuracy": 0.43272837400436404, + "step": 24925 + }, + { + "epoch": 0.025109760777405444, + "grad_norm": 17.325037012733645, + "learning_rate": 2.5109280261064e-05, + "loss": 2.264, + "mean_token_accuracy": 0.46031457781791685, + "step": 24930 + }, + { + "epoch": 0.025114796830509618, + "grad_norm": 20.68964216092335, + "learning_rate": 2.511431621779506e-05, + "loss": 2.7405, + "mean_token_accuracy": 0.3931034505367279, + "step": 24935 + }, + { + "epoch": 0.02511983288361379, + "grad_norm": 20.040805741705526, + "learning_rate": 2.511935217452612e-05, + "loss": 2.8667, + "mean_token_accuracy": 0.4379310369491577, + "step": 24940 + }, + { + "epoch": 0.025124868936717965, + "grad_norm": 14.198128612907638, + "learning_rate": 2.5124388131257178e-05, + "loss": 2.0427, + "mean_token_accuracy": 0.47931033968925474, + "step": 24945 + }, + { + "epoch": 0.025129904989822135, + "grad_norm": 17.409894932930676, + "learning_rate": 2.5129424087988233e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.39655172228813174, + "step": 24950 + }, + { + "epoch": 0.02513494104292631, + "grad_norm": 19.0159386616897, + "learning_rate": 2.51344600447193e-05, + "loss": 2.6767, + "mean_token_accuracy": 0.3862068891525269, + "step": 24955 + }, + { + "epoch": 0.025139977096030483, + "grad_norm": 20.89381608612409, + "learning_rate": 2.513949600145036e-05, + "loss": 2.585, + "mean_token_accuracy": 0.394313371181488, + "step": 24960 + }, + { + "epoch": 0.025145013149134653, + "grad_norm": 16.92376902226771, + "learning_rate": 2.5144531958181418e-05, + "loss": 2.561, + "mean_token_accuracy": 0.40193587839603423, + "step": 24965 + }, + { + "epoch": 0.025150049202238827, + "grad_norm": 17.116078318407272, + "learning_rate": 2.5149567914912474e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.43303084969520567, + "step": 24970 + }, + { + "epoch": 0.025155085255343, + "grad_norm": 18.10420010094955, + "learning_rate": 2.5154603871643533e-05, + "loss": 2.2387, + "mean_token_accuracy": 0.41379310488700866, + "step": 24975 + }, + { + "epoch": 0.025160121308447175, + "grad_norm": 21.624387760201206, + "learning_rate": 2.51596398283746e-05, + "loss": 2.5693, + "mean_token_accuracy": 0.35862069129943847, + "step": 24980 + }, + { + "epoch": 0.025165157361551345, + "grad_norm": 18.333290732176884, + "learning_rate": 2.516467578510566e-05, + "loss": 2.3173, + "mean_token_accuracy": 0.46896551847457885, + "step": 24985 + }, + { + "epoch": 0.02517019341465552, + "grad_norm": 18.845008692154, + "learning_rate": 2.5169711741836714e-05, + "loss": 2.7614, + "mean_token_accuracy": 0.3931034505367279, + "step": 24990 + }, + { + "epoch": 0.025175229467759692, + "grad_norm": 17.255559981267496, + "learning_rate": 2.5174747698567774e-05, + "loss": 2.335, + "mean_token_accuracy": 0.4862069010734558, + "step": 24995 + }, + { + "epoch": 0.025180265520863863, + "grad_norm": 21.88476049416182, + "learning_rate": 2.5179783655298833e-05, + "loss": 2.8608, + "mean_token_accuracy": 0.358620685338974, + "step": 25000 + }, + { + "epoch": 0.025185301573968037, + "grad_norm": 17.96187983720199, + "learning_rate": 2.51848196120299e-05, + "loss": 2.5135, + "mean_token_accuracy": 0.39655172228813174, + "step": 25005 + }, + { + "epoch": 0.02519033762707221, + "grad_norm": 24.270993844206423, + "learning_rate": 2.5189855568760955e-05, + "loss": 2.5444, + "mean_token_accuracy": 0.3827586114406586, + "step": 25010 + }, + { + "epoch": 0.025195373680176384, + "grad_norm": 15.833730229194975, + "learning_rate": 2.5194891525492014e-05, + "loss": 2.5186, + "mean_token_accuracy": 0.3793103456497192, + "step": 25015 + }, + { + "epoch": 0.025200409733280554, + "grad_norm": 16.14537585954356, + "learning_rate": 2.5199927482223073e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.3931034505367279, + "step": 25020 + }, + { + "epoch": 0.025205445786384728, + "grad_norm": 19.35379596063752, + "learning_rate": 2.5204963438954132e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.42758620381355283, + "step": 25025 + }, + { + "epoch": 0.025210481839488902, + "grad_norm": 20.81768854210628, + "learning_rate": 2.5209999395685192e-05, + "loss": 2.5129, + "mean_token_accuracy": 0.4034482717514038, + "step": 25030 + }, + { + "epoch": 0.025215517892593072, + "grad_norm": 22.46113023506108, + "learning_rate": 2.5215035352416254e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.41034482717514037, + "step": 25035 + }, + { + "epoch": 0.025220553945697246, + "grad_norm": 18.54863069543931, + "learning_rate": 2.5220071309147314e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.42758620381355283, + "step": 25040 + }, + { + "epoch": 0.02522558999880142, + "grad_norm": 19.66333815922003, + "learning_rate": 2.5225107265878373e-05, + "loss": 2.3509, + "mean_token_accuracy": 0.4448275864124298, + "step": 25045 + }, + { + "epoch": 0.025230626051905593, + "grad_norm": 17.72482811832775, + "learning_rate": 2.5230143222609432e-05, + "loss": 2.1283, + "mean_token_accuracy": 0.5128856599330902, + "step": 25050 + }, + { + "epoch": 0.025235662105009764, + "grad_norm": 17.272617012682325, + "learning_rate": 2.523517917934049e-05, + "loss": 2.5181, + "mean_token_accuracy": 0.45517241954803467, + "step": 25055 + }, + { + "epoch": 0.025240698158113938, + "grad_norm": 19.82932438424549, + "learning_rate": 2.5240215136071554e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.44137930274009707, + "step": 25060 + }, + { + "epoch": 0.02524573421121811, + "grad_norm": 22.77298130370189, + "learning_rate": 2.5245251092802613e-05, + "loss": 2.4903, + "mean_token_accuracy": 0.3931034505367279, + "step": 25065 + }, + { + "epoch": 0.02525077026432228, + "grad_norm": 21.62257489094834, + "learning_rate": 2.5250287049533672e-05, + "loss": 2.689, + "mean_token_accuracy": 0.3793103456497192, + "step": 25070 + }, + { + "epoch": 0.025255806317426455, + "grad_norm": 13.863664567921546, + "learning_rate": 2.5255323006264732e-05, + "loss": 2.403, + "mean_token_accuracy": 0.4310344815254211, + "step": 25075 + }, + { + "epoch": 0.02526084237053063, + "grad_norm": 22.331022361436613, + "learning_rate": 2.5260358962995788e-05, + "loss": 2.525, + "mean_token_accuracy": 0.3840290427207947, + "step": 25080 + }, + { + "epoch": 0.025265878423634803, + "grad_norm": 16.49316377332385, + "learning_rate": 2.5265394919726854e-05, + "loss": 2.2914, + "mean_token_accuracy": 0.4586206912994385, + "step": 25085 + }, + { + "epoch": 0.025270914476738973, + "grad_norm": 18.97188090009966, + "learning_rate": 2.5270430876457913e-05, + "loss": 2.4009, + "mean_token_accuracy": 0.42413793206214906, + "step": 25090 + }, + { + "epoch": 0.025275950529843147, + "grad_norm": 15.620107160335737, + "learning_rate": 2.5275466833188972e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.3965517282485962, + "step": 25095 + }, + { + "epoch": 0.02528098658294732, + "grad_norm": 16.922721278950874, + "learning_rate": 2.528050278992003e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.4413793087005615, + "step": 25100 + }, + { + "epoch": 0.02528602263605149, + "grad_norm": 20.00535561966354, + "learning_rate": 2.5285538746651087e-05, + "loss": 2.7847, + "mean_token_accuracy": 0.37241379022598264, + "step": 25105 + }, + { + "epoch": 0.025291058689155665, + "grad_norm": 17.215061285637095, + "learning_rate": 2.5290574703382147e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.4172413766384125, + "step": 25110 + }, + { + "epoch": 0.02529609474225984, + "grad_norm": 14.513372330474597, + "learning_rate": 2.5295610660113213e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.4724137902259827, + "step": 25115 + }, + { + "epoch": 0.025301130795364012, + "grad_norm": 20.863186511496853, + "learning_rate": 2.5300646616844272e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.37586206793785093, + "step": 25120 + }, + { + "epoch": 0.025306166848468183, + "grad_norm": 21.46342969419995, + "learning_rate": 2.5305682573575328e-05, + "loss": 2.4036, + "mean_token_accuracy": 0.38620689511299133, + "step": 25125 + }, + { + "epoch": 0.025311202901572356, + "grad_norm": 26.00649860401789, + "learning_rate": 2.5310718530306387e-05, + "loss": 2.4331, + "mean_token_accuracy": 0.3965517163276672, + "step": 25130 + }, + { + "epoch": 0.02531623895467653, + "grad_norm": 34.40026324279338, + "learning_rate": 2.5315754487037446e-05, + "loss": 2.8299, + "mean_token_accuracy": 0.38965516686439516, + "step": 25135 + }, + { + "epoch": 0.0253212750077807, + "grad_norm": 18.20071192619595, + "learning_rate": 2.5320790443768512e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.441379314661026, + "step": 25140 + }, + { + "epoch": 0.025326311060884874, + "grad_norm": 20.190765148183047, + "learning_rate": 2.5325826400499568e-05, + "loss": 2.7907, + "mean_token_accuracy": 0.358620685338974, + "step": 25145 + }, + { + "epoch": 0.025331347113989048, + "grad_norm": 14.735776789645938, + "learning_rate": 2.5330862357230627e-05, + "loss": 2.7531, + "mean_token_accuracy": 0.3724137932062149, + "step": 25150 + }, + { + "epoch": 0.02533638316709322, + "grad_norm": 18.06522623033623, + "learning_rate": 2.5335898313961687e-05, + "loss": 2.4597, + "mean_token_accuracy": 0.4413793087005615, + "step": 25155 + }, + { + "epoch": 0.025341419220197392, + "grad_norm": 21.628463505813496, + "learning_rate": 2.5340934270692746e-05, + "loss": 2.399, + "mean_token_accuracy": 0.45517241954803467, + "step": 25160 + }, + { + "epoch": 0.025346455273301566, + "grad_norm": 19.23205904667441, + "learning_rate": 2.534597022742381e-05, + "loss": 2.864, + "mean_token_accuracy": 0.37241379618644715, + "step": 25165 + }, + { + "epoch": 0.02535149132640574, + "grad_norm": 30.278630424450583, + "learning_rate": 2.5351006184154868e-05, + "loss": 2.422, + "mean_token_accuracy": 0.4344827592372894, + "step": 25170 + }, + { + "epoch": 0.02535652737950991, + "grad_norm": 17.83850074331239, + "learning_rate": 2.5356042140885927e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.4517241358757019, + "step": 25175 + }, + { + "epoch": 0.025361563432614084, + "grad_norm": 18.48080690979553, + "learning_rate": 2.5361078097616986e-05, + "loss": 2.6698, + "mean_token_accuracy": 0.46249244809150697, + "step": 25180 + }, + { + "epoch": 0.025366599485718257, + "grad_norm": 32.049637530233355, + "learning_rate": 2.5366114054348045e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.4448275864124298, + "step": 25185 + }, + { + "epoch": 0.02537163553882243, + "grad_norm": 15.69236784579381, + "learning_rate": 2.5371150011079105e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.45517240166664125, + "step": 25190 + }, + { + "epoch": 0.0253766715919266, + "grad_norm": 19.148726080851517, + "learning_rate": 2.5376185967810167e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.4258318156003952, + "step": 25195 + }, + { + "epoch": 0.025381707645030775, + "grad_norm": 21.03997649284189, + "learning_rate": 2.5381221924541227e-05, + "loss": 2.759, + "mean_token_accuracy": 0.36896551549434664, + "step": 25200 + }, + { + "epoch": 0.02538674369813495, + "grad_norm": 20.317732195565593, + "learning_rate": 2.5386257881272286e-05, + "loss": 2.8817, + "mean_token_accuracy": 0.43611615896224976, + "step": 25205 + }, + { + "epoch": 0.02539177975123912, + "grad_norm": 106.29538120599565, + "learning_rate": 2.5391293838003345e-05, + "loss": 2.8172, + "mean_token_accuracy": 0.4000000059604645, + "step": 25210 + }, + { + "epoch": 0.025396815804343293, + "grad_norm": 15.947181681645722, + "learning_rate": 2.53963297947344e-05, + "loss": 2.6476, + "mean_token_accuracy": 0.3896551728248596, + "step": 25215 + }, + { + "epoch": 0.025401851857447467, + "grad_norm": 17.532040045031415, + "learning_rate": 2.5401365751465467e-05, + "loss": 2.7778, + "mean_token_accuracy": 0.4068965494632721, + "step": 25220 + }, + { + "epoch": 0.02540688791055164, + "grad_norm": 19.5977381701148, + "learning_rate": 2.5406401708196526e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.3689655244350433, + "step": 25225 + }, + { + "epoch": 0.02541192396365581, + "grad_norm": 18.143791265038214, + "learning_rate": 2.5411437664927586e-05, + "loss": 2.4555, + "mean_token_accuracy": 0.37864487767219546, + "step": 25230 + }, + { + "epoch": 0.025416960016759985, + "grad_norm": 17.002685910828774, + "learning_rate": 2.541647362165864e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.414039409160614, + "step": 25235 + }, + { + "epoch": 0.02542199606986416, + "grad_norm": 13.578408900533748, + "learning_rate": 2.54215095783897e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.42068964838981626, + "step": 25240 + }, + { + "epoch": 0.02542703212296833, + "grad_norm": 19.86415541795426, + "learning_rate": 2.5426545535120767e-05, + "loss": 2.1122, + "mean_token_accuracy": 0.4586206912994385, + "step": 25245 + }, + { + "epoch": 0.025432068176072502, + "grad_norm": 23.289069648920503, + "learning_rate": 2.5431581491851826e-05, + "loss": 2.7462, + "mean_token_accuracy": 0.3862068891525269, + "step": 25250 + }, + { + "epoch": 0.025437104229176676, + "grad_norm": 15.311422446034133, + "learning_rate": 2.5436617448582882e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.3655172407627106, + "step": 25255 + }, + { + "epoch": 0.02544214028228085, + "grad_norm": 16.608153866502843, + "learning_rate": 2.544165340531394e-05, + "loss": 2.6996, + "mean_token_accuracy": 0.3758620619773865, + "step": 25260 + }, + { + "epoch": 0.02544717633538502, + "grad_norm": 19.18888651941495, + "learning_rate": 2.5446689362045e-05, + "loss": 2.6177, + "mean_token_accuracy": 0.4034482777118683, + "step": 25265 + }, + { + "epoch": 0.025452212388489194, + "grad_norm": 15.33509087846629, + "learning_rate": 2.545172531877606e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.40689654350280763, + "step": 25270 + }, + { + "epoch": 0.025457248441593368, + "grad_norm": 17.36760810208012, + "learning_rate": 2.5456761275507122e-05, + "loss": 2.4885, + "mean_token_accuracy": 0.417241370677948, + "step": 25275 + }, + { + "epoch": 0.025462284494697538, + "grad_norm": 23.693052056183188, + "learning_rate": 2.546179723223818e-05, + "loss": 2.6248, + "mean_token_accuracy": 0.4344827562570572, + "step": 25280 + }, + { + "epoch": 0.025467320547801712, + "grad_norm": 19.860335632710257, + "learning_rate": 2.546683318896924e-05, + "loss": 2.7004, + "mean_token_accuracy": 0.3655172407627106, + "step": 25285 + }, + { + "epoch": 0.025472356600905886, + "grad_norm": 16.646581272070463, + "learning_rate": 2.54718691457003e-05, + "loss": 2.2726, + "mean_token_accuracy": 0.4482758641242981, + "step": 25290 + }, + { + "epoch": 0.02547739265401006, + "grad_norm": 18.260723463955063, + "learning_rate": 2.547690510243136e-05, + "loss": 2.6047, + "mean_token_accuracy": 0.3862069010734558, + "step": 25295 + }, + { + "epoch": 0.02548242870711423, + "grad_norm": 21.000602471070593, + "learning_rate": 2.5481941059162422e-05, + "loss": 2.9808, + "mean_token_accuracy": 0.3517241358757019, + "step": 25300 + }, + { + "epoch": 0.025487464760218403, + "grad_norm": 17.401585788863354, + "learning_rate": 2.548697701589348e-05, + "loss": 2.3299, + "mean_token_accuracy": 0.4551724135875702, + "step": 25305 + }, + { + "epoch": 0.025492500813322577, + "grad_norm": 18.689687345515637, + "learning_rate": 2.549201297262454e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.39310343861579894, + "step": 25310 + }, + { + "epoch": 0.025497536866426748, + "grad_norm": 14.606915998653385, + "learning_rate": 2.54970489293556e-05, + "loss": 2.5731, + "mean_token_accuracy": 0.37241379618644715, + "step": 25315 + }, + { + "epoch": 0.02550257291953092, + "grad_norm": 17.013073598271678, + "learning_rate": 2.550208488608666e-05, + "loss": 2.259, + "mean_token_accuracy": 0.45722927451133727, + "step": 25320 + }, + { + "epoch": 0.025507608972635095, + "grad_norm": 17.38080225885054, + "learning_rate": 2.550712084281772e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.4068965554237366, + "step": 25325 + }, + { + "epoch": 0.02551264502573927, + "grad_norm": 17.978972083770167, + "learning_rate": 2.551215679954878e-05, + "loss": 2.4513, + "mean_token_accuracy": 0.38620689511299133, + "step": 25330 + }, + { + "epoch": 0.02551768107884344, + "grad_norm": 19.774094161197233, + "learning_rate": 2.551719275627984e-05, + "loss": 2.9378, + "mean_token_accuracy": 0.3862069010734558, + "step": 25335 + }, + { + "epoch": 0.025522717131947613, + "grad_norm": 21.714034481302644, + "learning_rate": 2.55222287130109e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.4117362380027771, + "step": 25340 + }, + { + "epoch": 0.025527753185051787, + "grad_norm": 23.22243259120189, + "learning_rate": 2.552726466974196e-05, + "loss": 2.5927, + "mean_token_accuracy": 0.4241379201412201, + "step": 25345 + }, + { + "epoch": 0.025532789238155957, + "grad_norm": 14.92782542071349, + "learning_rate": 2.5532300626473014e-05, + "loss": 2.5713, + "mean_token_accuracy": 0.4034482717514038, + "step": 25350 + }, + { + "epoch": 0.02553782529126013, + "grad_norm": 17.493311110377483, + "learning_rate": 2.553733658320408e-05, + "loss": 2.7252, + "mean_token_accuracy": 0.3793103337287903, + "step": 25355 + }, + { + "epoch": 0.025542861344364304, + "grad_norm": 34.95881759924783, + "learning_rate": 2.554237253993514e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.3620689630508423, + "step": 25360 + }, + { + "epoch": 0.025547897397468478, + "grad_norm": 16.23018990945946, + "learning_rate": 2.55474084966662e-05, + "loss": 2.6707, + "mean_token_accuracy": 0.38965516686439516, + "step": 25365 + }, + { + "epoch": 0.02555293345057265, + "grad_norm": 16.40583605429416, + "learning_rate": 2.5552444453397255e-05, + "loss": 2.7782, + "mean_token_accuracy": 0.3689655065536499, + "step": 25370 + }, + { + "epoch": 0.025557969503676822, + "grad_norm": 15.840904171214309, + "learning_rate": 2.5557480410128314e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.4676346004009247, + "step": 25375 + }, + { + "epoch": 0.025563005556780996, + "grad_norm": 19.312794213749445, + "learning_rate": 2.556251636685938e-05, + "loss": 2.2131, + "mean_token_accuracy": 0.4206896543502808, + "step": 25380 + }, + { + "epoch": 0.025568041609885166, + "grad_norm": 17.948052173318416, + "learning_rate": 2.556755232359044e-05, + "loss": 2.3424, + "mean_token_accuracy": 0.4620689570903778, + "step": 25385 + }, + { + "epoch": 0.02557307766298934, + "grad_norm": 18.769787843605116, + "learning_rate": 2.5572588280321495e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.4551724076271057, + "step": 25390 + }, + { + "epoch": 0.025578113716093514, + "grad_norm": 23.47584088596801, + "learning_rate": 2.5577624237052554e-05, + "loss": 2.7814, + "mean_token_accuracy": 0.3846340000629425, + "step": 25395 + }, + { + "epoch": 0.025583149769197688, + "grad_norm": 19.429535941346668, + "learning_rate": 2.5582660193783614e-05, + "loss": 2.6939, + "mean_token_accuracy": 0.3896551728248596, + "step": 25400 + }, + { + "epoch": 0.025588185822301858, + "grad_norm": 18.36807691020928, + "learning_rate": 2.558769615051468e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4517241299152374, + "step": 25405 + }, + { + "epoch": 0.02559322187540603, + "grad_norm": 19.29360840729329, + "learning_rate": 2.5592732107245736e-05, + "loss": 2.9643, + "mean_token_accuracy": 0.3482758581638336, + "step": 25410 + }, + { + "epoch": 0.025598257928510205, + "grad_norm": 18.679836721644953, + "learning_rate": 2.5597768063976795e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.39655172228813174, + "step": 25415 + }, + { + "epoch": 0.025603293981614376, + "grad_norm": 23.384422070075516, + "learning_rate": 2.5602804020707854e-05, + "loss": 2.7445, + "mean_token_accuracy": 0.4344827473163605, + "step": 25420 + }, + { + "epoch": 0.02560833003471855, + "grad_norm": 19.740392865604928, + "learning_rate": 2.5607839977438913e-05, + "loss": 2.6483, + "mean_token_accuracy": 0.3896551728248596, + "step": 25425 + }, + { + "epoch": 0.025613366087822723, + "grad_norm": 18.149656931395914, + "learning_rate": 2.5612875934169976e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.38620689511299133, + "step": 25430 + }, + { + "epoch": 0.025618402140926897, + "grad_norm": 27.272597762683887, + "learning_rate": 2.5617911890901035e-05, + "loss": 2.333, + "mean_token_accuracy": 0.46896552443504336, + "step": 25435 + }, + { + "epoch": 0.025623438194031067, + "grad_norm": 16.949556414515875, + "learning_rate": 2.5622947847632094e-05, + "loss": 2.2316, + "mean_token_accuracy": 0.41724138259887694, + "step": 25440 + }, + { + "epoch": 0.02562847424713524, + "grad_norm": 20.52569275798412, + "learning_rate": 2.5627983804363154e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.41530550718307496, + "step": 25445 + }, + { + "epoch": 0.025633510300239415, + "grad_norm": 24.73756581169964, + "learning_rate": 2.5633019761094213e-05, + "loss": 2.3707, + "mean_token_accuracy": 0.4620689690113068, + "step": 25450 + }, + { + "epoch": 0.025638546353343585, + "grad_norm": 18.105321636581134, + "learning_rate": 2.5638055717825272e-05, + "loss": 2.6415, + "mean_token_accuracy": 0.4103448331356049, + "step": 25455 + }, + { + "epoch": 0.02564358240644776, + "grad_norm": 18.883994358153522, + "learning_rate": 2.5643091674556335e-05, + "loss": 2.5541, + "mean_token_accuracy": 0.4068965494632721, + "step": 25460 + }, + { + "epoch": 0.025648618459551933, + "grad_norm": 14.686502510630897, + "learning_rate": 2.5648127631287394e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.43103447556495667, + "step": 25465 + }, + { + "epoch": 0.025653654512656106, + "grad_norm": 20.914455703014923, + "learning_rate": 2.5653163588018453e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.43266788125038147, + "step": 25470 + }, + { + "epoch": 0.025658690565760277, + "grad_norm": 20.138971409900044, + "learning_rate": 2.5658199544749513e-05, + "loss": 2.8157, + "mean_token_accuracy": 0.33793103098869326, + "step": 25475 + }, + { + "epoch": 0.02566372661886445, + "grad_norm": 16.82444156946405, + "learning_rate": 2.5663235501480572e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.4103448212146759, + "step": 25480 + }, + { + "epoch": 0.025668762671968624, + "grad_norm": 18.443627836378443, + "learning_rate": 2.5668271458211635e-05, + "loss": 2.6333, + "mean_token_accuracy": 0.4310344815254211, + "step": 25485 + }, + { + "epoch": 0.025673798725072795, + "grad_norm": 20.709829752009043, + "learning_rate": 2.5673307414942694e-05, + "loss": 2.6359, + "mean_token_accuracy": 0.4344827651977539, + "step": 25490 + }, + { + "epoch": 0.02567883477817697, + "grad_norm": 18.130729576689276, + "learning_rate": 2.5678343371673753e-05, + "loss": 2.3244, + "mean_token_accuracy": 0.41034482717514037, + "step": 25495 + }, + { + "epoch": 0.025683870831281142, + "grad_norm": 16.230731785149043, + "learning_rate": 2.5683379328404812e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.4310344815254211, + "step": 25500 + }, + { + "epoch": 0.025688906884385316, + "grad_norm": 18.20539320947682, + "learning_rate": 2.5688415285135868e-05, + "loss": 2.5169, + "mean_token_accuracy": 0.3862068891525269, + "step": 25505 + }, + { + "epoch": 0.025693942937489486, + "grad_norm": 16.69858668203235, + "learning_rate": 2.5693451241866934e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.39310344457626345, + "step": 25510 + }, + { + "epoch": 0.02569897899059366, + "grad_norm": 18.779806064249932, + "learning_rate": 2.5698487198597993e-05, + "loss": 2.3395, + "mean_token_accuracy": 0.42068964838981626, + "step": 25515 + }, + { + "epoch": 0.025704015043697834, + "grad_norm": 16.09783371069836, + "learning_rate": 2.5703523155329053e-05, + "loss": 2.465, + "mean_token_accuracy": 0.4344827592372894, + "step": 25520 + }, + { + "epoch": 0.025709051096802004, + "grad_norm": 16.56870002643387, + "learning_rate": 2.570855911206011e-05, + "loss": 2.0729, + "mean_token_accuracy": 0.4918330192565918, + "step": 25525 + }, + { + "epoch": 0.025714087149906178, + "grad_norm": 16.60627444423652, + "learning_rate": 2.5713595068791168e-05, + "loss": 2.2981, + "mean_token_accuracy": 0.4482758641242981, + "step": 25530 + }, + { + "epoch": 0.02571912320301035, + "grad_norm": 19.130053709863038, + "learning_rate": 2.5718631025522227e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.4034482777118683, + "step": 25535 + }, + { + "epoch": 0.025724159256114525, + "grad_norm": 21.129642855953062, + "learning_rate": 2.5723666982253293e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.47931033968925474, + "step": 25540 + }, + { + "epoch": 0.025729195309218696, + "grad_norm": 16.112794774482463, + "learning_rate": 2.572870293898435e-05, + "loss": 2.785, + "mean_token_accuracy": 0.37931033968925476, + "step": 25545 + }, + { + "epoch": 0.02573423136232287, + "grad_norm": 23.99444515186133, + "learning_rate": 2.5733738895715408e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.3999999940395355, + "step": 25550 + }, + { + "epoch": 0.025739267415427043, + "grad_norm": 23.497804700842202, + "learning_rate": 2.5738774852446467e-05, + "loss": 2.7774, + "mean_token_accuracy": 0.42413793206214906, + "step": 25555 + }, + { + "epoch": 0.025744303468531213, + "grad_norm": 16.100507873258035, + "learning_rate": 2.5743810809177527e-05, + "loss": 2.2967, + "mean_token_accuracy": 0.4586206912994385, + "step": 25560 + }, + { + "epoch": 0.025749339521635387, + "grad_norm": 17.301694710784243, + "learning_rate": 2.574884676590859e-05, + "loss": 2.5299, + "mean_token_accuracy": 0.4206896543502808, + "step": 25565 + }, + { + "epoch": 0.02575437557473956, + "grad_norm": 21.98533833404907, + "learning_rate": 2.575388272263965e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.43103447556495667, + "step": 25570 + }, + { + "epoch": 0.025759411627843735, + "grad_norm": 17.810299201605687, + "learning_rate": 2.5758918679370708e-05, + "loss": 2.3648, + "mean_token_accuracy": 0.4379310250282288, + "step": 25575 + }, + { + "epoch": 0.025764447680947905, + "grad_norm": 16.090632424096817, + "learning_rate": 2.5763954636101767e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.4073891669511795, + "step": 25580 + }, + { + "epoch": 0.02576948373405208, + "grad_norm": 14.58884491277401, + "learning_rate": 2.5768990592832826e-05, + "loss": 2.3695, + "mean_token_accuracy": 0.4551724135875702, + "step": 25585 + }, + { + "epoch": 0.025774519787156253, + "grad_norm": 19.943803833642274, + "learning_rate": 2.577402654956389e-05, + "loss": 2.6594, + "mean_token_accuracy": 0.4206896543502808, + "step": 25590 + }, + { + "epoch": 0.025779555840260423, + "grad_norm": 14.95317666272273, + "learning_rate": 2.5779062506294948e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.41724138259887694, + "step": 25595 + }, + { + "epoch": 0.025784591893364597, + "grad_norm": 17.2318539130284, + "learning_rate": 2.5784098463026008e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.4241379201412201, + "step": 25600 + }, + { + "epoch": 0.02578962794646877, + "grad_norm": 16.951517867367336, + "learning_rate": 2.5789134419757067e-05, + "loss": 2.6145, + "mean_token_accuracy": 0.4068965554237366, + "step": 25605 + }, + { + "epoch": 0.025794663999572944, + "grad_norm": 17.988879447439896, + "learning_rate": 2.5794170376488126e-05, + "loss": 2.6668, + "mean_token_accuracy": 0.34827586114406583, + "step": 25610 + }, + { + "epoch": 0.025799700052677114, + "grad_norm": 19.43605848909659, + "learning_rate": 2.5799206333219182e-05, + "loss": 2.7925, + "mean_token_accuracy": 0.3551724135875702, + "step": 25615 + }, + { + "epoch": 0.025804736105781288, + "grad_norm": 17.25922898543315, + "learning_rate": 2.5804242289950248e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.43793103098869324, + "step": 25620 + }, + { + "epoch": 0.025809772158885462, + "grad_norm": 17.429022745407767, + "learning_rate": 2.5809278246681307e-05, + "loss": 2.3654, + "mean_token_accuracy": 0.42068964838981626, + "step": 25625 + }, + { + "epoch": 0.025814808211989632, + "grad_norm": 19.49157222662756, + "learning_rate": 2.5814314203412366e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.43103447556495667, + "step": 25630 + }, + { + "epoch": 0.025819844265093806, + "grad_norm": 16.19556525097804, + "learning_rate": 2.5819350160143426e-05, + "loss": 2.4594, + "mean_token_accuracy": 0.4655172437429428, + "step": 25635 + }, + { + "epoch": 0.02582488031819798, + "grad_norm": 15.068054900800872, + "learning_rate": 2.582438611687448e-05, + "loss": 2.6226, + "mean_token_accuracy": 0.41724138259887694, + "step": 25640 + }, + { + "epoch": 0.025829916371302154, + "grad_norm": 15.870176236691693, + "learning_rate": 2.5829422073605548e-05, + "loss": 2.7616, + "mean_token_accuracy": 0.39716748595237733, + "step": 25645 + }, + { + "epoch": 0.025834952424406324, + "grad_norm": 15.994175301859162, + "learning_rate": 2.5834458030336607e-05, + "loss": 2.8843, + "mean_token_accuracy": 0.3862068891525269, + "step": 25650 + }, + { + "epoch": 0.025839988477510498, + "grad_norm": 19.618323688037695, + "learning_rate": 2.5839493987067666e-05, + "loss": 2.5504, + "mean_token_accuracy": 0.3482758581638336, + "step": 25655 + }, + { + "epoch": 0.02584502453061467, + "grad_norm": 19.23104692090036, + "learning_rate": 2.5844529943798722e-05, + "loss": 2.2857, + "mean_token_accuracy": 0.4620689630508423, + "step": 25660 + }, + { + "epoch": 0.02585006058371884, + "grad_norm": 21.195469283711727, + "learning_rate": 2.584956590052978e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.41724138259887694, + "step": 25665 + }, + { + "epoch": 0.025855096636823015, + "grad_norm": 24.52536002967969, + "learning_rate": 2.5854601857260847e-05, + "loss": 2.475, + "mean_token_accuracy": 0.4172413766384125, + "step": 25670 + }, + { + "epoch": 0.02586013268992719, + "grad_norm": 22.866080866976052, + "learning_rate": 2.5859637813991906e-05, + "loss": 2.7621, + "mean_token_accuracy": 0.38275861740112305, + "step": 25675 + }, + { + "epoch": 0.025865168743031363, + "grad_norm": 17.68343265998821, + "learning_rate": 2.5864673770722962e-05, + "loss": 2.8953, + "mean_token_accuracy": 0.38620689511299133, + "step": 25680 + }, + { + "epoch": 0.025870204796135533, + "grad_norm": 15.32400383488988, + "learning_rate": 2.586970972745402e-05, + "loss": 2.1915, + "mean_token_accuracy": 0.4482758641242981, + "step": 25685 + }, + { + "epoch": 0.025875240849239707, + "grad_norm": 16.87317473751135, + "learning_rate": 2.587474568418508e-05, + "loss": 2.3227, + "mean_token_accuracy": 0.40689654350280763, + "step": 25690 + }, + { + "epoch": 0.02588027690234388, + "grad_norm": 19.859119583445707, + "learning_rate": 2.587978164091614e-05, + "loss": 2.693, + "mean_token_accuracy": 0.36206896901130675, + "step": 25695 + }, + { + "epoch": 0.02588531295544805, + "grad_norm": 135.1677720117011, + "learning_rate": 2.5884817597647203e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.39655172228813174, + "step": 25700 + }, + { + "epoch": 0.025890349008552225, + "grad_norm": 22.023851491174877, + "learning_rate": 2.5889853554378262e-05, + "loss": 2.3791, + "mean_token_accuracy": 0.3965517282485962, + "step": 25705 + }, + { + "epoch": 0.0258953850616564, + "grad_norm": 15.910325342440771, + "learning_rate": 2.589488951110932e-05, + "loss": 2.7685, + "mean_token_accuracy": 0.3896551728248596, + "step": 25710 + }, + { + "epoch": 0.025900421114760572, + "grad_norm": 25.30780956932258, + "learning_rate": 2.589992546784038e-05, + "loss": 2.4237, + "mean_token_accuracy": 0.39655172228813174, + "step": 25715 + }, + { + "epoch": 0.025905457167864743, + "grad_norm": 16.815822799509856, + "learning_rate": 2.590496142457144e-05, + "loss": 2.5661, + "mean_token_accuracy": 0.3931034505367279, + "step": 25720 + }, + { + "epoch": 0.025910493220968916, + "grad_norm": 16.52923888349581, + "learning_rate": 2.5909997381302502e-05, + "loss": 2.3364, + "mean_token_accuracy": 0.3965517282485962, + "step": 25725 + }, + { + "epoch": 0.02591552927407309, + "grad_norm": 17.715842359817223, + "learning_rate": 2.591503333803356e-05, + "loss": 2.3835, + "mean_token_accuracy": 0.4206896543502808, + "step": 25730 + }, + { + "epoch": 0.02592056532717726, + "grad_norm": 21.018188372487078, + "learning_rate": 2.592006929476462e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.44301270246505736, + "step": 25735 + }, + { + "epoch": 0.025925601380281434, + "grad_norm": 18.27056015979054, + "learning_rate": 2.592510525149568e-05, + "loss": 2.6487, + "mean_token_accuracy": 0.37931033968925476, + "step": 25740 + }, + { + "epoch": 0.025930637433385608, + "grad_norm": 18.017348857205633, + "learning_rate": 2.593014120822674e-05, + "loss": 1.9818, + "mean_token_accuracy": 0.4724137783050537, + "step": 25745 + }, + { + "epoch": 0.025935673486489782, + "grad_norm": 18.74564105094958, + "learning_rate": 2.5935177164957802e-05, + "loss": 2.6819, + "mean_token_accuracy": 0.38965516090393065, + "step": 25750 + }, + { + "epoch": 0.025940709539593952, + "grad_norm": 18.219165566994345, + "learning_rate": 2.594021312168886e-05, + "loss": 2.2987, + "mean_token_accuracy": 0.41034482717514037, + "step": 25755 + }, + { + "epoch": 0.025945745592698126, + "grad_norm": 17.506243977813853, + "learning_rate": 2.594524907841992e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.44137930274009707, + "step": 25760 + }, + { + "epoch": 0.0259507816458023, + "grad_norm": 28.640369531836274, + "learning_rate": 2.595028503515098e-05, + "loss": 2.658, + "mean_token_accuracy": 0.42758620381355283, + "step": 25765 + }, + { + "epoch": 0.02595581769890647, + "grad_norm": 17.766867069825732, + "learning_rate": 2.5955320991882036e-05, + "loss": 2.8072, + "mean_token_accuracy": 0.39310344457626345, + "step": 25770 + }, + { + "epoch": 0.025960853752010644, + "grad_norm": 18.5640748319897, + "learning_rate": 2.5960356948613102e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.42758620381355283, + "step": 25775 + }, + { + "epoch": 0.025965889805114818, + "grad_norm": 21.973774046866726, + "learning_rate": 2.596539290534416e-05, + "loss": 2.0544, + "mean_token_accuracy": 0.47586206197738645, + "step": 25780 + }, + { + "epoch": 0.02597092585821899, + "grad_norm": 22.37003745862752, + "learning_rate": 2.597042886207522e-05, + "loss": 2.7339, + "mean_token_accuracy": 0.38965516686439516, + "step": 25785 + }, + { + "epoch": 0.02597596191132316, + "grad_norm": 17.99471368359866, + "learning_rate": 2.5975464818806276e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.4256503224372864, + "step": 25790 + }, + { + "epoch": 0.025980997964427335, + "grad_norm": 20.82192750743505, + "learning_rate": 2.5980500775537335e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4413793087005615, + "step": 25795 + }, + { + "epoch": 0.02598603401753151, + "grad_norm": 16.021568612097294, + "learning_rate": 2.5985536732268395e-05, + "loss": 3.0416, + "mean_token_accuracy": 0.3413793116807938, + "step": 25800 + }, + { + "epoch": 0.02599107007063568, + "grad_norm": 15.812387949790491, + "learning_rate": 2.599057268899946e-05, + "loss": 2.2043, + "mean_token_accuracy": 0.4344827592372894, + "step": 25805 + }, + { + "epoch": 0.025996106123739853, + "grad_norm": 22.98439497317223, + "learning_rate": 2.5995608645730516e-05, + "loss": 2.8114, + "mean_token_accuracy": 0.3793103456497192, + "step": 25810 + }, + { + "epoch": 0.026001142176844027, + "grad_norm": 25.495545544050255, + "learning_rate": 2.6000644602461576e-05, + "loss": 2.7643, + "mean_token_accuracy": 0.40816696882247927, + "step": 25815 + }, + { + "epoch": 0.0260061782299482, + "grad_norm": 23.902481811966652, + "learning_rate": 2.6005680559192635e-05, + "loss": 2.5083, + "mean_token_accuracy": 0.44137930274009707, + "step": 25820 + }, + { + "epoch": 0.02601121428305237, + "grad_norm": 22.41158258452064, + "learning_rate": 2.6010716515923694e-05, + "loss": 2.3991, + "mean_token_accuracy": 0.4538415014743805, + "step": 25825 + }, + { + "epoch": 0.026016250336156545, + "grad_norm": 16.385023110520898, + "learning_rate": 2.601575247265476e-05, + "loss": 2.3529, + "mean_token_accuracy": 0.4586206912994385, + "step": 25830 + }, + { + "epoch": 0.02602128638926072, + "grad_norm": 16.46552764432163, + "learning_rate": 2.6020788429385816e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.39310343861579894, + "step": 25835 + }, + { + "epoch": 0.02602632244236489, + "grad_norm": 20.10751831243197, + "learning_rate": 2.6025824386116875e-05, + "loss": 2.3995, + "mean_token_accuracy": 0.4344827592372894, + "step": 25840 + }, + { + "epoch": 0.026031358495469063, + "grad_norm": 18.83278374091365, + "learning_rate": 2.6030860342847935e-05, + "loss": 2.161, + "mean_token_accuracy": 0.4551724076271057, + "step": 25845 + }, + { + "epoch": 0.026036394548573236, + "grad_norm": 23.83657279959745, + "learning_rate": 2.6035896299578994e-05, + "loss": 2.7131, + "mean_token_accuracy": 0.3310344755649567, + "step": 25850 + }, + { + "epoch": 0.02604143060167741, + "grad_norm": 20.64653180181237, + "learning_rate": 2.6040932256310057e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.42413792610168455, + "step": 25855 + }, + { + "epoch": 0.02604646665478158, + "grad_norm": 15.908623717196344, + "learning_rate": 2.6045968213041116e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.41379310190677643, + "step": 25860 + }, + { + "epoch": 0.026051502707885754, + "grad_norm": 17.77808213405826, + "learning_rate": 2.6051004169772175e-05, + "loss": 2.5897, + "mean_token_accuracy": 0.4000000059604645, + "step": 25865 + }, + { + "epoch": 0.026056538760989928, + "grad_norm": 16.600821966319305, + "learning_rate": 2.6056040126503234e-05, + "loss": 2.7458, + "mean_token_accuracy": 0.41379310488700866, + "step": 25870 + }, + { + "epoch": 0.026061574814094098, + "grad_norm": 18.477933064527882, + "learning_rate": 2.6061076083234294e-05, + "loss": 2.7099, + "mean_token_accuracy": 0.3517241358757019, + "step": 25875 + }, + { + "epoch": 0.026066610867198272, + "grad_norm": 14.989907244546547, + "learning_rate": 2.6066112039965353e-05, + "loss": 2.5055, + "mean_token_accuracy": 0.4482758641242981, + "step": 25880 + }, + { + "epoch": 0.026071646920302446, + "grad_norm": 19.68610304171674, + "learning_rate": 2.6071147996696415e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.46896551847457885, + "step": 25885 + }, + { + "epoch": 0.02607668297340662, + "grad_norm": 22.7325742237096, + "learning_rate": 2.6076183953427475e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.39310345649719236, + "step": 25890 + }, + { + "epoch": 0.02608171902651079, + "grad_norm": 17.478388040797597, + "learning_rate": 2.6081219910158534e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.38620689511299133, + "step": 25895 + }, + { + "epoch": 0.026086755079614964, + "grad_norm": 17.0547481257583, + "learning_rate": 2.6086255866889593e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.42758620381355283, + "step": 25900 + }, + { + "epoch": 0.026091791132719137, + "grad_norm": 18.86088943424977, + "learning_rate": 2.609129182362065e-05, + "loss": 2.65, + "mean_token_accuracy": 0.4362371563911438, + "step": 25905 + }, + { + "epoch": 0.026096827185823308, + "grad_norm": 21.47708979460711, + "learning_rate": 2.6096327780351715e-05, + "loss": 2.364, + "mean_token_accuracy": 0.44137930274009707, + "step": 25910 + }, + { + "epoch": 0.02610186323892748, + "grad_norm": 16.176665968790154, + "learning_rate": 2.6101363737082774e-05, + "loss": 2.7433, + "mean_token_accuracy": 0.32413792610168457, + "step": 25915 + }, + { + "epoch": 0.026106899292031655, + "grad_norm": 19.767097960334574, + "learning_rate": 2.6106399693813834e-05, + "loss": 2.2928, + "mean_token_accuracy": 0.4379310429096222, + "step": 25920 + }, + { + "epoch": 0.02611193534513583, + "grad_norm": 18.958794498654328, + "learning_rate": 2.611143565054489e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.3758620709180832, + "step": 25925 + }, + { + "epoch": 0.02611697139824, + "grad_norm": 21.31067008455053, + "learning_rate": 2.611647160727595e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.41034482717514037, + "step": 25930 + }, + { + "epoch": 0.026122007451344173, + "grad_norm": 16.94517816408171, + "learning_rate": 2.6121507564007015e-05, + "loss": 2.3422, + "mean_token_accuracy": 0.4344827651977539, + "step": 25935 + }, + { + "epoch": 0.026127043504448347, + "grad_norm": 22.283280642839802, + "learning_rate": 2.6126543520738074e-05, + "loss": 2.3823, + "mean_token_accuracy": 0.4296430587768555, + "step": 25940 + }, + { + "epoch": 0.026132079557552517, + "grad_norm": 14.534419908807775, + "learning_rate": 2.613157947746913e-05, + "loss": 2.651, + "mean_token_accuracy": 0.3896551728248596, + "step": 25945 + }, + { + "epoch": 0.02613711561065669, + "grad_norm": 15.191204424783667, + "learning_rate": 2.613661543420019e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.42413793206214906, + "step": 25950 + }, + { + "epoch": 0.026142151663760865, + "grad_norm": 20.072054598243465, + "learning_rate": 2.614165139093125e-05, + "loss": 2.5941, + "mean_token_accuracy": 0.3793103456497192, + "step": 25955 + }, + { + "epoch": 0.02614718771686504, + "grad_norm": 15.881046811846868, + "learning_rate": 2.6146687347662308e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.41034482717514037, + "step": 25960 + }, + { + "epoch": 0.02615222376996921, + "grad_norm": 17.55399754053558, + "learning_rate": 2.615172330439337e-05, + "loss": 2.5773, + "mean_token_accuracy": 0.42280701994895936, + "step": 25965 + }, + { + "epoch": 0.026157259823073382, + "grad_norm": 18.13474837180395, + "learning_rate": 2.615675926112443e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.43103448748588563, + "step": 25970 + }, + { + "epoch": 0.026162295876177556, + "grad_norm": 20.762393383593924, + "learning_rate": 2.616179521785549e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.4068965494632721, + "step": 25975 + }, + { + "epoch": 0.026167331929281726, + "grad_norm": 18.646043287955806, + "learning_rate": 2.6166831174586548e-05, + "loss": 2.221, + "mean_token_accuracy": 0.4517241418361664, + "step": 25980 + }, + { + "epoch": 0.0261723679823859, + "grad_norm": 18.170538491944484, + "learning_rate": 2.6171867131317607e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4517241418361664, + "step": 25985 + }, + { + "epoch": 0.026177404035490074, + "grad_norm": 20.235071317859106, + "learning_rate": 2.617690308804867e-05, + "loss": 2.9746, + "mean_token_accuracy": 0.34827586114406583, + "step": 25990 + }, + { + "epoch": 0.026182440088594248, + "grad_norm": 16.02823025745344, + "learning_rate": 2.618193904477973e-05, + "loss": 2.1697, + "mean_token_accuracy": 0.4862069010734558, + "step": 25995 + }, + { + "epoch": 0.026187476141698418, + "grad_norm": 28.581867209201228, + "learning_rate": 2.618697500151079e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.3840895354747772, + "step": 26000 + }, + { + "epoch": 0.026192512194802592, + "grad_norm": 16.35438875023757, + "learning_rate": 2.6192010958241848e-05, + "loss": 2.5164, + "mean_token_accuracy": 0.3999999940395355, + "step": 26005 + }, + { + "epoch": 0.026197548247906766, + "grad_norm": 18.598280665744497, + "learning_rate": 2.6197046914972907e-05, + "loss": 2.523, + "mean_token_accuracy": 0.38620689511299133, + "step": 26010 + }, + { + "epoch": 0.026202584301010936, + "grad_norm": 16.148807171906178, + "learning_rate": 2.620208287170397e-05, + "loss": 2.1861, + "mean_token_accuracy": 0.4917120337486267, + "step": 26015 + }, + { + "epoch": 0.02620762035411511, + "grad_norm": 28.374416166804952, + "learning_rate": 2.620711882843503e-05, + "loss": 3.1545, + "mean_token_accuracy": 0.3482758581638336, + "step": 26020 + }, + { + "epoch": 0.026212656407219283, + "grad_norm": 16.7052585556346, + "learning_rate": 2.6212154785166088e-05, + "loss": 2.9811, + "mean_token_accuracy": 0.41379310488700866, + "step": 26025 + }, + { + "epoch": 0.026217692460323457, + "grad_norm": 18.409387823406544, + "learning_rate": 2.6217190741897147e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.42413792610168455, + "step": 26030 + }, + { + "epoch": 0.026222728513427628, + "grad_norm": 20.075371426043436, + "learning_rate": 2.6222226698628207e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.4068965554237366, + "step": 26035 + }, + { + "epoch": 0.0262277645665318, + "grad_norm": 15.918577255361972, + "learning_rate": 2.6227262655359262e-05, + "loss": 2.5395, + "mean_token_accuracy": 0.4448275864124298, + "step": 26040 + }, + { + "epoch": 0.026232800619635975, + "grad_norm": 17.56779060560404, + "learning_rate": 2.623229861209033e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.39655172228813174, + "step": 26045 + }, + { + "epoch": 0.026237836672740145, + "grad_norm": 15.920391598927626, + "learning_rate": 2.6237334568821388e-05, + "loss": 2.9236, + "mean_token_accuracy": 0.4103448212146759, + "step": 26050 + }, + { + "epoch": 0.02624287272584432, + "grad_norm": 19.982084165770296, + "learning_rate": 2.6242370525552447e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.4103448331356049, + "step": 26055 + }, + { + "epoch": 0.026247908778948493, + "grad_norm": 22.713750530514663, + "learning_rate": 2.6247406482283503e-05, + "loss": 2.8908, + "mean_token_accuracy": 0.3620689660310745, + "step": 26060 + }, + { + "epoch": 0.026252944832052667, + "grad_norm": 23.267324228420723, + "learning_rate": 2.6252442439014562e-05, + "loss": 2.6547, + "mean_token_accuracy": 0.42758620977401735, + "step": 26065 + }, + { + "epoch": 0.026257980885156837, + "grad_norm": 20.882740320609155, + "learning_rate": 2.6257478395745628e-05, + "loss": 2.4161, + "mean_token_accuracy": 0.4413793087005615, + "step": 26070 + }, + { + "epoch": 0.02626301693826101, + "grad_norm": 18.498385332204048, + "learning_rate": 2.6262514352476687e-05, + "loss": 2.3803, + "mean_token_accuracy": 0.4241379380226135, + "step": 26075 + }, + { + "epoch": 0.026268052991365184, + "grad_norm": 16.837928652063248, + "learning_rate": 2.6267550309207743e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.42758620977401735, + "step": 26080 + }, + { + "epoch": 0.026273089044469355, + "grad_norm": 16.31266645262563, + "learning_rate": 2.6272586265938802e-05, + "loss": 2.3609, + "mean_token_accuracy": 0.3881427705287933, + "step": 26085 + }, + { + "epoch": 0.02627812509757353, + "grad_norm": 17.449170934730695, + "learning_rate": 2.6277622222669862e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.3758620619773865, + "step": 26090 + }, + { + "epoch": 0.026283161150677702, + "grad_norm": 18.7924229573458, + "learning_rate": 2.6282658179400928e-05, + "loss": 2.1091, + "mean_token_accuracy": 0.47241378426551817, + "step": 26095 + }, + { + "epoch": 0.026288197203781876, + "grad_norm": 17.644889229293288, + "learning_rate": 2.6287694136131984e-05, + "loss": 2.5119, + "mean_token_accuracy": 0.41560799181461333, + "step": 26100 + }, + { + "epoch": 0.026293233256886046, + "grad_norm": 26.663174742177656, + "learning_rate": 2.6292730092863043e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.4050211668014526, + "step": 26105 + }, + { + "epoch": 0.02629826930999022, + "grad_norm": 21.22836011116904, + "learning_rate": 2.6297766049594102e-05, + "loss": 2.7621, + "mean_token_accuracy": 0.38275861740112305, + "step": 26110 + }, + { + "epoch": 0.026303305363094394, + "grad_norm": 18.542535876119285, + "learning_rate": 2.630280200632516e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.4517241299152374, + "step": 26115 + }, + { + "epoch": 0.026308341416198564, + "grad_norm": 21.911508126273496, + "learning_rate": 2.630783796305622e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.3862068891525269, + "step": 26120 + }, + { + "epoch": 0.026313377469302738, + "grad_norm": 19.651240442874094, + "learning_rate": 2.6312873919787283e-05, + "loss": 3.0094, + "mean_token_accuracy": 0.33793103098869326, + "step": 26125 + }, + { + "epoch": 0.02631841352240691, + "grad_norm": 17.69593332673549, + "learning_rate": 2.6317909876518343e-05, + "loss": 2.2797, + "mean_token_accuracy": 0.46551724076271056, + "step": 26130 + }, + { + "epoch": 0.026323449575511085, + "grad_norm": 20.692165945646035, + "learning_rate": 2.6322945833249402e-05, + "loss": 2.7463, + "mean_token_accuracy": 0.41034482717514037, + "step": 26135 + }, + { + "epoch": 0.026328485628615256, + "grad_norm": 21.706521491961233, + "learning_rate": 2.632798178998046e-05, + "loss": 2.3286, + "mean_token_accuracy": 0.42758620381355283, + "step": 26140 + }, + { + "epoch": 0.02633352168171943, + "grad_norm": 16.873861176971804, + "learning_rate": 2.633301774671152e-05, + "loss": 2.2868, + "mean_token_accuracy": 0.45862069725990295, + "step": 26145 + }, + { + "epoch": 0.026338557734823603, + "grad_norm": 14.176027348070487, + "learning_rate": 2.6338053703442583e-05, + "loss": 2.3125, + "mean_token_accuracy": 0.4241379201412201, + "step": 26150 + }, + { + "epoch": 0.026343593787927774, + "grad_norm": 18.125882997275788, + "learning_rate": 2.6343089660173642e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.4241379380226135, + "step": 26155 + }, + { + "epoch": 0.026348629841031947, + "grad_norm": 19.870521606188333, + "learning_rate": 2.63481256169047e-05, + "loss": 2.5662, + "mean_token_accuracy": 0.4034482717514038, + "step": 26160 + }, + { + "epoch": 0.02635366589413612, + "grad_norm": 17.496770658728988, + "learning_rate": 2.635316157363576e-05, + "loss": 2.3505, + "mean_token_accuracy": 0.4359951615333557, + "step": 26165 + }, + { + "epoch": 0.026358701947240295, + "grad_norm": 19.903553768195984, + "learning_rate": 2.635819753036682e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.39655172228813174, + "step": 26170 + }, + { + "epoch": 0.026363738000344465, + "grad_norm": 17.738590490295884, + "learning_rate": 2.6363233487097883e-05, + "loss": 2.6248, + "mean_token_accuracy": 0.4384236454963684, + "step": 26175 + }, + { + "epoch": 0.02636877405344864, + "grad_norm": 19.53928278627887, + "learning_rate": 2.6368269443828942e-05, + "loss": 3.031, + "mean_token_accuracy": 0.36896551847457887, + "step": 26180 + }, + { + "epoch": 0.026373810106552813, + "grad_norm": 15.730385086977082, + "learning_rate": 2.637330540056e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.3965517282485962, + "step": 26185 + }, + { + "epoch": 0.026378846159656983, + "grad_norm": 30.52860961192424, + "learning_rate": 2.637834135729106e-05, + "loss": 2.7037, + "mean_token_accuracy": 0.3793103516101837, + "step": 26190 + }, + { + "epoch": 0.026383882212761157, + "grad_norm": 16.791824651966305, + "learning_rate": 2.6383377314022116e-05, + "loss": 2.1038, + "mean_token_accuracy": 0.45862069725990295, + "step": 26195 + }, + { + "epoch": 0.02638891826586533, + "grad_norm": 21.73665551890945, + "learning_rate": 2.6388413270753182e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.38620689511299133, + "step": 26200 + }, + { + "epoch": 0.026393954318969504, + "grad_norm": 20.007691118140873, + "learning_rate": 2.639344922748424e-05, + "loss": 2.7644, + "mean_token_accuracy": 0.38965516686439516, + "step": 26205 + }, + { + "epoch": 0.026398990372073675, + "grad_norm": 14.65794053329391, + "learning_rate": 2.63984851842153e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.42413793206214906, + "step": 26210 + }, + { + "epoch": 0.02640402642517785, + "grad_norm": 20.08702757380599, + "learning_rate": 2.6403521140946357e-05, + "loss": 2.5449, + "mean_token_accuracy": 0.4517241299152374, + "step": 26215 + }, + { + "epoch": 0.026409062478282022, + "grad_norm": 19.761384868979658, + "learning_rate": 2.6408557097677416e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.4413793206214905, + "step": 26220 + }, + { + "epoch": 0.026414098531386192, + "grad_norm": 17.19068219959304, + "learning_rate": 2.6413593054408475e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.4517241299152374, + "step": 26225 + }, + { + "epoch": 0.026419134584490366, + "grad_norm": 28.94639118718414, + "learning_rate": 2.641862901113954e-05, + "loss": 2.3919, + "mean_token_accuracy": 0.4517241299152374, + "step": 26230 + }, + { + "epoch": 0.02642417063759454, + "grad_norm": 18.030231458083296, + "learning_rate": 2.6423664967870597e-05, + "loss": 2.509, + "mean_token_accuracy": 0.36551723778247835, + "step": 26235 + }, + { + "epoch": 0.026429206690698714, + "grad_norm": 16.074891579148566, + "learning_rate": 2.6428700924601656e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.4103448212146759, + "step": 26240 + }, + { + "epoch": 0.026434242743802884, + "grad_norm": 18.227665788986528, + "learning_rate": 2.6433736881332716e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4034482777118683, + "step": 26245 + }, + { + "epoch": 0.026439278796907058, + "grad_norm": 18.372551153445723, + "learning_rate": 2.6438772838063775e-05, + "loss": 2.676, + "mean_token_accuracy": 0.37931033968925476, + "step": 26250 + }, + { + "epoch": 0.02644431485001123, + "grad_norm": 20.244290584775747, + "learning_rate": 2.6443808794794837e-05, + "loss": 2.3995, + "mean_token_accuracy": 0.4344827473163605, + "step": 26255 + }, + { + "epoch": 0.026449350903115402, + "grad_norm": 15.321349976709902, + "learning_rate": 2.6448844751525897e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.458620685338974, + "step": 26260 + }, + { + "epoch": 0.026454386956219576, + "grad_norm": 14.369231243448612, + "learning_rate": 2.6453880708256956e-05, + "loss": 2.0356, + "mean_token_accuracy": 0.4862068951129913, + "step": 26265 + }, + { + "epoch": 0.02645942300932375, + "grad_norm": 20.14346947038924, + "learning_rate": 2.6458916664988015e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.4551724076271057, + "step": 26270 + }, + { + "epoch": 0.026464459062427923, + "grad_norm": 17.42228362121931, + "learning_rate": 2.6463952621719074e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.44482758045196535, + "step": 26275 + }, + { + "epoch": 0.026469495115532093, + "grad_norm": 17.408870991882434, + "learning_rate": 2.6468988578450137e-05, + "loss": 2.5286, + "mean_token_accuracy": 0.41167573928833007, + "step": 26280 + }, + { + "epoch": 0.026474531168636267, + "grad_norm": 18.16205445534524, + "learning_rate": 2.6474024535181196e-05, + "loss": 2.6645, + "mean_token_accuracy": 0.38275861740112305, + "step": 26285 + }, + { + "epoch": 0.02647956722174044, + "grad_norm": 21.07187117745845, + "learning_rate": 2.6479060491912256e-05, + "loss": 2.7223, + "mean_token_accuracy": 0.42413793206214906, + "step": 26290 + }, + { + "epoch": 0.02648460327484461, + "grad_norm": 22.25249574898106, + "learning_rate": 2.6484096448643315e-05, + "loss": 2.7617, + "mean_token_accuracy": 0.3758620619773865, + "step": 26295 + }, + { + "epoch": 0.026489639327948785, + "grad_norm": 18.834483008035605, + "learning_rate": 2.6489132405374374e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.4172413766384125, + "step": 26300 + }, + { + "epoch": 0.02649467538105296, + "grad_norm": 19.206897419078583, + "learning_rate": 2.649416836210543e-05, + "loss": 2.434, + "mean_token_accuracy": 0.41724138259887694, + "step": 26305 + }, + { + "epoch": 0.026499711434157133, + "grad_norm": 17.033178828163056, + "learning_rate": 2.6499204318836496e-05, + "loss": 2.4917, + "mean_token_accuracy": 0.42413792610168455, + "step": 26310 + }, + { + "epoch": 0.026504747487261303, + "grad_norm": 16.0311321100682, + "learning_rate": 2.6504240275567555e-05, + "loss": 2.5647, + "mean_token_accuracy": 0.3931034505367279, + "step": 26315 + }, + { + "epoch": 0.026509783540365477, + "grad_norm": 15.321897638472267, + "learning_rate": 2.6509276232298614e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.41034482717514037, + "step": 26320 + }, + { + "epoch": 0.02651481959346965, + "grad_norm": 20.68872243861882, + "learning_rate": 2.651431218902967e-05, + "loss": 2.2419, + "mean_token_accuracy": 0.439443439245224, + "step": 26325 + }, + { + "epoch": 0.02651985564657382, + "grad_norm": 22.635571085642113, + "learning_rate": 2.651934814576073e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.42758620381355283, + "step": 26330 + }, + { + "epoch": 0.026524891699677994, + "grad_norm": 19.10018053868613, + "learning_rate": 2.6524384102491796e-05, + "loss": 2.6389, + "mean_token_accuracy": 0.41034482717514037, + "step": 26335 + }, + { + "epoch": 0.026529927752782168, + "grad_norm": 17.310142897443317, + "learning_rate": 2.6529420059222855e-05, + "loss": 2.7653, + "mean_token_accuracy": 0.42068964838981626, + "step": 26340 + }, + { + "epoch": 0.026534963805886342, + "grad_norm": 18.289235779874982, + "learning_rate": 2.653445601595391e-05, + "loss": 2.4187, + "mean_token_accuracy": 0.4505747139453888, + "step": 26345 + }, + { + "epoch": 0.026539999858990512, + "grad_norm": 14.72144814066945, + "learning_rate": 2.653949197268497e-05, + "loss": 1.847, + "mean_token_accuracy": 0.5226860225200654, + "step": 26350 + }, + { + "epoch": 0.026545035912094686, + "grad_norm": 19.291380147515234, + "learning_rate": 2.654452792941603e-05, + "loss": 2.3638, + "mean_token_accuracy": 0.4448275864124298, + "step": 26355 + }, + { + "epoch": 0.02655007196519886, + "grad_norm": 24.490456362132456, + "learning_rate": 2.6549563886147095e-05, + "loss": 2.6383, + "mean_token_accuracy": 0.41923774480819703, + "step": 26360 + }, + { + "epoch": 0.02655510801830303, + "grad_norm": 19.106938611537938, + "learning_rate": 2.6554599842878155e-05, + "loss": 2.5966, + "mean_token_accuracy": 0.4103448212146759, + "step": 26365 + }, + { + "epoch": 0.026560144071407204, + "grad_norm": 23.37373698501716, + "learning_rate": 2.655963579960921e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.41034482717514037, + "step": 26370 + }, + { + "epoch": 0.026565180124511378, + "grad_norm": 21.236744844716764, + "learning_rate": 2.656467175634027e-05, + "loss": 2.3783, + "mean_token_accuracy": 0.42758620977401735, + "step": 26375 + }, + { + "epoch": 0.02657021617761555, + "grad_norm": 22.67547365851139, + "learning_rate": 2.656970771307133e-05, + "loss": 2.7224, + "mean_token_accuracy": 0.4137930989265442, + "step": 26380 + }, + { + "epoch": 0.02657525223071972, + "grad_norm": 18.02583984070576, + "learning_rate": 2.6574743669802388e-05, + "loss": 2.7375, + "mean_token_accuracy": 0.35172413289546967, + "step": 26385 + }, + { + "epoch": 0.026580288283823895, + "grad_norm": 16.14427873406681, + "learning_rate": 2.657977962653345e-05, + "loss": 2.6752, + "mean_token_accuracy": 0.4000000059604645, + "step": 26390 + }, + { + "epoch": 0.02658532433692807, + "grad_norm": 25.49435604664068, + "learning_rate": 2.658481558326451e-05, + "loss": 2.8213, + "mean_token_accuracy": 0.38275861740112305, + "step": 26395 + }, + { + "epoch": 0.02659036039003224, + "grad_norm": 22.73198606305175, + "learning_rate": 2.658985153999557e-05, + "loss": 2.6673, + "mean_token_accuracy": 0.36206896901130675, + "step": 26400 + }, + { + "epoch": 0.026595396443136413, + "grad_norm": 15.150778368839646, + "learning_rate": 2.659488749672663e-05, + "loss": 2.7391, + "mean_token_accuracy": 0.34482758641242983, + "step": 26405 + }, + { + "epoch": 0.026600432496240587, + "grad_norm": 16.52730320678833, + "learning_rate": 2.6599923453457688e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.42758620381355283, + "step": 26410 + }, + { + "epoch": 0.02660546854934476, + "grad_norm": 18.851941243178903, + "learning_rate": 2.660495941018875e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.4469449520111084, + "step": 26415 + }, + { + "epoch": 0.02661050460244893, + "grad_norm": 19.163000430352216, + "learning_rate": 2.660999536691981e-05, + "loss": 2.5397, + "mean_token_accuracy": 0.3931034475564957, + "step": 26420 + }, + { + "epoch": 0.026615540655553105, + "grad_norm": 17.515856701055565, + "learning_rate": 2.661503132365087e-05, + "loss": 2.4471, + "mean_token_accuracy": 0.4137930989265442, + "step": 26425 + }, + { + "epoch": 0.02662057670865728, + "grad_norm": 17.50651473714608, + "learning_rate": 2.6620067280381928e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.43103448748588563, + "step": 26430 + }, + { + "epoch": 0.02662561276176145, + "grad_norm": 20.836084231608638, + "learning_rate": 2.6625103237112987e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.403448274731636, + "step": 26435 + }, + { + "epoch": 0.026630648814865623, + "grad_norm": 16.819906694833826, + "learning_rate": 2.663013919384405e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.4433151841163635, + "step": 26440 + }, + { + "epoch": 0.026635684867969796, + "grad_norm": 20.018234219793513, + "learning_rate": 2.663517515057511e-05, + "loss": 2.5822, + "mean_token_accuracy": 0.36551724672317504, + "step": 26445 + }, + { + "epoch": 0.02664072092107397, + "grad_norm": 17.43892433515772, + "learning_rate": 2.664021110730617e-05, + "loss": 2.6166, + "mean_token_accuracy": 0.4068965554237366, + "step": 26450 + }, + { + "epoch": 0.02664575697417814, + "grad_norm": 17.03087560976558, + "learning_rate": 2.6645247064037228e-05, + "loss": 3.0, + "mean_token_accuracy": 0.320689657330513, + "step": 26455 + }, + { + "epoch": 0.026650793027282314, + "grad_norm": 22.121839046282748, + "learning_rate": 2.6650283020768284e-05, + "loss": 2.6548, + "mean_token_accuracy": 0.36551724672317504, + "step": 26460 + }, + { + "epoch": 0.026655829080386488, + "grad_norm": 22.480825895544886, + "learning_rate": 2.6655318977499343e-05, + "loss": 2.7532, + "mean_token_accuracy": 0.3965517282485962, + "step": 26465 + }, + { + "epoch": 0.02666086513349066, + "grad_norm": 14.583735454226035, + "learning_rate": 2.666035493423041e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.4896551728248596, + "step": 26470 + }, + { + "epoch": 0.026665901186594832, + "grad_norm": 16.278201843025588, + "learning_rate": 2.6665390890961468e-05, + "loss": 2.5249, + "mean_token_accuracy": 0.4379310369491577, + "step": 26475 + }, + { + "epoch": 0.026670937239699006, + "grad_norm": 19.610670204553035, + "learning_rate": 2.6670426847692524e-05, + "loss": 2.7158, + "mean_token_accuracy": 0.42413793206214906, + "step": 26480 + }, + { + "epoch": 0.02667597329280318, + "grad_norm": 18.37476725856538, + "learning_rate": 2.6675462804423583e-05, + "loss": 2.6315, + "mean_token_accuracy": 0.41034482717514037, + "step": 26485 + }, + { + "epoch": 0.02668100934590735, + "grad_norm": 15.527395835135536, + "learning_rate": 2.6680498761154643e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.42413792610168455, + "step": 26490 + }, + { + "epoch": 0.026686045399011524, + "grad_norm": 18.439513665993097, + "learning_rate": 2.668553471788571e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.41724138259887694, + "step": 26495 + }, + { + "epoch": 0.026691081452115697, + "grad_norm": 18.166266412915164, + "learning_rate": 2.6690570674616765e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.42068966031074523, + "step": 26500 + }, + { + "epoch": 0.026696117505219868, + "grad_norm": 43.84216410257988, + "learning_rate": 2.6695606631347824e-05, + "loss": 3.0035, + "mean_token_accuracy": 0.3896551728248596, + "step": 26505 + }, + { + "epoch": 0.02670115355832404, + "grad_norm": 24.907906952258795, + "learning_rate": 2.6700642588078883e-05, + "loss": 2.5792, + "mean_token_accuracy": 0.38965516686439516, + "step": 26510 + }, + { + "epoch": 0.026706189611428215, + "grad_norm": 18.975182651807927, + "learning_rate": 2.6705678544809942e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.42413792610168455, + "step": 26515 + }, + { + "epoch": 0.02671122566453239, + "grad_norm": 26.6230578720545, + "learning_rate": 2.6710714501541005e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.37586206793785093, + "step": 26520 + }, + { + "epoch": 0.02671626171763656, + "grad_norm": 17.475275833237866, + "learning_rate": 2.6715750458272064e-05, + "loss": 2.0487, + "mean_token_accuracy": 0.5082512438297272, + "step": 26525 + }, + { + "epoch": 0.026721297770740733, + "grad_norm": 19.749955516066873, + "learning_rate": 2.6720786415003123e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.39310344457626345, + "step": 26530 + }, + { + "epoch": 0.026726333823844907, + "grad_norm": 18.345891727680215, + "learning_rate": 2.6725822371734183e-05, + "loss": 2.4944, + "mean_token_accuracy": 0.42758620381355283, + "step": 26535 + }, + { + "epoch": 0.026731369876949077, + "grad_norm": 24.237299235181624, + "learning_rate": 2.6730858328465242e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.3896551728248596, + "step": 26540 + }, + { + "epoch": 0.02673640593005325, + "grad_norm": 18.50473053043379, + "learning_rate": 2.67358942851963e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.3586206942796707, + "step": 26545 + }, + { + "epoch": 0.026741441983157425, + "grad_norm": 16.893301722057114, + "learning_rate": 2.6740930241927364e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.39310344457626345, + "step": 26550 + }, + { + "epoch": 0.0267464780362616, + "grad_norm": 20.952702031989787, + "learning_rate": 2.6745966198658423e-05, + "loss": 2.4712, + "mean_token_accuracy": 0.40689654350280763, + "step": 26555 + }, + { + "epoch": 0.02675151408936577, + "grad_norm": 20.227259840838073, + "learning_rate": 2.6751002155389482e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.3724137932062149, + "step": 26560 + }, + { + "epoch": 0.026756550142469943, + "grad_norm": 17.363299492869327, + "learning_rate": 2.675603811212054e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.41379311084747317, + "step": 26565 + }, + { + "epoch": 0.026761586195574116, + "grad_norm": 22.942609479603245, + "learning_rate": 2.67610740688516e-05, + "loss": 2.5691, + "mean_token_accuracy": 0.37586207389831544, + "step": 26570 + }, + { + "epoch": 0.026766622248678287, + "grad_norm": 23.705061754197057, + "learning_rate": 2.6766110025582663e-05, + "loss": 2.3177, + "mean_token_accuracy": 0.4551724076271057, + "step": 26575 + }, + { + "epoch": 0.02677165830178246, + "grad_norm": 16.52475817165424, + "learning_rate": 2.6771145982313723e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.4413793087005615, + "step": 26580 + }, + { + "epoch": 0.026776694354886634, + "grad_norm": 16.994721702510603, + "learning_rate": 2.6776181939044782e-05, + "loss": 2.3538, + "mean_token_accuracy": 0.44482757449150084, + "step": 26585 + }, + { + "epoch": 0.026781730407990804, + "grad_norm": 16.93425569342198, + "learning_rate": 2.678121789577584e-05, + "loss": 3.0897, + "mean_token_accuracy": 0.34137930572032926, + "step": 26590 + }, + { + "epoch": 0.026786766461094978, + "grad_norm": 17.71527059179849, + "learning_rate": 2.6786253852506897e-05, + "loss": 2.8102, + "mean_token_accuracy": 0.3758620709180832, + "step": 26595 + }, + { + "epoch": 0.026791802514199152, + "grad_norm": 20.41262201795603, + "learning_rate": 2.6791289809237963e-05, + "loss": 2.331, + "mean_token_accuracy": 0.4448275864124298, + "step": 26600 + }, + { + "epoch": 0.026796838567303326, + "grad_norm": 19.836118096521158, + "learning_rate": 2.6796325765969022e-05, + "loss": 2.6239, + "mean_token_accuracy": 0.4068965494632721, + "step": 26605 + }, + { + "epoch": 0.026801874620407496, + "grad_norm": 16.632618946257917, + "learning_rate": 2.680136172270008e-05, + "loss": 2.4838, + "mean_token_accuracy": 0.41724138259887694, + "step": 26610 + }, + { + "epoch": 0.02680691067351167, + "grad_norm": 25.57761902460903, + "learning_rate": 2.6806397679431138e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.39310344457626345, + "step": 26615 + }, + { + "epoch": 0.026811946726615844, + "grad_norm": 20.335120611582486, + "learning_rate": 2.6811433636162197e-05, + "loss": 2.7376, + "mean_token_accuracy": 0.33448276221752166, + "step": 26620 + }, + { + "epoch": 0.026816982779720014, + "grad_norm": 17.75998385795157, + "learning_rate": 2.6816469592893263e-05, + "loss": 2.7259, + "mean_token_accuracy": 0.3999999940395355, + "step": 26625 + }, + { + "epoch": 0.026822018832824188, + "grad_norm": 20.875467040203176, + "learning_rate": 2.6821505549624322e-05, + "loss": 2.383, + "mean_token_accuracy": 0.4103448212146759, + "step": 26630 + }, + { + "epoch": 0.02682705488592836, + "grad_norm": 16.477109888812123, + "learning_rate": 2.6826541506355378e-05, + "loss": 2.3424, + "mean_token_accuracy": 0.49171202778816225, + "step": 26635 + }, + { + "epoch": 0.026832090939032535, + "grad_norm": 16.564570244018086, + "learning_rate": 2.6831577463086437e-05, + "loss": 2.397, + "mean_token_accuracy": 0.417241370677948, + "step": 26640 + }, + { + "epoch": 0.026837126992136705, + "grad_norm": 17.905535676758145, + "learning_rate": 2.6836613419817496e-05, + "loss": 2.1829, + "mean_token_accuracy": 0.4724137902259827, + "step": 26645 + }, + { + "epoch": 0.02684216304524088, + "grad_norm": 18.582089642733994, + "learning_rate": 2.6841649376548556e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.41034482717514037, + "step": 26650 + }, + { + "epoch": 0.026847199098345053, + "grad_norm": 16.102334526438064, + "learning_rate": 2.6846685333279618e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.4793103516101837, + "step": 26655 + }, + { + "epoch": 0.026852235151449223, + "grad_norm": 23.612563020690057, + "learning_rate": 2.6851721290010678e-05, + "loss": 2.5103, + "mean_token_accuracy": 0.3965517282485962, + "step": 26660 + }, + { + "epoch": 0.026857271204553397, + "grad_norm": 17.294890265387224, + "learning_rate": 2.6856757246741737e-05, + "loss": 2.3032, + "mean_token_accuracy": 0.46896551847457885, + "step": 26665 + }, + { + "epoch": 0.02686230725765757, + "grad_norm": 16.16031750561536, + "learning_rate": 2.6861793203472796e-05, + "loss": 2.297, + "mean_token_accuracy": 0.42068965137004855, + "step": 26670 + }, + { + "epoch": 0.026867343310761745, + "grad_norm": 19.91220422084447, + "learning_rate": 2.6866829160203855e-05, + "loss": 2.5976, + "mean_token_accuracy": 0.38620689511299133, + "step": 26675 + }, + { + "epoch": 0.026872379363865915, + "grad_norm": 15.2960239570482, + "learning_rate": 2.6871865116934918e-05, + "loss": 1.9801, + "mean_token_accuracy": 0.4620689630508423, + "step": 26680 + }, + { + "epoch": 0.02687741541697009, + "grad_norm": 22.639381307893732, + "learning_rate": 2.6876901073665977e-05, + "loss": 2.8707, + "mean_token_accuracy": 0.37586207389831544, + "step": 26685 + }, + { + "epoch": 0.026882451470074262, + "grad_norm": 15.410212350156943, + "learning_rate": 2.6881937030397036e-05, + "loss": 2.385, + "mean_token_accuracy": 0.42758620977401735, + "step": 26690 + }, + { + "epoch": 0.026887487523178433, + "grad_norm": 21.210383733920175, + "learning_rate": 2.6886972987128096e-05, + "loss": 2.6044, + "mean_token_accuracy": 0.38106473088264464, + "step": 26695 + }, + { + "epoch": 0.026892523576282606, + "grad_norm": 19.01866715972732, + "learning_rate": 2.6892008943859155e-05, + "loss": 2.8327, + "mean_token_accuracy": 0.3620689570903778, + "step": 26700 + }, + { + "epoch": 0.02689755962938678, + "grad_norm": 15.595556644708015, + "learning_rate": 2.6897044900590218e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.4482758641242981, + "step": 26705 + }, + { + "epoch": 0.026902595682490954, + "grad_norm": 17.190845267179338, + "learning_rate": 2.6902080857321277e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.41379311084747317, + "step": 26710 + }, + { + "epoch": 0.026907631735595124, + "grad_norm": 20.613282509870697, + "learning_rate": 2.6907116814052336e-05, + "loss": 2.6201, + "mean_token_accuracy": 0.4, + "step": 26715 + }, + { + "epoch": 0.026912667788699298, + "grad_norm": 14.536783358514555, + "learning_rate": 2.6912152770783395e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.42413793206214906, + "step": 26720 + }, + { + "epoch": 0.026917703841803472, + "grad_norm": 17.187835334340203, + "learning_rate": 2.6917188727514455e-05, + "loss": 2.113, + "mean_token_accuracy": 0.4862068951129913, + "step": 26725 + }, + { + "epoch": 0.026922739894907642, + "grad_norm": 18.74858153677933, + "learning_rate": 2.692222468424551e-05, + "loss": 2.3955, + "mean_token_accuracy": 0.4241379380226135, + "step": 26730 + }, + { + "epoch": 0.026927775948011816, + "grad_norm": 17.62455447866605, + "learning_rate": 2.6927260640976577e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.37586206793785093, + "step": 26735 + }, + { + "epoch": 0.02693281200111599, + "grad_norm": 21.0848241618325, + "learning_rate": 2.6932296597707636e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.41034482717514037, + "step": 26740 + }, + { + "epoch": 0.026937848054220163, + "grad_norm": 15.06289578582256, + "learning_rate": 2.6937332554438695e-05, + "loss": 2.6431, + "mean_token_accuracy": 0.358620685338974, + "step": 26745 + }, + { + "epoch": 0.026942884107324334, + "grad_norm": 17.424882190669397, + "learning_rate": 2.694236851116975e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.46896551847457885, + "step": 26750 + }, + { + "epoch": 0.026947920160428507, + "grad_norm": 19.050732013016383, + "learning_rate": 2.694740446790081e-05, + "loss": 2.7662, + "mean_token_accuracy": 0.33793102502822875, + "step": 26755 + }, + { + "epoch": 0.02695295621353268, + "grad_norm": 16.64238312550753, + "learning_rate": 2.6952440424631876e-05, + "loss": 2.5786, + "mean_token_accuracy": 0.38275861740112305, + "step": 26760 + }, + { + "epoch": 0.02695799226663685, + "grad_norm": 15.265324960878868, + "learning_rate": 2.6957476381362935e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.3965517282485962, + "step": 26765 + }, + { + "epoch": 0.026963028319741025, + "grad_norm": 15.327091127577264, + "learning_rate": 2.696251233809399e-05, + "loss": 2.6416, + "mean_token_accuracy": 0.32758620083332063, + "step": 26770 + }, + { + "epoch": 0.0269680643728452, + "grad_norm": 20.964042168386715, + "learning_rate": 2.696754829482505e-05, + "loss": 2.741, + "mean_token_accuracy": 0.4068965554237366, + "step": 26775 + }, + { + "epoch": 0.026973100425949373, + "grad_norm": 14.946707768657257, + "learning_rate": 2.697258425155611e-05, + "loss": 2.3998, + "mean_token_accuracy": 0.41034482717514037, + "step": 26780 + }, + { + "epoch": 0.026978136479053543, + "grad_norm": 18.31421125412061, + "learning_rate": 2.6977620208287176e-05, + "loss": 2.7007, + "mean_token_accuracy": 0.3689655214548111, + "step": 26785 + }, + { + "epoch": 0.026983172532157717, + "grad_norm": 18.83236959697641, + "learning_rate": 2.698265616501823e-05, + "loss": 2.8449, + "mean_token_accuracy": 0.39310344457626345, + "step": 26790 + }, + { + "epoch": 0.02698820858526189, + "grad_norm": 15.591172964919453, + "learning_rate": 2.698769212174929e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.4068965494632721, + "step": 26795 + }, + { + "epoch": 0.02699324463836606, + "grad_norm": 17.825145649700218, + "learning_rate": 2.699272807848035e-05, + "loss": 2.42, + "mean_token_accuracy": 0.43448275327682495, + "step": 26800 + }, + { + "epoch": 0.026998280691470235, + "grad_norm": 15.925485439034828, + "learning_rate": 2.699776403521141e-05, + "loss": 2.5881, + "mean_token_accuracy": 0.42068964838981626, + "step": 26805 + }, + { + "epoch": 0.02700331674457441, + "grad_norm": 15.144272722144196, + "learning_rate": 2.700279999194247e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.41724138259887694, + "step": 26810 + }, + { + "epoch": 0.027008352797678582, + "grad_norm": 20.335665610270276, + "learning_rate": 2.700783594867353e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.4172413766384125, + "step": 26815 + }, + { + "epoch": 0.027013388850782753, + "grad_norm": 21.056845117806382, + "learning_rate": 2.701287190540459e-05, + "loss": 2.7865, + "mean_token_accuracy": 0.41379310488700866, + "step": 26820 + }, + { + "epoch": 0.027018424903886926, + "grad_norm": 13.770110047772942, + "learning_rate": 2.701790786213565e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.4103448331356049, + "step": 26825 + }, + { + "epoch": 0.0270234609569911, + "grad_norm": 18.94634921490606, + "learning_rate": 2.702294381886671e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.358620685338974, + "step": 26830 + }, + { + "epoch": 0.02702849701009527, + "grad_norm": 17.53954027205738, + "learning_rate": 2.702797977559777e-05, + "loss": 2.2204, + "mean_token_accuracy": 0.4586206912994385, + "step": 26835 + }, + { + "epoch": 0.027033533063199444, + "grad_norm": 30.561714742724668, + "learning_rate": 2.703301573232883e-05, + "loss": 2.3019, + "mean_token_accuracy": 0.4901477873325348, + "step": 26840 + }, + { + "epoch": 0.027038569116303618, + "grad_norm": 17.753730779509883, + "learning_rate": 2.703805168905989e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.362068971991539, + "step": 26845 + }, + { + "epoch": 0.02704360516940779, + "grad_norm": 18.252880811630657, + "learning_rate": 2.704308764579095e-05, + "loss": 2.5086, + "mean_token_accuracy": 0.379310342669487, + "step": 26850 + }, + { + "epoch": 0.027048641222511962, + "grad_norm": 21.59894897189673, + "learning_rate": 2.704812360252201e-05, + "loss": 2.3058, + "mean_token_accuracy": 0.47132486701011655, + "step": 26855 + }, + { + "epoch": 0.027053677275616136, + "grad_norm": 18.852684013264735, + "learning_rate": 2.7053159559253065e-05, + "loss": 2.6412, + "mean_token_accuracy": 0.43448275327682495, + "step": 26860 + }, + { + "epoch": 0.02705871332872031, + "grad_norm": 26.79472507347962, + "learning_rate": 2.705819551598413e-05, + "loss": 2.1864, + "mean_token_accuracy": 0.4482758641242981, + "step": 26865 + }, + { + "epoch": 0.02706374938182448, + "grad_norm": 17.60896181927883, + "learning_rate": 2.706323147271519e-05, + "loss": 2.4758, + "mean_token_accuracy": 0.3793103516101837, + "step": 26870 + }, + { + "epoch": 0.027068785434928654, + "grad_norm": 18.82987881197227, + "learning_rate": 2.706826742944625e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.4551724135875702, + "step": 26875 + }, + { + "epoch": 0.027073821488032827, + "grad_norm": 20.03584561924533, + "learning_rate": 2.7073303386177305e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.37241379618644715, + "step": 26880 + }, + { + "epoch": 0.027078857541137, + "grad_norm": 18.30948766305271, + "learning_rate": 2.7078339342908364e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.42758620977401735, + "step": 26885 + }, + { + "epoch": 0.02708389359424117, + "grad_norm": 17.207135611741407, + "learning_rate": 2.7083375299639424e-05, + "loss": 2.7789, + "mean_token_accuracy": 0.3896551787853241, + "step": 26890 + }, + { + "epoch": 0.027088929647345345, + "grad_norm": 22.960485582479805, + "learning_rate": 2.708841125637049e-05, + "loss": 2.7601, + "mean_token_accuracy": 0.3896551728248596, + "step": 26895 + }, + { + "epoch": 0.02709396570044952, + "grad_norm": 19.925661290776958, + "learning_rate": 2.7093447213101545e-05, + "loss": 2.5567, + "mean_token_accuracy": 0.3965517282485962, + "step": 26900 + }, + { + "epoch": 0.02709900175355369, + "grad_norm": 17.056318333424937, + "learning_rate": 2.7098483169832605e-05, + "loss": 2.1354, + "mean_token_accuracy": 0.49999999403953554, + "step": 26905 + }, + { + "epoch": 0.027104037806657863, + "grad_norm": 20.625864014975154, + "learning_rate": 2.7103519126563664e-05, + "loss": 2.492, + "mean_token_accuracy": 0.441379314661026, + "step": 26910 + }, + { + "epoch": 0.027109073859762037, + "grad_norm": 36.02802024084089, + "learning_rate": 2.7108555083294723e-05, + "loss": 2.554, + "mean_token_accuracy": 0.3931034505367279, + "step": 26915 + }, + { + "epoch": 0.02711410991286621, + "grad_norm": 21.865503840078667, + "learning_rate": 2.711359104002579e-05, + "loss": 2.3369, + "mean_token_accuracy": 0.3965517282485962, + "step": 26920 + }, + { + "epoch": 0.02711914596597038, + "grad_norm": 28.08703386516553, + "learning_rate": 2.7118626996756845e-05, + "loss": 2.34, + "mean_token_accuracy": 0.4482758641242981, + "step": 26925 + }, + { + "epoch": 0.027124182019074555, + "grad_norm": 16.522715965145718, + "learning_rate": 2.7123662953487904e-05, + "loss": 2.0986, + "mean_token_accuracy": 0.47241379618644713, + "step": 26930 + }, + { + "epoch": 0.02712921807217873, + "grad_norm": 18.968549413848102, + "learning_rate": 2.7128698910218964e-05, + "loss": 2.5155, + "mean_token_accuracy": 0.4344827592372894, + "step": 26935 + }, + { + "epoch": 0.0271342541252829, + "grad_norm": 19.340938659631657, + "learning_rate": 2.7133734866950023e-05, + "loss": 3.1601, + "mean_token_accuracy": 0.35862069129943847, + "step": 26940 + }, + { + "epoch": 0.027139290178387072, + "grad_norm": 17.230546271280193, + "learning_rate": 2.7138770823681085e-05, + "loss": 2.6367, + "mean_token_accuracy": 0.43448275327682495, + "step": 26945 + }, + { + "epoch": 0.027144326231491246, + "grad_norm": 16.708268477234174, + "learning_rate": 2.7143806780412145e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.4551724135875702, + "step": 26950 + }, + { + "epoch": 0.02714936228459542, + "grad_norm": 20.716048452102342, + "learning_rate": 2.7148842737143204e-05, + "loss": 2.8234, + "mean_token_accuracy": 0.3931034505367279, + "step": 26955 + }, + { + "epoch": 0.02715439833769959, + "grad_norm": 18.32760900396895, + "learning_rate": 2.7153878693874263e-05, + "loss": 2.4838, + "mean_token_accuracy": 0.4379310369491577, + "step": 26960 + }, + { + "epoch": 0.027159434390803764, + "grad_norm": 15.444974810534694, + "learning_rate": 2.7158914650605322e-05, + "loss": 1.9527, + "mean_token_accuracy": 0.5137930989265442, + "step": 26965 + }, + { + "epoch": 0.027164470443907938, + "grad_norm": 19.93434166031694, + "learning_rate": 2.7163950607336382e-05, + "loss": 2.7885, + "mean_token_accuracy": 0.38620689511299133, + "step": 26970 + }, + { + "epoch": 0.027169506497012108, + "grad_norm": 20.776767886046805, + "learning_rate": 2.7168986564067444e-05, + "loss": 2.6308, + "mean_token_accuracy": 0.40344826579093934, + "step": 26975 + }, + { + "epoch": 0.027174542550116282, + "grad_norm": 25.440511474432316, + "learning_rate": 2.7174022520798504e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.4503327190876007, + "step": 26980 + }, + { + "epoch": 0.027179578603220456, + "grad_norm": 19.37709120574868, + "learning_rate": 2.7179058477529563e-05, + "loss": 2.3057, + "mean_token_accuracy": 0.4793103516101837, + "step": 26985 + }, + { + "epoch": 0.02718461465632463, + "grad_norm": 23.281731019611883, + "learning_rate": 2.7184094434260622e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.4275861978530884, + "step": 26990 + }, + { + "epoch": 0.0271896507094288, + "grad_norm": 17.519760365166217, + "learning_rate": 2.7189130390991678e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.4620689690113068, + "step": 26995 + }, + { + "epoch": 0.027194686762532973, + "grad_norm": 15.647199536126944, + "learning_rate": 2.7194166347722744e-05, + "loss": 2.4159, + "mean_token_accuracy": 0.3810042321681976, + "step": 27000 + }, + { + "epoch": 0.027199722815637147, + "grad_norm": 16.69858153493782, + "learning_rate": 2.7199202304453803e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4344827592372894, + "step": 27005 + }, + { + "epoch": 0.027204758868741317, + "grad_norm": 34.2201166855786, + "learning_rate": 2.7204238261184863e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.43103448748588563, + "step": 27010 + }, + { + "epoch": 0.02720979492184549, + "grad_norm": 26.316403109142225, + "learning_rate": 2.720927421791592e-05, + "loss": 2.4732, + "mean_token_accuracy": 0.45862067937850953, + "step": 27015 + }, + { + "epoch": 0.027214830974949665, + "grad_norm": 23.191213483264814, + "learning_rate": 2.7214310174646978e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.39655172228813174, + "step": 27020 + }, + { + "epoch": 0.02721986702805384, + "grad_norm": 22.964984968292125, + "learning_rate": 2.7219346131378044e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.41034482717514037, + "step": 27025 + }, + { + "epoch": 0.02722490308115801, + "grad_norm": 22.08642847487544, + "learning_rate": 2.7224382088109103e-05, + "loss": 2.4799, + "mean_token_accuracy": 0.4, + "step": 27030 + }, + { + "epoch": 0.027229939134262183, + "grad_norm": 22.146748101344265, + "learning_rate": 2.722941804484016e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.41724138259887694, + "step": 27035 + }, + { + "epoch": 0.027234975187366357, + "grad_norm": 13.863170103345503, + "learning_rate": 2.7234454001571218e-05, + "loss": 2.1983, + "mean_token_accuracy": 0.45172414779663084, + "step": 27040 + }, + { + "epoch": 0.027240011240470527, + "grad_norm": 16.525576335716433, + "learning_rate": 2.7239489958302277e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.39655172228813174, + "step": 27045 + }, + { + "epoch": 0.0272450472935747, + "grad_norm": 19.459897906754897, + "learning_rate": 2.7244525915033343e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.4689655005931854, + "step": 27050 + }, + { + "epoch": 0.027250083346678874, + "grad_norm": 20.05728976113244, + "learning_rate": 2.72495618717644e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.3965517282485962, + "step": 27055 + }, + { + "epoch": 0.027255119399783048, + "grad_norm": 17.626868207961696, + "learning_rate": 2.725459782849546e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.41379310488700866, + "step": 27060 + }, + { + "epoch": 0.02726015545288722, + "grad_norm": 15.158798206477504, + "learning_rate": 2.7259633785226518e-05, + "loss": 2.633, + "mean_token_accuracy": 0.37931033968925476, + "step": 27065 + }, + { + "epoch": 0.027265191505991392, + "grad_norm": 13.481894532543539, + "learning_rate": 2.7264669741957577e-05, + "loss": 2.4068, + "mean_token_accuracy": 0.43103448748588563, + "step": 27070 + }, + { + "epoch": 0.027270227559095566, + "grad_norm": 16.852618664683018, + "learning_rate": 2.7269705698688636e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.4310344815254211, + "step": 27075 + }, + { + "epoch": 0.027275263612199736, + "grad_norm": 19.734619599837295, + "learning_rate": 2.72747416554197e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.41379310488700866, + "step": 27080 + }, + { + "epoch": 0.02728029966530391, + "grad_norm": 20.307936796990443, + "learning_rate": 2.7279777612150758e-05, + "loss": 2.5906, + "mean_token_accuracy": 0.4103448331356049, + "step": 27085 + }, + { + "epoch": 0.027285335718408084, + "grad_norm": 19.32360831491387, + "learning_rate": 2.7284813568881817e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.41724138259887694, + "step": 27090 + }, + { + "epoch": 0.027290371771512258, + "grad_norm": 19.115525201502127, + "learning_rate": 2.7289849525612877e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4482758641242981, + "step": 27095 + }, + { + "epoch": 0.027295407824616428, + "grad_norm": 15.637345081140717, + "learning_rate": 2.7294885482343936e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.3931034505367279, + "step": 27100 + }, + { + "epoch": 0.0273004438777206, + "grad_norm": 23.685773578035285, + "learning_rate": 2.7299921439075e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.46896552443504336, + "step": 27105 + }, + { + "epoch": 0.027305479930824775, + "grad_norm": 15.799165360948571, + "learning_rate": 2.7304957395806058e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.4413793087005615, + "step": 27110 + }, + { + "epoch": 0.027310515983928946, + "grad_norm": 17.59960935721835, + "learning_rate": 2.7309993352537117e-05, + "loss": 2.5913, + "mean_token_accuracy": 0.36896551847457887, + "step": 27115 + }, + { + "epoch": 0.02731555203703312, + "grad_norm": 18.7232075113392, + "learning_rate": 2.7315029309268176e-05, + "loss": 2.4847, + "mean_token_accuracy": 0.4413793087005615, + "step": 27120 + }, + { + "epoch": 0.027320588090137293, + "grad_norm": 17.032580750019978, + "learning_rate": 2.7320065265999236e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.43103447556495667, + "step": 27125 + }, + { + "epoch": 0.027325624143241467, + "grad_norm": 19.242248713464573, + "learning_rate": 2.7325101222730298e-05, + "loss": 2.8294, + "mean_token_accuracy": 0.358620685338974, + "step": 27130 + }, + { + "epoch": 0.027330660196345637, + "grad_norm": 16.340068498774713, + "learning_rate": 2.7330137179461357e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.38965516686439516, + "step": 27135 + }, + { + "epoch": 0.02733569624944981, + "grad_norm": 19.702415815858213, + "learning_rate": 2.7335173136192417e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.3999999940395355, + "step": 27140 + }, + { + "epoch": 0.027340732302553985, + "grad_norm": 17.912554722972416, + "learning_rate": 2.7340209092923476e-05, + "loss": 2.7645, + "mean_token_accuracy": 0.3275862097740173, + "step": 27145 + }, + { + "epoch": 0.027345768355658155, + "grad_norm": 18.37105908707584, + "learning_rate": 2.7345245049654532e-05, + "loss": 2.3614, + "mean_token_accuracy": 0.42413792610168455, + "step": 27150 + }, + { + "epoch": 0.02735080440876233, + "grad_norm": 14.139528130755384, + "learning_rate": 2.735028100638559e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4413793087005615, + "step": 27155 + }, + { + "epoch": 0.027355840461866503, + "grad_norm": 14.941170369381537, + "learning_rate": 2.7355316963116657e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.458620685338974, + "step": 27160 + }, + { + "epoch": 0.027360876514970676, + "grad_norm": 16.22472969922191, + "learning_rate": 2.7360352919847716e-05, + "loss": 2.7369, + "mean_token_accuracy": 0.38275861740112305, + "step": 27165 + }, + { + "epoch": 0.027365912568074847, + "grad_norm": 20.399271711134233, + "learning_rate": 2.7365388876578772e-05, + "loss": 2.386, + "mean_token_accuracy": 0.4655172288417816, + "step": 27170 + }, + { + "epoch": 0.02737094862117902, + "grad_norm": 19.162675336913377, + "learning_rate": 2.737042483330983e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.42758620381355283, + "step": 27175 + }, + { + "epoch": 0.027375984674283194, + "grad_norm": 14.320613954501372, + "learning_rate": 2.737546079004089e-05, + "loss": 2.2488, + "mean_token_accuracy": 0.40689654350280763, + "step": 27180 + }, + { + "epoch": 0.027381020727387365, + "grad_norm": 18.681276203673328, + "learning_rate": 2.7380496746771957e-05, + "loss": 2.6047, + "mean_token_accuracy": 0.3845735102891922, + "step": 27185 + }, + { + "epoch": 0.02738605678049154, + "grad_norm": 20.93588520302252, + "learning_rate": 2.7385532703503013e-05, + "loss": 2.6694, + "mean_token_accuracy": 0.4, + "step": 27190 + }, + { + "epoch": 0.027391092833595712, + "grad_norm": 18.13116529545736, + "learning_rate": 2.7390568660234072e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.43980641961097716, + "step": 27195 + }, + { + "epoch": 0.027396128886699886, + "grad_norm": 23.654789853833485, + "learning_rate": 2.739560461696513e-05, + "loss": 2.7075, + "mean_token_accuracy": 0.41379310488700866, + "step": 27200 + }, + { + "epoch": 0.027401164939804056, + "grad_norm": 18.853120486741567, + "learning_rate": 2.740064057369619e-05, + "loss": 2.3148, + "mean_token_accuracy": 0.37586206793785093, + "step": 27205 + }, + { + "epoch": 0.02740620099290823, + "grad_norm": 16.511041412220326, + "learning_rate": 2.7405676530427253e-05, + "loss": 2.6131, + "mean_token_accuracy": 0.3793103516101837, + "step": 27210 + }, + { + "epoch": 0.027411237046012404, + "grad_norm": 19.001390329784446, + "learning_rate": 2.7410712487158312e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.3793103337287903, + "step": 27215 + }, + { + "epoch": 0.027416273099116574, + "grad_norm": 18.111528051606957, + "learning_rate": 2.741574844388937e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.41034482717514037, + "step": 27220 + }, + { + "epoch": 0.027421309152220748, + "grad_norm": 15.084221409984176, + "learning_rate": 2.742078440062043e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.4068965494632721, + "step": 27225 + }, + { + "epoch": 0.02742634520532492, + "grad_norm": 16.546786983202054, + "learning_rate": 2.742582035735149e-05, + "loss": 2.5995, + "mean_token_accuracy": 0.41034482419490814, + "step": 27230 + }, + { + "epoch": 0.027431381258429095, + "grad_norm": 27.30198478534222, + "learning_rate": 2.743085631408255e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.4379310369491577, + "step": 27235 + }, + { + "epoch": 0.027436417311533266, + "grad_norm": 19.04729231564735, + "learning_rate": 2.7435892270813612e-05, + "loss": 2.4831, + "mean_token_accuracy": 0.39655172228813174, + "step": 27240 + }, + { + "epoch": 0.02744145336463744, + "grad_norm": 15.459256543940615, + "learning_rate": 2.744092822754467e-05, + "loss": 2.1928, + "mean_token_accuracy": 0.4275862157344818, + "step": 27245 + }, + { + "epoch": 0.027446489417741613, + "grad_norm": 17.565142652486028, + "learning_rate": 2.744596418427573e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.45517241954803467, + "step": 27250 + }, + { + "epoch": 0.027451525470845783, + "grad_norm": 16.007976885258323, + "learning_rate": 2.745100014100679e-05, + "loss": 2.6119, + "mean_token_accuracy": 0.4034482717514038, + "step": 27255 + }, + { + "epoch": 0.027456561523949957, + "grad_norm": 16.704478205800484, + "learning_rate": 2.745603609773785e-05, + "loss": 2.2223, + "mean_token_accuracy": 0.44482758045196535, + "step": 27260 + }, + { + "epoch": 0.02746159757705413, + "grad_norm": 20.047154352302815, + "learning_rate": 2.746107205446891e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.42857142686843874, + "step": 27265 + }, + { + "epoch": 0.027466633630158305, + "grad_norm": 17.67405689327055, + "learning_rate": 2.746610801119997e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.4379310369491577, + "step": 27270 + }, + { + "epoch": 0.027471669683262475, + "grad_norm": 17.399814925804186, + "learning_rate": 2.747114396793103e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.4034482717514038, + "step": 27275 + }, + { + "epoch": 0.02747670573636665, + "grad_norm": 17.09992509959253, + "learning_rate": 2.747617992466209e-05, + "loss": 2.8556, + "mean_token_accuracy": 0.3344827562570572, + "step": 27280 + }, + { + "epoch": 0.027481741789470823, + "grad_norm": 16.59739341088239, + "learning_rate": 2.7481215881393145e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.4536600112915039, + "step": 27285 + }, + { + "epoch": 0.027486777842574993, + "grad_norm": 13.747106996107085, + "learning_rate": 2.748625183812421e-05, + "loss": 2.673, + "mean_token_accuracy": 0.38771929740905764, + "step": 27290 + }, + { + "epoch": 0.027491813895679167, + "grad_norm": 16.07875722969407, + "learning_rate": 2.749128779485527e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.38620689511299133, + "step": 27295 + }, + { + "epoch": 0.02749684994878334, + "grad_norm": 16.95683051408008, + "learning_rate": 2.749632375158633e-05, + "loss": 2.363, + "mean_token_accuracy": 0.4586206912994385, + "step": 27300 + }, + { + "epoch": 0.027501886001887514, + "grad_norm": 16.795844924933615, + "learning_rate": 2.7501359708317386e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.5225650310516358, + "step": 27305 + }, + { + "epoch": 0.027506922054991684, + "grad_norm": 17.266893980462907, + "learning_rate": 2.7506395665048445e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4744101643562317, + "step": 27310 + }, + { + "epoch": 0.027511958108095858, + "grad_norm": 16.273114282901634, + "learning_rate": 2.7511431621779504e-05, + "loss": 2.319, + "mean_token_accuracy": 0.37241379022598264, + "step": 27315 + }, + { + "epoch": 0.027516994161200032, + "grad_norm": 19.307710076705433, + "learning_rate": 2.751646757851057e-05, + "loss": 2.5767, + "mean_token_accuracy": 0.34137930274009703, + "step": 27320 + }, + { + "epoch": 0.027522030214304202, + "grad_norm": 30.43067687637569, + "learning_rate": 2.7521503535241626e-05, + "loss": 2.6254, + "mean_token_accuracy": 0.39655172228813174, + "step": 27325 + }, + { + "epoch": 0.027527066267408376, + "grad_norm": 18.374438326215635, + "learning_rate": 2.7526539491972685e-05, + "loss": 2.5149, + "mean_token_accuracy": 0.39655172228813174, + "step": 27330 + }, + { + "epoch": 0.02753210232051255, + "grad_norm": 26.129356140041466, + "learning_rate": 2.7531575448703744e-05, + "loss": 2.8693, + "mean_token_accuracy": 0.41034482717514037, + "step": 27335 + }, + { + "epoch": 0.027537138373616724, + "grad_norm": 15.796704079911935, + "learning_rate": 2.7536611405434804e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.40344828367233276, + "step": 27340 + }, + { + "epoch": 0.027542174426720894, + "grad_norm": 16.69399783766348, + "learning_rate": 2.7541647362165866e-05, + "loss": 2.636, + "mean_token_accuracy": 0.37241379022598264, + "step": 27345 + }, + { + "epoch": 0.027547210479825068, + "grad_norm": 14.943979543328604, + "learning_rate": 2.7546683318896926e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.41512401700019835, + "step": 27350 + }, + { + "epoch": 0.02755224653292924, + "grad_norm": 18.816893755896213, + "learning_rate": 2.7551719275627985e-05, + "loss": 2.511, + "mean_token_accuracy": 0.42068964838981626, + "step": 27355 + }, + { + "epoch": 0.02755728258603341, + "grad_norm": 17.908242418328797, + "learning_rate": 2.7556755232359044e-05, + "loss": 2.5565, + "mean_token_accuracy": 0.3793103337287903, + "step": 27360 + }, + { + "epoch": 0.027562318639137585, + "grad_norm": 17.804988205490563, + "learning_rate": 2.7561791189090103e-05, + "loss": 2.0971, + "mean_token_accuracy": 0.4620689630508423, + "step": 27365 + }, + { + "epoch": 0.02756735469224176, + "grad_norm": 15.87490201353111, + "learning_rate": 2.7566827145821166e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.41379310488700866, + "step": 27370 + }, + { + "epoch": 0.027572390745345933, + "grad_norm": 21.193956710503894, + "learning_rate": 2.7571863102552225e-05, + "loss": 2.8235, + "mean_token_accuracy": 0.4068965554237366, + "step": 27375 + }, + { + "epoch": 0.027577426798450103, + "grad_norm": 18.552782019103464, + "learning_rate": 2.7576899059283285e-05, + "loss": 2.6063, + "mean_token_accuracy": 0.41724137365818026, + "step": 27380 + }, + { + "epoch": 0.027582462851554277, + "grad_norm": 21.421510709269537, + "learning_rate": 2.7581935016014344e-05, + "loss": 2.0666, + "mean_token_accuracy": 0.4620689690113068, + "step": 27385 + }, + { + "epoch": 0.02758749890465845, + "grad_norm": 17.44875880129707, + "learning_rate": 2.7586970972745403e-05, + "loss": 2.8058, + "mean_token_accuracy": 0.3655172407627106, + "step": 27390 + }, + { + "epoch": 0.02759253495776262, + "grad_norm": 20.995873321355464, + "learning_rate": 2.759200692947646e-05, + "loss": 2.3796, + "mean_token_accuracy": 0.3965517163276672, + "step": 27395 + }, + { + "epoch": 0.027597571010866795, + "grad_norm": 13.63328799253174, + "learning_rate": 2.7597042886207525e-05, + "loss": 2.7358, + "mean_token_accuracy": 0.3379310369491577, + "step": 27400 + }, + { + "epoch": 0.02760260706397097, + "grad_norm": 16.588727252003245, + "learning_rate": 2.7602078842938584e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.4227465093135834, + "step": 27405 + }, + { + "epoch": 0.027607643117075142, + "grad_norm": 23.59285830835005, + "learning_rate": 2.7607114799669643e-05, + "loss": 2.2749, + "mean_token_accuracy": 0.46551724672317507, + "step": 27410 + }, + { + "epoch": 0.027612679170179313, + "grad_norm": 20.440765333056376, + "learning_rate": 2.76121507564007e-05, + "loss": 2.8228, + "mean_token_accuracy": 0.36896550953388213, + "step": 27415 + }, + { + "epoch": 0.027617715223283486, + "grad_norm": 20.91788641667471, + "learning_rate": 2.761718671313176e-05, + "loss": 2.9673, + "mean_token_accuracy": 0.32413792312145234, + "step": 27420 + }, + { + "epoch": 0.02762275127638766, + "grad_norm": 16.396285605752766, + "learning_rate": 2.7622222669862825e-05, + "loss": 1.8723, + "mean_token_accuracy": 0.5275862038135528, + "step": 27425 + }, + { + "epoch": 0.02762778732949183, + "grad_norm": 18.880705172944772, + "learning_rate": 2.7627258626593884e-05, + "loss": 2.7126, + "mean_token_accuracy": 0.41034483909606934, + "step": 27430 + }, + { + "epoch": 0.027632823382596004, + "grad_norm": 16.92064894459665, + "learning_rate": 2.763229458332494e-05, + "loss": 3.0504, + "mean_token_accuracy": 0.33793103992938994, + "step": 27435 + }, + { + "epoch": 0.027637859435700178, + "grad_norm": 16.878082120130355, + "learning_rate": 2.7637330540056e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.4034482777118683, + "step": 27440 + }, + { + "epoch": 0.027642895488804352, + "grad_norm": 18.289697322325747, + "learning_rate": 2.7642366496787058e-05, + "loss": 2.6675, + "mean_token_accuracy": 0.37586206793785093, + "step": 27445 + }, + { + "epoch": 0.027647931541908522, + "grad_norm": 19.102930256434053, + "learning_rate": 2.7647402453518124e-05, + "loss": 2.6065, + "mean_token_accuracy": 0.41034482717514037, + "step": 27450 + }, + { + "epoch": 0.027652967595012696, + "grad_norm": 15.220962965503691, + "learning_rate": 2.7652438410249183e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.4068965554237366, + "step": 27455 + }, + { + "epoch": 0.02765800364811687, + "grad_norm": 17.91015905054953, + "learning_rate": 2.765747436698024e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.41034482717514037, + "step": 27460 + }, + { + "epoch": 0.02766303970122104, + "grad_norm": 18.248586119445473, + "learning_rate": 2.76625103237113e-05, + "loss": 2.8705, + "mean_token_accuracy": 0.41034482717514037, + "step": 27465 + }, + { + "epoch": 0.027668075754325214, + "grad_norm": 20.875979916301507, + "learning_rate": 2.7667546280442358e-05, + "loss": 2.6522, + "mean_token_accuracy": 0.3965517163276672, + "step": 27470 + }, + { + "epoch": 0.027673111807429387, + "grad_norm": 17.518254495629723, + "learning_rate": 2.7672582237173424e-05, + "loss": 2.618, + "mean_token_accuracy": 0.4, + "step": 27475 + }, + { + "epoch": 0.02767814786053356, + "grad_norm": 20.144061773995343, + "learning_rate": 2.767761819390448e-05, + "loss": 2.7964, + "mean_token_accuracy": 0.39655172228813174, + "step": 27480 + }, + { + "epoch": 0.02768318391363773, + "grad_norm": 17.486141851878344, + "learning_rate": 2.768265415063554e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.4103448331356049, + "step": 27485 + }, + { + "epoch": 0.027688219966741905, + "grad_norm": 13.308094106974512, + "learning_rate": 2.7687690107366598e-05, + "loss": 2.3088, + "mean_token_accuracy": 0.4363581359386444, + "step": 27490 + }, + { + "epoch": 0.02769325601984608, + "grad_norm": 18.10915018593621, + "learning_rate": 2.7692726064097657e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.39310344457626345, + "step": 27495 + }, + { + "epoch": 0.02769829207295025, + "grad_norm": 16.233617345222303, + "learning_rate": 2.7697762020828717e-05, + "loss": 3.003, + "mean_token_accuracy": 0.38965516686439516, + "step": 27500 + }, + { + "epoch": 0.027703328126054423, + "grad_norm": 19.210116288880382, + "learning_rate": 2.770279797755978e-05, + "loss": 2.6668, + "mean_token_accuracy": 0.36551723480224607, + "step": 27505 + }, + { + "epoch": 0.027708364179158597, + "grad_norm": 18.12197878734856, + "learning_rate": 2.770783393429084e-05, + "loss": 2.4597, + "mean_token_accuracy": 0.41724138259887694, + "step": 27510 + }, + { + "epoch": 0.02771340023226277, + "grad_norm": 13.862625792092306, + "learning_rate": 2.7712869891021898e-05, + "loss": 2.6043, + "mean_token_accuracy": 0.4, + "step": 27515 + }, + { + "epoch": 0.02771843628536694, + "grad_norm": 18.153046794711525, + "learning_rate": 2.7717905847752957e-05, + "loss": 2.2161, + "mean_token_accuracy": 0.44996975660324096, + "step": 27520 + }, + { + "epoch": 0.027723472338471115, + "grad_norm": 18.425282122017222, + "learning_rate": 2.7722941804484016e-05, + "loss": 2.5786, + "mean_token_accuracy": 0.3620689630508423, + "step": 27525 + }, + { + "epoch": 0.02772850839157529, + "grad_norm": 14.436900841502986, + "learning_rate": 2.772797776121508e-05, + "loss": 2.6201, + "mean_token_accuracy": 0.4034482717514038, + "step": 27530 + }, + { + "epoch": 0.02773354444467946, + "grad_norm": 20.345230198578577, + "learning_rate": 2.7733013717946138e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.4137930929660797, + "step": 27535 + }, + { + "epoch": 0.027738580497783633, + "grad_norm": 16.619536035295685, + "learning_rate": 2.7738049674677198e-05, + "loss": 2.8645, + "mean_token_accuracy": 0.37586206793785093, + "step": 27540 + }, + { + "epoch": 0.027743616550887806, + "grad_norm": 17.963895363822328, + "learning_rate": 2.7743085631408257e-05, + "loss": 2.2363, + "mean_token_accuracy": 0.4194192349910736, + "step": 27545 + }, + { + "epoch": 0.02774865260399198, + "grad_norm": 15.686307235911388, + "learning_rate": 2.7748121588139313e-05, + "loss": 2.4027, + "mean_token_accuracy": 0.4517241418361664, + "step": 27550 + }, + { + "epoch": 0.02775368865709615, + "grad_norm": 15.522724948962093, + "learning_rate": 2.775315754487038e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.42758620381355283, + "step": 27555 + }, + { + "epoch": 0.027758724710200324, + "grad_norm": 16.891427149175534, + "learning_rate": 2.7758193501601438e-05, + "loss": 2.3729, + "mean_token_accuracy": 0.44827587008476255, + "step": 27560 + }, + { + "epoch": 0.027763760763304498, + "grad_norm": 15.43213113576706, + "learning_rate": 2.7763229458332497e-05, + "loss": 2.3027, + "mean_token_accuracy": 0.43968542814254763, + "step": 27565 + }, + { + "epoch": 0.027768796816408668, + "grad_norm": 20.47605482607052, + "learning_rate": 2.7768265415063553e-05, + "loss": 2.5564, + "mean_token_accuracy": 0.3862069010734558, + "step": 27570 + }, + { + "epoch": 0.027773832869512842, + "grad_norm": 16.04500676165232, + "learning_rate": 2.7773301371794612e-05, + "loss": 2.4943, + "mean_token_accuracy": 0.41724138259887694, + "step": 27575 + }, + { + "epoch": 0.027778868922617016, + "grad_norm": 19.208416667036083, + "learning_rate": 2.777833732852567e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.42758620381355283, + "step": 27580 + }, + { + "epoch": 0.02778390497572119, + "grad_norm": 15.891822868563045, + "learning_rate": 2.7783373285256738e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.4275862157344818, + "step": 27585 + }, + { + "epoch": 0.02778894102882536, + "grad_norm": 22.853696288490546, + "learning_rate": 2.7788409241987793e-05, + "loss": 2.7407, + "mean_token_accuracy": 0.35862069129943847, + "step": 27590 + }, + { + "epoch": 0.027793977081929534, + "grad_norm": 15.34662176986484, + "learning_rate": 2.7793445198718853e-05, + "loss": 2.5677, + "mean_token_accuracy": 0.46293104290962217, + "step": 27595 + }, + { + "epoch": 0.027799013135033707, + "grad_norm": 31.111726459828645, + "learning_rate": 2.7798481155449912e-05, + "loss": 2.5319, + "mean_token_accuracy": 0.4206896543502808, + "step": 27600 + }, + { + "epoch": 0.027804049188137878, + "grad_norm": 19.55455983453019, + "learning_rate": 2.780351711218097e-05, + "loss": 2.447, + "mean_token_accuracy": 0.4086509346961975, + "step": 27605 + }, + { + "epoch": 0.02780908524124205, + "grad_norm": 25.970737043944847, + "learning_rate": 2.7808553068912034e-05, + "loss": 2.8492, + "mean_token_accuracy": 0.3551724135875702, + "step": 27610 + }, + { + "epoch": 0.027814121294346225, + "grad_norm": 15.495540722126387, + "learning_rate": 2.7813589025643093e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.41379311084747317, + "step": 27615 + }, + { + "epoch": 0.0278191573474504, + "grad_norm": 20.555353907013142, + "learning_rate": 2.7818624982374152e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.4517241418361664, + "step": 27620 + }, + { + "epoch": 0.02782419340055457, + "grad_norm": 15.137720618809404, + "learning_rate": 2.782366093910521e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.41034482717514037, + "step": 27625 + }, + { + "epoch": 0.027829229453658743, + "grad_norm": 22.703246825761113, + "learning_rate": 2.782869689583627e-05, + "loss": 2.618, + "mean_token_accuracy": 0.42068964838981626, + "step": 27630 + }, + { + "epoch": 0.027834265506762917, + "grad_norm": 14.773112132307784, + "learning_rate": 2.7833732852567334e-05, + "loss": 2.7074, + "mean_token_accuracy": 0.36896551847457887, + "step": 27635 + }, + { + "epoch": 0.027839301559867087, + "grad_norm": 15.426159256590825, + "learning_rate": 2.7838768809298393e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.4862069010734558, + "step": 27640 + }, + { + "epoch": 0.02784433761297126, + "grad_norm": 25.194763368924228, + "learning_rate": 2.7843804766029452e-05, + "loss": 2.497, + "mean_token_accuracy": 0.4379310369491577, + "step": 27645 + }, + { + "epoch": 0.027849373666075435, + "grad_norm": 14.246080655706088, + "learning_rate": 2.784884072276051e-05, + "loss": 2.8493, + "mean_token_accuracy": 0.3862068891525269, + "step": 27650 + }, + { + "epoch": 0.02785440971917961, + "grad_norm": 18.474287584549504, + "learning_rate": 2.785387667949157e-05, + "loss": 2.4052, + "mean_token_accuracy": 0.4, + "step": 27655 + }, + { + "epoch": 0.02785944577228378, + "grad_norm": 16.71409043233609, + "learning_rate": 2.785891263622263e-05, + "loss": 2.8911, + "mean_token_accuracy": 0.4344827651977539, + "step": 27660 + }, + { + "epoch": 0.027864481825387952, + "grad_norm": 20.090150364568835, + "learning_rate": 2.7863948592953692e-05, + "loss": 2.5883, + "mean_token_accuracy": 0.41536598801612856, + "step": 27665 + }, + { + "epoch": 0.027869517878492126, + "grad_norm": 16.882301367990504, + "learning_rate": 2.786898454968475e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.4, + "step": 27670 + }, + { + "epoch": 0.027874553931596296, + "grad_norm": 15.74960710091616, + "learning_rate": 2.787402050641581e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.3931034505367279, + "step": 27675 + }, + { + "epoch": 0.02787958998470047, + "grad_norm": 18.67177149607671, + "learning_rate": 2.787905646314687e-05, + "loss": 2.2864, + "mean_token_accuracy": 0.4482758641242981, + "step": 27680 + }, + { + "epoch": 0.027884626037804644, + "grad_norm": 19.05106213101106, + "learning_rate": 2.7884092419877926e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.3482758641242981, + "step": 27685 + }, + { + "epoch": 0.027889662090908818, + "grad_norm": 21.507768234466003, + "learning_rate": 2.7889128376608992e-05, + "loss": 2.6816, + "mean_token_accuracy": 0.3827586233615875, + "step": 27690 + }, + { + "epoch": 0.027894698144012988, + "grad_norm": 15.344116964323065, + "learning_rate": 2.789416433334005e-05, + "loss": 2.0571, + "mean_token_accuracy": 0.4886267364025116, + "step": 27695 + }, + { + "epoch": 0.027899734197117162, + "grad_norm": 19.620085004026407, + "learning_rate": 2.789920029007111e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.3586206823587418, + "step": 27700 + }, + { + "epoch": 0.027904770250221336, + "grad_norm": 14.550015380850073, + "learning_rate": 2.7904236246802166e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.4310344815254211, + "step": 27705 + }, + { + "epoch": 0.027909806303325506, + "grad_norm": 17.46027914323568, + "learning_rate": 2.7909272203533226e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.42758620381355283, + "step": 27710 + }, + { + "epoch": 0.02791484235642968, + "grad_norm": 20.0772271958674, + "learning_rate": 2.7914308160264292e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.43103447556495667, + "step": 27715 + }, + { + "epoch": 0.027919878409533853, + "grad_norm": 17.585069722639467, + "learning_rate": 2.791934411699535e-05, + "loss": 2.5077, + "mean_token_accuracy": 0.4034482717514038, + "step": 27720 + }, + { + "epoch": 0.027924914462638027, + "grad_norm": 23.138571693140538, + "learning_rate": 2.7924380073726407e-05, + "loss": 2.471, + "mean_token_accuracy": 0.42552934885025023, + "step": 27725 + }, + { + "epoch": 0.027929950515742197, + "grad_norm": 17.546299031505505, + "learning_rate": 2.7929416030457466e-05, + "loss": 2.2751, + "mean_token_accuracy": 0.46551724672317507, + "step": 27730 + }, + { + "epoch": 0.02793498656884637, + "grad_norm": 15.976702165631282, + "learning_rate": 2.7934451987188525e-05, + "loss": 2.4993, + "mean_token_accuracy": 0.4204476773738861, + "step": 27735 + }, + { + "epoch": 0.027940022621950545, + "grad_norm": 18.01271129798969, + "learning_rate": 2.7939487943919585e-05, + "loss": 2.5559, + "mean_token_accuracy": 0.3793103456497192, + "step": 27740 + }, + { + "epoch": 0.027945058675054715, + "grad_norm": 14.834110003270903, + "learning_rate": 2.7944523900650647e-05, + "loss": 2.5397, + "mean_token_accuracy": 0.44827587008476255, + "step": 27745 + }, + { + "epoch": 0.02795009472815889, + "grad_norm": 17.94608178767141, + "learning_rate": 2.7949559857381706e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.47241378426551817, + "step": 27750 + }, + { + "epoch": 0.027955130781263063, + "grad_norm": 18.804822890663335, + "learning_rate": 2.7954595814112766e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.4310344815254211, + "step": 27755 + }, + { + "epoch": 0.027960166834367237, + "grad_norm": 16.39515656859393, + "learning_rate": 2.7959631770843825e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.3896551728248596, + "step": 27760 + }, + { + "epoch": 0.027965202887471407, + "grad_norm": 28.065565299964494, + "learning_rate": 2.7964667727574884e-05, + "loss": 2.968, + "mean_token_accuracy": 0.3655172407627106, + "step": 27765 + }, + { + "epoch": 0.02797023894057558, + "grad_norm": 17.65372005465926, + "learning_rate": 2.7969703684305947e-05, + "loss": 2.7629, + "mean_token_accuracy": 0.36896551847457887, + "step": 27770 + }, + { + "epoch": 0.027975274993679754, + "grad_norm": 21.72268211220837, + "learning_rate": 2.7974739641037006e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.41379310488700866, + "step": 27775 + }, + { + "epoch": 0.027980311046783925, + "grad_norm": 19.954780967081327, + "learning_rate": 2.7979775597768065e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.41724138259887694, + "step": 27780 + }, + { + "epoch": 0.0279853470998881, + "grad_norm": 14.575487488474167, + "learning_rate": 2.7984811554499125e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.4034482777118683, + "step": 27785 + }, + { + "epoch": 0.027990383152992272, + "grad_norm": 18.321923707770228, + "learning_rate": 2.7989847511230184e-05, + "loss": 2.6048, + "mean_token_accuracy": 0.39310344457626345, + "step": 27790 + }, + { + "epoch": 0.027995419206096446, + "grad_norm": 17.302357428247525, + "learning_rate": 2.7994883467961247e-05, + "loss": 2.226, + "mean_token_accuracy": 0.4052631616592407, + "step": 27795 + }, + { + "epoch": 0.028000455259200616, + "grad_norm": 16.25255576134554, + "learning_rate": 2.7999919424692306e-05, + "loss": 2.5809, + "mean_token_accuracy": 0.4068965494632721, + "step": 27800 + }, + { + "epoch": 0.02800549131230479, + "grad_norm": 19.39570295181595, + "learning_rate": 2.8004955381423365e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.3827586233615875, + "step": 27805 + }, + { + "epoch": 0.028010527365408964, + "grad_norm": 16.37041238981368, + "learning_rate": 2.8009991338154424e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.4034482777118683, + "step": 27810 + }, + { + "epoch": 0.028015563418513134, + "grad_norm": 89.30411450683553, + "learning_rate": 2.8015027294885484e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.43177340626716615, + "step": 27815 + }, + { + "epoch": 0.028020599471617308, + "grad_norm": 21.527221565624284, + "learning_rate": 2.8020063251616546e-05, + "loss": 3.0379, + "mean_token_accuracy": 0.3620689630508423, + "step": 27820 + }, + { + "epoch": 0.02802563552472148, + "grad_norm": 12.733251531637329, + "learning_rate": 2.8025099208347605e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.4333938300609589, + "step": 27825 + }, + { + "epoch": 0.028030671577825655, + "grad_norm": 16.430781808910176, + "learning_rate": 2.8030135165078665e-05, + "loss": 2.1636, + "mean_token_accuracy": 0.4310344815254211, + "step": 27830 + }, + { + "epoch": 0.028035707630929826, + "grad_norm": 15.884619870136369, + "learning_rate": 2.8035171121809724e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.39310344457626345, + "step": 27835 + }, + { + "epoch": 0.028040743684034, + "grad_norm": 18.34668658156639, + "learning_rate": 2.804020707854078e-05, + "loss": 2.206, + "mean_token_accuracy": 0.48511797189712524, + "step": 27840 + }, + { + "epoch": 0.028045779737138173, + "grad_norm": 18.610599551933596, + "learning_rate": 2.804524303527184e-05, + "loss": 2.3253, + "mean_token_accuracy": 0.4482758641242981, + "step": 27845 + }, + { + "epoch": 0.028050815790242344, + "grad_norm": 16.743486299083173, + "learning_rate": 2.8050278992002905e-05, + "loss": 2.5761, + "mean_token_accuracy": 0.4103448331356049, + "step": 27850 + }, + { + "epoch": 0.028055851843346517, + "grad_norm": 18.150593909167224, + "learning_rate": 2.8055314948733964e-05, + "loss": 2.42, + "mean_token_accuracy": 0.4448275864124298, + "step": 27855 + }, + { + "epoch": 0.02806088789645069, + "grad_norm": 14.519229200680817, + "learning_rate": 2.806035090546502e-05, + "loss": 2.1876, + "mean_token_accuracy": 0.47428917288780215, + "step": 27860 + }, + { + "epoch": 0.028065923949554865, + "grad_norm": 19.056812244405776, + "learning_rate": 2.806538686219608e-05, + "loss": 2.1612, + "mean_token_accuracy": 0.4379310369491577, + "step": 27865 + }, + { + "epoch": 0.028070960002659035, + "grad_norm": 24.010320458535293, + "learning_rate": 2.807042281892714e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.3793103516101837, + "step": 27870 + }, + { + "epoch": 0.02807599605576321, + "grad_norm": 21.060408677319874, + "learning_rate": 2.8075458775658205e-05, + "loss": 2.6736, + "mean_token_accuracy": 0.3655172407627106, + "step": 27875 + }, + { + "epoch": 0.028081032108867383, + "grad_norm": 22.289764026945186, + "learning_rate": 2.808049473238926e-05, + "loss": 1.9696, + "mean_token_accuracy": 0.4551724255084991, + "step": 27880 + }, + { + "epoch": 0.028086068161971553, + "grad_norm": 19.05755364109824, + "learning_rate": 2.808553068912032e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.4068965494632721, + "step": 27885 + }, + { + "epoch": 0.028091104215075727, + "grad_norm": 15.319318103975831, + "learning_rate": 2.809056664585138e-05, + "loss": 2.8223, + "mean_token_accuracy": 0.3551724135875702, + "step": 27890 + }, + { + "epoch": 0.0280961402681799, + "grad_norm": 15.738098348745073, + "learning_rate": 2.809560260258244e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.41379310488700866, + "step": 27895 + }, + { + "epoch": 0.028101176321284074, + "grad_norm": 15.332045566305235, + "learning_rate": 2.81006385593135e-05, + "loss": 2.7991, + "mean_token_accuracy": 0.3793103456497192, + "step": 27900 + }, + { + "epoch": 0.028106212374388245, + "grad_norm": 21.789178010342635, + "learning_rate": 2.810567451604456e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.43448275327682495, + "step": 27905 + }, + { + "epoch": 0.02811124842749242, + "grad_norm": 19.778918854279127, + "learning_rate": 2.811071047277562e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.4103448331356049, + "step": 27910 + }, + { + "epoch": 0.028116284480596592, + "grad_norm": 20.28958993987868, + "learning_rate": 2.811574642950668e-05, + "loss": 2.7128, + "mean_token_accuracy": 0.3448275804519653, + "step": 27915 + }, + { + "epoch": 0.028121320533700762, + "grad_norm": 18.103177879446424, + "learning_rate": 2.8120782386237738e-05, + "loss": 2.4406, + "mean_token_accuracy": 0.4310344815254211, + "step": 27920 + }, + { + "epoch": 0.028126356586804936, + "grad_norm": 14.469902032746289, + "learning_rate": 2.8125818342968797e-05, + "loss": 2.5093, + "mean_token_accuracy": 0.3965517163276672, + "step": 27925 + }, + { + "epoch": 0.02813139263990911, + "grad_norm": 20.27840409738902, + "learning_rate": 2.813085429969986e-05, + "loss": 2.418, + "mean_token_accuracy": 0.41379310488700866, + "step": 27930 + }, + { + "epoch": 0.028136428693013284, + "grad_norm": 15.40105445725144, + "learning_rate": 2.813589025643092e-05, + "loss": 2.2511, + "mean_token_accuracy": 0.3999999940395355, + "step": 27935 + }, + { + "epoch": 0.028141464746117454, + "grad_norm": 18.36958005639998, + "learning_rate": 2.814092621316198e-05, + "loss": 2.5261, + "mean_token_accuracy": 0.4275861978530884, + "step": 27940 + }, + { + "epoch": 0.028146500799221628, + "grad_norm": 12.681719289106361, + "learning_rate": 2.8145962169893038e-05, + "loss": 2.5071, + "mean_token_accuracy": 0.3862069010734558, + "step": 27945 + }, + { + "epoch": 0.0281515368523258, + "grad_norm": 14.039456793214061, + "learning_rate": 2.8150998126624094e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.45862069725990295, + "step": 27950 + }, + { + "epoch": 0.028156572905429972, + "grad_norm": 28.882402929753226, + "learning_rate": 2.815603408335516e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.46551724672317507, + "step": 27955 + }, + { + "epoch": 0.028161608958534146, + "grad_norm": 18.353488520838976, + "learning_rate": 2.816107004008622e-05, + "loss": 2.3858, + "mean_token_accuracy": 0.44827585816383364, + "step": 27960 + }, + { + "epoch": 0.02816664501163832, + "grad_norm": 16.682382985152902, + "learning_rate": 2.8166105996817278e-05, + "loss": 2.2709, + "mean_token_accuracy": 0.44482759237289426, + "step": 27965 + }, + { + "epoch": 0.028171681064742493, + "grad_norm": 15.009710129735078, + "learning_rate": 2.8171141953548334e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.35862069129943847, + "step": 27970 + }, + { + "epoch": 0.028176717117846663, + "grad_norm": 17.95951643713577, + "learning_rate": 2.8176177910279393e-05, + "loss": 2.4578, + "mean_token_accuracy": 0.3919540226459503, + "step": 27975 + }, + { + "epoch": 0.028181753170950837, + "grad_norm": 18.327300901465197, + "learning_rate": 2.818121386701046e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.42758620977401735, + "step": 27980 + }, + { + "epoch": 0.02818678922405501, + "grad_norm": 22.54786310399626, + "learning_rate": 2.818624982374152e-05, + "loss": 2.2914, + "mean_token_accuracy": 0.42413792610168455, + "step": 27985 + }, + { + "epoch": 0.02819182527715918, + "grad_norm": 19.95293805187181, + "learning_rate": 2.8191285780472578e-05, + "loss": 2.6907, + "mean_token_accuracy": 0.36660616993904116, + "step": 27990 + }, + { + "epoch": 0.028196861330263355, + "grad_norm": 19.0165389938536, + "learning_rate": 2.8196321737203634e-05, + "loss": 2.5573, + "mean_token_accuracy": 0.36551724672317504, + "step": 27995 + }, + { + "epoch": 0.02820189738336753, + "grad_norm": 15.219067319872153, + "learning_rate": 2.8201357693934693e-05, + "loss": 2.422, + "mean_token_accuracy": 0.42928009033203124, + "step": 28000 + }, + { + "epoch": 0.028206933436471703, + "grad_norm": 17.632750790599026, + "learning_rate": 2.8206393650665752e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.42758620381355283, + "step": 28005 + }, + { + "epoch": 0.028211969489575873, + "grad_norm": 17.614657994248734, + "learning_rate": 2.8211429607396818e-05, + "loss": 2.6753, + "mean_token_accuracy": 0.3965517282485962, + "step": 28010 + }, + { + "epoch": 0.028217005542680047, + "grad_norm": 16.826485508318573, + "learning_rate": 2.8216465564127874e-05, + "loss": 2.7549, + "mean_token_accuracy": 0.41379311084747317, + "step": 28015 + }, + { + "epoch": 0.02822204159578422, + "grad_norm": 13.056252781451715, + "learning_rate": 2.8221501520858933e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.43103448748588563, + "step": 28020 + }, + { + "epoch": 0.02822707764888839, + "grad_norm": 55.24421861421408, + "learning_rate": 2.8226537477589993e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.37241379618644715, + "step": 28025 + }, + { + "epoch": 0.028232113701992564, + "grad_norm": 17.498126633231717, + "learning_rate": 2.8231573434321052e-05, + "loss": 2.5553, + "mean_token_accuracy": 0.441379314661026, + "step": 28030 + }, + { + "epoch": 0.028237149755096738, + "grad_norm": 15.421492339228518, + "learning_rate": 2.8236609391052114e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.4034482717514038, + "step": 28035 + }, + { + "epoch": 0.028242185808200912, + "grad_norm": 19.344334330590048, + "learning_rate": 2.8241645347783174e-05, + "loss": 2.7456, + "mean_token_accuracy": 0.36551724672317504, + "step": 28040 + }, + { + "epoch": 0.028247221861305082, + "grad_norm": 19.202701143764326, + "learning_rate": 2.8246681304514233e-05, + "loss": 2.7319, + "mean_token_accuracy": 0.3862069010734558, + "step": 28045 + }, + { + "epoch": 0.028252257914409256, + "grad_norm": 17.71799314885439, + "learning_rate": 2.8251717261245292e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.3551724076271057, + "step": 28050 + }, + { + "epoch": 0.02825729396751343, + "grad_norm": 23.302681906181128, + "learning_rate": 2.825675321797635e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.48275861144065857, + "step": 28055 + }, + { + "epoch": 0.0282623300206176, + "grad_norm": 16.5962225829614, + "learning_rate": 2.8261789174707414e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.43974592089653014, + "step": 28060 + }, + { + "epoch": 0.028267366073721774, + "grad_norm": 17.581021132250964, + "learning_rate": 2.8266825131438473e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.42758620381355283, + "step": 28065 + }, + { + "epoch": 0.028272402126825948, + "grad_norm": 14.75800334350753, + "learning_rate": 2.8271861088169533e-05, + "loss": 2.4331, + "mean_token_accuracy": 0.3999999940395355, + "step": 28070 + }, + { + "epoch": 0.02827743817993012, + "grad_norm": 17.60937388031897, + "learning_rate": 2.8276897044900592e-05, + "loss": 2.313, + "mean_token_accuracy": 0.4344827473163605, + "step": 28075 + }, + { + "epoch": 0.02828247423303429, + "grad_norm": 16.604390218683527, + "learning_rate": 2.828193300163165e-05, + "loss": 2.7307, + "mean_token_accuracy": 0.33448275923728943, + "step": 28080 + }, + { + "epoch": 0.028287510286138465, + "grad_norm": 17.94464888724479, + "learning_rate": 2.8286968958362707e-05, + "loss": 2.8678, + "mean_token_accuracy": 0.34137930274009703, + "step": 28085 + }, + { + "epoch": 0.02829254633924264, + "grad_norm": 19.651100900398013, + "learning_rate": 2.8292004915093773e-05, + "loss": 2.2848, + "mean_token_accuracy": 0.44137930274009707, + "step": 28090 + }, + { + "epoch": 0.02829758239234681, + "grad_norm": 15.295795302792653, + "learning_rate": 2.8297040871824832e-05, + "loss": 2.4788, + "mean_token_accuracy": 0.44137930274009707, + "step": 28095 + }, + { + "epoch": 0.028302618445450983, + "grad_norm": 20.682741585919363, + "learning_rate": 2.830207682855589e-05, + "loss": 2.7506, + "mean_token_accuracy": 0.3655172437429428, + "step": 28100 + }, + { + "epoch": 0.028307654498555157, + "grad_norm": 16.04253339594385, + "learning_rate": 2.8307112785286947e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.4344827592372894, + "step": 28105 + }, + { + "epoch": 0.02831269055165933, + "grad_norm": 23.49564760061073, + "learning_rate": 2.8312148742018007e-05, + "loss": 2.7526, + "mean_token_accuracy": 0.3586206823587418, + "step": 28110 + }, + { + "epoch": 0.0283177266047635, + "grad_norm": 16.536225308379713, + "learning_rate": 2.8317184698749073e-05, + "loss": 2.2987, + "mean_token_accuracy": 0.4379310369491577, + "step": 28115 + }, + { + "epoch": 0.028322762657867675, + "grad_norm": 18.153213257442527, + "learning_rate": 2.8322220655480132e-05, + "loss": 2.512, + "mean_token_accuracy": 0.39310344457626345, + "step": 28120 + }, + { + "epoch": 0.02832779871097185, + "grad_norm": 16.60307269598842, + "learning_rate": 2.8327256612211188e-05, + "loss": 2.6854, + "mean_token_accuracy": 0.358620685338974, + "step": 28125 + }, + { + "epoch": 0.02833283476407602, + "grad_norm": 32.37664369579404, + "learning_rate": 2.8332292568942247e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.4, + "step": 28130 + }, + { + "epoch": 0.028337870817180193, + "grad_norm": 17.726107727848664, + "learning_rate": 2.8337328525673306e-05, + "loss": 2.7216, + "mean_token_accuracy": 0.37931033968925476, + "step": 28135 + }, + { + "epoch": 0.028342906870284366, + "grad_norm": 18.92129243591575, + "learning_rate": 2.8342364482404372e-05, + "loss": 2.9826, + "mean_token_accuracy": 0.36896551847457887, + "step": 28140 + }, + { + "epoch": 0.02834794292338854, + "grad_norm": 21.34004038817204, + "learning_rate": 2.8347400439135428e-05, + "loss": 2.425, + "mean_token_accuracy": 0.375862056016922, + "step": 28145 + }, + { + "epoch": 0.02835297897649271, + "grad_norm": 73.87280373537293, + "learning_rate": 2.8352436395866487e-05, + "loss": 2.5554, + "mean_token_accuracy": 0.41724138259887694, + "step": 28150 + }, + { + "epoch": 0.028358015029596884, + "grad_norm": 22.390770447538266, + "learning_rate": 2.8357472352597547e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.43599515557289126, + "step": 28155 + }, + { + "epoch": 0.028363051082701058, + "grad_norm": 33.1309942449407, + "learning_rate": 2.8362508309328606e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.39655172228813174, + "step": 28160 + }, + { + "epoch": 0.02836808713580523, + "grad_norm": 22.962184957668015, + "learning_rate": 2.8367544266059665e-05, + "loss": 2.7354, + "mean_token_accuracy": 0.38275861740112305, + "step": 28165 + }, + { + "epoch": 0.028373123188909402, + "grad_norm": 19.250004778554494, + "learning_rate": 2.8372580222790728e-05, + "loss": 2.5642, + "mean_token_accuracy": 0.4531760334968567, + "step": 28170 + }, + { + "epoch": 0.028378159242013576, + "grad_norm": 15.826561314830004, + "learning_rate": 2.8377616179521787e-05, + "loss": 2.2403, + "mean_token_accuracy": 0.46896551847457885, + "step": 28175 + }, + { + "epoch": 0.02838319529511775, + "grad_norm": 21.04112169100893, + "learning_rate": 2.8382652136252846e-05, + "loss": 2.7231, + "mean_token_accuracy": 0.4052631616592407, + "step": 28180 + }, + { + "epoch": 0.02838823134822192, + "grad_norm": 25.12171502325933, + "learning_rate": 2.8387688092983906e-05, + "loss": 2.8073, + "mean_token_accuracy": 0.3638838529586792, + "step": 28185 + }, + { + "epoch": 0.028393267401326094, + "grad_norm": 18.11717143269485, + "learning_rate": 2.8392724049714965e-05, + "loss": 2.6618, + "mean_token_accuracy": 0.42068966329097746, + "step": 28190 + }, + { + "epoch": 0.028398303454430267, + "grad_norm": 16.415557152505404, + "learning_rate": 2.8397760006446027e-05, + "loss": 2.498, + "mean_token_accuracy": 0.4190562665462494, + "step": 28195 + }, + { + "epoch": 0.028403339507534438, + "grad_norm": 19.576931906091925, + "learning_rate": 2.8402795963177087e-05, + "loss": 2.4795, + "mean_token_accuracy": 0.4310344696044922, + "step": 28200 + }, + { + "epoch": 0.02840837556063861, + "grad_norm": 17.940512496304905, + "learning_rate": 2.8407831919908146e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.38620689511299133, + "step": 28205 + }, + { + "epoch": 0.028413411613742785, + "grad_norm": 17.566175018559427, + "learning_rate": 2.8412867876639205e-05, + "loss": 2.1915, + "mean_token_accuracy": 0.413793095946312, + "step": 28210 + }, + { + "epoch": 0.02841844766684696, + "grad_norm": 21.345729586068174, + "learning_rate": 2.8417903833370264e-05, + "loss": 2.8239, + "mean_token_accuracy": 0.3896551728248596, + "step": 28215 + }, + { + "epoch": 0.02842348371995113, + "grad_norm": 19.826280017647704, + "learning_rate": 2.8422939790101327e-05, + "loss": 2.41, + "mean_token_accuracy": 0.44137930274009707, + "step": 28220 + }, + { + "epoch": 0.028428519773055303, + "grad_norm": 14.174347006435658, + "learning_rate": 2.8427975746832386e-05, + "loss": 2.6227, + "mean_token_accuracy": 0.4620689630508423, + "step": 28225 + }, + { + "epoch": 0.028433555826159477, + "grad_norm": 19.85448790785396, + "learning_rate": 2.8433011703563446e-05, + "loss": 2.6685, + "mean_token_accuracy": 0.37241379618644715, + "step": 28230 + }, + { + "epoch": 0.028438591879263647, + "grad_norm": 23.46289325800877, + "learning_rate": 2.8438047660294505e-05, + "loss": 2.5836, + "mean_token_accuracy": 0.4009679317474365, + "step": 28235 + }, + { + "epoch": 0.02844362793236782, + "grad_norm": 16.521132576753825, + "learning_rate": 2.844308361702556e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.44482759237289426, + "step": 28240 + }, + { + "epoch": 0.028448663985471995, + "grad_norm": 14.793356163608912, + "learning_rate": 2.8448119573756627e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.4034482777118683, + "step": 28245 + }, + { + "epoch": 0.02845370003857617, + "grad_norm": 16.02951691183068, + "learning_rate": 2.8453155530487686e-05, + "loss": 2.6474, + "mean_token_accuracy": 0.39655172228813174, + "step": 28250 + }, + { + "epoch": 0.02845873609168034, + "grad_norm": 19.58246946050734, + "learning_rate": 2.8458191487218745e-05, + "loss": 2.8468, + "mean_token_accuracy": 0.3620689630508423, + "step": 28255 + }, + { + "epoch": 0.028463772144784513, + "grad_norm": 31.83850974740709, + "learning_rate": 2.84632274439498e-05, + "loss": 2.6814, + "mean_token_accuracy": 0.39310345649719236, + "step": 28260 + }, + { + "epoch": 0.028468808197888686, + "grad_norm": 17.952002282014504, + "learning_rate": 2.846826340068086e-05, + "loss": 2.7324, + "mean_token_accuracy": 0.3758620619773865, + "step": 28265 + }, + { + "epoch": 0.028473844250992857, + "grad_norm": 20.80711445114084, + "learning_rate": 2.847329935741192e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.34137930870056155, + "step": 28270 + }, + { + "epoch": 0.02847888030409703, + "grad_norm": 15.608876145680531, + "learning_rate": 2.8478335314142986e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.42413793206214906, + "step": 28275 + }, + { + "epoch": 0.028483916357201204, + "grad_norm": 22.808256318855943, + "learning_rate": 2.848337127087404e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.4068965554237366, + "step": 28280 + }, + { + "epoch": 0.028488952410305378, + "grad_norm": 20.00187164588378, + "learning_rate": 2.84884072276051e-05, + "loss": 2.253, + "mean_token_accuracy": 0.48481547832489014, + "step": 28285 + }, + { + "epoch": 0.028493988463409548, + "grad_norm": 18.70652730109442, + "learning_rate": 2.849344318433616e-05, + "loss": 2.7495, + "mean_token_accuracy": 0.38965516686439516, + "step": 28290 + }, + { + "epoch": 0.028499024516513722, + "grad_norm": 14.957659782138862, + "learning_rate": 2.849847914106722e-05, + "loss": 2.7427, + "mean_token_accuracy": 0.39122806787490844, + "step": 28295 + }, + { + "epoch": 0.028504060569617896, + "grad_norm": 17.513004807296323, + "learning_rate": 2.8503515097798282e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.4034482717514038, + "step": 28300 + }, + { + "epoch": 0.028509096622722066, + "grad_norm": 14.519661865270336, + "learning_rate": 2.850855105452934e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.43448275327682495, + "step": 28305 + }, + { + "epoch": 0.02851413267582624, + "grad_norm": 17.13897125372733, + "learning_rate": 2.85135870112604e-05, + "loss": 2.8747, + "mean_token_accuracy": 0.3724137842655182, + "step": 28310 + }, + { + "epoch": 0.028519168728930414, + "grad_norm": 15.334421870405613, + "learning_rate": 2.851862296799146e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.3724137932062149, + "step": 28315 + }, + { + "epoch": 0.028524204782034587, + "grad_norm": 20.10438363764687, + "learning_rate": 2.852365892472252e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.42758620977401735, + "step": 28320 + }, + { + "epoch": 0.028529240835138758, + "grad_norm": 17.371840478624957, + "learning_rate": 2.852869488145358e-05, + "loss": 2.326, + "mean_token_accuracy": 0.4137930989265442, + "step": 28325 + }, + { + "epoch": 0.02853427688824293, + "grad_norm": 15.87289770440186, + "learning_rate": 2.853373083818464e-05, + "loss": 2.2709, + "mean_token_accuracy": 0.46896551847457885, + "step": 28330 + }, + { + "epoch": 0.028539312941347105, + "grad_norm": 17.704485349921804, + "learning_rate": 2.85387667949157e-05, + "loss": 2.6764, + "mean_token_accuracy": 0.4034482717514038, + "step": 28335 + }, + { + "epoch": 0.028544348994451275, + "grad_norm": 15.67704496678658, + "learning_rate": 2.854380275164676e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.3862068891525269, + "step": 28340 + }, + { + "epoch": 0.02854938504755545, + "grad_norm": 18.674184103464494, + "learning_rate": 2.854883870837782e-05, + "loss": 2.7058, + "mean_token_accuracy": 0.38777979016304015, + "step": 28345 + }, + { + "epoch": 0.028554421100659623, + "grad_norm": 21.621585331533026, + "learning_rate": 2.8553874665108878e-05, + "loss": 2.6666, + "mean_token_accuracy": 0.3793103456497192, + "step": 28350 + }, + { + "epoch": 0.028559457153763797, + "grad_norm": 17.56552274378061, + "learning_rate": 2.855891062183994e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.3827586233615875, + "step": 28355 + }, + { + "epoch": 0.028564493206867967, + "grad_norm": 20.934192337728412, + "learning_rate": 2.8563946578571e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.4655172288417816, + "step": 28360 + }, + { + "epoch": 0.02856952925997214, + "grad_norm": 16.736453287038298, + "learning_rate": 2.856898253530206e-05, + "loss": 2.5984, + "mean_token_accuracy": 0.4034482717514038, + "step": 28365 + }, + { + "epoch": 0.028574565313076315, + "grad_norm": 16.660382262154176, + "learning_rate": 2.8574018492033118e-05, + "loss": 2.5409, + "mean_token_accuracy": 0.43103447556495667, + "step": 28370 + }, + { + "epoch": 0.028579601366180485, + "grad_norm": 17.354954695654357, + "learning_rate": 2.8579054448764174e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.4676950931549072, + "step": 28375 + }, + { + "epoch": 0.02858463741928466, + "grad_norm": 18.932211981169004, + "learning_rate": 2.858409040549524e-05, + "loss": 2.4844, + "mean_token_accuracy": 0.4137930989265442, + "step": 28380 + }, + { + "epoch": 0.028589673472388832, + "grad_norm": 18.1070588675393, + "learning_rate": 2.85891263622263e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.4068965494632721, + "step": 28385 + }, + { + "epoch": 0.028594709525493006, + "grad_norm": 16.001697254622623, + "learning_rate": 2.859416231895736e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.3793103456497192, + "step": 28390 + }, + { + "epoch": 0.028599745578597176, + "grad_norm": 19.35876023913447, + "learning_rate": 2.8599198275688414e-05, + "loss": 2.3085, + "mean_token_accuracy": 0.4344827592372894, + "step": 28395 + }, + { + "epoch": 0.02860478163170135, + "grad_norm": 13.972991550530294, + "learning_rate": 2.8604234232419474e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.42413793206214906, + "step": 28400 + }, + { + "epoch": 0.028609817684805524, + "grad_norm": 16.29793093475021, + "learning_rate": 2.860927018915054e-05, + "loss": 2.6207, + "mean_token_accuracy": 0.3620689630508423, + "step": 28405 + }, + { + "epoch": 0.028614853737909694, + "grad_norm": 15.134446247525407, + "learning_rate": 2.86143061458816e-05, + "loss": 2.5534, + "mean_token_accuracy": 0.3551724135875702, + "step": 28410 + }, + { + "epoch": 0.028619889791013868, + "grad_norm": 19.129487196087364, + "learning_rate": 2.8619342102612655e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.41584996581077577, + "step": 28415 + }, + { + "epoch": 0.028624925844118042, + "grad_norm": 17.069704069467814, + "learning_rate": 2.8624378059343714e-05, + "loss": 2.39, + "mean_token_accuracy": 0.4413793087005615, + "step": 28420 + }, + { + "epoch": 0.028629961897222212, + "grad_norm": 14.42712897580744, + "learning_rate": 2.8629414016074773e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.3827586203813553, + "step": 28425 + }, + { + "epoch": 0.028634997950326386, + "grad_norm": 14.38898296988094, + "learning_rate": 2.8634449972805833e-05, + "loss": 2.4298, + "mean_token_accuracy": 0.42758620381355283, + "step": 28430 + }, + { + "epoch": 0.02864003400343056, + "grad_norm": 19.914229506996257, + "learning_rate": 2.8639485929536895e-05, + "loss": 2.5071, + "mean_token_accuracy": 0.4189957737922668, + "step": 28435 + }, + { + "epoch": 0.028645070056534733, + "grad_norm": 22.120000341294624, + "learning_rate": 2.8644521886267955e-05, + "loss": 2.7276, + "mean_token_accuracy": 0.37241379022598264, + "step": 28440 + }, + { + "epoch": 0.028650106109638904, + "grad_norm": 15.6329795924144, + "learning_rate": 2.8649557842999014e-05, + "loss": 2.655, + "mean_token_accuracy": 0.3827586203813553, + "step": 28445 + }, + { + "epoch": 0.028655142162743077, + "grad_norm": 19.58222722848476, + "learning_rate": 2.8654593799730073e-05, + "loss": 2.6931, + "mean_token_accuracy": 0.37586206793785093, + "step": 28450 + }, + { + "epoch": 0.02866017821584725, + "grad_norm": 17.56165568743544, + "learning_rate": 2.8659629756461132e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.3793103516101837, + "step": 28455 + }, + { + "epoch": 0.02866521426895142, + "grad_norm": 166.10776884085166, + "learning_rate": 2.8664665713192195e-05, + "loss": 2.3056, + "mean_token_accuracy": 0.44482758045196535, + "step": 28460 + }, + { + "epoch": 0.028670250322055595, + "grad_norm": 17.200862684573128, + "learning_rate": 2.8669701669923254e-05, + "loss": 2.3147, + "mean_token_accuracy": 0.4034482777118683, + "step": 28465 + }, + { + "epoch": 0.02867528637515977, + "grad_norm": 21.81120326717603, + "learning_rate": 2.8674737626654313e-05, + "loss": 2.724, + "mean_token_accuracy": 0.4241379380226135, + "step": 28470 + }, + { + "epoch": 0.028680322428263943, + "grad_norm": 15.79321637757581, + "learning_rate": 2.8679773583385373e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.3999999940395355, + "step": 28475 + }, + { + "epoch": 0.028685358481368113, + "grad_norm": 15.595180280947343, + "learning_rate": 2.8684809540116432e-05, + "loss": 2.2768, + "mean_token_accuracy": 0.39310343861579894, + "step": 28480 + }, + { + "epoch": 0.028690394534472287, + "grad_norm": 15.710697209575326, + "learning_rate": 2.8689845496847495e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4640653431415558, + "step": 28485 + }, + { + "epoch": 0.02869543058757646, + "grad_norm": 17.296339684920692, + "learning_rate": 2.8694881453578554e-05, + "loss": 2.7449, + "mean_token_accuracy": 0.4206896543502808, + "step": 28490 + }, + { + "epoch": 0.02870046664068063, + "grad_norm": 21.010737274898965, + "learning_rate": 2.8699917410309613e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.3950393170118332, + "step": 28495 + }, + { + "epoch": 0.028705502693784805, + "grad_norm": 15.902920062875422, + "learning_rate": 2.8704953367040672e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.4172413766384125, + "step": 28500 + }, + { + "epoch": 0.02871053874688898, + "grad_norm": 17.076341940325076, + "learning_rate": 2.8709989323771728e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.3862069010734558, + "step": 28505 + }, + { + "epoch": 0.028715574799993152, + "grad_norm": 16.98709935218074, + "learning_rate": 2.8715025280502787e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.4310344815254211, + "step": 28510 + }, + { + "epoch": 0.028720610853097323, + "grad_norm": 17.330316083954713, + "learning_rate": 2.8720061237233854e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.39655172228813174, + "step": 28515 + }, + { + "epoch": 0.028725646906201496, + "grad_norm": 25.490686676913427, + "learning_rate": 2.8725097193964913e-05, + "loss": 2.805, + "mean_token_accuracy": 0.37241379022598264, + "step": 28520 + }, + { + "epoch": 0.02873068295930567, + "grad_norm": 18.231296155299, + "learning_rate": 2.8730133150695972e-05, + "loss": 2.4691, + "mean_token_accuracy": 0.4360556542873383, + "step": 28525 + }, + { + "epoch": 0.02873571901240984, + "grad_norm": 16.587977870382247, + "learning_rate": 2.8735169107427028e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.4413793087005615, + "step": 28530 + }, + { + "epoch": 0.028740755065514014, + "grad_norm": 14.216201101577282, + "learning_rate": 2.8740205064158087e-05, + "loss": 2.6134, + "mean_token_accuracy": 0.41724138259887694, + "step": 28535 + }, + { + "epoch": 0.028745791118618188, + "grad_norm": 17.768584360627216, + "learning_rate": 2.8745241020889153e-05, + "loss": 2.675, + "mean_token_accuracy": 0.4172413766384125, + "step": 28540 + }, + { + "epoch": 0.02875082717172236, + "grad_norm": 16.925126993391544, + "learning_rate": 2.8750276977620212e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.44482758045196535, + "step": 28545 + }, + { + "epoch": 0.028755863224826532, + "grad_norm": 18.095037049482617, + "learning_rate": 2.8755312934351268e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.3774349570274353, + "step": 28550 + }, + { + "epoch": 0.028760899277930706, + "grad_norm": 15.974696553493052, + "learning_rate": 2.8760348891082328e-05, + "loss": 2.0573, + "mean_token_accuracy": 0.47773745059967043, + "step": 28555 + }, + { + "epoch": 0.02876593533103488, + "grad_norm": 17.24971840188284, + "learning_rate": 2.8765384847813387e-05, + "loss": 2.677, + "mean_token_accuracy": 0.4172413766384125, + "step": 28560 + }, + { + "epoch": 0.02877097138413905, + "grad_norm": 13.377900067735093, + "learning_rate": 2.8770420804544453e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.43793103098869324, + "step": 28565 + }, + { + "epoch": 0.028776007437243224, + "grad_norm": 14.791403159375339, + "learning_rate": 2.877545676127551e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.4428917050361633, + "step": 28570 + }, + { + "epoch": 0.028781043490347397, + "grad_norm": 16.030834517123232, + "learning_rate": 2.8780492718006568e-05, + "loss": 2.3416, + "mean_token_accuracy": 0.4413793087005615, + "step": 28575 + }, + { + "epoch": 0.02878607954345157, + "grad_norm": 16.36994147550745, + "learning_rate": 2.8785528674737627e-05, + "loss": 2.2641, + "mean_token_accuracy": 0.4517241358757019, + "step": 28580 + }, + { + "epoch": 0.02879111559655574, + "grad_norm": 18.650187751198107, + "learning_rate": 2.8790564631468686e-05, + "loss": 2.7386, + "mean_token_accuracy": 0.3793103337287903, + "step": 28585 + }, + { + "epoch": 0.028796151649659915, + "grad_norm": 15.249022585260006, + "learning_rate": 2.8795600588199746e-05, + "loss": 2.4477, + "mean_token_accuracy": 0.4529340624809265, + "step": 28590 + }, + { + "epoch": 0.02880118770276409, + "grad_norm": 18.02934128318509, + "learning_rate": 2.880063654493081e-05, + "loss": 2.6862, + "mean_token_accuracy": 0.39310344457626345, + "step": 28595 + }, + { + "epoch": 0.02880622375586826, + "grad_norm": 15.501591268719338, + "learning_rate": 2.8805672501661868e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.3448275804519653, + "step": 28600 + }, + { + "epoch": 0.028811259808972433, + "grad_norm": 15.241972571276792, + "learning_rate": 2.8810708458392927e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.44827585816383364, + "step": 28605 + }, + { + "epoch": 0.028816295862076607, + "grad_norm": 14.42015317227423, + "learning_rate": 2.8815744415123986e-05, + "loss": 2.7649, + "mean_token_accuracy": 0.41905625760555265, + "step": 28610 + }, + { + "epoch": 0.02882133191518078, + "grad_norm": 18.06061595423494, + "learning_rate": 2.8820780371855045e-05, + "loss": 2.9039, + "mean_token_accuracy": 0.41724138259887694, + "step": 28615 + }, + { + "epoch": 0.02882636796828495, + "grad_norm": 18.53433787871925, + "learning_rate": 2.8825816328586108e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.4275862157344818, + "step": 28620 + }, + { + "epoch": 0.028831404021389125, + "grad_norm": 19.77284074438423, + "learning_rate": 2.8830852285317167e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.4689655125141144, + "step": 28625 + }, + { + "epoch": 0.0288364400744933, + "grad_norm": 28.34621196503054, + "learning_rate": 2.8835888242048226e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.3931034505367279, + "step": 28630 + }, + { + "epoch": 0.02884147612759747, + "grad_norm": 16.09174284888342, + "learning_rate": 2.8840924198779286e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.43793103098869324, + "step": 28635 + }, + { + "epoch": 0.028846512180701642, + "grad_norm": 16.721714547091548, + "learning_rate": 2.884596015551034e-05, + "loss": 2.6349, + "mean_token_accuracy": 0.4206896543502808, + "step": 28640 + }, + { + "epoch": 0.028851548233805816, + "grad_norm": 13.898473792738638, + "learning_rate": 2.8850996112241408e-05, + "loss": 2.4432, + "mean_token_accuracy": 0.4482758641242981, + "step": 28645 + }, + { + "epoch": 0.02885658428690999, + "grad_norm": 15.364852462422112, + "learning_rate": 2.8856032068972467e-05, + "loss": 2.5988, + "mean_token_accuracy": 0.4034482717514038, + "step": 28650 + }, + { + "epoch": 0.02886162034001416, + "grad_norm": 17.972904343985753, + "learning_rate": 2.8861068025703526e-05, + "loss": 2.6786, + "mean_token_accuracy": 0.41379310488700866, + "step": 28655 + }, + { + "epoch": 0.028866656393118334, + "grad_norm": 21.127516747859747, + "learning_rate": 2.8866103982434582e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.38620689809322356, + "step": 28660 + }, + { + "epoch": 0.028871692446222508, + "grad_norm": 18.755520314369743, + "learning_rate": 2.887113993916564e-05, + "loss": 2.9684, + "mean_token_accuracy": 0.34137930870056155, + "step": 28665 + }, + { + "epoch": 0.028876728499326678, + "grad_norm": 14.214699062595612, + "learning_rate": 2.8876175895896707e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.38965516686439516, + "step": 28670 + }, + { + "epoch": 0.028881764552430852, + "grad_norm": 31.295169110630717, + "learning_rate": 2.8881211852627767e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.42413793206214906, + "step": 28675 + }, + { + "epoch": 0.028886800605535026, + "grad_norm": 22.841088830484146, + "learning_rate": 2.8886247809358822e-05, + "loss": 2.6562, + "mean_token_accuracy": 0.42413793206214906, + "step": 28680 + }, + { + "epoch": 0.0288918366586392, + "grad_norm": 15.120564371399617, + "learning_rate": 2.889128376608988e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.4122201979160309, + "step": 28685 + }, + { + "epoch": 0.02889687271174337, + "grad_norm": 30.18559371388071, + "learning_rate": 2.889631972282094e-05, + "loss": 2.3805, + "mean_token_accuracy": 0.43103447556495667, + "step": 28690 + }, + { + "epoch": 0.028901908764847543, + "grad_norm": 18.26290462016411, + "learning_rate": 2.8901355679552e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.4103448212146759, + "step": 28695 + }, + { + "epoch": 0.028906944817951717, + "grad_norm": 16.55591481046095, + "learning_rate": 2.8906391636283063e-05, + "loss": 2.8416, + "mean_token_accuracy": 0.36206896007061007, + "step": 28700 + }, + { + "epoch": 0.028911980871055887, + "grad_norm": 14.484541858229901, + "learning_rate": 2.8911427593014122e-05, + "loss": 2.373, + "mean_token_accuracy": 0.4379310369491577, + "step": 28705 + }, + { + "epoch": 0.02891701692416006, + "grad_norm": 19.282777625526734, + "learning_rate": 2.891646354974518e-05, + "loss": 2.6113, + "mean_token_accuracy": 0.41034482717514037, + "step": 28710 + }, + { + "epoch": 0.028922052977264235, + "grad_norm": 22.624382275761743, + "learning_rate": 2.892149950647624e-05, + "loss": 2.6291, + "mean_token_accuracy": 0.3965517282485962, + "step": 28715 + }, + { + "epoch": 0.02892708903036841, + "grad_norm": 21.532411632617524, + "learning_rate": 2.89265354632073e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.39655172526836396, + "step": 28720 + }, + { + "epoch": 0.02893212508347258, + "grad_norm": 19.394091941253663, + "learning_rate": 2.8931571419938362e-05, + "loss": 2.2191, + "mean_token_accuracy": 0.4413793087005615, + "step": 28725 + }, + { + "epoch": 0.028937161136576753, + "grad_norm": 24.423443576163727, + "learning_rate": 2.8936607376669422e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.4172413766384125, + "step": 28730 + }, + { + "epoch": 0.028942197189680927, + "grad_norm": 18.0448105306421, + "learning_rate": 2.894164333340048e-05, + "loss": 2.784, + "mean_token_accuracy": 0.39310344457626345, + "step": 28735 + }, + { + "epoch": 0.028947233242785097, + "grad_norm": 24.74294927013064, + "learning_rate": 2.894667929013154e-05, + "loss": 2.4028, + "mean_token_accuracy": 0.41724138259887694, + "step": 28740 + }, + { + "epoch": 0.02895226929588927, + "grad_norm": 26.055191371340445, + "learning_rate": 2.89517152468626e-05, + "loss": 2.5702, + "mean_token_accuracy": 0.4275861978530884, + "step": 28745 + }, + { + "epoch": 0.028957305348993444, + "grad_norm": 16.961313041193723, + "learning_rate": 2.8956751203593662e-05, + "loss": 2.0385, + "mean_token_accuracy": 0.46551724076271056, + "step": 28750 + }, + { + "epoch": 0.028962341402097618, + "grad_norm": 17.143891513820765, + "learning_rate": 2.896178716032472e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.4206896543502808, + "step": 28755 + }, + { + "epoch": 0.02896737745520179, + "grad_norm": 17.042355568511894, + "learning_rate": 2.896682311705578e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.4310344815254211, + "step": 28760 + }, + { + "epoch": 0.028972413508305962, + "grad_norm": 13.904077424609985, + "learning_rate": 2.897185907378684e-05, + "loss": 2.6169, + "mean_token_accuracy": 0.37586207389831544, + "step": 28765 + }, + { + "epoch": 0.028977449561410136, + "grad_norm": 16.84133339059328, + "learning_rate": 2.89768950305179e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.47586206793785096, + "step": 28770 + }, + { + "epoch": 0.028982485614514306, + "grad_norm": 15.053974153693597, + "learning_rate": 2.8981930987248955e-05, + "loss": 2.7146, + "mean_token_accuracy": 0.3655172407627106, + "step": 28775 + }, + { + "epoch": 0.02898752166761848, + "grad_norm": 16.970639635903364, + "learning_rate": 2.898696694398002e-05, + "loss": 2.6197, + "mean_token_accuracy": 0.4259528160095215, + "step": 28780 + }, + { + "epoch": 0.028992557720722654, + "grad_norm": 16.986679847070718, + "learning_rate": 2.899200290071108e-05, + "loss": 2.6679, + "mean_token_accuracy": 0.3379310369491577, + "step": 28785 + }, + { + "epoch": 0.028997593773826828, + "grad_norm": 18.635741052732673, + "learning_rate": 2.899703885744214e-05, + "loss": 2.212, + "mean_token_accuracy": 0.4603750824928284, + "step": 28790 + }, + { + "epoch": 0.029002629826930998, + "grad_norm": 16.213551756939896, + "learning_rate": 2.9002074814173195e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.43793103098869324, + "step": 28795 + }, + { + "epoch": 0.02900766588003517, + "grad_norm": 23.6826600500554, + "learning_rate": 2.9007110770904255e-05, + "loss": 2.7053, + "mean_token_accuracy": 0.3413793116807938, + "step": 28800 + }, + { + "epoch": 0.029012701933139345, + "grad_norm": 20.756111455150805, + "learning_rate": 2.901214672763532e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.37931033968925476, + "step": 28805 + }, + { + "epoch": 0.029017737986243516, + "grad_norm": 19.017768632761857, + "learning_rate": 2.901718268436638e-05, + "loss": 2.4619, + "mean_token_accuracy": 0.40562612414360044, + "step": 28810 + }, + { + "epoch": 0.02902277403934769, + "grad_norm": 18.4681412814779, + "learning_rate": 2.9022218641097436e-05, + "loss": 2.4969, + "mean_token_accuracy": 0.37586206793785093, + "step": 28815 + }, + { + "epoch": 0.029027810092451863, + "grad_norm": 17.08970036111446, + "learning_rate": 2.9027254597828495e-05, + "loss": 2.3785, + "mean_token_accuracy": 0.4, + "step": 28820 + }, + { + "epoch": 0.029032846145556037, + "grad_norm": 18.67574865526711, + "learning_rate": 2.9032290554559554e-05, + "loss": 2.778, + "mean_token_accuracy": 0.382758629322052, + "step": 28825 + }, + { + "epoch": 0.029037882198660207, + "grad_norm": 16.415356320505825, + "learning_rate": 2.903732651129062e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.4620689630508423, + "step": 28830 + }, + { + "epoch": 0.02904291825176438, + "grad_norm": 16.92230200822066, + "learning_rate": 2.9042362468021676e-05, + "loss": 2.448, + "mean_token_accuracy": 0.45760738253593447, + "step": 28835 + }, + { + "epoch": 0.029047954304868555, + "grad_norm": 16.387023010354024, + "learning_rate": 2.9047398424752735e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.38275861740112305, + "step": 28840 + }, + { + "epoch": 0.029052990357972725, + "grad_norm": 14.967049143037833, + "learning_rate": 2.9052434381483795e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.43103447556495667, + "step": 28845 + }, + { + "epoch": 0.0290580264110769, + "grad_norm": 26.815592130974913, + "learning_rate": 2.9057470338214854e-05, + "loss": 2.6212, + "mean_token_accuracy": 0.40000000298023225, + "step": 28850 + }, + { + "epoch": 0.029063062464181073, + "grad_norm": 20.062050470017812, + "learning_rate": 2.9062506294945913e-05, + "loss": 2.7217, + "mean_token_accuracy": 0.37241379022598264, + "step": 28855 + }, + { + "epoch": 0.029068098517285246, + "grad_norm": 19.379101099018673, + "learning_rate": 2.9067542251676976e-05, + "loss": 2.65, + "mean_token_accuracy": 0.3896551728248596, + "step": 28860 + }, + { + "epoch": 0.029073134570389417, + "grad_norm": 14.259386012303532, + "learning_rate": 2.9072578208408035e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.42413792610168455, + "step": 28865 + }, + { + "epoch": 0.02907817062349359, + "grad_norm": 17.373325793435555, + "learning_rate": 2.9077614165139094e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.4206896543502808, + "step": 28870 + }, + { + "epoch": 0.029083206676597764, + "grad_norm": 18.379079909453107, + "learning_rate": 2.9082650121870154e-05, + "loss": 2.7414, + "mean_token_accuracy": 0.3655172407627106, + "step": 28875 + }, + { + "epoch": 0.029088242729701935, + "grad_norm": 23.65953967382418, + "learning_rate": 2.9087686078601213e-05, + "loss": 2.5753, + "mean_token_accuracy": 0.42758620381355283, + "step": 28880 + }, + { + "epoch": 0.02909327878280611, + "grad_norm": 14.21683210635724, + "learning_rate": 2.9092722035332275e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.4206896543502808, + "step": 28885 + }, + { + "epoch": 0.029098314835910282, + "grad_norm": 16.709303586132034, + "learning_rate": 2.9097757992063335e-05, + "loss": 2.1534, + "mean_token_accuracy": 0.5024803340435028, + "step": 28890 + }, + { + "epoch": 0.029103350889014456, + "grad_norm": 16.554841471233352, + "learning_rate": 2.9102793948794394e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.47108287215232847, + "step": 28895 + }, + { + "epoch": 0.029108386942118626, + "grad_norm": 15.408381946064868, + "learning_rate": 2.9107829905525453e-05, + "loss": 2.7137, + "mean_token_accuracy": 0.38620689511299133, + "step": 28900 + }, + { + "epoch": 0.0291134229952228, + "grad_norm": 11.88918490778324, + "learning_rate": 2.9112865862256513e-05, + "loss": 2.2805, + "mean_token_accuracy": 0.47453114986419676, + "step": 28905 + }, + { + "epoch": 0.029118459048326974, + "grad_norm": 17.851370060465552, + "learning_rate": 2.9117901818987575e-05, + "loss": 2.4468, + "mean_token_accuracy": 0.3896551728248596, + "step": 28910 + }, + { + "epoch": 0.029123495101431144, + "grad_norm": 16.879994540888287, + "learning_rate": 2.9122937775718634e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.4379310369491577, + "step": 28915 + }, + { + "epoch": 0.029128531154535318, + "grad_norm": 15.534236473320558, + "learning_rate": 2.9127973732449694e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.394252872467041, + "step": 28920 + }, + { + "epoch": 0.02913356720763949, + "grad_norm": 18.047662964835066, + "learning_rate": 2.9133009689180753e-05, + "loss": 2.7292, + "mean_token_accuracy": 0.3896551728248596, + "step": 28925 + }, + { + "epoch": 0.029138603260743665, + "grad_norm": 13.768485046362416, + "learning_rate": 2.913804564591181e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.3793103456497192, + "step": 28930 + }, + { + "epoch": 0.029143639313847836, + "grad_norm": 16.093592172210936, + "learning_rate": 2.9143081602642868e-05, + "loss": 2.7077, + "mean_token_accuracy": 0.4034482777118683, + "step": 28935 + }, + { + "epoch": 0.02914867536695201, + "grad_norm": 17.392814111433935, + "learning_rate": 2.9148117559373934e-05, + "loss": 2.5771, + "mean_token_accuracy": 0.4137930929660797, + "step": 28940 + }, + { + "epoch": 0.029153711420056183, + "grad_norm": 16.997913626391654, + "learning_rate": 2.9153153516104993e-05, + "loss": 2.4807, + "mean_token_accuracy": 0.441379314661026, + "step": 28945 + }, + { + "epoch": 0.029158747473160353, + "grad_norm": 23.048566877888664, + "learning_rate": 2.915818947283605e-05, + "loss": 2.4722, + "mean_token_accuracy": 0.3655172407627106, + "step": 28950 + }, + { + "epoch": 0.029163783526264527, + "grad_norm": 11.786499720776606, + "learning_rate": 2.916322542956711e-05, + "loss": 2.1988, + "mean_token_accuracy": 0.44827585816383364, + "step": 28955 + }, + { + "epoch": 0.0291688195793687, + "grad_norm": 18.48451621817639, + "learning_rate": 2.9168261386298168e-05, + "loss": 2.2716, + "mean_token_accuracy": 0.45862067937850953, + "step": 28960 + }, + { + "epoch": 0.029173855632472875, + "grad_norm": 18.298629578798383, + "learning_rate": 2.9173297343029234e-05, + "loss": 2.2899, + "mean_token_accuracy": 0.4793103337287903, + "step": 28965 + }, + { + "epoch": 0.029178891685577045, + "grad_norm": 20.290639960735724, + "learning_rate": 2.917833329976029e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.38620689511299133, + "step": 28970 + }, + { + "epoch": 0.02918392773868122, + "grad_norm": 18.446891217979125, + "learning_rate": 2.918336925649135e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.3931034505367279, + "step": 28975 + }, + { + "epoch": 0.029188963791785393, + "grad_norm": 17.24943587046956, + "learning_rate": 2.9188405213222408e-05, + "loss": 2.3526, + "mean_token_accuracy": 0.4551724076271057, + "step": 28980 + }, + { + "epoch": 0.029193999844889563, + "grad_norm": 22.330405333989457, + "learning_rate": 2.9193441169953467e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.33793103098869326, + "step": 28985 + }, + { + "epoch": 0.029199035897993737, + "grad_norm": 26.77777667475145, + "learning_rate": 2.919847712668453e-05, + "loss": 2.5966, + "mean_token_accuracy": 0.3896551728248596, + "step": 28990 + }, + { + "epoch": 0.02920407195109791, + "grad_norm": 16.09287025862488, + "learning_rate": 2.920351308341559e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.4586206912994385, + "step": 28995 + }, + { + "epoch": 0.029209108004202084, + "grad_norm": 20.712085479079104, + "learning_rate": 2.920854904014665e-05, + "loss": 2.4607, + "mean_token_accuracy": 0.4689655125141144, + "step": 29000 + }, + { + "epoch": 0.029214144057306254, + "grad_norm": 16.497744888524167, + "learning_rate": 2.9213584996877708e-05, + "loss": 2.772, + "mean_token_accuracy": 0.38620689511299133, + "step": 29005 + }, + { + "epoch": 0.029219180110410428, + "grad_norm": 16.601714947164467, + "learning_rate": 2.9218620953608767e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.36896551847457887, + "step": 29010 + }, + { + "epoch": 0.029224216163514602, + "grad_norm": 18.263833642592957, + "learning_rate": 2.9223656910339826e-05, + "loss": 2.8077, + "mean_token_accuracy": 0.3551724135875702, + "step": 29015 + }, + { + "epoch": 0.029229252216618772, + "grad_norm": 18.047539245433132, + "learning_rate": 2.922869286707089e-05, + "loss": 2.6885, + "mean_token_accuracy": 0.40895341634750365, + "step": 29020 + }, + { + "epoch": 0.029234288269722946, + "grad_norm": 17.035566428510997, + "learning_rate": 2.9233728823801948e-05, + "loss": 2.69, + "mean_token_accuracy": 0.3896551728248596, + "step": 29025 + }, + { + "epoch": 0.02923932432282712, + "grad_norm": 19.086760974320768, + "learning_rate": 2.9238764780533007e-05, + "loss": 2.594, + "mean_token_accuracy": 0.4068965554237366, + "step": 29030 + }, + { + "epoch": 0.029244360375931294, + "grad_norm": 15.488313772590844, + "learning_rate": 2.9243800737264067e-05, + "loss": 2.9088, + "mean_token_accuracy": 0.3620689660310745, + "step": 29035 + }, + { + "epoch": 0.029249396429035464, + "grad_norm": 28.282500763414347, + "learning_rate": 2.9248836693995122e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.34482758641242983, + "step": 29040 + }, + { + "epoch": 0.029254432482139638, + "grad_norm": 17.186948400173062, + "learning_rate": 2.925387265072619e-05, + "loss": 2.5365, + "mean_token_accuracy": 0.4258318305015564, + "step": 29045 + }, + { + "epoch": 0.02925946853524381, + "grad_norm": 22.317920546196827, + "learning_rate": 2.9258908607457248e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.37241379022598264, + "step": 29050 + }, + { + "epoch": 0.02926450458834798, + "grad_norm": 15.088772550319392, + "learning_rate": 2.9263944564188307e-05, + "loss": 2.8417, + "mean_token_accuracy": 0.36896551847457887, + "step": 29055 + }, + { + "epoch": 0.029269540641452155, + "grad_norm": 24.365864876780236, + "learning_rate": 2.9268980520919366e-05, + "loss": 2.6709, + "mean_token_accuracy": 0.459359610080719, + "step": 29060 + }, + { + "epoch": 0.02927457669455633, + "grad_norm": 21.00513092648901, + "learning_rate": 2.9274016477650422e-05, + "loss": 2.5766, + "mean_token_accuracy": 0.39310344457626345, + "step": 29065 + }, + { + "epoch": 0.029279612747660503, + "grad_norm": 19.77508562302065, + "learning_rate": 2.9279052434381488e-05, + "loss": 2.7295, + "mean_token_accuracy": 0.3771929800510406, + "step": 29070 + }, + { + "epoch": 0.029284648800764673, + "grad_norm": 39.450216037499416, + "learning_rate": 2.9284088391112547e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.37586206793785093, + "step": 29075 + }, + { + "epoch": 0.029289684853868847, + "grad_norm": 19.885862307445414, + "learning_rate": 2.9289124347843607e-05, + "loss": 2.7889, + "mean_token_accuracy": 0.3758620619773865, + "step": 29080 + }, + { + "epoch": 0.02929472090697302, + "grad_norm": 18.964273210774564, + "learning_rate": 2.9294160304574663e-05, + "loss": 2.607, + "mean_token_accuracy": 0.3931034475564957, + "step": 29085 + }, + { + "epoch": 0.02929975696007719, + "grad_norm": 17.559174560588463, + "learning_rate": 2.9299196261305722e-05, + "loss": 2.5961, + "mean_token_accuracy": 0.4119177222251892, + "step": 29090 + }, + { + "epoch": 0.029304793013181365, + "grad_norm": 17.372796279028535, + "learning_rate": 2.9304232218036788e-05, + "loss": 2.7834, + "mean_token_accuracy": 0.39655172228813174, + "step": 29095 + }, + { + "epoch": 0.02930982906628554, + "grad_norm": 17.50850945505296, + "learning_rate": 2.9309268174767847e-05, + "loss": 2.8309, + "mean_token_accuracy": 0.39310344457626345, + "step": 29100 + }, + { + "epoch": 0.029314865119389712, + "grad_norm": 17.050620886152583, + "learning_rate": 2.9314304131498903e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.4379310369491577, + "step": 29105 + }, + { + "epoch": 0.029319901172493883, + "grad_norm": 17.908420992085034, + "learning_rate": 2.9319340088229962e-05, + "loss": 2.5188, + "mean_token_accuracy": 0.37931033968925476, + "step": 29110 + }, + { + "epoch": 0.029324937225598056, + "grad_norm": 20.677634081857832, + "learning_rate": 2.932437604496102e-05, + "loss": 2.8095, + "mean_token_accuracy": 0.34137930870056155, + "step": 29115 + }, + { + "epoch": 0.02932997327870223, + "grad_norm": 19.35289143344176, + "learning_rate": 2.932941200169208e-05, + "loss": 2.6752, + "mean_token_accuracy": 0.3620689630508423, + "step": 29120 + }, + { + "epoch": 0.0293350093318064, + "grad_norm": 16.88107204987131, + "learning_rate": 2.9334447958423143e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.41379310488700866, + "step": 29125 + }, + { + "epoch": 0.029340045384910574, + "grad_norm": 22.884628993761428, + "learning_rate": 2.9339483915154203e-05, + "loss": 2.2999, + "mean_token_accuracy": 0.45862067937850953, + "step": 29130 + }, + { + "epoch": 0.029345081438014748, + "grad_norm": 16.77593779405751, + "learning_rate": 2.9344519871885262e-05, + "loss": 2.2376, + "mean_token_accuracy": 0.4379310369491577, + "step": 29135 + }, + { + "epoch": 0.029350117491118922, + "grad_norm": 18.29855260680542, + "learning_rate": 2.934955582861632e-05, + "loss": 2.4546, + "mean_token_accuracy": 0.4379310369491577, + "step": 29140 + }, + { + "epoch": 0.029355153544223092, + "grad_norm": 13.106796495422785, + "learning_rate": 2.935459178534738e-05, + "loss": 2.3484, + "mean_token_accuracy": 0.4601935803890228, + "step": 29145 + }, + { + "epoch": 0.029360189597327266, + "grad_norm": 19.101003876005805, + "learning_rate": 2.9359627742078443e-05, + "loss": 2.6957, + "mean_token_accuracy": 0.3448275804519653, + "step": 29150 + }, + { + "epoch": 0.02936522565043144, + "grad_norm": 13.709347748046984, + "learning_rate": 2.9364663698809502e-05, + "loss": 2.7858, + "mean_token_accuracy": 0.37241379022598264, + "step": 29155 + }, + { + "epoch": 0.02937026170353561, + "grad_norm": 17.87970586706311, + "learning_rate": 2.936969965554056e-05, + "loss": 2.685, + "mean_token_accuracy": 0.3620689630508423, + "step": 29160 + }, + { + "epoch": 0.029375297756639784, + "grad_norm": 18.524163065875594, + "learning_rate": 2.937473561227162e-05, + "loss": 3.0222, + "mean_token_accuracy": 0.3241379290819168, + "step": 29165 + }, + { + "epoch": 0.029380333809743957, + "grad_norm": 13.921594957708502, + "learning_rate": 2.937977156900268e-05, + "loss": 2.6907, + "mean_token_accuracy": 0.41034482717514037, + "step": 29170 + }, + { + "epoch": 0.02938536986284813, + "grad_norm": 15.55554656741362, + "learning_rate": 2.9384807525733743e-05, + "loss": 2.2311, + "mean_token_accuracy": 0.3931034505367279, + "step": 29175 + }, + { + "epoch": 0.0293904059159523, + "grad_norm": 17.354504545455352, + "learning_rate": 2.9389843482464802e-05, + "loss": 2.568, + "mean_token_accuracy": 0.4, + "step": 29180 + }, + { + "epoch": 0.029395441969056475, + "grad_norm": 15.895604495469875, + "learning_rate": 2.939487943919586e-05, + "loss": 2.3817, + "mean_token_accuracy": 0.42758620381355283, + "step": 29185 + }, + { + "epoch": 0.02940047802216065, + "grad_norm": 17.95642008554389, + "learning_rate": 2.939991539592692e-05, + "loss": 2.4346, + "mean_token_accuracy": 0.4241379380226135, + "step": 29190 + }, + { + "epoch": 0.02940551407526482, + "grad_norm": 14.331357044379631, + "learning_rate": 2.9404951352657976e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.42413793206214906, + "step": 29195 + }, + { + "epoch": 0.029410550128368993, + "grad_norm": 19.07327428354615, + "learning_rate": 2.9409987309389036e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.3482758551836014, + "step": 29200 + }, + { + "epoch": 0.029415586181473167, + "grad_norm": 15.908626053084024, + "learning_rate": 2.94150232661201e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.4350272178649902, + "step": 29205 + }, + { + "epoch": 0.02942062223457734, + "grad_norm": 15.616566926145273, + "learning_rate": 2.942005922285116e-05, + "loss": 2.7775, + "mean_token_accuracy": 0.3620689660310745, + "step": 29210 + }, + { + "epoch": 0.02942565828768151, + "grad_norm": 16.518380686719727, + "learning_rate": 2.9425095179582217e-05, + "loss": 2.5235, + "mean_token_accuracy": 0.39310344457626345, + "step": 29215 + }, + { + "epoch": 0.029430694340785685, + "grad_norm": 27.027106638737187, + "learning_rate": 2.9430131136313276e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.42068966031074523, + "step": 29220 + }, + { + "epoch": 0.02943573039388986, + "grad_norm": 14.52561221937901, + "learning_rate": 2.9435167093044335e-05, + "loss": 2.1586, + "mean_token_accuracy": 0.4724137902259827, + "step": 29225 + }, + { + "epoch": 0.02944076644699403, + "grad_norm": 15.013909963715054, + "learning_rate": 2.94402030497754e-05, + "loss": 2.5409, + "mean_token_accuracy": 0.41034482717514037, + "step": 29230 + }, + { + "epoch": 0.029445802500098203, + "grad_norm": 17.17768342793776, + "learning_rate": 2.9445239006506457e-05, + "loss": 2.3619, + "mean_token_accuracy": 0.4517241418361664, + "step": 29235 + }, + { + "epoch": 0.029450838553202376, + "grad_norm": 20.2352931071833, + "learning_rate": 2.9450274963237516e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.39655172228813174, + "step": 29240 + }, + { + "epoch": 0.02945587460630655, + "grad_norm": 26.396247906740395, + "learning_rate": 2.9455310919968576e-05, + "loss": 2.7483, + "mean_token_accuracy": 0.36007259488105775, + "step": 29245 + }, + { + "epoch": 0.02946091065941072, + "grad_norm": 16.257561841829858, + "learning_rate": 2.9460346876699635e-05, + "loss": 2.7521, + "mean_token_accuracy": 0.37241379618644715, + "step": 29250 + }, + { + "epoch": 0.029465946712514894, + "grad_norm": 16.139308365458582, + "learning_rate": 2.9465382833430697e-05, + "loss": 2.6898, + "mean_token_accuracy": 0.3896551728248596, + "step": 29255 + }, + { + "epoch": 0.029470982765619068, + "grad_norm": 21.036289619873603, + "learning_rate": 2.9470418790161757e-05, + "loss": 2.8732, + "mean_token_accuracy": 0.37241379022598264, + "step": 29260 + }, + { + "epoch": 0.029476018818723238, + "grad_norm": 16.502844787291153, + "learning_rate": 2.9475454746892816e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.46551724076271056, + "step": 29265 + }, + { + "epoch": 0.029481054871827412, + "grad_norm": 18.26457489152445, + "learning_rate": 2.9480490703623875e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.3793103456497192, + "step": 29270 + }, + { + "epoch": 0.029486090924931586, + "grad_norm": 14.872907791582454, + "learning_rate": 2.9485526660354934e-05, + "loss": 2.3999, + "mean_token_accuracy": 0.45517241954803467, + "step": 29275 + }, + { + "epoch": 0.02949112697803576, + "grad_norm": 19.141729233509697, + "learning_rate": 2.9490562617085994e-05, + "loss": 2.258, + "mean_token_accuracy": 0.4448275864124298, + "step": 29280 + }, + { + "epoch": 0.02949616303113993, + "grad_norm": 18.814094296693867, + "learning_rate": 2.9495598573817056e-05, + "loss": 2.5004, + "mean_token_accuracy": 0.47376847863197324, + "step": 29285 + }, + { + "epoch": 0.029501199084244104, + "grad_norm": 19.141163139076475, + "learning_rate": 2.9500634530548116e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.39310344457626345, + "step": 29290 + }, + { + "epoch": 0.029506235137348277, + "grad_norm": 13.91851606547807, + "learning_rate": 2.9505670487279175e-05, + "loss": 2.3206, + "mean_token_accuracy": 0.4379310250282288, + "step": 29295 + }, + { + "epoch": 0.029511271190452448, + "grad_norm": 23.90322714996998, + "learning_rate": 2.9510706444010234e-05, + "loss": 2.7151, + "mean_token_accuracy": 0.37241379022598264, + "step": 29300 + }, + { + "epoch": 0.02951630724355662, + "grad_norm": 24.02342827222452, + "learning_rate": 2.9515742400741293e-05, + "loss": 2.6927, + "mean_token_accuracy": 0.3827586233615875, + "step": 29305 + }, + { + "epoch": 0.029521343296660795, + "grad_norm": 24.847120259165067, + "learning_rate": 2.9520778357472356e-05, + "loss": 2.2897, + "mean_token_accuracy": 0.4310344815254211, + "step": 29310 + }, + { + "epoch": 0.02952637934976497, + "grad_norm": 16.41031167466528, + "learning_rate": 2.9525814314203415e-05, + "loss": 2.1434, + "mean_token_accuracy": 0.46896551847457885, + "step": 29315 + }, + { + "epoch": 0.02953141540286914, + "grad_norm": 30.884428525054425, + "learning_rate": 2.9530850270934475e-05, + "loss": 2.5676, + "mean_token_accuracy": 0.44137930274009707, + "step": 29320 + }, + { + "epoch": 0.029536451455973313, + "grad_norm": 18.8049378381986, + "learning_rate": 2.9535886227665534e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.4103448212146759, + "step": 29325 + }, + { + "epoch": 0.029541487509077487, + "grad_norm": 27.83217431733001, + "learning_rate": 2.954092218439659e-05, + "loss": 2.907, + "mean_token_accuracy": 0.3551724076271057, + "step": 29330 + }, + { + "epoch": 0.029546523562181657, + "grad_norm": 17.355528083677683, + "learning_rate": 2.9545958141127656e-05, + "loss": 2.6722, + "mean_token_accuracy": 0.3896551728248596, + "step": 29335 + }, + { + "epoch": 0.02955155961528583, + "grad_norm": 13.243755961804263, + "learning_rate": 2.9550994097858715e-05, + "loss": 2.3976, + "mean_token_accuracy": 0.42758620977401735, + "step": 29340 + }, + { + "epoch": 0.029556595668390005, + "grad_norm": 17.048561242119124, + "learning_rate": 2.9556030054589774e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.41034482717514037, + "step": 29345 + }, + { + "epoch": 0.02956163172149418, + "grad_norm": 15.943285690252303, + "learning_rate": 2.956106601132083e-05, + "loss": 2.437, + "mean_token_accuracy": 0.42758620381355283, + "step": 29350 + }, + { + "epoch": 0.02956666777459835, + "grad_norm": 30.236719435221882, + "learning_rate": 2.956610196805189e-05, + "loss": 2.5479, + "mean_token_accuracy": 0.43103448748588563, + "step": 29355 + }, + { + "epoch": 0.029571703827702522, + "grad_norm": 26.234716721973356, + "learning_rate": 2.957113792478295e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.41379310488700866, + "step": 29360 + }, + { + "epoch": 0.029576739880806696, + "grad_norm": 16.875756528583572, + "learning_rate": 2.9576173881514015e-05, + "loss": 2.778, + "mean_token_accuracy": 0.3862068891525269, + "step": 29365 + }, + { + "epoch": 0.029581775933910866, + "grad_norm": 16.65316127944145, + "learning_rate": 2.958120983824507e-05, + "loss": 2.8712, + "mean_token_accuracy": 0.36206896901130675, + "step": 29370 + }, + { + "epoch": 0.02958681198701504, + "grad_norm": 15.998510219023641, + "learning_rate": 2.958624579497613e-05, + "loss": 2.6592, + "mean_token_accuracy": 0.41657592058181764, + "step": 29375 + }, + { + "epoch": 0.029591848040119214, + "grad_norm": 25.264633458656107, + "learning_rate": 2.959128175170719e-05, + "loss": 2.6608, + "mean_token_accuracy": 0.3827586233615875, + "step": 29380 + }, + { + "epoch": 0.029596884093223388, + "grad_norm": 17.114755030341012, + "learning_rate": 2.9596317708438248e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.4137930989265442, + "step": 29385 + }, + { + "epoch": 0.029601920146327558, + "grad_norm": 16.08532132579987, + "learning_rate": 2.960135366516931e-05, + "loss": 2.1647, + "mean_token_accuracy": 0.4413793087005615, + "step": 29390 + }, + { + "epoch": 0.029606956199431732, + "grad_norm": 17.46570987124122, + "learning_rate": 2.960638962190037e-05, + "loss": 2.8, + "mean_token_accuracy": 0.3206896513700485, + "step": 29395 + }, + { + "epoch": 0.029611992252535906, + "grad_norm": 16.963443391288347, + "learning_rate": 2.961142557863143e-05, + "loss": 2.499, + "mean_token_accuracy": 0.3999999940395355, + "step": 29400 + }, + { + "epoch": 0.029617028305640076, + "grad_norm": 16.212345392825643, + "learning_rate": 2.961646153536249e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.3551724195480347, + "step": 29405 + }, + { + "epoch": 0.02962206435874425, + "grad_norm": 27.524889522533908, + "learning_rate": 2.9621497492093548e-05, + "loss": 2.563, + "mean_token_accuracy": 0.4137930989265442, + "step": 29410 + }, + { + "epoch": 0.029627100411848423, + "grad_norm": 20.045406644533195, + "learning_rate": 2.962653344882461e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.4034482777118683, + "step": 29415 + }, + { + "epoch": 0.029632136464952597, + "grad_norm": 19.817631140431637, + "learning_rate": 2.963156940555567e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.42068966031074523, + "step": 29420 + }, + { + "epoch": 0.029637172518056767, + "grad_norm": 18.037053610332176, + "learning_rate": 2.963660536228673e-05, + "loss": 2.4095, + "mean_token_accuracy": 0.43908045887947084, + "step": 29425 + }, + { + "epoch": 0.02964220857116094, + "grad_norm": 17.383614056859916, + "learning_rate": 2.9641641319017788e-05, + "loss": 2.8091, + "mean_token_accuracy": 0.3793103456497192, + "step": 29430 + }, + { + "epoch": 0.029647244624265115, + "grad_norm": 15.99798770863236, + "learning_rate": 2.9646677275748848e-05, + "loss": 2.8413, + "mean_token_accuracy": 0.3586206942796707, + "step": 29435 + }, + { + "epoch": 0.029652280677369285, + "grad_norm": 18.543480893328507, + "learning_rate": 2.9651713232479907e-05, + "loss": 2.8001, + "mean_token_accuracy": 0.3793103456497192, + "step": 29440 + }, + { + "epoch": 0.02965731673047346, + "grad_norm": 19.222634969412997, + "learning_rate": 2.965674918921097e-05, + "loss": 2.5166, + "mean_token_accuracy": 0.3862069010734558, + "step": 29445 + }, + { + "epoch": 0.029662352783577633, + "grad_norm": 17.909249140601847, + "learning_rate": 2.966178514594203e-05, + "loss": 3.5245, + "mean_token_accuracy": 0.279310342669487, + "step": 29450 + }, + { + "epoch": 0.029667388836681807, + "grad_norm": 15.887584492170978, + "learning_rate": 2.9666821102673088e-05, + "loss": 2.9935, + "mean_token_accuracy": 0.37241379022598264, + "step": 29455 + }, + { + "epoch": 0.029672424889785977, + "grad_norm": 17.092160629461706, + "learning_rate": 2.9671857059404147e-05, + "loss": 2.7508, + "mean_token_accuracy": 0.35517241060733795, + "step": 29460 + }, + { + "epoch": 0.02967746094289015, + "grad_norm": 20.808594190083024, + "learning_rate": 2.9676893016135203e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.4401088833808899, + "step": 29465 + }, + { + "epoch": 0.029682496995994324, + "grad_norm": 16.718310803358953, + "learning_rate": 2.968192897286627e-05, + "loss": 2.5105, + "mean_token_accuracy": 0.39310344457626345, + "step": 29470 + }, + { + "epoch": 0.029687533049098495, + "grad_norm": 15.667714017122984, + "learning_rate": 2.968696492959733e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.4034482717514038, + "step": 29475 + }, + { + "epoch": 0.02969256910220267, + "grad_norm": 23.58282286190455, + "learning_rate": 2.9692000886328388e-05, + "loss": 3.0139, + "mean_token_accuracy": 0.3310344755649567, + "step": 29480 + }, + { + "epoch": 0.029697605155306842, + "grad_norm": 16.473412285425365, + "learning_rate": 2.9697036843059443e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.4206896543502808, + "step": 29485 + }, + { + "epoch": 0.029702641208411016, + "grad_norm": 12.754355032883918, + "learning_rate": 2.9702072799790503e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.4379310250282288, + "step": 29490 + }, + { + "epoch": 0.029707677261515186, + "grad_norm": 17.311328127135, + "learning_rate": 2.970710875652157e-05, + "loss": 2.7898, + "mean_token_accuracy": 0.3448275774717331, + "step": 29495 + }, + { + "epoch": 0.02971271331461936, + "grad_norm": 19.907535033061926, + "learning_rate": 2.9712144713252628e-05, + "loss": 2.8658, + "mean_token_accuracy": 0.3517241358757019, + "step": 29500 + }, + { + "epoch": 0.029717749367723534, + "grad_norm": 21.820613398937944, + "learning_rate": 2.9717180669983684e-05, + "loss": 2.5412, + "mean_token_accuracy": 0.4068965494632721, + "step": 29505 + }, + { + "epoch": 0.029722785420827704, + "grad_norm": 19.355267630451035, + "learning_rate": 2.9722216626714743e-05, + "loss": 2.442, + "mean_token_accuracy": 0.4206896543502808, + "step": 29510 + }, + { + "epoch": 0.029727821473931878, + "grad_norm": 16.268539391091032, + "learning_rate": 2.9727252583445802e-05, + "loss": 2.9293, + "mean_token_accuracy": 0.3551724135875702, + "step": 29515 + }, + { + "epoch": 0.02973285752703605, + "grad_norm": 18.52605119064113, + "learning_rate": 2.973228854017687e-05, + "loss": 2.7025, + "mean_token_accuracy": 0.42487684488296507, + "step": 29520 + }, + { + "epoch": 0.029737893580140225, + "grad_norm": 20.360358045699712, + "learning_rate": 2.9737324496907924e-05, + "loss": 2.3381, + "mean_token_accuracy": 0.4034482717514038, + "step": 29525 + }, + { + "epoch": 0.029742929633244396, + "grad_norm": 16.45291028427196, + "learning_rate": 2.9742360453638983e-05, + "loss": 2.881, + "mean_token_accuracy": 0.3172413736581802, + "step": 29530 + }, + { + "epoch": 0.02974796568634857, + "grad_norm": 18.656983633356006, + "learning_rate": 2.9747396410370043e-05, + "loss": 2.5963, + "mean_token_accuracy": 0.3827586233615875, + "step": 29535 + }, + { + "epoch": 0.029753001739452743, + "grad_norm": 22.289078598745878, + "learning_rate": 2.9752432367101102e-05, + "loss": 2.616, + "mean_token_accuracy": 0.39655172228813174, + "step": 29540 + }, + { + "epoch": 0.029758037792556914, + "grad_norm": 14.051042662660073, + "learning_rate": 2.975746832383216e-05, + "loss": 2.524, + "mean_token_accuracy": 0.4068965494632721, + "step": 29545 + }, + { + "epoch": 0.029763073845661087, + "grad_norm": 15.518998769213356, + "learning_rate": 2.9762504280563224e-05, + "loss": 2.5539, + "mean_token_accuracy": 0.43103448748588563, + "step": 29550 + }, + { + "epoch": 0.02976810989876526, + "grad_norm": 14.865696794876525, + "learning_rate": 2.9767540237294283e-05, + "loss": 2.7553, + "mean_token_accuracy": 0.38941318094730376, + "step": 29555 + }, + { + "epoch": 0.029773145951869435, + "grad_norm": 16.67474846412874, + "learning_rate": 2.9772576194025342e-05, + "loss": 2.6738, + "mean_token_accuracy": 0.38620689511299133, + "step": 29560 + }, + { + "epoch": 0.029778182004973605, + "grad_norm": 16.879283617126898, + "learning_rate": 2.97776121507564e-05, + "loss": 2.3034, + "mean_token_accuracy": 0.4379310429096222, + "step": 29565 + }, + { + "epoch": 0.02978321805807778, + "grad_norm": 17.18701211359169, + "learning_rate": 2.978264810748746e-05, + "loss": 2.4396, + "mean_token_accuracy": 0.4172413766384125, + "step": 29570 + }, + { + "epoch": 0.029788254111181953, + "grad_norm": 19.66418818138895, + "learning_rate": 2.9787684064218524e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.42413792610168455, + "step": 29575 + }, + { + "epoch": 0.029793290164286123, + "grad_norm": 19.071364303775507, + "learning_rate": 2.9792720020949583e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.3931034505367279, + "step": 29580 + }, + { + "epoch": 0.029798326217390297, + "grad_norm": 20.04572257127486, + "learning_rate": 2.9797755977680642e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.4482758641242981, + "step": 29585 + }, + { + "epoch": 0.02980336227049447, + "grad_norm": 17.401040157300297, + "learning_rate": 2.98027919344117e-05, + "loss": 2.8732, + "mean_token_accuracy": 0.38965516686439516, + "step": 29590 + }, + { + "epoch": 0.029808398323598644, + "grad_norm": 19.239256825418053, + "learning_rate": 2.980782789114276e-05, + "loss": 2.6744, + "mean_token_accuracy": 0.3999999940395355, + "step": 29595 + }, + { + "epoch": 0.029813434376702815, + "grad_norm": 15.614640271657727, + "learning_rate": 2.9812863847873823e-05, + "loss": 2.7346, + "mean_token_accuracy": 0.37241379618644715, + "step": 29600 + }, + { + "epoch": 0.02981847042980699, + "grad_norm": 19.78024501329053, + "learning_rate": 2.9817899804604882e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.3742286801338196, + "step": 29605 + }, + { + "epoch": 0.029823506482911162, + "grad_norm": 18.198249063445328, + "learning_rate": 2.9822935761335942e-05, + "loss": 2.2257, + "mean_token_accuracy": 0.4448275864124298, + "step": 29610 + }, + { + "epoch": 0.029828542536015332, + "grad_norm": 18.022695123390264, + "learning_rate": 2.9827971718067e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.4413793087005615, + "step": 29615 + }, + { + "epoch": 0.029833578589119506, + "grad_norm": 19.066042279073216, + "learning_rate": 2.9833007674798057e-05, + "loss": 2.8002, + "mean_token_accuracy": 0.31724137663841245, + "step": 29620 + }, + { + "epoch": 0.02983861464222368, + "grad_norm": 17.61657298487773, + "learning_rate": 2.9838043631529116e-05, + "loss": 2.6507, + "mean_token_accuracy": 0.39310344457626345, + "step": 29625 + }, + { + "epoch": 0.029843650695327854, + "grad_norm": 19.032397408034644, + "learning_rate": 2.9843079588260182e-05, + "loss": 1.9455, + "mean_token_accuracy": 0.5228675067424774, + "step": 29630 + }, + { + "epoch": 0.029848686748432024, + "grad_norm": 16.61916737191513, + "learning_rate": 2.984811554499124e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.41379310488700866, + "step": 29635 + }, + { + "epoch": 0.029853722801536198, + "grad_norm": 14.088103173301802, + "learning_rate": 2.9853151501722297e-05, + "loss": 2.3612, + "mean_token_accuracy": 0.4137930989265442, + "step": 29640 + }, + { + "epoch": 0.02985875885464037, + "grad_norm": 17.12606198410017, + "learning_rate": 2.9858187458453356e-05, + "loss": 2.6647, + "mean_token_accuracy": 0.37380519807338713, + "step": 29645 + }, + { + "epoch": 0.029863794907744542, + "grad_norm": 16.250140036134383, + "learning_rate": 2.9863223415184416e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.4674531161785126, + "step": 29650 + }, + { + "epoch": 0.029868830960848716, + "grad_norm": 21.7110570919871, + "learning_rate": 2.9868259371915482e-05, + "loss": 3.1896, + "mean_token_accuracy": 0.3551724135875702, + "step": 29655 + }, + { + "epoch": 0.02987386701395289, + "grad_norm": 16.9838557980717, + "learning_rate": 2.9873295328646538e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.41034482717514037, + "step": 29660 + }, + { + "epoch": 0.029878903067057063, + "grad_norm": 16.700492292832386, + "learning_rate": 2.9878331285377597e-05, + "loss": 2.8717, + "mean_token_accuracy": 0.36551723480224607, + "step": 29665 + }, + { + "epoch": 0.029883939120161233, + "grad_norm": 17.83220247346024, + "learning_rate": 2.9883367242108656e-05, + "loss": 2.737, + "mean_token_accuracy": 0.4103448212146759, + "step": 29670 + }, + { + "epoch": 0.029888975173265407, + "grad_norm": 16.586435854450247, + "learning_rate": 2.9888403198839715e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.47586206793785096, + "step": 29675 + }, + { + "epoch": 0.02989401122636958, + "grad_norm": 19.154691351933703, + "learning_rate": 2.9893439155570778e-05, + "loss": 2.497, + "mean_token_accuracy": 0.4206896543502808, + "step": 29680 + }, + { + "epoch": 0.02989904727947375, + "grad_norm": 14.061548529639017, + "learning_rate": 2.9898475112301837e-05, + "loss": 2.7822, + "mean_token_accuracy": 0.3430127054452896, + "step": 29685 + }, + { + "epoch": 0.029904083332577925, + "grad_norm": 18.64674808087456, + "learning_rate": 2.9903511069032897e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.4344827592372894, + "step": 29690 + }, + { + "epoch": 0.0299091193856821, + "grad_norm": 15.245902590548736, + "learning_rate": 2.9908547025763956e-05, + "loss": 2.482, + "mean_token_accuracy": 0.4517241418361664, + "step": 29695 + }, + { + "epoch": 0.029914155438786273, + "grad_norm": 16.128631436923406, + "learning_rate": 2.9913582982495015e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.4586206912994385, + "step": 29700 + }, + { + "epoch": 0.029919191491890443, + "grad_norm": 21.582082133515264, + "learning_rate": 2.9918618939226074e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.43793103098869324, + "step": 29705 + }, + { + "epoch": 0.029924227544994617, + "grad_norm": 15.086930408636556, + "learning_rate": 2.9923654895957137e-05, + "loss": 2.3631, + "mean_token_accuracy": 0.4517241418361664, + "step": 29710 + }, + { + "epoch": 0.02992926359809879, + "grad_norm": 16.38705304501124, + "learning_rate": 2.9928690852688196e-05, + "loss": 2.5819, + "mean_token_accuracy": 0.4103448152542114, + "step": 29715 + }, + { + "epoch": 0.02993429965120296, + "grad_norm": 17.330242075186415, + "learning_rate": 2.9933726809419255e-05, + "loss": 2.4724, + "mean_token_accuracy": 0.42413792610168455, + "step": 29720 + }, + { + "epoch": 0.029939335704307134, + "grad_norm": 18.19344825965578, + "learning_rate": 2.9938762766150315e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.4551724135875702, + "step": 29725 + }, + { + "epoch": 0.029944371757411308, + "grad_norm": 14.256496596262643, + "learning_rate": 2.994379872288137e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.46739262342453003, + "step": 29730 + }, + { + "epoch": 0.029949407810515482, + "grad_norm": 19.86412220122882, + "learning_rate": 2.9948834679612437e-05, + "loss": 2.2721, + "mean_token_accuracy": 0.3965517282485962, + "step": 29735 + }, + { + "epoch": 0.029954443863619652, + "grad_norm": 15.73026555272646, + "learning_rate": 2.9953870636343496e-05, + "loss": 2.1618, + "mean_token_accuracy": 0.4413793087005615, + "step": 29740 + }, + { + "epoch": 0.029959479916723826, + "grad_norm": 18.771153913685314, + "learning_rate": 2.9958906593074555e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.4034482717514038, + "step": 29745 + }, + { + "epoch": 0.029964515969828, + "grad_norm": 14.665095972754285, + "learning_rate": 2.996394254980561e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.4194797396659851, + "step": 29750 + }, + { + "epoch": 0.02996955202293217, + "grad_norm": 18.62029184840673, + "learning_rate": 2.996897850653667e-05, + "loss": 2.6521, + "mean_token_accuracy": 0.4034482717514038, + "step": 29755 + }, + { + "epoch": 0.029974588076036344, + "grad_norm": 15.597494328345055, + "learning_rate": 2.9974014463267736e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.3620689630508423, + "step": 29760 + }, + { + "epoch": 0.029979624129140518, + "grad_norm": 17.466229073595606, + "learning_rate": 2.9979050419998795e-05, + "loss": 2.5123, + "mean_token_accuracy": 0.4206896543502808, + "step": 29765 + }, + { + "epoch": 0.02998466018224469, + "grad_norm": 15.80912650425274, + "learning_rate": 2.998408637672985e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.42758620977401735, + "step": 29770 + }, + { + "epoch": 0.02998969623534886, + "grad_norm": 17.016601170452937, + "learning_rate": 2.998912233346091e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.44482759237289426, + "step": 29775 + }, + { + "epoch": 0.029994732288453035, + "grad_norm": 15.918434279894857, + "learning_rate": 2.999415829019197e-05, + "loss": 2.5996, + "mean_token_accuracy": 0.36896551251411436, + "step": 29780 + }, + { + "epoch": 0.02999976834155721, + "grad_norm": 17.94975754946306, + "learning_rate": 2.999919424692303e-05, + "loss": 2.6092, + "mean_token_accuracy": 0.37586206793785093, + "step": 29785 + }, + { + "epoch": 0.03000480439466138, + "grad_norm": 19.641000207729128, + "learning_rate": 3.0004230203654092e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.3915305435657501, + "step": 29790 + }, + { + "epoch": 0.030009840447765553, + "grad_norm": 14.956762460606019, + "learning_rate": 3.000926616038515e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.40344828069210054, + "step": 29795 + }, + { + "epoch": 0.030014876500869727, + "grad_norm": 19.319891287401614, + "learning_rate": 3.001430211711621e-05, + "loss": 2.5014, + "mean_token_accuracy": 0.4068965554237366, + "step": 29800 + }, + { + "epoch": 0.0300199125539739, + "grad_norm": 17.179243454324823, + "learning_rate": 3.001933807384727e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.40852994918823243, + "step": 29805 + }, + { + "epoch": 0.03002494860707807, + "grad_norm": 18.142029027307785, + "learning_rate": 3.002437403057833e-05, + "loss": 2.9119, + "mean_token_accuracy": 0.3103448271751404, + "step": 29810 + }, + { + "epoch": 0.030029984660182245, + "grad_norm": 17.313526864754152, + "learning_rate": 3.002940998730939e-05, + "loss": 2.8225, + "mean_token_accuracy": 0.38965516686439516, + "step": 29815 + }, + { + "epoch": 0.03003502071328642, + "grad_norm": 20.309705975662062, + "learning_rate": 3.003444594404045e-05, + "loss": 2.5784, + "mean_token_accuracy": 0.40000000298023225, + "step": 29820 + }, + { + "epoch": 0.03004005676639059, + "grad_norm": 17.34009037719211, + "learning_rate": 3.003948190077151e-05, + "loss": 2.3544, + "mean_token_accuracy": 0.4517241358757019, + "step": 29825 + }, + { + "epoch": 0.030045092819494763, + "grad_norm": 16.174106377404456, + "learning_rate": 3.004451785750257e-05, + "loss": 2.3002, + "mean_token_accuracy": 0.4344827592372894, + "step": 29830 + }, + { + "epoch": 0.030050128872598936, + "grad_norm": 15.652330903316003, + "learning_rate": 3.004955381423363e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.37586206793785093, + "step": 29835 + }, + { + "epoch": 0.03005516492570311, + "grad_norm": 16.259342048104294, + "learning_rate": 3.005458977096469e-05, + "loss": 2.1563, + "mean_token_accuracy": 0.4359346628189087, + "step": 29840 + }, + { + "epoch": 0.03006020097880728, + "grad_norm": 11.828177056235981, + "learning_rate": 3.005962572769575e-05, + "loss": 1.9828, + "mean_token_accuracy": 0.517241370677948, + "step": 29845 + }, + { + "epoch": 0.030065237031911454, + "grad_norm": 14.157524797239157, + "learning_rate": 3.006466168442681e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.441379314661026, + "step": 29850 + }, + { + "epoch": 0.030070273085015628, + "grad_norm": 15.347764881080051, + "learning_rate": 3.006969764115787e-05, + "loss": 2.4233, + "mean_token_accuracy": 0.4068965494632721, + "step": 29855 + }, + { + "epoch": 0.0300753091381198, + "grad_norm": 15.062032454511112, + "learning_rate": 3.0074733597888928e-05, + "loss": 2.5, + "mean_token_accuracy": 0.4284482717514038, + "step": 29860 + }, + { + "epoch": 0.030080345191223972, + "grad_norm": 17.840264001558253, + "learning_rate": 3.007976955461999e-05, + "loss": 2.6764, + "mean_token_accuracy": 0.3931034505367279, + "step": 29865 + }, + { + "epoch": 0.030085381244328146, + "grad_norm": 16.351636256178526, + "learning_rate": 3.008480551135105e-05, + "loss": 2.5083, + "mean_token_accuracy": 0.4034482777118683, + "step": 29870 + }, + { + "epoch": 0.03009041729743232, + "grad_norm": 15.206703634685407, + "learning_rate": 3.008984146808211e-05, + "loss": 2.0618, + "mean_token_accuracy": 0.5034482777118683, + "step": 29875 + }, + { + "epoch": 0.03009545335053649, + "grad_norm": 18.248234717958244, + "learning_rate": 3.009487742481317e-05, + "loss": 2.2608, + "mean_token_accuracy": 0.41834974884986875, + "step": 29880 + }, + { + "epoch": 0.030100489403640664, + "grad_norm": 17.786917365373313, + "learning_rate": 3.0099913381544224e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.38620689511299133, + "step": 29885 + }, + { + "epoch": 0.030105525456744837, + "grad_norm": 18.0308470227239, + "learning_rate": 3.0104949338275284e-05, + "loss": 2.8619, + "mean_token_accuracy": 0.37241379022598264, + "step": 29890 + }, + { + "epoch": 0.030110561509849008, + "grad_norm": 17.387733731842374, + "learning_rate": 3.010998529500635e-05, + "loss": 2.5135, + "mean_token_accuracy": 0.4159104645252228, + "step": 29895 + }, + { + "epoch": 0.03011559756295318, + "grad_norm": 16.76268381363948, + "learning_rate": 3.011502125173741e-05, + "loss": 2.2325, + "mean_token_accuracy": 0.4551724076271057, + "step": 29900 + }, + { + "epoch": 0.030120633616057355, + "grad_norm": 16.49212219374332, + "learning_rate": 3.0120057208468465e-05, + "loss": 1.9437, + "mean_token_accuracy": 0.493103438615799, + "step": 29905 + }, + { + "epoch": 0.03012566966916153, + "grad_norm": 16.963242910904718, + "learning_rate": 3.0125093165199524e-05, + "loss": 2.4113, + "mean_token_accuracy": 0.43448275327682495, + "step": 29910 + }, + { + "epoch": 0.0301307057222657, + "grad_norm": 16.3011736103217, + "learning_rate": 3.0130129121930583e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.4620689690113068, + "step": 29915 + }, + { + "epoch": 0.030135741775369873, + "grad_norm": 15.521354774392584, + "learning_rate": 3.013516507866165e-05, + "loss": 2.919, + "mean_token_accuracy": 0.38275861740112305, + "step": 29920 + }, + { + "epoch": 0.030140777828474047, + "grad_norm": 14.40645687979916, + "learning_rate": 3.0140201035392705e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.42758620977401735, + "step": 29925 + }, + { + "epoch": 0.030145813881578217, + "grad_norm": 17.518773214470855, + "learning_rate": 3.0145236992123764e-05, + "loss": 2.563, + "mean_token_accuracy": 0.4344827651977539, + "step": 29930 + }, + { + "epoch": 0.03015084993468239, + "grad_norm": 20.584196523026073, + "learning_rate": 3.0150272948854824e-05, + "loss": 2.6491, + "mean_token_accuracy": 0.4157289803028107, + "step": 29935 + }, + { + "epoch": 0.030155885987786565, + "grad_norm": 27.52294978301983, + "learning_rate": 3.0155308905585883e-05, + "loss": 2.7911, + "mean_token_accuracy": 0.37931033968925476, + "step": 29940 + }, + { + "epoch": 0.03016092204089074, + "grad_norm": 17.442600571078536, + "learning_rate": 3.0160344862316946e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.35862069129943847, + "step": 29945 + }, + { + "epoch": 0.03016595809399491, + "grad_norm": 14.196915889916598, + "learning_rate": 3.0165380819048005e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.3931034505367279, + "step": 29950 + }, + { + "epoch": 0.030170994147099083, + "grad_norm": 14.369678741541954, + "learning_rate": 3.0170416775779064e-05, + "loss": 2.7933, + "mean_token_accuracy": 0.4000000059604645, + "step": 29955 + }, + { + "epoch": 0.030176030200203256, + "grad_norm": 16.642794135234677, + "learning_rate": 3.0175452732510123e-05, + "loss": 2.5102, + "mean_token_accuracy": 0.39655172228813174, + "step": 29960 + }, + { + "epoch": 0.030181066253307427, + "grad_norm": 14.283556090151132, + "learning_rate": 3.0180488689241183e-05, + "loss": 2.5282, + "mean_token_accuracy": 0.43103447556495667, + "step": 29965 + }, + { + "epoch": 0.0301861023064116, + "grad_norm": 15.677405826508265, + "learning_rate": 3.0185524645972242e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.42068964838981626, + "step": 29970 + }, + { + "epoch": 0.030191138359515774, + "grad_norm": 15.490275639668234, + "learning_rate": 3.0190560602703304e-05, + "loss": 2.5581, + "mean_token_accuracy": 0.3827586114406586, + "step": 29975 + }, + { + "epoch": 0.030196174412619948, + "grad_norm": 16.035419940742443, + "learning_rate": 3.0195596559434364e-05, + "loss": 2.5398, + "mean_token_accuracy": 0.3965517163276672, + "step": 29980 + }, + { + "epoch": 0.030201210465724118, + "grad_norm": 19.11193119487132, + "learning_rate": 3.0200632516165423e-05, + "loss": 2.9306, + "mean_token_accuracy": 0.3241379201412201, + "step": 29985 + }, + { + "epoch": 0.030206246518828292, + "grad_norm": 15.232559623867571, + "learning_rate": 3.0205668472896482e-05, + "loss": 2.4816, + "mean_token_accuracy": 0.4000000059604645, + "step": 29990 + }, + { + "epoch": 0.030211282571932466, + "grad_norm": 16.198606256037586, + "learning_rate": 3.021070442962754e-05, + "loss": 2.4395, + "mean_token_accuracy": 0.4310344815254211, + "step": 29995 + }, + { + "epoch": 0.030216318625036636, + "grad_norm": 13.775571798437669, + "learning_rate": 3.0215740386358604e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.4034482777118683, + "step": 30000 + }, + { + "epoch": 0.03022135467814081, + "grad_norm": 15.47556768242462, + "learning_rate": 3.0220776343089663e-05, + "loss": 2.3931, + "mean_token_accuracy": 0.4017543911933899, + "step": 30005 + }, + { + "epoch": 0.030226390731244984, + "grad_norm": 15.610516771316016, + "learning_rate": 3.0225812299820723e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.4344827592372894, + "step": 30010 + }, + { + "epoch": 0.030231426784349157, + "grad_norm": 16.534416231849484, + "learning_rate": 3.0230848256551782e-05, + "loss": 2.6057, + "mean_token_accuracy": 0.36896551251411436, + "step": 30015 + }, + { + "epoch": 0.030236462837453328, + "grad_norm": 17.74059914239265, + "learning_rate": 3.0235884213282838e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.41034482717514037, + "step": 30020 + }, + { + "epoch": 0.0302414988905575, + "grad_norm": 14.169490247079645, + "learning_rate": 3.0240920170013904e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.42068964838981626, + "step": 30025 + }, + { + "epoch": 0.030246534943661675, + "grad_norm": 19.59932686888348, + "learning_rate": 3.0245956126744963e-05, + "loss": 2.8079, + "mean_token_accuracy": 0.37931033968925476, + "step": 30030 + }, + { + "epoch": 0.030251570996765845, + "grad_norm": 18.494276347302904, + "learning_rate": 3.0250992083476022e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.36206896901130675, + "step": 30035 + }, + { + "epoch": 0.03025660704987002, + "grad_norm": 13.708368034542126, + "learning_rate": 3.0256028040207078e-05, + "loss": 2.6655, + "mean_token_accuracy": 0.41724138259887694, + "step": 30040 + }, + { + "epoch": 0.030261643102974193, + "grad_norm": 15.560306497404676, + "learning_rate": 3.0261063996938137e-05, + "loss": 2.393, + "mean_token_accuracy": 0.42758620381355283, + "step": 30045 + }, + { + "epoch": 0.030266679156078367, + "grad_norm": 19.125402155374278, + "learning_rate": 3.0266099953669197e-05, + "loss": 2.5258, + "mean_token_accuracy": 0.42068966031074523, + "step": 30050 + }, + { + "epoch": 0.030271715209182537, + "grad_norm": 17.04897351989556, + "learning_rate": 3.0271135910400263e-05, + "loss": 2.3408, + "mean_token_accuracy": 0.4586206912994385, + "step": 30055 + }, + { + "epoch": 0.03027675126228671, + "grad_norm": 16.310581901041303, + "learning_rate": 3.027617186713132e-05, + "loss": 2.7625, + "mean_token_accuracy": 0.34482758939266206, + "step": 30060 + }, + { + "epoch": 0.030281787315390885, + "grad_norm": 17.67124169396695, + "learning_rate": 3.0281207823862378e-05, + "loss": 2.8017, + "mean_token_accuracy": 0.3517241358757019, + "step": 30065 + }, + { + "epoch": 0.030286823368495055, + "grad_norm": 14.35494483560042, + "learning_rate": 3.0286243780593437e-05, + "loss": 1.952, + "mean_token_accuracy": 0.5288177132606506, + "step": 30070 + }, + { + "epoch": 0.03029185942159923, + "grad_norm": 13.329866981371957, + "learning_rate": 3.0291279737324496e-05, + "loss": 2.0153, + "mean_token_accuracy": 0.5803387761116028, + "step": 30075 + }, + { + "epoch": 0.030296895474703402, + "grad_norm": 16.091686005906237, + "learning_rate": 3.029631569405556e-05, + "loss": 2.758, + "mean_token_accuracy": 0.39310345649719236, + "step": 30080 + }, + { + "epoch": 0.030301931527807576, + "grad_norm": 16.04852341788052, + "learning_rate": 3.0301351650786618e-05, + "loss": 2.4701, + "mean_token_accuracy": 0.4034482777118683, + "step": 30085 + }, + { + "epoch": 0.030306967580911746, + "grad_norm": 24.079155564758615, + "learning_rate": 3.0306387607517677e-05, + "loss": 2.6988, + "mean_token_accuracy": 0.35862069129943847, + "step": 30090 + }, + { + "epoch": 0.03031200363401592, + "grad_norm": 15.189021805844565, + "learning_rate": 3.0311423564248737e-05, + "loss": 2.9328, + "mean_token_accuracy": 0.3517241358757019, + "step": 30095 + }, + { + "epoch": 0.030317039687120094, + "grad_norm": 17.214922066922952, + "learning_rate": 3.0316459520979796e-05, + "loss": 2.1698, + "mean_token_accuracy": 0.4620689630508423, + "step": 30100 + }, + { + "epoch": 0.030322075740224264, + "grad_norm": 13.732801799539205, + "learning_rate": 3.032149547771086e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.4068965554237366, + "step": 30105 + }, + { + "epoch": 0.030327111793328438, + "grad_norm": 26.395799560108106, + "learning_rate": 3.0326531434441918e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.4448275864124298, + "step": 30110 + }, + { + "epoch": 0.030332147846432612, + "grad_norm": 18.3000158028024, + "learning_rate": 3.0331567391172977e-05, + "loss": 2.6657, + "mean_token_accuracy": 0.358620685338974, + "step": 30115 + }, + { + "epoch": 0.030337183899536786, + "grad_norm": 19.960077024058872, + "learning_rate": 3.0336603347904036e-05, + "loss": 2.9329, + "mean_token_accuracy": 0.36551724672317504, + "step": 30120 + }, + { + "epoch": 0.030342219952640956, + "grad_norm": 15.363901624499073, + "learning_rate": 3.0341639304635096e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.43793103098869324, + "step": 30125 + }, + { + "epoch": 0.03034725600574513, + "grad_norm": 38.208009406439, + "learning_rate": 3.034667526136615e-05, + "loss": 2.8535, + "mean_token_accuracy": 0.38620689511299133, + "step": 30130 + }, + { + "epoch": 0.030352292058849303, + "grad_norm": 17.565701736988956, + "learning_rate": 3.0351711218097217e-05, + "loss": 2.7111, + "mean_token_accuracy": 0.4068965554237366, + "step": 30135 + }, + { + "epoch": 0.030357328111953474, + "grad_norm": 15.456861743132727, + "learning_rate": 3.0356747174828277e-05, + "loss": 2.438, + "mean_token_accuracy": 0.4551724135875702, + "step": 30140 + }, + { + "epoch": 0.030362364165057647, + "grad_norm": 14.078903006187124, + "learning_rate": 3.0361783131559336e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.43448275327682495, + "step": 30145 + }, + { + "epoch": 0.03036740021816182, + "grad_norm": 15.9738717494348, + "learning_rate": 3.0366819088290395e-05, + "loss": 2.6611, + "mean_token_accuracy": 0.3896551728248596, + "step": 30150 + }, + { + "epoch": 0.030372436271265995, + "grad_norm": 18.7889276627096, + "learning_rate": 3.037185504502145e-05, + "loss": 2.6185, + "mean_token_accuracy": 0.3931034505367279, + "step": 30155 + }, + { + "epoch": 0.030377472324370165, + "grad_norm": 16.74827760524287, + "learning_rate": 3.0376891001752517e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.3793103456497192, + "step": 30160 + }, + { + "epoch": 0.03038250837747434, + "grad_norm": 15.512556359765467, + "learning_rate": 3.0381926958483576e-05, + "loss": 2.3699, + "mean_token_accuracy": 0.41379311084747317, + "step": 30165 + }, + { + "epoch": 0.030387544430578513, + "grad_norm": 15.730724119177593, + "learning_rate": 3.0386962915214636e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.4344827592372894, + "step": 30170 + }, + { + "epoch": 0.030392580483682683, + "grad_norm": 16.212270092864646, + "learning_rate": 3.039199887194569e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.4206896543502808, + "step": 30175 + }, + { + "epoch": 0.030397616536786857, + "grad_norm": 17.519953752298726, + "learning_rate": 3.039703482867675e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.41379311084747317, + "step": 30180 + }, + { + "epoch": 0.03040265258989103, + "grad_norm": 29.649742037479474, + "learning_rate": 3.0402070785407817e-05, + "loss": 2.609, + "mean_token_accuracy": 0.42068966031074523, + "step": 30185 + }, + { + "epoch": 0.030407688642995204, + "grad_norm": 19.58096962696459, + "learning_rate": 3.0407106742138876e-05, + "loss": 2.7538, + "mean_token_accuracy": 0.3620689660310745, + "step": 30190 + }, + { + "epoch": 0.030412724696099375, + "grad_norm": 13.510709216510898, + "learning_rate": 3.0412142698869932e-05, + "loss": 2.1908, + "mean_token_accuracy": 0.4676346004009247, + "step": 30195 + }, + { + "epoch": 0.03041776074920355, + "grad_norm": 16.04623414307215, + "learning_rate": 3.041717865560099e-05, + "loss": 2.7714, + "mean_token_accuracy": 0.36206896901130675, + "step": 30200 + }, + { + "epoch": 0.030422796802307722, + "grad_norm": 22.567171505752484, + "learning_rate": 3.042221461233205e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.4401088893413544, + "step": 30205 + }, + { + "epoch": 0.030427832855411893, + "grad_norm": 15.953017020592727, + "learning_rate": 3.042725056906311e-05, + "loss": 2.594, + "mean_token_accuracy": 0.3724137842655182, + "step": 30210 + }, + { + "epoch": 0.030432868908516066, + "grad_norm": 14.50691462715041, + "learning_rate": 3.0432286525794172e-05, + "loss": 2.4375, + "mean_token_accuracy": 0.4, + "step": 30215 + }, + { + "epoch": 0.03043790496162024, + "grad_norm": 22.839055709872014, + "learning_rate": 3.043732248252523e-05, + "loss": 2.4893, + "mean_token_accuracy": 0.4379310369491577, + "step": 30220 + }, + { + "epoch": 0.03044294101472441, + "grad_norm": 17.269660513922705, + "learning_rate": 3.044235843925629e-05, + "loss": 2.4767, + "mean_token_accuracy": 0.37241379022598264, + "step": 30225 + }, + { + "epoch": 0.030447977067828584, + "grad_norm": 15.95771989079032, + "learning_rate": 3.044739439598735e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.41034482717514037, + "step": 30230 + }, + { + "epoch": 0.030453013120932758, + "grad_norm": 15.243007704133149, + "learning_rate": 3.045243035271841e-05, + "loss": 2.4553, + "mean_token_accuracy": 0.4172413766384125, + "step": 30235 + }, + { + "epoch": 0.03045804917403693, + "grad_norm": 17.981460307537475, + "learning_rate": 3.0457466309449472e-05, + "loss": 2.5478, + "mean_token_accuracy": 0.3689655065536499, + "step": 30240 + }, + { + "epoch": 0.030463085227141102, + "grad_norm": 14.888134765541642, + "learning_rate": 3.046250226618053e-05, + "loss": 2.3795, + "mean_token_accuracy": 0.42758620977401735, + "step": 30245 + }, + { + "epoch": 0.030468121280245276, + "grad_norm": 12.79576078181957, + "learning_rate": 3.046753822291159e-05, + "loss": 2.654, + "mean_token_accuracy": 0.38275861740112305, + "step": 30250 + }, + { + "epoch": 0.03047315733334945, + "grad_norm": 15.769688200831252, + "learning_rate": 3.047257417964265e-05, + "loss": 2.1658, + "mean_token_accuracy": 0.4586206912994385, + "step": 30255 + }, + { + "epoch": 0.03047819338645362, + "grad_norm": 19.879233265645915, + "learning_rate": 3.047761013637371e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.47761645913124084, + "step": 30260 + }, + { + "epoch": 0.030483229439557794, + "grad_norm": 19.374864888797102, + "learning_rate": 3.048264609310477e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.4137930989265442, + "step": 30265 + }, + { + "epoch": 0.030488265492661967, + "grad_norm": 18.35155632261849, + "learning_rate": 3.048768204983583e-05, + "loss": 2.5171, + "mean_token_accuracy": 0.37586206793785093, + "step": 30270 + }, + { + "epoch": 0.03049330154576614, + "grad_norm": 13.661640151285418, + "learning_rate": 3.049271800656689e-05, + "loss": 2.59, + "mean_token_accuracy": 0.4172413766384125, + "step": 30275 + }, + { + "epoch": 0.03049833759887031, + "grad_norm": 16.919438216718486, + "learning_rate": 3.049775396329795e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.35172413289546967, + "step": 30280 + }, + { + "epoch": 0.030503373651974485, + "grad_norm": 17.411347244492763, + "learning_rate": 3.0502789920029005e-05, + "loss": 2.4037, + "mean_token_accuracy": 0.39655172228813174, + "step": 30285 + }, + { + "epoch": 0.03050840970507866, + "grad_norm": 15.887142689578116, + "learning_rate": 3.050782587676007e-05, + "loss": 2.8974, + "mean_token_accuracy": 0.3551724135875702, + "step": 30290 + }, + { + "epoch": 0.03051344575818283, + "grad_norm": 14.175207824430862, + "learning_rate": 3.051286183349113e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.43103448748588563, + "step": 30295 + }, + { + "epoch": 0.030518481811287003, + "grad_norm": 16.714323267250684, + "learning_rate": 3.051789779022219e-05, + "loss": 2.8999, + "mean_token_accuracy": 0.35862069129943847, + "step": 30300 + }, + { + "epoch": 0.030523517864391177, + "grad_norm": 15.660011179155827, + "learning_rate": 3.052293374695325e-05, + "loss": 2.509, + "mean_token_accuracy": 0.3862068891525269, + "step": 30305 + }, + { + "epoch": 0.03052855391749535, + "grad_norm": 15.026243148831282, + "learning_rate": 3.052796970368431e-05, + "loss": 2.6013, + "mean_token_accuracy": 0.38965516686439516, + "step": 30310 + }, + { + "epoch": 0.03053358997059952, + "grad_norm": 15.477787044518426, + "learning_rate": 3.053300566041537e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.3999999940395355, + "step": 30315 + }, + { + "epoch": 0.030538626023703695, + "grad_norm": 12.81301766343412, + "learning_rate": 3.053804161714643e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.4413793087005615, + "step": 30320 + }, + { + "epoch": 0.03054366207680787, + "grad_norm": 16.990597685467748, + "learning_rate": 3.0543077573877486e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.38620689511299133, + "step": 30325 + }, + { + "epoch": 0.03054869812991204, + "grad_norm": 21.61798670004041, + "learning_rate": 3.0548113530608545e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.42758620381355283, + "step": 30330 + }, + { + "epoch": 0.030553734183016212, + "grad_norm": 14.460003232537366, + "learning_rate": 3.0553149487339605e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.3999999940395355, + "step": 30335 + }, + { + "epoch": 0.030558770236120386, + "grad_norm": 16.160264494543526, + "learning_rate": 3.0558185444070664e-05, + "loss": 2.6603, + "mean_token_accuracy": 0.4, + "step": 30340 + }, + { + "epoch": 0.03056380628922456, + "grad_norm": 13.953997206180517, + "learning_rate": 3.056322140080173e-05, + "loss": 2.8845, + "mean_token_accuracy": 0.3379310369491577, + "step": 30345 + }, + { + "epoch": 0.03056884234232873, + "grad_norm": 16.906129798351298, + "learning_rate": 3.056825735753279e-05, + "loss": 2.5456, + "mean_token_accuracy": 0.4068965554237366, + "step": 30350 + }, + { + "epoch": 0.030573878395432904, + "grad_norm": 22.367706679008347, + "learning_rate": 3.057329331426385e-05, + "loss": 3.0359, + "mean_token_accuracy": 0.34137930870056155, + "step": 30355 + }, + { + "epoch": 0.030578914448537078, + "grad_norm": 16.374189070091447, + "learning_rate": 3.057832927099491e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.44482759237289426, + "step": 30360 + }, + { + "epoch": 0.030583950501641248, + "grad_norm": 16.1528402456324, + "learning_rate": 3.058336522772596e-05, + "loss": 3.1152, + "mean_token_accuracy": 0.38275861740112305, + "step": 30365 + }, + { + "epoch": 0.030588986554745422, + "grad_norm": 18.890461062556426, + "learning_rate": 3.0588401184457026e-05, + "loss": 2.5253, + "mean_token_accuracy": 0.4034482777118683, + "step": 30370 + }, + { + "epoch": 0.030594022607849596, + "grad_norm": 14.805205464983276, + "learning_rate": 3.0593437141188085e-05, + "loss": 2.4996, + "mean_token_accuracy": 0.4517241299152374, + "step": 30375 + }, + { + "epoch": 0.03059905866095377, + "grad_norm": 22.203441800411206, + "learning_rate": 3.0598473097919145e-05, + "loss": 2.7833, + "mean_token_accuracy": 0.3517241388559341, + "step": 30380 + }, + { + "epoch": 0.03060409471405794, + "grad_norm": 17.692845895326847, + "learning_rate": 3.0603509054650204e-05, + "loss": 2.5453, + "mean_token_accuracy": 0.3551724135875702, + "step": 30385 + }, + { + "epoch": 0.030609130767162113, + "grad_norm": 14.856504019239393, + "learning_rate": 3.060854501138126e-05, + "loss": 2.5186, + "mean_token_accuracy": 0.41379310488700866, + "step": 30390 + }, + { + "epoch": 0.030614166820266287, + "grad_norm": 16.471938316044767, + "learning_rate": 3.061358096811232e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.42413792610168455, + "step": 30395 + }, + { + "epoch": 0.030619202873370457, + "grad_norm": 19.997662772004308, + "learning_rate": 3.061861692484339e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.39655172228813174, + "step": 30400 + }, + { + "epoch": 0.03062423892647463, + "grad_norm": 15.758670815001214, + "learning_rate": 3.062365288157444e-05, + "loss": 2.6411, + "mean_token_accuracy": 0.36013308763504026, + "step": 30405 + }, + { + "epoch": 0.030629274979578805, + "grad_norm": 16.511323955616472, + "learning_rate": 3.06286888383055e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.3793103456497192, + "step": 30410 + }, + { + "epoch": 0.03063431103268298, + "grad_norm": 20.626581644952743, + "learning_rate": 3.063372479503656e-05, + "loss": 2.6776, + "mean_token_accuracy": 0.36896551847457887, + "step": 30415 + }, + { + "epoch": 0.03063934708578715, + "grad_norm": 12.598339861279566, + "learning_rate": 3.063876075176762e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4448275864124298, + "step": 30420 + }, + { + "epoch": 0.030644383138891323, + "grad_norm": 14.918041179807698, + "learning_rate": 3.0643796708498685e-05, + "loss": 2.2334, + "mean_token_accuracy": 0.44325469732284545, + "step": 30425 + }, + { + "epoch": 0.030649419191995497, + "grad_norm": 17.671264615316996, + "learning_rate": 3.0648832665229744e-05, + "loss": 2.5784, + "mean_token_accuracy": 0.42758620381355283, + "step": 30430 + }, + { + "epoch": 0.030654455245099667, + "grad_norm": 18.986090409974594, + "learning_rate": 3.06538686219608e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.37586206793785093, + "step": 30435 + }, + { + "epoch": 0.03065949129820384, + "grad_norm": 17.397741861583288, + "learning_rate": 3.065890457869186e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.34137930274009703, + "step": 30440 + }, + { + "epoch": 0.030664527351308014, + "grad_norm": 24.779265675093676, + "learning_rate": 3.066394053542292e-05, + "loss": 2.7733, + "mean_token_accuracy": 0.36896551251411436, + "step": 30445 + }, + { + "epoch": 0.030669563404412188, + "grad_norm": 16.746640332292042, + "learning_rate": 3.066897649215398e-05, + "loss": 2.6052, + "mean_token_accuracy": 0.42758620381355283, + "step": 30450 + }, + { + "epoch": 0.03067459945751636, + "grad_norm": 15.739215833573613, + "learning_rate": 3.067401244888504e-05, + "loss": 2.5325, + "mean_token_accuracy": 0.38965516686439516, + "step": 30455 + }, + { + "epoch": 0.030679635510620532, + "grad_norm": 20.508931044024585, + "learning_rate": 3.06790484056161e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.4275861978530884, + "step": 30460 + }, + { + "epoch": 0.030684671563724706, + "grad_norm": 15.626797405280513, + "learning_rate": 3.068408436234716e-05, + "loss": 2.35, + "mean_token_accuracy": 0.4379310369491577, + "step": 30465 + }, + { + "epoch": 0.030689707616828876, + "grad_norm": 14.080461642415779, + "learning_rate": 3.068912031907822e-05, + "loss": 2.3851, + "mean_token_accuracy": 0.44016939401626587, + "step": 30470 + }, + { + "epoch": 0.03069474366993305, + "grad_norm": 16.52376324748906, + "learning_rate": 3.069415627580928e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.46721113920211793, + "step": 30475 + }, + { + "epoch": 0.030699779723037224, + "grad_norm": 12.847063002731312, + "learning_rate": 3.069919223254034e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.4310344815254211, + "step": 30480 + }, + { + "epoch": 0.030704815776141398, + "grad_norm": 16.868078186118225, + "learning_rate": 3.07042281892714e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.3896551728248596, + "step": 30485 + }, + { + "epoch": 0.030709851829245568, + "grad_norm": 19.802893405374796, + "learning_rate": 3.070926414600246e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.43448275327682495, + "step": 30490 + }, + { + "epoch": 0.03071488788234974, + "grad_norm": 19.087306139130025, + "learning_rate": 3.0714300102733514e-05, + "loss": 3.2061, + "mean_token_accuracy": 0.3551724135875702, + "step": 30495 + }, + { + "epoch": 0.030719923935453915, + "grad_norm": 18.61903663997251, + "learning_rate": 3.0719336059464573e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.4206896543502808, + "step": 30500 + }, + { + "epoch": 0.030724959988558086, + "grad_norm": 16.494561291905015, + "learning_rate": 3.072437201619564e-05, + "loss": 2.445, + "mean_token_accuracy": 0.4310344934463501, + "step": 30505 + }, + { + "epoch": 0.03072999604166226, + "grad_norm": 22.52249822536078, + "learning_rate": 3.07294079729267e-05, + "loss": 2.8711, + "mean_token_accuracy": 0.4034482777118683, + "step": 30510 + }, + { + "epoch": 0.030735032094766433, + "grad_norm": 26.119276238155425, + "learning_rate": 3.073444392965776e-05, + "loss": 2.5825, + "mean_token_accuracy": 0.4068965494632721, + "step": 30515 + }, + { + "epoch": 0.030740068147870607, + "grad_norm": 20.99555841081645, + "learning_rate": 3.073947988638882e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.41379310488700866, + "step": 30520 + }, + { + "epoch": 0.030745104200974777, + "grad_norm": 18.099366968560343, + "learning_rate": 3.0744515843119876e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.42413792610168455, + "step": 30525 + }, + { + "epoch": 0.03075014025407895, + "grad_norm": 16.669622341003723, + "learning_rate": 3.074955179985094e-05, + "loss": 2.3943, + "mean_token_accuracy": 0.4655172526836395, + "step": 30530 + }, + { + "epoch": 0.030755176307183125, + "grad_norm": 16.040129049030103, + "learning_rate": 3.0754587756582e-05, + "loss": 2.4163, + "mean_token_accuracy": 0.45862069725990295, + "step": 30535 + }, + { + "epoch": 0.030760212360287295, + "grad_norm": 19.749330170349932, + "learning_rate": 3.0759623713313054e-05, + "loss": 2.7484, + "mean_token_accuracy": 0.36896551847457887, + "step": 30540 + }, + { + "epoch": 0.03076524841339147, + "grad_norm": 17.715912903773642, + "learning_rate": 3.0764659670044113e-05, + "loss": 2.925, + "mean_token_accuracy": 0.36551723480224607, + "step": 30545 + }, + { + "epoch": 0.030770284466495643, + "grad_norm": 22.346242077426773, + "learning_rate": 3.076969562677517e-05, + "loss": 2.5009, + "mean_token_accuracy": 0.40490018129348754, + "step": 30550 + }, + { + "epoch": 0.030775320519599816, + "grad_norm": 14.61701843403556, + "learning_rate": 3.077473158350623e-05, + "loss": 2.2661, + "mean_token_accuracy": 0.46896551847457885, + "step": 30555 + }, + { + "epoch": 0.030780356572703987, + "grad_norm": 20.428436460178094, + "learning_rate": 3.07797675402373e-05, + "loss": 2.7157, + "mean_token_accuracy": 0.3999999940395355, + "step": 30560 + }, + { + "epoch": 0.03078539262580816, + "grad_norm": 13.330522254559284, + "learning_rate": 3.078480349696836e-05, + "loss": 2.601, + "mean_token_accuracy": 0.4034482717514038, + "step": 30565 + }, + { + "epoch": 0.030790428678912334, + "grad_norm": 15.67972487976431, + "learning_rate": 3.0789839453699417e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.4, + "step": 30570 + }, + { + "epoch": 0.030795464732016505, + "grad_norm": 17.251556472966243, + "learning_rate": 3.0794875410430476e-05, + "loss": 2.8083, + "mean_token_accuracy": 0.40689654350280763, + "step": 30575 + }, + { + "epoch": 0.03080050078512068, + "grad_norm": 16.10684482953834, + "learning_rate": 3.0799911367161535e-05, + "loss": 2.4988, + "mean_token_accuracy": 0.38620689511299133, + "step": 30580 + }, + { + "epoch": 0.030805536838224852, + "grad_norm": 13.562900055103084, + "learning_rate": 3.0804947323892594e-05, + "loss": 2.3046, + "mean_token_accuracy": 0.4517241358757019, + "step": 30585 + }, + { + "epoch": 0.030810572891329026, + "grad_norm": 20.721104845626176, + "learning_rate": 3.0809983280623654e-05, + "loss": 2.7369, + "mean_token_accuracy": 0.401935875415802, + "step": 30590 + }, + { + "epoch": 0.030815608944433196, + "grad_norm": 17.996233499089797, + "learning_rate": 3.081501923735471e-05, + "loss": 2.9257, + "mean_token_accuracy": 0.3703569293022156, + "step": 30595 + }, + { + "epoch": 0.03082064499753737, + "grad_norm": 16.630923579277347, + "learning_rate": 3.082005519408577e-05, + "loss": 2.5564, + "mean_token_accuracy": 0.4344827651977539, + "step": 30600 + }, + { + "epoch": 0.030825681050641544, + "grad_norm": 19.03321234938062, + "learning_rate": 3.082509115081683e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.36551723480224607, + "step": 30605 + }, + { + "epoch": 0.030830717103745714, + "grad_norm": 21.08716015102735, + "learning_rate": 3.08301271075479e-05, + "loss": 2.6313, + "mean_token_accuracy": 0.37241379618644715, + "step": 30610 + }, + { + "epoch": 0.030835753156849888, + "grad_norm": 15.28496689810781, + "learning_rate": 3.0835163064278957e-05, + "loss": 2.4446, + "mean_token_accuracy": 0.42413793206214906, + "step": 30615 + }, + { + "epoch": 0.03084078920995406, + "grad_norm": 15.761055604278027, + "learning_rate": 3.0840199021010016e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.4068965494632721, + "step": 30620 + }, + { + "epoch": 0.030845825263058235, + "grad_norm": 15.616708975901105, + "learning_rate": 3.0845234977741075e-05, + "loss": 2.5107, + "mean_token_accuracy": 0.4, + "step": 30625 + }, + { + "epoch": 0.030850861316162406, + "grad_norm": 16.55868516000282, + "learning_rate": 3.085027093447213e-05, + "loss": 2.2814, + "mean_token_accuracy": 0.4344827651977539, + "step": 30630 + }, + { + "epoch": 0.03085589736926658, + "grad_norm": 14.52843696094866, + "learning_rate": 3.085530689120319e-05, + "loss": 2.3017, + "mean_token_accuracy": 0.4482758641242981, + "step": 30635 + }, + { + "epoch": 0.030860933422370753, + "grad_norm": 16.934633652256025, + "learning_rate": 3.086034284793425e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.44482759237289426, + "step": 30640 + }, + { + "epoch": 0.030865969475474923, + "grad_norm": 22.466024002782913, + "learning_rate": 3.086537880466531e-05, + "loss": 2.6612, + "mean_token_accuracy": 0.38965516686439516, + "step": 30645 + }, + { + "epoch": 0.030871005528579097, + "grad_norm": 17.8474206771584, + "learning_rate": 3.087041476139637e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.4413793087005615, + "step": 30650 + }, + { + "epoch": 0.03087604158168327, + "grad_norm": 15.716022966809552, + "learning_rate": 3.087545071812743e-05, + "loss": 2.6973, + "mean_token_accuracy": 0.4068965554237366, + "step": 30655 + }, + { + "epoch": 0.030881077634787445, + "grad_norm": 14.92452062710475, + "learning_rate": 3.088048667485849e-05, + "loss": 2.0822, + "mean_token_accuracy": 0.506896561384201, + "step": 30660 + }, + { + "epoch": 0.030886113687891615, + "grad_norm": 15.40357572251711, + "learning_rate": 3.0885522631589556e-05, + "loss": 2.3737, + "mean_token_accuracy": 0.4068965554237366, + "step": 30665 + }, + { + "epoch": 0.03089114974099579, + "grad_norm": 19.24497440397035, + "learning_rate": 3.089055858832061e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.3655172407627106, + "step": 30670 + }, + { + "epoch": 0.030896185794099962, + "grad_norm": 12.67523779854411, + "learning_rate": 3.089559454505167e-05, + "loss": 2.5733, + "mean_token_accuracy": 0.3965517282485962, + "step": 30675 + }, + { + "epoch": 0.030901221847204133, + "grad_norm": 16.52186462888829, + "learning_rate": 3.090063050178273e-05, + "loss": 2.4245, + "mean_token_accuracy": 0.3862069010734558, + "step": 30680 + }, + { + "epoch": 0.030906257900308307, + "grad_norm": 15.10428144502331, + "learning_rate": 3.0905666458513786e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.42413793206214906, + "step": 30685 + }, + { + "epoch": 0.03091129395341248, + "grad_norm": 14.810758725723025, + "learning_rate": 3.091070241524485e-05, + "loss": 2.1321, + "mean_token_accuracy": 0.4689655303955078, + "step": 30690 + }, + { + "epoch": 0.030916330006516654, + "grad_norm": 26.21326831613668, + "learning_rate": 3.091573837197591e-05, + "loss": 2.4742, + "mean_token_accuracy": 0.4551724135875702, + "step": 30695 + }, + { + "epoch": 0.030921366059620824, + "grad_norm": 15.819044115318434, + "learning_rate": 3.092077432870697e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.4034482717514038, + "step": 30700 + }, + { + "epoch": 0.030926402112724998, + "grad_norm": 15.909486073072909, + "learning_rate": 3.092581028543803e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.37053840756416323, + "step": 30705 + }, + { + "epoch": 0.030931438165829172, + "grad_norm": 15.93527555515284, + "learning_rate": 3.093084624216909e-05, + "loss": 2.9305, + "mean_token_accuracy": 0.39122807383537295, + "step": 30710 + }, + { + "epoch": 0.030936474218933342, + "grad_norm": 15.497636299812536, + "learning_rate": 3.093588219890015e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.4068965494632721, + "step": 30715 + }, + { + "epoch": 0.030941510272037516, + "grad_norm": 23.425999012698764, + "learning_rate": 3.094091815563121e-05, + "loss": 2.5795, + "mean_token_accuracy": 0.3862069010734558, + "step": 30720 + }, + { + "epoch": 0.03094654632514169, + "grad_norm": 13.218972220664844, + "learning_rate": 3.094595411236227e-05, + "loss": 2.4076, + "mean_token_accuracy": 0.4137930989265442, + "step": 30725 + }, + { + "epoch": 0.030951582378245864, + "grad_norm": 15.712632097284391, + "learning_rate": 3.0950990069093326e-05, + "loss": 2.3937, + "mean_token_accuracy": 0.4034482717514038, + "step": 30730 + }, + { + "epoch": 0.030956618431350034, + "grad_norm": 14.998592414504799, + "learning_rate": 3.0956026025824385e-05, + "loss": 2.7747, + "mean_token_accuracy": 0.4206896543502808, + "step": 30735 + }, + { + "epoch": 0.030961654484454208, + "grad_norm": 15.694805499787284, + "learning_rate": 3.0961061982555445e-05, + "loss": 2.2225, + "mean_token_accuracy": 0.45728976726531984, + "step": 30740 + }, + { + "epoch": 0.03096669053755838, + "grad_norm": 21.20947911959607, + "learning_rate": 3.096609793928651e-05, + "loss": 2.7668, + "mean_token_accuracy": 0.3620689630508423, + "step": 30745 + }, + { + "epoch": 0.03097172659066255, + "grad_norm": 15.294432570524169, + "learning_rate": 3.097113389601757e-05, + "loss": 2.216, + "mean_token_accuracy": 0.4689655125141144, + "step": 30750 + }, + { + "epoch": 0.030976762643766725, + "grad_norm": 19.416203681667856, + "learning_rate": 3.097616985274863e-05, + "loss": 2.7584, + "mean_token_accuracy": 0.37241379022598264, + "step": 30755 + }, + { + "epoch": 0.0309817986968709, + "grad_norm": 21.548508300542327, + "learning_rate": 3.098120580947969e-05, + "loss": 2.6926, + "mean_token_accuracy": 0.38275861740112305, + "step": 30760 + }, + { + "epoch": 0.030986834749975073, + "grad_norm": 19.669080029901618, + "learning_rate": 3.098624176621074e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.358620685338974, + "step": 30765 + }, + { + "epoch": 0.030991870803079243, + "grad_norm": 20.706613489710804, + "learning_rate": 3.099127772294181e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.44301270246505736, + "step": 30770 + }, + { + "epoch": 0.030996906856183417, + "grad_norm": 20.85591533429189, + "learning_rate": 3.0996313679672866e-05, + "loss": 2.8085, + "mean_token_accuracy": 0.37931033968925476, + "step": 30775 + }, + { + "epoch": 0.03100194290928759, + "grad_norm": 21.240582132057945, + "learning_rate": 3.1001349636403925e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.4482758641242981, + "step": 30780 + }, + { + "epoch": 0.03100697896239176, + "grad_norm": 15.994801369832867, + "learning_rate": 3.1006385593134985e-05, + "loss": 2.4593, + "mean_token_accuracy": 0.4551724076271057, + "step": 30785 + }, + { + "epoch": 0.031012015015495935, + "grad_norm": 18.667222463450234, + "learning_rate": 3.1011421549866044e-05, + "loss": 2.2907, + "mean_token_accuracy": 0.4294615924358368, + "step": 30790 + }, + { + "epoch": 0.03101705106860011, + "grad_norm": 16.324995406302243, + "learning_rate": 3.101645750659711e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.43284936547279357, + "step": 30795 + }, + { + "epoch": 0.031022087121704282, + "grad_norm": 15.471705160922829, + "learning_rate": 3.102149346332817e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4517241299152374, + "step": 30800 + }, + { + "epoch": 0.031027123174808453, + "grad_norm": 17.685871338778593, + "learning_rate": 3.102652942005922e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.4068965494632721, + "step": 30805 + }, + { + "epoch": 0.031032159227912626, + "grad_norm": 15.22067551990615, + "learning_rate": 3.103156537679028e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.41724138259887694, + "step": 30810 + }, + { + "epoch": 0.0310371952810168, + "grad_norm": 19.65213364514607, + "learning_rate": 3.103660133352134e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.4724137902259827, + "step": 30815 + }, + { + "epoch": 0.03104223133412097, + "grad_norm": 18.252813093205624, + "learning_rate": 3.10416372902524e-05, + "loss": 2.4949, + "mean_token_accuracy": 0.42758620977401735, + "step": 30820 + }, + { + "epoch": 0.031047267387225144, + "grad_norm": 15.127077616590098, + "learning_rate": 3.1046673246983466e-05, + "loss": 2.4722, + "mean_token_accuracy": 0.4, + "step": 30825 + }, + { + "epoch": 0.031052303440329318, + "grad_norm": 15.269135106129957, + "learning_rate": 3.1051709203714525e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.3793103516101837, + "step": 30830 + }, + { + "epoch": 0.031057339493433492, + "grad_norm": 16.326919423890555, + "learning_rate": 3.1056745160445584e-05, + "loss": 2.883, + "mean_token_accuracy": 0.3876587986946106, + "step": 30835 + }, + { + "epoch": 0.031062375546537662, + "grad_norm": 16.85191112486692, + "learning_rate": 3.106178111717664e-05, + "loss": 2.8691, + "mean_token_accuracy": 0.33793103992938994, + "step": 30840 + }, + { + "epoch": 0.031067411599641836, + "grad_norm": 14.433164269210906, + "learning_rate": 3.10668170739077e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4310344815254211, + "step": 30845 + }, + { + "epoch": 0.03107244765274601, + "grad_norm": 17.38308372483792, + "learning_rate": 3.107185303063876e-05, + "loss": 2.6591, + "mean_token_accuracy": 0.4172413766384125, + "step": 30850 + }, + { + "epoch": 0.03107748370585018, + "grad_norm": 16.79876793440166, + "learning_rate": 3.107688898736982e-05, + "loss": 2.664, + "mean_token_accuracy": 0.4172413766384125, + "step": 30855 + }, + { + "epoch": 0.031082519758954354, + "grad_norm": 14.014276177627822, + "learning_rate": 3.108192494410088e-05, + "loss": 2.388, + "mean_token_accuracy": 0.44593595862388613, + "step": 30860 + }, + { + "epoch": 0.031087555812058527, + "grad_norm": 16.89202230198056, + "learning_rate": 3.108696090083194e-05, + "loss": 2.6615, + "mean_token_accuracy": 0.3862069070339203, + "step": 30865 + }, + { + "epoch": 0.0310925918651627, + "grad_norm": 19.650701121889007, + "learning_rate": 3.1091996857563e-05, + "loss": 2.8034, + "mean_token_accuracy": 0.36896551847457887, + "step": 30870 + }, + { + "epoch": 0.03109762791826687, + "grad_norm": 11.19242580243403, + "learning_rate": 3.1097032814294065e-05, + "loss": 2.3659, + "mean_token_accuracy": 0.4344827592372894, + "step": 30875 + }, + { + "epoch": 0.031102663971371045, + "grad_norm": 14.241357551160785, + "learning_rate": 3.1102068771025124e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.39310344457626345, + "step": 30880 + }, + { + "epoch": 0.03110770002447522, + "grad_norm": 13.88578435636804, + "learning_rate": 3.110710472775618e-05, + "loss": 2.3374, + "mean_token_accuracy": 0.4448275864124298, + "step": 30885 + }, + { + "epoch": 0.03111273607757939, + "grad_norm": 17.2863321973129, + "learning_rate": 3.111214068448724e-05, + "loss": 2.5481, + "mean_token_accuracy": 0.42413793206214906, + "step": 30890 + }, + { + "epoch": 0.031117772130683563, + "grad_norm": 15.754254942066275, + "learning_rate": 3.11171766412183e-05, + "loss": 2.7277, + "mean_token_accuracy": 0.3620689630508423, + "step": 30895 + }, + { + "epoch": 0.031122808183787737, + "grad_norm": 45.51791045772726, + "learning_rate": 3.1122212597949354e-05, + "loss": 2.8824, + "mean_token_accuracy": 0.3517241388559341, + "step": 30900 + }, + { + "epoch": 0.03112784423689191, + "grad_norm": 17.994659165999792, + "learning_rate": 3.112724855468042e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.42758620381355283, + "step": 30905 + }, + { + "epoch": 0.03113288028999608, + "grad_norm": 14.807042651527443, + "learning_rate": 3.113228451141148e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.36551723480224607, + "step": 30910 + }, + { + "epoch": 0.031137916343100255, + "grad_norm": 22.77945547112795, + "learning_rate": 3.113732046814254e-05, + "loss": 2.7268, + "mean_token_accuracy": 0.4, + "step": 30915 + }, + { + "epoch": 0.03114295239620443, + "grad_norm": 22.04762658525619, + "learning_rate": 3.11423564248736e-05, + "loss": 2.5514, + "mean_token_accuracy": 0.4103448301553726, + "step": 30920 + }, + { + "epoch": 0.0311479884493086, + "grad_norm": 16.988566393812373, + "learning_rate": 3.114739238160466e-05, + "loss": 2.6242, + "mean_token_accuracy": 0.4123412013053894, + "step": 30925 + }, + { + "epoch": 0.031153024502412772, + "grad_norm": 16.71988532837217, + "learning_rate": 3.1152428338335723e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.4517241358757019, + "step": 30930 + }, + { + "epoch": 0.031158060555516946, + "grad_norm": 12.978140756963043, + "learning_rate": 3.115746429506678e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.42413793206214906, + "step": 30935 + }, + { + "epoch": 0.03116309660862112, + "grad_norm": 14.899213536918277, + "learning_rate": 3.1162500251797835e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.42758620381355283, + "step": 30940 + }, + { + "epoch": 0.03116813266172529, + "grad_norm": 19.800373930749814, + "learning_rate": 3.1167536208528894e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.42413793206214906, + "step": 30945 + }, + { + "epoch": 0.031173168714829464, + "grad_norm": 16.298288805042787, + "learning_rate": 3.1172572165259954e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.38965516686439516, + "step": 30950 + }, + { + "epoch": 0.031178204767933638, + "grad_norm": 16.993684525467266, + "learning_rate": 3.117760812199102e-05, + "loss": 2.3674, + "mean_token_accuracy": 0.43448275327682495, + "step": 30955 + }, + { + "epoch": 0.031183240821037808, + "grad_norm": 15.295544513170974, + "learning_rate": 3.118264407872208e-05, + "loss": 2.7592, + "mean_token_accuracy": 0.39655172228813174, + "step": 30960 + }, + { + "epoch": 0.031188276874141982, + "grad_norm": 15.873606489573843, + "learning_rate": 3.118768003545314e-05, + "loss": 3.0178, + "mean_token_accuracy": 0.4034482717514038, + "step": 30965 + }, + { + "epoch": 0.031193312927246156, + "grad_norm": 15.663324762626273, + "learning_rate": 3.11927159921842e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.4034482717514038, + "step": 30970 + }, + { + "epoch": 0.03119834898035033, + "grad_norm": 17.476814496946023, + "learning_rate": 3.119775194891526e-05, + "loss": 2.5677, + "mean_token_accuracy": 0.4034482777118683, + "step": 30975 + }, + { + "epoch": 0.0312033850334545, + "grad_norm": 18.808492746578334, + "learning_rate": 3.1202787905646316e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.38620689511299133, + "step": 30980 + }, + { + "epoch": 0.031208421086558674, + "grad_norm": 39.588672677885555, + "learning_rate": 3.1207823862377375e-05, + "loss": 2.6563, + "mean_token_accuracy": 0.4172413796186447, + "step": 30985 + }, + { + "epoch": 0.031213457139662847, + "grad_norm": 14.129543206075445, + "learning_rate": 3.1212859819108434e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.40344826579093934, + "step": 30990 + }, + { + "epoch": 0.031218493192767018, + "grad_norm": 15.76946471854939, + "learning_rate": 3.1217895775839494e-05, + "loss": 2.2457, + "mean_token_accuracy": 0.39655172228813174, + "step": 30995 + }, + { + "epoch": 0.03122352924587119, + "grad_norm": 15.306633393329385, + "learning_rate": 3.122293173257055e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.43103447556495667, + "step": 31000 + }, + { + "epoch": 0.031228565298975365, + "grad_norm": 16.836498400929468, + "learning_rate": 3.122796768930161e-05, + "loss": 2.716, + "mean_token_accuracy": 0.36896551847457887, + "step": 31005 + }, + { + "epoch": 0.03123360135207954, + "grad_norm": 20.554276331077507, + "learning_rate": 3.123300364603268e-05, + "loss": 2.9228, + "mean_token_accuracy": 0.317241370677948, + "step": 31010 + }, + { + "epoch": 0.03123863740518371, + "grad_norm": 18.308975362752747, + "learning_rate": 3.123803960276374e-05, + "loss": 2.4142, + "mean_token_accuracy": 0.43103447556495667, + "step": 31015 + }, + { + "epoch": 0.031243673458287883, + "grad_norm": 20.393708621662288, + "learning_rate": 3.12430755594948e-05, + "loss": 2.6508, + "mean_token_accuracy": 0.3793103486299515, + "step": 31020 + }, + { + "epoch": 0.031248709511392057, + "grad_norm": 18.394551536127647, + "learning_rate": 3.1248111516225856e-05, + "loss": 2.0444, + "mean_token_accuracy": 0.5206896603107453, + "step": 31025 + }, + { + "epoch": 0.03125374556449623, + "grad_norm": 14.2412215479628, + "learning_rate": 3.125314747295691e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.458620685338974, + "step": 31030 + }, + { + "epoch": 0.0312587816176004, + "grad_norm": 33.722193451709956, + "learning_rate": 3.1258183429687974e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.3620689630508423, + "step": 31035 + }, + { + "epoch": 0.031263817670704575, + "grad_norm": 19.909251842160195, + "learning_rate": 3.1263219386419034e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4517241299152374, + "step": 31040 + }, + { + "epoch": 0.03126885372380875, + "grad_norm": 16.9322599502945, + "learning_rate": 3.126825534315009e-05, + "loss": 2.7081, + "mean_token_accuracy": 0.4448275864124298, + "step": 31045 + }, + { + "epoch": 0.03127388977691292, + "grad_norm": 16.894011442097224, + "learning_rate": 3.127329129988115e-05, + "loss": 2.614, + "mean_token_accuracy": 0.4068965494632721, + "step": 31050 + }, + { + "epoch": 0.031278925830017096, + "grad_norm": 18.85198696811844, + "learning_rate": 3.127832725661221e-05, + "loss": 2.5432, + "mean_token_accuracy": 0.43793103098869324, + "step": 31055 + }, + { + "epoch": 0.03128396188312126, + "grad_norm": 15.34474144552616, + "learning_rate": 3.128336321334327e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.43103447556495667, + "step": 31060 + }, + { + "epoch": 0.031288997936225436, + "grad_norm": 14.720609481918503, + "learning_rate": 3.128839917007434e-05, + "loss": 2.5241, + "mean_token_accuracy": 0.4034482717514038, + "step": 31065 + }, + { + "epoch": 0.03129403398932961, + "grad_norm": 15.367324807748583, + "learning_rate": 3.1293435126805396e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.38275861740112305, + "step": 31070 + }, + { + "epoch": 0.031299070042433784, + "grad_norm": 14.322543384691949, + "learning_rate": 3.129847108353645e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.37241379618644715, + "step": 31075 + }, + { + "epoch": 0.03130410609553796, + "grad_norm": 15.767998939054454, + "learning_rate": 3.130350704026751e-05, + "loss": 2.5103, + "mean_token_accuracy": 0.42758620381355283, + "step": 31080 + }, + { + "epoch": 0.03130914214864213, + "grad_norm": 15.160467813036657, + "learning_rate": 3.130854299699857e-05, + "loss": 2.499, + "mean_token_accuracy": 0.4310344934463501, + "step": 31085 + }, + { + "epoch": 0.031314178201746305, + "grad_norm": 21.80624404172611, + "learning_rate": 3.131357895372963e-05, + "loss": 2.5723, + "mean_token_accuracy": 0.3827586233615875, + "step": 31090 + }, + { + "epoch": 0.03131921425485047, + "grad_norm": 16.066686064186534, + "learning_rate": 3.131861491046069e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.47586206793785096, + "step": 31095 + }, + { + "epoch": 0.031324250307954646, + "grad_norm": 19.242333632357337, + "learning_rate": 3.132365086719175e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.38965516686439516, + "step": 31100 + }, + { + "epoch": 0.03132928636105882, + "grad_norm": 16.70900888829499, + "learning_rate": 3.132868682392281e-05, + "loss": 2.7564, + "mean_token_accuracy": 0.36206896901130675, + "step": 31105 + }, + { + "epoch": 0.03133432241416299, + "grad_norm": 16.046108305789808, + "learning_rate": 3.133372278065387e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.4068965494632721, + "step": 31110 + }, + { + "epoch": 0.03133935846726717, + "grad_norm": 14.866702335624908, + "learning_rate": 3.133875873738493e-05, + "loss": 2.5386, + "mean_token_accuracy": 0.4118572294712067, + "step": 31115 + }, + { + "epoch": 0.03134439452037134, + "grad_norm": 14.580116086686637, + "learning_rate": 3.134379469411599e-05, + "loss": 2.301, + "mean_token_accuracy": 0.4413793087005615, + "step": 31120 + }, + { + "epoch": 0.031349430573475515, + "grad_norm": 20.774442837483413, + "learning_rate": 3.134883065084705e-05, + "loss": 3.0282, + "mean_token_accuracy": 0.3482758581638336, + "step": 31125 + }, + { + "epoch": 0.03135446662657968, + "grad_norm": 21.004998362000944, + "learning_rate": 3.135386660757811e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.42413793206214906, + "step": 31130 + }, + { + "epoch": 0.031359502679683855, + "grad_norm": 17.592016799611127, + "learning_rate": 3.1358902564309166e-05, + "loss": 2.4704, + "mean_token_accuracy": 0.3827586114406586, + "step": 31135 + }, + { + "epoch": 0.03136453873278803, + "grad_norm": 15.960928630974283, + "learning_rate": 3.136393852104023e-05, + "loss": 2.3782, + "mean_token_accuracy": 0.4570477962493896, + "step": 31140 + }, + { + "epoch": 0.0313695747858922, + "grad_norm": 29.694737979869522, + "learning_rate": 3.136897447777129e-05, + "loss": 2.3423, + "mean_token_accuracy": 0.44827587008476255, + "step": 31145 + }, + { + "epoch": 0.03137461083899638, + "grad_norm": 14.352684240678432, + "learning_rate": 3.137401043450235e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.4344827592372894, + "step": 31150 + }, + { + "epoch": 0.03137964689210055, + "grad_norm": 16.571319270966846, + "learning_rate": 3.137904639123341e-05, + "loss": 3.2064, + "mean_token_accuracy": 0.3448275804519653, + "step": 31155 + }, + { + "epoch": 0.03138468294520472, + "grad_norm": 15.92645559737672, + "learning_rate": 3.138408234796447e-05, + "loss": 2.7204, + "mean_token_accuracy": 0.39655173420906065, + "step": 31160 + }, + { + "epoch": 0.03138971899830889, + "grad_norm": 12.657540460225267, + "learning_rate": 3.138911830469552e-05, + "loss": 2.4906, + "mean_token_accuracy": 0.39655172228813174, + "step": 31165 + }, + { + "epoch": 0.031394755051413065, + "grad_norm": 19.18366152158412, + "learning_rate": 3.139415426142659e-05, + "loss": 2.4713, + "mean_token_accuracy": 0.40496068000793456, + "step": 31170 + }, + { + "epoch": 0.03139979110451724, + "grad_norm": 15.155839399607364, + "learning_rate": 3.139919021815765e-05, + "loss": 2.7806, + "mean_token_accuracy": 0.3689655065536499, + "step": 31175 + }, + { + "epoch": 0.03140482715762141, + "grad_norm": 14.776247811640282, + "learning_rate": 3.1404226174888706e-05, + "loss": 2.6423, + "mean_token_accuracy": 0.3965517163276672, + "step": 31180 + }, + { + "epoch": 0.031409863210725586, + "grad_norm": 16.007125162849835, + "learning_rate": 3.1409262131619766e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.42413793206214906, + "step": 31185 + }, + { + "epoch": 0.03141489926382976, + "grad_norm": 16.847697647164324, + "learning_rate": 3.1414298088350825e-05, + "loss": 2.6709, + "mean_token_accuracy": 0.3896551728248596, + "step": 31190 + }, + { + "epoch": 0.03141993531693393, + "grad_norm": 16.133831648128208, + "learning_rate": 3.141933404508189e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.4034482717514038, + "step": 31195 + }, + { + "epoch": 0.0314249713700381, + "grad_norm": 14.727566765328183, + "learning_rate": 3.142437000181295e-05, + "loss": 2.9068, + "mean_token_accuracy": 0.3551724195480347, + "step": 31200 + }, + { + "epoch": 0.031430007423142274, + "grad_norm": 19.345363979708925, + "learning_rate": 3.1429405958544e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.4333938241004944, + "step": 31205 + }, + { + "epoch": 0.03143504347624645, + "grad_norm": 12.921972872845366, + "learning_rate": 3.143444191527506e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.44482759237289426, + "step": 31210 + }, + { + "epoch": 0.03144007952935062, + "grad_norm": 15.707671440103574, + "learning_rate": 3.143947787200612e-05, + "loss": 2.9799, + "mean_token_accuracy": 0.3674531102180481, + "step": 31215 + }, + { + "epoch": 0.031445115582454795, + "grad_norm": 14.18825812575258, + "learning_rate": 3.144451382873719e-05, + "loss": 2.8726, + "mean_token_accuracy": 0.417241370677948, + "step": 31220 + }, + { + "epoch": 0.03145015163555897, + "grad_norm": 15.098022736601756, + "learning_rate": 3.1449549785468246e-05, + "loss": 2.3315, + "mean_token_accuracy": 0.4068965494632721, + "step": 31225 + }, + { + "epoch": 0.031455187688663136, + "grad_norm": 16.275008678071586, + "learning_rate": 3.1454585742199306e-05, + "loss": 2.496, + "mean_token_accuracy": 0.4137930989265442, + "step": 31230 + }, + { + "epoch": 0.03146022374176731, + "grad_norm": 16.37831633858391, + "learning_rate": 3.1459621698930365e-05, + "loss": 2.7835, + "mean_token_accuracy": 0.38620689511299133, + "step": 31235 + }, + { + "epoch": 0.031465259794871484, + "grad_norm": 14.443258137319788, + "learning_rate": 3.1464657655661424e-05, + "loss": 2.2921, + "mean_token_accuracy": 0.4379310369491577, + "step": 31240 + }, + { + "epoch": 0.03147029584797566, + "grad_norm": 14.057277904370002, + "learning_rate": 3.1469693612392483e-05, + "loss": 2.4279, + "mean_token_accuracy": 0.4034482717514038, + "step": 31245 + }, + { + "epoch": 0.03147533190107983, + "grad_norm": 18.51838431656866, + "learning_rate": 3.147472956912354e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.38620689511299133, + "step": 31250 + }, + { + "epoch": 0.031480367954184005, + "grad_norm": 18.216705430004907, + "learning_rate": 3.14797655258546e-05, + "loss": 2.338, + "mean_token_accuracy": 0.4, + "step": 31255 + }, + { + "epoch": 0.03148540400728818, + "grad_norm": 15.968213553105224, + "learning_rate": 3.148480148258566e-05, + "loss": 2.5599, + "mean_token_accuracy": 0.42413792610168455, + "step": 31260 + }, + { + "epoch": 0.031490440060392345, + "grad_norm": 17.942252847867046, + "learning_rate": 3.148983743931672e-05, + "loss": 2.5514, + "mean_token_accuracy": 0.41209922432899476, + "step": 31265 + }, + { + "epoch": 0.03149547611349652, + "grad_norm": 16.879011174205086, + "learning_rate": 3.149487339604778e-05, + "loss": 2.1798, + "mean_token_accuracy": 0.4392014563083649, + "step": 31270 + }, + { + "epoch": 0.03150051216660069, + "grad_norm": 15.90482147184509, + "learning_rate": 3.1499909352778846e-05, + "loss": 2.7663, + "mean_token_accuracy": 0.3827586233615875, + "step": 31275 + }, + { + "epoch": 0.03150554821970487, + "grad_norm": 17.7462794780518, + "learning_rate": 3.1504945309509905e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.46569873094558717, + "step": 31280 + }, + { + "epoch": 0.03151058427280904, + "grad_norm": 14.320367617752172, + "learning_rate": 3.1509981266240964e-05, + "loss": 2.521, + "mean_token_accuracy": 0.4068965494632721, + "step": 31285 + }, + { + "epoch": 0.031515620325913214, + "grad_norm": 18.396955654277825, + "learning_rate": 3.1515017222972023e-05, + "loss": 2.8082, + "mean_token_accuracy": 0.4, + "step": 31290 + }, + { + "epoch": 0.03152065637901739, + "grad_norm": 21.158830596439245, + "learning_rate": 3.152005317970308e-05, + "loss": 2.4941, + "mean_token_accuracy": 0.38620689511299133, + "step": 31295 + }, + { + "epoch": 0.031525692432121555, + "grad_norm": 19.77063815032728, + "learning_rate": 3.152508913643414e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.38620689809322356, + "step": 31300 + }, + { + "epoch": 0.03153072848522573, + "grad_norm": 12.14615999767482, + "learning_rate": 3.15301250931652e-05, + "loss": 2.1237, + "mean_token_accuracy": 0.4913793087005615, + "step": 31305 + }, + { + "epoch": 0.0315357645383299, + "grad_norm": 11.554243415446317, + "learning_rate": 3.153516104989626e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.3931034475564957, + "step": 31310 + }, + { + "epoch": 0.031540800591434076, + "grad_norm": 14.158067402151664, + "learning_rate": 3.154019700662732e-05, + "loss": 2.7279, + "mean_token_accuracy": 0.4103448331356049, + "step": 31315 + }, + { + "epoch": 0.03154583664453825, + "grad_norm": 14.262203914962535, + "learning_rate": 3.154523296335838e-05, + "loss": 2.2979, + "mean_token_accuracy": 0.39310344457626345, + "step": 31320 + }, + { + "epoch": 0.031550872697642424, + "grad_norm": 17.321576642526523, + "learning_rate": 3.155026892008944e-05, + "loss": 2.6142, + "mean_token_accuracy": 0.3896551728248596, + "step": 31325 + }, + { + "epoch": 0.0315559087507466, + "grad_norm": 15.306670965275304, + "learning_rate": 3.1555304876820504e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.387477308511734, + "step": 31330 + }, + { + "epoch": 0.031560944803850764, + "grad_norm": 15.413337737206035, + "learning_rate": 3.1560340833551564e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.43647913336753846, + "step": 31335 + }, + { + "epoch": 0.03156598085695494, + "grad_norm": 22.273249633799253, + "learning_rate": 3.1565376790282616e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.3931034475564957, + "step": 31340 + }, + { + "epoch": 0.03157101691005911, + "grad_norm": 17.494267500914628, + "learning_rate": 3.1570412747013675e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.40344826579093934, + "step": 31345 + }, + { + "epoch": 0.031576052963163286, + "grad_norm": 14.95898898528262, + "learning_rate": 3.1575448703744735e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.39655172228813174, + "step": 31350 + }, + { + "epoch": 0.03158108901626746, + "grad_norm": 14.731848845532488, + "learning_rate": 3.15804846604758e-05, + "loss": 2.6586, + "mean_token_accuracy": 0.3931034475564957, + "step": 31355 + }, + { + "epoch": 0.03158612506937163, + "grad_norm": 14.863021990681805, + "learning_rate": 3.158552061720686e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.35862069129943847, + "step": 31360 + }, + { + "epoch": 0.03159116112247581, + "grad_norm": 26.053148183950647, + "learning_rate": 3.159055657393792e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.4034482777118683, + "step": 31365 + }, + { + "epoch": 0.031596197175579974, + "grad_norm": 14.160352987372496, + "learning_rate": 3.159559253066898e-05, + "loss": 2.9098, + "mean_token_accuracy": 0.32758620083332063, + "step": 31370 + }, + { + "epoch": 0.03160123322868415, + "grad_norm": 17.335591222852372, + "learning_rate": 3.160062848740004e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.4413793087005615, + "step": 31375 + }, + { + "epoch": 0.03160626928178832, + "grad_norm": 16.9423694416526, + "learning_rate": 3.16056644441311e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.3793103456497192, + "step": 31380 + }, + { + "epoch": 0.031611305334892495, + "grad_norm": 13.019845002611305, + "learning_rate": 3.1610700400862156e-05, + "loss": 2.6357, + "mean_token_accuracy": 0.3482758581638336, + "step": 31385 + }, + { + "epoch": 0.03161634138799667, + "grad_norm": 16.793091355606048, + "learning_rate": 3.1615736357593215e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.4172413766384125, + "step": 31390 + }, + { + "epoch": 0.03162137744110084, + "grad_norm": 13.42637592314996, + "learning_rate": 3.1620772314324275e-05, + "loss": 2.3526, + "mean_token_accuracy": 0.42413793206214906, + "step": 31395 + }, + { + "epoch": 0.031626413494205016, + "grad_norm": 15.418392819195669, + "learning_rate": 3.1625808271055334e-05, + "loss": 2.499, + "mean_token_accuracy": 0.441379314661026, + "step": 31400 + }, + { + "epoch": 0.03163144954730918, + "grad_norm": 13.323560131499793, + "learning_rate": 3.163084422778639e-05, + "loss": 2.2009, + "mean_token_accuracy": 0.4379310369491577, + "step": 31405 + }, + { + "epoch": 0.03163648560041336, + "grad_norm": 17.575433368499517, + "learning_rate": 3.163588018451746e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.38965516090393065, + "step": 31410 + }, + { + "epoch": 0.03164152165351753, + "grad_norm": 17.747357953292266, + "learning_rate": 3.164091614124852e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.42068964838981626, + "step": 31415 + }, + { + "epoch": 0.031646557706621704, + "grad_norm": 15.643955787364895, + "learning_rate": 3.164595209797958e-05, + "loss": 2.5645, + "mean_token_accuracy": 0.4172413766384125, + "step": 31420 + }, + { + "epoch": 0.03165159375972588, + "grad_norm": 11.410841995251184, + "learning_rate": 3.165098805471064e-05, + "loss": 1.9701, + "mean_token_accuracy": 0.5379310250282288, + "step": 31425 + }, + { + "epoch": 0.03165662981283005, + "grad_norm": 15.15772048623036, + "learning_rate": 3.1656024011441696e-05, + "loss": 2.3888, + "mean_token_accuracy": 0.435632187128067, + "step": 31430 + }, + { + "epoch": 0.031661665865934226, + "grad_norm": 15.717699656471156, + "learning_rate": 3.1661059968172755e-05, + "loss": 2.2099, + "mean_token_accuracy": 0.44827585816383364, + "step": 31435 + }, + { + "epoch": 0.03166670191903839, + "grad_norm": 17.289716454953915, + "learning_rate": 3.1666095924903815e-05, + "loss": 2.6555, + "mean_token_accuracy": 0.4049606680870056, + "step": 31440 + }, + { + "epoch": 0.031671737972142566, + "grad_norm": 18.346335970045768, + "learning_rate": 3.1671131881634874e-05, + "loss": 2.9139, + "mean_token_accuracy": 0.35862069129943847, + "step": 31445 + }, + { + "epoch": 0.03167677402524674, + "grad_norm": 12.666470571953404, + "learning_rate": 3.167616783836593e-05, + "loss": 2.4022, + "mean_token_accuracy": 0.40689654350280763, + "step": 31450 + }, + { + "epoch": 0.031681810078350914, + "grad_norm": 14.422252076922584, + "learning_rate": 3.168120379509699e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.44827585816383364, + "step": 31455 + }, + { + "epoch": 0.03168684613145509, + "grad_norm": 15.004807021625458, + "learning_rate": 3.168623975182806e-05, + "loss": 2.8335, + "mean_token_accuracy": 0.3620689630508423, + "step": 31460 + }, + { + "epoch": 0.03169188218455926, + "grad_norm": 15.744403779421775, + "learning_rate": 3.169127570855912e-05, + "loss": 2.3454, + "mean_token_accuracy": 0.4517241358757019, + "step": 31465 + }, + { + "epoch": 0.031696918237663435, + "grad_norm": 15.401118211554786, + "learning_rate": 3.169631166529018e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.3448275804519653, + "step": 31470 + }, + { + "epoch": 0.0317019542907676, + "grad_norm": 15.825140683763951, + "learning_rate": 3.170134762202123e-05, + "loss": 2.536, + "mean_token_accuracy": 0.3827586233615875, + "step": 31475 + }, + { + "epoch": 0.031706990343871776, + "grad_norm": 19.175481911711067, + "learning_rate": 3.170638357875229e-05, + "loss": 2.9579, + "mean_token_accuracy": 0.358620685338974, + "step": 31480 + }, + { + "epoch": 0.03171202639697595, + "grad_norm": 17.952788758661267, + "learning_rate": 3.171141953548335e-05, + "loss": 2.7074, + "mean_token_accuracy": 0.4413793087005615, + "step": 31485 + }, + { + "epoch": 0.03171706245008012, + "grad_norm": 18.461355586947736, + "learning_rate": 3.1716455492214414e-05, + "loss": 2.8004, + "mean_token_accuracy": 0.37586207687854767, + "step": 31490 + }, + { + "epoch": 0.0317220985031843, + "grad_norm": 16.67761105066735, + "learning_rate": 3.172149144894547e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.4068965494632721, + "step": 31495 + }, + { + "epoch": 0.03172713455628847, + "grad_norm": 14.592085228725374, + "learning_rate": 3.172652740567653e-05, + "loss": 2.1967, + "mean_token_accuracy": 0.45172414779663084, + "step": 31500 + }, + { + "epoch": 0.031732170609392645, + "grad_norm": 15.874502254152878, + "learning_rate": 3.173156336240759e-05, + "loss": 2.4523, + "mean_token_accuracy": 0.4310344845056534, + "step": 31505 + }, + { + "epoch": 0.03173720666249681, + "grad_norm": 17.221500566629857, + "learning_rate": 3.173659931913865e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.38620689511299133, + "step": 31510 + }, + { + "epoch": 0.031742242715600985, + "grad_norm": 18.42981261195008, + "learning_rate": 3.174163527586971e-05, + "loss": 2.4767, + "mean_token_accuracy": 0.3999999940395355, + "step": 31515 + }, + { + "epoch": 0.03174727876870516, + "grad_norm": 18.877477348149434, + "learning_rate": 3.174667123260077e-05, + "loss": 2.1275, + "mean_token_accuracy": 0.47084089517593386, + "step": 31520 + }, + { + "epoch": 0.03175231482180933, + "grad_norm": 11.620321892250821, + "learning_rate": 3.175170718933183e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.46551724076271056, + "step": 31525 + }, + { + "epoch": 0.031757350874913506, + "grad_norm": 13.101133111318582, + "learning_rate": 3.175674314606289e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.358620685338974, + "step": 31530 + }, + { + "epoch": 0.03176238692801768, + "grad_norm": 14.062639592531823, + "learning_rate": 3.176177910279395e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.4068965554237366, + "step": 31535 + }, + { + "epoch": 0.031767422981121854, + "grad_norm": 15.321546480004097, + "learning_rate": 3.176681505952501e-05, + "loss": 2.286, + "mean_token_accuracy": 0.458620685338974, + "step": 31540 + }, + { + "epoch": 0.03177245903422602, + "grad_norm": 21.721346330798248, + "learning_rate": 3.177185101625607e-05, + "loss": 2.5814, + "mean_token_accuracy": 0.3655172288417816, + "step": 31545 + }, + { + "epoch": 0.031777495087330195, + "grad_norm": 18.35941125727484, + "learning_rate": 3.177688697298713e-05, + "loss": 2.5259, + "mean_token_accuracy": 0.3965517282485962, + "step": 31550 + }, + { + "epoch": 0.03178253114043437, + "grad_norm": 20.086716875417277, + "learning_rate": 3.178192292971819e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.39655172228813174, + "step": 31555 + }, + { + "epoch": 0.03178756719353854, + "grad_norm": 18.096239198650704, + "learning_rate": 3.178695888644925e-05, + "loss": 2.515, + "mean_token_accuracy": 0.37586206793785093, + "step": 31560 + }, + { + "epoch": 0.031792603246642716, + "grad_norm": 15.904593621180277, + "learning_rate": 3.179199484318031e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.4103448331356049, + "step": 31565 + }, + { + "epoch": 0.03179763929974689, + "grad_norm": 13.609211704508473, + "learning_rate": 3.179703079991137e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.4551724135875702, + "step": 31570 + }, + { + "epoch": 0.03180267535285106, + "grad_norm": 15.502936242992288, + "learning_rate": 3.180206675664243e-05, + "loss": 2.7459, + "mean_token_accuracy": 0.39310344457626345, + "step": 31575 + }, + { + "epoch": 0.03180771140595523, + "grad_norm": 16.840585964848913, + "learning_rate": 3.180710271337349e-05, + "loss": 2.4033, + "mean_token_accuracy": 0.42413793206214906, + "step": 31580 + }, + { + "epoch": 0.031812747459059404, + "grad_norm": 15.703083750291317, + "learning_rate": 3.1812138670104547e-05, + "loss": 2.503, + "mean_token_accuracy": 0.42413792610168455, + "step": 31585 + }, + { + "epoch": 0.03181778351216358, + "grad_norm": 19.00823114562094, + "learning_rate": 3.1817174626835606e-05, + "loss": 2.6304, + "mean_token_accuracy": 0.42758620977401735, + "step": 31590 + }, + { + "epoch": 0.03182281956526775, + "grad_norm": 14.276788930685148, + "learning_rate": 3.182221058356667e-05, + "loss": 2.3928, + "mean_token_accuracy": 0.36896551251411436, + "step": 31595 + }, + { + "epoch": 0.031827855618371925, + "grad_norm": 19.897165155676845, + "learning_rate": 3.182724654029773e-05, + "loss": 2.855, + "mean_token_accuracy": 0.39655172228813174, + "step": 31600 + }, + { + "epoch": 0.0318328916714761, + "grad_norm": 14.947447954346066, + "learning_rate": 3.183228249702879e-05, + "loss": 2.7942, + "mean_token_accuracy": 0.3620689630508423, + "step": 31605 + }, + { + "epoch": 0.03183792772458027, + "grad_norm": 14.223649027208067, + "learning_rate": 3.183731845375984e-05, + "loss": 2.4967, + "mean_token_accuracy": 0.4310344815254211, + "step": 31610 + }, + { + "epoch": 0.03184296377768444, + "grad_norm": 15.142297670074997, + "learning_rate": 3.18423544104909e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.4, + "step": 31615 + }, + { + "epoch": 0.03184799983078861, + "grad_norm": 13.37620587207293, + "learning_rate": 3.184739036722197e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.4172413766384125, + "step": 31620 + }, + { + "epoch": 0.03185303588389279, + "grad_norm": 14.637774616054092, + "learning_rate": 3.185242632395303e-05, + "loss": 2.5065, + "mean_token_accuracy": 0.39655172228813174, + "step": 31625 + }, + { + "epoch": 0.03185807193699696, + "grad_norm": 14.375421932224224, + "learning_rate": 3.1857462280684087e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.3758620709180832, + "step": 31630 + }, + { + "epoch": 0.031863107990101135, + "grad_norm": 14.83339160218976, + "learning_rate": 3.1862498237415146e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.34827585220336915, + "step": 31635 + }, + { + "epoch": 0.03186814404320531, + "grad_norm": 14.669425163894635, + "learning_rate": 3.1867534194146205e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.3896551728248596, + "step": 31640 + }, + { + "epoch": 0.03187318009630948, + "grad_norm": 15.785978085563007, + "learning_rate": 3.187257015087727e-05, + "loss": 2.3561, + "mean_token_accuracy": 0.4465517282485962, + "step": 31645 + }, + { + "epoch": 0.03187821614941365, + "grad_norm": 14.455878305703173, + "learning_rate": 3.1877606107608324e-05, + "loss": 2.5137, + "mean_token_accuracy": 0.4103448331356049, + "step": 31650 + }, + { + "epoch": 0.03188325220251782, + "grad_norm": 13.736572803826203, + "learning_rate": 3.188264206433938e-05, + "loss": 2.509, + "mean_token_accuracy": 0.37931033968925476, + "step": 31655 + }, + { + "epoch": 0.031888288255622, + "grad_norm": 22.019080362294957, + "learning_rate": 3.188767802107044e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.4379310369491577, + "step": 31660 + }, + { + "epoch": 0.03189332430872617, + "grad_norm": 14.246769762409034, + "learning_rate": 3.18927139778015e-05, + "loss": 2.7988, + "mean_token_accuracy": 0.34482758641242983, + "step": 31665 + }, + { + "epoch": 0.031898360361830344, + "grad_norm": 15.584046437064652, + "learning_rate": 3.189774993453256e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.4034482717514038, + "step": 31670 + }, + { + "epoch": 0.03190339641493452, + "grad_norm": 14.410254084746354, + "learning_rate": 3.190278589126363e-05, + "loss": 2.5094, + "mean_token_accuracy": 0.41379310488700866, + "step": 31675 + }, + { + "epoch": 0.03190843246803869, + "grad_norm": 15.974886832874885, + "learning_rate": 3.1907821847994686e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.3896551698446274, + "step": 31680 + }, + { + "epoch": 0.03191346852114286, + "grad_norm": 14.018783291786033, + "learning_rate": 3.1912857804725745e-05, + "loss": 2.7433, + "mean_token_accuracy": 0.4103448212146759, + "step": 31685 + }, + { + "epoch": 0.03191850457424703, + "grad_norm": 15.887350090765855, + "learning_rate": 3.1917893761456804e-05, + "loss": 2.221, + "mean_token_accuracy": 0.44827585816383364, + "step": 31690 + }, + { + "epoch": 0.031923540627351206, + "grad_norm": 19.077985577253013, + "learning_rate": 3.1922929718187864e-05, + "loss": 2.3239, + "mean_token_accuracy": 0.458620685338974, + "step": 31695 + }, + { + "epoch": 0.03192857668045538, + "grad_norm": 14.531030389339577, + "learning_rate": 3.192796567491892e-05, + "loss": 2.3882, + "mean_token_accuracy": 0.42413792610168455, + "step": 31700 + }, + { + "epoch": 0.031933612733559553, + "grad_norm": 15.137880868002352, + "learning_rate": 3.193300163164998e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.4517241418361664, + "step": 31705 + }, + { + "epoch": 0.03193864878666373, + "grad_norm": 19.463340667724417, + "learning_rate": 3.193803758838104e-05, + "loss": 2.5424, + "mean_token_accuracy": 0.44482758045196535, + "step": 31710 + }, + { + "epoch": 0.0319436848397679, + "grad_norm": 16.49929795130981, + "learning_rate": 3.19430735451121e-05, + "loss": 2.759, + "mean_token_accuracy": 0.38620689511299133, + "step": 31715 + }, + { + "epoch": 0.03194872089287207, + "grad_norm": 16.736216086243157, + "learning_rate": 3.194810950184316e-05, + "loss": 2.6337, + "mean_token_accuracy": 0.4275861978530884, + "step": 31720 + }, + { + "epoch": 0.03195375694597624, + "grad_norm": 16.23006486115833, + "learning_rate": 3.1953145458574226e-05, + "loss": 2.6085, + "mean_token_accuracy": 0.3965517282485962, + "step": 31725 + }, + { + "epoch": 0.031958792999080415, + "grad_norm": 13.194143443043792, + "learning_rate": 3.1958181415305285e-05, + "loss": 2.3266, + "mean_token_accuracy": 0.4413793087005615, + "step": 31730 + }, + { + "epoch": 0.03196382905218459, + "grad_norm": 15.630480005445824, + "learning_rate": 3.1963217372036344e-05, + "loss": 2.8838, + "mean_token_accuracy": 0.39655172228813174, + "step": 31735 + }, + { + "epoch": 0.03196886510528876, + "grad_norm": 14.79549729913046, + "learning_rate": 3.19682533287674e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.42758620977401735, + "step": 31740 + }, + { + "epoch": 0.03197390115839294, + "grad_norm": 22.373040694030642, + "learning_rate": 3.1973289285498456e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.41179673075675965, + "step": 31745 + }, + { + "epoch": 0.03197893721149711, + "grad_norm": 17.390565570140257, + "learning_rate": 3.1978325242229515e-05, + "loss": 2.7497, + "mean_token_accuracy": 0.4034482717514038, + "step": 31750 + }, + { + "epoch": 0.03198397326460128, + "grad_norm": 19.896043024010247, + "learning_rate": 3.198336119896058e-05, + "loss": 3.0829, + "mean_token_accuracy": 0.3137931048870087, + "step": 31755 + }, + { + "epoch": 0.03198900931770545, + "grad_norm": 14.38814132300305, + "learning_rate": 3.198839715569164e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.4172413766384125, + "step": 31760 + }, + { + "epoch": 0.031994045370809625, + "grad_norm": 15.663242208109411, + "learning_rate": 3.19934331124227e-05, + "loss": 2.8776, + "mean_token_accuracy": 0.384633994102478, + "step": 31765 + }, + { + "epoch": 0.0319990814239138, + "grad_norm": 19.162530296818833, + "learning_rate": 3.199846906915376e-05, + "loss": 2.8799, + "mean_token_accuracy": 0.358620685338974, + "step": 31770 + }, + { + "epoch": 0.03200411747701797, + "grad_norm": 11.452596099938221, + "learning_rate": 3.200350502588482e-05, + "loss": 2.7131, + "mean_token_accuracy": 0.4137930929660797, + "step": 31775 + }, + { + "epoch": 0.032009153530122146, + "grad_norm": 15.490050237737035, + "learning_rate": 3.200854098261588e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.4206896543502808, + "step": 31780 + }, + { + "epoch": 0.03201418958322632, + "grad_norm": 14.314799150622523, + "learning_rate": 3.201357693934694e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.42758620381355283, + "step": 31785 + }, + { + "epoch": 0.03201922563633049, + "grad_norm": 16.9258713728606, + "learning_rate": 3.2018612896077996e-05, + "loss": 2.6915, + "mean_token_accuracy": 0.3827586203813553, + "step": 31790 + }, + { + "epoch": 0.03202426168943466, + "grad_norm": 13.844012758967448, + "learning_rate": 3.2023648852809055e-05, + "loss": 2.83, + "mean_token_accuracy": 0.42413792610168455, + "step": 31795 + }, + { + "epoch": 0.032029297742538834, + "grad_norm": 13.988678618649534, + "learning_rate": 3.2028684809540115e-05, + "loss": 2.2807, + "mean_token_accuracy": 0.45172414779663084, + "step": 31800 + }, + { + "epoch": 0.03203433379564301, + "grad_norm": 16.432931553909285, + "learning_rate": 3.203372076627118e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.3896551728248596, + "step": 31805 + }, + { + "epoch": 0.03203936984874718, + "grad_norm": 17.372332683020776, + "learning_rate": 3.203875672300224e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.3931034505367279, + "step": 31810 + }, + { + "epoch": 0.032044405901851356, + "grad_norm": 19.722913862720034, + "learning_rate": 3.20437926797333e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.4034482777118683, + "step": 31815 + }, + { + "epoch": 0.03204944195495553, + "grad_norm": 14.232767844795314, + "learning_rate": 3.204882863646436e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.37586206793785093, + "step": 31820 + }, + { + "epoch": 0.032054478008059696, + "grad_norm": 14.664770014833527, + "learning_rate": 3.205386459319542e-05, + "loss": 2.6077, + "mean_token_accuracy": 0.41724138557910917, + "step": 31825 + }, + { + "epoch": 0.03205951406116387, + "grad_norm": 14.629830340814618, + "learning_rate": 3.205890054992648e-05, + "loss": 2.2064, + "mean_token_accuracy": 0.47586206793785096, + "step": 31830 + }, + { + "epoch": 0.032064550114268044, + "grad_norm": 15.71497217950326, + "learning_rate": 3.2063936506657536e-05, + "loss": 2.5723, + "mean_token_accuracy": 0.4310344815254211, + "step": 31835 + }, + { + "epoch": 0.03206958616737222, + "grad_norm": 16.89048236517421, + "learning_rate": 3.2068972463388596e-05, + "loss": 2.6875, + "mean_token_accuracy": 0.3827586233615875, + "step": 31840 + }, + { + "epoch": 0.03207462222047639, + "grad_norm": 13.939032393676971, + "learning_rate": 3.2074008420119655e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.46055657267570493, + "step": 31845 + }, + { + "epoch": 0.032079658273580565, + "grad_norm": 15.22873089984502, + "learning_rate": 3.2079044376850714e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.42413792610168455, + "step": 31850 + }, + { + "epoch": 0.03208469432668474, + "grad_norm": 18.6062703212802, + "learning_rate": 3.208408033358177e-05, + "loss": 2.8232, + "mean_token_accuracy": 0.36896551847457887, + "step": 31855 + }, + { + "epoch": 0.032089730379788906, + "grad_norm": 16.753382946106342, + "learning_rate": 3.208911629031284e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.43448275327682495, + "step": 31860 + }, + { + "epoch": 0.03209476643289308, + "grad_norm": 22.995077819149248, + "learning_rate": 3.20941522470439e-05, + "loss": 2.4631, + "mean_token_accuracy": 0.4551724135875702, + "step": 31865 + }, + { + "epoch": 0.03209980248599725, + "grad_norm": 13.258081784776488, + "learning_rate": 3.209918820377496e-05, + "loss": 2.257, + "mean_token_accuracy": 0.4934664249420166, + "step": 31870 + }, + { + "epoch": 0.03210483853910143, + "grad_norm": 18.049275558511553, + "learning_rate": 3.210422416050601e-05, + "loss": 2.46, + "mean_token_accuracy": 0.4344827592372894, + "step": 31875 + }, + { + "epoch": 0.0321098745922056, + "grad_norm": 16.098708798849017, + "learning_rate": 3.210926011723707e-05, + "loss": 2.0541, + "mean_token_accuracy": 0.4862068951129913, + "step": 31880 + }, + { + "epoch": 0.032114910645309774, + "grad_norm": 12.529038684230542, + "learning_rate": 3.2114296073968136e-05, + "loss": 2.7787, + "mean_token_accuracy": 0.39310344457626345, + "step": 31885 + }, + { + "epoch": 0.03211994669841395, + "grad_norm": 15.89469111712078, + "learning_rate": 3.2119332030699195e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.42758620977401735, + "step": 31890 + }, + { + "epoch": 0.032124982751518115, + "grad_norm": 14.871866979347494, + "learning_rate": 3.2124367987430254e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.40689656138420105, + "step": 31895 + }, + { + "epoch": 0.03213001880462229, + "grad_norm": 18.132815438206723, + "learning_rate": 3.212940394416131e-05, + "loss": 2.4753, + "mean_token_accuracy": 0.41034482717514037, + "step": 31900 + }, + { + "epoch": 0.03213505485772646, + "grad_norm": 13.057225950108466, + "learning_rate": 3.213443990089237e-05, + "loss": 2.1939, + "mean_token_accuracy": 0.4068965494632721, + "step": 31905 + }, + { + "epoch": 0.032140090910830636, + "grad_norm": 13.792825233556757, + "learning_rate": 3.213947585762344e-05, + "loss": 2.6195, + "mean_token_accuracy": 0.4034482777118683, + "step": 31910 + }, + { + "epoch": 0.03214512696393481, + "grad_norm": 14.798467684527143, + "learning_rate": 3.214451181435449e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.4572292804718018, + "step": 31915 + }, + { + "epoch": 0.032150163017038984, + "grad_norm": 13.284380487643643, + "learning_rate": 3.214954777108555e-05, + "loss": 2.439, + "mean_token_accuracy": 0.4034482777118683, + "step": 31920 + }, + { + "epoch": 0.03215519907014316, + "grad_norm": 14.765989799890471, + "learning_rate": 3.215458372781661e-05, + "loss": 1.994, + "mean_token_accuracy": 0.49879008531570435, + "step": 31925 + }, + { + "epoch": 0.032160235123247324, + "grad_norm": 18.4559098476638, + "learning_rate": 3.215961968454767e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.4330913484096527, + "step": 31930 + }, + { + "epoch": 0.0321652711763515, + "grad_norm": 17.271322567993348, + "learning_rate": 3.216465564127873e-05, + "loss": 2.6926, + "mean_token_accuracy": 0.38275861740112305, + "step": 31935 + }, + { + "epoch": 0.03217030722945567, + "grad_norm": 15.531095598974309, + "learning_rate": 3.2169691598009794e-05, + "loss": 2.8378, + "mean_token_accuracy": 0.3655172407627106, + "step": 31940 + }, + { + "epoch": 0.032175343282559846, + "grad_norm": 21.826110288110947, + "learning_rate": 3.217472755474085e-05, + "loss": 2.794, + "mean_token_accuracy": 0.39310344457626345, + "step": 31945 + }, + { + "epoch": 0.03218037933566402, + "grad_norm": 13.483256730885234, + "learning_rate": 3.217976351147191e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.4068965554237366, + "step": 31950 + }, + { + "epoch": 0.03218541538876819, + "grad_norm": 25.399759107685377, + "learning_rate": 3.218479946820297e-05, + "loss": 2.7941, + "mean_token_accuracy": 0.3655172407627106, + "step": 31955 + }, + { + "epoch": 0.03219045144187237, + "grad_norm": 13.795769592448956, + "learning_rate": 3.218983542493403e-05, + "loss": 2.2029, + "mean_token_accuracy": 0.46896551847457885, + "step": 31960 + }, + { + "epoch": 0.032195487494976534, + "grad_norm": 19.687534207059002, + "learning_rate": 3.219487138166509e-05, + "loss": 2.7932, + "mean_token_accuracy": 0.38124621510505674, + "step": 31965 + }, + { + "epoch": 0.03220052354808071, + "grad_norm": 15.66597812115187, + "learning_rate": 3.219990733839615e-05, + "loss": 2.1681, + "mean_token_accuracy": 0.41379310488700866, + "step": 31970 + }, + { + "epoch": 0.03220555960118488, + "grad_norm": 18.582798690754903, + "learning_rate": 3.220494329512721e-05, + "loss": 2.505, + "mean_token_accuracy": 0.40344828367233276, + "step": 31975 + }, + { + "epoch": 0.032210595654289055, + "grad_norm": 20.892540358918726, + "learning_rate": 3.220997925185827e-05, + "loss": 2.7942, + "mean_token_accuracy": 0.358620685338974, + "step": 31980 + }, + { + "epoch": 0.03221563170739323, + "grad_norm": 21.311820183596392, + "learning_rate": 3.221501520858933e-05, + "loss": 2.8461, + "mean_token_accuracy": 0.3482758641242981, + "step": 31985 + }, + { + "epoch": 0.0322206677604974, + "grad_norm": 13.959394583223167, + "learning_rate": 3.2220051165320393e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.37931033968925476, + "step": 31990 + }, + { + "epoch": 0.032225703813601576, + "grad_norm": 17.68524256841339, + "learning_rate": 3.222508712205145e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.45172413885593415, + "step": 31995 + }, + { + "epoch": 0.03223073986670574, + "grad_norm": 11.963026965281514, + "learning_rate": 3.223012307878251e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4229885071516037, + "step": 32000 + }, + { + "epoch": 0.03223577591980992, + "grad_norm": 20.74772290321624, + "learning_rate": 3.223515903551357e-05, + "loss": 2.2171, + "mean_token_accuracy": 0.45172412395477296, + "step": 32005 + }, + { + "epoch": 0.03224081197291409, + "grad_norm": 25.81333277563238, + "learning_rate": 3.2240194992244624e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.43448275327682495, + "step": 32010 + }, + { + "epoch": 0.032245848026018264, + "grad_norm": 16.399910027816013, + "learning_rate": 3.224523094897568e-05, + "loss": 2.6021, + "mean_token_accuracy": 0.41034482717514037, + "step": 32015 + }, + { + "epoch": 0.03225088407912244, + "grad_norm": 15.444067457505872, + "learning_rate": 3.225026690570675e-05, + "loss": 2.328, + "mean_token_accuracy": 0.4344827592372894, + "step": 32020 + }, + { + "epoch": 0.03225592013222661, + "grad_norm": 18.380717556984646, + "learning_rate": 3.225530286243781e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.4068965494632721, + "step": 32025 + }, + { + "epoch": 0.032260956185330786, + "grad_norm": 21.335418193543553, + "learning_rate": 3.226033881916887e-05, + "loss": 2.582, + "mean_token_accuracy": 0.4034482777118683, + "step": 32030 + }, + { + "epoch": 0.03226599223843495, + "grad_norm": 16.14789670198836, + "learning_rate": 3.226537477589993e-05, + "loss": 2.5123, + "mean_token_accuracy": 0.39310344457626345, + "step": 32035 + }, + { + "epoch": 0.032271028291539126, + "grad_norm": 37.10484275750181, + "learning_rate": 3.2270410732630986e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.37586206793785093, + "step": 32040 + }, + { + "epoch": 0.0322760643446433, + "grad_norm": 16.336733506172358, + "learning_rate": 3.227544668936205e-05, + "loss": 2.2392, + "mean_token_accuracy": 0.4551724135875702, + "step": 32045 + }, + { + "epoch": 0.032281100397747474, + "grad_norm": 13.222422764888664, + "learning_rate": 3.2280482646093104e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.4896551728248596, + "step": 32050 + }, + { + "epoch": 0.03228613645085165, + "grad_norm": 16.184272726538154, + "learning_rate": 3.2285518602824164e-05, + "loss": 2.6389, + "mean_token_accuracy": 0.4120992124080658, + "step": 32055 + }, + { + "epoch": 0.03229117250395582, + "grad_norm": 19.032141198336728, + "learning_rate": 3.229055455955522e-05, + "loss": 2.1948, + "mean_token_accuracy": 0.4620689690113068, + "step": 32060 + }, + { + "epoch": 0.032296208557059995, + "grad_norm": 17.6952403882455, + "learning_rate": 3.229559051628628e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.4150030255317688, + "step": 32065 + }, + { + "epoch": 0.03230124461016416, + "grad_norm": 14.975568016200587, + "learning_rate": 3.230062647301735e-05, + "loss": 3.0166, + "mean_token_accuracy": 0.35862069129943847, + "step": 32070 + }, + { + "epoch": 0.032306280663268336, + "grad_norm": 16.535068595972923, + "learning_rate": 3.230566242974841e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.38620689511299133, + "step": 32075 + }, + { + "epoch": 0.03231131671637251, + "grad_norm": 18.23646602773212, + "learning_rate": 3.231069838647947e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.4172413766384125, + "step": 32080 + }, + { + "epoch": 0.03231635276947668, + "grad_norm": 17.42718513526834, + "learning_rate": 3.2315734343210526e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.4360556542873383, + "step": 32085 + }, + { + "epoch": 0.03232138882258086, + "grad_norm": 19.450275304985905, + "learning_rate": 3.2320770299941585e-05, + "loss": 2.9534, + "mean_token_accuracy": 0.36896551847457887, + "step": 32090 + }, + { + "epoch": 0.03232642487568503, + "grad_norm": 16.82510128371573, + "learning_rate": 3.2325806256672645e-05, + "loss": 2.487, + "mean_token_accuracy": 0.44996975660324096, + "step": 32095 + }, + { + "epoch": 0.032331460928789205, + "grad_norm": 13.330910089092427, + "learning_rate": 3.2330842213403704e-05, + "loss": 2.5524, + "mean_token_accuracy": 0.39745916724205016, + "step": 32100 + }, + { + "epoch": 0.03233649698189337, + "grad_norm": 12.860601540779621, + "learning_rate": 3.233587817013476e-05, + "loss": 2.3268, + "mean_token_accuracy": 0.4344827651977539, + "step": 32105 + }, + { + "epoch": 0.032341533034997545, + "grad_norm": 18.282960698250932, + "learning_rate": 3.234091412686582e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.458620685338974, + "step": 32110 + }, + { + "epoch": 0.03234656908810172, + "grad_norm": 14.623584881450494, + "learning_rate": 3.234595008359688e-05, + "loss": 2.6351, + "mean_token_accuracy": 0.38275861740112305, + "step": 32115 + }, + { + "epoch": 0.03235160514120589, + "grad_norm": 15.024159822342462, + "learning_rate": 3.235098604032794e-05, + "loss": 2.5269, + "mean_token_accuracy": 0.4225045382976532, + "step": 32120 + }, + { + "epoch": 0.032356641194310067, + "grad_norm": 19.099184236849922, + "learning_rate": 3.235602199705901e-05, + "loss": 2.4594, + "mean_token_accuracy": 0.46551724076271056, + "step": 32125 + }, + { + "epoch": 0.03236167724741424, + "grad_norm": 16.44982788636979, + "learning_rate": 3.2361057953790066e-05, + "loss": 2.4191, + "mean_token_accuracy": 0.4448275864124298, + "step": 32130 + }, + { + "epoch": 0.032366713300518414, + "grad_norm": 13.983052278791792, + "learning_rate": 3.2366093910521125e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.43103447556495667, + "step": 32135 + }, + { + "epoch": 0.03237174935362258, + "grad_norm": 18.17881352541018, + "learning_rate": 3.2371129867252185e-05, + "loss": 2.346, + "mean_token_accuracy": 0.4310344815254211, + "step": 32140 + }, + { + "epoch": 0.032376785406726755, + "grad_norm": 13.80726677187621, + "learning_rate": 3.237616582398324e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.42413792610168455, + "step": 32145 + }, + { + "epoch": 0.03238182145983093, + "grad_norm": 16.28163112531554, + "learning_rate": 3.23812017807143e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.4137930989265442, + "step": 32150 + }, + { + "epoch": 0.0323868575129351, + "grad_norm": 15.098626086410823, + "learning_rate": 3.238623773744536e-05, + "loss": 2.95, + "mean_token_accuracy": 0.35517241060733795, + "step": 32155 + }, + { + "epoch": 0.032391893566039276, + "grad_norm": 15.224541699012704, + "learning_rate": 3.239127369417642e-05, + "loss": 2.5146, + "mean_token_accuracy": 0.3862068891525269, + "step": 32160 + }, + { + "epoch": 0.03239692961914345, + "grad_norm": 24.51257894992692, + "learning_rate": 3.239630965090748e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.3586206793785095, + "step": 32165 + }, + { + "epoch": 0.032401965672247623, + "grad_norm": 16.041702719256328, + "learning_rate": 3.240134560763854e-05, + "loss": 2.1935, + "mean_token_accuracy": 0.4379310369491577, + "step": 32170 + }, + { + "epoch": 0.03240700172535179, + "grad_norm": 20.03427273189437, + "learning_rate": 3.24063815643696e-05, + "loss": 2.8543, + "mean_token_accuracy": 0.3999999940395355, + "step": 32175 + }, + { + "epoch": 0.032412037778455964, + "grad_norm": 15.025860946444702, + "learning_rate": 3.2411417521100665e-05, + "loss": 2.6747, + "mean_token_accuracy": 0.3896551728248596, + "step": 32180 + }, + { + "epoch": 0.03241707383156014, + "grad_norm": 15.18273402469302, + "learning_rate": 3.241645347783172e-05, + "loss": 2.3062, + "mean_token_accuracy": 0.4517241358757019, + "step": 32185 + }, + { + "epoch": 0.03242210988466431, + "grad_norm": 18.556809033548994, + "learning_rate": 3.242148943456278e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.42068966031074523, + "step": 32190 + }, + { + "epoch": 0.032427145937768485, + "grad_norm": 16.060983362481547, + "learning_rate": 3.2426525391293836e-05, + "loss": 2.814, + "mean_token_accuracy": 0.35862069129943847, + "step": 32195 + }, + { + "epoch": 0.03243218199087266, + "grad_norm": 14.05237509166311, + "learning_rate": 3.2431561348024896e-05, + "loss": 2.6087, + "mean_token_accuracy": 0.38275861740112305, + "step": 32200 + }, + { + "epoch": 0.03243721804397683, + "grad_norm": 17.0173683494379, + "learning_rate": 3.243659730475596e-05, + "loss": 2.4193, + "mean_token_accuracy": 0.4218995749950409, + "step": 32205 + }, + { + "epoch": 0.032442254097081, + "grad_norm": 14.105437716019017, + "learning_rate": 3.244163326148702e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.3965517163276672, + "step": 32210 + }, + { + "epoch": 0.032447290150185173, + "grad_norm": 18.915968337623788, + "learning_rate": 3.244666921821808e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.3965517282485962, + "step": 32215 + }, + { + "epoch": 0.03245232620328935, + "grad_norm": 18.336440009248875, + "learning_rate": 3.245170517494914e-05, + "loss": 2.4277, + "mean_token_accuracy": 0.39655172228813174, + "step": 32220 + }, + { + "epoch": 0.03245736225639352, + "grad_norm": 18.859294052830872, + "learning_rate": 3.24567411316802e-05, + "loss": 2.4793, + "mean_token_accuracy": 0.41209921836853025, + "step": 32225 + }, + { + "epoch": 0.032462398309497695, + "grad_norm": 13.70911000016588, + "learning_rate": 3.246177708841126e-05, + "loss": 2.7154, + "mean_token_accuracy": 0.38620689809322356, + "step": 32230 + }, + { + "epoch": 0.03246743436260187, + "grad_norm": 14.393306776236178, + "learning_rate": 3.246681304514232e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.37586207389831544, + "step": 32235 + }, + { + "epoch": 0.03247247041570604, + "grad_norm": 16.65650115500263, + "learning_rate": 3.2471849001873376e-05, + "loss": 1.9689, + "mean_token_accuracy": 0.44827585816383364, + "step": 32240 + }, + { + "epoch": 0.03247750646881021, + "grad_norm": 13.093253149439937, + "learning_rate": 3.2476884958604436e-05, + "loss": 2.0934, + "mean_token_accuracy": 0.43448275327682495, + "step": 32245 + }, + { + "epoch": 0.03248254252191438, + "grad_norm": 13.270009278754165, + "learning_rate": 3.2481920915335495e-05, + "loss": 2.5177, + "mean_token_accuracy": 0.4310344815254211, + "step": 32250 + }, + { + "epoch": 0.03248757857501856, + "grad_norm": 14.8297682942415, + "learning_rate": 3.2486956872066554e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.3413793116807938, + "step": 32255 + }, + { + "epoch": 0.03249261462812273, + "grad_norm": 15.850042286632341, + "learning_rate": 3.249199282879762e-05, + "loss": 2.5311, + "mean_token_accuracy": 0.34137930870056155, + "step": 32260 + }, + { + "epoch": 0.032497650681226904, + "grad_norm": 17.355580273969014, + "learning_rate": 3.249702878552868e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.3793103456497192, + "step": 32265 + }, + { + "epoch": 0.03250268673433108, + "grad_norm": 18.597345245564842, + "learning_rate": 3.250206474225974e-05, + "loss": 2.606, + "mean_token_accuracy": 0.38965516686439516, + "step": 32270 + }, + { + "epoch": 0.03250772278743525, + "grad_norm": 17.691182532611357, + "learning_rate": 3.250710069899079e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.42238354682922363, + "step": 32275 + }, + { + "epoch": 0.03251275884053942, + "grad_norm": 12.973132273489238, + "learning_rate": 3.251213665572185e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.41379311084747317, + "step": 32280 + }, + { + "epoch": 0.03251779489364359, + "grad_norm": 15.411380593076348, + "learning_rate": 3.2517172612452916e-05, + "loss": 2.7634, + "mean_token_accuracy": 0.3811857283115387, + "step": 32285 + }, + { + "epoch": 0.032522830946747766, + "grad_norm": 15.80851553983375, + "learning_rate": 3.2522208569183976e-05, + "loss": 2.5959, + "mean_token_accuracy": 0.36896551251411436, + "step": 32290 + }, + { + "epoch": 0.03252786699985194, + "grad_norm": 13.904617417854315, + "learning_rate": 3.2527244525915035e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.4310344934463501, + "step": 32295 + }, + { + "epoch": 0.032532903052956114, + "grad_norm": 18.111453857054155, + "learning_rate": 3.2532280482646094e-05, + "loss": 2.7707, + "mean_token_accuracy": 0.3793103516101837, + "step": 32300 + }, + { + "epoch": 0.03253793910606029, + "grad_norm": 15.259650204224474, + "learning_rate": 3.2537316439377153e-05, + "loss": 2.1062, + "mean_token_accuracy": 0.42758620381355283, + "step": 32305 + }, + { + "epoch": 0.03254297515916446, + "grad_norm": 17.14264131869594, + "learning_rate": 3.254235239610822e-05, + "loss": 2.4472, + "mean_token_accuracy": 0.4137930989265442, + "step": 32310 + }, + { + "epoch": 0.03254801121226863, + "grad_norm": 18.216093845166327, + "learning_rate": 3.254738835283927e-05, + "loss": 3.3084, + "mean_token_accuracy": 0.3034482687711716, + "step": 32315 + }, + { + "epoch": 0.0325530472653728, + "grad_norm": 16.833267191726566, + "learning_rate": 3.255242430957033e-05, + "loss": 2.7356, + "mean_token_accuracy": 0.3827586233615875, + "step": 32320 + }, + { + "epoch": 0.032558083318476976, + "grad_norm": 14.263316660709878, + "learning_rate": 3.255746026630139e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.40689654350280763, + "step": 32325 + }, + { + "epoch": 0.03256311937158115, + "grad_norm": 21.129096249725375, + "learning_rate": 3.256249622303245e-05, + "loss": 2.6917, + "mean_token_accuracy": 0.3793103456497192, + "step": 32330 + }, + { + "epoch": 0.03256815542468532, + "grad_norm": 15.109700293836818, + "learning_rate": 3.2567532179763516e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.38275861740112305, + "step": 32335 + }, + { + "epoch": 0.0325731914777895, + "grad_norm": 14.672514085772532, + "learning_rate": 3.2572568136494575e-05, + "loss": 2.501, + "mean_token_accuracy": 0.42068966031074523, + "step": 32340 + }, + { + "epoch": 0.03257822753089367, + "grad_norm": 21.21441372986871, + "learning_rate": 3.2577604093225634e-05, + "loss": 2.4776, + "mean_token_accuracy": 0.42068964838981626, + "step": 32345 + }, + { + "epoch": 0.03258326358399784, + "grad_norm": 19.521571024778307, + "learning_rate": 3.2582640049956694e-05, + "loss": 2.8759, + "mean_token_accuracy": 0.38620689511299133, + "step": 32350 + }, + { + "epoch": 0.03258829963710201, + "grad_norm": 12.866670673767898, + "learning_rate": 3.258767600668775e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.4034482777118683, + "step": 32355 + }, + { + "epoch": 0.032593335690206185, + "grad_norm": 15.294640161388223, + "learning_rate": 3.259271196341881e-05, + "loss": 2.4256, + "mean_token_accuracy": 0.4275861978530884, + "step": 32360 + }, + { + "epoch": 0.03259837174331036, + "grad_norm": 22.458370346777123, + "learning_rate": 3.259774792014987e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.47032020688056947, + "step": 32365 + }, + { + "epoch": 0.03260340779641453, + "grad_norm": 26.28077278790031, + "learning_rate": 3.260278387688093e-05, + "loss": 2.7524, + "mean_token_accuracy": 0.37241379022598264, + "step": 32370 + }, + { + "epoch": 0.032608443849518706, + "grad_norm": 14.988661724543077, + "learning_rate": 3.260781983361199e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.37241379022598264, + "step": 32375 + }, + { + "epoch": 0.03261347990262288, + "grad_norm": 16.100163483926046, + "learning_rate": 3.261285579034305e-05, + "loss": 2.4524, + "mean_token_accuracy": 0.44482759237289426, + "step": 32380 + }, + { + "epoch": 0.03261851595572705, + "grad_norm": 16.267974310613923, + "learning_rate": 3.261789174707411e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.4344827592372894, + "step": 32385 + }, + { + "epoch": 0.03262355200883122, + "grad_norm": 13.636739659224656, + "learning_rate": 3.2622927703805174e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.43103448748588563, + "step": 32390 + }, + { + "epoch": 0.032628588061935394, + "grad_norm": 19.241362527388315, + "learning_rate": 3.2627963660536234e-05, + "loss": 2.713, + "mean_token_accuracy": 0.3655172407627106, + "step": 32395 + }, + { + "epoch": 0.03263362411503957, + "grad_norm": 17.101459549411683, + "learning_rate": 3.263299961726729e-05, + "loss": 2.5745, + "mean_token_accuracy": 0.4413793087005615, + "step": 32400 + }, + { + "epoch": 0.03263866016814374, + "grad_norm": 14.632414485445233, + "learning_rate": 3.263803557399835e-05, + "loss": 2.6531, + "mean_token_accuracy": 0.441379314661026, + "step": 32405 + }, + { + "epoch": 0.032643696221247916, + "grad_norm": 18.3375729386485, + "learning_rate": 3.2643071530729405e-05, + "loss": 2.975, + "mean_token_accuracy": 0.3655172407627106, + "step": 32410 + }, + { + "epoch": 0.03264873227435209, + "grad_norm": 25.145133926330377, + "learning_rate": 3.264810748746047e-05, + "loss": 2.6496, + "mean_token_accuracy": 0.37241379618644715, + "step": 32415 + }, + { + "epoch": 0.032653768327456256, + "grad_norm": 17.51596270172547, + "learning_rate": 3.265314344419153e-05, + "loss": 2.6977, + "mean_token_accuracy": 0.36896551847457887, + "step": 32420 + }, + { + "epoch": 0.03265880438056043, + "grad_norm": 17.64171167935097, + "learning_rate": 3.265817940092259e-05, + "loss": 2.874, + "mean_token_accuracy": 0.3655172407627106, + "step": 32425 + }, + { + "epoch": 0.032663840433664604, + "grad_norm": 13.934183998688624, + "learning_rate": 3.266321535765365e-05, + "loss": 2.2642, + "mean_token_accuracy": 0.4586206912994385, + "step": 32430 + }, + { + "epoch": 0.03266887648676878, + "grad_norm": 15.839976746346874, + "learning_rate": 3.266825131438471e-05, + "loss": 2.8586, + "mean_token_accuracy": 0.36702964305877683, + "step": 32435 + }, + { + "epoch": 0.03267391253987295, + "grad_norm": 16.75131639610263, + "learning_rate": 3.267328727111577e-05, + "loss": 3.1976, + "mean_token_accuracy": 0.3827586233615875, + "step": 32440 + }, + { + "epoch": 0.032678948592977125, + "grad_norm": 18.332316803193702, + "learning_rate": 3.267832322784683e-05, + "loss": 2.754, + "mean_token_accuracy": 0.38275861740112305, + "step": 32445 + }, + { + "epoch": 0.0326839846460813, + "grad_norm": 15.568692537268564, + "learning_rate": 3.2683359184577885e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.3931034505367279, + "step": 32450 + }, + { + "epoch": 0.032689020699185466, + "grad_norm": 12.886240656636897, + "learning_rate": 3.2688395141308945e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.4103448331356049, + "step": 32455 + }, + { + "epoch": 0.03269405675228964, + "grad_norm": 17.45917550119998, + "learning_rate": 3.2693431098040004e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.3551724016666412, + "step": 32460 + }, + { + "epoch": 0.03269909280539381, + "grad_norm": 18.194960230112926, + "learning_rate": 3.269846705477106e-05, + "loss": 2.8905, + "mean_token_accuracy": 0.3413793116807938, + "step": 32465 + }, + { + "epoch": 0.03270412885849799, + "grad_norm": 20.256597680089506, + "learning_rate": 3.270350301150213e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.4172413766384125, + "step": 32470 + }, + { + "epoch": 0.03270916491160216, + "grad_norm": 13.679478173965265, + "learning_rate": 3.270853896823319e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.4551724076271057, + "step": 32475 + }, + { + "epoch": 0.032714200964706334, + "grad_norm": 15.38817868053556, + "learning_rate": 3.271357492496425e-05, + "loss": 2.4598, + "mean_token_accuracy": 0.45359952449798585, + "step": 32480 + }, + { + "epoch": 0.03271923701781051, + "grad_norm": 13.33233748028832, + "learning_rate": 3.271861088169531e-05, + "loss": 2.0339, + "mean_token_accuracy": 0.48275862336158754, + "step": 32485 + }, + { + "epoch": 0.032724273070914675, + "grad_norm": 17.634742522008263, + "learning_rate": 3.2723646838426366e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.4624384164810181, + "step": 32490 + }, + { + "epoch": 0.03272930912401885, + "grad_norm": 16.972196529768752, + "learning_rate": 3.2728682795157425e-05, + "loss": 2.0786, + "mean_token_accuracy": 0.4745311617851257, + "step": 32495 + }, + { + "epoch": 0.03273434517712302, + "grad_norm": 15.63336540800433, + "learning_rate": 3.2733718751888485e-05, + "loss": 2.5229, + "mean_token_accuracy": 0.36206896901130675, + "step": 32500 + }, + { + "epoch": 0.032739381230227196, + "grad_norm": 15.744294802959585, + "learning_rate": 3.2738754708619544e-05, + "loss": 2.48, + "mean_token_accuracy": 0.37586207687854767, + "step": 32505 + }, + { + "epoch": 0.03274441728333137, + "grad_norm": 15.706699492731067, + "learning_rate": 3.27437906653506e-05, + "loss": 2.6452, + "mean_token_accuracy": 0.3827586114406586, + "step": 32510 + }, + { + "epoch": 0.032749453336435544, + "grad_norm": 14.62583286960497, + "learning_rate": 3.274882662208166e-05, + "loss": 2.5581, + "mean_token_accuracy": 0.42413792610168455, + "step": 32515 + }, + { + "epoch": 0.03275448938953972, + "grad_norm": 16.327236137582013, + "learning_rate": 3.275386257881272e-05, + "loss": 2.7675, + "mean_token_accuracy": 0.4034482777118683, + "step": 32520 + }, + { + "epoch": 0.032759525442643884, + "grad_norm": 14.418837489538838, + "learning_rate": 3.275889853554379e-05, + "loss": 3.0492, + "mean_token_accuracy": 0.335632187128067, + "step": 32525 + }, + { + "epoch": 0.03276456149574806, + "grad_norm": 19.571838839152466, + "learning_rate": 3.276393449227485e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.44482759237289426, + "step": 32530 + }, + { + "epoch": 0.03276959754885223, + "grad_norm": 16.185364961414447, + "learning_rate": 3.2768970449005906e-05, + "loss": 2.7562, + "mean_token_accuracy": 0.4103448152542114, + "step": 32535 + }, + { + "epoch": 0.032774633601956406, + "grad_norm": 19.374238875990887, + "learning_rate": 3.2774006405736965e-05, + "loss": 2.6599, + "mean_token_accuracy": 0.4103448331356049, + "step": 32540 + }, + { + "epoch": 0.03277966965506058, + "grad_norm": 23.46081446532975, + "learning_rate": 3.277904236246802e-05, + "loss": 2.7965, + "mean_token_accuracy": 0.3620689630508423, + "step": 32545 + }, + { + "epoch": 0.03278470570816475, + "grad_norm": 43.8760298723066, + "learning_rate": 3.2784078319199084e-05, + "loss": 2.8466, + "mean_token_accuracy": 0.3517241358757019, + "step": 32550 + }, + { + "epoch": 0.03278974176126893, + "grad_norm": 14.556031174394118, + "learning_rate": 3.278911427593014e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.4, + "step": 32555 + }, + { + "epoch": 0.032794777814373094, + "grad_norm": 14.165278450746793, + "learning_rate": 3.27941502326612e-05, + "loss": 2.7859, + "mean_token_accuracy": 0.35517241060733795, + "step": 32560 + }, + { + "epoch": 0.03279981386747727, + "grad_norm": 14.130001904812096, + "learning_rate": 3.279918618939226e-05, + "loss": 2.7425, + "mean_token_accuracy": 0.358620685338974, + "step": 32565 + }, + { + "epoch": 0.03280484992058144, + "grad_norm": 13.785734789087467, + "learning_rate": 3.280422214612332e-05, + "loss": 2.2672, + "mean_token_accuracy": 0.44664247035980226, + "step": 32570 + }, + { + "epoch": 0.032809885973685615, + "grad_norm": 17.009895786402716, + "learning_rate": 3.280925810285439e-05, + "loss": 2.5961, + "mean_token_accuracy": 0.4137930989265442, + "step": 32575 + }, + { + "epoch": 0.03281492202678979, + "grad_norm": 15.864609471595381, + "learning_rate": 3.2814294059585446e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.4172413766384125, + "step": 32580 + }, + { + "epoch": 0.03281995807989396, + "grad_norm": 15.276121474227274, + "learning_rate": 3.28193300163165e-05, + "loss": 3.0003, + "mean_token_accuracy": 0.33793103992938994, + "step": 32585 + }, + { + "epoch": 0.032824994132998137, + "grad_norm": 17.341411846904155, + "learning_rate": 3.282436597304756e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.4084694445133209, + "step": 32590 + }, + { + "epoch": 0.0328300301861023, + "grad_norm": 13.761140576288689, + "learning_rate": 3.282940192977862e-05, + "loss": 2.5202, + "mean_token_accuracy": 0.4517241418361664, + "step": 32595 + }, + { + "epoch": 0.03283506623920648, + "grad_norm": 14.170902206409414, + "learning_rate": 3.2834437886509676e-05, + "loss": 2.1625, + "mean_token_accuracy": 0.4896551787853241, + "step": 32600 + }, + { + "epoch": 0.03284010229231065, + "grad_norm": 22.920358752005935, + "learning_rate": 3.283947384324074e-05, + "loss": 2.3717, + "mean_token_accuracy": 0.42758620381355283, + "step": 32605 + }, + { + "epoch": 0.032845138345414825, + "grad_norm": 13.484745472748521, + "learning_rate": 3.28445097999718e-05, + "loss": 2.3805, + "mean_token_accuracy": 0.4448275864124298, + "step": 32610 + }, + { + "epoch": 0.032850174398519, + "grad_norm": 33.812520425550005, + "learning_rate": 3.284954575670286e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.42068964838981626, + "step": 32615 + }, + { + "epoch": 0.03285521045162317, + "grad_norm": 13.50382108218205, + "learning_rate": 3.285458171343392e-05, + "loss": 2.3242, + "mean_token_accuracy": 0.4241379201412201, + "step": 32620 + }, + { + "epoch": 0.032860246504727346, + "grad_norm": 19.01702079389191, + "learning_rate": 3.285961767016498e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.42758620381355283, + "step": 32625 + }, + { + "epoch": 0.03286528255783151, + "grad_norm": 17.360495059037017, + "learning_rate": 3.286465362689604e-05, + "loss": 2.444, + "mean_token_accuracy": 0.39655172526836396, + "step": 32630 + }, + { + "epoch": 0.032870318610935687, + "grad_norm": 13.637231170933239, + "learning_rate": 3.28696895836271e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.44283121824264526, + "step": 32635 + }, + { + "epoch": 0.03287535466403986, + "grad_norm": 24.739942452078214, + "learning_rate": 3.287472554035816e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.39655172228813174, + "step": 32640 + }, + { + "epoch": 0.032880390717144034, + "grad_norm": 16.647408438471913, + "learning_rate": 3.2879761497089217e-05, + "loss": 2.5326, + "mean_token_accuracy": 0.44827585220336913, + "step": 32645 + }, + { + "epoch": 0.03288542677024821, + "grad_norm": 16.424397996555225, + "learning_rate": 3.2884797453820276e-05, + "loss": 2.0253, + "mean_token_accuracy": 0.4793103516101837, + "step": 32650 + }, + { + "epoch": 0.03289046282335238, + "grad_norm": 15.33131228653515, + "learning_rate": 3.288983341055134e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.4034482717514038, + "step": 32655 + }, + { + "epoch": 0.032895498876456555, + "grad_norm": 13.874489198003475, + "learning_rate": 3.28948693672824e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.4172413766384125, + "step": 32660 + }, + { + "epoch": 0.03290053492956072, + "grad_norm": 13.700994907168901, + "learning_rate": 3.289990532401346e-05, + "loss": 2.872, + "mean_token_accuracy": 0.3620689630508423, + "step": 32665 + }, + { + "epoch": 0.032905570982664896, + "grad_norm": 15.233080226532602, + "learning_rate": 3.290494128074452e-05, + "loss": 2.5351, + "mean_token_accuracy": 0.46061705946922304, + "step": 32670 + }, + { + "epoch": 0.03291060703576907, + "grad_norm": 15.122612866384182, + "learning_rate": 3.290997723747558e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.4103448212146759, + "step": 32675 + }, + { + "epoch": 0.032915643088873243, + "grad_norm": 15.5546120122505, + "learning_rate": 3.291501319420663e-05, + "loss": 2.6522, + "mean_token_accuracy": 0.3758620649576187, + "step": 32680 + }, + { + "epoch": 0.03292067914197742, + "grad_norm": 18.58651549224703, + "learning_rate": 3.29200491509377e-05, + "loss": 2.5107, + "mean_token_accuracy": 0.3758620619773865, + "step": 32685 + }, + { + "epoch": 0.03292571519508159, + "grad_norm": 16.77188971342295, + "learning_rate": 3.2925085107668757e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.3655172407627106, + "step": 32690 + }, + { + "epoch": 0.032930751248185765, + "grad_norm": 15.064119263848445, + "learning_rate": 3.2930121064399816e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.32758620381355286, + "step": 32695 + }, + { + "epoch": 0.03293578730128993, + "grad_norm": 15.629164831492382, + "learning_rate": 3.2935157021130875e-05, + "loss": 2.4515, + "mean_token_accuracy": 0.441379314661026, + "step": 32700 + }, + { + "epoch": 0.032940823354394105, + "grad_norm": 19.312839807927848, + "learning_rate": 3.2940192977861934e-05, + "loss": 2.7672, + "mean_token_accuracy": 0.3827586233615875, + "step": 32705 + }, + { + "epoch": 0.03294585940749828, + "grad_norm": 13.74508109066384, + "learning_rate": 3.2945228934593e-05, + "loss": 2.4376, + "mean_token_accuracy": 0.39655172228813174, + "step": 32710 + }, + { + "epoch": 0.03295089546060245, + "grad_norm": 14.264688306605068, + "learning_rate": 3.295026489132406e-05, + "loss": 2.7354, + "mean_token_accuracy": 0.3931034505367279, + "step": 32715 + }, + { + "epoch": 0.03295593151370663, + "grad_norm": 14.819957772864308, + "learning_rate": 3.295530084805511e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4137930989265442, + "step": 32720 + }, + { + "epoch": 0.0329609675668108, + "grad_norm": 13.415476694124704, + "learning_rate": 3.296033680478617e-05, + "loss": 2.5399, + "mean_token_accuracy": 0.45862067937850953, + "step": 32725 + }, + { + "epoch": 0.032966003619914974, + "grad_norm": 17.35436693927641, + "learning_rate": 3.296537276151723e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.3999999940395355, + "step": 32730 + }, + { + "epoch": 0.03297103967301914, + "grad_norm": 14.651602809937497, + "learning_rate": 3.29704087182483e-05, + "loss": 2.2693, + "mean_token_accuracy": 0.41379310488700866, + "step": 32735 + }, + { + "epoch": 0.032976075726123315, + "grad_norm": 14.714131839229292, + "learning_rate": 3.2975444674979356e-05, + "loss": 2.9447, + "mean_token_accuracy": 0.36896551847457887, + "step": 32740 + }, + { + "epoch": 0.03298111177922749, + "grad_norm": 17.26957853112257, + "learning_rate": 3.2980480631710415e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.38275861740112305, + "step": 32745 + }, + { + "epoch": 0.03298614783233166, + "grad_norm": 13.979668246781813, + "learning_rate": 3.2985516588441474e-05, + "loss": 2.4161, + "mean_token_accuracy": 0.44482758045196535, + "step": 32750 + }, + { + "epoch": 0.032991183885435836, + "grad_norm": 13.237033524918994, + "learning_rate": 3.2990552545172534e-05, + "loss": 2.0654, + "mean_token_accuracy": 0.4551724076271057, + "step": 32755 + }, + { + "epoch": 0.03299621993854001, + "grad_norm": 31.87565633030914, + "learning_rate": 3.299558850190359e-05, + "loss": 3.1275, + "mean_token_accuracy": 0.37241379022598264, + "step": 32760 + }, + { + "epoch": 0.033001255991644184, + "grad_norm": 19.30916172166212, + "learning_rate": 3.300062445863465e-05, + "loss": 2.9449, + "mean_token_accuracy": 0.37586206793785093, + "step": 32765 + }, + { + "epoch": 0.03300629204474835, + "grad_norm": 13.601206403633212, + "learning_rate": 3.300566041536571e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.4206896543502808, + "step": 32770 + }, + { + "epoch": 0.033011328097852524, + "grad_norm": 16.141391459853192, + "learning_rate": 3.301069637209677e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.41379311084747317, + "step": 32775 + }, + { + "epoch": 0.0330163641509567, + "grad_norm": 16.526577673594275, + "learning_rate": 3.301573232882783e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.4604355752468109, + "step": 32780 + }, + { + "epoch": 0.03302140020406087, + "grad_norm": 24.643942491537437, + "learning_rate": 3.302076828555889e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.38275861740112305, + "step": 32785 + }, + { + "epoch": 0.033026436257165045, + "grad_norm": 16.47801802227408, + "learning_rate": 3.3025804242289955e-05, + "loss": 2.5916, + "mean_token_accuracy": 0.41875377893447874, + "step": 32790 + }, + { + "epoch": 0.03303147231026922, + "grad_norm": 19.186217286286556, + "learning_rate": 3.3030840199021014e-05, + "loss": 2.322, + "mean_token_accuracy": 0.44482758045196535, + "step": 32795 + }, + { + "epoch": 0.03303650836337339, + "grad_norm": 14.606280679160797, + "learning_rate": 3.3035876155752074e-05, + "loss": 2.8743, + "mean_token_accuracy": 0.39310343861579894, + "step": 32800 + }, + { + "epoch": 0.03304154441647756, + "grad_norm": 26.2061474423102, + "learning_rate": 3.304091211248313e-05, + "loss": 2.9024, + "mean_token_accuracy": 0.4275862067937851, + "step": 32805 + }, + { + "epoch": 0.033046580469581734, + "grad_norm": 14.26947536340395, + "learning_rate": 3.3045948069214185e-05, + "loss": 2.7068, + "mean_token_accuracy": 0.3931034505367279, + "step": 32810 + }, + { + "epoch": 0.03305161652268591, + "grad_norm": 21.934174959490285, + "learning_rate": 3.305098402594525e-05, + "loss": 2.6867, + "mean_token_accuracy": 0.40344828367233276, + "step": 32815 + }, + { + "epoch": 0.03305665257579008, + "grad_norm": 19.71199654781282, + "learning_rate": 3.305601998267631e-05, + "loss": 2.8772, + "mean_token_accuracy": 0.35862069129943847, + "step": 32820 + }, + { + "epoch": 0.033061688628894255, + "grad_norm": 14.184218606520233, + "learning_rate": 3.306105593940737e-05, + "loss": 2.599, + "mean_token_accuracy": 0.43103448748588563, + "step": 32825 + }, + { + "epoch": 0.03306672468199843, + "grad_norm": 21.144547968067535, + "learning_rate": 3.306609189613843e-05, + "loss": 2.922, + "mean_token_accuracy": 0.39310344457626345, + "step": 32830 + }, + { + "epoch": 0.0330717607351026, + "grad_norm": 14.796514973272407, + "learning_rate": 3.307112785286949e-05, + "loss": 2.0287, + "mean_token_accuracy": 0.5034482777118683, + "step": 32835 + }, + { + "epoch": 0.03307679678820677, + "grad_norm": 15.233998379859829, + "learning_rate": 3.3076163809600555e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.4413793087005615, + "step": 32840 + }, + { + "epoch": 0.03308183284131094, + "grad_norm": 16.242673312143374, + "learning_rate": 3.3081199766331614e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.43793103098869324, + "step": 32845 + }, + { + "epoch": 0.03308686889441512, + "grad_norm": 13.5680368270543, + "learning_rate": 3.3086235723062666e-05, + "loss": 1.9275, + "mean_token_accuracy": 0.4344827592372894, + "step": 32850 + }, + { + "epoch": 0.03309190494751929, + "grad_norm": 14.857133629414871, + "learning_rate": 3.3091271679793725e-05, + "loss": 2.6523, + "mean_token_accuracy": 0.38620689511299133, + "step": 32855 + }, + { + "epoch": 0.033096941000623464, + "grad_norm": 19.470643124619098, + "learning_rate": 3.3096307636524785e-05, + "loss": 2.8632, + "mean_token_accuracy": 0.3896551787853241, + "step": 32860 + }, + { + "epoch": 0.03310197705372764, + "grad_norm": 15.572803051558685, + "learning_rate": 3.3101343593255844e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.43414571285247805, + "step": 32865 + }, + { + "epoch": 0.03310701310683181, + "grad_norm": 29.17644828434295, + "learning_rate": 3.310637954998691e-05, + "loss": 2.7512, + "mean_token_accuracy": 0.3999999940395355, + "step": 32870 + }, + { + "epoch": 0.03311204915993598, + "grad_norm": 27.802633706794726, + "learning_rate": 3.311141550671797e-05, + "loss": 2.6099, + "mean_token_accuracy": 0.37586206793785093, + "step": 32875 + }, + { + "epoch": 0.03311708521304015, + "grad_norm": 21.33415304577971, + "learning_rate": 3.311645146344903e-05, + "loss": 2.5907, + "mean_token_accuracy": 0.41034482717514037, + "step": 32880 + }, + { + "epoch": 0.033122121266144326, + "grad_norm": 13.503505508098185, + "learning_rate": 3.312148742018009e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.4878402888774872, + "step": 32885 + }, + { + "epoch": 0.0331271573192485, + "grad_norm": 15.443038331686399, + "learning_rate": 3.312652337691115e-05, + "loss": 3.0062, + "mean_token_accuracy": 0.3448275923728943, + "step": 32890 + }, + { + "epoch": 0.033132193372352674, + "grad_norm": 15.225491335064921, + "learning_rate": 3.3131559333642206e-05, + "loss": 2.9249, + "mean_token_accuracy": 0.36551724672317504, + "step": 32895 + }, + { + "epoch": 0.03313722942545685, + "grad_norm": 19.099019693838706, + "learning_rate": 3.3136595290373266e-05, + "loss": 2.7016, + "mean_token_accuracy": 0.34827585220336915, + "step": 32900 + }, + { + "epoch": 0.03314226547856102, + "grad_norm": 15.563578133207391, + "learning_rate": 3.3141631247104325e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.44137930274009707, + "step": 32905 + }, + { + "epoch": 0.03314730153166519, + "grad_norm": 21.3845044468222, + "learning_rate": 3.3146667203835384e-05, + "loss": 2.7174, + "mean_token_accuracy": 0.38965516686439516, + "step": 32910 + }, + { + "epoch": 0.03315233758476936, + "grad_norm": 13.745378551792657, + "learning_rate": 3.315170316056644e-05, + "loss": 3.0389, + "mean_token_accuracy": 0.33260738253593447, + "step": 32915 + }, + { + "epoch": 0.033157373637873536, + "grad_norm": 13.521368173600575, + "learning_rate": 3.315673911729751e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.44482758045196535, + "step": 32920 + }, + { + "epoch": 0.03316240969097771, + "grad_norm": 12.712620518427432, + "learning_rate": 3.316177507402857e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.4034482777118683, + "step": 32925 + }, + { + "epoch": 0.03316744574408188, + "grad_norm": 17.42330842763052, + "learning_rate": 3.316681103075963e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.358620685338974, + "step": 32930 + }, + { + "epoch": 0.03317248179718606, + "grad_norm": 14.075211790528742, + "learning_rate": 3.317184698749069e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.3793103456497192, + "step": 32935 + }, + { + "epoch": 0.03317751785029023, + "grad_norm": 14.758830134843063, + "learning_rate": 3.3176882944221746e-05, + "loss": 2.3068, + "mean_token_accuracy": 0.4344827592372894, + "step": 32940 + }, + { + "epoch": 0.0331825539033944, + "grad_norm": 17.388416649584013, + "learning_rate": 3.31819189009528e-05, + "loss": 2.1321, + "mean_token_accuracy": 0.4713248610496521, + "step": 32945 + }, + { + "epoch": 0.03318758995649857, + "grad_norm": 16.25904088259404, + "learning_rate": 3.3186954857683865e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.4172413766384125, + "step": 32950 + }, + { + "epoch": 0.033192626009602745, + "grad_norm": 14.010933277384302, + "learning_rate": 3.3191990814414924e-05, + "loss": 2.8915, + "mean_token_accuracy": 0.3758620709180832, + "step": 32955 + }, + { + "epoch": 0.03319766206270692, + "grad_norm": 18.915387018368037, + "learning_rate": 3.319702677114598e-05, + "loss": 2.4008, + "mean_token_accuracy": 0.41034482717514037, + "step": 32960 + }, + { + "epoch": 0.03320269811581109, + "grad_norm": 15.52182085524452, + "learning_rate": 3.320206272787704e-05, + "loss": 2.1514, + "mean_token_accuracy": 0.458620685338974, + "step": 32965 + }, + { + "epoch": 0.033207734168915266, + "grad_norm": 11.995706018739137, + "learning_rate": 3.32070986846081e-05, + "loss": 2.2944, + "mean_token_accuracy": 0.458620685338974, + "step": 32970 + }, + { + "epoch": 0.03321277022201944, + "grad_norm": 18.089039180390436, + "learning_rate": 3.321213464133917e-05, + "loss": 2.791, + "mean_token_accuracy": 0.35172413289546967, + "step": 32975 + }, + { + "epoch": 0.03321780627512361, + "grad_norm": 16.80464875533471, + "learning_rate": 3.321717059807023e-05, + "loss": 2.5939, + "mean_token_accuracy": 0.4432546854019165, + "step": 32980 + }, + { + "epoch": 0.03322284232822778, + "grad_norm": 17.76193985514041, + "learning_rate": 3.322220655480128e-05, + "loss": 2.4068, + "mean_token_accuracy": 0.41034482717514037, + "step": 32985 + }, + { + "epoch": 0.033227878381331954, + "grad_norm": 18.892986445589255, + "learning_rate": 3.322724251153234e-05, + "loss": 2.5575, + "mean_token_accuracy": 0.4068965494632721, + "step": 32990 + }, + { + "epoch": 0.03323291443443613, + "grad_norm": 15.808080648279875, + "learning_rate": 3.32322784682634e-05, + "loss": 2.5703, + "mean_token_accuracy": 0.3931034505367279, + "step": 32995 + }, + { + "epoch": 0.0332379504875403, + "grad_norm": 13.54889599129294, + "learning_rate": 3.3237314424994464e-05, + "loss": 2.9481, + "mean_token_accuracy": 0.36551723480224607, + "step": 33000 + }, + { + "epoch": 0.033242986540644476, + "grad_norm": 12.552144263318109, + "learning_rate": 3.3242350381725523e-05, + "loss": 2.3748, + "mean_token_accuracy": 0.4448275864124298, + "step": 33005 + }, + { + "epoch": 0.03324802259374865, + "grad_norm": 16.066591431698455, + "learning_rate": 3.324738633845658e-05, + "loss": 3.0322, + "mean_token_accuracy": 0.35983061194419863, + "step": 33010 + }, + { + "epoch": 0.033253058646852816, + "grad_norm": 15.490126717108064, + "learning_rate": 3.325242229518764e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.41379310488700866, + "step": 33015 + }, + { + "epoch": 0.03325809469995699, + "grad_norm": 16.990774496164942, + "learning_rate": 3.32574582519187e-05, + "loss": 2.793, + "mean_token_accuracy": 0.42262552976608275, + "step": 33020 + }, + { + "epoch": 0.033263130753061164, + "grad_norm": 15.766862528613073, + "learning_rate": 3.326249420864976e-05, + "loss": 2.7703, + "mean_token_accuracy": 0.39655172228813174, + "step": 33025 + }, + { + "epoch": 0.03326816680616534, + "grad_norm": 16.56155977292928, + "learning_rate": 3.326753016538082e-05, + "loss": 2.7486, + "mean_token_accuracy": 0.4068965494632721, + "step": 33030 + }, + { + "epoch": 0.03327320285926951, + "grad_norm": 14.240409954299082, + "learning_rate": 3.327256612211188e-05, + "loss": 2.2333, + "mean_token_accuracy": 0.4068965494632721, + "step": 33035 + }, + { + "epoch": 0.033278238912373685, + "grad_norm": 16.806186328435505, + "learning_rate": 3.327760207884294e-05, + "loss": 3.1286, + "mean_token_accuracy": 0.3620689630508423, + "step": 33040 + }, + { + "epoch": 0.03328327496547786, + "grad_norm": 15.277516432419226, + "learning_rate": 3.3282638035574e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.40859044194221494, + "step": 33045 + }, + { + "epoch": 0.033288311018582026, + "grad_norm": 14.104097226528332, + "learning_rate": 3.328767399230506e-05, + "loss": 2.529, + "mean_token_accuracy": 0.4206896543502808, + "step": 33050 + }, + { + "epoch": 0.0332933470716862, + "grad_norm": 18.28627326729721, + "learning_rate": 3.329270994903612e-05, + "loss": 2.5753, + "mean_token_accuracy": 0.43793103098869324, + "step": 33055 + }, + { + "epoch": 0.03329838312479037, + "grad_norm": 15.083040234095684, + "learning_rate": 3.329774590576718e-05, + "loss": 2.2659, + "mean_token_accuracy": 0.45172414779663084, + "step": 33060 + }, + { + "epoch": 0.03330341917789455, + "grad_norm": 13.677435807551833, + "learning_rate": 3.330278186249824e-05, + "loss": 2.3495, + "mean_token_accuracy": 0.4206896543502808, + "step": 33065 + }, + { + "epoch": 0.03330845523099872, + "grad_norm": 14.239537944036051, + "learning_rate": 3.33078178192293e-05, + "loss": 2.1205, + "mean_token_accuracy": 0.49522079825401305, + "step": 33070 + }, + { + "epoch": 0.033313491284102895, + "grad_norm": 15.061175451136954, + "learning_rate": 3.331285377596036e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.42413792610168455, + "step": 33075 + }, + { + "epoch": 0.03331852733720707, + "grad_norm": 21.753106809082745, + "learning_rate": 3.331788973269142e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.4655172348022461, + "step": 33080 + }, + { + "epoch": 0.033323563390311235, + "grad_norm": 12.278847152702175, + "learning_rate": 3.332292568942248e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.41379311084747317, + "step": 33085 + }, + { + "epoch": 0.03332859944341541, + "grad_norm": 15.400948490820884, + "learning_rate": 3.332796164615354e-05, + "loss": 2.1137, + "mean_token_accuracy": 0.4642468154430389, + "step": 33090 + }, + { + "epoch": 0.03333363549651958, + "grad_norm": 15.37636375666732, + "learning_rate": 3.33329976028846e-05, + "loss": 2.5247, + "mean_token_accuracy": 0.3827586233615875, + "step": 33095 + }, + { + "epoch": 0.033338671549623757, + "grad_norm": 15.926229810228714, + "learning_rate": 3.3338033559615656e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.3862069010734558, + "step": 33100 + }, + { + "epoch": 0.03334370760272793, + "grad_norm": 15.528552504465917, + "learning_rate": 3.3343069516346715e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.39310344457626345, + "step": 33105 + }, + { + "epoch": 0.033348743655832104, + "grad_norm": 27.259088801723504, + "learning_rate": 3.334810547307778e-05, + "loss": 2.8978, + "mean_token_accuracy": 0.3482758641242981, + "step": 33110 + }, + { + "epoch": 0.03335377970893628, + "grad_norm": 14.96597782987991, + "learning_rate": 3.335314142980884e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.4310344815254211, + "step": 33115 + }, + { + "epoch": 0.033358815762040445, + "grad_norm": 12.354795096235605, + "learning_rate": 3.335817738653989e-05, + "loss": 2.1967, + "mean_token_accuracy": 0.48054186105728147, + "step": 33120 + }, + { + "epoch": 0.03336385181514462, + "grad_norm": 15.789989264706746, + "learning_rate": 3.336321334327095e-05, + "loss": 2.7695, + "mean_token_accuracy": 0.3793103516101837, + "step": 33125 + }, + { + "epoch": 0.03336888786824879, + "grad_norm": 16.110453248553732, + "learning_rate": 3.336824930000201e-05, + "loss": 2.214, + "mean_token_accuracy": 0.4068965494632721, + "step": 33130 + }, + { + "epoch": 0.033373923921352966, + "grad_norm": 18.17640855229856, + "learning_rate": 3.337328525673308e-05, + "loss": 2.6773, + "mean_token_accuracy": 0.36896551251411436, + "step": 33135 + }, + { + "epoch": 0.03337895997445714, + "grad_norm": 14.402292179960261, + "learning_rate": 3.337832121346414e-05, + "loss": 2.5751, + "mean_token_accuracy": 0.3862069010734558, + "step": 33140 + }, + { + "epoch": 0.03338399602756131, + "grad_norm": 18.257369049968627, + "learning_rate": 3.3383357170195196e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.4103448331356049, + "step": 33145 + }, + { + "epoch": 0.03338903208066549, + "grad_norm": 17.06039855843876, + "learning_rate": 3.3388393126926255e-05, + "loss": 2.8489, + "mean_token_accuracy": 0.37241379022598264, + "step": 33150 + }, + { + "epoch": 0.033394068133769654, + "grad_norm": 15.628252019953884, + "learning_rate": 3.3393429083657315e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.40689654350280763, + "step": 33155 + }, + { + "epoch": 0.03339910418687383, + "grad_norm": 12.962348523629823, + "learning_rate": 3.3398465040388374e-05, + "loss": 2.8061, + "mean_token_accuracy": 0.4206896424293518, + "step": 33160 + }, + { + "epoch": 0.033404140239978, + "grad_norm": 17.002289808605582, + "learning_rate": 3.340350099711943e-05, + "loss": 2.9136, + "mean_token_accuracy": 0.3241379290819168, + "step": 33165 + }, + { + "epoch": 0.033409176293082175, + "grad_norm": 16.62418367605977, + "learning_rate": 3.340853695385049e-05, + "loss": 2.6146, + "mean_token_accuracy": 0.4034482777118683, + "step": 33170 + }, + { + "epoch": 0.03341421234618635, + "grad_norm": 12.517377629188488, + "learning_rate": 3.341357291058155e-05, + "loss": 2.2509, + "mean_token_accuracy": 0.44482758045196535, + "step": 33175 + }, + { + "epoch": 0.03341924839929052, + "grad_norm": 23.45826937993233, + "learning_rate": 3.341860886731261e-05, + "loss": 2.8067, + "mean_token_accuracy": 0.3620689630508423, + "step": 33180 + }, + { + "epoch": 0.0334242844523947, + "grad_norm": 14.746604972155437, + "learning_rate": 3.342364482404368e-05, + "loss": 2.5117, + "mean_token_accuracy": 0.3880217790603638, + "step": 33185 + }, + { + "epoch": 0.033429320505498863, + "grad_norm": 17.976318886170816, + "learning_rate": 3.3428680780774736e-05, + "loss": 2.7867, + "mean_token_accuracy": 0.3620689630508423, + "step": 33190 + }, + { + "epoch": 0.03343435655860304, + "grad_norm": 15.689278863022123, + "learning_rate": 3.3433716737505795e-05, + "loss": 3.0675, + "mean_token_accuracy": 0.3551724135875702, + "step": 33195 + }, + { + "epoch": 0.03343939261170721, + "grad_norm": 17.345155392919605, + "learning_rate": 3.3438752694236855e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.3793103456497192, + "step": 33200 + }, + { + "epoch": 0.033444428664811385, + "grad_norm": 15.97772378951692, + "learning_rate": 3.3443788650967914e-05, + "loss": 2.2864, + "mean_token_accuracy": 0.42758620381355283, + "step": 33205 + }, + { + "epoch": 0.03344946471791556, + "grad_norm": 15.058755767480124, + "learning_rate": 3.344882460769897e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.3999999940395355, + "step": 33210 + }, + { + "epoch": 0.03345450077101973, + "grad_norm": 16.07652448115746, + "learning_rate": 3.345386056443003e-05, + "loss": 2.5148, + "mean_token_accuracy": 0.4379310369491577, + "step": 33215 + }, + { + "epoch": 0.033459536824123906, + "grad_norm": 17.095413394026565, + "learning_rate": 3.345889652116109e-05, + "loss": 2.6228, + "mean_token_accuracy": 0.38965516686439516, + "step": 33220 + }, + { + "epoch": 0.03346457287722807, + "grad_norm": 22.11982129443817, + "learning_rate": 3.346393247789215e-05, + "loss": 3.0668, + "mean_token_accuracy": 0.3724137842655182, + "step": 33225 + }, + { + "epoch": 0.03346960893033225, + "grad_norm": 15.029078757370373, + "learning_rate": 3.346896843462321e-05, + "loss": 2.9051, + "mean_token_accuracy": 0.41379311084747317, + "step": 33230 + }, + { + "epoch": 0.03347464498343642, + "grad_norm": 16.191446970451178, + "learning_rate": 3.347400439135427e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.4225045382976532, + "step": 33235 + }, + { + "epoch": 0.033479681036540594, + "grad_norm": 15.824360435998495, + "learning_rate": 3.3479040348085335e-05, + "loss": 2.7137, + "mean_token_accuracy": 0.40344826579093934, + "step": 33240 + }, + { + "epoch": 0.03348471708964477, + "grad_norm": 15.169593277686035, + "learning_rate": 3.3484076304816395e-05, + "loss": 2.7675, + "mean_token_accuracy": 0.38965516686439516, + "step": 33245 + }, + { + "epoch": 0.03348975314274894, + "grad_norm": 15.369812641168073, + "learning_rate": 3.3489112261547454e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.3896551728248596, + "step": 33250 + }, + { + "epoch": 0.033494789195853115, + "grad_norm": 14.809585461815166, + "learning_rate": 3.3494148218278506e-05, + "loss": 2.7273, + "mean_token_accuracy": 0.38620689511299133, + "step": 33255 + }, + { + "epoch": 0.03349982524895728, + "grad_norm": 14.808301768584974, + "learning_rate": 3.3499184175009566e-05, + "loss": 2.7721, + "mean_token_accuracy": 0.3379310339689255, + "step": 33260 + }, + { + "epoch": 0.033504861302061456, + "grad_norm": 13.342922547886442, + "learning_rate": 3.350422013174063e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.43448275327682495, + "step": 33265 + }, + { + "epoch": 0.03350989735516563, + "grad_norm": 16.561012974349115, + "learning_rate": 3.350925608847169e-05, + "loss": 2.7407, + "mean_token_accuracy": 0.36551723480224607, + "step": 33270 + }, + { + "epoch": 0.033514933408269804, + "grad_norm": 13.60932517338548, + "learning_rate": 3.351429204520275e-05, + "loss": 2.5665, + "mean_token_accuracy": 0.3793103456497192, + "step": 33275 + }, + { + "epoch": 0.03351996946137398, + "grad_norm": 16.107395200085517, + "learning_rate": 3.351932800193381e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.42068966031074523, + "step": 33280 + }, + { + "epoch": 0.03352500551447815, + "grad_norm": 16.258837189437735, + "learning_rate": 3.352436395866487e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.44482758045196535, + "step": 33285 + }, + { + "epoch": 0.033530041567582325, + "grad_norm": 12.612442365852178, + "learning_rate": 3.352939991539593e-05, + "loss": 2.5474, + "mean_token_accuracy": 0.4241379380226135, + "step": 33290 + }, + { + "epoch": 0.03353507762068649, + "grad_norm": 15.285059120712274, + "learning_rate": 3.353443587212699e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.4034482777118683, + "step": 33295 + }, + { + "epoch": 0.033540113673790665, + "grad_norm": 17.74235595323413, + "learning_rate": 3.3539471828858046e-05, + "loss": 2.1123, + "mean_token_accuracy": 0.44827585816383364, + "step": 33300 + }, + { + "epoch": 0.03354514972689484, + "grad_norm": 15.476502851511148, + "learning_rate": 3.3544507785589106e-05, + "loss": 2.7773, + "mean_token_accuracy": 0.34827585220336915, + "step": 33305 + }, + { + "epoch": 0.03355018577999901, + "grad_norm": 14.008121096277222, + "learning_rate": 3.3549543742320165e-05, + "loss": 2.7597, + "mean_token_accuracy": 0.3896551728248596, + "step": 33310 + }, + { + "epoch": 0.03355522183310319, + "grad_norm": 16.605581340323674, + "learning_rate": 3.3554579699051224e-05, + "loss": 2.4237, + "mean_token_accuracy": 0.46896551847457885, + "step": 33315 + }, + { + "epoch": 0.03356025788620736, + "grad_norm": 18.331301797876606, + "learning_rate": 3.355961565578229e-05, + "loss": 2.5487, + "mean_token_accuracy": 0.4068965494632721, + "step": 33320 + }, + { + "epoch": 0.033565293939311534, + "grad_norm": 13.460174678949524, + "learning_rate": 3.356465161251335e-05, + "loss": 2.7869, + "mean_token_accuracy": 0.3793103456497192, + "step": 33325 + }, + { + "epoch": 0.0335703299924157, + "grad_norm": 14.242020904590808, + "learning_rate": 3.356968756924441e-05, + "loss": 2.6531, + "mean_token_accuracy": 0.3482758641242981, + "step": 33330 + }, + { + "epoch": 0.033575366045519875, + "grad_norm": 27.131183669236453, + "learning_rate": 3.357472352597547e-05, + "loss": 2.6675, + "mean_token_accuracy": 0.4137930989265442, + "step": 33335 + }, + { + "epoch": 0.03358040209862405, + "grad_norm": 14.409381538665848, + "learning_rate": 3.357975948270653e-05, + "loss": 2.3833, + "mean_token_accuracy": 0.42413793206214906, + "step": 33340 + }, + { + "epoch": 0.03358543815172822, + "grad_norm": 15.404906589490066, + "learning_rate": 3.3584795439437586e-05, + "loss": 2.3128, + "mean_token_accuracy": 0.4344827592372894, + "step": 33345 + }, + { + "epoch": 0.033590474204832396, + "grad_norm": 14.492186166910189, + "learning_rate": 3.3589831396168646e-05, + "loss": 2.5889, + "mean_token_accuracy": 0.42068966031074523, + "step": 33350 + }, + { + "epoch": 0.03359551025793657, + "grad_norm": 16.30733284922257, + "learning_rate": 3.3594867352899705e-05, + "loss": 2.7639, + "mean_token_accuracy": 0.37241379022598264, + "step": 33355 + }, + { + "epoch": 0.033600546311040744, + "grad_norm": 21.649081301761097, + "learning_rate": 3.3599903309630764e-05, + "loss": 2.5192, + "mean_token_accuracy": 0.43103447556495667, + "step": 33360 + }, + { + "epoch": 0.03360558236414491, + "grad_norm": 20.0866325306752, + "learning_rate": 3.3604939266361824e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.3808832406997681, + "step": 33365 + }, + { + "epoch": 0.033610618417249084, + "grad_norm": 13.063299039731445, + "learning_rate": 3.360997522309288e-05, + "loss": 2.3627, + "mean_token_accuracy": 0.4275861978530884, + "step": 33370 + }, + { + "epoch": 0.03361565447035326, + "grad_norm": 20.861546449478343, + "learning_rate": 3.361501117982395e-05, + "loss": 2.1489, + "mean_token_accuracy": 0.44313369393348695, + "step": 33375 + }, + { + "epoch": 0.03362069052345743, + "grad_norm": 20.11652574373875, + "learning_rate": 3.362004713655501e-05, + "loss": 2.5937, + "mean_token_accuracy": 0.36896551847457887, + "step": 33380 + }, + { + "epoch": 0.033625726576561606, + "grad_norm": 13.84720456585736, + "learning_rate": 3.362508309328606e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.43944342732429503, + "step": 33385 + }, + { + "epoch": 0.03363076262966578, + "grad_norm": 22.944019659650202, + "learning_rate": 3.363011905001712e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.43284936547279357, + "step": 33390 + }, + { + "epoch": 0.03363579868276995, + "grad_norm": 15.677187462539987, + "learning_rate": 3.363515500674818e-05, + "loss": 2.2788, + "mean_token_accuracy": 0.44137930274009707, + "step": 33395 + }, + { + "epoch": 0.03364083473587412, + "grad_norm": 26.760753721145292, + "learning_rate": 3.3640190963479245e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.3827586233615875, + "step": 33400 + }, + { + "epoch": 0.033645870788978294, + "grad_norm": 16.055333245280966, + "learning_rate": 3.3645226920210304e-05, + "loss": 2.5818, + "mean_token_accuracy": 0.42909860610961914, + "step": 33405 + }, + { + "epoch": 0.03365090684208247, + "grad_norm": 15.189732950487997, + "learning_rate": 3.3650262876941364e-05, + "loss": 2.651, + "mean_token_accuracy": 0.39655172228813174, + "step": 33410 + }, + { + "epoch": 0.03365594289518664, + "grad_norm": 17.463146441261472, + "learning_rate": 3.365529883367242e-05, + "loss": 2.725, + "mean_token_accuracy": 0.3241379201412201, + "step": 33415 + }, + { + "epoch": 0.033660978948290815, + "grad_norm": 13.955130438432594, + "learning_rate": 3.366033479040348e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.358620685338974, + "step": 33420 + }, + { + "epoch": 0.03366601500139499, + "grad_norm": 17.667702723989937, + "learning_rate": 3.366537074713455e-05, + "loss": 2.648, + "mean_token_accuracy": 0.37241379618644715, + "step": 33425 + }, + { + "epoch": 0.03367105105449916, + "grad_norm": 18.655066953109756, + "learning_rate": 3.36704067038656e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.46551724076271056, + "step": 33430 + }, + { + "epoch": 0.03367608710760333, + "grad_norm": 11.717593924337502, + "learning_rate": 3.367544266059666e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.37586206793785093, + "step": 33435 + }, + { + "epoch": 0.0336811231607075, + "grad_norm": 16.887628886810692, + "learning_rate": 3.368047861732772e-05, + "loss": 2.3334, + "mean_token_accuracy": 0.3724137842655182, + "step": 33440 + }, + { + "epoch": 0.03368615921381168, + "grad_norm": 21.30764226035458, + "learning_rate": 3.368551457405878e-05, + "loss": 2.3051, + "mean_token_accuracy": 0.4448275864124298, + "step": 33445 + }, + { + "epoch": 0.03369119526691585, + "grad_norm": 14.986759248956599, + "learning_rate": 3.369055053078984e-05, + "loss": 2.8938, + "mean_token_accuracy": 0.4034482777118683, + "step": 33450 + }, + { + "epoch": 0.033696231320020024, + "grad_norm": 14.806973340812089, + "learning_rate": 3.3695586487520904e-05, + "loss": 2.4146, + "mean_token_accuracy": 0.4344827473163605, + "step": 33455 + }, + { + "epoch": 0.0337012673731242, + "grad_norm": 14.143464002615191, + "learning_rate": 3.370062244425196e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.42928009629249575, + "step": 33460 + }, + { + "epoch": 0.03370630342622837, + "grad_norm": 18.534554305573916, + "learning_rate": 3.370565840098302e-05, + "loss": 2.556, + "mean_token_accuracy": 0.4068965554237366, + "step": 33465 + }, + { + "epoch": 0.03371133947933254, + "grad_norm": 17.106716346576636, + "learning_rate": 3.371069435771408e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.4344827651977539, + "step": 33470 + }, + { + "epoch": 0.03371637553243671, + "grad_norm": 26.831340390434864, + "learning_rate": 3.371573031444514e-05, + "loss": 2.6057, + "mean_token_accuracy": 0.460591134428978, + "step": 33475 + }, + { + "epoch": 0.033721411585540886, + "grad_norm": 15.727864745970095, + "learning_rate": 3.37207662711762e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.45172414779663084, + "step": 33480 + }, + { + "epoch": 0.03372644763864506, + "grad_norm": 17.164861915220417, + "learning_rate": 3.372580222790726e-05, + "loss": 2.28, + "mean_token_accuracy": 0.4793103516101837, + "step": 33485 + }, + { + "epoch": 0.033731483691749234, + "grad_norm": 20.176578972725714, + "learning_rate": 3.373083818463832e-05, + "loss": 2.8377, + "mean_token_accuracy": 0.41034482717514037, + "step": 33490 + }, + { + "epoch": 0.03373651974485341, + "grad_norm": 15.936296248299778, + "learning_rate": 3.373587414136938e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.41724138259887694, + "step": 33495 + }, + { + "epoch": 0.03374155579795758, + "grad_norm": 15.900457088743305, + "learning_rate": 3.374091009810044e-05, + "loss": 2.4265, + "mean_token_accuracy": 0.3896551787853241, + "step": 33500 + }, + { + "epoch": 0.03374659185106175, + "grad_norm": 33.268559112893676, + "learning_rate": 3.37459460548315e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.4379310369491577, + "step": 33505 + }, + { + "epoch": 0.03375162790416592, + "grad_norm": 13.252180669358543, + "learning_rate": 3.375098201156256e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.41724138855934145, + "step": 33510 + }, + { + "epoch": 0.033756663957270096, + "grad_norm": 15.363289733802445, + "learning_rate": 3.375601796829362e-05, + "loss": 2.5266, + "mean_token_accuracy": 0.41034482717514037, + "step": 33515 + }, + { + "epoch": 0.03376170001037427, + "grad_norm": 16.928419872285037, + "learning_rate": 3.3761053925024674e-05, + "loss": 2.523, + "mean_token_accuracy": 0.37586206793785093, + "step": 33520 + }, + { + "epoch": 0.03376673606347844, + "grad_norm": 15.919952134533274, + "learning_rate": 3.376608988175573e-05, + "loss": 2.4868, + "mean_token_accuracy": 0.4103448212146759, + "step": 33525 + }, + { + "epoch": 0.03377177211658262, + "grad_norm": 12.777495684930544, + "learning_rate": 3.377112583848679e-05, + "loss": 2.2396, + "mean_token_accuracy": 0.4413793087005615, + "step": 33530 + }, + { + "epoch": 0.03377680816968679, + "grad_norm": 17.476087042341497, + "learning_rate": 3.377616179521786e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.3965517282485962, + "step": 33535 + }, + { + "epoch": 0.03378184422279096, + "grad_norm": 14.93544548891769, + "learning_rate": 3.378119775194892e-05, + "loss": 2.7043, + "mean_token_accuracy": 0.40344828367233276, + "step": 33540 + }, + { + "epoch": 0.03378688027589513, + "grad_norm": 15.358212390514966, + "learning_rate": 3.378623370867998e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.4103448212146759, + "step": 33545 + }, + { + "epoch": 0.033791916328999305, + "grad_norm": 16.73696533977745, + "learning_rate": 3.3791269665411036e-05, + "loss": 2.6264, + "mean_token_accuracy": 0.42413793206214906, + "step": 33550 + }, + { + "epoch": 0.03379695238210348, + "grad_norm": 21.232846244977896, + "learning_rate": 3.3796305622142095e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.4137930989265442, + "step": 33555 + }, + { + "epoch": 0.03380198843520765, + "grad_norm": 11.819675051600473, + "learning_rate": 3.3801341578873155e-05, + "loss": 2.1918, + "mean_token_accuracy": 0.4689655125141144, + "step": 33560 + }, + { + "epoch": 0.033807024488311826, + "grad_norm": 15.348407649157389, + "learning_rate": 3.3806377535604214e-05, + "loss": 2.5444, + "mean_token_accuracy": 0.4172413766384125, + "step": 33565 + }, + { + "epoch": 0.033812060541416, + "grad_norm": 17.649377102581006, + "learning_rate": 3.381141349233527e-05, + "loss": 2.6573, + "mean_token_accuracy": 0.3827586203813553, + "step": 33570 + }, + { + "epoch": 0.03381709659452017, + "grad_norm": 21.930410616442188, + "learning_rate": 3.381644944906633e-05, + "loss": 2.789, + "mean_token_accuracy": 0.3879612863063812, + "step": 33575 + }, + { + "epoch": 0.03382213264762434, + "grad_norm": 16.631705264386476, + "learning_rate": 3.382148540579739e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.45578818321228026, + "step": 33580 + }, + { + "epoch": 0.033827168700728515, + "grad_norm": 13.766222915396035, + "learning_rate": 3.382652136252846e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4034482717514038, + "step": 33585 + }, + { + "epoch": 0.03383220475383269, + "grad_norm": 17.02146465759159, + "learning_rate": 3.383155731925952e-05, + "loss": 2.1857, + "mean_token_accuracy": 0.46721112728118896, + "step": 33590 + }, + { + "epoch": 0.03383724080693686, + "grad_norm": 13.118351357620305, + "learning_rate": 3.3836593275990576e-05, + "loss": 2.409, + "mean_token_accuracy": 0.48342408537864684, + "step": 33595 + }, + { + "epoch": 0.033842276860041036, + "grad_norm": 13.427074149318164, + "learning_rate": 3.3841629232721635e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.44482759237289426, + "step": 33600 + }, + { + "epoch": 0.03384731291314521, + "grad_norm": 14.607017075836513, + "learning_rate": 3.3846665189452695e-05, + "loss": 2.431, + "mean_token_accuracy": 0.43448275327682495, + "step": 33605 + }, + { + "epoch": 0.033852348966249377, + "grad_norm": 17.751284243482814, + "learning_rate": 3.3851701146183754e-05, + "loss": 2.7856, + "mean_token_accuracy": 0.3620689630508423, + "step": 33610 + }, + { + "epoch": 0.03385738501935355, + "grad_norm": 15.42033638768905, + "learning_rate": 3.385673710291481e-05, + "loss": 2.44, + "mean_token_accuracy": 0.4034482777118683, + "step": 33615 + }, + { + "epoch": 0.033862421072457724, + "grad_norm": 12.611103891349403, + "learning_rate": 3.386177305964587e-05, + "loss": 2.193, + "mean_token_accuracy": 0.42758620977401735, + "step": 33620 + }, + { + "epoch": 0.0338674571255619, + "grad_norm": 13.668291883708857, + "learning_rate": 3.386680901637693e-05, + "loss": 2.6532, + "mean_token_accuracy": 0.36206896901130675, + "step": 33625 + }, + { + "epoch": 0.03387249317866607, + "grad_norm": 19.470713150273824, + "learning_rate": 3.387184497310799e-05, + "loss": 2.5028, + "mean_token_accuracy": 0.39310344159603117, + "step": 33630 + }, + { + "epoch": 0.033877529231770245, + "grad_norm": 15.18968727435688, + "learning_rate": 3.387688092983905e-05, + "loss": 2.0236, + "mean_token_accuracy": 0.46600985527038574, + "step": 33635 + }, + { + "epoch": 0.03388256528487442, + "grad_norm": 20.707502905794545, + "learning_rate": 3.3881916886570116e-05, + "loss": 3.0023, + "mean_token_accuracy": 0.36551724672317504, + "step": 33640 + }, + { + "epoch": 0.033887601337978586, + "grad_norm": 13.991047991894307, + "learning_rate": 3.3886952843301176e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.3793103456497192, + "step": 33645 + }, + { + "epoch": 0.03389263739108276, + "grad_norm": 12.805463351154948, + "learning_rate": 3.3891988800032235e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.4, + "step": 33650 + }, + { + "epoch": 0.03389767344418693, + "grad_norm": 13.114297774297848, + "learning_rate": 3.389702475676329e-05, + "loss": 2.0424, + "mean_token_accuracy": 0.49655172824859617, + "step": 33655 + }, + { + "epoch": 0.03390270949729111, + "grad_norm": 16.012034323220714, + "learning_rate": 3.3902060713494347e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.42413793206214906, + "step": 33660 + }, + { + "epoch": 0.03390774555039528, + "grad_norm": 11.986109823559781, + "learning_rate": 3.390709667022541e-05, + "loss": 2.301, + "mean_token_accuracy": 0.5103448271751404, + "step": 33665 + }, + { + "epoch": 0.033912781603499455, + "grad_norm": 15.387117777741706, + "learning_rate": 3.391213262695647e-05, + "loss": 2.297, + "mean_token_accuracy": 0.4241379201412201, + "step": 33670 + }, + { + "epoch": 0.03391781765660363, + "grad_norm": 23.704366234664672, + "learning_rate": 3.391716858368753e-05, + "loss": 2.253, + "mean_token_accuracy": 0.4241379380226135, + "step": 33675 + }, + { + "epoch": 0.033922853709707795, + "grad_norm": 17.171684954468404, + "learning_rate": 3.392220454041859e-05, + "loss": 2.2524, + "mean_token_accuracy": 0.39655172228813174, + "step": 33680 + }, + { + "epoch": 0.03392788976281197, + "grad_norm": 15.154667727304059, + "learning_rate": 3.392724049714965e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.41034482717514037, + "step": 33685 + }, + { + "epoch": 0.03393292581591614, + "grad_norm": 16.388028734471675, + "learning_rate": 3.3932276453880716e-05, + "loss": 2.3603, + "mean_token_accuracy": 0.4344827592372894, + "step": 33690 + }, + { + "epoch": 0.03393796186902032, + "grad_norm": 13.796465123146703, + "learning_rate": 3.393731241061177e-05, + "loss": 2.5177, + "mean_token_accuracy": 0.417241370677948, + "step": 33695 + }, + { + "epoch": 0.03394299792212449, + "grad_norm": 15.78286358304764, + "learning_rate": 3.394234836734283e-05, + "loss": 2.1044, + "mean_token_accuracy": 0.4896551728248596, + "step": 33700 + }, + { + "epoch": 0.033948033975228664, + "grad_norm": 12.053792809791872, + "learning_rate": 3.3947384324073887e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.4504537105560303, + "step": 33705 + }, + { + "epoch": 0.03395307002833284, + "grad_norm": 14.741870524935232, + "learning_rate": 3.3952420280804946e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.4482758641242981, + "step": 33710 + }, + { + "epoch": 0.033958106081437005, + "grad_norm": 15.119479877099353, + "learning_rate": 3.3957456237536005e-05, + "loss": 2.6241, + "mean_token_accuracy": 0.3758620619773865, + "step": 33715 + }, + { + "epoch": 0.03396314213454118, + "grad_norm": 15.363678421117212, + "learning_rate": 3.396249219426707e-05, + "loss": 2.5051, + "mean_token_accuracy": 0.42413793206214906, + "step": 33720 + }, + { + "epoch": 0.03396817818764535, + "grad_norm": 15.251076252583264, + "learning_rate": 3.396752815099813e-05, + "loss": 2.2943, + "mean_token_accuracy": 0.4413793087005615, + "step": 33725 + }, + { + "epoch": 0.033973214240749526, + "grad_norm": 15.115383566892831, + "learning_rate": 3.397256410772919e-05, + "loss": 2.6776, + "mean_token_accuracy": 0.4137930989265442, + "step": 33730 + }, + { + "epoch": 0.0339782502938537, + "grad_norm": 16.26241219602374, + "learning_rate": 3.397760006446025e-05, + "loss": 2.8019, + "mean_token_accuracy": 0.37931033968925476, + "step": 33735 + }, + { + "epoch": 0.033983286346957874, + "grad_norm": 15.49627672981393, + "learning_rate": 3.398263602119131e-05, + "loss": 2.7764, + "mean_token_accuracy": 0.3275862097740173, + "step": 33740 + }, + { + "epoch": 0.03398832240006205, + "grad_norm": 16.5952548377308, + "learning_rate": 3.398767197792237e-05, + "loss": 2.891, + "mean_token_accuracy": 0.3793103486299515, + "step": 33745 + }, + { + "epoch": 0.033993358453166214, + "grad_norm": 16.740261800217052, + "learning_rate": 3.399270793465343e-05, + "loss": 2.476, + "mean_token_accuracy": 0.3965517282485962, + "step": 33750 + }, + { + "epoch": 0.03399839450627039, + "grad_norm": 16.411786038470886, + "learning_rate": 3.3997743891384486e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.4275861978530884, + "step": 33755 + }, + { + "epoch": 0.03400343055937456, + "grad_norm": 16.7520349544638, + "learning_rate": 3.4002779848115545e-05, + "loss": 2.6123, + "mean_token_accuracy": 0.43793101906776427, + "step": 33760 + }, + { + "epoch": 0.034008466612478735, + "grad_norm": 16.161476629062996, + "learning_rate": 3.4007815804846604e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.41465517282485964, + "step": 33765 + }, + { + "epoch": 0.03401350266558291, + "grad_norm": 14.674103235717382, + "learning_rate": 3.401285176157767e-05, + "loss": 2.5252, + "mean_token_accuracy": 0.3517241418361664, + "step": 33770 + }, + { + "epoch": 0.03401853871868708, + "grad_norm": 19.462784442511474, + "learning_rate": 3.401788771830873e-05, + "loss": 2.5809, + "mean_token_accuracy": 0.39655172228813174, + "step": 33775 + }, + { + "epoch": 0.03402357477179126, + "grad_norm": 18.508202179021975, + "learning_rate": 3.402292367503979e-05, + "loss": 2.5125, + "mean_token_accuracy": 0.4034482717514038, + "step": 33780 + }, + { + "epoch": 0.034028610824895424, + "grad_norm": 15.539433873030923, + "learning_rate": 3.402795963177085e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.4551724135875702, + "step": 33785 + }, + { + "epoch": 0.0340336468779996, + "grad_norm": 13.968372594787937, + "learning_rate": 3.40329955885019e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.4310344815254211, + "step": 33790 + }, + { + "epoch": 0.03403868293110377, + "grad_norm": 14.07881312574245, + "learning_rate": 3.403803154523296e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.4448275864124298, + "step": 33795 + }, + { + "epoch": 0.034043718984207945, + "grad_norm": 14.110041114670986, + "learning_rate": 3.4043067501964026e-05, + "loss": 2.8194, + "mean_token_accuracy": 0.33103448152542114, + "step": 33800 + }, + { + "epoch": 0.03404875503731212, + "grad_norm": 16.70874688718829, + "learning_rate": 3.4048103458695085e-05, + "loss": 2.8836, + "mean_token_accuracy": 0.3620689630508423, + "step": 33805 + }, + { + "epoch": 0.03405379109041629, + "grad_norm": 18.13144746709522, + "learning_rate": 3.4053139415426144e-05, + "loss": 2.7949, + "mean_token_accuracy": 0.4068965494632721, + "step": 33810 + }, + { + "epoch": 0.034058827143520466, + "grad_norm": 17.5810539581166, + "learning_rate": 3.4058175372157204e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.38620689511299133, + "step": 33815 + }, + { + "epoch": 0.03406386319662463, + "grad_norm": 16.481199209900517, + "learning_rate": 3.406321132888826e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.39310344457626345, + "step": 33820 + }, + { + "epoch": 0.03406889924972881, + "grad_norm": 14.92482879472569, + "learning_rate": 3.406824728561933e-05, + "loss": 2.472, + "mean_token_accuracy": 0.41034482717514037, + "step": 33825 + }, + { + "epoch": 0.03407393530283298, + "grad_norm": 19.171447277158492, + "learning_rate": 3.407328324235038e-05, + "loss": 2.8402, + "mean_token_accuracy": 0.35172412991523744, + "step": 33830 + }, + { + "epoch": 0.034078971355937154, + "grad_norm": 17.398853947876074, + "learning_rate": 3.407831919908144e-05, + "loss": 2.7976, + "mean_token_accuracy": 0.3551724135875702, + "step": 33835 + }, + { + "epoch": 0.03408400740904133, + "grad_norm": 15.27709314653025, + "learning_rate": 3.40833551558125e-05, + "loss": 2.1321, + "mean_token_accuracy": 0.4827586054801941, + "step": 33840 + }, + { + "epoch": 0.0340890434621455, + "grad_norm": 17.055838417665477, + "learning_rate": 3.408839111254356e-05, + "loss": 2.4088, + "mean_token_accuracy": 0.4016333997249603, + "step": 33845 + }, + { + "epoch": 0.034094079515249676, + "grad_norm": 18.134560739578674, + "learning_rate": 3.4093427069274625e-05, + "loss": 2.3745, + "mean_token_accuracy": 0.4379310369491577, + "step": 33850 + }, + { + "epoch": 0.03409911556835384, + "grad_norm": 13.060604675826816, + "learning_rate": 3.4098463026005685e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.4310344815254211, + "step": 33855 + }, + { + "epoch": 0.034104151621458016, + "grad_norm": 16.85645883129703, + "learning_rate": 3.4103498982736744e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.32758620381355286, + "step": 33860 + }, + { + "epoch": 0.03410918767456219, + "grad_norm": 14.268218037654616, + "learning_rate": 3.41085349394678e-05, + "loss": 2.618, + "mean_token_accuracy": 0.40175438225269317, + "step": 33865 + }, + { + "epoch": 0.034114223727666364, + "grad_norm": 15.902701959772452, + "learning_rate": 3.411357089619886e-05, + "loss": 2.6419, + "mean_token_accuracy": 0.3724137932062149, + "step": 33870 + }, + { + "epoch": 0.03411925978077054, + "grad_norm": 19.571085798288763, + "learning_rate": 3.411860685292992e-05, + "loss": 2.6711, + "mean_token_accuracy": 0.4172413766384125, + "step": 33875 + }, + { + "epoch": 0.03412429583387471, + "grad_norm": 15.741795653911232, + "learning_rate": 3.412364280966098e-05, + "loss": 2.756, + "mean_token_accuracy": 0.39655172228813174, + "step": 33880 + }, + { + "epoch": 0.034129331886978885, + "grad_norm": 15.847061074334219, + "learning_rate": 3.412867876639204e-05, + "loss": 2.3558, + "mean_token_accuracy": 0.3758620619773865, + "step": 33885 + }, + { + "epoch": 0.03413436794008305, + "grad_norm": 14.239911078513291, + "learning_rate": 3.41337147231231e-05, + "loss": 2.8119, + "mean_token_accuracy": 0.38275861740112305, + "step": 33890 + }, + { + "epoch": 0.034139403993187226, + "grad_norm": 16.770907420070984, + "learning_rate": 3.413875067985416e-05, + "loss": 2.1466, + "mean_token_accuracy": 0.48275861144065857, + "step": 33895 + }, + { + "epoch": 0.0341444400462914, + "grad_norm": 19.560814373759353, + "learning_rate": 3.414378663658522e-05, + "loss": 2.7282, + "mean_token_accuracy": 0.4000000059604645, + "step": 33900 + }, + { + "epoch": 0.03414947609939557, + "grad_norm": 16.67254395560786, + "learning_rate": 3.4148822593316284e-05, + "loss": 2.4749, + "mean_token_accuracy": 0.40344828367233276, + "step": 33905 + }, + { + "epoch": 0.03415451215249975, + "grad_norm": 16.066624157074884, + "learning_rate": 3.415385855004734e-05, + "loss": 2.4009, + "mean_token_accuracy": 0.37586206793785093, + "step": 33910 + }, + { + "epoch": 0.03415954820560392, + "grad_norm": 16.74701612385621, + "learning_rate": 3.41588945067784e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.3655172407627106, + "step": 33915 + }, + { + "epoch": 0.034164584258708094, + "grad_norm": 14.490524628766174, + "learning_rate": 3.4163930463509455e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.36551723480224607, + "step": 33920 + }, + { + "epoch": 0.03416962031181226, + "grad_norm": 15.803940977840652, + "learning_rate": 3.4168966420240514e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.42220206260681153, + "step": 33925 + }, + { + "epoch": 0.034174656364916435, + "grad_norm": 12.956760557394777, + "learning_rate": 3.417400237697158e-05, + "loss": 2.5856, + "mean_token_accuracy": 0.42607380747795104, + "step": 33930 + }, + { + "epoch": 0.03417969241802061, + "grad_norm": 20.295608864424725, + "learning_rate": 3.417903833370264e-05, + "loss": 2.4486, + "mean_token_accuracy": 0.4034482777118683, + "step": 33935 + }, + { + "epoch": 0.03418472847112478, + "grad_norm": 17.694047148451265, + "learning_rate": 3.41840742904337e-05, + "loss": 2.216, + "mean_token_accuracy": 0.44331517815589905, + "step": 33940 + }, + { + "epoch": 0.034189764524228956, + "grad_norm": 18.025711657512737, + "learning_rate": 3.418911024716476e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.3793103516101837, + "step": 33945 + }, + { + "epoch": 0.03419480057733313, + "grad_norm": 14.12918994842883, + "learning_rate": 3.419414620389582e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.44827585816383364, + "step": 33950 + }, + { + "epoch": 0.034199836630437304, + "grad_norm": 21.605890947503223, + "learning_rate": 3.419918216062688e-05, + "loss": 3.018, + "mean_token_accuracy": 0.358620685338974, + "step": 33955 + }, + { + "epoch": 0.03420487268354147, + "grad_norm": 15.48640859595419, + "learning_rate": 3.420421811735794e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.417241370677948, + "step": 33960 + }, + { + "epoch": 0.034209908736645644, + "grad_norm": 17.66462995738986, + "learning_rate": 3.4209254074088995e-05, + "loss": 2.6326, + "mean_token_accuracy": 0.4, + "step": 33965 + }, + { + "epoch": 0.03421494478974982, + "grad_norm": 15.671109317636722, + "learning_rate": 3.4214290030820054e-05, + "loss": 2.7434, + "mean_token_accuracy": 0.42758620381355283, + "step": 33970 + }, + { + "epoch": 0.03421998084285399, + "grad_norm": 15.748290969434409, + "learning_rate": 3.421932598755111e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.4172413766384125, + "step": 33975 + }, + { + "epoch": 0.034225016895958166, + "grad_norm": 17.913842390778633, + "learning_rate": 3.422436194428217e-05, + "loss": 2.6313, + "mean_token_accuracy": 0.34827586114406583, + "step": 33980 + }, + { + "epoch": 0.03423005294906234, + "grad_norm": 12.879765159251058, + "learning_rate": 3.422939790101324e-05, + "loss": 2.2138, + "mean_token_accuracy": 0.4310344815254211, + "step": 33985 + }, + { + "epoch": 0.03423508900216651, + "grad_norm": 13.883531032037038, + "learning_rate": 3.42344338577443e-05, + "loss": 2.3723, + "mean_token_accuracy": 0.41034482717514037, + "step": 33990 + }, + { + "epoch": 0.03424012505527068, + "grad_norm": 17.25046061325281, + "learning_rate": 3.423946981447536e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.3517241418361664, + "step": 33995 + }, + { + "epoch": 0.034245161108374854, + "grad_norm": 15.484828877820906, + "learning_rate": 3.4244505771206416e-05, + "loss": 2.3892, + "mean_token_accuracy": 0.49848759174346924, + "step": 34000 + }, + { + "epoch": 0.03425019716147903, + "grad_norm": 17.74652430956716, + "learning_rate": 3.4249541727937476e-05, + "loss": 2.978, + "mean_token_accuracy": 0.4068965494632721, + "step": 34005 + }, + { + "epoch": 0.0342552332145832, + "grad_norm": 14.218214831625037, + "learning_rate": 3.4254577684668535e-05, + "loss": 2.1288, + "mean_token_accuracy": 0.47241379618644713, + "step": 34010 + }, + { + "epoch": 0.034260269267687375, + "grad_norm": 15.711550929757276, + "learning_rate": 3.4259613641399594e-05, + "loss": 2.5334, + "mean_token_accuracy": 0.41034482717514037, + "step": 34015 + }, + { + "epoch": 0.03426530532079155, + "grad_norm": 19.41064982787321, + "learning_rate": 3.4264649598130653e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.4068965494632721, + "step": 34020 + }, + { + "epoch": 0.03427034137389572, + "grad_norm": 15.481279276030175, + "learning_rate": 3.426968555486171e-05, + "loss": 2.6408, + "mean_token_accuracy": 0.4034482717514038, + "step": 34025 + }, + { + "epoch": 0.03427537742699989, + "grad_norm": 15.25332052565089, + "learning_rate": 3.427472151159277e-05, + "loss": 2.4959, + "mean_token_accuracy": 0.4241379201412201, + "step": 34030 + }, + { + "epoch": 0.03428041348010406, + "grad_norm": 15.780460568069019, + "learning_rate": 3.427975746832384e-05, + "loss": 2.701, + "mean_token_accuracy": 0.37586207389831544, + "step": 34035 + }, + { + "epoch": 0.03428544953320824, + "grad_norm": 13.895803619002006, + "learning_rate": 3.42847934250549e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.3982456147670746, + "step": 34040 + }, + { + "epoch": 0.03429048558631241, + "grad_norm": 17.866298224704064, + "learning_rate": 3.4289829381785956e-05, + "loss": 2.5913, + "mean_token_accuracy": 0.3946158528327942, + "step": 34045 + }, + { + "epoch": 0.034295521639416585, + "grad_norm": 13.520626196344217, + "learning_rate": 3.4294865338517016e-05, + "loss": 2.6836, + "mean_token_accuracy": 0.41034482717514037, + "step": 34050 + }, + { + "epoch": 0.03430055769252076, + "grad_norm": 15.010275448022409, + "learning_rate": 3.429990129524807e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.4607380449771881, + "step": 34055 + }, + { + "epoch": 0.03430559374562493, + "grad_norm": 13.074079396081535, + "learning_rate": 3.430493725197913e-05, + "loss": 2.7411, + "mean_token_accuracy": 0.41034482717514037, + "step": 34060 + }, + { + "epoch": 0.0343106297987291, + "grad_norm": 15.350148405145282, + "learning_rate": 3.4309973208710193e-05, + "loss": 2.6671, + "mean_token_accuracy": 0.3482758581638336, + "step": 34065 + }, + { + "epoch": 0.03431566585183327, + "grad_norm": 16.88966018226705, + "learning_rate": 3.431500916544125e-05, + "loss": 2.4739, + "mean_token_accuracy": 0.3931034505367279, + "step": 34070 + }, + { + "epoch": 0.034320701904937446, + "grad_norm": 16.44123458618899, + "learning_rate": 3.432004512217231e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.47428917288780215, + "step": 34075 + }, + { + "epoch": 0.03432573795804162, + "grad_norm": 16.164549064725446, + "learning_rate": 3.432508107890337e-05, + "loss": 2.634, + "mean_token_accuracy": 0.41724138259887694, + "step": 34080 + }, + { + "epoch": 0.034330774011145794, + "grad_norm": 13.849358439860044, + "learning_rate": 3.433011703563443e-05, + "loss": 2.2913, + "mean_token_accuracy": 0.4034482717514038, + "step": 34085 + }, + { + "epoch": 0.03433581006424997, + "grad_norm": 17.608906695227233, + "learning_rate": 3.4335152992365496e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.4502117395401001, + "step": 34090 + }, + { + "epoch": 0.03434084611735414, + "grad_norm": 16.19896311227932, + "learning_rate": 3.434018894909655e-05, + "loss": 2.7337, + "mean_token_accuracy": 0.3793103456497192, + "step": 34095 + }, + { + "epoch": 0.03434588217045831, + "grad_norm": 13.979191227233933, + "learning_rate": 3.434522490582761e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.37241379618644715, + "step": 34100 + }, + { + "epoch": 0.03435091822356248, + "grad_norm": 16.60670509611578, + "learning_rate": 3.435026086255867e-05, + "loss": 2.7363, + "mean_token_accuracy": 0.38275861740112305, + "step": 34105 + }, + { + "epoch": 0.034355954276666656, + "grad_norm": 19.15045486615811, + "learning_rate": 3.435529681928973e-05, + "loss": 2.2882, + "mean_token_accuracy": 0.4255898416042328, + "step": 34110 + }, + { + "epoch": 0.03436099032977083, + "grad_norm": 17.265214814749402, + "learning_rate": 3.436033277602079e-05, + "loss": 2.9429, + "mean_token_accuracy": 0.35172413289546967, + "step": 34115 + }, + { + "epoch": 0.034366026382875, + "grad_norm": 22.96428250450586, + "learning_rate": 3.436536873275185e-05, + "loss": 2.5968, + "mean_token_accuracy": 0.36896551847457887, + "step": 34120 + }, + { + "epoch": 0.03437106243597918, + "grad_norm": 19.78464647723333, + "learning_rate": 3.437040468948291e-05, + "loss": 2.4, + "mean_token_accuracy": 0.43103448748588563, + "step": 34125 + }, + { + "epoch": 0.03437609848908335, + "grad_norm": 16.326230907138537, + "learning_rate": 3.437544064621397e-05, + "loss": 2.7009, + "mean_token_accuracy": 0.3965517163276672, + "step": 34130 + }, + { + "epoch": 0.03438113454218752, + "grad_norm": 17.20404656147665, + "learning_rate": 3.438047660294503e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.37241379618644715, + "step": 34135 + }, + { + "epoch": 0.03438617059529169, + "grad_norm": 13.716251025793976, + "learning_rate": 3.438551255967609e-05, + "loss": 2.4028, + "mean_token_accuracy": 0.44482759237289426, + "step": 34140 + }, + { + "epoch": 0.034391206648395865, + "grad_norm": 14.239828525533264, + "learning_rate": 3.439054851640715e-05, + "loss": 2.2131, + "mean_token_accuracy": 0.4620689690113068, + "step": 34145 + }, + { + "epoch": 0.03439624270150004, + "grad_norm": 15.613954636993007, + "learning_rate": 3.439558447313821e-05, + "loss": 2.3042, + "mean_token_accuracy": 0.4517241418361664, + "step": 34150 + }, + { + "epoch": 0.03440127875460421, + "grad_norm": 13.50958402722549, + "learning_rate": 3.440062042986927e-05, + "loss": 2.9191, + "mean_token_accuracy": 0.3551724135875702, + "step": 34155 + }, + { + "epoch": 0.03440631480770839, + "grad_norm": 12.328542434053968, + "learning_rate": 3.4405656386600326e-05, + "loss": 2.2921, + "mean_token_accuracy": 0.4068965494632721, + "step": 34160 + }, + { + "epoch": 0.03441135086081256, + "grad_norm": 12.962218718339766, + "learning_rate": 3.4410692343331385e-05, + "loss": 2.5559, + "mean_token_accuracy": 0.3965517163276672, + "step": 34165 + }, + { + "epoch": 0.03441638691391673, + "grad_norm": 17.469871074217824, + "learning_rate": 3.441572830006245e-05, + "loss": 2.6591, + "mean_token_accuracy": 0.38965516686439516, + "step": 34170 + }, + { + "epoch": 0.0344214229670209, + "grad_norm": 16.45827663379989, + "learning_rate": 3.442076425679351e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.36896550953388213, + "step": 34175 + }, + { + "epoch": 0.034426459020125075, + "grad_norm": 13.565074547159714, + "learning_rate": 3.442580021352457e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.453841495513916, + "step": 34180 + }, + { + "epoch": 0.03443149507322925, + "grad_norm": 16.661736936377068, + "learning_rate": 3.443083617025563e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.41379310488700866, + "step": 34185 + }, + { + "epoch": 0.03443653112633342, + "grad_norm": 18.075275490504943, + "learning_rate": 3.443587212698668e-05, + "loss": 2.7532, + "mean_token_accuracy": 0.358620697259903, + "step": 34190 + }, + { + "epoch": 0.034441567179437596, + "grad_norm": 10.993543614001391, + "learning_rate": 3.444090808371775e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.417241370677948, + "step": 34195 + }, + { + "epoch": 0.03444660323254177, + "grad_norm": 19.57029793317187, + "learning_rate": 3.444594404044881e-05, + "loss": 2.5461, + "mean_token_accuracy": 0.4310344815254211, + "step": 34200 + }, + { + "epoch": 0.03445163928564594, + "grad_norm": 16.210564188101415, + "learning_rate": 3.4450979997179866e-05, + "loss": 2.9766, + "mean_token_accuracy": 0.37586206793785093, + "step": 34205 + }, + { + "epoch": 0.03445667533875011, + "grad_norm": 16.876624524013554, + "learning_rate": 3.4456015953910925e-05, + "loss": 2.9907, + "mean_token_accuracy": 0.34482758343219755, + "step": 34210 + }, + { + "epoch": 0.034461711391854284, + "grad_norm": 17.280675149070028, + "learning_rate": 3.4461051910641985e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.4172413766384125, + "step": 34215 + }, + { + "epoch": 0.03446674744495846, + "grad_norm": 13.820146239724414, + "learning_rate": 3.4466087867373044e-05, + "loss": 2.9072, + "mean_token_accuracy": 0.36896551251411436, + "step": 34220 + }, + { + "epoch": 0.03447178349806263, + "grad_norm": 14.630898746427063, + "learning_rate": 3.447112382410411e-05, + "loss": 2.1276, + "mean_token_accuracy": 0.46896551847457885, + "step": 34225 + }, + { + "epoch": 0.034476819551166805, + "grad_norm": 19.257986269732434, + "learning_rate": 3.447615978083516e-05, + "loss": 2.868, + "mean_token_accuracy": 0.3774954676628113, + "step": 34230 + }, + { + "epoch": 0.03448185560427098, + "grad_norm": 16.86330253904032, + "learning_rate": 3.448119573756622e-05, + "loss": 2.3603, + "mean_token_accuracy": 0.4434966742992401, + "step": 34235 + }, + { + "epoch": 0.034486891657375146, + "grad_norm": 16.775332881608698, + "learning_rate": 3.448623169429728e-05, + "loss": 2.7784, + "mean_token_accuracy": 0.36551723480224607, + "step": 34240 + }, + { + "epoch": 0.03449192771047932, + "grad_norm": 19.983191333105317, + "learning_rate": 3.449126765102834e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4294010937213898, + "step": 34245 + }, + { + "epoch": 0.034496963763583494, + "grad_norm": 18.120551284710473, + "learning_rate": 3.4496303607759406e-05, + "loss": 2.7196, + "mean_token_accuracy": 0.3827586233615875, + "step": 34250 + }, + { + "epoch": 0.03450199981668767, + "grad_norm": 13.013714454753044, + "learning_rate": 3.4501339564490465e-05, + "loss": 2.734, + "mean_token_accuracy": 0.36896551847457887, + "step": 34255 + }, + { + "epoch": 0.03450703586979184, + "grad_norm": 14.127661099280106, + "learning_rate": 3.4506375521221525e-05, + "loss": 3.0503, + "mean_token_accuracy": 0.3448275804519653, + "step": 34260 + }, + { + "epoch": 0.034512071922896015, + "grad_norm": 15.841282738405992, + "learning_rate": 3.4511411477952584e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.37586206793785093, + "step": 34265 + }, + { + "epoch": 0.03451710797600019, + "grad_norm": 15.76792098143515, + "learning_rate": 3.451644743468364e-05, + "loss": 2.2326, + "mean_token_accuracy": 0.4068965494632721, + "step": 34270 + }, + { + "epoch": 0.034522144029104355, + "grad_norm": 20.504027869138753, + "learning_rate": 3.45214833914147e-05, + "loss": 2.5074, + "mean_token_accuracy": 0.4103448212146759, + "step": 34275 + }, + { + "epoch": 0.03452718008220853, + "grad_norm": 17.15812435440335, + "learning_rate": 3.452651934814576e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.38657635152339936, + "step": 34280 + }, + { + "epoch": 0.0345322161353127, + "grad_norm": 14.954269071815194, + "learning_rate": 3.453155530487682e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.4, + "step": 34285 + }, + { + "epoch": 0.03453725218841688, + "grad_norm": 14.705818735346607, + "learning_rate": 3.453659126160788e-05, + "loss": 2.2945, + "mean_token_accuracy": 0.4730295598506927, + "step": 34290 + }, + { + "epoch": 0.03454228824152105, + "grad_norm": 15.915321480315606, + "learning_rate": 3.454162721833894e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.42758620381355283, + "step": 34295 + }, + { + "epoch": 0.034547324294625224, + "grad_norm": 15.907780587149457, + "learning_rate": 3.454666317507e-05, + "loss": 2.6603, + "mean_token_accuracy": 0.40689654350280763, + "step": 34300 + }, + { + "epoch": 0.0345523603477294, + "grad_norm": 19.0911661738502, + "learning_rate": 3.4551699131801065e-05, + "loss": 2.3442, + "mean_token_accuracy": 0.4332123339176178, + "step": 34305 + }, + { + "epoch": 0.034557396400833565, + "grad_norm": 15.393271498408437, + "learning_rate": 3.4556735088532124e-05, + "loss": 2.8674, + "mean_token_accuracy": 0.3482758581638336, + "step": 34310 + }, + { + "epoch": 0.03456243245393774, + "grad_norm": 17.61540307012319, + "learning_rate": 3.456177104526318e-05, + "loss": 2.7625, + "mean_token_accuracy": 0.3793103456497192, + "step": 34315 + }, + { + "epoch": 0.03456746850704191, + "grad_norm": 16.679858098993158, + "learning_rate": 3.456680700199424e-05, + "loss": 3.1326, + "mean_token_accuracy": 0.36896551251411436, + "step": 34320 + }, + { + "epoch": 0.034572504560146086, + "grad_norm": 14.909692828759107, + "learning_rate": 3.4571842958725295e-05, + "loss": 2.7697, + "mean_token_accuracy": 0.35862069129943847, + "step": 34325 + }, + { + "epoch": 0.03457754061325026, + "grad_norm": 15.445383712202576, + "learning_rate": 3.457687891545636e-05, + "loss": 2.3825, + "mean_token_accuracy": 0.4068965554237366, + "step": 34330 + }, + { + "epoch": 0.034582576666354434, + "grad_norm": 13.930526148131197, + "learning_rate": 3.458191487218742e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.3999999940395355, + "step": 34335 + }, + { + "epoch": 0.03458761271945861, + "grad_norm": 13.47338949714128, + "learning_rate": 3.458695082891848e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.4482758641242981, + "step": 34340 + }, + { + "epoch": 0.034592648772562774, + "grad_norm": 14.514223383734237, + "learning_rate": 3.459198678564954e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.36551723480224607, + "step": 34345 + }, + { + "epoch": 0.03459768482566695, + "grad_norm": 14.954291704610474, + "learning_rate": 3.45970227423806e-05, + "loss": 2.1633, + "mean_token_accuracy": 0.4571687877178192, + "step": 34350 + }, + { + "epoch": 0.03460272087877112, + "grad_norm": 19.142415862922576, + "learning_rate": 3.4602058699111664e-05, + "loss": 2.6894, + "mean_token_accuracy": 0.4068965494632721, + "step": 34355 + }, + { + "epoch": 0.034607756931875296, + "grad_norm": 16.734867101987728, + "learning_rate": 3.460709465584272e-05, + "loss": 2.5994, + "mean_token_accuracy": 0.34827586114406583, + "step": 34360 + }, + { + "epoch": 0.03461279298497947, + "grad_norm": 19.547124104281572, + "learning_rate": 3.4612130612573776e-05, + "loss": 2.5978, + "mean_token_accuracy": 0.4, + "step": 34365 + }, + { + "epoch": 0.03461782903808364, + "grad_norm": 16.724276952531444, + "learning_rate": 3.4617166569304835e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.42413793206214906, + "step": 34370 + }, + { + "epoch": 0.03462286509118782, + "grad_norm": 15.59306249631678, + "learning_rate": 3.4622202526035894e-05, + "loss": 2.5072, + "mean_token_accuracy": 0.358620685338974, + "step": 34375 + }, + { + "epoch": 0.034627901144291984, + "grad_norm": 17.215472544268252, + "learning_rate": 3.462723848276696e-05, + "loss": 2.7569, + "mean_token_accuracy": 0.3655172437429428, + "step": 34380 + }, + { + "epoch": 0.03463293719739616, + "grad_norm": 17.4131492149336, + "learning_rate": 3.463227443949802e-05, + "loss": 2.6924, + "mean_token_accuracy": 0.36551723778247835, + "step": 34385 + }, + { + "epoch": 0.03463797325050033, + "grad_norm": 16.497925299239114, + "learning_rate": 3.463731039622908e-05, + "loss": 2.4546, + "mean_token_accuracy": 0.43103448748588563, + "step": 34390 + }, + { + "epoch": 0.034643009303604505, + "grad_norm": 13.008799173253962, + "learning_rate": 3.464234635296014e-05, + "loss": 2.4753, + "mean_token_accuracy": 0.3862068921327591, + "step": 34395 + }, + { + "epoch": 0.03464804535670868, + "grad_norm": 13.883261062337679, + "learning_rate": 3.46473823096912e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.4348457396030426, + "step": 34400 + }, + { + "epoch": 0.03465308140981285, + "grad_norm": 14.406086326141299, + "learning_rate": 3.4652418266422257e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.4344827592372894, + "step": 34405 + }, + { + "epoch": 0.034658117462917026, + "grad_norm": 15.925165431045617, + "learning_rate": 3.4657454223153316e-05, + "loss": 2.4159, + "mean_token_accuracy": 0.4034482777118683, + "step": 34410 + }, + { + "epoch": 0.03466315351602119, + "grad_norm": 14.472032126744317, + "learning_rate": 3.4662490179884375e-05, + "loss": 2.736, + "mean_token_accuracy": 0.3793103456497192, + "step": 34415 + }, + { + "epoch": 0.03466818956912537, + "grad_norm": 13.570949773151083, + "learning_rate": 3.4667526136615434e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.42758620977401735, + "step": 34420 + }, + { + "epoch": 0.03467322562222954, + "grad_norm": 13.788320520304786, + "learning_rate": 3.4672562093346494e-05, + "loss": 2.8515, + "mean_token_accuracy": 0.3896551728248596, + "step": 34425 + }, + { + "epoch": 0.034678261675333714, + "grad_norm": 10.106099830762087, + "learning_rate": 3.467759805007755e-05, + "loss": 2.1511, + "mean_token_accuracy": 0.42906404137611387, + "step": 34430 + }, + { + "epoch": 0.03468329772843789, + "grad_norm": 16.513953567921412, + "learning_rate": 3.468263400680862e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.42413792610168455, + "step": 34435 + }, + { + "epoch": 0.03468833378154206, + "grad_norm": 24.889540564844093, + "learning_rate": 3.468766996353968e-05, + "loss": 2.6426, + "mean_token_accuracy": 0.3862069010734558, + "step": 34440 + }, + { + "epoch": 0.034693369834646236, + "grad_norm": 20.250625369503194, + "learning_rate": 3.469270592027074e-05, + "loss": 2.6332, + "mean_token_accuracy": 0.41034482717514037, + "step": 34445 + }, + { + "epoch": 0.0346984058877504, + "grad_norm": 14.376829350731116, + "learning_rate": 3.4697741877001797e-05, + "loss": 2.7246, + "mean_token_accuracy": 0.36896551251411436, + "step": 34450 + }, + { + "epoch": 0.034703441940854576, + "grad_norm": 15.168909115388761, + "learning_rate": 3.470277783373285e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.35862069129943847, + "step": 34455 + }, + { + "epoch": 0.03470847799395875, + "grad_norm": 13.919353414976857, + "learning_rate": 3.4707813790463915e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.42068964838981626, + "step": 34460 + }, + { + "epoch": 0.034713514047062924, + "grad_norm": 16.886317898872004, + "learning_rate": 3.4712849747194974e-05, + "loss": 2.215, + "mean_token_accuracy": 0.42758620977401735, + "step": 34465 + }, + { + "epoch": 0.0347185501001671, + "grad_norm": 13.6069989621035, + "learning_rate": 3.4717885703926034e-05, + "loss": 2.5228, + "mean_token_accuracy": 0.4206896543502808, + "step": 34470 + }, + { + "epoch": 0.03472358615327127, + "grad_norm": 13.256595197787686, + "learning_rate": 3.472292166065709e-05, + "loss": 2.5866, + "mean_token_accuracy": 0.417241370677948, + "step": 34475 + }, + { + "epoch": 0.034728622206375445, + "grad_norm": 16.6983576774615, + "learning_rate": 3.472795761738815e-05, + "loss": 2.7045, + "mean_token_accuracy": 0.37931033968925476, + "step": 34480 + }, + { + "epoch": 0.03473365825947961, + "grad_norm": 14.284368255583972, + "learning_rate": 3.473299357411921e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.39310344457626345, + "step": 34485 + }, + { + "epoch": 0.034738694312583786, + "grad_norm": 18.71597346867723, + "learning_rate": 3.473802953085028e-05, + "loss": 2.8608, + "mean_token_accuracy": 0.3724137872457504, + "step": 34490 + }, + { + "epoch": 0.03474373036568796, + "grad_norm": 18.778149507658444, + "learning_rate": 3.474306548758134e-05, + "loss": 2.6009, + "mean_token_accuracy": 0.3482758581638336, + "step": 34495 + }, + { + "epoch": 0.03474876641879213, + "grad_norm": 18.738622273976702, + "learning_rate": 3.474810144431239e-05, + "loss": 2.4465, + "mean_token_accuracy": 0.37586206793785093, + "step": 34500 + }, + { + "epoch": 0.03475380247189631, + "grad_norm": 15.907321198418725, + "learning_rate": 3.475313740104345e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.4172413766384125, + "step": 34505 + }, + { + "epoch": 0.03475883852500048, + "grad_norm": 12.991090430191488, + "learning_rate": 3.475817335777451e-05, + "loss": 2.5997, + "mean_token_accuracy": 0.41034482717514037, + "step": 34510 + }, + { + "epoch": 0.034763874578104655, + "grad_norm": 16.413449966488265, + "learning_rate": 3.4763209314505574e-05, + "loss": 2.6908, + "mean_token_accuracy": 0.3931034505367279, + "step": 34515 + }, + { + "epoch": 0.03476891063120882, + "grad_norm": 17.890136832056744, + "learning_rate": 3.476824527123663e-05, + "loss": 2.7192, + "mean_token_accuracy": 0.37586206793785093, + "step": 34520 + }, + { + "epoch": 0.034773946684312995, + "grad_norm": 14.8285509932761, + "learning_rate": 3.477328122796769e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.4156079888343811, + "step": 34525 + }, + { + "epoch": 0.03477898273741717, + "grad_norm": 13.982453703419386, + "learning_rate": 3.477831718469875e-05, + "loss": 2.1683, + "mean_token_accuracy": 0.46551724076271056, + "step": 34530 + }, + { + "epoch": 0.03478401879052134, + "grad_norm": 13.888591547304285, + "learning_rate": 3.478335314142981e-05, + "loss": 2.3196, + "mean_token_accuracy": 0.45172414779663084, + "step": 34535 + }, + { + "epoch": 0.034789054843625516, + "grad_norm": 15.662586782728326, + "learning_rate": 3.478838909816087e-05, + "loss": 2.5761, + "mean_token_accuracy": 0.4379310369491577, + "step": 34540 + }, + { + "epoch": 0.03479409089672969, + "grad_norm": 21.941888073634633, + "learning_rate": 3.479342505489193e-05, + "loss": 2.6451, + "mean_token_accuracy": 0.42758620977401735, + "step": 34545 + }, + { + "epoch": 0.034799126949833864, + "grad_norm": 14.658407588655695, + "learning_rate": 3.479846101162299e-05, + "loss": 2.5867, + "mean_token_accuracy": 0.46551724076271056, + "step": 34550 + }, + { + "epoch": 0.03480416300293803, + "grad_norm": 14.58469748594041, + "learning_rate": 3.480349696835405e-05, + "loss": 2.6283, + "mean_token_accuracy": 0.41724138259887694, + "step": 34555 + }, + { + "epoch": 0.034809199056042205, + "grad_norm": 14.613418263710889, + "learning_rate": 3.480853292508511e-05, + "loss": 2.705, + "mean_token_accuracy": 0.3689655214548111, + "step": 34560 + }, + { + "epoch": 0.03481423510914638, + "grad_norm": 16.590421665815768, + "learning_rate": 3.4813568881816166e-05, + "loss": 2.3128, + "mean_token_accuracy": 0.43103447556495667, + "step": 34565 + }, + { + "epoch": 0.03481927116225055, + "grad_norm": 17.28619264389333, + "learning_rate": 3.481860483854723e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.39310343861579894, + "step": 34570 + }, + { + "epoch": 0.034824307215354726, + "grad_norm": 14.897544557703494, + "learning_rate": 3.482364079527829e-05, + "loss": 3.0206, + "mean_token_accuracy": 0.3849364757537842, + "step": 34575 + }, + { + "epoch": 0.0348293432684589, + "grad_norm": 29.75429863225802, + "learning_rate": 3.482867675200935e-05, + "loss": 2.957, + "mean_token_accuracy": 0.3482758581638336, + "step": 34580 + }, + { + "epoch": 0.03483437932156307, + "grad_norm": 12.777747861410624, + "learning_rate": 3.483371270874041e-05, + "loss": 2.4241, + "mean_token_accuracy": 0.47586206793785096, + "step": 34585 + }, + { + "epoch": 0.03483941537466724, + "grad_norm": 19.06062655928057, + "learning_rate": 3.483874866547146e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.4068965494632721, + "step": 34590 + }, + { + "epoch": 0.034844451427771414, + "grad_norm": 14.880322484069044, + "learning_rate": 3.484378462220253e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.3655172407627106, + "step": 34595 + }, + { + "epoch": 0.03484948748087559, + "grad_norm": 18.2088328779293, + "learning_rate": 3.484882057893359e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.4172413766384125, + "step": 34600 + }, + { + "epoch": 0.03485452353397976, + "grad_norm": 17.248621778399723, + "learning_rate": 3.485385653566465e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.4206896543502808, + "step": 34605 + }, + { + "epoch": 0.034859559587083935, + "grad_norm": 14.471448228827747, + "learning_rate": 3.4858892492395706e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.40344828367233276, + "step": 34610 + }, + { + "epoch": 0.03486459564018811, + "grad_norm": 20.867260112306372, + "learning_rate": 3.4863928449126765e-05, + "loss": 2.8706, + "mean_token_accuracy": 0.37241379022598264, + "step": 34615 + }, + { + "epoch": 0.03486963169329228, + "grad_norm": 15.25798863459342, + "learning_rate": 3.486896440585783e-05, + "loss": 2.6889, + "mean_token_accuracy": 0.4, + "step": 34620 + }, + { + "epoch": 0.03487466774639645, + "grad_norm": 14.659442558739642, + "learning_rate": 3.487400036258889e-05, + "loss": 2.6817, + "mean_token_accuracy": 0.38965516686439516, + "step": 34625 + }, + { + "epoch": 0.03487970379950062, + "grad_norm": 15.560786355028446, + "learning_rate": 3.487903631931994e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.4448275864124298, + "step": 34630 + }, + { + "epoch": 0.0348847398526048, + "grad_norm": 13.752442286182815, + "learning_rate": 3.4884072276051e-05, + "loss": 2.6445, + "mean_token_accuracy": 0.4448275864124298, + "step": 34635 + }, + { + "epoch": 0.03488977590570897, + "grad_norm": 15.973012628786337, + "learning_rate": 3.488910823278206e-05, + "loss": 2.7024, + "mean_token_accuracy": 0.38275861740112305, + "step": 34640 + }, + { + "epoch": 0.034894811958813145, + "grad_norm": 13.909753541908835, + "learning_rate": 3.489414418951312e-05, + "loss": 2.222, + "mean_token_accuracy": 0.4413793087005615, + "step": 34645 + }, + { + "epoch": 0.03489984801191732, + "grad_norm": 13.897338376267177, + "learning_rate": 3.489918014624419e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.38275861740112305, + "step": 34650 + }, + { + "epoch": 0.03490488406502149, + "grad_norm": 19.41903406560839, + "learning_rate": 3.4904216102975246e-05, + "loss": 2.4505, + "mean_token_accuracy": 0.39655172228813174, + "step": 34655 + }, + { + "epoch": 0.03490992011812566, + "grad_norm": 20.658938205485974, + "learning_rate": 3.4909252059706306e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.42413793206214906, + "step": 34660 + }, + { + "epoch": 0.03491495617122983, + "grad_norm": 17.096764502387536, + "learning_rate": 3.4914288016437365e-05, + "loss": 2.7408, + "mean_token_accuracy": 0.41724138259887694, + "step": 34665 + }, + { + "epoch": 0.03491999222433401, + "grad_norm": 15.998200593539226, + "learning_rate": 3.4919323973168424e-05, + "loss": 2.5951, + "mean_token_accuracy": 0.3689655214548111, + "step": 34670 + }, + { + "epoch": 0.03492502827743818, + "grad_norm": 13.182363512643608, + "learning_rate": 3.492435992989948e-05, + "loss": 2.6719, + "mean_token_accuracy": 0.3931034505367279, + "step": 34675 + }, + { + "epoch": 0.034930064330542354, + "grad_norm": 16.873519779320084, + "learning_rate": 3.492939588663054e-05, + "loss": 2.606, + "mean_token_accuracy": 0.4256503343582153, + "step": 34680 + }, + { + "epoch": 0.03493510038364653, + "grad_norm": 22.750745524513793, + "learning_rate": 3.49344318433616e-05, + "loss": 2.6748, + "mean_token_accuracy": 0.3827586233615875, + "step": 34685 + }, + { + "epoch": 0.0349401364367507, + "grad_norm": 24.074574644803008, + "learning_rate": 3.493946780009266e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.3793103456497192, + "step": 34690 + }, + { + "epoch": 0.03494517248985487, + "grad_norm": 16.49382448473932, + "learning_rate": 3.494450375682372e-05, + "loss": 2.4331, + "mean_token_accuracy": 0.4206896543502808, + "step": 34695 + }, + { + "epoch": 0.03495020854295904, + "grad_norm": 15.617841678912903, + "learning_rate": 3.4949539713554786e-05, + "loss": 2.652, + "mean_token_accuracy": 0.39310344457626345, + "step": 34700 + }, + { + "epoch": 0.034955244596063216, + "grad_norm": 12.513930646665889, + "learning_rate": 3.4954575670285846e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.39655172228813174, + "step": 34705 + }, + { + "epoch": 0.03496028064916739, + "grad_norm": 20.413935791725716, + "learning_rate": 3.4959611627016905e-05, + "loss": 3.162, + "mean_token_accuracy": 0.3310344874858856, + "step": 34710 + }, + { + "epoch": 0.034965316702271564, + "grad_norm": 17.720985862913732, + "learning_rate": 3.4964647583747964e-05, + "loss": 2.3776, + "mean_token_accuracy": 0.4517241358757019, + "step": 34715 + }, + { + "epoch": 0.03497035275537574, + "grad_norm": 14.364985700435367, + "learning_rate": 3.496968354047902e-05, + "loss": 2.6911, + "mean_token_accuracy": 0.36896551251411436, + "step": 34720 + }, + { + "epoch": 0.03497538880847991, + "grad_norm": 16.598430228124727, + "learning_rate": 3.4974719497210076e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.4137930989265442, + "step": 34725 + }, + { + "epoch": 0.03498042486158408, + "grad_norm": 14.3754381441058, + "learning_rate": 3.497975545394114e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.4689655125141144, + "step": 34730 + }, + { + "epoch": 0.03498546091468825, + "grad_norm": 13.646445526115247, + "learning_rate": 3.49847914106722e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.358620697259903, + "step": 34735 + }, + { + "epoch": 0.034990496967792425, + "grad_norm": 17.308369773458114, + "learning_rate": 3.498982736740326e-05, + "loss": 2.7151, + "mean_token_accuracy": 0.4255898416042328, + "step": 34740 + }, + { + "epoch": 0.0349955330208966, + "grad_norm": 15.08407907683832, + "learning_rate": 3.499486332413432e-05, + "loss": 2.221, + "mean_token_accuracy": 0.44827587008476255, + "step": 34745 + }, + { + "epoch": 0.03500056907400077, + "grad_norm": 15.91755307247523, + "learning_rate": 3.499989928086538e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.441379314661026, + "step": 34750 + }, + { + "epoch": 0.03500560512710495, + "grad_norm": 18.520511646762156, + "learning_rate": 3.5004935237596445e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.4362371563911438, + "step": 34755 + }, + { + "epoch": 0.03501064118020912, + "grad_norm": 18.711625749401353, + "learning_rate": 3.5009971194327504e-05, + "loss": 2.8273, + "mean_token_accuracy": 0.4, + "step": 34760 + }, + { + "epoch": 0.03501567723331329, + "grad_norm": 19.699896135275633, + "learning_rate": 3.501500715105856e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.4068965554237366, + "step": 34765 + }, + { + "epoch": 0.03502071328641746, + "grad_norm": 15.6250548239187, + "learning_rate": 3.5020043107789616e-05, + "loss": 3.0684, + "mean_token_accuracy": 0.3655172407627106, + "step": 34770 + }, + { + "epoch": 0.035025749339521635, + "grad_norm": 14.204615655023026, + "learning_rate": 3.5025079064520675e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.39655173420906065, + "step": 34775 + }, + { + "epoch": 0.03503078539262581, + "grad_norm": 12.056004899335262, + "learning_rate": 3.503011502125174e-05, + "loss": 2.7502, + "mean_token_accuracy": 0.3758620619773865, + "step": 34780 + }, + { + "epoch": 0.03503582144572998, + "grad_norm": 12.752678351786745, + "learning_rate": 3.50351509779828e-05, + "loss": 2.6846, + "mean_token_accuracy": 0.37241379022598264, + "step": 34785 + }, + { + "epoch": 0.035040857498834156, + "grad_norm": 13.187990599671174, + "learning_rate": 3.504018693471386e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.4379310429096222, + "step": 34790 + }, + { + "epoch": 0.03504589355193833, + "grad_norm": 15.3408474812755, + "learning_rate": 3.504522289144492e-05, + "loss": 2.5433, + "mean_token_accuracy": 0.4034482777118683, + "step": 34795 + }, + { + "epoch": 0.0350509296050425, + "grad_norm": 13.461076495520665, + "learning_rate": 3.505025884817598e-05, + "loss": 2.5766, + "mean_token_accuracy": 0.3862068891525269, + "step": 34800 + }, + { + "epoch": 0.03505596565814667, + "grad_norm": 13.60327195472584, + "learning_rate": 3.505529480490704e-05, + "loss": 2.196, + "mean_token_accuracy": 0.4517241358757019, + "step": 34805 + }, + { + "epoch": 0.035061001711250844, + "grad_norm": 14.874605979394588, + "learning_rate": 3.50603307616381e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.4448275864124298, + "step": 34810 + }, + { + "epoch": 0.03506603776435502, + "grad_norm": 18.440378456217555, + "learning_rate": 3.5065366718369156e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.43448275327682495, + "step": 34815 + }, + { + "epoch": 0.03507107381745919, + "grad_norm": 13.8060401283756, + "learning_rate": 3.5070402675100215e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.3827586203813553, + "step": 34820 + }, + { + "epoch": 0.035076109870563366, + "grad_norm": 13.902660422625276, + "learning_rate": 3.5075438631831274e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.4, + "step": 34825 + }, + { + "epoch": 0.03508114592366753, + "grad_norm": 14.433850535863852, + "learning_rate": 3.5080474588562334e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.4241379380226135, + "step": 34830 + }, + { + "epoch": 0.035086181976771706, + "grad_norm": 15.626413363696457, + "learning_rate": 3.50855105452934e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.39655172228813174, + "step": 34835 + }, + { + "epoch": 0.03509121802987588, + "grad_norm": 14.906736564406229, + "learning_rate": 3.509054650202446e-05, + "loss": 2.289, + "mean_token_accuracy": 0.38620689511299133, + "step": 34840 + }, + { + "epoch": 0.035096254082980054, + "grad_norm": 16.290965766880536, + "learning_rate": 3.509558245875552e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.39655172228813174, + "step": 34845 + }, + { + "epoch": 0.03510129013608423, + "grad_norm": 14.443031955984175, + "learning_rate": 3.510061841548658e-05, + "loss": 2.3989, + "mean_token_accuracy": 0.3896551728248596, + "step": 34850 + }, + { + "epoch": 0.0351063261891884, + "grad_norm": 16.647428570156077, + "learning_rate": 3.510565437221764e-05, + "loss": 2.1607, + "mean_token_accuracy": 0.4620689570903778, + "step": 34855 + }, + { + "epoch": 0.035111362242292575, + "grad_norm": 20.02522224677574, + "learning_rate": 3.5110690328948696e-05, + "loss": 2.7904, + "mean_token_accuracy": 0.34137930572032926, + "step": 34860 + }, + { + "epoch": 0.03511639829539674, + "grad_norm": 13.799344289018075, + "learning_rate": 3.5115726285679755e-05, + "loss": 2.7876, + "mean_token_accuracy": 0.4172413766384125, + "step": 34865 + }, + { + "epoch": 0.035121434348500916, + "grad_norm": 21.597128027383445, + "learning_rate": 3.5120762242410814e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.4103448331356049, + "step": 34870 + }, + { + "epoch": 0.03512647040160509, + "grad_norm": 14.612752673523506, + "learning_rate": 3.5125798199141874e-05, + "loss": 2.2843, + "mean_token_accuracy": 0.41034482717514037, + "step": 34875 + }, + { + "epoch": 0.03513150645470926, + "grad_norm": 17.293872710631845, + "learning_rate": 3.513083415587293e-05, + "loss": 2.7855, + "mean_token_accuracy": 0.3655172407627106, + "step": 34880 + }, + { + "epoch": 0.03513654250781344, + "grad_norm": 13.49845232578023, + "learning_rate": 3.5135870112604e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.46551724076271056, + "step": 34885 + }, + { + "epoch": 0.03514157856091761, + "grad_norm": 20.155548607584144, + "learning_rate": 3.514090606933506e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.39310344457626345, + "step": 34890 + }, + { + "epoch": 0.035146614614021784, + "grad_norm": 19.714118561070414, + "learning_rate": 3.514594202606612e-05, + "loss": 2.3158, + "mean_token_accuracy": 0.4396249234676361, + "step": 34895 + }, + { + "epoch": 0.03515165066712595, + "grad_norm": 16.650278951914416, + "learning_rate": 3.515097798279717e-05, + "loss": 2.9605, + "mean_token_accuracy": 0.3448275804519653, + "step": 34900 + }, + { + "epoch": 0.035156686720230125, + "grad_norm": 13.648467445858389, + "learning_rate": 3.515601393952823e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.4103448301553726, + "step": 34905 + }, + { + "epoch": 0.0351617227733343, + "grad_norm": 14.467172873578784, + "learning_rate": 3.516104989625929e-05, + "loss": 2.3961, + "mean_token_accuracy": 0.4448275864124298, + "step": 34910 + }, + { + "epoch": 0.03516675882643847, + "grad_norm": 17.040386607396222, + "learning_rate": 3.5166085852990355e-05, + "loss": 2.6022, + "mean_token_accuracy": 0.39310344457626345, + "step": 34915 + }, + { + "epoch": 0.035171794879542646, + "grad_norm": 15.238184139793118, + "learning_rate": 3.5171121809721414e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.4068965554237366, + "step": 34920 + }, + { + "epoch": 0.03517683093264682, + "grad_norm": 16.129892640271386, + "learning_rate": 3.517615776645247e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.46896551847457885, + "step": 34925 + }, + { + "epoch": 0.035181866985750994, + "grad_norm": 11.842092803308393, + "learning_rate": 3.518119372318353e-05, + "loss": 3.1096, + "mean_token_accuracy": 0.39655172228813174, + "step": 34930 + }, + { + "epoch": 0.03518690303885516, + "grad_norm": 15.73663039863801, + "learning_rate": 3.518622967991459e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.4310344815254211, + "step": 34935 + }, + { + "epoch": 0.035191939091959334, + "grad_norm": 14.288817774901093, + "learning_rate": 3.519126563664565e-05, + "loss": 2.2833, + "mean_token_accuracy": 0.42413793206214906, + "step": 34940 + }, + { + "epoch": 0.03519697514506351, + "grad_norm": 14.469159702042143, + "learning_rate": 3.519630159337671e-05, + "loss": 2.655, + "mean_token_accuracy": 0.38965516686439516, + "step": 34945 + }, + { + "epoch": 0.03520201119816768, + "grad_norm": 18.09243880448009, + "learning_rate": 3.520133755010777e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.424137943983078, + "step": 34950 + }, + { + "epoch": 0.035207047251271856, + "grad_norm": 16.350324481796303, + "learning_rate": 3.520637350683883e-05, + "loss": 2.7826, + "mean_token_accuracy": 0.4034482777118683, + "step": 34955 + }, + { + "epoch": 0.03521208330437603, + "grad_norm": 17.700875394178883, + "learning_rate": 3.521140946356989e-05, + "loss": 3.0237, + "mean_token_accuracy": 0.39655171930789945, + "step": 34960 + }, + { + "epoch": 0.0352171193574802, + "grad_norm": 12.57047174349908, + "learning_rate": 3.5216445420300954e-05, + "loss": 2.6225, + "mean_token_accuracy": 0.41379310488700866, + "step": 34965 + }, + { + "epoch": 0.03522215541058437, + "grad_norm": 13.7806588895724, + "learning_rate": 3.522148137703201e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.42546883821487425, + "step": 34970 + }, + { + "epoch": 0.035227191463688544, + "grad_norm": 15.1779189230023, + "learning_rate": 3.522651733376307e-05, + "loss": 2.4448, + "mean_token_accuracy": 0.42068966031074523, + "step": 34975 + }, + { + "epoch": 0.03523222751679272, + "grad_norm": 16.945905737412804, + "learning_rate": 3.523155329049413e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.41724138259887694, + "step": 34980 + }, + { + "epoch": 0.03523726356989689, + "grad_norm": 15.156172836972884, + "learning_rate": 3.523658924722519e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.44482759237289426, + "step": 34985 + }, + { + "epoch": 0.035242299623001065, + "grad_norm": 15.150424613832756, + "learning_rate": 3.524162520395624e-05, + "loss": 2.5776, + "mean_token_accuracy": 0.4, + "step": 34990 + }, + { + "epoch": 0.03524733567610524, + "grad_norm": 13.181538344579817, + "learning_rate": 3.524666116068731e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.38620689511299133, + "step": 34995 + }, + { + "epoch": 0.03525237172920941, + "grad_norm": 14.545185599738529, + "learning_rate": 3.525169711741837e-05, + "loss": 2.4735, + "mean_token_accuracy": 0.4292801022529602, + "step": 35000 + }, + { + "epoch": 0.03525740778231358, + "grad_norm": 15.321789890941083, + "learning_rate": 3.525673307414943e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.4103448212146759, + "step": 35005 + }, + { + "epoch": 0.03526244383541775, + "grad_norm": 11.889020495085513, + "learning_rate": 3.526176903088049e-05, + "loss": 2.1852, + "mean_token_accuracy": 0.43793103098869324, + "step": 35010 + }, + { + "epoch": 0.03526747988852193, + "grad_norm": 12.509413225799198, + "learning_rate": 3.5266804987611546e-05, + "loss": 2.5663, + "mean_token_accuracy": 0.41379310488700866, + "step": 35015 + }, + { + "epoch": 0.0352725159416261, + "grad_norm": 17.708691800389598, + "learning_rate": 3.527184094434261e-05, + "loss": 2.9172, + "mean_token_accuracy": 0.3252268582582474, + "step": 35020 + }, + { + "epoch": 0.035277551994730275, + "grad_norm": 15.933069921103863, + "learning_rate": 3.527687690107367e-05, + "loss": 2.6766, + "mean_token_accuracy": 0.39806412160396576, + "step": 35025 + }, + { + "epoch": 0.03528258804783445, + "grad_norm": 13.240292165154473, + "learning_rate": 3.528191285780473e-05, + "loss": 2.354, + "mean_token_accuracy": 0.41724137365818026, + "step": 35030 + }, + { + "epoch": 0.03528762410093862, + "grad_norm": 14.053145028200777, + "learning_rate": 3.528694881453578e-05, + "loss": 2.2076, + "mean_token_accuracy": 0.4482758641242981, + "step": 35035 + }, + { + "epoch": 0.03529266015404279, + "grad_norm": 23.347071559969965, + "learning_rate": 3.529198477126684e-05, + "loss": 3.0553, + "mean_token_accuracy": 0.35172412991523744, + "step": 35040 + }, + { + "epoch": 0.03529769620714696, + "grad_norm": 15.750660845861152, + "learning_rate": 3.529702072799791e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.33103448152542114, + "step": 35045 + }, + { + "epoch": 0.035302732260251136, + "grad_norm": 17.313315106769004, + "learning_rate": 3.530205668472897e-05, + "loss": 2.7395, + "mean_token_accuracy": 0.3344827562570572, + "step": 35050 + }, + { + "epoch": 0.03530776831335531, + "grad_norm": 14.856269865060684, + "learning_rate": 3.530709264146003e-05, + "loss": 2.405, + "mean_token_accuracy": 0.43793103098869324, + "step": 35055 + }, + { + "epoch": 0.035312804366459484, + "grad_norm": 19.8915450732745, + "learning_rate": 3.5312128598191086e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.42068964838981626, + "step": 35060 + }, + { + "epoch": 0.03531784041956366, + "grad_norm": 18.253268800770304, + "learning_rate": 3.5317164554922146e-05, + "loss": 2.8963, + "mean_token_accuracy": 0.33793103098869326, + "step": 35065 + }, + { + "epoch": 0.03532287647266783, + "grad_norm": 15.799863142352251, + "learning_rate": 3.5322200511653205e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.42413792610168455, + "step": 35070 + }, + { + "epoch": 0.035327912525772, + "grad_norm": 12.160868357161332, + "learning_rate": 3.5327236468384264e-05, + "loss": 2.3084, + "mean_token_accuracy": 0.46551724076271056, + "step": 35075 + }, + { + "epoch": 0.03533294857887617, + "grad_norm": 16.148838752236983, + "learning_rate": 3.5332272425115323e-05, + "loss": 2.1896, + "mean_token_accuracy": 0.4206896543502808, + "step": 35080 + }, + { + "epoch": 0.035337984631980346, + "grad_norm": 15.60361082735607, + "learning_rate": 3.533730838184638e-05, + "loss": 2.775, + "mean_token_accuracy": 0.358620685338974, + "step": 35085 + }, + { + "epoch": 0.03534302068508452, + "grad_norm": 15.185444271243693, + "learning_rate": 3.534234433857744e-05, + "loss": 2.2067, + "mean_token_accuracy": 0.5, + "step": 35090 + }, + { + "epoch": 0.03534805673818869, + "grad_norm": 17.283525375310372, + "learning_rate": 3.53473802953085e-05, + "loss": 2.8294, + "mean_token_accuracy": 0.39655172228813174, + "step": 35095 + }, + { + "epoch": 0.03535309279129287, + "grad_norm": 13.296519780993291, + "learning_rate": 3.535241625203957e-05, + "loss": 2.9436, + "mean_token_accuracy": 0.39655172228813174, + "step": 35100 + }, + { + "epoch": 0.03535812884439704, + "grad_norm": 14.186059246960328, + "learning_rate": 3.5357452208770626e-05, + "loss": 2.7963, + "mean_token_accuracy": 0.3793103456497192, + "step": 35105 + }, + { + "epoch": 0.03536316489750121, + "grad_norm": 14.568811557967948, + "learning_rate": 3.5362488165501686e-05, + "loss": 2.6306, + "mean_token_accuracy": 0.41379310488700866, + "step": 35110 + }, + { + "epoch": 0.03536820095060538, + "grad_norm": 14.723056368932362, + "learning_rate": 3.5367524122232745e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.41034482717514037, + "step": 35115 + }, + { + "epoch": 0.035373237003709555, + "grad_norm": 12.836738126795844, + "learning_rate": 3.5372560078963804e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.44827585816383364, + "step": 35120 + }, + { + "epoch": 0.03537827305681373, + "grad_norm": 15.460265128156244, + "learning_rate": 3.5377596035694863e-05, + "loss": 2.3418, + "mean_token_accuracy": 0.43103447556495667, + "step": 35125 + }, + { + "epoch": 0.0353833091099179, + "grad_norm": 17.166241826087635, + "learning_rate": 3.538263199242592e-05, + "loss": 2.6302, + "mean_token_accuracy": 0.39764065146446226, + "step": 35130 + }, + { + "epoch": 0.03538834516302208, + "grad_norm": 13.816056938682053, + "learning_rate": 3.538766794915698e-05, + "loss": 2.3703, + "mean_token_accuracy": 0.4413793087005615, + "step": 35135 + }, + { + "epoch": 0.03539338121612625, + "grad_norm": 18.217601662834483, + "learning_rate": 3.539270390588804e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.42413793206214906, + "step": 35140 + }, + { + "epoch": 0.03539841726923042, + "grad_norm": 19.779283108787794, + "learning_rate": 3.53977398626191e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.42068966031074523, + "step": 35145 + }, + { + "epoch": 0.03540345332233459, + "grad_norm": 17.37813870869696, + "learning_rate": 3.540277581935016e-05, + "loss": 2.3088, + "mean_token_accuracy": 0.46206897497177124, + "step": 35150 + }, + { + "epoch": 0.035408489375438765, + "grad_norm": 23.35273640600897, + "learning_rate": 3.5407811776081226e-05, + "loss": 3.076, + "mean_token_accuracy": 0.3379310369491577, + "step": 35155 + }, + { + "epoch": 0.03541352542854294, + "grad_norm": 15.224470085218696, + "learning_rate": 3.5412847732812285e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.4068965554237366, + "step": 35160 + }, + { + "epoch": 0.03541856148164711, + "grad_norm": 14.824299459795785, + "learning_rate": 3.541788368954334e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.47991530895233153, + "step": 35165 + }, + { + "epoch": 0.035423597534751286, + "grad_norm": 13.590798711527745, + "learning_rate": 3.54229196462744e-05, + "loss": 2.5867, + "mean_token_accuracy": 0.3896551728248596, + "step": 35170 + }, + { + "epoch": 0.03542863358785546, + "grad_norm": 16.92664178823344, + "learning_rate": 3.5427955603005456e-05, + "loss": 2.844, + "mean_token_accuracy": 0.36896551847457887, + "step": 35175 + }, + { + "epoch": 0.03543366964095963, + "grad_norm": 15.888231070112926, + "learning_rate": 3.543299155973652e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.34827585220336915, + "step": 35180 + }, + { + "epoch": 0.0354387056940638, + "grad_norm": 16.887654805323134, + "learning_rate": 3.543802751646758e-05, + "loss": 2.5761, + "mean_token_accuracy": 0.37241379618644715, + "step": 35185 + }, + { + "epoch": 0.035443741747167974, + "grad_norm": 11.686358097261753, + "learning_rate": 3.544306347319864e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.4206896543502808, + "step": 35190 + }, + { + "epoch": 0.03544877780027215, + "grad_norm": 15.46425221067465, + "learning_rate": 3.54480994299297e-05, + "loss": 2.5449, + "mean_token_accuracy": 0.3758620619773865, + "step": 35195 + }, + { + "epoch": 0.03545381385337632, + "grad_norm": 13.720126020308937, + "learning_rate": 3.545313538666076e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.3931034505367279, + "step": 35200 + }, + { + "epoch": 0.035458849906480495, + "grad_norm": 14.070012196115618, + "learning_rate": 3.545817134339182e-05, + "loss": 2.3758, + "mean_token_accuracy": 0.41929824352264405, + "step": 35205 + }, + { + "epoch": 0.03546388595958467, + "grad_norm": 16.72302871606665, + "learning_rate": 3.546320730012288e-05, + "loss": 2.6234, + "mean_token_accuracy": 0.3965517282485962, + "step": 35210 + }, + { + "epoch": 0.035468922012688836, + "grad_norm": 14.700955514728081, + "learning_rate": 3.546824325685394e-05, + "loss": 2.4101, + "mean_token_accuracy": 0.46394434571266174, + "step": 35215 + }, + { + "epoch": 0.03547395806579301, + "grad_norm": 13.019223114973107, + "learning_rate": 3.5473279213584996e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4896551728248596, + "step": 35220 + }, + { + "epoch": 0.035478994118897184, + "grad_norm": 17.608092260275658, + "learning_rate": 3.5478315170316055e-05, + "loss": 3.2916, + "mean_token_accuracy": 0.29999999403953553, + "step": 35225 + }, + { + "epoch": 0.03548403017200136, + "grad_norm": 15.38901232639779, + "learning_rate": 3.548335112704712e-05, + "loss": 2.2428, + "mean_token_accuracy": 0.47931033968925474, + "step": 35230 + }, + { + "epoch": 0.03548906622510553, + "grad_norm": 13.538477744787103, + "learning_rate": 3.548838708377818e-05, + "loss": 2.1607, + "mean_token_accuracy": 0.482758629322052, + "step": 35235 + }, + { + "epoch": 0.035494102278209705, + "grad_norm": 11.764179887355546, + "learning_rate": 3.549342304050924e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.44482759237289426, + "step": 35240 + }, + { + "epoch": 0.03549913833131388, + "grad_norm": 16.424727649302117, + "learning_rate": 3.54984589972403e-05, + "loss": 2.6642, + "mean_token_accuracy": 0.3758620709180832, + "step": 35245 + }, + { + "epoch": 0.035504174384418045, + "grad_norm": 13.469770967869868, + "learning_rate": 3.550349495397136e-05, + "loss": 2.0929, + "mean_token_accuracy": 0.5344827592372894, + "step": 35250 + }, + { + "epoch": 0.03550921043752222, + "grad_norm": 24.100702334841046, + "learning_rate": 3.550853091070242e-05, + "loss": 2.7332, + "mean_token_accuracy": 0.39449485540390017, + "step": 35255 + }, + { + "epoch": 0.03551424649062639, + "grad_norm": 14.76522486670013, + "learning_rate": 3.551356686743348e-05, + "loss": 2.6202, + "mean_token_accuracy": 0.358620685338974, + "step": 35260 + }, + { + "epoch": 0.03551928254373057, + "grad_norm": 13.964797598028799, + "learning_rate": 3.5518602824164536e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.42928009629249575, + "step": 35265 + }, + { + "epoch": 0.03552431859683474, + "grad_norm": 15.861037821874467, + "learning_rate": 3.5523638780895595e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.45517241954803467, + "step": 35270 + }, + { + "epoch": 0.035529354649938914, + "grad_norm": 12.198624988777901, + "learning_rate": 3.5528674737626655e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.4672111332416534, + "step": 35275 + }, + { + "epoch": 0.03553439070304309, + "grad_norm": 14.407577776710445, + "learning_rate": 3.5533710694357714e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.4172413766384125, + "step": 35280 + }, + { + "epoch": 0.035539426756147255, + "grad_norm": 11.017067355657874, + "learning_rate": 3.553874665108878e-05, + "loss": 2.4584, + "mean_token_accuracy": 0.43793103098869324, + "step": 35285 + }, + { + "epoch": 0.03554446280925143, + "grad_norm": 21.129298779075278, + "learning_rate": 3.554378260781984e-05, + "loss": 2.404, + "mean_token_accuracy": 0.4034482717514038, + "step": 35290 + }, + { + "epoch": 0.0355494988623556, + "grad_norm": 13.060785112936866, + "learning_rate": 3.55488185645509e-05, + "loss": 2.7439, + "mean_token_accuracy": 0.37380520105361936, + "step": 35295 + }, + { + "epoch": 0.035554534915459776, + "grad_norm": 13.979399790958748, + "learning_rate": 3.555385452128195e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.4206896543502808, + "step": 35300 + }, + { + "epoch": 0.03555957096856395, + "grad_norm": 15.099301494686506, + "learning_rate": 3.555889047801301e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.3896551787853241, + "step": 35305 + }, + { + "epoch": 0.035564607021668124, + "grad_norm": 15.470220849342617, + "learning_rate": 3.5563926434744076e-05, + "loss": 2.511, + "mean_token_accuracy": 0.3965517282485962, + "step": 35310 + }, + { + "epoch": 0.0355696430747723, + "grad_norm": 17.419314818900872, + "learning_rate": 3.5568962391475135e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.43103447556495667, + "step": 35315 + }, + { + "epoch": 0.035574679127876464, + "grad_norm": 16.95778330627382, + "learning_rate": 3.5573998348206195e-05, + "loss": 2.7032, + "mean_token_accuracy": 0.4034482717514038, + "step": 35320 + }, + { + "epoch": 0.03557971518098064, + "grad_norm": 16.69115441368345, + "learning_rate": 3.5579034304937254e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.44984874725341795, + "step": 35325 + }, + { + "epoch": 0.03558475123408481, + "grad_norm": 11.871965701534956, + "learning_rate": 3.558407026166831e-05, + "loss": 2.5165, + "mean_token_accuracy": 0.4137930989265442, + "step": 35330 + }, + { + "epoch": 0.035589787287188986, + "grad_norm": 12.262484628889235, + "learning_rate": 3.558910621839937e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.45517241954803467, + "step": 35335 + }, + { + "epoch": 0.03559482334029316, + "grad_norm": 14.97926085963146, + "learning_rate": 3.559414217513043e-05, + "loss": 2.7502, + "mean_token_accuracy": 0.3965517282485962, + "step": 35340 + }, + { + "epoch": 0.03559985939339733, + "grad_norm": 18.535705531532795, + "learning_rate": 3.559917813186149e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.41034482717514037, + "step": 35345 + }, + { + "epoch": 0.03560489544650151, + "grad_norm": 15.634884967722432, + "learning_rate": 3.560421408859255e-05, + "loss": 2.3135, + "mean_token_accuracy": 0.41724138259887694, + "step": 35350 + }, + { + "epoch": 0.035609931499605674, + "grad_norm": 13.031608692625467, + "learning_rate": 3.560925004532361e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.42068966031074523, + "step": 35355 + }, + { + "epoch": 0.03561496755270985, + "grad_norm": 18.14733925359142, + "learning_rate": 3.561428600205467e-05, + "loss": 2.8859, + "mean_token_accuracy": 0.3793103456497192, + "step": 35360 + }, + { + "epoch": 0.03562000360581402, + "grad_norm": 16.374472706408568, + "learning_rate": 3.5619321958785735e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4310344815254211, + "step": 35365 + }, + { + "epoch": 0.035625039658918195, + "grad_norm": 23.510849869616205, + "learning_rate": 3.5624357915516794e-05, + "loss": 2.9885, + "mean_token_accuracy": 0.34482758641242983, + "step": 35370 + }, + { + "epoch": 0.03563007571202237, + "grad_norm": 14.792527617906579, + "learning_rate": 3.562939387224785e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.4172413766384125, + "step": 35375 + }, + { + "epoch": 0.03563511176512654, + "grad_norm": 15.468006708389817, + "learning_rate": 3.563442982897891e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.4103448212146759, + "step": 35380 + }, + { + "epoch": 0.035640147818230716, + "grad_norm": 14.71705712569485, + "learning_rate": 3.563946578570997e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.39310344457626345, + "step": 35385 + }, + { + "epoch": 0.03564518387133488, + "grad_norm": 14.688470590413813, + "learning_rate": 3.564450174244103e-05, + "loss": 2.7157, + "mean_token_accuracy": 0.4103448212146759, + "step": 35390 + }, + { + "epoch": 0.03565021992443906, + "grad_norm": 11.901411495723053, + "learning_rate": 3.564953769917209e-05, + "loss": 2.8455, + "mean_token_accuracy": 0.41923774480819703, + "step": 35395 + }, + { + "epoch": 0.03565525597754323, + "grad_norm": 19.39408417714081, + "learning_rate": 3.565457365590315e-05, + "loss": 2.8126, + "mean_token_accuracy": 0.33103448152542114, + "step": 35400 + }, + { + "epoch": 0.035660292030647404, + "grad_norm": 16.166699125198168, + "learning_rate": 3.565960961263421e-05, + "loss": 2.3968, + "mean_token_accuracy": 0.39310344457626345, + "step": 35405 + }, + { + "epoch": 0.03566532808375158, + "grad_norm": 11.622290459096908, + "learning_rate": 3.566464556936527e-05, + "loss": 2.1377, + "mean_token_accuracy": 0.4103448212146759, + "step": 35410 + }, + { + "epoch": 0.03567036413685575, + "grad_norm": 15.372122823406604, + "learning_rate": 3.566968152609633e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.4323653936386108, + "step": 35415 + }, + { + "epoch": 0.035675400189959926, + "grad_norm": 17.5870721417988, + "learning_rate": 3.567471748282739e-05, + "loss": 2.515, + "mean_token_accuracy": 0.42068964838981626, + "step": 35420 + }, + { + "epoch": 0.03568043624306409, + "grad_norm": 14.796916261664762, + "learning_rate": 3.567975343955845e-05, + "loss": 2.3273, + "mean_token_accuracy": 0.46551724076271056, + "step": 35425 + }, + { + "epoch": 0.035685472296168266, + "grad_norm": 16.627819932407505, + "learning_rate": 3.568478939628951e-05, + "loss": 2.7484, + "mean_token_accuracy": 0.42758620381355283, + "step": 35430 + }, + { + "epoch": 0.03569050834927244, + "grad_norm": 15.242096179375515, + "learning_rate": 3.5689825353020564e-05, + "loss": 2.2638, + "mean_token_accuracy": 0.4448275864124298, + "step": 35435 + }, + { + "epoch": 0.035695544402376614, + "grad_norm": 13.796157116777385, + "learning_rate": 3.5694861309751624e-05, + "loss": 3.0278, + "mean_token_accuracy": 0.3482758581638336, + "step": 35440 + }, + { + "epoch": 0.03570058045548079, + "grad_norm": 16.072140421760455, + "learning_rate": 3.569989726648269e-05, + "loss": 2.6115, + "mean_token_accuracy": 0.38275861740112305, + "step": 35445 + }, + { + "epoch": 0.03570561650858496, + "grad_norm": 16.774082359754804, + "learning_rate": 3.570493322321375e-05, + "loss": 2.2743, + "mean_token_accuracy": 0.42068966031074523, + "step": 35450 + }, + { + "epoch": 0.035710652561689135, + "grad_norm": 13.176571392850427, + "learning_rate": 3.570996917994481e-05, + "loss": 2.5807, + "mean_token_accuracy": 0.4034482777118683, + "step": 35455 + }, + { + "epoch": 0.0357156886147933, + "grad_norm": 19.210897493015615, + "learning_rate": 3.571500513667587e-05, + "loss": 2.565, + "mean_token_accuracy": 0.41379311084747317, + "step": 35460 + }, + { + "epoch": 0.035720724667897476, + "grad_norm": 15.797181810095896, + "learning_rate": 3.5720041093406927e-05, + "loss": 2.7047, + "mean_token_accuracy": 0.40689654350280763, + "step": 35465 + }, + { + "epoch": 0.03572576072100165, + "grad_norm": 14.169786013140444, + "learning_rate": 3.572507705013799e-05, + "loss": 2.4288, + "mean_token_accuracy": 0.46896551847457885, + "step": 35470 + }, + { + "epoch": 0.03573079677410582, + "grad_norm": 14.403208626634898, + "learning_rate": 3.5730113006869045e-05, + "loss": 2.9215, + "mean_token_accuracy": 0.3551724076271057, + "step": 35475 + }, + { + "epoch": 0.03573583282721, + "grad_norm": 17.32313323761732, + "learning_rate": 3.5735148963600104e-05, + "loss": 2.5442, + "mean_token_accuracy": 0.4379310369491577, + "step": 35480 + }, + { + "epoch": 0.03574086888031417, + "grad_norm": 15.908719857189077, + "learning_rate": 3.5740184920331164e-05, + "loss": 2.4375, + "mean_token_accuracy": 0.4206896543502808, + "step": 35485 + }, + { + "epoch": 0.035745904933418345, + "grad_norm": 14.23529070873796, + "learning_rate": 3.574522087706222e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.41379310488700866, + "step": 35490 + }, + { + "epoch": 0.03575094098652251, + "grad_norm": 14.05360682853102, + "learning_rate": 3.575025683379328e-05, + "loss": 2.1873, + "mean_token_accuracy": 0.4172413766384125, + "step": 35495 + }, + { + "epoch": 0.035755977039626685, + "grad_norm": 20.427118954821168, + "learning_rate": 3.575529279052435e-05, + "loss": 2.8419, + "mean_token_accuracy": 0.38965516686439516, + "step": 35500 + }, + { + "epoch": 0.03576101309273086, + "grad_norm": 18.399676560837694, + "learning_rate": 3.576032874725541e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.43290985822677613, + "step": 35505 + }, + { + "epoch": 0.03576604914583503, + "grad_norm": 13.685914706727317, + "learning_rate": 3.576536470398647e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.43448275327682495, + "step": 35510 + }, + { + "epoch": 0.035771085198939206, + "grad_norm": 12.876838638126735, + "learning_rate": 3.5770400660717526e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.45862069725990295, + "step": 35515 + }, + { + "epoch": 0.03577612125204338, + "grad_norm": 17.918828232646362, + "learning_rate": 3.5775436617448585e-05, + "loss": 2.713, + "mean_token_accuracy": 0.3965517163276672, + "step": 35520 + }, + { + "epoch": 0.035781157305147554, + "grad_norm": 15.185139507452968, + "learning_rate": 3.5780472574179644e-05, + "loss": 2.5344, + "mean_token_accuracy": 0.40344828367233276, + "step": 35525 + }, + { + "epoch": 0.03578619335825172, + "grad_norm": 16.28109091498346, + "learning_rate": 3.5785508530910704e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.4551724076271057, + "step": 35530 + }, + { + "epoch": 0.035791229411355895, + "grad_norm": 18.947028597711554, + "learning_rate": 3.579054448764176e-05, + "loss": 2.3144, + "mean_token_accuracy": 0.4379310369491577, + "step": 35535 + }, + { + "epoch": 0.03579626546446007, + "grad_norm": 15.371822247827776, + "learning_rate": 3.579558044437282e-05, + "loss": 2.2236, + "mean_token_accuracy": 0.4551724076271057, + "step": 35540 + }, + { + "epoch": 0.03580130151756424, + "grad_norm": 14.49687517519227, + "learning_rate": 3.580061640110388e-05, + "loss": 2.351, + "mean_token_accuracy": 0.4206896543502808, + "step": 35545 + }, + { + "epoch": 0.035806337570668416, + "grad_norm": 25.636853136937354, + "learning_rate": 3.580565235783495e-05, + "loss": 2.6554, + "mean_token_accuracy": 0.34137930572032926, + "step": 35550 + }, + { + "epoch": 0.03581137362377259, + "grad_norm": 17.311992290896285, + "learning_rate": 3.581068831456601e-05, + "loss": 2.8273, + "mean_token_accuracy": 0.4154264986515045, + "step": 35555 + }, + { + "epoch": 0.03581640967687676, + "grad_norm": 13.910138230425096, + "learning_rate": 3.5815724271297066e-05, + "loss": 2.566, + "mean_token_accuracy": 0.4011494338512421, + "step": 35560 + }, + { + "epoch": 0.03582144572998093, + "grad_norm": 13.69434489961823, + "learning_rate": 3.5820760228028125e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.4482758462429047, + "step": 35565 + }, + { + "epoch": 0.035826481783085104, + "grad_norm": 18.490324924281, + "learning_rate": 3.582579618475918e-05, + "loss": 2.9352, + "mean_token_accuracy": 0.37241379618644715, + "step": 35570 + }, + { + "epoch": 0.03583151783618928, + "grad_norm": 12.793083924396608, + "learning_rate": 3.583083214149024e-05, + "loss": 2.5553, + "mean_token_accuracy": 0.4068965494632721, + "step": 35575 + }, + { + "epoch": 0.03583655388929345, + "grad_norm": 15.210389993425762, + "learning_rate": 3.58358680982213e-05, + "loss": 2.5775, + "mean_token_accuracy": 0.37707199454307555, + "step": 35580 + }, + { + "epoch": 0.035841589942397625, + "grad_norm": 15.20890508494618, + "learning_rate": 3.584090405495236e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.41034482717514037, + "step": 35585 + }, + { + "epoch": 0.0358466259955018, + "grad_norm": 13.381917511052428, + "learning_rate": 3.584594001168342e-05, + "loss": 2.9119, + "mean_token_accuracy": 0.38275861740112305, + "step": 35590 + }, + { + "epoch": 0.03585166204860597, + "grad_norm": 12.493109191625829, + "learning_rate": 3.585097596841448e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.38275861740112305, + "step": 35595 + }, + { + "epoch": 0.03585669810171014, + "grad_norm": 17.824904916444197, + "learning_rate": 3.585601192514554e-05, + "loss": 2.5532, + "mean_token_accuracy": 0.41554749608039854, + "step": 35600 + }, + { + "epoch": 0.03586173415481431, + "grad_norm": 15.570239825143327, + "learning_rate": 3.5861047881876606e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.41034482717514037, + "step": 35605 + }, + { + "epoch": 0.03586677020791849, + "grad_norm": 13.622352660830135, + "learning_rate": 3.586608383860766e-05, + "loss": 2.5926, + "mean_token_accuracy": 0.40689654350280763, + "step": 35610 + }, + { + "epoch": 0.03587180626102266, + "grad_norm": 12.536057487095968, + "learning_rate": 3.587111979533872e-05, + "loss": 2.7633, + "mean_token_accuracy": 0.3896551698446274, + "step": 35615 + }, + { + "epoch": 0.035876842314126835, + "grad_norm": 12.958877525157792, + "learning_rate": 3.587615575206978e-05, + "loss": 2.6776, + "mean_token_accuracy": 0.45517241954803467, + "step": 35620 + }, + { + "epoch": 0.03588187836723101, + "grad_norm": 16.389540740193812, + "learning_rate": 3.5881191708800836e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.3827586233615875, + "step": 35625 + }, + { + "epoch": 0.03588691442033518, + "grad_norm": 17.51925880213451, + "learning_rate": 3.58862276655319e-05, + "loss": 2.5911, + "mean_token_accuracy": 0.4482758641242981, + "step": 35630 + }, + { + "epoch": 0.03589195047343935, + "grad_norm": 12.673485753549853, + "learning_rate": 3.589126362226296e-05, + "loss": 2.6774, + "mean_token_accuracy": 0.40344828367233276, + "step": 35635 + }, + { + "epoch": 0.03589698652654352, + "grad_norm": 14.917715377498274, + "learning_rate": 3.589629957899402e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.3344827562570572, + "step": 35640 + }, + { + "epoch": 0.0359020225796477, + "grad_norm": 13.720853268146719, + "learning_rate": 3.590133553572508e-05, + "loss": 2.4033, + "mean_token_accuracy": 0.42758620977401735, + "step": 35645 + }, + { + "epoch": 0.03590705863275187, + "grad_norm": 12.609524471901185, + "learning_rate": 3.590637149245614e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.4620689630508423, + "step": 35650 + }, + { + "epoch": 0.035912094685856044, + "grad_norm": 15.501070616356776, + "learning_rate": 3.59114074491872e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.41724138259887694, + "step": 35655 + }, + { + "epoch": 0.03591713073896022, + "grad_norm": 14.93177713409494, + "learning_rate": 3.591644340591826e-05, + "loss": 2.5277, + "mean_token_accuracy": 0.3999999940395355, + "step": 35660 + }, + { + "epoch": 0.03592216679206439, + "grad_norm": 19.790333414116663, + "learning_rate": 3.592147936264932e-05, + "loss": 2.6167, + "mean_token_accuracy": 0.41379310488700866, + "step": 35665 + }, + { + "epoch": 0.03592720284516856, + "grad_norm": 15.731223545668247, + "learning_rate": 3.5926515319380376e-05, + "loss": 2.6356, + "mean_token_accuracy": 0.3896551728248596, + "step": 35670 + }, + { + "epoch": 0.03593223889827273, + "grad_norm": 17.809563326580847, + "learning_rate": 3.5931551276111436e-05, + "loss": 2.8249, + "mean_token_accuracy": 0.341379314661026, + "step": 35675 + }, + { + "epoch": 0.035937274951376906, + "grad_norm": 18.292534540978913, + "learning_rate": 3.5936587232842495e-05, + "loss": 2.7546, + "mean_token_accuracy": 0.4206896543502808, + "step": 35680 + }, + { + "epoch": 0.03594231100448108, + "grad_norm": 17.084287701479095, + "learning_rate": 3.594162318957356e-05, + "loss": 2.5702, + "mean_token_accuracy": 0.41379310488700866, + "step": 35685 + }, + { + "epoch": 0.035947347057585254, + "grad_norm": 15.509917716608, + "learning_rate": 3.594665914630462e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.44482758045196535, + "step": 35690 + }, + { + "epoch": 0.03595238311068943, + "grad_norm": 13.82853505337028, + "learning_rate": 3.595169510303568e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4103448331356049, + "step": 35695 + }, + { + "epoch": 0.0359574191637936, + "grad_norm": 16.51209833487878, + "learning_rate": 3.595673105976673e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.41724138259887694, + "step": 35700 + }, + { + "epoch": 0.03596245521689777, + "grad_norm": 15.547442680429047, + "learning_rate": 3.596176701649779e-05, + "loss": 2.7701, + "mean_token_accuracy": 0.3862069010734558, + "step": 35705 + }, + { + "epoch": 0.03596749127000194, + "grad_norm": 47.135882463260295, + "learning_rate": 3.596680297322886e-05, + "loss": 2.9303, + "mean_token_accuracy": 0.37241379022598264, + "step": 35710 + }, + { + "epoch": 0.035972527323106115, + "grad_norm": 19.794887651527695, + "learning_rate": 3.5971838929959916e-05, + "loss": 2.8959, + "mean_token_accuracy": 0.34137930870056155, + "step": 35715 + }, + { + "epoch": 0.03597756337621029, + "grad_norm": 14.356133745325254, + "learning_rate": 3.5976874886690976e-05, + "loss": 2.811, + "mean_token_accuracy": 0.3551724195480347, + "step": 35720 + }, + { + "epoch": 0.03598259942931446, + "grad_norm": 13.27706680193558, + "learning_rate": 3.5981910843422035e-05, + "loss": 2.3341, + "mean_token_accuracy": 0.44827585220336913, + "step": 35725 + }, + { + "epoch": 0.03598763548241864, + "grad_norm": 11.228835141586131, + "learning_rate": 3.5986946800153094e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.43793103098869324, + "step": 35730 + }, + { + "epoch": 0.03599267153552281, + "grad_norm": 12.189650152744777, + "learning_rate": 3.599198275688416e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.4, + "step": 35735 + }, + { + "epoch": 0.03599770758862698, + "grad_norm": 19.267046518760324, + "learning_rate": 3.599701871361521e-05, + "loss": 2.7015, + "mean_token_accuracy": 0.34137930870056155, + "step": 35740 + }, + { + "epoch": 0.03600274364173115, + "grad_norm": 14.870453562178106, + "learning_rate": 3.600205467034627e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.3620689630508423, + "step": 35745 + }, + { + "epoch": 0.036007779694835325, + "grad_norm": 14.618246072353006, + "learning_rate": 3.600709062707733e-05, + "loss": 2.4896, + "mean_token_accuracy": 0.38106473684310915, + "step": 35750 + }, + { + "epoch": 0.0360128157479395, + "grad_norm": 15.901296860817029, + "learning_rate": 3.601212658380839e-05, + "loss": 2.5075, + "mean_token_accuracy": 0.39655172228813174, + "step": 35755 + }, + { + "epoch": 0.03601785180104367, + "grad_norm": 15.575862476953128, + "learning_rate": 3.601716254053945e-05, + "loss": 2.8814, + "mean_token_accuracy": 0.334482753276825, + "step": 35760 + }, + { + "epoch": 0.036022887854147846, + "grad_norm": 14.39027230414355, + "learning_rate": 3.6022198497270516e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.43448275327682495, + "step": 35765 + }, + { + "epoch": 0.03602792390725202, + "grad_norm": 18.875038085220268, + "learning_rate": 3.6027234454001575e-05, + "loss": 2.7167, + "mean_token_accuracy": 0.3724137872457504, + "step": 35770 + }, + { + "epoch": 0.03603295996035619, + "grad_norm": 14.022362398138073, + "learning_rate": 3.6032270410732634e-05, + "loss": 2.3131, + "mean_token_accuracy": 0.43793103098869324, + "step": 35775 + }, + { + "epoch": 0.03603799601346036, + "grad_norm": 13.198568676296196, + "learning_rate": 3.603730636746369e-05, + "loss": 2.5847, + "mean_token_accuracy": 0.4344827592372894, + "step": 35780 + }, + { + "epoch": 0.036043032066564534, + "grad_norm": 16.359619141831356, + "learning_rate": 3.604234232419475e-05, + "loss": 2.2021, + "mean_token_accuracy": 0.4482758641242981, + "step": 35785 + }, + { + "epoch": 0.03604806811966871, + "grad_norm": 15.17237985572359, + "learning_rate": 3.604737828092581e-05, + "loss": 2.6876, + "mean_token_accuracy": 0.32758620381355286, + "step": 35790 + }, + { + "epoch": 0.03605310417277288, + "grad_norm": 17.46830585619335, + "learning_rate": 3.605241423765687e-05, + "loss": 2.7893, + "mean_token_accuracy": 0.3517241358757019, + "step": 35795 + }, + { + "epoch": 0.036058140225877056, + "grad_norm": 11.081676417264928, + "learning_rate": 3.605745019438793e-05, + "loss": 2.5076, + "mean_token_accuracy": 0.37241379022598264, + "step": 35800 + }, + { + "epoch": 0.03606317627898123, + "grad_norm": 17.246101171400483, + "learning_rate": 3.606248615111899e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.4586206912994385, + "step": 35805 + }, + { + "epoch": 0.036068212332085396, + "grad_norm": 17.511476155750902, + "learning_rate": 3.606752210785005e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.4186932861804962, + "step": 35810 + }, + { + "epoch": 0.03607324838518957, + "grad_norm": 32.674096194830184, + "learning_rate": 3.6072558064581115e-05, + "loss": 3.1546, + "mean_token_accuracy": 0.34137930870056155, + "step": 35815 + }, + { + "epoch": 0.036078284438293744, + "grad_norm": 19.707316532075957, + "learning_rate": 3.6077594021312174e-05, + "loss": 2.9366, + "mean_token_accuracy": 0.3793103456497192, + "step": 35820 + }, + { + "epoch": 0.03608332049139792, + "grad_norm": 13.387358123766912, + "learning_rate": 3.6082629978043233e-05, + "loss": 2.3367, + "mean_token_accuracy": 0.44482759237289426, + "step": 35825 + }, + { + "epoch": 0.03608835654450209, + "grad_norm": 17.429435847706067, + "learning_rate": 3.608766593477429e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.441379314661026, + "step": 35830 + }, + { + "epoch": 0.036093392597606265, + "grad_norm": 15.6860832924542, + "learning_rate": 3.6092701891505345e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.42413793206214906, + "step": 35835 + }, + { + "epoch": 0.03609842865071044, + "grad_norm": 29.688036388874316, + "learning_rate": 3.6097737848236404e-05, + "loss": 2.7454, + "mean_token_accuracy": 0.36551723778247835, + "step": 35840 + }, + { + "epoch": 0.036103464703814606, + "grad_norm": 15.220022821820365, + "learning_rate": 3.610277380496747e-05, + "loss": 2.4929, + "mean_token_accuracy": 0.34482758343219755, + "step": 35845 + }, + { + "epoch": 0.03610850075691878, + "grad_norm": 15.242905750411836, + "learning_rate": 3.610780976169853e-05, + "loss": 2.035, + "mean_token_accuracy": 0.44827585816383364, + "step": 35850 + }, + { + "epoch": 0.03611353681002295, + "grad_norm": 12.622525167184936, + "learning_rate": 3.611284571842959e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.43103448748588563, + "step": 35855 + }, + { + "epoch": 0.03611857286312713, + "grad_norm": 15.561342733826251, + "learning_rate": 3.611788167516065e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.4172413766384125, + "step": 35860 + }, + { + "epoch": 0.0361236089162313, + "grad_norm": 14.808678748066189, + "learning_rate": 3.612291763189171e-05, + "loss": 2.8236, + "mean_token_accuracy": 0.3999999940395355, + "step": 35865 + }, + { + "epoch": 0.036128644969335474, + "grad_norm": 12.96414199813315, + "learning_rate": 3.6127953588622773e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.4137930929660797, + "step": 35870 + }, + { + "epoch": 0.03613368102243965, + "grad_norm": 14.804641055529913, + "learning_rate": 3.6132989545353826e-05, + "loss": 2.8642, + "mean_token_accuracy": 0.3896551728248596, + "step": 35875 + }, + { + "epoch": 0.036138717075543815, + "grad_norm": 15.46684701304198, + "learning_rate": 3.6138025502084885e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.37931033968925476, + "step": 35880 + }, + { + "epoch": 0.03614375312864799, + "grad_norm": 16.907079577522516, + "learning_rate": 3.6143061458815944e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.4206896543502808, + "step": 35885 + }, + { + "epoch": 0.03614878918175216, + "grad_norm": 17.363303991893545, + "learning_rate": 3.6148097415547004e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.5146400570869446, + "step": 35890 + }, + { + "epoch": 0.036153825234856336, + "grad_norm": 17.480863184396082, + "learning_rate": 3.615313337227807e-05, + "loss": 2.7495, + "mean_token_accuracy": 0.37241379618644715, + "step": 35895 + }, + { + "epoch": 0.03615886128796051, + "grad_norm": 31.941424221703727, + "learning_rate": 3.615816932900913e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.4517241299152374, + "step": 35900 + }, + { + "epoch": 0.036163897341064684, + "grad_norm": 14.23777291827755, + "learning_rate": 3.616320528574019e-05, + "loss": 2.6203, + "mean_token_accuracy": 0.42758620977401735, + "step": 35905 + }, + { + "epoch": 0.03616893339416886, + "grad_norm": 15.67736916485311, + "learning_rate": 3.616824124247125e-05, + "loss": 2.823, + "mean_token_accuracy": 0.3620689630508423, + "step": 35910 + }, + { + "epoch": 0.036173969447273024, + "grad_norm": 18.786597366271845, + "learning_rate": 3.617327719920231e-05, + "loss": 2.9431, + "mean_token_accuracy": 0.40344828367233276, + "step": 35915 + }, + { + "epoch": 0.0361790055003772, + "grad_norm": 13.235391011107163, + "learning_rate": 3.6178313155933366e-05, + "loss": 2.5783, + "mean_token_accuracy": 0.3965517282485962, + "step": 35920 + }, + { + "epoch": 0.03618404155348137, + "grad_norm": 16.089551166087016, + "learning_rate": 3.6183349112664425e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.42758620381355283, + "step": 35925 + }, + { + "epoch": 0.036189077606585546, + "grad_norm": 11.968365440762746, + "learning_rate": 3.6188385069395485e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.37241379022598264, + "step": 35930 + }, + { + "epoch": 0.03619411365968972, + "grad_norm": 15.917514254278657, + "learning_rate": 3.6193421026126544e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.4448275864124298, + "step": 35935 + }, + { + "epoch": 0.03619914971279389, + "grad_norm": 15.30218224352409, + "learning_rate": 3.61984569828576e-05, + "loss": 2.5706, + "mean_token_accuracy": 0.42165759205818176, + "step": 35940 + }, + { + "epoch": 0.03620418576589807, + "grad_norm": 13.605617111862836, + "learning_rate": 3.620349293958866e-05, + "loss": 2.619, + "mean_token_accuracy": 0.3848759800195694, + "step": 35945 + }, + { + "epoch": 0.036209221819002234, + "grad_norm": 13.452115947324405, + "learning_rate": 3.620852889631973e-05, + "loss": 2.619, + "mean_token_accuracy": 0.3793103456497192, + "step": 35950 + }, + { + "epoch": 0.03621425787210641, + "grad_norm": 14.650236371455364, + "learning_rate": 3.621356485305079e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.4000000059604645, + "step": 35955 + }, + { + "epoch": 0.03621929392521058, + "grad_norm": 13.849238041885808, + "learning_rate": 3.621860080978185e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.4154869973659515, + "step": 35960 + }, + { + "epoch": 0.036224329978314755, + "grad_norm": 18.751799276184475, + "learning_rate": 3.6223636766512906e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.41034482419490814, + "step": 35965 + }, + { + "epoch": 0.03622936603141893, + "grad_norm": 13.095282157229866, + "learning_rate": 3.622867272324396e-05, + "loss": 2.234, + "mean_token_accuracy": 0.4034482717514038, + "step": 35970 + }, + { + "epoch": 0.0362344020845231, + "grad_norm": 12.67767159801466, + "learning_rate": 3.6233708679975025e-05, + "loss": 2.3487, + "mean_token_accuracy": 0.4448275864124298, + "step": 35975 + }, + { + "epoch": 0.036239438137627276, + "grad_norm": 15.397382634031622, + "learning_rate": 3.6238744636706084e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.4034482777118683, + "step": 35980 + }, + { + "epoch": 0.03624447419073144, + "grad_norm": 18.535575416031605, + "learning_rate": 3.624378059343714e-05, + "loss": 2.4384, + "mean_token_accuracy": 0.5041871905326843, + "step": 35985 + }, + { + "epoch": 0.03624951024383562, + "grad_norm": 17.667832184653868, + "learning_rate": 3.62488165501682e-05, + "loss": 2.6409, + "mean_token_accuracy": 0.4137930989265442, + "step": 35990 + }, + { + "epoch": 0.03625454629693979, + "grad_norm": 15.9274282836938, + "learning_rate": 3.625385250689926e-05, + "loss": 2.6306, + "mean_token_accuracy": 0.3896551728248596, + "step": 35995 + }, + { + "epoch": 0.036259582350043965, + "grad_norm": 16.615355871870868, + "learning_rate": 3.625888846363033e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.39310344457626345, + "step": 36000 + }, + { + "epoch": 0.03626461840314814, + "grad_norm": 14.44093224721498, + "learning_rate": 3.626392442036139e-05, + "loss": 2.3716, + "mean_token_accuracy": 0.4, + "step": 36005 + }, + { + "epoch": 0.03626965445625231, + "grad_norm": 14.68171824857839, + "learning_rate": 3.626896037709244e-05, + "loss": 2.6463, + "mean_token_accuracy": 0.3896551698446274, + "step": 36010 + }, + { + "epoch": 0.036274690509356486, + "grad_norm": 18.59762143557379, + "learning_rate": 3.62739963338235e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.4551724135875702, + "step": 36015 + }, + { + "epoch": 0.03627972656246065, + "grad_norm": 18.59015039302462, + "learning_rate": 3.627903229055456e-05, + "loss": 2.6263, + "mean_token_accuracy": 0.4068965494632721, + "step": 36020 + }, + { + "epoch": 0.036284762615564826, + "grad_norm": 14.565903595795094, + "learning_rate": 3.628406824728562e-05, + "loss": 2.478, + "mean_token_accuracy": 0.41379310190677643, + "step": 36025 + }, + { + "epoch": 0.036289798668669, + "grad_norm": 14.991762898592958, + "learning_rate": 3.628910420401668e-05, + "loss": 2.4585, + "mean_token_accuracy": 0.4224440395832062, + "step": 36030 + }, + { + "epoch": 0.036294834721773174, + "grad_norm": 14.522640122472096, + "learning_rate": 3.629414016074774e-05, + "loss": 2.355, + "mean_token_accuracy": 0.43103447556495667, + "step": 36035 + }, + { + "epoch": 0.03629987077487735, + "grad_norm": 15.520254692439453, + "learning_rate": 3.62991761174788e-05, + "loss": 2.9153, + "mean_token_accuracy": 0.37586207389831544, + "step": 36040 + }, + { + "epoch": 0.03630490682798152, + "grad_norm": 14.91854509133642, + "learning_rate": 3.630421207420986e-05, + "loss": 2.6491, + "mean_token_accuracy": 0.4, + "step": 36045 + }, + { + "epoch": 0.036309942881085695, + "grad_norm": 14.574495053276056, + "learning_rate": 3.630924803094092e-05, + "loss": 2.32, + "mean_token_accuracy": 0.44137930274009707, + "step": 36050 + }, + { + "epoch": 0.03631497893418986, + "grad_norm": 14.746171790010926, + "learning_rate": 3.631428398767198e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.37241379618644715, + "step": 36055 + }, + { + "epoch": 0.036320014987294036, + "grad_norm": 13.121010824546477, + "learning_rate": 3.631931994440304e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.38620689809322356, + "step": 36060 + }, + { + "epoch": 0.03632505104039821, + "grad_norm": 14.402146624586079, + "learning_rate": 3.63243559011341e-05, + "loss": 2.8832, + "mean_token_accuracy": 0.35862069129943847, + "step": 36065 + }, + { + "epoch": 0.03633008709350238, + "grad_norm": 17.299694859136945, + "learning_rate": 3.632939185786516e-05, + "loss": 2.5805, + "mean_token_accuracy": 0.4, + "step": 36070 + }, + { + "epoch": 0.03633512314660656, + "grad_norm": 15.853195381210904, + "learning_rate": 3.6334427814596216e-05, + "loss": 2.6807, + "mean_token_accuracy": 0.4034482717514038, + "step": 36075 + }, + { + "epoch": 0.03634015919971073, + "grad_norm": 8.66524039649621, + "learning_rate": 3.633946377132728e-05, + "loss": 2.3307, + "mean_token_accuracy": 0.4631773352622986, + "step": 36080 + }, + { + "epoch": 0.036345195252814905, + "grad_norm": 16.2393771521022, + "learning_rate": 3.634449972805834e-05, + "loss": 2.7093, + "mean_token_accuracy": 0.3793103456497192, + "step": 36085 + }, + { + "epoch": 0.03635023130591907, + "grad_norm": 14.447030893583047, + "learning_rate": 3.63495356847894e-05, + "loss": 2.6829, + "mean_token_accuracy": 0.3862069010734558, + "step": 36090 + }, + { + "epoch": 0.036355267359023245, + "grad_norm": 19.181049367800277, + "learning_rate": 3.635457164152046e-05, + "loss": 2.8938, + "mean_token_accuracy": 0.37241379022598264, + "step": 36095 + }, + { + "epoch": 0.03636030341212742, + "grad_norm": 13.578190266651461, + "learning_rate": 3.635960759825152e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.36206896901130675, + "step": 36100 + }, + { + "epoch": 0.03636533946523159, + "grad_norm": 14.615518215360693, + "learning_rate": 3.636464355498257e-05, + "loss": 2.8081, + "mean_token_accuracy": 0.3827586233615875, + "step": 36105 + }, + { + "epoch": 0.03637037551833577, + "grad_norm": 15.719146325221544, + "learning_rate": 3.636967951171364e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.4068965494632721, + "step": 36110 + }, + { + "epoch": 0.03637541157143994, + "grad_norm": 16.92111035466899, + "learning_rate": 3.63747154684447e-05, + "loss": 2.6666, + "mean_token_accuracy": 0.37241379022598264, + "step": 36115 + }, + { + "epoch": 0.036380447624544114, + "grad_norm": 28.581161192996884, + "learning_rate": 3.6379751425175756e-05, + "loss": 2.6894, + "mean_token_accuracy": 0.3655172407627106, + "step": 36120 + }, + { + "epoch": 0.03638548367764828, + "grad_norm": 13.837011062664848, + "learning_rate": 3.6384787381906816e-05, + "loss": 2.3826, + "mean_token_accuracy": 0.43448275327682495, + "step": 36125 + }, + { + "epoch": 0.036390519730752455, + "grad_norm": 16.073770594099287, + "learning_rate": 3.6389823338637875e-05, + "loss": 2.2991, + "mean_token_accuracy": 0.4448275864124298, + "step": 36130 + }, + { + "epoch": 0.03639555578385663, + "grad_norm": 17.171784335485533, + "learning_rate": 3.639485929536894e-05, + "loss": 2.6711, + "mean_token_accuracy": 0.36896551847457887, + "step": 36135 + }, + { + "epoch": 0.0364005918369608, + "grad_norm": 20.97024408643152, + "learning_rate": 3.63998952521e-05, + "loss": 2.4862, + "mean_token_accuracy": 0.36896551847457887, + "step": 36140 + }, + { + "epoch": 0.036405627890064976, + "grad_norm": 15.290655460565825, + "learning_rate": 3.640493120883105e-05, + "loss": 2.9559, + "mean_token_accuracy": 0.3655172407627106, + "step": 36145 + }, + { + "epoch": 0.03641066394316915, + "grad_norm": 12.990436795360878, + "learning_rate": 3.640996716556211e-05, + "loss": 2.5069, + "mean_token_accuracy": 0.37241379618644715, + "step": 36150 + }, + { + "epoch": 0.036415699996273324, + "grad_norm": 16.559496268266315, + "learning_rate": 3.641500312229317e-05, + "loss": 2.6801, + "mean_token_accuracy": 0.37241379022598264, + "step": 36155 + }, + { + "epoch": 0.03642073604937749, + "grad_norm": 16.427517298038882, + "learning_rate": 3.642003907902424e-05, + "loss": 2.1478, + "mean_token_accuracy": 0.4034482717514038, + "step": 36160 + }, + { + "epoch": 0.036425772102481664, + "grad_norm": 12.08346055587758, + "learning_rate": 3.6425075035755297e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.41379310488700866, + "step": 36165 + }, + { + "epoch": 0.03643080815558584, + "grad_norm": 17.181908043626212, + "learning_rate": 3.6430110992486356e-05, + "loss": 2.9156, + "mean_token_accuracy": 0.3551724016666412, + "step": 36170 + }, + { + "epoch": 0.03643584420869001, + "grad_norm": 16.86227056359431, + "learning_rate": 3.6435146949217415e-05, + "loss": 2.5197, + "mean_token_accuracy": 0.4103448212146759, + "step": 36175 + }, + { + "epoch": 0.036440880261794185, + "grad_norm": 14.590993247296248, + "learning_rate": 3.6440182905948474e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.3620689630508423, + "step": 36180 + }, + { + "epoch": 0.03644591631489836, + "grad_norm": 34.86349183475435, + "learning_rate": 3.6445218862679534e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.3724137991666794, + "step": 36185 + }, + { + "epoch": 0.03645095236800253, + "grad_norm": 13.361965798027391, + "learning_rate": 3.645025481941059e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.4241379201412201, + "step": 36190 + }, + { + "epoch": 0.0364559884211067, + "grad_norm": 14.625573992905954, + "learning_rate": 3.645529077614165e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.3947973370552063, + "step": 36195 + }, + { + "epoch": 0.036461024474210874, + "grad_norm": 16.121511202696592, + "learning_rate": 3.646032673287271e-05, + "loss": 2.4182, + "mean_token_accuracy": 0.4344827592372894, + "step": 36200 + }, + { + "epoch": 0.03646606052731505, + "grad_norm": 14.41206580746921, + "learning_rate": 3.646536268960377e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.4655172288417816, + "step": 36205 + }, + { + "epoch": 0.03647109658041922, + "grad_norm": 16.489156414997574, + "learning_rate": 3.647039864633483e-05, + "loss": 3.0207, + "mean_token_accuracy": 0.32758620381355286, + "step": 36210 + }, + { + "epoch": 0.036476132633523395, + "grad_norm": 17.25028780882638, + "learning_rate": 3.6475434603065896e-05, + "loss": 2.8948, + "mean_token_accuracy": 0.3137931048870087, + "step": 36215 + }, + { + "epoch": 0.03648116868662757, + "grad_norm": 17.330294823616452, + "learning_rate": 3.6480470559796955e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.4379310250282288, + "step": 36220 + }, + { + "epoch": 0.03648620473973174, + "grad_norm": 12.436920029521131, + "learning_rate": 3.6485506516528014e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.4034482777118683, + "step": 36225 + }, + { + "epoch": 0.03649124079283591, + "grad_norm": 15.573834788462632, + "learning_rate": 3.6490542473259074e-05, + "loss": 2.6657, + "mean_token_accuracy": 0.37241379022598264, + "step": 36230 + }, + { + "epoch": 0.03649627684594008, + "grad_norm": 16.24419360018926, + "learning_rate": 3.6495578429990126e-05, + "loss": 2.5524, + "mean_token_accuracy": 0.36896551847457887, + "step": 36235 + }, + { + "epoch": 0.03650131289904426, + "grad_norm": 14.933289594571585, + "learning_rate": 3.650061438672119e-05, + "loss": 2.367, + "mean_token_accuracy": 0.4482758641242981, + "step": 36240 + }, + { + "epoch": 0.03650634895214843, + "grad_norm": 12.666380885376883, + "learning_rate": 3.650565034345225e-05, + "loss": 2.4119, + "mean_token_accuracy": 0.3982456088066101, + "step": 36245 + }, + { + "epoch": 0.036511385005252604, + "grad_norm": 13.100292262310145, + "learning_rate": 3.651068630018331e-05, + "loss": 2.6084, + "mean_token_accuracy": 0.38784028887748717, + "step": 36250 + }, + { + "epoch": 0.03651642105835678, + "grad_norm": 14.347002985750676, + "learning_rate": 3.651572225691437e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.3551724135875702, + "step": 36255 + }, + { + "epoch": 0.03652145711146095, + "grad_norm": 15.716971561083701, + "learning_rate": 3.652075821364543e-05, + "loss": 2.6467, + "mean_token_accuracy": 0.3551724076271057, + "step": 36260 + }, + { + "epoch": 0.03652649316456512, + "grad_norm": 15.209175574867688, + "learning_rate": 3.652579417037649e-05, + "loss": 2.5953, + "mean_token_accuracy": 0.4517241358757019, + "step": 36265 + }, + { + "epoch": 0.03653152921766929, + "grad_norm": 13.446245082686808, + "learning_rate": 3.6530830127107554e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4103448212146759, + "step": 36270 + }, + { + "epoch": 0.036536565270773466, + "grad_norm": 15.435067096842143, + "learning_rate": 3.653586608383861e-05, + "loss": 2.5666, + "mean_token_accuracy": 0.3896551728248596, + "step": 36275 + }, + { + "epoch": 0.03654160132387764, + "grad_norm": 15.966678975802402, + "learning_rate": 3.6540902040569666e-05, + "loss": 2.6833, + "mean_token_accuracy": 0.38620689511299133, + "step": 36280 + }, + { + "epoch": 0.036546637376981814, + "grad_norm": 16.165294318984767, + "learning_rate": 3.6545937997300725e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.4000000059604645, + "step": 36285 + }, + { + "epoch": 0.03655167343008599, + "grad_norm": 15.952097192636561, + "learning_rate": 3.6550973954031785e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.4053841412067413, + "step": 36290 + }, + { + "epoch": 0.03655670948319016, + "grad_norm": 14.402081919933101, + "learning_rate": 3.655600991076285e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.42413793206214906, + "step": 36295 + }, + { + "epoch": 0.03656174553629433, + "grad_norm": 12.426868957198204, + "learning_rate": 3.656104586749391e-05, + "loss": 2.8947, + "mean_token_accuracy": 0.3896551787853241, + "step": 36300 + }, + { + "epoch": 0.0365667815893985, + "grad_norm": 15.976688919586836, + "learning_rate": 3.656608182422497e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.3758620619773865, + "step": 36305 + }, + { + "epoch": 0.036571817642502676, + "grad_norm": 16.111670452577425, + "learning_rate": 3.657111778095603e-05, + "loss": 2.9224, + "mean_token_accuracy": 0.36551723480224607, + "step": 36310 + }, + { + "epoch": 0.03657685369560685, + "grad_norm": 14.849957117665214, + "learning_rate": 3.657615373768709e-05, + "loss": 2.7598, + "mean_token_accuracy": 0.36896551251411436, + "step": 36315 + }, + { + "epoch": 0.03658188974871102, + "grad_norm": 14.2569943032712, + "learning_rate": 3.658118969441815e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.4172413766384125, + "step": 36320 + }, + { + "epoch": 0.0365869258018152, + "grad_norm": 15.727984292075476, + "learning_rate": 3.6586225651149206e-05, + "loss": 2.8127, + "mean_token_accuracy": 0.3517241418361664, + "step": 36325 + }, + { + "epoch": 0.03659196185491937, + "grad_norm": 14.01803022610442, + "learning_rate": 3.6591261607880265e-05, + "loss": 2.3206, + "mean_token_accuracy": 0.42583181858062746, + "step": 36330 + }, + { + "epoch": 0.03659699790802354, + "grad_norm": 13.4828261759767, + "learning_rate": 3.6596297564611325e-05, + "loss": 2.3721, + "mean_token_accuracy": 0.4620689630508423, + "step": 36335 + }, + { + "epoch": 0.03660203396112771, + "grad_norm": 15.834492333253197, + "learning_rate": 3.6601333521342384e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.3896551728248596, + "step": 36340 + }, + { + "epoch": 0.036607070014231885, + "grad_norm": 15.466322439234206, + "learning_rate": 3.660636947807344e-05, + "loss": 2.7789, + "mean_token_accuracy": 0.38620689511299133, + "step": 36345 + }, + { + "epoch": 0.03661210606733606, + "grad_norm": 14.374797629620218, + "learning_rate": 3.661140543480451e-05, + "loss": 3.0346, + "mean_token_accuracy": 0.36896551847457887, + "step": 36350 + }, + { + "epoch": 0.03661714212044023, + "grad_norm": 19.16983048800411, + "learning_rate": 3.661644139153557e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.41034482717514037, + "step": 36355 + }, + { + "epoch": 0.036622178173544406, + "grad_norm": 12.010314798852168, + "learning_rate": 3.662147734826663e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.4034482717514038, + "step": 36360 + }, + { + "epoch": 0.03662721422664858, + "grad_norm": 17.007335344439117, + "learning_rate": 3.662651330499769e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.4172413766384125, + "step": 36365 + }, + { + "epoch": 0.03663225027975275, + "grad_norm": 15.392404324690036, + "learning_rate": 3.663154926172874e-05, + "loss": 2.7903, + "mean_token_accuracy": 0.4103448331356049, + "step": 36370 + }, + { + "epoch": 0.03663728633285692, + "grad_norm": 16.736351199627077, + "learning_rate": 3.6636585218459805e-05, + "loss": 2.4089, + "mean_token_accuracy": 0.44137930274009707, + "step": 36375 + }, + { + "epoch": 0.036642322385961094, + "grad_norm": 14.165160510607796, + "learning_rate": 3.6641621175190865e-05, + "loss": 2.134, + "mean_token_accuracy": 0.42413793206214906, + "step": 36380 + }, + { + "epoch": 0.03664735843906527, + "grad_norm": 10.178440378193777, + "learning_rate": 3.6646657131921924e-05, + "loss": 2.0906, + "mean_token_accuracy": 0.48694581389427183, + "step": 36385 + }, + { + "epoch": 0.03665239449216944, + "grad_norm": 13.532420242627989, + "learning_rate": 3.665169308865298e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.3620689570903778, + "step": 36390 + }, + { + "epoch": 0.036657430545273616, + "grad_norm": 12.112287261908916, + "learning_rate": 3.665672904538404e-05, + "loss": 2.4276, + "mean_token_accuracy": 0.4497882604598999, + "step": 36395 + }, + { + "epoch": 0.03666246659837779, + "grad_norm": 13.824284013189242, + "learning_rate": 3.666176500211511e-05, + "loss": 2.3524, + "mean_token_accuracy": 0.44827585816383364, + "step": 36400 + }, + { + "epoch": 0.036667502651481956, + "grad_norm": 10.314050921608588, + "learning_rate": 3.666680095884617e-05, + "loss": 2.4139, + "mean_token_accuracy": 0.41034482717514037, + "step": 36405 + }, + { + "epoch": 0.03667253870458613, + "grad_norm": 14.157524875135051, + "learning_rate": 3.667183691557722e-05, + "loss": 2.6844, + "mean_token_accuracy": 0.3827586233615875, + "step": 36410 + }, + { + "epoch": 0.036677574757690304, + "grad_norm": 18.127502866511396, + "learning_rate": 3.667687287230828e-05, + "loss": 3.0694, + "mean_token_accuracy": 0.37586207389831544, + "step": 36415 + }, + { + "epoch": 0.03668261081079448, + "grad_norm": 20.93984412025965, + "learning_rate": 3.668190882903934e-05, + "loss": 2.2199, + "mean_token_accuracy": 0.4586206912994385, + "step": 36420 + }, + { + "epoch": 0.03668764686389865, + "grad_norm": 14.076807432408701, + "learning_rate": 3.6686944785770405e-05, + "loss": 2.483, + "mean_token_accuracy": 0.37241379022598264, + "step": 36425 + }, + { + "epoch": 0.036692682917002825, + "grad_norm": 14.034939192178593, + "learning_rate": 3.6691980742501464e-05, + "loss": 2.6646, + "mean_token_accuracy": 0.37586206793785093, + "step": 36430 + }, + { + "epoch": 0.036697718970107, + "grad_norm": 21.785662007451943, + "learning_rate": 3.669701669923252e-05, + "loss": 2.3723, + "mean_token_accuracy": 0.39999998807907106, + "step": 36435 + }, + { + "epoch": 0.036702755023211166, + "grad_norm": 16.98498210836726, + "learning_rate": 3.670205265596358e-05, + "loss": 2.5823, + "mean_token_accuracy": 0.4206896543502808, + "step": 36440 + }, + { + "epoch": 0.03670779107631534, + "grad_norm": 20.36475786215909, + "learning_rate": 3.670708861269464e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.42758620977401735, + "step": 36445 + }, + { + "epoch": 0.03671282712941951, + "grad_norm": 15.70298399202541, + "learning_rate": 3.67121245694257e-05, + "loss": 2.7215, + "mean_token_accuracy": 0.324137932062149, + "step": 36450 + }, + { + "epoch": 0.03671786318252369, + "grad_norm": 15.341581918664403, + "learning_rate": 3.671716052615676e-05, + "loss": 2.9717, + "mean_token_accuracy": 0.34482758343219755, + "step": 36455 + }, + { + "epoch": 0.03672289923562786, + "grad_norm": 12.913492486317647, + "learning_rate": 3.672219648288782e-05, + "loss": 2.0034, + "mean_token_accuracy": 0.493103438615799, + "step": 36460 + }, + { + "epoch": 0.036727935288732035, + "grad_norm": 14.35918644896692, + "learning_rate": 3.672723243961888e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.41379310488700866, + "step": 36465 + }, + { + "epoch": 0.03673297134183621, + "grad_norm": 25.070769274074035, + "learning_rate": 3.673226839634994e-05, + "loss": 2.9326, + "mean_token_accuracy": 0.3655172437429428, + "step": 36470 + }, + { + "epoch": 0.036738007394940375, + "grad_norm": 17.67738931458292, + "learning_rate": 3.6737304353081e-05, + "loss": 3.031, + "mean_token_accuracy": 0.34137930870056155, + "step": 36475 + }, + { + "epoch": 0.03674304344804455, + "grad_norm": 17.003119450609997, + "learning_rate": 3.674234030981206e-05, + "loss": 2.887, + "mean_token_accuracy": 0.3103448301553726, + "step": 36480 + }, + { + "epoch": 0.03674807950114872, + "grad_norm": 18.814124237527544, + "learning_rate": 3.674737626654312e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.37586206793785093, + "step": 36485 + }, + { + "epoch": 0.036753115554252896, + "grad_norm": 12.884678802483899, + "learning_rate": 3.675241222327418e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.4, + "step": 36490 + }, + { + "epoch": 0.03675815160735707, + "grad_norm": 17.245891982000895, + "learning_rate": 3.675744818000524e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.4206896543502808, + "step": 36495 + }, + { + "epoch": 0.036763187660461244, + "grad_norm": 13.152215944301998, + "learning_rate": 3.67624841367363e-05, + "loss": 2.7032, + "mean_token_accuracy": 0.3879612863063812, + "step": 36500 + }, + { + "epoch": 0.03676822371356542, + "grad_norm": 16.350336362039627, + "learning_rate": 3.676752009346736e-05, + "loss": 2.3987, + "mean_token_accuracy": 0.4241379380226135, + "step": 36505 + }, + { + "epoch": 0.036773259766669585, + "grad_norm": 14.037244549377553, + "learning_rate": 3.677255605019842e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.43448275327682495, + "step": 36510 + }, + { + "epoch": 0.03677829581977376, + "grad_norm": 13.857625168668877, + "learning_rate": 3.677759200692948e-05, + "loss": 2.2814, + "mean_token_accuracy": 0.4172413766384125, + "step": 36515 + }, + { + "epoch": 0.03678333187287793, + "grad_norm": 21.318733556134344, + "learning_rate": 3.678262796366054e-05, + "loss": 2.7173, + "mean_token_accuracy": 0.4257713288068771, + "step": 36520 + }, + { + "epoch": 0.036788367925982106, + "grad_norm": 17.55898067525928, + "learning_rate": 3.6787663920391597e-05, + "loss": 2.4871, + "mean_token_accuracy": 0.39655172228813174, + "step": 36525 + }, + { + "epoch": 0.03679340397908628, + "grad_norm": 14.315485446441079, + "learning_rate": 3.6792699877122656e-05, + "loss": 2.7248, + "mean_token_accuracy": 0.38620689511299133, + "step": 36530 + }, + { + "epoch": 0.03679844003219045, + "grad_norm": 14.217193766507231, + "learning_rate": 3.679773583385372e-05, + "loss": 3.0267, + "mean_token_accuracy": 0.32758620381355286, + "step": 36535 + }, + { + "epoch": 0.03680347608529463, + "grad_norm": 22.39215163177794, + "learning_rate": 3.680277179058478e-05, + "loss": 2.5361, + "mean_token_accuracy": 0.43103448748588563, + "step": 36540 + }, + { + "epoch": 0.036808512138398794, + "grad_norm": 11.005363590176186, + "learning_rate": 3.6807807747315834e-05, + "loss": 2.4369, + "mean_token_accuracy": 0.47586206197738645, + "step": 36545 + }, + { + "epoch": 0.03681354819150297, + "grad_norm": 12.471559578698317, + "learning_rate": 3.681284370404689e-05, + "loss": 2.6909, + "mean_token_accuracy": 0.4172413766384125, + "step": 36550 + }, + { + "epoch": 0.03681858424460714, + "grad_norm": 12.293882083586196, + "learning_rate": 3.681787966077795e-05, + "loss": 2.2911, + "mean_token_accuracy": 0.43448275327682495, + "step": 36555 + }, + { + "epoch": 0.036823620297711315, + "grad_norm": 22.296073766017283, + "learning_rate": 3.682291561750902e-05, + "loss": 2.8727, + "mean_token_accuracy": 0.42068964838981626, + "step": 36560 + }, + { + "epoch": 0.03682865635081549, + "grad_norm": 21.091985036715986, + "learning_rate": 3.682795157424008e-05, + "loss": 2.6694, + "mean_token_accuracy": 0.38100423812866213, + "step": 36565 + }, + { + "epoch": 0.03683369240391966, + "grad_norm": 12.106428830467667, + "learning_rate": 3.683298753097114e-05, + "loss": 2.7159, + "mean_token_accuracy": 0.32758620083332063, + "step": 36570 + }, + { + "epoch": 0.03683872845702384, + "grad_norm": 13.394345368898799, + "learning_rate": 3.6838023487702196e-05, + "loss": 2.4586, + "mean_token_accuracy": 0.42758620977401735, + "step": 36575 + }, + { + "epoch": 0.036843764510128, + "grad_norm": 13.24967801914048, + "learning_rate": 3.6843059444433255e-05, + "loss": 2.3655, + "mean_token_accuracy": 0.44301270246505736, + "step": 36580 + }, + { + "epoch": 0.03684880056323218, + "grad_norm": 40.44384102988305, + "learning_rate": 3.6848095401164314e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.41034482717514037, + "step": 36585 + }, + { + "epoch": 0.03685383661633635, + "grad_norm": 13.768364865999422, + "learning_rate": 3.6853131357895374e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.44827585220336913, + "step": 36590 + }, + { + "epoch": 0.036858872669440525, + "grad_norm": 14.168180245258524, + "learning_rate": 3.685816731462643e-05, + "loss": 2.7905, + "mean_token_accuracy": 0.37241379022598264, + "step": 36595 + }, + { + "epoch": 0.0368639087225447, + "grad_norm": 16.62926980347543, + "learning_rate": 3.686320327135749e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.45862067937850953, + "step": 36600 + }, + { + "epoch": 0.03686894477564887, + "grad_norm": 13.404983881374992, + "learning_rate": 3.686823922808855e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.4413793087005615, + "step": 36605 + }, + { + "epoch": 0.036873980828753046, + "grad_norm": 19.508836861056597, + "learning_rate": 3.687327518481961e-05, + "loss": 2.1978, + "mean_token_accuracy": 0.46551724076271056, + "step": 36610 + }, + { + "epoch": 0.03687901688185721, + "grad_norm": 19.71569079372293, + "learning_rate": 3.687831114155068e-05, + "loss": 2.613, + "mean_token_accuracy": 0.4448275864124298, + "step": 36615 + }, + { + "epoch": 0.03688405293496139, + "grad_norm": 14.73028217038572, + "learning_rate": 3.6883347098281736e-05, + "loss": 2.7805, + "mean_token_accuracy": 0.4052631616592407, + "step": 36620 + }, + { + "epoch": 0.03688908898806556, + "grad_norm": 17.770338187386397, + "learning_rate": 3.6888383055012795e-05, + "loss": 2.7489, + "mean_token_accuracy": 0.41034482717514037, + "step": 36625 + }, + { + "epoch": 0.036894125041169734, + "grad_norm": 20.77891851891104, + "learning_rate": 3.6893419011743854e-05, + "loss": 3.2355, + "mean_token_accuracy": 0.37241379618644715, + "step": 36630 + }, + { + "epoch": 0.03689916109427391, + "grad_norm": 13.3385813629474, + "learning_rate": 3.6898454968474914e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.3965517282485962, + "step": 36635 + }, + { + "epoch": 0.03690419714737808, + "grad_norm": 13.825000542230375, + "learning_rate": 3.690349092520597e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.41379311084747317, + "step": 36640 + }, + { + "epoch": 0.036909233200482255, + "grad_norm": 15.368526569471111, + "learning_rate": 3.690852688193703e-05, + "loss": 2.8062, + "mean_token_accuracy": 0.3551724076271057, + "step": 36645 + }, + { + "epoch": 0.03691426925358642, + "grad_norm": 16.50336998233481, + "learning_rate": 3.691356283866809e-05, + "loss": 2.5526, + "mean_token_accuracy": 0.38620689511299133, + "step": 36650 + }, + { + "epoch": 0.036919305306690596, + "grad_norm": 15.469497237057583, + "learning_rate": 3.691859879539915e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.36896551847457887, + "step": 36655 + }, + { + "epoch": 0.03692434135979477, + "grad_norm": 14.544419116863702, + "learning_rate": 3.692363475213021e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.4103448152542114, + "step": 36660 + }, + { + "epoch": 0.036929377412898944, + "grad_norm": 12.83716851573263, + "learning_rate": 3.6928670708861276e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4310344815254211, + "step": 36665 + }, + { + "epoch": 0.03693441346600312, + "grad_norm": 13.17123543482321, + "learning_rate": 3.6933706665592335e-05, + "loss": 2.5752, + "mean_token_accuracy": 0.42928009033203124, + "step": 36670 + }, + { + "epoch": 0.03693944951910729, + "grad_norm": 13.88805913012588, + "learning_rate": 3.6938742622323395e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.40344826579093934, + "step": 36675 + }, + { + "epoch": 0.036944485572211465, + "grad_norm": 17.179908879362976, + "learning_rate": 3.694377857905445e-05, + "loss": 2.8689, + "mean_token_accuracy": 0.37241379022598264, + "step": 36680 + }, + { + "epoch": 0.03694952162531563, + "grad_norm": 14.287607241169338, + "learning_rate": 3.6948814535785506e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.37586206793785093, + "step": 36685 + }, + { + "epoch": 0.036954557678419805, + "grad_norm": 17.197553630448738, + "learning_rate": 3.6953850492516566e-05, + "loss": 2.0752, + "mean_token_accuracy": 0.517241370677948, + "step": 36690 + }, + { + "epoch": 0.03695959373152398, + "grad_norm": 14.713473174646044, + "learning_rate": 3.695888644924763e-05, + "loss": 2.5222, + "mean_token_accuracy": 0.4034482717514038, + "step": 36695 + }, + { + "epoch": 0.03696462978462815, + "grad_norm": 16.4823026652872, + "learning_rate": 3.696392240597869e-05, + "loss": 2.5325, + "mean_token_accuracy": 0.4344827592372894, + "step": 36700 + }, + { + "epoch": 0.03696966583773233, + "grad_norm": 17.061207594308204, + "learning_rate": 3.696895836270975e-05, + "loss": 2.5047, + "mean_token_accuracy": 0.4172413766384125, + "step": 36705 + }, + { + "epoch": 0.0369747018908365, + "grad_norm": 15.09426318919535, + "learning_rate": 3.697399431944081e-05, + "loss": 2.9041, + "mean_token_accuracy": 0.3793103456497192, + "step": 36710 + }, + { + "epoch": 0.036979737943940674, + "grad_norm": 12.310223699470448, + "learning_rate": 3.697903027617187e-05, + "loss": 2.5271, + "mean_token_accuracy": 0.38620689511299133, + "step": 36715 + }, + { + "epoch": 0.03698477399704484, + "grad_norm": 20.21067023310719, + "learning_rate": 3.698406623290293e-05, + "loss": 2.7052, + "mean_token_accuracy": 0.3551724076271057, + "step": 36720 + }, + { + "epoch": 0.036989810050149015, + "grad_norm": 13.128369039081202, + "learning_rate": 3.698910218963399e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.4034482777118683, + "step": 36725 + }, + { + "epoch": 0.03699484610325319, + "grad_norm": 13.924755315227733, + "learning_rate": 3.6994138146365046e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.4034482777118683, + "step": 36730 + }, + { + "epoch": 0.03699988215635736, + "grad_norm": 15.301539652949188, + "learning_rate": 3.6999174103096106e-05, + "loss": 2.7914, + "mean_token_accuracy": 0.3620689570903778, + "step": 36735 + }, + { + "epoch": 0.037004918209461536, + "grad_norm": 14.693466406679303, + "learning_rate": 3.7004210059827165e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.35862069129943847, + "step": 36740 + }, + { + "epoch": 0.03700995426256571, + "grad_norm": 14.119999064467663, + "learning_rate": 3.700924601655823e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.400123143196106, + "step": 36745 + }, + { + "epoch": 0.037014990315669884, + "grad_norm": 26.143715144810443, + "learning_rate": 3.701428197328929e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.4068965554237366, + "step": 36750 + }, + { + "epoch": 0.03702002636877405, + "grad_norm": 15.057146841669313, + "learning_rate": 3.701931793002035e-05, + "loss": 2.9046, + "mean_token_accuracy": 0.3620689630508423, + "step": 36755 + }, + { + "epoch": 0.037025062421878224, + "grad_norm": 15.665040739350905, + "learning_rate": 3.702435388675141e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.3724137842655182, + "step": 36760 + }, + { + "epoch": 0.0370300984749824, + "grad_norm": 13.046002718561619, + "learning_rate": 3.702938984348247e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4103448331356049, + "step": 36765 + }, + { + "epoch": 0.03703513452808657, + "grad_norm": 15.259623645284506, + "learning_rate": 3.703442580021352e-05, + "loss": 2.7348, + "mean_token_accuracy": 0.39655172228813174, + "step": 36770 + }, + { + "epoch": 0.037040170581190746, + "grad_norm": 15.786352190158714, + "learning_rate": 3.7039461756944586e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.3655172407627106, + "step": 36775 + }, + { + "epoch": 0.03704520663429492, + "grad_norm": 10.804181689832195, + "learning_rate": 3.7044497713675646e-05, + "loss": 2.2104, + "mean_token_accuracy": 0.4724137902259827, + "step": 36780 + }, + { + "epoch": 0.03705024268739909, + "grad_norm": 12.783219528094989, + "learning_rate": 3.7049533670406705e-05, + "loss": 2.3824, + "mean_token_accuracy": 0.37241379618644715, + "step": 36785 + }, + { + "epoch": 0.03705527874050326, + "grad_norm": 15.897804868073115, + "learning_rate": 3.7054569627137764e-05, + "loss": 2.6225, + "mean_token_accuracy": 0.38669951558113097, + "step": 36790 + }, + { + "epoch": 0.037060314793607434, + "grad_norm": 13.347424377097235, + "learning_rate": 3.705960558386882e-05, + "loss": 2.574, + "mean_token_accuracy": 0.34137930274009703, + "step": 36795 + }, + { + "epoch": 0.03706535084671161, + "grad_norm": 13.7405612957663, + "learning_rate": 3.706464154059989e-05, + "loss": 2.5837, + "mean_token_accuracy": 0.4068965494632721, + "step": 36800 + }, + { + "epoch": 0.03707038689981578, + "grad_norm": 17.1206256919078, + "learning_rate": 3.706967749733095e-05, + "loss": 2.8932, + "mean_token_accuracy": 0.3551724076271057, + "step": 36805 + }, + { + "epoch": 0.037075422952919955, + "grad_norm": 13.876509502446755, + "learning_rate": 3.7074713454062e-05, + "loss": 2.3711, + "mean_token_accuracy": 0.4517241418361664, + "step": 36810 + }, + { + "epoch": 0.03708045900602413, + "grad_norm": 16.550653718997555, + "learning_rate": 3.707974941079306e-05, + "loss": 2.8437, + "mean_token_accuracy": 0.38275861740112305, + "step": 36815 + }, + { + "epoch": 0.0370854950591283, + "grad_norm": 14.707804878946575, + "learning_rate": 3.708478536752412e-05, + "loss": 2.1756, + "mean_token_accuracy": 0.45366001725196836, + "step": 36820 + }, + { + "epoch": 0.03709053111223247, + "grad_norm": 14.860491872812407, + "learning_rate": 3.7089821324255186e-05, + "loss": 3.0737, + "mean_token_accuracy": 0.3517241388559341, + "step": 36825 + }, + { + "epoch": 0.03709556716533664, + "grad_norm": 14.935375948459374, + "learning_rate": 3.7094857280986245e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.4275861978530884, + "step": 36830 + }, + { + "epoch": 0.03710060321844082, + "grad_norm": 18.9861745522136, + "learning_rate": 3.7099893237717304e-05, + "loss": 2.8778, + "mean_token_accuracy": 0.35862069129943847, + "step": 36835 + }, + { + "epoch": 0.03710563927154499, + "grad_norm": 16.36811136693674, + "learning_rate": 3.7104929194448363e-05, + "loss": 2.5483, + "mean_token_accuracy": 0.4034482777118683, + "step": 36840 + }, + { + "epoch": 0.037110675324649164, + "grad_norm": 13.987039601173036, + "learning_rate": 3.710996515117942e-05, + "loss": 2.4742, + "mean_token_accuracy": 0.382758629322052, + "step": 36845 + }, + { + "epoch": 0.03711571137775334, + "grad_norm": 12.528021198028817, + "learning_rate": 3.711500110791049e-05, + "loss": 2.1765, + "mean_token_accuracy": 0.4517241299152374, + "step": 36850 + }, + { + "epoch": 0.03712074743085751, + "grad_norm": 19.737135447565944, + "learning_rate": 3.712003706464154e-05, + "loss": 3.0308, + "mean_token_accuracy": 0.37586206793785093, + "step": 36855 + }, + { + "epoch": 0.03712578348396168, + "grad_norm": 13.328153713146431, + "learning_rate": 3.71250730213726e-05, + "loss": 3.016, + "mean_token_accuracy": 0.3827586114406586, + "step": 36860 + }, + { + "epoch": 0.03713081953706585, + "grad_norm": 13.034933300677164, + "learning_rate": 3.713010897810366e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.441379314661026, + "step": 36865 + }, + { + "epoch": 0.037135855590170026, + "grad_norm": 13.418181249917087, + "learning_rate": 3.713514493483472e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.42758620977401735, + "step": 36870 + }, + { + "epoch": 0.0371408916432742, + "grad_norm": 12.20188906408476, + "learning_rate": 3.714018089156578e-05, + "loss": 2.3056, + "mean_token_accuracy": 0.4172413766384125, + "step": 36875 + }, + { + "epoch": 0.037145927696378374, + "grad_norm": 21.345372961236794, + "learning_rate": 3.7145216848296844e-05, + "loss": 2.9974, + "mean_token_accuracy": 0.3569872975349426, + "step": 36880 + }, + { + "epoch": 0.03715096374948255, + "grad_norm": 15.597128550286877, + "learning_rate": 3.7150252805027903e-05, + "loss": 2.5023, + "mean_token_accuracy": 0.4310344815254211, + "step": 36885 + }, + { + "epoch": 0.03715599980258672, + "grad_norm": 15.9805739594926, + "learning_rate": 3.715528876175896e-05, + "loss": 2.5682, + "mean_token_accuracy": 0.4517241299152374, + "step": 36890 + }, + { + "epoch": 0.03716103585569089, + "grad_norm": 14.344108333865575, + "learning_rate": 3.716032471849002e-05, + "loss": 2.6906, + "mean_token_accuracy": 0.3896551638841629, + "step": 36895 + }, + { + "epoch": 0.03716607190879506, + "grad_norm": 14.01777929035498, + "learning_rate": 3.716536067522108e-05, + "loss": 2.7899, + "mean_token_accuracy": 0.3275862067937851, + "step": 36900 + }, + { + "epoch": 0.037171107961899236, + "grad_norm": 16.351207492175742, + "learning_rate": 3.717039663195214e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4206896543502808, + "step": 36905 + }, + { + "epoch": 0.03717614401500341, + "grad_norm": 15.645225942122792, + "learning_rate": 3.71754325886832e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.42758620977401735, + "step": 36910 + }, + { + "epoch": 0.03718118006810758, + "grad_norm": 17.610599082166246, + "learning_rate": 3.718046854541426e-05, + "loss": 2.3899, + "mean_token_accuracy": 0.3620689570903778, + "step": 36915 + }, + { + "epoch": 0.03718621612121176, + "grad_norm": 19.235781277663015, + "learning_rate": 3.718550450214532e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.4862068951129913, + "step": 36920 + }, + { + "epoch": 0.03719125217431593, + "grad_norm": 15.002460756484108, + "learning_rate": 3.719054045887638e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.41379310488700866, + "step": 36925 + }, + { + "epoch": 0.0371962882274201, + "grad_norm": 16.61585405385401, + "learning_rate": 3.7195576415607444e-05, + "loss": 2.6413, + "mean_token_accuracy": 0.41724138855934145, + "step": 36930 + }, + { + "epoch": 0.03720132428052427, + "grad_norm": 15.083693627198638, + "learning_rate": 3.72006123723385e-05, + "loss": 2.588, + "mean_token_accuracy": 0.39655172228813174, + "step": 36935 + }, + { + "epoch": 0.037206360333628445, + "grad_norm": 15.073336156813328, + "learning_rate": 3.720564832906956e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.4430732011795044, + "step": 36940 + }, + { + "epoch": 0.03721139638673262, + "grad_norm": 14.187064410263124, + "learning_rate": 3.7210684285800615e-05, + "loss": 2.2632, + "mean_token_accuracy": 0.42758620381355283, + "step": 36945 + }, + { + "epoch": 0.03721643243983679, + "grad_norm": 17.947019564046133, + "learning_rate": 3.7215720242531674e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.40344828367233276, + "step": 36950 + }, + { + "epoch": 0.037221468492940966, + "grad_norm": 14.59329980851731, + "learning_rate": 3.722075619926273e-05, + "loss": 2.59, + "mean_token_accuracy": 0.4154264986515045, + "step": 36955 + }, + { + "epoch": 0.03722650454604514, + "grad_norm": 12.088184753972115, + "learning_rate": 3.72257921559938e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.4137930989265442, + "step": 36960 + }, + { + "epoch": 0.03723154059914931, + "grad_norm": 19.499888568930547, + "learning_rate": 3.723082811272486e-05, + "loss": 2.61, + "mean_token_accuracy": 0.3689655214548111, + "step": 36965 + }, + { + "epoch": 0.03723657665225348, + "grad_norm": 15.421823701432386, + "learning_rate": 3.723586406945592e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.43623715043067934, + "step": 36970 + }, + { + "epoch": 0.037241612705357655, + "grad_norm": 14.833428354424186, + "learning_rate": 3.724090002618698e-05, + "loss": 2.5679, + "mean_token_accuracy": 0.3862069010734558, + "step": 36975 + }, + { + "epoch": 0.03724664875846183, + "grad_norm": 18.577130206443186, + "learning_rate": 3.7245935982918036e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.3774349629878998, + "step": 36980 + }, + { + "epoch": 0.037251684811566, + "grad_norm": 14.873891519078386, + "learning_rate": 3.7250971939649095e-05, + "loss": 2.6457, + "mean_token_accuracy": 0.3724137842655182, + "step": 36985 + }, + { + "epoch": 0.037256720864670176, + "grad_norm": 14.555570046125922, + "learning_rate": 3.7256007896380155e-05, + "loss": 2.9705, + "mean_token_accuracy": 0.3551724195480347, + "step": 36990 + }, + { + "epoch": 0.03726175691777435, + "grad_norm": 13.116953898879931, + "learning_rate": 3.7261043853111214e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.39999999701976774, + "step": 36995 + }, + { + "epoch": 0.037266792970878516, + "grad_norm": 14.05964570090305, + "learning_rate": 3.726607980984227e-05, + "loss": 2.5995, + "mean_token_accuracy": 0.358620685338974, + "step": 37000 + }, + { + "epoch": 0.03727182902398269, + "grad_norm": 15.272355727642719, + "learning_rate": 3.727111576657333e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.42413793206214906, + "step": 37005 + }, + { + "epoch": 0.037276865077086864, + "grad_norm": 17.128886054715, + "learning_rate": 3.72761517233044e-05, + "loss": 2.5104, + "mean_token_accuracy": 0.42758620977401735, + "step": 37010 + }, + { + "epoch": 0.03728190113019104, + "grad_norm": 15.086621158668816, + "learning_rate": 3.728118768003546e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.38275861740112305, + "step": 37015 + }, + { + "epoch": 0.03728693718329521, + "grad_norm": 15.9754395426235, + "learning_rate": 3.728622363676652e-05, + "loss": 3.0067, + "mean_token_accuracy": 0.31724137961864474, + "step": 37020 + }, + { + "epoch": 0.037291973236399385, + "grad_norm": 15.26129420871469, + "learning_rate": 3.7291259593497576e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.45172412395477296, + "step": 37025 + }, + { + "epoch": 0.03729700928950356, + "grad_norm": 16.882262991485778, + "learning_rate": 3.7296295550228635e-05, + "loss": 3.0585, + "mean_token_accuracy": 0.3793103456497192, + "step": 37030 + }, + { + "epoch": 0.037302045342607726, + "grad_norm": 15.417573989533633, + "learning_rate": 3.7301331506959695e-05, + "loss": 2.543, + "mean_token_accuracy": 0.3862068891525269, + "step": 37035 + }, + { + "epoch": 0.0373070813957119, + "grad_norm": 12.395973961518987, + "learning_rate": 3.7306367463690754e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.4344827592372894, + "step": 37040 + }, + { + "epoch": 0.03731211744881607, + "grad_norm": 18.63392842726373, + "learning_rate": 3.731140342042181e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.3517241358757019, + "step": 37045 + }, + { + "epoch": 0.03731715350192025, + "grad_norm": 25.509881745602566, + "learning_rate": 3.731643937715287e-05, + "loss": 2.405, + "mean_token_accuracy": 0.43254687786102297, + "step": 37050 + }, + { + "epoch": 0.03732218955502442, + "grad_norm": 15.158199915438326, + "learning_rate": 3.732147533388393e-05, + "loss": 2.6761, + "mean_token_accuracy": 0.4079854846000671, + "step": 37055 + }, + { + "epoch": 0.037327225608128595, + "grad_norm": 17.821497858560402, + "learning_rate": 3.732651129061499e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.45517240166664125, + "step": 37060 + }, + { + "epoch": 0.03733226166123277, + "grad_norm": 14.962328222669854, + "learning_rate": 3.733154724734606e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.3896551728248596, + "step": 37065 + }, + { + "epoch": 0.037337297714336935, + "grad_norm": 13.236523969216346, + "learning_rate": 3.7336583204077116e-05, + "loss": 2.4022, + "mean_token_accuracy": 0.4431336998939514, + "step": 37070 + }, + { + "epoch": 0.03734233376744111, + "grad_norm": 17.173124417103086, + "learning_rate": 3.7341619160808175e-05, + "loss": 2.9942, + "mean_token_accuracy": 0.37241379022598264, + "step": 37075 + }, + { + "epoch": 0.03734736982054528, + "grad_norm": 17.751673952909627, + "learning_rate": 3.734665511753923e-05, + "loss": 3.0673, + "mean_token_accuracy": 0.33103448152542114, + "step": 37080 + }, + { + "epoch": 0.03735240587364946, + "grad_norm": 15.133261018386692, + "learning_rate": 3.735169107427029e-05, + "loss": 2.8089, + "mean_token_accuracy": 0.4000000059604645, + "step": 37085 + }, + { + "epoch": 0.03735744192675363, + "grad_norm": 15.351146374087152, + "learning_rate": 3.735672703100135e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.3827586233615875, + "step": 37090 + }, + { + "epoch": 0.037362477979857804, + "grad_norm": 15.842471067688862, + "learning_rate": 3.736176298773241e-05, + "loss": 2.8207, + "mean_token_accuracy": 0.39655172228813174, + "step": 37095 + }, + { + "epoch": 0.03736751403296198, + "grad_norm": 13.255546320989447, + "learning_rate": 3.736679894446347e-05, + "loss": 2.256, + "mean_token_accuracy": 0.4482758641242981, + "step": 37100 + }, + { + "epoch": 0.037372550086066145, + "grad_norm": 15.001262189703693, + "learning_rate": 3.737183490119453e-05, + "loss": 3.0586, + "mean_token_accuracy": 0.3551724076271057, + "step": 37105 + }, + { + "epoch": 0.03737758613917032, + "grad_norm": 15.08775160260167, + "learning_rate": 3.737687085792559e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.42413793206214906, + "step": 37110 + }, + { + "epoch": 0.03738262219227449, + "grad_norm": 13.632981879661061, + "learning_rate": 3.738190681465665e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.4482758641242981, + "step": 37115 + }, + { + "epoch": 0.037387658245378666, + "grad_norm": 12.969719664638676, + "learning_rate": 3.738694277138771e-05, + "loss": 2.6251, + "mean_token_accuracy": 0.39086509346961973, + "step": 37120 + }, + { + "epoch": 0.03739269429848284, + "grad_norm": 12.905571565015103, + "learning_rate": 3.739197872811877e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.37241379022598264, + "step": 37125 + }, + { + "epoch": 0.037397730351587014, + "grad_norm": 18.638340379462747, + "learning_rate": 3.739701468484983e-05, + "loss": 2.7719, + "mean_token_accuracy": 0.3827586233615875, + "step": 37130 + }, + { + "epoch": 0.03740276640469119, + "grad_norm": 15.253056089411233, + "learning_rate": 3.7402050641580886e-05, + "loss": 2.5805, + "mean_token_accuracy": 0.441379314661026, + "step": 37135 + }, + { + "epoch": 0.037407802457795354, + "grad_norm": 14.242773377886216, + "learning_rate": 3.7407086598311946e-05, + "loss": 2.6559, + "mean_token_accuracy": 0.3620689660310745, + "step": 37140 + }, + { + "epoch": 0.03741283851089953, + "grad_norm": 15.260825564855491, + "learning_rate": 3.741212255504301e-05, + "loss": 2.7139, + "mean_token_accuracy": 0.38965516686439516, + "step": 37145 + }, + { + "epoch": 0.0374178745640037, + "grad_norm": 13.076380620682157, + "learning_rate": 3.741715851177407e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.41034482717514037, + "step": 37150 + }, + { + "epoch": 0.037422910617107875, + "grad_norm": 19.260522588702823, + "learning_rate": 3.742219446850513e-05, + "loss": 2.4814, + "mean_token_accuracy": 0.4379310369491577, + "step": 37155 + }, + { + "epoch": 0.03742794667021205, + "grad_norm": 15.382215252876598, + "learning_rate": 3.742723042523619e-05, + "loss": 2.911, + "mean_token_accuracy": 0.3551724076271057, + "step": 37160 + }, + { + "epoch": 0.03743298272331622, + "grad_norm": 14.915892891451094, + "learning_rate": 3.743226638196725e-05, + "loss": 2.9596, + "mean_token_accuracy": 0.3586206793785095, + "step": 37165 + }, + { + "epoch": 0.0374380187764204, + "grad_norm": 21.018577224892457, + "learning_rate": 3.743730233869831e-05, + "loss": 2.8792, + "mean_token_accuracy": 0.4103448331356049, + "step": 37170 + }, + { + "epoch": 0.037443054829524564, + "grad_norm": 14.148458836166203, + "learning_rate": 3.744233829542937e-05, + "loss": 2.7112, + "mean_token_accuracy": 0.3793103456497192, + "step": 37175 + }, + { + "epoch": 0.03744809088262874, + "grad_norm": 10.999292119636277, + "learning_rate": 3.7447374252160427e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.41379310488700866, + "step": 37180 + }, + { + "epoch": 0.03745312693573291, + "grad_norm": 15.551240900808741, + "learning_rate": 3.7452410208891486e-05, + "loss": 2.9199, + "mean_token_accuracy": 0.35517241060733795, + "step": 37185 + }, + { + "epoch": 0.037458162988837085, + "grad_norm": 16.05545767455831, + "learning_rate": 3.7457446165622545e-05, + "loss": 2.7567, + "mean_token_accuracy": 0.3827586233615875, + "step": 37190 + }, + { + "epoch": 0.03746319904194126, + "grad_norm": 12.091876989964625, + "learning_rate": 3.7462482122353604e-05, + "loss": 2.2669, + "mean_token_accuracy": 0.4275862157344818, + "step": 37195 + }, + { + "epoch": 0.03746823509504543, + "grad_norm": 15.017214059884669, + "learning_rate": 3.746751807908467e-05, + "loss": 2.915, + "mean_token_accuracy": 0.35517241060733795, + "step": 37200 + }, + { + "epoch": 0.037473271148149606, + "grad_norm": 17.15534291637143, + "learning_rate": 3.747255403581573e-05, + "loss": 3.0101, + "mean_token_accuracy": 0.3206896513700485, + "step": 37205 + }, + { + "epoch": 0.03747830720125377, + "grad_norm": 12.500291941285042, + "learning_rate": 3.747758999254679e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.41034482717514037, + "step": 37210 + }, + { + "epoch": 0.03748334325435795, + "grad_norm": 16.522644581295456, + "learning_rate": 3.748262594927784e-05, + "loss": 2.9567, + "mean_token_accuracy": 0.37586206793785093, + "step": 37215 + }, + { + "epoch": 0.03748837930746212, + "grad_norm": 13.482915454090747, + "learning_rate": 3.74876619060089e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.42758620381355283, + "step": 37220 + }, + { + "epoch": 0.037493415360566294, + "grad_norm": 13.095617954138069, + "learning_rate": 3.7492697862739967e-05, + "loss": 2.6189, + "mean_token_accuracy": 0.3965517312288284, + "step": 37225 + }, + { + "epoch": 0.03749845141367047, + "grad_norm": 15.214622762015352, + "learning_rate": 3.7497733819471026e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.4116757452487946, + "step": 37230 + }, + { + "epoch": 0.03750348746677464, + "grad_norm": 13.51527068099464, + "learning_rate": 3.7502769776202085e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.4379310369491577, + "step": 37235 + }, + { + "epoch": 0.037508523519878816, + "grad_norm": 18.819319508182396, + "learning_rate": 3.7507805732933144e-05, + "loss": 2.528, + "mean_token_accuracy": 0.4017543852329254, + "step": 37240 + }, + { + "epoch": 0.03751355957298298, + "grad_norm": 18.02407642550917, + "learning_rate": 3.7512841689664204e-05, + "loss": 2.6443, + "mean_token_accuracy": 0.36896551847457887, + "step": 37245 + }, + { + "epoch": 0.037518595626087156, + "grad_norm": 22.40748886666479, + "learning_rate": 3.751787764639527e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.4137930989265442, + "step": 37250 + }, + { + "epoch": 0.03752363167919133, + "grad_norm": 15.073655982234948, + "learning_rate": 3.752291360312632e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.41379310488700866, + "step": 37255 + }, + { + "epoch": 0.037528667732295504, + "grad_norm": 15.205477085343581, + "learning_rate": 3.752794955985738e-05, + "loss": 2.7656, + "mean_token_accuracy": 0.42758620977401735, + "step": 37260 + }, + { + "epoch": 0.03753370378539968, + "grad_norm": 14.833307725558798, + "learning_rate": 3.753298551658844e-05, + "loss": 2.428, + "mean_token_accuracy": 0.4, + "step": 37265 + }, + { + "epoch": 0.03753873983850385, + "grad_norm": 16.49894398318288, + "learning_rate": 3.75380214733195e-05, + "loss": 2.5599, + "mean_token_accuracy": 0.37586206793785093, + "step": 37270 + }, + { + "epoch": 0.037543775891608025, + "grad_norm": 15.12630260029372, + "learning_rate": 3.7543057430050566e-05, + "loss": 2.209, + "mean_token_accuracy": 0.4068965494632721, + "step": 37275 + }, + { + "epoch": 0.03754881194471219, + "grad_norm": 17.13705218859721, + "learning_rate": 3.7548093386781625e-05, + "loss": 2.6415, + "mean_token_accuracy": 0.4034482717514038, + "step": 37280 + }, + { + "epoch": 0.037553847997816366, + "grad_norm": 14.967548616468518, + "learning_rate": 3.7553129343512684e-05, + "loss": 2.7424, + "mean_token_accuracy": 0.37931033968925476, + "step": 37285 + }, + { + "epoch": 0.03755888405092054, + "grad_norm": 12.0525257702542, + "learning_rate": 3.7558165300243744e-05, + "loss": 2.4353, + "mean_token_accuracy": 0.39310344457626345, + "step": 37290 + }, + { + "epoch": 0.03756392010402471, + "grad_norm": 11.980874813087183, + "learning_rate": 3.75632012569748e-05, + "loss": 2.4278, + "mean_token_accuracy": 0.36551723480224607, + "step": 37295 + }, + { + "epoch": 0.03756895615712889, + "grad_norm": 14.102922157654511, + "learning_rate": 3.756823721370586e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.45517241954803467, + "step": 37300 + }, + { + "epoch": 0.03757399221023306, + "grad_norm": 15.27001406095254, + "learning_rate": 3.757327317043692e-05, + "loss": 2.5155, + "mean_token_accuracy": 0.4172413766384125, + "step": 37305 + }, + { + "epoch": 0.037579028263337234, + "grad_norm": 17.021464613701173, + "learning_rate": 3.757830912716798e-05, + "loss": 3.1649, + "mean_token_accuracy": 0.3103448271751404, + "step": 37310 + }, + { + "epoch": 0.0375840643164414, + "grad_norm": 11.76766535096749, + "learning_rate": 3.758334508389904e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4034482777118683, + "step": 37315 + }, + { + "epoch": 0.037589100369545575, + "grad_norm": 18.2817168173513, + "learning_rate": 3.75883810406301e-05, + "loss": 2.7215, + "mean_token_accuracy": 0.3620689630508423, + "step": 37320 + }, + { + "epoch": 0.03759413642264975, + "grad_norm": 16.139894169946512, + "learning_rate": 3.759341699736116e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.41379310488700866, + "step": 37325 + }, + { + "epoch": 0.03759917247575392, + "grad_norm": 12.959993246528642, + "learning_rate": 3.7598452954092224e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.4223835408687592, + "step": 37330 + }, + { + "epoch": 0.037604208528858096, + "grad_norm": 12.562160644566669, + "learning_rate": 3.7603488910823284e-05, + "loss": 2.5406, + "mean_token_accuracy": 0.42758620381355283, + "step": 37335 + }, + { + "epoch": 0.03760924458196227, + "grad_norm": 11.388757829919916, + "learning_rate": 3.760852486755434e-05, + "loss": 2.604, + "mean_token_accuracy": 0.44482759237289426, + "step": 37340 + }, + { + "epoch": 0.037614280635066444, + "grad_norm": 17.859357054318874, + "learning_rate": 3.7613560824285395e-05, + "loss": 2.4748, + "mean_token_accuracy": 0.4482758641242981, + "step": 37345 + }, + { + "epoch": 0.03761931668817061, + "grad_norm": 12.795253385490184, + "learning_rate": 3.7618596781016455e-05, + "loss": 2.137, + "mean_token_accuracy": 0.4896551609039307, + "step": 37350 + }, + { + "epoch": 0.037624352741274784, + "grad_norm": 11.007730505909652, + "learning_rate": 3.762363273774752e-05, + "loss": 2.7909, + "mean_token_accuracy": 0.39999999701976774, + "step": 37355 + }, + { + "epoch": 0.03762938879437896, + "grad_norm": 17.49928757104641, + "learning_rate": 3.762866869447858e-05, + "loss": 2.909, + "mean_token_accuracy": 0.3655172407627106, + "step": 37360 + }, + { + "epoch": 0.03763442484748313, + "grad_norm": 16.18310434471742, + "learning_rate": 3.763370465120964e-05, + "loss": 2.7706, + "mean_token_accuracy": 0.3551724195480347, + "step": 37365 + }, + { + "epoch": 0.037639460900587306, + "grad_norm": 18.431244996763173, + "learning_rate": 3.76387406079407e-05, + "loss": 2.7118, + "mean_token_accuracy": 0.4068965494632721, + "step": 37370 + }, + { + "epoch": 0.03764449695369148, + "grad_norm": 19.73557123611002, + "learning_rate": 3.764377656467176e-05, + "loss": 2.7471, + "mean_token_accuracy": 0.37241379022598264, + "step": 37375 + }, + { + "epoch": 0.03764953300679565, + "grad_norm": 17.254990550462626, + "learning_rate": 3.764881252140282e-05, + "loss": 2.6761, + "mean_token_accuracy": 0.36896551251411436, + "step": 37380 + }, + { + "epoch": 0.03765456905989982, + "grad_norm": 15.233477798941589, + "learning_rate": 3.765384847813388e-05, + "loss": 2.4229, + "mean_token_accuracy": 0.4, + "step": 37385 + }, + { + "epoch": 0.037659605113003994, + "grad_norm": 14.567631397117367, + "learning_rate": 3.7658884434864935e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.39310344457626345, + "step": 37390 + }, + { + "epoch": 0.03766464116610817, + "grad_norm": 13.663812497416789, + "learning_rate": 3.7663920391595995e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.42068964838981626, + "step": 37395 + }, + { + "epoch": 0.03766967721921234, + "grad_norm": 12.843168890320923, + "learning_rate": 3.7668956348327054e-05, + "loss": 2.33, + "mean_token_accuracy": 0.4310344815254211, + "step": 37400 + }, + { + "epoch": 0.037674713272316515, + "grad_norm": 11.180394036435631, + "learning_rate": 3.767399230505811e-05, + "loss": 2.222, + "mean_token_accuracy": 0.4532365322113037, + "step": 37405 + }, + { + "epoch": 0.03767974932542069, + "grad_norm": 15.16200139613471, + "learning_rate": 3.767902826178918e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.43793103098869324, + "step": 37410 + }, + { + "epoch": 0.03768478537852486, + "grad_norm": 15.00254684676308, + "learning_rate": 3.768406421852024e-05, + "loss": 2.4594, + "mean_token_accuracy": 0.42413792610168455, + "step": 37415 + }, + { + "epoch": 0.03768982143162903, + "grad_norm": 12.064764300251081, + "learning_rate": 3.76891001752513e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.4034482777118683, + "step": 37420 + }, + { + "epoch": 0.0376948574847332, + "grad_norm": 14.782644649373715, + "learning_rate": 3.769413613198236e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.484634006023407, + "step": 37425 + }, + { + "epoch": 0.03769989353783738, + "grad_norm": 16.088616604242418, + "learning_rate": 3.7699172088713416e-05, + "loss": 2.7424, + "mean_token_accuracy": 0.4068965494632721, + "step": 37430 + }, + { + "epoch": 0.03770492959094155, + "grad_norm": 14.861030761559944, + "learning_rate": 3.7704208045444476e-05, + "loss": 2.5604, + "mean_token_accuracy": 0.3896551728248596, + "step": 37435 + }, + { + "epoch": 0.037709965644045725, + "grad_norm": 14.942877241509203, + "learning_rate": 3.7709244002175535e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.41034482717514037, + "step": 37440 + }, + { + "epoch": 0.0377150016971499, + "grad_norm": 11.554954658508198, + "learning_rate": 3.7714279958906594e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.4344827592372894, + "step": 37445 + }, + { + "epoch": 0.03772003775025407, + "grad_norm": 13.234206052930565, + "learning_rate": 3.771931591563765e-05, + "loss": 2.4729, + "mean_token_accuracy": 0.43629764914512636, + "step": 37450 + }, + { + "epoch": 0.03772507380335824, + "grad_norm": 14.550100839923084, + "learning_rate": 3.772435187236871e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.46067755818367007, + "step": 37455 + }, + { + "epoch": 0.03773010985646241, + "grad_norm": 13.860644732971673, + "learning_rate": 3.772938782909977e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.4172413766384125, + "step": 37460 + }, + { + "epoch": 0.037735145909566586, + "grad_norm": 14.26105951773389, + "learning_rate": 3.773442378583084e-05, + "loss": 2.4257, + "mean_token_accuracy": 0.4103448331356049, + "step": 37465 + }, + { + "epoch": 0.03774018196267076, + "grad_norm": 15.029553924801379, + "learning_rate": 3.77394597425619e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.3827586114406586, + "step": 37470 + }, + { + "epoch": 0.037745218015774934, + "grad_norm": 15.884297512686892, + "learning_rate": 3.7744495699292956e-05, + "loss": 2.8042, + "mean_token_accuracy": 0.38965516686439516, + "step": 37475 + }, + { + "epoch": 0.03775025406887911, + "grad_norm": 17.54698322303089, + "learning_rate": 3.774953165602401e-05, + "loss": 2.6781, + "mean_token_accuracy": 0.38620689511299133, + "step": 37480 + }, + { + "epoch": 0.03775529012198328, + "grad_norm": 15.242176840196654, + "learning_rate": 3.775456761275507e-05, + "loss": 2.1139, + "mean_token_accuracy": 0.44670296311378477, + "step": 37485 + }, + { + "epoch": 0.03776032617508745, + "grad_norm": 22.901440042691785, + "learning_rate": 3.7759603569486134e-05, + "loss": 2.7446, + "mean_token_accuracy": 0.39310344457626345, + "step": 37490 + }, + { + "epoch": 0.03776536222819162, + "grad_norm": 14.824067473452105, + "learning_rate": 3.776463952621719e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.3586206823587418, + "step": 37495 + }, + { + "epoch": 0.037770398281295796, + "grad_norm": 15.874462897133325, + "learning_rate": 3.776967548294825e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.41724138259887694, + "step": 37500 + }, + { + "epoch": 0.03777543433439997, + "grad_norm": 16.136880666348233, + "learning_rate": 3.777471143967931e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4137930989265442, + "step": 37505 + }, + { + "epoch": 0.03778047038750414, + "grad_norm": 23.498264828852093, + "learning_rate": 3.777974739641037e-05, + "loss": 2.8405, + "mean_token_accuracy": 0.3482758581638336, + "step": 37510 + }, + { + "epoch": 0.03778550644060832, + "grad_norm": 16.352136124326414, + "learning_rate": 3.778478335314144e-05, + "loss": 2.6462, + "mean_token_accuracy": 0.4103448331356049, + "step": 37515 + }, + { + "epoch": 0.03779054249371249, + "grad_norm": 13.824106704885109, + "learning_rate": 3.778981930987249e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.37241379022598264, + "step": 37520 + }, + { + "epoch": 0.03779557854681666, + "grad_norm": 18.44190869040901, + "learning_rate": 3.779485526660355e-05, + "loss": 2.6627, + "mean_token_accuracy": 0.36551723480224607, + "step": 37525 + }, + { + "epoch": 0.03780061459992083, + "grad_norm": 12.182179802825718, + "learning_rate": 3.779989122333461e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.47586206793785096, + "step": 37530 + }, + { + "epoch": 0.037805650653025005, + "grad_norm": 30.57724416519205, + "learning_rate": 3.780492718006567e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.44827585518360136, + "step": 37535 + }, + { + "epoch": 0.03781068670612918, + "grad_norm": 15.762529631656243, + "learning_rate": 3.7809963136796727e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.42758620977401735, + "step": 37540 + }, + { + "epoch": 0.03781572275923335, + "grad_norm": 13.101366316160087, + "learning_rate": 3.781499909352779e-05, + "loss": 1.9965, + "mean_token_accuracy": 0.4551724076271057, + "step": 37545 + }, + { + "epoch": 0.03782075881233753, + "grad_norm": 17.902825756778125, + "learning_rate": 3.782003505025885e-05, + "loss": 2.8482, + "mean_token_accuracy": 0.3896551728248596, + "step": 37550 + }, + { + "epoch": 0.0378257948654417, + "grad_norm": 17.788013136817227, + "learning_rate": 3.782507100698991e-05, + "loss": 2.5551, + "mean_token_accuracy": 0.38620689511299133, + "step": 37555 + }, + { + "epoch": 0.03783083091854587, + "grad_norm": 18.931728739740624, + "learning_rate": 3.783010696372097e-05, + "loss": 2.7915, + "mean_token_accuracy": 0.40344828367233276, + "step": 37560 + }, + { + "epoch": 0.03783586697165004, + "grad_norm": 16.888980661762105, + "learning_rate": 3.783514292045203e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.3931034505367279, + "step": 37565 + }, + { + "epoch": 0.037840903024754215, + "grad_norm": 12.659661308799716, + "learning_rate": 3.784017887718309e-05, + "loss": 2.298, + "mean_token_accuracy": 0.4517241358757019, + "step": 37570 + }, + { + "epoch": 0.03784593907785839, + "grad_norm": 14.696549667213176, + "learning_rate": 3.784521483391415e-05, + "loss": 2.6208, + "mean_token_accuracy": 0.37241379022598264, + "step": 37575 + }, + { + "epoch": 0.03785097513096256, + "grad_norm": 13.475020994118884, + "learning_rate": 3.785025079064521e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.4344827592372894, + "step": 37580 + }, + { + "epoch": 0.037856011184066736, + "grad_norm": 15.540453173611905, + "learning_rate": 3.785528674737627e-05, + "loss": 2.5261, + "mean_token_accuracy": 0.4448275864124298, + "step": 37585 + }, + { + "epoch": 0.03786104723717091, + "grad_norm": 13.720858918634693, + "learning_rate": 3.7860322704107326e-05, + "loss": 3.4132, + "mean_token_accuracy": 0.3896551787853241, + "step": 37590 + }, + { + "epoch": 0.03786608329027508, + "grad_norm": 18.110051051886447, + "learning_rate": 3.786535866083839e-05, + "loss": 2.5888, + "mean_token_accuracy": 0.36896551251411436, + "step": 37595 + }, + { + "epoch": 0.03787111934337925, + "grad_norm": 15.025557668487194, + "learning_rate": 3.787039461756945e-05, + "loss": 2.7147, + "mean_token_accuracy": 0.34482758641242983, + "step": 37600 + }, + { + "epoch": 0.037876155396483424, + "grad_norm": 15.881645737485643, + "learning_rate": 3.787543057430051e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.43448275327682495, + "step": 37605 + }, + { + "epoch": 0.0378811914495876, + "grad_norm": 16.244838333260162, + "learning_rate": 3.788046653103157e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.4188142716884613, + "step": 37610 + }, + { + "epoch": 0.03788622750269177, + "grad_norm": 14.142268602458111, + "learning_rate": 3.788550248776262e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.4448275864124298, + "step": 37615 + }, + { + "epoch": 0.037891263555795945, + "grad_norm": 17.428215889340503, + "learning_rate": 3.789053844449368e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.40562613010406495, + "step": 37620 + }, + { + "epoch": 0.03789629960890012, + "grad_norm": 15.416147142399966, + "learning_rate": 3.789557440122475e-05, + "loss": 2.8734, + "mean_token_accuracy": 0.38275861740112305, + "step": 37625 + }, + { + "epoch": 0.037901335662004286, + "grad_norm": 13.53705135546813, + "learning_rate": 3.790061035795581e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.37241379618644715, + "step": 37630 + }, + { + "epoch": 0.03790637171510846, + "grad_norm": 14.122650647766182, + "learning_rate": 3.7905646314686866e-05, + "loss": 2.8602, + "mean_token_accuracy": 0.41034482717514037, + "step": 37635 + }, + { + "epoch": 0.037911407768212634, + "grad_norm": 14.0121383662044, + "learning_rate": 3.7910682271417925e-05, + "loss": 2.3133, + "mean_token_accuracy": 0.44827585220336913, + "step": 37640 + }, + { + "epoch": 0.03791644382131681, + "grad_norm": 12.906907868383122, + "learning_rate": 3.7915718228148984e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.4, + "step": 37645 + }, + { + "epoch": 0.03792147987442098, + "grad_norm": 14.71865617752947, + "learning_rate": 3.792075418488005e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.44337567687034607, + "step": 37650 + }, + { + "epoch": 0.037926515927525155, + "grad_norm": 14.53414532772543, + "learning_rate": 3.79257901416111e-05, + "loss": 2.8254, + "mean_token_accuracy": 0.3793103516101837, + "step": 37655 + }, + { + "epoch": 0.03793155198062933, + "grad_norm": 17.960179634101433, + "learning_rate": 3.793082609834216e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.4137930929660797, + "step": 37660 + }, + { + "epoch": 0.037936588033733495, + "grad_norm": 14.880145906706671, + "learning_rate": 3.793586205507322e-05, + "loss": 2.3652, + "mean_token_accuracy": 0.40000001192092893, + "step": 37665 + }, + { + "epoch": 0.03794162408683767, + "grad_norm": 12.882325953347065, + "learning_rate": 3.794089801180428e-05, + "loss": 2.6547, + "mean_token_accuracy": 0.4206896543502808, + "step": 37670 + }, + { + "epoch": 0.03794666013994184, + "grad_norm": 16.75878623042873, + "learning_rate": 3.794593396853535e-05, + "loss": 2.9485, + "mean_token_accuracy": 0.34482758641242983, + "step": 37675 + }, + { + "epoch": 0.03795169619304602, + "grad_norm": 14.950645580552777, + "learning_rate": 3.7950969925266406e-05, + "loss": 2.7763, + "mean_token_accuracy": 0.38124622106552125, + "step": 37680 + }, + { + "epoch": 0.03795673224615019, + "grad_norm": 12.873083036364172, + "learning_rate": 3.7956005881997465e-05, + "loss": 2.7165, + "mean_token_accuracy": 0.4172413766384125, + "step": 37685 + }, + { + "epoch": 0.037961768299254364, + "grad_norm": 13.341979325639846, + "learning_rate": 3.7961041838728525e-05, + "loss": 2.449, + "mean_token_accuracy": 0.38275861740112305, + "step": 37690 + }, + { + "epoch": 0.03796680435235854, + "grad_norm": 13.637899203896987, + "learning_rate": 3.7966077795459584e-05, + "loss": 2.7396, + "mean_token_accuracy": 0.3655172407627106, + "step": 37695 + }, + { + "epoch": 0.037971840405462705, + "grad_norm": 14.474543480835859, + "learning_rate": 3.797111375219064e-05, + "loss": 2.9682, + "mean_token_accuracy": 0.31724137663841245, + "step": 37700 + }, + { + "epoch": 0.03797687645856688, + "grad_norm": 13.604619763214215, + "learning_rate": 3.79761497089217e-05, + "loss": 2.6191, + "mean_token_accuracy": 0.37241379618644715, + "step": 37705 + }, + { + "epoch": 0.03798191251167105, + "grad_norm": 12.890961451079653, + "learning_rate": 3.798118566565276e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.441379314661026, + "step": 37710 + }, + { + "epoch": 0.037986948564775226, + "grad_norm": 16.86646797943491, + "learning_rate": 3.798622162238382e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.4184728980064392, + "step": 37715 + }, + { + "epoch": 0.0379919846178794, + "grad_norm": 15.261238527802787, + "learning_rate": 3.799125757911488e-05, + "loss": 3.0867, + "mean_token_accuracy": 0.3172413736581802, + "step": 37720 + }, + { + "epoch": 0.037997020670983574, + "grad_norm": 12.858486166253133, + "learning_rate": 3.799629353584594e-05, + "loss": 2.5633, + "mean_token_accuracy": 0.4018148839473724, + "step": 37725 + }, + { + "epoch": 0.03800205672408775, + "grad_norm": 16.522694020114727, + "learning_rate": 3.8001329492577005e-05, + "loss": 2.7418, + "mean_token_accuracy": 0.38620689511299133, + "step": 37730 + }, + { + "epoch": 0.038007092777191914, + "grad_norm": 19.660380668550253, + "learning_rate": 3.8006365449308065e-05, + "loss": 2.4952, + "mean_token_accuracy": 0.44482758045196535, + "step": 37735 + }, + { + "epoch": 0.03801212883029609, + "grad_norm": 14.345364093379661, + "learning_rate": 3.8011401406039124e-05, + "loss": 2.4168, + "mean_token_accuracy": 0.4, + "step": 37740 + }, + { + "epoch": 0.03801716488340026, + "grad_norm": 15.248943692346172, + "learning_rate": 3.801643736277018e-05, + "loss": 2.9315, + "mean_token_accuracy": 0.3448275804519653, + "step": 37745 + }, + { + "epoch": 0.038022200936504436, + "grad_norm": 13.586911914243494, + "learning_rate": 3.8021473319501236e-05, + "loss": 2.4199, + "mean_token_accuracy": 0.43448275327682495, + "step": 37750 + }, + { + "epoch": 0.03802723698960861, + "grad_norm": 14.05078082131833, + "learning_rate": 3.80265092762323e-05, + "loss": 3.0982, + "mean_token_accuracy": 0.33103448152542114, + "step": 37755 + }, + { + "epoch": 0.03803227304271278, + "grad_norm": 21.276259145202413, + "learning_rate": 3.803154523296336e-05, + "loss": 2.7434, + "mean_token_accuracy": 0.3793103456497192, + "step": 37760 + }, + { + "epoch": 0.03803730909581696, + "grad_norm": 12.703429595678553, + "learning_rate": 3.803658118969442e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.403448274731636, + "step": 37765 + }, + { + "epoch": 0.038042345148921124, + "grad_norm": 14.28092880209114, + "learning_rate": 3.804161714642548e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.3551724135875702, + "step": 37770 + }, + { + "epoch": 0.0380473812020253, + "grad_norm": 13.229322818394166, + "learning_rate": 3.804665310315654e-05, + "loss": 2.656, + "mean_token_accuracy": 0.417241370677948, + "step": 37775 + }, + { + "epoch": 0.03805241725512947, + "grad_norm": 15.277840811207552, + "learning_rate": 3.8051689059887605e-05, + "loss": 2.1946, + "mean_token_accuracy": 0.4744101583957672, + "step": 37780 + }, + { + "epoch": 0.038057453308233645, + "grad_norm": 13.83828537259868, + "learning_rate": 3.8056725016618664e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.4413793087005615, + "step": 37785 + }, + { + "epoch": 0.03806248936133782, + "grad_norm": 16.748514017231923, + "learning_rate": 3.8061760973349716e-05, + "loss": 2.648, + "mean_token_accuracy": 0.4413793087005615, + "step": 37790 + }, + { + "epoch": 0.03806752541444199, + "grad_norm": 12.965549435518312, + "learning_rate": 3.8066796930080776e-05, + "loss": 2.2892, + "mean_token_accuracy": 0.4137930989265442, + "step": 37795 + }, + { + "epoch": 0.038072561467546166, + "grad_norm": 13.488241851475362, + "learning_rate": 3.8071832886811835e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.42068964838981626, + "step": 37800 + }, + { + "epoch": 0.03807759752065033, + "grad_norm": 15.05901351740915, + "learning_rate": 3.8076868843542894e-05, + "loss": 3.1551, + "mean_token_accuracy": 0.34137930870056155, + "step": 37805 + }, + { + "epoch": 0.03808263357375451, + "grad_norm": 14.649521441943392, + "learning_rate": 3.808190480027396e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.4241379380226135, + "step": 37810 + }, + { + "epoch": 0.03808766962685868, + "grad_norm": 16.791017205358198, + "learning_rate": 3.808694075700502e-05, + "loss": 3.095, + "mean_token_accuracy": 0.35862069129943847, + "step": 37815 + }, + { + "epoch": 0.038092705679962854, + "grad_norm": 11.666733752996919, + "learning_rate": 3.809197671373608e-05, + "loss": 2.2199, + "mean_token_accuracy": 0.4465214848518372, + "step": 37820 + }, + { + "epoch": 0.03809774173306703, + "grad_norm": 16.63830817275079, + "learning_rate": 3.809701267046714e-05, + "loss": 3.0882, + "mean_token_accuracy": 0.28620689213275907, + "step": 37825 + }, + { + "epoch": 0.0381027777861712, + "grad_norm": 17.471730260980532, + "learning_rate": 3.81020486271982e-05, + "loss": 3.1173, + "mean_token_accuracy": 0.35172413289546967, + "step": 37830 + }, + { + "epoch": 0.038107813839275376, + "grad_norm": 12.163262364675393, + "learning_rate": 3.8107084583929256e-05, + "loss": 2.3014, + "mean_token_accuracy": 0.44482758045196535, + "step": 37835 + }, + { + "epoch": 0.03811284989237954, + "grad_norm": 16.145911724976326, + "learning_rate": 3.8112120540660316e-05, + "loss": 2.7963, + "mean_token_accuracy": 0.3896551728248596, + "step": 37840 + }, + { + "epoch": 0.038117885945483716, + "grad_norm": 19.010040289595896, + "learning_rate": 3.8117156497391375e-05, + "loss": 2.9046, + "mean_token_accuracy": 0.32928009927272794, + "step": 37845 + }, + { + "epoch": 0.03812292199858789, + "grad_norm": 14.727536539950835, + "learning_rate": 3.8122192454122434e-05, + "loss": 2.3857, + "mean_token_accuracy": 0.43793103098869324, + "step": 37850 + }, + { + "epoch": 0.038127958051692064, + "grad_norm": 13.411638013748485, + "learning_rate": 3.8127228410853493e-05, + "loss": 2.3252, + "mean_token_accuracy": 0.42413792610168455, + "step": 37855 + }, + { + "epoch": 0.03813299410479624, + "grad_norm": 16.2483222159373, + "learning_rate": 3.813226436758456e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.37241379618644715, + "step": 37860 + }, + { + "epoch": 0.03813803015790041, + "grad_norm": 22.356008089113793, + "learning_rate": 3.813730032431562e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.3482758581638336, + "step": 37865 + }, + { + "epoch": 0.038143066211004585, + "grad_norm": 23.040959417855625, + "learning_rate": 3.814233628104668e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.3999999940395355, + "step": 37870 + }, + { + "epoch": 0.03814810226410875, + "grad_norm": 14.231902972127429, + "learning_rate": 3.814737223777774e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.3965517282485962, + "step": 37875 + }, + { + "epoch": 0.038153138317212926, + "grad_norm": 12.251146197870462, + "learning_rate": 3.815240819450879e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.38275861740112305, + "step": 37880 + }, + { + "epoch": 0.0381581743703171, + "grad_norm": 16.343568896723845, + "learning_rate": 3.815744415123985e-05, + "loss": 2.6235, + "mean_token_accuracy": 0.3862069010734558, + "step": 37885 + }, + { + "epoch": 0.03816321042342127, + "grad_norm": 12.614292507245805, + "learning_rate": 3.8162480107970915e-05, + "loss": 2.4736, + "mean_token_accuracy": 0.4034482777118683, + "step": 37890 + }, + { + "epoch": 0.03816824647652545, + "grad_norm": 17.410284974319527, + "learning_rate": 3.8167516064701974e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.42232305407524107, + "step": 37895 + }, + { + "epoch": 0.03817328252962962, + "grad_norm": 15.264322979936674, + "learning_rate": 3.8172552021433033e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.4413793087005615, + "step": 37900 + }, + { + "epoch": 0.038178318582733795, + "grad_norm": 11.33407289792448, + "learning_rate": 3.817758797816409e-05, + "loss": 2.398, + "mean_token_accuracy": 0.47047791481018064, + "step": 37905 + }, + { + "epoch": 0.03818335463583796, + "grad_norm": 18.236389684464193, + "learning_rate": 3.818262393489515e-05, + "loss": 2.8812, + "mean_token_accuracy": 0.36896551549434664, + "step": 37910 + }, + { + "epoch": 0.038188390688942135, + "grad_norm": 13.055690610454949, + "learning_rate": 3.818765989162622e-05, + "loss": 2.4693, + "mean_token_accuracy": 0.4310344815254211, + "step": 37915 + }, + { + "epoch": 0.03819342674204631, + "grad_norm": 14.602818689444494, + "learning_rate": 3.819269584835728e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.4592364549636841, + "step": 37920 + }, + { + "epoch": 0.03819846279515048, + "grad_norm": 16.860791310310088, + "learning_rate": 3.819773180508833e-05, + "loss": 2.5999, + "mean_token_accuracy": 0.36206896901130675, + "step": 37925 + }, + { + "epoch": 0.038203498848254656, + "grad_norm": 17.676588156578468, + "learning_rate": 3.820276776181939e-05, + "loss": 2.662, + "mean_token_accuracy": 0.3965517282485962, + "step": 37930 + }, + { + "epoch": 0.03820853490135883, + "grad_norm": 14.424592910128728, + "learning_rate": 3.820780371855045e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.38965516686439516, + "step": 37935 + }, + { + "epoch": 0.038213570954463004, + "grad_norm": 14.465523532596922, + "learning_rate": 3.8212839675281514e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.39999999701976774, + "step": 37940 + }, + { + "epoch": 0.03821860700756717, + "grad_norm": 17.526842622394312, + "learning_rate": 3.8217875632012574e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.4206896543502808, + "step": 37945 + }, + { + "epoch": 0.038223643060671345, + "grad_norm": 13.429102939720687, + "learning_rate": 3.822291158874363e-05, + "loss": 2.4583, + "mean_token_accuracy": 0.4310344815254211, + "step": 37950 + }, + { + "epoch": 0.03822867911377552, + "grad_norm": 14.515700993707537, + "learning_rate": 3.822794754547469e-05, + "loss": 2.4842, + "mean_token_accuracy": 0.39310343861579894, + "step": 37955 + }, + { + "epoch": 0.03823371516687969, + "grad_norm": 13.258251571760692, + "learning_rate": 3.823298350220575e-05, + "loss": 2.8482, + "mean_token_accuracy": 0.3344827651977539, + "step": 37960 + }, + { + "epoch": 0.038238751219983866, + "grad_norm": 12.469689986103063, + "learning_rate": 3.823801945893681e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.4103448212146759, + "step": 37965 + }, + { + "epoch": 0.03824378727308804, + "grad_norm": 16.833768074662025, + "learning_rate": 3.824305541566787e-05, + "loss": 3.2654, + "mean_token_accuracy": 0.36896551251411436, + "step": 37970 + }, + { + "epoch": 0.03824882332619221, + "grad_norm": 29.877819259812252, + "learning_rate": 3.824809137239893e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.4088324248790741, + "step": 37975 + }, + { + "epoch": 0.03825385937929638, + "grad_norm": 19.569854120700658, + "learning_rate": 3.825312732912999e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.42413793206214906, + "step": 37980 + }, + { + "epoch": 0.038258895432400554, + "grad_norm": 14.168327821474506, + "learning_rate": 3.825816328586105e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.3689655244350433, + "step": 37985 + }, + { + "epoch": 0.03826393148550473, + "grad_norm": 15.395164860575624, + "learning_rate": 3.826319924259211e-05, + "loss": 2.2982, + "mean_token_accuracy": 0.43448275327682495, + "step": 37990 + }, + { + "epoch": 0.0382689675386089, + "grad_norm": 16.82363297383371, + "learning_rate": 3.826823519932317e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.36896551251411436, + "step": 37995 + }, + { + "epoch": 0.038274003591713075, + "grad_norm": 15.208807175783207, + "learning_rate": 3.827327115605423e-05, + "loss": 2.8182, + "mean_token_accuracy": 0.35862068831920624, + "step": 38000 + }, + { + "epoch": 0.03827903964481725, + "grad_norm": 10.16155184098454, + "learning_rate": 3.827830711278529e-05, + "loss": 2.3621, + "mean_token_accuracy": 0.41834974884986875, + "step": 38005 + }, + { + "epoch": 0.03828407569792142, + "grad_norm": 14.532251019456645, + "learning_rate": 3.828334306951635e-05, + "loss": 2.7053, + "mean_token_accuracy": 0.37241379618644715, + "step": 38010 + }, + { + "epoch": 0.03828911175102559, + "grad_norm": 20.829843655670896, + "learning_rate": 3.82883790262474e-05, + "loss": 2.8277, + "mean_token_accuracy": 0.3827586114406586, + "step": 38015 + }, + { + "epoch": 0.03829414780412976, + "grad_norm": 15.385980298208624, + "learning_rate": 3.829341498297847e-05, + "loss": 2.3331, + "mean_token_accuracy": 0.4566848039627075, + "step": 38020 + }, + { + "epoch": 0.03829918385723394, + "grad_norm": 13.378946118091237, + "learning_rate": 3.829845093970953e-05, + "loss": 2.7543, + "mean_token_accuracy": 0.3793103456497192, + "step": 38025 + }, + { + "epoch": 0.03830421991033811, + "grad_norm": 14.957470681282642, + "learning_rate": 3.830348689644059e-05, + "loss": 2.5885, + "mean_token_accuracy": 0.3413793116807938, + "step": 38030 + }, + { + "epoch": 0.038309255963442285, + "grad_norm": 13.276386945986111, + "learning_rate": 3.830852285317165e-05, + "loss": 2.532, + "mean_token_accuracy": 0.42758620381355283, + "step": 38035 + }, + { + "epoch": 0.03831429201654646, + "grad_norm": 18.05517090181714, + "learning_rate": 3.8313558809902706e-05, + "loss": 2.5982, + "mean_token_accuracy": 0.38100423812866213, + "step": 38040 + }, + { + "epoch": 0.03831932806965063, + "grad_norm": 13.84326636495037, + "learning_rate": 3.831859476663377e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.41034482717514037, + "step": 38045 + }, + { + "epoch": 0.0383243641227548, + "grad_norm": 15.37424037919201, + "learning_rate": 3.832363072336483e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.41034482717514037, + "step": 38050 + }, + { + "epoch": 0.03832940017585897, + "grad_norm": 19.149411362041537, + "learning_rate": 3.8328666680095884e-05, + "loss": 2.8354, + "mean_token_accuracy": 0.36551723778247835, + "step": 38055 + }, + { + "epoch": 0.03833443622896315, + "grad_norm": 22.01478028517961, + "learning_rate": 3.833370263682694e-05, + "loss": 2.7823, + "mean_token_accuracy": 0.37755596041679385, + "step": 38060 + }, + { + "epoch": 0.03833947228206732, + "grad_norm": 12.696778283793972, + "learning_rate": 3.8338738593558e-05, + "loss": 2.6861, + "mean_token_accuracy": 0.36551724672317504, + "step": 38065 + }, + { + "epoch": 0.038344508335171494, + "grad_norm": 14.099033853751777, + "learning_rate": 3.834377455028906e-05, + "loss": 3.0078, + "mean_token_accuracy": 0.34482758641242983, + "step": 38070 + }, + { + "epoch": 0.03834954438827567, + "grad_norm": 16.4955314916322, + "learning_rate": 3.834881050702013e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.36896551847457887, + "step": 38075 + }, + { + "epoch": 0.03835458044137984, + "grad_norm": 14.968538978818405, + "learning_rate": 3.835384646375119e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.4068965554237366, + "step": 38080 + }, + { + "epoch": 0.03835961649448401, + "grad_norm": 13.870731479022009, + "learning_rate": 3.8358882420482246e-05, + "loss": 2.4284, + "mean_token_accuracy": 0.40689654350280763, + "step": 38085 + }, + { + "epoch": 0.03836465254758818, + "grad_norm": 14.66610542187267, + "learning_rate": 3.8363918377213305e-05, + "loss": 2.3999, + "mean_token_accuracy": 0.42068966031074523, + "step": 38090 + }, + { + "epoch": 0.038369688600692356, + "grad_norm": 14.566716728432645, + "learning_rate": 3.8368954333944365e-05, + "loss": 2.4935, + "mean_token_accuracy": 0.4310344815254211, + "step": 38095 + }, + { + "epoch": 0.03837472465379653, + "grad_norm": 26.088833328390997, + "learning_rate": 3.8373990290675424e-05, + "loss": 2.5664, + "mean_token_accuracy": 0.4448275864124298, + "step": 38100 + }, + { + "epoch": 0.038379760706900704, + "grad_norm": 15.749096942871624, + "learning_rate": 3.837902624740648e-05, + "loss": 2.2186, + "mean_token_accuracy": 0.44482758045196535, + "step": 38105 + }, + { + "epoch": 0.03838479676000488, + "grad_norm": 14.358632283239768, + "learning_rate": 3.838406220413754e-05, + "loss": 2.339, + "mean_token_accuracy": 0.4310344815254211, + "step": 38110 + }, + { + "epoch": 0.03838983281310905, + "grad_norm": 14.259182207545305, + "learning_rate": 3.83890981608686e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.4310344815254211, + "step": 38115 + }, + { + "epoch": 0.03839486886621322, + "grad_norm": 12.627322962958067, + "learning_rate": 3.839413411759966e-05, + "loss": 2.8337, + "mean_token_accuracy": 0.35862068831920624, + "step": 38120 + }, + { + "epoch": 0.03839990491931739, + "grad_norm": 14.694656489401094, + "learning_rate": 3.839917007433073e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.482758617401123, + "step": 38125 + }, + { + "epoch": 0.038404940972421565, + "grad_norm": 13.25566767324547, + "learning_rate": 3.8404206031061786e-05, + "loss": 2.5515, + "mean_token_accuracy": 0.39310344457626345, + "step": 38130 + }, + { + "epoch": 0.03840997702552574, + "grad_norm": 13.66502754784551, + "learning_rate": 3.8409241987792845e-05, + "loss": 2.2646, + "mean_token_accuracy": 0.42413793206214906, + "step": 38135 + }, + { + "epoch": 0.03841501307862991, + "grad_norm": 14.138770750457414, + "learning_rate": 3.8414277944523905e-05, + "loss": 2.4893, + "mean_token_accuracy": 0.4517241418361664, + "step": 38140 + }, + { + "epoch": 0.03842004913173409, + "grad_norm": 13.672508425981114, + "learning_rate": 3.8419313901254964e-05, + "loss": 2.6837, + "mean_token_accuracy": 0.324137932062149, + "step": 38145 + }, + { + "epoch": 0.03842508518483826, + "grad_norm": 14.361431082391956, + "learning_rate": 3.8424349857986016e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.42068966031074523, + "step": 38150 + }, + { + "epoch": 0.03843012123794243, + "grad_norm": 13.759563307405394, + "learning_rate": 3.842938581471708e-05, + "loss": 2.6316, + "mean_token_accuracy": 0.4185722887516022, + "step": 38155 + }, + { + "epoch": 0.0384351572910466, + "grad_norm": 13.97845361877045, + "learning_rate": 3.843442177144814e-05, + "loss": 2.7524, + "mean_token_accuracy": 0.33793103098869326, + "step": 38160 + }, + { + "epoch": 0.038440193344150775, + "grad_norm": 14.263126126280985, + "learning_rate": 3.84394577281792e-05, + "loss": 2.117, + "mean_token_accuracy": 0.47586206197738645, + "step": 38165 + }, + { + "epoch": 0.03844522939725495, + "grad_norm": 15.117477851425155, + "learning_rate": 3.844449368491026e-05, + "loss": 2.524, + "mean_token_accuracy": 0.4, + "step": 38170 + }, + { + "epoch": 0.03845026545035912, + "grad_norm": 24.503855781523896, + "learning_rate": 3.844952964164132e-05, + "loss": 2.8655, + "mean_token_accuracy": 0.37586206793785093, + "step": 38175 + }, + { + "epoch": 0.038455301503463296, + "grad_norm": 18.85976483675346, + "learning_rate": 3.8454565598372386e-05, + "loss": 2.4495, + "mean_token_accuracy": 0.39310344457626345, + "step": 38180 + }, + { + "epoch": 0.03846033755656747, + "grad_norm": 15.754784630369839, + "learning_rate": 3.8459601555103445e-05, + "loss": 2.5747, + "mean_token_accuracy": 0.39310344159603117, + "step": 38185 + }, + { + "epoch": 0.03846537360967164, + "grad_norm": 14.74041618737717, + "learning_rate": 3.84646375118345e-05, + "loss": 2.6815, + "mean_token_accuracy": 0.3862069010734558, + "step": 38190 + }, + { + "epoch": 0.03847040966277581, + "grad_norm": 15.490033775843019, + "learning_rate": 3.8469673468565556e-05, + "loss": 2.536, + "mean_token_accuracy": 0.4344827592372894, + "step": 38195 + }, + { + "epoch": 0.038475445715879984, + "grad_norm": 11.799882722567173, + "learning_rate": 3.8474709425296616e-05, + "loss": 2.3958, + "mean_token_accuracy": 0.42068966031074523, + "step": 38200 + }, + { + "epoch": 0.03848048176898416, + "grad_norm": 19.89391963915202, + "learning_rate": 3.847974538202768e-05, + "loss": 2.9041, + "mean_token_accuracy": 0.38620689511299133, + "step": 38205 + }, + { + "epoch": 0.03848551782208833, + "grad_norm": 13.99630180117272, + "learning_rate": 3.848478133875874e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.4296430706977844, + "step": 38210 + }, + { + "epoch": 0.038490553875192506, + "grad_norm": 16.147191691371855, + "learning_rate": 3.84898172954898e-05, + "loss": 2.4556, + "mean_token_accuracy": 0.4068965554237366, + "step": 38215 + }, + { + "epoch": 0.03849558992829668, + "grad_norm": 14.0017089247059, + "learning_rate": 3.849485325222086e-05, + "loss": 3.0551, + "mean_token_accuracy": 0.3413793116807938, + "step": 38220 + }, + { + "epoch": 0.038500625981400846, + "grad_norm": 15.105778679561979, + "learning_rate": 3.849988920895192e-05, + "loss": 2.4309, + "mean_token_accuracy": 0.4103448212146759, + "step": 38225 + }, + { + "epoch": 0.03850566203450502, + "grad_norm": 14.903483998605717, + "learning_rate": 3.850492516568298e-05, + "loss": 2.1723, + "mean_token_accuracy": 0.44137930274009707, + "step": 38230 + }, + { + "epoch": 0.038510698087609194, + "grad_norm": 15.338867428282827, + "learning_rate": 3.850996112241404e-05, + "loss": 2.5887, + "mean_token_accuracy": 0.3662561535835266, + "step": 38235 + }, + { + "epoch": 0.03851573414071337, + "grad_norm": 11.924206404533528, + "learning_rate": 3.8514997079145097e-05, + "loss": 2.186, + "mean_token_accuracy": 0.42758620977401735, + "step": 38240 + }, + { + "epoch": 0.03852077019381754, + "grad_norm": 19.98645033335937, + "learning_rate": 3.8520033035876156e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.3896551787853241, + "step": 38245 + }, + { + "epoch": 0.038525806246921715, + "grad_norm": 14.201366781945191, + "learning_rate": 3.8525068992607215e-05, + "loss": 2.7904, + "mean_token_accuracy": 0.43103448748588563, + "step": 38250 + }, + { + "epoch": 0.03853084230002589, + "grad_norm": 13.822016067985869, + "learning_rate": 3.8530104949338274e-05, + "loss": 2.7829, + "mean_token_accuracy": 0.31724137663841245, + "step": 38255 + }, + { + "epoch": 0.038535878353130056, + "grad_norm": 13.093084251038144, + "learning_rate": 3.853514090606934e-05, + "loss": 3.0322, + "mean_token_accuracy": 0.3206896483898163, + "step": 38260 + }, + { + "epoch": 0.03854091440623423, + "grad_norm": 12.687391736898395, + "learning_rate": 3.85401768628004e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.42068966031074523, + "step": 38265 + }, + { + "epoch": 0.0385459504593384, + "grad_norm": 14.614551598448754, + "learning_rate": 3.854521281953146e-05, + "loss": 2.5805, + "mean_token_accuracy": 0.3551724076271057, + "step": 38270 + }, + { + "epoch": 0.03855098651244258, + "grad_norm": 14.503808518369269, + "learning_rate": 3.855024877626252e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.34482758343219755, + "step": 38275 + }, + { + "epoch": 0.03855602256554675, + "grad_norm": 21.50399550649558, + "learning_rate": 3.855528473299358e-05, + "loss": 2.9316, + "mean_token_accuracy": 0.35862069129943847, + "step": 38280 + }, + { + "epoch": 0.038561058618650924, + "grad_norm": 13.787310494450276, + "learning_rate": 3.8560320689724637e-05, + "loss": 2.8589, + "mean_token_accuracy": 0.3482758581638336, + "step": 38285 + }, + { + "epoch": 0.0385660946717551, + "grad_norm": 21.51656683130556, + "learning_rate": 3.8565356646455696e-05, + "loss": 2.9416, + "mean_token_accuracy": 0.36896551251411436, + "step": 38290 + }, + { + "epoch": 0.038571130724859265, + "grad_norm": 16.036620720865905, + "learning_rate": 3.8570392603186755e-05, + "loss": 2.8207, + "mean_token_accuracy": 0.38784028887748717, + "step": 38295 + }, + { + "epoch": 0.03857616677796344, + "grad_norm": 13.414725476163719, + "learning_rate": 3.8575428559917814e-05, + "loss": 2.8088, + "mean_token_accuracy": 0.38275861740112305, + "step": 38300 + }, + { + "epoch": 0.03858120283106761, + "grad_norm": 13.735236926175432, + "learning_rate": 3.8580464516648874e-05, + "loss": 2.7301, + "mean_token_accuracy": 0.38620689511299133, + "step": 38305 + }, + { + "epoch": 0.038586238884171786, + "grad_norm": 12.937496294003896, + "learning_rate": 3.858550047337993e-05, + "loss": 2.6606, + "mean_token_accuracy": 0.4, + "step": 38310 + }, + { + "epoch": 0.03859127493727596, + "grad_norm": 15.802319890417898, + "learning_rate": 3.8590536430111e-05, + "loss": 2.7987, + "mean_token_accuracy": 0.3551724135875702, + "step": 38315 + }, + { + "epoch": 0.038596310990380134, + "grad_norm": 13.50307738162297, + "learning_rate": 3.859557238684206e-05, + "loss": 2.4511, + "mean_token_accuracy": 0.4482758641242981, + "step": 38320 + }, + { + "epoch": 0.03860134704348431, + "grad_norm": 15.050724093542225, + "learning_rate": 3.860060834357311e-05, + "loss": 2.6892, + "mean_token_accuracy": 0.3689655065536499, + "step": 38325 + }, + { + "epoch": 0.038606383096588474, + "grad_norm": 15.342623046038343, + "learning_rate": 3.860564430030417e-05, + "loss": 2.402, + "mean_token_accuracy": 0.4172413766384125, + "step": 38330 + }, + { + "epoch": 0.03861141914969265, + "grad_norm": 12.287894148491352, + "learning_rate": 3.861068025703523e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.38965516090393065, + "step": 38335 + }, + { + "epoch": 0.03861645520279682, + "grad_norm": 12.951531416883368, + "learning_rate": 3.8615716213766295e-05, + "loss": 2.4105, + "mean_token_accuracy": 0.4137930989265442, + "step": 38340 + }, + { + "epoch": 0.038621491255900996, + "grad_norm": 17.00901406882622, + "learning_rate": 3.8620752170497354e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.3517241358757019, + "step": 38345 + }, + { + "epoch": 0.03862652730900517, + "grad_norm": 15.691564248157894, + "learning_rate": 3.8625788127228414e-05, + "loss": 2.5077, + "mean_token_accuracy": 0.42643678188323975, + "step": 38350 + }, + { + "epoch": 0.03863156336210934, + "grad_norm": 17.140350545218983, + "learning_rate": 3.863082408395947e-05, + "loss": 2.6344, + "mean_token_accuracy": 0.3896551728248596, + "step": 38355 + }, + { + "epoch": 0.03863659941521352, + "grad_norm": 13.81575954331368, + "learning_rate": 3.863586004069053e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4344827592372894, + "step": 38360 + }, + { + "epoch": 0.038641635468317684, + "grad_norm": 15.708941526422842, + "learning_rate": 3.864089599742159e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.4379310369491577, + "step": 38365 + }, + { + "epoch": 0.03864667152142186, + "grad_norm": 13.503567091318997, + "learning_rate": 3.864593195415265e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.41379310488700866, + "step": 38370 + }, + { + "epoch": 0.03865170757452603, + "grad_norm": 14.337966375308937, + "learning_rate": 3.865096791088371e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.3827586233615875, + "step": 38375 + }, + { + "epoch": 0.038656743627630205, + "grad_norm": 17.012538545446787, + "learning_rate": 3.865600386761477e-05, + "loss": 2.6393, + "mean_token_accuracy": 0.37755596041679385, + "step": 38380 + }, + { + "epoch": 0.03866177968073438, + "grad_norm": 13.001729921080834, + "learning_rate": 3.866103982434583e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.3931034505367279, + "step": 38385 + }, + { + "epoch": 0.03866681573383855, + "grad_norm": 17.07054699206022, + "learning_rate": 3.866607578107689e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.42758620381355283, + "step": 38390 + }, + { + "epoch": 0.038671851786942726, + "grad_norm": 12.111233371820354, + "learning_rate": 3.8671111737807954e-05, + "loss": 2.7196, + "mean_token_accuracy": 0.3241379290819168, + "step": 38395 + }, + { + "epoch": 0.03867688784004689, + "grad_norm": 12.117422535933377, + "learning_rate": 3.867614769453901e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.4428312063217163, + "step": 38400 + }, + { + "epoch": 0.03868192389315107, + "grad_norm": 19.069326130729035, + "learning_rate": 3.868118365127007e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.43103447556495667, + "step": 38405 + }, + { + "epoch": 0.03868695994625524, + "grad_norm": 12.372257511405454, + "learning_rate": 3.868621960800113e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.4241379380226135, + "step": 38410 + }, + { + "epoch": 0.038691995999359415, + "grad_norm": 16.10223006243153, + "learning_rate": 3.8691255564732184e-05, + "loss": 2.8531, + "mean_token_accuracy": 0.3310344785451889, + "step": 38415 + }, + { + "epoch": 0.03869703205246359, + "grad_norm": 13.03977126188214, + "learning_rate": 3.869629152146325e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.40344826579093934, + "step": 38420 + }, + { + "epoch": 0.03870206810556776, + "grad_norm": 13.234773980211228, + "learning_rate": 3.870132747819431e-05, + "loss": 2.7245, + "mean_token_accuracy": 0.3482758641242981, + "step": 38425 + }, + { + "epoch": 0.038707104158671936, + "grad_norm": 13.895064033941393, + "learning_rate": 3.870636343492537e-05, + "loss": 2.5547, + "mean_token_accuracy": 0.35862069129943847, + "step": 38430 + }, + { + "epoch": 0.0387121402117761, + "grad_norm": 15.122056472264191, + "learning_rate": 3.871139939165643e-05, + "loss": 2.5987, + "mean_token_accuracy": 0.4188142716884613, + "step": 38435 + }, + { + "epoch": 0.038717176264880276, + "grad_norm": 13.610495687546567, + "learning_rate": 3.871643534838749e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.4068965554237366, + "step": 38440 + }, + { + "epoch": 0.03872221231798445, + "grad_norm": 17.14199740429114, + "learning_rate": 3.872147130511855e-05, + "loss": 2.4604, + "mean_token_accuracy": 0.4295220851898193, + "step": 38445 + }, + { + "epoch": 0.038727248371088624, + "grad_norm": 17.654721704807155, + "learning_rate": 3.872650726184961e-05, + "loss": 2.7951, + "mean_token_accuracy": 0.36206896901130675, + "step": 38450 + }, + { + "epoch": 0.0387322844241928, + "grad_norm": 12.92812197832319, + "learning_rate": 3.873154321858067e-05, + "loss": 2.2783, + "mean_token_accuracy": 0.42413793206214906, + "step": 38455 + }, + { + "epoch": 0.03873732047729697, + "grad_norm": 13.32601949336529, + "learning_rate": 3.8736579175311724e-05, + "loss": 2.613, + "mean_token_accuracy": 0.37931033968925476, + "step": 38460 + }, + { + "epoch": 0.038742356530401145, + "grad_norm": 20.302579900468036, + "learning_rate": 3.874161513204278e-05, + "loss": 2.9831, + "mean_token_accuracy": 0.3896551728248596, + "step": 38465 + }, + { + "epoch": 0.03874739258350531, + "grad_norm": 19.702416163403022, + "learning_rate": 3.874665108877385e-05, + "loss": 3.0987, + "mean_token_accuracy": 0.324137932062149, + "step": 38470 + }, + { + "epoch": 0.038752428636609486, + "grad_norm": 15.047604190283767, + "learning_rate": 3.875168704550491e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.41724138259887694, + "step": 38475 + }, + { + "epoch": 0.03875746468971366, + "grad_norm": 17.461245736093673, + "learning_rate": 3.875672300223597e-05, + "loss": 2.8901, + "mean_token_accuracy": 0.41379311084747317, + "step": 38480 + }, + { + "epoch": 0.03876250074281783, + "grad_norm": 14.793532148710232, + "learning_rate": 3.876175895896703e-05, + "loss": 2.8375, + "mean_token_accuracy": 0.3965517282485962, + "step": 38485 + }, + { + "epoch": 0.03876753679592201, + "grad_norm": 13.94459817380488, + "learning_rate": 3.8766794915698086e-05, + "loss": 2.4298, + "mean_token_accuracy": 0.40508166551589964, + "step": 38490 + }, + { + "epoch": 0.03877257284902618, + "grad_norm": 16.015068346315935, + "learning_rate": 3.8771830872429146e-05, + "loss": 2.8024, + "mean_token_accuracy": 0.37586206793785093, + "step": 38495 + }, + { + "epoch": 0.03877760890213035, + "grad_norm": 28.052871323987514, + "learning_rate": 3.8776866829160205e-05, + "loss": 2.8827, + "mean_token_accuracy": 0.3655172437429428, + "step": 38500 + }, + { + "epoch": 0.03878264495523452, + "grad_norm": 14.451302722836752, + "learning_rate": 3.8781902785891264e-05, + "loss": 2.8041, + "mean_token_accuracy": 0.37241379618644715, + "step": 38505 + }, + { + "epoch": 0.038787681008338695, + "grad_norm": 16.237444178506518, + "learning_rate": 3.878693874262232e-05, + "loss": 2.6191, + "mean_token_accuracy": 0.3413793116807938, + "step": 38510 + }, + { + "epoch": 0.03879271706144287, + "grad_norm": 13.54112577823189, + "learning_rate": 3.879197469935338e-05, + "loss": 2.6762, + "mean_token_accuracy": 0.39310344457626345, + "step": 38515 + }, + { + "epoch": 0.03879775311454704, + "grad_norm": 13.254402656175895, + "learning_rate": 3.879701065608444e-05, + "loss": 2.5798, + "mean_token_accuracy": 0.4068965554237366, + "step": 38520 + }, + { + "epoch": 0.03880278916765122, + "grad_norm": 12.511698364984422, + "learning_rate": 3.880204661281551e-05, + "loss": 2.683, + "mean_token_accuracy": 0.40344826579093934, + "step": 38525 + }, + { + "epoch": 0.03880782522075539, + "grad_norm": 17.68335189930687, + "learning_rate": 3.880708256954657e-05, + "loss": 2.6896, + "mean_token_accuracy": 0.3793103516101837, + "step": 38530 + }, + { + "epoch": 0.03881286127385956, + "grad_norm": 13.259262737933796, + "learning_rate": 3.8812118526277626e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.39655172228813174, + "step": 38535 + }, + { + "epoch": 0.03881789732696373, + "grad_norm": 12.791777285225903, + "learning_rate": 3.8817154483008686e-05, + "loss": 2.3641, + "mean_token_accuracy": 0.40689654350280763, + "step": 38540 + }, + { + "epoch": 0.038822933380067905, + "grad_norm": 17.316074796847175, + "learning_rate": 3.8822190439739745e-05, + "loss": 2.7057, + "mean_token_accuracy": 0.4206896543502808, + "step": 38545 + }, + { + "epoch": 0.03882796943317208, + "grad_norm": 20.112405905653954, + "learning_rate": 3.8827226396470804e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.47586206197738645, + "step": 38550 + }, + { + "epoch": 0.03883300548627625, + "grad_norm": 14.511426742591292, + "learning_rate": 3.883226235320186e-05, + "loss": 2.9229, + "mean_token_accuracy": 0.39443435668945315, + "step": 38555 + }, + { + "epoch": 0.038838041539380426, + "grad_norm": 15.875742211139539, + "learning_rate": 3.883729830993292e-05, + "loss": 2.6791, + "mean_token_accuracy": 0.35862069129943847, + "step": 38560 + }, + { + "epoch": 0.0388430775924846, + "grad_norm": 14.829180220728038, + "learning_rate": 3.884233426666398e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.37931033968925476, + "step": 38565 + }, + { + "epoch": 0.03884811364558877, + "grad_norm": 12.364730572976033, + "learning_rate": 3.884737022339504e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.41034482717514037, + "step": 38570 + }, + { + "epoch": 0.03885314969869294, + "grad_norm": 14.087908530494651, + "learning_rate": 3.88524061801261e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.37931033968925476, + "step": 38575 + }, + { + "epoch": 0.038858185751797114, + "grad_norm": 16.571447705206264, + "learning_rate": 3.8857442136857166e-05, + "loss": 2.336, + "mean_token_accuracy": 0.44482757449150084, + "step": 38580 + }, + { + "epoch": 0.03886322180490129, + "grad_norm": 13.680760293574012, + "learning_rate": 3.8862478093588226e-05, + "loss": 2.8325, + "mean_token_accuracy": 0.36896551251411436, + "step": 38585 + }, + { + "epoch": 0.03886825785800546, + "grad_norm": 14.890020421915905, + "learning_rate": 3.886751405031928e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.41724138855934145, + "step": 38590 + }, + { + "epoch": 0.038873293911109635, + "grad_norm": 17.447794546387453, + "learning_rate": 3.887255000705034e-05, + "loss": 3.1763, + "mean_token_accuracy": 0.3793103456497192, + "step": 38595 + }, + { + "epoch": 0.03887832996421381, + "grad_norm": 12.780789311644094, + "learning_rate": 3.88775859637814e-05, + "loss": 2.2954, + "mean_token_accuracy": 0.3965517282485962, + "step": 38600 + }, + { + "epoch": 0.038883366017317976, + "grad_norm": 11.908152044690178, + "learning_rate": 3.888262192051246e-05, + "loss": 2.6285, + "mean_token_accuracy": 0.43793103098869324, + "step": 38605 + }, + { + "epoch": 0.03888840207042215, + "grad_norm": 19.272278434805227, + "learning_rate": 3.888765787724352e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.419252872467041, + "step": 38610 + }, + { + "epoch": 0.038893438123526324, + "grad_norm": 18.916175007153512, + "learning_rate": 3.889269383397458e-05, + "loss": 2.334, + "mean_token_accuracy": 0.46896551847457885, + "step": 38615 + }, + { + "epoch": 0.0388984741766305, + "grad_norm": 15.437027348007515, + "learning_rate": 3.889772979070564e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.37241379618644715, + "step": 38620 + }, + { + "epoch": 0.03890351022973467, + "grad_norm": 14.580554486029213, + "learning_rate": 3.89027657474367e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.3862069010734558, + "step": 38625 + }, + { + "epoch": 0.038908546282838845, + "grad_norm": 11.135602105148733, + "learning_rate": 3.890780170416776e-05, + "loss": 2.0744, + "mean_token_accuracy": 0.49165154099464414, + "step": 38630 + }, + { + "epoch": 0.03891358233594302, + "grad_norm": 21.541042813984255, + "learning_rate": 3.891283766089882e-05, + "loss": 3.0318, + "mean_token_accuracy": 0.34137930870056155, + "step": 38635 + }, + { + "epoch": 0.038918618389047185, + "grad_norm": 13.575323253365715, + "learning_rate": 3.891787361762988e-05, + "loss": 2.4308, + "mean_token_accuracy": 0.4068965554237366, + "step": 38640 + }, + { + "epoch": 0.03892365444215136, + "grad_norm": 29.619872340594796, + "learning_rate": 3.892290957436094e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.4448275864124298, + "step": 38645 + }, + { + "epoch": 0.03892869049525553, + "grad_norm": 17.214147013841465, + "learning_rate": 3.8927945531091996e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.4, + "step": 38650 + }, + { + "epoch": 0.03893372654835971, + "grad_norm": 15.809873276925861, + "learning_rate": 3.8932981487823055e-05, + "loss": 2.553, + "mean_token_accuracy": 0.4034482717514038, + "step": 38655 + }, + { + "epoch": 0.03893876260146388, + "grad_norm": 19.529083004269495, + "learning_rate": 3.893801744455412e-05, + "loss": 2.8468, + "mean_token_accuracy": 0.3931034475564957, + "step": 38660 + }, + { + "epoch": 0.038943798654568054, + "grad_norm": 11.687554079836254, + "learning_rate": 3.894305340128518e-05, + "loss": 2.529, + "mean_token_accuracy": 0.43641862869262693, + "step": 38665 + }, + { + "epoch": 0.03894883470767223, + "grad_norm": 11.91961109614288, + "learning_rate": 3.894808935801624e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.439310348033905, + "step": 38670 + }, + { + "epoch": 0.038953870760776395, + "grad_norm": 17.282072650647486, + "learning_rate": 3.89531253147473e-05, + "loss": 2.6406, + "mean_token_accuracy": 0.34827586114406583, + "step": 38675 + }, + { + "epoch": 0.03895890681388057, + "grad_norm": 15.9014575120477, + "learning_rate": 3.895816127147836e-05, + "loss": 2.5613, + "mean_token_accuracy": 0.3551724076271057, + "step": 38680 + }, + { + "epoch": 0.03896394286698474, + "grad_norm": 16.78874647017567, + "learning_rate": 3.896319722820942e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.42758620977401735, + "step": 38685 + }, + { + "epoch": 0.038968978920088916, + "grad_norm": 13.132809863819832, + "learning_rate": 3.896823318494048e-05, + "loss": 2.822, + "mean_token_accuracy": 0.3551724076271057, + "step": 38690 + }, + { + "epoch": 0.03897401497319309, + "grad_norm": 15.495603375488376, + "learning_rate": 3.8973269141671536e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.4310344815254211, + "step": 38695 + }, + { + "epoch": 0.038979051026297264, + "grad_norm": 19.12854806315473, + "learning_rate": 3.8978305098402595e-05, + "loss": 2.701, + "mean_token_accuracy": 0.3620689570903778, + "step": 38700 + }, + { + "epoch": 0.03898408707940144, + "grad_norm": 14.376982918016251, + "learning_rate": 3.8983341055133654e-05, + "loss": 2.4269, + "mean_token_accuracy": 0.3965517282485962, + "step": 38705 + }, + { + "epoch": 0.038989123132505604, + "grad_norm": 15.798646169923915, + "learning_rate": 3.898837701186472e-05, + "loss": 2.3501, + "mean_token_accuracy": 0.4517241358757019, + "step": 38710 + }, + { + "epoch": 0.03899415918560978, + "grad_norm": 13.119479750589646, + "learning_rate": 3.899341296859578e-05, + "loss": 2.6241, + "mean_token_accuracy": 0.4149425268173218, + "step": 38715 + }, + { + "epoch": 0.03899919523871395, + "grad_norm": 16.755598681688472, + "learning_rate": 3.899844892532684e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.43103448748588563, + "step": 38720 + }, + { + "epoch": 0.039004231291818126, + "grad_norm": 12.280810564057813, + "learning_rate": 3.900348488205789e-05, + "loss": 2.7568, + "mean_token_accuracy": 0.34827586114406583, + "step": 38725 + }, + { + "epoch": 0.0390092673449223, + "grad_norm": 15.87289000272071, + "learning_rate": 3.900852083878895e-05, + "loss": 2.6659, + "mean_token_accuracy": 0.3793103456497192, + "step": 38730 + }, + { + "epoch": 0.03901430339802647, + "grad_norm": 14.59812564396407, + "learning_rate": 3.901355679552001e-05, + "loss": 2.6119, + "mean_token_accuracy": 0.38620689511299133, + "step": 38735 + }, + { + "epoch": 0.03901933945113065, + "grad_norm": 13.686110460533895, + "learning_rate": 3.9018592752251076e-05, + "loss": 2.8186, + "mean_token_accuracy": 0.37931033968925476, + "step": 38740 + }, + { + "epoch": 0.039024375504234814, + "grad_norm": 13.179596708545821, + "learning_rate": 3.9023628708982135e-05, + "loss": 2.7055, + "mean_token_accuracy": 0.40871143341064453, + "step": 38745 + }, + { + "epoch": 0.03902941155733899, + "grad_norm": 13.61369397773637, + "learning_rate": 3.9028664665713195e-05, + "loss": 2.5917, + "mean_token_accuracy": 0.41379310488700866, + "step": 38750 + }, + { + "epoch": 0.03903444761044316, + "grad_norm": 13.795705824078269, + "learning_rate": 3.9033700622444254e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.43103447556495667, + "step": 38755 + }, + { + "epoch": 0.039039483663547335, + "grad_norm": 12.983170848811769, + "learning_rate": 3.903873657917531e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.41379310488700866, + "step": 38760 + }, + { + "epoch": 0.03904451971665151, + "grad_norm": 17.449488767719775, + "learning_rate": 3.904377253590637e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.4000000059604645, + "step": 38765 + }, + { + "epoch": 0.03904955576975568, + "grad_norm": 13.248763612637218, + "learning_rate": 3.904880849263743e-05, + "loss": 2.7131, + "mean_token_accuracy": 0.37241379022598264, + "step": 38770 + }, + { + "epoch": 0.039054591822859856, + "grad_norm": 18.29404768110175, + "learning_rate": 3.905384444936849e-05, + "loss": 2.8362, + "mean_token_accuracy": 0.4172413766384125, + "step": 38775 + }, + { + "epoch": 0.03905962787596402, + "grad_norm": 16.547322035915307, + "learning_rate": 3.905888040609955e-05, + "loss": 2.997, + "mean_token_accuracy": 0.3551724076271057, + "step": 38780 + }, + { + "epoch": 0.0390646639290682, + "grad_norm": 12.317070802377199, + "learning_rate": 3.906391636283061e-05, + "loss": 2.145, + "mean_token_accuracy": 0.45862069725990295, + "step": 38785 + }, + { + "epoch": 0.03906969998217237, + "grad_norm": 17.770034437052445, + "learning_rate": 3.9068952319561675e-05, + "loss": 3.0065, + "mean_token_accuracy": 0.32068965435028074, + "step": 38790 + }, + { + "epoch": 0.039074736035276544, + "grad_norm": 16.797171795798455, + "learning_rate": 3.9073988276292735e-05, + "loss": 3.0249, + "mean_token_accuracy": 0.36551723480224607, + "step": 38795 + }, + { + "epoch": 0.03907977208838072, + "grad_norm": 20.22679474979558, + "learning_rate": 3.9079024233023794e-05, + "loss": 2.7456, + "mean_token_accuracy": 0.441379314661026, + "step": 38800 + }, + { + "epoch": 0.03908480814148489, + "grad_norm": 16.148022244865817, + "learning_rate": 3.908406018975485e-05, + "loss": 2.4045, + "mean_token_accuracy": 0.4068965494632721, + "step": 38805 + }, + { + "epoch": 0.039089844194589066, + "grad_norm": 13.504311675405004, + "learning_rate": 3.908909614648591e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.4068965554237366, + "step": 38810 + }, + { + "epoch": 0.03909488024769323, + "grad_norm": 19.056439954903873, + "learning_rate": 3.909413210321697e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.42413793206214906, + "step": 38815 + }, + { + "epoch": 0.039099916300797406, + "grad_norm": 22.052896919589823, + "learning_rate": 3.909916805994803e-05, + "loss": 2.618, + "mean_token_accuracy": 0.37586207389831544, + "step": 38820 + }, + { + "epoch": 0.03910495235390158, + "grad_norm": 11.851922815075556, + "learning_rate": 3.910420401667909e-05, + "loss": 2.2593, + "mean_token_accuracy": 0.4482758641242981, + "step": 38825 + }, + { + "epoch": 0.039109988407005754, + "grad_norm": 19.939857952714195, + "learning_rate": 3.910923997341015e-05, + "loss": 2.8097, + "mean_token_accuracy": 0.3241379290819168, + "step": 38830 + }, + { + "epoch": 0.03911502446010993, + "grad_norm": 11.604616431471497, + "learning_rate": 3.911427593014121e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.39310344457626345, + "step": 38835 + }, + { + "epoch": 0.0391200605132141, + "grad_norm": 13.006562505662664, + "learning_rate": 3.911931188687227e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.4172413766384125, + "step": 38840 + }, + { + "epoch": 0.039125096566318275, + "grad_norm": 17.154948578072254, + "learning_rate": 3.9124347843603334e-05, + "loss": 2.3754, + "mean_token_accuracy": 0.4068965494632721, + "step": 38845 + }, + { + "epoch": 0.03913013261942244, + "grad_norm": 13.767325408131235, + "learning_rate": 3.912938380033439e-05, + "loss": 2.566, + "mean_token_accuracy": 0.36551723480224607, + "step": 38850 + }, + { + "epoch": 0.039135168672526616, + "grad_norm": 14.13204342923202, + "learning_rate": 3.913441975706545e-05, + "loss": 2.6176, + "mean_token_accuracy": 0.3999999940395355, + "step": 38855 + }, + { + "epoch": 0.03914020472563079, + "grad_norm": 23.92223017244613, + "learning_rate": 3.9139455713796505e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.41379310488700866, + "step": 38860 + }, + { + "epoch": 0.03914524077873496, + "grad_norm": 12.696961875721472, + "learning_rate": 3.9144491670527564e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.43793103098869324, + "step": 38865 + }, + { + "epoch": 0.03915027683183914, + "grad_norm": 20.84491659432228, + "learning_rate": 3.914952762725863e-05, + "loss": 2.7846, + "mean_token_accuracy": 0.4, + "step": 38870 + }, + { + "epoch": 0.03915531288494331, + "grad_norm": 17.036519454695824, + "learning_rate": 3.915456358398969e-05, + "loss": 2.3603, + "mean_token_accuracy": 0.412522679567337, + "step": 38875 + }, + { + "epoch": 0.039160348938047485, + "grad_norm": 14.236087343210915, + "learning_rate": 3.915959954072075e-05, + "loss": 2.7244, + "mean_token_accuracy": 0.42758620977401735, + "step": 38880 + }, + { + "epoch": 0.03916538499115165, + "grad_norm": 13.960424645657923, + "learning_rate": 3.916463549745181e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.4068965494632721, + "step": 38885 + }, + { + "epoch": 0.039170421044255825, + "grad_norm": 14.167016912206758, + "learning_rate": 3.916967145418287e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.41034482717514037, + "step": 38890 + }, + { + "epoch": 0.03917545709736, + "grad_norm": 15.96352251101983, + "learning_rate": 3.917470741091393e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.4103448152542114, + "step": 38895 + }, + { + "epoch": 0.03918049315046417, + "grad_norm": 16.13910166269604, + "learning_rate": 3.9179743367644986e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.4586206912994385, + "step": 38900 + }, + { + "epoch": 0.039185529203568346, + "grad_norm": 14.3241108631312, + "learning_rate": 3.9184779324376045e-05, + "loss": 3.2258, + "mean_token_accuracy": 0.31379309892654417, + "step": 38905 + }, + { + "epoch": 0.03919056525667252, + "grad_norm": 13.893291154046372, + "learning_rate": 3.9189815281107104e-05, + "loss": 2.6545, + "mean_token_accuracy": 0.43998789191246035, + "step": 38910 + }, + { + "epoch": 0.039195601309776694, + "grad_norm": 38.90380102099931, + "learning_rate": 3.9194851237838163e-05, + "loss": 2.6143, + "mean_token_accuracy": 0.36896551251411436, + "step": 38915 + }, + { + "epoch": 0.03920063736288086, + "grad_norm": 10.425371685548125, + "learning_rate": 3.919988719456922e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.441379314661026, + "step": 38920 + }, + { + "epoch": 0.039205673415985035, + "grad_norm": 12.775357601215736, + "learning_rate": 3.920492315130029e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.41724138259887694, + "step": 38925 + }, + { + "epoch": 0.03921070946908921, + "grad_norm": 13.687075921960881, + "learning_rate": 3.920995910803135e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.482758617401123, + "step": 38930 + }, + { + "epoch": 0.03921574552219338, + "grad_norm": 15.081052104354697, + "learning_rate": 3.921499506476241e-05, + "loss": 2.6906, + "mean_token_accuracy": 0.36896551251411436, + "step": 38935 + }, + { + "epoch": 0.039220781575297556, + "grad_norm": 13.717081588147721, + "learning_rate": 3.9220031021493466e-05, + "loss": 2.7863, + "mean_token_accuracy": 0.3827586203813553, + "step": 38940 + }, + { + "epoch": 0.03922581762840173, + "grad_norm": 13.848264871103488, + "learning_rate": 3.9225066978224526e-05, + "loss": 2.6957, + "mean_token_accuracy": 0.4034482717514038, + "step": 38945 + }, + { + "epoch": 0.0392308536815059, + "grad_norm": 13.372593277872918, + "learning_rate": 3.9230102934955585e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.39310344457626345, + "step": 38950 + }, + { + "epoch": 0.03923588973461007, + "grad_norm": 16.369399696426182, + "learning_rate": 3.9235138891686644e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.4310344815254211, + "step": 38955 + }, + { + "epoch": 0.039240925787714244, + "grad_norm": 17.954263836056256, + "learning_rate": 3.9240174848417704e-05, + "loss": 2.6452, + "mean_token_accuracy": 0.36896551847457887, + "step": 38960 + }, + { + "epoch": 0.03924596184081842, + "grad_norm": 13.82868095935141, + "learning_rate": 3.924521080514876e-05, + "loss": 2.702, + "mean_token_accuracy": 0.36896551251411436, + "step": 38965 + }, + { + "epoch": 0.03925099789392259, + "grad_norm": 19.05901643465952, + "learning_rate": 3.925024676187982e-05, + "loss": 2.6551, + "mean_token_accuracy": 0.4068965494632721, + "step": 38970 + }, + { + "epoch": 0.039256033947026765, + "grad_norm": 22.73567806521048, + "learning_rate": 3.925528271861089e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.41379311084747317, + "step": 38975 + }, + { + "epoch": 0.03926107000013094, + "grad_norm": 16.425022425374742, + "learning_rate": 3.926031867534195e-05, + "loss": 2.446, + "mean_token_accuracy": 0.43793103098869324, + "step": 38980 + }, + { + "epoch": 0.03926610605323511, + "grad_norm": 13.713862656993555, + "learning_rate": 3.9265354632073007e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.42758620977401735, + "step": 38985 + }, + { + "epoch": 0.03927114210633928, + "grad_norm": 13.212786090280394, + "learning_rate": 3.9270390588804066e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.41034482717514037, + "step": 38990 + }, + { + "epoch": 0.03927617815944345, + "grad_norm": 14.63901908967399, + "learning_rate": 3.927542654553512e-05, + "loss": 2.9621, + "mean_token_accuracy": 0.32068965435028074, + "step": 38995 + }, + { + "epoch": 0.03928121421254763, + "grad_norm": 13.373920777160192, + "learning_rate": 3.928046250226618e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.4137930989265442, + "step": 39000 + }, + { + "epoch": 0.0392862502656518, + "grad_norm": 18.92285610909184, + "learning_rate": 3.9285498458997244e-05, + "loss": 2.5265, + "mean_token_accuracy": 0.39655172228813174, + "step": 39005 + }, + { + "epoch": 0.039291286318755975, + "grad_norm": 12.47323915534808, + "learning_rate": 3.92905344157283e-05, + "loss": 2.6145, + "mean_token_accuracy": 0.38620689511299133, + "step": 39010 + }, + { + "epoch": 0.03929632237186015, + "grad_norm": 15.928124103717755, + "learning_rate": 3.929557037245936e-05, + "loss": 2.8476, + "mean_token_accuracy": 0.334482753276825, + "step": 39015 + }, + { + "epoch": 0.03930135842496432, + "grad_norm": 16.586202455818835, + "learning_rate": 3.930060632919042e-05, + "loss": 2.6401, + "mean_token_accuracy": 0.39310343861579894, + "step": 39020 + }, + { + "epoch": 0.03930639447806849, + "grad_norm": 16.052724379771487, + "learning_rate": 3.930564228592148e-05, + "loss": 2.5165, + "mean_token_accuracy": 0.33103448152542114, + "step": 39025 + }, + { + "epoch": 0.03931143053117266, + "grad_norm": 16.04497080388729, + "learning_rate": 3.9310678242652547e-05, + "loss": 2.855, + "mean_token_accuracy": 0.39655172228813174, + "step": 39030 + }, + { + "epoch": 0.03931646658427684, + "grad_norm": 16.88602485430087, + "learning_rate": 3.93157141993836e-05, + "loss": 2.8025, + "mean_token_accuracy": 0.3344827562570572, + "step": 39035 + }, + { + "epoch": 0.03932150263738101, + "grad_norm": 13.334421150265165, + "learning_rate": 3.932075015611466e-05, + "loss": 2.7785, + "mean_token_accuracy": 0.40852995216846466, + "step": 39040 + }, + { + "epoch": 0.039326538690485184, + "grad_norm": 13.173590983803653, + "learning_rate": 3.932578611284572e-05, + "loss": 2.5949, + "mean_token_accuracy": 0.39310344457626345, + "step": 39045 + }, + { + "epoch": 0.03933157474358936, + "grad_norm": 13.155064135401226, + "learning_rate": 3.933082206957678e-05, + "loss": 2.6883, + "mean_token_accuracy": 0.42758620977401735, + "step": 39050 + }, + { + "epoch": 0.03933661079669353, + "grad_norm": 15.165735716401644, + "learning_rate": 3.933585802630784e-05, + "loss": 2.6232, + "mean_token_accuracy": 0.34137930274009703, + "step": 39055 + }, + { + "epoch": 0.0393416468497977, + "grad_norm": 11.886119822368393, + "learning_rate": 3.93408939830389e-05, + "loss": 2.8737, + "mean_token_accuracy": 0.3551724135875702, + "step": 39060 + }, + { + "epoch": 0.03934668290290187, + "grad_norm": 12.350236250893216, + "learning_rate": 3.934592993976996e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.42413793206214906, + "step": 39065 + }, + { + "epoch": 0.039351718956006046, + "grad_norm": 13.604534412022453, + "learning_rate": 3.935096589650102e-05, + "loss": 2.579, + "mean_token_accuracy": 0.4448275864124298, + "step": 39070 + }, + { + "epoch": 0.03935675500911022, + "grad_norm": 17.637371062875392, + "learning_rate": 3.935600185323208e-05, + "loss": 2.4638, + "mean_token_accuracy": 0.4, + "step": 39075 + }, + { + "epoch": 0.039361791062214393, + "grad_norm": 14.819936343905479, + "learning_rate": 3.936103780996314e-05, + "loss": 2.6425, + "mean_token_accuracy": 0.35862069129943847, + "step": 39080 + }, + { + "epoch": 0.03936682711531857, + "grad_norm": 20.073671155662876, + "learning_rate": 3.93660737666942e-05, + "loss": 2.799, + "mean_token_accuracy": 0.37410768270492556, + "step": 39085 + }, + { + "epoch": 0.03937186316842274, + "grad_norm": 16.782005443273068, + "learning_rate": 3.937110972342526e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.41379310488700866, + "step": 39090 + }, + { + "epoch": 0.03937689922152691, + "grad_norm": 13.326606143731963, + "learning_rate": 3.937614568015632e-05, + "loss": 2.8715, + "mean_token_accuracy": 0.358620685338974, + "step": 39095 + }, + { + "epoch": 0.03938193527463108, + "grad_norm": 17.283200358835902, + "learning_rate": 3.9381181636887376e-05, + "loss": 2.7853, + "mean_token_accuracy": 0.36364186406135557, + "step": 39100 + }, + { + "epoch": 0.039386971327735255, + "grad_norm": 12.977211605237093, + "learning_rate": 3.9386217593618435e-05, + "loss": 2.7324, + "mean_token_accuracy": 0.3827586114406586, + "step": 39105 + }, + { + "epoch": 0.03939200738083943, + "grad_norm": 12.916576388942303, + "learning_rate": 3.93912535503495e-05, + "loss": 2.7038, + "mean_token_accuracy": 0.3551724076271057, + "step": 39110 + }, + { + "epoch": 0.0393970434339436, + "grad_norm": 17.86370132021268, + "learning_rate": 3.939628950708056e-05, + "loss": 3.2147, + "mean_token_accuracy": 0.36896551251411436, + "step": 39115 + }, + { + "epoch": 0.03940207948704778, + "grad_norm": 13.045852787930244, + "learning_rate": 3.940132546381162e-05, + "loss": 2.7357, + "mean_token_accuracy": 0.36551723480224607, + "step": 39120 + }, + { + "epoch": 0.03940711554015195, + "grad_norm": 15.757778651201477, + "learning_rate": 3.940636142054267e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.3931034505367279, + "step": 39125 + }, + { + "epoch": 0.03941215159325612, + "grad_norm": 16.8276276516325, + "learning_rate": 3.941139737727373e-05, + "loss": 2.9455, + "mean_token_accuracy": 0.3655172407627106, + "step": 39130 + }, + { + "epoch": 0.03941718764636029, + "grad_norm": 14.071941602518693, + "learning_rate": 3.94164333340048e-05, + "loss": 2.4264, + "mean_token_accuracy": 0.4329703629016876, + "step": 39135 + }, + { + "epoch": 0.039422223699464465, + "grad_norm": 17.58594611345993, + "learning_rate": 3.942146929073586e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.4034482777118683, + "step": 39140 + }, + { + "epoch": 0.03942725975256864, + "grad_norm": 16.02610329655536, + "learning_rate": 3.9426505247466916e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.39310345649719236, + "step": 39145 + }, + { + "epoch": 0.03943229580567281, + "grad_norm": 11.940800327529, + "learning_rate": 3.9431541204197975e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4814881980419159, + "step": 39150 + }, + { + "epoch": 0.039437331858776986, + "grad_norm": 16.033471299773147, + "learning_rate": 3.9436577160929035e-05, + "loss": 2.5791, + "mean_token_accuracy": 0.4256503343582153, + "step": 39155 + }, + { + "epoch": 0.03944236791188116, + "grad_norm": 14.03174889257257, + "learning_rate": 3.9441613117660094e-05, + "loss": 2.8602, + "mean_token_accuracy": 0.37241379022598264, + "step": 39160 + }, + { + "epoch": 0.03944740396498533, + "grad_norm": 13.625744841911843, + "learning_rate": 3.944664907439115e-05, + "loss": 2.3798, + "mean_token_accuracy": 0.4676950931549072, + "step": 39165 + }, + { + "epoch": 0.0394524400180895, + "grad_norm": 13.660925290153212, + "learning_rate": 3.945168503112221e-05, + "loss": 2.6829, + "mean_token_accuracy": 0.3896551728248596, + "step": 39170 + }, + { + "epoch": 0.039457476071193674, + "grad_norm": 17.056419438956265, + "learning_rate": 3.945672098785327e-05, + "loss": 2.7271, + "mean_token_accuracy": 0.36896551847457887, + "step": 39175 + }, + { + "epoch": 0.03946251212429785, + "grad_norm": 13.605546504044312, + "learning_rate": 3.946175694458433e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.4482758641242981, + "step": 39180 + }, + { + "epoch": 0.03946754817740202, + "grad_norm": 14.499820582181188, + "learning_rate": 3.946679290131539e-05, + "loss": 2.6638, + "mean_token_accuracy": 0.4103448331356049, + "step": 39185 + }, + { + "epoch": 0.039472584230506196, + "grad_norm": 16.687328248807603, + "learning_rate": 3.9471828858046456e-05, + "loss": 2.6975, + "mean_token_accuracy": 0.36551724672317504, + "step": 39190 + }, + { + "epoch": 0.03947762028361037, + "grad_norm": 13.185617452566404, + "learning_rate": 3.9476864814777515e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.46551724672317507, + "step": 39195 + }, + { + "epoch": 0.039482656336714536, + "grad_norm": 13.915040673167244, + "learning_rate": 3.9481900771508575e-05, + "loss": 2.3264, + "mean_token_accuracy": 0.42413793206214906, + "step": 39200 + }, + { + "epoch": 0.03948769238981871, + "grad_norm": 14.276362012705954, + "learning_rate": 3.9486936728239634e-05, + "loss": 2.5525, + "mean_token_accuracy": 0.38965518176555636, + "step": 39205 + }, + { + "epoch": 0.039492728442922884, + "grad_norm": 12.361969557379272, + "learning_rate": 3.949197268497069e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.4366606116294861, + "step": 39210 + }, + { + "epoch": 0.03949776449602706, + "grad_norm": 19.32632648050562, + "learning_rate": 3.949700864170175e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.4344827592372894, + "step": 39215 + }, + { + "epoch": 0.03950280054913123, + "grad_norm": 20.43910138073725, + "learning_rate": 3.950204459843281e-05, + "loss": 2.8964, + "mean_token_accuracy": 0.3793103456497192, + "step": 39220 + }, + { + "epoch": 0.039507836602235405, + "grad_norm": 25.823660481786103, + "learning_rate": 3.950708055516387e-05, + "loss": 2.8454, + "mean_token_accuracy": 0.3482758581638336, + "step": 39225 + }, + { + "epoch": 0.03951287265533958, + "grad_norm": 13.509988081653239, + "learning_rate": 3.951211651189493e-05, + "loss": 2.3458, + "mean_token_accuracy": 0.44827585220336913, + "step": 39230 + }, + { + "epoch": 0.039517908708443746, + "grad_norm": 14.843754017966694, + "learning_rate": 3.951715246862599e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.4034482717514038, + "step": 39235 + }, + { + "epoch": 0.03952294476154792, + "grad_norm": 18.787312345872902, + "learning_rate": 3.952218842535705e-05, + "loss": 2.906, + "mean_token_accuracy": 0.3931034505367279, + "step": 39240 + }, + { + "epoch": 0.03952798081465209, + "grad_norm": 16.082131899332857, + "learning_rate": 3.9527224382088115e-05, + "loss": 2.7071, + "mean_token_accuracy": 0.3862069010734558, + "step": 39245 + }, + { + "epoch": 0.03953301686775627, + "grad_norm": 16.29557245253997, + "learning_rate": 3.9532260338819174e-05, + "loss": 2.9652, + "mean_token_accuracy": 0.36896551847457887, + "step": 39250 + }, + { + "epoch": 0.03953805292086044, + "grad_norm": 18.533207059767904, + "learning_rate": 3.953729629555023e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.41724138259887694, + "step": 39255 + }, + { + "epoch": 0.039543088973964614, + "grad_norm": 10.727089125913102, + "learning_rate": 3.9542332252281286e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.37241379022598264, + "step": 39260 + }, + { + "epoch": 0.03954812502706879, + "grad_norm": 12.924395723644993, + "learning_rate": 3.9547368209012345e-05, + "loss": 2.3814, + "mean_token_accuracy": 0.4068965554237366, + "step": 39265 + }, + { + "epoch": 0.039553161080172955, + "grad_norm": 11.500845381146906, + "learning_rate": 3.955240416574341e-05, + "loss": 2.1867, + "mean_token_accuracy": 0.4689655125141144, + "step": 39270 + }, + { + "epoch": 0.03955819713327713, + "grad_norm": 11.441086676486723, + "learning_rate": 3.955744012247447e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.4402298927307129, + "step": 39275 + }, + { + "epoch": 0.0395632331863813, + "grad_norm": 18.25151814645436, + "learning_rate": 3.956247607920553e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.4103448212146759, + "step": 39280 + }, + { + "epoch": 0.039568269239485476, + "grad_norm": 12.327168727445336, + "learning_rate": 3.956751203593659e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.42413793206214906, + "step": 39285 + }, + { + "epoch": 0.03957330529258965, + "grad_norm": 13.901024123853642, + "learning_rate": 3.957254799266765e-05, + "loss": 3.0384, + "mean_token_accuracy": 0.3517241388559341, + "step": 39290 + }, + { + "epoch": 0.039578341345693824, + "grad_norm": 16.37573327854223, + "learning_rate": 3.9577583949398714e-05, + "loss": 2.5583, + "mean_token_accuracy": 0.38275861740112305, + "step": 39295 + }, + { + "epoch": 0.039583377398798, + "grad_norm": 14.923322933260472, + "learning_rate": 3.9582619906129767e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.3896551728248596, + "step": 39300 + }, + { + "epoch": 0.039588413451902164, + "grad_norm": 15.537843465630097, + "learning_rate": 3.9587655862860826e-05, + "loss": 2.7747, + "mean_token_accuracy": 0.36896551251411436, + "step": 39305 + }, + { + "epoch": 0.03959344950500634, + "grad_norm": 15.14466450252084, + "learning_rate": 3.9592691819591885e-05, + "loss": 2.7147, + "mean_token_accuracy": 0.36551723480224607, + "step": 39310 + }, + { + "epoch": 0.03959848555811051, + "grad_norm": 13.217036898690914, + "learning_rate": 3.9597727776322944e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.38965516686439516, + "step": 39315 + }, + { + "epoch": 0.039603521611214686, + "grad_norm": 20.50421717083436, + "learning_rate": 3.960276373305401e-05, + "loss": 2.9057, + "mean_token_accuracy": 0.3896551728248596, + "step": 39320 + }, + { + "epoch": 0.03960855766431886, + "grad_norm": 13.158844564619992, + "learning_rate": 3.960779968978507e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.36551723480224607, + "step": 39325 + }, + { + "epoch": 0.03961359371742303, + "grad_norm": 14.51961099041043, + "learning_rate": 3.961283564651613e-05, + "loss": 2.3579, + "mean_token_accuracy": 0.41724138259887694, + "step": 39330 + }, + { + "epoch": 0.03961862977052721, + "grad_norm": 12.246682889074927, + "learning_rate": 3.961787160324719e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.4068965494632721, + "step": 39335 + }, + { + "epoch": 0.039623665823631374, + "grad_norm": 18.642027729901674, + "learning_rate": 3.962290755997825e-05, + "loss": 2.538, + "mean_token_accuracy": 0.3827586233615875, + "step": 39340 + }, + { + "epoch": 0.03962870187673555, + "grad_norm": 14.764659301778105, + "learning_rate": 3.962794351670931e-05, + "loss": 2.5658, + "mean_token_accuracy": 0.4241379201412201, + "step": 39345 + }, + { + "epoch": 0.03963373792983972, + "grad_norm": 15.888077346363804, + "learning_rate": 3.9632979473440366e-05, + "loss": 2.7151, + "mean_token_accuracy": 0.4413793087005615, + "step": 39350 + }, + { + "epoch": 0.039638773982943895, + "grad_norm": 13.022821189394236, + "learning_rate": 3.9638015430171425e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.4223835408687592, + "step": 39355 + }, + { + "epoch": 0.03964381003604807, + "grad_norm": 17.24888371910432, + "learning_rate": 3.9643051386902484e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.417241370677948, + "step": 39360 + }, + { + "epoch": 0.03964884608915224, + "grad_norm": 14.778974633246731, + "learning_rate": 3.9648087343633544e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.4620689570903778, + "step": 39365 + }, + { + "epoch": 0.039653882142256416, + "grad_norm": 18.294872011754546, + "learning_rate": 3.96531233003646e-05, + "loss": 2.9639, + "mean_token_accuracy": 0.4103448331356049, + "step": 39370 + }, + { + "epoch": 0.03965891819536058, + "grad_norm": 15.05311614354758, + "learning_rate": 3.965815925709567e-05, + "loss": 2.7486, + "mean_token_accuracy": 0.3586206942796707, + "step": 39375 + }, + { + "epoch": 0.03966395424846476, + "grad_norm": 13.28741748287684, + "learning_rate": 3.966319521382673e-05, + "loss": 2.5122, + "mean_token_accuracy": 0.4517241358757019, + "step": 39380 + }, + { + "epoch": 0.03966899030156893, + "grad_norm": 12.48948293722184, + "learning_rate": 3.966823117055779e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.4034482717514038, + "step": 39385 + }, + { + "epoch": 0.039674026354673105, + "grad_norm": 11.478660292164042, + "learning_rate": 3.967326712728885e-05, + "loss": 2.5666, + "mean_token_accuracy": 0.3862068891525269, + "step": 39390 + }, + { + "epoch": 0.03967906240777728, + "grad_norm": 151.94760866317085, + "learning_rate": 3.96783030840199e-05, + "loss": 2.7741, + "mean_token_accuracy": 0.3793103456497192, + "step": 39395 + }, + { + "epoch": 0.03968409846088145, + "grad_norm": 17.343988304911704, + "learning_rate": 3.9683339040750965e-05, + "loss": 2.5792, + "mean_token_accuracy": 0.42068966031074523, + "step": 39400 + }, + { + "epoch": 0.039689134513985626, + "grad_norm": 12.411400232651037, + "learning_rate": 3.9688374997482024e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.43103448748588563, + "step": 39405 + }, + { + "epoch": 0.03969417056708979, + "grad_norm": 13.451871682944464, + "learning_rate": 3.9693410954213084e-05, + "loss": 2.7724, + "mean_token_accuracy": 0.37241379022598264, + "step": 39410 + }, + { + "epoch": 0.039699206620193966, + "grad_norm": 13.462184975519026, + "learning_rate": 3.969844691094414e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.4137930989265442, + "step": 39415 + }, + { + "epoch": 0.03970424267329814, + "grad_norm": 13.412206325538921, + "learning_rate": 3.97034828676752e-05, + "loss": 2.6791, + "mean_token_accuracy": 0.3793103456497192, + "step": 39420 + }, + { + "epoch": 0.039709278726402314, + "grad_norm": 13.708919603103682, + "learning_rate": 3.970851882440626e-05, + "loss": 2.4917, + "mean_token_accuracy": 0.4000000059604645, + "step": 39425 + }, + { + "epoch": 0.03971431477950649, + "grad_norm": 15.09815940955768, + "learning_rate": 3.971355478113733e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.40689654350280763, + "step": 39430 + }, + { + "epoch": 0.03971935083261066, + "grad_norm": 16.309432701949717, + "learning_rate": 3.971859073786838e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.3586206793785095, + "step": 39435 + }, + { + "epoch": 0.039724386885714835, + "grad_norm": 13.121613910149232, + "learning_rate": 3.972362669459944e-05, + "loss": 2.4803, + "mean_token_accuracy": 0.43793103098869324, + "step": 39440 + }, + { + "epoch": 0.039729422938819, + "grad_norm": 14.001426772261661, + "learning_rate": 3.97286626513305e-05, + "loss": 2.4129, + "mean_token_accuracy": 0.3999999940395355, + "step": 39445 + }, + { + "epoch": 0.039734458991923176, + "grad_norm": 12.247200506466168, + "learning_rate": 3.973369860806156e-05, + "loss": 2.6323, + "mean_token_accuracy": 0.37586206793785093, + "step": 39450 + }, + { + "epoch": 0.03973949504502735, + "grad_norm": 17.205158538958848, + "learning_rate": 3.9738734564792624e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.4068965494632721, + "step": 39455 + }, + { + "epoch": 0.03974453109813152, + "grad_norm": 12.588708079672095, + "learning_rate": 3.974377052152368e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.4157894730567932, + "step": 39460 + }, + { + "epoch": 0.0397495671512357, + "grad_norm": 15.630111012201981, + "learning_rate": 3.974880647825474e-05, + "loss": 2.7837, + "mean_token_accuracy": 0.37241379618644715, + "step": 39465 + }, + { + "epoch": 0.03975460320433987, + "grad_norm": 31.02993385507879, + "learning_rate": 3.97538424349858e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.39729064106941225, + "step": 39470 + }, + { + "epoch": 0.039759639257444045, + "grad_norm": 12.081723926618887, + "learning_rate": 3.975887839171686e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.4103448331356049, + "step": 39475 + }, + { + "epoch": 0.03976467531054821, + "grad_norm": 13.35006092217864, + "learning_rate": 3.976391434844792e-05, + "loss": 2.7746, + "mean_token_accuracy": 0.37586206793785093, + "step": 39480 + }, + { + "epoch": 0.039769711363652385, + "grad_norm": 12.747042574499629, + "learning_rate": 3.976895030517898e-05, + "loss": 2.1216, + "mean_token_accuracy": 0.47749547362327577, + "step": 39485 + }, + { + "epoch": 0.03977474741675656, + "grad_norm": 13.063347868763474, + "learning_rate": 3.977398626191004e-05, + "loss": 2.562, + "mean_token_accuracy": 0.39310344457626345, + "step": 39490 + }, + { + "epoch": 0.03977978346986073, + "grad_norm": 16.274363635852616, + "learning_rate": 3.97790222186411e-05, + "loss": 2.9617, + "mean_token_accuracy": 0.33793103098869326, + "step": 39495 + }, + { + "epoch": 0.03978481952296491, + "grad_norm": 18.248884440205874, + "learning_rate": 3.978405817537216e-05, + "loss": 2.9764, + "mean_token_accuracy": 0.3827586233615875, + "step": 39500 + }, + { + "epoch": 0.03978985557606908, + "grad_norm": 22.20417504610283, + "learning_rate": 3.9789094132103216e-05, + "loss": 2.7394, + "mean_token_accuracy": 0.38620689511299133, + "step": 39505 + }, + { + "epoch": 0.039794891629173254, + "grad_norm": 14.698414093180439, + "learning_rate": 3.979413008883428e-05, + "loss": 3.0379, + "mean_token_accuracy": 0.33103448152542114, + "step": 39510 + }, + { + "epoch": 0.03979992768227742, + "grad_norm": 10.820874692897485, + "learning_rate": 3.979916604556534e-05, + "loss": 2.25, + "mean_token_accuracy": 0.4448275864124298, + "step": 39515 + }, + { + "epoch": 0.039804963735381595, + "grad_norm": 16.08883257284815, + "learning_rate": 3.98042020022964e-05, + "loss": 2.6267, + "mean_token_accuracy": 0.37241379022598264, + "step": 39520 + }, + { + "epoch": 0.03980999978848577, + "grad_norm": 17.108290393163657, + "learning_rate": 3.980923795902745e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.41379310488700866, + "step": 39525 + }, + { + "epoch": 0.03981503584158994, + "grad_norm": 12.990902019388828, + "learning_rate": 3.981427391575851e-05, + "loss": 2.7497, + "mean_token_accuracy": 0.3482758581638336, + "step": 39530 + }, + { + "epoch": 0.039820071894694116, + "grad_norm": 18.301559410203318, + "learning_rate": 3.981930987248958e-05, + "loss": 2.6243, + "mean_token_accuracy": 0.3620689630508423, + "step": 39535 + }, + { + "epoch": 0.03982510794779829, + "grad_norm": 15.60913427188001, + "learning_rate": 3.982434582922064e-05, + "loss": 2.6539, + "mean_token_accuracy": 0.43103448748588563, + "step": 39540 + }, + { + "epoch": 0.039830144000902463, + "grad_norm": 14.059613663406635, + "learning_rate": 3.98293817859517e-05, + "loss": 2.3813, + "mean_token_accuracy": 0.41905626058578493, + "step": 39545 + }, + { + "epoch": 0.03983518005400663, + "grad_norm": 14.835932478956138, + "learning_rate": 3.9834417742682756e-05, + "loss": 2.6615, + "mean_token_accuracy": 0.39310345649719236, + "step": 39550 + }, + { + "epoch": 0.039840216107110804, + "grad_norm": 18.976964715379268, + "learning_rate": 3.9839453699413816e-05, + "loss": 2.8048, + "mean_token_accuracy": 0.37586206793785093, + "step": 39555 + }, + { + "epoch": 0.03984525216021498, + "grad_norm": 13.570990325679105, + "learning_rate": 3.984448965614488e-05, + "loss": 2.7024, + "mean_token_accuracy": 0.4, + "step": 39560 + }, + { + "epoch": 0.03985028821331915, + "grad_norm": 15.49435482012221, + "learning_rate": 3.984952561287594e-05, + "loss": 2.8828, + "mean_token_accuracy": 0.37241379022598264, + "step": 39565 + }, + { + "epoch": 0.039855324266423325, + "grad_norm": 17.347070954245464, + "learning_rate": 3.985456156960699e-05, + "loss": 2.9726, + "mean_token_accuracy": 0.36896551251411436, + "step": 39570 + }, + { + "epoch": 0.0398603603195275, + "grad_norm": 15.129222708713614, + "learning_rate": 3.985959752633805e-05, + "loss": 2.7856, + "mean_token_accuracy": 0.36896551847457887, + "step": 39575 + }, + { + "epoch": 0.03986539637263167, + "grad_norm": 16.59690027805056, + "learning_rate": 3.986463348306911e-05, + "loss": 2.9819, + "mean_token_accuracy": 0.38275861740112305, + "step": 39580 + }, + { + "epoch": 0.03987043242573584, + "grad_norm": 12.391903664635596, + "learning_rate": 3.986966943980017e-05, + "loss": 2.3139, + "mean_token_accuracy": 0.41034482717514037, + "step": 39585 + }, + { + "epoch": 0.039875468478840013, + "grad_norm": 16.0721734222642, + "learning_rate": 3.987470539653124e-05, + "loss": 2.6243, + "mean_token_accuracy": 0.41379310488700866, + "step": 39590 + }, + { + "epoch": 0.03988050453194419, + "grad_norm": 14.767124030520424, + "learning_rate": 3.9879741353262296e-05, + "loss": 2.7639, + "mean_token_accuracy": 0.3793103456497192, + "step": 39595 + }, + { + "epoch": 0.03988554058504836, + "grad_norm": 12.71939357137036, + "learning_rate": 3.9884777309993356e-05, + "loss": 2.6112, + "mean_token_accuracy": 0.4103448331356049, + "step": 39600 + }, + { + "epoch": 0.039890576638152535, + "grad_norm": 19.177204748747915, + "learning_rate": 3.9889813266724415e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.3827586203813553, + "step": 39605 + }, + { + "epoch": 0.03989561269125671, + "grad_norm": 14.171236057807544, + "learning_rate": 3.9894849223455474e-05, + "loss": 2.7357, + "mean_token_accuracy": 0.37011494636535647, + "step": 39610 + }, + { + "epoch": 0.03990064874436088, + "grad_norm": 13.365838389479187, + "learning_rate": 3.9899885180186533e-05, + "loss": 2.7664, + "mean_token_accuracy": 0.38620689809322356, + "step": 39615 + }, + { + "epoch": 0.03990568479746505, + "grad_norm": 15.91503837211013, + "learning_rate": 3.990492113691759e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.43103448748588563, + "step": 39620 + }, + { + "epoch": 0.03991072085056922, + "grad_norm": 21.21639858593874, + "learning_rate": 3.990995709364865e-05, + "loss": 2.9169, + "mean_token_accuracy": 0.3741681814193726, + "step": 39625 + }, + { + "epoch": 0.0399157569036734, + "grad_norm": 15.074685093765835, + "learning_rate": 3.991499305037971e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.3862069010734558, + "step": 39630 + }, + { + "epoch": 0.03992079295677757, + "grad_norm": 11.82394670560778, + "learning_rate": 3.992002900711077e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.3551724076271057, + "step": 39635 + }, + { + "epoch": 0.039925829009881744, + "grad_norm": 34.84472649110662, + "learning_rate": 3.9925064963841836e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.3862069010734558, + "step": 39640 + }, + { + "epoch": 0.03993086506298592, + "grad_norm": 14.963263901484002, + "learning_rate": 3.9930100920572896e-05, + "loss": 2.5927, + "mean_token_accuracy": 0.38275861740112305, + "step": 39645 + }, + { + "epoch": 0.03993590111609009, + "grad_norm": 15.574997832659685, + "learning_rate": 3.9935136877303955e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.4503327250480652, + "step": 39650 + }, + { + "epoch": 0.03994093716919426, + "grad_norm": 13.07420189884145, + "learning_rate": 3.9940172834035014e-05, + "loss": 2.7193, + "mean_token_accuracy": 0.35862069129943847, + "step": 39655 + }, + { + "epoch": 0.03994597322229843, + "grad_norm": 15.804916855015218, + "learning_rate": 3.994520879076607e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.3931034505367279, + "step": 39660 + }, + { + "epoch": 0.039951009275402606, + "grad_norm": 13.905909818659488, + "learning_rate": 3.9950244747497126e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.41379310488700866, + "step": 39665 + }, + { + "epoch": 0.03995604532850678, + "grad_norm": 18.204565726185997, + "learning_rate": 3.995528070422819e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4206896543502808, + "step": 39670 + }, + { + "epoch": 0.039961081381610954, + "grad_norm": 12.825762135670029, + "learning_rate": 3.996031666095925e-05, + "loss": 2.9628, + "mean_token_accuracy": 0.3793103456497192, + "step": 39675 + }, + { + "epoch": 0.03996611743471513, + "grad_norm": 17.326167533405084, + "learning_rate": 3.996535261769031e-05, + "loss": 2.8315, + "mean_token_accuracy": 0.37241379618644715, + "step": 39680 + }, + { + "epoch": 0.0399711534878193, + "grad_norm": 15.953521243193723, + "learning_rate": 3.997038857442137e-05, + "loss": 2.9432, + "mean_token_accuracy": 0.36896551251411436, + "step": 39685 + }, + { + "epoch": 0.03997618954092347, + "grad_norm": 13.293369844590192, + "learning_rate": 3.997542453115243e-05, + "loss": 2.6419, + "mean_token_accuracy": 0.3896551728248596, + "step": 39690 + }, + { + "epoch": 0.03998122559402764, + "grad_norm": 12.508295016875014, + "learning_rate": 3.9980460487883495e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.49258058667182925, + "step": 39695 + }, + { + "epoch": 0.039986261647131816, + "grad_norm": 12.979883233880802, + "learning_rate": 3.998549644461455e-05, + "loss": 2.2111, + "mean_token_accuracy": 0.4241379380226135, + "step": 39700 + }, + { + "epoch": 0.03999129770023599, + "grad_norm": 13.612385737416012, + "learning_rate": 3.999053240134561e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.38965516686439516, + "step": 39705 + }, + { + "epoch": 0.03999633375334016, + "grad_norm": 19.859392133279393, + "learning_rate": 3.9995568358076666e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.37586207389831544, + "step": 39710 + }, + { + "epoch": 0.04000136980644434, + "grad_norm": 13.474925896107422, + "learning_rate": 4.0000604314807725e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.43103447556495667, + "step": 39715 + }, + { + "epoch": 0.04000640585954851, + "grad_norm": 14.251108773177421, + "learning_rate": 4.000564027153879e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.45517241954803467, + "step": 39720 + }, + { + "epoch": 0.04001144191265268, + "grad_norm": 13.369294854956598, + "learning_rate": 4.001067622826985e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.5022383630275726, + "step": 39725 + }, + { + "epoch": 0.04001647796575685, + "grad_norm": 29.233679026782127, + "learning_rate": 4.001571218500091e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.4551724076271057, + "step": 39730 + }, + { + "epoch": 0.040021514018861025, + "grad_norm": 12.945092100152676, + "learning_rate": 4.002074814173197e-05, + "loss": 2.5479, + "mean_token_accuracy": 0.42068966031074523, + "step": 39735 + }, + { + "epoch": 0.0400265500719652, + "grad_norm": 17.017701734706634, + "learning_rate": 4.002578409846303e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4517241358757019, + "step": 39740 + }, + { + "epoch": 0.04003158612506937, + "grad_norm": 14.541000991914617, + "learning_rate": 4.003082005519409e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4068965554237366, + "step": 39745 + }, + { + "epoch": 0.040036622178173546, + "grad_norm": 12.55519627141762, + "learning_rate": 4.003585601192515e-05, + "loss": 2.5065, + "mean_token_accuracy": 0.4, + "step": 39750 + }, + { + "epoch": 0.04004165823127772, + "grad_norm": 12.702581169156028, + "learning_rate": 4.0040891968656206e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.4551724076271057, + "step": 39755 + }, + { + "epoch": 0.04004669428438189, + "grad_norm": 12.78524961888808, + "learning_rate": 4.0045927925387265e-05, + "loss": 2.8534, + "mean_token_accuracy": 0.3931034505367279, + "step": 39760 + }, + { + "epoch": 0.04005173033748606, + "grad_norm": 13.276735221617152, + "learning_rate": 4.0050963882118325e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.42068965137004855, + "step": 39765 + }, + { + "epoch": 0.040056766390590234, + "grad_norm": 12.696455481803552, + "learning_rate": 4.0055999838849384e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4034482717514038, + "step": 39770 + }, + { + "epoch": 0.04006180244369441, + "grad_norm": 14.957941700938411, + "learning_rate": 4.006103579558045e-05, + "loss": 2.3928, + "mean_token_accuracy": 0.41379310488700866, + "step": 39775 + }, + { + "epoch": 0.04006683849679858, + "grad_norm": 21.41062825229992, + "learning_rate": 4.006607175231151e-05, + "loss": 2.8774, + "mean_token_accuracy": 0.3999999940395355, + "step": 39780 + }, + { + "epoch": 0.040071874549902756, + "grad_norm": 15.370285361584779, + "learning_rate": 4.007110770904257e-05, + "loss": 2.5028, + "mean_token_accuracy": 0.4461822688579559, + "step": 39785 + }, + { + "epoch": 0.04007691060300693, + "grad_norm": 14.842441323552666, + "learning_rate": 4.007614366577363e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.3999999940395355, + "step": 39790 + }, + { + "epoch": 0.040081946656111096, + "grad_norm": 16.651649652264293, + "learning_rate": 4.008117962250468e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.4016938954591751, + "step": 39795 + }, + { + "epoch": 0.04008698270921527, + "grad_norm": 12.660773969921829, + "learning_rate": 4.0086215579235746e-05, + "loss": 2.1226, + "mean_token_accuracy": 0.4676950991153717, + "step": 39800 + }, + { + "epoch": 0.040092018762319444, + "grad_norm": 12.70686696694512, + "learning_rate": 4.0091251535966805e-05, + "loss": 2.2087, + "mean_token_accuracy": 0.4517241299152374, + "step": 39805 + }, + { + "epoch": 0.04009705481542362, + "grad_norm": 16.460759926264725, + "learning_rate": 4.0096287492697865e-05, + "loss": 2.7078, + "mean_token_accuracy": 0.38275861740112305, + "step": 39810 + }, + { + "epoch": 0.04010209086852779, + "grad_norm": 14.536287357474801, + "learning_rate": 4.0101323449428924e-05, + "loss": 2.8086, + "mean_token_accuracy": 0.3655172437429428, + "step": 39815 + }, + { + "epoch": 0.040107126921631965, + "grad_norm": 13.195345427123746, + "learning_rate": 4.010635940615998e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.43684210777282717, + "step": 39820 + }, + { + "epoch": 0.04011216297473614, + "grad_norm": 14.250115645395615, + "learning_rate": 4.011139536289105e-05, + "loss": 2.3767, + "mean_token_accuracy": 0.41034482717514037, + "step": 39825 + }, + { + "epoch": 0.040117199027840306, + "grad_norm": 14.260382150847615, + "learning_rate": 4.011643131962211e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.34827586114406583, + "step": 39830 + }, + { + "epoch": 0.04012223508094448, + "grad_norm": 14.17119630028932, + "learning_rate": 4.012146727635316e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.46406532526016236, + "step": 39835 + }, + { + "epoch": 0.04012727113404865, + "grad_norm": 13.240532046129761, + "learning_rate": 4.012650323308422e-05, + "loss": 2.309, + "mean_token_accuracy": 0.4709618866443634, + "step": 39840 + }, + { + "epoch": 0.04013230718715283, + "grad_norm": 15.600227316518486, + "learning_rate": 4.013153918981528e-05, + "loss": 2.6155, + "mean_token_accuracy": 0.38275861740112305, + "step": 39845 + }, + { + "epoch": 0.040137343240257, + "grad_norm": 14.03703090974591, + "learning_rate": 4.013657514654634e-05, + "loss": 2.7289, + "mean_token_accuracy": 0.358620685338974, + "step": 39850 + }, + { + "epoch": 0.040142379293361174, + "grad_norm": 11.053685476260426, + "learning_rate": 4.0141611103277405e-05, + "loss": 2.3202, + "mean_token_accuracy": 0.48166969418525696, + "step": 39855 + }, + { + "epoch": 0.04014741534646535, + "grad_norm": 14.032916466049887, + "learning_rate": 4.0146647060008464e-05, + "loss": 2.791, + "mean_token_accuracy": 0.35862068831920624, + "step": 39860 + }, + { + "epoch": 0.040152451399569515, + "grad_norm": 14.408653274915176, + "learning_rate": 4.015168301673952e-05, + "loss": 3.1839, + "mean_token_accuracy": 0.37586207389831544, + "step": 39865 + }, + { + "epoch": 0.04015748745267369, + "grad_norm": 18.680229848765556, + "learning_rate": 4.015671897347058e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.3562008500099182, + "step": 39870 + }, + { + "epoch": 0.04016252350577786, + "grad_norm": 15.1808385690575, + "learning_rate": 4.016175493020164e-05, + "loss": 2.6264, + "mean_token_accuracy": 0.3931034505367279, + "step": 39875 + }, + { + "epoch": 0.040167559558882036, + "grad_norm": 15.970736521705348, + "learning_rate": 4.01667908869327e-05, + "loss": 2.8489, + "mean_token_accuracy": 0.35862068831920624, + "step": 39880 + }, + { + "epoch": 0.04017259561198621, + "grad_norm": 14.517567764246381, + "learning_rate": 4.017182684366376e-05, + "loss": 2.7939, + "mean_token_accuracy": 0.36206896901130675, + "step": 39885 + }, + { + "epoch": 0.040177631665090384, + "grad_norm": 13.234747788343773, + "learning_rate": 4.017686280039482e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.3586206942796707, + "step": 39890 + }, + { + "epoch": 0.04018266771819456, + "grad_norm": 13.063504517567486, + "learning_rate": 4.018189875712588e-05, + "loss": 2.2507, + "mean_token_accuracy": 0.42068966031074523, + "step": 39895 + }, + { + "epoch": 0.040187703771298725, + "grad_norm": 11.50998517758508, + "learning_rate": 4.018693471385694e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.4724137902259827, + "step": 39900 + }, + { + "epoch": 0.0401927398244029, + "grad_norm": 15.583452661683973, + "learning_rate": 4.0191970670588004e-05, + "loss": 2.6645, + "mean_token_accuracy": 0.44271020889282225, + "step": 39905 + }, + { + "epoch": 0.04019777587750707, + "grad_norm": 14.389492054247935, + "learning_rate": 4.019700662731906e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.4448275864124298, + "step": 39910 + }, + { + "epoch": 0.040202811930611246, + "grad_norm": 15.431898764803782, + "learning_rate": 4.020204258405012e-05, + "loss": 2.8849, + "mean_token_accuracy": 0.4068965494632721, + "step": 39915 + }, + { + "epoch": 0.04020784798371542, + "grad_norm": 13.105860591824907, + "learning_rate": 4.020707854078118e-05, + "loss": 2.7021, + "mean_token_accuracy": 0.35517241060733795, + "step": 39920 + }, + { + "epoch": 0.04021288403681959, + "grad_norm": 13.510288944707405, + "learning_rate": 4.021211449751224e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.41379310488700866, + "step": 39925 + }, + { + "epoch": 0.04021792008992377, + "grad_norm": 12.738166811618783, + "learning_rate": 4.0217150454243293e-05, + "loss": 2.0412, + "mean_token_accuracy": 0.46376285552978513, + "step": 39930 + }, + { + "epoch": 0.040222956143027934, + "grad_norm": 13.896454278918146, + "learning_rate": 4.022218641097436e-05, + "loss": 2.6781, + "mean_token_accuracy": 0.42413792610168455, + "step": 39935 + }, + { + "epoch": 0.04022799219613211, + "grad_norm": 19.12239498902643, + "learning_rate": 4.022722236770542e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.45172414779663084, + "step": 39940 + }, + { + "epoch": 0.04023302824923628, + "grad_norm": 14.829419419894139, + "learning_rate": 4.023225832443648e-05, + "loss": 2.716, + "mean_token_accuracy": 0.36896551251411436, + "step": 39945 + }, + { + "epoch": 0.040238064302340455, + "grad_norm": 13.49299500020081, + "learning_rate": 4.023729428116754e-05, + "loss": 2.114, + "mean_token_accuracy": 0.5009852230548859, + "step": 39950 + }, + { + "epoch": 0.04024310035544463, + "grad_norm": 17.450758403217062, + "learning_rate": 4.0242330237898596e-05, + "loss": 2.546, + "mean_token_accuracy": 0.42758620977401735, + "step": 39955 + }, + { + "epoch": 0.0402481364085488, + "grad_norm": 14.608014522234031, + "learning_rate": 4.024736619462966e-05, + "loss": 2.8459, + "mean_token_accuracy": 0.35862069129943847, + "step": 39960 + }, + { + "epoch": 0.040253172461652977, + "grad_norm": 12.724956957101552, + "learning_rate": 4.025240215136072e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.4137930989265442, + "step": 39965 + }, + { + "epoch": 0.04025820851475714, + "grad_norm": 19.27986363761355, + "learning_rate": 4.0257438108091774e-05, + "loss": 2.393, + "mean_token_accuracy": 0.4517241358757019, + "step": 39970 + }, + { + "epoch": 0.04026324456786132, + "grad_norm": 16.71052478726502, + "learning_rate": 4.0262474064822833e-05, + "loss": 3.0029, + "mean_token_accuracy": 0.3482758641242981, + "step": 39975 + }, + { + "epoch": 0.04026828062096549, + "grad_norm": 15.68861715413, + "learning_rate": 4.026751002155389e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.41034482717514037, + "step": 39980 + }, + { + "epoch": 0.040273316674069665, + "grad_norm": 13.350757706504654, + "learning_rate": 4.027254597828496e-05, + "loss": 2.6966, + "mean_token_accuracy": 0.35341801941394807, + "step": 39985 + }, + { + "epoch": 0.04027835272717384, + "grad_norm": 14.178444111448552, + "learning_rate": 4.027758193501602e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.42262552976608275, + "step": 39990 + }, + { + "epoch": 0.04028338878027801, + "grad_norm": 15.074160570797384, + "learning_rate": 4.028261789174708e-05, + "loss": 2.6751, + "mean_token_accuracy": 0.4344827651977539, + "step": 39995 + }, + { + "epoch": 0.040288424833382186, + "grad_norm": 13.633886330334724, + "learning_rate": 4.0287653848478137e-05, + "loss": 2.7902, + "mean_token_accuracy": 0.35172412991523744, + "step": 40000 + }, + { + "epoch": 0.04029346088648635, + "grad_norm": 15.819527152883886, + "learning_rate": 4.0292689805209196e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.4172413766384125, + "step": 40005 + }, + { + "epoch": 0.04029849693959053, + "grad_norm": 14.588128465895133, + "learning_rate": 4.0297725761940255e-05, + "loss": 2.8978, + "mean_token_accuracy": 0.3655172288417816, + "step": 40010 + }, + { + "epoch": 0.0403035329926947, + "grad_norm": 19.418776498569628, + "learning_rate": 4.0302761718671314e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.41034482419490814, + "step": 40015 + }, + { + "epoch": 0.040308569045798874, + "grad_norm": 14.371097075714992, + "learning_rate": 4.0307797675402374e-05, + "loss": 2.7407, + "mean_token_accuracy": 0.341379314661026, + "step": 40020 + }, + { + "epoch": 0.04031360509890305, + "grad_norm": 17.00403547905162, + "learning_rate": 4.031283363213343e-05, + "loss": 2.4783, + "mean_token_accuracy": 0.3896551728248596, + "step": 40025 + }, + { + "epoch": 0.04031864115200722, + "grad_norm": 12.852021026378056, + "learning_rate": 4.031786958886449e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.4586206912994385, + "step": 40030 + }, + { + "epoch": 0.040323677205111395, + "grad_norm": 15.734888043913397, + "learning_rate": 4.032290554559555e-05, + "loss": 2.6248, + "mean_token_accuracy": 0.3620689630508423, + "step": 40035 + }, + { + "epoch": 0.04032871325821556, + "grad_norm": 15.702209025185407, + "learning_rate": 4.032794150232662e-05, + "loss": 2.6463, + "mean_token_accuracy": 0.38620689511299133, + "step": 40040 + }, + { + "epoch": 0.040333749311319736, + "grad_norm": 14.990376540388237, + "learning_rate": 4.0332977459057677e-05, + "loss": 2.4319, + "mean_token_accuracy": 0.41034482717514037, + "step": 40045 + }, + { + "epoch": 0.04033878536442391, + "grad_norm": 19.444441096499443, + "learning_rate": 4.0338013415788736e-05, + "loss": 2.4033, + "mean_token_accuracy": 0.4620689541101456, + "step": 40050 + }, + { + "epoch": 0.040343821417528083, + "grad_norm": 13.55350872524074, + "learning_rate": 4.0343049372519795e-05, + "loss": 2.5339, + "mean_token_accuracy": 0.37241379618644715, + "step": 40055 + }, + { + "epoch": 0.04034885747063226, + "grad_norm": 12.60640981805468, + "learning_rate": 4.034808532925085e-05, + "loss": 2.8221, + "mean_token_accuracy": 0.35862069129943847, + "step": 40060 + }, + { + "epoch": 0.04035389352373643, + "grad_norm": 15.14327400828799, + "learning_rate": 4.0353121285981914e-05, + "loss": 2.255, + "mean_token_accuracy": 0.4264367878437042, + "step": 40065 + }, + { + "epoch": 0.040358929576840605, + "grad_norm": 14.102725519669287, + "learning_rate": 4.035815724271297e-05, + "loss": 2.7173, + "mean_token_accuracy": 0.3482758581638336, + "step": 40070 + }, + { + "epoch": 0.04036396562994477, + "grad_norm": 15.683997247331774, + "learning_rate": 4.036319319944403e-05, + "loss": 2.55, + "mean_token_accuracy": 0.41034482717514037, + "step": 40075 + }, + { + "epoch": 0.040369001683048945, + "grad_norm": 25.78434112953753, + "learning_rate": 4.036822915617509e-05, + "loss": 2.9449, + "mean_token_accuracy": 0.3551724076271057, + "step": 40080 + }, + { + "epoch": 0.04037403773615312, + "grad_norm": 12.603977641579434, + "learning_rate": 4.037326511290615e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4206896543502808, + "step": 40085 + }, + { + "epoch": 0.04037907378925729, + "grad_norm": 13.792403791601364, + "learning_rate": 4.037830106963722e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.4344827592372894, + "step": 40090 + }, + { + "epoch": 0.04038410984236147, + "grad_norm": 13.97060711966777, + "learning_rate": 4.0383337026368276e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.4344827592372894, + "step": 40095 + }, + { + "epoch": 0.04038914589546564, + "grad_norm": 10.193224227996545, + "learning_rate": 4.0388372983099335e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.40344826579093934, + "step": 40100 + }, + { + "epoch": 0.040394181948569814, + "grad_norm": 14.97493455306575, + "learning_rate": 4.039340893983039e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.3758620619773865, + "step": 40105 + }, + { + "epoch": 0.04039921800167398, + "grad_norm": 14.145988218176424, + "learning_rate": 4.039844489656145e-05, + "loss": 2.6482, + "mean_token_accuracy": 0.41379311084747317, + "step": 40110 + }, + { + "epoch": 0.040404254054778155, + "grad_norm": 13.204790304664215, + "learning_rate": 4.0403480853292506e-05, + "loss": 2.8498, + "mean_token_accuracy": 0.3413793116807938, + "step": 40115 + }, + { + "epoch": 0.04040929010788233, + "grad_norm": 15.070572364371507, + "learning_rate": 4.040851681002357e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.4344827592372894, + "step": 40120 + }, + { + "epoch": 0.0404143261609865, + "grad_norm": 12.063254560680296, + "learning_rate": 4.041355276675463e-05, + "loss": 2.3117, + "mean_token_accuracy": 0.42413792610168455, + "step": 40125 + }, + { + "epoch": 0.040419362214090676, + "grad_norm": 12.057546790790164, + "learning_rate": 4.041858872348569e-05, + "loss": 2.1039, + "mean_token_accuracy": 0.4793103516101837, + "step": 40130 + }, + { + "epoch": 0.04042439826719485, + "grad_norm": 14.310705467692763, + "learning_rate": 4.042362468021675e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.4068965554237366, + "step": 40135 + }, + { + "epoch": 0.040429434320299024, + "grad_norm": 14.819268658965951, + "learning_rate": 4.042866063694781e-05, + "loss": 2.1075, + "mean_token_accuracy": 0.458620685338974, + "step": 40140 + }, + { + "epoch": 0.04043447037340319, + "grad_norm": 14.754042453235211, + "learning_rate": 4.043369659367887e-05, + "loss": 2.59, + "mean_token_accuracy": 0.4434361755847931, + "step": 40145 + }, + { + "epoch": 0.040439506426507364, + "grad_norm": 14.815716888624625, + "learning_rate": 4.043873255040993e-05, + "loss": 2.3313, + "mean_token_accuracy": 0.43793103098869324, + "step": 40150 + }, + { + "epoch": 0.04044454247961154, + "grad_norm": 10.111888274307887, + "learning_rate": 4.044376850714099e-05, + "loss": 2.0824, + "mean_token_accuracy": 0.48965516686439514, + "step": 40155 + }, + { + "epoch": 0.04044957853271571, + "grad_norm": 16.698105248302497, + "learning_rate": 4.0448804463872046e-05, + "loss": 2.7159, + "mean_token_accuracy": 0.3999999940395355, + "step": 40160 + }, + { + "epoch": 0.040454614585819886, + "grad_norm": 15.605077112515527, + "learning_rate": 4.0453840420603105e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.47426108121871946, + "step": 40165 + }, + { + "epoch": 0.04045965063892406, + "grad_norm": 18.035391667495215, + "learning_rate": 4.045887637733417e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.4434361755847931, + "step": 40170 + }, + { + "epoch": 0.04046468669202823, + "grad_norm": 14.36820091883167, + "learning_rate": 4.046391233406523e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.3914095640182495, + "step": 40175 + }, + { + "epoch": 0.0404697227451324, + "grad_norm": 11.573553617957032, + "learning_rate": 4.046894829079629e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.4817906737327576, + "step": 40180 + }, + { + "epoch": 0.040474758798236574, + "grad_norm": 15.42526592277216, + "learning_rate": 4.047398424752735e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.4137930989265442, + "step": 40185 + }, + { + "epoch": 0.04047979485134075, + "grad_norm": 13.493049585503737, + "learning_rate": 4.047902020425841e-05, + "loss": 2.801, + "mean_token_accuracy": 0.38275861740112305, + "step": 40190 + }, + { + "epoch": 0.04048483090444492, + "grad_norm": 11.497841807995542, + "learning_rate": 4.048405616098946e-05, + "loss": 2.7018, + "mean_token_accuracy": 0.43103448748588563, + "step": 40195 + }, + { + "epoch": 0.040489866957549095, + "grad_norm": 12.361256404549827, + "learning_rate": 4.048909211772053e-05, + "loss": 2.5215, + "mean_token_accuracy": 0.4068965494632721, + "step": 40200 + }, + { + "epoch": 0.04049490301065327, + "grad_norm": 19.932367522102613, + "learning_rate": 4.0494128074451586e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.4224440395832062, + "step": 40205 + }, + { + "epoch": 0.04049993906375744, + "grad_norm": 12.405396549533842, + "learning_rate": 4.0499164031182645e-05, + "loss": 2.2776, + "mean_token_accuracy": 0.43103447556495667, + "step": 40210 + }, + { + "epoch": 0.04050497511686161, + "grad_norm": 15.536607853808702, + "learning_rate": 4.0504199987913705e-05, + "loss": 2.8305, + "mean_token_accuracy": 0.4082879602909088, + "step": 40215 + }, + { + "epoch": 0.04051001116996578, + "grad_norm": 15.542400979739453, + "learning_rate": 4.0509235944644764e-05, + "loss": 2.619, + "mean_token_accuracy": 0.37931033968925476, + "step": 40220 + }, + { + "epoch": 0.04051504722306996, + "grad_norm": 13.723849518378993, + "learning_rate": 4.051427190137583e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.4137930989265442, + "step": 40225 + }, + { + "epoch": 0.04052008327617413, + "grad_norm": 12.61960229675886, + "learning_rate": 4.051930785810689e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.42758620381355283, + "step": 40230 + }, + { + "epoch": 0.040525119329278304, + "grad_norm": 14.500142262754462, + "learning_rate": 4.052434381483794e-05, + "loss": 2.174, + "mean_token_accuracy": 0.4849364757537842, + "step": 40235 + }, + { + "epoch": 0.04053015538238248, + "grad_norm": 13.429478477699076, + "learning_rate": 4.0529379771569e-05, + "loss": 2.1939, + "mean_token_accuracy": 0.4758620738983154, + "step": 40240 + }, + { + "epoch": 0.04053519143548665, + "grad_norm": 14.643173046804844, + "learning_rate": 4.053441572830006e-05, + "loss": 2.5536, + "mean_token_accuracy": 0.42758620977401735, + "step": 40245 + }, + { + "epoch": 0.04054022748859082, + "grad_norm": 13.157804366559992, + "learning_rate": 4.0539451685031126e-05, + "loss": 2.7284, + "mean_token_accuracy": 0.3947973370552063, + "step": 40250 + }, + { + "epoch": 0.04054526354169499, + "grad_norm": 17.95852209582341, + "learning_rate": 4.0544487641762186e-05, + "loss": 3.0836, + "mean_token_accuracy": 0.3655172437429428, + "step": 40255 + }, + { + "epoch": 0.040550299594799166, + "grad_norm": 13.817097223215429, + "learning_rate": 4.0549523598493245e-05, + "loss": 2.6153, + "mean_token_accuracy": 0.42068966031074523, + "step": 40260 + }, + { + "epoch": 0.04055533564790334, + "grad_norm": 12.605436879555116, + "learning_rate": 4.0554559555224304e-05, + "loss": 3.0235, + "mean_token_accuracy": 0.35862069129943847, + "step": 40265 + }, + { + "epoch": 0.040560371701007514, + "grad_norm": 15.177462165015818, + "learning_rate": 4.055959551195536e-05, + "loss": 2.8487, + "mean_token_accuracy": 0.35862069129943847, + "step": 40270 + }, + { + "epoch": 0.04056540775411169, + "grad_norm": 14.111872153324049, + "learning_rate": 4.056463146868642e-05, + "loss": 2.2194, + "mean_token_accuracy": 0.45668481588363646, + "step": 40275 + }, + { + "epoch": 0.04057044380721586, + "grad_norm": 15.197010766130397, + "learning_rate": 4.056966742541748e-05, + "loss": 2.596, + "mean_token_accuracy": 0.3551724076271057, + "step": 40280 + }, + { + "epoch": 0.04057547986032003, + "grad_norm": 12.49697646389517, + "learning_rate": 4.057470338214854e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.441379314661026, + "step": 40285 + }, + { + "epoch": 0.0405805159134242, + "grad_norm": 13.131840291435024, + "learning_rate": 4.05797393388796e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.4673926174640656, + "step": 40290 + }, + { + "epoch": 0.040585551966528376, + "grad_norm": 14.32031273790913, + "learning_rate": 4.058477529561066e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.3896551728248596, + "step": 40295 + }, + { + "epoch": 0.04059058801963255, + "grad_norm": 16.39867204657384, + "learning_rate": 4.058981125234172e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.39655172228813174, + "step": 40300 + }, + { + "epoch": 0.04059562407273672, + "grad_norm": 12.401328280417237, + "learning_rate": 4.0594847209072785e-05, + "loss": 2.7137, + "mean_token_accuracy": 0.36896551847457887, + "step": 40305 + }, + { + "epoch": 0.0406006601258409, + "grad_norm": 12.979115967932202, + "learning_rate": 4.0599883165803844e-05, + "loss": 2.6952, + "mean_token_accuracy": 0.3793103456497192, + "step": 40310 + }, + { + "epoch": 0.04060569617894507, + "grad_norm": 16.484774699584264, + "learning_rate": 4.06049191225349e-05, + "loss": 2.6009, + "mean_token_accuracy": 0.391349059343338, + "step": 40315 + }, + { + "epoch": 0.04061073223204924, + "grad_norm": 12.495178891047514, + "learning_rate": 4.060995507926596e-05, + "loss": 2.609, + "mean_token_accuracy": 0.4137930989265442, + "step": 40320 + }, + { + "epoch": 0.04061576828515341, + "grad_norm": 18.98296185885065, + "learning_rate": 4.061499103599702e-05, + "loss": 2.7579, + "mean_token_accuracy": 0.37241379618644715, + "step": 40325 + }, + { + "epoch": 0.040620804338257585, + "grad_norm": 14.085628153444143, + "learning_rate": 4.062002699272808e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.4275862157344818, + "step": 40330 + }, + { + "epoch": 0.04062584039136176, + "grad_norm": 21.086182489396823, + "learning_rate": 4.062506294945914e-05, + "loss": 2.8383, + "mean_token_accuracy": 0.3275862097740173, + "step": 40335 + }, + { + "epoch": 0.04063087644446593, + "grad_norm": 14.218072081541653, + "learning_rate": 4.06300989061902e-05, + "loss": 2.8979, + "mean_token_accuracy": 0.31379309892654417, + "step": 40340 + }, + { + "epoch": 0.040635912497570106, + "grad_norm": 13.484947423457687, + "learning_rate": 4.063513486292126e-05, + "loss": 2.6697, + "mean_token_accuracy": 0.4301875352859497, + "step": 40345 + }, + { + "epoch": 0.04064094855067428, + "grad_norm": 12.268786992553803, + "learning_rate": 4.064017081965232e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.43509851694107055, + "step": 40350 + }, + { + "epoch": 0.04064598460377845, + "grad_norm": 16.521492441440987, + "learning_rate": 4.064520677638338e-05, + "loss": 3.0524, + "mean_token_accuracy": 0.36896551847457887, + "step": 40355 + }, + { + "epoch": 0.04065102065688262, + "grad_norm": 13.281052066680923, + "learning_rate": 4.0650242733114443e-05, + "loss": 2.4557, + "mean_token_accuracy": 0.39655172228813174, + "step": 40360 + }, + { + "epoch": 0.040656056709986794, + "grad_norm": 11.180764094050256, + "learning_rate": 4.06552786898455e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.4156079888343811, + "step": 40365 + }, + { + "epoch": 0.04066109276309097, + "grad_norm": 13.909859648467254, + "learning_rate": 4.0660314646576555e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.4103448212146759, + "step": 40370 + }, + { + "epoch": 0.04066612881619514, + "grad_norm": 14.810362550633833, + "learning_rate": 4.0665350603307614e-05, + "loss": 2.8278, + "mean_token_accuracy": 0.3896551728248596, + "step": 40375 + }, + { + "epoch": 0.040671164869299316, + "grad_norm": 12.80696021388534, + "learning_rate": 4.0670386560038674e-05, + "loss": 2.9772, + "mean_token_accuracy": 0.38620689809322356, + "step": 40380 + }, + { + "epoch": 0.04067620092240349, + "grad_norm": 13.176419183861915, + "learning_rate": 4.067542251676974e-05, + "loss": 3.111, + "mean_token_accuracy": 0.26896551847457884, + "step": 40385 + }, + { + "epoch": 0.040681236975507656, + "grad_norm": 11.698790130399717, + "learning_rate": 4.06804584735008e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.3813672125339508, + "step": 40390 + }, + { + "epoch": 0.04068627302861183, + "grad_norm": 17.973211855135062, + "learning_rate": 4.068549443023186e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.4517241418361664, + "step": 40395 + }, + { + "epoch": 0.040691309081716004, + "grad_norm": 12.221812011902507, + "learning_rate": 4.069053038696292e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.3931034505367279, + "step": 40400 + }, + { + "epoch": 0.04069634513482018, + "grad_norm": 13.467318321528381, + "learning_rate": 4.069556634369398e-05, + "loss": 2.8584, + "mean_token_accuracy": 0.37241379022598264, + "step": 40405 + }, + { + "epoch": 0.04070138118792435, + "grad_norm": 13.343336926680532, + "learning_rate": 4.0700602300425036e-05, + "loss": 2.9105, + "mean_token_accuracy": 0.3551724076271057, + "step": 40410 + }, + { + "epoch": 0.040706417241028525, + "grad_norm": 14.315625409395771, + "learning_rate": 4.0705638257156095e-05, + "loss": 2.938, + "mean_token_accuracy": 0.33793103098869326, + "step": 40415 + }, + { + "epoch": 0.0407114532941327, + "grad_norm": 13.184877751042649, + "learning_rate": 4.0710674213887154e-05, + "loss": 2.4272, + "mean_token_accuracy": 0.3709013909101486, + "step": 40420 + }, + { + "epoch": 0.040716489347236866, + "grad_norm": 17.975178885215517, + "learning_rate": 4.0715710170618214e-05, + "loss": 2.7943, + "mean_token_accuracy": 0.3862069010734558, + "step": 40425 + }, + { + "epoch": 0.04072152540034104, + "grad_norm": 12.249754702959212, + "learning_rate": 4.072074612734927e-05, + "loss": 2.7838, + "mean_token_accuracy": 0.3827586203813553, + "step": 40430 + }, + { + "epoch": 0.04072656145344521, + "grad_norm": 13.972745253645067, + "learning_rate": 4.072578208408033e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.4034482717514038, + "step": 40435 + }, + { + "epoch": 0.04073159750654939, + "grad_norm": 12.950327748990452, + "learning_rate": 4.07308180408114e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.4034482777118683, + "step": 40440 + }, + { + "epoch": 0.04073663355965356, + "grad_norm": 18.68515683284272, + "learning_rate": 4.073585399754246e-05, + "loss": 2.8108, + "mean_token_accuracy": 0.3793103456497192, + "step": 40445 + }, + { + "epoch": 0.040741669612757735, + "grad_norm": 14.957861412946045, + "learning_rate": 4.074088995427352e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.3793103456497192, + "step": 40450 + }, + { + "epoch": 0.04074670566586191, + "grad_norm": 13.415675856912834, + "learning_rate": 4.0745925911004576e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.41379311084747317, + "step": 40455 + }, + { + "epoch": 0.040751741718966075, + "grad_norm": 13.92069037022313, + "learning_rate": 4.0750961867735635e-05, + "loss": 2.2323, + "mean_token_accuracy": 0.46551724076271056, + "step": 40460 + }, + { + "epoch": 0.04075677777207025, + "grad_norm": 14.182163601131016, + "learning_rate": 4.0755997824466694e-05, + "loss": 2.8331, + "mean_token_accuracy": 0.3758620619773865, + "step": 40465 + }, + { + "epoch": 0.04076181382517442, + "grad_norm": 17.17798866693746, + "learning_rate": 4.0761033781197754e-05, + "loss": 2.8712, + "mean_token_accuracy": 0.3758620619773865, + "step": 40470 + }, + { + "epoch": 0.040766849878278597, + "grad_norm": 15.737960040166513, + "learning_rate": 4.076606973792881e-05, + "loss": 2.4134, + "mean_token_accuracy": 0.47697044014930723, + "step": 40475 + }, + { + "epoch": 0.04077188593138277, + "grad_norm": 25.74996968372903, + "learning_rate": 4.077110569465987e-05, + "loss": 2.6363, + "mean_token_accuracy": 0.3896551728248596, + "step": 40480 + }, + { + "epoch": 0.040776921984486944, + "grad_norm": 10.958362938132924, + "learning_rate": 4.077614165139093e-05, + "loss": 2.6637, + "mean_token_accuracy": 0.3793103456497192, + "step": 40485 + }, + { + "epoch": 0.04078195803759112, + "grad_norm": 13.927932063847432, + "learning_rate": 4.0781177608122e-05, + "loss": 2.2969, + "mean_token_accuracy": 0.4517241358757019, + "step": 40490 + }, + { + "epoch": 0.040786994090695285, + "grad_norm": 14.282961590902799, + "learning_rate": 4.078621356485306e-05, + "loss": 2.9345, + "mean_token_accuracy": 0.37586206793785093, + "step": 40495 + }, + { + "epoch": 0.04079203014379946, + "grad_norm": 14.193602746174795, + "learning_rate": 4.0791249521584116e-05, + "loss": 2.6052, + "mean_token_accuracy": 0.4103448212146759, + "step": 40500 + }, + { + "epoch": 0.04079706619690363, + "grad_norm": 15.185313988176489, + "learning_rate": 4.079628547831517e-05, + "loss": 2.386, + "mean_token_accuracy": 0.41724138259887694, + "step": 40505 + }, + { + "epoch": 0.040802102250007806, + "grad_norm": 13.097351522885797, + "learning_rate": 4.080132143504623e-05, + "loss": 2.8522, + "mean_token_accuracy": 0.358620685338974, + "step": 40510 + }, + { + "epoch": 0.04080713830311198, + "grad_norm": 16.253261592358637, + "learning_rate": 4.0806357391777294e-05, + "loss": 2.718, + "mean_token_accuracy": 0.3999999940395355, + "step": 40515 + }, + { + "epoch": 0.040812174356216153, + "grad_norm": 12.998398279697463, + "learning_rate": 4.081139334850835e-05, + "loss": 2.8613, + "mean_token_accuracy": 0.41034482717514037, + "step": 40520 + }, + { + "epoch": 0.04081721040932033, + "grad_norm": 15.977455032235298, + "learning_rate": 4.081642930523941e-05, + "loss": 2.8762, + "mean_token_accuracy": 0.358620685338974, + "step": 40525 + }, + { + "epoch": 0.040822246462424494, + "grad_norm": 13.013625306199334, + "learning_rate": 4.082146526197047e-05, + "loss": 2.3466, + "mean_token_accuracy": 0.43103447556495667, + "step": 40530 + }, + { + "epoch": 0.04082728251552867, + "grad_norm": 13.016290280937307, + "learning_rate": 4.082650121870153e-05, + "loss": 2.6598, + "mean_token_accuracy": 0.4000000059604645, + "step": 40535 + }, + { + "epoch": 0.04083231856863284, + "grad_norm": 12.037041020479435, + "learning_rate": 4.083153717543259e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.41034482717514037, + "step": 40540 + }, + { + "epoch": 0.040837354621737015, + "grad_norm": 13.441339310365143, + "learning_rate": 4.083657313216365e-05, + "loss": 2.6254, + "mean_token_accuracy": 0.40689654648303986, + "step": 40545 + }, + { + "epoch": 0.04084239067484119, + "grad_norm": 13.127658224034365, + "learning_rate": 4.084160908889471e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.44827585220336913, + "step": 40550 + }, + { + "epoch": 0.04084742672794536, + "grad_norm": 15.579187000630988, + "learning_rate": 4.084664504562577e-05, + "loss": 2.6314, + "mean_token_accuracy": 0.34137930870056155, + "step": 40555 + }, + { + "epoch": 0.04085246278104954, + "grad_norm": 13.347342572182912, + "learning_rate": 4.085168100235683e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.43103447556495667, + "step": 40560 + }, + { + "epoch": 0.040857498834153703, + "grad_norm": 9.81468321774655, + "learning_rate": 4.0856716959087886e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.4384236454963684, + "step": 40565 + }, + { + "epoch": 0.04086253488725788, + "grad_norm": 14.268267627089497, + "learning_rate": 4.086175291581895e-05, + "loss": 2.4621, + "mean_token_accuracy": 0.42758620381355283, + "step": 40570 + }, + { + "epoch": 0.04086757094036205, + "grad_norm": 11.908638131299766, + "learning_rate": 4.086678887255001e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.4413793087005615, + "step": 40575 + }, + { + "epoch": 0.040872606993466225, + "grad_norm": 23.1438661088492, + "learning_rate": 4.087182482928107e-05, + "loss": 2.3299, + "mean_token_accuracy": 0.42268602550029755, + "step": 40580 + }, + { + "epoch": 0.0408776430465704, + "grad_norm": 12.996090611875152, + "learning_rate": 4.087686078601213e-05, + "loss": 2.36, + "mean_token_accuracy": 0.4206896543502808, + "step": 40585 + }, + { + "epoch": 0.04088267909967457, + "grad_norm": 12.399271185288155, + "learning_rate": 4.088189674274319e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.3793103456497192, + "step": 40590 + }, + { + "epoch": 0.040887715152778746, + "grad_norm": 12.832666503248527, + "learning_rate": 4.088693269947425e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.441379314661026, + "step": 40595 + }, + { + "epoch": 0.04089275120588291, + "grad_norm": 18.18384056212232, + "learning_rate": 4.089196865620531e-05, + "loss": 2.5061, + "mean_token_accuracy": 0.4156079888343811, + "step": 40600 + }, + { + "epoch": 0.04089778725898709, + "grad_norm": 16.654318686521577, + "learning_rate": 4.089700461293637e-05, + "loss": 2.672, + "mean_token_accuracy": 0.3931034505367279, + "step": 40605 + }, + { + "epoch": 0.04090282331209126, + "grad_norm": 16.24392108644982, + "learning_rate": 4.0902040569667426e-05, + "loss": 2.6491, + "mean_token_accuracy": 0.3620689630508423, + "step": 40610 + }, + { + "epoch": 0.040907859365195434, + "grad_norm": 12.401222077953115, + "learning_rate": 4.0907076526398486e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.4551724135875702, + "step": 40615 + }, + { + "epoch": 0.04091289541829961, + "grad_norm": 15.732593651233447, + "learning_rate": 4.0912112483129545e-05, + "loss": 2.648, + "mean_token_accuracy": 0.3896551787853241, + "step": 40620 + }, + { + "epoch": 0.04091793147140378, + "grad_norm": 13.646305694808373, + "learning_rate": 4.091714843986061e-05, + "loss": 2.8443, + "mean_token_accuracy": 0.3620689630508423, + "step": 40625 + }, + { + "epoch": 0.040922967524507955, + "grad_norm": 11.843081124586387, + "learning_rate": 4.092218439659167e-05, + "loss": 2.5841, + "mean_token_accuracy": 0.42413793206214906, + "step": 40630 + }, + { + "epoch": 0.04092800357761212, + "grad_norm": 12.985064077592844, + "learning_rate": 4.092722035332273e-05, + "loss": 2.5685, + "mean_token_accuracy": 0.3999999940395355, + "step": 40635 + }, + { + "epoch": 0.040933039630716296, + "grad_norm": 16.936283220069605, + "learning_rate": 4.093225631005378e-05, + "loss": 2.9723, + "mean_token_accuracy": 0.324137932062149, + "step": 40640 + }, + { + "epoch": 0.04093807568382047, + "grad_norm": 13.206424090552508, + "learning_rate": 4.093729226678484e-05, + "loss": 2.2142, + "mean_token_accuracy": 0.4517241358757019, + "step": 40645 + }, + { + "epoch": 0.040943111736924644, + "grad_norm": 13.885542152833098, + "learning_rate": 4.094232822351591e-05, + "loss": 2.6863, + "mean_token_accuracy": 0.34137931764125823, + "step": 40650 + }, + { + "epoch": 0.04094814779002882, + "grad_norm": 11.657564970807094, + "learning_rate": 4.0947364180246966e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.42068966031074523, + "step": 40655 + }, + { + "epoch": 0.04095318384313299, + "grad_norm": 14.127762236333332, + "learning_rate": 4.0952400136978026e-05, + "loss": 3.0577, + "mean_token_accuracy": 0.358620685338974, + "step": 40660 + }, + { + "epoch": 0.040958219896237165, + "grad_norm": 13.982090814797386, + "learning_rate": 4.0957436093709085e-05, + "loss": 2.3375, + "mean_token_accuracy": 0.38275861740112305, + "step": 40665 + }, + { + "epoch": 0.04096325594934133, + "grad_norm": 15.156656006116709, + "learning_rate": 4.0962472050440144e-05, + "loss": 2.4323, + "mean_token_accuracy": 0.42068964838981626, + "step": 40670 + }, + { + "epoch": 0.040968292002445506, + "grad_norm": 14.810746183551506, + "learning_rate": 4.096750800717121e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.4103448331356049, + "step": 40675 + }, + { + "epoch": 0.04097332805554968, + "grad_norm": 16.328588017026107, + "learning_rate": 4.097254396390226e-05, + "loss": 2.4857, + "mean_token_accuracy": 0.4206896543502808, + "step": 40680 + }, + { + "epoch": 0.04097836410865385, + "grad_norm": 15.497218747085839, + "learning_rate": 4.097757992063332e-05, + "loss": 2.9045, + "mean_token_accuracy": 0.37241379022598264, + "step": 40685 + }, + { + "epoch": 0.04098340016175803, + "grad_norm": 15.320312370900012, + "learning_rate": 4.098261587736438e-05, + "loss": 2.3623, + "mean_token_accuracy": 0.441379314661026, + "step": 40690 + }, + { + "epoch": 0.0409884362148622, + "grad_norm": 12.993975810623441, + "learning_rate": 4.098765183409544e-05, + "loss": 2.4172, + "mean_token_accuracy": 0.4034482717514038, + "step": 40695 + }, + { + "epoch": 0.040993472267966374, + "grad_norm": 18.559909339732226, + "learning_rate": 4.09926877908265e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.4350985288619995, + "step": 40700 + }, + { + "epoch": 0.04099850832107054, + "grad_norm": 16.04216373848578, + "learning_rate": 4.0997723747557566e-05, + "loss": 3.0837, + "mean_token_accuracy": 0.3551724076271057, + "step": 40705 + }, + { + "epoch": 0.041003544374174715, + "grad_norm": 14.55733688816893, + "learning_rate": 4.1002759704288625e-05, + "loss": 2.39, + "mean_token_accuracy": 0.4310344815254211, + "step": 40710 + }, + { + "epoch": 0.04100858042727889, + "grad_norm": 14.673416038199425, + "learning_rate": 4.1007795661019684e-05, + "loss": 2.7999, + "mean_token_accuracy": 0.4312807857990265, + "step": 40715 + }, + { + "epoch": 0.04101361648038306, + "grad_norm": 13.552807215623094, + "learning_rate": 4.1012831617750743e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.45160314440727234, + "step": 40720 + }, + { + "epoch": 0.041018652533487236, + "grad_norm": 14.282672312221619, + "learning_rate": 4.10178675744818e-05, + "loss": 2.7927, + "mean_token_accuracy": 0.3896551728248596, + "step": 40725 + }, + { + "epoch": 0.04102368858659141, + "grad_norm": 13.053871180716206, + "learning_rate": 4.102290353121286e-05, + "loss": 2.6912, + "mean_token_accuracy": 0.4034482717514038, + "step": 40730 + }, + { + "epoch": 0.041028724639695584, + "grad_norm": 19.232153481280147, + "learning_rate": 4.102793948794392e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.3965517282485962, + "step": 40735 + }, + { + "epoch": 0.04103376069279975, + "grad_norm": 14.51641334838814, + "learning_rate": 4.103297544467498e-05, + "loss": 2.5545, + "mean_token_accuracy": 0.4068965554237366, + "step": 40740 + }, + { + "epoch": 0.041038796745903924, + "grad_norm": 11.484792457384179, + "learning_rate": 4.103801140140604e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.43103448748588563, + "step": 40745 + }, + { + "epoch": 0.0410438327990081, + "grad_norm": 18.21319086556733, + "learning_rate": 4.10430473581371e-05, + "loss": 2.9283, + "mean_token_accuracy": 0.39776164293289185, + "step": 40750 + }, + { + "epoch": 0.04104886885211227, + "grad_norm": 13.126895551105354, + "learning_rate": 4.1048083314868165e-05, + "loss": 2.665, + "mean_token_accuracy": 0.4034482777118683, + "step": 40755 + }, + { + "epoch": 0.041053904905216446, + "grad_norm": 14.165312620519686, + "learning_rate": 4.1053119271599224e-05, + "loss": 2.8806, + "mean_token_accuracy": 0.3620689630508423, + "step": 40760 + }, + { + "epoch": 0.04105894095832062, + "grad_norm": 12.450884132452959, + "learning_rate": 4.1058155228330284e-05, + "loss": 2.6023, + "mean_token_accuracy": 0.3999999940395355, + "step": 40765 + }, + { + "epoch": 0.04106397701142479, + "grad_norm": 11.562451201782611, + "learning_rate": 4.1063191185061336e-05, + "loss": 2.7228, + "mean_token_accuracy": 0.3655172407627106, + "step": 40770 + }, + { + "epoch": 0.04106901306452896, + "grad_norm": 12.769946615354918, + "learning_rate": 4.1068227141792395e-05, + "loss": 2.4852, + "mean_token_accuracy": 0.4137930989265442, + "step": 40775 + }, + { + "epoch": 0.041074049117633134, + "grad_norm": 12.795519257355705, + "learning_rate": 4.1073263098523455e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.37586206793785093, + "step": 40780 + }, + { + "epoch": 0.04107908517073731, + "grad_norm": 16.381124918924257, + "learning_rate": 4.107829905525452e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4551724135875702, + "step": 40785 + }, + { + "epoch": 0.04108412122384148, + "grad_norm": 20.371574404168975, + "learning_rate": 4.108333501198558e-05, + "loss": 2.8692, + "mean_token_accuracy": 0.3517241358757019, + "step": 40790 + }, + { + "epoch": 0.041089157276945655, + "grad_norm": 16.14120071078137, + "learning_rate": 4.108837096871664e-05, + "loss": 3.2097, + "mean_token_accuracy": 0.3394434332847595, + "step": 40795 + }, + { + "epoch": 0.04109419333004983, + "grad_norm": 13.784063571533569, + "learning_rate": 4.10934069254477e-05, + "loss": 2.7411, + "mean_token_accuracy": 0.3827586233615875, + "step": 40800 + }, + { + "epoch": 0.041099229383154, + "grad_norm": 14.729662024621888, + "learning_rate": 4.109844288217876e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.36551723778247835, + "step": 40805 + }, + { + "epoch": 0.04110426543625817, + "grad_norm": 15.303666740529385, + "learning_rate": 4.1103478838909824e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.3655172407627106, + "step": 40810 + }, + { + "epoch": 0.04110930148936234, + "grad_norm": 35.77075823880536, + "learning_rate": 4.1108514795640876e-05, + "loss": 2.8342, + "mean_token_accuracy": 0.4, + "step": 40815 + }, + { + "epoch": 0.04111433754246652, + "grad_norm": 14.994347527814286, + "learning_rate": 4.1113550752371935e-05, + "loss": 2.6448, + "mean_token_accuracy": 0.37241379618644715, + "step": 40820 + }, + { + "epoch": 0.04111937359557069, + "grad_norm": 15.673555873825235, + "learning_rate": 4.1118586709102995e-05, + "loss": 2.6984, + "mean_token_accuracy": 0.3896551728248596, + "step": 40825 + }, + { + "epoch": 0.041124409648674864, + "grad_norm": 14.019267891140933, + "learning_rate": 4.1123622665834054e-05, + "loss": 2.6319, + "mean_token_accuracy": 0.39310344457626345, + "step": 40830 + }, + { + "epoch": 0.04112944570177904, + "grad_norm": 14.15602331940618, + "learning_rate": 4.112865862256512e-05, + "loss": 2.5096, + "mean_token_accuracy": 0.4608590483665466, + "step": 40835 + }, + { + "epoch": 0.04113448175488321, + "grad_norm": 11.89139031935125, + "learning_rate": 4.113369457929618e-05, + "loss": 2.7402, + "mean_token_accuracy": 0.4241379231214523, + "step": 40840 + }, + { + "epoch": 0.04113951780798738, + "grad_norm": 23.57044647796897, + "learning_rate": 4.113873053602724e-05, + "loss": 2.6675, + "mean_token_accuracy": 0.3689655214548111, + "step": 40845 + }, + { + "epoch": 0.04114455386109155, + "grad_norm": 10.605627096867416, + "learning_rate": 4.11437664927583e-05, + "loss": 2.14, + "mean_token_accuracy": 0.4551724076271057, + "step": 40850 + }, + { + "epoch": 0.041149589914195726, + "grad_norm": 14.140173398304333, + "learning_rate": 4.114880244948936e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.45862067937850953, + "step": 40855 + }, + { + "epoch": 0.0411546259672999, + "grad_norm": 20.450035048048502, + "learning_rate": 4.1153838406220416e-05, + "loss": 2.6898, + "mean_token_accuracy": 0.37931033968925476, + "step": 40860 + }, + { + "epoch": 0.041159662020404074, + "grad_norm": 16.9297007223952, + "learning_rate": 4.1158874362951475e-05, + "loss": 2.9141, + "mean_token_accuracy": 0.3551724135875702, + "step": 40865 + }, + { + "epoch": 0.04116469807350825, + "grad_norm": 14.256559035071326, + "learning_rate": 4.1163910319682535e-05, + "loss": 2.5266, + "mean_token_accuracy": 0.4068965554237366, + "step": 40870 + }, + { + "epoch": 0.04116973412661242, + "grad_norm": 11.461617365367621, + "learning_rate": 4.1168946276413594e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.4379310429096222, + "step": 40875 + }, + { + "epoch": 0.04117477017971659, + "grad_norm": 24.32770304821939, + "learning_rate": 4.117398223314465e-05, + "loss": 2.8787, + "mean_token_accuracy": 0.42413793206214906, + "step": 40880 + }, + { + "epoch": 0.04117980623282076, + "grad_norm": 13.423902885597512, + "learning_rate": 4.117901818987571e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.3482758581638336, + "step": 40885 + }, + { + "epoch": 0.041184842285924936, + "grad_norm": 13.505336019905643, + "learning_rate": 4.118405414660678e-05, + "loss": 2.502, + "mean_token_accuracy": 0.36206896901130675, + "step": 40890 + }, + { + "epoch": 0.04118987833902911, + "grad_norm": 16.971950361154793, + "learning_rate": 4.118909010333784e-05, + "loss": 2.5578, + "mean_token_accuracy": 0.4103448212146759, + "step": 40895 + }, + { + "epoch": 0.04119491439213328, + "grad_norm": 20.999554845587554, + "learning_rate": 4.11941260600689e-05, + "loss": 3.1846, + "mean_token_accuracy": 0.3655172407627106, + "step": 40900 + }, + { + "epoch": 0.04119995044523746, + "grad_norm": 12.941858308243027, + "learning_rate": 4.119916201679995e-05, + "loss": 2.7599, + "mean_token_accuracy": 0.38275861740112305, + "step": 40905 + }, + { + "epoch": 0.04120498649834163, + "grad_norm": 15.099528176547238, + "learning_rate": 4.120419797353101e-05, + "loss": 2.7854, + "mean_token_accuracy": 0.42607380747795104, + "step": 40910 + }, + { + "epoch": 0.0412100225514458, + "grad_norm": 17.38236219110721, + "learning_rate": 4.1209233930262075e-05, + "loss": 2.7553, + "mean_token_accuracy": 0.3931034505367279, + "step": 40915 + }, + { + "epoch": 0.04121505860454997, + "grad_norm": 16.000960089504773, + "learning_rate": 4.1214269886993134e-05, + "loss": 2.9845, + "mean_token_accuracy": 0.2965517222881317, + "step": 40920 + }, + { + "epoch": 0.041220094657654145, + "grad_norm": 11.154266470026233, + "learning_rate": 4.121930584372419e-05, + "loss": 2.382, + "mean_token_accuracy": 0.42413792610168455, + "step": 40925 + }, + { + "epoch": 0.04122513071075832, + "grad_norm": 14.394508625650328, + "learning_rate": 4.122434180045525e-05, + "loss": 2.406, + "mean_token_accuracy": 0.43448275327682495, + "step": 40930 + }, + { + "epoch": 0.04123016676386249, + "grad_norm": 12.989327427270842, + "learning_rate": 4.122937775718631e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4379310369491577, + "step": 40935 + }, + { + "epoch": 0.041235202816966667, + "grad_norm": 13.531064565209123, + "learning_rate": 4.123441371391738e-05, + "loss": 2.484, + "mean_token_accuracy": 0.43266788125038147, + "step": 40940 + }, + { + "epoch": 0.04124023887007084, + "grad_norm": 14.223310015237725, + "learning_rate": 4.123944967064843e-05, + "loss": 2.8653, + "mean_token_accuracy": 0.3655172288417816, + "step": 40945 + }, + { + "epoch": 0.04124527492317501, + "grad_norm": 21.186694300253812, + "learning_rate": 4.124448562737949e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.4, + "step": 40950 + }, + { + "epoch": 0.04125031097627918, + "grad_norm": 12.444712112429578, + "learning_rate": 4.124952158411055e-05, + "loss": 2.6251, + "mean_token_accuracy": 0.37586206793785093, + "step": 40955 + }, + { + "epoch": 0.041255347029383355, + "grad_norm": 16.970539582878, + "learning_rate": 4.125455754084161e-05, + "loss": 2.2374, + "mean_token_accuracy": 0.42413793206214906, + "step": 40960 + }, + { + "epoch": 0.04126038308248753, + "grad_norm": 22.215357268305826, + "learning_rate": 4.125959349757267e-05, + "loss": 2.9667, + "mean_token_accuracy": 0.3655172407627106, + "step": 40965 + }, + { + "epoch": 0.0412654191355917, + "grad_norm": 14.829487396237798, + "learning_rate": 4.126462945430373e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.37241379022598264, + "step": 40970 + }, + { + "epoch": 0.041270455188695876, + "grad_norm": 15.993909160970029, + "learning_rate": 4.126966541103479e-05, + "loss": 2.405, + "mean_token_accuracy": 0.43103448748588563, + "step": 40975 + }, + { + "epoch": 0.04127549124180005, + "grad_norm": 16.908618265382852, + "learning_rate": 4.127470136776585e-05, + "loss": 2.9847, + "mean_token_accuracy": 0.3517241358757019, + "step": 40980 + }, + { + "epoch": 0.041280527294904217, + "grad_norm": 47.30137931843922, + "learning_rate": 4.127973732449691e-05, + "loss": 2.4514, + "mean_token_accuracy": 0.379310342669487, + "step": 40985 + }, + { + "epoch": 0.04128556334800839, + "grad_norm": 12.39812987397837, + "learning_rate": 4.128477328122797e-05, + "loss": 2.3535, + "mean_token_accuracy": 0.4465214729309082, + "step": 40990 + }, + { + "epoch": 0.041290599401112564, + "grad_norm": 13.83483188648493, + "learning_rate": 4.128980923795903e-05, + "loss": 2.6022, + "mean_token_accuracy": 0.3620689630508423, + "step": 40995 + }, + { + "epoch": 0.04129563545421674, + "grad_norm": 19.701850648418716, + "learning_rate": 4.129484519469009e-05, + "loss": 2.4212, + "mean_token_accuracy": 0.4103448212146759, + "step": 41000 + }, + { + "epoch": 0.04130067150732091, + "grad_norm": 13.143117623214184, + "learning_rate": 4.129988115142115e-05, + "loss": 2.6967, + "mean_token_accuracy": 0.42413792610168455, + "step": 41005 + }, + { + "epoch": 0.041305707560425085, + "grad_norm": 13.032528668268462, + "learning_rate": 4.130491710815221e-05, + "loss": 2.5074, + "mean_token_accuracy": 0.39443435668945315, + "step": 41010 + }, + { + "epoch": 0.04131074361352926, + "grad_norm": 15.299685760033984, + "learning_rate": 4.1309953064883267e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.3965517282485962, + "step": 41015 + }, + { + "epoch": 0.041315779666633426, + "grad_norm": 15.801225287153569, + "learning_rate": 4.131498902161433e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.42413793206214906, + "step": 41020 + }, + { + "epoch": 0.0413208157197376, + "grad_norm": 18.559612280637484, + "learning_rate": 4.132002497834539e-05, + "loss": 2.144, + "mean_token_accuracy": 0.458620673418045, + "step": 41025 + }, + { + "epoch": 0.04132585177284177, + "grad_norm": 18.751832344626706, + "learning_rate": 4.132506093507645e-05, + "loss": 2.4526, + "mean_token_accuracy": 0.42413793206214906, + "step": 41030 + }, + { + "epoch": 0.04133088782594595, + "grad_norm": 17.179923390174572, + "learning_rate": 4.133009689180751e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.4034482777118683, + "step": 41035 + }, + { + "epoch": 0.04133592387905012, + "grad_norm": 12.729919369511332, + "learning_rate": 4.133513284853856e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.3793103456497192, + "step": 41040 + }, + { + "epoch": 0.041340959932154295, + "grad_norm": 13.549367264662392, + "learning_rate": 4.134016880526962e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.41379310488700866, + "step": 41045 + }, + { + "epoch": 0.04134599598525847, + "grad_norm": 17.554360802521778, + "learning_rate": 4.134520476200069e-05, + "loss": 2.9433, + "mean_token_accuracy": 0.36551724672317504, + "step": 41050 + }, + { + "epoch": 0.041351032038362635, + "grad_norm": 14.40718142106458, + "learning_rate": 4.135024071873175e-05, + "loss": 2.708, + "mean_token_accuracy": 0.4034482717514038, + "step": 41055 + }, + { + "epoch": 0.04135606809146681, + "grad_norm": 17.394742010907283, + "learning_rate": 4.1355276675462807e-05, + "loss": 2.7567, + "mean_token_accuracy": 0.3793103456497192, + "step": 41060 + }, + { + "epoch": 0.04136110414457098, + "grad_norm": 24.898044976946323, + "learning_rate": 4.1360312632193866e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.3793103456497192, + "step": 41065 + }, + { + "epoch": 0.04136614019767516, + "grad_norm": 11.442759671823191, + "learning_rate": 4.1365348588924925e-05, + "loss": 2.382, + "mean_token_accuracy": 0.43103448748588563, + "step": 41070 + }, + { + "epoch": 0.04137117625077933, + "grad_norm": 16.668984143667156, + "learning_rate": 4.137038454565599e-05, + "loss": 2.2302, + "mean_token_accuracy": 0.4034482717514038, + "step": 41075 + }, + { + "epoch": 0.041376212303883504, + "grad_norm": 13.794265804700265, + "learning_rate": 4.1375420502387044e-05, + "loss": 2.4324, + "mean_token_accuracy": 0.47586206793785096, + "step": 41080 + }, + { + "epoch": 0.04138124835698768, + "grad_norm": 21.967810262894325, + "learning_rate": 4.13804564591181e-05, + "loss": 2.7261, + "mean_token_accuracy": 0.3655172407627106, + "step": 41085 + }, + { + "epoch": 0.041386284410091845, + "grad_norm": 14.290650095830253, + "learning_rate": 4.138549241584916e-05, + "loss": 2.6638, + "mean_token_accuracy": 0.358620685338974, + "step": 41090 + }, + { + "epoch": 0.04139132046319602, + "grad_norm": 17.955084166042813, + "learning_rate": 4.139052837258022e-05, + "loss": 2.6688, + "mean_token_accuracy": 0.3999999940395355, + "step": 41095 + }, + { + "epoch": 0.04139635651630019, + "grad_norm": 17.726595017148693, + "learning_rate": 4.139556432931129e-05, + "loss": 3.2398, + "mean_token_accuracy": 0.37931033968925476, + "step": 41100 + }, + { + "epoch": 0.041401392569404366, + "grad_norm": 13.803975215593738, + "learning_rate": 4.140060028604235e-05, + "loss": 2.6809, + "mean_token_accuracy": 0.41724138259887694, + "step": 41105 + }, + { + "epoch": 0.04140642862250854, + "grad_norm": 14.969865140735745, + "learning_rate": 4.1405636242773406e-05, + "loss": 2.7729, + "mean_token_accuracy": 0.3810042321681976, + "step": 41110 + }, + { + "epoch": 0.041411464675612714, + "grad_norm": 19.72395236966796, + "learning_rate": 4.1410672199504465e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.4068965554237366, + "step": 41115 + }, + { + "epoch": 0.04141650072871689, + "grad_norm": 22.165726152241525, + "learning_rate": 4.1415708156235524e-05, + "loss": 3.0441, + "mean_token_accuracy": 0.3344827562570572, + "step": 41120 + }, + { + "epoch": 0.041421536781821054, + "grad_norm": 14.276038520085468, + "learning_rate": 4.1420744112966584e-05, + "loss": 2.6462, + "mean_token_accuracy": 0.40344826579093934, + "step": 41125 + }, + { + "epoch": 0.04142657283492523, + "grad_norm": 15.227849216636812, + "learning_rate": 4.142578006969764e-05, + "loss": 2.6021, + "mean_token_accuracy": 0.3793103516101837, + "step": 41130 + }, + { + "epoch": 0.0414316088880294, + "grad_norm": 14.88492297414623, + "learning_rate": 4.14308160264287e-05, + "loss": 3.1043, + "mean_token_accuracy": 0.3068965494632721, + "step": 41135 + }, + { + "epoch": 0.041436644941133575, + "grad_norm": 18.72426752622893, + "learning_rate": 4.143585198315976e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.38620689511299133, + "step": 41140 + }, + { + "epoch": 0.04144168099423775, + "grad_norm": 13.057649633685267, + "learning_rate": 4.144088793989082e-05, + "loss": 2.9402, + "mean_token_accuracy": 0.3482758641242981, + "step": 41145 + }, + { + "epoch": 0.04144671704734192, + "grad_norm": 13.634383275664339, + "learning_rate": 4.144592389662188e-05, + "loss": 3.0117, + "mean_token_accuracy": 0.37586207389831544, + "step": 41150 + }, + { + "epoch": 0.0414517531004461, + "grad_norm": 15.3029209670836, + "learning_rate": 4.1450959853352946e-05, + "loss": 2.8006, + "mean_token_accuracy": 0.38620689511299133, + "step": 41155 + }, + { + "epoch": 0.041456789153550264, + "grad_norm": 22.389501960311364, + "learning_rate": 4.1455995810084005e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.47071990966796873, + "step": 41160 + }, + { + "epoch": 0.04146182520665444, + "grad_norm": 14.786013433363001, + "learning_rate": 4.1461031766815064e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.4310344815254211, + "step": 41165 + }, + { + "epoch": 0.04146686125975861, + "grad_norm": 18.08520394889536, + "learning_rate": 4.1466067723546124e-05, + "loss": 2.7539, + "mean_token_accuracy": 0.3827586233615875, + "step": 41170 + }, + { + "epoch": 0.041471897312862785, + "grad_norm": 14.63492886045943, + "learning_rate": 4.1471103680277176e-05, + "loss": 2.7212, + "mean_token_accuracy": 0.3965517282485962, + "step": 41175 + }, + { + "epoch": 0.04147693336596696, + "grad_norm": 15.744077436140618, + "learning_rate": 4.147613963700824e-05, + "loss": 2.6602, + "mean_token_accuracy": 0.38275861740112305, + "step": 41180 + }, + { + "epoch": 0.04148196941907113, + "grad_norm": 13.690409910973099, + "learning_rate": 4.14811755937393e-05, + "loss": 2.2618, + "mean_token_accuracy": 0.4, + "step": 41185 + }, + { + "epoch": 0.041487005472175306, + "grad_norm": 13.392073387141549, + "learning_rate": 4.148621155047036e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.449969744682312, + "step": 41190 + }, + { + "epoch": 0.04149204152527947, + "grad_norm": 13.539970388178874, + "learning_rate": 4.149124750720142e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.3689655244350433, + "step": 41195 + }, + { + "epoch": 0.04149707757838365, + "grad_norm": 13.971926197438785, + "learning_rate": 4.149628346393248e-05, + "loss": 2.6975, + "mean_token_accuracy": 0.35862069129943847, + "step": 41200 + }, + { + "epoch": 0.04150211363148782, + "grad_norm": 11.967713234288349, + "learning_rate": 4.150131942066354e-05, + "loss": 2.5316, + "mean_token_accuracy": 0.3862068891525269, + "step": 41205 + }, + { + "epoch": 0.041507149684591994, + "grad_norm": 15.188985601352249, + "learning_rate": 4.1506355377394604e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.3931034505367279, + "step": 41210 + }, + { + "epoch": 0.04151218573769617, + "grad_norm": 12.031832599974363, + "learning_rate": 4.151139133412566e-05, + "loss": 2.2701, + "mean_token_accuracy": 0.42413793206214906, + "step": 41215 + }, + { + "epoch": 0.04151722179080034, + "grad_norm": 12.277975250778423, + "learning_rate": 4.1516427290856716e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.41724138259887694, + "step": 41220 + }, + { + "epoch": 0.041522257843904516, + "grad_norm": 11.03156192235485, + "learning_rate": 4.1521463247587775e-05, + "loss": 2.3875, + "mean_token_accuracy": 0.41379310488700866, + "step": 41225 + }, + { + "epoch": 0.04152729389700868, + "grad_norm": 16.072934787809217, + "learning_rate": 4.1526499204318835e-05, + "loss": 2.7604, + "mean_token_accuracy": 0.4, + "step": 41230 + }, + { + "epoch": 0.041532329950112856, + "grad_norm": 15.122269921090608, + "learning_rate": 4.15315351610499e-05, + "loss": 2.5147, + "mean_token_accuracy": 0.4225045382976532, + "step": 41235 + }, + { + "epoch": 0.04153736600321703, + "grad_norm": 17.94022118267691, + "learning_rate": 4.153657111778096e-05, + "loss": 2.8398, + "mean_token_accuracy": 0.3620689630508423, + "step": 41240 + }, + { + "epoch": 0.041542402056321204, + "grad_norm": 18.800729543934647, + "learning_rate": 4.154160707451202e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.4206896543502808, + "step": 41245 + }, + { + "epoch": 0.04154743810942538, + "grad_norm": 19.65109828813384, + "learning_rate": 4.154664303124308e-05, + "loss": 2.9384, + "mean_token_accuracy": 0.3275862097740173, + "step": 41250 + }, + { + "epoch": 0.04155247416252955, + "grad_norm": 13.310684574871418, + "learning_rate": 4.155167898797414e-05, + "loss": 2.6128, + "mean_token_accuracy": 0.3724137842655182, + "step": 41255 + }, + { + "epoch": 0.041557510215633725, + "grad_norm": 19.43584220063741, + "learning_rate": 4.15567149447052e-05, + "loss": 2.6336, + "mean_token_accuracy": 0.45396249890327456, + "step": 41260 + }, + { + "epoch": 0.04156254626873789, + "grad_norm": 14.50507161059811, + "learning_rate": 4.1561750901436256e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.3655172407627106, + "step": 41265 + }, + { + "epoch": 0.041567582321842066, + "grad_norm": 11.650635073771703, + "learning_rate": 4.1566786858167316e-05, + "loss": 2.7806, + "mean_token_accuracy": 0.358620685338974, + "step": 41270 + }, + { + "epoch": 0.04157261837494624, + "grad_norm": 13.760952094118604, + "learning_rate": 4.1571822814898375e-05, + "loss": 2.7298, + "mean_token_accuracy": 0.38620689511299133, + "step": 41275 + }, + { + "epoch": 0.04157765442805041, + "grad_norm": 12.548975116886325, + "learning_rate": 4.1576858771629434e-05, + "loss": 2.876, + "mean_token_accuracy": 0.34137931764125823, + "step": 41280 + }, + { + "epoch": 0.04158269048115459, + "grad_norm": 13.956213457097428, + "learning_rate": 4.158189472836049e-05, + "loss": 2.24, + "mean_token_accuracy": 0.4551724135875702, + "step": 41285 + }, + { + "epoch": 0.04158772653425876, + "grad_norm": 49.69084829804417, + "learning_rate": 4.158693068509156e-05, + "loss": 3.1969, + "mean_token_accuracy": 0.3655172437429428, + "step": 41290 + }, + { + "epoch": 0.041592762587362934, + "grad_norm": 13.981379815087218, + "learning_rate": 4.159196664182262e-05, + "loss": 2.3562, + "mean_token_accuracy": 0.44482757449150084, + "step": 41295 + }, + { + "epoch": 0.0415977986404671, + "grad_norm": 15.403457467624529, + "learning_rate": 4.159700259855368e-05, + "loss": 2.9595, + "mean_token_accuracy": 0.3724138021469116, + "step": 41300 + }, + { + "epoch": 0.041602834693571275, + "grad_norm": 12.431090054433014, + "learning_rate": 4.160203855528473e-05, + "loss": 2.8006, + "mean_token_accuracy": 0.3310344874858856, + "step": 41305 + }, + { + "epoch": 0.04160787074667545, + "grad_norm": 13.18110872472664, + "learning_rate": 4.160707451201579e-05, + "loss": 2.771, + "mean_token_accuracy": 0.35862069129943847, + "step": 41310 + }, + { + "epoch": 0.04161290679977962, + "grad_norm": 14.861732553323238, + "learning_rate": 4.1612110468746856e-05, + "loss": 3.1349, + "mean_token_accuracy": 0.33448275923728943, + "step": 41315 + }, + { + "epoch": 0.041617942852883796, + "grad_norm": 10.614103874888148, + "learning_rate": 4.1617146425477915e-05, + "loss": 2.3604, + "mean_token_accuracy": 0.42758620381355283, + "step": 41320 + }, + { + "epoch": 0.04162297890598797, + "grad_norm": 18.579272521539604, + "learning_rate": 4.1622182382208974e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.46067755222320556, + "step": 41325 + }, + { + "epoch": 0.041628014959092144, + "grad_norm": 12.405900187033808, + "learning_rate": 4.162721833894003e-05, + "loss": 2.3266, + "mean_token_accuracy": 0.42758620977401735, + "step": 41330 + }, + { + "epoch": 0.04163305101219631, + "grad_norm": 13.23086755774929, + "learning_rate": 4.163225429567109e-05, + "loss": 2.9464, + "mean_token_accuracy": 0.36896551847457887, + "step": 41335 + }, + { + "epoch": 0.041638087065300484, + "grad_norm": 14.364062007184435, + "learning_rate": 4.163729025240216e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.33793103098869326, + "step": 41340 + }, + { + "epoch": 0.04164312311840466, + "grad_norm": 13.968180786689885, + "learning_rate": 4.164232620913321e-05, + "loss": 2.535, + "mean_token_accuracy": 0.44827587008476255, + "step": 41345 + }, + { + "epoch": 0.04164815917150883, + "grad_norm": 12.684564198780544, + "learning_rate": 4.164736216586427e-05, + "loss": 2.7156, + "mean_token_accuracy": 0.4034482717514038, + "step": 41350 + }, + { + "epoch": 0.041653195224613006, + "grad_norm": 14.57429857730503, + "learning_rate": 4.165239812259533e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.41034482717514037, + "step": 41355 + }, + { + "epoch": 0.04165823127771718, + "grad_norm": 14.179740373168228, + "learning_rate": 4.165743407932639e-05, + "loss": 3.038, + "mean_token_accuracy": 0.39165154099464417, + "step": 41360 + }, + { + "epoch": 0.04166326733082135, + "grad_norm": 14.747401643973403, + "learning_rate": 4.1662470036057455e-05, + "loss": 2.7026, + "mean_token_accuracy": 0.36551723480224607, + "step": 41365 + }, + { + "epoch": 0.04166830338392552, + "grad_norm": 12.824293556730922, + "learning_rate": 4.1667505992788514e-05, + "loss": 2.5718, + "mean_token_accuracy": 0.40145190358161925, + "step": 41370 + }, + { + "epoch": 0.041673339437029694, + "grad_norm": 16.46905648852369, + "learning_rate": 4.167254194951957e-05, + "loss": 2.6636, + "mean_token_accuracy": 0.4206896543502808, + "step": 41375 + }, + { + "epoch": 0.04167837549013387, + "grad_norm": 14.382500247906147, + "learning_rate": 4.167757790625063e-05, + "loss": 2.8107, + "mean_token_accuracy": 0.32413792610168457, + "step": 41380 + }, + { + "epoch": 0.04168341154323804, + "grad_norm": 11.964911376859925, + "learning_rate": 4.168261386298169e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.38965516686439516, + "step": 41385 + }, + { + "epoch": 0.041688447596342215, + "grad_norm": 15.359449529815512, + "learning_rate": 4.168764981971275e-05, + "loss": 2.5499, + "mean_token_accuracy": 0.37931033968925476, + "step": 41390 + }, + { + "epoch": 0.04169348364944639, + "grad_norm": 18.04229883972483, + "learning_rate": 4.169268577644381e-05, + "loss": 2.7145, + "mean_token_accuracy": 0.39310344457626345, + "step": 41395 + }, + { + "epoch": 0.04169851970255056, + "grad_norm": 20.652033830011202, + "learning_rate": 4.169772173317487e-05, + "loss": 2.8569, + "mean_token_accuracy": 0.36896551251411436, + "step": 41400 + }, + { + "epoch": 0.04170355575565473, + "grad_norm": 17.441893075424723, + "learning_rate": 4.170275768990593e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.37241379022598264, + "step": 41405 + }, + { + "epoch": 0.0417085918087589, + "grad_norm": 13.76016651083468, + "learning_rate": 4.170779364663699e-05, + "loss": 2.749, + "mean_token_accuracy": 0.3241379350423813, + "step": 41410 + }, + { + "epoch": 0.04171362786186308, + "grad_norm": 12.151249614705563, + "learning_rate": 4.171282960336805e-05, + "loss": 2.2827, + "mean_token_accuracy": 0.4517241418361664, + "step": 41415 + }, + { + "epoch": 0.04171866391496725, + "grad_norm": 15.591512579545203, + "learning_rate": 4.1717865560099113e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.4068965554237366, + "step": 41420 + }, + { + "epoch": 0.041723699968071425, + "grad_norm": 13.297030108195143, + "learning_rate": 4.172290151683017e-05, + "loss": 2.2595, + "mean_token_accuracy": 0.4413793087005615, + "step": 41425 + }, + { + "epoch": 0.0417287360211756, + "grad_norm": 14.025324888402166, + "learning_rate": 4.172793747356123e-05, + "loss": 2.2977, + "mean_token_accuracy": 0.4568663060665131, + "step": 41430 + }, + { + "epoch": 0.04173377207427977, + "grad_norm": 20.01779592351257, + "learning_rate": 4.173297343029229e-05, + "loss": 2.8114, + "mean_token_accuracy": 0.36551723480224607, + "step": 41435 + }, + { + "epoch": 0.04173880812738394, + "grad_norm": 13.91000361551655, + "learning_rate": 4.1738009387023344e-05, + "loss": 2.4125, + "mean_token_accuracy": 0.4229280173778534, + "step": 41440 + }, + { + "epoch": 0.04174384418048811, + "grad_norm": 15.754327259476941, + "learning_rate": 4.174304534375441e-05, + "loss": 2.6701, + "mean_token_accuracy": 0.3689655065536499, + "step": 41445 + }, + { + "epoch": 0.041748880233592287, + "grad_norm": 10.288310613628173, + "learning_rate": 4.174808130048547e-05, + "loss": 2.248, + "mean_token_accuracy": 0.4551724135875702, + "step": 41450 + }, + { + "epoch": 0.04175391628669646, + "grad_norm": 17.746416110538224, + "learning_rate": 4.175311725721653e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.36896551251411436, + "step": 41455 + }, + { + "epoch": 0.041758952339800634, + "grad_norm": 13.815651047743707, + "learning_rate": 4.175815321394759e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.4, + "step": 41460 + }, + { + "epoch": 0.04176398839290481, + "grad_norm": 13.006045671977251, + "learning_rate": 4.176318917067865e-05, + "loss": 2.1947, + "mean_token_accuracy": 0.4448275864124298, + "step": 41465 + }, + { + "epoch": 0.04176902444600898, + "grad_norm": 11.308526439456026, + "learning_rate": 4.1768225127409706e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.41379310488700866, + "step": 41470 + }, + { + "epoch": 0.04177406049911315, + "grad_norm": 16.024539077379107, + "learning_rate": 4.177326108414077e-05, + "loss": 2.423, + "mean_token_accuracy": 0.4310344815254211, + "step": 41475 + }, + { + "epoch": 0.04177909655221732, + "grad_norm": 13.203006591749823, + "learning_rate": 4.1778297040871824e-05, + "loss": 2.6127, + "mean_token_accuracy": 0.41034482717514037, + "step": 41480 + }, + { + "epoch": 0.041784132605321496, + "grad_norm": 14.681668284189819, + "learning_rate": 4.1783332997602884e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.4310344815254211, + "step": 41485 + }, + { + "epoch": 0.04178916865842567, + "grad_norm": 13.893456610697903, + "learning_rate": 4.178836895433394e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.47447065711021424, + "step": 41490 + }, + { + "epoch": 0.04179420471152984, + "grad_norm": 11.891801505877606, + "learning_rate": 4.1793404911065e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.4551724135875702, + "step": 41495 + }, + { + "epoch": 0.04179924076463402, + "grad_norm": 13.220660100746214, + "learning_rate": 4.179844086779607e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.3862069010734558, + "step": 41500 + }, + { + "epoch": 0.04180427681773819, + "grad_norm": 13.73682151540009, + "learning_rate": 4.180347682452713e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.4344827651977539, + "step": 41505 + }, + { + "epoch": 0.04180931287084236, + "grad_norm": 12.835747085722033, + "learning_rate": 4.180851278125819e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.417241370677948, + "step": 41510 + }, + { + "epoch": 0.04181434892394653, + "grad_norm": 12.754565771149778, + "learning_rate": 4.1813548737989246e-05, + "loss": 2.3413, + "mean_token_accuracy": 0.4620689630508423, + "step": 41515 + }, + { + "epoch": 0.041819384977050705, + "grad_norm": 15.138618969940085, + "learning_rate": 4.1818584694720305e-05, + "loss": 2.606, + "mean_token_accuracy": 0.4, + "step": 41520 + }, + { + "epoch": 0.04182442103015488, + "grad_norm": 11.785215921888069, + "learning_rate": 4.1823620651451365e-05, + "loss": 2.7353, + "mean_token_accuracy": 0.3999999940395355, + "step": 41525 + }, + { + "epoch": 0.04182945708325905, + "grad_norm": 14.106720060299784, + "learning_rate": 4.1828656608182424e-05, + "loss": 2.8909, + "mean_token_accuracy": 0.3896551728248596, + "step": 41530 + }, + { + "epoch": 0.04183449313636323, + "grad_norm": 11.823814028080365, + "learning_rate": 4.183369256491348e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.3482758551836014, + "step": 41535 + }, + { + "epoch": 0.0418395291894674, + "grad_norm": 16.316010495530996, + "learning_rate": 4.183872852164454e-05, + "loss": 2.7488, + "mean_token_accuracy": 0.3931034505367279, + "step": 41540 + }, + { + "epoch": 0.04184456524257157, + "grad_norm": 12.188700100630532, + "learning_rate": 4.18437644783756e-05, + "loss": 2.5227, + "mean_token_accuracy": 0.42413792610168455, + "step": 41545 + }, + { + "epoch": 0.04184960129567574, + "grad_norm": 24.860162493224838, + "learning_rate": 4.184880043510666e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.38620689511299133, + "step": 41550 + }, + { + "epoch": 0.041854637348779915, + "grad_norm": 13.782909705436278, + "learning_rate": 4.185383639183773e-05, + "loss": 2.7235, + "mean_token_accuracy": 0.3896551728248596, + "step": 41555 + }, + { + "epoch": 0.04185967340188409, + "grad_norm": 12.80075285186288, + "learning_rate": 4.1858872348568786e-05, + "loss": 2.6448, + "mean_token_accuracy": 0.42758620381355283, + "step": 41560 + }, + { + "epoch": 0.04186470945498826, + "grad_norm": 11.58268670858098, + "learning_rate": 4.1863908305299845e-05, + "loss": 2.4803, + "mean_token_accuracy": 0.44137930274009707, + "step": 41565 + }, + { + "epoch": 0.041869745508092436, + "grad_norm": 18.895592544981422, + "learning_rate": 4.1868944262030905e-05, + "loss": 2.9573, + "mean_token_accuracy": 0.3344827562570572, + "step": 41570 + }, + { + "epoch": 0.04187478156119661, + "grad_norm": 12.4378210710087, + "learning_rate": 4.187398021876196e-05, + "loss": 2.6072, + "mean_token_accuracy": 0.4, + "step": 41575 + }, + { + "epoch": 0.04187981761430078, + "grad_norm": 13.857686376418835, + "learning_rate": 4.187901617549302e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.44827585816383364, + "step": 41580 + }, + { + "epoch": 0.04188485366740495, + "grad_norm": 18.692672368729866, + "learning_rate": 4.188405213222408e-05, + "loss": 2.9586, + "mean_token_accuracy": 0.32758620381355286, + "step": 41585 + }, + { + "epoch": 0.041889889720509124, + "grad_norm": 10.414950504374703, + "learning_rate": 4.188908808895514e-05, + "loss": 1.9589, + "mean_token_accuracy": 0.49999999403953554, + "step": 41590 + }, + { + "epoch": 0.0418949257736133, + "grad_norm": 13.90200027800352, + "learning_rate": 4.18941240456862e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.4620689570903778, + "step": 41595 + }, + { + "epoch": 0.04189996182671747, + "grad_norm": 12.058743006538208, + "learning_rate": 4.189916000241726e-05, + "loss": 2.9766, + "mean_token_accuracy": 0.3344827562570572, + "step": 41600 + }, + { + "epoch": 0.041904997879821645, + "grad_norm": 27.594381144469853, + "learning_rate": 4.1904195959148326e-05, + "loss": 2.8017, + "mean_token_accuracy": 0.35862069129943847, + "step": 41605 + }, + { + "epoch": 0.04191003393292582, + "grad_norm": 14.781095319543887, + "learning_rate": 4.1909231915879385e-05, + "loss": 2.6863, + "mean_token_accuracy": 0.3655172407627106, + "step": 41610 + }, + { + "epoch": 0.041915069986029986, + "grad_norm": 12.316890887684048, + "learning_rate": 4.191426787261044e-05, + "loss": 2.1493, + "mean_token_accuracy": 0.4551724076271057, + "step": 41615 + }, + { + "epoch": 0.04192010603913416, + "grad_norm": 12.57431801081382, + "learning_rate": 4.19193038293415e-05, + "loss": 2.7196, + "mean_token_accuracy": 0.41034482717514037, + "step": 41620 + }, + { + "epoch": 0.041925142092238334, + "grad_norm": 12.402311147187143, + "learning_rate": 4.1924339786072556e-05, + "loss": 2.9017, + "mean_token_accuracy": 0.38100423812866213, + "step": 41625 + }, + { + "epoch": 0.04193017814534251, + "grad_norm": 16.25893999664443, + "learning_rate": 4.1929375742803616e-05, + "loss": 2.7347, + "mean_token_accuracy": 0.4310344815254211, + "step": 41630 + }, + { + "epoch": 0.04193521419844668, + "grad_norm": 16.56348751435573, + "learning_rate": 4.193441169953468e-05, + "loss": 2.2631, + "mean_token_accuracy": 0.4206896543502808, + "step": 41635 + }, + { + "epoch": 0.041940250251550855, + "grad_norm": 15.226048619316597, + "learning_rate": 4.193944765626574e-05, + "loss": 2.7178, + "mean_token_accuracy": 0.37931033968925476, + "step": 41640 + }, + { + "epoch": 0.04194528630465503, + "grad_norm": 16.141172199150358, + "learning_rate": 4.19444836129968e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.4172413766384125, + "step": 41645 + }, + { + "epoch": 0.041950322357759195, + "grad_norm": 14.289712125110631, + "learning_rate": 4.194951956972786e-05, + "loss": 2.6165, + "mean_token_accuracy": 0.3705989122390747, + "step": 41650 + }, + { + "epoch": 0.04195535841086337, + "grad_norm": 16.561235495062853, + "learning_rate": 4.195455552645892e-05, + "loss": 3.0758, + "mean_token_accuracy": 0.36206896901130675, + "step": 41655 + }, + { + "epoch": 0.04196039446396754, + "grad_norm": 14.517579475588162, + "learning_rate": 4.195959148318998e-05, + "loss": 2.6308, + "mean_token_accuracy": 0.38620689511299133, + "step": 41660 + }, + { + "epoch": 0.04196543051707172, + "grad_norm": 12.546968940414992, + "learning_rate": 4.196462743992104e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.41034482717514037, + "step": 41665 + }, + { + "epoch": 0.04197046657017589, + "grad_norm": 19.664358909350923, + "learning_rate": 4.1969663396652096e-05, + "loss": 2.7112, + "mean_token_accuracy": 0.3965517282485962, + "step": 41670 + }, + { + "epoch": 0.041975502623280064, + "grad_norm": 12.196021361296129, + "learning_rate": 4.1974699353383156e-05, + "loss": 2.7838, + "mean_token_accuracy": 0.3981246203184128, + "step": 41675 + }, + { + "epoch": 0.04198053867638424, + "grad_norm": 12.196797249522211, + "learning_rate": 4.1979735310114215e-05, + "loss": 2.6309, + "mean_token_accuracy": 0.41379310488700866, + "step": 41680 + }, + { + "epoch": 0.041985574729488405, + "grad_norm": 11.845231325497963, + "learning_rate": 4.198477126684528e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.34137930572032926, + "step": 41685 + }, + { + "epoch": 0.04199061078259258, + "grad_norm": 14.52205529075529, + "learning_rate": 4.198980722357634e-05, + "loss": 2.6205, + "mean_token_accuracy": 0.4, + "step": 41690 + }, + { + "epoch": 0.04199564683569675, + "grad_norm": 13.986418280413199, + "learning_rate": 4.19948431803074e-05, + "loss": 2.6281, + "mean_token_accuracy": 0.37586206793785093, + "step": 41695 + }, + { + "epoch": 0.042000682888800926, + "grad_norm": 21.69263993191705, + "learning_rate": 4.199987913703846e-05, + "loss": 2.7697, + "mean_token_accuracy": 0.4486453115940094, + "step": 41700 + }, + { + "epoch": 0.0420057189419051, + "grad_norm": 11.899754992699709, + "learning_rate": 4.200491509376952e-05, + "loss": 2.242, + "mean_token_accuracy": 0.47931034564971925, + "step": 41705 + }, + { + "epoch": 0.042010754995009274, + "grad_norm": 14.197205453966133, + "learning_rate": 4.200995105050058e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.4000000059604645, + "step": 41710 + }, + { + "epoch": 0.04201579104811345, + "grad_norm": 14.319816827391895, + "learning_rate": 4.2014987007231636e-05, + "loss": 2.7133, + "mean_token_accuracy": 0.42413792908191683, + "step": 41715 + }, + { + "epoch": 0.042020827101217614, + "grad_norm": 18.49433381223356, + "learning_rate": 4.2020022963962696e-05, + "loss": 3.3711, + "mean_token_accuracy": 0.36521475911140444, + "step": 41720 + }, + { + "epoch": 0.04202586315432179, + "grad_norm": 12.832568538903356, + "learning_rate": 4.2025058920693755e-05, + "loss": 2.4101, + "mean_token_accuracy": 0.39310344457626345, + "step": 41725 + }, + { + "epoch": 0.04203089920742596, + "grad_norm": 19.72543660561526, + "learning_rate": 4.2030094877424814e-05, + "loss": 2.3535, + "mean_token_accuracy": 0.46896551847457885, + "step": 41730 + }, + { + "epoch": 0.042035935260530136, + "grad_norm": 12.58584181900052, + "learning_rate": 4.2035130834155873e-05, + "loss": 2.8308, + "mean_token_accuracy": 0.41034482717514037, + "step": 41735 + }, + { + "epoch": 0.04204097131363431, + "grad_norm": 16.45054280482935, + "learning_rate": 4.204016679088694e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.42413792610168455, + "step": 41740 + }, + { + "epoch": 0.04204600736673848, + "grad_norm": 19.811649460175378, + "learning_rate": 4.2045202747618e-05, + "loss": 2.8868, + "mean_token_accuracy": 0.4, + "step": 41745 + }, + { + "epoch": 0.04205104341984266, + "grad_norm": 13.869608553108694, + "learning_rate": 4.205023870434905e-05, + "loss": 2.5202, + "mean_token_accuracy": 0.39310344457626345, + "step": 41750 + }, + { + "epoch": 0.042056079472946824, + "grad_norm": 14.798758324402543, + "learning_rate": 4.205527466108011e-05, + "loss": 2.6703, + "mean_token_accuracy": 0.37586206793785093, + "step": 41755 + }, + { + "epoch": 0.042061115526051, + "grad_norm": 14.977398058464582, + "learning_rate": 4.206031061781117e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.3793103456497192, + "step": 41760 + }, + { + "epoch": 0.04206615157915517, + "grad_norm": 14.211552459282368, + "learning_rate": 4.2065346574542236e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.44652147889137267, + "step": 41765 + }, + { + "epoch": 0.042071187632259345, + "grad_norm": 12.17146546151082, + "learning_rate": 4.2070382531273295e-05, + "loss": 2.3536, + "mean_token_accuracy": 0.43236538767814636, + "step": 41770 + }, + { + "epoch": 0.04207622368536352, + "grad_norm": 16.863958240925193, + "learning_rate": 4.2075418488004354e-05, + "loss": 2.7531, + "mean_token_accuracy": 0.37586207389831544, + "step": 41775 + }, + { + "epoch": 0.04208125973846769, + "grad_norm": 13.510037655146801, + "learning_rate": 4.2080454444735414e-05, + "loss": 2.5447, + "mean_token_accuracy": 0.4137930989265442, + "step": 41780 + }, + { + "epoch": 0.042086295791571866, + "grad_norm": 14.871698933290757, + "learning_rate": 4.208549040146647e-05, + "loss": 2.8904, + "mean_token_accuracy": 0.3793103486299515, + "step": 41785 + }, + { + "epoch": 0.04209133184467603, + "grad_norm": 12.249409337383172, + "learning_rate": 4.209052635819753e-05, + "loss": 2.3642, + "mean_token_accuracy": 0.3965517282485962, + "step": 41790 + }, + { + "epoch": 0.04209636789778021, + "grad_norm": 14.484971940424183, + "learning_rate": 4.209556231492859e-05, + "loss": 2.5277, + "mean_token_accuracy": 0.41379311084747317, + "step": 41795 + }, + { + "epoch": 0.04210140395088438, + "grad_norm": 18.505288290979788, + "learning_rate": 4.210059827165965e-05, + "loss": 2.8781, + "mean_token_accuracy": 0.41724138259887694, + "step": 41800 + }, + { + "epoch": 0.042106440003988554, + "grad_norm": 10.981973479120942, + "learning_rate": 4.210563422839071e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.4833743929862976, + "step": 41805 + }, + { + "epoch": 0.04211147605709273, + "grad_norm": 13.128831235684228, + "learning_rate": 4.211067018512177e-05, + "loss": 2.4139, + "mean_token_accuracy": 0.4551724076271057, + "step": 41810 + }, + { + "epoch": 0.0421165121101969, + "grad_norm": 13.848885356014558, + "learning_rate": 4.211570614185283e-05, + "loss": 2.3585, + "mean_token_accuracy": 0.42758620977401735, + "step": 41815 + }, + { + "epoch": 0.042121548163301076, + "grad_norm": 15.27973561704685, + "learning_rate": 4.2120742098583894e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.3896551638841629, + "step": 41820 + }, + { + "epoch": 0.04212658421640524, + "grad_norm": 18.232381575854937, + "learning_rate": 4.2125778055314954e-05, + "loss": 2.5992, + "mean_token_accuracy": 0.4034482717514038, + "step": 41825 + }, + { + "epoch": 0.042131620269509416, + "grad_norm": 35.92981112914529, + "learning_rate": 4.213081401204601e-05, + "loss": 3.2475, + "mean_token_accuracy": 0.3068965464830399, + "step": 41830 + }, + { + "epoch": 0.04213665632261359, + "grad_norm": 19.293098380789058, + "learning_rate": 4.213584996877707e-05, + "loss": 2.7086, + "mean_token_accuracy": 0.41379310488700866, + "step": 41835 + }, + { + "epoch": 0.042141692375717764, + "grad_norm": 12.292907334985212, + "learning_rate": 4.2140885925508125e-05, + "loss": 3.001, + "mean_token_accuracy": 0.3633393824100494, + "step": 41840 + }, + { + "epoch": 0.04214672842882194, + "grad_norm": 11.603784730699676, + "learning_rate": 4.214592188223919e-05, + "loss": 2.5251, + "mean_token_accuracy": 0.39854809641838074, + "step": 41845 + }, + { + "epoch": 0.04215176448192611, + "grad_norm": 10.853543406412781, + "learning_rate": 4.215095783897025e-05, + "loss": 2.4223, + "mean_token_accuracy": 0.3827586233615875, + "step": 41850 + }, + { + "epoch": 0.042156800535030285, + "grad_norm": 12.300983182140644, + "learning_rate": 4.215599379570131e-05, + "loss": 2.1721, + "mean_token_accuracy": 0.4034482717514038, + "step": 41855 + }, + { + "epoch": 0.04216183658813445, + "grad_norm": 13.37102439527431, + "learning_rate": 4.216102975243237e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.43103447556495667, + "step": 41860 + }, + { + "epoch": 0.042166872641238626, + "grad_norm": 15.324667298293514, + "learning_rate": 4.216606570916343e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.4068965494632721, + "step": 41865 + }, + { + "epoch": 0.0421719086943428, + "grad_norm": 37.98088179474751, + "learning_rate": 4.2171101665894494e-05, + "loss": 2.6729, + "mean_token_accuracy": 0.3620689630508423, + "step": 41870 + }, + { + "epoch": 0.04217694474744697, + "grad_norm": 12.068133269757723, + "learning_rate": 4.217613762262555e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.4344827592372894, + "step": 41875 + }, + { + "epoch": 0.04218198080055115, + "grad_norm": 15.76145118699871, + "learning_rate": 4.2181173579356605e-05, + "loss": 2.4281, + "mean_token_accuracy": 0.42758620977401735, + "step": 41880 + }, + { + "epoch": 0.04218701685365532, + "grad_norm": 14.910335477221567, + "learning_rate": 4.2186209536087665e-05, + "loss": 2.7649, + "mean_token_accuracy": 0.3758620709180832, + "step": 41885 + }, + { + "epoch": 0.042192052906759495, + "grad_norm": 11.81527555509976, + "learning_rate": 4.2191245492818724e-05, + "loss": 2.5669, + "mean_token_accuracy": 0.4103448301553726, + "step": 41890 + }, + { + "epoch": 0.04219708895986366, + "grad_norm": 12.728966719523346, + "learning_rate": 4.219628144954978e-05, + "loss": 2.6018, + "mean_token_accuracy": 0.3482758581638336, + "step": 41895 + }, + { + "epoch": 0.042202125012967835, + "grad_norm": 12.883633091968685, + "learning_rate": 4.220131740628085e-05, + "loss": 2.415, + "mean_token_accuracy": 0.42413792610168455, + "step": 41900 + }, + { + "epoch": 0.04220716106607201, + "grad_norm": 14.008719081821134, + "learning_rate": 4.220635336301191e-05, + "loss": 2.6938, + "mean_token_accuracy": 0.34137930572032926, + "step": 41905 + }, + { + "epoch": 0.04221219711917618, + "grad_norm": 12.8733060908149, + "learning_rate": 4.221138931974297e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.42068964838981626, + "step": 41910 + }, + { + "epoch": 0.042217233172280356, + "grad_norm": 14.369915547441158, + "learning_rate": 4.221642527647403e-05, + "loss": 2.6673, + "mean_token_accuracy": 0.3517241358757019, + "step": 41915 + }, + { + "epoch": 0.04222226922538453, + "grad_norm": 29.377598118090567, + "learning_rate": 4.2221461233205086e-05, + "loss": 3.3392, + "mean_token_accuracy": 0.31034482419490816, + "step": 41920 + }, + { + "epoch": 0.042227305278488704, + "grad_norm": 13.031121699384238, + "learning_rate": 4.2226497189936145e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.4000000059604645, + "step": 41925 + }, + { + "epoch": 0.04223234133159287, + "grad_norm": 12.569187069862734, + "learning_rate": 4.2231533146667205e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.4379310250282288, + "step": 41930 + }, + { + "epoch": 0.042237377384697045, + "grad_norm": 11.713053192207543, + "learning_rate": 4.2236569103398264e-05, + "loss": 2.4914, + "mean_token_accuracy": 0.3620689630508423, + "step": 41935 + }, + { + "epoch": 0.04224241343780122, + "grad_norm": 11.885212434655756, + "learning_rate": 4.224160506012932e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.3793103456497192, + "step": 41940 + }, + { + "epoch": 0.04224744949090539, + "grad_norm": 11.623040491163774, + "learning_rate": 4.224664101686038e-05, + "loss": 2.6083, + "mean_token_accuracy": 0.4054446518421173, + "step": 41945 + }, + { + "epoch": 0.042252485544009566, + "grad_norm": 13.828990567441382, + "learning_rate": 4.225167697359145e-05, + "loss": 2.178, + "mean_token_accuracy": 0.4517241358757019, + "step": 41950 + }, + { + "epoch": 0.04225752159711374, + "grad_norm": 13.72644657704056, + "learning_rate": 4.225671293032251e-05, + "loss": 2.8159, + "mean_token_accuracy": 0.38275861740112305, + "step": 41955 + }, + { + "epoch": 0.04226255765021791, + "grad_norm": 15.187757632517355, + "learning_rate": 4.226174888705357e-05, + "loss": 2.9138, + "mean_token_accuracy": 0.34137930870056155, + "step": 41960 + }, + { + "epoch": 0.04226759370332208, + "grad_norm": 12.413715368462153, + "learning_rate": 4.2266784843784626e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.43793103098869324, + "step": 41965 + }, + { + "epoch": 0.042272629756426254, + "grad_norm": 13.778074047430437, + "learning_rate": 4.2271820800515685e-05, + "loss": 2.8406, + "mean_token_accuracy": 0.32758620381355286, + "step": 41970 + }, + { + "epoch": 0.04227766580953043, + "grad_norm": 15.537898966533053, + "learning_rate": 4.227685675724674e-05, + "loss": 3.0642, + "mean_token_accuracy": 0.3565033197402954, + "step": 41975 + }, + { + "epoch": 0.0422827018626346, + "grad_norm": 13.185269839919933, + "learning_rate": 4.2281892713977804e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.3965517163276672, + "step": 41980 + }, + { + "epoch": 0.042287737915738775, + "grad_norm": 12.420004477453237, + "learning_rate": 4.228692867070886e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.45862067937850953, + "step": 41985 + }, + { + "epoch": 0.04229277396884295, + "grad_norm": 17.975204420268977, + "learning_rate": 4.229196462743992e-05, + "loss": 2.9945, + "mean_token_accuracy": 0.38747731447219846, + "step": 41990 + }, + { + "epoch": 0.04229781002194712, + "grad_norm": 11.952792915868843, + "learning_rate": 4.229700058417098e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.43793101906776427, + "step": 41995 + }, + { + "epoch": 0.04230284607505129, + "grad_norm": 12.832417725715763, + "learning_rate": 4.230203654090204e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.42758620977401735, + "step": 42000 + }, + { + "epoch": 0.04230788212815546, + "grad_norm": 14.326387972889306, + "learning_rate": 4.230707249763311e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.34482758343219755, + "step": 42005 + }, + { + "epoch": 0.04231291818125964, + "grad_norm": 14.617655120901565, + "learning_rate": 4.2312108454364166e-05, + "loss": 2.4867, + "mean_token_accuracy": 0.42758620381355283, + "step": 42010 + }, + { + "epoch": 0.04231795423436381, + "grad_norm": 13.418183133987174, + "learning_rate": 4.231714441109522e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.38275861740112305, + "step": 42015 + }, + { + "epoch": 0.042322990287467985, + "grad_norm": 19.639909320233777, + "learning_rate": 4.232218036782628e-05, + "loss": 2.529, + "mean_token_accuracy": 0.34137930870056155, + "step": 42020 + }, + { + "epoch": 0.04232802634057216, + "grad_norm": 13.374467595890817, + "learning_rate": 4.232721632455734e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.4034482777118683, + "step": 42025 + }, + { + "epoch": 0.04233306239367633, + "grad_norm": 12.143299335538108, + "learning_rate": 4.23322522812884e-05, + "loss": 2.1599, + "mean_token_accuracy": 0.43980641961097716, + "step": 42030 + }, + { + "epoch": 0.0423380984467805, + "grad_norm": 17.61451381729175, + "learning_rate": 4.233728823801946e-05, + "loss": 2.6257, + "mean_token_accuracy": 0.3724137932062149, + "step": 42035 + }, + { + "epoch": 0.04234313449988467, + "grad_norm": 13.847306709784116, + "learning_rate": 4.234232419475052e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.4068965554237366, + "step": 42040 + }, + { + "epoch": 0.04234817055298885, + "grad_norm": 20.84006487057853, + "learning_rate": 4.234736015148158e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.38620689511299133, + "step": 42045 + }, + { + "epoch": 0.04235320660609302, + "grad_norm": 12.870407741873784, + "learning_rate": 4.235239610821264e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.37241379022598264, + "step": 42050 + }, + { + "epoch": 0.042358242659197194, + "grad_norm": 14.138877184780087, + "learning_rate": 4.23574320649437e-05, + "loss": 3.1251, + "mean_token_accuracy": 0.3482758641242981, + "step": 42055 + }, + { + "epoch": 0.04236327871230137, + "grad_norm": 25.720919805583133, + "learning_rate": 4.236246802167476e-05, + "loss": 2.9327, + "mean_token_accuracy": 0.3980036199092865, + "step": 42060 + }, + { + "epoch": 0.04236831476540554, + "grad_norm": 9.206785587147353, + "learning_rate": 4.236750397840582e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.43448275327682495, + "step": 42065 + }, + { + "epoch": 0.04237335081850971, + "grad_norm": 18.975102113100423, + "learning_rate": 4.237253993513688e-05, + "loss": 2.5569, + "mean_token_accuracy": 0.42643678188323975, + "step": 42070 + }, + { + "epoch": 0.04237838687161388, + "grad_norm": 13.395251209303156, + "learning_rate": 4.2377575891867937e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.3896551728248596, + "step": 42075 + }, + { + "epoch": 0.042383422924718056, + "grad_norm": 15.599159847855095, + "learning_rate": 4.2382611848598996e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.3931034505367279, + "step": 42080 + }, + { + "epoch": 0.04238845897782223, + "grad_norm": 17.88762643063267, + "learning_rate": 4.238764780533006e-05, + "loss": 2.9442, + "mean_token_accuracy": 0.37241379022598264, + "step": 42085 + }, + { + "epoch": 0.042393495030926404, + "grad_norm": 18.754705378775704, + "learning_rate": 4.239268376206112e-05, + "loss": 3.0249, + "mean_token_accuracy": 0.36896551847457887, + "step": 42090 + }, + { + "epoch": 0.04239853108403058, + "grad_norm": 11.573587665864922, + "learning_rate": 4.239771971879218e-05, + "loss": 2.5534, + "mean_token_accuracy": 0.42413792610168455, + "step": 42095 + }, + { + "epoch": 0.04240356713713475, + "grad_norm": 18.166801202055836, + "learning_rate": 4.240275567552324e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.44349666833877566, + "step": 42100 + }, + { + "epoch": 0.04240860319023892, + "grad_norm": 18.041575724305037, + "learning_rate": 4.24077916322543e-05, + "loss": 2.8118, + "mean_token_accuracy": 0.3517241358757019, + "step": 42105 + }, + { + "epoch": 0.04241363924334309, + "grad_norm": 17.34579171263323, + "learning_rate": 4.241282758898536e-05, + "loss": 2.5329, + "mean_token_accuracy": 0.3931034505367279, + "step": 42110 + }, + { + "epoch": 0.042418675296447265, + "grad_norm": 11.330988321707263, + "learning_rate": 4.241786354571642e-05, + "loss": 2.2997, + "mean_token_accuracy": 0.4607380449771881, + "step": 42115 + }, + { + "epoch": 0.04242371134955144, + "grad_norm": 14.075020274599758, + "learning_rate": 4.2422899502447477e-05, + "loss": 2.7019, + "mean_token_accuracy": 0.42758620381355283, + "step": 42120 + }, + { + "epoch": 0.04242874740265561, + "grad_norm": 15.0719883830294, + "learning_rate": 4.2427935459178536e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.43448275327682495, + "step": 42125 + }, + { + "epoch": 0.04243378345575979, + "grad_norm": 15.596669861885054, + "learning_rate": 4.2432971415909595e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.3827586233615875, + "step": 42130 + }, + { + "epoch": 0.042438819508863954, + "grad_norm": 13.45930697461373, + "learning_rate": 4.243800737264066e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.48457350730896, + "step": 42135 + }, + { + "epoch": 0.04244385556196813, + "grad_norm": 12.928044486619804, + "learning_rate": 4.244304332937172e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.4034482777118683, + "step": 42140 + }, + { + "epoch": 0.0424488916150723, + "grad_norm": 18.302949803594974, + "learning_rate": 4.244807928610278e-05, + "loss": 2.348, + "mean_token_accuracy": 0.46551724076271056, + "step": 42145 + }, + { + "epoch": 0.042453927668176475, + "grad_norm": 14.097152360466584, + "learning_rate": 4.245311524283383e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.3599515974521637, + "step": 42150 + }, + { + "epoch": 0.04245896372128065, + "grad_norm": 15.873745275098289, + "learning_rate": 4.245815119956489e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.4034482777118683, + "step": 42155 + }, + { + "epoch": 0.04246399977438482, + "grad_norm": 12.642225767181579, + "learning_rate": 4.246318715629595e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.4068965494632721, + "step": 42160 + }, + { + "epoch": 0.042469035827488996, + "grad_norm": 15.742250200906533, + "learning_rate": 4.246822311302702e-05, + "loss": 2.6393, + "mean_token_accuracy": 0.3931034505367279, + "step": 42165 + }, + { + "epoch": 0.04247407188059316, + "grad_norm": 14.336653862827957, + "learning_rate": 4.2473259069758076e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.41203871965408323, + "step": 42170 + }, + { + "epoch": 0.04247910793369734, + "grad_norm": 18.631276554370075, + "learning_rate": 4.2478295026489135e-05, + "loss": 2.8472, + "mean_token_accuracy": 0.3241379290819168, + "step": 42175 + }, + { + "epoch": 0.04248414398680151, + "grad_norm": 19.177148216566874, + "learning_rate": 4.2483330983220194e-05, + "loss": 2.5995, + "mean_token_accuracy": 0.441379314661026, + "step": 42180 + }, + { + "epoch": 0.042489180039905684, + "grad_norm": 14.870899194173267, + "learning_rate": 4.2488366939951254e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.39243799448013306, + "step": 42185 + }, + { + "epoch": 0.04249421609300986, + "grad_norm": 17.319787922211166, + "learning_rate": 4.249340289668231e-05, + "loss": 3.0309, + "mean_token_accuracy": 0.3655172407627106, + "step": 42190 + }, + { + "epoch": 0.04249925214611403, + "grad_norm": 14.776330039628686, + "learning_rate": 4.249843885341337e-05, + "loss": 2.6946, + "mean_token_accuracy": 0.39310344457626345, + "step": 42195 + }, + { + "epoch": 0.042504288199218206, + "grad_norm": 11.958051709792558, + "learning_rate": 4.250347481014443e-05, + "loss": 2.2899, + "mean_token_accuracy": 0.4344827711582184, + "step": 42200 + }, + { + "epoch": 0.04250932425232237, + "grad_norm": 15.111951352392746, + "learning_rate": 4.250851076687549e-05, + "loss": 2.7625, + "mean_token_accuracy": 0.42068964838981626, + "step": 42205 + }, + { + "epoch": 0.042514360305426546, + "grad_norm": 18.35699183872143, + "learning_rate": 4.251354672360655e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.40834845304489137, + "step": 42210 + }, + { + "epoch": 0.04251939635853072, + "grad_norm": 14.412213982532519, + "learning_rate": 4.2518582680337616e-05, + "loss": 2.919, + "mean_token_accuracy": 0.3137931048870087, + "step": 42215 + }, + { + "epoch": 0.042524432411634894, + "grad_norm": 13.924079055611344, + "learning_rate": 4.2523618637068675e-05, + "loss": 3.1514, + "mean_token_accuracy": 0.31379309892654417, + "step": 42220 + }, + { + "epoch": 0.04252946846473907, + "grad_norm": 17.29722559270806, + "learning_rate": 4.2528654593799734e-05, + "loss": 2.677, + "mean_token_accuracy": 0.44482758045196535, + "step": 42225 + }, + { + "epoch": 0.04253450451784324, + "grad_norm": 12.559958089901833, + "learning_rate": 4.2533690550530794e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.4137930989265442, + "step": 42230 + }, + { + "epoch": 0.042539540570947415, + "grad_norm": 13.952628673728361, + "learning_rate": 4.253872650726185e-05, + "loss": 2.2965, + "mean_token_accuracy": 0.4310344815254211, + "step": 42235 + }, + { + "epoch": 0.04254457662405158, + "grad_norm": 14.904182712094958, + "learning_rate": 4.254376246399291e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.3517241358757019, + "step": 42240 + }, + { + "epoch": 0.042549612677155756, + "grad_norm": 14.296432038729126, + "learning_rate": 4.254879842072397e-05, + "loss": 2.5395, + "mean_token_accuracy": 0.3931034505367279, + "step": 42245 + }, + { + "epoch": 0.04255464873025993, + "grad_norm": 15.441271256602201, + "learning_rate": 4.255383437745503e-05, + "loss": 2.6517, + "mean_token_accuracy": 0.38275861740112305, + "step": 42250 + }, + { + "epoch": 0.0425596847833641, + "grad_norm": 11.879655297842078, + "learning_rate": 4.255887033418609e-05, + "loss": 2.9341, + "mean_token_accuracy": 0.38723533153533934, + "step": 42255 + }, + { + "epoch": 0.04256472083646828, + "grad_norm": 16.192863005730583, + "learning_rate": 4.256390629091715e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.41379311084747317, + "step": 42260 + }, + { + "epoch": 0.04256975688957245, + "grad_norm": 15.344551407430263, + "learning_rate": 4.256894224764821e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.3965517163276672, + "step": 42265 + }, + { + "epoch": 0.042574792942676624, + "grad_norm": 18.041216192121656, + "learning_rate": 4.2573978204379275e-05, + "loss": 2.6633, + "mean_token_accuracy": 0.3793103456497192, + "step": 42270 + }, + { + "epoch": 0.04257982899578079, + "grad_norm": 13.095420236141694, + "learning_rate": 4.2579014161110334e-05, + "loss": 2.5455, + "mean_token_accuracy": 0.4534785270690918, + "step": 42275 + }, + { + "epoch": 0.042584865048884965, + "grad_norm": 18.782660568694478, + "learning_rate": 4.258405011784139e-05, + "loss": 2.8752, + "mean_token_accuracy": 0.38620689511299133, + "step": 42280 + }, + { + "epoch": 0.04258990110198914, + "grad_norm": 14.331615403627207, + "learning_rate": 4.2589086074572446e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.4119177222251892, + "step": 42285 + }, + { + "epoch": 0.04259493715509331, + "grad_norm": 14.427374397439023, + "learning_rate": 4.2594122031303505e-05, + "loss": 2.8262, + "mean_token_accuracy": 0.39310344457626345, + "step": 42290 + }, + { + "epoch": 0.042599973208197486, + "grad_norm": 14.580719797050584, + "learning_rate": 4.259915798803457e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.3827586233615875, + "step": 42295 + }, + { + "epoch": 0.04260500926130166, + "grad_norm": 15.737302262281352, + "learning_rate": 4.260419394476563e-05, + "loss": 2.3139, + "mean_token_accuracy": 0.44337567687034607, + "step": 42300 + }, + { + "epoch": 0.042610045314405834, + "grad_norm": 12.396772252258224, + "learning_rate": 4.260922990149669e-05, + "loss": 2.6244, + "mean_token_accuracy": 0.38620689511299133, + "step": 42305 + }, + { + "epoch": 0.04261508136751, + "grad_norm": 13.944666778957275, + "learning_rate": 4.261426585822775e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.41724138259887694, + "step": 42310 + }, + { + "epoch": 0.042620117420614174, + "grad_norm": 15.515671024732626, + "learning_rate": 4.261930181495881e-05, + "loss": 2.717, + "mean_token_accuracy": 0.3620689630508423, + "step": 42315 + }, + { + "epoch": 0.04262515347371835, + "grad_norm": 16.774689058237154, + "learning_rate": 4.262433777168987e-05, + "loss": 2.8777, + "mean_token_accuracy": 0.39655172228813174, + "step": 42320 + }, + { + "epoch": 0.04263018952682252, + "grad_norm": 17.321443930984262, + "learning_rate": 4.2629373728420926e-05, + "loss": 2.8524, + "mean_token_accuracy": 0.34482758343219755, + "step": 42325 + }, + { + "epoch": 0.042635225579926696, + "grad_norm": 14.750339144809603, + "learning_rate": 4.2634409685151986e-05, + "loss": 2.3446, + "mean_token_accuracy": 0.4192377507686615, + "step": 42330 + }, + { + "epoch": 0.04264026163303087, + "grad_norm": 13.441199619491481, + "learning_rate": 4.2639445641883045e-05, + "loss": 2.4216, + "mean_token_accuracy": 0.42758620381355283, + "step": 42335 + }, + { + "epoch": 0.04264529768613504, + "grad_norm": 15.891862524152994, + "learning_rate": 4.2644481598614104e-05, + "loss": 2.3795, + "mean_token_accuracy": 0.42232305407524107, + "step": 42340 + }, + { + "epoch": 0.04265033373923921, + "grad_norm": 12.948383133287336, + "learning_rate": 4.264951755534516e-05, + "loss": 2.1559, + "mean_token_accuracy": 0.4949788272380829, + "step": 42345 + }, + { + "epoch": 0.042655369792343384, + "grad_norm": 15.900049266401648, + "learning_rate": 4.265455351207623e-05, + "loss": 2.42, + "mean_token_accuracy": 0.4068965494632721, + "step": 42350 + }, + { + "epoch": 0.04266040584544756, + "grad_norm": 15.31039837278257, + "learning_rate": 4.265958946880729e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.4413793087005615, + "step": 42355 + }, + { + "epoch": 0.04266544189855173, + "grad_norm": 14.99418747520708, + "learning_rate": 4.266462542553835e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.4310344815254211, + "step": 42360 + }, + { + "epoch": 0.042670477951655905, + "grad_norm": 12.95952244195856, + "learning_rate": 4.266966138226941e-05, + "loss": 2.7455, + "mean_token_accuracy": 0.3999999940395355, + "step": 42365 + }, + { + "epoch": 0.04267551400476008, + "grad_norm": 17.553404897380318, + "learning_rate": 4.2674697339000466e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.3827586233615875, + "step": 42370 + }, + { + "epoch": 0.04268055005786425, + "grad_norm": 16.51956062951908, + "learning_rate": 4.2679733295731526e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.42413792610168455, + "step": 42375 + }, + { + "epoch": 0.04268558611096842, + "grad_norm": 14.219170245675066, + "learning_rate": 4.2684769252462585e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.3965517163276672, + "step": 42380 + }, + { + "epoch": 0.04269062216407259, + "grad_norm": 14.482484529196329, + "learning_rate": 4.2689805209193644e-05, + "loss": 2.6356, + "mean_token_accuracy": 0.3965517282485962, + "step": 42385 + }, + { + "epoch": 0.04269565821717677, + "grad_norm": 15.67689406125198, + "learning_rate": 4.26948411659247e-05, + "loss": 2.9787, + "mean_token_accuracy": 0.38275861740112305, + "step": 42390 + }, + { + "epoch": 0.04270069427028094, + "grad_norm": 12.490245603448765, + "learning_rate": 4.269987712265576e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.39655171930789945, + "step": 42395 + }, + { + "epoch": 0.042705730323385115, + "grad_norm": 15.540813375818972, + "learning_rate": 4.270491307938682e-05, + "loss": 2.7374, + "mean_token_accuracy": 0.3620689630508423, + "step": 42400 + }, + { + "epoch": 0.04271076637648929, + "grad_norm": 12.979856849694029, + "learning_rate": 4.270994903611789e-05, + "loss": 2.8293, + "mean_token_accuracy": 0.36896551847457887, + "step": 42405 + }, + { + "epoch": 0.04271580242959346, + "grad_norm": 11.564042584272398, + "learning_rate": 4.271498499284895e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.4482758641242981, + "step": 42410 + }, + { + "epoch": 0.04272083848269763, + "grad_norm": 14.029106244070876, + "learning_rate": 4.272002094958e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.4103448212146759, + "step": 42415 + }, + { + "epoch": 0.0427258745358018, + "grad_norm": 18.05882644843383, + "learning_rate": 4.272505690631106e-05, + "loss": 2.1484, + "mean_token_accuracy": 0.4400483965873718, + "step": 42420 + }, + { + "epoch": 0.042730910588905976, + "grad_norm": 12.345629538275608, + "learning_rate": 4.273009286304212e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.42244404554367065, + "step": 42425 + }, + { + "epoch": 0.04273594664201015, + "grad_norm": 12.352939394068553, + "learning_rate": 4.2735128819773184e-05, + "loss": 2.4946, + "mean_token_accuracy": 0.4206896543502808, + "step": 42430 + }, + { + "epoch": 0.042740982695114324, + "grad_norm": 15.708169505741802, + "learning_rate": 4.2740164776504243e-05, + "loss": 2.8369, + "mean_token_accuracy": 0.3744706571102142, + "step": 42435 + }, + { + "epoch": 0.0427460187482185, + "grad_norm": 11.675010757335446, + "learning_rate": 4.27452007332353e-05, + "loss": 2.7415, + "mean_token_accuracy": 0.35862069129943847, + "step": 42440 + }, + { + "epoch": 0.04275105480132267, + "grad_norm": 15.58706914867207, + "learning_rate": 4.275023668996636e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.4137930989265442, + "step": 42445 + }, + { + "epoch": 0.04275609085442684, + "grad_norm": 12.666488494284264, + "learning_rate": 4.275527264669742e-05, + "loss": 2.6382, + "mean_token_accuracy": 0.4034482777118683, + "step": 42450 + }, + { + "epoch": 0.04276112690753101, + "grad_norm": 13.826522865107606, + "learning_rate": 4.276030860342849e-05, + "loss": 2.6516, + "mean_token_accuracy": 0.38620689511299133, + "step": 42455 + }, + { + "epoch": 0.042766162960635186, + "grad_norm": 11.654796102732554, + "learning_rate": 4.276534456015954e-05, + "loss": 2.5875, + "mean_token_accuracy": 0.4, + "step": 42460 + }, + { + "epoch": 0.04277119901373936, + "grad_norm": 13.037544524089572, + "learning_rate": 4.27703805168906e-05, + "loss": 2.5072, + "mean_token_accuracy": 0.41379310488700866, + "step": 42465 + }, + { + "epoch": 0.04277623506684353, + "grad_norm": 15.854954238688503, + "learning_rate": 4.277541647362166e-05, + "loss": 2.6881, + "mean_token_accuracy": 0.3620689630508423, + "step": 42470 + }, + { + "epoch": 0.04278127111994771, + "grad_norm": 13.652851304374813, + "learning_rate": 4.278045243035272e-05, + "loss": 2.3754, + "mean_token_accuracy": 0.3999999940395355, + "step": 42475 + }, + { + "epoch": 0.04278630717305188, + "grad_norm": 18.583763066731468, + "learning_rate": 4.278548838708378e-05, + "loss": 2.6745, + "mean_token_accuracy": 0.4017543882131577, + "step": 42480 + }, + { + "epoch": 0.04279134322615605, + "grad_norm": 12.364417676963345, + "learning_rate": 4.279052434381484e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.4641863167285919, + "step": 42485 + }, + { + "epoch": 0.04279637927926022, + "grad_norm": 13.678942247463272, + "learning_rate": 4.27955603005459e-05, + "loss": 2.3284, + "mean_token_accuracy": 0.458620685338974, + "step": 42490 + }, + { + "epoch": 0.042801415332364395, + "grad_norm": 12.527288771637664, + "learning_rate": 4.280059625727696e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.3379310369491577, + "step": 42495 + }, + { + "epoch": 0.04280645138546857, + "grad_norm": 14.437391933939349, + "learning_rate": 4.280563221400802e-05, + "loss": 2.7137, + "mean_token_accuracy": 0.4, + "step": 42500 + }, + { + "epoch": 0.04281148743857274, + "grad_norm": 14.318938536633889, + "learning_rate": 4.281066817073908e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.41379310488700866, + "step": 42505 + }, + { + "epoch": 0.04281652349167692, + "grad_norm": 14.449078548213425, + "learning_rate": 4.281570412747014e-05, + "loss": 2.292, + "mean_token_accuracy": 0.4034482777118683, + "step": 42510 + }, + { + "epoch": 0.04282155954478109, + "grad_norm": 13.363664878706478, + "learning_rate": 4.28207400842012e-05, + "loss": 2.8499, + "mean_token_accuracy": 0.3620689630508423, + "step": 42515 + }, + { + "epoch": 0.04282659559788526, + "grad_norm": 11.886331527812537, + "learning_rate": 4.282577604093226e-05, + "loss": 2.5383, + "mean_token_accuracy": 0.41929823756217954, + "step": 42520 + }, + { + "epoch": 0.04283163165098943, + "grad_norm": 14.07978327348943, + "learning_rate": 4.283081199766332e-05, + "loss": 2.5866, + "mean_token_accuracy": 0.39655172228813174, + "step": 42525 + }, + { + "epoch": 0.042836667704093605, + "grad_norm": 11.879863150683494, + "learning_rate": 4.2835847954394376e-05, + "loss": 2.5158, + "mean_token_accuracy": 0.38620689511299133, + "step": 42530 + }, + { + "epoch": 0.04284170375719778, + "grad_norm": 17.590022392053584, + "learning_rate": 4.284088391112544e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.38965516686439516, + "step": 42535 + }, + { + "epoch": 0.04284673981030195, + "grad_norm": 12.437440381832182, + "learning_rate": 4.28459198678565e-05, + "loss": 2.6532, + "mean_token_accuracy": 0.41379310488700866, + "step": 42540 + }, + { + "epoch": 0.042851775863406126, + "grad_norm": 12.192273837636188, + "learning_rate": 4.285095582458756e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.44482758045196535, + "step": 42545 + }, + { + "epoch": 0.0428568119165103, + "grad_norm": 12.609447648747556, + "learning_rate": 4.285599178131861e-05, + "loss": 2.5528, + "mean_token_accuracy": 0.3793103456497192, + "step": 42550 + }, + { + "epoch": 0.04286184796961447, + "grad_norm": 11.286364434772523, + "learning_rate": 4.286102773804967e-05, + "loss": 2.6489, + "mean_token_accuracy": 0.4034482777118683, + "step": 42555 + }, + { + "epoch": 0.04286688402271864, + "grad_norm": 14.555588426265109, + "learning_rate": 4.286606369478074e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.45862069725990295, + "step": 42560 + }, + { + "epoch": 0.042871920075822814, + "grad_norm": 10.423228708481181, + "learning_rate": 4.28710996515118e-05, + "loss": 2.4897, + "mean_token_accuracy": 0.43920145034790037, + "step": 42565 + }, + { + "epoch": 0.04287695612892699, + "grad_norm": 15.583529909639854, + "learning_rate": 4.287613560824286e-05, + "loss": 2.458, + "mean_token_accuracy": 0.39310344457626345, + "step": 42570 + }, + { + "epoch": 0.04288199218203116, + "grad_norm": 14.819796875372754, + "learning_rate": 4.2881171564973916e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.4034482717514038, + "step": 42575 + }, + { + "epoch": 0.042887028235135335, + "grad_norm": 13.840809874275212, + "learning_rate": 4.2886207521704975e-05, + "loss": 2.5159, + "mean_token_accuracy": 0.4344827592372894, + "step": 42580 + }, + { + "epoch": 0.04289206428823951, + "grad_norm": 11.397364721717684, + "learning_rate": 4.2891243478436035e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.4344827592372894, + "step": 42585 + }, + { + "epoch": 0.042897100341343676, + "grad_norm": 16.904424012318987, + "learning_rate": 4.2896279435167094e-05, + "loss": 2.5979, + "mean_token_accuracy": 0.38620689511299133, + "step": 42590 + }, + { + "epoch": 0.04290213639444785, + "grad_norm": 15.995372441529161, + "learning_rate": 4.290131539189815e-05, + "loss": 2.5569, + "mean_token_accuracy": 0.3965517163276672, + "step": 42595 + }, + { + "epoch": 0.042907172447552024, + "grad_norm": 14.534087408043131, + "learning_rate": 4.290635134862921e-05, + "loss": 2.3422, + "mean_token_accuracy": 0.44482759237289426, + "step": 42600 + }, + { + "epoch": 0.0429122085006562, + "grad_norm": 13.180057679764511, + "learning_rate": 4.291138730536027e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.4310344815254211, + "step": 42605 + }, + { + "epoch": 0.04291724455376037, + "grad_norm": 16.289163593952928, + "learning_rate": 4.291642326209133e-05, + "loss": 2.9597, + "mean_token_accuracy": 0.3517241418361664, + "step": 42610 + }, + { + "epoch": 0.042922280606864545, + "grad_norm": 14.192912700929499, + "learning_rate": 4.29214592188224e-05, + "loss": 2.6274, + "mean_token_accuracy": 0.38118572235107423, + "step": 42615 + }, + { + "epoch": 0.04292731665996872, + "grad_norm": 12.08154023126303, + "learning_rate": 4.2926495175553456e-05, + "loss": 2.3886, + "mean_token_accuracy": 0.4310344815254211, + "step": 42620 + }, + { + "epoch": 0.042932352713072885, + "grad_norm": 14.368803457135082, + "learning_rate": 4.2931531132284515e-05, + "loss": 2.8807, + "mean_token_accuracy": 0.33448275923728943, + "step": 42625 + }, + { + "epoch": 0.04293738876617706, + "grad_norm": 14.22655187383774, + "learning_rate": 4.2936567089015575e-05, + "loss": 2.5781, + "mean_token_accuracy": 0.3896551728248596, + "step": 42630 + }, + { + "epoch": 0.04294242481928123, + "grad_norm": 11.834411363633627, + "learning_rate": 4.2941603045746634e-05, + "loss": 2.224, + "mean_token_accuracy": 0.48118571639060975, + "step": 42635 + }, + { + "epoch": 0.04294746087238541, + "grad_norm": 14.424438503085119, + "learning_rate": 4.294663900247769e-05, + "loss": 2.2941, + "mean_token_accuracy": 0.4310344815254211, + "step": 42640 + }, + { + "epoch": 0.04295249692548958, + "grad_norm": 14.405130546509609, + "learning_rate": 4.295167495920875e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.43103447556495667, + "step": 42645 + }, + { + "epoch": 0.042957532978593754, + "grad_norm": 12.447160404945365, + "learning_rate": 4.295671091593981e-05, + "loss": 2.8076, + "mean_token_accuracy": 0.39497882723808286, + "step": 42650 + }, + { + "epoch": 0.04296256903169793, + "grad_norm": 13.393840471390812, + "learning_rate": 4.296174687267087e-05, + "loss": 2.5322, + "mean_token_accuracy": 0.4206896543502808, + "step": 42655 + }, + { + "epoch": 0.042967605084802095, + "grad_norm": 11.045596972590426, + "learning_rate": 4.296678282940193e-05, + "loss": 2.768, + "mean_token_accuracy": 0.4034482777118683, + "step": 42660 + }, + { + "epoch": 0.04297264113790627, + "grad_norm": 15.474887807216168, + "learning_rate": 4.297181878613299e-05, + "loss": 2.8179, + "mean_token_accuracy": 0.34482758641242983, + "step": 42665 + }, + { + "epoch": 0.04297767719101044, + "grad_norm": 10.401192947622059, + "learning_rate": 4.2976854742864055e-05, + "loss": 2.3975, + "mean_token_accuracy": 0.4172413766384125, + "step": 42670 + }, + { + "epoch": 0.042982713244114616, + "grad_norm": 12.833537782131634, + "learning_rate": 4.2981890699595115e-05, + "loss": 2.5405, + "mean_token_accuracy": 0.37241379022598264, + "step": 42675 + }, + { + "epoch": 0.04298774929721879, + "grad_norm": 14.97859743185117, + "learning_rate": 4.2986926656326174e-05, + "loss": 2.7911, + "mean_token_accuracy": 0.4206896543502808, + "step": 42680 + }, + { + "epoch": 0.042992785350322964, + "grad_norm": 11.945216989507687, + "learning_rate": 4.2991962613057226e-05, + "loss": 2.7857, + "mean_token_accuracy": 0.3655172407627106, + "step": 42685 + }, + { + "epoch": 0.04299782140342714, + "grad_norm": 17.318628864970286, + "learning_rate": 4.2996998569788286e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.4034482777118683, + "step": 42690 + }, + { + "epoch": 0.043002857456531304, + "grad_norm": 13.709504266331162, + "learning_rate": 4.300203452651935e-05, + "loss": 3.0416, + "mean_token_accuracy": 0.3862069070339203, + "step": 42695 + }, + { + "epoch": 0.04300789350963548, + "grad_norm": 15.38949121289357, + "learning_rate": 4.300707048325041e-05, + "loss": 3.1066, + "mean_token_accuracy": 0.3646702915430069, + "step": 42700 + }, + { + "epoch": 0.04301292956273965, + "grad_norm": 13.078836278604516, + "learning_rate": 4.301210643998147e-05, + "loss": 2.2775, + "mean_token_accuracy": 0.43793103098869324, + "step": 42705 + }, + { + "epoch": 0.043017965615843826, + "grad_norm": 12.575992435217163, + "learning_rate": 4.301714239671253e-05, + "loss": 2.8655, + "mean_token_accuracy": 0.358620685338974, + "step": 42710 + }, + { + "epoch": 0.043023001668948, + "grad_norm": 19.974491645020603, + "learning_rate": 4.302217835344359e-05, + "loss": 2.7685, + "mean_token_accuracy": 0.40169388651847837, + "step": 42715 + }, + { + "epoch": 0.04302803772205217, + "grad_norm": 12.851491495664765, + "learning_rate": 4.3027214310174655e-05, + "loss": 2.9187, + "mean_token_accuracy": 0.3827586233615875, + "step": 42720 + }, + { + "epoch": 0.04303307377515635, + "grad_norm": 19.585268603036194, + "learning_rate": 4.303225026690571e-05, + "loss": 2.6213, + "mean_token_accuracy": 0.41379311084747317, + "step": 42725 + }, + { + "epoch": 0.043038109828260514, + "grad_norm": 12.692247164970185, + "learning_rate": 4.3037286223636766e-05, + "loss": 3.0999, + "mean_token_accuracy": 0.32413792610168457, + "step": 42730 + }, + { + "epoch": 0.04304314588136469, + "grad_norm": 12.932886868488078, + "learning_rate": 4.3042322180367826e-05, + "loss": 2.446, + "mean_token_accuracy": 0.3814881980419159, + "step": 42735 + }, + { + "epoch": 0.04304818193446886, + "grad_norm": 14.318685162633372, + "learning_rate": 4.3047358137098885e-05, + "loss": 2.6926, + "mean_token_accuracy": 0.38620689511299133, + "step": 42740 + }, + { + "epoch": 0.043053217987573035, + "grad_norm": 17.429441479973644, + "learning_rate": 4.3052394093829944e-05, + "loss": 2.8886, + "mean_token_accuracy": 0.3655172407627106, + "step": 42745 + }, + { + "epoch": 0.04305825404067721, + "grad_norm": 15.412231337552774, + "learning_rate": 4.305743005056101e-05, + "loss": 2.5241, + "mean_token_accuracy": 0.42758620381355283, + "step": 42750 + }, + { + "epoch": 0.04306329009378138, + "grad_norm": 14.665060980235358, + "learning_rate": 4.306246600729207e-05, + "loss": 2.3398, + "mean_token_accuracy": 0.43103448748588563, + "step": 42755 + }, + { + "epoch": 0.043068326146885556, + "grad_norm": 15.859227202107647, + "learning_rate": 4.306750196402313e-05, + "loss": 2.631, + "mean_token_accuracy": 0.42413792610168455, + "step": 42760 + }, + { + "epoch": 0.04307336219998972, + "grad_norm": 14.728487050105088, + "learning_rate": 4.307253792075419e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.4172413766384125, + "step": 42765 + }, + { + "epoch": 0.0430783982530939, + "grad_norm": 14.875068886786549, + "learning_rate": 4.307757387748525e-05, + "loss": 2.5842, + "mean_token_accuracy": 0.42068964838981626, + "step": 42770 + }, + { + "epoch": 0.04308343430619807, + "grad_norm": 13.31001833252351, + "learning_rate": 4.3082609834216307e-05, + "loss": 2.3986, + "mean_token_accuracy": 0.3827586233615875, + "step": 42775 + }, + { + "epoch": 0.043088470359302244, + "grad_norm": 10.654826789800902, + "learning_rate": 4.3087645790947366e-05, + "loss": 2.5992, + "mean_token_accuracy": 0.38124621510505674, + "step": 42780 + }, + { + "epoch": 0.04309350641240642, + "grad_norm": 13.488921091471132, + "learning_rate": 4.3092681747678425e-05, + "loss": 2.9385, + "mean_token_accuracy": 0.33103448450565337, + "step": 42785 + }, + { + "epoch": 0.04309854246551059, + "grad_norm": 13.752968324755004, + "learning_rate": 4.3097717704409484e-05, + "loss": 2.8167, + "mean_token_accuracy": 0.3482758581638336, + "step": 42790 + }, + { + "epoch": 0.043103578518614766, + "grad_norm": 14.54340265546484, + "learning_rate": 4.3102753661140544e-05, + "loss": 2.6557, + "mean_token_accuracy": 0.3620689630508423, + "step": 42795 + }, + { + "epoch": 0.04310861457171893, + "grad_norm": 13.958273588277942, + "learning_rate": 4.310778961787161e-05, + "loss": 2.8645, + "mean_token_accuracy": 0.33793103098869326, + "step": 42800 + }, + { + "epoch": 0.043113650624823106, + "grad_norm": 15.052724088428423, + "learning_rate": 4.311282557460267e-05, + "loss": 2.6175, + "mean_token_accuracy": 0.43278887271881106, + "step": 42805 + }, + { + "epoch": 0.04311868667792728, + "grad_norm": 25.49424797218806, + "learning_rate": 4.311786153133373e-05, + "loss": 2.7239, + "mean_token_accuracy": 0.3551724135875702, + "step": 42810 + }, + { + "epoch": 0.043123722731031454, + "grad_norm": 8.499539185378897, + "learning_rate": 4.312289748806479e-05, + "loss": 2.3754, + "mean_token_accuracy": 0.4206896543502808, + "step": 42815 + }, + { + "epoch": 0.04312875878413563, + "grad_norm": 12.030569727972235, + "learning_rate": 4.312793344479584e-05, + "loss": 2.3915, + "mean_token_accuracy": 0.4206896543502808, + "step": 42820 + }, + { + "epoch": 0.0431337948372398, + "grad_norm": 21.33801015877675, + "learning_rate": 4.31329694015269e-05, + "loss": 2.7866, + "mean_token_accuracy": 0.3482758551836014, + "step": 42825 + }, + { + "epoch": 0.043138830890343975, + "grad_norm": 11.798603932857539, + "learning_rate": 4.3138005358257965e-05, + "loss": 2.0824, + "mean_token_accuracy": 0.46418632566928864, + "step": 42830 + }, + { + "epoch": 0.04314386694344814, + "grad_norm": 33.40118134418917, + "learning_rate": 4.3143041314989024e-05, + "loss": 2.6826, + "mean_token_accuracy": 0.3655172407627106, + "step": 42835 + }, + { + "epoch": 0.043148902996552316, + "grad_norm": 12.788676569442492, + "learning_rate": 4.3148077271720084e-05, + "loss": 2.5732, + "mean_token_accuracy": 0.41034482717514037, + "step": 42840 + }, + { + "epoch": 0.04315393904965649, + "grad_norm": 11.82799888141429, + "learning_rate": 4.315311322845114e-05, + "loss": 2.5951, + "mean_token_accuracy": 0.3793103456497192, + "step": 42845 + }, + { + "epoch": 0.04315897510276066, + "grad_norm": 14.042403345673172, + "learning_rate": 4.31581491851822e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.42068964838981626, + "step": 42850 + }, + { + "epoch": 0.04316401115586484, + "grad_norm": 12.082065602165219, + "learning_rate": 4.316318514191327e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.39310344457626345, + "step": 42855 + }, + { + "epoch": 0.04316904720896901, + "grad_norm": 10.982935288489422, + "learning_rate": 4.316822109864432e-05, + "loss": 2.494, + "mean_token_accuracy": 0.41724138855934145, + "step": 42860 + }, + { + "epoch": 0.043174083262073185, + "grad_norm": 11.549541624359899, + "learning_rate": 4.317325705537538e-05, + "loss": 2.8055, + "mean_token_accuracy": 0.4, + "step": 42865 + }, + { + "epoch": 0.04317911931517735, + "grad_norm": 17.95237555377303, + "learning_rate": 4.317829301210644e-05, + "loss": 2.7343, + "mean_token_accuracy": 0.4068965494632721, + "step": 42870 + }, + { + "epoch": 0.043184155368281525, + "grad_norm": 13.035323024305521, + "learning_rate": 4.31833289688375e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.42068966031074523, + "step": 42875 + }, + { + "epoch": 0.0431891914213857, + "grad_norm": 10.172569748117262, + "learning_rate": 4.3188364925568564e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.4275862157344818, + "step": 42880 + }, + { + "epoch": 0.04319422747448987, + "grad_norm": 14.896547723341527, + "learning_rate": 4.3193400882299624e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.42758620381355283, + "step": 42885 + }, + { + "epoch": 0.043199263527594046, + "grad_norm": 15.817082310922139, + "learning_rate": 4.319843683903068e-05, + "loss": 2.5206, + "mean_token_accuracy": 0.44615850448608396, + "step": 42890 + }, + { + "epoch": 0.04320429958069822, + "grad_norm": 12.709828779685514, + "learning_rate": 4.320347279576174e-05, + "loss": 2.8897, + "mean_token_accuracy": 0.36551723480224607, + "step": 42895 + }, + { + "epoch": 0.043209335633802394, + "grad_norm": 14.22070682297591, + "learning_rate": 4.32085087524928e-05, + "loss": 2.5968, + "mean_token_accuracy": 0.3931034505367279, + "step": 42900 + }, + { + "epoch": 0.04321437168690656, + "grad_norm": 13.051136429145583, + "learning_rate": 4.321354470922386e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.4068965554237366, + "step": 42905 + }, + { + "epoch": 0.043219407740010735, + "grad_norm": 14.547566634077588, + "learning_rate": 4.321858066595492e-05, + "loss": 2.5519, + "mean_token_accuracy": 0.4172413766384125, + "step": 42910 + }, + { + "epoch": 0.04322444379311491, + "grad_norm": 13.401343506142018, + "learning_rate": 4.322361662268598e-05, + "loss": 2.3575, + "mean_token_accuracy": 0.42413793206214906, + "step": 42915 + }, + { + "epoch": 0.04322947984621908, + "grad_norm": 16.086908711077506, + "learning_rate": 4.322865257941704e-05, + "loss": 2.5245, + "mean_token_accuracy": 0.463520872592926, + "step": 42920 + }, + { + "epoch": 0.043234515899323256, + "grad_norm": 19.539453496341785, + "learning_rate": 4.32336885361481e-05, + "loss": 3.0739, + "mean_token_accuracy": 0.38275861740112305, + "step": 42925 + }, + { + "epoch": 0.04323955195242743, + "grad_norm": 16.102171462159763, + "learning_rate": 4.323872449287916e-05, + "loss": 2.6971, + "mean_token_accuracy": 0.4, + "step": 42930 + }, + { + "epoch": 0.0432445880055316, + "grad_norm": 14.604535188777865, + "learning_rate": 4.324376044961022e-05, + "loss": 2.7085, + "mean_token_accuracy": 0.3655172407627106, + "step": 42935 + }, + { + "epoch": 0.04324962405863577, + "grad_norm": 14.888040688792518, + "learning_rate": 4.324879640634128e-05, + "loss": 3.0682, + "mean_token_accuracy": 0.3793103516101837, + "step": 42940 + }, + { + "epoch": 0.043254660111739944, + "grad_norm": 45.10352474136455, + "learning_rate": 4.325383236307234e-05, + "loss": 3.077, + "mean_token_accuracy": 0.3758620709180832, + "step": 42945 + }, + { + "epoch": 0.04325969616484412, + "grad_norm": 13.255845301379836, + "learning_rate": 4.3258868319803394e-05, + "loss": 2.4893, + "mean_token_accuracy": 0.4068965554237366, + "step": 42950 + }, + { + "epoch": 0.04326473221794829, + "grad_norm": 19.385992686266437, + "learning_rate": 4.326390427653445e-05, + "loss": 3.1516, + "mean_token_accuracy": 0.3517241358757019, + "step": 42955 + }, + { + "epoch": 0.043269768271052465, + "grad_norm": 15.5630097100151, + "learning_rate": 4.326894023326552e-05, + "loss": 3.1361, + "mean_token_accuracy": 0.32758620381355286, + "step": 42960 + }, + { + "epoch": 0.04327480432415664, + "grad_norm": 13.429486287678065, + "learning_rate": 4.327397618999658e-05, + "loss": 2.7853, + "mean_token_accuracy": 0.3517241418361664, + "step": 42965 + }, + { + "epoch": 0.04327984037726081, + "grad_norm": 12.35284586939357, + "learning_rate": 4.327901214672764e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.3896551787853241, + "step": 42970 + }, + { + "epoch": 0.04328487643036498, + "grad_norm": 11.43572494800055, + "learning_rate": 4.32840481034587e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.3758620619773865, + "step": 42975 + }, + { + "epoch": 0.04328991248346915, + "grad_norm": 10.997491340570352, + "learning_rate": 4.3289084060189756e-05, + "loss": 2.4089, + "mean_token_accuracy": 0.4465819835662842, + "step": 42980 + }, + { + "epoch": 0.04329494853657333, + "grad_norm": 12.925946346186857, + "learning_rate": 4.329412001692082e-05, + "loss": 2.9017, + "mean_token_accuracy": 0.36896551251411436, + "step": 42985 + }, + { + "epoch": 0.0432999845896775, + "grad_norm": 12.694847266546367, + "learning_rate": 4.329915597365188e-05, + "loss": 2.9158, + "mean_token_accuracy": 0.39655172228813174, + "step": 42990 + }, + { + "epoch": 0.043305020642781675, + "grad_norm": 12.451086891758022, + "learning_rate": 4.3304191930382934e-05, + "loss": 2.1592, + "mean_token_accuracy": 0.41379310488700866, + "step": 42995 + }, + { + "epoch": 0.04331005669588585, + "grad_norm": 11.098650021708801, + "learning_rate": 4.330922788711399e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.3896551728248596, + "step": 43000 + }, + { + "epoch": 0.04331509274899002, + "grad_norm": 12.663143956804458, + "learning_rate": 4.331426384384505e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.4413793087005615, + "step": 43005 + }, + { + "epoch": 0.04332012880209419, + "grad_norm": 13.805755930495046, + "learning_rate": 4.331929980057611e-05, + "loss": 3.0402, + "mean_token_accuracy": 0.3482758641242981, + "step": 43010 + }, + { + "epoch": 0.04332516485519836, + "grad_norm": 11.348780509726526, + "learning_rate": 4.332433575730718e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.4551724135875702, + "step": 43015 + }, + { + "epoch": 0.04333020090830254, + "grad_norm": 12.963712840786435, + "learning_rate": 4.332937171403824e-05, + "loss": 2.4936, + "mean_token_accuracy": 0.42758620977401735, + "step": 43020 + }, + { + "epoch": 0.04333523696140671, + "grad_norm": 18.92054234869347, + "learning_rate": 4.3334407670769296e-05, + "loss": 2.78, + "mean_token_accuracy": 0.3482758581638336, + "step": 43025 + }, + { + "epoch": 0.043340273014510884, + "grad_norm": 15.808847624735115, + "learning_rate": 4.3339443627500356e-05, + "loss": 2.3086, + "mean_token_accuracy": 0.4034482717514038, + "step": 43030 + }, + { + "epoch": 0.04334530906761506, + "grad_norm": 16.89370061101859, + "learning_rate": 4.3344479584231415e-05, + "loss": 2.6609, + "mean_token_accuracy": 0.3896551728248596, + "step": 43035 + }, + { + "epoch": 0.04335034512071923, + "grad_norm": 14.204253711162275, + "learning_rate": 4.3349515540962474e-05, + "loss": 2.3443, + "mean_token_accuracy": 0.42758620977401735, + "step": 43040 + }, + { + "epoch": 0.0433553811738234, + "grad_norm": 12.960054436857677, + "learning_rate": 4.335455149769353e-05, + "loss": 2.6112, + "mean_token_accuracy": 0.39655172228813174, + "step": 43045 + }, + { + "epoch": 0.04336041722692757, + "grad_norm": 13.395314174471789, + "learning_rate": 4.335958745442459e-05, + "loss": 2.8793, + "mean_token_accuracy": 0.379310342669487, + "step": 43050 + }, + { + "epoch": 0.043365453280031746, + "grad_norm": 17.100956758798244, + "learning_rate": 4.336462341115565e-05, + "loss": 2.5055, + "mean_token_accuracy": 0.4103448212146759, + "step": 43055 + }, + { + "epoch": 0.04337048933313592, + "grad_norm": 18.113587644654412, + "learning_rate": 4.336965936788671e-05, + "loss": 2.7012, + "mean_token_accuracy": 0.3551724076271057, + "step": 43060 + }, + { + "epoch": 0.043375525386240094, + "grad_norm": 13.882859292640665, + "learning_rate": 4.337469532461778e-05, + "loss": 2.5601, + "mean_token_accuracy": 0.4241379380226135, + "step": 43065 + }, + { + "epoch": 0.04338056143934427, + "grad_norm": 13.637055373355933, + "learning_rate": 4.3379731281348836e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.4172413766384125, + "step": 43070 + }, + { + "epoch": 0.04338559749244844, + "grad_norm": 17.09538398175619, + "learning_rate": 4.3384767238079896e-05, + "loss": 2.7013, + "mean_token_accuracy": 0.4034482777118683, + "step": 43075 + }, + { + "epoch": 0.04339063354555261, + "grad_norm": 12.381146266209566, + "learning_rate": 4.3389803194810955e-05, + "loss": 2.099, + "mean_token_accuracy": 0.4586206912994385, + "step": 43080 + }, + { + "epoch": 0.04339566959865678, + "grad_norm": 11.697657761051355, + "learning_rate": 4.339483915154201e-05, + "loss": 2.5589, + "mean_token_accuracy": 0.4103448331356049, + "step": 43085 + }, + { + "epoch": 0.043400705651760955, + "grad_norm": 12.138802313032107, + "learning_rate": 4.3399875108273067e-05, + "loss": 2.37, + "mean_token_accuracy": 0.44827587604522706, + "step": 43090 + }, + { + "epoch": 0.04340574170486513, + "grad_norm": 13.100388377327807, + "learning_rate": 4.340491106500413e-05, + "loss": 2.4283, + "mean_token_accuracy": 0.4517241358757019, + "step": 43095 + }, + { + "epoch": 0.0434107777579693, + "grad_norm": 15.422257364256089, + "learning_rate": 4.340994702173519e-05, + "loss": 2.6166, + "mean_token_accuracy": 0.3935960590839386, + "step": 43100 + }, + { + "epoch": 0.04341581381107348, + "grad_norm": 13.08115260711766, + "learning_rate": 4.341498297846625e-05, + "loss": 2.6272, + "mean_token_accuracy": 0.3551724016666412, + "step": 43105 + }, + { + "epoch": 0.04342084986417765, + "grad_norm": 15.029613899522984, + "learning_rate": 4.342001893519731e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.42068964838981626, + "step": 43110 + }, + { + "epoch": 0.04342588591728182, + "grad_norm": 12.77797923526171, + "learning_rate": 4.342505489192837e-05, + "loss": 2.5419, + "mean_token_accuracy": 0.37241379022598264, + "step": 43115 + }, + { + "epoch": 0.04343092197038599, + "grad_norm": 12.896080787141113, + "learning_rate": 4.3430090848659436e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.42068966627120974, + "step": 43120 + }, + { + "epoch": 0.043435958023490165, + "grad_norm": 11.597229728794334, + "learning_rate": 4.343512680539049e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.3482758641242981, + "step": 43125 + }, + { + "epoch": 0.04344099407659434, + "grad_norm": 10.732689666909268, + "learning_rate": 4.344016276212155e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.4379310369491577, + "step": 43130 + }, + { + "epoch": 0.04344603012969851, + "grad_norm": 14.411309254698672, + "learning_rate": 4.3445198718852607e-05, + "loss": 2.6971, + "mean_token_accuracy": 0.3517241358757019, + "step": 43135 + }, + { + "epoch": 0.043451066182802686, + "grad_norm": 15.761689482204545, + "learning_rate": 4.3450234675583666e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.43103448748588563, + "step": 43140 + }, + { + "epoch": 0.04345610223590686, + "grad_norm": 13.926954334746963, + "learning_rate": 4.345527063231473e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.41379310488700866, + "step": 43145 + }, + { + "epoch": 0.04346113828901103, + "grad_norm": 14.96288766291618, + "learning_rate": 4.346030658904579e-05, + "loss": 2.7384, + "mean_token_accuracy": 0.41034482717514037, + "step": 43150 + }, + { + "epoch": 0.0434661743421152, + "grad_norm": 15.272037767044255, + "learning_rate": 4.346534254577685e-05, + "loss": 2.7702, + "mean_token_accuracy": 0.3862069010734558, + "step": 43155 + }, + { + "epoch": 0.043471210395219374, + "grad_norm": 13.094975164110359, + "learning_rate": 4.347037850250791e-05, + "loss": 2.3679, + "mean_token_accuracy": 0.41724138259887694, + "step": 43160 + }, + { + "epoch": 0.04347624644832355, + "grad_norm": 17.485843776858143, + "learning_rate": 4.347541445923897e-05, + "loss": 2.7991, + "mean_token_accuracy": 0.379310342669487, + "step": 43165 + }, + { + "epoch": 0.04348128250142772, + "grad_norm": 12.355648942361583, + "learning_rate": 4.348045041597003e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.4467634618282318, + "step": 43170 + }, + { + "epoch": 0.043486318554531896, + "grad_norm": 15.748488777614266, + "learning_rate": 4.348548637270109e-05, + "loss": 2.7299, + "mean_token_accuracy": 0.35862069129943847, + "step": 43175 + }, + { + "epoch": 0.04349135460763607, + "grad_norm": 12.174001820518745, + "learning_rate": 4.349052232943215e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.36551724672317504, + "step": 43180 + }, + { + "epoch": 0.043496390660740236, + "grad_norm": 16.239929322014568, + "learning_rate": 4.3495558286163206e-05, + "loss": 2.7893, + "mean_token_accuracy": 0.3310344785451889, + "step": 43185 + }, + { + "epoch": 0.04350142671384441, + "grad_norm": 13.033772908938829, + "learning_rate": 4.3500594242894265e-05, + "loss": 2.6333, + "mean_token_accuracy": 0.3999999940395355, + "step": 43190 + }, + { + "epoch": 0.043506462766948584, + "grad_norm": 19.35030000193822, + "learning_rate": 4.3505630199625324e-05, + "loss": 2.8275, + "mean_token_accuracy": 0.36896551847457887, + "step": 43195 + }, + { + "epoch": 0.04351149882005276, + "grad_norm": 15.278772099255892, + "learning_rate": 4.351066615635639e-05, + "loss": 2.5621, + "mean_token_accuracy": 0.3517241358757019, + "step": 43200 + }, + { + "epoch": 0.04351653487315693, + "grad_norm": 12.560360758937867, + "learning_rate": 4.351570211308745e-05, + "loss": 2.6419, + "mean_token_accuracy": 0.4034482777118683, + "step": 43205 + }, + { + "epoch": 0.043521570926261105, + "grad_norm": 11.615216013448267, + "learning_rate": 4.352073806981851e-05, + "loss": 1.9393, + "mean_token_accuracy": 0.5322660088539124, + "step": 43210 + }, + { + "epoch": 0.04352660697936528, + "grad_norm": 14.429088496901855, + "learning_rate": 4.352577402654957e-05, + "loss": 2.667, + "mean_token_accuracy": 0.3793103456497192, + "step": 43215 + }, + { + "epoch": 0.043531643032469446, + "grad_norm": 15.758399557454972, + "learning_rate": 4.353080998328062e-05, + "loss": 2.6936, + "mean_token_accuracy": 0.4206896543502808, + "step": 43220 + }, + { + "epoch": 0.04353667908557362, + "grad_norm": 13.882410084424185, + "learning_rate": 4.353584594001169e-05, + "loss": 2.58, + "mean_token_accuracy": 0.37931033968925476, + "step": 43225 + }, + { + "epoch": 0.04354171513867779, + "grad_norm": 13.163756328293877, + "learning_rate": 4.3540881896742746e-05, + "loss": 2.946, + "mean_token_accuracy": 0.32758620083332063, + "step": 43230 + }, + { + "epoch": 0.04354675119178197, + "grad_norm": 14.964656195605372, + "learning_rate": 4.3545917853473805e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.3896551728248596, + "step": 43235 + }, + { + "epoch": 0.04355178724488614, + "grad_norm": 13.706497100872635, + "learning_rate": 4.3550953810204864e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.35862069129943847, + "step": 43240 + }, + { + "epoch": 0.043556823297990314, + "grad_norm": 11.65289548517909, + "learning_rate": 4.3555989766935924e-05, + "loss": 2.3751, + "mean_token_accuracy": 0.42413793206214906, + "step": 43245 + }, + { + "epoch": 0.04356185935109449, + "grad_norm": 11.271780160795974, + "learning_rate": 4.356102572366698e-05, + "loss": 2.7489, + "mean_token_accuracy": 0.4034482777118683, + "step": 43250 + }, + { + "epoch": 0.043566895404198655, + "grad_norm": 13.040516500444163, + "learning_rate": 4.356606168039805e-05, + "loss": 2.6822, + "mean_token_accuracy": 0.39092559218406675, + "step": 43255 + }, + { + "epoch": 0.04357193145730283, + "grad_norm": 14.111004047941716, + "learning_rate": 4.35710976371291e-05, + "loss": 2.8235, + "mean_token_accuracy": 0.36896551549434664, + "step": 43260 + }, + { + "epoch": 0.043576967510407, + "grad_norm": 14.84088171486115, + "learning_rate": 4.357613359386016e-05, + "loss": 2.8361, + "mean_token_accuracy": 0.33103448450565337, + "step": 43265 + }, + { + "epoch": 0.043582003563511176, + "grad_norm": 10.913966474422473, + "learning_rate": 4.358116955059122e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.41379311084747317, + "step": 43270 + }, + { + "epoch": 0.04358703961661535, + "grad_norm": 14.482373630583574, + "learning_rate": 4.358620550732228e-05, + "loss": 2.2416, + "mean_token_accuracy": 0.4344827651977539, + "step": 43275 + }, + { + "epoch": 0.043592075669719524, + "grad_norm": 15.040060118216331, + "learning_rate": 4.3591241464053345e-05, + "loss": 2.333, + "mean_token_accuracy": 0.4655172348022461, + "step": 43280 + }, + { + "epoch": 0.0435971117228237, + "grad_norm": 13.172571306772097, + "learning_rate": 4.3596277420784405e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.45517241954803467, + "step": 43285 + }, + { + "epoch": 0.043602147775927864, + "grad_norm": 12.216863193308022, + "learning_rate": 4.3601313377515464e-05, + "loss": 2.8283, + "mean_token_accuracy": 0.3896551728248596, + "step": 43290 + }, + { + "epoch": 0.04360718382903204, + "grad_norm": 12.939393252751017, + "learning_rate": 4.360634933424652e-05, + "loss": 2.6627, + "mean_token_accuracy": 0.42413793206214906, + "step": 43295 + }, + { + "epoch": 0.04361221988213621, + "grad_norm": 13.39822288707477, + "learning_rate": 4.361138529097758e-05, + "loss": 2.8026, + "mean_token_accuracy": 0.41724138259887694, + "step": 43300 + }, + { + "epoch": 0.043617255935240386, + "grad_norm": 10.955606182190163, + "learning_rate": 4.361642124770864e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.4344827592372894, + "step": 43305 + }, + { + "epoch": 0.04362229198834456, + "grad_norm": 15.601764807936371, + "learning_rate": 4.36214572044397e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.4344827592372894, + "step": 43310 + }, + { + "epoch": 0.04362732804144873, + "grad_norm": 13.842995389962418, + "learning_rate": 4.362649316117076e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.41034482717514037, + "step": 43315 + }, + { + "epoch": 0.04363236409455291, + "grad_norm": 12.125376662289419, + "learning_rate": 4.363152911790182e-05, + "loss": 2.6468, + "mean_token_accuracy": 0.4, + "step": 43320 + }, + { + "epoch": 0.043637400147657074, + "grad_norm": 13.351707789151023, + "learning_rate": 4.363656507463288e-05, + "loss": 2.8311, + "mean_token_accuracy": 0.3482758641242981, + "step": 43325 + }, + { + "epoch": 0.04364243620076125, + "grad_norm": 18.631419854504433, + "learning_rate": 4.364160103136394e-05, + "loss": 2.6929, + "mean_token_accuracy": 0.4206896543502808, + "step": 43330 + }, + { + "epoch": 0.04364747225386542, + "grad_norm": 16.702363948374455, + "learning_rate": 4.3646636988095004e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.4000000059604645, + "step": 43335 + }, + { + "epoch": 0.043652508306969595, + "grad_norm": 15.71787728715803, + "learning_rate": 4.365167294482606e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.3655172407627106, + "step": 43340 + }, + { + "epoch": 0.04365754436007377, + "grad_norm": 11.742339375091637, + "learning_rate": 4.365670890155712e-05, + "loss": 2.6505, + "mean_token_accuracy": 0.39140955805778505, + "step": 43345 + }, + { + "epoch": 0.04366258041317794, + "grad_norm": 13.476821274968112, + "learning_rate": 4.366174485828818e-05, + "loss": 3.0041, + "mean_token_accuracy": 0.31724137663841245, + "step": 43350 + }, + { + "epoch": 0.043667616466282116, + "grad_norm": 13.662896839534794, + "learning_rate": 4.3666780815019234e-05, + "loss": 2.8501, + "mean_token_accuracy": 0.38620689511299133, + "step": 43355 + }, + { + "epoch": 0.04367265251938628, + "grad_norm": 12.981283474223575, + "learning_rate": 4.36718167717503e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.4103448212146759, + "step": 43360 + }, + { + "epoch": 0.04367768857249046, + "grad_norm": 13.775124424511866, + "learning_rate": 4.367685272848136e-05, + "loss": 2.8109, + "mean_token_accuracy": 0.35862069129943847, + "step": 43365 + }, + { + "epoch": 0.04368272462559463, + "grad_norm": 15.648144139818218, + "learning_rate": 4.368188868521242e-05, + "loss": 2.873, + "mean_token_accuracy": 0.3551724135875702, + "step": 43370 + }, + { + "epoch": 0.043687760678698805, + "grad_norm": 13.240898273242951, + "learning_rate": 4.368692464194348e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.42068966031074523, + "step": 43375 + }, + { + "epoch": 0.04369279673180298, + "grad_norm": 12.370873212076045, + "learning_rate": 4.369196059867454e-05, + "loss": 2.917, + "mean_token_accuracy": 0.3655172407627106, + "step": 43380 + }, + { + "epoch": 0.04369783278490715, + "grad_norm": 14.79108232576774, + "learning_rate": 4.36969965554056e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.42068966031074523, + "step": 43385 + }, + { + "epoch": 0.043702868838011326, + "grad_norm": 11.096601229503467, + "learning_rate": 4.370203251213666e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.4601330876350403, + "step": 43390 + }, + { + "epoch": 0.04370790489111549, + "grad_norm": 14.3581420605594, + "learning_rate": 4.3707068468867715e-05, + "loss": 2.8766, + "mean_token_accuracy": 0.36551723480224607, + "step": 43395 + }, + { + "epoch": 0.043712940944219666, + "grad_norm": 19.607621485693915, + "learning_rate": 4.3712104425598774e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.4034482717514038, + "step": 43400 + }, + { + "epoch": 0.04371797699732384, + "grad_norm": 15.07783977626869, + "learning_rate": 4.371714038232983e-05, + "loss": 2.7806, + "mean_token_accuracy": 0.3896551787853241, + "step": 43405 + }, + { + "epoch": 0.043723013050428014, + "grad_norm": 14.881302314728204, + "learning_rate": 4.37221763390609e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.37241379022598264, + "step": 43410 + }, + { + "epoch": 0.04372804910353219, + "grad_norm": 12.65799892415228, + "learning_rate": 4.372721229579196e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.4, + "step": 43415 + }, + { + "epoch": 0.04373308515663636, + "grad_norm": 14.571732356950381, + "learning_rate": 4.373224825252302e-05, + "loss": 2.7242, + "mean_token_accuracy": 0.3551724135875702, + "step": 43420 + }, + { + "epoch": 0.043738121209740535, + "grad_norm": 12.2128842564891, + "learning_rate": 4.373728420925408e-05, + "loss": 2.8017, + "mean_token_accuracy": 0.39310344457626345, + "step": 43425 + }, + { + "epoch": 0.0437431572628447, + "grad_norm": 11.055695417531172, + "learning_rate": 4.3742320165985136e-05, + "loss": 2.5375, + "mean_token_accuracy": 0.39655172228813174, + "step": 43430 + }, + { + "epoch": 0.043748193315948876, + "grad_norm": 18.6310118213333, + "learning_rate": 4.3747356122716196e-05, + "loss": 2.9781, + "mean_token_accuracy": 0.39310344457626345, + "step": 43435 + }, + { + "epoch": 0.04375322936905305, + "grad_norm": 11.854790543483865, + "learning_rate": 4.3752392079447255e-05, + "loss": 2.6211, + "mean_token_accuracy": 0.4068965554237366, + "step": 43440 + }, + { + "epoch": 0.04375826542215722, + "grad_norm": 10.767799032639793, + "learning_rate": 4.3757428036178314e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.4379310369491577, + "step": 43445 + }, + { + "epoch": 0.0437633014752614, + "grad_norm": 15.006047101797147, + "learning_rate": 4.3762463992909373e-05, + "loss": 2.3543, + "mean_token_accuracy": 0.4137930989265442, + "step": 43450 + }, + { + "epoch": 0.04376833752836557, + "grad_norm": 14.393955725821202, + "learning_rate": 4.376749994964043e-05, + "loss": 2.6447, + "mean_token_accuracy": 0.38275861740112305, + "step": 43455 + }, + { + "epoch": 0.043773373581469745, + "grad_norm": 15.249000876082574, + "learning_rate": 4.377253590637149e-05, + "loss": 2.7696, + "mean_token_accuracy": 0.3965517163276672, + "step": 43460 + }, + { + "epoch": 0.04377840963457391, + "grad_norm": 15.171234870313565, + "learning_rate": 4.377757186310256e-05, + "loss": 2.5549, + "mean_token_accuracy": 0.38118572235107423, + "step": 43465 + }, + { + "epoch": 0.043783445687678085, + "grad_norm": 12.43069198694147, + "learning_rate": 4.378260781983362e-05, + "loss": 2.7453, + "mean_token_accuracy": 0.38620689809322356, + "step": 43470 + }, + { + "epoch": 0.04378848174078226, + "grad_norm": 11.891889770491794, + "learning_rate": 4.3787643776564676e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.49655171632766726, + "step": 43475 + }, + { + "epoch": 0.04379351779388643, + "grad_norm": 12.501430879955116, + "learning_rate": 4.3792679733295736e-05, + "loss": 3.0204, + "mean_token_accuracy": 0.3620689630508423, + "step": 43480 + }, + { + "epoch": 0.04379855384699061, + "grad_norm": 17.836937478361897, + "learning_rate": 4.379771569002679e-05, + "loss": 2.5659, + "mean_token_accuracy": 0.4103448331356049, + "step": 43485 + }, + { + "epoch": 0.04380358990009478, + "grad_norm": 12.23916541204761, + "learning_rate": 4.3802751646757854e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.4121597111225128, + "step": 43490 + }, + { + "epoch": 0.043808625953198954, + "grad_norm": 12.62429262145911, + "learning_rate": 4.3807787603488913e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.46382336020469667, + "step": 43495 + }, + { + "epoch": 0.04381366200630312, + "grad_norm": 12.086492287296393, + "learning_rate": 4.381282356021997e-05, + "loss": 2.4053, + "mean_token_accuracy": 0.4206896543502808, + "step": 43500 + }, + { + "epoch": 0.043818698059407295, + "grad_norm": 21.91128880272995, + "learning_rate": 4.381785951695103e-05, + "loss": 2.9402, + "mean_token_accuracy": 0.33793103098869326, + "step": 43505 + }, + { + "epoch": 0.04382373411251147, + "grad_norm": 12.286437009550408, + "learning_rate": 4.382289547368209e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.4413793087005615, + "step": 43510 + }, + { + "epoch": 0.04382877016561564, + "grad_norm": 14.393027724450064, + "learning_rate": 4.382793143041315e-05, + "loss": 2.7175, + "mean_token_accuracy": 0.36551723480224607, + "step": 43515 + }, + { + "epoch": 0.043833806218719816, + "grad_norm": 16.187501449458118, + "learning_rate": 4.3832967387144217e-05, + "loss": 2.3068, + "mean_token_accuracy": 0.4310344815254211, + "step": 43520 + }, + { + "epoch": 0.04383884227182399, + "grad_norm": 13.768130229337245, + "learning_rate": 4.3838003343875276e-05, + "loss": 3.0751, + "mean_token_accuracy": 0.3551724076271057, + "step": 43525 + }, + { + "epoch": 0.043843878324928164, + "grad_norm": 12.121721781124167, + "learning_rate": 4.384303930060633e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.38275861740112305, + "step": 43530 + }, + { + "epoch": 0.04384891437803233, + "grad_norm": 23.668780022074916, + "learning_rate": 4.384807525733739e-05, + "loss": 2.7998, + "mean_token_accuracy": 0.3983061134815216, + "step": 43535 + }, + { + "epoch": 0.043853950431136504, + "grad_norm": 14.88753247939582, + "learning_rate": 4.385311121406845e-05, + "loss": 2.7228, + "mean_token_accuracy": 0.38275861740112305, + "step": 43540 + }, + { + "epoch": 0.04385898648424068, + "grad_norm": 18.19546441923281, + "learning_rate": 4.385814717079951e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.3983061194419861, + "step": 43545 + }, + { + "epoch": 0.04386402253734485, + "grad_norm": 11.873757750081026, + "learning_rate": 4.386318312753057e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.37586207389831544, + "step": 43550 + }, + { + "epoch": 0.043869058590449025, + "grad_norm": 15.76499525569811, + "learning_rate": 4.386821908426163e-05, + "loss": 2.609, + "mean_token_accuracy": 0.4621294617652893, + "step": 43555 + }, + { + "epoch": 0.0438740946435532, + "grad_norm": 11.240455594661785, + "learning_rate": 4.387325504099269e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.37586206793785093, + "step": 43560 + }, + { + "epoch": 0.04387913069665737, + "grad_norm": 12.671307529285949, + "learning_rate": 4.387829099772375e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.4295825779438019, + "step": 43565 + }, + { + "epoch": 0.04388416674976154, + "grad_norm": 16.006486189997982, + "learning_rate": 4.388332695445481e-05, + "loss": 2.7819, + "mean_token_accuracy": 0.3724137842655182, + "step": 43570 + }, + { + "epoch": 0.043889202802865714, + "grad_norm": 12.975606203537602, + "learning_rate": 4.388836291118587e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.44137930274009707, + "step": 43575 + }, + { + "epoch": 0.04389423885596989, + "grad_norm": 12.676119609449293, + "learning_rate": 4.389339886791693e-05, + "loss": 2.671, + "mean_token_accuracy": 0.42758620977401735, + "step": 43580 + }, + { + "epoch": 0.04389927490907406, + "grad_norm": 12.11500954564936, + "learning_rate": 4.389843482464799e-05, + "loss": 2.3207, + "mean_token_accuracy": 0.37586206793785093, + "step": 43585 + }, + { + "epoch": 0.043904310962178235, + "grad_norm": 15.230201051089736, + "learning_rate": 4.3903470781379046e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.38106473088264464, + "step": 43590 + }, + { + "epoch": 0.04390934701528241, + "grad_norm": 14.656423872220582, + "learning_rate": 4.3908506738110105e-05, + "loss": 2.7699, + "mean_token_accuracy": 0.39999998807907106, + "step": 43595 + }, + { + "epoch": 0.04391438306838658, + "grad_norm": 15.95194955176313, + "learning_rate": 4.391354269484117e-05, + "loss": 2.9379, + "mean_token_accuracy": 0.3689655065536499, + "step": 43600 + }, + { + "epoch": 0.04391941912149075, + "grad_norm": 15.390449689590145, + "learning_rate": 4.391857865157223e-05, + "loss": 2.874, + "mean_token_accuracy": 0.3793103456497192, + "step": 43605 + }, + { + "epoch": 0.04392445517459492, + "grad_norm": 14.94454643075687, + "learning_rate": 4.392361460830329e-05, + "loss": 2.4643, + "mean_token_accuracy": 0.3896551728248596, + "step": 43610 + }, + { + "epoch": 0.0439294912276991, + "grad_norm": 13.992088897946488, + "learning_rate": 4.392865056503435e-05, + "loss": 3.043, + "mean_token_accuracy": 0.32413792610168457, + "step": 43615 + }, + { + "epoch": 0.04393452728080327, + "grad_norm": 13.114610155239365, + "learning_rate": 4.39336865217654e-05, + "loss": 2.7146, + "mean_token_accuracy": 0.35517241060733795, + "step": 43620 + }, + { + "epoch": 0.043939563333907444, + "grad_norm": 10.539665188350874, + "learning_rate": 4.393872247849647e-05, + "loss": 2.5809, + "mean_token_accuracy": 0.42758620977401735, + "step": 43625 + }, + { + "epoch": 0.04394459938701162, + "grad_norm": 15.518204153169714, + "learning_rate": 4.394375843522753e-05, + "loss": 2.5938, + "mean_token_accuracy": 0.4034482777118683, + "step": 43630 + }, + { + "epoch": 0.04394963544011579, + "grad_norm": 16.01199382653238, + "learning_rate": 4.3948794391958586e-05, + "loss": 2.7238, + "mean_token_accuracy": 0.4137930989265442, + "step": 43635 + }, + { + "epoch": 0.04395467149321996, + "grad_norm": 15.086098118816434, + "learning_rate": 4.3953830348689645e-05, + "loss": 2.7005, + "mean_token_accuracy": 0.39310344457626345, + "step": 43640 + }, + { + "epoch": 0.04395970754632413, + "grad_norm": 14.420266891746307, + "learning_rate": 4.3958866305420705e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.4379310369491577, + "step": 43645 + }, + { + "epoch": 0.043964743599428306, + "grad_norm": 13.540792469571088, + "learning_rate": 4.396390226215177e-05, + "loss": 3.1272, + "mean_token_accuracy": 0.3551724135875702, + "step": 43650 + }, + { + "epoch": 0.04396977965253248, + "grad_norm": 16.99566382146657, + "learning_rate": 4.396893821888283e-05, + "loss": 2.4422, + "mean_token_accuracy": 0.43448275327682495, + "step": 43655 + }, + { + "epoch": 0.043974815705636654, + "grad_norm": 17.44836585646702, + "learning_rate": 4.397397417561388e-05, + "loss": 2.4529, + "mean_token_accuracy": 0.4344827592372894, + "step": 43660 + }, + { + "epoch": 0.04397985175874083, + "grad_norm": 10.885810091377175, + "learning_rate": 4.397901013234494e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.3910465776920319, + "step": 43665 + }, + { + "epoch": 0.043984887811845, + "grad_norm": 12.550285853406592, + "learning_rate": 4.3984046089076e-05, + "loss": 2.6635, + "mean_token_accuracy": 0.3517241388559341, + "step": 43670 + }, + { + "epoch": 0.04398992386494917, + "grad_norm": 9.49613247794277, + "learning_rate": 4.398908204580706e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.3724137872457504, + "step": 43675 + }, + { + "epoch": 0.04399495991805334, + "grad_norm": 13.125437573356784, + "learning_rate": 4.3994118002538126e-05, + "loss": 2.7576, + "mean_token_accuracy": 0.35862069129943847, + "step": 43680 + }, + { + "epoch": 0.043999995971157516, + "grad_norm": 12.839872089977552, + "learning_rate": 4.3999153959269185e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.37931033968925476, + "step": 43685 + }, + { + "epoch": 0.04400503202426169, + "grad_norm": 14.377575481536416, + "learning_rate": 4.4004189916000245e-05, + "loss": 2.6968, + "mean_token_accuracy": 0.39655172228813174, + "step": 43690 + }, + { + "epoch": 0.04401006807736586, + "grad_norm": 10.88990637775244, + "learning_rate": 4.4009225872731304e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.38275861740112305, + "step": 43695 + }, + { + "epoch": 0.04401510413047004, + "grad_norm": 13.919210469504824, + "learning_rate": 4.401426182946236e-05, + "loss": 2.7538, + "mean_token_accuracy": 0.3724137991666794, + "step": 43700 + }, + { + "epoch": 0.04402014018357421, + "grad_norm": 11.150584547296491, + "learning_rate": 4.401929778619342e-05, + "loss": 2.5478, + "mean_token_accuracy": 0.37241379022598264, + "step": 43705 + }, + { + "epoch": 0.04402517623667838, + "grad_norm": 12.913715939089982, + "learning_rate": 4.402433374292448e-05, + "loss": 2.5826, + "mean_token_accuracy": 0.37586206793785093, + "step": 43710 + }, + { + "epoch": 0.04403021228978255, + "grad_norm": 11.547651674715615, + "learning_rate": 4.402936969965554e-05, + "loss": 2.1618, + "mean_token_accuracy": 0.43793103098869324, + "step": 43715 + }, + { + "epoch": 0.044035248342886725, + "grad_norm": 13.768130087692429, + "learning_rate": 4.40344056563866e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.3848154842853546, + "step": 43720 + }, + { + "epoch": 0.0440402843959909, + "grad_norm": 13.752865387757156, + "learning_rate": 4.403944161311766e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.42758620977401735, + "step": 43725 + }, + { + "epoch": 0.04404532044909507, + "grad_norm": 16.475875742150453, + "learning_rate": 4.4044477569848725e-05, + "loss": 3.1165, + "mean_token_accuracy": 0.31379309892654417, + "step": 43730 + }, + { + "epoch": 0.044050356502199246, + "grad_norm": 14.55227356458699, + "learning_rate": 4.4049513526579785e-05, + "loss": 3.0658, + "mean_token_accuracy": 0.4026618242263794, + "step": 43735 + }, + { + "epoch": 0.04405539255530342, + "grad_norm": 20.005117202031062, + "learning_rate": 4.4054549483310844e-05, + "loss": 2.6527, + "mean_token_accuracy": 0.3517241358757019, + "step": 43740 + }, + { + "epoch": 0.04406042860840759, + "grad_norm": 14.050212591558072, + "learning_rate": 4.40595854400419e-05, + "loss": 3.0267, + "mean_token_accuracy": 0.3655172407627106, + "step": 43745 + }, + { + "epoch": 0.04406546466151176, + "grad_norm": 13.67818999601452, + "learning_rate": 4.406462139677296e-05, + "loss": 2.8563, + "mean_token_accuracy": 0.36896551847457887, + "step": 43750 + }, + { + "epoch": 0.044070500714615934, + "grad_norm": 11.334334092583576, + "learning_rate": 4.406965735350402e-05, + "loss": 2.4591, + "mean_token_accuracy": 0.4257108271121979, + "step": 43755 + }, + { + "epoch": 0.04407553676772011, + "grad_norm": 13.272564301251222, + "learning_rate": 4.407469331023508e-05, + "loss": 2.7904, + "mean_token_accuracy": 0.4068965554237366, + "step": 43760 + }, + { + "epoch": 0.04408057282082428, + "grad_norm": 15.710494821795324, + "learning_rate": 4.407972926696614e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.4103448212146759, + "step": 43765 + }, + { + "epoch": 0.044085608873928456, + "grad_norm": 11.905366655545963, + "learning_rate": 4.40847652236972e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.4586206912994385, + "step": 43770 + }, + { + "epoch": 0.04409064492703263, + "grad_norm": 15.595386346370512, + "learning_rate": 4.408980118042826e-05, + "loss": 2.7083, + "mean_token_accuracy": 0.4034482777118683, + "step": 43775 + }, + { + "epoch": 0.044095680980136796, + "grad_norm": 16.178445419278653, + "learning_rate": 4.409483713715932e-05, + "loss": 2.592, + "mean_token_accuracy": 0.42758620381355283, + "step": 43780 + }, + { + "epoch": 0.04410071703324097, + "grad_norm": 13.205894255430962, + "learning_rate": 4.4099873093890384e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.4068965494632721, + "step": 43785 + }, + { + "epoch": 0.044105753086345144, + "grad_norm": 11.394460263659385, + "learning_rate": 4.410490905062144e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.4068965494632721, + "step": 43790 + }, + { + "epoch": 0.04411078913944932, + "grad_norm": 12.675084919974456, + "learning_rate": 4.4109945007352496e-05, + "loss": 2.6957, + "mean_token_accuracy": 0.38275861740112305, + "step": 43795 + }, + { + "epoch": 0.04411582519255349, + "grad_norm": 17.684637539547733, + "learning_rate": 4.4114980964083555e-05, + "loss": 2.8493, + "mean_token_accuracy": 0.42068966031074523, + "step": 43800 + }, + { + "epoch": 0.044120861245657665, + "grad_norm": 11.456533280840457, + "learning_rate": 4.4120016920814614e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.37931033968925476, + "step": 43805 + }, + { + "epoch": 0.04412589729876184, + "grad_norm": 15.016299058846634, + "learning_rate": 4.412505287754568e-05, + "loss": 2.5862, + "mean_token_accuracy": 0.3517241388559341, + "step": 43810 + }, + { + "epoch": 0.044130933351866006, + "grad_norm": 14.383159694238401, + "learning_rate": 4.413008883427674e-05, + "loss": 2.4404, + "mean_token_accuracy": 0.4034482777118683, + "step": 43815 + }, + { + "epoch": 0.04413596940497018, + "grad_norm": 12.118681224847318, + "learning_rate": 4.41351247910078e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.3896551728248596, + "step": 43820 + }, + { + "epoch": 0.04414100545807435, + "grad_norm": 17.133862619974614, + "learning_rate": 4.414016074773886e-05, + "loss": 2.7807, + "mean_token_accuracy": 0.3620689630508423, + "step": 43825 + }, + { + "epoch": 0.04414604151117853, + "grad_norm": 20.056912466891188, + "learning_rate": 4.414519670446992e-05, + "loss": 3.4276, + "mean_token_accuracy": 0.28620689511299136, + "step": 43830 + }, + { + "epoch": 0.0441510775642827, + "grad_norm": 18.102861563149474, + "learning_rate": 4.4150232661200977e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.37586205899715425, + "step": 43835 + }, + { + "epoch": 0.044156113617386875, + "grad_norm": 14.292039411223833, + "learning_rate": 4.4155268617932036e-05, + "loss": 3.1264, + "mean_token_accuracy": 0.3379310339689255, + "step": 43840 + }, + { + "epoch": 0.04416114967049105, + "grad_norm": 10.579454149875932, + "learning_rate": 4.4160304574663095e-05, + "loss": 2.1754, + "mean_token_accuracy": 0.44313369393348695, + "step": 43845 + }, + { + "epoch": 0.044166185723595215, + "grad_norm": 15.396866945426872, + "learning_rate": 4.4165340531394154e-05, + "loss": 2.7405, + "mean_token_accuracy": 0.4, + "step": 43850 + }, + { + "epoch": 0.04417122177669939, + "grad_norm": 15.528453114757294, + "learning_rate": 4.4170376488125214e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.39310343861579894, + "step": 43855 + }, + { + "epoch": 0.04417625782980356, + "grad_norm": 14.74537839469161, + "learning_rate": 4.417541244485627e-05, + "loss": 2.8865, + "mean_token_accuracy": 0.4206896543502808, + "step": 43860 + }, + { + "epoch": 0.044181293882907736, + "grad_norm": 13.775705182332455, + "learning_rate": 4.418044840158734e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.37931033968925476, + "step": 43865 + }, + { + "epoch": 0.04418632993601191, + "grad_norm": 11.376219245554795, + "learning_rate": 4.41854843583184e-05, + "loss": 2.3761, + "mean_token_accuracy": 0.4, + "step": 43870 + }, + { + "epoch": 0.044191365989116084, + "grad_norm": 20.86183329100325, + "learning_rate": 4.419052031504946e-05, + "loss": 2.6758, + "mean_token_accuracy": 0.3482758641242981, + "step": 43875 + }, + { + "epoch": 0.04419640204222026, + "grad_norm": 11.68228321000608, + "learning_rate": 4.4195556271780517e-05, + "loss": 2.5105, + "mean_token_accuracy": 0.40689654648303986, + "step": 43880 + }, + { + "epoch": 0.044201438095324425, + "grad_norm": 13.42922684174359, + "learning_rate": 4.4200592228511576e-05, + "loss": 2.1671, + "mean_token_accuracy": 0.42413793206214906, + "step": 43885 + }, + { + "epoch": 0.0442064741484286, + "grad_norm": 16.972728813627693, + "learning_rate": 4.4205628185242635e-05, + "loss": 3.1125, + "mean_token_accuracy": 0.3620689630508423, + "step": 43890 + }, + { + "epoch": 0.04421151020153277, + "grad_norm": 14.81411444748737, + "learning_rate": 4.4210664141973694e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.41034482717514037, + "step": 43895 + }, + { + "epoch": 0.044216546254636946, + "grad_norm": 15.204932694191843, + "learning_rate": 4.4215700098704754e-05, + "loss": 2.8579, + "mean_token_accuracy": 0.3846340000629425, + "step": 43900 + }, + { + "epoch": 0.04422158230774112, + "grad_norm": 17.49491941604814, + "learning_rate": 4.422073605543581e-05, + "loss": 2.8646, + "mean_token_accuracy": 0.4034482717514038, + "step": 43905 + }, + { + "epoch": 0.04422661836084529, + "grad_norm": 10.743589355484843, + "learning_rate": 4.422577201216687e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.42413793206214906, + "step": 43910 + }, + { + "epoch": 0.04423165441394947, + "grad_norm": 14.414382728486386, + "learning_rate": 4.423080796889794e-05, + "loss": 2.5724, + "mean_token_accuracy": 0.41034482717514037, + "step": 43915 + }, + { + "epoch": 0.044236690467053634, + "grad_norm": 13.165846234432177, + "learning_rate": 4.4235843925629e-05, + "loss": 2.6172, + "mean_token_accuracy": 0.4310344815254211, + "step": 43920 + }, + { + "epoch": 0.04424172652015781, + "grad_norm": 14.486240487120567, + "learning_rate": 4.424087988236006e-05, + "loss": 2.9629, + "mean_token_accuracy": 0.4, + "step": 43925 + }, + { + "epoch": 0.04424676257326198, + "grad_norm": 13.660265930657884, + "learning_rate": 4.424591583909111e-05, + "loss": 2.632, + "mean_token_accuracy": 0.3896551728248596, + "step": 43930 + }, + { + "epoch": 0.044251798626366155, + "grad_norm": 16.07635783765737, + "learning_rate": 4.425095179582217e-05, + "loss": 2.8131, + "mean_token_accuracy": 0.38965516686439516, + "step": 43935 + }, + { + "epoch": 0.04425683467947033, + "grad_norm": 14.385083765692428, + "learning_rate": 4.425598775255323e-05, + "loss": 2.3577, + "mean_token_accuracy": 0.42068966031074523, + "step": 43940 + }, + { + "epoch": 0.0442618707325745, + "grad_norm": 12.208947757770387, + "learning_rate": 4.4261023709284294e-05, + "loss": 2.6754, + "mean_token_accuracy": 0.3448275804519653, + "step": 43945 + }, + { + "epoch": 0.04426690678567868, + "grad_norm": 10.20887015461642, + "learning_rate": 4.426605966601535e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4091349124908447, + "step": 43950 + }, + { + "epoch": 0.04427194283878284, + "grad_norm": 25.587248342384086, + "learning_rate": 4.427109562274641e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.40344828367233276, + "step": 43955 + }, + { + "epoch": 0.04427697889188702, + "grad_norm": 35.544632456124724, + "learning_rate": 4.427613157947747e-05, + "loss": 2.7106, + "mean_token_accuracy": 0.36733212471008303, + "step": 43960 + }, + { + "epoch": 0.04428201494499119, + "grad_norm": 13.672025715210397, + "learning_rate": 4.428116753620853e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.3827586203813553, + "step": 43965 + }, + { + "epoch": 0.044287050998095365, + "grad_norm": 16.313576476635678, + "learning_rate": 4.428620349293959e-05, + "loss": 2.8721, + "mean_token_accuracy": 0.35359951853752136, + "step": 43970 + }, + { + "epoch": 0.04429208705119954, + "grad_norm": 12.951026491725921, + "learning_rate": 4.429123944967065e-05, + "loss": 2.2072, + "mean_token_accuracy": 0.46436781883239747, + "step": 43975 + }, + { + "epoch": 0.04429712310430371, + "grad_norm": 12.55905992552368, + "learning_rate": 4.429627540640171e-05, + "loss": 2.6696, + "mean_token_accuracy": 0.4068965494632721, + "step": 43980 + }, + { + "epoch": 0.044302159157407886, + "grad_norm": 13.247270304542925, + "learning_rate": 4.430131136313277e-05, + "loss": 3.2904, + "mean_token_accuracy": 0.3517241418361664, + "step": 43985 + }, + { + "epoch": 0.04430719521051205, + "grad_norm": 20.180152137718867, + "learning_rate": 4.430634731986383e-05, + "loss": 2.9211, + "mean_token_accuracy": 0.38777979612350466, + "step": 43990 + }, + { + "epoch": 0.04431223126361623, + "grad_norm": 11.171229089528058, + "learning_rate": 4.431138327659489e-05, + "loss": 2.4675, + "mean_token_accuracy": 0.39310343861579894, + "step": 43995 + }, + { + "epoch": 0.0443172673167204, + "grad_norm": 13.244745715143324, + "learning_rate": 4.431641923332595e-05, + "loss": 2.1049, + "mean_token_accuracy": 0.46763460636138915, + "step": 44000 + }, + { + "epoch": 0.044322303369824574, + "grad_norm": 11.586161596776906, + "learning_rate": 4.432145519005701e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.4347290605306625, + "step": 44005 + }, + { + "epoch": 0.04432733942292875, + "grad_norm": 12.802390562995457, + "learning_rate": 4.432649114678807e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.37931033968925476, + "step": 44010 + }, + { + "epoch": 0.04433237547603292, + "grad_norm": 16.577183767493647, + "learning_rate": 4.433152710351913e-05, + "loss": 2.7776, + "mean_token_accuracy": 0.42068964838981626, + "step": 44015 + }, + { + "epoch": 0.044337411529137095, + "grad_norm": 14.65299066960234, + "learning_rate": 4.433656306025018e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.36896551847457887, + "step": 44020 + }, + { + "epoch": 0.04434244758224126, + "grad_norm": 17.574626153518555, + "learning_rate": 4.434159901698125e-05, + "loss": 2.7832, + "mean_token_accuracy": 0.3758620619773865, + "step": 44025 + }, + { + "epoch": 0.044347483635345436, + "grad_norm": 15.266270271923032, + "learning_rate": 4.434663497371231e-05, + "loss": 2.6148, + "mean_token_accuracy": 0.4000000059604645, + "step": 44030 + }, + { + "epoch": 0.04435251968844961, + "grad_norm": 17.52078813585745, + "learning_rate": 4.435167093044337e-05, + "loss": 3.1746, + "mean_token_accuracy": 0.38620689511299133, + "step": 44035 + }, + { + "epoch": 0.044357555741553784, + "grad_norm": 13.204293847967051, + "learning_rate": 4.4356706887174426e-05, + "loss": 2.7203, + "mean_token_accuracy": 0.3827586233615875, + "step": 44040 + }, + { + "epoch": 0.04436259179465796, + "grad_norm": 15.78829097756, + "learning_rate": 4.4361742843905485e-05, + "loss": 2.7295, + "mean_token_accuracy": 0.40508166551589964, + "step": 44045 + }, + { + "epoch": 0.04436762784776213, + "grad_norm": 11.916087929460101, + "learning_rate": 4.436677880063655e-05, + "loss": 2.304, + "mean_token_accuracy": 0.44827585816383364, + "step": 44050 + }, + { + "epoch": 0.044372663900866305, + "grad_norm": 13.993047007449642, + "learning_rate": 4.437181475736761e-05, + "loss": 2.658, + "mean_token_accuracy": 0.44137930274009707, + "step": 44055 + }, + { + "epoch": 0.04437769995397047, + "grad_norm": 12.34778371608988, + "learning_rate": 4.437685071409867e-05, + "loss": 2.8391, + "mean_token_accuracy": 0.3551724135875702, + "step": 44060 + }, + { + "epoch": 0.044382736007074645, + "grad_norm": 13.896543441911039, + "learning_rate": 4.438188667082972e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.4, + "step": 44065 + }, + { + "epoch": 0.04438777206017882, + "grad_norm": 11.07907737241743, + "learning_rate": 4.438692262756078e-05, + "loss": 2.2097, + "mean_token_accuracy": 0.476043564081192, + "step": 44070 + }, + { + "epoch": 0.04439280811328299, + "grad_norm": 12.03353161885049, + "learning_rate": 4.439195858429185e-05, + "loss": 2.9754, + "mean_token_accuracy": 0.3620689660310745, + "step": 44075 + }, + { + "epoch": 0.04439784416638717, + "grad_norm": 11.057742877871155, + "learning_rate": 4.439699454102291e-05, + "loss": 2.2957, + "mean_token_accuracy": 0.4172413766384125, + "step": 44080 + }, + { + "epoch": 0.04440288021949134, + "grad_norm": 13.268963383080676, + "learning_rate": 4.4402030497753966e-05, + "loss": 2.4289, + "mean_token_accuracy": 0.37931033968925476, + "step": 44085 + }, + { + "epoch": 0.044407916272595514, + "grad_norm": 16.029497578611387, + "learning_rate": 4.4407066454485026e-05, + "loss": 2.9301, + "mean_token_accuracy": 0.3275862067937851, + "step": 44090 + }, + { + "epoch": 0.04441295232569968, + "grad_norm": 13.869820287353509, + "learning_rate": 4.4412102411216085e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.3793103456497192, + "step": 44095 + }, + { + "epoch": 0.044417988378803855, + "grad_norm": 13.255381435259588, + "learning_rate": 4.4417138367947144e-05, + "loss": 2.4582, + "mean_token_accuracy": 0.4241379380226135, + "step": 44100 + }, + { + "epoch": 0.04442302443190803, + "grad_norm": 16.74543282962715, + "learning_rate": 4.44221743246782e-05, + "loss": 2.8708, + "mean_token_accuracy": 0.33103448152542114, + "step": 44105 + }, + { + "epoch": 0.0444280604850122, + "grad_norm": 12.772562394945036, + "learning_rate": 4.442721028140926e-05, + "loss": 2.3225, + "mean_token_accuracy": 0.4689655125141144, + "step": 44110 + }, + { + "epoch": 0.044433096538116376, + "grad_norm": 11.65763473842032, + "learning_rate": 4.443224623814032e-05, + "loss": 2.2633, + "mean_token_accuracy": 0.46061705946922304, + "step": 44115 + }, + { + "epoch": 0.04443813259122055, + "grad_norm": 19.894449697057485, + "learning_rate": 4.443728219487138e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.39310345351696013, + "step": 44120 + }, + { + "epoch": 0.044443168644324724, + "grad_norm": 14.066864522239403, + "learning_rate": 4.444231815160244e-05, + "loss": 2.513, + "mean_token_accuracy": 0.4068965554237366, + "step": 44125 + }, + { + "epoch": 0.04444820469742889, + "grad_norm": 13.095708790248539, + "learning_rate": 4.4447354108333506e-05, + "loss": 2.6114, + "mean_token_accuracy": 0.3965517282485962, + "step": 44130 + }, + { + "epoch": 0.044453240750533064, + "grad_norm": 15.791717803668181, + "learning_rate": 4.4452390065064566e-05, + "loss": 3.0297, + "mean_token_accuracy": 0.3862069010734558, + "step": 44135 + }, + { + "epoch": 0.04445827680363724, + "grad_norm": 16.574606331727026, + "learning_rate": 4.4457426021795625e-05, + "loss": 2.9818, + "mean_token_accuracy": 0.3482758641242981, + "step": 44140 + }, + { + "epoch": 0.04446331285674141, + "grad_norm": 13.482615368760458, + "learning_rate": 4.4462461978526684e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4172413766384125, + "step": 44145 + }, + { + "epoch": 0.044468348909845586, + "grad_norm": 15.556970806392002, + "learning_rate": 4.446749793525774e-05, + "loss": 2.4947, + "mean_token_accuracy": 0.33103448152542114, + "step": 44150 + }, + { + "epoch": 0.04447338496294976, + "grad_norm": 11.859608417894577, + "learning_rate": 4.44725338919888e-05, + "loss": 2.9373, + "mean_token_accuracy": 0.33448275923728943, + "step": 44155 + }, + { + "epoch": 0.04447842101605393, + "grad_norm": 13.71762418087473, + "learning_rate": 4.447756984871986e-05, + "loss": 3.223, + "mean_token_accuracy": 0.35862069129943847, + "step": 44160 + }, + { + "epoch": 0.0444834570691581, + "grad_norm": 12.389245952732699, + "learning_rate": 4.448260580545092e-05, + "loss": 2.8521, + "mean_token_accuracy": 0.37586206793785093, + "step": 44165 + }, + { + "epoch": 0.044488493122262274, + "grad_norm": 12.575490624298796, + "learning_rate": 4.448764176218198e-05, + "loss": 2.8181, + "mean_token_accuracy": 0.38275861740112305, + "step": 44170 + }, + { + "epoch": 0.04449352917536645, + "grad_norm": 11.180172218204662, + "learning_rate": 4.449267771891304e-05, + "loss": 2.2103, + "mean_token_accuracy": 0.49999999403953554, + "step": 44175 + }, + { + "epoch": 0.04449856522847062, + "grad_norm": 12.24252896763768, + "learning_rate": 4.4497713675644106e-05, + "loss": 2.9076, + "mean_token_accuracy": 0.35862068831920624, + "step": 44180 + }, + { + "epoch": 0.044503601281574795, + "grad_norm": 11.678041234348868, + "learning_rate": 4.4502749632375165e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.36896551251411436, + "step": 44185 + }, + { + "epoch": 0.04450863733467897, + "grad_norm": 13.456046012696975, + "learning_rate": 4.4507785589106224e-05, + "loss": 2.6489, + "mean_token_accuracy": 0.38275861740112305, + "step": 44190 + }, + { + "epoch": 0.04451367338778314, + "grad_norm": 10.8990411141468, + "learning_rate": 4.451282154583728e-05, + "loss": 2.0318, + "mean_token_accuracy": 0.47586206793785096, + "step": 44195 + }, + { + "epoch": 0.04451870944088731, + "grad_norm": 16.139466028786916, + "learning_rate": 4.4517857502568336e-05, + "loss": 2.8554, + "mean_token_accuracy": 0.358620685338974, + "step": 44200 + }, + { + "epoch": 0.04452374549399148, + "grad_norm": 11.493187539473524, + "learning_rate": 4.4522893459299395e-05, + "loss": 2.6881, + "mean_token_accuracy": 0.42758620381355283, + "step": 44205 + }, + { + "epoch": 0.04452878154709566, + "grad_norm": 13.015327060511801, + "learning_rate": 4.452792941603046e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.3807622492313385, + "step": 44210 + }, + { + "epoch": 0.04453381760019983, + "grad_norm": 12.039084357471584, + "learning_rate": 4.453296537276152e-05, + "loss": 2.1018, + "mean_token_accuracy": 0.46551724076271056, + "step": 44215 + }, + { + "epoch": 0.044538853653304004, + "grad_norm": 12.790894339968776, + "learning_rate": 4.453800132949258e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.40689654350280763, + "step": 44220 + }, + { + "epoch": 0.04454388970640818, + "grad_norm": 13.900376808674402, + "learning_rate": 4.454303728622364e-05, + "loss": 2.0906, + "mean_token_accuracy": 0.4535390198230743, + "step": 44225 + }, + { + "epoch": 0.04454892575951235, + "grad_norm": 15.818694148213773, + "learning_rate": 4.45480732429547e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.4089534133672714, + "step": 44230 + }, + { + "epoch": 0.04455396181261652, + "grad_norm": 14.415968311929939, + "learning_rate": 4.455310919968576e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.3848759740591049, + "step": 44235 + }, + { + "epoch": 0.04455899786572069, + "grad_norm": 16.9729841896028, + "learning_rate": 4.455814515641682e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.3862068891525269, + "step": 44240 + }, + { + "epoch": 0.044564033918824866, + "grad_norm": 13.701515115043184, + "learning_rate": 4.4563181113147876e-05, + "loss": 2.228, + "mean_token_accuracy": 0.4413793206214905, + "step": 44245 + }, + { + "epoch": 0.04456906997192904, + "grad_norm": 12.435988768084208, + "learning_rate": 4.4568217069878935e-05, + "loss": 2.4747, + "mean_token_accuracy": 0.41034482717514037, + "step": 44250 + }, + { + "epoch": 0.044574106025033214, + "grad_norm": 22.5872541908809, + "learning_rate": 4.4573253026609994e-05, + "loss": 2.8522, + "mean_token_accuracy": 0.40344826579093934, + "step": 44255 + }, + { + "epoch": 0.04457914207813739, + "grad_norm": 14.338152582830393, + "learning_rate": 4.457828898334106e-05, + "loss": 2.508, + "mean_token_accuracy": 0.4293406009674072, + "step": 44260 + }, + { + "epoch": 0.04458417813124156, + "grad_norm": 15.278189641122646, + "learning_rate": 4.458332494007212e-05, + "loss": 3.1292, + "mean_token_accuracy": 0.3551724165678024, + "step": 44265 + }, + { + "epoch": 0.04458921418434573, + "grad_norm": 12.401834703512241, + "learning_rate": 4.458836089680318e-05, + "loss": 2.4663, + "mean_token_accuracy": 0.3910465896129608, + "step": 44270 + }, + { + "epoch": 0.0445942502374499, + "grad_norm": 12.507967426027502, + "learning_rate": 4.459339685353424e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.38275861740112305, + "step": 44275 + }, + { + "epoch": 0.044599286290554076, + "grad_norm": 14.02532766013267, + "learning_rate": 4.45984328102653e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.37241379022598264, + "step": 44280 + }, + { + "epoch": 0.04460432234365825, + "grad_norm": 18.285235598600288, + "learning_rate": 4.460346876699636e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.47241380214691164, + "step": 44285 + }, + { + "epoch": 0.04460935839676242, + "grad_norm": 15.31588776597003, + "learning_rate": 4.4608504723727416e-05, + "loss": 2.7084, + "mean_token_accuracy": 0.38275861740112305, + "step": 44290 + }, + { + "epoch": 0.0446143944498666, + "grad_norm": 14.054555029166396, + "learning_rate": 4.4613540680458475e-05, + "loss": 2.848, + "mean_token_accuracy": 0.37241379618644715, + "step": 44295 + }, + { + "epoch": 0.04461943050297077, + "grad_norm": 12.215086349634712, + "learning_rate": 4.4618576637189534e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.3862068921327591, + "step": 44300 + }, + { + "epoch": 0.04462446655607494, + "grad_norm": 12.151825269295024, + "learning_rate": 4.4623612593920594e-05, + "loss": 2.3216, + "mean_token_accuracy": 0.43103448748588563, + "step": 44305 + }, + { + "epoch": 0.04462950260917911, + "grad_norm": 12.337592109693574, + "learning_rate": 4.462864855065165e-05, + "loss": 2.6531, + "mean_token_accuracy": 0.35862069129943847, + "step": 44310 + }, + { + "epoch": 0.044634538662283285, + "grad_norm": 11.308240696710952, + "learning_rate": 4.463368450738272e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.45172414779663084, + "step": 44315 + }, + { + "epoch": 0.04463957471538746, + "grad_norm": 15.057823959320642, + "learning_rate": 4.463872046411378e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.5034482836723327, + "step": 44320 + }, + { + "epoch": 0.04464461076849163, + "grad_norm": 13.22310463344464, + "learning_rate": 4.464375642084484e-05, + "loss": 2.8505, + "mean_token_accuracy": 0.42758620381355283, + "step": 44325 + }, + { + "epoch": 0.044649646821595806, + "grad_norm": 12.933959932656371, + "learning_rate": 4.464879237757589e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.3862069010734558, + "step": 44330 + }, + { + "epoch": 0.04465468287469998, + "grad_norm": 14.31492755930774, + "learning_rate": 4.465382833430695e-05, + "loss": 2.677, + "mean_token_accuracy": 0.3793103456497192, + "step": 44335 + }, + { + "epoch": 0.04465971892780415, + "grad_norm": 18.416663979726813, + "learning_rate": 4.4658864291038015e-05, + "loss": 2.7629, + "mean_token_accuracy": 0.3758620619773865, + "step": 44340 + }, + { + "epoch": 0.04466475498090832, + "grad_norm": 15.29278948661555, + "learning_rate": 4.4663900247769075e-05, + "loss": 2.8602, + "mean_token_accuracy": 0.37241379618644715, + "step": 44345 + }, + { + "epoch": 0.044669791034012495, + "grad_norm": 13.182355378266555, + "learning_rate": 4.4668936204500134e-05, + "loss": 2.2739, + "mean_token_accuracy": 0.3999999940395355, + "step": 44350 + }, + { + "epoch": 0.04467482708711667, + "grad_norm": 12.920442536862305, + "learning_rate": 4.467397216123119e-05, + "loss": 2.4517, + "mean_token_accuracy": 0.382758629322052, + "step": 44355 + }, + { + "epoch": 0.04467986314022084, + "grad_norm": 14.157576514165662, + "learning_rate": 4.467900811796225e-05, + "loss": 2.6826, + "mean_token_accuracy": 0.37931033968925476, + "step": 44360 + }, + { + "epoch": 0.044684899193325016, + "grad_norm": 13.451230478782248, + "learning_rate": 4.468404407469331e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.4172413766384125, + "step": 44365 + }, + { + "epoch": 0.04468993524642919, + "grad_norm": 13.861578518274676, + "learning_rate": 4.468908003142437e-05, + "loss": 2.8463, + "mean_token_accuracy": 0.37241379618644715, + "step": 44370 + }, + { + "epoch": 0.044694971299533356, + "grad_norm": 12.230623727688428, + "learning_rate": 4.469411598815543e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.4068965554237366, + "step": 44375 + }, + { + "epoch": 0.04470000735263753, + "grad_norm": 13.447931386339436, + "learning_rate": 4.469915194488649e-05, + "loss": 2.2186, + "mean_token_accuracy": 0.40344826877117157, + "step": 44380 + }, + { + "epoch": 0.044705043405741704, + "grad_norm": 18.12107992714274, + "learning_rate": 4.470418790161755e-05, + "loss": 2.498, + "mean_token_accuracy": 0.42758620977401735, + "step": 44385 + }, + { + "epoch": 0.04471007945884588, + "grad_norm": 12.805731503744495, + "learning_rate": 4.470922385834861e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.42413792610168455, + "step": 44390 + }, + { + "epoch": 0.04471511551195005, + "grad_norm": 13.769819248922563, + "learning_rate": 4.4714259815079674e-05, + "loss": 2.4897, + "mean_token_accuracy": 0.4162129402160645, + "step": 44395 + }, + { + "epoch": 0.044720151565054225, + "grad_norm": 11.864130916783793, + "learning_rate": 4.471929577181073e-05, + "loss": 2.7927, + "mean_token_accuracy": 0.3793103456497192, + "step": 44400 + }, + { + "epoch": 0.0447251876181584, + "grad_norm": 13.396209461691038, + "learning_rate": 4.472433172854179e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.4068965494632721, + "step": 44405 + }, + { + "epoch": 0.044730223671262566, + "grad_norm": 12.709868966267702, + "learning_rate": 4.472936768527285e-05, + "loss": 2.1781, + "mean_token_accuracy": 0.41724138259887694, + "step": 44410 + }, + { + "epoch": 0.04473525972436674, + "grad_norm": 12.11317895640667, + "learning_rate": 4.473440364200391e-05, + "loss": 2.7178, + "mean_token_accuracy": 0.3517241358757019, + "step": 44415 + }, + { + "epoch": 0.04474029577747091, + "grad_norm": 15.407549047555106, + "learning_rate": 4.473943959873497e-05, + "loss": 2.8256, + "mean_token_accuracy": 0.3362976431846619, + "step": 44420 + }, + { + "epoch": 0.04474533183057509, + "grad_norm": 14.130655391230654, + "learning_rate": 4.474447555546603e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.38620689511299133, + "step": 44425 + }, + { + "epoch": 0.04475036788367926, + "grad_norm": 10.926372087200113, + "learning_rate": 4.474951151219709e-05, + "loss": 2.6161, + "mean_token_accuracy": 0.35862069129943847, + "step": 44430 + }, + { + "epoch": 0.044755403936783435, + "grad_norm": 12.325945859230604, + "learning_rate": 4.475454746892815e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.38620689511299133, + "step": 44435 + }, + { + "epoch": 0.04476043998988761, + "grad_norm": 18.215045573944565, + "learning_rate": 4.475958342565921e-05, + "loss": 2.7619, + "mean_token_accuracy": 0.3448275774717331, + "step": 44440 + }, + { + "epoch": 0.044765476042991775, + "grad_norm": 14.807605192228278, + "learning_rate": 4.4764619382390266e-05, + "loss": 2.489, + "mean_token_accuracy": 0.42068964838981626, + "step": 44445 + }, + { + "epoch": 0.04477051209609595, + "grad_norm": 13.892652389388157, + "learning_rate": 4.476965533912133e-05, + "loss": 2.8009, + "mean_token_accuracy": 0.3551724135875702, + "step": 44450 + }, + { + "epoch": 0.04477554814920012, + "grad_norm": 13.361401039907307, + "learning_rate": 4.477469129585239e-05, + "loss": 2.5935, + "mean_token_accuracy": 0.3206896513700485, + "step": 44455 + }, + { + "epoch": 0.0447805842023043, + "grad_norm": 12.729058355462525, + "learning_rate": 4.477972725258345e-05, + "loss": 2.5722, + "mean_token_accuracy": 0.3551724076271057, + "step": 44460 + }, + { + "epoch": 0.04478562025540847, + "grad_norm": 11.627597072418135, + "learning_rate": 4.4784763209314503e-05, + "loss": 2.3534, + "mean_token_accuracy": 0.42413793206214906, + "step": 44465 + }, + { + "epoch": 0.044790656308512644, + "grad_norm": 13.190708742167434, + "learning_rate": 4.478979916604556e-05, + "loss": 3.0369, + "mean_token_accuracy": 0.324137932062149, + "step": 44470 + }, + { + "epoch": 0.04479569236161682, + "grad_norm": 13.32595141358663, + "learning_rate": 4.479483512277663e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.4172413766384125, + "step": 44475 + }, + { + "epoch": 0.044800728414720985, + "grad_norm": 15.825311688568338, + "learning_rate": 4.479987107950769e-05, + "loss": 2.5695, + "mean_token_accuracy": 0.4, + "step": 44480 + }, + { + "epoch": 0.04480576446782516, + "grad_norm": 11.61436552440119, + "learning_rate": 4.480490703623875e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.4482758641242981, + "step": 44485 + }, + { + "epoch": 0.04481080052092933, + "grad_norm": 13.145620395350704, + "learning_rate": 4.4809942992969806e-05, + "loss": 2.3909, + "mean_token_accuracy": 0.42758620977401735, + "step": 44490 + }, + { + "epoch": 0.044815836574033506, + "grad_norm": 13.230035323904444, + "learning_rate": 4.4814978949700866e-05, + "loss": 2.6816, + "mean_token_accuracy": 0.3586206823587418, + "step": 44495 + }, + { + "epoch": 0.04482087262713768, + "grad_norm": 12.534248736435215, + "learning_rate": 4.482001490643193e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.4275862008333206, + "step": 44500 + }, + { + "epoch": 0.044825908680241854, + "grad_norm": 13.447574471687213, + "learning_rate": 4.4825050863162984e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.3931034505367279, + "step": 44505 + }, + { + "epoch": 0.04483094473334603, + "grad_norm": 18.968628176314848, + "learning_rate": 4.4830086819894043e-05, + "loss": 2.9686, + "mean_token_accuracy": 0.36206896901130675, + "step": 44510 + }, + { + "epoch": 0.044835980786450194, + "grad_norm": 16.984272942961983, + "learning_rate": 4.48351227766251e-05, + "loss": 2.9033, + "mean_token_accuracy": 0.33103448152542114, + "step": 44515 + }, + { + "epoch": 0.04484101683955437, + "grad_norm": 15.346671391467446, + "learning_rate": 4.484015873335616e-05, + "loss": 2.6786, + "mean_token_accuracy": 0.3655172407627106, + "step": 44520 + }, + { + "epoch": 0.04484605289265854, + "grad_norm": 12.545366494309848, + "learning_rate": 4.484519469008722e-05, + "loss": 2.9542, + "mean_token_accuracy": 0.358620685338974, + "step": 44525 + }, + { + "epoch": 0.044851088945762715, + "grad_norm": 13.626004997590783, + "learning_rate": 4.485023064681829e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.39473684430122374, + "step": 44530 + }, + { + "epoch": 0.04485612499886689, + "grad_norm": 13.47694154156956, + "learning_rate": 4.4855266603549346e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4275862067937851, + "step": 44535 + }, + { + "epoch": 0.04486116105197106, + "grad_norm": 15.654751327336484, + "learning_rate": 4.4860302560280406e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.4172413766384125, + "step": 44540 + }, + { + "epoch": 0.04486619710507524, + "grad_norm": 13.82111434068498, + "learning_rate": 4.4865338517011465e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.34137931764125823, + "step": 44545 + }, + { + "epoch": 0.044871233158179404, + "grad_norm": 12.715819856179923, + "learning_rate": 4.4870374473742524e-05, + "loss": 2.6238, + "mean_token_accuracy": 0.39655172228813174, + "step": 44550 + }, + { + "epoch": 0.04487626921128358, + "grad_norm": 13.322495642989322, + "learning_rate": 4.4875410430473583e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.4379310369491577, + "step": 44555 + }, + { + "epoch": 0.04488130526438775, + "grad_norm": 13.986680220984965, + "learning_rate": 4.488044638720464e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.37931033968925476, + "step": 44560 + }, + { + "epoch": 0.044886341317491925, + "grad_norm": 14.0607427660587, + "learning_rate": 4.48854823439357e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.4103448212146759, + "step": 44565 + }, + { + "epoch": 0.0448913773705961, + "grad_norm": 13.703988287454036, + "learning_rate": 4.489051830066676e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.40689654350280763, + "step": 44570 + }, + { + "epoch": 0.04489641342370027, + "grad_norm": 11.035394799873702, + "learning_rate": 4.489555425739782e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.38275861740112305, + "step": 44575 + }, + { + "epoch": 0.044901449476804446, + "grad_norm": 14.7156874787611, + "learning_rate": 4.4900590214128887e-05, + "loss": 2.2619, + "mean_token_accuracy": 0.39655172228813174, + "step": 44580 + }, + { + "epoch": 0.04490648552990861, + "grad_norm": 15.624608073456418, + "learning_rate": 4.4905626170859946e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.42758620977401735, + "step": 44585 + }, + { + "epoch": 0.04491152158301279, + "grad_norm": 21.681843150290984, + "learning_rate": 4.4910662127591005e-05, + "loss": 2.926, + "mean_token_accuracy": 0.3793103456497192, + "step": 44590 + }, + { + "epoch": 0.04491655763611696, + "grad_norm": 14.843879021398818, + "learning_rate": 4.4915698084322064e-05, + "loss": 2.8801, + "mean_token_accuracy": 0.3310344755649567, + "step": 44595 + }, + { + "epoch": 0.044921593689221134, + "grad_norm": 17.1733923615299, + "learning_rate": 4.492073404105312e-05, + "loss": 2.757, + "mean_token_accuracy": 0.35995160341262816, + "step": 44600 + }, + { + "epoch": 0.04492662974232531, + "grad_norm": 13.778073803680591, + "learning_rate": 4.492576999778418e-05, + "loss": 2.764, + "mean_token_accuracy": 0.3827586233615875, + "step": 44605 + }, + { + "epoch": 0.04493166579542948, + "grad_norm": 12.524776993175706, + "learning_rate": 4.493080595451524e-05, + "loss": 2.4319, + "mean_token_accuracy": 0.4034482717514038, + "step": 44610 + }, + { + "epoch": 0.044936701848533656, + "grad_norm": 14.821109219181194, + "learning_rate": 4.49358419112463e-05, + "loss": 2.7008, + "mean_token_accuracy": 0.37586206793785093, + "step": 44615 + }, + { + "epoch": 0.04494173790163782, + "grad_norm": 15.717830315416789, + "learning_rate": 4.494087786797736e-05, + "loss": 2.4452, + "mean_token_accuracy": 0.41724138259887694, + "step": 44620 + }, + { + "epoch": 0.044946773954741996, + "grad_norm": 14.73696298402566, + "learning_rate": 4.494591382470842e-05, + "loss": 2.6372, + "mean_token_accuracy": 0.35862069129943847, + "step": 44625 + }, + { + "epoch": 0.04495181000784617, + "grad_norm": 15.84376418463873, + "learning_rate": 4.495094978143948e-05, + "loss": 2.7592, + "mean_token_accuracy": 0.3620689630508423, + "step": 44630 + }, + { + "epoch": 0.044956846060950344, + "grad_norm": 17.43755312573065, + "learning_rate": 4.4955985738170545e-05, + "loss": 2.7583, + "mean_token_accuracy": 0.4041871964931488, + "step": 44635 + }, + { + "epoch": 0.04496188211405452, + "grad_norm": 13.475833931571286, + "learning_rate": 4.49610216949016e-05, + "loss": 2.8714, + "mean_token_accuracy": 0.3793103456497192, + "step": 44640 + }, + { + "epoch": 0.04496691816715869, + "grad_norm": 13.183023464311228, + "learning_rate": 4.496605765163266e-05, + "loss": 3.1301, + "mean_token_accuracy": 0.3482758641242981, + "step": 44645 + }, + { + "epoch": 0.044971954220262865, + "grad_norm": 14.82176523865926, + "learning_rate": 4.4971093608363716e-05, + "loss": 2.9793, + "mean_token_accuracy": 0.3551724076271057, + "step": 44650 + }, + { + "epoch": 0.04497699027336703, + "grad_norm": 11.718102289194606, + "learning_rate": 4.4976129565094775e-05, + "loss": 2.5778, + "mean_token_accuracy": 0.4071428537368774, + "step": 44655 + }, + { + "epoch": 0.044982026326471206, + "grad_norm": 12.904900760854074, + "learning_rate": 4.498116552182584e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.42413793206214906, + "step": 44660 + }, + { + "epoch": 0.04498706237957538, + "grad_norm": 11.189013646277129, + "learning_rate": 4.49862014785569e-05, + "loss": 2.3111, + "mean_token_accuracy": 0.4482758641242981, + "step": 44665 + }, + { + "epoch": 0.04499209843267955, + "grad_norm": 12.364641210207758, + "learning_rate": 4.499123743528796e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.4068965494632721, + "step": 44670 + }, + { + "epoch": 0.04499713448578373, + "grad_norm": 12.248789589564423, + "learning_rate": 4.499627339201902e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.39310343861579894, + "step": 44675 + }, + { + "epoch": 0.0450021705388879, + "grad_norm": 31.431256882326537, + "learning_rate": 4.500130934875008e-05, + "loss": 2.8885, + "mean_token_accuracy": 0.37586206793785093, + "step": 44680 + }, + { + "epoch": 0.045007206591992074, + "grad_norm": 10.834743652664642, + "learning_rate": 4.500634530548114e-05, + "loss": 2.2241, + "mean_token_accuracy": 0.4517241418361664, + "step": 44685 + }, + { + "epoch": 0.04501224264509624, + "grad_norm": 11.518130140139382, + "learning_rate": 4.50113812622122e-05, + "loss": 2.4581, + "mean_token_accuracy": 0.44137930274009707, + "step": 44690 + }, + { + "epoch": 0.045017278698200415, + "grad_norm": 12.13692751446189, + "learning_rate": 4.5016417218943256e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.38620689511299133, + "step": 44695 + }, + { + "epoch": 0.04502231475130459, + "grad_norm": 12.572883118270255, + "learning_rate": 4.5021453175674315e-05, + "loss": 2.7817, + "mean_token_accuracy": 0.34137930572032926, + "step": 44700 + }, + { + "epoch": 0.04502735080440876, + "grad_norm": 12.618903642930425, + "learning_rate": 4.5026489132405375e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.4206896543502808, + "step": 44705 + }, + { + "epoch": 0.045032386857512936, + "grad_norm": 12.640048671297029, + "learning_rate": 4.5031525089136434e-05, + "loss": 2.6966, + "mean_token_accuracy": 0.34827586114406583, + "step": 44710 + }, + { + "epoch": 0.04503742291061711, + "grad_norm": 26.540501301522333, + "learning_rate": 4.50365610458675e-05, + "loss": 2.7657, + "mean_token_accuracy": 0.40556563436985016, + "step": 44715 + }, + { + "epoch": 0.045042458963721284, + "grad_norm": 12.011423733051945, + "learning_rate": 4.504159700259856e-05, + "loss": 2.4184, + "mean_token_accuracy": 0.4172413766384125, + "step": 44720 + }, + { + "epoch": 0.04504749501682545, + "grad_norm": 15.207243542723162, + "learning_rate": 4.504663295932962e-05, + "loss": 2.5776, + "mean_token_accuracy": 0.3931034505367279, + "step": 44725 + }, + { + "epoch": 0.045052531069929624, + "grad_norm": 13.995078934737471, + "learning_rate": 4.505166891606067e-05, + "loss": 2.3076, + "mean_token_accuracy": 0.41034482717514037, + "step": 44730 + }, + { + "epoch": 0.0450575671230338, + "grad_norm": 11.286122428997395, + "learning_rate": 4.505670487279173e-05, + "loss": 2.5061, + "mean_token_accuracy": 0.4034482717514038, + "step": 44735 + }, + { + "epoch": 0.04506260317613797, + "grad_norm": 14.512346379128926, + "learning_rate": 4.5061740829522796e-05, + "loss": 2.3961, + "mean_token_accuracy": 0.46400484442710876, + "step": 44740 + }, + { + "epoch": 0.045067639229242146, + "grad_norm": 11.71884258864979, + "learning_rate": 4.5066776786253855e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.43793103098869324, + "step": 44745 + }, + { + "epoch": 0.04507267528234632, + "grad_norm": 11.66576226404458, + "learning_rate": 4.5071812742984915e-05, + "loss": 2.6895, + "mean_token_accuracy": 0.3655172407627106, + "step": 44750 + }, + { + "epoch": 0.04507771133545049, + "grad_norm": 12.579058538080561, + "learning_rate": 4.5076848699715974e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.362068971991539, + "step": 44755 + }, + { + "epoch": 0.04508274738855466, + "grad_norm": 14.046241368844642, + "learning_rate": 4.508188465644703e-05, + "loss": 2.9575, + "mean_token_accuracy": 0.3206896483898163, + "step": 44760 + }, + { + "epoch": 0.045087783441658834, + "grad_norm": 24.522396476210343, + "learning_rate": 4.50869206131781e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.4068965554237366, + "step": 44765 + }, + { + "epoch": 0.04509281949476301, + "grad_norm": 14.144320149720668, + "learning_rate": 4.509195656990915e-05, + "loss": 3.056, + "mean_token_accuracy": 0.29605911374092103, + "step": 44770 + }, + { + "epoch": 0.04509785554786718, + "grad_norm": 10.920331475097846, + "learning_rate": 4.509699252664021e-05, + "loss": 2.5061, + "mean_token_accuracy": 0.4068965494632721, + "step": 44775 + }, + { + "epoch": 0.045102891600971355, + "grad_norm": 11.72524101044772, + "learning_rate": 4.510202848337127e-05, + "loss": 2.5745, + "mean_token_accuracy": 0.4310344815254211, + "step": 44780 + }, + { + "epoch": 0.04510792765407553, + "grad_norm": 12.320080506480053, + "learning_rate": 4.510706444010233e-05, + "loss": 2.8897, + "mean_token_accuracy": 0.36896551251411436, + "step": 44785 + }, + { + "epoch": 0.0451129637071797, + "grad_norm": 15.069534790653307, + "learning_rate": 4.511210039683339e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.4296430706977844, + "step": 44790 + }, + { + "epoch": 0.04511799976028387, + "grad_norm": 12.988498484701191, + "learning_rate": 4.5117136353564455e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.4551724135875702, + "step": 44795 + }, + { + "epoch": 0.04512303581338804, + "grad_norm": 15.257683808187931, + "learning_rate": 4.5122172310295514e-05, + "loss": 2.5055, + "mean_token_accuracy": 0.4103448331356049, + "step": 44800 + }, + { + "epoch": 0.04512807186649222, + "grad_norm": 11.028958388596825, + "learning_rate": 4.512720826702657e-05, + "loss": 2.6047, + "mean_token_accuracy": 0.3793103456497192, + "step": 44805 + }, + { + "epoch": 0.04513310791959639, + "grad_norm": 12.38096647608136, + "learning_rate": 4.513224422375763e-05, + "loss": 2.7929, + "mean_token_accuracy": 0.38777979016304015, + "step": 44810 + }, + { + "epoch": 0.045138143972700565, + "grad_norm": 13.084057645359465, + "learning_rate": 4.513728018048869e-05, + "loss": 2.7383, + "mean_token_accuracy": 0.3965517282485962, + "step": 44815 + }, + { + "epoch": 0.04514318002580474, + "grad_norm": 17.363644535670936, + "learning_rate": 4.514231613721975e-05, + "loss": 2.6152, + "mean_token_accuracy": 0.36896551251411436, + "step": 44820 + }, + { + "epoch": 0.04514821607890891, + "grad_norm": 14.35910716313129, + "learning_rate": 4.514735209395081e-05, + "loss": 2.7302, + "mean_token_accuracy": 0.42758620381355283, + "step": 44825 + }, + { + "epoch": 0.04515325213201308, + "grad_norm": 12.787321011253866, + "learning_rate": 4.515238805068187e-05, + "loss": 2.5357, + "mean_token_accuracy": 0.3896551728248596, + "step": 44830 + }, + { + "epoch": 0.04515828818511725, + "grad_norm": 20.71823753765038, + "learning_rate": 4.515742400741293e-05, + "loss": 2.5943, + "mean_token_accuracy": 0.37241379618644715, + "step": 44835 + }, + { + "epoch": 0.045163324238221426, + "grad_norm": 13.894358983274286, + "learning_rate": 4.516245996414399e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.43992740511894224, + "step": 44840 + }, + { + "epoch": 0.0451683602913256, + "grad_norm": 14.753950013805605, + "learning_rate": 4.5167495920875054e-05, + "loss": 2.6425, + "mean_token_accuracy": 0.3724137932062149, + "step": 44845 + }, + { + "epoch": 0.045173396344429774, + "grad_norm": 17.313524455228322, + "learning_rate": 4.517253187760611e-05, + "loss": 2.9129, + "mean_token_accuracy": 0.3827586233615875, + "step": 44850 + }, + { + "epoch": 0.04517843239753395, + "grad_norm": 13.52798270682511, + "learning_rate": 4.517756783433717e-05, + "loss": 2.8442, + "mean_token_accuracy": 0.34137930274009703, + "step": 44855 + }, + { + "epoch": 0.04518346845063812, + "grad_norm": 13.920348650831679, + "learning_rate": 4.518260379106823e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.38275861740112305, + "step": 44860 + }, + { + "epoch": 0.04518850450374229, + "grad_norm": 11.589635436367812, + "learning_rate": 4.5187639747799284e-05, + "loss": 2.7843, + "mean_token_accuracy": 0.3882032632827759, + "step": 44865 + }, + { + "epoch": 0.04519354055684646, + "grad_norm": 12.560111397875085, + "learning_rate": 4.5192675704530344e-05, + "loss": 2.2077, + "mean_token_accuracy": 0.44482759237289426, + "step": 44870 + }, + { + "epoch": 0.045198576609950636, + "grad_norm": 16.236942271857647, + "learning_rate": 4.519771166126141e-05, + "loss": 2.8092, + "mean_token_accuracy": 0.39461584985256193, + "step": 44875 + }, + { + "epoch": 0.04520361266305481, + "grad_norm": 13.976763920840503, + "learning_rate": 4.520274761799247e-05, + "loss": 2.7381, + "mean_token_accuracy": 0.37586207389831544, + "step": 44880 + }, + { + "epoch": 0.04520864871615898, + "grad_norm": 15.358232230559201, + "learning_rate": 4.520778357472353e-05, + "loss": 2.7635, + "mean_token_accuracy": 0.3896551728248596, + "step": 44885 + }, + { + "epoch": 0.04521368476926316, + "grad_norm": 14.11073286749314, + "learning_rate": 4.521281953145459e-05, + "loss": 2.4965, + "mean_token_accuracy": 0.39310343861579894, + "step": 44890 + }, + { + "epoch": 0.04521872082236733, + "grad_norm": 17.809997309013614, + "learning_rate": 4.5217855488185647e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.4103448212146759, + "step": 44895 + }, + { + "epoch": 0.0452237568754715, + "grad_norm": 14.196946351947064, + "learning_rate": 4.522289144491671e-05, + "loss": 2.8313, + "mean_token_accuracy": 0.3931034475564957, + "step": 44900 + }, + { + "epoch": 0.04522879292857567, + "grad_norm": 12.012069827578587, + "learning_rate": 4.5227927401647765e-05, + "loss": 2.3098, + "mean_token_accuracy": 0.44827585220336913, + "step": 44905 + }, + { + "epoch": 0.045233828981679845, + "grad_norm": 14.604945470332536, + "learning_rate": 4.5232963358378824e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.4206896543502808, + "step": 44910 + }, + { + "epoch": 0.04523886503478402, + "grad_norm": 13.102857646316075, + "learning_rate": 4.5237999315109884e-05, + "loss": 2.9991, + "mean_token_accuracy": 0.3689655214548111, + "step": 44915 + }, + { + "epoch": 0.04524390108788819, + "grad_norm": 15.45215553415538, + "learning_rate": 4.524303527184094e-05, + "loss": 2.4055, + "mean_token_accuracy": 0.41379310488700866, + "step": 44920 + }, + { + "epoch": 0.04524893714099237, + "grad_norm": 10.060139766925662, + "learning_rate": 4.524807122857201e-05, + "loss": 2.1536, + "mean_token_accuracy": 0.4799757957458496, + "step": 44925 + }, + { + "epoch": 0.04525397319409654, + "grad_norm": 14.271780639408592, + "learning_rate": 4.525310718530307e-05, + "loss": 2.7418, + "mean_token_accuracy": 0.36896551251411436, + "step": 44930 + }, + { + "epoch": 0.04525900924720071, + "grad_norm": 11.913261194249856, + "learning_rate": 4.525814314203413e-05, + "loss": 2.737, + "mean_token_accuracy": 0.4103448331356049, + "step": 44935 + }, + { + "epoch": 0.04526404530030488, + "grad_norm": 19.468992862647276, + "learning_rate": 4.526317909876519e-05, + "loss": 3.4134, + "mean_token_accuracy": 0.341379314661026, + "step": 44940 + }, + { + "epoch": 0.045269081353409055, + "grad_norm": 15.247338262604975, + "learning_rate": 4.5268215055496246e-05, + "loss": 3.1125, + "mean_token_accuracy": 0.3034482687711716, + "step": 44945 + }, + { + "epoch": 0.04527411740651323, + "grad_norm": 14.71080179980999, + "learning_rate": 4.5273251012227305e-05, + "loss": 2.7258, + "mean_token_accuracy": 0.3517241358757019, + "step": 44950 + }, + { + "epoch": 0.0452791534596174, + "grad_norm": 11.48049556491395, + "learning_rate": 4.5278286968958364e-05, + "loss": 2.2934, + "mean_token_accuracy": 0.46551724076271056, + "step": 44955 + }, + { + "epoch": 0.045284189512721576, + "grad_norm": 15.560301098530603, + "learning_rate": 4.5283322925689424e-05, + "loss": 2.5009, + "mean_token_accuracy": 0.3978826284408569, + "step": 44960 + }, + { + "epoch": 0.04528922556582575, + "grad_norm": 16.093610703512436, + "learning_rate": 4.528835888242048e-05, + "loss": 2.4346, + "mean_token_accuracy": 0.43103448748588563, + "step": 44965 + }, + { + "epoch": 0.04529426161892992, + "grad_norm": 12.474781193179483, + "learning_rate": 4.529339483915154e-05, + "loss": 2.8864, + "mean_token_accuracy": 0.37586206793785093, + "step": 44970 + }, + { + "epoch": 0.04529929767203409, + "grad_norm": 11.21095300596419, + "learning_rate": 4.52984307958826e-05, + "loss": 2.764, + "mean_token_accuracy": 0.379310342669487, + "step": 44975 + }, + { + "epoch": 0.045304333725138264, + "grad_norm": 12.921260807828796, + "learning_rate": 4.530346675261367e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.4, + "step": 44980 + }, + { + "epoch": 0.04530936977824244, + "grad_norm": 16.150416597561772, + "learning_rate": 4.530850270934473e-05, + "loss": 2.6838, + "mean_token_accuracy": 0.3896551728248596, + "step": 44985 + }, + { + "epoch": 0.04531440583134661, + "grad_norm": 16.438275675735824, + "learning_rate": 4.5313538666075786e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.4482758641242981, + "step": 44990 + }, + { + "epoch": 0.045319441884450785, + "grad_norm": 13.23974283340723, + "learning_rate": 4.5318574622806845e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.4191167652606964, + "step": 44995 + }, + { + "epoch": 0.04532447793755496, + "grad_norm": 14.278162896508482, + "learning_rate": 4.53236105795379e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.39310344457626345, + "step": 45000 + }, + { + "epoch": 0.045329513990659126, + "grad_norm": 13.279330456357624, + "learning_rate": 4.5328646536268964e-05, + "loss": 2.7701, + "mean_token_accuracy": 0.36896551847457887, + "step": 45005 + }, + { + "epoch": 0.0453345500437633, + "grad_norm": 12.227205842960329, + "learning_rate": 4.533368249300002e-05, + "loss": 2.305, + "mean_token_accuracy": 0.43793103098869324, + "step": 45010 + }, + { + "epoch": 0.045339586096867474, + "grad_norm": 15.823759611160018, + "learning_rate": 4.533871844973108e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.40689654350280763, + "step": 45015 + }, + { + "epoch": 0.04534462214997165, + "grad_norm": 15.277685693081558, + "learning_rate": 4.534375440646214e-05, + "loss": 2.719, + "mean_token_accuracy": 0.3551724135875702, + "step": 45020 + }, + { + "epoch": 0.04534965820307582, + "grad_norm": 10.948883206537369, + "learning_rate": 4.53487903631932e-05, + "loss": 2.0965, + "mean_token_accuracy": 0.4879007875919342, + "step": 45025 + }, + { + "epoch": 0.045354694256179995, + "grad_norm": 13.032313888006943, + "learning_rate": 4.535382631992427e-05, + "loss": 2.8642, + "mean_token_accuracy": 0.3448275804519653, + "step": 45030 + }, + { + "epoch": 0.04535973030928417, + "grad_norm": 12.41164447401999, + "learning_rate": 4.5358862276655326e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.4310344815254211, + "step": 45035 + }, + { + "epoch": 0.045364766362388335, + "grad_norm": 14.028944506771541, + "learning_rate": 4.536389823338638e-05, + "loss": 2.7482, + "mean_token_accuracy": 0.3807622462511063, + "step": 45040 + }, + { + "epoch": 0.04536980241549251, + "grad_norm": 9.989692737163047, + "learning_rate": 4.536893419011744e-05, + "loss": 2.7546, + "mean_token_accuracy": 0.41246218085289, + "step": 45045 + }, + { + "epoch": 0.04537483846859668, + "grad_norm": 11.981570854103037, + "learning_rate": 4.53739701468485e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.42262552976608275, + "step": 45050 + }, + { + "epoch": 0.04537987452170086, + "grad_norm": 20.789060529480373, + "learning_rate": 4.5379006103579556e-05, + "loss": 2.7282, + "mean_token_accuracy": 0.33448275923728943, + "step": 45055 + }, + { + "epoch": 0.04538491057480503, + "grad_norm": 13.134537467113612, + "learning_rate": 4.538404206031062e-05, + "loss": 2.6924, + "mean_token_accuracy": 0.4068965494632721, + "step": 45060 + }, + { + "epoch": 0.045389946627909204, + "grad_norm": 19.99016207285458, + "learning_rate": 4.538907801704168e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.36206896901130675, + "step": 45065 + }, + { + "epoch": 0.04539498268101338, + "grad_norm": 15.687119097866171, + "learning_rate": 4.539411397377274e-05, + "loss": 2.4346, + "mean_token_accuracy": 0.4172413766384125, + "step": 45070 + }, + { + "epoch": 0.045400018734117545, + "grad_norm": 10.565059484840424, + "learning_rate": 4.53991499305038e-05, + "loss": 2.4925, + "mean_token_accuracy": 0.39310344457626345, + "step": 45075 + }, + { + "epoch": 0.04540505478722172, + "grad_norm": 12.40190254307408, + "learning_rate": 4.540418588723486e-05, + "loss": 2.1745, + "mean_token_accuracy": 0.4517241418361664, + "step": 45080 + }, + { + "epoch": 0.04541009084032589, + "grad_norm": 13.345929675396324, + "learning_rate": 4.540922184396592e-05, + "loss": 2.4919, + "mean_token_accuracy": 0.4137930989265442, + "step": 45085 + }, + { + "epoch": 0.045415126893430066, + "grad_norm": 14.372914290071753, + "learning_rate": 4.541425780069698e-05, + "loss": 2.8893, + "mean_token_accuracy": 0.38965516686439516, + "step": 45090 + }, + { + "epoch": 0.04542016294653424, + "grad_norm": 12.979539390233766, + "learning_rate": 4.541929375742804e-05, + "loss": 2.8954, + "mean_token_accuracy": 0.3793103486299515, + "step": 45095 + }, + { + "epoch": 0.045425198999638414, + "grad_norm": 12.699002533074161, + "learning_rate": 4.5424329714159096e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.4137930989265442, + "step": 45100 + }, + { + "epoch": 0.04543023505274259, + "grad_norm": 12.721114195962278, + "learning_rate": 4.5429365670890156e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.3896551728248596, + "step": 45105 + }, + { + "epoch": 0.045435271105846754, + "grad_norm": 14.015904415423396, + "learning_rate": 4.543440162762122e-05, + "loss": 2.6036, + "mean_token_accuracy": 0.3551724135875702, + "step": 45110 + }, + { + "epoch": 0.04544030715895093, + "grad_norm": 13.133000410533114, + "learning_rate": 4.543943758435228e-05, + "loss": 2.6331, + "mean_token_accuracy": 0.373986679315567, + "step": 45115 + }, + { + "epoch": 0.0454453432120551, + "grad_norm": 13.387916214373474, + "learning_rate": 4.544447354108334e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.36896551549434664, + "step": 45120 + }, + { + "epoch": 0.045450379265159276, + "grad_norm": 12.172324705801335, + "learning_rate": 4.54495094978144e-05, + "loss": 2.5096, + "mean_token_accuracy": 0.39655172228813174, + "step": 45125 + }, + { + "epoch": 0.04545541531826345, + "grad_norm": 11.474715654145777, + "learning_rate": 4.545454545454546e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.4068965494632721, + "step": 45130 + }, + { + "epoch": 0.04546045137136762, + "grad_norm": 15.418982788824612, + "learning_rate": 4.545958141127651e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.4034482717514038, + "step": 45135 + }, + { + "epoch": 0.0454654874244718, + "grad_norm": 13.025867252525092, + "learning_rate": 4.546461736800758e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.3704779237508774, + "step": 45140 + }, + { + "epoch": 0.045470523477575964, + "grad_norm": 13.793031477045323, + "learning_rate": 4.5469653324738636e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.41034482717514037, + "step": 45145 + }, + { + "epoch": 0.04547555953068014, + "grad_norm": 11.780683624012756, + "learning_rate": 4.5474689281469696e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4310344815254211, + "step": 45150 + }, + { + "epoch": 0.04548059558378431, + "grad_norm": 10.843711578316588, + "learning_rate": 4.5479725238200755e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.46454930305480957, + "step": 45155 + }, + { + "epoch": 0.045485631636888485, + "grad_norm": 12.729036297927463, + "learning_rate": 4.5484761194931814e-05, + "loss": 2.3739, + "mean_token_accuracy": 0.4103448212146759, + "step": 45160 + }, + { + "epoch": 0.04549066768999266, + "grad_norm": 14.583126762727854, + "learning_rate": 4.548979715166288e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.3931034505367279, + "step": 45165 + }, + { + "epoch": 0.04549570374309683, + "grad_norm": 12.058081638374917, + "learning_rate": 4.549483310839394e-05, + "loss": 2.4108, + "mean_token_accuracy": 0.4206896543502808, + "step": 45170 + }, + { + "epoch": 0.045500739796201006, + "grad_norm": 30.518044025478805, + "learning_rate": 4.549986906512499e-05, + "loss": 3.3733, + "mean_token_accuracy": 0.3241379290819168, + "step": 45175 + }, + { + "epoch": 0.04550577584930517, + "grad_norm": 13.809847290105825, + "learning_rate": 4.550490502185605e-05, + "loss": 2.8638, + "mean_token_accuracy": 0.37586206793785093, + "step": 45180 + }, + { + "epoch": 0.04551081190240935, + "grad_norm": 10.964380395555134, + "learning_rate": 4.550994097858711e-05, + "loss": 2.83, + "mean_token_accuracy": 0.4068965494632721, + "step": 45185 + }, + { + "epoch": 0.04551584795551352, + "grad_norm": 18.611028361718628, + "learning_rate": 4.5514976935318176e-05, + "loss": 2.6292, + "mean_token_accuracy": 0.43623714447021483, + "step": 45190 + }, + { + "epoch": 0.045520884008617694, + "grad_norm": 13.053846072940907, + "learning_rate": 4.5520012892049236e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.43103448748588563, + "step": 45195 + }, + { + "epoch": 0.04552592006172187, + "grad_norm": 11.836688385513353, + "learning_rate": 4.5525048848780295e-05, + "loss": 2.7525, + "mean_token_accuracy": 0.4068965554237366, + "step": 45200 + }, + { + "epoch": 0.04553095611482604, + "grad_norm": 12.278971542999273, + "learning_rate": 4.5530084805511354e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.44827585816383364, + "step": 45205 + }, + { + "epoch": 0.045535992167930216, + "grad_norm": 14.22381548008495, + "learning_rate": 4.5535120762242413e-05, + "loss": 2.9581, + "mean_token_accuracy": 0.3379310369491577, + "step": 45210 + }, + { + "epoch": 0.04554102822103438, + "grad_norm": 14.57102630617339, + "learning_rate": 4.554015671897347e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.43448275327682495, + "step": 45215 + }, + { + "epoch": 0.045546064274138556, + "grad_norm": 12.507942353808424, + "learning_rate": 4.554519267570453e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.39310344457626345, + "step": 45220 + }, + { + "epoch": 0.04555110032724273, + "grad_norm": 14.5910206215796, + "learning_rate": 4.555022863243559e-05, + "loss": 2.4346, + "mean_token_accuracy": 0.4103448331356049, + "step": 45225 + }, + { + "epoch": 0.045556136380346904, + "grad_norm": 11.991073843444786, + "learning_rate": 4.555526458916665e-05, + "loss": 2.555, + "mean_token_accuracy": 0.4379310369491577, + "step": 45230 + }, + { + "epoch": 0.04556117243345108, + "grad_norm": 15.996963708223808, + "learning_rate": 4.556030054589771e-05, + "loss": 2.8072, + "mean_token_accuracy": 0.3517241388559341, + "step": 45235 + }, + { + "epoch": 0.04556620848655525, + "grad_norm": 10.909285878807983, + "learning_rate": 4.556533650262877e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.4413793087005615, + "step": 45240 + }, + { + "epoch": 0.045571244539659425, + "grad_norm": 10.625567947163363, + "learning_rate": 4.5570372459359835e-05, + "loss": 2.6194, + "mean_token_accuracy": 0.4137930989265442, + "step": 45245 + }, + { + "epoch": 0.04557628059276359, + "grad_norm": 9.362659009210685, + "learning_rate": 4.5575408416090894e-05, + "loss": 2.662, + "mean_token_accuracy": 0.41379310488700866, + "step": 45250 + }, + { + "epoch": 0.045581316645867766, + "grad_norm": 12.567007236797476, + "learning_rate": 4.5580444372821953e-05, + "loss": 2.824, + "mean_token_accuracy": 0.4000000059604645, + "step": 45255 + }, + { + "epoch": 0.04558635269897194, + "grad_norm": 16.84349586642506, + "learning_rate": 4.558548032955301e-05, + "loss": 2.6391, + "mean_token_accuracy": 0.39655171930789945, + "step": 45260 + }, + { + "epoch": 0.04559138875207611, + "grad_norm": 18.160329500038696, + "learning_rate": 4.5590516286284065e-05, + "loss": 2.916, + "mean_token_accuracy": 0.39999998807907106, + "step": 45265 + }, + { + "epoch": 0.04559642480518029, + "grad_norm": 11.510593555131928, + "learning_rate": 4.559555224301513e-05, + "loss": 2.5816, + "mean_token_accuracy": 0.42413792610168455, + "step": 45270 + }, + { + "epoch": 0.04560146085828446, + "grad_norm": 11.64209086899516, + "learning_rate": 4.560058819974619e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.39310344457626345, + "step": 45275 + }, + { + "epoch": 0.045606496911388635, + "grad_norm": 12.3352603329827, + "learning_rate": 4.560562415647725e-05, + "loss": 2.8205, + "mean_token_accuracy": 0.3946763455867767, + "step": 45280 + }, + { + "epoch": 0.0456115329644928, + "grad_norm": 13.91472397634847, + "learning_rate": 4.561066011320831e-05, + "loss": 2.8784, + "mean_token_accuracy": 0.3620689570903778, + "step": 45285 + }, + { + "epoch": 0.045616569017596975, + "grad_norm": 12.334802760260308, + "learning_rate": 4.561569606993937e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.42068966031074523, + "step": 45290 + }, + { + "epoch": 0.04562160507070115, + "grad_norm": 16.51528036705448, + "learning_rate": 4.562073202667043e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.3931034505367279, + "step": 45295 + }, + { + "epoch": 0.04562664112380532, + "grad_norm": 11.096902800062322, + "learning_rate": 4.5625767983401493e-05, + "loss": 2.7286, + "mean_token_accuracy": 0.38965516686439516, + "step": 45300 + }, + { + "epoch": 0.045631677176909496, + "grad_norm": 14.76987054674303, + "learning_rate": 4.5630803940132546e-05, + "loss": 2.817, + "mean_token_accuracy": 0.33448275923728943, + "step": 45305 + }, + { + "epoch": 0.04563671323001367, + "grad_norm": 14.965456859411088, + "learning_rate": 4.5635839896863605e-05, + "loss": 3.4042, + "mean_token_accuracy": 0.31724137961864474, + "step": 45310 + }, + { + "epoch": 0.045641749283117844, + "grad_norm": 13.525317413286624, + "learning_rate": 4.5640875853594664e-05, + "loss": 2.8693, + "mean_token_accuracy": 0.34827586114406583, + "step": 45315 + }, + { + "epoch": 0.04564678533622201, + "grad_norm": 14.514024973758806, + "learning_rate": 4.5645911810325724e-05, + "loss": 2.601, + "mean_token_accuracy": 0.3896551728248596, + "step": 45320 + }, + { + "epoch": 0.045651821389326185, + "grad_norm": 12.470327250810271, + "learning_rate": 4.565094776705679e-05, + "loss": 2.8991, + "mean_token_accuracy": 0.3620689630508423, + "step": 45325 + }, + { + "epoch": 0.04565685744243036, + "grad_norm": 14.101465357793328, + "learning_rate": 4.565598372378785e-05, + "loss": 2.4769, + "mean_token_accuracy": 0.4034482717514038, + "step": 45330 + }, + { + "epoch": 0.04566189349553453, + "grad_norm": 16.096599093008184, + "learning_rate": 4.566101968051891e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.38620689511299133, + "step": 45335 + }, + { + "epoch": 0.045666929548638706, + "grad_norm": 11.885569712547452, + "learning_rate": 4.566605563724997e-05, + "loss": 2.482, + "mean_token_accuracy": 0.3896551728248596, + "step": 45340 + }, + { + "epoch": 0.04567196560174288, + "grad_norm": 14.410809487306187, + "learning_rate": 4.567109159398103e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.45517241954803467, + "step": 45345 + }, + { + "epoch": 0.04567700165484705, + "grad_norm": 11.952594174349422, + "learning_rate": 4.5676127550712086e-05, + "loss": 2.6135, + "mean_token_accuracy": 0.4, + "step": 45350 + }, + { + "epoch": 0.04568203770795122, + "grad_norm": 12.812536415361809, + "learning_rate": 4.5681163507443145e-05, + "loss": 2.5453, + "mean_token_accuracy": 0.41379310488700866, + "step": 45355 + }, + { + "epoch": 0.045687073761055394, + "grad_norm": 13.202700090104042, + "learning_rate": 4.5686199464174205e-05, + "loss": 2.376, + "mean_token_accuracy": 0.4068965494632721, + "step": 45360 + }, + { + "epoch": 0.04569210981415957, + "grad_norm": 11.165970616648178, + "learning_rate": 4.5691235420905264e-05, + "loss": 2.6221, + "mean_token_accuracy": 0.4034482717514038, + "step": 45365 + }, + { + "epoch": 0.04569714586726374, + "grad_norm": 23.302681403952413, + "learning_rate": 4.569627137763632e-05, + "loss": 2.382, + "mean_token_accuracy": 0.41724138259887694, + "step": 45370 + }, + { + "epoch": 0.045702181920367915, + "grad_norm": 11.15379949212112, + "learning_rate": 4.570130733436738e-05, + "loss": 2.2873, + "mean_token_accuracy": 0.4571687877178192, + "step": 45375 + }, + { + "epoch": 0.04570721797347209, + "grad_norm": 12.152728765855322, + "learning_rate": 4.570634329109845e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.38965516686439516, + "step": 45380 + }, + { + "epoch": 0.04571225402657626, + "grad_norm": 15.29930951775352, + "learning_rate": 4.571137924782951e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.39655172228813174, + "step": 45385 + }, + { + "epoch": 0.04571729007968043, + "grad_norm": 9.648632861139788, + "learning_rate": 4.571641520456057e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.44827587008476255, + "step": 45390 + }, + { + "epoch": 0.0457223261327846, + "grad_norm": 16.86753401078081, + "learning_rate": 4.5721451161291626e-05, + "loss": 2.5552, + "mean_token_accuracy": 0.3586206793785095, + "step": 45395 + }, + { + "epoch": 0.04572736218588878, + "grad_norm": 20.935801595693228, + "learning_rate": 4.572648711802268e-05, + "loss": 3.0217, + "mean_token_accuracy": 0.3482758641242981, + "step": 45400 + }, + { + "epoch": 0.04573239823899295, + "grad_norm": 12.513682665244652, + "learning_rate": 4.5731523074753745e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.422202056646347, + "step": 45405 + }, + { + "epoch": 0.045737434292097125, + "grad_norm": 17.436336117296158, + "learning_rate": 4.5736559031484804e-05, + "loss": 3.1206, + "mean_token_accuracy": 0.34137930870056155, + "step": 45410 + }, + { + "epoch": 0.0457424703452013, + "grad_norm": 15.352637817630113, + "learning_rate": 4.574159498821586e-05, + "loss": 3.086, + "mean_token_accuracy": 0.3310344755649567, + "step": 45415 + }, + { + "epoch": 0.04574750639830547, + "grad_norm": 19.698937123809284, + "learning_rate": 4.574663094494692e-05, + "loss": 2.5823, + "mean_token_accuracy": 0.4068965494632721, + "step": 45420 + }, + { + "epoch": 0.04575254245140964, + "grad_norm": 15.238568703625944, + "learning_rate": 4.575166690167798e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.3620689630508423, + "step": 45425 + }, + { + "epoch": 0.04575757850451381, + "grad_norm": 13.403932793058017, + "learning_rate": 4.575670285840905e-05, + "loss": 2.4071, + "mean_token_accuracy": 0.4103448331356049, + "step": 45430 + }, + { + "epoch": 0.04576261455761799, + "grad_norm": 14.554827372958975, + "learning_rate": 4.576173881514011e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.3999999940395355, + "step": 45435 + }, + { + "epoch": 0.04576765061072216, + "grad_norm": 11.762121879511273, + "learning_rate": 4.576677477187116e-05, + "loss": 2.7936, + "mean_token_accuracy": 0.4068965494632721, + "step": 45440 + }, + { + "epoch": 0.045772686663826334, + "grad_norm": 11.798432969573954, + "learning_rate": 4.577181072860222e-05, + "loss": 2.46, + "mean_token_accuracy": 0.4275862157344818, + "step": 45445 + }, + { + "epoch": 0.04577772271693051, + "grad_norm": 15.66176989713497, + "learning_rate": 4.577684668533328e-05, + "loss": 2.8727, + "mean_token_accuracy": 0.3448275804519653, + "step": 45450 + }, + { + "epoch": 0.04578275877003468, + "grad_norm": 12.09495091439029, + "learning_rate": 4.5781882642064344e-05, + "loss": 2.6136, + "mean_token_accuracy": 0.3793103456497192, + "step": 45455 + }, + { + "epoch": 0.04578779482313885, + "grad_norm": 14.498910632951626, + "learning_rate": 4.57869185987954e-05, + "loss": 2.8005, + "mean_token_accuracy": 0.33103448152542114, + "step": 45460 + }, + { + "epoch": 0.04579283087624302, + "grad_norm": 11.130412154352795, + "learning_rate": 4.579195455552646e-05, + "loss": 2.2109, + "mean_token_accuracy": 0.482758617401123, + "step": 45465 + }, + { + "epoch": 0.045797866929347196, + "grad_norm": 14.015365356305098, + "learning_rate": 4.579699051225752e-05, + "loss": 2.8315, + "mean_token_accuracy": 0.3931034505367279, + "step": 45470 + }, + { + "epoch": 0.04580290298245137, + "grad_norm": 12.36931362450068, + "learning_rate": 4.580202646898858e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.3931034505367279, + "step": 45475 + }, + { + "epoch": 0.045807939035555544, + "grad_norm": 13.486536390372022, + "learning_rate": 4.580706242571964e-05, + "loss": 2.8581, + "mean_token_accuracy": 0.3344827651977539, + "step": 45480 + }, + { + "epoch": 0.04581297508865972, + "grad_norm": 13.150043921388892, + "learning_rate": 4.58120983824507e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.3793103456497192, + "step": 45485 + }, + { + "epoch": 0.04581801114176389, + "grad_norm": 12.963822154742406, + "learning_rate": 4.581713433918176e-05, + "loss": 2.4885, + "mean_token_accuracy": 0.4310344815254211, + "step": 45490 + }, + { + "epoch": 0.04582304719486806, + "grad_norm": 11.205178569388478, + "learning_rate": 4.582217029591282e-05, + "loss": 2.4859, + "mean_token_accuracy": 0.45517241954803467, + "step": 45495 + }, + { + "epoch": 0.04582808324797223, + "grad_norm": 12.209173521188307, + "learning_rate": 4.582720625264388e-05, + "loss": 3.1721, + "mean_token_accuracy": 0.3724137872457504, + "step": 45500 + }, + { + "epoch": 0.045833119301076405, + "grad_norm": 12.98699211196934, + "learning_rate": 4.5832242209374936e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.3965517282485962, + "step": 45505 + }, + { + "epoch": 0.04583815535418058, + "grad_norm": 11.009061285521462, + "learning_rate": 4.5837278166106e-05, + "loss": 2.1259, + "mean_token_accuracy": 0.4551724076271057, + "step": 45510 + }, + { + "epoch": 0.04584319140728475, + "grad_norm": 11.623508660644774, + "learning_rate": 4.584231412283706e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.3724137932062149, + "step": 45515 + }, + { + "epoch": 0.04584822746038893, + "grad_norm": 17.628579591246982, + "learning_rate": 4.584735007956812e-05, + "loss": 2.814, + "mean_token_accuracy": 0.4034482717514038, + "step": 45520 + }, + { + "epoch": 0.0458532635134931, + "grad_norm": 13.992630195178101, + "learning_rate": 4.585238603629918e-05, + "loss": 2.5112, + "mean_token_accuracy": 0.43793103098869324, + "step": 45525 + }, + { + "epoch": 0.04585829956659727, + "grad_norm": 11.968142943325905, + "learning_rate": 4.585742199303024e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.4379310250282288, + "step": 45530 + }, + { + "epoch": 0.04586333561970144, + "grad_norm": 15.163929894168575, + "learning_rate": 4.58624579497613e-05, + "loss": 2.6151, + "mean_token_accuracy": 0.3862069010734558, + "step": 45535 + }, + { + "epoch": 0.045868371672805615, + "grad_norm": 15.187845828103997, + "learning_rate": 4.586749390649236e-05, + "loss": 2.8985, + "mean_token_accuracy": 0.32758620381355286, + "step": 45540 + }, + { + "epoch": 0.04587340772590979, + "grad_norm": 31.220236831813427, + "learning_rate": 4.587252986322342e-05, + "loss": 2.7791, + "mean_token_accuracy": 0.35172412991523744, + "step": 45545 + }, + { + "epoch": 0.04587844377901396, + "grad_norm": 16.300278851084947, + "learning_rate": 4.5877565819954476e-05, + "loss": 3.1288, + "mean_token_accuracy": 0.35862069129943847, + "step": 45550 + }, + { + "epoch": 0.045883479832118136, + "grad_norm": 26.21242639945067, + "learning_rate": 4.5882601776685536e-05, + "loss": 2.9057, + "mean_token_accuracy": 0.35172412991523744, + "step": 45555 + }, + { + "epoch": 0.04588851588522231, + "grad_norm": 15.053394884701435, + "learning_rate": 4.5887637733416595e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.40689654350280763, + "step": 45560 + }, + { + "epoch": 0.04589355193832648, + "grad_norm": 13.403645504350676, + "learning_rate": 4.589267369014766e-05, + "loss": 2.2811, + "mean_token_accuracy": 0.47785844206809996, + "step": 45565 + }, + { + "epoch": 0.04589858799143065, + "grad_norm": 16.16527277649867, + "learning_rate": 4.589770964687872e-05, + "loss": 2.6712, + "mean_token_accuracy": 0.3655172407627106, + "step": 45570 + }, + { + "epoch": 0.045903624044534824, + "grad_norm": 14.20188482430698, + "learning_rate": 4.590274560360977e-05, + "loss": 2.5316, + "mean_token_accuracy": 0.4172413766384125, + "step": 45575 + }, + { + "epoch": 0.045908660097639, + "grad_norm": 11.75065649183626, + "learning_rate": 4.590778156034083e-05, + "loss": 2.2585, + "mean_token_accuracy": 0.44137930274009707, + "step": 45580 + }, + { + "epoch": 0.04591369615074317, + "grad_norm": 16.83404339558262, + "learning_rate": 4.591281751707189e-05, + "loss": 2.6975, + "mean_token_accuracy": 0.4047791838645935, + "step": 45585 + }, + { + "epoch": 0.045918732203847346, + "grad_norm": 15.595834662706329, + "learning_rate": 4.591785347380296e-05, + "loss": 2.426, + "mean_token_accuracy": 0.39310344457626345, + "step": 45590 + }, + { + "epoch": 0.04592376825695152, + "grad_norm": 12.95339417384313, + "learning_rate": 4.5922889430534017e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.43103448748588563, + "step": 45595 + }, + { + "epoch": 0.045928804310055686, + "grad_norm": 14.118091642364684, + "learning_rate": 4.5927925387265076e-05, + "loss": 2.9484, + "mean_token_accuracy": 0.33103448450565337, + "step": 45600 + }, + { + "epoch": 0.04593384036315986, + "grad_norm": 15.258472165997654, + "learning_rate": 4.5932961343996135e-05, + "loss": 2.8437, + "mean_token_accuracy": 0.41917725205421447, + "step": 45605 + }, + { + "epoch": 0.045938876416264034, + "grad_norm": 27.655858364668973, + "learning_rate": 4.5937997300727194e-05, + "loss": 3.2397, + "mean_token_accuracy": 0.3620689630508423, + "step": 45610 + }, + { + "epoch": 0.04594391246936821, + "grad_norm": 12.893115737990815, + "learning_rate": 4.5943033257458254e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.4206896543502808, + "step": 45615 + }, + { + "epoch": 0.04594894852247238, + "grad_norm": 15.560259802787362, + "learning_rate": 4.594806921418931e-05, + "loss": 2.6624, + "mean_token_accuracy": 0.4068965554237366, + "step": 45620 + }, + { + "epoch": 0.045953984575576555, + "grad_norm": 13.783219609655367, + "learning_rate": 4.595310517092037e-05, + "loss": 2.7483, + "mean_token_accuracy": 0.3827586233615875, + "step": 45625 + }, + { + "epoch": 0.04595902062868073, + "grad_norm": 12.126746903240528, + "learning_rate": 4.595814112765143e-05, + "loss": 2.5374, + "mean_token_accuracy": 0.37241379022598264, + "step": 45630 + }, + { + "epoch": 0.045964056681784896, + "grad_norm": 35.467791217302356, + "learning_rate": 4.596317708438249e-05, + "loss": 2.7132, + "mean_token_accuracy": 0.3689655065536499, + "step": 45635 + }, + { + "epoch": 0.04596909273488907, + "grad_norm": 15.534479093949958, + "learning_rate": 4.596821304111355e-05, + "loss": 2.9164, + "mean_token_accuracy": 0.32758620381355286, + "step": 45640 + }, + { + "epoch": 0.04597412878799324, + "grad_norm": 13.767392553279088, + "learning_rate": 4.5973248997844616e-05, + "loss": 2.7214, + "mean_token_accuracy": 0.36896551847457887, + "step": 45645 + }, + { + "epoch": 0.04597916484109742, + "grad_norm": 12.815546323290299, + "learning_rate": 4.5978284954575675e-05, + "loss": 2.3025, + "mean_token_accuracy": 0.47586206197738645, + "step": 45650 + }, + { + "epoch": 0.04598420089420159, + "grad_norm": 10.807010621174983, + "learning_rate": 4.5983320911306734e-05, + "loss": 2.5834, + "mean_token_accuracy": 0.3931034505367279, + "step": 45655 + }, + { + "epoch": 0.045989236947305764, + "grad_norm": 13.438594344234566, + "learning_rate": 4.5988356868037794e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.3793103456497192, + "step": 45660 + }, + { + "epoch": 0.04599427300040994, + "grad_norm": 18.72233270912934, + "learning_rate": 4.599339282476885e-05, + "loss": 2.422, + "mean_token_accuracy": 0.3896551728248596, + "step": 45665 + }, + { + "epoch": 0.045999309053514105, + "grad_norm": 11.108895225874964, + "learning_rate": 4.599842878149991e-05, + "loss": 2.1143, + "mean_token_accuracy": 0.42413793206214906, + "step": 45670 + }, + { + "epoch": 0.04600434510661828, + "grad_norm": 13.162516207680952, + "learning_rate": 4.600346473823097e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.44827585816383364, + "step": 45675 + }, + { + "epoch": 0.04600938115972245, + "grad_norm": 14.830432889166389, + "learning_rate": 4.600850069496203e-05, + "loss": 2.1864, + "mean_token_accuracy": 0.42068964838981626, + "step": 45680 + }, + { + "epoch": 0.046014417212826626, + "grad_norm": 18.705022670216028, + "learning_rate": 4.601353665169309e-05, + "loss": 2.8639, + "mean_token_accuracy": 0.334482753276825, + "step": 45685 + }, + { + "epoch": 0.0460194532659308, + "grad_norm": 13.460681302812048, + "learning_rate": 4.601857260842415e-05, + "loss": 2.7348, + "mean_token_accuracy": 0.3793103516101837, + "step": 45690 + }, + { + "epoch": 0.046024489319034974, + "grad_norm": 15.797978028441186, + "learning_rate": 4.6023608565155215e-05, + "loss": 2.5959, + "mean_token_accuracy": 0.334482753276825, + "step": 45695 + }, + { + "epoch": 0.04602952537213915, + "grad_norm": 15.57258454902569, + "learning_rate": 4.6028644521886274e-05, + "loss": 2.5973, + "mean_token_accuracy": 0.3758620619773865, + "step": 45700 + }, + { + "epoch": 0.046034561425243314, + "grad_norm": 10.419813635174432, + "learning_rate": 4.6033680478617334e-05, + "loss": 2.3198, + "mean_token_accuracy": 0.4310344815254211, + "step": 45705 + }, + { + "epoch": 0.04603959747834749, + "grad_norm": 11.78540913536877, + "learning_rate": 4.6038716435348386e-05, + "loss": 2.387, + "mean_token_accuracy": 0.4310344815254211, + "step": 45710 + }, + { + "epoch": 0.04604463353145166, + "grad_norm": 13.189267790984745, + "learning_rate": 4.6043752392079445e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.4137930989265442, + "step": 45715 + }, + { + "epoch": 0.046049669584555836, + "grad_norm": 17.653667702615536, + "learning_rate": 4.6048788348810505e-05, + "loss": 2.8459, + "mean_token_accuracy": 0.3793103456497192, + "step": 45720 + }, + { + "epoch": 0.04605470563766001, + "grad_norm": 12.384130938014792, + "learning_rate": 4.605382430554157e-05, + "loss": 2.6283, + "mean_token_accuracy": 0.4034482777118683, + "step": 45725 + }, + { + "epoch": 0.04605974169076418, + "grad_norm": 11.632733570756454, + "learning_rate": 4.605886026227263e-05, + "loss": 2.3427, + "mean_token_accuracy": 0.4448275864124298, + "step": 45730 + }, + { + "epoch": 0.04606477774386836, + "grad_norm": 11.743119721745678, + "learning_rate": 4.606389621900369e-05, + "loss": 2.316, + "mean_token_accuracy": 0.44827585816383364, + "step": 45735 + }, + { + "epoch": 0.046069813796972524, + "grad_norm": 15.854647080869608, + "learning_rate": 4.606893217573475e-05, + "loss": 2.5813, + "mean_token_accuracy": 0.42758620381355283, + "step": 45740 + }, + { + "epoch": 0.0460748498500767, + "grad_norm": 13.235535356141703, + "learning_rate": 4.607396813246581e-05, + "loss": 2.4936, + "mean_token_accuracy": 0.458620685338974, + "step": 45745 + }, + { + "epoch": 0.04607988590318087, + "grad_norm": 15.36839971987012, + "learning_rate": 4.607900408919687e-05, + "loss": 3.2742, + "mean_token_accuracy": 0.3482758581638336, + "step": 45750 + }, + { + "epoch": 0.046084921956285045, + "grad_norm": 14.199701946720843, + "learning_rate": 4.6084040045927926e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.37241379022598264, + "step": 45755 + }, + { + "epoch": 0.04608995800938922, + "grad_norm": 12.30201440937251, + "learning_rate": 4.6089076002658985e-05, + "loss": 2.698, + "mean_token_accuracy": 0.38275861740112305, + "step": 45760 + }, + { + "epoch": 0.04609499406249339, + "grad_norm": 14.761749768972521, + "learning_rate": 4.6094111959390045e-05, + "loss": 2.7797, + "mean_token_accuracy": 0.3551724135875702, + "step": 45765 + }, + { + "epoch": 0.046100030115597566, + "grad_norm": 11.639297506318416, + "learning_rate": 4.6099147916121104e-05, + "loss": 2.496, + "mean_token_accuracy": 0.4103448331356049, + "step": 45770 + }, + { + "epoch": 0.04610506616870173, + "grad_norm": 13.361179659944613, + "learning_rate": 4.610418387285217e-05, + "loss": 2.7539, + "mean_token_accuracy": 0.379310342669487, + "step": 45775 + }, + { + "epoch": 0.04611010222180591, + "grad_norm": 14.192818919801738, + "learning_rate": 4.610921982958323e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.40689654350280763, + "step": 45780 + }, + { + "epoch": 0.04611513827491008, + "grad_norm": 11.533548260031099, + "learning_rate": 4.611425578631429e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.4172413766384125, + "step": 45785 + }, + { + "epoch": 0.046120174328014255, + "grad_norm": 12.391252335039688, + "learning_rate": 4.611929174304535e-05, + "loss": 2.4831, + "mean_token_accuracy": 0.3655172407627106, + "step": 45790 + }, + { + "epoch": 0.04612521038111843, + "grad_norm": 12.310805400359408, + "learning_rate": 4.612432769977641e-05, + "loss": 2.7256, + "mean_token_accuracy": 0.41379310488700866, + "step": 45795 + }, + { + "epoch": 0.0461302464342226, + "grad_norm": 12.882633636133292, + "learning_rate": 4.6129363656507466e-05, + "loss": 2.4285, + "mean_token_accuracy": 0.46896551847457885, + "step": 45800 + }, + { + "epoch": 0.04613528248732677, + "grad_norm": 12.031589951086408, + "learning_rate": 4.6134399613238525e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.38620689511299133, + "step": 45805 + }, + { + "epoch": 0.04614031854043094, + "grad_norm": 13.095266145802576, + "learning_rate": 4.6139435569969585e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.358620697259903, + "step": 45810 + }, + { + "epoch": 0.046145354593535116, + "grad_norm": 14.355599774014687, + "learning_rate": 4.6144471526700644e-05, + "loss": 2.6419, + "mean_token_accuracy": 0.3793103516101837, + "step": 45815 + }, + { + "epoch": 0.04615039064663929, + "grad_norm": 16.600529842860706, + "learning_rate": 4.61495074834317e-05, + "loss": 2.7405, + "mean_token_accuracy": 0.35662431716918946, + "step": 45820 + }, + { + "epoch": 0.046155426699743464, + "grad_norm": 10.907384480275216, + "learning_rate": 4.615454344016276e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.41724138259887694, + "step": 45825 + }, + { + "epoch": 0.04616046275284764, + "grad_norm": 12.220516256346185, + "learning_rate": 4.615957939689383e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.3724137872457504, + "step": 45830 + }, + { + "epoch": 0.04616549880595181, + "grad_norm": 12.01460540365314, + "learning_rate": 4.616461535362489e-05, + "loss": 2.401, + "mean_token_accuracy": 0.40344828367233276, + "step": 45835 + }, + { + "epoch": 0.04617053485905598, + "grad_norm": 14.88337025700096, + "learning_rate": 4.616965131035594e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.37241379022598264, + "step": 45840 + }, + { + "epoch": 0.04617557091216015, + "grad_norm": 12.086693837324553, + "learning_rate": 4.6174687267087e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.4620689690113068, + "step": 45845 + }, + { + "epoch": 0.046180606965264326, + "grad_norm": 13.645888427371792, + "learning_rate": 4.617972322381806e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.4206896543502808, + "step": 45850 + }, + { + "epoch": 0.0461856430183685, + "grad_norm": 16.31438302023713, + "learning_rate": 4.6184759180549125e-05, + "loss": 2.7061, + "mean_token_accuracy": 0.4000000059604645, + "step": 45855 + }, + { + "epoch": 0.04619067907147267, + "grad_norm": 16.258522309892264, + "learning_rate": 4.6189795137280184e-05, + "loss": 2.8062, + "mean_token_accuracy": 0.37931033968925476, + "step": 45860 + }, + { + "epoch": 0.04619571512457685, + "grad_norm": 13.843857971600505, + "learning_rate": 4.619483109401124e-05, + "loss": 2.6559, + "mean_token_accuracy": 0.3655172407627106, + "step": 45865 + }, + { + "epoch": 0.04620075117768102, + "grad_norm": 11.820816346128776, + "learning_rate": 4.61998670507423e-05, + "loss": 2.5112, + "mean_token_accuracy": 0.3827586233615875, + "step": 45870 + }, + { + "epoch": 0.04620578723078519, + "grad_norm": 16.923931552518596, + "learning_rate": 4.620490300747336e-05, + "loss": 2.4844, + "mean_token_accuracy": 0.40508167147636415, + "step": 45875 + }, + { + "epoch": 0.04621082328388936, + "grad_norm": 13.022211695787268, + "learning_rate": 4.620993896420443e-05, + "loss": 2.5294, + "mean_token_accuracy": 0.4, + "step": 45880 + }, + { + "epoch": 0.046215859336993535, + "grad_norm": 14.568218355406565, + "learning_rate": 4.621497492093548e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.42068966627120974, + "step": 45885 + }, + { + "epoch": 0.04622089539009771, + "grad_norm": 15.08965299059806, + "learning_rate": 4.622001087766654e-05, + "loss": 2.6306, + "mean_token_accuracy": 0.38275861740112305, + "step": 45890 + }, + { + "epoch": 0.04622593144320188, + "grad_norm": 12.658108172795455, + "learning_rate": 4.62250468343976e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.36896551251411436, + "step": 45895 + }, + { + "epoch": 0.04623096749630606, + "grad_norm": 28.73455812794667, + "learning_rate": 4.623008279112866e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4620689630508423, + "step": 45900 + }, + { + "epoch": 0.04623600354941023, + "grad_norm": 13.900968562454844, + "learning_rate": 4.623511874785972e-05, + "loss": 2.5235, + "mean_token_accuracy": 0.4103448331356049, + "step": 45905 + }, + { + "epoch": 0.0462410396025144, + "grad_norm": 35.17792021822705, + "learning_rate": 4.624015470459078e-05, + "loss": 2.8049, + "mean_token_accuracy": 0.36896551847457887, + "step": 45910 + }, + { + "epoch": 0.04624607565561857, + "grad_norm": 16.21915073681264, + "learning_rate": 4.624519066132184e-05, + "loss": 2.4903, + "mean_token_accuracy": 0.3999999940395355, + "step": 45915 + }, + { + "epoch": 0.046251111708722745, + "grad_norm": 14.413071856923924, + "learning_rate": 4.62502266180529e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.42413793206214906, + "step": 45920 + }, + { + "epoch": 0.04625614776182692, + "grad_norm": 12.162893616460925, + "learning_rate": 4.625526257478396e-05, + "loss": 2.57, + "mean_token_accuracy": 0.38965516090393065, + "step": 45925 + }, + { + "epoch": 0.04626118381493109, + "grad_norm": 16.089058240117364, + "learning_rate": 4.626029853151502e-05, + "loss": 2.9962, + "mean_token_accuracy": 0.3482758641242981, + "step": 45930 + }, + { + "epoch": 0.046266219868035266, + "grad_norm": 13.718959636544689, + "learning_rate": 4.626533448824608e-05, + "loss": 3.0507, + "mean_token_accuracy": 0.36551724672317504, + "step": 45935 + }, + { + "epoch": 0.04627125592113944, + "grad_norm": 44.33563518514151, + "learning_rate": 4.627037044497714e-05, + "loss": 2.7675, + "mean_token_accuracy": 0.3862069010734558, + "step": 45940 + }, + { + "epoch": 0.04627629197424361, + "grad_norm": 13.861620347278153, + "learning_rate": 4.62754064017082e-05, + "loss": 2.3097, + "mean_token_accuracy": 0.447065943479538, + "step": 45945 + }, + { + "epoch": 0.04628132802734778, + "grad_norm": 14.521775461305122, + "learning_rate": 4.628044235843926e-05, + "loss": 2.3223, + "mean_token_accuracy": 0.4620689690113068, + "step": 45950 + }, + { + "epoch": 0.046286364080451954, + "grad_norm": 17.41446443111772, + "learning_rate": 4.628547831517032e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.4562807857990265, + "step": 45955 + }, + { + "epoch": 0.04629140013355613, + "grad_norm": 14.534815184253757, + "learning_rate": 4.629051427190138e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.4157894730567932, + "step": 45960 + }, + { + "epoch": 0.0462964361866603, + "grad_norm": 16.439437777946964, + "learning_rate": 4.629555022863244e-05, + "loss": 2.9946, + "mean_token_accuracy": 0.37586207389831544, + "step": 45965 + }, + { + "epoch": 0.046301472239764475, + "grad_norm": 13.039451096799368, + "learning_rate": 4.63005861853635e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.42068964838981626, + "step": 45970 + }, + { + "epoch": 0.04630650829286865, + "grad_norm": 12.840304665819563, + "learning_rate": 4.6305622142094554e-05, + "loss": 2.5294, + "mean_token_accuracy": 0.37241379022598264, + "step": 45975 + }, + { + "epoch": 0.046311544345972816, + "grad_norm": 15.432918138778208, + "learning_rate": 4.631065809882561e-05, + "loss": 2.7596, + "mean_token_accuracy": 0.37241379618644715, + "step": 45980 + }, + { + "epoch": 0.04631658039907699, + "grad_norm": 15.270511119006484, + "learning_rate": 4.631569405555667e-05, + "loss": 2.5033, + "mean_token_accuracy": 0.3862069010734558, + "step": 45985 + }, + { + "epoch": 0.046321616452181164, + "grad_norm": 13.167415132014218, + "learning_rate": 4.632073001228774e-05, + "loss": 2.6223, + "mean_token_accuracy": 0.3482758581638336, + "step": 45990 + }, + { + "epoch": 0.04632665250528534, + "grad_norm": 13.633104861808908, + "learning_rate": 4.63257659690188e-05, + "loss": 2.7588, + "mean_token_accuracy": 0.3620689630508423, + "step": 45995 + }, + { + "epoch": 0.04633168855838951, + "grad_norm": 11.857464468974793, + "learning_rate": 4.633080192574986e-05, + "loss": 2.5808, + "mean_token_accuracy": 0.40689654350280763, + "step": 46000 + }, + { + "epoch": 0.046336724611493685, + "grad_norm": 24.237490066125293, + "learning_rate": 4.6335837882480916e-05, + "loss": 2.6148, + "mean_token_accuracy": 0.4344827592372894, + "step": 46005 + }, + { + "epoch": 0.04634176066459786, + "grad_norm": 14.701628101743385, + "learning_rate": 4.6340873839211975e-05, + "loss": 2.6068, + "mean_token_accuracy": 0.39503931999206543, + "step": 46010 + }, + { + "epoch": 0.046346796717702025, + "grad_norm": 12.7457082523556, + "learning_rate": 4.6345909795943034e-05, + "loss": 2.7071, + "mean_token_accuracy": 0.37931033968925476, + "step": 46015 + }, + { + "epoch": 0.0463518327708062, + "grad_norm": 9.676119655095004, + "learning_rate": 4.6350945752674094e-05, + "loss": 2.6978, + "mean_token_accuracy": 0.41966121792793276, + "step": 46020 + }, + { + "epoch": 0.04635686882391037, + "grad_norm": 22.42029528285458, + "learning_rate": 4.635598170940515e-05, + "loss": 2.694, + "mean_token_accuracy": 0.4206896543502808, + "step": 46025 + }, + { + "epoch": 0.04636190487701455, + "grad_norm": 12.430331594229502, + "learning_rate": 4.636101766613621e-05, + "loss": 2.3322, + "mean_token_accuracy": 0.4310344815254211, + "step": 46030 + }, + { + "epoch": 0.04636694093011872, + "grad_norm": 13.531095412882072, + "learning_rate": 4.636605362286727e-05, + "loss": 2.53, + "mean_token_accuracy": 0.38620689511299133, + "step": 46035 + }, + { + "epoch": 0.046371976983222894, + "grad_norm": 11.199041581310555, + "learning_rate": 4.637108957959834e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.3758620709180832, + "step": 46040 + }, + { + "epoch": 0.04637701303632707, + "grad_norm": 10.566736499541863, + "learning_rate": 4.63761255363294e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.4329098641872406, + "step": 46045 + }, + { + "epoch": 0.046382049089431235, + "grad_norm": 15.958288357559006, + "learning_rate": 4.6381161493060456e-05, + "loss": 2.5156, + "mean_token_accuracy": 0.41379310488700866, + "step": 46050 + }, + { + "epoch": 0.04638708514253541, + "grad_norm": 12.738298865562033, + "learning_rate": 4.6386197449791515e-05, + "loss": 2.334, + "mean_token_accuracy": 0.41034482717514037, + "step": 46055 + }, + { + "epoch": 0.04639212119563958, + "grad_norm": 18.100403347697107, + "learning_rate": 4.6391233406522574e-05, + "loss": 2.6332, + "mean_token_accuracy": 0.42758620977401735, + "step": 46060 + }, + { + "epoch": 0.046397157248743756, + "grad_norm": 13.061823492778986, + "learning_rate": 4.6396269363253634e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.38275861740112305, + "step": 46065 + }, + { + "epoch": 0.04640219330184793, + "grad_norm": 12.597300846567174, + "learning_rate": 4.640130531998469e-05, + "loss": 2.4289, + "mean_token_accuracy": 0.3758620709180832, + "step": 46070 + }, + { + "epoch": 0.046407229354952104, + "grad_norm": 16.4216660964597, + "learning_rate": 4.640634127671575e-05, + "loss": 2.7114, + "mean_token_accuracy": 0.36896551847457887, + "step": 46075 + }, + { + "epoch": 0.04641226540805628, + "grad_norm": 15.040774392297783, + "learning_rate": 4.641137723344681e-05, + "loss": 2.9222, + "mean_token_accuracy": 0.4034482717514038, + "step": 46080 + }, + { + "epoch": 0.046417301461160444, + "grad_norm": 11.536233569369402, + "learning_rate": 4.641641319017787e-05, + "loss": 2.4877, + "mean_token_accuracy": 0.41034482717514037, + "step": 46085 + }, + { + "epoch": 0.04642233751426462, + "grad_norm": 30.83155244510312, + "learning_rate": 4.642144914690893e-05, + "loss": 2.6535, + "mean_token_accuracy": 0.3965517163276672, + "step": 46090 + }, + { + "epoch": 0.04642737356736879, + "grad_norm": 14.845675195096591, + "learning_rate": 4.6426485103639996e-05, + "loss": 2.2555, + "mean_token_accuracy": 0.47647783160209656, + "step": 46095 + }, + { + "epoch": 0.046432409620472966, + "grad_norm": 12.969754348440164, + "learning_rate": 4.6431521060371055e-05, + "loss": 2.858, + "mean_token_accuracy": 0.38275861740112305, + "step": 46100 + }, + { + "epoch": 0.04643744567357714, + "grad_norm": 13.983398680459782, + "learning_rate": 4.6436557017102115e-05, + "loss": 2.9348, + "mean_token_accuracy": 0.3413793116807938, + "step": 46105 + }, + { + "epoch": 0.04644248172668131, + "grad_norm": 12.883394252233046, + "learning_rate": 4.644159297383317e-05, + "loss": 2.2188, + "mean_token_accuracy": 0.42758620977401735, + "step": 46110 + }, + { + "epoch": 0.04644751777978549, + "grad_norm": 15.153527208853713, + "learning_rate": 4.6446628930564226e-05, + "loss": 2.6293, + "mean_token_accuracy": 0.41379310488700866, + "step": 46115 + }, + { + "epoch": 0.046452553832889654, + "grad_norm": 13.481225231100817, + "learning_rate": 4.645166488729529e-05, + "loss": 2.6796, + "mean_token_accuracy": 0.4344827592372894, + "step": 46120 + }, + { + "epoch": 0.04645758988599383, + "grad_norm": 15.774491627679929, + "learning_rate": 4.645670084402635e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.42413792610168455, + "step": 46125 + }, + { + "epoch": 0.046462625939098, + "grad_norm": 11.896931345667403, + "learning_rate": 4.646173680075741e-05, + "loss": 2.5707, + "mean_token_accuracy": 0.42758620977401735, + "step": 46130 + }, + { + "epoch": 0.046467661992202175, + "grad_norm": 11.005845355316321, + "learning_rate": 4.646677275748847e-05, + "loss": 2.4982, + "mean_token_accuracy": 0.3862068891525269, + "step": 46135 + }, + { + "epoch": 0.04647269804530635, + "grad_norm": 14.304115405775352, + "learning_rate": 4.647180871421953e-05, + "loss": 2.4297, + "mean_token_accuracy": 0.3827586203813553, + "step": 46140 + }, + { + "epoch": 0.04647773409841052, + "grad_norm": 10.847422434972705, + "learning_rate": 4.647684467095059e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.46551724672317507, + "step": 46145 + }, + { + "epoch": 0.046482770151514696, + "grad_norm": 19.815230883292827, + "learning_rate": 4.648188062768165e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.38965517580509185, + "step": 46150 + }, + { + "epoch": 0.04648780620461886, + "grad_norm": 12.690830671352783, + "learning_rate": 4.648691658441271e-05, + "loss": 2.4956, + "mean_token_accuracy": 0.4172413766384125, + "step": 46155 + }, + { + "epoch": 0.04649284225772304, + "grad_norm": 13.450250643956371, + "learning_rate": 4.6491952541143766e-05, + "loss": 2.691, + "mean_token_accuracy": 0.3655172407627106, + "step": 46160 + }, + { + "epoch": 0.04649787831082721, + "grad_norm": 14.628030472441715, + "learning_rate": 4.6496988497874826e-05, + "loss": 2.7533, + "mean_token_accuracy": 0.38421053290367124, + "step": 46165 + }, + { + "epoch": 0.046502914363931384, + "grad_norm": 12.668612685982131, + "learning_rate": 4.6502024454605885e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.3793103456497192, + "step": 46170 + }, + { + "epoch": 0.04650795041703556, + "grad_norm": 13.499269566685483, + "learning_rate": 4.650706041133695e-05, + "loss": 3.0405, + "mean_token_accuracy": 0.33793103098869326, + "step": 46175 + }, + { + "epoch": 0.04651298647013973, + "grad_norm": 15.330818155606407, + "learning_rate": 4.651209636806801e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4257713258266449, + "step": 46180 + }, + { + "epoch": 0.046518022523243906, + "grad_norm": 13.69473693507867, + "learning_rate": 4.651713232479907e-05, + "loss": 3.1926, + "mean_token_accuracy": 0.34137930572032926, + "step": 46185 + }, + { + "epoch": 0.04652305857634807, + "grad_norm": 22.651420678413547, + "learning_rate": 4.652216828153013e-05, + "loss": 3.2714, + "mean_token_accuracy": 0.3034482777118683, + "step": 46190 + }, + { + "epoch": 0.046528094629452246, + "grad_norm": 11.816002337664438, + "learning_rate": 4.652720423826119e-05, + "loss": 2.7271, + "mean_token_accuracy": 0.41379310488700866, + "step": 46195 + }, + { + "epoch": 0.04653313068255642, + "grad_norm": 15.415220495216337, + "learning_rate": 4.653224019499225e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.3931034505367279, + "step": 46200 + }, + { + "epoch": 0.046538166735660594, + "grad_norm": 11.402544626173952, + "learning_rate": 4.6537276151723306e-05, + "loss": 2.3957, + "mean_token_accuracy": 0.42256503105163573, + "step": 46205 + }, + { + "epoch": 0.04654320278876477, + "grad_norm": 12.322532220079593, + "learning_rate": 4.6542312108454366e-05, + "loss": 2.1181, + "mean_token_accuracy": 0.47797942757606504, + "step": 46210 + }, + { + "epoch": 0.04654823884186894, + "grad_norm": 12.825588110301824, + "learning_rate": 4.6547348065185425e-05, + "loss": 2.7777, + "mean_token_accuracy": 0.4034482777118683, + "step": 46215 + }, + { + "epoch": 0.046553274894973115, + "grad_norm": 29.998588780292636, + "learning_rate": 4.6552384021916484e-05, + "loss": 2.9102, + "mean_token_accuracy": 0.3984270989894867, + "step": 46220 + }, + { + "epoch": 0.04655831094807728, + "grad_norm": 11.784893909019942, + "learning_rate": 4.655741997864755e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.43248639106750486, + "step": 46225 + }, + { + "epoch": 0.046563347001181456, + "grad_norm": 12.466769828053527, + "learning_rate": 4.656245593537861e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.41609196066856385, + "step": 46230 + }, + { + "epoch": 0.04656838305428563, + "grad_norm": 12.779649672054347, + "learning_rate": 4.656749189210967e-05, + "loss": 2.1994, + "mean_token_accuracy": 0.45517240166664125, + "step": 46235 + }, + { + "epoch": 0.0465734191073898, + "grad_norm": 10.789015253044095, + "learning_rate": 4.657252784884073e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.4310344815254211, + "step": 46240 + }, + { + "epoch": 0.04657845516049398, + "grad_norm": 12.226269920113237, + "learning_rate": 4.657756380557178e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.3758620709180832, + "step": 46245 + }, + { + "epoch": 0.04658349121359815, + "grad_norm": 11.036590385017957, + "learning_rate": 4.658259976230284e-05, + "loss": 2.6158, + "mean_token_accuracy": 0.3965517282485962, + "step": 46250 + }, + { + "epoch": 0.046588527266702325, + "grad_norm": 16.40896065968302, + "learning_rate": 4.6587635719033906e-05, + "loss": 2.3883, + "mean_token_accuracy": 0.43103447556495667, + "step": 46255 + }, + { + "epoch": 0.04659356331980649, + "grad_norm": 13.80523218301721, + "learning_rate": 4.6592671675764965e-05, + "loss": 3.0293, + "mean_token_accuracy": 0.39655172228813174, + "step": 46260 + }, + { + "epoch": 0.046598599372910665, + "grad_norm": 14.458737260186087, + "learning_rate": 4.6597707632496024e-05, + "loss": 2.789, + "mean_token_accuracy": 0.3379310369491577, + "step": 46265 + }, + { + "epoch": 0.04660363542601484, + "grad_norm": 11.291111957617563, + "learning_rate": 4.6602743589227083e-05, + "loss": 2.1188, + "mean_token_accuracy": 0.45517240166664125, + "step": 46270 + }, + { + "epoch": 0.04660867147911901, + "grad_norm": 14.800970571681939, + "learning_rate": 4.660777954595814e-05, + "loss": 3.069, + "mean_token_accuracy": 0.33793102502822875, + "step": 46275 + }, + { + "epoch": 0.046613707532223186, + "grad_norm": 11.89288989225702, + "learning_rate": 4.661281550268921e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.32068965435028074, + "step": 46280 + }, + { + "epoch": 0.04661874358532736, + "grad_norm": 11.01402505290828, + "learning_rate": 4.661785145942026e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.4275861978530884, + "step": 46285 + }, + { + "epoch": 0.046623779638431534, + "grad_norm": 16.729750156321543, + "learning_rate": 4.662288741615132e-05, + "loss": 2.8058, + "mean_token_accuracy": 0.3931034505367279, + "step": 46290 + }, + { + "epoch": 0.0466288156915357, + "grad_norm": 13.351939971723985, + "learning_rate": 4.662792337288238e-05, + "loss": 2.6625, + "mean_token_accuracy": 0.4068965494632721, + "step": 46295 + }, + { + "epoch": 0.046633851744639875, + "grad_norm": 11.542197123257866, + "learning_rate": 4.663295932961344e-05, + "loss": 2.277, + "mean_token_accuracy": 0.458620685338974, + "step": 46300 + }, + { + "epoch": 0.04663888779774405, + "grad_norm": 14.183882761164176, + "learning_rate": 4.6637995286344505e-05, + "loss": 2.5842, + "mean_token_accuracy": 0.36896551549434664, + "step": 46305 + }, + { + "epoch": 0.04664392385084822, + "grad_norm": 14.025220305500262, + "learning_rate": 4.6643031243075564e-05, + "loss": 2.7793, + "mean_token_accuracy": 0.33103448152542114, + "step": 46310 + }, + { + "epoch": 0.046648959903952396, + "grad_norm": 12.156555456064357, + "learning_rate": 4.6648067199806623e-05, + "loss": 2.5299, + "mean_token_accuracy": 0.42758620381355283, + "step": 46315 + }, + { + "epoch": 0.04665399595705657, + "grad_norm": 11.56394372480486, + "learning_rate": 4.665310315653768e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.43448275327682495, + "step": 46320 + }, + { + "epoch": 0.04665903201016074, + "grad_norm": 11.621710501489801, + "learning_rate": 4.665813911326874e-05, + "loss": 2.6931, + "mean_token_accuracy": 0.38620689511299133, + "step": 46325 + }, + { + "epoch": 0.04666406806326491, + "grad_norm": 10.122138176410097, + "learning_rate": 4.66631750699998e-05, + "loss": 2.2622, + "mean_token_accuracy": 0.4655172348022461, + "step": 46330 + }, + { + "epoch": 0.046669104116369084, + "grad_norm": 14.11305958198568, + "learning_rate": 4.666821102673086e-05, + "loss": 2.7757, + "mean_token_accuracy": 0.41379310488700866, + "step": 46335 + }, + { + "epoch": 0.04667414016947326, + "grad_norm": 13.018365124729732, + "learning_rate": 4.667324698346192e-05, + "loss": 2.1244, + "mean_token_accuracy": 0.5034482836723327, + "step": 46340 + }, + { + "epoch": 0.04667917622257743, + "grad_norm": 14.468809234865356, + "learning_rate": 4.667828294019298e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.37586206793785093, + "step": 46345 + }, + { + "epoch": 0.046684212275681605, + "grad_norm": 16.391595665965475, + "learning_rate": 4.668331889692404e-05, + "loss": 2.5285, + "mean_token_accuracy": 0.4122807025909424, + "step": 46350 + }, + { + "epoch": 0.04668924832878578, + "grad_norm": 13.869131538328594, + "learning_rate": 4.66883548536551e-05, + "loss": 2.662, + "mean_token_accuracy": 0.36551723480224607, + "step": 46355 + }, + { + "epoch": 0.04669428438188995, + "grad_norm": 13.480983941122094, + "learning_rate": 4.6693390810386164e-05, + "loss": 2.8522, + "mean_token_accuracy": 0.3517241358757019, + "step": 46360 + }, + { + "epoch": 0.04669932043499412, + "grad_norm": 19.603999857757433, + "learning_rate": 4.669842676711722e-05, + "loss": 2.8948, + "mean_token_accuracy": 0.34482758641242983, + "step": 46365 + }, + { + "epoch": 0.04670435648809829, + "grad_norm": 15.79953807659748, + "learning_rate": 4.670346272384828e-05, + "loss": 2.8059, + "mean_token_accuracy": 0.36206897497177126, + "step": 46370 + }, + { + "epoch": 0.04670939254120247, + "grad_norm": 16.76586316731833, + "learning_rate": 4.6708498680579335e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.3980641275644302, + "step": 46375 + }, + { + "epoch": 0.04671442859430664, + "grad_norm": 11.24548291977423, + "learning_rate": 4.6713534637310394e-05, + "loss": 2.8442, + "mean_token_accuracy": 0.3689655214548111, + "step": 46380 + }, + { + "epoch": 0.046719464647410815, + "grad_norm": 13.296213634868522, + "learning_rate": 4.671857059404146e-05, + "loss": 2.7322, + "mean_token_accuracy": 0.3551724076271057, + "step": 46385 + }, + { + "epoch": 0.04672450070051499, + "grad_norm": 11.411439432864737, + "learning_rate": 4.672360655077252e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.34482758641242983, + "step": 46390 + }, + { + "epoch": 0.04672953675361916, + "grad_norm": 10.975083434080346, + "learning_rate": 4.672864250750358e-05, + "loss": 2.3412, + "mean_token_accuracy": 0.44482758045196535, + "step": 46395 + }, + { + "epoch": 0.04673457280672333, + "grad_norm": 14.931923755646302, + "learning_rate": 4.673367846423464e-05, + "loss": 3.081, + "mean_token_accuracy": 0.35862069129943847, + "step": 46400 + }, + { + "epoch": 0.0467396088598275, + "grad_norm": 12.467442786279372, + "learning_rate": 4.67387144209657e-05, + "loss": 2.7651, + "mean_token_accuracy": 0.4068965554237366, + "step": 46405 + }, + { + "epoch": 0.04674464491293168, + "grad_norm": 18.17146844695907, + "learning_rate": 4.6743750377696756e-05, + "loss": 2.547, + "mean_token_accuracy": 0.42413793206214906, + "step": 46410 + }, + { + "epoch": 0.04674968096603585, + "grad_norm": 12.625502605888316, + "learning_rate": 4.674878633442782e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.4379310429096222, + "step": 46415 + }, + { + "epoch": 0.046754717019140024, + "grad_norm": 12.35210482196056, + "learning_rate": 4.6753822291158875e-05, + "loss": 2.7684, + "mean_token_accuracy": 0.37241379618644715, + "step": 46420 + }, + { + "epoch": 0.0467597530722442, + "grad_norm": 17.11679177558172, + "learning_rate": 4.6758858247889934e-05, + "loss": 2.8563, + "mean_token_accuracy": 0.36896551251411436, + "step": 46425 + }, + { + "epoch": 0.04676478912534837, + "grad_norm": 11.122507746611133, + "learning_rate": 4.676389420462099e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.4379310369491577, + "step": 46430 + }, + { + "epoch": 0.04676982517845254, + "grad_norm": 13.852223878838121, + "learning_rate": 4.676893016135205e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.4, + "step": 46435 + }, + { + "epoch": 0.04677486123155671, + "grad_norm": 16.784735380222674, + "learning_rate": 4.677396611808312e-05, + "loss": 2.7288, + "mean_token_accuracy": 0.36206896007061007, + "step": 46440 + }, + { + "epoch": 0.046779897284660886, + "grad_norm": 12.197946667123045, + "learning_rate": 4.677900207481418e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.36896551847457887, + "step": 46445 + }, + { + "epoch": 0.04678493333776506, + "grad_norm": 13.932155933715231, + "learning_rate": 4.678403803154524e-05, + "loss": 2.0727, + "mean_token_accuracy": 0.48275862336158754, + "step": 46450 + }, + { + "epoch": 0.046789969390869234, + "grad_norm": 13.212000835654651, + "learning_rate": 4.6789073988276296e-05, + "loss": 2.5825, + "mean_token_accuracy": 0.4034482777118683, + "step": 46455 + }, + { + "epoch": 0.04679500544397341, + "grad_norm": 14.054493040528458, + "learning_rate": 4.6794109945007355e-05, + "loss": 2.9621, + "mean_token_accuracy": 0.39655172228813174, + "step": 46460 + }, + { + "epoch": 0.04680004149707758, + "grad_norm": 19.154669492394557, + "learning_rate": 4.6799145901738415e-05, + "loss": 3.1245, + "mean_token_accuracy": 0.37586206793785093, + "step": 46465 + }, + { + "epoch": 0.04680507755018175, + "grad_norm": 20.22559632517103, + "learning_rate": 4.6804181858469474e-05, + "loss": 3.0609, + "mean_token_accuracy": 0.4148820281028748, + "step": 46470 + }, + { + "epoch": 0.04681011360328592, + "grad_norm": 14.627308769164507, + "learning_rate": 4.680921781520053e-05, + "loss": 2.5286, + "mean_token_accuracy": 0.39147005677223207, + "step": 46475 + }, + { + "epoch": 0.046815149656390095, + "grad_norm": 14.23085695293475, + "learning_rate": 4.681425377193159e-05, + "loss": 2.6632, + "mean_token_accuracy": 0.34658198058605194, + "step": 46480 + }, + { + "epoch": 0.04682018570949427, + "grad_norm": 12.34526051469207, + "learning_rate": 4.681928972866265e-05, + "loss": 2.6481, + "mean_token_accuracy": 0.40344828367233276, + "step": 46485 + }, + { + "epoch": 0.04682522176259844, + "grad_norm": 14.06021847357866, + "learning_rate": 4.682432568539371e-05, + "loss": 2.6544, + "mean_token_accuracy": 0.3862069010734558, + "step": 46490 + }, + { + "epoch": 0.04683025781570262, + "grad_norm": 13.021144960209002, + "learning_rate": 4.682936164212478e-05, + "loss": 2.4008, + "mean_token_accuracy": 0.40000000298023225, + "step": 46495 + }, + { + "epoch": 0.04683529386880679, + "grad_norm": 11.192147435076782, + "learning_rate": 4.6834397598855836e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.4068965494632721, + "step": 46500 + }, + { + "epoch": 0.04684032992191096, + "grad_norm": 11.75850928410697, + "learning_rate": 4.6839433555586895e-05, + "loss": 2.471, + "mean_token_accuracy": 0.4275861978530884, + "step": 46505 + }, + { + "epoch": 0.04684536597501513, + "grad_norm": 12.914189914686297, + "learning_rate": 4.684446951231795e-05, + "loss": 2.7455, + "mean_token_accuracy": 0.35692680180072783, + "step": 46510 + }, + { + "epoch": 0.046850402028119305, + "grad_norm": 12.41781632823946, + "learning_rate": 4.684950546904901e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.3931034505367279, + "step": 46515 + }, + { + "epoch": 0.04685543808122348, + "grad_norm": 14.353762277644186, + "learning_rate": 4.685454142578007e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.37586206793785093, + "step": 46520 + }, + { + "epoch": 0.04686047413432765, + "grad_norm": 12.600590710543017, + "learning_rate": 4.685957738251113e-05, + "loss": 2.4563, + "mean_token_accuracy": 0.4172413766384125, + "step": 46525 + }, + { + "epoch": 0.046865510187431826, + "grad_norm": 12.313054187027676, + "learning_rate": 4.686461333924219e-05, + "loss": 2.6385, + "mean_token_accuracy": 0.35862069129943847, + "step": 46530 + }, + { + "epoch": 0.046870546240536, + "grad_norm": 18.397559587832312, + "learning_rate": 4.686964929597325e-05, + "loss": 2.518, + "mean_token_accuracy": 0.39655172228813174, + "step": 46535 + }, + { + "epoch": 0.04687558229364017, + "grad_norm": 16.13502934294121, + "learning_rate": 4.687468525270431e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.3655172407627106, + "step": 46540 + }, + { + "epoch": 0.04688061834674434, + "grad_norm": 11.811237622960931, + "learning_rate": 4.6879721209435376e-05, + "loss": 2.8467, + "mean_token_accuracy": 0.40544464588165285, + "step": 46545 + }, + { + "epoch": 0.046885654399848514, + "grad_norm": 12.299191706586337, + "learning_rate": 4.688475716616643e-05, + "loss": 2.8507, + "mean_token_accuracy": 0.3934728980064392, + "step": 46550 + }, + { + "epoch": 0.04689069045295269, + "grad_norm": 12.854228269174623, + "learning_rate": 4.688979312289749e-05, + "loss": 2.6075, + "mean_token_accuracy": 0.41724138259887694, + "step": 46555 + }, + { + "epoch": 0.04689572650605686, + "grad_norm": 12.157262814784477, + "learning_rate": 4.689482907962855e-05, + "loss": 2.7778, + "mean_token_accuracy": 0.3620689630508423, + "step": 46560 + }, + { + "epoch": 0.046900762559161036, + "grad_norm": 11.99758696016837, + "learning_rate": 4.6899865036359606e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.4290381133556366, + "step": 46565 + }, + { + "epoch": 0.04690579861226521, + "grad_norm": 17.606979154400342, + "learning_rate": 4.6904900993090666e-05, + "loss": 3.135, + "mean_token_accuracy": 0.37755595445632933, + "step": 46570 + }, + { + "epoch": 0.046910834665369376, + "grad_norm": 19.66955528139329, + "learning_rate": 4.690993694982173e-05, + "loss": 2.6442, + "mean_token_accuracy": 0.44295220971107485, + "step": 46575 + }, + { + "epoch": 0.04691587071847355, + "grad_norm": 18.63968596257936, + "learning_rate": 4.691497290655279e-05, + "loss": 2.8048, + "mean_token_accuracy": 0.3846340000629425, + "step": 46580 + }, + { + "epoch": 0.046920906771577724, + "grad_norm": 16.46319461788385, + "learning_rate": 4.692000886328385e-05, + "loss": 2.6738, + "mean_token_accuracy": 0.3896551728248596, + "step": 46585 + }, + { + "epoch": 0.0469259428246819, + "grad_norm": 12.245045734264155, + "learning_rate": 4.692504482001491e-05, + "loss": 2.5495, + "mean_token_accuracy": 0.37241379618644715, + "step": 46590 + }, + { + "epoch": 0.04693097887778607, + "grad_norm": 12.385526541243276, + "learning_rate": 4.693008077674597e-05, + "loss": 2.6172, + "mean_token_accuracy": 0.35862069129943847, + "step": 46595 + }, + { + "epoch": 0.046936014930890245, + "grad_norm": 14.888157583386725, + "learning_rate": 4.693511673347703e-05, + "loss": 2.1672, + "mean_token_accuracy": 0.5018148839473724, + "step": 46600 + }, + { + "epoch": 0.04694105098399442, + "grad_norm": 13.143638307203076, + "learning_rate": 4.694015269020809e-05, + "loss": 2.6597, + "mean_token_accuracy": 0.35862069129943847, + "step": 46605 + }, + { + "epoch": 0.046946087037098586, + "grad_norm": 13.563188908257573, + "learning_rate": 4.6945188646939147e-05, + "loss": 2.451, + "mean_token_accuracy": 0.4379310369491577, + "step": 46610 + }, + { + "epoch": 0.04695112309020276, + "grad_norm": 13.18366108581065, + "learning_rate": 4.6950224603670206e-05, + "loss": 2.2403, + "mean_token_accuracy": 0.42758620381355283, + "step": 46615 + }, + { + "epoch": 0.04695615914330693, + "grad_norm": 15.264158148189173, + "learning_rate": 4.6955260560401265e-05, + "loss": 2.7083, + "mean_token_accuracy": 0.40689654350280763, + "step": 46620 + }, + { + "epoch": 0.04696119519641111, + "grad_norm": 16.446309693210857, + "learning_rate": 4.696029651713233e-05, + "loss": 2.4492, + "mean_token_accuracy": 0.41379310488700866, + "step": 46625 + }, + { + "epoch": 0.04696623124951528, + "grad_norm": 13.467528000181634, + "learning_rate": 4.696533247386339e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.39655172228813174, + "step": 46630 + }, + { + "epoch": 0.046971267302619454, + "grad_norm": 11.150932576995208, + "learning_rate": 4.697036843059445e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.38275861740112305, + "step": 46635 + }, + { + "epoch": 0.04697630335572363, + "grad_norm": 12.890656340857578, + "learning_rate": 4.697540438732551e-05, + "loss": 2.5718, + "mean_token_accuracy": 0.3878402948379517, + "step": 46640 + }, + { + "epoch": 0.046981339408827795, + "grad_norm": 12.337165984575114, + "learning_rate": 4.698044034405656e-05, + "loss": 2.2637, + "mean_token_accuracy": 0.4344827592372894, + "step": 46645 + }, + { + "epoch": 0.04698637546193197, + "grad_norm": 12.105350392593298, + "learning_rate": 4.698547630078763e-05, + "loss": 2.6075, + "mean_token_accuracy": 0.3931034505367279, + "step": 46650 + }, + { + "epoch": 0.04699141151503614, + "grad_norm": 13.72554146429605, + "learning_rate": 4.6990512257518687e-05, + "loss": 2.5656, + "mean_token_accuracy": 0.41034482717514037, + "step": 46655 + }, + { + "epoch": 0.046996447568140316, + "grad_norm": 12.644716037836664, + "learning_rate": 4.6995548214249746e-05, + "loss": 2.8063, + "mean_token_accuracy": 0.42413793206214906, + "step": 46660 + }, + { + "epoch": 0.04700148362124449, + "grad_norm": 12.732374530801586, + "learning_rate": 4.7000584170980805e-05, + "loss": 2.9251, + "mean_token_accuracy": 0.3517241358757019, + "step": 46665 + }, + { + "epoch": 0.047006519674348664, + "grad_norm": 13.086661091428466, + "learning_rate": 4.7005620127711864e-05, + "loss": 2.8175, + "mean_token_accuracy": 0.35862069129943847, + "step": 46670 + }, + { + "epoch": 0.04701155572745284, + "grad_norm": 13.954023875466135, + "learning_rate": 4.7010656084442924e-05, + "loss": 2.5536, + "mean_token_accuracy": 0.3758620619773865, + "step": 46675 + }, + { + "epoch": 0.047016591780557004, + "grad_norm": 14.712314792330533, + "learning_rate": 4.701569204117399e-05, + "loss": 2.2301, + "mean_token_accuracy": 0.4534180283546448, + "step": 46680 + }, + { + "epoch": 0.04702162783366118, + "grad_norm": 11.260427677037695, + "learning_rate": 4.702072799790504e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.4206896543502808, + "step": 46685 + }, + { + "epoch": 0.04702666388676535, + "grad_norm": 11.42841780227934, + "learning_rate": 4.70257639546361e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.41034482717514037, + "step": 46690 + }, + { + "epoch": 0.047031699939869526, + "grad_norm": 13.590858004926789, + "learning_rate": 4.703079991136716e-05, + "loss": 2.477, + "mean_token_accuracy": 0.44482759237289426, + "step": 46695 + }, + { + "epoch": 0.0470367359929737, + "grad_norm": 14.123956055775693, + "learning_rate": 4.703583586809822e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.43103447556495667, + "step": 46700 + }, + { + "epoch": 0.04704177204607787, + "grad_norm": 13.566368916941018, + "learning_rate": 4.7040871824829286e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.39310345649719236, + "step": 46705 + }, + { + "epoch": 0.04704680809918205, + "grad_norm": 12.809560043414287, + "learning_rate": 4.7045907781560345e-05, + "loss": 2.6583, + "mean_token_accuracy": 0.3620689630508423, + "step": 46710 + }, + { + "epoch": 0.047051844152286214, + "grad_norm": 12.754422749999405, + "learning_rate": 4.7050943738291404e-05, + "loss": 2.7763, + "mean_token_accuracy": 0.4241379380226135, + "step": 46715 + }, + { + "epoch": 0.04705688020539039, + "grad_norm": 12.232446941077315, + "learning_rate": 4.7055979695022464e-05, + "loss": 2.7733, + "mean_token_accuracy": 0.41724138259887694, + "step": 46720 + }, + { + "epoch": 0.04706191625849456, + "grad_norm": 11.73740908643985, + "learning_rate": 4.706101565175352e-05, + "loss": 2.8888, + "mean_token_accuracy": 0.3827586144208908, + "step": 46725 + }, + { + "epoch": 0.047066952311598735, + "grad_norm": 60.690759407080556, + "learning_rate": 4.706605160848458e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.39999999701976774, + "step": 46730 + }, + { + "epoch": 0.04707198836470291, + "grad_norm": 14.374417901352286, + "learning_rate": 4.707108756521564e-05, + "loss": 2.7803, + "mean_token_accuracy": 0.36896551251411436, + "step": 46735 + }, + { + "epoch": 0.04707702441780708, + "grad_norm": 14.95692805108952, + "learning_rate": 4.70761235219467e-05, + "loss": 2.4981, + "mean_token_accuracy": 0.4123412013053894, + "step": 46740 + }, + { + "epoch": 0.047082060470911256, + "grad_norm": 14.37502519106531, + "learning_rate": 4.708115947867776e-05, + "loss": 2.6742, + "mean_token_accuracy": 0.3965517282485962, + "step": 46745 + }, + { + "epoch": 0.04708709652401542, + "grad_norm": 12.418105731518645, + "learning_rate": 4.708619543540882e-05, + "loss": 2.7018, + "mean_token_accuracy": 0.37931033968925476, + "step": 46750 + }, + { + "epoch": 0.0470921325771196, + "grad_norm": 16.792852637607865, + "learning_rate": 4.709123139213988e-05, + "loss": 2.7577, + "mean_token_accuracy": 0.3896551728248596, + "step": 46755 + }, + { + "epoch": 0.04709716863022377, + "grad_norm": 13.830284554879546, + "learning_rate": 4.7096267348870944e-05, + "loss": 2.8418, + "mean_token_accuracy": 0.37241379618644715, + "step": 46760 + }, + { + "epoch": 0.047102204683327945, + "grad_norm": 15.085139267235832, + "learning_rate": 4.7101303305602004e-05, + "loss": 2.6337, + "mean_token_accuracy": 0.39655172228813174, + "step": 46765 + }, + { + "epoch": 0.04710724073643212, + "grad_norm": 15.565593191214548, + "learning_rate": 4.710633926233306e-05, + "loss": 2.8822, + "mean_token_accuracy": 0.36896551847457887, + "step": 46770 + }, + { + "epoch": 0.04711227678953629, + "grad_norm": 10.84614335939368, + "learning_rate": 4.711137521906412e-05, + "loss": 2.6087, + "mean_token_accuracy": 0.39999998807907106, + "step": 46775 + }, + { + "epoch": 0.047117312842640466, + "grad_norm": 13.971495404225001, + "learning_rate": 4.7116411175795175e-05, + "loss": 2.6162, + "mean_token_accuracy": 0.4310344815254211, + "step": 46780 + }, + { + "epoch": 0.04712234889574463, + "grad_norm": 12.929242894980991, + "learning_rate": 4.712144713252624e-05, + "loss": 2.9224, + "mean_token_accuracy": 0.358620697259903, + "step": 46785 + }, + { + "epoch": 0.047127384948848806, + "grad_norm": 13.407495311855628, + "learning_rate": 4.71264830892573e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.3827586233615875, + "step": 46790 + }, + { + "epoch": 0.04713242100195298, + "grad_norm": 15.943942700876404, + "learning_rate": 4.713151904598836e-05, + "loss": 2.8936, + "mean_token_accuracy": 0.37586206793785093, + "step": 46795 + }, + { + "epoch": 0.047137457055057154, + "grad_norm": 12.231440047204194, + "learning_rate": 4.713655500271942e-05, + "loss": 2.0397, + "mean_token_accuracy": 0.4689655125141144, + "step": 46800 + }, + { + "epoch": 0.04714249310816133, + "grad_norm": 14.937864017153382, + "learning_rate": 4.714159095945048e-05, + "loss": 2.6796, + "mean_token_accuracy": 0.3827586233615875, + "step": 46805 + }, + { + "epoch": 0.0471475291612655, + "grad_norm": 13.834133615584573, + "learning_rate": 4.7146626916181544e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.3517241358757019, + "step": 46810 + }, + { + "epoch": 0.047152565214369675, + "grad_norm": 14.39259426110685, + "learning_rate": 4.71516628729126e-05, + "loss": 2.7232, + "mean_token_accuracy": 0.4068965494632721, + "step": 46815 + }, + { + "epoch": 0.04715760126747384, + "grad_norm": 11.260942291022031, + "learning_rate": 4.7156698829643655e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.4, + "step": 46820 + }, + { + "epoch": 0.047162637320578016, + "grad_norm": 13.613161669997456, + "learning_rate": 4.7161734786374715e-05, + "loss": 2.6142, + "mean_token_accuracy": 0.3965517282485962, + "step": 46825 + }, + { + "epoch": 0.04716767337368219, + "grad_norm": 12.640162064714843, + "learning_rate": 4.7166770743105774e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.3999999940395355, + "step": 46830 + }, + { + "epoch": 0.04717270942678636, + "grad_norm": 12.010281180206457, + "learning_rate": 4.717180669983683e-05, + "loss": 2.6745, + "mean_token_accuracy": 0.3827586114406586, + "step": 46835 + }, + { + "epoch": 0.04717774547989054, + "grad_norm": 12.416691553621817, + "learning_rate": 4.71768426565679e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.4172413766384125, + "step": 46840 + }, + { + "epoch": 0.04718278153299471, + "grad_norm": 16.200551193551572, + "learning_rate": 4.718187861329896e-05, + "loss": 3.1475, + "mean_token_accuracy": 0.324137932062149, + "step": 46845 + }, + { + "epoch": 0.047187817586098885, + "grad_norm": 15.200814381879688, + "learning_rate": 4.718691457003002e-05, + "loss": 2.489, + "mean_token_accuracy": 0.4137930989265442, + "step": 46850 + }, + { + "epoch": 0.04719285363920305, + "grad_norm": 17.46089485339386, + "learning_rate": 4.719195052676108e-05, + "loss": 2.6945, + "mean_token_accuracy": 0.4379310250282288, + "step": 46855 + }, + { + "epoch": 0.047197889692307225, + "grad_norm": 13.388481821637585, + "learning_rate": 4.7196986483492136e-05, + "loss": 2.4581, + "mean_token_accuracy": 0.41034482717514037, + "step": 46860 + }, + { + "epoch": 0.0472029257454114, + "grad_norm": 12.377998255077587, + "learning_rate": 4.7202022440223196e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.34482758641242983, + "step": 46865 + }, + { + "epoch": 0.04720796179851557, + "grad_norm": 13.798328915060818, + "learning_rate": 4.7207058396954255e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.3620689630508423, + "step": 46870 + }, + { + "epoch": 0.04721299785161975, + "grad_norm": 12.422433265992623, + "learning_rate": 4.7212094353685314e-05, + "loss": 2.4578, + "mean_token_accuracy": 0.42413793206214906, + "step": 46875 + }, + { + "epoch": 0.04721803390472392, + "grad_norm": 16.313528050023596, + "learning_rate": 4.721713031041637e-05, + "loss": 2.4113, + "mean_token_accuracy": 0.4000000059604645, + "step": 46880 + }, + { + "epoch": 0.047223069957828094, + "grad_norm": 12.964425729000933, + "learning_rate": 4.722216626714743e-05, + "loss": 2.1286, + "mean_token_accuracy": 0.5034482777118683, + "step": 46885 + }, + { + "epoch": 0.04722810601093226, + "grad_norm": 12.643866987630767, + "learning_rate": 4.72272022238785e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.4034482777118683, + "step": 46890 + }, + { + "epoch": 0.047233142064036435, + "grad_norm": 14.114454052056356, + "learning_rate": 4.723223818060956e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.41724138259887694, + "step": 46895 + }, + { + "epoch": 0.04723817811714061, + "grad_norm": 13.856772837875425, + "learning_rate": 4.723727413734062e-05, + "loss": 2.842, + "mean_token_accuracy": 0.3310344874858856, + "step": 46900 + }, + { + "epoch": 0.04724321417024478, + "grad_norm": 16.488100501934746, + "learning_rate": 4.7242310094071676e-05, + "loss": 2.8484, + "mean_token_accuracy": 0.33448275923728943, + "step": 46905 + }, + { + "epoch": 0.047248250223348956, + "grad_norm": 14.951377472169764, + "learning_rate": 4.724734605080273e-05, + "loss": 2.639, + "mean_token_accuracy": 0.37586206793785093, + "step": 46910 + }, + { + "epoch": 0.04725328627645313, + "grad_norm": 13.946600327990788, + "learning_rate": 4.725238200753379e-05, + "loss": 2.774, + "mean_token_accuracy": 0.3517241358757019, + "step": 46915 + }, + { + "epoch": 0.047258322329557303, + "grad_norm": 12.66580986770408, + "learning_rate": 4.7257417964264854e-05, + "loss": 2.6328, + "mean_token_accuracy": 0.3707199037075043, + "step": 46920 + }, + { + "epoch": 0.04726335838266147, + "grad_norm": 18.739721490999987, + "learning_rate": 4.726245392099591e-05, + "loss": 2.8252, + "mean_token_accuracy": 0.3827586233615875, + "step": 46925 + }, + { + "epoch": 0.047268394435765644, + "grad_norm": 9.900952356996473, + "learning_rate": 4.726748987772697e-05, + "loss": 2.342, + "mean_token_accuracy": 0.43641863465309144, + "step": 46930 + }, + { + "epoch": 0.04727343048886982, + "grad_norm": 13.52618424610419, + "learning_rate": 4.727252583445803e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.3965517282485962, + "step": 46935 + }, + { + "epoch": 0.04727846654197399, + "grad_norm": 15.386735576621984, + "learning_rate": 4.727756179118909e-05, + "loss": 2.6357, + "mean_token_accuracy": 0.3517241388559341, + "step": 46940 + }, + { + "epoch": 0.047283502595078165, + "grad_norm": 15.173831769182488, + "learning_rate": 4.728259774792016e-05, + "loss": 2.6603, + "mean_token_accuracy": 0.44827585816383364, + "step": 46945 + }, + { + "epoch": 0.04728853864818234, + "grad_norm": 12.301488842972068, + "learning_rate": 4.7287633704651216e-05, + "loss": 2.5918, + "mean_token_accuracy": 0.41379310488700866, + "step": 46950 + }, + { + "epoch": 0.04729357470128651, + "grad_norm": 11.213050114188206, + "learning_rate": 4.729266966138227e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.37931033670902253, + "step": 46955 + }, + { + "epoch": 0.04729861075439068, + "grad_norm": 10.193669062110873, + "learning_rate": 4.729770561811333e-05, + "loss": 2.0124, + "mean_token_accuracy": 0.49433496594429016, + "step": 46960 + }, + { + "epoch": 0.047303646807494854, + "grad_norm": 15.41492968451369, + "learning_rate": 4.730274157484439e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.42746521830558776, + "step": 46965 + }, + { + "epoch": 0.04730868286059903, + "grad_norm": 11.619858208351674, + "learning_rate": 4.730777753157545e-05, + "loss": 2.7013, + "mean_token_accuracy": 0.358620685338974, + "step": 46970 + }, + { + "epoch": 0.0473137189137032, + "grad_norm": 12.988284061212138, + "learning_rate": 4.731281348830651e-05, + "loss": 2.5943, + "mean_token_accuracy": 0.3448275804519653, + "step": 46975 + }, + { + "epoch": 0.047318754966807375, + "grad_norm": 15.049561949664746, + "learning_rate": 4.731784944503757e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.4430127084255219, + "step": 46980 + }, + { + "epoch": 0.04732379101991155, + "grad_norm": 13.789845117126283, + "learning_rate": 4.732288540176863e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.4379310369491577, + "step": 46985 + }, + { + "epoch": 0.04732882707301572, + "grad_norm": 12.607336510348475, + "learning_rate": 4.732792135849969e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.42068966031074523, + "step": 46990 + }, + { + "epoch": 0.04733386312611989, + "grad_norm": 16.378718229894094, + "learning_rate": 4.733295731523075e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.3620689570903778, + "step": 46995 + }, + { + "epoch": 0.04733889917922406, + "grad_norm": 11.834586150240387, + "learning_rate": 4.733799327196181e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.44482759237289426, + "step": 47000 + }, + { + "epoch": 0.04734393523232824, + "grad_norm": 40.40144235285299, + "learning_rate": 4.734302922869287e-05, + "loss": 2.6236, + "mean_token_accuracy": 0.37241379022598264, + "step": 47005 + }, + { + "epoch": 0.04734897128543241, + "grad_norm": 12.747632772993049, + "learning_rate": 4.734806518542393e-05, + "loss": 2.6733, + "mean_token_accuracy": 0.3896551728248596, + "step": 47010 + }, + { + "epoch": 0.047354007338536584, + "grad_norm": 12.722645031298677, + "learning_rate": 4.735310114215499e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.39310344457626345, + "step": 47015 + }, + { + "epoch": 0.04735904339164076, + "grad_norm": 12.320600705897778, + "learning_rate": 4.7358137098886046e-05, + "loss": 2.5554, + "mean_token_accuracy": 0.4172413766384125, + "step": 47020 + }, + { + "epoch": 0.04736407944474493, + "grad_norm": 13.482611681704666, + "learning_rate": 4.736317305561711e-05, + "loss": 2.3386, + "mean_token_accuracy": 0.4206896543502808, + "step": 47025 + }, + { + "epoch": 0.0473691154978491, + "grad_norm": 20.707935668632405, + "learning_rate": 4.736820901234817e-05, + "loss": 3.0413, + "mean_token_accuracy": 0.3620689630508423, + "step": 47030 + }, + { + "epoch": 0.04737415155095327, + "grad_norm": 13.179989367674752, + "learning_rate": 4.737324496907923e-05, + "loss": 2.6283, + "mean_token_accuracy": 0.39310344457626345, + "step": 47035 + }, + { + "epoch": 0.047379187604057446, + "grad_norm": 19.258842126936127, + "learning_rate": 4.737828092581029e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.4448275864124298, + "step": 47040 + }, + { + "epoch": 0.04738422365716162, + "grad_norm": 13.674688237312816, + "learning_rate": 4.738331688254134e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.441379314661026, + "step": 47045 + }, + { + "epoch": 0.047389259710265794, + "grad_norm": 13.334375243523906, + "learning_rate": 4.738835283927241e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.3931034505367279, + "step": 47050 + }, + { + "epoch": 0.04739429576336997, + "grad_norm": 14.679757602587795, + "learning_rate": 4.739338879600347e-05, + "loss": 2.7493, + "mean_token_accuracy": 0.3811252325773239, + "step": 47055 + }, + { + "epoch": 0.04739933181647414, + "grad_norm": 14.287232077110433, + "learning_rate": 4.739842475273453e-05, + "loss": 2.7077, + "mean_token_accuracy": 0.3827586233615875, + "step": 47060 + }, + { + "epoch": 0.04740436786957831, + "grad_norm": 15.019999986994842, + "learning_rate": 4.7403460709465586e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.39310343861579894, + "step": 47065 + }, + { + "epoch": 0.04740940392268248, + "grad_norm": 14.946857670030896, + "learning_rate": 4.7408496666196645e-05, + "loss": 2.616, + "mean_token_accuracy": 0.4034482717514038, + "step": 47070 + }, + { + "epoch": 0.047414439975786656, + "grad_norm": 12.492954994257694, + "learning_rate": 4.741353262292771e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.3965517282485962, + "step": 47075 + }, + { + "epoch": 0.04741947602889083, + "grad_norm": 12.572445967030859, + "learning_rate": 4.741856857965877e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.38965516686439516, + "step": 47080 + }, + { + "epoch": 0.047424512081995, + "grad_norm": 16.469168251728778, + "learning_rate": 4.742360453638982e-05, + "loss": 2.976, + "mean_token_accuracy": 0.320689657330513, + "step": 47085 + }, + { + "epoch": 0.04742954813509918, + "grad_norm": 17.248364700739028, + "learning_rate": 4.742864049312088e-05, + "loss": 2.582, + "mean_token_accuracy": 0.3793103456497192, + "step": 47090 + }, + { + "epoch": 0.04743458418820335, + "grad_norm": 12.119347068592662, + "learning_rate": 4.743367644985194e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.36896551847457887, + "step": 47095 + }, + { + "epoch": 0.04743962024130752, + "grad_norm": 13.830515868610359, + "learning_rate": 4.7438712406583e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.4137930989265442, + "step": 47100 + }, + { + "epoch": 0.04744465629441169, + "grad_norm": 15.866506651420842, + "learning_rate": 4.744374836331407e-05, + "loss": 3.1445, + "mean_token_accuracy": 0.3777374565601349, + "step": 47105 + }, + { + "epoch": 0.047449692347515865, + "grad_norm": 11.090754039946026, + "learning_rate": 4.7448784320045126e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.41034482717514037, + "step": 47110 + }, + { + "epoch": 0.04745472840062004, + "grad_norm": 12.911441257970125, + "learning_rate": 4.7453820276776185e-05, + "loss": 2.7885, + "mean_token_accuracy": 0.3862068891525269, + "step": 47115 + }, + { + "epoch": 0.04745976445372421, + "grad_norm": 11.608537008981578, + "learning_rate": 4.7458856233507245e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.44827585816383364, + "step": 47120 + }, + { + "epoch": 0.047464800506828386, + "grad_norm": 15.478812662334786, + "learning_rate": 4.7463892190238304e-05, + "loss": 2.408, + "mean_token_accuracy": 0.4275862157344818, + "step": 47125 + }, + { + "epoch": 0.04746983655993256, + "grad_norm": 17.38623159682501, + "learning_rate": 4.746892814696936e-05, + "loss": 2.577, + "mean_token_accuracy": 0.37586207389831544, + "step": 47130 + }, + { + "epoch": 0.04747487261303673, + "grad_norm": 13.793548870040283, + "learning_rate": 4.747396410370042e-05, + "loss": 2.3443, + "mean_token_accuracy": 0.42068966031074523, + "step": 47135 + }, + { + "epoch": 0.0474799086661409, + "grad_norm": 13.814059382267036, + "learning_rate": 4.747900006043148e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.34827585220336915, + "step": 47140 + }, + { + "epoch": 0.047484944719245074, + "grad_norm": 17.214119318013594, + "learning_rate": 4.748403601716254e-05, + "loss": 2.9402, + "mean_token_accuracy": 0.36001209616661073, + "step": 47145 + }, + { + "epoch": 0.04748998077234925, + "grad_norm": 13.960123917790995, + "learning_rate": 4.74890719738936e-05, + "loss": 2.8189, + "mean_token_accuracy": 0.3620689630508423, + "step": 47150 + }, + { + "epoch": 0.04749501682545342, + "grad_norm": 12.230426437116334, + "learning_rate": 4.7494107930624666e-05, + "loss": 2.5244, + "mean_token_accuracy": 0.4, + "step": 47155 + }, + { + "epoch": 0.047500052878557596, + "grad_norm": 13.23672519959043, + "learning_rate": 4.7499143887355725e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.3517241358757019, + "step": 47160 + }, + { + "epoch": 0.04750508893166177, + "grad_norm": 14.976286482473116, + "learning_rate": 4.7504179844086785e-05, + "loss": 2.8344, + "mean_token_accuracy": 0.3655172407627106, + "step": 47165 + }, + { + "epoch": 0.047510124984765936, + "grad_norm": 16.955313851759026, + "learning_rate": 4.7509215800817844e-05, + "loss": 2.2786, + "mean_token_accuracy": 0.41548699140548706, + "step": 47170 + }, + { + "epoch": 0.04751516103787011, + "grad_norm": 11.764006226293825, + "learning_rate": 4.75142517575489e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.4607380568981171, + "step": 47175 + }, + { + "epoch": 0.047520197090974284, + "grad_norm": 15.404410346834458, + "learning_rate": 4.7519287714279956e-05, + "loss": 2.7769, + "mean_token_accuracy": 0.34482758641242983, + "step": 47180 + }, + { + "epoch": 0.04752523314407846, + "grad_norm": 14.702396660645883, + "learning_rate": 4.752432367101102e-05, + "loss": 2.6084, + "mean_token_accuracy": 0.43448275327682495, + "step": 47185 + }, + { + "epoch": 0.04753026919718263, + "grad_norm": 16.609223765106606, + "learning_rate": 4.752935962774208e-05, + "loss": 2.8907, + "mean_token_accuracy": 0.3482758641242981, + "step": 47190 + }, + { + "epoch": 0.047535305250286805, + "grad_norm": 9.668932980178516, + "learning_rate": 4.753439558447314e-05, + "loss": 2.7194, + "mean_token_accuracy": 0.3620689660310745, + "step": 47195 + }, + { + "epoch": 0.04754034130339098, + "grad_norm": 12.191722932116194, + "learning_rate": 4.75394315412042e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.42068966031074523, + "step": 47200 + }, + { + "epoch": 0.047545377356495146, + "grad_norm": 13.13096284073397, + "learning_rate": 4.754446749793526e-05, + "loss": 2.863, + "mean_token_accuracy": 0.42758620381355283, + "step": 47205 + }, + { + "epoch": 0.04755041340959932, + "grad_norm": 13.630264350466247, + "learning_rate": 4.7549503454666325e-05, + "loss": 3.2923, + "mean_token_accuracy": 0.3551724076271057, + "step": 47210 + }, + { + "epoch": 0.04755544946270349, + "grad_norm": 12.396421987422755, + "learning_rate": 4.7554539411397384e-05, + "loss": 2.3648, + "mean_token_accuracy": 0.4206896543502808, + "step": 47215 + }, + { + "epoch": 0.04756048551580767, + "grad_norm": 12.462003141399798, + "learning_rate": 4.7559575368128436e-05, + "loss": 2.2428, + "mean_token_accuracy": 0.47241379618644713, + "step": 47220 + }, + { + "epoch": 0.04756552156891184, + "grad_norm": 13.405108332087131, + "learning_rate": 4.7564611324859496e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4363581418991089, + "step": 47225 + }, + { + "epoch": 0.047570557622016015, + "grad_norm": 13.319733269129923, + "learning_rate": 4.7569647281590555e-05, + "loss": 2.7917, + "mean_token_accuracy": 0.41034482717514037, + "step": 47230 + }, + { + "epoch": 0.04757559367512019, + "grad_norm": 11.016908420645432, + "learning_rate": 4.757468323832162e-05, + "loss": 2.4707, + "mean_token_accuracy": 0.4068965494632721, + "step": 47235 + }, + { + "epoch": 0.047580629728224355, + "grad_norm": 13.084299254837374, + "learning_rate": 4.757971919505268e-05, + "loss": 2.2323, + "mean_token_accuracy": 0.41724138259887694, + "step": 47240 + }, + { + "epoch": 0.04758566578132853, + "grad_norm": 15.65328422860084, + "learning_rate": 4.758475515178374e-05, + "loss": 2.3569, + "mean_token_accuracy": 0.441379314661026, + "step": 47245 + }, + { + "epoch": 0.0475907018344327, + "grad_norm": 11.48186512550104, + "learning_rate": 4.75897911085148e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.42413792610168455, + "step": 47250 + }, + { + "epoch": 0.047595737887536876, + "grad_norm": 12.575634653402448, + "learning_rate": 4.759482706524586e-05, + "loss": 2.8765, + "mean_token_accuracy": 0.3739261955022812, + "step": 47255 + }, + { + "epoch": 0.04760077394064105, + "grad_norm": 16.393119883754217, + "learning_rate": 4.759986302197692e-05, + "loss": 2.6222, + "mean_token_accuracy": 0.42413792610168455, + "step": 47260 + }, + { + "epoch": 0.047605809993745224, + "grad_norm": 16.232805732794667, + "learning_rate": 4.7604898978707976e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.4206896543502808, + "step": 47265 + }, + { + "epoch": 0.0476108460468494, + "grad_norm": 15.427475213314917, + "learning_rate": 4.7609934935439036e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.4500302493572235, + "step": 47270 + }, + { + "epoch": 0.047615882099953565, + "grad_norm": 16.608149657952602, + "learning_rate": 4.7614970892170095e-05, + "loss": 2.9403, + "mean_token_accuracy": 0.4275861978530884, + "step": 47275 + }, + { + "epoch": 0.04762091815305774, + "grad_norm": 11.067185965291648, + "learning_rate": 4.7620006848901154e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.4793103337287903, + "step": 47280 + }, + { + "epoch": 0.04762595420616191, + "grad_norm": 16.56551691552814, + "learning_rate": 4.7625042805632213e-05, + "loss": 2.1311, + "mean_token_accuracy": 0.4517241358757019, + "step": 47285 + }, + { + "epoch": 0.047630990259266086, + "grad_norm": 12.862492715300638, + "learning_rate": 4.763007876236328e-05, + "loss": 2.993, + "mean_token_accuracy": 0.3551724135875702, + "step": 47290 + }, + { + "epoch": 0.04763602631237026, + "grad_norm": 14.403407342882064, + "learning_rate": 4.763511471909434e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.39310344457626345, + "step": 47295 + }, + { + "epoch": 0.04764106236547443, + "grad_norm": 12.463416738685545, + "learning_rate": 4.76401506758254e-05, + "loss": 2.9109, + "mean_token_accuracy": 0.37241379618644715, + "step": 47300 + }, + { + "epoch": 0.04764609841857861, + "grad_norm": 14.513384976491107, + "learning_rate": 4.764518663255646e-05, + "loss": 2.3798, + "mean_token_accuracy": 0.4068965494632721, + "step": 47305 + }, + { + "epoch": 0.047651134471682774, + "grad_norm": 13.910357368564709, + "learning_rate": 4.7650222589287516e-05, + "loss": 2.5427, + "mean_token_accuracy": 0.41034482717514037, + "step": 47310 + }, + { + "epoch": 0.04765617052478695, + "grad_norm": 11.58458893753858, + "learning_rate": 4.7655258546018576e-05, + "loss": 2.4199, + "mean_token_accuracy": 0.43103447556495667, + "step": 47315 + }, + { + "epoch": 0.04766120657789112, + "grad_norm": 15.843681974535407, + "learning_rate": 4.7660294502749635e-05, + "loss": 3.0803, + "mean_token_accuracy": 0.3854204475879669, + "step": 47320 + }, + { + "epoch": 0.047666242630995295, + "grad_norm": 15.403869092909018, + "learning_rate": 4.7665330459480694e-05, + "loss": 2.5424, + "mean_token_accuracy": 0.34137930870056155, + "step": 47325 + }, + { + "epoch": 0.04767127868409947, + "grad_norm": 13.446810345697902, + "learning_rate": 4.7670366416211753e-05, + "loss": 3.0083, + "mean_token_accuracy": 0.35172412991523744, + "step": 47330 + }, + { + "epoch": 0.04767631473720364, + "grad_norm": 13.165645274157306, + "learning_rate": 4.767540237294281e-05, + "loss": 2.8023, + "mean_token_accuracy": 0.39655172228813174, + "step": 47335 + }, + { + "epoch": 0.04768135079030782, + "grad_norm": 17.842386870692916, + "learning_rate": 4.768043832967387e-05, + "loss": 2.7204, + "mean_token_accuracy": 0.4034482777118683, + "step": 47340 + }, + { + "epoch": 0.04768638684341198, + "grad_norm": 13.060415327063652, + "learning_rate": 4.768547428640494e-05, + "loss": 2.6878, + "mean_token_accuracy": 0.4119177222251892, + "step": 47345 + }, + { + "epoch": 0.04769142289651616, + "grad_norm": 12.87953628826914, + "learning_rate": 4.7690510243136e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.4586206912994385, + "step": 47350 + }, + { + "epoch": 0.04769645894962033, + "grad_norm": 14.449027697900455, + "learning_rate": 4.769554619986705e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.3999999940395355, + "step": 47355 + }, + { + "epoch": 0.047701495002724505, + "grad_norm": 18.8853246323621, + "learning_rate": 4.770058215659811e-05, + "loss": 3.0338, + "mean_token_accuracy": 0.3620689570903778, + "step": 47360 + }, + { + "epoch": 0.04770653105582868, + "grad_norm": 12.698647948785169, + "learning_rate": 4.770561811332917e-05, + "loss": 2.7163, + "mean_token_accuracy": 0.29655171632766725, + "step": 47365 + }, + { + "epoch": 0.04771156710893285, + "grad_norm": 13.140378143199367, + "learning_rate": 4.7710654070060234e-05, + "loss": 2.2909, + "mean_token_accuracy": 0.47416818141937256, + "step": 47370 + }, + { + "epoch": 0.047716603162037026, + "grad_norm": 12.616226513223177, + "learning_rate": 4.7715690026791294e-05, + "loss": 2.4722, + "mean_token_accuracy": 0.4, + "step": 47375 + }, + { + "epoch": 0.04772163921514119, + "grad_norm": 12.915726412890727, + "learning_rate": 4.772072598352235e-05, + "loss": 2.7875, + "mean_token_accuracy": 0.38275861740112305, + "step": 47380 + }, + { + "epoch": 0.04772667526824537, + "grad_norm": 14.975059782379262, + "learning_rate": 4.772576194025341e-05, + "loss": 2.7531, + "mean_token_accuracy": 0.3620689630508423, + "step": 47385 + }, + { + "epoch": 0.04773171132134954, + "grad_norm": 13.527117323171572, + "learning_rate": 4.773079789698447e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.3827586114406586, + "step": 47390 + }, + { + "epoch": 0.047736747374453714, + "grad_norm": 11.315211655767412, + "learning_rate": 4.773583385371553e-05, + "loss": 2.2745, + "mean_token_accuracy": 0.4137930929660797, + "step": 47395 + }, + { + "epoch": 0.04774178342755789, + "grad_norm": 11.6311722233183, + "learning_rate": 4.774086981044659e-05, + "loss": 2.7554, + "mean_token_accuracy": 0.3551724135875702, + "step": 47400 + }, + { + "epoch": 0.04774681948066206, + "grad_norm": 10.895099229524956, + "learning_rate": 4.774590576717765e-05, + "loss": 2.2494, + "mean_token_accuracy": 0.47434966564178466, + "step": 47405 + }, + { + "epoch": 0.047751855533766235, + "grad_norm": 19.38405650296438, + "learning_rate": 4.775094172390871e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.42746522426605227, + "step": 47410 + }, + { + "epoch": 0.0477568915868704, + "grad_norm": 24.70952099057512, + "learning_rate": 4.775597768063977e-05, + "loss": 3.3244, + "mean_token_accuracy": 0.32758620381355286, + "step": 47415 + }, + { + "epoch": 0.047761927639974576, + "grad_norm": 13.586058362918633, + "learning_rate": 4.776101363737083e-05, + "loss": 2.8287, + "mean_token_accuracy": 0.35862069576978683, + "step": 47420 + }, + { + "epoch": 0.04776696369307875, + "grad_norm": 13.58205968718563, + "learning_rate": 4.776604959410189e-05, + "loss": 2.7323, + "mean_token_accuracy": 0.3620689630508423, + "step": 47425 + }, + { + "epoch": 0.047771999746182923, + "grad_norm": 12.348345831252326, + "learning_rate": 4.777108555083295e-05, + "loss": 2.8127, + "mean_token_accuracy": 0.3965517163276672, + "step": 47430 + }, + { + "epoch": 0.0477770357992871, + "grad_norm": 12.917410425314785, + "learning_rate": 4.777612150756401e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.4253694534301758, + "step": 47435 + }, + { + "epoch": 0.04778207185239127, + "grad_norm": 12.505885661421114, + "learning_rate": 4.778115746429507e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.3931034475564957, + "step": 47440 + }, + { + "epoch": 0.047787107905495445, + "grad_norm": 14.601574616484228, + "learning_rate": 4.778619342102612e-05, + "loss": 2.7373, + "mean_token_accuracy": 0.36551723778247835, + "step": 47445 + }, + { + "epoch": 0.04779214395859961, + "grad_norm": 18.531817192679004, + "learning_rate": 4.779122937775719e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.4517241418361664, + "step": 47450 + }, + { + "epoch": 0.047797180011703785, + "grad_norm": 13.826348788923152, + "learning_rate": 4.779626533448825e-05, + "loss": 2.6413, + "mean_token_accuracy": 0.3862069010734558, + "step": 47455 + }, + { + "epoch": 0.04780221606480796, + "grad_norm": 17.348186625048797, + "learning_rate": 4.780130129121931e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.3827586233615875, + "step": 47460 + }, + { + "epoch": 0.04780725211791213, + "grad_norm": 16.59620582006953, + "learning_rate": 4.780633724795037e-05, + "loss": 2.8049, + "mean_token_accuracy": 0.35862069129943847, + "step": 47465 + }, + { + "epoch": 0.04781228817101631, + "grad_norm": 15.22081984340066, + "learning_rate": 4.7811373204681426e-05, + "loss": 2.4505, + "mean_token_accuracy": 0.4620689690113068, + "step": 47470 + }, + { + "epoch": 0.04781732422412048, + "grad_norm": 14.499682374824959, + "learning_rate": 4.781640916141249e-05, + "loss": 2.6593, + "mean_token_accuracy": 0.4068965554237366, + "step": 47475 + }, + { + "epoch": 0.047822360277224654, + "grad_norm": 13.421152455111045, + "learning_rate": 4.782144511814355e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4261947989463806, + "step": 47480 + }, + { + "epoch": 0.04782739633032882, + "grad_norm": 14.627054074483992, + "learning_rate": 4.782648107487461e-05, + "loss": 2.9515, + "mean_token_accuracy": 0.36733212918043134, + "step": 47485 + }, + { + "epoch": 0.047832432383432995, + "grad_norm": 13.4258518114012, + "learning_rate": 4.783151703160566e-05, + "loss": 2.7383, + "mean_token_accuracy": 0.38965516686439516, + "step": 47490 + }, + { + "epoch": 0.04783746843653717, + "grad_norm": 11.62399116890208, + "learning_rate": 4.783655298833672e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.41034482717514037, + "step": 47495 + }, + { + "epoch": 0.04784250448964134, + "grad_norm": 15.279447737877808, + "learning_rate": 4.784158894506779e-05, + "loss": 2.3122, + "mean_token_accuracy": 0.458620685338974, + "step": 47500 + }, + { + "epoch": 0.047847540542745516, + "grad_norm": 12.831894407098844, + "learning_rate": 4.784662490179885e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.4847549915313721, + "step": 47505 + }, + { + "epoch": 0.04785257659584969, + "grad_norm": 15.883314481769363, + "learning_rate": 4.785166085852991e-05, + "loss": 2.6389, + "mean_token_accuracy": 0.3793103456497192, + "step": 47510 + }, + { + "epoch": 0.047857612648953864, + "grad_norm": 11.11874174807571, + "learning_rate": 4.7856696815260966e-05, + "loss": 2.5917, + "mean_token_accuracy": 0.4551724135875702, + "step": 47515 + }, + { + "epoch": 0.04786264870205803, + "grad_norm": 14.691309891794297, + "learning_rate": 4.7861732771992025e-05, + "loss": 2.4293, + "mean_token_accuracy": 0.4275861978530884, + "step": 47520 + }, + { + "epoch": 0.047867684755162204, + "grad_norm": 17.30344541484884, + "learning_rate": 4.7866768728723085e-05, + "loss": 2.7887, + "mean_token_accuracy": 0.37241379618644715, + "step": 47525 + }, + { + "epoch": 0.04787272080826638, + "grad_norm": 12.607630269238927, + "learning_rate": 4.7871804685454144e-05, + "loss": 3.0569, + "mean_token_accuracy": 0.36896551251411436, + "step": 47530 + }, + { + "epoch": 0.04787775686137055, + "grad_norm": 13.57270290120401, + "learning_rate": 4.78768406421852e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.4000000089406967, + "step": 47535 + }, + { + "epoch": 0.047882792914474726, + "grad_norm": 12.991385905151033, + "learning_rate": 4.788187659891626e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.4326073884963989, + "step": 47540 + }, + { + "epoch": 0.0478878289675789, + "grad_norm": 29.340508325513145, + "learning_rate": 4.788691255564732e-05, + "loss": 3.1742, + "mean_token_accuracy": 0.3310344874858856, + "step": 47545 + }, + { + "epoch": 0.04789286502068307, + "grad_norm": 14.63007412444858, + "learning_rate": 4.789194851237838e-05, + "loss": 2.6056, + "mean_token_accuracy": 0.39655172526836396, + "step": 47550 + }, + { + "epoch": 0.04789790107378724, + "grad_norm": 12.659361250741476, + "learning_rate": 4.789698446910945e-05, + "loss": 2.4788, + "mean_token_accuracy": 0.4448275864124298, + "step": 47555 + }, + { + "epoch": 0.047902937126891414, + "grad_norm": 13.995823385292335, + "learning_rate": 4.7902020425840506e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.39310344457626345, + "step": 47560 + }, + { + "epoch": 0.04790797317999559, + "grad_norm": 15.033344555188384, + "learning_rate": 4.7907056382571565e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.35862069129943847, + "step": 47565 + }, + { + "epoch": 0.04791300923309976, + "grad_norm": 11.007238044020639, + "learning_rate": 4.7912092339302625e-05, + "loss": 2.085, + "mean_token_accuracy": 0.47931034564971925, + "step": 47570 + }, + { + "epoch": 0.047918045286203935, + "grad_norm": 14.477527488645027, + "learning_rate": 4.7917128296033684e-05, + "loss": 2.7945, + "mean_token_accuracy": 0.3939655214548111, + "step": 47575 + }, + { + "epoch": 0.04792308133930811, + "grad_norm": 11.854237040539159, + "learning_rate": 4.792216425276474e-05, + "loss": 2.9349, + "mean_token_accuracy": 0.36206896901130675, + "step": 47580 + }, + { + "epoch": 0.04792811739241228, + "grad_norm": 14.073913820936744, + "learning_rate": 4.79272002094958e-05, + "loss": 2.4123, + "mean_token_accuracy": 0.41379310488700866, + "step": 47585 + }, + { + "epoch": 0.04793315344551645, + "grad_norm": 14.52593872515165, + "learning_rate": 4.793223616622686e-05, + "loss": 3.046, + "mean_token_accuracy": 0.3551724076271057, + "step": 47590 + }, + { + "epoch": 0.04793818949862062, + "grad_norm": 36.50331504247366, + "learning_rate": 4.793727212295792e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.3724137932062149, + "step": 47595 + }, + { + "epoch": 0.0479432255517248, + "grad_norm": 9.945653554107935, + "learning_rate": 4.794230807968898e-05, + "loss": 2.2149, + "mean_token_accuracy": 0.43793103098869324, + "step": 47600 + }, + { + "epoch": 0.04794826160482897, + "grad_norm": 13.7006631163599, + "learning_rate": 4.794734403642004e-05, + "loss": 2.8674, + "mean_token_accuracy": 0.3724137842655182, + "step": 47605 + }, + { + "epoch": 0.047953297657933144, + "grad_norm": 12.10631630703267, + "learning_rate": 4.7952379993151106e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.3758620619773865, + "step": 47610 + }, + { + "epoch": 0.04795833371103732, + "grad_norm": 15.816984517714396, + "learning_rate": 4.7957415949882165e-05, + "loss": 2.761, + "mean_token_accuracy": 0.3689655244350433, + "step": 47615 + }, + { + "epoch": 0.04796336976414149, + "grad_norm": 12.171990449510867, + "learning_rate": 4.796245190661322e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.3551724076271057, + "step": 47620 + }, + { + "epoch": 0.04796840581724566, + "grad_norm": 12.216730331880434, + "learning_rate": 4.7967487863344276e-05, + "loss": 2.4788, + "mean_token_accuracy": 0.3827586233615875, + "step": 47625 + }, + { + "epoch": 0.04797344187034983, + "grad_norm": 14.69145150871009, + "learning_rate": 4.7972523820075336e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.41379311084747317, + "step": 47630 + }, + { + "epoch": 0.047978477923454006, + "grad_norm": 10.446145548000876, + "learning_rate": 4.79775597768064e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.4344827592372894, + "step": 47635 + }, + { + "epoch": 0.04798351397655818, + "grad_norm": 13.405524575761477, + "learning_rate": 4.798259573353746e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.3931034505367279, + "step": 47640 + }, + { + "epoch": 0.047988550029662354, + "grad_norm": 14.946476472046575, + "learning_rate": 4.798763169026852e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.4034482777118683, + "step": 47645 + }, + { + "epoch": 0.04799358608276653, + "grad_norm": 12.995995782048258, + "learning_rate": 4.799266764699958e-05, + "loss": 2.96, + "mean_token_accuracy": 0.3448275804519653, + "step": 47650 + }, + { + "epoch": 0.0479986221358707, + "grad_norm": 15.57977090773077, + "learning_rate": 4.799770360373064e-05, + "loss": 2.6251, + "mean_token_accuracy": 0.4068965494632721, + "step": 47655 + }, + { + "epoch": 0.04800365818897487, + "grad_norm": 12.119121667074367, + "learning_rate": 4.80027395604617e-05, + "loss": 2.9725, + "mean_token_accuracy": 0.35172413289546967, + "step": 47660 + }, + { + "epoch": 0.04800869424207904, + "grad_norm": 10.204302478008904, + "learning_rate": 4.800777551719276e-05, + "loss": 2.2999, + "mean_token_accuracy": 0.4689655065536499, + "step": 47665 + }, + { + "epoch": 0.048013730295183216, + "grad_norm": 12.36361003065651, + "learning_rate": 4.8012811473923817e-05, + "loss": 2.2134, + "mean_token_accuracy": 0.4896551728248596, + "step": 47670 + }, + { + "epoch": 0.04801876634828739, + "grad_norm": 11.708252266211383, + "learning_rate": 4.8017847430654876e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.43165024518966677, + "step": 47675 + }, + { + "epoch": 0.04802380240139156, + "grad_norm": 15.19515233043819, + "learning_rate": 4.8022883387385935e-05, + "loss": 2.6984, + "mean_token_accuracy": 0.4034482777118683, + "step": 47680 + }, + { + "epoch": 0.04802883845449574, + "grad_norm": 17.662812120916435, + "learning_rate": 4.8027919344116994e-05, + "loss": 2.6104, + "mean_token_accuracy": 0.39310344159603117, + "step": 47685 + }, + { + "epoch": 0.04803387450759991, + "grad_norm": 12.970820082913875, + "learning_rate": 4.803295530084806e-05, + "loss": 2.9455, + "mean_token_accuracy": 0.3551724135875702, + "step": 47690 + }, + { + "epoch": 0.04803891056070408, + "grad_norm": 14.746908378838821, + "learning_rate": 4.803799125757912e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.45517241954803467, + "step": 47695 + }, + { + "epoch": 0.04804394661380825, + "grad_norm": 12.476280128642035, + "learning_rate": 4.804302721431018e-05, + "loss": 2.2109, + "mean_token_accuracy": 0.4275861978530884, + "step": 47700 + }, + { + "epoch": 0.048048982666912425, + "grad_norm": 12.479025086352847, + "learning_rate": 4.804806317104124e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.41379311084747317, + "step": 47705 + }, + { + "epoch": 0.0480540187200166, + "grad_norm": 11.948168911930141, + "learning_rate": 4.80530991277723e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.41554749608039854, + "step": 47710 + }, + { + "epoch": 0.04805905477312077, + "grad_norm": 28.44692819685419, + "learning_rate": 4.8058135084503357e-05, + "loss": 2.8268, + "mean_token_accuracy": 0.41034482717514037, + "step": 47715 + }, + { + "epoch": 0.048064090826224946, + "grad_norm": 10.602146776872049, + "learning_rate": 4.8063171041234416e-05, + "loss": 2.7184, + "mean_token_accuracy": 0.37586206793785093, + "step": 47720 + }, + { + "epoch": 0.04806912687932912, + "grad_norm": 11.99676469080963, + "learning_rate": 4.8068206997965475e-05, + "loss": 2.6639, + "mean_token_accuracy": 0.4172413766384125, + "step": 47725 + }, + { + "epoch": 0.04807416293243329, + "grad_norm": 13.09380050789166, + "learning_rate": 4.8073242954696534e-05, + "loss": 2.4048, + "mean_token_accuracy": 0.42758620381355283, + "step": 47730 + }, + { + "epoch": 0.04807919898553746, + "grad_norm": 29.341074537477407, + "learning_rate": 4.8078278911427594e-05, + "loss": 2.7153, + "mean_token_accuracy": 0.4517241418361664, + "step": 47735 + }, + { + "epoch": 0.048084235038641635, + "grad_norm": 11.00878637611333, + "learning_rate": 4.808331486815866e-05, + "loss": 2.5656, + "mean_token_accuracy": 0.44827585816383364, + "step": 47740 + }, + { + "epoch": 0.04808927109174581, + "grad_norm": 14.482931390313945, + "learning_rate": 4.808835082488972e-05, + "loss": 2.7436, + "mean_token_accuracy": 0.39655172228813174, + "step": 47745 + }, + { + "epoch": 0.04809430714484998, + "grad_norm": 15.894927773457258, + "learning_rate": 4.809338678162078e-05, + "loss": 2.9438, + "mean_token_accuracy": 0.3517241418361664, + "step": 47750 + }, + { + "epoch": 0.048099343197954156, + "grad_norm": 14.022630176455692, + "learning_rate": 4.809842273835183e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.4344827592372894, + "step": 47755 + }, + { + "epoch": 0.04810437925105833, + "grad_norm": 14.088833958288474, + "learning_rate": 4.810345869508289e-05, + "loss": 2.8153, + "mean_token_accuracy": 0.35862069129943847, + "step": 47760 + }, + { + "epoch": 0.048109415304162496, + "grad_norm": 13.092229916064932, + "learning_rate": 4.810849465181395e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.4137930989265442, + "step": 47765 + }, + { + "epoch": 0.04811445135726667, + "grad_norm": 14.637402825964937, + "learning_rate": 4.8113530608545015e-05, + "loss": 2.7748, + "mean_token_accuracy": 0.36551723480224607, + "step": 47770 + }, + { + "epoch": 0.048119487410370844, + "grad_norm": 12.06310851211309, + "learning_rate": 4.8118566565276074e-05, + "loss": 2.6243, + "mean_token_accuracy": 0.40000000298023225, + "step": 47775 + }, + { + "epoch": 0.04812452346347502, + "grad_norm": 9.927750803758858, + "learning_rate": 4.8123602522007134e-05, + "loss": 2.4738, + "mean_token_accuracy": 0.4482758641242981, + "step": 47780 + }, + { + "epoch": 0.04812955951657919, + "grad_norm": 16.537513908344895, + "learning_rate": 4.812863847873819e-05, + "loss": 2.7005, + "mean_token_accuracy": 0.36896551847457887, + "step": 47785 + }, + { + "epoch": 0.048134595569683365, + "grad_norm": 13.002405366863659, + "learning_rate": 4.813367443546925e-05, + "loss": 2.6624, + "mean_token_accuracy": 0.38802178502082824, + "step": 47790 + }, + { + "epoch": 0.04813963162278754, + "grad_norm": 13.58569352081164, + "learning_rate": 4.813871039220031e-05, + "loss": 2.8439, + "mean_token_accuracy": 0.358620685338974, + "step": 47795 + }, + { + "epoch": 0.048144667675891706, + "grad_norm": 13.400893208580715, + "learning_rate": 4.814374634893137e-05, + "loss": 2.7236, + "mean_token_accuracy": 0.42413793206214906, + "step": 47800 + }, + { + "epoch": 0.04814970372899588, + "grad_norm": 16.68576825066535, + "learning_rate": 4.814878230566243e-05, + "loss": 2.4215, + "mean_token_accuracy": 0.4213054239749908, + "step": 47805 + }, + { + "epoch": 0.04815473978210005, + "grad_norm": 11.452505316145178, + "learning_rate": 4.815381826239349e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.39655172228813174, + "step": 47810 + }, + { + "epoch": 0.04815977583520423, + "grad_norm": 12.314096323420344, + "learning_rate": 4.815885421912455e-05, + "loss": 3.052, + "mean_token_accuracy": 0.3827586233615875, + "step": 47815 + }, + { + "epoch": 0.0481648118883084, + "grad_norm": 16.064525852383042, + "learning_rate": 4.8163890175855614e-05, + "loss": 3.132, + "mean_token_accuracy": 0.3758620619773865, + "step": 47820 + }, + { + "epoch": 0.048169847941412575, + "grad_norm": 12.803621983478275, + "learning_rate": 4.8168926132586674e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.3793103456497192, + "step": 47825 + }, + { + "epoch": 0.04817488399451675, + "grad_norm": 13.762935052667222, + "learning_rate": 4.817396208931773e-05, + "loss": 2.7014, + "mean_token_accuracy": 0.3931034505367279, + "step": 47830 + }, + { + "epoch": 0.048179920047620915, + "grad_norm": 17.21572208679351, + "learning_rate": 4.817899804604879e-05, + "loss": 2.7142, + "mean_token_accuracy": 0.3965517282485962, + "step": 47835 + }, + { + "epoch": 0.04818495610072509, + "grad_norm": 37.567714233857586, + "learning_rate": 4.818403400277985e-05, + "loss": 3.2081, + "mean_token_accuracy": 0.320689657330513, + "step": 47840 + }, + { + "epoch": 0.04818999215382926, + "grad_norm": 11.76298402357167, + "learning_rate": 4.818906995951091e-05, + "loss": 2.7678, + "mean_token_accuracy": 0.4137930989265442, + "step": 47845 + }, + { + "epoch": 0.04819502820693344, + "grad_norm": 16.772717131885997, + "learning_rate": 4.819410591624197e-05, + "loss": 2.7511, + "mean_token_accuracy": 0.324137932062149, + "step": 47850 + }, + { + "epoch": 0.04820006426003761, + "grad_norm": 12.000196952383575, + "learning_rate": 4.819914187297303e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.37241379022598264, + "step": 47855 + }, + { + "epoch": 0.048205100313141784, + "grad_norm": 13.62305493359036, + "learning_rate": 4.820417782970409e-05, + "loss": 3.0249, + "mean_token_accuracy": 0.36551723480224607, + "step": 47860 + }, + { + "epoch": 0.04821013636624596, + "grad_norm": 12.60647579190391, + "learning_rate": 4.820921378643515e-05, + "loss": 2.822, + "mean_token_accuracy": 0.3620689630508423, + "step": 47865 + }, + { + "epoch": 0.048215172419350125, + "grad_norm": 11.891029156714671, + "learning_rate": 4.821424974316621e-05, + "loss": 2.5257, + "mean_token_accuracy": 0.3931034505367279, + "step": 47870 + }, + { + "epoch": 0.0482202084724543, + "grad_norm": 12.443899103466219, + "learning_rate": 4.821928569989727e-05, + "loss": 2.5855, + "mean_token_accuracy": 0.3689655244350433, + "step": 47875 + }, + { + "epoch": 0.04822524452555847, + "grad_norm": 13.616605615737052, + "learning_rate": 4.822432165662833e-05, + "loss": 2.6064, + "mean_token_accuracy": 0.39310344457626345, + "step": 47880 + }, + { + "epoch": 0.048230280578662646, + "grad_norm": 10.113312093342765, + "learning_rate": 4.822935761335939e-05, + "loss": 2.122, + "mean_token_accuracy": 0.4448275864124298, + "step": 47885 + }, + { + "epoch": 0.04823531663176682, + "grad_norm": 15.533841247387786, + "learning_rate": 4.8234393570090444e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.458620685338974, + "step": 47890 + }, + { + "epoch": 0.048240352684870993, + "grad_norm": 16.27341223691513, + "learning_rate": 4.82394295268215e-05, + "loss": 3.2313, + "mean_token_accuracy": 0.32758620381355286, + "step": 47895 + }, + { + "epoch": 0.04824538873797517, + "grad_norm": 12.815066340522804, + "learning_rate": 4.824446548355257e-05, + "loss": 2.7286, + "mean_token_accuracy": 0.4379310369491577, + "step": 47900 + }, + { + "epoch": 0.048250424791079334, + "grad_norm": 13.684763976126163, + "learning_rate": 4.824950144028363e-05, + "loss": 2.5653, + "mean_token_accuracy": 0.42952207922935487, + "step": 47905 + }, + { + "epoch": 0.04825546084418351, + "grad_norm": 13.699913146049308, + "learning_rate": 4.825453739701469e-05, + "loss": 2.9401, + "mean_token_accuracy": 0.36896551251411436, + "step": 47910 + }, + { + "epoch": 0.04826049689728768, + "grad_norm": 11.042239964822294, + "learning_rate": 4.825957335374575e-05, + "loss": 2.6792, + "mean_token_accuracy": 0.3689655244350433, + "step": 47915 + }, + { + "epoch": 0.048265532950391855, + "grad_norm": 13.474063338102429, + "learning_rate": 4.8264609310476806e-05, + "loss": 2.7897, + "mean_token_accuracy": 0.38620689809322356, + "step": 47920 + }, + { + "epoch": 0.04827056900349603, + "grad_norm": 13.83060275475539, + "learning_rate": 4.826964526720787e-05, + "loss": 2.595, + "mean_token_accuracy": 0.37931033968925476, + "step": 47925 + }, + { + "epoch": 0.0482756050566002, + "grad_norm": 13.622960931163657, + "learning_rate": 4.8274681223938925e-05, + "loss": 2.6993, + "mean_token_accuracy": 0.41034482717514037, + "step": 47930 + }, + { + "epoch": 0.04828064110970438, + "grad_norm": 16.05431563451574, + "learning_rate": 4.8279717180669984e-05, + "loss": 2.9244, + "mean_token_accuracy": 0.30689655244350433, + "step": 47935 + }, + { + "epoch": 0.048285677162808543, + "grad_norm": 15.072709148803987, + "learning_rate": 4.828475313740104e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.4399878978729248, + "step": 47940 + }, + { + "epoch": 0.04829071321591272, + "grad_norm": 12.231847001336279, + "learning_rate": 4.82897890941321e-05, + "loss": 2.5781, + "mean_token_accuracy": 0.4047791838645935, + "step": 47945 + }, + { + "epoch": 0.04829574926901689, + "grad_norm": 13.723842790085856, + "learning_rate": 4.829482505086316e-05, + "loss": 2.6655, + "mean_token_accuracy": 0.3620689630508423, + "step": 47950 + }, + { + "epoch": 0.048300785322121065, + "grad_norm": 13.123596574714506, + "learning_rate": 4.829986100759423e-05, + "loss": 2.7262, + "mean_token_accuracy": 0.39655172228813174, + "step": 47955 + }, + { + "epoch": 0.04830582137522524, + "grad_norm": 16.12078669152484, + "learning_rate": 4.830489696432529e-05, + "loss": 3.026, + "mean_token_accuracy": 0.36896551549434664, + "step": 47960 + }, + { + "epoch": 0.04831085742832941, + "grad_norm": 14.88394241707456, + "learning_rate": 4.8309932921056346e-05, + "loss": 2.635, + "mean_token_accuracy": 0.4068965494632721, + "step": 47965 + }, + { + "epoch": 0.048315893481433586, + "grad_norm": 11.852550459829525, + "learning_rate": 4.8314968877787406e-05, + "loss": 2.836, + "mean_token_accuracy": 0.3655172437429428, + "step": 47970 + }, + { + "epoch": 0.04832092953453775, + "grad_norm": 13.147051097520313, + "learning_rate": 4.8320004834518465e-05, + "loss": 2.908, + "mean_token_accuracy": 0.36896551251411436, + "step": 47975 + }, + { + "epoch": 0.04832596558764193, + "grad_norm": 10.290878026574449, + "learning_rate": 4.8325040791249524e-05, + "loss": 2.5569, + "mean_token_accuracy": 0.3758620619773865, + "step": 47980 + }, + { + "epoch": 0.0483310016407461, + "grad_norm": 14.911818746624565, + "learning_rate": 4.833007674798058e-05, + "loss": 2.621, + "mean_token_accuracy": 0.3793103486299515, + "step": 47985 + }, + { + "epoch": 0.048336037693850274, + "grad_norm": 15.442953974231326, + "learning_rate": 4.833511270471164e-05, + "loss": 2.8982, + "mean_token_accuracy": 0.33793103098869326, + "step": 47990 + }, + { + "epoch": 0.04834107374695445, + "grad_norm": 11.280672597002635, + "learning_rate": 4.83401486614427e-05, + "loss": 2.8518, + "mean_token_accuracy": 0.39655172228813174, + "step": 47995 + }, + { + "epoch": 0.04834610980005862, + "grad_norm": 10.7260455224953, + "learning_rate": 4.834518461817376e-05, + "loss": 2.3238, + "mean_token_accuracy": 0.4344827592372894, + "step": 48000 + }, + { + "epoch": 0.048351145853162796, + "grad_norm": 11.336672068868449, + "learning_rate": 4.835022057490483e-05, + "loss": 2.6321, + "mean_token_accuracy": 0.3896551728248596, + "step": 48005 + }, + { + "epoch": 0.04835618190626696, + "grad_norm": 11.291506943875316, + "learning_rate": 4.8355256531635886e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.42758620381355283, + "step": 48010 + }, + { + "epoch": 0.048361217959371136, + "grad_norm": 14.296863471149353, + "learning_rate": 4.8360292488366946e-05, + "loss": 2.7143, + "mean_token_accuracy": 0.35862068831920624, + "step": 48015 + }, + { + "epoch": 0.04836625401247531, + "grad_norm": 17.845974770416266, + "learning_rate": 4.8365328445098005e-05, + "loss": 2.3487, + "mean_token_accuracy": 0.4551724255084991, + "step": 48020 + }, + { + "epoch": 0.048371290065579484, + "grad_norm": 17.99301831657516, + "learning_rate": 4.837036440182906e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.42758620977401735, + "step": 48025 + }, + { + "epoch": 0.04837632611868366, + "grad_norm": 13.65630079561519, + "learning_rate": 4.837540035856012e-05, + "loss": 2.5703, + "mean_token_accuracy": 0.4034482717514038, + "step": 48030 + }, + { + "epoch": 0.04838136217178783, + "grad_norm": 14.22024069990981, + "learning_rate": 4.838043631529118e-05, + "loss": 2.4472, + "mean_token_accuracy": 0.3862069010734558, + "step": 48035 + }, + { + "epoch": 0.048386398224892005, + "grad_norm": 12.614602852936184, + "learning_rate": 4.838547227202224e-05, + "loss": 2.3851, + "mean_token_accuracy": 0.3999999940395355, + "step": 48040 + }, + { + "epoch": 0.04839143427799617, + "grad_norm": 12.225588195256037, + "learning_rate": 4.83905082287533e-05, + "loss": 2.7198, + "mean_token_accuracy": 0.3517241418361664, + "step": 48045 + }, + { + "epoch": 0.048396470331100346, + "grad_norm": 16.536903920919432, + "learning_rate": 4.839554418548436e-05, + "loss": 3.0174, + "mean_token_accuracy": 0.36896551847457887, + "step": 48050 + }, + { + "epoch": 0.04840150638420452, + "grad_norm": 11.703344554135915, + "learning_rate": 4.840058014221542e-05, + "loss": 2.2003, + "mean_token_accuracy": 0.4482758641242981, + "step": 48055 + }, + { + "epoch": 0.04840654243730869, + "grad_norm": 12.199465489470057, + "learning_rate": 4.8405616098946486e-05, + "loss": 2.508, + "mean_token_accuracy": 0.43448275327682495, + "step": 48060 + }, + { + "epoch": 0.04841157849041287, + "grad_norm": 10.736565903678072, + "learning_rate": 4.841065205567754e-05, + "loss": 2.7646, + "mean_token_accuracy": 0.3793103456497192, + "step": 48065 + }, + { + "epoch": 0.04841661454351704, + "grad_norm": 11.061564126828895, + "learning_rate": 4.84156880124086e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.34137930870056155, + "step": 48070 + }, + { + "epoch": 0.048421650596621214, + "grad_norm": 12.965823345460366, + "learning_rate": 4.842072396913966e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.35862069129943847, + "step": 48075 + }, + { + "epoch": 0.04842668664972538, + "grad_norm": 12.767373568784635, + "learning_rate": 4.8425759925870716e-05, + "loss": 2.9332, + "mean_token_accuracy": 0.4, + "step": 48080 + }, + { + "epoch": 0.048431722702829555, + "grad_norm": 11.136649343593858, + "learning_rate": 4.843079588260178e-05, + "loss": 2.3221, + "mean_token_accuracy": 0.4620689630508423, + "step": 48085 + }, + { + "epoch": 0.04843675875593373, + "grad_norm": 13.913160544286793, + "learning_rate": 4.843583183933284e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.3827586233615875, + "step": 48090 + }, + { + "epoch": 0.0484417948090379, + "grad_norm": 10.664867515244978, + "learning_rate": 4.84408677960639e-05, + "loss": 2.5048, + "mean_token_accuracy": 0.4034482777118683, + "step": 48095 + }, + { + "epoch": 0.048446830862142076, + "grad_norm": 15.33218767243514, + "learning_rate": 4.844590375279496e-05, + "loss": 2.4069, + "mean_token_accuracy": 0.41379310488700866, + "step": 48100 + }, + { + "epoch": 0.04845186691524625, + "grad_norm": 11.590038592891378, + "learning_rate": 4.845093970952602e-05, + "loss": 2.4749, + "mean_token_accuracy": 0.3827586233615875, + "step": 48105 + }, + { + "epoch": 0.048456902968350424, + "grad_norm": 13.522503788548732, + "learning_rate": 4.845597566625708e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.3655172407627106, + "step": 48110 + }, + { + "epoch": 0.04846193902145459, + "grad_norm": 13.765110744221158, + "learning_rate": 4.846101162298814e-05, + "loss": 2.4557, + "mean_token_accuracy": 0.4068965494632721, + "step": 48115 + }, + { + "epoch": 0.048466975074558764, + "grad_norm": 9.63216924131321, + "learning_rate": 4.84660475797192e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.44664247035980226, + "step": 48120 + }, + { + "epoch": 0.04847201112766294, + "grad_norm": 16.91491101255305, + "learning_rate": 4.8471083536450256e-05, + "loss": 2.5557, + "mean_token_accuracy": 0.4068965554237366, + "step": 48125 + }, + { + "epoch": 0.04847704718076711, + "grad_norm": 12.654562604229833, + "learning_rate": 4.8476119493181315e-05, + "loss": 2.9432, + "mean_token_accuracy": 0.32413792312145234, + "step": 48130 + }, + { + "epoch": 0.048482083233871286, + "grad_norm": 12.003290721499402, + "learning_rate": 4.8481155449912375e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.37931033968925476, + "step": 48135 + }, + { + "epoch": 0.04848711928697546, + "grad_norm": 11.59529740484712, + "learning_rate": 4.848619140664344e-05, + "loss": 2.2355, + "mean_token_accuracy": 0.417241370677948, + "step": 48140 + }, + { + "epoch": 0.04849215534007963, + "grad_norm": 11.644062267615753, + "learning_rate": 4.84912273633745e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.41034482717514037, + "step": 48145 + }, + { + "epoch": 0.0484971913931838, + "grad_norm": 14.941949996041856, + "learning_rate": 4.849626332010556e-05, + "loss": 2.563, + "mean_token_accuracy": 0.35862068831920624, + "step": 48150 + }, + { + "epoch": 0.048502227446287974, + "grad_norm": 15.375850917715237, + "learning_rate": 4.850129927683661e-05, + "loss": 2.9527, + "mean_token_accuracy": 0.3620689630508423, + "step": 48155 + }, + { + "epoch": 0.04850726349939215, + "grad_norm": 13.328263973384196, + "learning_rate": 4.850633523356767e-05, + "loss": 2.5813, + "mean_token_accuracy": 0.3827586233615875, + "step": 48160 + }, + { + "epoch": 0.04851229955249632, + "grad_norm": 11.742490535354298, + "learning_rate": 4.851137119029874e-05, + "loss": 2.5646, + "mean_token_accuracy": 0.4156079888343811, + "step": 48165 + }, + { + "epoch": 0.048517335605600495, + "grad_norm": 13.438952936223133, + "learning_rate": 4.8516407147029796e-05, + "loss": 2.2942, + "mean_token_accuracy": 0.4517241358757019, + "step": 48170 + }, + { + "epoch": 0.04852237165870467, + "grad_norm": 13.674030613944092, + "learning_rate": 4.8521443103760855e-05, + "loss": 2.8094, + "mean_token_accuracy": 0.4034482717514038, + "step": 48175 + }, + { + "epoch": 0.04852740771180884, + "grad_norm": 12.531782944179996, + "learning_rate": 4.8526479060491915e-05, + "loss": 2.669, + "mean_token_accuracy": 0.39310344457626345, + "step": 48180 + }, + { + "epoch": 0.04853244376491301, + "grad_norm": 12.922740689455576, + "learning_rate": 4.8531515017222974e-05, + "loss": 2.6885, + "mean_token_accuracy": 0.37931033968925476, + "step": 48185 + }, + { + "epoch": 0.04853747981801718, + "grad_norm": 14.437195888089473, + "learning_rate": 4.853655097395403e-05, + "loss": 2.569, + "mean_token_accuracy": 0.3931034505367279, + "step": 48190 + }, + { + "epoch": 0.04854251587112136, + "grad_norm": 13.954715030601593, + "learning_rate": 4.854158693068509e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.3655172407627106, + "step": 48195 + }, + { + "epoch": 0.04854755192422553, + "grad_norm": 19.736013762155707, + "learning_rate": 4.854662288741615e-05, + "loss": 2.3335, + "mean_token_accuracy": 0.4310344815254211, + "step": 48200 + }, + { + "epoch": 0.048552587977329704, + "grad_norm": 15.476636621925529, + "learning_rate": 4.855165884414721e-05, + "loss": 2.9275, + "mean_token_accuracy": 0.3551724135875702, + "step": 48205 + }, + { + "epoch": 0.04855762403043388, + "grad_norm": 15.07431611920237, + "learning_rate": 4.855669480087827e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.37586206793785093, + "step": 48210 + }, + { + "epoch": 0.04856266008353805, + "grad_norm": 8.953688372231884, + "learning_rate": 4.856173075760933e-05, + "loss": 2.213, + "mean_token_accuracy": 0.4379310369491577, + "step": 48215 + }, + { + "epoch": 0.04856769613664222, + "grad_norm": 34.78160837929185, + "learning_rate": 4.8566766714340395e-05, + "loss": 2.7686, + "mean_token_accuracy": 0.41379310488700866, + "step": 48220 + }, + { + "epoch": 0.04857273218974639, + "grad_norm": 24.264707806974492, + "learning_rate": 4.8571802671071455e-05, + "loss": 2.7362, + "mean_token_accuracy": 0.37241379022598264, + "step": 48225 + }, + { + "epoch": 0.048577768242850566, + "grad_norm": 15.161720206262045, + "learning_rate": 4.8576838627802514e-05, + "loss": 2.8926, + "mean_token_accuracy": 0.3620689630508423, + "step": 48230 + }, + { + "epoch": 0.04858280429595474, + "grad_norm": 12.990903514245483, + "learning_rate": 4.858187458453357e-05, + "loss": 2.7298, + "mean_token_accuracy": 0.38965516686439516, + "step": 48235 + }, + { + "epoch": 0.048587840349058914, + "grad_norm": 13.319839303133348, + "learning_rate": 4.858691054126463e-05, + "loss": 2.7845, + "mean_token_accuracy": 0.37241379618644715, + "step": 48240 + }, + { + "epoch": 0.04859287640216309, + "grad_norm": 15.394204246819257, + "learning_rate": 4.859194649799569e-05, + "loss": 2.7024, + "mean_token_accuracy": 0.3965517282485962, + "step": 48245 + }, + { + "epoch": 0.04859791245526726, + "grad_norm": 12.367496431187494, + "learning_rate": 4.859698245472675e-05, + "loss": 2.9963, + "mean_token_accuracy": 0.3344827502965927, + "step": 48250 + }, + { + "epoch": 0.04860294850837143, + "grad_norm": 12.259542837307565, + "learning_rate": 4.860201841145781e-05, + "loss": 2.3547, + "mean_token_accuracy": 0.41034482717514037, + "step": 48255 + }, + { + "epoch": 0.0486079845614756, + "grad_norm": 12.827897221164488, + "learning_rate": 4.860705436818887e-05, + "loss": 2.7808, + "mean_token_accuracy": 0.3310344874858856, + "step": 48260 + }, + { + "epoch": 0.048613020614579776, + "grad_norm": 15.185870826588415, + "learning_rate": 4.861209032491993e-05, + "loss": 2.9822, + "mean_token_accuracy": 0.36896551847457887, + "step": 48265 + }, + { + "epoch": 0.04861805666768395, + "grad_norm": 12.194208729305847, + "learning_rate": 4.8617126281650995e-05, + "loss": 2.2269, + "mean_token_accuracy": 0.43103448748588563, + "step": 48270 + }, + { + "epoch": 0.04862309272078812, + "grad_norm": 12.668104981739225, + "learning_rate": 4.8622162238382054e-05, + "loss": 2.5206, + "mean_token_accuracy": 0.3551724195480347, + "step": 48275 + }, + { + "epoch": 0.0486281287738923, + "grad_norm": 13.000552727158809, + "learning_rate": 4.862719819511311e-05, + "loss": 2.3085, + "mean_token_accuracy": 0.4413793087005615, + "step": 48280 + }, + { + "epoch": 0.04863316482699647, + "grad_norm": 13.0346681751547, + "learning_rate": 4.863223415184417e-05, + "loss": 2.2602, + "mean_token_accuracy": 0.46896551847457885, + "step": 48285 + }, + { + "epoch": 0.04863820088010064, + "grad_norm": 12.780303088050125, + "learning_rate": 4.8637270108575225e-05, + "loss": 2.6014, + "mean_token_accuracy": 0.34482758641242983, + "step": 48290 + }, + { + "epoch": 0.04864323693320481, + "grad_norm": 10.94737219672258, + "learning_rate": 4.8642306065306284e-05, + "loss": 2.2147, + "mean_token_accuracy": 0.41724138855934145, + "step": 48295 + }, + { + "epoch": 0.048648272986308985, + "grad_norm": 12.130738409199719, + "learning_rate": 4.864734202203735e-05, + "loss": 2.327, + "mean_token_accuracy": 0.43793103098869324, + "step": 48300 + }, + { + "epoch": 0.04865330903941316, + "grad_norm": 14.122267368545838, + "learning_rate": 4.865237797876841e-05, + "loss": 2.8448, + "mean_token_accuracy": 0.39999998807907106, + "step": 48305 + }, + { + "epoch": 0.04865834509251733, + "grad_norm": 14.716641341102592, + "learning_rate": 4.865741393549947e-05, + "loss": 3.0089, + "mean_token_accuracy": 0.334482753276825, + "step": 48310 + }, + { + "epoch": 0.048663381145621507, + "grad_norm": 12.116616078984062, + "learning_rate": 4.866244989223053e-05, + "loss": 2.8515, + "mean_token_accuracy": 0.3655172407627106, + "step": 48315 + }, + { + "epoch": 0.04866841719872568, + "grad_norm": 13.313615856370046, + "learning_rate": 4.866748584896159e-05, + "loss": 2.9, + "mean_token_accuracy": 0.3275861978530884, + "step": 48320 + }, + { + "epoch": 0.04867345325182985, + "grad_norm": 11.663066221859198, + "learning_rate": 4.867252180569265e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.4034482717514038, + "step": 48325 + }, + { + "epoch": 0.04867848930493402, + "grad_norm": 13.696278580835385, + "learning_rate": 4.8677557762423706e-05, + "loss": 2.6287, + "mean_token_accuracy": 0.41724138259887694, + "step": 48330 + }, + { + "epoch": 0.048683525358038195, + "grad_norm": 19.960609175796876, + "learning_rate": 4.8682593719154765e-05, + "loss": 2.7043, + "mean_token_accuracy": 0.3965517282485962, + "step": 48335 + }, + { + "epoch": 0.04868856141114237, + "grad_norm": 12.417478960739547, + "learning_rate": 4.8687629675885824e-05, + "loss": 2.5205, + "mean_token_accuracy": 0.36896551549434664, + "step": 48340 + }, + { + "epoch": 0.04869359746424654, + "grad_norm": 13.405255716067808, + "learning_rate": 4.8692665632616883e-05, + "loss": 3.0405, + "mean_token_accuracy": 0.3862068891525269, + "step": 48345 + }, + { + "epoch": 0.048698633517350716, + "grad_norm": 13.48379818560196, + "learning_rate": 4.869770158934795e-05, + "loss": 2.5258, + "mean_token_accuracy": 0.4310344815254211, + "step": 48350 + }, + { + "epoch": 0.04870366957045489, + "grad_norm": 12.895154840984414, + "learning_rate": 4.870273754607901e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.41034482717514037, + "step": 48355 + }, + { + "epoch": 0.048708705623559057, + "grad_norm": 13.294338109326006, + "learning_rate": 4.870777350281007e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.3379310369491577, + "step": 48360 + }, + { + "epoch": 0.04871374167666323, + "grad_norm": 13.491877913936998, + "learning_rate": 4.871280945954113e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.4482758641242981, + "step": 48365 + }, + { + "epoch": 0.048718777729767404, + "grad_norm": 13.544388200593719, + "learning_rate": 4.8717845416272186e-05, + "loss": 2.7479, + "mean_token_accuracy": 0.36896551251411436, + "step": 48370 + }, + { + "epoch": 0.04872381378287158, + "grad_norm": 11.416438468398985, + "learning_rate": 4.8722881373003246e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.3793103456497192, + "step": 48375 + }, + { + "epoch": 0.04872884983597575, + "grad_norm": 12.188407945788123, + "learning_rate": 4.8727917329734305e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.4084089457988739, + "step": 48380 + }, + { + "epoch": 0.048733885889079925, + "grad_norm": 12.292729252302372, + "learning_rate": 4.8732953286465364e-05, + "loss": 2.2203, + "mean_token_accuracy": 0.42832512259483335, + "step": 48385 + }, + { + "epoch": 0.0487389219421841, + "grad_norm": 12.448125703528053, + "learning_rate": 4.8737989243196424e-05, + "loss": 2.8356, + "mean_token_accuracy": 0.3793103456497192, + "step": 48390 + }, + { + "epoch": 0.048743957995288266, + "grad_norm": 12.621255462003862, + "learning_rate": 4.874302519992748e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.3896551728248596, + "step": 48395 + }, + { + "epoch": 0.04874899404839244, + "grad_norm": 12.392426273612912, + "learning_rate": 4.874806115665854e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.4223835408687592, + "step": 48400 + }, + { + "epoch": 0.048754030101496613, + "grad_norm": 13.418593738013328, + "learning_rate": 4.875309711338961e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.3848154813051224, + "step": 48405 + }, + { + "epoch": 0.04875906615460079, + "grad_norm": 11.680217457884503, + "learning_rate": 4.875813307012067e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.4137930989265442, + "step": 48410 + }, + { + "epoch": 0.04876410220770496, + "grad_norm": 12.549532962302548, + "learning_rate": 4.8763169026851727e-05, + "loss": 2.9229, + "mean_token_accuracy": 0.3275861978530884, + "step": 48415 + }, + { + "epoch": 0.048769138260809135, + "grad_norm": 13.239880630759775, + "learning_rate": 4.8768204983582786e-05, + "loss": 2.6379, + "mean_token_accuracy": 0.3931034505367279, + "step": 48420 + }, + { + "epoch": 0.04877417431391331, + "grad_norm": 14.94931651380599, + "learning_rate": 4.877324094031384e-05, + "loss": 2.6614, + "mean_token_accuracy": 0.39310344457626345, + "step": 48425 + }, + { + "epoch": 0.048779210367017475, + "grad_norm": 10.35937841170216, + "learning_rate": 4.8778276897044904e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.37241379618644715, + "step": 48430 + }, + { + "epoch": 0.04878424642012165, + "grad_norm": 15.778365637741251, + "learning_rate": 4.8783312853775964e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.4468239605426788, + "step": 48435 + }, + { + "epoch": 0.04878928247322582, + "grad_norm": 13.594801836797828, + "learning_rate": 4.878834881050702e-05, + "loss": 2.8545, + "mean_token_accuracy": 0.37586206793785093, + "step": 48440 + }, + { + "epoch": 0.04879431852633, + "grad_norm": 14.953599458295173, + "learning_rate": 4.879338476723808e-05, + "loss": 2.6875, + "mean_token_accuracy": 0.3965517282485962, + "step": 48445 + }, + { + "epoch": 0.04879935457943417, + "grad_norm": 10.521815384316806, + "learning_rate": 4.879842072396914e-05, + "loss": 2.7746, + "mean_token_accuracy": 0.3862069010734558, + "step": 48450 + }, + { + "epoch": 0.048804390632538344, + "grad_norm": 13.824906564932823, + "learning_rate": 4.88034566807002e-05, + "loss": 2.8319, + "mean_token_accuracy": 0.38275861740112305, + "step": 48455 + }, + { + "epoch": 0.04880942668564252, + "grad_norm": 14.314326851472151, + "learning_rate": 4.8808492637431267e-05, + "loss": 2.6559, + "mean_token_accuracy": 0.38965516686439516, + "step": 48460 + }, + { + "epoch": 0.048814462738746685, + "grad_norm": 13.06217644717499, + "learning_rate": 4.881352859416232e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.3862069010734558, + "step": 48465 + }, + { + "epoch": 0.04881949879185086, + "grad_norm": 18.250455079471973, + "learning_rate": 4.881856455089338e-05, + "loss": 3.0609, + "mean_token_accuracy": 0.3655172407627106, + "step": 48470 + }, + { + "epoch": 0.04882453484495503, + "grad_norm": 13.524749735949026, + "learning_rate": 4.882360050762444e-05, + "loss": 2.7834, + "mean_token_accuracy": 0.34137930572032926, + "step": 48475 + }, + { + "epoch": 0.048829570898059206, + "grad_norm": 12.571227692156853, + "learning_rate": 4.88286364643555e-05, + "loss": 2.8615, + "mean_token_accuracy": 0.37241379022598264, + "step": 48480 + }, + { + "epoch": 0.04883460695116338, + "grad_norm": 13.315856263625749, + "learning_rate": 4.883367242108656e-05, + "loss": 2.5918, + "mean_token_accuracy": 0.4068965554237366, + "step": 48485 + }, + { + "epoch": 0.048839643004267554, + "grad_norm": 15.008201898662728, + "learning_rate": 4.883870837781762e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.4000000059604645, + "step": 48490 + }, + { + "epoch": 0.04884467905737173, + "grad_norm": 12.232209408897178, + "learning_rate": 4.884374433454868e-05, + "loss": 2.4763, + "mean_token_accuracy": 0.4172413766384125, + "step": 48495 + }, + { + "epoch": 0.048849715110475894, + "grad_norm": 14.255163908103704, + "learning_rate": 4.884878029127974e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.3793103456497192, + "step": 48500 + }, + { + "epoch": 0.04885475116358007, + "grad_norm": 13.768624316857094, + "learning_rate": 4.88538162480108e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.42607381343841555, + "step": 48505 + }, + { + "epoch": 0.04885978721668424, + "grad_norm": 11.985706766565356, + "learning_rate": 4.885885220474186e-05, + "loss": 2.4073, + "mean_token_accuracy": 0.3741681814193726, + "step": 48510 + }, + { + "epoch": 0.048864823269788416, + "grad_norm": 14.2329004019485, + "learning_rate": 4.886388816147292e-05, + "loss": 3.1628, + "mean_token_accuracy": 0.32758620381355286, + "step": 48515 + }, + { + "epoch": 0.04886985932289259, + "grad_norm": 11.688248127140819, + "learning_rate": 4.886892411820398e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.40000000298023225, + "step": 48520 + }, + { + "epoch": 0.04887489537599676, + "grad_norm": 12.424367045858348, + "learning_rate": 4.887396007493504e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.37241379618644715, + "step": 48525 + }, + { + "epoch": 0.04887993142910094, + "grad_norm": 13.673095229947196, + "learning_rate": 4.8878996031666096e-05, + "loss": 2.5992, + "mean_token_accuracy": 0.33448275923728943, + "step": 48530 + }, + { + "epoch": 0.048884967482205104, + "grad_norm": 10.333840333112141, + "learning_rate": 4.8884031988397155e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.41379310488700866, + "step": 48535 + }, + { + "epoch": 0.04889000353530928, + "grad_norm": 13.465473085376816, + "learning_rate": 4.888906794512822e-05, + "loss": 2.4419, + "mean_token_accuracy": 0.41034482717514037, + "step": 48540 + }, + { + "epoch": 0.04889503958841345, + "grad_norm": 15.7055676822973, + "learning_rate": 4.889410390185928e-05, + "loss": 3.1634, + "mean_token_accuracy": 0.31379310190677645, + "step": 48545 + }, + { + "epoch": 0.048900075641517625, + "grad_norm": 12.217448120901466, + "learning_rate": 4.889913985859034e-05, + "loss": 2.6261, + "mean_token_accuracy": 0.3931034505367279, + "step": 48550 + }, + { + "epoch": 0.0489051116946218, + "grad_norm": 13.35142035782145, + "learning_rate": 4.89041758153214e-05, + "loss": 3.1392, + "mean_token_accuracy": 0.34827586114406583, + "step": 48555 + }, + { + "epoch": 0.04891014774772597, + "grad_norm": 10.733218452025508, + "learning_rate": 4.890921177205245e-05, + "loss": 2.4871, + "mean_token_accuracy": 0.38620689511299133, + "step": 48560 + }, + { + "epoch": 0.048915183800830146, + "grad_norm": 13.58158465293678, + "learning_rate": 4.891424772878352e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.41724138259887694, + "step": 48565 + }, + { + "epoch": 0.04892021985393431, + "grad_norm": 10.826982571655027, + "learning_rate": 4.891928368551458e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.4379310369491577, + "step": 48570 + }, + { + "epoch": 0.04892525590703849, + "grad_norm": 14.165362100668998, + "learning_rate": 4.8924319642245636e-05, + "loss": 2.6641, + "mean_token_accuracy": 0.4034482717514038, + "step": 48575 + }, + { + "epoch": 0.04893029196014266, + "grad_norm": 12.976475870644922, + "learning_rate": 4.8929355598976695e-05, + "loss": 3.0219, + "mean_token_accuracy": 0.35287356078624726, + "step": 48580 + }, + { + "epoch": 0.048935328013246834, + "grad_norm": 9.84799880495947, + "learning_rate": 4.8934391555707755e-05, + "loss": 2.2046, + "mean_token_accuracy": 0.42413792610168455, + "step": 48585 + }, + { + "epoch": 0.04894036406635101, + "grad_norm": 17.9686088498197, + "learning_rate": 4.893942751243882e-05, + "loss": 3.1188, + "mean_token_accuracy": 0.35172412991523744, + "step": 48590 + }, + { + "epoch": 0.04894540011945518, + "grad_norm": 11.147522915979303, + "learning_rate": 4.894446346916988e-05, + "loss": 2.7349, + "mean_token_accuracy": 0.324137932062149, + "step": 48595 + }, + { + "epoch": 0.048950436172559356, + "grad_norm": 14.18028834326604, + "learning_rate": 4.894949942590093e-05, + "loss": 2.6525, + "mean_token_accuracy": 0.36551724672317504, + "step": 48600 + }, + { + "epoch": 0.04895547222566352, + "grad_norm": 13.937273308945302, + "learning_rate": 4.895453538263199e-05, + "loss": 2.6352, + "mean_token_accuracy": 0.40307881832122805, + "step": 48605 + }, + { + "epoch": 0.048960508278767696, + "grad_norm": 12.602036334793599, + "learning_rate": 4.895957133936305e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.3551724135875702, + "step": 48610 + }, + { + "epoch": 0.04896554433187187, + "grad_norm": 39.03827948099858, + "learning_rate": 4.896460729609411e-05, + "loss": 2.7775, + "mean_token_accuracy": 0.3827586233615875, + "step": 48615 + }, + { + "epoch": 0.048970580384976044, + "grad_norm": 11.693095912421027, + "learning_rate": 4.8969643252825176e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.43103448748588563, + "step": 48620 + }, + { + "epoch": 0.04897561643808022, + "grad_norm": 16.908001976639614, + "learning_rate": 4.8974679209556236e-05, + "loss": 2.641, + "mean_token_accuracy": 0.384694492816925, + "step": 48625 + }, + { + "epoch": 0.04898065249118439, + "grad_norm": 12.129049762220168, + "learning_rate": 4.8979715166287295e-05, + "loss": 2.3119, + "mean_token_accuracy": 0.4275861978530884, + "step": 48630 + }, + { + "epoch": 0.048985688544288565, + "grad_norm": 15.713237207363157, + "learning_rate": 4.8984751123018354e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.3965517282485962, + "step": 48635 + }, + { + "epoch": 0.04899072459739273, + "grad_norm": 15.441070874542985, + "learning_rate": 4.898978707974941e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.36896551847457887, + "step": 48640 + }, + { + "epoch": 0.048995760650496906, + "grad_norm": 10.696272089903482, + "learning_rate": 4.899482303648047e-05, + "loss": 2.771, + "mean_token_accuracy": 0.3965517163276672, + "step": 48645 + }, + { + "epoch": 0.04900079670360108, + "grad_norm": 12.335197204442453, + "learning_rate": 4.899985899321153e-05, + "loss": 2.7099, + "mean_token_accuracy": 0.41034482717514037, + "step": 48650 + }, + { + "epoch": 0.04900583275670525, + "grad_norm": 11.171296736601489, + "learning_rate": 4.900489494994259e-05, + "loss": 2.2138, + "mean_token_accuracy": 0.417241370677948, + "step": 48655 + }, + { + "epoch": 0.04901086880980943, + "grad_norm": 11.07551264107439, + "learning_rate": 4.900993090667365e-05, + "loss": 2.6168, + "mean_token_accuracy": 0.4603750824928284, + "step": 48660 + }, + { + "epoch": 0.0490159048629136, + "grad_norm": 11.41291529272933, + "learning_rate": 4.901496686340471e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.42758620381355283, + "step": 48665 + }, + { + "epoch": 0.049020940916017774, + "grad_norm": 12.35846193357575, + "learning_rate": 4.9020002820135776e-05, + "loss": 2.7803, + "mean_token_accuracy": 0.35862069129943847, + "step": 48670 + }, + { + "epoch": 0.04902597696912194, + "grad_norm": 13.452914487071011, + "learning_rate": 4.9025038776866835e-05, + "loss": 2.9799, + "mean_token_accuracy": 0.39655172228813174, + "step": 48675 + }, + { + "epoch": 0.049031013022226115, + "grad_norm": 10.667367877039903, + "learning_rate": 4.9030074733597894e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.44313369393348695, + "step": 48680 + }, + { + "epoch": 0.04903604907533029, + "grad_norm": 14.034089187491649, + "learning_rate": 4.903511069032895e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.41724138259887694, + "step": 48685 + }, + { + "epoch": 0.04904108512843446, + "grad_norm": 16.065601693166588, + "learning_rate": 4.9040146647060006e-05, + "loss": 2.4202, + "mean_token_accuracy": 0.36896551847457887, + "step": 48690 + }, + { + "epoch": 0.049046121181538636, + "grad_norm": 16.291617589586618, + "learning_rate": 4.904518260379107e-05, + "loss": 2.7196, + "mean_token_accuracy": 0.3793103456497192, + "step": 48695 + }, + { + "epoch": 0.04905115723464281, + "grad_norm": 12.597348660149816, + "learning_rate": 4.905021856052213e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.39310344457626345, + "step": 48700 + }, + { + "epoch": 0.049056193287746984, + "grad_norm": 13.38408563963703, + "learning_rate": 4.905525451725319e-05, + "loss": 2.5155, + "mean_token_accuracy": 0.43793103098869324, + "step": 48705 + }, + { + "epoch": 0.04906122934085115, + "grad_norm": 14.05694404360121, + "learning_rate": 4.906029047398425e-05, + "loss": 2.4763, + "mean_token_accuracy": 0.3965517282485962, + "step": 48710 + }, + { + "epoch": 0.049066265393955324, + "grad_norm": 11.818800615651364, + "learning_rate": 4.906532643071531e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.41724138259887694, + "step": 48715 + }, + { + "epoch": 0.0490713014470595, + "grad_norm": 16.28295995476592, + "learning_rate": 4.907036238744637e-05, + "loss": 2.9192, + "mean_token_accuracy": 0.32758620381355286, + "step": 48720 + }, + { + "epoch": 0.04907633750016367, + "grad_norm": 17.12050061095547, + "learning_rate": 4.9075398344177434e-05, + "loss": 2.6422, + "mean_token_accuracy": 0.41724138855934145, + "step": 48725 + }, + { + "epoch": 0.049081373553267846, + "grad_norm": 12.037829431397777, + "learning_rate": 4.9080434300908487e-05, + "loss": 2.761, + "mean_token_accuracy": 0.3655172407627106, + "step": 48730 + }, + { + "epoch": 0.04908640960637202, + "grad_norm": 11.299898431584603, + "learning_rate": 4.9085470257639546e-05, + "loss": 2.7494, + "mean_token_accuracy": 0.37241379022598264, + "step": 48735 + }, + { + "epoch": 0.04909144565947619, + "grad_norm": 14.026421664454766, + "learning_rate": 4.9090506214370605e-05, + "loss": 2.6895, + "mean_token_accuracy": 0.39655172228813174, + "step": 48740 + }, + { + "epoch": 0.04909648171258036, + "grad_norm": 11.161789309102504, + "learning_rate": 4.9095542171101664e-05, + "loss": 1.9768, + "mean_token_accuracy": 0.47241379618644713, + "step": 48745 + }, + { + "epoch": 0.049101517765684534, + "grad_norm": 15.256682681278365, + "learning_rate": 4.910057812783273e-05, + "loss": 2.6676, + "mean_token_accuracy": 0.4448275864124298, + "step": 48750 + }, + { + "epoch": 0.04910655381878871, + "grad_norm": 12.563981049323946, + "learning_rate": 4.910561408456379e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.42758620977401735, + "step": 48755 + }, + { + "epoch": 0.04911158987189288, + "grad_norm": 13.200370300176669, + "learning_rate": 4.911065004129485e-05, + "loss": 2.8527, + "mean_token_accuracy": 0.37241379022598264, + "step": 48760 + }, + { + "epoch": 0.049116625924997055, + "grad_norm": 12.040463218394537, + "learning_rate": 4.911568599802591e-05, + "loss": 2.5456, + "mean_token_accuracy": 0.4241379201412201, + "step": 48765 + }, + { + "epoch": 0.04912166197810123, + "grad_norm": 11.900442597829505, + "learning_rate": 4.912072195475697e-05, + "loss": 2.562, + "mean_token_accuracy": 0.3551724135875702, + "step": 48770 + }, + { + "epoch": 0.0491266980312054, + "grad_norm": 10.561517018587763, + "learning_rate": 4.912575791148803e-05, + "loss": 2.6782, + "mean_token_accuracy": 0.37241379618644715, + "step": 48775 + }, + { + "epoch": 0.04913173408430957, + "grad_norm": 14.577239316877046, + "learning_rate": 4.9130793868219086e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.40508166551589964, + "step": 48780 + }, + { + "epoch": 0.04913677013741374, + "grad_norm": 14.221353782661387, + "learning_rate": 4.9135829824950145e-05, + "loss": 3.3963, + "mean_token_accuracy": 0.2931034505367279, + "step": 48785 + }, + { + "epoch": 0.04914180619051792, + "grad_norm": 10.390088581283837, + "learning_rate": 4.9140865781681204e-05, + "loss": 2.7245, + "mean_token_accuracy": 0.39655172228813174, + "step": 48790 + }, + { + "epoch": 0.04914684224362209, + "grad_norm": 10.861266271960327, + "learning_rate": 4.9145901738412264e-05, + "loss": 2.6406, + "mean_token_accuracy": 0.4172413766384125, + "step": 48795 + }, + { + "epoch": 0.049151878296726265, + "grad_norm": 13.615060606335325, + "learning_rate": 4.915093769514332e-05, + "loss": 2.8556, + "mean_token_accuracy": 0.38965516686439516, + "step": 48800 + }, + { + "epoch": 0.04915691434983044, + "grad_norm": 14.268334857568226, + "learning_rate": 4.915597365187439e-05, + "loss": 2.7183, + "mean_token_accuracy": 0.3724137932062149, + "step": 48805 + }, + { + "epoch": 0.04916195040293461, + "grad_norm": 13.416458486468091, + "learning_rate": 4.916100960860545e-05, + "loss": 3.0008, + "mean_token_accuracy": 0.34482758641242983, + "step": 48810 + }, + { + "epoch": 0.04916698645603878, + "grad_norm": 13.38075914762203, + "learning_rate": 4.916604556533651e-05, + "loss": 2.9544, + "mean_token_accuracy": 0.37241379022598264, + "step": 48815 + }, + { + "epoch": 0.04917202250914295, + "grad_norm": 13.019920543432152, + "learning_rate": 4.917108152206757e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.42758620977401735, + "step": 48820 + }, + { + "epoch": 0.049177058562247127, + "grad_norm": 14.57767112428131, + "learning_rate": 4.917611747879862e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.43793103098869324, + "step": 48825 + }, + { + "epoch": 0.0491820946153513, + "grad_norm": 12.820668338971199, + "learning_rate": 4.9181153435529685e-05, + "loss": 2.55, + "mean_token_accuracy": 0.4, + "step": 48830 + }, + { + "epoch": 0.049187130668455474, + "grad_norm": 45.18701672184421, + "learning_rate": 4.9186189392260744e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.42068966031074523, + "step": 48835 + }, + { + "epoch": 0.04919216672155965, + "grad_norm": 11.9466223325721, + "learning_rate": 4.9191225348991804e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.441379314661026, + "step": 48840 + }, + { + "epoch": 0.04919720277466382, + "grad_norm": 9.560043074615674, + "learning_rate": 4.919626130572286e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.37241379022598264, + "step": 48845 + }, + { + "epoch": 0.04920223882776799, + "grad_norm": 11.659676178487606, + "learning_rate": 4.920129726245392e-05, + "loss": 2.7371, + "mean_token_accuracy": 0.42758620381355283, + "step": 48850 + }, + { + "epoch": 0.04920727488087216, + "grad_norm": 40.276321061043596, + "learning_rate": 4.920633321918499e-05, + "loss": 2.6005, + "mean_token_accuracy": 0.4253478586673737, + "step": 48855 + }, + { + "epoch": 0.049212310933976336, + "grad_norm": 10.408309385179242, + "learning_rate": 4.921136917591605e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.3931034505367279, + "step": 48860 + }, + { + "epoch": 0.04921734698708051, + "grad_norm": 14.321151651783058, + "learning_rate": 4.92164051326471e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.4482758641242981, + "step": 48865 + }, + { + "epoch": 0.04922238304018468, + "grad_norm": 13.600447757095006, + "learning_rate": 4.922144108937816e-05, + "loss": 2.593, + "mean_token_accuracy": 0.38275861740112305, + "step": 48870 + }, + { + "epoch": 0.04922741909328886, + "grad_norm": 12.751175659898864, + "learning_rate": 4.922647704610922e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.3862068891525269, + "step": 48875 + }, + { + "epoch": 0.04923245514639303, + "grad_norm": 12.11222444077847, + "learning_rate": 4.923151300284028e-05, + "loss": 2.3035, + "mean_token_accuracy": 0.3965517282485962, + "step": 48880 + }, + { + "epoch": 0.0492374911994972, + "grad_norm": 12.144342147668807, + "learning_rate": 4.9236548959571344e-05, + "loss": 3.0019, + "mean_token_accuracy": 0.39655172228813174, + "step": 48885 + }, + { + "epoch": 0.04924252725260137, + "grad_norm": 12.14491031470454, + "learning_rate": 4.92415849163024e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.3945553541183472, + "step": 48890 + }, + { + "epoch": 0.049247563305705545, + "grad_norm": 11.795610286509646, + "learning_rate": 4.924662087303346e-05, + "loss": 2.584, + "mean_token_accuracy": 0.3965517282485962, + "step": 48895 + }, + { + "epoch": 0.04925259935880972, + "grad_norm": 16.301345551630295, + "learning_rate": 4.925165682976452e-05, + "loss": 2.7685, + "mean_token_accuracy": 0.39310344159603117, + "step": 48900 + }, + { + "epoch": 0.04925763541191389, + "grad_norm": 15.19144555577583, + "learning_rate": 4.925669278649558e-05, + "loss": 2.5623, + "mean_token_accuracy": 0.42413793206214906, + "step": 48905 + }, + { + "epoch": 0.04926267146501807, + "grad_norm": 21.835471087718506, + "learning_rate": 4.926172874322664e-05, + "loss": 2.8131, + "mean_token_accuracy": 0.3965517163276672, + "step": 48910 + }, + { + "epoch": 0.04926770751812224, + "grad_norm": 17.810675516982396, + "learning_rate": 4.92667646999577e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.37241379022598264, + "step": 48915 + }, + { + "epoch": 0.04927274357122641, + "grad_norm": 12.504608025082877, + "learning_rate": 4.927180065668876e-05, + "loss": 2.2541, + "mean_token_accuracy": 0.42068964838981626, + "step": 48920 + }, + { + "epoch": 0.04927777962433058, + "grad_norm": 12.89568499758129, + "learning_rate": 4.927683661341982e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4379310250282288, + "step": 48925 + }, + { + "epoch": 0.049282815677434755, + "grad_norm": 13.055089932535353, + "learning_rate": 4.928187257015088e-05, + "loss": 2.8171, + "mean_token_accuracy": 0.41034482717514037, + "step": 48930 + }, + { + "epoch": 0.04928785173053893, + "grad_norm": 13.623002976530392, + "learning_rate": 4.928690852688194e-05, + "loss": 3.1556, + "mean_token_accuracy": 0.3413793116807938, + "step": 48935 + }, + { + "epoch": 0.0492928877836431, + "grad_norm": 12.116065984149035, + "learning_rate": 4.9291944483613e-05, + "loss": 2.6422, + "mean_token_accuracy": 0.3620689630508423, + "step": 48940 + }, + { + "epoch": 0.049297923836747276, + "grad_norm": 12.169340069722987, + "learning_rate": 4.929698044034406e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.3862068921327591, + "step": 48945 + }, + { + "epoch": 0.04930295988985145, + "grad_norm": 13.726568890272771, + "learning_rate": 4.930201639707512e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.4448275864124298, + "step": 48950 + }, + { + "epoch": 0.04930799594295562, + "grad_norm": 14.65710590527926, + "learning_rate": 4.930705235380618e-05, + "loss": 2.5824, + "mean_token_accuracy": 0.4068965554237366, + "step": 48955 + }, + { + "epoch": 0.04931303199605979, + "grad_norm": 17.17793757785788, + "learning_rate": 4.931208831053723e-05, + "loss": 3.0452, + "mean_token_accuracy": 0.37392619252204895, + "step": 48960 + }, + { + "epoch": 0.049318068049163964, + "grad_norm": 10.512203447989698, + "learning_rate": 4.93171242672683e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.4103448152542114, + "step": 48965 + }, + { + "epoch": 0.04932310410226814, + "grad_norm": 13.511176085946092, + "learning_rate": 4.932216022399936e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.4172413766384125, + "step": 48970 + }, + { + "epoch": 0.04932814015537231, + "grad_norm": 11.07085745044113, + "learning_rate": 4.932719618073042e-05, + "loss": 2.8035, + "mean_token_accuracy": 0.41379310488700866, + "step": 48975 + }, + { + "epoch": 0.049333176208476485, + "grad_norm": 12.226242046400936, + "learning_rate": 4.9332232137461476e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.43647912740707395, + "step": 48980 + }, + { + "epoch": 0.04933821226158066, + "grad_norm": 14.929548591129555, + "learning_rate": 4.9337268094192536e-05, + "loss": 2.9902, + "mean_token_accuracy": 0.36551723778247835, + "step": 48985 + }, + { + "epoch": 0.049343248314684826, + "grad_norm": 13.347567918108984, + "learning_rate": 4.93423040509236e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.4261342972517014, + "step": 48990 + }, + { + "epoch": 0.049348284367789, + "grad_norm": 18.38620285207596, + "learning_rate": 4.934734000765466e-05, + "loss": 3.002, + "mean_token_accuracy": 0.35862069129943847, + "step": 48995 + }, + { + "epoch": 0.049353320420893174, + "grad_norm": 12.39865944592867, + "learning_rate": 4.935237596438571e-05, + "loss": 2.9165, + "mean_token_accuracy": 0.33448275923728943, + "step": 49000 + }, + { + "epoch": 0.04935835647399735, + "grad_norm": 11.939019831500048, + "learning_rate": 4.935741192111677e-05, + "loss": 2.6814, + "mean_token_accuracy": 0.4034482777118683, + "step": 49005 + }, + { + "epoch": 0.04936339252710152, + "grad_norm": 16.12697505582482, + "learning_rate": 4.936244787784783e-05, + "loss": 2.8692, + "mean_token_accuracy": 0.35517241060733795, + "step": 49010 + }, + { + "epoch": 0.049368428580205695, + "grad_norm": 12.689607770485951, + "learning_rate": 4.93674838345789e-05, + "loss": 2.5232, + "mean_token_accuracy": 0.4000000059604645, + "step": 49015 + }, + { + "epoch": 0.04937346463330987, + "grad_norm": 10.886131405544253, + "learning_rate": 4.937251979130996e-05, + "loss": 2.2615, + "mean_token_accuracy": 0.46551724076271056, + "step": 49020 + }, + { + "epoch": 0.049378500686414036, + "grad_norm": 13.417360271090889, + "learning_rate": 4.9377555748041016e-05, + "loss": 2.5904, + "mean_token_accuracy": 0.42601330280303956, + "step": 49025 + }, + { + "epoch": 0.04938353673951821, + "grad_norm": 16.915400960677378, + "learning_rate": 4.9382591704772076e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.43793103098869324, + "step": 49030 + }, + { + "epoch": 0.04938857279262238, + "grad_norm": 12.704193536934127, + "learning_rate": 4.9387627661503135e-05, + "loss": 2.6771, + "mean_token_accuracy": 0.41379310488700866, + "step": 49035 + }, + { + "epoch": 0.04939360884572656, + "grad_norm": 11.972975676578281, + "learning_rate": 4.9392663618234194e-05, + "loss": 2.8462, + "mean_token_accuracy": 0.3551724076271057, + "step": 49040 + }, + { + "epoch": 0.04939864489883073, + "grad_norm": 9.949013891257787, + "learning_rate": 4.9397699574965253e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.42413793206214906, + "step": 49045 + }, + { + "epoch": 0.049403680951934904, + "grad_norm": 11.071053548273435, + "learning_rate": 4.940273553169631e-05, + "loss": 2.7129, + "mean_token_accuracy": 0.37241379618644715, + "step": 49050 + }, + { + "epoch": 0.04940871700503908, + "grad_norm": 15.965791486524349, + "learning_rate": 4.940777148842737e-05, + "loss": 2.8587, + "mean_token_accuracy": 0.3620689630508423, + "step": 49055 + }, + { + "epoch": 0.049413753058143245, + "grad_norm": 13.68915150265036, + "learning_rate": 4.941280744515843e-05, + "loss": 2.3533, + "mean_token_accuracy": 0.4241379380226135, + "step": 49060 + }, + { + "epoch": 0.04941878911124742, + "grad_norm": 18.903890479113123, + "learning_rate": 4.941784340188949e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.3793103337287903, + "step": 49065 + }, + { + "epoch": 0.04942382516435159, + "grad_norm": 10.80566940095437, + "learning_rate": 4.9422879358620556e-05, + "loss": 2.2117, + "mean_token_accuracy": 0.4890566885471344, + "step": 49070 + }, + { + "epoch": 0.049428861217455766, + "grad_norm": 11.45007364890809, + "learning_rate": 4.9427915315351616e-05, + "loss": 2.7231, + "mean_token_accuracy": 0.3655172407627106, + "step": 49075 + }, + { + "epoch": 0.04943389727055994, + "grad_norm": 12.875678811664686, + "learning_rate": 4.9432951272082675e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.41034482717514037, + "step": 49080 + }, + { + "epoch": 0.049438933323664114, + "grad_norm": 12.504291587564936, + "learning_rate": 4.9437987228813734e-05, + "loss": 2.7649, + "mean_token_accuracy": 0.3793103456497192, + "step": 49085 + }, + { + "epoch": 0.04944396937676829, + "grad_norm": 17.23653916001431, + "learning_rate": 4.9443023185544793e-05, + "loss": 2.971, + "mean_token_accuracy": 0.35862069129943847, + "step": 49090 + }, + { + "epoch": 0.049449005429872454, + "grad_norm": 16.302425491255768, + "learning_rate": 4.944805914227585e-05, + "loss": 2.7943, + "mean_token_accuracy": 0.41379310488700866, + "step": 49095 + }, + { + "epoch": 0.04945404148297663, + "grad_norm": 13.830579276455648, + "learning_rate": 4.945309509900691e-05, + "loss": 2.7077, + "mean_token_accuracy": 0.41034482717514037, + "step": 49100 + }, + { + "epoch": 0.0494590775360808, + "grad_norm": 9.24250120260068, + "learning_rate": 4.945813105573797e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.4103448212146759, + "step": 49105 + }, + { + "epoch": 0.049464113589184976, + "grad_norm": 13.30485365600019, + "learning_rate": 4.946316701246903e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4, + "step": 49110 + }, + { + "epoch": 0.04946914964228915, + "grad_norm": 13.226046901555446, + "learning_rate": 4.946820296920009e-05, + "loss": 2.619, + "mean_token_accuracy": 0.37586206793785093, + "step": 49115 + }, + { + "epoch": 0.04947418569539332, + "grad_norm": 10.39212014048255, + "learning_rate": 4.9473238925931156e-05, + "loss": 2.7365, + "mean_token_accuracy": 0.39310344457626345, + "step": 49120 + }, + { + "epoch": 0.0494792217484975, + "grad_norm": 13.625839027020147, + "learning_rate": 4.9478274882662215e-05, + "loss": 2.7067, + "mean_token_accuracy": 0.3620689630508423, + "step": 49125 + }, + { + "epoch": 0.049484257801601664, + "grad_norm": 12.841702873957686, + "learning_rate": 4.9483310839393274e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.4344827651977539, + "step": 49130 + }, + { + "epoch": 0.04948929385470584, + "grad_norm": 12.334404174880499, + "learning_rate": 4.948834679612433e-05, + "loss": 2.1043, + "mean_token_accuracy": 0.47241378426551817, + "step": 49135 + }, + { + "epoch": 0.04949432990781001, + "grad_norm": 15.09710601585035, + "learning_rate": 4.9493382752855386e-05, + "loss": 2.9059, + "mean_token_accuracy": 0.3379310369491577, + "step": 49140 + }, + { + "epoch": 0.049499365960914185, + "grad_norm": 13.027265673330593, + "learning_rate": 4.9498418709586445e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.41379310488700866, + "step": 49145 + }, + { + "epoch": 0.04950440201401836, + "grad_norm": 12.068235801694266, + "learning_rate": 4.950345466631751e-05, + "loss": 2.9198, + "mean_token_accuracy": 0.3551724135875702, + "step": 49150 + }, + { + "epoch": 0.04950943806712253, + "grad_norm": 10.92990075005881, + "learning_rate": 4.950849062304857e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.4413793087005615, + "step": 49155 + }, + { + "epoch": 0.049514474120226706, + "grad_norm": 18.296693353236435, + "learning_rate": 4.951352657977963e-05, + "loss": 3.1022, + "mean_token_accuracy": 0.3103448212146759, + "step": 49160 + }, + { + "epoch": 0.04951951017333087, + "grad_norm": 12.49348015563861, + "learning_rate": 4.951856253651069e-05, + "loss": 3.0282, + "mean_token_accuracy": 0.37241379022598264, + "step": 49165 + }, + { + "epoch": 0.04952454622643505, + "grad_norm": 12.969289920558253, + "learning_rate": 4.952359849324175e-05, + "loss": 3.0352, + "mean_token_accuracy": 0.3275862127542496, + "step": 49170 + }, + { + "epoch": 0.04952958227953922, + "grad_norm": 13.008677609167234, + "learning_rate": 4.952863444997281e-05, + "loss": 3.4216, + "mean_token_accuracy": 0.32413792610168457, + "step": 49175 + }, + { + "epoch": 0.049534618332643394, + "grad_norm": 11.948299790792138, + "learning_rate": 4.953367040670387e-05, + "loss": 3.4276, + "mean_token_accuracy": 0.30689655244350433, + "step": 49180 + }, + { + "epoch": 0.04953965438574757, + "grad_norm": 13.232044398199198, + "learning_rate": 4.9538706363434926e-05, + "loss": 2.7483, + "mean_token_accuracy": 0.38620689511299133, + "step": 49185 + }, + { + "epoch": 0.04954469043885174, + "grad_norm": 12.072298961707434, + "learning_rate": 4.9543742320165985e-05, + "loss": 2.9151, + "mean_token_accuracy": 0.3793103337287903, + "step": 49190 + }, + { + "epoch": 0.049549726491955916, + "grad_norm": 13.992984048405518, + "learning_rate": 4.9548778276897045e-05, + "loss": 2.8701, + "mean_token_accuracy": 0.32068965435028074, + "step": 49195 + }, + { + "epoch": 0.04955476254506008, + "grad_norm": 16.055490129836002, + "learning_rate": 4.955381423362811e-05, + "loss": 2.4514, + "mean_token_accuracy": 0.42758620977401735, + "step": 49200 + }, + { + "epoch": 0.049559798598164256, + "grad_norm": 11.903501170180727, + "learning_rate": 4.955885019035917e-05, + "loss": 2.5416, + "mean_token_accuracy": 0.4344827592372894, + "step": 49205 + }, + { + "epoch": 0.04956483465126843, + "grad_norm": 17.413304193082453, + "learning_rate": 4.956388614709023e-05, + "loss": 2.8184, + "mean_token_accuracy": 0.38620689511299133, + "step": 49210 + }, + { + "epoch": 0.049569870704372604, + "grad_norm": 13.238645675444548, + "learning_rate": 4.956892210382129e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.43793103098869324, + "step": 49215 + }, + { + "epoch": 0.04957490675747678, + "grad_norm": 12.327412690427577, + "learning_rate": 4.957395806055235e-05, + "loss": 2.742, + "mean_token_accuracy": 0.38620689511299133, + "step": 49220 + }, + { + "epoch": 0.04957994281058095, + "grad_norm": 14.263868970130424, + "learning_rate": 4.95789940172834e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.4068965494632721, + "step": 49225 + }, + { + "epoch": 0.049584978863685125, + "grad_norm": 11.108085205111877, + "learning_rate": 4.9584029974014466e-05, + "loss": 2.6186, + "mean_token_accuracy": 0.39655172228813174, + "step": 49230 + }, + { + "epoch": 0.04959001491678929, + "grad_norm": 12.901059445419929, + "learning_rate": 4.9589065930745525e-05, + "loss": 2.6966, + "mean_token_accuracy": 0.3689655244350433, + "step": 49235 + }, + { + "epoch": 0.049595050969893466, + "grad_norm": 12.543300954528458, + "learning_rate": 4.9594101887476585e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.458620685338974, + "step": 49240 + }, + { + "epoch": 0.04960008702299764, + "grad_norm": 17.400105066019382, + "learning_rate": 4.9599137844207644e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.45402299165725707, + "step": 49245 + }, + { + "epoch": 0.04960512307610181, + "grad_norm": 13.30602468258391, + "learning_rate": 4.96041738009387e-05, + "loss": 2.5902, + "mean_token_accuracy": 0.39310344457626345, + "step": 49250 + }, + { + "epoch": 0.04961015912920599, + "grad_norm": 11.124328629447813, + "learning_rate": 4.960920975766977e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.3931034505367279, + "step": 49255 + }, + { + "epoch": 0.04961519518231016, + "grad_norm": 11.775406834473008, + "learning_rate": 4.961424571440083e-05, + "loss": 2.2878, + "mean_token_accuracy": 0.417241370677948, + "step": 49260 + }, + { + "epoch": 0.049620231235414335, + "grad_norm": 14.113907397336439, + "learning_rate": 4.961928167113188e-05, + "loss": 2.7747, + "mean_token_accuracy": 0.39310344159603117, + "step": 49265 + }, + { + "epoch": 0.0496252672885185, + "grad_norm": 13.659875871175261, + "learning_rate": 4.962431762786294e-05, + "loss": 2.665, + "mean_token_accuracy": 0.4000000059604645, + "step": 49270 + }, + { + "epoch": 0.049630303341622675, + "grad_norm": 14.318838856310839, + "learning_rate": 4.9629353584594e-05, + "loss": 2.8541, + "mean_token_accuracy": 0.4, + "step": 49275 + }, + { + "epoch": 0.04963533939472685, + "grad_norm": 13.762589800802216, + "learning_rate": 4.9634389541325065e-05, + "loss": 3.2975, + "mean_token_accuracy": 0.33103448450565337, + "step": 49280 + }, + { + "epoch": 0.04964037544783102, + "grad_norm": 12.148281583557175, + "learning_rate": 4.9639425498056125e-05, + "loss": 2.8777, + "mean_token_accuracy": 0.41379310488700866, + "step": 49285 + }, + { + "epoch": 0.049645411500935197, + "grad_norm": 12.593167754663654, + "learning_rate": 4.9644461454787184e-05, + "loss": 2.7955, + "mean_token_accuracy": 0.4124621868133545, + "step": 49290 + }, + { + "epoch": 0.04965044755403937, + "grad_norm": 11.807473589452503, + "learning_rate": 4.964949741151824e-05, + "loss": 2.8509, + "mean_token_accuracy": 0.42413793206214906, + "step": 49295 + }, + { + "epoch": 0.049655483607143544, + "grad_norm": 14.780928364294947, + "learning_rate": 4.96545333682493e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.43103448748588563, + "step": 49300 + }, + { + "epoch": 0.04966051966024771, + "grad_norm": 21.3860693383878, + "learning_rate": 4.965956932498036e-05, + "loss": 2.5254, + "mean_token_accuracy": 0.3758620619773865, + "step": 49305 + }, + { + "epoch": 0.049665555713351885, + "grad_norm": 15.224877263419117, + "learning_rate": 4.966460528171142e-05, + "loss": 2.3813, + "mean_token_accuracy": 0.42758620381355283, + "step": 49310 + }, + { + "epoch": 0.04967059176645606, + "grad_norm": 11.466907821737939, + "learning_rate": 4.966964123844248e-05, + "loss": 2.6847, + "mean_token_accuracy": 0.3482758641242981, + "step": 49315 + }, + { + "epoch": 0.04967562781956023, + "grad_norm": 12.635880277284619, + "learning_rate": 4.967467719517354e-05, + "loss": 2.3469, + "mean_token_accuracy": 0.3862069010734558, + "step": 49320 + }, + { + "epoch": 0.049680663872664406, + "grad_norm": 11.87581265201363, + "learning_rate": 4.96797131519046e-05, + "loss": 2.327, + "mean_token_accuracy": 0.46896551847457885, + "step": 49325 + }, + { + "epoch": 0.04968569992576858, + "grad_norm": 10.949030175235732, + "learning_rate": 4.968474910863566e-05, + "loss": 2.5311, + "mean_token_accuracy": 0.37241379618644715, + "step": 49330 + }, + { + "epoch": 0.04969073597887275, + "grad_norm": 14.735402843195, + "learning_rate": 4.9689785065366724e-05, + "loss": 2.7708, + "mean_token_accuracy": 0.37241379022598264, + "step": 49335 + }, + { + "epoch": 0.04969577203197692, + "grad_norm": 10.362049912778165, + "learning_rate": 4.969482102209778e-05, + "loss": 2.4387, + "mean_token_accuracy": 0.4103448212146759, + "step": 49340 + }, + { + "epoch": 0.049700808085081094, + "grad_norm": 11.6372504251952, + "learning_rate": 4.969985697882884e-05, + "loss": 2.3279, + "mean_token_accuracy": 0.4344827651977539, + "step": 49345 + }, + { + "epoch": 0.04970584413818527, + "grad_norm": 13.783295856114794, + "learning_rate": 4.97048929355599e-05, + "loss": 2.8837, + "mean_token_accuracy": 0.37241379022598264, + "step": 49350 + }, + { + "epoch": 0.04971088019128944, + "grad_norm": 11.834986404930202, + "learning_rate": 4.970992889229096e-05, + "loss": 2.7457, + "mean_token_accuracy": 0.3517241388559341, + "step": 49355 + }, + { + "epoch": 0.049715916244393615, + "grad_norm": 11.521477246740094, + "learning_rate": 4.971496484902202e-05, + "loss": 2.5048, + "mean_token_accuracy": 0.4172413766384125, + "step": 49360 + }, + { + "epoch": 0.04972095229749779, + "grad_norm": 11.858652590225981, + "learning_rate": 4.972000080575308e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.43448275327682495, + "step": 49365 + }, + { + "epoch": 0.04972598835060196, + "grad_norm": 11.373794773558473, + "learning_rate": 4.972503676248414e-05, + "loss": 2.8187, + "mean_token_accuracy": 0.3482758581638336, + "step": 49370 + }, + { + "epoch": 0.04973102440370613, + "grad_norm": 16.116368217565114, + "learning_rate": 4.97300727192152e-05, + "loss": 2.3998, + "mean_token_accuracy": 0.42413793206214906, + "step": 49375 + }, + { + "epoch": 0.0497360604568103, + "grad_norm": 12.799520076892202, + "learning_rate": 4.973510867594626e-05, + "loss": 2.4465, + "mean_token_accuracy": 0.4379310250282288, + "step": 49380 + }, + { + "epoch": 0.04974109650991448, + "grad_norm": 13.742453803553794, + "learning_rate": 4.9740144632677316e-05, + "loss": 2.6858, + "mean_token_accuracy": 0.38475499153137205, + "step": 49385 + }, + { + "epoch": 0.04974613256301865, + "grad_norm": 9.861523476840427, + "learning_rate": 4.974518058940838e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4275861978530884, + "step": 49390 + }, + { + "epoch": 0.049751168616122825, + "grad_norm": 14.802353146211662, + "learning_rate": 4.975021654613944e-05, + "loss": 2.8429, + "mean_token_accuracy": 0.3724137842655182, + "step": 49395 + }, + { + "epoch": 0.049756204669227, + "grad_norm": 18.86918748812848, + "learning_rate": 4.9755252502870494e-05, + "loss": 2.8367, + "mean_token_accuracy": 0.4223835527896881, + "step": 49400 + }, + { + "epoch": 0.04976124072233117, + "grad_norm": 11.6322610418858, + "learning_rate": 4.9760288459601553e-05, + "loss": 2.7996, + "mean_token_accuracy": 0.3620689570903778, + "step": 49405 + }, + { + "epoch": 0.04976627677543534, + "grad_norm": 13.821393909998825, + "learning_rate": 4.976532441633261e-05, + "loss": 3.0473, + "mean_token_accuracy": 0.38620689511299133, + "step": 49410 + }, + { + "epoch": 0.04977131282853951, + "grad_norm": 13.345087258669924, + "learning_rate": 4.977036037306368e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.39310344457626345, + "step": 49415 + }, + { + "epoch": 0.04977634888164369, + "grad_norm": 13.137243998874078, + "learning_rate": 4.977539632979474e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.41379310488700866, + "step": 49420 + }, + { + "epoch": 0.04978138493474786, + "grad_norm": 11.613483538823493, + "learning_rate": 4.97804322865258e-05, + "loss": 2.621, + "mean_token_accuracy": 0.43266788125038147, + "step": 49425 + }, + { + "epoch": 0.049786420987852034, + "grad_norm": 12.605566542025525, + "learning_rate": 4.9785468243256857e-05, + "loss": 2.6217, + "mean_token_accuracy": 0.3620689570903778, + "step": 49430 + }, + { + "epoch": 0.04979145704095621, + "grad_norm": 13.598326714699105, + "learning_rate": 4.9790504199987916e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.41379311084747317, + "step": 49435 + }, + { + "epoch": 0.04979649309406038, + "grad_norm": 14.550143317492957, + "learning_rate": 4.9795540156718975e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.39310344457626345, + "step": 49440 + }, + { + "epoch": 0.04980152914716455, + "grad_norm": 14.702991348953809, + "learning_rate": 4.9800576113450034e-05, + "loss": 2.9054, + "mean_token_accuracy": 0.36206896901130675, + "step": 49445 + }, + { + "epoch": 0.04980656520026872, + "grad_norm": 11.302201937499778, + "learning_rate": 4.9805612070181094e-05, + "loss": 2.6867, + "mean_token_accuracy": 0.38620689511299133, + "step": 49450 + }, + { + "epoch": 0.049811601253372896, + "grad_norm": 13.56973108055161, + "learning_rate": 4.981064802691215e-05, + "loss": 2.5946, + "mean_token_accuracy": 0.3517241388559341, + "step": 49455 + }, + { + "epoch": 0.04981663730647707, + "grad_norm": 12.842971988297705, + "learning_rate": 4.981568398364321e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.3809437394142151, + "step": 49460 + }, + { + "epoch": 0.049821673359581244, + "grad_norm": 11.563934427017875, + "learning_rate": 4.982071994037427e-05, + "loss": 2.8514, + "mean_token_accuracy": 0.32413792610168457, + "step": 49465 + }, + { + "epoch": 0.04982670941268542, + "grad_norm": 12.122643995871975, + "learning_rate": 4.982575589710534e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.3896551728248596, + "step": 49470 + }, + { + "epoch": 0.049831745465789584, + "grad_norm": 10.304817959472109, + "learning_rate": 4.9830791853836397e-05, + "loss": 2.7031, + "mean_token_accuracy": 0.37241379022598264, + "step": 49475 + }, + { + "epoch": 0.04983678151889376, + "grad_norm": 11.443792638318499, + "learning_rate": 4.9835827810567456e-05, + "loss": 2.369, + "mean_token_accuracy": 0.3827586233615875, + "step": 49480 + }, + { + "epoch": 0.04984181757199793, + "grad_norm": 17.651962081814688, + "learning_rate": 4.9840863767298515e-05, + "loss": 2.8535, + "mean_token_accuracy": 0.3655172407627106, + "step": 49485 + }, + { + "epoch": 0.049846853625102105, + "grad_norm": 14.28581158762603, + "learning_rate": 4.9845899724029574e-05, + "loss": 2.6115, + "mean_token_accuracy": 0.3517241358757019, + "step": 49490 + }, + { + "epoch": 0.04985188967820628, + "grad_norm": 12.245707832101791, + "learning_rate": 4.9850935680760634e-05, + "loss": 2.9765, + "mean_token_accuracy": 0.33793102502822875, + "step": 49495 + }, + { + "epoch": 0.04985692573131045, + "grad_norm": 11.338788116320691, + "learning_rate": 4.985597163749169e-05, + "loss": 2.2332, + "mean_token_accuracy": 0.4517241358757019, + "step": 49500 + }, + { + "epoch": 0.04986196178441463, + "grad_norm": 15.91943713855223, + "learning_rate": 4.986100759422275e-05, + "loss": 2.5005, + "mean_token_accuracy": 0.42758620977401735, + "step": 49505 + }, + { + "epoch": 0.049866997837518794, + "grad_norm": 15.01142749013905, + "learning_rate": 4.986604355095381e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.42413792610168455, + "step": 49510 + }, + { + "epoch": 0.04987203389062297, + "grad_norm": 23.31739947772715, + "learning_rate": 4.987107950768487e-05, + "loss": 2.7926, + "mean_token_accuracy": 0.33793103098869326, + "step": 49515 + }, + { + "epoch": 0.04987706994372714, + "grad_norm": 11.040523663202919, + "learning_rate": 4.987611546441594e-05, + "loss": 2.4018, + "mean_token_accuracy": 0.4172413766384125, + "step": 49520 + }, + { + "epoch": 0.049882105996831315, + "grad_norm": 13.294381592565193, + "learning_rate": 4.9881151421146996e-05, + "loss": 2.2171, + "mean_token_accuracy": 0.441379314661026, + "step": 49525 + }, + { + "epoch": 0.04988714204993549, + "grad_norm": 11.646579343031293, + "learning_rate": 4.9886187377878055e-05, + "loss": 2.6643, + "mean_token_accuracy": 0.39655172228813174, + "step": 49530 + }, + { + "epoch": 0.04989217810303966, + "grad_norm": 13.71315294642325, + "learning_rate": 4.989122333460911e-05, + "loss": 2.7966, + "mean_token_accuracy": 0.3517241418361664, + "step": 49535 + }, + { + "epoch": 0.049897214156143836, + "grad_norm": 17.97975577463106, + "learning_rate": 4.989625929134017e-05, + "loss": 2.7515, + "mean_token_accuracy": 0.36896551251411436, + "step": 49540 + }, + { + "epoch": 0.049902250209248, + "grad_norm": 12.268858907224732, + "learning_rate": 4.990129524807123e-05, + "loss": 2.6348, + "mean_token_accuracy": 0.3931034505367279, + "step": 49545 + }, + { + "epoch": 0.04990728626235218, + "grad_norm": 20.81019688630803, + "learning_rate": 4.990633120480229e-05, + "loss": 2.8056, + "mean_token_accuracy": 0.4103448331356049, + "step": 49550 + }, + { + "epoch": 0.04991232231545635, + "grad_norm": 13.646494384884486, + "learning_rate": 4.991136716153335e-05, + "loss": 2.5801, + "mean_token_accuracy": 0.38275861740112305, + "step": 49555 + }, + { + "epoch": 0.049917358368560524, + "grad_norm": 13.258184842553518, + "learning_rate": 4.991640311826441e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.37931033968925476, + "step": 49560 + }, + { + "epoch": 0.0499223944216647, + "grad_norm": 13.376109319528164, + "learning_rate": 4.992143907499547e-05, + "loss": 2.8028, + "mean_token_accuracy": 0.3448275923728943, + "step": 49565 + }, + { + "epoch": 0.04992743047476887, + "grad_norm": 12.32846130563162, + "learning_rate": 4.992647503172653e-05, + "loss": 2.612, + "mean_token_accuracy": 0.3482758641242981, + "step": 49570 + }, + { + "epoch": 0.049932466527873046, + "grad_norm": 11.564618024882458, + "learning_rate": 4.993151098845759e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.4034482777118683, + "step": 49575 + }, + { + "epoch": 0.04993750258097721, + "grad_norm": 14.824679698883445, + "learning_rate": 4.993654694518865e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.38275861740112305, + "step": 49580 + }, + { + "epoch": 0.049942538634081386, + "grad_norm": 15.552365673154968, + "learning_rate": 4.994158290191971e-05, + "loss": 2.876, + "mean_token_accuracy": 0.35862069129943847, + "step": 49585 + }, + { + "epoch": 0.04994757468718556, + "grad_norm": 14.199919668919463, + "learning_rate": 4.9946618858650766e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.482758617401123, + "step": 49590 + }, + { + "epoch": 0.049952610740289734, + "grad_norm": 14.47673515226275, + "learning_rate": 4.9951654815381825e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.3655172407627106, + "step": 49595 + }, + { + "epoch": 0.04995764679339391, + "grad_norm": 14.824743793669533, + "learning_rate": 4.995669077211289e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.4117362380027771, + "step": 49600 + }, + { + "epoch": 0.04996268284649808, + "grad_norm": 12.498952004436685, + "learning_rate": 4.996172672884395e-05, + "loss": 2.555, + "mean_token_accuracy": 0.4159104585647583, + "step": 49605 + }, + { + "epoch": 0.049967718899602255, + "grad_norm": 13.935762699559254, + "learning_rate": 4.996676268557501e-05, + "loss": 2.827, + "mean_token_accuracy": 0.3517241358757019, + "step": 49610 + }, + { + "epoch": 0.04997275495270642, + "grad_norm": 14.630953503098178, + "learning_rate": 4.997179864230607e-05, + "loss": 2.3519, + "mean_token_accuracy": 0.36745311319828033, + "step": 49615 + }, + { + "epoch": 0.049977791005810596, + "grad_norm": 12.230547852391641, + "learning_rate": 4.997683459903713e-05, + "loss": 2.7427, + "mean_token_accuracy": 0.39509981870651245, + "step": 49620 + }, + { + "epoch": 0.04998282705891477, + "grad_norm": 12.537907041137958, + "learning_rate": 4.998187055576819e-05, + "loss": 2.8497, + "mean_token_accuracy": 0.36206896007061007, + "step": 49625 + }, + { + "epoch": 0.04998786311201894, + "grad_norm": 11.284924131603896, + "learning_rate": 4.998690651249925e-05, + "loss": 2.4841, + "mean_token_accuracy": 0.403448274731636, + "step": 49630 + }, + { + "epoch": 0.04999289916512312, + "grad_norm": 11.76655415509917, + "learning_rate": 4.9991942469230306e-05, + "loss": 2.8672, + "mean_token_accuracy": 0.35862069129943847, + "step": 49635 + }, + { + "epoch": 0.04999793521822729, + "grad_norm": 15.605220600081669, + "learning_rate": 4.9996978425961365e-05, + "loss": 2.5876, + "mean_token_accuracy": 0.3551724076271057, + "step": 49640 + }, + { + "epoch": 0.050002971271331464, + "grad_norm": 12.006394240076212, + "learning_rate": 4.999999999950077e-05, + "loss": 2.9691, + "mean_token_accuracy": 0.36551724672317504, + "step": 49645 + }, + { + "epoch": 0.05000800732443563, + "grad_norm": 12.6271173068407, + "learning_rate": 4.999999999388436e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.4310344815254211, + "step": 49650 + }, + { + "epoch": 0.050013043377539805, + "grad_norm": 12.234909042511857, + "learning_rate": 4.999999998202748e-05, + "loss": 2.824, + "mean_token_accuracy": 0.37586206793785093, + "step": 49655 + }, + { + "epoch": 0.05001807943064398, + "grad_norm": 44.949610624950466, + "learning_rate": 4.999999996393015e-05, + "loss": 3.1453, + "mean_token_accuracy": 0.27241378724575044, + "step": 49660 + }, + { + "epoch": 0.05002311548374815, + "grad_norm": 12.025230843353924, + "learning_rate": 4.9999999939592364e-05, + "loss": 2.5444, + "mean_token_accuracy": 0.38275861740112305, + "step": 49665 + }, + { + "epoch": 0.050028151536852326, + "grad_norm": 11.381206032498792, + "learning_rate": 4.999999990901411e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.3620689630508423, + "step": 49670 + }, + { + "epoch": 0.0500331875899565, + "grad_norm": 11.788428976912376, + "learning_rate": 4.999999987219541e-05, + "loss": 2.1203, + "mean_token_accuracy": 0.47931034564971925, + "step": 49675 + }, + { + "epoch": 0.050038223643060674, + "grad_norm": 17.25882583624675, + "learning_rate": 4.999999982913624e-05, + "loss": 2.377, + "mean_token_accuracy": 0.4379310429096222, + "step": 49680 + }, + { + "epoch": 0.05004325969616484, + "grad_norm": 15.178981141132901, + "learning_rate": 4.999999977983662e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.4172413766384125, + "step": 49685 + }, + { + "epoch": 0.050048295749269014, + "grad_norm": 15.925549144673484, + "learning_rate": 4.9999999724296524e-05, + "loss": 2.5396, + "mean_token_accuracy": 0.39310344457626345, + "step": 49690 + }, + { + "epoch": 0.05005333180237319, + "grad_norm": 14.720843721851335, + "learning_rate": 4.999999966251598e-05, + "loss": 3.1586, + "mean_token_accuracy": 0.37586206793785093, + "step": 49695 + }, + { + "epoch": 0.05005836785547736, + "grad_norm": 10.552678313176491, + "learning_rate": 4.9999999594494986e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.4172413766384125, + "step": 49700 + }, + { + "epoch": 0.050063403908581536, + "grad_norm": 14.610190061508218, + "learning_rate": 4.999999952023352e-05, + "loss": 2.7993, + "mean_token_accuracy": 0.3310344874858856, + "step": 49705 + }, + { + "epoch": 0.05006843996168571, + "grad_norm": 13.328116361078306, + "learning_rate": 4.99999994397316e-05, + "loss": 2.9146, + "mean_token_accuracy": 0.3482758581638336, + "step": 49710 + }, + { + "epoch": 0.05007347601478988, + "grad_norm": 11.356790665066061, + "learning_rate": 4.9999999352989224e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.38620689511299133, + "step": 49715 + }, + { + "epoch": 0.05007851206789405, + "grad_norm": 12.899454764875982, + "learning_rate": 4.9999999260006385e-05, + "loss": 2.738, + "mean_token_accuracy": 0.42413793206214906, + "step": 49720 + }, + { + "epoch": 0.050083548120998224, + "grad_norm": 13.51586536994713, + "learning_rate": 4.999999916078308e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.39437386989593504, + "step": 49725 + }, + { + "epoch": 0.0500885841741024, + "grad_norm": 16.644151180282265, + "learning_rate": 4.999999905531934e-05, + "loss": 2.7232, + "mean_token_accuracy": 0.38620689511299133, + "step": 49730 + }, + { + "epoch": 0.05009362022720657, + "grad_norm": 13.058655976823317, + "learning_rate": 4.9999998943615124e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4172413766384125, + "step": 49735 + }, + { + "epoch": 0.050098656280310745, + "grad_norm": 11.179485730759678, + "learning_rate": 4.999999882567045e-05, + "loss": 3.2205, + "mean_token_accuracy": 0.358620685338974, + "step": 49740 + }, + { + "epoch": 0.05010369233341492, + "grad_norm": 14.60636381748364, + "learning_rate": 4.9999998701485314e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.3999999940395355, + "step": 49745 + }, + { + "epoch": 0.05010872838651909, + "grad_norm": 17.494031287034645, + "learning_rate": 4.999999857105973e-05, + "loss": 3.1808, + "mean_token_accuracy": 0.3551724165678024, + "step": 49750 + }, + { + "epoch": 0.05011376443962326, + "grad_norm": 12.266893588534552, + "learning_rate": 4.999999843439368e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.4137930989265442, + "step": 49755 + }, + { + "epoch": 0.05011880049272743, + "grad_norm": 9.465675904973079, + "learning_rate": 4.9999998291487175e-05, + "loss": 2.4497, + "mean_token_accuracy": 0.4255293428897858, + "step": 49760 + }, + { + "epoch": 0.05012383654583161, + "grad_norm": 13.092268195033029, + "learning_rate": 4.999999814234021e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.39310343861579894, + "step": 49765 + }, + { + "epoch": 0.05012887259893578, + "grad_norm": 12.990682712269443, + "learning_rate": 4.999999798695279e-05, + "loss": 2.7591, + "mean_token_accuracy": 0.37241379022598264, + "step": 49770 + }, + { + "epoch": 0.050133908652039955, + "grad_norm": 16.466545408775996, + "learning_rate": 4.999999782532491e-05, + "loss": 2.3737, + "mean_token_accuracy": 0.4255898416042328, + "step": 49775 + }, + { + "epoch": 0.05013894470514413, + "grad_norm": 11.880720595914022, + "learning_rate": 4.9999997657456566e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.4344827592372894, + "step": 49780 + }, + { + "epoch": 0.0501439807582483, + "grad_norm": 12.650095182831993, + "learning_rate": 4.9999997483347765e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.39818512201309203, + "step": 49785 + }, + { + "epoch": 0.05014901681135247, + "grad_norm": 12.43159565953127, + "learning_rate": 4.9999997302998506e-05, + "loss": 2.9674, + "mean_token_accuracy": 0.36896551251411436, + "step": 49790 + }, + { + "epoch": 0.05015405286445664, + "grad_norm": 14.959005423990027, + "learning_rate": 4.9999997116408796e-05, + "loss": 3.1992, + "mean_token_accuracy": 0.2999999910593033, + "step": 49795 + }, + { + "epoch": 0.050159088917560817, + "grad_norm": 15.552031044538174, + "learning_rate": 4.999999692357862e-05, + "loss": 2.7995, + "mean_token_accuracy": 0.38620689511299133, + "step": 49800 + }, + { + "epoch": 0.05016412497066499, + "grad_norm": 12.972217035970225, + "learning_rate": 4.9999996724507996e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.3999999940395355, + "step": 49805 + }, + { + "epoch": 0.050169161023769164, + "grad_norm": 10.596658939758084, + "learning_rate": 4.999999651919691e-05, + "loss": 2.6885, + "mean_token_accuracy": 0.44827585220336913, + "step": 49810 + }, + { + "epoch": 0.05017419707687334, + "grad_norm": 13.479090681305225, + "learning_rate": 4.999999630764536e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.3655172407627106, + "step": 49815 + }, + { + "epoch": 0.05017923312997751, + "grad_norm": 13.780359024390073, + "learning_rate": 4.9999996089853355e-05, + "loss": 2.7229, + "mean_token_accuracy": 0.3454930394887924, + "step": 49820 + }, + { + "epoch": 0.05018426918308168, + "grad_norm": 15.225368901024439, + "learning_rate": 4.999999586582089e-05, + "loss": 2.7321, + "mean_token_accuracy": 0.4068965554237366, + "step": 49825 + }, + { + "epoch": 0.05018930523618585, + "grad_norm": 11.720377494039846, + "learning_rate": 4.999999563554797e-05, + "loss": 2.636, + "mean_token_accuracy": 0.3517241418361664, + "step": 49830 + }, + { + "epoch": 0.050194341289290026, + "grad_norm": 12.61486108463667, + "learning_rate": 4.9999995399034594e-05, + "loss": 2.7784, + "mean_token_accuracy": 0.3551724135875702, + "step": 49835 + }, + { + "epoch": 0.0501993773423942, + "grad_norm": 13.289997714746, + "learning_rate": 4.999999515628075e-05, + "loss": 2.4773, + "mean_token_accuracy": 0.41034482717514037, + "step": 49840 + }, + { + "epoch": 0.05020441339549837, + "grad_norm": 13.177140455998506, + "learning_rate": 4.9999994907286466e-05, + "loss": 2.6352, + "mean_token_accuracy": 0.3986085891723633, + "step": 49845 + }, + { + "epoch": 0.05020944944860255, + "grad_norm": 12.348074467666311, + "learning_rate": 4.9999994652051715e-05, + "loss": 2.426, + "mean_token_accuracy": 0.39491832852363584, + "step": 49850 + }, + { + "epoch": 0.05021448550170672, + "grad_norm": 16.999614863859883, + "learning_rate": 4.999999439057651e-05, + "loss": 2.6804, + "mean_token_accuracy": 0.3827586233615875, + "step": 49855 + }, + { + "epoch": 0.05021952155481089, + "grad_norm": 13.252310613732684, + "learning_rate": 4.999999412286084e-05, + "loss": 2.3524, + "mean_token_accuracy": 0.4034482777118683, + "step": 49860 + }, + { + "epoch": 0.05022455760791506, + "grad_norm": 10.72375928360509, + "learning_rate": 4.999999384890472e-05, + "loss": 2.306, + "mean_token_accuracy": 0.45517240166664125, + "step": 49865 + }, + { + "epoch": 0.050229593661019235, + "grad_norm": 13.797454384393609, + "learning_rate": 4.9999993568708136e-05, + "loss": 2.6761, + "mean_token_accuracy": 0.4068965554237366, + "step": 49870 + }, + { + "epoch": 0.05023462971412341, + "grad_norm": 11.391078182708428, + "learning_rate": 4.9999993282271104e-05, + "loss": 2.4039, + "mean_token_accuracy": 0.4862069010734558, + "step": 49875 + }, + { + "epoch": 0.05023966576722758, + "grad_norm": 13.511557320896227, + "learning_rate": 4.999999298959361e-05, + "loss": 2.5194, + "mean_token_accuracy": 0.3862068891525269, + "step": 49880 + }, + { + "epoch": 0.05024470182033176, + "grad_norm": 11.959401914858365, + "learning_rate": 4.999999269067566e-05, + "loss": 2.9104, + "mean_token_accuracy": 0.41379310488700866, + "step": 49885 + }, + { + "epoch": 0.05024973787343593, + "grad_norm": 10.454554000475806, + "learning_rate": 4.999999238551725e-05, + "loss": 2.8246, + "mean_token_accuracy": 0.39491833448410035, + "step": 49890 + }, + { + "epoch": 0.0502547739265401, + "grad_norm": 11.180505264939498, + "learning_rate": 4.999999207411838e-05, + "loss": 2.7407, + "mean_token_accuracy": 0.42758620977401735, + "step": 49895 + }, + { + "epoch": 0.05025980997964427, + "grad_norm": 16.598709812537702, + "learning_rate": 4.999999175647906e-05, + "loss": 3.0353, + "mean_token_accuracy": 0.3137931048870087, + "step": 49900 + }, + { + "epoch": 0.050264846032748445, + "grad_norm": 18.39314814150104, + "learning_rate": 4.999999143259929e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.40689654350280763, + "step": 49905 + }, + { + "epoch": 0.05026988208585262, + "grad_norm": 23.375139055611236, + "learning_rate": 4.999999110247905e-05, + "loss": 2.7536, + "mean_token_accuracy": 0.3896551728248596, + "step": 49910 + }, + { + "epoch": 0.05027491813895679, + "grad_norm": 12.575790854780092, + "learning_rate": 4.999999076611835e-05, + "loss": 2.756, + "mean_token_accuracy": 0.4034482717514038, + "step": 49915 + }, + { + "epoch": 0.050279954192060966, + "grad_norm": 11.853179869949356, + "learning_rate": 4.999999042351721e-05, + "loss": 2.7287, + "mean_token_accuracy": 0.36551724672317504, + "step": 49920 + }, + { + "epoch": 0.05028499024516514, + "grad_norm": 11.314370035179659, + "learning_rate": 4.99999900746756e-05, + "loss": 2.238, + "mean_token_accuracy": 0.42413792610168455, + "step": 49925 + }, + { + "epoch": 0.05029002629826931, + "grad_norm": 11.80282057662572, + "learning_rate": 4.9999989719593546e-05, + "loss": 2.8643, + "mean_token_accuracy": 0.34482758343219755, + "step": 49930 + }, + { + "epoch": 0.05029506235137348, + "grad_norm": 12.012195216515446, + "learning_rate": 4.999998935827103e-05, + "loss": 2.7738, + "mean_token_accuracy": 0.36896551251411436, + "step": 49935 + }, + { + "epoch": 0.050300098404477654, + "grad_norm": 14.042802247376741, + "learning_rate": 4.9999988990708054e-05, + "loss": 2.174, + "mean_token_accuracy": 0.44827587008476255, + "step": 49940 + }, + { + "epoch": 0.05030513445758183, + "grad_norm": 11.633376202270114, + "learning_rate": 4.999998861690462e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.39655172228813174, + "step": 49945 + }, + { + "epoch": 0.050310170510686, + "grad_norm": 13.548543139366798, + "learning_rate": 4.999998823686074e-05, + "loss": 2.7846, + "mean_token_accuracy": 0.3000000029802322, + "step": 49950 + }, + { + "epoch": 0.050315206563790175, + "grad_norm": 16.761012173907588, + "learning_rate": 4.9999987850576405e-05, + "loss": 2.9079, + "mean_token_accuracy": 0.41034482717514037, + "step": 49955 + }, + { + "epoch": 0.05032024261689435, + "grad_norm": 10.752007701541265, + "learning_rate": 4.99999874580516e-05, + "loss": 2.3022, + "mean_token_accuracy": 0.4551724076271057, + "step": 49960 + }, + { + "epoch": 0.050325278669998516, + "grad_norm": 13.272690428884117, + "learning_rate": 4.999998705928635e-05, + "loss": 3.0675, + "mean_token_accuracy": 0.40344828367233276, + "step": 49965 + }, + { + "epoch": 0.05033031472310269, + "grad_norm": 11.456320473853069, + "learning_rate": 4.999998665428065e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.39655172228813174, + "step": 49970 + }, + { + "epoch": 0.050335350776206864, + "grad_norm": 13.473766954694915, + "learning_rate": 4.9999986243034485e-05, + "loss": 2.5452, + "mean_token_accuracy": 0.37241379022598264, + "step": 49975 + }, + { + "epoch": 0.05034038682931104, + "grad_norm": 26.80994790393054, + "learning_rate": 4.999998582554787e-05, + "loss": 2.9404, + "mean_token_accuracy": 0.3931034505367279, + "step": 49980 + }, + { + "epoch": 0.05034542288241521, + "grad_norm": 11.080244502563596, + "learning_rate": 4.99999854018208e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.42758620977401735, + "step": 49985 + }, + { + "epoch": 0.050350458935519385, + "grad_norm": 17.367509809474647, + "learning_rate": 4.999998497185327e-05, + "loss": 2.673, + "mean_token_accuracy": 0.38965516686439516, + "step": 49990 + }, + { + "epoch": 0.05035549498862356, + "grad_norm": 11.036298754864657, + "learning_rate": 4.99999845356453e-05, + "loss": 2.3717, + "mean_token_accuracy": 0.4712038695812225, + "step": 49995 + }, + { + "epoch": 0.050360531041727725, + "grad_norm": 15.158522527044616, + "learning_rate": 4.999998409319686e-05, + "loss": 2.6372, + "mean_token_accuracy": 0.3827586233615875, + "step": 50000 + }, + { + "epoch": 0.0503655670948319, + "grad_norm": 15.593171681203087, + "learning_rate": 4.999998364450796e-05, + "loss": 2.6878, + "mean_token_accuracy": 0.38620689511299133, + "step": 50005 + }, + { + "epoch": 0.05037060314793607, + "grad_norm": 13.863212335459519, + "learning_rate": 4.999998318957862e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.42413792610168455, + "step": 50010 + }, + { + "epoch": 0.05037563920104025, + "grad_norm": 13.006626728251153, + "learning_rate": 4.999998272840882e-05, + "loss": 2.5361, + "mean_token_accuracy": 0.3793103337287903, + "step": 50015 + }, + { + "epoch": 0.05038067525414442, + "grad_norm": 18.675955816483324, + "learning_rate": 4.9999982260998566e-05, + "loss": 2.6408, + "mean_token_accuracy": 0.41724138259887694, + "step": 50020 + }, + { + "epoch": 0.050385711307248594, + "grad_norm": 12.305510197988752, + "learning_rate": 4.9999981787347865e-05, + "loss": 2.3927, + "mean_token_accuracy": 0.4000000059604645, + "step": 50025 + }, + { + "epoch": 0.05039074736035277, + "grad_norm": 15.104214893830951, + "learning_rate": 4.99999813074567e-05, + "loss": 2.6876, + "mean_token_accuracy": 0.36551723480224607, + "step": 50030 + }, + { + "epoch": 0.050395783413456935, + "grad_norm": 12.659031341136748, + "learning_rate": 4.9999980821325085e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.40689654350280763, + "step": 50035 + }, + { + "epoch": 0.05040081946656111, + "grad_norm": 13.468400778591251, + "learning_rate": 4.9999980328953025e-05, + "loss": 2.8903, + "mean_token_accuracy": 0.3517241358757019, + "step": 50040 + }, + { + "epoch": 0.05040585551966528, + "grad_norm": 10.051057832774758, + "learning_rate": 4.99999798303405e-05, + "loss": 3.0942, + "mean_token_accuracy": 0.36206896901130675, + "step": 50045 + }, + { + "epoch": 0.050410891572769456, + "grad_norm": 11.479981276974836, + "learning_rate": 4.999997932548752e-05, + "loss": 2.5297, + "mean_token_accuracy": 0.3827586203813553, + "step": 50050 + }, + { + "epoch": 0.05041592762587363, + "grad_norm": 12.604137009557448, + "learning_rate": 4.999997881439409e-05, + "loss": 2.7703, + "mean_token_accuracy": 0.3482758581638336, + "step": 50055 + }, + { + "epoch": 0.050420963678977804, + "grad_norm": 12.086906228102732, + "learning_rate": 4.999997829706021e-05, + "loss": 2.6068, + "mean_token_accuracy": 0.40344828367233276, + "step": 50060 + }, + { + "epoch": 0.05042599973208198, + "grad_norm": 10.817874276039062, + "learning_rate": 4.9999977773485875e-05, + "loss": 2.0977, + "mean_token_accuracy": 0.4413793206214905, + "step": 50065 + }, + { + "epoch": 0.050431035785186144, + "grad_norm": 11.903700462637147, + "learning_rate": 4.999997724367109e-05, + "loss": 2.5412, + "mean_token_accuracy": 0.39655172228813174, + "step": 50070 + }, + { + "epoch": 0.05043607183829032, + "grad_norm": 11.62487044538741, + "learning_rate": 4.9999976707615854e-05, + "loss": 2.2405, + "mean_token_accuracy": 0.4344827592372894, + "step": 50075 + }, + { + "epoch": 0.05044110789139449, + "grad_norm": 12.218379749564965, + "learning_rate": 4.9999976165320154e-05, + "loss": 2.7374, + "mean_token_accuracy": 0.35862068831920624, + "step": 50080 + }, + { + "epoch": 0.050446143944498666, + "grad_norm": 11.55715370364926, + "learning_rate": 4.9999975616784016e-05, + "loss": 2.9408, + "mean_token_accuracy": 0.3827586233615875, + "step": 50085 + }, + { + "epoch": 0.05045117999760284, + "grad_norm": 13.13891490985889, + "learning_rate": 4.9999975062007413e-05, + "loss": 2.6482, + "mean_token_accuracy": 0.37241379618644715, + "step": 50090 + }, + { + "epoch": 0.05045621605070701, + "grad_norm": 11.43510281822606, + "learning_rate": 4.9999974500990374e-05, + "loss": 2.4089, + "mean_token_accuracy": 0.4344827651977539, + "step": 50095 + }, + { + "epoch": 0.05046125210381119, + "grad_norm": 13.43929597670826, + "learning_rate": 4.999997393373287e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.32413792610168457, + "step": 50100 + }, + { + "epoch": 0.050466288156915354, + "grad_norm": 14.588231280641814, + "learning_rate": 4.9999973360234915e-05, + "loss": 3.2441, + "mean_token_accuracy": 0.40859044194221494, + "step": 50105 + }, + { + "epoch": 0.05047132421001953, + "grad_norm": 10.787683774273884, + "learning_rate": 4.999997278049651e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.4120387136936188, + "step": 50110 + }, + { + "epoch": 0.0504763602631237, + "grad_norm": 14.236627571128553, + "learning_rate": 4.999997219451766e-05, + "loss": 2.4728, + "mean_token_accuracy": 0.41034482717514037, + "step": 50115 + }, + { + "epoch": 0.050481396316227875, + "grad_norm": 13.962810688334864, + "learning_rate": 4.999997160229835e-05, + "loss": 2.3186, + "mean_token_accuracy": 0.4137930989265442, + "step": 50120 + }, + { + "epoch": 0.05048643236933205, + "grad_norm": 14.015178940218096, + "learning_rate": 4.9999971003838594e-05, + "loss": 2.7623, + "mean_token_accuracy": 0.38620689511299133, + "step": 50125 + }, + { + "epoch": 0.05049146842243622, + "grad_norm": 17.39032792569419, + "learning_rate": 4.999997039913838e-05, + "loss": 2.2941, + "mean_token_accuracy": 0.47980296015739443, + "step": 50130 + }, + { + "epoch": 0.050496504475540396, + "grad_norm": 12.643248063820355, + "learning_rate": 4.999996978819772e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.4241379380226135, + "step": 50135 + }, + { + "epoch": 0.05050154052864456, + "grad_norm": 12.157281732548233, + "learning_rate": 4.9999969171016614e-05, + "loss": 2.609, + "mean_token_accuracy": 0.4034482717514038, + "step": 50140 + }, + { + "epoch": 0.05050657658174874, + "grad_norm": 14.951114291603682, + "learning_rate": 4.999996854759505e-05, + "loss": 2.7245, + "mean_token_accuracy": 0.341379314661026, + "step": 50145 + }, + { + "epoch": 0.05051161263485291, + "grad_norm": 39.30511979152037, + "learning_rate": 4.999996791793304e-05, + "loss": 3.2416, + "mean_token_accuracy": 0.3793103456497192, + "step": 50150 + }, + { + "epoch": 0.050516648687957084, + "grad_norm": 13.275875653103892, + "learning_rate": 4.999996728203058e-05, + "loss": 2.9485, + "mean_token_accuracy": 0.37586206793785093, + "step": 50155 + }, + { + "epoch": 0.05052168474106126, + "grad_norm": 11.42794887378641, + "learning_rate": 4.999996663988767e-05, + "loss": 2.569, + "mean_token_accuracy": 0.4436176598072052, + "step": 50160 + }, + { + "epoch": 0.05052672079416543, + "grad_norm": 12.386741195391972, + "learning_rate": 4.999996599150431e-05, + "loss": 2.4167, + "mean_token_accuracy": 0.4084089457988739, + "step": 50165 + }, + { + "epoch": 0.050531756847269606, + "grad_norm": 11.521676489888533, + "learning_rate": 4.99999653368805e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.3482758581638336, + "step": 50170 + }, + { + "epoch": 0.05053679290037377, + "grad_norm": 11.16330970164469, + "learning_rate": 4.999996467601624e-05, + "loss": 2.5253, + "mean_token_accuracy": 0.4103448212146759, + "step": 50175 + }, + { + "epoch": 0.050541828953477946, + "grad_norm": 12.300427211862397, + "learning_rate": 4.999996400891153e-05, + "loss": 2.933, + "mean_token_accuracy": 0.33103448152542114, + "step": 50180 + }, + { + "epoch": 0.05054686500658212, + "grad_norm": 12.376512949866452, + "learning_rate": 4.999996333556638e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.37241379022598264, + "step": 50185 + }, + { + "epoch": 0.050551901059686294, + "grad_norm": 12.380421031577722, + "learning_rate": 4.9999962655980775e-05, + "loss": 2.6882, + "mean_token_accuracy": 0.3827586233615875, + "step": 50190 + }, + { + "epoch": 0.05055693711279047, + "grad_norm": 16.5603308545267, + "learning_rate": 4.999996197015472e-05, + "loss": 2.6222, + "mean_token_accuracy": 0.3724137872457504, + "step": 50195 + }, + { + "epoch": 0.05056197316589464, + "grad_norm": 12.56249445255557, + "learning_rate": 4.999996127808822e-05, + "loss": 2.5685, + "mean_token_accuracy": 0.4103448212146759, + "step": 50200 + }, + { + "epoch": 0.050567009218998815, + "grad_norm": 13.676499462900354, + "learning_rate": 4.9999960579781265e-05, + "loss": 2.4247, + "mean_token_accuracy": 0.3931034475564957, + "step": 50205 + }, + { + "epoch": 0.05057204527210298, + "grad_norm": 14.396825411179483, + "learning_rate": 4.999995987523386e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.42413793206214906, + "step": 50210 + }, + { + "epoch": 0.050577081325207156, + "grad_norm": 22.056202030539666, + "learning_rate": 4.999995916444602e-05, + "loss": 3.0558, + "mean_token_accuracy": 0.3758620619773865, + "step": 50215 + }, + { + "epoch": 0.05058211737831133, + "grad_norm": 15.905427465072279, + "learning_rate": 4.9999958447417725e-05, + "loss": 2.3306, + "mean_token_accuracy": 0.4413793087005615, + "step": 50220 + }, + { + "epoch": 0.0505871534314155, + "grad_norm": 13.437858679416255, + "learning_rate": 4.999995772414898e-05, + "loss": 2.372, + "mean_token_accuracy": 0.4241379380226135, + "step": 50225 + }, + { + "epoch": 0.05059218948451968, + "grad_norm": 12.349046366925462, + "learning_rate": 4.9999956994639804e-05, + "loss": 2.3315, + "mean_token_accuracy": 0.4257108271121979, + "step": 50230 + }, + { + "epoch": 0.05059722553762385, + "grad_norm": 11.127226646507648, + "learning_rate": 4.9999956258890154e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.4310344785451889, + "step": 50235 + }, + { + "epoch": 0.050602261590728025, + "grad_norm": 12.877153680635537, + "learning_rate": 4.999995551690008e-05, + "loss": 2.6582, + "mean_token_accuracy": 0.42758620977401735, + "step": 50240 + }, + { + "epoch": 0.05060729764383219, + "grad_norm": 12.869601768946875, + "learning_rate": 4.999995476866955e-05, + "loss": 2.3543, + "mean_token_accuracy": 0.4620689630508423, + "step": 50245 + }, + { + "epoch": 0.050612333696936365, + "grad_norm": 13.482445336901161, + "learning_rate": 4.999995401419857e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.4, + "step": 50250 + }, + { + "epoch": 0.05061736975004054, + "grad_norm": 15.084508148166748, + "learning_rate": 4.999995325348715e-05, + "loss": 2.5601, + "mean_token_accuracy": 0.4034482777118683, + "step": 50255 + }, + { + "epoch": 0.05062240580314471, + "grad_norm": 13.386760851950458, + "learning_rate": 4.9999952486535275e-05, + "loss": 2.7355, + "mean_token_accuracy": 0.35517241060733795, + "step": 50260 + }, + { + "epoch": 0.050627441856248886, + "grad_norm": 14.02379392633141, + "learning_rate": 4.999995171334297e-05, + "loss": 2.2317, + "mean_token_accuracy": 0.42413793206214906, + "step": 50265 + }, + { + "epoch": 0.05063247790935306, + "grad_norm": 13.244677349032951, + "learning_rate": 4.999995093391021e-05, + "loss": 2.9401, + "mean_token_accuracy": 0.3551724195480347, + "step": 50270 + }, + { + "epoch": 0.050637513962457234, + "grad_norm": 12.373428618036398, + "learning_rate": 4.9999950148237006e-05, + "loss": 2.9427, + "mean_token_accuracy": 0.3620689630508423, + "step": 50275 + }, + { + "epoch": 0.0506425500155614, + "grad_norm": 13.8730668054857, + "learning_rate": 4.999994935632336e-05, + "loss": 2.679, + "mean_token_accuracy": 0.3724137842655182, + "step": 50280 + }, + { + "epoch": 0.050647586068665575, + "grad_norm": 13.272347289290915, + "learning_rate": 4.999994855816926e-05, + "loss": 2.952, + "mean_token_accuracy": 0.36551723480224607, + "step": 50285 + }, + { + "epoch": 0.05065262212176975, + "grad_norm": 12.233073409564609, + "learning_rate": 4.9999947753774726e-05, + "loss": 3.2847, + "mean_token_accuracy": 0.32413792312145234, + "step": 50290 + }, + { + "epoch": 0.05065765817487392, + "grad_norm": 15.652301657370083, + "learning_rate": 4.999994694313973e-05, + "loss": 2.4582, + "mean_token_accuracy": 0.4137930929660797, + "step": 50295 + }, + { + "epoch": 0.050662694227978096, + "grad_norm": 12.639222699245709, + "learning_rate": 4.999994612626431e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.38965516090393065, + "step": 50300 + }, + { + "epoch": 0.05066773028108227, + "grad_norm": 14.433210586636221, + "learning_rate": 4.9999945303148435e-05, + "loss": 2.8484, + "mean_token_accuracy": 0.3448275774717331, + "step": 50305 + }, + { + "epoch": 0.05067276633418644, + "grad_norm": 15.76482090872602, + "learning_rate": 4.9999944473792124e-05, + "loss": 2.3164, + "mean_token_accuracy": 0.4034482717514038, + "step": 50310 + }, + { + "epoch": 0.05067780238729061, + "grad_norm": 18.40720269668647, + "learning_rate": 4.999994363819536e-05, + "loss": 3.0817, + "mean_token_accuracy": 0.33793103098869326, + "step": 50315 + }, + { + "epoch": 0.050682838440394784, + "grad_norm": 10.704388766496008, + "learning_rate": 4.999994279635816e-05, + "loss": 2.097, + "mean_token_accuracy": 0.45517241954803467, + "step": 50320 + }, + { + "epoch": 0.05068787449349896, + "grad_norm": 15.269389253078632, + "learning_rate": 4.999994194828052e-05, + "loss": 2.6285, + "mean_token_accuracy": 0.41724138259887694, + "step": 50325 + }, + { + "epoch": 0.05069291054660313, + "grad_norm": 13.149014843343057, + "learning_rate": 4.999994109396243e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.3793103456497192, + "step": 50330 + }, + { + "epoch": 0.050697946599707305, + "grad_norm": 18.00027476552083, + "learning_rate": 4.9999940233403904e-05, + "loss": 2.7242, + "mean_token_accuracy": 0.39655172526836396, + "step": 50335 + }, + { + "epoch": 0.05070298265281148, + "grad_norm": 14.417635393764959, + "learning_rate": 4.999993936660493e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.43793103098869324, + "step": 50340 + }, + { + "epoch": 0.05070801870591565, + "grad_norm": 14.529208214345644, + "learning_rate": 4.999993849356551e-05, + "loss": 2.6387, + "mean_token_accuracy": 0.39655172228813174, + "step": 50345 + }, + { + "epoch": 0.05071305475901982, + "grad_norm": 12.498917304320713, + "learning_rate": 4.999993761428565e-05, + "loss": 2.8343, + "mean_token_accuracy": 0.39655172228813174, + "step": 50350 + }, + { + "epoch": 0.05071809081212399, + "grad_norm": 12.215905953034385, + "learning_rate": 4.999993672876535e-05, + "loss": 2.3883, + "mean_token_accuracy": 0.4482758641242981, + "step": 50355 + }, + { + "epoch": 0.05072312686522817, + "grad_norm": 13.195156917622159, + "learning_rate": 4.999993583700461e-05, + "loss": 2.824, + "mean_token_accuracy": 0.39655172228813174, + "step": 50360 + }, + { + "epoch": 0.05072816291833234, + "grad_norm": 14.403178828329962, + "learning_rate": 4.9999934939003434e-05, + "loss": 3.4793, + "mean_token_accuracy": 0.29310344755649564, + "step": 50365 + }, + { + "epoch": 0.050733198971436515, + "grad_norm": 12.027852798397008, + "learning_rate": 4.999993403476181e-05, + "loss": 2.9235, + "mean_token_accuracy": 0.35862068831920624, + "step": 50370 + }, + { + "epoch": 0.05073823502454069, + "grad_norm": 15.27328788978254, + "learning_rate": 4.999993312427975e-05, + "loss": 2.907, + "mean_token_accuracy": 0.37586207389831544, + "step": 50375 + }, + { + "epoch": 0.05074327107764486, + "grad_norm": 12.99541592749973, + "learning_rate": 4.999993220755724e-05, + "loss": 2.874, + "mean_token_accuracy": 0.3448275804519653, + "step": 50380 + }, + { + "epoch": 0.05074830713074903, + "grad_norm": 13.21460625924436, + "learning_rate": 4.9999931284594295e-05, + "loss": 2.8509, + "mean_token_accuracy": 0.3655172437429428, + "step": 50385 + }, + { + "epoch": 0.0507533431838532, + "grad_norm": 13.447178401623969, + "learning_rate": 4.999993035539092e-05, + "loss": 2.5361, + "mean_token_accuracy": 0.3931034505367279, + "step": 50390 + }, + { + "epoch": 0.05075837923695738, + "grad_norm": 11.676902120723726, + "learning_rate": 4.999992941994709e-05, + "loss": 2.4866, + "mean_token_accuracy": 0.3827586233615875, + "step": 50395 + }, + { + "epoch": 0.05076341529006155, + "grad_norm": 10.326934684992437, + "learning_rate": 4.9999928478262834e-05, + "loss": 2.3729, + "mean_token_accuracy": 0.44482758045196535, + "step": 50400 + }, + { + "epoch": 0.050768451343165724, + "grad_norm": 11.223476499367806, + "learning_rate": 4.999992753033813e-05, + "loss": 2.6259, + "mean_token_accuracy": 0.42068964838981626, + "step": 50405 + }, + { + "epoch": 0.0507734873962699, + "grad_norm": 11.533671842982669, + "learning_rate": 4.9999926576172986e-05, + "loss": 3.0445, + "mean_token_accuracy": 0.38275861740112305, + "step": 50410 + }, + { + "epoch": 0.05077852344937407, + "grad_norm": 11.465328071015533, + "learning_rate": 4.999992561576741e-05, + "loss": 2.469, + "mean_token_accuracy": 0.40344828367233276, + "step": 50415 + }, + { + "epoch": 0.05078355950247824, + "grad_norm": 11.463208455361164, + "learning_rate": 4.9999924649121396e-05, + "loss": 2.765, + "mean_token_accuracy": 0.3655172407627106, + "step": 50420 + }, + { + "epoch": 0.05078859555558241, + "grad_norm": 11.151187312782465, + "learning_rate": 4.999992367623494e-05, + "loss": 2.5221, + "mean_token_accuracy": 0.35862068831920624, + "step": 50425 + }, + { + "epoch": 0.050793631608686586, + "grad_norm": 13.622775208407601, + "learning_rate": 4.9999922697108043e-05, + "loss": 2.6598, + "mean_token_accuracy": 0.37241379022598264, + "step": 50430 + }, + { + "epoch": 0.05079866766179076, + "grad_norm": 13.678948168777696, + "learning_rate": 4.999992171174072e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.39310345649719236, + "step": 50435 + }, + { + "epoch": 0.050803703714894934, + "grad_norm": 11.892291183988693, + "learning_rate": 4.9999920720132955e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.41034482717514037, + "step": 50440 + }, + { + "epoch": 0.05080873976799911, + "grad_norm": 13.233194899553208, + "learning_rate": 4.999991972228474e-05, + "loss": 2.7149, + "mean_token_accuracy": 0.358620685338974, + "step": 50445 + }, + { + "epoch": 0.05081377582110328, + "grad_norm": 13.85509292141273, + "learning_rate": 4.99999187181961e-05, + "loss": 2.7106, + "mean_token_accuracy": 0.3551724076271057, + "step": 50450 + }, + { + "epoch": 0.05081881187420745, + "grad_norm": 10.301359830994032, + "learning_rate": 4.999991770786702e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.4241379201412201, + "step": 50455 + }, + { + "epoch": 0.05082384792731162, + "grad_norm": 15.288932018542642, + "learning_rate": 4.999991669129751e-05, + "loss": 2.7898, + "mean_token_accuracy": 0.3395644336938858, + "step": 50460 + }, + { + "epoch": 0.050828883980415795, + "grad_norm": 10.501455040792836, + "learning_rate": 4.999991566848756e-05, + "loss": 2.0661, + "mean_token_accuracy": 0.4724137902259827, + "step": 50465 + }, + { + "epoch": 0.05083392003351997, + "grad_norm": 11.567431115765524, + "learning_rate": 4.999991463943717e-05, + "loss": 2.8463, + "mean_token_accuracy": 0.3517241418361664, + "step": 50470 + }, + { + "epoch": 0.05083895608662414, + "grad_norm": 18.940026618890897, + "learning_rate": 4.9999913604146356e-05, + "loss": 2.9963, + "mean_token_accuracy": 0.3551724195480347, + "step": 50475 + }, + { + "epoch": 0.05084399213972832, + "grad_norm": 13.871105979677806, + "learning_rate": 4.99999125626151e-05, + "loss": 2.7956, + "mean_token_accuracy": 0.37586206793785093, + "step": 50480 + }, + { + "epoch": 0.05084902819283249, + "grad_norm": 11.969604195578384, + "learning_rate": 4.999991151484341e-05, + "loss": 2.2208, + "mean_token_accuracy": 0.4413793087005615, + "step": 50485 + }, + { + "epoch": 0.05085406424593666, + "grad_norm": 13.7483171347264, + "learning_rate": 4.9999910460831286e-05, + "loss": 2.7553, + "mean_token_accuracy": 0.38620689809322356, + "step": 50490 + }, + { + "epoch": 0.05085910029904083, + "grad_norm": 13.374470871774966, + "learning_rate": 4.999990940057873e-05, + "loss": 2.8883, + "mean_token_accuracy": 0.38620689511299133, + "step": 50495 + }, + { + "epoch": 0.050864136352145005, + "grad_norm": 12.051370925671462, + "learning_rate": 4.999990833408573e-05, + "loss": 2.9269, + "mean_token_accuracy": 0.33448275923728943, + "step": 50500 + }, + { + "epoch": 0.05086917240524918, + "grad_norm": 11.177393694866062, + "learning_rate": 4.9999907261352315e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.44482758045196535, + "step": 50505 + }, + { + "epoch": 0.05087420845835335, + "grad_norm": 11.83979212110621, + "learning_rate": 4.999990618237845e-05, + "loss": 2.7094, + "mean_token_accuracy": 0.35517241060733795, + "step": 50510 + }, + { + "epoch": 0.050879244511457526, + "grad_norm": 13.536797330310398, + "learning_rate": 4.9999905097164155e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.3482758581638336, + "step": 50515 + }, + { + "epoch": 0.0508842805645617, + "grad_norm": 13.854247271581649, + "learning_rate": 4.9999904005709435e-05, + "loss": 2.3745, + "mean_token_accuracy": 0.459482753276825, + "step": 50520 + }, + { + "epoch": 0.05088931661766587, + "grad_norm": 14.442259604575176, + "learning_rate": 4.999990290801427e-05, + "loss": 2.9725, + "mean_token_accuracy": 0.3840895354747772, + "step": 50525 + }, + { + "epoch": 0.05089435267077004, + "grad_norm": 14.164961502975885, + "learning_rate": 4.9999901804078685e-05, + "loss": 2.4713, + "mean_token_accuracy": 0.43103448748588563, + "step": 50530 + }, + { + "epoch": 0.050899388723874214, + "grad_norm": 10.612880208792598, + "learning_rate": 4.999990069390267e-05, + "loss": 3.0189, + "mean_token_accuracy": 0.31034482419490816, + "step": 50535 + }, + { + "epoch": 0.05090442477697839, + "grad_norm": 13.397037540691723, + "learning_rate": 4.999989957748622e-05, + "loss": 2.4633, + "mean_token_accuracy": 0.43793103098869324, + "step": 50540 + }, + { + "epoch": 0.05090946083008256, + "grad_norm": 11.751872463325505, + "learning_rate": 4.9999898454829334e-05, + "loss": 2.5125, + "mean_token_accuracy": 0.3862068891525269, + "step": 50545 + }, + { + "epoch": 0.050914496883186736, + "grad_norm": 12.742807249912271, + "learning_rate": 4.999989732593202e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.36896551847457887, + "step": 50550 + }, + { + "epoch": 0.05091953293629091, + "grad_norm": 14.85416411347173, + "learning_rate": 4.999989619079427e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.3862069010734558, + "step": 50555 + }, + { + "epoch": 0.050924568989395076, + "grad_norm": 11.729884494350376, + "learning_rate": 4.99998950494161e-05, + "loss": 2.746, + "mean_token_accuracy": 0.36896551251411436, + "step": 50560 + }, + { + "epoch": 0.05092960504249925, + "grad_norm": 12.456041971939259, + "learning_rate": 4.999989390179749e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.3931034505367279, + "step": 50565 + }, + { + "epoch": 0.050934641095603424, + "grad_norm": 13.049428129856947, + "learning_rate": 4.999989274793846e-05, + "loss": 2.4744, + "mean_token_accuracy": 0.4068965554237366, + "step": 50570 + }, + { + "epoch": 0.0509396771487076, + "grad_norm": 15.172187273750252, + "learning_rate": 4.999989158783899e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.42068966031074523, + "step": 50575 + }, + { + "epoch": 0.05094471320181177, + "grad_norm": 15.284546724590435, + "learning_rate": 4.99998904214991e-05, + "loss": 2.7756, + "mean_token_accuracy": 0.38620689511299133, + "step": 50580 + }, + { + "epoch": 0.050949749254915945, + "grad_norm": 11.171683100945389, + "learning_rate": 4.9999889248918784e-05, + "loss": 2.973, + "mean_token_accuracy": 0.3241379350423813, + "step": 50585 + }, + { + "epoch": 0.05095478530802012, + "grad_norm": 10.318888011024718, + "learning_rate": 4.999988807009803e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.5034482836723327, + "step": 50590 + }, + { + "epoch": 0.050959821361124286, + "grad_norm": 16.495888368016665, + "learning_rate": 4.999988688503685e-05, + "loss": 2.938, + "mean_token_accuracy": 0.36551724672317504, + "step": 50595 + }, + { + "epoch": 0.05096485741422846, + "grad_norm": 12.056136201116475, + "learning_rate": 4.999988569373526e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.36896551847457887, + "step": 50600 + }, + { + "epoch": 0.05096989346733263, + "grad_norm": 11.482882692953941, + "learning_rate": 4.999988449619322e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.41034482717514037, + "step": 50605 + }, + { + "epoch": 0.05097492952043681, + "grad_norm": 12.372321106372752, + "learning_rate": 4.999988329241077e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.44694494605064394, + "step": 50610 + }, + { + "epoch": 0.05097996557354098, + "grad_norm": 12.378444336336448, + "learning_rate": 4.999988208238788e-05, + "loss": 2.8065, + "mean_token_accuracy": 0.3724137842655182, + "step": 50615 + }, + { + "epoch": 0.050985001626645154, + "grad_norm": 12.907399789494304, + "learning_rate": 4.999988086612456e-05, + "loss": 3.0349, + "mean_token_accuracy": 0.35862069129943847, + "step": 50620 + }, + { + "epoch": 0.05099003767974933, + "grad_norm": 12.695328688508344, + "learning_rate": 4.9999879643620836e-05, + "loss": 2.9212, + "mean_token_accuracy": 0.3655172407627106, + "step": 50625 + }, + { + "epoch": 0.050995073732853495, + "grad_norm": 11.378454729698843, + "learning_rate": 4.999987841487667e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.4156079888343811, + "step": 50630 + }, + { + "epoch": 0.05100010978595767, + "grad_norm": 14.206251095541488, + "learning_rate": 4.9999877179892084e-05, + "loss": 2.9049, + "mean_token_accuracy": 0.39310344457626345, + "step": 50635 + }, + { + "epoch": 0.05100514583906184, + "grad_norm": 16.434382829001443, + "learning_rate": 4.9999875938667066e-05, + "loss": 2.7233, + "mean_token_accuracy": 0.3793103456497192, + "step": 50640 + }, + { + "epoch": 0.051010181892166016, + "grad_norm": 12.849534715047369, + "learning_rate": 4.999987469120164e-05, + "loss": 2.9175, + "mean_token_accuracy": 0.3827586233615875, + "step": 50645 + }, + { + "epoch": 0.05101521794527019, + "grad_norm": 16.786675505839124, + "learning_rate": 4.999987343749578e-05, + "loss": 2.4359, + "mean_token_accuracy": 0.39310344159603117, + "step": 50650 + }, + { + "epoch": 0.051020253998374364, + "grad_norm": 12.34115087509322, + "learning_rate": 4.999987217754949e-05, + "loss": 2.7907, + "mean_token_accuracy": 0.37586206793785093, + "step": 50655 + }, + { + "epoch": 0.05102529005147854, + "grad_norm": 36.61510853950055, + "learning_rate": 4.9999870911362775e-05, + "loss": 3.0481, + "mean_token_accuracy": 0.3620689630508423, + "step": 50660 + }, + { + "epoch": 0.051030326104582704, + "grad_norm": 14.500048960266122, + "learning_rate": 4.9999869638935645e-05, + "loss": 2.6012, + "mean_token_accuracy": 0.4053236484527588, + "step": 50665 + }, + { + "epoch": 0.05103536215768688, + "grad_norm": 15.641473581959717, + "learning_rate": 4.999986836026809e-05, + "loss": 2.8094, + "mean_token_accuracy": 0.4, + "step": 50670 + }, + { + "epoch": 0.05104039821079105, + "grad_norm": 13.372895943097312, + "learning_rate": 4.999986707536012e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.4000000059604645, + "step": 50675 + }, + { + "epoch": 0.051045434263895226, + "grad_norm": 11.158541350361103, + "learning_rate": 4.999986578421172e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.44827585816383364, + "step": 50680 + }, + { + "epoch": 0.0510504703169994, + "grad_norm": 10.452277524536793, + "learning_rate": 4.999986448682289e-05, + "loss": 2.286, + "mean_token_accuracy": 0.4103448331356049, + "step": 50685 + }, + { + "epoch": 0.05105550637010357, + "grad_norm": 12.707008852328286, + "learning_rate": 4.999986318319365e-05, + "loss": 2.8223, + "mean_token_accuracy": 0.3931034505367279, + "step": 50690 + }, + { + "epoch": 0.05106054242320775, + "grad_norm": 12.874717018340522, + "learning_rate": 4.999986187332399e-05, + "loss": 2.4357, + "mean_token_accuracy": 0.44827587008476255, + "step": 50695 + }, + { + "epoch": 0.051065578476311914, + "grad_norm": 11.240808453968851, + "learning_rate": 4.999986055721391e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.45172412395477296, + "step": 50700 + }, + { + "epoch": 0.05107061452941609, + "grad_norm": 14.117069554688578, + "learning_rate": 4.999985923486341e-05, + "loss": 2.6966, + "mean_token_accuracy": 0.43448275327682495, + "step": 50705 + }, + { + "epoch": 0.05107565058252026, + "grad_norm": 10.643264149205516, + "learning_rate": 4.999985790627248e-05, + "loss": 2.3975, + "mean_token_accuracy": 0.4396249294281006, + "step": 50710 + }, + { + "epoch": 0.051080686635624435, + "grad_norm": 12.059514891084545, + "learning_rate": 4.999985657144114e-05, + "loss": 2.204, + "mean_token_accuracy": 0.47241379618644713, + "step": 50715 + }, + { + "epoch": 0.05108572268872861, + "grad_norm": 13.301422299121736, + "learning_rate": 4.999985523036937e-05, + "loss": 2.8314, + "mean_token_accuracy": 0.38275861740112305, + "step": 50720 + }, + { + "epoch": 0.05109075874183278, + "grad_norm": 13.176480343357916, + "learning_rate": 4.999985388305719e-05, + "loss": 2.3718, + "mean_token_accuracy": 0.4172413766384125, + "step": 50725 + }, + { + "epoch": 0.051095794794936956, + "grad_norm": 13.078364829447723, + "learning_rate": 4.999985252950459e-05, + "loss": 2.8303, + "mean_token_accuracy": 0.3793103456497192, + "step": 50730 + }, + { + "epoch": 0.05110083084804112, + "grad_norm": 11.523671369849534, + "learning_rate": 4.999985116971158e-05, + "loss": 2.6696, + "mean_token_accuracy": 0.4344827592372894, + "step": 50735 + }, + { + "epoch": 0.0511058669011453, + "grad_norm": 14.418817469698231, + "learning_rate": 4.999984980367814e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.44482758045196535, + "step": 50740 + }, + { + "epoch": 0.05111090295424947, + "grad_norm": 11.923611585157808, + "learning_rate": 4.999984843140429e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.41379310488700866, + "step": 50745 + }, + { + "epoch": 0.051115939007353645, + "grad_norm": 11.478710784659865, + "learning_rate": 4.9999847052890015e-05, + "loss": 2.2794, + "mean_token_accuracy": 0.45517241954803467, + "step": 50750 + }, + { + "epoch": 0.05112097506045782, + "grad_norm": 12.27726143967275, + "learning_rate": 4.999984566813533e-05, + "loss": 2.8494, + "mean_token_accuracy": 0.4032667875289917, + "step": 50755 + }, + { + "epoch": 0.05112601111356199, + "grad_norm": 26.514712610701736, + "learning_rate": 4.9999844277140224e-05, + "loss": 2.976, + "mean_token_accuracy": 0.3379310339689255, + "step": 50760 + }, + { + "epoch": 0.051131047166666166, + "grad_norm": 16.6683723281907, + "learning_rate": 4.999984287990471e-05, + "loss": 2.6059, + "mean_token_accuracy": 0.38275861740112305, + "step": 50765 + }, + { + "epoch": 0.05113608321977033, + "grad_norm": 13.29468259950492, + "learning_rate": 4.999984147642878e-05, + "loss": 2.8289, + "mean_token_accuracy": 0.33793103098869326, + "step": 50770 + }, + { + "epoch": 0.051141119272874506, + "grad_norm": 10.503863866473427, + "learning_rate": 4.999984006671243e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.3896551728248596, + "step": 50775 + }, + { + "epoch": 0.05114615532597868, + "grad_norm": 12.258165165755115, + "learning_rate": 4.9999838650755667e-05, + "loss": 3.0564, + "mean_token_accuracy": 0.36551723480224607, + "step": 50780 + }, + { + "epoch": 0.051151191379082854, + "grad_norm": 11.738062119497405, + "learning_rate": 4.999983722855849e-05, + "loss": 2.423, + "mean_token_accuracy": 0.41379310488700866, + "step": 50785 + }, + { + "epoch": 0.05115622743218703, + "grad_norm": 12.837625698816778, + "learning_rate": 4.99998358001209e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.42068964838981626, + "step": 50790 + }, + { + "epoch": 0.0511612634852912, + "grad_norm": 9.94996131355436, + "learning_rate": 4.9999834365442894e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.4517241299152374, + "step": 50795 + }, + { + "epoch": 0.051166299538395375, + "grad_norm": 12.888870671234065, + "learning_rate": 4.999983292452448e-05, + "loss": 2.9839, + "mean_token_accuracy": 0.32413792610168457, + "step": 50800 + }, + { + "epoch": 0.05117133559149954, + "grad_norm": 14.406153247881461, + "learning_rate": 4.9999831477365645e-05, + "loss": 3.2232, + "mean_token_accuracy": 0.31724138259887696, + "step": 50805 + }, + { + "epoch": 0.051176371644603716, + "grad_norm": 14.378786650015185, + "learning_rate": 4.99998300239664e-05, + "loss": 2.75, + "mean_token_accuracy": 0.3655172407627106, + "step": 50810 + }, + { + "epoch": 0.05118140769770789, + "grad_norm": 19.22735985157965, + "learning_rate": 4.999982856432675e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.41034482717514037, + "step": 50815 + }, + { + "epoch": 0.05118644375081206, + "grad_norm": 15.522569753235098, + "learning_rate": 4.999982709844669e-05, + "loss": 2.9803, + "mean_token_accuracy": 0.3448275804519653, + "step": 50820 + }, + { + "epoch": 0.05119147980391624, + "grad_norm": 14.79317805750623, + "learning_rate": 4.999982562632621e-05, + "loss": 2.6382, + "mean_token_accuracy": 0.38965516686439516, + "step": 50825 + }, + { + "epoch": 0.05119651585702041, + "grad_norm": 12.34551948392326, + "learning_rate": 4.999982414796532e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.41379310488700866, + "step": 50830 + }, + { + "epoch": 0.051201551910124585, + "grad_norm": 12.961008118984303, + "learning_rate": 4.9999822663364026e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.4068965494632721, + "step": 50835 + }, + { + "epoch": 0.05120658796322875, + "grad_norm": 13.437188411337072, + "learning_rate": 4.9999821172522315e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.36896551251411436, + "step": 50840 + }, + { + "epoch": 0.051211624016332925, + "grad_norm": 12.443311128190949, + "learning_rate": 4.99998196754402e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.3551724135875702, + "step": 50845 + }, + { + "epoch": 0.0512166600694371, + "grad_norm": 14.096214012417848, + "learning_rate": 4.9999818172117675e-05, + "loss": 2.419, + "mean_token_accuracy": 0.41379310488700866, + "step": 50850 + }, + { + "epoch": 0.05122169612254127, + "grad_norm": 10.609964847332812, + "learning_rate": 4.999981666255475e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.38965516686439516, + "step": 50855 + }, + { + "epoch": 0.05122673217564545, + "grad_norm": 12.23619203623958, + "learning_rate": 4.99998151467514e-05, + "loss": 2.2994, + "mean_token_accuracy": 0.4275861978530884, + "step": 50860 + }, + { + "epoch": 0.05123176822874962, + "grad_norm": 12.02649319209877, + "learning_rate": 4.999981362470765e-05, + "loss": 2.4735, + "mean_token_accuracy": 0.43103447556495667, + "step": 50865 + }, + { + "epoch": 0.051236804281853794, + "grad_norm": 9.680217071372095, + "learning_rate": 4.9999812096423495e-05, + "loss": 2.2326, + "mean_token_accuracy": 0.4517241418361664, + "step": 50870 + }, + { + "epoch": 0.05124184033495796, + "grad_norm": 15.594835691087551, + "learning_rate": 4.999981056189893e-05, + "loss": 3.6037, + "mean_token_accuracy": 0.29310344755649564, + "step": 50875 + }, + { + "epoch": 0.051246876388062135, + "grad_norm": 15.3248595892866, + "learning_rate": 4.999980902113396e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.42758620977401735, + "step": 50880 + }, + { + "epoch": 0.05125191244116631, + "grad_norm": 12.458959127836875, + "learning_rate": 4.9999807474128584e-05, + "loss": 2.4134, + "mean_token_accuracy": 0.39655172228813174, + "step": 50885 + }, + { + "epoch": 0.05125694849427048, + "grad_norm": 12.11492084612793, + "learning_rate": 4.9999805920882805e-05, + "loss": 2.6406, + "mean_token_accuracy": 0.4, + "step": 50890 + }, + { + "epoch": 0.051261984547374656, + "grad_norm": 13.71665520899789, + "learning_rate": 4.9999804361396615e-05, + "loss": 2.7467, + "mean_token_accuracy": 0.33103448152542114, + "step": 50895 + }, + { + "epoch": 0.05126702060047883, + "grad_norm": 13.49433978997008, + "learning_rate": 4.999980279567003e-05, + "loss": 3.1413, + "mean_token_accuracy": 0.33448275923728943, + "step": 50900 + }, + { + "epoch": 0.051272056653583004, + "grad_norm": 14.392407630712961, + "learning_rate": 4.999980122370303e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4068965494632721, + "step": 50905 + }, + { + "epoch": 0.05127709270668717, + "grad_norm": 13.091900175287327, + "learning_rate": 4.999979964549564e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.37931033968925476, + "step": 50910 + }, + { + "epoch": 0.051282128759791344, + "grad_norm": 14.003193125547206, + "learning_rate": 4.999979806104783e-05, + "loss": 2.9264, + "mean_token_accuracy": 0.32413792312145234, + "step": 50915 + }, + { + "epoch": 0.05128716481289552, + "grad_norm": 11.263117746217645, + "learning_rate": 4.999979647035963e-05, + "loss": 2.656, + "mean_token_accuracy": 0.38965516686439516, + "step": 50920 + }, + { + "epoch": 0.05129220086599969, + "grad_norm": 13.250377362367928, + "learning_rate": 4.999979487343102e-05, + "loss": 2.2998, + "mean_token_accuracy": 0.43647912740707395, + "step": 50925 + }, + { + "epoch": 0.051297236919103865, + "grad_norm": 14.187996994022388, + "learning_rate": 4.9999793270262006e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.3965517282485962, + "step": 50930 + }, + { + "epoch": 0.05130227297220804, + "grad_norm": 10.238075499048039, + "learning_rate": 4.9999791660852595e-05, + "loss": 2.4173, + "mean_token_accuracy": 0.47241379618644713, + "step": 50935 + }, + { + "epoch": 0.05130730902531221, + "grad_norm": 11.385759304110067, + "learning_rate": 4.999979004520278e-05, + "loss": 2.7381, + "mean_token_accuracy": 0.3551724076271057, + "step": 50940 + }, + { + "epoch": 0.05131234507841638, + "grad_norm": 11.001773678247298, + "learning_rate": 4.999978842331257e-05, + "loss": 2.6548, + "mean_token_accuracy": 0.33103448152542114, + "step": 50945 + }, + { + "epoch": 0.051317381131520554, + "grad_norm": 18.430164746876034, + "learning_rate": 4.9999786795181965e-05, + "loss": 2.719, + "mean_token_accuracy": 0.39310344457626345, + "step": 50950 + }, + { + "epoch": 0.05132241718462473, + "grad_norm": 13.003007931588273, + "learning_rate": 4.999978516081094e-05, + "loss": 2.297, + "mean_token_accuracy": 0.4620689630508423, + "step": 50955 + }, + { + "epoch": 0.0513274532377289, + "grad_norm": 16.122748725297082, + "learning_rate": 4.9999783520199534e-05, + "loss": 2.8609, + "mean_token_accuracy": 0.3843920171260834, + "step": 50960 + }, + { + "epoch": 0.051332489290833075, + "grad_norm": 12.55460038783875, + "learning_rate": 4.999978187334772e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.4000000059604645, + "step": 50965 + }, + { + "epoch": 0.05133752534393725, + "grad_norm": 12.255336830730991, + "learning_rate": 4.999978022025551e-05, + "loss": 2.1952, + "mean_token_accuracy": 0.4497882604598999, + "step": 50970 + }, + { + "epoch": 0.05134256139704142, + "grad_norm": 15.490637324884583, + "learning_rate": 4.99997785609229e-05, + "loss": 2.6487, + "mean_token_accuracy": 0.41034482717514037, + "step": 50975 + }, + { + "epoch": 0.05134759745014559, + "grad_norm": 10.169066129495206, + "learning_rate": 4.99997768953499e-05, + "loss": 2.8483, + "mean_token_accuracy": 0.41724138259887694, + "step": 50980 + }, + { + "epoch": 0.05135263350324976, + "grad_norm": 15.084322176414775, + "learning_rate": 4.9999775223536494e-05, + "loss": 2.635, + "mean_token_accuracy": 0.4, + "step": 50985 + }, + { + "epoch": 0.05135766955635394, + "grad_norm": 12.371185090920786, + "learning_rate": 4.9999773545482694e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.3793103456497192, + "step": 50990 + }, + { + "epoch": 0.05136270560945811, + "grad_norm": 10.646839802882313, + "learning_rate": 4.99997718611885e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.4172413766384125, + "step": 50995 + }, + { + "epoch": 0.051367741662562284, + "grad_norm": 13.205666166697197, + "learning_rate": 4.999977017065391e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.3896551728248596, + "step": 51000 + }, + { + "epoch": 0.05137277771566646, + "grad_norm": 12.226788417692278, + "learning_rate": 4.999976847387892e-05, + "loss": 2.8101, + "mean_token_accuracy": 0.37241379618644715, + "step": 51005 + }, + { + "epoch": 0.05137781376877063, + "grad_norm": 12.031641548340382, + "learning_rate": 4.999976677086354e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.3793103337287903, + "step": 51010 + }, + { + "epoch": 0.0513828498218748, + "grad_norm": 11.968784098747346, + "learning_rate": 4.999976506160777e-05, + "loss": 2.8444, + "mean_token_accuracy": 0.41034482717514037, + "step": 51015 + }, + { + "epoch": 0.05138788587497897, + "grad_norm": 11.478885496503556, + "learning_rate": 4.9999763346111595e-05, + "loss": 2.8047, + "mean_token_accuracy": 0.3965517282485962, + "step": 51020 + }, + { + "epoch": 0.051392921928083146, + "grad_norm": 12.300036949229812, + "learning_rate": 4.999976162437504e-05, + "loss": 2.6939, + "mean_token_accuracy": 0.39310343861579894, + "step": 51025 + }, + { + "epoch": 0.05139795798118732, + "grad_norm": 11.8263698470749, + "learning_rate": 4.999975989639808e-05, + "loss": 2.4497, + "mean_token_accuracy": 0.42068964838981626, + "step": 51030 + }, + { + "epoch": 0.051402994034291494, + "grad_norm": 11.737202767093606, + "learning_rate": 4.9999758162180735e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.3896551728248596, + "step": 51035 + }, + { + "epoch": 0.05140803008739567, + "grad_norm": 11.514363948016042, + "learning_rate": 4.9999756421722995e-05, + "loss": 2.6212, + "mean_token_accuracy": 0.39310344457626345, + "step": 51040 + }, + { + "epoch": 0.05141306614049984, + "grad_norm": 10.675617574535094, + "learning_rate": 4.9999754675024865e-05, + "loss": 2.8054, + "mean_token_accuracy": 0.35172412991523744, + "step": 51045 + }, + { + "epoch": 0.05141810219360401, + "grad_norm": 12.256579072647709, + "learning_rate": 4.9999752922086345e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.3862068891525269, + "step": 51050 + }, + { + "epoch": 0.05142313824670818, + "grad_norm": 12.154667936496908, + "learning_rate": 4.999975116290743e-05, + "loss": 2.8157, + "mean_token_accuracy": 0.4137930989265442, + "step": 51055 + }, + { + "epoch": 0.051428174299812356, + "grad_norm": 16.363384659472228, + "learning_rate": 4.999974939748813e-05, + "loss": 2.7208, + "mean_token_accuracy": 0.43189655542373656, + "step": 51060 + }, + { + "epoch": 0.05143321035291653, + "grad_norm": 12.830348100363567, + "learning_rate": 4.999974762582843e-05, + "loss": 2.8218, + "mean_token_accuracy": 0.3862068891525269, + "step": 51065 + }, + { + "epoch": 0.0514382464060207, + "grad_norm": 11.622833138463445, + "learning_rate": 4.9999745847928354e-05, + "loss": 2.7577, + "mean_token_accuracy": 0.3448275804519653, + "step": 51070 + }, + { + "epoch": 0.05144328245912488, + "grad_norm": 10.258774237612803, + "learning_rate": 4.999974406378789e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.47428916692733764, + "step": 51075 + }, + { + "epoch": 0.05144831851222905, + "grad_norm": 12.441505544879986, + "learning_rate": 4.999974227340703e-05, + "loss": 2.6171, + "mean_token_accuracy": 0.39655172228813174, + "step": 51080 + }, + { + "epoch": 0.05145335456533322, + "grad_norm": 14.365569243100772, + "learning_rate": 4.999974047678579e-05, + "loss": 2.7089, + "mean_token_accuracy": 0.3758620619773865, + "step": 51085 + }, + { + "epoch": 0.05145839061843739, + "grad_norm": 13.577520838396728, + "learning_rate": 4.999973867392415e-05, + "loss": 2.9569, + "mean_token_accuracy": 0.31724138259887696, + "step": 51090 + }, + { + "epoch": 0.051463426671541565, + "grad_norm": 17.16753810806592, + "learning_rate": 4.999973686482213e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.37241379022598264, + "step": 51095 + }, + { + "epoch": 0.05146846272464574, + "grad_norm": 11.49485354134092, + "learning_rate": 4.999973504947974e-05, + "loss": 2.826, + "mean_token_accuracy": 0.3344827562570572, + "step": 51100 + }, + { + "epoch": 0.05147349877774991, + "grad_norm": 11.053012541885813, + "learning_rate": 4.9999733227896944e-05, + "loss": 2.2644, + "mean_token_accuracy": 0.4344827592372894, + "step": 51105 + }, + { + "epoch": 0.051478534830854086, + "grad_norm": 11.770033855592528, + "learning_rate": 4.999973140007377e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.4503327250480652, + "step": 51110 + }, + { + "epoch": 0.05148357088395826, + "grad_norm": 12.51707852906171, + "learning_rate": 4.999972956601022e-05, + "loss": 2.6553, + "mean_token_accuracy": 0.37586206793785093, + "step": 51115 + }, + { + "epoch": 0.05148860693706243, + "grad_norm": 13.271440631770309, + "learning_rate": 4.999972772570627e-05, + "loss": 2.6644, + "mean_token_accuracy": 0.4103448212146759, + "step": 51120 + }, + { + "epoch": 0.0514936429901666, + "grad_norm": 11.531979296531613, + "learning_rate": 4.999972587916195e-05, + "loss": 3.3401, + "mean_token_accuracy": 0.31724137961864474, + "step": 51125 + }, + { + "epoch": 0.051498679043270774, + "grad_norm": 11.839866141934124, + "learning_rate": 4.999972402637724e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.3724137842655182, + "step": 51130 + }, + { + "epoch": 0.05150371509637495, + "grad_norm": 10.175524918131666, + "learning_rate": 4.9999722167352143e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.3949183315038681, + "step": 51135 + }, + { + "epoch": 0.05150875114947912, + "grad_norm": 12.9938993814973, + "learning_rate": 4.9999720302086674e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.37586206793785093, + "step": 51140 + }, + { + "epoch": 0.051513787202583296, + "grad_norm": 14.039919128716551, + "learning_rate": 4.999971843058082e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.41929823756217954, + "step": 51145 + }, + { + "epoch": 0.05151882325568747, + "grad_norm": 10.985297110573095, + "learning_rate": 4.9999716552834585e-05, + "loss": 2.7327, + "mean_token_accuracy": 0.4, + "step": 51150 + }, + { + "epoch": 0.051523859308791636, + "grad_norm": 15.03183286569151, + "learning_rate": 4.9999714668847966e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.43103448748588563, + "step": 51155 + }, + { + "epoch": 0.05152889536189581, + "grad_norm": 10.679756864212496, + "learning_rate": 4.999971277862097e-05, + "loss": 2.7841, + "mean_token_accuracy": 0.4103448212146759, + "step": 51160 + }, + { + "epoch": 0.051533931414999984, + "grad_norm": 12.764146387174659, + "learning_rate": 4.99997108821536e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.38620689511299133, + "step": 51165 + }, + { + "epoch": 0.05153896746810416, + "grad_norm": 13.489231022979125, + "learning_rate": 4.9999708979445844e-05, + "loss": 2.3699, + "mean_token_accuracy": 0.40689656138420105, + "step": 51170 + }, + { + "epoch": 0.05154400352120833, + "grad_norm": 12.3125489594521, + "learning_rate": 4.999970707049772e-05, + "loss": 2.5374, + "mean_token_accuracy": 0.42758620977401735, + "step": 51175 + }, + { + "epoch": 0.051549039574312505, + "grad_norm": 12.669920988744435, + "learning_rate": 4.999970515530921e-05, + "loss": 2.1664, + "mean_token_accuracy": 0.4862069010734558, + "step": 51180 + }, + { + "epoch": 0.05155407562741668, + "grad_norm": 12.05119087802671, + "learning_rate": 4.999970323388032e-05, + "loss": 2.6343, + "mean_token_accuracy": 0.36896551847457887, + "step": 51185 + }, + { + "epoch": 0.051559111680520846, + "grad_norm": 32.00112245250176, + "learning_rate": 4.999970130621106e-05, + "loss": 3.1162, + "mean_token_accuracy": 0.3655172407627106, + "step": 51190 + }, + { + "epoch": 0.05156414773362502, + "grad_norm": 11.417235798421409, + "learning_rate": 4.999969937230142e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.44827585816383364, + "step": 51195 + }, + { + "epoch": 0.05156918378672919, + "grad_norm": 13.061022705416535, + "learning_rate": 4.9999697432151405e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.3448275804519653, + "step": 51200 + }, + { + "epoch": 0.05157421983983337, + "grad_norm": 10.423564866242934, + "learning_rate": 4.9999695485761016e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.4811857223510742, + "step": 51205 + }, + { + "epoch": 0.05157925589293754, + "grad_norm": 12.289162809274536, + "learning_rate": 4.999969353313025e-05, + "loss": 2.7429, + "mean_token_accuracy": 0.36896551847457887, + "step": 51210 + }, + { + "epoch": 0.051584291946041715, + "grad_norm": 14.513978196107391, + "learning_rate": 4.9999691574259116e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.4310344815254211, + "step": 51215 + }, + { + "epoch": 0.05158932799914589, + "grad_norm": 12.759548930126149, + "learning_rate": 4.9999689609147605e-05, + "loss": 3.0401, + "mean_token_accuracy": 0.34482758641242983, + "step": 51220 + }, + { + "epoch": 0.051594364052250055, + "grad_norm": 14.07693033166377, + "learning_rate": 4.9999687637795724e-05, + "loss": 2.635, + "mean_token_accuracy": 0.37241379618644715, + "step": 51225 + }, + { + "epoch": 0.05159940010535423, + "grad_norm": 12.769116993783562, + "learning_rate": 4.999968566020346e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.4068965494632721, + "step": 51230 + }, + { + "epoch": 0.0516044361584584, + "grad_norm": 13.519669918323107, + "learning_rate": 4.999968367637083e-05, + "loss": 2.5875, + "mean_token_accuracy": 0.4103448212146759, + "step": 51235 + }, + { + "epoch": 0.051609472211562576, + "grad_norm": 13.091166493068675, + "learning_rate": 4.999968168629783e-05, + "loss": 3.0719, + "mean_token_accuracy": 0.37931033968925476, + "step": 51240 + }, + { + "epoch": 0.05161450826466675, + "grad_norm": 13.276585034756858, + "learning_rate": 4.9999679689984464e-05, + "loss": 2.7862, + "mean_token_accuracy": 0.3620689630508423, + "step": 51245 + }, + { + "epoch": 0.051619544317770924, + "grad_norm": 13.147118886267071, + "learning_rate": 4.999967768743072e-05, + "loss": 2.6313, + "mean_token_accuracy": 0.37586206793785093, + "step": 51250 + }, + { + "epoch": 0.0516245803708751, + "grad_norm": 11.7782902842485, + "learning_rate": 4.999967567863661e-05, + "loss": 2.563, + "mean_token_accuracy": 0.3862068891525269, + "step": 51255 + }, + { + "epoch": 0.051629616423979265, + "grad_norm": 13.650756641481388, + "learning_rate": 4.999967366360213e-05, + "loss": 2.2632, + "mean_token_accuracy": 0.44827587008476255, + "step": 51260 + }, + { + "epoch": 0.05163465247708344, + "grad_norm": 9.390638167383264, + "learning_rate": 4.999967164232729e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.41034482717514037, + "step": 51265 + }, + { + "epoch": 0.05163968853018761, + "grad_norm": 11.207402260636966, + "learning_rate": 4.999966961481207e-05, + "loss": 2.3716, + "mean_token_accuracy": 0.39655172228813174, + "step": 51270 + }, + { + "epoch": 0.051644724583291786, + "grad_norm": 11.66384245821523, + "learning_rate": 4.999966758105649e-05, + "loss": 3.0666, + "mean_token_accuracy": 0.34482758641242983, + "step": 51275 + }, + { + "epoch": 0.05164976063639596, + "grad_norm": 16.26844994417877, + "learning_rate": 4.9999665541060536e-05, + "loss": 2.5265, + "mean_token_accuracy": 0.3999999940395355, + "step": 51280 + }, + { + "epoch": 0.05165479668950013, + "grad_norm": 9.416030849993486, + "learning_rate": 4.999966349482423e-05, + "loss": 2.1766, + "mean_token_accuracy": 0.4379310369491577, + "step": 51285 + }, + { + "epoch": 0.05165983274260431, + "grad_norm": 11.228840046175568, + "learning_rate": 4.999966144234754e-05, + "loss": 2.7072, + "mean_token_accuracy": 0.35862069129943847, + "step": 51290 + }, + { + "epoch": 0.051664868795708474, + "grad_norm": 17.258905051698523, + "learning_rate": 4.99996593836305e-05, + "loss": 2.8208, + "mean_token_accuracy": 0.4119782209396362, + "step": 51295 + }, + { + "epoch": 0.05166990484881265, + "grad_norm": 13.075396370550544, + "learning_rate": 4.999965731867309e-05, + "loss": 2.7675, + "mean_token_accuracy": 0.4068965494632721, + "step": 51300 + }, + { + "epoch": 0.05167494090191682, + "grad_norm": 13.227620589192181, + "learning_rate": 4.999965524747531e-05, + "loss": 2.5024, + "mean_token_accuracy": 0.38106473684310915, + "step": 51305 + }, + { + "epoch": 0.051679976955020995, + "grad_norm": 13.439786276457877, + "learning_rate": 4.9999653170037175e-05, + "loss": 2.6135, + "mean_token_accuracy": 0.4137930989265442, + "step": 51310 + }, + { + "epoch": 0.05168501300812517, + "grad_norm": 10.306234272154814, + "learning_rate": 4.9999651086358676e-05, + "loss": 2.5221, + "mean_token_accuracy": 0.4310949832201004, + "step": 51315 + }, + { + "epoch": 0.05169004906122934, + "grad_norm": 13.783639158989327, + "learning_rate": 4.9999648996439814e-05, + "loss": 2.668, + "mean_token_accuracy": 0.35862069129943847, + "step": 51320 + }, + { + "epoch": 0.05169508511433352, + "grad_norm": 21.178810627013096, + "learning_rate": 4.999964690028059e-05, + "loss": 2.7821, + "mean_token_accuracy": 0.38275861740112305, + "step": 51325 + }, + { + "epoch": 0.05170012116743768, + "grad_norm": 11.872769669137597, + "learning_rate": 4.999964479788101e-05, + "loss": 2.5241, + "mean_token_accuracy": 0.4137930989265442, + "step": 51330 + }, + { + "epoch": 0.05170515722054186, + "grad_norm": 16.811573310792177, + "learning_rate": 4.999964268924106e-05, + "loss": 2.6429, + "mean_token_accuracy": 0.3620689630508423, + "step": 51335 + }, + { + "epoch": 0.05171019327364603, + "grad_norm": 14.060322519721295, + "learning_rate": 4.9999640574360754e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.4379310369491577, + "step": 51340 + }, + { + "epoch": 0.051715229326750205, + "grad_norm": 14.041968949191542, + "learning_rate": 4.999963845324009e-05, + "loss": 2.7971, + "mean_token_accuracy": 0.37241379618644715, + "step": 51345 + }, + { + "epoch": 0.05172026537985438, + "grad_norm": 26.808375329128097, + "learning_rate": 4.999963632587906e-05, + "loss": 3.2629, + "mean_token_accuracy": 0.33793102502822875, + "step": 51350 + }, + { + "epoch": 0.05172530143295855, + "grad_norm": 12.249344033156914, + "learning_rate": 4.9999634192277686e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4413793087005615, + "step": 51355 + }, + { + "epoch": 0.051730337486062726, + "grad_norm": 12.729540291138045, + "learning_rate": 4.999963205243594e-05, + "loss": 2.7996, + "mean_token_accuracy": 0.39655172228813174, + "step": 51360 + }, + { + "epoch": 0.05173537353916689, + "grad_norm": 13.20535539595901, + "learning_rate": 4.999962990635385e-05, + "loss": 2.9736, + "mean_token_accuracy": 0.3896551728248596, + "step": 51365 + }, + { + "epoch": 0.05174040959227107, + "grad_norm": 11.84301437140591, + "learning_rate": 4.9999627754031395e-05, + "loss": 2.3046, + "mean_token_accuracy": 0.4413793057203293, + "step": 51370 + }, + { + "epoch": 0.05174544564537524, + "grad_norm": 11.289490458225915, + "learning_rate": 4.99996255954686e-05, + "loss": 2.5873, + "mean_token_accuracy": 0.38965516686439516, + "step": 51375 + }, + { + "epoch": 0.051750481698479414, + "grad_norm": 15.591185201798947, + "learning_rate": 4.999962343066543e-05, + "loss": 2.7343, + "mean_token_accuracy": 0.3551724135875702, + "step": 51380 + }, + { + "epoch": 0.05175551775158359, + "grad_norm": 12.352233481478585, + "learning_rate": 4.999962125962191e-05, + "loss": 2.5227, + "mean_token_accuracy": 0.3793103456497192, + "step": 51385 + }, + { + "epoch": 0.05176055380468776, + "grad_norm": 14.817630170556203, + "learning_rate": 4.999961908233804e-05, + "loss": 3.08, + "mean_token_accuracy": 0.3551724135875702, + "step": 51390 + }, + { + "epoch": 0.051765589857791935, + "grad_norm": 11.995264548169345, + "learning_rate": 4.999961689881382e-05, + "loss": 2.673, + "mean_token_accuracy": 0.39655172228813174, + "step": 51395 + }, + { + "epoch": 0.0517706259108961, + "grad_norm": 11.389041458113113, + "learning_rate": 4.9999614709049234e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4310344815254211, + "step": 51400 + }, + { + "epoch": 0.051775661964000276, + "grad_norm": 12.232049000050218, + "learning_rate": 4.9999612513044306e-05, + "loss": 2.8424, + "mean_token_accuracy": 0.3655172437429428, + "step": 51405 + }, + { + "epoch": 0.05178069801710445, + "grad_norm": 11.672051781631591, + "learning_rate": 4.999961031079903e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.4, + "step": 51410 + }, + { + "epoch": 0.051785734070208624, + "grad_norm": 12.653184436831527, + "learning_rate": 4.99996081023134e-05, + "loss": 2.6616, + "mean_token_accuracy": 0.40344828367233276, + "step": 51415 + }, + { + "epoch": 0.0517907701233128, + "grad_norm": 11.76446431990547, + "learning_rate": 4.999960588758742e-05, + "loss": 2.8634, + "mean_token_accuracy": 0.404718691110611, + "step": 51420 + }, + { + "epoch": 0.05179580617641697, + "grad_norm": 18.54822727521575, + "learning_rate": 4.9999603666621085e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.4310344815254211, + "step": 51425 + }, + { + "epoch": 0.051800842229521145, + "grad_norm": 15.21484669007347, + "learning_rate": 4.9999601439414405e-05, + "loss": 2.6596, + "mean_token_accuracy": 0.37241379022598264, + "step": 51430 + }, + { + "epoch": 0.05180587828262531, + "grad_norm": 11.519423853241912, + "learning_rate": 4.9999599205967376e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.37241379618644715, + "step": 51435 + }, + { + "epoch": 0.051810914335729485, + "grad_norm": 14.557774915715163, + "learning_rate": 4.999959696628e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.39655173420906065, + "step": 51440 + }, + { + "epoch": 0.05181595038883366, + "grad_norm": 13.602200023616282, + "learning_rate": 4.999959472035228e-05, + "loss": 2.6285, + "mean_token_accuracy": 0.39655172228813174, + "step": 51445 + }, + { + "epoch": 0.05182098644193783, + "grad_norm": 18.327072905315713, + "learning_rate": 4.99995924681842e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.4034482717514038, + "step": 51450 + }, + { + "epoch": 0.05182602249504201, + "grad_norm": 15.257442315203056, + "learning_rate": 4.999959020977579e-05, + "loss": 2.5188, + "mean_token_accuracy": 0.43569267392158506, + "step": 51455 + }, + { + "epoch": 0.05183105854814618, + "grad_norm": 11.769360807657616, + "learning_rate": 4.999958794512703e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.42413793206214906, + "step": 51460 + }, + { + "epoch": 0.051836094601250354, + "grad_norm": 13.543668896104613, + "learning_rate": 4.999958567423793e-05, + "loss": 2.6288, + "mean_token_accuracy": 0.3931034505367279, + "step": 51465 + }, + { + "epoch": 0.05184113065435452, + "grad_norm": 13.846616772056356, + "learning_rate": 4.999958339710847e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.3931034356355667, + "step": 51470 + }, + { + "epoch": 0.051846166707458695, + "grad_norm": 13.021003741549926, + "learning_rate": 4.999958111373867e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.3931034505367279, + "step": 51475 + }, + { + "epoch": 0.05185120276056287, + "grad_norm": 24.264462849697768, + "learning_rate": 4.999957882412855e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.4186932861804962, + "step": 51480 + }, + { + "epoch": 0.05185623881366704, + "grad_norm": 11.040854275060669, + "learning_rate": 4.999957652827807e-05, + "loss": 2.3008, + "mean_token_accuracy": 0.44827587008476255, + "step": 51485 + }, + { + "epoch": 0.051861274866771216, + "grad_norm": 13.086499436466484, + "learning_rate": 4.9999574226187244e-05, + "loss": 2.3235, + "mean_token_accuracy": 0.41034482717514037, + "step": 51490 + }, + { + "epoch": 0.05186631091987539, + "grad_norm": 15.227021717539586, + "learning_rate": 4.999957191785608e-05, + "loss": 2.8223, + "mean_token_accuracy": 0.38396853804588316, + "step": 51495 + }, + { + "epoch": 0.051871346972979564, + "grad_norm": 13.044571434700508, + "learning_rate": 4.999956960328458e-05, + "loss": 2.9737, + "mean_token_accuracy": 0.3517241358757019, + "step": 51500 + }, + { + "epoch": 0.05187638302608373, + "grad_norm": 13.56020772141275, + "learning_rate": 4.999956728247274e-05, + "loss": 2.955, + "mean_token_accuracy": 0.3344827562570572, + "step": 51505 + }, + { + "epoch": 0.051881419079187904, + "grad_norm": 13.283307949099592, + "learning_rate": 4.999956495542057e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.38275861740112305, + "step": 51510 + }, + { + "epoch": 0.05188645513229208, + "grad_norm": 15.351473583826742, + "learning_rate": 4.9999562622128045e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.3965517163276672, + "step": 51515 + }, + { + "epoch": 0.05189149118539625, + "grad_norm": 14.749473682088102, + "learning_rate": 4.999956028259519e-05, + "loss": 2.6267, + "mean_token_accuracy": 0.3448275804519653, + "step": 51520 + }, + { + "epoch": 0.051896527238500426, + "grad_norm": 11.639286347159493, + "learning_rate": 4.9999557936822e-05, + "loss": 2.3788, + "mean_token_accuracy": 0.4137930989265442, + "step": 51525 + }, + { + "epoch": 0.0519015632916046, + "grad_norm": 14.391390418917288, + "learning_rate": 4.9999555584808465e-05, + "loss": 2.8489, + "mean_token_accuracy": 0.33793103098869326, + "step": 51530 + }, + { + "epoch": 0.05190659934470877, + "grad_norm": 13.133728509129215, + "learning_rate": 4.99995532265546e-05, + "loss": 2.5761, + "mean_token_accuracy": 0.37586206793785093, + "step": 51535 + }, + { + "epoch": 0.05191163539781294, + "grad_norm": 12.494381510844713, + "learning_rate": 4.99995508620604e-05, + "loss": 2.8219, + "mean_token_accuracy": 0.3758620619773865, + "step": 51540 + }, + { + "epoch": 0.051916671450917114, + "grad_norm": 11.07871394599296, + "learning_rate": 4.999954849132588e-05, + "loss": 2.907, + "mean_token_accuracy": 0.36896551251411436, + "step": 51545 + }, + { + "epoch": 0.05192170750402129, + "grad_norm": 13.829573751328976, + "learning_rate": 4.999954611435101e-05, + "loss": 2.7079, + "mean_token_accuracy": 0.358620685338974, + "step": 51550 + }, + { + "epoch": 0.05192674355712546, + "grad_norm": 11.982410234925275, + "learning_rate": 4.99995437311358e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.39655172228813174, + "step": 51555 + }, + { + "epoch": 0.051931779610229635, + "grad_norm": 12.280582909639634, + "learning_rate": 4.999954134168027e-05, + "loss": 2.8146, + "mean_token_accuracy": 0.39310344457626345, + "step": 51560 + }, + { + "epoch": 0.05193681566333381, + "grad_norm": 12.607713751972994, + "learning_rate": 4.999953894598441e-05, + "loss": 2.7687, + "mean_token_accuracy": 0.4310344815254211, + "step": 51565 + }, + { + "epoch": 0.05194185171643798, + "grad_norm": 17.117914948019205, + "learning_rate": 4.999953654404821e-05, + "loss": 2.938, + "mean_token_accuracy": 0.3586206942796707, + "step": 51570 + }, + { + "epoch": 0.05194688776954215, + "grad_norm": 13.74302062263069, + "learning_rate": 4.999953413587169e-05, + "loss": 2.9738, + "mean_token_accuracy": 0.35862069129943847, + "step": 51575 + }, + { + "epoch": 0.05195192382264632, + "grad_norm": 12.981803458447178, + "learning_rate": 4.999953172145484e-05, + "loss": 2.9753, + "mean_token_accuracy": 0.324137932062149, + "step": 51580 + }, + { + "epoch": 0.0519569598757505, + "grad_norm": 15.36083719700262, + "learning_rate": 4.999952930079765e-05, + "loss": 2.8738, + "mean_token_accuracy": 0.3620689630508423, + "step": 51585 + }, + { + "epoch": 0.05196199592885467, + "grad_norm": 9.99752250040171, + "learning_rate": 4.9999526873900133e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4103448212146759, + "step": 51590 + }, + { + "epoch": 0.051967031981958844, + "grad_norm": 19.43790074288775, + "learning_rate": 4.999952444076229e-05, + "loss": 2.9239, + "mean_token_accuracy": 0.3482758581638336, + "step": 51595 + }, + { + "epoch": 0.05197206803506302, + "grad_norm": 13.717880053116458, + "learning_rate": 4.999952200138413e-05, + "loss": 2.5889, + "mean_token_accuracy": 0.3896551787853241, + "step": 51600 + }, + { + "epoch": 0.05197710408816719, + "grad_norm": 11.498437928134406, + "learning_rate": 4.9999519555765625e-05, + "loss": 2.7037, + "mean_token_accuracy": 0.3551724165678024, + "step": 51605 + }, + { + "epoch": 0.05198214014127136, + "grad_norm": 12.341570437492297, + "learning_rate": 4.9999517103906813e-05, + "loss": 2.8332, + "mean_token_accuracy": 0.3620689630508423, + "step": 51610 + }, + { + "epoch": 0.05198717619437553, + "grad_norm": 14.564671008418772, + "learning_rate": 4.9999514645807666e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.47241379618644713, + "step": 51615 + }, + { + "epoch": 0.051992212247479706, + "grad_norm": 12.519847970455341, + "learning_rate": 4.99995121814682e-05, + "loss": 3.4641, + "mean_token_accuracy": 0.2758620709180832, + "step": 51620 + }, + { + "epoch": 0.05199724830058388, + "grad_norm": 11.639487803280984, + "learning_rate": 4.9999509710888406e-05, + "loss": 2.4807, + "mean_token_accuracy": 0.4310344815254211, + "step": 51625 + }, + { + "epoch": 0.052002284353688054, + "grad_norm": 9.916910592851048, + "learning_rate": 4.9999507234068285e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.44827585816383364, + "step": 51630 + }, + { + "epoch": 0.05200732040679223, + "grad_norm": 12.906453941158953, + "learning_rate": 4.999950475100785e-05, + "loss": 3.1005, + "mean_token_accuracy": 0.3310344755649567, + "step": 51635 + }, + { + "epoch": 0.0520123564598964, + "grad_norm": 24.560825195198234, + "learning_rate": 4.999950226170709e-05, + "loss": 3.3394, + "mean_token_accuracy": 0.33448275923728943, + "step": 51640 + }, + { + "epoch": 0.05201739251300057, + "grad_norm": 10.597612371892525, + "learning_rate": 4.9999499766166006e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.4206896543502808, + "step": 51645 + }, + { + "epoch": 0.05202242856610474, + "grad_norm": 12.16400483823541, + "learning_rate": 4.9999497264384604e-05, + "loss": 2.5568, + "mean_token_accuracy": 0.3758620619773865, + "step": 51650 + }, + { + "epoch": 0.052027464619208916, + "grad_norm": 12.369190939519763, + "learning_rate": 4.9999494756362886e-05, + "loss": 2.7597, + "mean_token_accuracy": 0.4172413766384125, + "step": 51655 + }, + { + "epoch": 0.05203250067231309, + "grad_norm": 13.127422412411152, + "learning_rate": 4.999949224210085e-05, + "loss": 2.1539, + "mean_token_accuracy": 0.4551724076271057, + "step": 51660 + }, + { + "epoch": 0.05203753672541726, + "grad_norm": 14.268059670233528, + "learning_rate": 4.999948972159849e-05, + "loss": 2.9776, + "mean_token_accuracy": 0.3827586233615875, + "step": 51665 + }, + { + "epoch": 0.05204257277852144, + "grad_norm": 11.595923127274558, + "learning_rate": 4.9999487194855815e-05, + "loss": 2.7785, + "mean_token_accuracy": 0.403448274731636, + "step": 51670 + }, + { + "epoch": 0.05204760883162561, + "grad_norm": 12.720134450636516, + "learning_rate": 4.999948466187282e-05, + "loss": 2.6524, + "mean_token_accuracy": 0.36206896901130675, + "step": 51675 + }, + { + "epoch": 0.05205264488472978, + "grad_norm": 15.636618475390302, + "learning_rate": 4.999948212264952e-05, + "loss": 2.7185, + "mean_token_accuracy": 0.33103448152542114, + "step": 51680 + }, + { + "epoch": 0.05205768093783395, + "grad_norm": 14.81298550824267, + "learning_rate": 4.999947957718589e-05, + "loss": 2.7656, + "mean_token_accuracy": 0.3655172407627106, + "step": 51685 + }, + { + "epoch": 0.052062716990938125, + "grad_norm": 10.339322387030133, + "learning_rate": 4.999947702548196e-05, + "loss": 2.6231, + "mean_token_accuracy": 0.39655172228813174, + "step": 51690 + }, + { + "epoch": 0.0520677530440423, + "grad_norm": 13.347273905295713, + "learning_rate": 4.999947446753771e-05, + "loss": 2.6441, + "mean_token_accuracy": 0.41911675930023196, + "step": 51695 + }, + { + "epoch": 0.05207278909714647, + "grad_norm": 10.594017643763067, + "learning_rate": 4.999947190335314e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.43793103098869324, + "step": 51700 + }, + { + "epoch": 0.052077825150250646, + "grad_norm": 12.803844328009665, + "learning_rate": 4.999946933292827e-05, + "loss": 2.7878, + "mean_token_accuracy": 0.38620689511299133, + "step": 51705 + }, + { + "epoch": 0.05208286120335482, + "grad_norm": 14.976573455990588, + "learning_rate": 4.9999466756263075e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.37241379022598264, + "step": 51710 + }, + { + "epoch": 0.05208789725645899, + "grad_norm": 12.657586640069539, + "learning_rate": 4.9999464173357574e-05, + "loss": 2.416, + "mean_token_accuracy": 0.43448275327682495, + "step": 51715 + }, + { + "epoch": 0.05209293330956316, + "grad_norm": 12.644352917246177, + "learning_rate": 4.999946158421177e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.417241370677948, + "step": 51720 + }, + { + "epoch": 0.052097969362667335, + "grad_norm": 12.32544767388427, + "learning_rate": 4.9999458988825646e-05, + "loss": 2.7227, + "mean_token_accuracy": 0.38620689511299133, + "step": 51725 + }, + { + "epoch": 0.05210300541577151, + "grad_norm": 12.918179747887688, + "learning_rate": 4.999945638719922e-05, + "loss": 2.9189, + "mean_token_accuracy": 0.32413792610168457, + "step": 51730 + }, + { + "epoch": 0.05210804146887568, + "grad_norm": 92.34655459463046, + "learning_rate": 4.999945377933248e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.4310344696044922, + "step": 51735 + }, + { + "epoch": 0.052113077521979856, + "grad_norm": 12.836650729429833, + "learning_rate": 4.999945116522544e-05, + "loss": 2.2936, + "mean_token_accuracy": 0.4862069010734558, + "step": 51740 + }, + { + "epoch": 0.05211811357508403, + "grad_norm": 11.377132865226105, + "learning_rate": 4.9999448544878086e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.38620689511299133, + "step": 51745 + }, + { + "epoch": 0.052123149628188196, + "grad_norm": 10.256913076416197, + "learning_rate": 4.999944591829043e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4344827592372894, + "step": 51750 + }, + { + "epoch": 0.05212818568129237, + "grad_norm": 11.238740482668224, + "learning_rate": 4.999944328546247e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.45172414779663084, + "step": 51755 + }, + { + "epoch": 0.052133221734396544, + "grad_norm": 14.771451343348712, + "learning_rate": 4.9999440646394204e-05, + "loss": 2.466, + "mean_token_accuracy": 0.4137930989265442, + "step": 51760 + }, + { + "epoch": 0.05213825778750072, + "grad_norm": 17.362302224155048, + "learning_rate": 4.999943800108564e-05, + "loss": 2.2653, + "mean_token_accuracy": 0.4801724135875702, + "step": 51765 + }, + { + "epoch": 0.05214329384060489, + "grad_norm": 14.571726134069939, + "learning_rate": 4.999943534953676e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.4448275864124298, + "step": 51770 + }, + { + "epoch": 0.052148329893709065, + "grad_norm": 13.157735294888793, + "learning_rate": 4.999943269174759e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.4241379380226135, + "step": 51775 + }, + { + "epoch": 0.05215336594681324, + "grad_norm": 15.09499073769964, + "learning_rate": 4.9999430027718106e-05, + "loss": 3.0651, + "mean_token_accuracy": 0.3000000029802322, + "step": 51780 + }, + { + "epoch": 0.052158401999917406, + "grad_norm": 12.929928086677865, + "learning_rate": 4.999942735744833e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.37586206793785093, + "step": 51785 + }, + { + "epoch": 0.05216343805302158, + "grad_norm": 12.062415224600912, + "learning_rate": 4.999942468093825e-05, + "loss": 3.1127, + "mean_token_accuracy": 0.35517241060733795, + "step": 51790 + }, + { + "epoch": 0.05216847410612575, + "grad_norm": 12.788421092382398, + "learning_rate": 4.999942199818787e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.42256503701210024, + "step": 51795 + }, + { + "epoch": 0.05217351015922993, + "grad_norm": 24.8238997749523, + "learning_rate": 4.99994193091972e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.4068965494632721, + "step": 51800 + }, + { + "epoch": 0.0521785462123341, + "grad_norm": 12.04873849761242, + "learning_rate": 4.999941661396622e-05, + "loss": 2.7465, + "mean_token_accuracy": 0.39655172228813174, + "step": 51805 + }, + { + "epoch": 0.052183582265438275, + "grad_norm": 15.141144305014784, + "learning_rate": 4.999941391249495e-05, + "loss": 3.0766, + "mean_token_accuracy": 0.36551724672317504, + "step": 51810 + }, + { + "epoch": 0.05218861831854245, + "grad_norm": 12.924091047932116, + "learning_rate": 4.9999411204783386e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.43103448748588563, + "step": 51815 + }, + { + "epoch": 0.052193654371646615, + "grad_norm": 11.939436121383284, + "learning_rate": 4.999940849083152e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.3551724076271057, + "step": 51820 + }, + { + "epoch": 0.05219869042475079, + "grad_norm": 11.766971299794431, + "learning_rate": 4.9999405770639366e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4206896543502808, + "step": 51825 + }, + { + "epoch": 0.05220372647785496, + "grad_norm": 13.464807863130709, + "learning_rate": 4.9999403044206916e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.36896551847457887, + "step": 51830 + }, + { + "epoch": 0.05220876253095914, + "grad_norm": 10.643781827120558, + "learning_rate": 4.9999400311534165e-05, + "loss": 2.5752, + "mean_token_accuracy": 0.44137930274009707, + "step": 51835 + }, + { + "epoch": 0.05221379858406331, + "grad_norm": 14.622420326068994, + "learning_rate": 4.999939757262113e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.41034482717514037, + "step": 51840 + }, + { + "epoch": 0.052218834637167484, + "grad_norm": 12.281396744478528, + "learning_rate": 4.999939482746779e-05, + "loss": 2.7527, + "mean_token_accuracy": 0.38620689511299133, + "step": 51845 + }, + { + "epoch": 0.05222387069027166, + "grad_norm": 13.165128704313105, + "learning_rate": 4.9999392076074176e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.46394434571266174, + "step": 51850 + }, + { + "epoch": 0.052228906743375825, + "grad_norm": 11.12759346888424, + "learning_rate": 4.999938931844026e-05, + "loss": 2.5365, + "mean_token_accuracy": 0.38965516686439516, + "step": 51855 + }, + { + "epoch": 0.05223394279648, + "grad_norm": 11.397626474171725, + "learning_rate": 4.9999386554566067e-05, + "loss": 2.9969, + "mean_token_accuracy": 0.3344827562570572, + "step": 51860 + }, + { + "epoch": 0.05223897884958417, + "grad_norm": 12.487777989322241, + "learning_rate": 4.999938378445157e-05, + "loss": 2.8486, + "mean_token_accuracy": 0.3862069010734558, + "step": 51865 + }, + { + "epoch": 0.052244014902688346, + "grad_norm": 9.433906355649128, + "learning_rate": 4.99993810080968e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.3896551728248596, + "step": 51870 + }, + { + "epoch": 0.05224905095579252, + "grad_norm": 11.904287031743506, + "learning_rate": 4.9999378225501734e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.4241379380226135, + "step": 51875 + }, + { + "epoch": 0.052254087008896694, + "grad_norm": 12.040427949653834, + "learning_rate": 4.999937543666638e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.47241379618644713, + "step": 51880 + }, + { + "epoch": 0.05225912306200087, + "grad_norm": 13.920480453844364, + "learning_rate": 4.999937264159074e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.3999999940395355, + "step": 51885 + }, + { + "epoch": 0.052264159115105034, + "grad_norm": 14.586932860755967, + "learning_rate": 4.999936984027483e-05, + "loss": 2.462, + "mean_token_accuracy": 0.3793103456497192, + "step": 51890 + }, + { + "epoch": 0.05226919516820921, + "grad_norm": 18.092399449270154, + "learning_rate": 4.999936703271862e-05, + "loss": 2.507, + "mean_token_accuracy": 0.40302479863166807, + "step": 51895 + }, + { + "epoch": 0.05227423122131338, + "grad_norm": 14.177754307940212, + "learning_rate": 4.9999364218922126e-05, + "loss": 3.1606, + "mean_token_accuracy": 0.35172413289546967, + "step": 51900 + }, + { + "epoch": 0.052279267274417555, + "grad_norm": 10.697912852304334, + "learning_rate": 4.999936139888535e-05, + "loss": 2.2602, + "mean_token_accuracy": 0.4482758641242981, + "step": 51905 + }, + { + "epoch": 0.05228430332752173, + "grad_norm": 11.28488865369071, + "learning_rate": 4.99993585726083e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.3655172407627106, + "step": 51910 + }, + { + "epoch": 0.0522893393806259, + "grad_norm": 12.66233430467587, + "learning_rate": 4.9999355740090956e-05, + "loss": 3.0583, + "mean_token_accuracy": 0.358620685338974, + "step": 51915 + }, + { + "epoch": 0.05229437543373008, + "grad_norm": 10.620397930173764, + "learning_rate": 4.9999352901333346e-05, + "loss": 2.8388, + "mean_token_accuracy": 0.3655172407627106, + "step": 51920 + }, + { + "epoch": 0.052299411486834244, + "grad_norm": 13.238533620290577, + "learning_rate": 4.9999350056335454e-05, + "loss": 2.7829, + "mean_token_accuracy": 0.38275861740112305, + "step": 51925 + }, + { + "epoch": 0.05230444753993842, + "grad_norm": 12.28203650063212, + "learning_rate": 4.999934720509728e-05, + "loss": 2.4589, + "mean_token_accuracy": 0.3931034505367279, + "step": 51930 + }, + { + "epoch": 0.05230948359304259, + "grad_norm": 20.535431642558393, + "learning_rate": 4.9999344347618826e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.41240169405937194, + "step": 51935 + }, + { + "epoch": 0.052314519646146765, + "grad_norm": 11.985089735323992, + "learning_rate": 4.99993414839001e-05, + "loss": 2.4698, + "mean_token_accuracy": 0.36206896901130675, + "step": 51940 + }, + { + "epoch": 0.05231955569925094, + "grad_norm": 13.02402898449528, + "learning_rate": 4.999933861394109e-05, + "loss": 2.7767, + "mean_token_accuracy": 0.37931033968925476, + "step": 51945 + }, + { + "epoch": 0.05232459175235511, + "grad_norm": 11.319896378824003, + "learning_rate": 4.9999335737741814e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.4034482717514038, + "step": 51950 + }, + { + "epoch": 0.052329627805459286, + "grad_norm": 12.545421124510765, + "learning_rate": 4.9999332855302254e-05, + "loss": 2.1279, + "mean_token_accuracy": 0.47586206793785096, + "step": 51955 + }, + { + "epoch": 0.05233466385856345, + "grad_norm": 14.406110329255547, + "learning_rate": 4.999932996662243e-05, + "loss": 3.1266, + "mean_token_accuracy": 0.3275862067937851, + "step": 51960 + }, + { + "epoch": 0.05233969991166763, + "grad_norm": 17.209945065764238, + "learning_rate": 4.999932707170233e-05, + "loss": 3.1088, + "mean_token_accuracy": 0.341379314661026, + "step": 51965 + }, + { + "epoch": 0.0523447359647718, + "grad_norm": 12.8563753325561, + "learning_rate": 4.999932417054196e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.4620689630508423, + "step": 51970 + }, + { + "epoch": 0.052349772017875974, + "grad_norm": 9.630827999758104, + "learning_rate": 4.999932126314131e-05, + "loss": 2.6604, + "mean_token_accuracy": 0.37931033968925476, + "step": 51975 + }, + { + "epoch": 0.05235480807098015, + "grad_norm": 18.231017588421746, + "learning_rate": 4.9999318349500394e-05, + "loss": 3.3551, + "mean_token_accuracy": 0.34137930870056155, + "step": 51980 + }, + { + "epoch": 0.05235984412408432, + "grad_norm": 11.046257661122137, + "learning_rate": 4.9999315429619206e-05, + "loss": 2.8109, + "mean_token_accuracy": 0.37586205899715425, + "step": 51985 + }, + { + "epoch": 0.052364880177188496, + "grad_norm": 11.303123043893011, + "learning_rate": 4.999931250349775e-05, + "loss": 2.6773, + "mean_token_accuracy": 0.38275861740112305, + "step": 51990 + }, + { + "epoch": 0.05236991623029266, + "grad_norm": 13.253435412221885, + "learning_rate": 4.999930957113602e-05, + "loss": 2.7006, + "mean_token_accuracy": 0.3965517282485962, + "step": 51995 + }, + { + "epoch": 0.052374952283396836, + "grad_norm": 11.63368960927172, + "learning_rate": 4.9999306632534034e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.41724138259887694, + "step": 52000 + }, + { + "epoch": 0.05237998833650101, + "grad_norm": 12.17144724304306, + "learning_rate": 4.999930368769178e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.48275861144065857, + "step": 52005 + }, + { + "epoch": 0.052385024389605184, + "grad_norm": 13.787507713672492, + "learning_rate": 4.999930073660925e-05, + "loss": 2.521, + "mean_token_accuracy": 0.42758620977401735, + "step": 52010 + }, + { + "epoch": 0.05239006044270936, + "grad_norm": 13.54498544070483, + "learning_rate": 4.9999297779286465e-05, + "loss": 2.3914, + "mean_token_accuracy": 0.3931034505367279, + "step": 52015 + }, + { + "epoch": 0.05239509649581353, + "grad_norm": 13.138131937217258, + "learning_rate": 4.999929481572341e-05, + "loss": 2.5963, + "mean_token_accuracy": 0.38487597107887267, + "step": 52020 + }, + { + "epoch": 0.052400132548917705, + "grad_norm": 12.979288464583135, + "learning_rate": 4.99992918459201e-05, + "loss": 2.7456, + "mean_token_accuracy": 0.3655172407627106, + "step": 52025 + }, + { + "epoch": 0.05240516860202187, + "grad_norm": 14.21718228221012, + "learning_rate": 4.999928886987651e-05, + "loss": 2.6792, + "mean_token_accuracy": 0.4206896543502808, + "step": 52030 + }, + { + "epoch": 0.052410204655126046, + "grad_norm": 12.966998853878286, + "learning_rate": 4.999928588759267e-05, + "loss": 2.8929, + "mean_token_accuracy": 0.3793103456497192, + "step": 52035 + }, + { + "epoch": 0.05241524070823022, + "grad_norm": 14.839121343102141, + "learning_rate": 4.999928289906858e-05, + "loss": 2.7162, + "mean_token_accuracy": 0.38275861740112305, + "step": 52040 + }, + { + "epoch": 0.05242027676133439, + "grad_norm": 11.869848406801163, + "learning_rate": 4.999927990430421e-05, + "loss": 2.6238, + "mean_token_accuracy": 0.3862069010734558, + "step": 52045 + }, + { + "epoch": 0.05242531281443857, + "grad_norm": 11.81959211718063, + "learning_rate": 4.99992769032996e-05, + "loss": 2.6896, + "mean_token_accuracy": 0.3827586233615875, + "step": 52050 + }, + { + "epoch": 0.05243034886754274, + "grad_norm": 10.656825799457476, + "learning_rate": 4.999927389605471e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.41222020983695984, + "step": 52055 + }, + { + "epoch": 0.052435384920646914, + "grad_norm": 10.987871883063795, + "learning_rate": 4.999927088256958e-05, + "loss": 2.8076, + "mean_token_accuracy": 0.34827586114406583, + "step": 52060 + }, + { + "epoch": 0.05244042097375108, + "grad_norm": 10.797993233098772, + "learning_rate": 4.9999267862844176e-05, + "loss": 2.6516, + "mean_token_accuracy": 0.3724137842655182, + "step": 52065 + }, + { + "epoch": 0.052445457026855255, + "grad_norm": 12.458782090006233, + "learning_rate": 4.9999264836878536e-05, + "loss": 2.9152, + "mean_token_accuracy": 0.3551724135875702, + "step": 52070 + }, + { + "epoch": 0.05245049307995943, + "grad_norm": 13.23608192693537, + "learning_rate": 4.9999261804672634e-05, + "loss": 2.4127, + "mean_token_accuracy": 0.4068965554237366, + "step": 52075 + }, + { + "epoch": 0.0524555291330636, + "grad_norm": 11.488794148054623, + "learning_rate": 4.999925876622647e-05, + "loss": 2.1559, + "mean_token_accuracy": 0.4156079888343811, + "step": 52080 + }, + { + "epoch": 0.052460565186167776, + "grad_norm": 12.090644229971591, + "learning_rate": 4.999925572154006e-05, + "loss": 2.5503, + "mean_token_accuracy": 0.358620685338974, + "step": 52085 + }, + { + "epoch": 0.05246560123927195, + "grad_norm": 10.342209494298395, + "learning_rate": 4.99992526706134e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.4689655065536499, + "step": 52090 + }, + { + "epoch": 0.052470637292376124, + "grad_norm": 14.598708871373963, + "learning_rate": 4.999924961344648e-05, + "loss": 2.7511, + "mean_token_accuracy": 0.3999999940395355, + "step": 52095 + }, + { + "epoch": 0.05247567334548029, + "grad_norm": 14.616971671790006, + "learning_rate": 4.9999246550039316e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.4551724076271057, + "step": 52100 + }, + { + "epoch": 0.052480709398584464, + "grad_norm": 12.528962196313383, + "learning_rate": 4.99992434803919e-05, + "loss": 2.862, + "mean_token_accuracy": 0.3827586233615875, + "step": 52105 + }, + { + "epoch": 0.05248574545168864, + "grad_norm": 12.023485513785992, + "learning_rate": 4.9999240404504226e-05, + "loss": 2.8143, + "mean_token_accuracy": 0.3965517282485962, + "step": 52110 + }, + { + "epoch": 0.05249078150479281, + "grad_norm": 11.761039655052471, + "learning_rate": 4.999923732237632e-05, + "loss": 2.3418, + "mean_token_accuracy": 0.4034482717514038, + "step": 52115 + }, + { + "epoch": 0.052495817557896986, + "grad_norm": 13.230800703921457, + "learning_rate": 4.999923423400816e-05, + "loss": 2.5962, + "mean_token_accuracy": 0.37931033968925476, + "step": 52120 + }, + { + "epoch": 0.05250085361100116, + "grad_norm": 14.240433525950861, + "learning_rate": 4.999923113939975e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.41034482717514037, + "step": 52125 + }, + { + "epoch": 0.05250588966410533, + "grad_norm": 12.457293250188362, + "learning_rate": 4.9999228038551096e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.458620685338974, + "step": 52130 + }, + { + "epoch": 0.0525109257172095, + "grad_norm": 12.27904909587513, + "learning_rate": 4.99992249314622e-05, + "loss": 2.7667, + "mean_token_accuracy": 0.37931033968925476, + "step": 52135 + }, + { + "epoch": 0.052515961770313674, + "grad_norm": 11.14996876610598, + "learning_rate": 4.9999221818133055e-05, + "loss": 2.6647, + "mean_token_accuracy": 0.4068965494632721, + "step": 52140 + }, + { + "epoch": 0.05252099782341785, + "grad_norm": 9.900206047952885, + "learning_rate": 4.9999218698563674e-05, + "loss": 2.7649, + "mean_token_accuracy": 0.3827586233615875, + "step": 52145 + }, + { + "epoch": 0.05252603387652202, + "grad_norm": 12.207255185333542, + "learning_rate": 4.9999215572754045e-05, + "loss": 2.646, + "mean_token_accuracy": 0.34137930870056155, + "step": 52150 + }, + { + "epoch": 0.052531069929626195, + "grad_norm": 10.577116839930989, + "learning_rate": 4.9999212440704175e-05, + "loss": 2.7892, + "mean_token_accuracy": 0.3965517282485962, + "step": 52155 + }, + { + "epoch": 0.05253610598273037, + "grad_norm": 20.21637411673557, + "learning_rate": 4.999920930241407e-05, + "loss": 2.7213, + "mean_token_accuracy": 0.3620689630508423, + "step": 52160 + }, + { + "epoch": 0.05254114203583454, + "grad_norm": 13.903731067244967, + "learning_rate": 4.999920615788371e-05, + "loss": 2.7897, + "mean_token_accuracy": 0.34482758939266206, + "step": 52165 + }, + { + "epoch": 0.05254617808893871, + "grad_norm": 18.24469380220512, + "learning_rate": 4.999920300711313e-05, + "loss": 2.8853, + "mean_token_accuracy": 0.4034482717514038, + "step": 52170 + }, + { + "epoch": 0.05255121414204288, + "grad_norm": 13.533902094835367, + "learning_rate": 4.99991998501023e-05, + "loss": 2.6875, + "mean_token_accuracy": 0.43103447556495667, + "step": 52175 + }, + { + "epoch": 0.05255625019514706, + "grad_norm": 14.24378219346739, + "learning_rate": 4.999919668685124e-05, + "loss": 2.6151, + "mean_token_accuracy": 0.4137930989265442, + "step": 52180 + }, + { + "epoch": 0.05256128624825123, + "grad_norm": 10.292270013990894, + "learning_rate": 4.9999193517359947e-05, + "loss": 2.5914, + "mean_token_accuracy": 0.40344828367233276, + "step": 52185 + }, + { + "epoch": 0.052566322301355405, + "grad_norm": 13.872307093643263, + "learning_rate": 4.9999190341628405e-05, + "loss": 2.8447, + "mean_token_accuracy": 0.4, + "step": 52190 + }, + { + "epoch": 0.05257135835445958, + "grad_norm": 13.849164232194378, + "learning_rate": 4.999918715965664e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.40689654350280763, + "step": 52195 + }, + { + "epoch": 0.05257639440756375, + "grad_norm": 11.87651107097205, + "learning_rate": 4.999918397144463e-05, + "loss": 2.1827, + "mean_token_accuracy": 0.4689655125141144, + "step": 52200 + }, + { + "epoch": 0.05258143046066792, + "grad_norm": 14.367810572842119, + "learning_rate": 4.99991807769924e-05, + "loss": 2.9821, + "mean_token_accuracy": 0.37241379618644715, + "step": 52205 + }, + { + "epoch": 0.05258646651377209, + "grad_norm": 11.381407012232026, + "learning_rate": 4.999917757629993e-05, + "loss": 2.5382, + "mean_token_accuracy": 0.401875376701355, + "step": 52210 + }, + { + "epoch": 0.052591502566876266, + "grad_norm": 16.406232845676236, + "learning_rate": 4.999917436936723e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.41379311084747317, + "step": 52215 + }, + { + "epoch": 0.05259653861998044, + "grad_norm": 15.499543231027019, + "learning_rate": 4.9999171156194307e-05, + "loss": 2.505, + "mean_token_accuracy": 0.40689654350280763, + "step": 52220 + }, + { + "epoch": 0.052601574673084614, + "grad_norm": 14.858624427498272, + "learning_rate": 4.999916793678115e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.4206896543502808, + "step": 52225 + }, + { + "epoch": 0.05260661072618879, + "grad_norm": 11.233828437139257, + "learning_rate": 4.999916471112777e-05, + "loss": 2.5937, + "mean_token_accuracy": 0.4034482777118683, + "step": 52230 + }, + { + "epoch": 0.05261164677929296, + "grad_norm": 11.208375804292299, + "learning_rate": 4.999916147923415e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.43103448748588563, + "step": 52235 + }, + { + "epoch": 0.05261668283239713, + "grad_norm": 11.254985535819657, + "learning_rate": 4.999915824110032e-05, + "loss": 2.583, + "mean_token_accuracy": 0.4931034445762634, + "step": 52240 + }, + { + "epoch": 0.0526217188855013, + "grad_norm": 12.674404404396473, + "learning_rate": 4.9999154996726256e-05, + "loss": 2.453, + "mean_token_accuracy": 0.4586206912994385, + "step": 52245 + }, + { + "epoch": 0.052626754938605476, + "grad_norm": 12.48225736606735, + "learning_rate": 4.999915174611197e-05, + "loss": 2.9606, + "mean_token_accuracy": 0.36551723182201384, + "step": 52250 + }, + { + "epoch": 0.05263179099170965, + "grad_norm": 15.392282192757758, + "learning_rate": 4.999914848925746e-05, + "loss": 3.0142, + "mean_token_accuracy": 0.37586206793785093, + "step": 52255 + }, + { + "epoch": 0.05263682704481382, + "grad_norm": 11.248839819767962, + "learning_rate": 4.999914522616272e-05, + "loss": 2.3361, + "mean_token_accuracy": 0.4399878978729248, + "step": 52260 + }, + { + "epoch": 0.052641863097918, + "grad_norm": 11.771158604649797, + "learning_rate": 4.999914195682776e-05, + "loss": 2.4946, + "mean_token_accuracy": 0.4034482777118683, + "step": 52265 + }, + { + "epoch": 0.05264689915102217, + "grad_norm": 11.195915050907423, + "learning_rate": 4.999913868125259e-05, + "loss": 2.8939, + "mean_token_accuracy": 0.34482758641242983, + "step": 52270 + }, + { + "epoch": 0.05265193520412634, + "grad_norm": 12.575116565538005, + "learning_rate": 4.9999135399437187e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.4502117335796356, + "step": 52275 + }, + { + "epoch": 0.05265697125723051, + "grad_norm": 20.08568004960425, + "learning_rate": 4.999913211138157e-05, + "loss": 3.2782, + "mean_token_accuracy": 0.3429522067308426, + "step": 52280 + }, + { + "epoch": 0.052662007310334685, + "grad_norm": 12.85778333940458, + "learning_rate": 4.9999128817085735e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.3896551787853241, + "step": 52285 + }, + { + "epoch": 0.05266704336343886, + "grad_norm": 11.118907725801146, + "learning_rate": 4.999912551654968e-05, + "loss": 2.7966, + "mean_token_accuracy": 0.3896551728248596, + "step": 52290 + }, + { + "epoch": 0.05267207941654303, + "grad_norm": 9.51625197672453, + "learning_rate": 4.999912220977342e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.3896551728248596, + "step": 52295 + }, + { + "epoch": 0.05267711546964721, + "grad_norm": 12.048989906041559, + "learning_rate": 4.9999118896756935e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.3965517282485962, + "step": 52300 + }, + { + "epoch": 0.05268215152275138, + "grad_norm": 14.695106589562384, + "learning_rate": 4.999911557750024e-05, + "loss": 2.4345, + "mean_token_accuracy": 0.4034482777118683, + "step": 52305 + }, + { + "epoch": 0.05268718757585555, + "grad_norm": 11.999980746598482, + "learning_rate": 4.999911225200331e-05, + "loss": 2.7438, + "mean_token_accuracy": 0.341379314661026, + "step": 52310 + }, + { + "epoch": 0.05269222362895972, + "grad_norm": 14.935420635563466, + "learning_rate": 4.99991089202662e-05, + "loss": 2.3546, + "mean_token_accuracy": 0.39310344457626345, + "step": 52315 + }, + { + "epoch": 0.052697259682063895, + "grad_norm": 13.72130692864623, + "learning_rate": 4.9999105582288857e-05, + "loss": 2.6643, + "mean_token_accuracy": 0.36896551251411436, + "step": 52320 + }, + { + "epoch": 0.05270229573516807, + "grad_norm": 15.733599661062378, + "learning_rate": 4.9999102238071315e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.3896551728248596, + "step": 52325 + }, + { + "epoch": 0.05270733178827224, + "grad_norm": 21.428603513808433, + "learning_rate": 4.999909888761355e-05, + "loss": 2.8353, + "mean_token_accuracy": 0.37356321811676024, + "step": 52330 + }, + { + "epoch": 0.052712367841376416, + "grad_norm": 12.056837297684389, + "learning_rate": 4.999909553091559e-05, + "loss": 2.3452, + "mean_token_accuracy": 0.4379310369491577, + "step": 52335 + }, + { + "epoch": 0.05271740389448059, + "grad_norm": 12.71091112723237, + "learning_rate": 4.999909216797742e-05, + "loss": 2.4256, + "mean_token_accuracy": 0.4137930989265442, + "step": 52340 + }, + { + "epoch": 0.05272243994758476, + "grad_norm": 15.914802195092928, + "learning_rate": 4.999908879879903e-05, + "loss": 3.1311, + "mean_token_accuracy": 0.3310344874858856, + "step": 52345 + }, + { + "epoch": 0.05272747600068893, + "grad_norm": 13.982267070269382, + "learning_rate": 4.999908542338044e-05, + "loss": 3.2666, + "mean_token_accuracy": 0.33793103992938994, + "step": 52350 + }, + { + "epoch": 0.052732512053793104, + "grad_norm": 12.10752493266362, + "learning_rate": 4.999908204172165e-05, + "loss": 2.8074, + "mean_token_accuracy": 0.32758620381355286, + "step": 52355 + }, + { + "epoch": 0.05273754810689728, + "grad_norm": 14.449191870762254, + "learning_rate": 4.999907865382265e-05, + "loss": 2.7442, + "mean_token_accuracy": 0.3551724046468735, + "step": 52360 + }, + { + "epoch": 0.05274258416000145, + "grad_norm": 11.512158697732623, + "learning_rate": 4.9999075259683455e-05, + "loss": 2.6798, + "mean_token_accuracy": 0.37586206793785093, + "step": 52365 + }, + { + "epoch": 0.052747620213105625, + "grad_norm": 10.156931007852432, + "learning_rate": 4.999907185930405e-05, + "loss": 2.0482, + "mean_token_accuracy": 0.4896551609039307, + "step": 52370 + }, + { + "epoch": 0.0527526562662098, + "grad_norm": 11.729109946496521, + "learning_rate": 4.9999068452684446e-05, + "loss": 2.418, + "mean_token_accuracy": 0.42758620977401735, + "step": 52375 + }, + { + "epoch": 0.052757692319313966, + "grad_norm": 17.216350448924175, + "learning_rate": 4.999906503982464e-05, + "loss": 3.0039, + "mean_token_accuracy": 0.38620689511299133, + "step": 52380 + }, + { + "epoch": 0.05276272837241814, + "grad_norm": 13.894546698102543, + "learning_rate": 4.9999061620724636e-05, + "loss": 2.3688, + "mean_token_accuracy": 0.3879007875919342, + "step": 52385 + }, + { + "epoch": 0.052767764425522314, + "grad_norm": 12.907526872862055, + "learning_rate": 4.999905819538443e-05, + "loss": 2.1838, + "mean_token_accuracy": 0.4655172348022461, + "step": 52390 + }, + { + "epoch": 0.05277280047862649, + "grad_norm": 12.008549283232462, + "learning_rate": 4.9999054763804026e-05, + "loss": 3.0417, + "mean_token_accuracy": 0.33448275923728943, + "step": 52395 + }, + { + "epoch": 0.05277783653173066, + "grad_norm": 10.949887694973663, + "learning_rate": 4.9999051325983435e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.4034482717514038, + "step": 52400 + }, + { + "epoch": 0.052782872584834835, + "grad_norm": 12.196457541091707, + "learning_rate": 4.9999047881922636e-05, + "loss": 2.6658, + "mean_token_accuracy": 0.4068965554237366, + "step": 52405 + }, + { + "epoch": 0.05278790863793901, + "grad_norm": 11.026794824583384, + "learning_rate": 4.999904443162165e-05, + "loss": 2.0451, + "mean_token_accuracy": 0.47586206793785096, + "step": 52410 + }, + { + "epoch": 0.052792944691043175, + "grad_norm": 13.76854502876863, + "learning_rate": 4.9999040975080466e-05, + "loss": 2.5316, + "mean_token_accuracy": 0.3620689630508423, + "step": 52415 + }, + { + "epoch": 0.05279798074414735, + "grad_norm": 18.612129031037366, + "learning_rate": 4.999903751229909e-05, + "loss": 3.131, + "mean_token_accuracy": 0.337931028008461, + "step": 52420 + }, + { + "epoch": 0.05280301679725152, + "grad_norm": 13.071645329114645, + "learning_rate": 4.999903404327752e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.4586206912994385, + "step": 52425 + }, + { + "epoch": 0.0528080528503557, + "grad_norm": 12.641279256627136, + "learning_rate": 4.9999030568015763e-05, + "loss": 2.0556, + "mean_token_accuracy": 0.4931034445762634, + "step": 52430 + }, + { + "epoch": 0.05281308890345987, + "grad_norm": 16.645951299439346, + "learning_rate": 4.999902708651381e-05, + "loss": 2.8782, + "mean_token_accuracy": 0.3827586114406586, + "step": 52435 + }, + { + "epoch": 0.052818124956564044, + "grad_norm": 14.949139066603236, + "learning_rate": 4.999902359877167e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.38771929740905764, + "step": 52440 + }, + { + "epoch": 0.05282316100966822, + "grad_norm": 12.577712155506225, + "learning_rate": 4.9999020104789355e-05, + "loss": 2.3735, + "mean_token_accuracy": 0.3931034505367279, + "step": 52445 + }, + { + "epoch": 0.052828197062772385, + "grad_norm": 10.61630628844499, + "learning_rate": 4.999901660456684e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.4190562665462494, + "step": 52450 + }, + { + "epoch": 0.05283323311587656, + "grad_norm": 15.0253532284693, + "learning_rate": 4.999901309810414e-05, + "loss": 3.1995, + "mean_token_accuracy": 0.3241379290819168, + "step": 52455 + }, + { + "epoch": 0.05283826916898073, + "grad_norm": 11.930183669267848, + "learning_rate": 4.9999009585401246e-05, + "loss": 2.7477, + "mean_token_accuracy": 0.34137930870056155, + "step": 52460 + }, + { + "epoch": 0.052843305222084906, + "grad_norm": 9.665504944820308, + "learning_rate": 4.999900606645818e-05, + "loss": 2.4115, + "mean_token_accuracy": 0.4103448331356049, + "step": 52465 + }, + { + "epoch": 0.05284834127518908, + "grad_norm": 11.245917775491176, + "learning_rate": 4.999900254127493e-05, + "loss": 2.6707, + "mean_token_accuracy": 0.3586206793785095, + "step": 52470 + }, + { + "epoch": 0.052853377328293254, + "grad_norm": 12.67937417176609, + "learning_rate": 4.99989990098515e-05, + "loss": 2.9381, + "mean_token_accuracy": 0.3655172407627106, + "step": 52475 + }, + { + "epoch": 0.05285841338139743, + "grad_norm": 11.88575920668747, + "learning_rate": 4.999899547218788e-05, + "loss": 2.6887, + "mean_token_accuracy": 0.3517241418361664, + "step": 52480 + }, + { + "epoch": 0.052863449434501594, + "grad_norm": 14.430140644439849, + "learning_rate": 4.9998991928284084e-05, + "loss": 2.6276, + "mean_token_accuracy": 0.38275861740112305, + "step": 52485 + }, + { + "epoch": 0.05286848548760577, + "grad_norm": 13.935806624669114, + "learning_rate": 4.999898837814011e-05, + "loss": 3.0041, + "mean_token_accuracy": 0.3344827562570572, + "step": 52490 + }, + { + "epoch": 0.05287352154070994, + "grad_norm": 10.722903919471433, + "learning_rate": 4.9998984821755956e-05, + "loss": 2.811, + "mean_token_accuracy": 0.3878402948379517, + "step": 52495 + }, + { + "epoch": 0.052878557593814116, + "grad_norm": 11.963375711383776, + "learning_rate": 4.999898125913162e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.42068966031074523, + "step": 52500 + }, + { + "epoch": 0.05288359364691829, + "grad_norm": 11.471988950483025, + "learning_rate": 4.9998977690267116e-05, + "loss": 2.6276, + "mean_token_accuracy": 0.37241379022598264, + "step": 52505 + }, + { + "epoch": 0.05288862970002246, + "grad_norm": 12.533240115342801, + "learning_rate": 4.999897411516243e-05, + "loss": 2.303, + "mean_token_accuracy": 0.4530550420284271, + "step": 52510 + }, + { + "epoch": 0.05289366575312664, + "grad_norm": 10.379311569672394, + "learning_rate": 4.9998970533817576e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.3896551728248596, + "step": 52515 + }, + { + "epoch": 0.052898701806230804, + "grad_norm": 13.575178704284905, + "learning_rate": 4.999896694623255e-05, + "loss": 2.8839, + "mean_token_accuracy": 0.3275861978530884, + "step": 52520 + }, + { + "epoch": 0.05290373785933498, + "grad_norm": 10.542823767956984, + "learning_rate": 4.9998963352407345e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.4068965584039688, + "step": 52525 + }, + { + "epoch": 0.05290877391243915, + "grad_norm": 11.468709330186027, + "learning_rate": 4.999895975234197e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.3827586263418198, + "step": 52530 + }, + { + "epoch": 0.052913809965543325, + "grad_norm": 10.71005920483588, + "learning_rate": 4.999895614603642e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.4413793206214905, + "step": 52535 + }, + { + "epoch": 0.0529188460186475, + "grad_norm": 14.776546143806273, + "learning_rate": 4.999895253349071e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.42413793206214906, + "step": 52540 + }, + { + "epoch": 0.05292388207175167, + "grad_norm": 14.562609889895791, + "learning_rate": 4.9998948914704826e-05, + "loss": 3.0693, + "mean_token_accuracy": 0.3482758641242981, + "step": 52545 + }, + { + "epoch": 0.052928918124855846, + "grad_norm": 12.498456911660455, + "learning_rate": 4.999894528967877e-05, + "loss": 2.8945, + "mean_token_accuracy": 0.34137930870056155, + "step": 52550 + }, + { + "epoch": 0.05293395417796001, + "grad_norm": 11.635851674457173, + "learning_rate": 4.999894165841256e-05, + "loss": 2.886, + "mean_token_accuracy": 0.4068965494632721, + "step": 52555 + }, + { + "epoch": 0.05293899023106419, + "grad_norm": 13.204996711937499, + "learning_rate": 4.999893802090618e-05, + "loss": 2.4327, + "mean_token_accuracy": 0.40689654350280763, + "step": 52560 + }, + { + "epoch": 0.05294402628416836, + "grad_norm": 10.321895599351686, + "learning_rate": 4.999893437715963e-05, + "loss": 2.4239, + "mean_token_accuracy": 0.3827586233615875, + "step": 52565 + }, + { + "epoch": 0.052949062337272534, + "grad_norm": 10.560755174824738, + "learning_rate": 4.999893072717292e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.3896551728248596, + "step": 52570 + }, + { + "epoch": 0.05295409839037671, + "grad_norm": 17.248603148432775, + "learning_rate": 4.999892707094605e-05, + "loss": 2.1451, + "mean_token_accuracy": 0.43242589235305784, + "step": 52575 + }, + { + "epoch": 0.05295913444348088, + "grad_norm": 12.085934325050895, + "learning_rate": 4.999892340847901e-05, + "loss": 2.2653, + "mean_token_accuracy": 0.4517241418361664, + "step": 52580 + }, + { + "epoch": 0.052964170496585056, + "grad_norm": 12.128206931118553, + "learning_rate": 4.999891973977182e-05, + "loss": 2.3303, + "mean_token_accuracy": 0.4310344815254211, + "step": 52585 + }, + { + "epoch": 0.05296920654968922, + "grad_norm": 12.038340814259648, + "learning_rate": 4.9998916064824465e-05, + "loss": 2.4507, + "mean_token_accuracy": 0.4275861978530884, + "step": 52590 + }, + { + "epoch": 0.052974242602793396, + "grad_norm": 11.18072493193772, + "learning_rate": 4.9998912383636956e-05, + "loss": 2.5699, + "mean_token_accuracy": 0.4393224477767944, + "step": 52595 + }, + { + "epoch": 0.05297927865589757, + "grad_norm": 11.264856928966092, + "learning_rate": 4.999890869620929e-05, + "loss": 2.758, + "mean_token_accuracy": 0.3655172407627106, + "step": 52600 + }, + { + "epoch": 0.052984314709001744, + "grad_norm": 12.662740725118304, + "learning_rate": 4.9998905002541465e-05, + "loss": 2.3017, + "mean_token_accuracy": 0.42413793206214906, + "step": 52605 + }, + { + "epoch": 0.05298935076210592, + "grad_norm": 11.568444933710694, + "learning_rate": 4.9998901302633484e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.39655171930789945, + "step": 52610 + }, + { + "epoch": 0.05299438681521009, + "grad_norm": 13.604484609869235, + "learning_rate": 4.999889759648535e-05, + "loss": 2.275, + "mean_token_accuracy": 0.4413793087005615, + "step": 52615 + }, + { + "epoch": 0.052999422868314265, + "grad_norm": 16.32571150369406, + "learning_rate": 4.999889388409706e-05, + "loss": 3.0466, + "mean_token_accuracy": 0.34827586114406583, + "step": 52620 + }, + { + "epoch": 0.05300445892141843, + "grad_norm": 14.32251835227207, + "learning_rate": 4.999889016546862e-05, + "loss": 2.6345, + "mean_token_accuracy": 0.401875376701355, + "step": 52625 + }, + { + "epoch": 0.053009494974522606, + "grad_norm": 11.619585681906743, + "learning_rate": 4.999888644060003e-05, + "loss": 2.5406, + "mean_token_accuracy": 0.42413793206214906, + "step": 52630 + }, + { + "epoch": 0.05301453102762678, + "grad_norm": 10.999043949392975, + "learning_rate": 4.999888270949129e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.4206896543502808, + "step": 52635 + }, + { + "epoch": 0.05301956708073095, + "grad_norm": 10.797483300813841, + "learning_rate": 4.99988789721424e-05, + "loss": 2.3416, + "mean_token_accuracy": 0.44827585220336913, + "step": 52640 + }, + { + "epoch": 0.05302460313383513, + "grad_norm": 10.190071336878232, + "learning_rate": 4.999887522855336e-05, + "loss": 2.3408, + "mean_token_accuracy": 0.4379310369491577, + "step": 52645 + }, + { + "epoch": 0.0530296391869393, + "grad_norm": 13.195678833858894, + "learning_rate": 4.999887147872417e-05, + "loss": 2.5173, + "mean_token_accuracy": 0.3827586114406586, + "step": 52650 + }, + { + "epoch": 0.053034675240043475, + "grad_norm": 10.293024908823016, + "learning_rate": 4.999886772265484e-05, + "loss": 2.3027, + "mean_token_accuracy": 0.47241379618644713, + "step": 52655 + }, + { + "epoch": 0.05303971129314764, + "grad_norm": 11.182903712942725, + "learning_rate": 4.999886396034537e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.3827586233615875, + "step": 52660 + }, + { + "epoch": 0.053044747346251815, + "grad_norm": 11.992129139191514, + "learning_rate": 4.999886019179575e-05, + "loss": 2.9038, + "mean_token_accuracy": 0.3758620709180832, + "step": 52665 + }, + { + "epoch": 0.05304978339935599, + "grad_norm": 19.11379512584089, + "learning_rate": 4.999885641700598e-05, + "loss": 2.8824, + "mean_token_accuracy": 0.37931033968925476, + "step": 52670 + }, + { + "epoch": 0.05305481945246016, + "grad_norm": 10.984800346228653, + "learning_rate": 4.999885263597608e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.4379310369491577, + "step": 52675 + }, + { + "epoch": 0.053059855505564336, + "grad_norm": 14.184798519481314, + "learning_rate": 4.999884884870604e-05, + "loss": 3.1606, + "mean_token_accuracy": 0.3241379290819168, + "step": 52680 + }, + { + "epoch": 0.05306489155866851, + "grad_norm": 10.616186951384595, + "learning_rate": 4.999884505519585e-05, + "loss": 2.6455, + "mean_token_accuracy": 0.37241379022598264, + "step": 52685 + }, + { + "epoch": 0.053069927611772684, + "grad_norm": 13.663478059557402, + "learning_rate": 4.9998841255445525e-05, + "loss": 2.6163, + "mean_token_accuracy": 0.4103448212146759, + "step": 52690 + }, + { + "epoch": 0.05307496366487685, + "grad_norm": 13.970196535234571, + "learning_rate": 4.999883744945506e-05, + "loss": 2.3919, + "mean_token_accuracy": 0.44506956934928893, + "step": 52695 + }, + { + "epoch": 0.053079999717981025, + "grad_norm": 12.480725699350362, + "learning_rate": 4.999883363722447e-05, + "loss": 2.775, + "mean_token_accuracy": 0.40689654350280763, + "step": 52700 + }, + { + "epoch": 0.0530850357710852, + "grad_norm": 11.175515791859308, + "learning_rate": 4.9998829818753735e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.4517241358757019, + "step": 52705 + }, + { + "epoch": 0.05309007182418937, + "grad_norm": 11.595580399340005, + "learning_rate": 4.9998825994042876e-05, + "loss": 2.5742, + "mean_token_accuracy": 0.3931034505367279, + "step": 52710 + }, + { + "epoch": 0.053095107877293546, + "grad_norm": 12.343190414901912, + "learning_rate": 4.999882216309187e-05, + "loss": 2.7913, + "mean_token_accuracy": 0.3620689630508423, + "step": 52715 + }, + { + "epoch": 0.05310014393039772, + "grad_norm": 12.346017048299837, + "learning_rate": 4.999881832590074e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.38620689511299133, + "step": 52720 + }, + { + "epoch": 0.05310517998350189, + "grad_norm": 11.662371357810025, + "learning_rate": 4.999881448246947e-05, + "loss": 2.2389, + "mean_token_accuracy": 0.41034482717514037, + "step": 52725 + }, + { + "epoch": 0.05311021603660606, + "grad_norm": 19.659439428407314, + "learning_rate": 4.9998810632798075e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.3551724135875702, + "step": 52730 + }, + { + "epoch": 0.053115252089710234, + "grad_norm": 13.09192498933278, + "learning_rate": 4.999880677688656e-05, + "loss": 2.0579, + "mean_token_accuracy": 0.47931034564971925, + "step": 52735 + }, + { + "epoch": 0.05312028814281441, + "grad_norm": 11.797746097397859, + "learning_rate": 4.999880291473491e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.3862069010734558, + "step": 52740 + }, + { + "epoch": 0.05312532419591858, + "grad_norm": 10.6882470671556, + "learning_rate": 4.999879904634313e-05, + "loss": 2.2672, + "mean_token_accuracy": 0.4034482717514038, + "step": 52745 + }, + { + "epoch": 0.053130360249022755, + "grad_norm": 13.282527204142411, + "learning_rate": 4.999879517171122e-05, + "loss": 2.2753, + "mean_token_accuracy": 0.45517241954803467, + "step": 52750 + }, + { + "epoch": 0.05313539630212693, + "grad_norm": 13.267004112529877, + "learning_rate": 4.999879129083919e-05, + "loss": 2.3213, + "mean_token_accuracy": 0.40871143341064453, + "step": 52755 + }, + { + "epoch": 0.0531404323552311, + "grad_norm": 13.23764106515179, + "learning_rate": 4.999878740372704e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.4137930989265442, + "step": 52760 + }, + { + "epoch": 0.05314546840833527, + "grad_norm": 11.87947414365668, + "learning_rate": 4.999878351037476e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.36896551847457887, + "step": 52765 + }, + { + "epoch": 0.05315050446143944, + "grad_norm": 11.079826528737392, + "learning_rate": 4.999877961078237e-05, + "loss": 2.4969, + "mean_token_accuracy": 0.41379310488700866, + "step": 52770 + }, + { + "epoch": 0.05315554051454362, + "grad_norm": 10.443981284297202, + "learning_rate": 4.999877570494985e-05, + "loss": 2.3254, + "mean_token_accuracy": 0.43278887271881106, + "step": 52775 + }, + { + "epoch": 0.05316057656764779, + "grad_norm": 10.519975025616317, + "learning_rate": 4.999877179287722e-05, + "loss": 2.7338, + "mean_token_accuracy": 0.39704433679580686, + "step": 52780 + }, + { + "epoch": 0.053165612620751965, + "grad_norm": 14.733345302411507, + "learning_rate": 4.999876787456446e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.4068965554237366, + "step": 52785 + }, + { + "epoch": 0.05317064867385614, + "grad_norm": 16.245384216423915, + "learning_rate": 4.999876395001159e-05, + "loss": 2.4911, + "mean_token_accuracy": 0.42413793206214906, + "step": 52790 + }, + { + "epoch": 0.05317568472696031, + "grad_norm": 13.503076131143054, + "learning_rate": 4.99987600192186e-05, + "loss": 2.9788, + "mean_token_accuracy": 0.43448275327682495, + "step": 52795 + }, + { + "epoch": 0.05318072078006448, + "grad_norm": 12.252898555965514, + "learning_rate": 4.99987560821855e-05, + "loss": 2.5199, + "mean_token_accuracy": 0.38275861740112305, + "step": 52800 + }, + { + "epoch": 0.05318575683316865, + "grad_norm": 15.842404251043618, + "learning_rate": 4.999875213891228e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.4068965554237366, + "step": 52805 + }, + { + "epoch": 0.05319079288627283, + "grad_norm": 12.198436866938454, + "learning_rate": 4.999874818939895e-05, + "loss": 2.723, + "mean_token_accuracy": 0.341379314661026, + "step": 52810 + }, + { + "epoch": 0.053195828939377, + "grad_norm": 12.705963902335885, + "learning_rate": 4.9998744233645514e-05, + "loss": 2.801, + "mean_token_accuracy": 0.3999999940395355, + "step": 52815 + }, + { + "epoch": 0.053200864992481174, + "grad_norm": 12.579729153343981, + "learning_rate": 4.999874027165196e-05, + "loss": 2.937, + "mean_token_accuracy": 0.38275861740112305, + "step": 52820 + }, + { + "epoch": 0.05320590104558535, + "grad_norm": 14.678412721580386, + "learning_rate": 4.99987363034183e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4330308556556702, + "step": 52825 + }, + { + "epoch": 0.05321093709868952, + "grad_norm": 12.258802189162726, + "learning_rate": 4.999873232894453e-05, + "loss": 2.424, + "mean_token_accuracy": 0.4068965554237366, + "step": 52830 + }, + { + "epoch": 0.05321597315179369, + "grad_norm": 12.142042640151569, + "learning_rate": 4.999872834823064e-05, + "loss": 2.7172, + "mean_token_accuracy": 0.3758620619773865, + "step": 52835 + }, + { + "epoch": 0.05322100920489786, + "grad_norm": 13.442158739379614, + "learning_rate": 4.999872436127667e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.3827586114406586, + "step": 52840 + }, + { + "epoch": 0.053226045258002036, + "grad_norm": 12.054948859060215, + "learning_rate": 4.999872036808257e-05, + "loss": 2.7899, + "mean_token_accuracy": 0.36206896901130675, + "step": 52845 + }, + { + "epoch": 0.05323108131110621, + "grad_norm": 12.795347486395773, + "learning_rate": 4.999871636864838e-05, + "loss": 2.7907, + "mean_token_accuracy": 0.38965516686439516, + "step": 52850 + }, + { + "epoch": 0.053236117364210384, + "grad_norm": 12.006954291849244, + "learning_rate": 4.999871236297409e-05, + "loss": 3.0269, + "mean_token_accuracy": 0.36896551251411436, + "step": 52855 + }, + { + "epoch": 0.05324115341731456, + "grad_norm": 12.154761433250204, + "learning_rate": 4.9998708351059684e-05, + "loss": 2.9575, + "mean_token_accuracy": 0.358620685338974, + "step": 52860 + }, + { + "epoch": 0.05324618947041873, + "grad_norm": 12.09703492091388, + "learning_rate": 4.999870433290519e-05, + "loss": 2.2414, + "mean_token_accuracy": 0.4637023627758026, + "step": 52865 + }, + { + "epoch": 0.0532512255235229, + "grad_norm": 11.377316730118268, + "learning_rate": 4.9998700308510595e-05, + "loss": 2.6494, + "mean_token_accuracy": 0.39655172228813174, + "step": 52870 + }, + { + "epoch": 0.05325626157662707, + "grad_norm": 10.769509740795984, + "learning_rate": 4.9998696277875896e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.39310344457626345, + "step": 52875 + }, + { + "epoch": 0.053261297629731245, + "grad_norm": 12.323636501363245, + "learning_rate": 4.99986922410011e-05, + "loss": 2.408, + "mean_token_accuracy": 0.43103448748588563, + "step": 52880 + }, + { + "epoch": 0.05326633368283542, + "grad_norm": 14.89145612944545, + "learning_rate": 4.999868819788621e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4186932921409607, + "step": 52885 + }, + { + "epoch": 0.05327136973593959, + "grad_norm": 12.980147072098786, + "learning_rate": 4.999868414853123e-05, + "loss": 2.6049, + "mean_token_accuracy": 0.4034482717514038, + "step": 52890 + }, + { + "epoch": 0.05327640578904377, + "grad_norm": 13.143195734858024, + "learning_rate": 4.999868009293615e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.4431941986083984, + "step": 52895 + }, + { + "epoch": 0.05328144184214794, + "grad_norm": 11.853857309029957, + "learning_rate": 4.999867603110098e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.3999999940395355, + "step": 52900 + }, + { + "epoch": 0.05328647789525211, + "grad_norm": 14.364572928581355, + "learning_rate": 4.9998671963025714e-05, + "loss": 2.7712, + "mean_token_accuracy": 0.3517241358757019, + "step": 52905 + }, + { + "epoch": 0.05329151394835628, + "grad_norm": 10.665113525426142, + "learning_rate": 4.999866788871035e-05, + "loss": 2.1914, + "mean_token_accuracy": 0.42413793206214906, + "step": 52910 + }, + { + "epoch": 0.053296550001460455, + "grad_norm": 12.847233348713168, + "learning_rate": 4.999866380815491e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.4000000059604645, + "step": 52915 + }, + { + "epoch": 0.05330158605456463, + "grad_norm": 11.72936127154475, + "learning_rate": 4.999865972135938e-05, + "loss": 2.278, + "mean_token_accuracy": 0.4551724135875702, + "step": 52920 + }, + { + "epoch": 0.0533066221076688, + "grad_norm": 11.407950980351982, + "learning_rate": 4.999865562832376e-05, + "loss": 2.569, + "mean_token_accuracy": 0.39655172526836396, + "step": 52925 + }, + { + "epoch": 0.053311658160772976, + "grad_norm": 11.468870440087771, + "learning_rate": 4.9998651529048063e-05, + "loss": 2.5952, + "mean_token_accuracy": 0.39310344457626345, + "step": 52930 + }, + { + "epoch": 0.05331669421387715, + "grad_norm": 14.596768094651525, + "learning_rate": 4.999864742353227e-05, + "loss": 2.395, + "mean_token_accuracy": 0.44640047550201417, + "step": 52935 + }, + { + "epoch": 0.05332173026698132, + "grad_norm": 13.683764367482794, + "learning_rate": 4.999864331177639e-05, + "loss": 2.7397, + "mean_token_accuracy": 0.38965517580509185, + "step": 52940 + }, + { + "epoch": 0.05332676632008549, + "grad_norm": 28.264960451564423, + "learning_rate": 4.9998639193780436e-05, + "loss": 2.3059, + "mean_token_accuracy": 0.4551724135875702, + "step": 52945 + }, + { + "epoch": 0.053331802373189664, + "grad_norm": 11.721084785281288, + "learning_rate": 4.99986350695444e-05, + "loss": 2.5699, + "mean_token_accuracy": 0.41724138557910917, + "step": 52950 + }, + { + "epoch": 0.05333683842629384, + "grad_norm": 20.13673538691939, + "learning_rate": 4.999863093906828e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.4068965494632721, + "step": 52955 + }, + { + "epoch": 0.05334187447939801, + "grad_norm": 13.0804956244583, + "learning_rate": 4.9998626802352085e-05, + "loss": 2.5879, + "mean_token_accuracy": 0.4034482777118683, + "step": 52960 + }, + { + "epoch": 0.053346910532502186, + "grad_norm": 12.420669520664438, + "learning_rate": 4.99986226593958e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.37241379022598264, + "step": 52965 + }, + { + "epoch": 0.05335194658560636, + "grad_norm": 12.68211208107712, + "learning_rate": 4.9998618510199456e-05, + "loss": 2.6559, + "mean_token_accuracy": 0.3620689630508423, + "step": 52970 + }, + { + "epoch": 0.053356982638710526, + "grad_norm": 11.851400456756226, + "learning_rate": 4.9998614354763024e-05, + "loss": 2.281, + "mean_token_accuracy": 0.5103448331356049, + "step": 52975 + }, + { + "epoch": 0.0533620186918147, + "grad_norm": 8.346634887058913, + "learning_rate": 4.999861019308652e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.4366606056690216, + "step": 52980 + }, + { + "epoch": 0.053367054744918874, + "grad_norm": 11.624734140259012, + "learning_rate": 4.9998606025169946e-05, + "loss": 2.914, + "mean_token_accuracy": 0.35366001129150393, + "step": 52985 + }, + { + "epoch": 0.05337209079802305, + "grad_norm": 11.739164407751714, + "learning_rate": 4.999860185101329e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.38620689511299133, + "step": 52990 + }, + { + "epoch": 0.05337712685112722, + "grad_norm": 12.384144804613255, + "learning_rate": 4.9998597670616575e-05, + "loss": 2.4268, + "mean_token_accuracy": 0.42758620381355283, + "step": 52995 + }, + { + "epoch": 0.053382162904231395, + "grad_norm": 13.889224771537172, + "learning_rate": 4.9998593483979786e-05, + "loss": 2.7332, + "mean_token_accuracy": 0.36896551251411436, + "step": 53000 + }, + { + "epoch": 0.05338719895733557, + "grad_norm": 13.496604042900646, + "learning_rate": 4.9998589291102926e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.42068964838981626, + "step": 53005 + }, + { + "epoch": 0.053392235010439736, + "grad_norm": 35.4073262906832, + "learning_rate": 4.9998585091986e-05, + "loss": 2.8587, + "mean_token_accuracy": 0.40496068000793456, + "step": 53010 + }, + { + "epoch": 0.05339727106354391, + "grad_norm": 14.719910046514489, + "learning_rate": 4.999858088662901e-05, + "loss": 3.1237, + "mean_token_accuracy": 0.33103448152542114, + "step": 53015 + }, + { + "epoch": 0.05340230711664808, + "grad_norm": 12.816248183109348, + "learning_rate": 4.999857667503195e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.4709618866443634, + "step": 53020 + }, + { + "epoch": 0.05340734316975226, + "grad_norm": 13.036208878562023, + "learning_rate": 4.9998572457194825e-05, + "loss": 2.7488, + "mean_token_accuracy": 0.4103448212146759, + "step": 53025 + }, + { + "epoch": 0.05341237922285643, + "grad_norm": 12.113656357668281, + "learning_rate": 4.9998568233117635e-05, + "loss": 2.845, + "mean_token_accuracy": 0.4034482717514038, + "step": 53030 + }, + { + "epoch": 0.053417415275960604, + "grad_norm": 10.352034456624176, + "learning_rate": 4.9998564002800393e-05, + "loss": 2.6079, + "mean_token_accuracy": 0.3517241358757019, + "step": 53035 + }, + { + "epoch": 0.05342245132906478, + "grad_norm": 13.115334892020975, + "learning_rate": 4.999855976624309e-05, + "loss": 2.2802, + "mean_token_accuracy": 0.42413792610168455, + "step": 53040 + }, + { + "epoch": 0.053427487382168945, + "grad_norm": 11.193886649903126, + "learning_rate": 4.999855552344572e-05, + "loss": 2.4384, + "mean_token_accuracy": 0.4344827651977539, + "step": 53045 + }, + { + "epoch": 0.05343252343527312, + "grad_norm": 13.379268528543985, + "learning_rate": 4.999855127440829e-05, + "loss": 2.885, + "mean_token_accuracy": 0.37241379022598264, + "step": 53050 + }, + { + "epoch": 0.05343755948837729, + "grad_norm": 13.16404786891074, + "learning_rate": 4.999854701913081e-05, + "loss": 2.7824, + "mean_token_accuracy": 0.4000000059604645, + "step": 53055 + }, + { + "epoch": 0.053442595541481466, + "grad_norm": 13.50443575715912, + "learning_rate": 4.999854275761327e-05, + "loss": 2.8787, + "mean_token_accuracy": 0.32413792610168457, + "step": 53060 + }, + { + "epoch": 0.05344763159458564, + "grad_norm": 10.553667173306518, + "learning_rate": 4.999853848985567e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4261947929859161, + "step": 53065 + }, + { + "epoch": 0.053452667647689814, + "grad_norm": 9.85019786647655, + "learning_rate": 4.999853421585803e-05, + "loss": 2.8954, + "mean_token_accuracy": 0.3310344755649567, + "step": 53070 + }, + { + "epoch": 0.05345770370079399, + "grad_norm": 13.154996558988962, + "learning_rate": 4.9998529935620334e-05, + "loss": 2.7391, + "mean_token_accuracy": 0.3482758581638336, + "step": 53075 + }, + { + "epoch": 0.053462739753898154, + "grad_norm": 10.067660974458168, + "learning_rate": 4.999852564914258e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.37241379618644715, + "step": 53080 + }, + { + "epoch": 0.05346777580700233, + "grad_norm": 12.099311175556132, + "learning_rate": 4.999852135642478e-05, + "loss": 2.6574, + "mean_token_accuracy": 0.4068965494632721, + "step": 53085 + }, + { + "epoch": 0.0534728118601065, + "grad_norm": 13.572079937624716, + "learning_rate": 4.999851705746693e-05, + "loss": 3.1079, + "mean_token_accuracy": 0.3999999940395355, + "step": 53090 + }, + { + "epoch": 0.053477847913210676, + "grad_norm": 10.590901101458197, + "learning_rate": 4.9998512752269034e-05, + "loss": 2.3247, + "mean_token_accuracy": 0.4551724135875702, + "step": 53095 + }, + { + "epoch": 0.05348288396631485, + "grad_norm": 17.19284234830098, + "learning_rate": 4.999850844083109e-05, + "loss": 2.7113, + "mean_token_accuracy": 0.3862068891525269, + "step": 53100 + }, + { + "epoch": 0.05348792001941902, + "grad_norm": 18.657905664139136, + "learning_rate": 4.99985041231531e-05, + "loss": 2.2172, + "mean_token_accuracy": 0.4517241358757019, + "step": 53105 + }, + { + "epoch": 0.0534929560725232, + "grad_norm": 13.933958320769486, + "learning_rate": 4.9998499799235064e-05, + "loss": 2.4311, + "mean_token_accuracy": 0.4000000059604645, + "step": 53110 + }, + { + "epoch": 0.053497992125627364, + "grad_norm": 13.80928778502695, + "learning_rate": 4.9998495469076985e-05, + "loss": 2.6719, + "mean_token_accuracy": 0.4162734389305115, + "step": 53115 + }, + { + "epoch": 0.05350302817873154, + "grad_norm": 12.713315118406596, + "learning_rate": 4.9998491132678876e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.4150635123252869, + "step": 53120 + }, + { + "epoch": 0.05350806423183571, + "grad_norm": 14.563944890132191, + "learning_rate": 4.9998486790040716e-05, + "loss": 1.9744, + "mean_token_accuracy": 0.43448275327682495, + "step": 53125 + }, + { + "epoch": 0.053513100284939885, + "grad_norm": 13.578007760895447, + "learning_rate": 4.999848244116252e-05, + "loss": 2.7582, + "mean_token_accuracy": 0.36551723480224607, + "step": 53130 + }, + { + "epoch": 0.05351813633804406, + "grad_norm": 14.180388287283959, + "learning_rate": 4.999847808604428e-05, + "loss": 2.8588, + "mean_token_accuracy": 0.42758620381355283, + "step": 53135 + }, + { + "epoch": 0.05352317239114823, + "grad_norm": 16.675086573544075, + "learning_rate": 4.999847372468601e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.4137930989265442, + "step": 53140 + }, + { + "epoch": 0.0535282084442524, + "grad_norm": 12.87754162991758, + "learning_rate": 4.99984693570877e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.32758620381355286, + "step": 53145 + }, + { + "epoch": 0.05353324449735657, + "grad_norm": 12.1817153582392, + "learning_rate": 4.999846498324936e-05, + "loss": 2.7332, + "mean_token_accuracy": 0.4310344815254211, + "step": 53150 + }, + { + "epoch": 0.05353828055046075, + "grad_norm": 12.117218534786808, + "learning_rate": 4.999846060317098e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.4482758641242981, + "step": 53155 + }, + { + "epoch": 0.05354331660356492, + "grad_norm": 13.28273842369201, + "learning_rate": 4.9998456216852576e-05, + "loss": 2.6973, + "mean_token_accuracy": 0.3793103456497192, + "step": 53160 + }, + { + "epoch": 0.053548352656669095, + "grad_norm": 11.52469420988639, + "learning_rate": 4.999845182429414e-05, + "loss": 2.5191, + "mean_token_accuracy": 0.4379310429096222, + "step": 53165 + }, + { + "epoch": 0.05355338870977327, + "grad_norm": 13.910088770943712, + "learning_rate": 4.999844742549567e-05, + "loss": 2.8339, + "mean_token_accuracy": 0.3620689570903778, + "step": 53170 + }, + { + "epoch": 0.05355842476287744, + "grad_norm": 11.01092415936324, + "learning_rate": 4.999844302045718e-05, + "loss": 2.6481, + "mean_token_accuracy": 0.4190562665462494, + "step": 53175 + }, + { + "epoch": 0.05356346081598161, + "grad_norm": 17.993466375526516, + "learning_rate": 4.999843860917865e-05, + "loss": 2.5487, + "mean_token_accuracy": 0.4206896543502808, + "step": 53180 + }, + { + "epoch": 0.05356849686908578, + "grad_norm": 9.38016912278869, + "learning_rate": 4.99984341916601e-05, + "loss": 2.368, + "mean_token_accuracy": 0.4344827592372894, + "step": 53185 + }, + { + "epoch": 0.053573532922189956, + "grad_norm": 21.656518136485545, + "learning_rate": 4.999842976790153e-05, + "loss": 2.7492, + "mean_token_accuracy": 0.3655172407627106, + "step": 53190 + }, + { + "epoch": 0.05357856897529413, + "grad_norm": 15.43732532456617, + "learning_rate": 4.9998425337902924e-05, + "loss": 2.351, + "mean_token_accuracy": 0.4172413766384125, + "step": 53195 + }, + { + "epoch": 0.053583605028398304, + "grad_norm": 14.825546692394683, + "learning_rate": 4.999842090166431e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.39310344457626345, + "step": 53200 + }, + { + "epoch": 0.05358864108150248, + "grad_norm": 11.970549585530728, + "learning_rate": 4.999841645918567e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.43793103098869324, + "step": 53205 + }, + { + "epoch": 0.05359367713460665, + "grad_norm": 11.785975047199853, + "learning_rate": 4.999841201046701e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.4517241358757019, + "step": 53210 + }, + { + "epoch": 0.05359871318771082, + "grad_norm": 12.363590388557657, + "learning_rate": 4.999840755550833e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.42020567655563357, + "step": 53215 + }, + { + "epoch": 0.05360374924081499, + "grad_norm": 13.899363203093284, + "learning_rate": 4.9998403094309645e-05, + "loss": 2.8381, + "mean_token_accuracy": 0.43793103098869324, + "step": 53220 + }, + { + "epoch": 0.053608785293919166, + "grad_norm": 9.840183718434782, + "learning_rate": 4.9998398626870924e-05, + "loss": 2.8497, + "mean_token_accuracy": 0.34482758939266206, + "step": 53225 + }, + { + "epoch": 0.05361382134702334, + "grad_norm": 9.89270097489401, + "learning_rate": 4.99983941531922e-05, + "loss": 2.5249, + "mean_token_accuracy": 0.4, + "step": 53230 + }, + { + "epoch": 0.05361885740012751, + "grad_norm": 13.084982610131592, + "learning_rate": 4.999838967327346e-05, + "loss": 2.928, + "mean_token_accuracy": 0.38965516686439516, + "step": 53235 + }, + { + "epoch": 0.05362389345323169, + "grad_norm": 12.705022880064721, + "learning_rate": 4.9998385187114706e-05, + "loss": 3.036, + "mean_token_accuracy": 0.4068965494632721, + "step": 53240 + }, + { + "epoch": 0.05362892950633586, + "grad_norm": 12.623017729649321, + "learning_rate": 4.999838069471594e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.43103448748588563, + "step": 53245 + }, + { + "epoch": 0.05363396555944003, + "grad_norm": 11.525889629523258, + "learning_rate": 4.999837619607717e-05, + "loss": 2.5071, + "mean_token_accuracy": 0.42970356345176697, + "step": 53250 + }, + { + "epoch": 0.0536390016125442, + "grad_norm": 10.796276764118186, + "learning_rate": 4.999837169119839e-05, + "loss": 2.0471, + "mean_token_accuracy": 0.47785844206809996, + "step": 53255 + }, + { + "epoch": 0.053644037665648375, + "grad_norm": 10.964210108293592, + "learning_rate": 4.99983671800796e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.41379310488700866, + "step": 53260 + }, + { + "epoch": 0.05364907371875255, + "grad_norm": 12.254445670335736, + "learning_rate": 4.99983626627208e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.4659407138824463, + "step": 53265 + }, + { + "epoch": 0.05365410977185672, + "grad_norm": 13.475788553071837, + "learning_rate": 4.9998358139122e-05, + "loss": 2.6494, + "mean_token_accuracy": 0.4157289803028107, + "step": 53270 + }, + { + "epoch": 0.0536591458249609, + "grad_norm": 13.377262777791412, + "learning_rate": 4.9998353609283196e-05, + "loss": 2.8284, + "mean_token_accuracy": 0.38620689511299133, + "step": 53275 + }, + { + "epoch": 0.05366418187806507, + "grad_norm": 12.806046963963635, + "learning_rate": 4.9998349073204395e-05, + "loss": 2.9124, + "mean_token_accuracy": 0.36339987218379977, + "step": 53280 + }, + { + "epoch": 0.05366921793116924, + "grad_norm": 10.830858858988913, + "learning_rate": 4.999834453088559e-05, + "loss": 2.6208, + "mean_token_accuracy": 0.3848759770393372, + "step": 53285 + }, + { + "epoch": 0.05367425398427341, + "grad_norm": 12.410675058286438, + "learning_rate": 4.999833998232678e-05, + "loss": 2.7488, + "mean_token_accuracy": 0.36206896901130675, + "step": 53290 + }, + { + "epoch": 0.053679290037377585, + "grad_norm": 10.310156045379767, + "learning_rate": 4.999833542752797e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.4068965494632721, + "step": 53295 + }, + { + "epoch": 0.05368432609048176, + "grad_norm": 18.25908334129646, + "learning_rate": 4.999833086648917e-05, + "loss": 3.0179, + "mean_token_accuracy": 0.3793103456497192, + "step": 53300 + }, + { + "epoch": 0.05368936214358593, + "grad_norm": 12.350485789078926, + "learning_rate": 4.999832629921037e-05, + "loss": 2.6429, + "mean_token_accuracy": 0.42758620381355283, + "step": 53305 + }, + { + "epoch": 0.053694398196690106, + "grad_norm": 12.9789388557274, + "learning_rate": 4.999832172569158e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.41724138259887694, + "step": 53310 + }, + { + "epoch": 0.05369943424979428, + "grad_norm": 13.202807711300615, + "learning_rate": 4.999831714593279e-05, + "loss": 2.4868, + "mean_token_accuracy": 0.40344826877117157, + "step": 53315 + }, + { + "epoch": 0.05370447030289845, + "grad_norm": 11.860060841812048, + "learning_rate": 4.9998312559934015e-05, + "loss": 2.7827, + "mean_token_accuracy": 0.33103448450565337, + "step": 53320 + }, + { + "epoch": 0.05370950635600262, + "grad_norm": 15.048911938393122, + "learning_rate": 4.999830796769525e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.42413792610168455, + "step": 53325 + }, + { + "epoch": 0.053714542409106794, + "grad_norm": 11.783162414885554, + "learning_rate": 4.9998303369216485e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.3551724076271057, + "step": 53330 + }, + { + "epoch": 0.05371957846221097, + "grad_norm": 10.613639827264706, + "learning_rate": 4.9998298764497745e-05, + "loss": 2.432, + "mean_token_accuracy": 0.42413793206214906, + "step": 53335 + }, + { + "epoch": 0.05372461451531514, + "grad_norm": 14.11402585171257, + "learning_rate": 4.9998294153539e-05, + "loss": 2.6934, + "mean_token_accuracy": 0.37586206793785093, + "step": 53340 + }, + { + "epoch": 0.053729650568419315, + "grad_norm": 10.322053677971374, + "learning_rate": 4.999828953634028e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.5034482836723327, + "step": 53345 + }, + { + "epoch": 0.05373468662152349, + "grad_norm": 12.974255550693718, + "learning_rate": 4.999828491290158e-05, + "loss": 2.4477, + "mean_token_accuracy": 0.42758620977401735, + "step": 53350 + }, + { + "epoch": 0.053739722674627656, + "grad_norm": 12.831179856913947, + "learning_rate": 4.999828028322289e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.4, + "step": 53355 + }, + { + "epoch": 0.05374475872773183, + "grad_norm": 12.139266041067364, + "learning_rate": 4.999827564730422e-05, + "loss": 2.1915, + "mean_token_accuracy": 0.47931033968925474, + "step": 53360 + }, + { + "epoch": 0.053749794780836004, + "grad_norm": 15.756750168391394, + "learning_rate": 4.999827100514556e-05, + "loss": 2.9547, + "mean_token_accuracy": 0.3517241358757019, + "step": 53365 + }, + { + "epoch": 0.05375483083394018, + "grad_norm": 13.123215494186805, + "learning_rate": 4.999826635674694e-05, + "loss": 2.8284, + "mean_token_accuracy": 0.38620689511299133, + "step": 53370 + }, + { + "epoch": 0.05375986688704435, + "grad_norm": 13.300777459230037, + "learning_rate": 4.9998261702108326e-05, + "loss": 2.8751, + "mean_token_accuracy": 0.33793103098869326, + "step": 53375 + }, + { + "epoch": 0.053764902940148525, + "grad_norm": 10.943921861277873, + "learning_rate": 4.999825704122974e-05, + "loss": 2.8933, + "mean_token_accuracy": 0.33206291794776915, + "step": 53380 + }, + { + "epoch": 0.0537699389932527, + "grad_norm": 13.350706019397954, + "learning_rate": 4.999825237411118e-05, + "loss": 2.9381, + "mean_token_accuracy": 0.3482758641242981, + "step": 53385 + }, + { + "epoch": 0.053774975046356865, + "grad_norm": 12.780375947019301, + "learning_rate": 4.999824770075265e-05, + "loss": 2.8705, + "mean_token_accuracy": 0.3827586203813553, + "step": 53390 + }, + { + "epoch": 0.05378001109946104, + "grad_norm": 16.802684385443204, + "learning_rate": 4.999824302115414e-05, + "loss": 2.7176, + "mean_token_accuracy": 0.36551723480224607, + "step": 53395 + }, + { + "epoch": 0.05378504715256521, + "grad_norm": 13.92322779619066, + "learning_rate": 4.9998238335315654e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.4034482777118683, + "step": 53400 + }, + { + "epoch": 0.05379008320566939, + "grad_norm": 15.014922166388505, + "learning_rate": 4.999823364323721e-05, + "loss": 2.8755, + "mean_token_accuracy": 0.33793103098869326, + "step": 53405 + }, + { + "epoch": 0.05379511925877356, + "grad_norm": 14.2573605523194, + "learning_rate": 4.999822894491879e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.35862069129943847, + "step": 53410 + }, + { + "epoch": 0.053800155311877734, + "grad_norm": 12.532273714394055, + "learning_rate": 4.9998224240360396e-05, + "loss": 2.6323, + "mean_token_accuracy": 0.33103448152542114, + "step": 53415 + }, + { + "epoch": 0.05380519136498191, + "grad_norm": 13.988515040582149, + "learning_rate": 4.999821952956205e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.3805807590484619, + "step": 53420 + }, + { + "epoch": 0.053810227418086075, + "grad_norm": 13.682906345152427, + "learning_rate": 4.999821481252373e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.42758620977401735, + "step": 53425 + }, + { + "epoch": 0.05381526347119025, + "grad_norm": 14.301343233340672, + "learning_rate": 4.999821008924545e-05, + "loss": 3.2137, + "mean_token_accuracy": 0.3188747763633728, + "step": 53430 + }, + { + "epoch": 0.05382029952429442, + "grad_norm": 18.289178843537798, + "learning_rate": 4.99982053597272e-05, + "loss": 2.8926, + "mean_token_accuracy": 0.3482758641242981, + "step": 53435 + }, + { + "epoch": 0.053825335577398596, + "grad_norm": 11.132204877060822, + "learning_rate": 4.9998200623969e-05, + "loss": 2.2486, + "mean_token_accuracy": 0.4551724135875702, + "step": 53440 + }, + { + "epoch": 0.05383037163050277, + "grad_norm": 11.701772598762922, + "learning_rate": 4.999819588197083e-05, + "loss": 2.4288, + "mean_token_accuracy": 0.38275861740112305, + "step": 53445 + }, + { + "epoch": 0.053835407683606944, + "grad_norm": 11.391821246140948, + "learning_rate": 4.99981911337327e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.4080459773540497, + "step": 53450 + }, + { + "epoch": 0.05384044373671112, + "grad_norm": 11.69322178028498, + "learning_rate": 4.9998186379254625e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.42413792610168455, + "step": 53455 + }, + { + "epoch": 0.053845479789815284, + "grad_norm": 14.991644009548025, + "learning_rate": 4.999818161853659e-05, + "loss": 2.9003, + "mean_token_accuracy": 0.3655172407627106, + "step": 53460 + }, + { + "epoch": 0.05385051584291946, + "grad_norm": 13.222980218432527, + "learning_rate": 4.999817685157859e-05, + "loss": 2.7091, + "mean_token_accuracy": 0.3931034505367279, + "step": 53465 + }, + { + "epoch": 0.05385555189602363, + "grad_norm": 13.96982971994947, + "learning_rate": 4.999817207838065e-05, + "loss": 2.4253, + "mean_token_accuracy": 0.4068965494632721, + "step": 53470 + }, + { + "epoch": 0.053860587949127806, + "grad_norm": 18.978969604747643, + "learning_rate": 4.999816729894275e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.37586206793785093, + "step": 53475 + }, + { + "epoch": 0.05386562400223198, + "grad_norm": 17.76736945611813, + "learning_rate": 4.999816251326491e-05, + "loss": 2.3444, + "mean_token_accuracy": 0.44289171099662783, + "step": 53480 + }, + { + "epoch": 0.05387066005533615, + "grad_norm": 20.433569378202648, + "learning_rate": 4.99981577213471e-05, + "loss": 2.9264, + "mean_token_accuracy": 0.358620685338974, + "step": 53485 + }, + { + "epoch": 0.05387569610844033, + "grad_norm": 21.83505482978997, + "learning_rate": 4.999815292318936e-05, + "loss": 2.5636, + "mean_token_accuracy": 0.4034482777118683, + "step": 53490 + }, + { + "epoch": 0.053880732161544494, + "grad_norm": 13.616043057728692, + "learning_rate": 4.999814811879167e-05, + "loss": 2.7593, + "mean_token_accuracy": 0.3931034505367279, + "step": 53495 + }, + { + "epoch": 0.05388576821464867, + "grad_norm": 12.561869388210585, + "learning_rate": 4.999814330815403e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.4413793087005615, + "step": 53500 + }, + { + "epoch": 0.05389080426775284, + "grad_norm": 10.028205253702849, + "learning_rate": 4.9998138491276455e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4122807025909424, + "step": 53505 + }, + { + "epoch": 0.053895840320857015, + "grad_norm": 13.929207773274339, + "learning_rate": 4.999813366815893e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.38965516686439516, + "step": 53510 + }, + { + "epoch": 0.05390087637396119, + "grad_norm": 12.343186518783545, + "learning_rate": 4.999812883880146e-05, + "loss": 2.4939, + "mean_token_accuracy": 0.417241370677948, + "step": 53515 + }, + { + "epoch": 0.05390591242706536, + "grad_norm": 13.028492009216768, + "learning_rate": 4.999812400320406e-05, + "loss": 3.0868, + "mean_token_accuracy": 0.31379310190677645, + "step": 53520 + }, + { + "epoch": 0.053910948480169536, + "grad_norm": 13.338966609837291, + "learning_rate": 4.999811916136671e-05, + "loss": 2.2548, + "mean_token_accuracy": 0.42758620977401735, + "step": 53525 + }, + { + "epoch": 0.0539159845332737, + "grad_norm": 13.113855228020165, + "learning_rate": 4.999811431328944e-05, + "loss": 3.0098, + "mean_token_accuracy": 0.358620685338974, + "step": 53530 + }, + { + "epoch": 0.05392102058637788, + "grad_norm": 14.29777802306572, + "learning_rate": 4.999810945897221e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.458620685338974, + "step": 53535 + }, + { + "epoch": 0.05392605663948205, + "grad_norm": 21.073877416446035, + "learning_rate": 4.9998104598415065e-05, + "loss": 2.714, + "mean_token_accuracy": 0.3965517163276672, + "step": 53540 + }, + { + "epoch": 0.053931092692586224, + "grad_norm": 10.880882504412943, + "learning_rate": 4.9998099731617975e-05, + "loss": 3.044, + "mean_token_accuracy": 0.4068965494632721, + "step": 53545 + }, + { + "epoch": 0.0539361287456904, + "grad_norm": 10.185084705512226, + "learning_rate": 4.9998094858580956e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.43103448748588563, + "step": 53550 + }, + { + "epoch": 0.05394116479879457, + "grad_norm": 12.025156945931665, + "learning_rate": 4.999808997930402e-05, + "loss": 2.6493, + "mean_token_accuracy": 0.4000000059604645, + "step": 53555 + }, + { + "epoch": 0.053946200851898746, + "grad_norm": 10.184677405639917, + "learning_rate": 4.999808509378713e-05, + "loss": 2.4094, + "mean_token_accuracy": 0.43793103098869324, + "step": 53560 + }, + { + "epoch": 0.05395123690500291, + "grad_norm": 11.609287808930308, + "learning_rate": 4.999808020203033e-05, + "loss": 2.6424, + "mean_token_accuracy": 0.3827586233615875, + "step": 53565 + }, + { + "epoch": 0.053956272958107086, + "grad_norm": 16.695190736609316, + "learning_rate": 4.99980753040336e-05, + "loss": 2.6145, + "mean_token_accuracy": 0.36206896901130675, + "step": 53570 + }, + { + "epoch": 0.05396130901121126, + "grad_norm": 15.110250873955165, + "learning_rate": 4.999807039979695e-05, + "loss": 3.053, + "mean_token_accuracy": 0.3620689630508423, + "step": 53575 + }, + { + "epoch": 0.053966345064315434, + "grad_norm": 14.193496698538517, + "learning_rate": 4.9998065489320365e-05, + "loss": 2.363, + "mean_token_accuracy": 0.39655172228813174, + "step": 53580 + }, + { + "epoch": 0.05397138111741961, + "grad_norm": 11.129491206058479, + "learning_rate": 4.999806057260387e-05, + "loss": 2.3486, + "mean_token_accuracy": 0.4482758641242981, + "step": 53585 + }, + { + "epoch": 0.05397641717052378, + "grad_norm": 11.943943954695788, + "learning_rate": 4.9998055649647445e-05, + "loss": 2.5825, + "mean_token_accuracy": 0.3862069010734558, + "step": 53590 + }, + { + "epoch": 0.053981453223627955, + "grad_norm": 12.238307474477383, + "learning_rate": 4.99980507204511e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.458620685338974, + "step": 53595 + }, + { + "epoch": 0.05398648927673212, + "grad_norm": 12.154455113336633, + "learning_rate": 4.999804578501484e-05, + "loss": 2.9044, + "mean_token_accuracy": 0.37586206793785093, + "step": 53600 + }, + { + "epoch": 0.053991525329836296, + "grad_norm": 12.80897333719012, + "learning_rate": 4.9998040843338665e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.41034482717514037, + "step": 53605 + }, + { + "epoch": 0.05399656138294047, + "grad_norm": 12.236871033425325, + "learning_rate": 4.999803589542257e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.4310344815254211, + "step": 53610 + }, + { + "epoch": 0.05400159743604464, + "grad_norm": 12.470547130273014, + "learning_rate": 4.999803094126656e-05, + "loss": 2.8487, + "mean_token_accuracy": 0.37241379618644715, + "step": 53615 + }, + { + "epoch": 0.05400663348914882, + "grad_norm": 12.997722904682266, + "learning_rate": 4.9998025980870644e-05, + "loss": 2.5829, + "mean_token_accuracy": 0.39655172228813174, + "step": 53620 + }, + { + "epoch": 0.05401166954225299, + "grad_norm": 11.080540512830911, + "learning_rate": 4.999802101423481e-05, + "loss": 2.1365, + "mean_token_accuracy": 0.4344827592372894, + "step": 53625 + }, + { + "epoch": 0.054016705595357165, + "grad_norm": 11.36495288910569, + "learning_rate": 4.999801604135907e-05, + "loss": 2.4176, + "mean_token_accuracy": 0.4137930989265442, + "step": 53630 + }, + { + "epoch": 0.05402174164846133, + "grad_norm": 13.563556905218828, + "learning_rate": 4.999801106224342e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.4532970428466797, + "step": 53635 + }, + { + "epoch": 0.054026777701565505, + "grad_norm": 14.694521090131635, + "learning_rate": 4.999800607688786e-05, + "loss": 3.0304, + "mean_token_accuracy": 0.36370235979557036, + "step": 53640 + }, + { + "epoch": 0.05403181375466968, + "grad_norm": 12.406892401175796, + "learning_rate": 4.99980010852924e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.38275861740112305, + "step": 53645 + }, + { + "epoch": 0.05403684980777385, + "grad_norm": 12.304064391575528, + "learning_rate": 4.9997996087457026e-05, + "loss": 2.9965, + "mean_token_accuracy": 0.3068965539336205, + "step": 53650 + }, + { + "epoch": 0.054041885860878026, + "grad_norm": 10.745659492088434, + "learning_rate": 4.9997991083381754e-05, + "loss": 2.5702, + "mean_token_accuracy": 0.3999999940395355, + "step": 53655 + }, + { + "epoch": 0.0540469219139822, + "grad_norm": 13.345864484364192, + "learning_rate": 4.999798607306658e-05, + "loss": 2.7008, + "mean_token_accuracy": 0.4034482777118683, + "step": 53660 + }, + { + "epoch": 0.054051957967086374, + "grad_norm": 13.730207462407472, + "learning_rate": 4.999798105651151e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.4413793087005615, + "step": 53665 + }, + { + "epoch": 0.05405699402019054, + "grad_norm": 12.363216610526056, + "learning_rate": 4.999797603371654e-05, + "loss": 2.7455, + "mean_token_accuracy": 0.3810042321681976, + "step": 53670 + }, + { + "epoch": 0.054062030073294715, + "grad_norm": 11.444101484058908, + "learning_rate": 4.9997971004681665e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.4137930989265442, + "step": 53675 + }, + { + "epoch": 0.05406706612639889, + "grad_norm": 10.474391557150467, + "learning_rate": 4.9997965969406896e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.42413792610168455, + "step": 53680 + }, + { + "epoch": 0.05407210217950306, + "grad_norm": 14.077640673046057, + "learning_rate": 4.999796092789224e-05, + "loss": 3.4186, + "mean_token_accuracy": 0.3241379350423813, + "step": 53685 + }, + { + "epoch": 0.054077138232607236, + "grad_norm": 17.160786387542075, + "learning_rate": 4.999795588013769e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.41034482717514037, + "step": 53690 + }, + { + "epoch": 0.05408217428571141, + "grad_norm": 10.766147743582179, + "learning_rate": 4.999795082614324e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.3896551698446274, + "step": 53695 + }, + { + "epoch": 0.05408721033881558, + "grad_norm": 14.677747092803214, + "learning_rate": 4.9997945765908896e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.42068964838981626, + "step": 53700 + }, + { + "epoch": 0.05409224639191975, + "grad_norm": 12.769271928328553, + "learning_rate": 4.999794069943467e-05, + "loss": 2.7477, + "mean_token_accuracy": 0.3793103456497192, + "step": 53705 + }, + { + "epoch": 0.054097282445023924, + "grad_norm": 12.400346931510436, + "learning_rate": 4.999793562672056e-05, + "loss": 2.6056, + "mean_token_accuracy": 0.3793103456497192, + "step": 53710 + }, + { + "epoch": 0.0541023184981281, + "grad_norm": 11.287944249081166, + "learning_rate": 4.999793054776656e-05, + "loss": 2.9649, + "mean_token_accuracy": 0.324137932062149, + "step": 53715 + }, + { + "epoch": 0.05410735455123227, + "grad_norm": 10.969953116591077, + "learning_rate": 4.9997925462572674e-05, + "loss": 3.4667, + "mean_token_accuracy": 0.34482758343219755, + "step": 53720 + }, + { + "epoch": 0.054112390604336445, + "grad_norm": 13.054006170367336, + "learning_rate": 4.999792037113891e-05, + "loss": 3.0844, + "mean_token_accuracy": 0.3551724135875702, + "step": 53725 + }, + { + "epoch": 0.05411742665744062, + "grad_norm": 16.037814288865174, + "learning_rate": 4.999791527346525e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.3931034505367279, + "step": 53730 + }, + { + "epoch": 0.05412246271054479, + "grad_norm": 17.213810627283163, + "learning_rate": 4.999791016955172e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.3862069010734558, + "step": 53735 + }, + { + "epoch": 0.05412749876364896, + "grad_norm": 11.716465574290336, + "learning_rate": 4.999790505939831e-05, + "loss": 2.2793, + "mean_token_accuracy": 0.45517241954803467, + "step": 53740 + }, + { + "epoch": 0.05413253481675313, + "grad_norm": 12.003322282470714, + "learning_rate": 4.999789994300502e-05, + "loss": 2.5329, + "mean_token_accuracy": 0.41034482717514037, + "step": 53745 + }, + { + "epoch": 0.05413757086985731, + "grad_norm": 11.703077591036108, + "learning_rate": 4.9997894820371855e-05, + "loss": 2.6602, + "mean_token_accuracy": 0.4053841531276703, + "step": 53750 + }, + { + "epoch": 0.05414260692296148, + "grad_norm": 12.497987994517377, + "learning_rate": 4.9997889691498814e-05, + "loss": 2.8614, + "mean_token_accuracy": 0.36896551251411436, + "step": 53755 + }, + { + "epoch": 0.054147642976065655, + "grad_norm": 14.65336341702027, + "learning_rate": 4.99978845563859e-05, + "loss": 2.9145, + "mean_token_accuracy": 0.34137930572032926, + "step": 53760 + }, + { + "epoch": 0.05415267902916983, + "grad_norm": 10.946588020110655, + "learning_rate": 4.9997879415033114e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.4504537105560303, + "step": 53765 + }, + { + "epoch": 0.054157715082274, + "grad_norm": 10.62388792977892, + "learning_rate": 4.9997874267440455e-05, + "loss": 2.6099, + "mean_token_accuracy": 0.34137930870056155, + "step": 53770 + }, + { + "epoch": 0.05416275113537817, + "grad_norm": 13.3845489452758, + "learning_rate": 4.9997869113607935e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.36551723480224607, + "step": 53775 + }, + { + "epoch": 0.05416778718848234, + "grad_norm": 12.321608355058734, + "learning_rate": 4.999786395353553e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.3827586233615875, + "step": 53780 + }, + { + "epoch": 0.05417282324158652, + "grad_norm": 15.758723657304026, + "learning_rate": 4.9997858787223276e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.36551723778247835, + "step": 53785 + }, + { + "epoch": 0.05417785929469069, + "grad_norm": 12.708130712218528, + "learning_rate": 4.999785361467115e-05, + "loss": 2.842, + "mean_token_accuracy": 0.41203871965408323, + "step": 53790 + }, + { + "epoch": 0.054182895347794864, + "grad_norm": 11.452532087921881, + "learning_rate": 4.999784843587916e-05, + "loss": 1.9419, + "mean_token_accuracy": 0.5185960710048676, + "step": 53795 + }, + { + "epoch": 0.05418793140089904, + "grad_norm": 13.008851026016064, + "learning_rate": 4.999784325084731e-05, + "loss": 2.6844, + "mean_token_accuracy": 0.39310344457626345, + "step": 53800 + }, + { + "epoch": 0.05419296745400321, + "grad_norm": 11.156132799423109, + "learning_rate": 4.9997838059575596e-05, + "loss": 2.3681, + "mean_token_accuracy": 0.447126442193985, + "step": 53805 + }, + { + "epoch": 0.05419800350710738, + "grad_norm": 14.593307415040004, + "learning_rate": 4.999783286206402e-05, + "loss": 2.9648, + "mean_token_accuracy": 0.37586206793785093, + "step": 53810 + }, + { + "epoch": 0.05420303956021155, + "grad_norm": 10.3345314067087, + "learning_rate": 4.999782765831259e-05, + "loss": 2.7223, + "mean_token_accuracy": 0.3896551698446274, + "step": 53815 + }, + { + "epoch": 0.054208075613315726, + "grad_norm": 12.376332067036385, + "learning_rate": 4.999782244832131e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.38620689511299133, + "step": 53820 + }, + { + "epoch": 0.0542131116664199, + "grad_norm": 10.452128327487172, + "learning_rate": 4.999781723209016e-05, + "loss": 2.2664, + "mean_token_accuracy": 0.4034482717514038, + "step": 53825 + }, + { + "epoch": 0.054218147719524074, + "grad_norm": 18.391573364387483, + "learning_rate": 4.9997812009619174e-05, + "loss": 3.0707, + "mean_token_accuracy": 0.358620685338974, + "step": 53830 + }, + { + "epoch": 0.05422318377262825, + "grad_norm": 14.890771930466618, + "learning_rate": 4.9997806780908325e-05, + "loss": 2.9583, + "mean_token_accuracy": 0.33448276221752166, + "step": 53835 + }, + { + "epoch": 0.05422821982573242, + "grad_norm": 11.494590459401621, + "learning_rate": 4.999780154595763e-05, + "loss": 2.688, + "mean_token_accuracy": 0.4, + "step": 53840 + }, + { + "epoch": 0.05423325587883659, + "grad_norm": 13.91047496171786, + "learning_rate": 4.999779630476708e-05, + "loss": 2.6772, + "mean_token_accuracy": 0.4379310369491577, + "step": 53845 + }, + { + "epoch": 0.05423829193194076, + "grad_norm": 14.142156072605141, + "learning_rate": 4.999779105733669e-05, + "loss": 2.8989, + "mean_token_accuracy": 0.3620689630508423, + "step": 53850 + }, + { + "epoch": 0.054243327985044935, + "grad_norm": 12.225168186552901, + "learning_rate": 4.999778580366644e-05, + "loss": 2.4245, + "mean_token_accuracy": 0.40689654350280763, + "step": 53855 + }, + { + "epoch": 0.05424836403814911, + "grad_norm": 14.784843511490099, + "learning_rate": 4.999778054375636e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4310344815254211, + "step": 53860 + }, + { + "epoch": 0.05425340009125328, + "grad_norm": 13.193032349691608, + "learning_rate": 4.999777527760643e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.37586207389831544, + "step": 53865 + }, + { + "epoch": 0.05425843614435746, + "grad_norm": 10.308914678114846, + "learning_rate": 4.999777000521666e-05, + "loss": 2.536, + "mean_token_accuracy": 0.4206896543502808, + "step": 53870 + }, + { + "epoch": 0.05426347219746163, + "grad_norm": 17.11432377423662, + "learning_rate": 4.999776472658705e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.44137930274009707, + "step": 53875 + }, + { + "epoch": 0.0542685082505658, + "grad_norm": 9.408458829399539, + "learning_rate": 4.99977594417176e-05, + "loss": 2.6308, + "mean_token_accuracy": 0.3902709364891052, + "step": 53880 + }, + { + "epoch": 0.05427354430366997, + "grad_norm": 19.909679161899852, + "learning_rate": 4.999775415060832e-05, + "loss": 3.726, + "mean_token_accuracy": 0.29655172526836393, + "step": 53885 + }, + { + "epoch": 0.054278580356774145, + "grad_norm": 10.492820950117368, + "learning_rate": 4.9997748853259184e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.4000000059604645, + "step": 53890 + }, + { + "epoch": 0.05428361640987832, + "grad_norm": 10.887721613014627, + "learning_rate": 4.999774354967023e-05, + "loss": 2.0946, + "mean_token_accuracy": 0.4620689690113068, + "step": 53895 + }, + { + "epoch": 0.05428865246298249, + "grad_norm": 9.215919668736971, + "learning_rate": 4.999773823984144e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.4413793087005615, + "step": 53900 + }, + { + "epoch": 0.054293688516086666, + "grad_norm": 16.040806172194344, + "learning_rate": 4.9997732923772816e-05, + "loss": 3.0961, + "mean_token_accuracy": 0.379310342669487, + "step": 53905 + }, + { + "epoch": 0.05429872456919084, + "grad_norm": 10.386997877987103, + "learning_rate": 4.999772760146436e-05, + "loss": 2.4712, + "mean_token_accuracy": 0.37241379618644715, + "step": 53910 + }, + { + "epoch": 0.05430376062229501, + "grad_norm": 10.771292438339708, + "learning_rate": 4.999772227291608e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.4482758641242981, + "step": 53915 + }, + { + "epoch": 0.05430879667539918, + "grad_norm": 11.587152762044953, + "learning_rate": 4.999771693812797e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.3413792967796326, + "step": 53920 + }, + { + "epoch": 0.054313832728503354, + "grad_norm": 10.417644101112003, + "learning_rate": 4.999771159710004e-05, + "loss": 2.5876, + "mean_token_accuracy": 0.39818512201309203, + "step": 53925 + }, + { + "epoch": 0.05431886878160753, + "grad_norm": 13.604045167660464, + "learning_rate": 4.999770624983228e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.4068965494632721, + "step": 53930 + }, + { + "epoch": 0.0543239048347117, + "grad_norm": 11.767215749978025, + "learning_rate": 4.999770089632469e-05, + "loss": 2.3774, + "mean_token_accuracy": 0.3965517163276672, + "step": 53935 + }, + { + "epoch": 0.054328940887815876, + "grad_norm": 10.601196216208526, + "learning_rate": 4.999769553657729e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.36551723480224607, + "step": 53940 + }, + { + "epoch": 0.05433397694092005, + "grad_norm": 15.275302812283423, + "learning_rate": 4.9997690170590074e-05, + "loss": 3.6193, + "mean_token_accuracy": 0.27931034564971924, + "step": 53945 + }, + { + "epoch": 0.054339012994024216, + "grad_norm": 13.18380812645147, + "learning_rate": 4.9997684798363024e-05, + "loss": 2.8234, + "mean_token_accuracy": 0.39655172228813174, + "step": 53950 + }, + { + "epoch": 0.05434404904712839, + "grad_norm": 11.211606771693553, + "learning_rate": 4.999767941989617e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.3620689630508423, + "step": 53955 + }, + { + "epoch": 0.054349085100232564, + "grad_norm": 13.057766226252904, + "learning_rate": 4.9997674035189496e-05, + "loss": 2.8529, + "mean_token_accuracy": 0.3551724076271057, + "step": 53960 + }, + { + "epoch": 0.05435412115333674, + "grad_norm": 11.985242858576337, + "learning_rate": 4.9997668644243004e-05, + "loss": 2.8048, + "mean_token_accuracy": 0.33103448152542114, + "step": 53965 + }, + { + "epoch": 0.05435915720644091, + "grad_norm": 12.872207299026323, + "learning_rate": 4.999766324705671e-05, + "loss": 2.6505, + "mean_token_accuracy": 0.39655172228813174, + "step": 53970 + }, + { + "epoch": 0.054364193259545085, + "grad_norm": 12.934939352560217, + "learning_rate": 4.99976578436306e-05, + "loss": 2.6791, + "mean_token_accuracy": 0.3862068891525269, + "step": 53975 + }, + { + "epoch": 0.05436922931264926, + "grad_norm": 13.198426602591098, + "learning_rate": 4.999765243396468e-05, + "loss": 2.1877, + "mean_token_accuracy": 0.4620689630508423, + "step": 53980 + }, + { + "epoch": 0.054374265365753426, + "grad_norm": 13.486393585761201, + "learning_rate": 4.999764701805895e-05, + "loss": 2.9929, + "mean_token_accuracy": 0.324137932062149, + "step": 53985 + }, + { + "epoch": 0.0543793014188576, + "grad_norm": 14.76517937377239, + "learning_rate": 4.999764159591342e-05, + "loss": 3.0706, + "mean_token_accuracy": 0.36896551251411436, + "step": 53990 + }, + { + "epoch": 0.05438433747196177, + "grad_norm": 9.428751123244538, + "learning_rate": 4.9997636167528076e-05, + "loss": 2.6947, + "mean_token_accuracy": 0.41034482717514037, + "step": 53995 + }, + { + "epoch": 0.05438937352506595, + "grad_norm": 12.285506809661081, + "learning_rate": 4.999763073290294e-05, + "loss": 2.5627, + "mean_token_accuracy": 0.3931034505367279, + "step": 54000 + }, + { + "epoch": 0.05439440957817012, + "grad_norm": 10.736808833239031, + "learning_rate": 4.9997625292038e-05, + "loss": 2.2772, + "mean_token_accuracy": 0.39310344457626345, + "step": 54005 + }, + { + "epoch": 0.054399445631274294, + "grad_norm": 13.27385560071303, + "learning_rate": 4.999761984493325e-05, + "loss": 2.3316, + "mean_token_accuracy": 0.42413793206214906, + "step": 54010 + }, + { + "epoch": 0.05440448168437847, + "grad_norm": 14.079620012529094, + "learning_rate": 4.999761439158871e-05, + "loss": 2.2749, + "mean_token_accuracy": 0.4413793087005615, + "step": 54015 + }, + { + "epoch": 0.054409517737482635, + "grad_norm": 15.80187386483194, + "learning_rate": 4.999760893200437e-05, + "loss": 2.748, + "mean_token_accuracy": 0.362068971991539, + "step": 54020 + }, + { + "epoch": 0.05441455379058681, + "grad_norm": 13.297376669183109, + "learning_rate": 4.9997603466180234e-05, + "loss": 3.0282, + "mean_token_accuracy": 0.38275861740112305, + "step": 54025 + }, + { + "epoch": 0.05441958984369098, + "grad_norm": 14.885002948421775, + "learning_rate": 4.9997597994116304e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.3655172437429428, + "step": 54030 + }, + { + "epoch": 0.054424625896795156, + "grad_norm": 14.516186343224915, + "learning_rate": 4.9997592515812574e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.37586207389831544, + "step": 54035 + }, + { + "epoch": 0.05442966194989933, + "grad_norm": 12.207578521600366, + "learning_rate": 4.999758703126906e-05, + "loss": 2.544, + "mean_token_accuracy": 0.37931033968925476, + "step": 54040 + }, + { + "epoch": 0.054434698003003504, + "grad_norm": 12.319141839508259, + "learning_rate": 4.999758154048576e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.43103447556495667, + "step": 54045 + }, + { + "epoch": 0.05443973405610768, + "grad_norm": 17.911899431205587, + "learning_rate": 4.9997576043462675e-05, + "loss": 2.6841, + "mean_token_accuracy": 0.403448274731636, + "step": 54050 + }, + { + "epoch": 0.054444770109211844, + "grad_norm": 16.376945136733447, + "learning_rate": 4.999757054019979e-05, + "loss": 2.9254, + "mean_token_accuracy": 0.3551724135875702, + "step": 54055 + }, + { + "epoch": 0.05444980616231602, + "grad_norm": 11.122444459641732, + "learning_rate": 4.999756503069713e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.4398064136505127, + "step": 54060 + }, + { + "epoch": 0.05445484221542019, + "grad_norm": 10.914612837861824, + "learning_rate": 4.999755951495468e-05, + "loss": 2.4446, + "mean_token_accuracy": 0.43793103098869324, + "step": 54065 + }, + { + "epoch": 0.054459878268524366, + "grad_norm": 16.312173981981157, + "learning_rate": 4.999755399297245e-05, + "loss": 2.6329, + "mean_token_accuracy": 0.4068965554237366, + "step": 54070 + }, + { + "epoch": 0.05446491432162854, + "grad_norm": 10.776177989232732, + "learning_rate": 4.999754846475044e-05, + "loss": 2.0429, + "mean_token_accuracy": 0.46739262342453003, + "step": 54075 + }, + { + "epoch": 0.05446995037473271, + "grad_norm": 10.042907189674924, + "learning_rate": 4.999754293028865e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.4344827592372894, + "step": 54080 + }, + { + "epoch": 0.05447498642783689, + "grad_norm": 13.37657953275907, + "learning_rate": 4.999753738958708e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.43793103098869324, + "step": 54085 + }, + { + "epoch": 0.054480022480941054, + "grad_norm": 11.733893965659687, + "learning_rate": 4.9997531842645745e-05, + "loss": 2.5528, + "mean_token_accuracy": 0.38808228075504303, + "step": 54090 + }, + { + "epoch": 0.05448505853404523, + "grad_norm": 10.895627029035035, + "learning_rate": 4.999752628946463e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.39655172228813174, + "step": 54095 + }, + { + "epoch": 0.0544900945871494, + "grad_norm": 12.456552738200319, + "learning_rate": 4.9997520730043736e-05, + "loss": 3.1742, + "mean_token_accuracy": 0.36896551251411436, + "step": 54100 + }, + { + "epoch": 0.054495130640253575, + "grad_norm": 11.188090946774155, + "learning_rate": 4.999751516438307e-05, + "loss": 2.6878, + "mean_token_accuracy": 0.37931033968925476, + "step": 54105 + }, + { + "epoch": 0.05450016669335775, + "grad_norm": 11.231086997805155, + "learning_rate": 4.999750959248264e-05, + "loss": 2.6557, + "mean_token_accuracy": 0.3896551728248596, + "step": 54110 + }, + { + "epoch": 0.05450520274646192, + "grad_norm": 15.856141130023543, + "learning_rate": 4.9997504014342444e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.37586206793785093, + "step": 54115 + }, + { + "epoch": 0.054510238799566096, + "grad_norm": 12.261658078039867, + "learning_rate": 4.999749842996249e-05, + "loss": 2.5281, + "mean_token_accuracy": 0.3793103456497192, + "step": 54120 + }, + { + "epoch": 0.05451527485267026, + "grad_norm": 21.43156017662129, + "learning_rate": 4.9997492839342744e-05, + "loss": 2.9841, + "mean_token_accuracy": 0.3827586114406586, + "step": 54125 + }, + { + "epoch": 0.05452031090577444, + "grad_norm": 12.239773363389329, + "learning_rate": 4.999748724248326e-05, + "loss": 2.6943, + "mean_token_accuracy": 0.3551724135875702, + "step": 54130 + }, + { + "epoch": 0.05452534695887861, + "grad_norm": 14.604508286717447, + "learning_rate": 4.9997481639384e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.3862069010734558, + "step": 54135 + }, + { + "epoch": 0.054530383011982785, + "grad_norm": 10.991348896845013, + "learning_rate": 4.999747603004499e-05, + "loss": 2.7659, + "mean_token_accuracy": 0.3931034505367279, + "step": 54140 + }, + { + "epoch": 0.05453541906508696, + "grad_norm": 9.415641213471579, + "learning_rate": 4.999747041446621e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.44664246439933775, + "step": 54145 + }, + { + "epoch": 0.05454045511819113, + "grad_norm": 8.548283421150996, + "learning_rate": 4.999746479264768e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.47428917288780215, + "step": 54150 + }, + { + "epoch": 0.054545491171295306, + "grad_norm": 11.059116112859952, + "learning_rate": 4.999745916458939e-05, + "loss": 2.8004, + "mean_token_accuracy": 0.39655172228813174, + "step": 54155 + }, + { + "epoch": 0.05455052722439947, + "grad_norm": 12.248777925373007, + "learning_rate": 4.999745353029134e-05, + "loss": 2.5656, + "mean_token_accuracy": 0.4103448331356049, + "step": 54160 + }, + { + "epoch": 0.054555563277503646, + "grad_norm": 11.075679094109898, + "learning_rate": 4.999744788975355e-05, + "loss": 2.6393, + "mean_token_accuracy": 0.3793103456497192, + "step": 54165 + }, + { + "epoch": 0.05456059933060782, + "grad_norm": 16.17636583863506, + "learning_rate": 4.9997442242976e-05, + "loss": 2.7734, + "mean_token_accuracy": 0.4034482777118683, + "step": 54170 + }, + { + "epoch": 0.054565635383711994, + "grad_norm": 12.715526057272394, + "learning_rate": 4.999743658995871e-05, + "loss": 2.5157, + "mean_token_accuracy": 0.4497277677059174, + "step": 54175 + }, + { + "epoch": 0.05457067143681617, + "grad_norm": 13.87766693780733, + "learning_rate": 4.999743093070167e-05, + "loss": 2.7745, + "mean_token_accuracy": 0.3827586233615875, + "step": 54180 + }, + { + "epoch": 0.05457570748992034, + "grad_norm": 17.41464747954218, + "learning_rate": 4.9997425265204876e-05, + "loss": 2.795, + "mean_token_accuracy": 0.38717483878135683, + "step": 54185 + }, + { + "epoch": 0.054580743543024515, + "grad_norm": 11.779048531960992, + "learning_rate": 4.999741959346834e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.4034482717514038, + "step": 54190 + }, + { + "epoch": 0.05458577959612868, + "grad_norm": 13.465500772077153, + "learning_rate": 4.999741391549206e-05, + "loss": 2.7821, + "mean_token_accuracy": 0.38620689511299133, + "step": 54195 + }, + { + "epoch": 0.054590815649232856, + "grad_norm": 13.776070959656394, + "learning_rate": 4.9997408231276036e-05, + "loss": 2.793, + "mean_token_accuracy": 0.4103448212146759, + "step": 54200 + }, + { + "epoch": 0.05459585170233703, + "grad_norm": 10.374760641388725, + "learning_rate": 4.999740254082028e-05, + "loss": 2.6205, + "mean_token_accuracy": 0.4, + "step": 54205 + }, + { + "epoch": 0.0546008877554412, + "grad_norm": 10.685216006201408, + "learning_rate": 4.9997396844124784e-05, + "loss": 3.007, + "mean_token_accuracy": 0.37241379022598264, + "step": 54210 + }, + { + "epoch": 0.05460592380854538, + "grad_norm": 14.334845886791479, + "learning_rate": 4.999739114118954e-05, + "loss": 2.425, + "mean_token_accuracy": 0.3931034505367279, + "step": 54215 + }, + { + "epoch": 0.05461095986164955, + "grad_norm": 11.875115785044175, + "learning_rate": 4.9997385432014575e-05, + "loss": 2.7835, + "mean_token_accuracy": 0.3896551728248596, + "step": 54220 + }, + { + "epoch": 0.054615995914753725, + "grad_norm": 12.754674301187459, + "learning_rate": 4.999737971659987e-05, + "loss": 2.5647, + "mean_token_accuracy": 0.41034482717514037, + "step": 54225 + }, + { + "epoch": 0.05462103196785789, + "grad_norm": 15.23874272656464, + "learning_rate": 4.999737399494543e-05, + "loss": 2.7344, + "mean_token_accuracy": 0.38965517580509185, + "step": 54230 + }, + { + "epoch": 0.054626068020962065, + "grad_norm": 36.901815142983025, + "learning_rate": 4.999736826705126e-05, + "loss": 2.0797, + "mean_token_accuracy": 0.4344827592372894, + "step": 54235 + }, + { + "epoch": 0.05463110407406624, + "grad_norm": 13.430485074266526, + "learning_rate": 4.999736253291737e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.4, + "step": 54240 + }, + { + "epoch": 0.05463614012717041, + "grad_norm": 17.49833231313112, + "learning_rate": 4.999735679254374e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.3931034505367279, + "step": 54245 + }, + { + "epoch": 0.05464117618027459, + "grad_norm": 13.262399028389577, + "learning_rate": 4.999735104593039e-05, + "loss": 2.6093, + "mean_token_accuracy": 0.3551724076271057, + "step": 54250 + }, + { + "epoch": 0.05464621223337876, + "grad_norm": 20.89465837522596, + "learning_rate": 4.9997345293077316e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.3946158468723297, + "step": 54255 + }, + { + "epoch": 0.054651248286482934, + "grad_norm": 10.647909151783944, + "learning_rate": 4.999733953398452e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.38620689511299133, + "step": 54260 + }, + { + "epoch": 0.0546562843395871, + "grad_norm": 12.837324155958994, + "learning_rate": 4.999733376865201e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.3931034475564957, + "step": 54265 + }, + { + "epoch": 0.054661320392691275, + "grad_norm": 13.924427303998854, + "learning_rate": 4.999732799707977e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.3620689660310745, + "step": 54270 + }, + { + "epoch": 0.05466635644579545, + "grad_norm": 11.49985932214395, + "learning_rate": 4.999732221926781e-05, + "loss": 2.495, + "mean_token_accuracy": 0.42274651527404783, + "step": 54275 + }, + { + "epoch": 0.05467139249889962, + "grad_norm": 13.135449276344067, + "learning_rate": 4.999731643521614e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.3724137842655182, + "step": 54280 + }, + { + "epoch": 0.054676428552003796, + "grad_norm": 11.62245100960335, + "learning_rate": 4.999731064492476e-05, + "loss": 2.2225, + "mean_token_accuracy": 0.4620689630508423, + "step": 54285 + }, + { + "epoch": 0.05468146460510797, + "grad_norm": 14.027978017442027, + "learning_rate": 4.9997304848393656e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.41379310488700866, + "step": 54290 + }, + { + "epoch": 0.054686500658212144, + "grad_norm": 27.80286178382515, + "learning_rate": 4.999729904562285e-05, + "loss": 2.7357, + "mean_token_accuracy": 0.37931033968925476, + "step": 54295 + }, + { + "epoch": 0.05469153671131631, + "grad_norm": 11.32676543320186, + "learning_rate": 4.999729323661232e-05, + "loss": 2.1157, + "mean_token_accuracy": 0.4689655125141144, + "step": 54300 + }, + { + "epoch": 0.054696572764420484, + "grad_norm": 12.728543843048245, + "learning_rate": 4.999728742136209e-05, + "loss": 2.599, + "mean_token_accuracy": 0.3620689630508423, + "step": 54305 + }, + { + "epoch": 0.05470160881752466, + "grad_norm": 14.131418345484875, + "learning_rate": 4.999728159987217e-05, + "loss": 2.5245, + "mean_token_accuracy": 0.3965517282485962, + "step": 54310 + }, + { + "epoch": 0.05470664487062883, + "grad_norm": 13.708912455739048, + "learning_rate": 4.9997275772142524e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.38620689511299133, + "step": 54315 + }, + { + "epoch": 0.054711680923733005, + "grad_norm": 13.183366545925505, + "learning_rate": 4.9997269938173176e-05, + "loss": 2.6331, + "mean_token_accuracy": 0.33793102502822875, + "step": 54320 + }, + { + "epoch": 0.05471671697683718, + "grad_norm": 12.059306886787619, + "learning_rate": 4.999726409796413e-05, + "loss": 2.5516, + "mean_token_accuracy": 0.39310344457626345, + "step": 54325 + }, + { + "epoch": 0.05472175302994135, + "grad_norm": 11.62830121580452, + "learning_rate": 4.999725825151538e-05, + "loss": 2.769, + "mean_token_accuracy": 0.41379310488700866, + "step": 54330 + }, + { + "epoch": 0.05472678908304552, + "grad_norm": 18.20200037305479, + "learning_rate": 4.9997252398826946e-05, + "loss": 2.7932, + "mean_token_accuracy": 0.3827586203813553, + "step": 54335 + }, + { + "epoch": 0.054731825136149694, + "grad_norm": 12.865533348690079, + "learning_rate": 4.99972465398988e-05, + "loss": 2.7858, + "mean_token_accuracy": 0.37241379618644715, + "step": 54340 + }, + { + "epoch": 0.05473686118925387, + "grad_norm": 13.1580197447787, + "learning_rate": 4.999724067473097e-05, + "loss": 2.5233, + "mean_token_accuracy": 0.37931033968925476, + "step": 54345 + }, + { + "epoch": 0.05474189724235804, + "grad_norm": 14.16512295381298, + "learning_rate": 4.999723480332344e-05, + "loss": 2.5528, + "mean_token_accuracy": 0.4241379380226135, + "step": 54350 + }, + { + "epoch": 0.054746933295462215, + "grad_norm": 12.154094385131897, + "learning_rate": 4.999722892567622e-05, + "loss": 2.4514, + "mean_token_accuracy": 0.4482758641242981, + "step": 54355 + }, + { + "epoch": 0.05475196934856639, + "grad_norm": 13.671438246452615, + "learning_rate": 4.99972230417893e-05, + "loss": 2.2815, + "mean_token_accuracy": 0.4310344815254211, + "step": 54360 + }, + { + "epoch": 0.05475700540167056, + "grad_norm": 12.743191833251798, + "learning_rate": 4.999721715166271e-05, + "loss": 2.7251, + "mean_token_accuracy": 0.3896551728248596, + "step": 54365 + }, + { + "epoch": 0.05476204145477473, + "grad_norm": 10.254736494907352, + "learning_rate": 4.9997211255296414e-05, + "loss": 2.7059, + "mean_token_accuracy": 0.41379310488700866, + "step": 54370 + }, + { + "epoch": 0.0547670775078789, + "grad_norm": 11.314195982705172, + "learning_rate": 4.999720535269045e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.37241379022598264, + "step": 54375 + }, + { + "epoch": 0.05477211356098308, + "grad_norm": 10.25062799256725, + "learning_rate": 4.9997199443844783e-05, + "loss": 2.5568, + "mean_token_accuracy": 0.4379310369491577, + "step": 54380 + }, + { + "epoch": 0.05477714961408725, + "grad_norm": 11.277378520683582, + "learning_rate": 4.999719352875945e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.43103447556495667, + "step": 54385 + }, + { + "epoch": 0.054782185667191424, + "grad_norm": 9.629413550103756, + "learning_rate": 4.9997187607434426e-05, + "loss": 2.1146, + "mean_token_accuracy": 0.4605565667152405, + "step": 54390 + }, + { + "epoch": 0.0547872217202956, + "grad_norm": 10.977584866380166, + "learning_rate": 4.9997181679869726e-05, + "loss": 2.3172, + "mean_token_accuracy": 0.43103447556495667, + "step": 54395 + }, + { + "epoch": 0.05479225777339977, + "grad_norm": 10.66862273991511, + "learning_rate": 4.9997175746065354e-05, + "loss": 2.2606, + "mean_token_accuracy": 0.44482758045196535, + "step": 54400 + }, + { + "epoch": 0.05479729382650394, + "grad_norm": 16.692960050717513, + "learning_rate": 4.99971698060213e-05, + "loss": 2.6912, + "mean_token_accuracy": 0.3931034505367279, + "step": 54405 + }, + { + "epoch": 0.05480232987960811, + "grad_norm": 11.945205236638019, + "learning_rate": 4.999716385973757e-05, + "loss": 2.6448, + "mean_token_accuracy": 0.4344827592372894, + "step": 54410 + }, + { + "epoch": 0.054807365932712286, + "grad_norm": 10.624755482170864, + "learning_rate": 4.9997157907214176e-05, + "loss": 2.6628, + "mean_token_accuracy": 0.41034482717514037, + "step": 54415 + }, + { + "epoch": 0.05481240198581646, + "grad_norm": 13.99169543678255, + "learning_rate": 4.9997151948451105e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.458620685338974, + "step": 54420 + }, + { + "epoch": 0.054817438038920634, + "grad_norm": 10.321447297620917, + "learning_rate": 4.9997145983448375e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.3793103456497192, + "step": 54425 + }, + { + "epoch": 0.05482247409202481, + "grad_norm": 12.560805386784308, + "learning_rate": 4.999714001220596e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.4689655125141144, + "step": 54430 + }, + { + "epoch": 0.05482751014512898, + "grad_norm": 11.622107835143565, + "learning_rate": 4.9997134034723895e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.46551724076271056, + "step": 54435 + }, + { + "epoch": 0.05483254619823315, + "grad_norm": 11.088411785650322, + "learning_rate": 4.999712805100216e-05, + "loss": 2.7162, + "mean_token_accuracy": 0.37241379022598264, + "step": 54440 + }, + { + "epoch": 0.05483758225133732, + "grad_norm": 10.397474582694896, + "learning_rate": 4.999712206104076e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.3379310369491577, + "step": 54445 + }, + { + "epoch": 0.054842618304441496, + "grad_norm": 11.75477049590005, + "learning_rate": 4.99971160648397e-05, + "loss": 2.8731, + "mean_token_accuracy": 0.38790078163146974, + "step": 54450 + }, + { + "epoch": 0.05484765435754567, + "grad_norm": 10.903901256255777, + "learning_rate": 4.999711006239899e-05, + "loss": 2.2529, + "mean_token_accuracy": 0.46551724076271056, + "step": 54455 + }, + { + "epoch": 0.05485269041064984, + "grad_norm": 9.920129346600094, + "learning_rate": 4.999710405371861e-05, + "loss": 2.262, + "mean_token_accuracy": 0.4172413766384125, + "step": 54460 + }, + { + "epoch": 0.05485772646375402, + "grad_norm": 10.223240500440891, + "learning_rate": 4.9997098038798585e-05, + "loss": 2.5247, + "mean_token_accuracy": 0.38620689511299133, + "step": 54465 + }, + { + "epoch": 0.05486276251685819, + "grad_norm": 10.59049060922838, + "learning_rate": 4.99970920176389e-05, + "loss": 2.1664, + "mean_token_accuracy": 0.4448275864124298, + "step": 54470 + }, + { + "epoch": 0.05486779856996236, + "grad_norm": 11.845171339417996, + "learning_rate": 4.999708599023956e-05, + "loss": 3.2442, + "mean_token_accuracy": 0.35862069129943847, + "step": 54475 + }, + { + "epoch": 0.05487283462306653, + "grad_norm": 11.74041524184736, + "learning_rate": 4.9997079956600574e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.4103448212146759, + "step": 54480 + }, + { + "epoch": 0.054877870676170705, + "grad_norm": 10.621372110218356, + "learning_rate": 4.9997073916721935e-05, + "loss": 2.3266, + "mean_token_accuracy": 0.4517241299152374, + "step": 54485 + }, + { + "epoch": 0.05488290672927488, + "grad_norm": 10.814890275601295, + "learning_rate": 4.999706787060365e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.47586207985877993, + "step": 54490 + }, + { + "epoch": 0.05488794278237905, + "grad_norm": 11.583624713048449, + "learning_rate": 4.999706181824573e-05, + "loss": 2.0823, + "mean_token_accuracy": 0.456745320558548, + "step": 54495 + }, + { + "epoch": 0.054892978835483226, + "grad_norm": 11.461569199699918, + "learning_rate": 4.9997055759648156e-05, + "loss": 2.6524, + "mean_token_accuracy": 0.3841500341892242, + "step": 54500 + }, + { + "epoch": 0.0548980148885874, + "grad_norm": 12.291886836126451, + "learning_rate": 4.999704969481094e-05, + "loss": 2.423, + "mean_token_accuracy": 0.43103447556495667, + "step": 54505 + }, + { + "epoch": 0.05490305094169157, + "grad_norm": 11.139050865953255, + "learning_rate": 4.999704362373408e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4724137902259827, + "step": 54510 + }, + { + "epoch": 0.05490808699479574, + "grad_norm": 11.020074725867492, + "learning_rate": 4.9997037546417584e-05, + "loss": 2.504, + "mean_token_accuracy": 0.41379310488700866, + "step": 54515 + }, + { + "epoch": 0.054913123047899914, + "grad_norm": 11.170791713838243, + "learning_rate": 4.9997031462861446e-05, + "loss": 2.373, + "mean_token_accuracy": 0.43793103098869324, + "step": 54520 + }, + { + "epoch": 0.05491815910100409, + "grad_norm": 13.68047590302003, + "learning_rate": 4.9997025373065685e-05, + "loss": 2.471, + "mean_token_accuracy": 0.4310344815254211, + "step": 54525 + }, + { + "epoch": 0.05492319515410826, + "grad_norm": 13.040297655525265, + "learning_rate": 4.999701927703028e-05, + "loss": 2.6178, + "mean_token_accuracy": 0.39310344457626345, + "step": 54530 + }, + { + "epoch": 0.054928231207212436, + "grad_norm": 10.90090243179482, + "learning_rate": 4.999701317475525e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.4172413766384125, + "step": 54535 + }, + { + "epoch": 0.05493326726031661, + "grad_norm": 13.825858356836811, + "learning_rate": 4.999700706624058e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.3862068891525269, + "step": 54540 + }, + { + "epoch": 0.054938303313420776, + "grad_norm": 14.077127569330168, + "learning_rate": 4.999700095148629e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.45862069725990295, + "step": 54545 + }, + { + "epoch": 0.05494333936652495, + "grad_norm": 10.849000917150184, + "learning_rate": 4.9996994830492365e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.3931034505367279, + "step": 54550 + }, + { + "epoch": 0.054948375419629124, + "grad_norm": 13.451222479195048, + "learning_rate": 4.999698870325882e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.3620689630508423, + "step": 54555 + }, + { + "epoch": 0.0549534114727333, + "grad_norm": 9.459283196923552, + "learning_rate": 4.9996982569785647e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.4206896543502808, + "step": 54560 + }, + { + "epoch": 0.05495844752583747, + "grad_norm": 16.03525928253831, + "learning_rate": 4.999697643007286e-05, + "loss": 2.8863, + "mean_token_accuracy": 0.39655172228813174, + "step": 54565 + }, + { + "epoch": 0.054963483578941645, + "grad_norm": 13.0083486471545, + "learning_rate": 4.9996970284120445e-05, + "loss": 2.797, + "mean_token_accuracy": 0.341379314661026, + "step": 54570 + }, + { + "epoch": 0.05496851963204582, + "grad_norm": 10.228530865355939, + "learning_rate": 4.9996964131928415e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.41440887451171876, + "step": 54575 + }, + { + "epoch": 0.054973555685149986, + "grad_norm": 15.406325236978802, + "learning_rate": 4.999695797349677e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.40689654350280763, + "step": 54580 + }, + { + "epoch": 0.05497859173825416, + "grad_norm": 11.910937457865513, + "learning_rate": 4.99969518088255e-05, + "loss": 2.3949, + "mean_token_accuracy": 0.46551724672317507, + "step": 54585 + }, + { + "epoch": 0.05498362779135833, + "grad_norm": 11.703762405300274, + "learning_rate": 4.9996945637914626e-05, + "loss": 2.8361, + "mean_token_accuracy": 0.3620689630508423, + "step": 54590 + }, + { + "epoch": 0.05498866384446251, + "grad_norm": 12.581152816015233, + "learning_rate": 4.999693946076413e-05, + "loss": 2.7149, + "mean_token_accuracy": 0.38275861740112305, + "step": 54595 + }, + { + "epoch": 0.05499369989756668, + "grad_norm": 11.198934160804313, + "learning_rate": 4.9996933277374036e-05, + "loss": 2.682, + "mean_token_accuracy": 0.4034482777118683, + "step": 54600 + }, + { + "epoch": 0.054998735950670855, + "grad_norm": 10.951667333775461, + "learning_rate": 4.999692708774433e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.42068966031074523, + "step": 54605 + }, + { + "epoch": 0.05500377200377503, + "grad_norm": 12.258298067842022, + "learning_rate": 4.999692089187502e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.4344827651977539, + "step": 54610 + }, + { + "epoch": 0.055008808056879195, + "grad_norm": 12.57908508116266, + "learning_rate": 4.99969146897661e-05, + "loss": 2.6359, + "mean_token_accuracy": 0.39655172228813174, + "step": 54615 + }, + { + "epoch": 0.05501384410998337, + "grad_norm": 13.454749275789666, + "learning_rate": 4.9996908481417583e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.4517241358757019, + "step": 54620 + }, + { + "epoch": 0.05501888016308754, + "grad_norm": 13.591296765081454, + "learning_rate": 4.9996902266829454e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.3999999940395355, + "step": 54625 + }, + { + "epoch": 0.055023916216191716, + "grad_norm": 12.201824360800561, + "learning_rate": 4.9996896046001734e-05, + "loss": 2.5995, + "mean_token_accuracy": 0.4137930989265442, + "step": 54630 + }, + { + "epoch": 0.05502895226929589, + "grad_norm": 12.196068308050652, + "learning_rate": 4.999688981893441e-05, + "loss": 2.9516, + "mean_token_accuracy": 0.38777979016304015, + "step": 54635 + }, + { + "epoch": 0.055033988322400064, + "grad_norm": 9.339111483778993, + "learning_rate": 4.999688358562749e-05, + "loss": 2.761, + "mean_token_accuracy": 0.4012704133987427, + "step": 54640 + }, + { + "epoch": 0.05503902437550424, + "grad_norm": 10.293807014862155, + "learning_rate": 4.9996877346080985e-05, + "loss": 2.5875, + "mean_token_accuracy": 0.3999999940395355, + "step": 54645 + }, + { + "epoch": 0.055044060428608405, + "grad_norm": 18.971478335498414, + "learning_rate": 4.999687110029487e-05, + "loss": 2.67, + "mean_token_accuracy": 0.38275861740112305, + "step": 54650 + }, + { + "epoch": 0.05504909648171258, + "grad_norm": 8.637741971439697, + "learning_rate": 4.999686484826918e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.44273399114608764, + "step": 54655 + }, + { + "epoch": 0.05505413253481675, + "grad_norm": 11.226614721447172, + "learning_rate": 4.999685859000389e-05, + "loss": 2.1836, + "mean_token_accuracy": 0.4413793087005615, + "step": 54660 + }, + { + "epoch": 0.055059168587920926, + "grad_norm": 12.386958927601722, + "learning_rate": 4.9996852325499025e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.37241379618644715, + "step": 54665 + }, + { + "epoch": 0.0550642046410251, + "grad_norm": 9.939035573701915, + "learning_rate": 4.999684605475456e-05, + "loss": 2.6086, + "mean_token_accuracy": 0.4, + "step": 54670 + }, + { + "epoch": 0.05506924069412927, + "grad_norm": 11.172244204017614, + "learning_rate": 4.999683977777052e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.41379310488700866, + "step": 54675 + }, + { + "epoch": 0.05507427674723345, + "grad_norm": 10.333740799497116, + "learning_rate": 4.9996833494546894e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.42934059500694277, + "step": 54680 + }, + { + "epoch": 0.055079312800337614, + "grad_norm": 12.316261347915422, + "learning_rate": 4.999682720508369e-05, + "loss": 2.5662, + "mean_token_accuracy": 0.37241379022598264, + "step": 54685 + }, + { + "epoch": 0.05508434885344179, + "grad_norm": 16.82090551280434, + "learning_rate": 4.9996820909380905e-05, + "loss": 2.8953, + "mean_token_accuracy": 0.3068965494632721, + "step": 54690 + }, + { + "epoch": 0.05508938490654596, + "grad_norm": 16.188878317780716, + "learning_rate": 4.999681460743854e-05, + "loss": 2.7584, + "mean_token_accuracy": 0.36551723480224607, + "step": 54695 + }, + { + "epoch": 0.055094420959650135, + "grad_norm": 15.858139182180416, + "learning_rate": 4.99968082992566e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.41034482717514037, + "step": 54700 + }, + { + "epoch": 0.05509945701275431, + "grad_norm": 11.342753520822262, + "learning_rate": 4.999680198483509e-05, + "loss": 2.7252, + "mean_token_accuracy": 0.3517241358757019, + "step": 54705 + }, + { + "epoch": 0.05510449306585848, + "grad_norm": 11.24062554770422, + "learning_rate": 4.9996795664174006e-05, + "loss": 3.0165, + "mean_token_accuracy": 0.3931034505367279, + "step": 54710 + }, + { + "epoch": 0.05510952911896266, + "grad_norm": 12.371929176551218, + "learning_rate": 4.999678933727335e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.42413793206214906, + "step": 54715 + }, + { + "epoch": 0.05511456517206682, + "grad_norm": 11.723007719062226, + "learning_rate": 4.999678300413313e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.46206897497177124, + "step": 54720 + }, + { + "epoch": 0.055119601225171, + "grad_norm": 10.453787645205768, + "learning_rate": 4.999677666475335e-05, + "loss": 2.2804, + "mean_token_accuracy": 0.42413792610168455, + "step": 54725 + }, + { + "epoch": 0.05512463727827517, + "grad_norm": 13.661984922244748, + "learning_rate": 4.999677031913398e-05, + "loss": 2.7923, + "mean_token_accuracy": 0.39310344457626345, + "step": 54730 + }, + { + "epoch": 0.055129673331379345, + "grad_norm": 9.843513305710964, + "learning_rate": 4.9996763967275076e-05, + "loss": 2.668, + "mean_token_accuracy": 0.4413793087005615, + "step": 54735 + }, + { + "epoch": 0.05513470938448352, + "grad_norm": 9.839692307253982, + "learning_rate": 4.999675760917659e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.4034482777118683, + "step": 54740 + }, + { + "epoch": 0.05513974543758769, + "grad_norm": 13.187591163422763, + "learning_rate": 4.9996751244838554e-05, + "loss": 2.4789, + "mean_token_accuracy": 0.4172413766384125, + "step": 54745 + }, + { + "epoch": 0.055144781490691866, + "grad_norm": 16.90574741904902, + "learning_rate": 4.999674487426096e-05, + "loss": 2.9113, + "mean_token_accuracy": 0.3655172407627106, + "step": 54750 + }, + { + "epoch": 0.05514981754379603, + "grad_norm": 13.186336481179007, + "learning_rate": 4.999673849744381e-05, + "loss": 2.5756, + "mean_token_accuracy": 0.37241379618644715, + "step": 54755 + }, + { + "epoch": 0.05515485359690021, + "grad_norm": 14.696653946080515, + "learning_rate": 4.99967321143871e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.41379310488700866, + "step": 54760 + }, + { + "epoch": 0.05515988965000438, + "grad_norm": 10.572708750101505, + "learning_rate": 4.999672572509084e-05, + "loss": 2.4267, + "mean_token_accuracy": 0.44331517815589905, + "step": 54765 + }, + { + "epoch": 0.055164925703108554, + "grad_norm": 13.739411337779165, + "learning_rate": 4.999671932955503e-05, + "loss": 2.4821, + "mean_token_accuracy": 0.4137930989265442, + "step": 54770 + }, + { + "epoch": 0.05516996175621273, + "grad_norm": 24.99825366576518, + "learning_rate": 4.999671292777968e-05, + "loss": 2.5, + "mean_token_accuracy": 0.4068965494632721, + "step": 54775 + }, + { + "epoch": 0.0551749978093169, + "grad_norm": 16.297228687295462, + "learning_rate": 4.999670651976477e-05, + "loss": 2.8089, + "mean_token_accuracy": 0.4, + "step": 54780 + }, + { + "epoch": 0.055180033862421075, + "grad_norm": 14.188861622743655, + "learning_rate": 4.9996700105510315e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.41379310488700866, + "step": 54785 + }, + { + "epoch": 0.05518506991552524, + "grad_norm": 13.417367038458002, + "learning_rate": 4.999669368501633e-05, + "loss": 2.3871, + "mean_token_accuracy": 0.4379310369491577, + "step": 54790 + }, + { + "epoch": 0.055190105968629416, + "grad_norm": 11.90027213645835, + "learning_rate": 4.999668725828278e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.4137930929660797, + "step": 54795 + }, + { + "epoch": 0.05519514202173359, + "grad_norm": 11.830590540658173, + "learning_rate": 4.9996680825309704e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.38965516686439516, + "step": 54800 + }, + { + "epoch": 0.055200178074837764, + "grad_norm": 13.40633330215915, + "learning_rate": 4.999667438609709e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.42758620977401735, + "step": 54805 + }, + { + "epoch": 0.05520521412794194, + "grad_norm": 11.422063169877035, + "learning_rate": 4.999666794064494e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.3793103516101837, + "step": 54810 + }, + { + "epoch": 0.05521025018104611, + "grad_norm": 10.719332551447245, + "learning_rate": 4.999666148895325e-05, + "loss": 2.1441, + "mean_token_accuracy": 0.44827585816383364, + "step": 54815 + }, + { + "epoch": 0.055215286234150285, + "grad_norm": 14.500468249219592, + "learning_rate": 4.9996655031022025e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.37241379022598264, + "step": 54820 + }, + { + "epoch": 0.05522032228725445, + "grad_norm": 12.90705665187925, + "learning_rate": 4.999664856685128e-05, + "loss": 2.7797, + "mean_token_accuracy": 0.4, + "step": 54825 + }, + { + "epoch": 0.055225358340358625, + "grad_norm": 11.555240877864215, + "learning_rate": 4.9996642096441006e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.4379310369491577, + "step": 54830 + }, + { + "epoch": 0.0552303943934628, + "grad_norm": 11.067382229684112, + "learning_rate": 4.9996635619791196e-05, + "loss": 2.1078, + "mean_token_accuracy": 0.4931034505367279, + "step": 54835 + }, + { + "epoch": 0.05523543044656697, + "grad_norm": 9.955400835859779, + "learning_rate": 4.9996629136901864e-05, + "loss": 2.2166, + "mean_token_accuracy": 0.4517241299152374, + "step": 54840 + }, + { + "epoch": 0.05524046649967115, + "grad_norm": 13.587307787873684, + "learning_rate": 4.9996622647772995e-05, + "loss": 2.34, + "mean_token_accuracy": 0.38275861740112305, + "step": 54845 + }, + { + "epoch": 0.05524550255277532, + "grad_norm": 13.831716374193146, + "learning_rate": 4.999661615240462e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.33448276221752166, + "step": 54850 + }, + { + "epoch": 0.055250538605879494, + "grad_norm": 18.409390174585916, + "learning_rate": 4.9996609650796724e-05, + "loss": 2.8376, + "mean_token_accuracy": 0.3669691503047943, + "step": 54855 + }, + { + "epoch": 0.05525557465898366, + "grad_norm": 14.096979531387243, + "learning_rate": 4.99966031429493e-05, + "loss": 2.6939, + "mean_token_accuracy": 0.3448275804519653, + "step": 54860 + }, + { + "epoch": 0.055260610712087835, + "grad_norm": 12.132522164587572, + "learning_rate": 4.9996596628862363e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.4434361755847931, + "step": 54865 + }, + { + "epoch": 0.05526564676519201, + "grad_norm": 13.113101118002294, + "learning_rate": 4.999659010853591e-05, + "loss": 2.3233, + "mean_token_accuracy": 0.4793103575706482, + "step": 54870 + }, + { + "epoch": 0.05527068281829618, + "grad_norm": 12.160092444708887, + "learning_rate": 4.9996583581969946e-05, + "loss": 2.68, + "mean_token_accuracy": 0.36896551251411436, + "step": 54875 + }, + { + "epoch": 0.055275718871400356, + "grad_norm": 11.922595660041557, + "learning_rate": 4.9996577049164474e-05, + "loss": 2.5988, + "mean_token_accuracy": 0.39310345649719236, + "step": 54880 + }, + { + "epoch": 0.05528075492450453, + "grad_norm": 12.816866636640722, + "learning_rate": 4.999657051011949e-05, + "loss": 2.8605, + "mean_token_accuracy": 0.36896551847457887, + "step": 54885 + }, + { + "epoch": 0.055285790977608704, + "grad_norm": 13.323420661861707, + "learning_rate": 4.9996563964835e-05, + "loss": 2.4045, + "mean_token_accuracy": 0.41034482419490814, + "step": 54890 + }, + { + "epoch": 0.05529082703071287, + "grad_norm": 13.41144168505522, + "learning_rate": 4.9996557413311e-05, + "loss": 2.39, + "mean_token_accuracy": 0.41034482717514037, + "step": 54895 + }, + { + "epoch": 0.055295863083817044, + "grad_norm": 13.579021989642717, + "learning_rate": 4.999655085554749e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.3931034505367279, + "step": 54900 + }, + { + "epoch": 0.05530089913692122, + "grad_norm": 11.506504825942018, + "learning_rate": 4.999654429154449e-05, + "loss": 2.7671, + "mean_token_accuracy": 0.38620690405368807, + "step": 54905 + }, + { + "epoch": 0.05530593519002539, + "grad_norm": 12.614480926633107, + "learning_rate": 4.999653772130198e-05, + "loss": 2.8117, + "mean_token_accuracy": 0.3482758641242981, + "step": 54910 + }, + { + "epoch": 0.055310971243129566, + "grad_norm": 18.655108904377578, + "learning_rate": 4.9996531144819974e-05, + "loss": 2.6272, + "mean_token_accuracy": 0.4310344815254211, + "step": 54915 + }, + { + "epoch": 0.05531600729623374, + "grad_norm": 16.31915096539104, + "learning_rate": 4.999652456209847e-05, + "loss": 2.2107, + "mean_token_accuracy": 0.47241380214691164, + "step": 54920 + }, + { + "epoch": 0.05532104334933791, + "grad_norm": 12.164337612431556, + "learning_rate": 4.999651797313748e-05, + "loss": 2.4467, + "mean_token_accuracy": 0.41034482717514037, + "step": 54925 + }, + { + "epoch": 0.05532607940244208, + "grad_norm": 11.09874962605666, + "learning_rate": 4.999651137793699e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.4137930989265442, + "step": 54930 + }, + { + "epoch": 0.055331115455546254, + "grad_norm": 11.873227816740608, + "learning_rate": 4.9996504776497e-05, + "loss": 2.4948, + "mean_token_accuracy": 0.4206896543502808, + "step": 54935 + }, + { + "epoch": 0.05533615150865043, + "grad_norm": 9.231988415391495, + "learning_rate": 4.999649816881754e-05, + "loss": 2.5976, + "mean_token_accuracy": 0.3862068861722946, + "step": 54940 + }, + { + "epoch": 0.0553411875617546, + "grad_norm": 10.243498909869446, + "learning_rate": 4.9996491554898585e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.4486453115940094, + "step": 54945 + }, + { + "epoch": 0.055346223614858775, + "grad_norm": 13.308800715201759, + "learning_rate": 4.999648493474014e-05, + "loss": 2.4796, + "mean_token_accuracy": 0.4, + "step": 54950 + }, + { + "epoch": 0.05535125966796295, + "grad_norm": 12.46458750927132, + "learning_rate": 4.999647830834221e-05, + "loss": 2.7676, + "mean_token_accuracy": 0.4206896543502808, + "step": 54955 + }, + { + "epoch": 0.05535629572106712, + "grad_norm": 12.439676291250857, + "learning_rate": 4.9996471675704795e-05, + "loss": 2.6972, + "mean_token_accuracy": 0.43793103098869324, + "step": 54960 + }, + { + "epoch": 0.05536133177417129, + "grad_norm": 15.166152034644938, + "learning_rate": 4.9996465036827914e-05, + "loss": 2.8801, + "mean_token_accuracy": 0.341379314661026, + "step": 54965 + }, + { + "epoch": 0.05536636782727546, + "grad_norm": 12.126232198079368, + "learning_rate": 4.999645839171154e-05, + "loss": 2.5715, + "mean_token_accuracy": 0.37241379022598264, + "step": 54970 + }, + { + "epoch": 0.05537140388037964, + "grad_norm": 20.05079463876575, + "learning_rate": 4.9996451740355694e-05, + "loss": 2.8157, + "mean_token_accuracy": 0.40689654350280763, + "step": 54975 + }, + { + "epoch": 0.05537643993348381, + "grad_norm": 15.326401851155078, + "learning_rate": 4.9996445082760375e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.391288560628891, + "step": 54980 + }, + { + "epoch": 0.055381475986587984, + "grad_norm": 15.116986372941707, + "learning_rate": 4.999643841892558e-05, + "loss": 2.6962, + "mean_token_accuracy": 0.4325468838214874, + "step": 54985 + }, + { + "epoch": 0.05538651203969216, + "grad_norm": 11.004673847189965, + "learning_rate": 4.999643174885132e-05, + "loss": 2.357, + "mean_token_accuracy": 0.4275862008333206, + "step": 54990 + }, + { + "epoch": 0.05539154809279633, + "grad_norm": 13.684405634514107, + "learning_rate": 4.999642507253759e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.41034482717514037, + "step": 54995 + }, + { + "epoch": 0.0553965841459005, + "grad_norm": 13.130869878476563, + "learning_rate": 4.999641838998438e-05, + "loss": 2.3238, + "mean_token_accuracy": 0.458620685338974, + "step": 55000 + }, + { + "epoch": 0.05540162019900467, + "grad_norm": 14.063261380293753, + "learning_rate": 4.9996411701191714e-05, + "loss": 2.467, + "mean_token_accuracy": 0.3965517163276672, + "step": 55005 + }, + { + "epoch": 0.055406656252108846, + "grad_norm": 12.096044865196449, + "learning_rate": 4.9996405006159585e-05, + "loss": 2.7491, + "mean_token_accuracy": 0.38620689511299133, + "step": 55010 + }, + { + "epoch": 0.05541169230521302, + "grad_norm": 14.141718681045582, + "learning_rate": 4.999639830488799e-05, + "loss": 2.956, + "mean_token_accuracy": 0.38275861740112305, + "step": 55015 + }, + { + "epoch": 0.055416728358317194, + "grad_norm": 11.175247956516836, + "learning_rate": 4.999639159737694e-05, + "loss": 2.4365, + "mean_token_accuracy": 0.38620689511299133, + "step": 55020 + }, + { + "epoch": 0.05542176441142137, + "grad_norm": 14.315991564166548, + "learning_rate": 4.999638488362642e-05, + "loss": 2.9045, + "mean_token_accuracy": 0.3862069010734558, + "step": 55025 + }, + { + "epoch": 0.05542680046452554, + "grad_norm": 9.565224112753468, + "learning_rate": 4.999637816363646e-05, + "loss": 2.6469, + "mean_token_accuracy": 0.4186932861804962, + "step": 55030 + }, + { + "epoch": 0.05543183651762971, + "grad_norm": 11.566151668107477, + "learning_rate": 4.999637143740704e-05, + "loss": 3.2032, + "mean_token_accuracy": 0.33793103992938994, + "step": 55035 + }, + { + "epoch": 0.05543687257073388, + "grad_norm": 10.812055382251499, + "learning_rate": 4.9996364704938166e-05, + "loss": 2.6318, + "mean_token_accuracy": 0.4068965494632721, + "step": 55040 + }, + { + "epoch": 0.055441908623838056, + "grad_norm": 12.330865894265942, + "learning_rate": 4.9996357966229846e-05, + "loss": 2.4412, + "mean_token_accuracy": 0.4172413766384125, + "step": 55045 + }, + { + "epoch": 0.05544694467694223, + "grad_norm": 10.842066863927242, + "learning_rate": 4.9996351221282066e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.39310344457626345, + "step": 55050 + }, + { + "epoch": 0.0554519807300464, + "grad_norm": 9.823117900966967, + "learning_rate": 4.9996344470094844e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.43793103098869324, + "step": 55055 + }, + { + "epoch": 0.05545701678315058, + "grad_norm": 13.950908591630437, + "learning_rate": 4.999633771266818e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.4172413766384125, + "step": 55060 + }, + { + "epoch": 0.05546205283625475, + "grad_norm": 12.823019800805229, + "learning_rate": 4.999633094900208e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.42274652123451234, + "step": 55065 + }, + { + "epoch": 0.05546708888935892, + "grad_norm": 10.38702277510885, + "learning_rate": 4.9996324179096524e-05, + "loss": 2.381, + "mean_token_accuracy": 0.3931034505367279, + "step": 55070 + }, + { + "epoch": 0.05547212494246309, + "grad_norm": 10.821313330688637, + "learning_rate": 4.999631740295154e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.39310344457626345, + "step": 55075 + }, + { + "epoch": 0.055477160995567265, + "grad_norm": 11.114194031609195, + "learning_rate": 4.999631062056712e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.37586206793785093, + "step": 55080 + }, + { + "epoch": 0.05548219704867144, + "grad_norm": 10.485797390707267, + "learning_rate": 4.9996303831943254e-05, + "loss": 2.7935, + "mean_token_accuracy": 0.358620685338974, + "step": 55085 + }, + { + "epoch": 0.05548723310177561, + "grad_norm": 12.536233359062974, + "learning_rate": 4.999629703707995e-05, + "loss": 2.4863, + "mean_token_accuracy": 0.4137930989265442, + "step": 55090 + }, + { + "epoch": 0.055492269154879786, + "grad_norm": 12.769839693722233, + "learning_rate": 4.999629023597723e-05, + "loss": 2.4737, + "mean_token_accuracy": 0.42068966031074523, + "step": 55095 + }, + { + "epoch": 0.05549730520798396, + "grad_norm": 9.258883644066165, + "learning_rate": 4.999628342863507e-05, + "loss": 2.6758, + "mean_token_accuracy": 0.36757410764694215, + "step": 55100 + }, + { + "epoch": 0.05550234126108813, + "grad_norm": 12.27060918185047, + "learning_rate": 4.9996276615053475e-05, + "loss": 2.6892, + "mean_token_accuracy": 0.4068965494632721, + "step": 55105 + }, + { + "epoch": 0.0555073773141923, + "grad_norm": 14.654778470302091, + "learning_rate": 4.999626979523247e-05, + "loss": 2.8632, + "mean_token_accuracy": 0.39310344457626345, + "step": 55110 + }, + { + "epoch": 0.055512413367296475, + "grad_norm": 11.163513667086654, + "learning_rate": 4.999626296917204e-05, + "loss": 2.4608, + "mean_token_accuracy": 0.4137930989265442, + "step": 55115 + }, + { + "epoch": 0.05551744942040065, + "grad_norm": 14.131974380245504, + "learning_rate": 4.999625613687217e-05, + "loss": 2.8842, + "mean_token_accuracy": 0.379310342669487, + "step": 55120 + }, + { + "epoch": 0.05552248547350482, + "grad_norm": 12.94364310046765, + "learning_rate": 4.999624929833289e-05, + "loss": 3.0354, + "mean_token_accuracy": 0.3517241388559341, + "step": 55125 + }, + { + "epoch": 0.055527521526608996, + "grad_norm": 11.806762270520412, + "learning_rate": 4.9996242453554196e-05, + "loss": 2.4638, + "mean_token_accuracy": 0.417241370677948, + "step": 55130 + }, + { + "epoch": 0.05553255757971317, + "grad_norm": 16.698416340214298, + "learning_rate": 4.999623560253608e-05, + "loss": 2.7585, + "mean_token_accuracy": 0.3827586233615875, + "step": 55135 + }, + { + "epoch": 0.055537593632817336, + "grad_norm": 10.985591156033887, + "learning_rate": 4.999622874527855e-05, + "loss": 2.5439, + "mean_token_accuracy": 0.4620689630508423, + "step": 55140 + }, + { + "epoch": 0.05554262968592151, + "grad_norm": 9.43855972912326, + "learning_rate": 4.9996221881781605e-05, + "loss": 2.3659, + "mean_token_accuracy": 0.44652147889137267, + "step": 55145 + }, + { + "epoch": 0.055547665739025684, + "grad_norm": 16.989438754881242, + "learning_rate": 4.999621501204526e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.44827585816383364, + "step": 55150 + }, + { + "epoch": 0.05555270179212986, + "grad_norm": 14.04569726459809, + "learning_rate": 4.999620813606948e-05, + "loss": 2.23, + "mean_token_accuracy": 0.4413793087005615, + "step": 55155 + }, + { + "epoch": 0.05555773784523403, + "grad_norm": 13.265368926279924, + "learning_rate": 4.999620125385432e-05, + "loss": 3.3609, + "mean_token_accuracy": 0.3551724076271057, + "step": 55160 + }, + { + "epoch": 0.055562773898338205, + "grad_norm": 14.550531278267712, + "learning_rate": 4.999619436539974e-05, + "loss": 2.8696, + "mean_token_accuracy": 0.3931034505367279, + "step": 55165 + }, + { + "epoch": 0.05556780995144238, + "grad_norm": 11.762144796235553, + "learning_rate": 4.999618747070576e-05, + "loss": 2.6729, + "mean_token_accuracy": 0.36896551251411436, + "step": 55170 + }, + { + "epoch": 0.055572846004546546, + "grad_norm": 11.219606238524557, + "learning_rate": 4.9996180569772385e-05, + "loss": 2.5713, + "mean_token_accuracy": 0.3703569293022156, + "step": 55175 + }, + { + "epoch": 0.05557788205765072, + "grad_norm": 14.27059387550376, + "learning_rate": 4.99961736625996e-05, + "loss": 2.7238, + "mean_token_accuracy": 0.37586206793785093, + "step": 55180 + }, + { + "epoch": 0.05558291811075489, + "grad_norm": 11.34166498933833, + "learning_rate": 4.999616674918742e-05, + "loss": 2.5613, + "mean_token_accuracy": 0.34827586114406583, + "step": 55185 + }, + { + "epoch": 0.05558795416385907, + "grad_norm": 12.222003998124842, + "learning_rate": 4.999615982953586e-05, + "loss": 2.3422, + "mean_token_accuracy": 0.42758620381355283, + "step": 55190 + }, + { + "epoch": 0.05559299021696324, + "grad_norm": 12.704621209609298, + "learning_rate": 4.9996152903644885e-05, + "loss": 2.7622, + "mean_token_accuracy": 0.3620689630508423, + "step": 55195 + }, + { + "epoch": 0.055598026270067415, + "grad_norm": 10.818785098566572, + "learning_rate": 4.9996145971514526e-05, + "loss": 2.4925, + "mean_token_accuracy": 0.4103448152542114, + "step": 55200 + }, + { + "epoch": 0.05560306232317159, + "grad_norm": 14.540562821421046, + "learning_rate": 4.999613903314477e-05, + "loss": 2.8531, + "mean_token_accuracy": 0.3068965464830399, + "step": 55205 + }, + { + "epoch": 0.055608098376275755, + "grad_norm": 9.694080516003845, + "learning_rate": 4.999613208853563e-05, + "loss": 2.7534, + "mean_token_accuracy": 0.40689654350280763, + "step": 55210 + }, + { + "epoch": 0.05561313442937993, + "grad_norm": 11.99486416886706, + "learning_rate": 4.999612513768711e-05, + "loss": 2.2911, + "mean_token_accuracy": 0.482758617401123, + "step": 55215 + }, + { + "epoch": 0.0556181704824841, + "grad_norm": 12.69469958096461, + "learning_rate": 4.9996118180599205e-05, + "loss": 2.9784, + "mean_token_accuracy": 0.29655172526836393, + "step": 55220 + }, + { + "epoch": 0.05562320653558828, + "grad_norm": 13.064659655759353, + "learning_rate": 4.999611121727191e-05, + "loss": 2.7467, + "mean_token_accuracy": 0.37241379022598264, + "step": 55225 + }, + { + "epoch": 0.05562824258869245, + "grad_norm": 11.095888516987154, + "learning_rate": 4.999610424770524e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4172413766384125, + "step": 55230 + }, + { + "epoch": 0.055633278641796624, + "grad_norm": 11.891335015197479, + "learning_rate": 4.999609727189919e-05, + "loss": 2.4741, + "mean_token_accuracy": 0.3862068921327591, + "step": 55235 + }, + { + "epoch": 0.0556383146949008, + "grad_norm": 12.038023638940437, + "learning_rate": 4.9996090289853766e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.41034482717514037, + "step": 55240 + }, + { + "epoch": 0.055643350748004965, + "grad_norm": 10.037299761142773, + "learning_rate": 4.999608330156897e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.4655172348022461, + "step": 55245 + }, + { + "epoch": 0.05564838680110914, + "grad_norm": 11.698946265773344, + "learning_rate": 4.999607630704479e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.4034482717514038, + "step": 55250 + }, + { + "epoch": 0.05565342285421331, + "grad_norm": 11.989167923008816, + "learning_rate": 4.999606930628125e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.3551724165678024, + "step": 55255 + }, + { + "epoch": 0.055658458907317486, + "grad_norm": 11.92598926128034, + "learning_rate": 4.999606229927834e-05, + "loss": 2.6421, + "mean_token_accuracy": 0.37931033968925476, + "step": 55260 + }, + { + "epoch": 0.05566349496042166, + "grad_norm": 9.733177344807933, + "learning_rate": 4.999605528603606e-05, + "loss": 2.9483, + "mean_token_accuracy": 0.3551724135875702, + "step": 55265 + }, + { + "epoch": 0.055668531013525833, + "grad_norm": 12.984678228421926, + "learning_rate": 4.9996048266554416e-05, + "loss": 2.5744, + "mean_token_accuracy": 0.4034482777118683, + "step": 55270 + }, + { + "epoch": 0.05567356706663001, + "grad_norm": 11.436758068912521, + "learning_rate": 4.999604124083341e-05, + "loss": 3.0646, + "mean_token_accuracy": 0.3724137842655182, + "step": 55275 + }, + { + "epoch": 0.055678603119734174, + "grad_norm": 11.825761933554366, + "learning_rate": 4.999603420887304e-05, + "loss": 2.6151, + "mean_token_accuracy": 0.4206896543502808, + "step": 55280 + }, + { + "epoch": 0.05568363917283835, + "grad_norm": 12.182385662508128, + "learning_rate": 4.999602717067332e-05, + "loss": 2.3252, + "mean_token_accuracy": 0.46551724076271056, + "step": 55285 + }, + { + "epoch": 0.05568867522594252, + "grad_norm": 13.670035836345328, + "learning_rate": 4.999602012623423e-05, + "loss": 2.7682, + "mean_token_accuracy": 0.3793103456497192, + "step": 55290 + }, + { + "epoch": 0.055693711279046695, + "grad_norm": 11.172175710103827, + "learning_rate": 4.999601307555579e-05, + "loss": 2.2023, + "mean_token_accuracy": 0.44827585816383364, + "step": 55295 + }, + { + "epoch": 0.05569874733215087, + "grad_norm": 22.686569157079127, + "learning_rate": 4.9996006018637995e-05, + "loss": 2.7921, + "mean_token_accuracy": 0.3413793116807938, + "step": 55300 + }, + { + "epoch": 0.05570378338525504, + "grad_norm": 12.437272474566399, + "learning_rate": 4.9995998955480854e-05, + "loss": 2.5276, + "mean_token_accuracy": 0.37586207389831544, + "step": 55305 + }, + { + "epoch": 0.05570881943835922, + "grad_norm": 12.45842858314631, + "learning_rate": 4.999599188608437e-05, + "loss": 2.529, + "mean_token_accuracy": 0.41379310488700866, + "step": 55310 + }, + { + "epoch": 0.055713855491463384, + "grad_norm": 11.484855808485914, + "learning_rate": 4.999598481044853e-05, + "loss": 2.8285, + "mean_token_accuracy": 0.38275861740112305, + "step": 55315 + }, + { + "epoch": 0.05571889154456756, + "grad_norm": 10.70018745369348, + "learning_rate": 4.999597772857334e-05, + "loss": 2.78, + "mean_token_accuracy": 0.3379310339689255, + "step": 55320 + }, + { + "epoch": 0.05572392759767173, + "grad_norm": 10.836576268570814, + "learning_rate": 4.999597064045881e-05, + "loss": 2.383, + "mean_token_accuracy": 0.4310344815254211, + "step": 55325 + }, + { + "epoch": 0.055728963650775905, + "grad_norm": 8.854809769308766, + "learning_rate": 4.999596354610495e-05, + "loss": 2.8599, + "mean_token_accuracy": 0.3793103456497192, + "step": 55330 + }, + { + "epoch": 0.05573399970388008, + "grad_norm": 11.066354031969151, + "learning_rate": 4.999595644551173e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.41379310488700866, + "step": 55335 + }, + { + "epoch": 0.05573903575698425, + "grad_norm": 12.040554830990093, + "learning_rate": 4.9995949338679186e-05, + "loss": 2.742, + "mean_token_accuracy": 0.40344828367233276, + "step": 55340 + }, + { + "epoch": 0.055744071810088426, + "grad_norm": 19.297636080693888, + "learning_rate": 4.99959422256073e-05, + "loss": 3.1752, + "mean_token_accuracy": 0.31379309594631194, + "step": 55345 + }, + { + "epoch": 0.05574910786319259, + "grad_norm": 11.888817621477607, + "learning_rate": 4.9995935106296085e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.39310344457626345, + "step": 55350 + }, + { + "epoch": 0.05575414391629677, + "grad_norm": 10.485442336677337, + "learning_rate": 4.9995927980745535e-05, + "loss": 2.427, + "mean_token_accuracy": 0.43448275327682495, + "step": 55355 + }, + { + "epoch": 0.05575917996940094, + "grad_norm": 11.66947506324508, + "learning_rate": 4.9995920848955666e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.38620689511299133, + "step": 55360 + }, + { + "epoch": 0.055764216022505114, + "grad_norm": 9.83141530421554, + "learning_rate": 4.9995913710926464e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.4448275864124298, + "step": 55365 + }, + { + "epoch": 0.05576925207560929, + "grad_norm": 11.424608987189329, + "learning_rate": 4.999590656665793e-05, + "loss": 2.7924, + "mean_token_accuracy": 0.35862069129943847, + "step": 55370 + }, + { + "epoch": 0.05577428812871346, + "grad_norm": 10.748982709331269, + "learning_rate": 4.9995899416150076e-05, + "loss": 2.8064, + "mean_token_accuracy": 0.3620689630508423, + "step": 55375 + }, + { + "epoch": 0.055779324181817636, + "grad_norm": 12.826696067805928, + "learning_rate": 4.99958922594029e-05, + "loss": 2.7681, + "mean_token_accuracy": 0.3674531102180481, + "step": 55380 + }, + { + "epoch": 0.0557843602349218, + "grad_norm": 15.455034190302108, + "learning_rate": 4.9995885096416404e-05, + "loss": 2.5121, + "mean_token_accuracy": 0.43448275327682495, + "step": 55385 + }, + { + "epoch": 0.055789396288025976, + "grad_norm": 13.671120657076818, + "learning_rate": 4.999587792719059e-05, + "loss": 2.7879, + "mean_token_accuracy": 0.358620685338974, + "step": 55390 + }, + { + "epoch": 0.05579443234113015, + "grad_norm": 12.232140129938331, + "learning_rate": 4.999587075172546e-05, + "loss": 2.5779, + "mean_token_accuracy": 0.4068965554237366, + "step": 55395 + }, + { + "epoch": 0.055799468394234324, + "grad_norm": 11.820937504017776, + "learning_rate": 4.9995863570021026e-05, + "loss": 2.306, + "mean_token_accuracy": 0.44827585220336913, + "step": 55400 + }, + { + "epoch": 0.0558045044473385, + "grad_norm": 13.893171730636613, + "learning_rate": 4.999585638207727e-05, + "loss": 2.3531, + "mean_token_accuracy": 0.43103448748588563, + "step": 55405 + }, + { + "epoch": 0.05580954050044267, + "grad_norm": 24.812036864556504, + "learning_rate": 4.99958491878942e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4881773412227631, + "step": 55410 + }, + { + "epoch": 0.055814576553546845, + "grad_norm": 13.471408796672502, + "learning_rate": 4.9995841987471835e-05, + "loss": 2.6876, + "mean_token_accuracy": 0.4103448331356049, + "step": 55415 + }, + { + "epoch": 0.05581961260665101, + "grad_norm": 15.37994208949692, + "learning_rate": 4.9995834780810156e-05, + "loss": 2.2825, + "mean_token_accuracy": 0.46551724076271056, + "step": 55420 + }, + { + "epoch": 0.055824648659755186, + "grad_norm": 14.850270434701018, + "learning_rate": 4.9995827567909185e-05, + "loss": 2.3724, + "mean_token_accuracy": 0.41379310488700866, + "step": 55425 + }, + { + "epoch": 0.05582968471285936, + "grad_norm": 10.873612191458651, + "learning_rate": 4.9995820348768894e-05, + "loss": 2.5251, + "mean_token_accuracy": 0.41379311084747317, + "step": 55430 + }, + { + "epoch": 0.05583472076596353, + "grad_norm": 10.343179244525905, + "learning_rate": 4.999581312338931e-05, + "loss": 2.365, + "mean_token_accuracy": 0.44827587008476255, + "step": 55435 + }, + { + "epoch": 0.05583975681906771, + "grad_norm": 14.56217552975939, + "learning_rate": 4.9995805891770436e-05, + "loss": 2.7866, + "mean_token_accuracy": 0.3517241358757019, + "step": 55440 + }, + { + "epoch": 0.05584479287217188, + "grad_norm": 12.006735489290199, + "learning_rate": 4.999579865391226e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.4109605967998505, + "step": 55445 + }, + { + "epoch": 0.055849828925276054, + "grad_norm": 11.601561176441633, + "learning_rate": 4.999579140981479e-05, + "loss": 2.7549, + "mean_token_accuracy": 0.3793103456497192, + "step": 55450 + }, + { + "epoch": 0.05585486497838022, + "grad_norm": 13.315224014545223, + "learning_rate": 4.999578415947803e-05, + "loss": 2.5823, + "mean_token_accuracy": 0.37586206793785093, + "step": 55455 + }, + { + "epoch": 0.055859901031484395, + "grad_norm": 14.807845251258229, + "learning_rate": 4.999577690290198e-05, + "loss": 2.7736, + "mean_token_accuracy": 0.4000000059604645, + "step": 55460 + }, + { + "epoch": 0.05586493708458857, + "grad_norm": 10.425430771972495, + "learning_rate": 4.9995769640086645e-05, + "loss": 2.5801, + "mean_token_accuracy": 0.37931033968925476, + "step": 55465 + }, + { + "epoch": 0.05586997313769274, + "grad_norm": 12.085998532704712, + "learning_rate": 4.9995762371032025e-05, + "loss": 2.6439, + "mean_token_accuracy": 0.3793103456497192, + "step": 55470 + }, + { + "epoch": 0.055875009190796916, + "grad_norm": 10.901559273113412, + "learning_rate": 4.999575509573812e-05, + "loss": 2.3454, + "mean_token_accuracy": 0.44827585816383364, + "step": 55475 + }, + { + "epoch": 0.05588004524390109, + "grad_norm": 13.592038832710685, + "learning_rate": 4.999574781420493e-05, + "loss": 3.045, + "mean_token_accuracy": 0.32068965435028074, + "step": 55480 + }, + { + "epoch": 0.055885081297005264, + "grad_norm": 13.372528576934323, + "learning_rate": 4.9995740526432456e-05, + "loss": 2.6479, + "mean_token_accuracy": 0.3931034505367279, + "step": 55485 + }, + { + "epoch": 0.05589011735010943, + "grad_norm": 19.076839293726408, + "learning_rate": 4.999573323242071e-05, + "loss": 3.2535, + "mean_token_accuracy": 0.320689657330513, + "step": 55490 + }, + { + "epoch": 0.055895153403213604, + "grad_norm": 17.44462576839078, + "learning_rate": 4.999572593216969e-05, + "loss": 2.7354, + "mean_token_accuracy": 0.3950393259525299, + "step": 55495 + }, + { + "epoch": 0.05590018945631778, + "grad_norm": 12.35275274178227, + "learning_rate": 4.9995718625679395e-05, + "loss": 2.8131, + "mean_token_accuracy": 0.33793103098869326, + "step": 55500 + }, + { + "epoch": 0.05590522550942195, + "grad_norm": 11.678609786300013, + "learning_rate": 4.999571131294983e-05, + "loss": 2.5529, + "mean_token_accuracy": 0.4172413766384125, + "step": 55505 + }, + { + "epoch": 0.055910261562526126, + "grad_norm": 9.885349932085091, + "learning_rate": 4.9995703993980995e-05, + "loss": 2.4263, + "mean_token_accuracy": 0.4291590929031372, + "step": 55510 + }, + { + "epoch": 0.0559152976156303, + "grad_norm": 10.978215761057383, + "learning_rate": 4.999569666877289e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.3999999940395355, + "step": 55515 + }, + { + "epoch": 0.05592033366873447, + "grad_norm": 11.804899225600598, + "learning_rate": 4.999568933732552e-05, + "loss": 2.4634, + "mean_token_accuracy": 0.38620689511299133, + "step": 55520 + }, + { + "epoch": 0.05592536972183864, + "grad_norm": 14.5273973854347, + "learning_rate": 4.99956819996389e-05, + "loss": 2.8908, + "mean_token_accuracy": 0.3827586114406586, + "step": 55525 + }, + { + "epoch": 0.055930405774942814, + "grad_norm": 11.646108210631468, + "learning_rate": 4.9995674655712994e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.3482758581638336, + "step": 55530 + }, + { + "epoch": 0.05593544182804699, + "grad_norm": 12.103754202466247, + "learning_rate": 4.9995667305547847e-05, + "loss": 2.411, + "mean_token_accuracy": 0.4034482717514038, + "step": 55535 + }, + { + "epoch": 0.05594047788115116, + "grad_norm": 14.91880903910126, + "learning_rate": 4.9995659949143434e-05, + "loss": 2.7799, + "mean_token_accuracy": 0.36654567122459414, + "step": 55540 + }, + { + "epoch": 0.055945513934255335, + "grad_norm": 11.824224370379714, + "learning_rate": 4.999565258649977e-05, + "loss": 2.8428, + "mean_token_accuracy": 0.39655172228813174, + "step": 55545 + }, + { + "epoch": 0.05595054998735951, + "grad_norm": 13.08274366988657, + "learning_rate": 4.9995645217616855e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.37241379618644715, + "step": 55550 + }, + { + "epoch": 0.05595558604046368, + "grad_norm": 12.739105684928443, + "learning_rate": 4.999563784249468e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.3655172288417816, + "step": 55555 + }, + { + "epoch": 0.05596062209356785, + "grad_norm": 14.869876125362172, + "learning_rate": 4.999563046113327e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.3931034505367279, + "step": 55560 + }, + { + "epoch": 0.05596565814667202, + "grad_norm": 13.541932171509355, + "learning_rate": 4.99956230735326e-05, + "loss": 2.4727, + "mean_token_accuracy": 0.4034482777118683, + "step": 55565 + }, + { + "epoch": 0.0559706941997762, + "grad_norm": 13.002342327516184, + "learning_rate": 4.9995615679692685e-05, + "loss": 2.6708, + "mean_token_accuracy": 0.38620689511299133, + "step": 55570 + }, + { + "epoch": 0.05597573025288037, + "grad_norm": 12.362158743427823, + "learning_rate": 4.999560827961354e-05, + "loss": 2.6806, + "mean_token_accuracy": 0.3931034505367279, + "step": 55575 + }, + { + "epoch": 0.055980766305984545, + "grad_norm": 11.83658952349421, + "learning_rate": 4.999560087329514e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.3931034505367279, + "step": 55580 + }, + { + "epoch": 0.05598580235908872, + "grad_norm": 11.100836656050399, + "learning_rate": 4.9995593460737514e-05, + "loss": 2.148, + "mean_token_accuracy": 0.4620689630508423, + "step": 55585 + }, + { + "epoch": 0.05599083841219289, + "grad_norm": 12.676637616371321, + "learning_rate": 4.999558604194064e-05, + "loss": 2.6513, + "mean_token_accuracy": 0.4, + "step": 55590 + }, + { + "epoch": 0.05599587446529706, + "grad_norm": 10.279832895428571, + "learning_rate": 4.999557861690454e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4517241358757019, + "step": 55595 + }, + { + "epoch": 0.05600091051840123, + "grad_norm": 13.650754241162055, + "learning_rate": 4.9995571185629195e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.4310344815254211, + "step": 55600 + }, + { + "epoch": 0.056005946571505406, + "grad_norm": 11.493670205352528, + "learning_rate": 4.9995563748114635e-05, + "loss": 2.3417, + "mean_token_accuracy": 0.4413793206214905, + "step": 55605 + }, + { + "epoch": 0.05601098262460958, + "grad_norm": 11.946136066864376, + "learning_rate": 4.999555630436084e-05, + "loss": 2.7034, + "mean_token_accuracy": 0.37586206793785093, + "step": 55610 + }, + { + "epoch": 0.056016018677713754, + "grad_norm": 13.4038665965711, + "learning_rate": 4.9995548854367816e-05, + "loss": 2.7833, + "mean_token_accuracy": 0.37241379022598264, + "step": 55615 + }, + { + "epoch": 0.05602105473081793, + "grad_norm": 11.994142765887478, + "learning_rate": 4.999554139813556e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.4310344934463501, + "step": 55620 + }, + { + "epoch": 0.0560260907839221, + "grad_norm": 10.846849599478308, + "learning_rate": 4.99955339356641e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.4517241418361664, + "step": 55625 + }, + { + "epoch": 0.05603112683702627, + "grad_norm": 15.755515820038195, + "learning_rate": 4.9995526466953404e-05, + "loss": 2.804, + "mean_token_accuracy": 0.39310344457626345, + "step": 55630 + }, + { + "epoch": 0.05603616289013044, + "grad_norm": 9.815562962525917, + "learning_rate": 4.9995518992003505e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.39310344457626345, + "step": 55635 + }, + { + "epoch": 0.056041198943234616, + "grad_norm": 12.549017794520173, + "learning_rate": 4.9995511510814375e-05, + "loss": 3.0064, + "mean_token_accuracy": 0.3172413736581802, + "step": 55640 + }, + { + "epoch": 0.05604623499633879, + "grad_norm": 12.804856187185829, + "learning_rate": 4.9995504023386034e-05, + "loss": 2.658, + "mean_token_accuracy": 0.3758620619773865, + "step": 55645 + }, + { + "epoch": 0.05605127104944296, + "grad_norm": 12.347900923551641, + "learning_rate": 4.999549652971849e-05, + "loss": 2.6637, + "mean_token_accuracy": 0.3551724076271057, + "step": 55650 + }, + { + "epoch": 0.05605630710254714, + "grad_norm": 11.287367752838922, + "learning_rate": 4.999548902981173e-05, + "loss": 2.3648, + "mean_token_accuracy": 0.4413793087005615, + "step": 55655 + }, + { + "epoch": 0.05606134315565131, + "grad_norm": 12.158360580826953, + "learning_rate": 4.999548152366576e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.3344827502965927, + "step": 55660 + }, + { + "epoch": 0.05606637920875548, + "grad_norm": 11.714547896287067, + "learning_rate": 4.999547401128058e-05, + "loss": 2.5602, + "mean_token_accuracy": 0.32413793802261354, + "step": 55665 + }, + { + "epoch": 0.05607141526185965, + "grad_norm": 10.959726244231172, + "learning_rate": 4.9995466492656215e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.42068966031074523, + "step": 55670 + }, + { + "epoch": 0.056076451314963825, + "grad_norm": 12.691551244843508, + "learning_rate": 4.999545896779263e-05, + "loss": 2.4154, + "mean_token_accuracy": 0.4188747763633728, + "step": 55675 + }, + { + "epoch": 0.056081487368068, + "grad_norm": 12.216984822867625, + "learning_rate": 4.9995451436689854e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.4050211668014526, + "step": 55680 + }, + { + "epoch": 0.05608652342117217, + "grad_norm": 12.83083705583494, + "learning_rate": 4.9995443899347874e-05, + "loss": 2.807, + "mean_token_accuracy": 0.4068965494632721, + "step": 55685 + }, + { + "epoch": 0.05609155947427635, + "grad_norm": 12.43070173344862, + "learning_rate": 4.999543635576671e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.4689655125141144, + "step": 55690 + }, + { + "epoch": 0.05609659552738052, + "grad_norm": 11.726576799844116, + "learning_rate": 4.999542880594634e-05, + "loss": 2.8829, + "mean_token_accuracy": 0.3655172407627106, + "step": 55695 + }, + { + "epoch": 0.05610163158048469, + "grad_norm": 10.989027421236075, + "learning_rate": 4.999542124988679e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4344827651977539, + "step": 55700 + }, + { + "epoch": 0.05610666763358886, + "grad_norm": 14.314008801388935, + "learning_rate": 4.999541368758804e-05, + "loss": 2.7678, + "mean_token_accuracy": 0.4, + "step": 55705 + }, + { + "epoch": 0.056111703686693035, + "grad_norm": 11.22885216669747, + "learning_rate": 4.999540611905012e-05, + "loss": 2.7311, + "mean_token_accuracy": 0.3827586114406586, + "step": 55710 + }, + { + "epoch": 0.05611673973979721, + "grad_norm": 13.384945511617811, + "learning_rate": 4.9995398544273e-05, + "loss": 2.6346, + "mean_token_accuracy": 0.37586206793785093, + "step": 55715 + }, + { + "epoch": 0.05612177579290138, + "grad_norm": 9.91603656885435, + "learning_rate": 4.999539096325671e-05, + "loss": 2.7614, + "mean_token_accuracy": 0.3448275804519653, + "step": 55720 + }, + { + "epoch": 0.056126811846005556, + "grad_norm": 12.446619852554413, + "learning_rate": 4.9995383376001225e-05, + "loss": 2.4104, + "mean_token_accuracy": 0.43309134244918823, + "step": 55725 + }, + { + "epoch": 0.05613184789910973, + "grad_norm": 12.557396447300956, + "learning_rate": 4.9995375782506564e-05, + "loss": 2.5875, + "mean_token_accuracy": 0.39655172228813174, + "step": 55730 + }, + { + "epoch": 0.0561368839522139, + "grad_norm": 17.408677750925104, + "learning_rate": 4.999536818277273e-05, + "loss": 2.6708, + "mean_token_accuracy": 0.41379310488700866, + "step": 55735 + }, + { + "epoch": 0.05614192000531807, + "grad_norm": 11.915190773259242, + "learning_rate": 4.9995360576799726e-05, + "loss": 2.6907, + "mean_token_accuracy": 0.334482753276825, + "step": 55740 + }, + { + "epoch": 0.056146956058422244, + "grad_norm": 24.665368382376034, + "learning_rate": 4.999535296458755e-05, + "loss": 2.9262, + "mean_token_accuracy": 0.34137930870056155, + "step": 55745 + }, + { + "epoch": 0.05615199211152642, + "grad_norm": 13.810800373524092, + "learning_rate": 4.9995345346136194e-05, + "loss": 2.6177, + "mean_token_accuracy": 0.35862069129943847, + "step": 55750 + }, + { + "epoch": 0.05615702816463059, + "grad_norm": 12.4799682799755, + "learning_rate": 4.9995337721445676e-05, + "loss": 2.7356, + "mean_token_accuracy": 0.3724137932062149, + "step": 55755 + }, + { + "epoch": 0.056162064217734765, + "grad_norm": 10.216149877141273, + "learning_rate": 4.999533009051599e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.47586206793785096, + "step": 55760 + }, + { + "epoch": 0.05616710027083894, + "grad_norm": 13.21183106887741, + "learning_rate": 4.9995322453347143e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.3671506345272064, + "step": 55765 + }, + { + "epoch": 0.056172136323943106, + "grad_norm": 10.91491129085444, + "learning_rate": 4.9995314809939135e-05, + "loss": 2.663, + "mean_token_accuracy": 0.4121597170829773, + "step": 55770 + }, + { + "epoch": 0.05617717237704728, + "grad_norm": 11.026360965791449, + "learning_rate": 4.9995307160291964e-05, + "loss": 2.2988, + "mean_token_accuracy": 0.3965517282485962, + "step": 55775 + }, + { + "epoch": 0.056182208430151453, + "grad_norm": 12.154726158672773, + "learning_rate": 4.999529950440564e-05, + "loss": 2.1225, + "mean_token_accuracy": 0.4517241358757019, + "step": 55780 + }, + { + "epoch": 0.05618724448325563, + "grad_norm": 13.978067003386796, + "learning_rate": 4.9995291842280165e-05, + "loss": 2.7723, + "mean_token_accuracy": 0.36896551847457887, + "step": 55785 + }, + { + "epoch": 0.0561922805363598, + "grad_norm": 19.492661877672266, + "learning_rate": 4.9995284173915524e-05, + "loss": 2.563, + "mean_token_accuracy": 0.3862069010734558, + "step": 55790 + }, + { + "epoch": 0.056197316589463975, + "grad_norm": 10.692583344408439, + "learning_rate": 4.999527649931174e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.44827585816383364, + "step": 55795 + }, + { + "epoch": 0.05620235264256815, + "grad_norm": 12.032537297566773, + "learning_rate": 4.9995268818468806e-05, + "loss": 2.299, + "mean_token_accuracy": 0.44700543880462645, + "step": 55800 + }, + { + "epoch": 0.056207388695672315, + "grad_norm": 13.987750025456895, + "learning_rate": 4.999526113138672e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.4448275864124298, + "step": 55805 + }, + { + "epoch": 0.05621242474877649, + "grad_norm": 14.669170392521945, + "learning_rate": 4.9995253438065495e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.4310344815254211, + "step": 55810 + }, + { + "epoch": 0.05621746080188066, + "grad_norm": 10.902668319945485, + "learning_rate": 4.999524573850513e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.37241379618644715, + "step": 55815 + }, + { + "epoch": 0.05622249685498484, + "grad_norm": 12.862289039856709, + "learning_rate": 4.999523803270562e-05, + "loss": 2.7704, + "mean_token_accuracy": 0.35862069129943847, + "step": 55820 + }, + { + "epoch": 0.05622753290808901, + "grad_norm": 14.175467101771451, + "learning_rate": 4.9995230320666976e-05, + "loss": 2.7167, + "mean_token_accuracy": 0.36206896901130675, + "step": 55825 + }, + { + "epoch": 0.056232568961193184, + "grad_norm": 9.621561650234693, + "learning_rate": 4.999522260238919e-05, + "loss": 2.6136, + "mean_token_accuracy": 0.4310344815254211, + "step": 55830 + }, + { + "epoch": 0.05623760501429736, + "grad_norm": 9.619174855323118, + "learning_rate": 4.9995214877872275e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.4482758641242981, + "step": 55835 + }, + { + "epoch": 0.056242641067401525, + "grad_norm": 10.634638051473122, + "learning_rate": 4.999520714711623e-05, + "loss": 2.3196, + "mean_token_accuracy": 0.42758620381355283, + "step": 55840 + }, + { + "epoch": 0.0562476771205057, + "grad_norm": 12.627631835259768, + "learning_rate": 4.999519941012104e-05, + "loss": 2.7594, + "mean_token_accuracy": 0.3620689570903778, + "step": 55845 + }, + { + "epoch": 0.05625271317360987, + "grad_norm": 12.526010201682999, + "learning_rate": 4.999519166688674e-05, + "loss": 2.7274, + "mean_token_accuracy": 0.417241370677948, + "step": 55850 + }, + { + "epoch": 0.056257749226714046, + "grad_norm": 13.640183196419043, + "learning_rate": 4.999518391741331e-05, + "loss": 2.7894, + "mean_token_accuracy": 0.33448274731636046, + "step": 55855 + }, + { + "epoch": 0.05626278527981822, + "grad_norm": 11.45198418309512, + "learning_rate": 4.999517616170076e-05, + "loss": 2.6135, + "mean_token_accuracy": 0.3712643623352051, + "step": 55860 + }, + { + "epoch": 0.056267821332922394, + "grad_norm": 11.337182460217367, + "learning_rate": 4.9995168399749086e-05, + "loss": 2.6033, + "mean_token_accuracy": 0.3758620619773865, + "step": 55865 + }, + { + "epoch": 0.05627285738602657, + "grad_norm": 12.815870179073693, + "learning_rate": 4.999516063155829e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.4068965494632721, + "step": 55870 + }, + { + "epoch": 0.056277893439130734, + "grad_norm": 11.100899962844503, + "learning_rate": 4.999515285712838e-05, + "loss": 2.3923, + "mean_token_accuracy": 0.4310344815254211, + "step": 55875 + }, + { + "epoch": 0.05628292949223491, + "grad_norm": 9.908457361270326, + "learning_rate": 4.9995145076459355e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.41034482717514037, + "step": 55880 + }, + { + "epoch": 0.05628796554533908, + "grad_norm": 10.497554776812228, + "learning_rate": 4.999513728955122e-05, + "loss": 2.293, + "mean_token_accuracy": 0.4724137902259827, + "step": 55885 + }, + { + "epoch": 0.056293001598443256, + "grad_norm": 14.141165602947416, + "learning_rate": 4.999512949640397e-05, + "loss": 2.8359, + "mean_token_accuracy": 0.3379310369491577, + "step": 55890 + }, + { + "epoch": 0.05629803765154743, + "grad_norm": 11.288246736623863, + "learning_rate": 4.999512169701761e-05, + "loss": 2.5181, + "mean_token_accuracy": 0.458620685338974, + "step": 55895 + }, + { + "epoch": 0.0563030737046516, + "grad_norm": 12.87596935670035, + "learning_rate": 4.9995113891392145e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.3827586233615875, + "step": 55900 + }, + { + "epoch": 0.05630810975775578, + "grad_norm": 12.996531430316704, + "learning_rate": 4.999510607952758e-05, + "loss": 2.344, + "mean_token_accuracy": 0.4983666181564331, + "step": 55905 + }, + { + "epoch": 0.056313145810859944, + "grad_norm": 11.67859835841478, + "learning_rate": 4.999509826142392e-05, + "loss": 2.8238, + "mean_token_accuracy": 0.3862068891525269, + "step": 55910 + }, + { + "epoch": 0.05631818186396412, + "grad_norm": 12.2505877663274, + "learning_rate": 4.9995090437081155e-05, + "loss": 2.1617, + "mean_token_accuracy": 0.46206897497177124, + "step": 55915 + }, + { + "epoch": 0.05632321791706829, + "grad_norm": 12.447113254753495, + "learning_rate": 4.999508260649929e-05, + "loss": 2.8807, + "mean_token_accuracy": 0.417241370677948, + "step": 55920 + }, + { + "epoch": 0.056328253970172465, + "grad_norm": 15.289555280321078, + "learning_rate": 4.999507476967833e-05, + "loss": 2.9021, + "mean_token_accuracy": 0.35862069129943847, + "step": 55925 + }, + { + "epoch": 0.05633329002327664, + "grad_norm": 15.956216883924265, + "learning_rate": 4.999506692661828e-05, + "loss": 2.9148, + "mean_token_accuracy": 0.34137930870056155, + "step": 55930 + }, + { + "epoch": 0.05633832607638081, + "grad_norm": 12.094896997133805, + "learning_rate": 4.999505907731913e-05, + "loss": 2.1952, + "mean_token_accuracy": 0.4206896543502808, + "step": 55935 + }, + { + "epoch": 0.056343362129484986, + "grad_norm": 12.041920918747772, + "learning_rate": 4.9995051221780904e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.42758620977401735, + "step": 55940 + }, + { + "epoch": 0.05634839818258915, + "grad_norm": 12.712888331040201, + "learning_rate": 4.999504336000359e-05, + "loss": 2.5698, + "mean_token_accuracy": 0.38965516686439516, + "step": 55945 + }, + { + "epoch": 0.05635343423569333, + "grad_norm": 11.715254665003515, + "learning_rate": 4.999503549198718e-05, + "loss": 2.7534, + "mean_token_accuracy": 0.4, + "step": 55950 + }, + { + "epoch": 0.0563584702887975, + "grad_norm": 13.25115930468187, + "learning_rate": 4.99950276177317e-05, + "loss": 2.6856, + "mean_token_accuracy": 0.39086509346961973, + "step": 55955 + }, + { + "epoch": 0.056363506341901674, + "grad_norm": 10.62711436502527, + "learning_rate": 4.999501973723714e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.4103448331356049, + "step": 55960 + }, + { + "epoch": 0.05636854239500585, + "grad_norm": 14.387932801703723, + "learning_rate": 4.99950118505035e-05, + "loss": 3.1966, + "mean_token_accuracy": 0.3517241418361664, + "step": 55965 + }, + { + "epoch": 0.05637357844811002, + "grad_norm": 13.092175663752013, + "learning_rate": 4.999500395753078e-05, + "loss": 2.8125, + "mean_token_accuracy": 0.38965516686439516, + "step": 55970 + }, + { + "epoch": 0.056378614501214196, + "grad_norm": 9.395236459054871, + "learning_rate": 4.9994996058318985e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.3551724135875702, + "step": 55975 + }, + { + "epoch": 0.05638365055431836, + "grad_norm": 11.487358980309494, + "learning_rate": 4.999498815286812e-05, + "loss": 2.6209, + "mean_token_accuracy": 0.4068965494632721, + "step": 55980 + }, + { + "epoch": 0.056388686607422536, + "grad_norm": 9.770517729132662, + "learning_rate": 4.99949802411782e-05, + "loss": 2.2947, + "mean_token_accuracy": 0.4551724135875702, + "step": 55985 + }, + { + "epoch": 0.05639372266052671, + "grad_norm": 23.677678380445162, + "learning_rate": 4.99949723232492e-05, + "loss": 3.2096, + "mean_token_accuracy": 0.3103448301553726, + "step": 55990 + }, + { + "epoch": 0.056398758713630884, + "grad_norm": 13.871452817833232, + "learning_rate": 4.999496439908113e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.3655172407627106, + "step": 55995 + }, + { + "epoch": 0.05640379476673506, + "grad_norm": 13.299049886697127, + "learning_rate": 4.9994956468674014e-05, + "loss": 2.7384, + "mean_token_accuracy": 0.42758620977401735, + "step": 56000 + }, + { + "epoch": 0.05640883081983923, + "grad_norm": 20.86346437692601, + "learning_rate": 4.9994948532027834e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.4689655125141144, + "step": 56005 + }, + { + "epoch": 0.056413866872943405, + "grad_norm": 11.747560814955518, + "learning_rate": 4.999494058914259e-05, + "loss": 2.2009, + "mean_token_accuracy": 0.4758620738983154, + "step": 56010 + }, + { + "epoch": 0.05641890292604757, + "grad_norm": 12.800483172331791, + "learning_rate": 4.99949326400183e-05, + "loss": 2.5922, + "mean_token_accuracy": 0.39655172228813174, + "step": 56015 + }, + { + "epoch": 0.056423938979151746, + "grad_norm": 10.591866492106657, + "learning_rate": 4.999492468465494e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.4448275864124298, + "step": 56020 + }, + { + "epoch": 0.05642897503225592, + "grad_norm": 12.301025568847635, + "learning_rate": 4.999491672305254e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.44688445925712583, + "step": 56025 + }, + { + "epoch": 0.05643401108536009, + "grad_norm": 12.424858740053356, + "learning_rate": 4.999490875521109e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.4068965494632721, + "step": 56030 + }, + { + "epoch": 0.05643904713846427, + "grad_norm": 14.318980443474068, + "learning_rate": 4.999490078113059e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.417241370677948, + "step": 56035 + }, + { + "epoch": 0.05644408319156844, + "grad_norm": 12.00690280503789, + "learning_rate": 4.999489280081105e-05, + "loss": 2.6854, + "mean_token_accuracy": 0.4172413796186447, + "step": 56040 + }, + { + "epoch": 0.056449119244672614, + "grad_norm": 9.933033563182349, + "learning_rate": 4.999488481425247e-05, + "loss": 2.5825, + "mean_token_accuracy": 0.3709618806838989, + "step": 56045 + }, + { + "epoch": 0.05645415529777678, + "grad_norm": 10.14170066469442, + "learning_rate": 4.999487682145484e-05, + "loss": 2.6453, + "mean_token_accuracy": 0.37586206793785093, + "step": 56050 + }, + { + "epoch": 0.056459191350880955, + "grad_norm": 11.864981174959619, + "learning_rate": 4.999486882241818e-05, + "loss": 3.2991, + "mean_token_accuracy": 0.3275862067937851, + "step": 56055 + }, + { + "epoch": 0.05646422740398513, + "grad_norm": 12.922577336153592, + "learning_rate": 4.999486081714248e-05, + "loss": 2.4108, + "mean_token_accuracy": 0.4137930989265442, + "step": 56060 + }, + { + "epoch": 0.0564692634570893, + "grad_norm": 16.35320997636631, + "learning_rate": 4.9994852805627754e-05, + "loss": 2.6591, + "mean_token_accuracy": 0.37241379022598264, + "step": 56065 + }, + { + "epoch": 0.056474299510193476, + "grad_norm": 24.246943848686776, + "learning_rate": 4.999484478787399e-05, + "loss": 2.8389, + "mean_token_accuracy": 0.4263762950897217, + "step": 56070 + }, + { + "epoch": 0.05647933556329765, + "grad_norm": 13.000042837197599, + "learning_rate": 4.999483676388119e-05, + "loss": 2.6574, + "mean_token_accuracy": 0.4, + "step": 56075 + }, + { + "epoch": 0.056484371616401824, + "grad_norm": 10.291728969346774, + "learning_rate": 4.9994828733649376e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.44857833385467527, + "step": 56080 + }, + { + "epoch": 0.05648940766950599, + "grad_norm": 13.35767519366003, + "learning_rate": 4.999482069717853e-05, + "loss": 2.8827, + "mean_token_accuracy": 0.37241379022598264, + "step": 56085 + }, + { + "epoch": 0.056494443722610165, + "grad_norm": 17.04829436534816, + "learning_rate": 4.999481265446866e-05, + "loss": 2.7501, + "mean_token_accuracy": 0.3724137991666794, + "step": 56090 + }, + { + "epoch": 0.05649947977571434, + "grad_norm": 11.224689600497436, + "learning_rate": 4.999480460551977e-05, + "loss": 2.7012, + "mean_token_accuracy": 0.3999999940395355, + "step": 56095 + }, + { + "epoch": 0.05650451582881851, + "grad_norm": 9.264897784219167, + "learning_rate": 4.999479655033187e-05, + "loss": 2.2701, + "mean_token_accuracy": 0.47447065711021424, + "step": 56100 + }, + { + "epoch": 0.056509551881922686, + "grad_norm": 11.588187551590568, + "learning_rate": 4.9994788488904956e-05, + "loss": 2.9947, + "mean_token_accuracy": 0.4, + "step": 56105 + }, + { + "epoch": 0.05651458793502686, + "grad_norm": 11.780256155652385, + "learning_rate": 4.9994780421239015e-05, + "loss": 2.6263, + "mean_token_accuracy": 0.38620689511299133, + "step": 56110 + }, + { + "epoch": 0.05651962398813103, + "grad_norm": 19.174406558996136, + "learning_rate": 4.9994772347334074e-05, + "loss": 2.8523, + "mean_token_accuracy": 0.4000000059604645, + "step": 56115 + }, + { + "epoch": 0.0565246600412352, + "grad_norm": 14.08217172564536, + "learning_rate": 4.999476426719012e-05, + "loss": 2.4826, + "mean_token_accuracy": 0.4068965494632721, + "step": 56120 + }, + { + "epoch": 0.056529696094339374, + "grad_norm": 10.772878961266995, + "learning_rate": 4.9994756180807156e-05, + "loss": 2.35, + "mean_token_accuracy": 0.4586206912994385, + "step": 56125 + }, + { + "epoch": 0.05653473214744355, + "grad_norm": 10.95309091243865, + "learning_rate": 4.9994748088185185e-05, + "loss": 2.649, + "mean_token_accuracy": 0.4103448212146759, + "step": 56130 + }, + { + "epoch": 0.05653976820054772, + "grad_norm": 12.073484655424734, + "learning_rate": 4.999473998932422e-05, + "loss": 2.2722, + "mean_token_accuracy": 0.44137930274009707, + "step": 56135 + }, + { + "epoch": 0.056544804253651895, + "grad_norm": 9.428735317417326, + "learning_rate": 4.999473188422425e-05, + "loss": 2.1036, + "mean_token_accuracy": 0.4551724135875702, + "step": 56140 + }, + { + "epoch": 0.05654984030675607, + "grad_norm": 10.173790946437139, + "learning_rate": 4.999472377288529e-05, + "loss": 2.3319, + "mean_token_accuracy": 0.37586206793785093, + "step": 56145 + }, + { + "epoch": 0.05655487635986024, + "grad_norm": 11.114154994222837, + "learning_rate": 4.999471565530732e-05, + "loss": 2.2554, + "mean_token_accuracy": 0.3896551787853241, + "step": 56150 + }, + { + "epoch": 0.05655991241296441, + "grad_norm": 9.898848299447293, + "learning_rate": 4.999470753149037e-05, + "loss": 2.2788, + "mean_token_accuracy": 0.4724137902259827, + "step": 56155 + }, + { + "epoch": 0.05656494846606858, + "grad_norm": 12.956859726813956, + "learning_rate": 4.999469940143442e-05, + "loss": 2.7046, + "mean_token_accuracy": 0.36896551251411436, + "step": 56160 + }, + { + "epoch": 0.05656998451917276, + "grad_norm": 13.070449112114877, + "learning_rate": 4.9994691265139486e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.3896551728248596, + "step": 56165 + }, + { + "epoch": 0.05657502057227693, + "grad_norm": 12.290126336581281, + "learning_rate": 4.999468312260557e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.4431336998939514, + "step": 56170 + }, + { + "epoch": 0.056580056625381105, + "grad_norm": 14.128821139115507, + "learning_rate": 4.999467497383266e-05, + "loss": 2.755, + "mean_token_accuracy": 0.3827586203813553, + "step": 56175 + }, + { + "epoch": 0.05658509267848528, + "grad_norm": 12.364446326420033, + "learning_rate": 4.999466681882077e-05, + "loss": 2.5985, + "mean_token_accuracy": 0.3931034505367279, + "step": 56180 + }, + { + "epoch": 0.05659012873158945, + "grad_norm": 13.388260171930906, + "learning_rate": 4.999465865756991e-05, + "loss": 2.6423, + "mean_token_accuracy": 0.32068965435028074, + "step": 56185 + }, + { + "epoch": 0.05659516478469362, + "grad_norm": 13.957538619221943, + "learning_rate": 4.9994650490080064e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.4413793087005615, + "step": 56190 + }, + { + "epoch": 0.05660020083779779, + "grad_norm": 16.75906020462544, + "learning_rate": 4.999464231635124e-05, + "loss": 2.8412, + "mean_token_accuracy": 0.3620689660310745, + "step": 56195 + }, + { + "epoch": 0.056605236890901967, + "grad_norm": 11.382926498965539, + "learning_rate": 4.999463413638344e-05, + "loss": 2.178, + "mean_token_accuracy": 0.43103448748588563, + "step": 56200 + }, + { + "epoch": 0.05661027294400614, + "grad_norm": 12.923868180315626, + "learning_rate": 4.9994625950176684e-05, + "loss": 2.5344, + "mean_token_accuracy": 0.39655172228813174, + "step": 56205 + }, + { + "epoch": 0.056615308997110314, + "grad_norm": 13.55973469079723, + "learning_rate": 4.999461775773095e-05, + "loss": 2.6146, + "mean_token_accuracy": 0.3827586233615875, + "step": 56210 + }, + { + "epoch": 0.05662034505021449, + "grad_norm": 9.751300016926047, + "learning_rate": 4.9994609559046253e-05, + "loss": 2.7455, + "mean_token_accuracy": 0.41724138259887694, + "step": 56215 + }, + { + "epoch": 0.05662538110331866, + "grad_norm": 13.519018713060435, + "learning_rate": 4.999460135412259e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.43103448748588563, + "step": 56220 + }, + { + "epoch": 0.05663041715642283, + "grad_norm": 10.709237063735674, + "learning_rate": 4.999459314295997e-05, + "loss": 2.2562, + "mean_token_accuracy": 0.4535390317440033, + "step": 56225 + }, + { + "epoch": 0.056635453209527, + "grad_norm": 11.982745584180012, + "learning_rate": 4.999458492555838e-05, + "loss": 2.8691, + "mean_token_accuracy": 0.32413792610168457, + "step": 56230 + }, + { + "epoch": 0.056640489262631176, + "grad_norm": 14.51272503985993, + "learning_rate": 4.9994576701917845e-05, + "loss": 2.9809, + "mean_token_accuracy": 0.3620689630508423, + "step": 56235 + }, + { + "epoch": 0.05664552531573535, + "grad_norm": 11.658987217873193, + "learning_rate": 4.999456847203835e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.43103447556495667, + "step": 56240 + }, + { + "epoch": 0.056650561368839523, + "grad_norm": 21.208352180773883, + "learning_rate": 4.99945602359199e-05, + "loss": 2.6997, + "mean_token_accuracy": 0.3879612863063812, + "step": 56245 + }, + { + "epoch": 0.0566555974219437, + "grad_norm": 16.53399268085399, + "learning_rate": 4.99945519935625e-05, + "loss": 3.6324, + "mean_token_accuracy": 0.33793103098869326, + "step": 56250 + }, + { + "epoch": 0.05666063347504787, + "grad_norm": 9.458524176170812, + "learning_rate": 4.9994543744966156e-05, + "loss": 2.7289, + "mean_token_accuracy": 0.4103448331356049, + "step": 56255 + }, + { + "epoch": 0.05666566952815204, + "grad_norm": 14.302232017194198, + "learning_rate": 4.999453549013087e-05, + "loss": 2.611, + "mean_token_accuracy": 0.3620689630508423, + "step": 56260 + }, + { + "epoch": 0.05667070558125621, + "grad_norm": 12.619289203556953, + "learning_rate": 4.999452722905664e-05, + "loss": 2.322, + "mean_token_accuracy": 0.4137930989265442, + "step": 56265 + }, + { + "epoch": 0.056675741634360385, + "grad_norm": 10.150124337348615, + "learning_rate": 4.999451896174346e-05, + "loss": 2.4294, + "mean_token_accuracy": 0.3965517163276672, + "step": 56270 + }, + { + "epoch": 0.05668077768746456, + "grad_norm": 10.242251614143063, + "learning_rate": 4.999451068819134e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.42758620381355283, + "step": 56275 + }, + { + "epoch": 0.05668581374056873, + "grad_norm": 10.246147487295602, + "learning_rate": 4.999450240840029e-05, + "loss": 2.1904, + "mean_token_accuracy": 0.4724137902259827, + "step": 56280 + }, + { + "epoch": 0.05669084979367291, + "grad_norm": 11.057121714349869, + "learning_rate": 4.999449412237031e-05, + "loss": 2.1769, + "mean_token_accuracy": 0.47241378426551817, + "step": 56285 + }, + { + "epoch": 0.05669588584677708, + "grad_norm": 11.667221193909262, + "learning_rate": 4.999448583010139e-05, + "loss": 2.1731, + "mean_token_accuracy": 0.4534785270690918, + "step": 56290 + }, + { + "epoch": 0.05670092189988125, + "grad_norm": 12.065859193423591, + "learning_rate": 4.999447753159355e-05, + "loss": 2.4814, + "mean_token_accuracy": 0.4068965554237366, + "step": 56295 + }, + { + "epoch": 0.05670595795298542, + "grad_norm": 14.484821953608458, + "learning_rate": 4.9994469226846766e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.44482757449150084, + "step": 56300 + }, + { + "epoch": 0.056710994006089595, + "grad_norm": 13.859050104033333, + "learning_rate": 4.999446091586107e-05, + "loss": 2.5457, + "mean_token_accuracy": 0.3813067197799683, + "step": 56305 + }, + { + "epoch": 0.05671603005919377, + "grad_norm": 11.60379609079237, + "learning_rate": 4.9994452598636455e-05, + "loss": 2.3094, + "mean_token_accuracy": 0.4413793087005615, + "step": 56310 + }, + { + "epoch": 0.05672106611229794, + "grad_norm": 10.271892577097054, + "learning_rate": 4.9994444275172914e-05, + "loss": 2.7395, + "mean_token_accuracy": 0.3655172288417816, + "step": 56315 + }, + { + "epoch": 0.056726102165402116, + "grad_norm": 10.677611282381557, + "learning_rate": 4.9994435945470454e-05, + "loss": 2.5213, + "mean_token_accuracy": 0.4395320177078247, + "step": 56320 + }, + { + "epoch": 0.05673113821850629, + "grad_norm": 13.53152976591521, + "learning_rate": 4.9994427609529074e-05, + "loss": 2.8761, + "mean_token_accuracy": 0.3448275804519653, + "step": 56325 + }, + { + "epoch": 0.05673617427161046, + "grad_norm": 11.493406745716078, + "learning_rate": 4.999441926734879e-05, + "loss": 2.9171, + "mean_token_accuracy": 0.3965517282485962, + "step": 56330 + }, + { + "epoch": 0.05674121032471463, + "grad_norm": 11.001316752283271, + "learning_rate": 4.999441091892958e-05, + "loss": 2.4803, + "mean_token_accuracy": 0.38275861740112305, + "step": 56335 + }, + { + "epoch": 0.056746246377818804, + "grad_norm": 14.269474406388586, + "learning_rate": 4.999440256427148e-05, + "loss": 3.3146, + "mean_token_accuracy": 0.35862068831920624, + "step": 56340 + }, + { + "epoch": 0.05675128243092298, + "grad_norm": 11.118253508436803, + "learning_rate": 4.999439420337447e-05, + "loss": 2.8236, + "mean_token_accuracy": 0.35862069129943847, + "step": 56345 + }, + { + "epoch": 0.05675631848402715, + "grad_norm": 14.291688247099108, + "learning_rate": 4.9994385836238545e-05, + "loss": 2.7143, + "mean_token_accuracy": 0.3758620709180832, + "step": 56350 + }, + { + "epoch": 0.056761354537131326, + "grad_norm": 11.205151955235964, + "learning_rate": 4.999437746286372e-05, + "loss": 2.5443, + "mean_token_accuracy": 0.4103448212146759, + "step": 56355 + }, + { + "epoch": 0.0567663905902355, + "grad_norm": 12.473834344319762, + "learning_rate": 4.999436908325e-05, + "loss": 2.5924, + "mean_token_accuracy": 0.42413792610168455, + "step": 56360 + }, + { + "epoch": 0.056771426643339666, + "grad_norm": 11.336414579736598, + "learning_rate": 4.999436069739739e-05, + "loss": 2.0133, + "mean_token_accuracy": 0.49872957468032836, + "step": 56365 + }, + { + "epoch": 0.05677646269644384, + "grad_norm": 20.3590680282211, + "learning_rate": 4.999435230530587e-05, + "loss": 2.7892, + "mean_token_accuracy": 0.35862069129943847, + "step": 56370 + }, + { + "epoch": 0.056781498749548014, + "grad_norm": 12.879898678239105, + "learning_rate": 4.999434390697547e-05, + "loss": 2.884, + "mean_token_accuracy": 0.34137930274009703, + "step": 56375 + }, + { + "epoch": 0.05678653480265219, + "grad_norm": 12.304504974064427, + "learning_rate": 4.999433550240617e-05, + "loss": 2.8961, + "mean_token_accuracy": 0.3793103456497192, + "step": 56380 + }, + { + "epoch": 0.05679157085575636, + "grad_norm": 8.071786596332304, + "learning_rate": 4.999432709159799e-05, + "loss": 2.2054, + "mean_token_accuracy": 0.49458127617836, + "step": 56385 + }, + { + "epoch": 0.056796606908860535, + "grad_norm": 11.572834566115532, + "learning_rate": 4.999431867455092e-05, + "loss": 2.8567, + "mean_token_accuracy": 0.37241379022598264, + "step": 56390 + }, + { + "epoch": 0.05680164296196471, + "grad_norm": 12.712594272603269, + "learning_rate": 4.999431025126497e-05, + "loss": 2.5742, + "mean_token_accuracy": 0.42068966031074523, + "step": 56395 + }, + { + "epoch": 0.056806679015068876, + "grad_norm": 14.400194125714002, + "learning_rate": 4.9994301821740136e-05, + "loss": 2.6776, + "mean_token_accuracy": 0.3620689630508423, + "step": 56400 + }, + { + "epoch": 0.05681171506817305, + "grad_norm": 10.380315261291337, + "learning_rate": 4.9994293385976425e-05, + "loss": 2.9552, + "mean_token_accuracy": 0.37586206793785093, + "step": 56405 + }, + { + "epoch": 0.05681675112127722, + "grad_norm": 13.227092326127044, + "learning_rate": 4.999428494397384e-05, + "loss": 2.7054, + "mean_token_accuracy": 0.3655172407627106, + "step": 56410 + }, + { + "epoch": 0.0568217871743814, + "grad_norm": 10.508610112598312, + "learning_rate": 4.999427649573238e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.42413792610168455, + "step": 56415 + }, + { + "epoch": 0.05682682322748557, + "grad_norm": 12.624586416395621, + "learning_rate": 4.9994268041252045e-05, + "loss": 2.7765, + "mean_token_accuracy": 0.4000000059604645, + "step": 56420 + }, + { + "epoch": 0.056831859280589744, + "grad_norm": 10.41998066209456, + "learning_rate": 4.9994259580532846e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.41724138259887694, + "step": 56425 + }, + { + "epoch": 0.05683689533369392, + "grad_norm": 10.45594154390414, + "learning_rate": 4.9994251113574775e-05, + "loss": 2.5369, + "mean_token_accuracy": 0.4344827651977539, + "step": 56430 + }, + { + "epoch": 0.056841931386798085, + "grad_norm": 11.681893947984362, + "learning_rate": 4.9994242640377845e-05, + "loss": 2.1973, + "mean_token_accuracy": 0.4068965554237366, + "step": 56435 + }, + { + "epoch": 0.05684696743990226, + "grad_norm": 9.10628743171219, + "learning_rate": 4.999423416094204e-05, + "loss": 2.2625, + "mean_token_accuracy": 0.442241370677948, + "step": 56440 + }, + { + "epoch": 0.05685200349300643, + "grad_norm": 11.80361943668972, + "learning_rate": 4.999422567526739e-05, + "loss": 2.9042, + "mean_token_accuracy": 0.41034482717514037, + "step": 56445 + }, + { + "epoch": 0.056857039546110606, + "grad_norm": 13.6678205562962, + "learning_rate": 4.999421718335388e-05, + "loss": 2.6146, + "mean_token_accuracy": 0.35172412991523744, + "step": 56450 + }, + { + "epoch": 0.05686207559921478, + "grad_norm": 13.859787509418128, + "learning_rate": 4.999420868520151e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.4689655125141144, + "step": 56455 + }, + { + "epoch": 0.056867111652318954, + "grad_norm": 13.277506787650813, + "learning_rate": 4.999420018081029e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.4068965494632721, + "step": 56460 + }, + { + "epoch": 0.05687214770542313, + "grad_norm": 14.109818025313617, + "learning_rate": 4.999419167018022e-05, + "loss": 2.694, + "mean_token_accuracy": 0.42758620381355283, + "step": 56465 + }, + { + "epoch": 0.056877183758527294, + "grad_norm": 11.269401403688033, + "learning_rate": 4.99941831533113e-05, + "loss": 2.297, + "mean_token_accuracy": 0.4379310369491577, + "step": 56470 + }, + { + "epoch": 0.05688221981163147, + "grad_norm": 12.709780661513586, + "learning_rate": 4.9994174630203536e-05, + "loss": 2.6484, + "mean_token_accuracy": 0.38620689511299133, + "step": 56475 + }, + { + "epoch": 0.05688725586473564, + "grad_norm": 17.303118051058295, + "learning_rate": 4.9994166100856935e-05, + "loss": 2.7457, + "mean_token_accuracy": 0.34482758641242983, + "step": 56480 + }, + { + "epoch": 0.056892291917839816, + "grad_norm": 13.01333729607657, + "learning_rate": 4.999415756527148e-05, + "loss": 2.6314, + "mean_token_accuracy": 0.39655172228813174, + "step": 56485 + }, + { + "epoch": 0.05689732797094399, + "grad_norm": 13.563553806049347, + "learning_rate": 4.99941490234472e-05, + "loss": 2.4584, + "mean_token_accuracy": 0.36896551847457887, + "step": 56490 + }, + { + "epoch": 0.05690236402404816, + "grad_norm": 13.334147009016538, + "learning_rate": 4.999414047538407e-05, + "loss": 2.8138, + "mean_token_accuracy": 0.3827586114406586, + "step": 56495 + }, + { + "epoch": 0.05690740007715234, + "grad_norm": 13.422772944943365, + "learning_rate": 4.999413192108211e-05, + "loss": 2.297, + "mean_token_accuracy": 0.40689654350280763, + "step": 56500 + }, + { + "epoch": 0.056912436130256504, + "grad_norm": 15.363227252479481, + "learning_rate": 4.999412336054132e-05, + "loss": 2.7919, + "mean_token_accuracy": 0.37241379022598264, + "step": 56505 + }, + { + "epoch": 0.05691747218336068, + "grad_norm": 11.604301231651428, + "learning_rate": 4.999411479376171e-05, + "loss": 2.85, + "mean_token_accuracy": 0.42068966031074523, + "step": 56510 + }, + { + "epoch": 0.05692250823646485, + "grad_norm": 9.96877686448567, + "learning_rate": 4.9994106220743264e-05, + "loss": 2.6506, + "mean_token_accuracy": 0.37931033968925476, + "step": 56515 + }, + { + "epoch": 0.056927544289569025, + "grad_norm": 12.941568440317083, + "learning_rate": 4.9994097641486e-05, + "loss": 2.6947, + "mean_token_accuracy": 0.4137930929660797, + "step": 56520 + }, + { + "epoch": 0.0569325803426732, + "grad_norm": 10.824598286122152, + "learning_rate": 4.999408905598991e-05, + "loss": 2.557, + "mean_token_accuracy": 0.41379310488700866, + "step": 56525 + }, + { + "epoch": 0.05693761639577737, + "grad_norm": 10.86162265677599, + "learning_rate": 4.9994080464255e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4551724135875702, + "step": 56530 + }, + { + "epoch": 0.056942652448881546, + "grad_norm": 11.236091300799007, + "learning_rate": 4.9994071866281266e-05, + "loss": 2.5978, + "mean_token_accuracy": 0.42758620381355283, + "step": 56535 + }, + { + "epoch": 0.05694768850198571, + "grad_norm": 13.017061893734818, + "learning_rate": 4.999406326206873e-05, + "loss": 2.8506, + "mean_token_accuracy": 0.37586206793785093, + "step": 56540 + }, + { + "epoch": 0.05695272455508989, + "grad_norm": 12.37134557196057, + "learning_rate": 4.9994054651617375e-05, + "loss": 2.7767, + "mean_token_accuracy": 0.35517241060733795, + "step": 56545 + }, + { + "epoch": 0.05695776060819406, + "grad_norm": 13.582973032604299, + "learning_rate": 4.999404603492721e-05, + "loss": 2.8826, + "mean_token_accuracy": 0.37241379022598264, + "step": 56550 + }, + { + "epoch": 0.056962796661298234, + "grad_norm": 14.301344107185333, + "learning_rate": 4.999403741199824e-05, + "loss": 2.7908, + "mean_token_accuracy": 0.38620689511299133, + "step": 56555 + }, + { + "epoch": 0.05696783271440241, + "grad_norm": 12.896444287071745, + "learning_rate": 4.999402878283046e-05, + "loss": 2.6563, + "mean_token_accuracy": 0.37537809610366824, + "step": 56560 + }, + { + "epoch": 0.05697286876750658, + "grad_norm": 12.670547878720026, + "learning_rate": 4.9994020147423886e-05, + "loss": 3.1056, + "mean_token_accuracy": 0.3103448301553726, + "step": 56565 + }, + { + "epoch": 0.056977904820610756, + "grad_norm": 12.158586390827315, + "learning_rate": 4.9994011505778504e-05, + "loss": 3.0262, + "mean_token_accuracy": 0.39467633962631227, + "step": 56570 + }, + { + "epoch": 0.05698294087371492, + "grad_norm": 10.703297164729264, + "learning_rate": 4.999400285789433e-05, + "loss": 2.3984, + "mean_token_accuracy": 0.41034482717514037, + "step": 56575 + }, + { + "epoch": 0.056987976926819096, + "grad_norm": 16.317895529075017, + "learning_rate": 4.999399420377135e-05, + "loss": 3.1215, + "mean_token_accuracy": 0.3827586233615875, + "step": 56580 + }, + { + "epoch": 0.05699301297992327, + "grad_norm": 12.666430448808347, + "learning_rate": 4.999398554340959e-05, + "loss": 2.8564, + "mean_token_accuracy": 0.3310344755649567, + "step": 56585 + }, + { + "epoch": 0.056998049033027444, + "grad_norm": 11.299648589087205, + "learning_rate": 4.9993976876809026e-05, + "loss": 2.1512, + "mean_token_accuracy": 0.4379310369491577, + "step": 56590 + }, + { + "epoch": 0.05700308508613162, + "grad_norm": 11.083467015914241, + "learning_rate": 4.999396820396968e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.33793103098869326, + "step": 56595 + }, + { + "epoch": 0.05700812113923579, + "grad_norm": 10.61333249978888, + "learning_rate": 4.999395952489156e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.3965517163276672, + "step": 56600 + }, + { + "epoch": 0.057013157192339965, + "grad_norm": 10.70603936225942, + "learning_rate": 4.9993950839574645e-05, + "loss": 2.744, + "mean_token_accuracy": 0.35172414481639863, + "step": 56605 + }, + { + "epoch": 0.05701819324544413, + "grad_norm": 12.308270656272272, + "learning_rate": 4.9993942148018944e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.3586206793785095, + "step": 56610 + }, + { + "epoch": 0.057023229298548306, + "grad_norm": 9.368210681493663, + "learning_rate": 4.9993933450224465e-05, + "loss": 2.6723, + "mean_token_accuracy": 0.3482758581638336, + "step": 56615 + }, + { + "epoch": 0.05702826535165248, + "grad_norm": 15.327668951134902, + "learning_rate": 4.999392474619122e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.4172413766384125, + "step": 56620 + }, + { + "epoch": 0.05703330140475665, + "grad_norm": 15.044080408132025, + "learning_rate": 4.999391603591919e-05, + "loss": 2.6487, + "mean_token_accuracy": 0.38620689809322356, + "step": 56625 + }, + { + "epoch": 0.05703833745786083, + "grad_norm": 14.224665452817053, + "learning_rate": 4.999390731940839e-05, + "loss": 2.9042, + "mean_token_accuracy": 0.39655172228813174, + "step": 56630 + }, + { + "epoch": 0.057043373510965, + "grad_norm": 11.596556895308801, + "learning_rate": 4.999389859665883e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.4206896543502808, + "step": 56635 + }, + { + "epoch": 0.057048409564069175, + "grad_norm": 13.644686007689952, + "learning_rate": 4.99938898676705e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.4068965494632721, + "step": 56640 + }, + { + "epoch": 0.05705344561717334, + "grad_norm": 10.98998780923295, + "learning_rate": 4.999388113244339e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.34137930870056155, + "step": 56645 + }, + { + "epoch": 0.057058481670277515, + "grad_norm": 13.100379761807801, + "learning_rate": 4.999387239097754e-05, + "loss": 2.5429, + "mean_token_accuracy": 0.42413792610168455, + "step": 56650 + }, + { + "epoch": 0.05706351772338169, + "grad_norm": 12.669400216197241, + "learning_rate": 4.9993863643272925e-05, + "loss": 2.6447, + "mean_token_accuracy": 0.4068965494632721, + "step": 56655 + }, + { + "epoch": 0.05706855377648586, + "grad_norm": 9.991243376319659, + "learning_rate": 4.999385488932955e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.43103448748588563, + "step": 56660 + }, + { + "epoch": 0.057073589829590037, + "grad_norm": 12.684805165409262, + "learning_rate": 4.999384612914741e-05, + "loss": 2.5898, + "mean_token_accuracy": 0.4517241418361664, + "step": 56665 + }, + { + "epoch": 0.05707862588269421, + "grad_norm": 14.863206011557716, + "learning_rate": 4.9993837362726535e-05, + "loss": 2.6497, + "mean_token_accuracy": 0.38965517580509185, + "step": 56670 + }, + { + "epoch": 0.057083661935798384, + "grad_norm": 9.54376402274776, + "learning_rate": 4.99938285900669e-05, + "loss": 2.8495, + "mean_token_accuracy": 0.38451301455497744, + "step": 56675 + }, + { + "epoch": 0.05708869798890255, + "grad_norm": 11.451032319023579, + "learning_rate": 4.999381981116852e-05, + "loss": 2.5374, + "mean_token_accuracy": 0.44137930274009707, + "step": 56680 + }, + { + "epoch": 0.057093734042006725, + "grad_norm": 12.017098853339334, + "learning_rate": 4.99938110260314e-05, + "loss": 2.8144, + "mean_token_accuracy": 0.41911675930023196, + "step": 56685 + }, + { + "epoch": 0.0570987700951109, + "grad_norm": 9.809498333264006, + "learning_rate": 4.999380223465554e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.5000000119209289, + "step": 56690 + }, + { + "epoch": 0.05710380614821507, + "grad_norm": 13.157593622371417, + "learning_rate": 4.999379343704092e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.4020568668842316, + "step": 56695 + }, + { + "epoch": 0.057108842201319246, + "grad_norm": 12.011618339410687, + "learning_rate": 4.999378463318758e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.47586206197738645, + "step": 56700 + }, + { + "epoch": 0.05711387825442342, + "grad_norm": 11.168214245040689, + "learning_rate": 4.99937758230955e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.4206896543502808, + "step": 56705 + }, + { + "epoch": 0.05711891430752759, + "grad_norm": 12.698882778783561, + "learning_rate": 4.999376700676469e-05, + "loss": 3.0421, + "mean_token_accuracy": 0.3620689630508423, + "step": 56710 + }, + { + "epoch": 0.05712395036063176, + "grad_norm": 14.221623990764911, + "learning_rate": 4.9993758184195155e-05, + "loss": 2.574, + "mean_token_accuracy": 0.35862068831920624, + "step": 56715 + }, + { + "epoch": 0.057128986413735934, + "grad_norm": 9.818318443324813, + "learning_rate": 4.999374935538688e-05, + "loss": 1.9802, + "mean_token_accuracy": 0.5034482717514038, + "step": 56720 + }, + { + "epoch": 0.05713402246684011, + "grad_norm": 10.501232626200357, + "learning_rate": 4.999374052033988e-05, + "loss": 2.5839, + "mean_token_accuracy": 0.45517240166664125, + "step": 56725 + }, + { + "epoch": 0.05713905851994428, + "grad_norm": 11.922148685766393, + "learning_rate": 4.999373167905417e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.42068966031074523, + "step": 56730 + }, + { + "epoch": 0.057144094573048455, + "grad_norm": 13.573548065732542, + "learning_rate": 4.9993722831529726e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.41379310488700866, + "step": 56735 + }, + { + "epoch": 0.05714913062615263, + "grad_norm": 9.992344237366302, + "learning_rate": 4.999371397776658e-05, + "loss": 2.6208, + "mean_token_accuracy": 0.37241379022598264, + "step": 56740 + }, + { + "epoch": 0.0571541666792568, + "grad_norm": 10.399378884825357, + "learning_rate": 4.999370511776471e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.40108892917633054, + "step": 56745 + }, + { + "epoch": 0.05715920273236097, + "grad_norm": 10.276211302833515, + "learning_rate": 4.999369625152412e-05, + "loss": 2.289, + "mean_token_accuracy": 0.4172413766384125, + "step": 56750 + }, + { + "epoch": 0.057164238785465143, + "grad_norm": 11.692390152843773, + "learning_rate": 4.9993687379044825e-05, + "loss": 2.518, + "mean_token_accuracy": 0.42068964838981626, + "step": 56755 + }, + { + "epoch": 0.05716927483856932, + "grad_norm": 11.997247677473002, + "learning_rate": 4.999367850032682e-05, + "loss": 2.7736, + "mean_token_accuracy": 0.37931033968925476, + "step": 56760 + }, + { + "epoch": 0.05717431089167349, + "grad_norm": 14.042188805674451, + "learning_rate": 4.999366961537011e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.43103447556495667, + "step": 56765 + }, + { + "epoch": 0.057179346944777665, + "grad_norm": 11.875428311547322, + "learning_rate": 4.99936607241747e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.4586206912994385, + "step": 56770 + }, + { + "epoch": 0.05718438299788184, + "grad_norm": 12.342037824124963, + "learning_rate": 4.999365182674059e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.4000000059604645, + "step": 56775 + }, + { + "epoch": 0.05718941905098601, + "grad_norm": 11.703384937812958, + "learning_rate": 4.999364292306778e-05, + "loss": 2.8979, + "mean_token_accuracy": 0.3896551787853241, + "step": 56780 + }, + { + "epoch": 0.05719445510409018, + "grad_norm": 18.73220542652447, + "learning_rate": 4.999363401315627e-05, + "loss": 2.4692, + "mean_token_accuracy": 0.44827585816383364, + "step": 56785 + }, + { + "epoch": 0.05719949115719435, + "grad_norm": 11.41014236617544, + "learning_rate": 4.999362509700607e-05, + "loss": 2.8347, + "mean_token_accuracy": 0.34137930274009703, + "step": 56790 + }, + { + "epoch": 0.05720452721029853, + "grad_norm": 11.296188985940232, + "learning_rate": 4.9993616174617176e-05, + "loss": 2.8115, + "mean_token_accuracy": 0.36896551847457887, + "step": 56795 + }, + { + "epoch": 0.0572095632634027, + "grad_norm": 12.720673603196303, + "learning_rate": 4.99936072459896e-05, + "loss": 2.7329, + "mean_token_accuracy": 0.3896551787853241, + "step": 56800 + }, + { + "epoch": 0.057214599316506874, + "grad_norm": 16.44234651430403, + "learning_rate": 4.999359831112333e-05, + "loss": 2.9081, + "mean_token_accuracy": 0.3551724076271057, + "step": 56805 + }, + { + "epoch": 0.05721963536961105, + "grad_norm": 11.070626955878454, + "learning_rate": 4.9993589370018386e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.37241379618644715, + "step": 56810 + }, + { + "epoch": 0.057224671422715215, + "grad_norm": 12.989076161573143, + "learning_rate": 4.999358042267476e-05, + "loss": 2.8422, + "mean_token_accuracy": 0.43611615896224976, + "step": 56815 + }, + { + "epoch": 0.05722970747581939, + "grad_norm": 12.911740067532618, + "learning_rate": 4.9993571469092446e-05, + "loss": 2.6351, + "mean_token_accuracy": 0.38620689511299133, + "step": 56820 + }, + { + "epoch": 0.05723474352892356, + "grad_norm": 12.05249379890571, + "learning_rate": 4.999356250927147e-05, + "loss": 2.8662, + "mean_token_accuracy": 0.3655172407627106, + "step": 56825 + }, + { + "epoch": 0.057239779582027736, + "grad_norm": 13.663370097536497, + "learning_rate": 4.999355354321181e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.42589232325553894, + "step": 56830 + }, + { + "epoch": 0.05724481563513191, + "grad_norm": 10.314765474430244, + "learning_rate": 4.999354457091348e-05, + "loss": 2.6578, + "mean_token_accuracy": 0.4103448212146759, + "step": 56835 + }, + { + "epoch": 0.057249851688236084, + "grad_norm": 12.31622455841909, + "learning_rate": 4.999353559237649e-05, + "loss": 2.5917, + "mean_token_accuracy": 0.4103448212146759, + "step": 56840 + }, + { + "epoch": 0.05725488774134026, + "grad_norm": 11.728756520791755, + "learning_rate": 4.999352660760082e-05, + "loss": 2.7015, + "mean_token_accuracy": 0.42413792610168455, + "step": 56845 + }, + { + "epoch": 0.057259923794444424, + "grad_norm": 11.259289049534395, + "learning_rate": 4.9993517616586495e-05, + "loss": 2.342, + "mean_token_accuracy": 0.4310344815254211, + "step": 56850 + }, + { + "epoch": 0.0572649598475486, + "grad_norm": 15.92954637707232, + "learning_rate": 4.9993508619333507e-05, + "loss": 2.656, + "mean_token_accuracy": 0.37586206793785093, + "step": 56855 + }, + { + "epoch": 0.05726999590065277, + "grad_norm": 10.858295477319485, + "learning_rate": 4.999349961584186e-05, + "loss": 2.585, + "mean_token_accuracy": 0.4000000059604645, + "step": 56860 + }, + { + "epoch": 0.057275031953756946, + "grad_norm": 16.730859815588158, + "learning_rate": 4.9993490606111564e-05, + "loss": 2.6695, + "mean_token_accuracy": 0.4103448212146759, + "step": 56865 + }, + { + "epoch": 0.05728006800686112, + "grad_norm": 12.761229812272866, + "learning_rate": 4.9993481590142596e-05, + "loss": 2.8642, + "mean_token_accuracy": 0.3551724135875702, + "step": 56870 + }, + { + "epoch": 0.05728510405996529, + "grad_norm": 11.078519546809535, + "learning_rate": 4.9993472567935e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.42758620381355283, + "step": 56875 + }, + { + "epoch": 0.05729014011306947, + "grad_norm": 11.131810978330439, + "learning_rate": 4.9993463539488735e-05, + "loss": 2.8056, + "mean_token_accuracy": 0.35862069129943847, + "step": 56880 + }, + { + "epoch": 0.057295176166173634, + "grad_norm": 11.377291993784423, + "learning_rate": 4.9993454504803836e-05, + "loss": 2.8052, + "mean_token_accuracy": 0.39655172228813174, + "step": 56885 + }, + { + "epoch": 0.05730021221927781, + "grad_norm": 12.631874717573584, + "learning_rate": 4.99934454638803e-05, + "loss": 3.2904, + "mean_token_accuracy": 0.3275862097740173, + "step": 56890 + }, + { + "epoch": 0.05730524827238198, + "grad_norm": 12.074191134010285, + "learning_rate": 4.9993436416718106e-05, + "loss": 2.6655, + "mean_token_accuracy": 0.38620689511299133, + "step": 56895 + }, + { + "epoch": 0.057310284325486155, + "grad_norm": 11.418836705153414, + "learning_rate": 4.999342736331729e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.41379311084747317, + "step": 56900 + }, + { + "epoch": 0.05731532037859033, + "grad_norm": 13.317354684661773, + "learning_rate": 4.999341830367782e-05, + "loss": 2.7542, + "mean_token_accuracy": 0.37241379022598264, + "step": 56905 + }, + { + "epoch": 0.0573203564316945, + "grad_norm": 11.29641471057222, + "learning_rate": 4.999340923779973e-05, + "loss": 2.5982, + "mean_token_accuracy": 0.4551724076271057, + "step": 56910 + }, + { + "epoch": 0.057325392484798676, + "grad_norm": 14.907122514986007, + "learning_rate": 4.9993400165683014e-05, + "loss": 2.7071, + "mean_token_accuracy": 0.3931034505367279, + "step": 56915 + }, + { + "epoch": 0.05733042853790284, + "grad_norm": 14.089748028717697, + "learning_rate": 4.999339108732766e-05, + "loss": 2.5909, + "mean_token_accuracy": 0.43448275327682495, + "step": 56920 + }, + { + "epoch": 0.05733546459100702, + "grad_norm": 11.361229564412609, + "learning_rate": 4.999338200273368e-05, + "loss": 2.779, + "mean_token_accuracy": 0.37586206793785093, + "step": 56925 + }, + { + "epoch": 0.05734050064411119, + "grad_norm": 14.691571966199357, + "learning_rate": 4.999337291190107e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.358620685338974, + "step": 56930 + }, + { + "epoch": 0.057345536697215364, + "grad_norm": 12.8469023059474, + "learning_rate": 4.999336381482985e-05, + "loss": 2.4472, + "mean_token_accuracy": 0.4275862067937851, + "step": 56935 + }, + { + "epoch": 0.05735057275031954, + "grad_norm": 11.871279079412997, + "learning_rate": 4.999335471152001e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.41034482717514037, + "step": 56940 + }, + { + "epoch": 0.05735560880342371, + "grad_norm": 10.799570568070731, + "learning_rate": 4.999334560197156e-05, + "loss": 2.8568, + "mean_token_accuracy": 0.4068965494632721, + "step": 56945 + }, + { + "epoch": 0.057360644856527886, + "grad_norm": 11.653328137726932, + "learning_rate": 4.9993336486184484e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.4000000059604645, + "step": 56950 + }, + { + "epoch": 0.05736568090963205, + "grad_norm": 15.016177322586971, + "learning_rate": 4.9993327364158805e-05, + "loss": 2.3711, + "mean_token_accuracy": 0.43793103396892546, + "step": 56955 + }, + { + "epoch": 0.057370716962736226, + "grad_norm": 12.96952385783414, + "learning_rate": 4.999331823589451e-05, + "loss": 2.7216, + "mean_token_accuracy": 0.38275861740112305, + "step": 56960 + }, + { + "epoch": 0.0573757530158404, + "grad_norm": 12.860157160238321, + "learning_rate": 4.999330910139162e-05, + "loss": 2.8701, + "mean_token_accuracy": 0.3655172407627106, + "step": 56965 + }, + { + "epoch": 0.057380789068944574, + "grad_norm": 11.812576295148537, + "learning_rate": 4.999329996065012e-05, + "loss": 2.5399, + "mean_token_accuracy": 0.42891712188720704, + "step": 56970 + }, + { + "epoch": 0.05738582512204875, + "grad_norm": 12.062505926389953, + "learning_rate": 4.999329081367002e-05, + "loss": 2.1792, + "mean_token_accuracy": 0.4482758641242981, + "step": 56975 + }, + { + "epoch": 0.05739086117515292, + "grad_norm": 11.09370697907473, + "learning_rate": 4.9993281660451324e-05, + "loss": 2.4699, + "mean_token_accuracy": 0.3999999940395355, + "step": 56980 + }, + { + "epoch": 0.057395897228257095, + "grad_norm": 13.293259854541144, + "learning_rate": 4.999327250099403e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.4551724076271057, + "step": 56985 + }, + { + "epoch": 0.05740093328136126, + "grad_norm": 12.8649436675316, + "learning_rate": 4.999326333529815e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.41034482717514037, + "step": 56990 + }, + { + "epoch": 0.057405969334465436, + "grad_norm": 9.749384495903454, + "learning_rate": 4.999325416336367e-05, + "loss": 2.7343, + "mean_token_accuracy": 0.40689654350280763, + "step": 56995 + }, + { + "epoch": 0.05741100538756961, + "grad_norm": 15.035200528519873, + "learning_rate": 4.99932449851906e-05, + "loss": 3.0004, + "mean_token_accuracy": 0.37241379022598264, + "step": 57000 + }, + { + "epoch": 0.05741604144067378, + "grad_norm": 14.398701262274576, + "learning_rate": 4.9993235800778955e-05, + "loss": 2.6973, + "mean_token_accuracy": 0.3310344755649567, + "step": 57005 + }, + { + "epoch": 0.05742107749377796, + "grad_norm": 17.007631412929175, + "learning_rate": 4.999322661012872e-05, + "loss": 2.6348, + "mean_token_accuracy": 0.3517241388559341, + "step": 57010 + }, + { + "epoch": 0.05742611354688213, + "grad_norm": 11.797415074860707, + "learning_rate": 4.9993217413239905e-05, + "loss": 2.688, + "mean_token_accuracy": 0.38130671381950376, + "step": 57015 + }, + { + "epoch": 0.057431149599986304, + "grad_norm": 13.729055890320819, + "learning_rate": 4.9993208210112516e-05, + "loss": 2.2852, + "mean_token_accuracy": 0.45862067937850953, + "step": 57020 + }, + { + "epoch": 0.05743618565309047, + "grad_norm": 13.161836475681744, + "learning_rate": 4.9993199000746546e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.45517241954803467, + "step": 57025 + }, + { + "epoch": 0.057441221706194645, + "grad_norm": 11.79899314016646, + "learning_rate": 4.9993189785142016e-05, + "loss": 2.7679, + "mean_token_accuracy": 0.4, + "step": 57030 + }, + { + "epoch": 0.05744625775929882, + "grad_norm": 13.946650389403713, + "learning_rate": 4.9993180563298905e-05, + "loss": 2.8332, + "mean_token_accuracy": 0.3896551728248596, + "step": 57035 + }, + { + "epoch": 0.05745129381240299, + "grad_norm": 11.48907705757755, + "learning_rate": 4.9993171335217234e-05, + "loss": 2.5309, + "mean_token_accuracy": 0.44966726899147036, + "step": 57040 + }, + { + "epoch": 0.057456329865507166, + "grad_norm": 11.972960574446068, + "learning_rate": 4.999316210089698e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.3915305495262146, + "step": 57045 + }, + { + "epoch": 0.05746136591861134, + "grad_norm": 11.594613079123697, + "learning_rate": 4.9993152860338185e-05, + "loss": 2.6242, + "mean_token_accuracy": 0.3827586233615875, + "step": 57050 + }, + { + "epoch": 0.057466401971715514, + "grad_norm": 10.357912538196837, + "learning_rate": 4.999314361354082e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.4689655065536499, + "step": 57055 + }, + { + "epoch": 0.05747143802481968, + "grad_norm": 11.731625652063055, + "learning_rate": 4.99931343605049e-05, + "loss": 2.7205, + "mean_token_accuracy": 0.3482758641242981, + "step": 57060 + }, + { + "epoch": 0.057476474077923854, + "grad_norm": 13.112582550156544, + "learning_rate": 4.999312510123042e-05, + "loss": 2.23, + "mean_token_accuracy": 0.4398064225912094, + "step": 57065 + }, + { + "epoch": 0.05748151013102803, + "grad_norm": 11.962309472100149, + "learning_rate": 4.9993115835717395e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.39655172228813174, + "step": 57070 + }, + { + "epoch": 0.0574865461841322, + "grad_norm": 11.37899721418775, + "learning_rate": 4.9993106563965816e-05, + "loss": 2.5962, + "mean_token_accuracy": 0.358620685338974, + "step": 57075 + }, + { + "epoch": 0.057491582237236376, + "grad_norm": 12.192532695870286, + "learning_rate": 4.999309728597569e-05, + "loss": 2.5903, + "mean_token_accuracy": 0.38965516686439516, + "step": 57080 + }, + { + "epoch": 0.05749661829034055, + "grad_norm": 13.706159168820415, + "learning_rate": 4.9993088001747024e-05, + "loss": 2.7902, + "mean_token_accuracy": 0.39655172228813174, + "step": 57085 + }, + { + "epoch": 0.05750165434344472, + "grad_norm": 10.113394495890585, + "learning_rate": 4.999307871127982e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.42758620381355283, + "step": 57090 + }, + { + "epoch": 0.05750669039654889, + "grad_norm": 10.613393503025039, + "learning_rate": 4.9993069414574065e-05, + "loss": 2.1837, + "mean_token_accuracy": 0.4620689630508423, + "step": 57095 + }, + { + "epoch": 0.057511726449653064, + "grad_norm": 12.050989443295737, + "learning_rate": 4.999306011162978e-05, + "loss": 2.6479, + "mean_token_accuracy": 0.3965517163276672, + "step": 57100 + }, + { + "epoch": 0.05751676250275724, + "grad_norm": 15.467484510326555, + "learning_rate": 4.999305080244695e-05, + "loss": 2.5776, + "mean_token_accuracy": 0.36551723480224607, + "step": 57105 + }, + { + "epoch": 0.05752179855586141, + "grad_norm": 12.342780615064195, + "learning_rate": 4.99930414870256e-05, + "loss": 2.7746, + "mean_token_accuracy": 0.3896551728248596, + "step": 57110 + }, + { + "epoch": 0.057526834608965585, + "grad_norm": 11.820305853203543, + "learning_rate": 4.999303216536572e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.37586206793785093, + "step": 57115 + }, + { + "epoch": 0.05753187066206976, + "grad_norm": 13.260307017033362, + "learning_rate": 4.999302283746731e-05, + "loss": 2.5661, + "mean_token_accuracy": 0.42068964838981626, + "step": 57120 + }, + { + "epoch": 0.05753690671517393, + "grad_norm": 11.97019432492, + "learning_rate": 4.9993013503330385e-05, + "loss": 2.6747, + "mean_token_accuracy": 0.38620689511299133, + "step": 57125 + }, + { + "epoch": 0.0575419427682781, + "grad_norm": 13.20226543863521, + "learning_rate": 4.9993004162954934e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.29999999403953553, + "step": 57130 + }, + { + "epoch": 0.05754697882138227, + "grad_norm": 13.67484690078131, + "learning_rate": 4.999299481634096e-05, + "loss": 2.8391, + "mean_token_accuracy": 0.3965517282485962, + "step": 57135 + }, + { + "epoch": 0.05755201487448645, + "grad_norm": 16.107649669539523, + "learning_rate": 4.999298546348847e-05, + "loss": 2.5779, + "mean_token_accuracy": 0.39655172228813174, + "step": 57140 + }, + { + "epoch": 0.05755705092759062, + "grad_norm": 9.318478617579295, + "learning_rate": 4.999297610439747e-05, + "loss": 2.5527, + "mean_token_accuracy": 0.4034482717514038, + "step": 57145 + }, + { + "epoch": 0.057562086980694795, + "grad_norm": 14.936182827666997, + "learning_rate": 4.999296673906796e-05, + "loss": 2.9974, + "mean_token_accuracy": 0.36551724672317504, + "step": 57150 + }, + { + "epoch": 0.05756712303379897, + "grad_norm": 10.308502224547889, + "learning_rate": 4.9992957367499945e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.4068965554237366, + "step": 57155 + }, + { + "epoch": 0.05757215908690314, + "grad_norm": 11.885303757486795, + "learning_rate": 4.9992947989693417e-05, + "loss": 2.6451, + "mean_token_accuracy": 0.4344827592372894, + "step": 57160 + }, + { + "epoch": 0.05757719514000731, + "grad_norm": 12.399681148125469, + "learning_rate": 4.999293860564838e-05, + "loss": 3.4179, + "mean_token_accuracy": 0.3493042945861816, + "step": 57165 + }, + { + "epoch": 0.05758223119311148, + "grad_norm": 11.336888019123098, + "learning_rate": 4.9992929215364855e-05, + "loss": 2.6676, + "mean_token_accuracy": 0.3655172407627106, + "step": 57170 + }, + { + "epoch": 0.057587267246215657, + "grad_norm": 12.544787637386213, + "learning_rate": 4.999291981884283e-05, + "loss": 2.7191, + "mean_token_accuracy": 0.41034482717514037, + "step": 57175 + }, + { + "epoch": 0.05759230329931983, + "grad_norm": 9.074175836044176, + "learning_rate": 4.999291041608231e-05, + "loss": 2.6352, + "mean_token_accuracy": 0.3586206823587418, + "step": 57180 + }, + { + "epoch": 0.057597339352424004, + "grad_norm": 8.90452963227952, + "learning_rate": 4.999290100708329e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.4344827592372894, + "step": 57185 + }, + { + "epoch": 0.05760237540552818, + "grad_norm": 12.793787761560177, + "learning_rate": 4.999289159184579e-05, + "loss": 2.4676, + "mean_token_accuracy": 0.4034482717514038, + "step": 57190 + }, + { + "epoch": 0.05760741145863235, + "grad_norm": 13.610230889253506, + "learning_rate": 4.99928821703698e-05, + "loss": 3.0442, + "mean_token_accuracy": 0.334482753276825, + "step": 57195 + }, + { + "epoch": 0.05761244751173652, + "grad_norm": 12.090811196471845, + "learning_rate": 4.9992872742655324e-05, + "loss": 2.6437, + "mean_token_accuracy": 0.382758629322052, + "step": 57200 + }, + { + "epoch": 0.05761748356484069, + "grad_norm": 15.814008938062297, + "learning_rate": 4.999286330870236e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.44482758045196535, + "step": 57205 + }, + { + "epoch": 0.057622519617944866, + "grad_norm": 11.704183741350963, + "learning_rate": 4.999285386851092e-05, + "loss": 2.7125, + "mean_token_accuracy": 0.36206896901130675, + "step": 57210 + }, + { + "epoch": 0.05762755567104904, + "grad_norm": 9.948972792238973, + "learning_rate": 4.999284442208101e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.441379314661026, + "step": 57215 + }, + { + "epoch": 0.05763259172415321, + "grad_norm": 11.496668419502647, + "learning_rate": 4.999283496941262e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.41379310488700866, + "step": 57220 + }, + { + "epoch": 0.05763762777725739, + "grad_norm": 12.070797776477239, + "learning_rate": 4.9992825510505756e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.4068965554237366, + "step": 57225 + }, + { + "epoch": 0.05764266383036156, + "grad_norm": 9.575953221627952, + "learning_rate": 4.999281604536043e-05, + "loss": 2.195, + "mean_token_accuracy": 0.4413793087005615, + "step": 57230 + }, + { + "epoch": 0.05764769988346573, + "grad_norm": 9.53784330188606, + "learning_rate": 4.9992806573976633e-05, + "loss": 2.3927, + "mean_token_accuracy": 0.38620689511299133, + "step": 57235 + }, + { + "epoch": 0.0576527359365699, + "grad_norm": 11.774543382035088, + "learning_rate": 4.9992797096354374e-05, + "loss": 2.8829, + "mean_token_accuracy": 0.34482758641242983, + "step": 57240 + }, + { + "epoch": 0.057657771989674075, + "grad_norm": 12.537331619660613, + "learning_rate": 4.999278761249365e-05, + "loss": 2.6089, + "mean_token_accuracy": 0.39310344159603117, + "step": 57245 + }, + { + "epoch": 0.05766280804277825, + "grad_norm": 11.2690024717912, + "learning_rate": 4.9992778122394465e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.4068965494632721, + "step": 57250 + }, + { + "epoch": 0.05766784409588242, + "grad_norm": 15.50691234585414, + "learning_rate": 4.999276862605683e-05, + "loss": 2.5386, + "mean_token_accuracy": 0.4068965554237366, + "step": 57255 + }, + { + "epoch": 0.0576728801489866, + "grad_norm": 9.044296447289152, + "learning_rate": 4.999275912348075e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.4400483965873718, + "step": 57260 + }, + { + "epoch": 0.05767791620209077, + "grad_norm": 13.047760548798507, + "learning_rate": 4.99927496146662e-05, + "loss": 2.5269, + "mean_token_accuracy": 0.4103448331356049, + "step": 57265 + }, + { + "epoch": 0.05768295225519494, + "grad_norm": 15.508047282207551, + "learning_rate": 4.999274009961322e-05, + "loss": 2.9094, + "mean_token_accuracy": 0.4000000059604645, + "step": 57270 + }, + { + "epoch": 0.05768798830829911, + "grad_norm": 20.046362997074926, + "learning_rate": 4.9992730578321776e-05, + "loss": 2.6688, + "mean_token_accuracy": 0.3620689570903778, + "step": 57275 + }, + { + "epoch": 0.057693024361403285, + "grad_norm": 13.237840251920288, + "learning_rate": 4.99927210507919e-05, + "loss": 3.0317, + "mean_token_accuracy": 0.3551724135875702, + "step": 57280 + }, + { + "epoch": 0.05769806041450746, + "grad_norm": 14.038161316368479, + "learning_rate": 4.999271151702359e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.42413793206214906, + "step": 57285 + }, + { + "epoch": 0.05770309646761163, + "grad_norm": 13.342003811419074, + "learning_rate": 4.999270197701684e-05, + "loss": 3.2633, + "mean_token_accuracy": 0.3310344785451889, + "step": 57290 + }, + { + "epoch": 0.057708132520715806, + "grad_norm": 10.644195659735548, + "learning_rate": 4.999269243077164e-05, + "loss": 2.9996, + "mean_token_accuracy": 0.35353902280330657, + "step": 57295 + }, + { + "epoch": 0.05771316857381998, + "grad_norm": 11.76812814495945, + "learning_rate": 4.999268287828803e-05, + "loss": 2.5163, + "mean_token_accuracy": 0.41724138259887694, + "step": 57300 + }, + { + "epoch": 0.05771820462692415, + "grad_norm": 11.970856248790895, + "learning_rate": 4.9992673319565974e-05, + "loss": 2.5546, + "mean_token_accuracy": 0.37586207389831544, + "step": 57305 + }, + { + "epoch": 0.05772324068002832, + "grad_norm": 12.144989780344805, + "learning_rate": 4.9992663754605504e-05, + "loss": 2.8722, + "mean_token_accuracy": 0.37241379618644715, + "step": 57310 + }, + { + "epoch": 0.057728276733132494, + "grad_norm": 14.625796514674176, + "learning_rate": 4.9992654183406595e-05, + "loss": 2.6345, + "mean_token_accuracy": 0.3999999940395355, + "step": 57315 + }, + { + "epoch": 0.05773331278623667, + "grad_norm": 12.539052656071842, + "learning_rate": 4.9992644605969275e-05, + "loss": 2.7417, + "mean_token_accuracy": 0.43793103098869324, + "step": 57320 + }, + { + "epoch": 0.05773834883934084, + "grad_norm": 11.27053693874464, + "learning_rate": 4.999263502229353e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4310344815254211, + "step": 57325 + }, + { + "epoch": 0.057743384892445015, + "grad_norm": 12.801565739271933, + "learning_rate": 4.999262543237937e-05, + "loss": 2.6702, + "mean_token_accuracy": 0.35517241060733795, + "step": 57330 + }, + { + "epoch": 0.05774842094554919, + "grad_norm": 10.219399920074073, + "learning_rate": 4.99926158362268e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.43218390345573426, + "step": 57335 + }, + { + "epoch": 0.057753456998653356, + "grad_norm": 11.451479382917173, + "learning_rate": 4.999260623383582e-05, + "loss": 2.1758, + "mean_token_accuracy": 0.45862069725990295, + "step": 57340 + }, + { + "epoch": 0.05775849305175753, + "grad_norm": 11.395144014385595, + "learning_rate": 4.9992596625206425e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.3896551728248596, + "step": 57345 + }, + { + "epoch": 0.057763529104861704, + "grad_norm": 13.331076268662285, + "learning_rate": 4.9992587010338624e-05, + "loss": 3.0526, + "mean_token_accuracy": 0.3758620619773865, + "step": 57350 + }, + { + "epoch": 0.05776856515796588, + "grad_norm": 10.323330932600783, + "learning_rate": 4.999257738923243e-05, + "loss": 2.5068, + "mean_token_accuracy": 0.43103447556495667, + "step": 57355 + }, + { + "epoch": 0.05777360121107005, + "grad_norm": 12.252978462491669, + "learning_rate": 4.999256776188783e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.40000000298023225, + "step": 57360 + }, + { + "epoch": 0.057778637264174225, + "grad_norm": 11.888449251354396, + "learning_rate": 4.9992558128304836e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.4482758641242981, + "step": 57365 + }, + { + "epoch": 0.0577836733172784, + "grad_norm": 11.733392916596229, + "learning_rate": 4.9992548488483444e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.4498487591743469, + "step": 57370 + }, + { + "epoch": 0.057788709370382566, + "grad_norm": 17.838861536484373, + "learning_rate": 4.999253884242366e-05, + "loss": 2.8911, + "mean_token_accuracy": 0.35923645198345183, + "step": 57375 + }, + { + "epoch": 0.05779374542348674, + "grad_norm": 11.747631666976154, + "learning_rate": 4.999252919012549e-05, + "loss": 2.7115, + "mean_token_accuracy": 0.39999999701976774, + "step": 57380 + }, + { + "epoch": 0.05779878147659091, + "grad_norm": 11.147413750359455, + "learning_rate": 4.9992519531588924e-05, + "loss": 2.4241, + "mean_token_accuracy": 0.403448274731636, + "step": 57385 + }, + { + "epoch": 0.05780381752969509, + "grad_norm": 10.121058995504669, + "learning_rate": 4.999250986681398e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.41379311084747317, + "step": 57390 + }, + { + "epoch": 0.05780885358279926, + "grad_norm": 12.908633713141727, + "learning_rate": 4.999250019580065e-05, + "loss": 2.8051, + "mean_token_accuracy": 0.41034482717514037, + "step": 57395 + }, + { + "epoch": 0.057813889635903434, + "grad_norm": 12.442985635396475, + "learning_rate": 4.9992490518548954e-05, + "loss": 2.8611, + "mean_token_accuracy": 0.317241370677948, + "step": 57400 + }, + { + "epoch": 0.05781892568900761, + "grad_norm": 11.799025195749557, + "learning_rate": 4.999248083505886e-05, + "loss": 2.2785, + "mean_token_accuracy": 0.4413793087005615, + "step": 57405 + }, + { + "epoch": 0.057823961742111775, + "grad_norm": 12.040343127760002, + "learning_rate": 4.999247114533041e-05, + "loss": 2.8043, + "mean_token_accuracy": 0.36551724672317504, + "step": 57410 + }, + { + "epoch": 0.05782899779521595, + "grad_norm": 11.121771006601772, + "learning_rate": 4.999246144936359e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.4, + "step": 57415 + }, + { + "epoch": 0.05783403384832012, + "grad_norm": 12.317496100583458, + "learning_rate": 4.999245174715839e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.47241380214691164, + "step": 57420 + }, + { + "epoch": 0.057839069901424296, + "grad_norm": 13.761624120829946, + "learning_rate": 4.9992442038714834e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.4, + "step": 57425 + }, + { + "epoch": 0.05784410595452847, + "grad_norm": 15.254943869277371, + "learning_rate": 4.9992432324032915e-05, + "loss": 2.9177, + "mean_token_accuracy": 0.33448275923728943, + "step": 57430 + }, + { + "epoch": 0.057849142007632644, + "grad_norm": 14.307423581770168, + "learning_rate": 4.9992422603112625e-05, + "loss": 2.3086, + "mean_token_accuracy": 0.4592364549636841, + "step": 57435 + }, + { + "epoch": 0.05785417806073682, + "grad_norm": 13.165139153254296, + "learning_rate": 4.9992412875953985e-05, + "loss": 2.9389, + "mean_token_accuracy": 0.3620689630508423, + "step": 57440 + }, + { + "epoch": 0.057859214113840984, + "grad_norm": 12.932970189358256, + "learning_rate": 4.9992403142557e-05, + "loss": 2.6163, + "mean_token_accuracy": 0.4172413766384125, + "step": 57445 + }, + { + "epoch": 0.05786425016694516, + "grad_norm": 14.332783040365282, + "learning_rate": 4.999239340292165e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4379310369491577, + "step": 57450 + }, + { + "epoch": 0.05786928622004933, + "grad_norm": 10.263400752145794, + "learning_rate": 4.9992383657047956e-05, + "loss": 2.3915, + "mean_token_accuracy": 0.44482759237289426, + "step": 57455 + }, + { + "epoch": 0.057874322273153506, + "grad_norm": 11.723888353251912, + "learning_rate": 4.9992373904935915e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4206896543502808, + "step": 57460 + }, + { + "epoch": 0.05787935832625768, + "grad_norm": 13.521375727847918, + "learning_rate": 4.9992364146585524e-05, + "loss": 2.8984, + "mean_token_accuracy": 0.34482758641242983, + "step": 57465 + }, + { + "epoch": 0.05788439437936185, + "grad_norm": 10.161957143438055, + "learning_rate": 4.9992354381996796e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.4534785270690918, + "step": 57470 + }, + { + "epoch": 0.05788943043246603, + "grad_norm": 12.792552184339858, + "learning_rate": 4.999234461116973e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.36896551549434664, + "step": 57475 + }, + { + "epoch": 0.057894466485570194, + "grad_norm": 15.32101753239313, + "learning_rate": 4.999233483410433e-05, + "loss": 2.6226, + "mean_token_accuracy": 0.358620685338974, + "step": 57480 + }, + { + "epoch": 0.05789950253867437, + "grad_norm": 10.854834194514783, + "learning_rate": 4.99923250508006e-05, + "loss": 2.6908, + "mean_token_accuracy": 0.36896551251411436, + "step": 57485 + }, + { + "epoch": 0.05790453859177854, + "grad_norm": 11.283132763968833, + "learning_rate": 4.999231526125854e-05, + "loss": 2.6612, + "mean_token_accuracy": 0.4137930989265442, + "step": 57490 + }, + { + "epoch": 0.057909574644882715, + "grad_norm": 12.346559381771973, + "learning_rate": 4.9992305465478146e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.3862069010734558, + "step": 57495 + }, + { + "epoch": 0.05791461069798689, + "grad_norm": 11.681939082177314, + "learning_rate": 4.999229566345943e-05, + "loss": 2.0069, + "mean_token_accuracy": 0.501477837562561, + "step": 57500 + }, + { + "epoch": 0.05791964675109106, + "grad_norm": 14.769353369377393, + "learning_rate": 4.9992285855202395e-05, + "loss": 2.7215, + "mean_token_accuracy": 0.3931034505367279, + "step": 57505 + }, + { + "epoch": 0.057924682804195236, + "grad_norm": 12.2244556083009, + "learning_rate": 4.999227604070703e-05, + "loss": 2.4174, + "mean_token_accuracy": 0.4310344815254211, + "step": 57510 + }, + { + "epoch": 0.0579297188572994, + "grad_norm": 10.291547084749382, + "learning_rate": 4.999226621997336e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.45517241954803467, + "step": 57515 + }, + { + "epoch": 0.05793475491040358, + "grad_norm": 12.48981549632761, + "learning_rate": 4.999225639300137e-05, + "loss": 2.8655, + "mean_token_accuracy": 0.3793103456497192, + "step": 57520 + }, + { + "epoch": 0.05793979096350775, + "grad_norm": 11.235521559472508, + "learning_rate": 4.9992246559791076e-05, + "loss": 2.529, + "mean_token_accuracy": 0.37586206793785093, + "step": 57525 + }, + { + "epoch": 0.057944827016611924, + "grad_norm": 12.106284504725325, + "learning_rate": 4.9992236720342464e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.3827586263418198, + "step": 57530 + }, + { + "epoch": 0.0579498630697161, + "grad_norm": 11.635058009067654, + "learning_rate": 4.999222687465555e-05, + "loss": 2.6488, + "mean_token_accuracy": 0.3620689630508423, + "step": 57535 + }, + { + "epoch": 0.05795489912282027, + "grad_norm": 13.136728557414289, + "learning_rate": 4.999221702273034e-05, + "loss": 2.9238, + "mean_token_accuracy": 0.324137932062149, + "step": 57540 + }, + { + "epoch": 0.057959935175924446, + "grad_norm": 12.273208997040715, + "learning_rate": 4.9992207164566824e-05, + "loss": 3.92, + "mean_token_accuracy": 0.2827586218714714, + "step": 57545 + }, + { + "epoch": 0.05796497122902861, + "grad_norm": 16.738844625018178, + "learning_rate": 4.999219730016501e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4068965494632721, + "step": 57550 + }, + { + "epoch": 0.057970007282132786, + "grad_norm": 10.973796224215365, + "learning_rate": 4.999218742952489e-05, + "loss": 3.0449, + "mean_token_accuracy": 0.358620685338974, + "step": 57555 + }, + { + "epoch": 0.05797504333523696, + "grad_norm": 11.081445871501213, + "learning_rate": 4.999217755264649e-05, + "loss": 2.1733, + "mean_token_accuracy": 0.4813067138195038, + "step": 57560 + }, + { + "epoch": 0.057980079388341134, + "grad_norm": 16.197548411130477, + "learning_rate": 4.99921676695298e-05, + "loss": 2.9741, + "mean_token_accuracy": 0.38620689511299133, + "step": 57565 + }, + { + "epoch": 0.05798511544144531, + "grad_norm": 13.843367774552254, + "learning_rate": 4.999215778017482e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.43303084969520567, + "step": 57570 + }, + { + "epoch": 0.05799015149454948, + "grad_norm": 15.754287428143902, + "learning_rate": 4.999214788458156e-05, + "loss": 2.655, + "mean_token_accuracy": 0.41379311084747317, + "step": 57575 + }, + { + "epoch": 0.057995187547653655, + "grad_norm": 11.91576056316204, + "learning_rate": 4.999213798275002e-05, + "loss": 2.6878, + "mean_token_accuracy": 0.36206896901130675, + "step": 57580 + }, + { + "epoch": 0.05800022360075782, + "grad_norm": 14.166664682472806, + "learning_rate": 4.99921280746802e-05, + "loss": 2.5356, + "mean_token_accuracy": 0.3793103456497192, + "step": 57585 + }, + { + "epoch": 0.058005259653861996, + "grad_norm": 21.730674564662785, + "learning_rate": 4.9992118160372103e-05, + "loss": 2.8743, + "mean_token_accuracy": 0.40344826877117157, + "step": 57590 + }, + { + "epoch": 0.05801029570696617, + "grad_norm": 13.078804639144298, + "learning_rate": 4.999210823982573e-05, + "loss": 2.8637, + "mean_token_accuracy": 0.3790078639984131, + "step": 57595 + }, + { + "epoch": 0.05801533176007034, + "grad_norm": 12.663240104256626, + "learning_rate": 4.999209831304109e-05, + "loss": 2.5151, + "mean_token_accuracy": 0.4103448212146759, + "step": 57600 + }, + { + "epoch": 0.05802036781317452, + "grad_norm": 9.719288688943896, + "learning_rate": 4.999208838001818e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.4, + "step": 57605 + }, + { + "epoch": 0.05802540386627869, + "grad_norm": 11.700511829390239, + "learning_rate": 4.9992078440757015e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.44137930274009707, + "step": 57610 + }, + { + "epoch": 0.058030439919382865, + "grad_norm": 13.876858200714418, + "learning_rate": 4.999206849525759e-05, + "loss": 2.8379, + "mean_token_accuracy": 0.36896551847457887, + "step": 57615 + }, + { + "epoch": 0.05803547597248703, + "grad_norm": 12.327747228256067, + "learning_rate": 4.999205854351989e-05, + "loss": 2.8716, + "mean_token_accuracy": 0.38965516686439516, + "step": 57620 + }, + { + "epoch": 0.058040512025591205, + "grad_norm": 17.440115893457662, + "learning_rate": 4.999204858554394e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.43103447556495667, + "step": 57625 + }, + { + "epoch": 0.05804554807869538, + "grad_norm": 12.300303390321904, + "learning_rate": 4.999203862132974e-05, + "loss": 3.0254, + "mean_token_accuracy": 0.33448275923728943, + "step": 57630 + }, + { + "epoch": 0.05805058413179955, + "grad_norm": 10.964151359838722, + "learning_rate": 4.999202865087729e-05, + "loss": 2.3905, + "mean_token_accuracy": 0.46896551847457885, + "step": 57635 + }, + { + "epoch": 0.058055620184903726, + "grad_norm": 9.47427538400915, + "learning_rate": 4.9992018674186585e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.4362976372241974, + "step": 57640 + }, + { + "epoch": 0.0580606562380079, + "grad_norm": 15.123975525876906, + "learning_rate": 4.999200869125764e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.47241379618644713, + "step": 57645 + }, + { + "epoch": 0.058065692291112074, + "grad_norm": 14.146877072745678, + "learning_rate": 4.999199870209045e-05, + "loss": 2.6921, + "mean_token_accuracy": 0.3482758581638336, + "step": 57650 + }, + { + "epoch": 0.05807072834421624, + "grad_norm": 10.985118527789368, + "learning_rate": 4.9991988706685015e-05, + "loss": 2.4065, + "mean_token_accuracy": 0.41034482717514037, + "step": 57655 + }, + { + "epoch": 0.058075764397320415, + "grad_norm": 13.4627334283685, + "learning_rate": 4.999197870504136e-05, + "loss": 2.9795, + "mean_token_accuracy": 0.30689655244350433, + "step": 57660 + }, + { + "epoch": 0.05808080045042459, + "grad_norm": 12.403825298055832, + "learning_rate": 4.999196869715945e-05, + "loss": 3.1009, + "mean_token_accuracy": 0.33448275923728943, + "step": 57665 + }, + { + "epoch": 0.05808583650352876, + "grad_norm": 10.995858145357698, + "learning_rate": 4.999195868303932e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.45172414779663084, + "step": 57670 + }, + { + "epoch": 0.058090872556632936, + "grad_norm": 10.302747809018088, + "learning_rate": 4.9991948662680966e-05, + "loss": 2.3699, + "mean_token_accuracy": 0.43103448748588563, + "step": 57675 + }, + { + "epoch": 0.05809590860973711, + "grad_norm": 10.87509910651572, + "learning_rate": 4.9991938636084374e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.4034482777118683, + "step": 57680 + }, + { + "epoch": 0.05810094466284128, + "grad_norm": 12.901040367542265, + "learning_rate": 4.999192860324957e-05, + "loss": 2.4967, + "mean_token_accuracy": 0.3586206942796707, + "step": 57685 + }, + { + "epoch": 0.05810598071594545, + "grad_norm": 10.36413112704375, + "learning_rate": 4.9991918564176534e-05, + "loss": 2.7469, + "mean_token_accuracy": 0.4068965494632721, + "step": 57690 + }, + { + "epoch": 0.058111016769049624, + "grad_norm": 14.56691083345873, + "learning_rate": 4.999190851886529e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.43793103098869324, + "step": 57695 + }, + { + "epoch": 0.0581160528221538, + "grad_norm": 12.067074123999799, + "learning_rate": 4.999189846731583e-05, + "loss": 2.4572, + "mean_token_accuracy": 0.42758620381355283, + "step": 57700 + }, + { + "epoch": 0.05812108887525797, + "grad_norm": 10.259203268914284, + "learning_rate": 4.999188840952816e-05, + "loss": 2.8742, + "mean_token_accuracy": 0.35862068831920624, + "step": 57705 + }, + { + "epoch": 0.058126124928362145, + "grad_norm": 10.825345728105177, + "learning_rate": 4.999187834550228e-05, + "loss": 2.3084, + "mean_token_accuracy": 0.4413793087005615, + "step": 57710 + }, + { + "epoch": 0.05813116098146632, + "grad_norm": 12.591825416151927, + "learning_rate": 4.9991868275238176e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.4, + "step": 57715 + }, + { + "epoch": 0.05813619703457049, + "grad_norm": 12.088768302424826, + "learning_rate": 4.9991858198735895e-05, + "loss": 2.7255, + "mean_token_accuracy": 0.38275861740112305, + "step": 57720 + }, + { + "epoch": 0.05814123308767466, + "grad_norm": 10.785434828475342, + "learning_rate": 4.99918481159954e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.37586206793785093, + "step": 57725 + }, + { + "epoch": 0.05814626914077883, + "grad_norm": 10.54794844790711, + "learning_rate": 4.999183802701671e-05, + "loss": 2.9022, + "mean_token_accuracy": 0.38620688319206237, + "step": 57730 + }, + { + "epoch": 0.05815130519388301, + "grad_norm": 16.51180580278284, + "learning_rate": 4.999182793179982e-05, + "loss": 2.2915, + "mean_token_accuracy": 0.4206896543502808, + "step": 57735 + }, + { + "epoch": 0.05815634124698718, + "grad_norm": 28.158845464716098, + "learning_rate": 4.999181783034474e-05, + "loss": 2.8655, + "mean_token_accuracy": 0.42758621871471403, + "step": 57740 + }, + { + "epoch": 0.058161377300091355, + "grad_norm": 10.648634792828563, + "learning_rate": 4.999180772265148e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.41203871965408323, + "step": 57745 + }, + { + "epoch": 0.05816641335319553, + "grad_norm": 13.731288030418478, + "learning_rate": 4.999179760872002e-05, + "loss": 2.4154, + "mean_token_accuracy": 0.42068964838981626, + "step": 57750 + }, + { + "epoch": 0.0581714494062997, + "grad_norm": 13.008247167496366, + "learning_rate": 4.9991787488550385e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.4328493714332581, + "step": 57755 + }, + { + "epoch": 0.05817648545940387, + "grad_norm": 17.049905067474427, + "learning_rate": 4.999177736214256e-05, + "loss": 2.2495, + "mean_token_accuracy": 0.4689655125141144, + "step": 57760 + }, + { + "epoch": 0.05818152151250804, + "grad_norm": 15.982037971520343, + "learning_rate": 4.999176722949656e-05, + "loss": 3.2155, + "mean_token_accuracy": 0.31724137961864474, + "step": 57765 + }, + { + "epoch": 0.05818655756561222, + "grad_norm": 13.135824375298366, + "learning_rate": 4.999175709061239e-05, + "loss": 3.0663, + "mean_token_accuracy": 0.37241379618644715, + "step": 57770 + }, + { + "epoch": 0.05819159361871639, + "grad_norm": 12.334039478210656, + "learning_rate": 4.9991746945490035e-05, + "loss": 2.6428, + "mean_token_accuracy": 0.3620689630508423, + "step": 57775 + }, + { + "epoch": 0.058196629671820564, + "grad_norm": 13.47687592940928, + "learning_rate": 4.999173679412952e-05, + "loss": 2.7659, + "mean_token_accuracy": 0.3551724076271057, + "step": 57780 + }, + { + "epoch": 0.05820166572492474, + "grad_norm": 11.064908352222812, + "learning_rate": 4.999172663653084e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.49655171632766726, + "step": 57785 + }, + { + "epoch": 0.05820670177802891, + "grad_norm": 14.19722561308654, + "learning_rate": 4.999171647269399e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.4, + "step": 57790 + }, + { + "epoch": 0.05821173783113308, + "grad_norm": 13.1108835999998, + "learning_rate": 4.999170630261898e-05, + "loss": 2.6319, + "mean_token_accuracy": 0.4068965494632721, + "step": 57795 + }, + { + "epoch": 0.05821677388423725, + "grad_norm": 11.90530903715145, + "learning_rate": 4.999169612630581e-05, + "loss": 2.2424, + "mean_token_accuracy": 0.4275861978530884, + "step": 57800 + }, + { + "epoch": 0.058221809937341426, + "grad_norm": 16.733961714294786, + "learning_rate": 4.999168594375449e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.398064124584198, + "step": 57805 + }, + { + "epoch": 0.0582268459904456, + "grad_norm": 11.639555534171263, + "learning_rate": 4.999167575496501e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.3517241418361664, + "step": 57810 + }, + { + "epoch": 0.058231882043549774, + "grad_norm": 11.754040016050185, + "learning_rate": 4.999166555993738e-05, + "loss": 2.9862, + "mean_token_accuracy": 0.3551724165678024, + "step": 57815 + }, + { + "epoch": 0.05823691809665395, + "grad_norm": 12.054007186036795, + "learning_rate": 4.9991655358671603e-05, + "loss": 2.6064, + "mean_token_accuracy": 0.4103448212146759, + "step": 57820 + }, + { + "epoch": 0.05824195414975812, + "grad_norm": 16.254996743767766, + "learning_rate": 4.999164515116769e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.45517241954803467, + "step": 57825 + }, + { + "epoch": 0.05824699020286229, + "grad_norm": 13.008722324888442, + "learning_rate": 4.999163493742563e-05, + "loss": 2.9736, + "mean_token_accuracy": 0.32413792610168457, + "step": 57830 + }, + { + "epoch": 0.05825202625596646, + "grad_norm": 9.588482074195866, + "learning_rate": 4.999162471744543e-05, + "loss": 2.6122, + "mean_token_accuracy": 0.4034482717514038, + "step": 57835 + }, + { + "epoch": 0.058257062309070635, + "grad_norm": 10.654409263734909, + "learning_rate": 4.999161449122709e-05, + "loss": 2.6545, + "mean_token_accuracy": 0.4379310429096222, + "step": 57840 + }, + { + "epoch": 0.05826209836217481, + "grad_norm": 13.525322120422883, + "learning_rate": 4.999160425877063e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.4310344815254211, + "step": 57845 + }, + { + "epoch": 0.05826713441527898, + "grad_norm": 11.672146926890395, + "learning_rate": 4.9991594020076024e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.4172413766384125, + "step": 57850 + }, + { + "epoch": 0.05827217046838316, + "grad_norm": 13.544392515386573, + "learning_rate": 4.9991583775143294e-05, + "loss": 2.0403, + "mean_token_accuracy": 0.5103448152542114, + "step": 57855 + }, + { + "epoch": 0.05827720652148733, + "grad_norm": 12.36115802947356, + "learning_rate": 4.9991573523972444e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.4137930989265442, + "step": 57860 + }, + { + "epoch": 0.0582822425745915, + "grad_norm": 15.13027655154658, + "learning_rate": 4.999156326656347e-05, + "loss": 2.531, + "mean_token_accuracy": 0.3758620619773865, + "step": 57865 + }, + { + "epoch": 0.05828727862769567, + "grad_norm": 15.975172065166893, + "learning_rate": 4.999155300291638e-05, + "loss": 2.9955, + "mean_token_accuracy": 0.31724137663841245, + "step": 57870 + }, + { + "epoch": 0.058292314680799845, + "grad_norm": 14.975872428801766, + "learning_rate": 4.9991542733031175e-05, + "loss": 2.8214, + "mean_token_accuracy": 0.3517241418361664, + "step": 57875 + }, + { + "epoch": 0.05829735073390402, + "grad_norm": 20.413676515678013, + "learning_rate": 4.9991532456907846e-05, + "loss": 3.0033, + "mean_token_accuracy": 0.3655172437429428, + "step": 57880 + }, + { + "epoch": 0.05830238678700819, + "grad_norm": 10.05592673239935, + "learning_rate": 4.999152217454642e-05, + "loss": 2.403, + "mean_token_accuracy": 0.41724138259887694, + "step": 57885 + }, + { + "epoch": 0.058307422840112366, + "grad_norm": 14.11237520890565, + "learning_rate": 4.9991511885946875e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.42758620977401735, + "step": 57890 + }, + { + "epoch": 0.05831245889321654, + "grad_norm": 14.223319435263825, + "learning_rate": 4.999150159110923e-05, + "loss": 3.6761, + "mean_token_accuracy": 0.2965517222881317, + "step": 57895 + }, + { + "epoch": 0.05831749494632071, + "grad_norm": 20.94054315704241, + "learning_rate": 4.9991491290033486e-05, + "loss": 2.7996, + "mean_token_accuracy": 0.3896551787853241, + "step": 57900 + }, + { + "epoch": 0.05832253099942488, + "grad_norm": 10.271261499703916, + "learning_rate": 4.999148098271963e-05, + "loss": 2.8756, + "mean_token_accuracy": 0.38620689511299133, + "step": 57905 + }, + { + "epoch": 0.058327567052529054, + "grad_norm": 12.064438465710602, + "learning_rate": 4.99914706691677e-05, + "loss": 2.5901, + "mean_token_accuracy": 0.42413793206214906, + "step": 57910 + }, + { + "epoch": 0.05833260310563323, + "grad_norm": 12.995473435766087, + "learning_rate": 4.999146034937766e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.36551723778247835, + "step": 57915 + }, + { + "epoch": 0.0583376391587374, + "grad_norm": 13.863187038851837, + "learning_rate": 4.999145002334954e-05, + "loss": 2.329, + "mean_token_accuracy": 0.4137930989265442, + "step": 57920 + }, + { + "epoch": 0.058342675211841576, + "grad_norm": 11.49930694142943, + "learning_rate": 4.999143969108332e-05, + "loss": 3.1369, + "mean_token_accuracy": 0.31379310190677645, + "step": 57925 + }, + { + "epoch": 0.05834771126494575, + "grad_norm": 10.141836428712791, + "learning_rate": 4.9991429352579026e-05, + "loss": 2.7194, + "mean_token_accuracy": 0.38747731447219846, + "step": 57930 + }, + { + "epoch": 0.058352747318049916, + "grad_norm": 16.391698204565703, + "learning_rate": 4.999141900783665e-05, + "loss": 2.6354, + "mean_token_accuracy": 0.3965517163276672, + "step": 57935 + }, + { + "epoch": 0.05835778337115409, + "grad_norm": 13.077879931804969, + "learning_rate": 4.999140865685619e-05, + "loss": 2.697, + "mean_token_accuracy": 0.41379310488700866, + "step": 57940 + }, + { + "epoch": 0.058362819424258264, + "grad_norm": 11.990448768023217, + "learning_rate": 4.999139829963765e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.39310344457626345, + "step": 57945 + }, + { + "epoch": 0.05836785547736244, + "grad_norm": 12.498307081894962, + "learning_rate": 4.9991387936181046e-05, + "loss": 2.5416, + "mean_token_accuracy": 0.41379310488700866, + "step": 57950 + }, + { + "epoch": 0.05837289153046661, + "grad_norm": 10.477484880842788, + "learning_rate": 4.9991377566486366e-05, + "loss": 2.9785, + "mean_token_accuracy": 0.37241379022598264, + "step": 57955 + }, + { + "epoch": 0.058377927583570785, + "grad_norm": 15.894034534519928, + "learning_rate": 4.999136719055363e-05, + "loss": 2.6587, + "mean_token_accuracy": 0.38965516686439516, + "step": 57960 + }, + { + "epoch": 0.05838296363667496, + "grad_norm": 14.148774096539112, + "learning_rate": 4.999135680838281e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.39310344457626345, + "step": 57965 + }, + { + "epoch": 0.058387999689779126, + "grad_norm": 10.181345441465206, + "learning_rate": 4.999134641997394e-05, + "loss": 2.7742, + "mean_token_accuracy": 0.35862069129943847, + "step": 57970 + }, + { + "epoch": 0.0583930357428833, + "grad_norm": 11.195069592652244, + "learning_rate": 4.999133602532701e-05, + "loss": 2.9507, + "mean_token_accuracy": 0.358620685338974, + "step": 57975 + }, + { + "epoch": 0.05839807179598747, + "grad_norm": 11.86939297354784, + "learning_rate": 4.9991325624442025e-05, + "loss": 2.7666, + "mean_token_accuracy": 0.3724137842655182, + "step": 57980 + }, + { + "epoch": 0.05840310784909165, + "grad_norm": 12.89826127961735, + "learning_rate": 4.9991315217318986e-05, + "loss": 2.7215, + "mean_token_accuracy": 0.36896551847457887, + "step": 57985 + }, + { + "epoch": 0.05840814390219582, + "grad_norm": 13.718473336137832, + "learning_rate": 4.9991304803957894e-05, + "loss": 2.7096, + "mean_token_accuracy": 0.3620689630508423, + "step": 57990 + }, + { + "epoch": 0.058413179955299994, + "grad_norm": 11.34960453657506, + "learning_rate": 4.999129438435876e-05, + "loss": 2.5112, + "mean_token_accuracy": 0.42934059500694277, + "step": 57995 + }, + { + "epoch": 0.05841821600840417, + "grad_norm": 11.598299525067583, + "learning_rate": 4.9991283958521576e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.43103447556495667, + "step": 58000 + }, + { + "epoch": 0.058423252061508335, + "grad_norm": 9.250771122393951, + "learning_rate": 4.9991273526446355e-05, + "loss": 2.4969, + "mean_token_accuracy": 0.39310344457626345, + "step": 58005 + }, + { + "epoch": 0.05842828811461251, + "grad_norm": 16.090905288244663, + "learning_rate": 4.9991263088133096e-05, + "loss": 2.8318, + "mean_token_accuracy": 0.3517241358757019, + "step": 58010 + }, + { + "epoch": 0.05843332416771668, + "grad_norm": 11.153094155757385, + "learning_rate": 4.999125264358179e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.42413792610168455, + "step": 58015 + }, + { + "epoch": 0.058438360220820856, + "grad_norm": 10.082462112520616, + "learning_rate": 4.999124219279246e-05, + "loss": 2.2097, + "mean_token_accuracy": 0.41724138259887694, + "step": 58020 + }, + { + "epoch": 0.05844339627392503, + "grad_norm": 11.85228843738946, + "learning_rate": 4.99912317357651e-05, + "loss": 2.772, + "mean_token_accuracy": 0.37241379022598264, + "step": 58025 + }, + { + "epoch": 0.058448432327029204, + "grad_norm": 11.813587319919813, + "learning_rate": 4.9991221272499716e-05, + "loss": 2.114, + "mean_token_accuracy": 0.46551724076271056, + "step": 58030 + }, + { + "epoch": 0.05845346838013338, + "grad_norm": 12.360024985062351, + "learning_rate": 4.9991210802996305e-05, + "loss": 2.6769, + "mean_token_accuracy": 0.3896551728248596, + "step": 58035 + }, + { + "epoch": 0.058458504433237544, + "grad_norm": 11.953774505596208, + "learning_rate": 4.9991200327254876e-05, + "loss": 2.5199, + "mean_token_accuracy": 0.4, + "step": 58040 + }, + { + "epoch": 0.05846354048634172, + "grad_norm": 14.770105386271911, + "learning_rate": 4.999118984527542e-05, + "loss": 3.0267, + "mean_token_accuracy": 0.31034482419490816, + "step": 58045 + }, + { + "epoch": 0.05846857653944589, + "grad_norm": 13.226518206451283, + "learning_rate": 4.999117935705796e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.4068965494632721, + "step": 58050 + }, + { + "epoch": 0.058473612592550066, + "grad_norm": 11.8642711839088, + "learning_rate": 4.999116886260247e-05, + "loss": 2.6854, + "mean_token_accuracy": 0.4, + "step": 58055 + }, + { + "epoch": 0.05847864864565424, + "grad_norm": 13.057276474699377, + "learning_rate": 4.999115836190899e-05, + "loss": 2.7641, + "mean_token_accuracy": 0.41203871965408323, + "step": 58060 + }, + { + "epoch": 0.05848368469875841, + "grad_norm": 11.440452499953622, + "learning_rate": 4.999114785497749e-05, + "loss": 2.5918, + "mean_token_accuracy": 0.3793103456497192, + "step": 58065 + }, + { + "epoch": 0.05848872075186259, + "grad_norm": 10.868024124000197, + "learning_rate": 4.9991137341807994e-05, + "loss": 2.95, + "mean_token_accuracy": 0.39310343861579894, + "step": 58070 + }, + { + "epoch": 0.058493756804966754, + "grad_norm": 11.160758586213356, + "learning_rate": 4.99911268224005e-05, + "loss": 2.7233, + "mean_token_accuracy": 0.4, + "step": 58075 + }, + { + "epoch": 0.05849879285807093, + "grad_norm": 12.002629460198245, + "learning_rate": 4.9991116296755e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.4068965554237366, + "step": 58080 + }, + { + "epoch": 0.0585038289111751, + "grad_norm": 12.41181055114698, + "learning_rate": 4.999110576487151e-05, + "loss": 2.7508, + "mean_token_accuracy": 0.3620689570903778, + "step": 58085 + }, + { + "epoch": 0.058508864964279275, + "grad_norm": 16.269076589220752, + "learning_rate": 4.999109522675002e-05, + "loss": 2.5703, + "mean_token_accuracy": 0.3896551728248596, + "step": 58090 + }, + { + "epoch": 0.05851390101738345, + "grad_norm": 12.44436747315813, + "learning_rate": 4.9991084682390545e-05, + "loss": 2.5524, + "mean_token_accuracy": 0.3827586233615875, + "step": 58095 + }, + { + "epoch": 0.05851893707048762, + "grad_norm": 17.237771108475272, + "learning_rate": 4.999107413179309e-05, + "loss": 2.8552, + "mean_token_accuracy": 0.324137932062149, + "step": 58100 + }, + { + "epoch": 0.058523973123591796, + "grad_norm": 13.402741473614277, + "learning_rate": 4.999106357495764e-05, + "loss": 2.6562, + "mean_token_accuracy": 0.37931033968925476, + "step": 58105 + }, + { + "epoch": 0.05852900917669596, + "grad_norm": 11.566139204000446, + "learning_rate": 4.999105301188421e-05, + "loss": 2.6592, + "mean_token_accuracy": 0.3793103456497192, + "step": 58110 + }, + { + "epoch": 0.05853404522980014, + "grad_norm": 10.331620575840399, + "learning_rate": 4.999104244257282e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.3931034505367279, + "step": 58115 + }, + { + "epoch": 0.05853908128290431, + "grad_norm": 9.523379002408879, + "learning_rate": 4.999103186702344e-05, + "loss": 2.0584, + "mean_token_accuracy": 0.5008620619773865, + "step": 58120 + }, + { + "epoch": 0.058544117336008485, + "grad_norm": 14.473476228343763, + "learning_rate": 4.9991021285236096e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.417241370677948, + "step": 58125 + }, + { + "epoch": 0.05854915338911266, + "grad_norm": 11.399740809464562, + "learning_rate": 4.999101069721078e-05, + "loss": 2.6497, + "mean_token_accuracy": 0.3551724135875702, + "step": 58130 + }, + { + "epoch": 0.05855418944221683, + "grad_norm": 12.546754776768667, + "learning_rate": 4.999100010294749e-05, + "loss": 2.5099, + "mean_token_accuracy": 0.3620689630508423, + "step": 58135 + }, + { + "epoch": 0.058559225495321006, + "grad_norm": 12.632080453992177, + "learning_rate": 4.999098950244625e-05, + "loss": 2.7654, + "mean_token_accuracy": 0.4034482717514038, + "step": 58140 + }, + { + "epoch": 0.05856426154842517, + "grad_norm": 12.261728765929192, + "learning_rate": 4.9990978895707055e-05, + "loss": 3.2115, + "mean_token_accuracy": 0.341379314661026, + "step": 58145 + }, + { + "epoch": 0.058569297601529346, + "grad_norm": 11.980163944109137, + "learning_rate": 4.999096828272989e-05, + "loss": 2.8505, + "mean_token_accuracy": 0.3551724195480347, + "step": 58150 + }, + { + "epoch": 0.05857433365463352, + "grad_norm": 14.016943305420114, + "learning_rate": 4.9990957663514774e-05, + "loss": 2.4265, + "mean_token_accuracy": 0.43103447556495667, + "step": 58155 + }, + { + "epoch": 0.058579369707737694, + "grad_norm": 12.126653259054368, + "learning_rate": 4.999094703806171e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.38965516686439516, + "step": 58160 + }, + { + "epoch": 0.05858440576084187, + "grad_norm": 11.905316722668148, + "learning_rate": 4.999093640637069e-05, + "loss": 2.7617, + "mean_token_accuracy": 0.36551723480224607, + "step": 58165 + }, + { + "epoch": 0.05858944181394604, + "grad_norm": 12.682281901996925, + "learning_rate": 4.999092576844173e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.47931033968925474, + "step": 58170 + }, + { + "epoch": 0.058594477867050215, + "grad_norm": 12.867172448281504, + "learning_rate": 4.999091512427483e-05, + "loss": 2.6203, + "mean_token_accuracy": 0.3724137932062149, + "step": 58175 + }, + { + "epoch": 0.05859951392015438, + "grad_norm": 9.872803945522755, + "learning_rate": 4.999090447386999e-05, + "loss": 2.246, + "mean_token_accuracy": 0.41379311084747317, + "step": 58180 + }, + { + "epoch": 0.058604549973258556, + "grad_norm": 13.591890382368568, + "learning_rate": 4.999089381722722e-05, + "loss": 2.828, + "mean_token_accuracy": 0.3965517163276672, + "step": 58185 + }, + { + "epoch": 0.05860958602636273, + "grad_norm": 12.799307227230408, + "learning_rate": 4.99908831543465e-05, + "loss": 2.4925, + "mean_token_accuracy": 0.4379310369491577, + "step": 58190 + }, + { + "epoch": 0.0586146220794669, + "grad_norm": 13.631473350222166, + "learning_rate": 4.9990872485227865e-05, + "loss": 2.4108, + "mean_token_accuracy": 0.4344827651977539, + "step": 58195 + }, + { + "epoch": 0.05861965813257108, + "grad_norm": 12.171327677650542, + "learning_rate": 4.999086180987129e-05, + "loss": 2.7575, + "mean_token_accuracy": 0.37241379022598264, + "step": 58200 + }, + { + "epoch": 0.05862469418567525, + "grad_norm": 11.03572367657223, + "learning_rate": 4.9990851128276806e-05, + "loss": 2.5818, + "mean_token_accuracy": 0.4034482717514038, + "step": 58205 + }, + { + "epoch": 0.058629730238779425, + "grad_norm": 11.281731532225864, + "learning_rate": 4.9990840440444385e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.44482757449150084, + "step": 58210 + }, + { + "epoch": 0.05863476629188359, + "grad_norm": 11.412665405880785, + "learning_rate": 4.9990829746374054e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.43103448748588563, + "step": 58215 + }, + { + "epoch": 0.058639802344987765, + "grad_norm": 11.682116446048864, + "learning_rate": 4.999081904606581e-05, + "loss": 2.6572, + "mean_token_accuracy": 0.37586206793785093, + "step": 58220 + }, + { + "epoch": 0.05864483839809194, + "grad_norm": 14.395120572704545, + "learning_rate": 4.999080833951964e-05, + "loss": 2.9651, + "mean_token_accuracy": 0.3551724076271057, + "step": 58225 + }, + { + "epoch": 0.05864987445119611, + "grad_norm": 12.194332282673388, + "learning_rate": 4.9990797626735576e-05, + "loss": 2.6309, + "mean_token_accuracy": 0.4, + "step": 58230 + }, + { + "epoch": 0.05865491050430029, + "grad_norm": 13.362911112583335, + "learning_rate": 4.99907869077136e-05, + "loss": 2.8832, + "mean_token_accuracy": 0.32068965435028074, + "step": 58235 + }, + { + "epoch": 0.05865994655740446, + "grad_norm": 21.273003475449, + "learning_rate": 4.999077618245372e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.4034482717514038, + "step": 58240 + }, + { + "epoch": 0.058664982610508634, + "grad_norm": 12.925637928370348, + "learning_rate": 4.999076545095594e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.38620689511299133, + "step": 58245 + }, + { + "epoch": 0.0586700186636128, + "grad_norm": 13.418241772915989, + "learning_rate": 4.9990754713220265e-05, + "loss": 2.9526, + "mean_token_accuracy": 0.3413793116807938, + "step": 58250 + }, + { + "epoch": 0.058675054716716975, + "grad_norm": 11.212893672015234, + "learning_rate": 4.999074396924668e-05, + "loss": 2.7394, + "mean_token_accuracy": 0.38620689511299133, + "step": 58255 + }, + { + "epoch": 0.05868009076982115, + "grad_norm": 13.179140378005286, + "learning_rate": 4.9990733219035225e-05, + "loss": 2.7069, + "mean_token_accuracy": 0.37241379618644715, + "step": 58260 + }, + { + "epoch": 0.05868512682292532, + "grad_norm": 12.43030802440109, + "learning_rate": 4.9990722462585864e-05, + "loss": 2.6006, + "mean_token_accuracy": 0.36896551847457887, + "step": 58265 + }, + { + "epoch": 0.058690162876029496, + "grad_norm": 14.2593349061398, + "learning_rate": 4.999071169989862e-05, + "loss": 2.6138, + "mean_token_accuracy": 0.4068965494632721, + "step": 58270 + }, + { + "epoch": 0.05869519892913367, + "grad_norm": 10.21676172874777, + "learning_rate": 4.999070093097351e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.4137930989265442, + "step": 58275 + }, + { + "epoch": 0.058700234982237844, + "grad_norm": 15.391777245551568, + "learning_rate": 4.9990690155810506e-05, + "loss": 2.6252, + "mean_token_accuracy": 0.41724138259887694, + "step": 58280 + }, + { + "epoch": 0.05870527103534201, + "grad_norm": 15.132827438776575, + "learning_rate": 4.999067937440963e-05, + "loss": 2.8262, + "mean_token_accuracy": 0.4034482777118683, + "step": 58285 + }, + { + "epoch": 0.058710307088446184, + "grad_norm": 10.65237468266514, + "learning_rate": 4.9990668586770874e-05, + "loss": 2.1596, + "mean_token_accuracy": 0.44482758045196535, + "step": 58290 + }, + { + "epoch": 0.05871534314155036, + "grad_norm": 11.283660606828796, + "learning_rate": 4.9990657792894245e-05, + "loss": 2.6391, + "mean_token_accuracy": 0.4172413766384125, + "step": 58295 + }, + { + "epoch": 0.05872037919465453, + "grad_norm": 12.993867137727007, + "learning_rate": 4.999064699277976e-05, + "loss": 2.4458, + "mean_token_accuracy": 0.4137930989265442, + "step": 58300 + }, + { + "epoch": 0.058725415247758705, + "grad_norm": 14.4344808791683, + "learning_rate": 4.99906361864274e-05, + "loss": 2.7955, + "mean_token_accuracy": 0.38275861740112305, + "step": 58305 + }, + { + "epoch": 0.05873045130086288, + "grad_norm": 9.9734801806251, + "learning_rate": 4.999062537383718e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.42068964838981626, + "step": 58310 + }, + { + "epoch": 0.05873548735396705, + "grad_norm": 12.078830607747605, + "learning_rate": 4.99906145550091e-05, + "loss": 3.6281, + "mean_token_accuracy": 0.31724137961864474, + "step": 58315 + }, + { + "epoch": 0.05874052340707122, + "grad_norm": 11.67002858074103, + "learning_rate": 4.999060372994317e-05, + "loss": 2.5244, + "mean_token_accuracy": 0.4034482777118683, + "step": 58320 + }, + { + "epoch": 0.058745559460175394, + "grad_norm": 14.060579593033284, + "learning_rate": 4.9990592898639385e-05, + "loss": 2.4573, + "mean_token_accuracy": 0.4052026629447937, + "step": 58325 + }, + { + "epoch": 0.05875059551327957, + "grad_norm": 10.370626652725292, + "learning_rate": 4.9990582061097746e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.4, + "step": 58330 + }, + { + "epoch": 0.05875563156638374, + "grad_norm": 11.677830279831795, + "learning_rate": 4.999057121731826e-05, + "loss": 2.307, + "mean_token_accuracy": 0.3965517163276672, + "step": 58335 + }, + { + "epoch": 0.058760667619487915, + "grad_norm": 11.009148218948424, + "learning_rate": 4.9990560367300936e-05, + "loss": 2.8369, + "mean_token_accuracy": 0.3827586203813553, + "step": 58340 + }, + { + "epoch": 0.05876570367259209, + "grad_norm": 14.235536079942097, + "learning_rate": 4.999054951104578e-05, + "loss": 2.5526, + "mean_token_accuracy": 0.4310344815254211, + "step": 58345 + }, + { + "epoch": 0.05877073972569626, + "grad_norm": 10.210656930119285, + "learning_rate": 4.999053864855277e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.39655172228813174, + "step": 58350 + }, + { + "epoch": 0.05877577577880043, + "grad_norm": 11.769400229235206, + "learning_rate": 4.999052777982193e-05, + "loss": 2.5959, + "mean_token_accuracy": 0.4103448212146759, + "step": 58355 + }, + { + "epoch": 0.0587808118319046, + "grad_norm": 12.651299092601084, + "learning_rate": 4.999051690485326e-05, + "loss": 2.4817, + "mean_token_accuracy": 0.43793103098869324, + "step": 58360 + }, + { + "epoch": 0.05878584788500878, + "grad_norm": 10.99216600684398, + "learning_rate": 4.999050602364675e-05, + "loss": 2.6773, + "mean_token_accuracy": 0.38965516090393065, + "step": 58365 + }, + { + "epoch": 0.05879088393811295, + "grad_norm": 12.900921248889595, + "learning_rate": 4.9990495136202425e-05, + "loss": 2.8438, + "mean_token_accuracy": 0.3620689570903778, + "step": 58370 + }, + { + "epoch": 0.058795919991217124, + "grad_norm": 12.818526582345942, + "learning_rate": 4.999048424252028e-05, + "loss": 3.0079, + "mean_token_accuracy": 0.3448275923728943, + "step": 58375 + }, + { + "epoch": 0.0588009560443213, + "grad_norm": 13.85222009221222, + "learning_rate": 4.999047334260031e-05, + "loss": 2.0867, + "mean_token_accuracy": 0.4988505780696869, + "step": 58380 + }, + { + "epoch": 0.05880599209742547, + "grad_norm": 13.26861068753513, + "learning_rate": 4.9990462436442526e-05, + "loss": 2.1906, + "mean_token_accuracy": 0.4804187178611755, + "step": 58385 + }, + { + "epoch": 0.05881102815052964, + "grad_norm": 16.18407443104682, + "learning_rate": 4.9990451524046926e-05, + "loss": 2.6036, + "mean_token_accuracy": 0.37586206793785093, + "step": 58390 + }, + { + "epoch": 0.05881606420363381, + "grad_norm": 10.280644257513085, + "learning_rate": 4.999044060541352e-05, + "loss": 2.5311, + "mean_token_accuracy": 0.4068965494632721, + "step": 58395 + }, + { + "epoch": 0.058821100256737986, + "grad_norm": 13.714916978082842, + "learning_rate": 4.999042968054231e-05, + "loss": 2.7961, + "mean_token_accuracy": 0.36896551847457887, + "step": 58400 + }, + { + "epoch": 0.05882613630984216, + "grad_norm": 12.940845277379792, + "learning_rate": 4.9990418749433275e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.39310344457626345, + "step": 58405 + }, + { + "epoch": 0.058831172362946334, + "grad_norm": 14.326161257822992, + "learning_rate": 4.999040781208646e-05, + "loss": 2.7674, + "mean_token_accuracy": 0.3724137932062149, + "step": 58410 + }, + { + "epoch": 0.05883620841605051, + "grad_norm": 10.90764893764017, + "learning_rate": 4.9990396868501835e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.39310344457626345, + "step": 58415 + }, + { + "epoch": 0.05884124446915468, + "grad_norm": 13.166823241845668, + "learning_rate": 4.999038591867942e-05, + "loss": 2.7041, + "mean_token_accuracy": 0.4189352631568909, + "step": 58420 + }, + { + "epoch": 0.05884628052225885, + "grad_norm": 16.672581592606456, + "learning_rate": 4.999037496261921e-05, + "loss": 2.729, + "mean_token_accuracy": 0.39655172228813174, + "step": 58425 + }, + { + "epoch": 0.05885131657536302, + "grad_norm": 10.810355758828537, + "learning_rate": 4.999036400032121e-05, + "loss": 3.0283, + "mean_token_accuracy": 0.4068965524435043, + "step": 58430 + }, + { + "epoch": 0.058856352628467196, + "grad_norm": 13.479095774881834, + "learning_rate": 4.999035303178542e-05, + "loss": 3.0113, + "mean_token_accuracy": 0.35172414481639863, + "step": 58435 + }, + { + "epoch": 0.05886138868157137, + "grad_norm": 8.664528726341905, + "learning_rate": 4.999034205701185e-05, + "loss": 2.1789, + "mean_token_accuracy": 0.45517241954803467, + "step": 58440 + }, + { + "epoch": 0.05886642473467554, + "grad_norm": 14.32364779979314, + "learning_rate": 4.99903310760005e-05, + "loss": 2.8923, + "mean_token_accuracy": 0.36551724672317504, + "step": 58445 + }, + { + "epoch": 0.05887146078777972, + "grad_norm": 11.928761527027048, + "learning_rate": 4.999032008875138e-05, + "loss": 2.8976, + "mean_token_accuracy": 0.3689655244350433, + "step": 58450 + }, + { + "epoch": 0.05887649684088389, + "grad_norm": 10.869123869897926, + "learning_rate": 4.999030909526448e-05, + "loss": 2.61, + "mean_token_accuracy": 0.37241379022598264, + "step": 58455 + }, + { + "epoch": 0.05888153289398806, + "grad_norm": 13.104803976666249, + "learning_rate": 4.999029809553981e-05, + "loss": 2.7761, + "mean_token_accuracy": 0.4034482777118683, + "step": 58460 + }, + { + "epoch": 0.05888656894709223, + "grad_norm": 13.322490311816996, + "learning_rate": 4.999028708957737e-05, + "loss": 2.4232, + "mean_token_accuracy": 0.42413792610168455, + "step": 58465 + }, + { + "epoch": 0.058891605000196405, + "grad_norm": 11.526170585399552, + "learning_rate": 4.999027607737716e-05, + "loss": 2.6289, + "mean_token_accuracy": 0.3724137932062149, + "step": 58470 + }, + { + "epoch": 0.05889664105330058, + "grad_norm": 12.589440447052223, + "learning_rate": 4.9990265058939194e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.47241378426551817, + "step": 58475 + }, + { + "epoch": 0.05890167710640475, + "grad_norm": 9.612419431975118, + "learning_rate": 4.9990254034263474e-05, + "loss": 2.2433, + "mean_token_accuracy": 0.4551724076271057, + "step": 58480 + }, + { + "epoch": 0.058906713159508926, + "grad_norm": 10.977731267675512, + "learning_rate": 4.999024300334999e-05, + "loss": 2.492, + "mean_token_accuracy": 0.3827586233615875, + "step": 58485 + }, + { + "epoch": 0.0589117492126131, + "grad_norm": 12.074313522164669, + "learning_rate": 4.999023196619876e-05, + "loss": 2.871, + "mean_token_accuracy": 0.38112522959709166, + "step": 58490 + }, + { + "epoch": 0.05891678526571727, + "grad_norm": 11.14220313225627, + "learning_rate": 4.999022092280977e-05, + "loss": 2.0878, + "mean_token_accuracy": 0.47931034564971925, + "step": 58495 + }, + { + "epoch": 0.05892182131882144, + "grad_norm": 11.298115960860097, + "learning_rate": 4.999020987318304e-05, + "loss": 2.0852, + "mean_token_accuracy": 0.4206896543502808, + "step": 58500 + }, + { + "epoch": 0.058926857371925614, + "grad_norm": 11.542361760487763, + "learning_rate": 4.9990198817318566e-05, + "loss": 2.8396, + "mean_token_accuracy": 0.3586206823587418, + "step": 58505 + }, + { + "epoch": 0.05893189342502979, + "grad_norm": 11.342139309668278, + "learning_rate": 4.999018775521635e-05, + "loss": 2.8269, + "mean_token_accuracy": 0.3758620649576187, + "step": 58510 + }, + { + "epoch": 0.05893692947813396, + "grad_norm": 11.40022600407774, + "learning_rate": 4.9990176686876394e-05, + "loss": 2.5803, + "mean_token_accuracy": 0.3551724135875702, + "step": 58515 + }, + { + "epoch": 0.058941965531238136, + "grad_norm": 11.216281018400004, + "learning_rate": 4.999016561229871e-05, + "loss": 2.5116, + "mean_token_accuracy": 0.41379310488700866, + "step": 58520 + }, + { + "epoch": 0.05894700158434231, + "grad_norm": 11.6746912367142, + "learning_rate": 4.999015453148329e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.4482758641242981, + "step": 58525 + }, + { + "epoch": 0.058952037637446476, + "grad_norm": 11.64503849487109, + "learning_rate": 4.999014344443015e-05, + "loss": 2.6106, + "mean_token_accuracy": 0.3862068891525269, + "step": 58530 + }, + { + "epoch": 0.05895707369055065, + "grad_norm": 13.018170076737198, + "learning_rate": 4.999013235113927e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.45862069725990295, + "step": 58535 + }, + { + "epoch": 0.058962109743654824, + "grad_norm": 10.367387122473016, + "learning_rate": 4.999012125161068e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.4482758641242981, + "step": 58540 + }, + { + "epoch": 0.058967145796759, + "grad_norm": 13.97914158018676, + "learning_rate": 4.999011014584437e-05, + "loss": 2.6244, + "mean_token_accuracy": 0.3896551728248596, + "step": 58545 + }, + { + "epoch": 0.05897218184986317, + "grad_norm": 10.521847327121893, + "learning_rate": 4.999009903384034e-05, + "loss": 2.2097, + "mean_token_accuracy": 0.4724137902259827, + "step": 58550 + }, + { + "epoch": 0.058977217902967345, + "grad_norm": 13.48655769158509, + "learning_rate": 4.9990087915598596e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.38275861740112305, + "step": 58555 + }, + { + "epoch": 0.05898225395607152, + "grad_norm": 12.16582228205211, + "learning_rate": 4.999007679111915e-05, + "loss": 2.3999, + "mean_token_accuracy": 0.3862069010734558, + "step": 58560 + }, + { + "epoch": 0.058987290009175686, + "grad_norm": 15.793279579127145, + "learning_rate": 4.999006566040199e-05, + "loss": 2.723, + "mean_token_accuracy": 0.37931033968925476, + "step": 58565 + }, + { + "epoch": 0.05899232606227986, + "grad_norm": 11.257375885495158, + "learning_rate": 4.999005452344713e-05, + "loss": 2.575, + "mean_token_accuracy": 0.4034482777118683, + "step": 58570 + }, + { + "epoch": 0.05899736211538403, + "grad_norm": 12.876908535610417, + "learning_rate": 4.999004338025456e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.43448275327682495, + "step": 58575 + }, + { + "epoch": 0.05900239816848821, + "grad_norm": 15.801554971622984, + "learning_rate": 4.9990032230824306e-05, + "loss": 2.8597, + "mean_token_accuracy": 0.3517241418361664, + "step": 58580 + }, + { + "epoch": 0.05900743422159238, + "grad_norm": 11.464720521975142, + "learning_rate": 4.999002107515636e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.3931034505367279, + "step": 58585 + }, + { + "epoch": 0.059012470274696555, + "grad_norm": 14.328906988062338, + "learning_rate": 4.999000991325071e-05, + "loss": 2.9102, + "mean_token_accuracy": 0.39310344457626345, + "step": 58590 + }, + { + "epoch": 0.05901750632780073, + "grad_norm": 14.894068611859675, + "learning_rate": 4.9989998745107374e-05, + "loss": 2.685, + "mean_token_accuracy": 0.3551724076271057, + "step": 58595 + }, + { + "epoch": 0.059022542380904895, + "grad_norm": 13.077909713655979, + "learning_rate": 4.998998757072635e-05, + "loss": 2.637, + "mean_token_accuracy": 0.43448276817798615, + "step": 58600 + }, + { + "epoch": 0.05902757843400907, + "grad_norm": 13.600544804941952, + "learning_rate": 4.998997639010766e-05, + "loss": 2.3094, + "mean_token_accuracy": 0.41881427764892576, + "step": 58605 + }, + { + "epoch": 0.05903261448711324, + "grad_norm": 11.293686919184719, + "learning_rate": 4.998996520325128e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.43103447556495667, + "step": 58610 + }, + { + "epoch": 0.059037650540217416, + "grad_norm": 12.04888194881693, + "learning_rate": 4.998995401015723e-05, + "loss": 2.8035, + "mean_token_accuracy": 0.36896551251411436, + "step": 58615 + }, + { + "epoch": 0.05904268659332159, + "grad_norm": 13.571958763294521, + "learning_rate": 4.99899428108255e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.3931034505367279, + "step": 58620 + }, + { + "epoch": 0.059047722646425764, + "grad_norm": 13.126813654059614, + "learning_rate": 4.99899316052561e-05, + "loss": 2.1028, + "mean_token_accuracy": 0.482758617401123, + "step": 58625 + }, + { + "epoch": 0.05905275869952994, + "grad_norm": 10.855866496352922, + "learning_rate": 4.9989920393449035e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.4344827592372894, + "step": 58630 + }, + { + "epoch": 0.059057794752634105, + "grad_norm": 11.483846777040888, + "learning_rate": 4.998990917540431e-05, + "loss": 2.2769, + "mean_token_accuracy": 0.43103448748588563, + "step": 58635 + }, + { + "epoch": 0.05906283080573828, + "grad_norm": 25.1884739134712, + "learning_rate": 4.9989897951121925e-05, + "loss": 3.0654, + "mean_token_accuracy": 0.32758620381355286, + "step": 58640 + }, + { + "epoch": 0.05906786685884245, + "grad_norm": 20.865457539453736, + "learning_rate": 4.998988672060188e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.41379310488700866, + "step": 58645 + }, + { + "epoch": 0.059072902911946626, + "grad_norm": 10.575030327891007, + "learning_rate": 4.998987548384418e-05, + "loss": 2.1478, + "mean_token_accuracy": 0.46551724672317507, + "step": 58650 + }, + { + "epoch": 0.0590779389650508, + "grad_norm": 13.024581534294235, + "learning_rate": 4.998986424084884e-05, + "loss": 2.4822, + "mean_token_accuracy": 0.3965517163276672, + "step": 58655 + }, + { + "epoch": 0.05908297501815497, + "grad_norm": 11.03628367800722, + "learning_rate": 4.998985299161584e-05, + "loss": 2.7691, + "mean_token_accuracy": 0.3517241358757019, + "step": 58660 + }, + { + "epoch": 0.05908801107125915, + "grad_norm": 16.19723759953313, + "learning_rate": 4.9989841736145193e-05, + "loss": 2.6805, + "mean_token_accuracy": 0.36206896901130675, + "step": 58665 + }, + { + "epoch": 0.059093047124363314, + "grad_norm": 13.677810962693576, + "learning_rate": 4.998983047443692e-05, + "loss": 2.7276, + "mean_token_accuracy": 0.3793103456497192, + "step": 58670 + }, + { + "epoch": 0.05909808317746749, + "grad_norm": 12.968755204008977, + "learning_rate": 4.9989819206491e-05, + "loss": 3.2961, + "mean_token_accuracy": 0.31724137514829637, + "step": 58675 + }, + { + "epoch": 0.05910311923057166, + "grad_norm": 10.642179541018857, + "learning_rate": 4.998980793230743e-05, + "loss": 2.6734, + "mean_token_accuracy": 0.41034482717514037, + "step": 58680 + }, + { + "epoch": 0.059108155283675835, + "grad_norm": 11.85518441420973, + "learning_rate": 4.998979665188625e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.4586206912994385, + "step": 58685 + }, + { + "epoch": 0.05911319133678001, + "grad_norm": 13.04737656379371, + "learning_rate": 4.998978536522743e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.4155474901199341, + "step": 58690 + }, + { + "epoch": 0.05911822738988418, + "grad_norm": 8.967105319313283, + "learning_rate": 4.9989774072330985e-05, + "loss": 2.7519, + "mean_token_accuracy": 0.40689656138420105, + "step": 58695 + }, + { + "epoch": 0.05912326344298836, + "grad_norm": 14.133498256694917, + "learning_rate": 4.998976277319692e-05, + "loss": 2.7748, + "mean_token_accuracy": 0.3896551787853241, + "step": 58700 + }, + { + "epoch": 0.05912829949609252, + "grad_norm": 11.674818432265326, + "learning_rate": 4.9989751467825234e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.4724137783050537, + "step": 58705 + }, + { + "epoch": 0.0591333355491967, + "grad_norm": 11.657068020650277, + "learning_rate": 4.998974015621593e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.3808832406997681, + "step": 58710 + }, + { + "epoch": 0.05913837160230087, + "grad_norm": 12.474614161025137, + "learning_rate": 4.9989728838369017e-05, + "loss": 2.344, + "mean_token_accuracy": 0.45517241954803467, + "step": 58715 + }, + { + "epoch": 0.059143407655405045, + "grad_norm": 13.133626532397379, + "learning_rate": 4.998971751428449e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.39655172228813174, + "step": 58720 + }, + { + "epoch": 0.05914844370850922, + "grad_norm": 13.074275921696513, + "learning_rate": 4.998970618396236e-05, + "loss": 2.8272, + "mean_token_accuracy": 0.3931034505367279, + "step": 58725 + }, + { + "epoch": 0.05915347976161339, + "grad_norm": 10.788938455627191, + "learning_rate": 4.998969484740262e-05, + "loss": 2.3963, + "mean_token_accuracy": 0.42413793206214906, + "step": 58730 + }, + { + "epoch": 0.059158515814717566, + "grad_norm": 11.316826019995348, + "learning_rate": 4.998968350460528e-05, + "loss": 2.353, + "mean_token_accuracy": 0.42068964838981626, + "step": 58735 + }, + { + "epoch": 0.05916355186782173, + "grad_norm": 12.39146491039513, + "learning_rate": 4.998967215557036e-05, + "loss": 2.4522, + "mean_token_accuracy": 0.41724138259887694, + "step": 58740 + }, + { + "epoch": 0.05916858792092591, + "grad_norm": 10.612586463996262, + "learning_rate": 4.998966080029782e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.3931034505367279, + "step": 58745 + }, + { + "epoch": 0.05917362397403008, + "grad_norm": 17.499743318498293, + "learning_rate": 4.99896494387877e-05, + "loss": 2.7012, + "mean_token_accuracy": 0.4310344815254211, + "step": 58750 + }, + { + "epoch": 0.059178660027134254, + "grad_norm": 11.600560531866211, + "learning_rate": 4.998963807103999e-05, + "loss": 2.5386, + "mean_token_accuracy": 0.4689655065536499, + "step": 58755 + }, + { + "epoch": 0.05918369608023843, + "grad_norm": 13.13240034680552, + "learning_rate": 4.99896266970547e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.3517241418361664, + "step": 58760 + }, + { + "epoch": 0.0591887321333426, + "grad_norm": 10.103749376060998, + "learning_rate": 4.998961531683182e-05, + "loss": 2.3537, + "mean_token_accuracy": 0.42068964838981626, + "step": 58765 + }, + { + "epoch": 0.059193768186446775, + "grad_norm": 11.161734177668214, + "learning_rate": 4.998960393037136e-05, + "loss": 3.0349, + "mean_token_accuracy": 0.3241379350423813, + "step": 58770 + }, + { + "epoch": 0.05919880423955094, + "grad_norm": 11.848904103937022, + "learning_rate": 4.998959253767334e-05, + "loss": 2.7163, + "mean_token_accuracy": 0.3655172407627106, + "step": 58775 + }, + { + "epoch": 0.059203840292655116, + "grad_norm": 13.130009703028195, + "learning_rate": 4.998958113873774e-05, + "loss": 2.8563, + "mean_token_accuracy": 0.324137932062149, + "step": 58780 + }, + { + "epoch": 0.05920887634575929, + "grad_norm": 11.74073416222131, + "learning_rate": 4.9989569733564565e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.41379311084747317, + "step": 58785 + }, + { + "epoch": 0.059213912398863464, + "grad_norm": 14.925767644174199, + "learning_rate": 4.998955832215382e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.43201970160007475, + "step": 58790 + }, + { + "epoch": 0.05921894845196764, + "grad_norm": 10.30197493523468, + "learning_rate": 4.998954690450553e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.37931033968925476, + "step": 58795 + }, + { + "epoch": 0.05922398450507181, + "grad_norm": 12.764724089239957, + "learning_rate": 4.998953548061966e-05, + "loss": 2.6887, + "mean_token_accuracy": 0.3551724135875702, + "step": 58800 + }, + { + "epoch": 0.059229020558175985, + "grad_norm": 11.109116743531258, + "learning_rate": 4.998952405049625e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.4448275864124298, + "step": 58805 + }, + { + "epoch": 0.05923405661128015, + "grad_norm": 11.743442505319946, + "learning_rate": 4.998951261413528e-05, + "loss": 3.3181, + "mean_token_accuracy": 0.2965517193078995, + "step": 58810 + }, + { + "epoch": 0.059239092664384325, + "grad_norm": 8.93081909462916, + "learning_rate": 4.998950117153676e-05, + "loss": 2.288, + "mean_token_accuracy": 0.4194192349910736, + "step": 58815 + }, + { + "epoch": 0.0592441287174885, + "grad_norm": 12.971720090174436, + "learning_rate": 4.99894897227007e-05, + "loss": 2.3667, + "mean_token_accuracy": 0.41034482419490814, + "step": 58820 + }, + { + "epoch": 0.05924916477059267, + "grad_norm": 13.140478678687847, + "learning_rate": 4.998947826762708e-05, + "loss": 3.114, + "mean_token_accuracy": 0.3862068891525269, + "step": 58825 + }, + { + "epoch": 0.05925420082369685, + "grad_norm": 10.627860052609678, + "learning_rate": 4.9989466806315924e-05, + "loss": 2.4268, + "mean_token_accuracy": 0.3827586233615875, + "step": 58830 + }, + { + "epoch": 0.05925923687680102, + "grad_norm": 11.184528999212581, + "learning_rate": 4.998945533876724e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.43793103098869324, + "step": 58835 + }, + { + "epoch": 0.059264272929905194, + "grad_norm": 11.639119160357138, + "learning_rate": 4.998944386498101e-05, + "loss": 2.5424, + "mean_token_accuracy": 0.4344827651977539, + "step": 58840 + }, + { + "epoch": 0.05926930898300936, + "grad_norm": 8.394337740541758, + "learning_rate": 4.998943238495726e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.409558367729187, + "step": 58845 + }, + { + "epoch": 0.059274345036113535, + "grad_norm": 10.806652685193619, + "learning_rate": 4.9989420898695974e-05, + "loss": 2.6451, + "mean_token_accuracy": 0.37241379618644715, + "step": 58850 + }, + { + "epoch": 0.05927938108921771, + "grad_norm": 11.597558016421797, + "learning_rate": 4.998940940619716e-05, + "loss": 2.4474, + "mean_token_accuracy": 0.43103447556495667, + "step": 58855 + }, + { + "epoch": 0.05928441714232188, + "grad_norm": 10.309277776711358, + "learning_rate": 4.998939790746083e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.42915910482406616, + "step": 58860 + }, + { + "epoch": 0.059289453195426056, + "grad_norm": 10.463224979661796, + "learning_rate": 4.9989386402486985e-05, + "loss": 2.207, + "mean_token_accuracy": 0.48275862336158754, + "step": 58865 + }, + { + "epoch": 0.05929448924853023, + "grad_norm": 11.234273239966175, + "learning_rate": 4.998937489127563e-05, + "loss": 2.4072, + "mean_token_accuracy": 0.4689655065536499, + "step": 58870 + }, + { + "epoch": 0.059299525301634404, + "grad_norm": 11.068259497123158, + "learning_rate": 4.998936337382675e-05, + "loss": 2.2707, + "mean_token_accuracy": 0.48275861144065857, + "step": 58875 + }, + { + "epoch": 0.05930456135473857, + "grad_norm": 11.06848020478526, + "learning_rate": 4.9989351850140364e-05, + "loss": 2.3281, + "mean_token_accuracy": 0.4344827592372894, + "step": 58880 + }, + { + "epoch": 0.059309597407842744, + "grad_norm": 13.723681499008585, + "learning_rate": 4.998934032021648e-05, + "loss": 2.555, + "mean_token_accuracy": 0.4015124022960663, + "step": 58885 + }, + { + "epoch": 0.05931463346094692, + "grad_norm": 11.040905928725397, + "learning_rate": 4.998932878405508e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.41724138855934145, + "step": 58890 + }, + { + "epoch": 0.05931966951405109, + "grad_norm": 14.75788704568464, + "learning_rate": 4.9989317241656195e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.42546883821487425, + "step": 58895 + }, + { + "epoch": 0.059324705567155266, + "grad_norm": 15.88856058941254, + "learning_rate": 4.9989305693019806e-05, + "loss": 2.7051, + "mean_token_accuracy": 0.4517241418361664, + "step": 58900 + }, + { + "epoch": 0.05932974162025944, + "grad_norm": 14.567054702115339, + "learning_rate": 4.998929413814593e-05, + "loss": 2.5808, + "mean_token_accuracy": 0.462431937456131, + "step": 58905 + }, + { + "epoch": 0.05933477767336361, + "grad_norm": 11.032849426141484, + "learning_rate": 4.998928257703456e-05, + "loss": 2.2975, + "mean_token_accuracy": 0.4206896543502808, + "step": 58910 + }, + { + "epoch": 0.05933981372646778, + "grad_norm": 12.105191391268958, + "learning_rate": 4.99892710096857e-05, + "loss": 2.75, + "mean_token_accuracy": 0.34996975064277647, + "step": 58915 + }, + { + "epoch": 0.059344849779571954, + "grad_norm": 12.244677299270796, + "learning_rate": 4.998925943609936e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.39655172228813174, + "step": 58920 + }, + { + "epoch": 0.05934988583267613, + "grad_norm": 15.820009419247432, + "learning_rate": 4.998924785627555e-05, + "loss": 3.0405, + "mean_token_accuracy": 0.36896551847457887, + "step": 58925 + }, + { + "epoch": 0.0593549218857803, + "grad_norm": 13.246164002115812, + "learning_rate": 4.998923627021425e-05, + "loss": 2.4165, + "mean_token_accuracy": 0.41034482717514037, + "step": 58930 + }, + { + "epoch": 0.059359957938884475, + "grad_norm": 10.337498680249992, + "learning_rate": 4.998922467791547e-05, + "loss": 2.794, + "mean_token_accuracy": 0.35862069129943847, + "step": 58935 + }, + { + "epoch": 0.05936499399198865, + "grad_norm": 14.248775080146833, + "learning_rate": 4.998921307937923e-05, + "loss": 2.8217, + "mean_token_accuracy": 0.36551723480224607, + "step": 58940 + }, + { + "epoch": 0.05937003004509282, + "grad_norm": 12.796284043959828, + "learning_rate": 4.998920147460553e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4, + "step": 58945 + }, + { + "epoch": 0.05937506609819699, + "grad_norm": 11.454191912198333, + "learning_rate": 4.9989189863594355e-05, + "loss": 2.7239, + "mean_token_accuracy": 0.3931034505367279, + "step": 58950 + }, + { + "epoch": 0.05938010215130116, + "grad_norm": 12.23214085029897, + "learning_rate": 4.998917824634572e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.441379314661026, + "step": 58955 + }, + { + "epoch": 0.05938513820440534, + "grad_norm": 9.973659262778462, + "learning_rate": 4.998916662285963e-05, + "loss": 2.3519, + "mean_token_accuracy": 0.4534785270690918, + "step": 58960 + }, + { + "epoch": 0.05939017425750951, + "grad_norm": 11.93709296416662, + "learning_rate": 4.998915499313609e-05, + "loss": 2.4867, + "mean_token_accuracy": 0.40689654350280763, + "step": 58965 + }, + { + "epoch": 0.059395210310613684, + "grad_norm": 9.876055243894868, + "learning_rate": 4.9989143357175095e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.3862069010734558, + "step": 58970 + }, + { + "epoch": 0.05940024636371786, + "grad_norm": 11.41139172671979, + "learning_rate": 4.998913171497666e-05, + "loss": 2.6784, + "mean_token_accuracy": 0.39310344457626345, + "step": 58975 + }, + { + "epoch": 0.05940528241682203, + "grad_norm": 11.828379832320408, + "learning_rate": 4.998912006654077e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.4379310429096222, + "step": 58980 + }, + { + "epoch": 0.0594103184699262, + "grad_norm": 13.485698050340023, + "learning_rate": 4.9989108411867434e-05, + "loss": 2.4139, + "mean_token_accuracy": 0.4679975748062134, + "step": 58985 + }, + { + "epoch": 0.05941535452303037, + "grad_norm": 11.069308520352765, + "learning_rate": 4.998909675095667e-05, + "loss": 2.7838, + "mean_token_accuracy": 0.35517241060733795, + "step": 58990 + }, + { + "epoch": 0.059420390576134546, + "grad_norm": 11.984372879863162, + "learning_rate": 4.9989085083808467e-05, + "loss": 2.4866, + "mean_token_accuracy": 0.3965517282485962, + "step": 58995 + }, + { + "epoch": 0.05942542662923872, + "grad_norm": 13.220351423790104, + "learning_rate": 4.998907341042284e-05, + "loss": 2.503, + "mean_token_accuracy": 0.43793103098869324, + "step": 59000 + }, + { + "epoch": 0.059430462682342894, + "grad_norm": 11.247430702188277, + "learning_rate": 4.998906173079977e-05, + "loss": 2.1379, + "mean_token_accuracy": 0.44482758045196535, + "step": 59005 + }, + { + "epoch": 0.05943549873544707, + "grad_norm": 9.37822968939124, + "learning_rate": 4.9989050044939294e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.4379310429096222, + "step": 59010 + }, + { + "epoch": 0.05944053478855124, + "grad_norm": 10.850501384633876, + "learning_rate": 4.998903835284138e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.3965517282485962, + "step": 59015 + }, + { + "epoch": 0.05944557084165541, + "grad_norm": 9.866232552324306, + "learning_rate": 4.9989026654506055e-05, + "loss": 2.8135, + "mean_token_accuracy": 0.4068965494632721, + "step": 59020 + }, + { + "epoch": 0.05945060689475958, + "grad_norm": 14.181364156638187, + "learning_rate": 4.998901494993331e-05, + "loss": 2.8982, + "mean_token_accuracy": 0.337931028008461, + "step": 59025 + }, + { + "epoch": 0.059455642947863756, + "grad_norm": 13.415891054843744, + "learning_rate": 4.998900323912316e-05, + "loss": 2.7652, + "mean_token_accuracy": 0.37586206793785093, + "step": 59030 + }, + { + "epoch": 0.05946067900096793, + "grad_norm": 11.956808598122114, + "learning_rate": 4.998899152207559e-05, + "loss": 2.83, + "mean_token_accuracy": 0.36206896901130675, + "step": 59035 + }, + { + "epoch": 0.0594657150540721, + "grad_norm": 11.500521746110556, + "learning_rate": 4.998897979879063e-05, + "loss": 2.1805, + "mean_token_accuracy": 0.4517241299152374, + "step": 59040 + }, + { + "epoch": 0.05947075110717628, + "grad_norm": 12.949250173088183, + "learning_rate": 4.998896806926825e-05, + "loss": 2.3127, + "mean_token_accuracy": 0.43448275327682495, + "step": 59045 + }, + { + "epoch": 0.05947578716028045, + "grad_norm": 14.719307788093833, + "learning_rate": 4.998895633350848e-05, + "loss": 2.6667, + "mean_token_accuracy": 0.42068966031074523, + "step": 59050 + }, + { + "epoch": 0.05948082321338462, + "grad_norm": 13.919527423465116, + "learning_rate": 4.998894459151131e-05, + "loss": 2.5504, + "mean_token_accuracy": 0.38275861740112305, + "step": 59055 + }, + { + "epoch": 0.05948585926648879, + "grad_norm": 12.331897373957322, + "learning_rate": 4.9988932843276755e-05, + "loss": 2.3281, + "mean_token_accuracy": 0.4379310369491577, + "step": 59060 + }, + { + "epoch": 0.059490895319592965, + "grad_norm": 12.866995286202128, + "learning_rate": 4.99889210888048e-05, + "loss": 2.8571, + "mean_token_accuracy": 0.36896551251411436, + "step": 59065 + }, + { + "epoch": 0.05949593137269714, + "grad_norm": 11.681512910371756, + "learning_rate": 4.998890932809547e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.4, + "step": 59070 + }, + { + "epoch": 0.05950096742580131, + "grad_norm": 9.361534693454695, + "learning_rate": 4.9988897561148753e-05, + "loss": 2.2687, + "mean_token_accuracy": 0.45366000533103945, + "step": 59075 + }, + { + "epoch": 0.059506003478905486, + "grad_norm": 12.491551574861692, + "learning_rate": 4.998888578796465e-05, + "loss": 2.2045, + "mean_token_accuracy": 0.47241379618644713, + "step": 59080 + }, + { + "epoch": 0.05951103953200966, + "grad_norm": 13.831472514405807, + "learning_rate": 4.998887400854318e-05, + "loss": 2.5779, + "mean_token_accuracy": 0.41034482717514037, + "step": 59085 + }, + { + "epoch": 0.05951607558511383, + "grad_norm": 12.575829861660889, + "learning_rate": 4.9988862222884334e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.42413792610168455, + "step": 59090 + }, + { + "epoch": 0.059521111638218, + "grad_norm": 11.649071540014495, + "learning_rate": 4.998885043098811e-05, + "loss": 2.6822, + "mean_token_accuracy": 0.4275861978530884, + "step": 59095 + }, + { + "epoch": 0.059526147691322175, + "grad_norm": 10.069215234913727, + "learning_rate": 4.998883863285453e-05, + "loss": 2.6944, + "mean_token_accuracy": 0.398064124584198, + "step": 59100 + }, + { + "epoch": 0.05953118374442635, + "grad_norm": 13.129619059399054, + "learning_rate": 4.998882682848358e-05, + "loss": 2.612, + "mean_token_accuracy": 0.43623715043067934, + "step": 59105 + }, + { + "epoch": 0.05953621979753052, + "grad_norm": 12.093226529297628, + "learning_rate": 4.998881501787528e-05, + "loss": 2.5563, + "mean_token_accuracy": 0.3793103456497192, + "step": 59110 + }, + { + "epoch": 0.059541255850634696, + "grad_norm": 13.509121380601837, + "learning_rate": 4.998880320102961e-05, + "loss": 2.1198, + "mean_token_accuracy": 0.4379310250282288, + "step": 59115 + }, + { + "epoch": 0.05954629190373887, + "grad_norm": 12.451777149945938, + "learning_rate": 4.9988791377946596e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.4551724135875702, + "step": 59120 + }, + { + "epoch": 0.059551327956843036, + "grad_norm": 11.11355618957618, + "learning_rate": 4.998877954862622e-05, + "loss": 2.567, + "mean_token_accuracy": 0.40344828367233276, + "step": 59125 + }, + { + "epoch": 0.05955636400994721, + "grad_norm": 12.93909149387017, + "learning_rate": 4.9988767713068515e-05, + "loss": 2.5374, + "mean_token_accuracy": 0.35862069129943847, + "step": 59130 + }, + { + "epoch": 0.059561400063051384, + "grad_norm": 12.987545314469871, + "learning_rate": 4.998875587127346e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.36896551251411436, + "step": 59135 + }, + { + "epoch": 0.05956643611615556, + "grad_norm": 11.218906518217826, + "learning_rate": 4.9988744023241065e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.39310343861579894, + "step": 59140 + }, + { + "epoch": 0.05957147216925973, + "grad_norm": 10.566109615957771, + "learning_rate": 4.998873216897132e-05, + "loss": 2.5067, + "mean_token_accuracy": 0.3827586233615875, + "step": 59145 + }, + { + "epoch": 0.059576508222363905, + "grad_norm": 8.851009062316985, + "learning_rate": 4.998872030846425e-05, + "loss": 2.4225, + "mean_token_accuracy": 0.471082878112793, + "step": 59150 + }, + { + "epoch": 0.05958154427546808, + "grad_norm": 16.534884292833965, + "learning_rate": 4.998870844171985e-05, + "loss": 2.7906, + "mean_token_accuracy": 0.4082512348890305, + "step": 59155 + }, + { + "epoch": 0.059586580328572246, + "grad_norm": 12.053098994529302, + "learning_rate": 4.9988696568738125e-05, + "loss": 2.9011, + "mean_token_accuracy": 0.3862068891525269, + "step": 59160 + }, + { + "epoch": 0.05959161638167642, + "grad_norm": 9.493123152225449, + "learning_rate": 4.998868468951908e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.41724138855934145, + "step": 59165 + }, + { + "epoch": 0.05959665243478059, + "grad_norm": 11.565401476227244, + "learning_rate": 4.998867280406271e-05, + "loss": 2.168, + "mean_token_accuracy": 0.43103447556495667, + "step": 59170 + }, + { + "epoch": 0.05960168848788477, + "grad_norm": 13.477905766948895, + "learning_rate": 4.998866091236902e-05, + "loss": 2.2617, + "mean_token_accuracy": 0.458620685338974, + "step": 59175 + }, + { + "epoch": 0.05960672454098894, + "grad_norm": 11.043773767713418, + "learning_rate": 4.9988649014438014e-05, + "loss": 2.6218, + "mean_token_accuracy": 0.3827586233615875, + "step": 59180 + }, + { + "epoch": 0.059611760594093115, + "grad_norm": 10.945185808575705, + "learning_rate": 4.9988637110269696e-05, + "loss": 2.613, + "mean_token_accuracy": 0.4344827651977539, + "step": 59185 + }, + { + "epoch": 0.05961679664719729, + "grad_norm": 12.134704144445134, + "learning_rate": 4.998862519986408e-05, + "loss": 2.537, + "mean_token_accuracy": 0.4137930989265442, + "step": 59190 + }, + { + "epoch": 0.059621832700301455, + "grad_norm": 11.120474640907574, + "learning_rate": 4.998861328322115e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.4448275864124298, + "step": 59195 + }, + { + "epoch": 0.05962686875340563, + "grad_norm": 12.757172908089746, + "learning_rate": 4.998860136034093e-05, + "loss": 2.9531, + "mean_token_accuracy": 0.334482753276825, + "step": 59200 + }, + { + "epoch": 0.0596319048065098, + "grad_norm": 12.826090039123786, + "learning_rate": 4.9988589431223404e-05, + "loss": 2.7802, + "mean_token_accuracy": 0.3827586233615875, + "step": 59205 + }, + { + "epoch": 0.05963694085961398, + "grad_norm": 15.39407109151909, + "learning_rate": 4.9988577495868586e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.39310344457626345, + "step": 59210 + }, + { + "epoch": 0.05964197691271815, + "grad_norm": 13.059895387948467, + "learning_rate": 4.998856555427648e-05, + "loss": 2.6999, + "mean_token_accuracy": 0.36551723480224607, + "step": 59215 + }, + { + "epoch": 0.059647012965822324, + "grad_norm": 10.062127209586782, + "learning_rate": 4.9988553606447085e-05, + "loss": 2.7815, + "mean_token_accuracy": 0.3379310339689255, + "step": 59220 + }, + { + "epoch": 0.0596520490189265, + "grad_norm": 12.271155622465557, + "learning_rate": 4.998854165238041e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.3999999940395355, + "step": 59225 + }, + { + "epoch": 0.059657085072030665, + "grad_norm": 11.936558622545698, + "learning_rate": 4.9988529692076445e-05, + "loss": 2.303, + "mean_token_accuracy": 0.42413793206214906, + "step": 59230 + }, + { + "epoch": 0.05966212112513484, + "grad_norm": 13.517049830487577, + "learning_rate": 4.99885177255352e-05, + "loss": 2.9089, + "mean_token_accuracy": 0.37241379618644715, + "step": 59235 + }, + { + "epoch": 0.05966715717823901, + "grad_norm": 11.049540743935365, + "learning_rate": 4.99885057527567e-05, + "loss": 2.8311, + "mean_token_accuracy": 0.36896551847457887, + "step": 59240 + }, + { + "epoch": 0.059672193231343186, + "grad_norm": 11.453550879188398, + "learning_rate": 4.9988493773740906e-05, + "loss": 2.2979, + "mean_token_accuracy": 0.441379314661026, + "step": 59245 + }, + { + "epoch": 0.05967722928444736, + "grad_norm": 10.921660403326115, + "learning_rate": 4.998848178848786e-05, + "loss": 2.614, + "mean_token_accuracy": 0.4034482777118683, + "step": 59250 + }, + { + "epoch": 0.059682265337551534, + "grad_norm": 11.108212582161558, + "learning_rate": 4.998846979699755e-05, + "loss": 3.0907, + "mean_token_accuracy": 0.3482758641242981, + "step": 59255 + }, + { + "epoch": 0.05968730139065571, + "grad_norm": 11.748110530126153, + "learning_rate": 4.9988457799269965e-05, + "loss": 2.4208, + "mean_token_accuracy": 0.4257108271121979, + "step": 59260 + }, + { + "epoch": 0.059692337443759874, + "grad_norm": 10.912580708692309, + "learning_rate": 4.9988445795305133e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.44827585816383364, + "step": 59265 + }, + { + "epoch": 0.05969737349686405, + "grad_norm": 13.800260412086224, + "learning_rate": 4.998843378510304e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.3862069010734558, + "step": 59270 + }, + { + "epoch": 0.05970240954996822, + "grad_norm": 14.446100492260674, + "learning_rate": 4.9988421768663705e-05, + "loss": 2.7125, + "mean_token_accuracy": 0.3517241358757019, + "step": 59275 + }, + { + "epoch": 0.059707445603072395, + "grad_norm": 10.16095486423684, + "learning_rate": 4.9988409745987116e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.4284482777118683, + "step": 59280 + }, + { + "epoch": 0.05971248165617657, + "grad_norm": 14.02129291494595, + "learning_rate": 4.998839771707329e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.4813067138195038, + "step": 59285 + }, + { + "epoch": 0.05971751770928074, + "grad_norm": 12.391065431821701, + "learning_rate": 4.9988385681922214e-05, + "loss": 2.7404, + "mean_token_accuracy": 0.3517241358757019, + "step": 59290 + }, + { + "epoch": 0.05972255376238492, + "grad_norm": 12.877249695923679, + "learning_rate": 4.998837364053391e-05, + "loss": 2.712, + "mean_token_accuracy": 0.37241379022598264, + "step": 59295 + }, + { + "epoch": 0.059727589815489084, + "grad_norm": 12.090281693100781, + "learning_rate": 4.998836159290836e-05, + "loss": 2.5805, + "mean_token_accuracy": 0.41379310488700866, + "step": 59300 + }, + { + "epoch": 0.05973262586859326, + "grad_norm": 11.646682585206323, + "learning_rate": 4.9988349539045585e-05, + "loss": 2.439, + "mean_token_accuracy": 0.38965516686439516, + "step": 59305 + }, + { + "epoch": 0.05973766192169743, + "grad_norm": 11.084913954681372, + "learning_rate": 4.998833747894558e-05, + "loss": 2.3275, + "mean_token_accuracy": 0.41379310488700866, + "step": 59310 + }, + { + "epoch": 0.059742697974801605, + "grad_norm": 10.549612511443312, + "learning_rate": 4.9988325412608355e-05, + "loss": 2.3279, + "mean_token_accuracy": 0.4344827473163605, + "step": 59315 + }, + { + "epoch": 0.05974773402790578, + "grad_norm": 11.13607974083221, + "learning_rate": 4.99883133400339e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.4344827592372894, + "step": 59320 + }, + { + "epoch": 0.05975277008100995, + "grad_norm": 11.657041666816866, + "learning_rate": 4.9988301261222244e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.4172413766384125, + "step": 59325 + }, + { + "epoch": 0.059757806134114126, + "grad_norm": 12.983451479916209, + "learning_rate": 4.998828917617336e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.47241379618644713, + "step": 59330 + }, + { + "epoch": 0.05976284218721829, + "grad_norm": 12.21424864363209, + "learning_rate": 4.998827708488727e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.3896551728248596, + "step": 59335 + }, + { + "epoch": 0.05976787824032247, + "grad_norm": 11.457072119560102, + "learning_rate": 4.998826498736397e-05, + "loss": 3.3909, + "mean_token_accuracy": 0.3689655214548111, + "step": 59340 + }, + { + "epoch": 0.05977291429342664, + "grad_norm": 9.971550609495056, + "learning_rate": 4.998825288360347e-05, + "loss": 2.556, + "mean_token_accuracy": 0.3827586233615875, + "step": 59345 + }, + { + "epoch": 0.059777950346530814, + "grad_norm": 10.560008599451335, + "learning_rate": 4.998824077360577e-05, + "loss": 2.4283, + "mean_token_accuracy": 0.45517240166664125, + "step": 59350 + }, + { + "epoch": 0.05978298639963499, + "grad_norm": 11.614668656238733, + "learning_rate": 4.998822865737086e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.4206896543502808, + "step": 59355 + }, + { + "epoch": 0.05978802245273916, + "grad_norm": 17.717088870462884, + "learning_rate": 4.9988216534898766e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.4191167622804642, + "step": 59360 + }, + { + "epoch": 0.059793058505843336, + "grad_norm": 11.410887643105244, + "learning_rate": 4.9988204406189474e-05, + "loss": 2.6089, + "mean_token_accuracy": 0.36896551847457887, + "step": 59365 + }, + { + "epoch": 0.0597980945589475, + "grad_norm": 14.251915393865902, + "learning_rate": 4.998819227124301e-05, + "loss": 2.5267, + "mean_token_accuracy": 0.42068964838981626, + "step": 59370 + }, + { + "epoch": 0.059803130612051676, + "grad_norm": 11.63581847261669, + "learning_rate": 4.998818013005936e-05, + "loss": 2.4243, + "mean_token_accuracy": 0.36551723480224607, + "step": 59375 + }, + { + "epoch": 0.05980816666515585, + "grad_norm": 12.923047952378496, + "learning_rate": 4.998816798263851e-05, + "loss": 2.8772, + "mean_token_accuracy": 0.3517241388559341, + "step": 59380 + }, + { + "epoch": 0.059813202718260024, + "grad_norm": 10.343788155452241, + "learning_rate": 4.99881558289805e-05, + "loss": 2.5149, + "mean_token_accuracy": 0.4055656373500824, + "step": 59385 + }, + { + "epoch": 0.0598182387713642, + "grad_norm": 12.49373128049217, + "learning_rate": 4.9988143669085305e-05, + "loss": 2.8911, + "mean_token_accuracy": 0.341379314661026, + "step": 59390 + }, + { + "epoch": 0.05982327482446837, + "grad_norm": 14.094836569974099, + "learning_rate": 4.998813150295295e-05, + "loss": 2.6158, + "mean_token_accuracy": 0.3999999940395355, + "step": 59395 + }, + { + "epoch": 0.059828310877572545, + "grad_norm": 10.713253083037117, + "learning_rate": 4.9988119330583416e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.4, + "step": 59400 + }, + { + "epoch": 0.05983334693067671, + "grad_norm": 12.83443578572033, + "learning_rate": 4.9988107151976734e-05, + "loss": 2.781, + "mean_token_accuracy": 0.36206896901130675, + "step": 59405 + }, + { + "epoch": 0.059838382983780886, + "grad_norm": 16.660705207425867, + "learning_rate": 4.998809496713288e-05, + "loss": 2.4138, + "mean_token_accuracy": 0.3945553541183472, + "step": 59410 + }, + { + "epoch": 0.05984341903688506, + "grad_norm": 10.606487049228424, + "learning_rate": 4.998808277605187e-05, + "loss": 2.6592, + "mean_token_accuracy": 0.3758620709180832, + "step": 59415 + }, + { + "epoch": 0.05984845508998923, + "grad_norm": 12.620624992318984, + "learning_rate": 4.998807057873371e-05, + "loss": 2.8247, + "mean_token_accuracy": 0.3655172407627106, + "step": 59420 + }, + { + "epoch": 0.05985349114309341, + "grad_norm": 13.185249411779646, + "learning_rate": 4.998805837517839e-05, + "loss": 2.9325, + "mean_token_accuracy": 0.2944343626499176, + "step": 59425 + }, + { + "epoch": 0.05985852719619758, + "grad_norm": 11.126089390281564, + "learning_rate": 4.9988046165385936e-05, + "loss": 2.6194, + "mean_token_accuracy": 0.42068964838981626, + "step": 59430 + }, + { + "epoch": 0.059863563249301754, + "grad_norm": 12.025941591309918, + "learning_rate": 4.998803394935634e-05, + "loss": 2.2773, + "mean_token_accuracy": 0.4348457396030426, + "step": 59435 + }, + { + "epoch": 0.05986859930240592, + "grad_norm": 14.000797982976875, + "learning_rate": 4.998802172708959e-05, + "loss": 2.8515, + "mean_token_accuracy": 0.35862069129943847, + "step": 59440 + }, + { + "epoch": 0.059873635355510095, + "grad_norm": 13.233763103786222, + "learning_rate": 4.998800949858571e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.39310344159603117, + "step": 59445 + }, + { + "epoch": 0.05987867140861427, + "grad_norm": 8.877295375036313, + "learning_rate": 4.99879972638447e-05, + "loss": 2.514, + "mean_token_accuracy": 0.4172413766384125, + "step": 59450 + }, + { + "epoch": 0.05988370746171844, + "grad_norm": 11.806624309802467, + "learning_rate": 4.9987985022866565e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.4344827592372894, + "step": 59455 + }, + { + "epoch": 0.059888743514822616, + "grad_norm": 15.883750006449572, + "learning_rate": 4.9987972775651285e-05, + "loss": 2.97, + "mean_token_accuracy": 0.341379314661026, + "step": 59460 + }, + { + "epoch": 0.05989377956792679, + "grad_norm": 12.344167519307211, + "learning_rate": 4.9987960522198896e-05, + "loss": 2.6452, + "mean_token_accuracy": 0.31724137961864474, + "step": 59465 + }, + { + "epoch": 0.059898815621030964, + "grad_norm": 9.142959610492534, + "learning_rate": 4.998794826250939e-05, + "loss": 2.1563, + "mean_token_accuracy": 0.43793103098869324, + "step": 59470 + }, + { + "epoch": 0.05990385167413513, + "grad_norm": 12.735447448286457, + "learning_rate": 4.998793599658276e-05, + "loss": 3.3874, + "mean_token_accuracy": 0.33103448152542114, + "step": 59475 + }, + { + "epoch": 0.059908887727239304, + "grad_norm": 11.979825693716485, + "learning_rate": 4.998792372441902e-05, + "loss": 2.5736, + "mean_token_accuracy": 0.4068965554237366, + "step": 59480 + }, + { + "epoch": 0.05991392378034348, + "grad_norm": 13.062132417989382, + "learning_rate": 4.998791144601817e-05, + "loss": 2.9357, + "mean_token_accuracy": 0.3551724135875702, + "step": 59485 + }, + { + "epoch": 0.05991895983344765, + "grad_norm": 13.019736661118591, + "learning_rate": 4.998789916138021e-05, + "loss": 2.7705, + "mean_token_accuracy": 0.39800362586975097, + "step": 59490 + }, + { + "epoch": 0.059923995886551826, + "grad_norm": 9.867417397999207, + "learning_rate": 4.998788687050515e-05, + "loss": 2.7306, + "mean_token_accuracy": 0.44954627752304077, + "step": 59495 + }, + { + "epoch": 0.059929031939656, + "grad_norm": 12.536633716292986, + "learning_rate": 4.9987874573392996e-05, + "loss": 2.4962, + "mean_token_accuracy": 0.3931034505367279, + "step": 59500 + }, + { + "epoch": 0.05993406799276017, + "grad_norm": 12.695959644494598, + "learning_rate": 4.998786227004373e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.44482758045196535, + "step": 59505 + }, + { + "epoch": 0.05993910404586434, + "grad_norm": 10.85802627210156, + "learning_rate": 4.998784996045739e-05, + "loss": 2.1494, + "mean_token_accuracy": 0.475862056016922, + "step": 59510 + }, + { + "epoch": 0.059944140098968514, + "grad_norm": 11.654137978600769, + "learning_rate": 4.998783764463395e-05, + "loss": 2.1496, + "mean_token_accuracy": 0.42758620977401735, + "step": 59515 + }, + { + "epoch": 0.05994917615207269, + "grad_norm": 11.12386974931033, + "learning_rate": 4.998782532257343e-05, + "loss": 2.1136, + "mean_token_accuracy": 0.45716878175735476, + "step": 59520 + }, + { + "epoch": 0.05995421220517686, + "grad_norm": 16.392503275748204, + "learning_rate": 4.998781299427583e-05, + "loss": 2.6689, + "mean_token_accuracy": 0.3862069010734558, + "step": 59525 + }, + { + "epoch": 0.059959248258281035, + "grad_norm": 11.379326196915587, + "learning_rate": 4.9987800659741145e-05, + "loss": 2.2285, + "mean_token_accuracy": 0.45517241954803467, + "step": 59530 + }, + { + "epoch": 0.05996428431138521, + "grad_norm": 26.328991251545858, + "learning_rate": 4.998778831896939e-05, + "loss": 3.0549, + "mean_token_accuracy": 0.3206896513700485, + "step": 59535 + }, + { + "epoch": 0.05996932036448938, + "grad_norm": 11.743714546901664, + "learning_rate": 4.998777597196056e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.41379310488700866, + "step": 59540 + }, + { + "epoch": 0.05997435641759355, + "grad_norm": 13.119055653771602, + "learning_rate": 4.998776361871465e-05, + "loss": 2.6844, + "mean_token_accuracy": 0.41724138259887694, + "step": 59545 + }, + { + "epoch": 0.05997939247069772, + "grad_norm": 11.189087024210414, + "learning_rate": 4.998775125923169e-05, + "loss": 2.2738, + "mean_token_accuracy": 0.4482758641242981, + "step": 59550 + }, + { + "epoch": 0.0599844285238019, + "grad_norm": 10.742250809119756, + "learning_rate": 4.9987738893511664e-05, + "loss": 2.6016, + "mean_token_accuracy": 0.36206896901130675, + "step": 59555 + }, + { + "epoch": 0.05998946457690607, + "grad_norm": 10.333434906491423, + "learning_rate": 4.998772652155458e-05, + "loss": 2.044, + "mean_token_accuracy": 0.4776164650917053, + "step": 59560 + }, + { + "epoch": 0.059994500630010245, + "grad_norm": 14.350533203427775, + "learning_rate": 4.9987714143360436e-05, + "loss": 2.8602, + "mean_token_accuracy": 0.4034482777118683, + "step": 59565 + }, + { + "epoch": 0.05999953668311442, + "grad_norm": 12.550634542419782, + "learning_rate": 4.998770175892925e-05, + "loss": 2.408, + "mean_token_accuracy": 0.3965517163276672, + "step": 59570 + }, + { + "epoch": 0.06000457273621859, + "grad_norm": 15.459495366372062, + "learning_rate": 4.998768936826102e-05, + "loss": 2.9993, + "mean_token_accuracy": 0.358620685338974, + "step": 59575 + }, + { + "epoch": 0.06000960878932276, + "grad_norm": 14.553089608833684, + "learning_rate": 4.998767697135573e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.5068965435028077, + "step": 59580 + }, + { + "epoch": 0.06001464484242693, + "grad_norm": 13.07238082776562, + "learning_rate": 4.99876645682134e-05, + "loss": 2.3192, + "mean_token_accuracy": 0.4431337058544159, + "step": 59585 + }, + { + "epoch": 0.060019680895531106, + "grad_norm": 12.545785352888924, + "learning_rate": 4.9987652158834044e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.39897156953811647, + "step": 59590 + }, + { + "epoch": 0.06002471694863528, + "grad_norm": 12.962010170514223, + "learning_rate": 4.9987639743217644e-05, + "loss": 2.9514, + "mean_token_accuracy": 0.3517241358757019, + "step": 59595 + }, + { + "epoch": 0.060029753001739454, + "grad_norm": 15.15373373828142, + "learning_rate": 4.998762732136422e-05, + "loss": 2.5135, + "mean_token_accuracy": 0.41379310488700866, + "step": 59600 + }, + { + "epoch": 0.06003478905484363, + "grad_norm": 11.435366842002603, + "learning_rate": 4.9987614893273764e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.4068965494632721, + "step": 59605 + }, + { + "epoch": 0.0600398251079478, + "grad_norm": 9.093383276254944, + "learning_rate": 4.9987602458946284e-05, + "loss": 2.1739, + "mean_token_accuracy": 0.47126436829566953, + "step": 59610 + }, + { + "epoch": 0.06004486116105197, + "grad_norm": 13.400935093259095, + "learning_rate": 4.998759001838179e-05, + "loss": 2.6708, + "mean_token_accuracy": 0.34137930870056155, + "step": 59615 + }, + { + "epoch": 0.06004989721415614, + "grad_norm": 12.972030144694294, + "learning_rate": 4.998757757158027e-05, + "loss": 2.7125, + "mean_token_accuracy": 0.41379310488700866, + "step": 59620 + }, + { + "epoch": 0.060054933267260316, + "grad_norm": 14.487095415789456, + "learning_rate": 4.998756511854175e-05, + "loss": 2.8431, + "mean_token_accuracy": 0.3482758581638336, + "step": 59625 + }, + { + "epoch": 0.06005996932036449, + "grad_norm": 20.125952498857895, + "learning_rate": 4.9987552659266205e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.4000000059604645, + "step": 59630 + }, + { + "epoch": 0.06006500537346866, + "grad_norm": 13.314513484836636, + "learning_rate": 4.998754019375366e-05, + "loss": 2.3828, + "mean_token_accuracy": 0.38620689511299133, + "step": 59635 + }, + { + "epoch": 0.06007004142657284, + "grad_norm": 11.785292901370154, + "learning_rate": 4.998752772200411e-05, + "loss": 2.4689, + "mean_token_accuracy": 0.43103448748588563, + "step": 59640 + }, + { + "epoch": 0.06007507747967701, + "grad_norm": 14.120998984492328, + "learning_rate": 4.9987515244017564e-05, + "loss": 2.6114, + "mean_token_accuracy": 0.37241379022598264, + "step": 59645 + }, + { + "epoch": 0.06008011353278118, + "grad_norm": 13.584983502600508, + "learning_rate": 4.9987502759794016e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.43103448748588563, + "step": 59650 + }, + { + "epoch": 0.06008514958588535, + "grad_norm": 11.1294115075235, + "learning_rate": 4.998749026933348e-05, + "loss": 2.4503, + "mean_token_accuracy": 0.40490018129348754, + "step": 59655 + }, + { + "epoch": 0.060090185638989525, + "grad_norm": 15.49364027934721, + "learning_rate": 4.9987477772635956e-05, + "loss": 2.2914, + "mean_token_accuracy": 0.45517241954803467, + "step": 59660 + }, + { + "epoch": 0.0600952216920937, + "grad_norm": 12.152018785689359, + "learning_rate": 4.998746526970144e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.3999999910593033, + "step": 59665 + }, + { + "epoch": 0.06010025774519787, + "grad_norm": 12.977575433875804, + "learning_rate": 4.998745276052995e-05, + "loss": 2.4053, + "mean_token_accuracy": 0.4344827592372894, + "step": 59670 + }, + { + "epoch": 0.06010529379830205, + "grad_norm": 12.122288629443316, + "learning_rate": 4.998744024512147e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.42413793206214906, + "step": 59675 + }, + { + "epoch": 0.06011032985140622, + "grad_norm": 11.11940454693258, + "learning_rate": 4.998742772347603e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.37586206793785093, + "step": 59680 + }, + { + "epoch": 0.06011536590451039, + "grad_norm": 14.151276432393722, + "learning_rate": 4.99874151955936e-05, + "loss": 2.4643, + "mean_token_accuracy": 0.4103448331356049, + "step": 59685 + }, + { + "epoch": 0.06012040195761456, + "grad_norm": 11.130885793871853, + "learning_rate": 4.9987402661474206e-05, + "loss": 2.7559, + "mean_token_accuracy": 0.4103448212146759, + "step": 59690 + }, + { + "epoch": 0.060125438010718735, + "grad_norm": 12.525075176265739, + "learning_rate": 4.998739012111785e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.43103448748588563, + "step": 59695 + }, + { + "epoch": 0.06013047406382291, + "grad_norm": 12.392339569503312, + "learning_rate": 4.9987377574524544e-05, + "loss": 2.7337, + "mean_token_accuracy": 0.4034482717514038, + "step": 59700 + }, + { + "epoch": 0.06013551011692708, + "grad_norm": 10.784317121496919, + "learning_rate": 4.998736502169427e-05, + "loss": 2.3148, + "mean_token_accuracy": 0.4241379380226135, + "step": 59705 + }, + { + "epoch": 0.060140546170031256, + "grad_norm": 11.10753607184445, + "learning_rate": 4.998735246262704e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.4068965494632721, + "step": 59710 + }, + { + "epoch": 0.06014558222313543, + "grad_norm": 12.124979964809748, + "learning_rate": 4.998733989732286e-05, + "loss": 2.5379, + "mean_token_accuracy": 0.42934059500694277, + "step": 59715 + }, + { + "epoch": 0.0601506182762396, + "grad_norm": 18.34314546619084, + "learning_rate": 4.998732732578173e-05, + "loss": 2.8352, + "mean_token_accuracy": 0.3565638244152069, + "step": 59720 + }, + { + "epoch": 0.06015565432934377, + "grad_norm": 11.672721160694284, + "learning_rate": 4.9987314748003655e-05, + "loss": 2.6699, + "mean_token_accuracy": 0.42758620977401735, + "step": 59725 + }, + { + "epoch": 0.060160690382447944, + "grad_norm": 14.416140762131478, + "learning_rate": 4.998730216398865e-05, + "loss": 3.0213, + "mean_token_accuracy": 0.33448276221752166, + "step": 59730 + }, + { + "epoch": 0.06016572643555212, + "grad_norm": 11.746530634189202, + "learning_rate": 4.9987289573736694e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.42758620381355283, + "step": 59735 + }, + { + "epoch": 0.06017076248865629, + "grad_norm": 11.895933316095848, + "learning_rate": 4.998727697724781e-05, + "loss": 2.638, + "mean_token_accuracy": 0.37241379022598264, + "step": 59740 + }, + { + "epoch": 0.060175798541760465, + "grad_norm": 15.502253553471702, + "learning_rate": 4.9987264374522004e-05, + "loss": 2.4929, + "mean_token_accuracy": 0.4137930989265442, + "step": 59745 + }, + { + "epoch": 0.06018083459486464, + "grad_norm": 12.755146662967555, + "learning_rate": 4.9987251765559255e-05, + "loss": 2.6315, + "mean_token_accuracy": 0.3655172437429428, + "step": 59750 + }, + { + "epoch": 0.060185870647968806, + "grad_norm": 12.660283827053775, + "learning_rate": 4.99872391503596e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.5034482717514038, + "step": 59755 + }, + { + "epoch": 0.06019090670107298, + "grad_norm": 11.730197487064284, + "learning_rate": 4.9987226528923015e-05, + "loss": 2.6482, + "mean_token_accuracy": 0.4103448331356049, + "step": 59760 + }, + { + "epoch": 0.060195942754177154, + "grad_norm": 12.52773561348265, + "learning_rate": 4.998721390124951e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.3965517282485962, + "step": 59765 + }, + { + "epoch": 0.06020097880728133, + "grad_norm": 17.613043304394427, + "learning_rate": 4.99872012673391e-05, + "loss": 3.0406, + "mean_token_accuracy": 0.3724137991666794, + "step": 59770 + }, + { + "epoch": 0.0602060148603855, + "grad_norm": 14.670522293326249, + "learning_rate": 4.998718862719178e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.41724138259887694, + "step": 59775 + }, + { + "epoch": 0.060211050913489675, + "grad_norm": 12.757640893468812, + "learning_rate": 4.998717598080756e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.4363581418991089, + "step": 59780 + }, + { + "epoch": 0.06021608696659385, + "grad_norm": 14.013815541766238, + "learning_rate": 4.9987163328186426e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.42413792610168455, + "step": 59785 + }, + { + "epoch": 0.060221123019698015, + "grad_norm": 10.166078086108945, + "learning_rate": 4.99871506693284e-05, + "loss": 2.4605, + "mean_token_accuracy": 0.4068965554237366, + "step": 59790 + }, + { + "epoch": 0.06022615907280219, + "grad_norm": 12.971790277309642, + "learning_rate": 4.998713800423348e-05, + "loss": 2.6372, + "mean_token_accuracy": 0.3999999940395355, + "step": 59795 + }, + { + "epoch": 0.06023119512590636, + "grad_norm": 10.596786219396497, + "learning_rate": 4.9987125332901664e-05, + "loss": 2.3203, + "mean_token_accuracy": 0.4230490028858185, + "step": 59800 + }, + { + "epoch": 0.06023623117901054, + "grad_norm": 11.690829737780179, + "learning_rate": 4.998711265533296e-05, + "loss": 2.7754, + "mean_token_accuracy": 0.3310344755649567, + "step": 59805 + }, + { + "epoch": 0.06024126723211471, + "grad_norm": 11.50016808744197, + "learning_rate": 4.9987099971527375e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.39310344457626345, + "step": 59810 + }, + { + "epoch": 0.060246303285218884, + "grad_norm": 10.879362455649536, + "learning_rate": 4.9987087281484904e-05, + "loss": 2.5458, + "mean_token_accuracy": 0.3931034505367279, + "step": 59815 + }, + { + "epoch": 0.06025133933832306, + "grad_norm": 14.771462504508076, + "learning_rate": 4.998707458520556e-05, + "loss": 2.6046, + "mean_token_accuracy": 0.4137930989265442, + "step": 59820 + }, + { + "epoch": 0.060256375391427225, + "grad_norm": 11.848969213101583, + "learning_rate": 4.998706188268934e-05, + "loss": 2.7071, + "mean_token_accuracy": 0.36896551251411436, + "step": 59825 + }, + { + "epoch": 0.0602614114445314, + "grad_norm": 10.907298145077508, + "learning_rate": 4.998704917393625e-05, + "loss": 2.3334, + "mean_token_accuracy": 0.4241379380226135, + "step": 59830 + }, + { + "epoch": 0.06026644749763557, + "grad_norm": 18.027190068014384, + "learning_rate": 4.9987036458946296e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.441379314661026, + "step": 59835 + }, + { + "epoch": 0.060271483550739746, + "grad_norm": 10.418753322740224, + "learning_rate": 4.998702373771948e-05, + "loss": 2.9406, + "mean_token_accuracy": 0.39655172228813174, + "step": 59840 + }, + { + "epoch": 0.06027651960384392, + "grad_norm": 10.525568920771065, + "learning_rate": 4.99870110102558e-05, + "loss": 2.1265, + "mean_token_accuracy": 0.4896551728248596, + "step": 59845 + }, + { + "epoch": 0.060281555656948094, + "grad_norm": 10.658760513044863, + "learning_rate": 4.9986998276555257e-05, + "loss": 2.84, + "mean_token_accuracy": 0.3827586233615875, + "step": 59850 + }, + { + "epoch": 0.06028659171005227, + "grad_norm": 11.348452341551063, + "learning_rate": 4.998698553661786e-05, + "loss": 2.6081, + "mean_token_accuracy": 0.3844525098800659, + "step": 59855 + }, + { + "epoch": 0.060291627763156434, + "grad_norm": 16.848964426182967, + "learning_rate": 4.998697279044363e-05, + "loss": 2.5082, + "mean_token_accuracy": 0.39310344457626345, + "step": 59860 + }, + { + "epoch": 0.06029666381626061, + "grad_norm": 10.504821671303686, + "learning_rate": 4.998696003803254e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.3793103456497192, + "step": 59865 + }, + { + "epoch": 0.06030169986936478, + "grad_norm": 12.233948083948667, + "learning_rate": 4.998694727938462e-05, + "loss": 2.3754, + "mean_token_accuracy": 0.46551724672317507, + "step": 59870 + }, + { + "epoch": 0.060306735922468956, + "grad_norm": 11.48442236794374, + "learning_rate": 4.9986934514499854e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.44295220971107485, + "step": 59875 + }, + { + "epoch": 0.06031177197557313, + "grad_norm": 15.487155269098027, + "learning_rate": 4.998692174337826e-05, + "loss": 2.9844, + "mean_token_accuracy": 0.3827586233615875, + "step": 59880 + }, + { + "epoch": 0.0603168080286773, + "grad_norm": 11.275179843203274, + "learning_rate": 4.998690896601982e-05, + "loss": 2.7116, + "mean_token_accuracy": 0.4, + "step": 59885 + }, + { + "epoch": 0.06032184408178148, + "grad_norm": 14.19080205594785, + "learning_rate": 4.998689618242456e-05, + "loss": 2.6233, + "mean_token_accuracy": 0.3758620619773865, + "step": 59890 + }, + { + "epoch": 0.060326880134885644, + "grad_norm": 12.957617768697727, + "learning_rate": 4.9986883392592474e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.4344827651977539, + "step": 59895 + }, + { + "epoch": 0.06033191618798982, + "grad_norm": 9.967806201063379, + "learning_rate": 4.998687059652357e-05, + "loss": 2.5508, + "mean_token_accuracy": 0.3896551787853241, + "step": 59900 + }, + { + "epoch": 0.06033695224109399, + "grad_norm": 13.926913038634073, + "learning_rate": 4.9986857794217844e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.39310344457626345, + "step": 59905 + }, + { + "epoch": 0.060341988294198165, + "grad_norm": 15.341317250958774, + "learning_rate": 4.998684498567531e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.40544464290142057, + "step": 59910 + }, + { + "epoch": 0.06034702434730234, + "grad_norm": 10.336816873410708, + "learning_rate": 4.998683217089596e-05, + "loss": 2.3364, + "mean_token_accuracy": 0.44110060334205625, + "step": 59915 + }, + { + "epoch": 0.06035206040040651, + "grad_norm": 14.068893860348433, + "learning_rate": 4.9986819349879806e-05, + "loss": 2.6563, + "mean_token_accuracy": 0.38620689511299133, + "step": 59920 + }, + { + "epoch": 0.060357096453510686, + "grad_norm": 12.236963696234776, + "learning_rate": 4.998680652262685e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.4413793087005615, + "step": 59925 + }, + { + "epoch": 0.06036213250661485, + "grad_norm": 10.288734821083162, + "learning_rate": 4.998679368913709e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.4620689570903778, + "step": 59930 + }, + { + "epoch": 0.06036716855971903, + "grad_norm": 12.441265787756322, + "learning_rate": 4.9986780849410534e-05, + "loss": 2.6126, + "mean_token_accuracy": 0.37931033968925476, + "step": 59935 + }, + { + "epoch": 0.0603722046128232, + "grad_norm": 10.879870699239097, + "learning_rate": 4.9986768003447187e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.42915910482406616, + "step": 59940 + }, + { + "epoch": 0.060377240665927374, + "grad_norm": 14.206694797349522, + "learning_rate": 4.9986755151247054e-05, + "loss": 2.5902, + "mean_token_accuracy": 0.41034482717514037, + "step": 59945 + }, + { + "epoch": 0.06038227671903155, + "grad_norm": 12.869044612626936, + "learning_rate": 4.998674229281014e-05, + "loss": 2.4545, + "mean_token_accuracy": 0.4, + "step": 59950 + }, + { + "epoch": 0.06038731277213572, + "grad_norm": 10.429904428146598, + "learning_rate": 4.998672942813643e-05, + "loss": 2.6385, + "mean_token_accuracy": 0.4379310369491577, + "step": 59955 + }, + { + "epoch": 0.060392348825239896, + "grad_norm": 11.98122458392187, + "learning_rate": 4.998671655722595e-05, + "loss": 2.8358, + "mean_token_accuracy": 0.3931034505367279, + "step": 59960 + }, + { + "epoch": 0.06039738487834406, + "grad_norm": 10.411633204926014, + "learning_rate": 4.9986703680078696e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.43103447556495667, + "step": 59965 + }, + { + "epoch": 0.060402420931448236, + "grad_norm": 12.818387006602684, + "learning_rate": 4.9986690796694667e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.44827585816383364, + "step": 59970 + }, + { + "epoch": 0.06040745698455241, + "grad_norm": 11.67885513876231, + "learning_rate": 4.998667790707387e-05, + "loss": 2.6997, + "mean_token_accuracy": 0.4034482777118683, + "step": 59975 + }, + { + "epoch": 0.060412493037656584, + "grad_norm": 16.688956989975097, + "learning_rate": 4.998666501121631e-05, + "loss": 3.2698, + "mean_token_accuracy": 0.3310344755649567, + "step": 59980 + }, + { + "epoch": 0.06041752909076076, + "grad_norm": 12.06363331229713, + "learning_rate": 4.9986652109122e-05, + "loss": 2.5187, + "mean_token_accuracy": 0.42068964838981626, + "step": 59985 + }, + { + "epoch": 0.06042256514386493, + "grad_norm": 14.86786945248949, + "learning_rate": 4.998663920079092e-05, + "loss": 2.9713, + "mean_token_accuracy": 0.3827586114406586, + "step": 59990 + }, + { + "epoch": 0.060427601196969105, + "grad_norm": 11.984329780165446, + "learning_rate": 4.998662628622309e-05, + "loss": 2.612, + "mean_token_accuracy": 0.4517241358757019, + "step": 59995 + }, + { + "epoch": 0.06043263725007327, + "grad_norm": 12.580185968798194, + "learning_rate": 4.998661336541851e-05, + "loss": 2.981, + "mean_token_accuracy": 0.3620689570903778, + "step": 60000 + }, + { + "epoch": 0.060437673303177446, + "grad_norm": 15.529730774839786, + "learning_rate": 4.998660043837719e-05, + "loss": 2.6903, + "mean_token_accuracy": 0.3827586233615875, + "step": 60005 + }, + { + "epoch": 0.06044270935628162, + "grad_norm": 12.360154325550063, + "learning_rate": 4.998658750509913e-05, + "loss": 2.5538, + "mean_token_accuracy": 0.4221415638923645, + "step": 60010 + }, + { + "epoch": 0.06044774540938579, + "grad_norm": 10.574775660906774, + "learning_rate": 4.998657456558432e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.41034482717514037, + "step": 60015 + }, + { + "epoch": 0.06045278146248997, + "grad_norm": 8.96101463114581, + "learning_rate": 4.998656161983278e-05, + "loss": 2.3431, + "mean_token_accuracy": 0.45741077661514284, + "step": 60020 + }, + { + "epoch": 0.06045781751559414, + "grad_norm": 11.930530223985759, + "learning_rate": 4.998654866784451e-05, + "loss": 2.5322, + "mean_token_accuracy": 0.43448275327682495, + "step": 60025 + }, + { + "epoch": 0.060462853568698315, + "grad_norm": 10.778106719263068, + "learning_rate": 4.9986535709619506e-05, + "loss": 2.6913, + "mean_token_accuracy": 0.39310344159603117, + "step": 60030 + }, + { + "epoch": 0.06046788962180248, + "grad_norm": 9.993204034145037, + "learning_rate": 4.998652274515778e-05, + "loss": 2.3516, + "mean_token_accuracy": 0.42758620977401735, + "step": 60035 + }, + { + "epoch": 0.060472925674906655, + "grad_norm": 12.269064043754108, + "learning_rate": 4.9986509774459325e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.4, + "step": 60040 + }, + { + "epoch": 0.06047796172801083, + "grad_norm": 9.82140621958726, + "learning_rate": 4.9986496797524165e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.417241370677948, + "step": 60045 + }, + { + "epoch": 0.060482997781115, + "grad_norm": 11.787171908430809, + "learning_rate": 4.998648381435229e-05, + "loss": 2.6083, + "mean_token_accuracy": 0.4034482777118683, + "step": 60050 + }, + { + "epoch": 0.060488033834219176, + "grad_norm": 12.684875947742814, + "learning_rate": 4.99864708249437e-05, + "loss": 2.537, + "mean_token_accuracy": 0.4206896543502808, + "step": 60055 + }, + { + "epoch": 0.06049306988732335, + "grad_norm": 12.088832710673248, + "learning_rate": 4.9986457829298404e-05, + "loss": 3.0092, + "mean_token_accuracy": 0.3275862097740173, + "step": 60060 + }, + { + "epoch": 0.060498105940427524, + "grad_norm": 14.581697280791245, + "learning_rate": 4.998644482741641e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.4068965554237366, + "step": 60065 + }, + { + "epoch": 0.06050314199353169, + "grad_norm": 11.586149085424024, + "learning_rate": 4.9986431819297705e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.3896551728248596, + "step": 60070 + }, + { + "epoch": 0.060508178046635865, + "grad_norm": 11.718066779780797, + "learning_rate": 4.998641880494232e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.4517241418361664, + "step": 60075 + }, + { + "epoch": 0.06051321409974004, + "grad_norm": 11.820044481061363, + "learning_rate": 4.998640578435023e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.46896552443504336, + "step": 60080 + }, + { + "epoch": 0.06051825015284421, + "grad_norm": 13.244326264345723, + "learning_rate": 4.998639275752146e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.4344827592372894, + "step": 60085 + }, + { + "epoch": 0.060523286205948386, + "grad_norm": 14.350680706776217, + "learning_rate": 4.9986379724456e-05, + "loss": 2.4969, + "mean_token_accuracy": 0.4241379380226135, + "step": 60090 + }, + { + "epoch": 0.06052832225905256, + "grad_norm": 10.040245267785558, + "learning_rate": 4.9986366685153865e-05, + "loss": 2.7277, + "mean_token_accuracy": 0.4, + "step": 60095 + }, + { + "epoch": 0.06053335831215673, + "grad_norm": 12.280614296063147, + "learning_rate": 4.998635363961505e-05, + "loss": 2.5743, + "mean_token_accuracy": 0.38620689511299133, + "step": 60100 + }, + { + "epoch": 0.0605383943652609, + "grad_norm": 12.334375084236823, + "learning_rate": 4.998634058783955e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.3965517163276672, + "step": 60105 + }, + { + "epoch": 0.060543430418365074, + "grad_norm": 12.741120290193614, + "learning_rate": 4.9986327529827396e-05, + "loss": 2.7242, + "mean_token_accuracy": 0.3655172407627106, + "step": 60110 + }, + { + "epoch": 0.06054846647146925, + "grad_norm": 13.062818067139155, + "learning_rate": 4.998631446557856e-05, + "loss": 2.684, + "mean_token_accuracy": 0.3724137842655182, + "step": 60115 + }, + { + "epoch": 0.06055350252457342, + "grad_norm": 13.82010109560419, + "learning_rate": 4.998630139509307e-05, + "loss": 2.5226, + "mean_token_accuracy": 0.4363581359386444, + "step": 60120 + }, + { + "epoch": 0.060558538577677595, + "grad_norm": 10.64260053966043, + "learning_rate": 4.998628831837092e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.3862069010734558, + "step": 60125 + }, + { + "epoch": 0.06056357463078177, + "grad_norm": 11.216303083397731, + "learning_rate": 4.998627523541211e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.4068965554237366, + "step": 60130 + }, + { + "epoch": 0.06056861068388594, + "grad_norm": 10.923251052414535, + "learning_rate": 4.998626214621666e-05, + "loss": 2.7107, + "mean_token_accuracy": 0.3827586233615875, + "step": 60135 + }, + { + "epoch": 0.06057364673699011, + "grad_norm": 28.46770741694154, + "learning_rate": 4.9986249050784545e-05, + "loss": 2.972, + "mean_token_accuracy": 0.38759830594062805, + "step": 60140 + }, + { + "epoch": 0.06057868279009428, + "grad_norm": 11.801732654688525, + "learning_rate": 4.9986235949115794e-05, + "loss": 2.7585, + "mean_token_accuracy": 0.3620689630508423, + "step": 60145 + }, + { + "epoch": 0.06058371884319846, + "grad_norm": 11.786285933769491, + "learning_rate": 4.9986222841210395e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.46049606800079346, + "step": 60150 + }, + { + "epoch": 0.06058875489630263, + "grad_norm": 24.889343474540897, + "learning_rate": 4.998620972706836e-05, + "loss": 2.9434, + "mean_token_accuracy": 0.34640048146247865, + "step": 60155 + }, + { + "epoch": 0.060593790949406805, + "grad_norm": 12.101806703842897, + "learning_rate": 4.99861966066897e-05, + "loss": 2.853, + "mean_token_accuracy": 0.36551723480224607, + "step": 60160 + }, + { + "epoch": 0.06059882700251098, + "grad_norm": 11.007724488234762, + "learning_rate": 4.998618348007441e-05, + "loss": 2.2967, + "mean_token_accuracy": 0.4862069070339203, + "step": 60165 + }, + { + "epoch": 0.06060386305561515, + "grad_norm": 10.705870518944355, + "learning_rate": 4.998617034722248e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.48275862336158754, + "step": 60170 + }, + { + "epoch": 0.06060889910871932, + "grad_norm": 12.08846234051216, + "learning_rate": 4.9986157208133934e-05, + "loss": 2.3347, + "mean_token_accuracy": 0.4482758641242981, + "step": 60175 + }, + { + "epoch": 0.06061393516182349, + "grad_norm": 12.276982540208824, + "learning_rate": 4.9986144062808766e-05, + "loss": 3.1694, + "mean_token_accuracy": 0.3482758581638336, + "step": 60180 + }, + { + "epoch": 0.06061897121492767, + "grad_norm": 11.411186142119309, + "learning_rate": 4.9986130911246984e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.42068966031074523, + "step": 60185 + }, + { + "epoch": 0.06062400726803184, + "grad_norm": 10.937005503246166, + "learning_rate": 4.998611775344858e-05, + "loss": 2.4895, + "mean_token_accuracy": 0.39655172228813174, + "step": 60190 + }, + { + "epoch": 0.060629043321136014, + "grad_norm": 12.61097424752664, + "learning_rate": 4.998610458941358e-05, + "loss": 2.6122, + "mean_token_accuracy": 0.4068965494632721, + "step": 60195 + }, + { + "epoch": 0.06063407937424019, + "grad_norm": 11.564774950674384, + "learning_rate": 4.998609141914197e-05, + "loss": 3.028, + "mean_token_accuracy": 0.324137932062149, + "step": 60200 + }, + { + "epoch": 0.06063911542734436, + "grad_norm": 12.018986871849423, + "learning_rate": 4.998607824263376e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.3655172407627106, + "step": 60205 + }, + { + "epoch": 0.06064415148044853, + "grad_norm": 11.369298761463416, + "learning_rate": 4.998606505988896e-05, + "loss": 2.1714, + "mean_token_accuracy": 0.4781004309654236, + "step": 60210 + }, + { + "epoch": 0.0606491875335527, + "grad_norm": 10.687775661142522, + "learning_rate": 4.998605187090755e-05, + "loss": 2.7691, + "mean_token_accuracy": 0.3758620619773865, + "step": 60215 + }, + { + "epoch": 0.060654223586656876, + "grad_norm": 17.628866667855135, + "learning_rate": 4.998603867568956e-05, + "loss": 2.7761, + "mean_token_accuracy": 0.3655172407627106, + "step": 60220 + }, + { + "epoch": 0.06065925963976105, + "grad_norm": 10.446900215293864, + "learning_rate": 4.9986025474234984e-05, + "loss": 2.3303, + "mean_token_accuracy": 0.41379310488700866, + "step": 60225 + }, + { + "epoch": 0.060664295692865224, + "grad_norm": 13.925298463867044, + "learning_rate": 4.998601226654382e-05, + "loss": 2.8146, + "mean_token_accuracy": 0.3551724076271057, + "step": 60230 + }, + { + "epoch": 0.0606693317459694, + "grad_norm": 13.651459381222505, + "learning_rate": 4.998599905261608e-05, + "loss": 2.8703, + "mean_token_accuracy": 0.37586207389831544, + "step": 60235 + }, + { + "epoch": 0.06067436779907357, + "grad_norm": 12.524136696148837, + "learning_rate": 4.998598583245176e-05, + "loss": 2.7203, + "mean_token_accuracy": 0.3896551787853241, + "step": 60240 + }, + { + "epoch": 0.06067940385217774, + "grad_norm": 11.7518991894446, + "learning_rate": 4.998597260605088e-05, + "loss": 2.5573, + "mean_token_accuracy": 0.39310344457626345, + "step": 60245 + }, + { + "epoch": 0.06068443990528191, + "grad_norm": 10.30108572371737, + "learning_rate": 4.998595937341342e-05, + "loss": 2.2024, + "mean_token_accuracy": 0.4896551787853241, + "step": 60250 + }, + { + "epoch": 0.060689475958386085, + "grad_norm": 16.99647508997087, + "learning_rate": 4.998594613453939e-05, + "loss": 2.5761, + "mean_token_accuracy": 0.36551723480224607, + "step": 60255 + }, + { + "epoch": 0.06069451201149026, + "grad_norm": 12.451184333997794, + "learning_rate": 4.998593288942881e-05, + "loss": 2.6463, + "mean_token_accuracy": 0.43103447556495667, + "step": 60260 + }, + { + "epoch": 0.06069954806459443, + "grad_norm": 11.329699917084962, + "learning_rate": 4.9985919638081666e-05, + "loss": 2.8007, + "mean_token_accuracy": 0.38070176243782045, + "step": 60265 + }, + { + "epoch": 0.06070458411769861, + "grad_norm": 12.135995239458376, + "learning_rate": 4.998590638049797e-05, + "loss": 2.207, + "mean_token_accuracy": 0.4655172348022461, + "step": 60270 + }, + { + "epoch": 0.06070962017080278, + "grad_norm": 11.286267948409566, + "learning_rate": 4.998589311667773e-05, + "loss": 2.3756, + "mean_token_accuracy": 0.42413793206214906, + "step": 60275 + }, + { + "epoch": 0.06071465622390695, + "grad_norm": 11.414847165245911, + "learning_rate": 4.998587984662094e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.4793103337287903, + "step": 60280 + }, + { + "epoch": 0.06071969227701112, + "grad_norm": 14.536249151442135, + "learning_rate": 4.9985866570327604e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.3068965464830399, + "step": 60285 + }, + { + "epoch": 0.060724728330115295, + "grad_norm": 13.07535618216538, + "learning_rate": 4.9985853287797736e-05, + "loss": 2.8498, + "mean_token_accuracy": 0.3986085891723633, + "step": 60290 + }, + { + "epoch": 0.06072976438321947, + "grad_norm": 10.869326955763011, + "learning_rate": 4.998583999903133e-05, + "loss": 2.18, + "mean_token_accuracy": 0.45396249294281005, + "step": 60295 + }, + { + "epoch": 0.06073480043632364, + "grad_norm": 11.741379002789953, + "learning_rate": 4.998582670402839e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.4275862157344818, + "step": 60300 + }, + { + "epoch": 0.060739836489427816, + "grad_norm": 10.981064595351743, + "learning_rate": 4.9985813402788914e-05, + "loss": 2.478, + "mean_token_accuracy": 0.4448275864124298, + "step": 60305 + }, + { + "epoch": 0.06074487254253199, + "grad_norm": 14.230856898344285, + "learning_rate": 4.998580009531293e-05, + "loss": 2.6615, + "mean_token_accuracy": 0.4, + "step": 60310 + }, + { + "epoch": 0.06074990859563616, + "grad_norm": 12.479934422119927, + "learning_rate": 4.998578678160041e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.41034482717514037, + "step": 60315 + }, + { + "epoch": 0.06075494464874033, + "grad_norm": 10.677533580463997, + "learning_rate": 4.9985773461651386e-05, + "loss": 2.443, + "mean_token_accuracy": 0.44482758045196535, + "step": 60320 + }, + { + "epoch": 0.060759980701844504, + "grad_norm": 12.637577450779567, + "learning_rate": 4.9985760135465846e-05, + "loss": 2.436, + "mean_token_accuracy": 0.43448275327682495, + "step": 60325 + }, + { + "epoch": 0.06076501675494868, + "grad_norm": 10.81078548379524, + "learning_rate": 4.9985746803043786e-05, + "loss": 2.3427, + "mean_token_accuracy": 0.3827586203813553, + "step": 60330 + }, + { + "epoch": 0.06077005280805285, + "grad_norm": 11.68373646296189, + "learning_rate": 4.998573346438524e-05, + "loss": 2.1599, + "mean_token_accuracy": 0.4517241418361664, + "step": 60335 + }, + { + "epoch": 0.060775088861157026, + "grad_norm": 12.653157064650847, + "learning_rate": 4.998572011949018e-05, + "loss": 2.7148, + "mean_token_accuracy": 0.40344826579093934, + "step": 60340 + }, + { + "epoch": 0.0607801249142612, + "grad_norm": 13.949734212715072, + "learning_rate": 4.998570676835862e-05, + "loss": 2.6103, + "mean_token_accuracy": 0.3655172407627106, + "step": 60345 + }, + { + "epoch": 0.060785160967365366, + "grad_norm": 15.034014465596217, + "learning_rate": 4.998569341099056e-05, + "loss": 2.6412, + "mean_token_accuracy": 0.4413793087005615, + "step": 60350 + }, + { + "epoch": 0.06079019702046954, + "grad_norm": 12.86815841928514, + "learning_rate": 4.998568004738602e-05, + "loss": 3.0264, + "mean_token_accuracy": 0.3551724195480347, + "step": 60355 + }, + { + "epoch": 0.060795233073573714, + "grad_norm": 12.862917287364654, + "learning_rate": 4.9985666677544994e-05, + "loss": 2.7915, + "mean_token_accuracy": 0.4034482717514038, + "step": 60360 + }, + { + "epoch": 0.06080026912667789, + "grad_norm": 12.81333125531648, + "learning_rate": 4.998565330146748e-05, + "loss": 2.6665, + "mean_token_accuracy": 0.3655172407627106, + "step": 60365 + }, + { + "epoch": 0.06080530517978206, + "grad_norm": 11.228291208523077, + "learning_rate": 4.9985639919153484e-05, + "loss": 2.028, + "mean_token_accuracy": 0.47241379618644713, + "step": 60370 + }, + { + "epoch": 0.060810341232886235, + "grad_norm": 9.111681455818363, + "learning_rate": 4.9985626530603016e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.4574107676744461, + "step": 60375 + }, + { + "epoch": 0.06081537728599041, + "grad_norm": 11.525290645037707, + "learning_rate": 4.9985613135816075e-05, + "loss": 2.5915, + "mean_token_accuracy": 0.38275861740112305, + "step": 60380 + }, + { + "epoch": 0.060820413339094576, + "grad_norm": 14.753366910506818, + "learning_rate": 4.998559973479266e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.3551724135875702, + "step": 60385 + }, + { + "epoch": 0.06082544939219875, + "grad_norm": 11.988090048112577, + "learning_rate": 4.9985586327532794e-05, + "loss": 2.4103, + "mean_token_accuracy": 0.5057471334934235, + "step": 60390 + }, + { + "epoch": 0.06083048544530292, + "grad_norm": 11.051442766305067, + "learning_rate": 4.9985572914036446e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.4294010877609253, + "step": 60395 + }, + { + "epoch": 0.0608355214984071, + "grad_norm": 11.824832207162391, + "learning_rate": 4.998555949430365e-05, + "loss": 3.1633, + "mean_token_accuracy": 0.35172414481639863, + "step": 60400 + }, + { + "epoch": 0.06084055755151127, + "grad_norm": 19.420752650911407, + "learning_rate": 4.998554606833441e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.3896551728248596, + "step": 60405 + }, + { + "epoch": 0.060845593604615444, + "grad_norm": 13.053557668476259, + "learning_rate": 4.998553263612871e-05, + "loss": 2.6707, + "mean_token_accuracy": 0.31034481823444365, + "step": 60410 + }, + { + "epoch": 0.06085062965771962, + "grad_norm": 11.689821421754035, + "learning_rate": 4.998551919768656e-05, + "loss": 2.4285, + "mean_token_accuracy": 0.39122806787490844, + "step": 60415 + }, + { + "epoch": 0.060855665710823785, + "grad_norm": 13.695400171664188, + "learning_rate": 4.998550575300798e-05, + "loss": 2.9181, + "mean_token_accuracy": 0.38620689511299133, + "step": 60420 + }, + { + "epoch": 0.06086070176392796, + "grad_norm": 13.170333591299064, + "learning_rate": 4.998549230209295e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.3655172407627106, + "step": 60425 + }, + { + "epoch": 0.06086573781703213, + "grad_norm": 17.380961133057244, + "learning_rate": 4.998547884494148e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.4310344815254211, + "step": 60430 + }, + { + "epoch": 0.060870773870136306, + "grad_norm": 9.877438870099555, + "learning_rate": 4.998546538155359e-05, + "loss": 2.1051, + "mean_token_accuracy": 0.5334543228149414, + "step": 60435 + }, + { + "epoch": 0.06087580992324048, + "grad_norm": 11.476739883362733, + "learning_rate": 4.998545191192927e-05, + "loss": 3.0133, + "mean_token_accuracy": 0.32413793802261354, + "step": 60440 + }, + { + "epoch": 0.060880845976344654, + "grad_norm": 13.418254303902149, + "learning_rate": 4.998543843606853e-05, + "loss": 2.4849, + "mean_token_accuracy": 0.35172414481639863, + "step": 60445 + }, + { + "epoch": 0.06088588202944882, + "grad_norm": 11.520542978845263, + "learning_rate": 4.998542495397136e-05, + "loss": 2.3928, + "mean_token_accuracy": 0.41034482419490814, + "step": 60450 + }, + { + "epoch": 0.060890918082552994, + "grad_norm": 13.561369119546196, + "learning_rate": 4.998541146563778e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.3965517282485962, + "step": 60455 + }, + { + "epoch": 0.06089595413565717, + "grad_norm": 11.238565536828915, + "learning_rate": 4.998539797106778e-05, + "loss": 2.6553, + "mean_token_accuracy": 0.4413793087005615, + "step": 60460 + }, + { + "epoch": 0.06090099018876134, + "grad_norm": 13.100258335234958, + "learning_rate": 4.998538447026138e-05, + "loss": 2.7412, + "mean_token_accuracy": 0.37241379618644715, + "step": 60465 + }, + { + "epoch": 0.060906026241865516, + "grad_norm": 11.084291485640083, + "learning_rate": 4.9985370963218575e-05, + "loss": 3.0759, + "mean_token_accuracy": 0.34137930274009703, + "step": 60470 + }, + { + "epoch": 0.06091106229496969, + "grad_norm": 12.976846685434413, + "learning_rate": 4.998535744993936e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.4068965554237366, + "step": 60475 + }, + { + "epoch": 0.06091609834807386, + "grad_norm": 11.412004920752665, + "learning_rate": 4.9985343930423756e-05, + "loss": 2.2269, + "mean_token_accuracy": 0.44482759237289426, + "step": 60480 + }, + { + "epoch": 0.06092113440117803, + "grad_norm": 17.03733666817087, + "learning_rate": 4.998533040467175e-05, + "loss": 2.3482, + "mean_token_accuracy": 0.42758620977401735, + "step": 60485 + }, + { + "epoch": 0.060926170454282204, + "grad_norm": 12.677766350963694, + "learning_rate": 4.998531687268336e-05, + "loss": 2.9805, + "mean_token_accuracy": 0.31379310190677645, + "step": 60490 + }, + { + "epoch": 0.06093120650738638, + "grad_norm": 10.642242725827334, + "learning_rate": 4.998530333445858e-05, + "loss": 2.662, + "mean_token_accuracy": 0.3793103456497192, + "step": 60495 + }, + { + "epoch": 0.06093624256049055, + "grad_norm": 11.043577991225295, + "learning_rate": 4.998528978999742e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.3655172407627106, + "step": 60500 + }, + { + "epoch": 0.060941278613594725, + "grad_norm": 11.167720525855971, + "learning_rate": 4.998527623929988e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.4586206912994385, + "step": 60505 + }, + { + "epoch": 0.0609463146666989, + "grad_norm": 12.521070552822009, + "learning_rate": 4.998526268236596e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.4137930989265442, + "step": 60510 + }, + { + "epoch": 0.06095135071980307, + "grad_norm": 10.416724015822673, + "learning_rate": 4.9985249119195674e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.47241379618644713, + "step": 60515 + }, + { + "epoch": 0.06095638677290724, + "grad_norm": 14.473165870052979, + "learning_rate": 4.998523554978902e-05, + "loss": 2.5636, + "mean_token_accuracy": 0.4, + "step": 60520 + }, + { + "epoch": 0.06096142282601141, + "grad_norm": 12.749256885024645, + "learning_rate": 4.9985221974146004e-05, + "loss": 2.8357, + "mean_token_accuracy": 0.38275861740112305, + "step": 60525 + }, + { + "epoch": 0.06096645887911559, + "grad_norm": 13.803196445154077, + "learning_rate": 4.998520839226662e-05, + "loss": 3.0427, + "mean_token_accuracy": 0.35862069129943847, + "step": 60530 + }, + { + "epoch": 0.06097149493221976, + "grad_norm": 9.377474000977855, + "learning_rate": 4.998519480415089e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.47241379618644713, + "step": 60535 + }, + { + "epoch": 0.060976530985323935, + "grad_norm": 10.02835186439114, + "learning_rate": 4.99851812097988e-05, + "loss": 3.2793, + "mean_token_accuracy": 0.3137931048870087, + "step": 60540 + }, + { + "epoch": 0.06098156703842811, + "grad_norm": 11.205267450980456, + "learning_rate": 4.998516760921036e-05, + "loss": 2.5558, + "mean_token_accuracy": 0.4034482717514038, + "step": 60545 + }, + { + "epoch": 0.06098660309153228, + "grad_norm": 12.091668180846625, + "learning_rate": 4.998515400238558e-05, + "loss": 2.7795, + "mean_token_accuracy": 0.36551723480224607, + "step": 60550 + }, + { + "epoch": 0.06099163914463645, + "grad_norm": 11.662943833008136, + "learning_rate": 4.998514038932446e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.35862069129943847, + "step": 60555 + }, + { + "epoch": 0.06099667519774062, + "grad_norm": 11.842173556365761, + "learning_rate": 4.9985126770027e-05, + "loss": 2.4756, + "mean_token_accuracy": 0.44482759237289426, + "step": 60560 + }, + { + "epoch": 0.061001711250844796, + "grad_norm": 12.62525317520617, + "learning_rate": 4.99851131444932e-05, + "loss": 2.651, + "mean_token_accuracy": 0.3827586233615875, + "step": 60565 + }, + { + "epoch": 0.06100674730394897, + "grad_norm": 11.575389337232481, + "learning_rate": 4.998509951272308e-05, + "loss": 2.848, + "mean_token_accuracy": 0.34137930274009703, + "step": 60570 + }, + { + "epoch": 0.061011783357053144, + "grad_norm": 11.513188914590241, + "learning_rate": 4.998508587471662e-05, + "loss": 2.905, + "mean_token_accuracy": 0.37241379618644715, + "step": 60575 + }, + { + "epoch": 0.06101681941015732, + "grad_norm": 14.516846633896915, + "learning_rate": 4.998507223047386e-05, + "loss": 2.7737, + "mean_token_accuracy": 0.3551724076271057, + "step": 60580 + }, + { + "epoch": 0.06102185546326149, + "grad_norm": 11.786578284217796, + "learning_rate": 4.998505857999476e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.4291590988636017, + "step": 60585 + }, + { + "epoch": 0.06102689151636566, + "grad_norm": 12.067663404063822, + "learning_rate": 4.9985044923279354e-05, + "loss": 2.6296, + "mean_token_accuracy": 0.4379310369491577, + "step": 60590 + }, + { + "epoch": 0.06103192756946983, + "grad_norm": 10.738264213435809, + "learning_rate": 4.9985031260327636e-05, + "loss": 2.3867, + "mean_token_accuracy": 0.4172413766384125, + "step": 60595 + }, + { + "epoch": 0.061036963622574006, + "grad_norm": 12.952086503885884, + "learning_rate": 4.998501759113961e-05, + "loss": 3.0961, + "mean_token_accuracy": 0.3448275923728943, + "step": 60600 + }, + { + "epoch": 0.06104199967567818, + "grad_norm": 10.884299905413041, + "learning_rate": 4.9985003915715276e-05, + "loss": 2.264, + "mean_token_accuracy": 0.4034482717514038, + "step": 60605 + }, + { + "epoch": 0.06104703572878235, + "grad_norm": 11.46011344216903, + "learning_rate": 4.9984990234054654e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4620689690113068, + "step": 60610 + }, + { + "epoch": 0.06105207178188653, + "grad_norm": 16.747914550529572, + "learning_rate": 4.998497654615773e-05, + "loss": 3.0367, + "mean_token_accuracy": 0.3655172437429428, + "step": 60615 + }, + { + "epoch": 0.0610571078349907, + "grad_norm": 12.088291194497128, + "learning_rate": 4.9984962852024505e-05, + "loss": 2.5426, + "mean_token_accuracy": 0.3655172407627106, + "step": 60620 + }, + { + "epoch": 0.06106214388809487, + "grad_norm": 14.280081083323502, + "learning_rate": 4.9984949151655005e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.4724137902259827, + "step": 60625 + }, + { + "epoch": 0.06106717994119904, + "grad_norm": 13.259700781870801, + "learning_rate": 4.998493544504922e-05, + "loss": 2.5638, + "mean_token_accuracy": 0.39310344457626345, + "step": 60630 + }, + { + "epoch": 0.061072215994303215, + "grad_norm": 12.51917726089093, + "learning_rate": 4.998492173220715e-05, + "loss": 2.4645, + "mean_token_accuracy": 0.39310343861579894, + "step": 60635 + }, + { + "epoch": 0.06107725204740739, + "grad_norm": 11.935221135321953, + "learning_rate": 4.99849080131288e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.4103448152542114, + "step": 60640 + }, + { + "epoch": 0.06108228810051156, + "grad_norm": 10.784701545140745, + "learning_rate": 4.998489428781418e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.4330308556556702, + "step": 60645 + }, + { + "epoch": 0.06108732415361574, + "grad_norm": 14.174958720568027, + "learning_rate": 4.998488055626329e-05, + "loss": 3.0776, + "mean_token_accuracy": 0.33103448152542114, + "step": 60650 + }, + { + "epoch": 0.06109236020671991, + "grad_norm": 12.999759848422103, + "learning_rate": 4.9984866818476136e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.4543859601020813, + "step": 60655 + }, + { + "epoch": 0.06109739625982408, + "grad_norm": 10.469801022448646, + "learning_rate": 4.998485307445271e-05, + "loss": 2.7086, + "mean_token_accuracy": 0.3724137932062149, + "step": 60660 + }, + { + "epoch": 0.06110243231292825, + "grad_norm": 12.20101436416402, + "learning_rate": 4.998483932419305e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.4640048325061798, + "step": 60665 + }, + { + "epoch": 0.061107468366032425, + "grad_norm": 11.6016952055602, + "learning_rate": 4.998482556769711e-05, + "loss": 2.6587, + "mean_token_accuracy": 0.38620689511299133, + "step": 60670 + }, + { + "epoch": 0.0611125044191366, + "grad_norm": 15.07959621201417, + "learning_rate": 4.998481180496494e-05, + "loss": 2.6004, + "mean_token_accuracy": 0.42068964838981626, + "step": 60675 + }, + { + "epoch": 0.06111754047224077, + "grad_norm": 10.706297776946737, + "learning_rate": 4.998479803599651e-05, + "loss": 2.414, + "mean_token_accuracy": 0.4, + "step": 60680 + }, + { + "epoch": 0.061122576525344946, + "grad_norm": 12.583561090343297, + "learning_rate": 4.998478426079184e-05, + "loss": 2.3766, + "mean_token_accuracy": 0.46079854369163514, + "step": 60685 + }, + { + "epoch": 0.06112761257844912, + "grad_norm": 11.375496134716098, + "learning_rate": 4.998477047935094e-05, + "loss": 2.852, + "mean_token_accuracy": 0.3482758581638336, + "step": 60690 + }, + { + "epoch": 0.06113264863155329, + "grad_norm": 11.921027954536335, + "learning_rate": 4.998475669167379e-05, + "loss": 2.2144, + "mean_token_accuracy": 0.46551724672317507, + "step": 60695 + }, + { + "epoch": 0.06113768468465746, + "grad_norm": 11.75837918289072, + "learning_rate": 4.998474289776042e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.358620685338974, + "step": 60700 + }, + { + "epoch": 0.061142720737761634, + "grad_norm": 11.73011377112593, + "learning_rate": 4.998472909761082e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.41379310488700866, + "step": 60705 + }, + { + "epoch": 0.06114775679086581, + "grad_norm": 13.210835905967208, + "learning_rate": 4.9984715291225e-05, + "loss": 2.6836, + "mean_token_accuracy": 0.3448275804519653, + "step": 60710 + }, + { + "epoch": 0.06115279284396998, + "grad_norm": 10.622805748016017, + "learning_rate": 4.9984701478602955e-05, + "loss": 2.184, + "mean_token_accuracy": 0.482758617401123, + "step": 60715 + }, + { + "epoch": 0.061157828897074155, + "grad_norm": 11.914364265685528, + "learning_rate": 4.998468765974469e-05, + "loss": 2.3602, + "mean_token_accuracy": 0.41724138259887694, + "step": 60720 + }, + { + "epoch": 0.06116286495017833, + "grad_norm": 9.665203000557105, + "learning_rate": 4.998467383465022e-05, + "loss": 2.2192, + "mean_token_accuracy": 0.4310344815254211, + "step": 60725 + }, + { + "epoch": 0.061167901003282496, + "grad_norm": 11.508144277754866, + "learning_rate": 4.998466000331954e-05, + "loss": 2.2544, + "mean_token_accuracy": 0.43793103098869324, + "step": 60730 + }, + { + "epoch": 0.06117293705638667, + "grad_norm": 13.371516394193609, + "learning_rate": 4.998464616575266e-05, + "loss": 2.5949, + "mean_token_accuracy": 0.36896551847457887, + "step": 60735 + }, + { + "epoch": 0.061177973109490844, + "grad_norm": 13.640361312297172, + "learning_rate": 4.998463232194957e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.4, + "step": 60740 + }, + { + "epoch": 0.06118300916259502, + "grad_norm": 13.46600094455518, + "learning_rate": 4.99846184719103e-05, + "loss": 2.372, + "mean_token_accuracy": 0.4185722887516022, + "step": 60745 + }, + { + "epoch": 0.06118804521569919, + "grad_norm": 12.70777933808871, + "learning_rate": 4.9984604615634815e-05, + "loss": 2.8276, + "mean_token_accuracy": 0.3827586233615875, + "step": 60750 + }, + { + "epoch": 0.061193081268803365, + "grad_norm": 11.533950931569853, + "learning_rate": 4.998459075312316e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.4448880791664124, + "step": 60755 + }, + { + "epoch": 0.06119811732190754, + "grad_norm": 12.28890369858678, + "learning_rate": 4.9984576884375306e-05, + "loss": 2.631, + "mean_token_accuracy": 0.4068965554237366, + "step": 60760 + }, + { + "epoch": 0.061203153375011705, + "grad_norm": 12.632913584543306, + "learning_rate": 4.998456300939128e-05, + "loss": 2.69, + "mean_token_accuracy": 0.40000001192092893, + "step": 60765 + }, + { + "epoch": 0.06120818942811588, + "grad_norm": 11.72593839271051, + "learning_rate": 4.998454912817107e-05, + "loss": 2.8603, + "mean_token_accuracy": 0.38064125180244446, + "step": 60770 + }, + { + "epoch": 0.06121322548122005, + "grad_norm": 13.16968082160041, + "learning_rate": 4.998453524071469e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.4724137902259827, + "step": 60775 + }, + { + "epoch": 0.06121826153432423, + "grad_norm": 13.710325395783022, + "learning_rate": 4.998452134702214e-05, + "loss": 2.612, + "mean_token_accuracy": 0.358620685338974, + "step": 60780 + }, + { + "epoch": 0.0612232975874284, + "grad_norm": 13.88846842862795, + "learning_rate": 4.998450744709342e-05, + "loss": 2.7457, + "mean_token_accuracy": 0.36896551251411436, + "step": 60785 + }, + { + "epoch": 0.061228333640532574, + "grad_norm": 11.178429985622644, + "learning_rate": 4.9984493540928545e-05, + "loss": 2.422, + "mean_token_accuracy": 0.4206896543502808, + "step": 60790 + }, + { + "epoch": 0.06123336969363675, + "grad_norm": 12.246653927549643, + "learning_rate": 4.9984479628527515e-05, + "loss": 2.4918, + "mean_token_accuracy": 0.4248768508434296, + "step": 60795 + }, + { + "epoch": 0.061238405746740915, + "grad_norm": 14.601659334825566, + "learning_rate": 4.9984465709890324e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.41379308700561523, + "step": 60800 + }, + { + "epoch": 0.06124344179984509, + "grad_norm": 9.825405226156088, + "learning_rate": 4.9984451785016984e-05, + "loss": 2.1877, + "mean_token_accuracy": 0.44827585220336913, + "step": 60805 + }, + { + "epoch": 0.06124847785294926, + "grad_norm": 14.2518741044702, + "learning_rate": 4.99844378539075e-05, + "loss": 2.4936, + "mean_token_accuracy": 0.4448275864124298, + "step": 60810 + }, + { + "epoch": 0.061253513906053436, + "grad_norm": 12.503127658708125, + "learning_rate": 4.998442391656186e-05, + "loss": 2.8253, + "mean_token_accuracy": 0.37241379022598264, + "step": 60815 + }, + { + "epoch": 0.06125854995915761, + "grad_norm": 11.365220650088974, + "learning_rate": 4.998440997298009e-05, + "loss": 2.8196, + "mean_token_accuracy": 0.38275861740112305, + "step": 60820 + }, + { + "epoch": 0.061263586012261784, + "grad_norm": 11.19644098456086, + "learning_rate": 4.998439602316219e-05, + "loss": 2.2451, + "mean_token_accuracy": 0.4379310369491577, + "step": 60825 + }, + { + "epoch": 0.06126862206536596, + "grad_norm": 10.843849066229062, + "learning_rate": 4.9984382067108157e-05, + "loss": 2.2162, + "mean_token_accuracy": 0.4344827592372894, + "step": 60830 + }, + { + "epoch": 0.061273658118470124, + "grad_norm": 9.978040903871667, + "learning_rate": 4.9984368104818e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.41724138259887694, + "step": 60835 + }, + { + "epoch": 0.0612786941715743, + "grad_norm": 15.48391771231129, + "learning_rate": 4.998435413629171e-05, + "loss": 2.765, + "mean_token_accuracy": 0.37586206793785093, + "step": 60840 + }, + { + "epoch": 0.06128373022467847, + "grad_norm": 12.79379909886921, + "learning_rate": 4.99843401615293e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.37586206793785093, + "step": 60845 + }, + { + "epoch": 0.061288766277782646, + "grad_norm": 12.432238199306587, + "learning_rate": 4.998432618053078e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.4137930989265442, + "step": 60850 + }, + { + "epoch": 0.06129380233088682, + "grad_norm": 9.906281777612866, + "learning_rate": 4.998431219329615e-05, + "loss": 2.0733, + "mean_token_accuracy": 0.4965517222881317, + "step": 60855 + }, + { + "epoch": 0.06129883838399099, + "grad_norm": 11.667680713918477, + "learning_rate": 4.998429819982541e-05, + "loss": 2.5069, + "mean_token_accuracy": 0.42413793206214906, + "step": 60860 + }, + { + "epoch": 0.06130387443709517, + "grad_norm": 9.985311962092275, + "learning_rate": 4.9984284200118566e-05, + "loss": 2.678, + "mean_token_accuracy": 0.4275862067937851, + "step": 60865 + }, + { + "epoch": 0.061308910490199334, + "grad_norm": 9.87021724048607, + "learning_rate": 4.998427019417562e-05, + "loss": 2.3677, + "mean_token_accuracy": 0.4586206912994385, + "step": 60870 + }, + { + "epoch": 0.06131394654330351, + "grad_norm": 12.870368588308825, + "learning_rate": 4.9984256181996584e-05, + "loss": 2.8536, + "mean_token_accuracy": 0.36551724672317504, + "step": 60875 + }, + { + "epoch": 0.06131898259640768, + "grad_norm": 11.81527390751178, + "learning_rate": 4.9984242163581445e-05, + "loss": 2.352, + "mean_token_accuracy": 0.4068965554237366, + "step": 60880 + }, + { + "epoch": 0.061324018649511855, + "grad_norm": 14.361974485249515, + "learning_rate": 4.998422813893022e-05, + "loss": 2.8835, + "mean_token_accuracy": 0.3551724076271057, + "step": 60885 + }, + { + "epoch": 0.06132905470261603, + "grad_norm": 8.293796362084402, + "learning_rate": 4.998421410804292e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4816697001457214, + "step": 60890 + }, + { + "epoch": 0.0613340907557202, + "grad_norm": 12.337936978376892, + "learning_rate": 4.9984200070919536e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.4620689630508423, + "step": 60895 + }, + { + "epoch": 0.061339126808824376, + "grad_norm": 14.269805893767883, + "learning_rate": 4.998418602756007e-05, + "loss": 2.541, + "mean_token_accuracy": 0.4034482777118683, + "step": 60900 + }, + { + "epoch": 0.06134416286192854, + "grad_norm": 12.014824054042165, + "learning_rate": 4.998417197796454e-05, + "loss": 2.7445, + "mean_token_accuracy": 0.34482758939266206, + "step": 60905 + }, + { + "epoch": 0.06134919891503272, + "grad_norm": 12.578812375891, + "learning_rate": 4.998415792213293e-05, + "loss": 2.9219, + "mean_token_accuracy": 0.3896551728248596, + "step": 60910 + }, + { + "epoch": 0.06135423496813689, + "grad_norm": 11.515401433789897, + "learning_rate": 4.998414386006526e-05, + "loss": 2.5048, + "mean_token_accuracy": 0.4300060421228409, + "step": 60915 + }, + { + "epoch": 0.061359271021241064, + "grad_norm": 11.896152881686483, + "learning_rate": 4.998412979176153e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.39655172228813174, + "step": 60920 + }, + { + "epoch": 0.06136430707434524, + "grad_norm": 11.382969923357996, + "learning_rate": 4.9984115717221735e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.41034482717514037, + "step": 60925 + }, + { + "epoch": 0.06136934312744941, + "grad_norm": 10.758223073812248, + "learning_rate": 4.99841016364459e-05, + "loss": 2.8235, + "mean_token_accuracy": 0.3551724135875702, + "step": 60930 + }, + { + "epoch": 0.061374379180553586, + "grad_norm": 12.881763063486847, + "learning_rate": 4.9984087549434005e-05, + "loss": 2.7333, + "mean_token_accuracy": 0.33103448152542114, + "step": 60935 + }, + { + "epoch": 0.06137941523365775, + "grad_norm": 10.519227375378943, + "learning_rate": 4.998407345618607e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.41034482717514037, + "step": 60940 + }, + { + "epoch": 0.061384451286761926, + "grad_norm": 12.902271769599553, + "learning_rate": 4.998405935670209e-05, + "loss": 2.4911, + "mean_token_accuracy": 0.3931034505367279, + "step": 60945 + }, + { + "epoch": 0.0613894873398661, + "grad_norm": 11.681012936576305, + "learning_rate": 4.998404525098207e-05, + "loss": 2.4153, + "mean_token_accuracy": 0.37586206793785093, + "step": 60950 + }, + { + "epoch": 0.061394523392970274, + "grad_norm": 11.446626796566896, + "learning_rate": 4.9984031139026024e-05, + "loss": 2.6148, + "mean_token_accuracy": 0.4413793087005615, + "step": 60955 + }, + { + "epoch": 0.06139955944607445, + "grad_norm": 12.24068049422373, + "learning_rate": 4.998401702083394e-05, + "loss": 2.7793, + "mean_token_accuracy": 0.3809437394142151, + "step": 60960 + }, + { + "epoch": 0.06140459549917862, + "grad_norm": 13.282359976507566, + "learning_rate": 4.9984002896405834e-05, + "loss": 2.6898, + "mean_token_accuracy": 0.37241379618644715, + "step": 60965 + }, + { + "epoch": 0.061409631552282795, + "grad_norm": 12.229371942059815, + "learning_rate": 4.99839887657417e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.42758620977401735, + "step": 60970 + }, + { + "epoch": 0.06141466760538696, + "grad_norm": 9.72245085093366, + "learning_rate": 4.998397462884156e-05, + "loss": 2.5888, + "mean_token_accuracy": 0.4427707254886627, + "step": 60975 + }, + { + "epoch": 0.061419703658491136, + "grad_norm": 11.576517467180311, + "learning_rate": 4.99839604857054e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.48469448685646055, + "step": 60980 + }, + { + "epoch": 0.06142473971159531, + "grad_norm": 14.575449453874423, + "learning_rate": 4.998394633633323e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.42758620381355283, + "step": 60985 + }, + { + "epoch": 0.06142977576469948, + "grad_norm": 10.703681873516116, + "learning_rate": 4.998393218072505e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.3793103516101837, + "step": 60990 + }, + { + "epoch": 0.06143481181780366, + "grad_norm": 12.821291466823785, + "learning_rate": 4.9983918018880876e-05, + "loss": 2.7018, + "mean_token_accuracy": 0.38620689511299133, + "step": 60995 + }, + { + "epoch": 0.06143984787090783, + "grad_norm": 12.437399177555061, + "learning_rate": 4.9983903850800695e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.3655172437429428, + "step": 61000 + }, + { + "epoch": 0.061444883924012005, + "grad_norm": 8.836579479929444, + "learning_rate": 4.998388967648452e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.4068965554237366, + "step": 61005 + }, + { + "epoch": 0.06144991997711617, + "grad_norm": 10.358773795347407, + "learning_rate": 4.9983875495932355e-05, + "loss": 2.2272, + "mean_token_accuracy": 0.4275862157344818, + "step": 61010 + }, + { + "epoch": 0.061454956030220345, + "grad_norm": 10.655984042280286, + "learning_rate": 4.998386130914421e-05, + "loss": 2.617, + "mean_token_accuracy": 0.4225045442581177, + "step": 61015 + }, + { + "epoch": 0.06145999208332452, + "grad_norm": 10.06587294477688, + "learning_rate": 4.998384711612007e-05, + "loss": 2.1148, + "mean_token_accuracy": 0.458620685338974, + "step": 61020 + }, + { + "epoch": 0.06146502813642869, + "grad_norm": 10.959257957527587, + "learning_rate": 4.998383291685996e-05, + "loss": 2.6675, + "mean_token_accuracy": 0.38965516686439516, + "step": 61025 + }, + { + "epoch": 0.061470064189532866, + "grad_norm": 12.979696527583272, + "learning_rate": 4.998381871136387e-05, + "loss": 2.8825, + "mean_token_accuracy": 0.37241379022598264, + "step": 61030 + }, + { + "epoch": 0.06147510024263704, + "grad_norm": 11.820341484208374, + "learning_rate": 4.9983804499631815e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.3655172407627106, + "step": 61035 + }, + { + "epoch": 0.061480136295741214, + "grad_norm": 10.594816698177208, + "learning_rate": 4.998379028166379e-05, + "loss": 1.8772, + "mean_token_accuracy": 0.5125226855278016, + "step": 61040 + }, + { + "epoch": 0.06148517234884538, + "grad_norm": 13.748667149548465, + "learning_rate": 4.99837760574598e-05, + "loss": 2.448, + "mean_token_accuracy": 0.3551724165678024, + "step": 61045 + }, + { + "epoch": 0.061490208401949555, + "grad_norm": 20.695947567361593, + "learning_rate": 4.9983761827019857e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.46551724672317507, + "step": 61050 + }, + { + "epoch": 0.06149524445505373, + "grad_norm": 25.19566171312912, + "learning_rate": 4.998374759034395e-05, + "loss": 2.5511, + "mean_token_accuracy": 0.44827587008476255, + "step": 61055 + }, + { + "epoch": 0.0615002805081579, + "grad_norm": 12.622428596948168, + "learning_rate": 4.9983733347432096e-05, + "loss": 2.6227, + "mean_token_accuracy": 0.417241370677948, + "step": 61060 + }, + { + "epoch": 0.061505316561262076, + "grad_norm": 11.744017226978869, + "learning_rate": 4.9983719098284295e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.4034482777118683, + "step": 61065 + }, + { + "epoch": 0.06151035261436625, + "grad_norm": 11.555240398490348, + "learning_rate": 4.998370484290055e-05, + "loss": 2.2825, + "mean_token_accuracy": 0.4413793087005615, + "step": 61070 + }, + { + "epoch": 0.06151538866747042, + "grad_norm": 12.039358341119847, + "learning_rate": 4.9983690581280866e-05, + "loss": 2.7447, + "mean_token_accuracy": 0.31724137663841245, + "step": 61075 + }, + { + "epoch": 0.06152042472057459, + "grad_norm": 12.732539508710907, + "learning_rate": 4.9983676313425245e-05, + "loss": 2.7096, + "mean_token_accuracy": 0.36896551847457887, + "step": 61080 + }, + { + "epoch": 0.061525460773678764, + "grad_norm": 10.941224027987998, + "learning_rate": 4.998366203933369e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.4551724076271057, + "step": 61085 + }, + { + "epoch": 0.06153049682678294, + "grad_norm": 22.649235881120084, + "learning_rate": 4.9983647759006216e-05, + "loss": 2.6894, + "mean_token_accuracy": 0.4000000059604645, + "step": 61090 + }, + { + "epoch": 0.06153553287988711, + "grad_norm": 9.160445996849305, + "learning_rate": 4.9983633472442815e-05, + "loss": 2.499, + "mean_token_accuracy": 0.3655172407627106, + "step": 61095 + }, + { + "epoch": 0.061540568932991285, + "grad_norm": 11.617730606290884, + "learning_rate": 4.998361917964349e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.4502117335796356, + "step": 61100 + }, + { + "epoch": 0.06154560498609546, + "grad_norm": 10.068171510737814, + "learning_rate": 4.998360488060826e-05, + "loss": 2.6928, + "mean_token_accuracy": 0.3758620649576187, + "step": 61105 + }, + { + "epoch": 0.06155064103919963, + "grad_norm": 11.915712620806389, + "learning_rate": 4.99835905753371e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4379310369491577, + "step": 61110 + }, + { + "epoch": 0.0615556770923038, + "grad_norm": 12.12056400646725, + "learning_rate": 4.998357626383005e-05, + "loss": 2.5804, + "mean_token_accuracy": 0.33448275923728943, + "step": 61115 + }, + { + "epoch": 0.06156071314540797, + "grad_norm": 14.717487990470058, + "learning_rate": 4.998356194608709e-05, + "loss": 3.0927, + "mean_token_accuracy": 0.3551724135875702, + "step": 61120 + }, + { + "epoch": 0.06156574919851215, + "grad_norm": 16.075721285934897, + "learning_rate": 4.998354762210823e-05, + "loss": 2.7387, + "mean_token_accuracy": 0.36896551847457887, + "step": 61125 + }, + { + "epoch": 0.06157078525161632, + "grad_norm": 10.377626617221766, + "learning_rate": 4.998353329189347e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.4068965584039688, + "step": 61130 + }, + { + "epoch": 0.061575821304720495, + "grad_norm": 12.658106343727441, + "learning_rate": 4.998351895544283e-05, + "loss": 2.674, + "mean_token_accuracy": 0.3827586233615875, + "step": 61135 + }, + { + "epoch": 0.06158085735782467, + "grad_norm": 11.341667485107928, + "learning_rate": 4.9983504612756297e-05, + "loss": 2.286, + "mean_token_accuracy": 0.49491833448410033, + "step": 61140 + }, + { + "epoch": 0.06158589341092884, + "grad_norm": 13.858331409055655, + "learning_rate": 4.998349026383387e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.4, + "step": 61145 + }, + { + "epoch": 0.06159092946403301, + "grad_norm": 9.79071140793119, + "learning_rate": 4.998347590867557e-05, + "loss": 2.8608, + "mean_token_accuracy": 0.3724137991666794, + "step": 61150 + }, + { + "epoch": 0.06159596551713718, + "grad_norm": 11.974664670359026, + "learning_rate": 4.99834615472814e-05, + "loss": 2.9615, + "mean_token_accuracy": 0.33793103098869326, + "step": 61155 + }, + { + "epoch": 0.06160100157024136, + "grad_norm": 12.170581596823617, + "learning_rate": 4.9983447179651346e-05, + "loss": 2.3914, + "mean_token_accuracy": 0.44482758045196535, + "step": 61160 + }, + { + "epoch": 0.06160603762334553, + "grad_norm": 11.937746124976512, + "learning_rate": 4.998343280578543e-05, + "loss": 3.018, + "mean_token_accuracy": 0.3379310369491577, + "step": 61165 + }, + { + "epoch": 0.061611073676449704, + "grad_norm": 11.425222585585725, + "learning_rate": 4.9983418425683654e-05, + "loss": 2.5276, + "mean_token_accuracy": 0.4103448331356049, + "step": 61170 + }, + { + "epoch": 0.06161610972955388, + "grad_norm": 10.972341242013544, + "learning_rate": 4.998340403934601e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.4728372633457184, + "step": 61175 + }, + { + "epoch": 0.06162114578265805, + "grad_norm": 13.238054488495989, + "learning_rate": 4.9983389646772515e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.4355911314487457, + "step": 61180 + }, + { + "epoch": 0.06162618183576222, + "grad_norm": 12.658361503474714, + "learning_rate": 4.9983375247963174e-05, + "loss": 2.5511, + "mean_token_accuracy": 0.38965516686439516, + "step": 61185 + }, + { + "epoch": 0.06163121788886639, + "grad_norm": 10.903764341131021, + "learning_rate": 4.998336084291798e-05, + "loss": 2.6009, + "mean_token_accuracy": 0.3827586114406586, + "step": 61190 + }, + { + "epoch": 0.061636253941970566, + "grad_norm": 12.255867419784378, + "learning_rate": 4.9983346431636943e-05, + "loss": 2.728, + "mean_token_accuracy": 0.3620689630508423, + "step": 61195 + }, + { + "epoch": 0.06164128999507474, + "grad_norm": 11.86671729161742, + "learning_rate": 4.998333201412006e-05, + "loss": 2.578, + "mean_token_accuracy": 0.38965516686439516, + "step": 61200 + }, + { + "epoch": 0.061646326048178914, + "grad_norm": 10.095263787843054, + "learning_rate": 4.998331759036734e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.3379310339689255, + "step": 61205 + }, + { + "epoch": 0.06165136210128309, + "grad_norm": 12.394359044802973, + "learning_rate": 4.998330316037879e-05, + "loss": 2.811, + "mean_token_accuracy": 0.3379310369491577, + "step": 61210 + }, + { + "epoch": 0.06165639815438726, + "grad_norm": 10.650654758974728, + "learning_rate": 4.998328872415441e-05, + "loss": 2.5656, + "mean_token_accuracy": 0.4, + "step": 61215 + }, + { + "epoch": 0.06166143420749143, + "grad_norm": 13.700319957960788, + "learning_rate": 4.998327428169422e-05, + "loss": 2.9714, + "mean_token_accuracy": 0.3827586233615875, + "step": 61220 + }, + { + "epoch": 0.0616664702605956, + "grad_norm": 12.871321434088754, + "learning_rate": 4.9983259832998195e-05, + "loss": 2.546, + "mean_token_accuracy": 0.4241379380226135, + "step": 61225 + }, + { + "epoch": 0.061671506313699775, + "grad_norm": 13.541510953289714, + "learning_rate": 4.998324537806637e-05, + "loss": 2.8257, + "mean_token_accuracy": 0.3620689630508423, + "step": 61230 + }, + { + "epoch": 0.06167654236680395, + "grad_norm": 10.485501814367387, + "learning_rate": 4.998323091689871e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.42758620381355283, + "step": 61235 + }, + { + "epoch": 0.06168157841990812, + "grad_norm": 11.510750758742542, + "learning_rate": 4.9983216449495255e-05, + "loss": 2.3985, + "mean_token_accuracy": 0.4206896543502808, + "step": 61240 + }, + { + "epoch": 0.0616866144730123, + "grad_norm": 11.042795540177016, + "learning_rate": 4.9983201975856e-05, + "loss": 2.7364, + "mean_token_accuracy": 0.324137932062149, + "step": 61245 + }, + { + "epoch": 0.06169165052611647, + "grad_norm": 14.562593856534097, + "learning_rate": 4.9983187495980946e-05, + "loss": 2.317, + "mean_token_accuracy": 0.4344827592372894, + "step": 61250 + }, + { + "epoch": 0.06169668657922064, + "grad_norm": 12.035032599055087, + "learning_rate": 4.9983173009870094e-05, + "loss": 2.3153, + "mean_token_accuracy": 0.42413793206214906, + "step": 61255 + }, + { + "epoch": 0.06170172263232481, + "grad_norm": 13.366867866959446, + "learning_rate": 4.998315851752344e-05, + "loss": 2.9947, + "mean_token_accuracy": 0.3965517163276672, + "step": 61260 + }, + { + "epoch": 0.061706758685428985, + "grad_norm": 9.908334140111782, + "learning_rate": 4.9983144018941005e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.39310344457626345, + "step": 61265 + }, + { + "epoch": 0.06171179473853316, + "grad_norm": 11.736464688904853, + "learning_rate": 4.998312951412279e-05, + "loss": 2.4541, + "mean_token_accuracy": 0.4344827592372894, + "step": 61270 + }, + { + "epoch": 0.06171683079163733, + "grad_norm": 11.181537124416623, + "learning_rate": 4.998311500306879e-05, + "loss": 2.1947, + "mean_token_accuracy": 0.4241379380226135, + "step": 61275 + }, + { + "epoch": 0.061721866844741506, + "grad_norm": 11.468012607208172, + "learning_rate": 4.998310048577901e-05, + "loss": 2.3766, + "mean_token_accuracy": 0.42413793206214906, + "step": 61280 + }, + { + "epoch": 0.06172690289784568, + "grad_norm": 14.570667613752613, + "learning_rate": 4.998308596225347e-05, + "loss": 2.4263, + "mean_token_accuracy": 0.42758620381355283, + "step": 61285 + }, + { + "epoch": 0.06173193895094985, + "grad_norm": 13.077842010640376, + "learning_rate": 4.998307143249215e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.42758620977401735, + "step": 61290 + }, + { + "epoch": 0.06173697500405402, + "grad_norm": 14.151620355347168, + "learning_rate": 4.9983056896495076e-05, + "loss": 2.8954, + "mean_token_accuracy": 0.38965516686439516, + "step": 61295 + }, + { + "epoch": 0.061742011057158194, + "grad_norm": 11.624414056422784, + "learning_rate": 4.998304235426224e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.35862069129943847, + "step": 61300 + }, + { + "epoch": 0.06174704711026237, + "grad_norm": 10.152922198967312, + "learning_rate": 4.998302780579365e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.44482758045196535, + "step": 61305 + }, + { + "epoch": 0.06175208316336654, + "grad_norm": 13.498896543630305, + "learning_rate": 4.99830132510893e-05, + "loss": 2.8595, + "mean_token_accuracy": 0.36551724672317504, + "step": 61310 + }, + { + "epoch": 0.061757119216470716, + "grad_norm": 13.329928019732176, + "learning_rate": 4.9982998690149205e-05, + "loss": 2.8889, + "mean_token_accuracy": 0.3482758641242981, + "step": 61315 + }, + { + "epoch": 0.06176215526957489, + "grad_norm": 11.494763865806634, + "learning_rate": 4.998298412297337e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.46896552443504336, + "step": 61320 + }, + { + "epoch": 0.061767191322679056, + "grad_norm": 10.730761202838421, + "learning_rate": 4.99829695495618e-05, + "loss": 2.313, + "mean_token_accuracy": 0.4448275864124298, + "step": 61325 + }, + { + "epoch": 0.06177222737578323, + "grad_norm": 15.295204853239804, + "learning_rate": 4.998295496991448e-05, + "loss": 2.8224, + "mean_token_accuracy": 0.3689655065536499, + "step": 61330 + }, + { + "epoch": 0.061777263428887404, + "grad_norm": 11.66116157561056, + "learning_rate": 4.9982940384031445e-05, + "loss": 2.6388, + "mean_token_accuracy": 0.4068965494632721, + "step": 61335 + }, + { + "epoch": 0.06178229948199158, + "grad_norm": 13.158571961052234, + "learning_rate": 4.998292579191268e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.4137930989265442, + "step": 61340 + }, + { + "epoch": 0.06178733553509575, + "grad_norm": 11.415450732643984, + "learning_rate": 4.998291119355818e-05, + "loss": 2.6976, + "mean_token_accuracy": 0.35862069129943847, + "step": 61345 + }, + { + "epoch": 0.061792371588199925, + "grad_norm": 9.88108759925936, + "learning_rate": 4.998289658896797e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.42068966031074523, + "step": 61350 + }, + { + "epoch": 0.0617974076413041, + "grad_norm": 9.796304636079038, + "learning_rate": 4.998288197814204e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4517241418361664, + "step": 61355 + }, + { + "epoch": 0.061802443694408266, + "grad_norm": 10.544512717436971, + "learning_rate": 4.99828673610804e-05, + "loss": 3.0007, + "mean_token_accuracy": 0.3482758641242981, + "step": 61360 + }, + { + "epoch": 0.06180747974751244, + "grad_norm": 10.84167618452037, + "learning_rate": 4.998285273778306e-05, + "loss": 2.5306, + "mean_token_accuracy": 0.42758620381355283, + "step": 61365 + }, + { + "epoch": 0.06181251580061661, + "grad_norm": 12.565363374042542, + "learning_rate": 4.998283810825001e-05, + "loss": 2.6312, + "mean_token_accuracy": 0.40217784941196444, + "step": 61370 + }, + { + "epoch": 0.06181755185372079, + "grad_norm": 13.163550310429573, + "learning_rate": 4.998282347248126e-05, + "loss": 2.7995, + "mean_token_accuracy": 0.3758620619773865, + "step": 61375 + }, + { + "epoch": 0.06182258790682496, + "grad_norm": 11.57908482655264, + "learning_rate": 4.998280883047682e-05, + "loss": 2.3172, + "mean_token_accuracy": 0.482819128036499, + "step": 61380 + }, + { + "epoch": 0.061827623959929134, + "grad_norm": 10.656431164734363, + "learning_rate": 4.998279418223668e-05, + "loss": 2.8556, + "mean_token_accuracy": 0.3241379290819168, + "step": 61385 + }, + { + "epoch": 0.06183266001303331, + "grad_norm": 11.238398042517932, + "learning_rate": 4.998277952776087e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.4, + "step": 61390 + }, + { + "epoch": 0.061837696066137475, + "grad_norm": 16.50338690932847, + "learning_rate": 4.998276486704936e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.42413792610168455, + "step": 61395 + }, + { + "epoch": 0.06184273211924165, + "grad_norm": 10.755975690163028, + "learning_rate": 4.998275020010217e-05, + "loss": 2.6001, + "mean_token_accuracy": 0.4052026629447937, + "step": 61400 + }, + { + "epoch": 0.06184776817234582, + "grad_norm": 11.747854794290209, + "learning_rate": 4.998273552691932e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.4739866852760315, + "step": 61405 + }, + { + "epoch": 0.061852804225449996, + "grad_norm": 11.624956893573927, + "learning_rate": 4.9982720847500794e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.42413792610168455, + "step": 61410 + }, + { + "epoch": 0.06185784027855417, + "grad_norm": 11.109034234775965, + "learning_rate": 4.9982706161846605e-05, + "loss": 2.702, + "mean_token_accuracy": 0.37241379022598264, + "step": 61415 + }, + { + "epoch": 0.061862876331658344, + "grad_norm": 11.007641647308722, + "learning_rate": 4.998269146995674e-05, + "loss": 1.9984, + "mean_token_accuracy": 0.4676950931549072, + "step": 61420 + }, + { + "epoch": 0.06186791238476252, + "grad_norm": 11.01345259138696, + "learning_rate": 4.9982676771831234e-05, + "loss": 2.6161, + "mean_token_accuracy": 0.4, + "step": 61425 + }, + { + "epoch": 0.061872948437866684, + "grad_norm": 12.415990840501886, + "learning_rate": 4.998266206747006e-05, + "loss": 2.2763, + "mean_token_accuracy": 0.4379310369491577, + "step": 61430 + }, + { + "epoch": 0.06187798449097086, + "grad_norm": 11.7144231041422, + "learning_rate": 4.998264735687324e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.3999999940395355, + "step": 61435 + }, + { + "epoch": 0.06188302054407503, + "grad_norm": 15.08400134978886, + "learning_rate": 4.9982632640040774e-05, + "loss": 2.7548, + "mean_token_accuracy": 0.4448275864124298, + "step": 61440 + }, + { + "epoch": 0.061888056597179206, + "grad_norm": 10.997616597536501, + "learning_rate": 4.9982617916972666e-05, + "loss": 2.6518, + "mean_token_accuracy": 0.41379310488700866, + "step": 61445 + }, + { + "epoch": 0.06189309265028338, + "grad_norm": 13.172450491629998, + "learning_rate": 4.998260318766892e-05, + "loss": 2.5827, + "mean_token_accuracy": 0.4310344815254211, + "step": 61450 + }, + { + "epoch": 0.06189812870338755, + "grad_norm": 13.653861908568981, + "learning_rate": 4.9982588452129537e-05, + "loss": 2.9313, + "mean_token_accuracy": 0.3517241388559341, + "step": 61455 + }, + { + "epoch": 0.06190316475649173, + "grad_norm": 17.443022336596822, + "learning_rate": 4.998257371035453e-05, + "loss": 2.9806, + "mean_token_accuracy": 0.3356321781873703, + "step": 61460 + }, + { + "epoch": 0.061908200809595894, + "grad_norm": 11.553446681609449, + "learning_rate": 4.9982558962343896e-05, + "loss": 2.5557, + "mean_token_accuracy": 0.4275861978530884, + "step": 61465 + }, + { + "epoch": 0.06191323686270007, + "grad_norm": 10.864588855604435, + "learning_rate": 4.998254420809764e-05, + "loss": 2.8106, + "mean_token_accuracy": 0.3724137872457504, + "step": 61470 + }, + { + "epoch": 0.06191827291580424, + "grad_norm": 16.738330702560827, + "learning_rate": 4.9982529447615764e-05, + "loss": 2.3987, + "mean_token_accuracy": 0.4344827592372894, + "step": 61475 + }, + { + "epoch": 0.061923308968908415, + "grad_norm": 13.050785724502227, + "learning_rate": 4.9982514680898286e-05, + "loss": 2.679, + "mean_token_accuracy": 0.36896551251411436, + "step": 61480 + }, + { + "epoch": 0.06192834502201259, + "grad_norm": 9.085657887867605, + "learning_rate": 4.998249990794518e-05, + "loss": 2.182, + "mean_token_accuracy": 0.5034482777118683, + "step": 61485 + }, + { + "epoch": 0.06193338107511676, + "grad_norm": 13.003771993114151, + "learning_rate": 4.998248512875648e-05, + "loss": 2.5409, + "mean_token_accuracy": 0.4261947929859161, + "step": 61490 + }, + { + "epoch": 0.061938417128220936, + "grad_norm": 10.261386855504673, + "learning_rate": 4.9982470343332184e-05, + "loss": 2.3032, + "mean_token_accuracy": 0.4965517222881317, + "step": 61495 + }, + { + "epoch": 0.0619434531813251, + "grad_norm": 11.90611017119523, + "learning_rate": 4.998245555167228e-05, + "loss": 2.5309, + "mean_token_accuracy": 0.4068965554237366, + "step": 61500 + }, + { + "epoch": 0.06194848923442928, + "grad_norm": 11.928910853836483, + "learning_rate": 4.998244075377679e-05, + "loss": 2.3976, + "mean_token_accuracy": 0.40689656138420105, + "step": 61505 + }, + { + "epoch": 0.06195352528753345, + "grad_norm": 10.719240542862194, + "learning_rate": 4.998242594964571e-05, + "loss": 2.7833, + "mean_token_accuracy": 0.3931034505367279, + "step": 61510 + }, + { + "epoch": 0.061958561340637625, + "grad_norm": 12.64631656298641, + "learning_rate": 4.998241113927904e-05, + "loss": 2.3892, + "mean_token_accuracy": 0.4379310369491577, + "step": 61515 + }, + { + "epoch": 0.0619635973937418, + "grad_norm": 12.055597877717409, + "learning_rate": 4.998239632267679e-05, + "loss": 2.5573, + "mean_token_accuracy": 0.44827585816383364, + "step": 61520 + }, + { + "epoch": 0.06196863344684597, + "grad_norm": 10.132957613726102, + "learning_rate": 4.998238149983897e-05, + "loss": 2.308, + "mean_token_accuracy": 0.4432546854019165, + "step": 61525 + }, + { + "epoch": 0.061973669499950146, + "grad_norm": 10.988356005985198, + "learning_rate": 4.998236667076557e-05, + "loss": 2.446, + "mean_token_accuracy": 0.43448275327682495, + "step": 61530 + }, + { + "epoch": 0.06197870555305431, + "grad_norm": 17.548017373955883, + "learning_rate": 4.998235183545661e-05, + "loss": 2.7644, + "mean_token_accuracy": 0.4153055131435394, + "step": 61535 + }, + { + "epoch": 0.061983741606158486, + "grad_norm": 11.903423291765865, + "learning_rate": 4.998233699391208e-05, + "loss": 2.6269, + "mean_token_accuracy": 0.3604355752468109, + "step": 61540 + }, + { + "epoch": 0.06198877765926266, + "grad_norm": 18.07622877201099, + "learning_rate": 4.998232214613199e-05, + "loss": 2.8877, + "mean_token_accuracy": 0.37586206793785093, + "step": 61545 + }, + { + "epoch": 0.061993813712366834, + "grad_norm": 13.071540668714718, + "learning_rate": 4.998230729211635e-05, + "loss": 3.1695, + "mean_token_accuracy": 0.358620685338974, + "step": 61550 + }, + { + "epoch": 0.06199884976547101, + "grad_norm": 13.043094008636514, + "learning_rate": 4.998229243186516e-05, + "loss": 2.7874, + "mean_token_accuracy": 0.36896551847457887, + "step": 61555 + }, + { + "epoch": 0.06200388581857518, + "grad_norm": 14.24071937558595, + "learning_rate": 4.9982277565378413e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.4137930989265442, + "step": 61560 + }, + { + "epoch": 0.062008921871679355, + "grad_norm": 11.834815486386319, + "learning_rate": 4.9982262692656124e-05, + "loss": 2.2782, + "mean_token_accuracy": 0.4379310369491577, + "step": 61565 + }, + { + "epoch": 0.06201395792478352, + "grad_norm": 14.797756083500381, + "learning_rate": 4.99822478136983e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.43968542516231535, + "step": 61570 + }, + { + "epoch": 0.062018993977887696, + "grad_norm": 9.609729725483426, + "learning_rate": 4.998223292850494e-05, + "loss": 2.6458, + "mean_token_accuracy": 0.35862068831920624, + "step": 61575 + }, + { + "epoch": 0.06202403003099187, + "grad_norm": 10.514796291873681, + "learning_rate": 4.998221803707604e-05, + "loss": 2.6998, + "mean_token_accuracy": 0.4206896543502808, + "step": 61580 + }, + { + "epoch": 0.06202906608409604, + "grad_norm": 13.643705881042427, + "learning_rate": 4.998220313941162e-05, + "loss": 2.2231, + "mean_token_accuracy": 0.43448275327682495, + "step": 61585 + }, + { + "epoch": 0.06203410213720022, + "grad_norm": 11.050182656052737, + "learning_rate": 4.998218823551168e-05, + "loss": 2.6722, + "mean_token_accuracy": 0.3931034505367279, + "step": 61590 + }, + { + "epoch": 0.06203913819030439, + "grad_norm": 12.576074982804448, + "learning_rate": 4.998217332537621e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.41167573928833007, + "step": 61595 + }, + { + "epoch": 0.062044174243408565, + "grad_norm": 12.006587185633139, + "learning_rate": 4.9982158409005236e-05, + "loss": 3.498, + "mean_token_accuracy": 0.35862068831920624, + "step": 61600 + }, + { + "epoch": 0.06204921029651273, + "grad_norm": 12.362629218756272, + "learning_rate": 4.998214348639876e-05, + "loss": 2.7446, + "mean_token_accuracy": 0.39655172228813174, + "step": 61605 + }, + { + "epoch": 0.062054246349616905, + "grad_norm": 13.26180847716348, + "learning_rate": 4.998212855755676e-05, + "loss": 2.7949, + "mean_token_accuracy": 0.3655172407627106, + "step": 61610 + }, + { + "epoch": 0.06205928240272108, + "grad_norm": 14.37448408636518, + "learning_rate": 4.998211362247927e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.417241370677948, + "step": 61615 + }, + { + "epoch": 0.06206431845582525, + "grad_norm": 12.695326992367068, + "learning_rate": 4.9982098681166276e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.3862068891525269, + "step": 61620 + }, + { + "epoch": 0.06206935450892943, + "grad_norm": 8.761091798657166, + "learning_rate": 4.998208373361779e-05, + "loss": 2.3486, + "mean_token_accuracy": 0.43103448748588563, + "step": 61625 + }, + { + "epoch": 0.0620743905620336, + "grad_norm": 20.943093883948997, + "learning_rate": 4.998206877983381e-05, + "loss": 2.2828, + "mean_token_accuracy": 0.46551724672317507, + "step": 61630 + }, + { + "epoch": 0.062079426615137774, + "grad_norm": 11.689794179888644, + "learning_rate": 4.9982053819814345e-05, + "loss": 2.7125, + "mean_token_accuracy": 0.3827586233615875, + "step": 61635 + }, + { + "epoch": 0.06208446266824194, + "grad_norm": 10.345225077018595, + "learning_rate": 4.99820388535594e-05, + "loss": 2.3627, + "mean_token_accuracy": 0.42758620381355283, + "step": 61640 + }, + { + "epoch": 0.062089498721346115, + "grad_norm": 13.681093662428259, + "learning_rate": 4.998202388106898e-05, + "loss": 3.1751, + "mean_token_accuracy": 0.3620689630508423, + "step": 61645 + }, + { + "epoch": 0.06209453477445029, + "grad_norm": 11.27277845197705, + "learning_rate": 4.998200890234308e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.38620689511299133, + "step": 61650 + }, + { + "epoch": 0.06209957082755446, + "grad_norm": 11.057330853131813, + "learning_rate": 4.9981993917381715e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.37931033968925476, + "step": 61655 + }, + { + "epoch": 0.062104606880658636, + "grad_norm": 11.908632076717923, + "learning_rate": 4.998197892618489e-05, + "loss": 2.4993, + "mean_token_accuracy": 0.38620689511299133, + "step": 61660 + }, + { + "epoch": 0.06210964293376281, + "grad_norm": 16.94587915679362, + "learning_rate": 4.99819639287526e-05, + "loss": 2.778, + "mean_token_accuracy": 0.36896551847457887, + "step": 61665 + }, + { + "epoch": 0.062114678986866984, + "grad_norm": 9.49488581888489, + "learning_rate": 4.998194892508486e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.4068965494632721, + "step": 61670 + }, + { + "epoch": 0.06211971503997115, + "grad_norm": 11.843151707883353, + "learning_rate": 4.9981933915181654e-05, + "loss": 2.842, + "mean_token_accuracy": 0.33793102502822875, + "step": 61675 + }, + { + "epoch": 0.062124751093075324, + "grad_norm": 12.028622846928004, + "learning_rate": 4.9981918899043006e-05, + "loss": 2.7788, + "mean_token_accuracy": 0.37586207389831544, + "step": 61680 + }, + { + "epoch": 0.0621297871461795, + "grad_norm": 11.432723118710356, + "learning_rate": 4.998190387666892e-05, + "loss": 2.4197, + "mean_token_accuracy": 0.4316502511501312, + "step": 61685 + }, + { + "epoch": 0.06213482319928367, + "grad_norm": 11.607407856322915, + "learning_rate": 4.9981888848059386e-05, + "loss": 2.6172, + "mean_token_accuracy": 0.4034482717514038, + "step": 61690 + }, + { + "epoch": 0.062139859252387845, + "grad_norm": 11.953709685273008, + "learning_rate": 4.998187381321443e-05, + "loss": 2.5251, + "mean_token_accuracy": 0.41379310488700866, + "step": 61695 + }, + { + "epoch": 0.06214489530549202, + "grad_norm": 14.634164138506524, + "learning_rate": 4.998185877213402e-05, + "loss": 2.841, + "mean_token_accuracy": 0.3827586203813553, + "step": 61700 + }, + { + "epoch": 0.06214993135859619, + "grad_norm": 11.020396881208146, + "learning_rate": 4.99818437248182e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.42758620381355283, + "step": 61705 + }, + { + "epoch": 0.06215496741170036, + "grad_norm": 10.582910782870869, + "learning_rate": 4.998182867126695e-05, + "loss": 2.9725, + "mean_token_accuracy": 0.38965516686439516, + "step": 61710 + }, + { + "epoch": 0.062160003464804534, + "grad_norm": 15.826538511016324, + "learning_rate": 4.998181361148028e-05, + "loss": 2.8664, + "mean_token_accuracy": 0.34482758343219755, + "step": 61715 + }, + { + "epoch": 0.06216503951790871, + "grad_norm": 12.476983410135572, + "learning_rate": 4.9981798545458194e-05, + "loss": 2.8217, + "mean_token_accuracy": 0.3793103456497192, + "step": 61720 + }, + { + "epoch": 0.06217007557101288, + "grad_norm": 11.32230152982007, + "learning_rate": 4.9981783473200706e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.42758620381355283, + "step": 61725 + }, + { + "epoch": 0.062175111624117055, + "grad_norm": 13.521489465661448, + "learning_rate": 4.998176839470781e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.358620685338974, + "step": 61730 + }, + { + "epoch": 0.06218014767722123, + "grad_norm": 14.177422435647726, + "learning_rate": 4.99817533099795e-05, + "loss": 3.3368, + "mean_token_accuracy": 0.33103448450565337, + "step": 61735 + }, + { + "epoch": 0.0621851837303254, + "grad_norm": 11.518514194533516, + "learning_rate": 4.9981738219015805e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.4206896543502808, + "step": 61740 + }, + { + "epoch": 0.06219021978342957, + "grad_norm": 12.296069505257273, + "learning_rate": 4.9981723121816706e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.4482758641242981, + "step": 61745 + }, + { + "epoch": 0.06219525583653374, + "grad_norm": 10.738606660014648, + "learning_rate": 4.9981708018382226e-05, + "loss": 2.446, + "mean_token_accuracy": 0.441379314661026, + "step": 61750 + }, + { + "epoch": 0.06220029188963792, + "grad_norm": 13.339306566492768, + "learning_rate": 4.9981692908712356e-05, + "loss": 2.7554, + "mean_token_accuracy": 0.36896551251411436, + "step": 61755 + }, + { + "epoch": 0.06220532794274209, + "grad_norm": 16.269448035814364, + "learning_rate": 4.9981677792807104e-05, + "loss": 2.1499, + "mean_token_accuracy": 0.4801724135875702, + "step": 61760 + }, + { + "epoch": 0.062210363995846264, + "grad_norm": 14.117542349461383, + "learning_rate": 4.998166267066648e-05, + "loss": 2.7082, + "mean_token_accuracy": 0.41034482717514037, + "step": 61765 + }, + { + "epoch": 0.06221540004895044, + "grad_norm": 10.645616699602432, + "learning_rate": 4.998164754229048e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.4620689630508423, + "step": 61770 + }, + { + "epoch": 0.06222043610205461, + "grad_norm": 10.826693667004337, + "learning_rate": 4.99816324076791e-05, + "loss": 2.2169, + "mean_token_accuracy": 0.4437386512756348, + "step": 61775 + }, + { + "epoch": 0.06222547215515878, + "grad_norm": 12.707149006296968, + "learning_rate": 4.998161726683236e-05, + "loss": 2.6857, + "mean_token_accuracy": 0.4, + "step": 61780 + }, + { + "epoch": 0.06223050820826295, + "grad_norm": 10.261297904611293, + "learning_rate": 4.9981602119750274e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.43793103098869324, + "step": 61785 + }, + { + "epoch": 0.062235544261367126, + "grad_norm": 14.694302372307588, + "learning_rate": 4.9981586966432817e-05, + "loss": 3.0429, + "mean_token_accuracy": 0.3103448271751404, + "step": 61790 + }, + { + "epoch": 0.0622405803144713, + "grad_norm": 10.908509107415773, + "learning_rate": 4.998157180688002e-05, + "loss": 2.4925, + "mean_token_accuracy": 0.4, + "step": 61795 + }, + { + "epoch": 0.062245616367575474, + "grad_norm": 9.635762385977953, + "learning_rate": 4.998155664109187e-05, + "loss": 2.1652, + "mean_token_accuracy": 0.4429521977901459, + "step": 61800 + }, + { + "epoch": 0.06225065242067965, + "grad_norm": 13.38136062659689, + "learning_rate": 4.998154146906837e-05, + "loss": 2.777, + "mean_token_accuracy": 0.37586206793785093, + "step": 61805 + }, + { + "epoch": 0.06225568847378382, + "grad_norm": 10.620972850122714, + "learning_rate": 4.998152629080954e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.42413793206214906, + "step": 61810 + }, + { + "epoch": 0.06226072452688799, + "grad_norm": 12.519783467993364, + "learning_rate": 4.9981511106315373e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.3980036318302155, + "step": 61815 + }, + { + "epoch": 0.06226576057999216, + "grad_norm": 11.343947943883633, + "learning_rate": 4.998149591558586e-05, + "loss": 2.4283, + "mean_token_accuracy": 0.40689654648303986, + "step": 61820 + }, + { + "epoch": 0.062270796633096336, + "grad_norm": 12.52557943799043, + "learning_rate": 4.998148071862104e-05, + "loss": 2.3068, + "mean_token_accuracy": 0.44482758045196535, + "step": 61825 + }, + { + "epoch": 0.06227583268620051, + "grad_norm": 14.261396731502801, + "learning_rate": 4.998146551542089e-05, + "loss": 2.552, + "mean_token_accuracy": 0.3620689630508423, + "step": 61830 + }, + { + "epoch": 0.06228086873930468, + "grad_norm": 11.74810635695176, + "learning_rate": 4.998145030598542e-05, + "loss": 2.7293, + "mean_token_accuracy": 0.3965517282485962, + "step": 61835 + }, + { + "epoch": 0.06228590479240886, + "grad_norm": 10.307344834578016, + "learning_rate": 4.998143509031464e-05, + "loss": 2.569, + "mean_token_accuracy": 0.41034482717514037, + "step": 61840 + }, + { + "epoch": 0.06229094084551303, + "grad_norm": 10.998418980740832, + "learning_rate": 4.998141986840855e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.4551724076271057, + "step": 61845 + }, + { + "epoch": 0.0622959768986172, + "grad_norm": 9.847837404921052, + "learning_rate": 4.998140464026715e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.4310344845056534, + "step": 61850 + }, + { + "epoch": 0.06230101295172137, + "grad_norm": 14.109341428392359, + "learning_rate": 4.998138940589045e-05, + "loss": 2.7926, + "mean_token_accuracy": 0.36896551847457887, + "step": 61855 + }, + { + "epoch": 0.062306049004825545, + "grad_norm": 11.709216617380044, + "learning_rate": 4.998137416527846e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.4034482777118683, + "step": 61860 + }, + { + "epoch": 0.06231108505792972, + "grad_norm": 33.766816009536704, + "learning_rate": 4.9981358918431164e-05, + "loss": 3.0452, + "mean_token_accuracy": 0.32758620083332063, + "step": 61865 + }, + { + "epoch": 0.06231612111103389, + "grad_norm": 12.504112267695515, + "learning_rate": 4.998134366534859e-05, + "loss": 2.645, + "mean_token_accuracy": 0.37241379022598264, + "step": 61870 + }, + { + "epoch": 0.062321157164138066, + "grad_norm": 11.65046722838868, + "learning_rate": 4.9981328406030724e-05, + "loss": 2.5522, + "mean_token_accuracy": 0.4103448212146759, + "step": 61875 + }, + { + "epoch": 0.06232619321724224, + "grad_norm": 12.863270455853554, + "learning_rate": 4.9981313140477586e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.41584996581077577, + "step": 61880 + }, + { + "epoch": 0.06233122927034641, + "grad_norm": 16.634138675513306, + "learning_rate": 4.998129786868916e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.3827586233615875, + "step": 61885 + }, + { + "epoch": 0.06233626532345058, + "grad_norm": 10.80945416969448, + "learning_rate": 4.9981282590665476e-05, + "loss": 2.6719, + "mean_token_accuracy": 0.38620689511299133, + "step": 61890 + }, + { + "epoch": 0.062341301376554754, + "grad_norm": 12.800422405253357, + "learning_rate": 4.998126730640652e-05, + "loss": 2.7708, + "mean_token_accuracy": 0.39310345649719236, + "step": 61895 + }, + { + "epoch": 0.06234633742965893, + "grad_norm": 18.586244439152633, + "learning_rate": 4.99812520159123e-05, + "loss": 2.8749, + "mean_token_accuracy": 0.36551724672317504, + "step": 61900 + }, + { + "epoch": 0.0623513734827631, + "grad_norm": 23.62107819192551, + "learning_rate": 4.998123671918281e-05, + "loss": 2.6968, + "mean_token_accuracy": 0.4068965554237366, + "step": 61905 + }, + { + "epoch": 0.062356409535867276, + "grad_norm": 11.65694559386593, + "learning_rate": 4.998122141621808e-05, + "loss": 2.3701, + "mean_token_accuracy": 0.42413793206214906, + "step": 61910 + }, + { + "epoch": 0.06236144558897145, + "grad_norm": 16.95050829440943, + "learning_rate": 4.998120610701809e-05, + "loss": 2.9717, + "mean_token_accuracy": 0.3586206942796707, + "step": 61915 + }, + { + "epoch": 0.062366481642075616, + "grad_norm": 10.624334752438582, + "learning_rate": 4.998119079158286e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.4172413766384125, + "step": 61920 + }, + { + "epoch": 0.06237151769517979, + "grad_norm": 13.58626828102806, + "learning_rate": 4.9981175469912385e-05, + "loss": 2.8818, + "mean_token_accuracy": 0.38275861740112305, + "step": 61925 + }, + { + "epoch": 0.062376553748283964, + "grad_norm": 13.01411404810873, + "learning_rate": 4.998116014200667e-05, + "loss": 2.1237, + "mean_token_accuracy": 0.4896551728248596, + "step": 61930 + }, + { + "epoch": 0.06238158980138814, + "grad_norm": 12.770864170944165, + "learning_rate": 4.9981144807865726e-05, + "loss": 2.8048, + "mean_token_accuracy": 0.32413792610168457, + "step": 61935 + }, + { + "epoch": 0.06238662585449231, + "grad_norm": 10.946546030009342, + "learning_rate": 4.9981129467489546e-05, + "loss": 2.2685, + "mean_token_accuracy": 0.4226860284805298, + "step": 61940 + }, + { + "epoch": 0.062391661907596485, + "grad_norm": 13.55843980299917, + "learning_rate": 4.998111412087815e-05, + "loss": 2.1777, + "mean_token_accuracy": 0.4344827592372894, + "step": 61945 + }, + { + "epoch": 0.06239669796070066, + "grad_norm": 12.043078623741616, + "learning_rate": 4.998109876803153e-05, + "loss": 2.5453, + "mean_token_accuracy": 0.37434967160224913, + "step": 61950 + }, + { + "epoch": 0.062401734013804826, + "grad_norm": 10.397129641310412, + "learning_rate": 4.9981083408949684e-05, + "loss": 2.3512, + "mean_token_accuracy": 0.41034482717514037, + "step": 61955 + }, + { + "epoch": 0.062406770066909, + "grad_norm": 11.412631668368094, + "learning_rate": 4.998106804363263e-05, + "loss": 2.5047, + "mean_token_accuracy": 0.39655172228813174, + "step": 61960 + }, + { + "epoch": 0.06241180612001317, + "grad_norm": 11.519264941441573, + "learning_rate": 4.9981052672080376e-05, + "loss": 2.797, + "mean_token_accuracy": 0.3655172437429428, + "step": 61965 + }, + { + "epoch": 0.06241684217311735, + "grad_norm": 11.808168315591233, + "learning_rate": 4.9981037294292906e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.38275861740112305, + "step": 61970 + }, + { + "epoch": 0.06242187822622152, + "grad_norm": 10.822011851370267, + "learning_rate": 4.998102191027025e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.4068965554237366, + "step": 61975 + }, + { + "epoch": 0.062426914279325695, + "grad_norm": 18.2423352482425, + "learning_rate": 4.9981006520012396e-05, + "loss": 3.2342, + "mean_token_accuracy": 0.3620689570903778, + "step": 61980 + }, + { + "epoch": 0.06243195033242987, + "grad_norm": 11.10424193673708, + "learning_rate": 4.998099112351935e-05, + "loss": 2.2929, + "mean_token_accuracy": 0.42068966031074523, + "step": 61985 + }, + { + "epoch": 0.062436986385534035, + "grad_norm": 12.850949859280323, + "learning_rate": 4.998097572079111e-05, + "loss": 2.6371, + "mean_token_accuracy": 0.4068965494632721, + "step": 61990 + }, + { + "epoch": 0.06244202243863821, + "grad_norm": 11.093332127385423, + "learning_rate": 4.99809603118277e-05, + "loss": 2.7391, + "mean_token_accuracy": 0.40344828069210054, + "step": 61995 + }, + { + "epoch": 0.06244705849174238, + "grad_norm": 10.780032226152947, + "learning_rate": 4.99809448966291e-05, + "loss": 2.4933, + "mean_token_accuracy": 0.39310343861579894, + "step": 62000 + }, + { + "epoch": 0.062452094544846556, + "grad_norm": 11.848169836114655, + "learning_rate": 4.998092947519533e-05, + "loss": 2.6122, + "mean_token_accuracy": 0.3551724135875702, + "step": 62005 + }, + { + "epoch": 0.06245713059795073, + "grad_norm": 14.059507591029904, + "learning_rate": 4.998091404752639e-05, + "loss": 2.9683, + "mean_token_accuracy": 0.36896551251411436, + "step": 62010 + }, + { + "epoch": 0.062462166651054904, + "grad_norm": 10.40035764032004, + "learning_rate": 4.998089861362229e-05, + "loss": 2.1151, + "mean_token_accuracy": 0.4620689570903778, + "step": 62015 + }, + { + "epoch": 0.06246720270415908, + "grad_norm": 14.344305624532506, + "learning_rate": 4.9980883173483015e-05, + "loss": 2.9559, + "mean_token_accuracy": 0.38275861740112305, + "step": 62020 + }, + { + "epoch": 0.062472238757263245, + "grad_norm": 10.842127439917341, + "learning_rate": 4.99808677271086e-05, + "loss": 2.6955, + "mean_token_accuracy": 0.3931034475564957, + "step": 62025 + }, + { + "epoch": 0.06247727481036742, + "grad_norm": 11.413335416232472, + "learning_rate": 4.998085227449902e-05, + "loss": 2.7276, + "mean_token_accuracy": 0.3965517282485962, + "step": 62030 + }, + { + "epoch": 0.06248231086347159, + "grad_norm": 10.829702277830625, + "learning_rate": 4.99808368156543e-05, + "loss": 2.6741, + "mean_token_accuracy": 0.3926799833774567, + "step": 62035 + }, + { + "epoch": 0.062487346916575766, + "grad_norm": 12.277328026949816, + "learning_rate": 4.9980821350574426e-05, + "loss": 2.533, + "mean_token_accuracy": 0.39310344159603117, + "step": 62040 + }, + { + "epoch": 0.06249238296967994, + "grad_norm": 11.53507928483763, + "learning_rate": 4.9980805879259415e-05, + "loss": 2.8252, + "mean_token_accuracy": 0.37241379022598264, + "step": 62045 + }, + { + "epoch": 0.06249741902278411, + "grad_norm": 13.82972267864647, + "learning_rate": 4.998079040170928e-05, + "loss": 2.2513, + "mean_token_accuracy": 0.42068964838981626, + "step": 62050 + }, + { + "epoch": 0.06250245507588828, + "grad_norm": 12.446997992507965, + "learning_rate": 4.9980774917924004e-05, + "loss": 2.535, + "mean_token_accuracy": 0.42068964838981626, + "step": 62055 + }, + { + "epoch": 0.06250749112899245, + "grad_norm": 9.69013145088671, + "learning_rate": 4.9980759427903605e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.41379310488700866, + "step": 62060 + }, + { + "epoch": 0.06251252718209663, + "grad_norm": 13.894256486106459, + "learning_rate": 4.998074393164808e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.4517241358757019, + "step": 62065 + }, + { + "epoch": 0.0625175632352008, + "grad_norm": 11.621507297185293, + "learning_rate": 4.998072842915744e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.38275861740112305, + "step": 62070 + }, + { + "epoch": 0.06252259928830498, + "grad_norm": 15.41459338095443, + "learning_rate": 4.998071292043168e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.38965516686439516, + "step": 62075 + }, + { + "epoch": 0.06252763534140915, + "grad_norm": 10.021925979295649, + "learning_rate": 4.998069740547082e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4675136089324951, + "step": 62080 + }, + { + "epoch": 0.06253267139451332, + "grad_norm": 12.014763205989091, + "learning_rate": 4.998068188427485e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.45680580139160154, + "step": 62085 + }, + { + "epoch": 0.0625377074476175, + "grad_norm": 14.312796546645393, + "learning_rate": 4.998066635684378e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.452339905500412, + "step": 62090 + }, + { + "epoch": 0.06254274350072167, + "grad_norm": 10.44504548052275, + "learning_rate": 4.998065082317761e-05, + "loss": 2.1138, + "mean_token_accuracy": 0.46896551847457885, + "step": 62095 + }, + { + "epoch": 0.06254777955382584, + "grad_norm": 11.502758684110288, + "learning_rate": 4.9980635283276355e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.37931033968925476, + "step": 62100 + }, + { + "epoch": 0.06255281560693002, + "grad_norm": 13.332451955401085, + "learning_rate": 4.998061973714e-05, + "loss": 3.1752, + "mean_token_accuracy": 0.3034482717514038, + "step": 62105 + }, + { + "epoch": 0.06255785166003419, + "grad_norm": 14.440886858729266, + "learning_rate": 4.9980604184768574e-05, + "loss": 2.9196, + "mean_token_accuracy": 0.4, + "step": 62110 + }, + { + "epoch": 0.06256288771313835, + "grad_norm": 11.32702333989073, + "learning_rate": 4.998058862616206e-05, + "loss": 2.8711, + "mean_token_accuracy": 0.36551723480224607, + "step": 62115 + }, + { + "epoch": 0.06256792376624253, + "grad_norm": 11.505385622910655, + "learning_rate": 4.9980573061320476e-05, + "loss": 2.6412, + "mean_token_accuracy": 0.42758620381355283, + "step": 62120 + }, + { + "epoch": 0.0625729598193467, + "grad_norm": 12.269783643147237, + "learning_rate": 4.9980557490243814e-05, + "loss": 2.4396, + "mean_token_accuracy": 0.4275861978530884, + "step": 62125 + }, + { + "epoch": 0.06257799587245087, + "grad_norm": 12.531133988426804, + "learning_rate": 4.9980541912932096e-05, + "loss": 2.4807, + "mean_token_accuracy": 0.4206896543502808, + "step": 62130 + }, + { + "epoch": 0.06258303192555505, + "grad_norm": 12.765495797590255, + "learning_rate": 4.998052632938531e-05, + "loss": 2.5847, + "mean_token_accuracy": 0.36551724672317504, + "step": 62135 + }, + { + "epoch": 0.06258806797865922, + "grad_norm": 11.297646077447837, + "learning_rate": 4.998051073960346e-05, + "loss": 2.6343, + "mean_token_accuracy": 0.4034482717514038, + "step": 62140 + }, + { + "epoch": 0.0625931040317634, + "grad_norm": 11.952887798249673, + "learning_rate": 4.998049514358657e-05, + "loss": 2.6151, + "mean_token_accuracy": 0.3758620649576187, + "step": 62145 + }, + { + "epoch": 0.06259814008486757, + "grad_norm": 12.026730173799285, + "learning_rate": 4.9980479541334626e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.29310344755649564, + "step": 62150 + }, + { + "epoch": 0.06260317613797174, + "grad_norm": 12.232513431985785, + "learning_rate": 4.998046393284763e-05, + "loss": 2.4238, + "mean_token_accuracy": 0.4034482717514038, + "step": 62155 + }, + { + "epoch": 0.06260821219107592, + "grad_norm": 10.806554916932077, + "learning_rate": 4.99804483181256e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.42413792610168455, + "step": 62160 + }, + { + "epoch": 0.06261324824418009, + "grad_norm": 12.470881576532385, + "learning_rate": 4.9980432697168534e-05, + "loss": 2.3288, + "mean_token_accuracy": 0.39655172228813174, + "step": 62165 + }, + { + "epoch": 0.06261828429728426, + "grad_norm": 12.778385305951138, + "learning_rate": 4.9980417069976434e-05, + "loss": 2.8237, + "mean_token_accuracy": 0.37586207389831544, + "step": 62170 + }, + { + "epoch": 0.06262332035038844, + "grad_norm": 11.538637777292367, + "learning_rate": 4.99804014365493e-05, + "loss": 2.3894, + "mean_token_accuracy": 0.37586207389831544, + "step": 62175 + }, + { + "epoch": 0.06262835640349261, + "grad_norm": 9.922858878422783, + "learning_rate": 4.998038579688715e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.3987295746803284, + "step": 62180 + }, + { + "epoch": 0.06263339245659677, + "grad_norm": 14.53588101588764, + "learning_rate": 4.9980370150989984e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.36896551847457887, + "step": 62185 + }, + { + "epoch": 0.06263842850970094, + "grad_norm": 10.241607436354613, + "learning_rate": 4.99803544988578e-05, + "loss": 2.2922, + "mean_token_accuracy": 0.42068966031074523, + "step": 62190 + }, + { + "epoch": 0.06264346456280512, + "grad_norm": 11.796449185512044, + "learning_rate": 4.998033884049061e-05, + "loss": 2.6286, + "mean_token_accuracy": 0.37586206793785093, + "step": 62195 + }, + { + "epoch": 0.06264850061590929, + "grad_norm": 10.365798586997801, + "learning_rate": 4.998032317588841e-05, + "loss": 2.6028, + "mean_token_accuracy": 0.37241379022598264, + "step": 62200 + }, + { + "epoch": 0.06265353666901347, + "grad_norm": 11.7155269448105, + "learning_rate": 4.998030750505121e-05, + "loss": 2.7615, + "mean_token_accuracy": 0.3517241358757019, + "step": 62205 + }, + { + "epoch": 0.06265857272211764, + "grad_norm": 13.962457311917545, + "learning_rate": 4.9980291827979004e-05, + "loss": 2.537, + "mean_token_accuracy": 0.3946763455867767, + "step": 62210 + }, + { + "epoch": 0.06266360877522181, + "grad_norm": 13.24449080095707, + "learning_rate": 4.998027614467182e-05, + "loss": 2.8266, + "mean_token_accuracy": 0.33448274731636046, + "step": 62215 + }, + { + "epoch": 0.06266864482832599, + "grad_norm": 53.2126738579196, + "learning_rate": 4.998026045512964e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.3862068891525269, + "step": 62220 + }, + { + "epoch": 0.06267368088143016, + "grad_norm": 10.560050104835417, + "learning_rate": 4.9980244759352474e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.3999999940395355, + "step": 62225 + }, + { + "epoch": 0.06267871693453433, + "grad_norm": 12.351089354460358, + "learning_rate": 4.998022905734033e-05, + "loss": 2.3385, + "mean_token_accuracy": 0.403448274731636, + "step": 62230 + }, + { + "epoch": 0.06268375298763851, + "grad_norm": 9.651060796633615, + "learning_rate": 4.9980213349093206e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.41379310488700866, + "step": 62235 + }, + { + "epoch": 0.06268878904074268, + "grad_norm": 12.666003900243949, + "learning_rate": 4.998019763461112e-05, + "loss": 2.3178, + "mean_token_accuracy": 0.38620689511299133, + "step": 62240 + }, + { + "epoch": 0.06269382509384686, + "grad_norm": 9.644461810277864, + "learning_rate": 4.9980181913894056e-05, + "loss": 2.527, + "mean_token_accuracy": 0.4034482717514038, + "step": 62245 + }, + { + "epoch": 0.06269886114695103, + "grad_norm": 14.743235851091555, + "learning_rate": 4.998016618694204e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.3896551728248596, + "step": 62250 + }, + { + "epoch": 0.06270389720005519, + "grad_norm": 17.25992273652925, + "learning_rate": 4.9980150453755065e-05, + "loss": 3.2558, + "mean_token_accuracy": 0.31034482419490816, + "step": 62255 + }, + { + "epoch": 0.06270893325315936, + "grad_norm": 12.113644535999981, + "learning_rate": 4.998013471433313e-05, + "loss": 3.2247, + "mean_token_accuracy": 0.35862069129943847, + "step": 62260 + }, + { + "epoch": 0.06271396930626354, + "grad_norm": 12.398181084451958, + "learning_rate": 4.998011896867625e-05, + "loss": 2.5813, + "mean_token_accuracy": 0.3793103456497192, + "step": 62265 + }, + { + "epoch": 0.06271900535936771, + "grad_norm": 14.211156881198992, + "learning_rate": 4.998010321678443e-05, + "loss": 2.877, + "mean_token_accuracy": 0.38275861740112305, + "step": 62270 + }, + { + "epoch": 0.06272404141247188, + "grad_norm": 12.704639576900126, + "learning_rate": 4.9980087458657665e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.4241379380226135, + "step": 62275 + }, + { + "epoch": 0.06272907746557606, + "grad_norm": 10.103183824034012, + "learning_rate": 4.998007169429596e-05, + "loss": 2.7058, + "mean_token_accuracy": 0.42232305407524107, + "step": 62280 + }, + { + "epoch": 0.06273411351868023, + "grad_norm": 11.425038295772477, + "learning_rate": 4.998005592369933e-05, + "loss": 2.6438, + "mean_token_accuracy": 0.4034482777118683, + "step": 62285 + }, + { + "epoch": 0.0627391495717844, + "grad_norm": 12.882669653487197, + "learning_rate": 4.9980040146867777e-05, + "loss": 2.7466, + "mean_token_accuracy": 0.37586206793785093, + "step": 62290 + }, + { + "epoch": 0.06274418562488858, + "grad_norm": 12.94803587512305, + "learning_rate": 4.9980024363801286e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.4012704133987427, + "step": 62295 + }, + { + "epoch": 0.06274922167799275, + "grad_norm": 12.035849375951202, + "learning_rate": 4.998000857449988e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.4935960590839386, + "step": 62300 + }, + { + "epoch": 0.06275425773109693, + "grad_norm": 11.447652566367399, + "learning_rate": 4.997999277896357e-05, + "loss": 2.6115, + "mean_token_accuracy": 0.38620689809322356, + "step": 62305 + }, + { + "epoch": 0.0627592937842011, + "grad_norm": 11.777480976953067, + "learning_rate": 4.997997697719234e-05, + "loss": 2.7785, + "mean_token_accuracy": 0.36896551847457887, + "step": 62310 + }, + { + "epoch": 0.06276432983730527, + "grad_norm": 11.486996245868356, + "learning_rate": 4.9979961169186204e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.43103448748588563, + "step": 62315 + }, + { + "epoch": 0.06276936589040943, + "grad_norm": 13.241748370206066, + "learning_rate": 4.997994535494518e-05, + "loss": 2.5649, + "mean_token_accuracy": 0.4137930989265442, + "step": 62320 + }, + { + "epoch": 0.06277440194351361, + "grad_norm": 15.575348358716179, + "learning_rate": 4.997992953446925e-05, + "loss": 2.18, + "mean_token_accuracy": 0.4413793087005615, + "step": 62325 + }, + { + "epoch": 0.06277943799661778, + "grad_norm": 10.078360050648557, + "learning_rate": 4.997991370775843e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4137930989265442, + "step": 62330 + }, + { + "epoch": 0.06278447404972196, + "grad_norm": 12.581449749526632, + "learning_rate": 4.997989787481271e-05, + "loss": 2.3721, + "mean_token_accuracy": 0.41034482717514037, + "step": 62335 + }, + { + "epoch": 0.06278951010282613, + "grad_norm": 13.209318898230753, + "learning_rate": 4.997988203563212e-05, + "loss": 2.5787, + "mean_token_accuracy": 0.3551724135875702, + "step": 62340 + }, + { + "epoch": 0.0627945461559303, + "grad_norm": 11.748341230340182, + "learning_rate": 4.997986619021664e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.4068965554237366, + "step": 62345 + }, + { + "epoch": 0.06279958220903448, + "grad_norm": 11.188471622974934, + "learning_rate": 4.99798503385663e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.4839901506900787, + "step": 62350 + }, + { + "epoch": 0.06280461826213865, + "grad_norm": 12.34101110526177, + "learning_rate": 4.9979834480681086e-05, + "loss": 2.6021, + "mean_token_accuracy": 0.4034482777118683, + "step": 62355 + }, + { + "epoch": 0.06280965431524282, + "grad_norm": 12.61302959898061, + "learning_rate": 4.9979818616561e-05, + "loss": 2.9714, + "mean_token_accuracy": 0.3643073230981827, + "step": 62360 + }, + { + "epoch": 0.062814690368347, + "grad_norm": 13.245669597804927, + "learning_rate": 4.997980274620605e-05, + "loss": 2.7502, + "mean_token_accuracy": 0.3896551787853241, + "step": 62365 + }, + { + "epoch": 0.06281972642145117, + "grad_norm": 15.808861519887156, + "learning_rate": 4.997978686961625e-05, + "loss": 2.7324, + "mean_token_accuracy": 0.41379310488700866, + "step": 62370 + }, + { + "epoch": 0.06282476247455535, + "grad_norm": 11.41619169676549, + "learning_rate": 4.9979770986791605e-05, + "loss": 2.8882, + "mean_token_accuracy": 0.3862068891525269, + "step": 62375 + }, + { + "epoch": 0.06282979852765952, + "grad_norm": 10.40929396866401, + "learning_rate": 4.9979755097732094e-05, + "loss": 2.1438, + "mean_token_accuracy": 0.4915305495262146, + "step": 62380 + }, + { + "epoch": 0.0628348345807637, + "grad_norm": 12.645847794576438, + "learning_rate": 4.997973920243775e-05, + "loss": 2.6442, + "mean_token_accuracy": 0.39655172228813174, + "step": 62385 + }, + { + "epoch": 0.06283987063386785, + "grad_norm": 10.774207501865742, + "learning_rate": 4.997972330090856e-05, + "loss": 2.5731, + "mean_token_accuracy": 0.38965516686439516, + "step": 62390 + }, + { + "epoch": 0.06284490668697203, + "grad_norm": 12.35902198080692, + "learning_rate": 4.997970739314454e-05, + "loss": 2.4009, + "mean_token_accuracy": 0.4206896543502808, + "step": 62395 + }, + { + "epoch": 0.0628499427400762, + "grad_norm": 11.223575094144403, + "learning_rate": 4.997969147914569e-05, + "loss": 2.59, + "mean_token_accuracy": 0.3965517282485962, + "step": 62400 + }, + { + "epoch": 0.06285497879318037, + "grad_norm": 11.181386096219601, + "learning_rate": 4.997967555891201e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.4344827592372894, + "step": 62405 + }, + { + "epoch": 0.06286001484628455, + "grad_norm": 11.092854536486229, + "learning_rate": 4.997965963244351e-05, + "loss": 2.3509, + "mean_token_accuracy": 0.42758620381355283, + "step": 62410 + }, + { + "epoch": 0.06286505089938872, + "grad_norm": 10.449986378487464, + "learning_rate": 4.9979643699740195e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.441379314661026, + "step": 62415 + }, + { + "epoch": 0.0628700869524929, + "grad_norm": 12.587710324529981, + "learning_rate": 4.997962776080206e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.3807622492313385, + "step": 62420 + }, + { + "epoch": 0.06287512300559707, + "grad_norm": 15.892085125642838, + "learning_rate": 4.997961181562912e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.42068966031074523, + "step": 62425 + }, + { + "epoch": 0.06288015905870124, + "grad_norm": 12.967656971103171, + "learning_rate": 4.997959586422138e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.4793103337287903, + "step": 62430 + }, + { + "epoch": 0.06288519511180542, + "grad_norm": 13.225917543495527, + "learning_rate": 4.9979579906578836e-05, + "loss": 3.1354, + "mean_token_accuracy": 0.36817906498909, + "step": 62435 + }, + { + "epoch": 0.06289023116490959, + "grad_norm": 13.472481209756575, + "learning_rate": 4.9979563942701494e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.4482758641242981, + "step": 62440 + }, + { + "epoch": 0.06289526721801376, + "grad_norm": 11.42313288857788, + "learning_rate": 4.9979547972589366e-05, + "loss": 2.3888, + "mean_token_accuracy": 0.420689657330513, + "step": 62445 + }, + { + "epoch": 0.06290030327111794, + "grad_norm": 11.673780985232318, + "learning_rate": 4.9979531996242446e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.4517241299152374, + "step": 62450 + }, + { + "epoch": 0.06290533932422211, + "grad_norm": 9.654641446594203, + "learning_rate": 4.9979516013660746e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.39310344457626345, + "step": 62455 + }, + { + "epoch": 0.06291037537732627, + "grad_norm": 11.093014663417016, + "learning_rate": 4.9979500024844274e-05, + "loss": 2.3332, + "mean_token_accuracy": 0.42667876482009887, + "step": 62460 + }, + { + "epoch": 0.06291541143043045, + "grad_norm": 12.458585914553815, + "learning_rate": 4.997948402979302e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.3999999940395355, + "step": 62465 + }, + { + "epoch": 0.06292044748353462, + "grad_norm": 11.048079183779482, + "learning_rate": 4.9979468028507006e-05, + "loss": 2.3889, + "mean_token_accuracy": 0.4068965554237366, + "step": 62470 + }, + { + "epoch": 0.0629254835366388, + "grad_norm": 11.244244600912156, + "learning_rate": 4.997945202098623e-05, + "loss": 2.7073, + "mean_token_accuracy": 0.3827586114406586, + "step": 62475 + }, + { + "epoch": 0.06293051958974297, + "grad_norm": 22.257837315165585, + "learning_rate": 4.9979436007230676e-05, + "loss": 2.6666, + "mean_token_accuracy": 0.401935875415802, + "step": 62480 + }, + { + "epoch": 0.06293555564284714, + "grad_norm": 12.109950462166607, + "learning_rate": 4.9979419987240376e-05, + "loss": 2.7719, + "mean_token_accuracy": 0.38771929740905764, + "step": 62485 + }, + { + "epoch": 0.06294059169595131, + "grad_norm": 12.61607693666439, + "learning_rate": 4.997940396101533e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.43448275327682495, + "step": 62490 + }, + { + "epoch": 0.06294562774905549, + "grad_norm": 10.711281986169466, + "learning_rate": 4.997938792855554e-05, + "loss": 3.1789, + "mean_token_accuracy": 0.3379310339689255, + "step": 62495 + }, + { + "epoch": 0.06295066380215966, + "grad_norm": 10.510978886405585, + "learning_rate": 4.9979371889861e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.4172413766384125, + "step": 62500 + }, + { + "epoch": 0.06295569985526384, + "grad_norm": 12.330968358361359, + "learning_rate": 4.9979355844931724e-05, + "loss": 2.4827, + "mean_token_accuracy": 0.4344827473163605, + "step": 62505 + }, + { + "epoch": 0.06296073590836801, + "grad_norm": 11.424444234372416, + "learning_rate": 4.997933979376772e-05, + "loss": 2.8833, + "mean_token_accuracy": 0.3705989122390747, + "step": 62510 + }, + { + "epoch": 0.06296577196147218, + "grad_norm": 11.735958911456319, + "learning_rate": 4.9979323736368974e-05, + "loss": 2.9061, + "mean_token_accuracy": 0.3517241358757019, + "step": 62515 + }, + { + "epoch": 0.06297080801457636, + "grad_norm": 18.22613404235153, + "learning_rate": 4.997930767273552e-05, + "loss": 2.7004, + "mean_token_accuracy": 0.4000000059604645, + "step": 62520 + }, + { + "epoch": 0.06297584406768053, + "grad_norm": 23.61983593764508, + "learning_rate": 4.997929160286734e-05, + "loss": 2.8826, + "mean_token_accuracy": 0.4034482717514038, + "step": 62525 + }, + { + "epoch": 0.06298088012078469, + "grad_norm": 11.853141917536039, + "learning_rate": 4.997927552676444e-05, + "loss": 2.8192, + "mean_token_accuracy": 0.3931034505367279, + "step": 62530 + }, + { + "epoch": 0.06298591617388886, + "grad_norm": 12.224987472827817, + "learning_rate": 4.9979259444426836e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.41724138259887694, + "step": 62535 + }, + { + "epoch": 0.06299095222699304, + "grad_norm": 14.501782720695113, + "learning_rate": 4.997924335585452e-05, + "loss": 2.6304, + "mean_token_accuracy": 0.36896551847457887, + "step": 62540 + }, + { + "epoch": 0.06299598828009721, + "grad_norm": 12.22618132731522, + "learning_rate": 4.997922726104751e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.41724138259887694, + "step": 62545 + }, + { + "epoch": 0.06300102433320139, + "grad_norm": 12.805109773441645, + "learning_rate": 4.997921116000579e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.42758620381355283, + "step": 62550 + }, + { + "epoch": 0.06300606038630556, + "grad_norm": 10.616842235617703, + "learning_rate": 4.9979195052729384e-05, + "loss": 2.6438, + "mean_token_accuracy": 0.39655172228813174, + "step": 62555 + }, + { + "epoch": 0.06301109643940973, + "grad_norm": 10.899057300856157, + "learning_rate": 4.9979178939218285e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.4068965494632721, + "step": 62560 + }, + { + "epoch": 0.06301613249251391, + "grad_norm": 9.906509931671884, + "learning_rate": 4.997916281947251e-05, + "loss": 2.4277, + "mean_token_accuracy": 0.38421053290367124, + "step": 62565 + }, + { + "epoch": 0.06302116854561808, + "grad_norm": 11.847082871403346, + "learning_rate": 4.997914669349205e-05, + "loss": 2.4823, + "mean_token_accuracy": 0.44568965435028074, + "step": 62570 + }, + { + "epoch": 0.06302620459872225, + "grad_norm": 17.997542366909325, + "learning_rate": 4.9979130561276915e-05, + "loss": 2.492, + "mean_token_accuracy": 0.41724138259887694, + "step": 62575 + }, + { + "epoch": 0.06303124065182643, + "grad_norm": 16.56944355255586, + "learning_rate": 4.9979114422827106e-05, + "loss": 3.0716, + "mean_token_accuracy": 0.33103448450565337, + "step": 62580 + }, + { + "epoch": 0.0630362767049306, + "grad_norm": 11.862315666691755, + "learning_rate": 4.997909827814264e-05, + "loss": 2.4111, + "mean_token_accuracy": 0.4379310369491577, + "step": 62585 + }, + { + "epoch": 0.06304131275803478, + "grad_norm": 11.48370584649829, + "learning_rate": 4.997908212722351e-05, + "loss": 2.7825, + "mean_token_accuracy": 0.3793103456497192, + "step": 62590 + }, + { + "epoch": 0.06304634881113895, + "grad_norm": 14.275106085885836, + "learning_rate": 4.9979065970069716e-05, + "loss": 2.8273, + "mean_token_accuracy": 0.38275861740112305, + "step": 62595 + }, + { + "epoch": 0.06305138486424311, + "grad_norm": 13.054366175325045, + "learning_rate": 4.997904980668127e-05, + "loss": 2.6159, + "mean_token_accuracy": 0.4152450144290924, + "step": 62600 + }, + { + "epoch": 0.06305642091734728, + "grad_norm": 11.460283129677492, + "learning_rate": 4.997903363705818e-05, + "loss": 2.7363, + "mean_token_accuracy": 0.3551724135875702, + "step": 62605 + }, + { + "epoch": 0.06306145697045146, + "grad_norm": 16.86719803656878, + "learning_rate": 4.997901746120045e-05, + "loss": 2.7191, + "mean_token_accuracy": 0.4206896543502808, + "step": 62610 + }, + { + "epoch": 0.06306649302355563, + "grad_norm": 11.60301527465864, + "learning_rate": 4.9979001279108076e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.38275861740112305, + "step": 62615 + }, + { + "epoch": 0.0630715290766598, + "grad_norm": 9.59334785716302, + "learning_rate": 4.9978985090781064e-05, + "loss": 2.4335, + "mean_token_accuracy": 0.415426504611969, + "step": 62620 + }, + { + "epoch": 0.06307656512976398, + "grad_norm": 10.853431205632479, + "learning_rate": 4.997896889621942e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.39655172228813174, + "step": 62625 + }, + { + "epoch": 0.06308160118286815, + "grad_norm": 12.081148675996888, + "learning_rate": 4.9978952695423156e-05, + "loss": 2.493, + "mean_token_accuracy": 0.43448275327682495, + "step": 62630 + }, + { + "epoch": 0.06308663723597233, + "grad_norm": 12.755527740367013, + "learning_rate": 4.997893648839227e-05, + "loss": 2.5389, + "mean_token_accuracy": 0.4913793087005615, + "step": 62635 + }, + { + "epoch": 0.0630916732890765, + "grad_norm": 11.795269004460417, + "learning_rate": 4.997892027512677e-05, + "loss": 3.228, + "mean_token_accuracy": 0.33793103098869326, + "step": 62640 + }, + { + "epoch": 0.06309670934218067, + "grad_norm": 10.671812678460117, + "learning_rate": 4.997890405562665e-05, + "loss": 2.7223, + "mean_token_accuracy": 0.3551724135875702, + "step": 62645 + }, + { + "epoch": 0.06310174539528485, + "grad_norm": 11.676922883180511, + "learning_rate": 4.997888782989193e-05, + "loss": 2.6014, + "mean_token_accuracy": 0.39655172228813174, + "step": 62650 + }, + { + "epoch": 0.06310678144838902, + "grad_norm": 10.538287782999582, + "learning_rate": 4.99788715979226e-05, + "loss": 2.2629, + "mean_token_accuracy": 0.4206896543502808, + "step": 62655 + }, + { + "epoch": 0.0631118175014932, + "grad_norm": 14.90855415026501, + "learning_rate": 4.997885535971867e-05, + "loss": 2.7905, + "mean_token_accuracy": 0.4137930989265442, + "step": 62660 + }, + { + "epoch": 0.06311685355459737, + "grad_norm": 9.237068849497524, + "learning_rate": 4.9978839115280144e-05, + "loss": 2.0829, + "mean_token_accuracy": 0.49602670073509214, + "step": 62665 + }, + { + "epoch": 0.06312188960770153, + "grad_norm": 10.361769433594771, + "learning_rate": 4.997882286460704e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.4241379380226135, + "step": 62670 + }, + { + "epoch": 0.0631269256608057, + "grad_norm": 12.489250870863886, + "learning_rate": 4.997880660769934e-05, + "loss": 2.696, + "mean_token_accuracy": 0.39655172228813174, + "step": 62675 + }, + { + "epoch": 0.06313196171390988, + "grad_norm": 13.983554636084667, + "learning_rate": 4.997879034455707e-05, + "loss": 2.5567, + "mean_token_accuracy": 0.37586206793785093, + "step": 62680 + }, + { + "epoch": 0.06313699776701405, + "grad_norm": 9.814819149116904, + "learning_rate": 4.997877407518021e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.42758620977401735, + "step": 62685 + }, + { + "epoch": 0.06314203382011822, + "grad_norm": 10.93821320454277, + "learning_rate": 4.9978757799568784e-05, + "loss": 2.7511, + "mean_token_accuracy": 0.3758620649576187, + "step": 62690 + }, + { + "epoch": 0.0631470698732224, + "grad_norm": 10.829397335044291, + "learning_rate": 4.99787415177228e-05, + "loss": 2.245, + "mean_token_accuracy": 0.4851784646511078, + "step": 62695 + }, + { + "epoch": 0.06315210592632657, + "grad_norm": 9.712067111178953, + "learning_rate": 4.997872522964224e-05, + "loss": 2.1791, + "mean_token_accuracy": 0.41379310488700866, + "step": 62700 + }, + { + "epoch": 0.06315714197943074, + "grad_norm": 11.873982588167959, + "learning_rate": 4.997870893532713e-05, + "loss": 2.7286, + "mean_token_accuracy": 0.4344827473163605, + "step": 62705 + }, + { + "epoch": 0.06316217803253492, + "grad_norm": 24.710975842656893, + "learning_rate": 4.997869263477746e-05, + "loss": 2.8054, + "mean_token_accuracy": 0.3379310369491577, + "step": 62710 + }, + { + "epoch": 0.06316721408563909, + "grad_norm": 11.575714797171193, + "learning_rate": 4.997867632799324e-05, + "loss": 2.5979, + "mean_token_accuracy": 0.34137930870056155, + "step": 62715 + }, + { + "epoch": 0.06317225013874327, + "grad_norm": 10.513917086488547, + "learning_rate": 4.997866001497448e-05, + "loss": 2.4359, + "mean_token_accuracy": 0.4119782209396362, + "step": 62720 + }, + { + "epoch": 0.06317728619184744, + "grad_norm": 10.698267865912632, + "learning_rate": 4.9978643695721184e-05, + "loss": 2.4976, + "mean_token_accuracy": 0.4103448331356049, + "step": 62725 + }, + { + "epoch": 0.06318232224495161, + "grad_norm": 18.16761284445716, + "learning_rate": 4.997862737023334e-05, + "loss": 2.4092, + "mean_token_accuracy": 0.41584996581077577, + "step": 62730 + }, + { + "epoch": 0.06318735829805579, + "grad_norm": 12.508270562390786, + "learning_rate": 4.9978611038510976e-05, + "loss": 2.654, + "mean_token_accuracy": 0.38965516686439516, + "step": 62735 + }, + { + "epoch": 0.06319239435115995, + "grad_norm": 11.397088166177975, + "learning_rate": 4.997859470055408e-05, + "loss": 2.968, + "mean_token_accuracy": 0.34482758641242983, + "step": 62740 + }, + { + "epoch": 0.06319743040426412, + "grad_norm": 10.100531089945259, + "learning_rate": 4.997857835636266e-05, + "loss": 2.2038, + "mean_token_accuracy": 0.4551724135875702, + "step": 62745 + }, + { + "epoch": 0.0632024664573683, + "grad_norm": 11.226503582629404, + "learning_rate": 4.9978562005936724e-05, + "loss": 2.9389, + "mean_token_accuracy": 0.36896551847457887, + "step": 62750 + }, + { + "epoch": 0.06320750251047247, + "grad_norm": 12.870337472897727, + "learning_rate": 4.997854564927628e-05, + "loss": 2.9107, + "mean_token_accuracy": 0.36896551251411436, + "step": 62755 + }, + { + "epoch": 0.06321253856357664, + "grad_norm": 12.829992082370925, + "learning_rate": 4.997852928638132e-05, + "loss": 2.7187, + "mean_token_accuracy": 0.41724138855934145, + "step": 62760 + }, + { + "epoch": 0.06321757461668082, + "grad_norm": 12.229133994137596, + "learning_rate": 4.997851291725186e-05, + "loss": 2.6039, + "mean_token_accuracy": 0.41034482717514037, + "step": 62765 + }, + { + "epoch": 0.06322261066978499, + "grad_norm": 11.061082915404828, + "learning_rate": 4.99784965418879e-05, + "loss": 2.3104, + "mean_token_accuracy": 0.4068965554237366, + "step": 62770 + }, + { + "epoch": 0.06322764672288916, + "grad_norm": 15.358048199655533, + "learning_rate": 4.9978480160289447e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.42413793206214906, + "step": 62775 + }, + { + "epoch": 0.06323268277599334, + "grad_norm": 10.643038854491767, + "learning_rate": 4.997846377245651e-05, + "loss": 2.578, + "mean_token_accuracy": 0.39655172228813174, + "step": 62780 + }, + { + "epoch": 0.06323771882909751, + "grad_norm": 12.07324951302468, + "learning_rate": 4.997844737838907e-05, + "loss": 2.5732, + "mean_token_accuracy": 0.4206896543502808, + "step": 62785 + }, + { + "epoch": 0.06324275488220168, + "grad_norm": 13.497796717479257, + "learning_rate": 4.9978430978087157e-05, + "loss": 2.6489, + "mean_token_accuracy": 0.38620689511299133, + "step": 62790 + }, + { + "epoch": 0.06324779093530586, + "grad_norm": 11.293321670814565, + "learning_rate": 4.997841457155077e-05, + "loss": 2.2181, + "mean_token_accuracy": 0.441379314661026, + "step": 62795 + }, + { + "epoch": 0.06325282698841003, + "grad_norm": 11.953459418662739, + "learning_rate": 4.9978398158779914e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.41034482717514037, + "step": 62800 + }, + { + "epoch": 0.0632578630415142, + "grad_norm": 10.55252555515734, + "learning_rate": 4.9978381739774585e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.458620685338974, + "step": 62805 + }, + { + "epoch": 0.06326289909461837, + "grad_norm": 12.656446890749, + "learning_rate": 4.997836531453479e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.3310344755649567, + "step": 62810 + }, + { + "epoch": 0.06326793514772254, + "grad_norm": 10.21942769802935, + "learning_rate": 4.997834888306054e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.4068965494632721, + "step": 62815 + }, + { + "epoch": 0.06327297120082671, + "grad_norm": 10.12716900947491, + "learning_rate": 4.9978332445351834e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.4819116711616516, + "step": 62820 + }, + { + "epoch": 0.06327800725393089, + "grad_norm": 11.776979181085718, + "learning_rate": 4.997831600140869e-05, + "loss": 2.9502, + "mean_token_accuracy": 0.3551724076271057, + "step": 62825 + }, + { + "epoch": 0.06328304330703506, + "grad_norm": 10.794313497532576, + "learning_rate": 4.997829955123109e-05, + "loss": 2.5536, + "mean_token_accuracy": 0.4220810651779175, + "step": 62830 + }, + { + "epoch": 0.06328807936013923, + "grad_norm": 10.836106390048661, + "learning_rate": 4.997828309481905e-05, + "loss": 2.3873, + "mean_token_accuracy": 0.42758620381355283, + "step": 62835 + }, + { + "epoch": 0.06329311541324341, + "grad_norm": 21.372688150776362, + "learning_rate": 4.9978266632172573e-05, + "loss": 2.7892, + "mean_token_accuracy": 0.45722927451133727, + "step": 62840 + }, + { + "epoch": 0.06329815146634758, + "grad_norm": 10.962490477211139, + "learning_rate": 4.997825016329167e-05, + "loss": 2.6139, + "mean_token_accuracy": 0.42758620977401735, + "step": 62845 + }, + { + "epoch": 0.06330318751945176, + "grad_norm": 11.831796561043065, + "learning_rate": 4.997823368817634e-05, + "loss": 2.7147, + "mean_token_accuracy": 0.3827586233615875, + "step": 62850 + }, + { + "epoch": 0.06330822357255593, + "grad_norm": 11.027178918930792, + "learning_rate": 4.997821720682658e-05, + "loss": 2.5693, + "mean_token_accuracy": 0.38965516686439516, + "step": 62855 + }, + { + "epoch": 0.0633132596256601, + "grad_norm": 14.645262125735309, + "learning_rate": 4.997820071924241e-05, + "loss": 2.5093, + "mean_token_accuracy": 0.38965516686439516, + "step": 62860 + }, + { + "epoch": 0.06331829567876428, + "grad_norm": 11.806101436397247, + "learning_rate": 4.997818422542384e-05, + "loss": 2.5099, + "mean_token_accuracy": 0.4172413766384125, + "step": 62865 + }, + { + "epoch": 0.06332333173186845, + "grad_norm": 11.735624725176145, + "learning_rate": 4.997816772537084e-05, + "loss": 2.881, + "mean_token_accuracy": 0.36896551549434664, + "step": 62870 + }, + { + "epoch": 0.06332836778497263, + "grad_norm": 10.974194335934163, + "learning_rate": 4.997815121908344e-05, + "loss": 2.4363, + "mean_token_accuracy": 0.42758620977401735, + "step": 62875 + }, + { + "epoch": 0.06333340383807678, + "grad_norm": 15.37668527091801, + "learning_rate": 4.997813470656165e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.4068965494632721, + "step": 62880 + }, + { + "epoch": 0.06333843989118096, + "grad_norm": 9.995425936929692, + "learning_rate": 4.9978118187805464e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.45862069725990295, + "step": 62885 + }, + { + "epoch": 0.06334347594428513, + "grad_norm": 11.345111963198455, + "learning_rate": 4.997810166281488e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.4517241299152374, + "step": 62890 + }, + { + "epoch": 0.0633485119973893, + "grad_norm": 13.867665584125604, + "learning_rate": 4.9978085131589917e-05, + "loss": 2.7356, + "mean_token_accuracy": 0.3482758641242981, + "step": 62895 + }, + { + "epoch": 0.06335354805049348, + "grad_norm": 10.51114361338589, + "learning_rate": 4.9978068594130574e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.3896551728248596, + "step": 62900 + }, + { + "epoch": 0.06335858410359765, + "grad_norm": 12.408525903786222, + "learning_rate": 4.9978052050436845e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.39655172228813174, + "step": 62905 + }, + { + "epoch": 0.06336362015670183, + "grad_norm": 11.164442797086085, + "learning_rate": 4.997803550050876e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.4551724135875702, + "step": 62910 + }, + { + "epoch": 0.063368656209806, + "grad_norm": 13.51298160588357, + "learning_rate": 4.99780189443463e-05, + "loss": 2.6889, + "mean_token_accuracy": 0.3793103456497192, + "step": 62915 + }, + { + "epoch": 0.06337369226291018, + "grad_norm": 11.619742638909166, + "learning_rate": 4.997800238194947e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.38275861740112305, + "step": 62920 + }, + { + "epoch": 0.06337872831601435, + "grad_norm": 13.307964549359367, + "learning_rate": 4.997798581331829e-05, + "loss": 2.2547, + "mean_token_accuracy": 0.4344827592372894, + "step": 62925 + }, + { + "epoch": 0.06338376436911852, + "grad_norm": 14.397365160648638, + "learning_rate": 4.997796923845276e-05, + "loss": 2.7711, + "mean_token_accuracy": 0.38275861740112305, + "step": 62930 + }, + { + "epoch": 0.0633888004222227, + "grad_norm": 10.497841330405317, + "learning_rate": 4.997795265735288e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.43103448748588563, + "step": 62935 + }, + { + "epoch": 0.06339383647532687, + "grad_norm": 10.823227285467338, + "learning_rate": 4.997793607001865e-05, + "loss": 2.9231, + "mean_token_accuracy": 0.37241379618644715, + "step": 62940 + }, + { + "epoch": 0.06339887252843104, + "grad_norm": 12.947377640832107, + "learning_rate": 4.997791947645009e-05, + "loss": 2.5871, + "mean_token_accuracy": 0.38620689809322356, + "step": 62945 + }, + { + "epoch": 0.0634039085815352, + "grad_norm": 12.817206924961173, + "learning_rate": 4.9977902876647184e-05, + "loss": 2.5038, + "mean_token_accuracy": 0.3862068891525269, + "step": 62950 + }, + { + "epoch": 0.06340894463463938, + "grad_norm": 12.508946739520635, + "learning_rate": 4.9977886270609954e-05, + "loss": 2.8962, + "mean_token_accuracy": 0.35862069129943847, + "step": 62955 + }, + { + "epoch": 0.06341398068774355, + "grad_norm": 11.33053567371234, + "learning_rate": 4.99778696583384e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.38620689511299133, + "step": 62960 + }, + { + "epoch": 0.06341901674084773, + "grad_norm": 10.486351483419694, + "learning_rate": 4.997785303983252e-05, + "loss": 2.4282, + "mean_token_accuracy": 0.3965517163276672, + "step": 62965 + }, + { + "epoch": 0.0634240527939519, + "grad_norm": 13.012336525853955, + "learning_rate": 4.9977836415092324e-05, + "loss": 2.9493, + "mean_token_accuracy": 0.3172413736581802, + "step": 62970 + }, + { + "epoch": 0.06342908884705607, + "grad_norm": 15.176180764010281, + "learning_rate": 4.997781978411782e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.3931034505367279, + "step": 62975 + }, + { + "epoch": 0.06343412490016025, + "grad_norm": 12.208617826001646, + "learning_rate": 4.997780314690901e-05, + "loss": 2.1636, + "mean_token_accuracy": 0.4620689630508423, + "step": 62980 + }, + { + "epoch": 0.06343916095326442, + "grad_norm": 12.202523748047229, + "learning_rate": 4.99777865034659e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.4068965494632721, + "step": 62985 + }, + { + "epoch": 0.0634441970063686, + "grad_norm": 10.509024103551493, + "learning_rate": 4.997776985378848e-05, + "loss": 2.7396, + "mean_token_accuracy": 0.3862069070339203, + "step": 62990 + }, + { + "epoch": 0.06344923305947277, + "grad_norm": 12.697886661582267, + "learning_rate": 4.997775319787678e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.38275861740112305, + "step": 62995 + }, + { + "epoch": 0.06345426911257694, + "grad_norm": 9.792681006966529, + "learning_rate": 4.997773653573078e-05, + "loss": 2.6428, + "mean_token_accuracy": 0.3931034505367279, + "step": 63000 + }, + { + "epoch": 0.06345930516568112, + "grad_norm": 10.925680548089185, + "learning_rate": 4.99777198673505e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.41379310488700866, + "step": 63005 + }, + { + "epoch": 0.06346434121878529, + "grad_norm": 11.889808588793308, + "learning_rate": 4.997770319273594e-05, + "loss": 2.6302, + "mean_token_accuracy": 0.37931033968925476, + "step": 63010 + }, + { + "epoch": 0.06346937727188946, + "grad_norm": 11.566248975214169, + "learning_rate": 4.997768651188711e-05, + "loss": 2.5148, + "mean_token_accuracy": 0.3931034505367279, + "step": 63015 + }, + { + "epoch": 0.06347441332499362, + "grad_norm": 10.211655840622063, + "learning_rate": 4.9977669824804e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4137930989265442, + "step": 63020 + }, + { + "epoch": 0.0634794493780978, + "grad_norm": 10.87467976315671, + "learning_rate": 4.997765313148664e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.4551724135875702, + "step": 63025 + }, + { + "epoch": 0.06348448543120197, + "grad_norm": 12.49875647957717, + "learning_rate": 4.9977636431935005e-05, + "loss": 2.168, + "mean_token_accuracy": 0.4517241299152374, + "step": 63030 + }, + { + "epoch": 0.06348952148430614, + "grad_norm": 13.348785850842898, + "learning_rate": 4.997761972614912e-05, + "loss": 2.3538, + "mean_token_accuracy": 0.37241379022598264, + "step": 63035 + }, + { + "epoch": 0.06349455753741032, + "grad_norm": 11.006129740225912, + "learning_rate": 4.997760301412898e-05, + "loss": 2.2569, + "mean_token_accuracy": 0.44682395458221436, + "step": 63040 + }, + { + "epoch": 0.06349959359051449, + "grad_norm": 17.709199345690887, + "learning_rate": 4.99775862958746e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.37586206793785093, + "step": 63045 + }, + { + "epoch": 0.06350462964361867, + "grad_norm": 10.296251984232336, + "learning_rate": 4.9977569571385966e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.45862067937850953, + "step": 63050 + }, + { + "epoch": 0.06350966569672284, + "grad_norm": 12.590140769430512, + "learning_rate": 4.9977552840663106e-05, + "loss": 2.512, + "mean_token_accuracy": 0.42413793206214906, + "step": 63055 + }, + { + "epoch": 0.06351470174982701, + "grad_norm": 11.295289432710671, + "learning_rate": 4.997753610370601e-05, + "loss": 2.235, + "mean_token_accuracy": 0.47931034564971925, + "step": 63060 + }, + { + "epoch": 0.06351973780293119, + "grad_norm": 17.41531609938963, + "learning_rate": 4.997751936051468e-05, + "loss": 2.8108, + "mean_token_accuracy": 0.3862069010734558, + "step": 63065 + }, + { + "epoch": 0.06352477385603536, + "grad_norm": 11.526873377100701, + "learning_rate": 4.997750261108913e-05, + "loss": 2.4722, + "mean_token_accuracy": 0.4413793087005615, + "step": 63070 + }, + { + "epoch": 0.06352980990913953, + "grad_norm": 10.80343095231409, + "learning_rate": 4.997748585542936e-05, + "loss": 2.7652, + "mean_token_accuracy": 0.3827586233615875, + "step": 63075 + }, + { + "epoch": 0.06353484596224371, + "grad_norm": 12.391753481395607, + "learning_rate": 4.997746909353538e-05, + "loss": 2.6906, + "mean_token_accuracy": 0.36896551847457887, + "step": 63080 + }, + { + "epoch": 0.06353988201534788, + "grad_norm": 10.849596166029507, + "learning_rate": 4.9977452325407184e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.4344827592372894, + "step": 63085 + }, + { + "epoch": 0.06354491806845204, + "grad_norm": 11.34962919939225, + "learning_rate": 4.997743555104478e-05, + "loss": 2.8666, + "mean_token_accuracy": 0.38620689511299133, + "step": 63090 + }, + { + "epoch": 0.06354995412155622, + "grad_norm": 9.645527777748962, + "learning_rate": 4.997741877044818e-05, + "loss": 2.6885, + "mean_token_accuracy": 0.3965517163276672, + "step": 63095 + }, + { + "epoch": 0.06355499017466039, + "grad_norm": 9.765620035514612, + "learning_rate": 4.9977401983617386e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.47241379618644713, + "step": 63100 + }, + { + "epoch": 0.06356002622776456, + "grad_norm": 9.516551623707082, + "learning_rate": 4.9977385190552394e-05, + "loss": 2.6742, + "mean_token_accuracy": 0.3655172437429428, + "step": 63105 + }, + { + "epoch": 0.06356506228086874, + "grad_norm": 9.295572835471232, + "learning_rate": 4.9977368391253225e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.41560798287391665, + "step": 63110 + }, + { + "epoch": 0.06357009833397291, + "grad_norm": 10.821879250169616, + "learning_rate": 4.997735158571986e-05, + "loss": 2.7486, + "mean_token_accuracy": 0.3844525098800659, + "step": 63115 + }, + { + "epoch": 0.06357513438707708, + "grad_norm": 13.628662211805263, + "learning_rate": 4.9977334773952333e-05, + "loss": 3.116, + "mean_token_accuracy": 0.4, + "step": 63120 + }, + { + "epoch": 0.06358017044018126, + "grad_norm": 21.20882584696481, + "learning_rate": 4.9977317955950623e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.3965517282485962, + "step": 63125 + }, + { + "epoch": 0.06358520649328543, + "grad_norm": 14.21690545699418, + "learning_rate": 4.997730113171475e-05, + "loss": 2.3823, + "mean_token_accuracy": 0.458620685338974, + "step": 63130 + }, + { + "epoch": 0.0635902425463896, + "grad_norm": 14.38705054726303, + "learning_rate": 4.9977284301244716e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.41379310488700866, + "step": 63135 + }, + { + "epoch": 0.06359527859949378, + "grad_norm": 11.278973915636076, + "learning_rate": 4.997726746454052e-05, + "loss": 2.562, + "mean_token_accuracy": 0.41034482717514037, + "step": 63140 + }, + { + "epoch": 0.06360031465259795, + "grad_norm": 11.427734078640018, + "learning_rate": 4.9977250621602165e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.39655172228813174, + "step": 63145 + }, + { + "epoch": 0.06360535070570213, + "grad_norm": 12.187791017923425, + "learning_rate": 4.997723377242966e-05, + "loss": 3.1222, + "mean_token_accuracy": 0.334482753276825, + "step": 63150 + }, + { + "epoch": 0.0636103867588063, + "grad_norm": 11.539350390152805, + "learning_rate": 4.9977216917023026e-05, + "loss": 2.644, + "mean_token_accuracy": 0.3793103456497192, + "step": 63155 + }, + { + "epoch": 0.06361542281191046, + "grad_norm": 11.24816336109071, + "learning_rate": 4.997720005538224e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.3999999940395355, + "step": 63160 + }, + { + "epoch": 0.06362045886501463, + "grad_norm": 12.509004571078448, + "learning_rate": 4.997718318750732e-05, + "loss": 2.7501, + "mean_token_accuracy": 0.3275862067937851, + "step": 63165 + }, + { + "epoch": 0.06362549491811881, + "grad_norm": 11.598917291669215, + "learning_rate": 4.9977166313398274e-05, + "loss": 2.8979, + "mean_token_accuracy": 0.3620689630508423, + "step": 63170 + }, + { + "epoch": 0.06363053097122298, + "grad_norm": 11.166131181654826, + "learning_rate": 4.9977149433055095e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.43448275327682495, + "step": 63175 + }, + { + "epoch": 0.06363556702432716, + "grad_norm": 13.551153532101033, + "learning_rate": 4.9977132546477794e-05, + "loss": 2.5216, + "mean_token_accuracy": 0.3931034505367279, + "step": 63180 + }, + { + "epoch": 0.06364060307743133, + "grad_norm": 18.9003953210268, + "learning_rate": 4.9977115653666385e-05, + "loss": 3.004, + "mean_token_accuracy": 0.3344827651977539, + "step": 63185 + }, + { + "epoch": 0.0636456391305355, + "grad_norm": 11.158137401513866, + "learning_rate": 4.9977098754620854e-05, + "loss": 2.556, + "mean_token_accuracy": 0.43103448748588563, + "step": 63190 + }, + { + "epoch": 0.06365067518363968, + "grad_norm": 23.87861349720783, + "learning_rate": 4.997708184934123e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.41724138259887694, + "step": 63195 + }, + { + "epoch": 0.06365571123674385, + "grad_norm": 12.204114721710244, + "learning_rate": 4.997706493782749e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.38275861740112305, + "step": 63200 + }, + { + "epoch": 0.06366074728984802, + "grad_norm": 13.214232527625812, + "learning_rate": 4.997704802007965e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.4034482777118683, + "step": 63205 + }, + { + "epoch": 0.0636657833429522, + "grad_norm": 11.084630459788958, + "learning_rate": 4.997703109609773e-05, + "loss": 2.3995, + "mean_token_accuracy": 0.3827586233615875, + "step": 63210 + }, + { + "epoch": 0.06367081939605637, + "grad_norm": 10.238150474875157, + "learning_rate": 4.997701416588172e-05, + "loss": 2.5116, + "mean_token_accuracy": 0.39177253246307375, + "step": 63215 + }, + { + "epoch": 0.06367585544916055, + "grad_norm": 10.594671521227854, + "learning_rate": 4.997699722943162e-05, + "loss": 2.4187, + "mean_token_accuracy": 0.4517241418361664, + "step": 63220 + }, + { + "epoch": 0.06368089150226472, + "grad_norm": 13.080216595447656, + "learning_rate": 4.9976980286747444e-05, + "loss": 2.9102, + "mean_token_accuracy": 0.358620685338974, + "step": 63225 + }, + { + "epoch": 0.06368592755536888, + "grad_norm": 12.23305530208788, + "learning_rate": 4.997696333782919e-05, + "loss": 2.7292, + "mean_token_accuracy": 0.3793103456497192, + "step": 63230 + }, + { + "epoch": 0.06369096360847305, + "grad_norm": 11.94714586330912, + "learning_rate": 4.997694638267687e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.358620685338974, + "step": 63235 + }, + { + "epoch": 0.06369599966157723, + "grad_norm": 12.489467614460153, + "learning_rate": 4.997692942129048e-05, + "loss": 2.6647, + "mean_token_accuracy": 0.4172413766384125, + "step": 63240 + }, + { + "epoch": 0.0637010357146814, + "grad_norm": 9.919376558515882, + "learning_rate": 4.997691245367004e-05, + "loss": 2.5829, + "mean_token_accuracy": 0.42413793206214906, + "step": 63245 + }, + { + "epoch": 0.06370607176778557, + "grad_norm": 13.804107657013484, + "learning_rate": 4.9976895479815534e-05, + "loss": 2.9093, + "mean_token_accuracy": 0.36896551251411436, + "step": 63250 + }, + { + "epoch": 0.06371110782088975, + "grad_norm": 11.990478593465738, + "learning_rate": 4.997687849972698e-05, + "loss": 2.7383, + "mean_token_accuracy": 0.3931034505367279, + "step": 63255 + }, + { + "epoch": 0.06371614387399392, + "grad_norm": 15.679260867060291, + "learning_rate": 4.997686151340438e-05, + "loss": 2.6363, + "mean_token_accuracy": 0.42413793206214906, + "step": 63260 + }, + { + "epoch": 0.0637211799270981, + "grad_norm": 8.256280967282471, + "learning_rate": 4.9976844520847735e-05, + "loss": 2.2399, + "mean_token_accuracy": 0.4551724076271057, + "step": 63265 + }, + { + "epoch": 0.06372621598020227, + "grad_norm": 12.107231461333452, + "learning_rate": 4.997682752205706e-05, + "loss": 2.453, + "mean_token_accuracy": 0.42758620381355283, + "step": 63270 + }, + { + "epoch": 0.06373125203330644, + "grad_norm": 12.659418740034596, + "learning_rate": 4.997681051703236e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.4482758641242981, + "step": 63275 + }, + { + "epoch": 0.06373628808641062, + "grad_norm": 13.315742665187408, + "learning_rate": 4.997679350577362e-05, + "loss": 3.1609, + "mean_token_accuracy": 0.3068965464830399, + "step": 63280 + }, + { + "epoch": 0.06374132413951479, + "grad_norm": 11.20398462130214, + "learning_rate": 4.997677648828086e-05, + "loss": 2.545, + "mean_token_accuracy": 0.3931034505367279, + "step": 63285 + }, + { + "epoch": 0.06374636019261896, + "grad_norm": 10.878976319003169, + "learning_rate": 4.997675946455408e-05, + "loss": 2.2835, + "mean_token_accuracy": 0.4310344815254211, + "step": 63290 + }, + { + "epoch": 0.06375139624572314, + "grad_norm": 11.127145562508073, + "learning_rate": 4.997674243459329e-05, + "loss": 2.4556, + "mean_token_accuracy": 0.3999999940395355, + "step": 63295 + }, + { + "epoch": 0.0637564322988273, + "grad_norm": 16.925077194729635, + "learning_rate": 4.997672539839849e-05, + "loss": 2.8662, + "mean_token_accuracy": 0.3655172407627106, + "step": 63300 + }, + { + "epoch": 0.06376146835193147, + "grad_norm": 14.375031439210662, + "learning_rate": 4.9976708355969696e-05, + "loss": 2.9231, + "mean_token_accuracy": 0.38620689511299133, + "step": 63305 + }, + { + "epoch": 0.06376650440503565, + "grad_norm": 11.408587774770599, + "learning_rate": 4.9976691307306895e-05, + "loss": 2.436, + "mean_token_accuracy": 0.3551724076271057, + "step": 63310 + }, + { + "epoch": 0.06377154045813982, + "grad_norm": 10.83959831649001, + "learning_rate": 4.99766742524101e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4517241358757019, + "step": 63315 + }, + { + "epoch": 0.063776576511244, + "grad_norm": 13.484847677117825, + "learning_rate": 4.997665719127931e-05, + "loss": 2.7433, + "mean_token_accuracy": 0.3655172407627106, + "step": 63320 + }, + { + "epoch": 0.06378161256434817, + "grad_norm": 11.77695444858873, + "learning_rate": 4.9976640123914545e-05, + "loss": 2.602, + "mean_token_accuracy": 0.3931034505367279, + "step": 63325 + }, + { + "epoch": 0.06378664861745234, + "grad_norm": 11.811662870956198, + "learning_rate": 4.99766230503158e-05, + "loss": 2.3361, + "mean_token_accuracy": 0.4517241418361664, + "step": 63330 + }, + { + "epoch": 0.06379168467055651, + "grad_norm": 10.276126145341447, + "learning_rate": 4.997660597048307e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.3620689630508423, + "step": 63335 + }, + { + "epoch": 0.06379672072366069, + "grad_norm": 12.198763836826688, + "learning_rate": 4.997658888441637e-05, + "loss": 2.6502, + "mean_token_accuracy": 0.3896551728248596, + "step": 63340 + }, + { + "epoch": 0.06380175677676486, + "grad_norm": 11.661362387885186, + "learning_rate": 4.997657179211572e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.48965518474578856, + "step": 63345 + }, + { + "epoch": 0.06380679282986904, + "grad_norm": 12.647521332896089, + "learning_rate": 4.997655469358109e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.3655172407627106, + "step": 63350 + }, + { + "epoch": 0.06381182888297321, + "grad_norm": 13.875023624712433, + "learning_rate": 4.997653758881252e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.3999999940395355, + "step": 63355 + }, + { + "epoch": 0.06381686493607738, + "grad_norm": 10.24518492284466, + "learning_rate": 4.9976520477809984e-05, + "loss": 2.3814, + "mean_token_accuracy": 0.4206896543502808, + "step": 63360 + }, + { + "epoch": 0.06382190098918156, + "grad_norm": 10.964505185797076, + "learning_rate": 4.997650336057351e-05, + "loss": 2.4232, + "mean_token_accuracy": 0.4103448212146759, + "step": 63365 + }, + { + "epoch": 0.06382693704228572, + "grad_norm": 12.912499536411739, + "learning_rate": 4.9976486237103096e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.4034482717514038, + "step": 63370 + }, + { + "epoch": 0.06383197309538989, + "grad_norm": 11.440229352750475, + "learning_rate": 4.997646910739874e-05, + "loss": 2.9031, + "mean_token_accuracy": 0.3241379290819168, + "step": 63375 + }, + { + "epoch": 0.06383700914849406, + "grad_norm": 12.906632040609026, + "learning_rate": 4.997645197146045e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.38275861740112305, + "step": 63380 + }, + { + "epoch": 0.06384204520159824, + "grad_norm": 11.715984917397773, + "learning_rate": 4.9976434829288235e-05, + "loss": 2.6308, + "mean_token_accuracy": 0.3827586233615875, + "step": 63385 + }, + { + "epoch": 0.06384708125470241, + "grad_norm": 13.657477189207045, + "learning_rate": 4.99764176808821e-05, + "loss": 2.8588, + "mean_token_accuracy": 0.34137930274009703, + "step": 63390 + }, + { + "epoch": 0.06385211730780659, + "grad_norm": 11.755087163734018, + "learning_rate": 4.997640052624204e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.43793103098869324, + "step": 63395 + }, + { + "epoch": 0.06385715336091076, + "grad_norm": 11.367460200987733, + "learning_rate": 4.997638336536807e-05, + "loss": 2.2045, + "mean_token_accuracy": 0.4310344815254211, + "step": 63400 + }, + { + "epoch": 0.06386218941401493, + "grad_norm": 11.591915284504195, + "learning_rate": 4.997636619826019e-05, + "loss": 2.7151, + "mean_token_accuracy": 0.42413792610168455, + "step": 63405 + }, + { + "epoch": 0.06386722546711911, + "grad_norm": 10.571675202571237, + "learning_rate": 4.997634902491841e-05, + "loss": 2.1931, + "mean_token_accuracy": 0.40865094065666197, + "step": 63410 + }, + { + "epoch": 0.06387226152022328, + "grad_norm": 11.07506351615169, + "learning_rate": 4.997633184534272e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4413793206214905, + "step": 63415 + }, + { + "epoch": 0.06387729757332745, + "grad_norm": 13.903265912634309, + "learning_rate": 4.997631465953315e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.3965517282485962, + "step": 63420 + }, + { + "epoch": 0.06388233362643163, + "grad_norm": 10.819228478310656, + "learning_rate": 4.997629746748968e-05, + "loss": 2.5452, + "mean_token_accuracy": 0.3965517282485962, + "step": 63425 + }, + { + "epoch": 0.0638873696795358, + "grad_norm": 12.73018680328224, + "learning_rate": 4.997628026921232e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.46364185214042664, + "step": 63430 + }, + { + "epoch": 0.06389240573263998, + "grad_norm": 10.221986991331171, + "learning_rate": 4.9976263064701096e-05, + "loss": 2.8595, + "mean_token_accuracy": 0.4068965494632721, + "step": 63435 + }, + { + "epoch": 0.06389744178574414, + "grad_norm": 10.529447617098226, + "learning_rate": 4.9976245853955985e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4620689630508423, + "step": 63440 + }, + { + "epoch": 0.06390247783884831, + "grad_norm": 10.191378591498585, + "learning_rate": 4.997622863697701e-05, + "loss": 2.4441, + "mean_token_accuracy": 0.47108287215232847, + "step": 63445 + }, + { + "epoch": 0.06390751389195248, + "grad_norm": 13.20666845510418, + "learning_rate": 4.9976211413764164e-05, + "loss": 2.7672, + "mean_token_accuracy": 0.36551723480224607, + "step": 63450 + }, + { + "epoch": 0.06391254994505666, + "grad_norm": 10.932915369588255, + "learning_rate": 4.997619418431746e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.3793103456497192, + "step": 63455 + }, + { + "epoch": 0.06391758599816083, + "grad_norm": 15.673128488812408, + "learning_rate": 4.99761769486369e-05, + "loss": 2.8893, + "mean_token_accuracy": 0.324137932062149, + "step": 63460 + }, + { + "epoch": 0.063922622051265, + "grad_norm": 12.211686268731789, + "learning_rate": 4.997615970672249e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.3827586233615875, + "step": 63465 + }, + { + "epoch": 0.06392765810436918, + "grad_norm": 8.739707962963923, + "learning_rate": 4.9976142458574224e-05, + "loss": 3.1536, + "mean_token_accuracy": 0.3655172407627106, + "step": 63470 + }, + { + "epoch": 0.06393269415747335, + "grad_norm": 11.208363184433805, + "learning_rate": 4.9976125204192124e-05, + "loss": 2.5098, + "mean_token_accuracy": 0.43103448748588563, + "step": 63475 + }, + { + "epoch": 0.06393773021057753, + "grad_norm": 9.79950717846871, + "learning_rate": 4.997610794357618e-05, + "loss": 2.4275, + "mean_token_accuracy": 0.41034482717514037, + "step": 63480 + }, + { + "epoch": 0.0639427662636817, + "grad_norm": 15.209781113691305, + "learning_rate": 4.997609067672641e-05, + "loss": 2.5033, + "mean_token_accuracy": 0.4310344815254211, + "step": 63485 + }, + { + "epoch": 0.06394780231678587, + "grad_norm": 14.167752256467516, + "learning_rate": 4.997607340364282e-05, + "loss": 2.6225, + "mean_token_accuracy": 0.41712037920951844, + "step": 63490 + }, + { + "epoch": 0.06395283836989005, + "grad_norm": 11.100999586764857, + "learning_rate": 4.997605612432539e-05, + "loss": 2.2545, + "mean_token_accuracy": 0.4448275864124298, + "step": 63495 + }, + { + "epoch": 0.06395787442299422, + "grad_norm": 10.966504768297337, + "learning_rate": 4.9976038838774154e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.4620689630508423, + "step": 63500 + }, + { + "epoch": 0.0639629104760984, + "grad_norm": 14.388775829550012, + "learning_rate": 4.9976021546989105e-05, + "loss": 2.8611, + "mean_token_accuracy": 0.3999999940395355, + "step": 63505 + }, + { + "epoch": 0.06396794652920255, + "grad_norm": 12.387886631425452, + "learning_rate": 4.9976004248970246e-05, + "loss": 3.1523, + "mean_token_accuracy": 0.32068965435028074, + "step": 63510 + }, + { + "epoch": 0.06397298258230673, + "grad_norm": 11.362997099574981, + "learning_rate": 4.9975986944717576e-05, + "loss": 2.5575, + "mean_token_accuracy": 0.36896551251411436, + "step": 63515 + }, + { + "epoch": 0.0639780186354109, + "grad_norm": 16.192976984623527, + "learning_rate": 4.997596963423112e-05, + "loss": 2.6553, + "mean_token_accuracy": 0.3999999940395355, + "step": 63520 + }, + { + "epoch": 0.06398305468851508, + "grad_norm": 9.875343457876722, + "learning_rate": 4.997595231751086e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.4448275864124298, + "step": 63525 + }, + { + "epoch": 0.06398809074161925, + "grad_norm": 13.665571163569808, + "learning_rate": 4.99759349945568e-05, + "loss": 2.4868, + "mean_token_accuracy": 0.42758620381355283, + "step": 63530 + }, + { + "epoch": 0.06399312679472342, + "grad_norm": 11.1912015876693, + "learning_rate": 4.9975917665368975e-05, + "loss": 2.5395, + "mean_token_accuracy": 0.39310344457626345, + "step": 63535 + }, + { + "epoch": 0.0639981628478276, + "grad_norm": 9.120064874876928, + "learning_rate": 4.997590032994736e-05, + "loss": 2.3697, + "mean_token_accuracy": 0.42758620977401735, + "step": 63540 + }, + { + "epoch": 0.06400319890093177, + "grad_norm": 9.272633248285443, + "learning_rate": 4.997588298829198e-05, + "loss": 2.0672, + "mean_token_accuracy": 0.5103448271751404, + "step": 63545 + }, + { + "epoch": 0.06400823495403594, + "grad_norm": 12.948368051972265, + "learning_rate": 4.9975865640402825e-05, + "loss": 2.6744, + "mean_token_accuracy": 0.39310344457626345, + "step": 63550 + }, + { + "epoch": 0.06401327100714012, + "grad_norm": 12.423413399008998, + "learning_rate": 4.9975848286279906e-05, + "loss": 2.4794, + "mean_token_accuracy": 0.43103448748588563, + "step": 63555 + }, + { + "epoch": 0.06401830706024429, + "grad_norm": 11.395164187350844, + "learning_rate": 4.997583092592322e-05, + "loss": 2.4467, + "mean_token_accuracy": 0.42758620381355283, + "step": 63560 + }, + { + "epoch": 0.06402334311334847, + "grad_norm": 10.13471431021179, + "learning_rate": 4.9975813559332785e-05, + "loss": 2.9657, + "mean_token_accuracy": 0.37586206793785093, + "step": 63565 + }, + { + "epoch": 0.06402837916645264, + "grad_norm": 11.549504181070393, + "learning_rate": 4.99757961865086e-05, + "loss": 2.3976, + "mean_token_accuracy": 0.44618227481842043, + "step": 63570 + }, + { + "epoch": 0.06403341521955681, + "grad_norm": 11.536189460612412, + "learning_rate": 4.997577880745066e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.4206896543502808, + "step": 63575 + }, + { + "epoch": 0.06403845127266097, + "grad_norm": 10.38922256890761, + "learning_rate": 4.997576142215899e-05, + "loss": 2.8363, + "mean_token_accuracy": 0.35862069129943847, + "step": 63580 + }, + { + "epoch": 0.06404348732576515, + "grad_norm": 10.317430042015749, + "learning_rate": 4.997574403063358e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.42413792610168455, + "step": 63585 + }, + { + "epoch": 0.06404852337886932, + "grad_norm": 10.544819944891124, + "learning_rate": 4.997572663287444e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.37931033670902253, + "step": 63590 + }, + { + "epoch": 0.0640535594319735, + "grad_norm": 14.657058336835519, + "learning_rate": 4.9975709228881565e-05, + "loss": 2.4867, + "mean_token_accuracy": 0.39310344457626345, + "step": 63595 + }, + { + "epoch": 0.06405859548507767, + "grad_norm": 11.280024663806302, + "learning_rate": 4.997569181865498e-05, + "loss": 2.571, + "mean_token_accuracy": 0.4310344815254211, + "step": 63600 + }, + { + "epoch": 0.06406363153818184, + "grad_norm": 12.156751533400804, + "learning_rate": 4.997567440219467e-05, + "loss": 2.6509, + "mean_token_accuracy": 0.3689655244350433, + "step": 63605 + }, + { + "epoch": 0.06406866759128602, + "grad_norm": 12.979973551279228, + "learning_rate": 4.997565697950065e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.44664678573608396, + "step": 63610 + }, + { + "epoch": 0.06407370364439019, + "grad_norm": 14.28149695784813, + "learning_rate": 4.9975639550572916e-05, + "loss": 2.4403, + "mean_token_accuracy": 0.42068964838981626, + "step": 63615 + }, + { + "epoch": 0.06407873969749436, + "grad_norm": 10.78705531233412, + "learning_rate": 4.9975622115411495e-05, + "loss": 2.3318, + "mean_token_accuracy": 0.42413793206214906, + "step": 63620 + }, + { + "epoch": 0.06408377575059854, + "grad_norm": 12.448957464403424, + "learning_rate": 4.997560467401637e-05, + "loss": 1.9822, + "mean_token_accuracy": 0.5277072012424469, + "step": 63625 + }, + { + "epoch": 0.06408881180370271, + "grad_norm": 13.446651787861287, + "learning_rate": 4.997558722638754e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.4226255267858505, + "step": 63630 + }, + { + "epoch": 0.06409384785680688, + "grad_norm": 11.679342005142507, + "learning_rate": 4.997556977252504e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.4310344815254211, + "step": 63635 + }, + { + "epoch": 0.06409888390991106, + "grad_norm": 13.121698477220587, + "learning_rate": 4.9975552312428846e-05, + "loss": 2.5438, + "mean_token_accuracy": 0.42928009629249575, + "step": 63640 + }, + { + "epoch": 0.06410391996301523, + "grad_norm": 12.01048748425883, + "learning_rate": 4.997553484609898e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.4034482717514038, + "step": 63645 + }, + { + "epoch": 0.06410895601611939, + "grad_norm": 13.442931495953765, + "learning_rate": 4.997551737353543e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.35862069129943847, + "step": 63650 + }, + { + "epoch": 0.06411399206922357, + "grad_norm": 10.99846432018099, + "learning_rate": 4.997549989473822e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.4068965554237366, + "step": 63655 + }, + { + "epoch": 0.06411902812232774, + "grad_norm": 11.674528634927146, + "learning_rate": 4.997548240970735e-05, + "loss": 2.5813, + "mean_token_accuracy": 0.3896551728248596, + "step": 63660 + }, + { + "epoch": 0.06412406417543191, + "grad_norm": 10.737567429673202, + "learning_rate": 4.997546491844281e-05, + "loss": 2.4706, + "mean_token_accuracy": 0.4034482717514038, + "step": 63665 + }, + { + "epoch": 0.06412910022853609, + "grad_norm": 11.479922823042527, + "learning_rate": 4.997544742094462e-05, + "loss": 2.7219, + "mean_token_accuracy": 0.3965517282485962, + "step": 63670 + }, + { + "epoch": 0.06413413628164026, + "grad_norm": 13.827038573206677, + "learning_rate": 4.9975429917212793e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.38620689511299133, + "step": 63675 + }, + { + "epoch": 0.06413917233474443, + "grad_norm": 13.153667121994358, + "learning_rate": 4.997541240724731e-05, + "loss": 2.5, + "mean_token_accuracy": 0.4448275864124298, + "step": 63680 + }, + { + "epoch": 0.06414420838784861, + "grad_norm": 11.549180428583616, + "learning_rate": 4.997539489104818e-05, + "loss": 2.3835, + "mean_token_accuracy": 0.4310344815254211, + "step": 63685 + }, + { + "epoch": 0.06414924444095278, + "grad_norm": 10.731964296006106, + "learning_rate": 4.997537736861543e-05, + "loss": 2.568, + "mean_token_accuracy": 0.41724138259887694, + "step": 63690 + }, + { + "epoch": 0.06415428049405696, + "grad_norm": 15.682129388935925, + "learning_rate": 4.997535983994905e-05, + "loss": 2.6059, + "mean_token_accuracy": 0.43448275327682495, + "step": 63695 + }, + { + "epoch": 0.06415931654716113, + "grad_norm": 10.530591610677469, + "learning_rate": 4.9975342305049033e-05, + "loss": 2.7889, + "mean_token_accuracy": 0.3620689630508423, + "step": 63700 + }, + { + "epoch": 0.0641643526002653, + "grad_norm": 9.193220562309554, + "learning_rate": 4.99753247639154e-05, + "loss": 2.1755, + "mean_token_accuracy": 0.4689655125141144, + "step": 63705 + }, + { + "epoch": 0.06416938865336948, + "grad_norm": 11.853488898559288, + "learning_rate": 4.997530721654816e-05, + "loss": 2.7762, + "mean_token_accuracy": 0.37586206793785093, + "step": 63710 + }, + { + "epoch": 0.06417442470647365, + "grad_norm": 13.090050233671361, + "learning_rate": 4.99752896629473e-05, + "loss": 2.8838, + "mean_token_accuracy": 0.40689654648303986, + "step": 63715 + }, + { + "epoch": 0.06417946075957781, + "grad_norm": 13.012997227267867, + "learning_rate": 4.9975272103112843e-05, + "loss": 3.1407, + "mean_token_accuracy": 0.34137930274009703, + "step": 63720 + }, + { + "epoch": 0.06418449681268198, + "grad_norm": 11.670499843261506, + "learning_rate": 4.997525453704478e-05, + "loss": 2.7562, + "mean_token_accuracy": 0.3551724135875702, + "step": 63725 + }, + { + "epoch": 0.06418953286578616, + "grad_norm": 10.741735803707135, + "learning_rate": 4.997523696474312e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.4517241299152374, + "step": 63730 + }, + { + "epoch": 0.06419456891889033, + "grad_norm": 12.298533556515801, + "learning_rate": 4.9975219386207865e-05, + "loss": 2.8177, + "mean_token_accuracy": 0.36896551847457887, + "step": 63735 + }, + { + "epoch": 0.0641996049719945, + "grad_norm": 10.04131896935388, + "learning_rate": 4.997520180143904e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.4896551609039307, + "step": 63740 + }, + { + "epoch": 0.06420464102509868, + "grad_norm": 10.379943756188952, + "learning_rate": 4.997518421043662e-05, + "loss": 2.7052, + "mean_token_accuracy": 0.38275861740112305, + "step": 63745 + }, + { + "epoch": 0.06420967707820285, + "grad_norm": 11.9223669739286, + "learning_rate": 4.997516661320063e-05, + "loss": 2.5935, + "mean_token_accuracy": 0.38620689511299133, + "step": 63750 + }, + { + "epoch": 0.06421471313130703, + "grad_norm": 9.966227697130536, + "learning_rate": 4.997514900973106e-05, + "loss": 2.6824, + "mean_token_accuracy": 0.37241379618644715, + "step": 63755 + }, + { + "epoch": 0.0642197491844112, + "grad_norm": 13.030955730356617, + "learning_rate": 4.997513140002794e-05, + "loss": 2.4712, + "mean_token_accuracy": 0.47586206197738645, + "step": 63760 + }, + { + "epoch": 0.06422478523751537, + "grad_norm": 11.495548804624207, + "learning_rate": 4.997511378409125e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.42758620381355283, + "step": 63765 + }, + { + "epoch": 0.06422982129061955, + "grad_norm": 15.074710887003405, + "learning_rate": 4.9975096161921e-05, + "loss": 2.2432, + "mean_token_accuracy": 0.4, + "step": 63770 + }, + { + "epoch": 0.06423485734372372, + "grad_norm": 11.335977363717813, + "learning_rate": 4.99750785335172e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.37931033968925476, + "step": 63775 + }, + { + "epoch": 0.0642398933968279, + "grad_norm": 14.239503286117822, + "learning_rate": 4.9975060898879853e-05, + "loss": 2.9525, + "mean_token_accuracy": 0.38620689511299133, + "step": 63780 + }, + { + "epoch": 0.06424492944993207, + "grad_norm": 11.719927453144036, + "learning_rate": 4.9975043258008966e-05, + "loss": 2.6005, + "mean_token_accuracy": 0.4034482717514038, + "step": 63785 + }, + { + "epoch": 0.06424996550303623, + "grad_norm": 12.309092734026658, + "learning_rate": 4.997502561090454e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.37392619252204895, + "step": 63790 + }, + { + "epoch": 0.0642550015561404, + "grad_norm": 13.876190564420307, + "learning_rate": 4.997500795756658e-05, + "loss": 2.7105, + "mean_token_accuracy": 0.3827586203813553, + "step": 63795 + }, + { + "epoch": 0.06426003760924458, + "grad_norm": 12.36681760661571, + "learning_rate": 4.99749902979951e-05, + "loss": 2.8244, + "mean_token_accuracy": 0.3896551787853241, + "step": 63800 + }, + { + "epoch": 0.06426507366234875, + "grad_norm": 12.480970715421119, + "learning_rate": 4.997497263219009e-05, + "loss": 2.7236, + "mean_token_accuracy": 0.37586206793785093, + "step": 63805 + }, + { + "epoch": 0.06427010971545292, + "grad_norm": 13.317197676434864, + "learning_rate": 4.9974954960151564e-05, + "loss": 2.753, + "mean_token_accuracy": 0.358620685338974, + "step": 63810 + }, + { + "epoch": 0.0642751457685571, + "grad_norm": 12.630853561344956, + "learning_rate": 4.997493728187953e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.4103448212146759, + "step": 63815 + }, + { + "epoch": 0.06428018182166127, + "grad_norm": 12.664991632455454, + "learning_rate": 4.9974919597373984e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.37586206793785093, + "step": 63820 + }, + { + "epoch": 0.06428521787476545, + "grad_norm": 12.81571130296138, + "learning_rate": 4.997490190663493e-05, + "loss": 2.7666, + "mean_token_accuracy": 0.34482758641242983, + "step": 63825 + }, + { + "epoch": 0.06429025392786962, + "grad_norm": 14.564564405837809, + "learning_rate": 4.997488420966239e-05, + "loss": 2.8642, + "mean_token_accuracy": 0.3379310339689255, + "step": 63830 + }, + { + "epoch": 0.0642952899809738, + "grad_norm": 11.95610424880165, + "learning_rate": 4.997486650645635e-05, + "loss": 2.401, + "mean_token_accuracy": 0.43448275327682495, + "step": 63835 + }, + { + "epoch": 0.06430032603407797, + "grad_norm": 11.263058632023299, + "learning_rate": 4.997484879701682e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.4467029690742493, + "step": 63840 + }, + { + "epoch": 0.06430536208718214, + "grad_norm": 11.773886463553232, + "learning_rate": 4.997483108134381e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.43103448748588563, + "step": 63845 + }, + { + "epoch": 0.06431039814028632, + "grad_norm": 14.744681009582063, + "learning_rate": 4.997481335943733e-05, + "loss": 2.6286, + "mean_token_accuracy": 0.38027827739715575, + "step": 63850 + }, + { + "epoch": 0.06431543419339049, + "grad_norm": 10.496723461057591, + "learning_rate": 4.997479563129736e-05, + "loss": 2.2493, + "mean_token_accuracy": 0.5009852170944213, + "step": 63855 + }, + { + "epoch": 0.06432047024649465, + "grad_norm": 11.804996611979675, + "learning_rate": 4.997477789692394e-05, + "loss": 2.6289, + "mean_token_accuracy": 0.36551723480224607, + "step": 63860 + }, + { + "epoch": 0.06432550629959882, + "grad_norm": 14.075878930327566, + "learning_rate": 4.997476015631705e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.4381773293018341, + "step": 63865 + }, + { + "epoch": 0.064330542352703, + "grad_norm": 10.657117037528382, + "learning_rate": 4.9974742409476695e-05, + "loss": 2.5214, + "mean_token_accuracy": 0.37586206793785093, + "step": 63870 + }, + { + "epoch": 0.06433557840580717, + "grad_norm": 22.491929351378598, + "learning_rate": 4.997472465640289e-05, + "loss": 2.9506, + "mean_token_accuracy": 0.37241379022598264, + "step": 63875 + }, + { + "epoch": 0.06434061445891134, + "grad_norm": 17.16744040100882, + "learning_rate": 4.997470689709563e-05, + "loss": 3.0854, + "mean_token_accuracy": 0.33103448152542114, + "step": 63880 + }, + { + "epoch": 0.06434565051201552, + "grad_norm": 12.262910492123531, + "learning_rate": 4.997468913155493e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.4344827592372894, + "step": 63885 + }, + { + "epoch": 0.06435068656511969, + "grad_norm": 10.098699416163296, + "learning_rate": 4.99746713597808e-05, + "loss": 2.3469, + "mean_token_accuracy": 0.4137930989265442, + "step": 63890 + }, + { + "epoch": 0.06435572261822387, + "grad_norm": 13.73743601938842, + "learning_rate": 4.997465358177322e-05, + "loss": 2.2336, + "mean_token_accuracy": 0.4034482777118683, + "step": 63895 + }, + { + "epoch": 0.06436075867132804, + "grad_norm": 11.466730222345506, + "learning_rate": 4.997463579753222e-05, + "loss": 2.5123, + "mean_token_accuracy": 0.3862069010734558, + "step": 63900 + }, + { + "epoch": 0.06436579472443221, + "grad_norm": 11.486469043796623, + "learning_rate": 4.99746180070578e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.42068964838981626, + "step": 63905 + }, + { + "epoch": 0.06437083077753639, + "grad_norm": 13.782417808476762, + "learning_rate": 4.997460021034996e-05, + "loss": 3.013, + "mean_token_accuracy": 0.358620685338974, + "step": 63910 + }, + { + "epoch": 0.06437586683064056, + "grad_norm": 11.167547901522223, + "learning_rate": 4.9974582407408704e-05, + "loss": 2.6748, + "mean_token_accuracy": 0.42894089221954346, + "step": 63915 + }, + { + "epoch": 0.06438090288374473, + "grad_norm": 10.26441809185597, + "learning_rate": 4.997456459823404e-05, + "loss": 2.9932, + "mean_token_accuracy": 0.35517241060733795, + "step": 63920 + }, + { + "epoch": 0.06438593893684891, + "grad_norm": 11.051865409669025, + "learning_rate": 4.997454678282597e-05, + "loss": 2.4311, + "mean_token_accuracy": 0.3965517282485962, + "step": 63925 + }, + { + "epoch": 0.06439097498995307, + "grad_norm": 10.704890159256053, + "learning_rate": 4.99745289611845e-05, + "loss": 2.6075, + "mean_token_accuracy": 0.41034482717514037, + "step": 63930 + }, + { + "epoch": 0.06439601104305724, + "grad_norm": 13.649647681719939, + "learning_rate": 4.9974511133309634e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.4137930989265442, + "step": 63935 + }, + { + "epoch": 0.06440104709616142, + "grad_norm": 11.241760095239865, + "learning_rate": 4.997449329920137e-05, + "loss": 2.6612, + "mean_token_accuracy": 0.38130671381950376, + "step": 63940 + }, + { + "epoch": 0.06440608314926559, + "grad_norm": 22.272813939689495, + "learning_rate": 4.9974475458859735e-05, + "loss": 2.8221, + "mean_token_accuracy": 0.34482758641242983, + "step": 63945 + }, + { + "epoch": 0.06441111920236976, + "grad_norm": 9.712746697748715, + "learning_rate": 4.997445761228472e-05, + "loss": 2.57, + "mean_token_accuracy": 0.4034482717514038, + "step": 63950 + }, + { + "epoch": 0.06441615525547394, + "grad_norm": 11.658141622732634, + "learning_rate": 4.9974439759476324e-05, + "loss": 2.9764, + "mean_token_accuracy": 0.36896551847457887, + "step": 63955 + }, + { + "epoch": 0.06442119130857811, + "grad_norm": 10.526798421273325, + "learning_rate": 4.997442190043456e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.4310344815254211, + "step": 63960 + }, + { + "epoch": 0.06442622736168228, + "grad_norm": 12.159988197425585, + "learning_rate": 4.9974404035159434e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.37931033968925476, + "step": 63965 + }, + { + "epoch": 0.06443126341478646, + "grad_norm": 12.69663521520186, + "learning_rate": 4.997438616365095e-05, + "loss": 3.2534, + "mean_token_accuracy": 0.3805202662944794, + "step": 63970 + }, + { + "epoch": 0.06443629946789063, + "grad_norm": 9.810880034009662, + "learning_rate": 4.9974368285909107e-05, + "loss": 2.5654, + "mean_token_accuracy": 0.4034482777118683, + "step": 63975 + }, + { + "epoch": 0.0644413355209948, + "grad_norm": 10.755751722861335, + "learning_rate": 4.997435040193391e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.4103448212146759, + "step": 63980 + }, + { + "epoch": 0.06444637157409898, + "grad_norm": 10.757302081398992, + "learning_rate": 4.9974332511725375e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.4103448212146759, + "step": 63985 + }, + { + "epoch": 0.06445140762720315, + "grad_norm": 13.433487368100982, + "learning_rate": 4.99743146152835e-05, + "loss": 2.5781, + "mean_token_accuracy": 0.39310344457626345, + "step": 63990 + }, + { + "epoch": 0.06445644368030733, + "grad_norm": 14.246225424916659, + "learning_rate": 4.9974296712608286e-05, + "loss": 2.8529, + "mean_token_accuracy": 0.36551723480224607, + "step": 63995 + }, + { + "epoch": 0.06446147973341149, + "grad_norm": 14.434841834711976, + "learning_rate": 4.9974278803699745e-05, + "loss": 2.7772, + "mean_token_accuracy": 0.39310344457626345, + "step": 64000 + }, + { + "epoch": 0.06446651578651566, + "grad_norm": 13.191140429067788, + "learning_rate": 4.997426088855788e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.4103448212146759, + "step": 64005 + }, + { + "epoch": 0.06447155183961983, + "grad_norm": 12.195452131685462, + "learning_rate": 4.997424296718269e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.40344826579093934, + "step": 64010 + }, + { + "epoch": 0.06447658789272401, + "grad_norm": 11.848964311693365, + "learning_rate": 4.997422503957419e-05, + "loss": 2.6724, + "mean_token_accuracy": 0.38275861740112305, + "step": 64015 + }, + { + "epoch": 0.06448162394582818, + "grad_norm": 8.019860897280749, + "learning_rate": 4.997420710573237e-05, + "loss": 2.126, + "mean_token_accuracy": 0.5246478259563446, + "step": 64020 + }, + { + "epoch": 0.06448665999893236, + "grad_norm": 11.007923007798906, + "learning_rate": 4.997418916565726e-05, + "loss": 2.664, + "mean_token_accuracy": 0.39655172228813174, + "step": 64025 + }, + { + "epoch": 0.06449169605203653, + "grad_norm": 11.074576514163029, + "learning_rate": 4.9974171219348845e-05, + "loss": 2.5075, + "mean_token_accuracy": 0.4344827592372894, + "step": 64030 + }, + { + "epoch": 0.0644967321051407, + "grad_norm": 10.096199345785292, + "learning_rate": 4.997415326680713e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.43448275327682495, + "step": 64035 + }, + { + "epoch": 0.06450176815824488, + "grad_norm": 26.574250836800967, + "learning_rate": 4.9974135308032126e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.4689655125141144, + "step": 64040 + }, + { + "epoch": 0.06450680421134905, + "grad_norm": 10.624223084262825, + "learning_rate": 4.997411734302384e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.41379310488700866, + "step": 64045 + }, + { + "epoch": 0.06451184026445322, + "grad_norm": 12.178128824958737, + "learning_rate": 4.997409937178226e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.4275861978530884, + "step": 64050 + }, + { + "epoch": 0.0645168763175574, + "grad_norm": 13.353332345662182, + "learning_rate": 4.9974081394307425e-05, + "loss": 3.0235, + "mean_token_accuracy": 0.3689655244350433, + "step": 64055 + }, + { + "epoch": 0.06452191237066157, + "grad_norm": 14.606862493042199, + "learning_rate": 4.997406341059931e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.4482758641242981, + "step": 64060 + }, + { + "epoch": 0.06452694842376575, + "grad_norm": 12.499269107363759, + "learning_rate": 4.997404542065793e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.4103448331356049, + "step": 64065 + }, + { + "epoch": 0.0645319844768699, + "grad_norm": 13.151894696237415, + "learning_rate": 4.997402742448329e-05, + "loss": 3.0328, + "mean_token_accuracy": 0.3517241358757019, + "step": 64070 + }, + { + "epoch": 0.06453702052997408, + "grad_norm": 11.175666403720752, + "learning_rate": 4.997400942207539e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.36896551251411436, + "step": 64075 + }, + { + "epoch": 0.06454205658307825, + "grad_norm": 11.921136064374776, + "learning_rate": 4.997399141343425e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.4034482777118683, + "step": 64080 + }, + { + "epoch": 0.06454709263618243, + "grad_norm": 12.235957248222135, + "learning_rate": 4.997397339855985e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4517241299152374, + "step": 64085 + }, + { + "epoch": 0.0645521286892866, + "grad_norm": 10.848690601300751, + "learning_rate": 4.997395537745222e-05, + "loss": 2.8101, + "mean_token_accuracy": 0.3793103456497192, + "step": 64090 + }, + { + "epoch": 0.06455716474239077, + "grad_norm": 12.377799695251019, + "learning_rate": 4.997393735011135e-05, + "loss": 2.439, + "mean_token_accuracy": 0.441379314661026, + "step": 64095 + }, + { + "epoch": 0.06456220079549495, + "grad_norm": 10.836704037683083, + "learning_rate": 4.997391931653726e-05, + "loss": 2.7065, + "mean_token_accuracy": 0.3620689630508423, + "step": 64100 + }, + { + "epoch": 0.06456723684859912, + "grad_norm": 10.302030178970655, + "learning_rate": 4.997390127672994e-05, + "loss": 2.5574, + "mean_token_accuracy": 0.39655172228813174, + "step": 64105 + }, + { + "epoch": 0.0645722729017033, + "grad_norm": 15.68610649148647, + "learning_rate": 4.9973883230689396e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.39310344457626345, + "step": 64110 + }, + { + "epoch": 0.06457730895480747, + "grad_norm": 11.027718873307434, + "learning_rate": 4.9973865178415636e-05, + "loss": 2.4574, + "mean_token_accuracy": 0.4137930989265442, + "step": 64115 + }, + { + "epoch": 0.06458234500791164, + "grad_norm": 13.810414138721175, + "learning_rate": 4.997384711990867e-05, + "loss": 2.2749, + "mean_token_accuracy": 0.4493842363357544, + "step": 64120 + }, + { + "epoch": 0.06458738106101582, + "grad_norm": 11.030816720393899, + "learning_rate": 4.9973829055168494e-05, + "loss": 2.4186, + "mean_token_accuracy": 0.44646098017692565, + "step": 64125 + }, + { + "epoch": 0.06459241711411999, + "grad_norm": 13.172110285785811, + "learning_rate": 4.997381098419512e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.4172413766384125, + "step": 64130 + }, + { + "epoch": 0.06459745316722416, + "grad_norm": 15.170130103537646, + "learning_rate": 4.997379290698855e-05, + "loss": 2.7812, + "mean_token_accuracy": 0.3999999940395355, + "step": 64135 + }, + { + "epoch": 0.06460248922032832, + "grad_norm": 11.776145195231349, + "learning_rate": 4.9973774823548784e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.4448275864124298, + "step": 64140 + }, + { + "epoch": 0.0646075252734325, + "grad_norm": 11.464090806591756, + "learning_rate": 4.997375673387585e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.41034482717514037, + "step": 64145 + }, + { + "epoch": 0.06461256132653667, + "grad_norm": 13.79262832385389, + "learning_rate": 4.997373863796972e-05, + "loss": 2.6135, + "mean_token_accuracy": 0.4379310369491577, + "step": 64150 + }, + { + "epoch": 0.06461759737964085, + "grad_norm": 10.412831287317363, + "learning_rate": 4.9973720535830414e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.41222020983695984, + "step": 64155 + }, + { + "epoch": 0.06462263343274502, + "grad_norm": 11.201051804504871, + "learning_rate": 4.9973702427457946e-05, + "loss": 2.6355, + "mean_token_accuracy": 0.4344827473163605, + "step": 64160 + }, + { + "epoch": 0.06462766948584919, + "grad_norm": 10.752423248207196, + "learning_rate": 4.997368431285231e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.3655172407627106, + "step": 64165 + }, + { + "epoch": 0.06463270553895337, + "grad_norm": 13.283866287842937, + "learning_rate": 4.997366619201351e-05, + "loss": 3.2648, + "mean_token_accuracy": 0.3103448271751404, + "step": 64170 + }, + { + "epoch": 0.06463774159205754, + "grad_norm": 16.365355752037107, + "learning_rate": 4.9973648064941565e-05, + "loss": 3.0095, + "mean_token_accuracy": 0.39310343861579894, + "step": 64175 + }, + { + "epoch": 0.06464277764516171, + "grad_norm": 14.612515784138765, + "learning_rate": 4.997362993163646e-05, + "loss": 3.2296, + "mean_token_accuracy": 0.3379310369491577, + "step": 64180 + }, + { + "epoch": 0.06464781369826589, + "grad_norm": 15.517731967393404, + "learning_rate": 4.997361179209822e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.43793103098869324, + "step": 64185 + }, + { + "epoch": 0.06465284975137006, + "grad_norm": 17.446604047988114, + "learning_rate": 4.997359364632683e-05, + "loss": 2.3404, + "mean_token_accuracy": 0.43103448748588563, + "step": 64190 + }, + { + "epoch": 0.06465788580447424, + "grad_norm": 13.684150084020356, + "learning_rate": 4.9973575494322306e-05, + "loss": 2.9885, + "mean_token_accuracy": 0.3793103456497192, + "step": 64195 + }, + { + "epoch": 0.06466292185757841, + "grad_norm": 11.57862017360859, + "learning_rate": 4.9973557336084657e-05, + "loss": 2.2945, + "mean_token_accuracy": 0.4379310429096222, + "step": 64200 + }, + { + "epoch": 0.06466795791068258, + "grad_norm": 14.760826999245856, + "learning_rate": 4.9973539171613875e-05, + "loss": 2.7184, + "mean_token_accuracy": 0.4034482717514038, + "step": 64205 + }, + { + "epoch": 0.06467299396378674, + "grad_norm": 11.298026787926146, + "learning_rate": 4.997352100090998e-05, + "loss": 2.3735, + "mean_token_accuracy": 0.4482758641242981, + "step": 64210 + }, + { + "epoch": 0.06467803001689092, + "grad_norm": 10.68761647316672, + "learning_rate": 4.997350282397297e-05, + "loss": 2.2346, + "mean_token_accuracy": 0.4724137902259827, + "step": 64215 + }, + { + "epoch": 0.06468306606999509, + "grad_norm": 16.821699326825687, + "learning_rate": 4.997348464080284e-05, + "loss": 2.8541, + "mean_token_accuracy": 0.4018148809671402, + "step": 64220 + }, + { + "epoch": 0.06468810212309926, + "grad_norm": 13.340399189198891, + "learning_rate": 4.997346645139962e-05, + "loss": 2.593, + "mean_token_accuracy": 0.334482753276825, + "step": 64225 + }, + { + "epoch": 0.06469313817620344, + "grad_norm": 13.54095100715977, + "learning_rate": 4.9973448255763295e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.358620685338974, + "step": 64230 + }, + { + "epoch": 0.06469817422930761, + "grad_norm": 9.772481957326963, + "learning_rate": 4.9973430053893874e-05, + "loss": 2.6568, + "mean_token_accuracy": 0.39655172228813174, + "step": 64235 + }, + { + "epoch": 0.06470321028241179, + "grad_norm": 12.339049224399552, + "learning_rate": 4.997341184579137e-05, + "loss": 2.9468, + "mean_token_accuracy": 0.3310344874858856, + "step": 64240 + }, + { + "epoch": 0.06470824633551596, + "grad_norm": 11.451027552109528, + "learning_rate": 4.997339363145577e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.42068964838981626, + "step": 64245 + }, + { + "epoch": 0.06471328238862013, + "grad_norm": 11.974183421025108, + "learning_rate": 4.9973375410887094e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.4034482777118683, + "step": 64250 + }, + { + "epoch": 0.0647183184417243, + "grad_norm": 11.316537646700016, + "learning_rate": 4.9973357184085346e-05, + "loss": 2.907, + "mean_token_accuracy": 0.3620689570903778, + "step": 64255 + }, + { + "epoch": 0.06472335449482848, + "grad_norm": 13.282958298747062, + "learning_rate": 4.9973338951050526e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.4344827592372894, + "step": 64260 + }, + { + "epoch": 0.06472839054793265, + "grad_norm": 9.693364631375667, + "learning_rate": 4.997332071178264e-05, + "loss": 2.5132, + "mean_token_accuracy": 0.44827585816383364, + "step": 64265 + }, + { + "epoch": 0.06473342660103683, + "grad_norm": 9.083962245428125, + "learning_rate": 4.99733024662817e-05, + "loss": 2.4691, + "mean_token_accuracy": 0.39655172228813174, + "step": 64270 + }, + { + "epoch": 0.064738462654141, + "grad_norm": 10.869289258292717, + "learning_rate": 4.99732842145477e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.41379311084747317, + "step": 64275 + }, + { + "epoch": 0.06474349870724516, + "grad_norm": 11.512431001819683, + "learning_rate": 4.9973265956580656e-05, + "loss": 2.1794, + "mean_token_accuracy": 0.46551724076271056, + "step": 64280 + }, + { + "epoch": 0.06474853476034934, + "grad_norm": 12.428739851892022, + "learning_rate": 4.997324769238057e-05, + "loss": 2.15, + "mean_token_accuracy": 0.4379310369491577, + "step": 64285 + }, + { + "epoch": 0.06475357081345351, + "grad_norm": 11.465309843589882, + "learning_rate": 4.997322942194744e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.4655172348022461, + "step": 64290 + }, + { + "epoch": 0.06475860686655768, + "grad_norm": 12.587870477992764, + "learning_rate": 4.9973211145281276e-05, + "loss": 2.9417, + "mean_token_accuracy": 0.358620685338974, + "step": 64295 + }, + { + "epoch": 0.06476364291966186, + "grad_norm": 13.525054158896475, + "learning_rate": 4.997319286238208e-05, + "loss": 2.9354, + "mean_token_accuracy": 0.37241379022598264, + "step": 64300 + }, + { + "epoch": 0.06476867897276603, + "grad_norm": 10.729778690187633, + "learning_rate": 4.997317457324987e-05, + "loss": 2.6319, + "mean_token_accuracy": 0.4194192409515381, + "step": 64305 + }, + { + "epoch": 0.0647737150258702, + "grad_norm": 10.235201793442, + "learning_rate": 4.997315627788463e-05, + "loss": 2.8422, + "mean_token_accuracy": 0.37241379618644715, + "step": 64310 + }, + { + "epoch": 0.06477875107897438, + "grad_norm": 16.11368499215886, + "learning_rate": 4.9973137976286386e-05, + "loss": 2.2916, + "mean_token_accuracy": 0.4257713258266449, + "step": 64315 + }, + { + "epoch": 0.06478378713207855, + "grad_norm": 10.050932784300809, + "learning_rate": 4.997311966845513e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.379310342669487, + "step": 64320 + }, + { + "epoch": 0.06478882318518273, + "grad_norm": 12.382238606793473, + "learning_rate": 4.997310135439087e-05, + "loss": 2.8203, + "mean_token_accuracy": 0.34827586114406583, + "step": 64325 + }, + { + "epoch": 0.0647938592382869, + "grad_norm": 10.46937191893655, + "learning_rate": 4.99730830340936e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.42413792610168455, + "step": 64330 + }, + { + "epoch": 0.06479889529139107, + "grad_norm": 10.954808292286012, + "learning_rate": 4.9973064707563354e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.42068966031074523, + "step": 64335 + }, + { + "epoch": 0.06480393134449525, + "grad_norm": 11.477447268386127, + "learning_rate": 4.9973046374800116e-05, + "loss": 2.8465, + "mean_token_accuracy": 0.36896551251411436, + "step": 64340 + }, + { + "epoch": 0.06480896739759942, + "grad_norm": 12.046964173549153, + "learning_rate": 4.9973028035803894e-05, + "loss": 2.749, + "mean_token_accuracy": 0.39310344457626345, + "step": 64345 + }, + { + "epoch": 0.06481400345070358, + "grad_norm": 14.043353924031237, + "learning_rate": 4.997300969057469e-05, + "loss": 2.675, + "mean_token_accuracy": 0.37241379022598264, + "step": 64350 + }, + { + "epoch": 0.06481903950380775, + "grad_norm": 13.49394776275331, + "learning_rate": 4.997299133911251e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.4379310369491577, + "step": 64355 + }, + { + "epoch": 0.06482407555691193, + "grad_norm": 10.682503590506679, + "learning_rate": 4.9972972981417375e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.3517241358757019, + "step": 64360 + }, + { + "epoch": 0.0648291116100161, + "grad_norm": 13.65735895072366, + "learning_rate": 4.9972954617489267e-05, + "loss": 2.9424, + "mean_token_accuracy": 0.37586207389831544, + "step": 64365 + }, + { + "epoch": 0.06483414766312028, + "grad_norm": 18.995142843425135, + "learning_rate": 4.997293624732821e-05, + "loss": 3.043, + "mean_token_accuracy": 0.3896551728248596, + "step": 64370 + }, + { + "epoch": 0.06483918371622445, + "grad_norm": 12.649926974781994, + "learning_rate": 4.997291787093419e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.3896551728248596, + "step": 64375 + }, + { + "epoch": 0.06484421976932862, + "grad_norm": 10.552278067844838, + "learning_rate": 4.997289948830722e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.324137932062149, + "step": 64380 + }, + { + "epoch": 0.0648492558224328, + "grad_norm": 10.303347633902186, + "learning_rate": 4.997288109944731e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.44295220375061034, + "step": 64385 + }, + { + "epoch": 0.06485429187553697, + "grad_norm": 14.705594623507556, + "learning_rate": 4.997286270435447e-05, + "loss": 2.4055, + "mean_token_accuracy": 0.37586206793785093, + "step": 64390 + }, + { + "epoch": 0.06485932792864114, + "grad_norm": 11.729385059518776, + "learning_rate": 4.997284430302869e-05, + "loss": 2.7863, + "mean_token_accuracy": 0.4103448212146759, + "step": 64395 + }, + { + "epoch": 0.06486436398174532, + "grad_norm": 12.803408239644686, + "learning_rate": 4.997282589546999e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.4241379380226135, + "step": 64400 + }, + { + "epoch": 0.06486940003484949, + "grad_norm": 11.201385453311024, + "learning_rate": 4.997280748167836e-05, + "loss": 2.42, + "mean_token_accuracy": 0.4482758641242981, + "step": 64405 + }, + { + "epoch": 0.06487443608795367, + "grad_norm": 10.986594973708124, + "learning_rate": 4.997278906165382e-05, + "loss": 2.8192, + "mean_token_accuracy": 0.38275861740112305, + "step": 64410 + }, + { + "epoch": 0.06487947214105784, + "grad_norm": 18.701416465142216, + "learning_rate": 4.9972770635396366e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.40889292657375337, + "step": 64415 + }, + { + "epoch": 0.064884508194162, + "grad_norm": 44.61634841175035, + "learning_rate": 4.9972752202906e-05, + "loss": 2.9275, + "mean_token_accuracy": 0.3724137932062149, + "step": 64420 + }, + { + "epoch": 0.06488954424726617, + "grad_norm": 13.105377296466584, + "learning_rate": 4.9972733764182744e-05, + "loss": 2.1343, + "mean_token_accuracy": 0.4344827651977539, + "step": 64425 + }, + { + "epoch": 0.06489458030037035, + "grad_norm": 14.182019272883563, + "learning_rate": 4.997271531922658e-05, + "loss": 2.454, + "mean_token_accuracy": 0.3482758581638336, + "step": 64430 + }, + { + "epoch": 0.06489961635347452, + "grad_norm": 62.06514582154275, + "learning_rate": 4.997269686803753e-05, + "loss": 3.3458, + "mean_token_accuracy": 0.3448275804519653, + "step": 64435 + }, + { + "epoch": 0.0649046524065787, + "grad_norm": 10.233137165283622, + "learning_rate": 4.997267841061559e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.38275861740112305, + "step": 64440 + }, + { + "epoch": 0.06490968845968287, + "grad_norm": 11.549209858309151, + "learning_rate": 4.997265994696077e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.44482758045196535, + "step": 64445 + }, + { + "epoch": 0.06491472451278704, + "grad_norm": 10.074304624642464, + "learning_rate": 4.997264147707308e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.4448275864124298, + "step": 64450 + }, + { + "epoch": 0.06491976056589122, + "grad_norm": 13.79346876029164, + "learning_rate": 4.997262300095252e-05, + "loss": 2.3976, + "mean_token_accuracy": 0.42068964838981626, + "step": 64455 + }, + { + "epoch": 0.06492479661899539, + "grad_norm": 9.903401345435258, + "learning_rate": 4.9972604518599074e-05, + "loss": 2.9169, + "mean_token_accuracy": 0.32068965435028074, + "step": 64460 + }, + { + "epoch": 0.06492983267209956, + "grad_norm": 15.028279366823238, + "learning_rate": 4.997258603001279e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.4379310250282288, + "step": 64465 + }, + { + "epoch": 0.06493486872520374, + "grad_norm": 13.807539647029355, + "learning_rate": 4.997256753519364e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.43448275327682495, + "step": 64470 + }, + { + "epoch": 0.06493990477830791, + "grad_norm": 11.056730389574923, + "learning_rate": 4.997254903414164e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.4221415579319, + "step": 64475 + }, + { + "epoch": 0.06494494083141208, + "grad_norm": 14.311660939163673, + "learning_rate": 4.9972530526856796e-05, + "loss": 2.7011, + "mean_token_accuracy": 0.37350272536277773, + "step": 64480 + }, + { + "epoch": 0.06494997688451626, + "grad_norm": 12.843706402846506, + "learning_rate": 4.997251201333912e-05, + "loss": 2.7279, + "mean_token_accuracy": 0.3586206793785095, + "step": 64485 + }, + { + "epoch": 0.06495501293762042, + "grad_norm": 15.875209786489487, + "learning_rate": 4.997249349358859e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.3724137872457504, + "step": 64490 + }, + { + "epoch": 0.06496004899072459, + "grad_norm": 12.037182070395405, + "learning_rate": 4.997247496760524e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.39999999701976774, + "step": 64495 + }, + { + "epoch": 0.06496508504382877, + "grad_norm": 11.449280414327877, + "learning_rate": 4.997245643538907e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.44482759237289426, + "step": 64500 + }, + { + "epoch": 0.06497012109693294, + "grad_norm": 12.037660169357627, + "learning_rate": 4.9972437896940085e-05, + "loss": 2.9194, + "mean_token_accuracy": 0.37586207389831544, + "step": 64505 + }, + { + "epoch": 0.06497515715003711, + "grad_norm": 10.891206822157644, + "learning_rate": 4.997241935225828e-05, + "loss": 3.0671, + "mean_token_accuracy": 0.35862068831920624, + "step": 64510 + }, + { + "epoch": 0.06498019320314129, + "grad_norm": 11.028936144512006, + "learning_rate": 4.9972400801343654e-05, + "loss": 2.691, + "mean_token_accuracy": 0.4034482717514038, + "step": 64515 + }, + { + "epoch": 0.06498522925624546, + "grad_norm": 10.888050699110789, + "learning_rate": 4.997238224419624e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.4620689630508423, + "step": 64520 + }, + { + "epoch": 0.06499026530934963, + "grad_norm": 11.247672566663509, + "learning_rate": 4.9972363680816014e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.4, + "step": 64525 + }, + { + "epoch": 0.06499530136245381, + "grad_norm": 11.122148665001722, + "learning_rate": 4.9972345111203004e-05, + "loss": 2.5485, + "mean_token_accuracy": 0.4310344815254211, + "step": 64530 + }, + { + "epoch": 0.06500033741555798, + "grad_norm": 11.764745549335244, + "learning_rate": 4.99723265353572e-05, + "loss": 2.5666, + "mean_token_accuracy": 0.3724137932062149, + "step": 64535 + }, + { + "epoch": 0.06500537346866216, + "grad_norm": 12.839580403082458, + "learning_rate": 4.9972307953278616e-05, + "loss": 2.6142, + "mean_token_accuracy": 0.3879612863063812, + "step": 64540 + }, + { + "epoch": 0.06501040952176633, + "grad_norm": 12.899165506140532, + "learning_rate": 4.9972289364967245e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.4034482777118683, + "step": 64545 + }, + { + "epoch": 0.0650154455748705, + "grad_norm": 10.70216559570666, + "learning_rate": 4.9972270770423114e-05, + "loss": 2.1912, + "mean_token_accuracy": 0.43103447556495667, + "step": 64550 + }, + { + "epoch": 0.06502048162797468, + "grad_norm": 10.856771326405397, + "learning_rate": 4.99722521696462e-05, + "loss": 2.6818, + "mean_token_accuracy": 0.39999999701976774, + "step": 64555 + }, + { + "epoch": 0.06502551768107884, + "grad_norm": 11.158353802554075, + "learning_rate": 4.997223356263653e-05, + "loss": 2.7908, + "mean_token_accuracy": 0.3827586233615875, + "step": 64560 + }, + { + "epoch": 0.06503055373418301, + "grad_norm": 17.972207128376052, + "learning_rate": 4.997221494939411e-05, + "loss": 2.8989, + "mean_token_accuracy": 0.3709013879299164, + "step": 64565 + }, + { + "epoch": 0.06503558978728718, + "grad_norm": 14.059351678273316, + "learning_rate": 4.997219632991893e-05, + "loss": 2.9316, + "mean_token_accuracy": 0.36896551251411436, + "step": 64570 + }, + { + "epoch": 0.06504062584039136, + "grad_norm": 12.055380770889936, + "learning_rate": 4.9972177704211e-05, + "loss": 2.7269, + "mean_token_accuracy": 0.36551724970340727, + "step": 64575 + }, + { + "epoch": 0.06504566189349553, + "grad_norm": 14.312793556288577, + "learning_rate": 4.997215907227034e-05, + "loss": 2.4403, + "mean_token_accuracy": 0.41379310488700866, + "step": 64580 + }, + { + "epoch": 0.0650506979465997, + "grad_norm": 12.09510453877924, + "learning_rate": 4.997214043409692e-05, + "loss": 2.5542, + "mean_token_accuracy": 0.4413793087005615, + "step": 64585 + }, + { + "epoch": 0.06505573399970388, + "grad_norm": 12.104201672350323, + "learning_rate": 4.997212178969078e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.4310344815254211, + "step": 64590 + }, + { + "epoch": 0.06506077005280805, + "grad_norm": 10.61952417803201, + "learning_rate": 4.997210313905192e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.4206896543502808, + "step": 64595 + }, + { + "epoch": 0.06506580610591223, + "grad_norm": 11.559627814047222, + "learning_rate": 4.9972084482180334e-05, + "loss": 2.5661, + "mean_token_accuracy": 0.4258923172950745, + "step": 64600 + }, + { + "epoch": 0.0650708421590164, + "grad_norm": 12.723862637075733, + "learning_rate": 4.997206581907603e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.47931033968925474, + "step": 64605 + }, + { + "epoch": 0.06507587821212057, + "grad_norm": 14.42859616275144, + "learning_rate": 4.997204714973902e-05, + "loss": 2.6249, + "mean_token_accuracy": 0.35172414481639863, + "step": 64610 + }, + { + "epoch": 0.06508091426522475, + "grad_norm": 10.472781295105523, + "learning_rate": 4.99720284741693e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4413793087005615, + "step": 64615 + }, + { + "epoch": 0.06508595031832892, + "grad_norm": 11.302363272003406, + "learning_rate": 4.997200979236688e-05, + "loss": 2.6958, + "mean_token_accuracy": 0.3896551728248596, + "step": 64620 + }, + { + "epoch": 0.0650909863714331, + "grad_norm": 11.587146672952281, + "learning_rate": 4.9971991104331756e-05, + "loss": 2.2381, + "mean_token_accuracy": 0.43448275327682495, + "step": 64625 + }, + { + "epoch": 0.06509602242453726, + "grad_norm": 11.823615056675573, + "learning_rate": 4.997197241006395e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.42413792610168455, + "step": 64630 + }, + { + "epoch": 0.06510105847764143, + "grad_norm": 12.373576628380274, + "learning_rate": 4.997195370956346e-05, + "loss": 2.9002, + "mean_token_accuracy": 0.32413792312145234, + "step": 64635 + }, + { + "epoch": 0.0651060945307456, + "grad_norm": 12.005467761941809, + "learning_rate": 4.997193500283029e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.3655172407627106, + "step": 64640 + }, + { + "epoch": 0.06511113058384978, + "grad_norm": 15.477387324848285, + "learning_rate": 4.997191628986443e-05, + "loss": 3.0425, + "mean_token_accuracy": 0.3206896483898163, + "step": 64645 + }, + { + "epoch": 0.06511616663695395, + "grad_norm": 11.672775474684398, + "learning_rate": 4.997189757066592e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.40344828367233276, + "step": 64650 + }, + { + "epoch": 0.06512120269005812, + "grad_norm": 10.23840963476569, + "learning_rate": 4.9971878845234725e-05, + "loss": 1.9522, + "mean_token_accuracy": 0.4965517222881317, + "step": 64655 + }, + { + "epoch": 0.0651262387431623, + "grad_norm": 13.3825730346537, + "learning_rate": 4.997186011357089e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.441379314661026, + "step": 64660 + }, + { + "epoch": 0.06513127479626647, + "grad_norm": 13.764024830346779, + "learning_rate": 4.997184137567439e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.4034482717514038, + "step": 64665 + }, + { + "epoch": 0.06513631084937065, + "grad_norm": 19.518019082262786, + "learning_rate": 4.9971822631545246e-05, + "loss": 2.8121, + "mean_token_accuracy": 0.3551724076271057, + "step": 64670 + }, + { + "epoch": 0.06514134690247482, + "grad_norm": 14.415682710026498, + "learning_rate": 4.9971803881183455e-05, + "loss": 3.0653, + "mean_token_accuracy": 0.3482758581638336, + "step": 64675 + }, + { + "epoch": 0.065146382955579, + "grad_norm": 15.619678696416816, + "learning_rate": 4.9971785124589026e-05, + "loss": 2.5638, + "mean_token_accuracy": 0.38620689511299133, + "step": 64680 + }, + { + "epoch": 0.06515141900868317, + "grad_norm": 9.323511561960446, + "learning_rate": 4.997176636176196e-05, + "loss": 2.2029, + "mean_token_accuracy": 0.4551724135875702, + "step": 64685 + }, + { + "epoch": 0.06515645506178734, + "grad_norm": 10.850596250930028, + "learning_rate": 4.9971747592702275e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.41530550122261045, + "step": 64690 + }, + { + "epoch": 0.06516149111489151, + "grad_norm": 15.080409907641867, + "learning_rate": 4.997172881740996e-05, + "loss": 2.8381, + "mean_token_accuracy": 0.3774954617023468, + "step": 64695 + }, + { + "epoch": 0.06516652716799567, + "grad_norm": 12.034546684761619, + "learning_rate": 4.997171003588503e-05, + "loss": 2.7027, + "mean_token_accuracy": 0.4137930989265442, + "step": 64700 + }, + { + "epoch": 0.06517156322109985, + "grad_norm": 13.431052317163712, + "learning_rate": 4.997169124812749e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.4034482717514038, + "step": 64705 + }, + { + "epoch": 0.06517659927420402, + "grad_norm": 11.802645776078489, + "learning_rate": 4.9971672454137336e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.40344828367233276, + "step": 64710 + }, + { + "epoch": 0.0651816353273082, + "grad_norm": 11.408052577280777, + "learning_rate": 4.9971653653914585e-05, + "loss": 2.6236, + "mean_token_accuracy": 0.42413792610168455, + "step": 64715 + }, + { + "epoch": 0.06518667138041237, + "grad_norm": 11.298046432800817, + "learning_rate": 4.9971634847459236e-05, + "loss": 3.0018, + "mean_token_accuracy": 0.3310344785451889, + "step": 64720 + }, + { + "epoch": 0.06519170743351654, + "grad_norm": 12.73069033944942, + "learning_rate": 4.997161603477129e-05, + "loss": 2.8028, + "mean_token_accuracy": 0.33793102502822875, + "step": 64725 + }, + { + "epoch": 0.06519674348662072, + "grad_norm": 14.931542357445197, + "learning_rate": 4.997159721585076e-05, + "loss": 2.6301, + "mean_token_accuracy": 0.3793103456497192, + "step": 64730 + }, + { + "epoch": 0.06520177953972489, + "grad_norm": 15.815234975068162, + "learning_rate": 4.997157839069766e-05, + "loss": 2.4818, + "mean_token_accuracy": 0.38620689511299133, + "step": 64735 + }, + { + "epoch": 0.06520681559282906, + "grad_norm": 11.480762283574364, + "learning_rate": 4.997155955931197e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.38965516686439516, + "step": 64740 + }, + { + "epoch": 0.06521185164593324, + "grad_norm": 11.356607353407735, + "learning_rate": 4.997154072169371e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.4379310429096222, + "step": 64745 + }, + { + "epoch": 0.06521688769903741, + "grad_norm": 10.353515464197038, + "learning_rate": 4.9971521877842895e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.38275861740112305, + "step": 64750 + }, + { + "epoch": 0.06522192375214159, + "grad_norm": 10.921828868284166, + "learning_rate": 4.997150302775951e-05, + "loss": 2.4878, + "mean_token_accuracy": 0.4191772550344467, + "step": 64755 + }, + { + "epoch": 0.06522695980524576, + "grad_norm": 11.504221921652743, + "learning_rate": 4.997148417144357e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.3793103456497192, + "step": 64760 + }, + { + "epoch": 0.06523199585834993, + "grad_norm": 8.862244972105684, + "learning_rate": 4.997146530889508e-05, + "loss": 1.9771, + "mean_token_accuracy": 0.5093163907527923, + "step": 64765 + }, + { + "epoch": 0.0652370319114541, + "grad_norm": 10.023737836310328, + "learning_rate": 4.997144644011405e-05, + "loss": 2.5176, + "mean_token_accuracy": 0.4068965494632721, + "step": 64770 + }, + { + "epoch": 0.06524206796455827, + "grad_norm": 12.031062247578312, + "learning_rate": 4.997142756510048e-05, + "loss": 2.3024, + "mean_token_accuracy": 0.45172414779663084, + "step": 64775 + }, + { + "epoch": 0.06524710401766244, + "grad_norm": 11.197543566436527, + "learning_rate": 4.997140868385437e-05, + "loss": 2.114, + "mean_token_accuracy": 0.4655172288417816, + "step": 64780 + }, + { + "epoch": 0.06525214007076661, + "grad_norm": 10.872040225264275, + "learning_rate": 4.997138979637574e-05, + "loss": 2.7369, + "mean_token_accuracy": 0.36896551251411436, + "step": 64785 + }, + { + "epoch": 0.06525717612387079, + "grad_norm": 17.21210975966897, + "learning_rate": 4.997137090266457e-05, + "loss": 3.0848, + "mean_token_accuracy": 0.3413793116807938, + "step": 64790 + }, + { + "epoch": 0.06526221217697496, + "grad_norm": 18.5246921478657, + "learning_rate": 4.99713520027209e-05, + "loss": 2.8659, + "mean_token_accuracy": 0.3517241388559341, + "step": 64795 + }, + { + "epoch": 0.06526724823007914, + "grad_norm": 11.264843091788103, + "learning_rate": 4.997133309654471e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.35517241060733795, + "step": 64800 + }, + { + "epoch": 0.06527228428318331, + "grad_norm": 10.178931708263343, + "learning_rate": 4.997131418413601e-05, + "loss": 2.2645, + "mean_token_accuracy": 0.4482758641242981, + "step": 64805 + }, + { + "epoch": 0.06527732033628748, + "grad_norm": 15.86252835577339, + "learning_rate": 4.997129526549481e-05, + "loss": 2.7594, + "mean_token_accuracy": 0.38052027225494384, + "step": 64810 + }, + { + "epoch": 0.06528235638939166, + "grad_norm": 14.624945036599803, + "learning_rate": 4.997127634062111e-05, + "loss": 2.6795, + "mean_token_accuracy": 0.334482753276825, + "step": 64815 + }, + { + "epoch": 0.06528739244249583, + "grad_norm": 10.318307007651585, + "learning_rate": 4.9971257409514913e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.4103448331356049, + "step": 64820 + }, + { + "epoch": 0.0652924284956, + "grad_norm": 9.831507872199152, + "learning_rate": 4.9971238472176244e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.46896551847457885, + "step": 64825 + }, + { + "epoch": 0.06529746454870418, + "grad_norm": 11.424650355093075, + "learning_rate": 4.997121952860508e-05, + "loss": 2.7934, + "mean_token_accuracy": 0.3551724076271057, + "step": 64830 + }, + { + "epoch": 0.06530250060180835, + "grad_norm": 10.191790828262855, + "learning_rate": 4.997120057880144e-05, + "loss": 2.5271, + "mean_token_accuracy": 0.41379310488700866, + "step": 64835 + }, + { + "epoch": 0.06530753665491251, + "grad_norm": 13.391319344101985, + "learning_rate": 4.997118162276533e-05, + "loss": 2.7077, + "mean_token_accuracy": 0.43793103098869324, + "step": 64840 + }, + { + "epoch": 0.06531257270801669, + "grad_norm": 38.5238216736616, + "learning_rate": 4.997116266049676e-05, + "loss": 2.4215, + "mean_token_accuracy": 0.38965517580509185, + "step": 64845 + }, + { + "epoch": 0.06531760876112086, + "grad_norm": 10.117411369956777, + "learning_rate": 4.997114369199573e-05, + "loss": 2.2416, + "mean_token_accuracy": 0.46551724076271056, + "step": 64850 + }, + { + "epoch": 0.06532264481422503, + "grad_norm": 10.528389456067643, + "learning_rate": 4.997112471726223e-05, + "loss": 2.578, + "mean_token_accuracy": 0.4724137902259827, + "step": 64855 + }, + { + "epoch": 0.06532768086732921, + "grad_norm": 11.571369150529442, + "learning_rate": 4.997110573629629e-05, + "loss": 2.7088, + "mean_token_accuracy": 0.4191772609949112, + "step": 64860 + }, + { + "epoch": 0.06533271692043338, + "grad_norm": 11.775724665888097, + "learning_rate": 4.99710867490979e-05, + "loss": 2.7593, + "mean_token_accuracy": 0.3793103456497192, + "step": 64865 + }, + { + "epoch": 0.06533775297353756, + "grad_norm": 10.749331150639613, + "learning_rate": 4.997106775566708e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.43534483313560485, + "step": 64870 + }, + { + "epoch": 0.06534278902664173, + "grad_norm": 13.15106900743184, + "learning_rate": 4.997104875600382e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.42758620381355283, + "step": 64875 + }, + { + "epoch": 0.0653478250797459, + "grad_norm": 14.305468702104733, + "learning_rate": 4.9971029750108125e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.44343616962432864, + "step": 64880 + }, + { + "epoch": 0.06535286113285008, + "grad_norm": 10.558288104573247, + "learning_rate": 4.997101073798002e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.4502722263336182, + "step": 64885 + }, + { + "epoch": 0.06535789718595425, + "grad_norm": 11.781244081846133, + "learning_rate": 4.997099171961948e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.4103448331356049, + "step": 64890 + }, + { + "epoch": 0.06536293323905842, + "grad_norm": 10.389559056637578, + "learning_rate": 4.997097269502654e-05, + "loss": 2.8408, + "mean_token_accuracy": 0.4068965554237366, + "step": 64895 + }, + { + "epoch": 0.0653679692921626, + "grad_norm": 13.37008570775885, + "learning_rate": 4.997095366420119e-05, + "loss": 2.6028, + "mean_token_accuracy": 0.3793103456497192, + "step": 64900 + }, + { + "epoch": 0.06537300534526677, + "grad_norm": 12.484963620782418, + "learning_rate": 4.9970934627143424e-05, + "loss": 2.597, + "mean_token_accuracy": 0.39655172228813174, + "step": 64905 + }, + { + "epoch": 0.06537804139837093, + "grad_norm": 12.145935788478827, + "learning_rate": 4.9970915583853265e-05, + "loss": 2.6924, + "mean_token_accuracy": 0.39310344457626345, + "step": 64910 + }, + { + "epoch": 0.0653830774514751, + "grad_norm": 11.072032557913573, + "learning_rate": 4.9970896534330726e-05, + "loss": 2.5457, + "mean_token_accuracy": 0.3551724165678024, + "step": 64915 + }, + { + "epoch": 0.06538811350457928, + "grad_norm": 10.557541119590413, + "learning_rate": 4.9970877478575794e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.44482758045196535, + "step": 64920 + }, + { + "epoch": 0.06539314955768345, + "grad_norm": 10.044485034200772, + "learning_rate": 4.997085841658848e-05, + "loss": 2.5603, + "mean_token_accuracy": 0.37586206793785093, + "step": 64925 + }, + { + "epoch": 0.06539818561078763, + "grad_norm": 10.303432992622986, + "learning_rate": 4.997083934836879e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.5017543911933899, + "step": 64930 + }, + { + "epoch": 0.0654032216638918, + "grad_norm": 12.56077878271298, + "learning_rate": 4.997082027391673e-05, + "loss": 2.7891, + "mean_token_accuracy": 0.3827586233615875, + "step": 64935 + }, + { + "epoch": 0.06540825771699597, + "grad_norm": 10.12361899449248, + "learning_rate": 4.9970801193232293e-05, + "loss": 2.7714, + "mean_token_accuracy": 0.37241379022598264, + "step": 64940 + }, + { + "epoch": 0.06541329377010015, + "grad_norm": 10.296899232489508, + "learning_rate": 4.9970782106315507e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.4413793087005615, + "step": 64945 + }, + { + "epoch": 0.06541832982320432, + "grad_norm": 12.346419620284966, + "learning_rate": 4.9970763013166366e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.41034482717514037, + "step": 64950 + }, + { + "epoch": 0.0654233658763085, + "grad_norm": 12.84787002862828, + "learning_rate": 4.9970743913784866e-05, + "loss": 2.7812, + "mean_token_accuracy": 0.3551724135875702, + "step": 64955 + }, + { + "epoch": 0.06542840192941267, + "grad_norm": 11.163481770167147, + "learning_rate": 4.9970724808171026e-05, + "loss": 2.533, + "mean_token_accuracy": 0.44029037952423095, + "step": 64960 + }, + { + "epoch": 0.06543343798251684, + "grad_norm": 9.986359020432651, + "learning_rate": 4.9970705696324854e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.3965517282485962, + "step": 64965 + }, + { + "epoch": 0.06543847403562102, + "grad_norm": 14.888717559987086, + "learning_rate": 4.997068657824634e-05, + "loss": 2.7874, + "mean_token_accuracy": 0.3896551728248596, + "step": 64970 + }, + { + "epoch": 0.06544351008872519, + "grad_norm": 11.885474923123517, + "learning_rate": 4.9970667453935496e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.41724138259887694, + "step": 64975 + }, + { + "epoch": 0.06544854614182935, + "grad_norm": 10.508784083928356, + "learning_rate": 4.997064832339234e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.4034482777118683, + "step": 64980 + }, + { + "epoch": 0.06545358219493352, + "grad_norm": 12.24653912273854, + "learning_rate": 4.9970629186616854e-05, + "loss": 2.6336, + "mean_token_accuracy": 0.4344827592372894, + "step": 64985 + }, + { + "epoch": 0.0654586182480377, + "grad_norm": 9.388594060783554, + "learning_rate": 4.997061004360906e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.47586206197738645, + "step": 64990 + }, + { + "epoch": 0.06546365430114187, + "grad_norm": 13.320067058419248, + "learning_rate": 4.9970590894368955e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.36551723778247835, + "step": 64995 + }, + { + "epoch": 0.06546869035424605, + "grad_norm": 13.264288908506451, + "learning_rate": 4.9970571738896553e-05, + "loss": 2.1275, + "mean_token_accuracy": 0.46551724076271056, + "step": 65000 + }, + { + "epoch": 0.06547372640735022, + "grad_norm": 11.209152338039866, + "learning_rate": 4.9970552577191846e-05, + "loss": 2.4391, + "mean_token_accuracy": 0.47931033968925474, + "step": 65005 + }, + { + "epoch": 0.06547876246045439, + "grad_norm": 11.47444929782627, + "learning_rate": 4.997053340925485e-05, + "loss": 2.5459, + "mean_token_accuracy": 0.3896551728248596, + "step": 65010 + }, + { + "epoch": 0.06548379851355857, + "grad_norm": 14.30823744507397, + "learning_rate": 4.9970514235085575e-05, + "loss": 2.8399, + "mean_token_accuracy": 0.36206896901130675, + "step": 65015 + }, + { + "epoch": 0.06548883456666274, + "grad_norm": 10.838760144550397, + "learning_rate": 4.997049505468401e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.36896551251411436, + "step": 65020 + }, + { + "epoch": 0.06549387061976691, + "grad_norm": 10.417844340793048, + "learning_rate": 4.9970475868050176e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.4586207032203674, + "step": 65025 + }, + { + "epoch": 0.06549890667287109, + "grad_norm": 16.602607201217918, + "learning_rate": 4.997045667518407e-05, + "loss": 3.3829, + "mean_token_accuracy": 0.3043557196855545, + "step": 65030 + }, + { + "epoch": 0.06550394272597526, + "grad_norm": 12.241008812288587, + "learning_rate": 4.9970437476085696e-05, + "loss": 2.6415, + "mean_token_accuracy": 0.3931034505367279, + "step": 65035 + }, + { + "epoch": 0.06550897877907944, + "grad_norm": 14.920428789402699, + "learning_rate": 4.9970418270755065e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.4310344815254211, + "step": 65040 + }, + { + "epoch": 0.06551401483218361, + "grad_norm": 12.836177164438281, + "learning_rate": 4.997039905919218e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.37586206793785093, + "step": 65045 + }, + { + "epoch": 0.06551905088528777, + "grad_norm": 10.950852650712818, + "learning_rate": 4.997037984139705e-05, + "loss": 2.1905, + "mean_token_accuracy": 0.4640048384666443, + "step": 65050 + }, + { + "epoch": 0.06552408693839194, + "grad_norm": 11.217662110234622, + "learning_rate": 4.997036061736967e-05, + "loss": 2.696, + "mean_token_accuracy": 0.439443439245224, + "step": 65055 + }, + { + "epoch": 0.06552912299149612, + "grad_norm": 11.032539848109701, + "learning_rate": 4.997034138711005e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.3620689660310745, + "step": 65060 + }, + { + "epoch": 0.06553415904460029, + "grad_norm": 12.149552392652938, + "learning_rate": 4.9970322150618205e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.37931033968925476, + "step": 65065 + }, + { + "epoch": 0.06553919509770446, + "grad_norm": 13.784001695228483, + "learning_rate": 4.9970302907894125e-05, + "loss": 2.5073, + "mean_token_accuracy": 0.4103448212146759, + "step": 65070 + }, + { + "epoch": 0.06554423115080864, + "grad_norm": 13.425647604919858, + "learning_rate": 4.997028365893782e-05, + "loss": 2.5335, + "mean_token_accuracy": 0.42413793206214906, + "step": 65075 + }, + { + "epoch": 0.06554926720391281, + "grad_norm": 12.41897223403293, + "learning_rate": 4.997026440374931e-05, + "loss": 2.835, + "mean_token_accuracy": 0.3413792967796326, + "step": 65080 + }, + { + "epoch": 0.06555430325701699, + "grad_norm": 15.404729861662698, + "learning_rate": 4.997024514232858e-05, + "loss": 2.3594, + "mean_token_accuracy": 0.458620685338974, + "step": 65085 + }, + { + "epoch": 0.06555933931012116, + "grad_norm": 10.711907536267494, + "learning_rate": 4.9970225874675644e-05, + "loss": 2.7029, + "mean_token_accuracy": 0.33793103098869326, + "step": 65090 + }, + { + "epoch": 0.06556437536322533, + "grad_norm": 16.269545982721876, + "learning_rate": 4.997020660079051e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.4034482777118683, + "step": 65095 + }, + { + "epoch": 0.0655694114163295, + "grad_norm": 10.219959538209208, + "learning_rate": 4.997018732067318e-05, + "loss": 2.4329, + "mean_token_accuracy": 0.3896551728248596, + "step": 65100 + }, + { + "epoch": 0.06557444746943368, + "grad_norm": 15.75559532341415, + "learning_rate": 4.9970168034323656e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.42758620977401735, + "step": 65105 + }, + { + "epoch": 0.06557948352253785, + "grad_norm": 11.025504952672485, + "learning_rate": 4.997014874174195e-05, + "loss": 2.2578, + "mean_token_accuracy": 0.44482759237289426, + "step": 65110 + }, + { + "epoch": 0.06558451957564203, + "grad_norm": 22.84973417952192, + "learning_rate": 4.997012944292806e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.3275862127542496, + "step": 65115 + }, + { + "epoch": 0.06558955562874619, + "grad_norm": 9.87374959651685, + "learning_rate": 4.9970110137882e-05, + "loss": 2.2776, + "mean_token_accuracy": 0.44827585816383364, + "step": 65120 + }, + { + "epoch": 0.06559459168185036, + "grad_norm": 13.443132294185327, + "learning_rate": 4.997009082660377e-05, + "loss": 3.1697, + "mean_token_accuracy": 0.34827585220336915, + "step": 65125 + }, + { + "epoch": 0.06559962773495454, + "grad_norm": 12.677891725561125, + "learning_rate": 4.997007150909338e-05, + "loss": 2.9893, + "mean_token_accuracy": 0.35517241060733795, + "step": 65130 + }, + { + "epoch": 0.06560466378805871, + "grad_norm": 19.026056424161254, + "learning_rate": 4.997005218535083e-05, + "loss": 2.7172, + "mean_token_accuracy": 0.41724138557910917, + "step": 65135 + }, + { + "epoch": 0.06560969984116288, + "grad_norm": 9.275741226191961, + "learning_rate": 4.9970032855376125e-05, + "loss": 2.2838, + "mean_token_accuracy": 0.45366001725196836, + "step": 65140 + }, + { + "epoch": 0.06561473589426706, + "grad_norm": 13.323789479767306, + "learning_rate": 4.997001351916928e-05, + "loss": 3.0301, + "mean_token_accuracy": 0.3758620619773865, + "step": 65145 + }, + { + "epoch": 0.06561977194737123, + "grad_norm": 12.065905908725158, + "learning_rate": 4.996999417673028e-05, + "loss": 3.113, + "mean_token_accuracy": 0.3655172407627106, + "step": 65150 + }, + { + "epoch": 0.0656248080004754, + "grad_norm": 10.920392079088483, + "learning_rate": 4.996997482805915e-05, + "loss": 2.475, + "mean_token_accuracy": 0.41379311084747317, + "step": 65155 + }, + { + "epoch": 0.06562984405357958, + "grad_norm": 11.795404252943495, + "learning_rate": 4.996995547315589e-05, + "loss": 2.6588, + "mean_token_accuracy": 0.3655172407627106, + "step": 65160 + }, + { + "epoch": 0.06563488010668375, + "grad_norm": 11.979996194782537, + "learning_rate": 4.996993611202051e-05, + "loss": 2.9733, + "mean_token_accuracy": 0.37241379022598264, + "step": 65165 + }, + { + "epoch": 0.06563991615978793, + "grad_norm": 11.804553602510314, + "learning_rate": 4.9969916744653e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.4137930989265442, + "step": 65170 + }, + { + "epoch": 0.0656449522128921, + "grad_norm": 10.229640365191054, + "learning_rate": 4.9969897371053376e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.43278887271881106, + "step": 65175 + }, + { + "epoch": 0.06564998826599627, + "grad_norm": 10.613681559636753, + "learning_rate": 4.9969877991221646e-05, + "loss": 2.3053, + "mean_token_accuracy": 0.42758620977401735, + "step": 65180 + }, + { + "epoch": 0.06565502431910045, + "grad_norm": 9.993974881358307, + "learning_rate": 4.9969858605157806e-05, + "loss": 2.0233, + "mean_token_accuracy": 0.5034482657909394, + "step": 65185 + }, + { + "epoch": 0.0656600603722046, + "grad_norm": 9.00498623604641, + "learning_rate": 4.996983921286188e-05, + "loss": 2.3311, + "mean_token_accuracy": 0.46551724672317507, + "step": 65190 + }, + { + "epoch": 0.06566509642530878, + "grad_norm": 12.07519456171486, + "learning_rate": 4.996981981433384e-05, + "loss": 2.4227, + "mean_token_accuracy": 0.4103448212146759, + "step": 65195 + }, + { + "epoch": 0.06567013247841295, + "grad_norm": 17.766870765365134, + "learning_rate": 4.996980040957372e-05, + "loss": 2.396, + "mean_token_accuracy": 0.4034482717514038, + "step": 65200 + }, + { + "epoch": 0.06567516853151713, + "grad_norm": 14.790583946375987, + "learning_rate": 4.9969780998581517e-05, + "loss": 2.7338, + "mean_token_accuracy": 0.3379310369491577, + "step": 65205 + }, + { + "epoch": 0.0656802045846213, + "grad_norm": 11.209361809518665, + "learning_rate": 4.996976158135724e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.41379310488700866, + "step": 65210 + }, + { + "epoch": 0.06568524063772548, + "grad_norm": 9.694014627628961, + "learning_rate": 4.996974215790089e-05, + "loss": 2.6238, + "mean_token_accuracy": 0.3896551728248596, + "step": 65215 + }, + { + "epoch": 0.06569027669082965, + "grad_norm": 12.97105282522778, + "learning_rate": 4.996972272821247e-05, + "loss": 2.253, + "mean_token_accuracy": 0.441379314661026, + "step": 65220 + }, + { + "epoch": 0.06569531274393382, + "grad_norm": 12.645206393031733, + "learning_rate": 4.9969703292291993e-05, + "loss": 2.4302, + "mean_token_accuracy": 0.4294615864753723, + "step": 65225 + }, + { + "epoch": 0.065700348797038, + "grad_norm": 10.642080425667514, + "learning_rate": 4.9969683850139455e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.4172413766384125, + "step": 65230 + }, + { + "epoch": 0.06570538485014217, + "grad_norm": 12.48197100135752, + "learning_rate": 4.9969664401754874e-05, + "loss": 2.5984, + "mean_token_accuracy": 0.38275861740112305, + "step": 65235 + }, + { + "epoch": 0.06571042090324634, + "grad_norm": 10.855704386081841, + "learning_rate": 4.996964494713823e-05, + "loss": 2.4823, + "mean_token_accuracy": 0.3999999940395355, + "step": 65240 + }, + { + "epoch": 0.06571545695635052, + "grad_norm": 14.556694010523788, + "learning_rate": 4.996962548628957e-05, + "loss": 2.9918, + "mean_token_accuracy": 0.2965517222881317, + "step": 65245 + }, + { + "epoch": 0.06572049300945469, + "grad_norm": 12.821300500706608, + "learning_rate": 4.9969606019208864e-05, + "loss": 2.9859, + "mean_token_accuracy": 0.3551724076271057, + "step": 65250 + }, + { + "epoch": 0.06572552906255887, + "grad_norm": 12.962174296129136, + "learning_rate": 4.9969586545896124e-05, + "loss": 2.6853, + "mean_token_accuracy": 0.4, + "step": 65255 + }, + { + "epoch": 0.06573056511566303, + "grad_norm": 8.571240969110962, + "learning_rate": 4.9969567066351364e-05, + "loss": 2.2543, + "mean_token_accuracy": 0.4241379380226135, + "step": 65260 + }, + { + "epoch": 0.0657356011687672, + "grad_norm": 12.993517324652368, + "learning_rate": 4.9969547580574595e-05, + "loss": 3.0137, + "mean_token_accuracy": 0.3482758581638336, + "step": 65265 + }, + { + "epoch": 0.06574063722187137, + "grad_norm": 13.705610152611374, + "learning_rate": 4.99695280885658e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.3896551728248596, + "step": 65270 + }, + { + "epoch": 0.06574567327497555, + "grad_norm": 9.862894972909425, + "learning_rate": 4.9969508590325e-05, + "loss": 2.6734, + "mean_token_accuracy": 0.4068965494632721, + "step": 65275 + }, + { + "epoch": 0.06575070932807972, + "grad_norm": 9.802660144385916, + "learning_rate": 4.99694890858522e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.4517241358757019, + "step": 65280 + }, + { + "epoch": 0.0657557453811839, + "grad_norm": 12.977800842000624, + "learning_rate": 4.996946957514741e-05, + "loss": 2.8093, + "mean_token_accuracy": 0.4068965554237366, + "step": 65285 + }, + { + "epoch": 0.06576078143428807, + "grad_norm": 12.468434318598709, + "learning_rate": 4.996945005821062e-05, + "loss": 2.5929, + "mean_token_accuracy": 0.41379310488700866, + "step": 65290 + }, + { + "epoch": 0.06576581748739224, + "grad_norm": 11.783458630643858, + "learning_rate": 4.996943053504185e-05, + "loss": 2.7317, + "mean_token_accuracy": 0.3896551728248596, + "step": 65295 + }, + { + "epoch": 0.06577085354049642, + "grad_norm": 12.366942193288237, + "learning_rate": 4.99694110056411e-05, + "loss": 2.3928, + "mean_token_accuracy": 0.42758620977401735, + "step": 65300 + }, + { + "epoch": 0.06577588959360059, + "grad_norm": 11.922713344887724, + "learning_rate": 4.996939147000837e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.3999999940395355, + "step": 65305 + }, + { + "epoch": 0.06578092564670476, + "grad_norm": 11.681194711248034, + "learning_rate": 4.9969371928143674e-05, + "loss": 2.7589, + "mean_token_accuracy": 0.38965516686439516, + "step": 65310 + }, + { + "epoch": 0.06578596169980894, + "grad_norm": 12.562881920801395, + "learning_rate": 4.996935238004701e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.38275861740112305, + "step": 65315 + }, + { + "epoch": 0.06579099775291311, + "grad_norm": 11.79460403787884, + "learning_rate": 4.996933282571839e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.4586206912994385, + "step": 65320 + }, + { + "epoch": 0.06579603380601728, + "grad_norm": 10.899699593934638, + "learning_rate": 4.9969313265157815e-05, + "loss": 2.6911, + "mean_token_accuracy": 0.37586206793785093, + "step": 65325 + }, + { + "epoch": 0.06580106985912144, + "grad_norm": 15.153869071725529, + "learning_rate": 4.99692936983653e-05, + "loss": 2.722, + "mean_token_accuracy": 0.38620689511299133, + "step": 65330 + }, + { + "epoch": 0.06580610591222562, + "grad_norm": 13.434080964390509, + "learning_rate": 4.996927412534083e-05, + "loss": 2.414, + "mean_token_accuracy": 0.4034482777118683, + "step": 65335 + }, + { + "epoch": 0.06581114196532979, + "grad_norm": 12.710116680022304, + "learning_rate": 4.9969254546084435e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.45033273100852966, + "step": 65340 + }, + { + "epoch": 0.06581617801843397, + "grad_norm": 12.42083563790262, + "learning_rate": 4.99692349605961e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.43260738253593445, + "step": 65345 + }, + { + "epoch": 0.06582121407153814, + "grad_norm": 11.562562996137192, + "learning_rate": 4.9969215368875846e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.42068964838981626, + "step": 65350 + }, + { + "epoch": 0.06582625012464231, + "grad_norm": 12.232805064529998, + "learning_rate": 4.996919577092366e-05, + "loss": 2.8474, + "mean_token_accuracy": 0.3620689630508423, + "step": 65355 + }, + { + "epoch": 0.06583128617774649, + "grad_norm": 14.135918759739035, + "learning_rate": 4.9969176166739564e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.4, + "step": 65360 + }, + { + "epoch": 0.06583632223085066, + "grad_norm": 11.041238749026546, + "learning_rate": 4.9969156556323565e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.41724138259887694, + "step": 65365 + }, + { + "epoch": 0.06584135828395483, + "grad_norm": 10.679499426768874, + "learning_rate": 4.996913693967565e-05, + "loss": 2.4045, + "mean_token_accuracy": 0.44827585816383364, + "step": 65370 + }, + { + "epoch": 0.06584639433705901, + "grad_norm": 12.11767092220549, + "learning_rate": 4.996911731679585e-05, + "loss": 2.6773, + "mean_token_accuracy": 0.39310343861579894, + "step": 65375 + }, + { + "epoch": 0.06585143039016318, + "grad_norm": 11.275364754430525, + "learning_rate": 4.996909768768414e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.36896551251411436, + "step": 65380 + }, + { + "epoch": 0.06585646644326736, + "grad_norm": 12.49622211374794, + "learning_rate": 4.996907805234055e-05, + "loss": 2.6873, + "mean_token_accuracy": 0.3655172407627106, + "step": 65385 + }, + { + "epoch": 0.06586150249637153, + "grad_norm": 10.923790030390217, + "learning_rate": 4.9969058410765085e-05, + "loss": 2.554, + "mean_token_accuracy": 0.39655172228813174, + "step": 65390 + }, + { + "epoch": 0.0658665385494757, + "grad_norm": 23.347232922109853, + "learning_rate": 4.996903876295773e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.4586206912994385, + "step": 65395 + }, + { + "epoch": 0.06587157460257986, + "grad_norm": 10.662415067270413, + "learning_rate": 4.996901910891851e-05, + "loss": 2.4954, + "mean_token_accuracy": 0.4068965494632721, + "step": 65400 + }, + { + "epoch": 0.06587661065568404, + "grad_norm": 11.253295135655188, + "learning_rate": 4.9968999448647426e-05, + "loss": 2.2926, + "mean_token_accuracy": 0.4689655065536499, + "step": 65405 + }, + { + "epoch": 0.06588164670878821, + "grad_norm": 15.634729814723372, + "learning_rate": 4.996897978214448e-05, + "loss": 2.435, + "mean_token_accuracy": 0.4310344815254211, + "step": 65410 + }, + { + "epoch": 0.06588668276189238, + "grad_norm": 11.351158233786778, + "learning_rate": 4.996896010940967e-05, + "loss": 2.2554, + "mean_token_accuracy": 0.43968542814254763, + "step": 65415 + }, + { + "epoch": 0.06589171881499656, + "grad_norm": 12.56871335377824, + "learning_rate": 4.9968940430443026e-05, + "loss": 3.0543, + "mean_token_accuracy": 0.3999999940395355, + "step": 65420 + }, + { + "epoch": 0.06589675486810073, + "grad_norm": 12.460619345925506, + "learning_rate": 4.996892074524452e-05, + "loss": 2.7554, + "mean_token_accuracy": 0.3999999940395355, + "step": 65425 + }, + { + "epoch": 0.0659017909212049, + "grad_norm": 13.48566835728215, + "learning_rate": 4.996890105381418e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.39310344457626345, + "step": 65430 + }, + { + "epoch": 0.06590682697430908, + "grad_norm": 10.569177780371906, + "learning_rate": 4.9968881356152014e-05, + "loss": 3.0132, + "mean_token_accuracy": 0.35172412991523744, + "step": 65435 + }, + { + "epoch": 0.06591186302741325, + "grad_norm": 10.595327235477416, + "learning_rate": 4.9968861652258014e-05, + "loss": 2.3642, + "mean_token_accuracy": 0.4103448212146759, + "step": 65440 + }, + { + "epoch": 0.06591689908051743, + "grad_norm": 14.203039817953197, + "learning_rate": 4.9968841942132196e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.4172413766384125, + "step": 65445 + }, + { + "epoch": 0.0659219351336216, + "grad_norm": 10.823733679587228, + "learning_rate": 4.996882222577456e-05, + "loss": 2.2352, + "mean_token_accuracy": 0.42758620977401735, + "step": 65450 + }, + { + "epoch": 0.06592697118672577, + "grad_norm": 10.998957445125669, + "learning_rate": 4.996880250318511e-05, + "loss": 2.2451, + "mean_token_accuracy": 0.46551724076271056, + "step": 65455 + }, + { + "epoch": 0.06593200723982995, + "grad_norm": 9.91620354828304, + "learning_rate": 4.996878277436385e-05, + "loss": 2.7678, + "mean_token_accuracy": 0.4172413766384125, + "step": 65460 + }, + { + "epoch": 0.06593704329293412, + "grad_norm": 13.26655773789606, + "learning_rate": 4.99687630393108e-05, + "loss": 2.4446, + "mean_token_accuracy": 0.4379310369491577, + "step": 65465 + }, + { + "epoch": 0.06594207934603828, + "grad_norm": 9.875355402548797, + "learning_rate": 4.9968743298025946e-05, + "loss": 2.8613, + "mean_token_accuracy": 0.3551724135875702, + "step": 65470 + }, + { + "epoch": 0.06594711539914246, + "grad_norm": 11.998546982321045, + "learning_rate": 4.99687235505093e-05, + "loss": 2.6614, + "mean_token_accuracy": 0.3827586203813553, + "step": 65475 + }, + { + "epoch": 0.06595215145224663, + "grad_norm": 11.852341240306094, + "learning_rate": 4.996870379676088e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.40689654350280763, + "step": 65480 + }, + { + "epoch": 0.0659571875053508, + "grad_norm": 12.049928221307734, + "learning_rate": 4.996868403678067e-05, + "loss": 2.3508, + "mean_token_accuracy": 0.4517241358757019, + "step": 65485 + }, + { + "epoch": 0.06596222355845498, + "grad_norm": 9.60336556484452, + "learning_rate": 4.996866427056869e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.44670296311378477, + "step": 65490 + }, + { + "epoch": 0.06596725961155915, + "grad_norm": 15.722390574896556, + "learning_rate": 4.996864449812495e-05, + "loss": 2.4369, + "mean_token_accuracy": 0.3896551728248596, + "step": 65495 + }, + { + "epoch": 0.06597229566466332, + "grad_norm": 11.625071153538663, + "learning_rate": 4.996862471944944e-05, + "loss": 2.5663, + "mean_token_accuracy": 0.4103448301553726, + "step": 65500 + }, + { + "epoch": 0.0659773317177675, + "grad_norm": 11.568527243962391, + "learning_rate": 4.9968604934542176e-05, + "loss": 2.6505, + "mean_token_accuracy": 0.3896551728248596, + "step": 65505 + }, + { + "epoch": 0.06598236777087167, + "grad_norm": 11.63023585917754, + "learning_rate": 4.996858514340316e-05, + "loss": 2.8583, + "mean_token_accuracy": 0.3827586233615875, + "step": 65510 + }, + { + "epoch": 0.06598740382397585, + "grad_norm": 11.305279730670007, + "learning_rate": 4.9968565346032406e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.41724138259887694, + "step": 65515 + }, + { + "epoch": 0.06599243987708002, + "grad_norm": 12.009779761898915, + "learning_rate": 4.9968545542429895e-05, + "loss": 2.365, + "mean_token_accuracy": 0.41379310488700866, + "step": 65520 + }, + { + "epoch": 0.0659974759301842, + "grad_norm": 11.518594019424027, + "learning_rate": 4.996852573259566e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.42068964838981626, + "step": 65525 + }, + { + "epoch": 0.06600251198328837, + "grad_norm": 10.441983443094456, + "learning_rate": 4.996850591652969e-05, + "loss": 2.532, + "mean_token_accuracy": 0.3551724135875702, + "step": 65530 + }, + { + "epoch": 0.06600754803639254, + "grad_norm": 7.5885939350469265, + "learning_rate": 4.9968486094232e-05, + "loss": 2.0881, + "mean_token_accuracy": 0.4758620738983154, + "step": 65535 + }, + { + "epoch": 0.0660125840894967, + "grad_norm": 11.380359667745816, + "learning_rate": 4.996846626570259e-05, + "loss": 2.579, + "mean_token_accuracy": 0.42601331472396853, + "step": 65540 + }, + { + "epoch": 0.06601762014260087, + "grad_norm": 10.108514869349683, + "learning_rate": 4.996844643094147e-05, + "loss": 2.7189, + "mean_token_accuracy": 0.4413793087005615, + "step": 65545 + }, + { + "epoch": 0.06602265619570505, + "grad_norm": 10.655178481754222, + "learning_rate": 4.9968426589948634e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.45172414779663084, + "step": 65550 + }, + { + "epoch": 0.06602769224880922, + "grad_norm": 10.923417970271302, + "learning_rate": 4.99684067427241e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.4586206912994385, + "step": 65555 + }, + { + "epoch": 0.0660327283019134, + "grad_norm": 15.948263248874573, + "learning_rate": 4.9968386889267876e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.45517241954803467, + "step": 65560 + }, + { + "epoch": 0.06603776435501757, + "grad_norm": 11.670395371206665, + "learning_rate": 4.996836702957995e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.458620685338974, + "step": 65565 + }, + { + "epoch": 0.06604280040812174, + "grad_norm": 15.271588626473754, + "learning_rate": 4.996834716366035e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.3793103456497192, + "step": 65570 + }, + { + "epoch": 0.06604783646122592, + "grad_norm": 10.170640835993577, + "learning_rate": 4.9968327291509067e-05, + "loss": 2.5474, + "mean_token_accuracy": 0.3655172437429428, + "step": 65575 + }, + { + "epoch": 0.06605287251433009, + "grad_norm": 12.238107989862943, + "learning_rate": 4.996830741312611e-05, + "loss": 2.6937, + "mean_token_accuracy": 0.37586207389831544, + "step": 65580 + }, + { + "epoch": 0.06605790856743426, + "grad_norm": 13.92690058202146, + "learning_rate": 4.9968287528511485e-05, + "loss": 2.8264, + "mean_token_accuracy": 0.37586206793785093, + "step": 65585 + }, + { + "epoch": 0.06606294462053844, + "grad_norm": 11.698076546974281, + "learning_rate": 4.99682676376652e-05, + "loss": 2.786, + "mean_token_accuracy": 0.36551723480224607, + "step": 65590 + }, + { + "epoch": 0.06606798067364261, + "grad_norm": 14.217071781944645, + "learning_rate": 4.996824774058724e-05, + "loss": 3.0245, + "mean_token_accuracy": 0.3241379290819168, + "step": 65595 + }, + { + "epoch": 0.06607301672674679, + "grad_norm": 13.444287330945965, + "learning_rate": 4.9968227837277636e-05, + "loss": 2.7208, + "mean_token_accuracy": 0.39655172228813174, + "step": 65600 + }, + { + "epoch": 0.06607805277985096, + "grad_norm": 11.201321278333081, + "learning_rate": 4.9968207927736396e-05, + "loss": 2.8515, + "mean_token_accuracy": 0.41724138259887694, + "step": 65605 + }, + { + "epoch": 0.06608308883295512, + "grad_norm": 10.66057533373802, + "learning_rate": 4.99681880119635e-05, + "loss": 2.5016, + "mean_token_accuracy": 0.4, + "step": 65610 + }, + { + "epoch": 0.0660881248860593, + "grad_norm": 12.160024654406152, + "learning_rate": 4.996816808995897e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.358620685338974, + "step": 65615 + }, + { + "epoch": 0.06609316093916347, + "grad_norm": 11.441429474045075, + "learning_rate": 4.996814816172282e-05, + "loss": 2.6658, + "mean_token_accuracy": 0.36206896901130675, + "step": 65620 + }, + { + "epoch": 0.06609819699226764, + "grad_norm": 10.888089839160742, + "learning_rate": 4.996812822725504e-05, + "loss": 2.3375, + "mean_token_accuracy": 0.4551724135875702, + "step": 65625 + }, + { + "epoch": 0.06610323304537181, + "grad_norm": 10.75715846620542, + "learning_rate": 4.9968108286555646e-05, + "loss": 2.6154, + "mean_token_accuracy": 0.40344826579093934, + "step": 65630 + }, + { + "epoch": 0.06610826909847599, + "grad_norm": 11.476308576044449, + "learning_rate": 4.996808833962463e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.4379310250282288, + "step": 65635 + }, + { + "epoch": 0.06611330515158016, + "grad_norm": 12.2273121829413, + "learning_rate": 4.996806838646201e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.40344828367233276, + "step": 65640 + }, + { + "epoch": 0.06611834120468434, + "grad_norm": 12.313745232382882, + "learning_rate": 4.996804842706778e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.44700544476509096, + "step": 65645 + }, + { + "epoch": 0.06612337725778851, + "grad_norm": 11.600091502920273, + "learning_rate": 4.996802846144197e-05, + "loss": 2.4729, + "mean_token_accuracy": 0.42068964838981626, + "step": 65650 + }, + { + "epoch": 0.06612841331089268, + "grad_norm": 10.953615196343153, + "learning_rate": 4.996800848958455e-05, + "loss": 2.5514, + "mean_token_accuracy": 0.3793103486299515, + "step": 65655 + }, + { + "epoch": 0.06613344936399686, + "grad_norm": 11.125559930841042, + "learning_rate": 4.996798851149556e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.394313371181488, + "step": 65660 + }, + { + "epoch": 0.06613848541710103, + "grad_norm": 81.94796238135298, + "learning_rate": 4.9967968527174976e-05, + "loss": 2.6867, + "mean_token_accuracy": 0.3931034505367279, + "step": 65665 + }, + { + "epoch": 0.0661435214702052, + "grad_norm": 11.984488202314138, + "learning_rate": 4.996794853662282e-05, + "loss": 2.7475, + "mean_token_accuracy": 0.38620689511299133, + "step": 65670 + }, + { + "epoch": 0.06614855752330938, + "grad_norm": 11.524644538598913, + "learning_rate": 4.99679285398391e-05, + "loss": 2.7002, + "mean_token_accuracy": 0.36206896007061007, + "step": 65675 + }, + { + "epoch": 0.06615359357641354, + "grad_norm": 12.395259635343379, + "learning_rate": 4.996790853682381e-05, + "loss": 3.0995, + "mean_token_accuracy": 0.3103448212146759, + "step": 65680 + }, + { + "epoch": 0.06615862962951771, + "grad_norm": 12.251259793762424, + "learning_rate": 4.9967888527576965e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.44150031208992, + "step": 65685 + }, + { + "epoch": 0.06616366568262189, + "grad_norm": 12.563306247392003, + "learning_rate": 4.996786851209856e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.42413792610168455, + "step": 65690 + }, + { + "epoch": 0.06616870173572606, + "grad_norm": 11.160389155646747, + "learning_rate": 4.9967848490388625e-05, + "loss": 2.9554, + "mean_token_accuracy": 0.36206896901130675, + "step": 65695 + }, + { + "epoch": 0.06617373778883023, + "grad_norm": 10.799258024802246, + "learning_rate": 4.996782846244714e-05, + "loss": 3.0101, + "mean_token_accuracy": 0.3413793116807938, + "step": 65700 + }, + { + "epoch": 0.06617877384193441, + "grad_norm": 11.974176905365884, + "learning_rate": 4.996780842827412e-05, + "loss": 2.9939, + "mean_token_accuracy": 0.36896551847457887, + "step": 65705 + }, + { + "epoch": 0.06618380989503858, + "grad_norm": 11.663029508529444, + "learning_rate": 4.996778838786956e-05, + "loss": 2.8163, + "mean_token_accuracy": 0.3896551728248596, + "step": 65710 + }, + { + "epoch": 0.06618884594814275, + "grad_norm": 10.644703937730162, + "learning_rate": 4.996776834123348e-05, + "loss": 2.6914, + "mean_token_accuracy": 0.37731397449970244, + "step": 65715 + }, + { + "epoch": 0.06619388200124693, + "grad_norm": 11.927257307523423, + "learning_rate": 4.9967748288365886e-05, + "loss": 2.3882, + "mean_token_accuracy": 0.46551724076271056, + "step": 65720 + }, + { + "epoch": 0.0661989180543511, + "grad_norm": 10.737075136749617, + "learning_rate": 4.9967728229266776e-05, + "loss": 3.602, + "mean_token_accuracy": 0.279310342669487, + "step": 65725 + }, + { + "epoch": 0.06620395410745528, + "grad_norm": 10.917126384689633, + "learning_rate": 4.9967708163936154e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.4068965494632721, + "step": 65730 + }, + { + "epoch": 0.06620899016055945, + "grad_norm": 11.006160497014983, + "learning_rate": 4.996768809237404e-05, + "loss": 2.6702, + "mean_token_accuracy": 0.43103448748588563, + "step": 65735 + }, + { + "epoch": 0.06621402621366362, + "grad_norm": 17.033276816009113, + "learning_rate": 4.996766801458042e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.358620685338974, + "step": 65740 + }, + { + "epoch": 0.0662190622667678, + "grad_norm": 14.323147274450582, + "learning_rate": 4.99676479305553e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.358620685338974, + "step": 65745 + }, + { + "epoch": 0.06622409831987196, + "grad_norm": 12.353574921814861, + "learning_rate": 4.996762784029871e-05, + "loss": 2.7006, + "mean_token_accuracy": 0.3620689630508423, + "step": 65750 + }, + { + "epoch": 0.06622913437297613, + "grad_norm": 13.647310163335069, + "learning_rate": 4.996760774381064e-05, + "loss": 2.6207, + "mean_token_accuracy": 0.4, + "step": 65755 + }, + { + "epoch": 0.0662341704260803, + "grad_norm": 10.859201021842837, + "learning_rate": 4.9967587641091086e-05, + "loss": 2.6595, + "mean_token_accuracy": 0.4310344696044922, + "step": 65760 + }, + { + "epoch": 0.06623920647918448, + "grad_norm": 12.52026717261555, + "learning_rate": 4.9967567532140056e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.4068965554237366, + "step": 65765 + }, + { + "epoch": 0.06624424253228865, + "grad_norm": 13.471069302938337, + "learning_rate": 4.996754741695757e-05, + "loss": 2.5951, + "mean_token_accuracy": 0.37241379618644715, + "step": 65770 + }, + { + "epoch": 0.06624927858539283, + "grad_norm": 10.489732797141977, + "learning_rate": 4.996752729554364e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.4172413766384125, + "step": 65775 + }, + { + "epoch": 0.066254314638497, + "grad_norm": 9.12093494395487, + "learning_rate": 4.996750716789824e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.42413792610168455, + "step": 65780 + }, + { + "epoch": 0.06625935069160117, + "grad_norm": 12.375699342483061, + "learning_rate": 4.9967487034021393e-05, + "loss": 2.356, + "mean_token_accuracy": 0.42758620381355283, + "step": 65785 + }, + { + "epoch": 0.06626438674470535, + "grad_norm": 11.694837723731352, + "learning_rate": 4.99674668939131e-05, + "loss": 2.24, + "mean_token_accuracy": 0.4344827473163605, + "step": 65790 + }, + { + "epoch": 0.06626942279780952, + "grad_norm": 12.398774857071942, + "learning_rate": 4.9967446747573395e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.40992135405540464, + "step": 65795 + }, + { + "epoch": 0.0662744588509137, + "grad_norm": 10.890134969853603, + "learning_rate": 4.9967426595002234e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.3965517282485962, + "step": 65800 + }, + { + "epoch": 0.06627949490401787, + "grad_norm": 12.002805868490261, + "learning_rate": 4.996740643619967e-05, + "loss": 2.4098, + "mean_token_accuracy": 0.4517241418361664, + "step": 65805 + }, + { + "epoch": 0.06628453095712204, + "grad_norm": 13.332440332637358, + "learning_rate": 4.996738627116567e-05, + "loss": 2.6436, + "mean_token_accuracy": 0.38965516686439516, + "step": 65810 + }, + { + "epoch": 0.06628956701022622, + "grad_norm": 10.309022170250916, + "learning_rate": 4.996736609990027e-05, + "loss": 2.9232, + "mean_token_accuracy": 0.3862069010734558, + "step": 65815 + }, + { + "epoch": 0.06629460306333038, + "grad_norm": 13.323531585741378, + "learning_rate": 4.996734592240346e-05, + "loss": 2.7114, + "mean_token_accuracy": 0.39310344457626345, + "step": 65820 + }, + { + "epoch": 0.06629963911643455, + "grad_norm": 10.925388578891596, + "learning_rate": 4.996732573867524e-05, + "loss": 2.2188, + "mean_token_accuracy": 0.4586206912994385, + "step": 65825 + }, + { + "epoch": 0.06630467516953872, + "grad_norm": 11.204509515647834, + "learning_rate": 4.9967305548715626e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.38965516686439516, + "step": 65830 + }, + { + "epoch": 0.0663097112226429, + "grad_norm": 10.618147065907145, + "learning_rate": 4.996728535252462e-05, + "loss": 2.6012, + "mean_token_accuracy": 0.42413793206214906, + "step": 65835 + }, + { + "epoch": 0.06631474727574707, + "grad_norm": 11.183392959370348, + "learning_rate": 4.996726515010223e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.42413793206214906, + "step": 65840 + }, + { + "epoch": 0.06631978332885125, + "grad_norm": 13.765901532246167, + "learning_rate": 4.996724494144846e-05, + "loss": 2.5826, + "mean_token_accuracy": 0.4103448212146759, + "step": 65845 + }, + { + "epoch": 0.06632481938195542, + "grad_norm": 11.380528605699602, + "learning_rate": 4.996722472656332e-05, + "loss": 2.1582, + "mean_token_accuracy": 0.4896551728248596, + "step": 65850 + }, + { + "epoch": 0.06632985543505959, + "grad_norm": 14.630692242046269, + "learning_rate": 4.99672045054468e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.4551724135875702, + "step": 65855 + }, + { + "epoch": 0.06633489148816377, + "grad_norm": 10.774751331395201, + "learning_rate": 4.9967184278098936e-05, + "loss": 2.7397, + "mean_token_accuracy": 0.38965516686439516, + "step": 65860 + }, + { + "epoch": 0.06633992754126794, + "grad_norm": 15.087119219723876, + "learning_rate": 4.99671640445197e-05, + "loss": 2.7007, + "mean_token_accuracy": 0.3931034505367279, + "step": 65865 + }, + { + "epoch": 0.06634496359437211, + "grad_norm": 11.557235178390245, + "learning_rate": 4.996714380470912e-05, + "loss": 2.5228, + "mean_token_accuracy": 0.4034482777118683, + "step": 65870 + }, + { + "epoch": 0.06634999964747629, + "grad_norm": 13.472117220627453, + "learning_rate": 4.996712355866718e-05, + "loss": 2.2327, + "mean_token_accuracy": 0.47084088921546935, + "step": 65875 + }, + { + "epoch": 0.06635503570058046, + "grad_norm": 13.387657730666263, + "learning_rate": 4.9967103306393914e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.3896551728248596, + "step": 65880 + }, + { + "epoch": 0.06636007175368464, + "grad_norm": 11.799704346306692, + "learning_rate": 4.996708304788931e-05, + "loss": 2.617, + "mean_token_accuracy": 0.3827586233615875, + "step": 65885 + }, + { + "epoch": 0.0663651078067888, + "grad_norm": 10.943174417674227, + "learning_rate": 4.9967062783153376e-05, + "loss": 2.5643, + "mean_token_accuracy": 0.44137930274009707, + "step": 65890 + }, + { + "epoch": 0.06637014385989297, + "grad_norm": 9.93561905902741, + "learning_rate": 4.996704251218612e-05, + "loss": 2.5709, + "mean_token_accuracy": 0.36206896901130675, + "step": 65895 + }, + { + "epoch": 0.06637517991299714, + "grad_norm": 11.33470499025213, + "learning_rate": 4.996702223498754e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.43103447556495667, + "step": 65900 + }, + { + "epoch": 0.06638021596610132, + "grad_norm": 13.04370995573622, + "learning_rate": 4.9967001951557654e-05, + "loss": 2.6165, + "mean_token_accuracy": 0.3620689630508423, + "step": 65905 + }, + { + "epoch": 0.06638525201920549, + "grad_norm": 18.48936608924017, + "learning_rate": 4.9966981661896454e-05, + "loss": 2.6098, + "mean_token_accuracy": 0.4517241418361664, + "step": 65910 + }, + { + "epoch": 0.06639028807230966, + "grad_norm": 14.3094350939331, + "learning_rate": 4.996696136600396e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.4034482777118683, + "step": 65915 + }, + { + "epoch": 0.06639532412541384, + "grad_norm": 12.793646615269228, + "learning_rate": 4.9966941063880165e-05, + "loss": 2.2073, + "mean_token_accuracy": 0.4551724076271057, + "step": 65920 + }, + { + "epoch": 0.06640036017851801, + "grad_norm": 13.487501468303014, + "learning_rate": 4.996692075552508e-05, + "loss": 2.5007, + "mean_token_accuracy": 0.42758620381355283, + "step": 65925 + }, + { + "epoch": 0.06640539623162219, + "grad_norm": 11.656551067846463, + "learning_rate": 4.996690044093871e-05, + "loss": 2.6905, + "mean_token_accuracy": 0.36206896901130675, + "step": 65930 + }, + { + "epoch": 0.06641043228472636, + "grad_norm": 11.114755804889356, + "learning_rate": 4.9966880120121066e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.37931033968925476, + "step": 65935 + }, + { + "epoch": 0.06641546833783053, + "grad_norm": 9.980562101917032, + "learning_rate": 4.996685979307214e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.4309113323688507, + "step": 65940 + }, + { + "epoch": 0.0664205043909347, + "grad_norm": 10.90274938301233, + "learning_rate": 4.996683945979196e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.4413793087005615, + "step": 65945 + }, + { + "epoch": 0.06642554044403888, + "grad_norm": 10.398303642904503, + "learning_rate": 4.996681912028051e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.4379310250282288, + "step": 65950 + }, + { + "epoch": 0.06643057649714304, + "grad_norm": 10.7211467870642, + "learning_rate": 4.9966798774537815e-05, + "loss": 2.1404, + "mean_token_accuracy": 0.4310344815254211, + "step": 65955 + }, + { + "epoch": 0.06643561255024721, + "grad_norm": 13.147318041123757, + "learning_rate": 4.996677842256386e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.403448274731636, + "step": 65960 + }, + { + "epoch": 0.06644064860335139, + "grad_norm": 13.727520547328762, + "learning_rate": 4.996675806435866e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.4363581418991089, + "step": 65965 + }, + { + "epoch": 0.06644568465645556, + "grad_norm": 9.955144829553648, + "learning_rate": 4.9966737699922215e-05, + "loss": 2.4317, + "mean_token_accuracy": 0.4137930989265442, + "step": 65970 + }, + { + "epoch": 0.06645072070955974, + "grad_norm": 12.658262905199003, + "learning_rate": 4.9966717329254545e-05, + "loss": 2.3449, + "mean_token_accuracy": 0.42413793206214906, + "step": 65975 + }, + { + "epoch": 0.06645575676266391, + "grad_norm": 11.025214466267453, + "learning_rate": 4.9966696952355646e-05, + "loss": 2.8289, + "mean_token_accuracy": 0.3655172437429428, + "step": 65980 + }, + { + "epoch": 0.06646079281576808, + "grad_norm": 14.432668410039296, + "learning_rate": 4.9966676569225526e-05, + "loss": 2.545, + "mean_token_accuracy": 0.4344827592372894, + "step": 65985 + }, + { + "epoch": 0.06646582886887226, + "grad_norm": 10.853109190891788, + "learning_rate": 4.9966656179864184e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.4, + "step": 65990 + }, + { + "epoch": 0.06647086492197643, + "grad_norm": 12.032013677507848, + "learning_rate": 4.9966635784271634e-05, + "loss": 2.9064, + "mean_token_accuracy": 0.3793103456497192, + "step": 65995 + }, + { + "epoch": 0.0664759009750806, + "grad_norm": 12.127793305778987, + "learning_rate": 4.996661538244788e-05, + "loss": 3.0199, + "mean_token_accuracy": 0.32758620381355286, + "step": 66000 + }, + { + "epoch": 0.06648093702818478, + "grad_norm": 11.304633570665953, + "learning_rate": 4.996659497439293e-05, + "loss": 2.3188, + "mean_token_accuracy": 0.45033273100852966, + "step": 66005 + }, + { + "epoch": 0.06648597308128895, + "grad_norm": 12.688480046994256, + "learning_rate": 4.996657456010678e-05, + "loss": 2.5242, + "mean_token_accuracy": 0.4137930989265442, + "step": 66010 + }, + { + "epoch": 0.06649100913439313, + "grad_norm": 13.583451222265722, + "learning_rate": 4.996655413958944e-05, + "loss": 2.609, + "mean_token_accuracy": 0.3931034505367279, + "step": 66015 + }, + { + "epoch": 0.0664960451874973, + "grad_norm": 10.364487326804996, + "learning_rate": 4.9966533712840926e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.4344827592372894, + "step": 66020 + }, + { + "epoch": 0.06650108124060146, + "grad_norm": 13.992668736757036, + "learning_rate": 4.996651327986123e-05, + "loss": 2.7204, + "mean_token_accuracy": 0.37241379022598264, + "step": 66025 + }, + { + "epoch": 0.06650611729370563, + "grad_norm": 11.473504692004495, + "learning_rate": 4.996649284065036e-05, + "loss": 2.934, + "mean_token_accuracy": 0.3586206793785095, + "step": 66030 + }, + { + "epoch": 0.0665111533468098, + "grad_norm": 9.878126237242672, + "learning_rate": 4.996647239520833e-05, + "loss": 2.1597, + "mean_token_accuracy": 0.48275862336158754, + "step": 66035 + }, + { + "epoch": 0.06651618939991398, + "grad_norm": 10.292787453798343, + "learning_rate": 4.9966451943535136e-05, + "loss": 2.9654, + "mean_token_accuracy": 0.34827586114406583, + "step": 66040 + }, + { + "epoch": 0.06652122545301815, + "grad_norm": 9.83307573248238, + "learning_rate": 4.996643148563079e-05, + "loss": 2.3712, + "mean_token_accuracy": 0.44827585816383364, + "step": 66045 + }, + { + "epoch": 0.06652626150612233, + "grad_norm": 14.316953108209084, + "learning_rate": 4.99664110214953e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.382758629322052, + "step": 66050 + }, + { + "epoch": 0.0665312975592265, + "grad_norm": 13.42775053662216, + "learning_rate": 4.996639055112866e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.39310344457626345, + "step": 66055 + }, + { + "epoch": 0.06653633361233068, + "grad_norm": 10.224744925443344, + "learning_rate": 4.996637007453088e-05, + "loss": 2.3002, + "mean_token_accuracy": 0.42758620381355283, + "step": 66060 + }, + { + "epoch": 0.06654136966543485, + "grad_norm": 9.919176088618395, + "learning_rate": 4.996634959170198e-05, + "loss": 2.4556, + "mean_token_accuracy": 0.4068965554237366, + "step": 66065 + }, + { + "epoch": 0.06654640571853902, + "grad_norm": 10.939818057880961, + "learning_rate": 4.996632910264194e-05, + "loss": 2.3508, + "mean_token_accuracy": 0.42413793206214906, + "step": 66070 + }, + { + "epoch": 0.0665514417716432, + "grad_norm": 10.177822217083444, + "learning_rate": 4.9966308607350784e-05, + "loss": 2.3622, + "mean_token_accuracy": 0.4448275864124298, + "step": 66075 + }, + { + "epoch": 0.06655647782474737, + "grad_norm": 12.486180782006812, + "learning_rate": 4.996628810582852e-05, + "loss": 2.4, + "mean_token_accuracy": 0.42220205068588257, + "step": 66080 + }, + { + "epoch": 0.06656151387785154, + "grad_norm": 11.43461385909063, + "learning_rate": 4.9966267598075144e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.4206896543502808, + "step": 66085 + }, + { + "epoch": 0.06656654993095572, + "grad_norm": 18.720697106714304, + "learning_rate": 4.9966247084090664e-05, + "loss": 2.7222, + "mean_token_accuracy": 0.42068964838981626, + "step": 66090 + }, + { + "epoch": 0.06657158598405988, + "grad_norm": 13.76627800060079, + "learning_rate": 4.9966226563875085e-05, + "loss": 2.788, + "mean_token_accuracy": 0.3827586233615875, + "step": 66095 + }, + { + "epoch": 0.06657662203716405, + "grad_norm": 10.800804864620961, + "learning_rate": 4.996620603742842e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.3931034505367279, + "step": 66100 + }, + { + "epoch": 0.06658165809026823, + "grad_norm": 10.338340693017049, + "learning_rate": 4.996618550475066e-05, + "loss": 2.2443, + "mean_token_accuracy": 0.4379310369491577, + "step": 66105 + }, + { + "epoch": 0.0665866941433724, + "grad_norm": 10.496507351196147, + "learning_rate": 4.9966164965841825e-05, + "loss": 2.3545, + "mean_token_accuracy": 0.4533013582229614, + "step": 66110 + }, + { + "epoch": 0.06659173019647657, + "grad_norm": 11.211391242286485, + "learning_rate": 4.9966144420701915e-05, + "loss": 2.6675, + "mean_token_accuracy": 0.4068965554237366, + "step": 66115 + }, + { + "epoch": 0.06659676624958075, + "grad_norm": 10.654729424699223, + "learning_rate": 4.996612386933094e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.41034482717514037, + "step": 66120 + }, + { + "epoch": 0.06660180230268492, + "grad_norm": 12.127180584455239, + "learning_rate": 4.99661033117289e-05, + "loss": 3.0584, + "mean_token_accuracy": 0.3310344755649567, + "step": 66125 + }, + { + "epoch": 0.0666068383557891, + "grad_norm": 13.424407290936784, + "learning_rate": 4.996608274789579e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.3793103456497192, + "step": 66130 + }, + { + "epoch": 0.06661187440889327, + "grad_norm": 10.42645562022817, + "learning_rate": 4.996606217783164e-05, + "loss": 2.8638, + "mean_token_accuracy": 0.41724138259887694, + "step": 66135 + }, + { + "epoch": 0.06661691046199744, + "grad_norm": 10.311070855724283, + "learning_rate": 4.996604160153645e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.4551724076271057, + "step": 66140 + }, + { + "epoch": 0.06662194651510162, + "grad_norm": 11.742212835403546, + "learning_rate": 4.996602101901021e-05, + "loss": 2.6908, + "mean_token_accuracy": 0.4379310429096222, + "step": 66145 + }, + { + "epoch": 0.06662698256820579, + "grad_norm": 10.567838622035188, + "learning_rate": 4.9966000430252936e-05, + "loss": 2.1252, + "mean_token_accuracy": 0.49655172824859617, + "step": 66150 + }, + { + "epoch": 0.06663201862130996, + "grad_norm": 10.782181528480487, + "learning_rate": 4.996597983526463e-05, + "loss": 2.4665, + "mean_token_accuracy": 0.3862069010734558, + "step": 66155 + }, + { + "epoch": 0.06663705467441414, + "grad_norm": 12.511544159229862, + "learning_rate": 4.9965959234045304e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.3931034505367279, + "step": 66160 + }, + { + "epoch": 0.0666420907275183, + "grad_norm": 17.61815705317741, + "learning_rate": 4.996593862659497e-05, + "loss": 3.5867, + "mean_token_accuracy": 0.28965516984462736, + "step": 66165 + }, + { + "epoch": 0.06664712678062247, + "grad_norm": 11.248825848364577, + "learning_rate": 4.996591801291361e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.38965516686439516, + "step": 66170 + }, + { + "epoch": 0.06665216283372664, + "grad_norm": 11.311981066189935, + "learning_rate": 4.996589739300125e-05, + "loss": 2.4769, + "mean_token_accuracy": 0.3841500341892242, + "step": 66175 + }, + { + "epoch": 0.06665719888683082, + "grad_norm": 11.221232385959333, + "learning_rate": 4.996587676685789e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.4379310369491577, + "step": 66180 + }, + { + "epoch": 0.06666223493993499, + "grad_norm": 11.94160116416949, + "learning_rate": 4.996585613448353e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.43448275327682495, + "step": 66185 + }, + { + "epoch": 0.06666727099303917, + "grad_norm": 10.786340701858007, + "learning_rate": 4.9965835495878184e-05, + "loss": 2.4601, + "mean_token_accuracy": 0.41034482717514037, + "step": 66190 + }, + { + "epoch": 0.06667230704614334, + "grad_norm": 11.053909988506982, + "learning_rate": 4.996581485104185e-05, + "loss": 2.5627, + "mean_token_accuracy": 0.42758620977401735, + "step": 66195 + }, + { + "epoch": 0.06667734309924751, + "grad_norm": 10.981913019405885, + "learning_rate": 4.996579419997455e-05, + "loss": 2.602, + "mean_token_accuracy": 0.42068964838981626, + "step": 66200 + }, + { + "epoch": 0.06668237915235169, + "grad_norm": 12.293638032973194, + "learning_rate": 4.996577354267628e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.4068965554237366, + "step": 66205 + }, + { + "epoch": 0.06668741520545586, + "grad_norm": 13.557173867472162, + "learning_rate": 4.9965752879147026e-05, + "loss": 2.7926, + "mean_token_accuracy": 0.37241379022598264, + "step": 66210 + }, + { + "epoch": 0.06669245125856003, + "grad_norm": 11.377244170566494, + "learning_rate": 4.9965732209386825e-05, + "loss": 2.6472, + "mean_token_accuracy": 0.39310344457626345, + "step": 66215 + }, + { + "epoch": 0.06669748731166421, + "grad_norm": 11.347822761402815, + "learning_rate": 4.9965711533395666e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.3862068891525269, + "step": 66220 + }, + { + "epoch": 0.06670252336476838, + "grad_norm": 11.890712740848503, + "learning_rate": 4.9965690851173564e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.4, + "step": 66225 + }, + { + "epoch": 0.06670755941787256, + "grad_norm": 16.643008433513096, + "learning_rate": 4.996567016272051e-05, + "loss": 2.4979, + "mean_token_accuracy": 0.34827586114406583, + "step": 66230 + }, + { + "epoch": 0.06671259547097672, + "grad_norm": 10.197193658359037, + "learning_rate": 4.996564946803652e-05, + "loss": 2.1559, + "mean_token_accuracy": 0.4482758641242981, + "step": 66235 + }, + { + "epoch": 0.06671763152408089, + "grad_norm": 12.737640907976207, + "learning_rate": 4.99656287671216e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.45517241954803467, + "step": 66240 + }, + { + "epoch": 0.06672266757718506, + "grad_norm": 12.8756444598531, + "learning_rate": 4.996560805997576e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.46896551847457885, + "step": 66245 + }, + { + "epoch": 0.06672770363028924, + "grad_norm": 15.651209179632009, + "learning_rate": 4.996558734659899e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4068965554237366, + "step": 66250 + }, + { + "epoch": 0.06673273968339341, + "grad_norm": 13.677928631192895, + "learning_rate": 4.9965566626991306e-05, + "loss": 2.8615, + "mean_token_accuracy": 0.404839688539505, + "step": 66255 + }, + { + "epoch": 0.06673777573649758, + "grad_norm": 9.645667872175167, + "learning_rate": 4.996554590115272e-05, + "loss": 2.436, + "mean_token_accuracy": 0.4121597111225128, + "step": 66260 + }, + { + "epoch": 0.06674281178960176, + "grad_norm": 9.708286297232284, + "learning_rate": 4.996552516908322e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.47352216243743894, + "step": 66265 + }, + { + "epoch": 0.06674784784270593, + "grad_norm": 10.765000699298964, + "learning_rate": 4.9965504430782836e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.4379310369491577, + "step": 66270 + }, + { + "epoch": 0.0667528838958101, + "grad_norm": 13.248726752845258, + "learning_rate": 4.9965483686251553e-05, + "loss": 2.9254, + "mean_token_accuracy": 0.37931033968925476, + "step": 66275 + }, + { + "epoch": 0.06675791994891428, + "grad_norm": 12.099536990807216, + "learning_rate": 4.996546293548939e-05, + "loss": 2.6196, + "mean_token_accuracy": 0.38620689511299133, + "step": 66280 + }, + { + "epoch": 0.06676295600201845, + "grad_norm": 11.33384583693756, + "learning_rate": 4.996544217849634e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.39310344457626345, + "step": 66285 + }, + { + "epoch": 0.06676799205512263, + "grad_norm": 10.929501978455228, + "learning_rate": 4.996542141527242e-05, + "loss": 2.0183, + "mean_token_accuracy": 0.46206897497177124, + "step": 66290 + }, + { + "epoch": 0.0667730281082268, + "grad_norm": 10.74936858062237, + "learning_rate": 4.996540064581763e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.4586206912994385, + "step": 66295 + }, + { + "epoch": 0.06677806416133097, + "grad_norm": 11.388001539917504, + "learning_rate": 4.996537987013198e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.41554749608039854, + "step": 66300 + }, + { + "epoch": 0.06678310021443513, + "grad_norm": 13.040340411719935, + "learning_rate": 4.996535908821546e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.38965516686439516, + "step": 66305 + }, + { + "epoch": 0.06678813626753931, + "grad_norm": 10.07712603774365, + "learning_rate": 4.996533830006811e-05, + "loss": 2.6262, + "mean_token_accuracy": 0.43793103098869324, + "step": 66310 + }, + { + "epoch": 0.06679317232064348, + "grad_norm": 14.504661389094485, + "learning_rate": 4.99653175056899e-05, + "loss": 2.3005, + "mean_token_accuracy": 0.43103448748588563, + "step": 66315 + }, + { + "epoch": 0.06679820837374766, + "grad_norm": 12.810916479184415, + "learning_rate": 4.9965296705080854e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.39310344457626345, + "step": 66320 + }, + { + "epoch": 0.06680324442685183, + "grad_norm": 12.430741989260607, + "learning_rate": 4.9965275898240974e-05, + "loss": 2.271, + "mean_token_accuracy": 0.4517241358757019, + "step": 66325 + }, + { + "epoch": 0.066808280479956, + "grad_norm": 10.913284868433577, + "learning_rate": 4.996525508517027e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.43793103098869324, + "step": 66330 + }, + { + "epoch": 0.06681331653306018, + "grad_norm": 10.560775268068141, + "learning_rate": 4.996523426586874e-05, + "loss": 2.7409, + "mean_token_accuracy": 0.34482758939266206, + "step": 66335 + }, + { + "epoch": 0.06681835258616435, + "grad_norm": 11.953360432909303, + "learning_rate": 4.99652134403364e-05, + "loss": 2.4478, + "mean_token_accuracy": 0.39655172228813174, + "step": 66340 + }, + { + "epoch": 0.06682338863926852, + "grad_norm": 11.00197151173896, + "learning_rate": 4.996519260857324e-05, + "loss": 2.0902, + "mean_token_accuracy": 0.45904416441917417, + "step": 66345 + }, + { + "epoch": 0.0668284246923727, + "grad_norm": 12.781750319426335, + "learning_rate": 4.996517177057928e-05, + "loss": 2.6293, + "mean_token_accuracy": 0.36551723480224607, + "step": 66350 + }, + { + "epoch": 0.06683346074547687, + "grad_norm": 14.00670925132052, + "learning_rate": 4.996515092635452e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.42758620381355283, + "step": 66355 + }, + { + "epoch": 0.06683849679858105, + "grad_norm": 12.518401487653934, + "learning_rate": 4.9965130075898966e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.38620689511299133, + "step": 66360 + }, + { + "epoch": 0.06684353285168522, + "grad_norm": 11.078366028732237, + "learning_rate": 4.996510921921262e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.4655172348022461, + "step": 66365 + }, + { + "epoch": 0.0668485689047894, + "grad_norm": 12.282512262483078, + "learning_rate": 4.99650883562955e-05, + "loss": 2.4037, + "mean_token_accuracy": 0.4068965494632721, + "step": 66370 + }, + { + "epoch": 0.06685360495789355, + "grad_norm": 12.267136676654756, + "learning_rate": 4.99650674871476e-05, + "loss": 2.8638, + "mean_token_accuracy": 0.34482758343219755, + "step": 66375 + }, + { + "epoch": 0.06685864101099773, + "grad_norm": 12.057096154087924, + "learning_rate": 4.996504661176893e-05, + "loss": 2.6268, + "mean_token_accuracy": 0.43448275327682495, + "step": 66380 + }, + { + "epoch": 0.0668636770641019, + "grad_norm": 21.9234800827101, + "learning_rate": 4.99650257301595e-05, + "loss": 3.0769, + "mean_token_accuracy": 0.36551723778247835, + "step": 66385 + }, + { + "epoch": 0.06686871311720607, + "grad_norm": 12.252119370924136, + "learning_rate": 4.9965004842319295e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.4, + "step": 66390 + }, + { + "epoch": 0.06687374917031025, + "grad_norm": 10.288089197958827, + "learning_rate": 4.996498394824836e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.4241379380226135, + "step": 66395 + }, + { + "epoch": 0.06687878522341442, + "grad_norm": 10.0161835604596, + "learning_rate": 4.996496304794666e-05, + "loss": 2.4996, + "mean_token_accuracy": 0.42068966031074523, + "step": 66400 + }, + { + "epoch": 0.0668838212765186, + "grad_norm": 11.704628900208423, + "learning_rate": 4.9964942141414226e-05, + "loss": 2.6656, + "mean_token_accuracy": 0.3862068891525269, + "step": 66405 + }, + { + "epoch": 0.06688885732962277, + "grad_norm": 12.267536675916393, + "learning_rate": 4.996492122865106e-05, + "loss": 3.003, + "mean_token_accuracy": 0.37241379618644715, + "step": 66410 + }, + { + "epoch": 0.06689389338272694, + "grad_norm": 13.519538905654098, + "learning_rate": 4.996490030965716e-05, + "loss": 2.6373, + "mean_token_accuracy": 0.3793103456497192, + "step": 66415 + }, + { + "epoch": 0.06689892943583112, + "grad_norm": 15.616310525688995, + "learning_rate": 4.996487938443253e-05, + "loss": 2.8172, + "mean_token_accuracy": 0.37241379618644715, + "step": 66420 + }, + { + "epoch": 0.06690396548893529, + "grad_norm": 12.845896272408565, + "learning_rate": 4.996485845297719e-05, + "loss": 3.042, + "mean_token_accuracy": 0.4068965494632721, + "step": 66425 + }, + { + "epoch": 0.06690900154203946, + "grad_norm": 14.504624316100706, + "learning_rate": 4.996483751529113e-05, + "loss": 3.115, + "mean_token_accuracy": 0.2931034415960312, + "step": 66430 + }, + { + "epoch": 0.06691403759514364, + "grad_norm": 15.229301324578802, + "learning_rate": 4.996481657137437e-05, + "loss": 2.5941, + "mean_token_accuracy": 0.42413792610168455, + "step": 66435 + }, + { + "epoch": 0.06691907364824781, + "grad_norm": 13.10067009100101, + "learning_rate": 4.996479562122692e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.42413792610168455, + "step": 66440 + }, + { + "epoch": 0.06692410970135197, + "grad_norm": 11.70337096649754, + "learning_rate": 4.996477466484876e-05, + "loss": 2.4948, + "mean_token_accuracy": 0.4748422861099243, + "step": 66445 + }, + { + "epoch": 0.06692914575445615, + "grad_norm": 12.453526372118437, + "learning_rate": 4.996475370223991e-05, + "loss": 2.7174, + "mean_token_accuracy": 0.37931033968925476, + "step": 66450 + }, + { + "epoch": 0.06693418180756032, + "grad_norm": 11.38317314700908, + "learning_rate": 4.996473273340039e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.42758620977401735, + "step": 66455 + }, + { + "epoch": 0.0669392178606645, + "grad_norm": 10.416058309859338, + "learning_rate": 4.9964711758330176e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.4172413766384125, + "step": 66460 + }, + { + "epoch": 0.06694425391376867, + "grad_norm": 14.971705524172318, + "learning_rate": 4.9964690777029305e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.41379310488700866, + "step": 66465 + }, + { + "epoch": 0.06694928996687284, + "grad_norm": 11.108388651801024, + "learning_rate": 4.996466978949776e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.47586206197738645, + "step": 66470 + }, + { + "epoch": 0.06695432601997701, + "grad_norm": 10.237561659161601, + "learning_rate": 4.996464879573555e-05, + "loss": 2.7172, + "mean_token_accuracy": 0.4137930989265442, + "step": 66475 + }, + { + "epoch": 0.06695936207308119, + "grad_norm": 17.021811522446626, + "learning_rate": 4.9964627795742697e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.4189352750778198, + "step": 66480 + }, + { + "epoch": 0.06696439812618536, + "grad_norm": 9.494988672117511, + "learning_rate": 4.996460678951919e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.4379310369491577, + "step": 66485 + }, + { + "epoch": 0.06696943417928954, + "grad_norm": 10.190426798938667, + "learning_rate": 4.996458577706504e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.44482758045196535, + "step": 66490 + }, + { + "epoch": 0.06697447023239371, + "grad_norm": 10.32396335679019, + "learning_rate": 4.996456475838026e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.4068965494632721, + "step": 66495 + }, + { + "epoch": 0.06697950628549788, + "grad_norm": 12.246758404889505, + "learning_rate": 4.996454373346484e-05, + "loss": 2.6409, + "mean_token_accuracy": 0.4517241418361664, + "step": 66500 + }, + { + "epoch": 0.06698454233860206, + "grad_norm": 12.211494968754495, + "learning_rate": 4.9964522702318804e-05, + "loss": 2.4704, + "mean_token_accuracy": 0.39310343861579894, + "step": 66505 + }, + { + "epoch": 0.06698957839170623, + "grad_norm": 10.81631141158981, + "learning_rate": 4.996450166494214e-05, + "loss": 2.3867, + "mean_token_accuracy": 0.4379310369491577, + "step": 66510 + }, + { + "epoch": 0.06699461444481039, + "grad_norm": 12.97370473461869, + "learning_rate": 4.996448062133487e-05, + "loss": 2.7514, + "mean_token_accuracy": 0.3931034505367279, + "step": 66515 + }, + { + "epoch": 0.06699965049791456, + "grad_norm": 13.109195365246542, + "learning_rate": 4.996445957149699e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.4034482717514038, + "step": 66520 + }, + { + "epoch": 0.06700468655101874, + "grad_norm": 14.052975182372732, + "learning_rate": 4.996443851542851e-05, + "loss": 2.6518, + "mean_token_accuracy": 0.4344827592372894, + "step": 66525 + }, + { + "epoch": 0.06700972260412291, + "grad_norm": 10.71664872937317, + "learning_rate": 4.996441745312943e-05, + "loss": 2.8489, + "mean_token_accuracy": 0.3931034475564957, + "step": 66530 + }, + { + "epoch": 0.06701475865722709, + "grad_norm": 11.155514931099361, + "learning_rate": 4.996439638459976e-05, + "loss": 2.3655, + "mean_token_accuracy": 0.4, + "step": 66535 + }, + { + "epoch": 0.06701979471033126, + "grad_norm": 12.101380216637022, + "learning_rate": 4.9964375309839506e-05, + "loss": 2.5079, + "mean_token_accuracy": 0.4344827592372894, + "step": 66540 + }, + { + "epoch": 0.06702483076343543, + "grad_norm": 10.833667075683776, + "learning_rate": 4.996435422884868e-05, + "loss": 2.9275, + "mean_token_accuracy": 0.37931033968925476, + "step": 66545 + }, + { + "epoch": 0.06702986681653961, + "grad_norm": 11.787569472731738, + "learning_rate": 4.996433314162727e-05, + "loss": 2.245, + "mean_token_accuracy": 0.4551724076271057, + "step": 66550 + }, + { + "epoch": 0.06703490286964378, + "grad_norm": 9.736350720648918, + "learning_rate": 4.99643120481753e-05, + "loss": 2.5392, + "mean_token_accuracy": 0.39310344457626345, + "step": 66555 + }, + { + "epoch": 0.06703993892274795, + "grad_norm": 9.950342161125144, + "learning_rate": 4.9964290948492774e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.4, + "step": 66560 + }, + { + "epoch": 0.06704497497585213, + "grad_norm": 9.860318320968823, + "learning_rate": 4.996426984257969e-05, + "loss": 2.4557, + "mean_token_accuracy": 0.4034482717514038, + "step": 66565 + }, + { + "epoch": 0.0670500110289563, + "grad_norm": 9.906724235485221, + "learning_rate": 4.996424873043605e-05, + "loss": 2.453, + "mean_token_accuracy": 0.43793103098869324, + "step": 66570 + }, + { + "epoch": 0.06705504708206048, + "grad_norm": 11.00668052403052, + "learning_rate": 4.996422761206187e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.3827586114406586, + "step": 66575 + }, + { + "epoch": 0.06706008313516465, + "grad_norm": 11.180226648486332, + "learning_rate": 4.996420648745716e-05, + "loss": 2.7871, + "mean_token_accuracy": 0.41246217489242554, + "step": 66580 + }, + { + "epoch": 0.06706511918826881, + "grad_norm": 10.75446951369793, + "learning_rate": 4.9964185356621915e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.36896551549434664, + "step": 66585 + }, + { + "epoch": 0.06707015524137298, + "grad_norm": 11.80038901779949, + "learning_rate": 4.9964164219556144e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.3862069010734558, + "step": 66590 + }, + { + "epoch": 0.06707519129447716, + "grad_norm": 10.243490797371122, + "learning_rate": 4.9964143076259856e-05, + "loss": 2.5812, + "mean_token_accuracy": 0.4068965554237366, + "step": 66595 + }, + { + "epoch": 0.06708022734758133, + "grad_norm": 12.626895299069702, + "learning_rate": 4.996412192673305e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.39854809641838074, + "step": 66600 + }, + { + "epoch": 0.0670852634006855, + "grad_norm": 15.847229053232997, + "learning_rate": 4.996410077097573e-05, + "loss": 2.5768, + "mean_token_accuracy": 0.41465516686439513, + "step": 66605 + }, + { + "epoch": 0.06709029945378968, + "grad_norm": 12.822353886604263, + "learning_rate": 4.996407960898792e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.44482758045196535, + "step": 66610 + }, + { + "epoch": 0.06709533550689385, + "grad_norm": 12.513690605522882, + "learning_rate": 4.996405844076961e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.4310344934463501, + "step": 66615 + }, + { + "epoch": 0.06710037155999803, + "grad_norm": 9.548230026074894, + "learning_rate": 4.9964037266320805e-05, + "loss": 2.3701, + "mean_token_accuracy": 0.41724138259887694, + "step": 66620 + }, + { + "epoch": 0.0671054076131022, + "grad_norm": 14.078674194656493, + "learning_rate": 4.996401608564151e-05, + "loss": 2.989, + "mean_token_accuracy": 0.4310344815254211, + "step": 66625 + }, + { + "epoch": 0.06711044366620637, + "grad_norm": 10.81586433670949, + "learning_rate": 4.996399489873175e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.48275861144065857, + "step": 66630 + }, + { + "epoch": 0.06711547971931055, + "grad_norm": 10.175057358263796, + "learning_rate": 4.996397370559151e-05, + "loss": 2.6213, + "mean_token_accuracy": 0.39999999701976774, + "step": 66635 + }, + { + "epoch": 0.06712051577241472, + "grad_norm": 11.448417905865412, + "learning_rate": 4.9963952506220805e-05, + "loss": 2.4597, + "mean_token_accuracy": 0.42758620977401735, + "step": 66640 + }, + { + "epoch": 0.0671255518255189, + "grad_norm": 11.49240490608862, + "learning_rate": 4.9963931300619646e-05, + "loss": 2.723, + "mean_token_accuracy": 0.4103448212146759, + "step": 66645 + }, + { + "epoch": 0.06713058787862307, + "grad_norm": 10.53276629750285, + "learning_rate": 4.996391008878802e-05, + "loss": 2.7349, + "mean_token_accuracy": 0.37586206793785093, + "step": 66650 + }, + { + "epoch": 0.06713562393172723, + "grad_norm": 11.263428559437706, + "learning_rate": 4.996388887072595e-05, + "loss": 2.9117, + "mean_token_accuracy": 0.36551724672317504, + "step": 66655 + }, + { + "epoch": 0.0671406599848314, + "grad_norm": 12.21368948762846, + "learning_rate": 4.9963867646433435e-05, + "loss": 2.6873, + "mean_token_accuracy": 0.3689655214548111, + "step": 66660 + }, + { + "epoch": 0.06714569603793558, + "grad_norm": 13.145141395210068, + "learning_rate": 4.996384641591048e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.45741077065467833, + "step": 66665 + }, + { + "epoch": 0.06715073209103975, + "grad_norm": 11.066067423473065, + "learning_rate": 4.99638251791571e-05, + "loss": 2.0474, + "mean_token_accuracy": 0.493103438615799, + "step": 66670 + }, + { + "epoch": 0.06715576814414392, + "grad_norm": 11.58968755524119, + "learning_rate": 4.996380393617329e-05, + "loss": 2.8409, + "mean_token_accuracy": 0.33448275923728943, + "step": 66675 + }, + { + "epoch": 0.0671608041972481, + "grad_norm": 12.538509447157185, + "learning_rate": 4.9963782686959063e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.403448274731636, + "step": 66680 + }, + { + "epoch": 0.06716584025035227, + "grad_norm": 10.789055215632521, + "learning_rate": 4.996376143151442e-05, + "loss": 2.2391, + "mean_token_accuracy": 0.4448275864124298, + "step": 66685 + }, + { + "epoch": 0.06717087630345644, + "grad_norm": 12.251237474054077, + "learning_rate": 4.996374016983937e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.4068965554237366, + "step": 66690 + }, + { + "epoch": 0.06717591235656062, + "grad_norm": 12.392354397685262, + "learning_rate": 4.9963718901933916e-05, + "loss": 2.408, + "mean_token_accuracy": 0.4206896543502808, + "step": 66695 + }, + { + "epoch": 0.06718094840966479, + "grad_norm": 11.359841077316215, + "learning_rate": 4.996369762779806e-05, + "loss": 2.8094, + "mean_token_accuracy": 0.37241379618644715, + "step": 66700 + }, + { + "epoch": 0.06718598446276897, + "grad_norm": 11.907759345750748, + "learning_rate": 4.9963676347431826e-05, + "loss": 2.8483, + "mean_token_accuracy": 0.3517241358757019, + "step": 66705 + }, + { + "epoch": 0.06719102051587314, + "grad_norm": 13.269003915590288, + "learning_rate": 4.996365506083521e-05, + "loss": 2.5665, + "mean_token_accuracy": 0.34137930870056155, + "step": 66710 + }, + { + "epoch": 0.06719605656897731, + "grad_norm": 10.588641798089172, + "learning_rate": 4.99636337680082e-05, + "loss": 2.5978, + "mean_token_accuracy": 0.41724138259887694, + "step": 66715 + }, + { + "epoch": 0.06720109262208149, + "grad_norm": 13.209726674173757, + "learning_rate": 4.996361246895083e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.47586206793785096, + "step": 66720 + }, + { + "epoch": 0.06720612867518565, + "grad_norm": 10.45269197069057, + "learning_rate": 4.996359116366308e-05, + "loss": 3.0883, + "mean_token_accuracy": 0.38620689511299133, + "step": 66725 + }, + { + "epoch": 0.06721116472828982, + "grad_norm": 15.104066406467888, + "learning_rate": 4.996356985214498e-05, + "loss": 2.9087, + "mean_token_accuracy": 0.358620685338974, + "step": 66730 + }, + { + "epoch": 0.067216200781394, + "grad_norm": 10.568726535010004, + "learning_rate": 4.9963548534396524e-05, + "loss": 2.296, + "mean_token_accuracy": 0.42068966031074523, + "step": 66735 + }, + { + "epoch": 0.06722123683449817, + "grad_norm": 15.407399788282733, + "learning_rate": 4.996352721041771e-05, + "loss": 2.4913, + "mean_token_accuracy": 0.36551724672317504, + "step": 66740 + }, + { + "epoch": 0.06722627288760234, + "grad_norm": 11.563819310911443, + "learning_rate": 4.996350588020856e-05, + "loss": 3.1188, + "mean_token_accuracy": 0.37586207389831544, + "step": 66745 + }, + { + "epoch": 0.06723130894070652, + "grad_norm": 8.280932933580422, + "learning_rate": 4.996348454376908e-05, + "loss": 2.0372, + "mean_token_accuracy": 0.5119177162647247, + "step": 66750 + }, + { + "epoch": 0.06723634499381069, + "grad_norm": 10.056772689633416, + "learning_rate": 4.996346320109926e-05, + "loss": 2.6026, + "mean_token_accuracy": 0.3448275804519653, + "step": 66755 + }, + { + "epoch": 0.06724138104691486, + "grad_norm": 17.317440650067866, + "learning_rate": 4.996344185219912e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.49122806191444396, + "step": 66760 + }, + { + "epoch": 0.06724641710001904, + "grad_norm": 14.310388164368542, + "learning_rate": 4.996342049706865e-05, + "loss": 2.6977, + "mean_token_accuracy": 0.4, + "step": 66765 + }, + { + "epoch": 0.06725145315312321, + "grad_norm": 11.475248519525076, + "learning_rate": 4.996339913570788e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.324137932062149, + "step": 66770 + }, + { + "epoch": 0.06725648920622739, + "grad_norm": 11.154919777060767, + "learning_rate": 4.99633777681168e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.35172412991523744, + "step": 66775 + }, + { + "epoch": 0.06726152525933156, + "grad_norm": 10.471266830729128, + "learning_rate": 4.9963356394295405e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.4034482777118683, + "step": 66780 + }, + { + "epoch": 0.06726656131243573, + "grad_norm": 11.769411481946213, + "learning_rate": 4.9963335014243725e-05, + "loss": 2.8421, + "mean_token_accuracy": 0.36896551847457887, + "step": 66785 + }, + { + "epoch": 0.0672715973655399, + "grad_norm": 14.786294494172749, + "learning_rate": 4.996331362796176e-05, + "loss": 3.183, + "mean_token_accuracy": 0.37241379022598264, + "step": 66790 + }, + { + "epoch": 0.06727663341864407, + "grad_norm": 12.393007473662973, + "learning_rate": 4.9963292235449506e-05, + "loss": 2.6695, + "mean_token_accuracy": 0.41379311084747317, + "step": 66795 + }, + { + "epoch": 0.06728166947174824, + "grad_norm": 9.854176477470542, + "learning_rate": 4.996327083670697e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.42068966031074523, + "step": 66800 + }, + { + "epoch": 0.06728670552485241, + "grad_norm": 12.416715328291085, + "learning_rate": 4.996324943173417e-05, + "loss": 2.2217, + "mean_token_accuracy": 0.4862068951129913, + "step": 66805 + }, + { + "epoch": 0.06729174157795659, + "grad_norm": 14.681472221336847, + "learning_rate": 4.9963228020531096e-05, + "loss": 3.0274, + "mean_token_accuracy": 0.3758620619773865, + "step": 66810 + }, + { + "epoch": 0.06729677763106076, + "grad_norm": 18.141202716235053, + "learning_rate": 4.9963206603097765e-05, + "loss": 2.747, + "mean_token_accuracy": 0.4068965494632721, + "step": 66815 + }, + { + "epoch": 0.06730181368416494, + "grad_norm": 12.707231134026733, + "learning_rate": 4.996318517943418e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.3827586233615875, + "step": 66820 + }, + { + "epoch": 0.06730684973726911, + "grad_norm": 10.896778525483665, + "learning_rate": 4.996316374954035e-05, + "loss": 2.6503, + "mean_token_accuracy": 0.37241379618644715, + "step": 66825 + }, + { + "epoch": 0.06731188579037328, + "grad_norm": 10.054716391023703, + "learning_rate": 4.996314231341627e-05, + "loss": 2.6947, + "mean_token_accuracy": 0.37931033968925476, + "step": 66830 + }, + { + "epoch": 0.06731692184347746, + "grad_norm": 12.300484822991066, + "learning_rate": 4.9963120871061955e-05, + "loss": 2.0499, + "mean_token_accuracy": 0.4693349778652191, + "step": 66835 + }, + { + "epoch": 0.06732195789658163, + "grad_norm": 14.69458882845443, + "learning_rate": 4.996309942247742e-05, + "loss": 2.682, + "mean_token_accuracy": 0.417241370677948, + "step": 66840 + }, + { + "epoch": 0.0673269939496858, + "grad_norm": 8.385039260412185, + "learning_rate": 4.9963077967662656e-05, + "loss": 2.6782, + "mean_token_accuracy": 0.423048996925354, + "step": 66845 + }, + { + "epoch": 0.06733203000278998, + "grad_norm": 12.11948566499924, + "learning_rate": 4.9963056506617666e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.42413792610168455, + "step": 66850 + }, + { + "epoch": 0.06733706605589415, + "grad_norm": 9.867621708876474, + "learning_rate": 4.996303503934246e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.46049606800079346, + "step": 66855 + }, + { + "epoch": 0.06734210210899833, + "grad_norm": 11.639781115886299, + "learning_rate": 4.996301356583706e-05, + "loss": 2.3547, + "mean_token_accuracy": 0.48275862336158754, + "step": 66860 + }, + { + "epoch": 0.06734713816210249, + "grad_norm": 10.70618330954457, + "learning_rate": 4.996299208610146e-05, + "loss": 2.8635, + "mean_token_accuracy": 0.39655172228813174, + "step": 66865 + }, + { + "epoch": 0.06735217421520666, + "grad_norm": 14.469362975647158, + "learning_rate": 4.9962970600135664e-05, + "loss": 2.7166, + "mean_token_accuracy": 0.42068966031074523, + "step": 66870 + }, + { + "epoch": 0.06735721026831083, + "grad_norm": 15.67853168768889, + "learning_rate": 4.9962949107939675e-05, + "loss": 2.8365, + "mean_token_accuracy": 0.36896551847457887, + "step": 66875 + }, + { + "epoch": 0.067362246321415, + "grad_norm": 14.190450839355602, + "learning_rate": 4.996292760951351e-05, + "loss": 2.675, + "mean_token_accuracy": 0.37374470829963685, + "step": 66880 + }, + { + "epoch": 0.06736728237451918, + "grad_norm": 12.119532691076238, + "learning_rate": 4.996290610485716e-05, + "loss": 2.2003, + "mean_token_accuracy": 0.441379314661026, + "step": 66885 + }, + { + "epoch": 0.06737231842762335, + "grad_norm": 9.517649264280843, + "learning_rate": 4.996288459397064e-05, + "loss": 2.7633, + "mean_token_accuracy": 0.42413792610168455, + "step": 66890 + }, + { + "epoch": 0.06737735448072753, + "grad_norm": 17.23643415957157, + "learning_rate": 4.9962863076853964e-05, + "loss": 2.7324, + "mean_token_accuracy": 0.37241379618644715, + "step": 66895 + }, + { + "epoch": 0.0673823905338317, + "grad_norm": 10.233719374832946, + "learning_rate": 4.996284155350711e-05, + "loss": 2.0902, + "mean_token_accuracy": 0.4689655125141144, + "step": 66900 + }, + { + "epoch": 0.06738742658693588, + "grad_norm": 16.110485862541438, + "learning_rate": 4.996282002393012e-05, + "loss": 3.0173, + "mean_token_accuracy": 0.3448275804519653, + "step": 66905 + }, + { + "epoch": 0.06739246264004005, + "grad_norm": 11.482682621697949, + "learning_rate": 4.996279848812297e-05, + "loss": 3.3195, + "mean_token_accuracy": 0.2862068980932236, + "step": 66910 + }, + { + "epoch": 0.06739749869314422, + "grad_norm": 11.966290626313109, + "learning_rate": 4.9962776946085695e-05, + "loss": 2.7014, + "mean_token_accuracy": 0.36206896901130675, + "step": 66915 + }, + { + "epoch": 0.0674025347462484, + "grad_norm": 10.83398369102492, + "learning_rate": 4.996275539781827e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.3793103456497192, + "step": 66920 + }, + { + "epoch": 0.06740757079935257, + "grad_norm": 11.373673803082418, + "learning_rate": 4.996273384332073e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.37241379618644715, + "step": 66925 + }, + { + "epoch": 0.06741260685245674, + "grad_norm": 16.64877335301928, + "learning_rate": 4.996271228259306e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.3999999940395355, + "step": 66930 + }, + { + "epoch": 0.0674176429055609, + "grad_norm": 9.868919142182797, + "learning_rate": 4.996269071563527e-05, + "loss": 2.4018, + "mean_token_accuracy": 0.4178571432828903, + "step": 66935 + }, + { + "epoch": 0.06742267895866508, + "grad_norm": 10.737975148707518, + "learning_rate": 4.996266914244738e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.42758620977401735, + "step": 66940 + }, + { + "epoch": 0.06742771501176925, + "grad_norm": 12.712598410094635, + "learning_rate": 4.996264756302938e-05, + "loss": 2.5282, + "mean_token_accuracy": 0.39655171930789945, + "step": 66945 + }, + { + "epoch": 0.06743275106487343, + "grad_norm": 11.47336961800442, + "learning_rate": 4.996262597738127e-05, + "loss": 3.1264, + "mean_token_accuracy": 0.34137930870056155, + "step": 66950 + }, + { + "epoch": 0.0674377871179776, + "grad_norm": 10.806450502524452, + "learning_rate": 4.996260438550308e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.41034482717514037, + "step": 66955 + }, + { + "epoch": 0.06744282317108177, + "grad_norm": 12.390950227154349, + "learning_rate": 4.996258278739479e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.38620689511299133, + "step": 66960 + }, + { + "epoch": 0.06744785922418595, + "grad_norm": 14.657661411706721, + "learning_rate": 4.9962561183056425e-05, + "loss": 2.8026, + "mean_token_accuracy": 0.4206896543502808, + "step": 66965 + }, + { + "epoch": 0.06745289527729012, + "grad_norm": 11.739278033800597, + "learning_rate": 4.9962539572487995e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.4275861978530884, + "step": 66970 + }, + { + "epoch": 0.0674579313303943, + "grad_norm": 10.500762657481578, + "learning_rate": 4.996251795568948e-05, + "loss": 2.367, + "mean_token_accuracy": 0.458620685338974, + "step": 66975 + }, + { + "epoch": 0.06746296738349847, + "grad_norm": 12.998466386042733, + "learning_rate": 4.996249633266091e-05, + "loss": 2.8257, + "mean_token_accuracy": 0.35862069129943847, + "step": 66980 + }, + { + "epoch": 0.06746800343660264, + "grad_norm": 13.00314995925543, + "learning_rate": 4.9962474703402286e-05, + "loss": 2.562, + "mean_token_accuracy": 0.4137930989265442, + "step": 66985 + }, + { + "epoch": 0.06747303948970682, + "grad_norm": 15.29670955926325, + "learning_rate": 4.996245306791361e-05, + "loss": 2.3516, + "mean_token_accuracy": 0.41724138259887694, + "step": 66990 + }, + { + "epoch": 0.06747807554281099, + "grad_norm": 10.441484115550292, + "learning_rate": 4.9962431426194886e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.4068965494632721, + "step": 66995 + }, + { + "epoch": 0.06748311159591516, + "grad_norm": 13.218105124046247, + "learning_rate": 4.996240977824612e-05, + "loss": 2.4046, + "mean_token_accuracy": 0.41379310488700866, + "step": 67000 + }, + { + "epoch": 0.06748814764901932, + "grad_norm": 25.754788445555292, + "learning_rate": 4.9962388124067316e-05, + "loss": 2.8148, + "mean_token_accuracy": 0.3793103456497192, + "step": 67005 + }, + { + "epoch": 0.0674931837021235, + "grad_norm": 11.208079478097277, + "learning_rate": 4.996236646365849e-05, + "loss": 2.3225, + "mean_token_accuracy": 0.4517241358757019, + "step": 67010 + }, + { + "epoch": 0.06749821975522767, + "grad_norm": 11.868801826532009, + "learning_rate": 4.996234479701966e-05, + "loss": 2.6438, + "mean_token_accuracy": 0.3620689630508423, + "step": 67015 + }, + { + "epoch": 0.06750325580833184, + "grad_norm": 15.23628387592478, + "learning_rate": 4.996232312415079e-05, + "loss": 2.8674, + "mean_token_accuracy": 0.36896551251411436, + "step": 67020 + }, + { + "epoch": 0.06750829186143602, + "grad_norm": 10.393696434561733, + "learning_rate": 4.9962301445051924e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.43103448748588563, + "step": 67025 + }, + { + "epoch": 0.06751332791454019, + "grad_norm": 11.294359285267616, + "learning_rate": 4.996227975972305e-05, + "loss": 2.2557, + "mean_token_accuracy": 0.43998790383338926, + "step": 67030 + }, + { + "epoch": 0.06751836396764437, + "grad_norm": 22.42339184567104, + "learning_rate": 4.996225806816418e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.4261947929859161, + "step": 67035 + }, + { + "epoch": 0.06752340002074854, + "grad_norm": 11.993048303486567, + "learning_rate": 4.996223637037532e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.42413792610168455, + "step": 67040 + }, + { + "epoch": 0.06752843607385271, + "grad_norm": 12.485389464593876, + "learning_rate": 4.996221466635647e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.43103447556495667, + "step": 67045 + }, + { + "epoch": 0.06753347212695689, + "grad_norm": 11.795725234915455, + "learning_rate": 4.9962192956107656e-05, + "loss": 2.9844, + "mean_token_accuracy": 0.3551724135875702, + "step": 67050 + }, + { + "epoch": 0.06753850818006106, + "grad_norm": 11.955884163332714, + "learning_rate": 4.9962171239628855e-05, + "loss": 2.7221, + "mean_token_accuracy": 0.36551724970340727, + "step": 67055 + }, + { + "epoch": 0.06754354423316523, + "grad_norm": 10.463737775556934, + "learning_rate": 4.9962149516920086e-05, + "loss": 2.3867, + "mean_token_accuracy": 0.41724138259887694, + "step": 67060 + }, + { + "epoch": 0.06754858028626941, + "grad_norm": 13.144162083535685, + "learning_rate": 4.996212778798137e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.36206896901130675, + "step": 67065 + }, + { + "epoch": 0.06755361633937358, + "grad_norm": 10.245347764621835, + "learning_rate": 4.9962106052812684e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.4294010877609253, + "step": 67070 + }, + { + "epoch": 0.06755865239247774, + "grad_norm": 12.986900879162782, + "learning_rate": 4.996208431141405e-05, + "loss": 2.5525, + "mean_token_accuracy": 0.3724137932062149, + "step": 67075 + }, + { + "epoch": 0.06756368844558192, + "grad_norm": 10.445739522836554, + "learning_rate": 4.9962062563785485e-05, + "loss": 2.6275, + "mean_token_accuracy": 0.36896551549434664, + "step": 67080 + }, + { + "epoch": 0.06756872449868609, + "grad_norm": 10.98501891754874, + "learning_rate": 4.996204080992697e-05, + "loss": 2.567, + "mean_token_accuracy": 0.37241379022598264, + "step": 67085 + }, + { + "epoch": 0.06757376055179026, + "grad_norm": 11.497465995822587, + "learning_rate": 4.996201904983853e-05, + "loss": 2.1825, + "mean_token_accuracy": 0.4482758641242981, + "step": 67090 + }, + { + "epoch": 0.06757879660489444, + "grad_norm": 12.039617411411594, + "learning_rate": 4.9961997283520175e-05, + "loss": 2.5203, + "mean_token_accuracy": 0.3758620619773865, + "step": 67095 + }, + { + "epoch": 0.06758383265799861, + "grad_norm": 10.015699039604872, + "learning_rate": 4.9961975510971885e-05, + "loss": 2.3675, + "mean_token_accuracy": 0.46551724672317507, + "step": 67100 + }, + { + "epoch": 0.06758886871110278, + "grad_norm": 14.855815885476023, + "learning_rate": 4.996195373219369e-05, + "loss": 2.8582, + "mean_token_accuracy": 0.39999999701976774, + "step": 67105 + }, + { + "epoch": 0.06759390476420696, + "grad_norm": 10.466077040514117, + "learning_rate": 4.996193194718559e-05, + "loss": 2.1597, + "mean_token_accuracy": 0.5000000059604645, + "step": 67110 + }, + { + "epoch": 0.06759894081731113, + "grad_norm": 12.736028576252895, + "learning_rate": 4.996191015594758e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.39310344457626345, + "step": 67115 + }, + { + "epoch": 0.0676039768704153, + "grad_norm": 12.21883910705969, + "learning_rate": 4.9961888358479685e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.43103448748588563, + "step": 67120 + }, + { + "epoch": 0.06760901292351948, + "grad_norm": 10.240365041317796, + "learning_rate": 4.99618665547819e-05, + "loss": 2.7566, + "mean_token_accuracy": 0.3655172407627106, + "step": 67125 + }, + { + "epoch": 0.06761404897662365, + "grad_norm": 11.458247817225393, + "learning_rate": 4.996184474485424e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.42413793206214906, + "step": 67130 + }, + { + "epoch": 0.06761908502972783, + "grad_norm": 11.61828536219056, + "learning_rate": 4.996182292869668e-05, + "loss": 2.8068, + "mean_token_accuracy": 0.30344827473163605, + "step": 67135 + }, + { + "epoch": 0.067624121082832, + "grad_norm": 10.157948890184533, + "learning_rate": 4.996180110630927e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.4103448212146759, + "step": 67140 + }, + { + "epoch": 0.06762915713593616, + "grad_norm": 9.89068279978254, + "learning_rate": 4.9961779277691995e-05, + "loss": 2.2129, + "mean_token_accuracy": 0.44827585816383364, + "step": 67145 + }, + { + "epoch": 0.06763419318904033, + "grad_norm": 13.10096930667577, + "learning_rate": 4.996175744284485e-05, + "loss": 2.6437, + "mean_token_accuracy": 0.4068965554237366, + "step": 67150 + }, + { + "epoch": 0.06763922924214451, + "grad_norm": 11.481394154649427, + "learning_rate": 4.9961735601767865e-05, + "loss": 2.8022, + "mean_token_accuracy": 0.42413792610168455, + "step": 67155 + }, + { + "epoch": 0.06764426529524868, + "grad_norm": 10.358445070618533, + "learning_rate": 4.996171375446102e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.43248639106750486, + "step": 67160 + }, + { + "epoch": 0.06764930134835286, + "grad_norm": 10.436242424246117, + "learning_rate": 4.996169190092435e-05, + "loss": 2.8865, + "mean_token_accuracy": 0.4103448212146759, + "step": 67165 + }, + { + "epoch": 0.06765433740145703, + "grad_norm": 14.839277919618446, + "learning_rate": 4.996167004115783e-05, + "loss": 2.7692, + "mean_token_accuracy": 0.33793103098869326, + "step": 67170 + }, + { + "epoch": 0.0676593734545612, + "grad_norm": 13.492378378118119, + "learning_rate": 4.9961648175161505e-05, + "loss": 1.984, + "mean_token_accuracy": 0.483620685338974, + "step": 67175 + }, + { + "epoch": 0.06766440950766538, + "grad_norm": 12.651399260909276, + "learning_rate": 4.996162630293534e-05, + "loss": 2.437, + "mean_token_accuracy": 0.4413793087005615, + "step": 67180 + }, + { + "epoch": 0.06766944556076955, + "grad_norm": 10.568344586148733, + "learning_rate": 4.9961604424479364e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.39310344457626345, + "step": 67185 + }, + { + "epoch": 0.06767448161387372, + "grad_norm": 9.965870755168254, + "learning_rate": 4.996158253979357e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.47241379618644713, + "step": 67190 + }, + { + "epoch": 0.0676795176669779, + "grad_norm": 10.024144281175825, + "learning_rate": 4.996156064887799e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.4413793087005615, + "step": 67195 + }, + { + "epoch": 0.06768455372008207, + "grad_norm": 12.001781084237187, + "learning_rate": 4.9961538751732596e-05, + "loss": 2.7533, + "mean_token_accuracy": 0.3482758581638336, + "step": 67200 + }, + { + "epoch": 0.06768958977318625, + "grad_norm": 11.501462439530142, + "learning_rate": 4.996151684835742e-05, + "loss": 2.7381, + "mean_token_accuracy": 0.3655172407627106, + "step": 67205 + }, + { + "epoch": 0.06769462582629042, + "grad_norm": 12.549317992865358, + "learning_rate": 4.996149493875245e-05, + "loss": 2.6685, + "mean_token_accuracy": 0.4034482777118683, + "step": 67210 + }, + { + "epoch": 0.06769966187939458, + "grad_norm": 9.512701605408347, + "learning_rate": 4.996147302291771e-05, + "loss": 2.2965, + "mean_token_accuracy": 0.44827585816383364, + "step": 67215 + }, + { + "epoch": 0.06770469793249875, + "grad_norm": 13.342850571823028, + "learning_rate": 4.996145110085319e-05, + "loss": 2.3302, + "mean_token_accuracy": 0.40205685794353485, + "step": 67220 + }, + { + "epoch": 0.06770973398560293, + "grad_norm": 12.060401212528726, + "learning_rate": 4.996142917255891e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.36896551251411436, + "step": 67225 + }, + { + "epoch": 0.0677147700387071, + "grad_norm": 16.510702017442465, + "learning_rate": 4.9961407238034864e-05, + "loss": 2.9967, + "mean_token_accuracy": 0.33793103098869326, + "step": 67230 + }, + { + "epoch": 0.06771980609181127, + "grad_norm": 11.549027162827482, + "learning_rate": 4.996138529728106e-05, + "loss": 2.7201, + "mean_token_accuracy": 0.3827586114406586, + "step": 67235 + }, + { + "epoch": 0.06772484214491545, + "grad_norm": 13.61727168939817, + "learning_rate": 4.996136335029751e-05, + "loss": 2.7156, + "mean_token_accuracy": 0.42413793206214906, + "step": 67240 + }, + { + "epoch": 0.06772987819801962, + "grad_norm": 18.617417639402902, + "learning_rate": 4.996134139708422e-05, + "loss": 2.9854, + "mean_token_accuracy": 0.3931034505367279, + "step": 67245 + }, + { + "epoch": 0.0677349142511238, + "grad_norm": 13.68848994459133, + "learning_rate": 4.9961319437641186e-05, + "loss": 2.5168, + "mean_token_accuracy": 0.4068965554237366, + "step": 67250 + }, + { + "epoch": 0.06773995030422797, + "grad_norm": 9.575553070203956, + "learning_rate": 4.996129747196843e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.4344827473163605, + "step": 67255 + }, + { + "epoch": 0.06774498635733214, + "grad_norm": 10.388905567732426, + "learning_rate": 4.9961275500065935e-05, + "loss": 2.5333, + "mean_token_accuracy": 0.41724138259887694, + "step": 67260 + }, + { + "epoch": 0.06775002241043632, + "grad_norm": 11.939926059464282, + "learning_rate": 4.996125352193374e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.4297035664319992, + "step": 67265 + }, + { + "epoch": 0.06775505846354049, + "grad_norm": 15.299713297974325, + "learning_rate": 4.996123153757182e-05, + "loss": 2.5924, + "mean_token_accuracy": 0.42413793206214906, + "step": 67270 + }, + { + "epoch": 0.06776009451664466, + "grad_norm": 11.662170775456454, + "learning_rate": 4.996120954698019e-05, + "loss": 2.7956, + "mean_token_accuracy": 0.382758629322052, + "step": 67275 + }, + { + "epoch": 0.06776513056974884, + "grad_norm": 10.895639590356428, + "learning_rate": 4.996118755015887e-05, + "loss": 2.8838, + "mean_token_accuracy": 0.35862069129943847, + "step": 67280 + }, + { + "epoch": 0.067770166622853, + "grad_norm": 11.85989149287716, + "learning_rate": 4.996116554710786e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.4724137902259827, + "step": 67285 + }, + { + "epoch": 0.06777520267595717, + "grad_norm": 9.950000819257724, + "learning_rate": 4.996114353782715e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.37755595743656156, + "step": 67290 + }, + { + "epoch": 0.06778023872906135, + "grad_norm": 13.544972622163318, + "learning_rate": 4.9961121522316764e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.3896551728248596, + "step": 67295 + }, + { + "epoch": 0.06778527478216552, + "grad_norm": 11.92808480532343, + "learning_rate": 4.99610995005767e-05, + "loss": 2.5163, + "mean_token_accuracy": 0.38620689511299133, + "step": 67300 + }, + { + "epoch": 0.0677903108352697, + "grad_norm": 12.48316811856645, + "learning_rate": 4.996107747260697e-05, + "loss": 2.7119, + "mean_token_accuracy": 0.3965517282485962, + "step": 67305 + }, + { + "epoch": 0.06779534688837387, + "grad_norm": 9.645120743568606, + "learning_rate": 4.996105543840757e-05, + "loss": 2.9179, + "mean_token_accuracy": 0.39655172228813174, + "step": 67310 + }, + { + "epoch": 0.06780038294147804, + "grad_norm": 12.64863669903991, + "learning_rate": 4.996103339797852e-05, + "loss": 2.3396, + "mean_token_accuracy": 0.40139141082763674, + "step": 67315 + }, + { + "epoch": 0.06780541899458221, + "grad_norm": 12.135480346179358, + "learning_rate": 4.996101135131981e-05, + "loss": 2.2305, + "mean_token_accuracy": 0.42068966031074523, + "step": 67320 + }, + { + "epoch": 0.06781045504768639, + "grad_norm": 11.27921527237365, + "learning_rate": 4.996098929843147e-05, + "loss": 2.24, + "mean_token_accuracy": 0.4068965494632721, + "step": 67325 + }, + { + "epoch": 0.06781549110079056, + "grad_norm": 12.57431877673111, + "learning_rate": 4.9960967239313474e-05, + "loss": 2.7581, + "mean_token_accuracy": 0.37241379618644715, + "step": 67330 + }, + { + "epoch": 0.06782052715389474, + "grad_norm": 13.574854446208409, + "learning_rate": 4.996094517396586e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.48275862336158754, + "step": 67335 + }, + { + "epoch": 0.06782556320699891, + "grad_norm": 10.59474355774425, + "learning_rate": 4.99609231023886e-05, + "loss": 2.209, + "mean_token_accuracy": 0.41379310488700866, + "step": 67340 + }, + { + "epoch": 0.06783059926010308, + "grad_norm": 11.367021752139053, + "learning_rate": 4.996090102458174e-05, + "loss": 2.206, + "mean_token_accuracy": 0.441379314661026, + "step": 67345 + }, + { + "epoch": 0.06783563531320726, + "grad_norm": 11.081263619138939, + "learning_rate": 4.996087894054525e-05, + "loss": 2.3788, + "mean_token_accuracy": 0.39836660623550413, + "step": 67350 + }, + { + "epoch": 0.06784067136631142, + "grad_norm": 11.626939441149753, + "learning_rate": 4.9960856850279155e-05, + "loss": 2.7242, + "mean_token_accuracy": 0.3413793116807938, + "step": 67355 + }, + { + "epoch": 0.06784570741941559, + "grad_norm": 10.581893445234016, + "learning_rate": 4.996083475378346e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.37241379022598264, + "step": 67360 + }, + { + "epoch": 0.06785074347251976, + "grad_norm": 11.68098749250743, + "learning_rate": 4.9960812651058164e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.39310343861579894, + "step": 67365 + }, + { + "epoch": 0.06785577952562394, + "grad_norm": 12.12890635528658, + "learning_rate": 4.9960790542103286e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.38965516686439516, + "step": 67370 + }, + { + "epoch": 0.06786081557872811, + "grad_norm": 10.161050837069254, + "learning_rate": 4.996076842691881e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.4275862067937851, + "step": 67375 + }, + { + "epoch": 0.06786585163183229, + "grad_norm": 11.53014823268052, + "learning_rate": 4.996074630550478e-05, + "loss": 2.6079, + "mean_token_accuracy": 0.4310344815254211, + "step": 67380 + }, + { + "epoch": 0.06787088768493646, + "grad_norm": 12.063003575458287, + "learning_rate": 4.9960724177861154e-05, + "loss": 2.563, + "mean_token_accuracy": 0.4, + "step": 67385 + }, + { + "epoch": 0.06787592373804063, + "grad_norm": 11.264524635845152, + "learning_rate": 4.9960702043987976e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.4344827592372894, + "step": 67390 + }, + { + "epoch": 0.06788095979114481, + "grad_norm": 11.444576010868468, + "learning_rate": 4.996067990388523e-05, + "loss": 2.99, + "mean_token_accuracy": 0.3517241388559341, + "step": 67395 + }, + { + "epoch": 0.06788599584424898, + "grad_norm": 11.418281410115434, + "learning_rate": 4.9960657757552936e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.3655172437429428, + "step": 67400 + }, + { + "epoch": 0.06789103189735315, + "grad_norm": 11.890321702629741, + "learning_rate": 4.99606356049911e-05, + "loss": 2.0241, + "mean_token_accuracy": 0.4793103516101837, + "step": 67405 + }, + { + "epoch": 0.06789606795045733, + "grad_norm": 12.243551617707503, + "learning_rate": 4.996061344619971e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.38275861740112305, + "step": 67410 + }, + { + "epoch": 0.0679011040035615, + "grad_norm": 21.381524704358892, + "learning_rate": 4.996059128117879e-05, + "loss": 2.5058, + "mean_token_accuracy": 0.4275861978530884, + "step": 67415 + }, + { + "epoch": 0.06790614005666568, + "grad_norm": 14.060916445705557, + "learning_rate": 4.996056910992835e-05, + "loss": 2.4795, + "mean_token_accuracy": 0.39842710494995115, + "step": 67420 + }, + { + "epoch": 0.06791117610976984, + "grad_norm": 11.178324643286626, + "learning_rate": 4.996054693244838e-05, + "loss": 2.5733, + "mean_token_accuracy": 0.4344827592372894, + "step": 67425 + }, + { + "epoch": 0.06791621216287401, + "grad_norm": 20.75714070717515, + "learning_rate": 4.996052474873889e-05, + "loss": 2.869, + "mean_token_accuracy": 0.38620689511299133, + "step": 67430 + }, + { + "epoch": 0.06792124821597818, + "grad_norm": 17.44797667877228, + "learning_rate": 4.9960502558799896e-05, + "loss": 2.7087, + "mean_token_accuracy": 0.4279556632041931, + "step": 67435 + }, + { + "epoch": 0.06792628426908236, + "grad_norm": 9.870783310204883, + "learning_rate": 4.9960480362631395e-05, + "loss": 2.7272, + "mean_token_accuracy": 0.41724138259887694, + "step": 67440 + }, + { + "epoch": 0.06793132032218653, + "grad_norm": 9.962690673683175, + "learning_rate": 4.9960458160233394e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.43623715043067934, + "step": 67445 + }, + { + "epoch": 0.0679363563752907, + "grad_norm": 9.905796679745507, + "learning_rate": 4.9960435951605906e-05, + "loss": 2.0922, + "mean_token_accuracy": 0.4724137902259827, + "step": 67450 + }, + { + "epoch": 0.06794139242839488, + "grad_norm": 15.651557683541277, + "learning_rate": 4.996041373674893e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.4034482717514038, + "step": 67455 + }, + { + "epoch": 0.06794642848149905, + "grad_norm": 11.14024475231413, + "learning_rate": 4.9960391515662476e-05, + "loss": 2.7269, + "mean_token_accuracy": 0.37241379022598264, + "step": 67460 + }, + { + "epoch": 0.06795146453460323, + "grad_norm": 12.45319230666954, + "learning_rate": 4.996036928834655e-05, + "loss": 2.6676, + "mean_token_accuracy": 0.35862069129943847, + "step": 67465 + }, + { + "epoch": 0.0679565005877074, + "grad_norm": 7.461908081910165, + "learning_rate": 4.9960347054801146e-05, + "loss": 2.3315, + "mean_token_accuracy": 0.44047186374664304, + "step": 67470 + }, + { + "epoch": 0.06796153664081157, + "grad_norm": 11.096509855502292, + "learning_rate": 4.996032481502629e-05, + "loss": 2.3765, + "mean_token_accuracy": 0.4068965554237366, + "step": 67475 + }, + { + "epoch": 0.06796657269391575, + "grad_norm": 11.41034978158626, + "learning_rate": 4.9960302569021984e-05, + "loss": 2.8313, + "mean_token_accuracy": 0.36206896901130675, + "step": 67480 + }, + { + "epoch": 0.06797160874701992, + "grad_norm": 12.83777653911419, + "learning_rate": 4.9960280316788224e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.44482759237289426, + "step": 67485 + }, + { + "epoch": 0.0679766448001241, + "grad_norm": 10.848310144283532, + "learning_rate": 4.9960258058325024e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.4931034445762634, + "step": 67490 + }, + { + "epoch": 0.06798168085322825, + "grad_norm": 11.078363954704422, + "learning_rate": 4.9960235793632384e-05, + "loss": 2.8658, + "mean_token_accuracy": 0.3793103516101837, + "step": 67495 + }, + { + "epoch": 0.06798671690633243, + "grad_norm": 12.177495073962945, + "learning_rate": 4.9960213522710305e-05, + "loss": 2.3426, + "mean_token_accuracy": 0.42758620381355283, + "step": 67500 + }, + { + "epoch": 0.0679917529594366, + "grad_norm": 9.622854364499236, + "learning_rate": 4.9960191245558814e-05, + "loss": 2.4772, + "mean_token_accuracy": 0.3862069010734558, + "step": 67505 + }, + { + "epoch": 0.06799678901254078, + "grad_norm": 10.369879005177918, + "learning_rate": 4.9960168962177904e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.4206896543502808, + "step": 67510 + }, + { + "epoch": 0.06800182506564495, + "grad_norm": 11.608792595384573, + "learning_rate": 4.996014667256758e-05, + "loss": 2.3711, + "mean_token_accuracy": 0.4034482777118683, + "step": 67515 + }, + { + "epoch": 0.06800686111874912, + "grad_norm": 10.70929953467183, + "learning_rate": 4.996012437672785e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.4620689630508423, + "step": 67520 + }, + { + "epoch": 0.0680118971718533, + "grad_norm": 12.01475489016206, + "learning_rate": 4.9960102074658726e-05, + "loss": 2.5589, + "mean_token_accuracy": 0.4034482717514038, + "step": 67525 + }, + { + "epoch": 0.06801693322495747, + "grad_norm": 12.613384642969377, + "learning_rate": 4.99600797663602e-05, + "loss": 2.306, + "mean_token_accuracy": 0.4517241358757019, + "step": 67530 + }, + { + "epoch": 0.06802196927806164, + "grad_norm": 11.085014845943247, + "learning_rate": 4.9960057451832296e-05, + "loss": 2.3998, + "mean_token_accuracy": 0.4034482717514038, + "step": 67535 + }, + { + "epoch": 0.06802700533116582, + "grad_norm": 13.0302800298626, + "learning_rate": 4.9960035131075006e-05, + "loss": 2.5572, + "mean_token_accuracy": 0.41379310488700866, + "step": 67540 + }, + { + "epoch": 0.06803204138426999, + "grad_norm": 13.95461241458804, + "learning_rate": 4.996001280408834e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.43793103098869324, + "step": 67545 + }, + { + "epoch": 0.06803707743737417, + "grad_norm": 15.891194375730802, + "learning_rate": 4.995999047087231e-05, + "loss": 2.4138, + "mean_token_accuracy": 0.4413793087005615, + "step": 67550 + }, + { + "epoch": 0.06804211349047834, + "grad_norm": 10.391275457994764, + "learning_rate": 4.995996813142692e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.4206896543502808, + "step": 67555 + }, + { + "epoch": 0.06804714954358251, + "grad_norm": 10.731958739480316, + "learning_rate": 4.995994578575217e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.39999999701976774, + "step": 67560 + }, + { + "epoch": 0.06805218559668667, + "grad_norm": 11.592575490023139, + "learning_rate": 4.9959923433848065e-05, + "loss": 2.3071, + "mean_token_accuracy": 0.43103448748588563, + "step": 67565 + }, + { + "epoch": 0.06805722164979085, + "grad_norm": 11.409479831506632, + "learning_rate": 4.995990107571463e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.40828797221183777, + "step": 67570 + }, + { + "epoch": 0.06806225770289502, + "grad_norm": 10.563076503681634, + "learning_rate": 4.9959878711351844e-05, + "loss": 2.5372, + "mean_token_accuracy": 0.38620689511299133, + "step": 67575 + }, + { + "epoch": 0.0680672937559992, + "grad_norm": 11.866089400411754, + "learning_rate": 4.995985634075973e-05, + "loss": 2.7538, + "mean_token_accuracy": 0.379310342669487, + "step": 67580 + }, + { + "epoch": 0.06807232980910337, + "grad_norm": 11.653004661676793, + "learning_rate": 4.99598339639383e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.39655172228813174, + "step": 67585 + }, + { + "epoch": 0.06807736586220754, + "grad_norm": 12.518365995849269, + "learning_rate": 4.995981158088754e-05, + "loss": 2.3229, + "mean_token_accuracy": 0.482758629322052, + "step": 67590 + }, + { + "epoch": 0.06808240191531172, + "grad_norm": 11.438220508897544, + "learning_rate": 4.995978919160747e-05, + "loss": 2.8819, + "mean_token_accuracy": 0.3034482657909393, + "step": 67595 + }, + { + "epoch": 0.06808743796841589, + "grad_norm": 11.136927770625919, + "learning_rate": 4.99597667960981e-05, + "loss": 3.0726, + "mean_token_accuracy": 0.3448275804519653, + "step": 67600 + }, + { + "epoch": 0.06809247402152006, + "grad_norm": 10.506544096031416, + "learning_rate": 4.995974439435943e-05, + "loss": 2.33, + "mean_token_accuracy": 0.42758620977401735, + "step": 67605 + }, + { + "epoch": 0.06809751007462424, + "grad_norm": 13.550237585257541, + "learning_rate": 4.9959721986391456e-05, + "loss": 2.4761, + "mean_token_accuracy": 0.42413792610168455, + "step": 67610 + }, + { + "epoch": 0.06810254612772841, + "grad_norm": 12.572156595164099, + "learning_rate": 4.9959699572194194e-05, + "loss": 2.6315, + "mean_token_accuracy": 0.38965516686439516, + "step": 67615 + }, + { + "epoch": 0.06810758218083258, + "grad_norm": 12.230959063692618, + "learning_rate": 4.995967715176766e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.47241379618644713, + "step": 67620 + }, + { + "epoch": 0.06811261823393676, + "grad_norm": 13.281003520680764, + "learning_rate": 4.995965472511185e-05, + "loss": 2.9073, + "mean_token_accuracy": 0.34137930274009703, + "step": 67625 + }, + { + "epoch": 0.06811765428704093, + "grad_norm": 11.759880574911456, + "learning_rate": 4.995963229222677e-05, + "loss": 2.555, + "mean_token_accuracy": 0.4103448331356049, + "step": 67630 + }, + { + "epoch": 0.06812269034014509, + "grad_norm": 13.669642152052498, + "learning_rate": 4.995960985311242e-05, + "loss": 2.4342, + "mean_token_accuracy": 0.38965516686439516, + "step": 67635 + }, + { + "epoch": 0.06812772639324927, + "grad_norm": 10.198767347032152, + "learning_rate": 4.995958740776883e-05, + "loss": 2.4611, + "mean_token_accuracy": 0.3620689570903778, + "step": 67640 + }, + { + "epoch": 0.06813276244635344, + "grad_norm": 17.182630498137463, + "learning_rate": 4.9959564956195975e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.44482757449150084, + "step": 67645 + }, + { + "epoch": 0.06813779849945761, + "grad_norm": 12.112798067206478, + "learning_rate": 4.995954249839388e-05, + "loss": 2.6985, + "mean_token_accuracy": 0.39655172228813174, + "step": 67650 + }, + { + "epoch": 0.06814283455256179, + "grad_norm": 11.63531380206764, + "learning_rate": 4.9959520034362544e-05, + "loss": 2.5345, + "mean_token_accuracy": 0.4, + "step": 67655 + }, + { + "epoch": 0.06814787060566596, + "grad_norm": 10.33939208886827, + "learning_rate": 4.9959497564101974e-05, + "loss": 2.5698, + "mean_token_accuracy": 0.43103448748588563, + "step": 67660 + }, + { + "epoch": 0.06815290665877013, + "grad_norm": 10.27792208520894, + "learning_rate": 4.995947508761219e-05, + "loss": 2.1212, + "mean_token_accuracy": 0.45517241954803467, + "step": 67665 + }, + { + "epoch": 0.06815794271187431, + "grad_norm": 10.160824333528534, + "learning_rate": 4.9959452604893174e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.40689654350280763, + "step": 67670 + }, + { + "epoch": 0.06816297876497848, + "grad_norm": 12.775698051615551, + "learning_rate": 4.9959430115944957e-05, + "loss": 2.511, + "mean_token_accuracy": 0.37241379022598264, + "step": 67675 + }, + { + "epoch": 0.06816801481808266, + "grad_norm": 14.194501544052104, + "learning_rate": 4.995940762076753e-05, + "loss": 2.6071, + "mean_token_accuracy": 0.3517241358757019, + "step": 67680 + }, + { + "epoch": 0.06817305087118683, + "grad_norm": 14.31971292179627, + "learning_rate": 4.9959385119360895e-05, + "loss": 2.503, + "mean_token_accuracy": 0.45741077661514284, + "step": 67685 + }, + { + "epoch": 0.068178086924291, + "grad_norm": 11.341180389497971, + "learning_rate": 4.995936261172507e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.3931034505367279, + "step": 67690 + }, + { + "epoch": 0.06818312297739518, + "grad_norm": 11.053927591732855, + "learning_rate": 4.9959340097860065e-05, + "loss": 2.8424, + "mean_token_accuracy": 0.3827586233615875, + "step": 67695 + }, + { + "epoch": 0.06818815903049935, + "grad_norm": 10.439632239773552, + "learning_rate": 4.995931757776587e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.37586206793785093, + "step": 67700 + }, + { + "epoch": 0.06819319508360351, + "grad_norm": 12.20094578112412, + "learning_rate": 4.99592950514425e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.4103448212146759, + "step": 67705 + }, + { + "epoch": 0.06819823113670768, + "grad_norm": 10.041808514940353, + "learning_rate": 4.995927251888996e-05, + "loss": 2.2557, + "mean_token_accuracy": 0.4103448331356049, + "step": 67710 + }, + { + "epoch": 0.06820326718981186, + "grad_norm": 11.625215221839394, + "learning_rate": 4.9959249980108256e-05, + "loss": 2.8524, + "mean_token_accuracy": 0.3620689630508423, + "step": 67715 + }, + { + "epoch": 0.06820830324291603, + "grad_norm": 13.636346638878388, + "learning_rate": 4.99592274350974e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.47241380214691164, + "step": 67720 + }, + { + "epoch": 0.0682133392960202, + "grad_norm": 14.760031215533257, + "learning_rate": 4.995920488385739e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.42413793206214906, + "step": 67725 + }, + { + "epoch": 0.06821837534912438, + "grad_norm": 12.678838320150144, + "learning_rate": 4.995918232638824e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.3793103456497192, + "step": 67730 + }, + { + "epoch": 0.06822341140222855, + "grad_norm": 12.053792102128776, + "learning_rate": 4.995915976268995e-05, + "loss": 2.3119, + "mean_token_accuracy": 0.48275862336158754, + "step": 67735 + }, + { + "epoch": 0.06822844745533273, + "grad_norm": 22.061651031457604, + "learning_rate": 4.995913719276252e-05, + "loss": 2.9199, + "mean_token_accuracy": 0.3862069010734558, + "step": 67740 + }, + { + "epoch": 0.0682334835084369, + "grad_norm": 9.855669142905063, + "learning_rate": 4.995911461660598e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.358620685338974, + "step": 67745 + }, + { + "epoch": 0.06823851956154108, + "grad_norm": 10.984783747030557, + "learning_rate": 4.9959092034220306e-05, + "loss": 2.1303, + "mean_token_accuracy": 0.47931034564971925, + "step": 67750 + }, + { + "epoch": 0.06824355561464525, + "grad_norm": 11.553586965491832, + "learning_rate": 4.995906944560553e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.41724138259887694, + "step": 67755 + }, + { + "epoch": 0.06824859166774942, + "grad_norm": 14.007340867442336, + "learning_rate": 4.995904685076164e-05, + "loss": 2.3331, + "mean_token_accuracy": 0.4467029690742493, + "step": 67760 + }, + { + "epoch": 0.0682536277208536, + "grad_norm": 13.047073571210488, + "learning_rate": 4.995902424968865e-05, + "loss": 2.2024, + "mean_token_accuracy": 0.44137930274009707, + "step": 67765 + }, + { + "epoch": 0.06825866377395777, + "grad_norm": 12.38055620311324, + "learning_rate": 4.995900164238657e-05, + "loss": 2.8741, + "mean_token_accuracy": 0.3724138021469116, + "step": 67770 + }, + { + "epoch": 0.06826369982706193, + "grad_norm": 11.916159724152177, + "learning_rate": 4.99589790288554e-05, + "loss": 2.6477, + "mean_token_accuracy": 0.4172413796186447, + "step": 67775 + }, + { + "epoch": 0.0682687358801661, + "grad_norm": 11.294999070671961, + "learning_rate": 4.995895640909515e-05, + "loss": 2.5906, + "mean_token_accuracy": 0.36896551251411436, + "step": 67780 + }, + { + "epoch": 0.06827377193327028, + "grad_norm": 10.755489763010823, + "learning_rate": 4.995893378310583e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.4206896543502808, + "step": 67785 + }, + { + "epoch": 0.06827880798637445, + "grad_norm": 11.726088754582799, + "learning_rate": 4.9958911150887424e-05, + "loss": 2.8993, + "mean_token_accuracy": 0.36896551251411436, + "step": 67790 + }, + { + "epoch": 0.06828384403947863, + "grad_norm": 9.97170460780227, + "learning_rate": 4.995888851243997e-05, + "loss": 2.3642, + "mean_token_accuracy": 0.44137930274009707, + "step": 67795 + }, + { + "epoch": 0.0682888800925828, + "grad_norm": 11.5999564998719, + "learning_rate": 4.995886586776345e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.46206897497177124, + "step": 67800 + }, + { + "epoch": 0.06829391614568697, + "grad_norm": 11.451137283201733, + "learning_rate": 4.995884321685789e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.42238354682922363, + "step": 67805 + }, + { + "epoch": 0.06829895219879115, + "grad_norm": 11.045433382768143, + "learning_rate": 4.9958820559723276e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.41034482717514037, + "step": 67810 + }, + { + "epoch": 0.06830398825189532, + "grad_norm": 10.39142777362513, + "learning_rate": 4.995879789635963e-05, + "loss": 2.7999, + "mean_token_accuracy": 0.4000000089406967, + "step": 67815 + }, + { + "epoch": 0.0683090243049995, + "grad_norm": 12.374181197035556, + "learning_rate": 4.9958775226766954e-05, + "loss": 3.5747, + "mean_token_accuracy": 0.2896551787853241, + "step": 67820 + }, + { + "epoch": 0.06831406035810367, + "grad_norm": 12.506992164076818, + "learning_rate": 4.995875255094525e-05, + "loss": 2.4624, + "mean_token_accuracy": 0.39310344457626345, + "step": 67825 + }, + { + "epoch": 0.06831909641120784, + "grad_norm": 10.777338236416748, + "learning_rate": 4.995872986889453e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.4086509466171265, + "step": 67830 + }, + { + "epoch": 0.06832413246431202, + "grad_norm": 7.047548656024399, + "learning_rate": 4.995870718061479e-05, + "loss": 2.1891, + "mean_token_accuracy": 0.4912561535835266, + "step": 67835 + }, + { + "epoch": 0.06832916851741619, + "grad_norm": 10.548905972028468, + "learning_rate": 4.995868448610605e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.43793103098869324, + "step": 67840 + }, + { + "epoch": 0.06833420457052035, + "grad_norm": 11.131581273911983, + "learning_rate": 4.995866178536831e-05, + "loss": 2.5971, + "mean_token_accuracy": 0.4137930989265442, + "step": 67845 + }, + { + "epoch": 0.06833924062362452, + "grad_norm": 15.116506786151675, + "learning_rate": 4.995863907840158e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.4379310369491577, + "step": 67850 + }, + { + "epoch": 0.0683442766767287, + "grad_norm": 9.175100172438112, + "learning_rate": 4.995861636520586e-05, + "loss": 2.354, + "mean_token_accuracy": 0.4551724076271057, + "step": 67855 + }, + { + "epoch": 0.06834931272983287, + "grad_norm": 12.5592146316937, + "learning_rate": 4.995859364578115e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.42758620381355283, + "step": 67860 + }, + { + "epoch": 0.06835434878293704, + "grad_norm": 18.79740162845886, + "learning_rate": 4.995857092012748e-05, + "loss": 2.5061, + "mean_token_accuracy": 0.4275862067937851, + "step": 67865 + }, + { + "epoch": 0.06835938483604122, + "grad_norm": 14.57022183557931, + "learning_rate": 4.995854818824484e-05, + "loss": 2.5396, + "mean_token_accuracy": 0.441379314661026, + "step": 67870 + }, + { + "epoch": 0.06836442088914539, + "grad_norm": 15.352158693312692, + "learning_rate": 4.995852545013322e-05, + "loss": 2.8335, + "mean_token_accuracy": 0.3793103456497192, + "step": 67875 + }, + { + "epoch": 0.06836945694224957, + "grad_norm": 17.923213263395787, + "learning_rate": 4.995850270579266e-05, + "loss": 2.7775, + "mean_token_accuracy": 0.38620689511299133, + "step": 67880 + }, + { + "epoch": 0.06837449299535374, + "grad_norm": 11.88786806200433, + "learning_rate": 4.9958479955223146e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.4103448212146759, + "step": 67885 + }, + { + "epoch": 0.06837952904845791, + "grad_norm": 11.57498501957785, + "learning_rate": 4.9958457198424687e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.40490018725395205, + "step": 67890 + }, + { + "epoch": 0.06838456510156209, + "grad_norm": 9.935760864532636, + "learning_rate": 4.9958434435397295e-05, + "loss": 3.0267, + "mean_token_accuracy": 0.3172413736581802, + "step": 67895 + }, + { + "epoch": 0.06838960115466626, + "grad_norm": 12.066123936091081, + "learning_rate": 4.995841166614097e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.39655172228813174, + "step": 67900 + }, + { + "epoch": 0.06839463720777043, + "grad_norm": 13.218251543035977, + "learning_rate": 4.9958388890655727e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.41724138259887694, + "step": 67905 + }, + { + "epoch": 0.06839967326087461, + "grad_norm": 9.76936640312093, + "learning_rate": 4.995836610894156e-05, + "loss": 2.0089, + "mean_token_accuracy": 0.4620689630508423, + "step": 67910 + }, + { + "epoch": 0.06840470931397877, + "grad_norm": 14.56657002513992, + "learning_rate": 4.995834332099848e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.42068964838981626, + "step": 67915 + }, + { + "epoch": 0.06840974536708294, + "grad_norm": 11.758562428182222, + "learning_rate": 4.9958320526826494e-05, + "loss": 2.4621, + "mean_token_accuracy": 0.4034482777118683, + "step": 67920 + }, + { + "epoch": 0.06841478142018712, + "grad_norm": 11.254236143449344, + "learning_rate": 4.995829772642561e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.41034482717514037, + "step": 67925 + }, + { + "epoch": 0.06841981747329129, + "grad_norm": 10.170986982691067, + "learning_rate": 4.995827491979584e-05, + "loss": 2.3279, + "mean_token_accuracy": 0.4620689630508423, + "step": 67930 + }, + { + "epoch": 0.06842485352639546, + "grad_norm": 11.98224232358821, + "learning_rate": 4.995825210693718e-05, + "loss": 2.7845, + "mean_token_accuracy": 0.4137930989265442, + "step": 67935 + }, + { + "epoch": 0.06842988957949964, + "grad_norm": 14.614397198425685, + "learning_rate": 4.995822928784964e-05, + "loss": 2.5635, + "mean_token_accuracy": 0.3862068921327591, + "step": 67940 + }, + { + "epoch": 0.06843492563260381, + "grad_norm": 10.303188806157545, + "learning_rate": 4.9958206462533225e-05, + "loss": 2.1692, + "mean_token_accuracy": 0.4551724135875702, + "step": 67945 + }, + { + "epoch": 0.06843996168570798, + "grad_norm": 12.048774232995173, + "learning_rate": 4.9958183630987943e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.4034482777118683, + "step": 67950 + }, + { + "epoch": 0.06844499773881216, + "grad_norm": 12.657010052946985, + "learning_rate": 4.99581607932138e-05, + "loss": 2.9604, + "mean_token_accuracy": 0.36206896901130675, + "step": 67955 + }, + { + "epoch": 0.06845003379191633, + "grad_norm": 13.6825145832331, + "learning_rate": 4.99581379492108e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.420689657330513, + "step": 67960 + }, + { + "epoch": 0.0684550698450205, + "grad_norm": 11.003620092788015, + "learning_rate": 4.995811509897895e-05, + "loss": 2.462, + "mean_token_accuracy": 0.4172413766384125, + "step": 67965 + }, + { + "epoch": 0.06846010589812468, + "grad_norm": 15.19296718060808, + "learning_rate": 4.995809224251827e-05, + "loss": 2.8776, + "mean_token_accuracy": 0.36551723480224607, + "step": 67970 + }, + { + "epoch": 0.06846514195122885, + "grad_norm": 13.945249586069592, + "learning_rate": 4.995806937982874e-05, + "loss": 2.9054, + "mean_token_accuracy": 0.36896551251411436, + "step": 67975 + }, + { + "epoch": 0.06847017800433303, + "grad_norm": 11.244731370560537, + "learning_rate": 4.995804651091039e-05, + "loss": 2.7528, + "mean_token_accuracy": 0.3689655244350433, + "step": 67980 + }, + { + "epoch": 0.06847521405743719, + "grad_norm": 22.110342838687064, + "learning_rate": 4.995802363576321e-05, + "loss": 2.7398, + "mean_token_accuracy": 0.4113300502300262, + "step": 67985 + }, + { + "epoch": 0.06848025011054136, + "grad_norm": 19.26432173437627, + "learning_rate": 4.995800075438722e-05, + "loss": 2.9468, + "mean_token_accuracy": 0.3793103456497192, + "step": 67990 + }, + { + "epoch": 0.06848528616364553, + "grad_norm": 11.276852072657423, + "learning_rate": 4.995797786678242e-05, + "loss": 2.6102, + "mean_token_accuracy": 0.3689655065536499, + "step": 67995 + }, + { + "epoch": 0.06849032221674971, + "grad_norm": 11.090639232304385, + "learning_rate": 4.995795497294881e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.39655172228813174, + "step": 68000 + }, + { + "epoch": 0.06849535826985388, + "grad_norm": 12.562089443373422, + "learning_rate": 4.9957932072886405e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.43448275327682495, + "step": 68005 + }, + { + "epoch": 0.06850039432295806, + "grad_norm": 11.527090376631108, + "learning_rate": 4.995790916659521e-05, + "loss": 2.4738, + "mean_token_accuracy": 0.3862069010734558, + "step": 68010 + }, + { + "epoch": 0.06850543037606223, + "grad_norm": 10.722513711569446, + "learning_rate": 4.9957886254075223e-05, + "loss": 2.3529, + "mean_token_accuracy": 0.4206896543502808, + "step": 68015 + }, + { + "epoch": 0.0685104664291664, + "grad_norm": 12.398642602053979, + "learning_rate": 4.995786333532647e-05, + "loss": 2.5458, + "mean_token_accuracy": 0.3999999940395355, + "step": 68020 + }, + { + "epoch": 0.06851550248227058, + "grad_norm": 10.456643458396158, + "learning_rate": 4.995784041034894e-05, + "loss": 2.5577, + "mean_token_accuracy": 0.41034482717514037, + "step": 68025 + }, + { + "epoch": 0.06852053853537475, + "grad_norm": 12.887406245332665, + "learning_rate": 4.9957817479142636e-05, + "loss": 2.8691, + "mean_token_accuracy": 0.3774349570274353, + "step": 68030 + }, + { + "epoch": 0.06852557458847892, + "grad_norm": 14.174355464637435, + "learning_rate": 4.9957794541707583e-05, + "loss": 2.8707, + "mean_token_accuracy": 0.4103448331356049, + "step": 68035 + }, + { + "epoch": 0.0685306106415831, + "grad_norm": 13.144835040752119, + "learning_rate": 4.9957771598043776e-05, + "loss": 2.3696, + "mean_token_accuracy": 0.4068965494632721, + "step": 68040 + }, + { + "epoch": 0.06853564669468727, + "grad_norm": 11.246789224224312, + "learning_rate": 4.995774864815121e-05, + "loss": 2.0803, + "mean_token_accuracy": 0.458620685338974, + "step": 68045 + }, + { + "epoch": 0.06854068274779145, + "grad_norm": 13.394944572588896, + "learning_rate": 4.9957725692029915e-05, + "loss": 2.8871, + "mean_token_accuracy": 0.35335753560066224, + "step": 68050 + }, + { + "epoch": 0.0685457188008956, + "grad_norm": 12.257848607816527, + "learning_rate": 4.995770272967989e-05, + "loss": 2.4439, + "mean_token_accuracy": 0.42413792610168455, + "step": 68055 + }, + { + "epoch": 0.06855075485399978, + "grad_norm": 10.090734651197305, + "learning_rate": 4.9957679761101137e-05, + "loss": 3.3997, + "mean_token_accuracy": 0.3413793116807938, + "step": 68060 + }, + { + "epoch": 0.06855579090710395, + "grad_norm": 11.942178948826568, + "learning_rate": 4.995765678629365e-05, + "loss": 2.4048, + "mean_token_accuracy": 0.4275862157344818, + "step": 68065 + }, + { + "epoch": 0.06856082696020813, + "grad_norm": 10.453425893756625, + "learning_rate": 4.995763380525746e-05, + "loss": 2.3919, + "mean_token_accuracy": 0.44482758045196535, + "step": 68070 + }, + { + "epoch": 0.0685658630133123, + "grad_norm": 13.436341964765807, + "learning_rate": 4.9957610817992556e-05, + "loss": 2.7575, + "mean_token_accuracy": 0.37931033968925476, + "step": 68075 + }, + { + "epoch": 0.06857089906641647, + "grad_norm": 14.202240067666892, + "learning_rate": 4.9957587824498946e-05, + "loss": 2.4885, + "mean_token_accuracy": 0.42068964838981626, + "step": 68080 + }, + { + "epoch": 0.06857593511952065, + "grad_norm": 10.061776810203515, + "learning_rate": 4.995756482477665e-05, + "loss": 2.2977, + "mean_token_accuracy": 0.4379310369491577, + "step": 68085 + }, + { + "epoch": 0.06858097117262482, + "grad_norm": 10.701233608231643, + "learning_rate": 4.9957541818825655e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.4310344815254211, + "step": 68090 + }, + { + "epoch": 0.068586007225729, + "grad_norm": 11.140484726397732, + "learning_rate": 4.995751880664599e-05, + "loss": 2.7755, + "mean_token_accuracy": 0.33103448152542114, + "step": 68095 + }, + { + "epoch": 0.06859104327883317, + "grad_norm": 11.254314639549678, + "learning_rate": 4.9957495788237636e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4034482777118683, + "step": 68100 + }, + { + "epoch": 0.06859607933193734, + "grad_norm": 12.035333883603743, + "learning_rate": 4.995747276360061e-05, + "loss": 2.5645, + "mean_token_accuracy": 0.4172413766384125, + "step": 68105 + }, + { + "epoch": 0.06860111538504152, + "grad_norm": 9.458219513973852, + "learning_rate": 4.9957449732734936e-05, + "loss": 2.369, + "mean_token_accuracy": 0.45517241954803467, + "step": 68110 + }, + { + "epoch": 0.06860615143814569, + "grad_norm": 15.328357994755201, + "learning_rate": 4.995742669564059e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.44482758045196535, + "step": 68115 + }, + { + "epoch": 0.06861118749124986, + "grad_norm": 12.214712588989148, + "learning_rate": 4.9957403652317595e-05, + "loss": 2.3947, + "mean_token_accuracy": 0.4344827592372894, + "step": 68120 + }, + { + "epoch": 0.06861622354435402, + "grad_norm": 12.128713044444613, + "learning_rate": 4.9957380602765954e-05, + "loss": 2.6187, + "mean_token_accuracy": 0.4103448301553726, + "step": 68125 + }, + { + "epoch": 0.0686212595974582, + "grad_norm": 10.459589655249898, + "learning_rate": 4.995735754698568e-05, + "loss": 2.1151, + "mean_token_accuracy": 0.4655172288417816, + "step": 68130 + }, + { + "epoch": 0.06862629565056237, + "grad_norm": 10.085066977629412, + "learning_rate": 4.9957334484976774e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.42758620977401735, + "step": 68135 + }, + { + "epoch": 0.06863133170366655, + "grad_norm": 10.470334883663183, + "learning_rate": 4.995731141673924e-05, + "loss": 2.3027, + "mean_token_accuracy": 0.4122807025909424, + "step": 68140 + }, + { + "epoch": 0.06863636775677072, + "grad_norm": 11.819030230479202, + "learning_rate": 4.995728834227309e-05, + "loss": 2.05, + "mean_token_accuracy": 0.47586206197738645, + "step": 68145 + }, + { + "epoch": 0.06864140380987489, + "grad_norm": 10.985890241995858, + "learning_rate": 4.9957265261578324e-05, + "loss": 2.5963, + "mean_token_accuracy": 0.39310343861579894, + "step": 68150 + }, + { + "epoch": 0.06864643986297907, + "grad_norm": 12.03026032241558, + "learning_rate": 4.995724217465495e-05, + "loss": 2.1793, + "mean_token_accuracy": 0.443254691362381, + "step": 68155 + }, + { + "epoch": 0.06865147591608324, + "grad_norm": 8.53095996252538, + "learning_rate": 4.995721908150298e-05, + "loss": 2.415, + "mean_token_accuracy": 0.44248768091201784, + "step": 68160 + }, + { + "epoch": 0.06865651196918741, + "grad_norm": 11.409718350880656, + "learning_rate": 4.995719598212242e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.4034482717514038, + "step": 68165 + }, + { + "epoch": 0.06866154802229159, + "grad_norm": 9.7141628757598, + "learning_rate": 4.9957172876513265e-05, + "loss": 2.0407, + "mean_token_accuracy": 0.4827586054801941, + "step": 68170 + }, + { + "epoch": 0.06866658407539576, + "grad_norm": 10.851265392690806, + "learning_rate": 4.995714976467553e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.3896551728248596, + "step": 68175 + }, + { + "epoch": 0.06867162012849994, + "grad_norm": 13.489016763291328, + "learning_rate": 4.995712664660923e-05, + "loss": 2.2229, + "mean_token_accuracy": 0.42068964838981626, + "step": 68180 + }, + { + "epoch": 0.06867665618160411, + "grad_norm": 11.157386216273943, + "learning_rate": 4.9957103522314354e-05, + "loss": 2.9373, + "mean_token_accuracy": 0.34137931317090986, + "step": 68185 + }, + { + "epoch": 0.06868169223470828, + "grad_norm": 12.671492426012367, + "learning_rate": 4.995708039179092e-05, + "loss": 2.7379, + "mean_token_accuracy": 0.39655172526836396, + "step": 68190 + }, + { + "epoch": 0.06868672828781244, + "grad_norm": 12.22262975315247, + "learning_rate": 4.995705725503893e-05, + "loss": 2.5968, + "mean_token_accuracy": 0.4034482717514038, + "step": 68195 + }, + { + "epoch": 0.06869176434091662, + "grad_norm": 13.270736734349299, + "learning_rate": 4.995703411205839e-05, + "loss": 1.9786, + "mean_token_accuracy": 0.4965517222881317, + "step": 68200 + }, + { + "epoch": 0.06869680039402079, + "grad_norm": 10.561188892829515, + "learning_rate": 4.995701096284931e-05, + "loss": 2.657, + "mean_token_accuracy": 0.3034482717514038, + "step": 68205 + }, + { + "epoch": 0.06870183644712496, + "grad_norm": 11.405297136851035, + "learning_rate": 4.9956987807411694e-05, + "loss": 3.0318, + "mean_token_accuracy": 0.34482758641242983, + "step": 68210 + }, + { + "epoch": 0.06870687250022914, + "grad_norm": 10.807959589992901, + "learning_rate": 4.995696464574554e-05, + "loss": 2.2939, + "mean_token_accuracy": 0.4551724076271057, + "step": 68215 + }, + { + "epoch": 0.06871190855333331, + "grad_norm": 11.183639038708124, + "learning_rate": 4.9956941477850874e-05, + "loss": 2.3126, + "mean_token_accuracy": 0.4448275864124298, + "step": 68220 + }, + { + "epoch": 0.06871694460643749, + "grad_norm": 11.341184527072784, + "learning_rate": 4.9956918303727684e-05, + "loss": 2.43, + "mean_token_accuracy": 0.4034482777118683, + "step": 68225 + }, + { + "epoch": 0.06872198065954166, + "grad_norm": 11.867086148677712, + "learning_rate": 4.995689512337599e-05, + "loss": 2.8162, + "mean_token_accuracy": 0.35862069129943847, + "step": 68230 + }, + { + "epoch": 0.06872701671264583, + "grad_norm": 11.63069911886928, + "learning_rate": 4.995687193679579e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.4103448331356049, + "step": 68235 + }, + { + "epoch": 0.06873205276575, + "grad_norm": 7.837334287424601, + "learning_rate": 4.9956848743987094e-05, + "loss": 2.3421, + "mean_token_accuracy": 0.46098004579544066, + "step": 68240 + }, + { + "epoch": 0.06873708881885418, + "grad_norm": 9.923742170854597, + "learning_rate": 4.9956825544949906e-05, + "loss": 2.1277, + "mean_token_accuracy": 0.4206896424293518, + "step": 68245 + }, + { + "epoch": 0.06874212487195835, + "grad_norm": 12.650398859564838, + "learning_rate": 4.9956802339684235e-05, + "loss": 2.6756, + "mean_token_accuracy": 0.34482758343219755, + "step": 68250 + }, + { + "epoch": 0.06874716092506253, + "grad_norm": 11.424958710758034, + "learning_rate": 4.995677912819008e-05, + "loss": 3.1312, + "mean_token_accuracy": 0.32758620381355286, + "step": 68255 + }, + { + "epoch": 0.0687521969781667, + "grad_norm": 10.719106329382566, + "learning_rate": 4.995675591046746e-05, + "loss": 2.655, + "mean_token_accuracy": 0.41724138259887694, + "step": 68260 + }, + { + "epoch": 0.06875723303127086, + "grad_norm": 14.989340031481367, + "learning_rate": 4.995673268651637e-05, + "loss": 3.1029, + "mean_token_accuracy": 0.3551724076271057, + "step": 68265 + }, + { + "epoch": 0.06876226908437504, + "grad_norm": 11.097026851255778, + "learning_rate": 4.995670945633683e-05, + "loss": 2.3953, + "mean_token_accuracy": 0.4206896543502808, + "step": 68270 + }, + { + "epoch": 0.06876730513747921, + "grad_norm": 11.672875333645576, + "learning_rate": 4.9956686219928836e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.42413792610168455, + "step": 68275 + }, + { + "epoch": 0.06877234119058338, + "grad_norm": 9.769184310215607, + "learning_rate": 4.995666297729239e-05, + "loss": 2.2208, + "mean_token_accuracy": 0.4758620738983154, + "step": 68280 + }, + { + "epoch": 0.06877737724368756, + "grad_norm": 10.75739287361541, + "learning_rate": 4.9956639728427507e-05, + "loss": 2.391, + "mean_token_accuracy": 0.4774349570274353, + "step": 68285 + }, + { + "epoch": 0.06878241329679173, + "grad_norm": 13.394639760364598, + "learning_rate": 4.9956616473334194e-05, + "loss": 2.6972, + "mean_token_accuracy": 0.3862068891525269, + "step": 68290 + }, + { + "epoch": 0.0687874493498959, + "grad_norm": 11.041523905372237, + "learning_rate": 4.995659321201245e-05, + "loss": 2.7111, + "mean_token_accuracy": 0.4, + "step": 68295 + }, + { + "epoch": 0.06879248540300008, + "grad_norm": 8.625332400582879, + "learning_rate": 4.995656994446229e-05, + "loss": 2.6796, + "mean_token_accuracy": 0.4068965554237366, + "step": 68300 + }, + { + "epoch": 0.06879752145610425, + "grad_norm": 9.720938484340916, + "learning_rate": 4.995654667068371e-05, + "loss": 1.9673, + "mean_token_accuracy": 0.46995073556900024, + "step": 68305 + }, + { + "epoch": 0.06880255750920843, + "grad_norm": 10.52488012676982, + "learning_rate": 4.995652339067673e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.42758620977401735, + "step": 68310 + }, + { + "epoch": 0.0688075935623126, + "grad_norm": 13.01065089626831, + "learning_rate": 4.995650010444135e-05, + "loss": 2.1194, + "mean_token_accuracy": 0.4620689630508423, + "step": 68315 + }, + { + "epoch": 0.06881262961541677, + "grad_norm": 14.07132543863634, + "learning_rate": 4.995647681197758e-05, + "loss": 2.4308, + "mean_token_accuracy": 0.42915910482406616, + "step": 68320 + }, + { + "epoch": 0.06881766566852095, + "grad_norm": 12.946774338917402, + "learning_rate": 4.995645351328541e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.4034482777118683, + "step": 68325 + }, + { + "epoch": 0.06882270172162512, + "grad_norm": 14.721830659351117, + "learning_rate": 4.995643020836487e-05, + "loss": 2.7721, + "mean_token_accuracy": 0.37241379618644715, + "step": 68330 + }, + { + "epoch": 0.06882773777472928, + "grad_norm": 12.811521538146035, + "learning_rate": 4.995640689721595e-05, + "loss": 2.637, + "mean_token_accuracy": 0.41034482717514037, + "step": 68335 + }, + { + "epoch": 0.06883277382783345, + "grad_norm": 12.188066228182917, + "learning_rate": 4.9956383579838656e-05, + "loss": 3.0205, + "mean_token_accuracy": 0.3379310339689255, + "step": 68340 + }, + { + "epoch": 0.06883780988093763, + "grad_norm": 10.189306275715216, + "learning_rate": 4.995636025623301e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.42413793206214906, + "step": 68345 + }, + { + "epoch": 0.0688428459340418, + "grad_norm": 11.552688370595554, + "learning_rate": 4.9956336926399006e-05, + "loss": 2.6752, + "mean_token_accuracy": 0.36896551251411436, + "step": 68350 + }, + { + "epoch": 0.06884788198714598, + "grad_norm": 10.64053065552082, + "learning_rate": 4.995631359033665e-05, + "loss": 2.3537, + "mean_token_accuracy": 0.42014518976211546, + "step": 68355 + }, + { + "epoch": 0.06885291804025015, + "grad_norm": 13.825055121473975, + "learning_rate": 4.995629024804595e-05, + "loss": 2.2019, + "mean_token_accuracy": 0.4848154723644257, + "step": 68360 + }, + { + "epoch": 0.06885795409335432, + "grad_norm": 11.848816493515631, + "learning_rate": 4.9956266899526915e-05, + "loss": 2.4217, + "mean_token_accuracy": 0.40344826579093934, + "step": 68365 + }, + { + "epoch": 0.0688629901464585, + "grad_norm": 11.377183686229952, + "learning_rate": 4.995624354477956e-05, + "loss": 2.1334, + "mean_token_accuracy": 0.4931034445762634, + "step": 68370 + }, + { + "epoch": 0.06886802619956267, + "grad_norm": 9.698027321301755, + "learning_rate": 4.9956220183803875e-05, + "loss": 2.3202, + "mean_token_accuracy": 0.4034482777118683, + "step": 68375 + }, + { + "epoch": 0.06887306225266684, + "grad_norm": 11.509564562701797, + "learning_rate": 4.995619681659987e-05, + "loss": 2.5417, + "mean_token_accuracy": 0.38275861740112305, + "step": 68380 + }, + { + "epoch": 0.06887809830577102, + "grad_norm": 10.040928349171919, + "learning_rate": 4.9956173443167564e-05, + "loss": 2.401, + "mean_token_accuracy": 0.4586206912994385, + "step": 68385 + }, + { + "epoch": 0.06888313435887519, + "grad_norm": 19.906730386696204, + "learning_rate": 4.9956150063506943e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.3896551728248596, + "step": 68390 + }, + { + "epoch": 0.06888817041197937, + "grad_norm": 12.616991827631823, + "learning_rate": 4.9956126677618035e-05, + "loss": 2.435, + "mean_token_accuracy": 0.38620689809322356, + "step": 68395 + }, + { + "epoch": 0.06889320646508354, + "grad_norm": 10.26895841272872, + "learning_rate": 4.995610328550084e-05, + "loss": 2.3165, + "mean_token_accuracy": 0.443254691362381, + "step": 68400 + }, + { + "epoch": 0.0688982425181877, + "grad_norm": 9.993477608082141, + "learning_rate": 4.995607988715534e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.40689654350280763, + "step": 68405 + }, + { + "epoch": 0.06890327857129187, + "grad_norm": 15.538352396027378, + "learning_rate": 4.9956056482581586e-05, + "loss": 2.7349, + "mean_token_accuracy": 0.358620685338974, + "step": 68410 + }, + { + "epoch": 0.06890831462439605, + "grad_norm": 17.254591896865257, + "learning_rate": 4.995603307177955e-05, + "loss": 2.7628, + "mean_token_accuracy": 0.41724138259887694, + "step": 68415 + }, + { + "epoch": 0.06891335067750022, + "grad_norm": 11.561106690820623, + "learning_rate": 4.995600965474925e-05, + "loss": 2.4659, + "mean_token_accuracy": 0.42413792610168455, + "step": 68420 + }, + { + "epoch": 0.0689183867306044, + "grad_norm": 10.916660391260852, + "learning_rate": 4.995598623149069e-05, + "loss": 3.2277, + "mean_token_accuracy": 0.39310344457626345, + "step": 68425 + }, + { + "epoch": 0.06892342278370857, + "grad_norm": 10.310989828371996, + "learning_rate": 4.9955962802003887e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.3999999940395355, + "step": 68430 + }, + { + "epoch": 0.06892845883681274, + "grad_norm": 11.032759181497525, + "learning_rate": 4.9955939366288834e-05, + "loss": 2.8766, + "mean_token_accuracy": 0.4103448331356049, + "step": 68435 + }, + { + "epoch": 0.06893349488991692, + "grad_norm": 10.017227522900951, + "learning_rate": 4.995591592434554e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.39310344457626345, + "step": 68440 + }, + { + "epoch": 0.06893853094302109, + "grad_norm": 14.614787203511902, + "learning_rate": 4.995589247617402e-05, + "loss": 2.464, + "mean_token_accuracy": 0.4034482717514038, + "step": 68445 + }, + { + "epoch": 0.06894356699612526, + "grad_norm": 11.21538102570232, + "learning_rate": 4.995586902177427e-05, + "loss": 2.5783, + "mean_token_accuracy": 0.36896550953388213, + "step": 68450 + }, + { + "epoch": 0.06894860304922944, + "grad_norm": 11.130687187389741, + "learning_rate": 4.99558455611463e-05, + "loss": 2.9655, + "mean_token_accuracy": 0.3482758551836014, + "step": 68455 + }, + { + "epoch": 0.06895363910233361, + "grad_norm": 11.980783784234031, + "learning_rate": 4.9955822094290116e-05, + "loss": 2.3263, + "mean_token_accuracy": 0.4275861978530884, + "step": 68460 + }, + { + "epoch": 0.06895867515543778, + "grad_norm": 11.627874988993845, + "learning_rate": 4.995579862120573e-05, + "loss": 2.7726, + "mean_token_accuracy": 0.37586206793785093, + "step": 68465 + }, + { + "epoch": 0.06896371120854196, + "grad_norm": 11.993794892656815, + "learning_rate": 4.995577514189314e-05, + "loss": 2.3595, + "mean_token_accuracy": 0.4, + "step": 68470 + }, + { + "epoch": 0.06896874726164612, + "grad_norm": 16.163800529366693, + "learning_rate": 4.995575165635236e-05, + "loss": 2.4382, + "mean_token_accuracy": 0.3999999940395355, + "step": 68475 + }, + { + "epoch": 0.06897378331475029, + "grad_norm": 11.085237128187229, + "learning_rate": 4.99557281645834e-05, + "loss": 2.7072, + "mean_token_accuracy": 0.37241379022598264, + "step": 68480 + }, + { + "epoch": 0.06897881936785447, + "grad_norm": 14.76984410113925, + "learning_rate": 4.995570466658625e-05, + "loss": 2.8102, + "mean_token_accuracy": 0.3574107691645622, + "step": 68485 + }, + { + "epoch": 0.06898385542095864, + "grad_norm": 10.53966114243923, + "learning_rate": 4.995568116236093e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.4344827592372894, + "step": 68490 + }, + { + "epoch": 0.06898889147406281, + "grad_norm": 10.72842197344829, + "learning_rate": 4.995565765190744e-05, + "loss": 2.5779, + "mean_token_accuracy": 0.4137930929660797, + "step": 68495 + }, + { + "epoch": 0.06899392752716699, + "grad_norm": 12.145113604289183, + "learning_rate": 4.99556341352258e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.42413793206214906, + "step": 68500 + }, + { + "epoch": 0.06899896358027116, + "grad_norm": 14.661452125856623, + "learning_rate": 4.995561061231599e-05, + "loss": 2.3491, + "mean_token_accuracy": 0.401875376701355, + "step": 68505 + }, + { + "epoch": 0.06900399963337533, + "grad_norm": 11.43938343223494, + "learning_rate": 4.9955587083178044e-05, + "loss": 2.2729, + "mean_token_accuracy": 0.46031457781791685, + "step": 68510 + }, + { + "epoch": 0.06900903568647951, + "grad_norm": 11.947915279705231, + "learning_rate": 4.995556354781195e-05, + "loss": 2.6768, + "mean_token_accuracy": 0.3896551728248596, + "step": 68515 + }, + { + "epoch": 0.06901407173958368, + "grad_norm": 12.195475639202975, + "learning_rate": 4.9955540006217726e-05, + "loss": 2.6219, + "mean_token_accuracy": 0.3655172407627106, + "step": 68520 + }, + { + "epoch": 0.06901910779268786, + "grad_norm": 11.328282163588128, + "learning_rate": 4.9955516458395376e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.4206896543502808, + "step": 68525 + }, + { + "epoch": 0.06902414384579203, + "grad_norm": 12.1180032101301, + "learning_rate": 4.995549290434491e-05, + "loss": 3.143, + "mean_token_accuracy": 0.37241379022598264, + "step": 68530 + }, + { + "epoch": 0.0690291798988962, + "grad_norm": 10.425640153208771, + "learning_rate": 4.9955469344066316e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.41379310488700866, + "step": 68535 + }, + { + "epoch": 0.06903421595200038, + "grad_norm": 12.139066728407032, + "learning_rate": 4.9955445777559626e-05, + "loss": 2.6509, + "mean_token_accuracy": 0.33103448152542114, + "step": 68540 + }, + { + "epoch": 0.06903925200510454, + "grad_norm": 10.515487193559398, + "learning_rate": 4.995542220482483e-05, + "loss": 2.5458, + "mean_token_accuracy": 0.42413792610168455, + "step": 68545 + }, + { + "epoch": 0.06904428805820871, + "grad_norm": 10.28783048709047, + "learning_rate": 4.995539862586194e-05, + "loss": 2.4284, + "mean_token_accuracy": 0.3896551728248596, + "step": 68550 + }, + { + "epoch": 0.06904932411131288, + "grad_norm": 10.3329281536301, + "learning_rate": 4.9955375040670956e-05, + "loss": 2.3182, + "mean_token_accuracy": 0.42413792610168455, + "step": 68555 + }, + { + "epoch": 0.06905436016441706, + "grad_norm": 13.747447509805834, + "learning_rate": 4.995535144925189e-05, + "loss": 2.8041, + "mean_token_accuracy": 0.36896551251411436, + "step": 68560 + }, + { + "epoch": 0.06905939621752123, + "grad_norm": 13.445670757314042, + "learning_rate": 4.995532785160475e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.3827586233615875, + "step": 68565 + }, + { + "epoch": 0.0690644322706254, + "grad_norm": 9.686006047570606, + "learning_rate": 4.995530424772955e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.3724137932062149, + "step": 68570 + }, + { + "epoch": 0.06906946832372958, + "grad_norm": 10.238154704688407, + "learning_rate": 4.9955280637626275e-05, + "loss": 2.6881, + "mean_token_accuracy": 0.4000000059604645, + "step": 68575 + }, + { + "epoch": 0.06907450437683375, + "grad_norm": 10.636075214163679, + "learning_rate": 4.995525702129495e-05, + "loss": 2.3645, + "mean_token_accuracy": 0.3999999940395355, + "step": 68580 + }, + { + "epoch": 0.06907954042993793, + "grad_norm": 13.234580379916961, + "learning_rate": 4.995523339873557e-05, + "loss": 2.6993, + "mean_token_accuracy": 0.3862069010734558, + "step": 68585 + }, + { + "epoch": 0.0690845764830421, + "grad_norm": 13.167188487716343, + "learning_rate": 4.9955209769948155e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4310344815254211, + "step": 68590 + }, + { + "epoch": 0.06908961253614627, + "grad_norm": 11.08968255253257, + "learning_rate": 4.99551861349327e-05, + "loss": 2.8021, + "mean_token_accuracy": 0.37586207389831544, + "step": 68595 + }, + { + "epoch": 0.06909464858925045, + "grad_norm": 10.957593880194658, + "learning_rate": 4.9955162493689225e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.3620689630508423, + "step": 68600 + }, + { + "epoch": 0.06909968464235462, + "grad_norm": 9.444082987842588, + "learning_rate": 4.995513884621771e-05, + "loss": 2.6809, + "mean_token_accuracy": 0.3880822777748108, + "step": 68605 + }, + { + "epoch": 0.0691047206954588, + "grad_norm": 10.452925842264674, + "learning_rate": 4.9955115192518195e-05, + "loss": 2.607, + "mean_token_accuracy": 0.3931034505367279, + "step": 68610 + }, + { + "epoch": 0.06910975674856296, + "grad_norm": 11.284994723468822, + "learning_rate": 4.9955091532590655e-05, + "loss": 2.6513, + "mean_token_accuracy": 0.3862069010734558, + "step": 68615 + }, + { + "epoch": 0.06911479280166713, + "grad_norm": 11.455402326753806, + "learning_rate": 4.995506786643512e-05, + "loss": 2.5495, + "mean_token_accuracy": 0.43448275327682495, + "step": 68620 + }, + { + "epoch": 0.0691198288547713, + "grad_norm": 12.156602715431191, + "learning_rate": 4.995504419405159e-05, + "loss": 2.1219, + "mean_token_accuracy": 0.4310344815254211, + "step": 68625 + }, + { + "epoch": 0.06912486490787548, + "grad_norm": 14.44076571141514, + "learning_rate": 4.995502051544007e-05, + "loss": 2.6554, + "mean_token_accuracy": 0.3931034505367279, + "step": 68630 + }, + { + "epoch": 0.06912990096097965, + "grad_norm": 9.331211057562246, + "learning_rate": 4.9954996830600557e-05, + "loss": 2.6234, + "mean_token_accuracy": 0.4137930989265442, + "step": 68635 + }, + { + "epoch": 0.06913493701408382, + "grad_norm": 13.387125275798537, + "learning_rate": 4.995497313953307e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.3896551728248596, + "step": 68640 + }, + { + "epoch": 0.069139973067188, + "grad_norm": 11.614357486543609, + "learning_rate": 4.995494944223761e-05, + "loss": 2.8698, + "mean_token_accuracy": 0.37586206793785093, + "step": 68645 + }, + { + "epoch": 0.06914500912029217, + "grad_norm": 10.2091869982369, + "learning_rate": 4.99549257387142e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.47586206197738645, + "step": 68650 + }, + { + "epoch": 0.06915004517339635, + "grad_norm": 12.294449936209256, + "learning_rate": 4.995490202896281e-05, + "loss": 2.1653, + "mean_token_accuracy": 0.4620689690113068, + "step": 68655 + }, + { + "epoch": 0.06915508122650052, + "grad_norm": 11.628665868012853, + "learning_rate": 4.99548783129835e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.3896551728248596, + "step": 68660 + }, + { + "epoch": 0.0691601172796047, + "grad_norm": 10.41633871684809, + "learning_rate": 4.995485459077622e-05, + "loss": 2.4332, + "mean_token_accuracy": 0.42758620977401735, + "step": 68665 + }, + { + "epoch": 0.06916515333270887, + "grad_norm": 11.501497858255746, + "learning_rate": 4.995483086234101e-05, + "loss": 2.6771, + "mean_token_accuracy": 0.38275861740112305, + "step": 68670 + }, + { + "epoch": 0.06917018938581304, + "grad_norm": 10.792549804645974, + "learning_rate": 4.995480712767787e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.4593596041202545, + "step": 68675 + }, + { + "epoch": 0.06917522543891721, + "grad_norm": 11.947470278700079, + "learning_rate": 4.99547833867868e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.40689656138420105, + "step": 68680 + }, + { + "epoch": 0.06918026149202137, + "grad_norm": 14.180908310440948, + "learning_rate": 4.995475963966782e-05, + "loss": 2.8306, + "mean_token_accuracy": 0.38620689511299133, + "step": 68685 + }, + { + "epoch": 0.06918529754512555, + "grad_norm": 12.015600515394084, + "learning_rate": 4.995473588632093e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.41034482717514037, + "step": 68690 + }, + { + "epoch": 0.06919033359822972, + "grad_norm": 10.326587450970427, + "learning_rate": 4.9954712126746125e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.4223230481147766, + "step": 68695 + }, + { + "epoch": 0.0691953696513339, + "grad_norm": 10.907207101792315, + "learning_rate": 4.9954688360943424e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.3979431390762329, + "step": 68700 + }, + { + "epoch": 0.06920040570443807, + "grad_norm": 12.443756143362155, + "learning_rate": 4.995466458891284e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.41034482717514037, + "step": 68705 + }, + { + "epoch": 0.06920544175754224, + "grad_norm": 11.355663241090976, + "learning_rate": 4.995464081065437e-05, + "loss": 2.411, + "mean_token_accuracy": 0.42758620381355283, + "step": 68710 + }, + { + "epoch": 0.06921047781064642, + "grad_norm": 12.154822167650508, + "learning_rate": 4.9954617026168017e-05, + "loss": 2.5309, + "mean_token_accuracy": 0.41034482717514037, + "step": 68715 + }, + { + "epoch": 0.06921551386375059, + "grad_norm": 17.316687640054468, + "learning_rate": 4.995459323545379e-05, + "loss": 2.456, + "mean_token_accuracy": 0.3793103456497192, + "step": 68720 + }, + { + "epoch": 0.06922054991685476, + "grad_norm": 11.649141479761466, + "learning_rate": 4.995456943851171e-05, + "loss": 2.6445, + "mean_token_accuracy": 0.3655172407627106, + "step": 68725 + }, + { + "epoch": 0.06922558596995894, + "grad_norm": 16.368233549752873, + "learning_rate": 4.9954545635341755e-05, + "loss": 2.8127, + "mean_token_accuracy": 0.38275861740112305, + "step": 68730 + }, + { + "epoch": 0.06923062202306311, + "grad_norm": 13.01168856137935, + "learning_rate": 4.9954521825943956e-05, + "loss": 2.399, + "mean_token_accuracy": 0.41724138259887694, + "step": 68735 + }, + { + "epoch": 0.06923565807616729, + "grad_norm": 11.763145816658948, + "learning_rate": 4.995449801031831e-05, + "loss": 2.6363, + "mean_token_accuracy": 0.40344826579093934, + "step": 68740 + }, + { + "epoch": 0.06924069412927146, + "grad_norm": 12.234524227982364, + "learning_rate": 4.9954474188464824e-05, + "loss": 2.9011, + "mean_token_accuracy": 0.3911070764064789, + "step": 68745 + }, + { + "epoch": 0.06924573018237563, + "grad_norm": 10.43238559814186, + "learning_rate": 4.995445036038351e-05, + "loss": 2.4521, + "mean_token_accuracy": 0.3896551728248596, + "step": 68750 + }, + { + "epoch": 0.0692507662354798, + "grad_norm": 12.467078694985444, + "learning_rate": 4.995442652607437e-05, + "loss": 2.8374, + "mean_token_accuracy": 0.358620685338974, + "step": 68755 + }, + { + "epoch": 0.06925580228858397, + "grad_norm": 14.385823381623696, + "learning_rate": 4.9954402685537404e-05, + "loss": 2.8, + "mean_token_accuracy": 0.38965516686439516, + "step": 68760 + }, + { + "epoch": 0.06926083834168814, + "grad_norm": 15.04342781232573, + "learning_rate": 4.995437883877263e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.4517241358757019, + "step": 68765 + }, + { + "epoch": 0.06926587439479232, + "grad_norm": 10.560124977734038, + "learning_rate": 4.995435498578005e-05, + "loss": 2.2557, + "mean_token_accuracy": 0.45862069725990295, + "step": 68770 + }, + { + "epoch": 0.06927091044789649, + "grad_norm": 10.450295484171619, + "learning_rate": 4.995433112655968e-05, + "loss": 2.6005, + "mean_token_accuracy": 0.37586206793785093, + "step": 68775 + }, + { + "epoch": 0.06927594650100066, + "grad_norm": 12.078299774100959, + "learning_rate": 4.99543072611115e-05, + "loss": 2.454, + "mean_token_accuracy": 0.42758620381355283, + "step": 68780 + }, + { + "epoch": 0.06928098255410484, + "grad_norm": 13.698929098045454, + "learning_rate": 4.9954283389435545e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.45051422119140627, + "step": 68785 + }, + { + "epoch": 0.06928601860720901, + "grad_norm": 11.01010567253589, + "learning_rate": 4.9954259511531815e-05, + "loss": 2.6219, + "mean_token_accuracy": 0.38620689511299133, + "step": 68790 + }, + { + "epoch": 0.06929105466031318, + "grad_norm": 9.777383742958852, + "learning_rate": 4.995423562740031e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.41530550122261045, + "step": 68795 + }, + { + "epoch": 0.06929609071341736, + "grad_norm": 9.150017781487204, + "learning_rate": 4.995421173704103e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.42413792610168455, + "step": 68800 + }, + { + "epoch": 0.06930112676652153, + "grad_norm": 10.855660054710444, + "learning_rate": 4.9954187840453996e-05, + "loss": 2.4523, + "mean_token_accuracy": 0.41034482717514037, + "step": 68805 + }, + { + "epoch": 0.0693061628196257, + "grad_norm": 10.884763382822344, + "learning_rate": 4.995416393763922e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.3517241358757019, + "step": 68810 + }, + { + "epoch": 0.06931119887272988, + "grad_norm": 10.917681794814634, + "learning_rate": 4.995414002859668e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.3827586263418198, + "step": 68815 + }, + { + "epoch": 0.06931623492583405, + "grad_norm": 16.430117647235722, + "learning_rate": 4.99541161133264e-05, + "loss": 2.6375, + "mean_token_accuracy": 0.35172414481639863, + "step": 68820 + }, + { + "epoch": 0.06932127097893821, + "grad_norm": 12.930266203709143, + "learning_rate": 4.99540921918284e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.42068966031074523, + "step": 68825 + }, + { + "epoch": 0.06932630703204239, + "grad_norm": 9.73922959309583, + "learning_rate": 4.995406826410267e-05, + "loss": 2.6668, + "mean_token_accuracy": 0.4360556542873383, + "step": 68830 + }, + { + "epoch": 0.06933134308514656, + "grad_norm": 11.414551492463593, + "learning_rate": 4.995404433014922e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.4206896543502808, + "step": 68835 + }, + { + "epoch": 0.06933637913825073, + "grad_norm": 12.034953929478462, + "learning_rate": 4.995402038996805e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.42413792610168455, + "step": 68840 + }, + { + "epoch": 0.06934141519135491, + "grad_norm": 11.870792432328185, + "learning_rate": 4.995399644355919e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.4068965554237366, + "step": 68845 + }, + { + "epoch": 0.06934645124445908, + "grad_norm": 11.558940644956172, + "learning_rate": 4.995397249092261e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.47586206197738645, + "step": 68850 + }, + { + "epoch": 0.06935148729756326, + "grad_norm": 10.759807408604575, + "learning_rate": 4.995394853205835e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.4151845157146454, + "step": 68855 + }, + { + "epoch": 0.06935652335066743, + "grad_norm": 10.330986829314917, + "learning_rate": 4.9953924566966404e-05, + "loss": 2.4523, + "mean_token_accuracy": 0.38620689511299133, + "step": 68860 + }, + { + "epoch": 0.0693615594037716, + "grad_norm": 10.753541205291183, + "learning_rate": 4.995390059564677e-05, + "loss": 2.6479, + "mean_token_accuracy": 0.4068965554237366, + "step": 68865 + }, + { + "epoch": 0.06936659545687578, + "grad_norm": 11.073421025987594, + "learning_rate": 4.9953876618099475e-05, + "loss": 2.1975, + "mean_token_accuracy": 0.42758620977401735, + "step": 68870 + }, + { + "epoch": 0.06937163150997995, + "grad_norm": 11.213531552285495, + "learning_rate": 4.9953852634324514e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.4206896543502808, + "step": 68875 + }, + { + "epoch": 0.06937666756308412, + "grad_norm": 11.725228726465163, + "learning_rate": 4.9953828644321875e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.36896551847457887, + "step": 68880 + }, + { + "epoch": 0.0693817036161883, + "grad_norm": 11.068580729639695, + "learning_rate": 4.995380464809159e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.42068966031074523, + "step": 68885 + }, + { + "epoch": 0.06938673966929247, + "grad_norm": 11.913123053052246, + "learning_rate": 4.995378064563367e-05, + "loss": 2.5933, + "mean_token_accuracy": 0.41724138259887694, + "step": 68890 + }, + { + "epoch": 0.06939177572239663, + "grad_norm": 10.090095709382323, + "learning_rate": 4.9953756636948104e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.42758620977401735, + "step": 68895 + }, + { + "epoch": 0.0693968117755008, + "grad_norm": 11.615033413924445, + "learning_rate": 4.9953732622034904e-05, + "loss": 2.2036, + "mean_token_accuracy": 0.4503327250480652, + "step": 68900 + }, + { + "epoch": 0.06940184782860498, + "grad_norm": 16.901866636531842, + "learning_rate": 4.995370860089408e-05, + "loss": 2.8175, + "mean_token_accuracy": 0.33448275923728943, + "step": 68905 + }, + { + "epoch": 0.06940688388170915, + "grad_norm": 13.989519230748044, + "learning_rate": 4.995368457352563e-05, + "loss": 2.1626, + "mean_token_accuracy": 0.44827585816383364, + "step": 68910 + }, + { + "epoch": 0.06941191993481333, + "grad_norm": 11.210757881121143, + "learning_rate": 4.995366053992957e-05, + "loss": 2.7814, + "mean_token_accuracy": 0.39993950724601746, + "step": 68915 + }, + { + "epoch": 0.0694169559879175, + "grad_norm": 13.854054188760335, + "learning_rate": 4.995363650010589e-05, + "loss": 2.7845, + "mean_token_accuracy": 0.358620685338974, + "step": 68920 + }, + { + "epoch": 0.06942199204102167, + "grad_norm": 11.14041695564461, + "learning_rate": 4.9953612454054636e-05, + "loss": 2.4939, + "mean_token_accuracy": 0.38620689511299133, + "step": 68925 + }, + { + "epoch": 0.06942702809412585, + "grad_norm": 14.482412728840401, + "learning_rate": 4.995358840177577e-05, + "loss": 2.9452, + "mean_token_accuracy": 0.3965517282485962, + "step": 68930 + }, + { + "epoch": 0.06943206414723002, + "grad_norm": 11.06778317753632, + "learning_rate": 4.995356434326933e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4379310429096222, + "step": 68935 + }, + { + "epoch": 0.0694371002003342, + "grad_norm": 11.456746249681887, + "learning_rate": 4.99535402785353e-05, + "loss": 3.2143, + "mean_token_accuracy": 0.317241370677948, + "step": 68940 + }, + { + "epoch": 0.06944213625343837, + "grad_norm": 11.641652918004821, + "learning_rate": 4.9953516207573703e-05, + "loss": 2.6824, + "mean_token_accuracy": 0.39655172228813174, + "step": 68945 + }, + { + "epoch": 0.06944717230654254, + "grad_norm": 11.379517999043522, + "learning_rate": 4.9953492130384534e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.4000000059604645, + "step": 68950 + }, + { + "epoch": 0.06945220835964672, + "grad_norm": 9.640750488301148, + "learning_rate": 4.99534680469678e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.458620685338974, + "step": 68955 + }, + { + "epoch": 0.06945724441275089, + "grad_norm": 14.3951908657317, + "learning_rate": 4.995344395732353e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.3931034505367279, + "step": 68960 + }, + { + "epoch": 0.06946228046585505, + "grad_norm": 12.58434918281384, + "learning_rate": 4.9953419861451705e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.44827585816383364, + "step": 68965 + }, + { + "epoch": 0.06946731651895922, + "grad_norm": 17.206526493849143, + "learning_rate": 4.995339575935234e-05, + "loss": 2.9202, + "mean_token_accuracy": 0.34137930870056155, + "step": 68970 + }, + { + "epoch": 0.0694723525720634, + "grad_norm": 12.553841148156884, + "learning_rate": 4.995337165102544e-05, + "loss": 2.4965, + "mean_token_accuracy": 0.37586206793785093, + "step": 68975 + }, + { + "epoch": 0.06947738862516757, + "grad_norm": 13.399813701867968, + "learning_rate": 4.995334753647102e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.44150246381759645, + "step": 68980 + }, + { + "epoch": 0.06948242467827175, + "grad_norm": 10.10172739297303, + "learning_rate": 4.9953323415689074e-05, + "loss": 2.4994, + "mean_token_accuracy": 0.4241379380226135, + "step": 68985 + }, + { + "epoch": 0.06948746073137592, + "grad_norm": 12.208975771978402, + "learning_rate": 4.995329928867961e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.42413793206214906, + "step": 68990 + }, + { + "epoch": 0.06949249678448009, + "grad_norm": 16.570529539590613, + "learning_rate": 4.9953275155442655e-05, + "loss": 2.3356, + "mean_token_accuracy": 0.4172413766384125, + "step": 68995 + }, + { + "epoch": 0.06949753283758427, + "grad_norm": 10.433911929830472, + "learning_rate": 4.9953251015978186e-05, + "loss": 2.4537, + "mean_token_accuracy": 0.44482758045196535, + "step": 69000 + }, + { + "epoch": 0.06950256889068844, + "grad_norm": 13.250607140260652, + "learning_rate": 4.995322687028623e-05, + "loss": 2.6566, + "mean_token_accuracy": 0.3862069010734558, + "step": 69005 + }, + { + "epoch": 0.06950760494379261, + "grad_norm": 14.175663597725457, + "learning_rate": 4.9953202718366786e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.41034482717514037, + "step": 69010 + }, + { + "epoch": 0.06951264099689679, + "grad_norm": 10.254019031143196, + "learning_rate": 4.995317856021987e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.46896551847457885, + "step": 69015 + }, + { + "epoch": 0.06951767705000096, + "grad_norm": 14.30277014771815, + "learning_rate": 4.9953154395845475e-05, + "loss": 2.7011, + "mean_token_accuracy": 0.37241379022598264, + "step": 69020 + }, + { + "epoch": 0.06952271310310514, + "grad_norm": 11.70104131128868, + "learning_rate": 4.995313022524361e-05, + "loss": 2.7611, + "mean_token_accuracy": 0.38620689511299133, + "step": 69025 + }, + { + "epoch": 0.06952774915620931, + "grad_norm": 10.253303509416424, + "learning_rate": 4.9953106048414293e-05, + "loss": 2.584, + "mean_token_accuracy": 0.4034482777118683, + "step": 69030 + }, + { + "epoch": 0.06953278520931347, + "grad_norm": 11.874387052025757, + "learning_rate": 4.995308186535752e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.40344828367233276, + "step": 69035 + }, + { + "epoch": 0.06953782126241764, + "grad_norm": 11.512632384894767, + "learning_rate": 4.99530576760733e-05, + "loss": 2.6066, + "mean_token_accuracy": 0.4398064136505127, + "step": 69040 + }, + { + "epoch": 0.06954285731552182, + "grad_norm": 11.610155950263424, + "learning_rate": 4.995303348056164e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.39655172228813174, + "step": 69045 + }, + { + "epoch": 0.06954789336862599, + "grad_norm": 9.889454074803922, + "learning_rate": 4.995300927882256e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.43103448748588563, + "step": 69050 + }, + { + "epoch": 0.06955292942173016, + "grad_norm": 11.308732888993708, + "learning_rate": 4.9952985070856034e-05, + "loss": 2.5873, + "mean_token_accuracy": 0.4, + "step": 69055 + }, + { + "epoch": 0.06955796547483434, + "grad_norm": 10.879324492961441, + "learning_rate": 4.99529608566621e-05, + "loss": 2.4889, + "mean_token_accuracy": 0.44137930274009707, + "step": 69060 + }, + { + "epoch": 0.06956300152793851, + "grad_norm": 12.151599275915636, + "learning_rate": 4.995293663624075e-05, + "loss": 2.7146, + "mean_token_accuracy": 0.36896551847457887, + "step": 69065 + }, + { + "epoch": 0.06956803758104269, + "grad_norm": 13.889498073531236, + "learning_rate": 4.9952912409592e-05, + "loss": 2.8549, + "mean_token_accuracy": 0.37241379618644715, + "step": 69070 + }, + { + "epoch": 0.06957307363414686, + "grad_norm": 12.177187852860959, + "learning_rate": 4.995288817671584e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.41724138259887694, + "step": 69075 + }, + { + "epoch": 0.06957810968725103, + "grad_norm": 12.384731382360865, + "learning_rate": 4.99528639376123e-05, + "loss": 2.3653, + "mean_token_accuracy": 0.42413793206214906, + "step": 69080 + }, + { + "epoch": 0.0695831457403552, + "grad_norm": 10.976640050596247, + "learning_rate": 4.995283969228137e-05, + "loss": 3.1003, + "mean_token_accuracy": 0.3655172407627106, + "step": 69085 + }, + { + "epoch": 0.06958818179345938, + "grad_norm": 12.026789175310098, + "learning_rate": 4.9952815440723055e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.4344827592372894, + "step": 69090 + }, + { + "epoch": 0.06959321784656355, + "grad_norm": 12.121541156799083, + "learning_rate": 4.9952791182937374e-05, + "loss": 2.5274, + "mean_token_accuracy": 0.41034482717514037, + "step": 69095 + }, + { + "epoch": 0.06959825389966773, + "grad_norm": 14.927715533335313, + "learning_rate": 4.995276691892433e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.38275861740112305, + "step": 69100 + }, + { + "epoch": 0.06960328995277189, + "grad_norm": 10.265588983489966, + "learning_rate": 4.995274264868392e-05, + "loss": 2.5255, + "mean_token_accuracy": 0.4310344815254211, + "step": 69105 + }, + { + "epoch": 0.06960832600587606, + "grad_norm": 9.780438737432963, + "learning_rate": 4.995271837221616e-05, + "loss": 2.429, + "mean_token_accuracy": 0.4329703629016876, + "step": 69110 + }, + { + "epoch": 0.06961336205898024, + "grad_norm": 15.03281090190755, + "learning_rate": 4.995269408952106e-05, + "loss": 2.4947, + "mean_token_accuracy": 0.38965516686439516, + "step": 69115 + }, + { + "epoch": 0.06961839811208441, + "grad_norm": 9.484920805830832, + "learning_rate": 4.995266980059862e-05, + "loss": 2.5013, + "mean_token_accuracy": 0.4034482717514038, + "step": 69120 + }, + { + "epoch": 0.06962343416518858, + "grad_norm": 13.915142028456298, + "learning_rate": 4.995264550544885e-05, + "loss": 2.2544, + "mean_token_accuracy": 0.4379310250282288, + "step": 69125 + }, + { + "epoch": 0.06962847021829276, + "grad_norm": 13.229519668757016, + "learning_rate": 4.995262120407175e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.42499999403953553, + "step": 69130 + }, + { + "epoch": 0.06963350627139693, + "grad_norm": 11.68384412914266, + "learning_rate": 4.995259689646733e-05, + "loss": 2.7728, + "mean_token_accuracy": 0.3551724135875702, + "step": 69135 + }, + { + "epoch": 0.0696385423245011, + "grad_norm": 15.215890104803373, + "learning_rate": 4.99525725826356e-05, + "loss": 3.0822, + "mean_token_accuracy": 0.32758620083332063, + "step": 69140 + }, + { + "epoch": 0.06964357837760528, + "grad_norm": 10.178072054726151, + "learning_rate": 4.9952548262576566e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.4862069010734558, + "step": 69145 + }, + { + "epoch": 0.06964861443070945, + "grad_norm": 12.110112671056603, + "learning_rate": 4.995252393629024e-05, + "loss": 2.7521, + "mean_token_accuracy": 0.3793103456497192, + "step": 69150 + }, + { + "epoch": 0.06965365048381363, + "grad_norm": 14.311716667498578, + "learning_rate": 4.9952499603776615e-05, + "loss": 2.3517, + "mean_token_accuracy": 0.4068965554237366, + "step": 69155 + }, + { + "epoch": 0.0696586865369178, + "grad_norm": 10.50301247360131, + "learning_rate": 4.9952475265035706e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.44827585816383364, + "step": 69160 + }, + { + "epoch": 0.06966372259002197, + "grad_norm": 10.910908198938966, + "learning_rate": 4.9952450920067526e-05, + "loss": 2.4826, + "mean_token_accuracy": 0.3965517282485962, + "step": 69165 + }, + { + "epoch": 0.06966875864312615, + "grad_norm": 12.543590583091008, + "learning_rate": 4.9952426568872075e-05, + "loss": 2.8059, + "mean_token_accuracy": 0.37586207389831544, + "step": 69170 + }, + { + "epoch": 0.0696737946962303, + "grad_norm": 10.17003951953099, + "learning_rate": 4.995240221144935e-05, + "loss": 2.6055, + "mean_token_accuracy": 0.3931034505367279, + "step": 69175 + }, + { + "epoch": 0.06967883074933448, + "grad_norm": 11.175095015564159, + "learning_rate": 4.9952377847799366e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.44827585220336913, + "step": 69180 + }, + { + "epoch": 0.06968386680243865, + "grad_norm": 11.395416032224416, + "learning_rate": 4.995235347792214e-05, + "loss": 2.4825, + "mean_token_accuracy": 0.41724138259887694, + "step": 69185 + }, + { + "epoch": 0.06968890285554283, + "grad_norm": 12.152853753237443, + "learning_rate": 4.9952329101817676e-05, + "loss": 2.5999, + "mean_token_accuracy": 0.42758620977401735, + "step": 69190 + }, + { + "epoch": 0.069693938908647, + "grad_norm": 10.979722187268564, + "learning_rate": 4.995230471948596e-05, + "loss": 2.0078, + "mean_token_accuracy": 0.48094373345375063, + "step": 69195 + }, + { + "epoch": 0.06969897496175118, + "grad_norm": 11.643808166754644, + "learning_rate": 4.995228033092703e-05, + "loss": 2.7891, + "mean_token_accuracy": 0.41034482717514037, + "step": 69200 + }, + { + "epoch": 0.06970401101485535, + "grad_norm": 12.59336213559089, + "learning_rate": 4.995225593614086e-05, + "loss": 2.663, + "mean_token_accuracy": 0.36206896007061007, + "step": 69205 + }, + { + "epoch": 0.06970904706795952, + "grad_norm": 11.353422203876665, + "learning_rate": 4.995223153512748e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.4068965494632721, + "step": 69210 + }, + { + "epoch": 0.0697140831210637, + "grad_norm": 11.891655055457258, + "learning_rate": 4.995220712788689e-05, + "loss": 2.6296, + "mean_token_accuracy": 0.4034482717514038, + "step": 69215 + }, + { + "epoch": 0.06971911917416787, + "grad_norm": 12.16609974850152, + "learning_rate": 4.9952182714419094e-05, + "loss": 2.693, + "mean_token_accuracy": 0.4137930989265442, + "step": 69220 + }, + { + "epoch": 0.06972415522727204, + "grad_norm": 10.697233800366224, + "learning_rate": 4.99521582947241e-05, + "loss": 2.3711, + "mean_token_accuracy": 0.4241379380226135, + "step": 69225 + }, + { + "epoch": 0.06972919128037622, + "grad_norm": 10.90460049265713, + "learning_rate": 4.995213386880192e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.4620689630508423, + "step": 69230 + }, + { + "epoch": 0.06973422733348039, + "grad_norm": 10.046911844460293, + "learning_rate": 4.9952109436652556e-05, + "loss": 2.855, + "mean_token_accuracy": 0.4068965554237366, + "step": 69235 + }, + { + "epoch": 0.06973926338658457, + "grad_norm": 11.284415791745777, + "learning_rate": 4.995208499827602e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.4206896543502808, + "step": 69240 + }, + { + "epoch": 0.06974429943968873, + "grad_norm": 13.568014567785893, + "learning_rate": 4.995206055367231e-05, + "loss": 2.6642, + "mean_token_accuracy": 0.38620689511299133, + "step": 69245 + }, + { + "epoch": 0.0697493354927929, + "grad_norm": 31.495450284490325, + "learning_rate": 4.995203610284144e-05, + "loss": 2.6996, + "mean_token_accuracy": 0.43448275327682495, + "step": 69250 + }, + { + "epoch": 0.06975437154589707, + "grad_norm": 11.330954547460001, + "learning_rate": 4.995201164578341e-05, + "loss": 2.7512, + "mean_token_accuracy": 0.4137930989265442, + "step": 69255 + }, + { + "epoch": 0.06975940759900125, + "grad_norm": 13.391847941845976, + "learning_rate": 4.995198718249823e-05, + "loss": 2.2322, + "mean_token_accuracy": 0.443993604183197, + "step": 69260 + }, + { + "epoch": 0.06976444365210542, + "grad_norm": 11.052345841105538, + "learning_rate": 4.995196271298591e-05, + "loss": 2.1857, + "mean_token_accuracy": 0.46551724672317507, + "step": 69265 + }, + { + "epoch": 0.0697694797052096, + "grad_norm": 9.670145462510254, + "learning_rate": 4.995193823724646e-05, + "loss": 2.4563, + "mean_token_accuracy": 0.40098522007465365, + "step": 69270 + }, + { + "epoch": 0.06977451575831377, + "grad_norm": 13.39834029234346, + "learning_rate": 4.995191375527987e-05, + "loss": 2.4028, + "mean_token_accuracy": 0.43266787827014924, + "step": 69275 + }, + { + "epoch": 0.06977955181141794, + "grad_norm": 12.509633988924094, + "learning_rate": 4.995188926708617e-05, + "loss": 2.9001, + "mean_token_accuracy": 0.3448275804519653, + "step": 69280 + }, + { + "epoch": 0.06978458786452212, + "grad_norm": 10.904922590481267, + "learning_rate": 4.9951864772665345e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.38965517580509185, + "step": 69285 + }, + { + "epoch": 0.06978962391762629, + "grad_norm": 11.899745762346427, + "learning_rate": 4.995184027201742e-05, + "loss": 2.7106, + "mean_token_accuracy": 0.36551724672317504, + "step": 69290 + }, + { + "epoch": 0.06979465997073046, + "grad_norm": 13.124551297736753, + "learning_rate": 4.995181576514239e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.4689655125141144, + "step": 69295 + }, + { + "epoch": 0.06979969602383464, + "grad_norm": 12.832785151138568, + "learning_rate": 4.995179125204026e-05, + "loss": 2.7844, + "mean_token_accuracy": 0.3655172407627106, + "step": 69300 + }, + { + "epoch": 0.06980473207693881, + "grad_norm": 11.834963449405778, + "learning_rate": 4.9951766732711044e-05, + "loss": 2.4623, + "mean_token_accuracy": 0.441379314661026, + "step": 69305 + }, + { + "epoch": 0.06980976813004298, + "grad_norm": 11.806922302441444, + "learning_rate": 4.995174220715475e-05, + "loss": 2.3533, + "mean_token_accuracy": 0.4344827592372894, + "step": 69310 + }, + { + "epoch": 0.06981480418314714, + "grad_norm": 14.02394715771266, + "learning_rate": 4.9951717675371385e-05, + "loss": 2.4274, + "mean_token_accuracy": 0.42413792908191683, + "step": 69315 + }, + { + "epoch": 0.06981984023625132, + "grad_norm": 9.243206114209983, + "learning_rate": 4.995169313736094e-05, + "loss": 2.0182, + "mean_token_accuracy": 0.4965517222881317, + "step": 69320 + }, + { + "epoch": 0.06982487628935549, + "grad_norm": 12.386063511544483, + "learning_rate": 4.995166859312344e-05, + "loss": 3.052, + "mean_token_accuracy": 0.37241379618644715, + "step": 69325 + }, + { + "epoch": 0.06982991234245967, + "grad_norm": 11.798422481080193, + "learning_rate": 4.995164404265889e-05, + "loss": 2.4693, + "mean_token_accuracy": 0.42068966031074523, + "step": 69330 + }, + { + "epoch": 0.06983494839556384, + "grad_norm": 12.73035962158462, + "learning_rate": 4.995161948596729e-05, + "loss": 2.5947, + "mean_token_accuracy": 0.3172413736581802, + "step": 69335 + }, + { + "epoch": 0.06983998444866801, + "grad_norm": 13.955569574120261, + "learning_rate": 4.995159492304865e-05, + "loss": 2.5831, + "mean_token_accuracy": 0.39655172228813174, + "step": 69340 + }, + { + "epoch": 0.06984502050177219, + "grad_norm": 13.94063524864331, + "learning_rate": 4.995157035390298e-05, + "loss": 2.728, + "mean_token_accuracy": 0.38620689511299133, + "step": 69345 + }, + { + "epoch": 0.06985005655487636, + "grad_norm": 9.72088114737023, + "learning_rate": 4.995154577853027e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.42196006774902345, + "step": 69350 + }, + { + "epoch": 0.06985509260798053, + "grad_norm": 14.947868423690267, + "learning_rate": 4.9951521196930554e-05, + "loss": 2.8758, + "mean_token_accuracy": 0.31724138259887696, + "step": 69355 + }, + { + "epoch": 0.06986012866108471, + "grad_norm": 11.663484283600335, + "learning_rate": 4.995149660910382e-05, + "loss": 2.658, + "mean_token_accuracy": 0.3843920111656189, + "step": 69360 + }, + { + "epoch": 0.06986516471418888, + "grad_norm": 11.765140212739167, + "learning_rate": 4.9951472015050076e-05, + "loss": 2.3417, + "mean_token_accuracy": 0.4413793087005615, + "step": 69365 + }, + { + "epoch": 0.06987020076729306, + "grad_norm": 12.267805124783123, + "learning_rate": 4.995144741476934e-05, + "loss": 2.6408, + "mean_token_accuracy": 0.3931034505367279, + "step": 69370 + }, + { + "epoch": 0.06987523682039723, + "grad_norm": 12.304540046812004, + "learning_rate": 4.9951422808261604e-05, + "loss": 2.5695, + "mean_token_accuracy": 0.38275861740112305, + "step": 69375 + }, + { + "epoch": 0.0698802728735014, + "grad_norm": 11.243892863772443, + "learning_rate": 4.995139819552689e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.42413792610168455, + "step": 69380 + }, + { + "epoch": 0.06988530892660556, + "grad_norm": 12.225143773666671, + "learning_rate": 4.9951373576565194e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.3965517163276672, + "step": 69385 + }, + { + "epoch": 0.06989034497970974, + "grad_norm": 11.262541976423735, + "learning_rate": 4.9951348951376526e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.4344827592372894, + "step": 69390 + }, + { + "epoch": 0.06989538103281391, + "grad_norm": 10.27466864976267, + "learning_rate": 4.995132431996089e-05, + "loss": 2.8771, + "mean_token_accuracy": 0.33103448450565337, + "step": 69395 + }, + { + "epoch": 0.06990041708591808, + "grad_norm": 11.977076679845837, + "learning_rate": 4.995129968231829e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.3655172407627106, + "step": 69400 + }, + { + "epoch": 0.06990545313902226, + "grad_norm": 10.195777546517574, + "learning_rate": 4.9951275038448745e-05, + "loss": 2.2, + "mean_token_accuracy": 0.44827585816383364, + "step": 69405 + }, + { + "epoch": 0.06991048919212643, + "grad_norm": 10.907084953414586, + "learning_rate": 4.9951250388352254e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.3896551728248596, + "step": 69410 + }, + { + "epoch": 0.0699155252452306, + "grad_norm": 16.30838130408925, + "learning_rate": 4.995122573202883e-05, + "loss": 2.8221, + "mean_token_accuracy": 0.37241379022598264, + "step": 69415 + }, + { + "epoch": 0.06992056129833478, + "grad_norm": 11.022869358334763, + "learning_rate": 4.9951201069478474e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.42601330280303956, + "step": 69420 + }, + { + "epoch": 0.06992559735143895, + "grad_norm": 11.256428626658082, + "learning_rate": 4.9951176400701185e-05, + "loss": 2.4913, + "mean_token_accuracy": 0.4294010907411575, + "step": 69425 + }, + { + "epoch": 0.06993063340454313, + "grad_norm": 18.402258974283903, + "learning_rate": 4.9951151725696985e-05, + "loss": 2.6719, + "mean_token_accuracy": 0.37241379618644715, + "step": 69430 + }, + { + "epoch": 0.0699356694576473, + "grad_norm": 11.789977862035215, + "learning_rate": 4.995112704446587e-05, + "loss": 1.7707, + "mean_token_accuracy": 0.5540834784507751, + "step": 69435 + }, + { + "epoch": 0.06994070551075147, + "grad_norm": 13.78246144306605, + "learning_rate": 4.995110235700786e-05, + "loss": 2.3375, + "mean_token_accuracy": 0.43793103098869324, + "step": 69440 + }, + { + "epoch": 0.06994574156385565, + "grad_norm": 16.486755208211243, + "learning_rate": 4.995107766332294e-05, + "loss": 2.7606, + "mean_token_accuracy": 0.34137930274009703, + "step": 69445 + }, + { + "epoch": 0.06995077761695982, + "grad_norm": 11.060542696042813, + "learning_rate": 4.995105296341114e-05, + "loss": 2.726, + "mean_token_accuracy": 0.4034482777118683, + "step": 69450 + }, + { + "epoch": 0.06995581367006398, + "grad_norm": 12.35527957243082, + "learning_rate": 4.9951028257272454e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.41034482717514037, + "step": 69455 + }, + { + "epoch": 0.06996084972316816, + "grad_norm": 14.938953210635248, + "learning_rate": 4.995100354490689e-05, + "loss": 2.6991, + "mean_token_accuracy": 0.41034482717514037, + "step": 69460 + }, + { + "epoch": 0.06996588577627233, + "grad_norm": 10.484548207854473, + "learning_rate": 4.995097882631446e-05, + "loss": 2.0337, + "mean_token_accuracy": 0.4379310429096222, + "step": 69465 + }, + { + "epoch": 0.0699709218293765, + "grad_norm": 17.450387784811692, + "learning_rate": 4.9950954101495165e-05, + "loss": 2.7992, + "mean_token_accuracy": 0.3793103486299515, + "step": 69470 + }, + { + "epoch": 0.06997595788248068, + "grad_norm": 11.210921256885348, + "learning_rate": 4.9950929370449015e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.44482759237289426, + "step": 69475 + }, + { + "epoch": 0.06998099393558485, + "grad_norm": 13.366179210038005, + "learning_rate": 4.9950904633176014e-05, + "loss": 2.7474, + "mean_token_accuracy": 0.3999999940395355, + "step": 69480 + }, + { + "epoch": 0.06998602998868902, + "grad_norm": 11.12457493388408, + "learning_rate": 4.995087988967617e-05, + "loss": 2.803, + "mean_token_accuracy": 0.3965517282485962, + "step": 69485 + }, + { + "epoch": 0.0699910660417932, + "grad_norm": 16.008310915574448, + "learning_rate": 4.99508551399495e-05, + "loss": 2.5542, + "mean_token_accuracy": 0.42262552976608275, + "step": 69490 + }, + { + "epoch": 0.06999610209489737, + "grad_norm": 11.035513787228433, + "learning_rate": 4.995083038399599e-05, + "loss": 2.561, + "mean_token_accuracy": 0.4050211727619171, + "step": 69495 + }, + { + "epoch": 0.07000113814800155, + "grad_norm": 11.461473696640395, + "learning_rate": 4.995080562181566e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.41379310488700866, + "step": 69500 + }, + { + "epoch": 0.07000617420110572, + "grad_norm": 12.745879256223596, + "learning_rate": 4.995078085340852e-05, + "loss": 2.6604, + "mean_token_accuracy": 0.37241379618644715, + "step": 69505 + }, + { + "epoch": 0.0700112102542099, + "grad_norm": 12.038720499588772, + "learning_rate": 4.9950756078774575e-05, + "loss": 2.6503, + "mean_token_accuracy": 0.38421053290367124, + "step": 69510 + }, + { + "epoch": 0.07001624630731407, + "grad_norm": 11.07098850780782, + "learning_rate": 4.995073129791382e-05, + "loss": 2.7462, + "mean_token_accuracy": 0.39310344457626345, + "step": 69515 + }, + { + "epoch": 0.07002128236041824, + "grad_norm": 10.13703713905326, + "learning_rate": 4.995070651082628e-05, + "loss": 2.472, + "mean_token_accuracy": 0.41379310488700866, + "step": 69520 + }, + { + "epoch": 0.0700263184135224, + "grad_norm": 10.500432430369951, + "learning_rate": 4.9950681717511944e-05, + "loss": 2.6223, + "mean_token_accuracy": 0.42758620381355283, + "step": 69525 + }, + { + "epoch": 0.07003135446662657, + "grad_norm": 10.043471758540381, + "learning_rate": 4.995065691797083e-05, + "loss": 3.0629, + "mean_token_accuracy": 0.3965517282485962, + "step": 69530 + }, + { + "epoch": 0.07003639051973075, + "grad_norm": 11.554672517968806, + "learning_rate": 4.995063211220294e-05, + "loss": 2.8337, + "mean_token_accuracy": 0.38620689511299133, + "step": 69535 + }, + { + "epoch": 0.07004142657283492, + "grad_norm": 12.432307810997044, + "learning_rate": 4.995060730020829e-05, + "loss": 2.9088, + "mean_token_accuracy": 0.3103448212146759, + "step": 69540 + }, + { + "epoch": 0.0700464626259391, + "grad_norm": 12.507465170038829, + "learning_rate": 4.995058248198688e-05, + "loss": 2.9678, + "mean_token_accuracy": 0.36896551251411436, + "step": 69545 + }, + { + "epoch": 0.07005149867904327, + "grad_norm": 14.783108963858476, + "learning_rate": 4.995055765753871e-05, + "loss": 2.7604, + "mean_token_accuracy": 0.3632788896560669, + "step": 69550 + }, + { + "epoch": 0.07005653473214744, + "grad_norm": 11.856391559304246, + "learning_rate": 4.995053282686381e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.41379310488700866, + "step": 69555 + }, + { + "epoch": 0.07006157078525162, + "grad_norm": 12.418202274308635, + "learning_rate": 4.9950507989962156e-05, + "loss": 2.126, + "mean_token_accuracy": 0.4034482717514038, + "step": 69560 + }, + { + "epoch": 0.07006660683835579, + "grad_norm": 14.535076150447885, + "learning_rate": 4.9950483146833774e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.4172413766384125, + "step": 69565 + }, + { + "epoch": 0.07007164289145996, + "grad_norm": 8.168030818246812, + "learning_rate": 4.995045829747866e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.4806650340557098, + "step": 69570 + }, + { + "epoch": 0.07007667894456414, + "grad_norm": 11.494508623615944, + "learning_rate": 4.995043344189684e-05, + "loss": 2.6566, + "mean_token_accuracy": 0.36896551549434664, + "step": 69575 + }, + { + "epoch": 0.07008171499766831, + "grad_norm": 10.635212663559185, + "learning_rate": 4.99504085800883e-05, + "loss": 2.4652, + "mean_token_accuracy": 0.4517241299152374, + "step": 69580 + }, + { + "epoch": 0.07008675105077249, + "grad_norm": 21.654275351000837, + "learning_rate": 4.995038371205306e-05, + "loss": 2.7141, + "mean_token_accuracy": 0.37931033968925476, + "step": 69585 + }, + { + "epoch": 0.07009178710387666, + "grad_norm": 11.48067681033711, + "learning_rate": 4.995035883779112e-05, + "loss": 2.7844, + "mean_token_accuracy": 0.35172414034605026, + "step": 69590 + }, + { + "epoch": 0.07009682315698082, + "grad_norm": 9.999109430497885, + "learning_rate": 4.9950333957302494e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.3793103456497192, + "step": 69595 + }, + { + "epoch": 0.070101859210085, + "grad_norm": 11.088837991756646, + "learning_rate": 4.995030907058718e-05, + "loss": 2.4798, + "mean_token_accuracy": 0.41379310488700866, + "step": 69600 + }, + { + "epoch": 0.07010689526318917, + "grad_norm": 10.009586206231806, + "learning_rate": 4.995028417764518e-05, + "loss": 2.5302, + "mean_token_accuracy": 0.4068965554237366, + "step": 69605 + }, + { + "epoch": 0.07011193131629334, + "grad_norm": 11.904666115757921, + "learning_rate": 4.9950259278476525e-05, + "loss": 2.6684, + "mean_token_accuracy": 0.3620689630508423, + "step": 69610 + }, + { + "epoch": 0.07011696736939751, + "grad_norm": 16.113996255392088, + "learning_rate": 4.995023437308119e-05, + "loss": 2.415, + "mean_token_accuracy": 0.42068966031074523, + "step": 69615 + }, + { + "epoch": 0.07012200342250169, + "grad_norm": 11.133348347879226, + "learning_rate": 4.9950209461459226e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.4620689570903778, + "step": 69620 + }, + { + "epoch": 0.07012703947560586, + "grad_norm": 11.341987866646178, + "learning_rate": 4.995018454361059e-05, + "loss": 2.7539, + "mean_token_accuracy": 0.3999999940395355, + "step": 69625 + }, + { + "epoch": 0.07013207552871004, + "grad_norm": 18.13640625720166, + "learning_rate": 4.9950159619535316e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.46896551847457885, + "step": 69630 + }, + { + "epoch": 0.07013711158181421, + "grad_norm": 12.59155015495586, + "learning_rate": 4.9950134689233414e-05, + "loss": 2.642, + "mean_token_accuracy": 0.37241379618644715, + "step": 69635 + }, + { + "epoch": 0.07014214763491838, + "grad_norm": 10.722239303318371, + "learning_rate": 4.995010975270487e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.3569872975349426, + "step": 69640 + }, + { + "epoch": 0.07014718368802256, + "grad_norm": 10.279371909132163, + "learning_rate": 4.9950084809949714e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.4360556542873383, + "step": 69645 + }, + { + "epoch": 0.07015221974112673, + "grad_norm": 10.071647993025032, + "learning_rate": 4.995005986096794e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.45862069725990295, + "step": 69650 + }, + { + "epoch": 0.0701572557942309, + "grad_norm": 8.541142760606844, + "learning_rate": 4.9950034905759565e-05, + "loss": 2.4894, + "mean_token_accuracy": 0.45396249294281005, + "step": 69655 + }, + { + "epoch": 0.07016229184733506, + "grad_norm": 11.400823534331346, + "learning_rate": 4.995000994432458e-05, + "loss": 2.2971, + "mean_token_accuracy": 0.4965517342090607, + "step": 69660 + }, + { + "epoch": 0.07016732790043924, + "grad_norm": 10.875925949693752, + "learning_rate": 4.9949984976663e-05, + "loss": 2.6775, + "mean_token_accuracy": 0.36551723480224607, + "step": 69665 + }, + { + "epoch": 0.07017236395354341, + "grad_norm": 12.34677600713615, + "learning_rate": 4.9949960002774844e-05, + "loss": 2.3332, + "mean_token_accuracy": 0.41034482717514037, + "step": 69670 + }, + { + "epoch": 0.07017740000664759, + "grad_norm": 11.845025741998386, + "learning_rate": 4.99499350226601e-05, + "loss": 2.3622, + "mean_token_accuracy": 0.441379314661026, + "step": 69675 + }, + { + "epoch": 0.07018243605975176, + "grad_norm": 13.367114636321551, + "learning_rate": 4.994991003631879e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.41379310488700866, + "step": 69680 + }, + { + "epoch": 0.07018747211285593, + "grad_norm": 12.978048580129567, + "learning_rate": 4.9949885043750905e-05, + "loss": 2.5635, + "mean_token_accuracy": 0.42238354682922363, + "step": 69685 + }, + { + "epoch": 0.07019250816596011, + "grad_norm": 10.44722846996975, + "learning_rate": 4.994986004495646e-05, + "loss": 2.4769, + "mean_token_accuracy": 0.39310344457626345, + "step": 69690 + }, + { + "epoch": 0.07019754421906428, + "grad_norm": 14.857527995963384, + "learning_rate": 4.9949835039935465e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.441379314661026, + "step": 69695 + }, + { + "epoch": 0.07020258027216845, + "grad_norm": 11.039129814326783, + "learning_rate": 4.994981002868792e-05, + "loss": 2.1958, + "mean_token_accuracy": 0.41724138855934145, + "step": 69700 + }, + { + "epoch": 0.07020761632527263, + "grad_norm": 11.380242989337086, + "learning_rate": 4.994978501121385e-05, + "loss": 2.5089, + "mean_token_accuracy": 0.4379310250282288, + "step": 69705 + }, + { + "epoch": 0.0702126523783768, + "grad_norm": 11.855869925134776, + "learning_rate": 4.994975998751324e-05, + "loss": 2.489, + "mean_token_accuracy": 0.43103448748588563, + "step": 69710 + }, + { + "epoch": 0.07021768843148098, + "grad_norm": 11.151152909273577, + "learning_rate": 4.99497349575861e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.3931034505367279, + "step": 69715 + }, + { + "epoch": 0.07022272448458515, + "grad_norm": 32.409307369269165, + "learning_rate": 4.9949709921432444e-05, + "loss": 2.672, + "mean_token_accuracy": 0.4, + "step": 69720 + }, + { + "epoch": 0.07022776053768932, + "grad_norm": 10.4068415668025, + "learning_rate": 4.994968487905229e-05, + "loss": 2.1966, + "mean_token_accuracy": 0.4379310369491577, + "step": 69725 + }, + { + "epoch": 0.07023279659079348, + "grad_norm": 11.31702510649938, + "learning_rate": 4.9949659830445625e-05, + "loss": 2.8858, + "mean_token_accuracy": 0.41034482717514037, + "step": 69730 + }, + { + "epoch": 0.07023783264389766, + "grad_norm": 11.271868322047075, + "learning_rate": 4.994963477561246e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.3655172407627106, + "step": 69735 + }, + { + "epoch": 0.07024286869700183, + "grad_norm": 10.096947754148971, + "learning_rate": 4.9949609714552805e-05, + "loss": 2.165, + "mean_token_accuracy": 0.4344827592372894, + "step": 69740 + }, + { + "epoch": 0.070247904750106, + "grad_norm": 13.003698538652918, + "learning_rate": 4.994958464726667e-05, + "loss": 2.3747, + "mean_token_accuracy": 0.42758620977401735, + "step": 69745 + }, + { + "epoch": 0.07025294080321018, + "grad_norm": 11.654631156614874, + "learning_rate": 4.994955957375405e-05, + "loss": 2.3772, + "mean_token_accuracy": 0.41034482717514037, + "step": 69750 + }, + { + "epoch": 0.07025797685631435, + "grad_norm": 12.27998841863513, + "learning_rate": 4.994953449401498e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.44827585816383364, + "step": 69755 + }, + { + "epoch": 0.07026301290941853, + "grad_norm": 12.424257644030815, + "learning_rate": 4.994950940804943e-05, + "loss": 2.6115, + "mean_token_accuracy": 0.4034482777118683, + "step": 69760 + }, + { + "epoch": 0.0702680489625227, + "grad_norm": 9.211236995586136, + "learning_rate": 4.9949484315857424e-05, + "loss": 2.2901, + "mean_token_accuracy": 0.43793103098869324, + "step": 69765 + }, + { + "epoch": 0.07027308501562687, + "grad_norm": 15.668940058768534, + "learning_rate": 4.994945921743898e-05, + "loss": 2.5867, + "mean_token_accuracy": 0.4034482717514038, + "step": 69770 + }, + { + "epoch": 0.07027812106873105, + "grad_norm": 10.809164413991063, + "learning_rate": 4.994943411279409e-05, + "loss": 2.6077, + "mean_token_accuracy": 0.3517241358757019, + "step": 69775 + }, + { + "epoch": 0.07028315712183522, + "grad_norm": 11.503470297408914, + "learning_rate": 4.994940900192277e-05, + "loss": 2.7308, + "mean_token_accuracy": 0.3965517282485962, + "step": 69780 + }, + { + "epoch": 0.0702881931749394, + "grad_norm": 10.622207724141463, + "learning_rate": 4.9949383884825016e-05, + "loss": 2.0582, + "mean_token_accuracy": 0.4379310369491577, + "step": 69785 + }, + { + "epoch": 0.07029322922804357, + "grad_norm": 10.376749624812218, + "learning_rate": 4.9949358761500846e-05, + "loss": 2.5192, + "mean_token_accuracy": 0.3793103456497192, + "step": 69790 + }, + { + "epoch": 0.07029826528114774, + "grad_norm": 11.737771743887164, + "learning_rate": 4.9949333631950265e-05, + "loss": 2.4925, + "mean_token_accuracy": 0.4517241358757019, + "step": 69795 + }, + { + "epoch": 0.0703033013342519, + "grad_norm": 10.767390804500213, + "learning_rate": 4.9949308496173274e-05, + "loss": 2.648, + "mean_token_accuracy": 0.41379310488700866, + "step": 69800 + }, + { + "epoch": 0.07030833738735608, + "grad_norm": 11.977953585204077, + "learning_rate": 4.9949283354169886e-05, + "loss": 2.6357, + "mean_token_accuracy": 0.3620689660310745, + "step": 69805 + }, + { + "epoch": 0.07031337344046025, + "grad_norm": 10.409118567773453, + "learning_rate": 4.99492582059401e-05, + "loss": 2.3679, + "mean_token_accuracy": 0.4379310369491577, + "step": 69810 + }, + { + "epoch": 0.07031840949356442, + "grad_norm": 12.729640270764301, + "learning_rate": 4.994923305148393e-05, + "loss": 2.4064, + "mean_token_accuracy": 0.39655172228813174, + "step": 69815 + }, + { + "epoch": 0.0703234455466686, + "grad_norm": 12.453183048204174, + "learning_rate": 4.994920789080139e-05, + "loss": 2.1585, + "mean_token_accuracy": 0.46551724076271056, + "step": 69820 + }, + { + "epoch": 0.07032848159977277, + "grad_norm": 11.056744969222756, + "learning_rate": 4.9949182723892466e-05, + "loss": 2.6957, + "mean_token_accuracy": 0.40689656138420105, + "step": 69825 + }, + { + "epoch": 0.07033351765287695, + "grad_norm": 9.15557981945039, + "learning_rate": 4.994915755075719e-05, + "loss": 2.6863, + "mean_token_accuracy": 0.44482758045196535, + "step": 69830 + }, + { + "epoch": 0.07033855370598112, + "grad_norm": 9.827789595049888, + "learning_rate": 4.994913237139554e-05, + "loss": 2.5802, + "mean_token_accuracy": 0.3896551728248596, + "step": 69835 + }, + { + "epoch": 0.07034358975908529, + "grad_norm": 12.680246767565977, + "learning_rate": 4.994910718580756e-05, + "loss": 2.6141, + "mean_token_accuracy": 0.4172413766384125, + "step": 69840 + }, + { + "epoch": 0.07034862581218947, + "grad_norm": 11.180447620889874, + "learning_rate": 4.9949081993993225e-05, + "loss": 2.2204, + "mean_token_accuracy": 0.4517241358757019, + "step": 69845 + }, + { + "epoch": 0.07035366186529364, + "grad_norm": 10.985769559063929, + "learning_rate": 4.994905679595255e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.42208107113838195, + "step": 69850 + }, + { + "epoch": 0.07035869791839781, + "grad_norm": 9.83306237917322, + "learning_rate": 4.9949031591685545e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.40889292359352114, + "step": 69855 + }, + { + "epoch": 0.07036373397150199, + "grad_norm": 12.458498794702932, + "learning_rate": 4.994900638119222e-05, + "loss": 2.5662, + "mean_token_accuracy": 0.3758620619773865, + "step": 69860 + }, + { + "epoch": 0.07036877002460616, + "grad_norm": 12.117661943442595, + "learning_rate": 4.994898116447259e-05, + "loss": 2.2766, + "mean_token_accuracy": 0.4586206912994385, + "step": 69865 + }, + { + "epoch": 0.07037380607771032, + "grad_norm": 11.935412359878029, + "learning_rate": 4.994895594152664e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.39655172228813174, + "step": 69870 + }, + { + "epoch": 0.0703788421308145, + "grad_norm": 11.278127372248036, + "learning_rate": 4.9948930712354384e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4224440336227417, + "step": 69875 + }, + { + "epoch": 0.07038387818391867, + "grad_norm": 10.415047722213039, + "learning_rate": 4.994890547695585e-05, + "loss": 2.152, + "mean_token_accuracy": 0.4620689570903778, + "step": 69880 + }, + { + "epoch": 0.07038891423702284, + "grad_norm": 14.122167575289225, + "learning_rate": 4.994888023533101e-05, + "loss": 2.6162, + "mean_token_accuracy": 0.4310344815254211, + "step": 69885 + }, + { + "epoch": 0.07039395029012702, + "grad_norm": 12.02363446907806, + "learning_rate": 4.99488549874799e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.3758620649576187, + "step": 69890 + }, + { + "epoch": 0.07039898634323119, + "grad_norm": 11.767719322244815, + "learning_rate": 4.994882973340252e-05, + "loss": 2.6346, + "mean_token_accuracy": 0.39655172228813174, + "step": 69895 + }, + { + "epoch": 0.07040402239633536, + "grad_norm": 9.838135125304456, + "learning_rate": 4.994880447309886e-05, + "loss": 2.4675, + "mean_token_accuracy": 0.39310344457626345, + "step": 69900 + }, + { + "epoch": 0.07040905844943954, + "grad_norm": 14.303024192707186, + "learning_rate": 4.9948779206568955e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.4034482717514038, + "step": 69905 + }, + { + "epoch": 0.07041409450254371, + "grad_norm": 11.183068214689438, + "learning_rate": 4.9948753933812784e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.4172413766384125, + "step": 69910 + }, + { + "epoch": 0.07041913055564789, + "grad_norm": 10.032529287705792, + "learning_rate": 4.994872865483037e-05, + "loss": 2.578, + "mean_token_accuracy": 0.4551724076271057, + "step": 69915 + }, + { + "epoch": 0.07042416660875206, + "grad_norm": 11.404396072358262, + "learning_rate": 4.9948703369621725e-05, + "loss": 2.5771, + "mean_token_accuracy": 0.38275861740112305, + "step": 69920 + }, + { + "epoch": 0.07042920266185623, + "grad_norm": 11.18417577804517, + "learning_rate": 4.994867807818684e-05, + "loss": 2.2584, + "mean_token_accuracy": 0.43793103098869324, + "step": 69925 + }, + { + "epoch": 0.0704342387149604, + "grad_norm": 13.19364626264909, + "learning_rate": 4.994865278052574e-05, + "loss": 2.1594, + "mean_token_accuracy": 0.458620685338974, + "step": 69930 + }, + { + "epoch": 0.07043927476806458, + "grad_norm": 14.321413778851447, + "learning_rate": 4.994862747663841e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.45662431716918944, + "step": 69935 + }, + { + "epoch": 0.07044431082116874, + "grad_norm": 10.839431982956105, + "learning_rate": 4.994860216652488e-05, + "loss": 2.165, + "mean_token_accuracy": 0.4620689570903778, + "step": 69940 + }, + { + "epoch": 0.07044934687427291, + "grad_norm": 9.49059378089504, + "learning_rate": 4.994857685018514e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.37586206793785093, + "step": 69945 + }, + { + "epoch": 0.07045438292737709, + "grad_norm": 11.347706602923962, + "learning_rate": 4.9948551527619205e-05, + "loss": 2.4309, + "mean_token_accuracy": 0.4137930989265442, + "step": 69950 + }, + { + "epoch": 0.07045941898048126, + "grad_norm": 14.523649271161622, + "learning_rate": 4.994852619882708e-05, + "loss": 2.7919, + "mean_token_accuracy": 0.3843920141458511, + "step": 69955 + }, + { + "epoch": 0.07046445503358544, + "grad_norm": 16.06267499555078, + "learning_rate": 4.994850086380878e-05, + "loss": 2.6834, + "mean_token_accuracy": 0.3551724076271057, + "step": 69960 + }, + { + "epoch": 0.07046949108668961, + "grad_norm": 12.782328249358063, + "learning_rate": 4.9948475522564295e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.441379314661026, + "step": 69965 + }, + { + "epoch": 0.07047452713979378, + "grad_norm": 15.091712984578308, + "learning_rate": 4.9948450175093644e-05, + "loss": 2.778, + "mean_token_accuracy": 0.3827586233615875, + "step": 69970 + }, + { + "epoch": 0.07047956319289796, + "grad_norm": 11.012103182008387, + "learning_rate": 4.994842482139684e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.4227465093135834, + "step": 69975 + }, + { + "epoch": 0.07048459924600213, + "grad_norm": 10.991870748391035, + "learning_rate": 4.994839946147387e-05, + "loss": 2.6774, + "mean_token_accuracy": 0.44482758939266204, + "step": 69980 + }, + { + "epoch": 0.0704896352991063, + "grad_norm": 10.586103667495937, + "learning_rate": 4.9948374095324756e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.4482758641242981, + "step": 69985 + }, + { + "epoch": 0.07049467135221048, + "grad_norm": 9.59525358930973, + "learning_rate": 4.99483487229495e-05, + "loss": 2.8354, + "mean_token_accuracy": 0.33448276221752166, + "step": 69990 + }, + { + "epoch": 0.07049970740531465, + "grad_norm": 13.803506418756518, + "learning_rate": 4.994832334434812e-05, + "loss": 2.277, + "mean_token_accuracy": 0.44706594944000244, + "step": 69995 + }, + { + "epoch": 0.07050474345841883, + "grad_norm": 10.838546646167913, + "learning_rate": 4.9948297959520606e-05, + "loss": 2.9031, + "mean_token_accuracy": 0.37931033968925476, + "step": 70000 + }, + { + "epoch": 0.070509779511523, + "grad_norm": 11.094394206516935, + "learning_rate": 4.994827256846697e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.42758620977401735, + "step": 70005 + }, + { + "epoch": 0.07051481556462716, + "grad_norm": 13.708120559395848, + "learning_rate": 4.994824717118723e-05, + "loss": 2.6848, + "mean_token_accuracy": 0.3517241358757019, + "step": 70010 + }, + { + "epoch": 0.07051985161773133, + "grad_norm": 12.191850113563884, + "learning_rate": 4.994822176768138e-05, + "loss": 2.443, + "mean_token_accuracy": 0.4103448331356049, + "step": 70015 + }, + { + "epoch": 0.0705248876708355, + "grad_norm": 9.034555752848037, + "learning_rate": 4.9948196357949435e-05, + "loss": 2.1186, + "mean_token_accuracy": 0.4431941986083984, + "step": 70020 + }, + { + "epoch": 0.07052992372393968, + "grad_norm": 11.59937010618008, + "learning_rate": 4.9948170941991394e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.4034482777118683, + "step": 70025 + }, + { + "epoch": 0.07053495977704385, + "grad_norm": 13.363683074532052, + "learning_rate": 4.9948145519807274e-05, + "loss": 2.7358, + "mean_token_accuracy": 0.4000000059604645, + "step": 70030 + }, + { + "epoch": 0.07053999583014803, + "grad_norm": 12.655578160789627, + "learning_rate": 4.994812009139707e-05, + "loss": 3.0334, + "mean_token_accuracy": 0.3551724135875702, + "step": 70035 + }, + { + "epoch": 0.0705450318832522, + "grad_norm": 11.211074491146233, + "learning_rate": 4.9948094656760806e-05, + "loss": 2.5472, + "mean_token_accuracy": 0.3896551728248596, + "step": 70040 + }, + { + "epoch": 0.07055006793635638, + "grad_norm": 14.573713070832257, + "learning_rate": 4.994806921589847e-05, + "loss": 2.5855, + "mean_token_accuracy": 0.4068965554237366, + "step": 70045 + }, + { + "epoch": 0.07055510398946055, + "grad_norm": 11.296433024599299, + "learning_rate": 4.9948043768810086e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4517241418361664, + "step": 70050 + }, + { + "epoch": 0.07056014004256472, + "grad_norm": 12.220005260662424, + "learning_rate": 4.9948018315495656e-05, + "loss": 2.8256, + "mean_token_accuracy": 0.3551724076271057, + "step": 70055 + }, + { + "epoch": 0.0705651760956689, + "grad_norm": 11.51624445154521, + "learning_rate": 4.9947992855955174e-05, + "loss": 2.7386, + "mean_token_accuracy": 0.3931034505367279, + "step": 70060 + }, + { + "epoch": 0.07057021214877307, + "grad_norm": 10.75761686448214, + "learning_rate": 4.994796739018866e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.42758620381355283, + "step": 70065 + }, + { + "epoch": 0.07057524820187724, + "grad_norm": 11.94413647642222, + "learning_rate": 4.994794191819612e-05, + "loss": 2.3364, + "mean_token_accuracy": 0.44137930274009707, + "step": 70070 + }, + { + "epoch": 0.07058028425498142, + "grad_norm": 11.2733463610061, + "learning_rate": 4.994791643997756e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.3896551728248596, + "step": 70075 + }, + { + "epoch": 0.07058532030808558, + "grad_norm": 9.987565487335917, + "learning_rate": 4.9947890955532985e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.3965517282485962, + "step": 70080 + }, + { + "epoch": 0.07059035636118975, + "grad_norm": 9.743064332823339, + "learning_rate": 4.99478654648624e-05, + "loss": 2.1529, + "mean_token_accuracy": 0.4517241358757019, + "step": 70085 + }, + { + "epoch": 0.07059539241429393, + "grad_norm": 13.113106224089798, + "learning_rate": 4.994783996796582e-05, + "loss": 3.0923, + "mean_token_accuracy": 0.36551723182201384, + "step": 70090 + }, + { + "epoch": 0.0706004284673981, + "grad_norm": 11.838157421329166, + "learning_rate": 4.994781446484325e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.4206896543502808, + "step": 70095 + }, + { + "epoch": 0.07060546452050227, + "grad_norm": 11.603879568912266, + "learning_rate": 4.99477889554947e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.4206896543502808, + "step": 70100 + }, + { + "epoch": 0.07061050057360645, + "grad_norm": 9.783486458554057, + "learning_rate": 4.994776343992016e-05, + "loss": 2.6037, + "mean_token_accuracy": 0.38620689511299133, + "step": 70105 + }, + { + "epoch": 0.07061553662671062, + "grad_norm": 10.902571260882754, + "learning_rate": 4.9947737918119656e-05, + "loss": 2.1717, + "mean_token_accuracy": 0.44827585816383364, + "step": 70110 + }, + { + "epoch": 0.0706205726798148, + "grad_norm": 13.502509125841813, + "learning_rate": 4.994771239009319e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.4172413766384125, + "step": 70115 + }, + { + "epoch": 0.07062560873291897, + "grad_norm": 11.665694178564694, + "learning_rate": 4.9947686855840766e-05, + "loss": 2.321, + "mean_token_accuracy": 0.4206896543502808, + "step": 70120 + }, + { + "epoch": 0.07063064478602314, + "grad_norm": 14.663085770559602, + "learning_rate": 4.994766131536239e-05, + "loss": 2.3175, + "mean_token_accuracy": 0.4103448331356049, + "step": 70125 + }, + { + "epoch": 0.07063568083912732, + "grad_norm": 12.503811366774348, + "learning_rate": 4.994763576865807e-05, + "loss": 2.4005, + "mean_token_accuracy": 0.4275862157344818, + "step": 70130 + }, + { + "epoch": 0.07064071689223149, + "grad_norm": 12.20670408581205, + "learning_rate": 4.9947610215727816e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.4068965494632721, + "step": 70135 + }, + { + "epoch": 0.07064575294533566, + "grad_norm": 15.73518047464689, + "learning_rate": 4.994758465657163e-05, + "loss": 2.9314, + "mean_token_accuracy": 0.3724137842655182, + "step": 70140 + }, + { + "epoch": 0.07065078899843984, + "grad_norm": 10.968998148164612, + "learning_rate": 4.9947559091189525e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.3931034505367279, + "step": 70145 + }, + { + "epoch": 0.070655825051544, + "grad_norm": 11.486852583736642, + "learning_rate": 4.9947533519581515e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.41034482717514037, + "step": 70150 + }, + { + "epoch": 0.07066086110464817, + "grad_norm": 11.2181640902374, + "learning_rate": 4.994750794174759e-05, + "loss": 2.5319, + "mean_token_accuracy": 0.36551723778247835, + "step": 70155 + }, + { + "epoch": 0.07066589715775234, + "grad_norm": 11.551371768768886, + "learning_rate": 4.994748235768777e-05, + "loss": 2.982, + "mean_token_accuracy": 0.3655172437429428, + "step": 70160 + }, + { + "epoch": 0.07067093321085652, + "grad_norm": 11.217070619224495, + "learning_rate": 4.994745676740205e-05, + "loss": 2.8034, + "mean_token_accuracy": 0.37931033968925476, + "step": 70165 + }, + { + "epoch": 0.07067596926396069, + "grad_norm": 12.825053661880082, + "learning_rate": 4.994743117089044e-05, + "loss": 2.5864, + "mean_token_accuracy": 0.419237756729126, + "step": 70170 + }, + { + "epoch": 0.07068100531706487, + "grad_norm": 12.334139789154278, + "learning_rate": 4.9947405568152956e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.3793103456497192, + "step": 70175 + }, + { + "epoch": 0.07068604137016904, + "grad_norm": 13.006068381029577, + "learning_rate": 4.994737995918961e-05, + "loss": 2.8025, + "mean_token_accuracy": 0.34482758343219755, + "step": 70180 + }, + { + "epoch": 0.07069107742327321, + "grad_norm": 10.582250081420506, + "learning_rate": 4.9947354344000385e-05, + "loss": 2.8485, + "mean_token_accuracy": 0.39310345351696013, + "step": 70185 + }, + { + "epoch": 0.07069611347637739, + "grad_norm": 11.094040684807956, + "learning_rate": 4.994732872258531e-05, + "loss": 3.0127, + "mean_token_accuracy": 0.32068965435028074, + "step": 70190 + }, + { + "epoch": 0.07070114952948156, + "grad_norm": 11.38693825418227, + "learning_rate": 4.994730309494438e-05, + "loss": 2.2579, + "mean_token_accuracy": 0.4931034505367279, + "step": 70195 + }, + { + "epoch": 0.07070618558258573, + "grad_norm": 11.449289242783431, + "learning_rate": 4.9947277461077613e-05, + "loss": 2.4568, + "mean_token_accuracy": 0.4344827592372894, + "step": 70200 + }, + { + "epoch": 0.07071122163568991, + "grad_norm": 10.565394974660665, + "learning_rate": 4.9947251820985005e-05, + "loss": 2.6645, + "mean_token_accuracy": 0.36206896901130675, + "step": 70205 + }, + { + "epoch": 0.07071625768879408, + "grad_norm": 36.701769490279204, + "learning_rate": 4.9947226174666576e-05, + "loss": 2.9153, + "mean_token_accuracy": 0.4413793087005615, + "step": 70210 + }, + { + "epoch": 0.07072129374189826, + "grad_norm": 13.022516024978197, + "learning_rate": 4.994720052212231e-05, + "loss": 2.6438, + "mean_token_accuracy": 0.3793103456497192, + "step": 70215 + }, + { + "epoch": 0.07072632979500242, + "grad_norm": 10.870449841617692, + "learning_rate": 4.994717486335224e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.4551724135875702, + "step": 70220 + }, + { + "epoch": 0.07073136584810659, + "grad_norm": 9.79890828638806, + "learning_rate": 4.994714919835636e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.46551724672317507, + "step": 70225 + }, + { + "epoch": 0.07073640190121076, + "grad_norm": 16.586794358637526, + "learning_rate": 4.994712352713469e-05, + "loss": 2.6496, + "mean_token_accuracy": 0.37586206793785093, + "step": 70230 + }, + { + "epoch": 0.07074143795431494, + "grad_norm": 15.927491754098385, + "learning_rate": 4.994709784968721e-05, + "loss": 2.9354, + "mean_token_accuracy": 0.36551723480224607, + "step": 70235 + }, + { + "epoch": 0.07074647400741911, + "grad_norm": 11.458290820812287, + "learning_rate": 4.994707216601395e-05, + "loss": 2.1528, + "mean_token_accuracy": 0.4770935893058777, + "step": 70240 + }, + { + "epoch": 0.07075151006052328, + "grad_norm": 9.22030731304803, + "learning_rate": 4.994704647611491e-05, + "loss": 2.6861, + "mean_token_accuracy": 0.35862069129943847, + "step": 70245 + }, + { + "epoch": 0.07075654611362746, + "grad_norm": 12.130277285974119, + "learning_rate": 4.99470207799901e-05, + "loss": 2.318, + "mean_token_accuracy": 0.4329098641872406, + "step": 70250 + }, + { + "epoch": 0.07076158216673163, + "grad_norm": 11.614749385287697, + "learning_rate": 4.994699507763952e-05, + "loss": 2.2809, + "mean_token_accuracy": 0.45862067937850953, + "step": 70255 + }, + { + "epoch": 0.0707666182198358, + "grad_norm": 14.3490363492776, + "learning_rate": 4.994696936906319e-05, + "loss": 2.9985, + "mean_token_accuracy": 0.34137930274009703, + "step": 70260 + }, + { + "epoch": 0.07077165427293998, + "grad_norm": 11.092444764676017, + "learning_rate": 4.99469436542611e-05, + "loss": 2.563, + "mean_token_accuracy": 0.3965517282485962, + "step": 70265 + }, + { + "epoch": 0.07077669032604415, + "grad_norm": 11.703473651671526, + "learning_rate": 4.9946917933233275e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4517241299152374, + "step": 70270 + }, + { + "epoch": 0.07078172637914833, + "grad_norm": 12.571302385546824, + "learning_rate": 4.9946892205979716e-05, + "loss": 2.3563, + "mean_token_accuracy": 0.4034482777118683, + "step": 70275 + }, + { + "epoch": 0.0707867624322525, + "grad_norm": 11.626379970921274, + "learning_rate": 4.994686647250042e-05, + "loss": 2.7711, + "mean_token_accuracy": 0.37586206793785093, + "step": 70280 + }, + { + "epoch": 0.07079179848535667, + "grad_norm": 9.168076987963088, + "learning_rate": 4.99468407327954e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.4448275864124298, + "step": 70285 + }, + { + "epoch": 0.07079683453846083, + "grad_norm": 22.22364781611157, + "learning_rate": 4.994681498686467e-05, + "loss": 2.3999, + "mean_token_accuracy": 0.4172413766384125, + "step": 70290 + }, + { + "epoch": 0.07080187059156501, + "grad_norm": 12.37357109785026, + "learning_rate": 4.994678923470823e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.4344827473163605, + "step": 70295 + }, + { + "epoch": 0.07080690664466918, + "grad_norm": 15.551315688587673, + "learning_rate": 4.9946763476326084e-05, + "loss": 2.6999, + "mean_token_accuracy": 0.3896551728248596, + "step": 70300 + }, + { + "epoch": 0.07081194269777336, + "grad_norm": 15.739390909726957, + "learning_rate": 4.9946737711718264e-05, + "loss": 2.5253, + "mean_token_accuracy": 0.4068965554237366, + "step": 70305 + }, + { + "epoch": 0.07081697875087753, + "grad_norm": 10.78033626591525, + "learning_rate": 4.994671194088474e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.4068965554237366, + "step": 70310 + }, + { + "epoch": 0.0708220148039817, + "grad_norm": 10.343567116164026, + "learning_rate": 4.994668616382553e-05, + "loss": 2.5544, + "mean_token_accuracy": 0.42758620381355283, + "step": 70315 + }, + { + "epoch": 0.07082705085708588, + "grad_norm": 10.26829496006489, + "learning_rate": 4.994666038054067e-05, + "loss": 2.631, + "mean_token_accuracy": 0.3724137842655182, + "step": 70320 + }, + { + "epoch": 0.07083208691019005, + "grad_norm": 10.482382703601703, + "learning_rate": 4.9946634591030134e-05, + "loss": 2.3862, + "mean_token_accuracy": 0.42068966031074523, + "step": 70325 + }, + { + "epoch": 0.07083712296329422, + "grad_norm": 11.361212344503103, + "learning_rate": 4.9946608795293934e-05, + "loss": 2.326, + "mean_token_accuracy": 0.45862067937850953, + "step": 70330 + }, + { + "epoch": 0.0708421590163984, + "grad_norm": 20.47980482561033, + "learning_rate": 4.994658299333209e-05, + "loss": 2.8915, + "mean_token_accuracy": 0.3482758581638336, + "step": 70335 + }, + { + "epoch": 0.07084719506950257, + "grad_norm": 11.251681891829271, + "learning_rate": 4.994655718514461e-05, + "loss": 2.4911, + "mean_token_accuracy": 0.46896551847457885, + "step": 70340 + }, + { + "epoch": 0.07085223112260675, + "grad_norm": 11.27873034566599, + "learning_rate": 4.9946531370731486e-05, + "loss": 2.4166, + "mean_token_accuracy": 0.4068965554237366, + "step": 70345 + }, + { + "epoch": 0.07085726717571092, + "grad_norm": 10.252753041237728, + "learning_rate": 4.994650555009273e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.4344827592372894, + "step": 70350 + }, + { + "epoch": 0.0708623032288151, + "grad_norm": 10.835968553243204, + "learning_rate": 4.994647972322835e-05, + "loss": 2.6561, + "mean_token_accuracy": 0.3979431390762329, + "step": 70355 + }, + { + "epoch": 0.07086733928191925, + "grad_norm": 11.95764031271148, + "learning_rate": 4.9946453890138364e-05, + "loss": 2.6965, + "mean_token_accuracy": 0.4310344815254211, + "step": 70360 + }, + { + "epoch": 0.07087237533502343, + "grad_norm": 9.403211441988184, + "learning_rate": 4.994642805082276e-05, + "loss": 2.5574, + "mean_token_accuracy": 0.42795565724372864, + "step": 70365 + }, + { + "epoch": 0.0708774113881276, + "grad_norm": 12.897701822973511, + "learning_rate": 4.9946402205281564e-05, + "loss": 2.6712, + "mean_token_accuracy": 0.4068965494632721, + "step": 70370 + }, + { + "epoch": 0.07088244744123177, + "grad_norm": 12.999910278542746, + "learning_rate": 4.994637635351477e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.3620689570903778, + "step": 70375 + }, + { + "epoch": 0.07088748349433595, + "grad_norm": 11.22917503418133, + "learning_rate": 4.9946350495522395e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.4034482777118683, + "step": 70380 + }, + { + "epoch": 0.07089251954744012, + "grad_norm": 15.954125966379527, + "learning_rate": 4.9946324631304435e-05, + "loss": 2.2183, + "mean_token_accuracy": 0.4261947929859161, + "step": 70385 + }, + { + "epoch": 0.0708975556005443, + "grad_norm": 11.830939815228778, + "learning_rate": 4.9946298760860905e-05, + "loss": 1.9035, + "mean_token_accuracy": 0.4689655125141144, + "step": 70390 + }, + { + "epoch": 0.07090259165364847, + "grad_norm": 10.89484928053885, + "learning_rate": 4.994627288419181e-05, + "loss": 2.7462, + "mean_token_accuracy": 0.3655172407627106, + "step": 70395 + }, + { + "epoch": 0.07090762770675264, + "grad_norm": 11.572669114648221, + "learning_rate": 4.994624700129716e-05, + "loss": 2.8133, + "mean_token_accuracy": 0.37241379022598264, + "step": 70400 + }, + { + "epoch": 0.07091266375985682, + "grad_norm": 18.673855278837642, + "learning_rate": 4.9946221112176954e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.4360556542873383, + "step": 70405 + }, + { + "epoch": 0.07091769981296099, + "grad_norm": 13.865598974918965, + "learning_rate": 4.994619521683121e-05, + "loss": 2.6168, + "mean_token_accuracy": 0.39655172228813174, + "step": 70410 + }, + { + "epoch": 0.07092273586606516, + "grad_norm": 14.78720936435485, + "learning_rate": 4.994616931525993e-05, + "loss": 2.3095, + "mean_token_accuracy": 0.482758617401123, + "step": 70415 + }, + { + "epoch": 0.07092777191916934, + "grad_norm": 11.287426048725141, + "learning_rate": 4.994614340746312e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.47241379618644713, + "step": 70420 + }, + { + "epoch": 0.07093280797227351, + "grad_norm": 12.701047023927524, + "learning_rate": 4.9946117493440784e-05, + "loss": 2.0455, + "mean_token_accuracy": 0.5034482717514038, + "step": 70425 + }, + { + "epoch": 0.07093784402537767, + "grad_norm": 10.681017681772339, + "learning_rate": 4.994609157319294e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.46551724672317507, + "step": 70430 + }, + { + "epoch": 0.07094288007848185, + "grad_norm": 11.203134653035066, + "learning_rate": 4.9946065646719586e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.4068965494632721, + "step": 70435 + }, + { + "epoch": 0.07094791613158602, + "grad_norm": 9.551566658009277, + "learning_rate": 4.994603971402074e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.43641862869262693, + "step": 70440 + }, + { + "epoch": 0.0709529521846902, + "grad_norm": 12.66403455105608, + "learning_rate": 4.994601377509639e-05, + "loss": 2.249, + "mean_token_accuracy": 0.4551724076271057, + "step": 70445 + }, + { + "epoch": 0.07095798823779437, + "grad_norm": 11.48321384635629, + "learning_rate": 4.9945987829946555e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.42413792610168455, + "step": 70450 + }, + { + "epoch": 0.07096302429089854, + "grad_norm": 15.721868524296804, + "learning_rate": 4.994596187857124e-05, + "loss": 2.884, + "mean_token_accuracy": 0.3931034505367279, + "step": 70455 + }, + { + "epoch": 0.07096806034400271, + "grad_norm": 11.253078832531148, + "learning_rate": 4.994593592097047e-05, + "loss": 2.5807, + "mean_token_accuracy": 0.42413793206214906, + "step": 70460 + }, + { + "epoch": 0.07097309639710689, + "grad_norm": 10.999852698410633, + "learning_rate": 4.994590995714422e-05, + "loss": 2.3727, + "mean_token_accuracy": 0.4172413766384125, + "step": 70465 + }, + { + "epoch": 0.07097813245021106, + "grad_norm": 9.23632191408146, + "learning_rate": 4.994588398709252e-05, + "loss": 2.662, + "mean_token_accuracy": 0.4034482717514038, + "step": 70470 + }, + { + "epoch": 0.07098316850331524, + "grad_norm": 9.606709373726503, + "learning_rate": 4.994585801081537e-05, + "loss": 2.3544, + "mean_token_accuracy": 0.4482758641242981, + "step": 70475 + }, + { + "epoch": 0.07098820455641941, + "grad_norm": 13.974434235057336, + "learning_rate": 4.994583202831277e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.47931033968925474, + "step": 70480 + }, + { + "epoch": 0.07099324060952358, + "grad_norm": 11.383371115003428, + "learning_rate": 4.9945806039584745e-05, + "loss": 2.8178, + "mean_token_accuracy": 0.34482758641242983, + "step": 70485 + }, + { + "epoch": 0.07099827666262776, + "grad_norm": 9.89035478607554, + "learning_rate": 4.994578004463128e-05, + "loss": 2.9994, + "mean_token_accuracy": 0.36551724523305895, + "step": 70490 + }, + { + "epoch": 0.07100331271573193, + "grad_norm": 12.684106307925722, + "learning_rate": 4.9945754043452404e-05, + "loss": 2.6003, + "mean_token_accuracy": 0.4379310369491577, + "step": 70495 + }, + { + "epoch": 0.07100834876883609, + "grad_norm": 10.973785199857181, + "learning_rate": 4.994572803604811e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.44640047550201417, + "step": 70500 + }, + { + "epoch": 0.07101338482194026, + "grad_norm": 9.242831473521587, + "learning_rate": 4.994570202241841e-05, + "loss": 2.4515, + "mean_token_accuracy": 0.4715668439865112, + "step": 70505 + }, + { + "epoch": 0.07101842087504444, + "grad_norm": 10.626044452083827, + "learning_rate": 4.994567600256331e-05, + "loss": 2.7643, + "mean_token_accuracy": 0.3793103456497192, + "step": 70510 + }, + { + "epoch": 0.07102345692814861, + "grad_norm": 9.969247839922538, + "learning_rate": 4.994564997648283e-05, + "loss": 2.4395, + "mean_token_accuracy": 0.4448275864124298, + "step": 70515 + }, + { + "epoch": 0.07102849298125279, + "grad_norm": 13.360452960741611, + "learning_rate": 4.994562394417694e-05, + "loss": 2.6033, + "mean_token_accuracy": 0.4103448212146759, + "step": 70520 + }, + { + "epoch": 0.07103352903435696, + "grad_norm": 10.858572421287231, + "learning_rate": 4.99455979056457e-05, + "loss": 2.7581, + "mean_token_accuracy": 0.34482758641242983, + "step": 70525 + }, + { + "epoch": 0.07103856508746113, + "grad_norm": 15.027180309419482, + "learning_rate": 4.994557186088907e-05, + "loss": 2.5156, + "mean_token_accuracy": 0.39310344457626345, + "step": 70530 + }, + { + "epoch": 0.07104360114056531, + "grad_norm": 11.97993970181806, + "learning_rate": 4.994554580990709e-05, + "loss": 2.1166, + "mean_token_accuracy": 0.5034482717514038, + "step": 70535 + }, + { + "epoch": 0.07104863719366948, + "grad_norm": 11.761352999830295, + "learning_rate": 4.994551975269975e-05, + "loss": 3.0568, + "mean_token_accuracy": 0.37586206793785093, + "step": 70540 + }, + { + "epoch": 0.07105367324677365, + "grad_norm": 14.05508897304383, + "learning_rate": 4.9945493689267056e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.4689655125141144, + "step": 70545 + }, + { + "epoch": 0.07105870929987783, + "grad_norm": 11.28660922907751, + "learning_rate": 4.994546761960903e-05, + "loss": 2.7629, + "mean_token_accuracy": 0.3586206823587418, + "step": 70550 + }, + { + "epoch": 0.071063745352982, + "grad_norm": 11.277862044307968, + "learning_rate": 4.9945441543725665e-05, + "loss": 2.372, + "mean_token_accuracy": 0.38118572235107423, + "step": 70555 + }, + { + "epoch": 0.07106878140608618, + "grad_norm": 10.897882154331445, + "learning_rate": 4.994541546161698e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.417241370677948, + "step": 70560 + }, + { + "epoch": 0.07107381745919035, + "grad_norm": 12.28403797642235, + "learning_rate": 4.9945389373282955e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.3931034475564957, + "step": 70565 + }, + { + "epoch": 0.07107885351229451, + "grad_norm": 9.681751996483266, + "learning_rate": 4.9945363278723635e-05, + "loss": 2.2388, + "mean_token_accuracy": 0.4034482777118683, + "step": 70570 + }, + { + "epoch": 0.07108388956539868, + "grad_norm": 12.330387989249559, + "learning_rate": 4.9945337177939e-05, + "loss": 2.5294, + "mean_token_accuracy": 0.4482758641242981, + "step": 70575 + }, + { + "epoch": 0.07108892561850286, + "grad_norm": 8.955662623094511, + "learning_rate": 4.9945311070929076e-05, + "loss": 2.292, + "mean_token_accuracy": 0.41034482717514037, + "step": 70580 + }, + { + "epoch": 0.07109396167160703, + "grad_norm": 11.172593592406988, + "learning_rate": 4.994528495769386e-05, + "loss": 2.5601, + "mean_token_accuracy": 0.43103447556495667, + "step": 70585 + }, + { + "epoch": 0.0710989977247112, + "grad_norm": 15.035465566728494, + "learning_rate": 4.994525883823335e-05, + "loss": 3.1064, + "mean_token_accuracy": 0.34137930572032926, + "step": 70590 + }, + { + "epoch": 0.07110403377781538, + "grad_norm": 11.739660563380184, + "learning_rate": 4.994523271254757e-05, + "loss": 2.5366, + "mean_token_accuracy": 0.43793103098869324, + "step": 70595 + }, + { + "epoch": 0.07110906983091955, + "grad_norm": 15.849692761176208, + "learning_rate": 4.9945206580636525e-05, + "loss": 2.5307, + "mean_token_accuracy": 0.42068964838981626, + "step": 70600 + }, + { + "epoch": 0.07111410588402373, + "grad_norm": 12.17764555566839, + "learning_rate": 4.994518044250021e-05, + "loss": 2.6912, + "mean_token_accuracy": 0.35517241060733795, + "step": 70605 + }, + { + "epoch": 0.0711191419371279, + "grad_norm": 10.391699855304093, + "learning_rate": 4.9945154298138655e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.3551724135875702, + "step": 70610 + }, + { + "epoch": 0.07112417799023207, + "grad_norm": 14.296021921515297, + "learning_rate": 4.994512814755184e-05, + "loss": 2.452, + "mean_token_accuracy": 0.4360556542873383, + "step": 70615 + }, + { + "epoch": 0.07112921404333625, + "grad_norm": 11.724903376050053, + "learning_rate": 4.994510199073979e-05, + "loss": 2.3895, + "mean_token_accuracy": 0.38620689511299133, + "step": 70620 + }, + { + "epoch": 0.07113425009644042, + "grad_norm": 12.671271247381794, + "learning_rate": 4.9945075827702506e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4000000059604645, + "step": 70625 + }, + { + "epoch": 0.0711392861495446, + "grad_norm": 10.207370688475175, + "learning_rate": 4.9945049658439995e-05, + "loss": 2.6741, + "mean_token_accuracy": 0.42758620977401735, + "step": 70630 + }, + { + "epoch": 0.07114432220264877, + "grad_norm": 13.12782775187658, + "learning_rate": 4.994502348295227e-05, + "loss": 2.5277, + "mean_token_accuracy": 0.43793103098869324, + "step": 70635 + }, + { + "epoch": 0.07114935825575293, + "grad_norm": 11.105889321200712, + "learning_rate": 4.994499730123933e-05, + "loss": 2.5424, + "mean_token_accuracy": 0.4068965554237366, + "step": 70640 + }, + { + "epoch": 0.0711543943088571, + "grad_norm": 9.661949209135395, + "learning_rate": 4.9944971113301185e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.41379310488700866, + "step": 70645 + }, + { + "epoch": 0.07115943036196128, + "grad_norm": 9.549024782555119, + "learning_rate": 4.994494491913785e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.4517241358757019, + "step": 70650 + }, + { + "epoch": 0.07116446641506545, + "grad_norm": 9.406550492214441, + "learning_rate": 4.9944918718749326e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.44482759237289426, + "step": 70655 + }, + { + "epoch": 0.07116950246816962, + "grad_norm": 11.826400912592916, + "learning_rate": 4.994489251213561e-05, + "loss": 2.7083, + "mean_token_accuracy": 0.41034482717514037, + "step": 70660 + }, + { + "epoch": 0.0711745385212738, + "grad_norm": 11.335575003963879, + "learning_rate": 4.9944866299296724e-05, + "loss": 2.386, + "mean_token_accuracy": 0.43793103098869324, + "step": 70665 + }, + { + "epoch": 0.07117957457437797, + "grad_norm": 10.268075162359352, + "learning_rate": 4.9944840080232675e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.38275861740112305, + "step": 70670 + }, + { + "epoch": 0.07118461062748214, + "grad_norm": 10.25182759608856, + "learning_rate": 4.994481385494346e-05, + "loss": 2.6252, + "mean_token_accuracy": 0.42577131986618044, + "step": 70675 + }, + { + "epoch": 0.07118964668058632, + "grad_norm": 11.086095444721575, + "learning_rate": 4.994478762342909e-05, + "loss": 2.7612, + "mean_token_accuracy": 0.3931034505367279, + "step": 70680 + }, + { + "epoch": 0.07119468273369049, + "grad_norm": 13.501977265932828, + "learning_rate": 4.994476138568958e-05, + "loss": 2.4447, + "mean_token_accuracy": 0.40344826579093934, + "step": 70685 + }, + { + "epoch": 0.07119971878679467, + "grad_norm": 9.632548851095104, + "learning_rate": 4.9944735141724935e-05, + "loss": 2.5812, + "mean_token_accuracy": 0.42413794100284574, + "step": 70690 + }, + { + "epoch": 0.07120475483989884, + "grad_norm": 11.746044520824606, + "learning_rate": 4.9944708891535155e-05, + "loss": 2.7043, + "mean_token_accuracy": 0.37241379022598264, + "step": 70695 + }, + { + "epoch": 0.07120979089300301, + "grad_norm": 13.110302070609862, + "learning_rate": 4.9944682635120246e-05, + "loss": 2.3288, + "mean_token_accuracy": 0.40344826579093934, + "step": 70700 + }, + { + "epoch": 0.07121482694610719, + "grad_norm": 12.378242719394526, + "learning_rate": 4.9944656372480234e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.4103448331356049, + "step": 70705 + }, + { + "epoch": 0.07121986299921135, + "grad_norm": 12.702405264020008, + "learning_rate": 4.99446301036151e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.4641863226890564, + "step": 70710 + }, + { + "epoch": 0.07122489905231552, + "grad_norm": 12.30321329424009, + "learning_rate": 4.9944603828524875e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.38965516686439516, + "step": 70715 + }, + { + "epoch": 0.0712299351054197, + "grad_norm": 13.58039097977116, + "learning_rate": 4.994457754720954e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.4034482717514038, + "step": 70720 + }, + { + "epoch": 0.07123497115852387, + "grad_norm": 10.177707733071324, + "learning_rate": 4.994455125966913e-05, + "loss": 2.386, + "mean_token_accuracy": 0.40163339376449586, + "step": 70725 + }, + { + "epoch": 0.07124000721162804, + "grad_norm": 14.085057433278212, + "learning_rate": 4.994452496590364e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.41379310488700866, + "step": 70730 + }, + { + "epoch": 0.07124504326473222, + "grad_norm": 12.290509395230686, + "learning_rate": 4.994449866591307e-05, + "loss": 2.3773, + "mean_token_accuracy": 0.4413793087005615, + "step": 70735 + }, + { + "epoch": 0.07125007931783639, + "grad_norm": 11.15151188212305, + "learning_rate": 4.994447235969744e-05, + "loss": 2.7459, + "mean_token_accuracy": 0.37931033670902253, + "step": 70740 + }, + { + "epoch": 0.07125511537094056, + "grad_norm": 11.125330115072433, + "learning_rate": 4.994444604725675e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.3724137991666794, + "step": 70745 + }, + { + "epoch": 0.07126015142404474, + "grad_norm": 11.930243160710932, + "learning_rate": 4.994441972859101e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4034482717514038, + "step": 70750 + }, + { + "epoch": 0.07126518747714891, + "grad_norm": 9.576407649277836, + "learning_rate": 4.994439340370022e-05, + "loss": 2.2071, + "mean_token_accuracy": 0.4310344815254211, + "step": 70755 + }, + { + "epoch": 0.07127022353025309, + "grad_norm": 12.4216056083062, + "learning_rate": 4.99443670725844e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.4329098641872406, + "step": 70760 + }, + { + "epoch": 0.07127525958335726, + "grad_norm": 9.884915579538406, + "learning_rate": 4.9944340735243556e-05, + "loss": 2.9061, + "mean_token_accuracy": 0.37362371683120726, + "step": 70765 + }, + { + "epoch": 0.07128029563646143, + "grad_norm": 14.157224059188993, + "learning_rate": 4.9944314391677685e-05, + "loss": 2.8259, + "mean_token_accuracy": 0.3551724135875702, + "step": 70770 + }, + { + "epoch": 0.0712853316895656, + "grad_norm": 10.323825554514238, + "learning_rate": 4.99442880418868e-05, + "loss": 2.6118, + "mean_token_accuracy": 0.3882637619972229, + "step": 70775 + }, + { + "epoch": 0.07129036774266977, + "grad_norm": 12.421319425485658, + "learning_rate": 4.994426168587091e-05, + "loss": 2.698, + "mean_token_accuracy": 0.4172413766384125, + "step": 70780 + }, + { + "epoch": 0.07129540379577394, + "grad_norm": 11.387986991483675, + "learning_rate": 4.9944235323630016e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.3965517282485962, + "step": 70785 + }, + { + "epoch": 0.07130043984887811, + "grad_norm": 9.535196823406435, + "learning_rate": 4.9944208955164134e-05, + "loss": 2.1317, + "mean_token_accuracy": 0.441379314661026, + "step": 70790 + }, + { + "epoch": 0.07130547590198229, + "grad_norm": 14.409028249020343, + "learning_rate": 4.9944182580473264e-05, + "loss": 2.2947, + "mean_token_accuracy": 0.43950392603874205, + "step": 70795 + }, + { + "epoch": 0.07131051195508646, + "grad_norm": 14.52114121439506, + "learning_rate": 4.994415619955742e-05, + "loss": 2.9437, + "mean_token_accuracy": 0.38965516686439516, + "step": 70800 + }, + { + "epoch": 0.07131554800819064, + "grad_norm": 11.357618243921069, + "learning_rate": 4.99441298124166e-05, + "loss": 2.7781, + "mean_token_accuracy": 0.33103448152542114, + "step": 70805 + }, + { + "epoch": 0.07132058406129481, + "grad_norm": 24.778665969770362, + "learning_rate": 4.994410341905082e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.43793103098869324, + "step": 70810 + }, + { + "epoch": 0.07132562011439898, + "grad_norm": 11.029616389724485, + "learning_rate": 4.9944077019460086e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.4206896543502808, + "step": 70815 + }, + { + "epoch": 0.07133065616750316, + "grad_norm": 12.106278635067156, + "learning_rate": 4.99440506136444e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.44137930274009707, + "step": 70820 + }, + { + "epoch": 0.07133569222060733, + "grad_norm": 12.420718495372768, + "learning_rate": 4.9944024201603776e-05, + "loss": 2.3677, + "mean_token_accuracy": 0.4586206912994385, + "step": 70825 + }, + { + "epoch": 0.0713407282737115, + "grad_norm": 11.229742365331946, + "learning_rate": 4.994399778333821e-05, + "loss": 2.9375, + "mean_token_accuracy": 0.38965517580509185, + "step": 70830 + }, + { + "epoch": 0.07134576432681568, + "grad_norm": 15.134934905351564, + "learning_rate": 4.9943971358847726e-05, + "loss": 2.8809, + "mean_token_accuracy": 0.3793103516101837, + "step": 70835 + }, + { + "epoch": 0.07135080037991985, + "grad_norm": 17.92020690498485, + "learning_rate": 4.994394492813232e-05, + "loss": 2.8504, + "mean_token_accuracy": 0.3965517282485962, + "step": 70840 + }, + { + "epoch": 0.07135583643302403, + "grad_norm": 10.241089256154567, + "learning_rate": 4.9943918491192005e-05, + "loss": 2.3839, + "mean_token_accuracy": 0.422202056646347, + "step": 70845 + }, + { + "epoch": 0.07136087248612819, + "grad_norm": 12.034115180949227, + "learning_rate": 4.994389204802678e-05, + "loss": 2.658, + "mean_token_accuracy": 0.39310345649719236, + "step": 70850 + }, + { + "epoch": 0.07136590853923236, + "grad_norm": 11.706252008332417, + "learning_rate": 4.9943865598636665e-05, + "loss": 2.5979, + "mean_token_accuracy": 0.3793103456497192, + "step": 70855 + }, + { + "epoch": 0.07137094459233653, + "grad_norm": 10.101452874172487, + "learning_rate": 4.9943839143021657e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.4206896424293518, + "step": 70860 + }, + { + "epoch": 0.0713759806454407, + "grad_norm": 10.833255491465126, + "learning_rate": 4.9943812681181756e-05, + "loss": 2.2773, + "mean_token_accuracy": 0.4448275864124298, + "step": 70865 + }, + { + "epoch": 0.07138101669854488, + "grad_norm": 16.60778438387288, + "learning_rate": 4.9943786213116996e-05, + "loss": 2.9333, + "mean_token_accuracy": 0.39655172228813174, + "step": 70870 + }, + { + "epoch": 0.07138605275164905, + "grad_norm": 10.331685605483557, + "learning_rate": 4.994375973882736e-05, + "loss": 2.0486, + "mean_token_accuracy": 0.5034482657909394, + "step": 70875 + }, + { + "epoch": 0.07139108880475323, + "grad_norm": 11.627511842995375, + "learning_rate": 4.9943733258312866e-05, + "loss": 2.5898, + "mean_token_accuracy": 0.44482758045196535, + "step": 70880 + }, + { + "epoch": 0.0713961248578574, + "grad_norm": 11.421274344674455, + "learning_rate": 4.994370677157352e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.41724138855934145, + "step": 70885 + }, + { + "epoch": 0.07140116091096158, + "grad_norm": 11.734189405984683, + "learning_rate": 4.994368027860932e-05, + "loss": 2.7348, + "mean_token_accuracy": 0.40139141082763674, + "step": 70890 + }, + { + "epoch": 0.07140619696406575, + "grad_norm": 13.021598803225036, + "learning_rate": 4.994365377942029e-05, + "loss": 2.0335, + "mean_token_accuracy": 0.47973382472991943, + "step": 70895 + }, + { + "epoch": 0.07141123301716992, + "grad_norm": 10.364770088464093, + "learning_rate": 4.9943627274006425e-05, + "loss": 1.9974, + "mean_token_accuracy": 0.4689655125141144, + "step": 70900 + }, + { + "epoch": 0.0714162690702741, + "grad_norm": 13.522544635575718, + "learning_rate": 4.994360076236774e-05, + "loss": 2.8219, + "mean_token_accuracy": 0.4172413766384125, + "step": 70905 + }, + { + "epoch": 0.07142130512337827, + "grad_norm": 11.70887946184419, + "learning_rate": 4.9943574244504235e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.44827585816383364, + "step": 70910 + }, + { + "epoch": 0.07142634117648244, + "grad_norm": 14.234147069994952, + "learning_rate": 4.9943547720415924e-05, + "loss": 3.0879, + "mean_token_accuracy": 0.341379314661026, + "step": 70915 + }, + { + "epoch": 0.0714313772295866, + "grad_norm": 11.432731353608675, + "learning_rate": 4.994352119010281e-05, + "loss": 2.6016, + "mean_token_accuracy": 0.4034482717514038, + "step": 70920 + }, + { + "epoch": 0.07143641328269078, + "grad_norm": 13.259819065600094, + "learning_rate": 4.99434946535649e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4137930989265442, + "step": 70925 + }, + { + "epoch": 0.07144144933579495, + "grad_norm": 10.589519400769717, + "learning_rate": 4.99434681108022e-05, + "loss": 2.5367, + "mean_token_accuracy": 0.42413793206214906, + "step": 70930 + }, + { + "epoch": 0.07144648538889913, + "grad_norm": 10.075779622427428, + "learning_rate": 4.994344156181473e-05, + "loss": 1.9828, + "mean_token_accuracy": 0.5122201979160309, + "step": 70935 + }, + { + "epoch": 0.0714515214420033, + "grad_norm": 19.118272964405886, + "learning_rate": 4.994341500660248e-05, + "loss": 2.301, + "mean_token_accuracy": 0.5, + "step": 70940 + }, + { + "epoch": 0.07145655749510747, + "grad_norm": 11.220535284426372, + "learning_rate": 4.994338844516547e-05, + "loss": 2.6837, + "mean_token_accuracy": 0.3896551728248596, + "step": 70945 + }, + { + "epoch": 0.07146159354821165, + "grad_norm": 11.665190623666764, + "learning_rate": 4.9943361877503704e-05, + "loss": 2.8422, + "mean_token_accuracy": 0.3896551728248596, + "step": 70950 + }, + { + "epoch": 0.07146662960131582, + "grad_norm": 10.681579785810431, + "learning_rate": 4.994333530361718e-05, + "loss": 2.3563, + "mean_token_accuracy": 0.42758620381355283, + "step": 70955 + }, + { + "epoch": 0.07147166565442, + "grad_norm": 11.409474997479817, + "learning_rate": 4.994330872350591e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.441379314661026, + "step": 70960 + }, + { + "epoch": 0.07147670170752417, + "grad_norm": 11.381029927339695, + "learning_rate": 4.9943282137169916e-05, + "loss": 2.1509, + "mean_token_accuracy": 0.4776164650917053, + "step": 70965 + }, + { + "epoch": 0.07148173776062834, + "grad_norm": 12.13730130404337, + "learning_rate": 4.994325554460919e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.4551724076271057, + "step": 70970 + }, + { + "epoch": 0.07148677381373252, + "grad_norm": 10.478375160203383, + "learning_rate": 4.9943228945823745e-05, + "loss": 2.5283, + "mean_token_accuracy": 0.4186932861804962, + "step": 70975 + }, + { + "epoch": 0.07149180986683669, + "grad_norm": 12.937254344908167, + "learning_rate": 4.9943202340813585e-05, + "loss": 2.5381, + "mean_token_accuracy": 0.4172413766384125, + "step": 70980 + }, + { + "epoch": 0.07149684591994086, + "grad_norm": 10.97568586555971, + "learning_rate": 4.9943175729578715e-05, + "loss": 2.6197, + "mean_token_accuracy": 0.3793103456497192, + "step": 70985 + }, + { + "epoch": 0.07150188197304502, + "grad_norm": 11.684862273904743, + "learning_rate": 4.9943149112119156e-05, + "loss": 2.5448, + "mean_token_accuracy": 0.42413793206214906, + "step": 70990 + }, + { + "epoch": 0.0715069180261492, + "grad_norm": 13.791824517154222, + "learning_rate": 4.9943122488434895e-05, + "loss": 3.0424, + "mean_token_accuracy": 0.35607985258102415, + "step": 70995 + }, + { + "epoch": 0.07151195407925337, + "grad_norm": 10.227219959114166, + "learning_rate": 4.994309585852596e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4068965494632721, + "step": 71000 + }, + { + "epoch": 0.07151699013235754, + "grad_norm": 12.65714794969652, + "learning_rate": 4.9943069222392344e-05, + "loss": 2.6511, + "mean_token_accuracy": 0.4000000059604645, + "step": 71005 + }, + { + "epoch": 0.07152202618546172, + "grad_norm": 12.587031453739757, + "learning_rate": 4.9943042580034055e-05, + "loss": 2.7937, + "mean_token_accuracy": 0.3655172407627106, + "step": 71010 + }, + { + "epoch": 0.07152706223856589, + "grad_norm": 10.78725374731294, + "learning_rate": 4.994301593145111e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.3655172407627106, + "step": 71015 + }, + { + "epoch": 0.07153209829167007, + "grad_norm": 11.986500068067201, + "learning_rate": 4.994298927664351e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.4137930989265442, + "step": 71020 + }, + { + "epoch": 0.07153713434477424, + "grad_norm": 13.649569073006116, + "learning_rate": 4.994296261561126e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.3825166344642639, + "step": 71025 + }, + { + "epoch": 0.07154217039787841, + "grad_norm": 9.276241223936314, + "learning_rate": 4.994293594835437e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.3655172407627106, + "step": 71030 + }, + { + "epoch": 0.07154720645098259, + "grad_norm": 12.442769516306736, + "learning_rate": 4.9942909274872854e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.4517241358757019, + "step": 71035 + }, + { + "epoch": 0.07155224250408676, + "grad_norm": 11.340496530123296, + "learning_rate": 4.9942882595166714e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.39310345649719236, + "step": 71040 + }, + { + "epoch": 0.07155727855719093, + "grad_norm": 13.943519110063361, + "learning_rate": 4.994285590923595e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.458620685338974, + "step": 71045 + }, + { + "epoch": 0.07156231461029511, + "grad_norm": 13.184404215926195, + "learning_rate": 4.9942829217080584e-05, + "loss": 2.5661, + "mean_token_accuracy": 0.4137930989265442, + "step": 71050 + }, + { + "epoch": 0.07156735066339928, + "grad_norm": 9.31016052864668, + "learning_rate": 4.994280251870061e-05, + "loss": 2.0516, + "mean_token_accuracy": 0.4862068951129913, + "step": 71055 + }, + { + "epoch": 0.07157238671650344, + "grad_norm": 10.08712730881105, + "learning_rate": 4.9942775814096034e-05, + "loss": 2.3825, + "mean_token_accuracy": 0.43103447556495667, + "step": 71060 + }, + { + "epoch": 0.07157742276960762, + "grad_norm": 9.979294918136405, + "learning_rate": 4.994274910326689e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.3931034505367279, + "step": 71065 + }, + { + "epoch": 0.07158245882271179, + "grad_norm": 12.681070326031314, + "learning_rate": 4.994272238621315e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.41724138855934145, + "step": 71070 + }, + { + "epoch": 0.07158749487581596, + "grad_norm": 11.360753952805513, + "learning_rate": 4.994269566293484e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.4572292804718018, + "step": 71075 + }, + { + "epoch": 0.07159253092892014, + "grad_norm": 11.577007670442573, + "learning_rate": 4.9942668933431966e-05, + "loss": 2.5938, + "mean_token_accuracy": 0.38275861740112305, + "step": 71080 + }, + { + "epoch": 0.07159756698202431, + "grad_norm": 7.992803332204998, + "learning_rate": 4.994264219770454e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.3793103456497192, + "step": 71085 + }, + { + "epoch": 0.07160260303512848, + "grad_norm": 10.838283024964884, + "learning_rate": 4.9942615455752555e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.43103448748588563, + "step": 71090 + }, + { + "epoch": 0.07160763908823266, + "grad_norm": 11.49725009298919, + "learning_rate": 4.994258870757603e-05, + "loss": 2.9394, + "mean_token_accuracy": 0.3517241418361664, + "step": 71095 + }, + { + "epoch": 0.07161267514133683, + "grad_norm": 13.078473539226987, + "learning_rate": 4.994256195317496e-05, + "loss": 2.9376, + "mean_token_accuracy": 0.37241379022598264, + "step": 71100 + }, + { + "epoch": 0.071617711194441, + "grad_norm": 10.559175004260128, + "learning_rate": 4.994253519254938e-05, + "loss": 2.0873, + "mean_token_accuracy": 0.4413793087005615, + "step": 71105 + }, + { + "epoch": 0.07162274724754518, + "grad_norm": 9.903892844234257, + "learning_rate": 4.994250842569927e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4206896543502808, + "step": 71110 + }, + { + "epoch": 0.07162778330064935, + "grad_norm": 11.44775330064501, + "learning_rate": 4.994248165262465e-05, + "loss": 2.7172, + "mean_token_accuracy": 0.36896551847457887, + "step": 71115 + }, + { + "epoch": 0.07163281935375353, + "grad_norm": 12.53561062053418, + "learning_rate": 4.994245487332552e-05, + "loss": 2.8782, + "mean_token_accuracy": 0.3862068891525269, + "step": 71120 + }, + { + "epoch": 0.0716378554068577, + "grad_norm": 10.485570941287847, + "learning_rate": 4.994242808780189e-05, + "loss": 2.4474, + "mean_token_accuracy": 0.4344827651977539, + "step": 71125 + }, + { + "epoch": 0.07164289145996186, + "grad_norm": 11.542010741540413, + "learning_rate": 4.994240129605377e-05, + "loss": 2.5578, + "mean_token_accuracy": 0.39310344457626345, + "step": 71130 + }, + { + "epoch": 0.07164792751306603, + "grad_norm": 15.285713704974636, + "learning_rate": 4.9942374498081164e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.42589232325553894, + "step": 71135 + }, + { + "epoch": 0.07165296356617021, + "grad_norm": 10.05700926700659, + "learning_rate": 4.9942347693884084e-05, + "loss": 2.36, + "mean_token_accuracy": 0.4379310369491577, + "step": 71140 + }, + { + "epoch": 0.07165799961927438, + "grad_norm": 10.907575103528712, + "learning_rate": 4.994232088346253e-05, + "loss": 2.6808, + "mean_token_accuracy": 0.3517241358757019, + "step": 71145 + }, + { + "epoch": 0.07166303567237856, + "grad_norm": 11.229350665402336, + "learning_rate": 4.994229406681653e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.3931034475564957, + "step": 71150 + }, + { + "epoch": 0.07166807172548273, + "grad_norm": 15.889429707573136, + "learning_rate": 4.9942267243946066e-05, + "loss": 2.8523, + "mean_token_accuracy": 0.36896551251411436, + "step": 71155 + }, + { + "epoch": 0.0716731077785869, + "grad_norm": 11.62771780136095, + "learning_rate": 4.994224041485115e-05, + "loss": 2.3217, + "mean_token_accuracy": 0.36896551847457887, + "step": 71160 + }, + { + "epoch": 0.07167814383169108, + "grad_norm": 12.14494475705175, + "learning_rate": 4.9942213579531804e-05, + "loss": 3.1062, + "mean_token_accuracy": 0.3517241358757019, + "step": 71165 + }, + { + "epoch": 0.07168317988479525, + "grad_norm": 9.08373474438411, + "learning_rate": 4.9942186737988026e-05, + "loss": 2.3713, + "mean_token_accuracy": 0.4137930989265442, + "step": 71170 + }, + { + "epoch": 0.07168821593789942, + "grad_norm": 10.565575481143629, + "learning_rate": 4.9942159890219815e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.44137930274009707, + "step": 71175 + }, + { + "epoch": 0.0716932519910036, + "grad_norm": 13.1256046310764, + "learning_rate": 4.9942133036227194e-05, + "loss": 2.1523, + "mean_token_accuracy": 0.441379314661026, + "step": 71180 + }, + { + "epoch": 0.07169828804410777, + "grad_norm": 11.03369307458606, + "learning_rate": 4.994210617601017e-05, + "loss": 2.2698, + "mean_token_accuracy": 0.4310344934463501, + "step": 71185 + }, + { + "epoch": 0.07170332409721195, + "grad_norm": 12.688652136524157, + "learning_rate": 4.9942079309568736e-05, + "loss": 2.2658, + "mean_token_accuracy": 0.48275861144065857, + "step": 71190 + }, + { + "epoch": 0.07170836015031612, + "grad_norm": 11.776628233817998, + "learning_rate": 4.99420524369029e-05, + "loss": 2.204, + "mean_token_accuracy": 0.43793103098869324, + "step": 71195 + }, + { + "epoch": 0.07171339620342028, + "grad_norm": 9.008451077284565, + "learning_rate": 4.994202555801269e-05, + "loss": 2.271, + "mean_token_accuracy": 0.5360837399959564, + "step": 71200 + }, + { + "epoch": 0.07171843225652445, + "grad_norm": 11.893562401900843, + "learning_rate": 4.9941998672898096e-05, + "loss": 2.3008, + "mean_token_accuracy": 0.39655172228813174, + "step": 71205 + }, + { + "epoch": 0.07172346830962863, + "grad_norm": 12.40901272116406, + "learning_rate": 4.994197178155913e-05, + "loss": 2.3866, + "mean_token_accuracy": 0.43103448748588563, + "step": 71210 + }, + { + "epoch": 0.0717285043627328, + "grad_norm": 9.892954597035462, + "learning_rate": 4.99419448839958e-05, + "loss": 2.1194, + "mean_token_accuracy": 0.47241380214691164, + "step": 71215 + }, + { + "epoch": 0.07173354041583697, + "grad_norm": 10.882423306649848, + "learning_rate": 4.994191798020811e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.4344827651977539, + "step": 71220 + }, + { + "epoch": 0.07173857646894115, + "grad_norm": 14.889056478861342, + "learning_rate": 4.994189107019607e-05, + "loss": 2.749, + "mean_token_accuracy": 0.3635813593864441, + "step": 71225 + }, + { + "epoch": 0.07174361252204532, + "grad_norm": 11.704562400583475, + "learning_rate": 4.9941864153959695e-05, + "loss": 2.252, + "mean_token_accuracy": 0.45172414779663084, + "step": 71230 + }, + { + "epoch": 0.0717486485751495, + "grad_norm": 13.005324690076995, + "learning_rate": 4.9941837231498975e-05, + "loss": 2.3403, + "mean_token_accuracy": 0.39655172228813174, + "step": 71235 + }, + { + "epoch": 0.07175368462825367, + "grad_norm": 16.08874533363667, + "learning_rate": 4.994181030281394e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.3965517163276672, + "step": 71240 + }, + { + "epoch": 0.07175872068135784, + "grad_norm": 14.833879327567997, + "learning_rate": 4.9941783367904577e-05, + "loss": 2.3939, + "mean_token_accuracy": 0.4206896543502808, + "step": 71245 + }, + { + "epoch": 0.07176375673446202, + "grad_norm": 14.1276398175455, + "learning_rate": 4.99417564267709e-05, + "loss": 2.9489, + "mean_token_accuracy": 0.3517241418361664, + "step": 71250 + }, + { + "epoch": 0.07176879278756619, + "grad_norm": 12.579352639587158, + "learning_rate": 4.994172947941292e-05, + "loss": 2.4039, + "mean_token_accuracy": 0.4344827711582184, + "step": 71255 + }, + { + "epoch": 0.07177382884067036, + "grad_norm": 11.01249615252546, + "learning_rate": 4.994170252583065e-05, + "loss": 2.4503, + "mean_token_accuracy": 0.44482758045196535, + "step": 71260 + }, + { + "epoch": 0.07177886489377454, + "grad_norm": 11.645856211015182, + "learning_rate": 4.9941675566024077e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.3551724135875702, + "step": 71265 + }, + { + "epoch": 0.0717839009468787, + "grad_norm": 11.937534664044211, + "learning_rate": 4.994164859999323e-05, + "loss": 2.3175, + "mean_token_accuracy": 0.4379310369491577, + "step": 71270 + }, + { + "epoch": 0.07178893699998287, + "grad_norm": 11.645731182485507, + "learning_rate": 4.994162162773811e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.38965516686439516, + "step": 71275 + }, + { + "epoch": 0.07179397305308705, + "grad_norm": 19.20205931515992, + "learning_rate": 4.994159464925872e-05, + "loss": 3.6012, + "mean_token_accuracy": 0.34930428564548494, + "step": 71280 + }, + { + "epoch": 0.07179900910619122, + "grad_norm": 10.947561815680867, + "learning_rate": 4.994156766455507e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.41724138259887694, + "step": 71285 + }, + { + "epoch": 0.0718040451592954, + "grad_norm": 11.203567658609044, + "learning_rate": 4.994154067362716e-05, + "loss": 2.1872, + "mean_token_accuracy": 0.4517241418361664, + "step": 71290 + }, + { + "epoch": 0.07180908121239957, + "grad_norm": 11.384364356111817, + "learning_rate": 4.9941513676475015e-05, + "loss": 2.706, + "mean_token_accuracy": 0.3758620619773865, + "step": 71295 + }, + { + "epoch": 0.07181411726550374, + "grad_norm": 15.956695767848043, + "learning_rate": 4.9941486673098626e-05, + "loss": 2.9547, + "mean_token_accuracy": 0.3310344785451889, + "step": 71300 + }, + { + "epoch": 0.07181915331860791, + "grad_norm": 13.431933856396167, + "learning_rate": 4.994145966349801e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.4570477962493896, + "step": 71305 + }, + { + "epoch": 0.07182418937171209, + "grad_norm": 10.636052524922706, + "learning_rate": 4.9941432647673176e-05, + "loss": 2.2743, + "mean_token_accuracy": 0.42758620977401735, + "step": 71310 + }, + { + "epoch": 0.07182922542481626, + "grad_norm": 11.879931948821524, + "learning_rate": 4.994140562562413e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.40852994918823243, + "step": 71315 + }, + { + "epoch": 0.07183426147792044, + "grad_norm": 11.926525612138873, + "learning_rate": 4.994137859735086e-05, + "loss": 2.9058, + "mean_token_accuracy": 0.37241379618644715, + "step": 71320 + }, + { + "epoch": 0.07183929753102461, + "grad_norm": 19.584321373983283, + "learning_rate": 4.99413515628534e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.42068964838981626, + "step": 71325 + }, + { + "epoch": 0.07184433358412878, + "grad_norm": 12.978709994744785, + "learning_rate": 4.9941324522131744e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.36896551847457887, + "step": 71330 + }, + { + "epoch": 0.07184936963723296, + "grad_norm": 10.848848452406486, + "learning_rate": 4.994129747518591e-05, + "loss": 2.6662, + "mean_token_accuracy": 0.36896551847457887, + "step": 71335 + }, + { + "epoch": 0.07185440569033712, + "grad_norm": 11.046680781075052, + "learning_rate": 4.994127042201589e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.3999999940395355, + "step": 71340 + }, + { + "epoch": 0.07185944174344129, + "grad_norm": 9.852345600411851, + "learning_rate": 4.9941243362621696e-05, + "loss": 2.9423, + "mean_token_accuracy": 0.4000000059604645, + "step": 71345 + }, + { + "epoch": 0.07186447779654546, + "grad_norm": 14.362708099023973, + "learning_rate": 4.9941216297003354e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.41034482717514037, + "step": 71350 + }, + { + "epoch": 0.07186951384964964, + "grad_norm": 12.331388340334398, + "learning_rate": 4.994118922516085e-05, + "loss": 2.4748, + "mean_token_accuracy": 0.4344827473163605, + "step": 71355 + }, + { + "epoch": 0.07187454990275381, + "grad_norm": 10.917560575430834, + "learning_rate": 4.9941162147094195e-05, + "loss": 2.1875, + "mean_token_accuracy": 0.48124622106552123, + "step": 71360 + }, + { + "epoch": 0.07187958595585799, + "grad_norm": 13.38597094905091, + "learning_rate": 4.9941135062803405e-05, + "loss": 3.2665, + "mean_token_accuracy": 0.3275862067937851, + "step": 71365 + }, + { + "epoch": 0.07188462200896216, + "grad_norm": 11.599937822207165, + "learning_rate": 4.994110797228849e-05, + "loss": 2.573, + "mean_token_accuracy": 0.40889292359352114, + "step": 71370 + }, + { + "epoch": 0.07188965806206633, + "grad_norm": 11.539004668726683, + "learning_rate": 4.994108087554944e-05, + "loss": 2.6042, + "mean_token_accuracy": 0.42413792610168455, + "step": 71375 + }, + { + "epoch": 0.07189469411517051, + "grad_norm": 10.027263978393824, + "learning_rate": 4.994105377258627e-05, + "loss": 2.6138, + "mean_token_accuracy": 0.3793103456497192, + "step": 71380 + }, + { + "epoch": 0.07189973016827468, + "grad_norm": 11.420537320879685, + "learning_rate": 4.994102666339899e-05, + "loss": 2.2196, + "mean_token_accuracy": 0.47586206197738645, + "step": 71385 + }, + { + "epoch": 0.07190476622137885, + "grad_norm": 19.034901146450853, + "learning_rate": 4.994099954798761e-05, + "loss": 3.0506, + "mean_token_accuracy": 0.334482753276825, + "step": 71390 + }, + { + "epoch": 0.07190980227448303, + "grad_norm": 14.237850931540196, + "learning_rate": 4.9940972426352135e-05, + "loss": 2.6819, + "mean_token_accuracy": 0.37586206793785093, + "step": 71395 + }, + { + "epoch": 0.0719148383275872, + "grad_norm": 11.061479042848587, + "learning_rate": 4.994094529849257e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.3862069010734558, + "step": 71400 + }, + { + "epoch": 0.07191987438069138, + "grad_norm": 11.396092282928125, + "learning_rate": 4.9940918164408937e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.4413793087005615, + "step": 71405 + }, + { + "epoch": 0.07192491043379554, + "grad_norm": 11.938490827029275, + "learning_rate": 4.994089102410122e-05, + "loss": 2.3523, + "mean_token_accuracy": 0.4241379380226135, + "step": 71410 + }, + { + "epoch": 0.07192994648689971, + "grad_norm": 9.963676438646516, + "learning_rate": 4.994086387756944e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.4672111332416534, + "step": 71415 + }, + { + "epoch": 0.07193498254000388, + "grad_norm": 12.025109602407882, + "learning_rate": 4.994083672481361e-05, + "loss": 2.3229, + "mean_token_accuracy": 0.4465214729309082, + "step": 71420 + }, + { + "epoch": 0.07194001859310806, + "grad_norm": 11.84345575701104, + "learning_rate": 4.9940809565833715e-05, + "loss": 2.1297, + "mean_token_accuracy": 0.4862069070339203, + "step": 71425 + }, + { + "epoch": 0.07194505464621223, + "grad_norm": 11.478245950459932, + "learning_rate": 4.994078240062979e-05, + "loss": 2.4269, + "mean_token_accuracy": 0.3999999940395355, + "step": 71430 + }, + { + "epoch": 0.0719500906993164, + "grad_norm": 11.222448409408251, + "learning_rate": 4.994075522920182e-05, + "loss": 2.7749, + "mean_token_accuracy": 0.37586206793785093, + "step": 71435 + }, + { + "epoch": 0.07195512675242058, + "grad_norm": 14.795736672515089, + "learning_rate": 4.994072805154983e-05, + "loss": 2.1834, + "mean_token_accuracy": 0.4862069010734558, + "step": 71440 + }, + { + "epoch": 0.07196016280552475, + "grad_norm": 11.985102016269979, + "learning_rate": 4.994070086767383e-05, + "loss": 2.8104, + "mean_token_accuracy": 0.3551724135875702, + "step": 71445 + }, + { + "epoch": 0.07196519885862893, + "grad_norm": 11.758987771025575, + "learning_rate": 4.994067367757381e-05, + "loss": 2.4874, + "mean_token_accuracy": 0.36551723480224607, + "step": 71450 + }, + { + "epoch": 0.0719702349117331, + "grad_norm": 16.29002748074399, + "learning_rate": 4.994064648124978e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.42068964838981626, + "step": 71455 + }, + { + "epoch": 0.07197527096483727, + "grad_norm": 14.266394796785562, + "learning_rate": 4.994061927870176e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.4068965554237366, + "step": 71460 + }, + { + "epoch": 0.07198030701794145, + "grad_norm": 10.418399530343343, + "learning_rate": 4.9940592069929745e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.48620688915252686, + "step": 71465 + }, + { + "epoch": 0.07198534307104562, + "grad_norm": 13.556227852654867, + "learning_rate": 4.9940564854933755e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.4125226855278015, + "step": 71470 + }, + { + "epoch": 0.0719903791241498, + "grad_norm": 12.672570721125945, + "learning_rate": 4.994053763371378e-05, + "loss": 2.9103, + "mean_token_accuracy": 0.37241379618644715, + "step": 71475 + }, + { + "epoch": 0.07199541517725395, + "grad_norm": 12.99002147265881, + "learning_rate": 4.994051040626985e-05, + "loss": 2.8115, + "mean_token_accuracy": 0.3620689570903778, + "step": 71480 + }, + { + "epoch": 0.07200045123035813, + "grad_norm": 11.009675239447201, + "learning_rate": 4.994048317260195e-05, + "loss": 2.6494, + "mean_token_accuracy": 0.42068966031074523, + "step": 71485 + }, + { + "epoch": 0.0720054872834623, + "grad_norm": 10.987640962150886, + "learning_rate": 4.9940455932710104e-05, + "loss": 2.7236, + "mean_token_accuracy": 0.3896551728248596, + "step": 71490 + }, + { + "epoch": 0.07201052333656648, + "grad_norm": 13.856438387420134, + "learning_rate": 4.994042868659431e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.4432546854019165, + "step": 71495 + }, + { + "epoch": 0.07201555938967065, + "grad_norm": 13.693493315924862, + "learning_rate": 4.9940401434254585e-05, + "loss": 2.7748, + "mean_token_accuracy": 0.3807622492313385, + "step": 71500 + }, + { + "epoch": 0.07202059544277482, + "grad_norm": 12.42467102695888, + "learning_rate": 4.9940374175690926e-05, + "loss": 2.2863, + "mean_token_accuracy": 0.43793103098869324, + "step": 71505 + }, + { + "epoch": 0.072025631495879, + "grad_norm": 10.613430633949436, + "learning_rate": 4.994034691090335e-05, + "loss": 2.7947, + "mean_token_accuracy": 0.37586206793785093, + "step": 71510 + }, + { + "epoch": 0.07203066754898317, + "grad_norm": 10.89109345361714, + "learning_rate": 4.994031963989185e-05, + "loss": 2.5677, + "mean_token_accuracy": 0.3931034505367279, + "step": 71515 + }, + { + "epoch": 0.07203570360208734, + "grad_norm": 10.50883382429764, + "learning_rate": 4.994029236265646e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.38620689511299133, + "step": 71520 + }, + { + "epoch": 0.07204073965519152, + "grad_norm": 11.048682815706222, + "learning_rate": 4.9940265079197165e-05, + "loss": 3.0293, + "mean_token_accuracy": 0.36896551251411436, + "step": 71525 + }, + { + "epoch": 0.07204577570829569, + "grad_norm": 12.349478738727637, + "learning_rate": 4.994023778951397e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.4103448212146759, + "step": 71530 + }, + { + "epoch": 0.07205081176139987, + "grad_norm": 9.684595661527045, + "learning_rate": 4.9940210493606906e-05, + "loss": 2.4296, + "mean_token_accuracy": 0.3793103486299515, + "step": 71535 + }, + { + "epoch": 0.07205584781450404, + "grad_norm": 11.113699252455035, + "learning_rate": 4.994018319147595e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.45517241954803467, + "step": 71540 + }, + { + "epoch": 0.07206088386760821, + "grad_norm": 10.029020265956625, + "learning_rate": 4.9940155883121135e-05, + "loss": 2.8643, + "mean_token_accuracy": 0.41379310488700866, + "step": 71545 + }, + { + "epoch": 0.07206591992071237, + "grad_norm": 14.717267114791252, + "learning_rate": 4.994012856854246e-05, + "loss": 2.2799, + "mean_token_accuracy": 0.475862056016922, + "step": 71550 + }, + { + "epoch": 0.07207095597381655, + "grad_norm": 20.959088006214035, + "learning_rate": 4.994010124773992e-05, + "loss": 3.2726, + "mean_token_accuracy": 0.40532365441322327, + "step": 71555 + }, + { + "epoch": 0.07207599202692072, + "grad_norm": 10.440547784348782, + "learning_rate": 4.994007392071355e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.42758620381355283, + "step": 71560 + }, + { + "epoch": 0.0720810280800249, + "grad_norm": 11.9672385259416, + "learning_rate": 4.994004658746334e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.3827586233615875, + "step": 71565 + }, + { + "epoch": 0.07208606413312907, + "grad_norm": 12.691108105709239, + "learning_rate": 4.994001924798929e-05, + "loss": 2.7679, + "mean_token_accuracy": 0.37931033968925476, + "step": 71570 + }, + { + "epoch": 0.07209110018623324, + "grad_norm": 11.317498288857527, + "learning_rate": 4.993999190229142e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.44827585816383364, + "step": 71575 + }, + { + "epoch": 0.07209613623933742, + "grad_norm": 12.32279075434726, + "learning_rate": 4.993996455036974e-05, + "loss": 3.2205, + "mean_token_accuracy": 0.3482758581638336, + "step": 71580 + }, + { + "epoch": 0.07210117229244159, + "grad_norm": 15.547267147676845, + "learning_rate": 4.993993719222425e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.4551724135875702, + "step": 71585 + }, + { + "epoch": 0.07210620834554576, + "grad_norm": 11.253616591538012, + "learning_rate": 4.9939909827854946e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.3827586233615875, + "step": 71590 + }, + { + "epoch": 0.07211124439864994, + "grad_norm": 10.969314150306264, + "learning_rate": 4.993988245726186e-05, + "loss": 2.9053, + "mean_token_accuracy": 0.3551724165678024, + "step": 71595 + }, + { + "epoch": 0.07211628045175411, + "grad_norm": 10.175069966923722, + "learning_rate": 4.993985508044499e-05, + "loss": 2.258, + "mean_token_accuracy": 0.4310344815254211, + "step": 71600 + }, + { + "epoch": 0.07212131650485828, + "grad_norm": 11.237158970258017, + "learning_rate": 4.9939827697404346e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.3827586233615875, + "step": 71605 + }, + { + "epoch": 0.07212635255796246, + "grad_norm": 11.049151223250492, + "learning_rate": 4.993980030813993e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.37241379022598264, + "step": 71610 + }, + { + "epoch": 0.07213138861106663, + "grad_norm": 11.38296387544779, + "learning_rate": 4.993977291265176e-05, + "loss": 2.2649, + "mean_token_accuracy": 0.4551724135875702, + "step": 71615 + }, + { + "epoch": 0.07213642466417079, + "grad_norm": 10.538894471367406, + "learning_rate": 4.993974551093981e-05, + "loss": 2.7164, + "mean_token_accuracy": 0.39310344457626345, + "step": 71620 + }, + { + "epoch": 0.07214146071727497, + "grad_norm": 12.113524464550544, + "learning_rate": 4.9939718103004134e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.42413792610168455, + "step": 71625 + }, + { + "epoch": 0.07214649677037914, + "grad_norm": 10.377896812698255, + "learning_rate": 4.993969068884471e-05, + "loss": 2.9003, + "mean_token_accuracy": 0.358620685338974, + "step": 71630 + }, + { + "epoch": 0.07215153282348331, + "grad_norm": 9.37534405713517, + "learning_rate": 4.9939663268461554e-05, + "loss": 2.7102, + "mean_token_accuracy": 0.3655172407627106, + "step": 71635 + }, + { + "epoch": 0.07215656887658749, + "grad_norm": 13.674747681983112, + "learning_rate": 4.9939635841854684e-05, + "loss": 2.248, + "mean_token_accuracy": 0.41379311084747317, + "step": 71640 + }, + { + "epoch": 0.07216160492969166, + "grad_norm": 16.603969991780037, + "learning_rate": 4.9939608409024086e-05, + "loss": 2.907, + "mean_token_accuracy": 0.32413792610168457, + "step": 71645 + }, + { + "epoch": 0.07216664098279583, + "grad_norm": 17.820120120799444, + "learning_rate": 4.993958096996978e-05, + "loss": 2.6106, + "mean_token_accuracy": 0.3827586144208908, + "step": 71650 + }, + { + "epoch": 0.07217167703590001, + "grad_norm": 10.650218917746896, + "learning_rate": 4.993955352469178e-05, + "loss": 2.1474, + "mean_token_accuracy": 0.4724137902259827, + "step": 71655 + }, + { + "epoch": 0.07217671308900418, + "grad_norm": 9.264443813989137, + "learning_rate": 4.993952607319009e-05, + "loss": 2.5643, + "mean_token_accuracy": 0.3931034505367279, + "step": 71660 + }, + { + "epoch": 0.07218174914210836, + "grad_norm": 11.552777484702569, + "learning_rate": 4.993949861546469e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.42758620381355283, + "step": 71665 + }, + { + "epoch": 0.07218678519521253, + "grad_norm": 15.255875301593512, + "learning_rate": 4.993947115151563e-05, + "loss": 2.7141, + "mean_token_accuracy": 0.34137930572032926, + "step": 71670 + }, + { + "epoch": 0.0721918212483167, + "grad_norm": 11.550434574086205, + "learning_rate": 4.99394436813429e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.41379310488700866, + "step": 71675 + }, + { + "epoch": 0.07219685730142088, + "grad_norm": 10.291515343467934, + "learning_rate": 4.99394162049465e-05, + "loss": 2.9086, + "mean_token_accuracy": 0.3324258953332901, + "step": 71680 + }, + { + "epoch": 0.07220189335452505, + "grad_norm": 15.028745392036004, + "learning_rate": 4.993938872232645e-05, + "loss": 2.6665, + "mean_token_accuracy": 0.4137930989265442, + "step": 71685 + }, + { + "epoch": 0.07220692940762921, + "grad_norm": 9.97353187889815, + "learning_rate": 4.993936123348274e-05, + "loss": 2.7902, + "mean_token_accuracy": 0.4448275864124298, + "step": 71690 + }, + { + "epoch": 0.07221196546073338, + "grad_norm": 11.65076030860426, + "learning_rate": 4.99393337384154e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.41222020983695984, + "step": 71695 + }, + { + "epoch": 0.07221700151383756, + "grad_norm": 11.349795190571083, + "learning_rate": 4.993930623712442e-05, + "loss": 2.0942, + "mean_token_accuracy": 0.46551724076271056, + "step": 71700 + }, + { + "epoch": 0.07222203756694173, + "grad_norm": 11.859164930650953, + "learning_rate": 4.9939278729609825e-05, + "loss": 2.1408, + "mean_token_accuracy": 0.49999999403953554, + "step": 71705 + }, + { + "epoch": 0.0722270736200459, + "grad_norm": 10.308000695684758, + "learning_rate": 4.993925121587161e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.4620689630508423, + "step": 71710 + }, + { + "epoch": 0.07223210967315008, + "grad_norm": 11.749737668050528, + "learning_rate": 4.9939223695909775e-05, + "loss": 2.556, + "mean_token_accuracy": 0.4310344815254211, + "step": 71715 + }, + { + "epoch": 0.07223714572625425, + "grad_norm": 11.231502145240697, + "learning_rate": 4.993919616972434e-05, + "loss": 2.7329, + "mean_token_accuracy": 0.37586206793785093, + "step": 71720 + }, + { + "epoch": 0.07224218177935843, + "grad_norm": 14.222976137875554, + "learning_rate": 4.993916863731532e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.4702964246273041, + "step": 71725 + }, + { + "epoch": 0.0722472178324626, + "grad_norm": 11.299418272584601, + "learning_rate": 4.993914109868271e-05, + "loss": 2.6462, + "mean_token_accuracy": 0.4, + "step": 71730 + }, + { + "epoch": 0.07225225388556678, + "grad_norm": 11.102176581359085, + "learning_rate": 4.9939113553826516e-05, + "loss": 2.585, + "mean_token_accuracy": 0.3999999940395355, + "step": 71735 + }, + { + "epoch": 0.07225728993867095, + "grad_norm": 10.325806036488645, + "learning_rate": 4.993908600274675e-05, + "loss": 2.0121, + "mean_token_accuracy": 0.49165154099464414, + "step": 71740 + }, + { + "epoch": 0.07226232599177512, + "grad_norm": 10.160428786610945, + "learning_rate": 4.9939058445443415e-05, + "loss": 2.261, + "mean_token_accuracy": 0.44827585816383364, + "step": 71745 + }, + { + "epoch": 0.0722673620448793, + "grad_norm": 10.12163781549173, + "learning_rate": 4.9939030881916535e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.4482758641242981, + "step": 71750 + }, + { + "epoch": 0.07227239809798347, + "grad_norm": 11.985489712251821, + "learning_rate": 4.9939003312166104e-05, + "loss": 2.5713, + "mean_token_accuracy": 0.39503931999206543, + "step": 71755 + }, + { + "epoch": 0.07227743415108763, + "grad_norm": 7.619634944740773, + "learning_rate": 4.993897573619213e-05, + "loss": 2.3959, + "mean_token_accuracy": 0.44137930274009707, + "step": 71760 + }, + { + "epoch": 0.0722824702041918, + "grad_norm": 12.311164023543075, + "learning_rate": 4.993894815399462e-05, + "loss": 2.727, + "mean_token_accuracy": 0.37241379618644715, + "step": 71765 + }, + { + "epoch": 0.07228750625729598, + "grad_norm": 11.10212120097582, + "learning_rate": 4.993892056557359e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.46551724672317507, + "step": 71770 + }, + { + "epoch": 0.07229254231040015, + "grad_norm": 11.572472619544609, + "learning_rate": 4.993889297092903e-05, + "loss": 2.8596, + "mean_token_accuracy": 0.40344828367233276, + "step": 71775 + }, + { + "epoch": 0.07229757836350433, + "grad_norm": 10.899249628748388, + "learning_rate": 4.9938865370060964e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.448033881187439, + "step": 71780 + }, + { + "epoch": 0.0723026144166085, + "grad_norm": 17.75386818786281, + "learning_rate": 4.9938837762969405e-05, + "loss": 2.6961, + "mean_token_accuracy": 0.4, + "step": 71785 + }, + { + "epoch": 0.07230765046971267, + "grad_norm": 11.16595582573691, + "learning_rate": 4.9938810149654334e-05, + "loss": 2.7256, + "mean_token_accuracy": 0.408711439371109, + "step": 71790 + }, + { + "epoch": 0.07231268652281685, + "grad_norm": 11.250038977207629, + "learning_rate": 4.993878253011579e-05, + "loss": 2.7665, + "mean_token_accuracy": 0.4172413766384125, + "step": 71795 + }, + { + "epoch": 0.07231772257592102, + "grad_norm": 11.269175702498613, + "learning_rate": 4.993875490435375e-05, + "loss": 2.578, + "mean_token_accuracy": 0.3517241418361664, + "step": 71800 + }, + { + "epoch": 0.0723227586290252, + "grad_norm": 12.81919392102948, + "learning_rate": 4.993872727236825e-05, + "loss": 2.6231, + "mean_token_accuracy": 0.3896551728248596, + "step": 71805 + }, + { + "epoch": 0.07232779468212937, + "grad_norm": 10.507017666292143, + "learning_rate": 4.9938699634159284e-05, + "loss": 2.6256, + "mean_token_accuracy": 0.3827586233615875, + "step": 71810 + }, + { + "epoch": 0.07233283073523354, + "grad_norm": 12.620833815019466, + "learning_rate": 4.993867198972686e-05, + "loss": 2.6904, + "mean_token_accuracy": 0.3827586233615875, + "step": 71815 + }, + { + "epoch": 0.07233786678833772, + "grad_norm": 12.29287347086289, + "learning_rate": 4.993864433907099e-05, + "loss": 2.556, + "mean_token_accuracy": 0.46896552443504336, + "step": 71820 + }, + { + "epoch": 0.07234290284144189, + "grad_norm": 10.328125362362801, + "learning_rate": 4.993861668219168e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.42068965137004855, + "step": 71825 + }, + { + "epoch": 0.07234793889454605, + "grad_norm": 10.046770713240933, + "learning_rate": 4.9938589019088924e-05, + "loss": 2.8931, + "mean_token_accuracy": 0.3724137991666794, + "step": 71830 + }, + { + "epoch": 0.07235297494765022, + "grad_norm": 10.040703964946038, + "learning_rate": 4.993856134976275e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4130066573619843, + "step": 71835 + }, + { + "epoch": 0.0723580110007544, + "grad_norm": 12.677489016372917, + "learning_rate": 4.9938533674213154e-05, + "loss": 2.4964, + "mean_token_accuracy": 0.41203871965408323, + "step": 71840 + }, + { + "epoch": 0.07236304705385857, + "grad_norm": 12.679327368113709, + "learning_rate": 4.9938505992440156e-05, + "loss": 2.7582, + "mean_token_accuracy": 0.3551724076271057, + "step": 71845 + }, + { + "epoch": 0.07236808310696274, + "grad_norm": 10.223952074981142, + "learning_rate": 4.9938478304443736e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.41724138259887694, + "step": 71850 + }, + { + "epoch": 0.07237311916006692, + "grad_norm": 10.914424594517488, + "learning_rate": 4.993845061022394e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.34137930274009703, + "step": 71855 + }, + { + "epoch": 0.07237815521317109, + "grad_norm": 13.997772289838162, + "learning_rate": 4.993842290978074e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.38965516686439516, + "step": 71860 + }, + { + "epoch": 0.07238319126627527, + "grad_norm": 10.442603433876926, + "learning_rate": 4.9938395203114166e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.4030248045921326, + "step": 71865 + }, + { + "epoch": 0.07238822731937944, + "grad_norm": 12.634259999590816, + "learning_rate": 4.993836749022423e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.46073804795742035, + "step": 71870 + }, + { + "epoch": 0.07239326337248361, + "grad_norm": 10.207431223287152, + "learning_rate": 4.9938339771110914e-05, + "loss": 2.7267, + "mean_token_accuracy": 0.3655172407627106, + "step": 71875 + }, + { + "epoch": 0.07239829942558779, + "grad_norm": 13.36800365443819, + "learning_rate": 4.9938312045774245e-05, + "loss": 2.9456, + "mean_token_accuracy": 0.41724138259887694, + "step": 71880 + }, + { + "epoch": 0.07240333547869196, + "grad_norm": 10.166108518841511, + "learning_rate": 4.9938284314214236e-05, + "loss": 2.429, + "mean_token_accuracy": 0.4517241299152374, + "step": 71885 + }, + { + "epoch": 0.07240837153179613, + "grad_norm": 11.45276519937559, + "learning_rate": 4.9938256576430866e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.441379314661026, + "step": 71890 + }, + { + "epoch": 0.07241340758490031, + "grad_norm": 11.990444026856164, + "learning_rate": 4.9938228832424176e-05, + "loss": 2.5159, + "mean_token_accuracy": 0.4413793087005615, + "step": 71895 + }, + { + "epoch": 0.07241844363800447, + "grad_norm": 11.892029101980777, + "learning_rate": 4.9938201082194165e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.4310344815254211, + "step": 71900 + }, + { + "epoch": 0.07242347969110864, + "grad_norm": 12.787725198137968, + "learning_rate": 4.9938173325740826e-05, + "loss": 2.5388, + "mean_token_accuracy": 0.38620689511299133, + "step": 71905 + }, + { + "epoch": 0.07242851574421282, + "grad_norm": 10.912425670937424, + "learning_rate": 4.993814556306417e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.44211822748184204, + "step": 71910 + }, + { + "epoch": 0.07243355179731699, + "grad_norm": 11.389933788171122, + "learning_rate": 4.9938117794164216e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.38620689511299133, + "step": 71915 + }, + { + "epoch": 0.07243858785042116, + "grad_norm": 11.717433617948556, + "learning_rate": 4.993809001904097e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.3517241358757019, + "step": 71920 + }, + { + "epoch": 0.07244362390352534, + "grad_norm": 10.261877036974804, + "learning_rate": 4.993806223769443e-05, + "loss": 2.1946, + "mean_token_accuracy": 0.4758620738983154, + "step": 71925 + }, + { + "epoch": 0.07244865995662951, + "grad_norm": 9.539748662534658, + "learning_rate": 4.993803445012462e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.4689655065536499, + "step": 71930 + }, + { + "epoch": 0.07245369600973368, + "grad_norm": 11.76595595517886, + "learning_rate": 4.9938006656331534e-05, + "loss": 2.2152, + "mean_token_accuracy": 0.4241379201412201, + "step": 71935 + }, + { + "epoch": 0.07245873206283786, + "grad_norm": 15.278631243674903, + "learning_rate": 4.993797885631517e-05, + "loss": 2.6372, + "mean_token_accuracy": 0.3793103456497192, + "step": 71940 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 11.014822693461545, + "learning_rate": 4.993795105007556e-05, + "loss": 2.246, + "mean_token_accuracy": 0.42758620977401735, + "step": 71945 + }, + { + "epoch": 0.0724688041690462, + "grad_norm": 10.971033396490107, + "learning_rate": 4.99379232376127e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.441379314661026, + "step": 71950 + }, + { + "epoch": 0.07247384022215038, + "grad_norm": 14.161680313459602, + "learning_rate": 4.993789541892661e-05, + "loss": 2.157, + "mean_token_accuracy": 0.4429521977901459, + "step": 71955 + }, + { + "epoch": 0.07247887627525455, + "grad_norm": 14.92910822459536, + "learning_rate": 4.9937867594017265e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.4049606740474701, + "step": 71960 + }, + { + "epoch": 0.07248391232835873, + "grad_norm": 11.418711023117943, + "learning_rate": 4.99378397628847e-05, + "loss": 2.2859, + "mean_token_accuracy": 0.44295220971107485, + "step": 71965 + }, + { + "epoch": 0.07248894838146289, + "grad_norm": 10.657142537386534, + "learning_rate": 4.993781192552892e-05, + "loss": 2.3025, + "mean_token_accuracy": 0.4413793087005615, + "step": 71970 + }, + { + "epoch": 0.07249398443456706, + "grad_norm": 14.876535604831842, + "learning_rate": 4.993778408194993e-05, + "loss": 2.478, + "mean_token_accuracy": 0.4137930989265442, + "step": 71975 + }, + { + "epoch": 0.07249902048767123, + "grad_norm": 11.088488940009128, + "learning_rate": 4.993775623214773e-05, + "loss": 2.6126, + "mean_token_accuracy": 0.3896551728248596, + "step": 71980 + }, + { + "epoch": 0.07250405654077541, + "grad_norm": 11.134606476520943, + "learning_rate": 4.993772837612234e-05, + "loss": 2.8547, + "mean_token_accuracy": 0.36896551847457887, + "step": 71985 + }, + { + "epoch": 0.07250909259387958, + "grad_norm": 11.605691479055297, + "learning_rate": 4.993770051387377e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.4421182215213776, + "step": 71990 + }, + { + "epoch": 0.07251412864698376, + "grad_norm": 12.938916871274403, + "learning_rate": 4.9937672645402004e-05, + "loss": 2.6485, + "mean_token_accuracy": 0.38620689511299133, + "step": 71995 + }, + { + "epoch": 0.07251916470008793, + "grad_norm": 13.305607967632266, + "learning_rate": 4.9937644770707074e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.3586206942796707, + "step": 72000 + }, + { + "epoch": 0.0725242007531921, + "grad_norm": 11.999243140979976, + "learning_rate": 4.993761688978898e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.3946763455867767, + "step": 72005 + }, + { + "epoch": 0.07252923680629628, + "grad_norm": 12.661554151121283, + "learning_rate": 4.993758900264773e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.37586206793785093, + "step": 72010 + }, + { + "epoch": 0.07253427285940045, + "grad_norm": 12.773481139331302, + "learning_rate": 4.9937561109283325e-05, + "loss": 2.3902, + "mean_token_accuracy": 0.40199636816978457, + "step": 72015 + }, + { + "epoch": 0.07253930891250462, + "grad_norm": 9.119114489379617, + "learning_rate": 4.9937533209695784e-05, + "loss": 2.5232, + "mean_token_accuracy": 0.4275862157344818, + "step": 72020 + }, + { + "epoch": 0.0725443449656088, + "grad_norm": 13.102672201812354, + "learning_rate": 4.993750530388511e-05, + "loss": 2.9672, + "mean_token_accuracy": 0.3689655244350433, + "step": 72025 + }, + { + "epoch": 0.07254938101871297, + "grad_norm": 13.502906922192079, + "learning_rate": 4.993747739185131e-05, + "loss": 2.7273, + "mean_token_accuracy": 0.37586206793785093, + "step": 72030 + }, + { + "epoch": 0.07255441707181715, + "grad_norm": 10.931625583675523, + "learning_rate": 4.993744947359439e-05, + "loss": 2.6936, + "mean_token_accuracy": 0.43448275327682495, + "step": 72035 + }, + { + "epoch": 0.0725594531249213, + "grad_norm": 10.58375945266393, + "learning_rate": 4.993742154911437e-05, + "loss": 2.7918, + "mean_token_accuracy": 0.42758620977401735, + "step": 72040 + }, + { + "epoch": 0.07256448917802548, + "grad_norm": 9.376099288205099, + "learning_rate": 4.993739361841123e-05, + "loss": 3.1053, + "mean_token_accuracy": 0.3482758581638336, + "step": 72045 + }, + { + "epoch": 0.07256952523112965, + "grad_norm": 11.821425431240204, + "learning_rate": 4.9937365681485004e-05, + "loss": 2.5655, + "mean_token_accuracy": 0.38620689511299133, + "step": 72050 + }, + { + "epoch": 0.07257456128423383, + "grad_norm": 10.043402433289465, + "learning_rate": 4.993733773833569e-05, + "loss": 1.8754, + "mean_token_accuracy": 0.4952813029289246, + "step": 72055 + }, + { + "epoch": 0.072579597337338, + "grad_norm": 9.713557864727024, + "learning_rate": 4.9937309788963294e-05, + "loss": 2.9376, + "mean_token_accuracy": 0.36206896901130675, + "step": 72060 + }, + { + "epoch": 0.07258463339044217, + "grad_norm": 10.492289895225737, + "learning_rate": 4.9937281833367834e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.42068966031074523, + "step": 72065 + }, + { + "epoch": 0.07258966944354635, + "grad_norm": 11.964340224084317, + "learning_rate": 4.99372538715493e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4172413766384125, + "step": 72070 + }, + { + "epoch": 0.07259470549665052, + "grad_norm": 11.833406691465733, + "learning_rate": 4.993722590350772e-05, + "loss": 2.221, + "mean_token_accuracy": 0.41034482717514037, + "step": 72075 + }, + { + "epoch": 0.0725997415497547, + "grad_norm": 9.568507002032092, + "learning_rate": 4.9937197929243084e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.49491834044456484, + "step": 72080 + }, + { + "epoch": 0.07260477760285887, + "grad_norm": 11.615251721085219, + "learning_rate": 4.9937169948755406e-05, + "loss": 2.2385, + "mean_token_accuracy": 0.4517241358757019, + "step": 72085 + }, + { + "epoch": 0.07260981365596304, + "grad_norm": 13.639964199022765, + "learning_rate": 4.9937141962044706e-05, + "loss": 2.7596, + "mean_token_accuracy": 0.37241379618644715, + "step": 72090 + }, + { + "epoch": 0.07261484970906722, + "grad_norm": 11.953674673718709, + "learning_rate": 4.993711396911096e-05, + "loss": 2.8162, + "mean_token_accuracy": 0.3862068891525269, + "step": 72095 + }, + { + "epoch": 0.07261988576217139, + "grad_norm": 14.500681391463356, + "learning_rate": 4.9937085969954225e-05, + "loss": 2.5205, + "mean_token_accuracy": 0.47931034564971925, + "step": 72100 + }, + { + "epoch": 0.07262492181527556, + "grad_norm": 11.1343364020398, + "learning_rate": 4.9937057964574465e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.37586207389831544, + "step": 72105 + }, + { + "epoch": 0.07262995786837972, + "grad_norm": 11.745569630947235, + "learning_rate": 4.99370299529717e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.4068965554237366, + "step": 72110 + }, + { + "epoch": 0.0726349939214839, + "grad_norm": 12.516530776257065, + "learning_rate": 4.993700193514595e-05, + "loss": 2.7408, + "mean_token_accuracy": 0.3517241358757019, + "step": 72115 + }, + { + "epoch": 0.07264002997458807, + "grad_norm": 9.915700377416577, + "learning_rate": 4.993697391109721e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.41034482717514037, + "step": 72120 + }, + { + "epoch": 0.07264506602769225, + "grad_norm": 10.115278657277177, + "learning_rate": 4.993694588082549e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.42413793206214906, + "step": 72125 + }, + { + "epoch": 0.07265010208079642, + "grad_norm": 12.783923515468887, + "learning_rate": 4.9936917844330796e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.4298850655555725, + "step": 72130 + }, + { + "epoch": 0.07265513813390059, + "grad_norm": 10.562461824078984, + "learning_rate": 4.993688980161315e-05, + "loss": 2.8101, + "mean_token_accuracy": 0.4034482717514038, + "step": 72135 + }, + { + "epoch": 0.07266017418700477, + "grad_norm": 11.484376494793619, + "learning_rate": 4.993686175267254e-05, + "loss": 2.8034, + "mean_token_accuracy": 0.37241379618644715, + "step": 72140 + }, + { + "epoch": 0.07266521024010894, + "grad_norm": 10.654652045444353, + "learning_rate": 4.993683369750899e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.47241380214691164, + "step": 72145 + }, + { + "epoch": 0.07267024629321311, + "grad_norm": 11.001404227402434, + "learning_rate": 4.9936805636122495e-05, + "loss": 2.5194, + "mean_token_accuracy": 0.3896551728248596, + "step": 72150 + }, + { + "epoch": 0.07267528234631729, + "grad_norm": 12.64844526512769, + "learning_rate": 4.9936777568513074e-05, + "loss": 2.7745, + "mean_token_accuracy": 0.41034482717514037, + "step": 72155 + }, + { + "epoch": 0.07268031839942146, + "grad_norm": 14.115372997041845, + "learning_rate": 4.9936749494680726e-05, + "loss": 2.5279, + "mean_token_accuracy": 0.41034482717514037, + "step": 72160 + }, + { + "epoch": 0.07268535445252564, + "grad_norm": 14.11745542487036, + "learning_rate": 4.993672141462545e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.40689654350280763, + "step": 72165 + }, + { + "epoch": 0.07269039050562981, + "grad_norm": 10.935087388866803, + "learning_rate": 4.993669332834728e-05, + "loss": 2.6153, + "mean_token_accuracy": 0.4413793087005615, + "step": 72170 + }, + { + "epoch": 0.07269542655873398, + "grad_norm": 11.459747297695882, + "learning_rate": 4.9936665235846204e-05, + "loss": 2.1953, + "mean_token_accuracy": 0.4413793087005615, + "step": 72175 + }, + { + "epoch": 0.07270046261183814, + "grad_norm": 10.797975898809698, + "learning_rate": 4.9936637137122234e-05, + "loss": 2.3074, + "mean_token_accuracy": 0.4206896543502808, + "step": 72180 + }, + { + "epoch": 0.07270549866494232, + "grad_norm": 9.927769965576273, + "learning_rate": 4.9936609032175384e-05, + "loss": 2.656, + "mean_token_accuracy": 0.36551723480224607, + "step": 72185 + }, + { + "epoch": 0.07271053471804649, + "grad_norm": 9.9366059119274, + "learning_rate": 4.9936580921005653e-05, + "loss": 2.2896, + "mean_token_accuracy": 0.3862069010734558, + "step": 72190 + }, + { + "epoch": 0.07271557077115066, + "grad_norm": 10.182686578031506, + "learning_rate": 4.993655280361306e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.3931034475564957, + "step": 72195 + }, + { + "epoch": 0.07272060682425484, + "grad_norm": 11.362470730104189, + "learning_rate": 4.9936524679997594e-05, + "loss": 2.7519, + "mean_token_accuracy": 0.3655172407627106, + "step": 72200 + }, + { + "epoch": 0.07272564287735901, + "grad_norm": 12.85851822140653, + "learning_rate": 4.9936496550159284e-05, + "loss": 2.4645, + "mean_token_accuracy": 0.4034482777118683, + "step": 72205 + }, + { + "epoch": 0.07273067893046319, + "grad_norm": 8.800501259946556, + "learning_rate": 4.993646841409812e-05, + "loss": 2.6502, + "mean_token_accuracy": 0.42571083307266233, + "step": 72210 + }, + { + "epoch": 0.07273571498356736, + "grad_norm": 10.211404845777034, + "learning_rate": 4.9936440271814114e-05, + "loss": 2.3605, + "mean_token_accuracy": 0.4137930929660797, + "step": 72215 + }, + { + "epoch": 0.07274075103667153, + "grad_norm": 17.08572072247456, + "learning_rate": 4.9936412123307294e-05, + "loss": 2.6309, + "mean_token_accuracy": 0.42413793206214906, + "step": 72220 + }, + { + "epoch": 0.07274578708977571, + "grad_norm": 11.839789272502871, + "learning_rate": 4.993638396857764e-05, + "loss": 2.8775, + "mean_token_accuracy": 0.3620689630508423, + "step": 72225 + }, + { + "epoch": 0.07275082314287988, + "grad_norm": 26.127949241353303, + "learning_rate": 4.993635580762517e-05, + "loss": 2.4665, + "mean_token_accuracy": 0.4, + "step": 72230 + }, + { + "epoch": 0.07275585919598405, + "grad_norm": 10.570506277354669, + "learning_rate": 4.99363276404499e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.4, + "step": 72235 + }, + { + "epoch": 0.07276089524908823, + "grad_norm": 10.065490456879969, + "learning_rate": 4.993629946705182e-05, + "loss": 2.7251, + "mean_token_accuracy": 0.4103448212146759, + "step": 72240 + }, + { + "epoch": 0.0727659313021924, + "grad_norm": 18.147303802028727, + "learning_rate": 4.993627128743097e-05, + "loss": 2.3385, + "mean_token_accuracy": 0.42068966031074523, + "step": 72245 + }, + { + "epoch": 0.07277096735529656, + "grad_norm": 10.749321348813726, + "learning_rate": 4.993624310158731e-05, + "loss": 2.4098, + "mean_token_accuracy": 0.3655172407627106, + "step": 72250 + }, + { + "epoch": 0.07277600340840074, + "grad_norm": 24.404530509693284, + "learning_rate": 4.993621490952089e-05, + "loss": 2.2571, + "mean_token_accuracy": 0.43793103098869324, + "step": 72255 + }, + { + "epoch": 0.07278103946150491, + "grad_norm": 11.801749490094599, + "learning_rate": 4.99361867112317e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.42068966031074523, + "step": 72260 + }, + { + "epoch": 0.07278607551460908, + "grad_norm": 17.657090311227172, + "learning_rate": 4.993615850671975e-05, + "loss": 2.6583, + "mean_token_accuracy": 0.42413793206214906, + "step": 72265 + }, + { + "epoch": 0.07279111156771326, + "grad_norm": 12.306756401687004, + "learning_rate": 4.9936130295985045e-05, + "loss": 2.3396, + "mean_token_accuracy": 0.4482758641242981, + "step": 72270 + }, + { + "epoch": 0.07279614762081743, + "grad_norm": 14.534504993578793, + "learning_rate": 4.9936102079027595e-05, + "loss": 2.618, + "mean_token_accuracy": 0.38275861740112305, + "step": 72275 + }, + { + "epoch": 0.0728011836739216, + "grad_norm": 10.073668040250075, + "learning_rate": 4.9936073855847407e-05, + "loss": 3.0102, + "mean_token_accuracy": 0.4137930989265442, + "step": 72280 + }, + { + "epoch": 0.07280621972702578, + "grad_norm": 14.749881275112061, + "learning_rate": 4.9936045626444494e-05, + "loss": 2.8117, + "mean_token_accuracy": 0.4413793087005615, + "step": 72285 + }, + { + "epoch": 0.07281125578012995, + "grad_norm": 9.835752814550867, + "learning_rate": 4.993601739081886e-05, + "loss": 2.3581, + "mean_token_accuracy": 0.42068966031074523, + "step": 72290 + }, + { + "epoch": 0.07281629183323413, + "grad_norm": 18.30930332703122, + "learning_rate": 4.993598914897051e-05, + "loss": 2.6393, + "mean_token_accuracy": 0.40344826579093934, + "step": 72295 + }, + { + "epoch": 0.0728213278863383, + "grad_norm": 10.852575199357078, + "learning_rate": 4.993596090089946e-05, + "loss": 2.9509, + "mean_token_accuracy": 0.37241379618644715, + "step": 72300 + }, + { + "epoch": 0.07282636393944247, + "grad_norm": 9.510306256276776, + "learning_rate": 4.993593264660571e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.3827586233615875, + "step": 72305 + }, + { + "epoch": 0.07283139999254665, + "grad_norm": 9.593495084847603, + "learning_rate": 4.993590438608927e-05, + "loss": 3.006, + "mean_token_accuracy": 0.4068965554237366, + "step": 72310 + }, + { + "epoch": 0.07283643604565082, + "grad_norm": 12.223821199916264, + "learning_rate": 4.9935876119350146e-05, + "loss": 2.6485, + "mean_token_accuracy": 0.36551724672317504, + "step": 72315 + }, + { + "epoch": 0.07284147209875498, + "grad_norm": 12.001795356391348, + "learning_rate": 4.993584784638835e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.42262553572654726, + "step": 72320 + }, + { + "epoch": 0.07284650815185915, + "grad_norm": 9.460677259005962, + "learning_rate": 4.993581956720389e-05, + "loss": 2.3558, + "mean_token_accuracy": 0.41379310488700866, + "step": 72325 + }, + { + "epoch": 0.07285154420496333, + "grad_norm": 9.705487608003441, + "learning_rate": 4.9935791281796775e-05, + "loss": 2.1228, + "mean_token_accuracy": 0.45021172761917116, + "step": 72330 + }, + { + "epoch": 0.0728565802580675, + "grad_norm": 15.311335958846053, + "learning_rate": 4.9935762990167e-05, + "loss": 2.5427, + "mean_token_accuracy": 0.42413792610168455, + "step": 72335 + }, + { + "epoch": 0.07286161631117168, + "grad_norm": 11.397359013493828, + "learning_rate": 4.993573469231458e-05, + "loss": 2.5952, + "mean_token_accuracy": 0.42758620381355283, + "step": 72340 + }, + { + "epoch": 0.07286665236427585, + "grad_norm": 13.633317444586888, + "learning_rate": 4.993570638823954e-05, + "loss": 2.3747, + "mean_token_accuracy": 0.4532970368862152, + "step": 72345 + }, + { + "epoch": 0.07287168841738002, + "grad_norm": 10.12042932312706, + "learning_rate": 4.993567807794186e-05, + "loss": 2.1829, + "mean_token_accuracy": 0.48275862336158754, + "step": 72350 + }, + { + "epoch": 0.0728767244704842, + "grad_norm": 12.510523246293966, + "learning_rate": 4.993564976142157e-05, + "loss": 3.4516, + "mean_token_accuracy": 0.32068965435028074, + "step": 72355 + }, + { + "epoch": 0.07288176052358837, + "grad_norm": 10.86703246565231, + "learning_rate": 4.993562143867866e-05, + "loss": 2.489, + "mean_token_accuracy": 0.3896551787853241, + "step": 72360 + }, + { + "epoch": 0.07288679657669254, + "grad_norm": 11.620403511029545, + "learning_rate": 4.993559310971316e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.43647912740707395, + "step": 72365 + }, + { + "epoch": 0.07289183262979672, + "grad_norm": 10.292147022120481, + "learning_rate": 4.993556477452505e-05, + "loss": 2.181, + "mean_token_accuracy": 0.43623715043067934, + "step": 72370 + }, + { + "epoch": 0.07289686868290089, + "grad_norm": 8.893932292214556, + "learning_rate": 4.993553643311437e-05, + "loss": 1.9211, + "mean_token_accuracy": 0.5310344755649566, + "step": 72375 + }, + { + "epoch": 0.07290190473600507, + "grad_norm": 12.720922187810377, + "learning_rate": 4.993550808548109e-05, + "loss": 2.5298, + "mean_token_accuracy": 0.3896551638841629, + "step": 72380 + }, + { + "epoch": 0.07290694078910924, + "grad_norm": 11.61154605670652, + "learning_rate": 4.993547973162525e-05, + "loss": 2.4601, + "mean_token_accuracy": 0.40344828367233276, + "step": 72385 + }, + { + "epoch": 0.0729119768422134, + "grad_norm": 10.985398217705818, + "learning_rate": 4.9935451371546844e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.44137930274009707, + "step": 72390 + }, + { + "epoch": 0.07291701289531757, + "grad_norm": 11.172170055961857, + "learning_rate": 4.993542300524588e-05, + "loss": 2.6228, + "mean_token_accuracy": 0.37241379618644715, + "step": 72395 + }, + { + "epoch": 0.07292204894842175, + "grad_norm": 16.236557106760667, + "learning_rate": 4.993539463272236e-05, + "loss": 2.7382, + "mean_token_accuracy": 0.38620689511299133, + "step": 72400 + }, + { + "epoch": 0.07292708500152592, + "grad_norm": 12.407548622779077, + "learning_rate": 4.9935366253976314e-05, + "loss": 2.2455, + "mean_token_accuracy": 0.44482758045196535, + "step": 72405 + }, + { + "epoch": 0.0729321210546301, + "grad_norm": 11.202663897107664, + "learning_rate": 4.993533786900773e-05, + "loss": 2.1573, + "mean_token_accuracy": 0.4673926293849945, + "step": 72410 + }, + { + "epoch": 0.07293715710773427, + "grad_norm": 12.91132643619679, + "learning_rate": 4.993530947781662e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.4310344815254211, + "step": 72415 + }, + { + "epoch": 0.07294219316083844, + "grad_norm": 12.97893000198072, + "learning_rate": 4.9935281080403e-05, + "loss": 2.8238, + "mean_token_accuracy": 0.3517241358757019, + "step": 72420 + }, + { + "epoch": 0.07294722921394262, + "grad_norm": 10.834034842587899, + "learning_rate": 4.993525267676686e-05, + "loss": 2.462, + "mean_token_accuracy": 0.36896551847457887, + "step": 72425 + }, + { + "epoch": 0.07295226526704679, + "grad_norm": 10.0147594962151, + "learning_rate": 4.993522426690823e-05, + "loss": 2.8127, + "mean_token_accuracy": 0.3999999940395355, + "step": 72430 + }, + { + "epoch": 0.07295730132015096, + "grad_norm": 10.293541524180158, + "learning_rate": 4.99351958508271e-05, + "loss": 2.6511, + "mean_token_accuracy": 0.3999999940395355, + "step": 72435 + }, + { + "epoch": 0.07296233737325514, + "grad_norm": 10.250687016878397, + "learning_rate": 4.993516742852349e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.3827586203813553, + "step": 72440 + }, + { + "epoch": 0.07296737342635931, + "grad_norm": 10.448315549117739, + "learning_rate": 4.99351389999974e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.4344827592372894, + "step": 72445 + }, + { + "epoch": 0.07297240947946348, + "grad_norm": 13.210731001332078, + "learning_rate": 4.993511056524883e-05, + "loss": 2.4359, + "mean_token_accuracy": 0.46551724672317507, + "step": 72450 + }, + { + "epoch": 0.07297744553256766, + "grad_norm": 11.238519528966805, + "learning_rate": 4.993508212427781e-05, + "loss": 2.4323, + "mean_token_accuracy": 0.4572292804718018, + "step": 72455 + }, + { + "epoch": 0.07298248158567182, + "grad_norm": 10.99616577918631, + "learning_rate": 4.993505367708433e-05, + "loss": 2.4446, + "mean_token_accuracy": 0.42758620381355283, + "step": 72460 + }, + { + "epoch": 0.07298751763877599, + "grad_norm": 10.712084970411805, + "learning_rate": 4.993502522366841e-05, + "loss": 2.7277, + "mean_token_accuracy": 0.4379310369491577, + "step": 72465 + }, + { + "epoch": 0.07299255369188017, + "grad_norm": 16.203551666641417, + "learning_rate": 4.993499676403006e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.42413793206214906, + "step": 72470 + }, + { + "epoch": 0.07299758974498434, + "grad_norm": 13.12164136223451, + "learning_rate": 4.9934968298169266e-05, + "loss": 2.387, + "mean_token_accuracy": 0.4103448331356049, + "step": 72475 + }, + { + "epoch": 0.07300262579808851, + "grad_norm": 9.449511741401048, + "learning_rate": 4.9934939826086055e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.42413793206214906, + "step": 72480 + }, + { + "epoch": 0.07300766185119269, + "grad_norm": 10.526415801719274, + "learning_rate": 4.993491134778043e-05, + "loss": 2.9463, + "mean_token_accuracy": 0.3344827562570572, + "step": 72485 + }, + { + "epoch": 0.07301269790429686, + "grad_norm": 9.042886627410118, + "learning_rate": 4.9934882863252394e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.4052026629447937, + "step": 72490 + }, + { + "epoch": 0.07301773395740103, + "grad_norm": 14.822220398822482, + "learning_rate": 4.9934854372501966e-05, + "loss": 2.4732, + "mean_token_accuracy": 0.4155474901199341, + "step": 72495 + }, + { + "epoch": 0.07302277001050521, + "grad_norm": 12.682939884496433, + "learning_rate": 4.9934825875529145e-05, + "loss": 3.0513, + "mean_token_accuracy": 0.35862069129943847, + "step": 72500 + }, + { + "epoch": 0.07302780606360938, + "grad_norm": 9.793739351461229, + "learning_rate": 4.993479737233394e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.44482758045196535, + "step": 72505 + }, + { + "epoch": 0.07303284211671356, + "grad_norm": 13.67818198823798, + "learning_rate": 4.993476886291636e-05, + "loss": 2.8014, + "mean_token_accuracy": 0.37241379618644715, + "step": 72510 + }, + { + "epoch": 0.07303787816981773, + "grad_norm": 12.882179464599904, + "learning_rate": 4.993474034727642e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.4091349124908447, + "step": 72515 + }, + { + "epoch": 0.0730429142229219, + "grad_norm": 13.22214372743453, + "learning_rate": 4.99347118254141e-05, + "loss": 3.0196, + "mean_token_accuracy": 0.3551724135875702, + "step": 72520 + }, + { + "epoch": 0.07304795027602608, + "grad_norm": 10.797604812921078, + "learning_rate": 4.993468329732945e-05, + "loss": 2.773, + "mean_token_accuracy": 0.3896551698446274, + "step": 72525 + }, + { + "epoch": 0.07305298632913024, + "grad_norm": 14.85796495571364, + "learning_rate": 4.9934654763022447e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.4068965494632721, + "step": 72530 + }, + { + "epoch": 0.07305802238223441, + "grad_norm": 12.887502078811295, + "learning_rate": 4.993462622249312e-05, + "loss": 2.773, + "mean_token_accuracy": 0.38275861740112305, + "step": 72535 + }, + { + "epoch": 0.07306305843533858, + "grad_norm": 14.42940983692418, + "learning_rate": 4.993459767574146e-05, + "loss": 2.6287, + "mean_token_accuracy": 0.3910465776920319, + "step": 72540 + }, + { + "epoch": 0.07306809448844276, + "grad_norm": 12.414631158400018, + "learning_rate": 4.9934569122767475e-05, + "loss": 2.5637, + "mean_token_accuracy": 0.4379310369491577, + "step": 72545 + }, + { + "epoch": 0.07307313054154693, + "grad_norm": 16.570800641473394, + "learning_rate": 4.9934540563571187e-05, + "loss": 2.4767, + "mean_token_accuracy": 0.44307319521903993, + "step": 72550 + }, + { + "epoch": 0.0730781665946511, + "grad_norm": 10.08503365826091, + "learning_rate": 4.993451199815258e-05, + "loss": 2.0344, + "mean_token_accuracy": 0.49479734897613525, + "step": 72555 + }, + { + "epoch": 0.07308320264775528, + "grad_norm": 16.5182442673272, + "learning_rate": 4.99344834265117e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.41724138259887694, + "step": 72560 + }, + { + "epoch": 0.07308823870085945, + "grad_norm": 14.428776681431172, + "learning_rate": 4.993445484864851e-05, + "loss": 2.4657, + "mean_token_accuracy": 0.41379310488700866, + "step": 72565 + }, + { + "epoch": 0.07309327475396363, + "grad_norm": 12.09529962967115, + "learning_rate": 4.9934426264563055e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.4620689630508423, + "step": 72570 + }, + { + "epoch": 0.0730983108070678, + "grad_norm": 9.860982635321337, + "learning_rate": 4.993439767425532e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.41034482717514037, + "step": 72575 + }, + { + "epoch": 0.07310334686017197, + "grad_norm": 9.620055548909258, + "learning_rate": 4.993436907772533e-05, + "loss": 2.1648, + "mean_token_accuracy": 0.44137930274009707, + "step": 72580 + }, + { + "epoch": 0.07310838291327615, + "grad_norm": 13.321161916269885, + "learning_rate": 4.9934340474973085e-05, + "loss": 2.1513, + "mean_token_accuracy": 0.49655172824859617, + "step": 72585 + }, + { + "epoch": 0.07311341896638032, + "grad_norm": 9.368303441956412, + "learning_rate": 4.993431186599858e-05, + "loss": 2.29, + "mean_token_accuracy": 0.42068964838981626, + "step": 72590 + }, + { + "epoch": 0.0731184550194845, + "grad_norm": 12.628769166968187, + "learning_rate": 4.993428325080184e-05, + "loss": 2.378, + "mean_token_accuracy": 0.4050211727619171, + "step": 72595 + }, + { + "epoch": 0.07312349107258866, + "grad_norm": 9.00620295301634, + "learning_rate": 4.993425462938288e-05, + "loss": 2.4072, + "mean_token_accuracy": 0.4206896543502808, + "step": 72600 + }, + { + "epoch": 0.07312852712569283, + "grad_norm": 11.16097721676781, + "learning_rate": 4.993422600174168e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.35862068831920624, + "step": 72605 + }, + { + "epoch": 0.073133563178797, + "grad_norm": 11.679502619804406, + "learning_rate": 4.993419736787827e-05, + "loss": 2.8263, + "mean_token_accuracy": 0.3896551728248596, + "step": 72610 + }, + { + "epoch": 0.07313859923190118, + "grad_norm": 12.992594811845812, + "learning_rate": 4.993416872779265e-05, + "loss": 2.6594, + "mean_token_accuracy": 0.36896551847457887, + "step": 72615 + }, + { + "epoch": 0.07314363528500535, + "grad_norm": 10.745330435193642, + "learning_rate": 4.993414008148483e-05, + "loss": 2.5455, + "mean_token_accuracy": 0.3931034505367279, + "step": 72620 + }, + { + "epoch": 0.07314867133810952, + "grad_norm": 11.082977663490098, + "learning_rate": 4.9934111428954815e-05, + "loss": 2.9057, + "mean_token_accuracy": 0.37241379022598264, + "step": 72625 + }, + { + "epoch": 0.0731537073912137, + "grad_norm": 14.518551347517656, + "learning_rate": 4.9934082770202625e-05, + "loss": 2.5439, + "mean_token_accuracy": 0.4517241299152374, + "step": 72630 + }, + { + "epoch": 0.07315874344431787, + "grad_norm": 10.504482318490387, + "learning_rate": 4.9934054105228246e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.4034482777118683, + "step": 72635 + }, + { + "epoch": 0.07316377949742205, + "grad_norm": 12.078324314967087, + "learning_rate": 4.993402543403171e-05, + "loss": 2.2463, + "mean_token_accuracy": 0.46551724076271056, + "step": 72640 + }, + { + "epoch": 0.07316881555052622, + "grad_norm": 11.53250257379269, + "learning_rate": 4.9933996756613e-05, + "loss": 2.7343, + "mean_token_accuracy": 0.39655172228813174, + "step": 72645 + }, + { + "epoch": 0.0731738516036304, + "grad_norm": 11.351865964699005, + "learning_rate": 4.9933968072972145e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.3482758581638336, + "step": 72650 + }, + { + "epoch": 0.07317888765673457, + "grad_norm": 10.706577265826784, + "learning_rate": 4.9933939383109146e-05, + "loss": 2.6182, + "mean_token_accuracy": 0.4, + "step": 72655 + }, + { + "epoch": 0.07318392370983874, + "grad_norm": 14.59814882408465, + "learning_rate": 4.993391068702401e-05, + "loss": 2.7607, + "mean_token_accuracy": 0.3931034475564957, + "step": 72660 + }, + { + "epoch": 0.07318895976294292, + "grad_norm": 10.27141236090536, + "learning_rate": 4.9933881984716745e-05, + "loss": 2.8877, + "mean_token_accuracy": 0.3482758581638336, + "step": 72665 + }, + { + "epoch": 0.07319399581604707, + "grad_norm": 13.515274801470566, + "learning_rate": 4.993385327618736e-05, + "loss": 2.423, + "mean_token_accuracy": 0.41034482717514037, + "step": 72670 + }, + { + "epoch": 0.07319903186915125, + "grad_norm": 11.496856593179961, + "learning_rate": 4.9933824561435865e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.4344827592372894, + "step": 72675 + }, + { + "epoch": 0.07320406792225542, + "grad_norm": 10.023850480946967, + "learning_rate": 4.9933795840462254e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.3862069010734558, + "step": 72680 + }, + { + "epoch": 0.0732091039753596, + "grad_norm": 10.748159748281589, + "learning_rate": 4.993376711326655e-05, + "loss": 2.4868, + "mean_token_accuracy": 0.40344828069210054, + "step": 72685 + }, + { + "epoch": 0.07321414002846377, + "grad_norm": 10.671028914736983, + "learning_rate": 4.9933738379848764e-05, + "loss": 2.8209, + "mean_token_accuracy": 0.36206896901130675, + "step": 72690 + }, + { + "epoch": 0.07321917608156794, + "grad_norm": 9.402937272038015, + "learning_rate": 4.993370964020889e-05, + "loss": 2.1738, + "mean_token_accuracy": 0.4413793087005615, + "step": 72695 + }, + { + "epoch": 0.07322421213467212, + "grad_norm": 11.180710195116706, + "learning_rate": 4.9933680894346945e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.38620689809322356, + "step": 72700 + }, + { + "epoch": 0.07322924818777629, + "grad_norm": 11.731293533991138, + "learning_rate": 4.993365214226294e-05, + "loss": 2.7864, + "mean_token_accuracy": 0.39310344457626345, + "step": 72705 + }, + { + "epoch": 0.07323428424088047, + "grad_norm": 12.775391319381187, + "learning_rate": 4.993362338395687e-05, + "loss": 2.7982, + "mean_token_accuracy": 0.3620689630508423, + "step": 72710 + }, + { + "epoch": 0.07323932029398464, + "grad_norm": 10.168188944155752, + "learning_rate": 4.993359461942876e-05, + "loss": 2.9086, + "mean_token_accuracy": 0.3448275804519653, + "step": 72715 + }, + { + "epoch": 0.07324435634708881, + "grad_norm": 9.619224530721427, + "learning_rate": 4.99335658486786e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.4, + "step": 72720 + }, + { + "epoch": 0.07324939240019299, + "grad_norm": 12.235530491565955, + "learning_rate": 4.993353707170641e-05, + "loss": 2.4882, + "mean_token_accuracy": 0.3931034505367279, + "step": 72725 + }, + { + "epoch": 0.07325442845329716, + "grad_norm": 14.161830482019129, + "learning_rate": 4.9933508288512196e-05, + "loss": 2.7582, + "mean_token_accuracy": 0.40344825983047483, + "step": 72730 + }, + { + "epoch": 0.07325946450640133, + "grad_norm": 11.065786996003816, + "learning_rate": 4.9933479499095965e-05, + "loss": 2.1326, + "mean_token_accuracy": 0.5124016880989075, + "step": 72735 + }, + { + "epoch": 0.0732645005595055, + "grad_norm": 10.943910892945322, + "learning_rate": 4.993345070345772e-05, + "loss": 2.8873, + "mean_token_accuracy": 0.36206896901130675, + "step": 72740 + }, + { + "epoch": 0.07326953661260967, + "grad_norm": 9.084490565789176, + "learning_rate": 4.9933421901597484e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.42758620381355283, + "step": 72745 + }, + { + "epoch": 0.07327457266571384, + "grad_norm": 13.439504902987066, + "learning_rate": 4.993339309351524e-05, + "loss": 2.7223, + "mean_token_accuracy": 0.3965517282485962, + "step": 72750 + }, + { + "epoch": 0.07327960871881802, + "grad_norm": 11.124198028256757, + "learning_rate": 4.993336427921102e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.39655172228813174, + "step": 72755 + }, + { + "epoch": 0.07328464477192219, + "grad_norm": 9.53519373722695, + "learning_rate": 4.993333545868482e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.38965516686439516, + "step": 72760 + }, + { + "epoch": 0.07328968082502636, + "grad_norm": 11.02635320199772, + "learning_rate": 4.993330663193666e-05, + "loss": 2.7454, + "mean_token_accuracy": 0.3724137932062149, + "step": 72765 + }, + { + "epoch": 0.07329471687813054, + "grad_norm": 12.42936850765551, + "learning_rate": 4.993327779896652e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.3931034505367279, + "step": 72770 + }, + { + "epoch": 0.07329975293123471, + "grad_norm": 11.532799952079722, + "learning_rate": 4.9933248959774445e-05, + "loss": 2.331, + "mean_token_accuracy": 0.41905626058578493, + "step": 72775 + }, + { + "epoch": 0.07330478898433888, + "grad_norm": 15.412544825014487, + "learning_rate": 4.993322011436042e-05, + "loss": 2.9721, + "mean_token_accuracy": 0.3517241418361664, + "step": 72780 + }, + { + "epoch": 0.07330982503744306, + "grad_norm": 13.022551276168073, + "learning_rate": 4.993319126272445e-05, + "loss": 2.7537, + "mean_token_accuracy": 0.34827586710453035, + "step": 72785 + }, + { + "epoch": 0.07331486109054723, + "grad_norm": 9.848614495781279, + "learning_rate": 4.993316240486656e-05, + "loss": 2.2557, + "mean_token_accuracy": 0.47586206197738645, + "step": 72790 + }, + { + "epoch": 0.0733198971436514, + "grad_norm": 13.677853765357694, + "learning_rate": 4.9933133540786743e-05, + "loss": 2.3623, + "mean_token_accuracy": 0.4206896543502808, + "step": 72795 + }, + { + "epoch": 0.07332493319675558, + "grad_norm": 14.000033698736551, + "learning_rate": 4.993310467048501e-05, + "loss": 2.9051, + "mean_token_accuracy": 0.36896551251411436, + "step": 72800 + }, + { + "epoch": 0.07332996924985975, + "grad_norm": 10.790073204684578, + "learning_rate": 4.993307579396138e-05, + "loss": 2.6302, + "mean_token_accuracy": 0.37586206793785093, + "step": 72805 + }, + { + "epoch": 0.07333500530296391, + "grad_norm": 10.949172310787837, + "learning_rate": 4.993304691121585e-05, + "loss": 2.3274, + "mean_token_accuracy": 0.4482758641242981, + "step": 72810 + }, + { + "epoch": 0.07334004135606809, + "grad_norm": 9.36385497372334, + "learning_rate": 4.993301802224843e-05, + "loss": 2.376, + "mean_token_accuracy": 0.41379311084747317, + "step": 72815 + }, + { + "epoch": 0.07334507740917226, + "grad_norm": 12.324114294063513, + "learning_rate": 4.9932989127059124e-05, + "loss": 3.2396, + "mean_token_accuracy": 0.3655172437429428, + "step": 72820 + }, + { + "epoch": 0.07335011346227643, + "grad_norm": 10.531440125641488, + "learning_rate": 4.993296022564795e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.4, + "step": 72825 + }, + { + "epoch": 0.07335514951538061, + "grad_norm": 10.631136358316258, + "learning_rate": 4.993293131801491e-05, + "loss": 2.3275, + "mean_token_accuracy": 0.41379310488700866, + "step": 72830 + }, + { + "epoch": 0.07336018556848478, + "grad_norm": 15.867949896793366, + "learning_rate": 4.993290240416001e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.4551724135875702, + "step": 72835 + }, + { + "epoch": 0.07336522162158896, + "grad_norm": 13.735210710100157, + "learning_rate": 4.993287348408326e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.3601935863494873, + "step": 72840 + }, + { + "epoch": 0.07337025767469313, + "grad_norm": 11.580109443927407, + "learning_rate": 4.993284455778467e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4, + "step": 72845 + }, + { + "epoch": 0.0733752937277973, + "grad_norm": 13.355206504472298, + "learning_rate": 4.993281562526426e-05, + "loss": 2.3773, + "mean_token_accuracy": 0.43103447556495667, + "step": 72850 + }, + { + "epoch": 0.07338032978090148, + "grad_norm": 15.364359142280078, + "learning_rate": 4.993278668652201e-05, + "loss": 2.1644, + "mean_token_accuracy": 0.46613301038742067, + "step": 72855 + }, + { + "epoch": 0.07338536583400565, + "grad_norm": 9.531368633444488, + "learning_rate": 4.9932757741557945e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.3862068891525269, + "step": 72860 + }, + { + "epoch": 0.07339040188710982, + "grad_norm": 13.954673235391898, + "learning_rate": 4.9932728790372075e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.4344827592372894, + "step": 72865 + }, + { + "epoch": 0.073395437940214, + "grad_norm": 11.378571871075621, + "learning_rate": 4.99326998329644e-05, + "loss": 2.6886, + "mean_token_accuracy": 0.42758620977401735, + "step": 72870 + }, + { + "epoch": 0.07340047399331817, + "grad_norm": 9.849227185488344, + "learning_rate": 4.993267086933493e-05, + "loss": 1.8146, + "mean_token_accuracy": 0.5354679763317108, + "step": 72875 + }, + { + "epoch": 0.07340551004642233, + "grad_norm": 10.380432057865466, + "learning_rate": 4.993264189948368e-05, + "loss": 2.5326, + "mean_token_accuracy": 0.3965517163276672, + "step": 72880 + }, + { + "epoch": 0.0734105460995265, + "grad_norm": 10.51658420074308, + "learning_rate": 4.993261292341066e-05, + "loss": 2.356, + "mean_token_accuracy": 0.41905626058578493, + "step": 72885 + }, + { + "epoch": 0.07341558215263068, + "grad_norm": 9.936882840583388, + "learning_rate": 4.9932583941115855e-05, + "loss": 2.4113, + "mean_token_accuracy": 0.4413793057203293, + "step": 72890 + }, + { + "epoch": 0.07342061820573485, + "grad_norm": 10.70935240015387, + "learning_rate": 4.9932554952599305e-05, + "loss": 2.6979, + "mean_token_accuracy": 0.3827586203813553, + "step": 72895 + }, + { + "epoch": 0.07342565425883903, + "grad_norm": 13.129191040735558, + "learning_rate": 4.993252595786099e-05, + "loss": 2.3924, + "mean_token_accuracy": 0.3896551728248596, + "step": 72900 + }, + { + "epoch": 0.0734306903119432, + "grad_norm": 10.439409353260482, + "learning_rate": 4.993249695690094e-05, + "loss": 2.8437, + "mean_token_accuracy": 0.3724138021469116, + "step": 72905 + }, + { + "epoch": 0.07343572636504737, + "grad_norm": 12.301473552007637, + "learning_rate": 4.9932467949719144e-05, + "loss": 2.2727, + "mean_token_accuracy": 0.4310344815254211, + "step": 72910 + }, + { + "epoch": 0.07344076241815155, + "grad_norm": 12.647491035609791, + "learning_rate": 4.993243893631562e-05, + "loss": 2.547, + "mean_token_accuracy": 0.4517241299152374, + "step": 72915 + }, + { + "epoch": 0.07344579847125572, + "grad_norm": 13.183942009543351, + "learning_rate": 4.9932409916690377e-05, + "loss": 2.3256, + "mean_token_accuracy": 0.4, + "step": 72920 + }, + { + "epoch": 0.0734508345243599, + "grad_norm": 9.769951232078048, + "learning_rate": 4.9932380890843424e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.379310342669487, + "step": 72925 + }, + { + "epoch": 0.07345587057746407, + "grad_norm": 10.333832272303406, + "learning_rate": 4.9932351858774764e-05, + "loss": 2.8044, + "mean_token_accuracy": 0.3793103516101837, + "step": 72930 + }, + { + "epoch": 0.07346090663056824, + "grad_norm": 13.539689825793765, + "learning_rate": 4.993232282048441e-05, + "loss": 2.5553, + "mean_token_accuracy": 0.3896551787853241, + "step": 72935 + }, + { + "epoch": 0.07346594268367242, + "grad_norm": 13.204383090535488, + "learning_rate": 4.993229377597237e-05, + "loss": 2.619, + "mean_token_accuracy": 0.35862069129943847, + "step": 72940 + }, + { + "epoch": 0.07347097873677659, + "grad_norm": 14.516340446792917, + "learning_rate": 4.993226472523865e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.39310344457626345, + "step": 72945 + }, + { + "epoch": 0.07347601478988075, + "grad_norm": 10.791738044635656, + "learning_rate": 4.993223566828325e-05, + "loss": 2.559, + "mean_token_accuracy": 0.3999999940395355, + "step": 72950 + }, + { + "epoch": 0.07348105084298492, + "grad_norm": 12.825047126786925, + "learning_rate": 4.993220660510619e-05, + "loss": 2.886, + "mean_token_accuracy": 0.41379310488700866, + "step": 72955 + }, + { + "epoch": 0.0734860868960891, + "grad_norm": 11.793790759054717, + "learning_rate": 4.993217753570747e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.43103447556495667, + "step": 72960 + }, + { + "epoch": 0.07349112294919327, + "grad_norm": 11.096154805450695, + "learning_rate": 4.993214846008711e-05, + "loss": 2.6377, + "mean_token_accuracy": 0.3689655065536499, + "step": 72965 + }, + { + "epoch": 0.07349615900229745, + "grad_norm": 12.399624168464097, + "learning_rate": 4.9932119378245105e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.4034482777118683, + "step": 72970 + }, + { + "epoch": 0.07350119505540162, + "grad_norm": 16.21415537734024, + "learning_rate": 4.993209029018147e-05, + "loss": 2.7677, + "mean_token_accuracy": 0.4034482717514038, + "step": 72975 + }, + { + "epoch": 0.07350623110850579, + "grad_norm": 10.765529652947311, + "learning_rate": 4.9932061195896206e-05, + "loss": 2.8114, + "mean_token_accuracy": 0.3379310339689255, + "step": 72980 + }, + { + "epoch": 0.07351126716160997, + "grad_norm": 10.63799905466745, + "learning_rate": 4.993203209538933e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.4068965554237366, + "step": 72985 + }, + { + "epoch": 0.07351630321471414, + "grad_norm": 8.956843544101632, + "learning_rate": 4.993200298866085e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.4137930989265442, + "step": 72990 + }, + { + "epoch": 0.07352133926781831, + "grad_norm": 11.615231406846759, + "learning_rate": 4.993197387571076e-05, + "loss": 2.3642, + "mean_token_accuracy": 0.42758620977401735, + "step": 72995 + }, + { + "epoch": 0.07352637532092249, + "grad_norm": 11.737252297790269, + "learning_rate": 4.993194475653909e-05, + "loss": 2.7102, + "mean_token_accuracy": 0.37586207389831544, + "step": 73000 + }, + { + "epoch": 0.07353141137402666, + "grad_norm": 14.812531579043862, + "learning_rate": 4.993191563114583e-05, + "loss": 2.9156, + "mean_token_accuracy": 0.3689655244350433, + "step": 73005 + }, + { + "epoch": 0.07353644742713084, + "grad_norm": 12.820856068959575, + "learning_rate": 4.9931886499530996e-05, + "loss": 2.9206, + "mean_token_accuracy": 0.31379309892654417, + "step": 73010 + }, + { + "epoch": 0.07354148348023501, + "grad_norm": 11.658359093517545, + "learning_rate": 4.993185736169459e-05, + "loss": 2.7762, + "mean_token_accuracy": 0.36551723480224607, + "step": 73015 + }, + { + "epoch": 0.07354651953333917, + "grad_norm": 11.594873309794945, + "learning_rate": 4.993182821763663e-05, + "loss": 2.6411, + "mean_token_accuracy": 0.3551724076271057, + "step": 73020 + }, + { + "epoch": 0.07355155558644334, + "grad_norm": 10.11901341791273, + "learning_rate": 4.9931799067357116e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.3551724135875702, + "step": 73025 + }, + { + "epoch": 0.07355659163954752, + "grad_norm": 9.808915724599974, + "learning_rate": 4.993176991085606e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.42068964838981626, + "step": 73030 + }, + { + "epoch": 0.07356162769265169, + "grad_norm": 24.537367685285453, + "learning_rate": 4.993174074813347e-05, + "loss": 2.6997, + "mean_token_accuracy": 0.42413792610168455, + "step": 73035 + }, + { + "epoch": 0.07356666374575586, + "grad_norm": 11.758586122645296, + "learning_rate": 4.993171157918935e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.358620685338974, + "step": 73040 + }, + { + "epoch": 0.07357169979886004, + "grad_norm": 12.208058050294424, + "learning_rate": 4.993168240402372e-05, + "loss": 2.7919, + "mean_token_accuracy": 0.37241378128528596, + "step": 73045 + }, + { + "epoch": 0.07357673585196421, + "grad_norm": 12.493677360779838, + "learning_rate": 4.993165322263657e-05, + "loss": 2.7788, + "mean_token_accuracy": 0.40193586945533755, + "step": 73050 + }, + { + "epoch": 0.07358177190506839, + "grad_norm": 11.389702768056457, + "learning_rate": 4.993162403502792e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.44137930274009707, + "step": 73055 + }, + { + "epoch": 0.07358680795817256, + "grad_norm": 9.421003383414227, + "learning_rate": 4.9931594841197776e-05, + "loss": 2.6476, + "mean_token_accuracy": 0.35862068831920624, + "step": 73060 + }, + { + "epoch": 0.07359184401127673, + "grad_norm": 8.400750637578986, + "learning_rate": 4.9931565641146145e-05, + "loss": 2.3728, + "mean_token_accuracy": 0.41034482419490814, + "step": 73065 + }, + { + "epoch": 0.0735968800643809, + "grad_norm": 12.137269326414177, + "learning_rate": 4.993153643487303e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.47586206793785096, + "step": 73070 + }, + { + "epoch": 0.07360191611748508, + "grad_norm": 12.6350670098919, + "learning_rate": 4.993150722237845e-05, + "loss": 3.0055, + "mean_token_accuracy": 0.3379310369491577, + "step": 73075 + }, + { + "epoch": 0.07360695217058925, + "grad_norm": 13.414931316332321, + "learning_rate": 4.9931478003662416e-05, + "loss": 2.535, + "mean_token_accuracy": 0.4068965554237366, + "step": 73080 + }, + { + "epoch": 0.07361198822369343, + "grad_norm": 12.710207224975148, + "learning_rate": 4.9931448778724916e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.4034482777118683, + "step": 73085 + }, + { + "epoch": 0.07361702427679759, + "grad_norm": 11.125108155865965, + "learning_rate": 4.993141954756597e-05, + "loss": 2.594, + "mean_token_accuracy": 0.38620689511299133, + "step": 73090 + }, + { + "epoch": 0.07362206032990176, + "grad_norm": 9.41901852823503, + "learning_rate": 4.993139031018559e-05, + "loss": 2.1291, + "mean_token_accuracy": 0.5137930989265442, + "step": 73095 + }, + { + "epoch": 0.07362709638300594, + "grad_norm": 11.251465777286766, + "learning_rate": 4.9931361066583784e-05, + "loss": 2.4294, + "mean_token_accuracy": 0.4588626801967621, + "step": 73100 + }, + { + "epoch": 0.07363213243611011, + "grad_norm": 12.278595844752559, + "learning_rate": 4.993133181676055e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.44289171099662783, + "step": 73105 + }, + { + "epoch": 0.07363716848921428, + "grad_norm": 14.714100078449578, + "learning_rate": 4.99313025607159e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.4068965554237366, + "step": 73110 + }, + { + "epoch": 0.07364220454231846, + "grad_norm": 12.336767987476094, + "learning_rate": 4.993127329844986e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.4034482717514038, + "step": 73115 + }, + { + "epoch": 0.07364724059542263, + "grad_norm": 11.33591917145981, + "learning_rate": 4.993124402996241e-05, + "loss": 2.4933, + "mean_token_accuracy": 0.39310344457626345, + "step": 73120 + }, + { + "epoch": 0.0736522766485268, + "grad_norm": 11.24738157986904, + "learning_rate": 4.993121475525357e-05, + "loss": 2.495, + "mean_token_accuracy": 0.4034482717514038, + "step": 73125 + }, + { + "epoch": 0.07365731270163098, + "grad_norm": 13.058965455704127, + "learning_rate": 4.993118547432335e-05, + "loss": 2.8349, + "mean_token_accuracy": 0.32758620083332063, + "step": 73130 + }, + { + "epoch": 0.07366234875473515, + "grad_norm": 11.169526090513259, + "learning_rate": 4.9931156187171764e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.42413792610168455, + "step": 73135 + }, + { + "epoch": 0.07366738480783933, + "grad_norm": 10.116047180143173, + "learning_rate": 4.993112689379881e-05, + "loss": 2.0562, + "mean_token_accuracy": 0.4814277052879333, + "step": 73140 + }, + { + "epoch": 0.0736724208609435, + "grad_norm": 10.115841487983777, + "learning_rate": 4.993109759420449e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.42758620977401735, + "step": 73145 + }, + { + "epoch": 0.07367745691404767, + "grad_norm": 14.134774698324755, + "learning_rate": 4.993106828838883e-05, + "loss": 2.9347, + "mean_token_accuracy": 0.3655172407627106, + "step": 73150 + }, + { + "epoch": 0.07368249296715185, + "grad_norm": 11.87922621593828, + "learning_rate": 4.993103897635182e-05, + "loss": 2.734, + "mean_token_accuracy": 0.37241379022598264, + "step": 73155 + }, + { + "epoch": 0.073687529020256, + "grad_norm": 11.964981753196366, + "learning_rate": 4.99310096580935e-05, + "loss": 2.683, + "mean_token_accuracy": 0.4068965494632721, + "step": 73160 + }, + { + "epoch": 0.07369256507336018, + "grad_norm": 9.447267569067238, + "learning_rate": 4.993098033361384e-05, + "loss": 2.7416, + "mean_token_accuracy": 0.4068965554237366, + "step": 73165 + }, + { + "epoch": 0.07369760112646435, + "grad_norm": 11.940509159511976, + "learning_rate": 4.9930951002912864e-05, + "loss": 2.7267, + "mean_token_accuracy": 0.40344828367233276, + "step": 73170 + }, + { + "epoch": 0.07370263717956853, + "grad_norm": 11.806198565836313, + "learning_rate": 4.993092166599058e-05, + "loss": 2.6046, + "mean_token_accuracy": 0.42982456684112547, + "step": 73175 + }, + { + "epoch": 0.0737076732326727, + "grad_norm": 15.673490059999738, + "learning_rate": 4.993089232284699e-05, + "loss": 2.8232, + "mean_token_accuracy": 0.3609195381402969, + "step": 73180 + }, + { + "epoch": 0.07371270928577688, + "grad_norm": 12.507235744234682, + "learning_rate": 4.993086297348213e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.4206896543502808, + "step": 73185 + }, + { + "epoch": 0.07371774533888105, + "grad_norm": 12.826594827947092, + "learning_rate": 4.993083361789597e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.4206896543502808, + "step": 73190 + }, + { + "epoch": 0.07372278139198522, + "grad_norm": 10.418834537369257, + "learning_rate": 4.993080425608853e-05, + "loss": 2.3695, + "mean_token_accuracy": 0.4310344815254211, + "step": 73195 + }, + { + "epoch": 0.0737278174450894, + "grad_norm": 11.778388546034375, + "learning_rate": 4.993077488805983e-05, + "loss": 2.8824, + "mean_token_accuracy": 0.34482758343219755, + "step": 73200 + }, + { + "epoch": 0.07373285349819357, + "grad_norm": 12.483087695271434, + "learning_rate": 4.993074551380988e-05, + "loss": 3.0121, + "mean_token_accuracy": 0.3635208696126938, + "step": 73205 + }, + { + "epoch": 0.07373788955129774, + "grad_norm": 11.274167954279978, + "learning_rate": 4.993071613333866e-05, + "loss": 2.6433, + "mean_token_accuracy": 0.38620689511299133, + "step": 73210 + }, + { + "epoch": 0.07374292560440192, + "grad_norm": 10.70864629119354, + "learning_rate": 4.9930686746646215e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.44482759237289426, + "step": 73215 + }, + { + "epoch": 0.07374796165750609, + "grad_norm": 13.29235905194873, + "learning_rate": 4.993065735373252e-05, + "loss": 2.4882, + "mean_token_accuracy": 0.42758620381355283, + "step": 73220 + }, + { + "epoch": 0.07375299771061027, + "grad_norm": 16.602851434641263, + "learning_rate": 4.993062795459761e-05, + "loss": 2.706, + "mean_token_accuracy": 0.3793103456497192, + "step": 73225 + }, + { + "epoch": 0.07375803376371443, + "grad_norm": 12.610507239342391, + "learning_rate": 4.993059854924148e-05, + "loss": 2.438, + "mean_token_accuracy": 0.41724138259887694, + "step": 73230 + }, + { + "epoch": 0.0737630698168186, + "grad_norm": 13.885425634307948, + "learning_rate": 4.993056913766413e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4068965554237366, + "step": 73235 + }, + { + "epoch": 0.07376810586992277, + "grad_norm": 12.057756074369651, + "learning_rate": 4.993053971986559e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.47586206793785096, + "step": 73240 + }, + { + "epoch": 0.07377314192302695, + "grad_norm": 12.806532649367833, + "learning_rate": 4.9930510295845854e-05, + "loss": 2.3493, + "mean_token_accuracy": 0.46896551847457885, + "step": 73245 + }, + { + "epoch": 0.07377817797613112, + "grad_norm": 9.972113055927416, + "learning_rate": 4.993048086560493e-05, + "loss": 2.423, + "mean_token_accuracy": 0.398064124584198, + "step": 73250 + }, + { + "epoch": 0.0737832140292353, + "grad_norm": 16.327261704016014, + "learning_rate": 4.9930451429142824e-05, + "loss": 2.6025, + "mean_token_accuracy": 0.38620689511299133, + "step": 73255 + }, + { + "epoch": 0.07378825008233947, + "grad_norm": 16.451730189726295, + "learning_rate": 4.993042198645956e-05, + "loss": 2.6037, + "mean_token_accuracy": 0.4206896543502808, + "step": 73260 + }, + { + "epoch": 0.07379328613544364, + "grad_norm": 13.095720142230004, + "learning_rate": 4.9930392537555125e-05, + "loss": 2.6395, + "mean_token_accuracy": 0.4068965494632721, + "step": 73265 + }, + { + "epoch": 0.07379832218854782, + "grad_norm": 11.887171489716877, + "learning_rate": 4.993036308242954e-05, + "loss": 2.2182, + "mean_token_accuracy": 0.48275862336158754, + "step": 73270 + }, + { + "epoch": 0.07380335824165199, + "grad_norm": 12.825473541932265, + "learning_rate": 4.993033362108281e-05, + "loss": 2.9701, + "mean_token_accuracy": 0.3310344785451889, + "step": 73275 + }, + { + "epoch": 0.07380839429475616, + "grad_norm": 12.856875113497944, + "learning_rate": 4.993030415351494e-05, + "loss": 2.4272, + "mean_token_accuracy": 0.4413793087005615, + "step": 73280 + }, + { + "epoch": 0.07381343034786034, + "grad_norm": 14.206885665999089, + "learning_rate": 4.993027467972595e-05, + "loss": 2.4515, + "mean_token_accuracy": 0.41034482717514037, + "step": 73285 + }, + { + "epoch": 0.07381846640096451, + "grad_norm": 12.168017849721823, + "learning_rate": 4.993024519971583e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.4310344815254211, + "step": 73290 + }, + { + "epoch": 0.07382350245406867, + "grad_norm": 12.218374452565037, + "learning_rate": 4.9930215713484596e-05, + "loss": 2.6897, + "mean_token_accuracy": 0.42413792610168455, + "step": 73295 + }, + { + "epoch": 0.07382853850717284, + "grad_norm": 10.063962451529624, + "learning_rate": 4.9930186221032265e-05, + "loss": 2.3322, + "mean_token_accuracy": 0.41379310488700866, + "step": 73300 + }, + { + "epoch": 0.07383357456027702, + "grad_norm": 11.130853008735604, + "learning_rate": 4.9930156722358843e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.3517241358757019, + "step": 73305 + }, + { + "epoch": 0.07383861061338119, + "grad_norm": 18.641129227695448, + "learning_rate": 4.993012721746433e-05, + "loss": 2.7564, + "mean_token_accuracy": 0.4379310369491577, + "step": 73310 + }, + { + "epoch": 0.07384364666648537, + "grad_norm": 12.818905117003162, + "learning_rate": 4.9930097706348733e-05, + "loss": 2.6477, + "mean_token_accuracy": 0.4, + "step": 73315 + }, + { + "epoch": 0.07384868271958954, + "grad_norm": 11.189332873647237, + "learning_rate": 4.9930068189012066e-05, + "loss": 2.6046, + "mean_token_accuracy": 0.4172413766384125, + "step": 73320 + }, + { + "epoch": 0.07385371877269371, + "grad_norm": 18.980725744883244, + "learning_rate": 4.993003866545434e-05, + "loss": 2.66, + "mean_token_accuracy": 0.4034482777118683, + "step": 73325 + }, + { + "epoch": 0.07385875482579789, + "grad_norm": 13.13947859116219, + "learning_rate": 4.993000913567555e-05, + "loss": 2.3051, + "mean_token_accuracy": 0.42758620977401735, + "step": 73330 + }, + { + "epoch": 0.07386379087890206, + "grad_norm": 15.785769750312156, + "learning_rate": 4.9929979599675726e-05, + "loss": 2.7005, + "mean_token_accuracy": 0.3999999940395355, + "step": 73335 + }, + { + "epoch": 0.07386882693200623, + "grad_norm": 12.087716114462479, + "learning_rate": 4.9929950057454856e-05, + "loss": 2.7669, + "mean_token_accuracy": 0.358620685338974, + "step": 73340 + }, + { + "epoch": 0.07387386298511041, + "grad_norm": 10.450370328644395, + "learning_rate": 4.992992050901295e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.4344827592372894, + "step": 73345 + }, + { + "epoch": 0.07387889903821458, + "grad_norm": 9.918572847349624, + "learning_rate": 4.992989095435003e-05, + "loss": 2.3837, + "mean_token_accuracy": 0.4206896543502808, + "step": 73350 + }, + { + "epoch": 0.07388393509131876, + "grad_norm": 11.18134995551533, + "learning_rate": 4.9929861393466094e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.4517241358757019, + "step": 73355 + }, + { + "epoch": 0.07388897114442293, + "grad_norm": 16.190041723660187, + "learning_rate": 4.992983182636115e-05, + "loss": 2.6668, + "mean_token_accuracy": 0.4, + "step": 73360 + }, + { + "epoch": 0.07389400719752709, + "grad_norm": 12.270344644505114, + "learning_rate": 4.992980225303521e-05, + "loss": 2.5687, + "mean_token_accuracy": 0.43793103098869324, + "step": 73365 + }, + { + "epoch": 0.07389904325063126, + "grad_norm": 12.532088683868126, + "learning_rate": 4.992977267348829e-05, + "loss": 2.8418, + "mean_token_accuracy": 0.35862069129943847, + "step": 73370 + }, + { + "epoch": 0.07390407930373544, + "grad_norm": 11.897729541711936, + "learning_rate": 4.9929743087720374e-05, + "loss": 2.318, + "mean_token_accuracy": 0.4620689630508423, + "step": 73375 + }, + { + "epoch": 0.07390911535683961, + "grad_norm": 10.775705310205334, + "learning_rate": 4.9929713495731494e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.34137930572032926, + "step": 73380 + }, + { + "epoch": 0.07391415140994378, + "grad_norm": 9.662684205347084, + "learning_rate": 4.9929683897521645e-05, + "loss": 2.6227, + "mean_token_accuracy": 0.4206896543502808, + "step": 73385 + }, + { + "epoch": 0.07391918746304796, + "grad_norm": 10.574146389962053, + "learning_rate": 4.992965429309085e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.4310344934463501, + "step": 73390 + }, + { + "epoch": 0.07392422351615213, + "grad_norm": 9.983893004555474, + "learning_rate": 4.9929624682439093e-05, + "loss": 2.4134, + "mean_token_accuracy": 0.4006157636642456, + "step": 73395 + }, + { + "epoch": 0.0739292595692563, + "grad_norm": 8.32590652239136, + "learning_rate": 4.99295950655664e-05, + "loss": 2.3049, + "mean_token_accuracy": 0.4482758641242981, + "step": 73400 + }, + { + "epoch": 0.07393429562236048, + "grad_norm": 14.164893740010019, + "learning_rate": 4.9929565442472774e-05, + "loss": 2.5778, + "mean_token_accuracy": 0.4241379350423813, + "step": 73405 + }, + { + "epoch": 0.07393933167546465, + "grad_norm": 10.591174181187723, + "learning_rate": 4.992953581315823e-05, + "loss": 2.7636, + "mean_token_accuracy": 0.37241379618644715, + "step": 73410 + }, + { + "epoch": 0.07394436772856883, + "grad_norm": 10.574883267593195, + "learning_rate": 4.992950617762276e-05, + "loss": 2.8891, + "mean_token_accuracy": 0.36896551847457887, + "step": 73415 + }, + { + "epoch": 0.073949403781673, + "grad_norm": 10.887037162874512, + "learning_rate": 4.992947653586639e-05, + "loss": 2.6807, + "mean_token_accuracy": 0.36896551847457887, + "step": 73420 + }, + { + "epoch": 0.07395443983477717, + "grad_norm": 13.198337434367195, + "learning_rate": 4.992944688788912e-05, + "loss": 2.2645, + "mean_token_accuracy": 0.43793103098869324, + "step": 73425 + }, + { + "epoch": 0.07395947588788135, + "grad_norm": 13.66676876916609, + "learning_rate": 4.992941723369096e-05, + "loss": 2.512, + "mean_token_accuracy": 0.3724138021469116, + "step": 73430 + }, + { + "epoch": 0.07396451194098551, + "grad_norm": 9.754701692776823, + "learning_rate": 4.992938757327193e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.42413792610168455, + "step": 73435 + }, + { + "epoch": 0.07396954799408968, + "grad_norm": 12.740677496675143, + "learning_rate": 4.992935790663201e-05, + "loss": 2.536, + "mean_token_accuracy": 0.44283121824264526, + "step": 73440 + }, + { + "epoch": 0.07397458404719386, + "grad_norm": 11.446803123817192, + "learning_rate": 4.992932823377122e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.3827586233615875, + "step": 73445 + }, + { + "epoch": 0.07397962010029803, + "grad_norm": 13.44293547264596, + "learning_rate": 4.992929855468958e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.4206896543502808, + "step": 73450 + }, + { + "epoch": 0.0739846561534022, + "grad_norm": 9.961757320240517, + "learning_rate": 4.992926886938709e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.37931033968925476, + "step": 73455 + }, + { + "epoch": 0.07398969220650638, + "grad_norm": 13.262266961680465, + "learning_rate": 4.992923917786376e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.4620689630508423, + "step": 73460 + }, + { + "epoch": 0.07399472825961055, + "grad_norm": 10.811437051006283, + "learning_rate": 4.99292094801196e-05, + "loss": 2.0815, + "mean_token_accuracy": 0.48275861144065857, + "step": 73465 + }, + { + "epoch": 0.07399976431271472, + "grad_norm": 14.136141277277854, + "learning_rate": 4.9929179776154605e-05, + "loss": 2.7742, + "mean_token_accuracy": 0.382758629322052, + "step": 73470 + }, + { + "epoch": 0.0740048003658189, + "grad_norm": 11.171392109253986, + "learning_rate": 4.99291500659688e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.37241379618644715, + "step": 73475 + }, + { + "epoch": 0.07400983641892307, + "grad_norm": 10.163146715926262, + "learning_rate": 4.992912034956218e-05, + "loss": 2.8853, + "mean_token_accuracy": 0.37241379022598264, + "step": 73480 + }, + { + "epoch": 0.07401487247202725, + "grad_norm": 9.574556097390916, + "learning_rate": 4.9929090626934765e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.4186932861804962, + "step": 73485 + }, + { + "epoch": 0.07401990852513142, + "grad_norm": 10.902211327207015, + "learning_rate": 4.992906089808656e-05, + "loss": 2.6733, + "mean_token_accuracy": 0.37241379022598264, + "step": 73490 + }, + { + "epoch": 0.0740249445782356, + "grad_norm": 18.03793466589256, + "learning_rate": 4.9929031163017576e-05, + "loss": 2.7994, + "mean_token_accuracy": 0.4103448331356049, + "step": 73495 + }, + { + "epoch": 0.07402998063133977, + "grad_norm": 11.989701035481728, + "learning_rate": 4.99290014217278e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.3813067078590393, + "step": 73500 + }, + { + "epoch": 0.07403501668444393, + "grad_norm": 11.234611477752159, + "learning_rate": 4.992897167421727e-05, + "loss": 2.8132, + "mean_token_accuracy": 0.3569872945547104, + "step": 73505 + }, + { + "epoch": 0.0740400527375481, + "grad_norm": 10.779105568387152, + "learning_rate": 4.992894192048598e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.41034482717514037, + "step": 73510 + }, + { + "epoch": 0.07404508879065227, + "grad_norm": 10.815033309149127, + "learning_rate": 4.9928912160533935e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.43103447556495667, + "step": 73515 + }, + { + "epoch": 0.07405012484375645, + "grad_norm": 11.653483333677219, + "learning_rate": 4.9928882394361145e-05, + "loss": 3.2556, + "mean_token_accuracy": 0.3793103486299515, + "step": 73520 + }, + { + "epoch": 0.07405516089686062, + "grad_norm": 12.153108381758727, + "learning_rate": 4.992885262196763e-05, + "loss": 2.6998, + "mean_token_accuracy": 0.4137930989265442, + "step": 73525 + }, + { + "epoch": 0.0740601969499648, + "grad_norm": 25.679417456639484, + "learning_rate": 4.9928822843353384e-05, + "loss": 2.8483, + "mean_token_accuracy": 0.40689654350280763, + "step": 73530 + }, + { + "epoch": 0.07406523300306897, + "grad_norm": 10.142071095366905, + "learning_rate": 4.992879305851842e-05, + "loss": 2.6875, + "mean_token_accuracy": 0.36896551847457887, + "step": 73535 + }, + { + "epoch": 0.07407026905617314, + "grad_norm": 9.301850941240241, + "learning_rate": 4.992876326746275e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.4482758641242981, + "step": 73540 + }, + { + "epoch": 0.07407530510927732, + "grad_norm": 12.888182695309183, + "learning_rate": 4.992873347018638e-05, + "loss": 2.6849, + "mean_token_accuracy": 0.38965516686439516, + "step": 73545 + }, + { + "epoch": 0.07408034116238149, + "grad_norm": 14.152800807487276, + "learning_rate": 4.992870366668931e-05, + "loss": 2.8768, + "mean_token_accuracy": 0.39479734301567077, + "step": 73550 + }, + { + "epoch": 0.07408537721548566, + "grad_norm": 9.95740163546874, + "learning_rate": 4.992867385697155e-05, + "loss": 2.6272, + "mean_token_accuracy": 0.358620685338974, + "step": 73555 + }, + { + "epoch": 0.07409041326858984, + "grad_norm": 11.774748379783778, + "learning_rate": 4.9928644041033124e-05, + "loss": 2.7446, + "mean_token_accuracy": 0.4103448212146759, + "step": 73560 + }, + { + "epoch": 0.07409544932169401, + "grad_norm": 12.301603490002, + "learning_rate": 4.992861421887403e-05, + "loss": 2.2436, + "mean_token_accuracy": 0.45517241954803467, + "step": 73565 + }, + { + "epoch": 0.07410048537479819, + "grad_norm": 11.280020643228573, + "learning_rate": 4.9928584390494275e-05, + "loss": 2.4999, + "mean_token_accuracy": 0.38275861740112305, + "step": 73570 + }, + { + "epoch": 0.07410552142790235, + "grad_norm": 10.926622947459446, + "learning_rate": 4.992855455589386e-05, + "loss": 2.54, + "mean_token_accuracy": 0.4241379380226135, + "step": 73575 + }, + { + "epoch": 0.07411055748100652, + "grad_norm": 10.038760634445248, + "learning_rate": 4.992852471507282e-05, + "loss": 2.4701, + "mean_token_accuracy": 0.42758620381355283, + "step": 73580 + }, + { + "epoch": 0.0741155935341107, + "grad_norm": 11.186144441657776, + "learning_rate": 4.9928494868031136e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.4034482717514038, + "step": 73585 + }, + { + "epoch": 0.07412062958721487, + "grad_norm": 16.284438748928075, + "learning_rate": 4.9928465014768825e-05, + "loss": 2.587, + "mean_token_accuracy": 0.43793103098869324, + "step": 73590 + }, + { + "epoch": 0.07412566564031904, + "grad_norm": 12.354442883816848, + "learning_rate": 4.992843515528588e-05, + "loss": 2.5547, + "mean_token_accuracy": 0.42068964838981626, + "step": 73595 + }, + { + "epoch": 0.07413070169342321, + "grad_norm": 11.57128647679959, + "learning_rate": 4.9928405289582345e-05, + "loss": 2.6626, + "mean_token_accuracy": 0.3896551728248596, + "step": 73600 + }, + { + "epoch": 0.07413573774652739, + "grad_norm": 11.370223771657468, + "learning_rate": 4.9928375417658204e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.40689654350280763, + "step": 73605 + }, + { + "epoch": 0.07414077379963156, + "grad_norm": 9.501279251962071, + "learning_rate": 4.9928345539513466e-05, + "loss": 2.0364, + "mean_token_accuracy": 0.48856624960899353, + "step": 73610 + }, + { + "epoch": 0.07414580985273574, + "grad_norm": 12.992595672818029, + "learning_rate": 4.9928315655148145e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.3834975302219391, + "step": 73615 + }, + { + "epoch": 0.07415084590583991, + "grad_norm": 12.60359006748628, + "learning_rate": 4.992828576456224e-05, + "loss": 2.5225, + "mean_token_accuracy": 0.3988505780696869, + "step": 73620 + }, + { + "epoch": 0.07415588195894408, + "grad_norm": 12.979697820965283, + "learning_rate": 4.992825586775578e-05, + "loss": 2.4946, + "mean_token_accuracy": 0.40344828069210054, + "step": 73625 + }, + { + "epoch": 0.07416091801204826, + "grad_norm": 11.567143070661796, + "learning_rate": 4.992822596472875e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.3827586233615875, + "step": 73630 + }, + { + "epoch": 0.07416595406515243, + "grad_norm": 10.218169555661172, + "learning_rate": 4.992819605548117e-05, + "loss": 2.2641, + "mean_token_accuracy": 0.4258923172950745, + "step": 73635 + }, + { + "epoch": 0.0741709901182566, + "grad_norm": 10.88494920878584, + "learning_rate": 4.992816614001305e-05, + "loss": 2.2941, + "mean_token_accuracy": 0.4068965494632721, + "step": 73640 + }, + { + "epoch": 0.07417602617136076, + "grad_norm": 15.06867257197011, + "learning_rate": 4.992813621832439e-05, + "loss": 2.774, + "mean_token_accuracy": 0.3862068891525269, + "step": 73645 + }, + { + "epoch": 0.07418106222446494, + "grad_norm": 10.858335850392738, + "learning_rate": 4.99281062904152e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.4620689690113068, + "step": 73650 + }, + { + "epoch": 0.07418609827756911, + "grad_norm": 12.931085581666954, + "learning_rate": 4.99280763562855e-05, + "loss": 2.6343, + "mean_token_accuracy": 0.32413793057203294, + "step": 73655 + }, + { + "epoch": 0.07419113433067329, + "grad_norm": 18.1439776834488, + "learning_rate": 4.9928046415935284e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4034482777118683, + "step": 73660 + }, + { + "epoch": 0.07419617038377746, + "grad_norm": 11.338843231084653, + "learning_rate": 4.992801646936456e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.4241379380226135, + "step": 73665 + }, + { + "epoch": 0.07420120643688163, + "grad_norm": 11.021891851803653, + "learning_rate": 4.9927986516573345e-05, + "loss": 2.1308, + "mean_token_accuracy": 0.43448275327682495, + "step": 73670 + }, + { + "epoch": 0.07420624248998581, + "grad_norm": 12.188399459987645, + "learning_rate": 4.992795655756165e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.4034482717514038, + "step": 73675 + }, + { + "epoch": 0.07421127854308998, + "grad_norm": 13.513414605959973, + "learning_rate": 4.9927926592329474e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.36896551251411436, + "step": 73680 + }, + { + "epoch": 0.07421631459619416, + "grad_norm": 11.738558941992899, + "learning_rate": 4.992789662087683e-05, + "loss": 2.6094, + "mean_token_accuracy": 0.3910465896129608, + "step": 73685 + }, + { + "epoch": 0.07422135064929833, + "grad_norm": 11.59881670224189, + "learning_rate": 4.9927866643203725e-05, + "loss": 2.595, + "mean_token_accuracy": 0.38620689511299133, + "step": 73690 + }, + { + "epoch": 0.0742263867024025, + "grad_norm": 9.57494476153104, + "learning_rate": 4.992783665931017e-05, + "loss": 2.6028, + "mean_token_accuracy": 0.3793103456497192, + "step": 73695 + }, + { + "epoch": 0.07423142275550668, + "grad_norm": 11.096350505871342, + "learning_rate": 4.992780666919617e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.41034482717514037, + "step": 73700 + }, + { + "epoch": 0.07423645880861085, + "grad_norm": 9.861435241801793, + "learning_rate": 4.992777667286172e-05, + "loss": 2.2941, + "mean_token_accuracy": 0.42068966031074523, + "step": 73705 + }, + { + "epoch": 0.07424149486171502, + "grad_norm": 10.047507831909865, + "learning_rate": 4.992774667030686e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.37241379022598264, + "step": 73710 + }, + { + "epoch": 0.07424653091481918, + "grad_norm": 11.714142853487562, + "learning_rate": 4.992771666153157e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.41034482717514037, + "step": 73715 + }, + { + "epoch": 0.07425156696792336, + "grad_norm": 12.139510687514797, + "learning_rate": 4.992768664653588e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.42413793206214906, + "step": 73720 + }, + { + "epoch": 0.07425660302102753, + "grad_norm": 11.030309031569125, + "learning_rate": 4.992765662531978e-05, + "loss": 2.378, + "mean_token_accuracy": 0.4206896543502808, + "step": 73725 + }, + { + "epoch": 0.0742616390741317, + "grad_norm": 16.011755293173092, + "learning_rate": 4.992762659788328e-05, + "loss": 2.6507, + "mean_token_accuracy": 0.4379310369491577, + "step": 73730 + }, + { + "epoch": 0.07426667512723588, + "grad_norm": 10.666542903429121, + "learning_rate": 4.992759656422641e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.4122806966304779, + "step": 73735 + }, + { + "epoch": 0.07427171118034005, + "grad_norm": 11.700557396980534, + "learning_rate": 4.992756652434915e-05, + "loss": 2.7827, + "mean_token_accuracy": 0.37586207687854767, + "step": 73740 + }, + { + "epoch": 0.07427674723344423, + "grad_norm": 14.5221424078562, + "learning_rate": 4.9927536478251524e-05, + "loss": 3.0883, + "mean_token_accuracy": 0.3643678233027458, + "step": 73745 + }, + { + "epoch": 0.0742817832865484, + "grad_norm": 9.616768043136565, + "learning_rate": 4.992750642593354e-05, + "loss": 2.2404, + "mean_token_accuracy": 0.4448275864124298, + "step": 73750 + }, + { + "epoch": 0.07428681933965257, + "grad_norm": 11.638447503321581, + "learning_rate": 4.99274763673952e-05, + "loss": 2.4899, + "mean_token_accuracy": 0.4413793087005615, + "step": 73755 + }, + { + "epoch": 0.07429185539275675, + "grad_norm": 11.575827599551001, + "learning_rate": 4.992744630263652e-05, + "loss": 2.5081, + "mean_token_accuracy": 0.3827586114406586, + "step": 73760 + }, + { + "epoch": 0.07429689144586092, + "grad_norm": 10.555713534593526, + "learning_rate": 4.99274162316575e-05, + "loss": 2.6938, + "mean_token_accuracy": 0.34482758641242983, + "step": 73765 + }, + { + "epoch": 0.0743019274989651, + "grad_norm": 11.767886188270383, + "learning_rate": 4.992738615445815e-05, + "loss": 2.199, + "mean_token_accuracy": 0.4448275864124298, + "step": 73770 + }, + { + "epoch": 0.07430696355206927, + "grad_norm": 10.267336498757054, + "learning_rate": 4.992735607103849e-05, + "loss": 2.7611, + "mean_token_accuracy": 0.3931034505367279, + "step": 73775 + }, + { + "epoch": 0.07431199960517344, + "grad_norm": 11.956318339799104, + "learning_rate": 4.992732598139851e-05, + "loss": 2.5448, + "mean_token_accuracy": 0.4103448212146759, + "step": 73780 + }, + { + "epoch": 0.0743170356582776, + "grad_norm": 12.035066326776143, + "learning_rate": 4.992729588553823e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.43103448748588563, + "step": 73785 + }, + { + "epoch": 0.07432207171138178, + "grad_norm": 11.407407675402807, + "learning_rate": 4.992726578345765e-05, + "loss": 2.3361, + "mean_token_accuracy": 0.4482758641242981, + "step": 73790 + }, + { + "epoch": 0.07432710776448595, + "grad_norm": 10.56347306608944, + "learning_rate": 4.99272356751568e-05, + "loss": 2.211, + "mean_token_accuracy": 0.46388384103775027, + "step": 73795 + }, + { + "epoch": 0.07433214381759012, + "grad_norm": 10.726761001651662, + "learning_rate": 4.992720556063566e-05, + "loss": 2.7758, + "mean_token_accuracy": 0.4206896543502808, + "step": 73800 + }, + { + "epoch": 0.0743371798706943, + "grad_norm": 10.02281202840823, + "learning_rate": 4.992717543989426e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.4068965494632721, + "step": 73805 + }, + { + "epoch": 0.07434221592379847, + "grad_norm": 9.929350755460154, + "learning_rate": 4.992714531293259e-05, + "loss": 2.1412, + "mean_token_accuracy": 0.45517241954803467, + "step": 73810 + }, + { + "epoch": 0.07434725197690265, + "grad_norm": 10.5921077840851, + "learning_rate": 4.992711517975067e-05, + "loss": 2.6367, + "mean_token_accuracy": 0.35692681074142457, + "step": 73815 + }, + { + "epoch": 0.07435228803000682, + "grad_norm": 11.405852522544334, + "learning_rate": 4.992708504034851e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.4536600112915039, + "step": 73820 + }, + { + "epoch": 0.07435732408311099, + "grad_norm": 12.797345112831934, + "learning_rate": 4.992705489472611e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.46896551847457885, + "step": 73825 + }, + { + "epoch": 0.07436236013621517, + "grad_norm": 11.526747412515364, + "learning_rate": 4.992702474288349e-05, + "loss": 2.5823, + "mean_token_accuracy": 0.3876588046550751, + "step": 73830 + }, + { + "epoch": 0.07436739618931934, + "grad_norm": 11.740531093059374, + "learning_rate": 4.992699458482064e-05, + "loss": 2.468, + "mean_token_accuracy": 0.4, + "step": 73835 + }, + { + "epoch": 0.07437243224242351, + "grad_norm": 11.66094294133679, + "learning_rate": 4.992696442053758e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.42238354682922363, + "step": 73840 + }, + { + "epoch": 0.07437746829552769, + "grad_norm": 12.487936707267025, + "learning_rate": 4.9926934250034325e-05, + "loss": 2.3687, + "mean_token_accuracy": 0.44827585816383364, + "step": 73845 + }, + { + "epoch": 0.07438250434863186, + "grad_norm": 12.365072504406971, + "learning_rate": 4.992690407331087e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.358620685338974, + "step": 73850 + }, + { + "epoch": 0.07438754040173602, + "grad_norm": 13.00708206286683, + "learning_rate": 4.992687389036724e-05, + "loss": 2.6934, + "mean_token_accuracy": 0.41034482717514037, + "step": 73855 + }, + { + "epoch": 0.0743925764548402, + "grad_norm": 14.072368648969313, + "learning_rate": 4.992684370120342e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.4586206912994385, + "step": 73860 + }, + { + "epoch": 0.07439761250794437, + "grad_norm": 12.03604809660669, + "learning_rate": 4.992681350581944e-05, + "loss": 2.5602, + "mean_token_accuracy": 0.4363581299781799, + "step": 73865 + }, + { + "epoch": 0.07440264856104854, + "grad_norm": 12.493015138253426, + "learning_rate": 4.992678330421529e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.41034482717514037, + "step": 73870 + }, + { + "epoch": 0.07440768461415272, + "grad_norm": 13.458675910740086, + "learning_rate": 4.9926753096391e-05, + "loss": 2.9717, + "mean_token_accuracy": 0.35172412991523744, + "step": 73875 + }, + { + "epoch": 0.07441272066725689, + "grad_norm": 15.650197165600286, + "learning_rate": 4.992672288234656e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.3931034505367279, + "step": 73880 + }, + { + "epoch": 0.07441775672036106, + "grad_norm": 16.382106190663176, + "learning_rate": 4.992669266208198e-05, + "loss": 2.8511, + "mean_token_accuracy": 0.3620689660310745, + "step": 73885 + }, + { + "epoch": 0.07442279277346524, + "grad_norm": 9.292887021953403, + "learning_rate": 4.9926662435597285e-05, + "loss": 2.3653, + "mean_token_accuracy": 0.4344827592372894, + "step": 73890 + }, + { + "epoch": 0.07442782882656941, + "grad_norm": 10.133451892027326, + "learning_rate": 4.992663220289246e-05, + "loss": 2.3177, + "mean_token_accuracy": 0.46702963709831236, + "step": 73895 + }, + { + "epoch": 0.07443286487967359, + "grad_norm": 13.072493975901146, + "learning_rate": 4.9926601963967526e-05, + "loss": 2.5747, + "mean_token_accuracy": 0.4137930989265442, + "step": 73900 + }, + { + "epoch": 0.07443790093277776, + "grad_norm": 9.24923147309057, + "learning_rate": 4.99265717188225e-05, + "loss": 2.2384, + "mean_token_accuracy": 0.4315270930528641, + "step": 73905 + }, + { + "epoch": 0.07444293698588193, + "grad_norm": 10.921803307393334, + "learning_rate": 4.9926541467457374e-05, + "loss": 2.3667, + "mean_token_accuracy": 0.38753780722618103, + "step": 73910 + }, + { + "epoch": 0.0744479730389861, + "grad_norm": 17.321765298143042, + "learning_rate": 4.9926511209872166e-05, + "loss": 2.7622, + "mean_token_accuracy": 0.3965517282485962, + "step": 73915 + }, + { + "epoch": 0.07445300909209028, + "grad_norm": 12.299526909174983, + "learning_rate": 4.9926480946066876e-05, + "loss": 2.7418, + "mean_token_accuracy": 0.34482758343219755, + "step": 73920 + }, + { + "epoch": 0.07445804514519444, + "grad_norm": 11.327556309525347, + "learning_rate": 4.9926450676041526e-05, + "loss": 2.9629, + "mean_token_accuracy": 0.3551724135875702, + "step": 73925 + }, + { + "epoch": 0.07446308119829861, + "grad_norm": 13.44288272157233, + "learning_rate": 4.9926420399796114e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.441379314661026, + "step": 73930 + }, + { + "epoch": 0.07446811725140279, + "grad_norm": 10.800823138773753, + "learning_rate": 4.992639011733065e-05, + "loss": 2.9734, + "mean_token_accuracy": 0.4034482777118683, + "step": 73935 + }, + { + "epoch": 0.07447315330450696, + "grad_norm": 15.073112974654988, + "learning_rate": 4.9926359828645134e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.37586206793785093, + "step": 73940 + }, + { + "epoch": 0.07447818935761114, + "grad_norm": 10.523337776663704, + "learning_rate": 4.992632953373959e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.42758620977401735, + "step": 73945 + }, + { + "epoch": 0.07448322541071531, + "grad_norm": 11.435353100698668, + "learning_rate": 4.9926299232614025e-05, + "loss": 2.7945, + "mean_token_accuracy": 0.3448275804519653, + "step": 73950 + }, + { + "epoch": 0.07448826146381948, + "grad_norm": 10.590176818118957, + "learning_rate": 4.9926268925268436e-05, + "loss": 2.555, + "mean_token_accuracy": 0.42758620977401735, + "step": 73955 + }, + { + "epoch": 0.07449329751692366, + "grad_norm": 10.30890059769074, + "learning_rate": 4.9926238611702834e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.37398669123649597, + "step": 73960 + }, + { + "epoch": 0.07449833357002783, + "grad_norm": 10.143187054028136, + "learning_rate": 4.9926208291917246e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.37241379618644715, + "step": 73965 + }, + { + "epoch": 0.074503369623132, + "grad_norm": 12.098449141244899, + "learning_rate": 4.992617796591165e-05, + "loss": 2.5848, + "mean_token_accuracy": 0.41379310488700866, + "step": 73970 + }, + { + "epoch": 0.07450840567623618, + "grad_norm": 12.575821298055848, + "learning_rate": 4.9926147633686075e-05, + "loss": 2.8461, + "mean_token_accuracy": 0.36551723778247835, + "step": 73975 + }, + { + "epoch": 0.07451344172934035, + "grad_norm": 15.01439106295847, + "learning_rate": 4.9926117295240526e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.4655172348022461, + "step": 73980 + }, + { + "epoch": 0.07451847778244453, + "grad_norm": 10.019842620970008, + "learning_rate": 4.992608695057501e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.4655172288417816, + "step": 73985 + }, + { + "epoch": 0.0745235138355487, + "grad_norm": 10.133255677470762, + "learning_rate": 4.992605659968953e-05, + "loss": 2.2421, + "mean_token_accuracy": 0.4379310369491577, + "step": 73990 + }, + { + "epoch": 0.07452854988865286, + "grad_norm": 10.761493921330587, + "learning_rate": 4.9926026242584104e-05, + "loss": 2.7199, + "mean_token_accuracy": 0.38620689511299133, + "step": 73995 + }, + { + "epoch": 0.07453358594175703, + "grad_norm": 11.69394381662905, + "learning_rate": 4.992599587925874e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.3862068891525269, + "step": 74000 + }, + { + "epoch": 0.0745386219948612, + "grad_norm": 9.184443872991706, + "learning_rate": 4.992596550971344e-05, + "loss": 2.6132, + "mean_token_accuracy": 0.4310344785451889, + "step": 74005 + }, + { + "epoch": 0.07454365804796538, + "grad_norm": 11.29588590447559, + "learning_rate": 4.992593513394821e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.4068965494632721, + "step": 74010 + }, + { + "epoch": 0.07454869410106955, + "grad_norm": 11.256098383171906, + "learning_rate": 4.9925904751963074e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.44827585816383364, + "step": 74015 + }, + { + "epoch": 0.07455373015417373, + "grad_norm": 12.207555627296458, + "learning_rate": 4.992587436375802e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.47586206793785096, + "step": 74020 + }, + { + "epoch": 0.0745587662072779, + "grad_norm": 15.714446832321306, + "learning_rate": 4.992584396933306e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.3896551638841629, + "step": 74025 + }, + { + "epoch": 0.07456380226038208, + "grad_norm": 12.002240277487008, + "learning_rate": 4.9925813568688215e-05, + "loss": 2.745, + "mean_token_accuracy": 0.42068964838981626, + "step": 74030 + }, + { + "epoch": 0.07456883831348625, + "grad_norm": 11.199166995912377, + "learning_rate": 4.992578316182349e-05, + "loss": 2.9546, + "mean_token_accuracy": 0.35995160341262816, + "step": 74035 + }, + { + "epoch": 0.07457387436659042, + "grad_norm": 12.211916670978802, + "learning_rate": 4.992575274873889e-05, + "loss": 3.0113, + "mean_token_accuracy": 0.3896551787853241, + "step": 74040 + }, + { + "epoch": 0.0745789104196946, + "grad_norm": 12.28447005126784, + "learning_rate": 4.992572232943442e-05, + "loss": 2.5282, + "mean_token_accuracy": 0.4034482717514038, + "step": 74045 + }, + { + "epoch": 0.07458394647279877, + "grad_norm": 11.491021536384107, + "learning_rate": 4.9925691903910096e-05, + "loss": 2.8965, + "mean_token_accuracy": 0.40344828367233276, + "step": 74050 + }, + { + "epoch": 0.07458898252590294, + "grad_norm": 9.861451129187612, + "learning_rate": 4.992566147216592e-05, + "loss": 2.4128, + "mean_token_accuracy": 0.3896551728248596, + "step": 74055 + }, + { + "epoch": 0.07459401857900712, + "grad_norm": 11.676553106242576, + "learning_rate": 4.99256310342019e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.38965516686439516, + "step": 74060 + }, + { + "epoch": 0.07459905463211128, + "grad_norm": 11.468093858443208, + "learning_rate": 4.9925600590018054e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4120387136936188, + "step": 74065 + }, + { + "epoch": 0.07460409068521545, + "grad_norm": 11.207836179168911, + "learning_rate": 4.992557013961439e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.41379310488700866, + "step": 74070 + }, + { + "epoch": 0.07460912673831963, + "grad_norm": 10.411372061587082, + "learning_rate": 4.99255396829909e-05, + "loss": 2.1922, + "mean_token_accuracy": 0.4724137902259827, + "step": 74075 + }, + { + "epoch": 0.0746141627914238, + "grad_norm": 7.938392302790149, + "learning_rate": 4.99255092201476e-05, + "loss": 1.8021, + "mean_token_accuracy": 0.5531760454177856, + "step": 74080 + }, + { + "epoch": 0.07461919884452797, + "grad_norm": 13.26272150778796, + "learning_rate": 4.992547875108451e-05, + "loss": 2.7256, + "mean_token_accuracy": 0.39655172228813174, + "step": 74085 + }, + { + "epoch": 0.07462423489763215, + "grad_norm": 12.906795732617358, + "learning_rate": 4.992544827580163e-05, + "loss": 2.4238, + "mean_token_accuracy": 0.4344827592372894, + "step": 74090 + }, + { + "epoch": 0.07462927095073632, + "grad_norm": 11.635695899199026, + "learning_rate": 4.992541779429896e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.3827586233615875, + "step": 74095 + }, + { + "epoch": 0.0746343070038405, + "grad_norm": 8.850676857887878, + "learning_rate": 4.992538730657652e-05, + "loss": 2.6257, + "mean_token_accuracy": 0.41034482717514037, + "step": 74100 + }, + { + "epoch": 0.07463934305694467, + "grad_norm": 9.291045497361653, + "learning_rate": 4.9925356812634324e-05, + "loss": 2.2709, + "mean_token_accuracy": 0.4689655125141144, + "step": 74105 + }, + { + "epoch": 0.07464437911004884, + "grad_norm": 12.51893398984911, + "learning_rate": 4.9925326312472364e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.38421052098274233, + "step": 74110 + }, + { + "epoch": 0.07464941516315302, + "grad_norm": 14.849839847027791, + "learning_rate": 4.9925295806090655e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.4517241358757019, + "step": 74115 + }, + { + "epoch": 0.07465445121625719, + "grad_norm": 11.594724948797252, + "learning_rate": 4.992526529348921e-05, + "loss": 2.4153, + "mean_token_accuracy": 0.4482758641242981, + "step": 74120 + }, + { + "epoch": 0.07465948726936136, + "grad_norm": 11.51934983668768, + "learning_rate": 4.992523477466803e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.39655172228813174, + "step": 74125 + }, + { + "epoch": 0.07466452332246554, + "grad_norm": 10.365735737793699, + "learning_rate": 4.992520424962713e-05, + "loss": 2.4432, + "mean_token_accuracy": 0.38275861740112305, + "step": 74130 + }, + { + "epoch": 0.0746695593755697, + "grad_norm": 17.859420800833238, + "learning_rate": 4.9925173718366515e-05, + "loss": 2.5676, + "mean_token_accuracy": 0.5054446518421173, + "step": 74135 + }, + { + "epoch": 0.07467459542867387, + "grad_norm": 12.917236174423152, + "learning_rate": 4.992514318088619e-05, + "loss": 2.5153, + "mean_token_accuracy": 0.3896551728248596, + "step": 74140 + }, + { + "epoch": 0.07467963148177804, + "grad_norm": 10.55110949034747, + "learning_rate": 4.992511263718617e-05, + "loss": 2.2726, + "mean_token_accuracy": 0.4, + "step": 74145 + }, + { + "epoch": 0.07468466753488222, + "grad_norm": 12.13870132225336, + "learning_rate": 4.992508208726647e-05, + "loss": 2.6376, + "mean_token_accuracy": 0.3896551787853241, + "step": 74150 + }, + { + "epoch": 0.07468970358798639, + "grad_norm": 8.89327466054518, + "learning_rate": 4.992505153112709e-05, + "loss": 2.7559, + "mean_token_accuracy": 0.42068966031074523, + "step": 74155 + }, + { + "epoch": 0.07469473964109057, + "grad_norm": 10.64888647954188, + "learning_rate": 4.992502096876802e-05, + "loss": 2.6124, + "mean_token_accuracy": 0.36896551251411436, + "step": 74160 + }, + { + "epoch": 0.07469977569419474, + "grad_norm": 13.627241509361527, + "learning_rate": 4.99249904001893e-05, + "loss": 3.0611, + "mean_token_accuracy": 0.3655172437429428, + "step": 74165 + }, + { + "epoch": 0.07470481174729891, + "grad_norm": 11.908444496688013, + "learning_rate": 4.992495982539093e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.4206896543502808, + "step": 74170 + }, + { + "epoch": 0.07470984780040309, + "grad_norm": 19.982741144463642, + "learning_rate": 4.99249292443729e-05, + "loss": 2.7375, + "mean_token_accuracy": 0.4344827622175217, + "step": 74175 + }, + { + "epoch": 0.07471488385350726, + "grad_norm": 12.58664641906852, + "learning_rate": 4.992489865713524e-05, + "loss": 2.6528, + "mean_token_accuracy": 0.41724138259887694, + "step": 74180 + }, + { + "epoch": 0.07471991990661143, + "grad_norm": 11.333665117570488, + "learning_rate": 4.992486806367796e-05, + "loss": 2.3685, + "mean_token_accuracy": 0.42413793206214906, + "step": 74185 + }, + { + "epoch": 0.07472495595971561, + "grad_norm": 13.665573995973682, + "learning_rate": 4.992483746400105e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.4000000059604645, + "step": 74190 + }, + { + "epoch": 0.07472999201281978, + "grad_norm": 11.080945448061124, + "learning_rate": 4.9924806858104525e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.4, + "step": 74195 + }, + { + "epoch": 0.07473502806592396, + "grad_norm": 11.629961530886522, + "learning_rate": 4.9924776245988405e-05, + "loss": 1.963, + "mean_token_accuracy": 0.47931034564971925, + "step": 74200 + }, + { + "epoch": 0.07474006411902812, + "grad_norm": 9.322069575672932, + "learning_rate": 4.992474562765268e-05, + "loss": 2.4343, + "mean_token_accuracy": 0.42413792610168455, + "step": 74205 + }, + { + "epoch": 0.07474510017213229, + "grad_norm": 9.621145206695322, + "learning_rate": 4.992471500309736e-05, + "loss": 2.3957, + "mean_token_accuracy": 0.44827587008476255, + "step": 74210 + }, + { + "epoch": 0.07475013622523646, + "grad_norm": 9.89626262987316, + "learning_rate": 4.992468437232248e-05, + "loss": 2.9162, + "mean_token_accuracy": 0.38965516686439516, + "step": 74215 + }, + { + "epoch": 0.07475517227834064, + "grad_norm": 16.778608014872635, + "learning_rate": 4.992465373532802e-05, + "loss": 2.0217, + "mean_token_accuracy": 0.47931033968925474, + "step": 74220 + }, + { + "epoch": 0.07476020833144481, + "grad_norm": 10.877294811953972, + "learning_rate": 4.992462309211401e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.41724138259887694, + "step": 74225 + }, + { + "epoch": 0.07476524438454898, + "grad_norm": 10.196359983468803, + "learning_rate": 4.992459244268044e-05, + "loss": 1.9349, + "mean_token_accuracy": 0.4862068951129913, + "step": 74230 + }, + { + "epoch": 0.07477028043765316, + "grad_norm": 11.063878158498069, + "learning_rate": 4.9924561787027324e-05, + "loss": 2.4044, + "mean_token_accuracy": 0.4206896543502808, + "step": 74235 + }, + { + "epoch": 0.07477531649075733, + "grad_norm": 10.797848261889524, + "learning_rate": 4.992453112515467e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.41379310488700866, + "step": 74240 + }, + { + "epoch": 0.0747803525438615, + "grad_norm": 14.902552168981597, + "learning_rate": 4.9924500457062495e-05, + "loss": 2.5545, + "mean_token_accuracy": 0.41034482717514037, + "step": 74245 + }, + { + "epoch": 0.07478538859696568, + "grad_norm": 11.51870346550696, + "learning_rate": 4.992446978275079e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.36551723480224607, + "step": 74250 + }, + { + "epoch": 0.07479042465006985, + "grad_norm": 9.876411185200189, + "learning_rate": 4.992443910221959e-05, + "loss": 2.4941, + "mean_token_accuracy": 0.4172413766384125, + "step": 74255 + }, + { + "epoch": 0.07479546070317403, + "grad_norm": 12.343835083903386, + "learning_rate": 4.9924408415468876e-05, + "loss": 2.9519, + "mean_token_accuracy": 0.36896551251411436, + "step": 74260 + }, + { + "epoch": 0.0748004967562782, + "grad_norm": 11.875383357672037, + "learning_rate": 4.992437772249867e-05, + "loss": 2.6959, + "mean_token_accuracy": 0.3896551728248596, + "step": 74265 + }, + { + "epoch": 0.07480553280938237, + "grad_norm": 11.855927324418904, + "learning_rate": 4.9924347023308986e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.40689656138420105, + "step": 74270 + }, + { + "epoch": 0.07481056886248653, + "grad_norm": 13.251153291016989, + "learning_rate": 4.992431631789982e-05, + "loss": 2.7221, + "mean_token_accuracy": 0.3551724135875702, + "step": 74275 + }, + { + "epoch": 0.07481560491559071, + "grad_norm": 9.596593419375113, + "learning_rate": 4.9924285606271196e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.43103447556495667, + "step": 74280 + }, + { + "epoch": 0.07482064096869488, + "grad_norm": 20.558329273704697, + "learning_rate": 4.99242548884231e-05, + "loss": 2.834, + "mean_token_accuracy": 0.43793103098869324, + "step": 74285 + }, + { + "epoch": 0.07482567702179906, + "grad_norm": 13.820635556792803, + "learning_rate": 4.992422416435556e-05, + "loss": 2.7312, + "mean_token_accuracy": 0.3655172407627106, + "step": 74290 + }, + { + "epoch": 0.07483071307490323, + "grad_norm": 10.315810325782191, + "learning_rate": 4.992419343406858e-05, + "loss": 2.4181, + "mean_token_accuracy": 0.41034482717514037, + "step": 74295 + }, + { + "epoch": 0.0748357491280074, + "grad_norm": 9.31226254551686, + "learning_rate": 4.9924162697562154e-05, + "loss": 2.256, + "mean_token_accuracy": 0.41724138259887694, + "step": 74300 + }, + { + "epoch": 0.07484078518111158, + "grad_norm": 11.402568319134643, + "learning_rate": 4.992413195483631e-05, + "loss": 2.7093, + "mean_token_accuracy": 0.39655172228813174, + "step": 74305 + }, + { + "epoch": 0.07484582123421575, + "grad_norm": 9.96133520409277, + "learning_rate": 4.992410120589106e-05, + "loss": 2.4894, + "mean_token_accuracy": 0.42413793206214906, + "step": 74310 + }, + { + "epoch": 0.07485085728731992, + "grad_norm": 10.596791688506148, + "learning_rate": 4.992407045072639e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.5034482717514038, + "step": 74315 + }, + { + "epoch": 0.0748558933404241, + "grad_norm": 9.604333871957234, + "learning_rate": 4.992403968934232e-05, + "loss": 2.3393, + "mean_token_accuracy": 0.41034482717514037, + "step": 74320 + }, + { + "epoch": 0.07486092939352827, + "grad_norm": 11.190424881352772, + "learning_rate": 4.992400892173886e-05, + "loss": 2.5643, + "mean_token_accuracy": 0.41034482717514037, + "step": 74325 + }, + { + "epoch": 0.07486596544663245, + "grad_norm": 11.41769878893656, + "learning_rate": 4.992397814791602e-05, + "loss": 2.1853, + "mean_token_accuracy": 0.43448275327682495, + "step": 74330 + }, + { + "epoch": 0.07487100149973662, + "grad_norm": 11.235681949946677, + "learning_rate": 4.992394736787381e-05, + "loss": 3.4314, + "mean_token_accuracy": 0.32214156687259676, + "step": 74335 + }, + { + "epoch": 0.0748760375528408, + "grad_norm": 9.020089417613077, + "learning_rate": 4.992391658161223e-05, + "loss": 2.0489, + "mean_token_accuracy": 0.5137930929660797, + "step": 74340 + }, + { + "epoch": 0.07488107360594495, + "grad_norm": 13.535140799396936, + "learning_rate": 4.992388578913129e-05, + "loss": 2.5086, + "mean_token_accuracy": 0.3896551728248596, + "step": 74345 + }, + { + "epoch": 0.07488610965904913, + "grad_norm": 11.196682218085444, + "learning_rate": 4.9923854990431e-05, + "loss": 2.618, + "mean_token_accuracy": 0.4034482777118683, + "step": 74350 + }, + { + "epoch": 0.0748911457121533, + "grad_norm": 9.526240174426706, + "learning_rate": 4.992382418551137e-05, + "loss": 2.8418, + "mean_token_accuracy": 0.3655172407627106, + "step": 74355 + }, + { + "epoch": 0.07489618176525747, + "grad_norm": 11.37867831104485, + "learning_rate": 4.992379337437242e-05, + "loss": 2.3842, + "mean_token_accuracy": 0.4015124022960663, + "step": 74360 + }, + { + "epoch": 0.07490121781836165, + "grad_norm": 10.805245750128467, + "learning_rate": 4.992376255701414e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.38965516090393065, + "step": 74365 + }, + { + "epoch": 0.07490625387146582, + "grad_norm": 11.216256501506264, + "learning_rate": 4.9923731733436555e-05, + "loss": 2.1359, + "mean_token_accuracy": 0.49655171632766726, + "step": 74370 + }, + { + "epoch": 0.07491128992457, + "grad_norm": 13.901447306731669, + "learning_rate": 4.992370090363965e-05, + "loss": 2.9417, + "mean_token_accuracy": 0.34482758343219755, + "step": 74375 + }, + { + "epoch": 0.07491632597767417, + "grad_norm": 12.293800677139833, + "learning_rate": 4.9923670067623454e-05, + "loss": 3.212, + "mean_token_accuracy": 0.42512314915657046, + "step": 74380 + }, + { + "epoch": 0.07492136203077834, + "grad_norm": 11.986101737088507, + "learning_rate": 4.992363922538797e-05, + "loss": 2.5723, + "mean_token_accuracy": 0.4068965554237366, + "step": 74385 + }, + { + "epoch": 0.07492639808388252, + "grad_norm": 12.70300256486122, + "learning_rate": 4.99236083769332e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.4379310369491577, + "step": 74390 + }, + { + "epoch": 0.07493143413698669, + "grad_norm": 10.837827362826351, + "learning_rate": 4.992357752225917e-05, + "loss": 2.8077, + "mean_token_accuracy": 0.3689655244350433, + "step": 74395 + }, + { + "epoch": 0.07493647019009086, + "grad_norm": 11.52890530681104, + "learning_rate": 4.992354666136587e-05, + "loss": 2.4655, + "mean_token_accuracy": 0.38620689511299133, + "step": 74400 + }, + { + "epoch": 0.07494150624319504, + "grad_norm": 12.805526043302219, + "learning_rate": 4.992351579425332e-05, + "loss": 3.0517, + "mean_token_accuracy": 0.3965517163276672, + "step": 74405 + }, + { + "epoch": 0.07494654229629921, + "grad_norm": 9.196198975401728, + "learning_rate": 4.9923484920921515e-05, + "loss": 2.6373, + "mean_token_accuracy": 0.4310344815254211, + "step": 74410 + }, + { + "epoch": 0.07495157834940337, + "grad_norm": 12.050633963051533, + "learning_rate": 4.9923454041370476e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.4034482777118683, + "step": 74415 + }, + { + "epoch": 0.07495661440250755, + "grad_norm": 11.829061245530673, + "learning_rate": 4.992342315560022e-05, + "loss": 2.6844, + "mean_token_accuracy": 0.39655172228813174, + "step": 74420 + }, + { + "epoch": 0.07496165045561172, + "grad_norm": 11.118065710046862, + "learning_rate": 4.992339226361073e-05, + "loss": 2.4732, + "mean_token_accuracy": 0.38965516686439516, + "step": 74425 + }, + { + "epoch": 0.0749666865087159, + "grad_norm": 17.736158544830545, + "learning_rate": 4.992336136540203e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.37586206793785093, + "step": 74430 + }, + { + "epoch": 0.07497172256182007, + "grad_norm": 10.290261594254197, + "learning_rate": 4.992333046097413e-05, + "loss": 2.452, + "mean_token_accuracy": 0.3931034505367279, + "step": 74435 + }, + { + "epoch": 0.07497675861492424, + "grad_norm": 11.476327631613017, + "learning_rate": 4.992329955032704e-05, + "loss": 2.7828, + "mean_token_accuracy": 0.38275861740112305, + "step": 74440 + }, + { + "epoch": 0.07498179466802841, + "grad_norm": 11.22055784981995, + "learning_rate": 4.992326863346076e-05, + "loss": 2.5464, + "mean_token_accuracy": 0.43448275327682495, + "step": 74445 + }, + { + "epoch": 0.07498683072113259, + "grad_norm": 9.429611475148532, + "learning_rate": 4.9923237710375306e-05, + "loss": 2.495, + "mean_token_accuracy": 0.4000000059604645, + "step": 74450 + }, + { + "epoch": 0.07499186677423676, + "grad_norm": 12.229032192088107, + "learning_rate": 4.9923206781070676e-05, + "loss": 2.3035, + "mean_token_accuracy": 0.44827585816383364, + "step": 74455 + }, + { + "epoch": 0.07499690282734094, + "grad_norm": 11.765983549458799, + "learning_rate": 4.9923175845546894e-05, + "loss": 2.627, + "mean_token_accuracy": 0.3862068891525269, + "step": 74460 + }, + { + "epoch": 0.07500193888044511, + "grad_norm": 10.583596940391164, + "learning_rate": 4.992314490380395e-05, + "loss": 2.7771, + "mean_token_accuracy": 0.38620689511299133, + "step": 74465 + }, + { + "epoch": 0.07500697493354928, + "grad_norm": 10.977811221583897, + "learning_rate": 4.992311395584188e-05, + "loss": 2.4795, + "mean_token_accuracy": 0.4068965494632721, + "step": 74470 + }, + { + "epoch": 0.07501201098665346, + "grad_norm": 15.17782509000339, + "learning_rate": 4.992308300166066e-05, + "loss": 2.7612, + "mean_token_accuracy": 0.3413792997598648, + "step": 74475 + }, + { + "epoch": 0.07501704703975763, + "grad_norm": 11.47026806933236, + "learning_rate": 4.992305204126032e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.41034482717514037, + "step": 74480 + }, + { + "epoch": 0.07502208309286179, + "grad_norm": 10.133484811205273, + "learning_rate": 4.992302107464086e-05, + "loss": 2.1546, + "mean_token_accuracy": 0.4310344815254211, + "step": 74485 + }, + { + "epoch": 0.07502711914596596, + "grad_norm": 11.412941777633272, + "learning_rate": 4.99229901018023e-05, + "loss": 2.3158, + "mean_token_accuracy": 0.4517241358757019, + "step": 74490 + }, + { + "epoch": 0.07503215519907014, + "grad_norm": 11.299828877273477, + "learning_rate": 4.992295912274463e-05, + "loss": 2.6367, + "mean_token_accuracy": 0.39310344159603117, + "step": 74495 + }, + { + "epoch": 0.07503719125217431, + "grad_norm": 10.327596041858158, + "learning_rate": 4.992292813746787e-05, + "loss": 2.7698, + "mean_token_accuracy": 0.4137930989265442, + "step": 74500 + }, + { + "epoch": 0.07504222730527849, + "grad_norm": 11.783911849212519, + "learning_rate": 4.992289714597203e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.4172413766384125, + "step": 74505 + }, + { + "epoch": 0.07504726335838266, + "grad_norm": 10.577087843329933, + "learning_rate": 4.992286614825712e-05, + "loss": 2.6119, + "mean_token_accuracy": 0.3911675751209259, + "step": 74510 + }, + { + "epoch": 0.07505229941148683, + "grad_norm": 9.820734268288287, + "learning_rate": 4.9922835144323143e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.43811252117156985, + "step": 74515 + }, + { + "epoch": 0.07505733546459101, + "grad_norm": 13.293306421561798, + "learning_rate": 4.992280413417011e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.4157289773225784, + "step": 74520 + }, + { + "epoch": 0.07506237151769518, + "grad_norm": 8.667513840389539, + "learning_rate": 4.992277311779802e-05, + "loss": 2.2602, + "mean_token_accuracy": 0.4206896543502808, + "step": 74525 + }, + { + "epoch": 0.07506740757079935, + "grad_norm": 12.233327291281816, + "learning_rate": 4.99227420952069e-05, + "loss": 2.7096, + "mean_token_accuracy": 0.3482758641242981, + "step": 74530 + }, + { + "epoch": 0.07507244362390353, + "grad_norm": 10.225347339780871, + "learning_rate": 4.992271106639674e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.4379310250282288, + "step": 74535 + }, + { + "epoch": 0.0750774796770077, + "grad_norm": 12.041037110684764, + "learning_rate": 4.992268003136756e-05, + "loss": 2.3027, + "mean_token_accuracy": 0.40344828367233276, + "step": 74540 + }, + { + "epoch": 0.07508251573011188, + "grad_norm": 8.636441606126446, + "learning_rate": 4.992264899011937e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.44827585816383364, + "step": 74545 + }, + { + "epoch": 0.07508755178321605, + "grad_norm": 10.54169446949642, + "learning_rate": 4.9922617942652174e-05, + "loss": 2.4574, + "mean_token_accuracy": 0.40828797221183777, + "step": 74550 + }, + { + "epoch": 0.07509258783632021, + "grad_norm": 11.336913707808195, + "learning_rate": 4.9922586888965974e-05, + "loss": 3.0471, + "mean_token_accuracy": 0.40000000298023225, + "step": 74555 + }, + { + "epoch": 0.07509762388942438, + "grad_norm": 14.800399641137508, + "learning_rate": 4.99225558290608e-05, + "loss": 2.7155, + "mean_token_accuracy": 0.4068965554237366, + "step": 74560 + }, + { + "epoch": 0.07510265994252856, + "grad_norm": 11.135429433151344, + "learning_rate": 4.992252476293664e-05, + "loss": 2.9798, + "mean_token_accuracy": 0.37931033968925476, + "step": 74565 + }, + { + "epoch": 0.07510769599563273, + "grad_norm": 10.607252001795178, + "learning_rate": 4.99224936905935e-05, + "loss": 2.5171, + "mean_token_accuracy": 0.3896551728248596, + "step": 74570 + }, + { + "epoch": 0.0751127320487369, + "grad_norm": 12.889920770178732, + "learning_rate": 4.99224626120314e-05, + "loss": 2.6022, + "mean_token_accuracy": 0.43448275327682495, + "step": 74575 + }, + { + "epoch": 0.07511776810184108, + "grad_norm": 12.17719043215868, + "learning_rate": 4.992243152725035e-05, + "loss": 2.8148, + "mean_token_accuracy": 0.4034482717514038, + "step": 74580 + }, + { + "epoch": 0.07512280415494525, + "grad_norm": 11.873839557877341, + "learning_rate": 4.992240043625037e-05, + "loss": 2.5221, + "mean_token_accuracy": 0.41034482717514037, + "step": 74585 + }, + { + "epoch": 0.07512784020804943, + "grad_norm": 11.211933986921421, + "learning_rate": 4.992236933903143e-05, + "loss": 2.75, + "mean_token_accuracy": 0.4034482717514038, + "step": 74590 + }, + { + "epoch": 0.0751328762611536, + "grad_norm": 11.632186027497143, + "learning_rate": 4.992233823559357e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.4, + "step": 74595 + }, + { + "epoch": 0.07513791231425777, + "grad_norm": 11.803262561850177, + "learning_rate": 4.99223071259368e-05, + "loss": 3.2136, + "mean_token_accuracy": 0.3344827562570572, + "step": 74600 + }, + { + "epoch": 0.07514294836736195, + "grad_norm": 11.01942354518176, + "learning_rate": 4.992227601006111e-05, + "loss": 2.6837, + "mean_token_accuracy": 0.3862069010734558, + "step": 74605 + }, + { + "epoch": 0.07514798442046612, + "grad_norm": 13.273993822861753, + "learning_rate": 4.992224488796652e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.4379310369491577, + "step": 74610 + }, + { + "epoch": 0.0751530204735703, + "grad_norm": 8.749877709278744, + "learning_rate": 4.992221375965304e-05, + "loss": 2.8436, + "mean_token_accuracy": 0.40490664839744567, + "step": 74615 + }, + { + "epoch": 0.07515805652667447, + "grad_norm": 12.88511731777342, + "learning_rate": 4.9922182625120675e-05, + "loss": 2.8848, + "mean_token_accuracy": 0.35172412991523744, + "step": 74620 + }, + { + "epoch": 0.07516309257977863, + "grad_norm": 12.392676243800622, + "learning_rate": 4.992215148436943e-05, + "loss": 2.6462, + "mean_token_accuracy": 0.4206896543502808, + "step": 74625 + }, + { + "epoch": 0.0751681286328828, + "grad_norm": 11.643080672138181, + "learning_rate": 4.9922120337399314e-05, + "loss": 2.7088, + "mean_token_accuracy": 0.4, + "step": 74630 + }, + { + "epoch": 0.07517316468598698, + "grad_norm": 12.264324180787263, + "learning_rate": 4.9922089184210344e-05, + "loss": 2.67, + "mean_token_accuracy": 0.36551723480224607, + "step": 74635 + }, + { + "epoch": 0.07517820073909115, + "grad_norm": 12.73314834192547, + "learning_rate": 4.992205802480253e-05, + "loss": 2.4767, + "mean_token_accuracy": 0.4379310429096222, + "step": 74640 + }, + { + "epoch": 0.07518323679219532, + "grad_norm": 10.956683053691346, + "learning_rate": 4.992202685917586e-05, + "loss": 2.3922, + "mean_token_accuracy": 0.46551724672317507, + "step": 74645 + }, + { + "epoch": 0.0751882728452995, + "grad_norm": 12.334739672775177, + "learning_rate": 4.992199568733037e-05, + "loss": 3.1054, + "mean_token_accuracy": 0.334482753276825, + "step": 74650 + }, + { + "epoch": 0.07519330889840367, + "grad_norm": 12.158124519690034, + "learning_rate": 4.992196450926605e-05, + "loss": 2.3824, + "mean_token_accuracy": 0.42413793206214906, + "step": 74655 + }, + { + "epoch": 0.07519834495150785, + "grad_norm": 13.63345370381226, + "learning_rate": 4.9921933324982924e-05, + "loss": 2.9769, + "mean_token_accuracy": 0.3529340624809265, + "step": 74660 + }, + { + "epoch": 0.07520338100461202, + "grad_norm": 13.752571481195474, + "learning_rate": 4.992190213448098e-05, + "loss": 2.9366, + "mean_token_accuracy": 0.3896551728248596, + "step": 74665 + }, + { + "epoch": 0.07520841705771619, + "grad_norm": 10.056726483025185, + "learning_rate": 4.992187093776024e-05, + "loss": 2.1112, + "mean_token_accuracy": 0.4986085891723633, + "step": 74670 + }, + { + "epoch": 0.07521345311082037, + "grad_norm": 21.573326588952398, + "learning_rate": 4.992183973482071e-05, + "loss": 2.9975, + "mean_token_accuracy": 0.4034482717514038, + "step": 74675 + }, + { + "epoch": 0.07521848916392454, + "grad_norm": 12.323525491808502, + "learning_rate": 4.992180852566241e-05, + "loss": 2.5283, + "mean_token_accuracy": 0.3931034505367279, + "step": 74680 + }, + { + "epoch": 0.07522352521702871, + "grad_norm": 10.163318492412415, + "learning_rate": 4.9921777310285323e-05, + "loss": 2.213, + "mean_token_accuracy": 0.4551724135875702, + "step": 74685 + }, + { + "epoch": 0.07522856127013289, + "grad_norm": 13.512074268775086, + "learning_rate": 4.992174608868948e-05, + "loss": 2.153, + "mean_token_accuracy": 0.4931034505367279, + "step": 74690 + }, + { + "epoch": 0.07523359732323705, + "grad_norm": 11.169950554489096, + "learning_rate": 4.992171486087488e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.4103448331356049, + "step": 74695 + }, + { + "epoch": 0.07523863337634122, + "grad_norm": 10.07473264576246, + "learning_rate": 4.992168362684153e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.41034482717514037, + "step": 74700 + }, + { + "epoch": 0.0752436694294454, + "grad_norm": 13.218831131423002, + "learning_rate": 4.992165238658945e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.40865094065666197, + "step": 74705 + }, + { + "epoch": 0.07524870548254957, + "grad_norm": 11.366994764467304, + "learning_rate": 4.9921621140118634e-05, + "loss": 2.4869, + "mean_token_accuracy": 0.4448275864124298, + "step": 74710 + }, + { + "epoch": 0.07525374153565374, + "grad_norm": 11.366717384675882, + "learning_rate": 4.9921589887429104e-05, + "loss": 2.7037, + "mean_token_accuracy": 0.36896551847457887, + "step": 74715 + }, + { + "epoch": 0.07525877758875792, + "grad_norm": 13.53990782652659, + "learning_rate": 4.9921558628520854e-05, + "loss": 2.4363, + "mean_token_accuracy": 0.39655172526836396, + "step": 74720 + }, + { + "epoch": 0.07526381364186209, + "grad_norm": 10.544459823573526, + "learning_rate": 4.992152736339391e-05, + "loss": 2.1318, + "mean_token_accuracy": 0.4551724135875702, + "step": 74725 + }, + { + "epoch": 0.07526884969496626, + "grad_norm": 10.648861148027509, + "learning_rate": 4.992149609204827e-05, + "loss": 3.1111, + "mean_token_accuracy": 0.3448275804519653, + "step": 74730 + }, + { + "epoch": 0.07527388574807044, + "grad_norm": 11.409313736938818, + "learning_rate": 4.9921464814483946e-05, + "loss": 2.5184, + "mean_token_accuracy": 0.36551723778247835, + "step": 74735 + }, + { + "epoch": 0.07527892180117461, + "grad_norm": 10.7091061616946, + "learning_rate": 4.992143353070095e-05, + "loss": 2.2665, + "mean_token_accuracy": 0.458620685338974, + "step": 74740 + }, + { + "epoch": 0.07528395785427879, + "grad_norm": 11.173008504531076, + "learning_rate": 4.9921402240699264e-05, + "loss": 2.7655, + "mean_token_accuracy": 0.3620689630508423, + "step": 74745 + }, + { + "epoch": 0.07528899390738296, + "grad_norm": 12.409134088851763, + "learning_rate": 4.992137094447894e-05, + "loss": 2.4964, + "mean_token_accuracy": 0.39655172228813174, + "step": 74750 + }, + { + "epoch": 0.07529402996048713, + "grad_norm": 12.713230281053635, + "learning_rate": 4.992133964203995e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.4172413766384125, + "step": 74755 + }, + { + "epoch": 0.0752990660135913, + "grad_norm": 12.23417559377399, + "learning_rate": 4.992130833338233e-05, + "loss": 2.7398, + "mean_token_accuracy": 0.4021173596382141, + "step": 74760 + }, + { + "epoch": 0.07530410206669547, + "grad_norm": 11.117608986130428, + "learning_rate": 4.992127701850607e-05, + "loss": 2.4732, + "mean_token_accuracy": 0.4068965494632721, + "step": 74765 + }, + { + "epoch": 0.07530913811979964, + "grad_norm": 12.623774228153135, + "learning_rate": 4.9921245697411186e-05, + "loss": 2.8768, + "mean_token_accuracy": 0.3999999940395355, + "step": 74770 + }, + { + "epoch": 0.07531417417290381, + "grad_norm": 13.707661883557265, + "learning_rate": 4.992121437009769e-05, + "loss": 2.366, + "mean_token_accuracy": 0.42758620977401735, + "step": 74775 + }, + { + "epoch": 0.07531921022600799, + "grad_norm": 13.63212038269677, + "learning_rate": 4.992118303656558e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.3862069010734558, + "step": 74780 + }, + { + "epoch": 0.07532424627911216, + "grad_norm": 9.674901258760745, + "learning_rate": 4.992115169681487e-05, + "loss": 2.5405, + "mean_token_accuracy": 0.4256503343582153, + "step": 74785 + }, + { + "epoch": 0.07532928233221634, + "grad_norm": 9.204550764866626, + "learning_rate": 4.992112035084558e-05, + "loss": 2.1137, + "mean_token_accuracy": 0.4730187654495239, + "step": 74790 + }, + { + "epoch": 0.07533431838532051, + "grad_norm": 9.413424401161253, + "learning_rate": 4.99210889986577e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.4551724076271057, + "step": 74795 + }, + { + "epoch": 0.07533935443842468, + "grad_norm": 12.444701421944044, + "learning_rate": 4.9921057640251254e-05, + "loss": 2.1964, + "mean_token_accuracy": 0.474168187379837, + "step": 74800 + }, + { + "epoch": 0.07534439049152886, + "grad_norm": 13.93003237255313, + "learning_rate": 4.992102627562624e-05, + "loss": 2.9358, + "mean_token_accuracy": 0.3896551728248596, + "step": 74805 + }, + { + "epoch": 0.07534942654463303, + "grad_norm": 12.873166648127915, + "learning_rate": 4.9920994904782666e-05, + "loss": 2.6602, + "mean_token_accuracy": 0.3896551728248596, + "step": 74810 + }, + { + "epoch": 0.0753544625977372, + "grad_norm": 12.016921192262771, + "learning_rate": 4.992096352772055e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4551724076271057, + "step": 74815 + }, + { + "epoch": 0.07535949865084138, + "grad_norm": 11.067540092049498, + "learning_rate": 4.99209321444399e-05, + "loss": 2.7399, + "mean_token_accuracy": 0.3896551728248596, + "step": 74820 + }, + { + "epoch": 0.07536453470394555, + "grad_norm": 9.775524552778675, + "learning_rate": 4.992090075494071e-05, + "loss": 2.6802, + "mean_token_accuracy": 0.34482758641242983, + "step": 74825 + }, + { + "epoch": 0.07536957075704973, + "grad_norm": 10.397829904370594, + "learning_rate": 4.992086935922301e-05, + "loss": 2.5004, + "mean_token_accuracy": 0.4413793206214905, + "step": 74830 + }, + { + "epoch": 0.07537460681015389, + "grad_norm": 10.15349748766837, + "learning_rate": 4.99208379572868e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.43103448748588563, + "step": 74835 + }, + { + "epoch": 0.07537964286325806, + "grad_norm": 8.352377690529696, + "learning_rate": 4.9920806549132084e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.43793103098869324, + "step": 74840 + }, + { + "epoch": 0.07538467891636223, + "grad_norm": 10.558177066840473, + "learning_rate": 4.9920775134758866e-05, + "loss": 2.7048, + "mean_token_accuracy": 0.3907440960407257, + "step": 74845 + }, + { + "epoch": 0.0753897149694664, + "grad_norm": 11.024318798441008, + "learning_rate": 4.9920743714167165e-05, + "loss": 2.4186, + "mean_token_accuracy": 0.3965517282485962, + "step": 74850 + }, + { + "epoch": 0.07539475102257058, + "grad_norm": 13.901048938707923, + "learning_rate": 4.9920712287356995e-05, + "loss": 2.6477, + "mean_token_accuracy": 0.3931034505367279, + "step": 74855 + }, + { + "epoch": 0.07539978707567475, + "grad_norm": 10.070421997084713, + "learning_rate": 4.992068085432836e-05, + "loss": 2.5728, + "mean_token_accuracy": 0.35862069129943847, + "step": 74860 + }, + { + "epoch": 0.07540482312877893, + "grad_norm": 10.291092774816319, + "learning_rate": 4.992064941508125e-05, + "loss": 3.0999, + "mean_token_accuracy": 0.34482758939266206, + "step": 74865 + }, + { + "epoch": 0.0754098591818831, + "grad_norm": 12.20101099803728, + "learning_rate": 4.9920617969615695e-05, + "loss": 2.5461, + "mean_token_accuracy": 0.4034482777118683, + "step": 74870 + }, + { + "epoch": 0.07541489523498728, + "grad_norm": 10.826455358059864, + "learning_rate": 4.99205865179317e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.47586206793785096, + "step": 74875 + }, + { + "epoch": 0.07541993128809145, + "grad_norm": 13.705391209273914, + "learning_rate": 4.992055506002928e-05, + "loss": 2.3439, + "mean_token_accuracy": 0.47126436829566953, + "step": 74880 + }, + { + "epoch": 0.07542496734119562, + "grad_norm": 11.89151438287245, + "learning_rate": 4.992052359590842e-05, + "loss": 2.5816, + "mean_token_accuracy": 0.4068965554237366, + "step": 74885 + }, + { + "epoch": 0.0754300033942998, + "grad_norm": 10.02118133757943, + "learning_rate": 4.9920492125569154e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4050211668014526, + "step": 74890 + }, + { + "epoch": 0.07543503944740397, + "grad_norm": 10.72219849342627, + "learning_rate": 4.992046064901147e-05, + "loss": 2.333, + "mean_token_accuracy": 0.3965517282485962, + "step": 74895 + }, + { + "epoch": 0.07544007550050814, + "grad_norm": 10.7963269405344, + "learning_rate": 4.9920429166235396e-05, + "loss": 2.6162, + "mean_token_accuracy": 0.4034482717514038, + "step": 74900 + }, + { + "epoch": 0.0754451115536123, + "grad_norm": 11.740382548584366, + "learning_rate": 4.992039767724094e-05, + "loss": 2.4468, + "mean_token_accuracy": 0.4103448331356049, + "step": 74905 + }, + { + "epoch": 0.07545014760671648, + "grad_norm": 13.801400719597702, + "learning_rate": 4.992036618202809e-05, + "loss": 2.7828, + "mean_token_accuracy": 0.33932244777679443, + "step": 74910 + }, + { + "epoch": 0.07545518365982065, + "grad_norm": 11.367273892067944, + "learning_rate": 4.992033468059687e-05, + "loss": 2.152, + "mean_token_accuracy": 0.458620685338974, + "step": 74915 + }, + { + "epoch": 0.07546021971292483, + "grad_norm": 14.990574499188641, + "learning_rate": 4.9920303172947294e-05, + "loss": 2.8169, + "mean_token_accuracy": 0.38275861740112305, + "step": 74920 + }, + { + "epoch": 0.075465255766029, + "grad_norm": 10.951458740752074, + "learning_rate": 4.9920271659079356e-05, + "loss": 2.8005, + "mean_token_accuracy": 0.42758620381355283, + "step": 74925 + }, + { + "epoch": 0.07547029181913317, + "grad_norm": 11.804120355851495, + "learning_rate": 4.992024013899308e-05, + "loss": 2.6265, + "mean_token_accuracy": 0.3425892323255539, + "step": 74930 + }, + { + "epoch": 0.07547532787223735, + "grad_norm": 10.666261833024391, + "learning_rate": 4.992020861268846e-05, + "loss": 2.8334, + "mean_token_accuracy": 0.3793103486299515, + "step": 74935 + }, + { + "epoch": 0.07548036392534152, + "grad_norm": 15.16713252266937, + "learning_rate": 4.992017708016551e-05, + "loss": 2.717, + "mean_token_accuracy": 0.3965517282485962, + "step": 74940 + }, + { + "epoch": 0.0754853999784457, + "grad_norm": 11.39411820527837, + "learning_rate": 4.992014554142424e-05, + "loss": 2.8957, + "mean_token_accuracy": 0.32758620381355286, + "step": 74945 + }, + { + "epoch": 0.07549043603154987, + "grad_norm": 10.781770197595755, + "learning_rate": 4.992011399646467e-05, + "loss": 2.36, + "mean_token_accuracy": 0.3965517163276672, + "step": 74950 + }, + { + "epoch": 0.07549547208465404, + "grad_norm": 10.76762410311463, + "learning_rate": 4.9920082445286786e-05, + "loss": 2.6586, + "mean_token_accuracy": 0.3999999940395355, + "step": 74955 + }, + { + "epoch": 0.07550050813775822, + "grad_norm": 26.165188428211046, + "learning_rate": 4.9920050887890605e-05, + "loss": 2.7767, + "mean_token_accuracy": 0.3862069010734558, + "step": 74960 + }, + { + "epoch": 0.07550554419086239, + "grad_norm": 12.313893745380385, + "learning_rate": 4.992001932427615e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.441379314661026, + "step": 74965 + }, + { + "epoch": 0.07551058024396656, + "grad_norm": 12.885221729148988, + "learning_rate": 4.991998775444342e-05, + "loss": 2.7019, + "mean_token_accuracy": 0.36551724672317504, + "step": 74970 + }, + { + "epoch": 0.07551561629707072, + "grad_norm": 15.883124242299214, + "learning_rate": 4.991995617839241e-05, + "loss": 3.2918, + "mean_token_accuracy": 0.33103448450565337, + "step": 74975 + }, + { + "epoch": 0.0755206523501749, + "grad_norm": 9.868879961174354, + "learning_rate": 4.9919924596123155e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.4137930989265442, + "step": 74980 + }, + { + "epoch": 0.07552568840327907, + "grad_norm": 12.040100280413919, + "learning_rate": 4.991989300763565e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.4586206912994385, + "step": 74985 + }, + { + "epoch": 0.07553072445638324, + "grad_norm": 11.377887154298785, + "learning_rate": 4.99198614129299e-05, + "loss": 2.7991, + "mean_token_accuracy": 0.3758620649576187, + "step": 74990 + }, + { + "epoch": 0.07553576050948742, + "grad_norm": 13.449003657266344, + "learning_rate": 4.991982981200591e-05, + "loss": 2.07, + "mean_token_accuracy": 0.4413793087005615, + "step": 74995 + }, + { + "epoch": 0.07554079656259159, + "grad_norm": 11.686053735967564, + "learning_rate": 4.9919798204863704e-05, + "loss": 2.6353, + "mean_token_accuracy": 0.3931034505367279, + "step": 75000 + }, + { + "epoch": 0.07554583261569577, + "grad_norm": 10.587006491035401, + "learning_rate": 4.991976659150328e-05, + "loss": 2.7143, + "mean_token_accuracy": 0.358620685338974, + "step": 75005 + }, + { + "epoch": 0.07555086866879994, + "grad_norm": 12.384930857089538, + "learning_rate": 4.991973497192466e-05, + "loss": 2.518, + "mean_token_accuracy": 0.39655172228813174, + "step": 75010 + }, + { + "epoch": 0.07555590472190411, + "grad_norm": 10.653591346609966, + "learning_rate": 4.9919703346127834e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.4068965494632721, + "step": 75015 + }, + { + "epoch": 0.07556094077500829, + "grad_norm": 9.406840835700065, + "learning_rate": 4.991967171411283e-05, + "loss": 2.149, + "mean_token_accuracy": 0.48275861144065857, + "step": 75020 + }, + { + "epoch": 0.07556597682811246, + "grad_norm": 10.567768380908667, + "learning_rate": 4.991964007587963e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4184729039669037, + "step": 75025 + }, + { + "epoch": 0.07557101288121663, + "grad_norm": 14.564808823177678, + "learning_rate": 4.991960843142827e-05, + "loss": 2.492, + "mean_token_accuracy": 0.3793103456497192, + "step": 75030 + }, + { + "epoch": 0.07557604893432081, + "grad_norm": 11.328341539931163, + "learning_rate": 4.991957678075874e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.39310344457626345, + "step": 75035 + }, + { + "epoch": 0.07558108498742498, + "grad_norm": 12.783123250394299, + "learning_rate": 4.9919545123871066e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.41724138259887694, + "step": 75040 + }, + { + "epoch": 0.07558612104052914, + "grad_norm": 12.207032229240859, + "learning_rate": 4.991951346076524e-05, + "loss": 2.5307, + "mean_token_accuracy": 0.37241379022598264, + "step": 75045 + }, + { + "epoch": 0.07559115709363332, + "grad_norm": 10.518209511430392, + "learning_rate": 4.9919481791441284e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.37586206793785093, + "step": 75050 + }, + { + "epoch": 0.07559619314673749, + "grad_norm": 10.159564609341814, + "learning_rate": 4.99194501158992e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.4034482717514038, + "step": 75055 + }, + { + "epoch": 0.07560122919984166, + "grad_norm": 11.029800643270065, + "learning_rate": 4.991941843413899e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.4413793087005615, + "step": 75060 + }, + { + "epoch": 0.07560626525294584, + "grad_norm": 10.07005251684402, + "learning_rate": 4.991938674616068e-05, + "loss": 2.519, + "mean_token_accuracy": 0.38275861740112305, + "step": 75065 + }, + { + "epoch": 0.07561130130605001, + "grad_norm": 8.983026109880992, + "learning_rate": 4.9919355051964264e-05, + "loss": 2.3491, + "mean_token_accuracy": 0.45311554074287413, + "step": 75070 + }, + { + "epoch": 0.07561633735915418, + "grad_norm": 11.384320478722957, + "learning_rate": 4.991932335154976e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.4, + "step": 75075 + }, + { + "epoch": 0.07562137341225836, + "grad_norm": 10.983822068273314, + "learning_rate": 4.991929164491717e-05, + "loss": 2.7304, + "mean_token_accuracy": 0.4137930989265442, + "step": 75080 + }, + { + "epoch": 0.07562640946536253, + "grad_norm": 12.479218082718674, + "learning_rate": 4.991925993206651e-05, + "loss": 2.2225, + "mean_token_accuracy": 0.44827585816383364, + "step": 75085 + }, + { + "epoch": 0.0756314455184667, + "grad_norm": 17.26288710670105, + "learning_rate": 4.991922821299778e-05, + "loss": 2.7226, + "mean_token_accuracy": 0.4011494338512421, + "step": 75090 + }, + { + "epoch": 0.07563648157157088, + "grad_norm": 10.530593769733136, + "learning_rate": 4.9919196487711e-05, + "loss": 2.4504, + "mean_token_accuracy": 0.44482759237289426, + "step": 75095 + }, + { + "epoch": 0.07564151762467505, + "grad_norm": 14.00326368564235, + "learning_rate": 4.991916475620617e-05, + "loss": 3.0926, + "mean_token_accuracy": 0.3241379290819168, + "step": 75100 + }, + { + "epoch": 0.07564655367777923, + "grad_norm": 9.753542809414052, + "learning_rate": 4.9919133018483294e-05, + "loss": 2.1342, + "mean_token_accuracy": 0.46551724672317507, + "step": 75105 + }, + { + "epoch": 0.0756515897308834, + "grad_norm": 9.843300597729117, + "learning_rate": 4.9919101274542396e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.4724137902259827, + "step": 75110 + }, + { + "epoch": 0.07565662578398756, + "grad_norm": 8.555182944116257, + "learning_rate": 4.991906952438347e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.4517241418361664, + "step": 75115 + }, + { + "epoch": 0.07566166183709173, + "grad_norm": 10.986875355918743, + "learning_rate": 4.991903776800653e-05, + "loss": 2.2199, + "mean_token_accuracy": 0.42068964838981626, + "step": 75120 + }, + { + "epoch": 0.07566669789019591, + "grad_norm": 10.485706803715818, + "learning_rate": 4.99190060054116e-05, + "loss": 2.6549, + "mean_token_accuracy": 0.3551724135875702, + "step": 75125 + }, + { + "epoch": 0.07567173394330008, + "grad_norm": 9.47754968698919, + "learning_rate": 4.991897423659867e-05, + "loss": 2.666, + "mean_token_accuracy": 0.39716748893260956, + "step": 75130 + }, + { + "epoch": 0.07567676999640426, + "grad_norm": 9.510873780546204, + "learning_rate": 4.9918942461567744e-05, + "loss": 2.2567, + "mean_token_accuracy": 0.4517241358757019, + "step": 75135 + }, + { + "epoch": 0.07568180604950843, + "grad_norm": 10.237405787222407, + "learning_rate": 4.991891068031884e-05, + "loss": 2.5903, + "mean_token_accuracy": 0.37241379618644715, + "step": 75140 + }, + { + "epoch": 0.0756868421026126, + "grad_norm": 13.129441059355418, + "learning_rate": 4.991887889285198e-05, + "loss": 2.8956, + "mean_token_accuracy": 0.38275861740112305, + "step": 75145 + }, + { + "epoch": 0.07569187815571678, + "grad_norm": 9.53919184752354, + "learning_rate": 4.991884709916716e-05, + "loss": 2.6506, + "mean_token_accuracy": 0.43103448748588563, + "step": 75150 + }, + { + "epoch": 0.07569691420882095, + "grad_norm": 10.903487703731537, + "learning_rate": 4.991881529926438e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.41379310488700866, + "step": 75155 + }, + { + "epoch": 0.07570195026192512, + "grad_norm": 11.465358873248912, + "learning_rate": 4.991878349314365e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.4, + "step": 75160 + }, + { + "epoch": 0.0757069863150293, + "grad_norm": 13.629843458820442, + "learning_rate": 4.9918751680805e-05, + "loss": 2.7878, + "mean_token_accuracy": 0.3551724195480347, + "step": 75165 + }, + { + "epoch": 0.07571202236813347, + "grad_norm": 12.560534730204415, + "learning_rate": 4.9918719862248436e-05, + "loss": 2.6814, + "mean_token_accuracy": 0.39310344457626345, + "step": 75170 + }, + { + "epoch": 0.07571705842123765, + "grad_norm": 12.18031538631984, + "learning_rate": 4.991868803747394e-05, + "loss": 2.6904, + "mean_token_accuracy": 0.36206896901130675, + "step": 75175 + }, + { + "epoch": 0.07572209447434182, + "grad_norm": 10.788068454084915, + "learning_rate": 4.991865620648154e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.4448275864124298, + "step": 75180 + }, + { + "epoch": 0.07572713052744598, + "grad_norm": 14.966159818821014, + "learning_rate": 4.991862436927124e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.40816696882247927, + "step": 75185 + }, + { + "epoch": 0.07573216658055015, + "grad_norm": 13.041823685079718, + "learning_rate": 4.991859252584306e-05, + "loss": 2.6527, + "mean_token_accuracy": 0.41911675930023196, + "step": 75190 + }, + { + "epoch": 0.07573720263365433, + "grad_norm": 12.014922361026857, + "learning_rate": 4.9918560676197e-05, + "loss": 2.7402, + "mean_token_accuracy": 0.33793103098869326, + "step": 75195 + }, + { + "epoch": 0.0757422386867585, + "grad_norm": 11.441267165296328, + "learning_rate": 4.991852882033306e-05, + "loss": 2.2222, + "mean_token_accuracy": 0.4122807025909424, + "step": 75200 + }, + { + "epoch": 0.07574727473986267, + "grad_norm": 11.42361996016671, + "learning_rate": 4.9918496958251255e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.4448275834321976, + "step": 75205 + }, + { + "epoch": 0.07575231079296685, + "grad_norm": 9.409352254791383, + "learning_rate": 4.99184650899516e-05, + "loss": 2.5803, + "mean_token_accuracy": 0.4068965554237366, + "step": 75210 + }, + { + "epoch": 0.07575734684607102, + "grad_norm": 11.569302664848456, + "learning_rate": 4.9918433215434106e-05, + "loss": 2.8479, + "mean_token_accuracy": 0.3655172407627106, + "step": 75215 + }, + { + "epoch": 0.0757623828991752, + "grad_norm": 17.280672085888906, + "learning_rate": 4.991840133469877e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.3793103456497192, + "step": 75220 + }, + { + "epoch": 0.07576741895227937, + "grad_norm": 10.896118151161081, + "learning_rate": 4.991836944774561e-05, + "loss": 2.9679, + "mean_token_accuracy": 0.3620689630508423, + "step": 75225 + }, + { + "epoch": 0.07577245500538354, + "grad_norm": 13.55580319769656, + "learning_rate": 4.991833755457463e-05, + "loss": 2.0373, + "mean_token_accuracy": 0.44827587008476255, + "step": 75230 + }, + { + "epoch": 0.07577749105848772, + "grad_norm": 12.551735767135769, + "learning_rate": 4.991830565518584e-05, + "loss": 2.6958, + "mean_token_accuracy": 0.37241379022598264, + "step": 75235 + }, + { + "epoch": 0.07578252711159189, + "grad_norm": 12.31565082725626, + "learning_rate": 4.9918273749579244e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.41185722351074217, + "step": 75240 + }, + { + "epoch": 0.07578756316469606, + "grad_norm": 10.348166999400974, + "learning_rate": 4.9918241837754865e-05, + "loss": 2.5623, + "mean_token_accuracy": 0.41724138259887694, + "step": 75245 + }, + { + "epoch": 0.07579259921780024, + "grad_norm": 12.465487218834705, + "learning_rate": 4.99182099197127e-05, + "loss": 2.3039, + "mean_token_accuracy": 0.4034482717514038, + "step": 75250 + }, + { + "epoch": 0.0757976352709044, + "grad_norm": 11.270258523822815, + "learning_rate": 4.991817799545277e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.44319419264793397, + "step": 75255 + }, + { + "epoch": 0.07580267132400857, + "grad_norm": 14.004005560534106, + "learning_rate": 4.9918146064975055e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.4620689690113068, + "step": 75260 + }, + { + "epoch": 0.07580770737711275, + "grad_norm": 11.713717500858031, + "learning_rate": 4.99181141282796e-05, + "loss": 2.558, + "mean_token_accuracy": 0.3517241358757019, + "step": 75265 + }, + { + "epoch": 0.07581274343021692, + "grad_norm": 12.189310493823035, + "learning_rate": 4.991808218536639e-05, + "loss": 2.2257, + "mean_token_accuracy": 0.41379310488700866, + "step": 75270 + }, + { + "epoch": 0.0758177794833211, + "grad_norm": 20.639764066035305, + "learning_rate": 4.991805023623544e-05, + "loss": 2.7185, + "mean_token_accuracy": 0.41034482717514037, + "step": 75275 + }, + { + "epoch": 0.07582281553642527, + "grad_norm": 10.784292056196412, + "learning_rate": 4.991801828088676e-05, + "loss": 2.5384, + "mean_token_accuracy": 0.42758620977401735, + "step": 75280 + }, + { + "epoch": 0.07582785158952944, + "grad_norm": 11.041400958105278, + "learning_rate": 4.991798631932036e-05, + "loss": 3.1766, + "mean_token_accuracy": 0.3241379201412201, + "step": 75285 + }, + { + "epoch": 0.07583288764263361, + "grad_norm": 11.957961793293695, + "learning_rate": 4.991795435153626e-05, + "loss": 3.0443, + "mean_token_accuracy": 0.3551724076271057, + "step": 75290 + }, + { + "epoch": 0.07583792369573779, + "grad_norm": 11.03728810755462, + "learning_rate": 4.9917922377534435e-05, + "loss": 1.9332, + "mean_token_accuracy": 0.5034482717514038, + "step": 75295 + }, + { + "epoch": 0.07584295974884196, + "grad_norm": 7.7894521237925, + "learning_rate": 4.991789039731493e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.4809437394142151, + "step": 75300 + }, + { + "epoch": 0.07584799580194614, + "grad_norm": 9.925731783100467, + "learning_rate": 4.9917858410877736e-05, + "loss": 2.4541, + "mean_token_accuracy": 0.42413793206214906, + "step": 75305 + }, + { + "epoch": 0.07585303185505031, + "grad_norm": 12.186828387113746, + "learning_rate": 4.9917826418222866e-05, + "loss": 1.9564, + "mean_token_accuracy": 0.4793103516101837, + "step": 75310 + }, + { + "epoch": 0.07585806790815448, + "grad_norm": 9.130023978551153, + "learning_rate": 4.9917794419350334e-05, + "loss": 2.1532, + "mean_token_accuracy": 0.4862069010734558, + "step": 75315 + }, + { + "epoch": 0.07586310396125866, + "grad_norm": 10.719552968103416, + "learning_rate": 4.991776241426013e-05, + "loss": 2.6554, + "mean_token_accuracy": 0.3862068891525269, + "step": 75320 + }, + { + "epoch": 0.07586814001436282, + "grad_norm": 11.588590398022935, + "learning_rate": 4.991773040295228e-05, + "loss": 2.801, + "mean_token_accuracy": 0.37931033968925476, + "step": 75325 + }, + { + "epoch": 0.07587317606746699, + "grad_norm": 10.377063041000913, + "learning_rate": 4.99176983854268e-05, + "loss": 3.0175, + "mean_token_accuracy": 0.32413792610168457, + "step": 75330 + }, + { + "epoch": 0.07587821212057116, + "grad_norm": 11.329290877943746, + "learning_rate": 4.991766636168368e-05, + "loss": 2.7734, + "mean_token_accuracy": 0.4068965554237366, + "step": 75335 + }, + { + "epoch": 0.07588324817367534, + "grad_norm": 10.184275511932269, + "learning_rate": 4.991763433172293e-05, + "loss": 2.6586, + "mean_token_accuracy": 0.3655172407627106, + "step": 75340 + }, + { + "epoch": 0.07588828422677951, + "grad_norm": 11.522456523744934, + "learning_rate": 4.991760229554457e-05, + "loss": 2.3303, + "mean_token_accuracy": 0.42413793206214906, + "step": 75345 + }, + { + "epoch": 0.07589332027988369, + "grad_norm": 10.294440350523576, + "learning_rate": 4.991757025314861e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.3551724135875702, + "step": 75350 + }, + { + "epoch": 0.07589835633298786, + "grad_norm": 12.98451454093618, + "learning_rate": 4.9917538204535046e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.35517241060733795, + "step": 75355 + }, + { + "epoch": 0.07590339238609203, + "grad_norm": 9.376752104112684, + "learning_rate": 4.99175061497039e-05, + "loss": 1.9258, + "mean_token_accuracy": 0.5241379261016845, + "step": 75360 + }, + { + "epoch": 0.07590842843919621, + "grad_norm": 10.175879366119066, + "learning_rate": 4.991747408865517e-05, + "loss": 2.341, + "mean_token_accuracy": 0.4567453145980835, + "step": 75365 + }, + { + "epoch": 0.07591346449230038, + "grad_norm": 14.439015036731913, + "learning_rate": 4.991744202138887e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.4379310369491577, + "step": 75370 + }, + { + "epoch": 0.07591850054540455, + "grad_norm": 11.081259550873272, + "learning_rate": 4.991740994790501e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.3931034505367279, + "step": 75375 + }, + { + "epoch": 0.07592353659850873, + "grad_norm": 13.832344843354749, + "learning_rate": 4.991737786820361e-05, + "loss": 2.7176, + "mean_token_accuracy": 0.4, + "step": 75380 + }, + { + "epoch": 0.0759285726516129, + "grad_norm": 13.03687956768484, + "learning_rate": 4.991734578228465e-05, + "loss": 2.3686, + "mean_token_accuracy": 0.4896551728248596, + "step": 75385 + }, + { + "epoch": 0.07593360870471708, + "grad_norm": 11.489477238906288, + "learning_rate": 4.991731369014816e-05, + "loss": 2.1997, + "mean_token_accuracy": 0.44482757449150084, + "step": 75390 + }, + { + "epoch": 0.07593864475782124, + "grad_norm": 11.3353927141494, + "learning_rate": 4.991728159179414e-05, + "loss": 2.6566, + "mean_token_accuracy": 0.3793103456497192, + "step": 75395 + }, + { + "epoch": 0.07594368081092541, + "grad_norm": 9.45146214503811, + "learning_rate": 4.991724948722261e-05, + "loss": 2.2825, + "mean_token_accuracy": 0.4172413766384125, + "step": 75400 + }, + { + "epoch": 0.07594871686402958, + "grad_norm": 11.096286193814361, + "learning_rate": 4.9917217376433575e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.37586206793785093, + "step": 75405 + }, + { + "epoch": 0.07595375291713376, + "grad_norm": 10.350356556885208, + "learning_rate": 4.991718525942704e-05, + "loss": 2.9124, + "mean_token_accuracy": 0.3931034505367279, + "step": 75410 + }, + { + "epoch": 0.07595878897023793, + "grad_norm": 12.469181930014196, + "learning_rate": 4.991715313620301e-05, + "loss": 2.5351, + "mean_token_accuracy": 0.4, + "step": 75415 + }, + { + "epoch": 0.0759638250233421, + "grad_norm": 10.705802817729872, + "learning_rate": 4.99171210067615e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.42413792610168455, + "step": 75420 + }, + { + "epoch": 0.07596886107644628, + "grad_norm": 10.836660424684972, + "learning_rate": 4.991708887110252e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.34827586114406583, + "step": 75425 + }, + { + "epoch": 0.07597389712955045, + "grad_norm": 12.088037949854911, + "learning_rate": 4.991705672922608e-05, + "loss": 2.4011, + "mean_token_accuracy": 0.43793103098869324, + "step": 75430 + }, + { + "epoch": 0.07597893318265463, + "grad_norm": 10.52323614940045, + "learning_rate": 4.991702458113218e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.38965516686439516, + "step": 75435 + }, + { + "epoch": 0.0759839692357588, + "grad_norm": 10.062417117715869, + "learning_rate": 4.991699242682084e-05, + "loss": 2.2873, + "mean_token_accuracy": 0.4517241358757019, + "step": 75440 + }, + { + "epoch": 0.07598900528886297, + "grad_norm": 10.416310908719684, + "learning_rate": 4.9916960266292055e-05, + "loss": 2.4658, + "mean_token_accuracy": 0.39655172228813174, + "step": 75445 + }, + { + "epoch": 0.07599404134196715, + "grad_norm": 11.000369668934095, + "learning_rate": 4.9916928099545846e-05, + "loss": 2.4586, + "mean_token_accuracy": 0.4413793087005615, + "step": 75450 + }, + { + "epoch": 0.07599907739507132, + "grad_norm": 11.457982912788287, + "learning_rate": 4.991689592658222e-05, + "loss": 2.442, + "mean_token_accuracy": 0.4172413766384125, + "step": 75455 + }, + { + "epoch": 0.0760041134481755, + "grad_norm": 12.307633852153732, + "learning_rate": 4.991686374740118e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.4034482777118683, + "step": 75460 + }, + { + "epoch": 0.07600914950127965, + "grad_norm": 10.954265543886638, + "learning_rate": 4.991683156200275e-05, + "loss": 1.9785, + "mean_token_accuracy": 0.5206896543502808, + "step": 75465 + }, + { + "epoch": 0.07601418555438383, + "grad_norm": 12.20215608218267, + "learning_rate": 4.9916799370386915e-05, + "loss": 2.4452, + "mean_token_accuracy": 0.42758620381355283, + "step": 75470 + }, + { + "epoch": 0.076019221607488, + "grad_norm": 9.70822357113351, + "learning_rate": 4.991676717255371e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.38275861740112305, + "step": 75475 + }, + { + "epoch": 0.07602425766059218, + "grad_norm": 16.52616441978274, + "learning_rate": 4.991673496850312e-05, + "loss": 2.6698, + "mean_token_accuracy": 0.36551723778247835, + "step": 75480 + }, + { + "epoch": 0.07602929371369635, + "grad_norm": 10.132111930425728, + "learning_rate": 4.9916702758235165e-05, + "loss": 2.826, + "mean_token_accuracy": 0.33103448152542114, + "step": 75485 + }, + { + "epoch": 0.07603432976680052, + "grad_norm": 11.558014058055821, + "learning_rate": 4.991667054174985e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.4068965554237366, + "step": 75490 + }, + { + "epoch": 0.0760393658199047, + "grad_norm": 13.188818462888594, + "learning_rate": 4.9916638319047204e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.3655172407627106, + "step": 75495 + }, + { + "epoch": 0.07604440187300887, + "grad_norm": 15.504357527271171, + "learning_rate": 4.991660609012721e-05, + "loss": 2.7195, + "mean_token_accuracy": 0.3896551728248596, + "step": 75500 + }, + { + "epoch": 0.07604943792611304, + "grad_norm": 12.890457659504058, + "learning_rate": 4.991657385498989e-05, + "loss": 3.0247, + "mean_token_accuracy": 0.3517241358757019, + "step": 75505 + }, + { + "epoch": 0.07605447397921722, + "grad_norm": 10.865775989051082, + "learning_rate": 4.9916541613635256e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.4766009867191315, + "step": 75510 + }, + { + "epoch": 0.07605951003232139, + "grad_norm": 11.447554423078827, + "learning_rate": 4.99165093660633e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.40211735367774964, + "step": 75515 + }, + { + "epoch": 0.07606454608542557, + "grad_norm": 12.192775550775172, + "learning_rate": 4.991647711227404e-05, + "loss": 2.2874, + "mean_token_accuracy": 0.4551724135875702, + "step": 75520 + }, + { + "epoch": 0.07606958213852974, + "grad_norm": 11.445999369619152, + "learning_rate": 4.991644485226749e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.4482758641242981, + "step": 75525 + }, + { + "epoch": 0.07607461819163391, + "grad_norm": 10.741578388438642, + "learning_rate": 4.991641258604365e-05, + "loss": 2.549, + "mean_token_accuracy": 0.41034482717514037, + "step": 75530 + }, + { + "epoch": 0.07607965424473807, + "grad_norm": 12.080167817450432, + "learning_rate": 4.991638031360254e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.3896551728248596, + "step": 75535 + }, + { + "epoch": 0.07608469029784225, + "grad_norm": 13.905967665457455, + "learning_rate": 4.991634803494417e-05, + "loss": 2.5429, + "mean_token_accuracy": 0.4034482777118683, + "step": 75540 + }, + { + "epoch": 0.07608972635094642, + "grad_norm": 9.955735813353055, + "learning_rate": 4.9916315750068535e-05, + "loss": 2.6584, + "mean_token_accuracy": 0.4034482777118683, + "step": 75545 + }, + { + "epoch": 0.0760947624040506, + "grad_norm": 11.70529111528291, + "learning_rate": 4.991628345897565e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.3758620649576187, + "step": 75550 + }, + { + "epoch": 0.07609979845715477, + "grad_norm": 17.690014833130675, + "learning_rate": 4.991625116166553e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.4103448331356049, + "step": 75555 + }, + { + "epoch": 0.07610483451025894, + "grad_norm": 11.515322335558984, + "learning_rate": 4.991621885813818e-05, + "loss": 2.9642, + "mean_token_accuracy": 0.38965517580509185, + "step": 75560 + }, + { + "epoch": 0.07610987056336312, + "grad_norm": 12.495993637539948, + "learning_rate": 4.9916186548393596e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.35862069129943847, + "step": 75565 + }, + { + "epoch": 0.07611490661646729, + "grad_norm": 14.325867460775958, + "learning_rate": 4.9916154232431806e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.4620689570903778, + "step": 75570 + }, + { + "epoch": 0.07611994266957146, + "grad_norm": 11.182823668119719, + "learning_rate": 4.9916121910252815e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.4551724076271057, + "step": 75575 + }, + { + "epoch": 0.07612497872267564, + "grad_norm": 10.766892936911312, + "learning_rate": 4.991608958185662e-05, + "loss": 2.0896, + "mean_token_accuracy": 0.44640048742294314, + "step": 75580 + }, + { + "epoch": 0.07613001477577981, + "grad_norm": 11.20327310654806, + "learning_rate": 4.9916057247243256e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.44482759237289426, + "step": 75585 + }, + { + "epoch": 0.07613505082888399, + "grad_norm": 16.05615146700863, + "learning_rate": 4.991602490641271e-05, + "loss": 2.6344, + "mean_token_accuracy": 0.4034482777118683, + "step": 75590 + }, + { + "epoch": 0.07614008688198816, + "grad_norm": 11.014872425188628, + "learning_rate": 4.9915992559364986e-05, + "loss": 2.4541, + "mean_token_accuracy": 0.4620689630508423, + "step": 75595 + }, + { + "epoch": 0.07614512293509233, + "grad_norm": 11.084625853288069, + "learning_rate": 4.99159602061001e-05, + "loss": 2.2378, + "mean_token_accuracy": 0.4551724076271057, + "step": 75600 + }, + { + "epoch": 0.07615015898819649, + "grad_norm": 12.029958990516516, + "learning_rate": 4.991592784661807e-05, + "loss": 2.4505, + "mean_token_accuracy": 0.44343616962432864, + "step": 75605 + }, + { + "epoch": 0.07615519504130067, + "grad_norm": 9.449986214103564, + "learning_rate": 4.991589548091891e-05, + "loss": 2.3957, + "mean_token_accuracy": 0.4310344815254211, + "step": 75610 + }, + { + "epoch": 0.07616023109440484, + "grad_norm": 11.236790398904777, + "learning_rate": 4.99158631090026e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.3793103516101837, + "step": 75615 + }, + { + "epoch": 0.07616526714750901, + "grad_norm": 12.7769487512638, + "learning_rate": 4.991583073086918e-05, + "loss": 2.7342, + "mean_token_accuracy": 0.38965516686439516, + "step": 75620 + }, + { + "epoch": 0.07617030320061319, + "grad_norm": 13.557394454357604, + "learning_rate": 4.991579834651864e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.4263157844543457, + "step": 75625 + }, + { + "epoch": 0.07617533925371736, + "grad_norm": 11.01410154138524, + "learning_rate": 4.991576595595099e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.39999999701976774, + "step": 75630 + }, + { + "epoch": 0.07618037530682154, + "grad_norm": 10.00399654338808, + "learning_rate": 4.9915733559166256e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.3758620619773865, + "step": 75635 + }, + { + "epoch": 0.07618541135992571, + "grad_norm": 15.52837338351932, + "learning_rate": 4.9915701156164425e-05, + "loss": 2.5025, + "mean_token_accuracy": 0.3880822718143463, + "step": 75640 + }, + { + "epoch": 0.07619044741302988, + "grad_norm": 10.646481008341645, + "learning_rate": 4.9915668746945515e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.4655172348022461, + "step": 75645 + }, + { + "epoch": 0.07619548346613406, + "grad_norm": 9.818662496962382, + "learning_rate": 4.991563633150954e-05, + "loss": 2.6408, + "mean_token_accuracy": 0.3931034505367279, + "step": 75650 + }, + { + "epoch": 0.07620051951923823, + "grad_norm": 12.10137909331914, + "learning_rate": 4.991560390985651e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.45366001725196836, + "step": 75655 + }, + { + "epoch": 0.0762055555723424, + "grad_norm": 10.83122901867495, + "learning_rate": 4.991557148198642e-05, + "loss": 2.2971, + "mean_token_accuracy": 0.46896552443504336, + "step": 75660 + }, + { + "epoch": 0.07621059162544658, + "grad_norm": 12.343724186293136, + "learning_rate": 4.9915539047899286e-05, + "loss": 2.1792, + "mean_token_accuracy": 0.47586206793785096, + "step": 75665 + }, + { + "epoch": 0.07621562767855075, + "grad_norm": 10.272449368344718, + "learning_rate": 4.991550660759512e-05, + "loss": 2.8551, + "mean_token_accuracy": 0.37931033968925476, + "step": 75670 + }, + { + "epoch": 0.07622066373165491, + "grad_norm": 12.521664763843308, + "learning_rate": 4.9915474161073936e-05, + "loss": 2.376, + "mean_token_accuracy": 0.4620689690113068, + "step": 75675 + }, + { + "epoch": 0.07622569978475909, + "grad_norm": 8.823733341684603, + "learning_rate": 4.991544170833573e-05, + "loss": 2.2021, + "mean_token_accuracy": 0.4551724135875702, + "step": 75680 + }, + { + "epoch": 0.07623073583786326, + "grad_norm": 10.227872309189037, + "learning_rate": 4.991540924938052e-05, + "loss": 2.6469, + "mean_token_accuracy": 0.3793103516101837, + "step": 75685 + }, + { + "epoch": 0.07623577189096743, + "grad_norm": 11.176571091423007, + "learning_rate": 4.991537678420831e-05, + "loss": 2.8674, + "mean_token_accuracy": 0.36206896007061007, + "step": 75690 + }, + { + "epoch": 0.0762408079440716, + "grad_norm": 9.119711797283024, + "learning_rate": 4.991534431281912e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.40895341634750365, + "step": 75695 + }, + { + "epoch": 0.07624584399717578, + "grad_norm": 13.13742528826164, + "learning_rate": 4.9915311835212944e-05, + "loss": 2.5808, + "mean_token_accuracy": 0.42758620977401735, + "step": 75700 + }, + { + "epoch": 0.07625088005027995, + "grad_norm": 15.485259453933407, + "learning_rate": 4.991527935138979e-05, + "loss": 2.6134, + "mean_token_accuracy": 0.39310344457626345, + "step": 75705 + }, + { + "epoch": 0.07625591610338413, + "grad_norm": 12.245407266659196, + "learning_rate": 4.991524686134969e-05, + "loss": 2.6322, + "mean_token_accuracy": 0.4034482777118683, + "step": 75710 + }, + { + "epoch": 0.0762609521564883, + "grad_norm": 10.410091201452078, + "learning_rate": 4.991521436509263e-05, + "loss": 2.3037, + "mean_token_accuracy": 0.458620685338974, + "step": 75715 + }, + { + "epoch": 0.07626598820959248, + "grad_norm": 11.236174950689106, + "learning_rate": 4.991518186261863e-05, + "loss": 2.5427, + "mean_token_accuracy": 0.41034482717514037, + "step": 75720 + }, + { + "epoch": 0.07627102426269665, + "grad_norm": 10.479069129460154, + "learning_rate": 4.9915149353927696e-05, + "loss": 2.579, + "mean_token_accuracy": 0.38275861740112305, + "step": 75725 + }, + { + "epoch": 0.07627606031580082, + "grad_norm": 10.383650281887649, + "learning_rate": 4.991511683901983e-05, + "loss": 2.8116, + "mean_token_accuracy": 0.3965517282485962, + "step": 75730 + }, + { + "epoch": 0.076281096368905, + "grad_norm": 13.455229293508971, + "learning_rate": 4.991508431789505e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.3758620709180832, + "step": 75735 + }, + { + "epoch": 0.07628613242200917, + "grad_norm": 10.967299834092879, + "learning_rate": 4.991505179055337e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.4275861978530884, + "step": 75740 + }, + { + "epoch": 0.07629116847511333, + "grad_norm": 11.403334189242738, + "learning_rate": 4.991501925699478e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.3931034505367279, + "step": 75745 + }, + { + "epoch": 0.0762962045282175, + "grad_norm": 11.555463087203313, + "learning_rate": 4.991498671721931e-05, + "loss": 2.3855, + "mean_token_accuracy": 0.41724138557910917, + "step": 75750 + }, + { + "epoch": 0.07630124058132168, + "grad_norm": 12.708598989309463, + "learning_rate": 4.991495417122696e-05, + "loss": 2.6184, + "mean_token_accuracy": 0.4, + "step": 75755 + }, + { + "epoch": 0.07630627663442585, + "grad_norm": 10.568363549210302, + "learning_rate": 4.991492161901773e-05, + "loss": 2.31, + "mean_token_accuracy": 0.44482758045196535, + "step": 75760 + }, + { + "epoch": 0.07631131268753003, + "grad_norm": 15.921399308053875, + "learning_rate": 4.991488906059164e-05, + "loss": 2.8976, + "mean_token_accuracy": 0.4329703629016876, + "step": 75765 + }, + { + "epoch": 0.0763163487406342, + "grad_norm": 9.652541944142751, + "learning_rate": 4.9914856495948706e-05, + "loss": 2.631, + "mean_token_accuracy": 0.403448274731636, + "step": 75770 + }, + { + "epoch": 0.07632138479373837, + "grad_norm": 11.303889242152, + "learning_rate": 4.991482392508892e-05, + "loss": 2.8653, + "mean_token_accuracy": 0.3275861978530884, + "step": 75775 + }, + { + "epoch": 0.07632642084684255, + "grad_norm": 17.392034790670053, + "learning_rate": 4.991479134801231e-05, + "loss": 2.725, + "mean_token_accuracy": 0.36896551847457887, + "step": 75780 + }, + { + "epoch": 0.07633145689994672, + "grad_norm": 11.004525866629024, + "learning_rate": 4.9914758764718865e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.4034482777118683, + "step": 75785 + }, + { + "epoch": 0.0763364929530509, + "grad_norm": 11.529170184847597, + "learning_rate": 4.9914726175208605e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.43103447556495667, + "step": 75790 + }, + { + "epoch": 0.07634152900615507, + "grad_norm": 15.050603487959778, + "learning_rate": 4.991469357948153e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.3896551728248596, + "step": 75795 + }, + { + "epoch": 0.07634656505925924, + "grad_norm": 10.789635727337897, + "learning_rate": 4.991466097753767e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.4448275864124298, + "step": 75800 + }, + { + "epoch": 0.07635160111236342, + "grad_norm": 11.683563473038868, + "learning_rate": 4.9914628369377e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.39310344457626345, + "step": 75805 + }, + { + "epoch": 0.07635663716546759, + "grad_norm": 10.765910310748934, + "learning_rate": 4.991459575499956e-05, + "loss": 2.6054, + "mean_token_accuracy": 0.3896551728248596, + "step": 75810 + }, + { + "epoch": 0.07636167321857175, + "grad_norm": 9.635504361083918, + "learning_rate": 4.9914563134405355e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.4517241358757019, + "step": 75815 + }, + { + "epoch": 0.07636670927167592, + "grad_norm": 21.691697319154112, + "learning_rate": 4.991453050759438e-05, + "loss": 2.8796, + "mean_token_accuracy": 0.4000000089406967, + "step": 75820 + }, + { + "epoch": 0.0763717453247801, + "grad_norm": 11.224333017730409, + "learning_rate": 4.991449787456665e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.44137930274009707, + "step": 75825 + }, + { + "epoch": 0.07637678137788427, + "grad_norm": 8.47216788328823, + "learning_rate": 4.991446523532218e-05, + "loss": 2.1865, + "mean_token_accuracy": 0.49243800044059755, + "step": 75830 + }, + { + "epoch": 0.07638181743098844, + "grad_norm": 11.29522572380612, + "learning_rate": 4.9914432589860977e-05, + "loss": 2.2213, + "mean_token_accuracy": 0.4643678247928619, + "step": 75835 + }, + { + "epoch": 0.07638685348409262, + "grad_norm": 9.267673691772961, + "learning_rate": 4.991439993818304e-05, + "loss": 2.5495, + "mean_token_accuracy": 0.417241370677948, + "step": 75840 + }, + { + "epoch": 0.07639188953719679, + "grad_norm": 10.437566058020566, + "learning_rate": 4.9914367280288394e-05, + "loss": 2.0566, + "mean_token_accuracy": 0.46896551847457885, + "step": 75845 + }, + { + "epoch": 0.07639692559030097, + "grad_norm": 13.906158956665738, + "learning_rate": 4.991433461617703e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.3551724135875702, + "step": 75850 + }, + { + "epoch": 0.07640196164340514, + "grad_norm": 12.036452471682875, + "learning_rate": 4.991430194584898e-05, + "loss": 2.7087, + "mean_token_accuracy": 0.4034482717514038, + "step": 75855 + }, + { + "epoch": 0.07640699769650931, + "grad_norm": 10.73824030041405, + "learning_rate": 4.991426926930422e-05, + "loss": 2.418, + "mean_token_accuracy": 0.3965517163276672, + "step": 75860 + }, + { + "epoch": 0.07641203374961349, + "grad_norm": 14.802172998273921, + "learning_rate": 4.991423658654279e-05, + "loss": 2.21, + "mean_token_accuracy": 0.4344827592372894, + "step": 75865 + }, + { + "epoch": 0.07641706980271766, + "grad_norm": 10.500910000184115, + "learning_rate": 4.9914203897564696e-05, + "loss": 2.0921, + "mean_token_accuracy": 0.47428917288780215, + "step": 75870 + }, + { + "epoch": 0.07642210585582183, + "grad_norm": 11.144134351017298, + "learning_rate": 4.9914171202369925e-05, + "loss": 2.2022, + "mean_token_accuracy": 0.44137930274009707, + "step": 75875 + }, + { + "epoch": 0.07642714190892601, + "grad_norm": 12.494834211201681, + "learning_rate": 4.991413850095851e-05, + "loss": 2.6443, + "mean_token_accuracy": 0.375862056016922, + "step": 75880 + }, + { + "epoch": 0.07643217796203017, + "grad_norm": 13.508432611391783, + "learning_rate": 4.9914105793330446e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.3965517282485962, + "step": 75885 + }, + { + "epoch": 0.07643721401513434, + "grad_norm": 10.966296543416753, + "learning_rate": 4.991407307948574e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.3689655214548111, + "step": 75890 + }, + { + "epoch": 0.07644225006823852, + "grad_norm": 11.5297554266137, + "learning_rate": 4.991404035942441e-05, + "loss": 2.2939, + "mean_token_accuracy": 0.4413793087005615, + "step": 75895 + }, + { + "epoch": 0.07644728612134269, + "grad_norm": 12.691866007702652, + "learning_rate": 4.991400763314647e-05, + "loss": 2.5595, + "mean_token_accuracy": 0.4206896543502808, + "step": 75900 + }, + { + "epoch": 0.07645232217444686, + "grad_norm": 10.596178340938968, + "learning_rate": 4.9913974900651914e-05, + "loss": 2.0649, + "mean_token_accuracy": 0.4910098612308502, + "step": 75905 + }, + { + "epoch": 0.07645735822755104, + "grad_norm": 12.24591751101665, + "learning_rate": 4.9913942161940765e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.42413793206214906, + "step": 75910 + }, + { + "epoch": 0.07646239428065521, + "grad_norm": 10.41183630750871, + "learning_rate": 4.991390941701302e-05, + "loss": 2.536, + "mean_token_accuracy": 0.3931034505367279, + "step": 75915 + }, + { + "epoch": 0.07646743033375938, + "grad_norm": 9.28940065559611, + "learning_rate": 4.99138766658687e-05, + "loss": 2.1056, + "mean_token_accuracy": 0.47731398344039916, + "step": 75920 + }, + { + "epoch": 0.07647246638686356, + "grad_norm": 11.280956626278062, + "learning_rate": 4.9913843908507794e-05, + "loss": 2.6147, + "mean_token_accuracy": 0.41379310488700866, + "step": 75925 + }, + { + "epoch": 0.07647750243996773, + "grad_norm": 10.418037589122415, + "learning_rate": 4.991381114493033e-05, + "loss": 2.0753, + "mean_token_accuracy": 0.4137930989265442, + "step": 75930 + }, + { + "epoch": 0.0764825384930719, + "grad_norm": 12.295293040516704, + "learning_rate": 4.991377837513632e-05, + "loss": 2.773, + "mean_token_accuracy": 0.3482758551836014, + "step": 75935 + }, + { + "epoch": 0.07648757454617608, + "grad_norm": 9.647666117705828, + "learning_rate": 4.991374559912576e-05, + "loss": 2.5544, + "mean_token_accuracy": 0.40689654350280763, + "step": 75940 + }, + { + "epoch": 0.07649261059928025, + "grad_norm": 10.129150239704519, + "learning_rate": 4.991371281689867e-05, + "loss": 2.1761, + "mean_token_accuracy": 0.46896551847457885, + "step": 75945 + }, + { + "epoch": 0.07649764665238443, + "grad_norm": 11.802967044004843, + "learning_rate": 4.991368002845504e-05, + "loss": 2.833, + "mean_token_accuracy": 0.31379309892654417, + "step": 75950 + }, + { + "epoch": 0.07650268270548859, + "grad_norm": 9.096263347688152, + "learning_rate": 4.99136472337949e-05, + "loss": 2.2723, + "mean_token_accuracy": 0.4620689690113068, + "step": 75955 + }, + { + "epoch": 0.07650771875859276, + "grad_norm": 12.2025513547769, + "learning_rate": 4.9913614432918256e-05, + "loss": 2.8166, + "mean_token_accuracy": 0.38275861740112305, + "step": 75960 + }, + { + "epoch": 0.07651275481169693, + "grad_norm": 9.476075024808113, + "learning_rate": 4.9913581625825107e-05, + "loss": 2.3497, + "mean_token_accuracy": 0.43793103098869324, + "step": 75965 + }, + { + "epoch": 0.07651779086480111, + "grad_norm": 8.167493639580654, + "learning_rate": 4.991354881251547e-05, + "loss": 1.9435, + "mean_token_accuracy": 0.46551724076271056, + "step": 75970 + }, + { + "epoch": 0.07652282691790528, + "grad_norm": 11.920780312356031, + "learning_rate": 4.9913515992989345e-05, + "loss": 2.0221, + "mean_token_accuracy": 0.4862068831920624, + "step": 75975 + }, + { + "epoch": 0.07652786297100946, + "grad_norm": 10.868402975467268, + "learning_rate": 4.991348316724675e-05, + "loss": 2.1724, + "mean_token_accuracy": 0.4551724135875702, + "step": 75980 + }, + { + "epoch": 0.07653289902411363, + "grad_norm": 16.608397154067035, + "learning_rate": 4.9913450335287705e-05, + "loss": 2.8311, + "mean_token_accuracy": 0.3896551728248596, + "step": 75985 + }, + { + "epoch": 0.0765379350772178, + "grad_norm": 10.539757071855773, + "learning_rate": 4.9913417497112194e-05, + "loss": 2.394, + "mean_token_accuracy": 0.42758620977401735, + "step": 75990 + }, + { + "epoch": 0.07654297113032198, + "grad_norm": 16.595900709794915, + "learning_rate": 4.991338465272023e-05, + "loss": 2.7186, + "mean_token_accuracy": 0.4068965494632721, + "step": 75995 + }, + { + "epoch": 0.07654800718342615, + "grad_norm": 9.316557899786188, + "learning_rate": 4.9913351802111844e-05, + "loss": 2.0665, + "mean_token_accuracy": 0.4913490653038025, + "step": 76000 + }, + { + "epoch": 0.07655304323653032, + "grad_norm": 9.346339293452436, + "learning_rate": 4.991331894528703e-05, + "loss": 2.2158, + "mean_token_accuracy": 0.44137930274009707, + "step": 76005 + }, + { + "epoch": 0.0765580792896345, + "grad_norm": 15.873856871661653, + "learning_rate": 4.99132860822458e-05, + "loss": 3.0836, + "mean_token_accuracy": 0.3724137991666794, + "step": 76010 + }, + { + "epoch": 0.07656311534273867, + "grad_norm": 10.017197081567192, + "learning_rate": 4.991325321298815e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.42068966031074523, + "step": 76015 + }, + { + "epoch": 0.07656815139584285, + "grad_norm": 11.805809406800645, + "learning_rate": 4.9913220337514105e-05, + "loss": 2.6357, + "mean_token_accuracy": 0.3774954617023468, + "step": 76020 + }, + { + "epoch": 0.076573187448947, + "grad_norm": 10.951204614392159, + "learning_rate": 4.991318745582367e-05, + "loss": 3.0615, + "mean_token_accuracy": 0.37241379022598264, + "step": 76025 + }, + { + "epoch": 0.07657822350205118, + "grad_norm": 12.913009617101537, + "learning_rate": 4.991315456791686e-05, + "loss": 2.5665, + "mean_token_accuracy": 0.40496068000793456, + "step": 76030 + }, + { + "epoch": 0.07658325955515535, + "grad_norm": 9.701074470634271, + "learning_rate": 4.9913121673793675e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.4310344815254211, + "step": 76035 + }, + { + "epoch": 0.07658829560825953, + "grad_norm": 10.769471869941352, + "learning_rate": 4.9913088773454125e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.4620689630508423, + "step": 76040 + }, + { + "epoch": 0.0765933316613637, + "grad_norm": 14.152698115220998, + "learning_rate": 4.991305586689822e-05, + "loss": 2.9488, + "mean_token_accuracy": 0.3965517282485962, + "step": 76045 + }, + { + "epoch": 0.07659836771446787, + "grad_norm": 11.285083982795774, + "learning_rate": 4.9913022954125974e-05, + "loss": 2.9882, + "mean_token_accuracy": 0.3793103516101837, + "step": 76050 + }, + { + "epoch": 0.07660340376757205, + "grad_norm": 12.285616983751531, + "learning_rate": 4.991299003513739e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.44827585816383364, + "step": 76055 + }, + { + "epoch": 0.07660843982067622, + "grad_norm": 9.699297240927653, + "learning_rate": 4.991295710993248e-05, + "loss": 2.4837, + "mean_token_accuracy": 0.4517241418361664, + "step": 76060 + }, + { + "epoch": 0.0766134758737804, + "grad_norm": 10.660974168449924, + "learning_rate": 4.991292417851126e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.4137930989265442, + "step": 76065 + }, + { + "epoch": 0.07661851192688457, + "grad_norm": 10.477693084192317, + "learning_rate": 4.991289124087372e-05, + "loss": 2.8269, + "mean_token_accuracy": 0.37586206793785093, + "step": 76070 + }, + { + "epoch": 0.07662354797998874, + "grad_norm": 10.094157344090616, + "learning_rate": 4.991285829701988e-05, + "loss": 2.5221, + "mean_token_accuracy": 0.41379310488700866, + "step": 76075 + }, + { + "epoch": 0.07662858403309292, + "grad_norm": 13.617982164643193, + "learning_rate": 4.991282534694976e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.4, + "step": 76080 + }, + { + "epoch": 0.07663362008619709, + "grad_norm": 10.226769012475385, + "learning_rate": 4.991279239066336e-05, + "loss": 2.5625, + "mean_token_accuracy": 0.3965517282485962, + "step": 76085 + }, + { + "epoch": 0.07663865613930126, + "grad_norm": 11.753950744444511, + "learning_rate": 4.991275942816068e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.4379310369491577, + "step": 76090 + }, + { + "epoch": 0.07664369219240542, + "grad_norm": 13.238021353430796, + "learning_rate": 4.991272645944174e-05, + "loss": 2.6456, + "mean_token_accuracy": 0.4310344815254211, + "step": 76095 + }, + { + "epoch": 0.0766487282455096, + "grad_norm": 13.221475504182036, + "learning_rate": 4.9912693484506554e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.4, + "step": 76100 + }, + { + "epoch": 0.07665376429861377, + "grad_norm": 11.153834513315722, + "learning_rate": 4.9912660503355114e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.39842710494995115, + "step": 76105 + }, + { + "epoch": 0.07665880035171795, + "grad_norm": 12.348462795601518, + "learning_rate": 4.991262751598744e-05, + "loss": 2.5586, + "mean_token_accuracy": 0.40344826579093934, + "step": 76110 + }, + { + "epoch": 0.07666383640482212, + "grad_norm": 11.058857682253631, + "learning_rate": 4.991259452240354e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.3931034505367279, + "step": 76115 + }, + { + "epoch": 0.0766688724579263, + "grad_norm": 13.45940396384504, + "learning_rate": 4.991256152260343e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.3758620649576187, + "step": 76120 + }, + { + "epoch": 0.07667390851103047, + "grad_norm": 10.380294446826063, + "learning_rate": 4.991252851658711e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.38965516686439516, + "step": 76125 + }, + { + "epoch": 0.07667894456413464, + "grad_norm": 10.210874984121272, + "learning_rate": 4.9912495504354594e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.4068965554237366, + "step": 76130 + }, + { + "epoch": 0.07668398061723881, + "grad_norm": 12.355744459605228, + "learning_rate": 4.991246248590588e-05, + "loss": 2.4452, + "mean_token_accuracy": 0.4103448331356049, + "step": 76135 + }, + { + "epoch": 0.07668901667034299, + "grad_norm": 14.781936750895781, + "learning_rate": 4.9912429461240996e-05, + "loss": 2.4778, + "mean_token_accuracy": 0.4482758641242981, + "step": 76140 + }, + { + "epoch": 0.07669405272344716, + "grad_norm": 10.311429793408207, + "learning_rate": 4.991239643035994e-05, + "loss": 2.1637, + "mean_token_accuracy": 0.46896551847457885, + "step": 76145 + }, + { + "epoch": 0.07669908877655134, + "grad_norm": 12.740274594516142, + "learning_rate": 4.991236339326272e-05, + "loss": 2.7306, + "mean_token_accuracy": 0.3793103516101837, + "step": 76150 + }, + { + "epoch": 0.07670412482965551, + "grad_norm": 13.465462341415407, + "learning_rate": 4.9912330349949347e-05, + "loss": 2.7967, + "mean_token_accuracy": 0.3517241358757019, + "step": 76155 + }, + { + "epoch": 0.07670916088275968, + "grad_norm": 22.085605173399905, + "learning_rate": 4.9912297300419816e-05, + "loss": 2.5842, + "mean_token_accuracy": 0.42413792610168455, + "step": 76160 + }, + { + "epoch": 0.07671419693586384, + "grad_norm": 11.377126568261668, + "learning_rate": 4.991226424467417e-05, + "loss": 2.9084, + "mean_token_accuracy": 0.341379314661026, + "step": 76165 + }, + { + "epoch": 0.07671923298896802, + "grad_norm": 9.068799027785182, + "learning_rate": 4.991223118271239e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.4379310369491577, + "step": 76170 + }, + { + "epoch": 0.07672426904207219, + "grad_norm": 11.215560346188667, + "learning_rate": 4.99121981145345e-05, + "loss": 2.5119, + "mean_token_accuracy": 0.4068965554237366, + "step": 76175 + }, + { + "epoch": 0.07672930509517636, + "grad_norm": 10.954607236901209, + "learning_rate": 4.99121650401405e-05, + "loss": 2.6158, + "mean_token_accuracy": 0.3448275774717331, + "step": 76180 + }, + { + "epoch": 0.07673434114828054, + "grad_norm": 12.593666432344227, + "learning_rate": 4.99121319595304e-05, + "loss": 2.678, + "mean_token_accuracy": 0.3655172407627106, + "step": 76185 + }, + { + "epoch": 0.07673937720138471, + "grad_norm": 10.397434917774126, + "learning_rate": 4.991209887270422e-05, + "loss": 2.3829, + "mean_token_accuracy": 0.4413793087005615, + "step": 76190 + }, + { + "epoch": 0.07674441325448889, + "grad_norm": 9.991234837891225, + "learning_rate": 4.9912065779661956e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.42758620381355283, + "step": 76195 + }, + { + "epoch": 0.07674944930759306, + "grad_norm": 12.336582522125166, + "learning_rate": 4.991203268040362e-05, + "loss": 2.2671, + "mean_token_accuracy": 0.4586206912994385, + "step": 76200 + }, + { + "epoch": 0.07675448536069723, + "grad_norm": 10.379597317759364, + "learning_rate": 4.991199957492922e-05, + "loss": 2.2362, + "mean_token_accuracy": 0.4172413766384125, + "step": 76205 + }, + { + "epoch": 0.07675952141380141, + "grad_norm": 10.465127083070813, + "learning_rate": 4.991196646323878e-05, + "loss": 2.6993, + "mean_token_accuracy": 0.3862069010734558, + "step": 76210 + }, + { + "epoch": 0.07676455746690558, + "grad_norm": 15.090868812203627, + "learning_rate": 4.9911933345332286e-05, + "loss": 2.7168, + "mean_token_accuracy": 0.3206896513700485, + "step": 76215 + }, + { + "epoch": 0.07676959352000975, + "grad_norm": 12.652042280143231, + "learning_rate": 4.991190022120977e-05, + "loss": 2.638, + "mean_token_accuracy": 0.4125226855278015, + "step": 76220 + }, + { + "epoch": 0.07677462957311393, + "grad_norm": 13.18150121367197, + "learning_rate": 4.9911867090871215e-05, + "loss": 2.7215, + "mean_token_accuracy": 0.43448275327682495, + "step": 76225 + }, + { + "epoch": 0.0767796656262181, + "grad_norm": 11.995238335150015, + "learning_rate": 4.991183395431665e-05, + "loss": 2.6699, + "mean_token_accuracy": 0.4517241358757019, + "step": 76230 + }, + { + "epoch": 0.07678470167932226, + "grad_norm": 10.423203353669022, + "learning_rate": 4.991180081154608e-05, + "loss": 2.3231, + "mean_token_accuracy": 0.42068964838981626, + "step": 76235 + }, + { + "epoch": 0.07678973773242644, + "grad_norm": 12.167635075071818, + "learning_rate": 4.9911767662559514e-05, + "loss": 2.698, + "mean_token_accuracy": 0.379310342669487, + "step": 76240 + }, + { + "epoch": 0.07679477378553061, + "grad_norm": 11.959190249219661, + "learning_rate": 4.9911734507356965e-05, + "loss": 2.6089, + "mean_token_accuracy": 0.4172413766384125, + "step": 76245 + }, + { + "epoch": 0.07679980983863478, + "grad_norm": 12.57301525973076, + "learning_rate": 4.9911701345938435e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.40344826579093934, + "step": 76250 + }, + { + "epoch": 0.07680484589173896, + "grad_norm": 11.488919352560186, + "learning_rate": 4.9911668178303936e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.4310344815254211, + "step": 76255 + }, + { + "epoch": 0.07680988194484313, + "grad_norm": 11.666034113108683, + "learning_rate": 4.9911635004453476e-05, + "loss": 2.4, + "mean_token_accuracy": 0.38965516686439516, + "step": 76260 + }, + { + "epoch": 0.0768149179979473, + "grad_norm": 11.383287288828535, + "learning_rate": 4.991160182438707e-05, + "loss": 2.7235, + "mean_token_accuracy": 0.35862068831920624, + "step": 76265 + }, + { + "epoch": 0.07681995405105148, + "grad_norm": 9.501910846751839, + "learning_rate": 4.991156863810472e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.42758620381355283, + "step": 76270 + }, + { + "epoch": 0.07682499010415565, + "grad_norm": 10.52889500955914, + "learning_rate": 4.9911535445606436e-05, + "loss": 2.3017, + "mean_token_accuracy": 0.4517241358757019, + "step": 76275 + }, + { + "epoch": 0.07683002615725983, + "grad_norm": 10.127493030419503, + "learning_rate": 4.9911502246892225e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.4206896543502808, + "step": 76280 + }, + { + "epoch": 0.076835062210364, + "grad_norm": 10.467548309573269, + "learning_rate": 4.9911469041962114e-05, + "loss": 2.856, + "mean_token_accuracy": 0.37241379618644715, + "step": 76285 + }, + { + "epoch": 0.07684009826346817, + "grad_norm": 13.920408180237013, + "learning_rate": 4.991143583081608e-05, + "loss": 3.2088, + "mean_token_accuracy": 0.37586206793785093, + "step": 76290 + }, + { + "epoch": 0.07684513431657235, + "grad_norm": 14.69910938298018, + "learning_rate": 4.991140261345417e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.3517241418361664, + "step": 76295 + }, + { + "epoch": 0.07685017036967652, + "grad_norm": 12.792508531622563, + "learning_rate": 4.991136938987636e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.4172413766384125, + "step": 76300 + }, + { + "epoch": 0.07685520642278068, + "grad_norm": 12.281887660239775, + "learning_rate": 4.991133616008268e-05, + "loss": 2.3622, + "mean_token_accuracy": 0.4517241358757019, + "step": 76305 + }, + { + "epoch": 0.07686024247588485, + "grad_norm": 13.286946207923359, + "learning_rate": 4.9911302924073125e-05, + "loss": 2.7695, + "mean_token_accuracy": 0.41724138259887694, + "step": 76310 + }, + { + "epoch": 0.07686527852898903, + "grad_norm": 14.49087467089417, + "learning_rate": 4.991126968184772e-05, + "loss": 2.3895, + "mean_token_accuracy": 0.39491833448410035, + "step": 76315 + }, + { + "epoch": 0.0768703145820932, + "grad_norm": 11.705957293550219, + "learning_rate": 4.991123643340646e-05, + "loss": 2.3873, + "mean_token_accuracy": 0.4344827651977539, + "step": 76320 + }, + { + "epoch": 0.07687535063519738, + "grad_norm": 14.995636561082682, + "learning_rate": 4.991120317874937e-05, + "loss": 2.8071, + "mean_token_accuracy": 0.36896551251411436, + "step": 76325 + }, + { + "epoch": 0.07688038668830155, + "grad_norm": 10.024962210705622, + "learning_rate": 4.991116991787643e-05, + "loss": 2.4787, + "mean_token_accuracy": 0.39655172228813174, + "step": 76330 + }, + { + "epoch": 0.07688542274140572, + "grad_norm": 11.147795370803149, + "learning_rate": 4.991113665078768e-05, + "loss": 2.7896, + "mean_token_accuracy": 0.38275861740112305, + "step": 76335 + }, + { + "epoch": 0.0768904587945099, + "grad_norm": 11.17179787394256, + "learning_rate": 4.991110337748312e-05, + "loss": 2.4725, + "mean_token_accuracy": 0.43793103098869324, + "step": 76340 + }, + { + "epoch": 0.07689549484761407, + "grad_norm": 12.938188381538987, + "learning_rate": 4.991107009796275e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.42068966031074523, + "step": 76345 + }, + { + "epoch": 0.07690053090071824, + "grad_norm": 9.673208918236057, + "learning_rate": 4.99110368122266e-05, + "loss": 2.7345, + "mean_token_accuracy": 0.4517241358757019, + "step": 76350 + }, + { + "epoch": 0.07690556695382242, + "grad_norm": 9.300023807391415, + "learning_rate": 4.991100352027465e-05, + "loss": 2.4001, + "mean_token_accuracy": 0.4655172348022461, + "step": 76355 + }, + { + "epoch": 0.07691060300692659, + "grad_norm": 12.743634072448442, + "learning_rate": 4.991097022210693e-05, + "loss": 3.0314, + "mean_token_accuracy": 0.3517241358757019, + "step": 76360 + }, + { + "epoch": 0.07691563906003077, + "grad_norm": 16.05954807301064, + "learning_rate": 4.9910936917723446e-05, + "loss": 2.8275, + "mean_token_accuracy": 0.36896551549434664, + "step": 76365 + }, + { + "epoch": 0.07692067511313494, + "grad_norm": 11.978254526751734, + "learning_rate": 4.99109036071242e-05, + "loss": 2.8069, + "mean_token_accuracy": 0.3689655244350433, + "step": 76370 + }, + { + "epoch": 0.0769257111662391, + "grad_norm": 12.11489318519132, + "learning_rate": 4.991087029030921e-05, + "loss": 2.6078, + "mean_token_accuracy": 0.37586206793785093, + "step": 76375 + }, + { + "epoch": 0.07693074721934327, + "grad_norm": 18.95962277745913, + "learning_rate": 4.991083696727848e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.4310344815254211, + "step": 76380 + }, + { + "epoch": 0.07693578327244745, + "grad_norm": 12.507215749014307, + "learning_rate": 4.9910803638032024e-05, + "loss": 2.3747, + "mean_token_accuracy": 0.43103448748588563, + "step": 76385 + }, + { + "epoch": 0.07694081932555162, + "grad_norm": 11.040131696764059, + "learning_rate": 4.991077030256984e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.39655172228813174, + "step": 76390 + }, + { + "epoch": 0.0769458553786558, + "grad_norm": 11.162206482310072, + "learning_rate": 4.991073696089195e-05, + "loss": 2.6542, + "mean_token_accuracy": 0.3379310339689255, + "step": 76395 + }, + { + "epoch": 0.07695089143175997, + "grad_norm": 10.077112072607074, + "learning_rate": 4.991070361299836e-05, + "loss": 2.6271, + "mean_token_accuracy": 0.3931034505367279, + "step": 76400 + }, + { + "epoch": 0.07695592748486414, + "grad_norm": 11.070413085272996, + "learning_rate": 4.991067025888908e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.4620689630508423, + "step": 76405 + }, + { + "epoch": 0.07696096353796832, + "grad_norm": 9.917713323973205, + "learning_rate": 4.9910636898564116e-05, + "loss": 2.4849, + "mean_token_accuracy": 0.4034482717514038, + "step": 76410 + }, + { + "epoch": 0.07696599959107249, + "grad_norm": 11.47594400030761, + "learning_rate": 4.991060353202347e-05, + "loss": 2.0804, + "mean_token_accuracy": 0.4435571551322937, + "step": 76415 + }, + { + "epoch": 0.07697103564417666, + "grad_norm": 11.729643919708229, + "learning_rate": 4.991057015926717e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.40538414716720583, + "step": 76420 + }, + { + "epoch": 0.07697607169728084, + "grad_norm": 13.291758812686288, + "learning_rate": 4.991053678029521e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.42758620381355283, + "step": 76425 + }, + { + "epoch": 0.07698110775038501, + "grad_norm": 13.620731717411319, + "learning_rate": 4.99105033951076e-05, + "loss": 2.6175, + "mean_token_accuracy": 0.4034482777118683, + "step": 76430 + }, + { + "epoch": 0.07698614380348918, + "grad_norm": 12.220741589624009, + "learning_rate": 4.991047000370436e-05, + "loss": 2.574, + "mean_token_accuracy": 0.3482758641242981, + "step": 76435 + }, + { + "epoch": 0.07699117985659336, + "grad_norm": 13.102777587086644, + "learning_rate": 4.9910436606085495e-05, + "loss": 2.393, + "mean_token_accuracy": 0.38620689511299133, + "step": 76440 + }, + { + "epoch": 0.07699621590969752, + "grad_norm": 10.567325511205329, + "learning_rate": 4.991040320225101e-05, + "loss": 2.2524, + "mean_token_accuracy": 0.4724137902259827, + "step": 76445 + }, + { + "epoch": 0.07700125196280169, + "grad_norm": 13.399178970245588, + "learning_rate": 4.9910369792200904e-05, + "loss": 2.5242, + "mean_token_accuracy": 0.428796124458313, + "step": 76450 + }, + { + "epoch": 0.07700628801590587, + "grad_norm": 10.514605026897222, + "learning_rate": 4.991033637593521e-05, + "loss": 1.9882, + "mean_token_accuracy": 0.4918330252170563, + "step": 76455 + }, + { + "epoch": 0.07701132406901004, + "grad_norm": 10.480547488976926, + "learning_rate": 4.9910302953453924e-05, + "loss": 2.4896, + "mean_token_accuracy": 0.42758620381355283, + "step": 76460 + }, + { + "epoch": 0.07701636012211421, + "grad_norm": 14.0726287037887, + "learning_rate": 4.9910269524757054e-05, + "loss": 2.604, + "mean_token_accuracy": 0.4068965494632721, + "step": 76465 + }, + { + "epoch": 0.07702139617521839, + "grad_norm": 11.20674176483731, + "learning_rate": 4.9910236089844616e-05, + "loss": 2.4799, + "mean_token_accuracy": 0.4257108271121979, + "step": 76470 + }, + { + "epoch": 0.07702643222832256, + "grad_norm": 10.417336116662733, + "learning_rate": 4.9910202648716616e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.47931033968925474, + "step": 76475 + }, + { + "epoch": 0.07703146828142673, + "grad_norm": 10.816822667545958, + "learning_rate": 4.991016920137306e-05, + "loss": 2.2454, + "mean_token_accuracy": 0.43793103098869324, + "step": 76480 + }, + { + "epoch": 0.07703650433453091, + "grad_norm": 11.45095069200206, + "learning_rate": 4.9910135747813965e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.4551724076271057, + "step": 76485 + }, + { + "epoch": 0.07704154038763508, + "grad_norm": 15.113232681737985, + "learning_rate": 4.991010228803932e-05, + "loss": 2.7173, + "mean_token_accuracy": 0.3965517282485962, + "step": 76490 + }, + { + "epoch": 0.07704657644073926, + "grad_norm": 9.01982280753244, + "learning_rate": 4.991006882204916e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.41597095131874084, + "step": 76495 + }, + { + "epoch": 0.07705161249384343, + "grad_norm": 15.21468290379302, + "learning_rate": 4.991003534984349e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.403448274731636, + "step": 76500 + }, + { + "epoch": 0.0770566485469476, + "grad_norm": 9.744995689730901, + "learning_rate": 4.991000187142231e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.4482758641242981, + "step": 76505 + }, + { + "epoch": 0.07706168460005178, + "grad_norm": 9.801163742930056, + "learning_rate": 4.990996838678563e-05, + "loss": 2.6783, + "mean_token_accuracy": 0.4206896543502808, + "step": 76510 + }, + { + "epoch": 0.07706672065315594, + "grad_norm": 14.915655934076208, + "learning_rate": 4.9909934895933455e-05, + "loss": 2.7936, + "mean_token_accuracy": 0.4034482777118683, + "step": 76515 + }, + { + "epoch": 0.07707175670626011, + "grad_norm": 11.998220359109627, + "learning_rate": 4.9909901398865806e-05, + "loss": 2.3207, + "mean_token_accuracy": 0.4172413766384125, + "step": 76520 + }, + { + "epoch": 0.07707679275936428, + "grad_norm": 10.869897207155173, + "learning_rate": 4.9909867895582685e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.4620689630508423, + "step": 76525 + }, + { + "epoch": 0.07708182881246846, + "grad_norm": 9.082628587109152, + "learning_rate": 4.990983438608411e-05, + "loss": 1.9954, + "mean_token_accuracy": 0.49655171632766726, + "step": 76530 + }, + { + "epoch": 0.07708686486557263, + "grad_norm": 10.34806382485586, + "learning_rate": 4.990980087037008e-05, + "loss": 2.6256, + "mean_token_accuracy": 0.4379310369491577, + "step": 76535 + }, + { + "epoch": 0.0770919009186768, + "grad_norm": 12.660727962211393, + "learning_rate": 4.9909767348440614e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.48275862336158754, + "step": 76540 + }, + { + "epoch": 0.07709693697178098, + "grad_norm": 12.717709141544155, + "learning_rate": 4.9909733820295706e-05, + "loss": 2.2693, + "mean_token_accuracy": 0.44827585816383364, + "step": 76545 + }, + { + "epoch": 0.07710197302488515, + "grad_norm": 12.716322951460041, + "learning_rate": 4.990970028593538e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.44827585816383364, + "step": 76550 + }, + { + "epoch": 0.07710700907798933, + "grad_norm": 13.334343662746278, + "learning_rate": 4.990966674535964e-05, + "loss": 2.9398, + "mean_token_accuracy": 0.36551724672317504, + "step": 76555 + }, + { + "epoch": 0.0771120451310935, + "grad_norm": 10.184239888061123, + "learning_rate": 4.9909633198568495e-05, + "loss": 2.5475, + "mean_token_accuracy": 0.3793103456497192, + "step": 76560 + }, + { + "epoch": 0.07711708118419767, + "grad_norm": 13.435392843453657, + "learning_rate": 4.990959964556195e-05, + "loss": 3.0107, + "mean_token_accuracy": 0.33448275923728943, + "step": 76565 + }, + { + "epoch": 0.07712211723730185, + "grad_norm": 10.490908638250238, + "learning_rate": 4.990956608634003e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.4, + "step": 76570 + }, + { + "epoch": 0.07712715329040602, + "grad_norm": 10.76983059693207, + "learning_rate": 4.990953252090272e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.3862068891525269, + "step": 76575 + }, + { + "epoch": 0.0771321893435102, + "grad_norm": 10.674401436385175, + "learning_rate": 4.990949894925005e-05, + "loss": 2.3607, + "mean_token_accuracy": 0.4517241418361664, + "step": 76580 + }, + { + "epoch": 0.07713722539661436, + "grad_norm": 12.45154984299301, + "learning_rate": 4.990946537138202e-05, + "loss": 2.1622, + "mean_token_accuracy": 0.4565270960330963, + "step": 76585 + }, + { + "epoch": 0.07714226144971853, + "grad_norm": 11.973738759218802, + "learning_rate": 4.9909431787298644e-05, + "loss": 2.8777, + "mean_token_accuracy": 0.3965517282485962, + "step": 76590 + }, + { + "epoch": 0.0771472975028227, + "grad_norm": 12.357447580176368, + "learning_rate": 4.990939819699992e-05, + "loss": 2.8391, + "mean_token_accuracy": 0.358620685338974, + "step": 76595 + }, + { + "epoch": 0.07715233355592688, + "grad_norm": 11.742388869704904, + "learning_rate": 4.990936460048588e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4344827651977539, + "step": 76600 + }, + { + "epoch": 0.07715736960903105, + "grad_norm": 11.273713494057555, + "learning_rate": 4.9909330997756506e-05, + "loss": 2.701, + "mean_token_accuracy": 0.37586206793785093, + "step": 76605 + }, + { + "epoch": 0.07716240566213523, + "grad_norm": 12.415423112366938, + "learning_rate": 4.9909297388811825e-05, + "loss": 2.8771, + "mean_token_accuracy": 0.35862069129943847, + "step": 76610 + }, + { + "epoch": 0.0771674417152394, + "grad_norm": 10.065836057157659, + "learning_rate": 4.9909263773651846e-05, + "loss": 2.176, + "mean_token_accuracy": 0.447005432844162, + "step": 76615 + }, + { + "epoch": 0.07717247776834357, + "grad_norm": 12.12845413323766, + "learning_rate": 4.990923015227656e-05, + "loss": 2.7089, + "mean_token_accuracy": 0.3655172437429428, + "step": 76620 + }, + { + "epoch": 0.07717751382144775, + "grad_norm": 13.282871201607344, + "learning_rate": 4.990919652468601e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.37241379022598264, + "step": 76625 + }, + { + "epoch": 0.07718254987455192, + "grad_norm": 11.831922340032506, + "learning_rate": 4.990916289088017e-05, + "loss": 2.7447, + "mean_token_accuracy": 0.33793103098869326, + "step": 76630 + }, + { + "epoch": 0.0771875859276561, + "grad_norm": 12.146721469423477, + "learning_rate": 4.990912925085908e-05, + "loss": 2.6499, + "mean_token_accuracy": 0.36551723480224607, + "step": 76635 + }, + { + "epoch": 0.07719262198076027, + "grad_norm": 11.138006972203636, + "learning_rate": 4.990909560462272e-05, + "loss": 2.1437, + "mean_token_accuracy": 0.4620689690113068, + "step": 76640 + }, + { + "epoch": 0.07719765803386444, + "grad_norm": 8.764424727819648, + "learning_rate": 4.990906195217112e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.44482758045196535, + "step": 76645 + }, + { + "epoch": 0.07720269408696862, + "grad_norm": 10.57674547070549, + "learning_rate": 4.9909028293504285e-05, + "loss": 2.4115, + "mean_token_accuracy": 0.3931034505367279, + "step": 76650 + }, + { + "epoch": 0.07720773014007278, + "grad_norm": 11.668071100589232, + "learning_rate": 4.9908994628622225e-05, + "loss": 2.8501, + "mean_token_accuracy": 0.35517241060733795, + "step": 76655 + }, + { + "epoch": 0.07721276619317695, + "grad_norm": 11.580241426482525, + "learning_rate": 4.9908960957524936e-05, + "loss": 2.5062, + "mean_token_accuracy": 0.3965517163276672, + "step": 76660 + }, + { + "epoch": 0.07721780224628112, + "grad_norm": 10.068895550433217, + "learning_rate": 4.9908927280212444e-05, + "loss": 2.0169, + "mean_token_accuracy": 0.47791893482208253, + "step": 76665 + }, + { + "epoch": 0.0772228382993853, + "grad_norm": 18.721556810818345, + "learning_rate": 4.990889359668476e-05, + "loss": 2.6902, + "mean_token_accuracy": 0.37241379022598264, + "step": 76670 + }, + { + "epoch": 0.07722787435248947, + "grad_norm": 19.770743462184516, + "learning_rate": 4.9908859906941874e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.46637930870056155, + "step": 76675 + }, + { + "epoch": 0.07723291040559364, + "grad_norm": 10.747204373323715, + "learning_rate": 4.990882621098381e-05, + "loss": 2.2541, + "mean_token_accuracy": 0.4344827592372894, + "step": 76680 + }, + { + "epoch": 0.07723794645869782, + "grad_norm": 8.670902402922255, + "learning_rate": 4.9908792508810574e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.42625529170036314, + "step": 76685 + }, + { + "epoch": 0.07724298251180199, + "grad_norm": 12.54872853327849, + "learning_rate": 4.990875880042218e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.4379310369491577, + "step": 76690 + }, + { + "epoch": 0.07724801856490617, + "grad_norm": 9.833978259582063, + "learning_rate": 4.990872508581863e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.39655172228813174, + "step": 76695 + }, + { + "epoch": 0.07725305461801034, + "grad_norm": 14.217952865564726, + "learning_rate": 4.9908691364999935e-05, + "loss": 2.5453, + "mean_token_accuracy": 0.3965517282485962, + "step": 76700 + }, + { + "epoch": 0.07725809067111451, + "grad_norm": 11.377033165299036, + "learning_rate": 4.990865763796611e-05, + "loss": 2.8863, + "mean_token_accuracy": 0.3827586233615875, + "step": 76705 + }, + { + "epoch": 0.07726312672421869, + "grad_norm": 10.807157102551283, + "learning_rate": 4.990862390471716e-05, + "loss": 2.1228, + "mean_token_accuracy": 0.5068965554237366, + "step": 76710 + }, + { + "epoch": 0.07726816277732286, + "grad_norm": 8.96121664337475, + "learning_rate": 4.990859016525309e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.43448275327682495, + "step": 76715 + }, + { + "epoch": 0.07727319883042703, + "grad_norm": 11.592092091633278, + "learning_rate": 4.990855641957391e-05, + "loss": 2.1826, + "mean_token_accuracy": 0.42758620977401735, + "step": 76720 + }, + { + "epoch": 0.0772782348835312, + "grad_norm": 13.238505096344939, + "learning_rate": 4.990852266767965e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.3896551728248596, + "step": 76725 + }, + { + "epoch": 0.07728327093663537, + "grad_norm": 12.9143989630657, + "learning_rate": 4.990848890957029e-05, + "loss": 2.7285, + "mean_token_accuracy": 0.3275862097740173, + "step": 76730 + }, + { + "epoch": 0.07728830698973954, + "grad_norm": 12.617083302482092, + "learning_rate": 4.990845514524586e-05, + "loss": 3.0764, + "mean_token_accuracy": 0.3482758641242981, + "step": 76735 + }, + { + "epoch": 0.07729334304284372, + "grad_norm": 14.81254473176798, + "learning_rate": 4.990842137470635e-05, + "loss": 3.1178, + "mean_token_accuracy": 0.32068965435028074, + "step": 76740 + }, + { + "epoch": 0.07729837909594789, + "grad_norm": 13.43898111578754, + "learning_rate": 4.990838759795179e-05, + "loss": 2.7771, + "mean_token_accuracy": 0.3620689630508423, + "step": 76745 + }, + { + "epoch": 0.07730341514905206, + "grad_norm": 20.497545315103874, + "learning_rate": 4.990835381498217e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.4344827592372894, + "step": 76750 + }, + { + "epoch": 0.07730845120215624, + "grad_norm": 21.708362381297757, + "learning_rate": 4.990832002579752e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.44640048742294314, + "step": 76755 + }, + { + "epoch": 0.07731348725526041, + "grad_norm": 12.329471219398691, + "learning_rate": 4.990828623039784e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.41724138259887694, + "step": 76760 + }, + { + "epoch": 0.07731852330836458, + "grad_norm": 11.289764539150537, + "learning_rate": 4.990825242878313e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.43593466877937315, + "step": 76765 + }, + { + "epoch": 0.07732355936146876, + "grad_norm": 11.264886517167614, + "learning_rate": 4.990821862095341e-05, + "loss": 2.7125, + "mean_token_accuracy": 0.36896551251411436, + "step": 76770 + }, + { + "epoch": 0.07732859541457293, + "grad_norm": 11.48903927551036, + "learning_rate": 4.990818480690869e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.4206896543502808, + "step": 76775 + }, + { + "epoch": 0.0773336314676771, + "grad_norm": 10.448613586665761, + "learning_rate": 4.990815098664897e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.44137930274009707, + "step": 76780 + }, + { + "epoch": 0.07733866752078128, + "grad_norm": 13.39716654387735, + "learning_rate": 4.990811716017427e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.4034482777118683, + "step": 76785 + }, + { + "epoch": 0.07734370357388545, + "grad_norm": 12.903394287129473, + "learning_rate": 4.99080833274846e-05, + "loss": 2.203, + "mean_token_accuracy": 0.44301270246505736, + "step": 76790 + }, + { + "epoch": 0.07734873962698961, + "grad_norm": 11.637415678341826, + "learning_rate": 4.990804948857995e-05, + "loss": 2.4361, + "mean_token_accuracy": 0.42413792610168455, + "step": 76795 + }, + { + "epoch": 0.07735377568009379, + "grad_norm": 13.273942442474059, + "learning_rate": 4.990801564346037e-05, + "loss": 2.7994, + "mean_token_accuracy": 0.3793103456497192, + "step": 76800 + }, + { + "epoch": 0.07735881173319796, + "grad_norm": 11.144215004101454, + "learning_rate": 4.990798179212582e-05, + "loss": 2.3099, + "mean_token_accuracy": 0.3999999940395355, + "step": 76805 + }, + { + "epoch": 0.07736384778630213, + "grad_norm": 11.025364371891106, + "learning_rate": 4.9907947934576336e-05, + "loss": 1.9878, + "mean_token_accuracy": 0.4949788212776184, + "step": 76810 + }, + { + "epoch": 0.07736888383940631, + "grad_norm": 9.952123290815283, + "learning_rate": 4.9907914070811936e-05, + "loss": 2.4067, + "mean_token_accuracy": 0.4206896543502808, + "step": 76815 + }, + { + "epoch": 0.07737391989251048, + "grad_norm": 11.798667252282023, + "learning_rate": 4.9907880200832605e-05, + "loss": 2.8622, + "mean_token_accuracy": 0.40889292359352114, + "step": 76820 + }, + { + "epoch": 0.07737895594561466, + "grad_norm": 11.045752987632365, + "learning_rate": 4.990784632463837e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.44827587008476255, + "step": 76825 + }, + { + "epoch": 0.07738399199871883, + "grad_norm": 10.172233878407384, + "learning_rate": 4.990781244222924e-05, + "loss": 2.2363, + "mean_token_accuracy": 0.41724138259887694, + "step": 76830 + }, + { + "epoch": 0.077389028051823, + "grad_norm": 8.50641354289392, + "learning_rate": 4.9907778553605206e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.47931033968925474, + "step": 76835 + }, + { + "epoch": 0.07739406410492718, + "grad_norm": 10.730578535301627, + "learning_rate": 4.99077446587663e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.47931034564971925, + "step": 76840 + }, + { + "epoch": 0.07739910015803135, + "grad_norm": 13.61393164705282, + "learning_rate": 4.9907710757712514e-05, + "loss": 2.3032, + "mean_token_accuracy": 0.42413793206214906, + "step": 76845 + }, + { + "epoch": 0.07740413621113552, + "grad_norm": 11.29643594938041, + "learning_rate": 4.990767685044387e-05, + "loss": 2.3159, + "mean_token_accuracy": 0.41034482717514037, + "step": 76850 + }, + { + "epoch": 0.0774091722642397, + "grad_norm": 10.20622086958567, + "learning_rate": 4.990764293696039e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.3862069010734558, + "step": 76855 + }, + { + "epoch": 0.07741420831734387, + "grad_norm": 11.304876840964768, + "learning_rate": 4.9907609017262045e-05, + "loss": 2.7241, + "mean_token_accuracy": 0.36896551251411436, + "step": 76860 + }, + { + "epoch": 0.07741924437044803, + "grad_norm": 10.504188508325953, + "learning_rate": 4.990757509134887e-05, + "loss": 2.191, + "mean_token_accuracy": 0.4310344815254211, + "step": 76865 + }, + { + "epoch": 0.0774242804235522, + "grad_norm": 11.821432158141727, + "learning_rate": 4.990754115922088e-05, + "loss": 2.6792, + "mean_token_accuracy": 0.4137930929660797, + "step": 76870 + }, + { + "epoch": 0.07742931647665638, + "grad_norm": 10.740577286919532, + "learning_rate": 4.9907507220878066e-05, + "loss": 2.6304, + "mean_token_accuracy": 0.4119177222251892, + "step": 76875 + }, + { + "epoch": 0.07743435252976055, + "grad_norm": 10.39652888062975, + "learning_rate": 4.990747327632045e-05, + "loss": 2.6618, + "mean_token_accuracy": 0.37241379618644715, + "step": 76880 + }, + { + "epoch": 0.07743938858286473, + "grad_norm": 10.014224301433858, + "learning_rate": 4.990743932554804e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.4413793087005615, + "step": 76885 + }, + { + "epoch": 0.0774444246359689, + "grad_norm": 12.000288053337657, + "learning_rate": 4.990740536856084e-05, + "loss": 2.6225, + "mean_token_accuracy": 0.38620689511299133, + "step": 76890 + }, + { + "epoch": 0.07744946068907307, + "grad_norm": 11.293931783274486, + "learning_rate": 4.990737140535887e-05, + "loss": 2.539, + "mean_token_accuracy": 0.38965516686439516, + "step": 76895 + }, + { + "epoch": 0.07745449674217725, + "grad_norm": 12.037554042093848, + "learning_rate": 4.990733743594212e-05, + "loss": 2.2965, + "mean_token_accuracy": 0.3965517282485962, + "step": 76900 + }, + { + "epoch": 0.07745953279528142, + "grad_norm": 12.199135105204514, + "learning_rate": 4.990730346031061e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.45359952449798585, + "step": 76905 + }, + { + "epoch": 0.0774645688483856, + "grad_norm": 11.57851067043257, + "learning_rate": 4.9907269478464366e-05, + "loss": 2.7526, + "mean_token_accuracy": 0.36551723480224607, + "step": 76910 + }, + { + "epoch": 0.07746960490148977, + "grad_norm": 10.786326529970497, + "learning_rate": 4.990723549040337e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.4324258863925934, + "step": 76915 + }, + { + "epoch": 0.07747464095459394, + "grad_norm": 10.272268245864172, + "learning_rate": 4.9907201496127646e-05, + "loss": 2.5471, + "mean_token_accuracy": 0.42758620381355283, + "step": 76920 + }, + { + "epoch": 0.07747967700769812, + "grad_norm": 9.00461927597447, + "learning_rate": 4.99071674956372e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.4673926174640656, + "step": 76925 + }, + { + "epoch": 0.07748471306080229, + "grad_norm": 10.374740276214355, + "learning_rate": 4.990713348893205e-05, + "loss": 2.6679, + "mean_token_accuracy": 0.4103448212146759, + "step": 76930 + }, + { + "epoch": 0.07748974911390645, + "grad_norm": 10.850154591565463, + "learning_rate": 4.9907099476012194e-05, + "loss": 2.1278, + "mean_token_accuracy": 0.4781004309654236, + "step": 76935 + }, + { + "epoch": 0.07749478516701062, + "grad_norm": 10.917676404025936, + "learning_rate": 4.990706545687764e-05, + "loss": 2.4571, + "mean_token_accuracy": 0.42758620977401735, + "step": 76940 + }, + { + "epoch": 0.0774998212201148, + "grad_norm": 9.758435948472803, + "learning_rate": 4.99070314315284e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.43653962314128875, + "step": 76945 + }, + { + "epoch": 0.07750485727321897, + "grad_norm": 12.685584803701056, + "learning_rate": 4.990699739996449e-05, + "loss": 3.1165, + "mean_token_accuracy": 0.3738656997680664, + "step": 76950 + }, + { + "epoch": 0.07750989332632315, + "grad_norm": 9.13455298637988, + "learning_rate": 4.9906963362185936e-05, + "loss": 2.402, + "mean_token_accuracy": 0.3931034505367279, + "step": 76955 + }, + { + "epoch": 0.07751492937942732, + "grad_norm": 9.88367341178582, + "learning_rate": 4.9906929318192705e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4344827592372894, + "step": 76960 + }, + { + "epoch": 0.07751996543253149, + "grad_norm": 10.865189248617265, + "learning_rate": 4.990689526798483e-05, + "loss": 2.365, + "mean_token_accuracy": 0.44137930274009707, + "step": 76965 + }, + { + "epoch": 0.07752500148563567, + "grad_norm": 12.906540801424704, + "learning_rate": 4.990686121156232e-05, + "loss": 2.5605, + "mean_token_accuracy": 0.41379310488700866, + "step": 76970 + }, + { + "epoch": 0.07753003753873984, + "grad_norm": 12.200622409007982, + "learning_rate": 4.990682714892519e-05, + "loss": 2.7969, + "mean_token_accuracy": 0.33103448152542114, + "step": 76975 + }, + { + "epoch": 0.07753507359184401, + "grad_norm": 12.551370541912643, + "learning_rate": 4.9906793080073435e-05, + "loss": 2.564, + "mean_token_accuracy": 0.41379310488700866, + "step": 76980 + }, + { + "epoch": 0.07754010964494819, + "grad_norm": 11.073032989595514, + "learning_rate": 4.990675900500708e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.4344827592372894, + "step": 76985 + }, + { + "epoch": 0.07754514569805236, + "grad_norm": 10.549245650561533, + "learning_rate": 4.9906724923726116e-05, + "loss": 2.2975, + "mean_token_accuracy": 0.42068966031074523, + "step": 76990 + }, + { + "epoch": 0.07755018175115654, + "grad_norm": 11.067350904676875, + "learning_rate": 4.9906690836230574e-05, + "loss": 2.7179, + "mean_token_accuracy": 0.3965517282485962, + "step": 76995 + }, + { + "epoch": 0.0775552178042607, + "grad_norm": 9.383835295026707, + "learning_rate": 4.9906656742520446e-05, + "loss": 2.5448, + "mean_token_accuracy": 0.3896551728248596, + "step": 77000 + }, + { + "epoch": 0.07756025385736487, + "grad_norm": 9.001360798490378, + "learning_rate": 4.9906622642595746e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.38620689511299133, + "step": 77005 + }, + { + "epoch": 0.07756528991046904, + "grad_norm": 12.140236485318464, + "learning_rate": 4.9906588536456494e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4, + "step": 77010 + }, + { + "epoch": 0.07757032596357322, + "grad_norm": 12.20984322279911, + "learning_rate": 4.990655442410268e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.47791893482208253, + "step": 77015 + }, + { + "epoch": 0.07757536201667739, + "grad_norm": 16.296733836829336, + "learning_rate": 4.990652030553434e-05, + "loss": 3.2788, + "mean_token_accuracy": 0.31379309892654417, + "step": 77020 + }, + { + "epoch": 0.07758039806978156, + "grad_norm": 13.490840504622232, + "learning_rate": 4.9906486180751467e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.42820197343826294, + "step": 77025 + }, + { + "epoch": 0.07758543412288574, + "grad_norm": 10.258618845610997, + "learning_rate": 4.9906452049754054e-05, + "loss": 2.1728, + "mean_token_accuracy": 0.45172414779663084, + "step": 77030 + }, + { + "epoch": 0.07759047017598991, + "grad_norm": 13.046263594947206, + "learning_rate": 4.9906417912542144e-05, + "loss": 2.39, + "mean_token_accuracy": 0.4172413766384125, + "step": 77035 + }, + { + "epoch": 0.07759550622909409, + "grad_norm": 11.720428962883737, + "learning_rate": 4.990638376911572e-05, + "loss": 2.9226, + "mean_token_accuracy": 0.358620685338974, + "step": 77040 + }, + { + "epoch": 0.07760054228219826, + "grad_norm": 11.36679894835108, + "learning_rate": 4.990634961947481e-05, + "loss": 2.6462, + "mean_token_accuracy": 0.41935873627662656, + "step": 77045 + }, + { + "epoch": 0.07760557833530243, + "grad_norm": 8.042992720656542, + "learning_rate": 4.990631546361941e-05, + "loss": 2.1437, + "mean_token_accuracy": 0.4738916367292404, + "step": 77050 + }, + { + "epoch": 0.0776106143884066, + "grad_norm": 11.33166364550852, + "learning_rate": 4.990628130154954e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.4379310369491577, + "step": 77055 + }, + { + "epoch": 0.07761565044151078, + "grad_norm": 15.931823682485888, + "learning_rate": 4.99062471332652e-05, + "loss": 2.9103, + "mean_token_accuracy": 0.43260737657547, + "step": 77060 + }, + { + "epoch": 0.07762068649461495, + "grad_norm": 11.696198242036498, + "learning_rate": 4.9906212958766405e-05, + "loss": 2.2325, + "mean_token_accuracy": 0.4744101643562317, + "step": 77065 + }, + { + "epoch": 0.07762572254771911, + "grad_norm": 11.868072379388977, + "learning_rate": 4.990617877805317e-05, + "loss": 2.726, + "mean_token_accuracy": 0.3620689630508423, + "step": 77070 + }, + { + "epoch": 0.07763075860082329, + "grad_norm": 12.575450868013142, + "learning_rate": 4.990614459112549e-05, + "loss": 2.2071, + "mean_token_accuracy": 0.4482758641242981, + "step": 77075 + }, + { + "epoch": 0.07763579465392746, + "grad_norm": 9.8635547296028, + "learning_rate": 4.990611039798338e-05, + "loss": 2.6878, + "mean_token_accuracy": 0.42661826610565184, + "step": 77080 + }, + { + "epoch": 0.07764083070703164, + "grad_norm": 10.656224504866053, + "learning_rate": 4.9906076198626855e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.39655172526836396, + "step": 77085 + }, + { + "epoch": 0.07764586676013581, + "grad_norm": 12.21576913900044, + "learning_rate": 4.990604199305592e-05, + "loss": 2.463, + "mean_token_accuracy": 0.4, + "step": 77090 + }, + { + "epoch": 0.07765090281323998, + "grad_norm": 10.542462046468861, + "learning_rate": 4.990600778127059e-05, + "loss": 2.498, + "mean_token_accuracy": 0.3931034505367279, + "step": 77095 + }, + { + "epoch": 0.07765593886634416, + "grad_norm": 10.419810014271121, + "learning_rate": 4.990597356327086e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.37241379022598264, + "step": 77100 + }, + { + "epoch": 0.07766097491944833, + "grad_norm": 12.272945888129174, + "learning_rate": 4.990593933905676e-05, + "loss": 2.655, + "mean_token_accuracy": 0.3965517163276672, + "step": 77105 + }, + { + "epoch": 0.0776660109725525, + "grad_norm": 8.488296768323771, + "learning_rate": 4.990590510862829e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.4448275864124298, + "step": 77110 + }, + { + "epoch": 0.07767104702565668, + "grad_norm": 11.327549218912567, + "learning_rate": 4.990587087198545e-05, + "loss": 2.464, + "mean_token_accuracy": 0.4034482777118683, + "step": 77115 + }, + { + "epoch": 0.07767608307876085, + "grad_norm": 13.340035863347502, + "learning_rate": 4.990583662912826e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.4034482717514038, + "step": 77120 + }, + { + "epoch": 0.07768111913186503, + "grad_norm": 10.111026447498293, + "learning_rate": 4.990580238005673e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.42758620977401735, + "step": 77125 + }, + { + "epoch": 0.0776861551849692, + "grad_norm": 12.198276974909005, + "learning_rate": 4.990576812477087e-05, + "loss": 2.3424, + "mean_token_accuracy": 0.44482757449150084, + "step": 77130 + }, + { + "epoch": 0.07769119123807337, + "grad_norm": 12.269838164945227, + "learning_rate": 4.990573386327068e-05, + "loss": 2.7472, + "mean_token_accuracy": 0.38620689511299133, + "step": 77135 + }, + { + "epoch": 0.07769622729117753, + "grad_norm": 11.214334445580265, + "learning_rate": 4.9905699595556186e-05, + "loss": 2.6984, + "mean_token_accuracy": 0.34137930274009703, + "step": 77140 + }, + { + "epoch": 0.0777012633442817, + "grad_norm": 11.868328614473265, + "learning_rate": 4.9905665321627385e-05, + "loss": 2.8829, + "mean_token_accuracy": 0.37241379618644715, + "step": 77145 + }, + { + "epoch": 0.07770629939738588, + "grad_norm": 9.692249384541242, + "learning_rate": 4.9905631041484276e-05, + "loss": 2.3625, + "mean_token_accuracy": 0.44137930274009707, + "step": 77150 + }, + { + "epoch": 0.07771133545049005, + "grad_norm": 9.260543071964502, + "learning_rate": 4.990559675512689e-05, + "loss": 2.388, + "mean_token_accuracy": 0.46424682140350343, + "step": 77155 + }, + { + "epoch": 0.07771637150359423, + "grad_norm": 10.25993233647754, + "learning_rate": 4.9905562462555234e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.4379310369491577, + "step": 77160 + }, + { + "epoch": 0.0777214075566984, + "grad_norm": 14.465526981916323, + "learning_rate": 4.990552816376931e-05, + "loss": 2.3105, + "mean_token_accuracy": 0.42413792610168455, + "step": 77165 + }, + { + "epoch": 0.07772644360980258, + "grad_norm": 13.472094789788683, + "learning_rate": 4.9905493858769123e-05, + "loss": 2.6939, + "mean_token_accuracy": 0.44972777366638184, + "step": 77170 + }, + { + "epoch": 0.07773147966290675, + "grad_norm": 10.675178218827837, + "learning_rate": 4.990545954755469e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.4931034505367279, + "step": 77175 + }, + { + "epoch": 0.07773651571601092, + "grad_norm": 15.638613658725355, + "learning_rate": 4.990542523012602e-05, + "loss": 2.3421, + "mean_token_accuracy": 0.47586206793785096, + "step": 77180 + }, + { + "epoch": 0.0777415517691151, + "grad_norm": 11.171000247407731, + "learning_rate": 4.9905390906483125e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.4172413766384125, + "step": 77185 + }, + { + "epoch": 0.07774658782221927, + "grad_norm": 10.72922325361121, + "learning_rate": 4.990535657662601e-05, + "loss": 2.5636, + "mean_token_accuracy": 0.4103448331356049, + "step": 77190 + }, + { + "epoch": 0.07775162387532344, + "grad_norm": 13.229697650166804, + "learning_rate": 4.9905322240554685e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.40852994918823243, + "step": 77195 + }, + { + "epoch": 0.07775665992842762, + "grad_norm": 11.55424419307672, + "learning_rate": 4.9905287898269166e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.44827585816383364, + "step": 77200 + }, + { + "epoch": 0.07776169598153179, + "grad_norm": 12.450374053633396, + "learning_rate": 4.990525354976944e-05, + "loss": 2.9809, + "mean_token_accuracy": 0.3827586233615875, + "step": 77205 + }, + { + "epoch": 0.07776673203463595, + "grad_norm": 11.808871111308113, + "learning_rate": 4.990521919505555e-05, + "loss": 2.4263, + "mean_token_accuracy": 0.3655172407627106, + "step": 77210 + }, + { + "epoch": 0.07777176808774013, + "grad_norm": 10.688207290992883, + "learning_rate": 4.9905184834127486e-05, + "loss": 2.5481, + "mean_token_accuracy": 0.3758620619773865, + "step": 77215 + }, + { + "epoch": 0.0777768041408443, + "grad_norm": 11.820694163833142, + "learning_rate": 4.990515046698526e-05, + "loss": 2.357, + "mean_token_accuracy": 0.46896551847457885, + "step": 77220 + }, + { + "epoch": 0.07778184019394847, + "grad_norm": 12.063104326206414, + "learning_rate": 4.9905116093628875e-05, + "loss": 2.5647, + "mean_token_accuracy": 0.4103448212146759, + "step": 77225 + }, + { + "epoch": 0.07778687624705265, + "grad_norm": 10.62478618256457, + "learning_rate": 4.990508171405835e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.39310343861579894, + "step": 77230 + }, + { + "epoch": 0.07779191230015682, + "grad_norm": 17.639347325034, + "learning_rate": 4.9905047328273696e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.4206896543502808, + "step": 77235 + }, + { + "epoch": 0.077796948353261, + "grad_norm": 12.862348450890504, + "learning_rate": 4.990501293627491e-05, + "loss": 3.1285, + "mean_token_accuracy": 0.36551724672317504, + "step": 77240 + }, + { + "epoch": 0.07780198440636517, + "grad_norm": 8.260103950497417, + "learning_rate": 4.9904978538062024e-05, + "loss": 2.0416, + "mean_token_accuracy": 0.5353448271751404, + "step": 77245 + }, + { + "epoch": 0.07780702045946934, + "grad_norm": 11.992925238808867, + "learning_rate": 4.9904944133635024e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.40344828367233276, + "step": 77250 + }, + { + "epoch": 0.07781205651257352, + "grad_norm": 11.40457495918098, + "learning_rate": 4.990490972299393e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.42758620381355283, + "step": 77255 + }, + { + "epoch": 0.07781709256567769, + "grad_norm": 12.143363322759624, + "learning_rate": 4.9904875306138756e-05, + "loss": 2.4874, + "mean_token_accuracy": 0.41379310488700866, + "step": 77260 + }, + { + "epoch": 0.07782212861878186, + "grad_norm": 10.511484738956336, + "learning_rate": 4.99048408830695e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.3620689570903778, + "step": 77265 + }, + { + "epoch": 0.07782716467188604, + "grad_norm": 11.486398400465754, + "learning_rate": 4.9904806453786185e-05, + "loss": 2.216, + "mean_token_accuracy": 0.4551724135875702, + "step": 77270 + }, + { + "epoch": 0.07783220072499021, + "grad_norm": 8.304870115817954, + "learning_rate": 4.9904772018288804e-05, + "loss": 2.1638, + "mean_token_accuracy": 0.47767695784568787, + "step": 77275 + }, + { + "epoch": 0.07783723677809437, + "grad_norm": 10.641655282768328, + "learning_rate": 4.990473757657738e-05, + "loss": 2.0042, + "mean_token_accuracy": 0.4604355812072754, + "step": 77280 + }, + { + "epoch": 0.07784227283119854, + "grad_norm": 15.624176700946522, + "learning_rate": 4.990470312865192e-05, + "loss": 2.438, + "mean_token_accuracy": 0.42413793206214906, + "step": 77285 + }, + { + "epoch": 0.07784730888430272, + "grad_norm": 10.778359249061541, + "learning_rate": 4.990466867451244e-05, + "loss": 2.2777, + "mean_token_accuracy": 0.43793101906776427, + "step": 77290 + }, + { + "epoch": 0.07785234493740689, + "grad_norm": 11.972668001524685, + "learning_rate": 4.990463421415892e-05, + "loss": 2.5658, + "mean_token_accuracy": 0.41034482717514037, + "step": 77295 + }, + { + "epoch": 0.07785738099051107, + "grad_norm": 10.822023264494888, + "learning_rate": 4.990459974759141e-05, + "loss": 2.253, + "mean_token_accuracy": 0.46896551847457885, + "step": 77300 + }, + { + "epoch": 0.07786241704361524, + "grad_norm": 9.67239073341343, + "learning_rate": 4.990456527480989e-05, + "loss": 2.5554, + "mean_token_accuracy": 0.36551724672317504, + "step": 77305 + }, + { + "epoch": 0.07786745309671941, + "grad_norm": 10.868694307630763, + "learning_rate": 4.990453079581439e-05, + "loss": 2.6433, + "mean_token_accuracy": 0.3896551728248596, + "step": 77310 + }, + { + "epoch": 0.07787248914982359, + "grad_norm": 10.405223147062465, + "learning_rate": 4.99044963106049e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.4379310369491577, + "step": 77315 + }, + { + "epoch": 0.07787752520292776, + "grad_norm": 12.01185719692832, + "learning_rate": 4.990446181918144e-05, + "loss": 2.7374, + "mean_token_accuracy": 0.3689655065536499, + "step": 77320 + }, + { + "epoch": 0.07788256125603193, + "grad_norm": 15.449307330379405, + "learning_rate": 4.990442732154403e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.3517241358757019, + "step": 77325 + }, + { + "epoch": 0.07788759730913611, + "grad_norm": 8.708893643828267, + "learning_rate": 4.9904392817692655e-05, + "loss": 2.2835, + "mean_token_accuracy": 0.4712038695812225, + "step": 77330 + }, + { + "epoch": 0.07789263336224028, + "grad_norm": 9.249796535795644, + "learning_rate": 4.990435830762735e-05, + "loss": 2.6712, + "mean_token_accuracy": 0.3946763426065445, + "step": 77335 + }, + { + "epoch": 0.07789766941534446, + "grad_norm": 15.216945957980988, + "learning_rate": 4.9904323791348106e-05, + "loss": 2.7996, + "mean_token_accuracy": 0.3517241358757019, + "step": 77340 + }, + { + "epoch": 0.07790270546844863, + "grad_norm": 10.228957677910476, + "learning_rate": 4.9904289268854945e-05, + "loss": 2.5776, + "mean_token_accuracy": 0.417241370677948, + "step": 77345 + }, + { + "epoch": 0.07790774152155279, + "grad_norm": 10.671223399816135, + "learning_rate": 4.990425474014786e-05, + "loss": 2.4744, + "mean_token_accuracy": 0.42413792610168455, + "step": 77350 + }, + { + "epoch": 0.07791277757465696, + "grad_norm": 11.711857515719785, + "learning_rate": 4.990422020522688e-05, + "loss": 2.7866, + "mean_token_accuracy": 0.3896551728248596, + "step": 77355 + }, + { + "epoch": 0.07791781362776114, + "grad_norm": 11.155314803416971, + "learning_rate": 4.9904185664092e-05, + "loss": 2.3209, + "mean_token_accuracy": 0.41379310488700866, + "step": 77360 + }, + { + "epoch": 0.07792284968086531, + "grad_norm": 13.383179664023888, + "learning_rate": 4.990415111674324e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4, + "step": 77365 + }, + { + "epoch": 0.07792788573396948, + "grad_norm": 10.281608306125618, + "learning_rate": 4.9904116563180605e-05, + "loss": 2.1787, + "mean_token_accuracy": 0.4206896543502808, + "step": 77370 + }, + { + "epoch": 0.07793292178707366, + "grad_norm": 12.190920997018454, + "learning_rate": 4.99040820034041e-05, + "loss": 2.793, + "mean_token_accuracy": 0.42413793206214906, + "step": 77375 + }, + { + "epoch": 0.07793795784017783, + "grad_norm": 10.823050388879269, + "learning_rate": 4.990404743741374e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.3793103456497192, + "step": 77380 + }, + { + "epoch": 0.077942993893282, + "grad_norm": 8.443542591124924, + "learning_rate": 4.9904012865209544e-05, + "loss": 2.45, + "mean_token_accuracy": 0.39842710494995115, + "step": 77385 + }, + { + "epoch": 0.07794802994638618, + "grad_norm": 12.59677876481605, + "learning_rate": 4.99039782867915e-05, + "loss": 2.5679, + "mean_token_accuracy": 0.43103448748588563, + "step": 77390 + }, + { + "epoch": 0.07795306599949035, + "grad_norm": 11.752848582766706, + "learning_rate": 4.9903943702159636e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.42413792610168455, + "step": 77395 + }, + { + "epoch": 0.07795810205259453, + "grad_norm": 12.170300177241133, + "learning_rate": 4.990390911131395e-05, + "loss": 2.6047, + "mean_token_accuracy": 0.38620689511299133, + "step": 77400 + }, + { + "epoch": 0.0779631381056987, + "grad_norm": 11.626570584768757, + "learning_rate": 4.9903874514254455e-05, + "loss": 2.1739, + "mean_token_accuracy": 0.47586207985877993, + "step": 77405 + }, + { + "epoch": 0.07796817415880287, + "grad_norm": 9.541032503015185, + "learning_rate": 4.990383991098117e-05, + "loss": 2.3631, + "mean_token_accuracy": 0.4172413766384125, + "step": 77410 + }, + { + "epoch": 0.07797321021190705, + "grad_norm": 11.004657934081129, + "learning_rate": 4.990380530149409e-05, + "loss": 2.8755, + "mean_token_accuracy": 0.3896551728248596, + "step": 77415 + }, + { + "epoch": 0.07797824626501121, + "grad_norm": 14.366950146538729, + "learning_rate": 4.9903770685793225e-05, + "loss": 2.8183, + "mean_token_accuracy": 0.37586206793785093, + "step": 77420 + }, + { + "epoch": 0.07798328231811538, + "grad_norm": 9.618222271910392, + "learning_rate": 4.99037360638786e-05, + "loss": 2.4779, + "mean_token_accuracy": 0.4344827592372894, + "step": 77425 + }, + { + "epoch": 0.07798831837121956, + "grad_norm": 9.190506763231031, + "learning_rate": 4.990370143575022e-05, + "loss": 2.4545, + "mean_token_accuracy": 0.41379311084747317, + "step": 77430 + }, + { + "epoch": 0.07799335442432373, + "grad_norm": 11.497295798524467, + "learning_rate": 4.990366680140808e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.4275861978530884, + "step": 77435 + }, + { + "epoch": 0.0779983904774279, + "grad_norm": 9.496800597117138, + "learning_rate": 4.99036321608522e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.441379314661026, + "step": 77440 + }, + { + "epoch": 0.07800342653053208, + "grad_norm": 10.152861910006921, + "learning_rate": 4.99035975140826e-05, + "loss": 2.3003, + "mean_token_accuracy": 0.4620689630508423, + "step": 77445 + }, + { + "epoch": 0.07800846258363625, + "grad_norm": 12.35096853437537, + "learning_rate": 4.990356286109927e-05, + "loss": 2.5475, + "mean_token_accuracy": 0.3482758581638336, + "step": 77450 + }, + { + "epoch": 0.07801349863674042, + "grad_norm": 11.840694224903142, + "learning_rate": 4.990352820190222e-05, + "loss": 2.6721, + "mean_token_accuracy": 0.43793103098869324, + "step": 77455 + }, + { + "epoch": 0.0780185346898446, + "grad_norm": 10.858485155808516, + "learning_rate": 4.990349353649148e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.42758620381355283, + "step": 77460 + }, + { + "epoch": 0.07802357074294877, + "grad_norm": 82.99634059416334, + "learning_rate": 4.990345886486704e-05, + "loss": 2.8293, + "mean_token_accuracy": 0.34827585369348524, + "step": 77465 + }, + { + "epoch": 0.07802860679605295, + "grad_norm": 10.649379035150293, + "learning_rate": 4.9903424187028927e-05, + "loss": 2.4238, + "mean_token_accuracy": 0.41524500846862794, + "step": 77470 + }, + { + "epoch": 0.07803364284915712, + "grad_norm": 10.263754428461128, + "learning_rate": 4.990338950297713e-05, + "loss": 2.7195, + "mean_token_accuracy": 0.38965516686439516, + "step": 77475 + }, + { + "epoch": 0.0780386789022613, + "grad_norm": 11.122087049645675, + "learning_rate": 4.9903354812711675e-05, + "loss": 2.5077, + "mean_token_accuracy": 0.3949788331985474, + "step": 77480 + }, + { + "epoch": 0.07804371495536547, + "grad_norm": 8.688720918703758, + "learning_rate": 4.9903320116232567e-05, + "loss": 2.3453, + "mean_token_accuracy": 0.42413793206214906, + "step": 77485 + }, + { + "epoch": 0.07804875100846963, + "grad_norm": 9.092528986781291, + "learning_rate": 4.9903285413539814e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.43793103098869324, + "step": 77490 + }, + { + "epoch": 0.0780537870615738, + "grad_norm": 10.449192995940253, + "learning_rate": 4.990325070463342e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.3827586233615875, + "step": 77495 + }, + { + "epoch": 0.07805882311467797, + "grad_norm": 11.473330004250018, + "learning_rate": 4.990321598951341e-05, + "loss": 2.4092, + "mean_token_accuracy": 0.3986085891723633, + "step": 77500 + }, + { + "epoch": 0.07806385916778215, + "grad_norm": 10.37823741854408, + "learning_rate": 4.9903181268179774e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.42068964838981626, + "step": 77505 + }, + { + "epoch": 0.07806889522088632, + "grad_norm": 17.58227547817744, + "learning_rate": 4.990314654063255e-05, + "loss": 2.8371, + "mean_token_accuracy": 0.36551724672317504, + "step": 77510 + }, + { + "epoch": 0.0780739312739905, + "grad_norm": 9.642618620771664, + "learning_rate": 4.9903111806871703e-05, + "loss": 2.3973, + "mean_token_accuracy": 0.42068966031074523, + "step": 77515 + }, + { + "epoch": 0.07807896732709467, + "grad_norm": 12.428955489996632, + "learning_rate": 4.990307706689729e-05, + "loss": 2.378, + "mean_token_accuracy": 0.42758620977401735, + "step": 77520 + }, + { + "epoch": 0.07808400338019884, + "grad_norm": 9.031763481518894, + "learning_rate": 4.9903042320709294e-05, + "loss": 2.3154, + "mean_token_accuracy": 0.4413793087005615, + "step": 77525 + }, + { + "epoch": 0.07808903943330302, + "grad_norm": 12.109553461055786, + "learning_rate": 4.9903007568307736e-05, + "loss": 3.0783, + "mean_token_accuracy": 0.3517241358757019, + "step": 77530 + }, + { + "epoch": 0.07809407548640719, + "grad_norm": 9.973297290249706, + "learning_rate": 4.990297280969261e-05, + "loss": 2.0104, + "mean_token_accuracy": 0.4620689630508423, + "step": 77535 + }, + { + "epoch": 0.07809911153951136, + "grad_norm": 13.298594226513286, + "learning_rate": 4.9902938044863944e-05, + "loss": 2.7978, + "mean_token_accuracy": 0.36896551847457887, + "step": 77540 + }, + { + "epoch": 0.07810414759261554, + "grad_norm": 10.326294686298171, + "learning_rate": 4.990290327382174e-05, + "loss": 2.5082, + "mean_token_accuracy": 0.4379310250282288, + "step": 77545 + }, + { + "epoch": 0.07810918364571971, + "grad_norm": 9.936743616913024, + "learning_rate": 4.9902868496565996e-05, + "loss": 2.5449, + "mean_token_accuracy": 0.4448275864124298, + "step": 77550 + }, + { + "epoch": 0.07811421969882389, + "grad_norm": 9.44577001416769, + "learning_rate": 4.990283371309674e-05, + "loss": 2.2585, + "mean_token_accuracy": 0.45547489523887635, + "step": 77555 + }, + { + "epoch": 0.07811925575192805, + "grad_norm": 10.373712115208718, + "learning_rate": 4.990279892341398e-05, + "loss": 2.4515, + "mean_token_accuracy": 0.4103448152542114, + "step": 77560 + }, + { + "epoch": 0.07812429180503222, + "grad_norm": 10.619338880950037, + "learning_rate": 4.9902764127517714e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.44295220375061034, + "step": 77565 + }, + { + "epoch": 0.0781293278581364, + "grad_norm": 9.006362702548142, + "learning_rate": 4.990272932540796e-05, + "loss": 2.51, + "mean_token_accuracy": 0.40344828367233276, + "step": 77570 + }, + { + "epoch": 0.07813436391124057, + "grad_norm": 13.01488985470614, + "learning_rate": 4.990269451708472e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.4436781644821167, + "step": 77575 + }, + { + "epoch": 0.07813939996434474, + "grad_norm": 15.26902274071977, + "learning_rate": 4.990265970254802e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4517241418361664, + "step": 77580 + }, + { + "epoch": 0.07814443601744891, + "grad_norm": 11.576915857324082, + "learning_rate": 4.990262488179785e-05, + "loss": 2.3379, + "mean_token_accuracy": 0.441379314661026, + "step": 77585 + }, + { + "epoch": 0.07814947207055309, + "grad_norm": 20.549483069711094, + "learning_rate": 4.990259005483423e-05, + "loss": 2.8464, + "mean_token_accuracy": 0.358620685338974, + "step": 77590 + }, + { + "epoch": 0.07815450812365726, + "grad_norm": 12.046618130577828, + "learning_rate": 4.9902555221657175e-05, + "loss": 2.7274, + "mean_token_accuracy": 0.3793103456497192, + "step": 77595 + }, + { + "epoch": 0.07815954417676144, + "grad_norm": 10.130409015216102, + "learning_rate": 4.9902520382266674e-05, + "loss": 2.7123, + "mean_token_accuracy": 0.37241379618644715, + "step": 77600 + }, + { + "epoch": 0.07816458022986561, + "grad_norm": 11.914377863204965, + "learning_rate": 4.9902485536662766e-05, + "loss": 2.7487, + "mean_token_accuracy": 0.3517241358757019, + "step": 77605 + }, + { + "epoch": 0.07816961628296978, + "grad_norm": 11.462090845801173, + "learning_rate": 4.990245068484543e-05, + "loss": 2.8318, + "mean_token_accuracy": 0.4068965494632721, + "step": 77610 + }, + { + "epoch": 0.07817465233607396, + "grad_norm": 11.709615840720161, + "learning_rate": 4.9902415826814704e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.37586207389831544, + "step": 77615 + }, + { + "epoch": 0.07817968838917813, + "grad_norm": 10.041665743247364, + "learning_rate": 4.990238096257057e-05, + "loss": 2.8671, + "mean_token_accuracy": 0.3931034505367279, + "step": 77620 + }, + { + "epoch": 0.0781847244422823, + "grad_norm": 15.994878669830632, + "learning_rate": 4.990234609211307e-05, + "loss": 2.9271, + "mean_token_accuracy": 0.4034482777118683, + "step": 77625 + }, + { + "epoch": 0.07818976049538647, + "grad_norm": 10.421153728606646, + "learning_rate": 4.990231121544218e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.4689655125141144, + "step": 77630 + }, + { + "epoch": 0.07819479654849064, + "grad_norm": 14.663087599721488, + "learning_rate": 4.9902276332557935e-05, + "loss": 2.3937, + "mean_token_accuracy": 0.4, + "step": 77635 + }, + { + "epoch": 0.07819983260159481, + "grad_norm": 10.84457910859915, + "learning_rate": 4.990224144346033e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.42068966031074523, + "step": 77640 + }, + { + "epoch": 0.07820486865469899, + "grad_norm": 11.029728344334556, + "learning_rate": 4.9902206548149384e-05, + "loss": 2.6145, + "mean_token_accuracy": 0.4137930989265442, + "step": 77645 + }, + { + "epoch": 0.07820990470780316, + "grad_norm": 12.36272269839812, + "learning_rate": 4.990217164662509e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.39806411862373353, + "step": 77650 + }, + { + "epoch": 0.07821494076090733, + "grad_norm": 10.871148998262232, + "learning_rate": 4.9902136738887483e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.4310344815254211, + "step": 77655 + }, + { + "epoch": 0.07821997681401151, + "grad_norm": 9.143362222638888, + "learning_rate": 4.990210182493656e-05, + "loss": 2.1655, + "mean_token_accuracy": 0.4413793087005615, + "step": 77660 + }, + { + "epoch": 0.07822501286711568, + "grad_norm": 11.813268953545558, + "learning_rate": 4.990206690477233e-05, + "loss": 2.0417, + "mean_token_accuracy": 0.49879008531570435, + "step": 77665 + }, + { + "epoch": 0.07823004892021986, + "grad_norm": 12.2874043873012, + "learning_rate": 4.99020319783948e-05, + "loss": 2.43, + "mean_token_accuracy": 0.3931034505367279, + "step": 77670 + }, + { + "epoch": 0.07823508497332403, + "grad_norm": 12.474367189892519, + "learning_rate": 4.990199704580398e-05, + "loss": 2.8515, + "mean_token_accuracy": 0.3896551728248596, + "step": 77675 + }, + { + "epoch": 0.0782401210264282, + "grad_norm": 11.362234313004377, + "learning_rate": 4.9901962106999886e-05, + "loss": 2.5978, + "mean_token_accuracy": 0.37241379618644715, + "step": 77680 + }, + { + "epoch": 0.07824515707953238, + "grad_norm": 16.68431183229866, + "learning_rate": 4.990192716198252e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.44482758045196535, + "step": 77685 + }, + { + "epoch": 0.07825019313263655, + "grad_norm": 10.477123648196688, + "learning_rate": 4.99018922107519e-05, + "loss": 2.6772, + "mean_token_accuracy": 0.38082274198532107, + "step": 77690 + }, + { + "epoch": 0.07825522918574072, + "grad_norm": 10.534043742860597, + "learning_rate": 4.9901857253308035e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.3999999940395355, + "step": 77695 + }, + { + "epoch": 0.07826026523884488, + "grad_norm": 12.662280196862161, + "learning_rate": 4.990182228965093e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.4793103516101837, + "step": 77700 + }, + { + "epoch": 0.07826530129194906, + "grad_norm": 11.314560749507471, + "learning_rate": 4.9901787319780594e-05, + "loss": 2.015, + "mean_token_accuracy": 0.4448275864124298, + "step": 77705 + }, + { + "epoch": 0.07827033734505323, + "grad_norm": 10.133887669552253, + "learning_rate": 4.9901752343697035e-05, + "loss": 2.0298, + "mean_token_accuracy": 0.46896551847457885, + "step": 77710 + }, + { + "epoch": 0.0782753733981574, + "grad_norm": 11.3691894100655, + "learning_rate": 4.990171736140027e-05, + "loss": 2.2396, + "mean_token_accuracy": 0.4620689690113068, + "step": 77715 + }, + { + "epoch": 0.07828040945126158, + "grad_norm": 14.87837346497743, + "learning_rate": 4.9901682372890305e-05, + "loss": 3.0485, + "mean_token_accuracy": 0.28965516984462736, + "step": 77720 + }, + { + "epoch": 0.07828544550436575, + "grad_norm": 10.914356450856655, + "learning_rate": 4.990164737816715e-05, + "loss": 2.4296, + "mean_token_accuracy": 0.4310344815254211, + "step": 77725 + }, + { + "epoch": 0.07829048155746993, + "grad_norm": 13.270479447871297, + "learning_rate": 4.9901612377230815e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.4310344815254211, + "step": 77730 + }, + { + "epoch": 0.0782955176105741, + "grad_norm": 10.238196086365328, + "learning_rate": 4.990157737008131e-05, + "loss": 2.4752, + "mean_token_accuracy": 0.41379310488700866, + "step": 77735 + }, + { + "epoch": 0.07830055366367827, + "grad_norm": 11.050401422033408, + "learning_rate": 4.990154235671865e-05, + "loss": 2.5396, + "mean_token_accuracy": 0.4310344815254211, + "step": 77740 + }, + { + "epoch": 0.07830558971678245, + "grad_norm": 9.237864583147893, + "learning_rate": 4.990150733714282e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.4344827592372894, + "step": 77745 + }, + { + "epoch": 0.07831062576988662, + "grad_norm": 15.751728247058942, + "learning_rate": 4.990147231135386e-05, + "loss": 2.8522, + "mean_token_accuracy": 0.36551724672317504, + "step": 77750 + }, + { + "epoch": 0.0783156618229908, + "grad_norm": 10.760721915459754, + "learning_rate": 4.990143727935177e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.4517241358757019, + "step": 77755 + }, + { + "epoch": 0.07832069787609497, + "grad_norm": 12.6336216729002, + "learning_rate": 4.990140224113656e-05, + "loss": 2.8572, + "mean_token_accuracy": 0.38620689511299133, + "step": 77760 + }, + { + "epoch": 0.07832573392919914, + "grad_norm": 11.989119826292361, + "learning_rate": 4.990136719670823e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.43793103098869324, + "step": 77765 + }, + { + "epoch": 0.0783307699823033, + "grad_norm": 18.65303384860608, + "learning_rate": 4.9901332146066804e-05, + "loss": 2.8702, + "mean_token_accuracy": 0.3551724135875702, + "step": 77770 + }, + { + "epoch": 0.07833580603540748, + "grad_norm": 13.41770640591508, + "learning_rate": 4.9901297089212276e-05, + "loss": 2.9513, + "mean_token_accuracy": 0.4229280114173889, + "step": 77775 + }, + { + "epoch": 0.07834084208851165, + "grad_norm": 11.37938688058401, + "learning_rate": 4.9901262026144674e-05, + "loss": 2.3659, + "mean_token_accuracy": 0.4103448212146759, + "step": 77780 + }, + { + "epoch": 0.07834587814161582, + "grad_norm": 8.918228064831073, + "learning_rate": 4.9901226956863996e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.46551724076271056, + "step": 77785 + }, + { + "epoch": 0.07835091419472, + "grad_norm": 9.845983978064451, + "learning_rate": 4.9901191881370245e-05, + "loss": 2.4309, + "mean_token_accuracy": 0.42068966031074523, + "step": 77790 + }, + { + "epoch": 0.07835595024782417, + "grad_norm": 10.020543979179083, + "learning_rate": 4.990115679966345e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.4310344815254211, + "step": 77795 + }, + { + "epoch": 0.07836098630092835, + "grad_norm": 14.177133947426526, + "learning_rate": 4.990112171174361e-05, + "loss": 2.7465, + "mean_token_accuracy": 0.3827586114406586, + "step": 77800 + }, + { + "epoch": 0.07836602235403252, + "grad_norm": 15.02286040244891, + "learning_rate": 4.990108661761073e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.4448275864124298, + "step": 77805 + }, + { + "epoch": 0.07837105840713669, + "grad_norm": 11.229555815813425, + "learning_rate": 4.990105151726483e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.43793103098869324, + "step": 77810 + }, + { + "epoch": 0.07837609446024087, + "grad_norm": 11.442773974119374, + "learning_rate": 4.990101641070592e-05, + "loss": 2.7001, + "mean_token_accuracy": 0.3620689630508423, + "step": 77815 + }, + { + "epoch": 0.07838113051334504, + "grad_norm": 8.993383660399237, + "learning_rate": 4.990098129793399e-05, + "loss": 2.5105, + "mean_token_accuracy": 0.42758620977401735, + "step": 77820 + }, + { + "epoch": 0.07838616656644921, + "grad_norm": 12.538639900582348, + "learning_rate": 4.9900946178949074e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.43448275327682495, + "step": 77825 + }, + { + "epoch": 0.07839120261955339, + "grad_norm": 11.642842560089266, + "learning_rate": 4.9900911053751164e-05, + "loss": 3.0603, + "mean_token_accuracy": 0.3448275804519653, + "step": 77830 + }, + { + "epoch": 0.07839623867265756, + "grad_norm": 10.167745246079939, + "learning_rate": 4.990087592234029e-05, + "loss": 2.3072, + "mean_token_accuracy": 0.43793103098869324, + "step": 77835 + }, + { + "epoch": 0.07840127472576172, + "grad_norm": 10.983872289131089, + "learning_rate": 4.9900840784716446e-05, + "loss": 2.6044, + "mean_token_accuracy": 0.39473684430122374, + "step": 77840 + }, + { + "epoch": 0.0784063107788659, + "grad_norm": 10.705447759493023, + "learning_rate": 4.990080564087964e-05, + "loss": 2.603, + "mean_token_accuracy": 0.4068965494632721, + "step": 77845 + }, + { + "epoch": 0.07841134683197007, + "grad_norm": 10.865801967797859, + "learning_rate": 4.990077049082989e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.45862069725990295, + "step": 77850 + }, + { + "epoch": 0.07841638288507424, + "grad_norm": 18.314199933038083, + "learning_rate": 4.99007353345672e-05, + "loss": 3.0366, + "mean_token_accuracy": 0.34137930870056155, + "step": 77855 + }, + { + "epoch": 0.07842141893817842, + "grad_norm": 10.64427808962319, + "learning_rate": 4.9900700172091584e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.43448275327682495, + "step": 77860 + }, + { + "epoch": 0.07842645499128259, + "grad_norm": 10.48096390187752, + "learning_rate": 4.990066500340305e-05, + "loss": 2.714, + "mean_token_accuracy": 0.38620689511299133, + "step": 77865 + }, + { + "epoch": 0.07843149104438676, + "grad_norm": 14.82336082220531, + "learning_rate": 4.990062982850161e-05, + "loss": 2.6551, + "mean_token_accuracy": 0.4172413766384125, + "step": 77870 + }, + { + "epoch": 0.07843652709749094, + "grad_norm": 10.624774862309952, + "learning_rate": 4.990059464738727e-05, + "loss": 2.6161, + "mean_token_accuracy": 0.39086509943008424, + "step": 77875 + }, + { + "epoch": 0.07844156315059511, + "grad_norm": 10.099761055375216, + "learning_rate": 4.9900559460060046e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.4, + "step": 77880 + }, + { + "epoch": 0.07844659920369929, + "grad_norm": 10.866077071626687, + "learning_rate": 4.990052426651994e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.4310344815254211, + "step": 77885 + }, + { + "epoch": 0.07845163525680346, + "grad_norm": 12.525396996980634, + "learning_rate": 4.9900489066766956e-05, + "loss": 2.642, + "mean_token_accuracy": 0.41034482717514037, + "step": 77890 + }, + { + "epoch": 0.07845667130990763, + "grad_norm": 9.679378957677699, + "learning_rate": 4.990045386080113e-05, + "loss": 2.5682, + "mean_token_accuracy": 0.42758620977401735, + "step": 77895 + }, + { + "epoch": 0.0784617073630118, + "grad_norm": 10.401800885080567, + "learning_rate": 4.990041864862244e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.4517241418361664, + "step": 77900 + }, + { + "epoch": 0.07846674341611598, + "grad_norm": 14.962260149546244, + "learning_rate": 4.9900383430230925e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.43660011887550354, + "step": 77905 + }, + { + "epoch": 0.07847177946922014, + "grad_norm": 10.783198967962724, + "learning_rate": 4.990034820562657e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.42413793206214906, + "step": 77910 + }, + { + "epoch": 0.07847681552232431, + "grad_norm": 11.112238044939174, + "learning_rate": 4.990031297480939e-05, + "loss": 2.3168, + "mean_token_accuracy": 0.4482758641242981, + "step": 77915 + }, + { + "epoch": 0.07848185157542849, + "grad_norm": 11.517421531163409, + "learning_rate": 4.990027773777941e-05, + "loss": 2.1478, + "mean_token_accuracy": 0.43793103098869324, + "step": 77920 + }, + { + "epoch": 0.07848688762853266, + "grad_norm": 10.552561776653235, + "learning_rate": 4.9900242494536624e-05, + "loss": 2.0982, + "mean_token_accuracy": 0.48620688915252686, + "step": 77925 + }, + { + "epoch": 0.07849192368163684, + "grad_norm": 11.778632147178488, + "learning_rate": 4.990020724508105e-05, + "loss": 2.3175, + "mean_token_accuracy": 0.4068965554237366, + "step": 77930 + }, + { + "epoch": 0.07849695973474101, + "grad_norm": 12.470143389751312, + "learning_rate": 4.990017198941269e-05, + "loss": 2.5224, + "mean_token_accuracy": 0.34137930870056155, + "step": 77935 + }, + { + "epoch": 0.07850199578784518, + "grad_norm": 9.650937328947304, + "learning_rate": 4.9900136727531566e-05, + "loss": 2.2958, + "mean_token_accuracy": 0.443254691362381, + "step": 77940 + }, + { + "epoch": 0.07850703184094936, + "grad_norm": 13.149721790520653, + "learning_rate": 4.9900101459437673e-05, + "loss": 2.3286, + "mean_token_accuracy": 0.477339905500412, + "step": 77945 + }, + { + "epoch": 0.07851206789405353, + "grad_norm": 11.06665839861471, + "learning_rate": 4.990006618513103e-05, + "loss": 2.9532, + "mean_token_accuracy": 0.3724137932062149, + "step": 77950 + }, + { + "epoch": 0.0785171039471577, + "grad_norm": 11.665802059271918, + "learning_rate": 4.990003090461165e-05, + "loss": 2.6354, + "mean_token_accuracy": 0.38275861740112305, + "step": 77955 + }, + { + "epoch": 0.07852214000026188, + "grad_norm": 12.252963413251779, + "learning_rate": 4.989999561787953e-05, + "loss": 2.6806, + "mean_token_accuracy": 0.3862068891525269, + "step": 77960 + }, + { + "epoch": 0.07852717605336605, + "grad_norm": 11.424115839781741, + "learning_rate": 4.989996032493469e-05, + "loss": 2.4271, + "mean_token_accuracy": 0.4034482777118683, + "step": 77965 + }, + { + "epoch": 0.07853221210647023, + "grad_norm": 10.443348288980115, + "learning_rate": 4.9899925025777145e-05, + "loss": 2.6289, + "mean_token_accuracy": 0.4, + "step": 77970 + }, + { + "epoch": 0.0785372481595744, + "grad_norm": 12.22355407825415, + "learning_rate": 4.98998897204069e-05, + "loss": 2.4876, + "mean_token_accuracy": 0.4172413766384125, + "step": 77975 + }, + { + "epoch": 0.07854228421267856, + "grad_norm": 10.188499107388187, + "learning_rate": 4.9899854408823956e-05, + "loss": 2.3492, + "mean_token_accuracy": 0.44827585816383364, + "step": 77980 + }, + { + "epoch": 0.07854732026578273, + "grad_norm": 12.686641330895586, + "learning_rate": 4.989981909102832e-05, + "loss": 2.7979, + "mean_token_accuracy": 0.3827586233615875, + "step": 77985 + }, + { + "epoch": 0.0785523563188869, + "grad_norm": 12.798850868888554, + "learning_rate": 4.989978376702001e-05, + "loss": 2.8899, + "mean_token_accuracy": 0.37931033968925476, + "step": 77990 + }, + { + "epoch": 0.07855739237199108, + "grad_norm": 10.912500127828649, + "learning_rate": 4.989974843679905e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.47931034564971925, + "step": 77995 + }, + { + "epoch": 0.07856242842509525, + "grad_norm": 10.00375394393882, + "learning_rate": 4.989971310036543e-05, + "loss": 2.2352, + "mean_token_accuracy": 0.45517241954803467, + "step": 78000 + }, + { + "epoch": 0.07856746447819943, + "grad_norm": 12.388914182107136, + "learning_rate": 4.989967775771917e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.3896551787853241, + "step": 78005 + }, + { + "epoch": 0.0785725005313036, + "grad_norm": 11.986064372242483, + "learning_rate": 4.989964240886027e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.42413793206214906, + "step": 78010 + }, + { + "epoch": 0.07857753658440778, + "grad_norm": 11.919565627560743, + "learning_rate": 4.989960705378875e-05, + "loss": 2.705, + "mean_token_accuracy": 0.3862069010734558, + "step": 78015 + }, + { + "epoch": 0.07858257263751195, + "grad_norm": 9.718258522378472, + "learning_rate": 4.9899571692504606e-05, + "loss": 2.5904, + "mean_token_accuracy": 0.4103448331356049, + "step": 78020 + }, + { + "epoch": 0.07858760869061612, + "grad_norm": 9.924243929549906, + "learning_rate": 4.989953632500787e-05, + "loss": 2.7506, + "mean_token_accuracy": 0.37241379618644715, + "step": 78025 + }, + { + "epoch": 0.0785926447437203, + "grad_norm": 11.598700105704713, + "learning_rate": 4.9899500951298535e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.4103448212146759, + "step": 78030 + }, + { + "epoch": 0.07859768079682447, + "grad_norm": 10.628935677654898, + "learning_rate": 4.9899465571376615e-05, + "loss": 2.6348, + "mean_token_accuracy": 0.38427101969718935, + "step": 78035 + }, + { + "epoch": 0.07860271684992864, + "grad_norm": 11.034283704894001, + "learning_rate": 4.9899430185242115e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.37241379022598264, + "step": 78040 + }, + { + "epoch": 0.07860775290303282, + "grad_norm": 13.187847758266072, + "learning_rate": 4.989939479289505e-05, + "loss": 2.595, + "mean_token_accuracy": 0.3551724135875702, + "step": 78045 + }, + { + "epoch": 0.07861278895613698, + "grad_norm": 9.679061406343292, + "learning_rate": 4.989935939433543e-05, + "loss": 2.0989, + "mean_token_accuracy": 0.43448275327682495, + "step": 78050 + }, + { + "epoch": 0.07861782500924115, + "grad_norm": 11.821685886627446, + "learning_rate": 4.989932398956327e-05, + "loss": 2.3925, + "mean_token_accuracy": 0.4255898356437683, + "step": 78055 + }, + { + "epoch": 0.07862286106234533, + "grad_norm": 10.644059295370624, + "learning_rate": 4.989928857857856e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.4172413766384125, + "step": 78060 + }, + { + "epoch": 0.0786278971154495, + "grad_norm": 11.0075802549604, + "learning_rate": 4.9899253161381335e-05, + "loss": 2.9096, + "mean_token_accuracy": 0.3620689630508423, + "step": 78065 + }, + { + "epoch": 0.07863293316855367, + "grad_norm": 10.710006767371173, + "learning_rate": 4.98992177379716e-05, + "loss": 2.499, + "mean_token_accuracy": 0.4034482777118683, + "step": 78070 + }, + { + "epoch": 0.07863796922165785, + "grad_norm": 10.414498772499007, + "learning_rate": 4.989918230834934e-05, + "loss": 1.995, + "mean_token_accuracy": 0.4987900793552399, + "step": 78075 + }, + { + "epoch": 0.07864300527476202, + "grad_norm": 15.929957814957554, + "learning_rate": 4.98991468725146e-05, + "loss": 2.87, + "mean_token_accuracy": 0.3551724165678024, + "step": 78080 + }, + { + "epoch": 0.0786480413278662, + "grad_norm": 11.275233058068917, + "learning_rate": 4.989911143046736e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.37586206793785093, + "step": 78085 + }, + { + "epoch": 0.07865307738097037, + "grad_norm": 11.990835674424549, + "learning_rate": 4.989907598220765e-05, + "loss": 2.7431, + "mean_token_accuracy": 0.34827585220336915, + "step": 78090 + }, + { + "epoch": 0.07865811343407454, + "grad_norm": 11.93487259957259, + "learning_rate": 4.989904052773548e-05, + "loss": 2.2425, + "mean_token_accuracy": 0.39655172228813174, + "step": 78095 + }, + { + "epoch": 0.07866314948717872, + "grad_norm": 11.17485918984423, + "learning_rate": 4.989900506705084e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.42758620381355283, + "step": 78100 + }, + { + "epoch": 0.07866818554028289, + "grad_norm": 10.75351023064932, + "learning_rate": 4.989896960015376e-05, + "loss": 2.0886, + "mean_token_accuracy": 0.5275862097740174, + "step": 78105 + }, + { + "epoch": 0.07867322159338706, + "grad_norm": 20.11025771059245, + "learning_rate": 4.989893412704424e-05, + "loss": 2.6657, + "mean_token_accuracy": 0.45015124082565305, + "step": 78110 + }, + { + "epoch": 0.07867825764649124, + "grad_norm": 10.536596538058998, + "learning_rate": 4.989889864772229e-05, + "loss": 2.1567, + "mean_token_accuracy": 0.4551724076271057, + "step": 78115 + }, + { + "epoch": 0.0786832936995954, + "grad_norm": 11.115732843162577, + "learning_rate": 4.9898863162187934e-05, + "loss": 2.606, + "mean_token_accuracy": 0.41034482717514037, + "step": 78120 + }, + { + "epoch": 0.07868832975269957, + "grad_norm": 11.238252077459066, + "learning_rate": 4.9898827670441145e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.37719298601150514, + "step": 78125 + }, + { + "epoch": 0.07869336580580374, + "grad_norm": 11.708453129461873, + "learning_rate": 4.989879217248197e-05, + "loss": 2.209, + "mean_token_accuracy": 0.4310344815254211, + "step": 78130 + }, + { + "epoch": 0.07869840185890792, + "grad_norm": 11.33740620553316, + "learning_rate": 4.9898756668310415e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.44482758045196535, + "step": 78135 + }, + { + "epoch": 0.07870343791201209, + "grad_norm": 12.193548063808446, + "learning_rate": 4.9898721157926475e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.41530550718307496, + "step": 78140 + }, + { + "epoch": 0.07870847396511627, + "grad_norm": 9.019370941574635, + "learning_rate": 4.9898685641330165e-05, + "loss": 2.8856, + "mean_token_accuracy": 0.3896551728248596, + "step": 78145 + }, + { + "epoch": 0.07871351001822044, + "grad_norm": 12.854838406201386, + "learning_rate": 4.98986501185215e-05, + "loss": 3.0098, + "mean_token_accuracy": 0.3241379290819168, + "step": 78150 + }, + { + "epoch": 0.07871854607132461, + "grad_norm": 13.864132592054066, + "learning_rate": 4.989861458950048e-05, + "loss": 2.6143, + "mean_token_accuracy": 0.4172413766384125, + "step": 78155 + }, + { + "epoch": 0.07872358212442879, + "grad_norm": 26.63420088731216, + "learning_rate": 4.989857905426712e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.40689654350280763, + "step": 78160 + }, + { + "epoch": 0.07872861817753296, + "grad_norm": 11.158711318742565, + "learning_rate": 4.9898543512821435e-05, + "loss": 2.4516, + "mean_token_accuracy": 0.44482759237289426, + "step": 78165 + }, + { + "epoch": 0.07873365423063713, + "grad_norm": 10.633237016291352, + "learning_rate": 4.989850796516343e-05, + "loss": 2.7876, + "mean_token_accuracy": 0.37931033968925476, + "step": 78170 + }, + { + "epoch": 0.07873869028374131, + "grad_norm": 10.878535885777696, + "learning_rate": 4.989847241129312e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.48118571639060975, + "step": 78175 + }, + { + "epoch": 0.07874372633684548, + "grad_norm": 10.870838227236137, + "learning_rate": 4.98984368512105e-05, + "loss": 2.8581, + "mean_token_accuracy": 0.3517241358757019, + "step": 78180 + }, + { + "epoch": 0.07874876238994966, + "grad_norm": 11.29869714069804, + "learning_rate": 4.989840128491559e-05, + "loss": 2.6302, + "mean_token_accuracy": 0.38965516686439516, + "step": 78185 + }, + { + "epoch": 0.07875379844305382, + "grad_norm": 11.768136283267921, + "learning_rate": 4.989836571240841e-05, + "loss": 2.7723, + "mean_token_accuracy": 0.37586206793785093, + "step": 78190 + }, + { + "epoch": 0.07875883449615799, + "grad_norm": 12.465739118803386, + "learning_rate": 4.989833013368895e-05, + "loss": 2.6103, + "mean_token_accuracy": 0.4034482717514038, + "step": 78195 + }, + { + "epoch": 0.07876387054926216, + "grad_norm": 11.579993657782195, + "learning_rate": 4.9898294548757234e-05, + "loss": 2.5917, + "mean_token_accuracy": 0.37241379618644715, + "step": 78200 + }, + { + "epoch": 0.07876890660236634, + "grad_norm": 11.871879928764042, + "learning_rate": 4.989825895761326e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.42413793206214906, + "step": 78205 + }, + { + "epoch": 0.07877394265547051, + "grad_norm": 10.877670938099344, + "learning_rate": 4.989822336025706e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.4103448331356049, + "step": 78210 + }, + { + "epoch": 0.07877897870857468, + "grad_norm": 9.601080917141719, + "learning_rate": 4.989818775668862e-05, + "loss": 2.5743, + "mean_token_accuracy": 0.44137930274009707, + "step": 78215 + }, + { + "epoch": 0.07878401476167886, + "grad_norm": 12.315288445007246, + "learning_rate": 4.989815214690796e-05, + "loss": 2.6005, + "mean_token_accuracy": 0.4275861978530884, + "step": 78220 + }, + { + "epoch": 0.07878905081478303, + "grad_norm": 10.692325740790034, + "learning_rate": 4.989811653091509e-05, + "loss": 2.3773, + "mean_token_accuracy": 0.4620689570903778, + "step": 78225 + }, + { + "epoch": 0.0787940868678872, + "grad_norm": 10.718865183674883, + "learning_rate": 4.989808090871002e-05, + "loss": 2.6373, + "mean_token_accuracy": 0.37931033968925476, + "step": 78230 + }, + { + "epoch": 0.07879912292099138, + "grad_norm": 12.791412510972354, + "learning_rate": 4.989804528029276e-05, + "loss": 2.8541, + "mean_token_accuracy": 0.39310344457626345, + "step": 78235 + }, + { + "epoch": 0.07880415897409555, + "grad_norm": 8.768748426886317, + "learning_rate": 4.989800964566331e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.45862067937850953, + "step": 78240 + }, + { + "epoch": 0.07880919502719973, + "grad_norm": 15.67332287652288, + "learning_rate": 4.9897974004821696e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.4344827651977539, + "step": 78245 + }, + { + "epoch": 0.0788142310803039, + "grad_norm": 17.235932745695596, + "learning_rate": 4.989793835776791e-05, + "loss": 2.542, + "mean_token_accuracy": 0.39310344457626345, + "step": 78250 + }, + { + "epoch": 0.07881926713340807, + "grad_norm": 11.97124630879913, + "learning_rate": 4.989790270450199e-05, + "loss": 2.9625, + "mean_token_accuracy": 0.37931033968925476, + "step": 78255 + }, + { + "epoch": 0.07882430318651223, + "grad_norm": 11.641866945741356, + "learning_rate": 4.989786704502391e-05, + "loss": 2.1454, + "mean_token_accuracy": 0.4310344815254211, + "step": 78260 + }, + { + "epoch": 0.07882933923961641, + "grad_norm": 13.39836576076939, + "learning_rate": 4.989783137933372e-05, + "loss": 2.6927, + "mean_token_accuracy": 0.40689654350280763, + "step": 78265 + }, + { + "epoch": 0.07883437529272058, + "grad_norm": 11.896583936661386, + "learning_rate": 4.9897795707431396e-05, + "loss": 2.2044, + "mean_token_accuracy": 0.46896551847457885, + "step": 78270 + }, + { + "epoch": 0.07883941134582476, + "grad_norm": 11.969052013258853, + "learning_rate": 4.989776002931695e-05, + "loss": 2.5295, + "mean_token_accuracy": 0.3965517282485962, + "step": 78275 + }, + { + "epoch": 0.07884444739892893, + "grad_norm": 11.77452714337248, + "learning_rate": 4.9897724344990415e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.4172413766384125, + "step": 78280 + }, + { + "epoch": 0.0788494834520331, + "grad_norm": 8.82001672170787, + "learning_rate": 4.989768865445178e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.4569872975349426, + "step": 78285 + }, + { + "epoch": 0.07885451950513728, + "grad_norm": 12.685321253895612, + "learning_rate": 4.9897652957701065e-05, + "loss": 2.3342, + "mean_token_accuracy": 0.4344827651977539, + "step": 78290 + }, + { + "epoch": 0.07885955555824145, + "grad_norm": 11.106801939889955, + "learning_rate": 4.9897617254738276e-05, + "loss": 2.4034, + "mean_token_accuracy": 0.43793103098869324, + "step": 78295 + }, + { + "epoch": 0.07886459161134562, + "grad_norm": 12.515324623807322, + "learning_rate": 4.989758154556343e-05, + "loss": 2.59, + "mean_token_accuracy": 0.42758620381355283, + "step": 78300 + }, + { + "epoch": 0.0788696276644498, + "grad_norm": 8.449631689722558, + "learning_rate": 4.989754583017652e-05, + "loss": 2.5047, + "mean_token_accuracy": 0.382758629322052, + "step": 78305 + }, + { + "epoch": 0.07887466371755397, + "grad_norm": 9.545694913009477, + "learning_rate": 4.989751010857758e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.4517241299152374, + "step": 78310 + }, + { + "epoch": 0.07887969977065815, + "grad_norm": 12.286567312827732, + "learning_rate": 4.98974743807666e-05, + "loss": 2.5101, + "mean_token_accuracy": 0.38965517580509185, + "step": 78315 + }, + { + "epoch": 0.07888473582376232, + "grad_norm": 11.504772314547145, + "learning_rate": 4.9897438646743596e-05, + "loss": 2.7542, + "mean_token_accuracy": 0.38275861740112305, + "step": 78320 + }, + { + "epoch": 0.0788897718768665, + "grad_norm": 10.75957659578681, + "learning_rate": 4.9897402906508585e-05, + "loss": 3.0954, + "mean_token_accuracy": 0.35862068831920624, + "step": 78325 + }, + { + "epoch": 0.07889480792997065, + "grad_norm": 9.327434533143025, + "learning_rate": 4.9897367160061564e-05, + "loss": 2.3284, + "mean_token_accuracy": 0.41724137365818026, + "step": 78330 + }, + { + "epoch": 0.07889984398307483, + "grad_norm": 11.36911630927558, + "learning_rate": 4.989733140740255e-05, + "loss": 2.6571, + "mean_token_accuracy": 0.36896551847457887, + "step": 78335 + }, + { + "epoch": 0.078904880036179, + "grad_norm": 11.28981496466912, + "learning_rate": 4.9897295648531556e-05, + "loss": 2.8349, + "mean_token_accuracy": 0.4241379380226135, + "step": 78340 + }, + { + "epoch": 0.07890991608928317, + "grad_norm": 11.712463525010799, + "learning_rate": 4.989725988344859e-05, + "loss": 2.507, + "mean_token_accuracy": 0.37586206793785093, + "step": 78345 + }, + { + "epoch": 0.07891495214238735, + "grad_norm": 12.308655374647532, + "learning_rate": 4.989722411215366e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.4053841531276703, + "step": 78350 + }, + { + "epoch": 0.07891998819549152, + "grad_norm": 21.8597767339935, + "learning_rate": 4.9897188334646774e-05, + "loss": 2.7904, + "mean_token_accuracy": 0.3655172437429428, + "step": 78355 + }, + { + "epoch": 0.0789250242485957, + "grad_norm": 9.836588536941452, + "learning_rate": 4.9897152550927945e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.4344827651977539, + "step": 78360 + }, + { + "epoch": 0.07893006030169987, + "grad_norm": 10.364152791705735, + "learning_rate": 4.989711676099718e-05, + "loss": 2.1182, + "mean_token_accuracy": 0.44827585816383364, + "step": 78365 + }, + { + "epoch": 0.07893509635480404, + "grad_norm": 10.715354779091028, + "learning_rate": 4.989708096485449e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.41724138855934145, + "step": 78370 + }, + { + "epoch": 0.07894013240790822, + "grad_norm": 11.602813852620008, + "learning_rate": 4.9897045162499886e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.42068964838981626, + "step": 78375 + }, + { + "epoch": 0.07894516846101239, + "grad_norm": 10.253440608869086, + "learning_rate": 4.9897009353933386e-05, + "loss": 2.808, + "mean_token_accuracy": 0.37586207389831544, + "step": 78380 + }, + { + "epoch": 0.07895020451411656, + "grad_norm": 11.335797322360873, + "learning_rate": 4.989697353915498e-05, + "loss": 2.8148, + "mean_token_accuracy": 0.3758620619773865, + "step": 78385 + }, + { + "epoch": 0.07895524056722074, + "grad_norm": 11.143192649833749, + "learning_rate": 4.989693771816471e-05, + "loss": 2.674, + "mean_token_accuracy": 0.37586206793785093, + "step": 78390 + }, + { + "epoch": 0.07896027662032491, + "grad_norm": 14.164056237246406, + "learning_rate": 4.989690189096255e-05, + "loss": 2.6687, + "mean_token_accuracy": 0.4034482777118683, + "step": 78395 + }, + { + "epoch": 0.07896531267342907, + "grad_norm": 11.924397553928824, + "learning_rate": 4.989686605754852e-05, + "loss": 2.5302, + "mean_token_accuracy": 0.4310344815254211, + "step": 78400 + }, + { + "epoch": 0.07897034872653325, + "grad_norm": 13.27827908848532, + "learning_rate": 4.989683021792265e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.37586206793785093, + "step": 78405 + }, + { + "epoch": 0.07897538477963742, + "grad_norm": 10.714199390729068, + "learning_rate": 4.989679437208493e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.4848154842853546, + "step": 78410 + }, + { + "epoch": 0.0789804208327416, + "grad_norm": 12.211020084039685, + "learning_rate": 4.9896758520035375e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.4551724076271057, + "step": 78415 + }, + { + "epoch": 0.07898545688584577, + "grad_norm": 12.936267522472036, + "learning_rate": 4.9896722661773994e-05, + "loss": 2.7644, + "mean_token_accuracy": 0.3517241418361664, + "step": 78420 + }, + { + "epoch": 0.07899049293894994, + "grad_norm": 12.982692873641906, + "learning_rate": 4.98966867973008e-05, + "loss": 2.6529, + "mean_token_accuracy": 0.42413793206214906, + "step": 78425 + }, + { + "epoch": 0.07899552899205411, + "grad_norm": 11.433586607962994, + "learning_rate": 4.98966509266158e-05, + "loss": 2.7599, + "mean_token_accuracy": 0.3896551728248596, + "step": 78430 + }, + { + "epoch": 0.07900056504515829, + "grad_norm": 9.317903010298034, + "learning_rate": 4.9896615049719005e-05, + "loss": 2.358, + "mean_token_accuracy": 0.4724137902259827, + "step": 78435 + }, + { + "epoch": 0.07900560109826246, + "grad_norm": 9.206169183648598, + "learning_rate": 4.989657916661043e-05, + "loss": 2.4098, + "mean_token_accuracy": 0.42068966031074523, + "step": 78440 + }, + { + "epoch": 0.07901063715136664, + "grad_norm": 10.618737812209181, + "learning_rate": 4.9896543277290076e-05, + "loss": 2.4763, + "mean_token_accuracy": 0.4344827592372894, + "step": 78445 + }, + { + "epoch": 0.07901567320447081, + "grad_norm": 13.249988257152847, + "learning_rate": 4.989650738175796e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.4551724135875702, + "step": 78450 + }, + { + "epoch": 0.07902070925757498, + "grad_norm": 9.789724521286935, + "learning_rate": 4.9896471480014086e-05, + "loss": 2.356, + "mean_token_accuracy": 0.41379310488700866, + "step": 78455 + }, + { + "epoch": 0.07902574531067916, + "grad_norm": 12.737389570698216, + "learning_rate": 4.989643557205846e-05, + "loss": 2.6491, + "mean_token_accuracy": 0.3448275923728943, + "step": 78460 + }, + { + "epoch": 0.07903078136378333, + "grad_norm": 12.171954394195058, + "learning_rate": 4.9896399657891116e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.40459770560264585, + "step": 78465 + }, + { + "epoch": 0.07903581741688749, + "grad_norm": 9.626619542814526, + "learning_rate": 4.989636373751204e-05, + "loss": 2.5488, + "mean_token_accuracy": 0.4517241299152374, + "step": 78470 + }, + { + "epoch": 0.07904085346999166, + "grad_norm": 12.786666036995987, + "learning_rate": 4.989632781092125e-05, + "loss": 2.8176, + "mean_token_accuracy": 0.3793103456497192, + "step": 78475 + }, + { + "epoch": 0.07904588952309584, + "grad_norm": 9.672040446618084, + "learning_rate": 4.9896291878118745e-05, + "loss": 2.5524, + "mean_token_accuracy": 0.4310344815254211, + "step": 78480 + }, + { + "epoch": 0.07905092557620001, + "grad_norm": 14.22173018931263, + "learning_rate": 4.989625593910455e-05, + "loss": 2.5094, + "mean_token_accuracy": 0.4275862157344818, + "step": 78485 + }, + { + "epoch": 0.07905596162930419, + "grad_norm": 11.453353872963568, + "learning_rate": 4.9896219993878676e-05, + "loss": 2.7493, + "mean_token_accuracy": 0.37241379022598264, + "step": 78490 + }, + { + "epoch": 0.07906099768240836, + "grad_norm": 10.614517530726477, + "learning_rate": 4.989618404244112e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.4586206912994385, + "step": 78495 + }, + { + "epoch": 0.07906603373551253, + "grad_norm": 11.345152082085463, + "learning_rate": 4.9896148084791904e-05, + "loss": 2.5453, + "mean_token_accuracy": 0.37241379618644715, + "step": 78500 + }, + { + "epoch": 0.07907106978861671, + "grad_norm": 10.192445124255062, + "learning_rate": 4.9896112120931034e-05, + "loss": 2.821, + "mean_token_accuracy": 0.401935875415802, + "step": 78505 + }, + { + "epoch": 0.07907610584172088, + "grad_norm": 11.426046682426364, + "learning_rate": 4.989607615085851e-05, + "loss": 2.1573, + "mean_token_accuracy": 0.4517241418361664, + "step": 78510 + }, + { + "epoch": 0.07908114189482505, + "grad_norm": 11.006950411716689, + "learning_rate": 4.989604017457436e-05, + "loss": 2.3259, + "mean_token_accuracy": 0.43103448748588563, + "step": 78515 + }, + { + "epoch": 0.07908617794792923, + "grad_norm": 11.472446347213923, + "learning_rate": 4.9896004192078574e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.4517241418361664, + "step": 78520 + }, + { + "epoch": 0.0790912140010334, + "grad_norm": 14.135369688170858, + "learning_rate": 4.989596820337118e-05, + "loss": 2.33, + "mean_token_accuracy": 0.4436176657676697, + "step": 78525 + }, + { + "epoch": 0.07909625005413758, + "grad_norm": 7.583904831950299, + "learning_rate": 4.9895932208452185e-05, + "loss": 2.2528, + "mean_token_accuracy": 0.44053236246109007, + "step": 78530 + }, + { + "epoch": 0.07910128610724175, + "grad_norm": 12.696054766514253, + "learning_rate": 4.9895896207321594e-05, + "loss": 2.6148, + "mean_token_accuracy": 0.3931034505367279, + "step": 78535 + }, + { + "epoch": 0.07910632216034591, + "grad_norm": 11.855054225172006, + "learning_rate": 4.989586019997941e-05, + "loss": 2.3555, + "mean_token_accuracy": 0.42329097986221315, + "step": 78540 + }, + { + "epoch": 0.07911135821345008, + "grad_norm": 15.98435187546126, + "learning_rate": 4.9895824186425646e-05, + "loss": 2.446, + "mean_token_accuracy": 0.47586206793785096, + "step": 78545 + }, + { + "epoch": 0.07911639426655426, + "grad_norm": 13.448484458402408, + "learning_rate": 4.989578816666033e-05, + "loss": 2.6782, + "mean_token_accuracy": 0.41379310488700866, + "step": 78550 + }, + { + "epoch": 0.07912143031965843, + "grad_norm": 10.973437750868962, + "learning_rate": 4.989575214068346e-05, + "loss": 2.347, + "mean_token_accuracy": 0.4122202038764954, + "step": 78555 + }, + { + "epoch": 0.0791264663727626, + "grad_norm": 11.05906416877406, + "learning_rate": 4.9895716108495035e-05, + "loss": 2.1053, + "mean_token_accuracy": 0.4965517222881317, + "step": 78560 + }, + { + "epoch": 0.07913150242586678, + "grad_norm": 15.603490232309085, + "learning_rate": 4.9895680070095076e-05, + "loss": 2.7859, + "mean_token_accuracy": 0.46551723778247833, + "step": 78565 + }, + { + "epoch": 0.07913653847897095, + "grad_norm": 13.585602495892667, + "learning_rate": 4.989564402548359e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.4137930989265442, + "step": 78570 + }, + { + "epoch": 0.07914157453207513, + "grad_norm": 14.125507906797763, + "learning_rate": 4.989560797466059e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.458620685338974, + "step": 78575 + }, + { + "epoch": 0.0791466105851793, + "grad_norm": 11.39027051451844, + "learning_rate": 4.9895571917626094e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.38620689511299133, + "step": 78580 + }, + { + "epoch": 0.07915164663828347, + "grad_norm": 10.286879992247956, + "learning_rate": 4.989553585438009e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.3987900733947754, + "step": 78585 + }, + { + "epoch": 0.07915668269138765, + "grad_norm": 10.036019679285083, + "learning_rate": 4.98954997849226e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.47241378426551817, + "step": 78590 + }, + { + "epoch": 0.07916171874449182, + "grad_norm": 11.005379804823022, + "learning_rate": 4.989546370925364e-05, + "loss": 2.2059, + "mean_token_accuracy": 0.4689655125141144, + "step": 78595 + }, + { + "epoch": 0.079166754797596, + "grad_norm": 9.036371321835324, + "learning_rate": 4.989542762737321e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.39655172228813174, + "step": 78600 + }, + { + "epoch": 0.07917179085070017, + "grad_norm": 10.758523305561363, + "learning_rate": 4.989539153928134e-05, + "loss": 2.3823, + "mean_token_accuracy": 0.4344827592372894, + "step": 78605 + }, + { + "epoch": 0.07917682690380433, + "grad_norm": 10.918537252225011, + "learning_rate": 4.989535544497801e-05, + "loss": 2.371, + "mean_token_accuracy": 0.4310344815254211, + "step": 78610 + }, + { + "epoch": 0.0791818629569085, + "grad_norm": 14.586341409415386, + "learning_rate": 4.989531934446325e-05, + "loss": 2.7082, + "mean_token_accuracy": 0.38620689511299133, + "step": 78615 + }, + { + "epoch": 0.07918689901001268, + "grad_norm": 11.865600807322288, + "learning_rate": 4.989528323773706e-05, + "loss": 2.4662, + "mean_token_accuracy": 0.45517241954803467, + "step": 78620 + }, + { + "epoch": 0.07919193506311685, + "grad_norm": 11.802136188100564, + "learning_rate": 4.989524712479946e-05, + "loss": 2.4161, + "mean_token_accuracy": 0.4551724135875702, + "step": 78625 + }, + { + "epoch": 0.07919697111622102, + "grad_norm": 11.268328041407166, + "learning_rate": 4.989521100565045e-05, + "loss": 2.538, + "mean_token_accuracy": 0.4413793087005615, + "step": 78630 + }, + { + "epoch": 0.0792020071693252, + "grad_norm": 12.968375867174217, + "learning_rate": 4.989517488029005e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.441379314661026, + "step": 78635 + }, + { + "epoch": 0.07920704322242937, + "grad_norm": 10.799306106715989, + "learning_rate": 4.989513874871825e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.46896551847457885, + "step": 78640 + }, + { + "epoch": 0.07921207927553355, + "grad_norm": 14.87386045397428, + "learning_rate": 4.98951026109351e-05, + "loss": 2.5901, + "mean_token_accuracy": 0.37586206793785093, + "step": 78645 + }, + { + "epoch": 0.07921711532863772, + "grad_norm": 13.477035956769711, + "learning_rate": 4.989506646694056e-05, + "loss": 2.8726, + "mean_token_accuracy": 0.358620685338974, + "step": 78650 + }, + { + "epoch": 0.07922215138174189, + "grad_norm": 12.933320886047325, + "learning_rate": 4.989503031673469e-05, + "loss": 2.1843, + "mean_token_accuracy": 0.4460979998111725, + "step": 78655 + }, + { + "epoch": 0.07922718743484607, + "grad_norm": 11.061208132985234, + "learning_rate": 4.989499416031745e-05, + "loss": 2.7445, + "mean_token_accuracy": 0.4103448331356049, + "step": 78660 + }, + { + "epoch": 0.07923222348795024, + "grad_norm": 13.287697380642538, + "learning_rate": 4.989495799768889e-05, + "loss": 2.7283, + "mean_token_accuracy": 0.44271021485328677, + "step": 78665 + }, + { + "epoch": 0.07923725954105441, + "grad_norm": 13.057654779275143, + "learning_rate": 4.9894921828849e-05, + "loss": 2.7293, + "mean_token_accuracy": 0.3875983119010925, + "step": 78670 + }, + { + "epoch": 0.07924229559415859, + "grad_norm": 12.87269880187759, + "learning_rate": 4.98948856537978e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.4562807857990265, + "step": 78675 + }, + { + "epoch": 0.07924733164726275, + "grad_norm": 11.001155470594274, + "learning_rate": 4.989484947253529e-05, + "loss": 2.5103, + "mean_token_accuracy": 0.38620689511299133, + "step": 78680 + }, + { + "epoch": 0.07925236770036692, + "grad_norm": 12.85507519341154, + "learning_rate": 4.989481328506148e-05, + "loss": 2.3292, + "mean_token_accuracy": 0.4448275864124298, + "step": 78685 + }, + { + "epoch": 0.0792574037534711, + "grad_norm": 10.873968555930258, + "learning_rate": 4.989477709137639e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.45862069725990295, + "step": 78690 + }, + { + "epoch": 0.07926243980657527, + "grad_norm": 9.120444015320231, + "learning_rate": 4.989474089148002e-05, + "loss": 2.4746, + "mean_token_accuracy": 0.4344827592372894, + "step": 78695 + }, + { + "epoch": 0.07926747585967944, + "grad_norm": 11.689257984390649, + "learning_rate": 4.989470468537239e-05, + "loss": 2.3675, + "mean_token_accuracy": 0.44827585220336913, + "step": 78700 + }, + { + "epoch": 0.07927251191278362, + "grad_norm": 15.050799163266042, + "learning_rate": 4.989466847305351e-05, + "loss": 3.0557, + "mean_token_accuracy": 0.3448275804519653, + "step": 78705 + }, + { + "epoch": 0.07927754796588779, + "grad_norm": 10.952844709093384, + "learning_rate": 4.989463225452338e-05, + "loss": 2.3942, + "mean_token_accuracy": 0.4457350313663483, + "step": 78710 + }, + { + "epoch": 0.07928258401899196, + "grad_norm": 9.914772054164848, + "learning_rate": 4.9894596029782015e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.4086509346961975, + "step": 78715 + }, + { + "epoch": 0.07928762007209614, + "grad_norm": 12.307315501031175, + "learning_rate": 4.989455979882943e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.37931033968925476, + "step": 78720 + }, + { + "epoch": 0.07929265612520031, + "grad_norm": 10.282622224261065, + "learning_rate": 4.989452356166562e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.4482758641242981, + "step": 78725 + }, + { + "epoch": 0.07929769217830449, + "grad_norm": 17.520930150895307, + "learning_rate": 4.989448731829062e-05, + "loss": 2.7802, + "mean_token_accuracy": 0.3793103337287903, + "step": 78730 + }, + { + "epoch": 0.07930272823140866, + "grad_norm": 13.783355895095683, + "learning_rate": 4.9894451068704415e-05, + "loss": 2.5909, + "mean_token_accuracy": 0.3586206793785095, + "step": 78735 + }, + { + "epoch": 0.07930776428451283, + "grad_norm": 10.399493469513057, + "learning_rate": 4.9894414812907026e-05, + "loss": 2.2132, + "mean_token_accuracy": 0.4344827592372894, + "step": 78740 + }, + { + "epoch": 0.079312800337617, + "grad_norm": 11.531716203666184, + "learning_rate": 4.989437855089846e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.42413792610168455, + "step": 78745 + }, + { + "epoch": 0.07931783639072117, + "grad_norm": 11.95798148547566, + "learning_rate": 4.9894342282678736e-05, + "loss": 2.2443, + "mean_token_accuracy": 0.4533575356006622, + "step": 78750 + }, + { + "epoch": 0.07932287244382534, + "grad_norm": 13.766153512752494, + "learning_rate": 4.989430600824786e-05, + "loss": 2.9394, + "mean_token_accuracy": 0.39310343861579894, + "step": 78755 + }, + { + "epoch": 0.07932790849692951, + "grad_norm": 10.605239506360876, + "learning_rate": 4.989426972760583e-05, + "loss": 2.8322, + "mean_token_accuracy": 0.358620685338974, + "step": 78760 + }, + { + "epoch": 0.07933294455003369, + "grad_norm": 12.716464716503063, + "learning_rate": 4.989423344075267e-05, + "loss": 2.2988, + "mean_token_accuracy": 0.42413793206214906, + "step": 78765 + }, + { + "epoch": 0.07933798060313786, + "grad_norm": 10.853243913568248, + "learning_rate": 4.9894197147688386e-05, + "loss": 2.5564, + "mean_token_accuracy": 0.3965517163276672, + "step": 78770 + }, + { + "epoch": 0.07934301665624204, + "grad_norm": 19.491684384785483, + "learning_rate": 4.9894160848412986e-05, + "loss": 2.6944, + "mean_token_accuracy": 0.3620689630508423, + "step": 78775 + }, + { + "epoch": 0.07934805270934621, + "grad_norm": 12.154723347879697, + "learning_rate": 4.989412454292649e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.4538415014743805, + "step": 78780 + }, + { + "epoch": 0.07935308876245038, + "grad_norm": 10.741446964274648, + "learning_rate": 4.989408823122889e-05, + "loss": 2.7048, + "mean_token_accuracy": 0.40689654350280763, + "step": 78785 + }, + { + "epoch": 0.07935812481555456, + "grad_norm": 11.029414854368818, + "learning_rate": 4.9894051913320214e-05, + "loss": 2.8098, + "mean_token_accuracy": 0.3896551728248596, + "step": 78790 + }, + { + "epoch": 0.07936316086865873, + "grad_norm": 12.150412561726041, + "learning_rate": 4.989401558920046e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.4034482777118683, + "step": 78795 + }, + { + "epoch": 0.0793681969217629, + "grad_norm": 11.101015916840135, + "learning_rate": 4.989397925886964e-05, + "loss": 2.7468, + "mean_token_accuracy": 0.4103448212146759, + "step": 78800 + }, + { + "epoch": 0.07937323297486708, + "grad_norm": 12.009185015198462, + "learning_rate": 4.989394292232777e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.4275257170200348, + "step": 78805 + }, + { + "epoch": 0.07937826902797125, + "grad_norm": 10.455358762100778, + "learning_rate": 4.989390657957486e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.4103448331356049, + "step": 78810 + }, + { + "epoch": 0.07938330508107543, + "grad_norm": 9.961001676297018, + "learning_rate": 4.9893870230610905e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.4034482777118683, + "step": 78815 + }, + { + "epoch": 0.07938834113417959, + "grad_norm": 12.877451003272213, + "learning_rate": 4.989383387543593e-05, + "loss": 2.5086, + "mean_token_accuracy": 0.42413793206214906, + "step": 78820 + }, + { + "epoch": 0.07939337718728376, + "grad_norm": 9.07781812412281, + "learning_rate": 4.989379751404995e-05, + "loss": 2.5926, + "mean_token_accuracy": 0.43448275327682495, + "step": 78825 + }, + { + "epoch": 0.07939841324038793, + "grad_norm": 10.865417837493823, + "learning_rate": 4.989376114645296e-05, + "loss": 2.133, + "mean_token_accuracy": 0.4241379380226135, + "step": 78830 + }, + { + "epoch": 0.0794034492934921, + "grad_norm": 11.610448477741116, + "learning_rate": 4.989372477264498e-05, + "loss": 2.3317, + "mean_token_accuracy": 0.47586206793785096, + "step": 78835 + }, + { + "epoch": 0.07940848534659628, + "grad_norm": 10.720084598734902, + "learning_rate": 4.9893688392626016e-05, + "loss": 2.189, + "mean_token_accuracy": 0.46551724672317507, + "step": 78840 + }, + { + "epoch": 0.07941352139970045, + "grad_norm": 8.9643043953185, + "learning_rate": 4.989365200639608e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.48349754214286805, + "step": 78845 + }, + { + "epoch": 0.07941855745280463, + "grad_norm": 9.640619694269047, + "learning_rate": 4.989361561395518e-05, + "loss": 2.299, + "mean_token_accuracy": 0.4034482777118683, + "step": 78850 + }, + { + "epoch": 0.0794235935059088, + "grad_norm": 22.134719662697403, + "learning_rate": 4.989357921530333e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.4551724076271057, + "step": 78855 + }, + { + "epoch": 0.07942862955901298, + "grad_norm": 9.35638709776434, + "learning_rate": 4.989354281044052e-05, + "loss": 2.1153, + "mean_token_accuracy": 0.4379310429096222, + "step": 78860 + }, + { + "epoch": 0.07943366561211715, + "grad_norm": 9.469346322042352, + "learning_rate": 4.98935063993668e-05, + "loss": 2.5177, + "mean_token_accuracy": 0.43103448748588563, + "step": 78865 + }, + { + "epoch": 0.07943870166522132, + "grad_norm": 11.22746914136466, + "learning_rate": 4.9893469982082154e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.3862068891525269, + "step": 78870 + }, + { + "epoch": 0.0794437377183255, + "grad_norm": 10.072645815832288, + "learning_rate": 4.9893433558586586e-05, + "loss": 2.2005, + "mean_token_accuracy": 0.42262552976608275, + "step": 78875 + }, + { + "epoch": 0.07944877377142967, + "grad_norm": 11.779955280243406, + "learning_rate": 4.989339712888012e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4572897791862488, + "step": 78880 + }, + { + "epoch": 0.07945380982453384, + "grad_norm": 9.609052843104465, + "learning_rate": 4.989336069296276e-05, + "loss": 2.7638, + "mean_token_accuracy": 0.4000000089406967, + "step": 78885 + }, + { + "epoch": 0.079458845877638, + "grad_norm": 9.16755332617589, + "learning_rate": 4.989332425083452e-05, + "loss": 2.402, + "mean_token_accuracy": 0.4517241299152374, + "step": 78890 + }, + { + "epoch": 0.07946388193074218, + "grad_norm": 11.664351058790395, + "learning_rate": 4.989328780249541e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.4172413766384125, + "step": 78895 + }, + { + "epoch": 0.07946891798384635, + "grad_norm": 12.990235074942706, + "learning_rate": 4.989325134794543e-05, + "loss": 2.6636, + "mean_token_accuracy": 0.39310344159603117, + "step": 78900 + }, + { + "epoch": 0.07947395403695053, + "grad_norm": 10.37226894374493, + "learning_rate": 4.989321488718461e-05, + "loss": 2.0345, + "mean_token_accuracy": 0.46551724076271056, + "step": 78905 + }, + { + "epoch": 0.0794789900900547, + "grad_norm": 8.524706626251902, + "learning_rate": 4.9893178420212935e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.42413793206214906, + "step": 78910 + }, + { + "epoch": 0.07948402614315887, + "grad_norm": 9.80941971775653, + "learning_rate": 4.989314194703044e-05, + "loss": 2.4988, + "mean_token_accuracy": 0.39310344457626345, + "step": 78915 + }, + { + "epoch": 0.07948906219626305, + "grad_norm": 14.686539685166338, + "learning_rate": 4.989310546763712e-05, + "loss": 3.1782, + "mean_token_accuracy": 0.35862069129943847, + "step": 78920 + }, + { + "epoch": 0.07949409824936722, + "grad_norm": 11.711138792220204, + "learning_rate": 4.989306898203299e-05, + "loss": 2.4634, + "mean_token_accuracy": 0.42256503701210024, + "step": 78925 + }, + { + "epoch": 0.0794991343024714, + "grad_norm": 9.64314148430209, + "learning_rate": 4.9893032490218054e-05, + "loss": 2.2562, + "mean_token_accuracy": 0.4517241358757019, + "step": 78930 + }, + { + "epoch": 0.07950417035557557, + "grad_norm": 12.987007564596025, + "learning_rate": 4.989299599219233e-05, + "loss": 2.9858, + "mean_token_accuracy": 0.3839080512523651, + "step": 78935 + }, + { + "epoch": 0.07950920640867974, + "grad_norm": 9.625225588429506, + "learning_rate": 4.989295948795582e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.3758620709180832, + "step": 78940 + }, + { + "epoch": 0.07951424246178392, + "grad_norm": 10.73914048859907, + "learning_rate": 4.9892922977508545e-05, + "loss": 2.5297, + "mean_token_accuracy": 0.4103448212146759, + "step": 78945 + }, + { + "epoch": 0.07951927851488809, + "grad_norm": 10.098809545148274, + "learning_rate": 4.989288646085051e-05, + "loss": 2.1728, + "mean_token_accuracy": 0.4379310369491577, + "step": 78950 + }, + { + "epoch": 0.07952431456799226, + "grad_norm": 16.568157861316664, + "learning_rate": 4.9892849937981726e-05, + "loss": 2.5212, + "mean_token_accuracy": 0.38620689511299133, + "step": 78955 + }, + { + "epoch": 0.07952935062109642, + "grad_norm": 12.32224581310678, + "learning_rate": 4.98928134089022e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.4724137902259827, + "step": 78960 + }, + { + "epoch": 0.0795343866742006, + "grad_norm": 10.195108793425687, + "learning_rate": 4.989277687361193e-05, + "loss": 2.9423, + "mean_token_accuracy": 0.3448275804519653, + "step": 78965 + }, + { + "epoch": 0.07953942272730477, + "grad_norm": 10.731613936504187, + "learning_rate": 4.989274033211096e-05, + "loss": 2.7249, + "mean_token_accuracy": 0.40344828367233276, + "step": 78970 + }, + { + "epoch": 0.07954445878040894, + "grad_norm": 12.180209031044248, + "learning_rate": 4.989270378439928e-05, + "loss": 2.3179, + "mean_token_accuracy": 0.44482759237289426, + "step": 78975 + }, + { + "epoch": 0.07954949483351312, + "grad_norm": 12.780501983801642, + "learning_rate": 4.9892667230476886e-05, + "loss": 2.425, + "mean_token_accuracy": 0.4448275864124298, + "step": 78980 + }, + { + "epoch": 0.07955453088661729, + "grad_norm": 12.066920059026836, + "learning_rate": 4.9892630670343805e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.42758620977401735, + "step": 78985 + }, + { + "epoch": 0.07955956693972147, + "grad_norm": 10.467678043016184, + "learning_rate": 4.989259410400005e-05, + "loss": 2.4125, + "mean_token_accuracy": 0.45517241954803467, + "step": 78990 + }, + { + "epoch": 0.07956460299282564, + "grad_norm": 12.494793377844234, + "learning_rate": 4.9892557531445615e-05, + "loss": 2.9205, + "mean_token_accuracy": 0.3655172407627106, + "step": 78995 + }, + { + "epoch": 0.07956963904592981, + "grad_norm": 10.26614850114355, + "learning_rate": 4.9892520952680526e-05, + "loss": 2.6883, + "mean_token_accuracy": 0.4188142716884613, + "step": 79000 + }, + { + "epoch": 0.07957467509903399, + "grad_norm": 11.361967622074925, + "learning_rate": 4.9892484367704795e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.4, + "step": 79005 + }, + { + "epoch": 0.07957971115213816, + "grad_norm": 11.918631573846808, + "learning_rate": 4.989244777651842e-05, + "loss": 2.7012, + "mean_token_accuracy": 0.3827586263418198, + "step": 79010 + }, + { + "epoch": 0.07958474720524233, + "grad_norm": 9.967427030029807, + "learning_rate": 4.989241117912142e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.4310344815254211, + "step": 79015 + }, + { + "epoch": 0.07958978325834651, + "grad_norm": 10.273479260232072, + "learning_rate": 4.989237457551379e-05, + "loss": 2.1678, + "mean_token_accuracy": 0.46896551847457885, + "step": 79020 + }, + { + "epoch": 0.07959481931145068, + "grad_norm": 10.468749290174168, + "learning_rate": 4.989233796569556e-05, + "loss": 2.4507, + "mean_token_accuracy": 0.4206896543502808, + "step": 79025 + }, + { + "epoch": 0.07959985536455484, + "grad_norm": 10.891219009419883, + "learning_rate": 4.989230134966673e-05, + "loss": 2.3346, + "mean_token_accuracy": 0.41034482717514037, + "step": 79030 + }, + { + "epoch": 0.07960489141765902, + "grad_norm": 9.679580535328958, + "learning_rate": 4.989226472742731e-05, + "loss": 2.1463, + "mean_token_accuracy": 0.4569268047809601, + "step": 79035 + }, + { + "epoch": 0.07960992747076319, + "grad_norm": 10.169769202129375, + "learning_rate": 4.989222809897732e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.41034482717514037, + "step": 79040 + }, + { + "epoch": 0.07961496352386736, + "grad_norm": 10.606880381568496, + "learning_rate": 4.989219146431675e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.4068965554237366, + "step": 79045 + }, + { + "epoch": 0.07961999957697154, + "grad_norm": 10.666109848881554, + "learning_rate": 4.989215482344563e-05, + "loss": 2.5997, + "mean_token_accuracy": 0.3896551787853241, + "step": 79050 + }, + { + "epoch": 0.07962503563007571, + "grad_norm": 13.305649768803201, + "learning_rate": 4.989211817636396e-05, + "loss": 2.9108, + "mean_token_accuracy": 0.3448275804519653, + "step": 79055 + }, + { + "epoch": 0.07963007168317988, + "grad_norm": 10.28781482381563, + "learning_rate": 4.989208152307175e-05, + "loss": 2.7319, + "mean_token_accuracy": 0.358620685338974, + "step": 79060 + }, + { + "epoch": 0.07963510773628406, + "grad_norm": 9.357041371563932, + "learning_rate": 4.989204486356901e-05, + "loss": 2.3215, + "mean_token_accuracy": 0.42413792610168455, + "step": 79065 + }, + { + "epoch": 0.07964014378938823, + "grad_norm": 12.330352138985358, + "learning_rate": 4.989200819785576e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.3655172407627106, + "step": 79070 + }, + { + "epoch": 0.0796451798424924, + "grad_norm": 11.604326180890489, + "learning_rate": 4.9891971525932e-05, + "loss": 2.5581, + "mean_token_accuracy": 0.3896551728248596, + "step": 79075 + }, + { + "epoch": 0.07965021589559658, + "grad_norm": 11.880013785101163, + "learning_rate": 4.989193484779775e-05, + "loss": 2.3486, + "mean_token_accuracy": 0.46733213067054746, + "step": 79080 + }, + { + "epoch": 0.07965525194870075, + "grad_norm": 12.204031395575958, + "learning_rate": 4.989189816345301e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.37241379618644715, + "step": 79085 + }, + { + "epoch": 0.07966028800180493, + "grad_norm": 11.249298395271282, + "learning_rate": 4.9891861472897785e-05, + "loss": 2.4101, + "mean_token_accuracy": 0.41034482717514037, + "step": 79090 + }, + { + "epoch": 0.0796653240549091, + "grad_norm": 11.409319515844729, + "learning_rate": 4.98918247761321e-05, + "loss": 2.342, + "mean_token_accuracy": 0.4586206912994385, + "step": 79095 + }, + { + "epoch": 0.07967036010801326, + "grad_norm": 14.619813850222037, + "learning_rate": 4.989178807315596e-05, + "loss": 2.5412, + "mean_token_accuracy": 0.41379310488700866, + "step": 79100 + }, + { + "epoch": 0.07967539616111743, + "grad_norm": 11.591505544280942, + "learning_rate": 4.989175136396937e-05, + "loss": 2.461, + "mean_token_accuracy": 0.4310344815254211, + "step": 79105 + }, + { + "epoch": 0.07968043221422161, + "grad_norm": 12.188130524149408, + "learning_rate": 4.989171464857235e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.42413793206214906, + "step": 79110 + }, + { + "epoch": 0.07968546826732578, + "grad_norm": 10.025258511065248, + "learning_rate": 4.9891677926964905e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.42413793206214906, + "step": 79115 + }, + { + "epoch": 0.07969050432042996, + "grad_norm": 10.353772316045388, + "learning_rate": 4.9891641199147035e-05, + "loss": 2.2524, + "mean_token_accuracy": 0.417241370677948, + "step": 79120 + }, + { + "epoch": 0.07969554037353413, + "grad_norm": 10.598122069498606, + "learning_rate": 4.989160446511877e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.3896551698446274, + "step": 79125 + }, + { + "epoch": 0.0797005764266383, + "grad_norm": 16.909737503054817, + "learning_rate": 4.989156772488011e-05, + "loss": 2.641, + "mean_token_accuracy": 0.4206896543502808, + "step": 79130 + }, + { + "epoch": 0.07970561247974248, + "grad_norm": 13.120796250005803, + "learning_rate": 4.989153097843105e-05, + "loss": 2.8636, + "mean_token_accuracy": 0.3620689570903778, + "step": 79135 + }, + { + "epoch": 0.07971064853284665, + "grad_norm": 10.786061274805299, + "learning_rate": 4.989149422577162e-05, + "loss": 2.3404, + "mean_token_accuracy": 0.43623715043067934, + "step": 79140 + }, + { + "epoch": 0.07971568458595082, + "grad_norm": 10.297845580415794, + "learning_rate": 4.989145746690184e-05, + "loss": 2.5618, + "mean_token_accuracy": 0.42413793206214906, + "step": 79145 + }, + { + "epoch": 0.079720720639055, + "grad_norm": 11.605618769114415, + "learning_rate": 4.9891420701821694e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.4538415014743805, + "step": 79150 + }, + { + "epoch": 0.07972575669215917, + "grad_norm": 13.484618283359, + "learning_rate": 4.9891383930531214e-05, + "loss": 2.6268, + "mean_token_accuracy": 0.38275861740112305, + "step": 79155 + }, + { + "epoch": 0.07973079274526335, + "grad_norm": 12.150237886939786, + "learning_rate": 4.989134715303039e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.3827586233615875, + "step": 79160 + }, + { + "epoch": 0.07973582879836752, + "grad_norm": 12.327805385403058, + "learning_rate": 4.989131036931924e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.39655172228813174, + "step": 79165 + }, + { + "epoch": 0.07974086485147168, + "grad_norm": 11.228189666213435, + "learning_rate": 4.989127357939779e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.42232305407524107, + "step": 79170 + }, + { + "epoch": 0.07974590090457585, + "grad_norm": 15.766313352953114, + "learning_rate": 4.989123678326603e-05, + "loss": 2.9528, + "mean_token_accuracy": 0.3551724076271057, + "step": 79175 + }, + { + "epoch": 0.07975093695768003, + "grad_norm": 14.661573875554476, + "learning_rate": 4.989119998092397e-05, + "loss": 2.7111, + "mean_token_accuracy": 0.4482758641242981, + "step": 79180 + }, + { + "epoch": 0.0797559730107842, + "grad_norm": 10.27537912167993, + "learning_rate": 4.989116317237164e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.41379310488700866, + "step": 79185 + }, + { + "epoch": 0.07976100906388837, + "grad_norm": 9.666719547046183, + "learning_rate": 4.989112635760902e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.3999999940395355, + "step": 79190 + }, + { + "epoch": 0.07976604511699255, + "grad_norm": 12.82334278739967, + "learning_rate": 4.9891089536636156e-05, + "loss": 2.277, + "mean_token_accuracy": 0.4586206912994385, + "step": 79195 + }, + { + "epoch": 0.07977108117009672, + "grad_norm": 12.451765225044706, + "learning_rate": 4.989105270945303e-05, + "loss": 2.3536, + "mean_token_accuracy": 0.4620689630508423, + "step": 79200 + }, + { + "epoch": 0.0797761172232009, + "grad_norm": 12.061392168171809, + "learning_rate": 4.9891015876059663e-05, + "loss": 2.5005, + "mean_token_accuracy": 0.4206896543502808, + "step": 79205 + }, + { + "epoch": 0.07978115327630507, + "grad_norm": 11.569599215930866, + "learning_rate": 4.9890979036456064e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.3862068891525269, + "step": 79210 + }, + { + "epoch": 0.07978618932940924, + "grad_norm": 10.922474577070293, + "learning_rate": 4.989094219064225e-05, + "loss": 2.4825, + "mean_token_accuracy": 0.41379310488700866, + "step": 79215 + }, + { + "epoch": 0.07979122538251342, + "grad_norm": 9.916546552249, + "learning_rate": 4.989090533861821e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.3999999940395355, + "step": 79220 + }, + { + "epoch": 0.07979626143561759, + "grad_norm": 9.326509209526781, + "learning_rate": 4.989086848038398e-05, + "loss": 2.5372, + "mean_token_accuracy": 0.3965517163276672, + "step": 79225 + }, + { + "epoch": 0.07980129748872176, + "grad_norm": 11.621949220131109, + "learning_rate": 4.989083161593956e-05, + "loss": 2.362, + "mean_token_accuracy": 0.42413792610168455, + "step": 79230 + }, + { + "epoch": 0.07980633354182594, + "grad_norm": 11.158118043453529, + "learning_rate": 4.9890794745284955e-05, + "loss": 2.6022, + "mean_token_accuracy": 0.4137930989265442, + "step": 79235 + }, + { + "epoch": 0.0798113695949301, + "grad_norm": 11.61270927968363, + "learning_rate": 4.989075786842018e-05, + "loss": 2.6715, + "mean_token_accuracy": 0.4137930989265442, + "step": 79240 + }, + { + "epoch": 0.07981640564803427, + "grad_norm": 13.703164591070829, + "learning_rate": 4.989072098534524e-05, + "loss": 2.915, + "mean_token_accuracy": 0.34827586710453035, + "step": 79245 + }, + { + "epoch": 0.07982144170113845, + "grad_norm": 15.24638430762623, + "learning_rate": 4.9890684096060166e-05, + "loss": 3.1375, + "mean_token_accuracy": 0.3310344725847244, + "step": 79250 + }, + { + "epoch": 0.07982647775424262, + "grad_norm": 9.540202790414654, + "learning_rate": 4.989064720056494e-05, + "loss": 2.6545, + "mean_token_accuracy": 0.42413793206214906, + "step": 79255 + }, + { + "epoch": 0.0798315138073468, + "grad_norm": 10.596690551745844, + "learning_rate": 4.989061029885959e-05, + "loss": 2.7929, + "mean_token_accuracy": 0.38965516686439516, + "step": 79260 + }, + { + "epoch": 0.07983654986045097, + "grad_norm": 9.936718320036045, + "learning_rate": 4.989057339094412e-05, + "loss": 2.6504, + "mean_token_accuracy": 0.3758620709180832, + "step": 79265 + }, + { + "epoch": 0.07984158591355514, + "grad_norm": 11.048165294774309, + "learning_rate": 4.989053647681853e-05, + "loss": 2.5218, + "mean_token_accuracy": 0.39310344457626345, + "step": 79270 + }, + { + "epoch": 0.07984662196665931, + "grad_norm": 10.04943602513441, + "learning_rate": 4.989049955648285e-05, + "loss": 2.6439, + "mean_token_accuracy": 0.37241379022598264, + "step": 79275 + }, + { + "epoch": 0.07985165801976349, + "grad_norm": 8.82320170070897, + "learning_rate": 4.989046262993708e-05, + "loss": 2.4372, + "mean_token_accuracy": 0.41929823756217954, + "step": 79280 + }, + { + "epoch": 0.07985669407286766, + "grad_norm": 11.795950361431165, + "learning_rate": 4.989042569718124e-05, + "loss": 2.707, + "mean_token_accuracy": 0.34827586114406583, + "step": 79285 + }, + { + "epoch": 0.07986173012597184, + "grad_norm": 16.722107412778513, + "learning_rate": 4.989038875821532e-05, + "loss": 2.794, + "mean_token_accuracy": 0.3793103456497192, + "step": 79290 + }, + { + "epoch": 0.07986676617907601, + "grad_norm": 11.847275538094214, + "learning_rate": 4.989035181303935e-05, + "loss": 3.0336, + "mean_token_accuracy": 0.39310343861579894, + "step": 79295 + }, + { + "epoch": 0.07987180223218018, + "grad_norm": 11.73960278611399, + "learning_rate": 4.989031486165332e-05, + "loss": 2.72, + "mean_token_accuracy": 0.334482753276825, + "step": 79300 + }, + { + "epoch": 0.07987683828528436, + "grad_norm": 10.140944938298517, + "learning_rate": 4.989027790405727e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4103448331356049, + "step": 79305 + }, + { + "epoch": 0.07988187433838852, + "grad_norm": 10.651185281269854, + "learning_rate": 4.989024094025118e-05, + "loss": 2.802, + "mean_token_accuracy": 0.3482758581638336, + "step": 79310 + }, + { + "epoch": 0.07988691039149269, + "grad_norm": 10.065431122913454, + "learning_rate": 4.9890203970235076e-05, + "loss": 2.7406, + "mean_token_accuracy": 0.36896551847457887, + "step": 79315 + }, + { + "epoch": 0.07989194644459686, + "grad_norm": 15.532866536386312, + "learning_rate": 4.989016699400897e-05, + "loss": 3.0697, + "mean_token_accuracy": 0.324137932062149, + "step": 79320 + }, + { + "epoch": 0.07989698249770104, + "grad_norm": 11.036025609573155, + "learning_rate": 4.989013001157286e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.41167573928833007, + "step": 79325 + }, + { + "epoch": 0.07990201855080521, + "grad_norm": 12.862779580150415, + "learning_rate": 4.989009302292677e-05, + "loss": 2.795, + "mean_token_accuracy": 0.3689655244350433, + "step": 79330 + }, + { + "epoch": 0.07990705460390939, + "grad_norm": 12.042692920864773, + "learning_rate": 4.9890056028070704e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.36896551847457887, + "step": 79335 + }, + { + "epoch": 0.07991209065701356, + "grad_norm": 10.637622421730365, + "learning_rate": 4.9890019027004675e-05, + "loss": 2.8021, + "mean_token_accuracy": 0.4034482777118683, + "step": 79340 + }, + { + "epoch": 0.07991712671011773, + "grad_norm": 11.788736092348069, + "learning_rate": 4.988998201972868e-05, + "loss": 2.449, + "mean_token_accuracy": 0.3999999940395355, + "step": 79345 + }, + { + "epoch": 0.07992216276322191, + "grad_norm": 10.888020666608107, + "learning_rate": 4.9889945006242756e-05, + "loss": 2.3412, + "mean_token_accuracy": 0.458620685338974, + "step": 79350 + }, + { + "epoch": 0.07992719881632608, + "grad_norm": 12.733626524190512, + "learning_rate": 4.988990798654689e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.42413792610168455, + "step": 79355 + }, + { + "epoch": 0.07993223486943025, + "grad_norm": 10.538600366181154, + "learning_rate": 4.9889870960641095e-05, + "loss": 2.2895, + "mean_token_accuracy": 0.43448275327682495, + "step": 79360 + }, + { + "epoch": 0.07993727092253443, + "grad_norm": 9.958773640758265, + "learning_rate": 4.988983392852538e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.4482758641242981, + "step": 79365 + }, + { + "epoch": 0.0799423069756386, + "grad_norm": 12.862544927659476, + "learning_rate": 4.988979689019977e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.45862067937850953, + "step": 79370 + }, + { + "epoch": 0.07994734302874278, + "grad_norm": 12.494765187812574, + "learning_rate": 4.988975984566427e-05, + "loss": 2.8422, + "mean_token_accuracy": 0.2931034505367279, + "step": 79375 + }, + { + "epoch": 0.07995237908184694, + "grad_norm": 13.62324926522192, + "learning_rate": 4.9889722794918884e-05, + "loss": 2.5984, + "mean_token_accuracy": 0.43793103098869324, + "step": 79380 + }, + { + "epoch": 0.07995741513495111, + "grad_norm": 12.306840764973211, + "learning_rate": 4.9889685737963615e-05, + "loss": 2.4794, + "mean_token_accuracy": 0.4413793087005615, + "step": 79385 + }, + { + "epoch": 0.07996245118805528, + "grad_norm": 12.593377958422776, + "learning_rate": 4.9889648674798496e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.43599515557289126, + "step": 79390 + }, + { + "epoch": 0.07996748724115946, + "grad_norm": 15.03936765847017, + "learning_rate": 4.988961160542352e-05, + "loss": 2.1961, + "mean_token_accuracy": 0.44827585220336913, + "step": 79395 + }, + { + "epoch": 0.07997252329426363, + "grad_norm": 12.81936495222095, + "learning_rate": 4.98895745298387e-05, + "loss": 2.9628, + "mean_token_accuracy": 0.3241379290819168, + "step": 79400 + }, + { + "epoch": 0.0799775593473678, + "grad_norm": 15.82207117024875, + "learning_rate": 4.988953744804405e-05, + "loss": 2.798, + "mean_token_accuracy": 0.3896551728248596, + "step": 79405 + }, + { + "epoch": 0.07998259540047198, + "grad_norm": 15.50903712450827, + "learning_rate": 4.988950036003958e-05, + "loss": 2.7759, + "mean_token_accuracy": 0.3724137842655182, + "step": 79410 + }, + { + "epoch": 0.07998763145357615, + "grad_norm": 10.24700952681741, + "learning_rate": 4.98894632658253e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.4068965554237366, + "step": 79415 + }, + { + "epoch": 0.07999266750668033, + "grad_norm": 11.702107205661529, + "learning_rate": 4.988942616540121e-05, + "loss": 2.5464, + "mean_token_accuracy": 0.42413793206214906, + "step": 79420 + }, + { + "epoch": 0.0799977035597845, + "grad_norm": 9.62508201514851, + "learning_rate": 4.9889389058767336e-05, + "loss": 2.616, + "mean_token_accuracy": 0.4068965554237366, + "step": 79425 + }, + { + "epoch": 0.08000273961288867, + "grad_norm": 10.912720722740826, + "learning_rate": 4.988935194592368e-05, + "loss": 2.6568, + "mean_token_accuracy": 0.4206896543502808, + "step": 79430 + }, + { + "epoch": 0.08000777566599285, + "grad_norm": 10.91455076097727, + "learning_rate": 4.988931482687025e-05, + "loss": 2.487, + "mean_token_accuracy": 0.45045371651649474, + "step": 79435 + }, + { + "epoch": 0.08001281171909702, + "grad_norm": 10.86007851545835, + "learning_rate": 4.988927770160706e-05, + "loss": 2.4039, + "mean_token_accuracy": 0.4379310369491577, + "step": 79440 + }, + { + "epoch": 0.0800178477722012, + "grad_norm": 10.90186781334332, + "learning_rate": 4.988924057013413e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.3551724135875702, + "step": 79445 + }, + { + "epoch": 0.08002288382530535, + "grad_norm": 9.539891858373304, + "learning_rate": 4.988920343245145e-05, + "loss": 2.5775, + "mean_token_accuracy": 0.42413793206214906, + "step": 79450 + }, + { + "epoch": 0.08002791987840953, + "grad_norm": 10.036409760452624, + "learning_rate": 4.988916628855905e-05, + "loss": 2.3667, + "mean_token_accuracy": 0.4068965554237366, + "step": 79455 + }, + { + "epoch": 0.0800329559315137, + "grad_norm": 12.020411914002882, + "learning_rate": 4.988912913845692e-05, + "loss": 2.4363, + "mean_token_accuracy": 0.4103448212146759, + "step": 79460 + }, + { + "epoch": 0.08003799198461788, + "grad_norm": 11.553561959101906, + "learning_rate": 4.988909198214509e-05, + "loss": 2.5985, + "mean_token_accuracy": 0.4068965554237366, + "step": 79465 + }, + { + "epoch": 0.08004302803772205, + "grad_norm": 12.018691851414367, + "learning_rate": 4.988905481962356e-05, + "loss": 2.5504, + "mean_token_accuracy": 0.4310344815254211, + "step": 79470 + }, + { + "epoch": 0.08004806409082622, + "grad_norm": 10.79057088037424, + "learning_rate": 4.988901765089235e-05, + "loss": 2.5433, + "mean_token_accuracy": 0.3724137842655182, + "step": 79475 + }, + { + "epoch": 0.0800531001439304, + "grad_norm": 16.415658349364726, + "learning_rate": 4.988898047595145e-05, + "loss": 2.8936, + "mean_token_accuracy": 0.3793103516101837, + "step": 79480 + }, + { + "epoch": 0.08005813619703457, + "grad_norm": 10.98445964087828, + "learning_rate": 4.988894329480088e-05, + "loss": 2.1526, + "mean_token_accuracy": 0.47241379618644713, + "step": 79485 + }, + { + "epoch": 0.08006317225013874, + "grad_norm": 10.04208022061848, + "learning_rate": 4.988890610744067e-05, + "loss": 2.9845, + "mean_token_accuracy": 0.37241379022598264, + "step": 79490 + }, + { + "epoch": 0.08006820830324292, + "grad_norm": 9.28462111364128, + "learning_rate": 4.9888868913870794e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.4344827592372894, + "step": 79495 + }, + { + "epoch": 0.08007324435634709, + "grad_norm": 13.963385023502035, + "learning_rate": 4.988883171409129e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.3793103516101837, + "step": 79500 + }, + { + "epoch": 0.08007828040945127, + "grad_norm": 11.910005668954845, + "learning_rate": 4.988879450810216e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.41724138259887694, + "step": 79505 + }, + { + "epoch": 0.08008331646255544, + "grad_norm": 10.484470060039783, + "learning_rate": 4.9888757295903416e-05, + "loss": 2.5086, + "mean_token_accuracy": 0.4156079888343811, + "step": 79510 + }, + { + "epoch": 0.08008835251565961, + "grad_norm": 10.622992811141126, + "learning_rate": 4.988872007749507e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.4517241418361664, + "step": 79515 + }, + { + "epoch": 0.08009338856876377, + "grad_norm": 10.542131848954652, + "learning_rate": 4.988868285287712e-05, + "loss": 2.6186, + "mean_token_accuracy": 0.39655172526836396, + "step": 79520 + }, + { + "epoch": 0.08009842462186795, + "grad_norm": 11.752066254907898, + "learning_rate": 4.988864562204959e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.4, + "step": 79525 + }, + { + "epoch": 0.08010346067497212, + "grad_norm": 12.31359622501086, + "learning_rate": 4.988860838501249e-05, + "loss": 2.9548, + "mean_token_accuracy": 0.3896551728248596, + "step": 79530 + }, + { + "epoch": 0.0801084967280763, + "grad_norm": 10.015468910699429, + "learning_rate": 4.988857114176582e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.4689655125141144, + "step": 79535 + }, + { + "epoch": 0.08011353278118047, + "grad_norm": 11.515838320838489, + "learning_rate": 4.98885338923096e-05, + "loss": 2.7304, + "mean_token_accuracy": 0.4206896543502808, + "step": 79540 + }, + { + "epoch": 0.08011856883428464, + "grad_norm": 11.36721479113331, + "learning_rate": 4.988849663664383e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.41724138259887694, + "step": 79545 + }, + { + "epoch": 0.08012360488738882, + "grad_norm": 11.68749132872101, + "learning_rate": 4.988845937476853e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.4172413766384125, + "step": 79550 + }, + { + "epoch": 0.08012864094049299, + "grad_norm": 10.99722820556555, + "learning_rate": 4.9888422106683713e-05, + "loss": 2.2147, + "mean_token_accuracy": 0.4354679763317108, + "step": 79555 + }, + { + "epoch": 0.08013367699359716, + "grad_norm": 10.15516759491134, + "learning_rate": 4.9888384832389385e-05, + "loss": 2.3187, + "mean_token_accuracy": 0.3931034475564957, + "step": 79560 + }, + { + "epoch": 0.08013871304670134, + "grad_norm": 10.90276231219662, + "learning_rate": 4.988834755188555e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.40689656138420105, + "step": 79565 + }, + { + "epoch": 0.08014374909980551, + "grad_norm": 11.89418994604228, + "learning_rate": 4.988831026517222e-05, + "loss": 2.6674, + "mean_token_accuracy": 0.3896551728248596, + "step": 79570 + }, + { + "epoch": 0.08014878515290969, + "grad_norm": 11.53098581794788, + "learning_rate": 4.9888272972249406e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.4103448331356049, + "step": 79575 + }, + { + "epoch": 0.08015382120601386, + "grad_norm": 10.979256057860457, + "learning_rate": 4.988823567311713e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.4566243290901184, + "step": 79580 + }, + { + "epoch": 0.08015885725911803, + "grad_norm": 12.53954995100159, + "learning_rate": 4.988819836777539e-05, + "loss": 2.2688, + "mean_token_accuracy": 0.4482758641242981, + "step": 79585 + }, + { + "epoch": 0.08016389331222219, + "grad_norm": 8.985642105762485, + "learning_rate": 4.98881610562242e-05, + "loss": 2.514, + "mean_token_accuracy": 0.39999998807907106, + "step": 79590 + }, + { + "epoch": 0.08016892936532637, + "grad_norm": 15.280237782500492, + "learning_rate": 4.988812373846357e-05, + "loss": 2.6493, + "mean_token_accuracy": 0.4, + "step": 79595 + }, + { + "epoch": 0.08017396541843054, + "grad_norm": 12.741831699507062, + "learning_rate": 4.988808641449351e-05, + "loss": 2.5508, + "mean_token_accuracy": 0.44137930274009707, + "step": 79600 + }, + { + "epoch": 0.08017900147153471, + "grad_norm": 11.058858186616307, + "learning_rate": 4.988804908431403e-05, + "loss": 3.029, + "mean_token_accuracy": 0.37586206793785093, + "step": 79605 + }, + { + "epoch": 0.08018403752463889, + "grad_norm": 12.225983720457378, + "learning_rate": 4.9888011747925136e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.42758620977401735, + "step": 79610 + }, + { + "epoch": 0.08018907357774306, + "grad_norm": 10.153732164002742, + "learning_rate": 4.988797440532685e-05, + "loss": 2.2537, + "mean_token_accuracy": 0.4396249234676361, + "step": 79615 + }, + { + "epoch": 0.08019410963084724, + "grad_norm": 9.692566389941037, + "learning_rate": 4.9887937056519176e-05, + "loss": 2.6086, + "mean_token_accuracy": 0.36551724672317504, + "step": 79620 + }, + { + "epoch": 0.08019914568395141, + "grad_norm": 11.19721172131227, + "learning_rate": 4.988789970150212e-05, + "loss": 3.1417, + "mean_token_accuracy": 0.3931034505367279, + "step": 79625 + }, + { + "epoch": 0.08020418173705558, + "grad_norm": 10.626421905870574, + "learning_rate": 4.9887862340275696e-05, + "loss": 2.4906, + "mean_token_accuracy": 0.4, + "step": 79630 + }, + { + "epoch": 0.08020921779015976, + "grad_norm": 13.48661091513199, + "learning_rate": 4.9887824972839924e-05, + "loss": 2.4112, + "mean_token_accuracy": 0.417241370677948, + "step": 79635 + }, + { + "epoch": 0.08021425384326393, + "grad_norm": 11.391024371543178, + "learning_rate": 4.98877875991948e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.3896551728248596, + "step": 79640 + }, + { + "epoch": 0.0802192898963681, + "grad_norm": 10.763922938761674, + "learning_rate": 4.988775021934034e-05, + "loss": 3.0357, + "mean_token_accuracy": 0.3137931048870087, + "step": 79645 + }, + { + "epoch": 0.08022432594947228, + "grad_norm": 11.802065843454798, + "learning_rate": 4.988771283327654e-05, + "loss": 2.6795, + "mean_token_accuracy": 0.41724138259887694, + "step": 79650 + }, + { + "epoch": 0.08022936200257645, + "grad_norm": 13.615335567169247, + "learning_rate": 4.9887675441003454e-05, + "loss": 2.8496, + "mean_token_accuracy": 0.4, + "step": 79655 + }, + { + "epoch": 0.08023439805568061, + "grad_norm": 12.75109226458029, + "learning_rate": 4.988763804252104e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.43793103098869324, + "step": 79660 + }, + { + "epoch": 0.08023943410878479, + "grad_norm": 10.966371952175226, + "learning_rate": 4.9887600637829335e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.3999999940395355, + "step": 79665 + }, + { + "epoch": 0.08024447016188896, + "grad_norm": 9.734638172584472, + "learning_rate": 4.9887563226928345e-05, + "loss": 2.1118, + "mean_token_accuracy": 0.47931034564971925, + "step": 79670 + }, + { + "epoch": 0.08024950621499313, + "grad_norm": 10.442486615425338, + "learning_rate": 4.988752580981809e-05, + "loss": 2.5411, + "mean_token_accuracy": 0.3655172407627106, + "step": 79675 + }, + { + "epoch": 0.0802545422680973, + "grad_norm": 12.56943107664165, + "learning_rate": 4.9887488386498566e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.41034482717514037, + "step": 79680 + }, + { + "epoch": 0.08025957832120148, + "grad_norm": 12.139026560210146, + "learning_rate": 4.988745095696978e-05, + "loss": 2.7677, + "mean_token_accuracy": 0.36896551847457887, + "step": 79685 + }, + { + "epoch": 0.08026461437430565, + "grad_norm": 10.783844072848778, + "learning_rate": 4.9887413521231755e-05, + "loss": 2.1863, + "mean_token_accuracy": 0.47241379618644713, + "step": 79690 + }, + { + "epoch": 0.08026965042740983, + "grad_norm": 12.714070347197381, + "learning_rate": 4.98873760792845e-05, + "loss": 2.8798, + "mean_token_accuracy": 0.39999999701976774, + "step": 79695 + }, + { + "epoch": 0.080274686480514, + "grad_norm": 12.097323280906638, + "learning_rate": 4.988733863112803e-05, + "loss": 2.8448, + "mean_token_accuracy": 0.3586206793785095, + "step": 79700 + }, + { + "epoch": 0.08027972253361818, + "grad_norm": 11.318475823153298, + "learning_rate": 4.988730117676234e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.38275861740112305, + "step": 79705 + }, + { + "epoch": 0.08028475858672235, + "grad_norm": 12.342792221412806, + "learning_rate": 4.988726371618745e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.47447065711021424, + "step": 79710 + }, + { + "epoch": 0.08028979463982652, + "grad_norm": 13.079296823306526, + "learning_rate": 4.9887226249403366e-05, + "loss": 3.2312, + "mean_token_accuracy": 0.3034482777118683, + "step": 79715 + }, + { + "epoch": 0.0802948306929307, + "grad_norm": 10.189609894998457, + "learning_rate": 4.9887188776410104e-05, + "loss": 2.212, + "mean_token_accuracy": 0.446059113740921, + "step": 79720 + }, + { + "epoch": 0.08029986674603487, + "grad_norm": 12.880201862628562, + "learning_rate": 4.988715129720767e-05, + "loss": 2.5805, + "mean_token_accuracy": 0.4379310369491577, + "step": 79725 + }, + { + "epoch": 0.08030490279913903, + "grad_norm": 10.701305269857007, + "learning_rate": 4.988711381179608e-05, + "loss": 2.2229, + "mean_token_accuracy": 0.4482758641242981, + "step": 79730 + }, + { + "epoch": 0.0803099388522432, + "grad_norm": 10.74862441620486, + "learning_rate": 4.9887076320175344e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.379310342669487, + "step": 79735 + }, + { + "epoch": 0.08031497490534738, + "grad_norm": 10.315495838198936, + "learning_rate": 4.988703882234546e-05, + "loss": 2.13, + "mean_token_accuracy": 0.4793103516101837, + "step": 79740 + }, + { + "epoch": 0.08032001095845155, + "grad_norm": 9.859496816317955, + "learning_rate": 4.9887001318306445e-05, + "loss": 2.5702, + "mean_token_accuracy": 0.3873563170433044, + "step": 79745 + }, + { + "epoch": 0.08032504701155573, + "grad_norm": 11.39502861035003, + "learning_rate": 4.9886963808058316e-05, + "loss": 2.2766, + "mean_token_accuracy": 0.4482758641242981, + "step": 79750 + }, + { + "epoch": 0.0803300830646599, + "grad_norm": 11.030634730035914, + "learning_rate": 4.988692629160109e-05, + "loss": 2.442, + "mean_token_accuracy": 0.4121597111225128, + "step": 79755 + }, + { + "epoch": 0.08033511911776407, + "grad_norm": 11.330420949228133, + "learning_rate": 4.988688876893475e-05, + "loss": 2.7946, + "mean_token_accuracy": 0.34137930870056155, + "step": 79760 + }, + { + "epoch": 0.08034015517086825, + "grad_norm": 10.644631745597845, + "learning_rate": 4.9886851240059326e-05, + "loss": 2.5511, + "mean_token_accuracy": 0.4310344815254211, + "step": 79765 + }, + { + "epoch": 0.08034519122397242, + "grad_norm": 9.669454591323623, + "learning_rate": 4.9886813704974836e-05, + "loss": 2.8437, + "mean_token_accuracy": 0.38620689511299133, + "step": 79770 + }, + { + "epoch": 0.0803502272770766, + "grad_norm": 11.374484953795273, + "learning_rate": 4.988677616368127e-05, + "loss": 2.2781, + "mean_token_accuracy": 0.4295220851898193, + "step": 79775 + }, + { + "epoch": 0.08035526333018077, + "grad_norm": 9.795831235410121, + "learning_rate": 4.9886738616178646e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.41379310488700866, + "step": 79780 + }, + { + "epoch": 0.08036029938328494, + "grad_norm": 10.655827587640758, + "learning_rate": 4.9886701062466986e-05, + "loss": 2.3407, + "mean_token_accuracy": 0.38965516090393065, + "step": 79785 + }, + { + "epoch": 0.08036533543638912, + "grad_norm": 12.586780844475395, + "learning_rate": 4.9886663502546284e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.39655173420906065, + "step": 79790 + }, + { + "epoch": 0.08037037148949329, + "grad_norm": 11.927960813721993, + "learning_rate": 4.9886625936416566e-05, + "loss": 2.678, + "mean_token_accuracy": 0.3793103516101837, + "step": 79795 + }, + { + "epoch": 0.08037540754259745, + "grad_norm": 9.521429979326083, + "learning_rate": 4.9886588364077825e-05, + "loss": 2.7178, + "mean_token_accuracy": 0.40689654350280763, + "step": 79800 + }, + { + "epoch": 0.08038044359570162, + "grad_norm": 11.899903336030563, + "learning_rate": 4.9886550785530076e-05, + "loss": 2.3666, + "mean_token_accuracy": 0.42068966031074523, + "step": 79805 + }, + { + "epoch": 0.0803854796488058, + "grad_norm": 12.300148921835435, + "learning_rate": 4.9886513200773344e-05, + "loss": 2.2913, + "mean_token_accuracy": 0.4517241299152374, + "step": 79810 + }, + { + "epoch": 0.08039051570190997, + "grad_norm": 10.563158826999986, + "learning_rate": 4.988647560980762e-05, + "loss": 2.2174, + "mean_token_accuracy": 0.42758620381355283, + "step": 79815 + }, + { + "epoch": 0.08039555175501414, + "grad_norm": 10.750558351441953, + "learning_rate": 4.988643801263293e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.43448275327682495, + "step": 79820 + }, + { + "epoch": 0.08040058780811832, + "grad_norm": 14.524846482764712, + "learning_rate": 4.988640040924928e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.3896551728248596, + "step": 79825 + }, + { + "epoch": 0.08040562386122249, + "grad_norm": 12.221590944296963, + "learning_rate": 4.988636279965667e-05, + "loss": 2.8444, + "mean_token_accuracy": 0.36896551847457887, + "step": 79830 + }, + { + "epoch": 0.08041065991432667, + "grad_norm": 12.74631890165887, + "learning_rate": 4.988632518385513e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.42068964838981626, + "step": 79835 + }, + { + "epoch": 0.08041569596743084, + "grad_norm": 11.687276256584504, + "learning_rate": 4.988628756184465e-05, + "loss": 2.7375, + "mean_token_accuracy": 0.3482758641242981, + "step": 79840 + }, + { + "epoch": 0.08042073202053501, + "grad_norm": 14.082311400520085, + "learning_rate": 4.988624993362525e-05, + "loss": 2.2607, + "mean_token_accuracy": 0.493103438615799, + "step": 79845 + }, + { + "epoch": 0.08042576807363919, + "grad_norm": 12.966420308281704, + "learning_rate": 4.988621229919694e-05, + "loss": 2.5812, + "mean_token_accuracy": 0.41034482717514037, + "step": 79850 + }, + { + "epoch": 0.08043080412674336, + "grad_norm": 10.991819187188435, + "learning_rate": 4.988617465855974e-05, + "loss": 2.6653, + "mean_token_accuracy": 0.42758620381355283, + "step": 79855 + }, + { + "epoch": 0.08043584017984753, + "grad_norm": 10.739196316802005, + "learning_rate": 4.9886137011713644e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.41724138259887694, + "step": 79860 + }, + { + "epoch": 0.08044087623295171, + "grad_norm": 12.909467950256174, + "learning_rate": 4.988609935865867e-05, + "loss": 2.5022, + "mean_token_accuracy": 0.4431336998939514, + "step": 79865 + }, + { + "epoch": 0.08044591228605587, + "grad_norm": 11.670864669742258, + "learning_rate": 4.988606169939482e-05, + "loss": 2.6424, + "mean_token_accuracy": 0.4172413766384125, + "step": 79870 + }, + { + "epoch": 0.08045094833916004, + "grad_norm": 15.221183157398855, + "learning_rate": 4.988602403392212e-05, + "loss": 2.7179, + "mean_token_accuracy": 0.3448275804519653, + "step": 79875 + }, + { + "epoch": 0.08045598439226422, + "grad_norm": 11.838323875449849, + "learning_rate": 4.988598636224057e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4344827592372894, + "step": 79880 + }, + { + "epoch": 0.08046102044536839, + "grad_norm": 11.225833823955256, + "learning_rate": 4.988594868435019e-05, + "loss": 2.6866, + "mean_token_accuracy": 0.39310344457626345, + "step": 79885 + }, + { + "epoch": 0.08046605649847256, + "grad_norm": 12.98032907270105, + "learning_rate": 4.988591100025097e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.4206896543502808, + "step": 79890 + }, + { + "epoch": 0.08047109255157674, + "grad_norm": 13.410106224052436, + "learning_rate": 4.988587330994295e-05, + "loss": 2.4293, + "mean_token_accuracy": 0.4000000059604645, + "step": 79895 + }, + { + "epoch": 0.08047612860468091, + "grad_norm": 13.169469317421113, + "learning_rate": 4.988583561342612e-05, + "loss": 2.318, + "mean_token_accuracy": 0.44137930274009707, + "step": 79900 + }, + { + "epoch": 0.08048116465778508, + "grad_norm": 10.486903331143807, + "learning_rate": 4.9885797910700485e-05, + "loss": 2.9473, + "mean_token_accuracy": 0.3844525098800659, + "step": 79905 + }, + { + "epoch": 0.08048620071088926, + "grad_norm": 8.90262439587754, + "learning_rate": 4.9885760201766074e-05, + "loss": 2.435, + "mean_token_accuracy": 0.38275861740112305, + "step": 79910 + }, + { + "epoch": 0.08049123676399343, + "grad_norm": 11.884550774846595, + "learning_rate": 4.988572248662289e-05, + "loss": 2.6421, + "mean_token_accuracy": 0.40344828069210054, + "step": 79915 + }, + { + "epoch": 0.0804962728170976, + "grad_norm": 11.787390357462737, + "learning_rate": 4.988568476527094e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.44137930274009707, + "step": 79920 + }, + { + "epoch": 0.08050130887020178, + "grad_norm": 10.90209612277338, + "learning_rate": 4.9885647037710236e-05, + "loss": 2.3895, + "mean_token_accuracy": 0.41034482717514037, + "step": 79925 + }, + { + "epoch": 0.08050634492330595, + "grad_norm": 11.093734961714159, + "learning_rate": 4.988560930394079e-05, + "loss": 2.5695, + "mean_token_accuracy": 0.3551724135875702, + "step": 79930 + }, + { + "epoch": 0.08051138097641013, + "grad_norm": 11.126401106702302, + "learning_rate": 4.9885571563962617e-05, + "loss": 2.6632, + "mean_token_accuracy": 0.4068965554237366, + "step": 79935 + }, + { + "epoch": 0.08051641702951429, + "grad_norm": 11.105289026821751, + "learning_rate": 4.988553381777571e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.4068965494632721, + "step": 79940 + }, + { + "epoch": 0.08052145308261846, + "grad_norm": 10.380835545502952, + "learning_rate": 4.98854960653801e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.42413792610168455, + "step": 79945 + }, + { + "epoch": 0.08052648913572263, + "grad_norm": 12.144611770987705, + "learning_rate": 4.988545830677579e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.44343616962432864, + "step": 79950 + }, + { + "epoch": 0.08053152518882681, + "grad_norm": 10.389274956117513, + "learning_rate": 4.988542054196279e-05, + "loss": 2.6751, + "mean_token_accuracy": 0.38620689511299133, + "step": 79955 + }, + { + "epoch": 0.08053656124193098, + "grad_norm": 11.256659021841969, + "learning_rate": 4.988538277094111e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.41379310488700866, + "step": 79960 + }, + { + "epoch": 0.08054159729503516, + "grad_norm": 13.262805525885387, + "learning_rate": 4.988534499371076e-05, + "loss": 3.2736, + "mean_token_accuracy": 0.31379310190677645, + "step": 79965 + }, + { + "epoch": 0.08054663334813933, + "grad_norm": 12.817269076485031, + "learning_rate": 4.988530721027174e-05, + "loss": 2.4415, + "mean_token_accuracy": 0.4103448152542114, + "step": 79970 + }, + { + "epoch": 0.0805516694012435, + "grad_norm": 10.52413004771572, + "learning_rate": 4.988526942062408e-05, + "loss": 2.3273, + "mean_token_accuracy": 0.49522080421447756, + "step": 79975 + }, + { + "epoch": 0.08055670545434768, + "grad_norm": 10.340954679511736, + "learning_rate": 4.988523162476779e-05, + "loss": 2.9416, + "mean_token_accuracy": 0.3689655214548111, + "step": 79980 + }, + { + "epoch": 0.08056174150745185, + "grad_norm": 11.591033114318126, + "learning_rate": 4.988519382270286e-05, + "loss": 2.5983, + "mean_token_accuracy": 0.4, + "step": 79985 + }, + { + "epoch": 0.08056677756055602, + "grad_norm": 8.90478801103833, + "learning_rate": 4.988515601442932e-05, + "loss": 2.9364, + "mean_token_accuracy": 0.3862068921327591, + "step": 79990 + }, + { + "epoch": 0.0805718136136602, + "grad_norm": 13.762661286990895, + "learning_rate": 4.988511819994717e-05, + "loss": 2.2698, + "mean_token_accuracy": 0.4655172526836395, + "step": 79995 + }, + { + "epoch": 0.08057684966676437, + "grad_norm": 10.912207858523226, + "learning_rate": 4.988508037925643e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.4413793087005615, + "step": 80000 + }, + { + "epoch": 0.08058188571986855, + "grad_norm": 9.51618328037348, + "learning_rate": 4.98850425523571e-05, + "loss": 2.237, + "mean_token_accuracy": 0.42758620381355283, + "step": 80005 + }, + { + "epoch": 0.0805869217729727, + "grad_norm": 11.867797072088862, + "learning_rate": 4.9885004719249196e-05, + "loss": 2.853, + "mean_token_accuracy": 0.3793103456497192, + "step": 80010 + }, + { + "epoch": 0.08059195782607688, + "grad_norm": 13.887625076652968, + "learning_rate": 4.9884966879932725e-05, + "loss": 2.7307, + "mean_token_accuracy": 0.38275861740112305, + "step": 80015 + }, + { + "epoch": 0.08059699387918105, + "grad_norm": 10.029960111341182, + "learning_rate": 4.98849290344077e-05, + "loss": 2.0672, + "mean_token_accuracy": 0.5068965494632721, + "step": 80020 + }, + { + "epoch": 0.08060202993228523, + "grad_norm": 9.974470470143453, + "learning_rate": 4.988489118267413e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.38275861740112305, + "step": 80025 + }, + { + "epoch": 0.0806070659853894, + "grad_norm": 11.435146159564116, + "learning_rate": 4.988485332473204e-05, + "loss": 2.6457, + "mean_token_accuracy": 0.36896551251411436, + "step": 80030 + }, + { + "epoch": 0.08061210203849357, + "grad_norm": 12.318485210083049, + "learning_rate": 4.988481546058141e-05, + "loss": 2.4789, + "mean_token_accuracy": 0.4261947989463806, + "step": 80035 + }, + { + "epoch": 0.08061713809159775, + "grad_norm": 13.11043811340994, + "learning_rate": 4.988477759022228e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.47090138792991637, + "step": 80040 + }, + { + "epoch": 0.08062217414470192, + "grad_norm": 9.232902861239058, + "learning_rate": 4.988473971365464e-05, + "loss": 2.2328, + "mean_token_accuracy": 0.41724138557910917, + "step": 80045 + }, + { + "epoch": 0.0806272101978061, + "grad_norm": 15.231336022835224, + "learning_rate": 4.9884701830878514e-05, + "loss": 2.7632, + "mean_token_accuracy": 0.3758620649576187, + "step": 80050 + }, + { + "epoch": 0.08063224625091027, + "grad_norm": 11.171454595368516, + "learning_rate": 4.988466394189391e-05, + "loss": 2.3664, + "mean_token_accuracy": 0.4310344815254211, + "step": 80055 + }, + { + "epoch": 0.08063728230401444, + "grad_norm": 9.475104022093126, + "learning_rate": 4.988462604670083e-05, + "loss": 2.5767, + "mean_token_accuracy": 0.4068965494632721, + "step": 80060 + }, + { + "epoch": 0.08064231835711862, + "grad_norm": 10.139071607950687, + "learning_rate": 4.988458814529929e-05, + "loss": 2.3909, + "mean_token_accuracy": 0.4020568609237671, + "step": 80065 + }, + { + "epoch": 0.08064735441022279, + "grad_norm": 11.389050934764665, + "learning_rate": 4.988455023768931e-05, + "loss": 2.4227, + "mean_token_accuracy": 0.4655172288417816, + "step": 80070 + }, + { + "epoch": 0.08065239046332696, + "grad_norm": 8.679097856972227, + "learning_rate": 4.988451232387088e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.4724137902259827, + "step": 80075 + }, + { + "epoch": 0.08065742651643112, + "grad_norm": 12.146492281222727, + "learning_rate": 4.988447440384402e-05, + "loss": 2.6556, + "mean_token_accuracy": 0.43448275327682495, + "step": 80080 + }, + { + "epoch": 0.0806624625695353, + "grad_norm": 14.02000345331648, + "learning_rate": 4.988443647760876e-05, + "loss": 2.5912, + "mean_token_accuracy": 0.3620689660310745, + "step": 80085 + }, + { + "epoch": 0.08066749862263947, + "grad_norm": 11.506086277764037, + "learning_rate": 4.9884398545165076e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.37931033968925476, + "step": 80090 + }, + { + "epoch": 0.08067253467574365, + "grad_norm": 10.218058789640365, + "learning_rate": 4.9884360606513006e-05, + "loss": 2.2922, + "mean_token_accuracy": 0.4868118643760681, + "step": 80095 + }, + { + "epoch": 0.08067757072884782, + "grad_norm": 14.27204041098857, + "learning_rate": 4.988432266165254e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.441379314661026, + "step": 80100 + }, + { + "epoch": 0.080682606781952, + "grad_norm": 12.51315977299802, + "learning_rate": 4.9884284710583714e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.38275861740112305, + "step": 80105 + }, + { + "epoch": 0.08068764283505617, + "grad_norm": 10.869885159969265, + "learning_rate": 4.988424675330651e-05, + "loss": 2.545, + "mean_token_accuracy": 0.3793103486299515, + "step": 80110 + }, + { + "epoch": 0.08069267888816034, + "grad_norm": 10.941300973683427, + "learning_rate": 4.9884208789820955e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.4, + "step": 80115 + }, + { + "epoch": 0.08069771494126451, + "grad_norm": 11.211951391725778, + "learning_rate": 4.988417082012706e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.43944343328475954, + "step": 80120 + }, + { + "epoch": 0.08070275099436869, + "grad_norm": 10.058595229674568, + "learning_rate": 4.988413284422482e-05, + "loss": 2.7034, + "mean_token_accuracy": 0.4034482717514038, + "step": 80125 + }, + { + "epoch": 0.08070778704747286, + "grad_norm": 19.511809694640156, + "learning_rate": 4.988409486211427e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.4862069010734558, + "step": 80130 + }, + { + "epoch": 0.08071282310057704, + "grad_norm": 11.644891592733625, + "learning_rate": 4.9884056873795406e-05, + "loss": 2.9915, + "mean_token_accuracy": 0.358620685338974, + "step": 80135 + }, + { + "epoch": 0.08071785915368121, + "grad_norm": 15.7495380519481, + "learning_rate": 4.988401887926824e-05, + "loss": 3.0508, + "mean_token_accuracy": 0.358620685338974, + "step": 80140 + }, + { + "epoch": 0.08072289520678538, + "grad_norm": 12.507406787038004, + "learning_rate": 4.988398087853277e-05, + "loss": 3.0027, + "mean_token_accuracy": 0.3517241358757019, + "step": 80145 + }, + { + "epoch": 0.08072793125988954, + "grad_norm": 10.038542193021751, + "learning_rate": 4.9883942871589036e-05, + "loss": 2.4415, + "mean_token_accuracy": 0.43448275327682495, + "step": 80150 + }, + { + "epoch": 0.08073296731299372, + "grad_norm": 9.411735462598953, + "learning_rate": 4.988390485843702e-05, + "loss": 2.237, + "mean_token_accuracy": 0.43793103098869324, + "step": 80155 + }, + { + "epoch": 0.08073800336609789, + "grad_norm": 9.72046616540649, + "learning_rate": 4.988386683907675e-05, + "loss": 2.2092, + "mean_token_accuracy": 0.4793103337287903, + "step": 80160 + }, + { + "epoch": 0.08074303941920206, + "grad_norm": 12.138774288037089, + "learning_rate": 4.988382881350824e-05, + "loss": 2.7918, + "mean_token_accuracy": 0.33793103098869326, + "step": 80165 + }, + { + "epoch": 0.08074807547230624, + "grad_norm": 18.716953340008946, + "learning_rate": 4.988379078173148e-05, + "loss": 2.6917, + "mean_token_accuracy": 0.40000000298023225, + "step": 80170 + }, + { + "epoch": 0.08075311152541041, + "grad_norm": 10.86495990632601, + "learning_rate": 4.9883752743746495e-05, + "loss": 2.1994, + "mean_token_accuracy": 0.4586206912994385, + "step": 80175 + }, + { + "epoch": 0.08075814757851459, + "grad_norm": 9.131201003497042, + "learning_rate": 4.98837146995533e-05, + "loss": 2.6416, + "mean_token_accuracy": 0.4034482777118683, + "step": 80180 + }, + { + "epoch": 0.08076318363161876, + "grad_norm": 10.07188350968997, + "learning_rate": 4.988367664915188e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.42068964838981626, + "step": 80185 + }, + { + "epoch": 0.08076821968472293, + "grad_norm": 9.71708828707169, + "learning_rate": 4.988363859254228e-05, + "loss": 2.1993, + "mean_token_accuracy": 0.45517241954803467, + "step": 80190 + }, + { + "epoch": 0.08077325573782711, + "grad_norm": 11.043367429420679, + "learning_rate": 4.988360052972449e-05, + "loss": 2.1815, + "mean_token_accuracy": 0.4551724076271057, + "step": 80195 + }, + { + "epoch": 0.08077829179093128, + "grad_norm": 9.97235453722267, + "learning_rate": 4.9883562460698524e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.4344827592372894, + "step": 80200 + }, + { + "epoch": 0.08078332784403545, + "grad_norm": 12.43980982333486, + "learning_rate": 4.988352438546439e-05, + "loss": 2.1286, + "mean_token_accuracy": 0.4586206912994385, + "step": 80205 + }, + { + "epoch": 0.08078836389713963, + "grad_norm": 11.658957859859289, + "learning_rate": 4.9883486304022106e-05, + "loss": 2.5462, + "mean_token_accuracy": 0.4172413766384125, + "step": 80210 + }, + { + "epoch": 0.0807933999502438, + "grad_norm": 11.799511945372018, + "learning_rate": 4.988344821637168e-05, + "loss": 2.2434, + "mean_token_accuracy": 0.43448275327682495, + "step": 80215 + }, + { + "epoch": 0.08079843600334796, + "grad_norm": 12.098013786992764, + "learning_rate": 4.9883410122513116e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.3793103456497192, + "step": 80220 + }, + { + "epoch": 0.08080347205645214, + "grad_norm": 12.422930525499568, + "learning_rate": 4.9883372022446435e-05, + "loss": 2.7533, + "mean_token_accuracy": 0.41034482717514037, + "step": 80225 + }, + { + "epoch": 0.08080850810955631, + "grad_norm": 14.897009786497573, + "learning_rate": 4.988333391617164e-05, + "loss": 2.8545, + "mean_token_accuracy": 0.35862069129943847, + "step": 80230 + }, + { + "epoch": 0.08081354416266048, + "grad_norm": 11.826513220996073, + "learning_rate": 4.988329580368875e-05, + "loss": 2.246, + "mean_token_accuracy": 0.43793103098869324, + "step": 80235 + }, + { + "epoch": 0.08081858021576466, + "grad_norm": 15.203374607575356, + "learning_rate": 4.988325768499775e-05, + "loss": 2.8242, + "mean_token_accuracy": 0.3931034505367279, + "step": 80240 + }, + { + "epoch": 0.08082361626886883, + "grad_norm": 11.174047717908518, + "learning_rate": 4.9883219560098696e-05, + "loss": 2.1627, + "mean_token_accuracy": 0.43103447556495667, + "step": 80245 + }, + { + "epoch": 0.080828652321973, + "grad_norm": 12.682668948223332, + "learning_rate": 4.988318142899156e-05, + "loss": 2.7145, + "mean_token_accuracy": 0.3986690878868103, + "step": 80250 + }, + { + "epoch": 0.08083368837507718, + "grad_norm": 12.50380770396913, + "learning_rate": 4.988314329167636e-05, + "loss": 2.496, + "mean_token_accuracy": 0.36896551251411436, + "step": 80255 + }, + { + "epoch": 0.08083872442818135, + "grad_norm": 13.32962268240316, + "learning_rate": 4.988310514815312e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.48784029483795166, + "step": 80260 + }, + { + "epoch": 0.08084376048128553, + "grad_norm": 12.12451603861171, + "learning_rate": 4.9883066998421835e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.4517241418361664, + "step": 80265 + }, + { + "epoch": 0.0808487965343897, + "grad_norm": 15.099892896296414, + "learning_rate": 4.9883028842482524e-05, + "loss": 2.9451, + "mean_token_accuracy": 0.3482758551836014, + "step": 80270 + }, + { + "epoch": 0.08085383258749387, + "grad_norm": 12.094979330002506, + "learning_rate": 4.9882990680335206e-05, + "loss": 2.7055, + "mean_token_accuracy": 0.38965516686439516, + "step": 80275 + }, + { + "epoch": 0.08085886864059805, + "grad_norm": 11.665387114445087, + "learning_rate": 4.9882952511979875e-05, + "loss": 2.2193, + "mean_token_accuracy": 0.4448275864124298, + "step": 80280 + }, + { + "epoch": 0.08086390469370222, + "grad_norm": 14.10158159130548, + "learning_rate": 4.9882914337416545e-05, + "loss": 3.0817, + "mean_token_accuracy": 0.34319419860839845, + "step": 80285 + }, + { + "epoch": 0.08086894074680638, + "grad_norm": 10.179150286633963, + "learning_rate": 4.9882876156645236e-05, + "loss": 2.7764, + "mean_token_accuracy": 0.34482758641242983, + "step": 80290 + }, + { + "epoch": 0.08087397679991055, + "grad_norm": 10.871569142489253, + "learning_rate": 4.9882837969665955e-05, + "loss": 2.6984, + "mean_token_accuracy": 0.39655172526836396, + "step": 80295 + }, + { + "epoch": 0.08087901285301473, + "grad_norm": 12.875749533800725, + "learning_rate": 4.98827997764787e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.4103448331356049, + "step": 80300 + }, + { + "epoch": 0.0808840489061189, + "grad_norm": 13.002409126383187, + "learning_rate": 4.98827615770835e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.4034482777118683, + "step": 80305 + }, + { + "epoch": 0.08088908495922308, + "grad_norm": 10.513427848861115, + "learning_rate": 4.988272337148036e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.4379310429096222, + "step": 80310 + }, + { + "epoch": 0.08089412101232725, + "grad_norm": 10.709374298765223, + "learning_rate": 4.988268515966928e-05, + "loss": 2.6932, + "mean_token_accuracy": 0.4103448212146759, + "step": 80315 + }, + { + "epoch": 0.08089915706543142, + "grad_norm": 11.039740581851477, + "learning_rate": 4.988264694165029e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.34482758939266206, + "step": 80320 + }, + { + "epoch": 0.0809041931185356, + "grad_norm": 9.579921519239816, + "learning_rate": 4.988260871742337e-05, + "loss": 1.9643, + "mean_token_accuracy": 0.5034482777118683, + "step": 80325 + }, + { + "epoch": 0.08090922917163977, + "grad_norm": 15.581674042565298, + "learning_rate": 4.988257048698857e-05, + "loss": 2.9888, + "mean_token_accuracy": 0.3620689630508423, + "step": 80330 + }, + { + "epoch": 0.08091426522474394, + "grad_norm": 8.304602156844403, + "learning_rate": 4.9882532250345876e-05, + "loss": 1.9195, + "mean_token_accuracy": 0.5086206912994384, + "step": 80335 + }, + { + "epoch": 0.08091930127784812, + "grad_norm": 13.298919231907949, + "learning_rate": 4.98824940074953e-05, + "loss": 2.7577, + "mean_token_accuracy": 0.3965517163276672, + "step": 80340 + }, + { + "epoch": 0.08092433733095229, + "grad_norm": 11.09491780705811, + "learning_rate": 4.988245575843685e-05, + "loss": 2.4673, + "mean_token_accuracy": 0.4206896543502808, + "step": 80345 + }, + { + "epoch": 0.08092937338405647, + "grad_norm": 10.64679779297878, + "learning_rate": 4.988241750317055e-05, + "loss": 2.3759, + "mean_token_accuracy": 0.4517241418361664, + "step": 80350 + }, + { + "epoch": 0.08093440943716064, + "grad_norm": 10.777953916426817, + "learning_rate": 4.9882379241696404e-05, + "loss": 2.4596, + "mean_token_accuracy": 0.4034482777118683, + "step": 80355 + }, + { + "epoch": 0.0809394454902648, + "grad_norm": 10.967138295568134, + "learning_rate": 4.9882340974014426e-05, + "loss": 2.2674, + "mean_token_accuracy": 0.4310344815254211, + "step": 80360 + }, + { + "epoch": 0.08094448154336897, + "grad_norm": 12.183091784277773, + "learning_rate": 4.9882302700124604e-05, + "loss": 2.9372, + "mean_token_accuracy": 0.3482758641242981, + "step": 80365 + }, + { + "epoch": 0.08094951759647315, + "grad_norm": 13.835472889224372, + "learning_rate": 4.988226442002698e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.39655172228813174, + "step": 80370 + }, + { + "epoch": 0.08095455364957732, + "grad_norm": 10.378683760424224, + "learning_rate": 4.988222613372155e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.42413793206214906, + "step": 80375 + }, + { + "epoch": 0.0809595897026815, + "grad_norm": 10.333852996097557, + "learning_rate": 4.988218784120833e-05, + "loss": 2.316, + "mean_token_accuracy": 0.43793103098869324, + "step": 80380 + }, + { + "epoch": 0.08096462575578567, + "grad_norm": 11.81532983430724, + "learning_rate": 4.988214954248732e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4206896543502808, + "step": 80385 + }, + { + "epoch": 0.08096966180888984, + "grad_norm": 13.466744886582683, + "learning_rate": 4.988211123755854e-05, + "loss": 2.5621, + "mean_token_accuracy": 0.3827586233615875, + "step": 80390 + }, + { + "epoch": 0.08097469786199402, + "grad_norm": 10.700760446542624, + "learning_rate": 4.9882072926422e-05, + "loss": 2.4769, + "mean_token_accuracy": 0.45172414779663084, + "step": 80395 + }, + { + "epoch": 0.08097973391509819, + "grad_norm": 9.993418519949108, + "learning_rate": 4.98820346090777e-05, + "loss": 2.7326, + "mean_token_accuracy": 0.37586206793785093, + "step": 80400 + }, + { + "epoch": 0.08098476996820236, + "grad_norm": 10.983636937786894, + "learning_rate": 4.9881996285525676e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.3876587986946106, + "step": 80405 + }, + { + "epoch": 0.08098980602130654, + "grad_norm": 16.44801484288338, + "learning_rate": 4.9881957955765906e-05, + "loss": 2.5329, + "mean_token_accuracy": 0.4482758641242981, + "step": 80410 + }, + { + "epoch": 0.08099484207441071, + "grad_norm": 11.539544601640708, + "learning_rate": 4.9881919619798415e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.4034482777118683, + "step": 80415 + }, + { + "epoch": 0.08099987812751488, + "grad_norm": 10.210634267785238, + "learning_rate": 4.988188127762322e-05, + "loss": 2.3251, + "mean_token_accuracy": 0.44482758045196535, + "step": 80420 + }, + { + "epoch": 0.08100491418061906, + "grad_norm": 12.300412509665371, + "learning_rate": 4.9881842929240336e-05, + "loss": 2.2474, + "mean_token_accuracy": 0.4310344815254211, + "step": 80425 + }, + { + "epoch": 0.08100995023372322, + "grad_norm": 16.171579462595655, + "learning_rate": 4.9881804574649755e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.36896551847457887, + "step": 80430 + }, + { + "epoch": 0.08101498628682739, + "grad_norm": 12.045463551483095, + "learning_rate": 4.98817662138515e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.4034482777118683, + "step": 80435 + }, + { + "epoch": 0.08102002233993157, + "grad_norm": 16.59282934226582, + "learning_rate": 4.988172784684558e-05, + "loss": 2.6744, + "mean_token_accuracy": 0.3965517163276672, + "step": 80440 + }, + { + "epoch": 0.08102505839303574, + "grad_norm": 11.697301474228242, + "learning_rate": 4.9881689473632e-05, + "loss": 2.7192, + "mean_token_accuracy": 0.39310345649719236, + "step": 80445 + }, + { + "epoch": 0.08103009444613991, + "grad_norm": 12.67939947331828, + "learning_rate": 4.988165109421077e-05, + "loss": 2.63, + "mean_token_accuracy": 0.3551724076271057, + "step": 80450 + }, + { + "epoch": 0.08103513049924409, + "grad_norm": 11.676686093097386, + "learning_rate": 4.9881612708581915e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.4172413766384125, + "step": 80455 + }, + { + "epoch": 0.08104016655234826, + "grad_norm": 10.519003900866483, + "learning_rate": 4.988157431674543e-05, + "loss": 2.5983, + "mean_token_accuracy": 0.43629764318466185, + "step": 80460 + }, + { + "epoch": 0.08104520260545243, + "grad_norm": 11.984610301428427, + "learning_rate": 4.9881535918701336e-05, + "loss": 2.6656, + "mean_token_accuracy": 0.3772534728050232, + "step": 80465 + }, + { + "epoch": 0.08105023865855661, + "grad_norm": 10.495241763905344, + "learning_rate": 4.988149751444964e-05, + "loss": 2.6824, + "mean_token_accuracy": 0.38965516686439516, + "step": 80470 + }, + { + "epoch": 0.08105527471166078, + "grad_norm": 11.054858489966934, + "learning_rate": 4.988145910399036e-05, + "loss": 2.6188, + "mean_token_accuracy": 0.4275862067937851, + "step": 80475 + }, + { + "epoch": 0.08106031076476496, + "grad_norm": 15.446198923535718, + "learning_rate": 4.9881420687323484e-05, + "loss": 2.4128, + "mean_token_accuracy": 0.44482758045196535, + "step": 80480 + }, + { + "epoch": 0.08106534681786913, + "grad_norm": 9.937294660003108, + "learning_rate": 4.988138226444904e-05, + "loss": 2.4816, + "mean_token_accuracy": 0.42758620977401735, + "step": 80485 + }, + { + "epoch": 0.0810703828709733, + "grad_norm": 12.60511158817512, + "learning_rate": 4.988134383536704e-05, + "loss": 2.7229, + "mean_token_accuracy": 0.3965517282485962, + "step": 80490 + }, + { + "epoch": 0.08107541892407748, + "grad_norm": 10.867341439604449, + "learning_rate": 4.988130540007749e-05, + "loss": 2.3165, + "mean_token_accuracy": 0.42413792610168455, + "step": 80495 + }, + { + "epoch": 0.08108045497718164, + "grad_norm": 11.315171846829784, + "learning_rate": 4.98812669585804e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.44482758045196535, + "step": 80500 + }, + { + "epoch": 0.08108549103028581, + "grad_norm": 11.326825572156388, + "learning_rate": 4.988122851087579e-05, + "loss": 2.6614, + "mean_token_accuracy": 0.3931034505367279, + "step": 80505 + }, + { + "epoch": 0.08109052708338998, + "grad_norm": 13.25628106266195, + "learning_rate": 4.988119005696366e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.36551723480224607, + "step": 80510 + }, + { + "epoch": 0.08109556313649416, + "grad_norm": 11.740345704206694, + "learning_rate": 4.988115159684402e-05, + "loss": 2.7657, + "mean_token_accuracy": 0.4, + "step": 80515 + }, + { + "epoch": 0.08110059918959833, + "grad_norm": 11.858175323397843, + "learning_rate": 4.9881113130516876e-05, + "loss": 2.3949, + "mean_token_accuracy": 0.45862069725990295, + "step": 80520 + }, + { + "epoch": 0.0811056352427025, + "grad_norm": 8.921639315255058, + "learning_rate": 4.9881074657982256e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.4517241418361664, + "step": 80525 + }, + { + "epoch": 0.08111067129580668, + "grad_norm": 9.84686569291601, + "learning_rate": 4.9881036179240165e-05, + "loss": 2.6999, + "mean_token_accuracy": 0.4310344815254211, + "step": 80530 + }, + { + "epoch": 0.08111570734891085, + "grad_norm": 13.04787468673603, + "learning_rate": 4.9880997694290604e-05, + "loss": 2.8996, + "mean_token_accuracy": 0.33448275923728943, + "step": 80535 + }, + { + "epoch": 0.08112074340201503, + "grad_norm": 9.54958123655461, + "learning_rate": 4.988095920313359e-05, + "loss": 2.1938, + "mean_token_accuracy": 0.4379310369491577, + "step": 80540 + }, + { + "epoch": 0.0811257794551192, + "grad_norm": 10.934699852134056, + "learning_rate": 4.9880920705769136e-05, + "loss": 2.2643, + "mean_token_accuracy": 0.4103448301553726, + "step": 80545 + }, + { + "epoch": 0.08113081550822338, + "grad_norm": 9.247376493774293, + "learning_rate": 4.988088220219725e-05, + "loss": 2.6106, + "mean_token_accuracy": 0.452339905500412, + "step": 80550 + }, + { + "epoch": 0.08113585156132755, + "grad_norm": 12.145502451659977, + "learning_rate": 4.988084369241794e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.4241379201412201, + "step": 80555 + }, + { + "epoch": 0.08114088761443172, + "grad_norm": 11.13356690082388, + "learning_rate": 4.988080517643122e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.4068965554237366, + "step": 80560 + }, + { + "epoch": 0.0811459236675359, + "grad_norm": 16.310321519827284, + "learning_rate": 4.988076665423711e-05, + "loss": 3.1015, + "mean_token_accuracy": 0.3517241358757019, + "step": 80565 + }, + { + "epoch": 0.08115095972064006, + "grad_norm": 11.69844915258311, + "learning_rate": 4.9880728125835596e-05, + "loss": 2.7174, + "mean_token_accuracy": 0.36896551847457887, + "step": 80570 + }, + { + "epoch": 0.08115599577374423, + "grad_norm": 12.55853948228214, + "learning_rate": 4.988068959122671e-05, + "loss": 2.7191, + "mean_token_accuracy": 0.3999999940395355, + "step": 80575 + }, + { + "epoch": 0.0811610318268484, + "grad_norm": 9.916995438745841, + "learning_rate": 4.988065105041046e-05, + "loss": 2.3463, + "mean_token_accuracy": 0.43103448748588563, + "step": 80580 + }, + { + "epoch": 0.08116606787995258, + "grad_norm": 11.445161336951655, + "learning_rate": 4.988061250338685e-05, + "loss": 2.2488, + "mean_token_accuracy": 0.45347853302955626, + "step": 80585 + }, + { + "epoch": 0.08117110393305675, + "grad_norm": 13.775213524954982, + "learning_rate": 4.988057395015589e-05, + "loss": 2.6585, + "mean_token_accuracy": 0.38620689511299133, + "step": 80590 + }, + { + "epoch": 0.08117613998616093, + "grad_norm": 12.221722020087352, + "learning_rate": 4.9880535390717596e-05, + "loss": 2.2576, + "mean_token_accuracy": 0.42413793206214906, + "step": 80595 + }, + { + "epoch": 0.0811811760392651, + "grad_norm": 9.562788938033352, + "learning_rate": 4.988049682507198e-05, + "loss": 2.0319, + "mean_token_accuracy": 0.5172413647174835, + "step": 80600 + }, + { + "epoch": 0.08118621209236927, + "grad_norm": 13.39292304397339, + "learning_rate": 4.9880458253219046e-05, + "loss": 2.7925, + "mean_token_accuracy": 0.35862068831920624, + "step": 80605 + }, + { + "epoch": 0.08119124814547345, + "grad_norm": 10.781562737169876, + "learning_rate": 4.988041967515881e-05, + "loss": 2.2439, + "mean_token_accuracy": 0.42758620381355283, + "step": 80610 + }, + { + "epoch": 0.08119628419857762, + "grad_norm": 12.143270616113, + "learning_rate": 4.988038109089128e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.4379310250282288, + "step": 80615 + }, + { + "epoch": 0.0812013202516818, + "grad_norm": 10.879837736361424, + "learning_rate": 4.9880342500416466e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.38965516686439516, + "step": 80620 + }, + { + "epoch": 0.08120635630478597, + "grad_norm": 11.84345163700897, + "learning_rate": 4.988030390373439e-05, + "loss": 2.6932, + "mean_token_accuracy": 0.3896551728248596, + "step": 80625 + }, + { + "epoch": 0.08121139235789014, + "grad_norm": 12.507698336899674, + "learning_rate": 4.9880265300845044e-05, + "loss": 2.6337, + "mean_token_accuracy": 0.39655172228813174, + "step": 80630 + }, + { + "epoch": 0.0812164284109943, + "grad_norm": 10.812020139835624, + "learning_rate": 4.988022669174845e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.3862069010734558, + "step": 80635 + }, + { + "epoch": 0.08122146446409848, + "grad_norm": 9.544459293385454, + "learning_rate": 4.988018807644462e-05, + "loss": 2.5739, + "mean_token_accuracy": 0.37586206793785093, + "step": 80640 + }, + { + "epoch": 0.08122650051720265, + "grad_norm": 16.935397057645687, + "learning_rate": 4.9880149454933554e-05, + "loss": 2.5199, + "mean_token_accuracy": 0.441379314661026, + "step": 80645 + }, + { + "epoch": 0.08123153657030682, + "grad_norm": 11.279963659866782, + "learning_rate": 4.988011082721528e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.41724138259887694, + "step": 80650 + }, + { + "epoch": 0.081236572623411, + "grad_norm": 14.104323968969988, + "learning_rate": 4.988007219328979e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.38620689511299133, + "step": 80655 + }, + { + "epoch": 0.08124160867651517, + "grad_norm": 9.579509212770297, + "learning_rate": 4.988003355315711e-05, + "loss": 2.2785, + "mean_token_accuracy": 0.4517241358757019, + "step": 80660 + }, + { + "epoch": 0.08124664472961934, + "grad_norm": 13.321921090103697, + "learning_rate": 4.987999490681724e-05, + "loss": 2.8125, + "mean_token_accuracy": 0.3758620619773865, + "step": 80665 + }, + { + "epoch": 0.08125168078272352, + "grad_norm": 13.455675274793983, + "learning_rate": 4.98799562542702e-05, + "loss": 2.6695, + "mean_token_accuracy": 0.4034482777118683, + "step": 80670 + }, + { + "epoch": 0.08125671683582769, + "grad_norm": 12.337300330850573, + "learning_rate": 4.987991759551599e-05, + "loss": 2.6627, + "mean_token_accuracy": 0.4, + "step": 80675 + }, + { + "epoch": 0.08126175288893187, + "grad_norm": 10.13627910878303, + "learning_rate": 4.987987893055463e-05, + "loss": 2.7342, + "mean_token_accuracy": 0.4, + "step": 80680 + }, + { + "epoch": 0.08126678894203604, + "grad_norm": 12.098234710743167, + "learning_rate": 4.987984025938612e-05, + "loss": 2.9283, + "mean_token_accuracy": 0.33793103098869326, + "step": 80685 + }, + { + "epoch": 0.08127182499514021, + "grad_norm": 17.27515027493642, + "learning_rate": 4.9879801582010486e-05, + "loss": 2.4923, + "mean_token_accuracy": 0.4344827651977539, + "step": 80690 + }, + { + "epoch": 0.08127686104824439, + "grad_norm": 9.235110828918094, + "learning_rate": 4.987976289842773e-05, + "loss": 2.6481, + "mean_token_accuracy": 0.41929824352264405, + "step": 80695 + }, + { + "epoch": 0.08128189710134856, + "grad_norm": 11.372605573155063, + "learning_rate": 4.987972420863786e-05, + "loss": 3.056, + "mean_token_accuracy": 0.4068965554237366, + "step": 80700 + }, + { + "epoch": 0.08128693315445272, + "grad_norm": 12.678842363017122, + "learning_rate": 4.987968551264089e-05, + "loss": 2.3025, + "mean_token_accuracy": 0.441379314661026, + "step": 80705 + }, + { + "epoch": 0.0812919692075569, + "grad_norm": 10.202974900111595, + "learning_rate": 4.987964681043683e-05, + "loss": 2.4311, + "mean_token_accuracy": 0.41191771626472473, + "step": 80710 + }, + { + "epoch": 0.08129700526066107, + "grad_norm": 10.493990314270032, + "learning_rate": 4.987960810202569e-05, + "loss": 2.6218, + "mean_token_accuracy": 0.4068965554237366, + "step": 80715 + }, + { + "epoch": 0.08130204131376524, + "grad_norm": 13.703774648739824, + "learning_rate": 4.987956938740749e-05, + "loss": 3.001, + "mean_token_accuracy": 0.3586206793785095, + "step": 80720 + }, + { + "epoch": 0.08130707736686942, + "grad_norm": 14.270355294741167, + "learning_rate": 4.9879530666582225e-05, + "loss": 2.711, + "mean_token_accuracy": 0.36896551549434664, + "step": 80725 + }, + { + "epoch": 0.08131211341997359, + "grad_norm": 9.939802733315174, + "learning_rate": 4.987949193954992e-05, + "loss": 2.3826, + "mean_token_accuracy": 0.4172413766384125, + "step": 80730 + }, + { + "epoch": 0.08131714947307776, + "grad_norm": 11.59544362284299, + "learning_rate": 4.9879453206310575e-05, + "loss": 3.4914, + "mean_token_accuracy": 0.34827586114406583, + "step": 80735 + }, + { + "epoch": 0.08132218552618194, + "grad_norm": 10.291815631230232, + "learning_rate": 4.987941446686421e-05, + "loss": 2.3719, + "mean_token_accuracy": 0.43793103098869324, + "step": 80740 + }, + { + "epoch": 0.08132722157928611, + "grad_norm": 10.861994219440632, + "learning_rate": 4.9879375721210825e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.40689656138420105, + "step": 80745 + }, + { + "epoch": 0.08133225763239028, + "grad_norm": 10.867006763651423, + "learning_rate": 4.987933696935044e-05, + "loss": 2.1243, + "mean_token_accuracy": 0.43448275327682495, + "step": 80750 + }, + { + "epoch": 0.08133729368549446, + "grad_norm": 16.05325381802022, + "learning_rate": 4.987929821128306e-05, + "loss": 2.6652, + "mean_token_accuracy": 0.3793103456497192, + "step": 80755 + }, + { + "epoch": 0.08134232973859863, + "grad_norm": 12.228687821770087, + "learning_rate": 4.9879259447008705e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.34482758641242983, + "step": 80760 + }, + { + "epoch": 0.0813473657917028, + "grad_norm": 10.336646190393555, + "learning_rate": 4.987922067652737e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.3862068891525269, + "step": 80765 + }, + { + "epoch": 0.08135240184480698, + "grad_norm": 11.36042793705165, + "learning_rate": 4.987918189983908e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.47931034564971925, + "step": 80770 + }, + { + "epoch": 0.08135743789791114, + "grad_norm": 9.997425549224465, + "learning_rate": 4.9879143116943834e-05, + "loss": 2.6877, + "mean_token_accuracy": 0.3448275804519653, + "step": 80775 + }, + { + "epoch": 0.08136247395101531, + "grad_norm": 9.517219810764468, + "learning_rate": 4.9879104327841655e-05, + "loss": 2.0694, + "mean_token_accuracy": 0.47931034564971925, + "step": 80780 + }, + { + "epoch": 0.08136751000411949, + "grad_norm": 12.900330376974662, + "learning_rate": 4.987906553253255e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.38275861740112305, + "step": 80785 + }, + { + "epoch": 0.08137254605722366, + "grad_norm": 13.128841987131354, + "learning_rate": 4.987902673101652e-05, + "loss": 2.4306, + "mean_token_accuracy": 0.44482758045196535, + "step": 80790 + }, + { + "epoch": 0.08137758211032783, + "grad_norm": 32.49612206356215, + "learning_rate": 4.98789879232936e-05, + "loss": 2.98, + "mean_token_accuracy": 0.3965517163276672, + "step": 80795 + }, + { + "epoch": 0.08138261816343201, + "grad_norm": 10.304227890398673, + "learning_rate": 4.9878949109363766e-05, + "loss": 3.0978, + "mean_token_accuracy": 0.3068965464830399, + "step": 80800 + }, + { + "epoch": 0.08138765421653618, + "grad_norm": 11.76261118447038, + "learning_rate": 4.987891028922705e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.38620689511299133, + "step": 80805 + }, + { + "epoch": 0.08139269026964036, + "grad_norm": 9.579032064442766, + "learning_rate": 4.987887146288346e-05, + "loss": 2.7355, + "mean_token_accuracy": 0.3862069010734558, + "step": 80810 + }, + { + "epoch": 0.08139772632274453, + "grad_norm": 9.038490077277798, + "learning_rate": 4.9878832630333014e-05, + "loss": 2.4376, + "mean_token_accuracy": 0.4, + "step": 80815 + }, + { + "epoch": 0.0814027623758487, + "grad_norm": 14.756393535797246, + "learning_rate": 4.987879379157571e-05, + "loss": 2.7933, + "mean_token_accuracy": 0.37586206793785093, + "step": 80820 + }, + { + "epoch": 0.08140779842895288, + "grad_norm": 10.246050987750717, + "learning_rate": 4.9878754946611565e-05, + "loss": 2.5148, + "mean_token_accuracy": 0.39655171930789945, + "step": 80825 + }, + { + "epoch": 0.08141283448205705, + "grad_norm": 10.816268117662576, + "learning_rate": 4.987871609544059e-05, + "loss": 2.708, + "mean_token_accuracy": 0.40344826579093934, + "step": 80830 + }, + { + "epoch": 0.08141787053516122, + "grad_norm": 11.651370278472523, + "learning_rate": 4.9878677238062795e-05, + "loss": 2.7155, + "mean_token_accuracy": 0.3793103456497192, + "step": 80835 + }, + { + "epoch": 0.0814229065882654, + "grad_norm": 12.180002383182236, + "learning_rate": 4.987863837447818e-05, + "loss": 2.5, + "mean_token_accuracy": 0.42413792610168455, + "step": 80840 + }, + { + "epoch": 0.08142794264136956, + "grad_norm": 16.070291286245048, + "learning_rate": 4.987859950468677e-05, + "loss": 2.5269, + "mean_token_accuracy": 0.42068966031074523, + "step": 80845 + }, + { + "epoch": 0.08143297869447373, + "grad_norm": 10.580847623124962, + "learning_rate": 4.9878560628688586e-05, + "loss": 2.5778, + "mean_token_accuracy": 0.44137931764125826, + "step": 80850 + }, + { + "epoch": 0.0814380147475779, + "grad_norm": 11.882722821194134, + "learning_rate": 4.9878521746483615e-05, + "loss": 2.6037, + "mean_token_accuracy": 0.3793103486299515, + "step": 80855 + }, + { + "epoch": 0.08144305080068208, + "grad_norm": 13.474487396525651, + "learning_rate": 4.987848285807188e-05, + "loss": 2.7154, + "mean_token_accuracy": 0.3827586114406586, + "step": 80860 + }, + { + "epoch": 0.08144808685378625, + "grad_norm": 10.421423643089492, + "learning_rate": 4.987844396345338e-05, + "loss": 2.0914, + "mean_token_accuracy": 0.44827585816383364, + "step": 80865 + }, + { + "epoch": 0.08145312290689043, + "grad_norm": 9.848272006215531, + "learning_rate": 4.9878405062628146e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.4034482777118683, + "step": 80870 + }, + { + "epoch": 0.0814581589599946, + "grad_norm": 9.261356638443749, + "learning_rate": 4.987836615559617e-05, + "loss": 2.1214, + "mean_token_accuracy": 0.5034482717514038, + "step": 80875 + }, + { + "epoch": 0.08146319501309877, + "grad_norm": 18.075430985244505, + "learning_rate": 4.9878327242357485e-05, + "loss": 2.9689, + "mean_token_accuracy": 0.37931033968925476, + "step": 80880 + }, + { + "epoch": 0.08146823106620295, + "grad_norm": 12.571121944768532, + "learning_rate": 4.987828832291207e-05, + "loss": 2.5551, + "mean_token_accuracy": 0.4517241418361664, + "step": 80885 + }, + { + "epoch": 0.08147326711930712, + "grad_norm": 12.730770016847952, + "learning_rate": 4.987824939725997e-05, + "loss": 2.9734, + "mean_token_accuracy": 0.3172413736581802, + "step": 80890 + }, + { + "epoch": 0.0814783031724113, + "grad_norm": 10.614779103187868, + "learning_rate": 4.987821046540116e-05, + "loss": 2.7368, + "mean_token_accuracy": 0.36551723480224607, + "step": 80895 + }, + { + "epoch": 0.08148333922551547, + "grad_norm": 14.846407870020794, + "learning_rate": 4.987817152733568e-05, + "loss": 2.9338, + "mean_token_accuracy": 0.37586206793785093, + "step": 80900 + }, + { + "epoch": 0.08148837527861964, + "grad_norm": 11.768193154523427, + "learning_rate": 4.987813258306353e-05, + "loss": 2.8692, + "mean_token_accuracy": 0.3931034505367279, + "step": 80905 + }, + { + "epoch": 0.08149341133172382, + "grad_norm": 12.714827909541008, + "learning_rate": 4.9878093632584724e-05, + "loss": 2.2394, + "mean_token_accuracy": 0.44966728091239927, + "step": 80910 + }, + { + "epoch": 0.08149844738482798, + "grad_norm": 11.059337599429616, + "learning_rate": 4.987805467589927e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.42068966031074523, + "step": 80915 + }, + { + "epoch": 0.08150348343793215, + "grad_norm": 10.303392276725578, + "learning_rate": 4.9878015713007175e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.38620689511299133, + "step": 80920 + }, + { + "epoch": 0.08150851949103632, + "grad_norm": 12.041536715508318, + "learning_rate": 4.9877976743908455e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.43793103098869324, + "step": 80925 + }, + { + "epoch": 0.0815135555441405, + "grad_norm": 12.56715689495364, + "learning_rate": 4.9877937768603124e-05, + "loss": 2.6111, + "mean_token_accuracy": 0.36702964305877683, + "step": 80930 + }, + { + "epoch": 0.08151859159724467, + "grad_norm": 11.72463429644742, + "learning_rate": 4.987789878709118e-05, + "loss": 2.6106, + "mean_token_accuracy": 0.3551724135875702, + "step": 80935 + }, + { + "epoch": 0.08152362765034885, + "grad_norm": 11.824934475719614, + "learning_rate": 4.987785979937265e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.43793103098869324, + "step": 80940 + }, + { + "epoch": 0.08152866370345302, + "grad_norm": 10.86685550254957, + "learning_rate": 4.987782080544753e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.4379310369491577, + "step": 80945 + }, + { + "epoch": 0.08153369975655719, + "grad_norm": 10.62866279330211, + "learning_rate": 4.987778180531585e-05, + "loss": 2.1464, + "mean_token_accuracy": 0.4586206912994385, + "step": 80950 + }, + { + "epoch": 0.08153873580966137, + "grad_norm": 11.33004527422251, + "learning_rate": 4.98777427989776e-05, + "loss": 2.909, + "mean_token_accuracy": 0.36896551847457887, + "step": 80955 + }, + { + "epoch": 0.08154377186276554, + "grad_norm": 11.196355243749153, + "learning_rate": 4.98777037864328e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.35862069129943847, + "step": 80960 + }, + { + "epoch": 0.08154880791586971, + "grad_norm": 11.031057410550053, + "learning_rate": 4.987766476768146e-05, + "loss": 2.767, + "mean_token_accuracy": 0.42068966031074523, + "step": 80965 + }, + { + "epoch": 0.08155384396897389, + "grad_norm": 10.725091472969833, + "learning_rate": 4.987762574272359e-05, + "loss": 2.3007, + "mean_token_accuracy": 0.47931034564971925, + "step": 80970 + }, + { + "epoch": 0.08155888002207806, + "grad_norm": 11.915379982721193, + "learning_rate": 4.98775867115592e-05, + "loss": 2.3971, + "mean_token_accuracy": 0.4103448331356049, + "step": 80975 + }, + { + "epoch": 0.08156391607518224, + "grad_norm": 13.398931031628925, + "learning_rate": 4.987754767418831e-05, + "loss": 2.7, + "mean_token_accuracy": 0.37241379022598264, + "step": 80980 + }, + { + "epoch": 0.0815689521282864, + "grad_norm": 11.980596998938738, + "learning_rate": 4.987750863061092e-05, + "loss": 2.7258, + "mean_token_accuracy": 0.3517241418361664, + "step": 80985 + }, + { + "epoch": 0.08157398818139057, + "grad_norm": 12.050181510152736, + "learning_rate": 4.9877469580827046e-05, + "loss": 2.3833, + "mean_token_accuracy": 0.4103448212146759, + "step": 80990 + }, + { + "epoch": 0.08157902423449474, + "grad_norm": 11.9405839134718, + "learning_rate": 4.98774305248367e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.4801724135875702, + "step": 80995 + }, + { + "epoch": 0.08158406028759892, + "grad_norm": 10.521387340723319, + "learning_rate": 4.9877391462639875e-05, + "loss": 2.475, + "mean_token_accuracy": 0.4310344815254211, + "step": 81000 + }, + { + "epoch": 0.08158909634070309, + "grad_norm": 9.62052871298048, + "learning_rate": 4.987735239423662e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.43103448748588563, + "step": 81005 + }, + { + "epoch": 0.08159413239380726, + "grad_norm": 9.637375199138168, + "learning_rate": 4.9877313319626914e-05, + "loss": 2.2953, + "mean_token_accuracy": 0.43448275327682495, + "step": 81010 + }, + { + "epoch": 0.08159916844691144, + "grad_norm": 10.785433247382567, + "learning_rate": 4.987727423881077e-05, + "loss": 2.2796, + "mean_token_accuracy": 0.4655172348022461, + "step": 81015 + }, + { + "epoch": 0.08160420450001561, + "grad_norm": 11.18051399030864, + "learning_rate": 4.987723515178821e-05, + "loss": 2.2251, + "mean_token_accuracy": 0.39310344457626345, + "step": 81020 + }, + { + "epoch": 0.08160924055311979, + "grad_norm": 9.237148460856131, + "learning_rate": 4.9877196058559236e-05, + "loss": 2.4817, + "mean_token_accuracy": 0.4034482777118683, + "step": 81025 + }, + { + "epoch": 0.08161427660622396, + "grad_norm": 11.521486360077368, + "learning_rate": 4.9877156959123864e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.40163339376449586, + "step": 81030 + }, + { + "epoch": 0.08161931265932813, + "grad_norm": 12.014665381750389, + "learning_rate": 4.987711785348211e-05, + "loss": 2.7369, + "mean_token_accuracy": 0.36206896901130675, + "step": 81035 + }, + { + "epoch": 0.08162434871243231, + "grad_norm": 11.295725638311858, + "learning_rate": 4.987707874163398e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.39655172228813174, + "step": 81040 + }, + { + "epoch": 0.08162938476553648, + "grad_norm": 9.620289602564789, + "learning_rate": 4.987703962357948e-05, + "loss": 2.5488, + "mean_token_accuracy": 0.42413793206214906, + "step": 81045 + }, + { + "epoch": 0.08163442081864065, + "grad_norm": 10.723430668467355, + "learning_rate": 4.987700049931862e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.3827586114406586, + "step": 81050 + }, + { + "epoch": 0.08163945687174481, + "grad_norm": 11.849135233110083, + "learning_rate": 4.9876961368851424e-05, + "loss": 2.7102, + "mean_token_accuracy": 0.3724137872457504, + "step": 81055 + }, + { + "epoch": 0.08164449292484899, + "grad_norm": 10.540233038215963, + "learning_rate": 4.987692223217789e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.38620689511299133, + "step": 81060 + }, + { + "epoch": 0.08164952897795316, + "grad_norm": 13.15798465431207, + "learning_rate": 4.987688308929803e-05, + "loss": 2.8105, + "mean_token_accuracy": 0.4137930989265442, + "step": 81065 + }, + { + "epoch": 0.08165456503105734, + "grad_norm": 10.32822121119542, + "learning_rate": 4.9876843940211865e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.42413793206214906, + "step": 81070 + }, + { + "epoch": 0.08165960108416151, + "grad_norm": 12.71600164309924, + "learning_rate": 4.98768047849194e-05, + "loss": 2.3828, + "mean_token_accuracy": 0.40834846496582033, + "step": 81075 + }, + { + "epoch": 0.08166463713726568, + "grad_norm": 8.28888240311657, + "learning_rate": 4.987676562342063e-05, + "loss": 2.0854, + "mean_token_accuracy": 0.5131276428699494, + "step": 81080 + }, + { + "epoch": 0.08166967319036986, + "grad_norm": 10.145516571151319, + "learning_rate": 4.987672645571559e-05, + "loss": 2.2285, + "mean_token_accuracy": 0.43629764318466185, + "step": 81085 + }, + { + "epoch": 0.08167470924347403, + "grad_norm": 11.115886152634676, + "learning_rate": 4.9876687281804285e-05, + "loss": 2.6775, + "mean_token_accuracy": 0.4448275864124298, + "step": 81090 + }, + { + "epoch": 0.0816797452965782, + "grad_norm": 13.424199232884765, + "learning_rate": 4.987664810168672e-05, + "loss": 2.3594, + "mean_token_accuracy": 0.42758620381355283, + "step": 81095 + }, + { + "epoch": 0.08168478134968238, + "grad_norm": 10.448907505988625, + "learning_rate": 4.98766089153629e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.41379310488700866, + "step": 81100 + }, + { + "epoch": 0.08168981740278655, + "grad_norm": 10.920361393007603, + "learning_rate": 4.987656972283286e-05, + "loss": 2.6147, + "mean_token_accuracy": 0.36551723480224607, + "step": 81105 + }, + { + "epoch": 0.08169485345589073, + "grad_norm": 11.549175706480233, + "learning_rate": 4.987653052409658e-05, + "loss": 2.8872, + "mean_token_accuracy": 0.3517241418361664, + "step": 81110 + }, + { + "epoch": 0.0816998895089949, + "grad_norm": 11.680179664268882, + "learning_rate": 4.98764913191541e-05, + "loss": 2.3128, + "mean_token_accuracy": 0.428078818321228, + "step": 81115 + }, + { + "epoch": 0.08170492556209907, + "grad_norm": 11.162243522567726, + "learning_rate": 4.98764521080054e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.4379310369491577, + "step": 81120 + }, + { + "epoch": 0.08170996161520323, + "grad_norm": 9.822567084184392, + "learning_rate": 4.987641289065052e-05, + "loss": 2.6185, + "mean_token_accuracy": 0.3620689630508423, + "step": 81125 + }, + { + "epoch": 0.08171499766830741, + "grad_norm": 8.93756438664108, + "learning_rate": 4.987637366708945e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.4223835408687592, + "step": 81130 + }, + { + "epoch": 0.08172003372141158, + "grad_norm": 14.753175349206241, + "learning_rate": 4.987633443732222e-05, + "loss": 2.3147, + "mean_token_accuracy": 0.44827585816383364, + "step": 81135 + }, + { + "epoch": 0.08172506977451575, + "grad_norm": 9.73792950577434, + "learning_rate": 4.987629520134883e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.5068965494632721, + "step": 81140 + }, + { + "epoch": 0.08173010582761993, + "grad_norm": 10.919551548041344, + "learning_rate": 4.987625595916928e-05, + "loss": 2.6392, + "mean_token_accuracy": 0.358620685338974, + "step": 81145 + }, + { + "epoch": 0.0817351418807241, + "grad_norm": 11.258242545266995, + "learning_rate": 4.98762167107836e-05, + "loss": 2.8099, + "mean_token_accuracy": 0.36896551847457887, + "step": 81150 + }, + { + "epoch": 0.08174017793382828, + "grad_norm": 11.035441564656917, + "learning_rate": 4.9876177456191785e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.4758620738983154, + "step": 81155 + }, + { + "epoch": 0.08174521398693245, + "grad_norm": 11.419492494209264, + "learning_rate": 4.987613819539386e-05, + "loss": 2.184, + "mean_token_accuracy": 0.4517241358757019, + "step": 81160 + }, + { + "epoch": 0.08175025004003662, + "grad_norm": 15.362632091442393, + "learning_rate": 4.987609892838983e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.379310342669487, + "step": 81165 + }, + { + "epoch": 0.0817552860931408, + "grad_norm": 10.948886071885237, + "learning_rate": 4.98760596551797e-05, + "loss": 2.7753, + "mean_token_accuracy": 0.41034482717514037, + "step": 81170 + }, + { + "epoch": 0.08176032214624497, + "grad_norm": 10.424296943075129, + "learning_rate": 4.987602037576349e-05, + "loss": 2.5047, + "mean_token_accuracy": 0.4448275864124298, + "step": 81175 + }, + { + "epoch": 0.08176535819934914, + "grad_norm": 10.26206713251904, + "learning_rate": 4.987598109014121e-05, + "loss": 2.4, + "mean_token_accuracy": 0.38275861740112305, + "step": 81180 + }, + { + "epoch": 0.08177039425245332, + "grad_norm": 11.038373921761233, + "learning_rate": 4.9875941798312866e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.43793103098869324, + "step": 81185 + }, + { + "epoch": 0.08177543030555749, + "grad_norm": 10.152708213978844, + "learning_rate": 4.987590250027846e-05, + "loss": 2.209, + "mean_token_accuracy": 0.4551724076271057, + "step": 81190 + }, + { + "epoch": 0.08178046635866165, + "grad_norm": 9.82734789084404, + "learning_rate": 4.987586319603803e-05, + "loss": 2.4812, + "mean_token_accuracy": 0.4206896543502808, + "step": 81195 + }, + { + "epoch": 0.08178550241176583, + "grad_norm": 10.750970673794946, + "learning_rate": 4.9875823885591565e-05, + "loss": 2.489, + "mean_token_accuracy": 0.458620685338974, + "step": 81200 + }, + { + "epoch": 0.08179053846487, + "grad_norm": 15.08769117332296, + "learning_rate": 4.9875784568939085e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.42068964838981626, + "step": 81205 + }, + { + "epoch": 0.08179557451797417, + "grad_norm": 15.253691948709307, + "learning_rate": 4.98757452460806e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.3896551728248596, + "step": 81210 + }, + { + "epoch": 0.08180061057107835, + "grad_norm": 10.295033977631194, + "learning_rate": 4.9875705917016105e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.45704779028892517, + "step": 81215 + }, + { + "epoch": 0.08180564662418252, + "grad_norm": 11.975836658117911, + "learning_rate": 4.987566658174564e-05, + "loss": 2.3585, + "mean_token_accuracy": 0.4034482717514038, + "step": 81220 + }, + { + "epoch": 0.0818106826772867, + "grad_norm": 9.948566422066165, + "learning_rate": 4.9875627240269184e-05, + "loss": 2.4004, + "mean_token_accuracy": 0.43793103098869324, + "step": 81225 + }, + { + "epoch": 0.08181571873039087, + "grad_norm": 12.307416332226992, + "learning_rate": 4.987558789258678e-05, + "loss": 2.7307, + "mean_token_accuracy": 0.3862069010734558, + "step": 81230 + }, + { + "epoch": 0.08182075478349504, + "grad_norm": 12.002340596906384, + "learning_rate": 4.987554853869842e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.4533575356006622, + "step": 81235 + }, + { + "epoch": 0.08182579083659922, + "grad_norm": 13.23536567022831, + "learning_rate": 4.987550917860411e-05, + "loss": 2.8642, + "mean_token_accuracy": 0.3428917109966278, + "step": 81240 + }, + { + "epoch": 0.08183082688970339, + "grad_norm": 11.444329663869775, + "learning_rate": 4.9875469812303874e-05, + "loss": 2.6881, + "mean_token_accuracy": 0.3896551728248596, + "step": 81245 + }, + { + "epoch": 0.08183586294280756, + "grad_norm": 10.337526132040864, + "learning_rate": 4.9875430439797716e-05, + "loss": 2.1752, + "mean_token_accuracy": 0.43260737657547, + "step": 81250 + }, + { + "epoch": 0.08184089899591174, + "grad_norm": 10.420520038880248, + "learning_rate": 4.987539106108566e-05, + "loss": 2.3796, + "mean_token_accuracy": 0.42413793206214906, + "step": 81255 + }, + { + "epoch": 0.08184593504901591, + "grad_norm": 12.108330417189388, + "learning_rate": 4.987535167616769e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.4482758641242981, + "step": 81260 + }, + { + "epoch": 0.08185097110212007, + "grad_norm": 17.833796206525847, + "learning_rate": 4.987531228504384e-05, + "loss": 2.5276, + "mean_token_accuracy": 0.4034482777118683, + "step": 81265 + }, + { + "epoch": 0.08185600715522424, + "grad_norm": 8.032582884638982, + "learning_rate": 4.987527288771412e-05, + "loss": 2.0377, + "mean_token_accuracy": 0.5008620738983154, + "step": 81270 + }, + { + "epoch": 0.08186104320832842, + "grad_norm": 10.342873646082817, + "learning_rate": 4.9875233484178524e-05, + "loss": 2.44, + "mean_token_accuracy": 0.38620689511299133, + "step": 81275 + }, + { + "epoch": 0.08186607926143259, + "grad_norm": 12.889865072343847, + "learning_rate": 4.9875194074437076e-05, + "loss": 2.8287, + "mean_token_accuracy": 0.34137930274009703, + "step": 81280 + }, + { + "epoch": 0.08187111531453677, + "grad_norm": 12.483038298095767, + "learning_rate": 4.9875154658489784e-05, + "loss": 2.6404, + "mean_token_accuracy": 0.38275861740112305, + "step": 81285 + }, + { + "epoch": 0.08187615136764094, + "grad_norm": 11.282489430913788, + "learning_rate": 4.987511523633667e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.4034482777118683, + "step": 81290 + }, + { + "epoch": 0.08188118742074511, + "grad_norm": 12.31106979400585, + "learning_rate": 4.987507580797772e-05, + "loss": 2.1622, + "mean_token_accuracy": 0.48154870271682737, + "step": 81295 + }, + { + "epoch": 0.08188622347384929, + "grad_norm": 12.385695953066737, + "learning_rate": 4.987503637341296e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.4482758641242981, + "step": 81300 + }, + { + "epoch": 0.08189125952695346, + "grad_norm": 10.844347294984537, + "learning_rate": 4.9874996932642406e-05, + "loss": 2.7856, + "mean_token_accuracy": 0.4190562665462494, + "step": 81305 + }, + { + "epoch": 0.08189629558005763, + "grad_norm": 11.500084795989771, + "learning_rate": 4.987495748566606e-05, + "loss": 2.8147, + "mean_token_accuracy": 0.37586206793785093, + "step": 81310 + }, + { + "epoch": 0.08190133163316181, + "grad_norm": 11.786504526093998, + "learning_rate": 4.987491803248394e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.4158499717712402, + "step": 81315 + }, + { + "epoch": 0.08190636768626598, + "grad_norm": 11.920140643983734, + "learning_rate": 4.9874878573096054e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.3703569233417511, + "step": 81320 + }, + { + "epoch": 0.08191140373937016, + "grad_norm": 7.397232508638606, + "learning_rate": 4.9874839107502405e-05, + "loss": 2.4173, + "mean_token_accuracy": 0.42982457280159, + "step": 81325 + }, + { + "epoch": 0.08191643979247433, + "grad_norm": 9.566002506836593, + "learning_rate": 4.987479963570302e-05, + "loss": 2.1633, + "mean_token_accuracy": 0.4862068951129913, + "step": 81330 + }, + { + "epoch": 0.08192147584557849, + "grad_norm": 10.2917756475441, + "learning_rate": 4.987476015769789e-05, + "loss": 2.1104, + "mean_token_accuracy": 0.4448275864124298, + "step": 81335 + }, + { + "epoch": 0.08192651189868266, + "grad_norm": 11.220089186264747, + "learning_rate": 4.9874720673487043e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.4344827592372894, + "step": 81340 + }, + { + "epoch": 0.08193154795178684, + "grad_norm": 12.107307079566278, + "learning_rate": 4.9874681183070485e-05, + "loss": 2.8069, + "mean_token_accuracy": 0.420689657330513, + "step": 81345 + }, + { + "epoch": 0.08193658400489101, + "grad_norm": 10.745517735179662, + "learning_rate": 4.9874641686448224e-05, + "loss": 2.6981, + "mean_token_accuracy": 0.4068965494632721, + "step": 81350 + }, + { + "epoch": 0.08194162005799518, + "grad_norm": 11.857550199321265, + "learning_rate": 4.9874602183620274e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.40689654350280763, + "step": 81355 + }, + { + "epoch": 0.08194665611109936, + "grad_norm": 10.46215998687996, + "learning_rate": 4.987456267458664e-05, + "loss": 2.7538, + "mean_token_accuracy": 0.3896551728248596, + "step": 81360 + }, + { + "epoch": 0.08195169216420353, + "grad_norm": 13.716299429833622, + "learning_rate": 4.987452315934734e-05, + "loss": 2.6918, + "mean_token_accuracy": 0.4068965554237366, + "step": 81365 + }, + { + "epoch": 0.0819567282173077, + "grad_norm": 10.33593890629514, + "learning_rate": 4.987448363790239e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.4206896543502808, + "step": 81370 + }, + { + "epoch": 0.08196176427041188, + "grad_norm": 9.765382202081087, + "learning_rate": 4.9874444110251785e-05, + "loss": 1.9348, + "mean_token_accuracy": 0.4841133117675781, + "step": 81375 + }, + { + "epoch": 0.08196680032351605, + "grad_norm": 11.726860739314754, + "learning_rate": 4.987440457639554e-05, + "loss": 2.7666, + "mean_token_accuracy": 0.3620689630508423, + "step": 81380 + }, + { + "epoch": 0.08197183637662023, + "grad_norm": 9.475170778713064, + "learning_rate": 4.9874365036333684e-05, + "loss": 2.1273, + "mean_token_accuracy": 0.475274395942688, + "step": 81385 + }, + { + "epoch": 0.0819768724297244, + "grad_norm": 11.093570151975394, + "learning_rate": 4.9874325490066205e-05, + "loss": 2.2124, + "mean_token_accuracy": 0.47586206197738645, + "step": 81390 + }, + { + "epoch": 0.08198190848282857, + "grad_norm": 12.448440988637792, + "learning_rate": 4.9874285937593126e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.4034482717514038, + "step": 81395 + }, + { + "epoch": 0.08198694453593275, + "grad_norm": 13.249236770524718, + "learning_rate": 4.987424637891446e-05, + "loss": 2.7062, + "mean_token_accuracy": 0.39655172228813174, + "step": 81400 + }, + { + "epoch": 0.08199198058903691, + "grad_norm": 10.82851623829469, + "learning_rate": 4.9874206814030207e-05, + "loss": 2.6589, + "mean_token_accuracy": 0.3827586233615875, + "step": 81405 + }, + { + "epoch": 0.08199701664214108, + "grad_norm": 10.449966397354737, + "learning_rate": 4.987416724294039e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.4068965494632721, + "step": 81410 + }, + { + "epoch": 0.08200205269524526, + "grad_norm": 9.507344802305338, + "learning_rate": 4.987412766564501e-05, + "loss": 2.8058, + "mean_token_accuracy": 0.4020568609237671, + "step": 81415 + }, + { + "epoch": 0.08200708874834943, + "grad_norm": 13.236516356606005, + "learning_rate": 4.9874088082144074e-05, + "loss": 2.5722, + "mean_token_accuracy": 0.3931034475564957, + "step": 81420 + }, + { + "epoch": 0.0820121248014536, + "grad_norm": 10.513473207006554, + "learning_rate": 4.9874048492437615e-05, + "loss": 2.0874, + "mean_token_accuracy": 0.4482758641242981, + "step": 81425 + }, + { + "epoch": 0.08201716085455778, + "grad_norm": 10.152692613411277, + "learning_rate": 4.9874008896525624e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.43297035694122316, + "step": 81430 + }, + { + "epoch": 0.08202219690766195, + "grad_norm": 10.125156631137767, + "learning_rate": 4.987396929440812e-05, + "loss": 3.0436, + "mean_token_accuracy": 0.3879612863063812, + "step": 81435 + }, + { + "epoch": 0.08202723296076612, + "grad_norm": 11.240906940550802, + "learning_rate": 4.987392968608511e-05, + "loss": 2.4295, + "mean_token_accuracy": 0.41034482717514037, + "step": 81440 + }, + { + "epoch": 0.0820322690138703, + "grad_norm": 14.689263014332754, + "learning_rate": 4.98738900715566e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.43793103098869324, + "step": 81445 + }, + { + "epoch": 0.08203730506697447, + "grad_norm": 13.140273078749582, + "learning_rate": 4.987385045082262e-05, + "loss": 2.9028, + "mean_token_accuracy": 0.38112522959709166, + "step": 81450 + }, + { + "epoch": 0.08204234112007865, + "grad_norm": 12.178984734427017, + "learning_rate": 4.987381082388317e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.37931033968925476, + "step": 81455 + }, + { + "epoch": 0.08204737717318282, + "grad_norm": 12.05904797084661, + "learning_rate": 4.987377119073825e-05, + "loss": 2.6447, + "mean_token_accuracy": 0.38275861740112305, + "step": 81460 + }, + { + "epoch": 0.082052413226287, + "grad_norm": 8.257418900540168, + "learning_rate": 4.987373155138789e-05, + "loss": 2.0941, + "mean_token_accuracy": 0.49655172824859617, + "step": 81465 + }, + { + "epoch": 0.08205744927939117, + "grad_norm": 12.347566438871892, + "learning_rate": 4.987369190583208e-05, + "loss": 2.6886, + "mean_token_accuracy": 0.4034482777118683, + "step": 81470 + }, + { + "epoch": 0.08206248533249533, + "grad_norm": 10.50822540173339, + "learning_rate": 4.987365225407085e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.39655172228813174, + "step": 81475 + }, + { + "epoch": 0.0820675213855995, + "grad_norm": 17.157931285693447, + "learning_rate": 4.9873612596104216e-05, + "loss": 2.5032, + "mean_token_accuracy": 0.4068965494632721, + "step": 81480 + }, + { + "epoch": 0.08207255743870367, + "grad_norm": 11.236838451856055, + "learning_rate": 4.9873572931932164e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.43793103098869324, + "step": 81485 + }, + { + "epoch": 0.08207759349180785, + "grad_norm": 11.985930629826742, + "learning_rate": 4.9873533261554714e-05, + "loss": 2.7209, + "mean_token_accuracy": 0.379310342669487, + "step": 81490 + }, + { + "epoch": 0.08208262954491202, + "grad_norm": 10.388284944476947, + "learning_rate": 4.9873493584971894e-05, + "loss": 2.7286, + "mean_token_accuracy": 0.3896551728248596, + "step": 81495 + }, + { + "epoch": 0.0820876655980162, + "grad_norm": 13.085601354797138, + "learning_rate": 4.987345390218369e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.41034482717514037, + "step": 81500 + }, + { + "epoch": 0.08209270165112037, + "grad_norm": 13.915127084540062, + "learning_rate": 4.987341421319014e-05, + "loss": 2.581, + "mean_token_accuracy": 0.37241379618644715, + "step": 81505 + }, + { + "epoch": 0.08209773770422454, + "grad_norm": 12.721252808747396, + "learning_rate": 4.987337451799122e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4448275983333588, + "step": 81510 + }, + { + "epoch": 0.08210277375732872, + "grad_norm": 10.936574024571263, + "learning_rate": 4.9873334816586973e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.38965516686439516, + "step": 81515 + }, + { + "epoch": 0.08210780981043289, + "grad_norm": 11.902919384748596, + "learning_rate": 4.9873295108977404e-05, + "loss": 2.6235, + "mean_token_accuracy": 0.3827586263418198, + "step": 81520 + }, + { + "epoch": 0.08211284586353707, + "grad_norm": 9.358887264122853, + "learning_rate": 4.987325539516251e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.44827587008476255, + "step": 81525 + }, + { + "epoch": 0.08211788191664124, + "grad_norm": 11.517641638446594, + "learning_rate": 4.987321567514231e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.36896551847457887, + "step": 81530 + }, + { + "epoch": 0.08212291796974541, + "grad_norm": 11.280099649574352, + "learning_rate": 4.9873175948916815e-05, + "loss": 2.5016, + "mean_token_accuracy": 0.4586206912994385, + "step": 81535 + }, + { + "epoch": 0.08212795402284959, + "grad_norm": 10.617870411712952, + "learning_rate": 4.9873136216486036e-05, + "loss": 2.3628, + "mean_token_accuracy": 0.43103448748588563, + "step": 81540 + }, + { + "epoch": 0.08213299007595375, + "grad_norm": 10.20799085076727, + "learning_rate": 4.987309647784999e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.3981851160526276, + "step": 81545 + }, + { + "epoch": 0.08213802612905792, + "grad_norm": 11.518904351515944, + "learning_rate": 4.987305673300868e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.39310344457626345, + "step": 81550 + }, + { + "epoch": 0.0821430621821621, + "grad_norm": 15.303858250456429, + "learning_rate": 4.9873016981962116e-05, + "loss": 2.8698, + "mean_token_accuracy": 0.3620689660310745, + "step": 81555 + }, + { + "epoch": 0.08214809823526627, + "grad_norm": 15.498837697743825, + "learning_rate": 4.987297722471032e-05, + "loss": 2.6036, + "mean_token_accuracy": 0.3620689630508423, + "step": 81560 + }, + { + "epoch": 0.08215313428837044, + "grad_norm": 11.756474935370587, + "learning_rate": 4.9872937461253285e-05, + "loss": 2.497, + "mean_token_accuracy": 0.4052026689052582, + "step": 81565 + }, + { + "epoch": 0.08215817034147462, + "grad_norm": 14.289805961167959, + "learning_rate": 4.987289769159104e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.45517241954803467, + "step": 81570 + }, + { + "epoch": 0.08216320639457879, + "grad_norm": 12.28272871661099, + "learning_rate": 4.987285791572358e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.36896551847457887, + "step": 81575 + }, + { + "epoch": 0.08216824244768296, + "grad_norm": 10.699613980549676, + "learning_rate": 4.987281813365093e-05, + "loss": 2.5454, + "mean_token_accuracy": 0.39310344457626345, + "step": 81580 + }, + { + "epoch": 0.08217327850078714, + "grad_norm": 11.685694707438968, + "learning_rate": 4.98727783453731e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.4000000059604645, + "step": 81585 + }, + { + "epoch": 0.08217831455389131, + "grad_norm": 23.34661911038643, + "learning_rate": 4.987273855089009e-05, + "loss": 2.7759, + "mean_token_accuracy": 0.3965517163276672, + "step": 81590 + }, + { + "epoch": 0.08218335060699548, + "grad_norm": 13.067482193216819, + "learning_rate": 4.987269875020191e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.4034482717514038, + "step": 81595 + }, + { + "epoch": 0.08218838666009966, + "grad_norm": 10.427030374886247, + "learning_rate": 4.9872658943308595e-05, + "loss": 2.6692, + "mean_token_accuracy": 0.36896551847457887, + "step": 81600 + }, + { + "epoch": 0.08219342271320383, + "grad_norm": 12.070745073352324, + "learning_rate": 4.987261913021013e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.4068965554237366, + "step": 81605 + }, + { + "epoch": 0.082198458766308, + "grad_norm": 13.252374773247201, + "learning_rate": 4.987257931090654e-05, + "loss": 2.6322, + "mean_token_accuracy": 0.3965517163276672, + "step": 81610 + }, + { + "epoch": 0.08220349481941217, + "grad_norm": 10.865300776872681, + "learning_rate": 4.987253948539783e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.3965517163276672, + "step": 81615 + }, + { + "epoch": 0.08220853087251634, + "grad_norm": 10.05526959551301, + "learning_rate": 4.987249965368401e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.41034482717514037, + "step": 81620 + }, + { + "epoch": 0.08221356692562051, + "grad_norm": 11.626754750788221, + "learning_rate": 4.9872459815765094e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.44482758045196535, + "step": 81625 + }, + { + "epoch": 0.08221860297872469, + "grad_norm": 9.994483452143669, + "learning_rate": 4.9872419971641095e-05, + "loss": 2.1402, + "mean_token_accuracy": 0.44482758045196535, + "step": 81630 + }, + { + "epoch": 0.08222363903182886, + "grad_norm": 14.135512749617225, + "learning_rate": 4.987238012131202e-05, + "loss": 2.7536, + "mean_token_accuracy": 0.37931033968925476, + "step": 81635 + }, + { + "epoch": 0.08222867508493303, + "grad_norm": 11.11952855182373, + "learning_rate": 4.987234026477788e-05, + "loss": 2.6523, + "mean_token_accuracy": 0.4379310429096222, + "step": 81640 + }, + { + "epoch": 0.08223371113803721, + "grad_norm": 10.84750169684057, + "learning_rate": 4.987230040203869e-05, + "loss": 2.4172, + "mean_token_accuracy": 0.4103448212146759, + "step": 81645 + }, + { + "epoch": 0.08223874719114138, + "grad_norm": 10.89622725244941, + "learning_rate": 4.987226053309446e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.42758620977401735, + "step": 81650 + }, + { + "epoch": 0.08224378324424556, + "grad_norm": 14.345987262528412, + "learning_rate": 4.98722206579452e-05, + "loss": 2.6574, + "mean_token_accuracy": 0.37586206793785093, + "step": 81655 + }, + { + "epoch": 0.08224881929734973, + "grad_norm": 10.496282400150783, + "learning_rate": 4.987218077659093e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.441379314661026, + "step": 81660 + }, + { + "epoch": 0.0822538553504539, + "grad_norm": 12.776862838804494, + "learning_rate": 4.9872140889031635e-05, + "loss": 2.7211, + "mean_token_accuracy": 0.37586206793785093, + "step": 81665 + }, + { + "epoch": 0.08225889140355808, + "grad_norm": 11.222362330731583, + "learning_rate": 4.987210099526735e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.42758620381355283, + "step": 81670 + }, + { + "epoch": 0.08226392745666225, + "grad_norm": 11.672519332361157, + "learning_rate": 4.9872061095298084e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.46031458377838136, + "step": 81675 + }, + { + "epoch": 0.08226896350976642, + "grad_norm": 10.197068900600106, + "learning_rate": 4.987202118912384e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.39310344457626345, + "step": 81680 + }, + { + "epoch": 0.08227399956287058, + "grad_norm": 12.669812135959248, + "learning_rate": 4.987198127674464e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.35862069129943847, + "step": 81685 + }, + { + "epoch": 0.08227903561597476, + "grad_norm": 10.173115569780382, + "learning_rate": 4.987194135816048e-05, + "loss": 2.3556, + "mean_token_accuracy": 0.42758620381355283, + "step": 81690 + }, + { + "epoch": 0.08228407166907893, + "grad_norm": 10.16899322916087, + "learning_rate": 4.987190143337138e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.36551724672317504, + "step": 81695 + }, + { + "epoch": 0.0822891077221831, + "grad_norm": 10.727720209556825, + "learning_rate": 4.987186150237734e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.42758620977401735, + "step": 81700 + }, + { + "epoch": 0.08229414377528728, + "grad_norm": 10.525654188297539, + "learning_rate": 4.9871821565178394e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.47241379618644713, + "step": 81705 + }, + { + "epoch": 0.08229917982839145, + "grad_norm": 10.37779794688868, + "learning_rate": 4.987178162177453e-05, + "loss": 2.5508, + "mean_token_accuracy": 0.38965516686439516, + "step": 81710 + }, + { + "epoch": 0.08230421588149563, + "grad_norm": 10.865760064256426, + "learning_rate": 4.987174167216578e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.38620689511299133, + "step": 81715 + }, + { + "epoch": 0.0823092519345998, + "grad_norm": 9.424715212643328, + "learning_rate": 4.987170171635214e-05, + "loss": 2.8124, + "mean_token_accuracy": 0.37241377830505373, + "step": 81720 + }, + { + "epoch": 0.08231428798770397, + "grad_norm": 11.474249265987654, + "learning_rate": 4.987166175433361e-05, + "loss": 2.6937, + "mean_token_accuracy": 0.36896551847457887, + "step": 81725 + }, + { + "epoch": 0.08231932404080815, + "grad_norm": 14.227867848242319, + "learning_rate": 4.987162178611024e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.41379310488700866, + "step": 81730 + }, + { + "epoch": 0.08232436009391232, + "grad_norm": 14.620717487780217, + "learning_rate": 4.9871581811682014e-05, + "loss": 2.7476, + "mean_token_accuracy": 0.36206896901130675, + "step": 81735 + }, + { + "epoch": 0.0823293961470165, + "grad_norm": 10.673865826163299, + "learning_rate": 4.987154183104893e-05, + "loss": 2.2111, + "mean_token_accuracy": 0.41724138259887694, + "step": 81740 + }, + { + "epoch": 0.08233443220012067, + "grad_norm": 10.240199485641792, + "learning_rate": 4.987150184421102e-05, + "loss": 2.8363, + "mean_token_accuracy": 0.3689655214548111, + "step": 81745 + }, + { + "epoch": 0.08233946825322484, + "grad_norm": 14.392806777788987, + "learning_rate": 4.98714618511683e-05, + "loss": 2.7746, + "mean_token_accuracy": 0.3896551728248596, + "step": 81750 + }, + { + "epoch": 0.082344504306329, + "grad_norm": 10.61370188078038, + "learning_rate": 4.987142185192078e-05, + "loss": 2.7205, + "mean_token_accuracy": 0.38747731447219846, + "step": 81755 + }, + { + "epoch": 0.08234954035943318, + "grad_norm": 12.577605056208963, + "learning_rate": 4.9871381846468443e-05, + "loss": 3.0185, + "mean_token_accuracy": 0.3482758581638336, + "step": 81760 + }, + { + "epoch": 0.08235457641253735, + "grad_norm": 10.137700347648314, + "learning_rate": 4.987134183481132e-05, + "loss": 2.1899, + "mean_token_accuracy": 0.4034482777118683, + "step": 81765 + }, + { + "epoch": 0.08235961246564152, + "grad_norm": 12.604500774433353, + "learning_rate": 4.987130181694943e-05, + "loss": 2.3482, + "mean_token_accuracy": 0.4413793087005615, + "step": 81770 + }, + { + "epoch": 0.0823646485187457, + "grad_norm": 9.320877205301201, + "learning_rate": 4.987126179288277e-05, + "loss": 1.9868, + "mean_token_accuracy": 0.48620688915252686, + "step": 81775 + }, + { + "epoch": 0.08236968457184987, + "grad_norm": 11.342020207347884, + "learning_rate": 4.987122176261136e-05, + "loss": 2.6241, + "mean_token_accuracy": 0.36896551251411436, + "step": 81780 + }, + { + "epoch": 0.08237472062495405, + "grad_norm": 11.416143382131429, + "learning_rate": 4.9871181726135216e-05, + "loss": 2.8221, + "mean_token_accuracy": 0.4, + "step": 81785 + }, + { + "epoch": 0.08237975667805822, + "grad_norm": 11.532615481089342, + "learning_rate": 4.9871141683454334e-05, + "loss": 2.6775, + "mean_token_accuracy": 0.42068964838981626, + "step": 81790 + }, + { + "epoch": 0.08238479273116239, + "grad_norm": 11.167263412310776, + "learning_rate": 4.9871101634568726e-05, + "loss": 2.4119, + "mean_token_accuracy": 0.43103447556495667, + "step": 81795 + }, + { + "epoch": 0.08238982878426657, + "grad_norm": 9.038242018533337, + "learning_rate": 4.9871061579478425e-05, + "loss": 2.7853, + "mean_token_accuracy": 0.3896551728248596, + "step": 81800 + }, + { + "epoch": 0.08239486483737074, + "grad_norm": 13.83529029876891, + "learning_rate": 4.987102151818341e-05, + "loss": 2.8959, + "mean_token_accuracy": 0.417241370677948, + "step": 81805 + }, + { + "epoch": 0.08239990089047491, + "grad_norm": 10.071578066235098, + "learning_rate": 4.9870981450683726e-05, + "loss": 2.4057, + "mean_token_accuracy": 0.3965517282485962, + "step": 81810 + }, + { + "epoch": 0.08240493694357909, + "grad_norm": 8.780635461193746, + "learning_rate": 4.9870941376979355e-05, + "loss": 2.1417, + "mean_token_accuracy": 0.5055051445960999, + "step": 81815 + }, + { + "epoch": 0.08240997299668326, + "grad_norm": 11.764658599877748, + "learning_rate": 4.987090129707032e-05, + "loss": 2.1675, + "mean_token_accuracy": 0.47586206197738645, + "step": 81820 + }, + { + "epoch": 0.08241500904978742, + "grad_norm": 10.50080843552927, + "learning_rate": 4.9870861210956636e-05, + "loss": 2.3203, + "mean_token_accuracy": 0.4206896543502808, + "step": 81825 + }, + { + "epoch": 0.0824200451028916, + "grad_norm": 10.950683623588455, + "learning_rate": 4.987082111863831e-05, + "loss": 2.5013, + "mean_token_accuracy": 0.4862068951129913, + "step": 81830 + }, + { + "epoch": 0.08242508115599577, + "grad_norm": 9.728412376229704, + "learning_rate": 4.987078102011535e-05, + "loss": 2.9069, + "mean_token_accuracy": 0.3931034505367279, + "step": 81835 + }, + { + "epoch": 0.08243011720909994, + "grad_norm": 10.352568670066955, + "learning_rate": 4.987074091538777e-05, + "loss": 2.8801, + "mean_token_accuracy": 0.42413792610168455, + "step": 81840 + }, + { + "epoch": 0.08243515326220412, + "grad_norm": 10.76995682927042, + "learning_rate": 4.9870700804455586e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.3999999940395355, + "step": 81845 + }, + { + "epoch": 0.08244018931530829, + "grad_norm": 14.49241323472634, + "learning_rate": 4.987066068731881e-05, + "loss": 2.785, + "mean_token_accuracy": 0.38275861740112305, + "step": 81850 + }, + { + "epoch": 0.08244522536841246, + "grad_norm": 10.51078363191253, + "learning_rate": 4.987062056397743e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.3379310339689255, + "step": 81855 + }, + { + "epoch": 0.08245026142151664, + "grad_norm": 12.917833298624373, + "learning_rate": 4.987058043443149e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.4382335126399994, + "step": 81860 + }, + { + "epoch": 0.08245529747462081, + "grad_norm": 14.729484288394806, + "learning_rate": 4.987054029868098e-05, + "loss": 2.8656, + "mean_token_accuracy": 0.4049606740474701, + "step": 81865 + }, + { + "epoch": 0.08246033352772499, + "grad_norm": 10.457324489532406, + "learning_rate": 4.987050015672592e-05, + "loss": 2.2777, + "mean_token_accuracy": 0.4413793087005615, + "step": 81870 + }, + { + "epoch": 0.08246536958082916, + "grad_norm": 11.187049451528614, + "learning_rate": 4.9870460008566315e-05, + "loss": 2.4473, + "mean_token_accuracy": 0.4344827592372894, + "step": 81875 + }, + { + "epoch": 0.08247040563393333, + "grad_norm": 10.217322086023508, + "learning_rate": 4.987041985420218e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.35862069129943847, + "step": 81880 + }, + { + "epoch": 0.0824754416870375, + "grad_norm": 10.32136740904889, + "learning_rate": 4.987037969363354e-05, + "loss": 2.3202, + "mean_token_accuracy": 0.4724137902259827, + "step": 81885 + }, + { + "epoch": 0.08248047774014168, + "grad_norm": 12.401026039110102, + "learning_rate": 4.9870339526860364e-05, + "loss": 2.2188, + "mean_token_accuracy": 0.4758620738983154, + "step": 81890 + }, + { + "epoch": 0.08248551379324584, + "grad_norm": 9.001122061774357, + "learning_rate": 4.987029935388271e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.36896551847457887, + "step": 81895 + }, + { + "epoch": 0.08249054984635001, + "grad_norm": 9.872967106819132, + "learning_rate": 4.9870259174700576e-05, + "loss": 3.1086, + "mean_token_accuracy": 0.36896551847457887, + "step": 81900 + }, + { + "epoch": 0.08249558589945419, + "grad_norm": 10.563606782590925, + "learning_rate": 4.987021898931395e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.4068965494632721, + "step": 81905 + }, + { + "epoch": 0.08250062195255836, + "grad_norm": 10.618502337308142, + "learning_rate": 4.9870178797722865e-05, + "loss": 2.6355, + "mean_token_accuracy": 0.41379310488700866, + "step": 81910 + }, + { + "epoch": 0.08250565800566254, + "grad_norm": 10.174453803502516, + "learning_rate": 4.987013859992733e-05, + "loss": 2.338, + "mean_token_accuracy": 0.43793103098869324, + "step": 81915 + }, + { + "epoch": 0.08251069405876671, + "grad_norm": 9.687709146429334, + "learning_rate": 4.9870098395927354e-05, + "loss": 2.4959, + "mean_token_accuracy": 0.38275861740112305, + "step": 81920 + }, + { + "epoch": 0.08251573011187088, + "grad_norm": 10.633532069546701, + "learning_rate": 4.987005818572295e-05, + "loss": 2.435, + "mean_token_accuracy": 0.3931034505367279, + "step": 81925 + }, + { + "epoch": 0.08252076616497506, + "grad_norm": 8.707036057138948, + "learning_rate": 4.987001796931412e-05, + "loss": 2.4132, + "mean_token_accuracy": 0.4620689630508423, + "step": 81930 + }, + { + "epoch": 0.08252580221807923, + "grad_norm": 12.281550205326257, + "learning_rate": 4.9869977746700884e-05, + "loss": 1.981, + "mean_token_accuracy": 0.48965516686439514, + "step": 81935 + }, + { + "epoch": 0.0825308382711834, + "grad_norm": 11.734018832835128, + "learning_rate": 4.986993751788326e-05, + "loss": 2.7577, + "mean_token_accuracy": 0.3482758581638336, + "step": 81940 + }, + { + "epoch": 0.08253587432428758, + "grad_norm": 14.694972533357946, + "learning_rate": 4.986989728286124e-05, + "loss": 2.7719, + "mean_token_accuracy": 0.36551723480224607, + "step": 81945 + }, + { + "epoch": 0.08254091037739175, + "grad_norm": 11.969210016525244, + "learning_rate": 4.986985704163485e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.4344827592372894, + "step": 81950 + }, + { + "epoch": 0.08254594643049593, + "grad_norm": 10.201358146922619, + "learning_rate": 4.98698167942041e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.4655172526836395, + "step": 81955 + }, + { + "epoch": 0.0825509824836001, + "grad_norm": 12.047767625250296, + "learning_rate": 4.986977654056899e-05, + "loss": 2.9362, + "mean_token_accuracy": 0.32758620381355286, + "step": 81960 + }, + { + "epoch": 0.08255601853670426, + "grad_norm": 10.21606920406701, + "learning_rate": 4.986973628072954e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.4, + "step": 81965 + }, + { + "epoch": 0.08256105458980843, + "grad_norm": 20.35672984140577, + "learning_rate": 4.9869696014685766e-05, + "loss": 3.4923, + "mean_token_accuracy": 0.3517241418361664, + "step": 81970 + }, + { + "epoch": 0.0825660906429126, + "grad_norm": 11.084608581569237, + "learning_rate": 4.986965574243767e-05, + "loss": 2.2638, + "mean_token_accuracy": 0.44482759237289426, + "step": 81975 + }, + { + "epoch": 0.08257112669601678, + "grad_norm": 18.233912395461548, + "learning_rate": 4.986961546398527e-05, + "loss": 2.6584, + "mean_token_accuracy": 0.40689654350280763, + "step": 81980 + }, + { + "epoch": 0.08257616274912095, + "grad_norm": 11.384874414194416, + "learning_rate": 4.986957517932857e-05, + "loss": 2.1579, + "mean_token_accuracy": 0.49165154695510865, + "step": 81985 + }, + { + "epoch": 0.08258119880222513, + "grad_norm": 12.94025713846464, + "learning_rate": 4.9869534888467584e-05, + "loss": 2.5271, + "mean_token_accuracy": 0.38965516686439516, + "step": 81990 + }, + { + "epoch": 0.0825862348553293, + "grad_norm": 11.449083551705915, + "learning_rate": 4.9869494591402325e-05, + "loss": 2.6842, + "mean_token_accuracy": 0.36896551847457887, + "step": 81995 + }, + { + "epoch": 0.08259127090843348, + "grad_norm": 13.377856212639013, + "learning_rate": 4.986945428813281e-05, + "loss": 2.6191, + "mean_token_accuracy": 0.4034482717514038, + "step": 82000 + }, + { + "epoch": 0.08259630696153765, + "grad_norm": 11.388965822941854, + "learning_rate": 4.986941397865903e-05, + "loss": 2.5936, + "mean_token_accuracy": 0.36551723480224607, + "step": 82005 + }, + { + "epoch": 0.08260134301464182, + "grad_norm": 11.286257685456427, + "learning_rate": 4.986937366298102e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.3896551787853241, + "step": 82010 + }, + { + "epoch": 0.082606379067746, + "grad_norm": 9.294967098094592, + "learning_rate": 4.9869333341098776e-05, + "loss": 2.1069, + "mean_token_accuracy": 0.5000000059604645, + "step": 82015 + }, + { + "epoch": 0.08261141512085017, + "grad_norm": 11.199128741270535, + "learning_rate": 4.986929301301232e-05, + "loss": 2.5909, + "mean_token_accuracy": 0.38620689511299133, + "step": 82020 + }, + { + "epoch": 0.08261645117395434, + "grad_norm": 14.091829685915462, + "learning_rate": 4.986925267872165e-05, + "loss": 2.875, + "mean_token_accuracy": 0.3310344755649567, + "step": 82025 + }, + { + "epoch": 0.08262148722705852, + "grad_norm": 11.736163944762179, + "learning_rate": 4.986921233822678e-05, + "loss": 2.731, + "mean_token_accuracy": 0.34137930870056155, + "step": 82030 + }, + { + "epoch": 0.08262652328016268, + "grad_norm": 13.325874216542108, + "learning_rate": 4.9869171991527736e-05, + "loss": 2.5499, + "mean_token_accuracy": 0.3965517282485962, + "step": 82035 + }, + { + "epoch": 0.08263155933326685, + "grad_norm": 11.927570398095131, + "learning_rate": 4.9869131638624514e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.41379311084747317, + "step": 82040 + }, + { + "epoch": 0.08263659538637103, + "grad_norm": 11.239408786162556, + "learning_rate": 4.9869091279517134e-05, + "loss": 2.5338, + "mean_token_accuracy": 0.4206896543502808, + "step": 82045 + }, + { + "epoch": 0.0826416314394752, + "grad_norm": 12.245354198665796, + "learning_rate": 4.98690509142056e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.38620689511299133, + "step": 82050 + }, + { + "epoch": 0.08264666749257937, + "grad_norm": 10.740303854170767, + "learning_rate": 4.9869010542689924e-05, + "loss": 2.6616, + "mean_token_accuracy": 0.4068965494632721, + "step": 82055 + }, + { + "epoch": 0.08265170354568355, + "grad_norm": 11.809428567162636, + "learning_rate": 4.986897016497012e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.4, + "step": 82060 + }, + { + "epoch": 0.08265673959878772, + "grad_norm": 9.6422481385882, + "learning_rate": 4.9868929781046206e-05, + "loss": 2.377, + "mean_token_accuracy": 0.4434966742992401, + "step": 82065 + }, + { + "epoch": 0.0826617756518919, + "grad_norm": 10.301294175866536, + "learning_rate": 4.9868889390918176e-05, + "loss": 2.1604, + "mean_token_accuracy": 0.43272837400436404, + "step": 82070 + }, + { + "epoch": 0.08266681170499607, + "grad_norm": 10.072941030867309, + "learning_rate": 4.986884899458606e-05, + "loss": 2.7969, + "mean_token_accuracy": 0.38620689511299133, + "step": 82075 + }, + { + "epoch": 0.08267184775810024, + "grad_norm": 10.914905542380184, + "learning_rate": 4.9868808592049855e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.4328493714332581, + "step": 82080 + }, + { + "epoch": 0.08267688381120442, + "grad_norm": 12.500324915024295, + "learning_rate": 4.986876818330958e-05, + "loss": 2.5834, + "mean_token_accuracy": 0.36206896901130675, + "step": 82085 + }, + { + "epoch": 0.08268191986430859, + "grad_norm": 11.138756071296362, + "learning_rate": 4.9868727768365236e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.39679802060127256, + "step": 82090 + }, + { + "epoch": 0.08268695591741276, + "grad_norm": 9.875283062794104, + "learning_rate": 4.9868687347216854e-05, + "loss": 2.2244, + "mean_token_accuracy": 0.39655172228813174, + "step": 82095 + }, + { + "epoch": 0.08269199197051694, + "grad_norm": 9.371911844877621, + "learning_rate": 4.986864691986442e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.4103448331356049, + "step": 82100 + }, + { + "epoch": 0.0826970280236211, + "grad_norm": 11.787498245271317, + "learning_rate": 4.986860648630797e-05, + "loss": 2.496, + "mean_token_accuracy": 0.4417487621307373, + "step": 82105 + }, + { + "epoch": 0.08270206407672527, + "grad_norm": 8.76541442878178, + "learning_rate": 4.9868566046547495e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.43103448748588563, + "step": 82110 + }, + { + "epoch": 0.08270710012982944, + "grad_norm": 11.857872328799202, + "learning_rate": 4.986852560058302e-05, + "loss": 2.1527, + "mean_token_accuracy": 0.49171203970909116, + "step": 82115 + }, + { + "epoch": 0.08271213618293362, + "grad_norm": 12.348058283588381, + "learning_rate": 4.986848514841455e-05, + "loss": 2.0096, + "mean_token_accuracy": 0.5413793087005615, + "step": 82120 + }, + { + "epoch": 0.08271717223603779, + "grad_norm": 12.29504213267743, + "learning_rate": 4.98684446900421e-05, + "loss": 2.278, + "mean_token_accuracy": 0.41034482717514037, + "step": 82125 + }, + { + "epoch": 0.08272220828914197, + "grad_norm": 15.624190110093977, + "learning_rate": 4.986840422546567e-05, + "loss": 2.1763, + "mean_token_accuracy": 0.4758620738983154, + "step": 82130 + }, + { + "epoch": 0.08272724434224614, + "grad_norm": 12.995336919819847, + "learning_rate": 4.9868363754685286e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.36206897497177126, + "step": 82135 + }, + { + "epoch": 0.08273228039535031, + "grad_norm": 11.552659013863321, + "learning_rate": 4.986832327770096e-05, + "loss": 2.5575, + "mean_token_accuracy": 0.4034482777118683, + "step": 82140 + }, + { + "epoch": 0.08273731644845449, + "grad_norm": 14.038935445953188, + "learning_rate": 4.9868282794512685e-05, + "loss": 2.9021, + "mean_token_accuracy": 0.4013309121131897, + "step": 82145 + }, + { + "epoch": 0.08274235250155866, + "grad_norm": 10.669460757107403, + "learning_rate": 4.9868242305120485e-05, + "loss": 2.7179, + "mean_token_accuracy": 0.3379310369491577, + "step": 82150 + }, + { + "epoch": 0.08274738855466283, + "grad_norm": 14.202824286472207, + "learning_rate": 4.986820180952437e-05, + "loss": 3.0896, + "mean_token_accuracy": 0.337931028008461, + "step": 82155 + }, + { + "epoch": 0.08275242460776701, + "grad_norm": 11.511900297238604, + "learning_rate": 4.986816130772435e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4448275864124298, + "step": 82160 + }, + { + "epoch": 0.08275746066087118, + "grad_norm": 12.12579496229365, + "learning_rate": 4.986812079972044e-05, + "loss": 3.1261, + "mean_token_accuracy": 0.31379310190677645, + "step": 82165 + }, + { + "epoch": 0.08276249671397536, + "grad_norm": 11.069243108656195, + "learning_rate": 4.9868080285512645e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.4275861978530884, + "step": 82170 + }, + { + "epoch": 0.08276753276707952, + "grad_norm": 10.270612461211211, + "learning_rate": 4.986803976510098e-05, + "loss": 2.5181, + "mean_token_accuracy": 0.4517241358757019, + "step": 82175 + }, + { + "epoch": 0.08277256882018369, + "grad_norm": 16.201747485654018, + "learning_rate": 4.9867999238485455e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.46551724076271056, + "step": 82180 + }, + { + "epoch": 0.08277760487328786, + "grad_norm": 12.974753120540193, + "learning_rate": 4.9867958705666085e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.4344827592372894, + "step": 82185 + }, + { + "epoch": 0.08278264092639204, + "grad_norm": 11.66416062296085, + "learning_rate": 4.986791816664288e-05, + "loss": 2.7619, + "mean_token_accuracy": 0.431280779838562, + "step": 82190 + }, + { + "epoch": 0.08278767697949621, + "grad_norm": 11.579143840457157, + "learning_rate": 4.986787762141584e-05, + "loss": 2.477, + "mean_token_accuracy": 0.47496975064277647, + "step": 82195 + }, + { + "epoch": 0.08279271303260038, + "grad_norm": 11.042978300523743, + "learning_rate": 4.9867837069985e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.3965517282485962, + "step": 82200 + }, + { + "epoch": 0.08279774908570456, + "grad_norm": 11.23755340544159, + "learning_rate": 4.986779651235035e-05, + "loss": 2.5728, + "mean_token_accuracy": 0.43448275327682495, + "step": 82205 + }, + { + "epoch": 0.08280278513880873, + "grad_norm": 10.10717044253312, + "learning_rate": 4.9867755948511906e-05, + "loss": 2.3824, + "mean_token_accuracy": 0.40169388651847837, + "step": 82210 + }, + { + "epoch": 0.0828078211919129, + "grad_norm": 9.12514766672047, + "learning_rate": 4.986771537846968e-05, + "loss": 2.2814, + "mean_token_accuracy": 0.4482758641242981, + "step": 82215 + }, + { + "epoch": 0.08281285724501708, + "grad_norm": 11.593216219970484, + "learning_rate": 4.986767480222369e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.46206897497177124, + "step": 82220 + }, + { + "epoch": 0.08281789329812125, + "grad_norm": 10.792839425321397, + "learning_rate": 4.986763421977394e-05, + "loss": 2.1905, + "mean_token_accuracy": 0.46551724076271056, + "step": 82225 + }, + { + "epoch": 0.08282292935122543, + "grad_norm": 9.24773944485411, + "learning_rate": 4.986759363112044e-05, + "loss": 2.3699, + "mean_token_accuracy": 0.38965516686439516, + "step": 82230 + }, + { + "epoch": 0.0828279654043296, + "grad_norm": 19.49526015743598, + "learning_rate": 4.986755303626321e-05, + "loss": 3.0063, + "mean_token_accuracy": 0.3895320177078247, + "step": 82235 + }, + { + "epoch": 0.08283300145743377, + "grad_norm": 11.23071830529706, + "learning_rate": 4.986751243520225e-05, + "loss": 2.427, + "mean_token_accuracy": 0.4172413766384125, + "step": 82240 + }, + { + "epoch": 0.08283803751053793, + "grad_norm": 13.545678706775375, + "learning_rate": 4.9867471827937576e-05, + "loss": 2.7822, + "mean_token_accuracy": 0.3965517163276672, + "step": 82245 + }, + { + "epoch": 0.08284307356364211, + "grad_norm": 21.45802840981344, + "learning_rate": 4.986743121446921e-05, + "loss": 2.6908, + "mean_token_accuracy": 0.4, + "step": 82250 + }, + { + "epoch": 0.08284810961674628, + "grad_norm": 17.136502283038606, + "learning_rate": 4.986739059479715e-05, + "loss": 2.5603, + "mean_token_accuracy": 0.38620689511299133, + "step": 82255 + }, + { + "epoch": 0.08285314566985046, + "grad_norm": 15.125587180355422, + "learning_rate": 4.98673499689214e-05, + "loss": 2.8803, + "mean_token_accuracy": 0.3482758700847626, + "step": 82260 + }, + { + "epoch": 0.08285818172295463, + "grad_norm": 15.590895857715374, + "learning_rate": 4.986730933684199e-05, + "loss": 2.8564, + "mean_token_accuracy": 0.3620689630508423, + "step": 82265 + }, + { + "epoch": 0.0828632177760588, + "grad_norm": 12.05218254791798, + "learning_rate": 4.9867268698558936e-05, + "loss": 2.5547, + "mean_token_accuracy": 0.42758620977401735, + "step": 82270 + }, + { + "epoch": 0.08286825382916298, + "grad_norm": 11.379713224787885, + "learning_rate": 4.986722805407222e-05, + "loss": 2.7613, + "mean_token_accuracy": 0.3896551787853241, + "step": 82275 + }, + { + "epoch": 0.08287328988226715, + "grad_norm": 15.58638918653899, + "learning_rate": 4.986718740338187e-05, + "loss": 2.5878, + "mean_token_accuracy": 0.441379314661026, + "step": 82280 + }, + { + "epoch": 0.08287832593537132, + "grad_norm": 13.41194509230148, + "learning_rate": 4.98671467464879e-05, + "loss": 2.8009, + "mean_token_accuracy": 0.34252873361110686, + "step": 82285 + }, + { + "epoch": 0.0828833619884755, + "grad_norm": 10.568042948199562, + "learning_rate": 4.9867106083390325e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.4275861978530884, + "step": 82290 + }, + { + "epoch": 0.08288839804157967, + "grad_norm": 10.72876268931549, + "learning_rate": 4.9867065414089146e-05, + "loss": 2.3343, + "mean_token_accuracy": 0.458620685338974, + "step": 82295 + }, + { + "epoch": 0.08289343409468385, + "grad_norm": 12.059655562712353, + "learning_rate": 4.986702473858437e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.36896551251411436, + "step": 82300 + }, + { + "epoch": 0.08289847014778802, + "grad_norm": 11.985264149627843, + "learning_rate": 4.986698405687603e-05, + "loss": 2.0459, + "mean_token_accuracy": 0.47761645913124084, + "step": 82305 + }, + { + "epoch": 0.0829035062008922, + "grad_norm": 10.939417492358507, + "learning_rate": 4.986694336896411e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.43793103098869324, + "step": 82310 + }, + { + "epoch": 0.08290854225399635, + "grad_norm": 11.373189070197762, + "learning_rate": 4.986690267484864e-05, + "loss": 2.2552, + "mean_token_accuracy": 0.43448275327682495, + "step": 82315 + }, + { + "epoch": 0.08291357830710053, + "grad_norm": 11.206124896487783, + "learning_rate": 4.9866861974529636e-05, + "loss": 2.1531, + "mean_token_accuracy": 0.4379310369491577, + "step": 82320 + }, + { + "epoch": 0.0829186143602047, + "grad_norm": 12.015208372476694, + "learning_rate": 4.9866821268007085e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.41724138259887694, + "step": 82325 + }, + { + "epoch": 0.08292365041330887, + "grad_norm": 11.283109762344512, + "learning_rate": 4.9866780555281014e-05, + "loss": 2.1348, + "mean_token_accuracy": 0.44827585816383364, + "step": 82330 + }, + { + "epoch": 0.08292868646641305, + "grad_norm": 10.400926014463256, + "learning_rate": 4.9866739836351436e-05, + "loss": 2.5959, + "mean_token_accuracy": 0.42758620381355283, + "step": 82335 + }, + { + "epoch": 0.08293372251951722, + "grad_norm": 10.264456714365265, + "learning_rate": 4.9866699111218366e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.3793103456497192, + "step": 82340 + }, + { + "epoch": 0.0829387585726214, + "grad_norm": 11.470615730278977, + "learning_rate": 4.986665837988181e-05, + "loss": 2.6789, + "mean_token_accuracy": 0.37241379618644715, + "step": 82345 + }, + { + "epoch": 0.08294379462572557, + "grad_norm": 11.418931579053867, + "learning_rate": 4.9866617642341765e-05, + "loss": 2.7184, + "mean_token_accuracy": 0.3620689630508423, + "step": 82350 + }, + { + "epoch": 0.08294883067882974, + "grad_norm": 9.797881937803332, + "learning_rate": 4.986657689859826e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.3827586233615875, + "step": 82355 + }, + { + "epoch": 0.08295386673193392, + "grad_norm": 13.552657740748518, + "learning_rate": 4.98665361486513e-05, + "loss": 2.6914, + "mean_token_accuracy": 0.36896551847457887, + "step": 82360 + }, + { + "epoch": 0.08295890278503809, + "grad_norm": 10.309420123735132, + "learning_rate": 4.9866495392500906e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.4206896543502808, + "step": 82365 + }, + { + "epoch": 0.08296393883814226, + "grad_norm": 10.242104718101537, + "learning_rate": 4.986645463014707e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.4517241418361664, + "step": 82370 + }, + { + "epoch": 0.08296897489124644, + "grad_norm": 10.258008269604604, + "learning_rate": 4.986641386158982e-05, + "loss": 1.7767, + "mean_token_accuracy": 0.510344821214676, + "step": 82375 + }, + { + "epoch": 0.08297401094435061, + "grad_norm": 13.11827123491122, + "learning_rate": 4.9866373086829165e-05, + "loss": 2.7863, + "mean_token_accuracy": 0.37241379022598264, + "step": 82380 + }, + { + "epoch": 0.08297904699745477, + "grad_norm": 8.276961334496558, + "learning_rate": 4.986633230586511e-05, + "loss": 2.033, + "mean_token_accuracy": 0.4689655125141144, + "step": 82385 + }, + { + "epoch": 0.08298408305055895, + "grad_norm": 11.129658267833584, + "learning_rate": 4.9866291518697663e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.4172413766384125, + "step": 82390 + }, + { + "epoch": 0.08298911910366312, + "grad_norm": 11.311449258103943, + "learning_rate": 4.986625072532685e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.4103448331356049, + "step": 82395 + }, + { + "epoch": 0.0829941551567673, + "grad_norm": 10.540236135203783, + "learning_rate": 4.986620992575268e-05, + "loss": 2.4471, + "mean_token_accuracy": 0.4068965494632721, + "step": 82400 + }, + { + "epoch": 0.08299919120987147, + "grad_norm": 10.80015560392916, + "learning_rate": 4.986616911997514e-05, + "loss": 2.2977, + "mean_token_accuracy": 0.4517241418361664, + "step": 82405 + }, + { + "epoch": 0.08300422726297564, + "grad_norm": 10.115286718282931, + "learning_rate": 4.9866128307994275e-05, + "loss": 2.3791, + "mean_token_accuracy": 0.4068965494632721, + "step": 82410 + }, + { + "epoch": 0.08300926331607981, + "grad_norm": 17.769597000061392, + "learning_rate": 4.986608748981007e-05, + "loss": 2.1898, + "mean_token_accuracy": 0.5000000059604645, + "step": 82415 + }, + { + "epoch": 0.08301429936918399, + "grad_norm": 11.613024122986275, + "learning_rate": 4.986604666542255e-05, + "loss": 2.6336, + "mean_token_accuracy": 0.39310344457626345, + "step": 82420 + }, + { + "epoch": 0.08301933542228816, + "grad_norm": 11.693155807341295, + "learning_rate": 4.986600583483173e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.3999999940395355, + "step": 82425 + }, + { + "epoch": 0.08302437147539234, + "grad_norm": 11.018920325517906, + "learning_rate": 4.9865964998037605e-05, + "loss": 3.0672, + "mean_token_accuracy": 0.35862068831920624, + "step": 82430 + }, + { + "epoch": 0.08302940752849651, + "grad_norm": 10.247827377121986, + "learning_rate": 4.9865924155040205e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.4034482717514038, + "step": 82435 + }, + { + "epoch": 0.08303444358160068, + "grad_norm": 12.609278637767037, + "learning_rate": 4.986588330583953e-05, + "loss": 2.5104, + "mean_token_accuracy": 0.4413793087005615, + "step": 82440 + }, + { + "epoch": 0.08303947963470486, + "grad_norm": 11.592829360869892, + "learning_rate": 4.986584245043559e-05, + "loss": 2.3057, + "mean_token_accuracy": 0.4517241299152374, + "step": 82445 + }, + { + "epoch": 0.08304451568780903, + "grad_norm": 10.442397922436294, + "learning_rate": 4.9865801588828406e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.38965516686439516, + "step": 82450 + }, + { + "epoch": 0.08304955174091319, + "grad_norm": 10.234166271392697, + "learning_rate": 4.986576072101798e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.43629764318466185, + "step": 82455 + }, + { + "epoch": 0.08305458779401736, + "grad_norm": 12.070769226955278, + "learning_rate": 4.986571984700433e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.4517241418361664, + "step": 82460 + }, + { + "epoch": 0.08305962384712154, + "grad_norm": 10.769423826566602, + "learning_rate": 4.986567896678746e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.4551724135875702, + "step": 82465 + }, + { + "epoch": 0.08306465990022571, + "grad_norm": 12.663762146462663, + "learning_rate": 4.9865638080367384e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4137930989265442, + "step": 82470 + }, + { + "epoch": 0.08306969595332989, + "grad_norm": 12.634583502315403, + "learning_rate": 4.9865597187744125e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4034482777118683, + "step": 82475 + }, + { + "epoch": 0.08307473200643406, + "grad_norm": 10.270875818208602, + "learning_rate": 4.986555628891767e-05, + "loss": 2.7021, + "mean_token_accuracy": 0.35172414779663086, + "step": 82480 + }, + { + "epoch": 0.08307976805953823, + "grad_norm": 10.042170572891742, + "learning_rate": 4.986551538388806e-05, + "loss": 2.4761, + "mean_token_accuracy": 0.4379310250282288, + "step": 82485 + }, + { + "epoch": 0.08308480411264241, + "grad_norm": 12.715301454530257, + "learning_rate": 4.986547447265527e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.45172413885593415, + "step": 82490 + }, + { + "epoch": 0.08308984016574658, + "grad_norm": 9.966131609728272, + "learning_rate": 4.986543355521935e-05, + "loss": 2.8094, + "mean_token_accuracy": 0.39310344457626345, + "step": 82495 + }, + { + "epoch": 0.08309487621885076, + "grad_norm": 10.081602601673959, + "learning_rate": 4.986539263158029e-05, + "loss": 2.3483, + "mean_token_accuracy": 0.5103448271751404, + "step": 82500 + }, + { + "epoch": 0.08309991227195493, + "grad_norm": 11.672426357501793, + "learning_rate": 4.98653517017381e-05, + "loss": 2.6305, + "mean_token_accuracy": 0.37241379022598264, + "step": 82505 + }, + { + "epoch": 0.0831049483250591, + "grad_norm": 10.545887748686948, + "learning_rate": 4.98653107656928e-05, + "loss": 2.2094, + "mean_token_accuracy": 0.41034482717514037, + "step": 82510 + }, + { + "epoch": 0.08310998437816328, + "grad_norm": 14.434290881032153, + "learning_rate": 4.98652698234444e-05, + "loss": 3.1225, + "mean_token_accuracy": 0.32758620381355286, + "step": 82515 + }, + { + "epoch": 0.08311502043126745, + "grad_norm": 10.939133217799744, + "learning_rate": 4.98652288749929e-05, + "loss": 2.6625, + "mean_token_accuracy": 0.3517241358757019, + "step": 82520 + }, + { + "epoch": 0.08312005648437161, + "grad_norm": 9.127346029818197, + "learning_rate": 4.986518792033833e-05, + "loss": 2.3697, + "mean_token_accuracy": 0.46896551847457885, + "step": 82525 + }, + { + "epoch": 0.08312509253747578, + "grad_norm": 9.41830891574775, + "learning_rate": 4.986514695948069e-05, + "loss": 2.2095, + "mean_token_accuracy": 0.4724137902259827, + "step": 82530 + }, + { + "epoch": 0.08313012859057996, + "grad_norm": 11.182816224776074, + "learning_rate": 4.9865105992419985e-05, + "loss": 2.2927, + "mean_token_accuracy": 0.46551724076271056, + "step": 82535 + }, + { + "epoch": 0.08313516464368413, + "grad_norm": 12.831482045296415, + "learning_rate": 4.986506501915625e-05, + "loss": 2.6978, + "mean_token_accuracy": 0.3737447053194046, + "step": 82540 + }, + { + "epoch": 0.0831402006967883, + "grad_norm": 9.21525293783706, + "learning_rate": 4.986502403968947e-05, + "loss": 2.8101, + "mean_token_accuracy": 0.35172414481639863, + "step": 82545 + }, + { + "epoch": 0.08314523674989248, + "grad_norm": 9.180655615267488, + "learning_rate": 4.986498305401967e-05, + "loss": 2.162, + "mean_token_accuracy": 0.4310344815254211, + "step": 82550 + }, + { + "epoch": 0.08315027280299665, + "grad_norm": 13.102176631612645, + "learning_rate": 4.9864942062146854e-05, + "loss": 2.8436, + "mean_token_accuracy": 0.3517241358757019, + "step": 82555 + }, + { + "epoch": 0.08315530885610083, + "grad_norm": 11.055183765231957, + "learning_rate": 4.986490106407104e-05, + "loss": 2.435, + "mean_token_accuracy": 0.4034482717514038, + "step": 82560 + }, + { + "epoch": 0.083160344909205, + "grad_norm": 11.00716294889182, + "learning_rate": 4.9864860059792233e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.3931034505367279, + "step": 82565 + }, + { + "epoch": 0.08316538096230917, + "grad_norm": 8.597140943359383, + "learning_rate": 4.986481904931046e-05, + "loss": 2.2193, + "mean_token_accuracy": 0.4814881980419159, + "step": 82570 + }, + { + "epoch": 0.08317041701541335, + "grad_norm": 11.88079292483633, + "learning_rate": 4.9864778032625714e-05, + "loss": 2.3942, + "mean_token_accuracy": 0.4241379380226135, + "step": 82575 + }, + { + "epoch": 0.08317545306851752, + "grad_norm": 10.130839029263173, + "learning_rate": 4.986473700973802e-05, + "loss": 2.4907, + "mean_token_accuracy": 0.43793103098869324, + "step": 82580 + }, + { + "epoch": 0.0831804891216217, + "grad_norm": 11.058094850283616, + "learning_rate": 4.986469598064738e-05, + "loss": 2.6836, + "mean_token_accuracy": 0.3793103456497192, + "step": 82585 + }, + { + "epoch": 0.08318552517472587, + "grad_norm": 10.541970667913601, + "learning_rate": 4.986465494535379e-05, + "loss": 2.367, + "mean_token_accuracy": 0.47931034564971925, + "step": 82590 + }, + { + "epoch": 0.08319056122783003, + "grad_norm": 10.298919433337783, + "learning_rate": 4.98646139038573e-05, + "loss": 2.4729, + "mean_token_accuracy": 0.4206896543502808, + "step": 82595 + }, + { + "epoch": 0.0831955972809342, + "grad_norm": 10.396375609137843, + "learning_rate": 4.98645728561579e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.42068964838981626, + "step": 82600 + }, + { + "epoch": 0.08320063333403838, + "grad_norm": 11.24772062722133, + "learning_rate": 4.9864531802255596e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.4379310369491577, + "step": 82605 + }, + { + "epoch": 0.08320566938714255, + "grad_norm": 12.439737389563897, + "learning_rate": 4.98644907421504e-05, + "loss": 2.2975, + "mean_token_accuracy": 0.4206896543502808, + "step": 82610 + }, + { + "epoch": 0.08321070544024672, + "grad_norm": 11.990642166322042, + "learning_rate": 4.986444967584235e-05, + "loss": 2.4687, + "mean_token_accuracy": 0.3655172407627106, + "step": 82615 + }, + { + "epoch": 0.0832157414933509, + "grad_norm": 10.643555331061288, + "learning_rate": 4.9864408603331416e-05, + "loss": 2.9553, + "mean_token_accuracy": 0.3703569293022156, + "step": 82620 + }, + { + "epoch": 0.08322077754645507, + "grad_norm": 11.394726795178814, + "learning_rate": 4.986436752461764e-05, + "loss": 2.3523, + "mean_token_accuracy": 0.47029643058776854, + "step": 82625 + }, + { + "epoch": 0.08322581359955925, + "grad_norm": 9.645019771558594, + "learning_rate": 4.986432643970102e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.4, + "step": 82630 + }, + { + "epoch": 0.08323084965266342, + "grad_norm": 9.794633256007705, + "learning_rate": 4.9864285348581565e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4137930989265442, + "step": 82635 + }, + { + "epoch": 0.08323588570576759, + "grad_norm": 9.415129871817223, + "learning_rate": 4.98642442512593e-05, + "loss": 2.5274, + "mean_token_accuracy": 0.499939501285553, + "step": 82640 + }, + { + "epoch": 0.08324092175887177, + "grad_norm": 9.681877618821478, + "learning_rate": 4.9864203147734226e-05, + "loss": 2.5213, + "mean_token_accuracy": 0.36896551847457887, + "step": 82645 + }, + { + "epoch": 0.08324595781197594, + "grad_norm": 11.284220978964598, + "learning_rate": 4.9864162038006354e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.42413792610168455, + "step": 82650 + }, + { + "epoch": 0.08325099386508011, + "grad_norm": 12.299902521987429, + "learning_rate": 4.98641209220757e-05, + "loss": 2.3673, + "mean_token_accuracy": 0.42758620977401735, + "step": 82655 + }, + { + "epoch": 0.08325602991818429, + "grad_norm": 14.359614690450313, + "learning_rate": 4.9864079799942284e-05, + "loss": 2.7329, + "mean_token_accuracy": 0.36896551251411436, + "step": 82660 + }, + { + "epoch": 0.08326106597128845, + "grad_norm": 10.127414832485723, + "learning_rate": 4.9864038671606094e-05, + "loss": 2.1576, + "mean_token_accuracy": 0.4241379380226135, + "step": 82665 + }, + { + "epoch": 0.08326610202439262, + "grad_norm": 15.438443729499125, + "learning_rate": 4.986399753706716e-05, + "loss": 2.7622, + "mean_token_accuracy": 0.4172413766384125, + "step": 82670 + }, + { + "epoch": 0.0832711380774968, + "grad_norm": 15.747582974856801, + "learning_rate": 4.9863956396325485e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.42758620977401735, + "step": 82675 + }, + { + "epoch": 0.08327617413060097, + "grad_norm": 11.205610932179288, + "learning_rate": 4.9863915249381085e-05, + "loss": 2.7303, + "mean_token_accuracy": 0.3862068921327591, + "step": 82680 + }, + { + "epoch": 0.08328121018370514, + "grad_norm": 10.75878697001982, + "learning_rate": 4.986387409623397e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.38275861740112305, + "step": 82685 + }, + { + "epoch": 0.08328624623680932, + "grad_norm": 10.329895116175349, + "learning_rate": 4.9863832936884145e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.47586206197738645, + "step": 82690 + }, + { + "epoch": 0.08329128228991349, + "grad_norm": 11.408368361612242, + "learning_rate": 4.986379177133163e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.44137930274009707, + "step": 82695 + }, + { + "epoch": 0.08329631834301766, + "grad_norm": 11.429149217489485, + "learning_rate": 4.9863750599576444e-05, + "loss": 2.6446, + "mean_token_accuracy": 0.4068965554237366, + "step": 82700 + }, + { + "epoch": 0.08330135439612184, + "grad_norm": 9.610288105148737, + "learning_rate": 4.986370942161858e-05, + "loss": 2.3372, + "mean_token_accuracy": 0.4103448212146759, + "step": 82705 + }, + { + "epoch": 0.08330639044922601, + "grad_norm": 12.099681776037754, + "learning_rate": 4.986366823745806e-05, + "loss": 2.649, + "mean_token_accuracy": 0.4413793087005615, + "step": 82710 + }, + { + "epoch": 0.08331142650233019, + "grad_norm": 12.518378304491279, + "learning_rate": 4.986362704709489e-05, + "loss": 2.5691, + "mean_token_accuracy": 0.4103448212146759, + "step": 82715 + }, + { + "epoch": 0.08331646255543436, + "grad_norm": 10.520785558671852, + "learning_rate": 4.986358585052908e-05, + "loss": 1.9944, + "mean_token_accuracy": 0.47241379618644713, + "step": 82720 + }, + { + "epoch": 0.08332149860853853, + "grad_norm": 10.79122326106139, + "learning_rate": 4.9863544647760655e-05, + "loss": 2.1735, + "mean_token_accuracy": 0.4310344815254211, + "step": 82725 + }, + { + "epoch": 0.0833265346616427, + "grad_norm": 11.989668210155806, + "learning_rate": 4.986350343878962e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.4413793087005615, + "step": 82730 + }, + { + "epoch": 0.08333157071474687, + "grad_norm": 10.370662232070792, + "learning_rate": 4.986346222361598e-05, + "loss": 2.6357, + "mean_token_accuracy": 0.3571082890033722, + "step": 82735 + }, + { + "epoch": 0.08333660676785104, + "grad_norm": 12.986089711490512, + "learning_rate": 4.986342100223974e-05, + "loss": 2.6291, + "mean_token_accuracy": 0.4103448331356049, + "step": 82740 + }, + { + "epoch": 0.08334164282095521, + "grad_norm": 9.429207836036195, + "learning_rate": 4.986337977466094e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.41379310488700866, + "step": 82745 + }, + { + "epoch": 0.08334667887405939, + "grad_norm": 11.71014257183348, + "learning_rate": 4.986333854087956e-05, + "loss": 2.3085, + "mean_token_accuracy": 0.42758620381355283, + "step": 82750 + }, + { + "epoch": 0.08335171492716356, + "grad_norm": 8.865422849601968, + "learning_rate": 4.9863297300895626e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.39999998807907106, + "step": 82755 + }, + { + "epoch": 0.08335675098026774, + "grad_norm": 10.843287885844795, + "learning_rate": 4.986325605470916e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.482758617401123, + "step": 82760 + }, + { + "epoch": 0.08336178703337191, + "grad_norm": 9.85226027274634, + "learning_rate": 4.986321480232015e-05, + "loss": 2.4583, + "mean_token_accuracy": 0.41034482717514037, + "step": 82765 + }, + { + "epoch": 0.08336682308647608, + "grad_norm": 11.139862959308285, + "learning_rate": 4.986317354372862e-05, + "loss": 2.6922, + "mean_token_accuracy": 0.3482758641242981, + "step": 82770 + }, + { + "epoch": 0.08337185913958026, + "grad_norm": 11.661981576904845, + "learning_rate": 4.9863132278934585e-05, + "loss": 2.5937, + "mean_token_accuracy": 0.38965516686439516, + "step": 82775 + }, + { + "epoch": 0.08337689519268443, + "grad_norm": 10.575097645057724, + "learning_rate": 4.986309100793804e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.3931034505367279, + "step": 82780 + }, + { + "epoch": 0.0833819312457886, + "grad_norm": 9.377124098019275, + "learning_rate": 4.986304973073902e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.41379310488700866, + "step": 82785 + }, + { + "epoch": 0.08338696729889278, + "grad_norm": 9.59378778467062, + "learning_rate": 4.986300844733752e-05, + "loss": 2.1794, + "mean_token_accuracy": 0.47586206197738645, + "step": 82790 + }, + { + "epoch": 0.08339200335199695, + "grad_norm": 11.162354912547405, + "learning_rate": 4.9862967157733564e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.3758620649576187, + "step": 82795 + }, + { + "epoch": 0.08339703940510113, + "grad_norm": 9.670769318582886, + "learning_rate": 4.986292586192714e-05, + "loss": 2.2376, + "mean_token_accuracy": 0.45396249890327456, + "step": 82800 + }, + { + "epoch": 0.08340207545820529, + "grad_norm": 12.352703104482522, + "learning_rate": 4.9862884559918285e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.4278325140476227, + "step": 82805 + }, + { + "epoch": 0.08340711151130946, + "grad_norm": 11.10472151922161, + "learning_rate": 4.986284325170701e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.42068964838981626, + "step": 82810 + }, + { + "epoch": 0.08341214756441363, + "grad_norm": 14.819942600467058, + "learning_rate": 4.98628019372933e-05, + "loss": 2.9864, + "mean_token_accuracy": 0.33448275923728943, + "step": 82815 + }, + { + "epoch": 0.0834171836175178, + "grad_norm": 11.216083681449321, + "learning_rate": 4.98627606166772e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.4034482717514038, + "step": 82820 + }, + { + "epoch": 0.08342221967062198, + "grad_norm": 8.698610313581378, + "learning_rate": 4.98627192898587e-05, + "loss": 2.02, + "mean_token_accuracy": 0.46896552443504336, + "step": 82825 + }, + { + "epoch": 0.08342725572372615, + "grad_norm": 10.275269383389702, + "learning_rate": 4.98626779568378e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.45517240166664125, + "step": 82830 + }, + { + "epoch": 0.08343229177683033, + "grad_norm": 13.307403436034104, + "learning_rate": 4.9862636617614546e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.38620689511299133, + "step": 82835 + }, + { + "epoch": 0.0834373278299345, + "grad_norm": 11.33589318710027, + "learning_rate": 4.9862595272188914e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.4793103516101837, + "step": 82840 + }, + { + "epoch": 0.08344236388303868, + "grad_norm": 12.468456769436129, + "learning_rate": 4.986255392056095e-05, + "loss": 2.389, + "mean_token_accuracy": 0.4517241418361664, + "step": 82845 + }, + { + "epoch": 0.08344739993614285, + "grad_norm": 9.401665585925628, + "learning_rate": 4.986251256273064e-05, + "loss": 2.2423, + "mean_token_accuracy": 0.47586206197738645, + "step": 82850 + }, + { + "epoch": 0.08345243598924702, + "grad_norm": 10.67840874556472, + "learning_rate": 4.9862471198698005e-05, + "loss": 2.4844, + "mean_token_accuracy": 0.41724138259887694, + "step": 82855 + }, + { + "epoch": 0.0834574720423512, + "grad_norm": 11.146497111660969, + "learning_rate": 4.9862429828463056e-05, + "loss": 2.7394, + "mean_token_accuracy": 0.41724138259887694, + "step": 82860 + }, + { + "epoch": 0.08346250809545537, + "grad_norm": 11.339253852790847, + "learning_rate": 4.986238845202581e-05, + "loss": 2.1454, + "mean_token_accuracy": 0.4586206912994385, + "step": 82865 + }, + { + "epoch": 0.08346754414855954, + "grad_norm": 10.638422928250272, + "learning_rate": 4.986234706938626e-05, + "loss": 2.1987, + "mean_token_accuracy": 0.4517241418361664, + "step": 82870 + }, + { + "epoch": 0.0834725802016637, + "grad_norm": 11.373663882252632, + "learning_rate": 4.986230568054444e-05, + "loss": 2.8942, + "mean_token_accuracy": 0.39310343861579894, + "step": 82875 + }, + { + "epoch": 0.08347761625476788, + "grad_norm": 14.98459242917824, + "learning_rate": 4.986226428550034e-05, + "loss": 2.6782, + "mean_token_accuracy": 0.3793103456497192, + "step": 82880 + }, + { + "epoch": 0.08348265230787205, + "grad_norm": 11.847385312192364, + "learning_rate": 4.986222288425399e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.42758620381355283, + "step": 82885 + }, + { + "epoch": 0.08348768836097623, + "grad_norm": 9.533590532619968, + "learning_rate": 4.9862181476805396e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.441379314661026, + "step": 82890 + }, + { + "epoch": 0.0834927244140804, + "grad_norm": 14.374307413990477, + "learning_rate": 4.986214006315456e-05, + "loss": 2.8215, + "mean_token_accuracy": 0.36551724672317504, + "step": 82895 + }, + { + "epoch": 0.08349776046718457, + "grad_norm": 8.214865386659008, + "learning_rate": 4.986209864330151e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.482758617401123, + "step": 82900 + }, + { + "epoch": 0.08350279652028875, + "grad_norm": 13.981274575220105, + "learning_rate": 4.986205721724625e-05, + "loss": 2.9051, + "mean_token_accuracy": 0.34137930870056155, + "step": 82905 + }, + { + "epoch": 0.08350783257339292, + "grad_norm": 13.087368983137498, + "learning_rate": 4.986201578498878e-05, + "loss": 2.5624, + "mean_token_accuracy": 0.4, + "step": 82910 + }, + { + "epoch": 0.0835128686264971, + "grad_norm": 12.38059333661824, + "learning_rate": 4.986197434652912e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.42758620381355283, + "step": 82915 + }, + { + "epoch": 0.08351790467960127, + "grad_norm": 10.42108401114212, + "learning_rate": 4.986193290186729e-05, + "loss": 2.3919, + "mean_token_accuracy": 0.4620689690113068, + "step": 82920 + }, + { + "epoch": 0.08352294073270544, + "grad_norm": 10.809411618508351, + "learning_rate": 4.9861891451003296e-05, + "loss": 2.1537, + "mean_token_accuracy": 0.4275861978530884, + "step": 82925 + }, + { + "epoch": 0.08352797678580962, + "grad_norm": 14.837942985273731, + "learning_rate": 4.986184999393715e-05, + "loss": 2.761, + "mean_token_accuracy": 0.42413793206214906, + "step": 82930 + }, + { + "epoch": 0.08353301283891379, + "grad_norm": 11.506331624579325, + "learning_rate": 4.986180853066885e-05, + "loss": 2.3066, + "mean_token_accuracy": 0.41379310488700866, + "step": 82935 + }, + { + "epoch": 0.08353804889201796, + "grad_norm": 11.318133632255877, + "learning_rate": 4.986176706119843e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.458620685338974, + "step": 82940 + }, + { + "epoch": 0.08354308494512212, + "grad_norm": 14.286271040364733, + "learning_rate": 4.986172558552588e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.4448275864124298, + "step": 82945 + }, + { + "epoch": 0.0835481209982263, + "grad_norm": 10.307457282477662, + "learning_rate": 4.986168410365123e-05, + "loss": 2.4582, + "mean_token_accuracy": 0.41724138855934145, + "step": 82950 + }, + { + "epoch": 0.08355315705133047, + "grad_norm": 11.865857535641164, + "learning_rate": 4.9861642615574485e-05, + "loss": 2.4798, + "mean_token_accuracy": 0.4137930989265442, + "step": 82955 + }, + { + "epoch": 0.08355819310443464, + "grad_norm": 10.689273741160822, + "learning_rate": 4.986160112129565e-05, + "loss": 2.3675, + "mean_token_accuracy": 0.4068965524435043, + "step": 82960 + }, + { + "epoch": 0.08356322915753882, + "grad_norm": 7.188308982097855, + "learning_rate": 4.9861559620814745e-05, + "loss": 2.0852, + "mean_token_accuracy": 0.5160098552703858, + "step": 82965 + }, + { + "epoch": 0.08356826521064299, + "grad_norm": 11.229698860769501, + "learning_rate": 4.9861518114131773e-05, + "loss": 3.0638, + "mean_token_accuracy": 0.3517241358757019, + "step": 82970 + }, + { + "epoch": 0.08357330126374717, + "grad_norm": 11.158785103427302, + "learning_rate": 4.986147660124676e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.42758620381355283, + "step": 82975 + }, + { + "epoch": 0.08357833731685134, + "grad_norm": 9.751250228631148, + "learning_rate": 4.98614350821597e-05, + "loss": 2.3353, + "mean_token_accuracy": 0.46896551847457885, + "step": 82980 + }, + { + "epoch": 0.08358337336995551, + "grad_norm": 11.965337134827106, + "learning_rate": 4.9861393556870614e-05, + "loss": 2.7998, + "mean_token_accuracy": 0.32068965435028074, + "step": 82985 + }, + { + "epoch": 0.08358840942305969, + "grad_norm": 12.468862950010557, + "learning_rate": 4.986135202537951e-05, + "loss": 2.8967, + "mean_token_accuracy": 0.4034482717514038, + "step": 82990 + }, + { + "epoch": 0.08359344547616386, + "grad_norm": 11.889973584537154, + "learning_rate": 4.98613104876864e-05, + "loss": 2.514, + "mean_token_accuracy": 0.41524500846862794, + "step": 82995 + }, + { + "epoch": 0.08359848152926803, + "grad_norm": 10.39654880073071, + "learning_rate": 4.9861268943791305e-05, + "loss": 2.1639, + "mean_token_accuracy": 0.46079854369163514, + "step": 83000 + }, + { + "epoch": 0.08360351758237221, + "grad_norm": 10.37141449284123, + "learning_rate": 4.986122739369423e-05, + "loss": 2.8829, + "mean_token_accuracy": 0.4034482777118683, + "step": 83005 + }, + { + "epoch": 0.08360855363547638, + "grad_norm": 10.105373484060541, + "learning_rate": 4.986118583739518e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.4396249294281006, + "step": 83010 + }, + { + "epoch": 0.08361358968858054, + "grad_norm": 13.239378237296776, + "learning_rate": 4.986114427489416e-05, + "loss": 2.6328, + "mean_token_accuracy": 0.39310344457626345, + "step": 83015 + }, + { + "epoch": 0.08361862574168472, + "grad_norm": 10.676070233725856, + "learning_rate": 4.986110270619122e-05, + "loss": 2.241, + "mean_token_accuracy": 0.42758620381355283, + "step": 83020 + }, + { + "epoch": 0.08362366179478889, + "grad_norm": 11.342473487449631, + "learning_rate": 4.986106113128632e-05, + "loss": 2.3042, + "mean_token_accuracy": 0.4517241358757019, + "step": 83025 + }, + { + "epoch": 0.08362869784789306, + "grad_norm": 11.464720293349988, + "learning_rate": 4.9861019550179505e-05, + "loss": 2.7189, + "mean_token_accuracy": 0.4034482717514038, + "step": 83030 + }, + { + "epoch": 0.08363373390099724, + "grad_norm": 11.219807923161536, + "learning_rate": 4.986097796287078e-05, + "loss": 2.263, + "mean_token_accuracy": 0.4517241299152374, + "step": 83035 + }, + { + "epoch": 0.08363876995410141, + "grad_norm": 11.582436365012168, + "learning_rate": 4.9860936369360156e-05, + "loss": 2.2919, + "mean_token_accuracy": 0.42758620381355283, + "step": 83040 + }, + { + "epoch": 0.08364380600720558, + "grad_norm": 26.07383653524439, + "learning_rate": 4.986089476964764e-05, + "loss": 2.9936, + "mean_token_accuracy": 0.39655171930789945, + "step": 83045 + }, + { + "epoch": 0.08364884206030976, + "grad_norm": 14.741572202758435, + "learning_rate": 4.9860853163733244e-05, + "loss": 2.572, + "mean_token_accuracy": 0.40689656138420105, + "step": 83050 + }, + { + "epoch": 0.08365387811341393, + "grad_norm": 13.26011684732726, + "learning_rate": 4.986081155161698e-05, + "loss": 2.538, + "mean_token_accuracy": 0.3862068891525269, + "step": 83055 + }, + { + "epoch": 0.0836589141665181, + "grad_norm": 10.208249407708344, + "learning_rate": 4.986076993329887e-05, + "loss": 3.1153, + "mean_token_accuracy": 0.36896551549434664, + "step": 83060 + }, + { + "epoch": 0.08366395021962228, + "grad_norm": 10.538724381325476, + "learning_rate": 4.986072830877891e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.4448275864124298, + "step": 83065 + }, + { + "epoch": 0.08366898627272645, + "grad_norm": 11.11193206991651, + "learning_rate": 4.986068667805712e-05, + "loss": 2.6124, + "mean_token_accuracy": 0.3862068891525269, + "step": 83070 + }, + { + "epoch": 0.08367402232583063, + "grad_norm": 10.677506839797585, + "learning_rate": 4.986064504113351e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.41724138557910917, + "step": 83075 + }, + { + "epoch": 0.0836790583789348, + "grad_norm": 10.256579178278194, + "learning_rate": 4.98606033980081e-05, + "loss": 2.4099, + "mean_token_accuracy": 0.43284936547279357, + "step": 83080 + }, + { + "epoch": 0.08368409443203896, + "grad_norm": 11.957307785556674, + "learning_rate": 4.9860561748680884e-05, + "loss": 2.6292, + "mean_token_accuracy": 0.3827586144208908, + "step": 83085 + }, + { + "epoch": 0.08368913048514313, + "grad_norm": 14.171708756823811, + "learning_rate": 4.986052009315189e-05, + "loss": 2.748, + "mean_token_accuracy": 0.4137930929660797, + "step": 83090 + }, + { + "epoch": 0.08369416653824731, + "grad_norm": 9.34690622564794, + "learning_rate": 4.9860478431421115e-05, + "loss": 2.6633, + "mean_token_accuracy": 0.38965517580509185, + "step": 83095 + }, + { + "epoch": 0.08369920259135148, + "grad_norm": 11.211966450382377, + "learning_rate": 4.986043676348858e-05, + "loss": 2.6758, + "mean_token_accuracy": 0.37586207389831544, + "step": 83100 + }, + { + "epoch": 0.08370423864445566, + "grad_norm": 11.322832974585808, + "learning_rate": 4.986039508935429e-05, + "loss": 2.7295, + "mean_token_accuracy": 0.4110837459564209, + "step": 83105 + }, + { + "epoch": 0.08370927469755983, + "grad_norm": 9.845022844999637, + "learning_rate": 4.9860353409018264e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.4068965524435043, + "step": 83110 + }, + { + "epoch": 0.083714310750664, + "grad_norm": 9.593409045018639, + "learning_rate": 4.986031172248052e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.46031458377838136, + "step": 83115 + }, + { + "epoch": 0.08371934680376818, + "grad_norm": 11.460032509523739, + "learning_rate": 4.9860270029741054e-05, + "loss": 2.229, + "mean_token_accuracy": 0.4310344815254211, + "step": 83120 + }, + { + "epoch": 0.08372438285687235, + "grad_norm": 8.595612385810396, + "learning_rate": 4.986022833079987e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.4460375070571899, + "step": 83125 + }, + { + "epoch": 0.08372941890997652, + "grad_norm": 10.51958155458344, + "learning_rate": 4.986018662565701e-05, + "loss": 2.9014, + "mean_token_accuracy": 0.41034482717514037, + "step": 83130 + }, + { + "epoch": 0.0837344549630807, + "grad_norm": 10.963323145351263, + "learning_rate": 4.9860144914312463e-05, + "loss": 2.8365, + "mean_token_accuracy": 0.3896551728248596, + "step": 83135 + }, + { + "epoch": 0.08373949101618487, + "grad_norm": 12.475843505631406, + "learning_rate": 4.986010319676625e-05, + "loss": 2.4888, + "mean_token_accuracy": 0.4034482717514038, + "step": 83140 + }, + { + "epoch": 0.08374452706928905, + "grad_norm": 10.852976164484186, + "learning_rate": 4.986006147301838e-05, + "loss": 2.7658, + "mean_token_accuracy": 0.38620689511299133, + "step": 83145 + }, + { + "epoch": 0.08374956312239322, + "grad_norm": 12.625646125963442, + "learning_rate": 4.986001974306886e-05, + "loss": 2.5164, + "mean_token_accuracy": 0.41724138557910917, + "step": 83150 + }, + { + "epoch": 0.08375459917549738, + "grad_norm": 9.073883619489958, + "learning_rate": 4.9859978006917705e-05, + "loss": 2.6762, + "mean_token_accuracy": 0.4103448331356049, + "step": 83155 + }, + { + "epoch": 0.08375963522860155, + "grad_norm": 9.87505331301339, + "learning_rate": 4.9859936264564926e-05, + "loss": 2.7848, + "mean_token_accuracy": 0.3482758581638336, + "step": 83160 + }, + { + "epoch": 0.08376467128170573, + "grad_norm": 8.741733627967973, + "learning_rate": 4.985989451601054e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.544367390871048, + "step": 83165 + }, + { + "epoch": 0.0837697073348099, + "grad_norm": 10.613866469402993, + "learning_rate": 4.9859852761254546e-05, + "loss": 2.6031, + "mean_token_accuracy": 0.3482758581638336, + "step": 83170 + }, + { + "epoch": 0.08377474338791407, + "grad_norm": 11.341855242737278, + "learning_rate": 4.9859811000296966e-05, + "loss": 2.7525, + "mean_token_accuracy": 0.3689655244350433, + "step": 83175 + }, + { + "epoch": 0.08377977944101825, + "grad_norm": 9.587401421148714, + "learning_rate": 4.985976923313782e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.4068965554237366, + "step": 83180 + }, + { + "epoch": 0.08378481549412242, + "grad_norm": 16.58057132708195, + "learning_rate": 4.9859727459777094e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.4482758641242981, + "step": 83185 + }, + { + "epoch": 0.0837898515472266, + "grad_norm": 13.07455993981948, + "learning_rate": 4.9859685680214815e-05, + "loss": 2.6401, + "mean_token_accuracy": 0.3827586263418198, + "step": 83190 + }, + { + "epoch": 0.08379488760033077, + "grad_norm": 12.093203659277888, + "learning_rate": 4.9859643894451e-05, + "loss": 2.6058, + "mean_token_accuracy": 0.415365993976593, + "step": 83195 + }, + { + "epoch": 0.08379992365343494, + "grad_norm": 12.405668138378134, + "learning_rate": 4.985960210248565e-05, + "loss": 2.5005, + "mean_token_accuracy": 0.42413793206214906, + "step": 83200 + }, + { + "epoch": 0.08380495970653912, + "grad_norm": 11.061046364004444, + "learning_rate": 4.985956030431879e-05, + "loss": 2.2196, + "mean_token_accuracy": 0.41379310190677643, + "step": 83205 + }, + { + "epoch": 0.08380999575964329, + "grad_norm": 15.416844184669113, + "learning_rate": 4.985951849995041e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.4000000059604645, + "step": 83210 + }, + { + "epoch": 0.08381503181274746, + "grad_norm": 11.737768805503919, + "learning_rate": 4.985947668938054e-05, + "loss": 2.5916, + "mean_token_accuracy": 0.3896551728248596, + "step": 83215 + }, + { + "epoch": 0.08382006786585164, + "grad_norm": 11.40953277103002, + "learning_rate": 4.985943487260919e-05, + "loss": 3.1841, + "mean_token_accuracy": 0.38620689511299133, + "step": 83220 + }, + { + "epoch": 0.0838251039189558, + "grad_norm": 11.680256934264111, + "learning_rate": 4.985939304963636e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.3655172407627106, + "step": 83225 + }, + { + "epoch": 0.08383013997205997, + "grad_norm": 9.689944282517445, + "learning_rate": 4.985935122046207e-05, + "loss": 2.381, + "mean_token_accuracy": 0.4448275864124298, + "step": 83230 + }, + { + "epoch": 0.08383517602516415, + "grad_norm": 8.095969975617331, + "learning_rate": 4.985930938508634e-05, + "loss": 2.1982, + "mean_token_accuracy": 0.47241379618644713, + "step": 83235 + }, + { + "epoch": 0.08384021207826832, + "grad_norm": 10.029520564491996, + "learning_rate": 4.9859267543509155e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.4482758641242981, + "step": 83240 + }, + { + "epoch": 0.0838452481313725, + "grad_norm": 11.99928970860891, + "learning_rate": 4.9859225695730556e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.4206896543502808, + "step": 83245 + }, + { + "epoch": 0.08385028418447667, + "grad_norm": 10.5699337411793, + "learning_rate": 4.985918384175054e-05, + "loss": 2.4345, + "mean_token_accuracy": 0.45862067937850953, + "step": 83250 + }, + { + "epoch": 0.08385532023758084, + "grad_norm": 11.697460338333602, + "learning_rate": 4.985914198156912e-05, + "loss": 2.0608, + "mean_token_accuracy": 0.43448275327682495, + "step": 83255 + }, + { + "epoch": 0.08386035629068501, + "grad_norm": 9.588130151317326, + "learning_rate": 4.985910011518631e-05, + "loss": 2.308, + "mean_token_accuracy": 0.4172413766384125, + "step": 83260 + }, + { + "epoch": 0.08386539234378919, + "grad_norm": 9.66939931499166, + "learning_rate": 4.985905824260212e-05, + "loss": 2.3924, + "mean_token_accuracy": 0.42758620977401735, + "step": 83265 + }, + { + "epoch": 0.08387042839689336, + "grad_norm": 13.407319088290151, + "learning_rate": 4.985901636381656e-05, + "loss": 3.0651, + "mean_token_accuracy": 0.38275861740112305, + "step": 83270 + }, + { + "epoch": 0.08387546444999754, + "grad_norm": 11.570050876747313, + "learning_rate": 4.985897447882964e-05, + "loss": 2.674, + "mean_token_accuracy": 0.3482758581638336, + "step": 83275 + }, + { + "epoch": 0.08388050050310171, + "grad_norm": 12.402931613388107, + "learning_rate": 4.9858932587641385e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.44137930274009707, + "step": 83280 + }, + { + "epoch": 0.08388553655620588, + "grad_norm": 12.485987034234936, + "learning_rate": 4.9858890690251794e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.41379310488700866, + "step": 83285 + }, + { + "epoch": 0.08389057260931006, + "grad_norm": 10.333118726636856, + "learning_rate": 4.985884878666088e-05, + "loss": 2.518, + "mean_token_accuracy": 0.4310344815254211, + "step": 83290 + }, + { + "epoch": 0.08389560866241422, + "grad_norm": 9.538062171671852, + "learning_rate": 4.9858806876868654e-05, + "loss": 2.4016, + "mean_token_accuracy": 0.44137930274009707, + "step": 83295 + }, + { + "epoch": 0.08390064471551839, + "grad_norm": 10.074059507341508, + "learning_rate": 4.985876496087512e-05, + "loss": 2.3968, + "mean_token_accuracy": 0.42068966031074523, + "step": 83300 + }, + { + "epoch": 0.08390568076862256, + "grad_norm": 11.876317798531739, + "learning_rate": 4.985872303868032e-05, + "loss": 2.397, + "mean_token_accuracy": 0.4344827711582184, + "step": 83305 + }, + { + "epoch": 0.08391071682172674, + "grad_norm": 10.28582294796457, + "learning_rate": 4.985868111028423e-05, + "loss": 2.4197, + "mean_token_accuracy": 0.42413793206214906, + "step": 83310 + }, + { + "epoch": 0.08391575287483091, + "grad_norm": 12.01814970649126, + "learning_rate": 4.9858639175686875e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.42413793206214906, + "step": 83315 + }, + { + "epoch": 0.08392078892793509, + "grad_norm": 11.01635476178143, + "learning_rate": 4.9858597234888276e-05, + "loss": 2.071, + "mean_token_accuracy": 0.4675741076469421, + "step": 83320 + }, + { + "epoch": 0.08392582498103926, + "grad_norm": 10.758304955650187, + "learning_rate": 4.9858555287888434e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.42068966031074523, + "step": 83325 + }, + { + "epoch": 0.08393086103414343, + "grad_norm": 11.71823484031071, + "learning_rate": 4.985851333468737e-05, + "loss": 2.8669, + "mean_token_accuracy": 0.3310344755649567, + "step": 83330 + }, + { + "epoch": 0.08393589708724761, + "grad_norm": 10.834454257246085, + "learning_rate": 4.9858471375285074e-05, + "loss": 2.3563, + "mean_token_accuracy": 0.3965517282485962, + "step": 83335 + }, + { + "epoch": 0.08394093314035178, + "grad_norm": 10.126775380980824, + "learning_rate": 4.985842940968158e-05, + "loss": 2.236, + "mean_token_accuracy": 0.40852994918823243, + "step": 83340 + }, + { + "epoch": 0.08394596919345595, + "grad_norm": 10.8159106229767, + "learning_rate": 4.9858387437876895e-05, + "loss": 2.3857, + "mean_token_accuracy": 0.4413793087005615, + "step": 83345 + }, + { + "epoch": 0.08395100524656013, + "grad_norm": 11.342946971137298, + "learning_rate": 4.985834545987103e-05, + "loss": 2.8594, + "mean_token_accuracy": 0.3241379290819168, + "step": 83350 + }, + { + "epoch": 0.0839560412996643, + "grad_norm": 15.755150936394266, + "learning_rate": 4.9858303475663994e-05, + "loss": 2.497, + "mean_token_accuracy": 0.417241370677948, + "step": 83355 + }, + { + "epoch": 0.08396107735276848, + "grad_norm": 11.182494798203983, + "learning_rate": 4.9858261485255794e-05, + "loss": 2.99, + "mean_token_accuracy": 0.3758620619773865, + "step": 83360 + }, + { + "epoch": 0.08396611340587264, + "grad_norm": 13.5176061440183, + "learning_rate": 4.985821948864644e-05, + "loss": 2.3469, + "mean_token_accuracy": 0.4675136148929596, + "step": 83365 + }, + { + "epoch": 0.08397114945897681, + "grad_norm": 9.869952761977242, + "learning_rate": 4.985817748583596e-05, + "loss": 2.248, + "mean_token_accuracy": 0.43448275327682495, + "step": 83370 + }, + { + "epoch": 0.08397618551208098, + "grad_norm": 10.083993494759188, + "learning_rate": 4.9858135476824355e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4586206912994385, + "step": 83375 + }, + { + "epoch": 0.08398122156518516, + "grad_norm": 12.032422385467223, + "learning_rate": 4.985809346161164e-05, + "loss": 2.4095, + "mean_token_accuracy": 0.4448275983333588, + "step": 83380 + }, + { + "epoch": 0.08398625761828933, + "grad_norm": 13.460077153896455, + "learning_rate": 4.985805144019782e-05, + "loss": 2.4692, + "mean_token_accuracy": 0.47241379618644713, + "step": 83385 + }, + { + "epoch": 0.0839912936713935, + "grad_norm": 10.482575455635631, + "learning_rate": 4.985800941258291e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.4189957737922668, + "step": 83390 + }, + { + "epoch": 0.08399632972449768, + "grad_norm": 10.403140586478827, + "learning_rate": 4.985796737876693e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.4551724076271057, + "step": 83395 + }, + { + "epoch": 0.08400136577760185, + "grad_norm": 18.558199476790985, + "learning_rate": 4.985792533874988e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.4103448331356049, + "step": 83400 + }, + { + "epoch": 0.08400640183070603, + "grad_norm": 10.78607015806668, + "learning_rate": 4.985788329253177e-05, + "loss": 2.533, + "mean_token_accuracy": 0.43793103098869324, + "step": 83405 + }, + { + "epoch": 0.0840114378838102, + "grad_norm": 10.016962450835825, + "learning_rate": 4.9857841240112625e-05, + "loss": 2.4461, + "mean_token_accuracy": 0.42758620977401735, + "step": 83410 + }, + { + "epoch": 0.08401647393691437, + "grad_norm": 10.761255996201754, + "learning_rate": 4.985779918149245e-05, + "loss": 2.5842, + "mean_token_accuracy": 0.3551724225282669, + "step": 83415 + }, + { + "epoch": 0.08402150999001855, + "grad_norm": 12.575771029547626, + "learning_rate": 4.985775711667125e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.4344827592372894, + "step": 83420 + }, + { + "epoch": 0.08402654604312272, + "grad_norm": 10.217571490778349, + "learning_rate": 4.985771504564905e-05, + "loss": 2.5513, + "mean_token_accuracy": 0.3931034505367279, + "step": 83425 + }, + { + "epoch": 0.0840315820962269, + "grad_norm": 9.37783715546154, + "learning_rate": 4.985767296842585e-05, + "loss": 2.207, + "mean_token_accuracy": 0.47931034564971925, + "step": 83430 + }, + { + "epoch": 0.08403661814933105, + "grad_norm": 12.324104982332953, + "learning_rate": 4.9857630885001664e-05, + "loss": 2.5952, + "mean_token_accuracy": 0.39038112163543703, + "step": 83435 + }, + { + "epoch": 0.08404165420243523, + "grad_norm": 9.411516401782722, + "learning_rate": 4.985758879537651e-05, + "loss": 2.3721, + "mean_token_accuracy": 0.4344827592372894, + "step": 83440 + }, + { + "epoch": 0.0840466902555394, + "grad_norm": 14.129667545512621, + "learning_rate": 4.98575466995504e-05, + "loss": 2.9329, + "mean_token_accuracy": 0.3689655244350433, + "step": 83445 + }, + { + "epoch": 0.08405172630864358, + "grad_norm": 8.680730247975015, + "learning_rate": 4.985750459752333e-05, + "loss": 2.1472, + "mean_token_accuracy": 0.47241378426551817, + "step": 83450 + }, + { + "epoch": 0.08405676236174775, + "grad_norm": 12.679439612347666, + "learning_rate": 4.985746248929533e-05, + "loss": 2.8274, + "mean_token_accuracy": 0.37725347876548765, + "step": 83455 + }, + { + "epoch": 0.08406179841485192, + "grad_norm": 9.79999333168821, + "learning_rate": 4.98574203748664e-05, + "loss": 2.407, + "mean_token_accuracy": 0.44137930274009707, + "step": 83460 + }, + { + "epoch": 0.0840668344679561, + "grad_norm": 9.100487384750533, + "learning_rate": 4.985737825423656e-05, + "loss": 2.1897, + "mean_token_accuracy": 0.4517241358757019, + "step": 83465 + }, + { + "epoch": 0.08407187052106027, + "grad_norm": 12.288822990260268, + "learning_rate": 4.985733612740581e-05, + "loss": 2.778, + "mean_token_accuracy": 0.382758629322052, + "step": 83470 + }, + { + "epoch": 0.08407690657416445, + "grad_norm": 9.627459132036227, + "learning_rate": 4.9857293994374175e-05, + "loss": 2.0448, + "mean_token_accuracy": 0.4793103516101837, + "step": 83475 + }, + { + "epoch": 0.08408194262726862, + "grad_norm": 15.442524535760647, + "learning_rate": 4.985725185514166e-05, + "loss": 2.8338, + "mean_token_accuracy": 0.38275861740112305, + "step": 83480 + }, + { + "epoch": 0.08408697868037279, + "grad_norm": 10.60837560632719, + "learning_rate": 4.985720970970828e-05, + "loss": 2.4749, + "mean_token_accuracy": 0.39655172228813174, + "step": 83485 + }, + { + "epoch": 0.08409201473347697, + "grad_norm": 10.268959755285113, + "learning_rate": 4.9857167558074045e-05, + "loss": 2.2077, + "mean_token_accuracy": 0.4674531161785126, + "step": 83490 + }, + { + "epoch": 0.08409705078658114, + "grad_norm": 8.579567045135438, + "learning_rate": 4.985712540023896e-05, + "loss": 2.6732, + "mean_token_accuracy": 0.3896551728248596, + "step": 83495 + }, + { + "epoch": 0.08410208683968531, + "grad_norm": 10.863909003507919, + "learning_rate": 4.985708323620305e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.4206896543502808, + "step": 83500 + }, + { + "epoch": 0.08410712289278947, + "grad_norm": 20.447739638391393, + "learning_rate": 4.9857041065966315e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.4431941986083984, + "step": 83505 + }, + { + "epoch": 0.08411215894589365, + "grad_norm": 14.0968070478773, + "learning_rate": 4.9856998889528775e-05, + "loss": 2.5483, + "mean_token_accuracy": 0.4034482717514038, + "step": 83510 + }, + { + "epoch": 0.08411719499899782, + "grad_norm": 10.669455041495757, + "learning_rate": 4.985695670689044e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.4448275864124298, + "step": 83515 + }, + { + "epoch": 0.084122231052102, + "grad_norm": 10.679598527568388, + "learning_rate": 4.985691451805131e-05, + "loss": 2.2929, + "mean_token_accuracy": 0.4344827592372894, + "step": 83520 + }, + { + "epoch": 0.08412726710520617, + "grad_norm": 12.933184483045327, + "learning_rate": 4.985687232301141e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.4413793087005615, + "step": 83525 + }, + { + "epoch": 0.08413230315831034, + "grad_norm": 11.56637720851547, + "learning_rate": 4.985683012177075e-05, + "loss": 2.7266, + "mean_token_accuracy": 0.324137932062149, + "step": 83530 + }, + { + "epoch": 0.08413733921141452, + "grad_norm": 9.070347912354439, + "learning_rate": 4.985678791432935e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.42068966031074523, + "step": 83535 + }, + { + "epoch": 0.08414237526451869, + "grad_norm": 11.508919523569414, + "learning_rate": 4.98567457006872e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.42413793206214906, + "step": 83540 + }, + { + "epoch": 0.08414741131762286, + "grad_norm": 10.757802391824914, + "learning_rate": 4.9856703480844315e-05, + "loss": 2.362, + "mean_token_accuracy": 0.41724138259887694, + "step": 83545 + }, + { + "epoch": 0.08415244737072704, + "grad_norm": 12.755158349735522, + "learning_rate": 4.985666125480073e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.42758620977401735, + "step": 83550 + }, + { + "epoch": 0.08415748342383121, + "grad_norm": 12.063317805286587, + "learning_rate": 4.985661902255643e-05, + "loss": 2.1539, + "mean_token_accuracy": 0.44827587008476255, + "step": 83555 + }, + { + "epoch": 0.08416251947693539, + "grad_norm": 20.29228412387696, + "learning_rate": 4.985657678411145e-05, + "loss": 2.6971, + "mean_token_accuracy": 0.41724138259887694, + "step": 83560 + }, + { + "epoch": 0.08416755553003956, + "grad_norm": 10.257929939489882, + "learning_rate": 4.985653453946578e-05, + "loss": 2.1913, + "mean_token_accuracy": 0.42758620977401735, + "step": 83565 + }, + { + "epoch": 0.08417259158314373, + "grad_norm": 14.383276524396509, + "learning_rate": 4.985649228861945e-05, + "loss": 2.948, + "mean_token_accuracy": 0.3827586233615875, + "step": 83570 + }, + { + "epoch": 0.08417762763624789, + "grad_norm": 10.30648390708572, + "learning_rate": 4.985645003157246e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.4482758641242981, + "step": 83575 + }, + { + "epoch": 0.08418266368935207, + "grad_norm": 10.249379067291272, + "learning_rate": 4.985640776832483e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.41034482717514037, + "step": 83580 + }, + { + "epoch": 0.08418769974245624, + "grad_norm": 10.855730506101349, + "learning_rate": 4.985636549887655e-05, + "loss": 2.6559, + "mean_token_accuracy": 0.3793103456497192, + "step": 83585 + }, + { + "epoch": 0.08419273579556041, + "grad_norm": 9.661826057351298, + "learning_rate": 4.985632322322766e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.41034482717514037, + "step": 83590 + }, + { + "epoch": 0.08419777184866459, + "grad_norm": 12.333247295666878, + "learning_rate": 4.985628094137817e-05, + "loss": 2.9186, + "mean_token_accuracy": 0.33448275923728943, + "step": 83595 + }, + { + "epoch": 0.08420280790176876, + "grad_norm": 9.861304076755584, + "learning_rate": 4.985623865332807e-05, + "loss": 2.8294, + "mean_token_accuracy": 0.3896551698446274, + "step": 83600 + }, + { + "epoch": 0.08420784395487294, + "grad_norm": 8.767464209051358, + "learning_rate": 4.9856196359077386e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.4310344815254211, + "step": 83605 + }, + { + "epoch": 0.08421288000797711, + "grad_norm": 9.222419076136434, + "learning_rate": 4.985615405862613e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.43103448748588563, + "step": 83610 + }, + { + "epoch": 0.08421791606108128, + "grad_norm": 10.9804586818531, + "learning_rate": 4.9856111751974304e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.40689654350280763, + "step": 83615 + }, + { + "epoch": 0.08422295211418546, + "grad_norm": 12.711654891537941, + "learning_rate": 4.985606943912193e-05, + "loss": 2.5657, + "mean_token_accuracy": 0.3655172407627106, + "step": 83620 + }, + { + "epoch": 0.08422798816728963, + "grad_norm": 9.780426719619351, + "learning_rate": 4.985602712006902e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.4413793087005615, + "step": 83625 + }, + { + "epoch": 0.0842330242203938, + "grad_norm": 12.01340667319542, + "learning_rate": 4.985598479481558e-05, + "loss": 2.699, + "mean_token_accuracy": 0.3551724076271057, + "step": 83630 + }, + { + "epoch": 0.08423806027349798, + "grad_norm": 9.300339252866545, + "learning_rate": 4.9855942463361635e-05, + "loss": 2.0927, + "mean_token_accuracy": 0.5051724135875701, + "step": 83635 + }, + { + "epoch": 0.08424309632660215, + "grad_norm": 13.691430369832881, + "learning_rate": 4.9855900125707175e-05, + "loss": 3.0368, + "mean_token_accuracy": 0.34482758343219755, + "step": 83640 + }, + { + "epoch": 0.08424813237970631, + "grad_norm": 10.211380844463578, + "learning_rate": 4.9855857781852224e-05, + "loss": 2.0413, + "mean_token_accuracy": 0.506896561384201, + "step": 83645 + }, + { + "epoch": 0.08425316843281049, + "grad_norm": 11.607502455241056, + "learning_rate": 4.9855815431796796e-05, + "loss": 2.1468, + "mean_token_accuracy": 0.4535995125770569, + "step": 83650 + }, + { + "epoch": 0.08425820448591466, + "grad_norm": 9.650552742950898, + "learning_rate": 4.9855773075540896e-05, + "loss": 2.6727, + "mean_token_accuracy": 0.38275861740112305, + "step": 83655 + }, + { + "epoch": 0.08426324053901883, + "grad_norm": 11.50047321352506, + "learning_rate": 4.985573071308454e-05, + "loss": 2.2598, + "mean_token_accuracy": 0.38965516686439516, + "step": 83660 + }, + { + "epoch": 0.084268276592123, + "grad_norm": 10.018957259214215, + "learning_rate": 4.985568834442773e-05, + "loss": 2.0793, + "mean_token_accuracy": 0.47586206793785096, + "step": 83665 + }, + { + "epoch": 0.08427331264522718, + "grad_norm": 15.339343266534113, + "learning_rate": 4.9855645969570494e-05, + "loss": 2.7098, + "mean_token_accuracy": 0.38427101969718935, + "step": 83670 + }, + { + "epoch": 0.08427834869833135, + "grad_norm": 10.151174820132887, + "learning_rate": 4.9855603588512846e-05, + "loss": 2.23, + "mean_token_accuracy": 0.4379310250282288, + "step": 83675 + }, + { + "epoch": 0.08428338475143553, + "grad_norm": 11.190103851056818, + "learning_rate": 4.985556120125477e-05, + "loss": 2.7104, + "mean_token_accuracy": 0.3551724016666412, + "step": 83680 + }, + { + "epoch": 0.0842884208045397, + "grad_norm": 11.45299452698718, + "learning_rate": 4.98555188077963e-05, + "loss": 2.7769, + "mean_token_accuracy": 0.3620689570903778, + "step": 83685 + }, + { + "epoch": 0.08429345685764388, + "grad_norm": 8.633813503853581, + "learning_rate": 4.985547640813745e-05, + "loss": 2.1723, + "mean_token_accuracy": 0.4551724135875702, + "step": 83690 + }, + { + "epoch": 0.08429849291074805, + "grad_norm": 12.423418933991139, + "learning_rate": 4.985543400227823e-05, + "loss": 2.7155, + "mean_token_accuracy": 0.3896551728248596, + "step": 83695 + }, + { + "epoch": 0.08430352896385222, + "grad_norm": 14.56417705586622, + "learning_rate": 4.985539159021863e-05, + "loss": 3.1379, + "mean_token_accuracy": 0.37241379022598264, + "step": 83700 + }, + { + "epoch": 0.0843085650169564, + "grad_norm": 10.59480027066553, + "learning_rate": 4.985534917195869e-05, + "loss": 2.1449, + "mean_token_accuracy": 0.4517241418361664, + "step": 83705 + }, + { + "epoch": 0.08431360107006057, + "grad_norm": 11.772442408747924, + "learning_rate": 4.985530674749842e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.36551723480224607, + "step": 83710 + }, + { + "epoch": 0.08431863712316473, + "grad_norm": 12.439573023320447, + "learning_rate": 4.98552643168378e-05, + "loss": 2.5881, + "mean_token_accuracy": 0.42068965137004855, + "step": 83715 + }, + { + "epoch": 0.0843236731762689, + "grad_norm": 16.36763947195971, + "learning_rate": 4.9855221879976885e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.3999999940395355, + "step": 83720 + }, + { + "epoch": 0.08432870922937308, + "grad_norm": 13.275772855999945, + "learning_rate": 4.985517943691565e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.4429521977901459, + "step": 83725 + }, + { + "epoch": 0.08433374528247725, + "grad_norm": 10.251653508773567, + "learning_rate": 4.985513698765413e-05, + "loss": 3.1793, + "mean_token_accuracy": 0.2931034505367279, + "step": 83730 + }, + { + "epoch": 0.08433878133558143, + "grad_norm": 10.395316168758846, + "learning_rate": 4.9855094532192325e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.482758617401123, + "step": 83735 + }, + { + "epoch": 0.0843438173886856, + "grad_norm": 11.100283423538864, + "learning_rate": 4.9855052070530264e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.3896551728248596, + "step": 83740 + }, + { + "epoch": 0.08434885344178977, + "grad_norm": 11.055251755741168, + "learning_rate": 4.9855009602667934e-05, + "loss": 2.4166, + "mean_token_accuracy": 0.4241379201412201, + "step": 83745 + }, + { + "epoch": 0.08435388949489395, + "grad_norm": 9.972814619484913, + "learning_rate": 4.985496712860536e-05, + "loss": 2.3152, + "mean_token_accuracy": 0.4620689630508423, + "step": 83750 + }, + { + "epoch": 0.08435892554799812, + "grad_norm": 16.024162181669592, + "learning_rate": 4.985492464834255e-05, + "loss": 2.589, + "mean_token_accuracy": 0.4620689630508423, + "step": 83755 + }, + { + "epoch": 0.0843639616011023, + "grad_norm": 11.163473070194046, + "learning_rate": 4.9854882161879516e-05, + "loss": 2.7607, + "mean_token_accuracy": 0.3758620619773865, + "step": 83760 + }, + { + "epoch": 0.08436899765420647, + "grad_norm": 9.880954849879211, + "learning_rate": 4.985483966921628e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.37241379022598264, + "step": 83765 + }, + { + "epoch": 0.08437403370731064, + "grad_norm": 10.94050574802675, + "learning_rate": 4.985479717035285e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.4137930989265442, + "step": 83770 + }, + { + "epoch": 0.08437906976041482, + "grad_norm": 12.557494976653757, + "learning_rate": 4.9854754665289225e-05, + "loss": 2.7767, + "mean_token_accuracy": 0.3758620649576187, + "step": 83775 + }, + { + "epoch": 0.08438410581351899, + "grad_norm": 12.66255309221867, + "learning_rate": 4.985471215402542e-05, + "loss": 2.8687, + "mean_token_accuracy": 0.4034482717514038, + "step": 83780 + }, + { + "epoch": 0.08438914186662315, + "grad_norm": 10.057855018967532, + "learning_rate": 4.9854669636561466e-05, + "loss": 2.409, + "mean_token_accuracy": 0.42244404554367065, + "step": 83785 + }, + { + "epoch": 0.08439417791972732, + "grad_norm": 12.208123004929682, + "learning_rate": 4.985462711289736e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.4344827592372894, + "step": 83790 + }, + { + "epoch": 0.0843992139728315, + "grad_norm": 12.147974165143134, + "learning_rate": 4.98545845830331e-05, + "loss": 2.3825, + "mean_token_accuracy": 0.4344827592372894, + "step": 83795 + }, + { + "epoch": 0.08440425002593567, + "grad_norm": 11.813704677351128, + "learning_rate": 4.9854542046968726e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.37241379618644715, + "step": 83800 + }, + { + "epoch": 0.08440928607903984, + "grad_norm": 10.602102671869622, + "learning_rate": 4.985449950470423e-05, + "loss": 2.6925, + "mean_token_accuracy": 0.4, + "step": 83805 + }, + { + "epoch": 0.08441432213214402, + "grad_norm": 11.398026330272291, + "learning_rate": 4.985445695623963e-05, + "loss": 2.4871, + "mean_token_accuracy": 0.44482759237289426, + "step": 83810 + }, + { + "epoch": 0.08441935818524819, + "grad_norm": 12.717923909937156, + "learning_rate": 4.985441440157494e-05, + "loss": 3.0629, + "mean_token_accuracy": 0.4286945790052414, + "step": 83815 + }, + { + "epoch": 0.08442439423835237, + "grad_norm": 12.245332608468338, + "learning_rate": 4.9854371840710174e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.4603750824928284, + "step": 83820 + }, + { + "epoch": 0.08442943029145654, + "grad_norm": 12.599469001959527, + "learning_rate": 4.985432927364533e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.45517241954803467, + "step": 83825 + }, + { + "epoch": 0.08443446634456071, + "grad_norm": 13.254328463078659, + "learning_rate": 4.9854286700380444e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.4068965494632721, + "step": 83830 + }, + { + "epoch": 0.08443950239766489, + "grad_norm": 13.8261174746677, + "learning_rate": 4.98542441209155e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.4034482717514038, + "step": 83835 + }, + { + "epoch": 0.08444453845076906, + "grad_norm": 8.34441362785068, + "learning_rate": 4.9854201535250525e-05, + "loss": 1.9624, + "mean_token_accuracy": 0.5344827592372894, + "step": 83840 + }, + { + "epoch": 0.08444957450387323, + "grad_norm": 17.064341031092617, + "learning_rate": 4.985415894338553e-05, + "loss": 2.8597, + "mean_token_accuracy": 0.36206896901130675, + "step": 83845 + }, + { + "epoch": 0.08445461055697741, + "grad_norm": 10.152699338881392, + "learning_rate": 4.985411634532053e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.4172413766384125, + "step": 83850 + }, + { + "epoch": 0.08445964661008157, + "grad_norm": 12.417441380586595, + "learning_rate": 4.9854073741055526e-05, + "loss": 2.6186, + "mean_token_accuracy": 0.4344827502965927, + "step": 83855 + }, + { + "epoch": 0.08446468266318574, + "grad_norm": 13.26591848561438, + "learning_rate": 4.985403113059054e-05, + "loss": 2.6703, + "mean_token_accuracy": 0.41724138557910917, + "step": 83860 + }, + { + "epoch": 0.08446971871628992, + "grad_norm": 12.022348160886756, + "learning_rate": 4.985398851392558e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.38965517580509185, + "step": 83865 + }, + { + "epoch": 0.08447475476939409, + "grad_norm": 10.89647471812784, + "learning_rate": 4.985394589106066e-05, + "loss": 2.7098, + "mean_token_accuracy": 0.38275861740112305, + "step": 83870 + }, + { + "epoch": 0.08447979082249826, + "grad_norm": 10.76333245874957, + "learning_rate": 4.985390326199579e-05, + "loss": 2.7949, + "mean_token_accuracy": 0.37241379022598264, + "step": 83875 + }, + { + "epoch": 0.08448482687560244, + "grad_norm": 10.398702424385043, + "learning_rate": 4.985386062673098e-05, + "loss": 2.3506, + "mean_token_accuracy": 0.4482758641242981, + "step": 83880 + }, + { + "epoch": 0.08448986292870661, + "grad_norm": 13.013179038133718, + "learning_rate": 4.985381798526624e-05, + "loss": 2.7344, + "mean_token_accuracy": 0.38275861740112305, + "step": 83885 + }, + { + "epoch": 0.08449489898181078, + "grad_norm": 11.727675745059933, + "learning_rate": 4.985377533760158e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.4103448152542114, + "step": 83890 + }, + { + "epoch": 0.08449993503491496, + "grad_norm": 11.952265164870486, + "learning_rate": 4.985373268373703e-05, + "loss": 2.4226, + "mean_token_accuracy": 0.44482758045196535, + "step": 83895 + }, + { + "epoch": 0.08450497108801913, + "grad_norm": 11.32807850503098, + "learning_rate": 4.985369002367258e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.42758620381355283, + "step": 83900 + }, + { + "epoch": 0.0845100071411233, + "grad_norm": 11.632994481702731, + "learning_rate": 4.985364735740826e-05, + "loss": 2.6008, + "mean_token_accuracy": 0.4344827592372894, + "step": 83905 + }, + { + "epoch": 0.08451504319422748, + "grad_norm": 11.551755796900668, + "learning_rate": 4.985360468494407e-05, + "loss": 2.5523, + "mean_token_accuracy": 0.4241379380226135, + "step": 83910 + }, + { + "epoch": 0.08452007924733165, + "grad_norm": 11.4252815344893, + "learning_rate": 4.985356200628001e-05, + "loss": 2.4624, + "mean_token_accuracy": 0.3896551787853241, + "step": 83915 + }, + { + "epoch": 0.08452511530043583, + "grad_norm": 12.283142337152427, + "learning_rate": 4.985351932141612e-05, + "loss": 2.346, + "mean_token_accuracy": 0.42758620977401735, + "step": 83920 + }, + { + "epoch": 0.08453015135353999, + "grad_norm": 11.443495033579033, + "learning_rate": 4.98534766303524e-05, + "loss": 2.6946, + "mean_token_accuracy": 0.32758620381355286, + "step": 83925 + }, + { + "epoch": 0.08453518740664416, + "grad_norm": 13.267505861650257, + "learning_rate": 4.985343393308885e-05, + "loss": 2.8911, + "mean_token_accuracy": 0.3705989122390747, + "step": 83930 + }, + { + "epoch": 0.08454022345974833, + "grad_norm": 9.973610948780596, + "learning_rate": 4.98533912296255e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.35862068831920624, + "step": 83935 + }, + { + "epoch": 0.08454525951285251, + "grad_norm": 12.205285253287201, + "learning_rate": 4.985334851996235e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.3896551728248596, + "step": 83940 + }, + { + "epoch": 0.08455029556595668, + "grad_norm": 15.360173552977056, + "learning_rate": 4.985330580409941e-05, + "loss": 3.1112, + "mean_token_accuracy": 0.33103448152542114, + "step": 83945 + }, + { + "epoch": 0.08455533161906086, + "grad_norm": 12.476658347866254, + "learning_rate": 4.9853263082036704e-05, + "loss": 2.6085, + "mean_token_accuracy": 0.3551724076271057, + "step": 83950 + }, + { + "epoch": 0.08456036767216503, + "grad_norm": 10.232198684314515, + "learning_rate": 4.985322035377424e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.4551724076271057, + "step": 83955 + }, + { + "epoch": 0.0845654037252692, + "grad_norm": 12.87012878214802, + "learning_rate": 4.985317761931202e-05, + "loss": 2.3909, + "mean_token_accuracy": 0.4172413766384125, + "step": 83960 + }, + { + "epoch": 0.08457043977837338, + "grad_norm": 10.973916665385671, + "learning_rate": 4.985313487865006e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.46551724672317507, + "step": 83965 + }, + { + "epoch": 0.08457547583147755, + "grad_norm": 10.253114698910295, + "learning_rate": 4.985309213178839e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.41379310488700866, + "step": 83970 + }, + { + "epoch": 0.08458051188458172, + "grad_norm": 12.071961388193635, + "learning_rate": 4.985304937872699e-05, + "loss": 2.7512, + "mean_token_accuracy": 0.3743496656417847, + "step": 83975 + }, + { + "epoch": 0.0845855479376859, + "grad_norm": 13.324347642940728, + "learning_rate": 4.985300661946589e-05, + "loss": 2.6727, + "mean_token_accuracy": 0.379310342669487, + "step": 83980 + }, + { + "epoch": 0.08459058399079007, + "grad_norm": 11.411819037813563, + "learning_rate": 4.985296385400512e-05, + "loss": 2.7982, + "mean_token_accuracy": 0.36896551251411436, + "step": 83985 + }, + { + "epoch": 0.08459562004389425, + "grad_norm": 11.257611565136207, + "learning_rate": 4.985292108234465e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.37586206793785093, + "step": 83990 + }, + { + "epoch": 0.0846006560969984, + "grad_norm": 13.537053775996858, + "learning_rate": 4.985287830448452e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.38965516686439516, + "step": 83995 + }, + { + "epoch": 0.08460569215010258, + "grad_norm": 13.906050299881006, + "learning_rate": 4.985283552042474e-05, + "loss": 2.7442, + "mean_token_accuracy": 0.3793103337287903, + "step": 84000 + }, + { + "epoch": 0.08461072820320675, + "grad_norm": 10.419281836334983, + "learning_rate": 4.9852792730165314e-05, + "loss": 2.6115, + "mean_token_accuracy": 0.4068965494632721, + "step": 84005 + }, + { + "epoch": 0.08461576425631093, + "grad_norm": 9.158864964003985, + "learning_rate": 4.9852749933706256e-05, + "loss": 2.1702, + "mean_token_accuracy": 0.46551724672317507, + "step": 84010 + }, + { + "epoch": 0.0846208003094151, + "grad_norm": 11.763344817770335, + "learning_rate": 4.9852707131047575e-05, + "loss": 2.6802, + "mean_token_accuracy": 0.38620689511299133, + "step": 84015 + }, + { + "epoch": 0.08462583636251927, + "grad_norm": 11.895008350369881, + "learning_rate": 4.98526643221893e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.44827585816383364, + "step": 84020 + }, + { + "epoch": 0.08463087241562345, + "grad_norm": 12.071503916305106, + "learning_rate": 4.985262150713142e-05, + "loss": 2.491, + "mean_token_accuracy": 0.42413793206214906, + "step": 84025 + }, + { + "epoch": 0.08463590846872762, + "grad_norm": 10.246187236831082, + "learning_rate": 4.985257868587396e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.3862068891525269, + "step": 84030 + }, + { + "epoch": 0.0846409445218318, + "grad_norm": 13.257689269394195, + "learning_rate": 4.9852535858416934e-05, + "loss": 2.5774, + "mean_token_accuracy": 0.43103448748588563, + "step": 84035 + }, + { + "epoch": 0.08464598057493597, + "grad_norm": 12.95358319162498, + "learning_rate": 4.985249302476035e-05, + "loss": 2.8369, + "mean_token_accuracy": 0.3620689630508423, + "step": 84040 + }, + { + "epoch": 0.08465101662804014, + "grad_norm": 10.245910768807207, + "learning_rate": 4.985245018490421e-05, + "loss": 2.628, + "mean_token_accuracy": 0.37241379022598264, + "step": 84045 + }, + { + "epoch": 0.08465605268114432, + "grad_norm": 14.578725142548901, + "learning_rate": 4.985240733884854e-05, + "loss": 2.9109, + "mean_token_accuracy": 0.35172413289546967, + "step": 84050 + }, + { + "epoch": 0.08466108873424849, + "grad_norm": 10.383156952358455, + "learning_rate": 4.985236448659334e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.39655172228813174, + "step": 84055 + }, + { + "epoch": 0.08466612478735266, + "grad_norm": 12.575567199223936, + "learning_rate": 4.985232162813863e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.39655172228813174, + "step": 84060 + }, + { + "epoch": 0.08467116084045682, + "grad_norm": 12.513828451956813, + "learning_rate": 4.9852278763484426e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.4344827651977539, + "step": 84065 + }, + { + "epoch": 0.084676196893561, + "grad_norm": 9.393905132363841, + "learning_rate": 4.9852235892630736e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.4482758641242981, + "step": 84070 + }, + { + "epoch": 0.08468123294666517, + "grad_norm": 12.269920962193842, + "learning_rate": 4.9852193015577564e-05, + "loss": 2.7835, + "mean_token_accuracy": 0.4103448212146759, + "step": 84075 + }, + { + "epoch": 0.08468626899976935, + "grad_norm": 9.79828248148721, + "learning_rate": 4.9852150132324924e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.3758620649576187, + "step": 84080 + }, + { + "epoch": 0.08469130505287352, + "grad_norm": 10.583222648786517, + "learning_rate": 4.985210724287284e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.44718694090843203, + "step": 84085 + }, + { + "epoch": 0.0846963411059777, + "grad_norm": 12.602681685972973, + "learning_rate": 4.9852064347221314e-05, + "loss": 2.5306, + "mean_token_accuracy": 0.4275861978530884, + "step": 84090 + }, + { + "epoch": 0.08470137715908187, + "grad_norm": 10.647772372043272, + "learning_rate": 4.9852021445370365e-05, + "loss": 2.6933, + "mean_token_accuracy": 0.39655172228813174, + "step": 84095 + }, + { + "epoch": 0.08470641321218604, + "grad_norm": 11.40920354490283, + "learning_rate": 4.985197853731999e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.42413792610168455, + "step": 84100 + }, + { + "epoch": 0.08471144926529021, + "grad_norm": 15.877702334359258, + "learning_rate": 4.9851935623070204e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.43950392603874205, + "step": 84105 + }, + { + "epoch": 0.08471648531839439, + "grad_norm": 11.707089327675579, + "learning_rate": 4.985189270262104e-05, + "loss": 2.2041, + "mean_token_accuracy": 0.4448275864124298, + "step": 84110 + }, + { + "epoch": 0.08472152137149856, + "grad_norm": 9.87025037329012, + "learning_rate": 4.9851849775972484e-05, + "loss": 2.2986, + "mean_token_accuracy": 0.4310344815254211, + "step": 84115 + }, + { + "epoch": 0.08472655742460274, + "grad_norm": 11.815281298484093, + "learning_rate": 4.985180684312457e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.4984876036643982, + "step": 84120 + }, + { + "epoch": 0.08473159347770691, + "grad_norm": 10.829481199935287, + "learning_rate": 4.985176390407729e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.43448275327682495, + "step": 84125 + }, + { + "epoch": 0.08473662953081108, + "grad_norm": 10.874685122677445, + "learning_rate": 4.9851720958830675e-05, + "loss": 2.421, + "mean_token_accuracy": 0.4517241299152374, + "step": 84130 + }, + { + "epoch": 0.08474166558391524, + "grad_norm": 17.465566195839703, + "learning_rate": 4.985167800738471e-05, + "loss": 2.7733, + "mean_token_accuracy": 0.3931034505367279, + "step": 84135 + }, + { + "epoch": 0.08474670163701942, + "grad_norm": 12.40572395076069, + "learning_rate": 4.985163504973944e-05, + "loss": 2.4605, + "mean_token_accuracy": 0.37586207389831544, + "step": 84140 + }, + { + "epoch": 0.08475173769012359, + "grad_norm": 11.207461417053574, + "learning_rate": 4.9851592085894854e-05, + "loss": 2.5483, + "mean_token_accuracy": 0.4689655125141144, + "step": 84145 + }, + { + "epoch": 0.08475677374322776, + "grad_norm": 13.662482575849292, + "learning_rate": 4.985154911585097e-05, + "loss": 2.7909, + "mean_token_accuracy": 0.3827586203813553, + "step": 84150 + }, + { + "epoch": 0.08476180979633194, + "grad_norm": 13.355202985175895, + "learning_rate": 4.98515061396078e-05, + "loss": 2.7766, + "mean_token_accuracy": 0.39310344457626345, + "step": 84155 + }, + { + "epoch": 0.08476684584943611, + "grad_norm": 11.200713467896184, + "learning_rate": 4.9851463157165365e-05, + "loss": 2.3058, + "mean_token_accuracy": 0.4655172348022461, + "step": 84160 + }, + { + "epoch": 0.08477188190254029, + "grad_norm": 14.286598093369648, + "learning_rate": 4.985142016852366e-05, + "loss": 2.9078, + "mean_token_accuracy": 0.42413793206214906, + "step": 84165 + }, + { + "epoch": 0.08477691795564446, + "grad_norm": 13.078357659040503, + "learning_rate": 4.98513771736827e-05, + "loss": 2.7162, + "mean_token_accuracy": 0.39655172228813174, + "step": 84170 + }, + { + "epoch": 0.08478195400874863, + "grad_norm": 9.648647865007197, + "learning_rate": 4.9851334172642514e-05, + "loss": 2.7713, + "mean_token_accuracy": 0.41034482419490814, + "step": 84175 + }, + { + "epoch": 0.08478699006185281, + "grad_norm": 18.711205379361328, + "learning_rate": 4.985129116540309e-05, + "loss": 2.7187, + "mean_token_accuracy": 0.3999999940395355, + "step": 84180 + }, + { + "epoch": 0.08479202611495698, + "grad_norm": 11.151462122417966, + "learning_rate": 4.9851248151964466e-05, + "loss": 2.6574, + "mean_token_accuracy": 0.3965517282485962, + "step": 84185 + }, + { + "epoch": 0.08479706216806115, + "grad_norm": 13.566341440282317, + "learning_rate": 4.9851205132326626e-05, + "loss": 2.8484, + "mean_token_accuracy": 0.358620685338974, + "step": 84190 + }, + { + "epoch": 0.08480209822116533, + "grad_norm": 14.286713248963341, + "learning_rate": 4.985116210648961e-05, + "loss": 2.7069, + "mean_token_accuracy": 0.39655172228813174, + "step": 84195 + }, + { + "epoch": 0.0848071342742695, + "grad_norm": 10.611132602784021, + "learning_rate": 4.98511190744534e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.4310344815254211, + "step": 84200 + }, + { + "epoch": 0.08481217032737366, + "grad_norm": 11.03091011470549, + "learning_rate": 4.985107603621803e-05, + "loss": 2.766, + "mean_token_accuracy": 0.358620685338974, + "step": 84205 + }, + { + "epoch": 0.08481720638047784, + "grad_norm": 11.096924314656782, + "learning_rate": 4.9851032991783514e-05, + "loss": 2.1868, + "mean_token_accuracy": 0.4275861978530884, + "step": 84210 + }, + { + "epoch": 0.08482224243358201, + "grad_norm": 13.203385357226272, + "learning_rate": 4.985098994114985e-05, + "loss": 2.3506, + "mean_token_accuracy": 0.47586206197738645, + "step": 84215 + }, + { + "epoch": 0.08482727848668618, + "grad_norm": 20.08745848220974, + "learning_rate": 4.9850946884317054e-05, + "loss": 2.484, + "mean_token_accuracy": 0.46031458377838136, + "step": 84220 + }, + { + "epoch": 0.08483231453979036, + "grad_norm": 12.186828164252939, + "learning_rate": 4.985090382128515e-05, + "loss": 2.6952, + "mean_token_accuracy": 0.34137930274009703, + "step": 84225 + }, + { + "epoch": 0.08483735059289453, + "grad_norm": 11.552932838614483, + "learning_rate": 4.9850860752054124e-05, + "loss": 2.8502, + "mean_token_accuracy": 0.4068965494632721, + "step": 84230 + }, + { + "epoch": 0.0848423866459987, + "grad_norm": 9.979725244695615, + "learning_rate": 4.9850817676624006e-05, + "loss": 2.8145, + "mean_token_accuracy": 0.39655172228813174, + "step": 84235 + }, + { + "epoch": 0.08484742269910288, + "grad_norm": 9.994969684032503, + "learning_rate": 4.985077459499481e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.43103448748588563, + "step": 84240 + }, + { + "epoch": 0.08485245875220705, + "grad_norm": 36.515186336496186, + "learning_rate": 4.985073150716654e-05, + "loss": 2.8818, + "mean_token_accuracy": 0.4034482777118683, + "step": 84245 + }, + { + "epoch": 0.08485749480531123, + "grad_norm": 11.10316859874001, + "learning_rate": 4.985068841313921e-05, + "loss": 2.521, + "mean_token_accuracy": 0.4034482717514038, + "step": 84250 + }, + { + "epoch": 0.0848625308584154, + "grad_norm": 9.287877523729954, + "learning_rate": 4.985064531291284e-05, + "loss": 2.5038, + "mean_token_accuracy": 0.36896551847457887, + "step": 84255 + }, + { + "epoch": 0.08486756691151957, + "grad_norm": 9.536114648483075, + "learning_rate": 4.9850602206487434e-05, + "loss": 2.0329, + "mean_token_accuracy": 0.4517241418361664, + "step": 84260 + }, + { + "epoch": 0.08487260296462375, + "grad_norm": 11.6830558221404, + "learning_rate": 4.9850559093862995e-05, + "loss": 2.5461, + "mean_token_accuracy": 0.4206896543502808, + "step": 84265 + }, + { + "epoch": 0.08487763901772791, + "grad_norm": 9.273789837989519, + "learning_rate": 4.9850515975039555e-05, + "loss": 2.3566, + "mean_token_accuracy": 0.4103448212146759, + "step": 84270 + }, + { + "epoch": 0.08488267507083208, + "grad_norm": 11.391562241937056, + "learning_rate": 4.985047285001712e-05, + "loss": 2.1968, + "mean_token_accuracy": 0.4655172348022461, + "step": 84275 + }, + { + "epoch": 0.08488771112393625, + "grad_norm": 11.030895412866492, + "learning_rate": 4.985042971879569e-05, + "loss": 2.7381, + "mean_token_accuracy": 0.4068965554237366, + "step": 84280 + }, + { + "epoch": 0.08489274717704043, + "grad_norm": 11.100432651413575, + "learning_rate": 4.985038658137529e-05, + "loss": 2.3755, + "mean_token_accuracy": 0.4344827651977539, + "step": 84285 + }, + { + "epoch": 0.0848977832301446, + "grad_norm": 11.103488017606288, + "learning_rate": 4.985034343775592e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.41034482717514037, + "step": 84290 + }, + { + "epoch": 0.08490281928324878, + "grad_norm": 16.16668250795376, + "learning_rate": 4.985030028793761e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.4068965494632721, + "step": 84295 + }, + { + "epoch": 0.08490785533635295, + "grad_norm": 12.333367533235188, + "learning_rate": 4.985025713192035e-05, + "loss": 2.4736, + "mean_token_accuracy": 0.42413792610168455, + "step": 84300 + }, + { + "epoch": 0.08491289138945712, + "grad_norm": 9.978560169014587, + "learning_rate": 4.985021396970416e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.3931034505367279, + "step": 84305 + }, + { + "epoch": 0.0849179274425613, + "grad_norm": 11.769228079090027, + "learning_rate": 4.985017080128906e-05, + "loss": 2.4486, + "mean_token_accuracy": 0.41929823756217954, + "step": 84310 + }, + { + "epoch": 0.08492296349566547, + "grad_norm": 8.574489023713143, + "learning_rate": 4.9850127626675055e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4862068951129913, + "step": 84315 + }, + { + "epoch": 0.08492799954876964, + "grad_norm": 10.470837457361215, + "learning_rate": 4.985008444586216e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.4068965494632721, + "step": 84320 + }, + { + "epoch": 0.08493303560187382, + "grad_norm": 16.492463480945514, + "learning_rate": 4.985004125885039e-05, + "loss": 2.5968, + "mean_token_accuracy": 0.4172413766384125, + "step": 84325 + }, + { + "epoch": 0.08493807165497799, + "grad_norm": 11.094778413128005, + "learning_rate": 4.9849998065639754e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4344827592372894, + "step": 84330 + }, + { + "epoch": 0.08494310770808217, + "grad_norm": 12.184278906935136, + "learning_rate": 4.984995486623026e-05, + "loss": 2.3188, + "mean_token_accuracy": 0.4517241299152374, + "step": 84335 + }, + { + "epoch": 0.08494814376118633, + "grad_norm": 11.600304892147106, + "learning_rate": 4.9849911660621916e-05, + "loss": 2.1649, + "mean_token_accuracy": 0.48088324069976807, + "step": 84340 + }, + { + "epoch": 0.0849531798142905, + "grad_norm": 10.19200883249514, + "learning_rate": 4.984986844881474e-05, + "loss": 2.7399, + "mean_token_accuracy": 0.42891711592674253, + "step": 84345 + }, + { + "epoch": 0.08495821586739467, + "grad_norm": 9.686585989275923, + "learning_rate": 4.984982523080876e-05, + "loss": 2.8912, + "mean_token_accuracy": 0.35862068831920624, + "step": 84350 + }, + { + "epoch": 0.08496325192049885, + "grad_norm": 11.453765416333471, + "learning_rate": 4.984978200660396e-05, + "loss": 2.3594, + "mean_token_accuracy": 0.41724138259887694, + "step": 84355 + }, + { + "epoch": 0.08496828797360302, + "grad_norm": 12.184808333398058, + "learning_rate": 4.984973877620037e-05, + "loss": 2.7556, + "mean_token_accuracy": 0.38275861740112305, + "step": 84360 + }, + { + "epoch": 0.0849733240267072, + "grad_norm": 10.48616267662966, + "learning_rate": 4.984969553959799e-05, + "loss": 2.4211, + "mean_token_accuracy": 0.41724138259887694, + "step": 84365 + }, + { + "epoch": 0.08497836007981137, + "grad_norm": 10.767591480089358, + "learning_rate": 4.984965229679684e-05, + "loss": 2.4245, + "mean_token_accuracy": 0.40689654648303986, + "step": 84370 + }, + { + "epoch": 0.08498339613291554, + "grad_norm": 12.828653247519341, + "learning_rate": 4.984960904779693e-05, + "loss": 2.5812, + "mean_token_accuracy": 0.41379310190677643, + "step": 84375 + }, + { + "epoch": 0.08498843218601972, + "grad_norm": 11.287974775453847, + "learning_rate": 4.984956579259828e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.4137930989265442, + "step": 84380 + }, + { + "epoch": 0.08499346823912389, + "grad_norm": 8.766901434050418, + "learning_rate": 4.9849522531200895e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.3987295746803284, + "step": 84385 + }, + { + "epoch": 0.08499850429222806, + "grad_norm": 9.485667330991594, + "learning_rate": 4.984947926360478e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.48620688915252686, + "step": 84390 + }, + { + "epoch": 0.08500354034533224, + "grad_norm": 11.78892736914468, + "learning_rate": 4.984943598980996e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.4172413766384125, + "step": 84395 + }, + { + "epoch": 0.08500857639843641, + "grad_norm": 10.194695042178273, + "learning_rate": 4.984939270981643e-05, + "loss": 2.1158, + "mean_token_accuracy": 0.47931034564971925, + "step": 84400 + }, + { + "epoch": 0.08501361245154058, + "grad_norm": 11.861296369831022, + "learning_rate": 4.984934942362422e-05, + "loss": 2.3728, + "mean_token_accuracy": 0.42758620977401735, + "step": 84405 + }, + { + "epoch": 0.08501864850464474, + "grad_norm": 10.151868496312817, + "learning_rate": 4.984930613123333e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.4482758641242981, + "step": 84410 + }, + { + "epoch": 0.08502368455774892, + "grad_norm": 13.26141666607643, + "learning_rate": 4.984926283264378e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.3724137872457504, + "step": 84415 + }, + { + "epoch": 0.08502872061085309, + "grad_norm": 12.346749844357271, + "learning_rate": 4.984921952785558e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.4758620738983154, + "step": 84420 + }, + { + "epoch": 0.08503375666395727, + "grad_norm": 12.020528042669813, + "learning_rate": 4.984917621686874e-05, + "loss": 2.6498, + "mean_token_accuracy": 0.3758620619773865, + "step": 84425 + }, + { + "epoch": 0.08503879271706144, + "grad_norm": 14.00184951981294, + "learning_rate": 4.984913289968327e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.40689654350280763, + "step": 84430 + }, + { + "epoch": 0.08504382877016561, + "grad_norm": 10.131981225888152, + "learning_rate": 4.984908957629919e-05, + "loss": 2.6893, + "mean_token_accuracy": 0.3862069010734558, + "step": 84435 + }, + { + "epoch": 0.08504886482326979, + "grad_norm": 13.688999439861478, + "learning_rate": 4.9849046246716495e-05, + "loss": 2.8097, + "mean_token_accuracy": 0.4137930989265442, + "step": 84440 + }, + { + "epoch": 0.08505390087637396, + "grad_norm": 13.240859668973329, + "learning_rate": 4.9849002910935216e-05, + "loss": 2.7108, + "mean_token_accuracy": 0.334482753276825, + "step": 84445 + }, + { + "epoch": 0.08505893692947814, + "grad_norm": 10.47270945730837, + "learning_rate": 4.984895956895536e-05, + "loss": 2.6691, + "mean_token_accuracy": 0.4275862067937851, + "step": 84450 + }, + { + "epoch": 0.08506397298258231, + "grad_norm": 11.94512787235383, + "learning_rate": 4.984891622077693e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.4103448331356049, + "step": 84455 + }, + { + "epoch": 0.08506900903568648, + "grad_norm": 11.341801043786331, + "learning_rate": 4.9848872866399956e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.37586206793785093, + "step": 84460 + }, + { + "epoch": 0.08507404508879066, + "grad_norm": 12.43020649005942, + "learning_rate": 4.984882950582443e-05, + "loss": 2.412, + "mean_token_accuracy": 0.45172414779663084, + "step": 84465 + }, + { + "epoch": 0.08507908114189483, + "grad_norm": 11.199267528472951, + "learning_rate": 4.9848786139050377e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.441379314661026, + "step": 84470 + }, + { + "epoch": 0.085084117194999, + "grad_norm": 15.572892566192156, + "learning_rate": 4.98487427660778e-05, + "loss": 2.8474, + "mean_token_accuracy": 0.4137930989265442, + "step": 84475 + }, + { + "epoch": 0.08508915324810316, + "grad_norm": 11.538591631311911, + "learning_rate": 4.9848699386906713e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.4206896543502808, + "step": 84480 + }, + { + "epoch": 0.08509418930120734, + "grad_norm": 10.315776318851695, + "learning_rate": 4.9848656001537143e-05, + "loss": 2.5654, + "mean_token_accuracy": 0.4379310250282288, + "step": 84485 + }, + { + "epoch": 0.08509922535431151, + "grad_norm": 15.212140708893696, + "learning_rate": 4.984861260996908e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4137930929660797, + "step": 84490 + }, + { + "epoch": 0.08510426140741569, + "grad_norm": 12.003674766681122, + "learning_rate": 4.984856921220255e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.4551724076271057, + "step": 84495 + }, + { + "epoch": 0.08510929746051986, + "grad_norm": 8.968695648919995, + "learning_rate": 4.984852580823756e-05, + "loss": 2.2852, + "mean_token_accuracy": 0.42413792610168455, + "step": 84500 + }, + { + "epoch": 0.08511433351362403, + "grad_norm": 12.589572264812752, + "learning_rate": 4.984848239807412e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.41379310488700866, + "step": 84505 + }, + { + "epoch": 0.0851193695667282, + "grad_norm": 15.8679545111173, + "learning_rate": 4.9848438981712256e-05, + "loss": 2.4241, + "mean_token_accuracy": 0.4103448212146759, + "step": 84510 + }, + { + "epoch": 0.08512440561983238, + "grad_norm": 39.63071783502666, + "learning_rate": 4.984839555915196e-05, + "loss": 2.4455, + "mean_token_accuracy": 0.43448275327682495, + "step": 84515 + }, + { + "epoch": 0.08512944167293655, + "grad_norm": 10.151088486100734, + "learning_rate": 4.984835213039325e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.4620689630508423, + "step": 84520 + }, + { + "epoch": 0.08513447772604073, + "grad_norm": 12.41790032572079, + "learning_rate": 4.9848308695436144e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.4172413766384125, + "step": 84525 + }, + { + "epoch": 0.0851395137791449, + "grad_norm": 10.877655507858771, + "learning_rate": 4.984826525428066e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.43103447556495667, + "step": 84530 + }, + { + "epoch": 0.08514454983224908, + "grad_norm": 9.97160224644332, + "learning_rate": 4.984822180692679e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.3862068891525269, + "step": 84535 + }, + { + "epoch": 0.08514958588535325, + "grad_norm": 11.921062781809043, + "learning_rate": 4.984817835337456e-05, + "loss": 2.7481, + "mean_token_accuracy": 0.36896550953388213, + "step": 84540 + }, + { + "epoch": 0.08515462193845742, + "grad_norm": 11.668033587057701, + "learning_rate": 4.984813489362398e-05, + "loss": 2.4834, + "mean_token_accuracy": 0.4226860165596008, + "step": 84545 + }, + { + "epoch": 0.08515965799156158, + "grad_norm": 12.55272835602653, + "learning_rate": 4.9848091427675056e-05, + "loss": 2.3746, + "mean_token_accuracy": 0.38275861740112305, + "step": 84550 + }, + { + "epoch": 0.08516469404466576, + "grad_norm": 11.237504842676676, + "learning_rate": 4.9848047955527815e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.44827585816383364, + "step": 84555 + }, + { + "epoch": 0.08516973009776993, + "grad_norm": 12.23209827172702, + "learning_rate": 4.984800447718226e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.4586206912994385, + "step": 84560 + }, + { + "epoch": 0.0851747661508741, + "grad_norm": 10.903491627479307, + "learning_rate": 4.984796099263839e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.4448275864124298, + "step": 84565 + }, + { + "epoch": 0.08517980220397828, + "grad_norm": 10.474051700439304, + "learning_rate": 4.984791750189625e-05, + "loss": 2.598, + "mean_token_accuracy": 0.3586206793785095, + "step": 84570 + }, + { + "epoch": 0.08518483825708245, + "grad_norm": 19.794011059465998, + "learning_rate": 4.984787400495582e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.41379310488700866, + "step": 84575 + }, + { + "epoch": 0.08518987431018663, + "grad_norm": 10.280470348202181, + "learning_rate": 4.984783050181712e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.4310344815254211, + "step": 84580 + }, + { + "epoch": 0.0851949103632908, + "grad_norm": 11.475574558126407, + "learning_rate": 4.984778699248017e-05, + "loss": 2.86, + "mean_token_accuracy": 0.3275862067937851, + "step": 84585 + }, + { + "epoch": 0.08519994641639497, + "grad_norm": 10.03683621786851, + "learning_rate": 4.984774347694498e-05, + "loss": 2.4673, + "mean_token_accuracy": 0.43793103098869324, + "step": 84590 + }, + { + "epoch": 0.08520498246949915, + "grad_norm": 12.12989956981753, + "learning_rate": 4.9847699955211555e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.3517241388559341, + "step": 84595 + }, + { + "epoch": 0.08521001852260332, + "grad_norm": 10.244175830724057, + "learning_rate": 4.984765642727991e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.3862068891525269, + "step": 84600 + }, + { + "epoch": 0.0852150545757075, + "grad_norm": 14.628615663778005, + "learning_rate": 4.984761289315006e-05, + "loss": 2.4964, + "mean_token_accuracy": 0.4137930989265442, + "step": 84605 + }, + { + "epoch": 0.08522009062881167, + "grad_norm": 9.870934901720963, + "learning_rate": 4.984756935282203e-05, + "loss": 2.6974, + "mean_token_accuracy": 0.39999999701976774, + "step": 84610 + }, + { + "epoch": 0.08522512668191584, + "grad_norm": 11.01603438631754, + "learning_rate": 4.98475258062958e-05, + "loss": 2.1647, + "mean_token_accuracy": 0.45517241954803467, + "step": 84615 + }, + { + "epoch": 0.08523016273502, + "grad_norm": 11.524847961866984, + "learning_rate": 4.984748225357141e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.47586206197738645, + "step": 84620 + }, + { + "epoch": 0.08523519878812418, + "grad_norm": 13.4895765933074, + "learning_rate": 4.984743869464887e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.43793103098869324, + "step": 84625 + }, + { + "epoch": 0.08524023484122835, + "grad_norm": 10.225665315704582, + "learning_rate": 4.984739512952817e-05, + "loss": 2.5444, + "mean_token_accuracy": 0.4068965494632721, + "step": 84630 + }, + { + "epoch": 0.08524527089433252, + "grad_norm": 10.278993971954705, + "learning_rate": 4.984735155820934e-05, + "loss": 2.6896, + "mean_token_accuracy": 0.4034482717514038, + "step": 84635 + }, + { + "epoch": 0.0852503069474367, + "grad_norm": 10.670394700520866, + "learning_rate": 4.984730798069239e-05, + "loss": 2.3579, + "mean_token_accuracy": 0.41724138259887694, + "step": 84640 + }, + { + "epoch": 0.08525534300054087, + "grad_norm": 11.55058103019508, + "learning_rate": 4.984726439697733e-05, + "loss": 2.8365, + "mean_token_accuracy": 0.4068965554237366, + "step": 84645 + }, + { + "epoch": 0.08526037905364504, + "grad_norm": 9.32338116219339, + "learning_rate": 4.9847220807064174e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.43793103098869324, + "step": 84650 + }, + { + "epoch": 0.08526541510674922, + "grad_norm": 12.316541387402355, + "learning_rate": 4.9847177210952935e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.4172413766384125, + "step": 84655 + }, + { + "epoch": 0.08527045115985339, + "grad_norm": 7.241387908224625, + "learning_rate": 4.984713360864362e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.5020935952663421, + "step": 84660 + }, + { + "epoch": 0.08527548721295757, + "grad_norm": 14.523320856703261, + "learning_rate": 4.984709000013625e-05, + "loss": 2.9934, + "mean_token_accuracy": 0.3620689630508423, + "step": 84665 + }, + { + "epoch": 0.08528052326606174, + "grad_norm": 9.015426040179431, + "learning_rate": 4.984704638543082e-05, + "loss": 2.0629, + "mean_token_accuracy": 0.47586206197738645, + "step": 84670 + }, + { + "epoch": 0.08528555931916591, + "grad_norm": 13.302223112506663, + "learning_rate": 4.984700276452737e-05, + "loss": 2.2786, + "mean_token_accuracy": 0.458620685338974, + "step": 84675 + }, + { + "epoch": 0.08529059537227009, + "grad_norm": 11.16595514248166, + "learning_rate": 4.984695913742588e-05, + "loss": 2.4225, + "mean_token_accuracy": 0.4137930989265442, + "step": 84680 + }, + { + "epoch": 0.08529563142537426, + "grad_norm": 11.354800401263947, + "learning_rate": 4.9846915504126376e-05, + "loss": 2.6468, + "mean_token_accuracy": 0.458620685338974, + "step": 84685 + }, + { + "epoch": 0.08530066747847842, + "grad_norm": 11.583386402158622, + "learning_rate": 4.9846871864628884e-05, + "loss": 1.9683, + "mean_token_accuracy": 0.5172413766384125, + "step": 84690 + }, + { + "epoch": 0.0853057035315826, + "grad_norm": 10.327043853429629, + "learning_rate": 4.98468282189334e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.4620689570903778, + "step": 84695 + }, + { + "epoch": 0.08531073958468677, + "grad_norm": 12.983142422137602, + "learning_rate": 4.984678456703994e-05, + "loss": 2.2929, + "mean_token_accuracy": 0.4310344815254211, + "step": 84700 + }, + { + "epoch": 0.08531577563779094, + "grad_norm": 8.933503006154062, + "learning_rate": 4.984674090894851e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.417241370677948, + "step": 84705 + }, + { + "epoch": 0.08532081169089512, + "grad_norm": 12.272507498438783, + "learning_rate": 4.984669724465913e-05, + "loss": 2.7015, + "mean_token_accuracy": 0.3517241358757019, + "step": 84710 + }, + { + "epoch": 0.08532584774399929, + "grad_norm": 25.551052579900965, + "learning_rate": 4.984665357417182e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.45862067937850953, + "step": 84715 + }, + { + "epoch": 0.08533088379710346, + "grad_norm": 10.29086593150973, + "learning_rate": 4.984660989748657e-05, + "loss": 2.4128, + "mean_token_accuracy": 0.37586206793785093, + "step": 84720 + }, + { + "epoch": 0.08533591985020764, + "grad_norm": 8.840366702981505, + "learning_rate": 4.9846566214603405e-05, + "loss": 2.4247, + "mean_token_accuracy": 0.4103448182344437, + "step": 84725 + }, + { + "epoch": 0.08534095590331181, + "grad_norm": 13.020275954691687, + "learning_rate": 4.9846522525522346e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.4344827592372894, + "step": 84730 + }, + { + "epoch": 0.08534599195641598, + "grad_norm": 14.529412065382648, + "learning_rate": 4.984647883024339e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.4262552857398987, + "step": 84735 + }, + { + "epoch": 0.08535102800952016, + "grad_norm": 9.838055254839905, + "learning_rate": 4.984643512876655e-05, + "loss": 2.6812, + "mean_token_accuracy": 0.4019358724355698, + "step": 84740 + }, + { + "epoch": 0.08535606406262433, + "grad_norm": 10.668436783535542, + "learning_rate": 4.984639142109185e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.44482758045196535, + "step": 84745 + }, + { + "epoch": 0.0853611001157285, + "grad_norm": 10.318678538761349, + "learning_rate": 4.984634770721929e-05, + "loss": 2.4585, + "mean_token_accuracy": 0.4344827592372894, + "step": 84750 + }, + { + "epoch": 0.08536613616883268, + "grad_norm": 10.741265438412023, + "learning_rate": 4.984630398714889e-05, + "loss": 2.5932, + "mean_token_accuracy": 0.34827586114406583, + "step": 84755 + }, + { + "epoch": 0.08537117222193684, + "grad_norm": 10.263368501053987, + "learning_rate": 4.984626026088066e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.4172413766384125, + "step": 84760 + }, + { + "epoch": 0.08537620827504101, + "grad_norm": 12.316449344697741, + "learning_rate": 4.9846216528414614e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.42758620381355283, + "step": 84765 + }, + { + "epoch": 0.08538124432814519, + "grad_norm": 12.786897991712799, + "learning_rate": 4.984617278975075e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.41724138259887694, + "step": 84770 + }, + { + "epoch": 0.08538628038124936, + "grad_norm": 18.4907771248876, + "learning_rate": 4.9846129044889104e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.4586206912994385, + "step": 84775 + }, + { + "epoch": 0.08539131643435353, + "grad_norm": 11.127326146716031, + "learning_rate": 4.984608529382967e-05, + "loss": 2.3356, + "mean_token_accuracy": 0.43448275327682495, + "step": 84780 + }, + { + "epoch": 0.08539635248745771, + "grad_norm": 13.730937723790454, + "learning_rate": 4.9846041536572464e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.3827586233615875, + "step": 84785 + }, + { + "epoch": 0.08540138854056188, + "grad_norm": 13.329863010094043, + "learning_rate": 4.98459977731175e-05, + "loss": 2.8554, + "mean_token_accuracy": 0.36551723480224607, + "step": 84790 + }, + { + "epoch": 0.08540642459366606, + "grad_norm": 9.54327958102799, + "learning_rate": 4.98459540034648e-05, + "loss": 2.531, + "mean_token_accuracy": 0.4, + "step": 84795 + }, + { + "epoch": 0.08541146064677023, + "grad_norm": 10.822723935559171, + "learning_rate": 4.984591022761436e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.42413793206214906, + "step": 84800 + }, + { + "epoch": 0.0854164966998744, + "grad_norm": 10.337298328923886, + "learning_rate": 4.9845866445566195e-05, + "loss": 2.9763, + "mean_token_accuracy": 0.35862069129943847, + "step": 84805 + }, + { + "epoch": 0.08542153275297858, + "grad_norm": 11.754754415851192, + "learning_rate": 4.9845822657320325e-05, + "loss": 2.6188, + "mean_token_accuracy": 0.4, + "step": 84810 + }, + { + "epoch": 0.08542656880608275, + "grad_norm": 10.776760190852373, + "learning_rate": 4.984577886287675e-05, + "loss": 2.1525, + "mean_token_accuracy": 0.4463054269552231, + "step": 84815 + }, + { + "epoch": 0.08543160485918692, + "grad_norm": 8.700307520826124, + "learning_rate": 4.98457350622355e-05, + "loss": 2.1411, + "mean_token_accuracy": 0.458620685338974, + "step": 84820 + }, + { + "epoch": 0.0854366409122911, + "grad_norm": 14.362223184421186, + "learning_rate": 4.984569125539657e-05, + "loss": 2.2321, + "mean_token_accuracy": 0.4344827592372894, + "step": 84825 + }, + { + "epoch": 0.08544167696539526, + "grad_norm": 9.604512842635893, + "learning_rate": 4.984564744235998e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.41379310488700866, + "step": 84830 + }, + { + "epoch": 0.08544671301849943, + "grad_norm": 12.749780105578493, + "learning_rate": 4.984560362312575e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4327283680438995, + "step": 84835 + }, + { + "epoch": 0.0854517490716036, + "grad_norm": 11.016649748836276, + "learning_rate": 4.984555979769387e-05, + "loss": 2.7468, + "mean_token_accuracy": 0.39310344457626345, + "step": 84840 + }, + { + "epoch": 0.08545678512470778, + "grad_norm": 11.22140505825962, + "learning_rate": 4.9845515966064383e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.44482758045196535, + "step": 84845 + }, + { + "epoch": 0.08546182117781195, + "grad_norm": 14.846301264081985, + "learning_rate": 4.984547212823727e-05, + "loss": 2.5009, + "mean_token_accuracy": 0.44827585816383364, + "step": 84850 + }, + { + "epoch": 0.08546685723091613, + "grad_norm": 10.941981200457525, + "learning_rate": 4.984542828421256e-05, + "loss": 2.3513, + "mean_token_accuracy": 0.4379310429096222, + "step": 84855 + }, + { + "epoch": 0.0854718932840203, + "grad_norm": 10.20418301186699, + "learning_rate": 4.984538443399026e-05, + "loss": 2.1964, + "mean_token_accuracy": 0.4761645495891571, + "step": 84860 + }, + { + "epoch": 0.08547692933712447, + "grad_norm": 10.3312941527418, + "learning_rate": 4.984534057757039e-05, + "loss": 2.5541, + "mean_token_accuracy": 0.4103448212146759, + "step": 84865 + }, + { + "epoch": 0.08548196539022865, + "grad_norm": 11.719371097398584, + "learning_rate": 4.9845296714952956e-05, + "loss": 2.3445, + "mean_token_accuracy": 0.41034482717514037, + "step": 84870 + }, + { + "epoch": 0.08548700144333282, + "grad_norm": 11.559634627596225, + "learning_rate": 4.984525284613796e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.42952207922935487, + "step": 84875 + }, + { + "epoch": 0.085492037496437, + "grad_norm": 10.366322945678176, + "learning_rate": 4.984520897112543e-05, + "loss": 2.3779, + "mean_token_accuracy": 0.36896551251411436, + "step": 84880 + }, + { + "epoch": 0.08549707354954117, + "grad_norm": 10.931057461523595, + "learning_rate": 4.9845165089915386e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.441379314661026, + "step": 84885 + }, + { + "epoch": 0.08550210960264534, + "grad_norm": 9.658941064382661, + "learning_rate": 4.984512120250781e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4310344815254211, + "step": 84890 + }, + { + "epoch": 0.08550714565574952, + "grad_norm": 10.563480312026174, + "learning_rate": 4.9845077308902737e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.5015728950500489, + "step": 84895 + }, + { + "epoch": 0.08551218170885368, + "grad_norm": 10.397525702347464, + "learning_rate": 4.984503340910018e-05, + "loss": 2.6549, + "mean_token_accuracy": 0.3896551728248596, + "step": 84900 + }, + { + "epoch": 0.08551721776195785, + "grad_norm": 9.583679109625598, + "learning_rate": 4.9844989503100136e-05, + "loss": 2.6838, + "mean_token_accuracy": 0.34482758641242983, + "step": 84905 + }, + { + "epoch": 0.08552225381506202, + "grad_norm": 10.539810370061335, + "learning_rate": 4.984494559090263e-05, + "loss": 2.515, + "mean_token_accuracy": 0.4358136713504791, + "step": 84910 + }, + { + "epoch": 0.0855272898681662, + "grad_norm": 12.121793066494048, + "learning_rate": 4.984490167250767e-05, + "loss": 2.797, + "mean_token_accuracy": 0.3827586233615875, + "step": 84915 + }, + { + "epoch": 0.08553232592127037, + "grad_norm": 12.837429395372114, + "learning_rate": 4.984485774791526e-05, + "loss": 2.6255, + "mean_token_accuracy": 0.4344827651977539, + "step": 84920 + }, + { + "epoch": 0.08553736197437455, + "grad_norm": 14.972352772295832, + "learning_rate": 4.984481381712543e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.4986085891723633, + "step": 84925 + }, + { + "epoch": 0.08554239802747872, + "grad_norm": 12.282918672086751, + "learning_rate": 4.984476988013818e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.3827586233615875, + "step": 84930 + }, + { + "epoch": 0.0855474340805829, + "grad_norm": 11.976660683360974, + "learning_rate": 4.984472593695353e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.39310344457626345, + "step": 84935 + }, + { + "epoch": 0.08555247013368707, + "grad_norm": 11.803909869463793, + "learning_rate": 4.984468198757147e-05, + "loss": 2.5298, + "mean_token_accuracy": 0.3896551728248596, + "step": 84940 + }, + { + "epoch": 0.08555750618679124, + "grad_norm": 13.703224266665037, + "learning_rate": 4.9844638031992046e-05, + "loss": 2.8073, + "mean_token_accuracy": 0.3758620709180832, + "step": 84945 + }, + { + "epoch": 0.08556254223989541, + "grad_norm": 11.036223547197736, + "learning_rate": 4.984459407021525e-05, + "loss": 2.4779, + "mean_token_accuracy": 0.45640393495559695, + "step": 84950 + }, + { + "epoch": 0.08556757829299959, + "grad_norm": 10.039615791075628, + "learning_rate": 4.9844550102241095e-05, + "loss": 2.4055, + "mean_token_accuracy": 0.43932244181632996, + "step": 84955 + }, + { + "epoch": 0.08557261434610376, + "grad_norm": 10.257827787558794, + "learning_rate": 4.9844506128069596e-05, + "loss": 2.1784, + "mean_token_accuracy": 0.441379314661026, + "step": 84960 + }, + { + "epoch": 0.08557765039920794, + "grad_norm": 9.987932639948705, + "learning_rate": 4.984446214770076e-05, + "loss": 2.5851, + "mean_token_accuracy": 0.417241370677948, + "step": 84965 + }, + { + "epoch": 0.0855826864523121, + "grad_norm": 9.327547024341568, + "learning_rate": 4.984441816113461e-05, + "loss": 2.3614, + "mean_token_accuracy": 0.41724138259887694, + "step": 84970 + }, + { + "epoch": 0.08558772250541627, + "grad_norm": 10.509345441916844, + "learning_rate": 4.984437416837115e-05, + "loss": 2.5987, + "mean_token_accuracy": 0.3896551728248596, + "step": 84975 + }, + { + "epoch": 0.08559275855852044, + "grad_norm": 22.691751866348152, + "learning_rate": 4.98443301694104e-05, + "loss": 2.9315, + "mean_token_accuracy": 0.37586206793785093, + "step": 84980 + }, + { + "epoch": 0.08559779461162462, + "grad_norm": 12.726756382048482, + "learning_rate": 4.9844286164252355e-05, + "loss": 2.6035, + "mean_token_accuracy": 0.3965517282485962, + "step": 84985 + }, + { + "epoch": 0.08560283066472879, + "grad_norm": 12.428323106298778, + "learning_rate": 4.984424215289705e-05, + "loss": 2.4165, + "mean_token_accuracy": 0.39655172228813174, + "step": 84990 + }, + { + "epoch": 0.08560786671783296, + "grad_norm": 10.665648957937009, + "learning_rate": 4.984419813534448e-05, + "loss": 2.7658, + "mean_token_accuracy": 0.37586206793785093, + "step": 84995 + }, + { + "epoch": 0.08561290277093714, + "grad_norm": 10.993976142502344, + "learning_rate": 4.984415411159467e-05, + "loss": 2.266, + "mean_token_accuracy": 0.4344827592372894, + "step": 85000 + }, + { + "epoch": 0.08561793882404131, + "grad_norm": 10.852785505734532, + "learning_rate": 4.984411008164761e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.41724138259887694, + "step": 85005 + }, + { + "epoch": 0.08562297487714549, + "grad_norm": 16.2854306918082, + "learning_rate": 4.984406604550335e-05, + "loss": 2.7387, + "mean_token_accuracy": 0.3965517163276672, + "step": 85010 + }, + { + "epoch": 0.08562801093024966, + "grad_norm": 10.045389519119256, + "learning_rate": 4.9844022003161865e-05, + "loss": 3.1191, + "mean_token_accuracy": 0.34137930870056155, + "step": 85015 + }, + { + "epoch": 0.08563304698335383, + "grad_norm": 11.963637450806097, + "learning_rate": 4.9843977954623184e-05, + "loss": 1.7848, + "mean_token_accuracy": 0.4896551787853241, + "step": 85020 + }, + { + "epoch": 0.08563808303645801, + "grad_norm": 11.725593220763537, + "learning_rate": 4.984393389988732e-05, + "loss": 3.1554, + "mean_token_accuracy": 0.2999999910593033, + "step": 85025 + }, + { + "epoch": 0.08564311908956218, + "grad_norm": 10.814965414121675, + "learning_rate": 4.984388983895428e-05, + "loss": 2.3534, + "mean_token_accuracy": 0.4344827592372894, + "step": 85030 + }, + { + "epoch": 0.08564815514266635, + "grad_norm": 14.875319527171524, + "learning_rate": 4.984384577182408e-05, + "loss": 2.536, + "mean_token_accuracy": 0.4344827592372894, + "step": 85035 + }, + { + "epoch": 0.08565319119577051, + "grad_norm": 13.70196564985981, + "learning_rate": 4.9843801698496736e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.41034482717514037, + "step": 85040 + }, + { + "epoch": 0.08565822724887469, + "grad_norm": 10.37800883297888, + "learning_rate": 4.9843757618972255e-05, + "loss": 2.4787, + "mean_token_accuracy": 0.4068965494632721, + "step": 85045 + }, + { + "epoch": 0.08566326330197886, + "grad_norm": 9.468077476166412, + "learning_rate": 4.9843713533250636e-05, + "loss": 2.1058, + "mean_token_accuracy": 0.46406533718109133, + "step": 85050 + }, + { + "epoch": 0.08566829935508304, + "grad_norm": 10.636380153051858, + "learning_rate": 4.984366944133192e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.41379310488700866, + "step": 85055 + }, + { + "epoch": 0.08567333540818721, + "grad_norm": 11.265505901346586, + "learning_rate": 4.98436253432161e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.36551723480224607, + "step": 85060 + }, + { + "epoch": 0.08567837146129138, + "grad_norm": 10.095387213446813, + "learning_rate": 4.984358123890319e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.4310344815254211, + "step": 85065 + }, + { + "epoch": 0.08568340751439556, + "grad_norm": 11.512709355570946, + "learning_rate": 4.9843537128393206e-05, + "loss": 2.4333, + "mean_token_accuracy": 0.4551724076271057, + "step": 85070 + }, + { + "epoch": 0.08568844356749973, + "grad_norm": 11.044413263613098, + "learning_rate": 4.9843493011686147e-05, + "loss": 2.5651, + "mean_token_accuracy": 0.4034482777118683, + "step": 85075 + }, + { + "epoch": 0.0856934796206039, + "grad_norm": 10.054607931507087, + "learning_rate": 4.9843448888782045e-05, + "loss": 2.5286, + "mean_token_accuracy": 0.4034482777118683, + "step": 85080 + }, + { + "epoch": 0.08569851567370808, + "grad_norm": 10.322332338126126, + "learning_rate": 4.9843404759680915e-05, + "loss": 2.8437, + "mean_token_accuracy": 0.3620689630508423, + "step": 85085 + }, + { + "epoch": 0.08570355172681225, + "grad_norm": 10.145294778338041, + "learning_rate": 4.984336062438275e-05, + "loss": 2.3634, + "mean_token_accuracy": 0.4601330816745758, + "step": 85090 + }, + { + "epoch": 0.08570858777991643, + "grad_norm": 10.914833485392618, + "learning_rate": 4.984331648288756e-05, + "loss": 2.8339, + "mean_token_accuracy": 0.39655172228813174, + "step": 85095 + }, + { + "epoch": 0.0857136238330206, + "grad_norm": 16.277723299251342, + "learning_rate": 4.984327233519539e-05, + "loss": 2.7048, + "mean_token_accuracy": 0.40344826579093934, + "step": 85100 + }, + { + "epoch": 0.08571865988612477, + "grad_norm": 9.859197820580183, + "learning_rate": 4.984322818130621e-05, + "loss": 2.6799, + "mean_token_accuracy": 0.3896551728248596, + "step": 85105 + }, + { + "epoch": 0.08572369593922893, + "grad_norm": 11.117010409719517, + "learning_rate": 4.984318402122006e-05, + "loss": 2.2761, + "mean_token_accuracy": 0.4517241418361664, + "step": 85110 + }, + { + "epoch": 0.08572873199233311, + "grad_norm": 9.05985384918835, + "learning_rate": 4.984313985493694e-05, + "loss": 2.3284, + "mean_token_accuracy": 0.43103447556495667, + "step": 85115 + }, + { + "epoch": 0.08573376804543728, + "grad_norm": 10.60185069467593, + "learning_rate": 4.984309568245688e-05, + "loss": 2.4633, + "mean_token_accuracy": 0.4399273991584778, + "step": 85120 + }, + { + "epoch": 0.08573880409854145, + "grad_norm": 10.281534014023256, + "learning_rate": 4.984305150377986e-05, + "loss": 2.917, + "mean_token_accuracy": 0.3827586233615875, + "step": 85125 + }, + { + "epoch": 0.08574384015164563, + "grad_norm": 11.942132248683746, + "learning_rate": 4.984300731890593e-05, + "loss": 2.2177, + "mean_token_accuracy": 0.4413793087005615, + "step": 85130 + }, + { + "epoch": 0.0857488762047498, + "grad_norm": 15.441877972568376, + "learning_rate": 4.9842963127835075e-05, + "loss": 2.4032, + "mean_token_accuracy": 0.4, + "step": 85135 + }, + { + "epoch": 0.08575391225785398, + "grad_norm": 9.825619475064997, + "learning_rate": 4.9842918930567316e-05, + "loss": 2.5206, + "mean_token_accuracy": 0.4189957737922668, + "step": 85140 + }, + { + "epoch": 0.08575894831095815, + "grad_norm": 11.588200095330597, + "learning_rate": 4.984287472710267e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.4517241358757019, + "step": 85145 + }, + { + "epoch": 0.08576398436406232, + "grad_norm": 10.736010995061317, + "learning_rate": 4.984283051744114e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.42413792610168455, + "step": 85150 + }, + { + "epoch": 0.0857690204171665, + "grad_norm": 15.14280086212328, + "learning_rate": 4.984278630158273e-05, + "loss": 3.0132, + "mean_token_accuracy": 0.3448275923728943, + "step": 85155 + }, + { + "epoch": 0.08577405647027067, + "grad_norm": 14.294142789283367, + "learning_rate": 4.984274207952748e-05, + "loss": 2.6942, + "mean_token_accuracy": 0.38620689511299133, + "step": 85160 + }, + { + "epoch": 0.08577909252337484, + "grad_norm": 10.396403966585945, + "learning_rate": 4.984269785127539e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.4448275864124298, + "step": 85165 + }, + { + "epoch": 0.08578412857647902, + "grad_norm": 18.176901069389736, + "learning_rate": 4.9842653616826456e-05, + "loss": 3.205, + "mean_token_accuracy": 0.3999999940395355, + "step": 85170 + }, + { + "epoch": 0.08578916462958319, + "grad_norm": 11.11701505904349, + "learning_rate": 4.9842609376180716e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.403448274731636, + "step": 85175 + }, + { + "epoch": 0.08579420068268735, + "grad_norm": 10.079025940852375, + "learning_rate": 4.9842565129338164e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.43793103098869324, + "step": 85180 + }, + { + "epoch": 0.08579923673579153, + "grad_norm": 10.94634268586728, + "learning_rate": 4.984252087629882e-05, + "loss": 2.378, + "mean_token_accuracy": 0.4344827592372894, + "step": 85185 + }, + { + "epoch": 0.0858042727888957, + "grad_norm": 12.858617851379925, + "learning_rate": 4.984247661706269e-05, + "loss": 2.6084, + "mean_token_accuracy": 0.4551724135875702, + "step": 85190 + }, + { + "epoch": 0.08580930884199987, + "grad_norm": 14.985527937415704, + "learning_rate": 4.98424323516298e-05, + "loss": 2.5601, + "mean_token_accuracy": 0.4499697506427765, + "step": 85195 + }, + { + "epoch": 0.08581434489510405, + "grad_norm": 11.18939523392338, + "learning_rate": 4.984238808000015e-05, + "loss": 2.9234, + "mean_token_accuracy": 0.40145190358161925, + "step": 85200 + }, + { + "epoch": 0.08581938094820822, + "grad_norm": 11.111570224074125, + "learning_rate": 4.984234380217375e-05, + "loss": 3.2566, + "mean_token_accuracy": 0.4000000059604645, + "step": 85205 + }, + { + "epoch": 0.0858244170013124, + "grad_norm": 11.4627003574483, + "learning_rate": 4.984229951815062e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.4379310250282288, + "step": 85210 + }, + { + "epoch": 0.08582945305441657, + "grad_norm": 11.13292601703709, + "learning_rate": 4.984225522793077e-05, + "loss": 2.4919, + "mean_token_accuracy": 0.43793103098869324, + "step": 85215 + }, + { + "epoch": 0.08583448910752074, + "grad_norm": 11.206306358297843, + "learning_rate": 4.9842210931514214e-05, + "loss": 2.3423, + "mean_token_accuracy": 0.4310344815254211, + "step": 85220 + }, + { + "epoch": 0.08583952516062492, + "grad_norm": 13.484362749937183, + "learning_rate": 4.9842166628900966e-05, + "loss": 2.6471, + "mean_token_accuracy": 0.3689655214548111, + "step": 85225 + }, + { + "epoch": 0.08584456121372909, + "grad_norm": 11.365625490029073, + "learning_rate": 4.984212232009102e-05, + "loss": 2.328, + "mean_token_accuracy": 0.43103448748588563, + "step": 85230 + }, + { + "epoch": 0.08584959726683326, + "grad_norm": 9.932729322464247, + "learning_rate": 4.984207800508442e-05, + "loss": 2.2987, + "mean_token_accuracy": 0.4206896543502808, + "step": 85235 + }, + { + "epoch": 0.08585463331993744, + "grad_norm": 13.909141499203182, + "learning_rate": 4.984203368388115e-05, + "loss": 2.9652, + "mean_token_accuracy": 0.31724137663841245, + "step": 85240 + }, + { + "epoch": 0.08585966937304161, + "grad_norm": 11.153437789051395, + "learning_rate": 4.984198935648124e-05, + "loss": 2.2758, + "mean_token_accuracy": 0.4344827592372894, + "step": 85245 + }, + { + "epoch": 0.08586470542614577, + "grad_norm": 11.7182021532364, + "learning_rate": 4.9841945022884693e-05, + "loss": 2.4504, + "mean_token_accuracy": 0.4310344815254211, + "step": 85250 + }, + { + "epoch": 0.08586974147924994, + "grad_norm": 10.576830298391938, + "learning_rate": 4.984190068309153e-05, + "loss": 2.5444, + "mean_token_accuracy": 0.4, + "step": 85255 + }, + { + "epoch": 0.08587477753235412, + "grad_norm": 11.739602391987388, + "learning_rate": 4.984185633710175e-05, + "loss": 2.7259, + "mean_token_accuracy": 0.38275861740112305, + "step": 85260 + }, + { + "epoch": 0.08587981358545829, + "grad_norm": 10.869190652738734, + "learning_rate": 4.9841811984915374e-05, + "loss": 2.247, + "mean_token_accuracy": 0.4310344815254211, + "step": 85265 + }, + { + "epoch": 0.08588484963856247, + "grad_norm": 10.187997515510254, + "learning_rate": 4.984176762653241e-05, + "loss": 2.2519, + "mean_token_accuracy": 0.44827585816383364, + "step": 85270 + }, + { + "epoch": 0.08588988569166664, + "grad_norm": 10.63250994867134, + "learning_rate": 4.9841723261952885e-05, + "loss": 2.3426, + "mean_token_accuracy": 0.42413793206214906, + "step": 85275 + }, + { + "epoch": 0.08589492174477081, + "grad_norm": 9.660467397302225, + "learning_rate": 4.984167889117679e-05, + "loss": 2.4858, + "mean_token_accuracy": 0.41379310488700866, + "step": 85280 + }, + { + "epoch": 0.08589995779787499, + "grad_norm": 13.710668627505783, + "learning_rate": 4.984163451420415e-05, + "loss": 2.7851, + "mean_token_accuracy": 0.3482758581638336, + "step": 85285 + }, + { + "epoch": 0.08590499385097916, + "grad_norm": 8.799701314304338, + "learning_rate": 4.9841590131034974e-05, + "loss": 2.1146, + "mean_token_accuracy": 0.4551724076271057, + "step": 85290 + }, + { + "epoch": 0.08591002990408333, + "grad_norm": 9.630625662645343, + "learning_rate": 4.9841545741669276e-05, + "loss": 2.3573, + "mean_token_accuracy": 0.4172413766384125, + "step": 85295 + }, + { + "epoch": 0.08591506595718751, + "grad_norm": 10.001590124425215, + "learning_rate": 4.9841501346107064e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.4310344815254211, + "step": 85300 + }, + { + "epoch": 0.08592010201029168, + "grad_norm": 13.609977664991971, + "learning_rate": 4.984145694434835e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.3965517163276672, + "step": 85305 + }, + { + "epoch": 0.08592513806339586, + "grad_norm": 12.239505555693631, + "learning_rate": 4.9841412536393155e-05, + "loss": 2.594, + "mean_token_accuracy": 0.37241379022598264, + "step": 85310 + }, + { + "epoch": 0.08593017411650003, + "grad_norm": 16.714787973824745, + "learning_rate": 4.984136812224149e-05, + "loss": 2.8628, + "mean_token_accuracy": 0.3241379290819168, + "step": 85315 + }, + { + "epoch": 0.08593521016960419, + "grad_norm": 11.161491601939215, + "learning_rate": 4.984132370189335e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.3896551728248596, + "step": 85320 + }, + { + "epoch": 0.08594024622270836, + "grad_norm": 10.39513729267292, + "learning_rate": 4.984127927534878e-05, + "loss": 2.8, + "mean_token_accuracy": 0.3620689630508423, + "step": 85325 + }, + { + "epoch": 0.08594528227581254, + "grad_norm": 10.97565235178341, + "learning_rate": 4.9841234842607755e-05, + "loss": 2.4869, + "mean_token_accuracy": 0.4517241358757019, + "step": 85330 + }, + { + "epoch": 0.08595031832891671, + "grad_norm": 13.094679448185023, + "learning_rate": 4.9841190403670315e-05, + "loss": 2.3968, + "mean_token_accuracy": 0.43617664873600004, + "step": 85335 + }, + { + "epoch": 0.08595535438202088, + "grad_norm": 9.268381841700993, + "learning_rate": 4.9841145958536455e-05, + "loss": 2.4659, + "mean_token_accuracy": 0.37586206793785093, + "step": 85340 + }, + { + "epoch": 0.08596039043512506, + "grad_norm": 10.754550261288411, + "learning_rate": 4.98411015072062e-05, + "loss": 3.1551, + "mean_token_accuracy": 0.3068965494632721, + "step": 85345 + }, + { + "epoch": 0.08596542648822923, + "grad_norm": 12.350153453574032, + "learning_rate": 4.984105704967956e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.441379314661026, + "step": 85350 + }, + { + "epoch": 0.0859704625413334, + "grad_norm": 12.896445346898673, + "learning_rate": 4.9841012585956534e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.3517241418361664, + "step": 85355 + }, + { + "epoch": 0.08597549859443758, + "grad_norm": 11.133722761190292, + "learning_rate": 4.9840968116037156e-05, + "loss": 2.5355, + "mean_token_accuracy": 0.4068965494632721, + "step": 85360 + }, + { + "epoch": 0.08598053464754175, + "grad_norm": 10.917072446468302, + "learning_rate": 4.984092363992142e-05, + "loss": 2.4788, + "mean_token_accuracy": 0.3862068891525269, + "step": 85365 + }, + { + "epoch": 0.08598557070064593, + "grad_norm": 9.424491955677622, + "learning_rate": 4.984087915760935e-05, + "loss": 2.1365, + "mean_token_accuracy": 0.46551724672317507, + "step": 85370 + }, + { + "epoch": 0.0859906067537501, + "grad_norm": 12.778505686665666, + "learning_rate": 4.9840834669100956e-05, + "loss": 2.9923, + "mean_token_accuracy": 0.3896551728248596, + "step": 85375 + }, + { + "epoch": 0.08599564280685427, + "grad_norm": 9.726605335924999, + "learning_rate": 4.984079017439624e-05, + "loss": 2.3379, + "mean_token_accuracy": 0.41548699140548706, + "step": 85380 + }, + { + "epoch": 0.08600067885995845, + "grad_norm": 10.513691817665345, + "learning_rate": 4.9840745673495224e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.4379310429096222, + "step": 85385 + }, + { + "epoch": 0.08600571491306261, + "grad_norm": 11.08152069332261, + "learning_rate": 4.984070116639792e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.43793103098869324, + "step": 85390 + }, + { + "epoch": 0.08601075096616678, + "grad_norm": 9.407697331771768, + "learning_rate": 4.9840656653104346e-05, + "loss": 2.4266, + "mean_token_accuracy": 0.4379310369491577, + "step": 85395 + }, + { + "epoch": 0.08601578701927096, + "grad_norm": 11.360659170297724, + "learning_rate": 4.98406121336145e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.4448275864124298, + "step": 85400 + }, + { + "epoch": 0.08602082307237513, + "grad_norm": 11.062678540631175, + "learning_rate": 4.984056760792841e-05, + "loss": 2.7794, + "mean_token_accuracy": 0.3758620619773865, + "step": 85405 + }, + { + "epoch": 0.0860258591254793, + "grad_norm": 15.597091022284994, + "learning_rate": 4.984052307604607e-05, + "loss": 2.0737, + "mean_token_accuracy": 0.4815486967563629, + "step": 85410 + }, + { + "epoch": 0.08603089517858348, + "grad_norm": 14.38673739807895, + "learning_rate": 4.98404785379675e-05, + "loss": 2.8609, + "mean_token_accuracy": 0.3517241358757019, + "step": 85415 + }, + { + "epoch": 0.08603593123168765, + "grad_norm": 9.681448442049616, + "learning_rate": 4.9840433993692726e-05, + "loss": 2.4233, + "mean_token_accuracy": 0.4365396201610565, + "step": 85420 + }, + { + "epoch": 0.08604096728479182, + "grad_norm": 13.878501551785694, + "learning_rate": 4.984038944322175e-05, + "loss": 2.3872, + "mean_token_accuracy": 0.422202056646347, + "step": 85425 + }, + { + "epoch": 0.086046003337896, + "grad_norm": 11.807575497010157, + "learning_rate": 4.984034488655458e-05, + "loss": 2.7204, + "mean_token_accuracy": 0.3689655065536499, + "step": 85430 + }, + { + "epoch": 0.08605103939100017, + "grad_norm": 12.546339320064558, + "learning_rate": 4.984030032369123e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.4379310429096222, + "step": 85435 + }, + { + "epoch": 0.08605607544410435, + "grad_norm": 11.113824852858606, + "learning_rate": 4.984025575463172e-05, + "loss": 2.7767, + "mean_token_accuracy": 0.39655172228813174, + "step": 85440 + }, + { + "epoch": 0.08606111149720852, + "grad_norm": 10.351716956585157, + "learning_rate": 4.984021117937605e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.41379310488700866, + "step": 85445 + }, + { + "epoch": 0.0860661475503127, + "grad_norm": 14.0565772878961, + "learning_rate": 4.9840166597924235e-05, + "loss": 2.5841, + "mean_token_accuracy": 0.4103448331356049, + "step": 85450 + }, + { + "epoch": 0.08607118360341687, + "grad_norm": 11.39834192620065, + "learning_rate": 4.98401220102763e-05, + "loss": 2.5898, + "mean_token_accuracy": 0.3551724135875702, + "step": 85455 + }, + { + "epoch": 0.08607621965652103, + "grad_norm": 11.159408865013248, + "learning_rate": 4.9840077416432244e-05, + "loss": 2.5626, + "mean_token_accuracy": 0.4379310369491577, + "step": 85460 + }, + { + "epoch": 0.0860812557096252, + "grad_norm": 11.162242629254587, + "learning_rate": 4.9840032816392084e-05, + "loss": 1.9189, + "mean_token_accuracy": 0.4724137902259827, + "step": 85465 + }, + { + "epoch": 0.08608629176272937, + "grad_norm": 12.65093655706823, + "learning_rate": 4.9839988210155845e-05, + "loss": 2.9522, + "mean_token_accuracy": 0.3379310339689255, + "step": 85470 + }, + { + "epoch": 0.08609132781583355, + "grad_norm": 12.16896884426746, + "learning_rate": 4.983994359772351e-05, + "loss": 2.4605, + "mean_token_accuracy": 0.4000000059604645, + "step": 85475 + }, + { + "epoch": 0.08609636386893772, + "grad_norm": 11.75199572193061, + "learning_rate": 4.983989897909512e-05, + "loss": 2.4777, + "mean_token_accuracy": 0.4448275864124298, + "step": 85480 + }, + { + "epoch": 0.0861013999220419, + "grad_norm": 14.382335485108813, + "learning_rate": 4.983985435427067e-05, + "loss": 2.7906, + "mean_token_accuracy": 0.39655172228813174, + "step": 85485 + }, + { + "epoch": 0.08610643597514607, + "grad_norm": 9.945878369688634, + "learning_rate": 4.9839809723250176e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4275861978530884, + "step": 85490 + }, + { + "epoch": 0.08611147202825024, + "grad_norm": 13.850151860829053, + "learning_rate": 4.983976508603365e-05, + "loss": 2.623, + "mean_token_accuracy": 0.39655172228813174, + "step": 85495 + }, + { + "epoch": 0.08611650808135442, + "grad_norm": 11.612715385123675, + "learning_rate": 4.983972044262112e-05, + "loss": 2.3115, + "mean_token_accuracy": 0.47447065711021424, + "step": 85500 + }, + { + "epoch": 0.08612154413445859, + "grad_norm": 10.950592942516883, + "learning_rate": 4.9839675793012576e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.4122807025909424, + "step": 85505 + }, + { + "epoch": 0.08612658018756277, + "grad_norm": 9.935897196464015, + "learning_rate": 4.983963113720804e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.4482758641242981, + "step": 85510 + }, + { + "epoch": 0.08613161624066694, + "grad_norm": 10.300166444493629, + "learning_rate": 4.9839586475207525e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.4, + "step": 85515 + }, + { + "epoch": 0.08613665229377111, + "grad_norm": 19.439069266082544, + "learning_rate": 4.983954180701104e-05, + "loss": 2.9965, + "mean_token_accuracy": 0.37931033968925476, + "step": 85520 + }, + { + "epoch": 0.08614168834687529, + "grad_norm": 11.741771304225871, + "learning_rate": 4.983949713261861e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.4034482717514038, + "step": 85525 + }, + { + "epoch": 0.08614672439997945, + "grad_norm": 11.682057675342719, + "learning_rate": 4.983945245203023e-05, + "loss": 2.8479, + "mean_token_accuracy": 0.3999999940395355, + "step": 85530 + }, + { + "epoch": 0.08615176045308362, + "grad_norm": 9.507023362124603, + "learning_rate": 4.9839407765245914e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.42413793206214906, + "step": 85535 + }, + { + "epoch": 0.0861567965061878, + "grad_norm": 23.008108641110102, + "learning_rate": 4.983936307226568e-05, + "loss": 3.7925, + "mean_token_accuracy": 0.31724137514829637, + "step": 85540 + }, + { + "epoch": 0.08616183255929197, + "grad_norm": 9.494186571883864, + "learning_rate": 4.983931837308955e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.4551724135875702, + "step": 85545 + }, + { + "epoch": 0.08616686861239614, + "grad_norm": 10.707906647919105, + "learning_rate": 4.9839273667717524e-05, + "loss": 2.8206, + "mean_token_accuracy": 0.3896551787853241, + "step": 85550 + }, + { + "epoch": 0.08617190466550032, + "grad_norm": 12.001503670005464, + "learning_rate": 4.983922895614961e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.36551723480224607, + "step": 85555 + }, + { + "epoch": 0.08617694071860449, + "grad_norm": 10.22107580929728, + "learning_rate": 4.9839184238385834e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.41724138259887694, + "step": 85560 + }, + { + "epoch": 0.08618197677170866, + "grad_norm": 9.56316902610241, + "learning_rate": 4.98391395144262e-05, + "loss": 2.4329, + "mean_token_accuracy": 0.3931034505367279, + "step": 85565 + }, + { + "epoch": 0.08618701282481284, + "grad_norm": 11.36748756237982, + "learning_rate": 4.9839094784270725e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.43448275327682495, + "step": 85570 + }, + { + "epoch": 0.08619204887791701, + "grad_norm": 12.713213482473531, + "learning_rate": 4.983905004791942e-05, + "loss": 2.8481, + "mean_token_accuracy": 0.3551724016666412, + "step": 85575 + }, + { + "epoch": 0.08619708493102118, + "grad_norm": 10.562700749580577, + "learning_rate": 4.9839005305372286e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.44827585816383364, + "step": 85580 + }, + { + "epoch": 0.08620212098412536, + "grad_norm": 11.202289890344549, + "learning_rate": 4.983896055662935e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.4344827651977539, + "step": 85585 + }, + { + "epoch": 0.08620715703722953, + "grad_norm": 11.118000911378077, + "learning_rate": 4.983891580169062e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.44482759237289426, + "step": 85590 + }, + { + "epoch": 0.0862121930903337, + "grad_norm": 11.95394681498992, + "learning_rate": 4.983887104055611e-05, + "loss": 2.8942, + "mean_token_accuracy": 0.34482758641242983, + "step": 85595 + }, + { + "epoch": 0.08621722914343787, + "grad_norm": 10.893648767087248, + "learning_rate": 4.983882627322584e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.37241379022598264, + "step": 85600 + }, + { + "epoch": 0.08622226519654204, + "grad_norm": 9.821974307133512, + "learning_rate": 4.9838781499699805e-05, + "loss": 2.2899, + "mean_token_accuracy": 0.44827585816383364, + "step": 85605 + }, + { + "epoch": 0.08622730124964621, + "grad_norm": 11.041534966239618, + "learning_rate": 4.9838736719978025e-05, + "loss": 2.9946, + "mean_token_accuracy": 0.3637023627758026, + "step": 85610 + }, + { + "epoch": 0.08623233730275039, + "grad_norm": 10.722951042243706, + "learning_rate": 4.983869193406051e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.4189957737922668, + "step": 85615 + }, + { + "epoch": 0.08623737335585456, + "grad_norm": 10.83658484455416, + "learning_rate": 4.983864714194728e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.4532365381717682, + "step": 85620 + }, + { + "epoch": 0.08624240940895873, + "grad_norm": 11.518934069709971, + "learning_rate": 4.983860234363833e-05, + "loss": 2.6994, + "mean_token_accuracy": 0.39310345649719236, + "step": 85625 + }, + { + "epoch": 0.08624744546206291, + "grad_norm": 11.98270298914532, + "learning_rate": 4.98385575391337e-05, + "loss": 2.6463, + "mean_token_accuracy": 0.41724138259887694, + "step": 85630 + }, + { + "epoch": 0.08625248151516708, + "grad_norm": 11.742933095477468, + "learning_rate": 4.9838512728433386e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.3517241358757019, + "step": 85635 + }, + { + "epoch": 0.08625751756827126, + "grad_norm": 10.641977258120052, + "learning_rate": 4.9838467911537403e-05, + "loss": 2.3493, + "mean_token_accuracy": 0.41724138259887694, + "step": 85640 + }, + { + "epoch": 0.08626255362137543, + "grad_norm": 10.493203291432566, + "learning_rate": 4.9838423088445754e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.3931034505367279, + "step": 85645 + }, + { + "epoch": 0.0862675896744796, + "grad_norm": 14.192394149965299, + "learning_rate": 4.983837825915847e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.36896551251411436, + "step": 85650 + }, + { + "epoch": 0.08627262572758378, + "grad_norm": 12.54586541101411, + "learning_rate": 4.983833342367555e-05, + "loss": 2.8413, + "mean_token_accuracy": 0.39310344457626345, + "step": 85655 + }, + { + "epoch": 0.08627766178068795, + "grad_norm": 10.185967309541747, + "learning_rate": 4.983828858199701e-05, + "loss": 2.6258, + "mean_token_accuracy": 0.3551724076271057, + "step": 85660 + }, + { + "epoch": 0.08628269783379212, + "grad_norm": 12.258931326834743, + "learning_rate": 4.9838243734122863e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.4000000059604645, + "step": 85665 + }, + { + "epoch": 0.08628773388689628, + "grad_norm": 10.17082181339242, + "learning_rate": 4.983819888005312e-05, + "loss": 1.9994, + "mean_token_accuracy": 0.4517241418361664, + "step": 85670 + }, + { + "epoch": 0.08629276994000046, + "grad_norm": 10.99133632123915, + "learning_rate": 4.9838154019787795e-05, + "loss": 2.5435, + "mean_token_accuracy": 0.4034482777118683, + "step": 85675 + }, + { + "epoch": 0.08629780599310463, + "grad_norm": 10.781972261556966, + "learning_rate": 4.983810915332689e-05, + "loss": 2.1627, + "mean_token_accuracy": 0.46551724076271056, + "step": 85680 + }, + { + "epoch": 0.0863028420462088, + "grad_norm": 10.973681971932715, + "learning_rate": 4.983806428067045e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.4241379201412201, + "step": 85685 + }, + { + "epoch": 0.08630787809931298, + "grad_norm": 10.35220906443794, + "learning_rate": 4.983801940181844e-05, + "loss": 2.7237, + "mean_token_accuracy": 0.3793103456497192, + "step": 85690 + }, + { + "epoch": 0.08631291415241715, + "grad_norm": 14.417212321958647, + "learning_rate": 4.983797451677091e-05, + "loss": 2.81, + "mean_token_accuracy": 0.4275861978530884, + "step": 85695 + }, + { + "epoch": 0.08631795020552133, + "grad_norm": 11.864233491738585, + "learning_rate": 4.983792962552786e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.42909860610961914, + "step": 85700 + }, + { + "epoch": 0.0863229862586255, + "grad_norm": 10.344941526930626, + "learning_rate": 4.9837884728089297e-05, + "loss": 2.2091, + "mean_token_accuracy": 0.4862068951129913, + "step": 85705 + }, + { + "epoch": 0.08632802231172967, + "grad_norm": 14.738853829836518, + "learning_rate": 4.9837839824455236e-05, + "loss": 2.2436, + "mean_token_accuracy": 0.4931034564971924, + "step": 85710 + }, + { + "epoch": 0.08633305836483385, + "grad_norm": 10.69648651783352, + "learning_rate": 4.98377949146257e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.39310343861579894, + "step": 85715 + }, + { + "epoch": 0.08633809441793802, + "grad_norm": 14.469735797404404, + "learning_rate": 4.9837749998600684e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.3862069010734558, + "step": 85720 + }, + { + "epoch": 0.0863431304710422, + "grad_norm": 10.548277186525102, + "learning_rate": 4.983770507638021e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.3862069010734558, + "step": 85725 + }, + { + "epoch": 0.08634816652414637, + "grad_norm": 10.644048411011205, + "learning_rate": 4.98376601479643e-05, + "loss": 2.1422, + "mean_token_accuracy": 0.5103448271751404, + "step": 85730 + }, + { + "epoch": 0.08635320257725054, + "grad_norm": 8.763835568818326, + "learning_rate": 4.9837615213352956e-05, + "loss": 2.1488, + "mean_token_accuracy": 0.4551724135875702, + "step": 85735 + }, + { + "epoch": 0.0863582386303547, + "grad_norm": 16.035138850691173, + "learning_rate": 4.983757027254618e-05, + "loss": 2.437, + "mean_token_accuracy": 0.4103448331356049, + "step": 85740 + }, + { + "epoch": 0.08636327468345888, + "grad_norm": 10.99502836984945, + "learning_rate": 4.9837525325544e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.3827586233615875, + "step": 85745 + }, + { + "epoch": 0.08636831073656305, + "grad_norm": 11.144157045446766, + "learning_rate": 4.983748037234643e-05, + "loss": 2.459, + "mean_token_accuracy": 0.3862069010734558, + "step": 85750 + }, + { + "epoch": 0.08637334678966722, + "grad_norm": 10.839074288340393, + "learning_rate": 4.983743541295347e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.42413793206214906, + "step": 85755 + }, + { + "epoch": 0.0863783828427714, + "grad_norm": 18.0758760556222, + "learning_rate": 4.9837390447365145e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.37931033968925476, + "step": 85760 + }, + { + "epoch": 0.08638341889587557, + "grad_norm": 10.514795273276423, + "learning_rate": 4.9837345475581456e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.42758620977401735, + "step": 85765 + }, + { + "epoch": 0.08638845494897975, + "grad_norm": 12.49302638113191, + "learning_rate": 4.9837300497602425e-05, + "loss": 2.7628, + "mean_token_accuracy": 0.36896551549434664, + "step": 85770 + }, + { + "epoch": 0.08639349100208392, + "grad_norm": 10.324372238504612, + "learning_rate": 4.9837255513428054e-05, + "loss": 2.8566, + "mean_token_accuracy": 0.3896551728248596, + "step": 85775 + }, + { + "epoch": 0.08639852705518809, + "grad_norm": 9.816893236187164, + "learning_rate": 4.983721052305837e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.4551724076271057, + "step": 85780 + }, + { + "epoch": 0.08640356310829227, + "grad_norm": 9.302876224937018, + "learning_rate": 4.983716552649337e-05, + "loss": 2.337, + "mean_token_accuracy": 0.4534785270690918, + "step": 85785 + }, + { + "epoch": 0.08640859916139644, + "grad_norm": 16.174493748462172, + "learning_rate": 4.9837120523733074e-05, + "loss": 2.704, + "mean_token_accuracy": 0.39618874788284303, + "step": 85790 + }, + { + "epoch": 0.08641363521450061, + "grad_norm": 10.08429382932957, + "learning_rate": 4.983707551477751e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.42758620977401735, + "step": 85795 + }, + { + "epoch": 0.08641867126760479, + "grad_norm": 11.289616944867047, + "learning_rate": 4.983703049962665e-05, + "loss": 2.4404, + "mean_token_accuracy": 0.4034482717514038, + "step": 85800 + }, + { + "epoch": 0.08642370732070896, + "grad_norm": 11.926243148678576, + "learning_rate": 4.9836985478280545e-05, + "loss": 2.4055, + "mean_token_accuracy": 0.4724137902259827, + "step": 85805 + }, + { + "epoch": 0.08642874337381312, + "grad_norm": 10.053274195140387, + "learning_rate": 4.983694045073919e-05, + "loss": 2.8584, + "mean_token_accuracy": 0.38275861740112305, + "step": 85810 + }, + { + "epoch": 0.0864337794269173, + "grad_norm": 10.356995719112014, + "learning_rate": 4.9836895417002604e-05, + "loss": 2.4488, + "mean_token_accuracy": 0.42758620381355283, + "step": 85815 + }, + { + "epoch": 0.08643881548002147, + "grad_norm": 11.475080760975624, + "learning_rate": 4.98368503770708e-05, + "loss": 2.4335, + "mean_token_accuracy": 0.403448274731636, + "step": 85820 + }, + { + "epoch": 0.08644385153312564, + "grad_norm": 14.292287719772636, + "learning_rate": 4.983680533094378e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.4206896543502808, + "step": 85825 + }, + { + "epoch": 0.08644888758622982, + "grad_norm": 10.267525702771389, + "learning_rate": 4.983676027862156e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.44827585220336913, + "step": 85830 + }, + { + "epoch": 0.08645392363933399, + "grad_norm": 14.629452839284372, + "learning_rate": 4.983671522010417e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.4241379380226135, + "step": 85835 + }, + { + "epoch": 0.08645895969243816, + "grad_norm": 10.576145367635034, + "learning_rate": 4.9836670155391604e-05, + "loss": 2.4545, + "mean_token_accuracy": 0.3896551728248596, + "step": 85840 + }, + { + "epoch": 0.08646399574554234, + "grad_norm": 12.40155451548599, + "learning_rate": 4.983662508448388e-05, + "loss": 2.541, + "mean_token_accuracy": 0.358620685338974, + "step": 85845 + }, + { + "epoch": 0.08646903179864651, + "grad_norm": 11.685705279560947, + "learning_rate": 4.9836580007381006e-05, + "loss": 2.6927, + "mean_token_accuracy": 0.4310344815254211, + "step": 85850 + }, + { + "epoch": 0.08647406785175069, + "grad_norm": 9.25899390114931, + "learning_rate": 4.9836534924083e-05, + "loss": 2.3112, + "mean_token_accuracy": 0.43103448748588563, + "step": 85855 + }, + { + "epoch": 0.08647910390485486, + "grad_norm": 10.821564865270874, + "learning_rate": 4.9836489834589863e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.4137930989265442, + "step": 85860 + }, + { + "epoch": 0.08648413995795903, + "grad_norm": 11.850768211085699, + "learning_rate": 4.983644473890163e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.40689654350280763, + "step": 85865 + }, + { + "epoch": 0.0864891760110632, + "grad_norm": 11.385951252408212, + "learning_rate": 4.98363996370183e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.43448275327682495, + "step": 85870 + }, + { + "epoch": 0.08649421206416738, + "grad_norm": 15.483238949539185, + "learning_rate": 4.9836354528939885e-05, + "loss": 2.6649, + "mean_token_accuracy": 0.37586207389831544, + "step": 85875 + }, + { + "epoch": 0.08649924811727154, + "grad_norm": 13.627558359624706, + "learning_rate": 4.9836309414666396e-05, + "loss": 2.3306, + "mean_token_accuracy": 0.45862067937850953, + "step": 85880 + }, + { + "epoch": 0.08650428417037571, + "grad_norm": 9.667361053916233, + "learning_rate": 4.9836264294197844e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.43103448748588563, + "step": 85885 + }, + { + "epoch": 0.08650932022347989, + "grad_norm": 11.93158573649606, + "learning_rate": 4.9836219167534255e-05, + "loss": 2.4194, + "mean_token_accuracy": 0.4137930989265442, + "step": 85890 + }, + { + "epoch": 0.08651435627658406, + "grad_norm": 13.411382955143935, + "learning_rate": 4.983617403467562e-05, + "loss": 2.4228, + "mean_token_accuracy": 0.4620689570903778, + "step": 85895 + }, + { + "epoch": 0.08651939232968824, + "grad_norm": 12.003036849684241, + "learning_rate": 4.9836128895621975e-05, + "loss": 2.8335, + "mean_token_accuracy": 0.38275861740112305, + "step": 85900 + }, + { + "epoch": 0.08652442838279241, + "grad_norm": 11.77528367765045, + "learning_rate": 4.9836083750373324e-05, + "loss": 2.8777, + "mean_token_accuracy": 0.34827586710453035, + "step": 85905 + }, + { + "epoch": 0.08652946443589658, + "grad_norm": 11.427256847570119, + "learning_rate": 4.983603859892967e-05, + "loss": 2.6068, + "mean_token_accuracy": 0.3655172407627106, + "step": 85910 + }, + { + "epoch": 0.08653450048900076, + "grad_norm": 10.672561919520579, + "learning_rate": 4.983599344129103e-05, + "loss": 2.3823, + "mean_token_accuracy": 0.441379314661026, + "step": 85915 + }, + { + "epoch": 0.08653953654210493, + "grad_norm": 8.087882048585717, + "learning_rate": 4.9835948277457425e-05, + "loss": 2.1931, + "mean_token_accuracy": 0.48965516686439514, + "step": 85920 + }, + { + "epoch": 0.0865445725952091, + "grad_norm": 9.61838684599959, + "learning_rate": 4.983590310742886e-05, + "loss": 2.3961, + "mean_token_accuracy": 0.4551724076271057, + "step": 85925 + }, + { + "epoch": 0.08654960864831328, + "grad_norm": 10.421914128119445, + "learning_rate": 4.9835857931205346e-05, + "loss": 2.3432, + "mean_token_accuracy": 0.41203871965408323, + "step": 85930 + }, + { + "epoch": 0.08655464470141745, + "grad_norm": 11.950647950209852, + "learning_rate": 4.9835812748786905e-05, + "loss": 2.6569, + "mean_token_accuracy": 0.4103448331356049, + "step": 85935 + }, + { + "epoch": 0.08655968075452163, + "grad_norm": 12.671659578020153, + "learning_rate": 4.983576756017353e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.4034482777118683, + "step": 85940 + }, + { + "epoch": 0.0865647168076258, + "grad_norm": 9.61105772479072, + "learning_rate": 4.983572236536526e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.3896551787853241, + "step": 85945 + }, + { + "epoch": 0.08656975286072996, + "grad_norm": 9.555263617199538, + "learning_rate": 4.983567716436208e-05, + "loss": 2.3492, + "mean_token_accuracy": 0.4413793087005615, + "step": 85950 + }, + { + "epoch": 0.08657478891383413, + "grad_norm": 18.792391372218272, + "learning_rate": 4.983563195716403e-05, + "loss": 2.8506, + "mean_token_accuracy": 0.4068965554237366, + "step": 85955 + }, + { + "epoch": 0.0865798249669383, + "grad_norm": 11.848153592222557, + "learning_rate": 4.983558674377111e-05, + "loss": 2.4634, + "mean_token_accuracy": 0.4034482717514038, + "step": 85960 + }, + { + "epoch": 0.08658486102004248, + "grad_norm": 12.3563194987663, + "learning_rate": 4.9835541524183324e-05, + "loss": 2.6842, + "mean_token_accuracy": 0.4068965554237366, + "step": 85965 + }, + { + "epoch": 0.08658989707314665, + "grad_norm": 11.69272996333826, + "learning_rate": 4.983549629840069e-05, + "loss": 2.2356, + "mean_token_accuracy": 0.44482758045196535, + "step": 85970 + }, + { + "epoch": 0.08659493312625083, + "grad_norm": 10.874992142249097, + "learning_rate": 4.9835451066423226e-05, + "loss": 2.0792, + "mean_token_accuracy": 0.4655172526836395, + "step": 85975 + }, + { + "epoch": 0.086599969179355, + "grad_norm": 20.365826147334218, + "learning_rate": 4.983540582825095e-05, + "loss": 2.7875, + "mean_token_accuracy": 0.39310345351696013, + "step": 85980 + }, + { + "epoch": 0.08660500523245918, + "grad_norm": 9.49120931981935, + "learning_rate": 4.983536058388385e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.4379310250282288, + "step": 85985 + }, + { + "epoch": 0.08661004128556335, + "grad_norm": 9.965378216462042, + "learning_rate": 4.983531533332196e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.4259528160095215, + "step": 85990 + }, + { + "epoch": 0.08661507733866752, + "grad_norm": 12.958903556615356, + "learning_rate": 4.98352700765653e-05, + "loss": 3.0026, + "mean_token_accuracy": 0.3103448271751404, + "step": 85995 + }, + { + "epoch": 0.0866201133917717, + "grad_norm": 11.869039519069343, + "learning_rate": 4.983522481361386e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.38620689511299133, + "step": 86000 + }, + { + "epoch": 0.08662514944487587, + "grad_norm": 12.047713454749866, + "learning_rate": 4.983517954446766e-05, + "loss": 2.6233, + "mean_token_accuracy": 0.3793103456497192, + "step": 86005 + }, + { + "epoch": 0.08663018549798004, + "grad_norm": 10.008161366499168, + "learning_rate": 4.983513426912671e-05, + "loss": 2.2921, + "mean_token_accuracy": 0.44137930274009707, + "step": 86010 + }, + { + "epoch": 0.08663522155108422, + "grad_norm": 9.912010630812357, + "learning_rate": 4.983508898759104e-05, + "loss": 2.6927, + "mean_token_accuracy": 0.4310344815254211, + "step": 86015 + }, + { + "epoch": 0.08664025760418838, + "grad_norm": 12.046994306364532, + "learning_rate": 4.983504369986064e-05, + "loss": 2.9386, + "mean_token_accuracy": 0.38275861740112305, + "step": 86020 + }, + { + "epoch": 0.08664529365729255, + "grad_norm": 11.289340462478492, + "learning_rate": 4.983499840593553e-05, + "loss": 2.5038, + "mean_token_accuracy": 0.4448275864124298, + "step": 86025 + }, + { + "epoch": 0.08665032971039673, + "grad_norm": 9.533670788355828, + "learning_rate": 4.983495310581574e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.34482758641242983, + "step": 86030 + }, + { + "epoch": 0.0866553657635009, + "grad_norm": 11.75650870858756, + "learning_rate": 4.983490779950126e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4816696882247925, + "step": 86035 + }, + { + "epoch": 0.08666040181660507, + "grad_norm": 9.657465516571568, + "learning_rate": 4.98348624869921e-05, + "loss": 2.6209, + "mean_token_accuracy": 0.4206896543502808, + "step": 86040 + }, + { + "epoch": 0.08666543786970925, + "grad_norm": 13.663554580295168, + "learning_rate": 4.983481716828829e-05, + "loss": 3.2414, + "mean_token_accuracy": 0.31724137663841245, + "step": 86045 + }, + { + "epoch": 0.08667047392281342, + "grad_norm": 13.839672324889879, + "learning_rate": 4.983477184338984e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.42758620977401735, + "step": 86050 + }, + { + "epoch": 0.0866755099759176, + "grad_norm": 9.915769522595859, + "learning_rate": 4.9834726512296745e-05, + "loss": 2.1302, + "mean_token_accuracy": 0.5021173536777497, + "step": 86055 + }, + { + "epoch": 0.08668054602902177, + "grad_norm": 10.133407924491985, + "learning_rate": 4.983468117500904e-05, + "loss": 2.6179, + "mean_token_accuracy": 0.36551724672317504, + "step": 86060 + }, + { + "epoch": 0.08668558208212594, + "grad_norm": 11.542681569791872, + "learning_rate": 4.983463583152673e-05, + "loss": 2.6594, + "mean_token_accuracy": 0.4068965554237366, + "step": 86065 + }, + { + "epoch": 0.08669061813523012, + "grad_norm": 11.148843643515967, + "learning_rate": 4.983459048184982e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.4482758641242981, + "step": 86070 + }, + { + "epoch": 0.08669565418833429, + "grad_norm": 11.10073841352252, + "learning_rate": 4.983454512597833e-05, + "loss": 2.8404, + "mean_token_accuracy": 0.3931034505367279, + "step": 86075 + }, + { + "epoch": 0.08670069024143846, + "grad_norm": 13.213615982641578, + "learning_rate": 4.9834499763912265e-05, + "loss": 2.5827, + "mean_token_accuracy": 0.42413793206214906, + "step": 86080 + }, + { + "epoch": 0.08670572629454264, + "grad_norm": 9.953562392049552, + "learning_rate": 4.983445439565165e-05, + "loss": 2.4707, + "mean_token_accuracy": 0.45698729157447815, + "step": 86085 + }, + { + "epoch": 0.0867107623476468, + "grad_norm": 11.19288200020367, + "learning_rate": 4.9834409021196494e-05, + "loss": 2.8666, + "mean_token_accuracy": 0.37586206793785093, + "step": 86090 + }, + { + "epoch": 0.08671579840075097, + "grad_norm": 16.79206967013229, + "learning_rate": 4.9834363640546797e-05, + "loss": 2.4193, + "mean_token_accuracy": 0.4600725889205933, + "step": 86095 + }, + { + "epoch": 0.08672083445385514, + "grad_norm": 10.40342554165823, + "learning_rate": 4.9834318253702584e-05, + "loss": 2.6385, + "mean_token_accuracy": 0.3965517282485962, + "step": 86100 + }, + { + "epoch": 0.08672587050695932, + "grad_norm": 11.514682486430118, + "learning_rate": 4.9834272860663864e-05, + "loss": 2.5171, + "mean_token_accuracy": 0.4379310369491577, + "step": 86105 + }, + { + "epoch": 0.08673090656006349, + "grad_norm": 10.742702192675887, + "learning_rate": 4.9834227461430656e-05, + "loss": 2.942, + "mean_token_accuracy": 0.34482758641242983, + "step": 86110 + }, + { + "epoch": 0.08673594261316767, + "grad_norm": 9.636867658145365, + "learning_rate": 4.983418205600297e-05, + "loss": 2.358, + "mean_token_accuracy": 0.4344827592372894, + "step": 86115 + }, + { + "epoch": 0.08674097866627184, + "grad_norm": 11.9567650551864, + "learning_rate": 4.9834136644380806e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.34137930870056155, + "step": 86120 + }, + { + "epoch": 0.08674601471937601, + "grad_norm": 9.223526399465987, + "learning_rate": 4.983409122656419e-05, + "loss": 2.5782, + "mean_token_accuracy": 0.4124621868133545, + "step": 86125 + }, + { + "epoch": 0.08675105077248019, + "grad_norm": 9.495158153590731, + "learning_rate": 4.983404580255313e-05, + "loss": 2.7089, + "mean_token_accuracy": 0.38275861740112305, + "step": 86130 + }, + { + "epoch": 0.08675608682558436, + "grad_norm": 13.040039760771384, + "learning_rate": 4.983400037234763e-05, + "loss": 2.7491, + "mean_token_accuracy": 0.3517241358757019, + "step": 86135 + }, + { + "epoch": 0.08676112287868853, + "grad_norm": 10.16921181916363, + "learning_rate": 4.983395493594772e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.41034482717514037, + "step": 86140 + }, + { + "epoch": 0.08676615893179271, + "grad_norm": 12.274446196978882, + "learning_rate": 4.98339094933534e-05, + "loss": 2.293, + "mean_token_accuracy": 0.45033273100852966, + "step": 86145 + }, + { + "epoch": 0.08677119498489688, + "grad_norm": 12.000737017353911, + "learning_rate": 4.983386404456469e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.36896551847457887, + "step": 86150 + }, + { + "epoch": 0.08677623103800106, + "grad_norm": 11.042622553676953, + "learning_rate": 4.98338185895816e-05, + "loss": 2.8826, + "mean_token_accuracy": 0.38965516090393065, + "step": 86155 + }, + { + "epoch": 0.08678126709110522, + "grad_norm": 11.157016237629469, + "learning_rate": 4.983377312840414e-05, + "loss": 2.5411, + "mean_token_accuracy": 0.42413792610168455, + "step": 86160 + }, + { + "epoch": 0.08678630314420939, + "grad_norm": 10.86416483573507, + "learning_rate": 4.983372766103233e-05, + "loss": 2.3252, + "mean_token_accuracy": 0.4776164650917053, + "step": 86165 + }, + { + "epoch": 0.08679133919731356, + "grad_norm": 11.811959331255306, + "learning_rate": 4.983368218746617e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.42068965137004855, + "step": 86170 + }, + { + "epoch": 0.08679637525041774, + "grad_norm": 11.51004545374619, + "learning_rate": 4.9833636707705686e-05, + "loss": 2.5171, + "mean_token_accuracy": 0.42758620381355283, + "step": 86175 + }, + { + "epoch": 0.08680141130352191, + "grad_norm": 12.039657038884906, + "learning_rate": 4.9833591221750887e-05, + "loss": 2.6749, + "mean_token_accuracy": 0.4034482777118683, + "step": 86180 + }, + { + "epoch": 0.08680644735662608, + "grad_norm": 11.215071559776549, + "learning_rate": 4.9833545729601776e-05, + "loss": 2.4918, + "mean_token_accuracy": 0.441379314661026, + "step": 86185 + }, + { + "epoch": 0.08681148340973026, + "grad_norm": 11.197142773621714, + "learning_rate": 4.9833500231258375e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.41034482717514037, + "step": 86190 + }, + { + "epoch": 0.08681651946283443, + "grad_norm": 8.876092523623365, + "learning_rate": 4.9833454726720696e-05, + "loss": 2.2781, + "mean_token_accuracy": 0.4413793087005615, + "step": 86195 + }, + { + "epoch": 0.0868215555159386, + "grad_norm": 12.53948120040899, + "learning_rate": 4.983340921598875e-05, + "loss": 2.3235, + "mean_token_accuracy": 0.42413793206214906, + "step": 86200 + }, + { + "epoch": 0.08682659156904278, + "grad_norm": 9.29443192637752, + "learning_rate": 4.983336369906255e-05, + "loss": 2.8324, + "mean_token_accuracy": 0.3758620649576187, + "step": 86205 + }, + { + "epoch": 0.08683162762214695, + "grad_norm": 13.324375278355333, + "learning_rate": 4.98333181759421e-05, + "loss": 2.0264, + "mean_token_accuracy": 0.4517241418361664, + "step": 86210 + }, + { + "epoch": 0.08683666367525113, + "grad_norm": 10.27157071510639, + "learning_rate": 4.983327264662743e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.38965516686439516, + "step": 86215 + }, + { + "epoch": 0.0868416997283553, + "grad_norm": 10.237486979217078, + "learning_rate": 4.983322711111854e-05, + "loss": 2.1802, + "mean_token_accuracy": 0.49842709898948667, + "step": 86220 + }, + { + "epoch": 0.08684673578145947, + "grad_norm": 10.600766508349675, + "learning_rate": 4.983318156941545e-05, + "loss": 2.4158, + "mean_token_accuracy": 0.43103448748588563, + "step": 86225 + }, + { + "epoch": 0.08685177183456363, + "grad_norm": 8.130187666066819, + "learning_rate": 4.983313602151817e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.44827585816383364, + "step": 86230 + }, + { + "epoch": 0.08685680788766781, + "grad_norm": 12.039287798293214, + "learning_rate": 4.9833090467426704e-05, + "loss": 2.4962, + "mean_token_accuracy": 0.3999999940395355, + "step": 86235 + }, + { + "epoch": 0.08686184394077198, + "grad_norm": 12.467670197539608, + "learning_rate": 4.983304490714108e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.48148820400238035, + "step": 86240 + }, + { + "epoch": 0.08686687999387616, + "grad_norm": 15.500244431264585, + "learning_rate": 4.9832999340661296e-05, + "loss": 2.8387, + "mean_token_accuracy": 0.38275861740112305, + "step": 86245 + }, + { + "epoch": 0.08687191604698033, + "grad_norm": 10.973068833905813, + "learning_rate": 4.9832953767987365e-05, + "loss": 2.4226, + "mean_token_accuracy": 0.42758620977401735, + "step": 86250 + }, + { + "epoch": 0.0868769521000845, + "grad_norm": 10.12925357349675, + "learning_rate": 4.983290818911932e-05, + "loss": 2.6403, + "mean_token_accuracy": 0.36551723778247835, + "step": 86255 + }, + { + "epoch": 0.08688198815318868, + "grad_norm": 12.62430868079707, + "learning_rate": 4.983286260405716e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4862069010734558, + "step": 86260 + }, + { + "epoch": 0.08688702420629285, + "grad_norm": 10.90219902747991, + "learning_rate": 4.9832817012800885e-05, + "loss": 2.6732, + "mean_token_accuracy": 0.4068965494632721, + "step": 86265 + }, + { + "epoch": 0.08689206025939702, + "grad_norm": 10.065177362608695, + "learning_rate": 4.983277141535052e-05, + "loss": 2.7294, + "mean_token_accuracy": 0.3931034505367279, + "step": 86270 + }, + { + "epoch": 0.0868970963125012, + "grad_norm": 10.896072485267833, + "learning_rate": 4.9832725811706086e-05, + "loss": 2.1008, + "mean_token_accuracy": 0.4862068951129913, + "step": 86275 + }, + { + "epoch": 0.08690213236560537, + "grad_norm": 10.444235410185499, + "learning_rate": 4.983268020186759e-05, + "loss": 2.4354, + "mean_token_accuracy": 0.42885662317276, + "step": 86280 + }, + { + "epoch": 0.08690716841870955, + "grad_norm": 11.4526631570211, + "learning_rate": 4.983263458583503e-05, + "loss": 2.4047, + "mean_token_accuracy": 0.37586206793785093, + "step": 86285 + }, + { + "epoch": 0.08691220447181372, + "grad_norm": 10.529449793034436, + "learning_rate": 4.983258896360844e-05, + "loss": 1.9105, + "mean_token_accuracy": 0.48275862336158754, + "step": 86290 + }, + { + "epoch": 0.0869172405249179, + "grad_norm": 10.809918082435273, + "learning_rate": 4.983254333518782e-05, + "loss": 2.0751, + "mean_token_accuracy": 0.48814276456832884, + "step": 86295 + }, + { + "epoch": 0.08692227657802205, + "grad_norm": 21.292560433174188, + "learning_rate": 4.9832497700573185e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.41724138259887694, + "step": 86300 + }, + { + "epoch": 0.08692731263112623, + "grad_norm": 12.816673339828267, + "learning_rate": 4.983245205976455e-05, + "loss": 2.9364, + "mean_token_accuracy": 0.3620689660310745, + "step": 86305 + }, + { + "epoch": 0.0869323486842304, + "grad_norm": 11.833869941073408, + "learning_rate": 4.9832406412761924e-05, + "loss": 2.9552, + "mean_token_accuracy": 0.36660616397857665, + "step": 86310 + }, + { + "epoch": 0.08693738473733457, + "grad_norm": 13.609791094392282, + "learning_rate": 4.983236075956532e-05, + "loss": 2.6351, + "mean_token_accuracy": 0.39310343861579894, + "step": 86315 + }, + { + "epoch": 0.08694242079043875, + "grad_norm": 10.176924012645953, + "learning_rate": 4.9832315100174767e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.3827586233615875, + "step": 86320 + }, + { + "epoch": 0.08694745684354292, + "grad_norm": 12.350553303582004, + "learning_rate": 4.983226943459025e-05, + "loss": 2.6646, + "mean_token_accuracy": 0.39836661219596864, + "step": 86325 + }, + { + "epoch": 0.0869524928966471, + "grad_norm": 16.064885238913867, + "learning_rate": 4.9832223762811794e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.540350866317749, + "step": 86330 + }, + { + "epoch": 0.08695752894975127, + "grad_norm": 10.63716420320088, + "learning_rate": 4.9832178084839417e-05, + "loss": 2.2531, + "mean_token_accuracy": 0.3896551728248596, + "step": 86335 + }, + { + "epoch": 0.08696256500285544, + "grad_norm": 10.309520842853864, + "learning_rate": 4.983213240067312e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.4724137902259827, + "step": 86340 + }, + { + "epoch": 0.08696760105595962, + "grad_norm": 11.535638647414885, + "learning_rate": 4.983208671031293e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.441379314661026, + "step": 86345 + }, + { + "epoch": 0.08697263710906379, + "grad_norm": 12.796032050910778, + "learning_rate": 4.983204101375885e-05, + "loss": 2.0742, + "mean_token_accuracy": 0.4758620738983154, + "step": 86350 + }, + { + "epoch": 0.08697767316216796, + "grad_norm": 10.353262405733663, + "learning_rate": 4.9831995311010894e-05, + "loss": 2.0933, + "mean_token_accuracy": 0.48965518474578856, + "step": 86355 + }, + { + "epoch": 0.08698270921527214, + "grad_norm": 11.734730262023911, + "learning_rate": 4.983194960206908e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.4188747763633728, + "step": 86360 + }, + { + "epoch": 0.08698774526837631, + "grad_norm": 14.351259118278191, + "learning_rate": 4.983190388693342e-05, + "loss": 2.9389, + "mean_token_accuracy": 0.3655172407627106, + "step": 86365 + }, + { + "epoch": 0.08699278132148047, + "grad_norm": 24.74334912592318, + "learning_rate": 4.983185816560391e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.4034482717514038, + "step": 86370 + }, + { + "epoch": 0.08699781737458465, + "grad_norm": 10.264127803465115, + "learning_rate": 4.983181243808058e-05, + "loss": 1.9235, + "mean_token_accuracy": 0.5119458079338074, + "step": 86375 + }, + { + "epoch": 0.08700285342768882, + "grad_norm": 11.75127999648214, + "learning_rate": 4.983176670436344e-05, + "loss": 2.3934, + "mean_token_accuracy": 0.45686631202697753, + "step": 86380 + }, + { + "epoch": 0.087007889480793, + "grad_norm": 14.235735181515555, + "learning_rate": 4.9831720964452504e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.43448275327682495, + "step": 86385 + }, + { + "epoch": 0.08701292553389717, + "grad_norm": 11.051051473869903, + "learning_rate": 4.9831675218347785e-05, + "loss": 2.5402, + "mean_token_accuracy": 0.40689654350280763, + "step": 86390 + }, + { + "epoch": 0.08701796158700134, + "grad_norm": 13.455568910844482, + "learning_rate": 4.983162946604929e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.42758620381355283, + "step": 86395 + }, + { + "epoch": 0.08702299764010551, + "grad_norm": 9.001625401058353, + "learning_rate": 4.9831583707557025e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.4620689690113068, + "step": 86400 + }, + { + "epoch": 0.08702803369320969, + "grad_norm": 10.509782201886818, + "learning_rate": 4.983153794287102e-05, + "loss": 2.681, + "mean_token_accuracy": 0.4034482717514038, + "step": 86405 + }, + { + "epoch": 0.08703306974631386, + "grad_norm": 11.021595009635545, + "learning_rate": 4.983149217199128e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.4551724135875702, + "step": 86410 + }, + { + "epoch": 0.08703810579941804, + "grad_norm": 12.984169552387932, + "learning_rate": 4.9831446394917813e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.4206896543502808, + "step": 86415 + }, + { + "epoch": 0.08704314185252221, + "grad_norm": 9.813264549211276, + "learning_rate": 4.983140061165064e-05, + "loss": 2.2901, + "mean_token_accuracy": 0.4310344815254211, + "step": 86420 + }, + { + "epoch": 0.08704817790562638, + "grad_norm": 9.218139689167623, + "learning_rate": 4.9831354822189775e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.38275861740112305, + "step": 86425 + }, + { + "epoch": 0.08705321395873056, + "grad_norm": 9.699046529322704, + "learning_rate": 4.9831309026535214e-05, + "loss": 2.5026, + "mean_token_accuracy": 0.46412582099437716, + "step": 86430 + }, + { + "epoch": 0.08705825001183473, + "grad_norm": 9.998793929431956, + "learning_rate": 4.983126322468699e-05, + "loss": 2.123, + "mean_token_accuracy": 0.4705989122390747, + "step": 86435 + }, + { + "epoch": 0.08706328606493889, + "grad_norm": 12.300134160564687, + "learning_rate": 4.98312174166451e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.4448275864124298, + "step": 86440 + }, + { + "epoch": 0.08706832211804306, + "grad_norm": 11.727302607047914, + "learning_rate": 4.983117160240957e-05, + "loss": 2.1086, + "mean_token_accuracy": 0.47241380214691164, + "step": 86445 + }, + { + "epoch": 0.08707335817114724, + "grad_norm": 11.382489306883448, + "learning_rate": 4.9831125781980396e-05, + "loss": 2.3407, + "mean_token_accuracy": 0.46896552443504336, + "step": 86450 + }, + { + "epoch": 0.08707839422425141, + "grad_norm": 9.486189437630772, + "learning_rate": 4.983107995535761e-05, + "loss": 2.3655, + "mean_token_accuracy": 0.4068965554237366, + "step": 86455 + }, + { + "epoch": 0.08708343027735559, + "grad_norm": 14.725889154213995, + "learning_rate": 4.9831034122541216e-05, + "loss": 2.7955, + "mean_token_accuracy": 0.4000000059604645, + "step": 86460 + }, + { + "epoch": 0.08708846633045976, + "grad_norm": 11.08604586911833, + "learning_rate": 4.9830988283531226e-05, + "loss": 2.8231, + "mean_token_accuracy": 0.37586206793785093, + "step": 86465 + }, + { + "epoch": 0.08709350238356393, + "grad_norm": 13.048654158113271, + "learning_rate": 4.983094243832765e-05, + "loss": 2.6801, + "mean_token_accuracy": 0.3896551728248596, + "step": 86470 + }, + { + "epoch": 0.08709853843666811, + "grad_norm": 11.60035183850772, + "learning_rate": 4.983089658693051e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.4103448212146759, + "step": 86475 + }, + { + "epoch": 0.08710357448977228, + "grad_norm": 10.128145077852396, + "learning_rate": 4.98308507293398e-05, + "loss": 2.214, + "mean_token_accuracy": 0.4258318245410919, + "step": 86480 + }, + { + "epoch": 0.08710861054287646, + "grad_norm": 12.06838822051253, + "learning_rate": 4.983080486555556e-05, + "loss": 2.8142, + "mean_token_accuracy": 0.3896551728248596, + "step": 86485 + }, + { + "epoch": 0.08711364659598063, + "grad_norm": 11.006822229917786, + "learning_rate": 4.9830758995577775e-05, + "loss": 2.8015, + "mean_token_accuracy": 0.3896551787853241, + "step": 86490 + }, + { + "epoch": 0.0871186826490848, + "grad_norm": 10.962877414412725, + "learning_rate": 4.983071311940648e-05, + "loss": 2.6848, + "mean_token_accuracy": 0.403448274731636, + "step": 86495 + }, + { + "epoch": 0.08712371870218898, + "grad_norm": 10.034322644338795, + "learning_rate": 4.9830667237041676e-05, + "loss": 2.5382, + "mean_token_accuracy": 0.4379310429096222, + "step": 86500 + }, + { + "epoch": 0.08712875475529315, + "grad_norm": 13.701506922557952, + "learning_rate": 4.9830621348483376e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4620689630508423, + "step": 86505 + }, + { + "epoch": 0.08713379080839731, + "grad_norm": 10.222796104743601, + "learning_rate": 4.983057545373159e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.3896551638841629, + "step": 86510 + }, + { + "epoch": 0.08713882686150148, + "grad_norm": 13.420470133741237, + "learning_rate": 4.983052955278634e-05, + "loss": 2.1636, + "mean_token_accuracy": 0.43793103098869324, + "step": 86515 + }, + { + "epoch": 0.08714386291460566, + "grad_norm": 13.057280724532147, + "learning_rate": 4.983048364564764e-05, + "loss": 2.4996, + "mean_token_accuracy": 0.43968542814254763, + "step": 86520 + }, + { + "epoch": 0.08714889896770983, + "grad_norm": 12.684822732156206, + "learning_rate": 4.983043773231549e-05, + "loss": 2.5853, + "mean_token_accuracy": 0.4034482777118683, + "step": 86525 + }, + { + "epoch": 0.087153935020814, + "grad_norm": 10.154302671341128, + "learning_rate": 4.983039181278992e-05, + "loss": 2.5854, + "mean_token_accuracy": 0.3915305495262146, + "step": 86530 + }, + { + "epoch": 0.08715897107391818, + "grad_norm": 12.384619706703251, + "learning_rate": 4.983034588707092e-05, + "loss": 2.8389, + "mean_token_accuracy": 0.4241379380226135, + "step": 86535 + }, + { + "epoch": 0.08716400712702235, + "grad_norm": 10.433217851793765, + "learning_rate": 4.983029995515852e-05, + "loss": 2.579, + "mean_token_accuracy": 0.37931033968925476, + "step": 86540 + }, + { + "epoch": 0.08716904318012653, + "grad_norm": 10.434153679564936, + "learning_rate": 4.983025401705273e-05, + "loss": 1.9213, + "mean_token_accuracy": 0.5034482777118683, + "step": 86545 + }, + { + "epoch": 0.0871740792332307, + "grad_norm": 9.216721062422392, + "learning_rate": 4.983020807275356e-05, + "loss": 2.1524, + "mean_token_accuracy": 0.4724137902259827, + "step": 86550 + }, + { + "epoch": 0.08717911528633487, + "grad_norm": 9.728362939193179, + "learning_rate": 4.9830162122261026e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4103448212146759, + "step": 86555 + }, + { + "epoch": 0.08718415133943905, + "grad_norm": 10.508825678536265, + "learning_rate": 4.983011616557513e-05, + "loss": 2.4365, + "mean_token_accuracy": 0.4517241358757019, + "step": 86560 + }, + { + "epoch": 0.08718918739254322, + "grad_norm": 10.507069489212183, + "learning_rate": 4.98300702026959e-05, + "loss": 2.1034, + "mean_token_accuracy": 0.458620685338974, + "step": 86565 + }, + { + "epoch": 0.0871942234456474, + "grad_norm": 10.097421939080888, + "learning_rate": 4.9830024233623337e-05, + "loss": 2.2706, + "mean_token_accuracy": 0.4068965554237366, + "step": 86570 + }, + { + "epoch": 0.08719925949875157, + "grad_norm": 12.001618999410463, + "learning_rate": 4.9829978258357465e-05, + "loss": 2.6023, + "mean_token_accuracy": 0.3896551787853241, + "step": 86575 + }, + { + "epoch": 0.08720429555185573, + "grad_norm": 13.415222099877685, + "learning_rate": 4.9829932276898285e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.41034482717514037, + "step": 86580 + }, + { + "epoch": 0.0872093316049599, + "grad_norm": 10.559601364458098, + "learning_rate": 4.982988628924581e-05, + "loss": 2.1764, + "mean_token_accuracy": 0.46896551847457885, + "step": 86585 + }, + { + "epoch": 0.08721436765806408, + "grad_norm": 8.924331869007865, + "learning_rate": 4.982984029540007e-05, + "loss": 1.9151, + "mean_token_accuracy": 0.4778584361076355, + "step": 86590 + }, + { + "epoch": 0.08721940371116825, + "grad_norm": 10.365479045967056, + "learning_rate": 4.982979429536105e-05, + "loss": 2.6874, + "mean_token_accuracy": 0.41034482717514037, + "step": 86595 + }, + { + "epoch": 0.08722443976427242, + "grad_norm": 9.124149321907941, + "learning_rate": 4.9829748289128794e-05, + "loss": 2.8934, + "mean_token_accuracy": 0.36551723480224607, + "step": 86600 + }, + { + "epoch": 0.0872294758173766, + "grad_norm": 13.425100270446846, + "learning_rate": 4.9829702276703296e-05, + "loss": 2.6568, + "mean_token_accuracy": 0.42758620381355283, + "step": 86605 + }, + { + "epoch": 0.08723451187048077, + "grad_norm": 9.877935617474126, + "learning_rate": 4.9829656258084564e-05, + "loss": 2.4363, + "mean_token_accuracy": 0.42758620381355283, + "step": 86610 + }, + { + "epoch": 0.08723954792358495, + "grad_norm": 11.059372673197064, + "learning_rate": 4.9829610233272625e-05, + "loss": 2.4101, + "mean_token_accuracy": 0.3931034505367279, + "step": 86615 + }, + { + "epoch": 0.08724458397668912, + "grad_norm": 10.58638706795638, + "learning_rate": 4.9829564202267486e-05, + "loss": 2.5312, + "mean_token_accuracy": 0.37586206793785093, + "step": 86620 + }, + { + "epoch": 0.08724962002979329, + "grad_norm": 10.919830328714982, + "learning_rate": 4.9829518165069154e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.41379310488700866, + "step": 86625 + }, + { + "epoch": 0.08725465608289747, + "grad_norm": 9.771688996947136, + "learning_rate": 4.982947212167765e-05, + "loss": 2.4804, + "mean_token_accuracy": 0.42413793206214906, + "step": 86630 + }, + { + "epoch": 0.08725969213600164, + "grad_norm": 9.404738746355573, + "learning_rate": 4.9829426072092985e-05, + "loss": 2.528, + "mean_token_accuracy": 0.4172413766384125, + "step": 86635 + }, + { + "epoch": 0.08726472818910581, + "grad_norm": 9.875264115181432, + "learning_rate": 4.982938001631517e-05, + "loss": 2.5382, + "mean_token_accuracy": 0.40344828367233276, + "step": 86640 + }, + { + "epoch": 0.08726976424220999, + "grad_norm": 9.505666994960123, + "learning_rate": 4.9829333954344206e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.4724137902259827, + "step": 86645 + }, + { + "epoch": 0.08727480029531415, + "grad_norm": 9.31943869703653, + "learning_rate": 4.982928788618014e-05, + "loss": 2.5269, + "mean_token_accuracy": 0.4379310369491577, + "step": 86650 + }, + { + "epoch": 0.08727983634841832, + "grad_norm": 11.538320658559636, + "learning_rate": 4.9829241811822946e-05, + "loss": 2.4703, + "mean_token_accuracy": 0.4103448331356049, + "step": 86655 + }, + { + "epoch": 0.0872848724015225, + "grad_norm": 11.151273360181728, + "learning_rate": 4.982919573127266e-05, + "loss": 2.4613, + "mean_token_accuracy": 0.44482759237289426, + "step": 86660 + }, + { + "epoch": 0.08728990845462667, + "grad_norm": 10.596943778836422, + "learning_rate": 4.9829149644529285e-05, + "loss": 2.5471, + "mean_token_accuracy": 0.4206896543502808, + "step": 86665 + }, + { + "epoch": 0.08729494450773084, + "grad_norm": 10.969281037038321, + "learning_rate": 4.9829103551592845e-05, + "loss": 2.2822, + "mean_token_accuracy": 0.458620685338974, + "step": 86670 + }, + { + "epoch": 0.08729998056083502, + "grad_norm": 12.733654876446808, + "learning_rate": 4.9829057452463326e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.45396249890327456, + "step": 86675 + }, + { + "epoch": 0.08730501661393919, + "grad_norm": 10.739116079069797, + "learning_rate": 4.9829011347140784e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.37241379022598264, + "step": 86680 + }, + { + "epoch": 0.08731005266704336, + "grad_norm": 11.006318469723423, + "learning_rate": 4.982896523562519e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.46551724672317507, + "step": 86685 + }, + { + "epoch": 0.08731508872014754, + "grad_norm": 10.177031584406697, + "learning_rate": 4.982891911791658e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.3793103516101837, + "step": 86690 + }, + { + "epoch": 0.08732012477325171, + "grad_norm": 11.789513173750395, + "learning_rate": 4.9828872994014966e-05, + "loss": 2.4298, + "mean_token_accuracy": 0.4137930929660797, + "step": 86695 + }, + { + "epoch": 0.08732516082635589, + "grad_norm": 11.145334680598076, + "learning_rate": 4.982882686392035e-05, + "loss": 2.5246, + "mean_token_accuracy": 0.43793103098869324, + "step": 86700 + }, + { + "epoch": 0.08733019687946006, + "grad_norm": 9.739285686700763, + "learning_rate": 4.982878072763275e-05, + "loss": 2.7633, + "mean_token_accuracy": 0.38965516686439516, + "step": 86705 + }, + { + "epoch": 0.08733523293256423, + "grad_norm": 11.714416685611136, + "learning_rate": 4.982873458515218e-05, + "loss": 2.3571, + "mean_token_accuracy": 0.4586206912994385, + "step": 86710 + }, + { + "epoch": 0.0873402689856684, + "grad_norm": 10.953880668972772, + "learning_rate": 4.9828688436478646e-05, + "loss": 2.7275, + "mean_token_accuracy": 0.4366606116294861, + "step": 86715 + }, + { + "epoch": 0.08734530503877257, + "grad_norm": 9.919426386669343, + "learning_rate": 4.9828642281612184e-05, + "loss": 2.2293, + "mean_token_accuracy": 0.4517241418361664, + "step": 86720 + }, + { + "epoch": 0.08735034109187674, + "grad_norm": 13.089126783304165, + "learning_rate": 4.982859612055277e-05, + "loss": 2.384, + "mean_token_accuracy": 0.42758620381355283, + "step": 86725 + }, + { + "epoch": 0.08735537714498091, + "grad_norm": 11.603534112298055, + "learning_rate": 4.982854995330045e-05, + "loss": 2.4878, + "mean_token_accuracy": 0.39655172228813174, + "step": 86730 + }, + { + "epoch": 0.08736041319808509, + "grad_norm": 9.309726994722043, + "learning_rate": 4.982850377985522e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.48275862336158754, + "step": 86735 + }, + { + "epoch": 0.08736544925118926, + "grad_norm": 8.149088414169967, + "learning_rate": 4.9828457600217096e-05, + "loss": 2.7559, + "mean_token_accuracy": 0.39310343861579894, + "step": 86740 + }, + { + "epoch": 0.08737048530429344, + "grad_norm": 11.883112089343422, + "learning_rate": 4.982841141438609e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.4689655125141144, + "step": 86745 + }, + { + "epoch": 0.08737552135739761, + "grad_norm": 12.74491885443032, + "learning_rate": 4.982836522236222e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.4206896543502808, + "step": 86750 + }, + { + "epoch": 0.08738055741050178, + "grad_norm": 11.418983817165802, + "learning_rate": 4.9828319024145485e-05, + "loss": 2.4003, + "mean_token_accuracy": 0.4034482717514038, + "step": 86755 + }, + { + "epoch": 0.08738559346360596, + "grad_norm": 10.973314148317188, + "learning_rate": 4.982827281973592e-05, + "loss": 2.3717, + "mean_token_accuracy": 0.4103448212146759, + "step": 86760 + }, + { + "epoch": 0.08739062951671013, + "grad_norm": 22.53830700427761, + "learning_rate": 4.982822660913351e-05, + "loss": 2.2425, + "mean_token_accuracy": 0.4551724135875702, + "step": 86765 + }, + { + "epoch": 0.0873956655698143, + "grad_norm": 12.04259844614838, + "learning_rate": 4.982818039233829e-05, + "loss": 2.6112, + "mean_token_accuracy": 0.4, + "step": 86770 + }, + { + "epoch": 0.08740070162291848, + "grad_norm": 10.039397257311627, + "learning_rate": 4.9828134169350277e-05, + "loss": 2.6759, + "mean_token_accuracy": 0.3793103456497192, + "step": 86775 + }, + { + "epoch": 0.08740573767602265, + "grad_norm": 15.017689868611921, + "learning_rate": 4.982808794016946e-05, + "loss": 2.7856, + "mean_token_accuracy": 0.3620689630508423, + "step": 86780 + }, + { + "epoch": 0.08741077372912683, + "grad_norm": 11.048605932412197, + "learning_rate": 4.9828041704795865e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.4206896543502808, + "step": 86785 + }, + { + "epoch": 0.08741580978223099, + "grad_norm": 10.579281017008608, + "learning_rate": 4.98279954632295e-05, + "loss": 2.241, + "mean_token_accuracy": 0.43448275327682495, + "step": 86790 + }, + { + "epoch": 0.08742084583533516, + "grad_norm": 10.712214844866077, + "learning_rate": 4.9827949215470394e-05, + "loss": 2.1358, + "mean_token_accuracy": 0.4206896543502808, + "step": 86795 + }, + { + "epoch": 0.08742588188843933, + "grad_norm": 12.529640105942413, + "learning_rate": 4.982790296151855e-05, + "loss": 2.5098, + "mean_token_accuracy": 0.36896551251411436, + "step": 86800 + }, + { + "epoch": 0.0874309179415435, + "grad_norm": 9.999315738669619, + "learning_rate": 4.982785670137397e-05, + "loss": 2.2154, + "mean_token_accuracy": 0.42758620977401735, + "step": 86805 + }, + { + "epoch": 0.08743595399464768, + "grad_norm": 9.88797328963264, + "learning_rate": 4.9827810435036676e-05, + "loss": 2.6249, + "mean_token_accuracy": 0.37241379022598264, + "step": 86810 + }, + { + "epoch": 0.08744099004775185, + "grad_norm": 10.783193471154624, + "learning_rate": 4.982776416250668e-05, + "loss": 2.4282, + "mean_token_accuracy": 0.4103448212146759, + "step": 86815 + }, + { + "epoch": 0.08744602610085603, + "grad_norm": 16.876841926864223, + "learning_rate": 4.9827717883783996e-05, + "loss": 3.1343, + "mean_token_accuracy": 0.3137931048870087, + "step": 86820 + }, + { + "epoch": 0.0874510621539602, + "grad_norm": 9.761759132626144, + "learning_rate": 4.982767159886864e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.441379314661026, + "step": 86825 + }, + { + "epoch": 0.08745609820706438, + "grad_norm": 10.196037604958772, + "learning_rate": 4.982762530776062e-05, + "loss": 2.9055, + "mean_token_accuracy": 0.37931033968925476, + "step": 86830 + }, + { + "epoch": 0.08746113426016855, + "grad_norm": 13.017654115123996, + "learning_rate": 4.982757901045995e-05, + "loss": 2.7014, + "mean_token_accuracy": 0.39310344457626345, + "step": 86835 + }, + { + "epoch": 0.08746617031327272, + "grad_norm": 11.64475351660073, + "learning_rate": 4.9827532706966635e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.3896551698446274, + "step": 86840 + }, + { + "epoch": 0.0874712063663769, + "grad_norm": 11.971806701511674, + "learning_rate": 4.9827486397280704e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.3931034505367279, + "step": 86845 + }, + { + "epoch": 0.08747624241948107, + "grad_norm": 9.943546299129222, + "learning_rate": 4.982744008140216e-05, + "loss": 2.7105, + "mean_token_accuracy": 0.37586207389831544, + "step": 86850 + }, + { + "epoch": 0.08748127847258524, + "grad_norm": 9.24082139907043, + "learning_rate": 4.982739375933102e-05, + "loss": 2.3129, + "mean_token_accuracy": 0.3862069010734558, + "step": 86855 + }, + { + "epoch": 0.0874863145256894, + "grad_norm": 9.391245096358105, + "learning_rate": 4.982734743106728e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.5018753707408905, + "step": 86860 + }, + { + "epoch": 0.08749135057879358, + "grad_norm": 8.907376340652393, + "learning_rate": 4.9827301096610985e-05, + "loss": 1.8845, + "mean_token_accuracy": 0.501996374130249, + "step": 86865 + }, + { + "epoch": 0.08749638663189775, + "grad_norm": 11.261764231638274, + "learning_rate": 4.9827254755962124e-05, + "loss": 2.7146, + "mean_token_accuracy": 0.4206896543502808, + "step": 86870 + }, + { + "epoch": 0.08750142268500193, + "grad_norm": 12.998363865034651, + "learning_rate": 4.9827208409120706e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.3862069010734558, + "step": 86875 + }, + { + "epoch": 0.0875064587381061, + "grad_norm": 11.121655010994333, + "learning_rate": 4.982716205608677e-05, + "loss": 2.6102, + "mean_token_accuracy": 0.3896551728248596, + "step": 86880 + }, + { + "epoch": 0.08751149479121027, + "grad_norm": 14.46171780760077, + "learning_rate": 4.98271156968603e-05, + "loss": 2.6401, + "mean_token_accuracy": 0.38620689511299133, + "step": 86885 + }, + { + "epoch": 0.08751653084431445, + "grad_norm": 12.159818863566842, + "learning_rate": 4.982706933144133e-05, + "loss": 2.6378, + "mean_token_accuracy": 0.38275861740112305, + "step": 86890 + }, + { + "epoch": 0.08752156689741862, + "grad_norm": 9.713863270946488, + "learning_rate": 4.9827022959829854e-05, + "loss": 2.1876, + "mean_token_accuracy": 0.4448275864124298, + "step": 86895 + }, + { + "epoch": 0.0875266029505228, + "grad_norm": 12.78569760820009, + "learning_rate": 4.98269765820259e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.441379314661026, + "step": 86900 + }, + { + "epoch": 0.08753163900362697, + "grad_norm": 11.14834387010444, + "learning_rate": 4.982693019802948e-05, + "loss": 2.5375, + "mean_token_accuracy": 0.43103448748588563, + "step": 86905 + }, + { + "epoch": 0.08753667505673114, + "grad_norm": 9.31341223847559, + "learning_rate": 4.982688380784059e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.4344827651977539, + "step": 86910 + }, + { + "epoch": 0.08754171110983532, + "grad_norm": 9.94550920296541, + "learning_rate": 4.982683741145927e-05, + "loss": 2.4138, + "mean_token_accuracy": 0.4620689630508423, + "step": 86915 + }, + { + "epoch": 0.08754674716293949, + "grad_norm": 10.891548127398309, + "learning_rate": 4.98267910088855e-05, + "loss": 2.6362, + "mean_token_accuracy": 0.38620689511299133, + "step": 86920 + }, + { + "epoch": 0.08755178321604366, + "grad_norm": 9.971053290444187, + "learning_rate": 4.982674460011933e-05, + "loss": 2.466, + "mean_token_accuracy": 0.42909860610961914, + "step": 86925 + }, + { + "epoch": 0.08755681926914782, + "grad_norm": 11.701906139022288, + "learning_rate": 4.982669818516075e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.4448275864124298, + "step": 86930 + }, + { + "epoch": 0.087561855322252, + "grad_norm": 13.57037109092577, + "learning_rate": 4.982665176400977e-05, + "loss": 2.6491, + "mean_token_accuracy": 0.41724138259887694, + "step": 86935 + }, + { + "epoch": 0.08756689137535617, + "grad_norm": 10.595351682605596, + "learning_rate": 4.982660533666641e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4379310369491577, + "step": 86940 + }, + { + "epoch": 0.08757192742846034, + "grad_norm": 13.863960425673907, + "learning_rate": 4.9826558903130686e-05, + "loss": 2.6916, + "mean_token_accuracy": 0.41379311084747317, + "step": 86945 + }, + { + "epoch": 0.08757696348156452, + "grad_norm": 10.634173686977757, + "learning_rate": 4.9826512463402606e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.458620685338974, + "step": 86950 + }, + { + "epoch": 0.08758199953466869, + "grad_norm": 15.582914926598583, + "learning_rate": 4.9826466017482186e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.4103448331356049, + "step": 86955 + }, + { + "epoch": 0.08758703558777287, + "grad_norm": 12.316523176198718, + "learning_rate": 4.982641956536944e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.41724138259887694, + "step": 86960 + }, + { + "epoch": 0.08759207164087704, + "grad_norm": 9.060379955347333, + "learning_rate": 4.9826373107064376e-05, + "loss": 2.393, + "mean_token_accuracy": 0.4413793087005615, + "step": 86965 + }, + { + "epoch": 0.08759710769398121, + "grad_norm": 10.285193891329168, + "learning_rate": 4.982632664256702e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.41034482717514037, + "step": 86970 + }, + { + "epoch": 0.08760214374708539, + "grad_norm": 10.865535924474674, + "learning_rate": 4.982628017187736e-05, + "loss": 2.6657, + "mean_token_accuracy": 0.4, + "step": 86975 + }, + { + "epoch": 0.08760717980018956, + "grad_norm": 11.527270722616372, + "learning_rate": 4.982623369499542e-05, + "loss": 2.6692, + "mean_token_accuracy": 0.34827585220336915, + "step": 86980 + }, + { + "epoch": 0.08761221585329373, + "grad_norm": 14.159475648746131, + "learning_rate": 4.9826187211921225e-05, + "loss": 2.5246, + "mean_token_accuracy": 0.3862068891525269, + "step": 86985 + }, + { + "epoch": 0.08761725190639791, + "grad_norm": 10.677494534097285, + "learning_rate": 4.9826140722654775e-05, + "loss": 2.2707, + "mean_token_accuracy": 0.45317603945732116, + "step": 86990 + }, + { + "epoch": 0.08762228795950208, + "grad_norm": 9.682594111536721, + "learning_rate": 4.982609422719609e-05, + "loss": 2.3598, + "mean_token_accuracy": 0.4137930929660797, + "step": 86995 + }, + { + "epoch": 0.08762732401260624, + "grad_norm": 11.314496390733074, + "learning_rate": 4.982604772554517e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.3965517282485962, + "step": 87000 + }, + { + "epoch": 0.08763236006571042, + "grad_norm": 10.95452738267262, + "learning_rate": 4.982600121770205e-05, + "loss": 2.7097, + "mean_token_accuracy": 0.3896551728248596, + "step": 87005 + }, + { + "epoch": 0.08763739611881459, + "grad_norm": 9.182224645586281, + "learning_rate": 4.982595470366672e-05, + "loss": 2.3037, + "mean_token_accuracy": 0.4379310369491577, + "step": 87010 + }, + { + "epoch": 0.08764243217191876, + "grad_norm": 11.642238156512468, + "learning_rate": 4.982590818343921e-05, + "loss": 2.4352, + "mean_token_accuracy": 0.3931034505367279, + "step": 87015 + }, + { + "epoch": 0.08764746822502294, + "grad_norm": 11.940889618409653, + "learning_rate": 4.982586165701952e-05, + "loss": 2.7302, + "mean_token_accuracy": 0.41034482717514037, + "step": 87020 + }, + { + "epoch": 0.08765250427812711, + "grad_norm": 11.44544664641868, + "learning_rate": 4.9825815124407674e-05, + "loss": 2.3605, + "mean_token_accuracy": 0.4068965554237366, + "step": 87025 + }, + { + "epoch": 0.08765754033123128, + "grad_norm": 10.943955321660269, + "learning_rate": 4.982576858560368e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.37586206793785093, + "step": 87030 + }, + { + "epoch": 0.08766257638433546, + "grad_norm": 13.232842492203126, + "learning_rate": 4.9825722040607545e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.4034482717514038, + "step": 87035 + }, + { + "epoch": 0.08766761243743963, + "grad_norm": 12.13484111208282, + "learning_rate": 4.9825675489419294e-05, + "loss": 2.8652, + "mean_token_accuracy": 0.3620689630508423, + "step": 87040 + }, + { + "epoch": 0.0876726484905438, + "grad_norm": 11.897758290369051, + "learning_rate": 4.9825628932038934e-05, + "loss": 2.5452, + "mean_token_accuracy": 0.4344827651977539, + "step": 87045 + }, + { + "epoch": 0.08767768454364798, + "grad_norm": 22.21820611773179, + "learning_rate": 4.982558236846648e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.4068965554237366, + "step": 87050 + }, + { + "epoch": 0.08768272059675215, + "grad_norm": 10.818350400260256, + "learning_rate": 4.982553579870193e-05, + "loss": 2.6378, + "mean_token_accuracy": 0.37241379022598264, + "step": 87055 + }, + { + "epoch": 0.08768775664985633, + "grad_norm": 9.882879615463496, + "learning_rate": 4.982548922274532e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.4379310429096222, + "step": 87060 + }, + { + "epoch": 0.0876927927029605, + "grad_norm": 12.040847120836338, + "learning_rate": 4.982544264059665e-05, + "loss": 2.3141, + "mean_token_accuracy": 0.4034482777118683, + "step": 87065 + }, + { + "epoch": 0.08769782875606466, + "grad_norm": 8.75294841010402, + "learning_rate": 4.982539605225594e-05, + "loss": 2.2632, + "mean_token_accuracy": 0.4344827592372894, + "step": 87070 + }, + { + "epoch": 0.08770286480916883, + "grad_norm": 10.143227851954201, + "learning_rate": 4.982534945772319e-05, + "loss": 2.1208, + "mean_token_accuracy": 0.44482758045196535, + "step": 87075 + }, + { + "epoch": 0.08770790086227301, + "grad_norm": 11.00323708617181, + "learning_rate": 4.9825302856998416e-05, + "loss": 2.6254, + "mean_token_accuracy": 0.38275861740112305, + "step": 87080 + }, + { + "epoch": 0.08771293691537718, + "grad_norm": 12.954254014768114, + "learning_rate": 4.9825256250081645e-05, + "loss": 2.4369, + "mean_token_accuracy": 0.42256503403186796, + "step": 87085 + }, + { + "epoch": 0.08771797296848136, + "grad_norm": 10.665546905339701, + "learning_rate": 4.982520963697288e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.37931033968925476, + "step": 87090 + }, + { + "epoch": 0.08772300902158553, + "grad_norm": 11.867585382526197, + "learning_rate": 4.982516301767214e-05, + "loss": 2.7233, + "mean_token_accuracy": 0.3774954676628113, + "step": 87095 + }, + { + "epoch": 0.0877280450746897, + "grad_norm": 10.937442238322449, + "learning_rate": 4.982511639217942e-05, + "loss": 2.6178, + "mean_token_accuracy": 0.42068964838981626, + "step": 87100 + }, + { + "epoch": 0.08773308112779388, + "grad_norm": 13.142302888107096, + "learning_rate": 4.9825069760494756e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.3793103456497192, + "step": 87105 + }, + { + "epoch": 0.08773811718089805, + "grad_norm": 9.594812700444695, + "learning_rate": 4.982502312261815e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.4034482717514038, + "step": 87110 + }, + { + "epoch": 0.08774315323400222, + "grad_norm": 10.295609570047027, + "learning_rate": 4.9824976478549606e-05, + "loss": 2.3922, + "mean_token_accuracy": 0.4206896543502808, + "step": 87115 + }, + { + "epoch": 0.0877481892871064, + "grad_norm": 11.520112799592354, + "learning_rate": 4.9824929828289156e-05, + "loss": 2.4154, + "mean_token_accuracy": 0.4413793087005615, + "step": 87120 + }, + { + "epoch": 0.08775322534021057, + "grad_norm": 10.48388389337717, + "learning_rate": 4.98248831718368e-05, + "loss": 2.3563, + "mean_token_accuracy": 0.42758620381355283, + "step": 87125 + }, + { + "epoch": 0.08775826139331475, + "grad_norm": 10.266186740548447, + "learning_rate": 4.982483650919256e-05, + "loss": 2.3745, + "mean_token_accuracy": 0.4620689570903778, + "step": 87130 + }, + { + "epoch": 0.08776329744641892, + "grad_norm": 11.86086897499934, + "learning_rate": 4.982478984035644e-05, + "loss": 2.2111, + "mean_token_accuracy": 0.4241379380226135, + "step": 87135 + }, + { + "epoch": 0.08776833349952308, + "grad_norm": 10.242304814505573, + "learning_rate": 4.9824743165328446e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.4379310369491577, + "step": 87140 + }, + { + "epoch": 0.08777336955262725, + "grad_norm": 9.913777131119105, + "learning_rate": 4.982469648410861e-05, + "loss": 2.3605, + "mean_token_accuracy": 0.42068964540958403, + "step": 87145 + }, + { + "epoch": 0.08777840560573143, + "grad_norm": 10.644775365379722, + "learning_rate": 4.9824649796696935e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.4344827592372894, + "step": 87150 + }, + { + "epoch": 0.0877834416588356, + "grad_norm": 16.418794025065058, + "learning_rate": 4.982460310309344e-05, + "loss": 2.2685, + "mean_token_accuracy": 0.458620685338974, + "step": 87155 + }, + { + "epoch": 0.08778847771193977, + "grad_norm": 10.779892780686236, + "learning_rate": 4.982455640329813e-05, + "loss": 3.0532, + "mean_token_accuracy": 0.3517241418361664, + "step": 87160 + }, + { + "epoch": 0.08779351376504395, + "grad_norm": 11.973887640332048, + "learning_rate": 4.9824509697311026e-05, + "loss": 2.8534, + "mean_token_accuracy": 0.3862068891525269, + "step": 87165 + }, + { + "epoch": 0.08779854981814812, + "grad_norm": 10.158666604452753, + "learning_rate": 4.982446298513213e-05, + "loss": 2.694, + "mean_token_accuracy": 0.379310342669487, + "step": 87170 + }, + { + "epoch": 0.0878035858712523, + "grad_norm": 19.113830497743432, + "learning_rate": 4.982441626676146e-05, + "loss": 2.2325, + "mean_token_accuracy": 0.4586206912994385, + "step": 87175 + }, + { + "epoch": 0.08780862192435647, + "grad_norm": 10.731565663587366, + "learning_rate": 4.982436954219903e-05, + "loss": 2.3533, + "mean_token_accuracy": 0.42758620977401735, + "step": 87180 + }, + { + "epoch": 0.08781365797746064, + "grad_norm": 10.16734182813282, + "learning_rate": 4.982432281144485e-05, + "loss": 3.0081, + "mean_token_accuracy": 0.36993345618247986, + "step": 87185 + }, + { + "epoch": 0.08781869403056482, + "grad_norm": 11.817832648405341, + "learning_rate": 4.982427607449894e-05, + "loss": 2.2116, + "mean_token_accuracy": 0.44482758045196535, + "step": 87190 + }, + { + "epoch": 0.08782373008366899, + "grad_norm": 10.230732061590063, + "learning_rate": 4.982422933136131e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.39655172228813174, + "step": 87195 + }, + { + "epoch": 0.08782876613677316, + "grad_norm": 15.242906643769517, + "learning_rate": 4.982418258203197e-05, + "loss": 2.9703, + "mean_token_accuracy": 0.3620689630508423, + "step": 87200 + }, + { + "epoch": 0.08783380218987734, + "grad_norm": 10.460467061839495, + "learning_rate": 4.982413582651094e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.41379310488700866, + "step": 87205 + }, + { + "epoch": 0.0878388382429815, + "grad_norm": 13.165955247944053, + "learning_rate": 4.982408906479821e-05, + "loss": 2.2239, + "mean_token_accuracy": 0.45517241954803467, + "step": 87210 + }, + { + "epoch": 0.08784387429608567, + "grad_norm": 13.092107528941291, + "learning_rate": 4.982404229689383e-05, + "loss": 2.5351, + "mean_token_accuracy": 0.38275861740112305, + "step": 87215 + }, + { + "epoch": 0.08784891034918985, + "grad_norm": 11.707153090853298, + "learning_rate": 4.9823995522797784e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.4724137902259827, + "step": 87220 + }, + { + "epoch": 0.08785394640229402, + "grad_norm": 11.882739974292752, + "learning_rate": 4.98239487425101e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.4, + "step": 87225 + }, + { + "epoch": 0.0878589824553982, + "grad_norm": 10.902814296904362, + "learning_rate": 4.982390195603078e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.3793103456497192, + "step": 87230 + }, + { + "epoch": 0.08786401850850237, + "grad_norm": 9.745525060057464, + "learning_rate": 4.9823855163359844e-05, + "loss": 2.3072, + "mean_token_accuracy": 0.43103448748588563, + "step": 87235 + }, + { + "epoch": 0.08786905456160654, + "grad_norm": 12.037085323003463, + "learning_rate": 4.98238083644973e-05, + "loss": 2.3546, + "mean_token_accuracy": 0.4172413766384125, + "step": 87240 + }, + { + "epoch": 0.08787409061471071, + "grad_norm": 9.525344242504563, + "learning_rate": 4.982376155944318e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.4379310369491577, + "step": 87245 + }, + { + "epoch": 0.08787912666781489, + "grad_norm": 14.751099453954428, + "learning_rate": 4.982371474819747e-05, + "loss": 2.4847, + "mean_token_accuracy": 0.4, + "step": 87250 + }, + { + "epoch": 0.08788416272091906, + "grad_norm": 11.95270383405833, + "learning_rate": 4.9823667930760194e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.41379311084747317, + "step": 87255 + }, + { + "epoch": 0.08788919877402324, + "grad_norm": 11.52681163526228, + "learning_rate": 4.9823621107131364e-05, + "loss": 2.4911, + "mean_token_accuracy": 0.4172413766384125, + "step": 87260 + }, + { + "epoch": 0.08789423482712741, + "grad_norm": 10.79811814801969, + "learning_rate": 4.9823574277310995e-05, + "loss": 2.2961, + "mean_token_accuracy": 0.43448275327682495, + "step": 87265 + }, + { + "epoch": 0.08789927088023158, + "grad_norm": 13.366674123513226, + "learning_rate": 4.98235274412991e-05, + "loss": 2.6388, + "mean_token_accuracy": 0.412583190202713, + "step": 87270 + }, + { + "epoch": 0.08790430693333576, + "grad_norm": 17.31629646573536, + "learning_rate": 4.98234805990957e-05, + "loss": 2.4811, + "mean_token_accuracy": 0.37931033968925476, + "step": 87275 + }, + { + "epoch": 0.08790934298643992, + "grad_norm": 12.282512364194803, + "learning_rate": 4.982343375070079e-05, + "loss": 2.4888, + "mean_token_accuracy": 0.39310345649719236, + "step": 87280 + }, + { + "epoch": 0.08791437903954409, + "grad_norm": 12.545339487312773, + "learning_rate": 4.9823386896114396e-05, + "loss": 2.5608, + "mean_token_accuracy": 0.41034482717514037, + "step": 87285 + }, + { + "epoch": 0.08791941509264826, + "grad_norm": 12.963097446800727, + "learning_rate": 4.982334003533652e-05, + "loss": 2.7561, + "mean_token_accuracy": 0.33793103992938994, + "step": 87290 + }, + { + "epoch": 0.08792445114575244, + "grad_norm": 13.587400378766953, + "learning_rate": 4.9823293168367194e-05, + "loss": 2.7437, + "mean_token_accuracy": 0.38965516686439516, + "step": 87295 + }, + { + "epoch": 0.08792948719885661, + "grad_norm": 10.430936503568278, + "learning_rate": 4.982324629520641e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.4551724135875702, + "step": 87300 + }, + { + "epoch": 0.08793452325196079, + "grad_norm": 11.156440753178853, + "learning_rate": 4.982319941585419e-05, + "loss": 2.4182, + "mean_token_accuracy": 0.4206896543502808, + "step": 87305 + }, + { + "epoch": 0.08793955930506496, + "grad_norm": 8.027622348766691, + "learning_rate": 4.982315253031055e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.44827587008476255, + "step": 87310 + }, + { + "epoch": 0.08794459535816913, + "grad_norm": 10.954552107193345, + "learning_rate": 4.9823105638575504e-05, + "loss": 2.5739, + "mean_token_accuracy": 0.39655172228813174, + "step": 87315 + }, + { + "epoch": 0.08794963141127331, + "grad_norm": 9.976398696761777, + "learning_rate": 4.982305874064906e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.44482758045196535, + "step": 87320 + }, + { + "epoch": 0.08795466746437748, + "grad_norm": 12.339657634043824, + "learning_rate": 4.9823011836531233e-05, + "loss": 2.8757, + "mean_token_accuracy": 0.38965516686439516, + "step": 87325 + }, + { + "epoch": 0.08795970351748165, + "grad_norm": 12.458222174712473, + "learning_rate": 4.982296492622203e-05, + "loss": 2.6814, + "mean_token_accuracy": 0.36206896901130675, + "step": 87330 + }, + { + "epoch": 0.08796473957058583, + "grad_norm": 9.570308925693526, + "learning_rate": 4.9822918009721475e-05, + "loss": 2.6587, + "mean_token_accuracy": 0.4034482777118683, + "step": 87335 + }, + { + "epoch": 0.08796977562369, + "grad_norm": 10.868775942585662, + "learning_rate": 4.9822871087029573e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.46091954708099364, + "step": 87340 + }, + { + "epoch": 0.08797481167679418, + "grad_norm": 11.555428055826185, + "learning_rate": 4.982282415814634e-05, + "loss": 2.6108, + "mean_token_accuracy": 0.4034482777118683, + "step": 87345 + }, + { + "epoch": 0.08797984772989834, + "grad_norm": 9.935083830958119, + "learning_rate": 4.982277722307179e-05, + "loss": 2.2064, + "mean_token_accuracy": 0.44827585816383364, + "step": 87350 + }, + { + "epoch": 0.08798488378300251, + "grad_norm": 10.111592160790885, + "learning_rate": 4.982273028180593e-05, + "loss": 2.8531, + "mean_token_accuracy": 0.35862068831920624, + "step": 87355 + }, + { + "epoch": 0.08798991983610668, + "grad_norm": 14.254124842018488, + "learning_rate": 4.982268333434878e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.4068965554237366, + "step": 87360 + }, + { + "epoch": 0.08799495588921086, + "grad_norm": 14.393053490649478, + "learning_rate": 4.982263638070035e-05, + "loss": 2.511, + "mean_token_accuracy": 0.4068965494632721, + "step": 87365 + }, + { + "epoch": 0.08799999194231503, + "grad_norm": 10.156249388278196, + "learning_rate": 4.982258942086066e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.4310344815254211, + "step": 87370 + }, + { + "epoch": 0.0880050279954192, + "grad_norm": 12.834463738934561, + "learning_rate": 4.982254245482971e-05, + "loss": 2.1622, + "mean_token_accuracy": 0.4344827592372894, + "step": 87375 + }, + { + "epoch": 0.08801006404852338, + "grad_norm": 11.13323113480036, + "learning_rate": 4.982249548260752e-05, + "loss": 2.7082, + "mean_token_accuracy": 0.38124621510505674, + "step": 87380 + }, + { + "epoch": 0.08801510010162755, + "grad_norm": 12.44538439734185, + "learning_rate": 4.98224485041941e-05, + "loss": 2.7141, + "mean_token_accuracy": 0.32413792610168457, + "step": 87385 + }, + { + "epoch": 0.08802013615473173, + "grad_norm": 12.090517429445391, + "learning_rate": 4.982240151958947e-05, + "loss": 2.2997, + "mean_token_accuracy": 0.4344827651977539, + "step": 87390 + }, + { + "epoch": 0.0880251722078359, + "grad_norm": 9.802507496600402, + "learning_rate": 4.9822354528793636e-05, + "loss": 2.437, + "mean_token_accuracy": 0.4172413766384125, + "step": 87395 + }, + { + "epoch": 0.08803020826094007, + "grad_norm": 13.246691159984875, + "learning_rate": 4.982230753180662e-05, + "loss": 3.0241, + "mean_token_accuracy": 0.36551723778247835, + "step": 87400 + }, + { + "epoch": 0.08803524431404425, + "grad_norm": 8.914377844947316, + "learning_rate": 4.982226052862843e-05, + "loss": 2.2299, + "mean_token_accuracy": 0.4770935833454132, + "step": 87405 + }, + { + "epoch": 0.08804028036714842, + "grad_norm": 11.888247169072717, + "learning_rate": 4.982221351925906e-05, + "loss": 2.6188, + "mean_token_accuracy": 0.4241379380226135, + "step": 87410 + }, + { + "epoch": 0.0880453164202526, + "grad_norm": 12.354941749975119, + "learning_rate": 4.9822166503698556e-05, + "loss": 3.0598, + "mean_token_accuracy": 0.37586207687854767, + "step": 87415 + }, + { + "epoch": 0.08805035247335675, + "grad_norm": 15.932795667046111, + "learning_rate": 4.982211948194691e-05, + "loss": 2.318, + "mean_token_accuracy": 0.4758620738983154, + "step": 87420 + }, + { + "epoch": 0.08805538852646093, + "grad_norm": 10.011272983179536, + "learning_rate": 4.9822072454004146e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.42413793206214906, + "step": 87425 + }, + { + "epoch": 0.0880604245795651, + "grad_norm": 12.206825930185158, + "learning_rate": 4.982202541987027e-05, + "loss": 2.5906, + "mean_token_accuracy": 0.40344828069210054, + "step": 87430 + }, + { + "epoch": 0.08806546063266928, + "grad_norm": 12.026779364915114, + "learning_rate": 4.98219783795453e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.3497277647256851, + "step": 87435 + }, + { + "epoch": 0.08807049668577345, + "grad_norm": 9.385388861481918, + "learning_rate": 4.9821931333029234e-05, + "loss": 2.6932, + "mean_token_accuracy": 0.43103448748588563, + "step": 87440 + }, + { + "epoch": 0.08807553273887762, + "grad_norm": 11.135418625599826, + "learning_rate": 4.9821884280322106e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.37586206793785093, + "step": 87445 + }, + { + "epoch": 0.0880805687919818, + "grad_norm": 11.258497299871738, + "learning_rate": 4.9821837221423925e-05, + "loss": 2.6222, + "mean_token_accuracy": 0.44827585816383364, + "step": 87450 + }, + { + "epoch": 0.08808560484508597, + "grad_norm": 10.02352636895581, + "learning_rate": 4.982179015633469e-05, + "loss": 2.597, + "mean_token_accuracy": 0.3931034505367279, + "step": 87455 + }, + { + "epoch": 0.08809064089819015, + "grad_norm": 11.675619599129817, + "learning_rate": 4.982174308505443e-05, + "loss": 2.2365, + "mean_token_accuracy": 0.4655172348022461, + "step": 87460 + }, + { + "epoch": 0.08809567695129432, + "grad_norm": 11.686532819961682, + "learning_rate": 4.982169600758315e-05, + "loss": 3.0111, + "mean_token_accuracy": 0.358620685338974, + "step": 87465 + }, + { + "epoch": 0.08810071300439849, + "grad_norm": 10.829387671461287, + "learning_rate": 4.9821648923920854e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.43968542814254763, + "step": 87470 + }, + { + "epoch": 0.08810574905750267, + "grad_norm": 9.897642529755426, + "learning_rate": 4.982160183406758e-05, + "loss": 2.5988, + "mean_token_accuracy": 0.4103448331356049, + "step": 87475 + }, + { + "epoch": 0.08811078511060684, + "grad_norm": 11.884456762703117, + "learning_rate": 4.9821554738023315e-05, + "loss": 2.727, + "mean_token_accuracy": 0.39655172228813174, + "step": 87480 + }, + { + "epoch": 0.08811582116371101, + "grad_norm": 11.68597755685487, + "learning_rate": 4.98215076357881e-05, + "loss": 2.7716, + "mean_token_accuracy": 0.37241379618644715, + "step": 87485 + }, + { + "epoch": 0.08812085721681517, + "grad_norm": 12.643818332552291, + "learning_rate": 4.9821460527361915e-05, + "loss": 2.5391, + "mean_token_accuracy": 0.4465819776058197, + "step": 87490 + }, + { + "epoch": 0.08812589326991935, + "grad_norm": 9.679271877094756, + "learning_rate": 4.982141341274479e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.3793103456497192, + "step": 87495 + }, + { + "epoch": 0.08813092932302352, + "grad_norm": 13.44168573418974, + "learning_rate": 4.982136629193674e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.4068965494632721, + "step": 87500 + }, + { + "epoch": 0.0881359653761277, + "grad_norm": 9.732729086347861, + "learning_rate": 4.982131916493778e-05, + "loss": 2.4329, + "mean_token_accuracy": 0.3965517282485962, + "step": 87505 + }, + { + "epoch": 0.08814100142923187, + "grad_norm": 9.844585183354637, + "learning_rate": 4.982127203174792e-05, + "loss": 2.7978, + "mean_token_accuracy": 0.3709013909101486, + "step": 87510 + }, + { + "epoch": 0.08814603748233604, + "grad_norm": 12.715067642577257, + "learning_rate": 4.982122489236717e-05, + "loss": 2.2821, + "mean_token_accuracy": 0.4655172348022461, + "step": 87515 + }, + { + "epoch": 0.08815107353544022, + "grad_norm": 13.135338725852302, + "learning_rate": 4.9821177746795536e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.4103448331356049, + "step": 87520 + }, + { + "epoch": 0.08815610958854439, + "grad_norm": 9.589405395692806, + "learning_rate": 4.9821130595033054e-05, + "loss": 2.468, + "mean_token_accuracy": 0.4344827651977539, + "step": 87525 + }, + { + "epoch": 0.08816114564164856, + "grad_norm": 11.375059671353128, + "learning_rate": 4.982108343707972e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.4568663060665131, + "step": 87530 + }, + { + "epoch": 0.08816618169475274, + "grad_norm": 24.812493939893496, + "learning_rate": 4.9821036272935545e-05, + "loss": 2.482, + "mean_token_accuracy": 0.4206896543502808, + "step": 87535 + }, + { + "epoch": 0.08817121774785691, + "grad_norm": 9.157420465336113, + "learning_rate": 4.982098910260055e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.4401693969964981, + "step": 87540 + }, + { + "epoch": 0.08817625380096109, + "grad_norm": 10.446821851318393, + "learning_rate": 4.982094192607475e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.4465819776058197, + "step": 87545 + }, + { + "epoch": 0.08818128985406526, + "grad_norm": 11.642744374592246, + "learning_rate": 4.982089474335815e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.42413793206214906, + "step": 87550 + }, + { + "epoch": 0.08818632590716943, + "grad_norm": 11.983437453154355, + "learning_rate": 4.982084755445077e-05, + "loss": 2.8039, + "mean_token_accuracy": 0.37586206793785093, + "step": 87555 + }, + { + "epoch": 0.08819136196027359, + "grad_norm": 13.668340564527643, + "learning_rate": 4.9820800359352605e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.417241370677948, + "step": 87560 + }, + { + "epoch": 0.08819639801337777, + "grad_norm": 10.479670811162928, + "learning_rate": 4.98207531580637e-05, + "loss": 2.2537, + "mean_token_accuracy": 0.44482759237289426, + "step": 87565 + }, + { + "epoch": 0.08820143406648194, + "grad_norm": 10.631849719830198, + "learning_rate": 4.9820705950584035e-05, + "loss": 2.3667, + "mean_token_accuracy": 0.42413793206214906, + "step": 87570 + }, + { + "epoch": 0.08820647011958611, + "grad_norm": 12.904279813597961, + "learning_rate": 4.982065873691365e-05, + "loss": 2.5433, + "mean_token_accuracy": 0.4137930929660797, + "step": 87575 + }, + { + "epoch": 0.08821150617269029, + "grad_norm": 11.02525058458468, + "learning_rate": 4.9820611517052545e-05, + "loss": 2.5532, + "mean_token_accuracy": 0.3793103456497192, + "step": 87580 + }, + { + "epoch": 0.08821654222579446, + "grad_norm": 11.504604589686444, + "learning_rate": 4.982056429100074e-05, + "loss": 2.1735, + "mean_token_accuracy": 0.49655171632766726, + "step": 87585 + }, + { + "epoch": 0.08822157827889864, + "grad_norm": 9.749868773533054, + "learning_rate": 4.982051705875823e-05, + "loss": 2.164, + "mean_token_accuracy": 0.44827585220336913, + "step": 87590 + }, + { + "epoch": 0.08822661433200281, + "grad_norm": 12.687294243642373, + "learning_rate": 4.9820469820325053e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.4241379380226135, + "step": 87595 + }, + { + "epoch": 0.08823165038510698, + "grad_norm": 10.388131070098565, + "learning_rate": 4.9820422575701205e-05, + "loss": 2.5767, + "mean_token_accuracy": 0.3999999940395355, + "step": 87600 + }, + { + "epoch": 0.08823668643821116, + "grad_norm": 18.894800300736478, + "learning_rate": 4.982037532488671e-05, + "loss": 2.6195, + "mean_token_accuracy": 0.39310344457626345, + "step": 87605 + }, + { + "epoch": 0.08824172249131533, + "grad_norm": 9.997621936202252, + "learning_rate": 4.982032806788157e-05, + "loss": 2.0228, + "mean_token_accuracy": 0.4620689570903778, + "step": 87610 + }, + { + "epoch": 0.0882467585444195, + "grad_norm": 13.676212609794502, + "learning_rate": 4.9820280804685806e-05, + "loss": 2.6558, + "mean_token_accuracy": 0.4103448331356049, + "step": 87615 + }, + { + "epoch": 0.08825179459752368, + "grad_norm": 11.467256539505701, + "learning_rate": 4.982023353529943e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.4103448212146759, + "step": 87620 + }, + { + "epoch": 0.08825683065062785, + "grad_norm": 11.218107029167502, + "learning_rate": 4.982018625972244e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.4103448212146759, + "step": 87625 + }, + { + "epoch": 0.08826186670373201, + "grad_norm": 9.838688047664524, + "learning_rate": 4.982013897795489e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.38620689511299133, + "step": 87630 + }, + { + "epoch": 0.08826690275683619, + "grad_norm": 10.528866640887859, + "learning_rate": 4.982009168999675e-05, + "loss": 2.4343, + "mean_token_accuracy": 0.47586206197738645, + "step": 87635 + }, + { + "epoch": 0.08827193880994036, + "grad_norm": 8.865704197902149, + "learning_rate": 4.982004439584805e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.42413792610168455, + "step": 87640 + }, + { + "epoch": 0.08827697486304453, + "grad_norm": 15.057659895772288, + "learning_rate": 4.9819997095508805e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.40344828069210054, + "step": 87645 + }, + { + "epoch": 0.0882820109161487, + "grad_norm": 11.12231753508984, + "learning_rate": 4.981994978897902e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.40689654350280763, + "step": 87650 + }, + { + "epoch": 0.08828704696925288, + "grad_norm": 12.298926677072737, + "learning_rate": 4.9819902476258714e-05, + "loss": 2.2015, + "mean_token_accuracy": 0.4620689690113068, + "step": 87655 + }, + { + "epoch": 0.08829208302235705, + "grad_norm": 9.089841585734444, + "learning_rate": 4.9819855157347914e-05, + "loss": 2.3696, + "mean_token_accuracy": 0.4344827592372894, + "step": 87660 + }, + { + "epoch": 0.08829711907546123, + "grad_norm": 14.201703198175254, + "learning_rate": 4.9819807832246604e-05, + "loss": 2.3236, + "mean_token_accuracy": 0.4551724135875702, + "step": 87665 + }, + { + "epoch": 0.0883021551285654, + "grad_norm": 11.852600873225015, + "learning_rate": 4.981976050095482e-05, + "loss": 2.4893, + "mean_token_accuracy": 0.3793103456497192, + "step": 87670 + }, + { + "epoch": 0.08830719118166958, + "grad_norm": 12.506076091065202, + "learning_rate": 4.9819713163472564e-05, + "loss": 2.4389, + "mean_token_accuracy": 0.4376890480518341, + "step": 87675 + }, + { + "epoch": 0.08831222723477375, + "grad_norm": 10.798763548637718, + "learning_rate": 4.9819665819799846e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.44827587008476255, + "step": 87680 + }, + { + "epoch": 0.08831726328787792, + "grad_norm": 13.390477937099545, + "learning_rate": 4.9819618469936694e-05, + "loss": 2.6976, + "mean_token_accuracy": 0.41379310488700866, + "step": 87685 + }, + { + "epoch": 0.0883222993409821, + "grad_norm": 10.88316406307631, + "learning_rate": 4.981957111388311e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.4448275864124298, + "step": 87690 + }, + { + "epoch": 0.08832733539408627, + "grad_norm": 13.39144109417419, + "learning_rate": 4.981952375163911e-05, + "loss": 2.5784, + "mean_token_accuracy": 0.3655172407627106, + "step": 87695 + }, + { + "epoch": 0.08833237144719043, + "grad_norm": 11.101355904297504, + "learning_rate": 4.9819476383204706e-05, + "loss": 2.4605, + "mean_token_accuracy": 0.46031458377838136, + "step": 87700 + }, + { + "epoch": 0.0883374075002946, + "grad_norm": 11.56462169048237, + "learning_rate": 4.981942900857991e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.39655172228813174, + "step": 87705 + }, + { + "epoch": 0.08834244355339878, + "grad_norm": 10.444161711773909, + "learning_rate": 4.981938162776474e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.4172413766384125, + "step": 87710 + }, + { + "epoch": 0.08834747960650295, + "grad_norm": 11.47963795079637, + "learning_rate": 4.981933424075921e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.4, + "step": 87715 + }, + { + "epoch": 0.08835251565960713, + "grad_norm": 10.2852508608451, + "learning_rate": 4.9819286847563326e-05, + "loss": 2.3855, + "mean_token_accuracy": 0.4517241299152374, + "step": 87720 + }, + { + "epoch": 0.0883575517127113, + "grad_norm": 11.703583073120697, + "learning_rate": 4.9819239448177105e-05, + "loss": 2.2483, + "mean_token_accuracy": 0.42758620977401735, + "step": 87725 + }, + { + "epoch": 0.08836258776581547, + "grad_norm": 11.829333089207436, + "learning_rate": 4.981919204260056e-05, + "loss": 2.2301, + "mean_token_accuracy": 0.4241379380226135, + "step": 87730 + }, + { + "epoch": 0.08836762381891965, + "grad_norm": 11.219533018776746, + "learning_rate": 4.9819144630833695e-05, + "loss": 2.467, + "mean_token_accuracy": 0.4310344815254211, + "step": 87735 + }, + { + "epoch": 0.08837265987202382, + "grad_norm": 11.177485302727815, + "learning_rate": 4.9819097212876534e-05, + "loss": 2.4254, + "mean_token_accuracy": 0.4448275864124298, + "step": 87740 + }, + { + "epoch": 0.088377695925128, + "grad_norm": 13.004480022597187, + "learning_rate": 4.9819049788729094e-05, + "loss": 1.9713, + "mean_token_accuracy": 0.4931034505367279, + "step": 87745 + }, + { + "epoch": 0.08838273197823217, + "grad_norm": 13.221198867385475, + "learning_rate": 4.981900235839139e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.42068966031074523, + "step": 87750 + }, + { + "epoch": 0.08838776803133634, + "grad_norm": 13.253144482229784, + "learning_rate": 4.9818954921863415e-05, + "loss": 2.3111, + "mean_token_accuracy": 0.4448275864124298, + "step": 87755 + }, + { + "epoch": 0.08839280408444052, + "grad_norm": 9.364057670469068, + "learning_rate": 4.98189074791452e-05, + "loss": 2.3802, + "mean_token_accuracy": 0.4620689630508423, + "step": 87760 + }, + { + "epoch": 0.08839784013754469, + "grad_norm": 10.418001624689813, + "learning_rate": 4.9818860030236754e-05, + "loss": 2.0979, + "mean_token_accuracy": 0.5034482777118683, + "step": 87765 + }, + { + "epoch": 0.08840287619064885, + "grad_norm": 12.828709084193635, + "learning_rate": 4.9818812575138086e-05, + "loss": 2.5804, + "mean_token_accuracy": 0.4551724135875702, + "step": 87770 + }, + { + "epoch": 0.08840791224375302, + "grad_norm": 9.670134148783331, + "learning_rate": 4.9818765113849206e-05, + "loss": 2.372, + "mean_token_accuracy": 0.458620685338974, + "step": 87775 + }, + { + "epoch": 0.0884129482968572, + "grad_norm": 10.651866321093772, + "learning_rate": 4.981871764637014e-05, + "loss": 2.901, + "mean_token_accuracy": 0.417241370677948, + "step": 87780 + }, + { + "epoch": 0.08841798434996137, + "grad_norm": 15.820722669884764, + "learning_rate": 4.981867017270089e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.43448275327682495, + "step": 87785 + }, + { + "epoch": 0.08842302040306554, + "grad_norm": 11.330669954628922, + "learning_rate": 4.981862269284148e-05, + "loss": 2.7044, + "mean_token_accuracy": 0.3655172407627106, + "step": 87790 + }, + { + "epoch": 0.08842805645616972, + "grad_norm": 10.562349229340056, + "learning_rate": 4.9818575206791916e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.4482758641242981, + "step": 87795 + }, + { + "epoch": 0.08843309250927389, + "grad_norm": 12.241803030407496, + "learning_rate": 4.9818527714552215e-05, + "loss": 2.7995, + "mean_token_accuracy": 0.35862069129943847, + "step": 87800 + }, + { + "epoch": 0.08843812856237807, + "grad_norm": 11.690954324615594, + "learning_rate": 4.981848021612238e-05, + "loss": 2.7606, + "mean_token_accuracy": 0.3620689630508423, + "step": 87805 + }, + { + "epoch": 0.08844316461548224, + "grad_norm": 11.233172023577994, + "learning_rate": 4.9818432711502436e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.4103448331356049, + "step": 87810 + }, + { + "epoch": 0.08844820066858641, + "grad_norm": 10.480789938385163, + "learning_rate": 4.981838520069238e-05, + "loss": 2.1786, + "mean_token_accuracy": 0.4620689690113068, + "step": 87815 + }, + { + "epoch": 0.08845323672169059, + "grad_norm": 10.841802129327977, + "learning_rate": 4.9818337683692244e-05, + "loss": 2.4375, + "mean_token_accuracy": 0.4344827651977539, + "step": 87820 + }, + { + "epoch": 0.08845827277479476, + "grad_norm": 12.848947789351016, + "learning_rate": 4.981829016050205e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.41379310488700866, + "step": 87825 + }, + { + "epoch": 0.08846330882789893, + "grad_norm": 10.80538439172015, + "learning_rate": 4.9818242631121774e-05, + "loss": 2.2153, + "mean_token_accuracy": 0.48620688915252686, + "step": 87830 + }, + { + "epoch": 0.08846834488100311, + "grad_norm": 10.187808847368395, + "learning_rate": 4.981819509555145e-05, + "loss": 2.4428, + "mean_token_accuracy": 0.3827586233615875, + "step": 87835 + }, + { + "epoch": 0.08847338093410727, + "grad_norm": 11.69449228151899, + "learning_rate": 4.981814755379111e-05, + "loss": 2.6341, + "mean_token_accuracy": 0.4137930989265442, + "step": 87840 + }, + { + "epoch": 0.08847841698721144, + "grad_norm": 10.03887366549081, + "learning_rate": 4.981810000584073e-05, + "loss": 2.3729, + "mean_token_accuracy": 0.42068966031074523, + "step": 87845 + }, + { + "epoch": 0.08848345304031562, + "grad_norm": 12.747454825514911, + "learning_rate": 4.981805245170035e-05, + "loss": 2.5979, + "mean_token_accuracy": 0.4137930989265442, + "step": 87850 + }, + { + "epoch": 0.08848848909341979, + "grad_norm": 11.150895883090465, + "learning_rate": 4.9818004891369966e-05, + "loss": 2.2396, + "mean_token_accuracy": 0.4586206912994385, + "step": 87855 + }, + { + "epoch": 0.08849352514652396, + "grad_norm": 13.141795495815318, + "learning_rate": 4.981795732484961e-05, + "loss": 2.9082, + "mean_token_accuracy": 0.3724137872457504, + "step": 87860 + }, + { + "epoch": 0.08849856119962814, + "grad_norm": 15.321646771853748, + "learning_rate": 4.9817909752139285e-05, + "loss": 2.971, + "mean_token_accuracy": 0.337931028008461, + "step": 87865 + }, + { + "epoch": 0.08850359725273231, + "grad_norm": 10.821358503351378, + "learning_rate": 4.9817862173239e-05, + "loss": 2.1116, + "mean_token_accuracy": 0.4620689690113068, + "step": 87870 + }, + { + "epoch": 0.08850863330583648, + "grad_norm": 12.661220390796624, + "learning_rate": 4.981781458814877e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.417241370677948, + "step": 87875 + }, + { + "epoch": 0.08851366935894066, + "grad_norm": 17.646373875883018, + "learning_rate": 4.981776699686862e-05, + "loss": 3.1473, + "mean_token_accuracy": 0.3517241358757019, + "step": 87880 + }, + { + "epoch": 0.08851870541204483, + "grad_norm": 9.587476550573102, + "learning_rate": 4.9817719399398546e-05, + "loss": 2.7776, + "mean_token_accuracy": 0.44137930274009707, + "step": 87885 + }, + { + "epoch": 0.088523741465149, + "grad_norm": 11.667651580756042, + "learning_rate": 4.981767179573857e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.41379311084747317, + "step": 87890 + }, + { + "epoch": 0.08852877751825318, + "grad_norm": 10.613058487223562, + "learning_rate": 4.9817624185888713e-05, + "loss": 2.0334, + "mean_token_accuracy": 0.4896551609039307, + "step": 87895 + }, + { + "epoch": 0.08853381357135735, + "grad_norm": 10.785547654042238, + "learning_rate": 4.9817576569848974e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.42068964838981626, + "step": 87900 + }, + { + "epoch": 0.08853884962446153, + "grad_norm": 12.023743516054164, + "learning_rate": 4.981752894761937e-05, + "loss": 2.7896, + "mean_token_accuracy": 0.3931034505367279, + "step": 87905 + }, + { + "epoch": 0.08854388567756569, + "grad_norm": 10.84399122971706, + "learning_rate": 4.9817481319199924e-05, + "loss": 2.5543, + "mean_token_accuracy": 0.4344827592372894, + "step": 87910 + }, + { + "epoch": 0.08854892173066986, + "grad_norm": 9.180768734603031, + "learning_rate": 4.981743368459063e-05, + "loss": 2.1375, + "mean_token_accuracy": 0.4517241358757019, + "step": 87915 + }, + { + "epoch": 0.08855395778377403, + "grad_norm": 8.938753631615219, + "learning_rate": 4.981738604379151e-05, + "loss": 2.2546, + "mean_token_accuracy": 0.4771324872970581, + "step": 87920 + }, + { + "epoch": 0.08855899383687821, + "grad_norm": 13.926193669797161, + "learning_rate": 4.981733839680259e-05, + "loss": 2.7287, + "mean_token_accuracy": 0.4137930989265442, + "step": 87925 + }, + { + "epoch": 0.08856402988998238, + "grad_norm": 12.325971245645706, + "learning_rate": 4.981729074362387e-05, + "loss": 2.2006, + "mean_token_accuracy": 0.42758620381355283, + "step": 87930 + }, + { + "epoch": 0.08856906594308656, + "grad_norm": 11.207253569357638, + "learning_rate": 4.981724308425537e-05, + "loss": 2.6595, + "mean_token_accuracy": 0.38620689511299133, + "step": 87935 + }, + { + "epoch": 0.08857410199619073, + "grad_norm": 14.064535457601599, + "learning_rate": 4.98171954186971e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4068965494632721, + "step": 87940 + }, + { + "epoch": 0.0885791380492949, + "grad_norm": 9.960627697770352, + "learning_rate": 4.981714774694907e-05, + "loss": 2.1481, + "mean_token_accuracy": 0.4344827651977539, + "step": 87945 + }, + { + "epoch": 0.08858417410239908, + "grad_norm": 9.993233894479232, + "learning_rate": 4.981710006901129e-05, + "loss": 2.509, + "mean_token_accuracy": 0.4689655125141144, + "step": 87950 + }, + { + "epoch": 0.08858921015550325, + "grad_norm": 10.113483608820857, + "learning_rate": 4.9817052384883785e-05, + "loss": 2.3949, + "mean_token_accuracy": 0.40344828069210054, + "step": 87955 + }, + { + "epoch": 0.08859424620860742, + "grad_norm": 11.393337158619628, + "learning_rate": 4.981700469456656e-05, + "loss": 2.3427, + "mean_token_accuracy": 0.42220205068588257, + "step": 87960 + }, + { + "epoch": 0.0885992822617116, + "grad_norm": 9.484012663581293, + "learning_rate": 4.981695699805963e-05, + "loss": 2.596, + "mean_token_accuracy": 0.3931034505367279, + "step": 87965 + }, + { + "epoch": 0.08860431831481577, + "grad_norm": 16.120742144363355, + "learning_rate": 4.981690929536301e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.41724138259887694, + "step": 87970 + }, + { + "epoch": 0.08860935436791993, + "grad_norm": 9.711929420038334, + "learning_rate": 4.981686158647671e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.38620689511299133, + "step": 87975 + }, + { + "epoch": 0.0886143904210241, + "grad_norm": 9.248194928750396, + "learning_rate": 4.9816813871400746e-05, + "loss": 2.442, + "mean_token_accuracy": 0.4016333997249603, + "step": 87980 + }, + { + "epoch": 0.08861942647412828, + "grad_norm": 11.893083325521431, + "learning_rate": 4.981676615013513e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.4310344815254211, + "step": 87985 + }, + { + "epoch": 0.08862446252723245, + "grad_norm": 10.955280481873618, + "learning_rate": 4.981671842267988e-05, + "loss": 2.851, + "mean_token_accuracy": 0.3896551728248596, + "step": 87990 + }, + { + "epoch": 0.08862949858033663, + "grad_norm": 12.355012198273926, + "learning_rate": 4.9816670689035e-05, + "loss": 2.3293, + "mean_token_accuracy": 0.43103447556495667, + "step": 87995 + }, + { + "epoch": 0.0886345346334408, + "grad_norm": 15.01858193412576, + "learning_rate": 4.981662294920051e-05, + "loss": 2.17, + "mean_token_accuracy": 0.4589901328086853, + "step": 88000 + }, + { + "epoch": 0.08863957068654497, + "grad_norm": 10.262428157981082, + "learning_rate": 4.9816575203176416e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.47931033968925474, + "step": 88005 + }, + { + "epoch": 0.08864460673964915, + "grad_norm": 9.476088887849924, + "learning_rate": 4.981652745096275e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4034482777118683, + "step": 88010 + }, + { + "epoch": 0.08864964279275332, + "grad_norm": 10.96851701050586, + "learning_rate": 4.98164796925595e-05, + "loss": 2.5077, + "mean_token_accuracy": 0.39147005379199984, + "step": 88015 + }, + { + "epoch": 0.0886546788458575, + "grad_norm": 11.95229470923467, + "learning_rate": 4.98164319279667e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.43103447556495667, + "step": 88020 + }, + { + "epoch": 0.08865971489896167, + "grad_norm": 12.142336831484991, + "learning_rate": 4.981638415718434e-05, + "loss": 2.6342, + "mean_token_accuracy": 0.3758620649576187, + "step": 88025 + }, + { + "epoch": 0.08866475095206584, + "grad_norm": 9.37232380597033, + "learning_rate": 4.981633638021246e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.42758620977401735, + "step": 88030 + }, + { + "epoch": 0.08866978700517002, + "grad_norm": 10.467462267537808, + "learning_rate": 4.981628859705106e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.4034482777118683, + "step": 88035 + }, + { + "epoch": 0.08867482305827419, + "grad_norm": 13.585630991246987, + "learning_rate": 4.981624080770015e-05, + "loss": 2.6103, + "mean_token_accuracy": 0.41724138259887694, + "step": 88040 + }, + { + "epoch": 0.08867985911137835, + "grad_norm": 10.910376517535214, + "learning_rate": 4.981619301215975e-05, + "loss": 2.2711, + "mean_token_accuracy": 0.47931033968925474, + "step": 88045 + }, + { + "epoch": 0.08868489516448252, + "grad_norm": 9.565287758192914, + "learning_rate": 4.981614521042987e-05, + "loss": 2.1115, + "mean_token_accuracy": 0.4655172348022461, + "step": 88050 + }, + { + "epoch": 0.0886899312175867, + "grad_norm": 10.265469483756734, + "learning_rate": 4.981609740251053e-05, + "loss": 2.722, + "mean_token_accuracy": 0.38620689511299133, + "step": 88055 + }, + { + "epoch": 0.08869496727069087, + "grad_norm": 13.984695819649867, + "learning_rate": 4.981604958840172e-05, + "loss": 2.5449, + "mean_token_accuracy": 0.4034482717514038, + "step": 88060 + }, + { + "epoch": 0.08870000332379505, + "grad_norm": 11.490298826910722, + "learning_rate": 4.9816001768103485e-05, + "loss": 2.5584, + "mean_token_accuracy": 0.3793103456497192, + "step": 88065 + }, + { + "epoch": 0.08870503937689922, + "grad_norm": 10.73307178993154, + "learning_rate": 4.9815953941615814e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.41724138259887694, + "step": 88070 + }, + { + "epoch": 0.0887100754300034, + "grad_norm": 9.737514866336396, + "learning_rate": 4.9815906108938745e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.46696914434432985, + "step": 88075 + }, + { + "epoch": 0.08871511148310757, + "grad_norm": 10.049729271602805, + "learning_rate": 4.9815858270072264e-05, + "loss": 2.3386, + "mean_token_accuracy": 0.43103448748588563, + "step": 88080 + }, + { + "epoch": 0.08872014753621174, + "grad_norm": 10.112141150316083, + "learning_rate": 4.98158104250164e-05, + "loss": 2.6625, + "mean_token_accuracy": 0.4034482777118683, + "step": 88085 + }, + { + "epoch": 0.08872518358931591, + "grad_norm": 10.9528475206623, + "learning_rate": 4.9815762573771155e-05, + "loss": 1.8999, + "mean_token_accuracy": 0.493103438615799, + "step": 88090 + }, + { + "epoch": 0.08873021964242009, + "grad_norm": 10.791139790458981, + "learning_rate": 4.9815714716336566e-05, + "loss": 2.1594, + "mean_token_accuracy": 0.4413793087005615, + "step": 88095 + }, + { + "epoch": 0.08873525569552426, + "grad_norm": 11.287480888915699, + "learning_rate": 4.981566685271262e-05, + "loss": 2.7638, + "mean_token_accuracy": 0.31379309892654417, + "step": 88100 + }, + { + "epoch": 0.08874029174862844, + "grad_norm": 13.451286924950844, + "learning_rate": 4.981561898289934e-05, + "loss": 2.2972, + "mean_token_accuracy": 0.46551724076271056, + "step": 88105 + }, + { + "epoch": 0.08874532780173261, + "grad_norm": 11.763888455204093, + "learning_rate": 4.981557110689674e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.4068965554237366, + "step": 88110 + }, + { + "epoch": 0.08875036385483677, + "grad_norm": 13.99681202925253, + "learning_rate": 4.981552322470484e-05, + "loss": 2.8444, + "mean_token_accuracy": 0.3827586233615875, + "step": 88115 + }, + { + "epoch": 0.08875539990794094, + "grad_norm": 11.451954376677241, + "learning_rate": 4.981547533632364e-05, + "loss": 2.548, + "mean_token_accuracy": 0.42413793206214906, + "step": 88120 + }, + { + "epoch": 0.08876043596104512, + "grad_norm": 8.937528006109897, + "learning_rate": 4.981542744175316e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.47586206793785096, + "step": 88125 + }, + { + "epoch": 0.08876547201414929, + "grad_norm": 15.956200834208557, + "learning_rate": 4.981537954099341e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.45517241954803467, + "step": 88130 + }, + { + "epoch": 0.08877050806725346, + "grad_norm": 13.398975260285793, + "learning_rate": 4.9815331634044414e-05, + "loss": 2.9357, + "mean_token_accuracy": 0.3655172407627106, + "step": 88135 + }, + { + "epoch": 0.08877554412035764, + "grad_norm": 12.037393996641288, + "learning_rate": 4.9815283720906174e-05, + "loss": 2.8407, + "mean_token_accuracy": 0.3931034505367279, + "step": 88140 + }, + { + "epoch": 0.08878058017346181, + "grad_norm": 9.425014503997332, + "learning_rate": 4.9815235801578706e-05, + "loss": 2.5829, + "mean_token_accuracy": 0.4206896543502808, + "step": 88145 + }, + { + "epoch": 0.08878561622656599, + "grad_norm": 10.755955148242137, + "learning_rate": 4.981518787606203e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.38620689511299133, + "step": 88150 + }, + { + "epoch": 0.08879065227967016, + "grad_norm": 10.968081030510584, + "learning_rate": 4.981513994435614e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.42758620977401735, + "step": 88155 + }, + { + "epoch": 0.08879568833277433, + "grad_norm": 11.238602196070923, + "learning_rate": 4.981509200646107e-05, + "loss": 2.4673, + "mean_token_accuracy": 0.43793103098869324, + "step": 88160 + }, + { + "epoch": 0.08880072438587851, + "grad_norm": 9.20094631117747, + "learning_rate": 4.9815044062376825e-05, + "loss": 2.16, + "mean_token_accuracy": 0.5, + "step": 88165 + }, + { + "epoch": 0.08880576043898268, + "grad_norm": 11.46798057363033, + "learning_rate": 4.981499611210342e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.34137930870056155, + "step": 88170 + }, + { + "epoch": 0.08881079649208685, + "grad_norm": 11.340504787504353, + "learning_rate": 4.9814948155640875e-05, + "loss": 2.4942, + "mean_token_accuracy": 0.4275861978530884, + "step": 88175 + }, + { + "epoch": 0.08881583254519103, + "grad_norm": 16.418668225567245, + "learning_rate": 4.981490019298919e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.41724138259887694, + "step": 88180 + }, + { + "epoch": 0.08882086859829519, + "grad_norm": 10.471701628887658, + "learning_rate": 4.981485222414838e-05, + "loss": 2.8361, + "mean_token_accuracy": 0.39310344457626345, + "step": 88185 + }, + { + "epoch": 0.08882590465139936, + "grad_norm": 9.464536189225788, + "learning_rate": 4.981480424911846e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.46412583589553835, + "step": 88190 + }, + { + "epoch": 0.08883094070450354, + "grad_norm": 11.984185953018105, + "learning_rate": 4.981475626789946e-05, + "loss": 2.8221, + "mean_token_accuracy": 0.4, + "step": 88195 + }, + { + "epoch": 0.08883597675760771, + "grad_norm": 12.122517540991241, + "learning_rate": 4.981470828049137e-05, + "loss": 2.483, + "mean_token_accuracy": 0.42068966031074523, + "step": 88200 + }, + { + "epoch": 0.08884101281071188, + "grad_norm": 10.55925564768586, + "learning_rate": 4.981466028689421e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4689655125141144, + "step": 88205 + }, + { + "epoch": 0.08884604886381606, + "grad_norm": 11.945111735010908, + "learning_rate": 4.9814612287108006e-05, + "loss": 2.5595, + "mean_token_accuracy": 0.4206896543502808, + "step": 88210 + }, + { + "epoch": 0.08885108491692023, + "grad_norm": 9.748495150875272, + "learning_rate": 4.981456428113275e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.4172413766384125, + "step": 88215 + }, + { + "epoch": 0.0888561209700244, + "grad_norm": 13.19253592187581, + "learning_rate": 4.981451626896847e-05, + "loss": 2.3144, + "mean_token_accuracy": 0.43448275327682495, + "step": 88220 + }, + { + "epoch": 0.08886115702312858, + "grad_norm": 9.833835454451819, + "learning_rate": 4.981446825061518e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.44482759237289426, + "step": 88225 + }, + { + "epoch": 0.08886619307623275, + "grad_norm": 9.757708002450425, + "learning_rate": 4.981442022607288e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.45347853302955626, + "step": 88230 + }, + { + "epoch": 0.08887122912933693, + "grad_norm": 9.55098972091099, + "learning_rate": 4.9814372195341605e-05, + "loss": 2.1682, + "mean_token_accuracy": 0.4344827592372894, + "step": 88235 + }, + { + "epoch": 0.0888762651824411, + "grad_norm": 10.44393236548825, + "learning_rate": 4.9814324158421345e-05, + "loss": 2.2303, + "mean_token_accuracy": 0.47586206197738645, + "step": 88240 + }, + { + "epoch": 0.08888130123554527, + "grad_norm": 11.81333264912459, + "learning_rate": 4.981427611531213e-05, + "loss": 2.5796, + "mean_token_accuracy": 0.39655172228813174, + "step": 88245 + }, + { + "epoch": 0.08888633728864945, + "grad_norm": 10.149998411617167, + "learning_rate": 4.981422806601396e-05, + "loss": 2.1921, + "mean_token_accuracy": 0.4620689570903778, + "step": 88250 + }, + { + "epoch": 0.08889137334175361, + "grad_norm": 10.616800871727099, + "learning_rate": 4.981418001052686e-05, + "loss": 2.684, + "mean_token_accuracy": 0.3655172407627106, + "step": 88255 + }, + { + "epoch": 0.08889640939485778, + "grad_norm": 14.241874143915625, + "learning_rate": 4.981413194885084e-05, + "loss": 2.7456, + "mean_token_accuracy": 0.36206896007061007, + "step": 88260 + }, + { + "epoch": 0.08890144544796195, + "grad_norm": 12.79969314837717, + "learning_rate": 4.981408388098591e-05, + "loss": 2.1283, + "mean_token_accuracy": 0.42413792610168455, + "step": 88265 + }, + { + "epoch": 0.08890648150106613, + "grad_norm": 17.055605273053796, + "learning_rate": 4.981403580693208e-05, + "loss": 2.418, + "mean_token_accuracy": 0.4724137902259827, + "step": 88270 + }, + { + "epoch": 0.0889115175541703, + "grad_norm": 11.344691468805907, + "learning_rate": 4.9813987726689385e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.41979949474334716, + "step": 88275 + }, + { + "epoch": 0.08891655360727448, + "grad_norm": 11.564263236144832, + "learning_rate": 4.9813939640257806e-05, + "loss": 2.4473, + "mean_token_accuracy": 0.45517240166664125, + "step": 88280 + }, + { + "epoch": 0.08892158966037865, + "grad_norm": 14.324921227716956, + "learning_rate": 4.9813891547637385e-05, + "loss": 2.7665, + "mean_token_accuracy": 0.42413793206214906, + "step": 88285 + }, + { + "epoch": 0.08892662571348282, + "grad_norm": 10.279499215315559, + "learning_rate": 4.981384344882812e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.4344827592372894, + "step": 88290 + }, + { + "epoch": 0.088931661766587, + "grad_norm": 12.210893266693814, + "learning_rate": 4.981379534383002e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.43793103098869324, + "step": 88295 + }, + { + "epoch": 0.08893669781969117, + "grad_norm": 12.192906495433, + "learning_rate": 4.981374723264311e-05, + "loss": 2.267, + "mean_token_accuracy": 0.45172413885593415, + "step": 88300 + }, + { + "epoch": 0.08894173387279534, + "grad_norm": 13.580096100733952, + "learning_rate": 4.98136991152674e-05, + "loss": 3.1224, + "mean_token_accuracy": 0.3448275804519653, + "step": 88305 + }, + { + "epoch": 0.08894676992589952, + "grad_norm": 11.121474910423105, + "learning_rate": 4.98136509917029e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.4517241418361664, + "step": 88310 + }, + { + "epoch": 0.08895180597900369, + "grad_norm": 12.726503836828549, + "learning_rate": 4.9813602861949623e-05, + "loss": 2.8347, + "mean_token_accuracy": 0.3482758551836014, + "step": 88315 + }, + { + "epoch": 0.08895684203210787, + "grad_norm": 11.227749281178523, + "learning_rate": 4.981355472600759e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.3827586114406586, + "step": 88320 + }, + { + "epoch": 0.08896187808521203, + "grad_norm": 11.796305929356699, + "learning_rate": 4.9813506583876815e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.37586206793785093, + "step": 88325 + }, + { + "epoch": 0.0889669141383162, + "grad_norm": 8.585885773844021, + "learning_rate": 4.9813458435557296e-05, + "loss": 2.349, + "mean_token_accuracy": 0.42758620381355283, + "step": 88330 + }, + { + "epoch": 0.08897195019142037, + "grad_norm": 14.752013523218668, + "learning_rate": 4.981341028104906e-05, + "loss": 2.6184, + "mean_token_accuracy": 0.34137930870056155, + "step": 88335 + }, + { + "epoch": 0.08897698624452455, + "grad_norm": 13.64467951166241, + "learning_rate": 4.9813362120352115e-05, + "loss": 2.7721, + "mean_token_accuracy": 0.37241379618644715, + "step": 88340 + }, + { + "epoch": 0.08898202229762872, + "grad_norm": 11.182612266176323, + "learning_rate": 4.981331395346648e-05, + "loss": 2.6699, + "mean_token_accuracy": 0.42758620381355283, + "step": 88345 + }, + { + "epoch": 0.0889870583507329, + "grad_norm": 11.778045830837229, + "learning_rate": 4.981326578039216e-05, + "loss": 2.8188, + "mean_token_accuracy": 0.3571082890033722, + "step": 88350 + }, + { + "epoch": 0.08899209440383707, + "grad_norm": 13.343582253881378, + "learning_rate": 4.981321760112917e-05, + "loss": 2.5702, + "mean_token_accuracy": 0.42413792610168455, + "step": 88355 + }, + { + "epoch": 0.08899713045694124, + "grad_norm": 10.803557516748283, + "learning_rate": 4.981316941567754e-05, + "loss": 2.5504, + "mean_token_accuracy": 0.3965517163276672, + "step": 88360 + }, + { + "epoch": 0.08900216651004542, + "grad_norm": 10.172451433090384, + "learning_rate": 4.981312122403725e-05, + "loss": 2.5646, + "mean_token_accuracy": 0.37931033968925476, + "step": 88365 + }, + { + "epoch": 0.08900720256314959, + "grad_norm": 12.586700150046994, + "learning_rate": 4.9813073026208336e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.41034482717514037, + "step": 88370 + }, + { + "epoch": 0.08901223861625376, + "grad_norm": 9.05318852082888, + "learning_rate": 4.981302482219082e-05, + "loss": 2.108, + "mean_token_accuracy": 0.47586206793785096, + "step": 88375 + }, + { + "epoch": 0.08901727466935794, + "grad_norm": 10.418086001980356, + "learning_rate": 4.981297661198469e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.46551724672317507, + "step": 88380 + }, + { + "epoch": 0.08902231072246211, + "grad_norm": 12.757490771064086, + "learning_rate": 4.981292839558999e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.3793103337287903, + "step": 88385 + }, + { + "epoch": 0.08902734677556629, + "grad_norm": 11.454282546502752, + "learning_rate": 4.98128801730067e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.4413793087005615, + "step": 88390 + }, + { + "epoch": 0.08903238282867044, + "grad_norm": 19.536367320268926, + "learning_rate": 4.9812831944234853e-05, + "loss": 2.6747, + "mean_token_accuracy": 0.4103448212146759, + "step": 88395 + }, + { + "epoch": 0.08903741888177462, + "grad_norm": 9.838760332683808, + "learning_rate": 4.9812783709274455e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.41185722351074217, + "step": 88400 + }, + { + "epoch": 0.08904245493487879, + "grad_norm": 8.821647631950514, + "learning_rate": 4.981273546812553e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.4310344815254211, + "step": 88405 + }, + { + "epoch": 0.08904749098798297, + "grad_norm": 10.797919908335528, + "learning_rate": 4.9812687220788085e-05, + "loss": 2.1077, + "mean_token_accuracy": 0.45862069725990295, + "step": 88410 + }, + { + "epoch": 0.08905252704108714, + "grad_norm": 10.20014079587903, + "learning_rate": 4.9812638967262134e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4673926293849945, + "step": 88415 + }, + { + "epoch": 0.08905756309419131, + "grad_norm": 19.3823305401709, + "learning_rate": 4.981259070754768e-05, + "loss": 2.4823, + "mean_token_accuracy": 0.4517241418361664, + "step": 88420 + }, + { + "epoch": 0.08906259914729549, + "grad_norm": 10.268804649975353, + "learning_rate": 4.981254244164475e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.4103448212146759, + "step": 88425 + }, + { + "epoch": 0.08906763520039966, + "grad_norm": 11.306011665288132, + "learning_rate": 4.981249416955336e-05, + "loss": 2.4799, + "mean_token_accuracy": 0.4068965494632721, + "step": 88430 + }, + { + "epoch": 0.08907267125350384, + "grad_norm": 13.320936019408437, + "learning_rate": 4.981244589127351e-05, + "loss": 2.7569, + "mean_token_accuracy": 0.41034482717514037, + "step": 88435 + }, + { + "epoch": 0.08907770730660801, + "grad_norm": 9.832002826380656, + "learning_rate": 4.9812397606805224e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.4310344815254211, + "step": 88440 + }, + { + "epoch": 0.08908274335971218, + "grad_norm": 12.017551111659943, + "learning_rate": 4.9812349316148507e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4310344815254211, + "step": 88445 + }, + { + "epoch": 0.08908777941281636, + "grad_norm": 15.94137603383071, + "learning_rate": 4.981230101930337e-05, + "loss": 2.576, + "mean_token_accuracy": 0.4448275864124298, + "step": 88450 + }, + { + "epoch": 0.08909281546592053, + "grad_norm": 13.717884695676045, + "learning_rate": 4.981225271626985e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.34827586114406583, + "step": 88455 + }, + { + "epoch": 0.0890978515190247, + "grad_norm": 15.247413543689415, + "learning_rate": 4.981220440704794e-05, + "loss": 2.5534, + "mean_token_accuracy": 0.3724137932062149, + "step": 88460 + }, + { + "epoch": 0.08910288757212886, + "grad_norm": 9.403994355589962, + "learning_rate": 4.9812156091637645e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.4620689630508423, + "step": 88465 + }, + { + "epoch": 0.08910792362523304, + "grad_norm": 13.912584185858721, + "learning_rate": 4.9812107770039004e-05, + "loss": 2.8523, + "mean_token_accuracy": 0.38620689511299133, + "step": 88470 + }, + { + "epoch": 0.08911295967833721, + "grad_norm": 13.406585293328076, + "learning_rate": 4.981205944225201e-05, + "loss": 2.3045, + "mean_token_accuracy": 0.46551724076271056, + "step": 88475 + }, + { + "epoch": 0.08911799573144139, + "grad_norm": 10.726055731378517, + "learning_rate": 4.981201110827668e-05, + "loss": 2.3646, + "mean_token_accuracy": 0.4413793206214905, + "step": 88480 + }, + { + "epoch": 0.08912303178454556, + "grad_norm": 8.211322244315545, + "learning_rate": 4.981196276811303e-05, + "loss": 2.3994, + "mean_token_accuracy": 0.4931034505367279, + "step": 88485 + }, + { + "epoch": 0.08912806783764973, + "grad_norm": 10.091749856479506, + "learning_rate": 4.981191442176108e-05, + "loss": 2.0191, + "mean_token_accuracy": 0.5000000059604645, + "step": 88490 + }, + { + "epoch": 0.0891331038907539, + "grad_norm": 16.788940807438482, + "learning_rate": 4.981186606922083e-05, + "loss": 2.4965, + "mean_token_accuracy": 0.4517241418361664, + "step": 88495 + }, + { + "epoch": 0.08913813994385808, + "grad_norm": 10.716928908616541, + "learning_rate": 4.981181771049231e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.40344826579093934, + "step": 88500 + }, + { + "epoch": 0.08914317599696225, + "grad_norm": 9.822811137664917, + "learning_rate": 4.981176934557552e-05, + "loss": 2.3252, + "mean_token_accuracy": 0.42758620381355283, + "step": 88505 + }, + { + "epoch": 0.08914821205006643, + "grad_norm": 11.311884010511532, + "learning_rate": 4.9811720974470475e-05, + "loss": 2.1513, + "mean_token_accuracy": 0.495160311460495, + "step": 88510 + }, + { + "epoch": 0.0891532481031706, + "grad_norm": 11.551471667786691, + "learning_rate": 4.981167259717719e-05, + "loss": 2.8687, + "mean_token_accuracy": 0.34137930572032926, + "step": 88515 + }, + { + "epoch": 0.08915828415627478, + "grad_norm": 10.224425634835514, + "learning_rate": 4.981162421369569e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.42758620381355283, + "step": 88520 + }, + { + "epoch": 0.08916332020937895, + "grad_norm": 10.120152346658063, + "learning_rate": 4.981157582402596e-05, + "loss": 2.5911, + "mean_token_accuracy": 0.39461584091186525, + "step": 88525 + }, + { + "epoch": 0.08916835626248312, + "grad_norm": 9.372358473185935, + "learning_rate": 4.981152742816804e-05, + "loss": 2.2235, + "mean_token_accuracy": 0.46896551847457885, + "step": 88530 + }, + { + "epoch": 0.08917339231558728, + "grad_norm": 12.72747694610874, + "learning_rate": 4.9811479026121945e-05, + "loss": 2.4633, + "mean_token_accuracy": 0.41034482717514037, + "step": 88535 + }, + { + "epoch": 0.08917842836869146, + "grad_norm": 11.283195217690508, + "learning_rate": 4.981143061788767e-05, + "loss": 2.0434, + "mean_token_accuracy": 0.441379314661026, + "step": 88540 + }, + { + "epoch": 0.08918346442179563, + "grad_norm": 11.617391086592413, + "learning_rate": 4.981138220346523e-05, + "loss": 2.6947, + "mean_token_accuracy": 0.4000000059604645, + "step": 88545 + }, + { + "epoch": 0.0891885004748998, + "grad_norm": 14.80718775723769, + "learning_rate": 4.981133378285465e-05, + "loss": 2.6486, + "mean_token_accuracy": 0.3965517163276672, + "step": 88550 + }, + { + "epoch": 0.08919353652800398, + "grad_norm": 10.813875284555783, + "learning_rate": 4.981128535605594e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.4586206912994385, + "step": 88555 + }, + { + "epoch": 0.08919857258110815, + "grad_norm": 11.291826413437617, + "learning_rate": 4.981123692306911e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.45535390377044677, + "step": 88560 + }, + { + "epoch": 0.08920360863421233, + "grad_norm": 11.697506059245041, + "learning_rate": 4.981118848389418e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.4068965554237366, + "step": 88565 + }, + { + "epoch": 0.0892086446873165, + "grad_norm": 10.997245902251391, + "learning_rate": 4.9811140038531154e-05, + "loss": 2.5946, + "mean_token_accuracy": 0.417241370677948, + "step": 88570 + }, + { + "epoch": 0.08921368074042067, + "grad_norm": 10.083958968404207, + "learning_rate": 4.9811091586980046e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.42413792610168455, + "step": 88575 + }, + { + "epoch": 0.08921871679352485, + "grad_norm": 8.281066632901785, + "learning_rate": 4.981104312924088e-05, + "loss": 2.486, + "mean_token_accuracy": 0.4344827592372894, + "step": 88580 + }, + { + "epoch": 0.08922375284662902, + "grad_norm": 13.570746231715745, + "learning_rate": 4.981099466531366e-05, + "loss": 2.7587, + "mean_token_accuracy": 0.44827585816383364, + "step": 88585 + }, + { + "epoch": 0.0892287888997332, + "grad_norm": 11.792382226269343, + "learning_rate": 4.9810946195198404e-05, + "loss": 2.5379, + "mean_token_accuracy": 0.4310344815254211, + "step": 88590 + }, + { + "epoch": 0.08923382495283737, + "grad_norm": 9.3939267167085, + "learning_rate": 4.9810897718895124e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.43103447556495667, + "step": 88595 + }, + { + "epoch": 0.08923886100594154, + "grad_norm": 10.139717915996181, + "learning_rate": 4.981084923640383e-05, + "loss": 2.6587, + "mean_token_accuracy": 0.41881427466869353, + "step": 88600 + }, + { + "epoch": 0.0892438970590457, + "grad_norm": 12.457259495789197, + "learning_rate": 4.981080074772454e-05, + "loss": 2.0711, + "mean_token_accuracy": 0.4862069010734558, + "step": 88605 + }, + { + "epoch": 0.08924893311214988, + "grad_norm": 11.01616087442303, + "learning_rate": 4.9810752252857274e-05, + "loss": 2.7153, + "mean_token_accuracy": 0.37241379022598264, + "step": 88610 + }, + { + "epoch": 0.08925396916525405, + "grad_norm": 12.707151120157436, + "learning_rate": 4.981070375180203e-05, + "loss": 2.4768, + "mean_token_accuracy": 0.43793103098869324, + "step": 88615 + }, + { + "epoch": 0.08925900521835822, + "grad_norm": 11.747031254045655, + "learning_rate": 4.9810655244558826e-05, + "loss": 2.6156, + "mean_token_accuracy": 0.4, + "step": 88620 + }, + { + "epoch": 0.0892640412714624, + "grad_norm": 12.035667167847626, + "learning_rate": 4.9810606731127686e-05, + "loss": 2.6258, + "mean_token_accuracy": 0.38965516686439516, + "step": 88625 + }, + { + "epoch": 0.08926907732456657, + "grad_norm": 10.607826688754747, + "learning_rate": 4.981055821150861e-05, + "loss": 2.5199, + "mean_token_accuracy": 0.4256503224372864, + "step": 88630 + }, + { + "epoch": 0.08927411337767074, + "grad_norm": 11.601784277743983, + "learning_rate": 4.981050968570162e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.42758620977401735, + "step": 88635 + }, + { + "epoch": 0.08927914943077492, + "grad_norm": 11.320044625146535, + "learning_rate": 4.9810461153706726e-05, + "loss": 2.5633, + "mean_token_accuracy": 0.4000000059604645, + "step": 88640 + }, + { + "epoch": 0.08928418548387909, + "grad_norm": 16.309084913136417, + "learning_rate": 4.981041261552394e-05, + "loss": 2.8536, + "mean_token_accuracy": 0.36551723480224607, + "step": 88645 + }, + { + "epoch": 0.08928922153698327, + "grad_norm": 11.73820038773606, + "learning_rate": 4.981036407115329e-05, + "loss": 2.0621, + "mean_token_accuracy": 0.517241370677948, + "step": 88650 + }, + { + "epoch": 0.08929425759008744, + "grad_norm": 9.873145075753127, + "learning_rate": 4.9810315520594765e-05, + "loss": 2.1935, + "mean_token_accuracy": 0.42413792610168455, + "step": 88655 + }, + { + "epoch": 0.08929929364319161, + "grad_norm": 12.225217101135746, + "learning_rate": 4.981026696384839e-05, + "loss": 2.3013, + "mean_token_accuracy": 0.4586206912994385, + "step": 88660 + }, + { + "epoch": 0.08930432969629579, + "grad_norm": 10.782369566066784, + "learning_rate": 4.981021840091419e-05, + "loss": 2.404, + "mean_token_accuracy": 0.42758620381355283, + "step": 88665 + }, + { + "epoch": 0.08930936574939996, + "grad_norm": 10.959229780250883, + "learning_rate": 4.9810169831792156e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.4206896543502808, + "step": 88670 + }, + { + "epoch": 0.08931440180250412, + "grad_norm": 10.128813364265458, + "learning_rate": 4.9810121256482325e-05, + "loss": 2.6536, + "mean_token_accuracy": 0.4206896543502808, + "step": 88675 + }, + { + "epoch": 0.0893194378556083, + "grad_norm": 11.530614708630122, + "learning_rate": 4.981007267498469e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.3896551787853241, + "step": 88680 + }, + { + "epoch": 0.08932447390871247, + "grad_norm": 13.54442770991281, + "learning_rate": 4.981002408729927e-05, + "loss": 2.6123, + "mean_token_accuracy": 0.3862068891525269, + "step": 88685 + }, + { + "epoch": 0.08932950996181664, + "grad_norm": 12.256078575058927, + "learning_rate": 4.9809975493426085e-05, + "loss": 2.0949, + "mean_token_accuracy": 0.47241379618644713, + "step": 88690 + }, + { + "epoch": 0.08933454601492082, + "grad_norm": 11.681332258014175, + "learning_rate": 4.980992689336515e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.443315190076828, + "step": 88695 + }, + { + "epoch": 0.08933958206802499, + "grad_norm": 19.221952606856398, + "learning_rate": 4.9809878287116466e-05, + "loss": 2.2652, + "mean_token_accuracy": 0.4620689690113068, + "step": 88700 + }, + { + "epoch": 0.08934461812112916, + "grad_norm": 10.748527206752398, + "learning_rate": 4.9809829674680064e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.42413792610168455, + "step": 88705 + }, + { + "epoch": 0.08934965417423334, + "grad_norm": 19.02627348692893, + "learning_rate": 4.980978105605594e-05, + "loss": 2.7091, + "mean_token_accuracy": 0.34482758641242983, + "step": 88710 + }, + { + "epoch": 0.08935469022733751, + "grad_norm": 12.880097884012867, + "learning_rate": 4.980973243124412e-05, + "loss": 2.594, + "mean_token_accuracy": 0.3896551728248596, + "step": 88715 + }, + { + "epoch": 0.08935972628044168, + "grad_norm": 11.691876961986177, + "learning_rate": 4.980968380024461e-05, + "loss": 2.2871, + "mean_token_accuracy": 0.4344827592372894, + "step": 88720 + }, + { + "epoch": 0.08936476233354586, + "grad_norm": 9.820351069932935, + "learning_rate": 4.980963516305742e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.5137930929660797, + "step": 88725 + }, + { + "epoch": 0.08936979838665003, + "grad_norm": 11.811294119579873, + "learning_rate": 4.980958651968258e-05, + "loss": 2.5051, + "mean_token_accuracy": 0.3931034505367279, + "step": 88730 + }, + { + "epoch": 0.0893748344397542, + "grad_norm": 9.730203203550879, + "learning_rate": 4.980953787012008e-05, + "loss": 2.0427, + "mean_token_accuracy": 0.4931034505367279, + "step": 88735 + }, + { + "epoch": 0.08937987049285838, + "grad_norm": 25.87116323521781, + "learning_rate": 4.980948921436996e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.47586207985877993, + "step": 88740 + }, + { + "epoch": 0.08938490654596254, + "grad_norm": 12.112253550446777, + "learning_rate": 4.980944055243221e-05, + "loss": 2.4128, + "mean_token_accuracy": 0.3655172437429428, + "step": 88745 + }, + { + "epoch": 0.08938994259906671, + "grad_norm": 12.748044329990211, + "learning_rate": 4.980939188430686e-05, + "loss": 2.509, + "mean_token_accuracy": 0.4379310369491577, + "step": 88750 + }, + { + "epoch": 0.08939497865217089, + "grad_norm": 15.257796320872282, + "learning_rate": 4.9809343209993917e-05, + "loss": 2.6401, + "mean_token_accuracy": 0.39818512797355654, + "step": 88755 + }, + { + "epoch": 0.08940001470527506, + "grad_norm": 15.912562536718099, + "learning_rate": 4.980929452949339e-05, + "loss": 2.4125, + "mean_token_accuracy": 0.38965516686439516, + "step": 88760 + }, + { + "epoch": 0.08940505075837923, + "grad_norm": 11.136093316441404, + "learning_rate": 4.98092458428053e-05, + "loss": 2.7067, + "mean_token_accuracy": 0.3551724135875702, + "step": 88765 + }, + { + "epoch": 0.08941008681148341, + "grad_norm": 11.794137154006105, + "learning_rate": 4.980919714992965e-05, + "loss": 2.2529, + "mean_token_accuracy": 0.4641863226890564, + "step": 88770 + }, + { + "epoch": 0.08941512286458758, + "grad_norm": 18.425034497237206, + "learning_rate": 4.980914845086647e-05, + "loss": 2.718, + "mean_token_accuracy": 0.42413792610168455, + "step": 88775 + }, + { + "epoch": 0.08942015891769176, + "grad_norm": 14.373899774130383, + "learning_rate": 4.980909974561577e-05, + "loss": 2.3775, + "mean_token_accuracy": 0.4241379380226135, + "step": 88780 + }, + { + "epoch": 0.08942519497079593, + "grad_norm": 10.927495392278031, + "learning_rate": 4.9809051034177545e-05, + "loss": 2.588, + "mean_token_accuracy": 0.3517241358757019, + "step": 88785 + }, + { + "epoch": 0.0894302310239001, + "grad_norm": 10.760302741924633, + "learning_rate": 4.980900231655182e-05, + "loss": 2.344, + "mean_token_accuracy": 0.4034482717514038, + "step": 88790 + }, + { + "epoch": 0.08943526707700428, + "grad_norm": 11.710142723926088, + "learning_rate": 4.980895359273862e-05, + "loss": 2.504, + "mean_token_accuracy": 0.3931034505367279, + "step": 88795 + }, + { + "epoch": 0.08944030313010845, + "grad_norm": 11.430697738447106, + "learning_rate": 4.980890486273794e-05, + "loss": 2.7736, + "mean_token_accuracy": 0.3896551728248596, + "step": 88800 + }, + { + "epoch": 0.08944533918321262, + "grad_norm": 10.420876302055264, + "learning_rate": 4.9808856126549815e-05, + "loss": 2.4461, + "mean_token_accuracy": 0.4034482777118683, + "step": 88805 + }, + { + "epoch": 0.0894503752363168, + "grad_norm": 9.85966700874723, + "learning_rate": 4.9808807384174235e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.38965516686439516, + "step": 88810 + }, + { + "epoch": 0.08945541128942096, + "grad_norm": 14.021783881127808, + "learning_rate": 4.980875863561122e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.3827586114406586, + "step": 88815 + }, + { + "epoch": 0.08946044734252513, + "grad_norm": 10.97386090813227, + "learning_rate": 4.98087098808608e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.4068965494632721, + "step": 88820 + }, + { + "epoch": 0.0894654833956293, + "grad_norm": 12.39595984720901, + "learning_rate": 4.9808661119922975e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.4034482717514038, + "step": 88825 + }, + { + "epoch": 0.08947051944873348, + "grad_norm": 9.059698588952815, + "learning_rate": 4.980861235279774e-05, + "loss": 2.3682, + "mean_token_accuracy": 0.41379310488700866, + "step": 88830 + }, + { + "epoch": 0.08947555550183765, + "grad_norm": 8.812770000059357, + "learning_rate": 4.980856357948515e-05, + "loss": 2.7104, + "mean_token_accuracy": 0.41034482717514037, + "step": 88835 + }, + { + "epoch": 0.08948059155494183, + "grad_norm": 9.401394732155904, + "learning_rate": 4.980851479998519e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.4206896543502808, + "step": 88840 + }, + { + "epoch": 0.089485627608046, + "grad_norm": 10.184007204548672, + "learning_rate": 4.980846601429788e-05, + "loss": 2.1776, + "mean_token_accuracy": 0.44482759237289426, + "step": 88845 + }, + { + "epoch": 0.08949066366115017, + "grad_norm": 11.269501231204929, + "learning_rate": 4.980841722242323e-05, + "loss": 2.1187, + "mean_token_accuracy": 0.4758620738983154, + "step": 88850 + }, + { + "epoch": 0.08949569971425435, + "grad_norm": 11.630401138207038, + "learning_rate": 4.9808368424361265e-05, + "loss": 2.305, + "mean_token_accuracy": 0.40344826579093934, + "step": 88855 + }, + { + "epoch": 0.08950073576735852, + "grad_norm": 12.86499098309019, + "learning_rate": 4.980831962011198e-05, + "loss": 2.5492, + "mean_token_accuracy": 0.42413793206214906, + "step": 88860 + }, + { + "epoch": 0.0895057718204627, + "grad_norm": 10.232198629674695, + "learning_rate": 4.9808270809675406e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.47241379618644713, + "step": 88865 + }, + { + "epoch": 0.08951080787356687, + "grad_norm": 11.246561723037441, + "learning_rate": 4.9808221993051554e-05, + "loss": 2.5824, + "mean_token_accuracy": 0.3862068921327591, + "step": 88870 + }, + { + "epoch": 0.08951584392667104, + "grad_norm": 10.776144683650923, + "learning_rate": 4.980817317024043e-05, + "loss": 2.1402, + "mean_token_accuracy": 0.4413793087005615, + "step": 88875 + }, + { + "epoch": 0.08952087997977522, + "grad_norm": 12.116261601442222, + "learning_rate": 4.980812434124204e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.4, + "step": 88880 + }, + { + "epoch": 0.08952591603287938, + "grad_norm": 10.781475579518753, + "learning_rate": 4.9808075506056426e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.3517241418361664, + "step": 88885 + }, + { + "epoch": 0.08953095208598355, + "grad_norm": 9.480294957987562, + "learning_rate": 4.980802666468358e-05, + "loss": 2.3973, + "mean_token_accuracy": 0.4172413766384125, + "step": 88890 + }, + { + "epoch": 0.08953598813908772, + "grad_norm": 11.729358087061678, + "learning_rate": 4.9807977817123514e-05, + "loss": 3.0077, + "mean_token_accuracy": 0.39655172228813174, + "step": 88895 + }, + { + "epoch": 0.0895410241921919, + "grad_norm": 12.384990244062646, + "learning_rate": 4.980792896337625e-05, + "loss": 2.0314, + "mean_token_accuracy": 0.44827585816383364, + "step": 88900 + }, + { + "epoch": 0.08954606024529607, + "grad_norm": 11.668598783205345, + "learning_rate": 4.980788010344179e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.4931034505367279, + "step": 88905 + }, + { + "epoch": 0.08955109629840025, + "grad_norm": 16.795184590849257, + "learning_rate": 4.9807831237320166e-05, + "loss": 2.664, + "mean_token_accuracy": 0.38620689511299133, + "step": 88910 + }, + { + "epoch": 0.08955613235150442, + "grad_norm": 12.34637462326204, + "learning_rate": 4.9807782365011385e-05, + "loss": 2.7931, + "mean_token_accuracy": 0.36551723480224607, + "step": 88915 + }, + { + "epoch": 0.0895611684046086, + "grad_norm": 10.34173261495006, + "learning_rate": 4.980773348651545e-05, + "loss": 2.5549, + "mean_token_accuracy": 0.4379310250282288, + "step": 88920 + }, + { + "epoch": 0.08956620445771277, + "grad_norm": 10.658613668468902, + "learning_rate": 4.980768460183238e-05, + "loss": 2.9469, + "mean_token_accuracy": 0.37586206793785093, + "step": 88925 + }, + { + "epoch": 0.08957124051081694, + "grad_norm": 10.716763629797212, + "learning_rate": 4.9807635710962196e-05, + "loss": 2.6615, + "mean_token_accuracy": 0.4034482717514038, + "step": 88930 + }, + { + "epoch": 0.08957627656392111, + "grad_norm": 11.284583244256021, + "learning_rate": 4.980758681390491e-05, + "loss": 2.5194, + "mean_token_accuracy": 0.4310344815254211, + "step": 88935 + }, + { + "epoch": 0.08958131261702529, + "grad_norm": 10.856493708208125, + "learning_rate": 4.9807537910660526e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.3896551728248596, + "step": 88940 + }, + { + "epoch": 0.08958634867012946, + "grad_norm": 11.126605977991746, + "learning_rate": 4.9807489001229055e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.42413793206214906, + "step": 88945 + }, + { + "epoch": 0.08959138472323364, + "grad_norm": 12.783895315112714, + "learning_rate": 4.980744008561053e-05, + "loss": 2.2365, + "mean_token_accuracy": 0.458620685338974, + "step": 88950 + }, + { + "epoch": 0.0895964207763378, + "grad_norm": 11.93786668747794, + "learning_rate": 4.980739116380495e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.3862069010734558, + "step": 88955 + }, + { + "epoch": 0.08960145682944197, + "grad_norm": 9.96566782357653, + "learning_rate": 4.9807342235812334e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.458620685338974, + "step": 88960 + }, + { + "epoch": 0.08960649288254614, + "grad_norm": 9.591225869909564, + "learning_rate": 4.980729330163269e-05, + "loss": 2.2193, + "mean_token_accuracy": 0.4896551728248596, + "step": 88965 + }, + { + "epoch": 0.08961152893565032, + "grad_norm": 11.419279517587784, + "learning_rate": 4.980724436126604e-05, + "loss": 2.6288, + "mean_token_accuracy": 0.41034482717514037, + "step": 88970 + }, + { + "epoch": 0.08961656498875449, + "grad_norm": 9.68558232542842, + "learning_rate": 4.9807195414712385e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.5206896543502808, + "step": 88975 + }, + { + "epoch": 0.08962160104185866, + "grad_norm": 17.290443635755594, + "learning_rate": 4.980714646197175e-05, + "loss": 2.2139, + "mean_token_accuracy": 0.45517240166664125, + "step": 88980 + }, + { + "epoch": 0.08962663709496284, + "grad_norm": 11.754790486088517, + "learning_rate": 4.980709750304415e-05, + "loss": 2.7672, + "mean_token_accuracy": 0.39655172228813174, + "step": 88985 + }, + { + "epoch": 0.08963167314806701, + "grad_norm": 8.548199383403793, + "learning_rate": 4.980704853792958e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.4551724076271057, + "step": 88990 + }, + { + "epoch": 0.08963670920117119, + "grad_norm": 12.300607751217163, + "learning_rate": 4.9806999566628074e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.40169388651847837, + "step": 88995 + }, + { + "epoch": 0.08964174525427536, + "grad_norm": 11.928165946405505, + "learning_rate": 4.980695058913965e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.44482758045196535, + "step": 89000 + }, + { + "epoch": 0.08964678130737953, + "grad_norm": 12.320084509580049, + "learning_rate": 4.980690160546429e-05, + "loss": 2.4652, + "mean_token_accuracy": 0.44682395458221436, + "step": 89005 + }, + { + "epoch": 0.08965181736048371, + "grad_norm": 12.67278095080595, + "learning_rate": 4.980685261560204e-05, + "loss": 2.8141, + "mean_token_accuracy": 0.3586206823587418, + "step": 89010 + }, + { + "epoch": 0.08965685341358788, + "grad_norm": 12.892253987390248, + "learning_rate": 4.98068036195529e-05, + "loss": 2.2555, + "mean_token_accuracy": 0.47241379618644713, + "step": 89015 + }, + { + "epoch": 0.08966188946669205, + "grad_norm": 11.173543877242919, + "learning_rate": 4.980675461731689e-05, + "loss": 2.7552, + "mean_token_accuracy": 0.42413793206214906, + "step": 89020 + }, + { + "epoch": 0.08966692551979621, + "grad_norm": 14.26831942806606, + "learning_rate": 4.9806705608894004e-05, + "loss": 2.2437, + "mean_token_accuracy": 0.4517241418361664, + "step": 89025 + }, + { + "epoch": 0.08967196157290039, + "grad_norm": 11.138337558261712, + "learning_rate": 4.980665659428428e-05, + "loss": 2.8268, + "mean_token_accuracy": 0.4103448212146759, + "step": 89030 + }, + { + "epoch": 0.08967699762600456, + "grad_norm": 10.031194739446683, + "learning_rate": 4.980660757348772e-05, + "loss": 3.1029, + "mean_token_accuracy": 0.3896551728248596, + "step": 89035 + }, + { + "epoch": 0.08968203367910874, + "grad_norm": 9.442011550358695, + "learning_rate": 4.980655854650434e-05, + "loss": 2.2487, + "mean_token_accuracy": 0.42413793206214906, + "step": 89040 + }, + { + "epoch": 0.08968706973221291, + "grad_norm": 11.161102510269616, + "learning_rate": 4.980650951333414e-05, + "loss": 2.6213, + "mean_token_accuracy": 0.3931034505367279, + "step": 89045 + }, + { + "epoch": 0.08969210578531708, + "grad_norm": 10.34183894956065, + "learning_rate": 4.980646047397716e-05, + "loss": 2.4111, + "mean_token_accuracy": 0.41034482419490814, + "step": 89050 + }, + { + "epoch": 0.08969714183842126, + "grad_norm": 11.031198680088721, + "learning_rate": 4.98064114284334e-05, + "loss": 2.552, + "mean_token_accuracy": 0.3758620649576187, + "step": 89055 + }, + { + "epoch": 0.08970217789152543, + "grad_norm": 10.513530044492862, + "learning_rate": 4.980636237670287e-05, + "loss": 2.6727, + "mean_token_accuracy": 0.3482758641242981, + "step": 89060 + }, + { + "epoch": 0.0897072139446296, + "grad_norm": 12.392702818878702, + "learning_rate": 4.9806313318785586e-05, + "loss": 2.7229, + "mean_token_accuracy": 0.41724138855934145, + "step": 89065 + }, + { + "epoch": 0.08971224999773378, + "grad_norm": 10.058024018084051, + "learning_rate": 4.980626425468157e-05, + "loss": 2.058, + "mean_token_accuracy": 0.4689655125141144, + "step": 89070 + }, + { + "epoch": 0.08971728605083795, + "grad_norm": 9.750503298341185, + "learning_rate": 4.980621518439082e-05, + "loss": 2.3217, + "mean_token_accuracy": 0.4465819835662842, + "step": 89075 + }, + { + "epoch": 0.08972232210394213, + "grad_norm": 10.478791689941625, + "learning_rate": 4.9806166107913364e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.41379310488700866, + "step": 89080 + }, + { + "epoch": 0.0897273581570463, + "grad_norm": 11.212419021102791, + "learning_rate": 4.98061170252492e-05, + "loss": 2.5125, + "mean_token_accuracy": 0.43641863465309144, + "step": 89085 + }, + { + "epoch": 0.08973239421015047, + "grad_norm": 12.010196342465585, + "learning_rate": 4.9806067936398366e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.44827585816383364, + "step": 89090 + }, + { + "epoch": 0.08973743026325463, + "grad_norm": 11.746203634177599, + "learning_rate": 4.9806018841360855e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.42758620977401735, + "step": 89095 + }, + { + "epoch": 0.08974246631635881, + "grad_norm": 11.398230844590623, + "learning_rate": 4.9805969740136684e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.403448274731636, + "step": 89100 + }, + { + "epoch": 0.08974750236946298, + "grad_norm": 9.80675841969019, + "learning_rate": 4.9805920632725865e-05, + "loss": 2.1556, + "mean_token_accuracy": 0.4517241418361664, + "step": 89105 + }, + { + "epoch": 0.08975253842256715, + "grad_norm": 10.946226520240662, + "learning_rate": 4.980587151912842e-05, + "loss": 2.3536, + "mean_token_accuracy": 0.44700543880462645, + "step": 89110 + }, + { + "epoch": 0.08975757447567133, + "grad_norm": 10.053619413286702, + "learning_rate": 4.980582239934436e-05, + "loss": 2.157, + "mean_token_accuracy": 0.4620689690113068, + "step": 89115 + }, + { + "epoch": 0.0897626105287755, + "grad_norm": 11.446653916110963, + "learning_rate": 4.980577327337369e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.4758620738983154, + "step": 89120 + }, + { + "epoch": 0.08976764658187968, + "grad_norm": 12.066149768237091, + "learning_rate": 4.980572414121644e-05, + "loss": 2.8071, + "mean_token_accuracy": 0.39655172228813174, + "step": 89125 + }, + { + "epoch": 0.08977268263498385, + "grad_norm": 9.168009126835535, + "learning_rate": 4.980567500287261e-05, + "loss": 2.5513, + "mean_token_accuracy": 0.3965517282485962, + "step": 89130 + }, + { + "epoch": 0.08977771868808802, + "grad_norm": 12.152801571607393, + "learning_rate": 4.980562585834222e-05, + "loss": 2.8497, + "mean_token_accuracy": 0.3827586233615875, + "step": 89135 + }, + { + "epoch": 0.0897827547411922, + "grad_norm": 10.78609264106069, + "learning_rate": 4.980557670762528e-05, + "loss": 2.3863, + "mean_token_accuracy": 0.4537810027599335, + "step": 89140 + }, + { + "epoch": 0.08978779079429637, + "grad_norm": 12.53664476511771, + "learning_rate": 4.9805527550721805e-05, + "loss": 2.6806, + "mean_token_accuracy": 0.379310342669487, + "step": 89145 + }, + { + "epoch": 0.08979282684740054, + "grad_norm": 11.653323448303745, + "learning_rate": 4.9805478387631803e-05, + "loss": 2.5135, + "mean_token_accuracy": 0.41724138259887694, + "step": 89150 + }, + { + "epoch": 0.08979786290050472, + "grad_norm": 11.781907634974027, + "learning_rate": 4.9805429218355304e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.4034482717514038, + "step": 89155 + }, + { + "epoch": 0.08980289895360889, + "grad_norm": 11.120036669823298, + "learning_rate": 4.98053800428923e-05, + "loss": 2.2474, + "mean_token_accuracy": 0.4610405325889587, + "step": 89160 + }, + { + "epoch": 0.08980793500671305, + "grad_norm": 12.561784539326514, + "learning_rate": 4.9805330861242824e-05, + "loss": 2.7697, + "mean_token_accuracy": 0.3655172437429428, + "step": 89165 + }, + { + "epoch": 0.08981297105981723, + "grad_norm": 10.710851267986031, + "learning_rate": 4.9805281673406884e-05, + "loss": 2.8721, + "mean_token_accuracy": 0.3896551728248596, + "step": 89170 + }, + { + "epoch": 0.0898180071129214, + "grad_norm": 11.43177074189504, + "learning_rate": 4.9805232479384486e-05, + "loss": 2.2844, + "mean_token_accuracy": 0.42413792610168455, + "step": 89175 + }, + { + "epoch": 0.08982304316602557, + "grad_norm": 10.47288126508769, + "learning_rate": 4.980518327917564e-05, + "loss": 2.2873, + "mean_token_accuracy": 0.458620685338974, + "step": 89180 + }, + { + "epoch": 0.08982807921912975, + "grad_norm": 10.841134882792245, + "learning_rate": 4.980513407278038e-05, + "loss": 2.2116, + "mean_token_accuracy": 0.42758620977401735, + "step": 89185 + }, + { + "epoch": 0.08983311527223392, + "grad_norm": 9.69031705929063, + "learning_rate": 4.9805084860198704e-05, + "loss": 2.7386, + "mean_token_accuracy": 0.41379310488700866, + "step": 89190 + }, + { + "epoch": 0.0898381513253381, + "grad_norm": 9.098758904633891, + "learning_rate": 4.980503564143063e-05, + "loss": 2.3527, + "mean_token_accuracy": 0.4310344785451889, + "step": 89195 + }, + { + "epoch": 0.08984318737844227, + "grad_norm": 11.033472964786132, + "learning_rate": 4.980498641647617e-05, + "loss": 2.4341, + "mean_token_accuracy": 0.4600121021270752, + "step": 89200 + }, + { + "epoch": 0.08984822343154644, + "grad_norm": 9.506357489399608, + "learning_rate": 4.980493718533534e-05, + "loss": 2.1277, + "mean_token_accuracy": 0.42068966031074523, + "step": 89205 + }, + { + "epoch": 0.08985325948465062, + "grad_norm": 10.247112141175057, + "learning_rate": 4.980488794800815e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.4413793087005615, + "step": 89210 + }, + { + "epoch": 0.08985829553775479, + "grad_norm": 11.062786215659182, + "learning_rate": 4.980483870449462e-05, + "loss": 2.2784, + "mean_token_accuracy": 0.4517241299152374, + "step": 89215 + }, + { + "epoch": 0.08986333159085896, + "grad_norm": 10.154611166102077, + "learning_rate": 4.980478945479476e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.3724137842655182, + "step": 89220 + }, + { + "epoch": 0.08986836764396314, + "grad_norm": 10.978198502735562, + "learning_rate": 4.980474019890858e-05, + "loss": 2.9226, + "mean_token_accuracy": 0.4, + "step": 89225 + }, + { + "epoch": 0.08987340369706731, + "grad_norm": 10.962235650044633, + "learning_rate": 4.98046909368361e-05, + "loss": 2.4186, + "mean_token_accuracy": 0.4034482717514038, + "step": 89230 + }, + { + "epoch": 0.08987843975017147, + "grad_norm": 11.383121740940902, + "learning_rate": 4.980464166857733e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.3896551728248596, + "step": 89235 + }, + { + "epoch": 0.08988347580327564, + "grad_norm": 11.139209226289138, + "learning_rate": 4.980459239413228e-05, + "loss": 2.2044, + "mean_token_accuracy": 0.4551724076271057, + "step": 89240 + }, + { + "epoch": 0.08988851185637982, + "grad_norm": 10.893190142158977, + "learning_rate": 4.980454311350097e-05, + "loss": 2.3664, + "mean_token_accuracy": 0.42758620977401735, + "step": 89245 + }, + { + "epoch": 0.08989354790948399, + "grad_norm": 11.585124757743001, + "learning_rate": 4.9804493826683416e-05, + "loss": 3.0074, + "mean_token_accuracy": 0.31451905369758604, + "step": 89250 + }, + { + "epoch": 0.08989858396258817, + "grad_norm": 10.74201603368157, + "learning_rate": 4.980444453367962e-05, + "loss": 2.1617, + "mean_token_accuracy": 0.42068966031074523, + "step": 89255 + }, + { + "epoch": 0.08990362001569234, + "grad_norm": 11.474935523830943, + "learning_rate": 4.980439523448961e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.4000000059604645, + "step": 89260 + }, + { + "epoch": 0.08990865606879651, + "grad_norm": 9.877916416891493, + "learning_rate": 4.980434592911339e-05, + "loss": 1.9656, + "mean_token_accuracy": 0.5068014919757843, + "step": 89265 + }, + { + "epoch": 0.08991369212190069, + "grad_norm": 12.763712341506066, + "learning_rate": 4.9804296617550976e-05, + "loss": 2.3534, + "mean_token_accuracy": 0.417241370677948, + "step": 89270 + }, + { + "epoch": 0.08991872817500486, + "grad_norm": 13.078230341401856, + "learning_rate": 4.980424729980239e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.4224440515041351, + "step": 89275 + }, + { + "epoch": 0.08992376422810903, + "grad_norm": 9.49898908395129, + "learning_rate": 4.9804197975867626e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.413793095946312, + "step": 89280 + }, + { + "epoch": 0.08992880028121321, + "grad_norm": 12.266704101766987, + "learning_rate": 4.980414864574672e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.4672111332416534, + "step": 89285 + }, + { + "epoch": 0.08993383633431738, + "grad_norm": 9.959348986637753, + "learning_rate": 4.9804099309439666e-05, + "loss": 2.8252, + "mean_token_accuracy": 0.37931033968925476, + "step": 89290 + }, + { + "epoch": 0.08993887238742156, + "grad_norm": 9.945778559127607, + "learning_rate": 4.98040499669465e-05, + "loss": 2.3529, + "mean_token_accuracy": 0.4000000059604645, + "step": 89295 + }, + { + "epoch": 0.08994390844052573, + "grad_norm": 14.949670262162178, + "learning_rate": 4.98040006182672e-05, + "loss": 2.607, + "mean_token_accuracy": 0.4034482777118683, + "step": 89300 + }, + { + "epoch": 0.08994894449362989, + "grad_norm": 11.37624161930812, + "learning_rate": 4.980395126340182e-05, + "loss": 2.457, + "mean_token_accuracy": 0.41724138259887694, + "step": 89305 + }, + { + "epoch": 0.08995398054673406, + "grad_norm": 12.02775055752868, + "learning_rate": 4.9803901902350354e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.45353902578353883, + "step": 89310 + }, + { + "epoch": 0.08995901659983824, + "grad_norm": 9.431806944366356, + "learning_rate": 4.9803852535112815e-05, + "loss": 2.4278, + "mean_token_accuracy": 0.4758620738983154, + "step": 89315 + }, + { + "epoch": 0.08996405265294241, + "grad_norm": 8.607377616206996, + "learning_rate": 4.980380316168922e-05, + "loss": 2.5854, + "mean_token_accuracy": 0.441379314661026, + "step": 89320 + }, + { + "epoch": 0.08996908870604658, + "grad_norm": 11.406114013034856, + "learning_rate": 4.9803753782079585e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.3931034505367279, + "step": 89325 + }, + { + "epoch": 0.08997412475915076, + "grad_norm": 12.909625668938764, + "learning_rate": 4.9803704396283914e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.4103448301553726, + "step": 89330 + }, + { + "epoch": 0.08997916081225493, + "grad_norm": 11.329548408424255, + "learning_rate": 4.980365500430223e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.4261947929859161, + "step": 89335 + }, + { + "epoch": 0.0899841968653591, + "grad_norm": 13.03113378760136, + "learning_rate": 4.980360560613454e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.42758620977401735, + "step": 89340 + }, + { + "epoch": 0.08998923291846328, + "grad_norm": 9.738836440166937, + "learning_rate": 4.980355620178087e-05, + "loss": 2.65, + "mean_token_accuracy": 0.3931034505367279, + "step": 89345 + }, + { + "epoch": 0.08999426897156745, + "grad_norm": 9.312330069141487, + "learning_rate": 4.980350679124122e-05, + "loss": 2.1247, + "mean_token_accuracy": 0.4448275864124298, + "step": 89350 + }, + { + "epoch": 0.08999930502467163, + "grad_norm": 11.791326709408748, + "learning_rate": 4.980345737451561e-05, + "loss": 2.4289, + "mean_token_accuracy": 0.41034482717514037, + "step": 89355 + }, + { + "epoch": 0.0900043410777758, + "grad_norm": 12.633562027631791, + "learning_rate": 4.980340795160406e-05, + "loss": 3.0207, + "mean_token_accuracy": 0.29655171632766725, + "step": 89360 + }, + { + "epoch": 0.09000937713087998, + "grad_norm": 9.34854942967477, + "learning_rate": 4.980335852250657e-05, + "loss": 2.456, + "mean_token_accuracy": 0.4517241358757019, + "step": 89365 + }, + { + "epoch": 0.09001441318398415, + "grad_norm": 10.922874467958811, + "learning_rate": 4.980330908722315e-05, + "loss": 2.628, + "mean_token_accuracy": 0.4034482777118683, + "step": 89370 + }, + { + "epoch": 0.09001944923708831, + "grad_norm": 10.59371329513189, + "learning_rate": 4.980325964575384e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.42068966031074523, + "step": 89375 + }, + { + "epoch": 0.09002448529019248, + "grad_norm": 9.766748815422012, + "learning_rate": 4.9803210198098636e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.4689655125141144, + "step": 89380 + }, + { + "epoch": 0.09002952134329666, + "grad_norm": 9.475546826070007, + "learning_rate": 4.980316074425754e-05, + "loss": 2.154, + "mean_token_accuracy": 0.441379314661026, + "step": 89385 + }, + { + "epoch": 0.09003455739640083, + "grad_norm": 11.64292061366294, + "learning_rate": 4.9803111284230594e-05, + "loss": 2.629, + "mean_token_accuracy": 0.4379310429096222, + "step": 89390 + }, + { + "epoch": 0.090039593449505, + "grad_norm": 10.984056284813985, + "learning_rate": 4.980306181801779e-05, + "loss": 2.5703, + "mean_token_accuracy": 0.42068964838981626, + "step": 89395 + }, + { + "epoch": 0.09004462950260918, + "grad_norm": 19.264303809883685, + "learning_rate": 4.980301234561916e-05, + "loss": 2.4223, + "mean_token_accuracy": 0.42068964838981626, + "step": 89400 + }, + { + "epoch": 0.09004966555571335, + "grad_norm": 9.419958913467397, + "learning_rate": 4.980296286703469e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.43793103098869324, + "step": 89405 + }, + { + "epoch": 0.09005470160881753, + "grad_norm": 9.936090255601128, + "learning_rate": 4.980291338226442e-05, + "loss": 2.3986, + "mean_token_accuracy": 0.41034482717514037, + "step": 89410 + }, + { + "epoch": 0.0900597376619217, + "grad_norm": 9.96388540448984, + "learning_rate": 4.980286389130835e-05, + "loss": 2.5763, + "mean_token_accuracy": 0.4172413766384125, + "step": 89415 + }, + { + "epoch": 0.09006477371502587, + "grad_norm": 10.24179014970209, + "learning_rate": 4.98028143941665e-05, + "loss": 2.7051, + "mean_token_accuracy": 0.3551724195480347, + "step": 89420 + }, + { + "epoch": 0.09006980976813005, + "grad_norm": 12.894785073754278, + "learning_rate": 4.980276489083888e-05, + "loss": 2.4915, + "mean_token_accuracy": 0.42450737953186035, + "step": 89425 + }, + { + "epoch": 0.09007484582123422, + "grad_norm": 12.648845959972073, + "learning_rate": 4.98027153813255e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.4241379380226135, + "step": 89430 + }, + { + "epoch": 0.0900798818743384, + "grad_norm": 13.979314309011984, + "learning_rate": 4.9802665865626385e-05, + "loss": 2.687, + "mean_token_accuracy": 0.39655172526836396, + "step": 89435 + }, + { + "epoch": 0.09008491792744257, + "grad_norm": 9.6190303624526, + "learning_rate": 4.9802616343741535e-05, + "loss": 2.5995, + "mean_token_accuracy": 0.4034482717514038, + "step": 89440 + }, + { + "epoch": 0.09008995398054673, + "grad_norm": 11.234231625437367, + "learning_rate": 4.980256681567098e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.47931033968925474, + "step": 89445 + }, + { + "epoch": 0.0900949900336509, + "grad_norm": 11.285223602031163, + "learning_rate": 4.9802517281414726e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.40689654350280763, + "step": 89450 + }, + { + "epoch": 0.09010002608675508, + "grad_norm": 9.597119945194336, + "learning_rate": 4.980246774097278e-05, + "loss": 2.6043, + "mean_token_accuracy": 0.4724137902259827, + "step": 89455 + }, + { + "epoch": 0.09010506213985925, + "grad_norm": 9.613447208049644, + "learning_rate": 4.980241819434516e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.4482758641242981, + "step": 89460 + }, + { + "epoch": 0.09011009819296342, + "grad_norm": 7.8082269457076645, + "learning_rate": 4.980236864153188e-05, + "loss": 2.0781, + "mean_token_accuracy": 0.4758620738983154, + "step": 89465 + }, + { + "epoch": 0.0901151342460676, + "grad_norm": 10.489823163825507, + "learning_rate": 4.980231908253297e-05, + "loss": 2.8922, + "mean_token_accuracy": 0.37586207389831544, + "step": 89470 + }, + { + "epoch": 0.09012017029917177, + "grad_norm": 13.498182730425654, + "learning_rate": 4.9802269517348416e-05, + "loss": 2.5733, + "mean_token_accuracy": 0.4, + "step": 89475 + }, + { + "epoch": 0.09012520635227594, + "grad_norm": 9.595057518077255, + "learning_rate": 4.980221994597825e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.4862068831920624, + "step": 89480 + }, + { + "epoch": 0.09013024240538012, + "grad_norm": 11.974777938557821, + "learning_rate": 4.980217036842246e-05, + "loss": 2.5547, + "mean_token_accuracy": 0.41893526911735535, + "step": 89485 + }, + { + "epoch": 0.09013527845848429, + "grad_norm": 15.161573706472566, + "learning_rate": 4.98021207846811e-05, + "loss": 2.7025, + "mean_token_accuracy": 0.3793103337287903, + "step": 89490 + }, + { + "epoch": 0.09014031451158847, + "grad_norm": 9.78861561822191, + "learning_rate": 4.980207119475416e-05, + "loss": 2.3206, + "mean_token_accuracy": 0.44482759237289426, + "step": 89495 + }, + { + "epoch": 0.09014535056469264, + "grad_norm": 10.873358152148521, + "learning_rate": 4.980202159864166e-05, + "loss": 2.407, + "mean_token_accuracy": 0.36896551847457887, + "step": 89500 + }, + { + "epoch": 0.09015038661779681, + "grad_norm": 11.863697391982628, + "learning_rate": 4.980197199634361e-05, + "loss": 2.1326, + "mean_token_accuracy": 0.4950393199920654, + "step": 89505 + }, + { + "epoch": 0.09015542267090099, + "grad_norm": 24.480723984677372, + "learning_rate": 4.980192238786002e-05, + "loss": 2.0521, + "mean_token_accuracy": 0.49195402264595034, + "step": 89510 + }, + { + "epoch": 0.09016045872400515, + "grad_norm": 14.897479359287122, + "learning_rate": 4.9801872773190904e-05, + "loss": 2.8093, + "mean_token_accuracy": 0.3724137842655182, + "step": 89515 + }, + { + "epoch": 0.09016549477710932, + "grad_norm": 10.331445419890413, + "learning_rate": 4.980182315233629e-05, + "loss": 2.561, + "mean_token_accuracy": 0.4068965554237366, + "step": 89520 + }, + { + "epoch": 0.0901705308302135, + "grad_norm": 10.157616423943821, + "learning_rate": 4.9801773525296184e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.5019963562488556, + "step": 89525 + }, + { + "epoch": 0.09017556688331767, + "grad_norm": 8.578523808417657, + "learning_rate": 4.9801723892070595e-05, + "loss": 2.4999, + "mean_token_accuracy": 0.4931034445762634, + "step": 89530 + }, + { + "epoch": 0.09018060293642184, + "grad_norm": 11.199658568549152, + "learning_rate": 4.980167425265953e-05, + "loss": 2.56, + "mean_token_accuracy": 0.4068965554237366, + "step": 89535 + }, + { + "epoch": 0.09018563898952602, + "grad_norm": 10.81261562574399, + "learning_rate": 4.9801624607063025e-05, + "loss": 2.5521, + "mean_token_accuracy": 0.4068965494632721, + "step": 89540 + }, + { + "epoch": 0.09019067504263019, + "grad_norm": 12.397684042542485, + "learning_rate": 4.980157495528107e-05, + "loss": 2.246, + "mean_token_accuracy": 0.48275861144065857, + "step": 89545 + }, + { + "epoch": 0.09019571109573436, + "grad_norm": 10.263558624186125, + "learning_rate": 4.9801525297313703e-05, + "loss": 2.4698, + "mean_token_accuracy": 0.4500302612781525, + "step": 89550 + }, + { + "epoch": 0.09020074714883854, + "grad_norm": 10.911008053012855, + "learning_rate": 4.980147563316092e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.4034482717514038, + "step": 89555 + }, + { + "epoch": 0.09020578320194271, + "grad_norm": 10.398200123418599, + "learning_rate": 4.980142596282273e-05, + "loss": 2.1676, + "mean_token_accuracy": 0.46551724076271056, + "step": 89560 + }, + { + "epoch": 0.09021081925504688, + "grad_norm": 11.741667481138428, + "learning_rate": 4.980137628629917e-05, + "loss": 2.592, + "mean_token_accuracy": 0.4206896543502808, + "step": 89565 + }, + { + "epoch": 0.09021585530815106, + "grad_norm": 10.278878996890148, + "learning_rate": 4.9801326603590235e-05, + "loss": 2.4424, + "mean_token_accuracy": 0.44137930274009707, + "step": 89570 + }, + { + "epoch": 0.09022089136125523, + "grad_norm": 9.518535191942501, + "learning_rate": 4.9801276914695946e-05, + "loss": 2.4707, + "mean_token_accuracy": 0.4413793087005615, + "step": 89575 + }, + { + "epoch": 0.0902259274143594, + "grad_norm": 10.744519140850255, + "learning_rate": 4.980122721961631e-05, + "loss": 2.3023, + "mean_token_accuracy": 0.47428917288780215, + "step": 89580 + }, + { + "epoch": 0.09023096346746357, + "grad_norm": 11.08300507358741, + "learning_rate": 4.9801177518351343e-05, + "loss": 2.5837, + "mean_token_accuracy": 0.441379314661026, + "step": 89585 + }, + { + "epoch": 0.09023599952056774, + "grad_norm": 11.068931772379434, + "learning_rate": 4.980112781090107e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.4517241358757019, + "step": 89590 + }, + { + "epoch": 0.09024103557367191, + "grad_norm": 11.007884296600027, + "learning_rate": 4.980107809726549e-05, + "loss": 1.9707, + "mean_token_accuracy": 0.4551724135875702, + "step": 89595 + }, + { + "epoch": 0.09024607162677609, + "grad_norm": 9.982572388667446, + "learning_rate": 4.980102837744463e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.458620685338974, + "step": 89600 + }, + { + "epoch": 0.09025110767988026, + "grad_norm": 12.544098994060565, + "learning_rate": 4.980097865143849e-05, + "loss": 2.3746, + "mean_token_accuracy": 0.4068965584039688, + "step": 89605 + }, + { + "epoch": 0.09025614373298443, + "grad_norm": 10.963597351737873, + "learning_rate": 4.980092891924709e-05, + "loss": 2.3418, + "mean_token_accuracy": 0.4, + "step": 89610 + }, + { + "epoch": 0.09026117978608861, + "grad_norm": 10.337950950454628, + "learning_rate": 4.9800879180870444e-05, + "loss": 1.8491, + "mean_token_accuracy": 0.45172414779663084, + "step": 89615 + }, + { + "epoch": 0.09026621583919278, + "grad_norm": 11.726784907954611, + "learning_rate": 4.980082943630857e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.4068965494632721, + "step": 89620 + }, + { + "epoch": 0.09027125189229696, + "grad_norm": 9.159151951332664, + "learning_rate": 4.980077968556148e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.46896551847457885, + "step": 89625 + }, + { + "epoch": 0.09027628794540113, + "grad_norm": 12.791561095039945, + "learning_rate": 4.980072992862917e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.4571687877178192, + "step": 89630 + }, + { + "epoch": 0.0902813239985053, + "grad_norm": 10.650282470473591, + "learning_rate": 4.980068016551168e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.4068965494632721, + "step": 89635 + }, + { + "epoch": 0.09028636005160948, + "grad_norm": 9.791089150752125, + "learning_rate": 4.980063039620901e-05, + "loss": 2.6021, + "mean_token_accuracy": 0.3896551728248596, + "step": 89640 + }, + { + "epoch": 0.09029139610471365, + "grad_norm": 11.23902155681011, + "learning_rate": 4.980058062072119e-05, + "loss": 2.0895, + "mean_token_accuracy": 0.44827587008476255, + "step": 89645 + }, + { + "epoch": 0.09029643215781782, + "grad_norm": 10.976026087324039, + "learning_rate": 4.980053083904821e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.4172413766384125, + "step": 89650 + }, + { + "epoch": 0.09030146821092198, + "grad_norm": 9.683779651959597, + "learning_rate": 4.98004810511901e-05, + "loss": 2.2294, + "mean_token_accuracy": 0.43793103098869324, + "step": 89655 + }, + { + "epoch": 0.09030650426402616, + "grad_norm": 8.858658165494553, + "learning_rate": 4.980043125714686e-05, + "loss": 2.47, + "mean_token_accuracy": 0.4257713258266449, + "step": 89660 + }, + { + "epoch": 0.09031154031713033, + "grad_norm": 9.620203503659363, + "learning_rate": 4.980038145691851e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.47241378426551817, + "step": 89665 + }, + { + "epoch": 0.0903165763702345, + "grad_norm": 9.6808906980857, + "learning_rate": 4.980033165050508e-05, + "loss": 2.338, + "mean_token_accuracy": 0.4620689690113068, + "step": 89670 + }, + { + "epoch": 0.09032161242333868, + "grad_norm": 12.628712858241414, + "learning_rate": 4.980028183790657e-05, + "loss": 2.3345, + "mean_token_accuracy": 0.42758620381355283, + "step": 89675 + }, + { + "epoch": 0.09032664847644285, + "grad_norm": 10.708773431178253, + "learning_rate": 4.980023201912298e-05, + "loss": 2.4065, + "mean_token_accuracy": 0.42546884417533876, + "step": 89680 + }, + { + "epoch": 0.09033168452954703, + "grad_norm": 14.110464517957853, + "learning_rate": 4.980018219415434e-05, + "loss": 2.8139, + "mean_token_accuracy": 0.39655172526836396, + "step": 89685 + }, + { + "epoch": 0.0903367205826512, + "grad_norm": 9.466674212911029, + "learning_rate": 4.9800132363000664e-05, + "loss": 2.2661, + "mean_token_accuracy": 0.46896551847457885, + "step": 89690 + }, + { + "epoch": 0.09034175663575537, + "grad_norm": 12.009711878692949, + "learning_rate": 4.980008252566197e-05, + "loss": 2.6866, + "mean_token_accuracy": 0.43793103098869324, + "step": 89695 + }, + { + "epoch": 0.09034679268885955, + "grad_norm": 10.177266035445074, + "learning_rate": 4.980003268213826e-05, + "loss": 2.8138, + "mean_token_accuracy": 0.32758620381355286, + "step": 89700 + }, + { + "epoch": 0.09035182874196372, + "grad_norm": 14.795475884401593, + "learning_rate": 4.9799982832429545e-05, + "loss": 2.9559, + "mean_token_accuracy": 0.34137930274009703, + "step": 89705 + }, + { + "epoch": 0.0903568647950679, + "grad_norm": 10.004349344472923, + "learning_rate": 4.9799932976535853e-05, + "loss": 2.2157, + "mean_token_accuracy": 0.4172413766384125, + "step": 89710 + }, + { + "epoch": 0.09036190084817207, + "grad_norm": 9.373980865264821, + "learning_rate": 4.979988311445719e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.4344827592372894, + "step": 89715 + }, + { + "epoch": 0.09036693690127624, + "grad_norm": 11.487682078253425, + "learning_rate": 4.979983324619357e-05, + "loss": 2.5058, + "mean_token_accuracy": 0.4206896543502808, + "step": 89720 + }, + { + "epoch": 0.0903719729543804, + "grad_norm": 11.08545359585476, + "learning_rate": 4.979978337174501e-05, + "loss": 2.543, + "mean_token_accuracy": 0.44482758045196535, + "step": 89725 + }, + { + "epoch": 0.09037700900748458, + "grad_norm": 9.368259448465437, + "learning_rate": 4.979973349111152e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.4552955687046051, + "step": 89730 + }, + { + "epoch": 0.09038204506058875, + "grad_norm": 11.056629978553346, + "learning_rate": 4.979968360429313e-05, + "loss": 2.5208, + "mean_token_accuracy": 0.44482759237289426, + "step": 89735 + }, + { + "epoch": 0.09038708111369292, + "grad_norm": 10.97682821744861, + "learning_rate": 4.979963371128982e-05, + "loss": 2.1671, + "mean_token_accuracy": 0.45396249890327456, + "step": 89740 + }, + { + "epoch": 0.0903921171667971, + "grad_norm": 14.92401666873045, + "learning_rate": 4.979958381210163e-05, + "loss": 2.8555, + "mean_token_accuracy": 0.37586206793785093, + "step": 89745 + }, + { + "epoch": 0.09039715321990127, + "grad_norm": 10.490020018437995, + "learning_rate": 4.979953390672858e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.5137930929660797, + "step": 89750 + }, + { + "epoch": 0.09040218927300545, + "grad_norm": 10.577593088039851, + "learning_rate": 4.979948399517065e-05, + "loss": 2.8129, + "mean_token_accuracy": 0.38965516686439516, + "step": 89755 + }, + { + "epoch": 0.09040722532610962, + "grad_norm": 13.78668540609991, + "learning_rate": 4.979943407742789e-05, + "loss": 2.3458, + "mean_token_accuracy": 0.4034482717514038, + "step": 89760 + }, + { + "epoch": 0.09041226137921379, + "grad_norm": 11.242434678654442, + "learning_rate": 4.9799384153500286e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.4344827592372894, + "step": 89765 + }, + { + "epoch": 0.09041729743231797, + "grad_norm": 10.471158829010824, + "learning_rate": 4.979933422338787e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.41566848158836367, + "step": 89770 + }, + { + "epoch": 0.09042233348542214, + "grad_norm": 10.466539592990177, + "learning_rate": 4.9799284287090656e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.379310342669487, + "step": 89775 + }, + { + "epoch": 0.09042736953852631, + "grad_norm": 12.344009992791031, + "learning_rate": 4.979923434460865e-05, + "loss": 2.818, + "mean_token_accuracy": 0.324137932062149, + "step": 89780 + }, + { + "epoch": 0.09043240559163049, + "grad_norm": 10.85778929331864, + "learning_rate": 4.9799184395941864e-05, + "loss": 2.2806, + "mean_token_accuracy": 0.4310344815254211, + "step": 89785 + }, + { + "epoch": 0.09043744164473466, + "grad_norm": 10.353984305726781, + "learning_rate": 4.9799134441090315e-05, + "loss": 2.8948, + "mean_token_accuracy": 0.36896551549434664, + "step": 89790 + }, + { + "epoch": 0.09044247769783882, + "grad_norm": 11.195854668208888, + "learning_rate": 4.9799084480054024e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.4, + "step": 89795 + }, + { + "epoch": 0.090447513750943, + "grad_norm": 9.865166784270995, + "learning_rate": 4.9799034512832996e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.4034482777118683, + "step": 89800 + }, + { + "epoch": 0.09045254980404717, + "grad_norm": 9.755263433932416, + "learning_rate": 4.979898453942725e-05, + "loss": 2.0767, + "mean_token_accuracy": 0.5206896483898162, + "step": 89805 + }, + { + "epoch": 0.09045758585715134, + "grad_norm": 12.596984690778859, + "learning_rate": 4.97989345598368e-05, + "loss": 2.9297, + "mean_token_accuracy": 0.37586206793785093, + "step": 89810 + }, + { + "epoch": 0.09046262191025552, + "grad_norm": 14.704391942833652, + "learning_rate": 4.979888457406165e-05, + "loss": 2.7163, + "mean_token_accuracy": 0.40471869707107544, + "step": 89815 + }, + { + "epoch": 0.09046765796335969, + "grad_norm": 11.139590053658273, + "learning_rate": 4.9798834582101826e-05, + "loss": 2.453, + "mean_token_accuracy": 0.3896551728248596, + "step": 89820 + }, + { + "epoch": 0.09047269401646386, + "grad_norm": 13.086361339131246, + "learning_rate": 4.979878458395733e-05, + "loss": 2.8472, + "mean_token_accuracy": 0.4, + "step": 89825 + }, + { + "epoch": 0.09047773006956804, + "grad_norm": 12.80889241486848, + "learning_rate": 4.979873457962819e-05, + "loss": 2.5557, + "mean_token_accuracy": 0.36896551251411436, + "step": 89830 + }, + { + "epoch": 0.09048276612267221, + "grad_norm": 10.906013250754546, + "learning_rate": 4.9798684569114415e-05, + "loss": 2.7782, + "mean_token_accuracy": 0.3620689660310745, + "step": 89835 + }, + { + "epoch": 0.09048780217577639, + "grad_norm": 10.449931341292485, + "learning_rate": 4.979863455241601e-05, + "loss": 2.5355, + "mean_token_accuracy": 0.37241379618644715, + "step": 89840 + }, + { + "epoch": 0.09049283822888056, + "grad_norm": 10.725942993950925, + "learning_rate": 4.979858452953299e-05, + "loss": 2.9839, + "mean_token_accuracy": 0.4103448331356049, + "step": 89845 + }, + { + "epoch": 0.09049787428198473, + "grad_norm": 9.700439098044725, + "learning_rate": 4.9798534500465386e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.46019359230995177, + "step": 89850 + }, + { + "epoch": 0.0905029103350889, + "grad_norm": 10.185458098897252, + "learning_rate": 4.97984844652132e-05, + "loss": 2.3467, + "mean_token_accuracy": 0.4068965494632721, + "step": 89855 + }, + { + "epoch": 0.09050794638819308, + "grad_norm": 10.124563778032973, + "learning_rate": 4.979843442377644e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.45172414779663084, + "step": 89860 + }, + { + "epoch": 0.09051298244129724, + "grad_norm": 9.831461088488423, + "learning_rate": 4.979838437615513e-05, + "loss": 2.4274, + "mean_token_accuracy": 0.3896551728248596, + "step": 89865 + }, + { + "epoch": 0.09051801849440141, + "grad_norm": 12.343217333677707, + "learning_rate": 4.979833432234928e-05, + "loss": 2.6617, + "mean_token_accuracy": 0.3862068891525269, + "step": 89870 + }, + { + "epoch": 0.09052305454750559, + "grad_norm": 13.668558250500858, + "learning_rate": 4.9798284262358904e-05, + "loss": 2.2909, + "mean_token_accuracy": 0.5119458138942719, + "step": 89875 + }, + { + "epoch": 0.09052809060060976, + "grad_norm": 8.997900885936795, + "learning_rate": 4.979823419618401e-05, + "loss": 2.8476, + "mean_token_accuracy": 0.38275861740112305, + "step": 89880 + }, + { + "epoch": 0.09053312665371394, + "grad_norm": 10.638642856656697, + "learning_rate": 4.9798184123824626e-05, + "loss": 2.5555, + "mean_token_accuracy": 0.37586206793785093, + "step": 89885 + }, + { + "epoch": 0.09053816270681811, + "grad_norm": 10.930350248426473, + "learning_rate": 4.979813404528075e-05, + "loss": 2.6318, + "mean_token_accuracy": 0.44482759237289426, + "step": 89890 + }, + { + "epoch": 0.09054319875992228, + "grad_norm": 12.253057347086001, + "learning_rate": 4.979808396055241e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.4206896424293518, + "step": 89895 + }, + { + "epoch": 0.09054823481302646, + "grad_norm": 10.356642769829785, + "learning_rate": 4.97980338696396e-05, + "loss": 2.8061, + "mean_token_accuracy": 0.3689655065536499, + "step": 89900 + }, + { + "epoch": 0.09055327086613063, + "grad_norm": 10.957841223937617, + "learning_rate": 4.979798377254236e-05, + "loss": 2.1052, + "mean_token_accuracy": 0.4931034564971924, + "step": 89905 + }, + { + "epoch": 0.0905583069192348, + "grad_norm": 10.397135914787276, + "learning_rate": 4.979793366926069e-05, + "loss": 2.6678, + "mean_token_accuracy": 0.38275861740112305, + "step": 89910 + }, + { + "epoch": 0.09056334297233898, + "grad_norm": 8.581688175686919, + "learning_rate": 4.97978835597946e-05, + "loss": 2.393, + "mean_token_accuracy": 0.44482758045196535, + "step": 89915 + }, + { + "epoch": 0.09056837902544315, + "grad_norm": 13.028720723379386, + "learning_rate": 4.979783344414411e-05, + "loss": 2.7789, + "mean_token_accuracy": 0.4068965554237366, + "step": 89920 + }, + { + "epoch": 0.09057341507854733, + "grad_norm": 11.497920484227917, + "learning_rate": 4.9797783322309236e-05, + "loss": 2.6603, + "mean_token_accuracy": 0.42758620381355283, + "step": 89925 + }, + { + "epoch": 0.0905784511316515, + "grad_norm": 11.501101468125015, + "learning_rate": 4.979773319428999e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.48965516686439514, + "step": 89930 + }, + { + "epoch": 0.09058348718475566, + "grad_norm": 10.944577200199786, + "learning_rate": 4.979768306008638e-05, + "loss": 2.624, + "mean_token_accuracy": 0.4103448212146759, + "step": 89935 + }, + { + "epoch": 0.09058852323785983, + "grad_norm": 14.702623777716283, + "learning_rate": 4.979763291969842e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.3896551728248596, + "step": 89940 + }, + { + "epoch": 0.09059355929096401, + "grad_norm": 8.970624052944412, + "learning_rate": 4.979758277312614e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.4793103516101837, + "step": 89945 + }, + { + "epoch": 0.09059859534406818, + "grad_norm": 10.712482946901611, + "learning_rate": 4.979753262036954e-05, + "loss": 2.5091, + "mean_token_accuracy": 0.43103447556495667, + "step": 89950 + }, + { + "epoch": 0.09060363139717235, + "grad_norm": 11.831569064849802, + "learning_rate": 4.979748246142863e-05, + "loss": 2.421, + "mean_token_accuracy": 0.3965517282485962, + "step": 89955 + }, + { + "epoch": 0.09060866745027653, + "grad_norm": 10.11255921332216, + "learning_rate": 4.9797432296303434e-05, + "loss": 2.3327, + "mean_token_accuracy": 0.441379314661026, + "step": 89960 + }, + { + "epoch": 0.0906137035033807, + "grad_norm": 11.105098978324026, + "learning_rate": 4.979738212499396e-05, + "loss": 2.2692, + "mean_token_accuracy": 0.434664249420166, + "step": 89965 + }, + { + "epoch": 0.09061873955648488, + "grad_norm": 10.376273405905163, + "learning_rate": 4.979733194750022e-05, + "loss": 2.6551, + "mean_token_accuracy": 0.4103448212146759, + "step": 89970 + }, + { + "epoch": 0.09062377560958905, + "grad_norm": 10.251359336170374, + "learning_rate": 4.979728176382223e-05, + "loss": 2.7534, + "mean_token_accuracy": 0.4085299462080002, + "step": 89975 + }, + { + "epoch": 0.09062881166269322, + "grad_norm": 15.203125868701372, + "learning_rate": 4.979723157396002e-05, + "loss": 2.5864, + "mean_token_accuracy": 0.43103447258472444, + "step": 89980 + }, + { + "epoch": 0.0906338477157974, + "grad_norm": 10.45509224782312, + "learning_rate": 4.979718137791358e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.48620688915252686, + "step": 89985 + }, + { + "epoch": 0.09063888376890157, + "grad_norm": 11.831825205248354, + "learning_rate": 4.979713117568294e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4034482777118683, + "step": 89990 + }, + { + "epoch": 0.09064391982200574, + "grad_norm": 11.2030171028342, + "learning_rate": 4.97970809672681e-05, + "loss": 2.3308, + "mean_token_accuracy": 0.4586206912994385, + "step": 89995 + }, + { + "epoch": 0.09064895587510992, + "grad_norm": 10.636865517144962, + "learning_rate": 4.9797030752669086e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.44482759237289426, + "step": 90000 + }, + { + "epoch": 0.09065399192821408, + "grad_norm": 8.417781344952532, + "learning_rate": 4.97969805318859e-05, + "loss": 2.5096, + "mean_token_accuracy": 0.4068965494632721, + "step": 90005 + }, + { + "epoch": 0.09065902798131825, + "grad_norm": 14.519615140287229, + "learning_rate": 4.979693030491857e-05, + "loss": 2.5658, + "mean_token_accuracy": 0.40459770560264585, + "step": 90010 + }, + { + "epoch": 0.09066406403442243, + "grad_norm": 12.358196881177804, + "learning_rate": 4.97968800717671e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.4034482717514038, + "step": 90015 + }, + { + "epoch": 0.0906691000875266, + "grad_norm": 10.484100275508538, + "learning_rate": 4.979682983243151e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.43793103098869324, + "step": 90020 + }, + { + "epoch": 0.09067413614063077, + "grad_norm": 12.517378663361818, + "learning_rate": 4.979677958691181e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.4603750824928284, + "step": 90025 + }, + { + "epoch": 0.09067917219373495, + "grad_norm": 11.401731936434388, + "learning_rate": 4.979672933520801e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.43284936547279357, + "step": 90030 + }, + { + "epoch": 0.09068420824683912, + "grad_norm": 9.21107120337026, + "learning_rate": 4.979667907732014e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.4034482717514038, + "step": 90035 + }, + { + "epoch": 0.0906892442999433, + "grad_norm": 8.63920093549094, + "learning_rate": 4.97966288132482e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.39310345649719236, + "step": 90040 + }, + { + "epoch": 0.09069428035304747, + "grad_norm": 9.382052401386913, + "learning_rate": 4.97965785429922e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.4448275864124298, + "step": 90045 + }, + { + "epoch": 0.09069931640615164, + "grad_norm": 11.874455773091102, + "learning_rate": 4.979652826655217e-05, + "loss": 2.6981, + "mean_token_accuracy": 0.3655172407627106, + "step": 90050 + }, + { + "epoch": 0.09070435245925582, + "grad_norm": 12.620921942031611, + "learning_rate": 4.97964779839281e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.4344827592372894, + "step": 90055 + }, + { + "epoch": 0.09070938851235999, + "grad_norm": 11.087335124788046, + "learning_rate": 4.979642769512004e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.43998789191246035, + "step": 90060 + }, + { + "epoch": 0.09071442456546416, + "grad_norm": 9.944084144460588, + "learning_rate": 4.979637740012796e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.42068964540958403, + "step": 90065 + }, + { + "epoch": 0.09071946061856834, + "grad_norm": 10.376947689776596, + "learning_rate": 4.979632709895191e-05, + "loss": 2.0403, + "mean_token_accuracy": 0.4871921122074127, + "step": 90070 + }, + { + "epoch": 0.0907244966716725, + "grad_norm": 14.013078594211372, + "learning_rate": 4.979627679159189e-05, + "loss": 2.3627, + "mean_token_accuracy": 0.45722927451133727, + "step": 90075 + }, + { + "epoch": 0.09072953272477667, + "grad_norm": 9.749797366255892, + "learning_rate": 4.9796226478047906e-05, + "loss": 2.7198, + "mean_token_accuracy": 0.4034482777118683, + "step": 90080 + }, + { + "epoch": 0.09073456877788084, + "grad_norm": 17.446174134908766, + "learning_rate": 4.979617615831999e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.44349666833877566, + "step": 90085 + }, + { + "epoch": 0.09073960483098502, + "grad_norm": 16.443375965076694, + "learning_rate": 4.979612583240814e-05, + "loss": 2.6136, + "mean_token_accuracy": 0.4361766457557678, + "step": 90090 + }, + { + "epoch": 0.09074464088408919, + "grad_norm": 12.423581880476975, + "learning_rate": 4.9796075500312385e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.41034482717514037, + "step": 90095 + }, + { + "epoch": 0.09074967693719337, + "grad_norm": 15.382391685590592, + "learning_rate": 4.9796025162032724e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.44827585816383364, + "step": 90100 + }, + { + "epoch": 0.09075471299029754, + "grad_norm": 10.486824218668685, + "learning_rate": 4.9795974817569175e-05, + "loss": 2.2152, + "mean_token_accuracy": 0.46551724076271056, + "step": 90105 + }, + { + "epoch": 0.09075974904340171, + "grad_norm": 11.96119698924273, + "learning_rate": 4.979592446692175e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.4448275864124298, + "step": 90110 + }, + { + "epoch": 0.09076478509650589, + "grad_norm": 11.971491652001868, + "learning_rate": 4.979587411009048e-05, + "loss": 2.2746, + "mean_token_accuracy": 0.4206896543502808, + "step": 90115 + }, + { + "epoch": 0.09076982114961006, + "grad_norm": 9.120103620342542, + "learning_rate": 4.979582374707536e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.42758620977401735, + "step": 90120 + }, + { + "epoch": 0.09077485720271423, + "grad_norm": 24.832586891941805, + "learning_rate": 4.979577337787641e-05, + "loss": 2.4918, + "mean_token_accuracy": 0.4517241299152374, + "step": 90125 + }, + { + "epoch": 0.09077989325581841, + "grad_norm": 13.918599225202675, + "learning_rate": 4.979572300249364e-05, + "loss": 2.0342, + "mean_token_accuracy": 0.4448275864124298, + "step": 90130 + }, + { + "epoch": 0.09078492930892258, + "grad_norm": 9.695191817223243, + "learning_rate": 4.9795672620927075e-05, + "loss": 2.6955, + "mean_token_accuracy": 0.3689655214548111, + "step": 90135 + }, + { + "epoch": 0.09078996536202676, + "grad_norm": 11.79130036712235, + "learning_rate": 4.979562223317672e-05, + "loss": 2.656, + "mean_token_accuracy": 0.37241379022598264, + "step": 90140 + }, + { + "epoch": 0.09079500141513092, + "grad_norm": 11.308758070166531, + "learning_rate": 4.979557183924259e-05, + "loss": 2.2333, + "mean_token_accuracy": 0.480541867017746, + "step": 90145 + }, + { + "epoch": 0.09080003746823509, + "grad_norm": 10.866395967795059, + "learning_rate": 4.97955214391247e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.432667875289917, + "step": 90150 + }, + { + "epoch": 0.09080507352133926, + "grad_norm": 9.87271174112646, + "learning_rate": 4.9795471032823064e-05, + "loss": 2.6152, + "mean_token_accuracy": 0.3620689630508423, + "step": 90155 + }, + { + "epoch": 0.09081010957444344, + "grad_norm": 10.651213614860794, + "learning_rate": 4.9795420620337693e-05, + "loss": 2.2372, + "mean_token_accuracy": 0.4517241358757019, + "step": 90160 + }, + { + "epoch": 0.09081514562754761, + "grad_norm": 13.321726294103668, + "learning_rate": 4.979537020166861e-05, + "loss": 2.547, + "mean_token_accuracy": 0.3999999940395355, + "step": 90165 + }, + { + "epoch": 0.09082018168065178, + "grad_norm": 11.77181217350565, + "learning_rate": 4.9795319776815826e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.43103448748588563, + "step": 90170 + }, + { + "epoch": 0.09082521773375596, + "grad_norm": 11.78587914091947, + "learning_rate": 4.979526934577934e-05, + "loss": 2.7219, + "mean_token_accuracy": 0.334482753276825, + "step": 90175 + }, + { + "epoch": 0.09083025378686013, + "grad_norm": 9.024357791585787, + "learning_rate": 4.979521890855919e-05, + "loss": 2.515, + "mean_token_accuracy": 0.39310345649719236, + "step": 90180 + }, + { + "epoch": 0.0908352898399643, + "grad_norm": 9.455658382449057, + "learning_rate": 4.979516846515536e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.42413793206214906, + "step": 90185 + }, + { + "epoch": 0.09084032589306848, + "grad_norm": 8.943009744953645, + "learning_rate": 4.97951180155679e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.42758620977401735, + "step": 90190 + }, + { + "epoch": 0.09084536194617265, + "grad_norm": 9.176328295144586, + "learning_rate": 4.97950675597968e-05, + "loss": 2.209, + "mean_token_accuracy": 0.47931034564971925, + "step": 90195 + }, + { + "epoch": 0.09085039799927683, + "grad_norm": 9.00930930413258, + "learning_rate": 4.979501709784208e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.4882637619972229, + "step": 90200 + }, + { + "epoch": 0.090855434052381, + "grad_norm": 13.5390219560126, + "learning_rate": 4.9794966629703755e-05, + "loss": 2.8878, + "mean_token_accuracy": 0.39812461137771604, + "step": 90205 + }, + { + "epoch": 0.09086047010548517, + "grad_norm": 11.348178108166087, + "learning_rate": 4.9794916155381835e-05, + "loss": 2.4645, + "mean_token_accuracy": 0.43103448748588563, + "step": 90210 + }, + { + "epoch": 0.09086550615858933, + "grad_norm": 11.940855772603383, + "learning_rate": 4.979486567487634e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.3827586233615875, + "step": 90215 + }, + { + "epoch": 0.09087054221169351, + "grad_norm": 12.71153254281118, + "learning_rate": 4.9794815188187276e-05, + "loss": 2.777, + "mean_token_accuracy": 0.3896551728248596, + "step": 90220 + }, + { + "epoch": 0.09087557826479768, + "grad_norm": 11.566195167024897, + "learning_rate": 4.9794764695314663e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.3827586233615875, + "step": 90225 + }, + { + "epoch": 0.09088061431790186, + "grad_norm": 10.03224433079392, + "learning_rate": 4.9794714196258514e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4310344815254211, + "step": 90230 + }, + { + "epoch": 0.09088565037100603, + "grad_norm": 9.755984745215926, + "learning_rate": 4.979466369101884e-05, + "loss": 2.4842, + "mean_token_accuracy": 0.482758617401123, + "step": 90235 + }, + { + "epoch": 0.0908906864241102, + "grad_norm": 11.44696252590824, + "learning_rate": 4.979461317959567e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.41379310488700866, + "step": 90240 + }, + { + "epoch": 0.09089572247721438, + "grad_norm": 8.635530114473005, + "learning_rate": 4.979456266198899e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.42413793206214906, + "step": 90245 + }, + { + "epoch": 0.09090075853031855, + "grad_norm": 9.829816833076503, + "learning_rate": 4.979451213819884e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.46206897497177124, + "step": 90250 + }, + { + "epoch": 0.09090579458342272, + "grad_norm": 10.734777912593835, + "learning_rate": 4.979446160822522e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.4206896543502808, + "step": 90255 + }, + { + "epoch": 0.0909108306365269, + "grad_norm": 9.240119736500542, + "learning_rate": 4.979441107206816e-05, + "loss": 2.4341, + "mean_token_accuracy": 0.4034482717514038, + "step": 90260 + }, + { + "epoch": 0.09091586668963107, + "grad_norm": 14.992409494532323, + "learning_rate": 4.9794360529727645e-05, + "loss": 2.8744, + "mean_token_accuracy": 0.36551723480224607, + "step": 90265 + }, + { + "epoch": 0.09092090274273525, + "grad_norm": 11.038814978434667, + "learning_rate": 4.9794309981203715e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.37586206793785093, + "step": 90270 + }, + { + "epoch": 0.09092593879583942, + "grad_norm": 11.626340404857135, + "learning_rate": 4.979425942649636e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.29999999701976776, + "step": 90275 + }, + { + "epoch": 0.0909309748489436, + "grad_norm": 11.933873914580772, + "learning_rate": 4.9794208865605636e-05, + "loss": 2.7416, + "mean_token_accuracy": 0.3655172407627106, + "step": 90280 + }, + { + "epoch": 0.09093601090204775, + "grad_norm": 10.36206408283393, + "learning_rate": 4.979415829853151e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.42413793206214906, + "step": 90285 + }, + { + "epoch": 0.09094104695515193, + "grad_norm": 13.638229922322656, + "learning_rate": 4.9794107725274025e-05, + "loss": 2.5357, + "mean_token_accuracy": 0.43448275327682495, + "step": 90290 + }, + { + "epoch": 0.0909460830082561, + "grad_norm": 9.93508467206544, + "learning_rate": 4.9794057145833175e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.4413793087005615, + "step": 90295 + }, + { + "epoch": 0.09095111906136027, + "grad_norm": 12.955726782874757, + "learning_rate": 4.9794006560209e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.39310344457626345, + "step": 90300 + }, + { + "epoch": 0.09095615511446445, + "grad_norm": 11.427335303798769, + "learning_rate": 4.979395596840149e-05, + "loss": 2.614, + "mean_token_accuracy": 0.441379314661026, + "step": 90305 + }, + { + "epoch": 0.09096119116756862, + "grad_norm": 12.7170375383586, + "learning_rate": 4.9793905370410664e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.4620689690113068, + "step": 90310 + }, + { + "epoch": 0.0909662272206728, + "grad_norm": 10.663190632699047, + "learning_rate": 4.9793854766236547e-05, + "loss": 2.2782, + "mean_token_accuracy": 0.39310344457626345, + "step": 90315 + }, + { + "epoch": 0.09097126327377697, + "grad_norm": 14.009714275799094, + "learning_rate": 4.979380415587915e-05, + "loss": 2.9549, + "mean_token_accuracy": 0.3827586233615875, + "step": 90320 + }, + { + "epoch": 0.09097629932688114, + "grad_norm": 9.563323599008491, + "learning_rate": 4.9793753539338474e-05, + "loss": 2.4672, + "mean_token_accuracy": 0.39999999701976774, + "step": 90325 + }, + { + "epoch": 0.09098133537998532, + "grad_norm": 10.790800811109358, + "learning_rate": 4.979370291661455e-05, + "loss": 2.0366, + "mean_token_accuracy": 0.5034482717514038, + "step": 90330 + }, + { + "epoch": 0.09098637143308949, + "grad_norm": 10.01730169428084, + "learning_rate": 4.979365228770738e-05, + "loss": 2.218, + "mean_token_accuracy": 0.42068965137004855, + "step": 90335 + }, + { + "epoch": 0.09099140748619367, + "grad_norm": 10.114256161866365, + "learning_rate": 4.9793601652616985e-05, + "loss": 2.0994, + "mean_token_accuracy": 0.4793103516101837, + "step": 90340 + }, + { + "epoch": 0.09099644353929784, + "grad_norm": 11.932799460616552, + "learning_rate": 4.979355101134337e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.4, + "step": 90345 + }, + { + "epoch": 0.09100147959240201, + "grad_norm": 12.641709572950468, + "learning_rate": 4.979350036388656e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.4172413766384125, + "step": 90350 + }, + { + "epoch": 0.09100651564550617, + "grad_norm": 12.813043403925558, + "learning_rate": 4.979344971024656e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.45009074807167054, + "step": 90355 + }, + { + "epoch": 0.09101155169861035, + "grad_norm": 10.79932317974267, + "learning_rate": 4.97933990504234e-05, + "loss": 2.2764, + "mean_token_accuracy": 0.4586206912994385, + "step": 90360 + }, + { + "epoch": 0.09101658775171452, + "grad_norm": 12.074016544407364, + "learning_rate": 4.9793348384417075e-05, + "loss": 2.677, + "mean_token_accuracy": 0.38965516686439516, + "step": 90365 + }, + { + "epoch": 0.0910216238048187, + "grad_norm": 10.937451877435155, + "learning_rate": 4.979329771222761e-05, + "loss": 3.0193, + "mean_token_accuracy": 0.3551724135875702, + "step": 90370 + }, + { + "epoch": 0.09102665985792287, + "grad_norm": 8.539580040942838, + "learning_rate": 4.979324703385501e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.3862069010734558, + "step": 90375 + }, + { + "epoch": 0.09103169591102704, + "grad_norm": 11.671894538238327, + "learning_rate": 4.9793196349299296e-05, + "loss": 2.2363, + "mean_token_accuracy": 0.4620689630508423, + "step": 90380 + }, + { + "epoch": 0.09103673196413122, + "grad_norm": 10.886463193863884, + "learning_rate": 4.9793145658560486e-05, + "loss": 2.2605, + "mean_token_accuracy": 0.47090138792991637, + "step": 90385 + }, + { + "epoch": 0.09104176801723539, + "grad_norm": 12.256596198909342, + "learning_rate": 4.9793094961638585e-05, + "loss": 2.1743, + "mean_token_accuracy": 0.5000000119209289, + "step": 90390 + }, + { + "epoch": 0.09104680407033956, + "grad_norm": 11.070599545868937, + "learning_rate": 4.979304425853361e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.40344828367233276, + "step": 90395 + }, + { + "epoch": 0.09105184012344374, + "grad_norm": 12.587351358309391, + "learning_rate": 4.979299354924558e-05, + "loss": 2.6132, + "mean_token_accuracy": 0.42758620977401735, + "step": 90400 + }, + { + "epoch": 0.09105687617654791, + "grad_norm": 10.271802623825817, + "learning_rate": 4.97929428337745e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.41034482717514037, + "step": 90405 + }, + { + "epoch": 0.09106191222965208, + "grad_norm": 9.849203672541021, + "learning_rate": 4.979289211212039e-05, + "loss": 2.2003, + "mean_token_accuracy": 0.4448275864124298, + "step": 90410 + }, + { + "epoch": 0.09106694828275626, + "grad_norm": 9.15943413962576, + "learning_rate": 4.979284138428327e-05, + "loss": 2.2174, + "mean_token_accuracy": 0.43793103098869324, + "step": 90415 + }, + { + "epoch": 0.09107198433586043, + "grad_norm": 9.932951153609872, + "learning_rate": 4.9792790650263136e-05, + "loss": 2.1313, + "mean_token_accuracy": 0.46896552443504336, + "step": 90420 + }, + { + "epoch": 0.09107702038896459, + "grad_norm": 11.427666093479466, + "learning_rate": 4.979273991006003e-05, + "loss": 2.9135, + "mean_token_accuracy": 0.39794313311576845, + "step": 90425 + }, + { + "epoch": 0.09108205644206877, + "grad_norm": 10.427592659859615, + "learning_rate": 4.9792689163673937e-05, + "loss": 2.2687, + "mean_token_accuracy": 0.37586206793785093, + "step": 90430 + }, + { + "epoch": 0.09108709249517294, + "grad_norm": 12.109536432537869, + "learning_rate": 4.979263841110488e-05, + "loss": 2.1345, + "mean_token_accuracy": 0.4845735013484955, + "step": 90435 + }, + { + "epoch": 0.09109212854827711, + "grad_norm": 12.145734702257682, + "learning_rate": 4.979258765235289e-05, + "loss": 2.341, + "mean_token_accuracy": 0.4379310369491577, + "step": 90440 + }, + { + "epoch": 0.09109716460138129, + "grad_norm": 10.95845020122772, + "learning_rate": 4.9792536887417953e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.42413793206214906, + "step": 90445 + }, + { + "epoch": 0.09110220065448546, + "grad_norm": 10.76244480901666, + "learning_rate": 4.979248611630011e-05, + "loss": 2.7237, + "mean_token_accuracy": 0.36206896007061007, + "step": 90450 + }, + { + "epoch": 0.09110723670758963, + "grad_norm": 12.123624168753311, + "learning_rate": 4.9792435338999354e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.4517241299152374, + "step": 90455 + }, + { + "epoch": 0.09111227276069381, + "grad_norm": 13.433875712063838, + "learning_rate": 4.979238455551571e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.4448275864124298, + "step": 90460 + }, + { + "epoch": 0.09111730881379798, + "grad_norm": 11.954055079329189, + "learning_rate": 4.9792333765849194e-05, + "loss": 2.1803, + "mean_token_accuracy": 0.4482758641242981, + "step": 90465 + }, + { + "epoch": 0.09112234486690216, + "grad_norm": 11.822011253803288, + "learning_rate": 4.979228296999981e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.4379310369491577, + "step": 90470 + }, + { + "epoch": 0.09112738092000633, + "grad_norm": 11.42334023357203, + "learning_rate": 4.979223216796758e-05, + "loss": 2.7535, + "mean_token_accuracy": 0.39655172228813174, + "step": 90475 + }, + { + "epoch": 0.0911324169731105, + "grad_norm": 13.134017112740706, + "learning_rate": 4.979218135975253e-05, + "loss": 2.4668, + "mean_token_accuracy": 0.39655172526836396, + "step": 90480 + }, + { + "epoch": 0.09113745302621468, + "grad_norm": 11.330034704872727, + "learning_rate": 4.979213054535464e-05, + "loss": 2.6321, + "mean_token_accuracy": 0.38965516686439516, + "step": 90485 + }, + { + "epoch": 0.09114248907931885, + "grad_norm": 11.917819059138314, + "learning_rate": 4.979207972477395e-05, + "loss": 2.532, + "mean_token_accuracy": 0.4172413766384125, + "step": 90490 + }, + { + "epoch": 0.09114752513242301, + "grad_norm": 13.918984551781895, + "learning_rate": 4.979202889801047e-05, + "loss": 2.632, + "mean_token_accuracy": 0.41034482717514037, + "step": 90495 + }, + { + "epoch": 0.09115256118552718, + "grad_norm": 10.06744244741391, + "learning_rate": 4.979197806506421e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.4184512972831726, + "step": 90500 + }, + { + "epoch": 0.09115759723863136, + "grad_norm": 9.427658596157837, + "learning_rate": 4.979192722593519e-05, + "loss": 1.9898, + "mean_token_accuracy": 0.5332728505134583, + "step": 90505 + }, + { + "epoch": 0.09116263329173553, + "grad_norm": 10.294488142990467, + "learning_rate": 4.9791876380623424e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.417241370677948, + "step": 90510 + }, + { + "epoch": 0.0911676693448397, + "grad_norm": 10.380071947045385, + "learning_rate": 4.979182552912892e-05, + "loss": 2.0485, + "mean_token_accuracy": 0.4931034505367279, + "step": 90515 + }, + { + "epoch": 0.09117270539794388, + "grad_norm": 13.95284837348923, + "learning_rate": 4.979177467145169e-05, + "loss": 2.9467, + "mean_token_accuracy": 0.37241379618644715, + "step": 90520 + }, + { + "epoch": 0.09117774145104805, + "grad_norm": 13.305801505372806, + "learning_rate": 4.979172380759177e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.5034482717514038, + "step": 90525 + }, + { + "epoch": 0.09118277750415223, + "grad_norm": 11.51578153973372, + "learning_rate": 4.979167293754914e-05, + "loss": 2.7472, + "mean_token_accuracy": 0.41379310488700866, + "step": 90530 + }, + { + "epoch": 0.0911878135572564, + "grad_norm": 14.005653518694388, + "learning_rate": 4.9791622061323837e-05, + "loss": 2.1233, + "mean_token_accuracy": 0.4482758641242981, + "step": 90535 + }, + { + "epoch": 0.09119284961036057, + "grad_norm": 16.603290507121553, + "learning_rate": 4.979157117891587e-05, + "loss": 2.33, + "mean_token_accuracy": 0.44827585816383364, + "step": 90540 + }, + { + "epoch": 0.09119788566346475, + "grad_norm": 19.344976339485882, + "learning_rate": 4.979152029032525e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4206896424293518, + "step": 90545 + }, + { + "epoch": 0.09120292171656892, + "grad_norm": 12.532908099895122, + "learning_rate": 4.9791469395552e-05, + "loss": 2.7797, + "mean_token_accuracy": 0.3482758641242981, + "step": 90550 + }, + { + "epoch": 0.0912079577696731, + "grad_norm": 11.00086755507857, + "learning_rate": 4.9791418494596126e-05, + "loss": 2.742, + "mean_token_accuracy": 0.32413792610168457, + "step": 90555 + }, + { + "epoch": 0.09121299382277727, + "grad_norm": 12.889850960941862, + "learning_rate": 4.9791367587457636e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.39310344457626345, + "step": 90560 + }, + { + "epoch": 0.09121802987588143, + "grad_norm": 8.803262891234478, + "learning_rate": 4.9791316674136564e-05, + "loss": 2.466, + "mean_token_accuracy": 0.41379310488700866, + "step": 90565 + }, + { + "epoch": 0.0912230659289856, + "grad_norm": 9.790388551810393, + "learning_rate": 4.9791265754632904e-05, + "loss": 2.2316, + "mean_token_accuracy": 0.4620689690113068, + "step": 90570 + }, + { + "epoch": 0.09122810198208978, + "grad_norm": 11.23008474108112, + "learning_rate": 4.9791214828946676e-05, + "loss": 2.413, + "mean_token_accuracy": 0.4620689690113068, + "step": 90575 + }, + { + "epoch": 0.09123313803519395, + "grad_norm": 10.128532772528109, + "learning_rate": 4.97911638970779e-05, + "loss": 2.6874, + "mean_token_accuracy": 0.4344827622175217, + "step": 90580 + }, + { + "epoch": 0.09123817408829812, + "grad_norm": 9.292320863219803, + "learning_rate": 4.9791112959026584e-05, + "loss": 2.2474, + "mean_token_accuracy": 0.42413792610168455, + "step": 90585 + }, + { + "epoch": 0.0912432101414023, + "grad_norm": 13.28177556868752, + "learning_rate": 4.9791062014792754e-05, + "loss": 2.891, + "mean_token_accuracy": 0.42202056646347047, + "step": 90590 + }, + { + "epoch": 0.09124824619450647, + "grad_norm": 9.513147556239415, + "learning_rate": 4.979101106437641e-05, + "loss": 2.4266, + "mean_token_accuracy": 0.39310345649719236, + "step": 90595 + }, + { + "epoch": 0.09125328224761065, + "grad_norm": 13.834591506315695, + "learning_rate": 4.9790960107777565e-05, + "loss": 2.593, + "mean_token_accuracy": 0.4103448212146759, + "step": 90600 + }, + { + "epoch": 0.09125831830071482, + "grad_norm": 13.569034296855074, + "learning_rate": 4.979090914499624e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.3931034505367279, + "step": 90605 + }, + { + "epoch": 0.09126335435381899, + "grad_norm": 13.022871098093944, + "learning_rate": 4.979085817603245e-05, + "loss": 2.5268, + "mean_token_accuracy": 0.4344827651977539, + "step": 90610 + }, + { + "epoch": 0.09126839040692317, + "grad_norm": 9.828888983216695, + "learning_rate": 4.9790807200886213e-05, + "loss": 2.0444, + "mean_token_accuracy": 0.45517241954803467, + "step": 90615 + }, + { + "epoch": 0.09127342646002734, + "grad_norm": 12.827386655104135, + "learning_rate": 4.979075621955753e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.4344827651977539, + "step": 90620 + }, + { + "epoch": 0.09127846251313151, + "grad_norm": 14.577570724483381, + "learning_rate": 4.979070523204642e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.3758620649576187, + "step": 90625 + }, + { + "epoch": 0.09128349856623569, + "grad_norm": 10.604168408774838, + "learning_rate": 4.979065423835292e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.47586206793785096, + "step": 90630 + }, + { + "epoch": 0.09128853461933985, + "grad_norm": 17.039432442128064, + "learning_rate": 4.979060323847701e-05, + "loss": 2.9704, + "mean_token_accuracy": 0.38965516686439516, + "step": 90635 + }, + { + "epoch": 0.09129357067244402, + "grad_norm": 11.654590768566218, + "learning_rate": 4.9790552232418705e-05, + "loss": 3.0543, + "mean_token_accuracy": 0.3758620619773865, + "step": 90640 + }, + { + "epoch": 0.0912986067255482, + "grad_norm": 10.132216565602302, + "learning_rate": 4.9790501220178056e-05, + "loss": 2.2722, + "mean_token_accuracy": 0.4413793206214905, + "step": 90645 + }, + { + "epoch": 0.09130364277865237, + "grad_norm": 8.865474886685172, + "learning_rate": 4.979045020175504e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.39655172228813174, + "step": 90650 + }, + { + "epoch": 0.09130867883175654, + "grad_norm": 14.298358312157104, + "learning_rate": 4.9790399177149684e-05, + "loss": 2.4501, + "mean_token_accuracy": 0.4137930929660797, + "step": 90655 + }, + { + "epoch": 0.09131371488486072, + "grad_norm": 9.560555186015067, + "learning_rate": 4.9790348146362e-05, + "loss": 2.5912, + "mean_token_accuracy": 0.4344827592372894, + "step": 90660 + }, + { + "epoch": 0.09131875093796489, + "grad_norm": 10.178985971743959, + "learning_rate": 4.9790297109392017e-05, + "loss": 2.2362, + "mean_token_accuracy": 0.41034482717514037, + "step": 90665 + }, + { + "epoch": 0.09132378699106906, + "grad_norm": 13.399232515024941, + "learning_rate": 4.9790246066239734e-05, + "loss": 2.1666, + "mean_token_accuracy": 0.4620689630508423, + "step": 90670 + }, + { + "epoch": 0.09132882304417324, + "grad_norm": 10.840527955665955, + "learning_rate": 4.979019501690516e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.4689655125141144, + "step": 90675 + }, + { + "epoch": 0.09133385909727741, + "grad_norm": 9.436586919515545, + "learning_rate": 4.979014396138831e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.42413792610168455, + "step": 90680 + }, + { + "epoch": 0.09133889515038159, + "grad_norm": 10.86709552401155, + "learning_rate": 4.9790092899689226e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.4137930989265442, + "step": 90685 + }, + { + "epoch": 0.09134393120348576, + "grad_norm": 10.32896685249154, + "learning_rate": 4.979004183180789e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.42758620977401735, + "step": 90690 + }, + { + "epoch": 0.09134896725658993, + "grad_norm": 10.069455114522174, + "learning_rate": 4.978999075774434e-05, + "loss": 3.1, + "mean_token_accuracy": 0.3310344755649567, + "step": 90695 + }, + { + "epoch": 0.0913540033096941, + "grad_norm": 11.873116990985515, + "learning_rate": 4.978993967749856e-05, + "loss": 3.0076, + "mean_token_accuracy": 0.3275862097740173, + "step": 90700 + }, + { + "epoch": 0.09135903936279827, + "grad_norm": 12.949946246719957, + "learning_rate": 4.978988859107059e-05, + "loss": 2.275, + "mean_token_accuracy": 0.43103447556495667, + "step": 90705 + }, + { + "epoch": 0.09136407541590244, + "grad_norm": 14.766304129031719, + "learning_rate": 4.9789837498460434e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.42068964838981626, + "step": 90710 + }, + { + "epoch": 0.09136911146900661, + "grad_norm": 11.446498943434896, + "learning_rate": 4.9789786399668116e-05, + "loss": 2.0552, + "mean_token_accuracy": 0.41724138855934145, + "step": 90715 + }, + { + "epoch": 0.09137414752211079, + "grad_norm": 10.215952945408292, + "learning_rate": 4.978973529469363e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.4121597111225128, + "step": 90720 + }, + { + "epoch": 0.09137918357521496, + "grad_norm": 12.394624234398494, + "learning_rate": 4.9789684183537015e-05, + "loss": 2.5155, + "mean_token_accuracy": 0.41034482717514037, + "step": 90725 + }, + { + "epoch": 0.09138421962831914, + "grad_norm": 8.94612254831858, + "learning_rate": 4.9789633066198266e-05, + "loss": 2.6316, + "mean_token_accuracy": 0.4275861978530884, + "step": 90730 + }, + { + "epoch": 0.09138925568142331, + "grad_norm": 9.57877828392471, + "learning_rate": 4.9789581942677404e-05, + "loss": 2.7033, + "mean_token_accuracy": 0.3827586233615875, + "step": 90735 + }, + { + "epoch": 0.09139429173452748, + "grad_norm": 9.352924585756302, + "learning_rate": 4.978953081297445e-05, + "loss": 2.1651, + "mean_token_accuracy": 0.4620689511299133, + "step": 90740 + }, + { + "epoch": 0.09139932778763166, + "grad_norm": 13.0342959548774, + "learning_rate": 4.9789479677089404e-05, + "loss": 2.4411, + "mean_token_accuracy": 0.41724138259887694, + "step": 90745 + }, + { + "epoch": 0.09140436384073583, + "grad_norm": 9.575965786675098, + "learning_rate": 4.9789428535022294e-05, + "loss": 2.7624, + "mean_token_accuracy": 0.4156079888343811, + "step": 90750 + }, + { + "epoch": 0.09140939989384, + "grad_norm": 10.256049240278756, + "learning_rate": 4.9789377386773125e-05, + "loss": 2.575, + "mean_token_accuracy": 0.4275862008333206, + "step": 90755 + }, + { + "epoch": 0.09141443594694418, + "grad_norm": 12.969379706655172, + "learning_rate": 4.978932623234191e-05, + "loss": 2.6783, + "mean_token_accuracy": 0.3911675691604614, + "step": 90760 + }, + { + "epoch": 0.09141947200004835, + "grad_norm": 13.497087733534899, + "learning_rate": 4.9789275071728674e-05, + "loss": 2.4845, + "mean_token_accuracy": 0.3758620619773865, + "step": 90765 + }, + { + "epoch": 0.09142450805315253, + "grad_norm": 9.117923971815056, + "learning_rate": 4.978922390493342e-05, + "loss": 2.5062, + "mean_token_accuracy": 0.4689655125141144, + "step": 90770 + }, + { + "epoch": 0.09142954410625669, + "grad_norm": 13.82756726187566, + "learning_rate": 4.978917273195617e-05, + "loss": 2.5924, + "mean_token_accuracy": 0.4482758641242981, + "step": 90775 + }, + { + "epoch": 0.09143458015936086, + "grad_norm": 10.129820451205916, + "learning_rate": 4.978912155279692e-05, + "loss": 2.2354, + "mean_token_accuracy": 0.4413793087005615, + "step": 90780 + }, + { + "epoch": 0.09143961621246503, + "grad_norm": 11.054948340056065, + "learning_rate": 4.978907036745572e-05, + "loss": 2.704, + "mean_token_accuracy": 0.4034482717514038, + "step": 90785 + }, + { + "epoch": 0.0914446522655692, + "grad_norm": 16.174054692559015, + "learning_rate": 4.978901917593255e-05, + "loss": 2.2476, + "mean_token_accuracy": 0.4206896543502808, + "step": 90790 + }, + { + "epoch": 0.09144968831867338, + "grad_norm": 11.188054268859252, + "learning_rate": 4.9788967978227445e-05, + "loss": 2.1996, + "mean_token_accuracy": 0.4931034445762634, + "step": 90795 + }, + { + "epoch": 0.09145472437177755, + "grad_norm": 12.159251287221021, + "learning_rate": 4.978891677434041e-05, + "loss": 2.6476, + "mean_token_accuracy": 0.3275862127542496, + "step": 90800 + }, + { + "epoch": 0.09145976042488173, + "grad_norm": 13.460968075733728, + "learning_rate": 4.978886556427146e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.4034482777118683, + "step": 90805 + }, + { + "epoch": 0.0914647964779859, + "grad_norm": 9.718573725500347, + "learning_rate": 4.978881434802061e-05, + "loss": 1.9663, + "mean_token_accuracy": 0.47586206197738645, + "step": 90810 + }, + { + "epoch": 0.09146983253109008, + "grad_norm": 13.677593013482635, + "learning_rate": 4.978876312558787e-05, + "loss": 2.7577, + "mean_token_accuracy": 0.3586206942796707, + "step": 90815 + }, + { + "epoch": 0.09147486858419425, + "grad_norm": 8.38362387308308, + "learning_rate": 4.978871189697326e-05, + "loss": 2.3008, + "mean_token_accuracy": 0.4310344815254211, + "step": 90820 + }, + { + "epoch": 0.09147990463729842, + "grad_norm": 14.270403772669965, + "learning_rate": 4.97886606621768e-05, + "loss": 2.8973, + "mean_token_accuracy": 0.4137930989265442, + "step": 90825 + }, + { + "epoch": 0.0914849406904026, + "grad_norm": 13.636687567834448, + "learning_rate": 4.978860942119849e-05, + "loss": 2.455, + "mean_token_accuracy": 0.44137930274009707, + "step": 90830 + }, + { + "epoch": 0.09148997674350677, + "grad_norm": 12.822580915724185, + "learning_rate": 4.9788558174038353e-05, + "loss": 2.2571, + "mean_token_accuracy": 0.46551724672317507, + "step": 90835 + }, + { + "epoch": 0.09149501279661094, + "grad_norm": 10.246435403048995, + "learning_rate": 4.9788506920696397e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.37931033968925476, + "step": 90840 + }, + { + "epoch": 0.0915000488497151, + "grad_norm": 9.925755719085261, + "learning_rate": 4.9788455661172644e-05, + "loss": 2.0752, + "mean_token_accuracy": 0.46896552443504336, + "step": 90845 + }, + { + "epoch": 0.09150508490281928, + "grad_norm": 12.083805192232951, + "learning_rate": 4.97884043954671e-05, + "loss": 2.7616, + "mean_token_accuracy": 0.39310344457626345, + "step": 90850 + }, + { + "epoch": 0.09151012095592345, + "grad_norm": 12.131000154951197, + "learning_rate": 4.9788353123579794e-05, + "loss": 2.1764, + "mean_token_accuracy": 0.4448275864124298, + "step": 90855 + }, + { + "epoch": 0.09151515700902763, + "grad_norm": 9.923317496449467, + "learning_rate": 4.978830184551072e-05, + "loss": 2.2892, + "mean_token_accuracy": 0.4137930989265442, + "step": 90860 + }, + { + "epoch": 0.0915201930621318, + "grad_norm": 10.543632140348988, + "learning_rate": 4.978825056125992e-05, + "loss": 2.3153, + "mean_token_accuracy": 0.4379310250282288, + "step": 90865 + }, + { + "epoch": 0.09152522911523597, + "grad_norm": 8.772785049755147, + "learning_rate": 4.9788199270827376e-05, + "loss": 2.1919, + "mean_token_accuracy": 0.4103448212146759, + "step": 90870 + }, + { + "epoch": 0.09153026516834015, + "grad_norm": 12.306312426433067, + "learning_rate": 4.9788147974213115e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.44827587008476255, + "step": 90875 + }, + { + "epoch": 0.09153530122144432, + "grad_norm": 12.371878323458768, + "learning_rate": 4.978809667141715e-05, + "loss": 2.6801, + "mean_token_accuracy": 0.4206896543502808, + "step": 90880 + }, + { + "epoch": 0.0915403372745485, + "grad_norm": 10.801793776274907, + "learning_rate": 4.9788045362439504e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.3827586114406586, + "step": 90885 + }, + { + "epoch": 0.09154537332765267, + "grad_norm": 12.998255239260608, + "learning_rate": 4.978799404728019e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.4068965494632721, + "step": 90890 + }, + { + "epoch": 0.09155040938075684, + "grad_norm": 12.735133937176153, + "learning_rate": 4.978794272593921e-05, + "loss": 2.0186, + "mean_token_accuracy": 0.4816696882247925, + "step": 90895 + }, + { + "epoch": 0.09155544543386102, + "grad_norm": 11.601192044198942, + "learning_rate": 4.97878913984166e-05, + "loss": 3.2055, + "mean_token_accuracy": 0.34827586114406583, + "step": 90900 + }, + { + "epoch": 0.09156048148696519, + "grad_norm": 8.181351658599045, + "learning_rate": 4.978784006471234e-05, + "loss": 2.3685, + "mean_token_accuracy": 0.46061705946922304, + "step": 90905 + }, + { + "epoch": 0.09156551754006936, + "grad_norm": 11.253534394075366, + "learning_rate": 4.978778872482648e-05, + "loss": 2.7065, + "mean_token_accuracy": 0.41554749608039854, + "step": 90910 + }, + { + "epoch": 0.09157055359317352, + "grad_norm": 10.23745815098917, + "learning_rate": 4.978773737875901e-05, + "loss": 2.7288, + "mean_token_accuracy": 0.4344827711582184, + "step": 90915 + }, + { + "epoch": 0.0915755896462777, + "grad_norm": 10.87043324931439, + "learning_rate": 4.9787686026509955e-05, + "loss": 2.1362, + "mean_token_accuracy": 0.4551724135875702, + "step": 90920 + }, + { + "epoch": 0.09158062569938187, + "grad_norm": 13.260782976745794, + "learning_rate": 4.9787634668079325e-05, + "loss": 2.4237, + "mean_token_accuracy": 0.41379310488700866, + "step": 90925 + }, + { + "epoch": 0.09158566175248604, + "grad_norm": 9.924293225660122, + "learning_rate": 4.978758330346714e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.4, + "step": 90930 + }, + { + "epoch": 0.09159069780559022, + "grad_norm": 10.772146633205935, + "learning_rate": 4.9787531932673415e-05, + "loss": 2.3506, + "mean_token_accuracy": 0.39655172228813174, + "step": 90935 + }, + { + "epoch": 0.09159573385869439, + "grad_norm": 11.5605364807525, + "learning_rate": 4.978748055569815e-05, + "loss": 2.829, + "mean_token_accuracy": 0.3827586114406586, + "step": 90940 + }, + { + "epoch": 0.09160076991179857, + "grad_norm": 9.62955078092501, + "learning_rate": 4.978742917254138e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.3827586233615875, + "step": 90945 + }, + { + "epoch": 0.09160580596490274, + "grad_norm": 14.336809030837403, + "learning_rate": 4.97873777832031e-05, + "loss": 2.433, + "mean_token_accuracy": 0.38620689511299133, + "step": 90950 + }, + { + "epoch": 0.09161084201800691, + "grad_norm": 10.668974401281597, + "learning_rate": 4.9787326387683336e-05, + "loss": 2.861, + "mean_token_accuracy": 0.358620685338974, + "step": 90955 + }, + { + "epoch": 0.09161587807111109, + "grad_norm": 10.689576635563803, + "learning_rate": 4.9787274985982094e-05, + "loss": 2.9082, + "mean_token_accuracy": 0.3706594049930573, + "step": 90960 + }, + { + "epoch": 0.09162091412421526, + "grad_norm": 13.111652477133445, + "learning_rate": 4.97872235780994e-05, + "loss": 2.472, + "mean_token_accuracy": 0.4517241358757019, + "step": 90965 + }, + { + "epoch": 0.09162595017731943, + "grad_norm": 11.413221592210212, + "learning_rate": 4.9787172164035255e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.43968542814254763, + "step": 90970 + }, + { + "epoch": 0.09163098623042361, + "grad_norm": 12.329692031597835, + "learning_rate": 4.978712074378969e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.45069570541381837, + "step": 90975 + }, + { + "epoch": 0.09163602228352778, + "grad_norm": 10.289655021961428, + "learning_rate": 4.978706931736269e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.42758620977401735, + "step": 90980 + }, + { + "epoch": 0.09164105833663194, + "grad_norm": 12.282528363652855, + "learning_rate": 4.978701788475431e-05, + "loss": 2.6259, + "mean_token_accuracy": 0.41246218085289, + "step": 90985 + }, + { + "epoch": 0.09164609438973612, + "grad_norm": 13.890347942870086, + "learning_rate": 4.978696644596453e-05, + "loss": 2.5411, + "mean_token_accuracy": 0.44482759237289426, + "step": 90990 + }, + { + "epoch": 0.09165113044284029, + "grad_norm": 11.329850069988508, + "learning_rate": 4.9786915000993384e-05, + "loss": 2.3926, + "mean_token_accuracy": 0.39655172228813174, + "step": 90995 + }, + { + "epoch": 0.09165616649594446, + "grad_norm": 12.067304061637882, + "learning_rate": 4.9786863549840875e-05, + "loss": 2.3705, + "mean_token_accuracy": 0.4586206912994385, + "step": 91000 + }, + { + "epoch": 0.09166120254904864, + "grad_norm": 12.330877014349838, + "learning_rate": 4.978681209250701e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.3947973310947418, + "step": 91005 + }, + { + "epoch": 0.09166623860215281, + "grad_norm": 13.164310074296923, + "learning_rate": 4.978676062899183e-05, + "loss": 2.7087, + "mean_token_accuracy": 0.3482758641242981, + "step": 91010 + }, + { + "epoch": 0.09167127465525698, + "grad_norm": 11.151750244471126, + "learning_rate": 4.9786709159295334e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.4172413766384125, + "step": 91015 + }, + { + "epoch": 0.09167631070836116, + "grad_norm": 9.213784796293915, + "learning_rate": 4.978665768341753e-05, + "loss": 2.4231, + "mean_token_accuracy": 0.4620689630508423, + "step": 91020 + }, + { + "epoch": 0.09168134676146533, + "grad_norm": 10.567450431742312, + "learning_rate": 4.978660620135844e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.42068966031074523, + "step": 91025 + }, + { + "epoch": 0.0916863828145695, + "grad_norm": 9.333370236418645, + "learning_rate": 4.978655471311807e-05, + "loss": 1.9976, + "mean_token_accuracy": 0.5310344755649566, + "step": 91030 + }, + { + "epoch": 0.09169141886767368, + "grad_norm": 8.572559705260733, + "learning_rate": 4.978650321869645e-05, + "loss": 2.599, + "mean_token_accuracy": 0.42262552976608275, + "step": 91035 + }, + { + "epoch": 0.09169645492077785, + "grad_norm": 10.57211996119525, + "learning_rate": 4.978645171809358e-05, + "loss": 2.5638, + "mean_token_accuracy": 0.33793103098869326, + "step": 91040 + }, + { + "epoch": 0.09170149097388203, + "grad_norm": 11.856143308769521, + "learning_rate": 4.978640021130948e-05, + "loss": 2.6072, + "mean_token_accuracy": 0.337931028008461, + "step": 91045 + }, + { + "epoch": 0.0917065270269862, + "grad_norm": 9.570392324391149, + "learning_rate": 4.978634869834417e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.42758620977401735, + "step": 91050 + }, + { + "epoch": 0.09171156308009036, + "grad_norm": 11.430163687315764, + "learning_rate": 4.9786297179197646e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.37241379022598264, + "step": 91055 + }, + { + "epoch": 0.09171659913319453, + "grad_norm": 10.843737410843842, + "learning_rate": 4.978624565386995e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4310344815254211, + "step": 91060 + }, + { + "epoch": 0.09172163518629871, + "grad_norm": 11.358222977494112, + "learning_rate": 4.978619412236107e-05, + "loss": 2.0878, + "mean_token_accuracy": 0.49999999403953554, + "step": 91065 + }, + { + "epoch": 0.09172667123940288, + "grad_norm": 10.162079198746095, + "learning_rate": 4.9786142584671033e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.39655172228813174, + "step": 91070 + }, + { + "epoch": 0.09173170729250706, + "grad_norm": 14.268007778474047, + "learning_rate": 4.9786091040799845e-05, + "loss": 2.3024, + "mean_token_accuracy": 0.4464004814624786, + "step": 91075 + }, + { + "epoch": 0.09173674334561123, + "grad_norm": 10.606547059993941, + "learning_rate": 4.9786039490747546e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.3896551787853241, + "step": 91080 + }, + { + "epoch": 0.0917417793987154, + "grad_norm": 13.216989838962837, + "learning_rate": 4.978598793451411e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.4344827592372894, + "step": 91085 + }, + { + "epoch": 0.09174681545181958, + "grad_norm": 11.276264329114268, + "learning_rate": 4.978593637209958e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.4068965554237366, + "step": 91090 + }, + { + "epoch": 0.09175185150492375, + "grad_norm": 12.688798500865797, + "learning_rate": 4.978588480350397e-05, + "loss": 2.697, + "mean_token_accuracy": 0.4087719261646271, + "step": 91095 + }, + { + "epoch": 0.09175688755802792, + "grad_norm": 12.388882303348021, + "learning_rate": 4.978583322872727e-05, + "loss": 2.8277, + "mean_token_accuracy": 0.37586207389831544, + "step": 91100 + }, + { + "epoch": 0.0917619236111321, + "grad_norm": 10.238108676625371, + "learning_rate": 4.9785781647769525e-05, + "loss": 2.2034, + "mean_token_accuracy": 0.4746521532535553, + "step": 91105 + }, + { + "epoch": 0.09176695966423627, + "grad_norm": 10.688883014791124, + "learning_rate": 4.978573006063072e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.38965516686439516, + "step": 91110 + }, + { + "epoch": 0.09177199571734045, + "grad_norm": 10.101650083739493, + "learning_rate": 4.97856784673109e-05, + "loss": 2.6327, + "mean_token_accuracy": 0.4068965554237366, + "step": 91115 + }, + { + "epoch": 0.09177703177044462, + "grad_norm": 10.954479480035626, + "learning_rate": 4.978562686781007e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.39655172228813174, + "step": 91120 + }, + { + "epoch": 0.09178206782354878, + "grad_norm": 11.7706401268676, + "learning_rate": 4.9785575262128215e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.4068965554237366, + "step": 91125 + }, + { + "epoch": 0.09178710387665295, + "grad_norm": 10.888179909089782, + "learning_rate": 4.9785523650265394e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.3655172407627106, + "step": 91130 + }, + { + "epoch": 0.09179213992975713, + "grad_norm": 8.097531004249877, + "learning_rate": 4.9785472032221584e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.4020568609237671, + "step": 91135 + }, + { + "epoch": 0.0917971759828613, + "grad_norm": 9.791141932951607, + "learning_rate": 4.9785420407996825e-05, + "loss": 2.8971, + "mean_token_accuracy": 0.34137930572032926, + "step": 91140 + }, + { + "epoch": 0.09180221203596547, + "grad_norm": 15.024615932269995, + "learning_rate": 4.978536877759112e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.43103448748588563, + "step": 91145 + }, + { + "epoch": 0.09180724808906965, + "grad_norm": 11.452345941900216, + "learning_rate": 4.9785317141004476e-05, + "loss": 3.0245, + "mean_token_accuracy": 0.3620689630508423, + "step": 91150 + }, + { + "epoch": 0.09181228414217382, + "grad_norm": 9.576852377368313, + "learning_rate": 4.9785265498236926e-05, + "loss": 2.1225, + "mean_token_accuracy": 0.47931034564971925, + "step": 91155 + }, + { + "epoch": 0.091817320195278, + "grad_norm": 10.715093676823798, + "learning_rate": 4.978521384928847e-05, + "loss": 2.5336, + "mean_token_accuracy": 0.3793103516101837, + "step": 91160 + }, + { + "epoch": 0.09182235624838217, + "grad_norm": 11.649782628733607, + "learning_rate": 4.978516219415913e-05, + "loss": 2.6976, + "mean_token_accuracy": 0.39443435668945315, + "step": 91165 + }, + { + "epoch": 0.09182739230148634, + "grad_norm": 11.257245827100283, + "learning_rate": 4.978511053284892e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.47241379618644713, + "step": 91170 + }, + { + "epoch": 0.09183242835459052, + "grad_norm": 12.138631260258938, + "learning_rate": 4.9785058865357845e-05, + "loss": 2.2924, + "mean_token_accuracy": 0.4724137902259827, + "step": 91175 + }, + { + "epoch": 0.09183746440769469, + "grad_norm": 11.678413161577193, + "learning_rate": 4.9785007191685925e-05, + "loss": 2.2639, + "mean_token_accuracy": 0.4413793087005615, + "step": 91180 + }, + { + "epoch": 0.09184250046079886, + "grad_norm": 9.64401156495291, + "learning_rate": 4.978495551183318e-05, + "loss": 2.6117, + "mean_token_accuracy": 0.441379314661026, + "step": 91185 + }, + { + "epoch": 0.09184753651390304, + "grad_norm": 17.846222257789034, + "learning_rate": 4.978490382579962e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.44482758045196535, + "step": 91190 + }, + { + "epoch": 0.0918525725670072, + "grad_norm": 10.405443464258369, + "learning_rate": 4.9784852133585255e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.4172413766384125, + "step": 91195 + }, + { + "epoch": 0.09185760862011137, + "grad_norm": 9.00877648839713, + "learning_rate": 4.9784800435190104e-05, + "loss": 2.5032, + "mean_token_accuracy": 0.4172413796186447, + "step": 91200 + }, + { + "epoch": 0.09186264467321555, + "grad_norm": 11.719650160135206, + "learning_rate": 4.978474873061419e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.46382336020469667, + "step": 91205 + }, + { + "epoch": 0.09186768072631972, + "grad_norm": 18.081283107055093, + "learning_rate": 4.97846970198575e-05, + "loss": 2.5881, + "mean_token_accuracy": 0.38620689511299133, + "step": 91210 + }, + { + "epoch": 0.0918727167794239, + "grad_norm": 12.122610555969581, + "learning_rate": 4.978464530292008e-05, + "loss": 2.3651, + "mean_token_accuracy": 0.42758620381355283, + "step": 91215 + }, + { + "epoch": 0.09187775283252807, + "grad_norm": 10.66503984841771, + "learning_rate": 4.978459357980192e-05, + "loss": 2.6304, + "mean_token_accuracy": 0.3655172407627106, + "step": 91220 + }, + { + "epoch": 0.09188278888563224, + "grad_norm": 9.235295270831163, + "learning_rate": 4.9784541850503055e-05, + "loss": 2.194, + "mean_token_accuracy": 0.46896551847457885, + "step": 91225 + }, + { + "epoch": 0.09188782493873641, + "grad_norm": 10.517772011518831, + "learning_rate": 4.978449011502348e-05, + "loss": 2.2202, + "mean_token_accuracy": 0.4551724076271057, + "step": 91230 + }, + { + "epoch": 0.09189286099184059, + "grad_norm": 12.460092814106488, + "learning_rate": 4.978443837336322e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.36896551847457887, + "step": 91235 + }, + { + "epoch": 0.09189789704494476, + "grad_norm": 9.828437523843268, + "learning_rate": 4.97843866255223e-05, + "loss": 2.431, + "mean_token_accuracy": 0.46896551847457885, + "step": 91240 + }, + { + "epoch": 0.09190293309804894, + "grad_norm": 8.31641743333721, + "learning_rate": 4.9784334871500704e-05, + "loss": 2.0964, + "mean_token_accuracy": 0.48275862336158754, + "step": 91245 + }, + { + "epoch": 0.09190796915115311, + "grad_norm": 12.874378177323996, + "learning_rate": 4.978428311129847e-05, + "loss": 2.4202, + "mean_token_accuracy": 0.37586207389831544, + "step": 91250 + }, + { + "epoch": 0.09191300520425728, + "grad_norm": 10.731294427394966, + "learning_rate": 4.978423134491562e-05, + "loss": 2.1245, + "mean_token_accuracy": 0.47241378426551817, + "step": 91255 + }, + { + "epoch": 0.09191804125736146, + "grad_norm": 9.52657583058018, + "learning_rate": 4.978417957235215e-05, + "loss": 2.302, + "mean_token_accuracy": 0.4931034445762634, + "step": 91260 + }, + { + "epoch": 0.09192307731046562, + "grad_norm": 12.513974946585465, + "learning_rate": 4.978412779360807e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.3999999940395355, + "step": 91265 + }, + { + "epoch": 0.09192811336356979, + "grad_norm": 9.77231374462633, + "learning_rate": 4.978407600868341e-05, + "loss": 2.2763, + "mean_token_accuracy": 0.4448275864124298, + "step": 91270 + }, + { + "epoch": 0.09193314941667396, + "grad_norm": 9.313448215191945, + "learning_rate": 4.978402421757818e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.4448275864124298, + "step": 91275 + }, + { + "epoch": 0.09193818546977814, + "grad_norm": 10.76638902821825, + "learning_rate": 4.978397242029239e-05, + "loss": 2.3406, + "mean_token_accuracy": 0.4396249234676361, + "step": 91280 + }, + { + "epoch": 0.09194322152288231, + "grad_norm": 13.039744634128711, + "learning_rate": 4.978392061682605e-05, + "loss": 2.8103, + "mean_token_accuracy": 0.42068966031074523, + "step": 91285 + }, + { + "epoch": 0.09194825757598649, + "grad_norm": 13.512668424616919, + "learning_rate": 4.9783868807179195e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.4137930989265442, + "step": 91290 + }, + { + "epoch": 0.09195329362909066, + "grad_norm": 9.212437342648288, + "learning_rate": 4.978381699135182e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.4551724135875702, + "step": 91295 + }, + { + "epoch": 0.09195832968219483, + "grad_norm": 10.786412347645207, + "learning_rate": 4.978376516934394e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.3862068891525269, + "step": 91300 + }, + { + "epoch": 0.09196336573529901, + "grad_norm": 10.37327827270894, + "learning_rate": 4.978371334115559e-05, + "loss": 2.7917, + "mean_token_accuracy": 0.358620685338974, + "step": 91305 + }, + { + "epoch": 0.09196840178840318, + "grad_norm": 11.51002076350022, + "learning_rate": 4.978366150678675e-05, + "loss": 2.4022, + "mean_token_accuracy": 0.4137930989265442, + "step": 91310 + }, + { + "epoch": 0.09197343784150736, + "grad_norm": 11.841280611853335, + "learning_rate": 4.9783609666237465e-05, + "loss": 2.82, + "mean_token_accuracy": 0.37931033968925476, + "step": 91315 + }, + { + "epoch": 0.09197847389461153, + "grad_norm": 10.058080052906332, + "learning_rate": 4.9783557819507735e-05, + "loss": 2.9309, + "mean_token_accuracy": 0.3862068891525269, + "step": 91320 + }, + { + "epoch": 0.0919835099477157, + "grad_norm": 9.902150094797275, + "learning_rate": 4.978350596659758e-05, + "loss": 2.6271, + "mean_token_accuracy": 0.36896551251411436, + "step": 91325 + }, + { + "epoch": 0.09198854600081988, + "grad_norm": 9.574789530730675, + "learning_rate": 4.9783454107507003e-05, + "loss": 2.4316, + "mean_token_accuracy": 0.37586207389831544, + "step": 91330 + }, + { + "epoch": 0.09199358205392404, + "grad_norm": 12.645337941566169, + "learning_rate": 4.978340224223604e-05, + "loss": 2.6544, + "mean_token_accuracy": 0.37586206793785093, + "step": 91335 + }, + { + "epoch": 0.09199861810702821, + "grad_norm": 10.022772676418775, + "learning_rate": 4.9783350370784687e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.41034482717514037, + "step": 91340 + }, + { + "epoch": 0.09200365416013238, + "grad_norm": 13.479686085998255, + "learning_rate": 4.978329849315295e-05, + "loss": 2.6592, + "mean_token_accuracy": 0.3965517282485962, + "step": 91345 + }, + { + "epoch": 0.09200869021323656, + "grad_norm": 11.761702815337637, + "learning_rate": 4.978324660934088e-05, + "loss": 2.7515, + "mean_token_accuracy": 0.36896551251411436, + "step": 91350 + }, + { + "epoch": 0.09201372626634073, + "grad_norm": 11.791522341050483, + "learning_rate": 4.978319471934845e-05, + "loss": 3.1241, + "mean_token_accuracy": 0.3379310339689255, + "step": 91355 + }, + { + "epoch": 0.0920187623194449, + "grad_norm": 9.399245591651717, + "learning_rate": 4.97831428231757e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.42891711592674253, + "step": 91360 + }, + { + "epoch": 0.09202379837254908, + "grad_norm": 10.27640859674723, + "learning_rate": 4.9783090920822636e-05, + "loss": 2.0546, + "mean_token_accuracy": 0.4862069070339203, + "step": 91365 + }, + { + "epoch": 0.09202883442565325, + "grad_norm": 12.845246502024796, + "learning_rate": 4.978303901228928e-05, + "loss": 2.536, + "mean_token_accuracy": 0.39310344457626345, + "step": 91370 + }, + { + "epoch": 0.09203387047875743, + "grad_norm": 11.126769991830244, + "learning_rate": 4.9782987097575626e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.4413793087005615, + "step": 91375 + }, + { + "epoch": 0.0920389065318616, + "grad_norm": 11.790111470068537, + "learning_rate": 4.978293517668172e-05, + "loss": 2.558, + "mean_token_accuracy": 0.39655172228813174, + "step": 91380 + }, + { + "epoch": 0.09204394258496577, + "grad_norm": 8.722133410112265, + "learning_rate": 4.9782883249607546e-05, + "loss": 2.0106, + "mean_token_accuracy": 0.49655171036720275, + "step": 91385 + }, + { + "epoch": 0.09204897863806995, + "grad_norm": 11.546907510635183, + "learning_rate": 4.978283131635314e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.4103448331356049, + "step": 91390 + }, + { + "epoch": 0.09205401469117412, + "grad_norm": 16.525178089386465, + "learning_rate": 4.97827793769185e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.45335754156112673, + "step": 91395 + }, + { + "epoch": 0.0920590507442783, + "grad_norm": 11.611752325724158, + "learning_rate": 4.978272743130364e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4206896543502808, + "step": 91400 + }, + { + "epoch": 0.09206408679738246, + "grad_norm": 11.816883859746438, + "learning_rate": 4.97826754795086e-05, + "loss": 2.4548, + "mean_token_accuracy": 0.42758620381355283, + "step": 91405 + }, + { + "epoch": 0.09206912285048663, + "grad_norm": 9.839824991648943, + "learning_rate": 4.978262352153337e-05, + "loss": 2.7432, + "mean_token_accuracy": 0.3827586233615875, + "step": 91410 + }, + { + "epoch": 0.0920741589035908, + "grad_norm": 12.816350938713398, + "learning_rate": 4.978257155737797e-05, + "loss": 2.458, + "mean_token_accuracy": 0.42413793206214906, + "step": 91415 + }, + { + "epoch": 0.09207919495669498, + "grad_norm": 10.681758742236385, + "learning_rate": 4.978251958704242e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.39655173420906065, + "step": 91420 + }, + { + "epoch": 0.09208423100979915, + "grad_norm": 10.462061438455038, + "learning_rate": 4.978246761052672e-05, + "loss": 2.1847, + "mean_token_accuracy": 0.4862068831920624, + "step": 91425 + }, + { + "epoch": 0.09208926706290332, + "grad_norm": 9.390953316882701, + "learning_rate": 4.9782415627830904e-05, + "loss": 2.3809, + "mean_token_accuracy": 0.4137930989265442, + "step": 91430 + }, + { + "epoch": 0.0920943031160075, + "grad_norm": 10.039678233060283, + "learning_rate": 4.978236363895498e-05, + "loss": 2.4045, + "mean_token_accuracy": 0.4379310429096222, + "step": 91435 + }, + { + "epoch": 0.09209933916911167, + "grad_norm": 13.06541766087045, + "learning_rate": 4.978231164389895e-05, + "loss": 2.0001, + "mean_token_accuracy": 0.47241378426551817, + "step": 91440 + }, + { + "epoch": 0.09210437522221585, + "grad_norm": 9.28070298066796, + "learning_rate": 4.9782259642662844e-05, + "loss": 2.2202, + "mean_token_accuracy": 0.44343616962432864, + "step": 91445 + }, + { + "epoch": 0.09210941127532002, + "grad_norm": 11.242640551191501, + "learning_rate": 4.978220763524666e-05, + "loss": 2.6348, + "mean_token_accuracy": 0.38106473684310915, + "step": 91450 + }, + { + "epoch": 0.09211444732842419, + "grad_norm": 10.331647111100947, + "learning_rate": 4.978215562165043e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.41034482717514037, + "step": 91455 + }, + { + "epoch": 0.09211948338152837, + "grad_norm": 10.853816320991227, + "learning_rate": 4.978210360187416e-05, + "loss": 2.4833, + "mean_token_accuracy": 0.4379310369491577, + "step": 91460 + }, + { + "epoch": 0.09212451943463254, + "grad_norm": 10.435375444345308, + "learning_rate": 4.978205157591787e-05, + "loss": 2.1422, + "mean_token_accuracy": 0.42758620381355283, + "step": 91465 + }, + { + "epoch": 0.09212955548773671, + "grad_norm": 12.174594495056327, + "learning_rate": 4.978199954378157e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.38620689511299133, + "step": 91470 + }, + { + "epoch": 0.09213459154084087, + "grad_norm": 11.430912925076578, + "learning_rate": 4.978194750546527e-05, + "loss": 2.3814, + "mean_token_accuracy": 0.44482758045196535, + "step": 91475 + }, + { + "epoch": 0.09213962759394505, + "grad_norm": 11.990234621061962, + "learning_rate": 4.978189546096899e-05, + "loss": 2.5902, + "mean_token_accuracy": 0.4297035574913025, + "step": 91480 + }, + { + "epoch": 0.09214466364704922, + "grad_norm": 12.729142575973905, + "learning_rate": 4.978184341029275e-05, + "loss": 2.3773, + "mean_token_accuracy": 0.441379314661026, + "step": 91485 + }, + { + "epoch": 0.0921496997001534, + "grad_norm": 10.29821072746131, + "learning_rate": 4.978179135343654e-05, + "loss": 2.2013, + "mean_token_accuracy": 0.4551724135875702, + "step": 91490 + }, + { + "epoch": 0.09215473575325757, + "grad_norm": 11.741602705842391, + "learning_rate": 4.9781739290400406e-05, + "loss": 2.8711, + "mean_token_accuracy": 0.31034482419490816, + "step": 91495 + }, + { + "epoch": 0.09215977180636174, + "grad_norm": 10.201847294756833, + "learning_rate": 4.978168722118435e-05, + "loss": 2.2467, + "mean_token_accuracy": 0.4482758641242981, + "step": 91500 + }, + { + "epoch": 0.09216480785946592, + "grad_norm": 8.356342195898398, + "learning_rate": 4.978163514578837e-05, + "loss": 1.9762, + "mean_token_accuracy": 0.5068965435028077, + "step": 91505 + }, + { + "epoch": 0.09216984391257009, + "grad_norm": 11.703853903717821, + "learning_rate": 4.978158306421251e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.38965516686439516, + "step": 91510 + }, + { + "epoch": 0.09217487996567426, + "grad_norm": 12.911967002653734, + "learning_rate": 4.9781530976456756e-05, + "loss": 2.447, + "mean_token_accuracy": 0.3793103456497192, + "step": 91515 + }, + { + "epoch": 0.09217991601877844, + "grad_norm": 13.121119445270315, + "learning_rate": 4.978147888252115e-05, + "loss": 2.3268, + "mean_token_accuracy": 0.4551724135875702, + "step": 91520 + }, + { + "epoch": 0.09218495207188261, + "grad_norm": 10.455021403239186, + "learning_rate": 4.978142678240569e-05, + "loss": 2.6135, + "mean_token_accuracy": 0.358620685338974, + "step": 91525 + }, + { + "epoch": 0.09218998812498679, + "grad_norm": 9.56973498370362, + "learning_rate": 4.9781374676110385e-05, + "loss": 2.7487, + "mean_token_accuracy": 0.36551724672317504, + "step": 91530 + }, + { + "epoch": 0.09219502417809096, + "grad_norm": 13.85458236160237, + "learning_rate": 4.978132256363527e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.458620685338974, + "step": 91535 + }, + { + "epoch": 0.09220006023119513, + "grad_norm": 8.693696820227887, + "learning_rate": 4.978127044498034e-05, + "loss": 1.8794, + "mean_token_accuracy": 0.49655171632766726, + "step": 91540 + }, + { + "epoch": 0.09220509628429929, + "grad_norm": 11.20318368926345, + "learning_rate": 4.9781218320145616e-05, + "loss": 2.2114, + "mean_token_accuracy": 0.44482759237289426, + "step": 91545 + }, + { + "epoch": 0.09221013233740347, + "grad_norm": 10.661506949574862, + "learning_rate": 4.978116618913111e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.4801724135875702, + "step": 91550 + }, + { + "epoch": 0.09221516839050764, + "grad_norm": 11.27391694580578, + "learning_rate": 4.9781114051936843e-05, + "loss": 2.5363, + "mean_token_accuracy": 0.3551724076271057, + "step": 91555 + }, + { + "epoch": 0.09222020444361181, + "grad_norm": 11.420019388772344, + "learning_rate": 4.9781061908562834e-05, + "loss": 2.7347, + "mean_token_accuracy": 0.38620689511299133, + "step": 91560 + }, + { + "epoch": 0.09222524049671599, + "grad_norm": 11.98924588703446, + "learning_rate": 4.978100975900908e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.3551724076271057, + "step": 91565 + }, + { + "epoch": 0.09223027654982016, + "grad_norm": 9.197592441243513, + "learning_rate": 4.978095760327561e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.4275862008333206, + "step": 91570 + }, + { + "epoch": 0.09223531260292434, + "grad_norm": 12.51261344716163, + "learning_rate": 4.9780905441362424e-05, + "loss": 2.4994, + "mean_token_accuracy": 0.4379310369491577, + "step": 91575 + }, + { + "epoch": 0.09224034865602851, + "grad_norm": 12.749508383699188, + "learning_rate": 4.9780853273269554e-05, + "loss": 2.173, + "mean_token_accuracy": 0.46896552443504336, + "step": 91580 + }, + { + "epoch": 0.09224538470913268, + "grad_norm": 10.787347778131876, + "learning_rate": 4.9780801098997e-05, + "loss": 2.238, + "mean_token_accuracy": 0.49051724672317504, + "step": 91585 + }, + { + "epoch": 0.09225042076223686, + "grad_norm": 9.35630598120748, + "learning_rate": 4.978074891854479e-05, + "loss": 2.0199, + "mean_token_accuracy": 0.458620685338974, + "step": 91590 + }, + { + "epoch": 0.09225545681534103, + "grad_norm": 10.885274669303373, + "learning_rate": 4.978069673191292e-05, + "loss": 2.8517, + "mean_token_accuracy": 0.4034482777118683, + "step": 91595 + }, + { + "epoch": 0.0922604928684452, + "grad_norm": 9.616921929781528, + "learning_rate": 4.978064453910143e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.4103448331356049, + "step": 91600 + }, + { + "epoch": 0.09226552892154938, + "grad_norm": 10.062089845878605, + "learning_rate": 4.978059234011031e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.38965516686439516, + "step": 91605 + }, + { + "epoch": 0.09227056497465354, + "grad_norm": 10.07272620028625, + "learning_rate": 4.97805401349396e-05, + "loss": 2.1668, + "mean_token_accuracy": 0.4528325110673904, + "step": 91610 + }, + { + "epoch": 0.09227560102775771, + "grad_norm": 14.241190986049615, + "learning_rate": 4.978048792358928e-05, + "loss": 2.9241, + "mean_token_accuracy": 0.3517241388559341, + "step": 91615 + }, + { + "epoch": 0.09228063708086189, + "grad_norm": 12.457992284389933, + "learning_rate": 4.9780435706059395e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.40157289505004884, + "step": 91620 + }, + { + "epoch": 0.09228567313396606, + "grad_norm": 14.140774760851768, + "learning_rate": 4.978038348234994e-05, + "loss": 2.8097, + "mean_token_accuracy": 0.39479734003543854, + "step": 91625 + }, + { + "epoch": 0.09229070918707023, + "grad_norm": 10.81906840772269, + "learning_rate": 4.978033125246094e-05, + "loss": 2.7073, + "mean_token_accuracy": 0.36551723480224607, + "step": 91630 + }, + { + "epoch": 0.0922957452401744, + "grad_norm": 9.324775650957427, + "learning_rate": 4.978027901639241e-05, + "loss": 2.7326, + "mean_token_accuracy": 0.4206896543502808, + "step": 91635 + }, + { + "epoch": 0.09230078129327858, + "grad_norm": 9.289029355684029, + "learning_rate": 4.978022677414436e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.4172413766384125, + "step": 91640 + }, + { + "epoch": 0.09230581734638275, + "grad_norm": 7.782671014750335, + "learning_rate": 4.97801745257168e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.482758629322052, + "step": 91645 + }, + { + "epoch": 0.09231085339948693, + "grad_norm": 12.689298582568577, + "learning_rate": 4.978012227110976e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.44827587008476255, + "step": 91650 + }, + { + "epoch": 0.0923158894525911, + "grad_norm": 10.974861990713135, + "learning_rate": 4.978007001032324e-05, + "loss": 2.5394, + "mean_token_accuracy": 0.3896551728248596, + "step": 91655 + }, + { + "epoch": 0.09232092550569528, + "grad_norm": 10.28262970441023, + "learning_rate": 4.978001774335727e-05, + "loss": 2.6577, + "mean_token_accuracy": 0.39655172228813174, + "step": 91660 + }, + { + "epoch": 0.09232596155879945, + "grad_norm": 10.440677114680609, + "learning_rate": 4.977996547021184e-05, + "loss": 2.8434, + "mean_token_accuracy": 0.3586206942796707, + "step": 91665 + }, + { + "epoch": 0.09233099761190362, + "grad_norm": 12.068466683882658, + "learning_rate": 4.977991319088698e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.4137930989265442, + "step": 91670 + }, + { + "epoch": 0.0923360336650078, + "grad_norm": 11.875474533938775, + "learning_rate": 4.977986090538271e-05, + "loss": 2.2169, + "mean_token_accuracy": 0.4413793087005615, + "step": 91675 + }, + { + "epoch": 0.09234106971811196, + "grad_norm": 9.975442368986737, + "learning_rate": 4.977980861369903e-05, + "loss": 2.3594, + "mean_token_accuracy": 0.4068965494632721, + "step": 91680 + }, + { + "epoch": 0.09234610577121613, + "grad_norm": 9.282875046921536, + "learning_rate": 4.977975631583596e-05, + "loss": 2.3858, + "mean_token_accuracy": 0.4103448331356049, + "step": 91685 + }, + { + "epoch": 0.0923511418243203, + "grad_norm": 10.018154854933718, + "learning_rate": 4.977970401179353e-05, + "loss": 2.1811, + "mean_token_accuracy": 0.4896551728248596, + "step": 91690 + }, + { + "epoch": 0.09235617787742448, + "grad_norm": 12.707676003404776, + "learning_rate": 4.9779651701571735e-05, + "loss": 2.531, + "mean_token_accuracy": 0.4172413766384125, + "step": 91695 + }, + { + "epoch": 0.09236121393052865, + "grad_norm": 10.944946015227982, + "learning_rate": 4.977959938517059e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.40689654350280763, + "step": 91700 + }, + { + "epoch": 0.09236624998363283, + "grad_norm": 9.27722498234549, + "learning_rate": 4.977954706259011e-05, + "loss": 2.1266, + "mean_token_accuracy": 0.4896551728248596, + "step": 91705 + }, + { + "epoch": 0.092371286036737, + "grad_norm": 11.33304535786818, + "learning_rate": 4.977949473383033e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.4294010877609253, + "step": 91710 + }, + { + "epoch": 0.09237632208984117, + "grad_norm": 8.75964993407579, + "learning_rate": 4.977944239889125e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.41379310488700866, + "step": 91715 + }, + { + "epoch": 0.09238135814294535, + "grad_norm": 12.24738178738062, + "learning_rate": 4.977939005777287e-05, + "loss": 2.7404, + "mean_token_accuracy": 0.3896551728248596, + "step": 91720 + }, + { + "epoch": 0.09238639419604952, + "grad_norm": 8.731044733469064, + "learning_rate": 4.977933771047522e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.4448275864124298, + "step": 91725 + }, + { + "epoch": 0.0923914302491537, + "grad_norm": 10.509514323265288, + "learning_rate": 4.977928535699832e-05, + "loss": 2.7638, + "mean_token_accuracy": 0.3931034505367279, + "step": 91730 + }, + { + "epoch": 0.09239646630225787, + "grad_norm": 8.406831182886414, + "learning_rate": 4.9779232997342176e-05, + "loss": 2.1772, + "mean_token_accuracy": 0.46551724672317507, + "step": 91735 + }, + { + "epoch": 0.09240150235536204, + "grad_norm": 13.073561562939256, + "learning_rate": 4.9779180631506794e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.40344828367233276, + "step": 91740 + }, + { + "epoch": 0.09240653840846622, + "grad_norm": 10.106868256835467, + "learning_rate": 4.9779128259492205e-05, + "loss": 2.3699, + "mean_token_accuracy": 0.44827585220336913, + "step": 91745 + }, + { + "epoch": 0.09241157446157038, + "grad_norm": 11.061674537900139, + "learning_rate": 4.977907588129842e-05, + "loss": 2.5383, + "mean_token_accuracy": 0.4259528160095215, + "step": 91750 + }, + { + "epoch": 0.09241661051467455, + "grad_norm": 11.601144673137227, + "learning_rate": 4.9779023496925446e-05, + "loss": 2.84, + "mean_token_accuracy": 0.358620685338974, + "step": 91755 + }, + { + "epoch": 0.09242164656777872, + "grad_norm": 8.999449245397361, + "learning_rate": 4.9778971106373304e-05, + "loss": 2.51, + "mean_token_accuracy": 0.4344827592372894, + "step": 91760 + }, + { + "epoch": 0.0924266826208829, + "grad_norm": 11.407555044080482, + "learning_rate": 4.9778918709642e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.38620689511299133, + "step": 91765 + }, + { + "epoch": 0.09243171867398707, + "grad_norm": 10.828041041878562, + "learning_rate": 4.977886630673156e-05, + "loss": 2.4427, + "mean_token_accuracy": 0.42413792610168455, + "step": 91770 + }, + { + "epoch": 0.09243675472709124, + "grad_norm": 10.078295495759738, + "learning_rate": 4.9778813897642e-05, + "loss": 2.5963, + "mean_token_accuracy": 0.4103448212146759, + "step": 91775 + }, + { + "epoch": 0.09244179078019542, + "grad_norm": 14.683737954380915, + "learning_rate": 4.9778761482373315e-05, + "loss": 3.0128, + "mean_token_accuracy": 0.33448275923728943, + "step": 91780 + }, + { + "epoch": 0.09244682683329959, + "grad_norm": 12.284099927649612, + "learning_rate": 4.9778709060925545e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.4586206912994385, + "step": 91785 + }, + { + "epoch": 0.09245186288640377, + "grad_norm": 10.951028182152557, + "learning_rate": 4.977865663329868e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.43448275327682495, + "step": 91790 + }, + { + "epoch": 0.09245689893950794, + "grad_norm": 12.810140409139485, + "learning_rate": 4.9778604199492755e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.4068965494632721, + "step": 91795 + }, + { + "epoch": 0.09246193499261211, + "grad_norm": 8.976250219029128, + "learning_rate": 4.977855175950777e-05, + "loss": 2.0758, + "mean_token_accuracy": 0.482758629322052, + "step": 91800 + }, + { + "epoch": 0.09246697104571629, + "grad_norm": 10.488429325737775, + "learning_rate": 4.977849931334375e-05, + "loss": 2.0741, + "mean_token_accuracy": 0.4620689570903778, + "step": 91805 + }, + { + "epoch": 0.09247200709882046, + "grad_norm": 15.265020753872792, + "learning_rate": 4.97784468610007e-05, + "loss": 2.2982, + "mean_token_accuracy": 0.4379310369491577, + "step": 91810 + }, + { + "epoch": 0.09247704315192463, + "grad_norm": 12.302034888542556, + "learning_rate": 4.977839440247864e-05, + "loss": 3.0835, + "mean_token_accuracy": 0.3793103516101837, + "step": 91815 + }, + { + "epoch": 0.0924820792050288, + "grad_norm": 13.534320079919688, + "learning_rate": 4.977834193777759e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.4586206912994385, + "step": 91820 + }, + { + "epoch": 0.09248711525813297, + "grad_norm": 9.922570608599559, + "learning_rate": 4.977828946689756e-05, + "loss": 2.6005, + "mean_token_accuracy": 0.44482758045196535, + "step": 91825 + }, + { + "epoch": 0.09249215131123714, + "grad_norm": 10.145738653780745, + "learning_rate": 4.977823698983856e-05, + "loss": 2.7858, + "mean_token_accuracy": 0.3965517282485962, + "step": 91830 + }, + { + "epoch": 0.09249718736434132, + "grad_norm": 18.54497894520018, + "learning_rate": 4.977818450660061e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.39999999701976774, + "step": 91835 + }, + { + "epoch": 0.09250222341744549, + "grad_norm": 16.19976698891755, + "learning_rate": 4.977813201718371e-05, + "loss": 2.4778, + "mean_token_accuracy": 0.3758620649576187, + "step": 91840 + }, + { + "epoch": 0.09250725947054966, + "grad_norm": 10.165292794722225, + "learning_rate": 4.97780795215879e-05, + "loss": 2.4351, + "mean_token_accuracy": 0.43793103098869324, + "step": 91845 + }, + { + "epoch": 0.09251229552365384, + "grad_norm": 17.872040505445923, + "learning_rate": 4.9778027019813175e-05, + "loss": 2.4495, + "mean_token_accuracy": 0.4172413766384125, + "step": 91850 + }, + { + "epoch": 0.09251733157675801, + "grad_norm": 9.677642463929592, + "learning_rate": 4.977797451185956e-05, + "loss": 2.1387, + "mean_token_accuracy": 0.47931034564971925, + "step": 91855 + }, + { + "epoch": 0.09252236762986218, + "grad_norm": 12.743173363533447, + "learning_rate": 4.977792199772707e-05, + "loss": 2.521, + "mean_token_accuracy": 0.3620689630508423, + "step": 91860 + }, + { + "epoch": 0.09252740368296636, + "grad_norm": 11.17643234497167, + "learning_rate": 4.9777869477415706e-05, + "loss": 2.5418, + "mean_token_accuracy": 0.41379310488700866, + "step": 91865 + }, + { + "epoch": 0.09253243973607053, + "grad_norm": 11.979008736123793, + "learning_rate": 4.9777816950925496e-05, + "loss": 2.5577, + "mean_token_accuracy": 0.37241379618644715, + "step": 91870 + }, + { + "epoch": 0.0925374757891747, + "grad_norm": 10.53161624886413, + "learning_rate": 4.977776441825645e-05, + "loss": 2.9344, + "mean_token_accuracy": 0.3482758641242981, + "step": 91875 + }, + { + "epoch": 0.09254251184227888, + "grad_norm": 10.437213775023281, + "learning_rate": 4.9777711879408586e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.4034482777118683, + "step": 91880 + }, + { + "epoch": 0.09254754789538305, + "grad_norm": 12.176535903551, + "learning_rate": 4.977765933438191e-05, + "loss": 2.4869, + "mean_token_accuracy": 0.4275861978530884, + "step": 91885 + }, + { + "epoch": 0.09255258394848721, + "grad_norm": 11.249453368327826, + "learning_rate": 4.977760678317645e-05, + "loss": 2.6986, + "mean_token_accuracy": 0.39655172228813174, + "step": 91890 + }, + { + "epoch": 0.09255762000159139, + "grad_norm": 9.249909664423493, + "learning_rate": 4.97775542257922e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.44694494605064394, + "step": 91895 + }, + { + "epoch": 0.09256265605469556, + "grad_norm": 8.855090959371763, + "learning_rate": 4.9777501662229195e-05, + "loss": 2.5927, + "mean_token_accuracy": 0.4, + "step": 91900 + }, + { + "epoch": 0.09256769210779973, + "grad_norm": 9.9698017791453, + "learning_rate": 4.977744909248744e-05, + "loss": 2.0786, + "mean_token_accuracy": 0.4896551609039307, + "step": 91905 + }, + { + "epoch": 0.09257272816090391, + "grad_norm": 10.428980185046118, + "learning_rate": 4.977739651656696e-05, + "loss": 2.6379, + "mean_token_accuracy": 0.39655172228813174, + "step": 91910 + }, + { + "epoch": 0.09257776421400808, + "grad_norm": 13.786325710956842, + "learning_rate": 4.9777343934467747e-05, + "loss": 2.6835, + "mean_token_accuracy": 0.39655172228813174, + "step": 91915 + }, + { + "epoch": 0.09258280026711226, + "grad_norm": 13.878426982845411, + "learning_rate": 4.977729134618984e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.42758620381355283, + "step": 91920 + }, + { + "epoch": 0.09258783632021643, + "grad_norm": 10.888687605112107, + "learning_rate": 4.9777238751733235e-05, + "loss": 2.3617, + "mean_token_accuracy": 0.44482759237289426, + "step": 91925 + }, + { + "epoch": 0.0925928723733206, + "grad_norm": 10.68763075756421, + "learning_rate": 4.977718615109796e-05, + "loss": 2.4413, + "mean_token_accuracy": 0.3862069010734558, + "step": 91930 + }, + { + "epoch": 0.09259790842642478, + "grad_norm": 10.353540514424385, + "learning_rate": 4.9777133544284024e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.44137929677963256, + "step": 91935 + }, + { + "epoch": 0.09260294447952895, + "grad_norm": 10.65611579966718, + "learning_rate": 4.9777080931291435e-05, + "loss": 2.5851, + "mean_token_accuracy": 0.43103448748588563, + "step": 91940 + }, + { + "epoch": 0.09260798053263312, + "grad_norm": 10.4293616869514, + "learning_rate": 4.977702831212023e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.4793103337287903, + "step": 91945 + }, + { + "epoch": 0.0926130165857373, + "grad_norm": 10.48002616586928, + "learning_rate": 4.9776975686770397e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.46430732011795045, + "step": 91950 + }, + { + "epoch": 0.09261805263884147, + "grad_norm": 12.153737526317196, + "learning_rate": 4.977692305524196e-05, + "loss": 2.7396, + "mean_token_accuracy": 0.3793103456497192, + "step": 91955 + }, + { + "epoch": 0.09262308869194563, + "grad_norm": 10.435647939083914, + "learning_rate": 4.977687041753494e-05, + "loss": 1.8212, + "mean_token_accuracy": 0.482758617401123, + "step": 91960 + }, + { + "epoch": 0.0926281247450498, + "grad_norm": 11.182411693703138, + "learning_rate": 4.9776817773649354e-05, + "loss": 2.3089, + "mean_token_accuracy": 0.45172414779663084, + "step": 91965 + }, + { + "epoch": 0.09263316079815398, + "grad_norm": 10.100994992939976, + "learning_rate": 4.9776765123585195e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.47586206793785096, + "step": 91970 + }, + { + "epoch": 0.09263819685125815, + "grad_norm": 11.24097245390771, + "learning_rate": 4.9776712467342494e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.37241377830505373, + "step": 91975 + }, + { + "epoch": 0.09264323290436233, + "grad_norm": 9.800373781442817, + "learning_rate": 4.977665980492127e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.35172412991523744, + "step": 91980 + }, + { + "epoch": 0.0926482689574665, + "grad_norm": 8.997495749366735, + "learning_rate": 4.977660713632153e-05, + "loss": 2.972, + "mean_token_accuracy": 0.4068965554237366, + "step": 91985 + }, + { + "epoch": 0.09265330501057067, + "grad_norm": 12.638631931501637, + "learning_rate": 4.977655446154329e-05, + "loss": 2.6116, + "mean_token_accuracy": 0.4103448331356049, + "step": 91990 + }, + { + "epoch": 0.09265834106367485, + "grad_norm": 11.09903312623628, + "learning_rate": 4.977650178058656e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.4379310250282288, + "step": 91995 + }, + { + "epoch": 0.09266337711677902, + "grad_norm": 10.132028741577532, + "learning_rate": 4.977644909345137e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.42758620381355283, + "step": 92000 + }, + { + "epoch": 0.0926684131698832, + "grad_norm": 13.763401665158721, + "learning_rate": 4.977639640013772e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.4310344845056534, + "step": 92005 + }, + { + "epoch": 0.09267344922298737, + "grad_norm": 10.427830041259291, + "learning_rate": 4.9776343700645614e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.43103447556495667, + "step": 92010 + }, + { + "epoch": 0.09267848527609154, + "grad_norm": 12.422102661140848, + "learning_rate": 4.97762909949751e-05, + "loss": 3.0089, + "mean_token_accuracy": 0.3275862067937851, + "step": 92015 + }, + { + "epoch": 0.09268352132919572, + "grad_norm": 10.961419462194126, + "learning_rate": 4.977623828312617e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.37931033968925476, + "step": 92020 + }, + { + "epoch": 0.09268855738229989, + "grad_norm": 10.1723811928474, + "learning_rate": 4.977618556509883e-05, + "loss": 2.4913, + "mean_token_accuracy": 0.4172413766384125, + "step": 92025 + }, + { + "epoch": 0.09269359343540405, + "grad_norm": 9.29291224249899, + "learning_rate": 4.977613284089312e-05, + "loss": 2.8625, + "mean_token_accuracy": 0.3827586144208908, + "step": 92030 + }, + { + "epoch": 0.09269862948850822, + "grad_norm": 9.704491513823605, + "learning_rate": 4.977608011050903e-05, + "loss": 2.2936, + "mean_token_accuracy": 0.3827586233615875, + "step": 92035 + }, + { + "epoch": 0.0927036655416124, + "grad_norm": 9.715265628073409, + "learning_rate": 4.9776027373946594e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.43103448748588563, + "step": 92040 + }, + { + "epoch": 0.09270870159471657, + "grad_norm": 11.498606343050595, + "learning_rate": 4.9775974631205816e-05, + "loss": 2.4863, + "mean_token_accuracy": 0.4068965554237366, + "step": 92045 + }, + { + "epoch": 0.09271373764782075, + "grad_norm": 13.063819258469659, + "learning_rate": 4.977592188228671e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4068965494632721, + "step": 92050 + }, + { + "epoch": 0.09271877370092492, + "grad_norm": 10.182690010366171, + "learning_rate": 4.977586912718931e-05, + "loss": 2.0258, + "mean_token_accuracy": 0.49655172824859617, + "step": 92055 + }, + { + "epoch": 0.0927238097540291, + "grad_norm": 9.723184051776537, + "learning_rate": 4.9775816365913594e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.46551724076271056, + "step": 92060 + }, + { + "epoch": 0.09272884580713327, + "grad_norm": 8.779780888062787, + "learning_rate": 4.9775763598459613e-05, + "loss": 2.0233, + "mean_token_accuracy": 0.4896551549434662, + "step": 92065 + }, + { + "epoch": 0.09273388186023744, + "grad_norm": 10.213109816403085, + "learning_rate": 4.9775710824827354e-05, + "loss": 2.35, + "mean_token_accuracy": 0.4434361755847931, + "step": 92070 + }, + { + "epoch": 0.09273891791334161, + "grad_norm": 8.541281858234596, + "learning_rate": 4.977565804501685e-05, + "loss": 2.0914, + "mean_token_accuracy": 0.4758620738983154, + "step": 92075 + }, + { + "epoch": 0.09274395396644579, + "grad_norm": 11.550270211639331, + "learning_rate": 4.97756052590281e-05, + "loss": 2.9165, + "mean_token_accuracy": 0.3068965494632721, + "step": 92080 + }, + { + "epoch": 0.09274899001954996, + "grad_norm": 12.056004785210744, + "learning_rate": 4.977555246686114e-05, + "loss": 2.7581, + "mean_token_accuracy": 0.40000000298023225, + "step": 92085 + }, + { + "epoch": 0.09275402607265414, + "grad_norm": 11.424738048819354, + "learning_rate": 4.9775499668515965e-05, + "loss": 2.5809, + "mean_token_accuracy": 0.37586206793785093, + "step": 92090 + }, + { + "epoch": 0.09275906212575831, + "grad_norm": 11.131140235142094, + "learning_rate": 4.977544686399261e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.4068965494632721, + "step": 92095 + }, + { + "epoch": 0.09276409817886247, + "grad_norm": 8.079514965820419, + "learning_rate": 4.977539405329106e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.46896551847457885, + "step": 92100 + }, + { + "epoch": 0.09276913423196664, + "grad_norm": 11.705788493417108, + "learning_rate": 4.977534123641136e-05, + "loss": 2.5874, + "mean_token_accuracy": 0.3896551728248596, + "step": 92105 + }, + { + "epoch": 0.09277417028507082, + "grad_norm": 11.018933014607764, + "learning_rate": 4.97752884133535e-05, + "loss": 2.6168, + "mean_token_accuracy": 0.4068965494632721, + "step": 92110 + }, + { + "epoch": 0.09277920633817499, + "grad_norm": 11.789714327346651, + "learning_rate": 4.9775235584117505e-05, + "loss": 2.4127, + "mean_token_accuracy": 0.42413792610168455, + "step": 92115 + }, + { + "epoch": 0.09278424239127916, + "grad_norm": 9.193109234180799, + "learning_rate": 4.9775182748703394e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.4172413766384125, + "step": 92120 + }, + { + "epoch": 0.09278927844438334, + "grad_norm": 11.21024811940659, + "learning_rate": 4.977512990711118e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.42758620977401735, + "step": 92125 + }, + { + "epoch": 0.09279431449748751, + "grad_norm": 10.58554497592227, + "learning_rate": 4.977507705934087e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.4152450144290924, + "step": 92130 + }, + { + "epoch": 0.09279935055059169, + "grad_norm": 12.252767802961273, + "learning_rate": 4.9775024205392496e-05, + "loss": 2.9294, + "mean_token_accuracy": 0.3965517282485962, + "step": 92135 + }, + { + "epoch": 0.09280438660369586, + "grad_norm": 13.29023453607371, + "learning_rate": 4.977497134526605e-05, + "loss": 2.4149, + "mean_token_accuracy": 0.41034482717514037, + "step": 92140 + }, + { + "epoch": 0.09280942265680003, + "grad_norm": 10.185945496245392, + "learning_rate": 4.977491847896156e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.4034482777118683, + "step": 92145 + }, + { + "epoch": 0.09281445870990421, + "grad_norm": 10.617728271658123, + "learning_rate": 4.977486560647904e-05, + "loss": 2.4225, + "mean_token_accuracy": 0.37586207389831544, + "step": 92150 + }, + { + "epoch": 0.09281949476300838, + "grad_norm": 12.479448377256997, + "learning_rate": 4.9774812727818504e-05, + "loss": 2.3188, + "mean_token_accuracy": 0.4448275864124298, + "step": 92155 + }, + { + "epoch": 0.09282453081611255, + "grad_norm": 8.531249062996755, + "learning_rate": 4.977475984297997e-05, + "loss": 2.1227, + "mean_token_accuracy": 0.4517241299152374, + "step": 92160 + }, + { + "epoch": 0.09282956686921673, + "grad_norm": 11.376709565160194, + "learning_rate": 4.977470695196344e-05, + "loss": 2.3685, + "mean_token_accuracy": 0.4068965554237366, + "step": 92165 + }, + { + "epoch": 0.09283460292232089, + "grad_norm": 15.471327162921611, + "learning_rate": 4.977465405476894e-05, + "loss": 2.6729, + "mean_token_accuracy": 0.43793103098869324, + "step": 92170 + }, + { + "epoch": 0.09283963897542506, + "grad_norm": 11.84291796918407, + "learning_rate": 4.977460115139648e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.44137930274009707, + "step": 92175 + }, + { + "epoch": 0.09284467502852924, + "grad_norm": 10.573241099918995, + "learning_rate": 4.977454824184607e-05, + "loss": 2.1563, + "mean_token_accuracy": 0.4379310369491577, + "step": 92180 + }, + { + "epoch": 0.09284971108163341, + "grad_norm": 13.910168303833267, + "learning_rate": 4.977449532611774e-05, + "loss": 2.3143, + "mean_token_accuracy": 0.43793103098869324, + "step": 92185 + }, + { + "epoch": 0.09285474713473758, + "grad_norm": 12.128865982554586, + "learning_rate": 4.97744424042115e-05, + "loss": 2.6239, + "mean_token_accuracy": 0.382758629322052, + "step": 92190 + }, + { + "epoch": 0.09285978318784176, + "grad_norm": 18.301792247461226, + "learning_rate": 4.977438947612735e-05, + "loss": 2.4265, + "mean_token_accuracy": 0.42068966031074523, + "step": 92195 + }, + { + "epoch": 0.09286481924094593, + "grad_norm": 15.053752274233478, + "learning_rate": 4.977433654186532e-05, + "loss": 2.5898, + "mean_token_accuracy": 0.4517241418361664, + "step": 92200 + }, + { + "epoch": 0.0928698552940501, + "grad_norm": 10.879343736393263, + "learning_rate": 4.977428360142541e-05, + "loss": 2.2888, + "mean_token_accuracy": 0.42758620381355283, + "step": 92205 + }, + { + "epoch": 0.09287489134715428, + "grad_norm": 11.39817981346126, + "learning_rate": 4.977423065480766e-05, + "loss": 2.8916, + "mean_token_accuracy": 0.3931034505367279, + "step": 92210 + }, + { + "epoch": 0.09287992740025845, + "grad_norm": 12.344893920747747, + "learning_rate": 4.977417770201206e-05, + "loss": 2.4541, + "mean_token_accuracy": 0.4310344815254211, + "step": 92215 + }, + { + "epoch": 0.09288496345336263, + "grad_norm": 10.12584355401762, + "learning_rate": 4.977412474303864e-05, + "loss": 2.4165, + "mean_token_accuracy": 0.42413792610168455, + "step": 92220 + }, + { + "epoch": 0.0928899995064668, + "grad_norm": 11.231283778892562, + "learning_rate": 4.977407177788741e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.45172414779663084, + "step": 92225 + }, + { + "epoch": 0.09289503555957097, + "grad_norm": 13.572449450162914, + "learning_rate": 4.977401880655837e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.3862068891525269, + "step": 92230 + }, + { + "epoch": 0.09290007161267515, + "grad_norm": 13.088077005900482, + "learning_rate": 4.9773965829051555e-05, + "loss": 2.195, + "mean_token_accuracy": 0.5084694564342499, + "step": 92235 + }, + { + "epoch": 0.09290510766577931, + "grad_norm": 9.53956793636164, + "learning_rate": 4.977391284536697e-05, + "loss": 2.5247, + "mean_token_accuracy": 0.37586205899715425, + "step": 92240 + }, + { + "epoch": 0.09291014371888348, + "grad_norm": 12.331755354295211, + "learning_rate": 4.9773859855504634e-05, + "loss": 2.6305, + "mean_token_accuracy": 0.43448275327682495, + "step": 92245 + }, + { + "epoch": 0.09291517977198765, + "grad_norm": 8.445318060540886, + "learning_rate": 4.977380685946457e-05, + "loss": 2.503, + "mean_token_accuracy": 0.42758620977401735, + "step": 92250 + }, + { + "epoch": 0.09292021582509183, + "grad_norm": 13.125980582437558, + "learning_rate": 4.977375385724676e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.4517241299152374, + "step": 92255 + }, + { + "epoch": 0.092925251878196, + "grad_norm": 11.168617743320192, + "learning_rate": 4.977370084885126e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.36896551251411436, + "step": 92260 + }, + { + "epoch": 0.09293028793130018, + "grad_norm": 12.623181626605657, + "learning_rate": 4.977364783427806e-05, + "loss": 2.4592, + "mean_token_accuracy": 0.4172413766384125, + "step": 92265 + }, + { + "epoch": 0.09293532398440435, + "grad_norm": 10.46315237037504, + "learning_rate": 4.977359481352718e-05, + "loss": 1.9819, + "mean_token_accuracy": 0.49655172824859617, + "step": 92270 + }, + { + "epoch": 0.09294036003750852, + "grad_norm": 9.68745395691892, + "learning_rate": 4.9773541786598636e-05, + "loss": 2.6187, + "mean_token_accuracy": 0.358620685338974, + "step": 92275 + }, + { + "epoch": 0.0929453960906127, + "grad_norm": 15.587786286686514, + "learning_rate": 4.9773488753492444e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.48275862336158754, + "step": 92280 + }, + { + "epoch": 0.09295043214371687, + "grad_norm": 10.624603633727313, + "learning_rate": 4.9773435714208616e-05, + "loss": 2.172, + "mean_token_accuracy": 0.4620689690113068, + "step": 92285 + }, + { + "epoch": 0.09295546819682105, + "grad_norm": 11.818433108672334, + "learning_rate": 4.9773382668747166e-05, + "loss": 2.3845, + "mean_token_accuracy": 0.4393224537372589, + "step": 92290 + }, + { + "epoch": 0.09296050424992522, + "grad_norm": 12.2907485413273, + "learning_rate": 4.977332961710811e-05, + "loss": 2.4794, + "mean_token_accuracy": 0.39655172228813174, + "step": 92295 + }, + { + "epoch": 0.09296554030302939, + "grad_norm": 10.636346049314323, + "learning_rate": 4.9773276559291465e-05, + "loss": 2.6095, + "mean_token_accuracy": 0.4896551787853241, + "step": 92300 + }, + { + "epoch": 0.09297057635613357, + "grad_norm": 10.924448326483795, + "learning_rate": 4.977322349529725e-05, + "loss": 2.5351, + "mean_token_accuracy": 0.42915910482406616, + "step": 92305 + }, + { + "epoch": 0.09297561240923773, + "grad_norm": 10.208026338177653, + "learning_rate": 4.977317042512546e-05, + "loss": 2.8844, + "mean_token_accuracy": 0.3896551728248596, + "step": 92310 + }, + { + "epoch": 0.0929806484623419, + "grad_norm": 10.245016768810828, + "learning_rate": 4.9773117348776136e-05, + "loss": 2.657, + "mean_token_accuracy": 0.3931034505367279, + "step": 92315 + }, + { + "epoch": 0.09298568451544607, + "grad_norm": 13.039856796282708, + "learning_rate": 4.977306426624926e-05, + "loss": 2.6561, + "mean_token_accuracy": 0.3482758581638336, + "step": 92320 + }, + { + "epoch": 0.09299072056855025, + "grad_norm": 14.861437747977563, + "learning_rate": 4.9773011177544885e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.41379311084747317, + "step": 92325 + }, + { + "epoch": 0.09299575662165442, + "grad_norm": 11.474356884105152, + "learning_rate": 4.9772958082663e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.4689655125141144, + "step": 92330 + }, + { + "epoch": 0.0930007926747586, + "grad_norm": 10.888752820123793, + "learning_rate": 4.9772904981603636e-05, + "loss": 2.2366, + "mean_token_accuracy": 0.3931034505367279, + "step": 92335 + }, + { + "epoch": 0.09300582872786277, + "grad_norm": 9.103565766910908, + "learning_rate": 4.977285187436678e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.44482759237289426, + "step": 92340 + }, + { + "epoch": 0.09301086478096694, + "grad_norm": 13.376822153954766, + "learning_rate": 4.9772798760952485e-05, + "loss": 2.617, + "mean_token_accuracy": 0.42068964838981626, + "step": 92345 + }, + { + "epoch": 0.09301590083407112, + "grad_norm": 9.158230367936357, + "learning_rate": 4.977274564136073e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.4413793087005615, + "step": 92350 + }, + { + "epoch": 0.09302093688717529, + "grad_norm": 10.562088824944562, + "learning_rate": 4.9772692515591554e-05, + "loss": 2.376, + "mean_token_accuracy": 0.43793103098869324, + "step": 92355 + }, + { + "epoch": 0.09302597294027946, + "grad_norm": 13.52528386028242, + "learning_rate": 4.977263938364497e-05, + "loss": 2.914, + "mean_token_accuracy": 0.3896551728248596, + "step": 92360 + }, + { + "epoch": 0.09303100899338364, + "grad_norm": 9.776135298229528, + "learning_rate": 4.977258624552098e-05, + "loss": 2.0694, + "mean_token_accuracy": 0.47931034564971925, + "step": 92365 + }, + { + "epoch": 0.09303604504648781, + "grad_norm": 9.673529790468157, + "learning_rate": 4.9772533101219606e-05, + "loss": 2.1271, + "mean_token_accuracy": 0.4448275864124298, + "step": 92370 + }, + { + "epoch": 0.09304108109959199, + "grad_norm": 11.27846004579561, + "learning_rate": 4.977247995074086e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.41724138259887694, + "step": 92375 + }, + { + "epoch": 0.09304611715269615, + "grad_norm": 13.089590304339058, + "learning_rate": 4.977242679408476e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.4344827592372894, + "step": 92380 + }, + { + "epoch": 0.09305115320580032, + "grad_norm": 11.624101259178094, + "learning_rate": 4.9772373631251315e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.41379310488700866, + "step": 92385 + }, + { + "epoch": 0.09305618925890449, + "grad_norm": 12.758754334082468, + "learning_rate": 4.977232046224055e-05, + "loss": 2.6919, + "mean_token_accuracy": 0.35172412991523744, + "step": 92390 + }, + { + "epoch": 0.09306122531200867, + "grad_norm": 13.648556543919906, + "learning_rate": 4.9772267287052474e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.38620689809322356, + "step": 92395 + }, + { + "epoch": 0.09306626136511284, + "grad_norm": 12.107248006856352, + "learning_rate": 4.9772214105687095e-05, + "loss": 2.7961, + "mean_token_accuracy": 0.39310344457626345, + "step": 92400 + }, + { + "epoch": 0.09307129741821701, + "grad_norm": 10.430876193386759, + "learning_rate": 4.977216091814444e-05, + "loss": 2.4226, + "mean_token_accuracy": 0.42401694059371947, + "step": 92405 + }, + { + "epoch": 0.09307633347132119, + "grad_norm": 9.508432308920876, + "learning_rate": 4.977210772442452e-05, + "loss": 2.7651, + "mean_token_accuracy": 0.4206896543502808, + "step": 92410 + }, + { + "epoch": 0.09308136952442536, + "grad_norm": 10.718863890587022, + "learning_rate": 4.9772054524527346e-05, + "loss": 2.7359, + "mean_token_accuracy": 0.38965516686439516, + "step": 92415 + }, + { + "epoch": 0.09308640557752954, + "grad_norm": 9.752604468726473, + "learning_rate": 4.977200131845293e-05, + "loss": 2.5699, + "mean_token_accuracy": 0.43448275327682495, + "step": 92420 + }, + { + "epoch": 0.09309144163063371, + "grad_norm": 12.063498928439017, + "learning_rate": 4.97719481062013e-05, + "loss": 2.1773, + "mean_token_accuracy": 0.42758620381355283, + "step": 92425 + }, + { + "epoch": 0.09309647768373788, + "grad_norm": 9.955827190444213, + "learning_rate": 4.977189488777246e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.5137930989265442, + "step": 92430 + }, + { + "epoch": 0.09310151373684206, + "grad_norm": 11.283787100903222, + "learning_rate": 4.977184166316642e-05, + "loss": 2.6678, + "mean_token_accuracy": 0.3965517282485962, + "step": 92435 + }, + { + "epoch": 0.09310654978994623, + "grad_norm": 12.862959013953216, + "learning_rate": 4.9771788432383205e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.4517241418361664, + "step": 92440 + }, + { + "epoch": 0.0931115858430504, + "grad_norm": 18.87527828649575, + "learning_rate": 4.977173519542282e-05, + "loss": 3.0212, + "mean_token_accuracy": 0.3586206823587418, + "step": 92445 + }, + { + "epoch": 0.09311662189615456, + "grad_norm": 12.948380552154804, + "learning_rate": 4.97716819522853e-05, + "loss": 2.5381, + "mean_token_accuracy": 0.41379310488700866, + "step": 92450 + }, + { + "epoch": 0.09312165794925874, + "grad_norm": 13.459223684172914, + "learning_rate": 4.9771628702970633e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.44482759237289426, + "step": 92455 + }, + { + "epoch": 0.09312669400236291, + "grad_norm": 9.79990962662244, + "learning_rate": 4.9771575447478854e-05, + "loss": 2.4353, + "mean_token_accuracy": 0.44827585816383364, + "step": 92460 + }, + { + "epoch": 0.09313173005546709, + "grad_norm": 11.94439005235742, + "learning_rate": 4.977152218580997e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.41034482717514037, + "step": 92465 + }, + { + "epoch": 0.09313676610857126, + "grad_norm": 11.116290695884967, + "learning_rate": 4.977146891796399e-05, + "loss": 2.7058, + "mean_token_accuracy": 0.4068965554237366, + "step": 92470 + }, + { + "epoch": 0.09314180216167543, + "grad_norm": 11.675002101422656, + "learning_rate": 4.9771415643940945e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.36206896007061007, + "step": 92475 + }, + { + "epoch": 0.0931468382147796, + "grad_norm": 11.550484897198121, + "learning_rate": 4.9771362363740835e-05, + "loss": 2.3384, + "mean_token_accuracy": 0.4310344815254211, + "step": 92480 + }, + { + "epoch": 0.09315187426788378, + "grad_norm": 12.021461095014665, + "learning_rate": 4.977130907736367e-05, + "loss": 2.3555, + "mean_token_accuracy": 0.4310344815254211, + "step": 92485 + }, + { + "epoch": 0.09315691032098795, + "grad_norm": 9.123185054846829, + "learning_rate": 4.977125578480949e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.3999999940395355, + "step": 92490 + }, + { + "epoch": 0.09316194637409213, + "grad_norm": 11.88124269712118, + "learning_rate": 4.977120248607829e-05, + "loss": 2.9889, + "mean_token_accuracy": 0.4, + "step": 92495 + }, + { + "epoch": 0.0931669824271963, + "grad_norm": 9.808831903113491, + "learning_rate": 4.977114918117009e-05, + "loss": 2.2065, + "mean_token_accuracy": 0.42758620381355283, + "step": 92500 + }, + { + "epoch": 0.09317201848030048, + "grad_norm": 12.290037315707783, + "learning_rate": 4.9771095870084896e-05, + "loss": 2.285, + "mean_token_accuracy": 0.3896551728248596, + "step": 92505 + }, + { + "epoch": 0.09317705453340465, + "grad_norm": 13.617816476940597, + "learning_rate": 4.977104255282274e-05, + "loss": 2.7779, + "mean_token_accuracy": 0.3999999940395355, + "step": 92510 + }, + { + "epoch": 0.09318209058650882, + "grad_norm": 10.05176596408957, + "learning_rate": 4.977098922938362e-05, + "loss": 2.4282, + "mean_token_accuracy": 0.412099215388298, + "step": 92515 + }, + { + "epoch": 0.09318712663961298, + "grad_norm": 11.908948292467882, + "learning_rate": 4.9770935899767556e-05, + "loss": 2.91, + "mean_token_accuracy": 0.3620689570903778, + "step": 92520 + }, + { + "epoch": 0.09319216269271716, + "grad_norm": 11.298092776541889, + "learning_rate": 4.9770882563974566e-05, + "loss": 2.5022, + "mean_token_accuracy": 0.3965517282485962, + "step": 92525 + }, + { + "epoch": 0.09319719874582133, + "grad_norm": 12.556377468936898, + "learning_rate": 4.977082922200468e-05, + "loss": 2.1543, + "mean_token_accuracy": 0.5059286117553711, + "step": 92530 + }, + { + "epoch": 0.0932022347989255, + "grad_norm": 12.00185048374263, + "learning_rate": 4.977077587385788e-05, + "loss": 2.5706, + "mean_token_accuracy": 0.37931033968925476, + "step": 92535 + }, + { + "epoch": 0.09320727085202968, + "grad_norm": 10.3115550833988, + "learning_rate": 4.97707225195342e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.38965517580509185, + "step": 92540 + }, + { + "epoch": 0.09321230690513385, + "grad_norm": 12.468289418287181, + "learning_rate": 4.977066915903365e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4620689690113068, + "step": 92545 + }, + { + "epoch": 0.09321734295823803, + "grad_norm": 12.026601684565376, + "learning_rate": 4.9770615792356254e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.42413793206214906, + "step": 92550 + }, + { + "epoch": 0.0932223790113422, + "grad_norm": 11.397426746458457, + "learning_rate": 4.977056241950201e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.4137930989265442, + "step": 92555 + }, + { + "epoch": 0.09322741506444637, + "grad_norm": 12.522030582816551, + "learning_rate": 4.977050904047095e-05, + "loss": 2.9241, + "mean_token_accuracy": 0.36896551251411436, + "step": 92560 + }, + { + "epoch": 0.09323245111755055, + "grad_norm": 8.975437569726779, + "learning_rate": 4.977045565526308e-05, + "loss": 2.6703, + "mean_token_accuracy": 0.3827586233615875, + "step": 92565 + }, + { + "epoch": 0.09323748717065472, + "grad_norm": 11.967474950934282, + "learning_rate": 4.9770402263878416e-05, + "loss": 2.3723, + "mean_token_accuracy": 0.415426504611969, + "step": 92570 + }, + { + "epoch": 0.0932425232237589, + "grad_norm": 9.45573059764412, + "learning_rate": 4.9770348866316965e-05, + "loss": 2.5565, + "mean_token_accuracy": 0.41034482717514037, + "step": 92575 + }, + { + "epoch": 0.09324755927686307, + "grad_norm": 11.261449647949393, + "learning_rate": 4.977029546257876e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.46896551847457885, + "step": 92580 + }, + { + "epoch": 0.09325259532996724, + "grad_norm": 10.71339535193657, + "learning_rate": 4.9770242052663794e-05, + "loss": 2.8431, + "mean_token_accuracy": 0.3909255862236023, + "step": 92585 + }, + { + "epoch": 0.0932576313830714, + "grad_norm": 9.737008574500482, + "learning_rate": 4.977018863657211e-05, + "loss": 2.4798, + "mean_token_accuracy": 0.44482759237289426, + "step": 92590 + }, + { + "epoch": 0.09326266743617558, + "grad_norm": 11.497102399346923, + "learning_rate": 4.9770135214303695e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.41034482717514037, + "step": 92595 + }, + { + "epoch": 0.09326770348927975, + "grad_norm": 12.329289841573527, + "learning_rate": 4.9770081785858576e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.4551724135875702, + "step": 92600 + }, + { + "epoch": 0.09327273954238392, + "grad_norm": 11.056950583508407, + "learning_rate": 4.977002835123677e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.4482758641242981, + "step": 92605 + }, + { + "epoch": 0.0932777755954881, + "grad_norm": 12.087564256325242, + "learning_rate": 4.976997491043829e-05, + "loss": 2.5137, + "mean_token_accuracy": 0.4172413766384125, + "step": 92610 + }, + { + "epoch": 0.09328281164859227, + "grad_norm": 9.06300492334387, + "learning_rate": 4.976992146346314e-05, + "loss": 2.4869, + "mean_token_accuracy": 0.42928009629249575, + "step": 92615 + }, + { + "epoch": 0.09328784770169644, + "grad_norm": 8.153345723630702, + "learning_rate": 4.9769868010311344e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.41034482717514037, + "step": 92620 + }, + { + "epoch": 0.09329288375480062, + "grad_norm": 10.361493382764031, + "learning_rate": 4.976981455098292e-05, + "loss": 2.4599, + "mean_token_accuracy": 0.37241379618644715, + "step": 92625 + }, + { + "epoch": 0.09329791980790479, + "grad_norm": 11.815837297641073, + "learning_rate": 4.976976108547789e-05, + "loss": 2.7318, + "mean_token_accuracy": 0.42758620977401735, + "step": 92630 + }, + { + "epoch": 0.09330295586100897, + "grad_norm": 10.588833025503037, + "learning_rate": 4.976970761379625e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.42413792610168455, + "step": 92635 + }, + { + "epoch": 0.09330799191411314, + "grad_norm": 9.68139490931494, + "learning_rate": 4.976965413593803e-05, + "loss": 2.6461, + "mean_token_accuracy": 0.4369026005268097, + "step": 92640 + }, + { + "epoch": 0.09331302796721731, + "grad_norm": 10.080561392583725, + "learning_rate": 4.976960065190323e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.46896551847457885, + "step": 92645 + }, + { + "epoch": 0.09331806402032149, + "grad_norm": 10.266502420945113, + "learning_rate": 4.976954716169187e-05, + "loss": 2.3872, + "mean_token_accuracy": 0.4620689570903778, + "step": 92650 + }, + { + "epoch": 0.09332310007342566, + "grad_norm": 10.865245518004352, + "learning_rate": 4.9769493665303986e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.38620689511299133, + "step": 92655 + }, + { + "epoch": 0.09332813612652982, + "grad_norm": 12.255538383261452, + "learning_rate": 4.976944016273956e-05, + "loss": 3.0601, + "mean_token_accuracy": 0.36896551251411436, + "step": 92660 + }, + { + "epoch": 0.093333172179634, + "grad_norm": 13.479489374202345, + "learning_rate": 4.976938665399863e-05, + "loss": 2.5504, + "mean_token_accuracy": 0.38965516686439516, + "step": 92665 + }, + { + "epoch": 0.09333820823273817, + "grad_norm": 10.751500446597479, + "learning_rate": 4.9769333139081185e-05, + "loss": 2.0808, + "mean_token_accuracy": 0.48965516686439514, + "step": 92670 + }, + { + "epoch": 0.09334324428584234, + "grad_norm": 10.649360807449133, + "learning_rate": 4.976927961798728e-05, + "loss": 2.1683, + "mean_token_accuracy": 0.4896551728248596, + "step": 92675 + }, + { + "epoch": 0.09334828033894652, + "grad_norm": 11.226245054681119, + "learning_rate": 4.97692260907169e-05, + "loss": 2.5563, + "mean_token_accuracy": 0.39655172228813174, + "step": 92680 + }, + { + "epoch": 0.09335331639205069, + "grad_norm": 15.789296700315713, + "learning_rate": 4.976917255727006e-05, + "loss": 2.6985, + "mean_token_accuracy": 0.4068965494632721, + "step": 92685 + }, + { + "epoch": 0.09335835244515486, + "grad_norm": 9.815690936516095, + "learning_rate": 4.976911901764679e-05, + "loss": 2.4069, + "mean_token_accuracy": 0.43103447556495667, + "step": 92690 + }, + { + "epoch": 0.09336338849825904, + "grad_norm": 30.93793282628475, + "learning_rate": 4.9769065471847084e-05, + "loss": 3.825, + "mean_token_accuracy": 0.2655172377824783, + "step": 92695 + }, + { + "epoch": 0.09336842455136321, + "grad_norm": 11.34813796181201, + "learning_rate": 4.976901191987098e-05, + "loss": 2.4703, + "mean_token_accuracy": 0.4, + "step": 92700 + }, + { + "epoch": 0.09337346060446738, + "grad_norm": 11.939390201434765, + "learning_rate": 4.976895836171849e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.4137930989265442, + "step": 92705 + }, + { + "epoch": 0.09337849665757156, + "grad_norm": 19.794229274360738, + "learning_rate": 4.97689047973896e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.38620689511299133, + "step": 92710 + }, + { + "epoch": 0.09338353271067573, + "grad_norm": 9.290166109030958, + "learning_rate": 4.976885122688436e-05, + "loss": 2.6266, + "mean_token_accuracy": 0.4137930929660797, + "step": 92715 + }, + { + "epoch": 0.0933885687637799, + "grad_norm": 9.06630074574957, + "learning_rate": 4.976879765020277e-05, + "loss": 2.2635, + "mean_token_accuracy": 0.4448275864124298, + "step": 92720 + }, + { + "epoch": 0.09339360481688408, + "grad_norm": 11.956370372052069, + "learning_rate": 4.976874406734485e-05, + "loss": 2.5353, + "mean_token_accuracy": 0.493103438615799, + "step": 92725 + }, + { + "epoch": 0.09339864086998824, + "grad_norm": 9.328049034772038, + "learning_rate": 4.9768690478310604e-05, + "loss": 2.2657, + "mean_token_accuracy": 0.4448275864124298, + "step": 92730 + }, + { + "epoch": 0.09340367692309241, + "grad_norm": 9.81283040580341, + "learning_rate": 4.976863688310006e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.41379311084747317, + "step": 92735 + }, + { + "epoch": 0.09340871297619659, + "grad_norm": 11.683928363714813, + "learning_rate": 4.976858328171322e-05, + "loss": 2.9107, + "mean_token_accuracy": 0.37241379618644715, + "step": 92740 + }, + { + "epoch": 0.09341374902930076, + "grad_norm": 9.134145965917087, + "learning_rate": 4.97685296741501e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.47453115582466127, + "step": 92745 + }, + { + "epoch": 0.09341878508240493, + "grad_norm": 10.768601625409739, + "learning_rate": 4.976847606041073e-05, + "loss": 2.8465, + "mean_token_accuracy": 0.34827585220336915, + "step": 92750 + }, + { + "epoch": 0.09342382113550911, + "grad_norm": 14.582288336736537, + "learning_rate": 4.9768422440495116e-05, + "loss": 2.891, + "mean_token_accuracy": 0.3655172437429428, + "step": 92755 + }, + { + "epoch": 0.09342885718861328, + "grad_norm": 9.542700587414329, + "learning_rate": 4.976836881440327e-05, + "loss": 2.2164, + "mean_token_accuracy": 0.41016334295272827, + "step": 92760 + }, + { + "epoch": 0.09343389324171746, + "grad_norm": 15.739911789447238, + "learning_rate": 4.976831518213521e-05, + "loss": 2.6229, + "mean_token_accuracy": 0.4586206912994385, + "step": 92765 + }, + { + "epoch": 0.09343892929482163, + "grad_norm": 12.337118209961616, + "learning_rate": 4.976826154369095e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.4379310369491577, + "step": 92770 + }, + { + "epoch": 0.0934439653479258, + "grad_norm": 11.795702055333203, + "learning_rate": 4.97682078990705e-05, + "loss": 2.9782, + "mean_token_accuracy": 0.3655172437429428, + "step": 92775 + }, + { + "epoch": 0.09344900140102998, + "grad_norm": 13.525298118143013, + "learning_rate": 4.976815424827388e-05, + "loss": 2.7375, + "mean_token_accuracy": 0.34137930870056155, + "step": 92780 + }, + { + "epoch": 0.09345403745413415, + "grad_norm": 9.915901863999816, + "learning_rate": 4.9768100591301106e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.382758629322052, + "step": 92785 + }, + { + "epoch": 0.09345907350723832, + "grad_norm": 11.610746273242707, + "learning_rate": 4.976804692815219e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.37241379022598264, + "step": 92790 + }, + { + "epoch": 0.0934641095603425, + "grad_norm": 9.929665599632138, + "learning_rate": 4.976799325882715e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.41724138259887694, + "step": 92795 + }, + { + "epoch": 0.09346914561344666, + "grad_norm": 12.502597543460368, + "learning_rate": 4.9767939583326e-05, + "loss": 2.6876, + "mean_token_accuracy": 0.35172413289546967, + "step": 92800 + }, + { + "epoch": 0.09347418166655083, + "grad_norm": 12.0264402983314, + "learning_rate": 4.9767885901648745e-05, + "loss": 2.7972, + "mean_token_accuracy": 0.3896551787853241, + "step": 92805 + }, + { + "epoch": 0.093479217719655, + "grad_norm": 12.872803196102195, + "learning_rate": 4.976783221379542e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.3931034505367279, + "step": 92810 + }, + { + "epoch": 0.09348425377275918, + "grad_norm": 11.218713767646067, + "learning_rate": 4.976777851976603e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.4551724135875702, + "step": 92815 + }, + { + "epoch": 0.09348928982586335, + "grad_norm": 11.21067105207157, + "learning_rate": 4.976772481956057e-05, + "loss": 2.7913, + "mean_token_accuracy": 0.337931028008461, + "step": 92820 + }, + { + "epoch": 0.09349432587896753, + "grad_norm": 11.5304409438593, + "learning_rate": 4.9767671113179094e-05, + "loss": 2.6654, + "mean_token_accuracy": 0.3655172407627106, + "step": 92825 + }, + { + "epoch": 0.0934993619320717, + "grad_norm": 10.100310297208768, + "learning_rate": 4.9767617400621594e-05, + "loss": 2.4878, + "mean_token_accuracy": 0.4172413766384125, + "step": 92830 + }, + { + "epoch": 0.09350439798517587, + "grad_norm": 15.031401884169995, + "learning_rate": 4.976756368188808e-05, + "loss": 2.9683, + "mean_token_accuracy": 0.358620685338974, + "step": 92835 + }, + { + "epoch": 0.09350943403828005, + "grad_norm": 11.074894011660747, + "learning_rate": 4.9767509956978575e-05, + "loss": 2.4594, + "mean_token_accuracy": 0.44482758045196535, + "step": 92840 + }, + { + "epoch": 0.09351447009138422, + "grad_norm": 10.790180562306132, + "learning_rate": 4.9767456225893096e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.4103448212146759, + "step": 92845 + }, + { + "epoch": 0.0935195061444884, + "grad_norm": 9.971107304948978, + "learning_rate": 4.976740248863165e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.4586206912994385, + "step": 92850 + }, + { + "epoch": 0.09352454219759257, + "grad_norm": 14.923968101643517, + "learning_rate": 4.9767348745194256e-05, + "loss": 2.6099, + "mean_token_accuracy": 0.40344826579093934, + "step": 92855 + }, + { + "epoch": 0.09352957825069674, + "grad_norm": 10.257936206929745, + "learning_rate": 4.9767294995580934e-05, + "loss": 2.337, + "mean_token_accuracy": 0.4086509466171265, + "step": 92860 + }, + { + "epoch": 0.09353461430380092, + "grad_norm": 11.451341204745237, + "learning_rate": 4.976724123979169e-05, + "loss": 2.3821, + "mean_token_accuracy": 0.42758620977401735, + "step": 92865 + }, + { + "epoch": 0.09353965035690508, + "grad_norm": 10.386684890816936, + "learning_rate": 4.976718747782656e-05, + "loss": 2.2279, + "mean_token_accuracy": 0.47126436829566953, + "step": 92870 + }, + { + "epoch": 0.09354468641000925, + "grad_norm": 11.245326408676142, + "learning_rate": 4.976713370968553e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4344827592372894, + "step": 92875 + }, + { + "epoch": 0.09354972246311342, + "grad_norm": 11.85728772701635, + "learning_rate": 4.976707993536863e-05, + "loss": 2.4737, + "mean_token_accuracy": 0.4448275864124298, + "step": 92880 + }, + { + "epoch": 0.0935547585162176, + "grad_norm": 9.382579490391777, + "learning_rate": 4.9767026154875865e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.47586206197738645, + "step": 92885 + }, + { + "epoch": 0.09355979456932177, + "grad_norm": 9.820522558071156, + "learning_rate": 4.976697236820726e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.4275861978530884, + "step": 92890 + }, + { + "epoch": 0.09356483062242595, + "grad_norm": 11.095776261192862, + "learning_rate": 4.976691857536283e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.5085299432277679, + "step": 92895 + }, + { + "epoch": 0.09356986667553012, + "grad_norm": 11.237277379479858, + "learning_rate": 4.9766864776342594e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.46896551847457885, + "step": 92900 + }, + { + "epoch": 0.0935749027286343, + "grad_norm": 10.882568940158816, + "learning_rate": 4.976681097114655e-05, + "loss": 2.6394, + "mean_token_accuracy": 0.42758620381355283, + "step": 92905 + }, + { + "epoch": 0.09357993878173847, + "grad_norm": 12.376169467126376, + "learning_rate": 4.976675715977472e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.4689655125141144, + "step": 92910 + }, + { + "epoch": 0.09358497483484264, + "grad_norm": 11.95347216128273, + "learning_rate": 4.9766703342227136e-05, + "loss": 2.2289, + "mean_token_accuracy": 0.4914700508117676, + "step": 92915 + }, + { + "epoch": 0.09359001088794681, + "grad_norm": 10.60752367373058, + "learning_rate": 4.976664951850379e-05, + "loss": 2.1172, + "mean_token_accuracy": 0.46551724076271056, + "step": 92920 + }, + { + "epoch": 0.09359504694105099, + "grad_norm": 12.280786195925531, + "learning_rate": 4.97665956886047e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.44482759237289426, + "step": 92925 + }, + { + "epoch": 0.09360008299415516, + "grad_norm": 13.230850490859646, + "learning_rate": 4.976654185252989e-05, + "loss": 2.8799, + "mean_token_accuracy": 0.3793103456497192, + "step": 92930 + }, + { + "epoch": 0.09360511904725934, + "grad_norm": 8.986820056734857, + "learning_rate": 4.9766488010279374e-05, + "loss": 1.8635, + "mean_token_accuracy": 0.5329703569412232, + "step": 92935 + }, + { + "epoch": 0.0936101551003635, + "grad_norm": 11.128783391075146, + "learning_rate": 4.976643416185317e-05, + "loss": 1.8699, + "mean_token_accuracy": 0.5448275864124298, + "step": 92940 + }, + { + "epoch": 0.09361519115346767, + "grad_norm": 8.956521516257721, + "learning_rate": 4.976638030725128e-05, + "loss": 2.2377, + "mean_token_accuracy": 0.46206897497177124, + "step": 92945 + }, + { + "epoch": 0.09362022720657184, + "grad_norm": 11.081685447838057, + "learning_rate": 4.976632644647373e-05, + "loss": 2.0652, + "mean_token_accuracy": 0.4862068951129913, + "step": 92950 + }, + { + "epoch": 0.09362526325967602, + "grad_norm": 11.259181423267059, + "learning_rate": 4.976627257952054e-05, + "loss": 2.106, + "mean_token_accuracy": 0.4896551728248596, + "step": 92955 + }, + { + "epoch": 0.09363029931278019, + "grad_norm": 10.778768502039574, + "learning_rate": 4.976621870639169e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.43793103098869324, + "step": 92960 + }, + { + "epoch": 0.09363533536588436, + "grad_norm": 13.798955386865313, + "learning_rate": 4.9766164827087245e-05, + "loss": 2.5698, + "mean_token_accuracy": 0.38965516686439516, + "step": 92965 + }, + { + "epoch": 0.09364037141898854, + "grad_norm": 14.723063509418964, + "learning_rate": 4.976611094160719e-05, + "loss": 2.3381, + "mean_token_accuracy": 0.48965516686439514, + "step": 92970 + }, + { + "epoch": 0.09364540747209271, + "grad_norm": 11.358259754575805, + "learning_rate": 4.976605704995154e-05, + "loss": 2.5215, + "mean_token_accuracy": 0.4344827651977539, + "step": 92975 + }, + { + "epoch": 0.09365044352519689, + "grad_norm": 10.120690628897718, + "learning_rate": 4.976600315212032e-05, + "loss": 2.3113, + "mean_token_accuracy": 0.41379311084747317, + "step": 92980 + }, + { + "epoch": 0.09365547957830106, + "grad_norm": 10.436636703961874, + "learning_rate": 4.9765949248113546e-05, + "loss": 2.4461, + "mean_token_accuracy": 0.4344827651977539, + "step": 92985 + }, + { + "epoch": 0.09366051563140523, + "grad_norm": 10.01201053586698, + "learning_rate": 4.976589533793122e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.4172413766384125, + "step": 92990 + }, + { + "epoch": 0.09366555168450941, + "grad_norm": 8.786147704793626, + "learning_rate": 4.976584142157337e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.44827585220336913, + "step": 92995 + }, + { + "epoch": 0.09367058773761358, + "grad_norm": 12.86482715463306, + "learning_rate": 4.976578749904e-05, + "loss": 2.5068, + "mean_token_accuracy": 0.4068965494632721, + "step": 93000 + }, + { + "epoch": 0.09367562379071775, + "grad_norm": 7.876332952257544, + "learning_rate": 4.9765733570331136e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.45862067937850953, + "step": 93005 + }, + { + "epoch": 0.09368065984382191, + "grad_norm": 11.389875037933686, + "learning_rate": 4.9765679635446786e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.4034482717514038, + "step": 93010 + }, + { + "epoch": 0.09368569589692609, + "grad_norm": 9.808344169641051, + "learning_rate": 4.976562569438697e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.38275861740112305, + "step": 93015 + }, + { + "epoch": 0.09369073195003026, + "grad_norm": 9.674567995378498, + "learning_rate": 4.9765571747151694e-05, + "loss": 2.1584, + "mean_token_accuracy": 0.4620689570903778, + "step": 93020 + }, + { + "epoch": 0.09369576800313444, + "grad_norm": 14.020655524611346, + "learning_rate": 4.9765517793740985e-05, + "loss": 2.4751, + "mean_token_accuracy": 0.3931034475564957, + "step": 93025 + }, + { + "epoch": 0.09370080405623861, + "grad_norm": 9.253712249790578, + "learning_rate": 4.976546383415484e-05, + "loss": 2.5186, + "mean_token_accuracy": 0.38965516686439516, + "step": 93030 + }, + { + "epoch": 0.09370584010934278, + "grad_norm": 13.97677376993939, + "learning_rate": 4.9765409868393296e-05, + "loss": 2.7374, + "mean_token_accuracy": 0.3896551728248596, + "step": 93035 + }, + { + "epoch": 0.09371087616244696, + "grad_norm": 14.475171837191011, + "learning_rate": 4.976535589645636e-05, + "loss": 2.5171, + "mean_token_accuracy": 0.47586206793785096, + "step": 93040 + }, + { + "epoch": 0.09371591221555113, + "grad_norm": 12.496448517715145, + "learning_rate": 4.976530191834404e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.4206896543502808, + "step": 93045 + }, + { + "epoch": 0.0937209482686553, + "grad_norm": 11.850822630948127, + "learning_rate": 4.976524793405635e-05, + "loss": 2.349, + "mean_token_accuracy": 0.4413793087005615, + "step": 93050 + }, + { + "epoch": 0.09372598432175948, + "grad_norm": 12.162001219858503, + "learning_rate": 4.976519394359332e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.43793103098869324, + "step": 93055 + }, + { + "epoch": 0.09373102037486365, + "grad_norm": 10.836631901637716, + "learning_rate": 4.9765139946954946e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.42758620977401735, + "step": 93060 + }, + { + "epoch": 0.09373605642796783, + "grad_norm": 11.846747313726004, + "learning_rate": 4.976508594414126e-05, + "loss": 2.286, + "mean_token_accuracy": 0.44137930274009707, + "step": 93065 + }, + { + "epoch": 0.093741092481072, + "grad_norm": 13.244826298425306, + "learning_rate": 4.9765031935152265e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.44137929677963256, + "step": 93070 + }, + { + "epoch": 0.09374612853417617, + "grad_norm": 11.043719649685853, + "learning_rate": 4.976497791998798e-05, + "loss": 2.1403, + "mean_token_accuracy": 0.48094373345375063, + "step": 93075 + }, + { + "epoch": 0.09375116458728033, + "grad_norm": 11.252232788321866, + "learning_rate": 4.9764923898648416e-05, + "loss": 2.5802, + "mean_token_accuracy": 0.4310344815254211, + "step": 93080 + }, + { + "epoch": 0.09375620064038451, + "grad_norm": 11.324223444030354, + "learning_rate": 4.9764869871133607e-05, + "loss": 2.7732, + "mean_token_accuracy": 0.41379311084747317, + "step": 93085 + }, + { + "epoch": 0.09376123669348868, + "grad_norm": 9.558078622276323, + "learning_rate": 4.976481583744355e-05, + "loss": 2.6255, + "mean_token_accuracy": 0.44827587008476255, + "step": 93090 + }, + { + "epoch": 0.09376627274659285, + "grad_norm": 12.666129130192001, + "learning_rate": 4.976476179757825e-05, + "loss": 2.6499, + "mean_token_accuracy": 0.3551724076271057, + "step": 93095 + }, + { + "epoch": 0.09377130879969703, + "grad_norm": 11.282855602729889, + "learning_rate": 4.976470775153775e-05, + "loss": 2.7059, + "mean_token_accuracy": 0.44827585816383364, + "step": 93100 + }, + { + "epoch": 0.0937763448528012, + "grad_norm": 9.568974110381832, + "learning_rate": 4.976465369932204e-05, + "loss": 2.2253, + "mean_token_accuracy": 0.44482758045196535, + "step": 93105 + }, + { + "epoch": 0.09378138090590538, + "grad_norm": 9.97680478969816, + "learning_rate": 4.976459964093114e-05, + "loss": 2.4123, + "mean_token_accuracy": 0.46551724076271056, + "step": 93110 + }, + { + "epoch": 0.09378641695900955, + "grad_norm": 10.967253389825235, + "learning_rate": 4.976454557636509e-05, + "loss": 2.1697, + "mean_token_accuracy": 0.44482759237289426, + "step": 93115 + }, + { + "epoch": 0.09379145301211372, + "grad_norm": 10.546145964553508, + "learning_rate": 4.976449150562387e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.4620689630508423, + "step": 93120 + }, + { + "epoch": 0.0937964890652179, + "grad_norm": 13.65094195399109, + "learning_rate": 4.9764437428707514e-05, + "loss": 2.8169, + "mean_token_accuracy": 0.37241379022598264, + "step": 93125 + }, + { + "epoch": 0.09380152511832207, + "grad_norm": 9.45009948101431, + "learning_rate": 4.9764383345616033e-05, + "loss": 2.5152, + "mean_token_accuracy": 0.47302955389022827, + "step": 93130 + }, + { + "epoch": 0.09380656117142624, + "grad_norm": 10.961186482380842, + "learning_rate": 4.9764329256349446e-05, + "loss": 2.6161, + "mean_token_accuracy": 0.42758620381355283, + "step": 93135 + }, + { + "epoch": 0.09381159722453042, + "grad_norm": 10.990956903050208, + "learning_rate": 4.976427516090776e-05, + "loss": 2.097, + "mean_token_accuracy": 0.44568966031074525, + "step": 93140 + }, + { + "epoch": 0.09381663327763459, + "grad_norm": 11.016899560724887, + "learning_rate": 4.9764221059291e-05, + "loss": 2.8037, + "mean_token_accuracy": 0.36551724672317504, + "step": 93145 + }, + { + "epoch": 0.09382166933073875, + "grad_norm": 12.59275148945663, + "learning_rate": 4.976416695149916e-05, + "loss": 2.727, + "mean_token_accuracy": 0.3655172407627106, + "step": 93150 + }, + { + "epoch": 0.09382670538384293, + "grad_norm": 13.276339818626704, + "learning_rate": 4.9764112837532287e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.39310344457626345, + "step": 93155 + }, + { + "epoch": 0.0938317414369471, + "grad_norm": 10.127358391367475, + "learning_rate": 4.976405871739038e-05, + "loss": 2.3117, + "mean_token_accuracy": 0.49655172824859617, + "step": 93160 + }, + { + "epoch": 0.09383677749005127, + "grad_norm": 11.50562517117822, + "learning_rate": 4.976400459107344e-05, + "loss": 2.4372, + "mean_token_accuracy": 0.3862069010734558, + "step": 93165 + }, + { + "epoch": 0.09384181354315545, + "grad_norm": 6.682313721336983, + "learning_rate": 4.97639504585815e-05, + "loss": 1.6961, + "mean_token_accuracy": 0.5332728326320648, + "step": 93170 + }, + { + "epoch": 0.09384684959625962, + "grad_norm": 11.24456851501498, + "learning_rate": 4.976389631991457e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.39310344457626345, + "step": 93175 + }, + { + "epoch": 0.0938518856493638, + "grad_norm": 10.826705861434709, + "learning_rate": 4.9763842175072665e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.3931034505367279, + "step": 93180 + }, + { + "epoch": 0.09385692170246797, + "grad_norm": 9.91587807834643, + "learning_rate": 4.976378802405581e-05, + "loss": 2.8099, + "mean_token_accuracy": 0.4310344815254211, + "step": 93185 + }, + { + "epoch": 0.09386195775557214, + "grad_norm": 10.289772472303778, + "learning_rate": 4.9763733866864e-05, + "loss": 2.3091, + "mean_token_accuracy": 0.4482758641242981, + "step": 93190 + }, + { + "epoch": 0.09386699380867632, + "grad_norm": 10.441717954730507, + "learning_rate": 4.976367970349726e-05, + "loss": 2.511, + "mean_token_accuracy": 0.4332123398780823, + "step": 93195 + }, + { + "epoch": 0.09387202986178049, + "grad_norm": 10.185858471927068, + "learning_rate": 4.976362553395561e-05, + "loss": 2.631, + "mean_token_accuracy": 0.4172413766384125, + "step": 93200 + }, + { + "epoch": 0.09387706591488466, + "grad_norm": 10.638513408547466, + "learning_rate": 4.976357135823906e-05, + "loss": 2.4362, + "mean_token_accuracy": 0.4586206912994385, + "step": 93205 + }, + { + "epoch": 0.09388210196798884, + "grad_norm": 12.062711364709205, + "learning_rate": 4.976351717634762e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.441379314661026, + "step": 93210 + }, + { + "epoch": 0.09388713802109301, + "grad_norm": 11.209233862842803, + "learning_rate": 4.9763462988281324e-05, + "loss": 2.6796, + "mean_token_accuracy": 0.4103448212146759, + "step": 93215 + }, + { + "epoch": 0.09389217407419717, + "grad_norm": 13.033225749328151, + "learning_rate": 4.9763408794040154e-05, + "loss": 2.8447, + "mean_token_accuracy": 0.4068965494632721, + "step": 93220 + }, + { + "epoch": 0.09389721012730134, + "grad_norm": 12.242285313587148, + "learning_rate": 4.976335459362416e-05, + "loss": 2.6098, + "mean_token_accuracy": 0.41379310488700866, + "step": 93225 + }, + { + "epoch": 0.09390224618040552, + "grad_norm": 11.672257732824011, + "learning_rate": 4.976330038703333e-05, + "loss": 2.477, + "mean_token_accuracy": 0.4034482777118683, + "step": 93230 + }, + { + "epoch": 0.09390728223350969, + "grad_norm": 12.383770062230347, + "learning_rate": 4.97632461742677e-05, + "loss": 2.7849, + "mean_token_accuracy": 0.3241379290819168, + "step": 93235 + }, + { + "epoch": 0.09391231828661387, + "grad_norm": 10.469512445202916, + "learning_rate": 4.976319195532727e-05, + "loss": 2.4067, + "mean_token_accuracy": 0.41724138259887694, + "step": 93240 + }, + { + "epoch": 0.09391735433971804, + "grad_norm": 11.620291003463986, + "learning_rate": 4.9763137730212066e-05, + "loss": 2.6917, + "mean_token_accuracy": 0.4172413766384125, + "step": 93245 + }, + { + "epoch": 0.09392239039282221, + "grad_norm": 9.764571488040007, + "learning_rate": 4.9763083498922096e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.39310344457626345, + "step": 93250 + }, + { + "epoch": 0.09392742644592639, + "grad_norm": 10.76449687127288, + "learning_rate": 4.976302926145738e-05, + "loss": 2.3687, + "mean_token_accuracy": 0.4379310250282288, + "step": 93255 + }, + { + "epoch": 0.09393246249903056, + "grad_norm": 12.339009233002926, + "learning_rate": 4.9762975017817924e-05, + "loss": 2.3613, + "mean_token_accuracy": 0.36896551251411436, + "step": 93260 + }, + { + "epoch": 0.09393749855213473, + "grad_norm": 10.62470451957075, + "learning_rate": 4.976292076800375e-05, + "loss": 2.2172, + "mean_token_accuracy": 0.46551724672317507, + "step": 93265 + }, + { + "epoch": 0.09394253460523891, + "grad_norm": 11.063958874213542, + "learning_rate": 4.9762866512014874e-05, + "loss": 2.172, + "mean_token_accuracy": 0.4275861978530884, + "step": 93270 + }, + { + "epoch": 0.09394757065834308, + "grad_norm": 9.8511043703983, + "learning_rate": 4.97628122498513e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.4413793087005615, + "step": 93275 + }, + { + "epoch": 0.09395260671144726, + "grad_norm": 9.944804429942453, + "learning_rate": 4.976275798151306e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.3724137872457504, + "step": 93280 + }, + { + "epoch": 0.09395764276455143, + "grad_norm": 13.698295167887576, + "learning_rate": 4.9762703707000165e-05, + "loss": 2.6808, + "mean_token_accuracy": 0.42413792610168455, + "step": 93285 + }, + { + "epoch": 0.09396267881765559, + "grad_norm": 8.869193279337125, + "learning_rate": 4.976264942631262e-05, + "loss": 2.1228, + "mean_token_accuracy": 0.4965517222881317, + "step": 93290 + }, + { + "epoch": 0.09396771487075976, + "grad_norm": 10.606050482256475, + "learning_rate": 4.976259513945044e-05, + "loss": 2.3644, + "mean_token_accuracy": 0.41034482419490814, + "step": 93295 + }, + { + "epoch": 0.09397275092386394, + "grad_norm": 11.63149980598661, + "learning_rate": 4.976254084641366e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.38275861740112305, + "step": 93300 + }, + { + "epoch": 0.09397778697696811, + "grad_norm": 9.549253153383603, + "learning_rate": 4.9762486547202276e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.3896551728248596, + "step": 93305 + }, + { + "epoch": 0.09398282303007228, + "grad_norm": 11.174655951933664, + "learning_rate": 4.97624322418163e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.4448275864124298, + "step": 93310 + }, + { + "epoch": 0.09398785908317646, + "grad_norm": 11.546262547242273, + "learning_rate": 4.9762377930255764e-05, + "loss": 2.174, + "mean_token_accuracy": 0.4344827592372894, + "step": 93315 + }, + { + "epoch": 0.09399289513628063, + "grad_norm": 10.377504964895614, + "learning_rate": 4.976232361252068e-05, + "loss": 2.2115, + "mean_token_accuracy": 0.44482758045196535, + "step": 93320 + }, + { + "epoch": 0.0939979311893848, + "grad_norm": 9.484271421616363, + "learning_rate": 4.9762269288611045e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.3758620619773865, + "step": 93325 + }, + { + "epoch": 0.09400296724248898, + "grad_norm": 12.783151694832894, + "learning_rate": 4.976221495852689e-05, + "loss": 2.2815, + "mean_token_accuracy": 0.4137930989265442, + "step": 93330 + }, + { + "epoch": 0.09400800329559315, + "grad_norm": 8.123686383888591, + "learning_rate": 4.9762160622268235e-05, + "loss": 2.3432, + "mean_token_accuracy": 0.42758620381355283, + "step": 93335 + }, + { + "epoch": 0.09401303934869733, + "grad_norm": 9.006053908335527, + "learning_rate": 4.976210627983507e-05, + "loss": 2.8218, + "mean_token_accuracy": 0.36551723778247835, + "step": 93340 + }, + { + "epoch": 0.0940180754018015, + "grad_norm": 10.803364212176326, + "learning_rate": 4.976205193122744e-05, + "loss": 2.1425, + "mean_token_accuracy": 0.4497277677059174, + "step": 93345 + }, + { + "epoch": 0.09402311145490568, + "grad_norm": 12.833449529251215, + "learning_rate": 4.9761997576445346e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.3344827562570572, + "step": 93350 + }, + { + "epoch": 0.09402814750800985, + "grad_norm": 10.759054762538439, + "learning_rate": 4.976194321548881e-05, + "loss": 2.68, + "mean_token_accuracy": 0.3896551728248596, + "step": 93355 + }, + { + "epoch": 0.09403318356111401, + "grad_norm": 11.392916736789296, + "learning_rate": 4.9761888848357825e-05, + "loss": 2.6934, + "mean_token_accuracy": 0.3862069010734558, + "step": 93360 + }, + { + "epoch": 0.09403821961421818, + "grad_norm": 13.355663311068163, + "learning_rate": 4.976183447505243e-05, + "loss": 2.5513, + "mean_token_accuracy": 0.41379310488700866, + "step": 93365 + }, + { + "epoch": 0.09404325566732236, + "grad_norm": 16.263050210260122, + "learning_rate": 4.976178009557264e-05, + "loss": 2.6799, + "mean_token_accuracy": 0.39655172228813174, + "step": 93370 + }, + { + "epoch": 0.09404829172042653, + "grad_norm": 10.389317954795642, + "learning_rate": 4.976172570991844e-05, + "loss": 2.4516, + "mean_token_accuracy": 0.4413793087005615, + "step": 93375 + }, + { + "epoch": 0.0940533277735307, + "grad_norm": 13.303103308223422, + "learning_rate": 4.976167131808988e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.4068965554237366, + "step": 93380 + }, + { + "epoch": 0.09405836382663488, + "grad_norm": 11.193183975221022, + "learning_rate": 4.9761616920086976e-05, + "loss": 3.3286, + "mean_token_accuracy": 0.37090138494968417, + "step": 93385 + }, + { + "epoch": 0.09406339987973905, + "grad_norm": 13.051894801016754, + "learning_rate": 4.9761562515909714e-05, + "loss": 2.9833, + "mean_token_accuracy": 0.3530550479888916, + "step": 93390 + }, + { + "epoch": 0.09406843593284323, + "grad_norm": 10.442787016084559, + "learning_rate": 4.976150810555813e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.41724138259887694, + "step": 93395 + }, + { + "epoch": 0.0940734719859474, + "grad_norm": 13.019957700363257, + "learning_rate": 4.9761453689032225e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.41724138855934145, + "step": 93400 + }, + { + "epoch": 0.09407850803905157, + "grad_norm": 11.994705902390926, + "learning_rate": 4.976139926633203e-05, + "loss": 2.3616, + "mean_token_accuracy": 0.4137930989265442, + "step": 93405 + }, + { + "epoch": 0.09408354409215575, + "grad_norm": 12.900729814301334, + "learning_rate": 4.976134483745755e-05, + "loss": 2.1395, + "mean_token_accuracy": 0.4620689630508423, + "step": 93410 + }, + { + "epoch": 0.09408858014525992, + "grad_norm": 12.086310244999563, + "learning_rate": 4.976129040240882e-05, + "loss": 2.3367, + "mean_token_accuracy": 0.4620689690113068, + "step": 93415 + }, + { + "epoch": 0.0940936161983641, + "grad_norm": 10.76988756460759, + "learning_rate": 4.976123596118581e-05, + "loss": 2.559, + "mean_token_accuracy": 0.39655172228813174, + "step": 93420 + }, + { + "epoch": 0.09409865225146827, + "grad_norm": 11.401492980232348, + "learning_rate": 4.9761181513788585e-05, + "loss": 2.3397, + "mean_token_accuracy": 0.42413793206214906, + "step": 93425 + }, + { + "epoch": 0.09410368830457243, + "grad_norm": 24.042180130981492, + "learning_rate": 4.9761127060217125e-05, + "loss": 2.8713, + "mean_token_accuracy": 0.4, + "step": 93430 + }, + { + "epoch": 0.0941087243576766, + "grad_norm": 11.372429340350397, + "learning_rate": 4.9761072600471465e-05, + "loss": 2.7881, + "mean_token_accuracy": 0.39655172228813174, + "step": 93435 + }, + { + "epoch": 0.09411376041078078, + "grad_norm": 10.769655212987994, + "learning_rate": 4.976101813455161e-05, + "loss": 2.8612, + "mean_token_accuracy": 0.4034482777118683, + "step": 93440 + }, + { + "epoch": 0.09411879646388495, + "grad_norm": 11.378830648691853, + "learning_rate": 4.976096366245758e-05, + "loss": 2.2618, + "mean_token_accuracy": 0.41034482717514037, + "step": 93445 + }, + { + "epoch": 0.09412383251698912, + "grad_norm": 15.033486243402097, + "learning_rate": 4.976090918418939e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.4413793087005615, + "step": 93450 + }, + { + "epoch": 0.0941288685700933, + "grad_norm": 9.407489659310537, + "learning_rate": 4.976085469974706e-05, + "loss": 2.6696, + "mean_token_accuracy": 0.4413793087005615, + "step": 93455 + }, + { + "epoch": 0.09413390462319747, + "grad_norm": 11.265177386642094, + "learning_rate": 4.9760800209130584e-05, + "loss": 2.7471, + "mean_token_accuracy": 0.37241379022598264, + "step": 93460 + }, + { + "epoch": 0.09413894067630164, + "grad_norm": 9.514155027222621, + "learning_rate": 4.976074571234e-05, + "loss": 2.1029, + "mean_token_accuracy": 0.4517241418361664, + "step": 93465 + }, + { + "epoch": 0.09414397672940582, + "grad_norm": 9.331723860831088, + "learning_rate": 4.976069120937532e-05, + "loss": 2.471, + "mean_token_accuracy": 0.38620689511299133, + "step": 93470 + }, + { + "epoch": 0.09414901278250999, + "grad_norm": 14.235241846256773, + "learning_rate": 4.976063670023654e-05, + "loss": 2.4226, + "mean_token_accuracy": 0.4679975748062134, + "step": 93475 + }, + { + "epoch": 0.09415404883561417, + "grad_norm": 10.398092174856519, + "learning_rate": 4.97605821849237e-05, + "loss": 2.3256, + "mean_token_accuracy": 0.4448275864124298, + "step": 93480 + }, + { + "epoch": 0.09415908488871834, + "grad_norm": 11.666527372335825, + "learning_rate": 4.97605276634368e-05, + "loss": 2.4412, + "mean_token_accuracy": 0.45517241954803467, + "step": 93485 + }, + { + "epoch": 0.09416412094182251, + "grad_norm": 9.434644363207424, + "learning_rate": 4.976047313577586e-05, + "loss": 2.3851, + "mean_token_accuracy": 0.5137930989265442, + "step": 93490 + }, + { + "epoch": 0.09416915699492669, + "grad_norm": 11.023644429272872, + "learning_rate": 4.976041860194091e-05, + "loss": 2.5825, + "mean_token_accuracy": 0.4465819835662842, + "step": 93495 + }, + { + "epoch": 0.09417419304803085, + "grad_norm": 13.583222457920636, + "learning_rate": 4.976036406193193e-05, + "loss": 2.5883, + "mean_token_accuracy": 0.36551723480224607, + "step": 93500 + }, + { + "epoch": 0.09417922910113502, + "grad_norm": 10.728433614798897, + "learning_rate": 4.9760309515748966e-05, + "loss": 2.6421, + "mean_token_accuracy": 0.3813067078590393, + "step": 93505 + }, + { + "epoch": 0.0941842651542392, + "grad_norm": 10.78453840567406, + "learning_rate": 4.976025496339202e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.37586206793785093, + "step": 93510 + }, + { + "epoch": 0.09418930120734337, + "grad_norm": 12.729352341218307, + "learning_rate": 4.976020040486111e-05, + "loss": 2.6238, + "mean_token_accuracy": 0.404839688539505, + "step": 93515 + }, + { + "epoch": 0.09419433726044754, + "grad_norm": 11.797521575361689, + "learning_rate": 4.976014584015624e-05, + "loss": 2.3313, + "mean_token_accuracy": 0.4344827592372894, + "step": 93520 + }, + { + "epoch": 0.09419937331355172, + "grad_norm": 10.363134768154238, + "learning_rate": 4.976009126927744e-05, + "loss": 2.329, + "mean_token_accuracy": 0.4971566915512085, + "step": 93525 + }, + { + "epoch": 0.09420440936665589, + "grad_norm": 13.138645060910342, + "learning_rate": 4.976003669222472e-05, + "loss": 2.4366, + "mean_token_accuracy": 0.3896551728248596, + "step": 93530 + }, + { + "epoch": 0.09420944541976006, + "grad_norm": 9.762158332985264, + "learning_rate": 4.975998210899811e-05, + "loss": 2.5049, + "mean_token_accuracy": 0.4517241299152374, + "step": 93535 + }, + { + "epoch": 0.09421448147286424, + "grad_norm": 11.762140572787576, + "learning_rate": 4.97599275195976e-05, + "loss": 2.1855, + "mean_token_accuracy": 0.44827585816383364, + "step": 93540 + }, + { + "epoch": 0.09421951752596841, + "grad_norm": 15.429912509949562, + "learning_rate": 4.9759872924023215e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.4068965494632721, + "step": 93545 + }, + { + "epoch": 0.09422455357907258, + "grad_norm": 12.283650127048253, + "learning_rate": 4.9759818322274976e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.4034482717514038, + "step": 93550 + }, + { + "epoch": 0.09422958963217676, + "grad_norm": 9.892054657273672, + "learning_rate": 4.975976371435289e-05, + "loss": 2.269, + "mean_token_accuracy": 0.43448275327682495, + "step": 93555 + }, + { + "epoch": 0.09423462568528093, + "grad_norm": 10.682805381327249, + "learning_rate": 4.9759709100256976e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.44482758045196535, + "step": 93560 + }, + { + "epoch": 0.0942396617383851, + "grad_norm": 8.79682834740559, + "learning_rate": 4.975965447998725e-05, + "loss": 2.6551, + "mean_token_accuracy": 0.4310344815254211, + "step": 93565 + }, + { + "epoch": 0.09424469779148927, + "grad_norm": 11.46873677855604, + "learning_rate": 4.975959985354372e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.43793103098869324, + "step": 93570 + }, + { + "epoch": 0.09424973384459344, + "grad_norm": 9.897303091773074, + "learning_rate": 4.9759545220926406e-05, + "loss": 2.5478, + "mean_token_accuracy": 0.4344827592372894, + "step": 93575 + }, + { + "epoch": 0.09425476989769761, + "grad_norm": 9.925567240265732, + "learning_rate": 4.9759490582135336e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.45862069725990295, + "step": 93580 + }, + { + "epoch": 0.09425980595080179, + "grad_norm": 11.009107424762867, + "learning_rate": 4.9759435937170506e-05, + "loss": 2.0894, + "mean_token_accuracy": 0.4655172348022461, + "step": 93585 + }, + { + "epoch": 0.09426484200390596, + "grad_norm": 11.4778292505902, + "learning_rate": 4.9759381286031945e-05, + "loss": 2.393, + "mean_token_accuracy": 0.4517241299152374, + "step": 93590 + }, + { + "epoch": 0.09426987805701013, + "grad_norm": 10.445558245673917, + "learning_rate": 4.975932662871965e-05, + "loss": 2.7289, + "mean_token_accuracy": 0.398124623298645, + "step": 93595 + }, + { + "epoch": 0.09427491411011431, + "grad_norm": 10.797943325203851, + "learning_rate": 4.975927196523365e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.4607380449771881, + "step": 93600 + }, + { + "epoch": 0.09427995016321848, + "grad_norm": 10.242685818374698, + "learning_rate": 4.975921729557396e-05, + "loss": 2.3322, + "mean_token_accuracy": 0.43793103098869324, + "step": 93605 + }, + { + "epoch": 0.09428498621632266, + "grad_norm": 10.940434899695527, + "learning_rate": 4.975916261974059e-05, + "loss": 2.3607, + "mean_token_accuracy": 0.4034482717514038, + "step": 93610 + }, + { + "epoch": 0.09429002226942683, + "grad_norm": 12.077135158807842, + "learning_rate": 4.975910793773357e-05, + "loss": 2.6846, + "mean_token_accuracy": 0.41379310488700866, + "step": 93615 + }, + { + "epoch": 0.094295058322531, + "grad_norm": 12.460848345033678, + "learning_rate": 4.9759053249552895e-05, + "loss": 2.1797, + "mean_token_accuracy": 0.43793103098869324, + "step": 93620 + }, + { + "epoch": 0.09430009437563518, + "grad_norm": 9.585981960975458, + "learning_rate": 4.975899855519859e-05, + "loss": 2.6752, + "mean_token_accuracy": 0.3586206793785095, + "step": 93625 + }, + { + "epoch": 0.09430513042873935, + "grad_norm": 8.840340478336367, + "learning_rate": 4.9758943854670664e-05, + "loss": 2.2101, + "mean_token_accuracy": 0.47931033968925474, + "step": 93630 + }, + { + "epoch": 0.09431016648184352, + "grad_norm": 11.295277769578684, + "learning_rate": 4.975888914796914e-05, + "loss": 2.1588, + "mean_token_accuracy": 0.5009852230548859, + "step": 93635 + }, + { + "epoch": 0.09431520253494768, + "grad_norm": 7.934810751667857, + "learning_rate": 4.975883443509404e-05, + "loss": 2.254, + "mean_token_accuracy": 0.4172413766384125, + "step": 93640 + }, + { + "epoch": 0.09432023858805186, + "grad_norm": 10.47871861006057, + "learning_rate": 4.975877971604536e-05, + "loss": 2.8562, + "mean_token_accuracy": 0.3896551787853241, + "step": 93645 + }, + { + "epoch": 0.09432527464115603, + "grad_norm": 13.767857022316344, + "learning_rate": 4.975872499082312e-05, + "loss": 2.4261, + "mean_token_accuracy": 0.4181034445762634, + "step": 93650 + }, + { + "epoch": 0.0943303106942602, + "grad_norm": 13.075076981592666, + "learning_rate": 4.975867025942735e-05, + "loss": 2.2473, + "mean_token_accuracy": 0.4344827592372894, + "step": 93655 + }, + { + "epoch": 0.09433534674736438, + "grad_norm": 9.364591035153868, + "learning_rate": 4.975861552185804e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.4344827592372894, + "step": 93660 + }, + { + "epoch": 0.09434038280046855, + "grad_norm": 10.25661809750755, + "learning_rate": 4.9758560778115235e-05, + "loss": 2.1552, + "mean_token_accuracy": 0.4517241299152374, + "step": 93665 + }, + { + "epoch": 0.09434541885357273, + "grad_norm": 9.884431864468795, + "learning_rate": 4.9758506028198924e-05, + "loss": 3.4007, + "mean_token_accuracy": 0.34827586114406583, + "step": 93670 + }, + { + "epoch": 0.0943504549066769, + "grad_norm": 9.615450543475237, + "learning_rate": 4.975845127210914e-05, + "loss": 2.6631, + "mean_token_accuracy": 0.39655172228813174, + "step": 93675 + }, + { + "epoch": 0.09435549095978107, + "grad_norm": 11.350525849564994, + "learning_rate": 4.975839650984589e-05, + "loss": 2.6136, + "mean_token_accuracy": 0.3482758641242981, + "step": 93680 + }, + { + "epoch": 0.09436052701288525, + "grad_norm": 13.450916891584987, + "learning_rate": 4.9758341741409185e-05, + "loss": 2.6038, + "mean_token_accuracy": 0.4172413766384125, + "step": 93685 + }, + { + "epoch": 0.09436556306598942, + "grad_norm": 10.736945087669675, + "learning_rate": 4.975828696679906e-05, + "loss": 2.401, + "mean_token_accuracy": 0.4068965494632721, + "step": 93690 + }, + { + "epoch": 0.0943705991190936, + "grad_norm": 11.081771108980188, + "learning_rate": 4.9758232186015504e-05, + "loss": 2.489, + "mean_token_accuracy": 0.41034482717514037, + "step": 93695 + }, + { + "epoch": 0.09437563517219777, + "grad_norm": 13.35974699866472, + "learning_rate": 4.975817739905855e-05, + "loss": 2.5538, + "mean_token_accuracy": 0.3793103456497192, + "step": 93700 + }, + { + "epoch": 0.09438067122530194, + "grad_norm": 13.859442474504705, + "learning_rate": 4.97581226059282e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.41724138855934145, + "step": 93705 + }, + { + "epoch": 0.0943857072784061, + "grad_norm": 10.873035095123415, + "learning_rate": 4.975806780662448e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.44137930274009707, + "step": 93710 + }, + { + "epoch": 0.09439074333151028, + "grad_norm": 10.349623797127418, + "learning_rate": 4.97580130011474e-05, + "loss": 2.6153, + "mean_token_accuracy": 0.4034482717514038, + "step": 93715 + }, + { + "epoch": 0.09439577938461445, + "grad_norm": 11.135879822000295, + "learning_rate": 4.975795818949698e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.3689655244350433, + "step": 93720 + }, + { + "epoch": 0.09440081543771862, + "grad_norm": 17.953275008329015, + "learning_rate": 4.975790337167324e-05, + "loss": 2.9001, + "mean_token_accuracy": 0.37586206793785093, + "step": 93725 + }, + { + "epoch": 0.0944058514908228, + "grad_norm": 9.210224754235076, + "learning_rate": 4.975784854767618e-05, + "loss": 2.2413, + "mean_token_accuracy": 0.42413793206214906, + "step": 93730 + }, + { + "epoch": 0.09441088754392697, + "grad_norm": 9.338199779764377, + "learning_rate": 4.975779371750582e-05, + "loss": 2.1955, + "mean_token_accuracy": 0.4620689570903778, + "step": 93735 + }, + { + "epoch": 0.09441592359703115, + "grad_norm": 8.986614141636872, + "learning_rate": 4.9757738881162185e-05, + "loss": 2.3057, + "mean_token_accuracy": 0.47241379618644713, + "step": 93740 + }, + { + "epoch": 0.09442095965013532, + "grad_norm": 15.678215099298507, + "learning_rate": 4.975768403864528e-05, + "loss": 2.4146, + "mean_token_accuracy": 0.43793103098869324, + "step": 93745 + }, + { + "epoch": 0.0944259957032395, + "grad_norm": 9.96931783290619, + "learning_rate": 4.975762918995512e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.4344827651977539, + "step": 93750 + }, + { + "epoch": 0.09443103175634367, + "grad_norm": 13.117964465837305, + "learning_rate": 4.9757574335091724e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.3896551728248596, + "step": 93755 + }, + { + "epoch": 0.09443606780944784, + "grad_norm": 8.261185835731977, + "learning_rate": 4.9757519474055106e-05, + "loss": 2.1644, + "mean_token_accuracy": 0.474954628944397, + "step": 93760 + }, + { + "epoch": 0.09444110386255201, + "grad_norm": 10.854261072355659, + "learning_rate": 4.975746460684529e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.35862069129943847, + "step": 93765 + }, + { + "epoch": 0.09444613991565619, + "grad_norm": 10.62232783710421, + "learning_rate": 4.975740973346228e-05, + "loss": 2.2855, + "mean_token_accuracy": 0.3965517282485962, + "step": 93770 + }, + { + "epoch": 0.09445117596876036, + "grad_norm": 9.071071475778446, + "learning_rate": 4.975735485390609e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.41034482717514037, + "step": 93775 + }, + { + "epoch": 0.09445621202186452, + "grad_norm": 11.250621575352612, + "learning_rate": 4.975729996817674e-05, + "loss": 2.5456, + "mean_token_accuracy": 0.4275861978530884, + "step": 93780 + }, + { + "epoch": 0.0944612480749687, + "grad_norm": 10.635798755881865, + "learning_rate": 4.975724507627425e-05, + "loss": 2.431, + "mean_token_accuracy": 0.4413793087005615, + "step": 93785 + }, + { + "epoch": 0.09446628412807287, + "grad_norm": 11.068434215526537, + "learning_rate": 4.975719017819863e-05, + "loss": 2.6297, + "mean_token_accuracy": 0.3620689630508423, + "step": 93790 + }, + { + "epoch": 0.09447132018117704, + "grad_norm": 11.350134195918969, + "learning_rate": 4.9757135273949883e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.3931034505367279, + "step": 93795 + }, + { + "epoch": 0.09447635623428122, + "grad_norm": 9.743414085774845, + "learning_rate": 4.975708036352805e-05, + "loss": 2.3428, + "mean_token_accuracy": 0.4068965554237366, + "step": 93800 + }, + { + "epoch": 0.09448139228738539, + "grad_norm": 11.090275212118284, + "learning_rate": 4.975702544693313e-05, + "loss": 2.6732, + "mean_token_accuracy": 0.3655172407627106, + "step": 93805 + }, + { + "epoch": 0.09448642834048956, + "grad_norm": 10.533639835381354, + "learning_rate": 4.975697052416514e-05, + "loss": 2.4948, + "mean_token_accuracy": 0.4103448331356049, + "step": 93810 + }, + { + "epoch": 0.09449146439359374, + "grad_norm": 12.407908713513581, + "learning_rate": 4.975691559522409e-05, + "loss": 2.7467, + "mean_token_accuracy": 0.38965516686439516, + "step": 93815 + }, + { + "epoch": 0.09449650044669791, + "grad_norm": 11.290234417151717, + "learning_rate": 4.975686066011e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.3965517282485962, + "step": 93820 + }, + { + "epoch": 0.09450153649980209, + "grad_norm": 11.927253177402545, + "learning_rate": 4.97568057188229e-05, + "loss": 2.5858, + "mean_token_accuracy": 0.4137930989265442, + "step": 93825 + }, + { + "epoch": 0.09450657255290626, + "grad_norm": 10.068815763515476, + "learning_rate": 4.9756750771362786e-05, + "loss": 2.5294, + "mean_token_accuracy": 0.4068965494632721, + "step": 93830 + }, + { + "epoch": 0.09451160860601043, + "grad_norm": 10.385499160633943, + "learning_rate": 4.975669581772968e-05, + "loss": 2.4088, + "mean_token_accuracy": 0.4310344815254211, + "step": 93835 + }, + { + "epoch": 0.09451664465911461, + "grad_norm": 16.898756621614776, + "learning_rate": 4.975664085792359e-05, + "loss": 2.8586, + "mean_token_accuracy": 0.3379310369491577, + "step": 93840 + }, + { + "epoch": 0.09452168071221878, + "grad_norm": 9.677758414103813, + "learning_rate": 4.975658589194454e-05, + "loss": 2.3842, + "mean_token_accuracy": 0.45517241954803467, + "step": 93845 + }, + { + "epoch": 0.09452671676532294, + "grad_norm": 16.605940577719693, + "learning_rate": 4.9756530919792555e-05, + "loss": 2.63, + "mean_token_accuracy": 0.42413793206214906, + "step": 93850 + }, + { + "epoch": 0.09453175281842711, + "grad_norm": 11.359746430765545, + "learning_rate": 4.975647594146762e-05, + "loss": 2.9495, + "mean_token_accuracy": 0.3793103456497192, + "step": 93855 + }, + { + "epoch": 0.09453678887153129, + "grad_norm": 11.232638441593778, + "learning_rate": 4.9756420956969785e-05, + "loss": 2.2464, + "mean_token_accuracy": 0.44827587008476255, + "step": 93860 + }, + { + "epoch": 0.09454182492463546, + "grad_norm": 10.781007337363807, + "learning_rate": 4.975636596629904e-05, + "loss": 2.2326, + "mean_token_accuracy": 0.4072660028934479, + "step": 93865 + }, + { + "epoch": 0.09454686097773964, + "grad_norm": 10.771587444999625, + "learning_rate": 4.975631096945541e-05, + "loss": 2.2469, + "mean_token_accuracy": 0.38275861740112305, + "step": 93870 + }, + { + "epoch": 0.09455189703084381, + "grad_norm": 9.97903466176054, + "learning_rate": 4.975625596643891e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.4034482777118683, + "step": 93875 + }, + { + "epoch": 0.09455693308394798, + "grad_norm": 9.667966985155022, + "learning_rate": 4.975620095724955e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.39310345351696013, + "step": 93880 + }, + { + "epoch": 0.09456196913705216, + "grad_norm": 10.422221882259715, + "learning_rate": 4.975614594188735e-05, + "loss": 2.6904, + "mean_token_accuracy": 0.42068966031074523, + "step": 93885 + }, + { + "epoch": 0.09456700519015633, + "grad_norm": 10.301213538322283, + "learning_rate": 4.9756090920352325e-05, + "loss": 2.2933, + "mean_token_accuracy": 0.40344826579093934, + "step": 93890 + }, + { + "epoch": 0.0945720412432605, + "grad_norm": 15.333407073218519, + "learning_rate": 4.975603589264449e-05, + "loss": 2.4217, + "mean_token_accuracy": 0.38771929144859313, + "step": 93895 + }, + { + "epoch": 0.09457707729636468, + "grad_norm": 9.863553787669066, + "learning_rate": 4.975598085876387e-05, + "loss": 2.0677, + "mean_token_accuracy": 0.4862069010734558, + "step": 93900 + }, + { + "epoch": 0.09458211334946885, + "grad_norm": 9.029298240516116, + "learning_rate": 4.975592581871047e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.4206896543502808, + "step": 93905 + }, + { + "epoch": 0.09458714940257303, + "grad_norm": 11.363515938139154, + "learning_rate": 4.975587077248429e-05, + "loss": 2.8235, + "mean_token_accuracy": 0.38620689511299133, + "step": 93910 + }, + { + "epoch": 0.0945921854556772, + "grad_norm": 9.89855368105006, + "learning_rate": 4.975581572008537e-05, + "loss": 2.3144, + "mean_token_accuracy": 0.4551724135875702, + "step": 93915 + }, + { + "epoch": 0.09459722150878136, + "grad_norm": 9.229849367909186, + "learning_rate": 4.975576066151372e-05, + "loss": 2.4823, + "mean_token_accuracy": 0.4137930989265442, + "step": 93920 + }, + { + "epoch": 0.09460225756188553, + "grad_norm": 11.301061163007358, + "learning_rate": 4.975570559676935e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.4, + "step": 93925 + }, + { + "epoch": 0.09460729361498971, + "grad_norm": 9.573180422833483, + "learning_rate": 4.975565052585227e-05, + "loss": 2.3194, + "mean_token_accuracy": 0.4620689511299133, + "step": 93930 + }, + { + "epoch": 0.09461232966809388, + "grad_norm": 10.904534787616782, + "learning_rate": 4.9755595448762516e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.4379310429096222, + "step": 93935 + }, + { + "epoch": 0.09461736572119805, + "grad_norm": 9.692243851405578, + "learning_rate": 4.9755540365500074e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.43103448748588563, + "step": 93940 + }, + { + "epoch": 0.09462240177430223, + "grad_norm": 8.989314435235977, + "learning_rate": 4.975548527606499e-05, + "loss": 2.8007, + "mean_token_accuracy": 0.38275861740112305, + "step": 93945 + }, + { + "epoch": 0.0946274378274064, + "grad_norm": 10.058742094646837, + "learning_rate": 4.975543018045725e-05, + "loss": 2.8439, + "mean_token_accuracy": 0.38620689511299133, + "step": 93950 + }, + { + "epoch": 0.09463247388051058, + "grad_norm": 11.084764594746185, + "learning_rate": 4.975537507867689e-05, + "loss": 2.6334, + "mean_token_accuracy": 0.3896551728248596, + "step": 93955 + }, + { + "epoch": 0.09463750993361475, + "grad_norm": 13.561411008815563, + "learning_rate": 4.975531997072392e-05, + "loss": 2.3086, + "mean_token_accuracy": 0.43103448748588563, + "step": 93960 + }, + { + "epoch": 0.09464254598671892, + "grad_norm": 10.247279109235317, + "learning_rate": 4.975526485659836e-05, + "loss": 2.4773, + "mean_token_accuracy": 0.4172413766384125, + "step": 93965 + }, + { + "epoch": 0.0946475820398231, + "grad_norm": 10.552376383481556, + "learning_rate": 4.975520973630021e-05, + "loss": 2.3349, + "mean_token_accuracy": 0.4620689630508423, + "step": 93970 + }, + { + "epoch": 0.09465261809292727, + "grad_norm": 10.255334484756231, + "learning_rate": 4.97551546098295e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.41120689511299136, + "step": 93975 + }, + { + "epoch": 0.09465765414603144, + "grad_norm": 9.439251456045085, + "learning_rate": 4.9755099477186234e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.38275861740112305, + "step": 93980 + }, + { + "epoch": 0.09466269019913562, + "grad_norm": 14.047924728690122, + "learning_rate": 4.975504433837043e-05, + "loss": 2.9445, + "mean_token_accuracy": 0.42758620381355283, + "step": 93985 + }, + { + "epoch": 0.09466772625223978, + "grad_norm": 12.020407930655042, + "learning_rate": 4.975498919338212e-05, + "loss": 2.852, + "mean_token_accuracy": 0.3862068891525269, + "step": 93990 + }, + { + "epoch": 0.09467276230534395, + "grad_norm": 8.361322479137154, + "learning_rate": 4.975493404222129e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.403448274731636, + "step": 93995 + }, + { + "epoch": 0.09467779835844813, + "grad_norm": 10.593476554822363, + "learning_rate": 4.975487888488798e-05, + "loss": 2.6315, + "mean_token_accuracy": 0.3655172407627106, + "step": 94000 + }, + { + "epoch": 0.0946828344115523, + "grad_norm": 12.378403596429438, + "learning_rate": 4.975482372138219e-05, + "loss": 2.3028, + "mean_token_accuracy": 0.4329098641872406, + "step": 94005 + }, + { + "epoch": 0.09468787046465647, + "grad_norm": 10.548593689097432, + "learning_rate": 4.975476855170395e-05, + "loss": 2.5973, + "mean_token_accuracy": 0.36896551847457887, + "step": 94010 + }, + { + "epoch": 0.09469290651776065, + "grad_norm": 14.381290414232296, + "learning_rate": 4.975471337585327e-05, + "loss": 2.6314, + "mean_token_accuracy": 0.495220810174942, + "step": 94015 + }, + { + "epoch": 0.09469794257086482, + "grad_norm": 10.499366230234484, + "learning_rate": 4.975465819383015e-05, + "loss": 2.8451, + "mean_token_accuracy": 0.35517241060733795, + "step": 94020 + }, + { + "epoch": 0.094702978623969, + "grad_norm": 8.710095164692776, + "learning_rate": 4.975460300563463e-05, + "loss": 1.9622, + "mean_token_accuracy": 0.5048029601573945, + "step": 94025 + }, + { + "epoch": 0.09470801467707317, + "grad_norm": 8.633174442755298, + "learning_rate": 4.9754547811266705e-05, + "loss": 2.2888, + "mean_token_accuracy": 0.4000000059604645, + "step": 94030 + }, + { + "epoch": 0.09471305073017734, + "grad_norm": 10.056991291146847, + "learning_rate": 4.97544926107264e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.4517241358757019, + "step": 94035 + }, + { + "epoch": 0.09471808678328152, + "grad_norm": 11.101551432308566, + "learning_rate": 4.9754437404013723e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.4034482777118683, + "step": 94040 + }, + { + "epoch": 0.09472312283638569, + "grad_norm": 11.9635585038568, + "learning_rate": 4.9754382191128705e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.4068965494632721, + "step": 94045 + }, + { + "epoch": 0.09472815888948986, + "grad_norm": 10.428366813764141, + "learning_rate": 4.9754326972071345e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.422202056646347, + "step": 94050 + }, + { + "epoch": 0.09473319494259404, + "grad_norm": 10.885062873179944, + "learning_rate": 4.975427174684166e-05, + "loss": 2.0915, + "mean_token_accuracy": 0.4965517342090607, + "step": 94055 + }, + { + "epoch": 0.0947382309956982, + "grad_norm": 11.401241691136448, + "learning_rate": 4.9754216515439676e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.40689654350280763, + "step": 94060 + }, + { + "epoch": 0.09474326704880237, + "grad_norm": 11.04480346381988, + "learning_rate": 4.97541612778654e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.43938294649124143, + "step": 94065 + }, + { + "epoch": 0.09474830310190654, + "grad_norm": 15.084593694797695, + "learning_rate": 4.9754106034118855e-05, + "loss": 2.5102, + "mean_token_accuracy": 0.42758620381355283, + "step": 94070 + }, + { + "epoch": 0.09475333915501072, + "grad_norm": 9.107846441030375, + "learning_rate": 4.9754050784200045e-05, + "loss": 2.6216, + "mean_token_accuracy": 0.3551724135875702, + "step": 94075 + }, + { + "epoch": 0.09475837520811489, + "grad_norm": 13.17832664332979, + "learning_rate": 4.9753995528108994e-05, + "loss": 2.5337, + "mean_token_accuracy": 0.3758620619773865, + "step": 94080 + }, + { + "epoch": 0.09476341126121907, + "grad_norm": 13.080730355864766, + "learning_rate": 4.97539402658457e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.3862069010734558, + "step": 94085 + }, + { + "epoch": 0.09476844731432324, + "grad_norm": 10.075251771425732, + "learning_rate": 4.975388499741022e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.46896551847457885, + "step": 94090 + }, + { + "epoch": 0.09477348336742741, + "grad_norm": 9.989801220531925, + "learning_rate": 4.975382972280252e-05, + "loss": 2.5694, + "mean_token_accuracy": 0.4172413766384125, + "step": 94095 + }, + { + "epoch": 0.09477851942053159, + "grad_norm": 10.717814255447083, + "learning_rate": 4.9753774442022646e-05, + "loss": 2.8049, + "mean_token_accuracy": 0.4034482777118683, + "step": 94100 + }, + { + "epoch": 0.09478355547363576, + "grad_norm": 8.8756890634186, + "learning_rate": 4.97537191550706e-05, + "loss": 2.3629, + "mean_token_accuracy": 0.4517241418361664, + "step": 94105 + }, + { + "epoch": 0.09478859152673993, + "grad_norm": 9.482943544631453, + "learning_rate": 4.975366386194641e-05, + "loss": 2.1467, + "mean_token_accuracy": 0.46400484442710876, + "step": 94110 + }, + { + "epoch": 0.09479362757984411, + "grad_norm": 12.526991037223246, + "learning_rate": 4.975360856265008e-05, + "loss": 2.1067, + "mean_token_accuracy": 0.4918330252170563, + "step": 94115 + }, + { + "epoch": 0.09479866363294828, + "grad_norm": 9.876211166521708, + "learning_rate": 4.975355325718162e-05, + "loss": 2.209, + "mean_token_accuracy": 0.47465215921401976, + "step": 94120 + }, + { + "epoch": 0.09480369968605246, + "grad_norm": 11.19505914460475, + "learning_rate": 4.975349794554106e-05, + "loss": 2.6571, + "mean_token_accuracy": 0.3586206942796707, + "step": 94125 + }, + { + "epoch": 0.09480873573915662, + "grad_norm": 10.898563708567476, + "learning_rate": 4.975344262772841e-05, + "loss": 2.919, + "mean_token_accuracy": 0.39655172228813174, + "step": 94130 + }, + { + "epoch": 0.09481377179226079, + "grad_norm": 11.244838518836662, + "learning_rate": 4.9753387303743685e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.3965517282485962, + "step": 94135 + }, + { + "epoch": 0.09481880784536496, + "grad_norm": 11.716285137324823, + "learning_rate": 4.97533319735869e-05, + "loss": 2.3839, + "mean_token_accuracy": 0.39310344457626345, + "step": 94140 + }, + { + "epoch": 0.09482384389846914, + "grad_norm": 14.632955222398527, + "learning_rate": 4.975327663725807e-05, + "loss": 2.4098, + "mean_token_accuracy": 0.4379310369491577, + "step": 94145 + }, + { + "epoch": 0.09482887995157331, + "grad_norm": 10.83865468385621, + "learning_rate": 4.9753221294757214e-05, + "loss": 2.0551, + "mean_token_accuracy": 0.4882032573223114, + "step": 94150 + }, + { + "epoch": 0.09483391600467748, + "grad_norm": 10.374182648422599, + "learning_rate": 4.975316594608434e-05, + "loss": 2.6452, + "mean_token_accuracy": 0.3931034505367279, + "step": 94155 + }, + { + "epoch": 0.09483895205778166, + "grad_norm": 10.775848461761601, + "learning_rate": 4.975311059123947e-05, + "loss": 1.9772, + "mean_token_accuracy": 0.5, + "step": 94160 + }, + { + "epoch": 0.09484398811088583, + "grad_norm": 9.96367579724092, + "learning_rate": 4.975305523022262e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.39655172228813174, + "step": 94165 + }, + { + "epoch": 0.09484902416399, + "grad_norm": 14.70227219675699, + "learning_rate": 4.975299986303379e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.38275861740112305, + "step": 94170 + }, + { + "epoch": 0.09485406021709418, + "grad_norm": 12.71775505855947, + "learning_rate": 4.975294448967302e-05, + "loss": 2.7976, + "mean_token_accuracy": 0.37931033968925476, + "step": 94175 + }, + { + "epoch": 0.09485909627019835, + "grad_norm": 11.794687455909681, + "learning_rate": 4.97528891101403e-05, + "loss": 2.916, + "mean_token_accuracy": 0.3620689630508423, + "step": 94180 + }, + { + "epoch": 0.09486413232330253, + "grad_norm": 12.633535851677799, + "learning_rate": 4.9752833724435675e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.43103447556495667, + "step": 94185 + }, + { + "epoch": 0.0948691683764067, + "grad_norm": 12.930173928268955, + "learning_rate": 4.975277833255913e-05, + "loss": 2.1001, + "mean_token_accuracy": 0.47241378426551817, + "step": 94190 + }, + { + "epoch": 0.09487420442951087, + "grad_norm": 11.595969660077177, + "learning_rate": 4.97527229345107e-05, + "loss": 2.4139, + "mean_token_accuracy": 0.4172413766384125, + "step": 94195 + }, + { + "epoch": 0.09487924048261503, + "grad_norm": 12.720048473859977, + "learning_rate": 4.975266753029039e-05, + "loss": 2.2742, + "mean_token_accuracy": 0.44482758045196535, + "step": 94200 + }, + { + "epoch": 0.09488427653571921, + "grad_norm": 10.32637864530848, + "learning_rate": 4.975261211989822e-05, + "loss": 2.1682, + "mean_token_accuracy": 0.4620689630508423, + "step": 94205 + }, + { + "epoch": 0.09488931258882338, + "grad_norm": 11.023335206984484, + "learning_rate": 4.975255670333421e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.37586206793785093, + "step": 94210 + }, + { + "epoch": 0.09489434864192756, + "grad_norm": 11.919258846605922, + "learning_rate": 4.975250128059837e-05, + "loss": 2.6812, + "mean_token_accuracy": 0.41034482717514037, + "step": 94215 + }, + { + "epoch": 0.09489938469503173, + "grad_norm": 11.849966399148158, + "learning_rate": 4.975244585169072e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.4379310369491577, + "step": 94220 + }, + { + "epoch": 0.0949044207481359, + "grad_norm": 12.702649132053262, + "learning_rate": 4.975239041661126e-05, + "loss": 2.8226, + "mean_token_accuracy": 0.3793103456497192, + "step": 94225 + }, + { + "epoch": 0.09490945680124008, + "grad_norm": 12.077951498912952, + "learning_rate": 4.975233497536002e-05, + "loss": 2.6892, + "mean_token_accuracy": 0.4103448331356049, + "step": 94230 + }, + { + "epoch": 0.09491449285434425, + "grad_norm": 11.005103946081398, + "learning_rate": 4.9752279527937005e-05, + "loss": 2.5906, + "mean_token_accuracy": 0.36206896901130675, + "step": 94235 + }, + { + "epoch": 0.09491952890744842, + "grad_norm": 10.209504940691707, + "learning_rate": 4.9752224074342246e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.42758620977401735, + "step": 94240 + }, + { + "epoch": 0.0949245649605526, + "grad_norm": 12.376001238853574, + "learning_rate": 4.975216861457575e-05, + "loss": 2.5954, + "mean_token_accuracy": 0.4344827562570572, + "step": 94245 + }, + { + "epoch": 0.09492960101365677, + "grad_norm": 11.935646126591037, + "learning_rate": 4.975211314863753e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.3655172407627106, + "step": 94250 + }, + { + "epoch": 0.09493463706676095, + "grad_norm": 11.61411597924925, + "learning_rate": 4.9752057676527604e-05, + "loss": 2.5126, + "mean_token_accuracy": 0.4387931048870087, + "step": 94255 + }, + { + "epoch": 0.09493967311986512, + "grad_norm": 14.295694513225758, + "learning_rate": 4.975200219824599e-05, + "loss": 2.8176, + "mean_token_accuracy": 0.3931034505367279, + "step": 94260 + }, + { + "epoch": 0.0949447091729693, + "grad_norm": 10.035437451532458, + "learning_rate": 4.97519467137927e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.38620689511299133, + "step": 94265 + }, + { + "epoch": 0.09494974522607345, + "grad_norm": 11.45228448017396, + "learning_rate": 4.9751891223167746e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.42577131986618044, + "step": 94270 + }, + { + "epoch": 0.09495478127917763, + "grad_norm": 12.881094630419891, + "learning_rate": 4.975183572637115e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.4517241358757019, + "step": 94275 + }, + { + "epoch": 0.0949598173322818, + "grad_norm": 15.687804508943161, + "learning_rate": 4.975178022340292e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.4172413766384125, + "step": 94280 + }, + { + "epoch": 0.09496485338538597, + "grad_norm": 8.539945164305356, + "learning_rate": 4.975172471426307e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.41034482717514037, + "step": 94285 + }, + { + "epoch": 0.09496988943849015, + "grad_norm": 17.11511330502168, + "learning_rate": 4.9751669198951636e-05, + "loss": 2.0523, + "mean_token_accuracy": 0.47241380214691164, + "step": 94290 + }, + { + "epoch": 0.09497492549159432, + "grad_norm": 12.335162234094282, + "learning_rate": 4.975161367746861e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.4034482717514038, + "step": 94295 + }, + { + "epoch": 0.0949799615446985, + "grad_norm": 11.693466600893839, + "learning_rate": 4.975155814981402e-05, + "loss": 2.7601, + "mean_token_accuracy": 0.38802178502082824, + "step": 94300 + }, + { + "epoch": 0.09498499759780267, + "grad_norm": 10.13867408139205, + "learning_rate": 4.975150261598787e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.43992740511894224, + "step": 94305 + }, + { + "epoch": 0.09499003365090684, + "grad_norm": 10.663304629606444, + "learning_rate": 4.9751447075990187e-05, + "loss": 2.5268, + "mean_token_accuracy": 0.40689654350280763, + "step": 94310 + }, + { + "epoch": 0.09499506970401102, + "grad_norm": 10.048043207604094, + "learning_rate": 4.975139152982099e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.41379311084747317, + "step": 94315 + }, + { + "epoch": 0.09500010575711519, + "grad_norm": 8.955446389755506, + "learning_rate": 4.975133597748027e-05, + "loss": 2.235, + "mean_token_accuracy": 0.4413793087005615, + "step": 94320 + }, + { + "epoch": 0.09500514181021937, + "grad_norm": 11.76075599888514, + "learning_rate": 4.975128041896807e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.42413793206214906, + "step": 94325 + }, + { + "epoch": 0.09501017786332354, + "grad_norm": 12.276962253209081, + "learning_rate": 4.975122485428439e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.47931033968925474, + "step": 94330 + }, + { + "epoch": 0.09501521391642771, + "grad_norm": 9.808878404526567, + "learning_rate": 4.975116928342925e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.43986691236495973, + "step": 94335 + }, + { + "epoch": 0.09502024996953187, + "grad_norm": 10.948924324172822, + "learning_rate": 4.975111370640268e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.3931034505367279, + "step": 94340 + }, + { + "epoch": 0.09502528602263605, + "grad_norm": 10.972611290763291, + "learning_rate": 4.975105812320466e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.37241379022598264, + "step": 94345 + }, + { + "epoch": 0.09503032207574022, + "grad_norm": 13.597641603841403, + "learning_rate": 4.9751002533835235e-05, + "loss": 2.9661, + "mean_token_accuracy": 0.3551724165678024, + "step": 94350 + }, + { + "epoch": 0.0950353581288444, + "grad_norm": 13.740024119148027, + "learning_rate": 4.9750946938294416e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.4034482777118683, + "step": 94355 + }, + { + "epoch": 0.09504039418194857, + "grad_norm": 9.617909462409292, + "learning_rate": 4.97508913365822e-05, + "loss": 2.298, + "mean_token_accuracy": 0.4034482717514038, + "step": 94360 + }, + { + "epoch": 0.09504543023505274, + "grad_norm": 11.78687467126387, + "learning_rate": 4.975083572869863e-05, + "loss": 2.6124, + "mean_token_accuracy": 0.37586207389831544, + "step": 94365 + }, + { + "epoch": 0.09505046628815692, + "grad_norm": 11.669013559961504, + "learning_rate": 4.97507801146437e-05, + "loss": 2.0374, + "mean_token_accuracy": 0.47779794335365294, + "step": 94370 + }, + { + "epoch": 0.09505550234126109, + "grad_norm": 9.853320361316026, + "learning_rate": 4.975072449441743e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.4691470146179199, + "step": 94375 + }, + { + "epoch": 0.09506053839436526, + "grad_norm": 9.578687421638223, + "learning_rate": 4.9750668868019846e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.38965518176555636, + "step": 94380 + }, + { + "epoch": 0.09506557444746944, + "grad_norm": 10.25236658852576, + "learning_rate": 4.9750613235450954e-05, + "loss": 2.149, + "mean_token_accuracy": 0.4294010937213898, + "step": 94385 + }, + { + "epoch": 0.09507061050057361, + "grad_norm": 13.93944134060669, + "learning_rate": 4.975055759671077e-05, + "loss": 3.4523, + "mean_token_accuracy": 0.34137930572032926, + "step": 94390 + }, + { + "epoch": 0.09507564655367778, + "grad_norm": 16.91302420894603, + "learning_rate": 4.9750501951799315e-05, + "loss": 2.928, + "mean_token_accuracy": 0.379310342669487, + "step": 94395 + }, + { + "epoch": 0.09508068260678196, + "grad_norm": 10.64670914900411, + "learning_rate": 4.9750446300716595e-05, + "loss": 2.6618, + "mean_token_accuracy": 0.3620689630508423, + "step": 94400 + }, + { + "epoch": 0.09508571865988613, + "grad_norm": 11.155151718042662, + "learning_rate": 4.9750390643462636e-05, + "loss": 2.6848, + "mean_token_accuracy": 0.4620689690113068, + "step": 94405 + }, + { + "epoch": 0.09509075471299029, + "grad_norm": 10.811400923585001, + "learning_rate": 4.9750334980037445e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.38275861740112305, + "step": 94410 + }, + { + "epoch": 0.09509579076609447, + "grad_norm": 10.38125911337505, + "learning_rate": 4.975027931044104e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.4310344815254211, + "step": 94415 + }, + { + "epoch": 0.09510082681919864, + "grad_norm": 10.159771520319072, + "learning_rate": 4.975022363467344e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.3517241358757019, + "step": 94420 + }, + { + "epoch": 0.09510586287230281, + "grad_norm": 10.530161255303089, + "learning_rate": 4.9750167952734656e-05, + "loss": 2.0551, + "mean_token_accuracy": 0.44827585816383364, + "step": 94425 + }, + { + "epoch": 0.09511089892540699, + "grad_norm": 10.381008716554916, + "learning_rate": 4.9750112264624696e-05, + "loss": 2.2635, + "mean_token_accuracy": 0.47931033968925474, + "step": 94430 + }, + { + "epoch": 0.09511593497851116, + "grad_norm": 11.189350621478148, + "learning_rate": 4.97500565703436e-05, + "loss": 2.5448, + "mean_token_accuracy": 0.42965604066848756, + "step": 94435 + }, + { + "epoch": 0.09512097103161533, + "grad_norm": 11.349959911623296, + "learning_rate": 4.975000086989136e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.4620689690113068, + "step": 94440 + }, + { + "epoch": 0.09512600708471951, + "grad_norm": 11.226907896870637, + "learning_rate": 4.9749945163268e-05, + "loss": 2.4935, + "mean_token_accuracy": 0.42413793206214906, + "step": 94445 + }, + { + "epoch": 0.09513104313782368, + "grad_norm": 13.009262458724498, + "learning_rate": 4.9749889450473534e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.4206896543502808, + "step": 94450 + }, + { + "epoch": 0.09513607919092786, + "grad_norm": 10.08457291797649, + "learning_rate": 4.974983373150798e-05, + "loss": 2.2717, + "mean_token_accuracy": 0.43448275327682495, + "step": 94455 + }, + { + "epoch": 0.09514111524403203, + "grad_norm": 13.651733527016296, + "learning_rate": 4.974977800637135e-05, + "loss": 2.5546, + "mean_token_accuracy": 0.3793103456497192, + "step": 94460 + }, + { + "epoch": 0.0951461512971362, + "grad_norm": 11.14307438847547, + "learning_rate": 4.974972227506366e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.3862069010734558, + "step": 94465 + }, + { + "epoch": 0.09515118735024038, + "grad_norm": 9.575953675411071, + "learning_rate": 4.9749666537584926e-05, + "loss": 2.2173, + "mean_token_accuracy": 0.49830610752105714, + "step": 94470 + }, + { + "epoch": 0.09515622340334455, + "grad_norm": 10.814108298782577, + "learning_rate": 4.9749610793935165e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.4413793087005615, + "step": 94475 + }, + { + "epoch": 0.09516125945644871, + "grad_norm": 9.878307143859834, + "learning_rate": 4.9749555044114397e-05, + "loss": 2.8667, + "mean_token_accuracy": 0.4310344696044922, + "step": 94480 + }, + { + "epoch": 0.09516629550955288, + "grad_norm": 11.70583758934135, + "learning_rate": 4.974949928812263e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.43793103098869324, + "step": 94485 + }, + { + "epoch": 0.09517133156265706, + "grad_norm": 14.088369576505439, + "learning_rate": 4.974944352595988e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.41724138259887694, + "step": 94490 + }, + { + "epoch": 0.09517636761576123, + "grad_norm": 12.062304066988213, + "learning_rate": 4.974938775762615e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.4551724135875702, + "step": 94495 + }, + { + "epoch": 0.0951814036688654, + "grad_norm": 9.722449162366482, + "learning_rate": 4.974933198312149e-05, + "loss": 2.9638, + "mean_token_accuracy": 0.38275861740112305, + "step": 94500 + }, + { + "epoch": 0.09518643972196958, + "grad_norm": 14.021284043765027, + "learning_rate": 4.9749276202445885e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.46551724076271056, + "step": 94505 + }, + { + "epoch": 0.09519147577507375, + "grad_norm": 10.409375346892778, + "learning_rate": 4.9749220415599354e-05, + "loss": 2.3532, + "mean_token_accuracy": 0.4482758641242981, + "step": 94510 + }, + { + "epoch": 0.09519651182817793, + "grad_norm": 12.078261072916789, + "learning_rate": 4.974916462258193e-05, + "loss": 2.3442, + "mean_token_accuracy": 0.4344827592372894, + "step": 94515 + }, + { + "epoch": 0.0952015478812821, + "grad_norm": 10.218559721014014, + "learning_rate": 4.974910882339362e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.3896551728248596, + "step": 94520 + }, + { + "epoch": 0.09520658393438627, + "grad_norm": 10.33042513440372, + "learning_rate": 4.974905301803443e-05, + "loss": 2.4093, + "mean_token_accuracy": 0.42413793206214906, + "step": 94525 + }, + { + "epoch": 0.09521161998749045, + "grad_norm": 12.233469919119408, + "learning_rate": 4.974899720650438e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.4344827592372894, + "step": 94530 + }, + { + "epoch": 0.09521665604059462, + "grad_norm": 10.393902809728301, + "learning_rate": 4.974894138880349e-05, + "loss": 2.747, + "mean_token_accuracy": 0.4034482717514038, + "step": 94535 + }, + { + "epoch": 0.0952216920936988, + "grad_norm": 11.776635055997412, + "learning_rate": 4.9748885564931775e-05, + "loss": 2.3422, + "mean_token_accuracy": 0.47586206197738645, + "step": 94540 + }, + { + "epoch": 0.09522672814680297, + "grad_norm": 12.935806101970833, + "learning_rate": 4.974882973488925e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.400967937707901, + "step": 94545 + }, + { + "epoch": 0.09523176419990713, + "grad_norm": 13.063717830711987, + "learning_rate": 4.974877389867592e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.3965517282485962, + "step": 94550 + }, + { + "epoch": 0.0952368002530113, + "grad_norm": 11.44425006093507, + "learning_rate": 4.9748718056291814e-05, + "loss": 2.1588, + "mean_token_accuracy": 0.4744101643562317, + "step": 94555 + }, + { + "epoch": 0.09524183630611548, + "grad_norm": 10.950576231870489, + "learning_rate": 4.974866220773695e-05, + "loss": 2.3105, + "mean_token_accuracy": 0.441379314661026, + "step": 94560 + }, + { + "epoch": 0.09524687235921965, + "grad_norm": 10.952308859340937, + "learning_rate": 4.974860635301133e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.4310344815254211, + "step": 94565 + }, + { + "epoch": 0.09525190841232382, + "grad_norm": 13.542165867321689, + "learning_rate": 4.974855049211497e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.3931034475564957, + "step": 94570 + }, + { + "epoch": 0.095256944465428, + "grad_norm": 11.361301090665656, + "learning_rate": 4.9748494625047896e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.4655172348022461, + "step": 94575 + }, + { + "epoch": 0.09526198051853217, + "grad_norm": 11.30719623355096, + "learning_rate": 4.974843875181013e-05, + "loss": 2.6614, + "mean_token_accuracy": 0.3206896513700485, + "step": 94580 + }, + { + "epoch": 0.09526701657163635, + "grad_norm": 10.762342854483771, + "learning_rate": 4.974838287240167e-05, + "loss": 2.301, + "mean_token_accuracy": 0.44827585816383364, + "step": 94585 + }, + { + "epoch": 0.09527205262474052, + "grad_norm": 12.106367180240598, + "learning_rate": 4.974832698682253e-05, + "loss": 2.4093, + "mean_token_accuracy": 0.43448275327682495, + "step": 94590 + }, + { + "epoch": 0.09527708867784469, + "grad_norm": 14.755460106901154, + "learning_rate": 4.974827109507274e-05, + "loss": 2.663, + "mean_token_accuracy": 0.3620689630508423, + "step": 94595 + }, + { + "epoch": 0.09528212473094887, + "grad_norm": 11.225503012028776, + "learning_rate": 4.9748215197152306e-05, + "loss": 2.3223, + "mean_token_accuracy": 0.42413793206214906, + "step": 94600 + }, + { + "epoch": 0.09528716078405304, + "grad_norm": 9.718317812402411, + "learning_rate": 4.974815929306125e-05, + "loss": 2.5357, + "mean_token_accuracy": 0.42068964838981626, + "step": 94605 + }, + { + "epoch": 0.09529219683715721, + "grad_norm": 9.940904359790782, + "learning_rate": 4.9748103382799594e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.4137930989265442, + "step": 94610 + }, + { + "epoch": 0.09529723289026139, + "grad_norm": 12.58440386971749, + "learning_rate": 4.974804746636733e-05, + "loss": 2.1013, + "mean_token_accuracy": 0.47755595445632937, + "step": 94615 + }, + { + "epoch": 0.09530226894336555, + "grad_norm": 8.968576334231596, + "learning_rate": 4.974799154376449e-05, + "loss": 2.5682, + "mean_token_accuracy": 0.3965517163276672, + "step": 94620 + }, + { + "epoch": 0.09530730499646972, + "grad_norm": 10.656318762549384, + "learning_rate": 4.9747935614991084e-05, + "loss": 2.0898, + "mean_token_accuracy": 0.4689655125141144, + "step": 94625 + }, + { + "epoch": 0.0953123410495739, + "grad_norm": 10.163285500441466, + "learning_rate": 4.9747879680047125e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.4482758641242981, + "step": 94630 + }, + { + "epoch": 0.09531737710267807, + "grad_norm": 10.008543569360336, + "learning_rate": 4.9747823738932645e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.44827585220336913, + "step": 94635 + }, + { + "epoch": 0.09532241315578224, + "grad_norm": 12.16610033195966, + "learning_rate": 4.974776779164764e-05, + "loss": 2.6516, + "mean_token_accuracy": 0.42068964838981626, + "step": 94640 + }, + { + "epoch": 0.09532744920888642, + "grad_norm": 9.861103575187354, + "learning_rate": 4.974771183819215e-05, + "loss": 2.0716, + "mean_token_accuracy": 0.46551724076271056, + "step": 94645 + }, + { + "epoch": 0.09533248526199059, + "grad_norm": 10.858438253292226, + "learning_rate": 4.974765587856615e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.43103448748588563, + "step": 94650 + }, + { + "epoch": 0.09533752131509476, + "grad_norm": 17.1107835076412, + "learning_rate": 4.974759991276969e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.4126436859369278, + "step": 94655 + }, + { + "epoch": 0.09534255736819894, + "grad_norm": 12.10068936124448, + "learning_rate": 4.974754394080278e-05, + "loss": 2.5597, + "mean_token_accuracy": 0.4068965494632721, + "step": 94660 + }, + { + "epoch": 0.09534759342130311, + "grad_norm": 11.058706165880562, + "learning_rate": 4.974748796266543e-05, + "loss": 2.6365, + "mean_token_accuracy": 0.35172412991523744, + "step": 94665 + }, + { + "epoch": 0.09535262947440729, + "grad_norm": 14.457252502894958, + "learning_rate": 4.974743197835765e-05, + "loss": 2.802, + "mean_token_accuracy": 0.3655172407627106, + "step": 94670 + }, + { + "epoch": 0.09535766552751146, + "grad_norm": 11.224242092943397, + "learning_rate": 4.974737598787947e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.46551724076271056, + "step": 94675 + }, + { + "epoch": 0.09536270158061563, + "grad_norm": 13.668233992853766, + "learning_rate": 4.974731999123089e-05, + "loss": 2.6124, + "mean_token_accuracy": 0.37931033968925476, + "step": 94680 + }, + { + "epoch": 0.0953677376337198, + "grad_norm": 11.970030337080674, + "learning_rate": 4.974726398841194e-05, + "loss": 2.4412, + "mean_token_accuracy": 0.3931034505367279, + "step": 94685 + }, + { + "epoch": 0.09537277368682397, + "grad_norm": 13.688263451171593, + "learning_rate": 4.9747207979422614e-05, + "loss": 2.7388, + "mean_token_accuracy": 0.358620685338974, + "step": 94690 + }, + { + "epoch": 0.09537780973992814, + "grad_norm": 17.151685549202128, + "learning_rate": 4.9747151964262954e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.4068965494632721, + "step": 94695 + }, + { + "epoch": 0.09538284579303231, + "grad_norm": 12.165867633536022, + "learning_rate": 4.9747095942932964e-05, + "loss": 2.1961, + "mean_token_accuracy": 0.44827585220336913, + "step": 94700 + }, + { + "epoch": 0.09538788184613649, + "grad_norm": 11.380384055982072, + "learning_rate": 4.974703991543265e-05, + "loss": 2.6343, + "mean_token_accuracy": 0.4117362380027771, + "step": 94705 + }, + { + "epoch": 0.09539291789924066, + "grad_norm": 11.494369031151589, + "learning_rate": 4.974698388176205e-05, + "loss": 2.6934, + "mean_token_accuracy": 0.4000000059604645, + "step": 94710 + }, + { + "epoch": 0.09539795395234484, + "grad_norm": 9.68863462297922, + "learning_rate": 4.9746927841921154e-05, + "loss": 2.1479, + "mean_token_accuracy": 0.4551724135875702, + "step": 94715 + }, + { + "epoch": 0.09540299000544901, + "grad_norm": 14.898849018032697, + "learning_rate": 4.974687179591e-05, + "loss": 2.5465, + "mean_token_accuracy": 0.41893526911735535, + "step": 94720 + }, + { + "epoch": 0.09540802605855318, + "grad_norm": 11.673891039193846, + "learning_rate": 4.974681574372858e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.37931033968925476, + "step": 94725 + }, + { + "epoch": 0.09541306211165736, + "grad_norm": 10.89689195801521, + "learning_rate": 4.974675968537693e-05, + "loss": 2.5527, + "mean_token_accuracy": 0.4257713258266449, + "step": 94730 + }, + { + "epoch": 0.09541809816476153, + "grad_norm": 9.914859209642662, + "learning_rate": 4.974670362085506e-05, + "loss": 2.3718, + "mean_token_accuracy": 0.41379310488700866, + "step": 94735 + }, + { + "epoch": 0.0954231342178657, + "grad_norm": 11.860349924209084, + "learning_rate": 4.974664755016298e-05, + "loss": 2.407, + "mean_token_accuracy": 0.42413792610168455, + "step": 94740 + }, + { + "epoch": 0.09542817027096988, + "grad_norm": 9.393524062544435, + "learning_rate": 4.974659147330071e-05, + "loss": 2.198, + "mean_token_accuracy": 0.47586206793785096, + "step": 94745 + }, + { + "epoch": 0.09543320632407405, + "grad_norm": 13.743792810744072, + "learning_rate": 4.974653539026827e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.41034482717514037, + "step": 94750 + }, + { + "epoch": 0.09543824237717823, + "grad_norm": 10.654313256120993, + "learning_rate": 4.974647930106566e-05, + "loss": 2.196, + "mean_token_accuracy": 0.441379314661026, + "step": 94755 + }, + { + "epoch": 0.09544327843028239, + "grad_norm": 9.936535530794197, + "learning_rate": 4.974642320569292e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.47586206793785096, + "step": 94760 + }, + { + "epoch": 0.09544831448338656, + "grad_norm": 15.085458545417078, + "learning_rate": 4.9746367104150045e-05, + "loss": 2.8066, + "mean_token_accuracy": 0.4034482777118683, + "step": 94765 + }, + { + "epoch": 0.09545335053649073, + "grad_norm": 13.074004864517716, + "learning_rate": 4.974631099643705e-05, + "loss": 3.0965, + "mean_token_accuracy": 0.3310344785451889, + "step": 94770 + }, + { + "epoch": 0.0954583865895949, + "grad_norm": 10.188909623217869, + "learning_rate": 4.9746254882553974e-05, + "loss": 2.4005, + "mean_token_accuracy": 0.4206896543502808, + "step": 94775 + }, + { + "epoch": 0.09546342264269908, + "grad_norm": 13.386580181369357, + "learning_rate": 4.9746198762500797e-05, + "loss": 2.5279, + "mean_token_accuracy": 0.44343618154525755, + "step": 94780 + }, + { + "epoch": 0.09546845869580325, + "grad_norm": 9.288974971891502, + "learning_rate": 4.9746142636277566e-05, + "loss": 2.2543, + "mean_token_accuracy": 0.4896551728248596, + "step": 94785 + }, + { + "epoch": 0.09547349474890743, + "grad_norm": 11.635673771943251, + "learning_rate": 4.974608650388428e-05, + "loss": 2.9918, + "mean_token_accuracy": 0.38620689511299133, + "step": 94790 + }, + { + "epoch": 0.0954785308020116, + "grad_norm": 9.986568505131213, + "learning_rate": 4.9746030365320956e-05, + "loss": 2.3621, + "mean_token_accuracy": 0.3896551728248596, + "step": 94795 + }, + { + "epoch": 0.09548356685511578, + "grad_norm": 12.57825710711765, + "learning_rate": 4.9745974220587624e-05, + "loss": 2.3384, + "mean_token_accuracy": 0.4448275864124298, + "step": 94800 + }, + { + "epoch": 0.09548860290821995, + "grad_norm": 8.531957527337706, + "learning_rate": 4.974591806968428e-05, + "loss": 2.5854, + "mean_token_accuracy": 0.43103448748588563, + "step": 94805 + }, + { + "epoch": 0.09549363896132412, + "grad_norm": 9.690930384485728, + "learning_rate": 4.9745861912610944e-05, + "loss": 2.2907, + "mean_token_accuracy": 0.4931034445762634, + "step": 94810 + }, + { + "epoch": 0.0954986750144283, + "grad_norm": 8.749484874079112, + "learning_rate": 4.974580574936764e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.4, + "step": 94815 + }, + { + "epoch": 0.09550371106753247, + "grad_norm": 11.990962765794626, + "learning_rate": 4.974574957995437e-05, + "loss": 2.4787, + "mean_token_accuracy": 0.4068965494632721, + "step": 94820 + }, + { + "epoch": 0.09550874712063664, + "grad_norm": 11.722516132033057, + "learning_rate": 4.974569340437117e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.3965517282485962, + "step": 94825 + }, + { + "epoch": 0.0955137831737408, + "grad_norm": 11.305392278878944, + "learning_rate": 4.974563722261804e-05, + "loss": 2.5161, + "mean_token_accuracy": 0.44827585816383364, + "step": 94830 + }, + { + "epoch": 0.09551881922684498, + "grad_norm": 10.522259997967701, + "learning_rate": 4.9745581034695006e-05, + "loss": 2.4336, + "mean_token_accuracy": 0.41724138259887694, + "step": 94835 + }, + { + "epoch": 0.09552385527994915, + "grad_norm": 10.525775788157246, + "learning_rate": 4.974552484060207e-05, + "loss": 2.5041, + "mean_token_accuracy": 0.42413793206214906, + "step": 94840 + }, + { + "epoch": 0.09552889133305333, + "grad_norm": 10.263121549009181, + "learning_rate": 4.9745468640339255e-05, + "loss": 2.0596, + "mean_token_accuracy": 0.4413793087005615, + "step": 94845 + }, + { + "epoch": 0.0955339273861575, + "grad_norm": 11.013966811225886, + "learning_rate": 4.974541243390658e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.43103448748588563, + "step": 94850 + }, + { + "epoch": 0.09553896343926167, + "grad_norm": 12.835218383506, + "learning_rate": 4.9745356221304054e-05, + "loss": 2.2652, + "mean_token_accuracy": 0.46896551847457885, + "step": 94855 + }, + { + "epoch": 0.09554399949236585, + "grad_norm": 13.64492457682945, + "learning_rate": 4.9745300002531686e-05, + "loss": 2.6832, + "mean_token_accuracy": 0.42413792610168455, + "step": 94860 + }, + { + "epoch": 0.09554903554547002, + "grad_norm": 12.76582704381585, + "learning_rate": 4.974524377758952e-05, + "loss": 2.832, + "mean_token_accuracy": 0.4122807025909424, + "step": 94865 + }, + { + "epoch": 0.0955540715985742, + "grad_norm": 13.85082413261805, + "learning_rate": 4.974518754647754e-05, + "loss": 2.3891, + "mean_token_accuracy": 0.44137930274009707, + "step": 94870 + }, + { + "epoch": 0.09555910765167837, + "grad_norm": 14.552383073151365, + "learning_rate": 4.9745131309195775e-05, + "loss": 2.6957, + "mean_token_accuracy": 0.38965516686439516, + "step": 94875 + }, + { + "epoch": 0.09556414370478254, + "grad_norm": 10.496693327162854, + "learning_rate": 4.974507506574424e-05, + "loss": 3.1934, + "mean_token_accuracy": 0.3068965464830399, + "step": 94880 + }, + { + "epoch": 0.09556917975788672, + "grad_norm": 11.75693894470164, + "learning_rate": 4.9745018816122955e-05, + "loss": 2.4581, + "mean_token_accuracy": 0.4186932861804962, + "step": 94885 + }, + { + "epoch": 0.09557421581099089, + "grad_norm": 11.341472619917889, + "learning_rate": 4.974496256033193e-05, + "loss": 2.1995, + "mean_token_accuracy": 0.4551724076271057, + "step": 94890 + }, + { + "epoch": 0.09557925186409506, + "grad_norm": 10.700923521699153, + "learning_rate": 4.974490629837117e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.38275861740112305, + "step": 94895 + }, + { + "epoch": 0.09558428791719922, + "grad_norm": 14.67259598041589, + "learning_rate": 4.974485003024072e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.4586206912994385, + "step": 94900 + }, + { + "epoch": 0.0955893239703034, + "grad_norm": 12.320419502220172, + "learning_rate": 4.974479375594057e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.4034482717514038, + "step": 94905 + }, + { + "epoch": 0.09559436002340757, + "grad_norm": 11.817813604898392, + "learning_rate": 4.974473747547074e-05, + "loss": 2.5091, + "mean_token_accuracy": 0.4379310369491577, + "step": 94910 + }, + { + "epoch": 0.09559939607651174, + "grad_norm": 10.025453690586517, + "learning_rate": 4.974468118883126e-05, + "loss": 2.8662, + "mean_token_accuracy": 0.38965516686439516, + "step": 94915 + }, + { + "epoch": 0.09560443212961592, + "grad_norm": 8.418689104953579, + "learning_rate": 4.9744624896022125e-05, + "loss": 2.4478, + "mean_token_accuracy": 0.43793103098869324, + "step": 94920 + }, + { + "epoch": 0.09560946818272009, + "grad_norm": 9.730269597273631, + "learning_rate": 4.974456859704336e-05, + "loss": 2.5161, + "mean_token_accuracy": 0.4189352631568909, + "step": 94925 + }, + { + "epoch": 0.09561450423582427, + "grad_norm": 11.540124881513567, + "learning_rate": 4.974451229189499e-05, + "loss": 2.0733, + "mean_token_accuracy": 0.4551724135875702, + "step": 94930 + }, + { + "epoch": 0.09561954028892844, + "grad_norm": 10.780998516075206, + "learning_rate": 4.974445598057701e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.4, + "step": 94935 + }, + { + "epoch": 0.09562457634203261, + "grad_norm": 11.52226467486293, + "learning_rate": 4.974439966308946e-05, + "loss": 2.721, + "mean_token_accuracy": 0.4068965554237366, + "step": 94940 + }, + { + "epoch": 0.09562961239513679, + "grad_norm": 12.189615036989622, + "learning_rate": 4.974434333943233e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.36206896901130675, + "step": 94945 + }, + { + "epoch": 0.09563464844824096, + "grad_norm": 10.24924620940812, + "learning_rate": 4.9744287009605654e-05, + "loss": 2.6726, + "mean_token_accuracy": 0.4103448212146759, + "step": 94950 + }, + { + "epoch": 0.09563968450134513, + "grad_norm": 10.647043189335427, + "learning_rate": 4.9744230673609435e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.42068966031074523, + "step": 94955 + }, + { + "epoch": 0.09564472055444931, + "grad_norm": 11.17706644424301, + "learning_rate": 4.974417433144371e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.4482758641242981, + "step": 94960 + }, + { + "epoch": 0.09564975660755348, + "grad_norm": 7.519048895750799, + "learning_rate": 4.974411798310847e-05, + "loss": 2.1688, + "mean_token_accuracy": 0.44627949595451355, + "step": 94965 + }, + { + "epoch": 0.09565479266065764, + "grad_norm": 10.07972849325475, + "learning_rate": 4.974406162860375e-05, + "loss": 2.5268, + "mean_token_accuracy": 0.41034482717514037, + "step": 94970 + }, + { + "epoch": 0.09565982871376182, + "grad_norm": 13.248575042152243, + "learning_rate": 4.974400526792954e-05, + "loss": 2.2337, + "mean_token_accuracy": 0.4446460962295532, + "step": 94975 + }, + { + "epoch": 0.09566486476686599, + "grad_norm": 11.356533330831423, + "learning_rate": 4.974394890108588e-05, + "loss": 3.4618, + "mean_token_accuracy": 0.334482753276825, + "step": 94980 + }, + { + "epoch": 0.09566990081997016, + "grad_norm": 10.274391509196915, + "learning_rate": 4.9743892528072785e-05, + "loss": 2.2477, + "mean_token_accuracy": 0.4448275864124298, + "step": 94985 + }, + { + "epoch": 0.09567493687307434, + "grad_norm": 9.833458321377458, + "learning_rate": 4.974383614889026e-05, + "loss": 2.1255, + "mean_token_accuracy": 0.4517241358757019, + "step": 94990 + }, + { + "epoch": 0.09567997292617851, + "grad_norm": 8.917809890262825, + "learning_rate": 4.974377976353832e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.4551724135875702, + "step": 94995 + }, + { + "epoch": 0.09568500897928268, + "grad_norm": 9.232902469592608, + "learning_rate": 4.9743723372016985e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.45069569945335386, + "step": 95000 + }, + { + "epoch": 0.09569004503238686, + "grad_norm": 12.135207050337424, + "learning_rate": 4.9743666974326275e-05, + "loss": 2.7972, + "mean_token_accuracy": 0.32758620381355286, + "step": 95005 + }, + { + "epoch": 0.09569508108549103, + "grad_norm": 12.261816690951363, + "learning_rate": 4.97436105704662e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.49171202778816225, + "step": 95010 + }, + { + "epoch": 0.0957001171385952, + "grad_norm": 9.611368017874604, + "learning_rate": 4.974355416043678e-05, + "loss": 2.6051, + "mean_token_accuracy": 0.4103448152542114, + "step": 95015 + }, + { + "epoch": 0.09570515319169938, + "grad_norm": 10.038113931219529, + "learning_rate": 4.974349774423802e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.4103448331356049, + "step": 95020 + }, + { + "epoch": 0.09571018924480355, + "grad_norm": 13.454845802283526, + "learning_rate": 4.9743441321869946e-05, + "loss": 2.875, + "mean_token_accuracy": 0.3517241418361664, + "step": 95025 + }, + { + "epoch": 0.09571522529790773, + "grad_norm": 10.093895354658807, + "learning_rate": 4.974338489333256e-05, + "loss": 2.9521, + "mean_token_accuracy": 0.37241379022598264, + "step": 95030 + }, + { + "epoch": 0.0957202613510119, + "grad_norm": 15.148435349623952, + "learning_rate": 4.9743328458625906e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.3896551728248596, + "step": 95035 + }, + { + "epoch": 0.09572529740411606, + "grad_norm": 12.889347345312244, + "learning_rate": 4.974327201774997e-05, + "loss": 2.6411, + "mean_token_accuracy": 0.3775559484958649, + "step": 95040 + }, + { + "epoch": 0.09573033345722023, + "grad_norm": 9.611750448691255, + "learning_rate": 4.974321557070478e-05, + "loss": 2.4627, + "mean_token_accuracy": 0.41941922903060913, + "step": 95045 + }, + { + "epoch": 0.09573536951032441, + "grad_norm": 12.618658527608344, + "learning_rate": 4.9743159117490364e-05, + "loss": 2.4415, + "mean_token_accuracy": 0.417241370677948, + "step": 95050 + }, + { + "epoch": 0.09574040556342858, + "grad_norm": 8.913507302493644, + "learning_rate": 4.974310265810671e-05, + "loss": 2.3102, + "mean_token_accuracy": 0.42928009629249575, + "step": 95055 + }, + { + "epoch": 0.09574544161653276, + "grad_norm": 8.807254106186633, + "learning_rate": 4.974304619255385e-05, + "loss": 2.1104, + "mean_token_accuracy": 0.4862069010734558, + "step": 95060 + }, + { + "epoch": 0.09575047766963693, + "grad_norm": 10.489873744662276, + "learning_rate": 4.974298972083181e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.42068966031074523, + "step": 95065 + }, + { + "epoch": 0.0957555137227411, + "grad_norm": 10.788675303453362, + "learning_rate": 4.974293324294059e-05, + "loss": 2.6141, + "mean_token_accuracy": 0.3931034505367279, + "step": 95070 + }, + { + "epoch": 0.09576054977584528, + "grad_norm": 11.692099525996737, + "learning_rate": 4.97428767588802e-05, + "loss": 2.5831, + "mean_token_accuracy": 0.422202056646347, + "step": 95075 + }, + { + "epoch": 0.09576558582894945, + "grad_norm": 10.309317769893221, + "learning_rate": 4.974282026865067e-05, + "loss": 2.999, + "mean_token_accuracy": 0.37241379618644715, + "step": 95080 + }, + { + "epoch": 0.09577062188205362, + "grad_norm": 10.026510251368254, + "learning_rate": 4.974276377225201e-05, + "loss": 2.4884, + "mean_token_accuracy": 0.4275862157344818, + "step": 95085 + }, + { + "epoch": 0.0957756579351578, + "grad_norm": 12.316708318434136, + "learning_rate": 4.974270726968424e-05, + "loss": 2.8857, + "mean_token_accuracy": 0.3620689630508423, + "step": 95090 + }, + { + "epoch": 0.09578069398826197, + "grad_norm": 10.719079459536728, + "learning_rate": 4.9742650760947366e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.41379310488700866, + "step": 95095 + }, + { + "epoch": 0.09578573004136615, + "grad_norm": 10.91063171802848, + "learning_rate": 4.974259424604141e-05, + "loss": 2.683, + "mean_token_accuracy": 0.39655172228813174, + "step": 95100 + }, + { + "epoch": 0.09579076609447032, + "grad_norm": 10.728893168116445, + "learning_rate": 4.9742537724966394e-05, + "loss": 2.3566, + "mean_token_accuracy": 0.41724138855934145, + "step": 95105 + }, + { + "epoch": 0.09579580214757448, + "grad_norm": 10.377178786357568, + "learning_rate": 4.9742481197722326e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.4034482777118683, + "step": 95110 + }, + { + "epoch": 0.09580083820067865, + "grad_norm": 9.739944444428108, + "learning_rate": 4.9742424664309223e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.42758620381355283, + "step": 95115 + }, + { + "epoch": 0.09580587425378283, + "grad_norm": 8.591985846228438, + "learning_rate": 4.97423681247271e-05, + "loss": 2.1148, + "mean_token_accuracy": 0.4172413766384125, + "step": 95120 + }, + { + "epoch": 0.095810910306887, + "grad_norm": 15.004131485630086, + "learning_rate": 4.974231157897597e-05, + "loss": 2.2685, + "mean_token_accuracy": 0.47586206197738645, + "step": 95125 + }, + { + "epoch": 0.09581594635999117, + "grad_norm": 15.30312026025668, + "learning_rate": 4.9742255027055855e-05, + "loss": 3.0556, + "mean_token_accuracy": 0.3620689630508423, + "step": 95130 + }, + { + "epoch": 0.09582098241309535, + "grad_norm": 10.349299386271236, + "learning_rate": 4.974219846896677e-05, + "loss": 2.4487, + "mean_token_accuracy": 0.4344827651977539, + "step": 95135 + }, + { + "epoch": 0.09582601846619952, + "grad_norm": 12.246292717670443, + "learning_rate": 4.974214190470872e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.4, + "step": 95140 + }, + { + "epoch": 0.0958310545193037, + "grad_norm": 12.807837955210342, + "learning_rate": 4.9742085334281734e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.4103448331356049, + "step": 95145 + }, + { + "epoch": 0.09583609057240787, + "grad_norm": 10.300060522568472, + "learning_rate": 4.974202875768582e-05, + "loss": 2.5302, + "mean_token_accuracy": 0.41034482717514037, + "step": 95150 + }, + { + "epoch": 0.09584112662551204, + "grad_norm": 8.781037580734248, + "learning_rate": 4.9741972174921e-05, + "loss": 2.2635, + "mean_token_accuracy": 0.45517241954803467, + "step": 95155 + }, + { + "epoch": 0.09584616267861622, + "grad_norm": 11.171426014680025, + "learning_rate": 4.974191558598729e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.4413793087005615, + "step": 95160 + }, + { + "epoch": 0.09585119873172039, + "grad_norm": 12.49779640966048, + "learning_rate": 4.97418589908847e-05, + "loss": 2.5187, + "mean_token_accuracy": 0.42758620977401735, + "step": 95165 + }, + { + "epoch": 0.09585623478482456, + "grad_norm": 10.422837510643445, + "learning_rate": 4.974180238961324e-05, + "loss": 2.8408, + "mean_token_accuracy": 0.38451300859451293, + "step": 95170 + }, + { + "epoch": 0.09586127083792874, + "grad_norm": 11.69109951096343, + "learning_rate": 4.9741745782172946e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.3827586233615875, + "step": 95175 + }, + { + "epoch": 0.0958663068910329, + "grad_norm": 10.797278782926025, + "learning_rate": 4.97416891685638e-05, + "loss": 2.3604, + "mean_token_accuracy": 0.4137930989265442, + "step": 95180 + }, + { + "epoch": 0.09587134294413707, + "grad_norm": 11.172458391187575, + "learning_rate": 4.9741632548785856e-05, + "loss": 2.0963, + "mean_token_accuracy": 0.46896551847457885, + "step": 95185 + }, + { + "epoch": 0.09587637899724125, + "grad_norm": 11.851418916900489, + "learning_rate": 4.97415759228391e-05, + "loss": 2.2374, + "mean_token_accuracy": 0.4206896543502808, + "step": 95190 + }, + { + "epoch": 0.09588141505034542, + "grad_norm": 10.372008028498017, + "learning_rate": 4.974151929072358e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4448275864124298, + "step": 95195 + }, + { + "epoch": 0.0958864511034496, + "grad_norm": 11.24761765345112, + "learning_rate": 4.974146265243927e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.4310344934463501, + "step": 95200 + }, + { + "epoch": 0.09589148715655377, + "grad_norm": 10.434198333460122, + "learning_rate": 4.9741406007986214e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.44827585220336913, + "step": 95205 + }, + { + "epoch": 0.09589652320965794, + "grad_norm": 9.156831173850202, + "learning_rate": 4.9741349357364425e-05, + "loss": 2.2876, + "mean_token_accuracy": 0.4482758641242981, + "step": 95210 + }, + { + "epoch": 0.09590155926276211, + "grad_norm": 14.967983911508021, + "learning_rate": 4.974129270057391e-05, + "loss": 2.6626, + "mean_token_accuracy": 0.42758620977401735, + "step": 95215 + }, + { + "epoch": 0.09590659531586629, + "grad_norm": 9.19180242349692, + "learning_rate": 4.9741236037614696e-05, + "loss": 2.5643, + "mean_token_accuracy": 0.4620689570903778, + "step": 95220 + }, + { + "epoch": 0.09591163136897046, + "grad_norm": 9.82127039237697, + "learning_rate": 4.974117936848679e-05, + "loss": 2.3989, + "mean_token_accuracy": 0.4675136089324951, + "step": 95225 + }, + { + "epoch": 0.09591666742207464, + "grad_norm": 11.429674489842016, + "learning_rate": 4.9741122693190206e-05, + "loss": 2.289, + "mean_token_accuracy": 0.4344827651977539, + "step": 95230 + }, + { + "epoch": 0.09592170347517881, + "grad_norm": 12.757624100458566, + "learning_rate": 4.9741066011724966e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.3896551728248596, + "step": 95235 + }, + { + "epoch": 0.09592673952828298, + "grad_norm": 9.974750157081148, + "learning_rate": 4.9741009324091084e-05, + "loss": 2.9408, + "mean_token_accuracy": 0.37931033968925476, + "step": 95240 + }, + { + "epoch": 0.09593177558138716, + "grad_norm": 12.369568369633587, + "learning_rate": 4.974095263028857e-05, + "loss": 2.1176, + "mean_token_accuracy": 0.48148819208145144, + "step": 95245 + }, + { + "epoch": 0.09593681163449132, + "grad_norm": 10.921786368388444, + "learning_rate": 4.974089593031746e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.41034482717514037, + "step": 95250 + }, + { + "epoch": 0.09594184768759549, + "grad_norm": 8.733011968045858, + "learning_rate": 4.9740839224177735e-05, + "loss": 2.9242, + "mean_token_accuracy": 0.39655172228813174, + "step": 95255 + }, + { + "epoch": 0.09594688374069966, + "grad_norm": 11.757400447120189, + "learning_rate": 4.974078251186944e-05, + "loss": 2.3107, + "mean_token_accuracy": 0.47241379618644713, + "step": 95260 + }, + { + "epoch": 0.09595191979380384, + "grad_norm": 10.710397849462495, + "learning_rate": 4.9740725793392585e-05, + "loss": 2.5068, + "mean_token_accuracy": 0.4034482777118683, + "step": 95265 + }, + { + "epoch": 0.09595695584690801, + "grad_norm": 10.007541540913595, + "learning_rate": 4.974066906874718e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4068965494632721, + "step": 95270 + }, + { + "epoch": 0.09596199190001219, + "grad_norm": 10.735108749819753, + "learning_rate": 4.9740612337933236e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.42413793206214906, + "step": 95275 + }, + { + "epoch": 0.09596702795311636, + "grad_norm": 11.52115412808419, + "learning_rate": 4.9740555600950773e-05, + "loss": 2.13, + "mean_token_accuracy": 0.4620689570903778, + "step": 95280 + }, + { + "epoch": 0.09597206400622053, + "grad_norm": 11.671862935639766, + "learning_rate": 4.974049885779982e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4206896543502808, + "step": 95285 + }, + { + "epoch": 0.09597710005932471, + "grad_norm": 9.441640752481726, + "learning_rate": 4.9740442108480374e-05, + "loss": 2.2665, + "mean_token_accuracy": 0.4344827592372894, + "step": 95290 + }, + { + "epoch": 0.09598213611242888, + "grad_norm": 11.249877090710932, + "learning_rate": 4.9740385352992464e-05, + "loss": 2.6736, + "mean_token_accuracy": 0.43448275327682495, + "step": 95295 + }, + { + "epoch": 0.09598717216553306, + "grad_norm": 22.27137243971899, + "learning_rate": 4.9740328591336095e-05, + "loss": 2.9338, + "mean_token_accuracy": 0.388021782040596, + "step": 95300 + }, + { + "epoch": 0.09599220821863723, + "grad_norm": 11.408239725400664, + "learning_rate": 4.9740271823511286e-05, + "loss": 2.541, + "mean_token_accuracy": 0.4, + "step": 95305 + }, + { + "epoch": 0.0959972442717414, + "grad_norm": 12.492169711289046, + "learning_rate": 4.9740215049518064e-05, + "loss": 2.5706, + "mean_token_accuracy": 0.3862069010734558, + "step": 95310 + }, + { + "epoch": 0.09600228032484556, + "grad_norm": 14.138862123100731, + "learning_rate": 4.974015826935643e-05, + "loss": 2.8088, + "mean_token_accuracy": 0.43297035098075864, + "step": 95315 + }, + { + "epoch": 0.09600731637794974, + "grad_norm": 13.239929582280176, + "learning_rate": 4.9740101483026404e-05, + "loss": 2.5245, + "mean_token_accuracy": 0.4186932921409607, + "step": 95320 + }, + { + "epoch": 0.09601235243105391, + "grad_norm": 12.575069965290933, + "learning_rate": 4.9740044690528e-05, + "loss": 2.9031, + "mean_token_accuracy": 0.3827586233615875, + "step": 95325 + }, + { + "epoch": 0.09601738848415808, + "grad_norm": 14.018476110013284, + "learning_rate": 4.973998789186124e-05, + "loss": 2.9294, + "mean_token_accuracy": 0.34482758939266206, + "step": 95330 + }, + { + "epoch": 0.09602242453726226, + "grad_norm": 9.18123459734771, + "learning_rate": 4.973993108702614e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.4554144024848938, + "step": 95335 + }, + { + "epoch": 0.09602746059036643, + "grad_norm": 12.76576450220605, + "learning_rate": 4.9739874276022705e-05, + "loss": 3.0349, + "mean_token_accuracy": 0.4034482717514038, + "step": 95340 + }, + { + "epoch": 0.0960324966434706, + "grad_norm": 10.05518841573313, + "learning_rate": 4.973981745885096e-05, + "loss": 2.7251, + "mean_token_accuracy": 0.36896551251411436, + "step": 95345 + }, + { + "epoch": 0.09603753269657478, + "grad_norm": 8.828798661659626, + "learning_rate": 4.973976063551092e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.4482758641242981, + "step": 95350 + }, + { + "epoch": 0.09604256874967895, + "grad_norm": 12.4454148324595, + "learning_rate": 4.97397038060026e-05, + "loss": 2.3836, + "mean_token_accuracy": 0.4103448331356049, + "step": 95355 + }, + { + "epoch": 0.09604760480278313, + "grad_norm": 11.204530418282397, + "learning_rate": 4.973964697032602e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.39310343861579894, + "step": 95360 + }, + { + "epoch": 0.0960526408558873, + "grad_norm": 12.60579714319122, + "learning_rate": 4.973959012848118e-05, + "loss": 2.575, + "mean_token_accuracy": 0.37586206793785093, + "step": 95365 + }, + { + "epoch": 0.09605767690899147, + "grad_norm": 12.338052911550129, + "learning_rate": 4.973953328046812e-05, + "loss": 2.6658, + "mean_token_accuracy": 0.45716878175735476, + "step": 95370 + }, + { + "epoch": 0.09606271296209565, + "grad_norm": 10.431963632177885, + "learning_rate": 4.9739476426286835e-05, + "loss": 2.4253, + "mean_token_accuracy": 0.41034482717514037, + "step": 95375 + }, + { + "epoch": 0.09606774901519982, + "grad_norm": 11.214174061238747, + "learning_rate": 4.9739419565937344e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.39655172228813174, + "step": 95380 + }, + { + "epoch": 0.09607278506830398, + "grad_norm": 11.798086063985945, + "learning_rate": 4.973936269941967e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.4344827592372894, + "step": 95385 + }, + { + "epoch": 0.09607782112140816, + "grad_norm": 10.639804585164638, + "learning_rate": 4.973930582673383e-05, + "loss": 2.5283, + "mean_token_accuracy": 0.41724138259887694, + "step": 95390 + }, + { + "epoch": 0.09608285717451233, + "grad_norm": 14.569916951859103, + "learning_rate": 4.973924894787983e-05, + "loss": 2.5843, + "mean_token_accuracy": 0.3724137842655182, + "step": 95395 + }, + { + "epoch": 0.0960878932276165, + "grad_norm": 9.548282917476612, + "learning_rate": 4.9739192062857696e-05, + "loss": 2.8062, + "mean_token_accuracy": 0.41379310488700866, + "step": 95400 + }, + { + "epoch": 0.09609292928072068, + "grad_norm": 10.008246028920373, + "learning_rate": 4.973913517166744e-05, + "loss": 3.067, + "mean_token_accuracy": 0.3275862097740173, + "step": 95405 + }, + { + "epoch": 0.09609796533382485, + "grad_norm": 11.757367358864384, + "learning_rate": 4.973907827430908e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.43103448748588563, + "step": 95410 + }, + { + "epoch": 0.09610300138692902, + "grad_norm": 10.19495218585726, + "learning_rate": 4.973902137078261e-05, + "loss": 2.4408, + "mean_token_accuracy": 0.45722927451133727, + "step": 95415 + }, + { + "epoch": 0.0961080374400332, + "grad_norm": 11.777703318046838, + "learning_rate": 4.9738964461088085e-05, + "loss": 2.6761, + "mean_token_accuracy": 0.4103448212146759, + "step": 95420 + }, + { + "epoch": 0.09611307349313737, + "grad_norm": 10.078515922786325, + "learning_rate": 4.973890754522549e-05, + "loss": 2.437, + "mean_token_accuracy": 0.42413793206214906, + "step": 95425 + }, + { + "epoch": 0.09611810954624155, + "grad_norm": 10.97734125006126, + "learning_rate": 4.973885062319486e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.42758620381355283, + "step": 95430 + }, + { + "epoch": 0.09612314559934572, + "grad_norm": 11.485868225425484, + "learning_rate": 4.973879369499619e-05, + "loss": 2.1637, + "mean_token_accuracy": 0.43448275327682495, + "step": 95435 + }, + { + "epoch": 0.09612818165244989, + "grad_norm": 9.698961065775471, + "learning_rate": 4.973873676062951e-05, + "loss": 2.5072, + "mean_token_accuracy": 0.39655172228813174, + "step": 95440 + }, + { + "epoch": 0.09613321770555407, + "grad_norm": 13.108996826312467, + "learning_rate": 4.973867982009484e-05, + "loss": 2.6176, + "mean_token_accuracy": 0.45172414779663084, + "step": 95445 + }, + { + "epoch": 0.09613825375865824, + "grad_norm": 10.183431395851537, + "learning_rate": 4.9738622873392184e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.4103448212146759, + "step": 95450 + }, + { + "epoch": 0.0961432898117624, + "grad_norm": 10.58621637377267, + "learning_rate": 4.973856592052156e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4310344815254211, + "step": 95455 + }, + { + "epoch": 0.09614832586486657, + "grad_norm": 12.104231081275431, + "learning_rate": 4.9738508961482996e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.3655172407627106, + "step": 95460 + }, + { + "epoch": 0.09615336191797075, + "grad_norm": 10.152698554513846, + "learning_rate": 4.973845199627649e-05, + "loss": 2.6048, + "mean_token_accuracy": 0.39655172228813174, + "step": 95465 + }, + { + "epoch": 0.09615839797107492, + "grad_norm": 10.672249214311165, + "learning_rate": 4.9738395024902075e-05, + "loss": 2.6455, + "mean_token_accuracy": 0.40344826579093934, + "step": 95470 + }, + { + "epoch": 0.0961634340241791, + "grad_norm": 12.720320339710737, + "learning_rate": 4.973833804735975e-05, + "loss": 2.4842, + "mean_token_accuracy": 0.4620689570903778, + "step": 95475 + }, + { + "epoch": 0.09616847007728327, + "grad_norm": 10.368635716914332, + "learning_rate": 4.973828106364955e-05, + "loss": 2.4516, + "mean_token_accuracy": 0.44482759237289426, + "step": 95480 + }, + { + "epoch": 0.09617350613038744, + "grad_norm": 11.884396607829155, + "learning_rate": 4.9738224073771464e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.4206896543502808, + "step": 95485 + }, + { + "epoch": 0.09617854218349162, + "grad_norm": 9.162142600567304, + "learning_rate": 4.973816707772553e-05, + "loss": 2.3943, + "mean_token_accuracy": 0.4068965494632721, + "step": 95490 + }, + { + "epoch": 0.09618357823659579, + "grad_norm": 14.383877509317129, + "learning_rate": 4.9738110075511766e-05, + "loss": 2.9026, + "mean_token_accuracy": 0.37586206793785093, + "step": 95495 + }, + { + "epoch": 0.09618861428969996, + "grad_norm": 11.902541659823498, + "learning_rate": 4.973805306713017e-05, + "loss": 2.1208, + "mean_token_accuracy": 0.4620689511299133, + "step": 95500 + }, + { + "epoch": 0.09619365034280414, + "grad_norm": 11.93334597153728, + "learning_rate": 4.973799605258077e-05, + "loss": 2.8371, + "mean_token_accuracy": 0.3551724135875702, + "step": 95505 + }, + { + "epoch": 0.09619868639590831, + "grad_norm": 10.739982971163023, + "learning_rate": 4.9737939031863574e-05, + "loss": 2.2036, + "mean_token_accuracy": 0.46896552443504336, + "step": 95510 + }, + { + "epoch": 0.09620372244901249, + "grad_norm": 11.949905095042597, + "learning_rate": 4.973788200497861e-05, + "loss": 2.1036, + "mean_token_accuracy": 0.47586206793785096, + "step": 95515 + }, + { + "epoch": 0.09620875850211666, + "grad_norm": 11.64971327379536, + "learning_rate": 4.9737824971925884e-05, + "loss": 2.4053, + "mean_token_accuracy": 0.4137930989265442, + "step": 95520 + }, + { + "epoch": 0.09621379455522082, + "grad_norm": 13.77358130243912, + "learning_rate": 4.973776793270542e-05, + "loss": 3.0702, + "mean_token_accuracy": 0.3655172437429428, + "step": 95525 + }, + { + "epoch": 0.09621883060832499, + "grad_norm": 11.467515980572148, + "learning_rate": 4.9737710887317215e-05, + "loss": 2.6149, + "mean_token_accuracy": 0.42413792610168455, + "step": 95530 + }, + { + "epoch": 0.09622386666142917, + "grad_norm": 9.394389486470782, + "learning_rate": 4.9737653835761305e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.4413793087005615, + "step": 95535 + }, + { + "epoch": 0.09622890271453334, + "grad_norm": 10.547651787129592, + "learning_rate": 4.97375967780377e-05, + "loss": 2.5225, + "mean_token_accuracy": 0.3843315154314041, + "step": 95540 + }, + { + "epoch": 0.09623393876763751, + "grad_norm": 10.731070862890196, + "learning_rate": 4.9737539714146417e-05, + "loss": 2.1733, + "mean_token_accuracy": 0.4448275864124298, + "step": 95545 + }, + { + "epoch": 0.09623897482074169, + "grad_norm": 14.664328380143658, + "learning_rate": 4.9737482644087465e-05, + "loss": 2.5798, + "mean_token_accuracy": 0.4310344815254211, + "step": 95550 + }, + { + "epoch": 0.09624401087384586, + "grad_norm": 8.598998287700047, + "learning_rate": 4.973742556786086e-05, + "loss": 2.276, + "mean_token_accuracy": 0.4413793087005615, + "step": 95555 + }, + { + "epoch": 0.09624904692695004, + "grad_norm": 11.565497628327153, + "learning_rate": 4.973736848546663e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.37241379022598264, + "step": 95560 + }, + { + "epoch": 0.09625408298005421, + "grad_norm": 11.571824559239893, + "learning_rate": 4.973731139690478e-05, + "loss": 2.4504, + "mean_token_accuracy": 0.4413793087005615, + "step": 95565 + }, + { + "epoch": 0.09625911903315838, + "grad_norm": 10.717194962127268, + "learning_rate": 4.973725430217532e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.44827585816383364, + "step": 95570 + }, + { + "epoch": 0.09626415508626256, + "grad_norm": 10.89973909651337, + "learning_rate": 4.973719720127829e-05, + "loss": 2.6385, + "mean_token_accuracy": 0.41379310488700866, + "step": 95575 + }, + { + "epoch": 0.09626919113936673, + "grad_norm": 11.39647182504023, + "learning_rate": 4.973714009421368e-05, + "loss": 2.4942, + "mean_token_accuracy": 0.35862068831920624, + "step": 95580 + }, + { + "epoch": 0.0962742271924709, + "grad_norm": 14.428872892700461, + "learning_rate": 4.9737082980981524e-05, + "loss": 2.7628, + "mean_token_accuracy": 0.37931033968925476, + "step": 95585 + }, + { + "epoch": 0.09627926324557508, + "grad_norm": 11.699979598475725, + "learning_rate": 4.9737025861581826e-05, + "loss": 2.8966, + "mean_token_accuracy": 0.3689655244350433, + "step": 95590 + }, + { + "epoch": 0.09628429929867924, + "grad_norm": 9.642883272713432, + "learning_rate": 4.97369687360146e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.4379310369491577, + "step": 95595 + }, + { + "epoch": 0.09628933535178341, + "grad_norm": 14.97896320994989, + "learning_rate": 4.9736911604279876e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.4228675127029419, + "step": 95600 + }, + { + "epoch": 0.09629437140488759, + "grad_norm": 12.513926607230237, + "learning_rate": 4.973685446637766e-05, + "loss": 2.6394, + "mean_token_accuracy": 0.4, + "step": 95605 + }, + { + "epoch": 0.09629940745799176, + "grad_norm": 10.965204580777522, + "learning_rate": 4.973679732230797e-05, + "loss": 2.0242, + "mean_token_accuracy": 0.48620688915252686, + "step": 95610 + }, + { + "epoch": 0.09630444351109593, + "grad_norm": 10.843799467313156, + "learning_rate": 4.973674017207082e-05, + "loss": 2.2967, + "mean_token_accuracy": 0.4344827592372894, + "step": 95615 + }, + { + "epoch": 0.0963094795642001, + "grad_norm": 15.558143047880607, + "learning_rate": 4.973668301566622e-05, + "loss": 2.6606, + "mean_token_accuracy": 0.4206896543502808, + "step": 95620 + }, + { + "epoch": 0.09631451561730428, + "grad_norm": 12.068201771821075, + "learning_rate": 4.9736625853094196e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.44827585816383364, + "step": 95625 + }, + { + "epoch": 0.09631955167040845, + "grad_norm": 12.988432751145448, + "learning_rate": 4.9736568684354774e-05, + "loss": 2.7037, + "mean_token_accuracy": 0.43986691236495973, + "step": 95630 + }, + { + "epoch": 0.09632458772351263, + "grad_norm": 11.380903415375702, + "learning_rate": 4.973651150944795e-05, + "loss": 2.424, + "mean_token_accuracy": 0.44289171099662783, + "step": 95635 + }, + { + "epoch": 0.0963296237766168, + "grad_norm": 9.907444102014804, + "learning_rate": 4.973645432837374e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.417241370677948, + "step": 95640 + }, + { + "epoch": 0.09633465982972098, + "grad_norm": 10.458270135841472, + "learning_rate": 4.9736397141132174e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.41379310488700866, + "step": 95645 + }, + { + "epoch": 0.09633969588282515, + "grad_norm": 14.72088143261794, + "learning_rate": 4.973633994772326e-05, + "loss": 2.4482, + "mean_token_accuracy": 0.44482759237289426, + "step": 95650 + }, + { + "epoch": 0.09634473193592932, + "grad_norm": 11.508555078679926, + "learning_rate": 4.9736282748147005e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.3896551728248596, + "step": 95655 + }, + { + "epoch": 0.0963497679890335, + "grad_norm": 9.201790434392777, + "learning_rate": 4.9736225542403445e-05, + "loss": 2.1809, + "mean_token_accuracy": 0.5103448271751404, + "step": 95660 + }, + { + "epoch": 0.09635480404213766, + "grad_norm": 9.31917742079431, + "learning_rate": 4.973616833049258e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5011083722114563, + "step": 95665 + }, + { + "epoch": 0.09635984009524183, + "grad_norm": 9.183284527513235, + "learning_rate": 4.973611111241442e-05, + "loss": 2.249, + "mean_token_accuracy": 0.4965517222881317, + "step": 95670 + }, + { + "epoch": 0.096364876148346, + "grad_norm": 10.232376520690101, + "learning_rate": 4.973605388816901e-05, + "loss": 2.9215, + "mean_token_accuracy": 0.3758620709180832, + "step": 95675 + }, + { + "epoch": 0.09636991220145018, + "grad_norm": 10.986216334999819, + "learning_rate": 4.973599665775634e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.43793103098869324, + "step": 95680 + }, + { + "epoch": 0.09637494825455435, + "grad_norm": 11.558583340787235, + "learning_rate": 4.973593942117644e-05, + "loss": 2.724, + "mean_token_accuracy": 0.3827586114406586, + "step": 95685 + }, + { + "epoch": 0.09637998430765853, + "grad_norm": 12.18118432090526, + "learning_rate": 4.9735882178429296e-05, + "loss": 2.1419, + "mean_token_accuracy": 0.48620688915252686, + "step": 95690 + }, + { + "epoch": 0.0963850203607627, + "grad_norm": 9.48633808403315, + "learning_rate": 4.9735824929514974e-05, + "loss": 2.3335, + "mean_token_accuracy": 0.4419950693845749, + "step": 95695 + }, + { + "epoch": 0.09639005641386687, + "grad_norm": 10.16351059166901, + "learning_rate": 4.9735767674433446e-05, + "loss": 2.5048, + "mean_token_accuracy": 0.3896551728248596, + "step": 95700 + }, + { + "epoch": 0.09639509246697105, + "grad_norm": 11.45007847585355, + "learning_rate": 4.973571041318476e-05, + "loss": 2.9145, + "mean_token_accuracy": 0.38275861740112305, + "step": 95705 + }, + { + "epoch": 0.09640012852007522, + "grad_norm": 9.794638265298522, + "learning_rate": 4.9735653145768905e-05, + "loss": 2.6946, + "mean_token_accuracy": 0.4188747763633728, + "step": 95710 + }, + { + "epoch": 0.0964051645731794, + "grad_norm": 10.383656504047751, + "learning_rate": 4.973559587218591e-05, + "loss": 2.1924, + "mean_token_accuracy": 0.42758620977401735, + "step": 95715 + }, + { + "epoch": 0.09641020062628357, + "grad_norm": 9.797923282699946, + "learning_rate": 4.9735538592435784e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.4517241358757019, + "step": 95720 + }, + { + "epoch": 0.09641523667938774, + "grad_norm": 10.1815607039788, + "learning_rate": 4.9735481306518564e-05, + "loss": 2.206, + "mean_token_accuracy": 0.4931034445762634, + "step": 95725 + }, + { + "epoch": 0.09642027273249192, + "grad_norm": 9.209017687332727, + "learning_rate": 4.973542401443424e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.45862067937850953, + "step": 95730 + }, + { + "epoch": 0.09642530878559608, + "grad_norm": 14.296437625385964, + "learning_rate": 4.973536671618285e-05, + "loss": 2.7697, + "mean_token_accuracy": 0.41379310488700866, + "step": 95735 + }, + { + "epoch": 0.09643034483870025, + "grad_norm": 9.98482931261745, + "learning_rate": 4.973530941176438e-05, + "loss": 2.1104, + "mean_token_accuracy": 0.46400484442710876, + "step": 95740 + }, + { + "epoch": 0.09643538089180442, + "grad_norm": 10.12526689256265, + "learning_rate": 4.973525210117887e-05, + "loss": 2.2251, + "mean_token_accuracy": 0.45862069725990295, + "step": 95745 + }, + { + "epoch": 0.0964404169449086, + "grad_norm": 9.129096993210004, + "learning_rate": 4.973519478442633e-05, + "loss": 2.2087, + "mean_token_accuracy": 0.4413793087005615, + "step": 95750 + }, + { + "epoch": 0.09644545299801277, + "grad_norm": 9.65077524578711, + "learning_rate": 4.973513746150678e-05, + "loss": 2.2998, + "mean_token_accuracy": 0.4517241448163986, + "step": 95755 + }, + { + "epoch": 0.09645048905111694, + "grad_norm": 8.982296758206601, + "learning_rate": 4.9735080132420235e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.45662431716918944, + "step": 95760 + }, + { + "epoch": 0.09645552510422112, + "grad_norm": 13.448091338844382, + "learning_rate": 4.97350227971667e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.413793095946312, + "step": 95765 + }, + { + "epoch": 0.09646056115732529, + "grad_norm": 11.772039005188871, + "learning_rate": 4.9734965455746194e-05, + "loss": 2.707, + "mean_token_accuracy": 0.4137931078672409, + "step": 95770 + }, + { + "epoch": 0.09646559721042947, + "grad_norm": 11.879850749863365, + "learning_rate": 4.9734908108158754e-05, + "loss": 2.6102, + "mean_token_accuracy": 0.42915910482406616, + "step": 95775 + }, + { + "epoch": 0.09647063326353364, + "grad_norm": 11.657083250369277, + "learning_rate": 4.973485075440436e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.42413793206214906, + "step": 95780 + }, + { + "epoch": 0.09647566931663781, + "grad_norm": 14.492282877325094, + "learning_rate": 4.973479339448306e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.3758620619773865, + "step": 95785 + }, + { + "epoch": 0.09648070536974199, + "grad_norm": 13.781841487340314, + "learning_rate": 4.973473602839486e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.44827585816383364, + "step": 95790 + }, + { + "epoch": 0.09648574142284616, + "grad_norm": 14.441629090690327, + "learning_rate": 4.9734678656139764e-05, + "loss": 2.7771, + "mean_token_accuracy": 0.39655172228813174, + "step": 95795 + }, + { + "epoch": 0.09649077747595033, + "grad_norm": 11.487883457846998, + "learning_rate": 4.9734621277717804e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.3741681814193726, + "step": 95800 + }, + { + "epoch": 0.0964958135290545, + "grad_norm": 11.14689210303026, + "learning_rate": 4.973456389312898e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.43793103098869324, + "step": 95805 + }, + { + "epoch": 0.09650084958215867, + "grad_norm": 11.58444777450018, + "learning_rate": 4.973450650237332e-05, + "loss": 2.0923, + "mean_token_accuracy": 0.46896552443504336, + "step": 95810 + }, + { + "epoch": 0.09650588563526284, + "grad_norm": 11.424052190992473, + "learning_rate": 4.9734449105450835e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.4, + "step": 95815 + }, + { + "epoch": 0.09651092168836702, + "grad_norm": 18.93291269783059, + "learning_rate": 4.973439170236155e-05, + "loss": 2.5933, + "mean_token_accuracy": 0.4448275864124298, + "step": 95820 + }, + { + "epoch": 0.09651595774147119, + "grad_norm": 10.57508454012662, + "learning_rate": 4.973433429310547e-05, + "loss": 2.5739, + "mean_token_accuracy": 0.4034482717514038, + "step": 95825 + }, + { + "epoch": 0.09652099379457536, + "grad_norm": 11.219250899516567, + "learning_rate": 4.9734276877682614e-05, + "loss": 2.713, + "mean_token_accuracy": 0.4275861978530884, + "step": 95830 + }, + { + "epoch": 0.09652602984767954, + "grad_norm": 11.672250145695347, + "learning_rate": 4.9734219456092996e-05, + "loss": 2.0933, + "mean_token_accuracy": 0.5143375635147095, + "step": 95835 + }, + { + "epoch": 0.09653106590078371, + "grad_norm": 11.980270805883723, + "learning_rate": 4.973416202833663e-05, + "loss": 2.6869, + "mean_token_accuracy": 0.3862069010734558, + "step": 95840 + }, + { + "epoch": 0.09653610195388788, + "grad_norm": 11.282549418647301, + "learning_rate": 4.973410459441355e-05, + "loss": 2.9144, + "mean_token_accuracy": 0.38275861740112305, + "step": 95845 + }, + { + "epoch": 0.09654113800699206, + "grad_norm": 11.047125343444737, + "learning_rate": 4.973404715432375e-05, + "loss": 2.4748, + "mean_token_accuracy": 0.3999999940395355, + "step": 95850 + }, + { + "epoch": 0.09654617406009623, + "grad_norm": 9.04157310466515, + "learning_rate": 4.973398970806725e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.4413793087005615, + "step": 95855 + }, + { + "epoch": 0.0965512101132004, + "grad_norm": 11.034733325377017, + "learning_rate": 4.973393225564408e-05, + "loss": 2.508, + "mean_token_accuracy": 0.4068965494632721, + "step": 95860 + }, + { + "epoch": 0.09655624616630458, + "grad_norm": 10.233317929177732, + "learning_rate": 4.973387479705423e-05, + "loss": 2.5123, + "mean_token_accuracy": 0.37241379618644715, + "step": 95865 + }, + { + "epoch": 0.09656128221940875, + "grad_norm": 9.961053688192642, + "learning_rate": 4.9733817332297755e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.4068965554237366, + "step": 95870 + }, + { + "epoch": 0.09656631827251291, + "grad_norm": 11.024613728240325, + "learning_rate": 4.9733759861374634e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.39655172228813174, + "step": 95875 + }, + { + "epoch": 0.09657135432561709, + "grad_norm": 11.254095753287107, + "learning_rate": 4.9733702384284896e-05, + "loss": 2.3619, + "mean_token_accuracy": 0.43448275327682495, + "step": 95880 + }, + { + "epoch": 0.09657639037872126, + "grad_norm": 11.915891445690459, + "learning_rate": 4.973364490102856e-05, + "loss": 2.9767, + "mean_token_accuracy": 0.3344827637076378, + "step": 95885 + }, + { + "epoch": 0.09658142643182543, + "grad_norm": 10.806872996625806, + "learning_rate": 4.973358741160564e-05, + "loss": 2.2556, + "mean_token_accuracy": 0.458620685338974, + "step": 95890 + }, + { + "epoch": 0.09658646248492961, + "grad_norm": 17.377844458778835, + "learning_rate": 4.9733529916016144e-05, + "loss": 2.7035, + "mean_token_accuracy": 0.38275861740112305, + "step": 95895 + }, + { + "epoch": 0.09659149853803378, + "grad_norm": 10.607088381926005, + "learning_rate": 4.973347241426011e-05, + "loss": 2.3611, + "mean_token_accuracy": 0.41724138259887694, + "step": 95900 + }, + { + "epoch": 0.09659653459113796, + "grad_norm": 11.18473749044348, + "learning_rate": 4.973341490633753e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.42413793206214906, + "step": 95905 + }, + { + "epoch": 0.09660157064424213, + "grad_norm": 10.419121767678288, + "learning_rate": 4.973335739224843e-05, + "loss": 2.223, + "mean_token_accuracy": 0.5, + "step": 95910 + }, + { + "epoch": 0.0966066066973463, + "grad_norm": 11.239529198177836, + "learning_rate": 4.973329987199283e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4034482777118683, + "step": 95915 + }, + { + "epoch": 0.09661164275045048, + "grad_norm": 11.441966118051846, + "learning_rate": 4.9733242345570735e-05, + "loss": 2.6674, + "mean_token_accuracy": 0.4294010818004608, + "step": 95920 + }, + { + "epoch": 0.09661667880355465, + "grad_norm": 10.23847271530617, + "learning_rate": 4.973318481298217e-05, + "loss": 2.4201, + "mean_token_accuracy": 0.4379310369491577, + "step": 95925 + }, + { + "epoch": 0.09662171485665882, + "grad_norm": 13.944979632815208, + "learning_rate": 4.973312727422715e-05, + "loss": 2.7637, + "mean_token_accuracy": 0.38965516686439516, + "step": 95930 + }, + { + "epoch": 0.096626750909763, + "grad_norm": 10.196543502345914, + "learning_rate": 4.973306972930569e-05, + "loss": 2.608, + "mean_token_accuracy": 0.42413792610168455, + "step": 95935 + }, + { + "epoch": 0.09663178696286717, + "grad_norm": 11.796788012221578, + "learning_rate": 4.9733012178217805e-05, + "loss": 2.2963, + "mean_token_accuracy": 0.4068965494632721, + "step": 95940 + }, + { + "epoch": 0.09663682301597133, + "grad_norm": 11.832794770506567, + "learning_rate": 4.97329546209635e-05, + "loss": 2.577, + "mean_token_accuracy": 0.42413792610168455, + "step": 95945 + }, + { + "epoch": 0.0966418590690755, + "grad_norm": 13.215137265567424, + "learning_rate": 4.9732897057542815e-05, + "loss": 2.8343, + "mean_token_accuracy": 0.36896551847457887, + "step": 95950 + }, + { + "epoch": 0.09664689512217968, + "grad_norm": 10.893704891276444, + "learning_rate": 4.973283948795575e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.43448275327682495, + "step": 95955 + }, + { + "epoch": 0.09665193117528385, + "grad_norm": 9.129939451688788, + "learning_rate": 4.973278191220233e-05, + "loss": 2.5442, + "mean_token_accuracy": 0.43103448748588563, + "step": 95960 + }, + { + "epoch": 0.09665696722838803, + "grad_norm": 10.722429686171576, + "learning_rate": 4.973272433028255e-05, + "loss": 2.3845, + "mean_token_accuracy": 0.4640048384666443, + "step": 95965 + }, + { + "epoch": 0.0966620032814922, + "grad_norm": 11.493399771299801, + "learning_rate": 4.973266674219645e-05, + "loss": 2.5402, + "mean_token_accuracy": 0.38965516686439516, + "step": 95970 + }, + { + "epoch": 0.09666703933459637, + "grad_norm": 13.013174249036021, + "learning_rate": 4.9732609147944045e-05, + "loss": 2.3215, + "mean_token_accuracy": 0.42068966031074523, + "step": 95975 + }, + { + "epoch": 0.09667207538770055, + "grad_norm": 9.343154297195118, + "learning_rate": 4.973255154752533e-05, + "loss": 2.5795, + "mean_token_accuracy": 0.3862069010734558, + "step": 95980 + }, + { + "epoch": 0.09667711144080472, + "grad_norm": 12.62790854592106, + "learning_rate": 4.973249394094033e-05, + "loss": 2.9361, + "mean_token_accuracy": 0.4019358724355698, + "step": 95985 + }, + { + "epoch": 0.0966821474939089, + "grad_norm": 11.411243323414508, + "learning_rate": 4.973243632818908e-05, + "loss": 2.641, + "mean_token_accuracy": 0.4379310369491577, + "step": 95990 + }, + { + "epoch": 0.09668718354701307, + "grad_norm": 11.685013754155161, + "learning_rate": 4.973237870927157e-05, + "loss": 2.0594, + "mean_token_accuracy": 0.458620685338974, + "step": 95995 + }, + { + "epoch": 0.09669221960011724, + "grad_norm": 13.533572355695014, + "learning_rate": 4.9732321084187833e-05, + "loss": 2.2359, + "mean_token_accuracy": 0.4724137902259827, + "step": 96000 + }, + { + "epoch": 0.09669725565322142, + "grad_norm": 11.062699204201715, + "learning_rate": 4.973226345293788e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.45359950661659243, + "step": 96005 + }, + { + "epoch": 0.09670229170632559, + "grad_norm": 11.373954333487799, + "learning_rate": 4.973220581552172e-05, + "loss": 2.0574, + "mean_token_accuracy": 0.5068965435028077, + "step": 96010 + }, + { + "epoch": 0.09670732775942975, + "grad_norm": 11.626099731117069, + "learning_rate": 4.9732148171939376e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4896551609039307, + "step": 96015 + }, + { + "epoch": 0.09671236381253392, + "grad_norm": 10.646406791711593, + "learning_rate": 4.973209052219086e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.3896551787853241, + "step": 96020 + }, + { + "epoch": 0.0967173998656381, + "grad_norm": 8.975575421539746, + "learning_rate": 4.973203286627619e-05, + "loss": 1.9766, + "mean_token_accuracy": 0.5000000059604645, + "step": 96025 + }, + { + "epoch": 0.09672243591874227, + "grad_norm": 9.905961868142816, + "learning_rate": 4.9731975204195394e-05, + "loss": 2.8307, + "mean_token_accuracy": 0.3724137842655182, + "step": 96030 + }, + { + "epoch": 0.09672747197184645, + "grad_norm": 11.311171945919167, + "learning_rate": 4.973191753594847e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.4, + "step": 96035 + }, + { + "epoch": 0.09673250802495062, + "grad_norm": 12.751102596909488, + "learning_rate": 4.973185986153544e-05, + "loss": 2.836, + "mean_token_accuracy": 0.3862069010734558, + "step": 96040 + }, + { + "epoch": 0.0967375440780548, + "grad_norm": 12.346532435988118, + "learning_rate": 4.973180218095632e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.482758617401123, + "step": 96045 + }, + { + "epoch": 0.09674258013115897, + "grad_norm": 11.055937058947467, + "learning_rate": 4.973174449421113e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.4219600737094879, + "step": 96050 + }, + { + "epoch": 0.09674761618426314, + "grad_norm": 8.153207989374925, + "learning_rate": 4.973168680129988e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.4084089457988739, + "step": 96055 + }, + { + "epoch": 0.09675265223736731, + "grad_norm": 11.459184625177668, + "learning_rate": 4.9731629102222595e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.47931033968925474, + "step": 96060 + }, + { + "epoch": 0.09675768829047149, + "grad_norm": 13.04516176813894, + "learning_rate": 4.973157139697928e-05, + "loss": 2.516, + "mean_token_accuracy": 0.39999999701976774, + "step": 96065 + }, + { + "epoch": 0.09676272434357566, + "grad_norm": 9.924561542112423, + "learning_rate": 4.973151368556995e-05, + "loss": 2.4021, + "mean_token_accuracy": 0.41724138259887694, + "step": 96070 + }, + { + "epoch": 0.09676776039667984, + "grad_norm": 9.565373161095515, + "learning_rate": 4.973145596799463e-05, + "loss": 1.9065, + "mean_token_accuracy": 0.5, + "step": 96075 + }, + { + "epoch": 0.09677279644978401, + "grad_norm": 12.17088369811793, + "learning_rate": 4.973139824425334e-05, + "loss": 2.6999, + "mean_token_accuracy": 0.3827586144208908, + "step": 96080 + }, + { + "epoch": 0.09677783250288817, + "grad_norm": 10.965535509259638, + "learning_rate": 4.973134051434608e-05, + "loss": 2.3523, + "mean_token_accuracy": 0.4620689630508423, + "step": 96085 + }, + { + "epoch": 0.09678286855599234, + "grad_norm": 14.448583333824653, + "learning_rate": 4.973128277827288e-05, + "loss": 2.6201, + "mean_token_accuracy": 0.34482758641242983, + "step": 96090 + }, + { + "epoch": 0.09678790460909652, + "grad_norm": 10.404628082769083, + "learning_rate": 4.973122503603375e-05, + "loss": 2.277, + "mean_token_accuracy": 0.4329098641872406, + "step": 96095 + }, + { + "epoch": 0.09679294066220069, + "grad_norm": 11.013236642376073, + "learning_rate": 4.973116728762871e-05, + "loss": 2.1249, + "mean_token_accuracy": 0.46400484442710876, + "step": 96100 + }, + { + "epoch": 0.09679797671530486, + "grad_norm": 14.88639302637622, + "learning_rate": 4.973110953305776e-05, + "loss": 2.6015, + "mean_token_accuracy": 0.3931034505367279, + "step": 96105 + }, + { + "epoch": 0.09680301276840904, + "grad_norm": 10.536896474497441, + "learning_rate": 4.973105177232094e-05, + "loss": 2.5943, + "mean_token_accuracy": 0.42068964838981626, + "step": 96110 + }, + { + "epoch": 0.09680804882151321, + "grad_norm": 10.111898946629537, + "learning_rate": 4.973099400541825e-05, + "loss": 2.2529, + "mean_token_accuracy": 0.4793103516101837, + "step": 96115 + }, + { + "epoch": 0.09681308487461739, + "grad_norm": 9.374293697206927, + "learning_rate": 4.9730936232349717e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.48620688915252686, + "step": 96120 + }, + { + "epoch": 0.09681812092772156, + "grad_norm": 11.173764937985394, + "learning_rate": 4.973087845311535e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.3655172407627106, + "step": 96125 + }, + { + "epoch": 0.09682315698082573, + "grad_norm": 11.239552971529916, + "learning_rate": 4.9730820667715154e-05, + "loss": 2.0815, + "mean_token_accuracy": 0.5039409041404724, + "step": 96130 + }, + { + "epoch": 0.09682819303392991, + "grad_norm": 16.270708762434673, + "learning_rate": 4.9730762876149174e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.39655172228813174, + "step": 96135 + }, + { + "epoch": 0.09683322908703408, + "grad_norm": 11.269419700698975, + "learning_rate": 4.97307050784174e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.46025408506393434, + "step": 96140 + }, + { + "epoch": 0.09683826514013825, + "grad_norm": 8.612185721874907, + "learning_rate": 4.973064727451985e-05, + "loss": 2.2357, + "mean_token_accuracy": 0.37241379618644715, + "step": 96145 + }, + { + "epoch": 0.09684330119324243, + "grad_norm": 12.009912764954073, + "learning_rate": 4.9730589464456555e-05, + "loss": 2.1394, + "mean_token_accuracy": 0.48275862336158754, + "step": 96150 + }, + { + "epoch": 0.09684833724634659, + "grad_norm": 12.295191839639305, + "learning_rate": 4.973053164822753e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.45045371651649474, + "step": 96155 + }, + { + "epoch": 0.09685337329945076, + "grad_norm": 14.089965048754415, + "learning_rate": 4.973047382583278e-05, + "loss": 2.8956, + "mean_token_accuracy": 0.3482758581638336, + "step": 96160 + }, + { + "epoch": 0.09685840935255494, + "grad_norm": 11.40317014489123, + "learning_rate": 4.973041599727232e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.44482758045196535, + "step": 96165 + }, + { + "epoch": 0.09686344540565911, + "grad_norm": 13.420203127753577, + "learning_rate": 4.973035816254618e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.4, + "step": 96170 + }, + { + "epoch": 0.09686848145876328, + "grad_norm": 12.319196145455095, + "learning_rate": 4.973030032165435e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.3862068891525269, + "step": 96175 + }, + { + "epoch": 0.09687351751186746, + "grad_norm": 12.732813582792284, + "learning_rate": 4.973024247459688e-05, + "loss": 2.278, + "mean_token_accuracy": 0.42413793206214906, + "step": 96180 + }, + { + "epoch": 0.09687855356497163, + "grad_norm": 13.621883033214095, + "learning_rate": 4.973018462137376e-05, + "loss": 3.0151, + "mean_token_accuracy": 0.3780399262905121, + "step": 96185 + }, + { + "epoch": 0.0968835896180758, + "grad_norm": 11.150237884305117, + "learning_rate": 4.9730126761985016e-05, + "loss": 2.4589, + "mean_token_accuracy": 0.4172413766384125, + "step": 96190 + }, + { + "epoch": 0.09688862567117998, + "grad_norm": 12.855475477307328, + "learning_rate": 4.973006889643067e-05, + "loss": 2.228, + "mean_token_accuracy": 0.4742891788482666, + "step": 96195 + }, + { + "epoch": 0.09689366172428415, + "grad_norm": 11.25511450098093, + "learning_rate": 4.9730011024710726e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.3862069010734558, + "step": 96200 + }, + { + "epoch": 0.09689869777738833, + "grad_norm": 13.083329506488003, + "learning_rate": 4.972995314682522e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.4103448331356049, + "step": 96205 + }, + { + "epoch": 0.0969037338304925, + "grad_norm": 11.601643945186618, + "learning_rate": 4.972989526277413e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.46551724076271056, + "step": 96210 + }, + { + "epoch": 0.09690876988359667, + "grad_norm": 10.942019147451784, + "learning_rate": 4.972983737255751e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.3827586233615875, + "step": 96215 + }, + { + "epoch": 0.09691380593670085, + "grad_norm": 11.612622010701468, + "learning_rate": 4.9729779476175355e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.4620689570903778, + "step": 96220 + }, + { + "epoch": 0.09691884198980501, + "grad_norm": 11.05772072843664, + "learning_rate": 4.972972157362769e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.39655172228813174, + "step": 96225 + }, + { + "epoch": 0.09692387804290918, + "grad_norm": 11.399108174838723, + "learning_rate": 4.9729663664914526e-05, + "loss": 2.5442, + "mean_token_accuracy": 0.4034482717514038, + "step": 96230 + }, + { + "epoch": 0.09692891409601335, + "grad_norm": 9.76014568351947, + "learning_rate": 4.9729605750035883e-05, + "loss": 2.531, + "mean_token_accuracy": 0.42413792610168455, + "step": 96235 + }, + { + "epoch": 0.09693395014911753, + "grad_norm": 11.222964816437702, + "learning_rate": 4.972954782899178e-05, + "loss": 2.9663, + "mean_token_accuracy": 0.36551724672317504, + "step": 96240 + }, + { + "epoch": 0.0969389862022217, + "grad_norm": 14.06239042786985, + "learning_rate": 4.972948990178223e-05, + "loss": 2.6517, + "mean_token_accuracy": 0.3862069010734558, + "step": 96245 + }, + { + "epoch": 0.09694402225532588, + "grad_norm": 9.206205384956768, + "learning_rate": 4.972943196840725e-05, + "loss": 2.345, + "mean_token_accuracy": 0.38620689511299133, + "step": 96250 + }, + { + "epoch": 0.09694905830843005, + "grad_norm": 10.518785193823504, + "learning_rate": 4.972937402886684e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.3827586233615875, + "step": 96255 + }, + { + "epoch": 0.09695409436153422, + "grad_norm": 10.70244045104713, + "learning_rate": 4.972931608316104e-05, + "loss": 2.6989, + "mean_token_accuracy": 0.36896551549434664, + "step": 96260 + }, + { + "epoch": 0.0969591304146384, + "grad_norm": 12.173620876534342, + "learning_rate": 4.972925813128986e-05, + "loss": 2.2743, + "mean_token_accuracy": 0.47586206197738645, + "step": 96265 + }, + { + "epoch": 0.09696416646774257, + "grad_norm": 12.750506759064681, + "learning_rate": 4.972920017325331e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.42068966031074523, + "step": 96270 + }, + { + "epoch": 0.09696920252084675, + "grad_norm": 14.50099229685208, + "learning_rate": 4.9729142209051404e-05, + "loss": 2.0598, + "mean_token_accuracy": 0.4448275864124298, + "step": 96275 + }, + { + "epoch": 0.09697423857395092, + "grad_norm": 12.28813207536173, + "learning_rate": 4.972908423868417e-05, + "loss": 2.7888, + "mean_token_accuracy": 0.3517241358757019, + "step": 96280 + }, + { + "epoch": 0.09697927462705509, + "grad_norm": 10.2264249771214, + "learning_rate": 4.9729026262151604e-05, + "loss": 2.8685, + "mean_token_accuracy": 0.34137930274009703, + "step": 96285 + }, + { + "epoch": 0.09698431068015927, + "grad_norm": 9.66089230367393, + "learning_rate": 4.9728968279453744e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.4517241358757019, + "step": 96290 + }, + { + "epoch": 0.09698934673326343, + "grad_norm": 10.161013127273788, + "learning_rate": 4.9728910290590595e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.43103448748588563, + "step": 96295 + }, + { + "epoch": 0.0969943827863676, + "grad_norm": 10.836896898606948, + "learning_rate": 4.972885229556217e-05, + "loss": 2.4086, + "mean_token_accuracy": 0.42068964838981626, + "step": 96300 + }, + { + "epoch": 0.09699941883947177, + "grad_norm": 9.121540904836728, + "learning_rate": 4.97287942943685e-05, + "loss": 2.454, + "mean_token_accuracy": 0.4241379380226135, + "step": 96305 + }, + { + "epoch": 0.09700445489257595, + "grad_norm": 10.68270404516779, + "learning_rate": 4.9728736287009595e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.43103448748588563, + "step": 96310 + }, + { + "epoch": 0.09700949094568012, + "grad_norm": 12.206640846089213, + "learning_rate": 4.9728678273485455e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.4517241418361664, + "step": 96315 + }, + { + "epoch": 0.0970145269987843, + "grad_norm": 11.266393821578305, + "learning_rate": 4.972862025379611e-05, + "loss": 1.9887, + "mean_token_accuracy": 0.482758617401123, + "step": 96320 + }, + { + "epoch": 0.09701956305188847, + "grad_norm": 11.670018432279093, + "learning_rate": 4.972856222794157e-05, + "loss": 2.6934, + "mean_token_accuracy": 0.38620689511299133, + "step": 96325 + }, + { + "epoch": 0.09702459910499264, + "grad_norm": 15.708158982980189, + "learning_rate": 4.972850419592186e-05, + "loss": 2.723, + "mean_token_accuracy": 0.38965516686439516, + "step": 96330 + }, + { + "epoch": 0.09702963515809682, + "grad_norm": 11.90522545723829, + "learning_rate": 4.9728446157737e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.4379310369491577, + "step": 96335 + }, + { + "epoch": 0.09703467121120099, + "grad_norm": 10.722682593397936, + "learning_rate": 4.9728388113386985e-05, + "loss": 2.3176, + "mean_token_accuracy": 0.44640048742294314, + "step": 96340 + }, + { + "epoch": 0.09703970726430516, + "grad_norm": 10.28194351545637, + "learning_rate": 4.9728330062871845e-05, + "loss": 2.1514, + "mean_token_accuracy": 0.4689655125141144, + "step": 96345 + }, + { + "epoch": 0.09704474331740934, + "grad_norm": 13.302127974048107, + "learning_rate": 4.97282720061916e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.42413793206214906, + "step": 96350 + }, + { + "epoch": 0.09704977937051351, + "grad_norm": 14.791178161344368, + "learning_rate": 4.9728213943346257e-05, + "loss": 2.7785, + "mean_token_accuracy": 0.3772534757852554, + "step": 96355 + }, + { + "epoch": 0.09705481542361769, + "grad_norm": 9.59324219540868, + "learning_rate": 4.9728155874335834e-05, + "loss": 2.443, + "mean_token_accuracy": 0.4293406009674072, + "step": 96360 + }, + { + "epoch": 0.09705985147672185, + "grad_norm": 11.679455275355581, + "learning_rate": 4.972809779916036e-05, + "loss": 2.6782, + "mean_token_accuracy": 0.4034482717514038, + "step": 96365 + }, + { + "epoch": 0.09706488752982602, + "grad_norm": 13.765571351258297, + "learning_rate": 4.972803971781983e-05, + "loss": 2.7268, + "mean_token_accuracy": 0.35862069129943847, + "step": 96370 + }, + { + "epoch": 0.09706992358293019, + "grad_norm": 14.508634534154936, + "learning_rate": 4.972798163031427e-05, + "loss": 2.857, + "mean_token_accuracy": 0.3793103456497192, + "step": 96375 + }, + { + "epoch": 0.09707495963603437, + "grad_norm": 11.500947054818651, + "learning_rate": 4.97279235366437e-05, + "loss": 2.3497, + "mean_token_accuracy": 0.4862069010734558, + "step": 96380 + }, + { + "epoch": 0.09707999568913854, + "grad_norm": 10.221659800141756, + "learning_rate": 4.9727865436808135e-05, + "loss": 2.2958, + "mean_token_accuracy": 0.4103448331356049, + "step": 96385 + }, + { + "epoch": 0.09708503174224271, + "grad_norm": 11.803839976263612, + "learning_rate": 4.9727807330807594e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.4275861978530884, + "step": 96390 + }, + { + "epoch": 0.09709006779534689, + "grad_norm": 13.160267529879707, + "learning_rate": 4.972774921864207e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.3896551728248596, + "step": 96395 + }, + { + "epoch": 0.09709510384845106, + "grad_norm": 15.134069777010525, + "learning_rate": 4.9727691100311615e-05, + "loss": 2.8563, + "mean_token_accuracy": 0.42413793206214906, + "step": 96400 + }, + { + "epoch": 0.09710013990155524, + "grad_norm": 9.776494339148266, + "learning_rate": 4.972763297581621e-05, + "loss": 2.3772, + "mean_token_accuracy": 0.44482758045196535, + "step": 96405 + }, + { + "epoch": 0.09710517595465941, + "grad_norm": 12.878171520094753, + "learning_rate": 4.97275748451559e-05, + "loss": 2.6478, + "mean_token_accuracy": 0.41034482717514037, + "step": 96410 + }, + { + "epoch": 0.09711021200776358, + "grad_norm": 10.362518908496101, + "learning_rate": 4.9727516708330684e-05, + "loss": 2.102, + "mean_token_accuracy": 0.45172414779663084, + "step": 96415 + }, + { + "epoch": 0.09711524806086776, + "grad_norm": 8.79744196470157, + "learning_rate": 4.972745856534059e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.42758620381355283, + "step": 96420 + }, + { + "epoch": 0.09712028411397193, + "grad_norm": 11.285929014660118, + "learning_rate": 4.9727400416185615e-05, + "loss": 2.8295, + "mean_token_accuracy": 0.37586206793785093, + "step": 96425 + }, + { + "epoch": 0.0971253201670761, + "grad_norm": 14.245513012876458, + "learning_rate": 4.972734226086579e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.42758620381355283, + "step": 96430 + }, + { + "epoch": 0.09713035622018026, + "grad_norm": 11.89543996490038, + "learning_rate": 4.972728409938114e-05, + "loss": 2.247, + "mean_token_accuracy": 0.47931033968925474, + "step": 96435 + }, + { + "epoch": 0.09713539227328444, + "grad_norm": 12.290008674368332, + "learning_rate": 4.972722593173166e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.44137930274009707, + "step": 96440 + }, + { + "epoch": 0.09714042832638861, + "grad_norm": 9.45734294689451, + "learning_rate": 4.972716775791738e-05, + "loss": 2.2678, + "mean_token_accuracy": 0.43793103098869324, + "step": 96445 + }, + { + "epoch": 0.09714546437949279, + "grad_norm": 13.050424096547166, + "learning_rate": 4.972710957793832e-05, + "loss": 2.0249, + "mean_token_accuracy": 0.4689655065536499, + "step": 96450 + }, + { + "epoch": 0.09715050043259696, + "grad_norm": 11.33976963118481, + "learning_rate": 4.972705139179447e-05, + "loss": 2.1287, + "mean_token_accuracy": 0.44827585816383364, + "step": 96455 + }, + { + "epoch": 0.09715553648570113, + "grad_norm": 9.66695533952064, + "learning_rate": 4.9726993199485876e-05, + "loss": 2.6678, + "mean_token_accuracy": 0.37241379618644715, + "step": 96460 + }, + { + "epoch": 0.0971605725388053, + "grad_norm": 10.76286878115784, + "learning_rate": 4.9726935001012546e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.42758620381355283, + "step": 96465 + }, + { + "epoch": 0.09716560859190948, + "grad_norm": 11.671631007756726, + "learning_rate": 4.972687679637448e-05, + "loss": 2.311, + "mean_token_accuracy": 0.43103447556495667, + "step": 96470 + }, + { + "epoch": 0.09717064464501365, + "grad_norm": 10.934151094439425, + "learning_rate": 4.9726818585571714e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4530550479888916, + "step": 96475 + }, + { + "epoch": 0.09717568069811783, + "grad_norm": 11.345214959531479, + "learning_rate": 4.972676036860426e-05, + "loss": 2.7723, + "mean_token_accuracy": 0.36896551251411436, + "step": 96480 + }, + { + "epoch": 0.097180716751222, + "grad_norm": 22.268393400185147, + "learning_rate": 4.972670214547213e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.41034482717514037, + "step": 96485 + }, + { + "epoch": 0.09718575280432618, + "grad_norm": 11.723755830109697, + "learning_rate": 4.9726643916175336e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.43278886675834655, + "step": 96490 + }, + { + "epoch": 0.09719078885743035, + "grad_norm": 11.063633820950281, + "learning_rate": 4.9726585680713906e-05, + "loss": 2.6602, + "mean_token_accuracy": 0.3896551787853241, + "step": 96495 + }, + { + "epoch": 0.09719582491053452, + "grad_norm": 10.265460409837928, + "learning_rate": 4.972652743908784e-05, + "loss": 2.4788, + "mean_token_accuracy": 0.4034482717514038, + "step": 96500 + }, + { + "epoch": 0.09720086096363868, + "grad_norm": 10.596362514804271, + "learning_rate": 4.972646919129717e-05, + "loss": 2.3332, + "mean_token_accuracy": 0.4413793206214905, + "step": 96505 + }, + { + "epoch": 0.09720589701674286, + "grad_norm": 10.68046442908292, + "learning_rate": 4.9726410937341906e-05, + "loss": 2.3237, + "mean_token_accuracy": 0.36896551251411436, + "step": 96510 + }, + { + "epoch": 0.09721093306984703, + "grad_norm": 11.37897603361246, + "learning_rate": 4.972635267722206e-05, + "loss": 2.2783, + "mean_token_accuracy": 0.41379310488700866, + "step": 96515 + }, + { + "epoch": 0.0972159691229512, + "grad_norm": 13.388013996660984, + "learning_rate": 4.972629441093766e-05, + "loss": 2.7003, + "mean_token_accuracy": 0.3947973370552063, + "step": 96520 + }, + { + "epoch": 0.09722100517605538, + "grad_norm": 9.976030042400652, + "learning_rate": 4.9726236138488705e-05, + "loss": 2.5135, + "mean_token_accuracy": 0.42758620977401735, + "step": 96525 + }, + { + "epoch": 0.09722604122915955, + "grad_norm": 10.32330164844996, + "learning_rate": 4.9726177859875236e-05, + "loss": 2.4576, + "mean_token_accuracy": 0.4103448331356049, + "step": 96530 + }, + { + "epoch": 0.09723107728226373, + "grad_norm": 10.579976905442924, + "learning_rate": 4.972611957509724e-05, + "loss": 2.4994, + "mean_token_accuracy": 0.4206896543502808, + "step": 96535 + }, + { + "epoch": 0.0972361133353679, + "grad_norm": 12.948064003966051, + "learning_rate": 4.9726061284154744e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.4190562665462494, + "step": 96540 + }, + { + "epoch": 0.09724114938847207, + "grad_norm": 9.934205874096246, + "learning_rate": 4.9726002987047775e-05, + "loss": 2.524, + "mean_token_accuracy": 0.4034482777118683, + "step": 96545 + }, + { + "epoch": 0.09724618544157625, + "grad_norm": 15.147394639964748, + "learning_rate": 4.9725944683776344e-05, + "loss": 2.9924, + "mean_token_accuracy": 0.341379314661026, + "step": 96550 + }, + { + "epoch": 0.09725122149468042, + "grad_norm": 10.6263312476722, + "learning_rate": 4.972588637434045e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.4206896543502808, + "step": 96555 + }, + { + "epoch": 0.0972562575477846, + "grad_norm": 12.236231670175371, + "learning_rate": 4.972582805874014e-05, + "loss": 2.6418, + "mean_token_accuracy": 0.4310344815254211, + "step": 96560 + }, + { + "epoch": 0.09726129360088877, + "grad_norm": 9.383906429932566, + "learning_rate": 4.972576973697541e-05, + "loss": 2.1375, + "mean_token_accuracy": 0.4413793087005615, + "step": 96565 + }, + { + "epoch": 0.09726632965399294, + "grad_norm": 8.455281460553742, + "learning_rate": 4.972571140904627e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.3896551698446274, + "step": 96570 + }, + { + "epoch": 0.0972713657070971, + "grad_norm": 11.134645701505692, + "learning_rate": 4.9725653074952755e-05, + "loss": 2.6941, + "mean_token_accuracy": 0.4448275864124298, + "step": 96575 + }, + { + "epoch": 0.09727640176020128, + "grad_norm": 11.831679969379401, + "learning_rate": 4.972559473469487e-05, + "loss": 2.3259, + "mean_token_accuracy": 0.46551724076271056, + "step": 96580 + }, + { + "epoch": 0.09728143781330545, + "grad_norm": 11.97817942967243, + "learning_rate": 4.9725536388272634e-05, + "loss": 2.6208, + "mean_token_accuracy": 0.36896551549434664, + "step": 96585 + }, + { + "epoch": 0.09728647386640962, + "grad_norm": 10.981562391008152, + "learning_rate": 4.9725478035686064e-05, + "loss": 2.0739, + "mean_token_accuracy": 0.4979064166545868, + "step": 96590 + }, + { + "epoch": 0.0972915099195138, + "grad_norm": 9.143473257145102, + "learning_rate": 4.9725419676935174e-05, + "loss": 2.538, + "mean_token_accuracy": 0.42413793206214906, + "step": 96595 + }, + { + "epoch": 0.09729654597261797, + "grad_norm": 9.73518849612509, + "learning_rate": 4.972536131201997e-05, + "loss": 2.406, + "mean_token_accuracy": 0.38275861740112305, + "step": 96600 + }, + { + "epoch": 0.09730158202572214, + "grad_norm": 9.378363356724913, + "learning_rate": 4.97253029409405e-05, + "loss": 2.4046, + "mean_token_accuracy": 0.3758620619773865, + "step": 96605 + }, + { + "epoch": 0.09730661807882632, + "grad_norm": 15.153794753572866, + "learning_rate": 4.9725244563696743e-05, + "loss": 3.1952, + "mean_token_accuracy": 0.36896551251411436, + "step": 96610 + }, + { + "epoch": 0.09731165413193049, + "grad_norm": 12.534745261124794, + "learning_rate": 4.972518618028874e-05, + "loss": 2.9086, + "mean_token_accuracy": 0.37586206793785093, + "step": 96615 + }, + { + "epoch": 0.09731669018503467, + "grad_norm": 10.439136848106703, + "learning_rate": 4.9725127790716495e-05, + "loss": 2.4867, + "mean_token_accuracy": 0.42928009629249575, + "step": 96620 + }, + { + "epoch": 0.09732172623813884, + "grad_norm": 11.952303776150343, + "learning_rate": 4.972506939498002e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.43793103098869324, + "step": 96625 + }, + { + "epoch": 0.09732676229124301, + "grad_norm": 8.528502611793806, + "learning_rate": 4.9725010993079354e-05, + "loss": 2.09, + "mean_token_accuracy": 0.4931034564971924, + "step": 96630 + }, + { + "epoch": 0.09733179834434719, + "grad_norm": 11.0862047676407, + "learning_rate": 4.972495258501449e-05, + "loss": 2.3867, + "mean_token_accuracy": 0.39310344457626345, + "step": 96635 + }, + { + "epoch": 0.09733683439745136, + "grad_norm": 12.67013460086597, + "learning_rate": 4.972489417078545e-05, + "loss": 2.9974, + "mean_token_accuracy": 0.4034482717514038, + "step": 96640 + }, + { + "epoch": 0.09734187045055552, + "grad_norm": 11.490876939259309, + "learning_rate": 4.972483575039226e-05, + "loss": 2.244, + "mean_token_accuracy": 0.4344827592372894, + "step": 96645 + }, + { + "epoch": 0.0973469065036597, + "grad_norm": 22.726410779267297, + "learning_rate": 4.972477732383492e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.3999999940395355, + "step": 96650 + }, + { + "epoch": 0.09735194255676387, + "grad_norm": 10.32493373933577, + "learning_rate": 4.972471889111346e-05, + "loss": 2.606, + "mean_token_accuracy": 0.4172413766384125, + "step": 96655 + }, + { + "epoch": 0.09735697860986804, + "grad_norm": 14.034009480366368, + "learning_rate": 4.97246604522279e-05, + "loss": 2.5075, + "mean_token_accuracy": 0.3896551787853241, + "step": 96660 + }, + { + "epoch": 0.09736201466297222, + "grad_norm": 9.69913131713579, + "learning_rate": 4.972460200717824e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.44827585220336913, + "step": 96665 + }, + { + "epoch": 0.09736705071607639, + "grad_norm": 10.68681207746539, + "learning_rate": 4.97245435559645e-05, + "loss": 2.2344, + "mean_token_accuracy": 0.39310344457626345, + "step": 96670 + }, + { + "epoch": 0.09737208676918056, + "grad_norm": 9.419592364122364, + "learning_rate": 4.972448509858671e-05, + "loss": 2.3428, + "mean_token_accuracy": 0.4034482777118683, + "step": 96675 + }, + { + "epoch": 0.09737712282228474, + "grad_norm": 10.311463980498708, + "learning_rate": 4.9724426635044865e-05, + "loss": 2.3887, + "mean_token_accuracy": 0.4551724076271057, + "step": 96680 + }, + { + "epoch": 0.09738215887538891, + "grad_norm": 11.602580995050053, + "learning_rate": 4.9724368165338995e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.47241379618644713, + "step": 96685 + }, + { + "epoch": 0.09738719492849308, + "grad_norm": 10.192490845618508, + "learning_rate": 4.972430968946911e-05, + "loss": 2.0668, + "mean_token_accuracy": 0.47931034564971925, + "step": 96690 + }, + { + "epoch": 0.09739223098159726, + "grad_norm": 8.355083860858045, + "learning_rate": 4.972425120743524e-05, + "loss": 2.0145, + "mean_token_accuracy": 0.47931034564971925, + "step": 96695 + }, + { + "epoch": 0.09739726703470143, + "grad_norm": 10.979154835600486, + "learning_rate": 4.9724192719237385e-05, + "loss": 2.2679, + "mean_token_accuracy": 0.45716878175735476, + "step": 96700 + }, + { + "epoch": 0.0974023030878056, + "grad_norm": 10.966468621282301, + "learning_rate": 4.9724134224875565e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.44482758045196535, + "step": 96705 + }, + { + "epoch": 0.09740733914090978, + "grad_norm": 10.51274154958129, + "learning_rate": 4.9724075724349805e-05, + "loss": 2.7362, + "mean_token_accuracy": 0.379310342669487, + "step": 96710 + }, + { + "epoch": 0.09741237519401394, + "grad_norm": 11.749460811620443, + "learning_rate": 4.972401721766011e-05, + "loss": 2.1498, + "mean_token_accuracy": 0.42413792610168455, + "step": 96715 + }, + { + "epoch": 0.09741741124711811, + "grad_norm": 11.455565578309374, + "learning_rate": 4.9723958704806505e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.4586206912994385, + "step": 96720 + }, + { + "epoch": 0.09742244730022229, + "grad_norm": 14.03513215479129, + "learning_rate": 4.9723900185789e-05, + "loss": 2.4985, + "mean_token_accuracy": 0.4379310369491577, + "step": 96725 + }, + { + "epoch": 0.09742748335332646, + "grad_norm": 12.871879266978096, + "learning_rate": 4.9723841660607615e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.42758620977401735, + "step": 96730 + }, + { + "epoch": 0.09743251940643063, + "grad_norm": 9.878963807440284, + "learning_rate": 4.972378312926236e-05, + "loss": 2.2238, + "mean_token_accuracy": 0.4413793087005615, + "step": 96735 + }, + { + "epoch": 0.09743755545953481, + "grad_norm": 10.909500353747827, + "learning_rate": 4.9723724591753255e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.36896551847457887, + "step": 96740 + }, + { + "epoch": 0.09744259151263898, + "grad_norm": 12.746491718297278, + "learning_rate": 4.972366604808033e-05, + "loss": 2.7272, + "mean_token_accuracy": 0.4310344815254211, + "step": 96745 + }, + { + "epoch": 0.09744762756574316, + "grad_norm": 10.632690916188793, + "learning_rate": 4.9723607498243575e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.4310344815254211, + "step": 96750 + }, + { + "epoch": 0.09745266361884733, + "grad_norm": 10.125773622232813, + "learning_rate": 4.972354894224302e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.38965516090393065, + "step": 96755 + }, + { + "epoch": 0.0974576996719515, + "grad_norm": 9.02353078138823, + "learning_rate": 4.9723490380078685e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.41724138259887694, + "step": 96760 + }, + { + "epoch": 0.09746273572505568, + "grad_norm": 10.689419546854017, + "learning_rate": 4.972343181175058e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.36896551549434664, + "step": 96765 + }, + { + "epoch": 0.09746777177815985, + "grad_norm": 10.41181986035487, + "learning_rate": 4.9723373237258725e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.3931034505367279, + "step": 96770 + }, + { + "epoch": 0.09747280783126402, + "grad_norm": 11.467168512770447, + "learning_rate": 4.972331465660313e-05, + "loss": 2.375, + "mean_token_accuracy": 0.4517241358757019, + "step": 96775 + }, + { + "epoch": 0.0974778438843682, + "grad_norm": 9.281237339857533, + "learning_rate": 4.9723256069783824e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.441379314661026, + "step": 96780 + }, + { + "epoch": 0.09748287993747236, + "grad_norm": 9.638861439223946, + "learning_rate": 4.9723197476800806e-05, + "loss": 2.1548, + "mean_token_accuracy": 0.4344827592372894, + "step": 96785 + }, + { + "epoch": 0.09748791599057653, + "grad_norm": 10.722541410704167, + "learning_rate": 4.972313887765411e-05, + "loss": 2.809, + "mean_token_accuracy": 0.31379311084747313, + "step": 96790 + }, + { + "epoch": 0.0974929520436807, + "grad_norm": 10.504543482092245, + "learning_rate": 4.9723080272343736e-05, + "loss": 2.3093, + "mean_token_accuracy": 0.441379314661026, + "step": 96795 + }, + { + "epoch": 0.09749798809678488, + "grad_norm": 12.40963604534285, + "learning_rate": 4.972302166086971e-05, + "loss": 2.7733, + "mean_token_accuracy": 0.33448276221752166, + "step": 96800 + }, + { + "epoch": 0.09750302414988905, + "grad_norm": 17.54052414336084, + "learning_rate": 4.972296304323205e-05, + "loss": 2.7422, + "mean_token_accuracy": 0.4257713258266449, + "step": 96805 + }, + { + "epoch": 0.09750806020299323, + "grad_norm": 8.67166551567929, + "learning_rate": 4.972290441943077e-05, + "loss": 2.2464, + "mean_token_accuracy": 0.43103447556495667, + "step": 96810 + }, + { + "epoch": 0.0975130962560974, + "grad_norm": 10.283620969702627, + "learning_rate": 4.972284578946588e-05, + "loss": 2.5574, + "mean_token_accuracy": 0.42758620977401735, + "step": 96815 + }, + { + "epoch": 0.09751813230920157, + "grad_norm": 10.516645771920953, + "learning_rate": 4.97227871533374e-05, + "loss": 2.414, + "mean_token_accuracy": 0.3689655244350433, + "step": 96820 + }, + { + "epoch": 0.09752316836230575, + "grad_norm": 10.089120465956569, + "learning_rate": 4.972272851104535e-05, + "loss": 2.2177, + "mean_token_accuracy": 0.4379310369491577, + "step": 96825 + }, + { + "epoch": 0.09752820441540992, + "grad_norm": 10.628548014782055, + "learning_rate": 4.972266986258974e-05, + "loss": 2.3846, + "mean_token_accuracy": 0.441379314661026, + "step": 96830 + }, + { + "epoch": 0.0975332404685141, + "grad_norm": 11.092546364207022, + "learning_rate": 4.972261120797059e-05, + "loss": 2.4652, + "mean_token_accuracy": 0.3896551728248596, + "step": 96835 + }, + { + "epoch": 0.09753827652161827, + "grad_norm": 11.414157122837512, + "learning_rate": 4.972255254718791e-05, + "loss": 2.7347, + "mean_token_accuracy": 0.3896551638841629, + "step": 96840 + }, + { + "epoch": 0.09754331257472244, + "grad_norm": 12.616799986478412, + "learning_rate": 4.972249388024173e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.3620689630508423, + "step": 96845 + }, + { + "epoch": 0.09754834862782662, + "grad_norm": 18.457025654276315, + "learning_rate": 4.972243520713206e-05, + "loss": 2.775, + "mean_token_accuracy": 0.36896551847457887, + "step": 96850 + }, + { + "epoch": 0.09755338468093078, + "grad_norm": 10.25863709002003, + "learning_rate": 4.9722376527858916e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.4517241418361664, + "step": 96855 + }, + { + "epoch": 0.09755842073403495, + "grad_norm": 9.884468111241974, + "learning_rate": 4.972231784242231e-05, + "loss": 2.1127, + "mean_token_accuracy": 0.49655171632766726, + "step": 96860 + }, + { + "epoch": 0.09756345678713912, + "grad_norm": 14.342379478344315, + "learning_rate": 4.972225915082226e-05, + "loss": 2.5208, + "mean_token_accuracy": 0.4034482717514038, + "step": 96865 + }, + { + "epoch": 0.0975684928402433, + "grad_norm": 10.39999822511253, + "learning_rate": 4.9722200453058784e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.4586206912994385, + "step": 96870 + }, + { + "epoch": 0.09757352889334747, + "grad_norm": 10.518821936936972, + "learning_rate": 4.97221417491319e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.41379311084747317, + "step": 96875 + }, + { + "epoch": 0.09757856494645165, + "grad_norm": 12.604331784907307, + "learning_rate": 4.9722083039041615e-05, + "loss": 2.6805, + "mean_token_accuracy": 0.44319419264793397, + "step": 96880 + }, + { + "epoch": 0.09758360099955582, + "grad_norm": 11.225025639171049, + "learning_rate": 4.972202432278796e-05, + "loss": 2.7914, + "mean_token_accuracy": 0.38620689511299133, + "step": 96885 + }, + { + "epoch": 0.09758863705266, + "grad_norm": 10.483903283155431, + "learning_rate": 4.972196560037094e-05, + "loss": 2.2835, + "mean_token_accuracy": 0.4517241358757019, + "step": 96890 + }, + { + "epoch": 0.09759367310576417, + "grad_norm": 10.141427078451763, + "learning_rate": 4.9721906871790574e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.39310344457626345, + "step": 96895 + }, + { + "epoch": 0.09759870915886834, + "grad_norm": 13.686144988127884, + "learning_rate": 4.972184813704688e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.41379310488700866, + "step": 96900 + }, + { + "epoch": 0.09760374521197251, + "grad_norm": 12.859264222888662, + "learning_rate": 4.972178939613988e-05, + "loss": 2.058, + "mean_token_accuracy": 0.512583178281784, + "step": 96905 + }, + { + "epoch": 0.09760878126507669, + "grad_norm": 9.388715490160703, + "learning_rate": 4.9721730649069574e-05, + "loss": 2.498, + "mean_token_accuracy": 0.44482759237289426, + "step": 96910 + }, + { + "epoch": 0.09761381731818086, + "grad_norm": 10.146798647166731, + "learning_rate": 4.972167189583599e-05, + "loss": 2.6846, + "mean_token_accuracy": 0.4034482717514038, + "step": 96915 + }, + { + "epoch": 0.09761885337128504, + "grad_norm": 9.6972940625141, + "learning_rate": 4.972161313643915e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.41724138259887694, + "step": 96920 + }, + { + "epoch": 0.0976238894243892, + "grad_norm": 13.675951276330927, + "learning_rate": 4.972155437087905e-05, + "loss": 2.3274, + "mean_token_accuracy": 0.42413793206214906, + "step": 96925 + }, + { + "epoch": 0.09762892547749337, + "grad_norm": 12.747957802078194, + "learning_rate": 4.972149559915573e-05, + "loss": 2.3109, + "mean_token_accuracy": 0.47931034564971925, + "step": 96930 + }, + { + "epoch": 0.09763396153059754, + "grad_norm": 14.625429362218068, + "learning_rate": 4.972143682126919e-05, + "loss": 2.513, + "mean_token_accuracy": 0.3999999940395355, + "step": 96935 + }, + { + "epoch": 0.09763899758370172, + "grad_norm": 10.324819747378985, + "learning_rate": 4.972137803721946e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.4689655065536499, + "step": 96940 + }, + { + "epoch": 0.09764403363680589, + "grad_norm": 10.039963294724497, + "learning_rate": 4.9721319247006544e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.3965517163276672, + "step": 96945 + }, + { + "epoch": 0.09764906968991006, + "grad_norm": 10.16367309737379, + "learning_rate": 4.972126045063046e-05, + "loss": 2.5677, + "mean_token_accuracy": 0.4172413766384125, + "step": 96950 + }, + { + "epoch": 0.09765410574301424, + "grad_norm": 10.475601876917041, + "learning_rate": 4.972120164809122e-05, + "loss": 2.6817, + "mean_token_accuracy": 0.41034482717514037, + "step": 96955 + }, + { + "epoch": 0.09765914179611841, + "grad_norm": 12.355243013818377, + "learning_rate": 4.972114283938886e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.39310344457626345, + "step": 96960 + }, + { + "epoch": 0.09766417784922259, + "grad_norm": 15.27779979410524, + "learning_rate": 4.9721084024523376e-05, + "loss": 2.5096, + "mean_token_accuracy": 0.3655172407627106, + "step": 96965 + }, + { + "epoch": 0.09766921390232676, + "grad_norm": 9.78506719719141, + "learning_rate": 4.972102520349479e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.4399878978729248, + "step": 96970 + }, + { + "epoch": 0.09767424995543093, + "grad_norm": 10.300117061269772, + "learning_rate": 4.9720966376303115e-05, + "loss": 2.7006, + "mean_token_accuracy": 0.4379310369491577, + "step": 96975 + }, + { + "epoch": 0.09767928600853511, + "grad_norm": 9.5872906212588, + "learning_rate": 4.972090754294838e-05, + "loss": 2.0816, + "mean_token_accuracy": 0.4517241358757019, + "step": 96980 + }, + { + "epoch": 0.09768432206163928, + "grad_norm": 11.323234929765944, + "learning_rate": 4.972084870343059e-05, + "loss": 2.53, + "mean_token_accuracy": 0.4379310369491577, + "step": 96985 + }, + { + "epoch": 0.09768935811474345, + "grad_norm": 9.668635615328112, + "learning_rate": 4.9720789857749774e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.3896551728248596, + "step": 96990 + }, + { + "epoch": 0.09769439416784761, + "grad_norm": 14.27955008293976, + "learning_rate": 4.9720731005905925e-05, + "loss": 2.1514, + "mean_token_accuracy": 0.493103438615799, + "step": 96995 + }, + { + "epoch": 0.09769943022095179, + "grad_norm": 10.349679125934957, + "learning_rate": 4.972067214789908e-05, + "loss": 3.3402, + "mean_token_accuracy": 0.29999999403953553, + "step": 97000 + }, + { + "epoch": 0.09770446627405596, + "grad_norm": 15.34680864772722, + "learning_rate": 4.9720613283729246e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.37931033968925476, + "step": 97005 + }, + { + "epoch": 0.09770950232716014, + "grad_norm": 9.737995876588291, + "learning_rate": 4.972055441339645e-05, + "loss": 3.0523, + "mean_token_accuracy": 0.3724137932062149, + "step": 97010 + }, + { + "epoch": 0.09771453838026431, + "grad_norm": 12.078511110337947, + "learning_rate": 4.972049553690069e-05, + "loss": 2.744, + "mean_token_accuracy": 0.38620689511299133, + "step": 97015 + }, + { + "epoch": 0.09771957443336848, + "grad_norm": 10.952544947212989, + "learning_rate": 4.9720436654242005e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.3724137842655182, + "step": 97020 + }, + { + "epoch": 0.09772461048647266, + "grad_norm": 10.289349562810282, + "learning_rate": 4.9720377765420384e-05, + "loss": 2.9472, + "mean_token_accuracy": 0.3827586233615875, + "step": 97025 + }, + { + "epoch": 0.09772964653957683, + "grad_norm": 11.303116774976345, + "learning_rate": 4.972031887043587e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.46206897497177124, + "step": 97030 + }, + { + "epoch": 0.097734682592681, + "grad_norm": 11.914021258129052, + "learning_rate": 4.9720259969288465e-05, + "loss": 2.7275, + "mean_token_accuracy": 0.3827586233615875, + "step": 97035 + }, + { + "epoch": 0.09773971864578518, + "grad_norm": 10.290119889460751, + "learning_rate": 4.972020106197818e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4241379201412201, + "step": 97040 + }, + { + "epoch": 0.09774475469888935, + "grad_norm": 9.94221672020278, + "learning_rate": 4.9720142148505055e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.44482758045196535, + "step": 97045 + }, + { + "epoch": 0.09774979075199353, + "grad_norm": 10.99525162725582, + "learning_rate": 4.972008322886908e-05, + "loss": 2.6319, + "mean_token_accuracy": 0.4403508722782135, + "step": 97050 + }, + { + "epoch": 0.0977548268050977, + "grad_norm": 9.632273198130806, + "learning_rate": 4.972002430307028e-05, + "loss": 2.4982, + "mean_token_accuracy": 0.417241370677948, + "step": 97055 + }, + { + "epoch": 0.09775986285820187, + "grad_norm": 11.81225659262148, + "learning_rate": 4.971996537110868e-05, + "loss": 2.7566, + "mean_token_accuracy": 0.4, + "step": 97060 + }, + { + "epoch": 0.09776489891130603, + "grad_norm": 10.29227096519714, + "learning_rate": 4.9719906432984284e-05, + "loss": 2.6421, + "mean_token_accuracy": 0.39655172228813174, + "step": 97065 + }, + { + "epoch": 0.09776993496441021, + "grad_norm": 9.038997782769467, + "learning_rate": 4.9719847488697116e-05, + "loss": 2.4468, + "mean_token_accuracy": 0.40689654350280763, + "step": 97070 + }, + { + "epoch": 0.09777497101751438, + "grad_norm": 9.218105843716975, + "learning_rate": 4.971978853824719e-05, + "loss": 2.7998, + "mean_token_accuracy": 0.4137930989265442, + "step": 97075 + }, + { + "epoch": 0.09778000707061855, + "grad_norm": 10.579686076686146, + "learning_rate": 4.971972958163453e-05, + "loss": 2.9357, + "mean_token_accuracy": 0.3551724195480347, + "step": 97080 + }, + { + "epoch": 0.09778504312372273, + "grad_norm": 12.681114501875788, + "learning_rate": 4.971967061885913e-05, + "loss": 2.715, + "mean_token_accuracy": 0.37586206793785093, + "step": 97085 + }, + { + "epoch": 0.0977900791768269, + "grad_norm": 12.44891396194757, + "learning_rate": 4.9719611649921036e-05, + "loss": 2.5625, + "mean_token_accuracy": 0.4534785211086273, + "step": 97090 + }, + { + "epoch": 0.09779511522993108, + "grad_norm": 9.09394664856038, + "learning_rate": 4.971955267482024e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.41379311084747317, + "step": 97095 + }, + { + "epoch": 0.09780015128303525, + "grad_norm": 10.683238073930516, + "learning_rate": 4.9719493693556776e-05, + "loss": 2.7314, + "mean_token_accuracy": 0.37931033968925476, + "step": 97100 + }, + { + "epoch": 0.09780518733613942, + "grad_norm": 11.115355685730496, + "learning_rate": 4.971943470613065e-05, + "loss": 3.0163, + "mean_token_accuracy": 0.4034482717514038, + "step": 97105 + }, + { + "epoch": 0.0978102233892436, + "grad_norm": 12.03270088867243, + "learning_rate": 4.971937571254188e-05, + "loss": 3.09, + "mean_token_accuracy": 0.34482758641242983, + "step": 97110 + }, + { + "epoch": 0.09781525944234777, + "grad_norm": 12.062632611404117, + "learning_rate": 4.9719316712790484e-05, + "loss": 2.2458, + "mean_token_accuracy": 0.4344827651977539, + "step": 97115 + }, + { + "epoch": 0.09782029549545194, + "grad_norm": 10.650345045574621, + "learning_rate": 4.971925770687648e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.4896551787853241, + "step": 97120 + }, + { + "epoch": 0.09782533154855612, + "grad_norm": 18.457186575756406, + "learning_rate": 4.9719198694799874e-05, + "loss": 2.7712, + "mean_token_accuracy": 0.3758620619773865, + "step": 97125 + }, + { + "epoch": 0.09783036760166029, + "grad_norm": 13.788095787988311, + "learning_rate": 4.97191396765607e-05, + "loss": 2.62, + "mean_token_accuracy": 0.43448275327682495, + "step": 97130 + }, + { + "epoch": 0.09783540365476445, + "grad_norm": 9.512838211464976, + "learning_rate": 4.971908065215896e-05, + "loss": 2.5173, + "mean_token_accuracy": 0.42413793206214906, + "step": 97135 + }, + { + "epoch": 0.09784043970786863, + "grad_norm": 8.025492767362715, + "learning_rate": 4.971902162159467e-05, + "loss": 2.2136, + "mean_token_accuracy": 0.42758620977401735, + "step": 97140 + }, + { + "epoch": 0.0978454757609728, + "grad_norm": 11.875854353894487, + "learning_rate": 4.971896258486785e-05, + "loss": 2.3736, + "mean_token_accuracy": 0.4310344696044922, + "step": 97145 + }, + { + "epoch": 0.09785051181407697, + "grad_norm": 10.544674018491566, + "learning_rate": 4.971890354197853e-05, + "loss": 2.0914, + "mean_token_accuracy": 0.49999999403953554, + "step": 97150 + }, + { + "epoch": 0.09785554786718115, + "grad_norm": 11.22064793589582, + "learning_rate": 4.971884449292671e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.4206896543502808, + "step": 97155 + }, + { + "epoch": 0.09786058392028532, + "grad_norm": 10.645219980388783, + "learning_rate": 4.971878543771241e-05, + "loss": 2.7939, + "mean_token_accuracy": 0.39655171930789945, + "step": 97160 + }, + { + "epoch": 0.0978656199733895, + "grad_norm": 11.228176947380152, + "learning_rate": 4.971872637633565e-05, + "loss": 2.0494, + "mean_token_accuracy": 0.47586206793785096, + "step": 97165 + }, + { + "epoch": 0.09787065602649367, + "grad_norm": 14.192479031075733, + "learning_rate": 4.971866730879644e-05, + "loss": 2.4823, + "mean_token_accuracy": 0.45517240166664125, + "step": 97170 + }, + { + "epoch": 0.09787569207959784, + "grad_norm": 11.141254372866738, + "learning_rate": 4.971860823509479e-05, + "loss": 2.6461, + "mean_token_accuracy": 0.4137930989265442, + "step": 97175 + }, + { + "epoch": 0.09788072813270202, + "grad_norm": 11.076276741959363, + "learning_rate": 4.971854915523074e-05, + "loss": 2.4316, + "mean_token_accuracy": 0.42758620381355283, + "step": 97180 + }, + { + "epoch": 0.09788576418580619, + "grad_norm": 10.0024741676314, + "learning_rate": 4.971849006920429e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.4225045382976532, + "step": 97185 + }, + { + "epoch": 0.09789080023891036, + "grad_norm": 12.174778838548878, + "learning_rate": 4.971843097701545e-05, + "loss": 2.7311, + "mean_token_accuracy": 0.4034482777118683, + "step": 97190 + }, + { + "epoch": 0.09789583629201454, + "grad_norm": 11.093035923026877, + "learning_rate": 4.9718371878664257e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.41379311084747317, + "step": 97195 + }, + { + "epoch": 0.09790087234511871, + "grad_norm": 11.39140168290409, + "learning_rate": 4.97183127741507e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.3862069010734558, + "step": 97200 + }, + { + "epoch": 0.09790590839822287, + "grad_norm": 12.984332168372463, + "learning_rate": 4.971825366347483e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.41379310190677643, + "step": 97205 + }, + { + "epoch": 0.09791094445132704, + "grad_norm": 9.62460337736579, + "learning_rate": 4.9718194546636634e-05, + "loss": 2.5987, + "mean_token_accuracy": 0.40443349480628965, + "step": 97210 + }, + { + "epoch": 0.09791598050443122, + "grad_norm": 10.49651796765576, + "learning_rate": 4.971813542363614e-05, + "loss": 2.8555, + "mean_token_accuracy": 0.3620689630508423, + "step": 97215 + }, + { + "epoch": 0.09792101655753539, + "grad_norm": 11.223380933308345, + "learning_rate": 4.9718076294473364e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.4448275864124298, + "step": 97220 + }, + { + "epoch": 0.09792605261063957, + "grad_norm": 12.550344772911117, + "learning_rate": 4.971801715914833e-05, + "loss": 2.6305, + "mean_token_accuracy": 0.37586206793785093, + "step": 97225 + }, + { + "epoch": 0.09793108866374374, + "grad_norm": 9.534220346404886, + "learning_rate": 4.971795801766103e-05, + "loss": 2.137, + "mean_token_accuracy": 0.43103447556495667, + "step": 97230 + }, + { + "epoch": 0.09793612471684791, + "grad_norm": 21.661130683323204, + "learning_rate": 4.971789887001151e-05, + "loss": 2.6367, + "mean_token_accuracy": 0.4275861978530884, + "step": 97235 + }, + { + "epoch": 0.09794116076995209, + "grad_norm": 12.45245511823149, + "learning_rate": 4.9717839716199764e-05, + "loss": 2.2642, + "mean_token_accuracy": 0.44482759237289426, + "step": 97240 + }, + { + "epoch": 0.09794619682305626, + "grad_norm": 8.64125924468296, + "learning_rate": 4.971778055622582e-05, + "loss": 2.4256, + "mean_token_accuracy": 0.41379310488700866, + "step": 97245 + }, + { + "epoch": 0.09795123287616044, + "grad_norm": 8.089768182130541, + "learning_rate": 4.971772139008969e-05, + "loss": 2.1212, + "mean_token_accuracy": 0.4465214729309082, + "step": 97250 + }, + { + "epoch": 0.09795626892926461, + "grad_norm": 9.954138781133398, + "learning_rate": 4.97176622177914e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.4172413766384125, + "step": 97255 + }, + { + "epoch": 0.09796130498236878, + "grad_norm": 10.604904212375914, + "learning_rate": 4.9717603039330946e-05, + "loss": 2.4347, + "mean_token_accuracy": 0.4448275864124298, + "step": 97260 + }, + { + "epoch": 0.09796634103547296, + "grad_norm": 9.939429487131802, + "learning_rate": 4.9717543854708364e-05, + "loss": 2.5792, + "mean_token_accuracy": 0.4034482717514038, + "step": 97265 + }, + { + "epoch": 0.09797137708857713, + "grad_norm": 12.327897802146444, + "learning_rate": 4.971748466392367e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.37241379022598264, + "step": 97270 + }, + { + "epoch": 0.09797641314168129, + "grad_norm": 9.70918229342822, + "learning_rate": 4.971742546697687e-05, + "loss": 2.6968, + "mean_token_accuracy": 0.33793102502822875, + "step": 97275 + }, + { + "epoch": 0.09798144919478546, + "grad_norm": 9.603534685573297, + "learning_rate": 4.9717366263867974e-05, + "loss": 2.5284, + "mean_token_accuracy": 0.4068965494632721, + "step": 97280 + }, + { + "epoch": 0.09798648524788964, + "grad_norm": 9.98280413246755, + "learning_rate": 4.971730705459702e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.36551723480224607, + "step": 97285 + }, + { + "epoch": 0.09799152130099381, + "grad_norm": 8.73835032248004, + "learning_rate": 4.971724783916401e-05, + "loss": 3.0467, + "mean_token_accuracy": 0.358620685338974, + "step": 97290 + }, + { + "epoch": 0.09799655735409799, + "grad_norm": 13.120886250657922, + "learning_rate": 4.971718861756896e-05, + "loss": 2.3393, + "mean_token_accuracy": 0.4413793087005615, + "step": 97295 + }, + { + "epoch": 0.09800159340720216, + "grad_norm": 11.246169075642408, + "learning_rate": 4.971712938981189e-05, + "loss": 2.0537, + "mean_token_accuracy": 0.4637628495693207, + "step": 97300 + }, + { + "epoch": 0.09800662946030633, + "grad_norm": 13.524247366323623, + "learning_rate": 4.971707015589281e-05, + "loss": 2.621, + "mean_token_accuracy": 0.40852994918823243, + "step": 97305 + }, + { + "epoch": 0.0980116655134105, + "grad_norm": 13.292531467620794, + "learning_rate": 4.9717010915811764e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.4501512348651886, + "step": 97310 + }, + { + "epoch": 0.09801670156651468, + "grad_norm": 10.845938151751648, + "learning_rate": 4.971695166956873e-05, + "loss": 2.6533, + "mean_token_accuracy": 0.4075123190879822, + "step": 97315 + }, + { + "epoch": 0.09802173761961885, + "grad_norm": 10.804970780779586, + "learning_rate": 4.971689241716374e-05, + "loss": 2.2309, + "mean_token_accuracy": 0.4482758641242981, + "step": 97320 + }, + { + "epoch": 0.09802677367272303, + "grad_norm": 11.232538178954089, + "learning_rate": 4.9716833158596824e-05, + "loss": 2.463, + "mean_token_accuracy": 0.40865094065666197, + "step": 97325 + }, + { + "epoch": 0.0980318097258272, + "grad_norm": 10.280132166881835, + "learning_rate": 4.971677389386798e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.47084088921546935, + "step": 97330 + }, + { + "epoch": 0.09803684577893138, + "grad_norm": 11.784304700738083, + "learning_rate": 4.971671462297723e-05, + "loss": 2.139, + "mean_token_accuracy": 0.5041871964931488, + "step": 97335 + }, + { + "epoch": 0.09804188183203555, + "grad_norm": 11.541599706620644, + "learning_rate": 4.971665534592459e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.4068965494632721, + "step": 97340 + }, + { + "epoch": 0.09804691788513971, + "grad_norm": 10.862798458183697, + "learning_rate": 4.971659606271008e-05, + "loss": 2.3288, + "mean_token_accuracy": 0.458620685338974, + "step": 97345 + }, + { + "epoch": 0.09805195393824388, + "grad_norm": 12.807057917961442, + "learning_rate": 4.971653677333371e-05, + "loss": 2.6562, + "mean_token_accuracy": 0.37931033968925476, + "step": 97350 + }, + { + "epoch": 0.09805698999134806, + "grad_norm": 10.899880027749033, + "learning_rate": 4.971647747779551e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.4034482717514038, + "step": 97355 + }, + { + "epoch": 0.09806202604445223, + "grad_norm": 12.727422271199172, + "learning_rate": 4.971641817609548e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.43103448748588563, + "step": 97360 + }, + { + "epoch": 0.0980670620975564, + "grad_norm": 13.234311009544532, + "learning_rate": 4.971635886823365e-05, + "loss": 2.6209, + "mean_token_accuracy": 0.3862069010734558, + "step": 97365 + }, + { + "epoch": 0.09807209815066058, + "grad_norm": 10.673717316435907, + "learning_rate": 4.971629955421002e-05, + "loss": 2.103, + "mean_token_accuracy": 0.5068965494632721, + "step": 97370 + }, + { + "epoch": 0.09807713420376475, + "grad_norm": 13.643854912138634, + "learning_rate": 4.9716240234024625e-05, + "loss": 2.7575, + "mean_token_accuracy": 0.39310344457626345, + "step": 97375 + }, + { + "epoch": 0.09808217025686893, + "grad_norm": 15.536629801504441, + "learning_rate": 4.9716180907677466e-05, + "loss": 3.0589, + "mean_token_accuracy": 0.3724137872457504, + "step": 97380 + }, + { + "epoch": 0.0980872063099731, + "grad_norm": 11.069006068684299, + "learning_rate": 4.9716121575168576e-05, + "loss": 2.5988, + "mean_token_accuracy": 0.4463399827480316, + "step": 97385 + }, + { + "epoch": 0.09809224236307727, + "grad_norm": 11.328459349448453, + "learning_rate": 4.9716062236497954e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.42068964838981626, + "step": 97390 + }, + { + "epoch": 0.09809727841618145, + "grad_norm": 9.307879567752911, + "learning_rate": 4.971600289166562e-05, + "loss": 2.234, + "mean_token_accuracy": 0.4655172526836395, + "step": 97395 + }, + { + "epoch": 0.09810231446928562, + "grad_norm": 9.623135758282569, + "learning_rate": 4.971594354067161e-05, + "loss": 2.3823, + "mean_token_accuracy": 0.4551724135875702, + "step": 97400 + }, + { + "epoch": 0.0981073505223898, + "grad_norm": 11.00608412046635, + "learning_rate": 4.971588418351591e-05, + "loss": 2.936, + "mean_token_accuracy": 0.37586206793785093, + "step": 97405 + }, + { + "epoch": 0.09811238657549397, + "grad_norm": 14.724998573745744, + "learning_rate": 4.971582482019856e-05, + "loss": 2.3686, + "mean_token_accuracy": 0.4448275864124298, + "step": 97410 + }, + { + "epoch": 0.09811742262859813, + "grad_norm": 10.423728585907053, + "learning_rate": 4.971576545071957e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.458620685338974, + "step": 97415 + }, + { + "epoch": 0.0981224586817023, + "grad_norm": 10.165969535174682, + "learning_rate": 4.9715706075078946e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.37586206793785093, + "step": 97420 + }, + { + "epoch": 0.09812749473480648, + "grad_norm": 11.168144384515777, + "learning_rate": 4.9715646693276716e-05, + "loss": 2.6236, + "mean_token_accuracy": 0.4000000059604645, + "step": 97425 + }, + { + "epoch": 0.09813253078791065, + "grad_norm": 10.717334549610356, + "learning_rate": 4.97155873053129e-05, + "loss": 2.9845, + "mean_token_accuracy": 0.3983666092157364, + "step": 97430 + }, + { + "epoch": 0.09813756684101482, + "grad_norm": 10.073634886322745, + "learning_rate": 4.9715527911187496e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.46055657267570493, + "step": 97435 + }, + { + "epoch": 0.098142602894119, + "grad_norm": 10.609131957458919, + "learning_rate": 4.971546851090054e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.42413793206214906, + "step": 97440 + }, + { + "epoch": 0.09814763894722317, + "grad_norm": 9.962656117989539, + "learning_rate": 4.9715409104452044e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.4090139091014862, + "step": 97445 + }, + { + "epoch": 0.09815267500032734, + "grad_norm": 10.262654879251988, + "learning_rate": 4.971534969184202e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.4467029690742493, + "step": 97450 + }, + { + "epoch": 0.09815771105343152, + "grad_norm": 9.60327381626616, + "learning_rate": 4.971529027307049e-05, + "loss": 2.6289, + "mean_token_accuracy": 0.4034482717514038, + "step": 97455 + }, + { + "epoch": 0.09816274710653569, + "grad_norm": 11.189314917002674, + "learning_rate": 4.971523084813745e-05, + "loss": 2.2961, + "mean_token_accuracy": 0.4517241418361664, + "step": 97460 + }, + { + "epoch": 0.09816778315963987, + "grad_norm": 15.70734421804332, + "learning_rate": 4.971517141704294e-05, + "loss": 2.4001, + "mean_token_accuracy": 0.3862068921327591, + "step": 97465 + }, + { + "epoch": 0.09817281921274404, + "grad_norm": 9.87305913079176, + "learning_rate": 4.9715111979786976e-05, + "loss": 2.3866, + "mean_token_accuracy": 0.4172413766384125, + "step": 97470 + }, + { + "epoch": 0.09817785526584821, + "grad_norm": 12.197450815255955, + "learning_rate": 4.971505253636956e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.47586206793785096, + "step": 97475 + }, + { + "epoch": 0.09818289131895239, + "grad_norm": 8.488843296150286, + "learning_rate": 4.9714993086790724e-05, + "loss": 1.9413, + "mean_token_accuracy": 0.5275862038135528, + "step": 97480 + }, + { + "epoch": 0.09818792737205655, + "grad_norm": 14.485895980656561, + "learning_rate": 4.9714933631050467e-05, + "loss": 2.8323, + "mean_token_accuracy": 0.37241379022598264, + "step": 97485 + }, + { + "epoch": 0.09819296342516072, + "grad_norm": 12.255518271692631, + "learning_rate": 4.971487416914882e-05, + "loss": 2.8109, + "mean_token_accuracy": 0.3965517282485962, + "step": 97490 + }, + { + "epoch": 0.0981979994782649, + "grad_norm": 11.982830983494642, + "learning_rate": 4.97148147010858e-05, + "loss": 2.5581, + "mean_token_accuracy": 0.4344827592372894, + "step": 97495 + }, + { + "epoch": 0.09820303553136907, + "grad_norm": 12.521465508428536, + "learning_rate": 4.9714755226861414e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.4159709572792053, + "step": 97500 + }, + { + "epoch": 0.09820807158447324, + "grad_norm": 15.04539223274665, + "learning_rate": 4.971469574647569e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.38620689511299133, + "step": 97505 + }, + { + "epoch": 0.09821310763757742, + "grad_norm": 11.668143506291376, + "learning_rate": 4.971463625992863e-05, + "loss": 2.7448, + "mean_token_accuracy": 0.3724137932062149, + "step": 97510 + }, + { + "epoch": 0.09821814369068159, + "grad_norm": 9.032773450275338, + "learning_rate": 4.971457676722025e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.42068964838981626, + "step": 97515 + }, + { + "epoch": 0.09822317974378576, + "grad_norm": 11.333041564792559, + "learning_rate": 4.971451726835059e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.48411330580711365, + "step": 97520 + }, + { + "epoch": 0.09822821579688994, + "grad_norm": 20.78318693750358, + "learning_rate": 4.9714457763319647e-05, + "loss": 2.8879, + "mean_token_accuracy": 0.3482758581638336, + "step": 97525 + }, + { + "epoch": 0.09823325184999411, + "grad_norm": 10.053705962477762, + "learning_rate": 4.971439825212743e-05, + "loss": 2.6089, + "mean_token_accuracy": 0.42068964838981626, + "step": 97530 + }, + { + "epoch": 0.09823828790309828, + "grad_norm": 9.960216416667782, + "learning_rate": 4.971433873477398e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.3827586233615875, + "step": 97535 + }, + { + "epoch": 0.09824332395620246, + "grad_norm": 10.789542657871245, + "learning_rate": 4.971427921125929e-05, + "loss": 2.1866, + "mean_token_accuracy": 0.44337567687034607, + "step": 97540 + }, + { + "epoch": 0.09824836000930663, + "grad_norm": 13.201018749330451, + "learning_rate": 4.971421968158339e-05, + "loss": 2.6757, + "mean_token_accuracy": 0.3655172407627106, + "step": 97545 + }, + { + "epoch": 0.0982533960624108, + "grad_norm": 11.724652705249134, + "learning_rate": 4.97141601457463e-05, + "loss": 2.943, + "mean_token_accuracy": 0.32068966031074525, + "step": 97550 + }, + { + "epoch": 0.09825843211551497, + "grad_norm": 11.606750931432911, + "learning_rate": 4.971410060374802e-05, + "loss": 2.4199, + "mean_token_accuracy": 0.4206896543502808, + "step": 97555 + }, + { + "epoch": 0.09826346816861914, + "grad_norm": 11.937365107114122, + "learning_rate": 4.971404105558858e-05, + "loss": 2.7247, + "mean_token_accuracy": 0.3551724135875702, + "step": 97560 + }, + { + "epoch": 0.09826850422172331, + "grad_norm": 11.715178106022424, + "learning_rate": 4.9713981501268e-05, + "loss": 2.6865, + "mean_token_accuracy": 0.3827586114406586, + "step": 97565 + }, + { + "epoch": 0.09827354027482749, + "grad_norm": 15.544238260013042, + "learning_rate": 4.971392194078628e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.3896551728248596, + "step": 97570 + }, + { + "epoch": 0.09827857632793166, + "grad_norm": 11.724707568759483, + "learning_rate": 4.9713862374143446e-05, + "loss": 2.6605, + "mean_token_accuracy": 0.37368420958518983, + "step": 97575 + }, + { + "epoch": 0.09828361238103583, + "grad_norm": 11.083256266009073, + "learning_rate": 4.971380280133952e-05, + "loss": 2.6656, + "mean_token_accuracy": 0.4, + "step": 97580 + }, + { + "epoch": 0.09828864843414001, + "grad_norm": 15.486292743747654, + "learning_rate": 4.971374322237451e-05, + "loss": 2.3749, + "mean_token_accuracy": 0.41724138259887694, + "step": 97585 + }, + { + "epoch": 0.09829368448724418, + "grad_norm": 12.017135034034935, + "learning_rate": 4.971368363724844e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.36896551847457887, + "step": 97590 + }, + { + "epoch": 0.09829872054034836, + "grad_norm": 9.506074659347666, + "learning_rate": 4.971362404596131e-05, + "loss": 2.525, + "mean_token_accuracy": 0.43448275327682495, + "step": 97595 + }, + { + "epoch": 0.09830375659345253, + "grad_norm": 9.813523271529426, + "learning_rate": 4.971356444851316e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.4034482717514038, + "step": 97600 + }, + { + "epoch": 0.0983087926465567, + "grad_norm": 10.183859763990748, + "learning_rate": 4.971350484490399e-05, + "loss": 2.117, + "mean_token_accuracy": 0.4448275864124298, + "step": 97605 + }, + { + "epoch": 0.09831382869966088, + "grad_norm": 10.741336989471424, + "learning_rate": 4.971344523513382e-05, + "loss": 2.8036, + "mean_token_accuracy": 0.3379310339689255, + "step": 97610 + }, + { + "epoch": 0.09831886475276505, + "grad_norm": 10.665047545224013, + "learning_rate": 4.971338561920268e-05, + "loss": 2.4216, + "mean_token_accuracy": 0.4344827592372894, + "step": 97615 + }, + { + "epoch": 0.09832390080586922, + "grad_norm": 10.710897658826648, + "learning_rate": 4.9713325997110555e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.4568663060665131, + "step": 97620 + }, + { + "epoch": 0.09832893685897338, + "grad_norm": 10.963266200000158, + "learning_rate": 4.9713266368857496e-05, + "loss": 2.4876, + "mean_token_accuracy": 0.38275861740112305, + "step": 97625 + }, + { + "epoch": 0.09833397291207756, + "grad_norm": 11.854665488687933, + "learning_rate": 4.9713206734443505e-05, + "loss": 2.3924, + "mean_token_accuracy": 0.4137930989265442, + "step": 97630 + }, + { + "epoch": 0.09833900896518173, + "grad_norm": 11.901892481175224, + "learning_rate": 4.971314709386859e-05, + "loss": 2.1592, + "mean_token_accuracy": 0.4482758641242981, + "step": 97635 + }, + { + "epoch": 0.0983440450182859, + "grad_norm": 11.7973940481483, + "learning_rate": 4.971308744713278e-05, + "loss": 2.4243, + "mean_token_accuracy": 0.3827586233615875, + "step": 97640 + }, + { + "epoch": 0.09834908107139008, + "grad_norm": 11.551590428875171, + "learning_rate": 4.9713027794236086e-05, + "loss": 2.5728, + "mean_token_accuracy": 0.382758629322052, + "step": 97645 + }, + { + "epoch": 0.09835411712449425, + "grad_norm": 10.817075350852532, + "learning_rate": 4.9712968135178526e-05, + "loss": 2.8453, + "mean_token_accuracy": 0.3999999940395355, + "step": 97650 + }, + { + "epoch": 0.09835915317759843, + "grad_norm": 9.90386646038119, + "learning_rate": 4.9712908469960115e-05, + "loss": 1.9628, + "mean_token_accuracy": 0.5275862157344818, + "step": 97655 + }, + { + "epoch": 0.0983641892307026, + "grad_norm": 15.244139527243627, + "learning_rate": 4.9712848798580875e-05, + "loss": 2.3377, + "mean_token_accuracy": 0.46727163195610044, + "step": 97660 + }, + { + "epoch": 0.09836922528380677, + "grad_norm": 10.379719878210476, + "learning_rate": 4.971278912104082e-05, + "loss": 2.438, + "mean_token_accuracy": 0.3896551728248596, + "step": 97665 + }, + { + "epoch": 0.09837426133691095, + "grad_norm": 12.691440791058897, + "learning_rate": 4.971272943733996e-05, + "loss": 2.1843, + "mean_token_accuracy": 0.4503932178020477, + "step": 97670 + }, + { + "epoch": 0.09837929739001512, + "grad_norm": 10.156655678547976, + "learning_rate": 4.971266974747831e-05, + "loss": 2.43, + "mean_token_accuracy": 0.4137930989265442, + "step": 97675 + }, + { + "epoch": 0.0983843334431193, + "grad_norm": 11.188542692657098, + "learning_rate": 4.9712610051455906e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.45716878175735476, + "step": 97680 + }, + { + "epoch": 0.09838936949622347, + "grad_norm": 10.18426456430586, + "learning_rate": 4.971255034927275e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.44482758045196535, + "step": 97685 + }, + { + "epoch": 0.09839440554932764, + "grad_norm": 9.682478064778387, + "learning_rate": 4.971249064092886e-05, + "loss": 2.4859, + "mean_token_accuracy": 0.38275861740112305, + "step": 97690 + }, + { + "epoch": 0.0983994416024318, + "grad_norm": 20.967570590919298, + "learning_rate": 4.9712430926424245e-05, + "loss": 2.5718, + "mean_token_accuracy": 0.4448275864124298, + "step": 97695 + }, + { + "epoch": 0.09840447765553598, + "grad_norm": 11.277447443484027, + "learning_rate": 4.9712371205758935e-05, + "loss": 2.61, + "mean_token_accuracy": 0.44137929677963256, + "step": 97700 + }, + { + "epoch": 0.09840951370864015, + "grad_norm": 13.159311013004782, + "learning_rate": 4.971231147893294e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.4034482777118683, + "step": 97705 + }, + { + "epoch": 0.09841454976174432, + "grad_norm": 10.810680149560186, + "learning_rate": 4.971225174594628e-05, + "loss": 2.3309, + "mean_token_accuracy": 0.4620689690113068, + "step": 97710 + }, + { + "epoch": 0.0984195858148485, + "grad_norm": 12.993853314255958, + "learning_rate": 4.971219200679897e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.3724137842655182, + "step": 97715 + }, + { + "epoch": 0.09842462186795267, + "grad_norm": 12.271317359396697, + "learning_rate": 4.9712132261491026e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.4379310429096222, + "step": 97720 + }, + { + "epoch": 0.09842965792105685, + "grad_norm": 10.786675561259617, + "learning_rate": 4.971207251002246e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.4275862157344818, + "step": 97725 + }, + { + "epoch": 0.09843469397416102, + "grad_norm": 12.00821831828006, + "learning_rate": 4.9712012752393286e-05, + "loss": 2.3894, + "mean_token_accuracy": 0.42758620381355283, + "step": 97730 + }, + { + "epoch": 0.0984397300272652, + "grad_norm": 13.918423615708935, + "learning_rate": 4.971195298860354e-05, + "loss": 2.7863, + "mean_token_accuracy": 0.38620689511299133, + "step": 97735 + }, + { + "epoch": 0.09844476608036937, + "grad_norm": 11.310016190182642, + "learning_rate": 4.971189321865322e-05, + "loss": 2.3451, + "mean_token_accuracy": 0.3965517282485962, + "step": 97740 + }, + { + "epoch": 0.09844980213347354, + "grad_norm": 15.278417521748223, + "learning_rate": 4.971183344254235e-05, + "loss": 2.7006, + "mean_token_accuracy": 0.3655172407627106, + "step": 97745 + }, + { + "epoch": 0.09845483818657771, + "grad_norm": 11.850355035572427, + "learning_rate": 4.9711773660270944e-05, + "loss": 2.5265, + "mean_token_accuracy": 0.41379311084747317, + "step": 97750 + }, + { + "epoch": 0.09845987423968189, + "grad_norm": 10.381352584053383, + "learning_rate": 4.971171387183902e-05, + "loss": 2.3247, + "mean_token_accuracy": 0.4137930989265442, + "step": 97755 + }, + { + "epoch": 0.09846491029278606, + "grad_norm": 11.94405345979582, + "learning_rate": 4.97116540772466e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.4310344815254211, + "step": 97760 + }, + { + "epoch": 0.09846994634589022, + "grad_norm": 15.581402902773647, + "learning_rate": 4.9711594276493696e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.4206896543502808, + "step": 97765 + }, + { + "epoch": 0.0984749823989944, + "grad_norm": 10.066017039142059, + "learning_rate": 4.9711534469580314e-05, + "loss": 2.191, + "mean_token_accuracy": 0.42413792610168455, + "step": 97770 + }, + { + "epoch": 0.09848001845209857, + "grad_norm": 11.954761015831895, + "learning_rate": 4.971147465650648e-05, + "loss": 2.4863, + "mean_token_accuracy": 0.4034482777118683, + "step": 97775 + }, + { + "epoch": 0.09848505450520274, + "grad_norm": 9.946762291167051, + "learning_rate": 4.971141483727221e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.45517241954803467, + "step": 97780 + }, + { + "epoch": 0.09849009055830692, + "grad_norm": 13.102692373701913, + "learning_rate": 4.971135501187753e-05, + "loss": 2.0279, + "mean_token_accuracy": 0.5310344696044922, + "step": 97785 + }, + { + "epoch": 0.09849512661141109, + "grad_norm": 11.037640847425841, + "learning_rate": 4.9711295180322445e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.4068965554237366, + "step": 97790 + }, + { + "epoch": 0.09850016266451526, + "grad_norm": 11.567240324795689, + "learning_rate": 4.9711235342606973e-05, + "loss": 2.5298, + "mean_token_accuracy": 0.3999999940395355, + "step": 97795 + }, + { + "epoch": 0.09850519871761944, + "grad_norm": 10.827189769815602, + "learning_rate": 4.971117549873113e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.3862069010734558, + "step": 97800 + }, + { + "epoch": 0.09851023477072361, + "grad_norm": 13.12286126856711, + "learning_rate": 4.971111564869494e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.43448275327682495, + "step": 97805 + }, + { + "epoch": 0.09851527082382779, + "grad_norm": 9.393014220587276, + "learning_rate": 4.971105579249841e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.4185722887516022, + "step": 97810 + }, + { + "epoch": 0.09852030687693196, + "grad_norm": 10.319542079087064, + "learning_rate": 4.9710995930141555e-05, + "loss": 2.296, + "mean_token_accuracy": 0.41379310488700866, + "step": 97815 + }, + { + "epoch": 0.09852534293003613, + "grad_norm": 10.375331788137311, + "learning_rate": 4.97109360616244e-05, + "loss": 1.9967, + "mean_token_accuracy": 0.5084089457988739, + "step": 97820 + }, + { + "epoch": 0.09853037898314031, + "grad_norm": 12.003457196367076, + "learning_rate": 4.971087618694697e-05, + "loss": 2.629, + "mean_token_accuracy": 0.4068965554237366, + "step": 97825 + }, + { + "epoch": 0.09853541503624448, + "grad_norm": 9.031175134666386, + "learning_rate": 4.9710816306109266e-05, + "loss": 2.7957, + "mean_token_accuracy": 0.33793103098869326, + "step": 97830 + }, + { + "epoch": 0.09854045108934864, + "grad_norm": 10.478666577288436, + "learning_rate": 4.9710756419111304e-05, + "loss": 2.4722, + "mean_token_accuracy": 0.4172413766384125, + "step": 97835 + }, + { + "epoch": 0.09854548714245281, + "grad_norm": 8.791131633410815, + "learning_rate": 4.97106965259531e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.4517241418361664, + "step": 97840 + }, + { + "epoch": 0.09855052319555699, + "grad_norm": 25.177652745530988, + "learning_rate": 4.9710636626634686e-05, + "loss": 2.9007, + "mean_token_accuracy": 0.38620689511299133, + "step": 97845 + }, + { + "epoch": 0.09855555924866116, + "grad_norm": 15.418782117170883, + "learning_rate": 4.971057672115607e-05, + "loss": 2.5872, + "mean_token_accuracy": 0.4206896543502808, + "step": 97850 + }, + { + "epoch": 0.09856059530176534, + "grad_norm": 11.393417580420463, + "learning_rate": 4.971051680951727e-05, + "loss": 2.8021, + "mean_token_accuracy": 0.39655172228813174, + "step": 97855 + }, + { + "epoch": 0.09856563135486951, + "grad_norm": 9.521671743746897, + "learning_rate": 4.9710456891718297e-05, + "loss": 2.3299, + "mean_token_accuracy": 0.4413793087005615, + "step": 97860 + }, + { + "epoch": 0.09857066740797368, + "grad_norm": 10.66119952614436, + "learning_rate": 4.971039696775917e-05, + "loss": 2.3629, + "mean_token_accuracy": 0.4206896543502808, + "step": 97865 + }, + { + "epoch": 0.09857570346107786, + "grad_norm": 10.29864309607302, + "learning_rate": 4.9710337037639907e-05, + "loss": 2.4084, + "mean_token_accuracy": 0.39310344457626345, + "step": 97870 + }, + { + "epoch": 0.09858073951418203, + "grad_norm": 10.867251487601596, + "learning_rate": 4.9710277101360524e-05, + "loss": 2.5367, + "mean_token_accuracy": 0.3999999940395355, + "step": 97875 + }, + { + "epoch": 0.0985857755672862, + "grad_norm": 9.017821802684573, + "learning_rate": 4.971021715892104e-05, + "loss": 2.3609, + "mean_token_accuracy": 0.4275862127542496, + "step": 97880 + }, + { + "epoch": 0.09859081162039038, + "grad_norm": 13.5371312437017, + "learning_rate": 4.9710157210321465e-05, + "loss": 2.7683, + "mean_token_accuracy": 0.4000000059604645, + "step": 97885 + }, + { + "epoch": 0.09859584767349455, + "grad_norm": 12.831430711166034, + "learning_rate": 4.971009725556183e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.4344827592372894, + "step": 97890 + }, + { + "epoch": 0.09860088372659873, + "grad_norm": 9.899938036503844, + "learning_rate": 4.971003729464212e-05, + "loss": 2.1744, + "mean_token_accuracy": 0.45862067937850953, + "step": 97895 + }, + { + "epoch": 0.0986059197797029, + "grad_norm": 10.303391171010055, + "learning_rate": 4.970997732756239e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.3620689570903778, + "step": 97900 + }, + { + "epoch": 0.09861095583280706, + "grad_norm": 10.196640367325134, + "learning_rate": 4.970991735432264e-05, + "loss": 2.5658, + "mean_token_accuracy": 0.46896551847457885, + "step": 97905 + }, + { + "epoch": 0.09861599188591123, + "grad_norm": 10.427601331647349, + "learning_rate": 4.9709857374922884e-05, + "loss": 2.598, + "mean_token_accuracy": 0.42413793206214906, + "step": 97910 + }, + { + "epoch": 0.09862102793901541, + "grad_norm": 14.224260412360465, + "learning_rate": 4.970979738936314e-05, + "loss": 2.602, + "mean_token_accuracy": 0.40689654350280763, + "step": 97915 + }, + { + "epoch": 0.09862606399211958, + "grad_norm": 11.581395200303872, + "learning_rate": 4.970973739764342e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.38965516686439516, + "step": 97920 + }, + { + "epoch": 0.09863110004522375, + "grad_norm": 8.87365657375527, + "learning_rate": 4.970967739976375e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.41034482717514037, + "step": 97925 + }, + { + "epoch": 0.09863613609832793, + "grad_norm": 11.007757752120106, + "learning_rate": 4.970961739572416e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4172413766384125, + "step": 97930 + }, + { + "epoch": 0.0986411721514321, + "grad_norm": 12.790968680546733, + "learning_rate": 4.970955738552463e-05, + "loss": 2.6098, + "mean_token_accuracy": 0.3999999940395355, + "step": 97935 + }, + { + "epoch": 0.09864620820453628, + "grad_norm": 13.963999583814843, + "learning_rate": 4.97094973691652e-05, + "loss": 2.2513, + "mean_token_accuracy": 0.4689655125141144, + "step": 97940 + }, + { + "epoch": 0.09865124425764045, + "grad_norm": 11.49703928048607, + "learning_rate": 4.9709437346645886e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4310344934463501, + "step": 97945 + }, + { + "epoch": 0.09865628031074462, + "grad_norm": 13.004024432610764, + "learning_rate": 4.97093773179667e-05, + "loss": 2.6603, + "mean_token_accuracy": 0.37586206793785093, + "step": 97950 + }, + { + "epoch": 0.0986613163638488, + "grad_norm": 10.453636593894931, + "learning_rate": 4.970931728312766e-05, + "loss": 2.2709, + "mean_token_accuracy": 0.4, + "step": 97955 + }, + { + "epoch": 0.09866635241695297, + "grad_norm": 11.673354702653139, + "learning_rate": 4.970925724212879e-05, + "loss": 2.6566, + "mean_token_accuracy": 0.3862068891525269, + "step": 97960 + }, + { + "epoch": 0.09867138847005714, + "grad_norm": 14.311675181997712, + "learning_rate": 4.970919719497009e-05, + "loss": 2.5013, + "mean_token_accuracy": 0.3931034475564957, + "step": 97965 + }, + { + "epoch": 0.09867642452316132, + "grad_norm": 9.253007233229098, + "learning_rate": 4.97091371416516e-05, + "loss": 2.076, + "mean_token_accuracy": 0.4862068951129913, + "step": 97970 + }, + { + "epoch": 0.09868146057626548, + "grad_norm": 10.990643773768477, + "learning_rate": 4.9709077082173314e-05, + "loss": 2.6848, + "mean_token_accuracy": 0.44827585816383364, + "step": 97975 + }, + { + "epoch": 0.09868649662936965, + "grad_norm": 9.911742042713716, + "learning_rate": 4.970901701653525e-05, + "loss": 2.8209, + "mean_token_accuracy": 0.39655171930789945, + "step": 97980 + }, + { + "epoch": 0.09869153268247383, + "grad_norm": 10.11859020133486, + "learning_rate": 4.970895694473744e-05, + "loss": 2.3041, + "mean_token_accuracy": 0.41724138259887694, + "step": 97985 + }, + { + "epoch": 0.098696568735578, + "grad_norm": 11.270023606115382, + "learning_rate": 4.970889686677989e-05, + "loss": 2.159, + "mean_token_accuracy": 0.4620689570903778, + "step": 97990 + }, + { + "epoch": 0.09870160478868217, + "grad_norm": 13.942223548536774, + "learning_rate": 4.9708836782662624e-05, + "loss": 2.9282, + "mean_token_accuracy": 0.38275861740112305, + "step": 97995 + }, + { + "epoch": 0.09870664084178635, + "grad_norm": 11.859938131770873, + "learning_rate": 4.970877669238565e-05, + "loss": 2.6706, + "mean_token_accuracy": 0.44137930274009707, + "step": 98000 + }, + { + "epoch": 0.09871167689489052, + "grad_norm": 16.507703314031456, + "learning_rate": 4.970871659594899e-05, + "loss": 3.0387, + "mean_token_accuracy": 0.3793103456497192, + "step": 98005 + }, + { + "epoch": 0.0987167129479947, + "grad_norm": 11.24955043281426, + "learning_rate": 4.970865649335266e-05, + "loss": 2.6179, + "mean_token_accuracy": 0.42758620977401735, + "step": 98010 + }, + { + "epoch": 0.09872174900109887, + "grad_norm": 14.666138617772406, + "learning_rate": 4.9708596384596675e-05, + "loss": 2.8386, + "mean_token_accuracy": 0.3393829435110092, + "step": 98015 + }, + { + "epoch": 0.09872678505420304, + "grad_norm": 11.724975261497976, + "learning_rate": 4.970853626968106e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.39310344457626345, + "step": 98020 + }, + { + "epoch": 0.09873182110730722, + "grad_norm": 17.32955881392607, + "learning_rate": 4.970847614860582e-05, + "loss": 2.8313, + "mean_token_accuracy": 0.41379310190677643, + "step": 98025 + }, + { + "epoch": 0.09873685716041139, + "grad_norm": 12.770506308880238, + "learning_rate": 4.970841602137097e-05, + "loss": 2.7397, + "mean_token_accuracy": 0.4034482777118683, + "step": 98030 + }, + { + "epoch": 0.09874189321351556, + "grad_norm": 10.954857814938002, + "learning_rate": 4.9708355887976545e-05, + "loss": 2.4899, + "mean_token_accuracy": 0.39310344457626345, + "step": 98035 + }, + { + "epoch": 0.09874692926661974, + "grad_norm": 11.409355004776438, + "learning_rate": 4.9708295748422536e-05, + "loss": 2.3361, + "mean_token_accuracy": 0.41379310488700866, + "step": 98040 + }, + { + "epoch": 0.0987519653197239, + "grad_norm": 12.596639555162035, + "learning_rate": 4.9708235602708986e-05, + "loss": 3.2598, + "mean_token_accuracy": 0.3551724016666412, + "step": 98045 + }, + { + "epoch": 0.09875700137282807, + "grad_norm": 8.990840990409456, + "learning_rate": 4.970817545083589e-05, + "loss": 2.5519, + "mean_token_accuracy": 0.41379310488700866, + "step": 98050 + }, + { + "epoch": 0.09876203742593224, + "grad_norm": 19.859006178409228, + "learning_rate": 4.970811529280328e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.44912281036376955, + "step": 98055 + }, + { + "epoch": 0.09876707347903642, + "grad_norm": 9.138786415558275, + "learning_rate": 4.970805512861116e-05, + "loss": 2.3064, + "mean_token_accuracy": 0.4241379380226135, + "step": 98060 + }, + { + "epoch": 0.09877210953214059, + "grad_norm": 12.432783570971484, + "learning_rate": 4.9707994958259564e-05, + "loss": 2.8446, + "mean_token_accuracy": 0.41379310488700866, + "step": 98065 + }, + { + "epoch": 0.09877714558524477, + "grad_norm": 10.164596343937557, + "learning_rate": 4.970793478174849e-05, + "loss": 2.3907, + "mean_token_accuracy": 0.4379310369491577, + "step": 98070 + }, + { + "epoch": 0.09878218163834894, + "grad_norm": 11.089378520015082, + "learning_rate": 4.9707874599077965e-05, + "loss": 2.4744, + "mean_token_accuracy": 0.43448275327682495, + "step": 98075 + }, + { + "epoch": 0.09878721769145311, + "grad_norm": 11.312346405715486, + "learning_rate": 4.970781441024799e-05, + "loss": 2.597, + "mean_token_accuracy": 0.44827585816383364, + "step": 98080 + }, + { + "epoch": 0.09879225374455729, + "grad_norm": 12.592623145362236, + "learning_rate": 4.97077542152586e-05, + "loss": 2.7911, + "mean_token_accuracy": 0.36896551847457887, + "step": 98085 + }, + { + "epoch": 0.09879728979766146, + "grad_norm": 11.01910612485212, + "learning_rate": 4.970769401410982e-05, + "loss": 2.4545, + "mean_token_accuracy": 0.4379310429096222, + "step": 98090 + }, + { + "epoch": 0.09880232585076563, + "grad_norm": 10.050163366299188, + "learning_rate": 4.970763380680165e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.42758620381355283, + "step": 98095 + }, + { + "epoch": 0.09880736190386981, + "grad_norm": 15.22832408515365, + "learning_rate": 4.970757359333411e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.43793103098869324, + "step": 98100 + }, + { + "epoch": 0.09881239795697398, + "grad_norm": 9.79357039731364, + "learning_rate": 4.97075133737072e-05, + "loss": 2.1665, + "mean_token_accuracy": 0.48106473684310913, + "step": 98105 + }, + { + "epoch": 0.09881743401007816, + "grad_norm": 15.360350619818183, + "learning_rate": 4.970745314792097e-05, + "loss": 2.6582, + "mean_token_accuracy": 0.42413792610168455, + "step": 98110 + }, + { + "epoch": 0.09882247006318232, + "grad_norm": 10.53078678268063, + "learning_rate": 4.9707392915975414e-05, + "loss": 2.4484, + "mean_token_accuracy": 0.42413793206214906, + "step": 98115 + }, + { + "epoch": 0.09882750611628649, + "grad_norm": 13.59320778984477, + "learning_rate": 4.970733267787056e-05, + "loss": 2.385, + "mean_token_accuracy": 0.41379310488700866, + "step": 98120 + }, + { + "epoch": 0.09883254216939066, + "grad_norm": 11.517848992756017, + "learning_rate": 4.970727243360642e-05, + "loss": 2.8107, + "mean_token_accuracy": 0.40895341634750365, + "step": 98125 + }, + { + "epoch": 0.09883757822249484, + "grad_norm": 9.744572804387897, + "learning_rate": 4.970721218318301e-05, + "loss": 2.6388, + "mean_token_accuracy": 0.40689654350280763, + "step": 98130 + }, + { + "epoch": 0.09884261427559901, + "grad_norm": 11.930975430787674, + "learning_rate": 4.970715192660033e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.4103448212146759, + "step": 98135 + }, + { + "epoch": 0.09884765032870318, + "grad_norm": 9.386461575119368, + "learning_rate": 4.970709166385843e-05, + "loss": 2.5837, + "mean_token_accuracy": 0.43793103098869324, + "step": 98140 + }, + { + "epoch": 0.09885268638180736, + "grad_norm": 11.013691281426624, + "learning_rate": 4.97070313949573e-05, + "loss": 2.3803, + "mean_token_accuracy": 0.4172413647174835, + "step": 98145 + }, + { + "epoch": 0.09885772243491153, + "grad_norm": 12.339083280322672, + "learning_rate": 4.970697111989698e-05, + "loss": 2.7812, + "mean_token_accuracy": 0.3827586203813553, + "step": 98150 + }, + { + "epoch": 0.0988627584880157, + "grad_norm": 11.330990404560511, + "learning_rate": 4.9706910838677466e-05, + "loss": 2.6536, + "mean_token_accuracy": 0.37931033968925476, + "step": 98155 + }, + { + "epoch": 0.09886779454111988, + "grad_norm": 9.888103145798377, + "learning_rate": 4.9706850551298787e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.3896551728248596, + "step": 98160 + }, + { + "epoch": 0.09887283059422405, + "grad_norm": 10.522646339948139, + "learning_rate": 4.970679025776096e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4363581299781799, + "step": 98165 + }, + { + "epoch": 0.09887786664732823, + "grad_norm": 8.939621682336476, + "learning_rate": 4.970672995806398e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.4689655125141144, + "step": 98170 + }, + { + "epoch": 0.0988829027004324, + "grad_norm": 11.403875569504207, + "learning_rate": 4.9706669652207887e-05, + "loss": 2.3731, + "mean_token_accuracy": 0.41379310488700866, + "step": 98175 + }, + { + "epoch": 0.09888793875353658, + "grad_norm": 10.907963880037839, + "learning_rate": 4.97066093401927e-05, + "loss": 2.7625, + "mean_token_accuracy": 0.4643073260784149, + "step": 98180 + }, + { + "epoch": 0.09889297480664073, + "grad_norm": 10.387843828286515, + "learning_rate": 4.9706549022018415e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.4310344815254211, + "step": 98185 + }, + { + "epoch": 0.09889801085974491, + "grad_norm": 11.351449294036794, + "learning_rate": 4.970648869768506e-05, + "loss": 2.462, + "mean_token_accuracy": 0.3896551728248596, + "step": 98190 + }, + { + "epoch": 0.09890304691284908, + "grad_norm": 11.59278996563145, + "learning_rate": 4.9706428367192664e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.35172414481639863, + "step": 98195 + }, + { + "epoch": 0.09890808296595326, + "grad_norm": 10.314298294382988, + "learning_rate": 4.970636803054123e-05, + "loss": 2.5392, + "mean_token_accuracy": 0.3827586114406586, + "step": 98200 + }, + { + "epoch": 0.09891311901905743, + "grad_norm": 9.053597490023474, + "learning_rate": 4.970630768773077e-05, + "loss": 1.9558, + "mean_token_accuracy": 0.5087719321250915, + "step": 98205 + }, + { + "epoch": 0.0989181550721616, + "grad_norm": 14.524493455425674, + "learning_rate": 4.970624733876132e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.4034482777118683, + "step": 98210 + }, + { + "epoch": 0.09892319112526578, + "grad_norm": 9.162183929573207, + "learning_rate": 4.970618698363288e-05, + "loss": 2.4739, + "mean_token_accuracy": 0.42068964838981626, + "step": 98215 + }, + { + "epoch": 0.09892822717836995, + "grad_norm": 9.973446366697797, + "learning_rate": 4.9706126622345463e-05, + "loss": 2.4182, + "mean_token_accuracy": 0.46206897497177124, + "step": 98220 + }, + { + "epoch": 0.09893326323147413, + "grad_norm": 9.174188685983182, + "learning_rate": 4.97060662548991e-05, + "loss": 2.3595, + "mean_token_accuracy": 0.43793103098869324, + "step": 98225 + }, + { + "epoch": 0.0989382992845783, + "grad_norm": 10.347456211126978, + "learning_rate": 4.97060058812938e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.420689657330513, + "step": 98230 + }, + { + "epoch": 0.09894333533768247, + "grad_norm": 15.330849747475884, + "learning_rate": 4.9705945501529575e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.4, + "step": 98235 + }, + { + "epoch": 0.09894837139078665, + "grad_norm": 11.123840969212207, + "learning_rate": 4.970588511560645e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.48784029483795166, + "step": 98240 + }, + { + "epoch": 0.09895340744389082, + "grad_norm": 10.05710093112814, + "learning_rate": 4.970582472352445e-05, + "loss": 2.4511, + "mean_token_accuracy": 0.4310344815254211, + "step": 98245 + }, + { + "epoch": 0.098958443496995, + "grad_norm": 12.27344087661297, + "learning_rate": 4.9705764325283585e-05, + "loss": 2.3802, + "mean_token_accuracy": 0.46551724076271056, + "step": 98250 + }, + { + "epoch": 0.09896347955009915, + "grad_norm": 12.304931155812197, + "learning_rate": 4.970570392088385e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.4379310369491577, + "step": 98255 + }, + { + "epoch": 0.09896851560320333, + "grad_norm": 14.218439567165651, + "learning_rate": 4.9705643510325294e-05, + "loss": 2.7131, + "mean_token_accuracy": 0.3999999940395355, + "step": 98260 + }, + { + "epoch": 0.0989735516563075, + "grad_norm": 13.361735695200595, + "learning_rate": 4.9705583093607916e-05, + "loss": 2.3466, + "mean_token_accuracy": 0.4931034505367279, + "step": 98265 + }, + { + "epoch": 0.09897858770941168, + "grad_norm": 11.242552604930378, + "learning_rate": 4.9705522670731734e-05, + "loss": 2.2203, + "mean_token_accuracy": 0.458620685338974, + "step": 98270 + }, + { + "epoch": 0.09898362376251585, + "grad_norm": 10.9032269436806, + "learning_rate": 4.970546224169677e-05, + "loss": 2.2533, + "mean_token_accuracy": 0.4, + "step": 98275 + }, + { + "epoch": 0.09898865981562002, + "grad_norm": 9.387443345083136, + "learning_rate": 4.970540180650304e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.3896551728248596, + "step": 98280 + }, + { + "epoch": 0.0989936958687242, + "grad_norm": 9.763117654702796, + "learning_rate": 4.9705341365150556e-05, + "loss": 2.4596, + "mean_token_accuracy": 0.43284936547279357, + "step": 98285 + }, + { + "epoch": 0.09899873192182837, + "grad_norm": 10.49590051133532, + "learning_rate": 4.970528091763934e-05, + "loss": 2.1148, + "mean_token_accuracy": 0.47931034564971925, + "step": 98290 + }, + { + "epoch": 0.09900376797493254, + "grad_norm": 10.496333006310019, + "learning_rate": 4.970522046396941e-05, + "loss": 2.1725, + "mean_token_accuracy": 0.44827587008476255, + "step": 98295 + }, + { + "epoch": 0.09900880402803672, + "grad_norm": 9.364964119123947, + "learning_rate": 4.9705160004140775e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.45517241954803467, + "step": 98300 + }, + { + "epoch": 0.09901384008114089, + "grad_norm": 10.62952092212202, + "learning_rate": 4.9705099538153455e-05, + "loss": 2.5531, + "mean_token_accuracy": 0.3862069010734558, + "step": 98305 + }, + { + "epoch": 0.09901887613424507, + "grad_norm": 11.544604815851127, + "learning_rate": 4.970503906600747e-05, + "loss": 2.5351, + "mean_token_accuracy": 0.39310344457626345, + "step": 98310 + }, + { + "epoch": 0.09902391218734924, + "grad_norm": 7.919368350375854, + "learning_rate": 4.970497858770283e-05, + "loss": 2.303, + "mean_token_accuracy": 0.43448275327682495, + "step": 98315 + }, + { + "epoch": 0.09902894824045341, + "grad_norm": 12.790521448469477, + "learning_rate": 4.970491810323956e-05, + "loss": 2.6252, + "mean_token_accuracy": 0.3827586233615875, + "step": 98320 + }, + { + "epoch": 0.09903398429355757, + "grad_norm": 10.20745370132174, + "learning_rate": 4.970485761261767e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.43448275327682495, + "step": 98325 + }, + { + "epoch": 0.09903902034666175, + "grad_norm": 13.440856080388821, + "learning_rate": 4.970479711583719e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.3896551728248596, + "step": 98330 + }, + { + "epoch": 0.09904405639976592, + "grad_norm": 11.491372423211995, + "learning_rate": 4.970473661289811e-05, + "loss": 2.3748, + "mean_token_accuracy": 0.4068965554237366, + "step": 98335 + }, + { + "epoch": 0.0990490924528701, + "grad_norm": 9.348850771782654, + "learning_rate": 4.970467610380048e-05, + "loss": 2.2375, + "mean_token_accuracy": 0.4517241418361664, + "step": 98340 + }, + { + "epoch": 0.09905412850597427, + "grad_norm": 11.857151642450845, + "learning_rate": 4.970461558854429e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.4034482777118683, + "step": 98345 + }, + { + "epoch": 0.09905916455907844, + "grad_norm": 8.29150731066686, + "learning_rate": 4.970455506712957e-05, + "loss": 2.3929, + "mean_token_accuracy": 0.41954023241996763, + "step": 98350 + }, + { + "epoch": 0.09906420061218262, + "grad_norm": 9.823777042874413, + "learning_rate": 4.970449453955634e-05, + "loss": 2.1681, + "mean_token_accuracy": 0.44482759237289426, + "step": 98355 + }, + { + "epoch": 0.09906923666528679, + "grad_norm": 8.476104390782211, + "learning_rate": 4.97044340058246e-05, + "loss": 1.9027, + "mean_token_accuracy": 0.5241379261016845, + "step": 98360 + }, + { + "epoch": 0.09907427271839096, + "grad_norm": 13.772664323795611, + "learning_rate": 4.970437346593438e-05, + "loss": 2.767, + "mean_token_accuracy": 0.3241379290819168, + "step": 98365 + }, + { + "epoch": 0.09907930877149514, + "grad_norm": 10.752918980724479, + "learning_rate": 4.970431291988569e-05, + "loss": 2.544, + "mean_token_accuracy": 0.4068965494632721, + "step": 98370 + }, + { + "epoch": 0.09908434482459931, + "grad_norm": 10.844680386209507, + "learning_rate": 4.9704252367678564e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.40689654350280763, + "step": 98375 + }, + { + "epoch": 0.09908938087770348, + "grad_norm": 10.56764299324792, + "learning_rate": 4.9704191809313e-05, + "loss": 2.5656, + "mean_token_accuracy": 0.358620685338974, + "step": 98380 + }, + { + "epoch": 0.09909441693080766, + "grad_norm": 9.967996859520062, + "learning_rate": 4.970413124478902e-05, + "loss": 2.5736, + "mean_token_accuracy": 0.4, + "step": 98385 + }, + { + "epoch": 0.09909945298391183, + "grad_norm": 10.648253726024706, + "learning_rate": 4.970407067410664e-05, + "loss": 2.6266, + "mean_token_accuracy": 0.4137930989265442, + "step": 98390 + }, + { + "epoch": 0.09910448903701599, + "grad_norm": 10.567841801304708, + "learning_rate": 4.9704010097265875e-05, + "loss": 2.7533, + "mean_token_accuracy": 0.38620689809322356, + "step": 98395 + }, + { + "epoch": 0.09910952509012017, + "grad_norm": 11.095193333315816, + "learning_rate": 4.970394951426675e-05, + "loss": 2.3154, + "mean_token_accuracy": 0.4413793087005615, + "step": 98400 + }, + { + "epoch": 0.09911456114322434, + "grad_norm": 12.353686485128089, + "learning_rate": 4.970388892510928e-05, + "loss": 2.5274, + "mean_token_accuracy": 0.40840895771980285, + "step": 98405 + }, + { + "epoch": 0.09911959719632851, + "grad_norm": 9.309233916156364, + "learning_rate": 4.970382832979347e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.4413793087005615, + "step": 98410 + }, + { + "epoch": 0.09912463324943269, + "grad_norm": 11.282905550993819, + "learning_rate": 4.970376772831935e-05, + "loss": 2.3098, + "mean_token_accuracy": 0.44827587008476255, + "step": 98415 + }, + { + "epoch": 0.09912966930253686, + "grad_norm": 10.321442875593423, + "learning_rate": 4.970370712068693e-05, + "loss": 2.09, + "mean_token_accuracy": 0.4724137902259827, + "step": 98420 + }, + { + "epoch": 0.09913470535564103, + "grad_norm": 9.718331729705111, + "learning_rate": 4.970364650689624e-05, + "loss": 2.4611, + "mean_token_accuracy": 0.3862068891525269, + "step": 98425 + }, + { + "epoch": 0.09913974140874521, + "grad_norm": 14.211198740007548, + "learning_rate": 4.9703585886947276e-05, + "loss": 2.9202, + "mean_token_accuracy": 0.3965517282485962, + "step": 98430 + }, + { + "epoch": 0.09914477746184938, + "grad_norm": 10.057099281748014, + "learning_rate": 4.970352526084005e-05, + "loss": 2.5545, + "mean_token_accuracy": 0.3896551787853241, + "step": 98435 + }, + { + "epoch": 0.09914981351495356, + "grad_norm": 10.89137713310155, + "learning_rate": 4.9703464628574616e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.3999999940395355, + "step": 98440 + }, + { + "epoch": 0.09915484956805773, + "grad_norm": 9.354170961846618, + "learning_rate": 4.970340399015096e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.37241379022598264, + "step": 98445 + }, + { + "epoch": 0.0991598856211619, + "grad_norm": 9.514754293177226, + "learning_rate": 4.970334334556911e-05, + "loss": 2.0579, + "mean_token_accuracy": 0.4551724135875702, + "step": 98450 + }, + { + "epoch": 0.09916492167426608, + "grad_norm": 10.115899460275848, + "learning_rate": 4.970328269482908e-05, + "loss": 2.6012, + "mean_token_accuracy": 0.4275862157344818, + "step": 98455 + }, + { + "epoch": 0.09916995772737025, + "grad_norm": 12.03177884342683, + "learning_rate": 4.970322203793088e-05, + "loss": 2.4224, + "mean_token_accuracy": 0.42758620381355283, + "step": 98460 + }, + { + "epoch": 0.09917499378047441, + "grad_norm": 9.718838390403258, + "learning_rate": 4.970316137487454e-05, + "loss": 2.109, + "mean_token_accuracy": 0.4724137902259827, + "step": 98465 + }, + { + "epoch": 0.09918002983357858, + "grad_norm": 7.7876160230351985, + "learning_rate": 4.9703100705660074e-05, + "loss": 2.815, + "mean_token_accuracy": 0.4379310369491577, + "step": 98470 + }, + { + "epoch": 0.09918506588668276, + "grad_norm": 12.051391339796783, + "learning_rate": 4.9703040030287484e-05, + "loss": 2.841, + "mean_token_accuracy": 0.3275862097740173, + "step": 98475 + }, + { + "epoch": 0.09919010193978693, + "grad_norm": 9.741272709894394, + "learning_rate": 4.970297934875681e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.39836661219596864, + "step": 98480 + }, + { + "epoch": 0.0991951379928911, + "grad_norm": 11.84357379264377, + "learning_rate": 4.970291866106804e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.44137930274009707, + "step": 98485 + }, + { + "epoch": 0.09920017404599528, + "grad_norm": 12.384175045590043, + "learning_rate": 4.970285796722123e-05, + "loss": 2.3696, + "mean_token_accuracy": 0.4068965494632721, + "step": 98490 + }, + { + "epoch": 0.09920521009909945, + "grad_norm": 11.285704162737877, + "learning_rate": 4.970279726721636e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.4068965554237366, + "step": 98495 + }, + { + "epoch": 0.09921024615220363, + "grad_norm": 10.984177641200109, + "learning_rate": 4.970273656105346e-05, + "loss": 3.292, + "mean_token_accuracy": 0.3620689660310745, + "step": 98500 + }, + { + "epoch": 0.0992152822053078, + "grad_norm": 10.499486834300223, + "learning_rate": 4.9702675848732555e-05, + "loss": 2.7665, + "mean_token_accuracy": 0.4159104585647583, + "step": 98505 + }, + { + "epoch": 0.09922031825841197, + "grad_norm": 11.926402628258648, + "learning_rate": 4.970261513025365e-05, + "loss": 2.7236, + "mean_token_accuracy": 0.39310344457626345, + "step": 98510 + }, + { + "epoch": 0.09922535431151615, + "grad_norm": 13.497427441574592, + "learning_rate": 4.970255440561677e-05, + "loss": 2.678, + "mean_token_accuracy": 0.4068965494632721, + "step": 98515 + }, + { + "epoch": 0.09923039036462032, + "grad_norm": 10.192882297896023, + "learning_rate": 4.9702493674821935e-05, + "loss": 2.3648, + "mean_token_accuracy": 0.42758620977401735, + "step": 98520 + }, + { + "epoch": 0.0992354264177245, + "grad_norm": 14.420117220228178, + "learning_rate": 4.970243293786914e-05, + "loss": 2.8603, + "mean_token_accuracy": 0.3965517282485962, + "step": 98525 + }, + { + "epoch": 0.09924046247082867, + "grad_norm": 9.781530864769534, + "learning_rate": 4.970237219475843e-05, + "loss": 2.9612, + "mean_token_accuracy": 0.3827586233615875, + "step": 98530 + }, + { + "epoch": 0.09924549852393283, + "grad_norm": 10.016367391422348, + "learning_rate": 4.97023114454898e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.4103448212146759, + "step": 98535 + }, + { + "epoch": 0.099250534577037, + "grad_norm": 12.911669335938276, + "learning_rate": 4.9702250690063285e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.3862069010734558, + "step": 98540 + }, + { + "epoch": 0.09925557063014118, + "grad_norm": 11.094196147909896, + "learning_rate": 4.970218992847889e-05, + "loss": 2.2788, + "mean_token_accuracy": 0.4482758641242981, + "step": 98545 + }, + { + "epoch": 0.09926060668324535, + "grad_norm": 10.743605574524937, + "learning_rate": 4.970212916073663e-05, + "loss": 2.724, + "mean_token_accuracy": 0.37586205899715425, + "step": 98550 + }, + { + "epoch": 0.09926564273634952, + "grad_norm": 8.44018118026029, + "learning_rate": 4.970206838683653e-05, + "loss": 2.2113, + "mean_token_accuracy": 0.4689655125141144, + "step": 98555 + }, + { + "epoch": 0.0992706787894537, + "grad_norm": 12.436463688893426, + "learning_rate": 4.97020076067786e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.42413792610168455, + "step": 98560 + }, + { + "epoch": 0.09927571484255787, + "grad_norm": 11.250473691738149, + "learning_rate": 4.9701946820562875e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.47931033968925474, + "step": 98565 + }, + { + "epoch": 0.09928075089566205, + "grad_norm": 13.219721867856176, + "learning_rate": 4.970188602818934e-05, + "loss": 2.7902, + "mean_token_accuracy": 0.34827586710453035, + "step": 98570 + }, + { + "epoch": 0.09928578694876622, + "grad_norm": 10.866393075138312, + "learning_rate": 4.9701825229658036e-05, + "loss": 2.0123, + "mean_token_accuracy": 0.5137931048870087, + "step": 98575 + }, + { + "epoch": 0.09929082300187039, + "grad_norm": 11.065729230389186, + "learning_rate": 4.970176442496898e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.4344827592372894, + "step": 98580 + }, + { + "epoch": 0.09929585905497457, + "grad_norm": 11.691243329008165, + "learning_rate": 4.970170361412217e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.4103448212146759, + "step": 98585 + }, + { + "epoch": 0.09930089510807874, + "grad_norm": 9.518201721618828, + "learning_rate": 4.9701642797117635e-05, + "loss": 2.637, + "mean_token_accuracy": 0.3620689630508423, + "step": 98590 + }, + { + "epoch": 0.09930593116118291, + "grad_norm": 11.566564809877233, + "learning_rate": 4.97015819739554e-05, + "loss": 2.6406, + "mean_token_accuracy": 0.37241379022598264, + "step": 98595 + }, + { + "epoch": 0.09931096721428709, + "grad_norm": 12.10777567611062, + "learning_rate": 4.9701521144635466e-05, + "loss": 2.3623, + "mean_token_accuracy": 0.4206896543502808, + "step": 98600 + }, + { + "epoch": 0.09931600326739125, + "grad_norm": 10.330323316124542, + "learning_rate": 4.970146030915786e-05, + "loss": 2.0646, + "mean_token_accuracy": 0.47586206197738645, + "step": 98605 + }, + { + "epoch": 0.09932103932049542, + "grad_norm": 11.049019002518808, + "learning_rate": 4.97013994675226e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4172413766384125, + "step": 98610 + }, + { + "epoch": 0.0993260753735996, + "grad_norm": 10.953651094109697, + "learning_rate": 4.97013386197297e-05, + "loss": 2.5149, + "mean_token_accuracy": 0.4551724076271057, + "step": 98615 + }, + { + "epoch": 0.09933111142670377, + "grad_norm": 10.5106908839289, + "learning_rate": 4.970127776577917e-05, + "loss": 2.5463, + "mean_token_accuracy": 0.43793103098869324, + "step": 98620 + }, + { + "epoch": 0.09933614747980794, + "grad_norm": 9.949187433789927, + "learning_rate": 4.9701216905671034e-05, + "loss": 2.5424, + "mean_token_accuracy": 0.3931034505367279, + "step": 98625 + }, + { + "epoch": 0.09934118353291212, + "grad_norm": 10.674849677636555, + "learning_rate": 4.970115603940531e-05, + "loss": 2.4712, + "mean_token_accuracy": 0.41379310488700866, + "step": 98630 + }, + { + "epoch": 0.09934621958601629, + "grad_norm": 10.822906127121689, + "learning_rate": 4.970109516698201e-05, + "loss": 2.3629, + "mean_token_accuracy": 0.5, + "step": 98635 + }, + { + "epoch": 0.09935125563912046, + "grad_norm": 9.107292054737528, + "learning_rate": 4.970103428840115e-05, + "loss": 2.7903, + "mean_token_accuracy": 0.34482758343219755, + "step": 98640 + }, + { + "epoch": 0.09935629169222464, + "grad_norm": 12.27485698258569, + "learning_rate": 4.970097340366276e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.37586207389831544, + "step": 98645 + }, + { + "epoch": 0.09936132774532881, + "grad_norm": 14.601480566352635, + "learning_rate": 4.9700912512766834e-05, + "loss": 2.9752, + "mean_token_accuracy": 0.40532365441322327, + "step": 98650 + }, + { + "epoch": 0.09936636379843299, + "grad_norm": 12.298621456984765, + "learning_rate": 4.970085161571341e-05, + "loss": 2.5744, + "mean_token_accuracy": 0.39310344457626345, + "step": 98655 + }, + { + "epoch": 0.09937139985153716, + "grad_norm": 12.642852450677413, + "learning_rate": 4.97007907125025e-05, + "loss": 2.7501, + "mean_token_accuracy": 0.4156079888343811, + "step": 98660 + }, + { + "epoch": 0.09937643590464133, + "grad_norm": 10.632012229562127, + "learning_rate": 4.970072980313411e-05, + "loss": 2.668, + "mean_token_accuracy": 0.3931034505367279, + "step": 98665 + }, + { + "epoch": 0.0993814719577455, + "grad_norm": 10.964231559771315, + "learning_rate": 4.970066888760827e-05, + "loss": 2.9092, + "mean_token_accuracy": 0.3931034505367279, + "step": 98670 + }, + { + "epoch": 0.09938650801084967, + "grad_norm": 9.934552541215675, + "learning_rate": 4.9700607965924995e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.37586206793785093, + "step": 98675 + }, + { + "epoch": 0.09939154406395384, + "grad_norm": 10.661615742983955, + "learning_rate": 4.9700547038084294e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.4379310369491577, + "step": 98680 + }, + { + "epoch": 0.09939658011705801, + "grad_norm": 10.721069147172011, + "learning_rate": 4.970048610408619e-05, + "loss": 2.3045, + "mean_token_accuracy": 0.4551724135875702, + "step": 98685 + }, + { + "epoch": 0.09940161617016219, + "grad_norm": 10.82883695641102, + "learning_rate": 4.9700425163930695e-05, + "loss": 2.4394, + "mean_token_accuracy": 0.39655172228813174, + "step": 98690 + }, + { + "epoch": 0.09940665222326636, + "grad_norm": 12.265849840191644, + "learning_rate": 4.9700364217617825e-05, + "loss": 2.4691, + "mean_token_accuracy": 0.42068964838981626, + "step": 98695 + }, + { + "epoch": 0.09941168827637054, + "grad_norm": 11.248099816361371, + "learning_rate": 4.9700303265147615e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.44827585816383364, + "step": 98700 + }, + { + "epoch": 0.09941672432947471, + "grad_norm": 12.595841418237173, + "learning_rate": 4.970024230652006e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.4137930989265442, + "step": 98705 + }, + { + "epoch": 0.09942176038257888, + "grad_norm": 11.077531968957656, + "learning_rate": 4.970018134173519e-05, + "loss": 2.4496, + "mean_token_accuracy": 0.4034482777118683, + "step": 98710 + }, + { + "epoch": 0.09942679643568306, + "grad_norm": 9.681898887377212, + "learning_rate": 4.9700120370793006e-05, + "loss": 2.2035, + "mean_token_accuracy": 0.4517241358757019, + "step": 98715 + }, + { + "epoch": 0.09943183248878723, + "grad_norm": 10.118208067177969, + "learning_rate": 4.970005939369355e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.4206896543502808, + "step": 98720 + }, + { + "epoch": 0.0994368685418914, + "grad_norm": 11.467393157413877, + "learning_rate": 4.969999841043682e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.4103448331356049, + "step": 98725 + }, + { + "epoch": 0.09944190459499558, + "grad_norm": 10.59435410237419, + "learning_rate": 4.9699937421022825e-05, + "loss": 2.1589, + "mean_token_accuracy": 0.4379310369491577, + "step": 98730 + }, + { + "epoch": 0.09944694064809975, + "grad_norm": 11.176840101673928, + "learning_rate": 4.969987642545161e-05, + "loss": 2.5739, + "mean_token_accuracy": 0.3999999940395355, + "step": 98735 + }, + { + "epoch": 0.09945197670120393, + "grad_norm": 12.273939670179729, + "learning_rate": 4.9699815423723166e-05, + "loss": 2.67, + "mean_token_accuracy": 0.36896551251411436, + "step": 98740 + }, + { + "epoch": 0.09945701275430809, + "grad_norm": 10.198438594014455, + "learning_rate": 4.969975441583753e-05, + "loss": 2.7478, + "mean_token_accuracy": 0.39310344457626345, + "step": 98745 + }, + { + "epoch": 0.09946204880741226, + "grad_norm": 11.766708095950554, + "learning_rate": 4.96996934017947e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.4430732011795044, + "step": 98750 + }, + { + "epoch": 0.09946708486051643, + "grad_norm": 15.5827839047036, + "learning_rate": 4.969963238159471e-05, + "loss": 2.8499, + "mean_token_accuracy": 0.39310344457626345, + "step": 98755 + }, + { + "epoch": 0.0994721209136206, + "grad_norm": 11.220868741327068, + "learning_rate": 4.969957135523756e-05, + "loss": 2.4575, + "mean_token_accuracy": 0.3999999940395355, + "step": 98760 + }, + { + "epoch": 0.09947715696672478, + "grad_norm": 10.630225877909318, + "learning_rate": 4.969951032272329e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.4103448331356049, + "step": 98765 + }, + { + "epoch": 0.09948219301982895, + "grad_norm": 10.800337562717154, + "learning_rate": 4.969944928405189e-05, + "loss": 2.1977, + "mean_token_accuracy": 0.4344827651977539, + "step": 98770 + }, + { + "epoch": 0.09948722907293313, + "grad_norm": 10.357729836771824, + "learning_rate": 4.9699388239223396e-05, + "loss": 2.2495, + "mean_token_accuracy": 0.4535995125770569, + "step": 98775 + }, + { + "epoch": 0.0994922651260373, + "grad_norm": 11.569148634465867, + "learning_rate": 4.969932718823781e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4034482777118683, + "step": 98780 + }, + { + "epoch": 0.09949730117914148, + "grad_norm": 10.0707680592349, + "learning_rate": 4.969926613109517e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.437931028008461, + "step": 98785 + }, + { + "epoch": 0.09950233723224565, + "grad_norm": 8.64780139545705, + "learning_rate": 4.969920506779548e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4560344755649567, + "step": 98790 + }, + { + "epoch": 0.09950737328534982, + "grad_norm": 13.284293459716656, + "learning_rate": 4.969914399833875e-05, + "loss": 2.6859, + "mean_token_accuracy": 0.358620685338974, + "step": 98795 + }, + { + "epoch": 0.099512409338454, + "grad_norm": 11.627583222495725, + "learning_rate": 4.969908292272501e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.436539626121521, + "step": 98800 + }, + { + "epoch": 0.09951744539155817, + "grad_norm": 11.520644698152573, + "learning_rate": 4.969902184095428e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.45517241954803467, + "step": 98805 + }, + { + "epoch": 0.09952248144466234, + "grad_norm": 10.412683323961023, + "learning_rate": 4.9698960753026544e-05, + "loss": 2.4295, + "mean_token_accuracy": 0.3931034505367279, + "step": 98810 + }, + { + "epoch": 0.0995275174977665, + "grad_norm": 10.654100452719224, + "learning_rate": 4.9698899658941865e-05, + "loss": 2.1985, + "mean_token_accuracy": 0.4379310429096222, + "step": 98815 + }, + { + "epoch": 0.09953255355087068, + "grad_norm": 10.042910275665275, + "learning_rate": 4.9698838558700225e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.4413793087005615, + "step": 98820 + }, + { + "epoch": 0.09953758960397485, + "grad_norm": 12.211300078064497, + "learning_rate": 4.9698777452301665e-05, + "loss": 2.9837, + "mean_token_accuracy": 0.3551724076271057, + "step": 98825 + }, + { + "epoch": 0.09954262565707903, + "grad_norm": 11.414060841156932, + "learning_rate": 4.9698716339746185e-05, + "loss": 2.8429, + "mean_token_accuracy": 0.3551724165678024, + "step": 98830 + }, + { + "epoch": 0.0995476617101832, + "grad_norm": 9.296741925534977, + "learning_rate": 4.969865522103381e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4413793087005615, + "step": 98835 + }, + { + "epoch": 0.09955269776328737, + "grad_norm": 10.075177711016169, + "learning_rate": 4.969859409616456e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.4482758641242981, + "step": 98840 + }, + { + "epoch": 0.09955773381639155, + "grad_norm": 10.44830127364626, + "learning_rate": 4.969853296513844e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.4068965554237366, + "step": 98845 + }, + { + "epoch": 0.09956276986949572, + "grad_norm": 18.1209026497701, + "learning_rate": 4.969847182795548e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.4482758641242981, + "step": 98850 + }, + { + "epoch": 0.0995678059225999, + "grad_norm": 13.347163491758097, + "learning_rate": 4.9698410684615687e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.39655172228813174, + "step": 98855 + }, + { + "epoch": 0.09957284197570407, + "grad_norm": 10.474832666221184, + "learning_rate": 4.969834953511908e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.39655172228813174, + "step": 98860 + }, + { + "epoch": 0.09957787802880824, + "grad_norm": 11.970620745948679, + "learning_rate": 4.969828837946568e-05, + "loss": 2.438, + "mean_token_accuracy": 0.4068965494632721, + "step": 98865 + }, + { + "epoch": 0.09958291408191242, + "grad_norm": 11.337109254039637, + "learning_rate": 4.96982272176555e-05, + "loss": 2.2941, + "mean_token_accuracy": 0.4620689690113068, + "step": 98870 + }, + { + "epoch": 0.09958795013501659, + "grad_norm": 10.791998404263058, + "learning_rate": 4.9698166049688564e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.4206896543502808, + "step": 98875 + }, + { + "epoch": 0.09959298618812076, + "grad_norm": 8.127306335718513, + "learning_rate": 4.9698104875564875e-05, + "loss": 2.1692, + "mean_token_accuracy": 0.4413793087005615, + "step": 98880 + }, + { + "epoch": 0.09959802224122492, + "grad_norm": 10.160648262308854, + "learning_rate": 4.9698043695284476e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.41034482717514037, + "step": 98885 + }, + { + "epoch": 0.0996030582943291, + "grad_norm": 9.432822650485459, + "learning_rate": 4.969798250884735e-05, + "loss": 2.0272, + "mean_token_accuracy": 0.4965517222881317, + "step": 98890 + }, + { + "epoch": 0.09960809434743327, + "grad_norm": 12.200758439472196, + "learning_rate": 4.969792131625353e-05, + "loss": 2.8701, + "mean_token_accuracy": 0.3362976461648941, + "step": 98895 + }, + { + "epoch": 0.09961313040053744, + "grad_norm": 8.700086054407246, + "learning_rate": 4.969786011750305e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.4658197224140167, + "step": 98900 + }, + { + "epoch": 0.09961816645364162, + "grad_norm": 11.710281529437616, + "learning_rate": 4.969779891259589e-05, + "loss": 2.7498, + "mean_token_accuracy": 0.36896551847457887, + "step": 98905 + }, + { + "epoch": 0.09962320250674579, + "grad_norm": 18.58801103588291, + "learning_rate": 4.9697737701532106e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.3931034505367279, + "step": 98910 + }, + { + "epoch": 0.09962823855984997, + "grad_norm": 10.457774100455532, + "learning_rate": 4.9697676484311685e-05, + "loss": 2.2683, + "mean_token_accuracy": 0.4206896543502808, + "step": 98915 + }, + { + "epoch": 0.09963327461295414, + "grad_norm": 10.021663273790406, + "learning_rate": 4.9697615260934663e-05, + "loss": 2.3746, + "mean_token_accuracy": 0.44827587008476255, + "step": 98920 + }, + { + "epoch": 0.09963831066605831, + "grad_norm": 9.49497681748817, + "learning_rate": 4.969755403140105e-05, + "loss": 2.0187, + "mean_token_accuracy": 0.4965517222881317, + "step": 98925 + }, + { + "epoch": 0.09964334671916249, + "grad_norm": 12.515823715786759, + "learning_rate": 4.9697492795710856e-05, + "loss": 2.545, + "mean_token_accuracy": 0.37586206793785093, + "step": 98930 + }, + { + "epoch": 0.09964838277226666, + "grad_norm": 11.375614846554585, + "learning_rate": 4.9697431553864105e-05, + "loss": 2.6744, + "mean_token_accuracy": 0.3944948613643646, + "step": 98935 + }, + { + "epoch": 0.09965341882537083, + "grad_norm": 10.139094090256808, + "learning_rate": 4.969737030586081e-05, + "loss": 2.1591, + "mean_token_accuracy": 0.4448275864124298, + "step": 98940 + }, + { + "epoch": 0.09965845487847501, + "grad_norm": 10.883906244462077, + "learning_rate": 4.969730905170101e-05, + "loss": 2.7957, + "mean_token_accuracy": 0.3999999940395355, + "step": 98945 + }, + { + "epoch": 0.09966349093157917, + "grad_norm": 10.373030960042904, + "learning_rate": 4.969724779138469e-05, + "loss": 2.1725, + "mean_token_accuracy": 0.4172413796186447, + "step": 98950 + }, + { + "epoch": 0.09966852698468334, + "grad_norm": 10.48641410286295, + "learning_rate": 4.969718652491188e-05, + "loss": 2.3644, + "mean_token_accuracy": 0.458620685338974, + "step": 98955 + }, + { + "epoch": 0.09967356303778752, + "grad_norm": 12.134083299054232, + "learning_rate": 4.969712525228259e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.44482758045196535, + "step": 98960 + }, + { + "epoch": 0.09967859909089169, + "grad_norm": 11.871116464171761, + "learning_rate": 4.969706397349686e-05, + "loss": 2.6052, + "mean_token_accuracy": 0.4119177281856537, + "step": 98965 + }, + { + "epoch": 0.09968363514399586, + "grad_norm": 11.380105868826336, + "learning_rate": 4.969700268855468e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.43793103098869324, + "step": 98970 + }, + { + "epoch": 0.09968867119710004, + "grad_norm": 9.727769067454384, + "learning_rate": 4.969694139745609e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.4275861978530884, + "step": 98975 + }, + { + "epoch": 0.09969370725020421, + "grad_norm": 13.115409471336744, + "learning_rate": 4.9696880100201084e-05, + "loss": 2.7524, + "mean_token_accuracy": 0.4172413766384125, + "step": 98980 + }, + { + "epoch": 0.09969874330330838, + "grad_norm": 9.869143397132234, + "learning_rate": 4.96968187967897e-05, + "loss": 2.3224, + "mean_token_accuracy": 0.41379310488700866, + "step": 98985 + }, + { + "epoch": 0.09970377935641256, + "grad_norm": 12.666317483410737, + "learning_rate": 4.9696757487221934e-05, + "loss": 2.5547, + "mean_token_accuracy": 0.4310344815254211, + "step": 98990 + }, + { + "epoch": 0.09970881540951673, + "grad_norm": 14.222433735127392, + "learning_rate": 4.969669617149783e-05, + "loss": 2.4814, + "mean_token_accuracy": 0.42413792610168455, + "step": 98995 + }, + { + "epoch": 0.0997138514626209, + "grad_norm": 11.10356258419414, + "learning_rate": 4.9696634849617374e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.4, + "step": 99000 + }, + { + "epoch": 0.09971888751572508, + "grad_norm": 9.593355543892944, + "learning_rate": 4.969657352158061e-05, + "loss": 2.6309, + "mean_token_accuracy": 0.40689654350280763, + "step": 99005 + }, + { + "epoch": 0.09972392356882925, + "grad_norm": 10.782492613959077, + "learning_rate": 4.9696512187387534e-05, + "loss": 2.5121, + "mean_token_accuracy": 0.37586206793785093, + "step": 99010 + }, + { + "epoch": 0.09972895962193343, + "grad_norm": 10.535287164253207, + "learning_rate": 4.969645084703818e-05, + "loss": 2.214, + "mean_token_accuracy": 0.4517241418361664, + "step": 99015 + }, + { + "epoch": 0.09973399567503759, + "grad_norm": 9.258756670458862, + "learning_rate": 4.9696389500532555e-05, + "loss": 2.2542, + "mean_token_accuracy": 0.3931034505367279, + "step": 99020 + }, + { + "epoch": 0.09973903172814176, + "grad_norm": 12.422026695781394, + "learning_rate": 4.969632814787068e-05, + "loss": 2.8272, + "mean_token_accuracy": 0.3655172407627106, + "step": 99025 + }, + { + "epoch": 0.09974406778124593, + "grad_norm": 10.561823931093555, + "learning_rate": 4.969626678905257e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.4103448331356049, + "step": 99030 + }, + { + "epoch": 0.09974910383435011, + "grad_norm": 8.580780370886467, + "learning_rate": 4.969620542407824e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.38965516686439516, + "step": 99035 + }, + { + "epoch": 0.09975413988745428, + "grad_norm": 10.58446338342712, + "learning_rate": 4.9696144052947705e-05, + "loss": 2.1629, + "mean_token_accuracy": 0.40689654350280763, + "step": 99040 + }, + { + "epoch": 0.09975917594055846, + "grad_norm": 15.544189544015795, + "learning_rate": 4.9696082675661e-05, + "loss": 2.5914, + "mean_token_accuracy": 0.4401088833808899, + "step": 99045 + }, + { + "epoch": 0.09976421199366263, + "grad_norm": 10.989792785526587, + "learning_rate": 4.969602129221812e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.39830611646175385, + "step": 99050 + }, + { + "epoch": 0.0997692480467668, + "grad_norm": 10.672310492075967, + "learning_rate": 4.969595990261909e-05, + "loss": 2.7451, + "mean_token_accuracy": 0.3862069010734558, + "step": 99055 + }, + { + "epoch": 0.09977428409987098, + "grad_norm": 11.742391124157088, + "learning_rate": 4.9695898506863934e-05, + "loss": 2.9543, + "mean_token_accuracy": 0.3931034505367279, + "step": 99060 + }, + { + "epoch": 0.09977932015297515, + "grad_norm": 12.094649350930446, + "learning_rate": 4.969583710495266e-05, + "loss": 2.6948, + "mean_token_accuracy": 0.39655172228813174, + "step": 99065 + }, + { + "epoch": 0.09978435620607932, + "grad_norm": 12.232624536325778, + "learning_rate": 4.969577569688529e-05, + "loss": 2.452, + "mean_token_accuracy": 0.4137930929660797, + "step": 99070 + }, + { + "epoch": 0.0997893922591835, + "grad_norm": 10.933116284129689, + "learning_rate": 4.969571428266183e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.36551723778247835, + "step": 99075 + }, + { + "epoch": 0.09979442831228767, + "grad_norm": 7.915417517343722, + "learning_rate": 4.969565286228231e-05, + "loss": 2.4086, + "mean_token_accuracy": 0.4620689570903778, + "step": 99080 + }, + { + "epoch": 0.09979946436539185, + "grad_norm": 10.46099070616217, + "learning_rate": 4.969559143574675e-05, + "loss": 2.1768, + "mean_token_accuracy": 0.4517241299152374, + "step": 99085 + }, + { + "epoch": 0.099804500418496, + "grad_norm": 13.41693857158804, + "learning_rate": 4.969553000305516e-05, + "loss": 2.8919, + "mean_token_accuracy": 0.32758620381355286, + "step": 99090 + }, + { + "epoch": 0.09980953647160018, + "grad_norm": 12.503043460188392, + "learning_rate": 4.9695468564207546e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.3862068891525269, + "step": 99095 + }, + { + "epoch": 0.09981457252470435, + "grad_norm": 12.459121011245854, + "learning_rate": 4.969540711920394e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.42220205068588257, + "step": 99100 + }, + { + "epoch": 0.09981960857780853, + "grad_norm": 13.892588678278305, + "learning_rate": 4.969534566804436e-05, + "loss": 2.0729, + "mean_token_accuracy": 0.4655172348022461, + "step": 99105 + }, + { + "epoch": 0.0998246446309127, + "grad_norm": 12.195619404815158, + "learning_rate": 4.969528421072881e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.3482758581638336, + "step": 99110 + }, + { + "epoch": 0.09982968068401687, + "grad_norm": 12.23996210337998, + "learning_rate": 4.9695222747257316e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.43448275327682495, + "step": 99115 + }, + { + "epoch": 0.09983471673712105, + "grad_norm": 10.417373315229511, + "learning_rate": 4.969516127762991e-05, + "loss": 2.872, + "mean_token_accuracy": 0.3655172407627106, + "step": 99120 + }, + { + "epoch": 0.09983975279022522, + "grad_norm": 11.708786220345328, + "learning_rate": 4.969509980184657e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.42758620381355283, + "step": 99125 + }, + { + "epoch": 0.0998447888433294, + "grad_norm": 14.886931717542952, + "learning_rate": 4.969503831990735e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.41034482717514037, + "step": 99130 + }, + { + "epoch": 0.09984982489643357, + "grad_norm": 11.219473864698278, + "learning_rate": 4.969497683181225e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.39655172228813174, + "step": 99135 + }, + { + "epoch": 0.09985486094953774, + "grad_norm": 14.21772172548677, + "learning_rate": 4.969491533756129e-05, + "loss": 2.6834, + "mean_token_accuracy": 0.3620689630508423, + "step": 99140 + }, + { + "epoch": 0.09985989700264192, + "grad_norm": 11.36825899213656, + "learning_rate": 4.9694853837154485e-05, + "loss": 2.564, + "mean_token_accuracy": 0.41379310488700866, + "step": 99145 + }, + { + "epoch": 0.09986493305574609, + "grad_norm": 10.392873837240568, + "learning_rate": 4.969479233059186e-05, + "loss": 2.433, + "mean_token_accuracy": 0.4, + "step": 99150 + }, + { + "epoch": 0.09986996910885027, + "grad_norm": 12.594430542438904, + "learning_rate": 4.9694730817873424e-05, + "loss": 2.6588, + "mean_token_accuracy": 0.3551724076271057, + "step": 99155 + }, + { + "epoch": 0.09987500516195442, + "grad_norm": 11.355689336299601, + "learning_rate": 4.9694669298999194e-05, + "loss": 2.3361, + "mean_token_accuracy": 0.3981851160526276, + "step": 99160 + }, + { + "epoch": 0.0998800412150586, + "grad_norm": 8.462548672991622, + "learning_rate": 4.9694607773969196e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.4310344815254211, + "step": 99165 + }, + { + "epoch": 0.09988507726816277, + "grad_norm": 11.166572742782451, + "learning_rate": 4.9694546242783443e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.436539626121521, + "step": 99170 + }, + { + "epoch": 0.09989011332126695, + "grad_norm": 13.443203151636299, + "learning_rate": 4.9694484705441944e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.44482759237289426, + "step": 99175 + }, + { + "epoch": 0.09989514937437112, + "grad_norm": 11.619874225175279, + "learning_rate": 4.9694423161944716e-05, + "loss": 2.3325, + "mean_token_accuracy": 0.4344827592372894, + "step": 99180 + }, + { + "epoch": 0.0999001854274753, + "grad_norm": 9.716694246877593, + "learning_rate": 4.969436161229178e-05, + "loss": 2.0969, + "mean_token_accuracy": 0.47931033968925474, + "step": 99185 + }, + { + "epoch": 0.09990522148057947, + "grad_norm": 9.276625419870708, + "learning_rate": 4.9694300056483175e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.42601330280303956, + "step": 99190 + }, + { + "epoch": 0.09991025753368364, + "grad_norm": 10.682665418150888, + "learning_rate": 4.969423849451888e-05, + "loss": 2.4852, + "mean_token_accuracy": 0.4448275983333588, + "step": 99195 + }, + { + "epoch": 0.09991529358678782, + "grad_norm": 11.8784768354687, + "learning_rate": 4.969417692639894e-05, + "loss": 2.6568, + "mean_token_accuracy": 0.4, + "step": 99200 + }, + { + "epoch": 0.09992032963989199, + "grad_norm": 10.728917227615135, + "learning_rate": 4.969411535212336e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.41379310488700866, + "step": 99205 + }, + { + "epoch": 0.09992536569299616, + "grad_norm": 10.011199717063104, + "learning_rate": 4.969405377169216e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.4344827592372894, + "step": 99210 + }, + { + "epoch": 0.09993040174610034, + "grad_norm": 9.797106060114677, + "learning_rate": 4.969399218510535e-05, + "loss": 2.2509, + "mean_token_accuracy": 0.48415002822875974, + "step": 99215 + }, + { + "epoch": 0.09993543779920451, + "grad_norm": 10.213380519425609, + "learning_rate": 4.969393059236296e-05, + "loss": 2.2472, + "mean_token_accuracy": 0.47241380214691164, + "step": 99220 + }, + { + "epoch": 0.09994047385230868, + "grad_norm": 10.298990486925344, + "learning_rate": 4.9693868993464996e-05, + "loss": 2.5746, + "mean_token_accuracy": 0.41379310488700866, + "step": 99225 + }, + { + "epoch": 0.09994550990541284, + "grad_norm": 10.810960491940095, + "learning_rate": 4.969380738841148e-05, + "loss": 2.4035, + "mean_token_accuracy": 0.4103448331356049, + "step": 99230 + }, + { + "epoch": 0.09995054595851702, + "grad_norm": 10.777615275321356, + "learning_rate": 4.969374577720243e-05, + "loss": 2.759, + "mean_token_accuracy": 0.39310343861579894, + "step": 99235 + }, + { + "epoch": 0.09995558201162119, + "grad_norm": 10.221440041771125, + "learning_rate": 4.969368415983786e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.4689655125141144, + "step": 99240 + }, + { + "epoch": 0.09996061806472537, + "grad_norm": 16.346927451575883, + "learning_rate": 4.96936225363178e-05, + "loss": 2.7581, + "mean_token_accuracy": 0.4068965554237366, + "step": 99245 + }, + { + "epoch": 0.09996565411782954, + "grad_norm": 14.641648106751774, + "learning_rate": 4.9693560906642246e-05, + "loss": 2.6693, + "mean_token_accuracy": 0.3827586233615875, + "step": 99250 + }, + { + "epoch": 0.09997069017093371, + "grad_norm": 11.980772030032853, + "learning_rate": 4.9693499270811224e-05, + "loss": 2.5957, + "mean_token_accuracy": 0.3793103456497192, + "step": 99255 + }, + { + "epoch": 0.09997572622403789, + "grad_norm": 10.21698409410971, + "learning_rate": 4.969343762882475e-05, + "loss": 2.3181, + "mean_token_accuracy": 0.4620689630508423, + "step": 99260 + }, + { + "epoch": 0.09998076227714206, + "grad_norm": 13.716931006099163, + "learning_rate": 4.9693375980682846e-05, + "loss": 2.6615, + "mean_token_accuracy": 0.40139141082763674, + "step": 99265 + }, + { + "epoch": 0.09998579833024623, + "grad_norm": 12.44162635338195, + "learning_rate": 4.969331432638553e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4034482717514038, + "step": 99270 + }, + { + "epoch": 0.09999083438335041, + "grad_norm": 9.823905401134374, + "learning_rate": 4.969325266593281e-05, + "loss": 2.0817, + "mean_token_accuracy": 0.4862069010734558, + "step": 99275 + }, + { + "epoch": 0.09999587043645458, + "grad_norm": 15.011565730512642, + "learning_rate": 4.969319099932472e-05, + "loss": 2.6554, + "mean_token_accuracy": 0.3655172407627106, + "step": 99280 + }, + { + "epoch": 0.10000090648955876, + "grad_norm": 12.407264718586683, + "learning_rate": 4.9693129326561254e-05, + "loss": 2.2656, + "mean_token_accuracy": 0.4758620738983154, + "step": 99285 + }, + { + "epoch": 0.10000594254266293, + "grad_norm": 10.960894213379845, + "learning_rate": 4.969306764764245e-05, + "loss": 2.7494, + "mean_token_accuracy": 0.44482758045196535, + "step": 99290 + }, + { + "epoch": 0.1000109785957671, + "grad_norm": 10.37501664405137, + "learning_rate": 4.969300596256831e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.41379310488700866, + "step": 99295 + }, + { + "epoch": 0.10001601464887126, + "grad_norm": 13.114081328634484, + "learning_rate": 4.969294427133885e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.4344827592372894, + "step": 99300 + }, + { + "epoch": 0.10002105070197544, + "grad_norm": 10.529368929388264, + "learning_rate": 4.96928825739541e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.4310344815254211, + "step": 99305 + }, + { + "epoch": 0.10002608675507961, + "grad_norm": 10.295379670241816, + "learning_rate": 4.969282087041408e-05, + "loss": 2.4375, + "mean_token_accuracy": 0.41034482717514037, + "step": 99310 + }, + { + "epoch": 0.10003112280818378, + "grad_norm": 10.464335594527368, + "learning_rate": 4.96927591607188e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.4517241418361664, + "step": 99315 + }, + { + "epoch": 0.10003615886128796, + "grad_norm": 11.732558757512262, + "learning_rate": 4.969269744486826e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.42758620381355283, + "step": 99320 + }, + { + "epoch": 0.10004119491439213, + "grad_norm": 8.823232934067352, + "learning_rate": 4.969263572286249e-05, + "loss": 2.3381, + "mean_token_accuracy": 0.4781004309654236, + "step": 99325 + }, + { + "epoch": 0.1000462309674963, + "grad_norm": 11.20387134333757, + "learning_rate": 4.9692573994701526e-05, + "loss": 2.4972, + "mean_token_accuracy": 0.4172413766384125, + "step": 99330 + }, + { + "epoch": 0.10005126702060048, + "grad_norm": 12.471100555508237, + "learning_rate": 4.969251226038537e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.4137930989265442, + "step": 99335 + }, + { + "epoch": 0.10005630307370465, + "grad_norm": 11.70143422867852, + "learning_rate": 4.9692450519914016e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.42758620381355283, + "step": 99340 + }, + { + "epoch": 0.10006133912680883, + "grad_norm": 11.763850201073302, + "learning_rate": 4.969238877328752e-05, + "loss": 2.5607, + "mean_token_accuracy": 0.3793103456497192, + "step": 99345 + }, + { + "epoch": 0.100066375179913, + "grad_norm": 9.956199229471707, + "learning_rate": 4.969232702050588e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.43793103098869324, + "step": 99350 + }, + { + "epoch": 0.10007141123301717, + "grad_norm": 11.909644105979483, + "learning_rate": 4.969226526156912e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.4103448331356049, + "step": 99355 + }, + { + "epoch": 0.10007644728612135, + "grad_norm": 10.416608854649859, + "learning_rate": 4.969220349647724e-05, + "loss": 3.2346, + "mean_token_accuracy": 0.38669951260089874, + "step": 99360 + }, + { + "epoch": 0.10008148333922552, + "grad_norm": 11.33488239024833, + "learning_rate": 4.969214172523027e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.482758629322052, + "step": 99365 + }, + { + "epoch": 0.10008651939232968, + "grad_norm": 24.836189889071715, + "learning_rate": 4.969207994782824e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.41724138259887694, + "step": 99370 + }, + { + "epoch": 0.10009155544543386, + "grad_norm": 14.293277722426808, + "learning_rate": 4.969201816427115e-05, + "loss": 2.7279, + "mean_token_accuracy": 0.4068965494632721, + "step": 99375 + }, + { + "epoch": 0.10009659149853803, + "grad_norm": 9.531946835537436, + "learning_rate": 4.969195637455901e-05, + "loss": 2.488, + "mean_token_accuracy": 0.4517241358757019, + "step": 99380 + }, + { + "epoch": 0.1001016275516422, + "grad_norm": 12.535947009106794, + "learning_rate": 4.969189457869186e-05, + "loss": 2.6647, + "mean_token_accuracy": 0.36896551549434664, + "step": 99385 + }, + { + "epoch": 0.10010666360474638, + "grad_norm": 10.522443380350985, + "learning_rate": 4.96918327766697e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.40157289505004884, + "step": 99390 + }, + { + "epoch": 0.10011169965785055, + "grad_norm": 14.99927374364409, + "learning_rate": 4.9691770968492554e-05, + "loss": 2.6767, + "mean_token_accuracy": 0.4, + "step": 99395 + }, + { + "epoch": 0.10011673571095472, + "grad_norm": 9.714847932169736, + "learning_rate": 4.9691709154160446e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.41724138259887694, + "step": 99400 + }, + { + "epoch": 0.1001217717640589, + "grad_norm": 10.828901569134635, + "learning_rate": 4.969164733367337e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.4517241418361664, + "step": 99405 + }, + { + "epoch": 0.10012680781716307, + "grad_norm": 14.692788611917804, + "learning_rate": 4.9691585507031364e-05, + "loss": 2.644, + "mean_token_accuracy": 0.37586207389831544, + "step": 99410 + }, + { + "epoch": 0.10013184387026725, + "grad_norm": 9.609449728062376, + "learning_rate": 4.969152367423444e-05, + "loss": 2.3803, + "mean_token_accuracy": 0.3793103456497192, + "step": 99415 + }, + { + "epoch": 0.10013687992337142, + "grad_norm": 11.084672178078666, + "learning_rate": 4.969146183528262e-05, + "loss": 3.0063, + "mean_token_accuracy": 0.31724137663841245, + "step": 99420 + }, + { + "epoch": 0.10014191597647559, + "grad_norm": 9.833188228212133, + "learning_rate": 4.96913999901759e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.39655172228813174, + "step": 99425 + }, + { + "epoch": 0.10014695202957977, + "grad_norm": 10.536684350389349, + "learning_rate": 4.969133813891432e-05, + "loss": 2.3567, + "mean_token_accuracy": 0.4568663060665131, + "step": 99430 + }, + { + "epoch": 0.10015198808268394, + "grad_norm": 10.12832787088772, + "learning_rate": 4.9691276281497886e-05, + "loss": 2.012, + "mean_token_accuracy": 0.4862068951129913, + "step": 99435 + }, + { + "epoch": 0.1001570241357881, + "grad_norm": 9.492752696885939, + "learning_rate": 4.9691214417926627e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.4241379380226135, + "step": 99440 + }, + { + "epoch": 0.10016206018889227, + "grad_norm": 9.385173944734856, + "learning_rate": 4.969115254820055e-05, + "loss": 2.1055, + "mean_token_accuracy": 0.4517241358757019, + "step": 99445 + }, + { + "epoch": 0.10016709624199645, + "grad_norm": 11.413520320317394, + "learning_rate": 4.9691090672319665e-05, + "loss": 2.5107, + "mean_token_accuracy": 0.4241379201412201, + "step": 99450 + }, + { + "epoch": 0.10017213229510062, + "grad_norm": 11.141921549739099, + "learning_rate": 4.969102879028401e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.42758620381355283, + "step": 99455 + }, + { + "epoch": 0.1001771683482048, + "grad_norm": 10.42985438905199, + "learning_rate": 4.969096690209358e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.36896551847457887, + "step": 99460 + }, + { + "epoch": 0.10018220440130897, + "grad_norm": 9.851768781429033, + "learning_rate": 4.969090500774841e-05, + "loss": 2.1708, + "mean_token_accuracy": 0.4724137902259827, + "step": 99465 + }, + { + "epoch": 0.10018724045441314, + "grad_norm": 12.710849824936062, + "learning_rate": 4.9690843107248506e-05, + "loss": 2.4126, + "mean_token_accuracy": 0.4068965494632721, + "step": 99470 + }, + { + "epoch": 0.10019227650751732, + "grad_norm": 11.26834253617972, + "learning_rate": 4.969078120059389e-05, + "loss": 2.5826, + "mean_token_accuracy": 0.41893526911735535, + "step": 99475 + }, + { + "epoch": 0.10019731256062149, + "grad_norm": 10.91557208926437, + "learning_rate": 4.969071928778458e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.38965516686439516, + "step": 99480 + }, + { + "epoch": 0.10020234861372566, + "grad_norm": 10.298171689786855, + "learning_rate": 4.9690657368820586e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.441379314661026, + "step": 99485 + }, + { + "epoch": 0.10020738466682984, + "grad_norm": 9.876065869837042, + "learning_rate": 4.9690595443701934e-05, + "loss": 2.6354, + "mean_token_accuracy": 0.3896551728248596, + "step": 99490 + }, + { + "epoch": 0.10021242071993401, + "grad_norm": 11.965323352654787, + "learning_rate": 4.9690533512428646e-05, + "loss": 2.7009, + "mean_token_accuracy": 0.4034482777118683, + "step": 99495 + }, + { + "epoch": 0.10021745677303819, + "grad_norm": 10.491704515193655, + "learning_rate": 4.969047157500072e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.4535390198230743, + "step": 99500 + }, + { + "epoch": 0.10022249282614236, + "grad_norm": 12.164310889053505, + "learning_rate": 4.969040963141818e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.4344827592372894, + "step": 99505 + }, + { + "epoch": 0.10022752887924652, + "grad_norm": 9.39579182548424, + "learning_rate": 4.969034768168105e-05, + "loss": 2.138, + "mean_token_accuracy": 0.4551724135875702, + "step": 99510 + }, + { + "epoch": 0.10023256493235069, + "grad_norm": 9.574914082172173, + "learning_rate": 4.969028572578936e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.42413793206214906, + "step": 99515 + }, + { + "epoch": 0.10023760098545487, + "grad_norm": 9.946400805329093, + "learning_rate": 4.96902237637431e-05, + "loss": 2.4621, + "mean_token_accuracy": 0.43793103098869324, + "step": 99520 + }, + { + "epoch": 0.10024263703855904, + "grad_norm": 11.502755861119628, + "learning_rate": 4.9690161795542296e-05, + "loss": 2.6756, + "mean_token_accuracy": 0.4103448301553726, + "step": 99525 + }, + { + "epoch": 0.10024767309166321, + "grad_norm": 10.894927413446235, + "learning_rate": 4.969009982118696e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.4586206912994385, + "step": 99530 + }, + { + "epoch": 0.10025270914476739, + "grad_norm": 11.71608452219711, + "learning_rate": 4.969003784067713e-05, + "loss": 3.0159, + "mean_token_accuracy": 0.36551724672317504, + "step": 99535 + }, + { + "epoch": 0.10025774519787156, + "grad_norm": 13.482145955317348, + "learning_rate": 4.968997585401281e-05, + "loss": 3.3523, + "mean_token_accuracy": 0.33793103098869326, + "step": 99540 + }, + { + "epoch": 0.10026278125097574, + "grad_norm": 11.384578393157817, + "learning_rate": 4.968991386119401e-05, + "loss": 2.6681, + "mean_token_accuracy": 0.4068965494632721, + "step": 99545 + }, + { + "epoch": 0.10026781730407991, + "grad_norm": 15.57004416562115, + "learning_rate": 4.968985186222076e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.4551724076271057, + "step": 99550 + }, + { + "epoch": 0.10027285335718408, + "grad_norm": 11.220674992885856, + "learning_rate": 4.968978985709307e-05, + "loss": 2.6111, + "mean_token_accuracy": 0.4034482777118683, + "step": 99555 + }, + { + "epoch": 0.10027788941028826, + "grad_norm": 9.803040998907047, + "learning_rate": 4.9689727845810963e-05, + "loss": 2.6092, + "mean_token_accuracy": 0.38620689511299133, + "step": 99560 + }, + { + "epoch": 0.10028292546339243, + "grad_norm": 9.819617832136297, + "learning_rate": 4.968966582837445e-05, + "loss": 2.1829, + "mean_token_accuracy": 0.4482758641242981, + "step": 99565 + }, + { + "epoch": 0.1002879615164966, + "grad_norm": 9.916826461897802, + "learning_rate": 4.9689603804783555e-05, + "loss": 2.556, + "mean_token_accuracy": 0.4068965494632721, + "step": 99570 + }, + { + "epoch": 0.10029299756960078, + "grad_norm": 12.731052894230308, + "learning_rate": 4.9689541775038275e-05, + "loss": 2.7417, + "mean_token_accuracy": 0.3569267988204956, + "step": 99575 + }, + { + "epoch": 0.10029803362270494, + "grad_norm": 8.770093561316267, + "learning_rate": 4.968947973913866e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.3999999940395355, + "step": 99580 + }, + { + "epoch": 0.10030306967580911, + "grad_norm": 9.81834980751313, + "learning_rate": 4.968941769708471e-05, + "loss": 2.3344, + "mean_token_accuracy": 0.4310344815254211, + "step": 99585 + }, + { + "epoch": 0.10030810572891329, + "grad_norm": 11.10464612153755, + "learning_rate": 4.968935564887642e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.38620689511299133, + "step": 99590 + }, + { + "epoch": 0.10031314178201746, + "grad_norm": 13.626264747798231, + "learning_rate": 4.9689293594513845e-05, + "loss": 2.442, + "mean_token_accuracy": 0.43448275327682495, + "step": 99595 + }, + { + "epoch": 0.10031817783512163, + "grad_norm": 10.122868081030846, + "learning_rate": 4.968923153399699e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.45862069725990295, + "step": 99600 + }, + { + "epoch": 0.1003232138882258, + "grad_norm": 12.359637982835508, + "learning_rate": 4.9689169467325865e-05, + "loss": 2.7726, + "mean_token_accuracy": 0.4, + "step": 99605 + }, + { + "epoch": 0.10032824994132998, + "grad_norm": 9.404707539610003, + "learning_rate": 4.968910739450049e-05, + "loss": 2.7852, + "mean_token_accuracy": 0.39655172228813174, + "step": 99610 + }, + { + "epoch": 0.10033328599443415, + "grad_norm": 12.151819712117094, + "learning_rate": 4.9689045315520885e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.39655172228813174, + "step": 99615 + }, + { + "epoch": 0.10033832204753833, + "grad_norm": 11.971543134397399, + "learning_rate": 4.9688983230387065e-05, + "loss": 2.2037, + "mean_token_accuracy": 0.4413793087005615, + "step": 99620 + }, + { + "epoch": 0.1003433581006425, + "grad_norm": 13.329918985317477, + "learning_rate": 4.968892113909905e-05, + "loss": 2.4947, + "mean_token_accuracy": 0.38965516686439516, + "step": 99625 + }, + { + "epoch": 0.10034839415374668, + "grad_norm": 11.99320884643853, + "learning_rate": 4.968885904165685e-05, + "loss": 2.959, + "mean_token_accuracy": 0.3724137842655182, + "step": 99630 + }, + { + "epoch": 0.10035343020685085, + "grad_norm": 12.994686342589713, + "learning_rate": 4.968879693806049e-05, + "loss": 2.1923, + "mean_token_accuracy": 0.4620689690113068, + "step": 99635 + }, + { + "epoch": 0.10035846625995502, + "grad_norm": 10.495589776917498, + "learning_rate": 4.9688734828309986e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.39800362586975097, + "step": 99640 + }, + { + "epoch": 0.1003635023130592, + "grad_norm": 11.950593110949752, + "learning_rate": 4.968867271240535e-05, + "loss": 2.3871, + "mean_token_accuracy": 0.4034482717514038, + "step": 99645 + }, + { + "epoch": 0.10036853836616336, + "grad_norm": 11.154201821138878, + "learning_rate": 4.968861059034661e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4379310369491577, + "step": 99650 + }, + { + "epoch": 0.10037357441926753, + "grad_norm": 10.790384052673996, + "learning_rate": 4.9688548462133764e-05, + "loss": 2.283, + "mean_token_accuracy": 0.43103448748588563, + "step": 99655 + }, + { + "epoch": 0.1003786104723717, + "grad_norm": 10.31027914186145, + "learning_rate": 4.9688486327766845e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.4034482777118683, + "step": 99660 + }, + { + "epoch": 0.10038364652547588, + "grad_norm": 9.813072005090763, + "learning_rate": 4.968842418724587e-05, + "loss": 2.27, + "mean_token_accuracy": 0.4433151841163635, + "step": 99665 + }, + { + "epoch": 0.10038868257858005, + "grad_norm": 11.604168131239176, + "learning_rate": 4.968836204057086e-05, + "loss": 2.4624, + "mean_token_accuracy": 0.4137930989265442, + "step": 99670 + }, + { + "epoch": 0.10039371863168423, + "grad_norm": 11.808789992271457, + "learning_rate": 4.968829988774182e-05, + "loss": 2.3979, + "mean_token_accuracy": 0.38620689511299133, + "step": 99675 + }, + { + "epoch": 0.1003987546847884, + "grad_norm": 14.508113203447868, + "learning_rate": 4.9688237728758766e-05, + "loss": 2.7195, + "mean_token_accuracy": 0.417241370677948, + "step": 99680 + }, + { + "epoch": 0.10040379073789257, + "grad_norm": 11.766272592324986, + "learning_rate": 4.968817556362172e-05, + "loss": 2.4508, + "mean_token_accuracy": 0.4068965494632721, + "step": 99685 + }, + { + "epoch": 0.10040882679099675, + "grad_norm": 12.924217135426119, + "learning_rate": 4.968811339233071e-05, + "loss": 2.7458, + "mean_token_accuracy": 0.3896551728248596, + "step": 99690 + }, + { + "epoch": 0.10041386284410092, + "grad_norm": 10.638004901303898, + "learning_rate": 4.968805121488574e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.38620689511299133, + "step": 99695 + }, + { + "epoch": 0.1004188988972051, + "grad_norm": 12.042164473902988, + "learning_rate": 4.968798903128683e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.4034482717514038, + "step": 99700 + }, + { + "epoch": 0.10042393495030927, + "grad_norm": 11.666529497640347, + "learning_rate": 4.9687926841533996e-05, + "loss": 2.5216, + "mean_token_accuracy": 0.42413793206214906, + "step": 99705 + }, + { + "epoch": 0.10042897100341344, + "grad_norm": 19.53091632343761, + "learning_rate": 4.9687864645627266e-05, + "loss": 3.138, + "mean_token_accuracy": 0.37041741609573364, + "step": 99710 + }, + { + "epoch": 0.10043400705651762, + "grad_norm": 12.591490133034377, + "learning_rate": 4.9687802443566646e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.4413793087005615, + "step": 99715 + }, + { + "epoch": 0.10043904310962178, + "grad_norm": 11.03972189063813, + "learning_rate": 4.968774023535215e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.4034482777118683, + "step": 99720 + }, + { + "epoch": 0.10044407916272595, + "grad_norm": 10.744026486909533, + "learning_rate": 4.9687678020983807e-05, + "loss": 2.2027, + "mean_token_accuracy": 0.4413793087005615, + "step": 99725 + }, + { + "epoch": 0.10044911521583012, + "grad_norm": 10.450375845265931, + "learning_rate": 4.968761580046162e-05, + "loss": 2.4219, + "mean_token_accuracy": 0.4310344815254211, + "step": 99730 + }, + { + "epoch": 0.1004541512689343, + "grad_norm": 9.600689216265954, + "learning_rate": 4.9687553573785634e-05, + "loss": 2.5709, + "mean_token_accuracy": 0.3827586233615875, + "step": 99735 + }, + { + "epoch": 0.10045918732203847, + "grad_norm": 9.493808053644932, + "learning_rate": 4.968749134095583e-05, + "loss": 2.4296, + "mean_token_accuracy": 0.417241370677948, + "step": 99740 + }, + { + "epoch": 0.10046422337514264, + "grad_norm": 9.40650296306212, + "learning_rate": 4.968742910197225e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.3999999940395355, + "step": 99745 + }, + { + "epoch": 0.10046925942824682, + "grad_norm": 11.921619767493363, + "learning_rate": 4.96873668568349e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.4000000059604645, + "step": 99750 + }, + { + "epoch": 0.10047429548135099, + "grad_norm": 11.228487027882364, + "learning_rate": 4.9687304605543805e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.36896551251411436, + "step": 99755 + }, + { + "epoch": 0.10047933153445517, + "grad_norm": 13.549066193087164, + "learning_rate": 4.968724234809898e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.4172413766384125, + "step": 99760 + }, + { + "epoch": 0.10048436758755934, + "grad_norm": 14.667298255463983, + "learning_rate": 4.968718008450043e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.40344828367233276, + "step": 99765 + }, + { + "epoch": 0.10048940364066351, + "grad_norm": 11.223880716589564, + "learning_rate": 4.968711781474819e-05, + "loss": 2.1226, + "mean_token_accuracy": 0.47779794335365294, + "step": 99770 + }, + { + "epoch": 0.10049443969376769, + "grad_norm": 10.90098848918577, + "learning_rate": 4.968705553884228e-05, + "loss": 2.7124, + "mean_token_accuracy": 0.37931033968925476, + "step": 99775 + }, + { + "epoch": 0.10049947574687186, + "grad_norm": 15.607037543667179, + "learning_rate": 4.968699325678269e-05, + "loss": 2.5757, + "mean_token_accuracy": 0.3793103456497192, + "step": 99780 + }, + { + "epoch": 0.10050451179997603, + "grad_norm": 11.877607297371828, + "learning_rate": 4.968693096856946e-05, + "loss": 2.2703, + "mean_token_accuracy": 0.4482758641242981, + "step": 99785 + }, + { + "epoch": 0.1005095478530802, + "grad_norm": 12.152789741228506, + "learning_rate": 4.96868686742026e-05, + "loss": 2.1303, + "mean_token_accuracy": 0.43641863465309144, + "step": 99790 + }, + { + "epoch": 0.10051458390618437, + "grad_norm": 11.587328287609994, + "learning_rate": 4.968680637368214e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.458620685338974, + "step": 99795 + }, + { + "epoch": 0.10051961995928854, + "grad_norm": 10.96572351664888, + "learning_rate": 4.968674406700808e-05, + "loss": 2.6676, + "mean_token_accuracy": 0.3586206793785095, + "step": 99800 + }, + { + "epoch": 0.10052465601239272, + "grad_norm": 12.12271356309071, + "learning_rate": 4.9686681754180445e-05, + "loss": 2.4421, + "mean_token_accuracy": 0.42758620381355283, + "step": 99805 + }, + { + "epoch": 0.10052969206549689, + "grad_norm": 10.255389175651684, + "learning_rate": 4.968661943519925e-05, + "loss": 2.589, + "mean_token_accuracy": 0.39655172228813174, + "step": 99810 + }, + { + "epoch": 0.10053472811860106, + "grad_norm": 11.82423286494721, + "learning_rate": 4.96865571100645e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.42413793206214906, + "step": 99815 + }, + { + "epoch": 0.10053976417170524, + "grad_norm": 13.730040867990613, + "learning_rate": 4.9686494778776244e-05, + "loss": 2.8052, + "mean_token_accuracy": 0.36206896901130675, + "step": 99820 + }, + { + "epoch": 0.10054480022480941, + "grad_norm": 11.466585503826266, + "learning_rate": 4.9686432441334476e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.37931033968925476, + "step": 99825 + }, + { + "epoch": 0.10054983627791358, + "grad_norm": 9.896257939510402, + "learning_rate": 4.9686370097739224e-05, + "loss": 2.0378, + "mean_token_accuracy": 0.4586206912994385, + "step": 99830 + }, + { + "epoch": 0.10055487233101776, + "grad_norm": 9.672838679742702, + "learning_rate": 4.968630774799049e-05, + "loss": 2.2577, + "mean_token_accuracy": 0.48066502809524536, + "step": 99835 + }, + { + "epoch": 0.10055990838412193, + "grad_norm": 9.77640990019493, + "learning_rate": 4.968624539208831e-05, + "loss": 2.1352, + "mean_token_accuracy": 0.441379314661026, + "step": 99840 + }, + { + "epoch": 0.1005649444372261, + "grad_norm": 10.013875542908634, + "learning_rate": 4.968618303003268e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.43448275327682495, + "step": 99845 + }, + { + "epoch": 0.10056998049033028, + "grad_norm": 14.67226933283389, + "learning_rate": 4.968612066182363e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.41379311084747317, + "step": 99850 + }, + { + "epoch": 0.10057501654343445, + "grad_norm": 10.004953253497721, + "learning_rate": 4.968605828746118e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.38275861740112305, + "step": 99855 + }, + { + "epoch": 0.10058005259653861, + "grad_norm": 11.01578635158835, + "learning_rate": 4.968599590694535e-05, + "loss": 2.4849, + "mean_token_accuracy": 0.45414398312568666, + "step": 99860 + }, + { + "epoch": 0.10058508864964279, + "grad_norm": 12.710777933578147, + "learning_rate": 4.9685933520276146e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.45517241954803467, + "step": 99865 + }, + { + "epoch": 0.10059012470274696, + "grad_norm": 11.058875638341222, + "learning_rate": 4.968587112745359e-05, + "loss": 2.3082, + "mean_token_accuracy": 0.4379310250282288, + "step": 99870 + }, + { + "epoch": 0.10059516075585113, + "grad_norm": 10.604547412645783, + "learning_rate": 4.96858087284777e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.44482758045196535, + "step": 99875 + }, + { + "epoch": 0.10060019680895531, + "grad_norm": 12.084023253056268, + "learning_rate": 4.9685746323348495e-05, + "loss": 2.501, + "mean_token_accuracy": 0.46551724076271056, + "step": 99880 + }, + { + "epoch": 0.10060523286205948, + "grad_norm": 10.524335022869112, + "learning_rate": 4.9685683912065993e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.3862069010734558, + "step": 99885 + }, + { + "epoch": 0.10061026891516366, + "grad_norm": 11.87503645700627, + "learning_rate": 4.968562149463021e-05, + "loss": 2.2213, + "mean_token_accuracy": 0.3655172407627106, + "step": 99890 + }, + { + "epoch": 0.10061530496826783, + "grad_norm": 11.09523063974377, + "learning_rate": 4.968555907104115e-05, + "loss": 2.564, + "mean_token_accuracy": 0.41875378489494325, + "step": 99895 + }, + { + "epoch": 0.100620341021372, + "grad_norm": 9.610502738219719, + "learning_rate": 4.968549664129885e-05, + "loss": 2.0579, + "mean_token_accuracy": 0.44827585816383364, + "step": 99900 + }, + { + "epoch": 0.10062537707447618, + "grad_norm": 10.838737457552972, + "learning_rate": 4.9685434205403325e-05, + "loss": 2.6702, + "mean_token_accuracy": 0.37931033968925476, + "step": 99905 + }, + { + "epoch": 0.10063041312758035, + "grad_norm": 8.925940694187503, + "learning_rate": 4.968537176335458e-05, + "loss": 2.0763, + "mean_token_accuracy": 0.4931034445762634, + "step": 99910 + }, + { + "epoch": 0.10063544918068452, + "grad_norm": 12.325617892704834, + "learning_rate": 4.968530931515264e-05, + "loss": 2.2455, + "mean_token_accuracy": 0.458620685338974, + "step": 99915 + }, + { + "epoch": 0.1006404852337887, + "grad_norm": 11.107211811371346, + "learning_rate": 4.9685246860797526e-05, + "loss": 2.4526, + "mean_token_accuracy": 0.39310344457626345, + "step": 99920 + }, + { + "epoch": 0.10064552128689287, + "grad_norm": 13.494162232248229, + "learning_rate": 4.968518440028925e-05, + "loss": 2.4281, + "mean_token_accuracy": 0.42758620977401735, + "step": 99925 + }, + { + "epoch": 0.10065055733999703, + "grad_norm": 9.92818904636053, + "learning_rate": 4.968512193362783e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.4551724135875702, + "step": 99930 + }, + { + "epoch": 0.1006555933931012, + "grad_norm": 12.240299036477683, + "learning_rate": 4.968505946081328e-05, + "loss": 3.2352, + "mean_token_accuracy": 0.3724137872457504, + "step": 99935 + }, + { + "epoch": 0.10066062944620538, + "grad_norm": 10.961753719087756, + "learning_rate": 4.968499698184562e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.44482758045196535, + "step": 99940 + }, + { + "epoch": 0.10066566549930955, + "grad_norm": 9.387616884964391, + "learning_rate": 4.968493449672487e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.4620689630508423, + "step": 99945 + }, + { + "epoch": 0.10067070155241373, + "grad_norm": 11.19079445322811, + "learning_rate": 4.968487200545105e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.44482759237289426, + "step": 99950 + }, + { + "epoch": 0.1006757376055179, + "grad_norm": 13.649963912407397, + "learning_rate": 4.968480950802417e-05, + "loss": 2.308, + "mean_token_accuracy": 0.4206896543502808, + "step": 99955 + }, + { + "epoch": 0.10068077365862207, + "grad_norm": 10.701360880147938, + "learning_rate": 4.968474700444426e-05, + "loss": 2.5466, + "mean_token_accuracy": 0.42068964838981626, + "step": 99960 + }, + { + "epoch": 0.10068580971172625, + "grad_norm": 12.399985579194999, + "learning_rate": 4.968468449471131e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.40344828367233276, + "step": 99965 + }, + { + "epoch": 0.10069084576483042, + "grad_norm": 7.864421568825692, + "learning_rate": 4.9684621978825364e-05, + "loss": 2.3094, + "mean_token_accuracy": 0.4922564923763275, + "step": 99970 + }, + { + "epoch": 0.1006958818179346, + "grad_norm": 10.6402865824142, + "learning_rate": 4.9684559456786433e-05, + "loss": 2.2359, + "mean_token_accuracy": 0.42413793206214906, + "step": 99975 + }, + { + "epoch": 0.10070091787103877, + "grad_norm": 11.851145854922413, + "learning_rate": 4.9684496928594526e-05, + "loss": 3.0235, + "mean_token_accuracy": 0.3758620619773865, + "step": 99980 + }, + { + "epoch": 0.10070595392414294, + "grad_norm": 10.656117607810696, + "learning_rate": 4.968443439424967e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.44827587008476255, + "step": 99985 + }, + { + "epoch": 0.10071098997724712, + "grad_norm": 9.484087717898566, + "learning_rate": 4.968437185375188e-05, + "loss": 2.1594, + "mean_token_accuracy": 0.44827587008476255, + "step": 99990 + }, + { + "epoch": 0.10071602603035129, + "grad_norm": 10.17788425555078, + "learning_rate": 4.968430930710116e-05, + "loss": 2.151, + "mean_token_accuracy": 0.44295220971107485, + "step": 99995 + }, + { + "epoch": 0.10072106208345545, + "grad_norm": 13.964300808080232, + "learning_rate": 4.968424675429755e-05, + "loss": 2.5475, + "mean_token_accuracy": 0.41724138259887694, + "step": 100000 + }, + { + "epoch": 0.10072609813655962, + "grad_norm": 10.406477758421381, + "learning_rate": 4.9684184195341055e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.39655172228813174, + "step": 100005 + }, + { + "epoch": 0.1007311341896638, + "grad_norm": 11.061127377980746, + "learning_rate": 4.968412163023169e-05, + "loss": 2.9076, + "mean_token_accuracy": 0.36206896007061007, + "step": 100010 + }, + { + "epoch": 0.10073617024276797, + "grad_norm": 10.234807691009491, + "learning_rate": 4.968405905896949e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.3620689630508423, + "step": 100015 + }, + { + "epoch": 0.10074120629587215, + "grad_norm": 11.774505581506515, + "learning_rate": 4.968399648155444e-05, + "loss": 2.6812, + "mean_token_accuracy": 0.4482758641242981, + "step": 100020 + }, + { + "epoch": 0.10074624234897632, + "grad_norm": 10.105905203219091, + "learning_rate": 4.9683933897986584e-05, + "loss": 2.3463, + "mean_token_accuracy": 0.4482758641242981, + "step": 100025 + }, + { + "epoch": 0.1007512784020805, + "grad_norm": 10.250057408208953, + "learning_rate": 4.968387130826593e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.4206896543502808, + "step": 100030 + }, + { + "epoch": 0.10075631445518467, + "grad_norm": 10.274751795656123, + "learning_rate": 4.96838087123925e-05, + "loss": 2.5913, + "mean_token_accuracy": 0.4034482717514038, + "step": 100035 + }, + { + "epoch": 0.10076135050828884, + "grad_norm": 9.079882315925548, + "learning_rate": 4.9683746110366305e-05, + "loss": 2.3455, + "mean_token_accuracy": 0.42413792610168455, + "step": 100040 + }, + { + "epoch": 0.10076638656139301, + "grad_norm": 8.863773684582196, + "learning_rate": 4.968368350218736e-05, + "loss": 2.2763, + "mean_token_accuracy": 0.4310344815254211, + "step": 100045 + }, + { + "epoch": 0.10077142261449719, + "grad_norm": 15.44091740263888, + "learning_rate": 4.9683620887855695e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.43103447556495667, + "step": 100050 + }, + { + "epoch": 0.10077645866760136, + "grad_norm": 10.386754074726456, + "learning_rate": 4.9683558267371315e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.3862069010734558, + "step": 100055 + }, + { + "epoch": 0.10078149472070554, + "grad_norm": 9.582472153935983, + "learning_rate": 4.968349564073425e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.3896551728248596, + "step": 100060 + }, + { + "epoch": 0.10078653077380971, + "grad_norm": 20.223415310854257, + "learning_rate": 4.9683433007944494e-05, + "loss": 2.445, + "mean_token_accuracy": 0.4551724135875702, + "step": 100065 + }, + { + "epoch": 0.10079156682691387, + "grad_norm": 10.4790334374899, + "learning_rate": 4.9683370369002094e-05, + "loss": 2.5041, + "mean_token_accuracy": 0.4137930989265442, + "step": 100070 + }, + { + "epoch": 0.10079660288001804, + "grad_norm": 13.358522219394409, + "learning_rate": 4.968330772390705e-05, + "loss": 2.6975, + "mean_token_accuracy": 0.4379310369491577, + "step": 100075 + }, + { + "epoch": 0.10080163893312222, + "grad_norm": 8.916087774069464, + "learning_rate": 4.968324507265938e-05, + "loss": 2.5099, + "mean_token_accuracy": 0.3999999940395355, + "step": 100080 + }, + { + "epoch": 0.10080667498622639, + "grad_norm": 13.451547611396204, + "learning_rate": 4.968318241525911e-05, + "loss": 2.764, + "mean_token_accuracy": 0.374047189950943, + "step": 100085 + }, + { + "epoch": 0.10081171103933056, + "grad_norm": 11.282079887463281, + "learning_rate": 4.968311975170625e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.4103448212146759, + "step": 100090 + }, + { + "epoch": 0.10081674709243474, + "grad_norm": 14.940816422487298, + "learning_rate": 4.968305708200082e-05, + "loss": 2.9524, + "mean_token_accuracy": 0.4172413766384125, + "step": 100095 + }, + { + "epoch": 0.10082178314553891, + "grad_norm": 8.59788008892987, + "learning_rate": 4.9682994406142834e-05, + "loss": 2.6272, + "mean_token_accuracy": 0.3655172437429428, + "step": 100100 + }, + { + "epoch": 0.10082681919864309, + "grad_norm": 11.442598672615754, + "learning_rate": 4.968293172413231e-05, + "loss": 2.2398, + "mean_token_accuracy": 0.4361766457557678, + "step": 100105 + }, + { + "epoch": 0.10083185525174726, + "grad_norm": 9.911722112547768, + "learning_rate": 4.9682869035969275e-05, + "loss": 2.141, + "mean_token_accuracy": 0.43103448748588563, + "step": 100110 + }, + { + "epoch": 0.10083689130485143, + "grad_norm": 8.123422649995424, + "learning_rate": 4.968280634165373e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.42758620977401735, + "step": 100115 + }, + { + "epoch": 0.10084192735795561, + "grad_norm": 11.607680492207328, + "learning_rate": 4.96827436411857e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.4344827592372894, + "step": 100120 + }, + { + "epoch": 0.10084696341105978, + "grad_norm": 9.282297042714827, + "learning_rate": 4.968268093456521e-05, + "loss": 2.343, + "mean_token_accuracy": 0.41724138259887694, + "step": 100125 + }, + { + "epoch": 0.10085199946416396, + "grad_norm": 12.30599082470372, + "learning_rate": 4.9682618221792264e-05, + "loss": 2.8554, + "mean_token_accuracy": 0.4034482777118683, + "step": 100130 + }, + { + "epoch": 0.10085703551726813, + "grad_norm": 9.39146818546685, + "learning_rate": 4.9682555502866894e-05, + "loss": 2.1506, + "mean_token_accuracy": 0.4571687877178192, + "step": 100135 + }, + { + "epoch": 0.10086207157037229, + "grad_norm": 11.621297727704723, + "learning_rate": 4.9682492777789104e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.4000000059604645, + "step": 100140 + }, + { + "epoch": 0.10086710762347646, + "grad_norm": 12.358960974943166, + "learning_rate": 4.968243004655892e-05, + "loss": 2.1358, + "mean_token_accuracy": 0.4662561595439911, + "step": 100145 + }, + { + "epoch": 0.10087214367658064, + "grad_norm": 8.638664694802257, + "learning_rate": 4.968236730917635e-05, + "loss": 2.5744, + "mean_token_accuracy": 0.41034482717514037, + "step": 100150 + }, + { + "epoch": 0.10087717972968481, + "grad_norm": 11.940657813740733, + "learning_rate": 4.968230456564143e-05, + "loss": 2.2187, + "mean_token_accuracy": 0.4517241358757019, + "step": 100155 + }, + { + "epoch": 0.10088221578278898, + "grad_norm": 10.222565744893558, + "learning_rate": 4.968224181595415e-05, + "loss": 2.2774, + "mean_token_accuracy": 0.4103448331356049, + "step": 100160 + }, + { + "epoch": 0.10088725183589316, + "grad_norm": 7.995514602194898, + "learning_rate": 4.968217906011455e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.413793095946312, + "step": 100165 + }, + { + "epoch": 0.10089228788899733, + "grad_norm": 8.805412710913629, + "learning_rate": 4.968211629812264e-05, + "loss": 2.4089, + "mean_token_accuracy": 0.4310344815254211, + "step": 100170 + }, + { + "epoch": 0.1008973239421015, + "grad_norm": 12.420044509657634, + "learning_rate": 4.968205352997844e-05, + "loss": 2.8002, + "mean_token_accuracy": 0.3896551728248596, + "step": 100175 + }, + { + "epoch": 0.10090235999520568, + "grad_norm": 10.742302311862534, + "learning_rate": 4.968199075568196e-05, + "loss": 2.4007, + "mean_token_accuracy": 0.4034482717514038, + "step": 100180 + }, + { + "epoch": 0.10090739604830985, + "grad_norm": 11.166003380484904, + "learning_rate": 4.9681927975233224e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.4379310369491577, + "step": 100185 + }, + { + "epoch": 0.10091243210141403, + "grad_norm": 13.253857309968755, + "learning_rate": 4.9681865188632246e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.44827585816383364, + "step": 100190 + }, + { + "epoch": 0.1009174681545182, + "grad_norm": 10.745851594174658, + "learning_rate": 4.968180239587904e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.44137930274009707, + "step": 100195 + }, + { + "epoch": 0.10092250420762237, + "grad_norm": 11.31536930998323, + "learning_rate": 4.968173959697364e-05, + "loss": 2.6507, + "mean_token_accuracy": 0.34482758641242983, + "step": 100200 + }, + { + "epoch": 0.10092754026072655, + "grad_norm": 10.364119101792767, + "learning_rate": 4.968167679191604e-05, + "loss": 2.0542, + "mean_token_accuracy": 0.48275862336158754, + "step": 100205 + }, + { + "epoch": 0.10093257631383071, + "grad_norm": 8.571516056707093, + "learning_rate": 4.968161398070628e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.42758620381355283, + "step": 100210 + }, + { + "epoch": 0.10093761236693488, + "grad_norm": 10.967180509034472, + "learning_rate": 4.9681551163344356e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.4379310250282288, + "step": 100215 + }, + { + "epoch": 0.10094264842003906, + "grad_norm": 13.01961120023097, + "learning_rate": 4.9681488339830305e-05, + "loss": 2.4307, + "mean_token_accuracy": 0.37241379022598264, + "step": 100220 + }, + { + "epoch": 0.10094768447314323, + "grad_norm": 9.787951741702825, + "learning_rate": 4.968142551016414e-05, + "loss": 2.4716, + "mean_token_accuracy": 0.44137930274009707, + "step": 100225 + }, + { + "epoch": 0.1009527205262474, + "grad_norm": 9.100453432121714, + "learning_rate": 4.968136267434586e-05, + "loss": 2.2706, + "mean_token_accuracy": 0.3965517282485962, + "step": 100230 + }, + { + "epoch": 0.10095775657935158, + "grad_norm": 10.301386374859703, + "learning_rate": 4.968129983237551e-05, + "loss": 2.955, + "mean_token_accuracy": 0.38741681575775144, + "step": 100235 + }, + { + "epoch": 0.10096279263245575, + "grad_norm": 12.0619974465707, + "learning_rate": 4.9681236984253084e-05, + "loss": 2.6295, + "mean_token_accuracy": 0.38620689511299133, + "step": 100240 + }, + { + "epoch": 0.10096782868555992, + "grad_norm": 13.241075970773057, + "learning_rate": 4.9681174129978605e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.4482758641242981, + "step": 100245 + }, + { + "epoch": 0.1009728647386641, + "grad_norm": 16.476216355509845, + "learning_rate": 4.9681111269552106e-05, + "loss": 2.5856, + "mean_token_accuracy": 0.3965517282485962, + "step": 100250 + }, + { + "epoch": 0.10097790079176827, + "grad_norm": 13.063424961048968, + "learning_rate": 4.9681048402973586e-05, + "loss": 2.3574, + "mean_token_accuracy": 0.4206896543502808, + "step": 100255 + }, + { + "epoch": 0.10098293684487245, + "grad_norm": 12.336889264703254, + "learning_rate": 4.9680985530243065e-05, + "loss": 2.4201, + "mean_token_accuracy": 0.4344827592372894, + "step": 100260 + }, + { + "epoch": 0.10098797289797662, + "grad_norm": 10.759507840486759, + "learning_rate": 4.968092265136058e-05, + "loss": 2.8432, + "mean_token_accuracy": 0.42068964838981626, + "step": 100265 + }, + { + "epoch": 0.10099300895108079, + "grad_norm": 10.167995723622077, + "learning_rate": 4.968085976632612e-05, + "loss": 2.603, + "mean_token_accuracy": 0.37241379618644715, + "step": 100270 + }, + { + "epoch": 0.10099804500418497, + "grad_norm": 9.738047831612196, + "learning_rate": 4.9680796875139726e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.42413792610168455, + "step": 100275 + }, + { + "epoch": 0.10100308105728913, + "grad_norm": 9.87581590934551, + "learning_rate": 4.9680733977801394e-05, + "loss": 2.0519, + "mean_token_accuracy": 0.49491833448410033, + "step": 100280 + }, + { + "epoch": 0.1010081171103933, + "grad_norm": 10.410149019998846, + "learning_rate": 4.9680671074311164e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.4344827651977539, + "step": 100285 + }, + { + "epoch": 0.10101315316349747, + "grad_norm": 10.511848361828227, + "learning_rate": 4.9680608164669034e-05, + "loss": 2.7376, + "mean_token_accuracy": 0.37586206793785093, + "step": 100290 + }, + { + "epoch": 0.10101818921660165, + "grad_norm": 9.819977353487074, + "learning_rate": 4.9680545248875034e-05, + "loss": 2.3325, + "mean_token_accuracy": 0.4533575356006622, + "step": 100295 + }, + { + "epoch": 0.10102322526970582, + "grad_norm": 8.741889901348351, + "learning_rate": 4.9680482326929175e-05, + "loss": 2.4936, + "mean_token_accuracy": 0.39655172228813174, + "step": 100300 + }, + { + "epoch": 0.10102826132281, + "grad_norm": 11.951088650761767, + "learning_rate": 4.968041939883148e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.43103448748588563, + "step": 100305 + }, + { + "epoch": 0.10103329737591417, + "grad_norm": 14.52052776631206, + "learning_rate": 4.968035646458196e-05, + "loss": 2.6713, + "mean_token_accuracy": 0.4413793087005615, + "step": 100310 + }, + { + "epoch": 0.10103833342901834, + "grad_norm": 13.753990768981327, + "learning_rate": 4.9680293524180634e-05, + "loss": 2.7418, + "mean_token_accuracy": 0.3896551728248596, + "step": 100315 + }, + { + "epoch": 0.10104336948212252, + "grad_norm": 10.235011059977122, + "learning_rate": 4.968023057762752e-05, + "loss": 2.6075, + "mean_token_accuracy": 0.4413793057203293, + "step": 100320 + }, + { + "epoch": 0.10104840553522669, + "grad_norm": 13.613477175207537, + "learning_rate": 4.9680167624922635e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.4344827651977539, + "step": 100325 + }, + { + "epoch": 0.10105344158833086, + "grad_norm": 12.012970960158299, + "learning_rate": 4.9680104666066e-05, + "loss": 2.6302, + "mean_token_accuracy": 0.41209921836853025, + "step": 100330 + }, + { + "epoch": 0.10105847764143504, + "grad_norm": 18.74923309969989, + "learning_rate": 4.968004170105763e-05, + "loss": 2.8035, + "mean_token_accuracy": 0.43448275327682495, + "step": 100335 + }, + { + "epoch": 0.10106351369453921, + "grad_norm": 10.949841146207635, + "learning_rate": 4.9679978729897546e-05, + "loss": 2.43, + "mean_token_accuracy": 0.46896551847457885, + "step": 100340 + }, + { + "epoch": 0.10106854974764339, + "grad_norm": 13.998837005957975, + "learning_rate": 4.967991575258576e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.42758620381355283, + "step": 100345 + }, + { + "epoch": 0.10107358580074755, + "grad_norm": 10.720579643827577, + "learning_rate": 4.9679852769122295e-05, + "loss": 2.4507, + "mean_token_accuracy": 0.42413793206214906, + "step": 100350 + }, + { + "epoch": 0.10107862185385172, + "grad_norm": 12.431466446024018, + "learning_rate": 4.9679789779507164e-05, + "loss": 2.4548, + "mean_token_accuracy": 0.4310344815254211, + "step": 100355 + }, + { + "epoch": 0.10108365790695589, + "grad_norm": 11.922578286341688, + "learning_rate": 4.967972678374038e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.45027223229408264, + "step": 100360 + }, + { + "epoch": 0.10108869396006007, + "grad_norm": 10.550090600499436, + "learning_rate": 4.967966378182197e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.4758620738983154, + "step": 100365 + }, + { + "epoch": 0.10109373001316424, + "grad_norm": 11.321406819026564, + "learning_rate": 4.9679600773751944e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.4000000059604645, + "step": 100370 + }, + { + "epoch": 0.10109876606626841, + "grad_norm": 10.798501424265243, + "learning_rate": 4.967953775953032e-05, + "loss": 2.141, + "mean_token_accuracy": 0.475862056016922, + "step": 100375 + }, + { + "epoch": 0.10110380211937259, + "grad_norm": 16.0658566618976, + "learning_rate": 4.9679474739157135e-05, + "loss": 3.0247, + "mean_token_accuracy": 0.37356321811676024, + "step": 100380 + }, + { + "epoch": 0.10110883817247676, + "grad_norm": 9.856195793224789, + "learning_rate": 4.9679411712632374e-05, + "loss": 2.6095, + "mean_token_accuracy": 0.4, + "step": 100385 + }, + { + "epoch": 0.10111387422558094, + "grad_norm": 10.437836199492825, + "learning_rate": 4.967934867995608e-05, + "loss": 1.9687, + "mean_token_accuracy": 0.48275861144065857, + "step": 100390 + }, + { + "epoch": 0.10111891027868511, + "grad_norm": 12.685438296265172, + "learning_rate": 4.967928564112826e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.4379310369491577, + "step": 100395 + }, + { + "epoch": 0.10112394633178928, + "grad_norm": 8.778129048227438, + "learning_rate": 4.967922259614893e-05, + "loss": 2.2661, + "mean_token_accuracy": 0.417241370677948, + "step": 100400 + }, + { + "epoch": 0.10112898238489346, + "grad_norm": 9.846191139180466, + "learning_rate": 4.96791595450181e-05, + "loss": 2.1253, + "mean_token_accuracy": 0.4862069010734558, + "step": 100405 + }, + { + "epoch": 0.10113401843799763, + "grad_norm": 9.883444789041006, + "learning_rate": 4.967909648773581e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.41379310488700866, + "step": 100410 + }, + { + "epoch": 0.1011390544911018, + "grad_norm": 10.051223812162036, + "learning_rate": 4.967903342430206e-05, + "loss": 2.3866, + "mean_token_accuracy": 0.4551724076271057, + "step": 100415 + }, + { + "epoch": 0.10114409054420596, + "grad_norm": 9.677470178966669, + "learning_rate": 4.9678970354716874e-05, + "loss": 2.1052, + "mean_token_accuracy": 0.4482758641242981, + "step": 100420 + }, + { + "epoch": 0.10114912659731014, + "grad_norm": 10.700655072553364, + "learning_rate": 4.9678907278980264e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.3517241358757019, + "step": 100425 + }, + { + "epoch": 0.10115416265041431, + "grad_norm": 9.705787756175926, + "learning_rate": 4.9678844197092264e-05, + "loss": 2.27, + "mean_token_accuracy": 0.44827585816383364, + "step": 100430 + }, + { + "epoch": 0.10115919870351849, + "grad_norm": 10.999017398217122, + "learning_rate": 4.967878110905286e-05, + "loss": 1.9737, + "mean_token_accuracy": 0.45517241954803467, + "step": 100435 + }, + { + "epoch": 0.10116423475662266, + "grad_norm": 9.504696337997215, + "learning_rate": 4.96787180148621e-05, + "loss": 2.347, + "mean_token_accuracy": 0.44827587008476255, + "step": 100440 + }, + { + "epoch": 0.10116927080972683, + "grad_norm": 10.79471256984635, + "learning_rate": 4.967865491451998e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.41379310488700866, + "step": 100445 + }, + { + "epoch": 0.101174306862831, + "grad_norm": 10.96726961200012, + "learning_rate": 4.967859180802655e-05, + "loss": 2.5477, + "mean_token_accuracy": 0.4551724076271057, + "step": 100450 + }, + { + "epoch": 0.10117934291593518, + "grad_norm": 10.266500837428914, + "learning_rate": 4.967852869538179e-05, + "loss": 2.5733, + "mean_token_accuracy": 0.4297035694122314, + "step": 100455 + }, + { + "epoch": 0.10118437896903935, + "grad_norm": 11.931957093791128, + "learning_rate": 4.967846557658573e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.4517241358757019, + "step": 100460 + }, + { + "epoch": 0.10118941502214353, + "grad_norm": 9.7194479637551, + "learning_rate": 4.967840245163838e-05, + "loss": 2.5615, + "mean_token_accuracy": 0.4379310369491577, + "step": 100465 + }, + { + "epoch": 0.1011944510752477, + "grad_norm": 10.65114543868417, + "learning_rate": 4.9678339320539787e-05, + "loss": 2.201, + "mean_token_accuracy": 0.44646098613739016, + "step": 100470 + }, + { + "epoch": 0.10119948712835188, + "grad_norm": 10.802781174996692, + "learning_rate": 4.967827618328994e-05, + "loss": 2.6149, + "mean_token_accuracy": 0.4034482777118683, + "step": 100475 + }, + { + "epoch": 0.10120452318145605, + "grad_norm": 9.412327463837844, + "learning_rate": 4.967821303988887e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.4034482777118683, + "step": 100480 + }, + { + "epoch": 0.10120955923456022, + "grad_norm": 9.839997620110958, + "learning_rate": 4.967814989033659e-05, + "loss": 2.6339, + "mean_token_accuracy": 0.37931033968925476, + "step": 100485 + }, + { + "epoch": 0.10121459528766438, + "grad_norm": 11.101060261073052, + "learning_rate": 4.967808673463311e-05, + "loss": 2.9298, + "mean_token_accuracy": 0.36896551847457887, + "step": 100490 + }, + { + "epoch": 0.10121963134076856, + "grad_norm": 14.898977836783757, + "learning_rate": 4.967802357277846e-05, + "loss": 2.1909, + "mean_token_accuracy": 0.4517241418361664, + "step": 100495 + }, + { + "epoch": 0.10122466739387273, + "grad_norm": 12.645860913668214, + "learning_rate": 4.967796040477265e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.43103448748588563, + "step": 100500 + }, + { + "epoch": 0.1012297034469769, + "grad_norm": 10.053678393340512, + "learning_rate": 4.96778972306157e-05, + "loss": 2.5634, + "mean_token_accuracy": 0.41724138259887694, + "step": 100505 + }, + { + "epoch": 0.10123473950008108, + "grad_norm": 11.905846307496736, + "learning_rate": 4.9677834050307625e-05, + "loss": 2.5123, + "mean_token_accuracy": 0.3931034505367279, + "step": 100510 + }, + { + "epoch": 0.10123977555318525, + "grad_norm": 9.63733451074268, + "learning_rate": 4.967777086384845e-05, + "loss": 2.2134, + "mean_token_accuracy": 0.4896551728248596, + "step": 100515 + }, + { + "epoch": 0.10124481160628943, + "grad_norm": 7.5063983459607195, + "learning_rate": 4.9677707671238184e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.4551724135875702, + "step": 100520 + }, + { + "epoch": 0.1012498476593936, + "grad_norm": 13.147814643541263, + "learning_rate": 4.9677644472476856e-05, + "loss": 2.609, + "mean_token_accuracy": 0.3620689660310745, + "step": 100525 + }, + { + "epoch": 0.10125488371249777, + "grad_norm": 11.932190157041866, + "learning_rate": 4.967758126756447e-05, + "loss": 2.6359, + "mean_token_accuracy": 0.3517241358757019, + "step": 100530 + }, + { + "epoch": 0.10125991976560195, + "grad_norm": 8.848120743952473, + "learning_rate": 4.967751805650105e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.4206896543502808, + "step": 100535 + }, + { + "epoch": 0.10126495581870612, + "grad_norm": 9.619795016903415, + "learning_rate": 4.967745483928661e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.41034482717514037, + "step": 100540 + }, + { + "epoch": 0.1012699918718103, + "grad_norm": 11.583524889919026, + "learning_rate": 4.967739161592117e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.4241379380226135, + "step": 100545 + }, + { + "epoch": 0.10127502792491447, + "grad_norm": 11.274429970944206, + "learning_rate": 4.967732838640475e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.42413793206214906, + "step": 100550 + }, + { + "epoch": 0.10128006397801864, + "grad_norm": 11.051592635038917, + "learning_rate": 4.9677265150737364e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.40344828367233276, + "step": 100555 + }, + { + "epoch": 0.1012851000311228, + "grad_norm": 7.866862193770247, + "learning_rate": 4.967720190891903e-05, + "loss": 2.2177, + "mean_token_accuracy": 0.4137930989265442, + "step": 100560 + }, + { + "epoch": 0.10129013608422698, + "grad_norm": 10.415649450687617, + "learning_rate": 4.9677138660949766e-05, + "loss": 2.459, + "mean_token_accuracy": 0.38275861740112305, + "step": 100565 + }, + { + "epoch": 0.10129517213733115, + "grad_norm": 9.706591525887859, + "learning_rate": 4.967707540682959e-05, + "loss": 2.7448, + "mean_token_accuracy": 0.4172413766384125, + "step": 100570 + }, + { + "epoch": 0.10130020819043532, + "grad_norm": 11.475594714070683, + "learning_rate": 4.9677012146558525e-05, + "loss": 2.4134, + "mean_token_accuracy": 0.4551724135875702, + "step": 100575 + }, + { + "epoch": 0.1013052442435395, + "grad_norm": 10.117291897754056, + "learning_rate": 4.967694888013658e-05, + "loss": 2.2558, + "mean_token_accuracy": 0.4586206912994385, + "step": 100580 + }, + { + "epoch": 0.10131028029664367, + "grad_norm": 10.232207089300864, + "learning_rate": 4.9676885607563774e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.39491833448410035, + "step": 100585 + }, + { + "epoch": 0.10131531634974784, + "grad_norm": 13.304504899899978, + "learning_rate": 4.9676822328840124e-05, + "loss": 2.6212, + "mean_token_accuracy": 0.441379314661026, + "step": 100590 + }, + { + "epoch": 0.10132035240285202, + "grad_norm": 10.647563763639933, + "learning_rate": 4.967675904396565e-05, + "loss": 2.5306, + "mean_token_accuracy": 0.4068965554237366, + "step": 100595 + }, + { + "epoch": 0.10132538845595619, + "grad_norm": 8.916272338926895, + "learning_rate": 4.967669575294037e-05, + "loss": 2.1849, + "mean_token_accuracy": 0.43793103098869324, + "step": 100600 + }, + { + "epoch": 0.10133042450906037, + "grad_norm": 13.34168583103646, + "learning_rate": 4.967663245576431e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.4137930929660797, + "step": 100605 + }, + { + "epoch": 0.10133546056216454, + "grad_norm": 10.695399836123494, + "learning_rate": 4.967656915243747e-05, + "loss": 2.6286, + "mean_token_accuracy": 0.4034482777118683, + "step": 100610 + }, + { + "epoch": 0.10134049661526871, + "grad_norm": 10.361462200560322, + "learning_rate": 4.967650584295987e-05, + "loss": 3.1051, + "mean_token_accuracy": 0.3896551787853241, + "step": 100615 + }, + { + "epoch": 0.10134553266837289, + "grad_norm": 13.047216953553455, + "learning_rate": 4.9676442527331535e-05, + "loss": 3.0016, + "mean_token_accuracy": 0.34137930572032926, + "step": 100620 + }, + { + "epoch": 0.10135056872147706, + "grad_norm": 14.073145559868317, + "learning_rate": 4.967637920555249e-05, + "loss": 3.214, + "mean_token_accuracy": 0.37586206793785093, + "step": 100625 + }, + { + "epoch": 0.10135560477458122, + "grad_norm": 11.647210947356317, + "learning_rate": 4.9676315877622734e-05, + "loss": 2.7346, + "mean_token_accuracy": 0.39655172228813174, + "step": 100630 + }, + { + "epoch": 0.1013606408276854, + "grad_norm": 11.732739210304482, + "learning_rate": 4.967625254354231e-05, + "loss": 2.4959, + "mean_token_accuracy": 0.4034482717514038, + "step": 100635 + }, + { + "epoch": 0.10136567688078957, + "grad_norm": 10.287999036247278, + "learning_rate": 4.9676189203311205e-05, + "loss": 2.5217, + "mean_token_accuracy": 0.4034482717514038, + "step": 100640 + }, + { + "epoch": 0.10137071293389374, + "grad_norm": 9.031237145703985, + "learning_rate": 4.967612585692945e-05, + "loss": 2.4471, + "mean_token_accuracy": 0.3896551728248596, + "step": 100645 + }, + { + "epoch": 0.10137574898699792, + "grad_norm": 10.405057172254269, + "learning_rate": 4.967606250439707e-05, + "loss": 2.4304, + "mean_token_accuracy": 0.42758620381355283, + "step": 100650 + }, + { + "epoch": 0.10138078504010209, + "grad_norm": 12.205493953052658, + "learning_rate": 4.9675999145714076e-05, + "loss": 2.4243, + "mean_token_accuracy": 0.44827587008476255, + "step": 100655 + }, + { + "epoch": 0.10138582109320626, + "grad_norm": 11.341097577105035, + "learning_rate": 4.9675935780880486e-05, + "loss": 2.6573, + "mean_token_accuracy": 0.38620689511299133, + "step": 100660 + }, + { + "epoch": 0.10139085714631044, + "grad_norm": 12.258841071436759, + "learning_rate": 4.967587240989632e-05, + "loss": 2.4832, + "mean_token_accuracy": 0.4310344815254211, + "step": 100665 + }, + { + "epoch": 0.10139589319941461, + "grad_norm": 9.706364631053155, + "learning_rate": 4.967580903276158e-05, + "loss": 2.305, + "mean_token_accuracy": 0.4258923172950745, + "step": 100670 + }, + { + "epoch": 0.10140092925251878, + "grad_norm": 8.691059065651524, + "learning_rate": 4.967574564947631e-05, + "loss": 2.2603, + "mean_token_accuracy": 0.4517241358757019, + "step": 100675 + }, + { + "epoch": 0.10140596530562296, + "grad_norm": 12.034777173390824, + "learning_rate": 4.9675682260040504e-05, + "loss": 2.2166, + "mean_token_accuracy": 0.4379310369491577, + "step": 100680 + }, + { + "epoch": 0.10141100135872713, + "grad_norm": 11.295859927668664, + "learning_rate": 4.96756188644542e-05, + "loss": 2.3683, + "mean_token_accuracy": 0.38275861740112305, + "step": 100685 + }, + { + "epoch": 0.1014160374118313, + "grad_norm": 9.446924969284714, + "learning_rate": 4.96755554627174e-05, + "loss": 2.335, + "mean_token_accuracy": 0.43448275327682495, + "step": 100690 + }, + { + "epoch": 0.10142107346493548, + "grad_norm": 10.192895583578453, + "learning_rate": 4.967549205483013e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.48965516686439514, + "step": 100695 + }, + { + "epoch": 0.10142610951803964, + "grad_norm": 8.974214570885653, + "learning_rate": 4.9675428640792404e-05, + "loss": 2.2922, + "mean_token_accuracy": 0.43793103098869324, + "step": 100700 + }, + { + "epoch": 0.10143114557114381, + "grad_norm": 10.097710525785212, + "learning_rate": 4.9675365220604235e-05, + "loss": 2.5003, + "mean_token_accuracy": 0.3931034505367279, + "step": 100705 + }, + { + "epoch": 0.10143618162424799, + "grad_norm": 7.561590843733474, + "learning_rate": 4.967530179426565e-05, + "loss": 2.0413, + "mean_token_accuracy": 0.48965516686439514, + "step": 100710 + }, + { + "epoch": 0.10144121767735216, + "grad_norm": 10.136202970608602, + "learning_rate": 4.967523836177666e-05, + "loss": 2.0748, + "mean_token_accuracy": 0.47828190922737124, + "step": 100715 + }, + { + "epoch": 0.10144625373045633, + "grad_norm": 9.65393062526334, + "learning_rate": 4.967517492313728e-05, + "loss": 2.1347, + "mean_token_accuracy": 0.47352216839790345, + "step": 100720 + }, + { + "epoch": 0.10145128978356051, + "grad_norm": 9.738260090124431, + "learning_rate": 4.9675111478347544e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.44137930274009707, + "step": 100725 + }, + { + "epoch": 0.10145632583666468, + "grad_norm": 11.33915215153483, + "learning_rate": 4.967504802740746e-05, + "loss": 2.9397, + "mean_token_accuracy": 0.36551723480224607, + "step": 100730 + }, + { + "epoch": 0.10146136188976886, + "grad_norm": 11.395081026282972, + "learning_rate": 4.967498457031703e-05, + "loss": 2.4194, + "mean_token_accuracy": 0.3827586233615875, + "step": 100735 + }, + { + "epoch": 0.10146639794287303, + "grad_norm": 13.892290641292924, + "learning_rate": 4.96749211070763e-05, + "loss": 2.5356, + "mean_token_accuracy": 0.4056866317987442, + "step": 100740 + }, + { + "epoch": 0.1014714339959772, + "grad_norm": 9.946780658418962, + "learning_rate": 4.967485763768526e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.41034482717514037, + "step": 100745 + }, + { + "epoch": 0.10147647004908138, + "grad_norm": 10.646127651361248, + "learning_rate": 4.967479416214395e-05, + "loss": 2.6984, + "mean_token_accuracy": 0.441379314661026, + "step": 100750 + }, + { + "epoch": 0.10148150610218555, + "grad_norm": 13.995196633381271, + "learning_rate": 4.9674730680452374e-05, + "loss": 2.2535, + "mean_token_accuracy": 0.4655172348022461, + "step": 100755 + }, + { + "epoch": 0.10148654215528972, + "grad_norm": 9.811352491622047, + "learning_rate": 4.967466719261055e-05, + "loss": 2.237, + "mean_token_accuracy": 0.46551724076271056, + "step": 100760 + }, + { + "epoch": 0.1014915782083939, + "grad_norm": 9.840966881428052, + "learning_rate": 4.9674603698618506e-05, + "loss": 2.701, + "mean_token_accuracy": 0.38965516686439516, + "step": 100765 + }, + { + "epoch": 0.10149661426149806, + "grad_norm": 9.967733088992885, + "learning_rate": 4.967454019847624e-05, + "loss": 2.4588, + "mean_token_accuracy": 0.42413793206214906, + "step": 100770 + }, + { + "epoch": 0.10150165031460223, + "grad_norm": 10.20505073137083, + "learning_rate": 4.96744766921838e-05, + "loss": 2.0905, + "mean_token_accuracy": 0.48965516686439514, + "step": 100775 + }, + { + "epoch": 0.1015066863677064, + "grad_norm": 24.730972254919475, + "learning_rate": 4.967441317974118e-05, + "loss": 3.1655, + "mean_token_accuracy": 0.3592861443758011, + "step": 100780 + }, + { + "epoch": 0.10151172242081058, + "grad_norm": 11.614018424718402, + "learning_rate": 4.96743496611484e-05, + "loss": 2.2111, + "mean_token_accuracy": 0.4379310369491577, + "step": 100785 + }, + { + "epoch": 0.10151675847391475, + "grad_norm": 10.487049522959877, + "learning_rate": 4.967428613640548e-05, + "loss": 2.4044, + "mean_token_accuracy": 0.4034482777118683, + "step": 100790 + }, + { + "epoch": 0.10152179452701893, + "grad_norm": 14.413487159466131, + "learning_rate": 4.967422260551245e-05, + "loss": 2.8713, + "mean_token_accuracy": 0.3793103516101837, + "step": 100795 + }, + { + "epoch": 0.1015268305801231, + "grad_norm": 8.448813906671417, + "learning_rate": 4.967415906846931e-05, + "loss": 2.3608, + "mean_token_accuracy": 0.42758620977401735, + "step": 100800 + }, + { + "epoch": 0.10153186663322727, + "grad_norm": 9.551361366257247, + "learning_rate": 4.9674095525276084e-05, + "loss": 2.0918, + "mean_token_accuracy": 0.4620689690113068, + "step": 100805 + }, + { + "epoch": 0.10153690268633145, + "grad_norm": 14.849401617582771, + "learning_rate": 4.967403197593279e-05, + "loss": 2.5876, + "mean_token_accuracy": 0.40689656138420105, + "step": 100810 + }, + { + "epoch": 0.10154193873943562, + "grad_norm": 12.544426863429585, + "learning_rate": 4.9673968420439445e-05, + "loss": 3.3401, + "mean_token_accuracy": 0.28965516984462736, + "step": 100815 + }, + { + "epoch": 0.1015469747925398, + "grad_norm": 10.325039203033084, + "learning_rate": 4.967390485879607e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.4344827651977539, + "step": 100820 + }, + { + "epoch": 0.10155201084564397, + "grad_norm": 13.817679325790849, + "learning_rate": 4.967384129100267e-05, + "loss": 2.5564, + "mean_token_accuracy": 0.3965517282485962, + "step": 100825 + }, + { + "epoch": 0.10155704689874814, + "grad_norm": 12.571800779946168, + "learning_rate": 4.967377771705928e-05, + "loss": 2.1879, + "mean_token_accuracy": 0.4379310369491577, + "step": 100830 + }, + { + "epoch": 0.10156208295185232, + "grad_norm": 11.72902053037505, + "learning_rate": 4.967371413696591e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.46551724076271056, + "step": 100835 + }, + { + "epoch": 0.10156711900495648, + "grad_norm": 11.235483601668271, + "learning_rate": 4.9673650550722576e-05, + "loss": 2.6906, + "mean_token_accuracy": 0.3827586203813553, + "step": 100840 + }, + { + "epoch": 0.10157215505806065, + "grad_norm": 9.032253035829793, + "learning_rate": 4.96735869583293e-05, + "loss": 2.1702, + "mean_token_accuracy": 0.49655172824859617, + "step": 100845 + }, + { + "epoch": 0.10157719111116482, + "grad_norm": 11.4376212796248, + "learning_rate": 4.9673523359786095e-05, + "loss": 2.6579, + "mean_token_accuracy": 0.4103448212146759, + "step": 100850 + }, + { + "epoch": 0.101582227164269, + "grad_norm": 9.701295805624754, + "learning_rate": 4.967345975509298e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.41379310488700866, + "step": 100855 + }, + { + "epoch": 0.10158726321737317, + "grad_norm": 9.855944809224175, + "learning_rate": 4.9673396144249975e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4068965554237366, + "step": 100860 + }, + { + "epoch": 0.10159229927047735, + "grad_norm": 8.448833720127668, + "learning_rate": 4.9673332527257097e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.443254691362381, + "step": 100865 + }, + { + "epoch": 0.10159733532358152, + "grad_norm": 10.491740421603138, + "learning_rate": 4.967326890411436e-05, + "loss": 2.5856, + "mean_token_accuracy": 0.4068965554237366, + "step": 100870 + }, + { + "epoch": 0.1016023713766857, + "grad_norm": 11.755491951521089, + "learning_rate": 4.967320527482179e-05, + "loss": 2.703, + "mean_token_accuracy": 0.43189655542373656, + "step": 100875 + }, + { + "epoch": 0.10160740742978987, + "grad_norm": 9.62708744506585, + "learning_rate": 4.967314163937939e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.4482758641242981, + "step": 100880 + }, + { + "epoch": 0.10161244348289404, + "grad_norm": 9.153020145138159, + "learning_rate": 4.967307799778719e-05, + "loss": 2.0903, + "mean_token_accuracy": 0.458620685338974, + "step": 100885 + }, + { + "epoch": 0.10161747953599821, + "grad_norm": 12.388236459140934, + "learning_rate": 4.96730143500452e-05, + "loss": 2.3104, + "mean_token_accuracy": 0.4206896543502808, + "step": 100890 + }, + { + "epoch": 0.10162251558910239, + "grad_norm": 9.773596403077843, + "learning_rate": 4.967295069615345e-05, + "loss": 2.9281, + "mean_token_accuracy": 0.4034482777118683, + "step": 100895 + }, + { + "epoch": 0.10162755164220656, + "grad_norm": 10.15066229161137, + "learning_rate": 4.967288703611194e-05, + "loss": 2.1057, + "mean_token_accuracy": 0.47241378426551817, + "step": 100900 + }, + { + "epoch": 0.10163258769531074, + "grad_norm": 10.52561256831747, + "learning_rate": 4.967282336992071e-05, + "loss": 2.0338, + "mean_token_accuracy": 0.4655172348022461, + "step": 100905 + }, + { + "epoch": 0.1016376237484149, + "grad_norm": 10.369343709503157, + "learning_rate": 4.9672759697579755e-05, + "loss": 2.2386, + "mean_token_accuracy": 0.4137930989265442, + "step": 100910 + }, + { + "epoch": 0.10164265980151907, + "grad_norm": 12.069768056683298, + "learning_rate": 4.9672696019089106e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.46382335424423216, + "step": 100915 + }, + { + "epoch": 0.10164769585462324, + "grad_norm": 10.545574118346028, + "learning_rate": 4.9672632334448774e-05, + "loss": 2.0615, + "mean_token_accuracy": 0.4517241358757019, + "step": 100920 + }, + { + "epoch": 0.10165273190772742, + "grad_norm": 15.455818397902737, + "learning_rate": 4.967256864365879e-05, + "loss": 2.646, + "mean_token_accuracy": 0.3758620619773865, + "step": 100925 + }, + { + "epoch": 0.10165776796083159, + "grad_norm": 11.98615636227642, + "learning_rate": 4.967250494671914e-05, + "loss": 2.7912, + "mean_token_accuracy": 0.37241379022598264, + "step": 100930 + }, + { + "epoch": 0.10166280401393576, + "grad_norm": 12.380631434297218, + "learning_rate": 4.967244124362987e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.4103448331356049, + "step": 100935 + }, + { + "epoch": 0.10166784006703994, + "grad_norm": 7.887126237438663, + "learning_rate": 4.9672377534391e-05, + "loss": 2.059, + "mean_token_accuracy": 0.4816697001457214, + "step": 100940 + }, + { + "epoch": 0.10167287612014411, + "grad_norm": 12.36479513938091, + "learning_rate": 4.967231381900254e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.42280701398849485, + "step": 100945 + }, + { + "epoch": 0.10167791217324829, + "grad_norm": 11.606908700053987, + "learning_rate": 4.96722500974645e-05, + "loss": 2.822, + "mean_token_accuracy": 0.3724137932062149, + "step": 100950 + }, + { + "epoch": 0.10168294822635246, + "grad_norm": 10.50018012618524, + "learning_rate": 4.96721863697769e-05, + "loss": 2.1854, + "mean_token_accuracy": 0.4954023063182831, + "step": 100955 + }, + { + "epoch": 0.10168798427945663, + "grad_norm": 15.256208905980785, + "learning_rate": 4.967212263593977e-05, + "loss": 2.9387, + "mean_token_accuracy": 0.35172412991523744, + "step": 100960 + }, + { + "epoch": 0.10169302033256081, + "grad_norm": 10.382403812413267, + "learning_rate": 4.967205889595312e-05, + "loss": 2.8376, + "mean_token_accuracy": 0.3551724165678024, + "step": 100965 + }, + { + "epoch": 0.10169805638566498, + "grad_norm": 13.240454229917177, + "learning_rate": 4.9671995149816955e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.3965517282485962, + "step": 100970 + }, + { + "epoch": 0.10170309243876915, + "grad_norm": 11.567877897320187, + "learning_rate": 4.967193139753131e-05, + "loss": 2.5932, + "mean_token_accuracy": 0.38620689511299133, + "step": 100975 + }, + { + "epoch": 0.10170812849187331, + "grad_norm": 11.109818837255643, + "learning_rate": 4.9671867639096195e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.3344827651977539, + "step": 100980 + }, + { + "epoch": 0.10171316454497749, + "grad_norm": 10.404065805924267, + "learning_rate": 4.9671803874511634e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.4459770143032074, + "step": 100985 + }, + { + "epoch": 0.10171820059808166, + "grad_norm": 11.232610393977048, + "learning_rate": 4.967174010377764e-05, + "loss": 1.9963, + "mean_token_accuracy": 0.4931034505367279, + "step": 100990 + }, + { + "epoch": 0.10172323665118584, + "grad_norm": 10.925460007176074, + "learning_rate": 4.9671676326894235e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.38620689511299133, + "step": 100995 + }, + { + "epoch": 0.10172827270429001, + "grad_norm": 15.780413641323877, + "learning_rate": 4.967161254386143e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.3999999940395355, + "step": 101000 + }, + { + "epoch": 0.10173330875739418, + "grad_norm": 13.611269464329586, + "learning_rate": 4.967154875467924e-05, + "loss": 2.2035, + "mean_token_accuracy": 0.4862069010734558, + "step": 101005 + }, + { + "epoch": 0.10173834481049836, + "grad_norm": 10.404716092969895, + "learning_rate": 4.9671484959347697e-05, + "loss": 2.2346, + "mean_token_accuracy": 0.42413793206214906, + "step": 101010 + }, + { + "epoch": 0.10174338086360253, + "grad_norm": 10.994114377956851, + "learning_rate": 4.96714211578668e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.43103448748588563, + "step": 101015 + }, + { + "epoch": 0.1017484169167067, + "grad_norm": 16.276048439422148, + "learning_rate": 4.9671357350236586e-05, + "loss": 2.8474, + "mean_token_accuracy": 0.3551724135875702, + "step": 101020 + }, + { + "epoch": 0.10175345296981088, + "grad_norm": 10.77156092862102, + "learning_rate": 4.9671293536457056e-05, + "loss": 2.3863, + "mean_token_accuracy": 0.441379314661026, + "step": 101025 + }, + { + "epoch": 0.10175848902291505, + "grad_norm": 11.03370029899864, + "learning_rate": 4.967122971652824e-05, + "loss": 2.326, + "mean_token_accuracy": 0.4034482717514038, + "step": 101030 + }, + { + "epoch": 0.10176352507601923, + "grad_norm": 10.156911610184817, + "learning_rate": 4.9671165890450146e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.42758620977401735, + "step": 101035 + }, + { + "epoch": 0.1017685611291234, + "grad_norm": 9.197896108400101, + "learning_rate": 4.967110205822279e-05, + "loss": 2.5787, + "mean_token_accuracy": 0.3896551728248596, + "step": 101040 + }, + { + "epoch": 0.10177359718222757, + "grad_norm": 8.528213144170406, + "learning_rate": 4.967103821984621e-05, + "loss": 2.774, + "mean_token_accuracy": 0.38965516686439516, + "step": 101045 + }, + { + "epoch": 0.10177863323533173, + "grad_norm": 10.679897902721834, + "learning_rate": 4.967097437532041e-05, + "loss": 2.3483, + "mean_token_accuracy": 0.4310344815254211, + "step": 101050 + }, + { + "epoch": 0.10178366928843591, + "grad_norm": 10.041133067649916, + "learning_rate": 4.9670910524645404e-05, + "loss": 2.52, + "mean_token_accuracy": 0.41034482717514037, + "step": 101055 + }, + { + "epoch": 0.10178870534154008, + "grad_norm": 9.994290211120356, + "learning_rate": 4.967084666782121e-05, + "loss": 2.1802, + "mean_token_accuracy": 0.4344827592372894, + "step": 101060 + }, + { + "epoch": 0.10179374139464425, + "grad_norm": 14.431225562379563, + "learning_rate": 4.967078280484786e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.4620689570903778, + "step": 101065 + }, + { + "epoch": 0.10179877744774843, + "grad_norm": 10.966927455419945, + "learning_rate": 4.967071893572535e-05, + "loss": 2.1619, + "mean_token_accuracy": 0.4448275864124298, + "step": 101070 + }, + { + "epoch": 0.1018038135008526, + "grad_norm": 10.369541192564483, + "learning_rate": 4.9670655060453706e-05, + "loss": 2.3112, + "mean_token_accuracy": 0.4413793087005615, + "step": 101075 + }, + { + "epoch": 0.10180884955395678, + "grad_norm": 11.061034657124882, + "learning_rate": 4.967059117903296e-05, + "loss": 2.0565, + "mean_token_accuracy": 0.4569267988204956, + "step": 101080 + }, + { + "epoch": 0.10181388560706095, + "grad_norm": 10.587146592194774, + "learning_rate": 4.96705272914631e-05, + "loss": 2.4311, + "mean_token_accuracy": 0.3965517163276672, + "step": 101085 + }, + { + "epoch": 0.10181892166016512, + "grad_norm": 12.934031000544302, + "learning_rate": 4.967046339774418e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.4551724135875702, + "step": 101090 + }, + { + "epoch": 0.1018239577132693, + "grad_norm": 11.918688664572638, + "learning_rate": 4.967039949787619e-05, + "loss": 2.3961, + "mean_token_accuracy": 0.42068964838981626, + "step": 101095 + }, + { + "epoch": 0.10182899376637347, + "grad_norm": 11.264872050838905, + "learning_rate": 4.967033559185915e-05, + "loss": 2.0866, + "mean_token_accuracy": 0.5068965435028077, + "step": 101100 + }, + { + "epoch": 0.10183402981947764, + "grad_norm": 10.289047563676268, + "learning_rate": 4.9670271679693096e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.4137930989265442, + "step": 101105 + }, + { + "epoch": 0.10183906587258182, + "grad_norm": 9.837015291585804, + "learning_rate": 4.967020776137803e-05, + "loss": 2.408, + "mean_token_accuracy": 0.4206896543502808, + "step": 101110 + }, + { + "epoch": 0.10184410192568599, + "grad_norm": 9.853966386804649, + "learning_rate": 4.967014383691398e-05, + "loss": 2.5363, + "mean_token_accuracy": 0.4154869973659515, + "step": 101115 + }, + { + "epoch": 0.10184913797879015, + "grad_norm": 10.295761620117622, + "learning_rate": 4.967007990630095e-05, + "loss": 2.3673, + "mean_token_accuracy": 0.441379314661026, + "step": 101120 + }, + { + "epoch": 0.10185417403189433, + "grad_norm": 10.058262142379462, + "learning_rate": 4.967001596953897e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.4758620738983154, + "step": 101125 + }, + { + "epoch": 0.1018592100849985, + "grad_norm": 11.019428454806961, + "learning_rate": 4.966995202662805e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.39655172228813174, + "step": 101130 + }, + { + "epoch": 0.10186424613810267, + "grad_norm": 10.24806970428062, + "learning_rate": 4.966988807756822e-05, + "loss": 2.5154, + "mean_token_accuracy": 0.4361161530017853, + "step": 101135 + }, + { + "epoch": 0.10186928219120685, + "grad_norm": 12.156107699066538, + "learning_rate": 4.9669824122359475e-05, + "loss": 2.6683, + "mean_token_accuracy": 0.4172413766384125, + "step": 101140 + }, + { + "epoch": 0.10187431824431102, + "grad_norm": 7.389442963778349, + "learning_rate": 4.9669760161001855e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.4655172348022461, + "step": 101145 + }, + { + "epoch": 0.1018793542974152, + "grad_norm": 11.426376677211143, + "learning_rate": 4.9669696193495365e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.4360556542873383, + "step": 101150 + }, + { + "epoch": 0.10188439035051937, + "grad_norm": 10.553610489473256, + "learning_rate": 4.966963221984003e-05, + "loss": 2.5471, + "mean_token_accuracy": 0.42413793206214906, + "step": 101155 + }, + { + "epoch": 0.10188942640362354, + "grad_norm": 12.337645816576272, + "learning_rate": 4.966956824003587e-05, + "loss": 2.5839, + "mean_token_accuracy": 0.4482758641242981, + "step": 101160 + }, + { + "epoch": 0.10189446245672772, + "grad_norm": 46.4937495785292, + "learning_rate": 4.9669504254082895e-05, + "loss": 2.8224, + "mean_token_accuracy": 0.4068965494632721, + "step": 101165 + }, + { + "epoch": 0.10189949850983189, + "grad_norm": 10.495432188970655, + "learning_rate": 4.9669440261981116e-05, + "loss": 2.0261, + "mean_token_accuracy": 0.46896552443504336, + "step": 101170 + }, + { + "epoch": 0.10190453456293606, + "grad_norm": 10.607509788634863, + "learning_rate": 4.966937626373057e-05, + "loss": 2.152, + "mean_token_accuracy": 0.47489412426948546, + "step": 101175 + }, + { + "epoch": 0.10190957061604024, + "grad_norm": 9.70772759813328, + "learning_rate": 4.9669312259331255e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.4413793087005615, + "step": 101180 + }, + { + "epoch": 0.10191460666914441, + "grad_norm": 14.423752812546205, + "learning_rate": 4.966924824878321e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.432667875289917, + "step": 101185 + }, + { + "epoch": 0.10191964272224857, + "grad_norm": 7.629462158312488, + "learning_rate": 4.966918423208644e-05, + "loss": 2.7109, + "mean_token_accuracy": 0.4093768924474716, + "step": 101190 + }, + { + "epoch": 0.10192467877535275, + "grad_norm": 16.061545168444237, + "learning_rate": 4.966912020924095e-05, + "loss": 2.7195, + "mean_token_accuracy": 0.41379310488700866, + "step": 101195 + }, + { + "epoch": 0.10192971482845692, + "grad_norm": 10.133584347045788, + "learning_rate": 4.966905618024679e-05, + "loss": 1.9811, + "mean_token_accuracy": 0.4931034505367279, + "step": 101200 + }, + { + "epoch": 0.10193475088156109, + "grad_norm": 10.662707216546274, + "learning_rate": 4.966899214510395e-05, + "loss": 2.2405, + "mean_token_accuracy": 0.42758620977401735, + "step": 101205 + }, + { + "epoch": 0.10193978693466527, + "grad_norm": 12.168320769300239, + "learning_rate": 4.966892810381246e-05, + "loss": 2.5598, + "mean_token_accuracy": 0.3965517163276672, + "step": 101210 + }, + { + "epoch": 0.10194482298776944, + "grad_norm": 11.716755528533875, + "learning_rate": 4.9668864056372336e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.42413792610168455, + "step": 101215 + }, + { + "epoch": 0.10194985904087361, + "grad_norm": 10.817554107088853, + "learning_rate": 4.966880000278359e-05, + "loss": 2.6313, + "mean_token_accuracy": 0.41379311084747317, + "step": 101220 + }, + { + "epoch": 0.10195489509397779, + "grad_norm": 10.013534855585563, + "learning_rate": 4.966873594304625e-05, + "loss": 2.6696, + "mean_token_accuracy": 0.3896551787853241, + "step": 101225 + }, + { + "epoch": 0.10195993114708196, + "grad_norm": 13.034660912578813, + "learning_rate": 4.9668671877160325e-05, + "loss": 2.8, + "mean_token_accuracy": 0.4137930989265442, + "step": 101230 + }, + { + "epoch": 0.10196496720018614, + "grad_norm": 10.858294969111173, + "learning_rate": 4.9668607805125836e-05, + "loss": 3.1224, + "mean_token_accuracy": 0.3068965435028076, + "step": 101235 + }, + { + "epoch": 0.10197000325329031, + "grad_norm": 10.073687090211894, + "learning_rate": 4.966854372694281e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.4103448331356049, + "step": 101240 + }, + { + "epoch": 0.10197503930639448, + "grad_norm": 11.19879222105325, + "learning_rate": 4.966847964261124e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.41379310488700866, + "step": 101245 + }, + { + "epoch": 0.10198007535949866, + "grad_norm": 13.179217835003104, + "learning_rate": 4.966841555213117e-05, + "loss": 2.9678, + "mean_token_accuracy": 0.42068965137004855, + "step": 101250 + }, + { + "epoch": 0.10198511141260283, + "grad_norm": 10.705079436034646, + "learning_rate": 4.96683514555026e-05, + "loss": 2.8048, + "mean_token_accuracy": 0.3413793116807938, + "step": 101255 + }, + { + "epoch": 0.10199014746570699, + "grad_norm": 9.618498109629163, + "learning_rate": 4.9668287352725564e-05, + "loss": 2.4731, + "mean_token_accuracy": 0.43980641961097716, + "step": 101260 + }, + { + "epoch": 0.10199518351881116, + "grad_norm": 10.437419519007902, + "learning_rate": 4.966822324380006e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.44827587008476255, + "step": 101265 + }, + { + "epoch": 0.10200021957191534, + "grad_norm": 10.677167076755813, + "learning_rate": 4.9668159128726126e-05, + "loss": 2.528, + "mean_token_accuracy": 0.39655172228813174, + "step": 101270 + }, + { + "epoch": 0.10200525562501951, + "grad_norm": 9.050546650387743, + "learning_rate": 4.966809500750377e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.43793103098869324, + "step": 101275 + }, + { + "epoch": 0.10201029167812369, + "grad_norm": 11.680354552411675, + "learning_rate": 4.9668030880133004e-05, + "loss": 2.4028, + "mean_token_accuracy": 0.4413793087005615, + "step": 101280 + }, + { + "epoch": 0.10201532773122786, + "grad_norm": 10.308090073136523, + "learning_rate": 4.966796674661385e-05, + "loss": 2.6891, + "mean_token_accuracy": 0.3931034505367279, + "step": 101285 + }, + { + "epoch": 0.10202036378433203, + "grad_norm": 11.34737519212835, + "learning_rate": 4.966790260694634e-05, + "loss": 2.4255, + "mean_token_accuracy": 0.41034482717514037, + "step": 101290 + }, + { + "epoch": 0.1020253998374362, + "grad_norm": 10.515134928771163, + "learning_rate": 4.966783846113047e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.42413792610168455, + "step": 101295 + }, + { + "epoch": 0.10203043589054038, + "grad_norm": 12.135507006126897, + "learning_rate": 4.9667774309166275e-05, + "loss": 2.5766, + "mean_token_accuracy": 0.4206896543502808, + "step": 101300 + }, + { + "epoch": 0.10203547194364455, + "grad_norm": 12.743260954649752, + "learning_rate": 4.966771015105376e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.43448275327682495, + "step": 101305 + }, + { + "epoch": 0.10204050799674873, + "grad_norm": 9.864209041939874, + "learning_rate": 4.966764598679295e-05, + "loss": 2.9815, + "mean_token_accuracy": 0.4, + "step": 101310 + }, + { + "epoch": 0.1020455440498529, + "grad_norm": 10.034103883878783, + "learning_rate": 4.966758181638386e-05, + "loss": 2.4889, + "mean_token_accuracy": 0.41379310488700866, + "step": 101315 + }, + { + "epoch": 0.10205058010295708, + "grad_norm": 11.913055040461813, + "learning_rate": 4.96675176398265e-05, + "loss": 2.4489, + "mean_token_accuracy": 0.4418719232082367, + "step": 101320 + }, + { + "epoch": 0.10205561615606125, + "grad_norm": 11.15737206432806, + "learning_rate": 4.9667453457120896e-05, + "loss": 2.4279, + "mean_token_accuracy": 0.441379314661026, + "step": 101325 + }, + { + "epoch": 0.10206065220916541, + "grad_norm": 11.326625425440515, + "learning_rate": 4.966738926826708e-05, + "loss": 2.834, + "mean_token_accuracy": 0.341379314661026, + "step": 101330 + }, + { + "epoch": 0.10206568826226958, + "grad_norm": 15.879808073577067, + "learning_rate": 4.9667325073265056e-05, + "loss": 2.1314, + "mean_token_accuracy": 0.46551724076271056, + "step": 101335 + }, + { + "epoch": 0.10207072431537376, + "grad_norm": 13.106315756663285, + "learning_rate": 4.966726087211483e-05, + "loss": 3.0851, + "mean_token_accuracy": 0.3448275804519653, + "step": 101340 + }, + { + "epoch": 0.10207576036847793, + "grad_norm": 12.24190433288864, + "learning_rate": 4.966719666481644e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.44827585816383364, + "step": 101345 + }, + { + "epoch": 0.1020807964215821, + "grad_norm": 13.199450937300666, + "learning_rate": 4.966713245136989e-05, + "loss": 2.3746, + "mean_token_accuracy": 0.43103448748588563, + "step": 101350 + }, + { + "epoch": 0.10208583247468628, + "grad_norm": 11.067275679997097, + "learning_rate": 4.966706823177521e-05, + "loss": 2.3859, + "mean_token_accuracy": 0.44827585816383364, + "step": 101355 + }, + { + "epoch": 0.10209086852779045, + "grad_norm": 9.583815671735127, + "learning_rate": 4.9667004006032394e-05, + "loss": 2.3475, + "mean_token_accuracy": 0.441379314661026, + "step": 101360 + }, + { + "epoch": 0.10209590458089463, + "grad_norm": 12.581979630930846, + "learning_rate": 4.96669397741415e-05, + "loss": 2.4919, + "mean_token_accuracy": 0.4379310429096222, + "step": 101365 + }, + { + "epoch": 0.1021009406339988, + "grad_norm": 11.512231942293987, + "learning_rate": 4.96668755361025e-05, + "loss": 2.1641, + "mean_token_accuracy": 0.48620688915252686, + "step": 101370 + }, + { + "epoch": 0.10210597668710297, + "grad_norm": 10.443468656106361, + "learning_rate": 4.966681129191545e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.4586206912994385, + "step": 101375 + }, + { + "epoch": 0.10211101274020715, + "grad_norm": 10.357880106524657, + "learning_rate": 4.9666747041580344e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.4172413766384125, + "step": 101380 + }, + { + "epoch": 0.10211604879331132, + "grad_norm": 9.30325090885326, + "learning_rate": 4.966668278509722e-05, + "loss": 2.6331, + "mean_token_accuracy": 0.4344827592372894, + "step": 101385 + }, + { + "epoch": 0.1021210848464155, + "grad_norm": 11.013394465380355, + "learning_rate": 4.9666618522466075e-05, + "loss": 2.4513, + "mean_token_accuracy": 0.43103448748588563, + "step": 101390 + }, + { + "epoch": 0.10212612089951967, + "grad_norm": 8.862121802280159, + "learning_rate": 4.9666554253686934e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.4551724135875702, + "step": 101395 + }, + { + "epoch": 0.10213115695262383, + "grad_norm": 10.178767160610892, + "learning_rate": 4.966648997875982e-05, + "loss": 2.0012, + "mean_token_accuracy": 0.5145320236682892, + "step": 101400 + }, + { + "epoch": 0.102136193005728, + "grad_norm": 10.046952452210524, + "learning_rate": 4.9666425697684746e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.4517241358757019, + "step": 101405 + }, + { + "epoch": 0.10214122905883218, + "grad_norm": 11.009588532572721, + "learning_rate": 4.9666361410461734e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.404718691110611, + "step": 101410 + }, + { + "epoch": 0.10214626511193635, + "grad_norm": 13.606734697560686, + "learning_rate": 4.966629711709079e-05, + "loss": 2.4338, + "mean_token_accuracy": 0.4034482717514038, + "step": 101415 + }, + { + "epoch": 0.10215130116504052, + "grad_norm": 11.479368690083962, + "learning_rate": 4.966623281757196e-05, + "loss": 2.6805, + "mean_token_accuracy": 0.38965516686439516, + "step": 101420 + }, + { + "epoch": 0.1021563372181447, + "grad_norm": 10.971211155835643, + "learning_rate": 4.966616851190522e-05, + "loss": 2.8029, + "mean_token_accuracy": 0.33793103098869326, + "step": 101425 + }, + { + "epoch": 0.10216137327124887, + "grad_norm": 10.165815502805659, + "learning_rate": 4.9666104200090615e-05, + "loss": 2.6893, + "mean_token_accuracy": 0.37586206793785093, + "step": 101430 + }, + { + "epoch": 0.10216640932435304, + "grad_norm": 10.671299134544501, + "learning_rate": 4.966603988212817e-05, + "loss": 3.1933, + "mean_token_accuracy": 0.35862069129943847, + "step": 101435 + }, + { + "epoch": 0.10217144537745722, + "grad_norm": 11.150517995312756, + "learning_rate": 4.9665975558017886e-05, + "loss": 2.2069, + "mean_token_accuracy": 0.4586206912994385, + "step": 101440 + }, + { + "epoch": 0.10217648143056139, + "grad_norm": 8.719926713472622, + "learning_rate": 4.966591122775978e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.3931034505367279, + "step": 101445 + }, + { + "epoch": 0.10218151748366557, + "grad_norm": 10.55459519302372, + "learning_rate": 4.966584689135388e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.40689654350280763, + "step": 101450 + }, + { + "epoch": 0.10218655353676974, + "grad_norm": 12.966120515592092, + "learning_rate": 4.966578254880021e-05, + "loss": 2.1867, + "mean_token_accuracy": 0.4586206912994385, + "step": 101455 + }, + { + "epoch": 0.10219158958987391, + "grad_norm": 15.400451831114731, + "learning_rate": 4.966571820009876e-05, + "loss": 2.7093, + "mean_token_accuracy": 0.3586206793785095, + "step": 101460 + }, + { + "epoch": 0.10219662564297809, + "grad_norm": 10.905507874059488, + "learning_rate": 4.966565384524957e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.4931034445762634, + "step": 101465 + }, + { + "epoch": 0.10220166169608225, + "grad_norm": 8.854349295183948, + "learning_rate": 4.966558948425266e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.4413793087005615, + "step": 101470 + }, + { + "epoch": 0.10220669774918642, + "grad_norm": 14.25834374638957, + "learning_rate": 4.9665525117108033e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.42262552976608275, + "step": 101475 + }, + { + "epoch": 0.1022117338022906, + "grad_norm": 10.199296782329531, + "learning_rate": 4.9665460743815726e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.4620689630508423, + "step": 101480 + }, + { + "epoch": 0.10221676985539477, + "grad_norm": 9.50060042877304, + "learning_rate": 4.9665396364375735e-05, + "loss": 2.4487, + "mean_token_accuracy": 0.3517241358757019, + "step": 101485 + }, + { + "epoch": 0.10222180590849894, + "grad_norm": 11.434091340228274, + "learning_rate": 4.9665331978788094e-05, + "loss": 2.653, + "mean_token_accuracy": 0.38620689511299133, + "step": 101490 + }, + { + "epoch": 0.10222684196160312, + "grad_norm": 11.805984698003433, + "learning_rate": 4.966526758705281e-05, + "loss": 2.8318, + "mean_token_accuracy": 0.34827586114406583, + "step": 101495 + }, + { + "epoch": 0.10223187801470729, + "grad_norm": 10.085346775076433, + "learning_rate": 4.9665203189169916e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.39310344457626345, + "step": 101500 + }, + { + "epoch": 0.10223691406781146, + "grad_norm": 11.17373884407409, + "learning_rate": 4.966513878513941e-05, + "loss": 2.3601, + "mean_token_accuracy": 0.4068965554237366, + "step": 101505 + }, + { + "epoch": 0.10224195012091564, + "grad_norm": 12.515360738995351, + "learning_rate": 4.966507437496132e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.41161524653434756, + "step": 101510 + }, + { + "epoch": 0.10224698617401981, + "grad_norm": 9.474818718156941, + "learning_rate": 4.9665009958635675e-05, + "loss": 2.5922, + "mean_token_accuracy": 0.40877193212509155, + "step": 101515 + }, + { + "epoch": 0.10225202222712398, + "grad_norm": 12.475417642394666, + "learning_rate": 4.966494553616248e-05, + "loss": 2.4751, + "mean_token_accuracy": 0.46896551847457885, + "step": 101520 + }, + { + "epoch": 0.10225705828022816, + "grad_norm": 11.589868268843718, + "learning_rate": 4.966488110754175e-05, + "loss": 2.5717, + "mean_token_accuracy": 0.39310344457626345, + "step": 101525 + }, + { + "epoch": 0.10226209433333233, + "grad_norm": 11.472717688386036, + "learning_rate": 4.9664816672773506e-05, + "loss": 2.0792, + "mean_token_accuracy": 0.4689655125141144, + "step": 101530 + }, + { + "epoch": 0.1022671303864365, + "grad_norm": 10.237386807476732, + "learning_rate": 4.966475223185777e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.40344828367233276, + "step": 101535 + }, + { + "epoch": 0.10227216643954067, + "grad_norm": 11.862567961789917, + "learning_rate": 4.9664687784794555e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.41379310488700866, + "step": 101540 + }, + { + "epoch": 0.10227720249264484, + "grad_norm": 15.204933789485153, + "learning_rate": 4.9664623331583876e-05, + "loss": 2.0477, + "mean_token_accuracy": 0.46394434571266174, + "step": 101545 + }, + { + "epoch": 0.10228223854574901, + "grad_norm": 8.825379391394835, + "learning_rate": 4.966455887222577e-05, + "loss": 2.4553, + "mean_token_accuracy": 0.40000000298023225, + "step": 101550 + }, + { + "epoch": 0.10228727459885319, + "grad_norm": 11.297179166233379, + "learning_rate": 4.966449440672023e-05, + "loss": 2.472, + "mean_token_accuracy": 0.41724138259887694, + "step": 101555 + }, + { + "epoch": 0.10229231065195736, + "grad_norm": 12.19411521625494, + "learning_rate": 4.966442993506729e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.37931033968925476, + "step": 101560 + }, + { + "epoch": 0.10229734670506153, + "grad_norm": 11.33800025449437, + "learning_rate": 4.966436545726696e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.4103448331356049, + "step": 101565 + }, + { + "epoch": 0.10230238275816571, + "grad_norm": 10.279873596176971, + "learning_rate": 4.966430097331926e-05, + "loss": 2.1518, + "mean_token_accuracy": 0.47241379618644713, + "step": 101570 + }, + { + "epoch": 0.10230741881126988, + "grad_norm": 11.673115134047578, + "learning_rate": 4.9664236483224215e-05, + "loss": 2.5192, + "mean_token_accuracy": 0.44827585816383364, + "step": 101575 + }, + { + "epoch": 0.10231245486437406, + "grad_norm": 10.944195663423022, + "learning_rate": 4.966417198698183e-05, + "loss": 2.3235, + "mean_token_accuracy": 0.42068966031074523, + "step": 101580 + }, + { + "epoch": 0.10231749091747823, + "grad_norm": 10.889442432948343, + "learning_rate": 4.966410748459213e-05, + "loss": 2.6649, + "mean_token_accuracy": 0.38620689511299133, + "step": 101585 + }, + { + "epoch": 0.1023225269705824, + "grad_norm": 10.49645800974054, + "learning_rate": 4.966404297605514e-05, + "loss": 2.9, + "mean_token_accuracy": 0.41724138259887694, + "step": 101590 + }, + { + "epoch": 0.10232756302368658, + "grad_norm": 14.745876133236068, + "learning_rate": 4.9663978461370854e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.4011494219303131, + "step": 101595 + }, + { + "epoch": 0.10233259907679075, + "grad_norm": 9.206702414430856, + "learning_rate": 4.966391394053931e-05, + "loss": 2.6706, + "mean_token_accuracy": 0.39310344457626345, + "step": 101600 + }, + { + "epoch": 0.10233763512989492, + "grad_norm": 10.575541480541212, + "learning_rate": 4.9663849413560534e-05, + "loss": 2.3481, + "mean_token_accuracy": 0.44827585816383364, + "step": 101605 + }, + { + "epoch": 0.10234267118299908, + "grad_norm": 11.455111239494304, + "learning_rate": 4.966378488043452e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.46206896007061005, + "step": 101610 + }, + { + "epoch": 0.10234770723610326, + "grad_norm": 10.099946547297618, + "learning_rate": 4.96637203411613e-05, + "loss": 2.3926, + "mean_token_accuracy": 0.42758620977401735, + "step": 101615 + }, + { + "epoch": 0.10235274328920743, + "grad_norm": 10.492093904601548, + "learning_rate": 4.9663655795740895e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.45862067937850953, + "step": 101620 + }, + { + "epoch": 0.1023577793423116, + "grad_norm": 10.087200804485107, + "learning_rate": 4.966359124417331e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.4379310369491577, + "step": 101625 + }, + { + "epoch": 0.10236281539541578, + "grad_norm": 11.9923446371806, + "learning_rate": 4.9663526686458575e-05, + "loss": 2.1371, + "mean_token_accuracy": 0.4689655125141144, + "step": 101630 + }, + { + "epoch": 0.10236785144851995, + "grad_norm": 18.498692301792317, + "learning_rate": 4.96634621225967e-05, + "loss": 2.557, + "mean_token_accuracy": 0.379310342669487, + "step": 101635 + }, + { + "epoch": 0.10237288750162413, + "grad_norm": 11.751099973737002, + "learning_rate": 4.9663397552587715e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.4275862157344818, + "step": 101640 + }, + { + "epoch": 0.1023779235547283, + "grad_norm": 9.677773184802712, + "learning_rate": 4.966333297643161e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.41724138259887694, + "step": 101645 + }, + { + "epoch": 0.10238295960783247, + "grad_norm": 11.912300075149359, + "learning_rate": 4.966326839412844e-05, + "loss": 2.4573, + "mean_token_accuracy": 0.4, + "step": 101650 + }, + { + "epoch": 0.10238799566093665, + "grad_norm": 10.07874219320994, + "learning_rate": 4.96632038056782e-05, + "loss": 2.4578, + "mean_token_accuracy": 0.4225045323371887, + "step": 101655 + }, + { + "epoch": 0.10239303171404082, + "grad_norm": 10.641358592510718, + "learning_rate": 4.966313921108091e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.4310344815254211, + "step": 101660 + }, + { + "epoch": 0.102398067767145, + "grad_norm": 11.003416098229291, + "learning_rate": 4.966307461033659e-05, + "loss": 2.5566, + "mean_token_accuracy": 0.4172413766384125, + "step": 101665 + }, + { + "epoch": 0.10240310382024917, + "grad_norm": 12.221158778065156, + "learning_rate": 4.9663010003445265e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.4413793087005615, + "step": 101670 + }, + { + "epoch": 0.10240813987335334, + "grad_norm": 15.358813467993949, + "learning_rate": 4.9662945390406936e-05, + "loss": 2.9863, + "mean_token_accuracy": 0.3529340624809265, + "step": 101675 + }, + { + "epoch": 0.1024131759264575, + "grad_norm": 10.354403483886225, + "learning_rate": 4.966288077122164e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.4034482717514038, + "step": 101680 + }, + { + "epoch": 0.10241821197956168, + "grad_norm": 15.36691114722628, + "learning_rate": 4.966281614588938e-05, + "loss": 2.6192, + "mean_token_accuracy": 0.4517241299152374, + "step": 101685 + }, + { + "epoch": 0.10242324803266585, + "grad_norm": 11.575804384082033, + "learning_rate": 4.966275151441019e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.4206896543502808, + "step": 101690 + }, + { + "epoch": 0.10242828408577002, + "grad_norm": 8.582409374530396, + "learning_rate": 4.966268687678407e-05, + "loss": 2.1808, + "mean_token_accuracy": 0.5034482777118683, + "step": 101695 + }, + { + "epoch": 0.1024333201388742, + "grad_norm": 11.188238898730267, + "learning_rate": 4.9662622233011054e-05, + "loss": 2.7465, + "mean_token_accuracy": 0.39310344457626345, + "step": 101700 + }, + { + "epoch": 0.10243835619197837, + "grad_norm": 12.498810735958163, + "learning_rate": 4.966255758309115e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.4034482717514038, + "step": 101705 + }, + { + "epoch": 0.10244339224508255, + "grad_norm": 11.42433198736133, + "learning_rate": 4.966249292702436e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.41379311084747317, + "step": 101710 + }, + { + "epoch": 0.10244842829818672, + "grad_norm": 11.08192715069187, + "learning_rate": 4.966242826481074e-05, + "loss": 2.6651, + "mean_token_accuracy": 0.3911675751209259, + "step": 101715 + }, + { + "epoch": 0.1024534643512909, + "grad_norm": 11.88239101660398, + "learning_rate": 4.9662363596450284e-05, + "loss": 2.4092, + "mean_token_accuracy": 0.4517241358757019, + "step": 101720 + }, + { + "epoch": 0.10245850040439507, + "grad_norm": 11.594731320782449, + "learning_rate": 4.966229892194301e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.4379310369491577, + "step": 101725 + }, + { + "epoch": 0.10246353645749924, + "grad_norm": 9.138191817084461, + "learning_rate": 4.966223424128894e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.401935875415802, + "step": 101730 + }, + { + "epoch": 0.10246857251060341, + "grad_norm": 11.059292015228092, + "learning_rate": 4.96621695544881e-05, + "loss": 2.6863, + "mean_token_accuracy": 0.37241379022598264, + "step": 101735 + }, + { + "epoch": 0.10247360856370759, + "grad_norm": 8.519398414335633, + "learning_rate": 4.966210486154049e-05, + "loss": 2.2333, + "mean_token_accuracy": 0.48965516686439514, + "step": 101740 + }, + { + "epoch": 0.10247864461681176, + "grad_norm": 9.061208460313637, + "learning_rate": 4.966204016244614e-05, + "loss": 2.4709, + "mean_token_accuracy": 0.44827585816383364, + "step": 101745 + }, + { + "epoch": 0.10248368066991592, + "grad_norm": 9.70373673018463, + "learning_rate": 4.9661975457205066e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.4551724135875702, + "step": 101750 + }, + { + "epoch": 0.1024887167230201, + "grad_norm": 7.944684320234915, + "learning_rate": 4.966191074581729e-05, + "loss": 2.1249, + "mean_token_accuracy": 0.5034482717514038, + "step": 101755 + }, + { + "epoch": 0.10249375277612427, + "grad_norm": 9.808142779752208, + "learning_rate": 4.966184602828282e-05, + "loss": 2.1705, + "mean_token_accuracy": 0.4758620738983154, + "step": 101760 + }, + { + "epoch": 0.10249878882922844, + "grad_norm": 12.081405100812802, + "learning_rate": 4.9661781304601675e-05, + "loss": 2.6282, + "mean_token_accuracy": 0.3862069010734558, + "step": 101765 + }, + { + "epoch": 0.10250382488233262, + "grad_norm": 15.460521455990698, + "learning_rate": 4.966171657477389e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.4034482777118683, + "step": 101770 + }, + { + "epoch": 0.10250886093543679, + "grad_norm": 11.007469662805402, + "learning_rate": 4.9661651838799453e-05, + "loss": 2.5487, + "mean_token_accuracy": 0.4034482777118683, + "step": 101775 + }, + { + "epoch": 0.10251389698854096, + "grad_norm": 11.458407422322827, + "learning_rate": 4.966158709667841e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.441379314661026, + "step": 101780 + }, + { + "epoch": 0.10251893304164514, + "grad_norm": 10.868045266672093, + "learning_rate": 4.966152234841078e-05, + "loss": 2.8839, + "mean_token_accuracy": 0.3896551728248596, + "step": 101785 + }, + { + "epoch": 0.10252396909474931, + "grad_norm": 10.56471236106937, + "learning_rate": 4.9661457593996546e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.38965516686439516, + "step": 101790 + }, + { + "epoch": 0.10252900514785349, + "grad_norm": 14.95486484000657, + "learning_rate": 4.966139283343576e-05, + "loss": 2.7499, + "mean_token_accuracy": 0.3931034475564957, + "step": 101795 + }, + { + "epoch": 0.10253404120095766, + "grad_norm": 10.271916892603645, + "learning_rate": 4.966132806672843e-05, + "loss": 2.3544, + "mean_token_accuracy": 0.48965518474578856, + "step": 101800 + }, + { + "epoch": 0.10253907725406183, + "grad_norm": 12.500649904038099, + "learning_rate": 4.9661263293874575e-05, + "loss": 2.3332, + "mean_token_accuracy": 0.4413793087005615, + "step": 101805 + }, + { + "epoch": 0.10254411330716601, + "grad_norm": 10.326865659565252, + "learning_rate": 4.9661198514874194e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.4241379380226135, + "step": 101810 + }, + { + "epoch": 0.10254914936027018, + "grad_norm": 9.822274304239823, + "learning_rate": 4.966113372972734e-05, + "loss": 2.1969, + "mean_token_accuracy": 0.48517847061157227, + "step": 101815 + }, + { + "epoch": 0.10255418541337434, + "grad_norm": 14.33095604089978, + "learning_rate": 4.966106893843401e-05, + "loss": 2.794, + "mean_token_accuracy": 0.38275861740112305, + "step": 101820 + }, + { + "epoch": 0.10255922146647851, + "grad_norm": 10.75770010849827, + "learning_rate": 4.9661004140994227e-05, + "loss": 2.4954, + "mean_token_accuracy": 0.41379310488700866, + "step": 101825 + }, + { + "epoch": 0.10256425751958269, + "grad_norm": 17.115912407385828, + "learning_rate": 4.9660939337408e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.4103448212146759, + "step": 101830 + }, + { + "epoch": 0.10256929357268686, + "grad_norm": 13.939205403960424, + "learning_rate": 4.966087452767536e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.4172413766384125, + "step": 101835 + }, + { + "epoch": 0.10257432962579104, + "grad_norm": 11.497435487720708, + "learning_rate": 4.9660809711796306e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.4310344815254211, + "step": 101840 + }, + { + "epoch": 0.10257936567889521, + "grad_norm": 9.858974802556869, + "learning_rate": 4.966074488977088e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.4344827592372894, + "step": 101845 + }, + { + "epoch": 0.10258440173199938, + "grad_norm": 10.65864954646837, + "learning_rate": 4.9660680061599084e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.4206896543502808, + "step": 101850 + }, + { + "epoch": 0.10258943778510356, + "grad_norm": 11.426020433645894, + "learning_rate": 4.9660615227280954e-05, + "loss": 2.4336, + "mean_token_accuracy": 0.4413793087005615, + "step": 101855 + }, + { + "epoch": 0.10259447383820773, + "grad_norm": 11.348431755087745, + "learning_rate": 4.966055038681648e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.4379310250282288, + "step": 101860 + }, + { + "epoch": 0.1025995098913119, + "grad_norm": 9.167094048011469, + "learning_rate": 4.96604855402057e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.44682395458221436, + "step": 101865 + }, + { + "epoch": 0.10260454594441608, + "grad_norm": 12.77370967740232, + "learning_rate": 4.9660420687448624e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.38275861740112305, + "step": 101870 + }, + { + "epoch": 0.10260958199752025, + "grad_norm": 11.39784735322758, + "learning_rate": 4.9660355828545274e-05, + "loss": 2.7993, + "mean_token_accuracy": 0.3896551787853241, + "step": 101875 + }, + { + "epoch": 0.10261461805062443, + "grad_norm": 9.42788743546398, + "learning_rate": 4.966029096349567e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.417241370677948, + "step": 101880 + }, + { + "epoch": 0.1026196541037286, + "grad_norm": 16.510028063328747, + "learning_rate": 4.966022609229982e-05, + "loss": 2.8042, + "mean_token_accuracy": 0.36206896901130675, + "step": 101885 + }, + { + "epoch": 0.10262469015683276, + "grad_norm": 10.321451525604472, + "learning_rate": 4.9660161214957754e-05, + "loss": 2.3351, + "mean_token_accuracy": 0.48620688915252686, + "step": 101890 + }, + { + "epoch": 0.10262972620993693, + "grad_norm": 9.621585518575282, + "learning_rate": 4.966009633146948e-05, + "loss": 1.9495, + "mean_token_accuracy": 0.4965517222881317, + "step": 101895 + }, + { + "epoch": 0.10263476226304111, + "grad_norm": 10.22010924434354, + "learning_rate": 4.9660031441835025e-05, + "loss": 2.2038, + "mean_token_accuracy": 0.5034482657909394, + "step": 101900 + }, + { + "epoch": 0.10263979831614528, + "grad_norm": 9.859462023288978, + "learning_rate": 4.96599665460544e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.4413793087005615, + "step": 101905 + }, + { + "epoch": 0.10264483436924945, + "grad_norm": 10.978269983895451, + "learning_rate": 4.9659901644127626e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.39098607897758486, + "step": 101910 + }, + { + "epoch": 0.10264987042235363, + "grad_norm": 12.484107422676011, + "learning_rate": 4.965983673605472e-05, + "loss": 2.8407, + "mean_token_accuracy": 0.3827586233615875, + "step": 101915 + }, + { + "epoch": 0.1026549064754578, + "grad_norm": 9.319840339828954, + "learning_rate": 4.9659771821835705e-05, + "loss": 2.1539, + "mean_token_accuracy": 0.4492610812187195, + "step": 101920 + }, + { + "epoch": 0.10265994252856198, + "grad_norm": 18.990370251890706, + "learning_rate": 4.9659706901470594e-05, + "loss": 2.5973, + "mean_token_accuracy": 0.4137930989265442, + "step": 101925 + }, + { + "epoch": 0.10266497858166615, + "grad_norm": 11.767563176004774, + "learning_rate": 4.96596419749594e-05, + "loss": 2.7638, + "mean_token_accuracy": 0.3896551728248596, + "step": 101930 + }, + { + "epoch": 0.10267001463477032, + "grad_norm": 9.250690645177698, + "learning_rate": 4.9659577042302145e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.4344827651977539, + "step": 101935 + }, + { + "epoch": 0.1026750506878745, + "grad_norm": 13.465977049780664, + "learning_rate": 4.9659512103498855e-05, + "loss": 2.9523, + "mean_token_accuracy": 0.398064124584198, + "step": 101940 + }, + { + "epoch": 0.10268008674097867, + "grad_norm": 12.779978983341879, + "learning_rate": 4.965944715854954e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.4186327874660492, + "step": 101945 + }, + { + "epoch": 0.10268512279408284, + "grad_norm": 8.497088425683993, + "learning_rate": 4.9659382207454224e-05, + "loss": 2.1507, + "mean_token_accuracy": 0.47586206793785096, + "step": 101950 + }, + { + "epoch": 0.10269015884718702, + "grad_norm": 10.939774519452017, + "learning_rate": 4.9659317250212917e-05, + "loss": 2.6772, + "mean_token_accuracy": 0.41724138259887694, + "step": 101955 + }, + { + "epoch": 0.10269519490029118, + "grad_norm": 12.280715256808495, + "learning_rate": 4.965925228682563e-05, + "loss": 2.309, + "mean_token_accuracy": 0.4220810651779175, + "step": 101960 + }, + { + "epoch": 0.10270023095339535, + "grad_norm": 22.130645483761572, + "learning_rate": 4.965918731729241e-05, + "loss": 2.4027, + "mean_token_accuracy": 0.44137930274009707, + "step": 101965 + }, + { + "epoch": 0.10270526700649953, + "grad_norm": 14.658707686083057, + "learning_rate": 4.965912234161325e-05, + "loss": 2.8321, + "mean_token_accuracy": 0.37931033968925476, + "step": 101970 + }, + { + "epoch": 0.1027103030596037, + "grad_norm": 10.223582360701371, + "learning_rate": 4.965905735978817e-05, + "loss": 2.669, + "mean_token_accuracy": 0.35172413289546967, + "step": 101975 + }, + { + "epoch": 0.10271533911270787, + "grad_norm": 9.000549878270615, + "learning_rate": 4.96589923718172e-05, + "loss": 2.2625, + "mean_token_accuracy": 0.41724138259887694, + "step": 101980 + }, + { + "epoch": 0.10272037516581205, + "grad_norm": 11.357538538101815, + "learning_rate": 4.965892737770034e-05, + "loss": 2.6219, + "mean_token_accuracy": 0.3931034505367279, + "step": 101985 + }, + { + "epoch": 0.10272541121891622, + "grad_norm": 10.25022959832566, + "learning_rate": 4.965886237743763e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.39110708236694336, + "step": 101990 + }, + { + "epoch": 0.1027304472720204, + "grad_norm": 10.529763776620856, + "learning_rate": 4.965879737102907e-05, + "loss": 2.3375, + "mean_token_accuracy": 0.4068965494632721, + "step": 101995 + }, + { + "epoch": 0.10273548332512457, + "grad_norm": 10.923046048960414, + "learning_rate": 4.9658732358474684e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.41379311084747317, + "step": 102000 + }, + { + "epoch": 0.10274051937822874, + "grad_norm": 9.27500908781363, + "learning_rate": 4.96586673397745e-05, + "loss": 2.3628, + "mean_token_accuracy": 0.44482759237289426, + "step": 102005 + }, + { + "epoch": 0.10274555543133292, + "grad_norm": 9.633407643152006, + "learning_rate": 4.965860231492852e-05, + "loss": 2.8986, + "mean_token_accuracy": 0.44482759237289426, + "step": 102010 + }, + { + "epoch": 0.10275059148443709, + "grad_norm": 7.5630789289690465, + "learning_rate": 4.9658537283936774e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.45759226083755494, + "step": 102015 + }, + { + "epoch": 0.10275562753754126, + "grad_norm": 11.504885787997875, + "learning_rate": 4.965847224679927e-05, + "loss": 2.2804, + "mean_token_accuracy": 0.4310344815254211, + "step": 102020 + }, + { + "epoch": 0.10276066359064544, + "grad_norm": 22.000718110991656, + "learning_rate": 4.965840720351603e-05, + "loss": 2.887, + "mean_token_accuracy": 0.43448275327682495, + "step": 102025 + }, + { + "epoch": 0.1027656996437496, + "grad_norm": 9.415149749126341, + "learning_rate": 4.965834215408707e-05, + "loss": 2.367, + "mean_token_accuracy": 0.42413792610168455, + "step": 102030 + }, + { + "epoch": 0.10277073569685377, + "grad_norm": 10.134115044373447, + "learning_rate": 4.965827709851242e-05, + "loss": 2.276, + "mean_token_accuracy": 0.45172414779663084, + "step": 102035 + }, + { + "epoch": 0.10277577174995794, + "grad_norm": 10.408724625688393, + "learning_rate": 4.965821203679209e-05, + "loss": 2.1607, + "mean_token_accuracy": 0.43103448748588563, + "step": 102040 + }, + { + "epoch": 0.10278080780306212, + "grad_norm": 10.781107238036869, + "learning_rate": 4.965814696892609e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.41379310488700866, + "step": 102045 + }, + { + "epoch": 0.10278584385616629, + "grad_norm": 10.918861678490645, + "learning_rate": 4.965808189491445e-05, + "loss": 2.2644, + "mean_token_accuracy": 0.42413792610168455, + "step": 102050 + }, + { + "epoch": 0.10279087990927047, + "grad_norm": 8.356097719679958, + "learning_rate": 4.965801681475717e-05, + "loss": 2.1046, + "mean_token_accuracy": 0.4793103516101837, + "step": 102055 + }, + { + "epoch": 0.10279591596237464, + "grad_norm": 11.286571209524185, + "learning_rate": 4.96579517284543e-05, + "loss": 2.2013, + "mean_token_accuracy": 0.49999999403953554, + "step": 102060 + }, + { + "epoch": 0.10280095201547881, + "grad_norm": 9.781892291497881, + "learning_rate": 4.9657886636005835e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.43448275327682495, + "step": 102065 + }, + { + "epoch": 0.10280598806858299, + "grad_norm": 10.454231894049318, + "learning_rate": 4.965782153741179e-05, + "loss": 2.6903, + "mean_token_accuracy": 0.4206896543502808, + "step": 102070 + }, + { + "epoch": 0.10281102412168716, + "grad_norm": 13.977495116606674, + "learning_rate": 4.96577564326722e-05, + "loss": 2.1795, + "mean_token_accuracy": 0.4517241358757019, + "step": 102075 + }, + { + "epoch": 0.10281606017479133, + "grad_norm": 10.916049243674708, + "learning_rate": 4.965769132178707e-05, + "loss": 2.5148, + "mean_token_accuracy": 0.39310345649719236, + "step": 102080 + }, + { + "epoch": 0.10282109622789551, + "grad_norm": 11.019145188539882, + "learning_rate": 4.965762620475642e-05, + "loss": 2.4799, + "mean_token_accuracy": 0.4034482717514038, + "step": 102085 + }, + { + "epoch": 0.10282613228099968, + "grad_norm": 10.471593732993124, + "learning_rate": 4.9657561081580274e-05, + "loss": 2.1258, + "mean_token_accuracy": 0.458620685338974, + "step": 102090 + }, + { + "epoch": 0.10283116833410386, + "grad_norm": 9.632936805314667, + "learning_rate": 4.9657495952258634e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.4275862067937851, + "step": 102095 + }, + { + "epoch": 0.10283620438720802, + "grad_norm": 12.075287331955318, + "learning_rate": 4.965743081679154e-05, + "loss": 2.3132, + "mean_token_accuracy": 0.47241378426551817, + "step": 102100 + }, + { + "epoch": 0.10284124044031219, + "grad_norm": 10.053007290233133, + "learning_rate": 4.9657365675179e-05, + "loss": 3.0249, + "mean_token_accuracy": 0.4124621868133545, + "step": 102105 + }, + { + "epoch": 0.10284627649341636, + "grad_norm": 10.108612896490703, + "learning_rate": 4.9657300527421026e-05, + "loss": 2.4482, + "mean_token_accuracy": 0.3896551728248596, + "step": 102110 + }, + { + "epoch": 0.10285131254652054, + "grad_norm": 11.122819943444814, + "learning_rate": 4.965723537351765e-05, + "loss": 2.543, + "mean_token_accuracy": 0.4137930929660797, + "step": 102115 + }, + { + "epoch": 0.10285634859962471, + "grad_norm": 10.945181653672073, + "learning_rate": 4.965717021346887e-05, + "loss": 2.4663, + "mean_token_accuracy": 0.45862069725990295, + "step": 102120 + }, + { + "epoch": 0.10286138465272888, + "grad_norm": 9.588363496597273, + "learning_rate": 4.9657105047274735e-05, + "loss": 2.1413, + "mean_token_accuracy": 0.4776164531707764, + "step": 102125 + }, + { + "epoch": 0.10286642070583306, + "grad_norm": 13.137631775570863, + "learning_rate": 4.965703987493523e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.37586206793785093, + "step": 102130 + }, + { + "epoch": 0.10287145675893723, + "grad_norm": 11.256421228621463, + "learning_rate": 4.965697469645039e-05, + "loss": 2.744, + "mean_token_accuracy": 0.4103448331356049, + "step": 102135 + }, + { + "epoch": 0.1028764928120414, + "grad_norm": 10.607895119433163, + "learning_rate": 4.965690951182023e-05, + "loss": 2.841, + "mean_token_accuracy": 0.4, + "step": 102140 + }, + { + "epoch": 0.10288152886514558, + "grad_norm": 11.257702317682202, + "learning_rate": 4.965684432104477e-05, + "loss": 2.7679, + "mean_token_accuracy": 0.3965517282485962, + "step": 102145 + }, + { + "epoch": 0.10288656491824975, + "grad_norm": 10.201335101923332, + "learning_rate": 4.965677912412402e-05, + "loss": 3.1029, + "mean_token_accuracy": 0.3724137932062149, + "step": 102150 + }, + { + "epoch": 0.10289160097135393, + "grad_norm": 11.691532097259172, + "learning_rate": 4.965671392105802e-05, + "loss": 2.7562, + "mean_token_accuracy": 0.4467029690742493, + "step": 102155 + }, + { + "epoch": 0.1028966370244581, + "grad_norm": 10.92796435176211, + "learning_rate": 4.965664871184676e-05, + "loss": 2.2167, + "mean_token_accuracy": 0.4777374446392059, + "step": 102160 + }, + { + "epoch": 0.10290167307756228, + "grad_norm": 10.51862890936034, + "learning_rate": 4.9656583496490275e-05, + "loss": 2.2038, + "mean_token_accuracy": 0.42413793206214906, + "step": 102165 + }, + { + "epoch": 0.10290670913066643, + "grad_norm": 14.117083089812956, + "learning_rate": 4.965651827498858e-05, + "loss": 2.9368, + "mean_token_accuracy": 0.42413793206214906, + "step": 102170 + }, + { + "epoch": 0.10291174518377061, + "grad_norm": 9.525585719769627, + "learning_rate": 4.965645304734169e-05, + "loss": 2.0411, + "mean_token_accuracy": 0.48275861144065857, + "step": 102175 + }, + { + "epoch": 0.10291678123687478, + "grad_norm": 11.178360780100101, + "learning_rate": 4.965638781354962e-05, + "loss": 2.1904, + "mean_token_accuracy": 0.4413793087005615, + "step": 102180 + }, + { + "epoch": 0.10292181728997896, + "grad_norm": 10.293867303868764, + "learning_rate": 4.96563225736124e-05, + "loss": 2.2929, + "mean_token_accuracy": 0.47931034564971925, + "step": 102185 + }, + { + "epoch": 0.10292685334308313, + "grad_norm": 9.916686573688567, + "learning_rate": 4.965625732753004e-05, + "loss": 2.1097, + "mean_token_accuracy": 0.4862068951129913, + "step": 102190 + }, + { + "epoch": 0.1029318893961873, + "grad_norm": 12.2876593804011, + "learning_rate": 4.965619207530256e-05, + "loss": 2.0313, + "mean_token_accuracy": 0.512583190202713, + "step": 102195 + }, + { + "epoch": 0.10293692544929148, + "grad_norm": 10.686750626569669, + "learning_rate": 4.965612681692998e-05, + "loss": 2.5175, + "mean_token_accuracy": 0.42413793206214906, + "step": 102200 + }, + { + "epoch": 0.10294196150239565, + "grad_norm": 11.06822083002831, + "learning_rate": 4.965606155241231e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.38965516686439516, + "step": 102205 + }, + { + "epoch": 0.10294699755549983, + "grad_norm": 12.736464350132138, + "learning_rate": 4.965599628174958e-05, + "loss": 3.0601, + "mean_token_accuracy": 0.34137930870056155, + "step": 102210 + }, + { + "epoch": 0.102952033608604, + "grad_norm": 8.848482387845467, + "learning_rate": 4.965593100494179e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.4551724135875702, + "step": 102215 + }, + { + "epoch": 0.10295706966170817, + "grad_norm": 11.158299392439307, + "learning_rate": 4.965586572198898e-05, + "loss": 2.6138, + "mean_token_accuracy": 0.4, + "step": 102220 + }, + { + "epoch": 0.10296210571481235, + "grad_norm": 10.227388314430089, + "learning_rate": 4.9655800432891156e-05, + "loss": 2.4652, + "mean_token_accuracy": 0.37586207389831544, + "step": 102225 + }, + { + "epoch": 0.10296714176791652, + "grad_norm": 11.121329429618935, + "learning_rate": 4.965573513764834e-05, + "loss": 2.103, + "mean_token_accuracy": 0.4758620738983154, + "step": 102230 + }, + { + "epoch": 0.1029721778210207, + "grad_norm": 13.31074725819182, + "learning_rate": 4.9655669836260546e-05, + "loss": 2.9709, + "mean_token_accuracy": 0.37931033968925476, + "step": 102235 + }, + { + "epoch": 0.10297721387412485, + "grad_norm": 10.57385465195964, + "learning_rate": 4.965560452872779e-05, + "loss": 2.8341, + "mean_token_accuracy": 0.38965516686439516, + "step": 102240 + }, + { + "epoch": 0.10298224992722903, + "grad_norm": 10.886100239209368, + "learning_rate": 4.9655539215050104e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.39310345649719236, + "step": 102245 + }, + { + "epoch": 0.1029872859803332, + "grad_norm": 10.70721907608428, + "learning_rate": 4.965547389522749e-05, + "loss": 2.5388, + "mean_token_accuracy": 0.44827587008476255, + "step": 102250 + }, + { + "epoch": 0.10299232203343738, + "grad_norm": 9.1492192171996, + "learning_rate": 4.965540856925998e-05, + "loss": 2.3169, + "mean_token_accuracy": 0.42068964838981626, + "step": 102255 + }, + { + "epoch": 0.10299735808654155, + "grad_norm": 11.588311642934736, + "learning_rate": 4.965534323714757e-05, + "loss": 2.2744, + "mean_token_accuracy": 0.4793103516101837, + "step": 102260 + }, + { + "epoch": 0.10300239413964572, + "grad_norm": 10.638209323478575, + "learning_rate": 4.96552778988903e-05, + "loss": 2.4668, + "mean_token_accuracy": 0.4034482717514038, + "step": 102265 + }, + { + "epoch": 0.1030074301927499, + "grad_norm": 14.192960015896805, + "learning_rate": 4.965521255448818e-05, + "loss": 2.7249, + "mean_token_accuracy": 0.3620689630508423, + "step": 102270 + }, + { + "epoch": 0.10301246624585407, + "grad_norm": 11.741020354618106, + "learning_rate": 4.965514720394124e-05, + "loss": 2.6517, + "mean_token_accuracy": 0.3931034505367279, + "step": 102275 + }, + { + "epoch": 0.10301750229895824, + "grad_norm": 10.103938788632266, + "learning_rate": 4.965508184724948e-05, + "loss": 2.4217, + "mean_token_accuracy": 0.4344827592372894, + "step": 102280 + }, + { + "epoch": 0.10302253835206242, + "grad_norm": 9.435916129455332, + "learning_rate": 4.965501648441292e-05, + "loss": 2.482, + "mean_token_accuracy": 0.439443439245224, + "step": 102285 + }, + { + "epoch": 0.10302757440516659, + "grad_norm": 10.856898010993282, + "learning_rate": 4.965495111543159e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.4206896543502808, + "step": 102290 + }, + { + "epoch": 0.10303261045827077, + "grad_norm": 10.73676591610329, + "learning_rate": 4.96548857403055e-05, + "loss": 2.667, + "mean_token_accuracy": 0.4275861978530884, + "step": 102295 + }, + { + "epoch": 0.10303764651137494, + "grad_norm": 12.054086517693046, + "learning_rate": 4.9654820359034667e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.46896552443504336, + "step": 102300 + }, + { + "epoch": 0.10304268256447911, + "grad_norm": 21.745721914804694, + "learning_rate": 4.965475497161912e-05, + "loss": 2.7046, + "mean_token_accuracy": 0.39310344457626345, + "step": 102305 + }, + { + "epoch": 0.10304771861758327, + "grad_norm": 12.650487035239586, + "learning_rate": 4.965468957805886e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.4034482777118683, + "step": 102310 + }, + { + "epoch": 0.10305275467068745, + "grad_norm": 13.08184521403198, + "learning_rate": 4.965462417835392e-05, + "loss": 2.4584, + "mean_token_accuracy": 0.44482759237289426, + "step": 102315 + }, + { + "epoch": 0.10305779072379162, + "grad_norm": 15.234797038153893, + "learning_rate": 4.96545587725043e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.38965517580509185, + "step": 102320 + }, + { + "epoch": 0.1030628267768958, + "grad_norm": 17.217485075191032, + "learning_rate": 4.9654493360510054e-05, + "loss": 2.9051, + "mean_token_accuracy": 0.3862068891525269, + "step": 102325 + }, + { + "epoch": 0.10306786282999997, + "grad_norm": 9.105017808039984, + "learning_rate": 4.965442794237115e-05, + "loss": 2.5363, + "mean_token_accuracy": 0.4344827592372894, + "step": 102330 + }, + { + "epoch": 0.10307289888310414, + "grad_norm": 11.451312095636137, + "learning_rate": 4.9654362518087654e-05, + "loss": 2.6433, + "mean_token_accuracy": 0.41724138259887694, + "step": 102335 + }, + { + "epoch": 0.10307793493620832, + "grad_norm": 11.475994716990174, + "learning_rate": 4.965429708765955e-05, + "loss": 2.8815, + "mean_token_accuracy": 0.37047791481018066, + "step": 102340 + }, + { + "epoch": 0.10308297098931249, + "grad_norm": 11.22939912890726, + "learning_rate": 4.965423165108687e-05, + "loss": 2.7322, + "mean_token_accuracy": 0.42068966031074523, + "step": 102345 + }, + { + "epoch": 0.10308800704241666, + "grad_norm": 11.083428163653416, + "learning_rate": 4.965416620836964e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4103448212146759, + "step": 102350 + }, + { + "epoch": 0.10309304309552084, + "grad_norm": 11.22904067861591, + "learning_rate": 4.965410075950786e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.4536600112915039, + "step": 102355 + }, + { + "epoch": 0.10309807914862501, + "grad_norm": 11.042799264635574, + "learning_rate": 4.9654035304501567e-05, + "loss": 2.5452, + "mean_token_accuracy": 0.39655172228813174, + "step": 102360 + }, + { + "epoch": 0.10310311520172918, + "grad_norm": 11.604895082807737, + "learning_rate": 4.9653969843350764e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.36896551251411436, + "step": 102365 + }, + { + "epoch": 0.10310815125483336, + "grad_norm": 8.972818498717327, + "learning_rate": 4.965390437605547e-05, + "loss": 2.1956, + "mean_token_accuracy": 0.4620689690113068, + "step": 102370 + }, + { + "epoch": 0.10311318730793753, + "grad_norm": 13.044439230023187, + "learning_rate": 4.9653838902615716e-05, + "loss": 2.4239, + "mean_token_accuracy": 0.4275861978530884, + "step": 102375 + }, + { + "epoch": 0.10311822336104169, + "grad_norm": 11.155043759351242, + "learning_rate": 4.965377342303151e-05, + "loss": 2.544, + "mean_token_accuracy": 0.44482758045196535, + "step": 102380 + }, + { + "epoch": 0.10312325941414587, + "grad_norm": 11.443821357480022, + "learning_rate": 4.9653707937302876e-05, + "loss": 2.7777, + "mean_token_accuracy": 0.37586206793785093, + "step": 102385 + }, + { + "epoch": 0.10312829546725004, + "grad_norm": 11.512084109565542, + "learning_rate": 4.965364244542982e-05, + "loss": 2.5911, + "mean_token_accuracy": 0.43103448748588563, + "step": 102390 + }, + { + "epoch": 0.10313333152035421, + "grad_norm": 8.834290493130307, + "learning_rate": 4.965357694741237e-05, + "loss": 2.437, + "mean_token_accuracy": 0.36551723778247835, + "step": 102395 + }, + { + "epoch": 0.10313836757345839, + "grad_norm": 11.404413400892183, + "learning_rate": 4.965351144325055e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.4620689690113068, + "step": 102400 + }, + { + "epoch": 0.10314340362656256, + "grad_norm": 9.690302487377703, + "learning_rate": 4.965344593294437e-05, + "loss": 2.6497, + "mean_token_accuracy": 0.36896551847457887, + "step": 102405 + }, + { + "epoch": 0.10314843967966673, + "grad_norm": 9.410942211405763, + "learning_rate": 4.9653380416493845e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.42758620977401735, + "step": 102410 + }, + { + "epoch": 0.10315347573277091, + "grad_norm": 10.334421098922991, + "learning_rate": 4.9653314893898995e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.35172413289546967, + "step": 102415 + }, + { + "epoch": 0.10315851178587508, + "grad_norm": 12.287584805356952, + "learning_rate": 4.965324936515984e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.417241370677948, + "step": 102420 + }, + { + "epoch": 0.10316354783897926, + "grad_norm": 11.606875750707568, + "learning_rate": 4.96531838302764e-05, + "loss": 2.6862, + "mean_token_accuracy": 0.3862068891525269, + "step": 102425 + }, + { + "epoch": 0.10316858389208343, + "grad_norm": 9.6642033044946, + "learning_rate": 4.96531182892487e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.4068965494632721, + "step": 102430 + }, + { + "epoch": 0.1031736199451876, + "grad_norm": 16.15070223826042, + "learning_rate": 4.9653052742076745e-05, + "loss": 2.7245, + "mean_token_accuracy": 0.4068965494632721, + "step": 102435 + }, + { + "epoch": 0.10317865599829178, + "grad_norm": 12.292067190496162, + "learning_rate": 4.965298718876055e-05, + "loss": 2.7839, + "mean_token_accuracy": 0.3620689630508423, + "step": 102440 + }, + { + "epoch": 0.10318369205139595, + "grad_norm": 12.262468834402549, + "learning_rate": 4.965292162930015e-05, + "loss": 2.6092, + "mean_token_accuracy": 0.41724138855934145, + "step": 102445 + }, + { + "epoch": 0.10318872810450011, + "grad_norm": 10.366988528147912, + "learning_rate": 4.965285606369555e-05, + "loss": 2.1988, + "mean_token_accuracy": 0.41379310488700866, + "step": 102450 + }, + { + "epoch": 0.10319376415760428, + "grad_norm": 9.550317414248871, + "learning_rate": 4.965279049194678e-05, + "loss": 2.3536, + "mean_token_accuracy": 0.3896551728248596, + "step": 102455 + }, + { + "epoch": 0.10319880021070846, + "grad_norm": 13.132805084269354, + "learning_rate": 4.9652724914053845e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.493103438615799, + "step": 102460 + }, + { + "epoch": 0.10320383626381263, + "grad_norm": 13.844490654754868, + "learning_rate": 4.965265933001678e-05, + "loss": 2.7033, + "mean_token_accuracy": 0.4344827473163605, + "step": 102465 + }, + { + "epoch": 0.1032088723169168, + "grad_norm": 10.165563134331066, + "learning_rate": 4.965259373983558e-05, + "loss": 2.5848, + "mean_token_accuracy": 0.38275861740112305, + "step": 102470 + }, + { + "epoch": 0.10321390837002098, + "grad_norm": 11.0973737556596, + "learning_rate": 4.9652528143510274e-05, + "loss": 2.3131, + "mean_token_accuracy": 0.4344827592372894, + "step": 102475 + }, + { + "epoch": 0.10321894442312515, + "grad_norm": 9.691557745696265, + "learning_rate": 4.965246254104089e-05, + "loss": 2.1327, + "mean_token_accuracy": 0.5206896543502808, + "step": 102480 + }, + { + "epoch": 0.10322398047622933, + "grad_norm": 12.20701557338449, + "learning_rate": 4.965239693242742e-05, + "loss": 2.4231, + "mean_token_accuracy": 0.4206896543502808, + "step": 102485 + }, + { + "epoch": 0.1032290165293335, + "grad_norm": 12.365695155282843, + "learning_rate": 4.9652331317669926e-05, + "loss": 2.7968, + "mean_token_accuracy": 0.3275862067937851, + "step": 102490 + }, + { + "epoch": 0.10323405258243767, + "grad_norm": 11.42862035879425, + "learning_rate": 4.9652265696768384e-05, + "loss": 2.5835, + "mean_token_accuracy": 0.41379310488700866, + "step": 102495 + }, + { + "epoch": 0.10323908863554185, + "grad_norm": 8.460444567284652, + "learning_rate": 4.965220006972284e-05, + "loss": 1.9352, + "mean_token_accuracy": 0.46896551847457885, + "step": 102500 + }, + { + "epoch": 0.10324412468864602, + "grad_norm": 11.582043784227944, + "learning_rate": 4.965213443653329e-05, + "loss": 2.3879, + "mean_token_accuracy": 0.4517241299152374, + "step": 102505 + }, + { + "epoch": 0.1032491607417502, + "grad_norm": 10.155256065840705, + "learning_rate": 4.965206879719977e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.4931034505367279, + "step": 102510 + }, + { + "epoch": 0.10325419679485437, + "grad_norm": 10.938398392110008, + "learning_rate": 4.965200315172228e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.42413793206214906, + "step": 102515 + }, + { + "epoch": 0.10325923284795853, + "grad_norm": 9.291433428053487, + "learning_rate": 4.965193750010086e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.42758620977401735, + "step": 102520 + }, + { + "epoch": 0.1032642689010627, + "grad_norm": 10.643813554795633, + "learning_rate": 4.965187184233551e-05, + "loss": 2.2657, + "mean_token_accuracy": 0.42413793206214906, + "step": 102525 + }, + { + "epoch": 0.10326930495416688, + "grad_norm": 11.060384727781962, + "learning_rate": 4.9651806178426265e-05, + "loss": 2.3702, + "mean_token_accuracy": 0.4297035694122314, + "step": 102530 + }, + { + "epoch": 0.10327434100727105, + "grad_norm": 16.888993833240633, + "learning_rate": 4.965174050837313e-05, + "loss": 2.6972, + "mean_token_accuracy": 0.41034482717514037, + "step": 102535 + }, + { + "epoch": 0.10327937706037522, + "grad_norm": 9.448533773582648, + "learning_rate": 4.9651674832176125e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.47586206197738645, + "step": 102540 + }, + { + "epoch": 0.1032844131134794, + "grad_norm": 9.512584620871289, + "learning_rate": 4.9651609149835265e-05, + "loss": 2.6679, + "mean_token_accuracy": 0.4172413766384125, + "step": 102545 + }, + { + "epoch": 0.10328944916658357, + "grad_norm": 13.64653361353644, + "learning_rate": 4.9651543461350584e-05, + "loss": 2.4553, + "mean_token_accuracy": 0.4620689630508423, + "step": 102550 + }, + { + "epoch": 0.10329448521968775, + "grad_norm": 11.438827036420195, + "learning_rate": 4.965147776672209e-05, + "loss": 2.4752, + "mean_token_accuracy": 0.4496672749519348, + "step": 102555 + }, + { + "epoch": 0.10329952127279192, + "grad_norm": 10.146353314890844, + "learning_rate": 4.96514120659498e-05, + "loss": 2.8717, + "mean_token_accuracy": 0.3517241418361664, + "step": 102560 + }, + { + "epoch": 0.1033045573258961, + "grad_norm": 11.140627280171406, + "learning_rate": 4.965134635903373e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.4068965554237366, + "step": 102565 + }, + { + "epoch": 0.10330959337900027, + "grad_norm": 11.410044974639801, + "learning_rate": 4.96512806459739e-05, + "loss": 2.2708, + "mean_token_accuracy": 0.4448275864124298, + "step": 102570 + }, + { + "epoch": 0.10331462943210444, + "grad_norm": 8.977165008458515, + "learning_rate": 4.965121492677032e-05, + "loss": 2.352, + "mean_token_accuracy": 0.37586206793785093, + "step": 102575 + }, + { + "epoch": 0.10331966548520861, + "grad_norm": 9.506870663721585, + "learning_rate": 4.9651149201423036e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.4379310369491577, + "step": 102580 + }, + { + "epoch": 0.10332470153831277, + "grad_norm": 10.27802258758621, + "learning_rate": 4.965108346993204e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.44482758045196535, + "step": 102585 + }, + { + "epoch": 0.10332973759141695, + "grad_norm": 9.179126107326997, + "learning_rate": 4.9651017732297363e-05, + "loss": 2.7501, + "mean_token_accuracy": 0.40532365441322327, + "step": 102590 + }, + { + "epoch": 0.10333477364452112, + "grad_norm": 9.963692042329413, + "learning_rate": 4.9650951988519015e-05, + "loss": 2.1578, + "mean_token_accuracy": 0.43793103098869324, + "step": 102595 + }, + { + "epoch": 0.1033398096976253, + "grad_norm": 8.942288152229755, + "learning_rate": 4.9650886238597015e-05, + "loss": 2.1868, + "mean_token_accuracy": 0.47241380214691164, + "step": 102600 + }, + { + "epoch": 0.10334484575072947, + "grad_norm": 10.93475842217757, + "learning_rate": 4.9650820482531384e-05, + "loss": 2.2634, + "mean_token_accuracy": 0.43284936547279357, + "step": 102605 + }, + { + "epoch": 0.10334988180383364, + "grad_norm": 12.283703315383988, + "learning_rate": 4.965075472032215e-05, + "loss": 2.3561, + "mean_token_accuracy": 0.4379310369491577, + "step": 102610 + }, + { + "epoch": 0.10335491785693782, + "grad_norm": 10.175706091107438, + "learning_rate": 4.965068895196931e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.47931034564971925, + "step": 102615 + }, + { + "epoch": 0.10335995391004199, + "grad_norm": 13.829327287560446, + "learning_rate": 4.96506231774729e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.3551724195480347, + "step": 102620 + }, + { + "epoch": 0.10336498996314616, + "grad_norm": 36.56604074730109, + "learning_rate": 4.965055739683293e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.4620689630508423, + "step": 102625 + }, + { + "epoch": 0.10337002601625034, + "grad_norm": 11.171458291252886, + "learning_rate": 4.965049161004942e-05, + "loss": 2.439, + "mean_token_accuracy": 0.42758620381355283, + "step": 102630 + }, + { + "epoch": 0.10337506206935451, + "grad_norm": 10.630095197094661, + "learning_rate": 4.9650425817122394e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.441379314661026, + "step": 102635 + }, + { + "epoch": 0.10338009812245869, + "grad_norm": 11.123369177335173, + "learning_rate": 4.9650360018051856e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.37241379618644715, + "step": 102640 + }, + { + "epoch": 0.10338513417556286, + "grad_norm": 8.401817875232402, + "learning_rate": 4.9650294212837836e-05, + "loss": 2.1332, + "mean_token_accuracy": 0.42413793206214906, + "step": 102645 + }, + { + "epoch": 0.10339017022866703, + "grad_norm": 11.943173112985958, + "learning_rate": 4.965022840148036e-05, + "loss": 2.7018, + "mean_token_accuracy": 0.4344827592372894, + "step": 102650 + }, + { + "epoch": 0.1033952062817712, + "grad_norm": 12.505414167079445, + "learning_rate": 4.9650162583979415e-05, + "loss": 2.6027, + "mean_token_accuracy": 0.42758620381355283, + "step": 102655 + }, + { + "epoch": 0.10340024233487537, + "grad_norm": 21.47987013681267, + "learning_rate": 4.965009676033505e-05, + "loss": 2.79, + "mean_token_accuracy": 0.37931033968925476, + "step": 102660 + }, + { + "epoch": 0.10340527838797954, + "grad_norm": 10.206726569838661, + "learning_rate": 4.965003093054728e-05, + "loss": 2.7186, + "mean_token_accuracy": 0.4172413766384125, + "step": 102665 + }, + { + "epoch": 0.10341031444108371, + "grad_norm": 10.521354699204924, + "learning_rate": 4.9649965094616106e-05, + "loss": 2.5722, + "mean_token_accuracy": 0.4413793087005615, + "step": 102670 + }, + { + "epoch": 0.10341535049418789, + "grad_norm": 9.998131703431875, + "learning_rate": 4.9649899252541556e-05, + "loss": 2.5432, + "mean_token_accuracy": 0.4379310250282288, + "step": 102675 + }, + { + "epoch": 0.10342038654729206, + "grad_norm": 10.319603400035327, + "learning_rate": 4.9649833404323666e-05, + "loss": 2.1164, + "mean_token_accuracy": 0.4620689690113068, + "step": 102680 + }, + { + "epoch": 0.10342542260039624, + "grad_norm": 11.182615632355347, + "learning_rate": 4.9649767549962416e-05, + "loss": 2.7234, + "mean_token_accuracy": 0.4344827592372894, + "step": 102685 + }, + { + "epoch": 0.10343045865350041, + "grad_norm": 10.896269753901313, + "learning_rate": 4.964970168945786e-05, + "loss": 2.5915, + "mean_token_accuracy": 0.37586206793785093, + "step": 102690 + }, + { + "epoch": 0.10343549470660458, + "grad_norm": 10.736927029675403, + "learning_rate": 4.964963582280999e-05, + "loss": 2.0535, + "mean_token_accuracy": 0.4949788272380829, + "step": 102695 + }, + { + "epoch": 0.10344053075970876, + "grad_norm": 9.482490996003563, + "learning_rate": 4.964956995001884e-05, + "loss": 1.9643, + "mean_token_accuracy": 0.4931034445762634, + "step": 102700 + }, + { + "epoch": 0.10344556681281293, + "grad_norm": 12.004747237594303, + "learning_rate": 4.964950407108443e-05, + "loss": 2.7128, + "mean_token_accuracy": 0.3758620619773865, + "step": 102705 + }, + { + "epoch": 0.1034506028659171, + "grad_norm": 10.358029056520946, + "learning_rate": 4.964943818600677e-05, + "loss": 2.217, + "mean_token_accuracy": 0.4517241418361664, + "step": 102710 + }, + { + "epoch": 0.10345563891902128, + "grad_norm": 12.833175990620939, + "learning_rate": 4.964937229478587e-05, + "loss": 2.7707, + "mean_token_accuracy": 0.36896551251411436, + "step": 102715 + }, + { + "epoch": 0.10346067497212545, + "grad_norm": 12.048965530177313, + "learning_rate": 4.9649306397421776e-05, + "loss": 2.2048, + "mean_token_accuracy": 0.44482758045196535, + "step": 102720 + }, + { + "epoch": 0.10346571102522961, + "grad_norm": 11.588323127330552, + "learning_rate": 4.964924049391448e-05, + "loss": 2.4735, + "mean_token_accuracy": 0.4034482717514038, + "step": 102725 + }, + { + "epoch": 0.10347074707833379, + "grad_norm": 13.710773499176616, + "learning_rate": 4.964917458426401e-05, + "loss": 2.7472, + "mean_token_accuracy": 0.3896551728248596, + "step": 102730 + }, + { + "epoch": 0.10347578313143796, + "grad_norm": 13.268886204756269, + "learning_rate": 4.9649108668470386e-05, + "loss": 2.7009, + "mean_token_accuracy": 0.3655172437429428, + "step": 102735 + }, + { + "epoch": 0.10348081918454213, + "grad_norm": 15.896686541395285, + "learning_rate": 4.964904274653362e-05, + "loss": 2.7674, + "mean_token_accuracy": 0.4034482717514038, + "step": 102740 + }, + { + "epoch": 0.10348585523764631, + "grad_norm": 9.749419147201362, + "learning_rate": 4.9648976818453743e-05, + "loss": 2.1021, + "mean_token_accuracy": 0.4551724076271057, + "step": 102745 + }, + { + "epoch": 0.10349089129075048, + "grad_norm": 10.351683510126103, + "learning_rate": 4.964891088423075e-05, + "loss": 2.4928, + "mean_token_accuracy": 0.44482758045196535, + "step": 102750 + }, + { + "epoch": 0.10349592734385465, + "grad_norm": 11.140251616950433, + "learning_rate": 4.964884494386468e-05, + "loss": 2.4965, + "mean_token_accuracy": 0.4206896543502808, + "step": 102755 + }, + { + "epoch": 0.10350096339695883, + "grad_norm": 11.8491191589136, + "learning_rate": 4.964877899735555e-05, + "loss": 2.2765, + "mean_token_accuracy": 0.44482759237289426, + "step": 102760 + }, + { + "epoch": 0.103505999450063, + "grad_norm": 13.058745086239194, + "learning_rate": 4.964871304470337e-05, + "loss": 2.6243, + "mean_token_accuracy": 0.39310344457626345, + "step": 102765 + }, + { + "epoch": 0.10351103550316718, + "grad_norm": 10.827598293098102, + "learning_rate": 4.964864708590816e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.43103447556495667, + "step": 102770 + }, + { + "epoch": 0.10351607155627135, + "grad_norm": 13.409402588385651, + "learning_rate": 4.964858112096995e-05, + "loss": 2.7859, + "mean_token_accuracy": 0.3482758641242981, + "step": 102775 + }, + { + "epoch": 0.10352110760937552, + "grad_norm": 10.533910111706248, + "learning_rate": 4.964851514988874e-05, + "loss": 2.3645, + "mean_token_accuracy": 0.38620689511299133, + "step": 102780 + }, + { + "epoch": 0.1035261436624797, + "grad_norm": 12.019038677153114, + "learning_rate": 4.964844917266456e-05, + "loss": 2.8183, + "mean_token_accuracy": 0.37241379022598264, + "step": 102785 + }, + { + "epoch": 0.10353117971558387, + "grad_norm": 9.451477302906017, + "learning_rate": 4.9648383189297414e-05, + "loss": 2.6351, + "mean_token_accuracy": 0.38620689511299133, + "step": 102790 + }, + { + "epoch": 0.10353621576868803, + "grad_norm": 11.311353459201282, + "learning_rate": 4.964831719978735e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.42413793206214906, + "step": 102795 + }, + { + "epoch": 0.1035412518217922, + "grad_norm": 10.269240646645464, + "learning_rate": 4.9648251204134354e-05, + "loss": 2.1797, + "mean_token_accuracy": 0.47241379618644713, + "step": 102800 + }, + { + "epoch": 0.10354628787489638, + "grad_norm": 11.131598349902763, + "learning_rate": 4.964818520233846e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.4344827592372894, + "step": 102805 + }, + { + "epoch": 0.10355132392800055, + "grad_norm": 10.834083475202164, + "learning_rate": 4.964811919439968e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.45591133236885073, + "step": 102810 + }, + { + "epoch": 0.10355635998110473, + "grad_norm": 10.530199647379593, + "learning_rate": 4.964805318031805e-05, + "loss": 2.5862, + "mean_token_accuracy": 0.3999999940395355, + "step": 102815 + }, + { + "epoch": 0.1035613960342089, + "grad_norm": 10.98968036115313, + "learning_rate": 4.9647987160093565e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.3965517282485962, + "step": 102820 + }, + { + "epoch": 0.10356643208731307, + "grad_norm": 11.773844974577928, + "learning_rate": 4.9647921133726255e-05, + "loss": 2.193, + "mean_token_accuracy": 0.45972906351089476, + "step": 102825 + }, + { + "epoch": 0.10357146814041725, + "grad_norm": 10.907432719035597, + "learning_rate": 4.964785510121614e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.4137930989265442, + "step": 102830 + }, + { + "epoch": 0.10357650419352142, + "grad_norm": 9.594904047676632, + "learning_rate": 4.964778906256323e-05, + "loss": 2.5079, + "mean_token_accuracy": 0.40689654350280763, + "step": 102835 + }, + { + "epoch": 0.1035815402466256, + "grad_norm": 11.817506377087932, + "learning_rate": 4.964772301776755e-05, + "loss": 1.9084, + "mean_token_accuracy": 0.4689655065536499, + "step": 102840 + }, + { + "epoch": 0.10358657629972977, + "grad_norm": 10.436809406700329, + "learning_rate": 4.964765696682912e-05, + "loss": 2.2675, + "mean_token_accuracy": 0.4517241358757019, + "step": 102845 + }, + { + "epoch": 0.10359161235283394, + "grad_norm": 9.744801261941682, + "learning_rate": 4.9647590909747946e-05, + "loss": 2.3756, + "mean_token_accuracy": 0.4310344815254211, + "step": 102850 + }, + { + "epoch": 0.10359664840593812, + "grad_norm": 10.67594776021183, + "learning_rate": 4.964752484652406e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.38275861740112305, + "step": 102855 + }, + { + "epoch": 0.10360168445904229, + "grad_norm": 10.95796771953436, + "learning_rate": 4.964745877715747e-05, + "loss": 2.7631, + "mean_token_accuracy": 0.3448275804519653, + "step": 102860 + }, + { + "epoch": 0.10360672051214645, + "grad_norm": 10.294691886952323, + "learning_rate": 4.964739270164821e-05, + "loss": 2.616, + "mean_token_accuracy": 0.36206896007061007, + "step": 102865 + }, + { + "epoch": 0.10361175656525062, + "grad_norm": 11.622773085958949, + "learning_rate": 4.964732661999629e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.41034482717514037, + "step": 102870 + }, + { + "epoch": 0.1036167926183548, + "grad_norm": 21.016569424734772, + "learning_rate": 4.964726053220171e-05, + "loss": 2.6472, + "mean_token_accuracy": 0.4517241358757019, + "step": 102875 + }, + { + "epoch": 0.10362182867145897, + "grad_norm": 10.159564637084017, + "learning_rate": 4.9647194438264515e-05, + "loss": 2.6604, + "mean_token_accuracy": 0.31724137663841245, + "step": 102880 + }, + { + "epoch": 0.10362686472456314, + "grad_norm": 10.361291726826597, + "learning_rate": 4.964712833818472e-05, + "loss": 2.2221, + "mean_token_accuracy": 0.41724138259887694, + "step": 102885 + }, + { + "epoch": 0.10363190077766732, + "grad_norm": 10.137609089141609, + "learning_rate": 4.964706223196232e-05, + "loss": 2.8024, + "mean_token_accuracy": 0.38275861740112305, + "step": 102890 + }, + { + "epoch": 0.10363693683077149, + "grad_norm": 9.163925465658757, + "learning_rate": 4.964699611959736e-05, + "loss": 3.0137, + "mean_token_accuracy": 0.38620689511299133, + "step": 102895 + }, + { + "epoch": 0.10364197288387567, + "grad_norm": 9.93383430476696, + "learning_rate": 4.964693000108985e-05, + "loss": 2.4878, + "mean_token_accuracy": 0.43272837400436404, + "step": 102900 + }, + { + "epoch": 0.10364700893697984, + "grad_norm": 8.456397507965278, + "learning_rate": 4.9646863876439796e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.45862067937850953, + "step": 102905 + }, + { + "epoch": 0.10365204499008401, + "grad_norm": 12.527954859910455, + "learning_rate": 4.9646797745647236e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.3689655244350433, + "step": 102910 + }, + { + "epoch": 0.10365708104318819, + "grad_norm": 10.092031234987058, + "learning_rate": 4.964673160871217e-05, + "loss": 2.7003, + "mean_token_accuracy": 0.4000000059604645, + "step": 102915 + }, + { + "epoch": 0.10366211709629236, + "grad_norm": 10.787233802403007, + "learning_rate": 4.964666546563464e-05, + "loss": 1.8639, + "mean_token_accuracy": 0.5697044312953949, + "step": 102920 + }, + { + "epoch": 0.10366715314939653, + "grad_norm": 10.915366378775854, + "learning_rate": 4.964659931641464e-05, + "loss": 2.8096, + "mean_token_accuracy": 0.38620689511299133, + "step": 102925 + }, + { + "epoch": 0.10367218920250071, + "grad_norm": 11.843694460287882, + "learning_rate": 4.96465331610522e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.3914095640182495, + "step": 102930 + }, + { + "epoch": 0.10367722525560487, + "grad_norm": 12.34386225549301, + "learning_rate": 4.9646466999547335e-05, + "loss": 2.2592, + "mean_token_accuracy": 0.4206896543502808, + "step": 102935 + }, + { + "epoch": 0.10368226130870904, + "grad_norm": 12.651232048921464, + "learning_rate": 4.9646400831900066e-05, + "loss": 2.4771, + "mean_token_accuracy": 0.4013309121131897, + "step": 102940 + }, + { + "epoch": 0.10368729736181322, + "grad_norm": 10.265638717915435, + "learning_rate": 4.9646334658110406e-05, + "loss": 2.4267, + "mean_token_accuracy": 0.44827585816383364, + "step": 102945 + }, + { + "epoch": 0.10369233341491739, + "grad_norm": 12.147893068494495, + "learning_rate": 4.964626847817838e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.44482758045196535, + "step": 102950 + }, + { + "epoch": 0.10369736946802156, + "grad_norm": 11.516871813047802, + "learning_rate": 4.964620229210401e-05, + "loss": 2.6737, + "mean_token_accuracy": 0.3876588046550751, + "step": 102955 + }, + { + "epoch": 0.10370240552112574, + "grad_norm": 9.598558584090025, + "learning_rate": 4.96461360998873e-05, + "loss": 2.8774, + "mean_token_accuracy": 0.36896551251411436, + "step": 102960 + }, + { + "epoch": 0.10370744157422991, + "grad_norm": 12.125545601326763, + "learning_rate": 4.9646069901528276e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4448275864124298, + "step": 102965 + }, + { + "epoch": 0.10371247762733408, + "grad_norm": 11.559501571674797, + "learning_rate": 4.964600369702696e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.4551724135875702, + "step": 102970 + }, + { + "epoch": 0.10371751368043826, + "grad_norm": 10.613762754498005, + "learning_rate": 4.964593748638337e-05, + "loss": 2.4601, + "mean_token_accuracy": 0.4517241299152374, + "step": 102975 + }, + { + "epoch": 0.10372254973354243, + "grad_norm": 9.179121786804597, + "learning_rate": 4.964587126959752e-05, + "loss": 2.0388, + "mean_token_accuracy": 0.47241380214691164, + "step": 102980 + }, + { + "epoch": 0.1037275857866466, + "grad_norm": 13.746243739614954, + "learning_rate": 4.964580504666943e-05, + "loss": 2.6289, + "mean_token_accuracy": 0.37586206793785093, + "step": 102985 + }, + { + "epoch": 0.10373262183975078, + "grad_norm": 11.85659496551069, + "learning_rate": 4.964573881759911e-05, + "loss": 2.418, + "mean_token_accuracy": 0.41379310488700866, + "step": 102990 + }, + { + "epoch": 0.10373765789285495, + "grad_norm": 10.488008864164938, + "learning_rate": 4.96456725823866e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.41034482717514037, + "step": 102995 + }, + { + "epoch": 0.10374269394595913, + "grad_norm": 11.29669362449236, + "learning_rate": 4.96456063410319e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.4172413766384125, + "step": 103000 + }, + { + "epoch": 0.10374772999906329, + "grad_norm": 12.217376306539848, + "learning_rate": 4.9645540093535037e-05, + "loss": 2.5203, + "mean_token_accuracy": 0.4470659375190735, + "step": 103005 + }, + { + "epoch": 0.10375276605216746, + "grad_norm": 9.961374498156212, + "learning_rate": 4.9645473839896014e-05, + "loss": 2.5732, + "mean_token_accuracy": 0.3965517163276672, + "step": 103010 + }, + { + "epoch": 0.10375780210527163, + "grad_norm": 11.544591464007157, + "learning_rate": 4.964540758011487e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.38620689511299133, + "step": 103015 + }, + { + "epoch": 0.10376283815837581, + "grad_norm": 11.66084024621183, + "learning_rate": 4.9645341314191614e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.4482758641242981, + "step": 103020 + }, + { + "epoch": 0.10376787421147998, + "grad_norm": 11.087490562297699, + "learning_rate": 4.964527504212627e-05, + "loss": 2.0646, + "mean_token_accuracy": 0.4847549915313721, + "step": 103025 + }, + { + "epoch": 0.10377291026458416, + "grad_norm": 11.17779495633632, + "learning_rate": 4.964520876391885e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.3965517282485962, + "step": 103030 + }, + { + "epoch": 0.10377794631768833, + "grad_norm": 14.16836855289918, + "learning_rate": 4.9645142479569365e-05, + "loss": 2.7772, + "mean_token_accuracy": 0.3655172407627106, + "step": 103035 + }, + { + "epoch": 0.1037829823707925, + "grad_norm": 18.170207200643063, + "learning_rate": 4.964507618907784e-05, + "loss": 2.8981, + "mean_token_accuracy": 0.3620689630508423, + "step": 103040 + }, + { + "epoch": 0.10378801842389668, + "grad_norm": 10.609405445528665, + "learning_rate": 4.964500989244431e-05, + "loss": 2.1909, + "mean_token_accuracy": 0.4620689690113068, + "step": 103045 + }, + { + "epoch": 0.10379305447700085, + "grad_norm": 10.819538578165798, + "learning_rate": 4.964494358966877e-05, + "loss": 2.5672, + "mean_token_accuracy": 0.38965516686439516, + "step": 103050 + }, + { + "epoch": 0.10379809053010502, + "grad_norm": 14.278955876243346, + "learning_rate": 4.9644877280751246e-05, + "loss": 2.5587, + "mean_token_accuracy": 0.3896551728248596, + "step": 103055 + }, + { + "epoch": 0.1038031265832092, + "grad_norm": 12.953895390448283, + "learning_rate": 4.964481096569176e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.41724138259887694, + "step": 103060 + }, + { + "epoch": 0.10380816263631337, + "grad_norm": 9.862541243696544, + "learning_rate": 4.9644744644490323e-05, + "loss": 2.4888, + "mean_token_accuracy": 0.43103448748588563, + "step": 103065 + }, + { + "epoch": 0.10381319868941755, + "grad_norm": 10.524393373886394, + "learning_rate": 4.964467831714697e-05, + "loss": 2.2784, + "mean_token_accuracy": 0.4172413766384125, + "step": 103070 + }, + { + "epoch": 0.1038182347425217, + "grad_norm": 20.148968872601777, + "learning_rate": 4.96446119836617e-05, + "loss": 2.8623, + "mean_token_accuracy": 0.4185117959976196, + "step": 103075 + }, + { + "epoch": 0.10382327079562588, + "grad_norm": 11.300556852011367, + "learning_rate": 4.964454564403455e-05, + "loss": 2.9313, + "mean_token_accuracy": 0.3206896483898163, + "step": 103080 + }, + { + "epoch": 0.10382830684873005, + "grad_norm": 9.115530990852688, + "learning_rate": 4.964447929826551e-05, + "loss": 2.6246, + "mean_token_accuracy": 0.3913490653038025, + "step": 103085 + }, + { + "epoch": 0.10383334290183423, + "grad_norm": 11.346985371573272, + "learning_rate": 4.964441294635462e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.4, + "step": 103090 + }, + { + "epoch": 0.1038383789549384, + "grad_norm": 12.383008048615729, + "learning_rate": 4.96443465883019e-05, + "loss": 2.7198, + "mean_token_accuracy": 0.33793102502822875, + "step": 103095 + }, + { + "epoch": 0.10384341500804257, + "grad_norm": 10.278049099499064, + "learning_rate": 4.964428022410737e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.4068965554237366, + "step": 103100 + }, + { + "epoch": 0.10384845106114675, + "grad_norm": 9.842128740586753, + "learning_rate": 4.964421385377103e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.37586206793785093, + "step": 103105 + }, + { + "epoch": 0.10385348711425092, + "grad_norm": 12.431769659594135, + "learning_rate": 4.964414747729291e-05, + "loss": 2.6686, + "mean_token_accuracy": 0.38620689511299133, + "step": 103110 + }, + { + "epoch": 0.1038585231673551, + "grad_norm": 9.527892922749471, + "learning_rate": 4.964408109467303e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.42758620381355283, + "step": 103115 + }, + { + "epoch": 0.10386355922045927, + "grad_norm": 12.013315758723625, + "learning_rate": 4.964401470591141e-05, + "loss": 2.6244, + "mean_token_accuracy": 0.3743496656417847, + "step": 103120 + }, + { + "epoch": 0.10386859527356344, + "grad_norm": 12.059398000966743, + "learning_rate": 4.964394831100806e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.39655172526836396, + "step": 103125 + }, + { + "epoch": 0.10387363132666762, + "grad_norm": 10.236682310414727, + "learning_rate": 4.9643881909963005e-05, + "loss": 1.9798, + "mean_token_accuracy": 0.476477837562561, + "step": 103130 + }, + { + "epoch": 0.10387866737977179, + "grad_norm": 11.948750204413411, + "learning_rate": 4.964381550277626e-05, + "loss": 2.333, + "mean_token_accuracy": 0.39310344457626345, + "step": 103135 + }, + { + "epoch": 0.10388370343287597, + "grad_norm": 10.389392272985065, + "learning_rate": 4.964374908944785e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.42413793206214906, + "step": 103140 + }, + { + "epoch": 0.10388873948598012, + "grad_norm": 10.482548569784681, + "learning_rate": 4.9643682669977784e-05, + "loss": 2.168, + "mean_token_accuracy": 0.47586206793785096, + "step": 103145 + }, + { + "epoch": 0.1038937755390843, + "grad_norm": 9.366579341243721, + "learning_rate": 4.964361624436609e-05, + "loss": 2.9311, + "mean_token_accuracy": 0.417241370677948, + "step": 103150 + }, + { + "epoch": 0.10389881159218847, + "grad_norm": 8.877242022777903, + "learning_rate": 4.9643549812612775e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.4517241418361664, + "step": 103155 + }, + { + "epoch": 0.10390384764529265, + "grad_norm": 11.63038409830123, + "learning_rate": 4.9643483374717876e-05, + "loss": 2.6384, + "mean_token_accuracy": 0.42758620381355283, + "step": 103160 + }, + { + "epoch": 0.10390888369839682, + "grad_norm": 8.924805549176025, + "learning_rate": 4.9643416930681385e-05, + "loss": 3.1786, + "mean_token_accuracy": 0.37586206793785093, + "step": 103165 + }, + { + "epoch": 0.103913919751501, + "grad_norm": 9.810659204540482, + "learning_rate": 4.964335048050334e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4137930989265442, + "step": 103170 + }, + { + "epoch": 0.10391895580460517, + "grad_norm": 13.268822926787431, + "learning_rate": 4.9643284024183755e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.4517241358757019, + "step": 103175 + }, + { + "epoch": 0.10392399185770934, + "grad_norm": 9.64431378841381, + "learning_rate": 4.964321756172265e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.5103448331356049, + "step": 103180 + }, + { + "epoch": 0.10392902791081352, + "grad_norm": 10.46694535595462, + "learning_rate": 4.9643151093120036e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.3758620649576187, + "step": 103185 + }, + { + "epoch": 0.10393406396391769, + "grad_norm": 16.359528268712353, + "learning_rate": 4.964308461837594e-05, + "loss": 2.5574, + "mean_token_accuracy": 0.4206896543502808, + "step": 103190 + }, + { + "epoch": 0.10393910001702186, + "grad_norm": 10.325443510330471, + "learning_rate": 4.964301813749037e-05, + "loss": 2.6767, + "mean_token_accuracy": 0.39310344457626345, + "step": 103195 + }, + { + "epoch": 0.10394413607012604, + "grad_norm": 10.306307195822047, + "learning_rate": 4.964295165046336e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.4310344815254211, + "step": 103200 + }, + { + "epoch": 0.10394917212323021, + "grad_norm": 10.154949405566802, + "learning_rate": 4.964288515729492e-05, + "loss": 2.8357, + "mean_token_accuracy": 0.38275861740112305, + "step": 103205 + }, + { + "epoch": 0.10395420817633438, + "grad_norm": 10.246948459832236, + "learning_rate": 4.964281865798506e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.41379310488700866, + "step": 103210 + }, + { + "epoch": 0.10395924422943854, + "grad_norm": 12.44744602847921, + "learning_rate": 4.964275215253381e-05, + "loss": 2.5516, + "mean_token_accuracy": 0.42068964540958403, + "step": 103215 + }, + { + "epoch": 0.10396428028254272, + "grad_norm": 18.60790535718843, + "learning_rate": 4.964268564094119e-05, + "loss": 2.6742, + "mean_token_accuracy": 0.4413793087005615, + "step": 103220 + }, + { + "epoch": 0.10396931633564689, + "grad_norm": 10.163224950449452, + "learning_rate": 4.964261912320721e-05, + "loss": 2.4327, + "mean_token_accuracy": 0.4275861978530884, + "step": 103225 + }, + { + "epoch": 0.10397435238875107, + "grad_norm": 10.62199361822042, + "learning_rate": 4.964255259933189e-05, + "loss": 2.8677, + "mean_token_accuracy": 0.38620689511299133, + "step": 103230 + }, + { + "epoch": 0.10397938844185524, + "grad_norm": 10.06233262961867, + "learning_rate": 4.9642486069315253e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.4344827592372894, + "step": 103235 + }, + { + "epoch": 0.10398442449495941, + "grad_norm": 10.668441515428324, + "learning_rate": 4.9642419533157316e-05, + "loss": 2.2822, + "mean_token_accuracy": 0.4310344815254211, + "step": 103240 + }, + { + "epoch": 0.10398946054806359, + "grad_norm": 10.880775073365328, + "learning_rate": 4.9642352990858084e-05, + "loss": 3.0879, + "mean_token_accuracy": 0.3965517282485962, + "step": 103245 + }, + { + "epoch": 0.10399449660116776, + "grad_norm": 10.999367407275313, + "learning_rate": 4.9642286442417606e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4482758641242981, + "step": 103250 + }, + { + "epoch": 0.10399953265427193, + "grad_norm": 14.956567271060655, + "learning_rate": 4.9642219887835875e-05, + "loss": 2.7544, + "mean_token_accuracy": 0.3931034505367279, + "step": 103255 + }, + { + "epoch": 0.10400456870737611, + "grad_norm": 13.48617843293436, + "learning_rate": 4.964215332711292e-05, + "loss": 2.5589, + "mean_token_accuracy": 0.4310344815254211, + "step": 103260 + }, + { + "epoch": 0.10400960476048028, + "grad_norm": 8.576824025256261, + "learning_rate": 4.964208676024875e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.4551724135875702, + "step": 103265 + }, + { + "epoch": 0.10401464081358446, + "grad_norm": 13.463537277148754, + "learning_rate": 4.9642020187243394e-05, + "loss": 2.3939, + "mean_token_accuracy": 0.39655172228813174, + "step": 103270 + }, + { + "epoch": 0.10401967686668863, + "grad_norm": 8.691586043168915, + "learning_rate": 4.964195360809686e-05, + "loss": 2.2159, + "mean_token_accuracy": 0.47586206197738645, + "step": 103275 + }, + { + "epoch": 0.1040247129197928, + "grad_norm": 11.218305997294125, + "learning_rate": 4.9641887022809175e-05, + "loss": 2.2314, + "mean_token_accuracy": 0.44482759237289426, + "step": 103280 + }, + { + "epoch": 0.10402974897289696, + "grad_norm": 9.440437200746915, + "learning_rate": 4.964182043138036e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.3862069010734558, + "step": 103285 + }, + { + "epoch": 0.10403478502600114, + "grad_norm": 12.155968677040468, + "learning_rate": 4.9641753833810425e-05, + "loss": 2.3421, + "mean_token_accuracy": 0.4, + "step": 103290 + }, + { + "epoch": 0.10403982107910531, + "grad_norm": 14.98645656481463, + "learning_rate": 4.9641687230099394e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.4413793087005615, + "step": 103295 + }, + { + "epoch": 0.10404485713220948, + "grad_norm": 17.883162164846706, + "learning_rate": 4.964162062024728e-05, + "loss": 2.527, + "mean_token_accuracy": 0.4329098641872406, + "step": 103300 + }, + { + "epoch": 0.10404989318531366, + "grad_norm": 9.82871302895975, + "learning_rate": 4.96415540042541e-05, + "loss": 2.493, + "mean_token_accuracy": 0.4034482777118683, + "step": 103305 + }, + { + "epoch": 0.10405492923841783, + "grad_norm": 10.211613205743413, + "learning_rate": 4.9641487382119886e-05, + "loss": 2.4909, + "mean_token_accuracy": 0.37931033968925476, + "step": 103310 + }, + { + "epoch": 0.104059965291522, + "grad_norm": 10.002671639656016, + "learning_rate": 4.964142075384465e-05, + "loss": 2.56, + "mean_token_accuracy": 0.3911070764064789, + "step": 103315 + }, + { + "epoch": 0.10406500134462618, + "grad_norm": 10.486757320972604, + "learning_rate": 4.96413541194284e-05, + "loss": 2.0799, + "mean_token_accuracy": 0.5094373881816864, + "step": 103320 + }, + { + "epoch": 0.10407003739773035, + "grad_norm": 10.49286509392905, + "learning_rate": 4.9641287478871174e-05, + "loss": 2.3625, + "mean_token_accuracy": 0.4586206912994385, + "step": 103325 + }, + { + "epoch": 0.10407507345083453, + "grad_norm": 11.817766591551393, + "learning_rate": 4.964122083217297e-05, + "loss": 2.342, + "mean_token_accuracy": 0.43103448748588563, + "step": 103330 + }, + { + "epoch": 0.1040801095039387, + "grad_norm": 9.992153067497501, + "learning_rate": 4.964115417933382e-05, + "loss": 2.0207, + "mean_token_accuracy": 0.4986085832118988, + "step": 103335 + }, + { + "epoch": 0.10408514555704287, + "grad_norm": 11.92204613269951, + "learning_rate": 4.964108752035374e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.37241379022598264, + "step": 103340 + }, + { + "epoch": 0.10409018161014705, + "grad_norm": 8.146902150215247, + "learning_rate": 4.964102085523274e-05, + "loss": 2.3077, + "mean_token_accuracy": 0.4482758641242981, + "step": 103345 + }, + { + "epoch": 0.10409521766325122, + "grad_norm": 10.133016040599395, + "learning_rate": 4.964095418397085e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.4275861978530884, + "step": 103350 + }, + { + "epoch": 0.10410025371635538, + "grad_norm": 13.3906812671072, + "learning_rate": 4.964088750656808e-05, + "loss": 2.5773, + "mean_token_accuracy": 0.41379310488700866, + "step": 103355 + }, + { + "epoch": 0.10410528976945956, + "grad_norm": 10.007831257830533, + "learning_rate": 4.964082082302445e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.41379310488700866, + "step": 103360 + }, + { + "epoch": 0.10411032582256373, + "grad_norm": 12.2229837756531, + "learning_rate": 4.964075413333999e-05, + "loss": 2.1582, + "mean_token_accuracy": 0.48620688915252686, + "step": 103365 + }, + { + "epoch": 0.1041153618756679, + "grad_norm": 10.342947958259925, + "learning_rate": 4.964068743751471e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.4586206912994385, + "step": 103370 + }, + { + "epoch": 0.10412039792877208, + "grad_norm": 12.110625930506135, + "learning_rate": 4.9640620735548614e-05, + "loss": 3.1656, + "mean_token_accuracy": 0.3620689630508423, + "step": 103375 + }, + { + "epoch": 0.10412543398187625, + "grad_norm": 10.53502498754386, + "learning_rate": 4.964055402744174e-05, + "loss": 2.8981, + "mean_token_accuracy": 0.39310344457626345, + "step": 103380 + }, + { + "epoch": 0.10413047003498042, + "grad_norm": 14.418509527586824, + "learning_rate": 4.964048731319411e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.38275861740112305, + "step": 103385 + }, + { + "epoch": 0.1041355060880846, + "grad_norm": 9.623282878133681, + "learning_rate": 4.964042059280573e-05, + "loss": 2.4296, + "mean_token_accuracy": 0.4172413766384125, + "step": 103390 + }, + { + "epoch": 0.10414054214118877, + "grad_norm": 10.259714571318074, + "learning_rate": 4.964035386627661e-05, + "loss": 2.5875, + "mean_token_accuracy": 0.34137930274009703, + "step": 103395 + }, + { + "epoch": 0.10414557819429295, + "grad_norm": 11.38552948638494, + "learning_rate": 4.9640287133606794e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.46551724076271056, + "step": 103400 + }, + { + "epoch": 0.10415061424739712, + "grad_norm": 10.547823130894201, + "learning_rate": 4.964022039479628e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.47931034564971925, + "step": 103405 + }, + { + "epoch": 0.10415565030050129, + "grad_norm": 11.321733970183939, + "learning_rate": 4.964015364984509e-05, + "loss": 2.7999, + "mean_token_accuracy": 0.4034482777118683, + "step": 103410 + }, + { + "epoch": 0.10416068635360547, + "grad_norm": 11.600272548937992, + "learning_rate": 4.9640086898753254e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.38620689511299133, + "step": 103415 + }, + { + "epoch": 0.10416572240670964, + "grad_norm": 10.777140456521964, + "learning_rate": 4.964002014152077e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.3862068891525269, + "step": 103420 + }, + { + "epoch": 0.1041707584598138, + "grad_norm": 11.119819852100127, + "learning_rate": 4.9639953378147685e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.4223835408687592, + "step": 103425 + }, + { + "epoch": 0.10417579451291797, + "grad_norm": 7.941035122173889, + "learning_rate": 4.9639886608633993e-05, + "loss": 1.5436, + "mean_token_accuracy": 0.5575922548770904, + "step": 103430 + }, + { + "epoch": 0.10418083056602215, + "grad_norm": 11.522293897442974, + "learning_rate": 4.963981983297972e-05, + "loss": 2.4231, + "mean_token_accuracy": 0.42068966031074523, + "step": 103435 + }, + { + "epoch": 0.10418586661912632, + "grad_norm": 13.592013846310603, + "learning_rate": 4.963975305118488e-05, + "loss": 2.431, + "mean_token_accuracy": 0.39655172228813174, + "step": 103440 + }, + { + "epoch": 0.1041909026722305, + "grad_norm": 9.173513121450622, + "learning_rate": 4.963968626324951e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.4485178530216217, + "step": 103445 + }, + { + "epoch": 0.10419593872533467, + "grad_norm": 11.145550894835978, + "learning_rate": 4.963961946917361e-05, + "loss": 2.2309, + "mean_token_accuracy": 0.4275861978530884, + "step": 103450 + }, + { + "epoch": 0.10420097477843884, + "grad_norm": 11.089754074483869, + "learning_rate": 4.9639552668957206e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.4137930989265442, + "step": 103455 + }, + { + "epoch": 0.10420601083154302, + "grad_norm": 8.44635633157336, + "learning_rate": 4.963948586260031e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.41379310488700866, + "step": 103460 + }, + { + "epoch": 0.10421104688464719, + "grad_norm": 10.975085593705144, + "learning_rate": 4.963941905010295e-05, + "loss": 2.2306, + "mean_token_accuracy": 0.4517241299152374, + "step": 103465 + }, + { + "epoch": 0.10421608293775136, + "grad_norm": 9.340644400696902, + "learning_rate": 4.9639352231465136e-05, + "loss": 2.5608, + "mean_token_accuracy": 0.37586206793785093, + "step": 103470 + }, + { + "epoch": 0.10422111899085554, + "grad_norm": 22.183797640315863, + "learning_rate": 4.9639285406686894e-05, + "loss": 3.0508, + "mean_token_accuracy": 0.3827586114406586, + "step": 103475 + }, + { + "epoch": 0.10422615504395971, + "grad_norm": 13.619188693516692, + "learning_rate": 4.9639218575768233e-05, + "loss": 2.2544, + "mean_token_accuracy": 0.4463399887084961, + "step": 103480 + }, + { + "epoch": 0.10423119109706389, + "grad_norm": 11.330051944073848, + "learning_rate": 4.963915173870917e-05, + "loss": 2.21, + "mean_token_accuracy": 0.47586206793785096, + "step": 103485 + }, + { + "epoch": 0.10423622715016806, + "grad_norm": 11.535812835645796, + "learning_rate": 4.963908489550975e-05, + "loss": 2.5493, + "mean_token_accuracy": 0.4206896543502808, + "step": 103490 + }, + { + "epoch": 0.10424126320327222, + "grad_norm": 9.591861953394002, + "learning_rate": 4.963901804616996e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.4344827592372894, + "step": 103495 + }, + { + "epoch": 0.10424629925637639, + "grad_norm": 10.27634254909053, + "learning_rate": 4.9638951190689833e-05, + "loss": 2.455, + "mean_token_accuracy": 0.4137930989265442, + "step": 103500 + }, + { + "epoch": 0.10425133530948057, + "grad_norm": 13.906251838594448, + "learning_rate": 4.963888432906938e-05, + "loss": 2.4018, + "mean_token_accuracy": 0.4241379380226135, + "step": 103505 + }, + { + "epoch": 0.10425637136258474, + "grad_norm": 10.205707547763863, + "learning_rate": 4.9638817461308634e-05, + "loss": 2.7354, + "mean_token_accuracy": 0.3724137932062149, + "step": 103510 + }, + { + "epoch": 0.10426140741568891, + "grad_norm": 11.229334354443871, + "learning_rate": 4.96387505874076e-05, + "loss": 2.597, + "mean_token_accuracy": 0.43448275327682495, + "step": 103515 + }, + { + "epoch": 0.10426644346879309, + "grad_norm": 10.581523680019457, + "learning_rate": 4.9638683707366304e-05, + "loss": 2.5355, + "mean_token_accuracy": 0.41506351828575133, + "step": 103520 + }, + { + "epoch": 0.10427147952189726, + "grad_norm": 15.318133913874002, + "learning_rate": 4.963861682118476e-05, + "loss": 2.7423, + "mean_token_accuracy": 0.382758629322052, + "step": 103525 + }, + { + "epoch": 0.10427651557500144, + "grad_norm": 10.378484211995836, + "learning_rate": 4.963854992886298e-05, + "loss": 2.2227, + "mean_token_accuracy": 0.42758620977401735, + "step": 103530 + }, + { + "epoch": 0.10428155162810561, + "grad_norm": 10.711756366934662, + "learning_rate": 4.9638483030401e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.43793103098869324, + "step": 103535 + }, + { + "epoch": 0.10428658768120978, + "grad_norm": 10.033906857823823, + "learning_rate": 4.9638416125798825e-05, + "loss": 2.0944, + "mean_token_accuracy": 0.495099812746048, + "step": 103540 + }, + { + "epoch": 0.10429162373431396, + "grad_norm": 10.084271402057777, + "learning_rate": 4.963834921505648e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.43793103098869324, + "step": 103545 + }, + { + "epoch": 0.10429665978741813, + "grad_norm": 12.389113586671364, + "learning_rate": 4.963828229817398e-05, + "loss": 2.8161, + "mean_token_accuracy": 0.3965517282485962, + "step": 103550 + }, + { + "epoch": 0.1043016958405223, + "grad_norm": 12.23171169453411, + "learning_rate": 4.963821537515134e-05, + "loss": 2.432, + "mean_token_accuracy": 0.4310344815254211, + "step": 103555 + }, + { + "epoch": 0.10430673189362648, + "grad_norm": 10.378459731312915, + "learning_rate": 4.9638148445988596e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.42413793206214906, + "step": 103560 + }, + { + "epoch": 0.10431176794673064, + "grad_norm": 13.36084284854327, + "learning_rate": 4.963808151068575e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.39310344457626345, + "step": 103565 + }, + { + "epoch": 0.10431680399983481, + "grad_norm": 12.177270616466435, + "learning_rate": 4.9638014569242813e-05, + "loss": 2.6571, + "mean_token_accuracy": 0.3620689630508423, + "step": 103570 + }, + { + "epoch": 0.10432184005293899, + "grad_norm": 12.600076777861132, + "learning_rate": 4.9637947621659825e-05, + "loss": 2.4698, + "mean_token_accuracy": 0.42879613041877745, + "step": 103575 + }, + { + "epoch": 0.10432687610604316, + "grad_norm": 10.650463763307181, + "learning_rate": 4.9637880667936795e-05, + "loss": 2.6863, + "mean_token_accuracy": 0.4379310369491577, + "step": 103580 + }, + { + "epoch": 0.10433191215914733, + "grad_norm": 11.371756029700144, + "learning_rate": 4.963781370807374e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.42413793206214906, + "step": 103585 + }, + { + "epoch": 0.1043369482122515, + "grad_norm": 11.348176211773698, + "learning_rate": 4.963774674207067e-05, + "loss": 2.3483, + "mean_token_accuracy": 0.42068966031074523, + "step": 103590 + }, + { + "epoch": 0.10434198426535568, + "grad_norm": 10.473499532902943, + "learning_rate": 4.963767976992763e-05, + "loss": 2.5315, + "mean_token_accuracy": 0.42413793206214906, + "step": 103595 + }, + { + "epoch": 0.10434702031845985, + "grad_norm": 10.181923659945623, + "learning_rate": 4.963761279164461e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.4413793087005615, + "step": 103600 + }, + { + "epoch": 0.10435205637156403, + "grad_norm": 11.741191079925311, + "learning_rate": 4.963754580722165e-05, + "loss": 2.4332, + "mean_token_accuracy": 0.44827587008476255, + "step": 103605 + }, + { + "epoch": 0.1043570924246682, + "grad_norm": 12.14507891519842, + "learning_rate": 4.963747881665875e-05, + "loss": 2.9841, + "mean_token_accuracy": 0.34482758641242983, + "step": 103610 + }, + { + "epoch": 0.10436212847777238, + "grad_norm": 12.220318888751688, + "learning_rate": 4.9637411819955944e-05, + "loss": 2.5066, + "mean_token_accuracy": 0.39310344457626345, + "step": 103615 + }, + { + "epoch": 0.10436716453087655, + "grad_norm": 12.76438227022332, + "learning_rate": 4.9637344817113243e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.3999999940395355, + "step": 103620 + }, + { + "epoch": 0.10437220058398072, + "grad_norm": 9.686551430316133, + "learning_rate": 4.963727780813066e-05, + "loss": 2.6688, + "mean_token_accuracy": 0.3965517163276672, + "step": 103625 + }, + { + "epoch": 0.1043772366370849, + "grad_norm": 12.19324147446133, + "learning_rate": 4.963721079300823e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.44694494009017943, + "step": 103630 + }, + { + "epoch": 0.10438227269018906, + "grad_norm": 9.404065912769726, + "learning_rate": 4.963714377174595e-05, + "loss": 2.2919, + "mean_token_accuracy": 0.42068964838981626, + "step": 103635 + }, + { + "epoch": 0.10438730874329323, + "grad_norm": 8.7383613542677, + "learning_rate": 4.9637076744343864e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4137930989265442, + "step": 103640 + }, + { + "epoch": 0.1043923447963974, + "grad_norm": 12.843332349912515, + "learning_rate": 4.963700971080197e-05, + "loss": 2.2671, + "mean_token_accuracy": 0.4517241358757019, + "step": 103645 + }, + { + "epoch": 0.10439738084950158, + "grad_norm": 11.796621544836372, + "learning_rate": 4.96369426711203e-05, + "loss": 2.4795, + "mean_token_accuracy": 0.3862068891525269, + "step": 103650 + }, + { + "epoch": 0.10440241690260575, + "grad_norm": 11.140150458946463, + "learning_rate": 4.9636875625298855e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.43793103098869324, + "step": 103655 + }, + { + "epoch": 0.10440745295570993, + "grad_norm": 10.922218200206508, + "learning_rate": 4.963680857333767e-05, + "loss": 2.0578, + "mean_token_accuracy": 0.4620689690113068, + "step": 103660 + }, + { + "epoch": 0.1044124890088141, + "grad_norm": 36.46360879936684, + "learning_rate": 4.963674151523677e-05, + "loss": 2.6587, + "mean_token_accuracy": 0.441379314661026, + "step": 103665 + }, + { + "epoch": 0.10441752506191827, + "grad_norm": 14.067742648750556, + "learning_rate": 4.963667445099615e-05, + "loss": 2.7886, + "mean_token_accuracy": 0.38620689511299133, + "step": 103670 + }, + { + "epoch": 0.10442256111502245, + "grad_norm": 13.928579320046438, + "learning_rate": 4.963660738061584e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.44482759237289426, + "step": 103675 + }, + { + "epoch": 0.10442759716812662, + "grad_norm": 9.44874343203612, + "learning_rate": 4.963654030409586e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.44137930274009707, + "step": 103680 + }, + { + "epoch": 0.1044326332212308, + "grad_norm": 10.673237390557992, + "learning_rate": 4.963647322143623e-05, + "loss": 2.4717, + "mean_token_accuracy": 0.3896551728248596, + "step": 103685 + }, + { + "epoch": 0.10443766927433497, + "grad_norm": 11.82852704694365, + "learning_rate": 4.963640613263698e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.41379310488700866, + "step": 103690 + }, + { + "epoch": 0.10444270532743914, + "grad_norm": 12.157273296885288, + "learning_rate": 4.9636339037698094e-05, + "loss": 2.7572, + "mean_token_accuracy": 0.43242589831352235, + "step": 103695 + }, + { + "epoch": 0.10444774138054332, + "grad_norm": 14.101796927600006, + "learning_rate": 4.9636271936619616e-05, + "loss": 2.6262, + "mean_token_accuracy": 0.35172413289546967, + "step": 103700 + }, + { + "epoch": 0.10445277743364748, + "grad_norm": 14.167107548156475, + "learning_rate": 4.963620482940157e-05, + "loss": 2.2663, + "mean_token_accuracy": 0.4502722263336182, + "step": 103705 + }, + { + "epoch": 0.10445781348675165, + "grad_norm": 10.17840737953548, + "learning_rate": 4.9636137716043956e-05, + "loss": 1.9708, + "mean_token_accuracy": 0.48275861144065857, + "step": 103710 + }, + { + "epoch": 0.10446284953985582, + "grad_norm": 13.1848292823169, + "learning_rate": 4.963607059654681e-05, + "loss": 2.4803, + "mean_token_accuracy": 0.4068965494632721, + "step": 103715 + }, + { + "epoch": 0.10446788559296, + "grad_norm": 10.127751539255769, + "learning_rate": 4.9636003470910134e-05, + "loss": 2.2646, + "mean_token_accuracy": 0.48620688915252686, + "step": 103720 + }, + { + "epoch": 0.10447292164606417, + "grad_norm": 11.570488402059526, + "learning_rate": 4.963593633913396e-05, + "loss": 2.525, + "mean_token_accuracy": 0.4222625494003296, + "step": 103725 + }, + { + "epoch": 0.10447795769916834, + "grad_norm": 11.42208981661114, + "learning_rate": 4.96358692012183e-05, + "loss": 2.2482, + "mean_token_accuracy": 0.43793103098869324, + "step": 103730 + }, + { + "epoch": 0.10448299375227252, + "grad_norm": 8.893750605117278, + "learning_rate": 4.963580205716317e-05, + "loss": 2.612, + "mean_token_accuracy": 0.41034482717514037, + "step": 103735 + }, + { + "epoch": 0.10448802980537669, + "grad_norm": 9.89667229528441, + "learning_rate": 4.96357349069686e-05, + "loss": 2.0072, + "mean_token_accuracy": 0.4852389574050903, + "step": 103740 + }, + { + "epoch": 0.10449306585848087, + "grad_norm": 9.82896367214625, + "learning_rate": 4.9635667750634594e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.4034482717514038, + "step": 103745 + }, + { + "epoch": 0.10449810191158504, + "grad_norm": 9.003076552507387, + "learning_rate": 4.963560058816118e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.3965517163276672, + "step": 103750 + }, + { + "epoch": 0.10450313796468921, + "grad_norm": 11.461572428129221, + "learning_rate": 4.963553341954838e-05, + "loss": 2.6165, + "mean_token_accuracy": 0.3793103456497192, + "step": 103755 + }, + { + "epoch": 0.10450817401779339, + "grad_norm": 16.95036199849308, + "learning_rate": 4.9635466244796204e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.425952810049057, + "step": 103760 + }, + { + "epoch": 0.10451321007089756, + "grad_norm": 7.344111676692995, + "learning_rate": 4.963539906390468e-05, + "loss": 2.1086, + "mean_token_accuracy": 0.47931033968925474, + "step": 103765 + }, + { + "epoch": 0.10451824612400173, + "grad_norm": 12.03461989892936, + "learning_rate": 4.963533187687382e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.495099812746048, + "step": 103770 + }, + { + "epoch": 0.1045232821771059, + "grad_norm": 9.535352672246937, + "learning_rate": 4.9635264683703627e-05, + "loss": 2.1203, + "mean_token_accuracy": 0.441379314661026, + "step": 103775 + }, + { + "epoch": 0.10452831823021007, + "grad_norm": 8.972267724154506, + "learning_rate": 4.963519748439415e-05, + "loss": 2.2181, + "mean_token_accuracy": 0.4482758641242981, + "step": 103780 + }, + { + "epoch": 0.10453335428331424, + "grad_norm": 9.20649779723848, + "learning_rate": 4.963513027894538e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.46061705946922304, + "step": 103785 + }, + { + "epoch": 0.10453839033641842, + "grad_norm": 11.397274773826782, + "learning_rate": 4.963506306735737e-05, + "loss": 2.3216, + "mean_token_accuracy": 0.47931033968925474, + "step": 103790 + }, + { + "epoch": 0.10454342638952259, + "grad_norm": 9.657594127647421, + "learning_rate": 4.96349958496301e-05, + "loss": 2.3275, + "mean_token_accuracy": 0.4620689630508423, + "step": 103795 + }, + { + "epoch": 0.10454846244262676, + "grad_norm": 10.113205202715854, + "learning_rate": 4.963492862576361e-05, + "loss": 2.6431, + "mean_token_accuracy": 0.38965516686439516, + "step": 103800 + }, + { + "epoch": 0.10455349849573094, + "grad_norm": 9.476346875198999, + "learning_rate": 4.963486139575792e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.47241379618644713, + "step": 103805 + }, + { + "epoch": 0.10455853454883511, + "grad_norm": 10.17892326259015, + "learning_rate": 4.9634794159613044e-05, + "loss": 2.4943, + "mean_token_accuracy": 0.4482758641242981, + "step": 103810 + }, + { + "epoch": 0.10456357060193928, + "grad_norm": 10.77684541891807, + "learning_rate": 4.9634726917328996e-05, + "loss": 2.7576, + "mean_token_accuracy": 0.38965516686439516, + "step": 103815 + }, + { + "epoch": 0.10456860665504346, + "grad_norm": 12.59925814778215, + "learning_rate": 4.96346596689058e-05, + "loss": 2.6967, + "mean_token_accuracy": 0.3862069010734558, + "step": 103820 + }, + { + "epoch": 0.10457364270814763, + "grad_norm": 12.83608469894313, + "learning_rate": 4.963459241434347e-05, + "loss": 2.9241, + "mean_token_accuracy": 0.3448275804519653, + "step": 103825 + }, + { + "epoch": 0.1045786787612518, + "grad_norm": 8.834855075990559, + "learning_rate": 4.9634525153642035e-05, + "loss": 2.2304, + "mean_token_accuracy": 0.45396249294281005, + "step": 103830 + }, + { + "epoch": 0.10458371481435598, + "grad_norm": 10.627287815257999, + "learning_rate": 4.96344578868015e-05, + "loss": 2.2472, + "mean_token_accuracy": 0.4429521977901459, + "step": 103835 + }, + { + "epoch": 0.10458875086746015, + "grad_norm": 8.50141838204776, + "learning_rate": 4.96343906138219e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.4813067078590393, + "step": 103840 + }, + { + "epoch": 0.10459378692056431, + "grad_norm": 11.275918176377852, + "learning_rate": 4.9634323334703236e-05, + "loss": 2.2656, + "mean_token_accuracy": 0.4592256486415863, + "step": 103845 + }, + { + "epoch": 0.10459882297366849, + "grad_norm": 10.296569154783302, + "learning_rate": 4.9634256049445534e-05, + "loss": 2.7804, + "mean_token_accuracy": 0.4188747704029083, + "step": 103850 + }, + { + "epoch": 0.10460385902677266, + "grad_norm": 11.388375264799716, + "learning_rate": 4.9634188758048816e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.44482759237289426, + "step": 103855 + }, + { + "epoch": 0.10460889507987683, + "grad_norm": 12.311459088558646, + "learning_rate": 4.96341214605131e-05, + "loss": 2.468, + "mean_token_accuracy": 0.4448275983333588, + "step": 103860 + }, + { + "epoch": 0.10461393113298101, + "grad_norm": 10.150749440153362, + "learning_rate": 4.96340541568384e-05, + "loss": 2.3655, + "mean_token_accuracy": 0.4103448301553726, + "step": 103865 + }, + { + "epoch": 0.10461896718608518, + "grad_norm": 12.186425261492035, + "learning_rate": 4.963398684702474e-05, + "loss": 2.825, + "mean_token_accuracy": 0.41379310488700866, + "step": 103870 + }, + { + "epoch": 0.10462400323918936, + "grad_norm": 15.864778150880378, + "learning_rate": 4.9633919531072146e-05, + "loss": 2.6345, + "mean_token_accuracy": 0.3758620619773865, + "step": 103875 + }, + { + "epoch": 0.10462903929229353, + "grad_norm": 12.775242450317522, + "learning_rate": 4.963385220898061e-05, + "loss": 2.4882, + "mean_token_accuracy": 0.4172413766384125, + "step": 103880 + }, + { + "epoch": 0.1046340753453977, + "grad_norm": 8.836555736129975, + "learning_rate": 4.963378488075017e-05, + "loss": 2.671, + "mean_token_accuracy": 0.44827585220336913, + "step": 103885 + }, + { + "epoch": 0.10463911139850188, + "grad_norm": 11.426439924731422, + "learning_rate": 4.963371754638085e-05, + "loss": 2.3924, + "mean_token_accuracy": 0.4399878978729248, + "step": 103890 + }, + { + "epoch": 0.10464414745160605, + "grad_norm": 10.138186121060816, + "learning_rate": 4.963365020587266e-05, + "loss": 2.7743, + "mean_token_accuracy": 0.3965517163276672, + "step": 103895 + }, + { + "epoch": 0.10464918350471022, + "grad_norm": 10.180488070865366, + "learning_rate": 4.9633582859225617e-05, + "loss": 2.4515, + "mean_token_accuracy": 0.43103447556495667, + "step": 103900 + }, + { + "epoch": 0.1046542195578144, + "grad_norm": 10.343279252894899, + "learning_rate": 4.9633515506439745e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.4344827473163605, + "step": 103905 + }, + { + "epoch": 0.10465925561091857, + "grad_norm": 11.82328202583085, + "learning_rate": 4.963344814751505e-05, + "loss": 2.4676, + "mean_token_accuracy": 0.4172413766384125, + "step": 103910 + }, + { + "epoch": 0.10466429166402273, + "grad_norm": 9.697109450703039, + "learning_rate": 4.963338078245157e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.4379310369491577, + "step": 103915 + }, + { + "epoch": 0.1046693277171269, + "grad_norm": 12.33348395039808, + "learning_rate": 4.963331341124931e-05, + "loss": 2.5811, + "mean_token_accuracy": 0.4241379380226135, + "step": 103920 + }, + { + "epoch": 0.10467436377023108, + "grad_norm": 10.727621179600156, + "learning_rate": 4.963324603390829e-05, + "loss": 2.53, + "mean_token_accuracy": 0.3827586233615875, + "step": 103925 + }, + { + "epoch": 0.10467939982333525, + "grad_norm": 11.050456638740005, + "learning_rate": 4.9633178650428534e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.4586206912994385, + "step": 103930 + }, + { + "epoch": 0.10468443587643943, + "grad_norm": 10.215214947431406, + "learning_rate": 4.9633111260810065e-05, + "loss": 2.3731, + "mean_token_accuracy": 0.44827585220336913, + "step": 103935 + }, + { + "epoch": 0.1046894719295436, + "grad_norm": 12.2797076162197, + "learning_rate": 4.963304386505289e-05, + "loss": 3.0348, + "mean_token_accuracy": 0.37586206793785093, + "step": 103940 + }, + { + "epoch": 0.10469450798264777, + "grad_norm": 12.396413275175545, + "learning_rate": 4.963297646315704e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4379310429096222, + "step": 103945 + }, + { + "epoch": 0.10469954403575195, + "grad_norm": 13.493329298902268, + "learning_rate": 4.9632909055122514e-05, + "loss": 2.5714, + "mean_token_accuracy": 0.37586206793785093, + "step": 103950 + }, + { + "epoch": 0.10470458008885612, + "grad_norm": 20.143616129975943, + "learning_rate": 4.963284164094935e-05, + "loss": 2.524, + "mean_token_accuracy": 0.38275861740112305, + "step": 103955 + }, + { + "epoch": 0.1047096161419603, + "grad_norm": 12.228609158097202, + "learning_rate": 4.963277422063756e-05, + "loss": 2.3223, + "mean_token_accuracy": 0.4379310369491577, + "step": 103960 + }, + { + "epoch": 0.10471465219506447, + "grad_norm": 11.537071422058181, + "learning_rate": 4.963270679418716e-05, + "loss": 2.376, + "mean_token_accuracy": 0.4413793087005615, + "step": 103965 + }, + { + "epoch": 0.10471968824816864, + "grad_norm": 10.531307624490426, + "learning_rate": 4.963263936159818e-05, + "loss": 2.2471, + "mean_token_accuracy": 0.46551724672317507, + "step": 103970 + }, + { + "epoch": 0.10472472430127282, + "grad_norm": 11.332010079471702, + "learning_rate": 4.9632571922870616e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.4034482717514038, + "step": 103975 + }, + { + "epoch": 0.10472976035437699, + "grad_norm": 10.238605881439101, + "learning_rate": 4.9632504478004504e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.41724138259887694, + "step": 103980 + }, + { + "epoch": 0.10473479640748115, + "grad_norm": 22.071766369195398, + "learning_rate": 4.9632437026999864e-05, + "loss": 2.4119, + "mean_token_accuracy": 0.41034482717514037, + "step": 103985 + }, + { + "epoch": 0.10473983246058532, + "grad_norm": 16.04930805860322, + "learning_rate": 4.963236956985671e-05, + "loss": 2.4662, + "mean_token_accuracy": 0.4137930989265442, + "step": 103990 + }, + { + "epoch": 0.1047448685136895, + "grad_norm": 11.466228927049901, + "learning_rate": 4.9632302106575056e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.41724138259887694, + "step": 103995 + }, + { + "epoch": 0.10474990456679367, + "grad_norm": 9.816737589081203, + "learning_rate": 4.963223463715492e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.4517241358757019, + "step": 104000 + }, + { + "epoch": 0.10475494061989785, + "grad_norm": 14.948246793100575, + "learning_rate": 4.963216716159634e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.4517241358757019, + "step": 104005 + }, + { + "epoch": 0.10475997667300202, + "grad_norm": 13.051587827248236, + "learning_rate": 4.963209967989932e-05, + "loss": 2.55, + "mean_token_accuracy": 0.41161524057388305, + "step": 104010 + }, + { + "epoch": 0.1047650127261062, + "grad_norm": 12.039451179042391, + "learning_rate": 4.9632032192063876e-05, + "loss": 2.2666, + "mean_token_accuracy": 0.44355716109275817, + "step": 104015 + }, + { + "epoch": 0.10477004877921037, + "grad_norm": 12.13252582412816, + "learning_rate": 4.963196469809002e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.4034482717514038, + "step": 104020 + }, + { + "epoch": 0.10477508483231454, + "grad_norm": 9.56408220808534, + "learning_rate": 4.96318971979778e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.40000000298023225, + "step": 104025 + }, + { + "epoch": 0.10478012088541871, + "grad_norm": 11.037321589730412, + "learning_rate": 4.963182969172721e-05, + "loss": 2.5299, + "mean_token_accuracy": 0.3862069010734558, + "step": 104030 + }, + { + "epoch": 0.10478515693852289, + "grad_norm": 12.726437126198556, + "learning_rate": 4.9631762179338265e-05, + "loss": 2.981, + "mean_token_accuracy": 0.36896551251411436, + "step": 104035 + }, + { + "epoch": 0.10479019299162706, + "grad_norm": 9.330428238882309, + "learning_rate": 4.9631694660811e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.4, + "step": 104040 + }, + { + "epoch": 0.10479522904473124, + "grad_norm": 12.35139380693233, + "learning_rate": 4.963162713614542e-05, + "loss": 2.1832, + "mean_token_accuracy": 0.4551724076271057, + "step": 104045 + }, + { + "epoch": 0.10480026509783541, + "grad_norm": 11.979356145690286, + "learning_rate": 4.963155960534156e-05, + "loss": 2.5356, + "mean_token_accuracy": 0.44827585220336913, + "step": 104050 + }, + { + "epoch": 0.10480530115093957, + "grad_norm": 11.626126946679923, + "learning_rate": 4.963149206839943e-05, + "loss": 2.4335, + "mean_token_accuracy": 0.4310344815254211, + "step": 104055 + }, + { + "epoch": 0.10481033720404374, + "grad_norm": 10.55813657532861, + "learning_rate": 4.9631424525319046e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.4275862127542496, + "step": 104060 + }, + { + "epoch": 0.10481537325714792, + "grad_norm": 9.846870308188988, + "learning_rate": 4.963135697610042e-05, + "loss": 2.5082, + "mean_token_accuracy": 0.4482758641242981, + "step": 104065 + }, + { + "epoch": 0.10482040931025209, + "grad_norm": 11.495942412233441, + "learning_rate": 4.963128942074358e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.38620689511299133, + "step": 104070 + }, + { + "epoch": 0.10482544536335626, + "grad_norm": 9.321278029663754, + "learning_rate": 4.9631221859248556e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.5034482717514038, + "step": 104075 + }, + { + "epoch": 0.10483048141646044, + "grad_norm": 12.654234140705524, + "learning_rate": 4.9631154291615345e-05, + "loss": 2.2259, + "mean_token_accuracy": 0.4918330192565918, + "step": 104080 + }, + { + "epoch": 0.10483551746956461, + "grad_norm": 8.676855807659111, + "learning_rate": 4.9631086717843985e-05, + "loss": 2.2767, + "mean_token_accuracy": 0.42413793206214906, + "step": 104085 + }, + { + "epoch": 0.10484055352266879, + "grad_norm": 10.062172691741328, + "learning_rate": 4.963101913793448e-05, + "loss": 2.3631, + "mean_token_accuracy": 0.4379310250282288, + "step": 104090 + }, + { + "epoch": 0.10484558957577296, + "grad_norm": 19.394394574490114, + "learning_rate": 4.963095155188685e-05, + "loss": 2.7214, + "mean_token_accuracy": 0.3482758641242981, + "step": 104095 + }, + { + "epoch": 0.10485062562887713, + "grad_norm": 11.064985386540059, + "learning_rate": 4.963088395970113e-05, + "loss": 2.1542, + "mean_token_accuracy": 0.47586206197738645, + "step": 104100 + }, + { + "epoch": 0.10485566168198131, + "grad_norm": 12.81755036922968, + "learning_rate": 4.963081636137732e-05, + "loss": 2.751, + "mean_token_accuracy": 0.3517241358757019, + "step": 104105 + }, + { + "epoch": 0.10486069773508548, + "grad_norm": 9.06593498323187, + "learning_rate": 4.963074875691545e-05, + "loss": 2.3601, + "mean_token_accuracy": 0.41379310488700866, + "step": 104110 + }, + { + "epoch": 0.10486573378818966, + "grad_norm": 12.581203768010441, + "learning_rate": 4.963068114631553e-05, + "loss": 2.8013, + "mean_token_accuracy": 0.39655172228813174, + "step": 104115 + }, + { + "epoch": 0.10487076984129383, + "grad_norm": 12.819218417071824, + "learning_rate": 4.963061352957758e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4119782209396362, + "step": 104120 + }, + { + "epoch": 0.10487580589439799, + "grad_norm": 10.9190985692, + "learning_rate": 4.963054590670162e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.441379314661026, + "step": 104125 + }, + { + "epoch": 0.10488084194750216, + "grad_norm": 11.534833327034596, + "learning_rate": 4.9630478277687675e-05, + "loss": 2.7484, + "mean_token_accuracy": 0.3999999940395355, + "step": 104130 + }, + { + "epoch": 0.10488587800060634, + "grad_norm": 11.499511754363747, + "learning_rate": 4.963041064253576e-05, + "loss": 2.9454, + "mean_token_accuracy": 0.36896551251411436, + "step": 104135 + }, + { + "epoch": 0.10489091405371051, + "grad_norm": 11.366043390532214, + "learning_rate": 4.963034300124589e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.4620689630508423, + "step": 104140 + }, + { + "epoch": 0.10489595010681468, + "grad_norm": 13.412084507353034, + "learning_rate": 4.963027535381809e-05, + "loss": 2.8307, + "mean_token_accuracy": 0.36206896007061007, + "step": 104145 + }, + { + "epoch": 0.10490098615991886, + "grad_norm": 10.956585587672876, + "learning_rate": 4.963020770025238e-05, + "loss": 2.1864, + "mean_token_accuracy": 0.4068965554237366, + "step": 104150 + }, + { + "epoch": 0.10490602221302303, + "grad_norm": 8.756257263100425, + "learning_rate": 4.963014004054876e-05, + "loss": 2.5224, + "mean_token_accuracy": 0.482758617401123, + "step": 104155 + }, + { + "epoch": 0.1049110582661272, + "grad_norm": 9.666583919047543, + "learning_rate": 4.963007237470728e-05, + "loss": 2.5129, + "mean_token_accuracy": 0.42413793206214906, + "step": 104160 + }, + { + "epoch": 0.10491609431923138, + "grad_norm": 10.450762512692284, + "learning_rate": 4.963000470272793e-05, + "loss": 2.6904, + "mean_token_accuracy": 0.4172413766384125, + "step": 104165 + }, + { + "epoch": 0.10492113037233555, + "grad_norm": 10.55127333999612, + "learning_rate": 4.9629937024610744e-05, + "loss": 2.4627, + "mean_token_accuracy": 0.3965517282485962, + "step": 104170 + }, + { + "epoch": 0.10492616642543973, + "grad_norm": 10.397115012695442, + "learning_rate": 4.962986934035575e-05, + "loss": 2.2181, + "mean_token_accuracy": 0.40344828367233276, + "step": 104175 + }, + { + "epoch": 0.1049312024785439, + "grad_norm": 10.206228920985115, + "learning_rate": 4.962980164996293e-05, + "loss": 2.2698, + "mean_token_accuracy": 0.44482758045196535, + "step": 104180 + }, + { + "epoch": 0.10493623853164807, + "grad_norm": 11.600334409972222, + "learning_rate": 4.962973395343234e-05, + "loss": 2.8156, + "mean_token_accuracy": 0.41379311084747317, + "step": 104185 + }, + { + "epoch": 0.10494127458475225, + "grad_norm": 10.21850808763546, + "learning_rate": 4.962966625076399e-05, + "loss": 2.5689, + "mean_token_accuracy": 0.42413793206214906, + "step": 104190 + }, + { + "epoch": 0.10494631063785641, + "grad_norm": 10.3588736542016, + "learning_rate": 4.962959854195789e-05, + "loss": 2.3702, + "mean_token_accuracy": 0.41724138259887694, + "step": 104195 + }, + { + "epoch": 0.10495134669096058, + "grad_norm": 10.060905824278123, + "learning_rate": 4.962953082701407e-05, + "loss": 2.7718, + "mean_token_accuracy": 0.3655172407627106, + "step": 104200 + }, + { + "epoch": 0.10495638274406476, + "grad_norm": 11.411455235343968, + "learning_rate": 4.962946310593253e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.4620689690113068, + "step": 104205 + }, + { + "epoch": 0.10496141879716893, + "grad_norm": 10.467943632835208, + "learning_rate": 4.96293953787133e-05, + "loss": 2.4553, + "mean_token_accuracy": 0.4000000059604645, + "step": 104210 + }, + { + "epoch": 0.1049664548502731, + "grad_norm": 9.863013480701674, + "learning_rate": 4.9629327645356405e-05, + "loss": 2.3218, + "mean_token_accuracy": 0.4344827592372894, + "step": 104215 + }, + { + "epoch": 0.10497149090337728, + "grad_norm": 9.570439695297525, + "learning_rate": 4.962925990586186e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.43448275327682495, + "step": 104220 + }, + { + "epoch": 0.10497652695648145, + "grad_norm": 10.416331869635563, + "learning_rate": 4.962919216022968e-05, + "loss": 2.6841, + "mean_token_accuracy": 0.3551724195480347, + "step": 104225 + }, + { + "epoch": 0.10498156300958562, + "grad_norm": 10.639930018155725, + "learning_rate": 4.962912440845989e-05, + "loss": 2.0856, + "mean_token_accuracy": 0.4704433500766754, + "step": 104230 + }, + { + "epoch": 0.1049865990626898, + "grad_norm": 9.45027781800723, + "learning_rate": 4.9629056650552497e-05, + "loss": 2.4314, + "mean_token_accuracy": 0.4068965494632721, + "step": 104235 + }, + { + "epoch": 0.10499163511579397, + "grad_norm": 15.264633429674324, + "learning_rate": 4.962898888650753e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.3758620649576187, + "step": 104240 + }, + { + "epoch": 0.10499667116889815, + "grad_norm": 11.310133921733465, + "learning_rate": 4.962892111632502e-05, + "loss": 2.6282, + "mean_token_accuracy": 0.39842710494995115, + "step": 104245 + }, + { + "epoch": 0.10500170722200232, + "grad_norm": 9.72143240421472, + "learning_rate": 4.962885334000495e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4123411953449249, + "step": 104250 + }, + { + "epoch": 0.10500674327510649, + "grad_norm": 9.02671059113164, + "learning_rate": 4.9628785557547376e-05, + "loss": 2.5854, + "mean_token_accuracy": 0.4551724076271057, + "step": 104255 + }, + { + "epoch": 0.10501177932821067, + "grad_norm": 10.936300577015556, + "learning_rate": 4.9628717768952286e-05, + "loss": 2.553, + "mean_token_accuracy": 0.39086509943008424, + "step": 104260 + }, + { + "epoch": 0.10501681538131483, + "grad_norm": 13.13127863390209, + "learning_rate": 4.9628649974219723e-05, + "loss": 2.5706, + "mean_token_accuracy": 0.35862069129943847, + "step": 104265 + }, + { + "epoch": 0.105021851434419, + "grad_norm": 12.216592917636945, + "learning_rate": 4.96285821733497e-05, + "loss": 2.2482, + "mean_token_accuracy": 0.4344827592372894, + "step": 104270 + }, + { + "epoch": 0.10502688748752317, + "grad_norm": 10.293780999079074, + "learning_rate": 4.962851436634222e-05, + "loss": 2.1938, + "mean_token_accuracy": 0.441379314661026, + "step": 104275 + }, + { + "epoch": 0.10503192354062735, + "grad_norm": 14.330754022928506, + "learning_rate": 4.962844655319732e-05, + "loss": 2.614, + "mean_token_accuracy": 0.4034482717514038, + "step": 104280 + }, + { + "epoch": 0.10503695959373152, + "grad_norm": 9.812876636797895, + "learning_rate": 4.962837873391502e-05, + "loss": 1.9529, + "mean_token_accuracy": 0.520992124080658, + "step": 104285 + }, + { + "epoch": 0.1050419956468357, + "grad_norm": 17.135634112779773, + "learning_rate": 4.9628310908495326e-05, + "loss": 2.7436, + "mean_token_accuracy": 0.4137930989265442, + "step": 104290 + }, + { + "epoch": 0.10504703169993987, + "grad_norm": 10.385988512392446, + "learning_rate": 4.9628243076938264e-05, + "loss": 2.5008, + "mean_token_accuracy": 0.4482758641242981, + "step": 104295 + }, + { + "epoch": 0.10505206775304404, + "grad_norm": 9.807478457153017, + "learning_rate": 4.9628175239243845e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.42068966031074523, + "step": 104300 + }, + { + "epoch": 0.10505710380614822, + "grad_norm": 9.084020543276518, + "learning_rate": 4.9628107395412096e-05, + "loss": 2.3274, + "mean_token_accuracy": 0.3896551728248596, + "step": 104305 + }, + { + "epoch": 0.10506213985925239, + "grad_norm": 11.68858665292888, + "learning_rate": 4.962803954544304e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.43103448748588563, + "step": 104310 + }, + { + "epoch": 0.10506717591235656, + "grad_norm": 10.238277599128011, + "learning_rate": 4.9627971689336684e-05, + "loss": 2.0116, + "mean_token_accuracy": 0.45517241954803467, + "step": 104315 + }, + { + "epoch": 0.10507221196546074, + "grad_norm": 8.623054930425678, + "learning_rate": 4.9627903827093054e-05, + "loss": 1.9555, + "mean_token_accuracy": 0.4936479091644287, + "step": 104320 + }, + { + "epoch": 0.10507724801856491, + "grad_norm": 10.719437973646045, + "learning_rate": 4.962783595871217e-05, + "loss": 2.794, + "mean_token_accuracy": 0.3896551728248596, + "step": 104325 + }, + { + "epoch": 0.10508228407166909, + "grad_norm": 19.229301521538122, + "learning_rate": 4.9627768084194044e-05, + "loss": 2.4967, + "mean_token_accuracy": 0.4084089517593384, + "step": 104330 + }, + { + "epoch": 0.10508732012477325, + "grad_norm": 9.65723479161499, + "learning_rate": 4.9627700203538696e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.4482758641242981, + "step": 104335 + }, + { + "epoch": 0.10509235617787742, + "grad_norm": 8.805551982575079, + "learning_rate": 4.9627632316746154e-05, + "loss": 2.0863, + "mean_token_accuracy": 0.5068965494632721, + "step": 104340 + }, + { + "epoch": 0.10509739223098159, + "grad_norm": 9.666568512834388, + "learning_rate": 4.962756442381643e-05, + "loss": 2.1471, + "mean_token_accuracy": 0.493103438615799, + "step": 104345 + }, + { + "epoch": 0.10510242828408577, + "grad_norm": 12.872396051499596, + "learning_rate": 4.962749652474954e-05, + "loss": 2.8309, + "mean_token_accuracy": 0.36896551847457887, + "step": 104350 + }, + { + "epoch": 0.10510746433718994, + "grad_norm": 11.63484894566667, + "learning_rate": 4.9627428619545514e-05, + "loss": 2.44, + "mean_token_accuracy": 0.37586206793785093, + "step": 104355 + }, + { + "epoch": 0.10511250039029411, + "grad_norm": 11.430611182435214, + "learning_rate": 4.962736070820435e-05, + "loss": 2.0939, + "mean_token_accuracy": 0.47931034564971925, + "step": 104360 + }, + { + "epoch": 0.10511753644339829, + "grad_norm": 10.623681452284707, + "learning_rate": 4.9627292790726096e-05, + "loss": 1.9904, + "mean_token_accuracy": 0.44482759237289426, + "step": 104365 + }, + { + "epoch": 0.10512257249650246, + "grad_norm": 10.061719151977647, + "learning_rate": 4.962722486711074e-05, + "loss": 2.6797, + "mean_token_accuracy": 0.37586207389831544, + "step": 104370 + }, + { + "epoch": 0.10512760854960664, + "grad_norm": 12.94275104905402, + "learning_rate": 4.962715693735833e-05, + "loss": 2.2372, + "mean_token_accuracy": 0.44482758045196535, + "step": 104375 + }, + { + "epoch": 0.10513264460271081, + "grad_norm": 11.861088513103544, + "learning_rate": 4.9627089001468865e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.3999999940395355, + "step": 104380 + }, + { + "epoch": 0.10513768065581498, + "grad_norm": 8.238414969436992, + "learning_rate": 4.9627021059442355e-05, + "loss": 1.7623, + "mean_token_accuracy": 0.5424876868724823, + "step": 104385 + }, + { + "epoch": 0.10514271670891916, + "grad_norm": 10.089772643188878, + "learning_rate": 4.9626953111278854e-05, + "loss": 2.5224, + "mean_token_accuracy": 0.4310344815254211, + "step": 104390 + }, + { + "epoch": 0.10514775276202333, + "grad_norm": 12.43101837941754, + "learning_rate": 4.962688515697834e-05, + "loss": 2.098, + "mean_token_accuracy": 0.4551724135875702, + "step": 104395 + }, + { + "epoch": 0.1051527888151275, + "grad_norm": 11.939319118217934, + "learning_rate": 4.962681719654087e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.4551724135875702, + "step": 104400 + }, + { + "epoch": 0.10515782486823166, + "grad_norm": 12.213084854459414, + "learning_rate": 4.962674922996644e-05, + "loss": 2.4956, + "mean_token_accuracy": 0.4379310429096222, + "step": 104405 + }, + { + "epoch": 0.10516286092133584, + "grad_norm": 10.405037374973546, + "learning_rate": 4.962668125725507e-05, + "loss": 2.2493, + "mean_token_accuracy": 0.4344827592372894, + "step": 104410 + }, + { + "epoch": 0.10516789697444001, + "grad_norm": 11.700716407650718, + "learning_rate": 4.962661327840678e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4206896543502808, + "step": 104415 + }, + { + "epoch": 0.10517293302754419, + "grad_norm": 11.285773180230967, + "learning_rate": 4.96265452934216e-05, + "loss": 2.5454, + "mean_token_accuracy": 0.38275861740112305, + "step": 104420 + }, + { + "epoch": 0.10517796908064836, + "grad_norm": 10.643787686356456, + "learning_rate": 4.9626477302299534e-05, + "loss": 2.1005, + "mean_token_accuracy": 0.441379314661026, + "step": 104425 + }, + { + "epoch": 0.10518300513375253, + "grad_norm": 11.803988382830147, + "learning_rate": 4.962640930504061e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4533575356006622, + "step": 104430 + }, + { + "epoch": 0.1051880411868567, + "grad_norm": 9.346115168494713, + "learning_rate": 4.962634130164483e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.4290986001491547, + "step": 104435 + }, + { + "epoch": 0.10519307723996088, + "grad_norm": 11.257395626140907, + "learning_rate": 4.962627329211225e-05, + "loss": 2.2176, + "mean_token_accuracy": 0.42413792610168455, + "step": 104440 + }, + { + "epoch": 0.10519811329306505, + "grad_norm": 10.824926258074312, + "learning_rate": 4.962620527644285e-05, + "loss": 2.5731, + "mean_token_accuracy": 0.4068965494632721, + "step": 104445 + }, + { + "epoch": 0.10520314934616923, + "grad_norm": 11.269678800878701, + "learning_rate": 4.9626137254636675e-05, + "loss": 2.98, + "mean_token_accuracy": 0.3206896483898163, + "step": 104450 + }, + { + "epoch": 0.1052081853992734, + "grad_norm": 9.341629381584324, + "learning_rate": 4.962606922669372e-05, + "loss": 2.2644, + "mean_token_accuracy": 0.4745311439037323, + "step": 104455 + }, + { + "epoch": 0.10521322145237758, + "grad_norm": 11.700680338270931, + "learning_rate": 4.962600119261403e-05, + "loss": 2.3472, + "mean_token_accuracy": 0.4310344815254211, + "step": 104460 + }, + { + "epoch": 0.10521825750548175, + "grad_norm": 11.168420741253183, + "learning_rate": 4.9625933152397604e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.45517241954803467, + "step": 104465 + }, + { + "epoch": 0.10522329355858592, + "grad_norm": 11.073665060641494, + "learning_rate": 4.9625865106044466e-05, + "loss": 2.235, + "mean_token_accuracy": 0.4310344696044922, + "step": 104470 + }, + { + "epoch": 0.10522832961169008, + "grad_norm": 11.14349013105833, + "learning_rate": 4.962579705355464e-05, + "loss": 2.4706, + "mean_token_accuracy": 0.3896551728248596, + "step": 104475 + }, + { + "epoch": 0.10523336566479426, + "grad_norm": 15.492503794684588, + "learning_rate": 4.962572899492814e-05, + "loss": 2.5848, + "mean_token_accuracy": 0.4103448331356049, + "step": 104480 + }, + { + "epoch": 0.10523840171789843, + "grad_norm": 10.376270434511785, + "learning_rate": 4.962566093016499e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.36551723778247835, + "step": 104485 + }, + { + "epoch": 0.1052434377710026, + "grad_norm": 10.411506146859866, + "learning_rate": 4.96255928592652e-05, + "loss": 2.5499, + "mean_token_accuracy": 0.41724138259887694, + "step": 104490 + }, + { + "epoch": 0.10524847382410678, + "grad_norm": 10.64188409034023, + "learning_rate": 4.96255247822288e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.43641863465309144, + "step": 104495 + }, + { + "epoch": 0.10525350987721095, + "grad_norm": 10.7727777010905, + "learning_rate": 4.96254566990558e-05, + "loss": 2.5249, + "mean_token_accuracy": 0.41379311084747317, + "step": 104500 + }, + { + "epoch": 0.10525854593031513, + "grad_norm": 9.46284445099597, + "learning_rate": 4.962538860974622e-05, + "loss": 2.1961, + "mean_token_accuracy": 0.4689655125141144, + "step": 104505 + }, + { + "epoch": 0.1052635819834193, + "grad_norm": 10.823538629885405, + "learning_rate": 4.9625320514300085e-05, + "loss": 2.2574, + "mean_token_accuracy": 0.41379310488700866, + "step": 104510 + }, + { + "epoch": 0.10526861803652347, + "grad_norm": 13.48198615315915, + "learning_rate": 4.962525241271741e-05, + "loss": 2.9341, + "mean_token_accuracy": 0.3931034505367279, + "step": 104515 + }, + { + "epoch": 0.10527365408962765, + "grad_norm": 10.922012810188601, + "learning_rate": 4.962518430499822e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.42613430619239806, + "step": 104520 + }, + { + "epoch": 0.10527869014273182, + "grad_norm": 9.469779872193012, + "learning_rate": 4.962511619114251e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.41724138259887694, + "step": 104525 + }, + { + "epoch": 0.105283726195836, + "grad_norm": 10.473719154934223, + "learning_rate": 4.9625048071150325e-05, + "loss": 3.2372, + "mean_token_accuracy": 0.3620689660310745, + "step": 104530 + }, + { + "epoch": 0.10528876224894017, + "grad_norm": 9.587430195189388, + "learning_rate": 4.9624979945021685e-05, + "loss": 2.6169, + "mean_token_accuracy": 0.41379310488700866, + "step": 104535 + }, + { + "epoch": 0.10529379830204434, + "grad_norm": 11.77243450265687, + "learning_rate": 4.962491181275658e-05, + "loss": 2.1481, + "mean_token_accuracy": 0.44827585816383364, + "step": 104540 + }, + { + "epoch": 0.1052988343551485, + "grad_norm": 13.20106916984188, + "learning_rate": 4.9624843674355066e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.41379310488700866, + "step": 104545 + }, + { + "epoch": 0.10530387040825268, + "grad_norm": 10.043084635997937, + "learning_rate": 4.9624775529817134e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.43793103098869324, + "step": 104550 + }, + { + "epoch": 0.10530890646135685, + "grad_norm": 18.76786820225853, + "learning_rate": 4.962470737914282e-05, + "loss": 2.7603, + "mean_token_accuracy": 0.358620685338974, + "step": 104555 + }, + { + "epoch": 0.10531394251446102, + "grad_norm": 12.54169573967831, + "learning_rate": 4.962463922233213e-05, + "loss": 2.7367, + "mean_token_accuracy": 0.4517241418361664, + "step": 104560 + }, + { + "epoch": 0.1053189785675652, + "grad_norm": 10.909113543475502, + "learning_rate": 4.9624571059385094e-05, + "loss": 2.6188, + "mean_token_accuracy": 0.3931034505367279, + "step": 104565 + }, + { + "epoch": 0.10532401462066937, + "grad_norm": 9.96207734278988, + "learning_rate": 4.962450289030172e-05, + "loss": 2.3455, + "mean_token_accuracy": 0.47241379618644713, + "step": 104570 + }, + { + "epoch": 0.10532905067377354, + "grad_norm": 10.90432089142969, + "learning_rate": 4.962443471508204e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.3517241358757019, + "step": 104575 + }, + { + "epoch": 0.10533408672687772, + "grad_norm": 10.54067863793279, + "learning_rate": 4.962436653372606e-05, + "loss": 2.317, + "mean_token_accuracy": 0.4034482777118683, + "step": 104580 + }, + { + "epoch": 0.10533912277998189, + "grad_norm": 9.9133158633996, + "learning_rate": 4.9624298346233804e-05, + "loss": 2.5115, + "mean_token_accuracy": 0.3827586144208908, + "step": 104585 + }, + { + "epoch": 0.10534415883308607, + "grad_norm": 12.40972735281351, + "learning_rate": 4.962423015260529e-05, + "loss": 2.8339, + "mean_token_accuracy": 0.42758620977401735, + "step": 104590 + }, + { + "epoch": 0.10534919488619024, + "grad_norm": 11.987546463643906, + "learning_rate": 4.962416195284054e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.43684210777282717, + "step": 104595 + }, + { + "epoch": 0.10535423093929441, + "grad_norm": 9.83523822454958, + "learning_rate": 4.962409374693957e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.4172413766384125, + "step": 104600 + }, + { + "epoch": 0.10535926699239859, + "grad_norm": 10.858031263847476, + "learning_rate": 4.9624025534902404e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.3909255862236023, + "step": 104605 + }, + { + "epoch": 0.10536430304550276, + "grad_norm": 11.622443928297898, + "learning_rate": 4.962395731672905e-05, + "loss": 2.3513, + "mean_token_accuracy": 0.4068965494632721, + "step": 104610 + }, + { + "epoch": 0.10536933909860692, + "grad_norm": 12.848475172146022, + "learning_rate": 4.962388909241954e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.4344827651977539, + "step": 104615 + }, + { + "epoch": 0.1053743751517111, + "grad_norm": 10.36120287179352, + "learning_rate": 4.962382086197389e-05, + "loss": 2.7308, + "mean_token_accuracy": 0.3551724135875702, + "step": 104620 + }, + { + "epoch": 0.10537941120481527, + "grad_norm": 10.985492743745747, + "learning_rate": 4.962375262539211e-05, + "loss": 2.4592, + "mean_token_accuracy": 0.3999999940395355, + "step": 104625 + }, + { + "epoch": 0.10538444725791944, + "grad_norm": 13.275102379195742, + "learning_rate": 4.962368438267423e-05, + "loss": 2.6649, + "mean_token_accuracy": 0.4, + "step": 104630 + }, + { + "epoch": 0.10538948331102362, + "grad_norm": 10.97049861843048, + "learning_rate": 4.9623616133820256e-05, + "loss": 2.3995, + "mean_token_accuracy": 0.41379310488700866, + "step": 104635 + }, + { + "epoch": 0.10539451936412779, + "grad_norm": 11.661108321111179, + "learning_rate": 4.962354787883022e-05, + "loss": 2.3673, + "mean_token_accuracy": 0.44827585816383364, + "step": 104640 + }, + { + "epoch": 0.10539955541723196, + "grad_norm": 12.174696233824838, + "learning_rate": 4.962347961770413e-05, + "loss": 2.665, + "mean_token_accuracy": 0.4413793087005615, + "step": 104645 + }, + { + "epoch": 0.10540459147033614, + "grad_norm": 10.869964958454787, + "learning_rate": 4.962341135044202e-05, + "loss": 2.2938, + "mean_token_accuracy": 0.4482758641242981, + "step": 104650 + }, + { + "epoch": 0.10540962752344031, + "grad_norm": 8.462168269319575, + "learning_rate": 4.962334307704389e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.47931034564971925, + "step": 104655 + }, + { + "epoch": 0.10541466357654448, + "grad_norm": 11.181526119449435, + "learning_rate": 4.962327479750977e-05, + "loss": 2.6877, + "mean_token_accuracy": 0.39655172228813174, + "step": 104660 + }, + { + "epoch": 0.10541969962964866, + "grad_norm": 11.300944835898926, + "learning_rate": 4.962320651183968e-05, + "loss": 2.2873, + "mean_token_accuracy": 0.4359346628189087, + "step": 104665 + }, + { + "epoch": 0.10542473568275283, + "grad_norm": 10.561806510308658, + "learning_rate": 4.9623138220033633e-05, + "loss": 3.2234, + "mean_token_accuracy": 0.37737447023391724, + "step": 104670 + }, + { + "epoch": 0.105429771735857, + "grad_norm": 10.238636926645285, + "learning_rate": 4.962306992209166e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.4172413766384125, + "step": 104675 + }, + { + "epoch": 0.10543480778896118, + "grad_norm": 11.43516663557122, + "learning_rate": 4.9623001618013756e-05, + "loss": 2.9079, + "mean_token_accuracy": 0.3689655065536499, + "step": 104680 + }, + { + "epoch": 0.10543984384206534, + "grad_norm": 10.749287119223046, + "learning_rate": 4.962293330779997e-05, + "loss": 2.2969, + "mean_token_accuracy": 0.4068965554237366, + "step": 104685 + }, + { + "epoch": 0.10544487989516951, + "grad_norm": 10.461361133467605, + "learning_rate": 4.962286499145029e-05, + "loss": 2.2206, + "mean_token_accuracy": 0.4230490028858185, + "step": 104690 + }, + { + "epoch": 0.10544991594827369, + "grad_norm": 10.53091488839773, + "learning_rate": 4.962279666896476e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.43297035694122316, + "step": 104695 + }, + { + "epoch": 0.10545495200137786, + "grad_norm": 12.379212629863284, + "learning_rate": 4.9622728340343386e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4344827592372894, + "step": 104700 + }, + { + "epoch": 0.10545998805448203, + "grad_norm": 8.729504539453158, + "learning_rate": 4.9622660005586195e-05, + "loss": 2.2001, + "mean_token_accuracy": 0.47241378426551817, + "step": 104705 + }, + { + "epoch": 0.10546502410758621, + "grad_norm": 10.32949120703033, + "learning_rate": 4.9622591664693205e-05, + "loss": 2.6687, + "mean_token_accuracy": 0.3448275804519653, + "step": 104710 + }, + { + "epoch": 0.10547006016069038, + "grad_norm": 9.460294170557734, + "learning_rate": 4.9622523317664424e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.4241379380226135, + "step": 104715 + }, + { + "epoch": 0.10547509621379456, + "grad_norm": 13.97331300135685, + "learning_rate": 4.9622454964499885e-05, + "loss": 2.9182, + "mean_token_accuracy": 0.36896551847457887, + "step": 104720 + }, + { + "epoch": 0.10548013226689873, + "grad_norm": 8.801153722767756, + "learning_rate": 4.96223866051996e-05, + "loss": 2.092, + "mean_token_accuracy": 0.482758617401123, + "step": 104725 + }, + { + "epoch": 0.1054851683200029, + "grad_norm": 12.10756195488822, + "learning_rate": 4.962231823976359e-05, + "loss": 2.6237, + "mean_token_accuracy": 0.4, + "step": 104730 + }, + { + "epoch": 0.10549020437310708, + "grad_norm": 11.07534289901473, + "learning_rate": 4.9622249868191866e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.44337567687034607, + "step": 104735 + }, + { + "epoch": 0.10549524042621125, + "grad_norm": 11.511118031889584, + "learning_rate": 4.962218149048445e-05, + "loss": 2.7609, + "mean_token_accuracy": 0.4, + "step": 104740 + }, + { + "epoch": 0.10550027647931542, + "grad_norm": 9.695354287989908, + "learning_rate": 4.9622113106641376e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.4052631616592407, + "step": 104745 + }, + { + "epoch": 0.1055053125324196, + "grad_norm": 12.345215988806293, + "learning_rate": 4.9622044716662644e-05, + "loss": 2.6129, + "mean_token_accuracy": 0.42758620381355283, + "step": 104750 + }, + { + "epoch": 0.10551034858552376, + "grad_norm": 11.269323579776556, + "learning_rate": 4.9621976320548283e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4448275864124298, + "step": 104755 + }, + { + "epoch": 0.10551538463862793, + "grad_norm": 9.817481281374286, + "learning_rate": 4.962190791829831e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.42413792610168455, + "step": 104760 + }, + { + "epoch": 0.1055204206917321, + "grad_norm": 10.982604451325106, + "learning_rate": 4.962183950991274e-05, + "loss": 2.4533, + "mean_token_accuracy": 0.44137930274009707, + "step": 104765 + }, + { + "epoch": 0.10552545674483628, + "grad_norm": 13.306689968402468, + "learning_rate": 4.9621771095391604e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.41379310488700866, + "step": 104770 + }, + { + "epoch": 0.10553049279794045, + "grad_norm": 13.141821337319119, + "learning_rate": 4.9621702674734904e-05, + "loss": 2.4011, + "mean_token_accuracy": 0.4398669183254242, + "step": 104775 + }, + { + "epoch": 0.10553552885104463, + "grad_norm": 10.786315989336098, + "learning_rate": 4.962163424794267e-05, + "loss": 2.7761, + "mean_token_accuracy": 0.3896551728248596, + "step": 104780 + }, + { + "epoch": 0.1055405649041488, + "grad_norm": 10.6102419213239, + "learning_rate": 4.962156581501492e-05, + "loss": 2.183, + "mean_token_accuracy": 0.46551724076271056, + "step": 104785 + }, + { + "epoch": 0.10554560095725297, + "grad_norm": 13.516152712457925, + "learning_rate": 4.962149737595166e-05, + "loss": 2.54, + "mean_token_accuracy": 0.4413793087005615, + "step": 104790 + }, + { + "epoch": 0.10555063701035715, + "grad_norm": 10.823859038659698, + "learning_rate": 4.9621428930752934e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.4517241418361664, + "step": 104795 + }, + { + "epoch": 0.10555567306346132, + "grad_norm": 8.051955577275516, + "learning_rate": 4.962136047941875e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.46896551847457885, + "step": 104800 + }, + { + "epoch": 0.1055607091165655, + "grad_norm": 11.083787475872414, + "learning_rate": 4.9621292021949114e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.44827585816383364, + "step": 104805 + }, + { + "epoch": 0.10556574516966967, + "grad_norm": 11.089800852299916, + "learning_rate": 4.9621223558344056e-05, + "loss": 2.1278, + "mean_token_accuracy": 0.44827585220336913, + "step": 104810 + }, + { + "epoch": 0.10557078122277384, + "grad_norm": 16.303213202834048, + "learning_rate": 4.962115508860359e-05, + "loss": 2.2118, + "mean_token_accuracy": 0.43103448748588563, + "step": 104815 + }, + { + "epoch": 0.10557581727587802, + "grad_norm": 9.635345101044063, + "learning_rate": 4.9621086612727744e-05, + "loss": 1.879, + "mean_token_accuracy": 0.4862068951129913, + "step": 104820 + }, + { + "epoch": 0.10558085332898218, + "grad_norm": 11.405114978774881, + "learning_rate": 4.962101813071654e-05, + "loss": 2.2892, + "mean_token_accuracy": 0.4482758641242981, + "step": 104825 + }, + { + "epoch": 0.10558588938208635, + "grad_norm": 39.30089286162542, + "learning_rate": 4.962094964256997e-05, + "loss": 2.6542, + "mean_token_accuracy": 0.45722927451133727, + "step": 104830 + }, + { + "epoch": 0.10559092543519052, + "grad_norm": 13.975043636295133, + "learning_rate": 4.9620881148288085e-05, + "loss": 2.5085, + "mean_token_accuracy": 0.4655172526836395, + "step": 104835 + }, + { + "epoch": 0.1055959614882947, + "grad_norm": 10.352586111100576, + "learning_rate": 4.9620812647870894e-05, + "loss": 2.2482, + "mean_token_accuracy": 0.4572292804718018, + "step": 104840 + }, + { + "epoch": 0.10560099754139887, + "grad_norm": 12.515984210502726, + "learning_rate": 4.9620744141318405e-05, + "loss": 2.329, + "mean_token_accuracy": 0.47241379618644713, + "step": 104845 + }, + { + "epoch": 0.10560603359450305, + "grad_norm": 14.284755088394794, + "learning_rate": 4.9620675628630654e-05, + "loss": 2.4442, + "mean_token_accuracy": 0.4172413766384125, + "step": 104850 + }, + { + "epoch": 0.10561106964760722, + "grad_norm": 9.837977078318639, + "learning_rate": 4.962060710980765e-05, + "loss": 2.0779, + "mean_token_accuracy": 0.49655171632766726, + "step": 104855 + }, + { + "epoch": 0.1056161057007114, + "grad_norm": 9.098912515875515, + "learning_rate": 4.9620538584849404e-05, + "loss": 2.7032, + "mean_token_accuracy": 0.417241370677948, + "step": 104860 + }, + { + "epoch": 0.10562114175381557, + "grad_norm": 9.863230341870077, + "learning_rate": 4.962047005375595e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.42413793206214906, + "step": 104865 + }, + { + "epoch": 0.10562617780691974, + "grad_norm": 9.085318534133192, + "learning_rate": 4.96204015165273e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.3999999940395355, + "step": 104870 + }, + { + "epoch": 0.10563121386002391, + "grad_norm": 9.755402938567999, + "learning_rate": 4.9620332973163476e-05, + "loss": 2.3463, + "mean_token_accuracy": 0.3999999940395355, + "step": 104875 + }, + { + "epoch": 0.10563624991312809, + "grad_norm": 15.077860369430121, + "learning_rate": 4.962026442366449e-05, + "loss": 2.3191, + "mean_token_accuracy": 0.4206896543502808, + "step": 104880 + }, + { + "epoch": 0.10564128596623226, + "grad_norm": 10.259733066517377, + "learning_rate": 4.9620195868030375e-05, + "loss": 2.4412, + "mean_token_accuracy": 0.37931033968925476, + "step": 104885 + }, + { + "epoch": 0.10564632201933644, + "grad_norm": 8.506638040850353, + "learning_rate": 4.962012730626113e-05, + "loss": 2.3846, + "mean_token_accuracy": 0.4724137902259827, + "step": 104890 + }, + { + "epoch": 0.1056513580724406, + "grad_norm": 12.024224121183295, + "learning_rate": 4.962005873835679e-05, + "loss": 2.0721, + "mean_token_accuracy": 0.5004926145076751, + "step": 104895 + }, + { + "epoch": 0.10565639412554477, + "grad_norm": 11.149523757050849, + "learning_rate": 4.9619990164317374e-05, + "loss": 2.253, + "mean_token_accuracy": 0.441379314661026, + "step": 104900 + }, + { + "epoch": 0.10566143017864894, + "grad_norm": 11.416983244837931, + "learning_rate": 4.961992158414289e-05, + "loss": 2.5954, + "mean_token_accuracy": 0.3827586233615875, + "step": 104905 + }, + { + "epoch": 0.10566646623175312, + "grad_norm": 11.021363502121414, + "learning_rate": 4.9619852997833365e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.37586207389831544, + "step": 104910 + }, + { + "epoch": 0.10567150228485729, + "grad_norm": 11.370701414301896, + "learning_rate": 4.961978440538882e-05, + "loss": 2.6969, + "mean_token_accuracy": 0.36376285552978516, + "step": 104915 + }, + { + "epoch": 0.10567653833796146, + "grad_norm": 13.866042022192232, + "learning_rate": 4.9619715806809267e-05, + "loss": 2.3446, + "mean_token_accuracy": 0.44137930274009707, + "step": 104920 + }, + { + "epoch": 0.10568157439106564, + "grad_norm": 17.036802978387495, + "learning_rate": 4.961964720209473e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.39655173420906065, + "step": 104925 + }, + { + "epoch": 0.10568661044416981, + "grad_norm": 13.35022357804328, + "learning_rate": 4.961957859124522e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.4551724135875702, + "step": 104930 + }, + { + "epoch": 0.10569164649727399, + "grad_norm": 10.74752020647704, + "learning_rate": 4.961950997426077e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.4413793087005615, + "step": 104935 + }, + { + "epoch": 0.10569668255037816, + "grad_norm": 14.679087123588063, + "learning_rate": 4.961944135114139e-05, + "loss": 2.1548, + "mean_token_accuracy": 0.4944581389427185, + "step": 104940 + }, + { + "epoch": 0.10570171860348233, + "grad_norm": 10.893804127277205, + "learning_rate": 4.96193727218871e-05, + "loss": 2.6017, + "mean_token_accuracy": 0.37586207389831544, + "step": 104945 + }, + { + "epoch": 0.10570675465658651, + "grad_norm": 9.856905481399101, + "learning_rate": 4.961930408649791e-05, + "loss": 2.4833, + "mean_token_accuracy": 0.3965517282485962, + "step": 104950 + }, + { + "epoch": 0.10571179070969068, + "grad_norm": 12.909340183488235, + "learning_rate": 4.961923544497386e-05, + "loss": 2.7324, + "mean_token_accuracy": 0.3896551728248596, + "step": 104955 + }, + { + "epoch": 0.10571682676279485, + "grad_norm": 11.999709155867427, + "learning_rate": 4.9619166797314954e-05, + "loss": 2.5658, + "mean_token_accuracy": 0.3827586114406586, + "step": 104960 + }, + { + "epoch": 0.10572186281589901, + "grad_norm": 11.45126738377503, + "learning_rate": 4.961909814352122e-05, + "loss": 2.37, + "mean_token_accuracy": 0.42068966031074523, + "step": 104965 + }, + { + "epoch": 0.10572689886900319, + "grad_norm": 13.605116506280586, + "learning_rate": 4.9619029483592665e-05, + "loss": 2.3484, + "mean_token_accuracy": 0.4172413766384125, + "step": 104970 + }, + { + "epoch": 0.10573193492210736, + "grad_norm": 10.673263994220422, + "learning_rate": 4.961896081752932e-05, + "loss": 2.5726, + "mean_token_accuracy": 0.4379310250282288, + "step": 104975 + }, + { + "epoch": 0.10573697097521154, + "grad_norm": 10.224274248464523, + "learning_rate": 4.961889214533119e-05, + "loss": 2.4232, + "mean_token_accuracy": 0.45517241954803467, + "step": 104980 + }, + { + "epoch": 0.10574200702831571, + "grad_norm": 10.10453858373152, + "learning_rate": 4.961882346699831e-05, + "loss": 2.2098, + "mean_token_accuracy": 0.41379310488700866, + "step": 104985 + }, + { + "epoch": 0.10574704308141988, + "grad_norm": 10.847356554116907, + "learning_rate": 4.961875478253069e-05, + "loss": 2.6371, + "mean_token_accuracy": 0.37241379618644715, + "step": 104990 + }, + { + "epoch": 0.10575207913452406, + "grad_norm": 9.706681182326673, + "learning_rate": 4.9618686091928345e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.42758620381355283, + "step": 104995 + }, + { + "epoch": 0.10575711518762823, + "grad_norm": 10.38859414274225, + "learning_rate": 4.9618617395191305e-05, + "loss": 2.5912, + "mean_token_accuracy": 0.4034482777118683, + "step": 105000 + }, + { + "epoch": 0.1057621512407324, + "grad_norm": 11.418707185344394, + "learning_rate": 4.961854869231959e-05, + "loss": 2.5848, + "mean_token_accuracy": 0.42758620381355283, + "step": 105005 + }, + { + "epoch": 0.10576718729383658, + "grad_norm": 10.347604298356547, + "learning_rate": 4.961847998331321e-05, + "loss": 2.1672, + "mean_token_accuracy": 0.510344821214676, + "step": 105010 + }, + { + "epoch": 0.10577222334694075, + "grad_norm": 11.456624173197307, + "learning_rate": 4.9618411268172185e-05, + "loss": 2.742, + "mean_token_accuracy": 0.3517241418361664, + "step": 105015 + }, + { + "epoch": 0.10577725940004493, + "grad_norm": 9.322663367072932, + "learning_rate": 4.9618342546896527e-05, + "loss": 2.2623, + "mean_token_accuracy": 0.4329098641872406, + "step": 105020 + }, + { + "epoch": 0.1057822954531491, + "grad_norm": 11.520760448156823, + "learning_rate": 4.9618273819486275e-05, + "loss": 2.222, + "mean_token_accuracy": 0.43103448748588563, + "step": 105025 + }, + { + "epoch": 0.10578733150625327, + "grad_norm": 12.394725831205681, + "learning_rate": 4.961820508594144e-05, + "loss": 2.8287, + "mean_token_accuracy": 0.4172413766384125, + "step": 105030 + }, + { + "epoch": 0.10579236755935743, + "grad_norm": 17.065761537363592, + "learning_rate": 4.961813634626203e-05, + "loss": 2.4802, + "mean_token_accuracy": 0.4551724135875702, + "step": 105035 + }, + { + "epoch": 0.10579740361246161, + "grad_norm": 10.103761519271226, + "learning_rate": 4.961806760044808e-05, + "loss": 2.9535, + "mean_token_accuracy": 0.3724137932062149, + "step": 105040 + }, + { + "epoch": 0.10580243966556578, + "grad_norm": 10.049954772048478, + "learning_rate": 4.9617998848499594e-05, + "loss": 2.2423, + "mean_token_accuracy": 0.43793103098869324, + "step": 105045 + }, + { + "epoch": 0.10580747571866995, + "grad_norm": 11.120205487737183, + "learning_rate": 4.96179300904166e-05, + "loss": 3.0154, + "mean_token_accuracy": 0.4275862067937851, + "step": 105050 + }, + { + "epoch": 0.10581251177177413, + "grad_norm": 9.19907989531605, + "learning_rate": 4.9617861326199115e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.41379310488700866, + "step": 105055 + }, + { + "epoch": 0.1058175478248783, + "grad_norm": 9.668116411039161, + "learning_rate": 4.9617792555847166e-05, + "loss": 2.4488, + "mean_token_accuracy": 0.3896551728248596, + "step": 105060 + }, + { + "epoch": 0.10582258387798248, + "grad_norm": 12.161778669010873, + "learning_rate": 4.961772377936076e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.41893526911735535, + "step": 105065 + }, + { + "epoch": 0.10582761993108665, + "grad_norm": 9.469732473525996, + "learning_rate": 4.9617654996739914e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4034482777118683, + "step": 105070 + }, + { + "epoch": 0.10583265598419082, + "grad_norm": 11.738179206051273, + "learning_rate": 4.961758620798467e-05, + "loss": 2.0533, + "mean_token_accuracy": 0.4517241358757019, + "step": 105075 + }, + { + "epoch": 0.105837692037295, + "grad_norm": 11.436384264354333, + "learning_rate": 4.961751741309502e-05, + "loss": 2.6769, + "mean_token_accuracy": 0.3862068891525269, + "step": 105080 + }, + { + "epoch": 0.10584272809039917, + "grad_norm": 8.892214731727295, + "learning_rate": 4.9617448612071e-05, + "loss": 2.4675, + "mean_token_accuracy": 0.4241379380226135, + "step": 105085 + }, + { + "epoch": 0.10584776414350335, + "grad_norm": 14.601768913496445, + "learning_rate": 4.9617379804912614e-05, + "loss": 2.4999, + "mean_token_accuracy": 0.41379310488700866, + "step": 105090 + }, + { + "epoch": 0.10585280019660752, + "grad_norm": 14.151238869399718, + "learning_rate": 4.961731099161988e-05, + "loss": 2.9225, + "mean_token_accuracy": 0.37241379022598264, + "step": 105095 + }, + { + "epoch": 0.10585783624971169, + "grad_norm": 12.828700198136676, + "learning_rate": 4.961724217219285e-05, + "loss": 2.5272, + "mean_token_accuracy": 0.4068965494632721, + "step": 105100 + }, + { + "epoch": 0.10586287230281585, + "grad_norm": 10.93421308188576, + "learning_rate": 4.961717334663151e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.4068965554237366, + "step": 105105 + }, + { + "epoch": 0.10586790835592003, + "grad_norm": 11.308857749649773, + "learning_rate": 4.96171045149359e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.48965516686439514, + "step": 105110 + }, + { + "epoch": 0.1058729444090242, + "grad_norm": 11.17492842316766, + "learning_rate": 4.9617035677106014e-05, + "loss": 2.33, + "mean_token_accuracy": 0.4344827592372894, + "step": 105115 + }, + { + "epoch": 0.10587798046212837, + "grad_norm": 10.886734741446126, + "learning_rate": 4.9616966833141895e-05, + "loss": 2.1474, + "mean_token_accuracy": 0.4551724076271057, + "step": 105120 + }, + { + "epoch": 0.10588301651523255, + "grad_norm": 9.34640769941015, + "learning_rate": 4.961689798304355e-05, + "loss": 2.2887, + "mean_token_accuracy": 0.4862068951129913, + "step": 105125 + }, + { + "epoch": 0.10588805256833672, + "grad_norm": 11.885704967343518, + "learning_rate": 4.9616829126811e-05, + "loss": 2.673, + "mean_token_accuracy": 0.33793102502822875, + "step": 105130 + }, + { + "epoch": 0.1058930886214409, + "grad_norm": 9.044865313366774, + "learning_rate": 4.961676026444426e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.4310344815254211, + "step": 105135 + }, + { + "epoch": 0.10589812467454507, + "grad_norm": 12.30889076324346, + "learning_rate": 4.961669139594336e-05, + "loss": 2.7935, + "mean_token_accuracy": 0.3793103456497192, + "step": 105140 + }, + { + "epoch": 0.10590316072764924, + "grad_norm": 10.925506315490697, + "learning_rate": 4.961662252130831e-05, + "loss": 2.1736, + "mean_token_accuracy": 0.5006049573421478, + "step": 105145 + }, + { + "epoch": 0.10590819678075342, + "grad_norm": 12.598723717332883, + "learning_rate": 4.9616553640539135e-05, + "loss": 2.29, + "mean_token_accuracy": 0.4851309359073639, + "step": 105150 + }, + { + "epoch": 0.10591323283385759, + "grad_norm": 11.628122445942168, + "learning_rate": 4.961648475363585e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.39655172228813174, + "step": 105155 + }, + { + "epoch": 0.10591826888696176, + "grad_norm": 11.083621451349437, + "learning_rate": 4.961641586059848e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.4448275864124298, + "step": 105160 + }, + { + "epoch": 0.10592330494006594, + "grad_norm": 11.36896968345212, + "learning_rate": 4.961634696142704e-05, + "loss": 2.5781, + "mean_token_accuracy": 0.39310344457626345, + "step": 105165 + }, + { + "epoch": 0.10592834099317011, + "grad_norm": 9.99275332003944, + "learning_rate": 4.961627805612154e-05, + "loss": 2.1454, + "mean_token_accuracy": 0.45329703092575074, + "step": 105170 + }, + { + "epoch": 0.10593337704627427, + "grad_norm": 12.295185471839522, + "learning_rate": 4.961620914468202e-05, + "loss": 2.6159, + "mean_token_accuracy": 0.3862068921327591, + "step": 105175 + }, + { + "epoch": 0.10593841309937845, + "grad_norm": 15.089669187153085, + "learning_rate": 4.9616140227108476e-05, + "loss": 2.9091, + "mean_token_accuracy": 0.3896551787853241, + "step": 105180 + }, + { + "epoch": 0.10594344915248262, + "grad_norm": 12.425555679652831, + "learning_rate": 4.961607130340094e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.41724138259887694, + "step": 105185 + }, + { + "epoch": 0.10594848520558679, + "grad_norm": 9.991696131651292, + "learning_rate": 4.961600237355943e-05, + "loss": 2.2178, + "mean_token_accuracy": 0.4379310369491577, + "step": 105190 + }, + { + "epoch": 0.10595352125869097, + "grad_norm": 13.056928082819637, + "learning_rate": 4.961593343758397e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.3862069010734558, + "step": 105195 + }, + { + "epoch": 0.10595855731179514, + "grad_norm": 12.656115923246702, + "learning_rate": 4.961586449547456e-05, + "loss": 2.7699, + "mean_token_accuracy": 0.37241379618644715, + "step": 105200 + }, + { + "epoch": 0.10596359336489931, + "grad_norm": 11.125883502101003, + "learning_rate": 4.961579554723124e-05, + "loss": 2.5101, + "mean_token_accuracy": 0.4103448212146759, + "step": 105205 + }, + { + "epoch": 0.10596862941800349, + "grad_norm": 10.473859195726073, + "learning_rate": 4.961572659285403e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.432667875289917, + "step": 105210 + }, + { + "epoch": 0.10597366547110766, + "grad_norm": 12.819109730456917, + "learning_rate": 4.9615657632342935e-05, + "loss": 2.3883, + "mean_token_accuracy": 0.4689655125141144, + "step": 105215 + }, + { + "epoch": 0.10597870152421184, + "grad_norm": 9.737457267989155, + "learning_rate": 4.961558866569798e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.45517241954803467, + "step": 105220 + }, + { + "epoch": 0.10598373757731601, + "grad_norm": 11.944480824546707, + "learning_rate": 4.961551969291918e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.39655172228813174, + "step": 105225 + }, + { + "epoch": 0.10598877363042018, + "grad_norm": 13.39909587970602, + "learning_rate": 4.9615450714006554e-05, + "loss": 2.515, + "mean_token_accuracy": 0.44482758045196535, + "step": 105230 + }, + { + "epoch": 0.10599380968352436, + "grad_norm": 11.198920903498093, + "learning_rate": 4.961538172896013e-05, + "loss": 2.5482, + "mean_token_accuracy": 0.4172413766384125, + "step": 105235 + }, + { + "epoch": 0.10599884573662853, + "grad_norm": 11.951487202630013, + "learning_rate": 4.9615312737779926e-05, + "loss": 2.7598, + "mean_token_accuracy": 0.36896551251411436, + "step": 105240 + }, + { + "epoch": 0.10600388178973269, + "grad_norm": 13.470815415442516, + "learning_rate": 4.961524374046595e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.3655172437429428, + "step": 105245 + }, + { + "epoch": 0.10600891784283686, + "grad_norm": 11.016017163838463, + "learning_rate": 4.9615174737018236e-05, + "loss": 2.6624, + "mean_token_accuracy": 0.4103448212146759, + "step": 105250 + }, + { + "epoch": 0.10601395389594104, + "grad_norm": 10.284956892607294, + "learning_rate": 4.961510572743679e-05, + "loss": 2.1685, + "mean_token_accuracy": 0.4551724076271057, + "step": 105255 + }, + { + "epoch": 0.10601898994904521, + "grad_norm": 13.621210071624485, + "learning_rate": 4.961503671172165e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.38275861740112305, + "step": 105260 + }, + { + "epoch": 0.10602402600214939, + "grad_norm": 11.308854201371862, + "learning_rate": 4.96149676898728e-05, + "loss": 2.5186, + "mean_token_accuracy": 0.3620689630508423, + "step": 105265 + }, + { + "epoch": 0.10602906205525356, + "grad_norm": 11.034911051757247, + "learning_rate": 4.96148986618903e-05, + "loss": 2.1912, + "mean_token_accuracy": 0.4862068951129913, + "step": 105270 + }, + { + "epoch": 0.10603409810835773, + "grad_norm": 12.432000512609143, + "learning_rate": 4.961482962777414e-05, + "loss": 2.5644, + "mean_token_accuracy": 0.42413792610168455, + "step": 105275 + }, + { + "epoch": 0.1060391341614619, + "grad_norm": 10.35387927682903, + "learning_rate": 4.9614760587524355e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.42413792610168455, + "step": 105280 + }, + { + "epoch": 0.10604417021456608, + "grad_norm": 11.600779834109012, + "learning_rate": 4.9614691541140964e-05, + "loss": 2.7816, + "mean_token_accuracy": 0.3862069010734558, + "step": 105285 + }, + { + "epoch": 0.10604920626767025, + "grad_norm": 9.50925266287983, + "learning_rate": 4.961462248862397e-05, + "loss": 2.5536, + "mean_token_accuracy": 0.4241379201412201, + "step": 105290 + }, + { + "epoch": 0.10605424232077443, + "grad_norm": 10.037863619727432, + "learning_rate": 4.961455342997341e-05, + "loss": 2.983, + "mean_token_accuracy": 0.4206896543502808, + "step": 105295 + }, + { + "epoch": 0.1060592783738786, + "grad_norm": 10.916374475968524, + "learning_rate": 4.961448436518929e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.4965517222881317, + "step": 105300 + }, + { + "epoch": 0.10606431442698278, + "grad_norm": 10.63419228515586, + "learning_rate": 4.961441529427164e-05, + "loss": 2.5277, + "mean_token_accuracy": 0.36206896901130675, + "step": 105305 + }, + { + "epoch": 0.10606935048008695, + "grad_norm": 11.860842572942735, + "learning_rate": 4.961434621722048e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.5034482836723327, + "step": 105310 + }, + { + "epoch": 0.10607438653319111, + "grad_norm": 10.613797104784211, + "learning_rate": 4.9614277134035816e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.41724138259887694, + "step": 105315 + }, + { + "epoch": 0.10607942258629528, + "grad_norm": 10.796840745443847, + "learning_rate": 4.9614208044717667e-05, + "loss": 2.3432, + "mean_token_accuracy": 0.4310344815254211, + "step": 105320 + }, + { + "epoch": 0.10608445863939946, + "grad_norm": 10.8506939966427, + "learning_rate": 4.9614138949266064e-05, + "loss": 2.6897, + "mean_token_accuracy": 0.4034482717514038, + "step": 105325 + }, + { + "epoch": 0.10608949469250363, + "grad_norm": 11.259507604331944, + "learning_rate": 4.9614069847681034e-05, + "loss": 2.1789, + "mean_token_accuracy": 0.4379310369491577, + "step": 105330 + }, + { + "epoch": 0.1060945307456078, + "grad_norm": 16.963744397502197, + "learning_rate": 4.961400073996257e-05, + "loss": 2.6779, + "mean_token_accuracy": 0.39310345649719236, + "step": 105335 + }, + { + "epoch": 0.10609956679871198, + "grad_norm": 10.599918125224239, + "learning_rate": 4.961393162611071e-05, + "loss": 2.3201, + "mean_token_accuracy": 0.41379310488700866, + "step": 105340 + }, + { + "epoch": 0.10610460285181615, + "grad_norm": 13.463836883453, + "learning_rate": 4.9613862506125475e-05, + "loss": 2.7117, + "mean_token_accuracy": 0.3793103456497192, + "step": 105345 + }, + { + "epoch": 0.10610963890492033, + "grad_norm": 10.605284951449091, + "learning_rate": 4.9613793380006875e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.4448275864124298, + "step": 105350 + }, + { + "epoch": 0.1061146749580245, + "grad_norm": 13.286983318225928, + "learning_rate": 4.961372424775492e-05, + "loss": 2.8408, + "mean_token_accuracy": 0.4137930989265442, + "step": 105355 + }, + { + "epoch": 0.10611971101112867, + "grad_norm": 10.659024874570177, + "learning_rate": 4.961365510936965e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.41379310488700866, + "step": 105360 + }, + { + "epoch": 0.10612474706423285, + "grad_norm": 10.239351153667839, + "learning_rate": 4.9613585964851084e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.43103448748588563, + "step": 105365 + }, + { + "epoch": 0.10612978311733702, + "grad_norm": 9.778370845000953, + "learning_rate": 4.9613516814199225e-05, + "loss": 2.1441, + "mean_token_accuracy": 0.4862068951129913, + "step": 105370 + }, + { + "epoch": 0.1061348191704412, + "grad_norm": 11.085666650011644, + "learning_rate": 4.9613447657414095e-05, + "loss": 2.5558, + "mean_token_accuracy": 0.41923774480819703, + "step": 105375 + }, + { + "epoch": 0.10613985522354537, + "grad_norm": 15.21483270012711, + "learning_rate": 4.961337849449573e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.4068965494632721, + "step": 105380 + }, + { + "epoch": 0.10614489127664953, + "grad_norm": 12.536474340951818, + "learning_rate": 4.9613309325444125e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.4137930989265442, + "step": 105385 + }, + { + "epoch": 0.1061499273297537, + "grad_norm": 14.373195057216673, + "learning_rate": 4.961324015025932e-05, + "loss": 2.7939, + "mean_token_accuracy": 0.32068965435028074, + "step": 105390 + }, + { + "epoch": 0.10615496338285788, + "grad_norm": 7.794521083882906, + "learning_rate": 4.9613170968941316e-05, + "loss": 2.2107, + "mean_token_accuracy": 0.44137930274009707, + "step": 105395 + }, + { + "epoch": 0.10615999943596205, + "grad_norm": 13.502608002782837, + "learning_rate": 4.9613101781490145e-05, + "loss": 2.3439, + "mean_token_accuracy": 0.41379310488700866, + "step": 105400 + }, + { + "epoch": 0.10616503548906622, + "grad_norm": 11.00293083121396, + "learning_rate": 4.9613032587905824e-05, + "loss": 2.6104, + "mean_token_accuracy": 0.39310344457626345, + "step": 105405 + }, + { + "epoch": 0.1061700715421704, + "grad_norm": 12.561234825180192, + "learning_rate": 4.961296338818837e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.43103447556495667, + "step": 105410 + }, + { + "epoch": 0.10617510759527457, + "grad_norm": 12.108726898952854, + "learning_rate": 4.9612894182337805e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.4310344815254211, + "step": 105415 + }, + { + "epoch": 0.10618014364837874, + "grad_norm": 11.3990008502189, + "learning_rate": 4.961282497035415e-05, + "loss": 2.7171, + "mean_token_accuracy": 0.3965517282485962, + "step": 105420 + }, + { + "epoch": 0.10618517970148292, + "grad_norm": 9.414912725330982, + "learning_rate": 4.961275575223741e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.42068964838981626, + "step": 105425 + }, + { + "epoch": 0.10619021575458709, + "grad_norm": 10.50253492514507, + "learning_rate": 4.961268652798762e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.4517241358757019, + "step": 105430 + }, + { + "epoch": 0.10619525180769127, + "grad_norm": 10.619512878157053, + "learning_rate": 4.9612617297604795e-05, + "loss": 2.292, + "mean_token_accuracy": 0.48620688915252686, + "step": 105435 + }, + { + "epoch": 0.10620028786079544, + "grad_norm": 16.64499333926843, + "learning_rate": 4.961254806108895e-05, + "loss": 2.5417, + "mean_token_accuracy": 0.3965517282485962, + "step": 105440 + }, + { + "epoch": 0.10620532391389961, + "grad_norm": 11.782473629050147, + "learning_rate": 4.9612478818440116e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.3862068891525269, + "step": 105445 + }, + { + "epoch": 0.10621035996700379, + "grad_norm": 9.963823262948056, + "learning_rate": 4.96124095696583e-05, + "loss": 2.2809, + "mean_token_accuracy": 0.4413793087005615, + "step": 105450 + }, + { + "epoch": 0.10621539602010795, + "grad_norm": 10.65502581556437, + "learning_rate": 4.961234031474351e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.38275861740112305, + "step": 105455 + }, + { + "epoch": 0.10622043207321212, + "grad_norm": 9.713704383903654, + "learning_rate": 4.96122710536958e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.42413792610168455, + "step": 105460 + }, + { + "epoch": 0.1062254681263163, + "grad_norm": 9.503679212412685, + "learning_rate": 4.9612201786515156e-05, + "loss": 2.0081, + "mean_token_accuracy": 0.5189957678318023, + "step": 105465 + }, + { + "epoch": 0.10623050417942047, + "grad_norm": 12.759896204469833, + "learning_rate": 4.961213251320162e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.4206896543502808, + "step": 105470 + }, + { + "epoch": 0.10623554023252464, + "grad_norm": 8.492496292863187, + "learning_rate": 4.96120632337552e-05, + "loss": 2.222, + "mean_token_accuracy": 0.4562807857990265, + "step": 105475 + }, + { + "epoch": 0.10624057628562882, + "grad_norm": 10.393793841584083, + "learning_rate": 4.9611993948175904e-05, + "loss": 2.3483, + "mean_token_accuracy": 0.45862069725990295, + "step": 105480 + }, + { + "epoch": 0.10624561233873299, + "grad_norm": 10.0208717486039, + "learning_rate": 4.961192465646378e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.4241379201412201, + "step": 105485 + }, + { + "epoch": 0.10625064839183716, + "grad_norm": 10.499095883016969, + "learning_rate": 4.9611855358618824e-05, + "loss": 2.4608, + "mean_token_accuracy": 0.37931033968925476, + "step": 105490 + }, + { + "epoch": 0.10625568444494134, + "grad_norm": 10.260554888578302, + "learning_rate": 4.961178605464106e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.41034482717514037, + "step": 105495 + }, + { + "epoch": 0.10626072049804551, + "grad_norm": 8.381159521609332, + "learning_rate": 4.961171674453051e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.4103448331356049, + "step": 105500 + }, + { + "epoch": 0.10626575655114968, + "grad_norm": 10.240063427387415, + "learning_rate": 4.961164742828721e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.44361767172813416, + "step": 105505 + }, + { + "epoch": 0.10627079260425386, + "grad_norm": 14.493319553815224, + "learning_rate": 4.961157810591114e-05, + "loss": 2.4815, + "mean_token_accuracy": 0.41034482717514037, + "step": 105510 + }, + { + "epoch": 0.10627582865735803, + "grad_norm": 13.59889139638753, + "learning_rate": 4.9611508777402346e-05, + "loss": 2.5607, + "mean_token_accuracy": 0.44827585816383364, + "step": 105515 + }, + { + "epoch": 0.1062808647104622, + "grad_norm": 11.754689613278279, + "learning_rate": 4.961143944276085e-05, + "loss": 2.599, + "mean_token_accuracy": 0.441379314661026, + "step": 105520 + }, + { + "epoch": 0.10628590076356637, + "grad_norm": 10.770112403080116, + "learning_rate": 4.961137010198666e-05, + "loss": 2.1231, + "mean_token_accuracy": 0.4586206912994385, + "step": 105525 + }, + { + "epoch": 0.10629093681667054, + "grad_norm": 10.718579797369966, + "learning_rate": 4.961130075507979e-05, + "loss": 2.2288, + "mean_token_accuracy": 0.47773745059967043, + "step": 105530 + }, + { + "epoch": 0.10629597286977471, + "grad_norm": 10.329629135147139, + "learning_rate": 4.961123140204028e-05, + "loss": 2.3146, + "mean_token_accuracy": 0.42068966031074523, + "step": 105535 + }, + { + "epoch": 0.10630100892287889, + "grad_norm": 9.794321499993579, + "learning_rate": 4.9611162042868134e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.4172413766384125, + "step": 105540 + }, + { + "epoch": 0.10630604497598306, + "grad_norm": 10.215088042946645, + "learning_rate": 4.9611092677563374e-05, + "loss": 2.0877, + "mean_token_accuracy": 0.45517241954803467, + "step": 105545 + }, + { + "epoch": 0.10631108102908723, + "grad_norm": 14.690943723624414, + "learning_rate": 4.961102330612602e-05, + "loss": 2.9215, + "mean_token_accuracy": 0.3793103456497192, + "step": 105550 + }, + { + "epoch": 0.10631611708219141, + "grad_norm": 10.91961207736115, + "learning_rate": 4.9610953928556095e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.441379314661026, + "step": 105555 + }, + { + "epoch": 0.10632115313529558, + "grad_norm": 11.774315441355565, + "learning_rate": 4.961088454485362e-05, + "loss": 2.7718, + "mean_token_accuracy": 0.4068965494632721, + "step": 105560 + }, + { + "epoch": 0.10632618918839976, + "grad_norm": 12.845555990087119, + "learning_rate": 4.9610815155018594e-05, + "loss": 2.5161, + "mean_token_accuracy": 0.4068965554237366, + "step": 105565 + }, + { + "epoch": 0.10633122524150393, + "grad_norm": 10.969263196048981, + "learning_rate": 4.961074575905106e-05, + "loss": 2.2892, + "mean_token_accuracy": 0.441379314661026, + "step": 105570 + }, + { + "epoch": 0.1063362612946081, + "grad_norm": 12.889627573376501, + "learning_rate": 4.961067635695102e-05, + "loss": 2.522, + "mean_token_accuracy": 0.40465819239616396, + "step": 105575 + }, + { + "epoch": 0.10634129734771228, + "grad_norm": 10.391521625971489, + "learning_rate": 4.9610606948718506e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.4413793087005615, + "step": 105580 + }, + { + "epoch": 0.10634633340081645, + "grad_norm": 9.615394528583272, + "learning_rate": 4.961053753435354e-05, + "loss": 2.1895, + "mean_token_accuracy": 0.4517241299152374, + "step": 105585 + }, + { + "epoch": 0.10635136945392062, + "grad_norm": 8.10619015543136, + "learning_rate": 4.9610468113856124e-05, + "loss": 2.2081, + "mean_token_accuracy": 0.493719220161438, + "step": 105590 + }, + { + "epoch": 0.10635640550702478, + "grad_norm": 10.511974420793832, + "learning_rate": 4.961039868722629e-05, + "loss": 2.6032, + "mean_token_accuracy": 0.43103448748588563, + "step": 105595 + }, + { + "epoch": 0.10636144156012896, + "grad_norm": 10.222124705448744, + "learning_rate": 4.961032925446406e-05, + "loss": 2.5101, + "mean_token_accuracy": 0.43103448748588563, + "step": 105600 + }, + { + "epoch": 0.10636647761323313, + "grad_norm": 23.42722016373519, + "learning_rate": 4.961025981556943e-05, + "loss": 2.6944, + "mean_token_accuracy": 0.42068966031074523, + "step": 105605 + }, + { + "epoch": 0.1063715136663373, + "grad_norm": 13.195504360518031, + "learning_rate": 4.961019037054245e-05, + "loss": 2.8729, + "mean_token_accuracy": 0.3551724135875702, + "step": 105610 + }, + { + "epoch": 0.10637654971944148, + "grad_norm": 11.924925565640642, + "learning_rate": 4.961012091938313e-05, + "loss": 2.296, + "mean_token_accuracy": 0.4620689630508423, + "step": 105615 + }, + { + "epoch": 0.10638158577254565, + "grad_norm": 11.562062057140933, + "learning_rate": 4.9610051462091486e-05, + "loss": 2.4037, + "mean_token_accuracy": 0.43793103098869324, + "step": 105620 + }, + { + "epoch": 0.10638662182564983, + "grad_norm": 13.330768663913016, + "learning_rate": 4.9609981998667534e-05, + "loss": 2.9719, + "mean_token_accuracy": 0.358620685338974, + "step": 105625 + }, + { + "epoch": 0.106391657878754, + "grad_norm": 11.510533390902443, + "learning_rate": 4.9609912529111294e-05, + "loss": 3.0845, + "mean_token_accuracy": 0.324137932062149, + "step": 105630 + }, + { + "epoch": 0.10639669393185817, + "grad_norm": 11.336166139199856, + "learning_rate": 4.9609843053422786e-05, + "loss": 2.2834, + "mean_token_accuracy": 0.4448275864124298, + "step": 105635 + }, + { + "epoch": 0.10640172998496235, + "grad_norm": 10.63952827321493, + "learning_rate": 4.960977357160204e-05, + "loss": 2.8423, + "mean_token_accuracy": 0.39310343861579894, + "step": 105640 + }, + { + "epoch": 0.10640676603806652, + "grad_norm": 10.936849103297401, + "learning_rate": 4.960970408364904e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.458620685338974, + "step": 105645 + }, + { + "epoch": 0.1064118020911707, + "grad_norm": 10.556678027162631, + "learning_rate": 4.960963458956386e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4103448212146759, + "step": 105650 + }, + { + "epoch": 0.10641683814427487, + "grad_norm": 9.883236959531065, + "learning_rate": 4.9609565089346476e-05, + "loss": 2.1086, + "mean_token_accuracy": 0.4206896543502808, + "step": 105655 + }, + { + "epoch": 0.10642187419737904, + "grad_norm": 14.404082389049533, + "learning_rate": 4.960949558299693e-05, + "loss": 2.9326, + "mean_token_accuracy": 0.3758620619773865, + "step": 105660 + }, + { + "epoch": 0.1064269102504832, + "grad_norm": 10.168499193146074, + "learning_rate": 4.9609426070515226e-05, + "loss": 2.2846, + "mean_token_accuracy": 0.42589232325553894, + "step": 105665 + }, + { + "epoch": 0.10643194630358738, + "grad_norm": 10.892191226795626, + "learning_rate": 4.960935655190139e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.3965517282485962, + "step": 105670 + }, + { + "epoch": 0.10643698235669155, + "grad_norm": 10.143857847713587, + "learning_rate": 4.960928702715545e-05, + "loss": 2.1427, + "mean_token_accuracy": 0.4517241358757019, + "step": 105675 + }, + { + "epoch": 0.10644201840979572, + "grad_norm": 10.899831503693116, + "learning_rate": 4.96092174962774e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.4172413766384125, + "step": 105680 + }, + { + "epoch": 0.1064470544628999, + "grad_norm": 9.179687584947143, + "learning_rate": 4.960914795926729e-05, + "loss": 2.0653, + "mean_token_accuracy": 0.4986690878868103, + "step": 105685 + }, + { + "epoch": 0.10645209051600407, + "grad_norm": 11.28341117584674, + "learning_rate": 4.960907841612512e-05, + "loss": 2.4174, + "mean_token_accuracy": 0.40689656138420105, + "step": 105690 + }, + { + "epoch": 0.10645712656910825, + "grad_norm": 12.171949813679081, + "learning_rate": 4.960900886685092e-05, + "loss": 2.2726, + "mean_token_accuracy": 0.4620689690113068, + "step": 105695 + }, + { + "epoch": 0.10646216262221242, + "grad_norm": 14.024559736456316, + "learning_rate": 4.9608939311444696e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.3965517282485962, + "step": 105700 + }, + { + "epoch": 0.1064671986753166, + "grad_norm": 10.475452493906293, + "learning_rate": 4.960886974990648e-05, + "loss": 2.8657, + "mean_token_accuracy": 0.3896551728248596, + "step": 105705 + }, + { + "epoch": 0.10647223472842077, + "grad_norm": 9.217486913995485, + "learning_rate": 4.9608800182236284e-05, + "loss": 2.2551, + "mean_token_accuracy": 0.4517241418361664, + "step": 105710 + }, + { + "epoch": 0.10647727078152494, + "grad_norm": 7.74224869736242, + "learning_rate": 4.9608730608434134e-05, + "loss": 1.949, + "mean_token_accuracy": 0.5151477873325347, + "step": 105715 + }, + { + "epoch": 0.10648230683462911, + "grad_norm": 14.009155484137054, + "learning_rate": 4.960866102850004e-05, + "loss": 2.7425, + "mean_token_accuracy": 0.35862069129943847, + "step": 105720 + }, + { + "epoch": 0.10648734288773329, + "grad_norm": 9.48476867867452, + "learning_rate": 4.960859144243403e-05, + "loss": 2.0394, + "mean_token_accuracy": 0.49122806787490847, + "step": 105725 + }, + { + "epoch": 0.10649237894083746, + "grad_norm": 11.010740838657517, + "learning_rate": 4.960852185023612e-05, + "loss": 2.2702, + "mean_token_accuracy": 0.4586206912994385, + "step": 105730 + }, + { + "epoch": 0.10649741499394162, + "grad_norm": 11.02368755871764, + "learning_rate": 4.960845225190633e-05, + "loss": 2.5832, + "mean_token_accuracy": 0.40689654350280763, + "step": 105735 + }, + { + "epoch": 0.1065024510470458, + "grad_norm": 10.056229824156834, + "learning_rate": 4.960838264744467e-05, + "loss": 2.2277, + "mean_token_accuracy": 0.4379310369491577, + "step": 105740 + }, + { + "epoch": 0.10650748710014997, + "grad_norm": 12.45825661268633, + "learning_rate": 4.9608313036851184e-05, + "loss": 2.9005, + "mean_token_accuracy": 0.3793103456497192, + "step": 105745 + }, + { + "epoch": 0.10651252315325414, + "grad_norm": 10.239135865581074, + "learning_rate": 4.960824342012586e-05, + "loss": 2.1918, + "mean_token_accuracy": 0.47931033968925474, + "step": 105750 + }, + { + "epoch": 0.10651755920635832, + "grad_norm": 10.642204973958417, + "learning_rate": 4.9608173797268735e-05, + "loss": 2.243, + "mean_token_accuracy": 0.44827585816383364, + "step": 105755 + }, + { + "epoch": 0.10652259525946249, + "grad_norm": 10.59360479051198, + "learning_rate": 4.960810416827983e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.4034482777118683, + "step": 105760 + }, + { + "epoch": 0.10652763131256666, + "grad_norm": 8.951112576899517, + "learning_rate": 4.960803453315915e-05, + "loss": 2.0167, + "mean_token_accuracy": 0.5076225101947784, + "step": 105765 + }, + { + "epoch": 0.10653266736567084, + "grad_norm": 11.327368266147833, + "learning_rate": 4.960796489190674e-05, + "loss": 2.165, + "mean_token_accuracy": 0.4713248610496521, + "step": 105770 + }, + { + "epoch": 0.10653770341877501, + "grad_norm": 10.751595775869605, + "learning_rate": 4.960789524452259e-05, + "loss": 3.0152, + "mean_token_accuracy": 0.38620689809322356, + "step": 105775 + }, + { + "epoch": 0.10654273947187919, + "grad_norm": 10.134679720416521, + "learning_rate": 4.960782559100674e-05, + "loss": 2.5519, + "mean_token_accuracy": 0.3896551728248596, + "step": 105780 + }, + { + "epoch": 0.10654777552498336, + "grad_norm": 14.89436097122948, + "learning_rate": 4.9607755931359204e-05, + "loss": 2.7041, + "mean_token_accuracy": 0.4154264897108078, + "step": 105785 + }, + { + "epoch": 0.10655281157808753, + "grad_norm": 11.671515814058447, + "learning_rate": 4.960768626557999e-05, + "loss": 2.3587, + "mean_token_accuracy": 0.4398669183254242, + "step": 105790 + }, + { + "epoch": 0.10655784763119171, + "grad_norm": 9.428897918744239, + "learning_rate": 4.960761659366913e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.40344828367233276, + "step": 105795 + }, + { + "epoch": 0.10656288368429588, + "grad_norm": 10.661186134068625, + "learning_rate": 4.960754691562664e-05, + "loss": 2.1223, + "mean_token_accuracy": 0.4896551787853241, + "step": 105800 + }, + { + "epoch": 0.10656791973740004, + "grad_norm": 15.771477211011103, + "learning_rate": 4.9607477231452545e-05, + "loss": 2.5723, + "mean_token_accuracy": 0.4344827592372894, + "step": 105805 + }, + { + "epoch": 0.10657295579050421, + "grad_norm": 9.904939115355846, + "learning_rate": 4.9607407541146854e-05, + "loss": 2.2659, + "mean_token_accuracy": 0.4103448212146759, + "step": 105810 + }, + { + "epoch": 0.10657799184360839, + "grad_norm": 12.325862312722297, + "learning_rate": 4.960733784470959e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.4103448212146759, + "step": 105815 + }, + { + "epoch": 0.10658302789671256, + "grad_norm": 9.290105810914442, + "learning_rate": 4.9607268142140776e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.43103448748588563, + "step": 105820 + }, + { + "epoch": 0.10658806394981674, + "grad_norm": 13.122035911356026, + "learning_rate": 4.960719843344043e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.42758620977401735, + "step": 105825 + }, + { + "epoch": 0.10659310000292091, + "grad_norm": 10.803169377196573, + "learning_rate": 4.960712871860857e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.44827587008476255, + "step": 105830 + }, + { + "epoch": 0.10659813605602508, + "grad_norm": 9.666909029526721, + "learning_rate": 4.960705899764522e-05, + "loss": 2.2915, + "mean_token_accuracy": 0.3965517163276672, + "step": 105835 + }, + { + "epoch": 0.10660317210912926, + "grad_norm": 10.077490101975984, + "learning_rate": 4.960698927055038e-05, + "loss": 2.4332, + "mean_token_accuracy": 0.41034482717514037, + "step": 105840 + }, + { + "epoch": 0.10660820816223343, + "grad_norm": 9.293324603500935, + "learning_rate": 4.960691953732409e-05, + "loss": 2.2629, + "mean_token_accuracy": 0.4310344815254211, + "step": 105845 + }, + { + "epoch": 0.1066132442153376, + "grad_norm": 9.346917581556548, + "learning_rate": 4.960684979796637e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.44827585816383364, + "step": 105850 + }, + { + "epoch": 0.10661828026844178, + "grad_norm": 10.554066058941908, + "learning_rate": 4.960678005247723e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.3827586233615875, + "step": 105855 + }, + { + "epoch": 0.10662331632154595, + "grad_norm": 10.08565646644747, + "learning_rate": 4.960671030085669e-05, + "loss": 2.2852, + "mean_token_accuracy": 0.3999999940395355, + "step": 105860 + }, + { + "epoch": 0.10662835237465013, + "grad_norm": 10.991491638556516, + "learning_rate": 4.960664054310477e-05, + "loss": 2.3441, + "mean_token_accuracy": 0.4448275864124298, + "step": 105865 + }, + { + "epoch": 0.1066333884277543, + "grad_norm": 12.157366812144062, + "learning_rate": 4.9606570779221494e-05, + "loss": 2.1713, + "mean_token_accuracy": 0.45674530863761903, + "step": 105870 + }, + { + "epoch": 0.10663842448085846, + "grad_norm": 10.160379790920366, + "learning_rate": 4.960650100920688e-05, + "loss": 2.0676, + "mean_token_accuracy": 0.49999998807907103, + "step": 105875 + }, + { + "epoch": 0.10664346053396263, + "grad_norm": 10.594469968303201, + "learning_rate": 4.960643123306094e-05, + "loss": 2.8621, + "mean_token_accuracy": 0.37241379618644715, + "step": 105880 + }, + { + "epoch": 0.10664849658706681, + "grad_norm": 10.492713765073432, + "learning_rate": 4.96063614507837e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.43103448748588563, + "step": 105885 + }, + { + "epoch": 0.10665353264017098, + "grad_norm": 11.07504603392353, + "learning_rate": 4.960629166237518e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.4642468154430389, + "step": 105890 + }, + { + "epoch": 0.10665856869327515, + "grad_norm": 10.428499407613012, + "learning_rate": 4.9606221867835396e-05, + "loss": 2.5399, + "mean_token_accuracy": 0.41379310488700866, + "step": 105895 + }, + { + "epoch": 0.10666360474637933, + "grad_norm": 13.77881257173632, + "learning_rate": 4.960615206716437e-05, + "loss": 2.5363, + "mean_token_accuracy": 0.40852994918823243, + "step": 105900 + }, + { + "epoch": 0.1066686407994835, + "grad_norm": 10.345392802571451, + "learning_rate": 4.9606082260362116e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.4206896543502808, + "step": 105905 + }, + { + "epoch": 0.10667367685258768, + "grad_norm": 13.557675106105078, + "learning_rate": 4.9606012447428666e-05, + "loss": 2.2094, + "mean_token_accuracy": 0.4310344815254211, + "step": 105910 + }, + { + "epoch": 0.10667871290569185, + "grad_norm": 14.314976740800002, + "learning_rate": 4.9605942628364027e-05, + "loss": 2.5085, + "mean_token_accuracy": 0.37586206793785093, + "step": 105915 + }, + { + "epoch": 0.10668374895879602, + "grad_norm": 11.8181993652702, + "learning_rate": 4.9605872803168224e-05, + "loss": 2.2983, + "mean_token_accuracy": 0.4655172348022461, + "step": 105920 + }, + { + "epoch": 0.1066887850119002, + "grad_norm": 11.343552154104566, + "learning_rate": 4.960580297184127e-05, + "loss": 3.028, + "mean_token_accuracy": 0.337931028008461, + "step": 105925 + }, + { + "epoch": 0.10669382106500437, + "grad_norm": 12.292892553855758, + "learning_rate": 4.9605733134383195e-05, + "loss": 3.0418, + "mean_token_accuracy": 0.3774954617023468, + "step": 105930 + }, + { + "epoch": 0.10669885711810854, + "grad_norm": 12.814627635984294, + "learning_rate": 4.960566329079401e-05, + "loss": 2.5443, + "mean_token_accuracy": 0.4344827592372894, + "step": 105935 + }, + { + "epoch": 0.10670389317121272, + "grad_norm": 8.75248970029647, + "learning_rate": 4.960559344107373e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.4206896543502808, + "step": 105940 + }, + { + "epoch": 0.10670892922431688, + "grad_norm": 11.417607596483748, + "learning_rate": 4.960552358522239e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.47931034564971925, + "step": 105945 + }, + { + "epoch": 0.10671396527742105, + "grad_norm": 10.111870792220772, + "learning_rate": 4.960545372324e-05, + "loss": 2.5146, + "mean_token_accuracy": 0.358620685338974, + "step": 105950 + }, + { + "epoch": 0.10671900133052523, + "grad_norm": 9.850160258794245, + "learning_rate": 4.9605383855126575e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.412522679567337, + "step": 105955 + }, + { + "epoch": 0.1067240373836294, + "grad_norm": 17.821630861894306, + "learning_rate": 4.9605313980882144e-05, + "loss": 3.4998, + "mean_token_accuracy": 0.334482753276825, + "step": 105960 + }, + { + "epoch": 0.10672907343673357, + "grad_norm": 11.355975336235863, + "learning_rate": 4.960524410050672e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.42068964838981626, + "step": 105965 + }, + { + "epoch": 0.10673410948983775, + "grad_norm": 11.183338936268772, + "learning_rate": 4.960517421400032e-05, + "loss": 3.2897, + "mean_token_accuracy": 0.31379310190677645, + "step": 105970 + }, + { + "epoch": 0.10673914554294192, + "grad_norm": 10.3078057862167, + "learning_rate": 4.960510432136298e-05, + "loss": 2.2432, + "mean_token_accuracy": 0.4551724135875702, + "step": 105975 + }, + { + "epoch": 0.1067441815960461, + "grad_norm": 10.843678159346208, + "learning_rate": 4.9605034422594695e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.4034482777118683, + "step": 105980 + }, + { + "epoch": 0.10674921764915027, + "grad_norm": 9.871976758010373, + "learning_rate": 4.96049645176955e-05, + "loss": 2.4233, + "mean_token_accuracy": 0.4641863226890564, + "step": 105985 + }, + { + "epoch": 0.10675425370225444, + "grad_norm": 11.668965947357728, + "learning_rate": 4.960489460666541e-05, + "loss": 2.1161, + "mean_token_accuracy": 0.47241379618644713, + "step": 105990 + }, + { + "epoch": 0.10675928975535862, + "grad_norm": 9.717714476370647, + "learning_rate": 4.9604824689504445e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.4034482777118683, + "step": 105995 + }, + { + "epoch": 0.10676432580846279, + "grad_norm": 10.017032205451983, + "learning_rate": 4.9604754766212625e-05, + "loss": 2.6212, + "mean_token_accuracy": 0.3931034505367279, + "step": 106000 + }, + { + "epoch": 0.10676936186156696, + "grad_norm": 11.555550626982852, + "learning_rate": 4.960468483678996e-05, + "loss": 2.1882, + "mean_token_accuracy": 0.4586206912994385, + "step": 106005 + }, + { + "epoch": 0.10677439791467114, + "grad_norm": 9.750141990981302, + "learning_rate": 4.96046149012365e-05, + "loss": 2.3654, + "mean_token_accuracy": 0.42758620381355283, + "step": 106010 + }, + { + "epoch": 0.1067794339677753, + "grad_norm": 9.966254099722603, + "learning_rate": 4.960454495955222e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.44137929677963256, + "step": 106015 + }, + { + "epoch": 0.10678447002087947, + "grad_norm": 9.19069934302603, + "learning_rate": 4.9604475011737174e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.4379310369491577, + "step": 106020 + }, + { + "epoch": 0.10678950607398364, + "grad_norm": 8.92999780093296, + "learning_rate": 4.9604405057791367e-05, + "loss": 2.0855, + "mean_token_accuracy": 0.49655171632766726, + "step": 106025 + }, + { + "epoch": 0.10679454212708782, + "grad_norm": 11.594527455518849, + "learning_rate": 4.9604335097714816e-05, + "loss": 2.2019, + "mean_token_accuracy": 0.4344827651977539, + "step": 106030 + }, + { + "epoch": 0.10679957818019199, + "grad_norm": 11.231909746995255, + "learning_rate": 4.960426513150756e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.42758620977401735, + "step": 106035 + }, + { + "epoch": 0.10680461423329617, + "grad_norm": 10.60092254365431, + "learning_rate": 4.960419515916958e-05, + "loss": 2.2344, + "mean_token_accuracy": 0.4586206912994385, + "step": 106040 + }, + { + "epoch": 0.10680965028640034, + "grad_norm": 10.89283291457191, + "learning_rate": 4.960412518070093e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.42068964838981626, + "step": 106045 + }, + { + "epoch": 0.10681468633950451, + "grad_norm": 11.15762957271298, + "learning_rate": 4.9604055196101624e-05, + "loss": 2.7114, + "mean_token_accuracy": 0.4157894730567932, + "step": 106050 + }, + { + "epoch": 0.10681972239260869, + "grad_norm": 8.172697546037096, + "learning_rate": 4.960398520537167e-05, + "loss": 2.366, + "mean_token_accuracy": 0.44682395458221436, + "step": 106055 + }, + { + "epoch": 0.10682475844571286, + "grad_norm": 11.559083532195913, + "learning_rate": 4.96039152085111e-05, + "loss": 2.3883, + "mean_token_accuracy": 0.4620689690113068, + "step": 106060 + }, + { + "epoch": 0.10682979449881704, + "grad_norm": 10.810911381803772, + "learning_rate": 4.9603845205519925e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.39655172526836396, + "step": 106065 + }, + { + "epoch": 0.10683483055192121, + "grad_norm": 13.772707847197502, + "learning_rate": 4.960377519639816e-05, + "loss": 2.7858, + "mean_token_accuracy": 0.38620689809322356, + "step": 106070 + }, + { + "epoch": 0.10683986660502538, + "grad_norm": 16.89154582576504, + "learning_rate": 4.960370518114584e-05, + "loss": 3.0403, + "mean_token_accuracy": 0.4034482777118683, + "step": 106075 + }, + { + "epoch": 0.10684490265812956, + "grad_norm": 12.273779166174588, + "learning_rate": 4.960363515976296e-05, + "loss": 2.5523, + "mean_token_accuracy": 0.3814881980419159, + "step": 106080 + }, + { + "epoch": 0.10684993871123372, + "grad_norm": 10.545258833736206, + "learning_rate": 4.960356513224957e-05, + "loss": 1.9563, + "mean_token_accuracy": 0.506896561384201, + "step": 106085 + }, + { + "epoch": 0.10685497476433789, + "grad_norm": 9.494618917363432, + "learning_rate": 4.960349509860566e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.4241379380226135, + "step": 106090 + }, + { + "epoch": 0.10686001081744206, + "grad_norm": 9.699515332554189, + "learning_rate": 4.960342505883127e-05, + "loss": 2.651, + "mean_token_accuracy": 0.3689655065536499, + "step": 106095 + }, + { + "epoch": 0.10686504687054624, + "grad_norm": 9.225856898683219, + "learning_rate": 4.960335501292642e-05, + "loss": 2.2099, + "mean_token_accuracy": 0.41379310488700866, + "step": 106100 + }, + { + "epoch": 0.10687008292365041, + "grad_norm": 13.655133750090595, + "learning_rate": 4.960328496089111e-05, + "loss": 2.7623, + "mean_token_accuracy": 0.4034482717514038, + "step": 106105 + }, + { + "epoch": 0.10687511897675459, + "grad_norm": 10.576333421835402, + "learning_rate": 4.9603214902725375e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.4114531993865967, + "step": 106110 + }, + { + "epoch": 0.10688015502985876, + "grad_norm": 9.96006598630805, + "learning_rate": 4.9603144838429234e-05, + "loss": 2.349, + "mean_token_accuracy": 0.3965517163276672, + "step": 106115 + }, + { + "epoch": 0.10688519108296293, + "grad_norm": 10.53316904756797, + "learning_rate": 4.960307476800271e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.44694494009017943, + "step": 106120 + }, + { + "epoch": 0.1068902271360671, + "grad_norm": 10.250091928136863, + "learning_rate": 4.9603004691445806e-05, + "loss": 2.1367, + "mean_token_accuracy": 0.4813672065734863, + "step": 106125 + }, + { + "epoch": 0.10689526318917128, + "grad_norm": 9.788375258606946, + "learning_rate": 4.960293460875855e-05, + "loss": 2.1996, + "mean_token_accuracy": 0.4896551787853241, + "step": 106130 + }, + { + "epoch": 0.10690029924227545, + "grad_norm": 11.068595653552201, + "learning_rate": 4.960286451994096e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4137930989265442, + "step": 106135 + }, + { + "epoch": 0.10690533529537963, + "grad_norm": 11.564967844533834, + "learning_rate": 4.960279442499306e-05, + "loss": 2.2954, + "mean_token_accuracy": 0.42413792610168455, + "step": 106140 + }, + { + "epoch": 0.1069103713484838, + "grad_norm": 10.215552343545129, + "learning_rate": 4.960272432391487e-05, + "loss": 2.0954, + "mean_token_accuracy": 0.46551724672317507, + "step": 106145 + }, + { + "epoch": 0.10691540740158798, + "grad_norm": 18.567800987719952, + "learning_rate": 4.960265421670641e-05, + "loss": 2.4066, + "mean_token_accuracy": 0.4222625494003296, + "step": 106150 + }, + { + "epoch": 0.10692044345469214, + "grad_norm": 12.28733449130241, + "learning_rate": 4.9602584103367694e-05, + "loss": 1.969, + "mean_token_accuracy": 0.4620689630508423, + "step": 106155 + }, + { + "epoch": 0.10692547950779631, + "grad_norm": 10.080060803221418, + "learning_rate": 4.960251398389875e-05, + "loss": 2.1451, + "mean_token_accuracy": 0.4758620738983154, + "step": 106160 + }, + { + "epoch": 0.10693051556090048, + "grad_norm": 12.6990985086516, + "learning_rate": 4.960244385829958e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.43448275327682495, + "step": 106165 + }, + { + "epoch": 0.10693555161400466, + "grad_norm": 9.355155322848296, + "learning_rate": 4.960237372657022e-05, + "loss": 2.4064, + "mean_token_accuracy": 0.4689655125141144, + "step": 106170 + }, + { + "epoch": 0.10694058766710883, + "grad_norm": 12.179542378794698, + "learning_rate": 4.9602303588710686e-05, + "loss": 3.1062, + "mean_token_accuracy": 0.3620689630508423, + "step": 106175 + }, + { + "epoch": 0.106945623720213, + "grad_norm": 11.682613518361826, + "learning_rate": 4.960223344472099e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.3999999940395355, + "step": 106180 + }, + { + "epoch": 0.10695065977331718, + "grad_norm": 11.92787282939523, + "learning_rate": 4.960216329460116e-05, + "loss": 2.6219, + "mean_token_accuracy": 0.3655172407627106, + "step": 106185 + }, + { + "epoch": 0.10695569582642135, + "grad_norm": 8.813097699009763, + "learning_rate": 4.9602093138351216e-05, + "loss": 2.547, + "mean_token_accuracy": 0.3827586084604263, + "step": 106190 + }, + { + "epoch": 0.10696073187952553, + "grad_norm": 9.720737737116847, + "learning_rate": 4.9602022975971166e-05, + "loss": 2.6435, + "mean_token_accuracy": 0.3827586233615875, + "step": 106195 + }, + { + "epoch": 0.1069657679326297, + "grad_norm": 9.203666714240121, + "learning_rate": 4.9601952807461044e-05, + "loss": 2.347, + "mean_token_accuracy": 0.3896551728248596, + "step": 106200 + }, + { + "epoch": 0.10697080398573387, + "grad_norm": 10.551161485115642, + "learning_rate": 4.960188263282087e-05, + "loss": 2.3377, + "mean_token_accuracy": 0.4620689570903778, + "step": 106205 + }, + { + "epoch": 0.10697584003883805, + "grad_norm": 12.18837055252694, + "learning_rate": 4.9601812452050645e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.42068966031074523, + "step": 106210 + }, + { + "epoch": 0.10698087609194222, + "grad_norm": 10.647184005148105, + "learning_rate": 4.96017422651504e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4206896543502808, + "step": 106215 + }, + { + "epoch": 0.1069859121450464, + "grad_norm": 10.591931149445683, + "learning_rate": 4.960167207212016e-05, + "loss": 2.0449, + "mean_token_accuracy": 0.4915305435657501, + "step": 106220 + }, + { + "epoch": 0.10699094819815055, + "grad_norm": 12.62236676550205, + "learning_rate": 4.9601601872959934e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.4689655125141144, + "step": 106225 + }, + { + "epoch": 0.10699598425125473, + "grad_norm": 9.151261576963176, + "learning_rate": 4.960153166766975e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.36896551847457887, + "step": 106230 + }, + { + "epoch": 0.1070010203043589, + "grad_norm": 11.335415882908068, + "learning_rate": 4.960146145624962e-05, + "loss": 2.5267, + "mean_token_accuracy": 0.41034482717514037, + "step": 106235 + }, + { + "epoch": 0.10700605635746308, + "grad_norm": 12.610846103990081, + "learning_rate": 4.9601391238699574e-05, + "loss": 2.7983, + "mean_token_accuracy": 0.3724137842655182, + "step": 106240 + }, + { + "epoch": 0.10701109241056725, + "grad_norm": 9.512504353904053, + "learning_rate": 4.9601321015019615e-05, + "loss": 2.5021, + "mean_token_accuracy": 0.42758620381355283, + "step": 106245 + }, + { + "epoch": 0.10701612846367142, + "grad_norm": 11.262034357659095, + "learning_rate": 4.960125078520977e-05, + "loss": 2.2957, + "mean_token_accuracy": 0.4379310369491577, + "step": 106250 + }, + { + "epoch": 0.1070211645167756, + "grad_norm": 10.281610013330932, + "learning_rate": 4.9601180549270076e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.47586206793785096, + "step": 106255 + }, + { + "epoch": 0.10702620056987977, + "grad_norm": 10.632703885607167, + "learning_rate": 4.960111030720053e-05, + "loss": 2.1376, + "mean_token_accuracy": 0.4954023063182831, + "step": 106260 + }, + { + "epoch": 0.10703123662298394, + "grad_norm": 13.97067586289822, + "learning_rate": 4.9601040059001155e-05, + "loss": 2.4335, + "mean_token_accuracy": 0.4448275864124298, + "step": 106265 + }, + { + "epoch": 0.10703627267608812, + "grad_norm": 12.21876480455352, + "learning_rate": 4.960096980467198e-05, + "loss": 2.7622, + "mean_token_accuracy": 0.3999999940395355, + "step": 106270 + }, + { + "epoch": 0.10704130872919229, + "grad_norm": 11.958643687740581, + "learning_rate": 4.9600899544213016e-05, + "loss": 2.4066, + "mean_token_accuracy": 0.4502117395401001, + "step": 106275 + }, + { + "epoch": 0.10704634478229647, + "grad_norm": 12.505872551999165, + "learning_rate": 4.960082927762428e-05, + "loss": 2.4707, + "mean_token_accuracy": 0.4068965554237366, + "step": 106280 + }, + { + "epoch": 0.10705138083540064, + "grad_norm": 12.86858903315689, + "learning_rate": 4.960075900490581e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.41034482717514037, + "step": 106285 + }, + { + "epoch": 0.1070564168885048, + "grad_norm": 9.864573964343156, + "learning_rate": 4.9600688726057596e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.4103448212146759, + "step": 106290 + }, + { + "epoch": 0.10706145294160897, + "grad_norm": 11.333264249833642, + "learning_rate": 4.960061844107968e-05, + "loss": 2.484, + "mean_token_accuracy": 0.38275861740112305, + "step": 106295 + }, + { + "epoch": 0.10706648899471315, + "grad_norm": 11.846136773835408, + "learning_rate": 4.960054814997207e-05, + "loss": 2.6541, + "mean_token_accuracy": 0.3827586233615875, + "step": 106300 + }, + { + "epoch": 0.10707152504781732, + "grad_norm": 10.375181675040198, + "learning_rate": 4.96004778527348e-05, + "loss": 2.45, + "mean_token_accuracy": 0.45710828304290774, + "step": 106305 + }, + { + "epoch": 0.1070765611009215, + "grad_norm": 12.30809107495536, + "learning_rate": 4.9600407549367876e-05, + "loss": 2.8306, + "mean_token_accuracy": 0.37586207389831544, + "step": 106310 + }, + { + "epoch": 0.10708159715402567, + "grad_norm": 8.901571025778285, + "learning_rate": 4.960033723987132e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.36206896007061007, + "step": 106315 + }, + { + "epoch": 0.10708663320712984, + "grad_norm": 11.254247772851109, + "learning_rate": 4.960026692424516e-05, + "loss": 2.3727, + "mean_token_accuracy": 0.4103448331356049, + "step": 106320 + }, + { + "epoch": 0.10709166926023402, + "grad_norm": 10.602474859357526, + "learning_rate": 4.9600196602489395e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.3827586233615875, + "step": 106325 + }, + { + "epoch": 0.10709670531333819, + "grad_norm": 13.974560603666632, + "learning_rate": 4.9600126274604074e-05, + "loss": 2.4257, + "mean_token_accuracy": 0.44482758045196535, + "step": 106330 + }, + { + "epoch": 0.10710174136644236, + "grad_norm": 8.092058920024067, + "learning_rate": 4.960005594058919e-05, + "loss": 2.0517, + "mean_token_accuracy": 0.5137931048870087, + "step": 106335 + }, + { + "epoch": 0.10710677741954654, + "grad_norm": 9.468858471077091, + "learning_rate": 4.9599985600444775e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.4931034505367279, + "step": 106340 + }, + { + "epoch": 0.10711181347265071, + "grad_norm": 6.812811816305926, + "learning_rate": 4.959991525417085e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.4701567471027374, + "step": 106345 + }, + { + "epoch": 0.10711684952575488, + "grad_norm": 9.989062824667348, + "learning_rate": 4.959984490176742e-05, + "loss": 2.5022, + "mean_token_accuracy": 0.41379311084747317, + "step": 106350 + }, + { + "epoch": 0.10712188557885906, + "grad_norm": 9.576582406198384, + "learning_rate": 4.959977454323453e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.4275861978530884, + "step": 106355 + }, + { + "epoch": 0.10712692163196322, + "grad_norm": 10.156951963421292, + "learning_rate": 4.9599704178572173e-05, + "loss": 2.148, + "mean_token_accuracy": 0.47241378426551817, + "step": 106360 + }, + { + "epoch": 0.10713195768506739, + "grad_norm": 15.215475566162258, + "learning_rate": 4.959963380778039e-05, + "loss": 2.8452, + "mean_token_accuracy": 0.38275861740112305, + "step": 106365 + }, + { + "epoch": 0.10713699373817157, + "grad_norm": 12.155713386710275, + "learning_rate": 4.959956343085919e-05, + "loss": 2.6524, + "mean_token_accuracy": 0.39122806787490844, + "step": 106370 + }, + { + "epoch": 0.10714202979127574, + "grad_norm": 10.922229977426966, + "learning_rate": 4.959949304780859e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.4034482777118683, + "step": 106375 + }, + { + "epoch": 0.10714706584437991, + "grad_norm": 12.981070557993153, + "learning_rate": 4.959942265862861e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.4172413766384125, + "step": 106380 + }, + { + "epoch": 0.10715210189748409, + "grad_norm": 13.378562793767566, + "learning_rate": 4.959935226331928e-05, + "loss": 2.3787, + "mean_token_accuracy": 0.3931034505367279, + "step": 106385 + }, + { + "epoch": 0.10715713795058826, + "grad_norm": 9.97687030117511, + "learning_rate": 4.959928186188061e-05, + "loss": 2.5343, + "mean_token_accuracy": 0.4, + "step": 106390 + }, + { + "epoch": 0.10716217400369243, + "grad_norm": 9.040406108841525, + "learning_rate": 4.959921145431262e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.4965517222881317, + "step": 106395 + }, + { + "epoch": 0.10716721005679661, + "grad_norm": 10.305001881807378, + "learning_rate": 4.9599141040615335e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.4673926174640656, + "step": 106400 + }, + { + "epoch": 0.10717224610990078, + "grad_norm": 9.437873331732726, + "learning_rate": 4.959907062078877e-05, + "loss": 2.2606, + "mean_token_accuracy": 0.44827587008476255, + "step": 106405 + }, + { + "epoch": 0.10717728216300496, + "grad_norm": 11.109560962306661, + "learning_rate": 4.9599000194832944e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.39310344457626345, + "step": 106410 + }, + { + "epoch": 0.10718231821610913, + "grad_norm": 9.300760891524575, + "learning_rate": 4.959892976274788e-05, + "loss": 2.463, + "mean_token_accuracy": 0.43448275327682495, + "step": 106415 + }, + { + "epoch": 0.1071873542692133, + "grad_norm": 10.85006747763822, + "learning_rate": 4.9598859324533584e-05, + "loss": 2.4694, + "mean_token_accuracy": 0.4430732011795044, + "step": 106420 + }, + { + "epoch": 0.10719239032231748, + "grad_norm": 12.325406028623624, + "learning_rate": 4.9598788880190104e-05, + "loss": 2.8229, + "mean_token_accuracy": 0.42758620381355283, + "step": 106425 + }, + { + "epoch": 0.10719742637542164, + "grad_norm": 10.177527305236834, + "learning_rate": 4.9598718429717425e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.4572897732257843, + "step": 106430 + }, + { + "epoch": 0.10720246242852581, + "grad_norm": 9.957698541120218, + "learning_rate": 4.95986479731156e-05, + "loss": 2.47, + "mean_token_accuracy": 0.4206896543502808, + "step": 106435 + }, + { + "epoch": 0.10720749848162998, + "grad_norm": 13.722865477115791, + "learning_rate": 4.959857751038462e-05, + "loss": 2.8377, + "mean_token_accuracy": 0.4068965494632721, + "step": 106440 + }, + { + "epoch": 0.10721253453473416, + "grad_norm": 10.862217325426737, + "learning_rate": 4.9598507041524525e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.4103448331356049, + "step": 106445 + }, + { + "epoch": 0.10721757058783833, + "grad_norm": 10.783111750898163, + "learning_rate": 4.9598436566535325e-05, + "loss": 2.0976, + "mean_token_accuracy": 0.5156684756278992, + "step": 106450 + }, + { + "epoch": 0.1072226066409425, + "grad_norm": 9.586846385596319, + "learning_rate": 4.9598366085417035e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.4965517222881317, + "step": 106455 + }, + { + "epoch": 0.10722764269404668, + "grad_norm": 10.344148331896527, + "learning_rate": 4.959829559816969e-05, + "loss": 2.387, + "mean_token_accuracy": 0.4068965554237366, + "step": 106460 + }, + { + "epoch": 0.10723267874715085, + "grad_norm": 12.102759267611043, + "learning_rate": 4.95982251047933e-05, + "loss": 2.648, + "mean_token_accuracy": 0.36896551847457887, + "step": 106465 + }, + { + "epoch": 0.10723771480025503, + "grad_norm": 16.544150644709934, + "learning_rate": 4.959815460528788e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.4551724135875702, + "step": 106470 + }, + { + "epoch": 0.1072427508533592, + "grad_norm": 11.737588974468213, + "learning_rate": 4.959808409965346e-05, + "loss": 2.2859, + "mean_token_accuracy": 0.41034482717514037, + "step": 106475 + }, + { + "epoch": 0.10724778690646337, + "grad_norm": 9.432847999561684, + "learning_rate": 4.9598013587890045e-05, + "loss": 2.8732, + "mean_token_accuracy": 0.3862069010734558, + "step": 106480 + }, + { + "epoch": 0.10725282295956755, + "grad_norm": 10.15694550816408, + "learning_rate": 4.959794306999767e-05, + "loss": 2.4563, + "mean_token_accuracy": 0.42068966031074523, + "step": 106485 + }, + { + "epoch": 0.10725785901267172, + "grad_norm": 12.170937370608891, + "learning_rate": 4.959787254597634e-05, + "loss": 2.2018, + "mean_token_accuracy": 0.43448275327682495, + "step": 106490 + }, + { + "epoch": 0.1072628950657759, + "grad_norm": 12.104171816278498, + "learning_rate": 4.959780201582609e-05, + "loss": 2.9951, + "mean_token_accuracy": 0.39310345649719236, + "step": 106495 + }, + { + "epoch": 0.10726793111888006, + "grad_norm": 8.904005847271538, + "learning_rate": 4.959773147954693e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.5137931108474731, + "step": 106500 + }, + { + "epoch": 0.10727296717198423, + "grad_norm": 10.590525083425986, + "learning_rate": 4.959766093713889e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.4137930989265442, + "step": 106505 + }, + { + "epoch": 0.1072780032250884, + "grad_norm": 10.64108891856517, + "learning_rate": 4.959759038860197e-05, + "loss": 2.5243, + "mean_token_accuracy": 0.4172413766384125, + "step": 106510 + }, + { + "epoch": 0.10728303927819258, + "grad_norm": 13.137321110980233, + "learning_rate": 4.9597519833936205e-05, + "loss": 2.1885, + "mean_token_accuracy": 0.441379314661026, + "step": 106515 + }, + { + "epoch": 0.10728807533129675, + "grad_norm": 10.62191902858775, + "learning_rate": 4.959744927314161e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.3793103456497192, + "step": 106520 + }, + { + "epoch": 0.10729311138440092, + "grad_norm": 12.031802667360553, + "learning_rate": 4.9597378706218206e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.3793103456497192, + "step": 106525 + }, + { + "epoch": 0.1072981474375051, + "grad_norm": 10.344008858056721, + "learning_rate": 4.9597308133166016e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.38965516686439516, + "step": 106530 + }, + { + "epoch": 0.10730318349060927, + "grad_norm": 23.621930592021844, + "learning_rate": 4.959723755398504e-05, + "loss": 2.6777, + "mean_token_accuracy": 0.34137930274009703, + "step": 106535 + }, + { + "epoch": 0.10730821954371345, + "grad_norm": 8.125423020146227, + "learning_rate": 4.959716696867532e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.43448275327682495, + "step": 106540 + }, + { + "epoch": 0.10731325559681762, + "grad_norm": 7.905561744551934, + "learning_rate": 4.959709637723688e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.42413792610168455, + "step": 106545 + }, + { + "epoch": 0.1073182916499218, + "grad_norm": 10.076402592319168, + "learning_rate": 4.959702577966972e-05, + "loss": 2.7255, + "mean_token_accuracy": 0.33103448152542114, + "step": 106550 + }, + { + "epoch": 0.10732332770302597, + "grad_norm": 12.837750551964897, + "learning_rate": 4.959695517597387e-05, + "loss": 2.2592, + "mean_token_accuracy": 0.45517241954803467, + "step": 106555 + }, + { + "epoch": 0.10732836375613014, + "grad_norm": 13.091453100239086, + "learning_rate": 4.959688456614934e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.382758629322052, + "step": 106560 + }, + { + "epoch": 0.10733339980923431, + "grad_norm": 12.032691621125032, + "learning_rate": 4.959681395019616e-05, + "loss": 2.7772, + "mean_token_accuracy": 0.3137931048870087, + "step": 106565 + }, + { + "epoch": 0.10733843586233847, + "grad_norm": 13.330493760758372, + "learning_rate": 4.959674332811434e-05, + "loss": 2.609, + "mean_token_accuracy": 0.4533575356006622, + "step": 106570 + }, + { + "epoch": 0.10734347191544265, + "grad_norm": 13.666181170991402, + "learning_rate": 4.959667269990391e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.4344827651977539, + "step": 106575 + }, + { + "epoch": 0.10734850796854682, + "grad_norm": 10.696599630127945, + "learning_rate": 4.9596602065564895e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.4551724076271057, + "step": 106580 + }, + { + "epoch": 0.107353544021651, + "grad_norm": 11.217610448709276, + "learning_rate": 4.9596531425097305e-05, + "loss": 2.3688, + "mean_token_accuracy": 0.4034482777118683, + "step": 106585 + }, + { + "epoch": 0.10735858007475517, + "grad_norm": 9.85562931862014, + "learning_rate": 4.959646077850115e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4599753737449646, + "step": 106590 + }, + { + "epoch": 0.10736361612785934, + "grad_norm": 12.895972277728696, + "learning_rate": 4.959639012577646e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.4413793087005615, + "step": 106595 + }, + { + "epoch": 0.10736865218096352, + "grad_norm": 10.924053609702735, + "learning_rate": 4.959631946692325e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.4172413766384125, + "step": 106600 + }, + { + "epoch": 0.10737368823406769, + "grad_norm": 10.733186332823689, + "learning_rate": 4.959624880194155e-05, + "loss": 2.3178, + "mean_token_accuracy": 0.4034482717514038, + "step": 106605 + }, + { + "epoch": 0.10737872428717186, + "grad_norm": 13.530467681064485, + "learning_rate": 4.9596178130831365e-05, + "loss": 2.3154, + "mean_token_accuracy": 0.45650331377983094, + "step": 106610 + }, + { + "epoch": 0.10738376034027604, + "grad_norm": 14.403014485381881, + "learning_rate": 4.959610745359274e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.46551724672317507, + "step": 106615 + }, + { + "epoch": 0.10738879639338021, + "grad_norm": 10.393808097004916, + "learning_rate": 4.959603677022566e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4379310429096222, + "step": 106620 + }, + { + "epoch": 0.10739383244648439, + "grad_norm": 12.35202844295194, + "learning_rate": 4.959596608073017e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.4310344815254211, + "step": 106625 + }, + { + "epoch": 0.10739886849958856, + "grad_norm": 11.293775159140248, + "learning_rate": 4.959589538510628e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.41379310488700866, + "step": 106630 + }, + { + "epoch": 0.10740390455269273, + "grad_norm": 11.732765882356322, + "learning_rate": 4.959582468335401e-05, + "loss": 2.7593, + "mean_token_accuracy": 0.36896551847457887, + "step": 106635 + }, + { + "epoch": 0.1074089406057969, + "grad_norm": 9.468588879874773, + "learning_rate": 4.959575397547338e-05, + "loss": 2.5064, + "mean_token_accuracy": 0.403448274731636, + "step": 106640 + }, + { + "epoch": 0.10741397665890107, + "grad_norm": 11.944422244346457, + "learning_rate": 4.959568326146442e-05, + "loss": 2.3013, + "mean_token_accuracy": 0.44482759237289426, + "step": 106645 + }, + { + "epoch": 0.10741901271200524, + "grad_norm": 11.017043266183217, + "learning_rate": 4.9595612541327125e-05, + "loss": 2.7699, + "mean_token_accuracy": 0.3862069010734558, + "step": 106650 + }, + { + "epoch": 0.10742404876510941, + "grad_norm": 10.449653059428352, + "learning_rate": 4.9595541815061535e-05, + "loss": 2.563, + "mean_token_accuracy": 0.37586206793785093, + "step": 106655 + }, + { + "epoch": 0.10742908481821359, + "grad_norm": 9.994158753533286, + "learning_rate": 4.959547108266766e-05, + "loss": 2.5532, + "mean_token_accuracy": 0.37586206793785093, + "step": 106660 + }, + { + "epoch": 0.10743412087131776, + "grad_norm": 10.48592145820035, + "learning_rate": 4.959540034414553e-05, + "loss": 2.2758, + "mean_token_accuracy": 0.38275861740112305, + "step": 106665 + }, + { + "epoch": 0.10743915692442194, + "grad_norm": 10.062621367223123, + "learning_rate": 4.959532959949515e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.4517241299152374, + "step": 106670 + }, + { + "epoch": 0.10744419297752611, + "grad_norm": 11.223274517915318, + "learning_rate": 4.959525884871656e-05, + "loss": 2.8167, + "mean_token_accuracy": 0.39310344457626345, + "step": 106675 + }, + { + "epoch": 0.10744922903063028, + "grad_norm": 10.011428549098868, + "learning_rate": 4.959518809180976e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.4551724135875702, + "step": 106680 + }, + { + "epoch": 0.10745426508373446, + "grad_norm": 10.646608262919381, + "learning_rate": 4.9595117328774783e-05, + "loss": 2.1735, + "mean_token_accuracy": 0.4310344815254211, + "step": 106685 + }, + { + "epoch": 0.10745930113683863, + "grad_norm": 10.927723365878006, + "learning_rate": 4.959504655961164e-05, + "loss": 2.6942, + "mean_token_accuracy": 0.39655172228813174, + "step": 106690 + }, + { + "epoch": 0.1074643371899428, + "grad_norm": 9.848513366683308, + "learning_rate": 4.959497578432035e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.4413793087005615, + "step": 106695 + }, + { + "epoch": 0.10746937324304698, + "grad_norm": 9.745369565534425, + "learning_rate": 4.959490500290094e-05, + "loss": 2.3078, + "mean_token_accuracy": 0.43103448748588563, + "step": 106700 + }, + { + "epoch": 0.10747440929615115, + "grad_norm": 10.453926616018494, + "learning_rate": 4.959483421535342e-05, + "loss": 2.2058, + "mean_token_accuracy": 0.4517241299152374, + "step": 106705 + }, + { + "epoch": 0.10747944534925531, + "grad_norm": 12.161459202232635, + "learning_rate": 4.959476342167782e-05, + "loss": 2.3153, + "mean_token_accuracy": 0.43103448748588563, + "step": 106710 + }, + { + "epoch": 0.10748448140235949, + "grad_norm": 9.649767345890206, + "learning_rate": 4.9594692621874154e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.38620689511299133, + "step": 106715 + }, + { + "epoch": 0.10748951745546366, + "grad_norm": 12.886503610995915, + "learning_rate": 4.959462181594245e-05, + "loss": 2.1258, + "mean_token_accuracy": 0.49655171632766726, + "step": 106720 + }, + { + "epoch": 0.10749455350856783, + "grad_norm": 11.555338059611536, + "learning_rate": 4.9594551003882705e-05, + "loss": 2.6321, + "mean_token_accuracy": 0.37931033968925476, + "step": 106725 + }, + { + "epoch": 0.10749958956167201, + "grad_norm": 9.640672181942858, + "learning_rate": 4.959448018569497e-05, + "loss": 2.7943, + "mean_token_accuracy": 0.42413793206214906, + "step": 106730 + }, + { + "epoch": 0.10750462561477618, + "grad_norm": 11.32226200521982, + "learning_rate": 4.959440936137923e-05, + "loss": 2.6046, + "mean_token_accuracy": 0.435571676492691, + "step": 106735 + }, + { + "epoch": 0.10750966166788035, + "grad_norm": 10.195056195676251, + "learning_rate": 4.9594338530935536e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.4137930989265442, + "step": 106740 + }, + { + "epoch": 0.10751469772098453, + "grad_norm": 9.98543662903377, + "learning_rate": 4.9594267694363895e-05, + "loss": 2.9484, + "mean_token_accuracy": 0.36896551847457887, + "step": 106745 + }, + { + "epoch": 0.1075197337740887, + "grad_norm": 10.326730138561052, + "learning_rate": 4.959419685166432e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.41379310488700866, + "step": 106750 + }, + { + "epoch": 0.10752476982719288, + "grad_norm": 11.294552537674972, + "learning_rate": 4.959412600283685e-05, + "loss": 2.1495, + "mean_token_accuracy": 0.47241378426551817, + "step": 106755 + }, + { + "epoch": 0.10752980588029705, + "grad_norm": 12.551294045295297, + "learning_rate": 4.9594055147881473e-05, + "loss": 2.8227, + "mean_token_accuracy": 0.3724137872457504, + "step": 106760 + }, + { + "epoch": 0.10753484193340122, + "grad_norm": 10.354612437864445, + "learning_rate": 4.959398428679824e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.3965517163276672, + "step": 106765 + }, + { + "epoch": 0.1075398779865054, + "grad_norm": 12.029865720983064, + "learning_rate": 4.959391341958715e-05, + "loss": 2.6199, + "mean_token_accuracy": 0.42758620977401735, + "step": 106770 + }, + { + "epoch": 0.10754491403960957, + "grad_norm": 9.381046228614197, + "learning_rate": 4.9593842546248235e-05, + "loss": 2.3842, + "mean_token_accuracy": 0.4310344815254211, + "step": 106775 + }, + { + "epoch": 0.10754995009271373, + "grad_norm": 9.958304397898976, + "learning_rate": 4.959377166678152e-05, + "loss": 2.1766, + "mean_token_accuracy": 0.4379310369491577, + "step": 106780 + }, + { + "epoch": 0.1075549861458179, + "grad_norm": 14.621644808857548, + "learning_rate": 4.9593700781187004e-05, + "loss": 2.907, + "mean_token_accuracy": 0.3655172407627106, + "step": 106785 + }, + { + "epoch": 0.10756002219892208, + "grad_norm": 39.435746885334055, + "learning_rate": 4.9593629889464715e-05, + "loss": 3.0036, + "mean_token_accuracy": 0.4, + "step": 106790 + }, + { + "epoch": 0.10756505825202625, + "grad_norm": 12.416538995700153, + "learning_rate": 4.959355899161468e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.4310344815254211, + "step": 106795 + }, + { + "epoch": 0.10757009430513043, + "grad_norm": 10.089464802630003, + "learning_rate": 4.9593488087636915e-05, + "loss": 2.1758, + "mean_token_accuracy": 0.458620685338974, + "step": 106800 + }, + { + "epoch": 0.1075751303582346, + "grad_norm": 10.988388477522694, + "learning_rate": 4.9593417177531436e-05, + "loss": 2.6094, + "mean_token_accuracy": 0.41379310488700866, + "step": 106805 + }, + { + "epoch": 0.10758016641133877, + "grad_norm": 10.302927322332708, + "learning_rate": 4.959334626129827e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.4068965554237366, + "step": 106810 + }, + { + "epoch": 0.10758520246444295, + "grad_norm": 10.84781891306715, + "learning_rate": 4.9593275338937425e-05, + "loss": 2.2994, + "mean_token_accuracy": 0.4034482777118683, + "step": 106815 + }, + { + "epoch": 0.10759023851754712, + "grad_norm": 10.649820333278269, + "learning_rate": 4.9593204410448926e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.4689655125141144, + "step": 106820 + }, + { + "epoch": 0.1075952745706513, + "grad_norm": 10.733918534391444, + "learning_rate": 4.959313347583281e-05, + "loss": 2.2193, + "mean_token_accuracy": 0.4517241358757019, + "step": 106825 + }, + { + "epoch": 0.10760031062375547, + "grad_norm": 10.886491885111527, + "learning_rate": 4.959306253508907e-05, + "loss": 2.4809, + "mean_token_accuracy": 0.41034482717514037, + "step": 106830 + }, + { + "epoch": 0.10760534667685964, + "grad_norm": 13.075971436146078, + "learning_rate": 4.9592991588217734e-05, + "loss": 2.9847, + "mean_token_accuracy": 0.4172413766384125, + "step": 106835 + }, + { + "epoch": 0.10761038272996382, + "grad_norm": 11.286498635031593, + "learning_rate": 4.9592920635218825e-05, + "loss": 2.1673, + "mean_token_accuracy": 0.4896551728248596, + "step": 106840 + }, + { + "epoch": 0.10761541878306799, + "grad_norm": 9.85853254442693, + "learning_rate": 4.959284967609237e-05, + "loss": 2.0912, + "mean_token_accuracy": 0.42710224390029905, + "step": 106845 + }, + { + "epoch": 0.10762045483617215, + "grad_norm": 10.54336939385977, + "learning_rate": 4.959277871083838e-05, + "loss": 2.452, + "mean_token_accuracy": 0.4310344815254211, + "step": 106850 + }, + { + "epoch": 0.10762549088927632, + "grad_norm": 13.397413412069337, + "learning_rate": 4.9592707739456866e-05, + "loss": 2.6721, + "mean_token_accuracy": 0.324137932062149, + "step": 106855 + }, + { + "epoch": 0.1076305269423805, + "grad_norm": 11.608632425523028, + "learning_rate": 4.9592636761947866e-05, + "loss": 2.5292, + "mean_token_accuracy": 0.42758620977401735, + "step": 106860 + }, + { + "epoch": 0.10763556299548467, + "grad_norm": 11.9895769676642, + "learning_rate": 4.959256577831138e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.36206896007061007, + "step": 106865 + }, + { + "epoch": 0.10764059904858884, + "grad_norm": 14.227945077870011, + "learning_rate": 4.959249478854745e-05, + "loss": 2.3987, + "mean_token_accuracy": 0.39655172228813174, + "step": 106870 + }, + { + "epoch": 0.10764563510169302, + "grad_norm": 9.415251898660706, + "learning_rate": 4.959242379265608e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.4551724076271057, + "step": 106875 + }, + { + "epoch": 0.10765067115479719, + "grad_norm": 12.354147006948123, + "learning_rate": 4.959235279063729e-05, + "loss": 2.1985, + "mean_token_accuracy": 0.4655172348022461, + "step": 106880 + }, + { + "epoch": 0.10765570720790137, + "grad_norm": 11.299348128399007, + "learning_rate": 4.959228178249111e-05, + "loss": 2.7676, + "mean_token_accuracy": 0.3620689630508423, + "step": 106885 + }, + { + "epoch": 0.10766074326100554, + "grad_norm": 10.16039787844633, + "learning_rate": 4.959221076821755e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.38620689511299133, + "step": 106890 + }, + { + "epoch": 0.10766577931410971, + "grad_norm": 11.012813586769083, + "learning_rate": 4.959213974781663e-05, + "loss": 2.3132, + "mean_token_accuracy": 0.4448275864124298, + "step": 106895 + }, + { + "epoch": 0.10767081536721389, + "grad_norm": 11.268510508450252, + "learning_rate": 4.9592068721288373e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.39655172228813174, + "step": 106900 + }, + { + "epoch": 0.10767585142031806, + "grad_norm": 9.226826916605239, + "learning_rate": 4.9591997688632806e-05, + "loss": 2.0827, + "mean_token_accuracy": 0.4569267988204956, + "step": 106905 + }, + { + "epoch": 0.10768088747342223, + "grad_norm": 12.642494166586225, + "learning_rate": 4.959192664984993e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.4379310250282288, + "step": 106910 + }, + { + "epoch": 0.10768592352652641, + "grad_norm": 11.138630939629552, + "learning_rate": 4.959185560493979e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.43581366539001465, + "step": 106915 + }, + { + "epoch": 0.10769095957963057, + "grad_norm": 9.700827587130128, + "learning_rate": 4.959178455390239e-05, + "loss": 2.5871, + "mean_token_accuracy": 0.4000000059604645, + "step": 106920 + }, + { + "epoch": 0.10769599563273474, + "grad_norm": 11.045386543859502, + "learning_rate": 4.959171349673773e-05, + "loss": 2.2538, + "mean_token_accuracy": 0.42413792610168455, + "step": 106925 + }, + { + "epoch": 0.10770103168583892, + "grad_norm": 11.94376585504857, + "learning_rate": 4.959164243344587e-05, + "loss": 2.5539, + "mean_token_accuracy": 0.4137930989265442, + "step": 106930 + }, + { + "epoch": 0.10770606773894309, + "grad_norm": 10.980915290607202, + "learning_rate": 4.9591571364026804e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.41379310488700866, + "step": 106935 + }, + { + "epoch": 0.10771110379204726, + "grad_norm": 11.535089263460891, + "learning_rate": 4.9591500288480565e-05, + "loss": 2.3859, + "mean_token_accuracy": 0.4137930989265442, + "step": 106940 + }, + { + "epoch": 0.10771613984515144, + "grad_norm": 11.110720294691056, + "learning_rate": 4.959142920680716e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.4534180283546448, + "step": 106945 + }, + { + "epoch": 0.10772117589825561, + "grad_norm": 14.82000985595911, + "learning_rate": 4.959135811900662e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.38620689809322356, + "step": 106950 + }, + { + "epoch": 0.10772621195135978, + "grad_norm": 10.571544165444235, + "learning_rate": 4.959128702507895e-05, + "loss": 2.2071, + "mean_token_accuracy": 0.48620688915252686, + "step": 106955 + }, + { + "epoch": 0.10773124800446396, + "grad_norm": 12.637791143581376, + "learning_rate": 4.9591215925024185e-05, + "loss": 2.5277, + "mean_token_accuracy": 0.42413793206214906, + "step": 106960 + }, + { + "epoch": 0.10773628405756813, + "grad_norm": 14.220807513351382, + "learning_rate": 4.9591144818842336e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.41379310488700866, + "step": 106965 + }, + { + "epoch": 0.1077413201106723, + "grad_norm": 11.702181506459732, + "learning_rate": 4.9591073706533424e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.443254691362381, + "step": 106970 + }, + { + "epoch": 0.10774635616377648, + "grad_norm": 10.326327762732863, + "learning_rate": 4.959100258809748e-05, + "loss": 2.1686, + "mean_token_accuracy": 0.4310344815254211, + "step": 106975 + }, + { + "epoch": 0.10775139221688065, + "grad_norm": 11.336737911831042, + "learning_rate": 4.959093146353451e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.36896551251411436, + "step": 106980 + }, + { + "epoch": 0.10775642826998483, + "grad_norm": 9.72767969604264, + "learning_rate": 4.9590860332844535e-05, + "loss": 2.5936, + "mean_token_accuracy": 0.39310344457626345, + "step": 106985 + }, + { + "epoch": 0.10776146432308899, + "grad_norm": 8.205227644145948, + "learning_rate": 4.959078919602758e-05, + "loss": 2.4432, + "mean_token_accuracy": 0.42413793206214906, + "step": 106990 + }, + { + "epoch": 0.10776650037619316, + "grad_norm": 10.508058166432223, + "learning_rate": 4.959071805308366e-05, + "loss": 2.1863, + "mean_token_accuracy": 0.46551724076271056, + "step": 106995 + }, + { + "epoch": 0.10777153642929733, + "grad_norm": 10.408608414489235, + "learning_rate": 4.959064690401279e-05, + "loss": 2.4637, + "mean_token_accuracy": 0.4083484590053558, + "step": 107000 + }, + { + "epoch": 0.10777657248240151, + "grad_norm": 12.599113631693305, + "learning_rate": 4.959057574881501e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.3827586233615875, + "step": 107005 + }, + { + "epoch": 0.10778160853550568, + "grad_norm": 10.046104973558526, + "learning_rate": 4.959050458749032e-05, + "loss": 2.2215, + "mean_token_accuracy": 0.4551724255084991, + "step": 107010 + }, + { + "epoch": 0.10778664458860986, + "grad_norm": 10.38936285245647, + "learning_rate": 4.959043342003875e-05, + "loss": 2.4585, + "mean_token_accuracy": 0.4413793087005615, + "step": 107015 + }, + { + "epoch": 0.10779168064171403, + "grad_norm": 11.887184611399931, + "learning_rate": 4.959036224646032e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.4379310369491577, + "step": 107020 + }, + { + "epoch": 0.1077967166948182, + "grad_norm": 12.150976664691568, + "learning_rate": 4.9590291066755026e-05, + "loss": 2.7771, + "mean_token_accuracy": 0.4034482777118683, + "step": 107025 + }, + { + "epoch": 0.10780175274792238, + "grad_norm": 15.485262959152017, + "learning_rate": 4.959021988092292e-05, + "loss": 2.726, + "mean_token_accuracy": 0.41379310488700866, + "step": 107030 + }, + { + "epoch": 0.10780678880102655, + "grad_norm": 9.956924656575037, + "learning_rate": 4.959014868896401e-05, + "loss": 2.0671, + "mean_token_accuracy": 0.458620685338974, + "step": 107035 + }, + { + "epoch": 0.10781182485413073, + "grad_norm": 9.516171518316742, + "learning_rate": 4.959007749087831e-05, + "loss": 3.0218, + "mean_token_accuracy": 0.3758620619773865, + "step": 107040 + }, + { + "epoch": 0.1078168609072349, + "grad_norm": 10.872518206226435, + "learning_rate": 4.9590006286665854e-05, + "loss": 2.521, + "mean_token_accuracy": 0.38620689511299133, + "step": 107045 + }, + { + "epoch": 0.10782189696033907, + "grad_norm": 11.21976720925354, + "learning_rate": 4.958993507632665e-05, + "loss": 2.6692, + "mean_token_accuracy": 0.38965516686439516, + "step": 107050 + }, + { + "epoch": 0.10782693301344325, + "grad_norm": 10.75634765566282, + "learning_rate": 4.9589863859860716e-05, + "loss": 2.2986, + "mean_token_accuracy": 0.38620689511299133, + "step": 107055 + }, + { + "epoch": 0.1078319690665474, + "grad_norm": 12.063245540241597, + "learning_rate": 4.9589792637268066e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.4068965554237366, + "step": 107060 + }, + { + "epoch": 0.10783700511965158, + "grad_norm": 9.748706292483627, + "learning_rate": 4.9589721408548746e-05, + "loss": 1.93, + "mean_token_accuracy": 0.4862069070339203, + "step": 107065 + }, + { + "epoch": 0.10784204117275575, + "grad_norm": 10.949890092092266, + "learning_rate": 4.9589650173702755e-05, + "loss": 2.4979, + "mean_token_accuracy": 0.38965516686439516, + "step": 107070 + }, + { + "epoch": 0.10784707722585993, + "grad_norm": 12.802860486650173, + "learning_rate": 4.958957893273011e-05, + "loss": 2.1315, + "mean_token_accuracy": 0.44482758045196535, + "step": 107075 + }, + { + "epoch": 0.1078521132789641, + "grad_norm": 16.689067428247053, + "learning_rate": 4.958950768563084e-05, + "loss": 2.508, + "mean_token_accuracy": 0.4103448212146759, + "step": 107080 + }, + { + "epoch": 0.10785714933206828, + "grad_norm": 9.142764262337046, + "learning_rate": 4.9589436432404975e-05, + "loss": 2.3, + "mean_token_accuracy": 0.44827585816383364, + "step": 107085 + }, + { + "epoch": 0.10786218538517245, + "grad_norm": 10.321886647706958, + "learning_rate": 4.958936517305251e-05, + "loss": 2.1765, + "mean_token_accuracy": 0.42068964838981626, + "step": 107090 + }, + { + "epoch": 0.10786722143827662, + "grad_norm": 10.576006120563074, + "learning_rate": 4.958929390757348e-05, + "loss": 2.3124, + "mean_token_accuracy": 0.44996975660324096, + "step": 107095 + }, + { + "epoch": 0.1078722574913808, + "grad_norm": 8.491517345020146, + "learning_rate": 4.9589222635967905e-05, + "loss": 2.3533, + "mean_token_accuracy": 0.46606169939041137, + "step": 107100 + }, + { + "epoch": 0.10787729354448497, + "grad_norm": 14.648448004805473, + "learning_rate": 4.9589151358235796e-05, + "loss": 2.5556, + "mean_token_accuracy": 0.45033273100852966, + "step": 107105 + }, + { + "epoch": 0.10788232959758914, + "grad_norm": 11.384283404655084, + "learning_rate": 4.9589080074377185e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4241379380226135, + "step": 107110 + }, + { + "epoch": 0.10788736565069332, + "grad_norm": 11.85457718177044, + "learning_rate": 4.958900878439208e-05, + "loss": 2.475, + "mean_token_accuracy": 0.3620689630508423, + "step": 107115 + }, + { + "epoch": 0.10789240170379749, + "grad_norm": 9.900306314179742, + "learning_rate": 4.958893748828051e-05, + "loss": 2.4336, + "mean_token_accuracy": 0.4472474277019501, + "step": 107120 + }, + { + "epoch": 0.10789743775690167, + "grad_norm": 11.402912262842616, + "learning_rate": 4.958886618604248e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.4551724135875702, + "step": 107125 + }, + { + "epoch": 0.10790247381000583, + "grad_norm": 10.47367821767516, + "learning_rate": 4.9588794877678036e-05, + "loss": 2.7329, + "mean_token_accuracy": 0.4344827592372894, + "step": 107130 + }, + { + "epoch": 0.10790750986311, + "grad_norm": 11.258819322469655, + "learning_rate": 4.9588723563187177e-05, + "loss": 3.357, + "mean_token_accuracy": 0.37241379618644715, + "step": 107135 + }, + { + "epoch": 0.10791254591621417, + "grad_norm": 12.559829967512895, + "learning_rate": 4.9588652242569925e-05, + "loss": 2.2833, + "mean_token_accuracy": 0.4517241299152374, + "step": 107140 + }, + { + "epoch": 0.10791758196931835, + "grad_norm": 15.834871708418579, + "learning_rate": 4.958858091582631e-05, + "loss": 2.9651, + "mean_token_accuracy": 0.37931033968925476, + "step": 107145 + }, + { + "epoch": 0.10792261802242252, + "grad_norm": 11.218242847107856, + "learning_rate": 4.958850958295633e-05, + "loss": 2.3789, + "mean_token_accuracy": 0.39655172228813174, + "step": 107150 + }, + { + "epoch": 0.1079276540755267, + "grad_norm": 9.559395743077209, + "learning_rate": 4.9588438243960036e-05, + "loss": 2.1937, + "mean_token_accuracy": 0.4344827592372894, + "step": 107155 + }, + { + "epoch": 0.10793269012863087, + "grad_norm": 10.598965202174151, + "learning_rate": 4.958836689883742e-05, + "loss": 2.5258, + "mean_token_accuracy": 0.44827585816383364, + "step": 107160 + }, + { + "epoch": 0.10793772618173504, + "grad_norm": 19.833252508455786, + "learning_rate": 4.958829554758852e-05, + "loss": 2.7702, + "mean_token_accuracy": 0.39310344457626345, + "step": 107165 + }, + { + "epoch": 0.10794276223483922, + "grad_norm": 11.90751503448679, + "learning_rate": 4.9588224190213344e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.39310344457626345, + "step": 107170 + }, + { + "epoch": 0.10794779828794339, + "grad_norm": 12.029558370933948, + "learning_rate": 4.958815282671191e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.46896552443504336, + "step": 107175 + }, + { + "epoch": 0.10795283434104756, + "grad_norm": 12.213706627105474, + "learning_rate": 4.9588081457084256e-05, + "loss": 2.9199, + "mean_token_accuracy": 0.3758620649576187, + "step": 107180 + }, + { + "epoch": 0.10795787039415174, + "grad_norm": 11.566061077418949, + "learning_rate": 4.95880100813304e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.41034482717514037, + "step": 107185 + }, + { + "epoch": 0.10796290644725591, + "grad_norm": 11.797199762362448, + "learning_rate": 4.9587938699450335e-05, + "loss": 2.6118, + "mean_token_accuracy": 0.39310345649719236, + "step": 107190 + }, + { + "epoch": 0.10796794250036008, + "grad_norm": 10.789797786022955, + "learning_rate": 4.95878673114441e-05, + "loss": 2.0403, + "mean_token_accuracy": 0.458620685338974, + "step": 107195 + }, + { + "epoch": 0.10797297855346424, + "grad_norm": 11.74311871053466, + "learning_rate": 4.958779591731171e-05, + "loss": 2.8219, + "mean_token_accuracy": 0.3879612863063812, + "step": 107200 + }, + { + "epoch": 0.10797801460656842, + "grad_norm": 11.680201625989499, + "learning_rate": 4.9587724517053196e-05, + "loss": 2.4864, + "mean_token_accuracy": 0.4034482717514038, + "step": 107205 + }, + { + "epoch": 0.10798305065967259, + "grad_norm": 11.068890728030484, + "learning_rate": 4.9587653110668566e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.4206896543502808, + "step": 107210 + }, + { + "epoch": 0.10798808671277677, + "grad_norm": 12.626547126885658, + "learning_rate": 4.9587581698157834e-05, + "loss": 2.4119, + "mean_token_accuracy": 0.4034482717514038, + "step": 107215 + }, + { + "epoch": 0.10799312276588094, + "grad_norm": 10.823577352195663, + "learning_rate": 4.958751027952104e-05, + "loss": 2.8991, + "mean_token_accuracy": 0.3793103456497192, + "step": 107220 + }, + { + "epoch": 0.10799815881898511, + "grad_norm": 10.272992506054853, + "learning_rate": 4.9587438854758196e-05, + "loss": 2.8285, + "mean_token_accuracy": 0.38777979016304015, + "step": 107225 + }, + { + "epoch": 0.10800319487208929, + "grad_norm": 10.346500315759233, + "learning_rate": 4.958736742386931e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.43793103098869324, + "step": 107230 + }, + { + "epoch": 0.10800823092519346, + "grad_norm": 12.922690792302639, + "learning_rate": 4.958729598685441e-05, + "loss": 2.5809, + "mean_token_accuracy": 0.41379310488700866, + "step": 107235 + }, + { + "epoch": 0.10801326697829763, + "grad_norm": 9.993981741315281, + "learning_rate": 4.9587224543713516e-05, + "loss": 2.1847, + "mean_token_accuracy": 0.4241379350423813, + "step": 107240 + }, + { + "epoch": 0.10801830303140181, + "grad_norm": 10.025685651026366, + "learning_rate": 4.958715309444665e-05, + "loss": 2.565, + "mean_token_accuracy": 0.4241379380226135, + "step": 107245 + }, + { + "epoch": 0.10802333908450598, + "grad_norm": 8.565518092204236, + "learning_rate": 4.958708163905383e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.475862056016922, + "step": 107250 + }, + { + "epoch": 0.10802837513761016, + "grad_norm": 11.040162681653403, + "learning_rate": 4.958701017753508e-05, + "loss": 2.341, + "mean_token_accuracy": 0.4137930989265442, + "step": 107255 + }, + { + "epoch": 0.10803341119071433, + "grad_norm": 10.464873015943722, + "learning_rate": 4.958693870989041e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.4931034505367279, + "step": 107260 + }, + { + "epoch": 0.1080384472438185, + "grad_norm": 9.44870375624461, + "learning_rate": 4.958686723611985e-05, + "loss": 2.6182, + "mean_token_accuracy": 0.44482758045196535, + "step": 107265 + }, + { + "epoch": 0.10804348329692266, + "grad_norm": 10.092844744653002, + "learning_rate": 4.95867957562234e-05, + "loss": 2.5479, + "mean_token_accuracy": 0.41379310488700866, + "step": 107270 + }, + { + "epoch": 0.10804851935002684, + "grad_norm": 10.377126181360559, + "learning_rate": 4.9586724270201115e-05, + "loss": 2.4641, + "mean_token_accuracy": 0.4241379380226135, + "step": 107275 + }, + { + "epoch": 0.10805355540313101, + "grad_norm": 11.093827759210939, + "learning_rate": 4.9586652778052976e-05, + "loss": 2.5624, + "mean_token_accuracy": 0.4, + "step": 107280 + }, + { + "epoch": 0.10805859145623518, + "grad_norm": 11.300982499222169, + "learning_rate": 4.958658127977904e-05, + "loss": 2.311, + "mean_token_accuracy": 0.4517241358757019, + "step": 107285 + }, + { + "epoch": 0.10806362750933936, + "grad_norm": 10.759340402018251, + "learning_rate": 4.9586509775379294e-05, + "loss": 2.9253, + "mean_token_accuracy": 0.3241379350423813, + "step": 107290 + }, + { + "epoch": 0.10806866356244353, + "grad_norm": 15.289329173842033, + "learning_rate": 4.958643826485377e-05, + "loss": 2.5459, + "mean_token_accuracy": 0.40689656138420105, + "step": 107295 + }, + { + "epoch": 0.1080736996155477, + "grad_norm": 10.64537646012742, + "learning_rate": 4.958636674820251e-05, + "loss": 2.3555, + "mean_token_accuracy": 0.4517241358757019, + "step": 107300 + }, + { + "epoch": 0.10807873566865188, + "grad_norm": 10.439646458196718, + "learning_rate": 4.9586295225425496e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.46400484442710876, + "step": 107305 + }, + { + "epoch": 0.10808377172175605, + "grad_norm": 10.373069338254593, + "learning_rate": 4.9586223696522774e-05, + "loss": 2.0092, + "mean_token_accuracy": 0.49458128213882446, + "step": 107310 + }, + { + "epoch": 0.10808880777486023, + "grad_norm": 10.22462487396539, + "learning_rate": 4.958615216149435e-05, + "loss": 2.1586, + "mean_token_accuracy": 0.4901477873325348, + "step": 107315 + }, + { + "epoch": 0.1080938438279644, + "grad_norm": 11.445289551072111, + "learning_rate": 4.9586080620340254e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.4310344815254211, + "step": 107320 + }, + { + "epoch": 0.10809887988106857, + "grad_norm": 11.589002846779666, + "learning_rate": 4.9586009073060495e-05, + "loss": 2.377, + "mean_token_accuracy": 0.42413792610168455, + "step": 107325 + }, + { + "epoch": 0.10810391593417275, + "grad_norm": 10.062132257376236, + "learning_rate": 4.95859375196551e-05, + "loss": 2.3485, + "mean_token_accuracy": 0.4068965554237366, + "step": 107330 + }, + { + "epoch": 0.10810895198727692, + "grad_norm": 9.46868935589262, + "learning_rate": 4.95858659601241e-05, + "loss": 2.8643, + "mean_token_accuracy": 0.4137930989265442, + "step": 107335 + }, + { + "epoch": 0.10811398804038108, + "grad_norm": 9.494639027989525, + "learning_rate": 4.958579439446749e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.4034482717514038, + "step": 107340 + }, + { + "epoch": 0.10811902409348526, + "grad_norm": 12.802094574690742, + "learning_rate": 4.9585722822685306e-05, + "loss": 2.9213, + "mean_token_accuracy": 0.3931034505367279, + "step": 107345 + }, + { + "epoch": 0.10812406014658943, + "grad_norm": 12.523136556930131, + "learning_rate": 4.958565124477756e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.4379310429096222, + "step": 107350 + }, + { + "epoch": 0.1081290961996936, + "grad_norm": 12.434475197743826, + "learning_rate": 4.958557966074428e-05, + "loss": 2.4196, + "mean_token_accuracy": 0.43448275327682495, + "step": 107355 + }, + { + "epoch": 0.10813413225279778, + "grad_norm": 8.667422303567143, + "learning_rate": 4.958550807058548e-05, + "loss": 2.288, + "mean_token_accuracy": 0.4172413766384125, + "step": 107360 + }, + { + "epoch": 0.10813916830590195, + "grad_norm": 9.785923036680265, + "learning_rate": 4.958543647430118e-05, + "loss": 2.5693, + "mean_token_accuracy": 0.38275861740112305, + "step": 107365 + }, + { + "epoch": 0.10814420435900612, + "grad_norm": 10.746296508199606, + "learning_rate": 4.958536487189141e-05, + "loss": 2.6697, + "mean_token_accuracy": 0.43448275327682495, + "step": 107370 + }, + { + "epoch": 0.1081492404121103, + "grad_norm": 10.650991381878647, + "learning_rate": 4.958529326335618e-05, + "loss": 2.5991, + "mean_token_accuracy": 0.4, + "step": 107375 + }, + { + "epoch": 0.10815427646521447, + "grad_norm": 10.524250231328603, + "learning_rate": 4.9585221648695505e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.4793103516101837, + "step": 107380 + }, + { + "epoch": 0.10815931251831865, + "grad_norm": 15.039030097582273, + "learning_rate": 4.9585150027909415e-05, + "loss": 2.7766, + "mean_token_accuracy": 0.4206896543502808, + "step": 107385 + }, + { + "epoch": 0.10816434857142282, + "grad_norm": 10.887860819406933, + "learning_rate": 4.9585078400997934e-05, + "loss": 2.6311, + "mean_token_accuracy": 0.45009074211120603, + "step": 107390 + }, + { + "epoch": 0.10816938462452699, + "grad_norm": 9.152435867834567, + "learning_rate": 4.958500676796106e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.38620689511299133, + "step": 107395 + }, + { + "epoch": 0.10817442067763117, + "grad_norm": 13.203597254523038, + "learning_rate": 4.958493512879884e-05, + "loss": 2.459, + "mean_token_accuracy": 0.4643073260784149, + "step": 107400 + }, + { + "epoch": 0.10817945673073534, + "grad_norm": 10.613120559609271, + "learning_rate": 4.9584863483511274e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.4310344815254211, + "step": 107405 + }, + { + "epoch": 0.1081844927838395, + "grad_norm": 9.302783715417787, + "learning_rate": 4.958479183209839e-05, + "loss": 2.1151, + "mean_token_accuracy": 0.5, + "step": 107410 + }, + { + "epoch": 0.10818952883694367, + "grad_norm": 9.660760906329786, + "learning_rate": 4.95847201745602e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.43103448748588563, + "step": 107415 + }, + { + "epoch": 0.10819456489004785, + "grad_norm": 12.463404645136231, + "learning_rate": 4.9584648510896736e-05, + "loss": 2.6261, + "mean_token_accuracy": 0.3620689630508423, + "step": 107420 + }, + { + "epoch": 0.10819960094315202, + "grad_norm": 9.683566349908325, + "learning_rate": 4.958457684110801e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4448275864124298, + "step": 107425 + }, + { + "epoch": 0.1082046369962562, + "grad_norm": 10.035247930991305, + "learning_rate": 4.958450516519406e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.45517240166664125, + "step": 107430 + }, + { + "epoch": 0.10820967304936037, + "grad_norm": 11.622224003970778, + "learning_rate": 4.9584433483154874e-05, + "loss": 2.4197, + "mean_token_accuracy": 0.46896551847457885, + "step": 107435 + }, + { + "epoch": 0.10821470910246454, + "grad_norm": 9.264001350381193, + "learning_rate": 4.958436179499049e-05, + "loss": 2.0957, + "mean_token_accuracy": 0.48100423216819765, + "step": 107440 + }, + { + "epoch": 0.10821974515556872, + "grad_norm": 10.420695390730026, + "learning_rate": 4.9584290100700926e-05, + "loss": 2.4615, + "mean_token_accuracy": 0.42413793206214906, + "step": 107445 + }, + { + "epoch": 0.10822478120867289, + "grad_norm": 9.294736832060329, + "learning_rate": 4.95842184002862e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.38275861740112305, + "step": 107450 + }, + { + "epoch": 0.10822981726177706, + "grad_norm": 12.687697838901109, + "learning_rate": 4.958414669374634e-05, + "loss": 2.7852, + "mean_token_accuracy": 0.3724137842655182, + "step": 107455 + }, + { + "epoch": 0.10823485331488124, + "grad_norm": 10.104240226412362, + "learning_rate": 4.958407498108136e-05, + "loss": 2.0405, + "mean_token_accuracy": 0.4620689630508423, + "step": 107460 + }, + { + "epoch": 0.10823988936798541, + "grad_norm": 8.860770956740469, + "learning_rate": 4.9584003262291274e-05, + "loss": 2.3939, + "mean_token_accuracy": 0.42068964838981626, + "step": 107465 + }, + { + "epoch": 0.10824492542108959, + "grad_norm": 9.79202651283673, + "learning_rate": 4.95839315373761e-05, + "loss": 2.5215, + "mean_token_accuracy": 0.3655172407627106, + "step": 107470 + }, + { + "epoch": 0.10824996147419376, + "grad_norm": 9.14099140106788, + "learning_rate": 4.958385980633587e-05, + "loss": 2.6976, + "mean_token_accuracy": 0.4258923172950745, + "step": 107475 + }, + { + "epoch": 0.10825499752729792, + "grad_norm": 11.05207810881123, + "learning_rate": 4.958378806917061e-05, + "loss": 2.6333, + "mean_token_accuracy": 0.358620685338974, + "step": 107480 + }, + { + "epoch": 0.10826003358040209, + "grad_norm": 10.51688792104066, + "learning_rate": 4.958371632588032e-05, + "loss": 2.3226, + "mean_token_accuracy": 0.4570477962493896, + "step": 107485 + }, + { + "epoch": 0.10826506963350627, + "grad_norm": 9.038423443608297, + "learning_rate": 4.958364457646504e-05, + "loss": 2.2169, + "mean_token_accuracy": 0.44827585816383364, + "step": 107490 + }, + { + "epoch": 0.10827010568661044, + "grad_norm": 10.07695589821479, + "learning_rate": 4.958357282092477e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.42068966031074523, + "step": 107495 + }, + { + "epoch": 0.10827514173971461, + "grad_norm": 12.1911758363293, + "learning_rate": 4.958350105925953e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.37241379022598264, + "step": 107500 + }, + { + "epoch": 0.10828017779281879, + "grad_norm": 9.30245249092957, + "learning_rate": 4.9583429291469366e-05, + "loss": 2.6197, + "mean_token_accuracy": 0.3793103516101837, + "step": 107505 + }, + { + "epoch": 0.10828521384592296, + "grad_norm": 12.387305577628005, + "learning_rate": 4.958335751755427e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.4, + "step": 107510 + }, + { + "epoch": 0.10829024989902714, + "grad_norm": 11.891341128770739, + "learning_rate": 4.9583285737514266e-05, + "loss": 2.1559, + "mean_token_accuracy": 0.4551724076271057, + "step": 107515 + }, + { + "epoch": 0.10829528595213131, + "grad_norm": 13.987866228561792, + "learning_rate": 4.95832139513494e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.4448275864124298, + "step": 107520 + }, + { + "epoch": 0.10830032200523548, + "grad_norm": 10.823924971041762, + "learning_rate": 4.958314215905965e-05, + "loss": 2.3386, + "mean_token_accuracy": 0.42758620977401735, + "step": 107525 + }, + { + "epoch": 0.10830535805833966, + "grad_norm": 10.563332099407061, + "learning_rate": 4.9583070360645075e-05, + "loss": 2.5251, + "mean_token_accuracy": 0.4206896543502808, + "step": 107530 + }, + { + "epoch": 0.10831039411144383, + "grad_norm": 10.299329133286241, + "learning_rate": 4.958299855610567e-05, + "loss": 2.9887, + "mean_token_accuracy": 0.3896551728248596, + "step": 107535 + }, + { + "epoch": 0.108315430164548, + "grad_norm": 9.24378194962503, + "learning_rate": 4.958292674544146e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.4724137902259827, + "step": 107540 + }, + { + "epoch": 0.10832046621765218, + "grad_norm": 8.922517584232857, + "learning_rate": 4.958285492865247e-05, + "loss": 2.2844, + "mean_token_accuracy": 0.47931033968925474, + "step": 107545 + }, + { + "epoch": 0.10832550227075634, + "grad_norm": 9.494566560284643, + "learning_rate": 4.958278310573873e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.43103447556495667, + "step": 107550 + }, + { + "epoch": 0.10833053832386051, + "grad_norm": 11.233770414286804, + "learning_rate": 4.958271127670023e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4344827651977539, + "step": 107555 + }, + { + "epoch": 0.10833557437696469, + "grad_norm": 10.873409003105886, + "learning_rate": 4.958263944153702e-05, + "loss": 2.5282, + "mean_token_accuracy": 0.44137930274009707, + "step": 107560 + }, + { + "epoch": 0.10834061043006886, + "grad_norm": 10.94725529590566, + "learning_rate": 4.9582567600249104e-05, + "loss": 2.2092, + "mean_token_accuracy": 0.4517241299152374, + "step": 107565 + }, + { + "epoch": 0.10834564648317303, + "grad_norm": 9.956894360677891, + "learning_rate": 4.958249575283651e-05, + "loss": 2.5438, + "mean_token_accuracy": 0.42758620977401735, + "step": 107570 + }, + { + "epoch": 0.1083506825362772, + "grad_norm": 9.852720301979515, + "learning_rate": 4.958242389929925e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.4137930989265442, + "step": 107575 + }, + { + "epoch": 0.10835571858938138, + "grad_norm": 10.142005304332944, + "learning_rate": 4.958235203963734e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.48820326924324037, + "step": 107580 + }, + { + "epoch": 0.10836075464248555, + "grad_norm": 13.449839586034551, + "learning_rate": 4.958228017385081e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.41724138259887694, + "step": 107585 + }, + { + "epoch": 0.10836579069558973, + "grad_norm": 12.866381576266825, + "learning_rate": 4.958220830193968e-05, + "loss": 2.8029, + "mean_token_accuracy": 0.3379310339689255, + "step": 107590 + }, + { + "epoch": 0.1083708267486939, + "grad_norm": 10.541573799016785, + "learning_rate": 4.958213642390396e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.4137930989265442, + "step": 107595 + }, + { + "epoch": 0.10837586280179808, + "grad_norm": 10.25566746548862, + "learning_rate": 4.958206453974369e-05, + "loss": 2.6518, + "mean_token_accuracy": 0.41379310488700866, + "step": 107600 + }, + { + "epoch": 0.10838089885490225, + "grad_norm": 9.698676517309917, + "learning_rate": 4.958199264945887e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.47586206197738645, + "step": 107605 + }, + { + "epoch": 0.10838593490800642, + "grad_norm": 10.419011441160416, + "learning_rate": 4.9581920753049534e-05, + "loss": 2.5083, + "mean_token_accuracy": 0.41034483909606934, + "step": 107610 + }, + { + "epoch": 0.1083909709611106, + "grad_norm": 9.52083331818641, + "learning_rate": 4.958184885051569e-05, + "loss": 2.4758, + "mean_token_accuracy": 0.38275861740112305, + "step": 107615 + }, + { + "epoch": 0.10839600701421476, + "grad_norm": 11.776694282578525, + "learning_rate": 4.958177694185736e-05, + "loss": 2.2945, + "mean_token_accuracy": 0.44137930274009707, + "step": 107620 + }, + { + "epoch": 0.10840104306731893, + "grad_norm": 9.902060165033578, + "learning_rate": 4.958170502707457e-05, + "loss": 2.1022, + "mean_token_accuracy": 0.45517241954803467, + "step": 107625 + }, + { + "epoch": 0.1084060791204231, + "grad_norm": 9.657241804594197, + "learning_rate": 4.958163310616734e-05, + "loss": 2.5984, + "mean_token_accuracy": 0.4172413766384125, + "step": 107630 + }, + { + "epoch": 0.10841111517352728, + "grad_norm": 10.076574902292391, + "learning_rate": 4.958156117913568e-05, + "loss": 2.3847, + "mean_token_accuracy": 0.3862069010734558, + "step": 107635 + }, + { + "epoch": 0.10841615122663145, + "grad_norm": 9.996452728064519, + "learning_rate": 4.958148924597961e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4413793087005615, + "step": 107640 + }, + { + "epoch": 0.10842118727973563, + "grad_norm": 12.901895317726764, + "learning_rate": 4.958141730669917e-05, + "loss": 2.7025, + "mean_token_accuracy": 0.3931034505367279, + "step": 107645 + }, + { + "epoch": 0.1084262233328398, + "grad_norm": 10.908951882145884, + "learning_rate": 4.9581345361294364e-05, + "loss": 2.383, + "mean_token_accuracy": 0.42413793206214906, + "step": 107650 + }, + { + "epoch": 0.10843125938594397, + "grad_norm": 10.019730527859553, + "learning_rate": 4.958127340976521e-05, + "loss": 2.3888, + "mean_token_accuracy": 0.4896551787853241, + "step": 107655 + }, + { + "epoch": 0.10843629543904815, + "grad_norm": 11.282415504630528, + "learning_rate": 4.958120145211174e-05, + "loss": 2.7264, + "mean_token_accuracy": 0.39655172228813174, + "step": 107660 + }, + { + "epoch": 0.10844133149215232, + "grad_norm": 9.205569010808901, + "learning_rate": 4.958112948833396e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.4000000059604645, + "step": 107665 + }, + { + "epoch": 0.1084463675452565, + "grad_norm": 11.587499296846909, + "learning_rate": 4.95810575184319e-05, + "loss": 2.4643, + "mean_token_accuracy": 0.46473078727722167, + "step": 107670 + }, + { + "epoch": 0.10845140359836067, + "grad_norm": 8.930365861313936, + "learning_rate": 4.958098554240557e-05, + "loss": 2.091, + "mean_token_accuracy": 0.3965517282485962, + "step": 107675 + }, + { + "epoch": 0.10845643965146484, + "grad_norm": 9.940522958349861, + "learning_rate": 4.9580913560255e-05, + "loss": 2.1943, + "mean_token_accuracy": 0.4275862157344818, + "step": 107680 + }, + { + "epoch": 0.10846147570456902, + "grad_norm": 14.954939526481805, + "learning_rate": 4.9580841571980204e-05, + "loss": 2.3722, + "mean_token_accuracy": 0.41379311084747317, + "step": 107685 + }, + { + "epoch": 0.10846651175767318, + "grad_norm": 10.043515394005372, + "learning_rate": 4.958076957758121e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.42758620977401735, + "step": 107690 + }, + { + "epoch": 0.10847154781077735, + "grad_norm": 11.07833058342766, + "learning_rate": 4.958069757705803e-05, + "loss": 2.7994, + "mean_token_accuracy": 0.3862069010734558, + "step": 107695 + }, + { + "epoch": 0.10847658386388152, + "grad_norm": 10.436887162517603, + "learning_rate": 4.958062557041068e-05, + "loss": 2.6833, + "mean_token_accuracy": 0.4137930989265442, + "step": 107700 + }, + { + "epoch": 0.1084816199169857, + "grad_norm": 13.109083800302747, + "learning_rate": 4.9580553557639197e-05, + "loss": 2.1981, + "mean_token_accuracy": 0.417241370677948, + "step": 107705 + }, + { + "epoch": 0.10848665597008987, + "grad_norm": 8.999146228067858, + "learning_rate": 4.9580481538743575e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.44827587008476255, + "step": 107710 + }, + { + "epoch": 0.10849169202319404, + "grad_norm": 12.205726235544475, + "learning_rate": 4.9580409513723865e-05, + "loss": 2.7248, + "mean_token_accuracy": 0.40689654350280763, + "step": 107715 + }, + { + "epoch": 0.10849672807629822, + "grad_norm": 13.211418989216797, + "learning_rate": 4.958033748258006e-05, + "loss": 2.6636, + "mean_token_accuracy": 0.3965517282485962, + "step": 107720 + }, + { + "epoch": 0.10850176412940239, + "grad_norm": 9.055874683385968, + "learning_rate": 4.9580265445312205e-05, + "loss": 2.3515, + "mean_token_accuracy": 0.4206896543502808, + "step": 107725 + }, + { + "epoch": 0.10850680018250657, + "grad_norm": 14.681977442729286, + "learning_rate": 4.958019340192029e-05, + "loss": 2.5968, + "mean_token_accuracy": 0.42577131986618044, + "step": 107730 + }, + { + "epoch": 0.10851183623561074, + "grad_norm": 11.48838623259037, + "learning_rate": 4.958012135240436e-05, + "loss": 2.2207, + "mean_token_accuracy": 0.4551724135875702, + "step": 107735 + }, + { + "epoch": 0.10851687228871491, + "grad_norm": 9.048892188515945, + "learning_rate": 4.958004929676442e-05, + "loss": 2.3158, + "mean_token_accuracy": 0.38620689511299133, + "step": 107740 + }, + { + "epoch": 0.10852190834181909, + "grad_norm": 10.25570001694402, + "learning_rate": 4.95799772350005e-05, + "loss": 2.1282, + "mean_token_accuracy": 0.42758620381355283, + "step": 107745 + }, + { + "epoch": 0.10852694439492326, + "grad_norm": 10.248551393993596, + "learning_rate": 4.957990516711262e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.358620685338974, + "step": 107750 + }, + { + "epoch": 0.10853198044802743, + "grad_norm": 12.22966268992081, + "learning_rate": 4.9579833093100786e-05, + "loss": 2.2885, + "mean_token_accuracy": 0.5034482836723327, + "step": 107755 + }, + { + "epoch": 0.1085370165011316, + "grad_norm": 11.13492676504733, + "learning_rate": 4.9579761012965034e-05, + "loss": 2.0114, + "mean_token_accuracy": 0.4862068951129913, + "step": 107760 + }, + { + "epoch": 0.10854205255423577, + "grad_norm": 10.190497565216308, + "learning_rate": 4.957968892670538e-05, + "loss": 2.2403, + "mean_token_accuracy": 0.44664247035980226, + "step": 107765 + }, + { + "epoch": 0.10854708860733994, + "grad_norm": 9.719465157318897, + "learning_rate": 4.9579616834321835e-05, + "loss": 2.4884, + "mean_token_accuracy": 0.37241379022598264, + "step": 107770 + }, + { + "epoch": 0.10855212466044412, + "grad_norm": 12.564084130798074, + "learning_rate": 4.9579544735814434e-05, + "loss": 2.347, + "mean_token_accuracy": 0.4482758641242981, + "step": 107775 + }, + { + "epoch": 0.10855716071354829, + "grad_norm": 11.284598125268072, + "learning_rate": 4.957947263118318e-05, + "loss": 2.5532, + "mean_token_accuracy": 0.3931034505367279, + "step": 107780 + }, + { + "epoch": 0.10856219676665246, + "grad_norm": 9.601179675761317, + "learning_rate": 4.957940052042811e-05, + "loss": 2.1785, + "mean_token_accuracy": 0.48965516686439514, + "step": 107785 + }, + { + "epoch": 0.10856723281975664, + "grad_norm": 10.334860227560275, + "learning_rate": 4.957932840354923e-05, + "loss": 2.875, + "mean_token_accuracy": 0.4103448212146759, + "step": 107790 + }, + { + "epoch": 0.10857226887286081, + "grad_norm": 9.734078012842598, + "learning_rate": 4.957925628054657e-05, + "loss": 2.1836, + "mean_token_accuracy": 0.39310344457626345, + "step": 107795 + }, + { + "epoch": 0.10857730492596498, + "grad_norm": 9.431708358796635, + "learning_rate": 4.957918415142015e-05, + "loss": 2.3445, + "mean_token_accuracy": 0.42068966031074523, + "step": 107800 + }, + { + "epoch": 0.10858234097906916, + "grad_norm": 10.390653286525446, + "learning_rate": 4.9579112016169975e-05, + "loss": 2.0985, + "mean_token_accuracy": 0.46896552443504336, + "step": 107805 + }, + { + "epoch": 0.10858737703217333, + "grad_norm": 10.578269221935887, + "learning_rate": 4.957903987479608e-05, + "loss": 2.5589, + "mean_token_accuracy": 0.42758620381355283, + "step": 107810 + }, + { + "epoch": 0.1085924130852775, + "grad_norm": 8.145174514185971, + "learning_rate": 4.957896772729848e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.4275862157344818, + "step": 107815 + }, + { + "epoch": 0.10859744913838168, + "grad_norm": 10.07165347850683, + "learning_rate": 4.95788955736772e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.4379310429096222, + "step": 107820 + }, + { + "epoch": 0.10860248519148585, + "grad_norm": 9.76073721936002, + "learning_rate": 4.957882341393225e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.41428571939468384, + "step": 107825 + }, + { + "epoch": 0.10860752124459001, + "grad_norm": 11.430223968135316, + "learning_rate": 4.957875124806367e-05, + "loss": 2.8401, + "mean_token_accuracy": 0.39655172228813174, + "step": 107830 + }, + { + "epoch": 0.10861255729769419, + "grad_norm": 9.179443591895861, + "learning_rate": 4.957867907607145e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.47931033968925474, + "step": 107835 + }, + { + "epoch": 0.10861759335079836, + "grad_norm": 11.580930374660609, + "learning_rate": 4.9578606897955636e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.4344827592372894, + "step": 107840 + }, + { + "epoch": 0.10862262940390253, + "grad_norm": 9.721771272805599, + "learning_rate": 4.957853471371623e-05, + "loss": 2.4192, + "mean_token_accuracy": 0.43103447556495667, + "step": 107845 + }, + { + "epoch": 0.10862766545700671, + "grad_norm": 10.368469971728636, + "learning_rate": 4.957846252335326e-05, + "loss": 1.991, + "mean_token_accuracy": 0.4586206912994385, + "step": 107850 + }, + { + "epoch": 0.10863270151011088, + "grad_norm": 10.965851375509738, + "learning_rate": 4.9578390326866754e-05, + "loss": 2.7694, + "mean_token_accuracy": 0.35862069129943847, + "step": 107855 + }, + { + "epoch": 0.10863773756321506, + "grad_norm": 12.479519430493118, + "learning_rate": 4.957831812425672e-05, + "loss": 2.4902, + "mean_token_accuracy": 0.4034482717514038, + "step": 107860 + }, + { + "epoch": 0.10864277361631923, + "grad_norm": 11.213427912062103, + "learning_rate": 4.9578245915523187e-05, + "loss": 2.2476, + "mean_token_accuracy": 0.45517241954803467, + "step": 107865 + }, + { + "epoch": 0.1086478096694234, + "grad_norm": 9.102651433724484, + "learning_rate": 4.957817370066616e-05, + "loss": 2.6581, + "mean_token_accuracy": 0.41379310488700866, + "step": 107870 + }, + { + "epoch": 0.10865284572252758, + "grad_norm": 9.848736678041211, + "learning_rate": 4.9578101479685676e-05, + "loss": 2.4875, + "mean_token_accuracy": 0.41724138259887694, + "step": 107875 + }, + { + "epoch": 0.10865788177563175, + "grad_norm": 12.0343968566699, + "learning_rate": 4.957802925258175e-05, + "loss": 2.472, + "mean_token_accuracy": 0.43793103098869324, + "step": 107880 + }, + { + "epoch": 0.10866291782873592, + "grad_norm": 12.737831842083631, + "learning_rate": 4.957795701935439e-05, + "loss": 2.669, + "mean_token_accuracy": 0.43793103098869324, + "step": 107885 + }, + { + "epoch": 0.1086679538818401, + "grad_norm": 11.280302186493369, + "learning_rate": 4.957788478000364e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.3793103456497192, + "step": 107890 + }, + { + "epoch": 0.10867298993494427, + "grad_norm": 11.505229966531815, + "learning_rate": 4.95778125345295e-05, + "loss": 2.3707, + "mean_token_accuracy": 0.4413793087005615, + "step": 107895 + }, + { + "epoch": 0.10867802598804843, + "grad_norm": 11.696723527524535, + "learning_rate": 4.9577740282932e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.42413793206214906, + "step": 107900 + }, + { + "epoch": 0.1086830620411526, + "grad_norm": 13.27828633819417, + "learning_rate": 4.957766802521115e-05, + "loss": 2.7083, + "mean_token_accuracy": 0.4103448301553726, + "step": 107905 + }, + { + "epoch": 0.10868809809425678, + "grad_norm": 11.05128518338207, + "learning_rate": 4.957759576136698e-05, + "loss": 2.1464, + "mean_token_accuracy": 0.46551724076271056, + "step": 107910 + }, + { + "epoch": 0.10869313414736095, + "grad_norm": 14.405241113682067, + "learning_rate": 4.95775234913995e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.3793103456497192, + "step": 107915 + }, + { + "epoch": 0.10869817020046513, + "grad_norm": 10.87844225916082, + "learning_rate": 4.9577451215308745e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.4034482717514038, + "step": 107920 + }, + { + "epoch": 0.1087032062535693, + "grad_norm": 10.614373495693224, + "learning_rate": 4.957737893309472e-05, + "loss": 2.6128, + "mean_token_accuracy": 0.3482758581638336, + "step": 107925 + }, + { + "epoch": 0.10870824230667347, + "grad_norm": 10.105755457826929, + "learning_rate": 4.9577306644757455e-05, + "loss": 2.1546, + "mean_token_accuracy": 0.4758620738983154, + "step": 107930 + }, + { + "epoch": 0.10871327835977765, + "grad_norm": 11.37408020714365, + "learning_rate": 4.957723435029697e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.45862069725990295, + "step": 107935 + }, + { + "epoch": 0.10871831441288182, + "grad_norm": 11.717717882526417, + "learning_rate": 4.957716204971328e-05, + "loss": 2.2561, + "mean_token_accuracy": 0.47931033968925474, + "step": 107940 + }, + { + "epoch": 0.108723350465986, + "grad_norm": 10.639334449477898, + "learning_rate": 4.95770897430064e-05, + "loss": 2.464, + "mean_token_accuracy": 0.41724138259887694, + "step": 107945 + }, + { + "epoch": 0.10872838651909017, + "grad_norm": 11.474340397399446, + "learning_rate": 4.957701743017636e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.41034482717514037, + "step": 107950 + }, + { + "epoch": 0.10873342257219434, + "grad_norm": 12.57865789778267, + "learning_rate": 4.957694511122318e-05, + "loss": 2.4403, + "mean_token_accuracy": 0.41379310488700866, + "step": 107955 + }, + { + "epoch": 0.10873845862529852, + "grad_norm": 10.182736488404696, + "learning_rate": 4.957687278614688e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4310344815254211, + "step": 107960 + }, + { + "epoch": 0.10874349467840269, + "grad_norm": 10.297561916473837, + "learning_rate": 4.957680045494747e-05, + "loss": 2.2555, + "mean_token_accuracy": 0.4938423693180084, + "step": 107965 + }, + { + "epoch": 0.10874853073150685, + "grad_norm": 10.132930233663659, + "learning_rate": 4.957672811762498e-05, + "loss": 2.1096, + "mean_token_accuracy": 0.42758620977401735, + "step": 107970 + }, + { + "epoch": 0.10875356678461102, + "grad_norm": 10.729307032032356, + "learning_rate": 4.957665577417942e-05, + "loss": 2.7136, + "mean_token_accuracy": 0.38965516686439516, + "step": 107975 + }, + { + "epoch": 0.1087586028377152, + "grad_norm": 11.38983820438597, + "learning_rate": 4.957658342461082e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.4344827592372894, + "step": 107980 + }, + { + "epoch": 0.10876363889081937, + "grad_norm": 11.025366738862676, + "learning_rate": 4.957651106891921e-05, + "loss": 2.6043, + "mean_token_accuracy": 0.4034482777118683, + "step": 107985 + }, + { + "epoch": 0.10876867494392355, + "grad_norm": 10.46297519838561, + "learning_rate": 4.957643870710459e-05, + "loss": 2.6122, + "mean_token_accuracy": 0.3655172407627106, + "step": 107990 + }, + { + "epoch": 0.10877371099702772, + "grad_norm": 9.541571150209489, + "learning_rate": 4.9576366339166983e-05, + "loss": 2.1058, + "mean_token_accuracy": 0.5019963681697845, + "step": 107995 + }, + { + "epoch": 0.1087787470501319, + "grad_norm": 10.95452656592583, + "learning_rate": 4.9576293965106415e-05, + "loss": 2.934, + "mean_token_accuracy": 0.34137930870056155, + "step": 108000 + }, + { + "epoch": 0.10878378310323607, + "grad_norm": 9.403151743631476, + "learning_rate": 4.9576221584922904e-05, + "loss": 2.0866, + "mean_token_accuracy": 0.48275862336158754, + "step": 108005 + }, + { + "epoch": 0.10878881915634024, + "grad_norm": 10.21360845422915, + "learning_rate": 4.9576149198616475e-05, + "loss": 2.3717, + "mean_token_accuracy": 0.4172413766384125, + "step": 108010 + }, + { + "epoch": 0.10879385520944442, + "grad_norm": 11.573570887355926, + "learning_rate": 4.9576076806187144e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.41724138259887694, + "step": 108015 + }, + { + "epoch": 0.10879889126254859, + "grad_norm": 9.457313843012107, + "learning_rate": 4.9576004407634916e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.441379314661026, + "step": 108020 + }, + { + "epoch": 0.10880392731565276, + "grad_norm": 10.661575915492778, + "learning_rate": 4.9575932002959847e-05, + "loss": 2.341, + "mean_token_accuracy": 0.39310344457626345, + "step": 108025 + }, + { + "epoch": 0.10880896336875694, + "grad_norm": 14.987939893896437, + "learning_rate": 4.957585959216193e-05, + "loss": 2.3976, + "mean_token_accuracy": 0.38620689511299133, + "step": 108030 + }, + { + "epoch": 0.10881399942186111, + "grad_norm": 10.495172534890404, + "learning_rate": 4.957578717524118e-05, + "loss": 1.9939, + "mean_token_accuracy": 0.5160314619541169, + "step": 108035 + }, + { + "epoch": 0.10881903547496527, + "grad_norm": 10.073257899056186, + "learning_rate": 4.957571475219763e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4344827651977539, + "step": 108040 + }, + { + "epoch": 0.10882407152806944, + "grad_norm": 10.020221787427186, + "learning_rate": 4.957564232303131e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.4758620738983154, + "step": 108045 + }, + { + "epoch": 0.10882910758117362, + "grad_norm": 14.692738473639386, + "learning_rate": 4.957556988774223e-05, + "loss": 2.5038, + "mean_token_accuracy": 0.458620685338974, + "step": 108050 + }, + { + "epoch": 0.10883414363427779, + "grad_norm": 12.013371097987012, + "learning_rate": 4.9575497446330396e-05, + "loss": 2.1864, + "mean_token_accuracy": 0.4344827592372894, + "step": 108055 + }, + { + "epoch": 0.10883917968738197, + "grad_norm": 11.996179751429695, + "learning_rate": 4.9575424998795846e-05, + "loss": 2.5556, + "mean_token_accuracy": 0.36896551847457887, + "step": 108060 + }, + { + "epoch": 0.10884421574048614, + "grad_norm": 9.82080505469875, + "learning_rate": 4.957535254513859e-05, + "loss": 2.2525, + "mean_token_accuracy": 0.4620689690113068, + "step": 108065 + }, + { + "epoch": 0.10884925179359031, + "grad_norm": 15.273675682413538, + "learning_rate": 4.957528008535865e-05, + "loss": 2.7929, + "mean_token_accuracy": 0.4068965494632721, + "step": 108070 + }, + { + "epoch": 0.10885428784669449, + "grad_norm": 11.676115863539788, + "learning_rate": 4.957520761945606e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.4172413766384125, + "step": 108075 + }, + { + "epoch": 0.10885932389979866, + "grad_norm": 12.064138271427389, + "learning_rate": 4.957513514743082e-05, + "loss": 2.3401, + "mean_token_accuracy": 0.4275861978530884, + "step": 108080 + }, + { + "epoch": 0.10886435995290283, + "grad_norm": 10.05242403700605, + "learning_rate": 4.9575062669282956e-05, + "loss": 2.4797, + "mean_token_accuracy": 0.4068965494632721, + "step": 108085 + }, + { + "epoch": 0.10886939600600701, + "grad_norm": 9.396606420006043, + "learning_rate": 4.9574990185012506e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.3827586233615875, + "step": 108090 + }, + { + "epoch": 0.10887443205911118, + "grad_norm": 11.650425855942851, + "learning_rate": 4.9574917694619463e-05, + "loss": 2.615, + "mean_token_accuracy": 0.37586206793785093, + "step": 108095 + }, + { + "epoch": 0.10887946811221536, + "grad_norm": 10.267526455166738, + "learning_rate": 4.9574845198103864e-05, + "loss": 2.2319, + "mean_token_accuracy": 0.4172413766384125, + "step": 108100 + }, + { + "epoch": 0.10888450416531953, + "grad_norm": 12.498345415347659, + "learning_rate": 4.957477269546572e-05, + "loss": 2.2986, + "mean_token_accuracy": 0.4379310369491577, + "step": 108105 + }, + { + "epoch": 0.10888954021842369, + "grad_norm": 8.205448714540209, + "learning_rate": 4.957470018670506e-05, + "loss": 2.1284, + "mean_token_accuracy": 0.5068965494632721, + "step": 108110 + }, + { + "epoch": 0.10889457627152786, + "grad_norm": 19.647114577009354, + "learning_rate": 4.95746276718219e-05, + "loss": 2.5784, + "mean_token_accuracy": 0.39655172228813174, + "step": 108115 + }, + { + "epoch": 0.10889961232463204, + "grad_norm": 12.97147890919276, + "learning_rate": 4.957455515081626e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.3931034505367279, + "step": 108120 + }, + { + "epoch": 0.10890464837773621, + "grad_norm": 9.935039590879072, + "learning_rate": 4.957448262368815e-05, + "loss": 2.5451, + "mean_token_accuracy": 0.4137930989265442, + "step": 108125 + }, + { + "epoch": 0.10890968443084038, + "grad_norm": 10.483668187022491, + "learning_rate": 4.9574410090437604e-05, + "loss": 2.3306, + "mean_token_accuracy": 0.4482758641242981, + "step": 108130 + }, + { + "epoch": 0.10891472048394456, + "grad_norm": 8.826081389787808, + "learning_rate": 4.9574337551064645e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.4413793087005615, + "step": 108135 + }, + { + "epoch": 0.10891975653704873, + "grad_norm": 14.689322150219958, + "learning_rate": 4.9574265005569284e-05, + "loss": 2.3884, + "mean_token_accuracy": 0.5103448331356049, + "step": 108140 + }, + { + "epoch": 0.1089247925901529, + "grad_norm": 11.144727887098208, + "learning_rate": 4.957419245395155e-05, + "loss": 2.2089, + "mean_token_accuracy": 0.4724137902259827, + "step": 108145 + }, + { + "epoch": 0.10892982864325708, + "grad_norm": 10.544490285058664, + "learning_rate": 4.957411989621144e-05, + "loss": 2.1723, + "mean_token_accuracy": 0.458620685338974, + "step": 108150 + }, + { + "epoch": 0.10893486469636125, + "grad_norm": 9.467608452738634, + "learning_rate": 4.9574047332349e-05, + "loss": 2.4276, + "mean_token_accuracy": 0.4000000059604645, + "step": 108155 + }, + { + "epoch": 0.10893990074946543, + "grad_norm": 8.898884737868197, + "learning_rate": 4.957397476236424e-05, + "loss": 2.0143, + "mean_token_accuracy": 0.4985480964183807, + "step": 108160 + }, + { + "epoch": 0.1089449368025696, + "grad_norm": 15.059485773345463, + "learning_rate": 4.9573902186257174e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.4172413766384125, + "step": 108165 + }, + { + "epoch": 0.10894997285567377, + "grad_norm": 9.320827974452195, + "learning_rate": 4.957382960402784e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.4571687877178192, + "step": 108170 + }, + { + "epoch": 0.10895500890877795, + "grad_norm": 8.973035806766127, + "learning_rate": 4.9573757015676234e-05, + "loss": 2.6466, + "mean_token_accuracy": 0.38965516686439516, + "step": 108175 + }, + { + "epoch": 0.10896004496188211, + "grad_norm": 10.79885469224029, + "learning_rate": 4.95736844212024e-05, + "loss": 2.6738, + "mean_token_accuracy": 0.39310344457626345, + "step": 108180 + }, + { + "epoch": 0.10896508101498628, + "grad_norm": 11.975679355972376, + "learning_rate": 4.9573611820606345e-05, + "loss": 2.3399, + "mean_token_accuracy": 0.45517241954803467, + "step": 108185 + }, + { + "epoch": 0.10897011706809046, + "grad_norm": 9.281850799655222, + "learning_rate": 4.957353921388809e-05, + "loss": 3.0748, + "mean_token_accuracy": 0.39655172228813174, + "step": 108190 + }, + { + "epoch": 0.10897515312119463, + "grad_norm": 11.565441338285277, + "learning_rate": 4.957346660104766e-05, + "loss": 2.6548, + "mean_token_accuracy": 0.4068965494632721, + "step": 108195 + }, + { + "epoch": 0.1089801891742988, + "grad_norm": 10.197908561640979, + "learning_rate": 4.957339398208507e-05, + "loss": 2.2878, + "mean_token_accuracy": 0.48275861144065857, + "step": 108200 + }, + { + "epoch": 0.10898522522740298, + "grad_norm": 13.350505600424414, + "learning_rate": 4.957332135700034e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.42758620381355283, + "step": 108205 + }, + { + "epoch": 0.10899026128050715, + "grad_norm": 10.519676093018191, + "learning_rate": 4.957324872579349e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.4366606116294861, + "step": 108210 + }, + { + "epoch": 0.10899529733361132, + "grad_norm": 10.50880654358656, + "learning_rate": 4.957317608846454e-05, + "loss": 2.7415, + "mean_token_accuracy": 0.38620689511299133, + "step": 108215 + }, + { + "epoch": 0.1090003333867155, + "grad_norm": 13.56973323792708, + "learning_rate": 4.957310344501352e-05, + "loss": 2.8227, + "mean_token_accuracy": 0.3862068921327591, + "step": 108220 + }, + { + "epoch": 0.10900536943981967, + "grad_norm": 8.816985028773987, + "learning_rate": 4.957303079544044e-05, + "loss": 2.2547, + "mean_token_accuracy": 0.46551724076271056, + "step": 108225 + }, + { + "epoch": 0.10901040549292385, + "grad_norm": 9.696729429715983, + "learning_rate": 4.957295813974532e-05, + "loss": 2.3033, + "mean_token_accuracy": 0.42413793206214906, + "step": 108230 + }, + { + "epoch": 0.10901544154602802, + "grad_norm": 11.55662873057817, + "learning_rate": 4.957288547792819e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.3931034505367279, + "step": 108235 + }, + { + "epoch": 0.10902047759913219, + "grad_norm": 11.430457806643274, + "learning_rate": 4.957281280998905e-05, + "loss": 2.5991, + "mean_token_accuracy": 0.382758629322052, + "step": 108240 + }, + { + "epoch": 0.10902551365223637, + "grad_norm": 12.761544139807148, + "learning_rate": 4.957274013592794e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.4482758641242981, + "step": 108245 + }, + { + "epoch": 0.10903054970534053, + "grad_norm": 10.93873973240141, + "learning_rate": 4.957266745574488e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.42413793206214906, + "step": 108250 + }, + { + "epoch": 0.1090355857584447, + "grad_norm": 9.972063477556208, + "learning_rate": 4.9572594769439875e-05, + "loss": 2.285, + "mean_token_accuracy": 0.4310344696044922, + "step": 108255 + }, + { + "epoch": 0.10904062181154887, + "grad_norm": 13.358316529814356, + "learning_rate": 4.9572522077012954e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.42758620381355283, + "step": 108260 + }, + { + "epoch": 0.10904565786465305, + "grad_norm": 11.587656475147119, + "learning_rate": 4.957244937846414e-05, + "loss": 2.5325, + "mean_token_accuracy": 0.42413793206214906, + "step": 108265 + }, + { + "epoch": 0.10905069391775722, + "grad_norm": 7.870348937029143, + "learning_rate": 4.957237667379345e-05, + "loss": 2.0006, + "mean_token_accuracy": 0.4931034445762634, + "step": 108270 + }, + { + "epoch": 0.1090557299708614, + "grad_norm": 12.540386994390301, + "learning_rate": 4.95723039630009e-05, + "loss": 2.498, + "mean_token_accuracy": 0.4206896543502808, + "step": 108275 + }, + { + "epoch": 0.10906076602396557, + "grad_norm": 12.365801220103963, + "learning_rate": 4.9572231246086515e-05, + "loss": 2.3619, + "mean_token_accuracy": 0.47586206197738645, + "step": 108280 + }, + { + "epoch": 0.10906580207706974, + "grad_norm": 15.286300865465538, + "learning_rate": 4.957215852305032e-05, + "loss": 2.9182, + "mean_token_accuracy": 0.38965516686439516, + "step": 108285 + }, + { + "epoch": 0.10907083813017392, + "grad_norm": 12.390272458637462, + "learning_rate": 4.957208579389233e-05, + "loss": 2.6025, + "mean_token_accuracy": 0.3896551728248596, + "step": 108290 + }, + { + "epoch": 0.10907587418327809, + "grad_norm": 9.10308311971539, + "learning_rate": 4.9572013058612554e-05, + "loss": 2.4029, + "mean_token_accuracy": 0.42413793206214906, + "step": 108295 + }, + { + "epoch": 0.10908091023638226, + "grad_norm": 9.723218202371056, + "learning_rate": 4.957194031721102e-05, + "loss": 1.9334, + "mean_token_accuracy": 0.4670901417732239, + "step": 108300 + }, + { + "epoch": 0.10908594628948644, + "grad_norm": 10.91961489154031, + "learning_rate": 4.9571867569687765e-05, + "loss": 2.0924, + "mean_token_accuracy": 0.41724138855934145, + "step": 108305 + }, + { + "epoch": 0.10909098234259061, + "grad_norm": 10.053105494412218, + "learning_rate": 4.957179481604279e-05, + "loss": 2.8632, + "mean_token_accuracy": 0.43623714447021483, + "step": 108310 + }, + { + "epoch": 0.10909601839569479, + "grad_norm": 13.117692457570408, + "learning_rate": 4.957172205627612e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.45517241954803467, + "step": 108315 + }, + { + "epoch": 0.10910105444879895, + "grad_norm": 9.433686975924559, + "learning_rate": 4.9571649290387776e-05, + "loss": 2.1009, + "mean_token_accuracy": 0.4965517342090607, + "step": 108320 + }, + { + "epoch": 0.10910609050190312, + "grad_norm": 13.404124523328223, + "learning_rate": 4.957157651837778e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4847549915313721, + "step": 108325 + }, + { + "epoch": 0.10911112655500729, + "grad_norm": 10.215457499307105, + "learning_rate": 4.957150374024615e-05, + "loss": 2.3049, + "mean_token_accuracy": 0.48275862336158754, + "step": 108330 + }, + { + "epoch": 0.10911616260811147, + "grad_norm": 10.884165755558707, + "learning_rate": 4.95714309559929e-05, + "loss": 2.4353, + "mean_token_accuracy": 0.40344828367233276, + "step": 108335 + }, + { + "epoch": 0.10912119866121564, + "grad_norm": 10.961152783339772, + "learning_rate": 4.957135816561806e-05, + "loss": 2.4183, + "mean_token_accuracy": 0.4482758641242981, + "step": 108340 + }, + { + "epoch": 0.10912623471431981, + "grad_norm": 10.339602000872851, + "learning_rate": 4.957128536912165e-05, + "loss": 2.1201, + "mean_token_accuracy": 0.4896551787853241, + "step": 108345 + }, + { + "epoch": 0.10913127076742399, + "grad_norm": 9.372026241108806, + "learning_rate": 4.9571212566503675e-05, + "loss": 1.9588, + "mean_token_accuracy": 0.5137930989265442, + "step": 108350 + }, + { + "epoch": 0.10913630682052816, + "grad_norm": 10.981711469328777, + "learning_rate": 4.9571139757764176e-05, + "loss": 2.1345, + "mean_token_accuracy": 0.4482758641242981, + "step": 108355 + }, + { + "epoch": 0.10914134287363234, + "grad_norm": 10.200577394268366, + "learning_rate": 4.957106694290316e-05, + "loss": 2.3385, + "mean_token_accuracy": 0.4517241358757019, + "step": 108360 + }, + { + "epoch": 0.10914637892673651, + "grad_norm": 8.376924389264458, + "learning_rate": 4.9570994121920656e-05, + "loss": 2.133, + "mean_token_accuracy": 0.4931034564971924, + "step": 108365 + }, + { + "epoch": 0.10915141497984068, + "grad_norm": 10.327140714044267, + "learning_rate": 4.957092129481668e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.4034482717514038, + "step": 108370 + }, + { + "epoch": 0.10915645103294486, + "grad_norm": 10.343691824511522, + "learning_rate": 4.957084846159124e-05, + "loss": 2.7169, + "mean_token_accuracy": 0.4068965494632721, + "step": 108375 + }, + { + "epoch": 0.10916148708604903, + "grad_norm": 12.118700244603959, + "learning_rate": 4.9570775622244384e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.4103448212146759, + "step": 108380 + }, + { + "epoch": 0.1091665231391532, + "grad_norm": 10.901960424015048, + "learning_rate": 4.9570702776776106e-05, + "loss": 2.6154, + "mean_token_accuracy": 0.3586206942796707, + "step": 108385 + }, + { + "epoch": 0.10917155919225736, + "grad_norm": 9.660631562939315, + "learning_rate": 4.957062992518644e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.41034482717514037, + "step": 108390 + }, + { + "epoch": 0.10917659524536154, + "grad_norm": 10.805359119674723, + "learning_rate": 4.95705570674754e-05, + "loss": 3.0089, + "mean_token_accuracy": 0.3448275774717331, + "step": 108395 + }, + { + "epoch": 0.10918163129846571, + "grad_norm": 10.562073921113637, + "learning_rate": 4.9570484203643016e-05, + "loss": 2.2378, + "mean_token_accuracy": 0.5110837459564209, + "step": 108400 + }, + { + "epoch": 0.10918666735156989, + "grad_norm": 9.499923768028935, + "learning_rate": 4.957041133368929e-05, + "loss": 2.2979, + "mean_token_accuracy": 0.42758620977401735, + "step": 108405 + }, + { + "epoch": 0.10919170340467406, + "grad_norm": 13.03946893888272, + "learning_rate": 4.9570338457614265e-05, + "loss": 2.2928, + "mean_token_accuracy": 0.4310344934463501, + "step": 108410 + }, + { + "epoch": 0.10919673945777823, + "grad_norm": 11.417979671059793, + "learning_rate": 4.957026557541794e-05, + "loss": 2.5613, + "mean_token_accuracy": 0.39310345649719236, + "step": 108415 + }, + { + "epoch": 0.1092017755108824, + "grad_norm": 8.99716566265736, + "learning_rate": 4.957019268710035e-05, + "loss": 1.9817, + "mean_token_accuracy": 0.4724137902259827, + "step": 108420 + }, + { + "epoch": 0.10920681156398658, + "grad_norm": 12.568965942508944, + "learning_rate": 4.95701197926615e-05, + "loss": 2.7194, + "mean_token_accuracy": 0.38275861740112305, + "step": 108425 + }, + { + "epoch": 0.10921184761709075, + "grad_norm": 10.503866668828742, + "learning_rate": 4.957004689210143e-05, + "loss": 2.321, + "mean_token_accuracy": 0.4172413766384125, + "step": 108430 + }, + { + "epoch": 0.10921688367019493, + "grad_norm": 11.453502483966966, + "learning_rate": 4.956997398542015e-05, + "loss": 2.4259, + "mean_token_accuracy": 0.4379310369491577, + "step": 108435 + }, + { + "epoch": 0.1092219197232991, + "grad_norm": 9.504855925697074, + "learning_rate": 4.956990107261768e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.4034482717514038, + "step": 108440 + }, + { + "epoch": 0.10922695577640328, + "grad_norm": 7.927636634948441, + "learning_rate": 4.956982815369403e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.47241379618644713, + "step": 108445 + }, + { + "epoch": 0.10923199182950745, + "grad_norm": 9.68166631441675, + "learning_rate": 4.9569755228649244e-05, + "loss": 2.6143, + "mean_token_accuracy": 0.39310344457626345, + "step": 108450 + }, + { + "epoch": 0.10923702788261162, + "grad_norm": 9.734905257771333, + "learning_rate": 4.9569682297483316e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.47241380214691164, + "step": 108455 + }, + { + "epoch": 0.10924206393571578, + "grad_norm": 13.6352396891129, + "learning_rate": 4.9569609360196294e-05, + "loss": 2.6495, + "mean_token_accuracy": 0.3551724165678024, + "step": 108460 + }, + { + "epoch": 0.10924709998881996, + "grad_norm": 10.145225661141481, + "learning_rate": 4.956953641678818e-05, + "loss": 2.1389, + "mean_token_accuracy": 0.47241379618644713, + "step": 108465 + }, + { + "epoch": 0.10925213604192413, + "grad_norm": 11.423748588562828, + "learning_rate": 4.956946346725899e-05, + "loss": 2.341, + "mean_token_accuracy": 0.4137930989265442, + "step": 108470 + }, + { + "epoch": 0.1092571720950283, + "grad_norm": 10.463783815585675, + "learning_rate": 4.956939051160875e-05, + "loss": 2.1605, + "mean_token_accuracy": 0.44137930274009707, + "step": 108475 + }, + { + "epoch": 0.10926220814813248, + "grad_norm": 11.02651498479961, + "learning_rate": 4.956931754983749e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.4241379380226135, + "step": 108480 + }, + { + "epoch": 0.10926724420123665, + "grad_norm": 11.648402800939554, + "learning_rate": 4.9569244581945225e-05, + "loss": 2.6916, + "mean_token_accuracy": 0.36346037983894347, + "step": 108485 + }, + { + "epoch": 0.10927228025434083, + "grad_norm": 12.758083638189552, + "learning_rate": 4.956917160793197e-05, + "loss": 2.677, + "mean_token_accuracy": 0.35172412991523744, + "step": 108490 + }, + { + "epoch": 0.109277316307445, + "grad_norm": 10.346411580881599, + "learning_rate": 4.956909862779774e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.4655172348022461, + "step": 108495 + }, + { + "epoch": 0.10928235236054917, + "grad_norm": 12.511417127285094, + "learning_rate": 4.956902564154258e-05, + "loss": 2.2436, + "mean_token_accuracy": 0.48620688915252686, + "step": 108500 + }, + { + "epoch": 0.10928738841365335, + "grad_norm": 8.916031877945324, + "learning_rate": 4.956895264916648e-05, + "loss": 2.6102, + "mean_token_accuracy": 0.3758620619773865, + "step": 108505 + }, + { + "epoch": 0.10929242446675752, + "grad_norm": 11.873683213908357, + "learning_rate": 4.9568879650669475e-05, + "loss": 2.4268, + "mean_token_accuracy": 0.3931034505367279, + "step": 108510 + }, + { + "epoch": 0.1092974605198617, + "grad_norm": 9.888570492973518, + "learning_rate": 4.956880664605159e-05, + "loss": 2.3682, + "mean_token_accuracy": 0.4137930989265442, + "step": 108515 + }, + { + "epoch": 0.10930249657296587, + "grad_norm": 10.962545228941444, + "learning_rate": 4.956873363531283e-05, + "loss": 2.7676, + "mean_token_accuracy": 0.36896551847457887, + "step": 108520 + }, + { + "epoch": 0.10930753262607004, + "grad_norm": 9.229155459405217, + "learning_rate": 4.9568660618453235e-05, + "loss": 1.9578, + "mean_token_accuracy": 0.4551724135875702, + "step": 108525 + }, + { + "epoch": 0.1093125686791742, + "grad_norm": 11.714011720261418, + "learning_rate": 4.9568587595472805e-05, + "loss": 2.5371, + "mean_token_accuracy": 0.40532365441322327, + "step": 108530 + }, + { + "epoch": 0.10931760473227838, + "grad_norm": 8.694130699884665, + "learning_rate": 4.956851456637157e-05, + "loss": 2.124, + "mean_token_accuracy": 0.43103447556495667, + "step": 108535 + }, + { + "epoch": 0.10932264078538255, + "grad_norm": 9.940895009659549, + "learning_rate": 4.956844153114956e-05, + "loss": 1.9123, + "mean_token_accuracy": 0.5275861978530884, + "step": 108540 + }, + { + "epoch": 0.10932767683848672, + "grad_norm": 11.782744428796434, + "learning_rate": 4.9568368489806773e-05, + "loss": 2.5556, + "mean_token_accuracy": 0.4620689630508423, + "step": 108545 + }, + { + "epoch": 0.1093327128915909, + "grad_norm": 7.097656365248149, + "learning_rate": 4.956829544234325e-05, + "loss": 1.9566, + "mean_token_accuracy": 0.515517246723175, + "step": 108550 + }, + { + "epoch": 0.10933774894469507, + "grad_norm": 16.880874774381244, + "learning_rate": 4.9568222388759e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.37586206793785093, + "step": 108555 + }, + { + "epoch": 0.10934278499779924, + "grad_norm": 11.434189214434877, + "learning_rate": 4.9568149329054056e-05, + "loss": 2.4591, + "mean_token_accuracy": 0.4034482717514038, + "step": 108560 + }, + { + "epoch": 0.10934782105090342, + "grad_norm": 13.50847539939889, + "learning_rate": 4.9568076263228416e-05, + "loss": 2.6547, + "mean_token_accuracy": 0.3999999940395355, + "step": 108565 + }, + { + "epoch": 0.10935285710400759, + "grad_norm": 10.963449505652344, + "learning_rate": 4.9568003191282115e-05, + "loss": 2.4893, + "mean_token_accuracy": 0.4551724135875702, + "step": 108570 + }, + { + "epoch": 0.10935789315711177, + "grad_norm": 11.571722551782264, + "learning_rate": 4.9567930113215175e-05, + "loss": 2.3046, + "mean_token_accuracy": 0.4068965494632721, + "step": 108575 + }, + { + "epoch": 0.10936292921021594, + "grad_norm": 11.300867966490793, + "learning_rate": 4.95678570290276e-05, + "loss": 2.8549, + "mean_token_accuracy": 0.39999999701976774, + "step": 108580 + }, + { + "epoch": 0.10936796526332011, + "grad_norm": 8.203230108640994, + "learning_rate": 4.9567783938719436e-05, + "loss": 2.4367, + "mean_token_accuracy": 0.4853599429130554, + "step": 108585 + }, + { + "epoch": 0.10937300131642429, + "grad_norm": 10.602353265511875, + "learning_rate": 4.9567710842290685e-05, + "loss": 2.5164, + "mean_token_accuracy": 0.38275861740112305, + "step": 108590 + }, + { + "epoch": 0.10937803736952846, + "grad_norm": 10.36125809228581, + "learning_rate": 4.9567637739741375e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.43103448748588563, + "step": 108595 + }, + { + "epoch": 0.10938307342263262, + "grad_norm": 9.579450353640178, + "learning_rate": 4.956756463107152e-05, + "loss": 2.6928, + "mean_token_accuracy": 0.3931034505367279, + "step": 108600 + }, + { + "epoch": 0.1093881094757368, + "grad_norm": 7.815804526294398, + "learning_rate": 4.956749151628115e-05, + "loss": 2.4032, + "mean_token_accuracy": 0.4483968555927277, + "step": 108605 + }, + { + "epoch": 0.10939314552884097, + "grad_norm": 11.501560278260502, + "learning_rate": 4.956741839537027e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.44137930274009707, + "step": 108610 + }, + { + "epoch": 0.10939818158194514, + "grad_norm": 12.306308563207965, + "learning_rate": 4.956734526833891e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.4570477843284607, + "step": 108615 + }, + { + "epoch": 0.10940321763504932, + "grad_norm": 9.596085647339196, + "learning_rate": 4.95672721351871e-05, + "loss": 2.2661, + "mean_token_accuracy": 0.4379310250282288, + "step": 108620 + }, + { + "epoch": 0.10940825368815349, + "grad_norm": 10.933406617928826, + "learning_rate": 4.956719899591484e-05, + "loss": 2.5301, + "mean_token_accuracy": 0.37241379618644715, + "step": 108625 + }, + { + "epoch": 0.10941328974125766, + "grad_norm": 12.024663090448541, + "learning_rate": 4.956712585052216e-05, + "loss": 2.0425, + "mean_token_accuracy": 0.49655171036720275, + "step": 108630 + }, + { + "epoch": 0.10941832579436184, + "grad_norm": 11.365256647370954, + "learning_rate": 4.956705269900908e-05, + "loss": 2.3019, + "mean_token_accuracy": 0.44482759237289426, + "step": 108635 + }, + { + "epoch": 0.10942336184746601, + "grad_norm": 11.117497196982137, + "learning_rate": 4.956697954137563e-05, + "loss": 2.1995, + "mean_token_accuracy": 0.4137930989265442, + "step": 108640 + }, + { + "epoch": 0.10942839790057018, + "grad_norm": 11.286968685953827, + "learning_rate": 4.9566906377621814e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.3896551728248596, + "step": 108645 + }, + { + "epoch": 0.10943343395367436, + "grad_norm": 10.844175633416276, + "learning_rate": 4.9566833207747656e-05, + "loss": 2.4981, + "mean_token_accuracy": 0.41379310488700866, + "step": 108650 + }, + { + "epoch": 0.10943847000677853, + "grad_norm": 10.260610069847061, + "learning_rate": 4.9566760031753176e-05, + "loss": 2.4458, + "mean_token_accuracy": 0.4344827592372894, + "step": 108655 + }, + { + "epoch": 0.1094435060598827, + "grad_norm": 11.182699354731659, + "learning_rate": 4.956668684963841e-05, + "loss": 2.7002, + "mean_token_accuracy": 0.42589232325553894, + "step": 108660 + }, + { + "epoch": 0.10944854211298688, + "grad_norm": 8.458551738372089, + "learning_rate": 4.956661366140336e-05, + "loss": 2.508, + "mean_token_accuracy": 0.37586207389831544, + "step": 108665 + }, + { + "epoch": 0.10945357816609104, + "grad_norm": 11.192587152765487, + "learning_rate": 4.956654046704805e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.38100423812866213, + "step": 108670 + }, + { + "epoch": 0.10945861421919521, + "grad_norm": 11.511134975528474, + "learning_rate": 4.9566467266572514e-05, + "loss": 2.5, + "mean_token_accuracy": 0.3999999940395355, + "step": 108675 + }, + { + "epoch": 0.10946365027229939, + "grad_norm": 12.47021857143296, + "learning_rate": 4.956639405997675e-05, + "loss": 2.0222, + "mean_token_accuracy": 0.5413793087005615, + "step": 108680 + }, + { + "epoch": 0.10946868632540356, + "grad_norm": 11.419565921442782, + "learning_rate": 4.956632084726079e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.38275861740112305, + "step": 108685 + }, + { + "epoch": 0.10947372237850773, + "grad_norm": 11.398851867381527, + "learning_rate": 4.9566247628424655e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.45517241954803467, + "step": 108690 + }, + { + "epoch": 0.10947875843161191, + "grad_norm": 10.492629529401714, + "learning_rate": 4.9566174403468365e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.44482758045196535, + "step": 108695 + }, + { + "epoch": 0.10948379448471608, + "grad_norm": 10.0336889384859, + "learning_rate": 4.956610117239194e-05, + "loss": 2.2245, + "mean_token_accuracy": 0.4569268047809601, + "step": 108700 + }, + { + "epoch": 0.10948883053782026, + "grad_norm": 11.181414314160177, + "learning_rate": 4.9566027935195396e-05, + "loss": 2.2796, + "mean_token_accuracy": 0.43272837400436404, + "step": 108705 + }, + { + "epoch": 0.10949386659092443, + "grad_norm": 8.98544241455164, + "learning_rate": 4.9565954691878765e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.44827585816383364, + "step": 108710 + }, + { + "epoch": 0.1094989026440286, + "grad_norm": 10.57836124799796, + "learning_rate": 4.956588144244205e-05, + "loss": 2.5064, + "mean_token_accuracy": 0.4241379380226135, + "step": 108715 + }, + { + "epoch": 0.10950393869713278, + "grad_norm": 11.9543074173301, + "learning_rate": 4.9565808186885285e-05, + "loss": 2.4164, + "mean_token_accuracy": 0.4, + "step": 108720 + }, + { + "epoch": 0.10950897475023695, + "grad_norm": 9.808049072316026, + "learning_rate": 4.9565734925208476e-05, + "loss": 2.977, + "mean_token_accuracy": 0.3482758551836014, + "step": 108725 + }, + { + "epoch": 0.10951401080334112, + "grad_norm": 9.833083804690995, + "learning_rate": 4.956566165741167e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.4620689630508423, + "step": 108730 + }, + { + "epoch": 0.1095190468564453, + "grad_norm": 13.038763886401624, + "learning_rate": 4.956558838349486e-05, + "loss": 2.3303, + "mean_token_accuracy": 0.43448275327682495, + "step": 108735 + }, + { + "epoch": 0.10952408290954946, + "grad_norm": 10.162877259629278, + "learning_rate": 4.9565515103458074e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.4689655125141144, + "step": 108740 + }, + { + "epoch": 0.10952911896265363, + "grad_norm": 11.539658194386842, + "learning_rate": 4.956544181730134e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.4, + "step": 108745 + }, + { + "epoch": 0.1095341550157578, + "grad_norm": 11.546414194787163, + "learning_rate": 4.9565368525024676e-05, + "loss": 2.5711, + "mean_token_accuracy": 0.3344827651977539, + "step": 108750 + }, + { + "epoch": 0.10953919106886198, + "grad_norm": 7.491873613831233, + "learning_rate": 4.95652952266281e-05, + "loss": 2.024, + "mean_token_accuracy": 0.48844525814056394, + "step": 108755 + }, + { + "epoch": 0.10954422712196615, + "grad_norm": 10.461560499969957, + "learning_rate": 4.9565221922111625e-05, + "loss": 1.9001, + "mean_token_accuracy": 0.5112069010734558, + "step": 108760 + }, + { + "epoch": 0.10954926317507033, + "grad_norm": 9.423412894797863, + "learning_rate": 4.956514861147529e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.42758620381355283, + "step": 108765 + }, + { + "epoch": 0.1095542992281745, + "grad_norm": 9.032713231007905, + "learning_rate": 4.9565075294719086e-05, + "loss": 2.472, + "mean_token_accuracy": 0.4586206912994385, + "step": 108770 + }, + { + "epoch": 0.10955933528127867, + "grad_norm": 9.549320520417819, + "learning_rate": 4.956500197184307e-05, + "loss": 2.336, + "mean_token_accuracy": 0.4344827592372894, + "step": 108775 + }, + { + "epoch": 0.10956437133438285, + "grad_norm": 8.858719270822732, + "learning_rate": 4.956492864284723e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4448275864124298, + "step": 108780 + }, + { + "epoch": 0.10956940738748702, + "grad_norm": 8.972141698119945, + "learning_rate": 4.9564855307731604e-05, + "loss": 2.4366, + "mean_token_accuracy": 0.42413793206214906, + "step": 108785 + }, + { + "epoch": 0.1095744434405912, + "grad_norm": 10.75523328238598, + "learning_rate": 4.956478196649621e-05, + "loss": 2.5833, + "mean_token_accuracy": 0.458620685338974, + "step": 108790 + }, + { + "epoch": 0.10957947949369537, + "grad_norm": 15.136250847038207, + "learning_rate": 4.956470861914107e-05, + "loss": 2.4689, + "mean_token_accuracy": 0.43793103098869324, + "step": 108795 + }, + { + "epoch": 0.10958451554679954, + "grad_norm": 11.161871078113332, + "learning_rate": 4.956463526566619e-05, + "loss": 2.2914, + "mean_token_accuracy": 0.4034482777118683, + "step": 108800 + }, + { + "epoch": 0.10958955159990372, + "grad_norm": 11.104912436014118, + "learning_rate": 4.956456190607161e-05, + "loss": 2.4898, + "mean_token_accuracy": 0.4084089517593384, + "step": 108805 + }, + { + "epoch": 0.10959458765300788, + "grad_norm": 12.26517361156502, + "learning_rate": 4.9564488540357333e-05, + "loss": 2.5938, + "mean_token_accuracy": 0.3862068891525269, + "step": 108810 + }, + { + "epoch": 0.10959962370611205, + "grad_norm": 24.670047407451257, + "learning_rate": 4.9564415168523395e-05, + "loss": 2.9217, + "mean_token_accuracy": 0.35517241060733795, + "step": 108815 + }, + { + "epoch": 0.10960465975921622, + "grad_norm": 9.313628219943586, + "learning_rate": 4.9564341790569814e-05, + "loss": 2.0554, + "mean_token_accuracy": 0.47586206793785096, + "step": 108820 + }, + { + "epoch": 0.1096096958123204, + "grad_norm": 11.91295505229531, + "learning_rate": 4.9564268406496595e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.37586207389831544, + "step": 108825 + }, + { + "epoch": 0.10961473186542457, + "grad_norm": 11.376378925485312, + "learning_rate": 4.9564195016303773e-05, + "loss": 2.2501, + "mean_token_accuracy": 0.46896552443504336, + "step": 108830 + }, + { + "epoch": 0.10961976791852875, + "grad_norm": 12.06563125639636, + "learning_rate": 4.956412161999136e-05, + "loss": 3.1737, + "mean_token_accuracy": 0.29999999403953553, + "step": 108835 + }, + { + "epoch": 0.10962480397163292, + "grad_norm": 10.02406054281331, + "learning_rate": 4.956404821755938e-05, + "loss": 2.5961, + "mean_token_accuracy": 0.3931034505367279, + "step": 108840 + }, + { + "epoch": 0.1096298400247371, + "grad_norm": 10.01634283227325, + "learning_rate": 4.956397480900786e-05, + "loss": 2.7164, + "mean_token_accuracy": 0.3448275923728943, + "step": 108845 + }, + { + "epoch": 0.10963487607784127, + "grad_norm": 10.959486846850748, + "learning_rate": 4.9563901394336816e-05, + "loss": 2.5941, + "mean_token_accuracy": 0.40689654350280763, + "step": 108850 + }, + { + "epoch": 0.10963991213094544, + "grad_norm": 10.777803636831063, + "learning_rate": 4.9563827973546265e-05, + "loss": 2.3814, + "mean_token_accuracy": 0.4206896543502808, + "step": 108855 + }, + { + "epoch": 0.10964494818404961, + "grad_norm": 11.234310989495029, + "learning_rate": 4.9563754546636225e-05, + "loss": 2.1941, + "mean_token_accuracy": 0.5137930870056152, + "step": 108860 + }, + { + "epoch": 0.10964998423715379, + "grad_norm": 13.029129294493176, + "learning_rate": 4.9563681113606725e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.4482758641242981, + "step": 108865 + }, + { + "epoch": 0.10965502029025796, + "grad_norm": 13.2264640952751, + "learning_rate": 4.956360767445777e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.45347853302955626, + "step": 108870 + }, + { + "epoch": 0.10966005634336214, + "grad_norm": 10.588105514833211, + "learning_rate": 4.95635342291894e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.4310344815254211, + "step": 108875 + }, + { + "epoch": 0.1096650923964663, + "grad_norm": 8.38986348488346, + "learning_rate": 4.956346077780163e-05, + "loss": 2.4858, + "mean_token_accuracy": 0.42280701994895936, + "step": 108880 + }, + { + "epoch": 0.10967012844957047, + "grad_norm": 11.367896400979324, + "learning_rate": 4.9563387320294475e-05, + "loss": 2.5733, + "mean_token_accuracy": 0.4137930989265442, + "step": 108885 + }, + { + "epoch": 0.10967516450267464, + "grad_norm": 10.94798172670199, + "learning_rate": 4.9563313856667956e-05, + "loss": 2.8079, + "mean_token_accuracy": 0.3655172407627106, + "step": 108890 + }, + { + "epoch": 0.10968020055577882, + "grad_norm": 9.997614792450273, + "learning_rate": 4.9563240386922085e-05, + "loss": 2.559, + "mean_token_accuracy": 0.4363581418991089, + "step": 108895 + }, + { + "epoch": 0.10968523660888299, + "grad_norm": 12.603090892798074, + "learning_rate": 4.95631669110569e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.3965517282485962, + "step": 108900 + }, + { + "epoch": 0.10969027266198716, + "grad_norm": 12.577871356908824, + "learning_rate": 4.9563093429072414e-05, + "loss": 2.2132, + "mean_token_accuracy": 0.45517241954803467, + "step": 108905 + }, + { + "epoch": 0.10969530871509134, + "grad_norm": 11.31073341549833, + "learning_rate": 4.956301994096865e-05, + "loss": 2.2594, + "mean_token_accuracy": 0.4379310369491577, + "step": 108910 + }, + { + "epoch": 0.10970034476819551, + "grad_norm": 10.245542774431373, + "learning_rate": 4.9562946446745626e-05, + "loss": 2.4868, + "mean_token_accuracy": 0.36896551847457887, + "step": 108915 + }, + { + "epoch": 0.10970538082129969, + "grad_norm": 11.499372335243203, + "learning_rate": 4.956287294640335e-05, + "loss": 2.4993, + "mean_token_accuracy": 0.41379310488700866, + "step": 108920 + }, + { + "epoch": 0.10971041687440386, + "grad_norm": 11.5584277646368, + "learning_rate": 4.956279943994186e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.44361767172813416, + "step": 108925 + }, + { + "epoch": 0.10971545292750803, + "grad_norm": 11.26854249102457, + "learning_rate": 4.9562725927361175e-05, + "loss": 2.704, + "mean_token_accuracy": 0.4413793087005615, + "step": 108930 + }, + { + "epoch": 0.10972048898061221, + "grad_norm": 12.244780790930013, + "learning_rate": 4.9562652408661306e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.39310344457626345, + "step": 108935 + }, + { + "epoch": 0.10972552503371638, + "grad_norm": 9.966766507119857, + "learning_rate": 4.9562578883842274e-05, + "loss": 2.1889, + "mean_token_accuracy": 0.4655172348022461, + "step": 108940 + }, + { + "epoch": 0.10973056108682055, + "grad_norm": 9.487043282670616, + "learning_rate": 4.956250535290411e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4517241418361664, + "step": 108945 + }, + { + "epoch": 0.10973559713992471, + "grad_norm": 10.76844236529563, + "learning_rate": 4.956243181584683e-05, + "loss": 2.1593, + "mean_token_accuracy": 0.4344827592372894, + "step": 108950 + }, + { + "epoch": 0.10974063319302889, + "grad_norm": 14.655030719350563, + "learning_rate": 4.9562358272670446e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.43793103098869324, + "step": 108955 + }, + { + "epoch": 0.10974566924613306, + "grad_norm": 9.489390959980613, + "learning_rate": 4.956228472337499e-05, + "loss": 2.4256, + "mean_token_accuracy": 0.42068966031074523, + "step": 108960 + }, + { + "epoch": 0.10975070529923724, + "grad_norm": 8.45499594880466, + "learning_rate": 4.956221116796047e-05, + "loss": 2.1903, + "mean_token_accuracy": 0.4862068951129913, + "step": 108965 + }, + { + "epoch": 0.10975574135234141, + "grad_norm": 10.663743035899332, + "learning_rate": 4.9562137606426915e-05, + "loss": 2.2014, + "mean_token_accuracy": 0.44482759237289426, + "step": 108970 + }, + { + "epoch": 0.10976077740544558, + "grad_norm": 8.931933969285565, + "learning_rate": 4.9562064038774355e-05, + "loss": 2.4114, + "mean_token_accuracy": 0.4241379380226135, + "step": 108975 + }, + { + "epoch": 0.10976581345854976, + "grad_norm": 10.06488116436584, + "learning_rate": 4.956199046500279e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.4689655065536499, + "step": 108980 + }, + { + "epoch": 0.10977084951165393, + "grad_norm": 9.62908143732193, + "learning_rate": 4.956191688511225e-05, + "loss": 2.677, + "mean_token_accuracy": 0.4034482777118683, + "step": 108985 + }, + { + "epoch": 0.1097758855647581, + "grad_norm": 11.612168542583195, + "learning_rate": 4.956184329910275e-05, + "loss": 2.4342, + "mean_token_accuracy": 0.3965517282485962, + "step": 108990 + }, + { + "epoch": 0.10978092161786228, + "grad_norm": 10.823624749613211, + "learning_rate": 4.956176970697432e-05, + "loss": 2.7756, + "mean_token_accuracy": 0.37586206793785093, + "step": 108995 + }, + { + "epoch": 0.10978595767096645, + "grad_norm": 8.696694680540846, + "learning_rate": 4.956169610872697e-05, + "loss": 2.2633, + "mean_token_accuracy": 0.443254691362381, + "step": 109000 + }, + { + "epoch": 0.10979099372407063, + "grad_norm": 9.115203828898157, + "learning_rate": 4.9561622504360744e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.4068965554237366, + "step": 109005 + }, + { + "epoch": 0.1097960297771748, + "grad_norm": 10.394024740651497, + "learning_rate": 4.956154889387563e-05, + "loss": 2.4039, + "mean_token_accuracy": 0.42262552976608275, + "step": 109010 + }, + { + "epoch": 0.10980106583027897, + "grad_norm": 12.558174560202506, + "learning_rate": 4.956147527727167e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.4482758641242981, + "step": 109015 + }, + { + "epoch": 0.10980610188338313, + "grad_norm": 13.029078177801535, + "learning_rate": 4.956140165454887e-05, + "loss": 2.3651, + "mean_token_accuracy": 0.4137930929660797, + "step": 109020 + }, + { + "epoch": 0.10981113793648731, + "grad_norm": 8.719260535972563, + "learning_rate": 4.956132802570726e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.48620688915252686, + "step": 109025 + }, + { + "epoch": 0.10981617398959148, + "grad_norm": 14.97062534915284, + "learning_rate": 4.9561254390746866e-05, + "loss": 2.6109, + "mean_token_accuracy": 0.3862069010734558, + "step": 109030 + }, + { + "epoch": 0.10982121004269566, + "grad_norm": 9.595315478970837, + "learning_rate": 4.956118074966769e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.42758620977401735, + "step": 109035 + }, + { + "epoch": 0.10982624609579983, + "grad_norm": 10.244274513043388, + "learning_rate": 4.956110710246977e-05, + "loss": 2.2588, + "mean_token_accuracy": 0.4344827473163605, + "step": 109040 + }, + { + "epoch": 0.109831282148904, + "grad_norm": 11.507146602902875, + "learning_rate": 4.9561033449153124e-05, + "loss": 2.7435, + "mean_token_accuracy": 0.3793103456497192, + "step": 109045 + }, + { + "epoch": 0.10983631820200818, + "grad_norm": 8.293472591177698, + "learning_rate": 4.956095978971776e-05, + "loss": 1.7032, + "mean_token_accuracy": 0.5572292804718018, + "step": 109050 + }, + { + "epoch": 0.10984135425511235, + "grad_norm": 11.360813037703537, + "learning_rate": 4.9560886124163704e-05, + "loss": 2.936, + "mean_token_accuracy": 0.3517241358757019, + "step": 109055 + }, + { + "epoch": 0.10984639030821652, + "grad_norm": 9.135239364974659, + "learning_rate": 4.9560812452490985e-05, + "loss": 2.1318, + "mean_token_accuracy": 0.5000000059604645, + "step": 109060 + }, + { + "epoch": 0.1098514263613207, + "grad_norm": 14.214867158708788, + "learning_rate": 4.956073877469961e-05, + "loss": 2.4464, + "mean_token_accuracy": 0.4241379380226135, + "step": 109065 + }, + { + "epoch": 0.10985646241442487, + "grad_norm": 10.533282752688555, + "learning_rate": 4.956066509078962e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.4551724135875702, + "step": 109070 + }, + { + "epoch": 0.10986149846752905, + "grad_norm": 11.67710087480588, + "learning_rate": 4.956059140076101e-05, + "loss": 2.1866, + "mean_token_accuracy": 0.46442831158638, + "step": 109075 + }, + { + "epoch": 0.10986653452063322, + "grad_norm": 10.756572263131078, + "learning_rate": 4.9560517704613824e-05, + "loss": 2.5072, + "mean_token_accuracy": 0.42413792610168455, + "step": 109080 + }, + { + "epoch": 0.10987157057373739, + "grad_norm": 13.000984900594185, + "learning_rate": 4.9560444002348064e-05, + "loss": 2.5627, + "mean_token_accuracy": 0.41034482717514037, + "step": 109085 + }, + { + "epoch": 0.10987660662684155, + "grad_norm": 9.61400504338785, + "learning_rate": 4.956037029396376e-05, + "loss": 2.5575, + "mean_token_accuracy": 0.41034482717514037, + "step": 109090 + }, + { + "epoch": 0.10988164267994573, + "grad_norm": 10.147762278408658, + "learning_rate": 4.956029657946093e-05, + "loss": 2.0324, + "mean_token_accuracy": 0.5034482777118683, + "step": 109095 + }, + { + "epoch": 0.1098866787330499, + "grad_norm": 11.882602427341729, + "learning_rate": 4.956022285883959e-05, + "loss": 2.4902, + "mean_token_accuracy": 0.3931034505367279, + "step": 109100 + }, + { + "epoch": 0.10989171478615407, + "grad_norm": 9.907048937390432, + "learning_rate": 4.956014913209976e-05, + "loss": 2.2911, + "mean_token_accuracy": 0.41379311084747317, + "step": 109105 + }, + { + "epoch": 0.10989675083925825, + "grad_norm": 9.123481824334343, + "learning_rate": 4.956007539924148e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.4034482717514038, + "step": 109110 + }, + { + "epoch": 0.10990178689236242, + "grad_norm": 10.32365337270559, + "learning_rate": 4.9560001660264746e-05, + "loss": 2.3214, + "mean_token_accuracy": 0.44482757449150084, + "step": 109115 + }, + { + "epoch": 0.1099068229454666, + "grad_norm": 9.666834411935854, + "learning_rate": 4.955992791516959e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.4172413766384125, + "step": 109120 + }, + { + "epoch": 0.10991185899857077, + "grad_norm": 10.090616537661079, + "learning_rate": 4.9559854163956035e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.4698275923728943, + "step": 109125 + }, + { + "epoch": 0.10991689505167494, + "grad_norm": 11.80025952737308, + "learning_rate": 4.9559780406624094e-05, + "loss": 2.6937, + "mean_token_accuracy": 0.4137930989265442, + "step": 109130 + }, + { + "epoch": 0.10992193110477912, + "grad_norm": 9.942229468930933, + "learning_rate": 4.955970664317379e-05, + "loss": 2.2458, + "mean_token_accuracy": 0.43793103098869324, + "step": 109135 + }, + { + "epoch": 0.10992696715788329, + "grad_norm": 10.687084895712541, + "learning_rate": 4.955963287360514e-05, + "loss": 2.7939, + "mean_token_accuracy": 0.41034482717514037, + "step": 109140 + }, + { + "epoch": 0.10993200321098746, + "grad_norm": 11.331095515013114, + "learning_rate": 4.955955909791818e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.42413793206214906, + "step": 109145 + }, + { + "epoch": 0.10993703926409164, + "grad_norm": 8.395247658307, + "learning_rate": 4.955948531611291e-05, + "loss": 2.0727, + "mean_token_accuracy": 0.44664247035980226, + "step": 109150 + }, + { + "epoch": 0.10994207531719581, + "grad_norm": 9.1294058608412, + "learning_rate": 4.955941152818936e-05, + "loss": 2.6236, + "mean_token_accuracy": 0.3862069010734558, + "step": 109155 + }, + { + "epoch": 0.10994711137029997, + "grad_norm": 9.572630979379975, + "learning_rate": 4.955933773414755e-05, + "loss": 2.0219, + "mean_token_accuracy": 0.4551724135875702, + "step": 109160 + }, + { + "epoch": 0.10995214742340415, + "grad_norm": 9.536677659619636, + "learning_rate": 4.9559263933987504e-05, + "loss": 2.0314, + "mean_token_accuracy": 0.46551724672317507, + "step": 109165 + }, + { + "epoch": 0.10995718347650832, + "grad_norm": 10.207024342914838, + "learning_rate": 4.9559190127709235e-05, + "loss": 2.594, + "mean_token_accuracy": 0.3827586203813553, + "step": 109170 + }, + { + "epoch": 0.10996221952961249, + "grad_norm": 10.562118068320435, + "learning_rate": 4.955911631531277e-05, + "loss": 2.3512, + "mean_token_accuracy": 0.4310344815254211, + "step": 109175 + }, + { + "epoch": 0.10996725558271667, + "grad_norm": 9.805271019172876, + "learning_rate": 4.955904249679813e-05, + "loss": 2.174, + "mean_token_accuracy": 0.46551724076271056, + "step": 109180 + }, + { + "epoch": 0.10997229163582084, + "grad_norm": 10.900952548385096, + "learning_rate": 4.955896867216532e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.4327888786792755, + "step": 109185 + }, + { + "epoch": 0.10997732768892501, + "grad_norm": 9.016507243162119, + "learning_rate": 4.955889484141439e-05, + "loss": 2.2283, + "mean_token_accuracy": 0.46896551847457885, + "step": 109190 + }, + { + "epoch": 0.10998236374202919, + "grad_norm": 11.541788444179556, + "learning_rate": 4.955882100454533e-05, + "loss": 2.1302, + "mean_token_accuracy": 0.4537810027599335, + "step": 109195 + }, + { + "epoch": 0.10998739979513336, + "grad_norm": 9.921947478533077, + "learning_rate": 4.955874716155818e-05, + "loss": 2.1204, + "mean_token_accuracy": 0.49655172824859617, + "step": 109200 + }, + { + "epoch": 0.10999243584823754, + "grad_norm": 10.520993757513427, + "learning_rate": 4.9558673312452954e-05, + "loss": 2.3247, + "mean_token_accuracy": 0.39655172228813174, + "step": 109205 + }, + { + "epoch": 0.10999747190134171, + "grad_norm": 10.197429672015701, + "learning_rate": 4.955859945722967e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.42413793206214906, + "step": 109210 + }, + { + "epoch": 0.11000250795444588, + "grad_norm": 11.577869560983032, + "learning_rate": 4.955852559588835e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.42758620381355283, + "step": 109215 + }, + { + "epoch": 0.11000754400755006, + "grad_norm": 11.605556582082269, + "learning_rate": 4.955845172842902e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.44482759237289426, + "step": 109220 + }, + { + "epoch": 0.11001258006065423, + "grad_norm": 11.703510667040309, + "learning_rate": 4.955837785485169e-05, + "loss": 2.5897, + "mean_token_accuracy": 0.42758620977401735, + "step": 109225 + }, + { + "epoch": 0.11001761611375839, + "grad_norm": 10.582449903275375, + "learning_rate": 4.955830397515639e-05, + "loss": 2.832, + "mean_token_accuracy": 0.33793103098869326, + "step": 109230 + }, + { + "epoch": 0.11002265216686256, + "grad_norm": 9.240855462927925, + "learning_rate": 4.9558230089343136e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.3965517282485962, + "step": 109235 + }, + { + "epoch": 0.11002768821996674, + "grad_norm": 12.371712584977232, + "learning_rate": 4.9558156197411954e-05, + "loss": 2.4259, + "mean_token_accuracy": 0.4, + "step": 109240 + }, + { + "epoch": 0.11003272427307091, + "grad_norm": 12.781668667436623, + "learning_rate": 4.9558082299362856e-05, + "loss": 2.4081, + "mean_token_accuracy": 0.38620689511299133, + "step": 109245 + }, + { + "epoch": 0.11003776032617509, + "grad_norm": 12.500905724944795, + "learning_rate": 4.9558008395195863e-05, + "loss": 2.7141, + "mean_token_accuracy": 0.37241379618644715, + "step": 109250 + }, + { + "epoch": 0.11004279637927926, + "grad_norm": 14.045517669688499, + "learning_rate": 4.955793448491101e-05, + "loss": 2.6004, + "mean_token_accuracy": 0.39655172228813174, + "step": 109255 + }, + { + "epoch": 0.11004783243238343, + "grad_norm": 9.73210681947098, + "learning_rate": 4.95578605685083e-05, + "loss": 2.0796, + "mean_token_accuracy": 0.4586206912994385, + "step": 109260 + }, + { + "epoch": 0.1100528684854876, + "grad_norm": 16.367300958170205, + "learning_rate": 4.9557786645987756e-05, + "loss": 2.469, + "mean_token_accuracy": 0.4310344815254211, + "step": 109265 + }, + { + "epoch": 0.11005790453859178, + "grad_norm": 12.026634040401063, + "learning_rate": 4.9557712717349405e-05, + "loss": 2.7868, + "mean_token_accuracy": 0.38275861740112305, + "step": 109270 + }, + { + "epoch": 0.11006294059169595, + "grad_norm": 11.97665012264357, + "learning_rate": 4.9557638782593275e-05, + "loss": 2.3032, + "mean_token_accuracy": 0.4620689630508423, + "step": 109275 + }, + { + "epoch": 0.11006797664480013, + "grad_norm": 8.873279672474425, + "learning_rate": 4.955756484171937e-05, + "loss": 2.1302, + "mean_token_accuracy": 0.49655171632766726, + "step": 109280 + }, + { + "epoch": 0.1100730126979043, + "grad_norm": 12.927814400287867, + "learning_rate": 4.955749089472771e-05, + "loss": 2.5186, + "mean_token_accuracy": 0.4103448331356049, + "step": 109285 + }, + { + "epoch": 0.11007804875100848, + "grad_norm": 11.353392999280747, + "learning_rate": 4.955741694161833e-05, + "loss": 2.6087, + "mean_token_accuracy": 0.3931034505367279, + "step": 109290 + }, + { + "epoch": 0.11008308480411265, + "grad_norm": 8.32873067447434, + "learning_rate": 4.955734298239124e-05, + "loss": 2.1706, + "mean_token_accuracy": 0.47241378426551817, + "step": 109295 + }, + { + "epoch": 0.11008812085721681, + "grad_norm": 11.388515903319364, + "learning_rate": 4.9557269017046466e-05, + "loss": 2.7546, + "mean_token_accuracy": 0.4068965494632721, + "step": 109300 + }, + { + "epoch": 0.11009315691032098, + "grad_norm": 14.069913783782553, + "learning_rate": 4.955719504558403e-05, + "loss": 2.5818, + "mean_token_accuracy": 0.4, + "step": 109305 + }, + { + "epoch": 0.11009819296342516, + "grad_norm": 10.990039738625331, + "learning_rate": 4.955712106800394e-05, + "loss": 3.1273, + "mean_token_accuracy": 0.38275861740112305, + "step": 109310 + }, + { + "epoch": 0.11010322901652933, + "grad_norm": 12.389077670206957, + "learning_rate": 4.955704708430623e-05, + "loss": 2.7016, + "mean_token_accuracy": 0.4413793087005615, + "step": 109315 + }, + { + "epoch": 0.1101082650696335, + "grad_norm": 12.905178621033356, + "learning_rate": 4.9556973094490916e-05, + "loss": 2.8666, + "mean_token_accuracy": 0.43448275327682495, + "step": 109320 + }, + { + "epoch": 0.11011330112273768, + "grad_norm": 10.732526781395356, + "learning_rate": 4.955689909855802e-05, + "loss": 3.0251, + "mean_token_accuracy": 0.3379310339689255, + "step": 109325 + }, + { + "epoch": 0.11011833717584185, + "grad_norm": 10.722491266444997, + "learning_rate": 4.9556825096507554e-05, + "loss": 2.5219, + "mean_token_accuracy": 0.41379310190677643, + "step": 109330 + }, + { + "epoch": 0.11012337322894603, + "grad_norm": 10.846609840891913, + "learning_rate": 4.9556751088339545e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.4517241358757019, + "step": 109335 + }, + { + "epoch": 0.1101284092820502, + "grad_norm": 12.023357898130989, + "learning_rate": 4.955667707405403e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.441379314661026, + "step": 109340 + }, + { + "epoch": 0.11013344533515437, + "grad_norm": 10.845900272437596, + "learning_rate": 4.955660305365099e-05, + "loss": 2.9428, + "mean_token_accuracy": 0.34482758641242983, + "step": 109345 + }, + { + "epoch": 0.11013848138825855, + "grad_norm": 10.35951877913365, + "learning_rate": 4.9556529027130484e-05, + "loss": 2.5158, + "mean_token_accuracy": 0.3931034505367279, + "step": 109350 + }, + { + "epoch": 0.11014351744136272, + "grad_norm": 11.070199972009435, + "learning_rate": 4.955645499449251e-05, + "loss": 2.212, + "mean_token_accuracy": 0.4344827592372894, + "step": 109355 + }, + { + "epoch": 0.1101485534944669, + "grad_norm": 13.00614912999903, + "learning_rate": 4.9556380955737094e-05, + "loss": 2.0353, + "mean_token_accuracy": 0.5206896543502808, + "step": 109360 + }, + { + "epoch": 0.11015358954757107, + "grad_norm": 11.877169643945226, + "learning_rate": 4.9556306910864274e-05, + "loss": 2.5716, + "mean_token_accuracy": 0.41034482717514037, + "step": 109365 + }, + { + "epoch": 0.11015862560067523, + "grad_norm": 7.635389021133433, + "learning_rate": 4.955623285987405e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.4793103516101837, + "step": 109370 + }, + { + "epoch": 0.1101636616537794, + "grad_norm": 11.77651774718235, + "learning_rate": 4.955615880276643e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4206896543502808, + "step": 109375 + }, + { + "epoch": 0.11016869770688358, + "grad_norm": 9.140364914867904, + "learning_rate": 4.9556084739541465e-05, + "loss": 2.1714, + "mean_token_accuracy": 0.482758617401123, + "step": 109380 + }, + { + "epoch": 0.11017373375998775, + "grad_norm": 9.969413366228334, + "learning_rate": 4.9556010670199165e-05, + "loss": 2.4727, + "mean_token_accuracy": 0.4000000059604645, + "step": 109385 + }, + { + "epoch": 0.11017876981309192, + "grad_norm": 9.655183222443686, + "learning_rate": 4.955593659473955e-05, + "loss": 1.9601, + "mean_token_accuracy": 0.46896552443504336, + "step": 109390 + }, + { + "epoch": 0.1101838058661961, + "grad_norm": 13.220295950243484, + "learning_rate": 4.9555862513162626e-05, + "loss": 2.6718, + "mean_token_accuracy": 0.41379311084747317, + "step": 109395 + }, + { + "epoch": 0.11018884191930027, + "grad_norm": 13.056702115673938, + "learning_rate": 4.9555788425468435e-05, + "loss": 2.5597, + "mean_token_accuracy": 0.4068965494632721, + "step": 109400 + }, + { + "epoch": 0.11019387797240444, + "grad_norm": 12.313428856175328, + "learning_rate": 4.9555714331656986e-05, + "loss": 2.4862, + "mean_token_accuracy": 0.3931034505367279, + "step": 109405 + }, + { + "epoch": 0.11019891402550862, + "grad_norm": 12.010561202697476, + "learning_rate": 4.9555640231728305e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.44827585220336913, + "step": 109410 + }, + { + "epoch": 0.11020395007861279, + "grad_norm": 9.549941453729044, + "learning_rate": 4.9555566125682414e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.42758620977401735, + "step": 109415 + }, + { + "epoch": 0.11020898613171697, + "grad_norm": 11.571261226572455, + "learning_rate": 4.9555492013519325e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.4594827651977539, + "step": 109420 + }, + { + "epoch": 0.11021402218482114, + "grad_norm": 9.978467544062743, + "learning_rate": 4.9555417895239066e-05, + "loss": 2.2097, + "mean_token_accuracy": 0.46206897497177124, + "step": 109425 + }, + { + "epoch": 0.11021905823792531, + "grad_norm": 9.22213391527546, + "learning_rate": 4.955534377084165e-05, + "loss": 2.1686, + "mean_token_accuracy": 0.4620689630508423, + "step": 109430 + }, + { + "epoch": 0.11022409429102949, + "grad_norm": 15.63808825943289, + "learning_rate": 4.9555269640327104e-05, + "loss": 2.7615, + "mean_token_accuracy": 0.3758620649576187, + "step": 109435 + }, + { + "epoch": 0.11022913034413365, + "grad_norm": 10.900510942479785, + "learning_rate": 4.955519550369544e-05, + "loss": 2.8324, + "mean_token_accuracy": 0.3655172437429428, + "step": 109440 + }, + { + "epoch": 0.11023416639723782, + "grad_norm": 11.397674057538907, + "learning_rate": 4.95551213609467e-05, + "loss": 2.4514, + "mean_token_accuracy": 0.3965517282485962, + "step": 109445 + }, + { + "epoch": 0.110239202450342, + "grad_norm": 10.75236197834535, + "learning_rate": 4.955504721208087e-05, + "loss": 3.2443, + "mean_token_accuracy": 0.36896551549434664, + "step": 109450 + }, + { + "epoch": 0.11024423850344617, + "grad_norm": 9.8912308325458, + "learning_rate": 4.9554973057098005e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.44827585816383364, + "step": 109455 + }, + { + "epoch": 0.11024927455655034, + "grad_norm": 8.207547283177636, + "learning_rate": 4.955489889599811e-05, + "loss": 2.1444, + "mean_token_accuracy": 0.5295825719833374, + "step": 109460 + }, + { + "epoch": 0.11025431060965452, + "grad_norm": 12.073450475059984, + "learning_rate": 4.9554824728781206e-05, + "loss": 2.1309, + "mean_token_accuracy": 0.48620688915252686, + "step": 109465 + }, + { + "epoch": 0.11025934666275869, + "grad_norm": 14.434298059952658, + "learning_rate": 4.955475055544731e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.43103447556495667, + "step": 109470 + }, + { + "epoch": 0.11026438271586286, + "grad_norm": 9.406792768219908, + "learning_rate": 4.955467637599645e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.3896551728248596, + "step": 109475 + }, + { + "epoch": 0.11026941876896704, + "grad_norm": 11.76848425384687, + "learning_rate": 4.955460219042864e-05, + "loss": 2.6666, + "mean_token_accuracy": 0.3896551728248596, + "step": 109480 + }, + { + "epoch": 0.11027445482207121, + "grad_norm": 9.193786222754824, + "learning_rate": 4.95545279987439e-05, + "loss": 2.4146, + "mean_token_accuracy": 0.39491832852363584, + "step": 109485 + }, + { + "epoch": 0.11027949087517538, + "grad_norm": 14.46834330140287, + "learning_rate": 4.9554453800942265e-05, + "loss": 2.2428, + "mean_token_accuracy": 0.4275861978530884, + "step": 109490 + }, + { + "epoch": 0.11028452692827956, + "grad_norm": 10.469294607553444, + "learning_rate": 4.955437959702374e-05, + "loss": 2.1871, + "mean_token_accuracy": 0.45517240166664125, + "step": 109495 + }, + { + "epoch": 0.11028956298138373, + "grad_norm": 9.827957063673365, + "learning_rate": 4.955430538698835e-05, + "loss": 2.0378, + "mean_token_accuracy": 0.5298850536346436, + "step": 109500 + }, + { + "epoch": 0.1102945990344879, + "grad_norm": 10.638137693186344, + "learning_rate": 4.955423117083612e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.4620689630508423, + "step": 109505 + }, + { + "epoch": 0.11029963508759207, + "grad_norm": 11.747277135006366, + "learning_rate": 4.955415694856706e-05, + "loss": 2.47, + "mean_token_accuracy": 0.3896551728248596, + "step": 109510 + }, + { + "epoch": 0.11030467114069624, + "grad_norm": 9.601980644215656, + "learning_rate": 4.95540827201812e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.3758620619773865, + "step": 109515 + }, + { + "epoch": 0.11030970719380041, + "grad_norm": 13.986962082498318, + "learning_rate": 4.9554008485678564e-05, + "loss": 2.4506, + "mean_token_accuracy": 0.4206896543502808, + "step": 109520 + }, + { + "epoch": 0.11031474324690459, + "grad_norm": 9.2992598269699, + "learning_rate": 4.955393424505916e-05, + "loss": 2.008, + "mean_token_accuracy": 0.482758617401123, + "step": 109525 + }, + { + "epoch": 0.11031977930000876, + "grad_norm": 12.521267897572608, + "learning_rate": 4.9553859998323014e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.4551724076271057, + "step": 109530 + }, + { + "epoch": 0.11032481535311293, + "grad_norm": 11.551533419814339, + "learning_rate": 4.955378574547015e-05, + "loss": 2.3207, + "mean_token_accuracy": 0.4482758641242981, + "step": 109535 + }, + { + "epoch": 0.11032985140621711, + "grad_norm": 9.4768386392394, + "learning_rate": 4.9553711486500586e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.48965516686439514, + "step": 109540 + }, + { + "epoch": 0.11033488745932128, + "grad_norm": 8.915524231201431, + "learning_rate": 4.9553637221414345e-05, + "loss": 2.1285, + "mean_token_accuracy": 0.441379314661026, + "step": 109545 + }, + { + "epoch": 0.11033992351242546, + "grad_norm": 9.158754359256083, + "learning_rate": 4.955356295021144e-05, + "loss": 2.6137, + "mean_token_accuracy": 0.3655172407627106, + "step": 109550 + }, + { + "epoch": 0.11034495956552963, + "grad_norm": 10.334274787792156, + "learning_rate": 4.9553488672891904e-05, + "loss": 2.3052, + "mean_token_accuracy": 0.4517241299152374, + "step": 109555 + }, + { + "epoch": 0.1103499956186338, + "grad_norm": 14.83845485644772, + "learning_rate": 4.9553414389455745e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4655172348022461, + "step": 109560 + }, + { + "epoch": 0.11035503167173798, + "grad_norm": 10.952943568136478, + "learning_rate": 4.955334009990299e-05, + "loss": 2.3906, + "mean_token_accuracy": 0.4103448212146759, + "step": 109565 + }, + { + "epoch": 0.11036006772484215, + "grad_norm": 11.095427012312488, + "learning_rate": 4.955326580423367e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.4241379380226135, + "step": 109570 + }, + { + "epoch": 0.11036510377794632, + "grad_norm": 10.372240080987558, + "learning_rate": 4.955319150244778e-05, + "loss": 2.0447, + "mean_token_accuracy": 0.5068965494632721, + "step": 109575 + }, + { + "epoch": 0.11037013983105048, + "grad_norm": 9.03514956929517, + "learning_rate": 4.955311719454536e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.44137930274009707, + "step": 109580 + }, + { + "epoch": 0.11037517588415466, + "grad_norm": 9.427176315848945, + "learning_rate": 4.9553042880526425e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4689655125141144, + "step": 109585 + }, + { + "epoch": 0.11038021193725883, + "grad_norm": 10.254788826424004, + "learning_rate": 4.955296856039099e-05, + "loss": 2.7149, + "mean_token_accuracy": 0.3655172407627106, + "step": 109590 + }, + { + "epoch": 0.110385247990363, + "grad_norm": 11.058537366012793, + "learning_rate": 4.955289423413909e-05, + "loss": 2.0562, + "mean_token_accuracy": 0.5034482777118683, + "step": 109595 + }, + { + "epoch": 0.11039028404346718, + "grad_norm": 10.674820923840143, + "learning_rate": 4.955281990177074e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.4344827651977539, + "step": 109600 + }, + { + "epoch": 0.11039532009657135, + "grad_norm": 9.315257241279388, + "learning_rate": 4.955274556328595e-05, + "loss": 2.5583, + "mean_token_accuracy": 0.4517241358757019, + "step": 109605 + }, + { + "epoch": 0.11040035614967553, + "grad_norm": 11.023524514206002, + "learning_rate": 4.955267121868476e-05, + "loss": 2.5568, + "mean_token_accuracy": 0.3965517282485962, + "step": 109610 + }, + { + "epoch": 0.1104053922027797, + "grad_norm": 13.108712415522671, + "learning_rate": 4.955259686796716e-05, + "loss": 2.7166, + "mean_token_accuracy": 0.41034482717514037, + "step": 109615 + }, + { + "epoch": 0.11041042825588387, + "grad_norm": 9.515249960031287, + "learning_rate": 4.9552522511133206e-05, + "loss": 2.1482, + "mean_token_accuracy": 0.458620685338974, + "step": 109620 + }, + { + "epoch": 0.11041546430898805, + "grad_norm": 13.48738857296415, + "learning_rate": 4.95524481481829e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.4206896543502808, + "step": 109625 + }, + { + "epoch": 0.11042050036209222, + "grad_norm": 16.671338326199596, + "learning_rate": 4.955237377911627e-05, + "loss": 2.5432, + "mean_token_accuracy": 0.4172413766384125, + "step": 109630 + }, + { + "epoch": 0.1104255364151964, + "grad_norm": 10.403399271466787, + "learning_rate": 4.955229940393332e-05, + "loss": 2.4631, + "mean_token_accuracy": 0.41379310488700866, + "step": 109635 + }, + { + "epoch": 0.11043057246830057, + "grad_norm": 11.389975719114739, + "learning_rate": 4.9552225022634094e-05, + "loss": 2.395, + "mean_token_accuracy": 0.43448275327682495, + "step": 109640 + }, + { + "epoch": 0.11043560852140474, + "grad_norm": 13.346718168449666, + "learning_rate": 4.955215063521859e-05, + "loss": 2.2772, + "mean_token_accuracy": 0.4586206912994385, + "step": 109645 + }, + { + "epoch": 0.1104406445745089, + "grad_norm": 32.283932634229714, + "learning_rate": 4.955207624168685e-05, + "loss": 2.7353, + "mean_token_accuracy": 0.4448275864124298, + "step": 109650 + }, + { + "epoch": 0.11044568062761308, + "grad_norm": 12.385272983795435, + "learning_rate": 4.955200184203888e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.3379310369491577, + "step": 109655 + }, + { + "epoch": 0.11045071668071725, + "grad_norm": 11.1051776907032, + "learning_rate": 4.955192743627471e-05, + "loss": 2.6471, + "mean_token_accuracy": 0.41379310488700866, + "step": 109660 + }, + { + "epoch": 0.11045575273382142, + "grad_norm": 10.874896130766787, + "learning_rate": 4.955185302439435e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.44827585816383364, + "step": 109665 + }, + { + "epoch": 0.1104607887869256, + "grad_norm": 9.292569251972836, + "learning_rate": 4.955177860639783e-05, + "loss": 2.3477, + "mean_token_accuracy": 0.4, + "step": 109670 + }, + { + "epoch": 0.11046582484002977, + "grad_norm": 9.389619648486853, + "learning_rate": 4.9551704182285156e-05, + "loss": 2.5465, + "mean_token_accuracy": 0.41379311084747317, + "step": 109675 + }, + { + "epoch": 0.11047086089313395, + "grad_norm": 10.673917268828012, + "learning_rate": 4.955162975205637e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.4137930989265442, + "step": 109680 + }, + { + "epoch": 0.11047589694623812, + "grad_norm": 9.572200887804915, + "learning_rate": 4.955155531571148e-05, + "loss": 2.421, + "mean_token_accuracy": 0.42758620381355283, + "step": 109685 + }, + { + "epoch": 0.1104809329993423, + "grad_norm": 9.068268375554354, + "learning_rate": 4.955148087325051e-05, + "loss": 2.3713, + "mean_token_accuracy": 0.42413793206214906, + "step": 109690 + }, + { + "epoch": 0.11048596905244647, + "grad_norm": 11.722302539232277, + "learning_rate": 4.955140642467348e-05, + "loss": 2.1007, + "mean_token_accuracy": 0.4571687877178192, + "step": 109695 + }, + { + "epoch": 0.11049100510555064, + "grad_norm": 11.223394323575025, + "learning_rate": 4.95513319699804e-05, + "loss": 2.7298, + "mean_token_accuracy": 0.39310344159603117, + "step": 109700 + }, + { + "epoch": 0.11049604115865481, + "grad_norm": 9.904687433262863, + "learning_rate": 4.955125750917131e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.4551724135875702, + "step": 109705 + }, + { + "epoch": 0.11050107721175899, + "grad_norm": 20.13622530860082, + "learning_rate": 4.9551183042246217e-05, + "loss": 2.8809, + "mean_token_accuracy": 0.441379314661026, + "step": 109710 + }, + { + "epoch": 0.11050611326486316, + "grad_norm": 11.850941322256276, + "learning_rate": 4.9551108569205154e-05, + "loss": 2.3217, + "mean_token_accuracy": 0.4448275864124298, + "step": 109715 + }, + { + "epoch": 0.11051114931796732, + "grad_norm": 9.987281615342921, + "learning_rate": 4.9551034090048134e-05, + "loss": 2.2064, + "mean_token_accuracy": 0.4604355752468109, + "step": 109720 + }, + { + "epoch": 0.1105161853710715, + "grad_norm": 13.640592864088912, + "learning_rate": 4.955095960477516e-05, + "loss": 2.7339, + "mean_token_accuracy": 0.3620689660310745, + "step": 109725 + }, + { + "epoch": 0.11052122142417567, + "grad_norm": 14.368705103113806, + "learning_rate": 4.955088511338629e-05, + "loss": 2.593, + "mean_token_accuracy": 0.4034482777118683, + "step": 109730 + }, + { + "epoch": 0.11052625747727984, + "grad_norm": 10.709924261817832, + "learning_rate": 4.955081061588152e-05, + "loss": 2.6429, + "mean_token_accuracy": 0.39655172228813174, + "step": 109735 + }, + { + "epoch": 0.11053129353038402, + "grad_norm": 10.029966508314171, + "learning_rate": 4.955073611226087e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.41379310488700866, + "step": 109740 + }, + { + "epoch": 0.11053632958348819, + "grad_norm": 9.719727214402484, + "learning_rate": 4.9550661602524364e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.4379310250282288, + "step": 109745 + }, + { + "epoch": 0.11054136563659236, + "grad_norm": 15.32053188174393, + "learning_rate": 4.955058708667204e-05, + "loss": 2.538, + "mean_token_accuracy": 0.4068965554237366, + "step": 109750 + }, + { + "epoch": 0.11054640168969654, + "grad_norm": 13.546835382426913, + "learning_rate": 4.9550512564703895e-05, + "loss": 2.2657, + "mean_token_accuracy": 0.4620689690113068, + "step": 109755 + }, + { + "epoch": 0.11055143774280071, + "grad_norm": 10.179518109351017, + "learning_rate": 4.955043803661995e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.44137930274009707, + "step": 109760 + }, + { + "epoch": 0.11055647379590489, + "grad_norm": 9.906249218313276, + "learning_rate": 4.955036350242024e-05, + "loss": 2.6624, + "mean_token_accuracy": 0.40344828367233276, + "step": 109765 + }, + { + "epoch": 0.11056150984900906, + "grad_norm": 9.015173128222282, + "learning_rate": 4.955028896210478e-05, + "loss": 2.081, + "mean_token_accuracy": 0.5034482836723327, + "step": 109770 + }, + { + "epoch": 0.11056654590211323, + "grad_norm": 12.675304093858019, + "learning_rate": 4.955021441567359e-05, + "loss": 2.6159, + "mean_token_accuracy": 0.42758620381355283, + "step": 109775 + }, + { + "epoch": 0.11057158195521741, + "grad_norm": 11.774736227891056, + "learning_rate": 4.9550139863126694e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.4275861978530884, + "step": 109780 + }, + { + "epoch": 0.11057661800832158, + "grad_norm": 11.759809373559845, + "learning_rate": 4.9550065304464105e-05, + "loss": 2.7168, + "mean_token_accuracy": 0.4068965554237366, + "step": 109785 + }, + { + "epoch": 0.11058165406142574, + "grad_norm": 11.344495015857778, + "learning_rate": 4.954999073968585e-05, + "loss": 2.5939, + "mean_token_accuracy": 0.3827586114406586, + "step": 109790 + }, + { + "epoch": 0.11058669011452991, + "grad_norm": 9.546165615006423, + "learning_rate": 4.954991616879195e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.4241379380226135, + "step": 109795 + }, + { + "epoch": 0.11059172616763409, + "grad_norm": 9.58269865528971, + "learning_rate": 4.954984159178241e-05, + "loss": 1.9524, + "mean_token_accuracy": 0.48620688915252686, + "step": 109800 + }, + { + "epoch": 0.11059676222073826, + "grad_norm": 12.245640387298852, + "learning_rate": 4.954976700865728e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.43793101906776427, + "step": 109805 + }, + { + "epoch": 0.11060179827384244, + "grad_norm": 12.237561594788703, + "learning_rate": 4.954969241941656e-05, + "loss": 2.5247, + "mean_token_accuracy": 0.3896551787853241, + "step": 109810 + }, + { + "epoch": 0.11060683432694661, + "grad_norm": 13.56023357104217, + "learning_rate": 4.954961782406027e-05, + "loss": 2.5396, + "mean_token_accuracy": 0.42413793206214906, + "step": 109815 + }, + { + "epoch": 0.11061187038005078, + "grad_norm": 11.363380819785297, + "learning_rate": 4.954954322258845e-05, + "loss": 2.4353, + "mean_token_accuracy": 0.42256503105163573, + "step": 109820 + }, + { + "epoch": 0.11061690643315496, + "grad_norm": 11.028061172380802, + "learning_rate": 4.954946861500109e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.45172414779663084, + "step": 109825 + }, + { + "epoch": 0.11062194248625913, + "grad_norm": 9.02747523092846, + "learning_rate": 4.954939400129824e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.41379310488700866, + "step": 109830 + }, + { + "epoch": 0.1106269785393633, + "grad_norm": 13.9316982551118, + "learning_rate": 4.95493193814799e-05, + "loss": 2.5783, + "mean_token_accuracy": 0.41379311084747317, + "step": 109835 + }, + { + "epoch": 0.11063201459246748, + "grad_norm": 10.591652515223547, + "learning_rate": 4.95492447555461e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.40689656138420105, + "step": 109840 + }, + { + "epoch": 0.11063705064557165, + "grad_norm": 10.871162833352379, + "learning_rate": 4.954917012349686e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.482758617401123, + "step": 109845 + }, + { + "epoch": 0.11064208669867583, + "grad_norm": 10.346597303766462, + "learning_rate": 4.954909548533221e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.4586206912994385, + "step": 109850 + }, + { + "epoch": 0.11064712275178, + "grad_norm": 10.245106963223579, + "learning_rate": 4.9549020841052153e-05, + "loss": 1.9882, + "mean_token_accuracy": 0.5021173536777497, + "step": 109855 + }, + { + "epoch": 0.11065215880488416, + "grad_norm": 12.653765015318836, + "learning_rate": 4.954894619065672e-05, + "loss": 2.8519, + "mean_token_accuracy": 0.3758620649576187, + "step": 109860 + }, + { + "epoch": 0.11065719485798833, + "grad_norm": 10.66559733395443, + "learning_rate": 4.954887153414593e-05, + "loss": 2.3186, + "mean_token_accuracy": 0.4482758641242981, + "step": 109865 + }, + { + "epoch": 0.11066223091109251, + "grad_norm": 10.286967440091127, + "learning_rate": 4.95487968715198e-05, + "loss": 2.7121, + "mean_token_accuracy": 0.3620689570903778, + "step": 109870 + }, + { + "epoch": 0.11066726696419668, + "grad_norm": 14.504670113400602, + "learning_rate": 4.9548722202778355e-05, + "loss": 3.1091, + "mean_token_accuracy": 0.31724137961864474, + "step": 109875 + }, + { + "epoch": 0.11067230301730085, + "grad_norm": 9.268425655153992, + "learning_rate": 4.9548647527921616e-05, + "loss": 2.3421, + "mean_token_accuracy": 0.4620689690113068, + "step": 109880 + }, + { + "epoch": 0.11067733907040503, + "grad_norm": 11.762144195784408, + "learning_rate": 4.9548572846949605e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.4206896543502808, + "step": 109885 + }, + { + "epoch": 0.1106823751235092, + "grad_norm": 11.084962044007144, + "learning_rate": 4.9548498159862334e-05, + "loss": 2.6662, + "mean_token_accuracy": 0.40344826877117157, + "step": 109890 + }, + { + "epoch": 0.11068741117661338, + "grad_norm": 11.613146716309696, + "learning_rate": 4.954842346665983e-05, + "loss": 2.6062, + "mean_token_accuracy": 0.4344827592372894, + "step": 109895 + }, + { + "epoch": 0.11069244722971755, + "grad_norm": 11.502046407054445, + "learning_rate": 4.9548348767342115e-05, + "loss": 2.2017, + "mean_token_accuracy": 0.4068965494632721, + "step": 109900 + }, + { + "epoch": 0.11069748328282172, + "grad_norm": 10.374903355464683, + "learning_rate": 4.9548274061909215e-05, + "loss": 2.3751, + "mean_token_accuracy": 0.4448275864124298, + "step": 109905 + }, + { + "epoch": 0.1107025193359259, + "grad_norm": 11.80128765962287, + "learning_rate": 4.9548199350361136e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.4758620738983154, + "step": 109910 + }, + { + "epoch": 0.11070755538903007, + "grad_norm": 12.309000490712037, + "learning_rate": 4.9548124632697914e-05, + "loss": 2.144, + "mean_token_accuracy": 0.4551724076271057, + "step": 109915 + }, + { + "epoch": 0.11071259144213424, + "grad_norm": 12.987800322106187, + "learning_rate": 4.9548049908919554e-05, + "loss": 2.6532, + "mean_token_accuracy": 0.3931034505367279, + "step": 109920 + }, + { + "epoch": 0.1107176274952384, + "grad_norm": 9.856746840931061, + "learning_rate": 4.9547975179026084e-05, + "loss": 1.9359, + "mean_token_accuracy": 0.49999998807907103, + "step": 109925 + }, + { + "epoch": 0.11072266354834258, + "grad_norm": 24.511497974789975, + "learning_rate": 4.954790044301753e-05, + "loss": 2.7501, + "mean_token_accuracy": 0.38275861740112305, + "step": 109930 + }, + { + "epoch": 0.11072769960144675, + "grad_norm": 10.625728982011431, + "learning_rate": 4.9547825700893914e-05, + "loss": 2.8649, + "mean_token_accuracy": 0.37586207389831544, + "step": 109935 + }, + { + "epoch": 0.11073273565455093, + "grad_norm": 10.453088954549797, + "learning_rate": 4.954775095265524e-05, + "loss": 2.3728, + "mean_token_accuracy": 0.43103448748588563, + "step": 109940 + }, + { + "epoch": 0.1107377717076551, + "grad_norm": 11.0728554615507, + "learning_rate": 4.9547676198301554e-05, + "loss": 2.447, + "mean_token_accuracy": 0.42413793206214906, + "step": 109945 + }, + { + "epoch": 0.11074280776075927, + "grad_norm": 12.654626196566557, + "learning_rate": 4.9547601437832844e-05, + "loss": 2.8965, + "mean_token_accuracy": 0.3655172407627106, + "step": 109950 + }, + { + "epoch": 0.11074784381386345, + "grad_norm": 10.443391565215034, + "learning_rate": 4.9547526671249166e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.3931034505367279, + "step": 109955 + }, + { + "epoch": 0.11075287986696762, + "grad_norm": 11.513641732787965, + "learning_rate": 4.954745189855052e-05, + "loss": 3.0081, + "mean_token_accuracy": 0.34827586114406583, + "step": 109960 + }, + { + "epoch": 0.1107579159200718, + "grad_norm": 9.741013950465092, + "learning_rate": 4.954737711973693e-05, + "loss": 1.9497, + "mean_token_accuracy": 0.48965516686439514, + "step": 109965 + }, + { + "epoch": 0.11076295197317597, + "grad_norm": 10.268028046277964, + "learning_rate": 4.954730233480842e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.44482758045196535, + "step": 109970 + }, + { + "epoch": 0.11076798802628014, + "grad_norm": 7.991277488403565, + "learning_rate": 4.9547227543765004e-05, + "loss": 2.3326, + "mean_token_accuracy": 0.42068966031074523, + "step": 109975 + }, + { + "epoch": 0.11077302407938432, + "grad_norm": 8.914467459232396, + "learning_rate": 4.954715274660671e-05, + "loss": 2.0928, + "mean_token_accuracy": 0.49655172824859617, + "step": 109980 + }, + { + "epoch": 0.11077806013248849, + "grad_norm": 9.061479512418181, + "learning_rate": 4.954707794333355e-05, + "loss": 2.2215, + "mean_token_accuracy": 0.4344827592372894, + "step": 109985 + }, + { + "epoch": 0.11078309618559266, + "grad_norm": 9.651910742173058, + "learning_rate": 4.9547003133945555e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.4838669955730438, + "step": 109990 + }, + { + "epoch": 0.11078813223869682, + "grad_norm": 11.424252864925052, + "learning_rate": 4.954692831844274e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.43448275327682495, + "step": 109995 + }, + { + "epoch": 0.110793168291801, + "grad_norm": 11.518070052847287, + "learning_rate": 4.954685349682513e-05, + "loss": 2.4439, + "mean_token_accuracy": 0.4206896543502808, + "step": 110000 + }, + { + "epoch": 0.11079820434490517, + "grad_norm": 12.068718110049637, + "learning_rate": 4.954677866909275e-05, + "loss": 2.6058, + "mean_token_accuracy": 0.3965517282485962, + "step": 110005 + }, + { + "epoch": 0.11080324039800934, + "grad_norm": 10.487826596411983, + "learning_rate": 4.95467038352456e-05, + "loss": 2.0984, + "mean_token_accuracy": 0.4620689570903778, + "step": 110010 + }, + { + "epoch": 0.11080827645111352, + "grad_norm": 11.859439615410343, + "learning_rate": 4.9546628995283724e-05, + "loss": 2.6553, + "mean_token_accuracy": 0.43793103098869324, + "step": 110015 + }, + { + "epoch": 0.11081331250421769, + "grad_norm": 10.888986211809044, + "learning_rate": 4.954655414920712e-05, + "loss": 2.7119, + "mean_token_accuracy": 0.3620689630508423, + "step": 110020 + }, + { + "epoch": 0.11081834855732187, + "grad_norm": 13.535134401186484, + "learning_rate": 4.9546479297015837e-05, + "loss": 2.2219, + "mean_token_accuracy": 0.4448275864124298, + "step": 110025 + }, + { + "epoch": 0.11082338461042604, + "grad_norm": 12.382731144823627, + "learning_rate": 4.9546404438709874e-05, + "loss": 2.551, + "mean_token_accuracy": 0.4068965554237366, + "step": 110030 + }, + { + "epoch": 0.11082842066353021, + "grad_norm": 13.59385685116867, + "learning_rate": 4.954632957428926e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.4620689570903778, + "step": 110035 + }, + { + "epoch": 0.11083345671663439, + "grad_norm": 9.659639169689061, + "learning_rate": 4.9546254703754015e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.5120689690113067, + "step": 110040 + }, + { + "epoch": 0.11083849276973856, + "grad_norm": 11.124379741638602, + "learning_rate": 4.954617982710416e-05, + "loss": 2.0506, + "mean_token_accuracy": 0.458620685338974, + "step": 110045 + }, + { + "epoch": 0.11084352882284274, + "grad_norm": 9.97150667264149, + "learning_rate": 4.9546104944339714e-05, + "loss": 2.6434, + "mean_token_accuracy": 0.36896551251411436, + "step": 110050 + }, + { + "epoch": 0.11084856487594691, + "grad_norm": 11.568727237806593, + "learning_rate": 4.9546030055460694e-05, + "loss": 2.5471, + "mean_token_accuracy": 0.40000000298023225, + "step": 110055 + }, + { + "epoch": 0.11085360092905108, + "grad_norm": 13.942814407888182, + "learning_rate": 4.9545955160467126e-05, + "loss": 2.958, + "mean_token_accuracy": 0.31518452167510985, + "step": 110060 + }, + { + "epoch": 0.11085863698215524, + "grad_norm": 11.155265925637252, + "learning_rate": 4.954588025935904e-05, + "loss": 2.2793, + "mean_token_accuracy": 0.4068965494632721, + "step": 110065 + }, + { + "epoch": 0.11086367303525942, + "grad_norm": 10.410653943219387, + "learning_rate": 4.9545805352136434e-05, + "loss": 2.1551, + "mean_token_accuracy": 0.4379310369491577, + "step": 110070 + }, + { + "epoch": 0.11086870908836359, + "grad_norm": 9.620776123483173, + "learning_rate": 4.9545730438799344e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.4448275864124298, + "step": 110075 + }, + { + "epoch": 0.11087374514146776, + "grad_norm": 11.085788917626777, + "learning_rate": 4.9545655519347795e-05, + "loss": 2.3998, + "mean_token_accuracy": 0.46206897497177124, + "step": 110080 + }, + { + "epoch": 0.11087878119457194, + "grad_norm": 10.722759915260553, + "learning_rate": 4.95455805937818e-05, + "loss": 1.945, + "mean_token_accuracy": 0.517241370677948, + "step": 110085 + }, + { + "epoch": 0.11088381724767611, + "grad_norm": 10.727426872371629, + "learning_rate": 4.9545505662101385e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.4379310429096222, + "step": 110090 + }, + { + "epoch": 0.11088885330078029, + "grad_norm": 10.901751741023881, + "learning_rate": 4.954543072430655e-05, + "loss": 2.4993, + "mean_token_accuracy": 0.4344827473163605, + "step": 110095 + }, + { + "epoch": 0.11089388935388446, + "grad_norm": 10.582380533412987, + "learning_rate": 4.954535578039734e-05, + "loss": 2.6804, + "mean_token_accuracy": 0.3931034505367279, + "step": 110100 + }, + { + "epoch": 0.11089892540698863, + "grad_norm": 11.885277752466243, + "learning_rate": 4.9545280830373774e-05, + "loss": 2.6796, + "mean_token_accuracy": 0.38620689511299133, + "step": 110105 + }, + { + "epoch": 0.1109039614600928, + "grad_norm": 14.748744086151163, + "learning_rate": 4.9545205874235866e-05, + "loss": 2.4036, + "mean_token_accuracy": 0.44482758045196535, + "step": 110110 + }, + { + "epoch": 0.11090899751319698, + "grad_norm": 11.087118471490058, + "learning_rate": 4.954513091198364e-05, + "loss": 2.6209, + "mean_token_accuracy": 0.3965517282485962, + "step": 110115 + }, + { + "epoch": 0.11091403356630115, + "grad_norm": 10.10432501022113, + "learning_rate": 4.954505594361711e-05, + "loss": 2.3121, + "mean_token_accuracy": 0.441379314661026, + "step": 110120 + }, + { + "epoch": 0.11091906961940533, + "grad_norm": 9.174868000152191, + "learning_rate": 4.9544980969136304e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4034482777118683, + "step": 110125 + }, + { + "epoch": 0.1109241056725095, + "grad_norm": 11.997378902262785, + "learning_rate": 4.9544905988541234e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.41724138259887694, + "step": 110130 + }, + { + "epoch": 0.11092914172561366, + "grad_norm": 9.640441241196982, + "learning_rate": 4.9544831001831934e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.4172413766384125, + "step": 110135 + }, + { + "epoch": 0.11093417777871784, + "grad_norm": 11.094944454720647, + "learning_rate": 4.954475600900842e-05, + "loss": 2.4031, + "mean_token_accuracy": 0.42413793206214906, + "step": 110140 + }, + { + "epoch": 0.11093921383182201, + "grad_norm": 9.66466292718135, + "learning_rate": 4.95446810100707e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.42068964838981626, + "step": 110145 + }, + { + "epoch": 0.11094424988492618, + "grad_norm": 9.004590094798614, + "learning_rate": 4.954460600501881e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.4172413766384125, + "step": 110150 + }, + { + "epoch": 0.11094928593803036, + "grad_norm": 9.593047790045324, + "learning_rate": 4.9544530993852775e-05, + "loss": 2.1682, + "mean_token_accuracy": 0.4601935863494873, + "step": 110155 + }, + { + "epoch": 0.11095432199113453, + "grad_norm": 11.122775640429913, + "learning_rate": 4.954445597657259e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.42758620381355283, + "step": 110160 + }, + { + "epoch": 0.1109593580442387, + "grad_norm": 11.06441242174139, + "learning_rate": 4.954438095317831e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.43448275327682495, + "step": 110165 + }, + { + "epoch": 0.11096439409734288, + "grad_norm": 9.79968108215539, + "learning_rate": 4.954430592366993e-05, + "loss": 2.358, + "mean_token_accuracy": 0.42413792610168455, + "step": 110170 + }, + { + "epoch": 0.11096943015044705, + "grad_norm": 12.593161215383045, + "learning_rate": 4.954423088804748e-05, + "loss": 2.6868, + "mean_token_accuracy": 0.4034482717514038, + "step": 110175 + }, + { + "epoch": 0.11097446620355123, + "grad_norm": 11.44840356709061, + "learning_rate": 4.954415584631099e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.47586206197738645, + "step": 110180 + }, + { + "epoch": 0.1109795022566554, + "grad_norm": 10.557877242503999, + "learning_rate": 4.954408079846046e-05, + "loss": 2.5215, + "mean_token_accuracy": 0.4586206912994385, + "step": 110185 + }, + { + "epoch": 0.11098453830975957, + "grad_norm": 9.337159884677273, + "learning_rate": 4.954400574449592e-05, + "loss": 2.3686, + "mean_token_accuracy": 0.38620689511299133, + "step": 110190 + }, + { + "epoch": 0.11098957436286375, + "grad_norm": 10.220651248503906, + "learning_rate": 4.95439306844174e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.40689656138420105, + "step": 110195 + }, + { + "epoch": 0.11099461041596792, + "grad_norm": 15.058861993490368, + "learning_rate": 4.9543855618224915e-05, + "loss": 2.8307, + "mean_token_accuracy": 0.3689655065536499, + "step": 110200 + }, + { + "epoch": 0.11099964646907208, + "grad_norm": 11.508079660034259, + "learning_rate": 4.954378054591848e-05, + "loss": 2.24, + "mean_token_accuracy": 0.4620689630508423, + "step": 110205 + }, + { + "epoch": 0.11100468252217625, + "grad_norm": 9.96174904279089, + "learning_rate": 4.954370546749812e-05, + "loss": 2.7077, + "mean_token_accuracy": 0.4181034445762634, + "step": 110210 + }, + { + "epoch": 0.11100971857528043, + "grad_norm": 10.851027597930178, + "learning_rate": 4.9543630382963856e-05, + "loss": 2.5933, + "mean_token_accuracy": 0.4034482717514038, + "step": 110215 + }, + { + "epoch": 0.1110147546283846, + "grad_norm": 10.177923066405706, + "learning_rate": 4.9543555292315706e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.4482758641242981, + "step": 110220 + }, + { + "epoch": 0.11101979068148878, + "grad_norm": 10.285213547500963, + "learning_rate": 4.954348019555371e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.4206896543502808, + "step": 110225 + }, + { + "epoch": 0.11102482673459295, + "grad_norm": 10.095437591075353, + "learning_rate": 4.954340509267785e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.4206896543502808, + "step": 110230 + }, + { + "epoch": 0.11102986278769712, + "grad_norm": 10.785176337978845, + "learning_rate": 4.954332998368818e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.4344827651977539, + "step": 110235 + }, + { + "epoch": 0.1110348988408013, + "grad_norm": 11.698265739953348, + "learning_rate": 4.9543254868584716e-05, + "loss": 2.6384, + "mean_token_accuracy": 0.37241379022598264, + "step": 110240 + }, + { + "epoch": 0.11103993489390547, + "grad_norm": 14.712600744845648, + "learning_rate": 4.9543179747367466e-05, + "loss": 2.9592, + "mean_token_accuracy": 0.33793103098869326, + "step": 110245 + }, + { + "epoch": 0.11104497094700964, + "grad_norm": 11.633217030752702, + "learning_rate": 4.954310462003646e-05, + "loss": 2.3947, + "mean_token_accuracy": 0.42177858352661135, + "step": 110250 + }, + { + "epoch": 0.11105000700011382, + "grad_norm": 10.885593285085745, + "learning_rate": 4.9543029486591706e-05, + "loss": 2.8173, + "mean_token_accuracy": 0.3862068921327591, + "step": 110255 + }, + { + "epoch": 0.11105504305321799, + "grad_norm": 17.17325481498889, + "learning_rate": 4.954295434703325e-05, + "loss": 3.221, + "mean_token_accuracy": 0.39310344159603117, + "step": 110260 + }, + { + "epoch": 0.11106007910632217, + "grad_norm": 9.684765455382694, + "learning_rate": 4.954287920136109e-05, + "loss": 1.8703, + "mean_token_accuracy": 0.5517241477966308, + "step": 110265 + }, + { + "epoch": 0.11106511515942634, + "grad_norm": 9.948375734490787, + "learning_rate": 4.954280404957526e-05, + "loss": 2.7016, + "mean_token_accuracy": 0.37586206793785093, + "step": 110270 + }, + { + "epoch": 0.1110701512125305, + "grad_norm": 10.044869395111666, + "learning_rate": 4.954272889167578e-05, + "loss": 2.3212, + "mean_token_accuracy": 0.45704779028892517, + "step": 110275 + }, + { + "epoch": 0.11107518726563467, + "grad_norm": 12.321505575188331, + "learning_rate": 4.9542653727662655e-05, + "loss": 2.8457, + "mean_token_accuracy": 0.3620689630508423, + "step": 110280 + }, + { + "epoch": 0.11108022331873885, + "grad_norm": 8.825196663857502, + "learning_rate": 4.9542578557535926e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.44295220375061034, + "step": 110285 + }, + { + "epoch": 0.11108525937184302, + "grad_norm": 10.082480663983144, + "learning_rate": 4.9542503381295605e-05, + "loss": 2.7871, + "mean_token_accuracy": 0.379310342669487, + "step": 110290 + }, + { + "epoch": 0.1110902954249472, + "grad_norm": 9.303642530805092, + "learning_rate": 4.954242819894171e-05, + "loss": 2.6453, + "mean_token_accuracy": 0.3793103456497192, + "step": 110295 + }, + { + "epoch": 0.11109533147805137, + "grad_norm": 11.223689810610527, + "learning_rate": 4.9542353010474274e-05, + "loss": 2.741, + "mean_token_accuracy": 0.38620689511299133, + "step": 110300 + }, + { + "epoch": 0.11110036753115554, + "grad_norm": 13.02158096692777, + "learning_rate": 4.9542277815893304e-05, + "loss": 2.9099, + "mean_token_accuracy": 0.37241379022598264, + "step": 110305 + }, + { + "epoch": 0.11110540358425972, + "grad_norm": 11.61095458882924, + "learning_rate": 4.954220261519882e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.4103448331356049, + "step": 110310 + }, + { + "epoch": 0.11111043963736389, + "grad_norm": 9.355107755077444, + "learning_rate": 4.954212740839085e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.38275861740112305, + "step": 110315 + }, + { + "epoch": 0.11111547569046806, + "grad_norm": 11.150287279466312, + "learning_rate": 4.954205219546942e-05, + "loss": 2.4673, + "mean_token_accuracy": 0.4068965494632721, + "step": 110320 + }, + { + "epoch": 0.11112051174357224, + "grad_norm": 13.397771189045741, + "learning_rate": 4.954197697643454e-05, + "loss": 3.1258, + "mean_token_accuracy": 0.39310344457626345, + "step": 110325 + }, + { + "epoch": 0.11112554779667641, + "grad_norm": 11.043361513122283, + "learning_rate": 4.954190175128624e-05, + "loss": 2.352, + "mean_token_accuracy": 0.3827586233615875, + "step": 110330 + }, + { + "epoch": 0.11113058384978058, + "grad_norm": 8.962144224146357, + "learning_rate": 4.954182652002454e-05, + "loss": 2.3292, + "mean_token_accuracy": 0.4206896543502808, + "step": 110335 + }, + { + "epoch": 0.11113561990288476, + "grad_norm": 9.948653972187333, + "learning_rate": 4.9541751282649446e-05, + "loss": 2.2913, + "mean_token_accuracy": 0.4689655125141144, + "step": 110340 + }, + { + "epoch": 0.11114065595598892, + "grad_norm": 15.077061327698045, + "learning_rate": 4.9541676039161e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.4000000059604645, + "step": 110345 + }, + { + "epoch": 0.11114569200909309, + "grad_norm": 11.029063436377841, + "learning_rate": 4.95416007895592e-05, + "loss": 2.1476, + "mean_token_accuracy": 0.4724137902259827, + "step": 110350 + }, + { + "epoch": 0.11115072806219727, + "grad_norm": 8.888393755009425, + "learning_rate": 4.954152553384408e-05, + "loss": 2.7789, + "mean_token_accuracy": 0.4103448212146759, + "step": 110355 + }, + { + "epoch": 0.11115576411530144, + "grad_norm": 10.508304348408263, + "learning_rate": 4.9541450272015676e-05, + "loss": 1.7988, + "mean_token_accuracy": 0.529885059595108, + "step": 110360 + }, + { + "epoch": 0.11116080016840561, + "grad_norm": 12.335557853499209, + "learning_rate": 4.954137500407399e-05, + "loss": 2.278, + "mean_token_accuracy": 0.4502117335796356, + "step": 110365 + }, + { + "epoch": 0.11116583622150979, + "grad_norm": 10.536092121125144, + "learning_rate": 4.9541299730019035e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.4034482777118683, + "step": 110370 + }, + { + "epoch": 0.11117087227461396, + "grad_norm": 10.50055006341244, + "learning_rate": 4.9541224449850855e-05, + "loss": 2.2827, + "mean_token_accuracy": 0.42758620381355283, + "step": 110375 + }, + { + "epoch": 0.11117590832771813, + "grad_norm": 10.495838385562928, + "learning_rate": 4.954114916356945e-05, + "loss": 2.5279, + "mean_token_accuracy": 0.4344827592372894, + "step": 110380 + }, + { + "epoch": 0.11118094438082231, + "grad_norm": 12.974342219083915, + "learning_rate": 4.954107387117486e-05, + "loss": 2.0973, + "mean_token_accuracy": 0.517241382598877, + "step": 110385 + }, + { + "epoch": 0.11118598043392648, + "grad_norm": 10.158284186311523, + "learning_rate": 4.954099857266709e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.4068965554237366, + "step": 110390 + }, + { + "epoch": 0.11119101648703066, + "grad_norm": 11.423101564756257, + "learning_rate": 4.9540923268046166e-05, + "loss": 2.3213, + "mean_token_accuracy": 0.4379310369491577, + "step": 110395 + }, + { + "epoch": 0.11119605254013483, + "grad_norm": 11.37459651136444, + "learning_rate": 4.954084795731212e-05, + "loss": 2.298, + "mean_token_accuracy": 0.4206896543502808, + "step": 110400 + }, + { + "epoch": 0.111201088593239, + "grad_norm": 10.80007583529436, + "learning_rate": 4.9540772640464944e-05, + "loss": 2.7532, + "mean_token_accuracy": 0.41034483909606934, + "step": 110405 + }, + { + "epoch": 0.11120612464634318, + "grad_norm": 11.335589089895011, + "learning_rate": 4.9540697317504686e-05, + "loss": 2.1709, + "mean_token_accuracy": 0.458620685338974, + "step": 110410 + }, + { + "epoch": 0.11121116069944734, + "grad_norm": 12.228076525824628, + "learning_rate": 4.9540621988431364e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.4551724076271057, + "step": 110415 + }, + { + "epoch": 0.11121619675255151, + "grad_norm": 8.762992080173202, + "learning_rate": 4.9540546653244986e-05, + "loss": 2.2797, + "mean_token_accuracy": 0.482758629322052, + "step": 110420 + }, + { + "epoch": 0.11122123280565568, + "grad_norm": 13.360289357424698, + "learning_rate": 4.954047131194558e-05, + "loss": 2.6025, + "mean_token_accuracy": 0.4344827592372894, + "step": 110425 + }, + { + "epoch": 0.11122626885875986, + "grad_norm": 11.793708266371718, + "learning_rate": 4.954039596453317e-05, + "loss": 2.2251, + "mean_token_accuracy": 0.4172413766384125, + "step": 110430 + }, + { + "epoch": 0.11123130491186403, + "grad_norm": 11.69601847267536, + "learning_rate": 4.954032061100778e-05, + "loss": 2.3525, + "mean_token_accuracy": 0.4103448331356049, + "step": 110435 + }, + { + "epoch": 0.1112363409649682, + "grad_norm": 10.845760133687905, + "learning_rate": 4.954024525136942e-05, + "loss": 2.445, + "mean_token_accuracy": 0.42758620977401735, + "step": 110440 + }, + { + "epoch": 0.11124137701807238, + "grad_norm": 12.493566632589582, + "learning_rate": 4.954016988561811e-05, + "loss": 2.4163, + "mean_token_accuracy": 0.4379310429096222, + "step": 110445 + }, + { + "epoch": 0.11124641307117655, + "grad_norm": 20.37054813027392, + "learning_rate": 4.954009451375388e-05, + "loss": 2.2182, + "mean_token_accuracy": 0.44960677027702334, + "step": 110450 + }, + { + "epoch": 0.11125144912428073, + "grad_norm": 11.551775185124189, + "learning_rate": 4.954001913577675e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.3931034505367279, + "step": 110455 + }, + { + "epoch": 0.1112564851773849, + "grad_norm": 10.053724802856078, + "learning_rate": 4.953994375168674e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.43793103098869324, + "step": 110460 + }, + { + "epoch": 0.11126152123048907, + "grad_norm": 10.242921357952964, + "learning_rate": 4.953986836148387e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.42413793206214906, + "step": 110465 + }, + { + "epoch": 0.11126655728359325, + "grad_norm": 14.65582635615077, + "learning_rate": 4.953979296516815e-05, + "loss": 2.39, + "mean_token_accuracy": 0.41034482717514037, + "step": 110470 + }, + { + "epoch": 0.11127159333669742, + "grad_norm": 10.09093030157355, + "learning_rate": 4.953971756273962e-05, + "loss": 2.5585, + "mean_token_accuracy": 0.39655172228813174, + "step": 110475 + }, + { + "epoch": 0.1112766293898016, + "grad_norm": 9.750261506284803, + "learning_rate": 4.953964215419829e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.4689655125141144, + "step": 110480 + }, + { + "epoch": 0.11128166544290576, + "grad_norm": 10.527660368852109, + "learning_rate": 4.9539566739544185e-05, + "loss": 2.7538, + "mean_token_accuracy": 0.36896551549434664, + "step": 110485 + }, + { + "epoch": 0.11128670149600993, + "grad_norm": 10.506903661457997, + "learning_rate": 4.953949131877732e-05, + "loss": 2.0252, + "mean_token_accuracy": 0.47931034564971925, + "step": 110490 + }, + { + "epoch": 0.1112917375491141, + "grad_norm": 10.828540407467989, + "learning_rate": 4.9539415891897724e-05, + "loss": 2.2575, + "mean_token_accuracy": 0.48275862336158754, + "step": 110495 + }, + { + "epoch": 0.11129677360221828, + "grad_norm": 11.35562333335125, + "learning_rate": 4.953934045890541e-05, + "loss": 2.5985, + "mean_token_accuracy": 0.37586206793785093, + "step": 110500 + }, + { + "epoch": 0.11130180965532245, + "grad_norm": 9.725937323099147, + "learning_rate": 4.9539265019800404e-05, + "loss": 2.3709, + "mean_token_accuracy": 0.4620689690113068, + "step": 110505 + }, + { + "epoch": 0.11130684570842662, + "grad_norm": 10.884393705252405, + "learning_rate": 4.953918957458273e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.48154870271682737, + "step": 110510 + }, + { + "epoch": 0.1113118817615308, + "grad_norm": 12.848763980197587, + "learning_rate": 4.95391141232524e-05, + "loss": 2.5165, + "mean_token_accuracy": 0.45547489523887635, + "step": 110515 + }, + { + "epoch": 0.11131691781463497, + "grad_norm": 10.762092124487754, + "learning_rate": 4.953903866580945e-05, + "loss": 2.3617, + "mean_token_accuracy": 0.42413792610168455, + "step": 110520 + }, + { + "epoch": 0.11132195386773915, + "grad_norm": 12.958528396881144, + "learning_rate": 4.9538963202253877e-05, + "loss": 2.5745, + "mean_token_accuracy": 0.4, + "step": 110525 + }, + { + "epoch": 0.11132698992084332, + "grad_norm": 11.03736355806719, + "learning_rate": 4.953888773258572e-05, + "loss": 2.5607, + "mean_token_accuracy": 0.37586206793785093, + "step": 110530 + }, + { + "epoch": 0.1113320259739475, + "grad_norm": 12.355525708990191, + "learning_rate": 4.953881225680499e-05, + "loss": 2.6048, + "mean_token_accuracy": 0.3931034475564957, + "step": 110535 + }, + { + "epoch": 0.11133706202705167, + "grad_norm": 10.32782352270584, + "learning_rate": 4.953873677491172e-05, + "loss": 2.3601, + "mean_token_accuracy": 0.42068964838981626, + "step": 110540 + }, + { + "epoch": 0.11134209808015584, + "grad_norm": 9.827078563739821, + "learning_rate": 4.9538661286905916e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.43448275327682495, + "step": 110545 + }, + { + "epoch": 0.11134713413326001, + "grad_norm": 10.612580609890855, + "learning_rate": 4.9538585792787615e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.3931034505367279, + "step": 110550 + }, + { + "epoch": 0.11135217018636417, + "grad_norm": 10.907721166149548, + "learning_rate": 4.9538510292556826e-05, + "loss": 2.6297, + "mean_token_accuracy": 0.4034482777118683, + "step": 110555 + }, + { + "epoch": 0.11135720623946835, + "grad_norm": 10.490573046916055, + "learning_rate": 4.9538434786213576e-05, + "loss": 2.3859, + "mean_token_accuracy": 0.42758620381355283, + "step": 110560 + }, + { + "epoch": 0.11136224229257252, + "grad_norm": 9.34342491229045, + "learning_rate": 4.953835927375789e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.4206896424293518, + "step": 110565 + }, + { + "epoch": 0.1113672783456767, + "grad_norm": 10.296674005922966, + "learning_rate": 4.9538283755189774e-05, + "loss": 2.6413, + "mean_token_accuracy": 0.4068965494632721, + "step": 110570 + }, + { + "epoch": 0.11137231439878087, + "grad_norm": 12.215545999974788, + "learning_rate": 4.953820823050926e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.4137930929660797, + "step": 110575 + }, + { + "epoch": 0.11137735045188504, + "grad_norm": 9.997158353579897, + "learning_rate": 4.953813269971637e-05, + "loss": 1.9829, + "mean_token_accuracy": 0.46412582993507384, + "step": 110580 + }, + { + "epoch": 0.11138238650498922, + "grad_norm": 11.591238824863009, + "learning_rate": 4.953805716281112e-05, + "loss": 2.1891, + "mean_token_accuracy": 0.4413793087005615, + "step": 110585 + }, + { + "epoch": 0.11138742255809339, + "grad_norm": 10.285217000722298, + "learning_rate": 4.953798161979353e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.4344827592372894, + "step": 110590 + }, + { + "epoch": 0.11139245861119756, + "grad_norm": 11.157793969430854, + "learning_rate": 4.9537906070663624e-05, + "loss": 2.8121, + "mean_token_accuracy": 0.37586206793785093, + "step": 110595 + }, + { + "epoch": 0.11139749466430174, + "grad_norm": 10.35494370593346, + "learning_rate": 4.9537830515421426e-05, + "loss": 2.7957, + "mean_token_accuracy": 0.37241379022598264, + "step": 110600 + }, + { + "epoch": 0.11140253071740591, + "grad_norm": 12.853773983244253, + "learning_rate": 4.953775495406695e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.3931034505367279, + "step": 110605 + }, + { + "epoch": 0.11140756677051009, + "grad_norm": 11.210287908247357, + "learning_rate": 4.953767938660022e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.42413792610168455, + "step": 110610 + }, + { + "epoch": 0.11141260282361426, + "grad_norm": 9.498600452489391, + "learning_rate": 4.953760381302126e-05, + "loss": 2.367, + "mean_token_accuracy": 0.3827586233615875, + "step": 110615 + }, + { + "epoch": 0.11141763887671843, + "grad_norm": 11.819527715681321, + "learning_rate": 4.953752823333008e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.46781609058380125, + "step": 110620 + }, + { + "epoch": 0.1114226749298226, + "grad_norm": 13.00013439027541, + "learning_rate": 4.9537452647526724e-05, + "loss": 2.7481, + "mean_token_accuracy": 0.4206896543502808, + "step": 110625 + }, + { + "epoch": 0.11142771098292677, + "grad_norm": 10.87984326479111, + "learning_rate": 4.953737705561119e-05, + "loss": 2.2068, + "mean_token_accuracy": 0.47931033968925474, + "step": 110630 + }, + { + "epoch": 0.11143274703603094, + "grad_norm": 9.772128858780778, + "learning_rate": 4.953730145758351e-05, + "loss": 2.2872, + "mean_token_accuracy": 0.4551724076271057, + "step": 110635 + }, + { + "epoch": 0.11143778308913511, + "grad_norm": 10.115109721506451, + "learning_rate": 4.953722585344369e-05, + "loss": 2.7062, + "mean_token_accuracy": 0.3482758581638336, + "step": 110640 + }, + { + "epoch": 0.11144281914223929, + "grad_norm": 11.443729566491506, + "learning_rate": 4.953715024319178e-05, + "loss": 2.6285, + "mean_token_accuracy": 0.3862068921327591, + "step": 110645 + }, + { + "epoch": 0.11144785519534346, + "grad_norm": 11.755669373232548, + "learning_rate": 4.953707462682778e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.4379310429096222, + "step": 110650 + }, + { + "epoch": 0.11145289124844764, + "grad_norm": 11.07940779943921, + "learning_rate": 4.953699900435172e-05, + "loss": 2.3766, + "mean_token_accuracy": 0.3999999940395355, + "step": 110655 + }, + { + "epoch": 0.11145792730155181, + "grad_norm": 13.284320189212751, + "learning_rate": 4.95369233757636e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.4206896543502808, + "step": 110660 + }, + { + "epoch": 0.11146296335465598, + "grad_norm": 9.548899674063628, + "learning_rate": 4.9536847741063466e-05, + "loss": 2.109, + "mean_token_accuracy": 0.4793103516101837, + "step": 110665 + }, + { + "epoch": 0.11146799940776016, + "grad_norm": 13.762526908063501, + "learning_rate": 4.953677210025133e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.4206896543502808, + "step": 110670 + }, + { + "epoch": 0.11147303546086433, + "grad_norm": 10.376260966025225, + "learning_rate": 4.9536696453327214e-05, + "loss": 1.9672, + "mean_token_accuracy": 0.4979064047336578, + "step": 110675 + }, + { + "epoch": 0.1114780715139685, + "grad_norm": 11.30098161996951, + "learning_rate": 4.953662080029113e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.493103438615799, + "step": 110680 + }, + { + "epoch": 0.11148310756707268, + "grad_norm": 13.23386022898983, + "learning_rate": 4.953654514114312e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.41724138259887694, + "step": 110685 + }, + { + "epoch": 0.11148814362017685, + "grad_norm": 10.60955388038811, + "learning_rate": 4.953646947588319e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.4379310369491577, + "step": 110690 + }, + { + "epoch": 0.11149317967328101, + "grad_norm": 10.292455483662872, + "learning_rate": 4.953639380451136e-05, + "loss": 2.5023, + "mean_token_accuracy": 0.43448275327682495, + "step": 110695 + }, + { + "epoch": 0.11149821572638519, + "grad_norm": 10.716393784483344, + "learning_rate": 4.953631812702765e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.4068965554237366, + "step": 110700 + }, + { + "epoch": 0.11150325177948936, + "grad_norm": 12.164293124710401, + "learning_rate": 4.953624244343209e-05, + "loss": 2.4054, + "mean_token_accuracy": 0.4154264986515045, + "step": 110705 + }, + { + "epoch": 0.11150828783259353, + "grad_norm": 10.710341024233369, + "learning_rate": 4.9536166753724694e-05, + "loss": 2.5788, + "mean_token_accuracy": 0.41379310488700866, + "step": 110710 + }, + { + "epoch": 0.11151332388569771, + "grad_norm": 11.588803029077017, + "learning_rate": 4.953609105790549e-05, + "loss": 2.4546, + "mean_token_accuracy": 0.4068965494632721, + "step": 110715 + }, + { + "epoch": 0.11151835993880188, + "grad_norm": 9.482871540570281, + "learning_rate": 4.953601535597448e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.44482758045196535, + "step": 110720 + }, + { + "epoch": 0.11152339599190605, + "grad_norm": 10.663535739391383, + "learning_rate": 4.9535939647931715e-05, + "loss": 2.661, + "mean_token_accuracy": 0.3896551787853241, + "step": 110725 + }, + { + "epoch": 0.11152843204501023, + "grad_norm": 10.059427643298658, + "learning_rate": 4.953586393377719e-05, + "loss": 2.2777, + "mean_token_accuracy": 0.4604355812072754, + "step": 110730 + }, + { + "epoch": 0.1115334680981144, + "grad_norm": 11.343786322693434, + "learning_rate": 4.953578821351094e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.4620689570903778, + "step": 110735 + }, + { + "epoch": 0.11153850415121858, + "grad_norm": 11.656704348393465, + "learning_rate": 4.953571248713297e-05, + "loss": 3.1517, + "mean_token_accuracy": 0.3310344755649567, + "step": 110740 + }, + { + "epoch": 0.11154354020432275, + "grad_norm": 10.279025073617573, + "learning_rate": 4.9535636754643336e-05, + "loss": 2.7121, + "mean_token_accuracy": 0.4, + "step": 110745 + }, + { + "epoch": 0.11154857625742692, + "grad_norm": 9.535866612954388, + "learning_rate": 4.953556101604202e-05, + "loss": 2.3325, + "mean_token_accuracy": 0.4413793087005615, + "step": 110750 + }, + { + "epoch": 0.1115536123105311, + "grad_norm": 9.502575982822695, + "learning_rate": 4.953548527132906e-05, + "loss": 2.5756, + "mean_token_accuracy": 0.3965517282485962, + "step": 110755 + }, + { + "epoch": 0.11155864836363527, + "grad_norm": 9.339453241139926, + "learning_rate": 4.953540952050448e-05, + "loss": 2.7921, + "mean_token_accuracy": 0.4000000059604645, + "step": 110760 + }, + { + "epoch": 0.11156368441673943, + "grad_norm": 13.484861169499599, + "learning_rate": 4.9535333763568296e-05, + "loss": 2.7356, + "mean_token_accuracy": 0.4413793087005615, + "step": 110765 + }, + { + "epoch": 0.1115687204698436, + "grad_norm": 11.06732019133782, + "learning_rate": 4.953525800052053e-05, + "loss": 2.9855, + "mean_token_accuracy": 0.38965516686439516, + "step": 110770 + }, + { + "epoch": 0.11157375652294778, + "grad_norm": 9.616676088453826, + "learning_rate": 4.9535182231361205e-05, + "loss": 2.3516, + "mean_token_accuracy": 0.4068965494632721, + "step": 110775 + }, + { + "epoch": 0.11157879257605195, + "grad_norm": 11.352184240263004, + "learning_rate": 4.9535106456090334e-05, + "loss": 2.4339, + "mean_token_accuracy": 0.3944343626499176, + "step": 110780 + }, + { + "epoch": 0.11158382862915613, + "grad_norm": 12.396612748659251, + "learning_rate": 4.953503067470795e-05, + "loss": 2.707, + "mean_token_accuracy": 0.40514216423034666, + "step": 110785 + }, + { + "epoch": 0.1115888646822603, + "grad_norm": 9.779578194676493, + "learning_rate": 4.953495488721407e-05, + "loss": 2.478, + "mean_token_accuracy": 0.42758620977401735, + "step": 110790 + }, + { + "epoch": 0.11159390073536447, + "grad_norm": 11.795191925124541, + "learning_rate": 4.95348790936087e-05, + "loss": 2.8242, + "mean_token_accuracy": 0.34482758641242983, + "step": 110795 + }, + { + "epoch": 0.11159893678846865, + "grad_norm": 9.15154141335474, + "learning_rate": 4.9534803293891885e-05, + "loss": 2.4325, + "mean_token_accuracy": 0.458620685338974, + "step": 110800 + }, + { + "epoch": 0.11160397284157282, + "grad_norm": 10.246749224216927, + "learning_rate": 4.9534727488063636e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.41034482419490814, + "step": 110805 + }, + { + "epoch": 0.111609008894677, + "grad_norm": 10.1329068323462, + "learning_rate": 4.953465167612397e-05, + "loss": 2.2484, + "mean_token_accuracy": 0.47931033968925474, + "step": 110810 + }, + { + "epoch": 0.11161404494778117, + "grad_norm": 9.905672545290285, + "learning_rate": 4.953457585807291e-05, + "loss": 2.5303, + "mean_token_accuracy": 0.4344827651977539, + "step": 110815 + }, + { + "epoch": 0.11161908100088534, + "grad_norm": 10.765856681195736, + "learning_rate": 4.9534500033910474e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.42413793206214906, + "step": 110820 + }, + { + "epoch": 0.11162411705398952, + "grad_norm": 11.593184941508369, + "learning_rate": 4.953442420363669e-05, + "loss": 2.3821, + "mean_token_accuracy": 0.4344827592372894, + "step": 110825 + }, + { + "epoch": 0.11162915310709369, + "grad_norm": 14.602525630328351, + "learning_rate": 4.953434836725158e-05, + "loss": 2.6927, + "mean_token_accuracy": 0.358620685338974, + "step": 110830 + }, + { + "epoch": 0.11163418916019785, + "grad_norm": 8.303154935895021, + "learning_rate": 4.953427252475517e-05, + "loss": 2.1447, + "mean_token_accuracy": 0.46206897497177124, + "step": 110835 + }, + { + "epoch": 0.11163922521330202, + "grad_norm": 9.626989284073598, + "learning_rate": 4.953419667614745e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.4379310429096222, + "step": 110840 + }, + { + "epoch": 0.1116442612664062, + "grad_norm": 9.507409702526015, + "learning_rate": 4.953412082142848e-05, + "loss": 2.0527, + "mean_token_accuracy": 0.49999999403953554, + "step": 110845 + }, + { + "epoch": 0.11164929731951037, + "grad_norm": 11.49961083813994, + "learning_rate": 4.9534044960598264e-05, + "loss": 2.5957, + "mean_token_accuracy": 0.3965517282485962, + "step": 110850 + }, + { + "epoch": 0.11165433337261454, + "grad_norm": 9.7436923832067, + "learning_rate": 4.9533969093656825e-05, + "loss": 2.2096, + "mean_token_accuracy": 0.4206896543502808, + "step": 110855 + }, + { + "epoch": 0.11165936942571872, + "grad_norm": 10.648083208190558, + "learning_rate": 4.953389322060417e-05, + "loss": 2.5918, + "mean_token_accuracy": 0.4206896543502808, + "step": 110860 + }, + { + "epoch": 0.11166440547882289, + "grad_norm": 10.816188705176922, + "learning_rate": 4.953381734144034e-05, + "loss": 2.6074, + "mean_token_accuracy": 0.42413793206214906, + "step": 110865 + }, + { + "epoch": 0.11166944153192707, + "grad_norm": 10.177074200698323, + "learning_rate": 4.953374145616535e-05, + "loss": 2.7441, + "mean_token_accuracy": 0.3999999940395355, + "step": 110870 + }, + { + "epoch": 0.11167447758503124, + "grad_norm": 13.473949425816114, + "learning_rate": 4.9533665564779215e-05, + "loss": 2.6249, + "mean_token_accuracy": 0.3896551787853241, + "step": 110875 + }, + { + "epoch": 0.11167951363813541, + "grad_norm": 9.925723353855735, + "learning_rate": 4.9533589667281964e-05, + "loss": 2.2559, + "mean_token_accuracy": 0.47931034564971925, + "step": 110880 + }, + { + "epoch": 0.11168454969123959, + "grad_norm": 11.997162726550556, + "learning_rate": 4.9533513763673614e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.3896551698446274, + "step": 110885 + }, + { + "epoch": 0.11168958574434376, + "grad_norm": 9.295672855691638, + "learning_rate": 4.9533437853954186e-05, + "loss": 2.0838, + "mean_token_accuracy": 0.5068965554237366, + "step": 110890 + }, + { + "epoch": 0.11169462179744793, + "grad_norm": 10.298976124105616, + "learning_rate": 4.95333619381237e-05, + "loss": 2.6653, + "mean_token_accuracy": 0.39310343861579894, + "step": 110895 + }, + { + "epoch": 0.11169965785055211, + "grad_norm": 10.337588599310013, + "learning_rate": 4.953328601618219e-05, + "loss": 2.3744, + "mean_token_accuracy": 0.43103447556495667, + "step": 110900 + }, + { + "epoch": 0.11170469390365627, + "grad_norm": 10.603923910155563, + "learning_rate": 4.9533210088129654e-05, + "loss": 2.5229, + "mean_token_accuracy": 0.41724138259887694, + "step": 110905 + }, + { + "epoch": 0.11170972995676044, + "grad_norm": 11.612979925316814, + "learning_rate": 4.953313415396613e-05, + "loss": 2.1222, + "mean_token_accuracy": 0.46551724672317507, + "step": 110910 + }, + { + "epoch": 0.11171476600986462, + "grad_norm": 9.193943927243676, + "learning_rate": 4.953305821369163e-05, + "loss": 1.8503, + "mean_token_accuracy": 0.5344827532768249, + "step": 110915 + }, + { + "epoch": 0.11171980206296879, + "grad_norm": 10.12334670770148, + "learning_rate": 4.953298226730618e-05, + "loss": 2.4957, + "mean_token_accuracy": 0.4275861978530884, + "step": 110920 + }, + { + "epoch": 0.11172483811607296, + "grad_norm": 9.054474149665763, + "learning_rate": 4.953290631480981e-05, + "loss": 1.8832, + "mean_token_accuracy": 0.5206896603107453, + "step": 110925 + }, + { + "epoch": 0.11172987416917714, + "grad_norm": 10.053945288436957, + "learning_rate": 4.953283035620251e-05, + "loss": 2.1417, + "mean_token_accuracy": 0.4517241418361664, + "step": 110930 + }, + { + "epoch": 0.11173491022228131, + "grad_norm": 9.412402485151269, + "learning_rate": 4.9532754391484346e-05, + "loss": 2.1324, + "mean_token_accuracy": 0.4862069010734558, + "step": 110935 + }, + { + "epoch": 0.11173994627538548, + "grad_norm": 9.430807208054931, + "learning_rate": 4.95326784206553e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.45517241954803467, + "step": 110940 + }, + { + "epoch": 0.11174498232848966, + "grad_norm": 10.463232152204137, + "learning_rate": 4.953260244371542e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.4068965494632721, + "step": 110945 + }, + { + "epoch": 0.11175001838159383, + "grad_norm": 9.247274572045685, + "learning_rate": 4.95325264606647e-05, + "loss": 2.2117, + "mean_token_accuracy": 0.42758620381355283, + "step": 110950 + }, + { + "epoch": 0.111755054434698, + "grad_norm": 10.650369682632245, + "learning_rate": 4.953245047150319e-05, + "loss": 2.3998, + "mean_token_accuracy": 0.39655172228813174, + "step": 110955 + }, + { + "epoch": 0.11176009048780218, + "grad_norm": 12.732139309490627, + "learning_rate": 4.953237447623089e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.4, + "step": 110960 + }, + { + "epoch": 0.11176512654090635, + "grad_norm": 10.861681915362963, + "learning_rate": 4.953229847484783e-05, + "loss": 2.3366, + "mean_token_accuracy": 0.3896551728248596, + "step": 110965 + }, + { + "epoch": 0.11177016259401053, + "grad_norm": 9.858798211628004, + "learning_rate": 4.9532222467354034e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.4982456147670746, + "step": 110970 + }, + { + "epoch": 0.11177519864711469, + "grad_norm": 10.124148828320575, + "learning_rate": 4.953214645374952e-05, + "loss": 2.4675, + "mean_token_accuracy": 0.458620685338974, + "step": 110975 + }, + { + "epoch": 0.11178023470021886, + "grad_norm": 16.524433944756108, + "learning_rate": 4.95320704340343e-05, + "loss": 3.1711, + "mean_token_accuracy": 0.358620685338974, + "step": 110980 + }, + { + "epoch": 0.11178527075332303, + "grad_norm": 11.320927483490227, + "learning_rate": 4.95319944082084e-05, + "loss": 2.2631, + "mean_token_accuracy": 0.42758620977401735, + "step": 110985 + }, + { + "epoch": 0.11179030680642721, + "grad_norm": 12.663338965840186, + "learning_rate": 4.953191837627186e-05, + "loss": 2.5605, + "mean_token_accuracy": 0.4689655065536499, + "step": 110990 + }, + { + "epoch": 0.11179534285953138, + "grad_norm": 13.396691761271915, + "learning_rate": 4.9531842338224675e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.4310344815254211, + "step": 110995 + }, + { + "epoch": 0.11180037891263556, + "grad_norm": 12.472089474532494, + "learning_rate": 4.953176629406688e-05, + "loss": 2.1972, + "mean_token_accuracy": 0.4689655125141144, + "step": 111000 + }, + { + "epoch": 0.11180541496573973, + "grad_norm": 14.046998961968201, + "learning_rate": 4.953169024379849e-05, + "loss": 2.3181, + "mean_token_accuracy": 0.42758620977401735, + "step": 111005 + }, + { + "epoch": 0.1118104510188439, + "grad_norm": 9.540763109644704, + "learning_rate": 4.953161418741953e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.4931034505367279, + "step": 111010 + }, + { + "epoch": 0.11181548707194808, + "grad_norm": 13.756471299461124, + "learning_rate": 4.9531538124930017e-05, + "loss": 2.8185, + "mean_token_accuracy": 0.3896551728248596, + "step": 111015 + }, + { + "epoch": 0.11182052312505225, + "grad_norm": 8.693958263619736, + "learning_rate": 4.9531462056329976e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.4103448212146759, + "step": 111020 + }, + { + "epoch": 0.11182555917815643, + "grad_norm": 10.713517853928321, + "learning_rate": 4.9531385981619426e-05, + "loss": 2.0694, + "mean_token_accuracy": 0.4931034445762634, + "step": 111025 + }, + { + "epoch": 0.1118305952312606, + "grad_norm": 9.69548130710903, + "learning_rate": 4.9531309900798386e-05, + "loss": 2.3841, + "mean_token_accuracy": 0.4689655065536499, + "step": 111030 + }, + { + "epoch": 0.11183563128436477, + "grad_norm": 12.798650720802252, + "learning_rate": 4.953123381386689e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.4034482777118683, + "step": 111035 + }, + { + "epoch": 0.11184066733746895, + "grad_norm": 11.525139206890879, + "learning_rate": 4.953115772082495e-05, + "loss": 2.8028, + "mean_token_accuracy": 0.39310344457626345, + "step": 111040 + }, + { + "epoch": 0.1118457033905731, + "grad_norm": 10.865228083289118, + "learning_rate": 4.953108162167257e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.441379314661026, + "step": 111045 + }, + { + "epoch": 0.11185073944367728, + "grad_norm": 10.565659199273556, + "learning_rate": 4.95310055164098e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.4310344815254211, + "step": 111050 + }, + { + "epoch": 0.11185577549678145, + "grad_norm": 10.237238183303273, + "learning_rate": 4.9530929405036644e-05, + "loss": 2.8823, + "mean_token_accuracy": 0.3448275804519653, + "step": 111055 + }, + { + "epoch": 0.11186081154988563, + "grad_norm": 9.951644754213525, + "learning_rate": 4.953085328755313e-05, + "loss": 2.7408, + "mean_token_accuracy": 0.37241379022598264, + "step": 111060 + }, + { + "epoch": 0.1118658476029898, + "grad_norm": 10.484451819199595, + "learning_rate": 4.953077716395928e-05, + "loss": 2.1717, + "mean_token_accuracy": 0.441379314661026, + "step": 111065 + }, + { + "epoch": 0.11187088365609398, + "grad_norm": 12.531515707193178, + "learning_rate": 4.95307010342551e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.38620689511299133, + "step": 111070 + }, + { + "epoch": 0.11187591970919815, + "grad_norm": 9.875491658205487, + "learning_rate": 4.953062489844064e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4586206912994385, + "step": 111075 + }, + { + "epoch": 0.11188095576230232, + "grad_norm": 10.313897918031554, + "learning_rate": 4.953054875651589e-05, + "loss": 2.5532, + "mean_token_accuracy": 0.4275862157344818, + "step": 111080 + }, + { + "epoch": 0.1118859918154065, + "grad_norm": 10.97537884778486, + "learning_rate": 4.953047260848089e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.4172413766384125, + "step": 111085 + }, + { + "epoch": 0.11189102786851067, + "grad_norm": 10.810061069684265, + "learning_rate": 4.953039645433565e-05, + "loss": 2.478, + "mean_token_accuracy": 0.4000000059604645, + "step": 111090 + }, + { + "epoch": 0.11189606392161484, + "grad_norm": 9.310833566438879, + "learning_rate": 4.953032029408021e-05, + "loss": 2.3566, + "mean_token_accuracy": 0.46896551847457885, + "step": 111095 + }, + { + "epoch": 0.11190109997471902, + "grad_norm": 9.468411684501064, + "learning_rate": 4.9530244127714567e-05, + "loss": 2.4165, + "mean_token_accuracy": 0.4344827592372894, + "step": 111100 + }, + { + "epoch": 0.11190613602782319, + "grad_norm": 8.842924622038577, + "learning_rate": 4.953016795523876e-05, + "loss": 2.2643, + "mean_token_accuracy": 0.5049261093139649, + "step": 111105 + }, + { + "epoch": 0.11191117208092737, + "grad_norm": 10.918748783857758, + "learning_rate": 4.95300917766528e-05, + "loss": 2.1434, + "mean_token_accuracy": 0.42758620381355283, + "step": 111110 + }, + { + "epoch": 0.11191620813403153, + "grad_norm": 13.525772695657489, + "learning_rate": 4.9530015591956717e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.4605565667152405, + "step": 111115 + }, + { + "epoch": 0.1119212441871357, + "grad_norm": 9.816509222091243, + "learning_rate": 4.952993940115052e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.44482757449150084, + "step": 111120 + }, + { + "epoch": 0.11192628024023987, + "grad_norm": 9.712713442780796, + "learning_rate": 4.952986320423424e-05, + "loss": 2.2117, + "mean_token_accuracy": 0.4586206912994385, + "step": 111125 + }, + { + "epoch": 0.11193131629334405, + "grad_norm": 12.10666252517279, + "learning_rate": 4.9529787001207895e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.39655172228813174, + "step": 111130 + }, + { + "epoch": 0.11193635234644822, + "grad_norm": 12.167263075450759, + "learning_rate": 4.95297107920715e-05, + "loss": 2.5754, + "mean_token_accuracy": 0.3758620619773865, + "step": 111135 + }, + { + "epoch": 0.1119413883995524, + "grad_norm": 14.332584530072541, + "learning_rate": 4.95296345768251e-05, + "loss": 2.9891, + "mean_token_accuracy": 0.4413793087005615, + "step": 111140 + }, + { + "epoch": 0.11194642445265657, + "grad_norm": 9.670487177999423, + "learning_rate": 4.9529558355468685e-05, + "loss": 2.6526, + "mean_token_accuracy": 0.38275861740112305, + "step": 111145 + }, + { + "epoch": 0.11195146050576074, + "grad_norm": 9.62758780723168, + "learning_rate": 4.9529482128002296e-05, + "loss": 2.0316, + "mean_token_accuracy": 0.5194192349910736, + "step": 111150 + }, + { + "epoch": 0.11195649655886492, + "grad_norm": 11.294622904054343, + "learning_rate": 4.9529405894425946e-05, + "loss": 2.5927, + "mean_token_accuracy": 0.42583181858062746, + "step": 111155 + }, + { + "epoch": 0.11196153261196909, + "grad_norm": 9.049024622146542, + "learning_rate": 4.9529329654739656e-05, + "loss": 2.3028, + "mean_token_accuracy": 0.41379311084747317, + "step": 111160 + }, + { + "epoch": 0.11196656866507326, + "grad_norm": 7.460371526434573, + "learning_rate": 4.952925340894345e-05, + "loss": 2.272, + "mean_token_accuracy": 0.45517241954803467, + "step": 111165 + }, + { + "epoch": 0.11197160471817744, + "grad_norm": 10.561710427335155, + "learning_rate": 4.952917715703735e-05, + "loss": 2.6899, + "mean_token_accuracy": 0.3275861978530884, + "step": 111170 + }, + { + "epoch": 0.11197664077128161, + "grad_norm": 8.509174651008847, + "learning_rate": 4.9529100899021376e-05, + "loss": 2.1767, + "mean_token_accuracy": 0.4517241418361664, + "step": 111175 + }, + { + "epoch": 0.11198167682438578, + "grad_norm": 9.766128160097967, + "learning_rate": 4.952902463489555e-05, + "loss": 2.2074, + "mean_token_accuracy": 0.482758629322052, + "step": 111180 + }, + { + "epoch": 0.11198671287748994, + "grad_norm": 11.084506233577857, + "learning_rate": 4.9528948364659885e-05, + "loss": 2.6478, + "mean_token_accuracy": 0.42885662317276, + "step": 111185 + }, + { + "epoch": 0.11199174893059412, + "grad_norm": 11.296836032036177, + "learning_rate": 4.952887208831441e-05, + "loss": 2.285, + "mean_token_accuracy": 0.45862069725990295, + "step": 111190 + }, + { + "epoch": 0.11199678498369829, + "grad_norm": 10.208591807688657, + "learning_rate": 4.952879580585915e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.36896551251411436, + "step": 111195 + }, + { + "epoch": 0.11200182103680247, + "grad_norm": 12.768241008486962, + "learning_rate": 4.9528719517294125e-05, + "loss": 2.5838, + "mean_token_accuracy": 0.42758620381355283, + "step": 111200 + }, + { + "epoch": 0.11200685708990664, + "grad_norm": 11.832199326700236, + "learning_rate": 4.952864322261935e-05, + "loss": 2.5365, + "mean_token_accuracy": 0.3862068921327591, + "step": 111205 + }, + { + "epoch": 0.11201189314301081, + "grad_norm": 10.47701256959548, + "learning_rate": 4.952856692183483e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.42758620977401735, + "step": 111210 + }, + { + "epoch": 0.11201692919611499, + "grad_norm": 11.959750929542492, + "learning_rate": 4.952849061494063e-05, + "loss": 2.0576, + "mean_token_accuracy": 0.5399878978729248, + "step": 111215 + }, + { + "epoch": 0.11202196524921916, + "grad_norm": 14.215202551521886, + "learning_rate": 4.9528414301936736e-05, + "loss": 2.2206, + "mean_token_accuracy": 0.5229280173778534, + "step": 111220 + }, + { + "epoch": 0.11202700130232333, + "grad_norm": 11.877617653268583, + "learning_rate": 4.952833798282318e-05, + "loss": 2.1696, + "mean_token_accuracy": 0.4482758641242981, + "step": 111225 + }, + { + "epoch": 0.11203203735542751, + "grad_norm": 12.249088500671558, + "learning_rate": 4.9528261657599976e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.38275861740112305, + "step": 111230 + }, + { + "epoch": 0.11203707340853168, + "grad_norm": 10.589151339813798, + "learning_rate": 4.952818532626716e-05, + "loss": 2.4574, + "mean_token_accuracy": 0.4551724135875702, + "step": 111235 + }, + { + "epoch": 0.11204210946163586, + "grad_norm": 9.036791724909689, + "learning_rate": 4.9528108988824744e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.4880822837352753, + "step": 111240 + }, + { + "epoch": 0.11204714551474003, + "grad_norm": 9.974213918441478, + "learning_rate": 4.952803264527274e-05, + "loss": 2.0507, + "mean_token_accuracy": 0.4758620738983154, + "step": 111245 + }, + { + "epoch": 0.1120521815678442, + "grad_norm": 9.420207511631451, + "learning_rate": 4.952795629561119e-05, + "loss": 2.3749, + "mean_token_accuracy": 0.4103448212146759, + "step": 111250 + }, + { + "epoch": 0.11205721762094836, + "grad_norm": 13.017623951879994, + "learning_rate": 4.9527879939840104e-05, + "loss": 2.7838, + "mean_token_accuracy": 0.4517241418361664, + "step": 111255 + }, + { + "epoch": 0.11206225367405254, + "grad_norm": 10.082903539599533, + "learning_rate": 4.95278035779595e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.4344827592372894, + "step": 111260 + }, + { + "epoch": 0.11206728972715671, + "grad_norm": 10.479371746569397, + "learning_rate": 4.95277272099694e-05, + "loss": 2.328, + "mean_token_accuracy": 0.4506352126598358, + "step": 111265 + }, + { + "epoch": 0.11207232578026088, + "grad_norm": 9.717997432650758, + "learning_rate": 4.9527650835869826e-05, + "loss": 2.6857, + "mean_token_accuracy": 0.39655172228813174, + "step": 111270 + }, + { + "epoch": 0.11207736183336506, + "grad_norm": 9.907502702399379, + "learning_rate": 4.952757445566081e-05, + "loss": 2.3944, + "mean_token_accuracy": 0.42413793206214906, + "step": 111275 + }, + { + "epoch": 0.11208239788646923, + "grad_norm": 9.84157711374118, + "learning_rate": 4.9527498069342357e-05, + "loss": 2.2078, + "mean_token_accuracy": 0.458620685338974, + "step": 111280 + }, + { + "epoch": 0.1120874339395734, + "grad_norm": 11.632289960422431, + "learning_rate": 4.9527421676914505e-05, + "loss": 2.5868, + "mean_token_accuracy": 0.42758620977401735, + "step": 111285 + }, + { + "epoch": 0.11209246999267758, + "grad_norm": 11.210209146752737, + "learning_rate": 4.9527345278377255e-05, + "loss": 2.4733, + "mean_token_accuracy": 0.4068965494632721, + "step": 111290 + }, + { + "epoch": 0.11209750604578175, + "grad_norm": 10.436801095705171, + "learning_rate": 4.952726887373064e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.36551724672317504, + "step": 111295 + }, + { + "epoch": 0.11210254209888593, + "grad_norm": 11.892442080040759, + "learning_rate": 4.9527192462974684e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.3655172407627106, + "step": 111300 + }, + { + "epoch": 0.1121075781519901, + "grad_norm": 11.308370950501159, + "learning_rate": 4.9527116046109397e-05, + "loss": 2.3855, + "mean_token_accuracy": 0.4, + "step": 111305 + }, + { + "epoch": 0.11211261420509427, + "grad_norm": 9.276320636204971, + "learning_rate": 4.952703962313481e-05, + "loss": 2.063, + "mean_token_accuracy": 0.5086509466171265, + "step": 111310 + }, + { + "epoch": 0.11211765025819845, + "grad_norm": 10.219062619352373, + "learning_rate": 4.952696319405094e-05, + "loss": 2.2618, + "mean_token_accuracy": 0.4310344785451889, + "step": 111315 + }, + { + "epoch": 0.11212268631130262, + "grad_norm": 12.749503576301349, + "learning_rate": 4.952688675885782e-05, + "loss": 2.3858, + "mean_token_accuracy": 0.43297035694122316, + "step": 111320 + }, + { + "epoch": 0.11212772236440678, + "grad_norm": 11.997728127878338, + "learning_rate": 4.952681031755545e-05, + "loss": 2.278, + "mean_token_accuracy": 0.42068964838981626, + "step": 111325 + }, + { + "epoch": 0.11213275841751096, + "grad_norm": 9.185447867605896, + "learning_rate": 4.9526733870143874e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.45517240166664125, + "step": 111330 + }, + { + "epoch": 0.11213779447061513, + "grad_norm": 10.181894623507537, + "learning_rate": 4.9526657416623086e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.4, + "step": 111335 + }, + { + "epoch": 0.1121428305237193, + "grad_norm": 12.153119787161627, + "learning_rate": 4.952658095699313e-05, + "loss": 2.7726, + "mean_token_accuracy": 0.37586206793785093, + "step": 111340 + }, + { + "epoch": 0.11214786657682348, + "grad_norm": 11.543220030583123, + "learning_rate": 4.9526504491254015e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.4620689630508423, + "step": 111345 + }, + { + "epoch": 0.11215290262992765, + "grad_norm": 8.450532948394486, + "learning_rate": 4.952642801940577e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.4551724076271057, + "step": 111350 + }, + { + "epoch": 0.11215793868303182, + "grad_norm": 10.854961635333785, + "learning_rate": 4.952635154144842e-05, + "loss": 2.1853, + "mean_token_accuracy": 0.4448275864124298, + "step": 111355 + }, + { + "epoch": 0.112162974736136, + "grad_norm": 12.176724733024207, + "learning_rate": 4.952627505738197e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.47586206793785096, + "step": 111360 + }, + { + "epoch": 0.11216801078924017, + "grad_norm": 10.641635157863735, + "learning_rate": 4.952619856720645e-05, + "loss": 2.4683, + "mean_token_accuracy": 0.42232305407524107, + "step": 111365 + }, + { + "epoch": 0.11217304684234435, + "grad_norm": 9.728677723855975, + "learning_rate": 4.9526122070921886e-05, + "loss": 2.0068, + "mean_token_accuracy": 0.48275861144065857, + "step": 111370 + }, + { + "epoch": 0.11217808289544852, + "grad_norm": 11.115419596188751, + "learning_rate": 4.9526045568528294e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.4604355812072754, + "step": 111375 + }, + { + "epoch": 0.1121831189485527, + "grad_norm": 12.548180784201522, + "learning_rate": 4.952596906002569e-05, + "loss": 2.2223, + "mean_token_accuracy": 0.4551724135875702, + "step": 111380 + }, + { + "epoch": 0.11218815500165687, + "grad_norm": 11.537445129690736, + "learning_rate": 4.952589254541411e-05, + "loss": 2.4014, + "mean_token_accuracy": 0.49039408564567566, + "step": 111385 + }, + { + "epoch": 0.11219319105476104, + "grad_norm": 9.990629746328988, + "learning_rate": 4.952581602469357e-05, + "loss": 2.3455, + "mean_token_accuracy": 0.4463054180145264, + "step": 111390 + }, + { + "epoch": 0.1121982271078652, + "grad_norm": 10.539457799127904, + "learning_rate": 4.952573949786407e-05, + "loss": 2.6802, + "mean_token_accuracy": 0.42413793206214906, + "step": 111395 + }, + { + "epoch": 0.11220326316096937, + "grad_norm": 12.640962539544208, + "learning_rate": 4.9525662964925665e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.44827587008476255, + "step": 111400 + }, + { + "epoch": 0.11220829921407355, + "grad_norm": 13.685443646724211, + "learning_rate": 4.9525586425878355e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.4068965494632721, + "step": 111405 + }, + { + "epoch": 0.11221333526717772, + "grad_norm": 10.523390347403765, + "learning_rate": 4.952550988072217e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.4275861978530884, + "step": 111410 + }, + { + "epoch": 0.1122183713202819, + "grad_norm": 16.752881544991617, + "learning_rate": 4.952543332945712e-05, + "loss": 2.4511, + "mean_token_accuracy": 0.39310344457626345, + "step": 111415 + }, + { + "epoch": 0.11222340737338607, + "grad_norm": 17.69398408263333, + "learning_rate": 4.9525356772083235e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.4758620738983154, + "step": 111420 + }, + { + "epoch": 0.11222844342649024, + "grad_norm": 10.968546883937739, + "learning_rate": 4.9525280208600536e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.4724137902259827, + "step": 111425 + }, + { + "epoch": 0.11223347947959442, + "grad_norm": 15.899308460762535, + "learning_rate": 4.9525203639009055e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4344827592372894, + "step": 111430 + }, + { + "epoch": 0.11223851553269859, + "grad_norm": 11.667906683856852, + "learning_rate": 4.952512706330879e-05, + "loss": 2.4178, + "mean_token_accuracy": 0.4275861978530884, + "step": 111435 + }, + { + "epoch": 0.11224355158580276, + "grad_norm": 9.524507232600618, + "learning_rate": 4.9525050481499776e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4482758641242981, + "step": 111440 + }, + { + "epoch": 0.11224858763890694, + "grad_norm": 10.8551219848192, + "learning_rate": 4.952497389358203e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.3896551728248596, + "step": 111445 + }, + { + "epoch": 0.11225362369201111, + "grad_norm": 9.501695722841893, + "learning_rate": 4.9524897299555576e-05, + "loss": 2.36, + "mean_token_accuracy": 0.41034482717514037, + "step": 111450 + }, + { + "epoch": 0.11225865974511529, + "grad_norm": 10.847067942215253, + "learning_rate": 4.952482069942044e-05, + "loss": 2.9669, + "mean_token_accuracy": 0.35644282698631286, + "step": 111455 + }, + { + "epoch": 0.11226369579821946, + "grad_norm": 11.573546131623743, + "learning_rate": 4.952474409317664e-05, + "loss": 2.2444, + "mean_token_accuracy": 0.44482758045196535, + "step": 111460 + }, + { + "epoch": 0.11226873185132362, + "grad_norm": 10.969126447925802, + "learning_rate": 4.952466748082418e-05, + "loss": 2.6441, + "mean_token_accuracy": 0.38433151245117186, + "step": 111465 + }, + { + "epoch": 0.1122737679044278, + "grad_norm": 10.991123847201068, + "learning_rate": 4.952459086236311e-05, + "loss": 2.0642, + "mean_token_accuracy": 0.5226860284805298, + "step": 111470 + }, + { + "epoch": 0.11227880395753197, + "grad_norm": 12.886517572434629, + "learning_rate": 4.952451423779343e-05, + "loss": 2.7396, + "mean_token_accuracy": 0.41004234552383423, + "step": 111475 + }, + { + "epoch": 0.11228384001063614, + "grad_norm": 10.090199717238045, + "learning_rate": 4.952443760711518e-05, + "loss": 2.2984, + "mean_token_accuracy": 0.4206896543502808, + "step": 111480 + }, + { + "epoch": 0.11228887606374031, + "grad_norm": 12.082189845388879, + "learning_rate": 4.952436097032835e-05, + "loss": 2.3999, + "mean_token_accuracy": 0.36896551251411436, + "step": 111485 + }, + { + "epoch": 0.11229391211684449, + "grad_norm": 9.400050088194, + "learning_rate": 4.9524284327433e-05, + "loss": 2.3374, + "mean_token_accuracy": 0.4448275864124298, + "step": 111490 + }, + { + "epoch": 0.11229894816994866, + "grad_norm": 11.23501525366294, + "learning_rate": 4.952420767842912e-05, + "loss": 2.5844, + "mean_token_accuracy": 0.36551723480224607, + "step": 111495 + }, + { + "epoch": 0.11230398422305284, + "grad_norm": 9.646472311888301, + "learning_rate": 4.952413102331675e-05, + "loss": 2.2016, + "mean_token_accuracy": 0.44700543880462645, + "step": 111500 + }, + { + "epoch": 0.11230902027615701, + "grad_norm": 10.563971354164007, + "learning_rate": 4.952405436209591e-05, + "loss": 2.3543, + "mean_token_accuracy": 0.46896551847457885, + "step": 111505 + }, + { + "epoch": 0.11231405632926118, + "grad_norm": 18.897577827448874, + "learning_rate": 4.952397769476661e-05, + "loss": 2.7902, + "mean_token_accuracy": 0.38965516686439516, + "step": 111510 + }, + { + "epoch": 0.11231909238236536, + "grad_norm": 10.7458985397591, + "learning_rate": 4.952390102132888e-05, + "loss": 2.3902, + "mean_token_accuracy": 0.41379310488700866, + "step": 111515 + }, + { + "epoch": 0.11232412843546953, + "grad_norm": 10.344108476463264, + "learning_rate": 4.9523824341782735e-05, + "loss": 2.3228, + "mean_token_accuracy": 0.4551724135875702, + "step": 111520 + }, + { + "epoch": 0.1123291644885737, + "grad_norm": 10.856748075871668, + "learning_rate": 4.9523747656128204e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.46896551847457885, + "step": 111525 + }, + { + "epoch": 0.11233420054167788, + "grad_norm": 11.77775214849814, + "learning_rate": 4.9523670964365306e-05, + "loss": 2.4635, + "mean_token_accuracy": 0.4344827592372894, + "step": 111530 + }, + { + "epoch": 0.11233923659478204, + "grad_norm": 9.605010599188414, + "learning_rate": 4.952359426649405e-05, + "loss": 2.0447, + "mean_token_accuracy": 0.4655172348022461, + "step": 111535 + }, + { + "epoch": 0.11234427264788621, + "grad_norm": 25.123472258204288, + "learning_rate": 4.952351756251448e-05, + "loss": 3.3984, + "mean_token_accuracy": 0.3448275923728943, + "step": 111540 + }, + { + "epoch": 0.11234930870099039, + "grad_norm": 10.951970184018228, + "learning_rate": 4.95234408524266e-05, + "loss": 2.5563, + "mean_token_accuracy": 0.3931034505367279, + "step": 111545 + }, + { + "epoch": 0.11235434475409456, + "grad_norm": 12.167939679656278, + "learning_rate": 4.952336413623044e-05, + "loss": 2.4186, + "mean_token_accuracy": 0.38965516686439516, + "step": 111550 + }, + { + "epoch": 0.11235938080719873, + "grad_norm": 10.804955590322109, + "learning_rate": 4.952328741392601e-05, + "loss": 2.8534, + "mean_token_accuracy": 0.4172413766384125, + "step": 111555 + }, + { + "epoch": 0.11236441686030291, + "grad_norm": 9.437564171340664, + "learning_rate": 4.952321068551335e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.46551724672317507, + "step": 111560 + }, + { + "epoch": 0.11236945291340708, + "grad_norm": 8.240952102744117, + "learning_rate": 4.952313395099246e-05, + "loss": 2.1984, + "mean_token_accuracy": 0.4413793087005615, + "step": 111565 + }, + { + "epoch": 0.11237448896651125, + "grad_norm": 14.237742846783584, + "learning_rate": 4.952305721036338e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.4053236573934555, + "step": 111570 + }, + { + "epoch": 0.11237952501961543, + "grad_norm": 11.417725487580878, + "learning_rate": 4.952298046362612e-05, + "loss": 2.077, + "mean_token_accuracy": 0.4965517222881317, + "step": 111575 + }, + { + "epoch": 0.1123845610727196, + "grad_norm": 11.426746032620931, + "learning_rate": 4.95229037107807e-05, + "loss": 2.2463, + "mean_token_accuracy": 0.441379314661026, + "step": 111580 + }, + { + "epoch": 0.11238959712582378, + "grad_norm": 11.645258676917305, + "learning_rate": 4.9522826951827156e-05, + "loss": 2.51, + "mean_token_accuracy": 0.4103448331356049, + "step": 111585 + }, + { + "epoch": 0.11239463317892795, + "grad_norm": 9.81325088724214, + "learning_rate": 4.952275018676549e-05, + "loss": 2.7684, + "mean_token_accuracy": 0.42068964838981626, + "step": 111590 + }, + { + "epoch": 0.11239966923203212, + "grad_norm": 11.137167333862763, + "learning_rate": 4.952267341559573e-05, + "loss": 2.4601, + "mean_token_accuracy": 0.4, + "step": 111595 + }, + { + "epoch": 0.1124047052851363, + "grad_norm": 10.623380760707912, + "learning_rate": 4.9522596638317905e-05, + "loss": 2.5053, + "mean_token_accuracy": 0.4, + "step": 111600 + }, + { + "epoch": 0.11240974133824046, + "grad_norm": 9.607679715333383, + "learning_rate": 4.952251985493203e-05, + "loss": 2.6569, + "mean_token_accuracy": 0.403448274731636, + "step": 111605 + }, + { + "epoch": 0.11241477739134463, + "grad_norm": 11.798647927684359, + "learning_rate": 4.952244306543813e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.38620689511299133, + "step": 111610 + }, + { + "epoch": 0.1124198134444488, + "grad_norm": 10.75262759235998, + "learning_rate": 4.9522366269836215e-05, + "loss": 2.7987, + "mean_token_accuracy": 0.4068965494632721, + "step": 111615 + }, + { + "epoch": 0.11242484949755298, + "grad_norm": 10.558491694008719, + "learning_rate": 4.952228946812631e-05, + "loss": 2.8253, + "mean_token_accuracy": 0.36206896007061007, + "step": 111620 + }, + { + "epoch": 0.11242988555065715, + "grad_norm": 8.685177177048237, + "learning_rate": 4.952221266030846e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.4103448212146759, + "step": 111625 + }, + { + "epoch": 0.11243492160376133, + "grad_norm": 8.81007218401979, + "learning_rate": 4.952213584638265e-05, + "loss": 2.2682, + "mean_token_accuracy": 0.43793103098869324, + "step": 111630 + }, + { + "epoch": 0.1124399576568655, + "grad_norm": 8.547261276375915, + "learning_rate": 4.9522059026348924e-05, + "loss": 2.295, + "mean_token_accuracy": 0.46551724076271056, + "step": 111635 + }, + { + "epoch": 0.11244499370996967, + "grad_norm": 12.66313043718746, + "learning_rate": 4.9521982200207305e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.4724137902259827, + "step": 111640 + }, + { + "epoch": 0.11245002976307385, + "grad_norm": 9.126060731176104, + "learning_rate": 4.95219053679578e-05, + "loss": 2.2014, + "mean_token_accuracy": 0.45172412395477296, + "step": 111645 + }, + { + "epoch": 0.11245506581617802, + "grad_norm": 10.78781175057242, + "learning_rate": 4.952182852960044e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.42758620977401735, + "step": 111650 + }, + { + "epoch": 0.1124601018692822, + "grad_norm": 11.503963056929237, + "learning_rate": 4.9521751685135234e-05, + "loss": 2.6663, + "mean_token_accuracy": 0.4068965494632721, + "step": 111655 + }, + { + "epoch": 0.11246513792238637, + "grad_norm": 9.379835630985818, + "learning_rate": 4.952167483456223e-05, + "loss": 2.4182, + "mean_token_accuracy": 0.4482758641242981, + "step": 111660 + }, + { + "epoch": 0.11247017397549054, + "grad_norm": 9.436200834845094, + "learning_rate": 4.952159797788142e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.3999999940395355, + "step": 111665 + }, + { + "epoch": 0.11247521002859472, + "grad_norm": 10.27729235818234, + "learning_rate": 4.9521521115092846e-05, + "loss": 2.6232, + "mean_token_accuracy": 0.43103448748588563, + "step": 111670 + }, + { + "epoch": 0.11248024608169888, + "grad_norm": 9.30369414209824, + "learning_rate": 4.952144424619651e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.4103448301553726, + "step": 111675 + }, + { + "epoch": 0.11248528213480305, + "grad_norm": 10.263524921083793, + "learning_rate": 4.952136737119245e-05, + "loss": 2.2314, + "mean_token_accuracy": 0.4379310369491577, + "step": 111680 + }, + { + "epoch": 0.11249031818790722, + "grad_norm": 9.987483150347861, + "learning_rate": 4.952129049008068e-05, + "loss": 2.414, + "mean_token_accuracy": 0.4, + "step": 111685 + }, + { + "epoch": 0.1124953542410114, + "grad_norm": 12.314383439974607, + "learning_rate": 4.952121360286123e-05, + "loss": 2.284, + "mean_token_accuracy": 0.43448275327682495, + "step": 111690 + }, + { + "epoch": 0.11250039029411557, + "grad_norm": 14.527667997174206, + "learning_rate": 4.952113670953411e-05, + "loss": 2.5226, + "mean_token_accuracy": 0.4137930989265442, + "step": 111695 + }, + { + "epoch": 0.11250542634721974, + "grad_norm": 10.308531318547193, + "learning_rate": 4.9521059810099347e-05, + "loss": 2.275, + "mean_token_accuracy": 0.4850574791431427, + "step": 111700 + }, + { + "epoch": 0.11251046240032392, + "grad_norm": 12.085283609470673, + "learning_rate": 4.952098290455696e-05, + "loss": 2.3342, + "mean_token_accuracy": 0.4344827473163605, + "step": 111705 + }, + { + "epoch": 0.11251549845342809, + "grad_norm": 10.152063430710427, + "learning_rate": 4.952090599290697e-05, + "loss": 2.8539, + "mean_token_accuracy": 0.37241379618644715, + "step": 111710 + }, + { + "epoch": 0.11252053450653227, + "grad_norm": 10.46692517274618, + "learning_rate": 4.95208290751494e-05, + "loss": 2.407, + "mean_token_accuracy": 0.4413793087005615, + "step": 111715 + }, + { + "epoch": 0.11252557055963644, + "grad_norm": 11.351004995776478, + "learning_rate": 4.952075215128427e-05, + "loss": 2.2432, + "mean_token_accuracy": 0.44482758045196535, + "step": 111720 + }, + { + "epoch": 0.11253060661274061, + "grad_norm": 11.844090882535875, + "learning_rate": 4.9520675221311596e-05, + "loss": 2.7212, + "mean_token_accuracy": 0.4310344934463501, + "step": 111725 + }, + { + "epoch": 0.11253564266584479, + "grad_norm": 9.56817327362832, + "learning_rate": 4.9520598285231416e-05, + "loss": 2.283, + "mean_token_accuracy": 0.4068965494632721, + "step": 111730 + }, + { + "epoch": 0.11254067871894896, + "grad_norm": 10.643896297863472, + "learning_rate": 4.952052134304374e-05, + "loss": 2.3846, + "mean_token_accuracy": 0.4103448331356049, + "step": 111735 + }, + { + "epoch": 0.11254571477205313, + "grad_norm": 9.618497514387489, + "learning_rate": 4.952044439474858e-05, + "loss": 2.6347, + "mean_token_accuracy": 0.39310344457626345, + "step": 111740 + }, + { + "epoch": 0.1125507508251573, + "grad_norm": 21.129664013225085, + "learning_rate": 4.952036744034598e-05, + "loss": 2.2762, + "mean_token_accuracy": 0.47241378426551817, + "step": 111745 + }, + { + "epoch": 0.11255578687826147, + "grad_norm": 12.46011851105571, + "learning_rate": 4.9520290479835945e-05, + "loss": 2.6727, + "mean_token_accuracy": 0.4172413766384125, + "step": 111750 + }, + { + "epoch": 0.11256082293136564, + "grad_norm": 10.375489734873824, + "learning_rate": 4.95202135132185e-05, + "loss": 2.5563, + "mean_token_accuracy": 0.4, + "step": 111755 + }, + { + "epoch": 0.11256585898446982, + "grad_norm": 12.222679199375383, + "learning_rate": 4.952013654049367e-05, + "loss": 2.568, + "mean_token_accuracy": 0.39310344457626345, + "step": 111760 + }, + { + "epoch": 0.11257089503757399, + "grad_norm": 19.109422920181366, + "learning_rate": 4.9520059561661464e-05, + "loss": 2.6066, + "mean_token_accuracy": 0.4172413766384125, + "step": 111765 + }, + { + "epoch": 0.11257593109067816, + "grad_norm": 11.38413408592637, + "learning_rate": 4.9519982576721916e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.4586206912994385, + "step": 111770 + }, + { + "epoch": 0.11258096714378234, + "grad_norm": 10.814878962772442, + "learning_rate": 4.951990558567505e-05, + "loss": 2.6322, + "mean_token_accuracy": 0.37241379022598264, + "step": 111775 + }, + { + "epoch": 0.11258600319688651, + "grad_norm": 10.028460252741345, + "learning_rate": 4.951982858852087e-05, + "loss": 2.4866, + "mean_token_accuracy": 0.3965517282485962, + "step": 111780 + }, + { + "epoch": 0.11259103924999068, + "grad_norm": 9.565090671659185, + "learning_rate": 4.951975158525941e-05, + "loss": 2.1258, + "mean_token_accuracy": 0.4517241358757019, + "step": 111785 + }, + { + "epoch": 0.11259607530309486, + "grad_norm": 10.568144969376164, + "learning_rate": 4.95196745758907e-05, + "loss": 2.6386, + "mean_token_accuracy": 0.39310344457626345, + "step": 111790 + }, + { + "epoch": 0.11260111135619903, + "grad_norm": 8.861342015210543, + "learning_rate": 4.951959756041475e-05, + "loss": 2.1312, + "mean_token_accuracy": 0.512401682138443, + "step": 111795 + }, + { + "epoch": 0.1126061474093032, + "grad_norm": 18.477909269401962, + "learning_rate": 4.9519520538831573e-05, + "loss": 2.6257, + "mean_token_accuracy": 0.44137930274009707, + "step": 111800 + }, + { + "epoch": 0.11261118346240738, + "grad_norm": 9.666251068916313, + "learning_rate": 4.951944351114121e-05, + "loss": 2.558, + "mean_token_accuracy": 0.4344827592372894, + "step": 111805 + }, + { + "epoch": 0.11261621951551155, + "grad_norm": 9.711545888476971, + "learning_rate": 4.951936647734366e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.42413793206214906, + "step": 111810 + }, + { + "epoch": 0.11262125556861571, + "grad_norm": 9.849719905149824, + "learning_rate": 4.9519289437438966e-05, + "loss": 2.0468, + "mean_token_accuracy": 0.458620685338974, + "step": 111815 + }, + { + "epoch": 0.11262629162171989, + "grad_norm": 11.402520882318544, + "learning_rate": 4.9519212391427137e-05, + "loss": 2.4914, + "mean_token_accuracy": 0.3896551728248596, + "step": 111820 + }, + { + "epoch": 0.11263132767482406, + "grad_norm": 10.796561043821034, + "learning_rate": 4.95191353393082e-05, + "loss": 2.3916, + "mean_token_accuracy": 0.491379314661026, + "step": 111825 + }, + { + "epoch": 0.11263636372792823, + "grad_norm": 9.584189099628968, + "learning_rate": 4.951905828108217e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.3806412577629089, + "step": 111830 + }, + { + "epoch": 0.11264139978103241, + "grad_norm": 10.340988010763109, + "learning_rate": 4.951898121674907e-05, + "loss": 2.7361, + "mean_token_accuracy": 0.4034482777118683, + "step": 111835 + }, + { + "epoch": 0.11264643583413658, + "grad_norm": 10.007496824739125, + "learning_rate": 4.951890414630893e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.4068965494632721, + "step": 111840 + }, + { + "epoch": 0.11265147188724076, + "grad_norm": 10.643485027265461, + "learning_rate": 4.951882706976176e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.4482758641242981, + "step": 111845 + }, + { + "epoch": 0.11265650794034493, + "grad_norm": 8.10109905816921, + "learning_rate": 4.951874998710759e-05, + "loss": 2.601, + "mean_token_accuracy": 0.47586206793785096, + "step": 111850 + }, + { + "epoch": 0.1126615439934491, + "grad_norm": 10.173892208865098, + "learning_rate": 4.9518672898346434e-05, + "loss": 2.8477, + "mean_token_accuracy": 0.39655172228813174, + "step": 111855 + }, + { + "epoch": 0.11266658004655328, + "grad_norm": 10.112922739474547, + "learning_rate": 4.951859580347832e-05, + "loss": 2.2828, + "mean_token_accuracy": 0.46896551847457885, + "step": 111860 + }, + { + "epoch": 0.11267161609965745, + "grad_norm": 11.99106759352344, + "learning_rate": 4.951851870250326e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.3827586203813553, + "step": 111865 + }, + { + "epoch": 0.11267665215276162, + "grad_norm": 9.537426146310862, + "learning_rate": 4.951844159542129e-05, + "loss": 2.6249, + "mean_token_accuracy": 0.3896551787853241, + "step": 111870 + }, + { + "epoch": 0.1126816882058658, + "grad_norm": 8.538851622621495, + "learning_rate": 4.9518364482232416e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.42758620381355283, + "step": 111875 + }, + { + "epoch": 0.11268672425896997, + "grad_norm": 10.163664557694991, + "learning_rate": 4.9518287362936665e-05, + "loss": 2.3424, + "mean_token_accuracy": 0.4413793087005615, + "step": 111880 + }, + { + "epoch": 0.11269176031207413, + "grad_norm": 11.36191656978102, + "learning_rate": 4.9518210237534077e-05, + "loss": 2.4425, + "mean_token_accuracy": 0.4068965524435043, + "step": 111885 + }, + { + "epoch": 0.1126967963651783, + "grad_norm": 10.886909854495062, + "learning_rate": 4.951813310602463e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.4275861978530884, + "step": 111890 + }, + { + "epoch": 0.11270183241828248, + "grad_norm": 10.924110545016852, + "learning_rate": 4.951805596840839e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.41379311084747317, + "step": 111895 + }, + { + "epoch": 0.11270686847138665, + "grad_norm": 33.09264553973042, + "learning_rate": 4.951797882468536e-05, + "loss": 2.9556, + "mean_token_accuracy": 0.3827586233615875, + "step": 111900 + }, + { + "epoch": 0.11271190452449083, + "grad_norm": 14.067818432535168, + "learning_rate": 4.9517901674855556e-05, + "loss": 2.5971, + "mean_token_accuracy": 0.4379310369491577, + "step": 111905 + }, + { + "epoch": 0.112716940577595, + "grad_norm": 9.457722548284556, + "learning_rate": 4.9517824518919e-05, + "loss": 1.9271, + "mean_token_accuracy": 0.5125226736068725, + "step": 111910 + }, + { + "epoch": 0.11272197663069917, + "grad_norm": 18.477518320215772, + "learning_rate": 4.9517747356875733e-05, + "loss": 2.7695, + "mean_token_accuracy": 0.42758620977401735, + "step": 111915 + }, + { + "epoch": 0.11272701268380335, + "grad_norm": 10.219748128361198, + "learning_rate": 4.951767018872575e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.4517241418361664, + "step": 111920 + }, + { + "epoch": 0.11273204873690752, + "grad_norm": 11.068663896903583, + "learning_rate": 4.951759301446909e-05, + "loss": 2.732, + "mean_token_accuracy": 0.38275861740112305, + "step": 111925 + }, + { + "epoch": 0.1127370847900117, + "grad_norm": 10.43735099678545, + "learning_rate": 4.9517515834105757e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.4379310429096222, + "step": 111930 + }, + { + "epoch": 0.11274212084311587, + "grad_norm": 9.635107309744868, + "learning_rate": 4.951743864763579e-05, + "loss": 2.0681, + "mean_token_accuracy": 0.4746521532535553, + "step": 111935 + }, + { + "epoch": 0.11274715689622004, + "grad_norm": 13.786354329463379, + "learning_rate": 4.951736145505921e-05, + "loss": 2.0381, + "mean_token_accuracy": 0.4862069010734558, + "step": 111940 + }, + { + "epoch": 0.11275219294932422, + "grad_norm": 9.843596638262246, + "learning_rate": 4.951728425637603e-05, + "loss": 2.3695, + "mean_token_accuracy": 0.4517241418361664, + "step": 111945 + }, + { + "epoch": 0.11275722900242839, + "grad_norm": 9.712538126988122, + "learning_rate": 4.951720705158627e-05, + "loss": 2.2845, + "mean_token_accuracy": 0.4517241418361664, + "step": 111950 + }, + { + "epoch": 0.11276226505553255, + "grad_norm": 14.041250514925462, + "learning_rate": 4.9517129840689965e-05, + "loss": 2.5721, + "mean_token_accuracy": 0.37586206793785093, + "step": 111955 + }, + { + "epoch": 0.11276730110863672, + "grad_norm": 12.515475796807888, + "learning_rate": 4.951705262368711e-05, + "loss": 2.044, + "mean_token_accuracy": 0.46551724672317507, + "step": 111960 + }, + { + "epoch": 0.1127723371617409, + "grad_norm": 11.426288377859997, + "learning_rate": 4.9516975400577756e-05, + "loss": 2.5846, + "mean_token_accuracy": 0.35862069129943847, + "step": 111965 + }, + { + "epoch": 0.11277737321484507, + "grad_norm": 9.625575267367854, + "learning_rate": 4.9516898171361916e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.4620689570903778, + "step": 111970 + }, + { + "epoch": 0.11278240926794925, + "grad_norm": 9.796840021384469, + "learning_rate": 4.95168209360396e-05, + "loss": 2.302, + "mean_token_accuracy": 0.47241379618644713, + "step": 111975 + }, + { + "epoch": 0.11278744532105342, + "grad_norm": 10.247770792589984, + "learning_rate": 4.9516743694610836e-05, + "loss": 2.3907, + "mean_token_accuracy": 0.4379310369491577, + "step": 111980 + }, + { + "epoch": 0.1127924813741576, + "grad_norm": 9.626352838635478, + "learning_rate": 4.9516666447075645e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.4206896543502808, + "step": 111985 + }, + { + "epoch": 0.11279751742726177, + "grad_norm": 11.510714745508748, + "learning_rate": 4.951658919343406e-05, + "loss": 2.5235, + "mean_token_accuracy": 0.4, + "step": 111990 + }, + { + "epoch": 0.11280255348036594, + "grad_norm": 10.708162001686013, + "learning_rate": 4.951651193368608e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.32758620381355286, + "step": 111995 + }, + { + "epoch": 0.11280758953347012, + "grad_norm": 8.929597227548431, + "learning_rate": 4.9516434667831735e-05, + "loss": 2.5534, + "mean_token_accuracy": 0.42196006774902345, + "step": 112000 + }, + { + "epoch": 0.11281262558657429, + "grad_norm": 16.823140738009453, + "learning_rate": 4.951635739587106e-05, + "loss": 2.7911, + "mean_token_accuracy": 0.3655172407627106, + "step": 112005 + }, + { + "epoch": 0.11281766163967846, + "grad_norm": 11.409889263961729, + "learning_rate": 4.951628011780406e-05, + "loss": 2.9153, + "mean_token_accuracy": 0.37586206793785093, + "step": 112010 + }, + { + "epoch": 0.11282269769278264, + "grad_norm": 18.697553057474412, + "learning_rate": 4.951620283363077e-05, + "loss": 3.0374, + "mean_token_accuracy": 0.38620689511299133, + "step": 112015 + }, + { + "epoch": 0.11282773374588681, + "grad_norm": 10.593040641906047, + "learning_rate": 4.95161255433512e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.4000000089406967, + "step": 112020 + }, + { + "epoch": 0.11283276979899097, + "grad_norm": 11.098326081548795, + "learning_rate": 4.9516048246965374e-05, + "loss": 2.1724, + "mean_token_accuracy": 0.43103447556495667, + "step": 112025 + }, + { + "epoch": 0.11283780585209514, + "grad_norm": 11.685250317188183, + "learning_rate": 4.9515970944473316e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.46896552443504336, + "step": 112030 + }, + { + "epoch": 0.11284284190519932, + "grad_norm": 9.707788891878984, + "learning_rate": 4.9515893635875046e-05, + "loss": 2.5649, + "mean_token_accuracy": 0.4103448301553726, + "step": 112035 + }, + { + "epoch": 0.11284787795830349, + "grad_norm": 13.091141906105934, + "learning_rate": 4.951581632117058e-05, + "loss": 2.6138, + "mean_token_accuracy": 0.37586206793785093, + "step": 112040 + }, + { + "epoch": 0.11285291401140767, + "grad_norm": 10.37130532949413, + "learning_rate": 4.951573900035995e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.4811857283115387, + "step": 112045 + }, + { + "epoch": 0.11285795006451184, + "grad_norm": 10.561388418740135, + "learning_rate": 4.951566167344318e-05, + "loss": 2.2696, + "mean_token_accuracy": 0.4620689570903778, + "step": 112050 + }, + { + "epoch": 0.11286298611761601, + "grad_norm": 8.702935097513361, + "learning_rate": 4.951558434042027e-05, + "loss": 2.0795, + "mean_token_accuracy": 0.482758617401123, + "step": 112055 + }, + { + "epoch": 0.11286802217072019, + "grad_norm": 10.060070189179497, + "learning_rate": 4.9515507001291274e-05, + "loss": 2.0212, + "mean_token_accuracy": 0.4620689690113068, + "step": 112060 + }, + { + "epoch": 0.11287305822382436, + "grad_norm": 10.007695369763864, + "learning_rate": 4.951542965605618e-05, + "loss": 2.9708, + "mean_token_accuracy": 0.39485783576965333, + "step": 112065 + }, + { + "epoch": 0.11287809427692853, + "grad_norm": 10.204914770179567, + "learning_rate": 4.9515352304715026e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4379310250282288, + "step": 112070 + }, + { + "epoch": 0.11288313033003271, + "grad_norm": 13.838557556687348, + "learning_rate": 4.951527494726784e-05, + "loss": 2.9558, + "mean_token_accuracy": 0.3655172437429428, + "step": 112075 + }, + { + "epoch": 0.11288816638313688, + "grad_norm": 10.54915127093984, + "learning_rate": 4.951519758371463e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.3965517282485962, + "step": 112080 + }, + { + "epoch": 0.11289320243624106, + "grad_norm": 12.08559040022823, + "learning_rate": 4.9515120214055415e-05, + "loss": 2.7843, + "mean_token_accuracy": 0.3620689660310745, + "step": 112085 + }, + { + "epoch": 0.11289823848934523, + "grad_norm": 11.987381198976127, + "learning_rate": 4.951504283829024e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.39310344457626345, + "step": 112090 + }, + { + "epoch": 0.11290327454244939, + "grad_norm": 11.083705451838046, + "learning_rate": 4.9514965456419096e-05, + "loss": 2.3758, + "mean_token_accuracy": 0.42413792610168455, + "step": 112095 + }, + { + "epoch": 0.11290831059555356, + "grad_norm": 10.123408378246427, + "learning_rate": 4.951488806844203e-05, + "loss": 2.371, + "mean_token_accuracy": 0.4569268047809601, + "step": 112100 + }, + { + "epoch": 0.11291334664865774, + "grad_norm": 11.114590718880807, + "learning_rate": 4.951481067435905e-05, + "loss": 2.7544, + "mean_token_accuracy": 0.42952207922935487, + "step": 112105 + }, + { + "epoch": 0.11291838270176191, + "grad_norm": 10.614712722050163, + "learning_rate": 4.951473327417017e-05, + "loss": 2.4094, + "mean_token_accuracy": 0.41724138259887694, + "step": 112110 + }, + { + "epoch": 0.11292341875486608, + "grad_norm": 9.853735122918833, + "learning_rate": 4.951465586787544e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.42413792610168455, + "step": 112115 + }, + { + "epoch": 0.11292845480797026, + "grad_norm": 9.738702403117605, + "learning_rate": 4.9514578455474846e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.3793103337287903, + "step": 112120 + }, + { + "epoch": 0.11293349086107443, + "grad_norm": 14.684717154250205, + "learning_rate": 4.951450103696844e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.4280788242816925, + "step": 112125 + }, + { + "epoch": 0.1129385269141786, + "grad_norm": 8.460870341075733, + "learning_rate": 4.951442361235622e-05, + "loss": 2.3291, + "mean_token_accuracy": 0.4620689690113068, + "step": 112130 + }, + { + "epoch": 0.11294356296728278, + "grad_norm": 9.934954491826542, + "learning_rate": 4.951434618163822e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.42413793206214906, + "step": 112135 + }, + { + "epoch": 0.11294859902038695, + "grad_norm": 9.598687975005795, + "learning_rate": 4.951426874481446e-05, + "loss": 2.1552, + "mean_token_accuracy": 0.4448275864124298, + "step": 112140 + }, + { + "epoch": 0.11295363507349113, + "grad_norm": 12.13814884438668, + "learning_rate": 4.9514191301884965e-05, + "loss": 2.8591, + "mean_token_accuracy": 0.3689655244350433, + "step": 112145 + }, + { + "epoch": 0.1129586711265953, + "grad_norm": 11.824384033807242, + "learning_rate": 4.951411385284974e-05, + "loss": 2.4284, + "mean_token_accuracy": 0.42413792610168455, + "step": 112150 + }, + { + "epoch": 0.11296370717969947, + "grad_norm": 11.985515489600779, + "learning_rate": 4.9514036397708836e-05, + "loss": 2.0137, + "mean_token_accuracy": 0.48965516686439514, + "step": 112155 + }, + { + "epoch": 0.11296874323280365, + "grad_norm": 10.026855947297554, + "learning_rate": 4.9513958936462244e-05, + "loss": 2.2569, + "mean_token_accuracy": 0.4689655125141144, + "step": 112160 + }, + { + "epoch": 0.11297377928590781, + "grad_norm": 9.869882921703228, + "learning_rate": 4.951388146911e-05, + "loss": 2.2764, + "mean_token_accuracy": 0.44827585816383364, + "step": 112165 + }, + { + "epoch": 0.11297881533901198, + "grad_norm": 10.335936383874522, + "learning_rate": 4.9513803995652123e-05, + "loss": 2.3098, + "mean_token_accuracy": 0.42413792610168455, + "step": 112170 + }, + { + "epoch": 0.11298385139211616, + "grad_norm": 8.558601014769206, + "learning_rate": 4.9513726516088634e-05, + "loss": 2.352, + "mean_token_accuracy": 0.42068966031074523, + "step": 112175 + }, + { + "epoch": 0.11298888744522033, + "grad_norm": 9.74611985971413, + "learning_rate": 4.951364903041957e-05, + "loss": 2.0931, + "mean_token_accuracy": 0.43103448748588563, + "step": 112180 + }, + { + "epoch": 0.1129939234983245, + "grad_norm": 11.781720208734127, + "learning_rate": 4.951357153864493e-05, + "loss": 2.7263, + "mean_token_accuracy": 0.43103447556495667, + "step": 112185 + }, + { + "epoch": 0.11299895955142868, + "grad_norm": 10.168224510456232, + "learning_rate": 4.951349404076474e-05, + "loss": 2.1716, + "mean_token_accuracy": 0.4655172288417816, + "step": 112190 + }, + { + "epoch": 0.11300399560453285, + "grad_norm": 9.383615654781536, + "learning_rate": 4.9513416536779026e-05, + "loss": 2.615, + "mean_token_accuracy": 0.3827586233615875, + "step": 112195 + }, + { + "epoch": 0.11300903165763702, + "grad_norm": 8.837247309828369, + "learning_rate": 4.9513339026687805e-05, + "loss": 2.1011, + "mean_token_accuracy": 0.44137930274009707, + "step": 112200 + }, + { + "epoch": 0.1130140677107412, + "grad_norm": 8.780647521165315, + "learning_rate": 4.951326151049111e-05, + "loss": 2.2176, + "mean_token_accuracy": 0.47586206793785096, + "step": 112205 + }, + { + "epoch": 0.11301910376384537, + "grad_norm": 9.210831455293675, + "learning_rate": 4.9513183988188954e-05, + "loss": 2.6068, + "mean_token_accuracy": 0.3896551728248596, + "step": 112210 + }, + { + "epoch": 0.11302413981694955, + "grad_norm": 11.415318432500177, + "learning_rate": 4.951310645978135e-05, + "loss": 2.23, + "mean_token_accuracy": 0.4931034505367279, + "step": 112215 + }, + { + "epoch": 0.11302917587005372, + "grad_norm": 13.522460028780852, + "learning_rate": 4.951302892526834e-05, + "loss": 2.1977, + "mean_token_accuracy": 0.49848759174346924, + "step": 112220 + }, + { + "epoch": 0.11303421192315789, + "grad_norm": 10.634091185403916, + "learning_rate": 4.951295138464993e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.3876588046550751, + "step": 112225 + }, + { + "epoch": 0.11303924797626207, + "grad_norm": 16.452895581389484, + "learning_rate": 4.951287383792614e-05, + "loss": 2.9035, + "mean_token_accuracy": 0.3482758581638336, + "step": 112230 + }, + { + "epoch": 0.11304428402936623, + "grad_norm": 9.840390173606698, + "learning_rate": 4.9512796285097006e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.42413793206214906, + "step": 112235 + }, + { + "epoch": 0.1130493200824704, + "grad_norm": 10.361210623654326, + "learning_rate": 4.951271872616254e-05, + "loss": 2.6183, + "mean_token_accuracy": 0.4034482717514038, + "step": 112240 + }, + { + "epoch": 0.11305435613557457, + "grad_norm": 10.172913874332453, + "learning_rate": 4.9512641161122766e-05, + "loss": 2.7148, + "mean_token_accuracy": 0.39655172228813174, + "step": 112245 + }, + { + "epoch": 0.11305939218867875, + "grad_norm": 11.726342984073534, + "learning_rate": 4.95125635899777e-05, + "loss": 2.1831, + "mean_token_accuracy": 0.44482759237289426, + "step": 112250 + }, + { + "epoch": 0.11306442824178292, + "grad_norm": 9.167725604210977, + "learning_rate": 4.951248601272737e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.4344827592372894, + "step": 112255 + }, + { + "epoch": 0.1130694642948871, + "grad_norm": 10.136103673162948, + "learning_rate": 4.951240842937179e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.4482758641242981, + "step": 112260 + }, + { + "epoch": 0.11307450034799127, + "grad_norm": 10.152051075292167, + "learning_rate": 4.951233083991099e-05, + "loss": 2.4725, + "mean_token_accuracy": 0.4, + "step": 112265 + }, + { + "epoch": 0.11307953640109544, + "grad_norm": 11.169857360158185, + "learning_rate": 4.9512253244344985e-05, + "loss": 2.2612, + "mean_token_accuracy": 0.4344827592372894, + "step": 112270 + }, + { + "epoch": 0.11308457245419962, + "grad_norm": 9.798676541615995, + "learning_rate": 4.9512175642673796e-05, + "loss": 2.3035, + "mean_token_accuracy": 0.42413792610168455, + "step": 112275 + }, + { + "epoch": 0.11308960850730379, + "grad_norm": 13.604119379206551, + "learning_rate": 4.951209803489746e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.47931033968925474, + "step": 112280 + }, + { + "epoch": 0.11309464456040796, + "grad_norm": 11.814696111357394, + "learning_rate": 4.9512020421015976e-05, + "loss": 3.0832, + "mean_token_accuracy": 0.37586206793785093, + "step": 112285 + }, + { + "epoch": 0.11309968061351214, + "grad_norm": 18.551617540978377, + "learning_rate": 4.951194280102938e-05, + "loss": 2.3041, + "mean_token_accuracy": 0.4068965494632721, + "step": 112290 + }, + { + "epoch": 0.11310471666661631, + "grad_norm": 10.32830745736162, + "learning_rate": 4.951186517493769e-05, + "loss": 2.6332, + "mean_token_accuracy": 0.4034482717514038, + "step": 112295 + }, + { + "epoch": 0.11310975271972049, + "grad_norm": 10.25450270484326, + "learning_rate": 4.951178754274093e-05, + "loss": 2.3259, + "mean_token_accuracy": 0.43448275327682495, + "step": 112300 + }, + { + "epoch": 0.11311478877282465, + "grad_norm": 12.868099106526067, + "learning_rate": 4.951170990443911e-05, + "loss": 2.407, + "mean_token_accuracy": 0.4068965494632721, + "step": 112305 + }, + { + "epoch": 0.11311982482592882, + "grad_norm": 12.423213574924635, + "learning_rate": 4.951163226003227e-05, + "loss": 2.7118, + "mean_token_accuracy": 0.3999999940395355, + "step": 112310 + }, + { + "epoch": 0.11312486087903299, + "grad_norm": 12.528918792941527, + "learning_rate": 4.951155460952042e-05, + "loss": 2.9389, + "mean_token_accuracy": 0.39310344457626345, + "step": 112315 + }, + { + "epoch": 0.11312989693213717, + "grad_norm": 9.661359959574826, + "learning_rate": 4.951147695290357e-05, + "loss": 2.3118, + "mean_token_accuracy": 0.4517241358757019, + "step": 112320 + }, + { + "epoch": 0.11313493298524134, + "grad_norm": 11.26934291117187, + "learning_rate": 4.951139929018177e-05, + "loss": 2.162, + "mean_token_accuracy": 0.4620689630508423, + "step": 112325 + }, + { + "epoch": 0.11313996903834551, + "grad_norm": 14.58188098207035, + "learning_rate": 4.951132162135502e-05, + "loss": 1.9054, + "mean_token_accuracy": 0.5108287930488586, + "step": 112330 + }, + { + "epoch": 0.11314500509144969, + "grad_norm": 10.933531568971459, + "learning_rate": 4.9511243946423345e-05, + "loss": 2.2488, + "mean_token_accuracy": 0.42758620381355283, + "step": 112335 + }, + { + "epoch": 0.11315004114455386, + "grad_norm": 10.33322509381467, + "learning_rate": 4.9511166265386774e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.4221415638923645, + "step": 112340 + }, + { + "epoch": 0.11315507719765804, + "grad_norm": 12.94608655515518, + "learning_rate": 4.9511088578245325e-05, + "loss": 2.5327, + "mean_token_accuracy": 0.3999999940395355, + "step": 112345 + }, + { + "epoch": 0.11316011325076221, + "grad_norm": 11.593199381456072, + "learning_rate": 4.951101088499902e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.37586206793785093, + "step": 112350 + }, + { + "epoch": 0.11316514930386638, + "grad_norm": 14.356117380316872, + "learning_rate": 4.9510933185647874e-05, + "loss": 2.4616, + "mean_token_accuracy": 0.4, + "step": 112355 + }, + { + "epoch": 0.11317018535697056, + "grad_norm": 11.117393676043172, + "learning_rate": 4.951085548019191e-05, + "loss": 2.6159, + "mean_token_accuracy": 0.4206896543502808, + "step": 112360 + }, + { + "epoch": 0.11317522141007473, + "grad_norm": 10.166844273646618, + "learning_rate": 4.951077776863116e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.44827585816383364, + "step": 112365 + }, + { + "epoch": 0.1131802574631789, + "grad_norm": 12.211920564853251, + "learning_rate": 4.951070005096564e-05, + "loss": 2.351, + "mean_token_accuracy": 0.45517241954803467, + "step": 112370 + }, + { + "epoch": 0.11318529351628306, + "grad_norm": 11.287092608852857, + "learning_rate": 4.951062232719536e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.4344827592372894, + "step": 112375 + }, + { + "epoch": 0.11319032956938724, + "grad_norm": 10.886003405238254, + "learning_rate": 4.951054459732037e-05, + "loss": 2.4819, + "mean_token_accuracy": 0.41724138259887694, + "step": 112380 + }, + { + "epoch": 0.11319536562249141, + "grad_norm": 10.711937260092208, + "learning_rate": 4.951046686134066e-05, + "loss": 2.5082, + "mean_token_accuracy": 0.4068965494632721, + "step": 112385 + }, + { + "epoch": 0.11320040167559559, + "grad_norm": 14.781412246581398, + "learning_rate": 4.9510389119256265e-05, + "loss": 2.8194, + "mean_token_accuracy": 0.3793103456497192, + "step": 112390 + }, + { + "epoch": 0.11320543772869976, + "grad_norm": 11.100164450716207, + "learning_rate": 4.951031137106721e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.45862067937850953, + "step": 112395 + }, + { + "epoch": 0.11321047378180393, + "grad_norm": 11.393728140951234, + "learning_rate": 4.9510233616773515e-05, + "loss": 2.2806, + "mean_token_accuracy": 0.39655172228813174, + "step": 112400 + }, + { + "epoch": 0.1132155098349081, + "grad_norm": 14.274189415511296, + "learning_rate": 4.9510155856375195e-05, + "loss": 2.6659, + "mean_token_accuracy": 0.42413793206214906, + "step": 112405 + }, + { + "epoch": 0.11322054588801228, + "grad_norm": 10.758011177961524, + "learning_rate": 4.951007808987228e-05, + "loss": 2.3328, + "mean_token_accuracy": 0.4, + "step": 112410 + }, + { + "epoch": 0.11322558194111645, + "grad_norm": 8.837145034889286, + "learning_rate": 4.951000031726478e-05, + "loss": 2.0194, + "mean_token_accuracy": 0.4965517222881317, + "step": 112415 + }, + { + "epoch": 0.11323061799422063, + "grad_norm": 18.80916709207696, + "learning_rate": 4.9509922538552735e-05, + "loss": 2.396, + "mean_token_accuracy": 0.4034482777118683, + "step": 112420 + }, + { + "epoch": 0.1132356540473248, + "grad_norm": 10.216712361886339, + "learning_rate": 4.950984475373614e-05, + "loss": 2.7103, + "mean_token_accuracy": 0.36551724672317504, + "step": 112425 + }, + { + "epoch": 0.11324069010042898, + "grad_norm": 12.161941207908773, + "learning_rate": 4.950976696281505e-05, + "loss": 2.6317, + "mean_token_accuracy": 0.4, + "step": 112430 + }, + { + "epoch": 0.11324572615353315, + "grad_norm": 10.63732096438779, + "learning_rate": 4.950968916578946e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.41379310488700866, + "step": 112435 + }, + { + "epoch": 0.11325076220663732, + "grad_norm": 7.68988123791223, + "learning_rate": 4.950961136265941e-05, + "loss": 1.9188, + "mean_token_accuracy": 0.4468844473361969, + "step": 112440 + }, + { + "epoch": 0.11325579825974148, + "grad_norm": 10.239054108598628, + "learning_rate": 4.95095335534249e-05, + "loss": 2.2205, + "mean_token_accuracy": 0.43986691236495973, + "step": 112445 + }, + { + "epoch": 0.11326083431284566, + "grad_norm": 10.600008138134777, + "learning_rate": 4.950945573808597e-05, + "loss": 2.4271, + "mean_token_accuracy": 0.46551724076271056, + "step": 112450 + }, + { + "epoch": 0.11326587036594983, + "grad_norm": 13.560729837005418, + "learning_rate": 4.950937791664264e-05, + "loss": 2.6895, + "mean_token_accuracy": 0.3793103456497192, + "step": 112455 + }, + { + "epoch": 0.113270906419054, + "grad_norm": 10.76779913253055, + "learning_rate": 4.950930008909492e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.42413792610168455, + "step": 112460 + }, + { + "epoch": 0.11327594247215818, + "grad_norm": 11.29523549352523, + "learning_rate": 4.950922225544283e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.46896551847457885, + "step": 112465 + }, + { + "epoch": 0.11328097852526235, + "grad_norm": 10.27067347910389, + "learning_rate": 4.950914441568642e-05, + "loss": 2.5696, + "mean_token_accuracy": 0.41379310488700866, + "step": 112470 + }, + { + "epoch": 0.11328601457836653, + "grad_norm": 13.064380407176515, + "learning_rate": 4.950906656982567e-05, + "loss": 3.3672, + "mean_token_accuracy": 0.31034482419490816, + "step": 112475 + }, + { + "epoch": 0.1132910506314707, + "grad_norm": 10.430120548001469, + "learning_rate": 4.9508988717860637e-05, + "loss": 2.5065, + "mean_token_accuracy": 0.4413793087005615, + "step": 112480 + }, + { + "epoch": 0.11329608668457487, + "grad_norm": 8.271612612404052, + "learning_rate": 4.950891085979133e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.45051422119140627, + "step": 112485 + }, + { + "epoch": 0.11330112273767905, + "grad_norm": 11.642432113317138, + "learning_rate": 4.950883299561776e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.4398669093847275, + "step": 112490 + }, + { + "epoch": 0.11330615879078322, + "grad_norm": 10.632275282403485, + "learning_rate": 4.9508755125339964e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.3793103337287903, + "step": 112495 + }, + { + "epoch": 0.1133111948438874, + "grad_norm": 10.387993284894442, + "learning_rate": 4.9508677248957965e-05, + "loss": 2.1457, + "mean_token_accuracy": 0.4949788272380829, + "step": 112500 + }, + { + "epoch": 0.11331623089699157, + "grad_norm": 12.583606904666755, + "learning_rate": 4.950859936647177e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.3793103456497192, + "step": 112505 + }, + { + "epoch": 0.11332126695009574, + "grad_norm": 8.9784507628001, + "learning_rate": 4.9508521477881406e-05, + "loss": 2.3244, + "mean_token_accuracy": 0.42413793206214906, + "step": 112510 + }, + { + "epoch": 0.1133263030031999, + "grad_norm": 14.054294401755884, + "learning_rate": 4.95084435831869e-05, + "loss": 2.3109, + "mean_token_accuracy": 0.4620689690113068, + "step": 112515 + }, + { + "epoch": 0.11333133905630408, + "grad_norm": 10.86300268767509, + "learning_rate": 4.950836568238827e-05, + "loss": 2.4358, + "mean_token_accuracy": 0.47241378426551817, + "step": 112520 + }, + { + "epoch": 0.11333637510940825, + "grad_norm": 10.021149861681003, + "learning_rate": 4.950828777548553e-05, + "loss": 2.4272, + "mean_token_accuracy": 0.46061705946922304, + "step": 112525 + }, + { + "epoch": 0.11334141116251242, + "grad_norm": 10.860990583187526, + "learning_rate": 4.9508209862478714e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.44137930274009707, + "step": 112530 + }, + { + "epoch": 0.1133464472156166, + "grad_norm": 27.751713056182442, + "learning_rate": 4.950813194336784e-05, + "loss": 2.8249, + "mean_token_accuracy": 0.4257713258266449, + "step": 112535 + }, + { + "epoch": 0.11335148326872077, + "grad_norm": 9.798477220317345, + "learning_rate": 4.9508054018152935e-05, + "loss": 2.371, + "mean_token_accuracy": 0.4482758641242981, + "step": 112540 + }, + { + "epoch": 0.11335651932182494, + "grad_norm": 12.974638441815156, + "learning_rate": 4.9507976086834005e-05, + "loss": 2.2347, + "mean_token_accuracy": 0.46896551847457885, + "step": 112545 + }, + { + "epoch": 0.11336155537492912, + "grad_norm": 8.890279946733163, + "learning_rate": 4.9507898149411084e-05, + "loss": 2.1002, + "mean_token_accuracy": 0.4344827651977539, + "step": 112550 + }, + { + "epoch": 0.11336659142803329, + "grad_norm": 12.400856437531234, + "learning_rate": 4.9507820205884194e-05, + "loss": 2.6807, + "mean_token_accuracy": 0.4344827473163605, + "step": 112555 + }, + { + "epoch": 0.11337162748113747, + "grad_norm": 10.51500033595627, + "learning_rate": 4.950774225625334e-05, + "loss": 2.2955, + "mean_token_accuracy": 0.4430127084255219, + "step": 112560 + }, + { + "epoch": 0.11337666353424164, + "grad_norm": 12.908551875679096, + "learning_rate": 4.950766430051857e-05, + "loss": 2.61, + "mean_token_accuracy": 0.3758620649576187, + "step": 112565 + }, + { + "epoch": 0.11338169958734581, + "grad_norm": 10.114325260277047, + "learning_rate": 4.950758633867989e-05, + "loss": 2.3728, + "mean_token_accuracy": 0.41034482717514037, + "step": 112570 + }, + { + "epoch": 0.11338673564044999, + "grad_norm": 12.65785854263634, + "learning_rate": 4.950750837073732e-05, + "loss": 2.5889, + "mean_token_accuracy": 0.4052026689052582, + "step": 112575 + }, + { + "epoch": 0.11339177169355416, + "grad_norm": 9.5696482370971, + "learning_rate": 4.950743039669089e-05, + "loss": 2.4666, + "mean_token_accuracy": 0.37743496894836426, + "step": 112580 + }, + { + "epoch": 0.11339680774665832, + "grad_norm": 12.110643775014921, + "learning_rate": 4.950735241654061e-05, + "loss": 2.592, + "mean_token_accuracy": 0.41034482717514037, + "step": 112585 + }, + { + "epoch": 0.1134018437997625, + "grad_norm": 13.612028782413036, + "learning_rate": 4.950727443028651e-05, + "loss": 2.5999, + "mean_token_accuracy": 0.39655172228813174, + "step": 112590 + }, + { + "epoch": 0.11340687985286667, + "grad_norm": 10.21409142588348, + "learning_rate": 4.950719643792862e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.4206896543502808, + "step": 112595 + }, + { + "epoch": 0.11341191590597084, + "grad_norm": 10.318700033312874, + "learning_rate": 4.950711843946694e-05, + "loss": 2.432, + "mean_token_accuracy": 0.4482758641242981, + "step": 112600 + }, + { + "epoch": 0.11341695195907502, + "grad_norm": 9.78717945823098, + "learning_rate": 4.9507040434901506e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.41379311084747317, + "step": 112605 + }, + { + "epoch": 0.11342198801217919, + "grad_norm": 14.162182263606587, + "learning_rate": 4.950696242423235e-05, + "loss": 2.319, + "mean_token_accuracy": 0.41034482717514037, + "step": 112610 + }, + { + "epoch": 0.11342702406528336, + "grad_norm": 13.33732163138306, + "learning_rate": 4.950688440745946e-05, + "loss": 2.7267, + "mean_token_accuracy": 0.3827586233615875, + "step": 112615 + }, + { + "epoch": 0.11343206011838754, + "grad_norm": 13.831559515206177, + "learning_rate": 4.950680638458289e-05, + "loss": 2.0875, + "mean_token_accuracy": 0.458620685338974, + "step": 112620 + }, + { + "epoch": 0.11343709617149171, + "grad_norm": 10.164092618482316, + "learning_rate": 4.950672835560265e-05, + "loss": 2.194, + "mean_token_accuracy": 0.4620689630508423, + "step": 112625 + }, + { + "epoch": 0.11344213222459588, + "grad_norm": 9.485509818800969, + "learning_rate": 4.950665032051876e-05, + "loss": 2.1772, + "mean_token_accuracy": 0.4931034445762634, + "step": 112630 + }, + { + "epoch": 0.11344716827770006, + "grad_norm": 9.462468287577686, + "learning_rate": 4.950657227933125e-05, + "loss": 2.2047, + "mean_token_accuracy": 0.46394434571266174, + "step": 112635 + }, + { + "epoch": 0.11345220433080423, + "grad_norm": 9.82952668679054, + "learning_rate": 4.950649423204013e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.41034482717514037, + "step": 112640 + }, + { + "epoch": 0.1134572403839084, + "grad_norm": 15.486847673804846, + "learning_rate": 4.950641617864542e-05, + "loss": 2.6007, + "mean_token_accuracy": 0.43103448748588563, + "step": 112645 + }, + { + "epoch": 0.11346227643701258, + "grad_norm": 10.980521361238507, + "learning_rate": 4.950633811914715e-05, + "loss": 2.5132, + "mean_token_accuracy": 0.4758620738983154, + "step": 112650 + }, + { + "epoch": 0.11346731249011674, + "grad_norm": 13.808154705068338, + "learning_rate": 4.950626005354535e-05, + "loss": 2.5863, + "mean_token_accuracy": 0.40508167147636415, + "step": 112655 + }, + { + "epoch": 0.11347234854322091, + "grad_norm": 9.834409151075814, + "learning_rate": 4.950618198184003e-05, + "loss": 2.1222, + "mean_token_accuracy": 0.4620689630508423, + "step": 112660 + }, + { + "epoch": 0.11347738459632509, + "grad_norm": 9.129435969139413, + "learning_rate": 4.950610390403121e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.4275862157344818, + "step": 112665 + }, + { + "epoch": 0.11348242064942926, + "grad_norm": 10.802864718112625, + "learning_rate": 4.9506025820118914e-05, + "loss": 2.2116, + "mean_token_accuracy": 0.4482758641242981, + "step": 112670 + }, + { + "epoch": 0.11348745670253343, + "grad_norm": 8.534373747527995, + "learning_rate": 4.9505947730103164e-05, + "loss": 2.543, + "mean_token_accuracy": 0.4034482717514038, + "step": 112675 + }, + { + "epoch": 0.11349249275563761, + "grad_norm": 12.626695417522242, + "learning_rate": 4.950586963398398e-05, + "loss": 2.419, + "mean_token_accuracy": 0.42068966031074523, + "step": 112680 + }, + { + "epoch": 0.11349752880874178, + "grad_norm": 8.772732762734016, + "learning_rate": 4.9505791531761394e-05, + "loss": 2.1669, + "mean_token_accuracy": 0.4137930989265442, + "step": 112685 + }, + { + "epoch": 0.11350256486184596, + "grad_norm": 10.631695648456004, + "learning_rate": 4.9505713423435414e-05, + "loss": 2.5723, + "mean_token_accuracy": 0.4, + "step": 112690 + }, + { + "epoch": 0.11350760091495013, + "grad_norm": 13.291338143216759, + "learning_rate": 4.9505635309006074e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.45722927451133727, + "step": 112695 + }, + { + "epoch": 0.1135126369680543, + "grad_norm": 12.065328072716197, + "learning_rate": 4.950555718847338e-05, + "loss": 2.5799, + "mean_token_accuracy": 0.38275861740112305, + "step": 112700 + }, + { + "epoch": 0.11351767302115848, + "grad_norm": 11.007597806646965, + "learning_rate": 4.950547906183737e-05, + "loss": 2.453, + "mean_token_accuracy": 0.39655172228813174, + "step": 112705 + }, + { + "epoch": 0.11352270907426265, + "grad_norm": 9.330024105780648, + "learning_rate": 4.950540092909805e-05, + "loss": 2.4688, + "mean_token_accuracy": 0.4517241418361664, + "step": 112710 + }, + { + "epoch": 0.11352774512736682, + "grad_norm": 13.155215818721846, + "learning_rate": 4.950532279025546e-05, + "loss": 3.4271, + "mean_token_accuracy": 0.3862069010734558, + "step": 112715 + }, + { + "epoch": 0.113532781180471, + "grad_norm": 9.951792348916053, + "learning_rate": 4.950524464530959e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.43968542814254763, + "step": 112720 + }, + { + "epoch": 0.11353781723357516, + "grad_norm": 11.179116986339334, + "learning_rate": 4.9505166494260505e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.3896551728248596, + "step": 112725 + }, + { + "epoch": 0.11354285328667933, + "grad_norm": 9.822797362745447, + "learning_rate": 4.9505088337108205e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4482758641242981, + "step": 112730 + }, + { + "epoch": 0.1135478893397835, + "grad_norm": 11.091482619629184, + "learning_rate": 4.95050101738527e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.4310344934463501, + "step": 112735 + }, + { + "epoch": 0.11355292539288768, + "grad_norm": 9.990303092562069, + "learning_rate": 4.9504932004494033e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4137930989265442, + "step": 112740 + }, + { + "epoch": 0.11355796144599185, + "grad_norm": 9.791429906243796, + "learning_rate": 4.950485382903221e-05, + "loss": 2.4259, + "mean_token_accuracy": 0.4296430766582489, + "step": 112745 + }, + { + "epoch": 0.11356299749909603, + "grad_norm": 10.26657727767445, + "learning_rate": 4.950477564746727e-05, + "loss": 2.4806, + "mean_token_accuracy": 0.4517241358757019, + "step": 112750 + }, + { + "epoch": 0.1135680335522002, + "grad_norm": 9.503826251956095, + "learning_rate": 4.9504697459799203e-05, + "loss": 2.3452, + "mean_token_accuracy": 0.4206896543502808, + "step": 112755 + }, + { + "epoch": 0.11357306960530437, + "grad_norm": 8.27102891811764, + "learning_rate": 4.950461926602807e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.44827585816383364, + "step": 112760 + }, + { + "epoch": 0.11357810565840855, + "grad_norm": 12.78409129981293, + "learning_rate": 4.950454106615386e-05, + "loss": 2.6662, + "mean_token_accuracy": 0.35172413289546967, + "step": 112765 + }, + { + "epoch": 0.11358314171151272, + "grad_norm": 10.086959067495929, + "learning_rate": 4.950446286017662e-05, + "loss": 2.6014, + "mean_token_accuracy": 0.4, + "step": 112770 + }, + { + "epoch": 0.1135881777646169, + "grad_norm": 13.320189148045081, + "learning_rate": 4.950438464809635e-05, + "loss": 2.4212, + "mean_token_accuracy": 0.44482758045196535, + "step": 112775 + }, + { + "epoch": 0.11359321381772107, + "grad_norm": 12.34315559533147, + "learning_rate": 4.9504306429913086e-05, + "loss": 2.36, + "mean_token_accuracy": 0.45862067937850953, + "step": 112780 + }, + { + "epoch": 0.11359824987082524, + "grad_norm": 15.218955362267408, + "learning_rate": 4.9504228205626846e-05, + "loss": 2.846, + "mean_token_accuracy": 0.34482758641242983, + "step": 112785 + }, + { + "epoch": 0.11360328592392942, + "grad_norm": 11.538281053886553, + "learning_rate": 4.950414997523765e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.3896551728248596, + "step": 112790 + }, + { + "epoch": 0.11360832197703358, + "grad_norm": 34.391844066076786, + "learning_rate": 4.9504071738745524e-05, + "loss": 2.8062, + "mean_token_accuracy": 0.37586206793785093, + "step": 112795 + }, + { + "epoch": 0.11361335803013775, + "grad_norm": 9.412878639740066, + "learning_rate": 4.950399349615049e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4068965494632721, + "step": 112800 + }, + { + "epoch": 0.11361839408324192, + "grad_norm": 11.358788539751473, + "learning_rate": 4.950391524745256e-05, + "loss": 2.349, + "mean_token_accuracy": 0.4152450144290924, + "step": 112805 + }, + { + "epoch": 0.1136234301363461, + "grad_norm": 10.196096069373818, + "learning_rate": 4.950383699265176e-05, + "loss": 2.1797, + "mean_token_accuracy": 0.4, + "step": 112810 + }, + { + "epoch": 0.11362846618945027, + "grad_norm": 11.84958197422626, + "learning_rate": 4.9503758731748114e-05, + "loss": 2.5803, + "mean_token_accuracy": 0.4068965494632721, + "step": 112815 + }, + { + "epoch": 0.11363350224255445, + "grad_norm": 9.918889629749781, + "learning_rate": 4.950368046474165e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.38965516686439516, + "step": 112820 + }, + { + "epoch": 0.11363853829565862, + "grad_norm": 10.59079363378361, + "learning_rate": 4.950360219163238e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.47931034564971925, + "step": 112825 + }, + { + "epoch": 0.1136435743487628, + "grad_norm": 11.738348529134498, + "learning_rate": 4.950352391242033e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.44137931168079375, + "step": 112830 + }, + { + "epoch": 0.11364861040186697, + "grad_norm": 11.721461380802934, + "learning_rate": 4.950344562710552e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.4344827592372894, + "step": 112835 + }, + { + "epoch": 0.11365364645497114, + "grad_norm": 9.079775859432178, + "learning_rate": 4.9503367335687974e-05, + "loss": 2.3564, + "mean_token_accuracy": 0.44827585816383364, + "step": 112840 + }, + { + "epoch": 0.11365868250807531, + "grad_norm": 10.516943776892882, + "learning_rate": 4.9503289038167705e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.3999999940395355, + "step": 112845 + }, + { + "epoch": 0.11366371856117949, + "grad_norm": 10.669957973110739, + "learning_rate": 4.950321073454475e-05, + "loss": 2.2477, + "mean_token_accuracy": 0.4655172288417816, + "step": 112850 + }, + { + "epoch": 0.11366875461428366, + "grad_norm": 11.41794169458884, + "learning_rate": 4.950313242481911e-05, + "loss": 2.6348, + "mean_token_accuracy": 0.358620685338974, + "step": 112855 + }, + { + "epoch": 0.11367379066738784, + "grad_norm": 10.12704674619292, + "learning_rate": 4.950305410899083e-05, + "loss": 2.6235, + "mean_token_accuracy": 0.4, + "step": 112860 + }, + { + "epoch": 0.113678826720492, + "grad_norm": 10.572714846039661, + "learning_rate": 4.950297578705992e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.37241379022598264, + "step": 112865 + }, + { + "epoch": 0.11368386277359617, + "grad_norm": 12.658584169158898, + "learning_rate": 4.95028974590264e-05, + "loss": 2.6985, + "mean_token_accuracy": 0.3827586233615875, + "step": 112870 + }, + { + "epoch": 0.11368889882670034, + "grad_norm": 15.314121025482617, + "learning_rate": 4.9502819124890294e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.4206896543502808, + "step": 112875 + }, + { + "epoch": 0.11369393487980452, + "grad_norm": 10.867150235515789, + "learning_rate": 4.950274078465163e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.4206896543502808, + "step": 112880 + }, + { + "epoch": 0.11369897093290869, + "grad_norm": 9.092189141555469, + "learning_rate": 4.950266243831042e-05, + "loss": 2.3646, + "mean_token_accuracy": 0.4206896543502808, + "step": 112885 + }, + { + "epoch": 0.11370400698601286, + "grad_norm": 9.462584613121015, + "learning_rate": 4.950258408586668e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.42068964838981626, + "step": 112890 + }, + { + "epoch": 0.11370904303911704, + "grad_norm": 11.490035366236722, + "learning_rate": 4.9502505727320456e-05, + "loss": 2.2452, + "mean_token_accuracy": 0.44482758045196535, + "step": 112895 + }, + { + "epoch": 0.11371407909222121, + "grad_norm": 12.448376649907267, + "learning_rate": 4.950242736267175e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.3965517282485962, + "step": 112900 + }, + { + "epoch": 0.11371911514532539, + "grad_norm": 11.593818648544124, + "learning_rate": 4.950234899192058e-05, + "loss": 2.1065, + "mean_token_accuracy": 0.46896551847457885, + "step": 112905 + }, + { + "epoch": 0.11372415119842956, + "grad_norm": 15.143760686034854, + "learning_rate": 4.950227061506699e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4620689630508423, + "step": 112910 + }, + { + "epoch": 0.11372918725153373, + "grad_norm": 9.673757838364011, + "learning_rate": 4.950219223211098e-05, + "loss": 2.1377, + "mean_token_accuracy": 0.5028044164180756, + "step": 112915 + }, + { + "epoch": 0.11373422330463791, + "grad_norm": 11.27581860778706, + "learning_rate": 4.950211384305258e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.46551724672317507, + "step": 112920 + }, + { + "epoch": 0.11373925935774208, + "grad_norm": 10.413519342874757, + "learning_rate": 4.950203544789182e-05, + "loss": 2.2228, + "mean_token_accuracy": 0.4206896543502808, + "step": 112925 + }, + { + "epoch": 0.11374429541084626, + "grad_norm": 11.104010138660158, + "learning_rate": 4.95019570466287e-05, + "loss": 2.3453, + "mean_token_accuracy": 0.43793103098869324, + "step": 112930 + }, + { + "epoch": 0.11374933146395041, + "grad_norm": 9.292504800398415, + "learning_rate": 4.950187863926327e-05, + "loss": 2.1692, + "mean_token_accuracy": 0.45517241954803467, + "step": 112935 + }, + { + "epoch": 0.11375436751705459, + "grad_norm": 9.35564041040178, + "learning_rate": 4.950180022579553e-05, + "loss": 2.3855, + "mean_token_accuracy": 0.4465819776058197, + "step": 112940 + }, + { + "epoch": 0.11375940357015876, + "grad_norm": 11.528256574261754, + "learning_rate": 4.95017218062255e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.41034482717514037, + "step": 112945 + }, + { + "epoch": 0.11376443962326294, + "grad_norm": 11.464581648542369, + "learning_rate": 4.950164338055322e-05, + "loss": 2.5332, + "mean_token_accuracy": 0.39310344457626345, + "step": 112950 + }, + { + "epoch": 0.11376947567636711, + "grad_norm": 10.898727030019352, + "learning_rate": 4.95015649487787e-05, + "loss": 2.3679, + "mean_token_accuracy": 0.45329703092575074, + "step": 112955 + }, + { + "epoch": 0.11377451172947128, + "grad_norm": 10.450689479248306, + "learning_rate": 4.9501486510901965e-05, + "loss": 2.7554, + "mean_token_accuracy": 0.35172412991523744, + "step": 112960 + }, + { + "epoch": 0.11377954778257546, + "grad_norm": 10.208772550248536, + "learning_rate": 4.950140806692303e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.42758620381355283, + "step": 112965 + }, + { + "epoch": 0.11378458383567963, + "grad_norm": 10.977935427287786, + "learning_rate": 4.950132961684193e-05, + "loss": 2.3115, + "mean_token_accuracy": 0.4655172348022461, + "step": 112970 + }, + { + "epoch": 0.1137896198887838, + "grad_norm": 11.462108564871988, + "learning_rate": 4.9501251160658676e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.44482759237289426, + "step": 112975 + }, + { + "epoch": 0.11379465594188798, + "grad_norm": 8.66631317198689, + "learning_rate": 4.9501172698373295e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.4172413766384125, + "step": 112980 + }, + { + "epoch": 0.11379969199499215, + "grad_norm": 10.63965215620542, + "learning_rate": 4.9501094229985806e-05, + "loss": 2.2019, + "mean_token_accuracy": 0.4620689690113068, + "step": 112985 + }, + { + "epoch": 0.11380472804809633, + "grad_norm": 10.667001787606706, + "learning_rate": 4.9501015755496225e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4206896543502808, + "step": 112990 + }, + { + "epoch": 0.1138097641012005, + "grad_norm": 10.985345749358471, + "learning_rate": 4.9500937274904584e-05, + "loss": 2.2199, + "mean_token_accuracy": 0.441379314661026, + "step": 112995 + }, + { + "epoch": 0.11381480015430467, + "grad_norm": 12.966664106527714, + "learning_rate": 4.9500858788210904e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4310344815254211, + "step": 113000 + }, + { + "epoch": 0.11381983620740883, + "grad_norm": 11.872130864182086, + "learning_rate": 4.9500780295415206e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.4517241358757019, + "step": 113005 + }, + { + "epoch": 0.11382487226051301, + "grad_norm": 14.256529578119999, + "learning_rate": 4.95007017965175e-05, + "loss": 2.6981, + "mean_token_accuracy": 0.43103447556495667, + "step": 113010 + }, + { + "epoch": 0.11382990831361718, + "grad_norm": 12.122172025402355, + "learning_rate": 4.950062329151783e-05, + "loss": 2.189, + "mean_token_accuracy": 0.4847549915313721, + "step": 113015 + }, + { + "epoch": 0.11383494436672136, + "grad_norm": 11.811552295943956, + "learning_rate": 4.950054478041619e-05, + "loss": 2.3214, + "mean_token_accuracy": 0.43103447556495667, + "step": 113020 + }, + { + "epoch": 0.11383998041982553, + "grad_norm": 14.985012060033208, + "learning_rate": 4.9500466263212634e-05, + "loss": 2.6502, + "mean_token_accuracy": 0.41034482717514037, + "step": 113025 + }, + { + "epoch": 0.1138450164729297, + "grad_norm": 8.848924416683507, + "learning_rate": 4.9500387739907155e-05, + "loss": 2.1618, + "mean_token_accuracy": 0.46079854369163514, + "step": 113030 + }, + { + "epoch": 0.11385005252603388, + "grad_norm": 13.43749839772969, + "learning_rate": 4.950030921049979e-05, + "loss": 2.63, + "mean_token_accuracy": 0.43103448748588563, + "step": 113035 + }, + { + "epoch": 0.11385508857913805, + "grad_norm": 10.644979632040803, + "learning_rate": 4.9500230674990553e-05, + "loss": 2.6792, + "mean_token_accuracy": 0.3965517163276672, + "step": 113040 + }, + { + "epoch": 0.11386012463224222, + "grad_norm": 8.77887395422998, + "learning_rate": 4.950015213337947e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.3896551728248596, + "step": 113045 + }, + { + "epoch": 0.1138651606853464, + "grad_norm": 9.594743290726822, + "learning_rate": 4.950007358566657e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.4724137902259827, + "step": 113050 + }, + { + "epoch": 0.11387019673845057, + "grad_norm": 9.311390483127402, + "learning_rate": 4.949999503185186e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4172413766384125, + "step": 113055 + }, + { + "epoch": 0.11387523279155475, + "grad_norm": 12.503288392959382, + "learning_rate": 4.949991647193538e-05, + "loss": 2.8729, + "mean_token_accuracy": 0.41724138259887694, + "step": 113060 + }, + { + "epoch": 0.11388026884465892, + "grad_norm": 9.931197784672545, + "learning_rate": 4.949983790591713e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4517241358757019, + "step": 113065 + }, + { + "epoch": 0.11388530489776309, + "grad_norm": 10.788719159143596, + "learning_rate": 4.949975933379715e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.3896551787853241, + "step": 113070 + }, + { + "epoch": 0.11389034095086725, + "grad_norm": 11.045821825500827, + "learning_rate": 4.949968075557545e-05, + "loss": 2.5056, + "mean_token_accuracy": 0.3896551787853241, + "step": 113075 + }, + { + "epoch": 0.11389537700397143, + "grad_norm": 10.842479058349177, + "learning_rate": 4.949960217125205e-05, + "loss": 2.3472, + "mean_token_accuracy": 0.42413793206214906, + "step": 113080 + }, + { + "epoch": 0.1139004130570756, + "grad_norm": 11.785158897818905, + "learning_rate": 4.949952358082699e-05, + "loss": 2.7876, + "mean_token_accuracy": 0.42068964838981626, + "step": 113085 + }, + { + "epoch": 0.11390544911017977, + "grad_norm": 8.418180242209662, + "learning_rate": 4.949944498430028e-05, + "loss": 2.041, + "mean_token_accuracy": 0.5021173477172851, + "step": 113090 + }, + { + "epoch": 0.11391048516328395, + "grad_norm": 7.59588563388341, + "learning_rate": 4.949936638167194e-05, + "loss": 2.0517, + "mean_token_accuracy": 0.46551724672317507, + "step": 113095 + }, + { + "epoch": 0.11391552121638812, + "grad_norm": 10.082629439074063, + "learning_rate": 4.949928777294198e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.44482759237289426, + "step": 113100 + }, + { + "epoch": 0.1139205572694923, + "grad_norm": 9.56419024566006, + "learning_rate": 4.949920915811045e-05, + "loss": 2.4172, + "mean_token_accuracy": 0.4172413766384125, + "step": 113105 + }, + { + "epoch": 0.11392559332259647, + "grad_norm": 11.583427611281936, + "learning_rate": 4.9499130537177355e-05, + "loss": 2.7297, + "mean_token_accuracy": 0.3918935328722, + "step": 113110 + }, + { + "epoch": 0.11393062937570064, + "grad_norm": 10.667556518165586, + "learning_rate": 4.9499051910142724e-05, + "loss": 2.5091, + "mean_token_accuracy": 0.42068966031074523, + "step": 113115 + }, + { + "epoch": 0.11393566542880482, + "grad_norm": 10.481446910962806, + "learning_rate": 4.949897327700657e-05, + "loss": 2.8785, + "mean_token_accuracy": 0.36896551847457887, + "step": 113120 + }, + { + "epoch": 0.11394070148190899, + "grad_norm": 9.550723010260254, + "learning_rate": 4.94988946377689e-05, + "loss": 2.1406, + "mean_token_accuracy": 0.44827585816383364, + "step": 113125 + }, + { + "epoch": 0.11394573753501316, + "grad_norm": 34.810218784031846, + "learning_rate": 4.949881599242978e-05, + "loss": 2.8493, + "mean_token_accuracy": 0.3931034505367279, + "step": 113130 + }, + { + "epoch": 0.11395077358811734, + "grad_norm": 12.314575408284757, + "learning_rate": 4.94987373409892e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.4137930989265442, + "step": 113135 + }, + { + "epoch": 0.11395580964122151, + "grad_norm": 9.397865391359787, + "learning_rate": 4.949865868344719e-05, + "loss": 2.4864, + "mean_token_accuracy": 0.40689654350280763, + "step": 113140 + }, + { + "epoch": 0.11396084569432567, + "grad_norm": 10.221560714454467, + "learning_rate": 4.949858001980376e-05, + "loss": 3.0097, + "mean_token_accuracy": 0.29715668559074404, + "step": 113145 + }, + { + "epoch": 0.11396588174742985, + "grad_norm": 11.451542010614418, + "learning_rate": 4.9498501350058954e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4172413766384125, + "step": 113150 + }, + { + "epoch": 0.11397091780053402, + "grad_norm": 9.172662428553256, + "learning_rate": 4.949842267421277e-05, + "loss": 2.4894, + "mean_token_accuracy": 0.43793103396892546, + "step": 113155 + }, + { + "epoch": 0.11397595385363819, + "grad_norm": 9.969517984225336, + "learning_rate": 4.949834399226525e-05, + "loss": 2.2152, + "mean_token_accuracy": 0.458620685338974, + "step": 113160 + }, + { + "epoch": 0.11398098990674237, + "grad_norm": 10.647808643521731, + "learning_rate": 4.9498265304216394e-05, + "loss": 2.1946, + "mean_token_accuracy": 0.4137930989265442, + "step": 113165 + }, + { + "epoch": 0.11398602595984654, + "grad_norm": 10.64814818584678, + "learning_rate": 4.949818661006625e-05, + "loss": 2.43, + "mean_token_accuracy": 0.42758620381355283, + "step": 113170 + }, + { + "epoch": 0.11399106201295071, + "grad_norm": 11.474775204938512, + "learning_rate": 4.9498107909814825e-05, + "loss": 2.4596, + "mean_token_accuracy": 0.4172413766384125, + "step": 113175 + }, + { + "epoch": 0.11399609806605489, + "grad_norm": 10.125856101884013, + "learning_rate": 4.949802920346214e-05, + "loss": 2.3479, + "mean_token_accuracy": 0.42068966031074523, + "step": 113180 + }, + { + "epoch": 0.11400113411915906, + "grad_norm": 11.191507978650462, + "learning_rate": 4.9497950491008224e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.42413792610168455, + "step": 113185 + }, + { + "epoch": 0.11400617017226324, + "grad_norm": 11.33454535843002, + "learning_rate": 4.949787177245309e-05, + "loss": 2.3468, + "mean_token_accuracy": 0.49458128213882446, + "step": 113190 + }, + { + "epoch": 0.11401120622536741, + "grad_norm": 12.087548133594487, + "learning_rate": 4.949779304779676e-05, + "loss": 2.5392, + "mean_token_accuracy": 0.3965517282485962, + "step": 113195 + }, + { + "epoch": 0.11401624227847158, + "grad_norm": 9.960898151943242, + "learning_rate": 4.949771431703927e-05, + "loss": 2.278, + "mean_token_accuracy": 0.41034482717514037, + "step": 113200 + }, + { + "epoch": 0.11402127833157576, + "grad_norm": 12.06523950578498, + "learning_rate": 4.949763558018063e-05, + "loss": 2.1652, + "mean_token_accuracy": 0.44827585220336913, + "step": 113205 + }, + { + "epoch": 0.11402631438467993, + "grad_norm": 9.831053572903501, + "learning_rate": 4.949755683722086e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.4241379380226135, + "step": 113210 + }, + { + "epoch": 0.11403135043778409, + "grad_norm": 10.775848668041025, + "learning_rate": 4.949747808815999e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.4344827592372894, + "step": 113215 + }, + { + "epoch": 0.11403638649088826, + "grad_norm": 14.695883011614601, + "learning_rate": 4.9497399332998036e-05, + "loss": 2.307, + "mean_token_accuracy": 0.42758620977401735, + "step": 113220 + }, + { + "epoch": 0.11404142254399244, + "grad_norm": 8.710291487652412, + "learning_rate": 4.9497320571735014e-05, + "loss": 2.1022, + "mean_token_accuracy": 0.4655172348022461, + "step": 113225 + }, + { + "epoch": 0.11404645859709661, + "grad_norm": 11.419332765046402, + "learning_rate": 4.949724180437096e-05, + "loss": 2.3351, + "mean_token_accuracy": 0.4344827592372894, + "step": 113230 + }, + { + "epoch": 0.11405149465020079, + "grad_norm": 10.20398804765159, + "learning_rate": 4.949716303090589e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.42413792610168455, + "step": 113235 + }, + { + "epoch": 0.11405653070330496, + "grad_norm": 10.845057161348851, + "learning_rate": 4.949708425133982e-05, + "loss": 2.1939, + "mean_token_accuracy": 0.44482758045196535, + "step": 113240 + }, + { + "epoch": 0.11406156675640913, + "grad_norm": 10.049798920749064, + "learning_rate": 4.949700546567278e-05, + "loss": 2.5361, + "mean_token_accuracy": 0.42068966031074523, + "step": 113245 + }, + { + "epoch": 0.1140666028095133, + "grad_norm": 11.378965183847846, + "learning_rate": 4.949692667390478e-05, + "loss": 2.5501, + "mean_token_accuracy": 0.4000000059604645, + "step": 113250 + }, + { + "epoch": 0.11407163886261748, + "grad_norm": 9.294428055582308, + "learning_rate": 4.9496847876035856e-05, + "loss": 2.973, + "mean_token_accuracy": 0.3620689660310745, + "step": 113255 + }, + { + "epoch": 0.11407667491572165, + "grad_norm": 13.975864531693889, + "learning_rate": 4.949676907206603e-05, + "loss": 2.7928, + "mean_token_accuracy": 0.37586207389831544, + "step": 113260 + }, + { + "epoch": 0.11408171096882583, + "grad_norm": 13.660362790115851, + "learning_rate": 4.9496690261995316e-05, + "loss": 2.6571, + "mean_token_accuracy": 0.41379310488700866, + "step": 113265 + }, + { + "epoch": 0.11408674702193, + "grad_norm": 11.148633100257763, + "learning_rate": 4.949661144582374e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.458620685338974, + "step": 113270 + }, + { + "epoch": 0.11409178307503418, + "grad_norm": 12.064414064142063, + "learning_rate": 4.949653262355131e-05, + "loss": 2.2437, + "mean_token_accuracy": 0.432667875289917, + "step": 113275 + }, + { + "epoch": 0.11409681912813835, + "grad_norm": 10.688852440708585, + "learning_rate": 4.949645379517807e-05, + "loss": 1.8782, + "mean_token_accuracy": 0.541379302740097, + "step": 113280 + }, + { + "epoch": 0.11410185518124251, + "grad_norm": 11.218316108767265, + "learning_rate": 4.949637496070403e-05, + "loss": 2.2959, + "mean_token_accuracy": 0.4379310369491577, + "step": 113285 + }, + { + "epoch": 0.11410689123434668, + "grad_norm": 14.816725548606522, + "learning_rate": 4.949629612012921e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.44827587008476255, + "step": 113290 + }, + { + "epoch": 0.11411192728745086, + "grad_norm": 11.061974015416226, + "learning_rate": 4.949621727345364e-05, + "loss": 2.7431, + "mean_token_accuracy": 0.42413792610168455, + "step": 113295 + }, + { + "epoch": 0.11411696334055503, + "grad_norm": 9.964756555006183, + "learning_rate": 4.949613842067733e-05, + "loss": 2.1772, + "mean_token_accuracy": 0.4896551609039307, + "step": 113300 + }, + { + "epoch": 0.1141219993936592, + "grad_norm": 12.390570868900841, + "learning_rate": 4.949605956180031e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.41724138259887694, + "step": 113305 + }, + { + "epoch": 0.11412703544676338, + "grad_norm": 14.845901343009714, + "learning_rate": 4.949598069682261e-05, + "loss": 2.803, + "mean_token_accuracy": 0.3793103516101837, + "step": 113310 + }, + { + "epoch": 0.11413207149986755, + "grad_norm": 10.67618062191245, + "learning_rate": 4.9495901825744235e-05, + "loss": 2.1307, + "mean_token_accuracy": 0.43793103098869324, + "step": 113315 + }, + { + "epoch": 0.11413710755297173, + "grad_norm": 10.031834764163156, + "learning_rate": 4.949582294856521e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.48275861144065857, + "step": 113320 + }, + { + "epoch": 0.1141421436060759, + "grad_norm": 15.736364692493241, + "learning_rate": 4.9495744065285574e-05, + "loss": 2.5228, + "mean_token_accuracy": 0.38275861740112305, + "step": 113325 + }, + { + "epoch": 0.11414717965918007, + "grad_norm": 9.364293880169328, + "learning_rate": 4.949566517590532e-05, + "loss": 2.0923, + "mean_token_accuracy": 0.44482758045196535, + "step": 113330 + }, + { + "epoch": 0.11415221571228425, + "grad_norm": 8.819982790443811, + "learning_rate": 4.9495586280424504e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4068965494632721, + "step": 113335 + }, + { + "epoch": 0.11415725176538842, + "grad_norm": 11.601371405048335, + "learning_rate": 4.949550737884312e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.40544463992118834, + "step": 113340 + }, + { + "epoch": 0.1141622878184926, + "grad_norm": 13.0197702469481, + "learning_rate": 4.9495428471161207e-05, + "loss": 2.3181, + "mean_token_accuracy": 0.4620689630508423, + "step": 113345 + }, + { + "epoch": 0.11416732387159677, + "grad_norm": 12.50222061393604, + "learning_rate": 4.949534955737877e-05, + "loss": 2.6322, + "mean_token_accuracy": 0.417241370677948, + "step": 113350 + }, + { + "epoch": 0.11417235992470093, + "grad_norm": 10.474822116968944, + "learning_rate": 4.949527063749585e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.4620689630508423, + "step": 113355 + }, + { + "epoch": 0.1141773959778051, + "grad_norm": 14.458068592006503, + "learning_rate": 4.9495191711512455e-05, + "loss": 2.3975, + "mean_token_accuracy": 0.4551724135875702, + "step": 113360 + }, + { + "epoch": 0.11418243203090928, + "grad_norm": 11.361647917559504, + "learning_rate": 4.949511277942861e-05, + "loss": 2.572, + "mean_token_accuracy": 0.4034482777118683, + "step": 113365 + }, + { + "epoch": 0.11418746808401345, + "grad_norm": 12.932486338295037, + "learning_rate": 4.949503384124434e-05, + "loss": 2.6922, + "mean_token_accuracy": 0.3862069010734558, + "step": 113370 + }, + { + "epoch": 0.11419250413711762, + "grad_norm": 9.67422420348851, + "learning_rate": 4.9494954896959665e-05, + "loss": 2.7554, + "mean_token_accuracy": 0.4206896543502808, + "step": 113375 + }, + { + "epoch": 0.1141975401902218, + "grad_norm": 7.887708908293583, + "learning_rate": 4.949487594657461e-05, + "loss": 2.4777, + "mean_token_accuracy": 0.41034482717514037, + "step": 113380 + }, + { + "epoch": 0.11420257624332597, + "grad_norm": 9.680144030373048, + "learning_rate": 4.949479699008919e-05, + "loss": 2.0863, + "mean_token_accuracy": 0.47773745059967043, + "step": 113385 + }, + { + "epoch": 0.11420761229643014, + "grad_norm": 8.883907198980403, + "learning_rate": 4.949471802750344e-05, + "loss": 2.3385, + "mean_token_accuracy": 0.4310344815254211, + "step": 113390 + }, + { + "epoch": 0.11421264834953432, + "grad_norm": 11.62154901442218, + "learning_rate": 4.9494639058817364e-05, + "loss": 2.2665, + "mean_token_accuracy": 0.47586206793785096, + "step": 113395 + }, + { + "epoch": 0.11421768440263849, + "grad_norm": 10.840802169478733, + "learning_rate": 4.9494560084031e-05, + "loss": 2.1543, + "mean_token_accuracy": 0.4419950723648071, + "step": 113400 + }, + { + "epoch": 0.11422272045574267, + "grad_norm": 10.73399636419925, + "learning_rate": 4.949448110314435e-05, + "loss": 2.7492, + "mean_token_accuracy": 0.39310344457626345, + "step": 113405 + }, + { + "epoch": 0.11422775650884684, + "grad_norm": 10.139801059311369, + "learning_rate": 4.9494402116157454e-05, + "loss": 2.2088, + "mean_token_accuracy": 0.4689655125141144, + "step": 113410 + }, + { + "epoch": 0.11423279256195101, + "grad_norm": 11.13104416938885, + "learning_rate": 4.949432312307034e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4359951615333557, + "step": 113415 + }, + { + "epoch": 0.11423782861505519, + "grad_norm": 14.536714460285188, + "learning_rate": 4.9494244123883005e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.41034482717514037, + "step": 113420 + }, + { + "epoch": 0.11424286466815935, + "grad_norm": 13.429721080359569, + "learning_rate": 4.9494165118595484e-05, + "loss": 2.6397, + "mean_token_accuracy": 0.3758620619773865, + "step": 113425 + }, + { + "epoch": 0.11424790072126352, + "grad_norm": 10.29286802052455, + "learning_rate": 4.949408610720781e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.4137930989265442, + "step": 113430 + }, + { + "epoch": 0.1142529367743677, + "grad_norm": 12.176194479251862, + "learning_rate": 4.949400708971998e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.47241380214691164, + "step": 113435 + }, + { + "epoch": 0.11425797282747187, + "grad_norm": 11.04352862540111, + "learning_rate": 4.9493928066132044e-05, + "loss": 2.131, + "mean_token_accuracy": 0.4448275864124298, + "step": 113440 + }, + { + "epoch": 0.11426300888057604, + "grad_norm": 8.749721719723174, + "learning_rate": 4.9493849036444004e-05, + "loss": 2.3874, + "mean_token_accuracy": 0.44482759237289426, + "step": 113445 + }, + { + "epoch": 0.11426804493368022, + "grad_norm": 11.753717979389771, + "learning_rate": 4.949377000065589e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.4517241418361664, + "step": 113450 + }, + { + "epoch": 0.11427308098678439, + "grad_norm": 11.952751825182776, + "learning_rate": 4.949369095876771e-05, + "loss": 2.4727, + "mean_token_accuracy": 0.35862068831920624, + "step": 113455 + }, + { + "epoch": 0.11427811703988856, + "grad_norm": 12.615017839166764, + "learning_rate": 4.949361191077951e-05, + "loss": 2.517, + "mean_token_accuracy": 0.39655172228813174, + "step": 113460 + }, + { + "epoch": 0.11428315309299274, + "grad_norm": 8.643812854011056, + "learning_rate": 4.94935328566913e-05, + "loss": 2.1651, + "mean_token_accuracy": 0.4623109459877014, + "step": 113465 + }, + { + "epoch": 0.11428818914609691, + "grad_norm": 14.701537562191152, + "learning_rate": 4.9493453796503096e-05, + "loss": 2.4089, + "mean_token_accuracy": 0.45069570541381837, + "step": 113470 + }, + { + "epoch": 0.11429322519920108, + "grad_norm": 14.544497270939637, + "learning_rate": 4.9493374730214925e-05, + "loss": 2.4453, + "mean_token_accuracy": 0.4379310369491577, + "step": 113475 + }, + { + "epoch": 0.11429826125230526, + "grad_norm": 10.722636481132666, + "learning_rate": 4.949329565782682e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.4517241358757019, + "step": 113480 + }, + { + "epoch": 0.11430329730540943, + "grad_norm": 10.245423331537594, + "learning_rate": 4.949321657933878e-05, + "loss": 2.7559, + "mean_token_accuracy": 0.3793103456497192, + "step": 113485 + }, + { + "epoch": 0.1143083333585136, + "grad_norm": 9.015153978871965, + "learning_rate": 4.9493137494750843e-05, + "loss": 2.2583, + "mean_token_accuracy": 0.46551724076271056, + "step": 113490 + }, + { + "epoch": 0.11431336941161777, + "grad_norm": 13.955637579080173, + "learning_rate": 4.9493058404063034e-05, + "loss": 2.3894, + "mean_token_accuracy": 0.4676346004009247, + "step": 113495 + }, + { + "epoch": 0.11431840546472194, + "grad_norm": 10.164242267721278, + "learning_rate": 4.949297930727537e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4396249264478683, + "step": 113500 + }, + { + "epoch": 0.11432344151782611, + "grad_norm": 10.701808374759569, + "learning_rate": 4.949290020438785e-05, + "loss": 2.274, + "mean_token_accuracy": 0.3999999940395355, + "step": 113505 + }, + { + "epoch": 0.11432847757093029, + "grad_norm": 10.056245059931017, + "learning_rate": 4.9492821095400534e-05, + "loss": 2.7455, + "mean_token_accuracy": 0.41379311084747317, + "step": 113510 + }, + { + "epoch": 0.11433351362403446, + "grad_norm": 10.379616270184927, + "learning_rate": 4.9492741980313426e-05, + "loss": 2.75, + "mean_token_accuracy": 0.43103447556495667, + "step": 113515 + }, + { + "epoch": 0.11433854967713863, + "grad_norm": 10.128488584249718, + "learning_rate": 4.949266285912655e-05, + "loss": 2.1725, + "mean_token_accuracy": 0.44827585816383364, + "step": 113520 + }, + { + "epoch": 0.11434358573024281, + "grad_norm": 10.028530631862171, + "learning_rate": 4.949258373183992e-05, + "loss": 2.7574, + "mean_token_accuracy": 0.37931033968925476, + "step": 113525 + }, + { + "epoch": 0.11434862178334698, + "grad_norm": 11.029166188529539, + "learning_rate": 4.949250459845357e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.39655171930789945, + "step": 113530 + }, + { + "epoch": 0.11435365783645116, + "grad_norm": 8.639796434258983, + "learning_rate": 4.9492425458967515e-05, + "loss": 2.232, + "mean_token_accuracy": 0.4620689630508423, + "step": 113535 + }, + { + "epoch": 0.11435869388955533, + "grad_norm": 10.308297789321932, + "learning_rate": 4.949234631338178e-05, + "loss": 2.4251, + "mean_token_accuracy": 0.4482758641242981, + "step": 113540 + }, + { + "epoch": 0.1143637299426595, + "grad_norm": 15.08414463091528, + "learning_rate": 4.9492267161696384e-05, + "loss": 2.846, + "mean_token_accuracy": 0.34482758641242983, + "step": 113545 + }, + { + "epoch": 0.11436876599576368, + "grad_norm": 11.177042411524832, + "learning_rate": 4.949218800391135e-05, + "loss": 2.1939, + "mean_token_accuracy": 0.4965517222881317, + "step": 113550 + }, + { + "epoch": 0.11437380204886785, + "grad_norm": 10.12394115917201, + "learning_rate": 4.94921088400267e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.441379314661026, + "step": 113555 + }, + { + "epoch": 0.11437883810197202, + "grad_norm": 11.461125092714033, + "learning_rate": 4.9492029670042464e-05, + "loss": 2.5777, + "mean_token_accuracy": 0.39655172228813174, + "step": 113560 + }, + { + "epoch": 0.11438387415507618, + "grad_norm": 12.145113342961128, + "learning_rate": 4.949195049395866e-05, + "loss": 2.6422, + "mean_token_accuracy": 0.3931034475564957, + "step": 113565 + }, + { + "epoch": 0.11438891020818036, + "grad_norm": 11.039025934611505, + "learning_rate": 4.949187131177529e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.4068965494632721, + "step": 113570 + }, + { + "epoch": 0.11439394626128453, + "grad_norm": 11.917544606537819, + "learning_rate": 4.94917921234924e-05, + "loss": 2.6254, + "mean_token_accuracy": 0.37241379618644715, + "step": 113575 + }, + { + "epoch": 0.1143989823143887, + "grad_norm": 10.91747988373674, + "learning_rate": 4.9491712929110005e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.482758617401123, + "step": 113580 + }, + { + "epoch": 0.11440401836749288, + "grad_norm": 9.607019364874029, + "learning_rate": 4.9491633728628125e-05, + "loss": 2.2214, + "mean_token_accuracy": 0.4620689630508423, + "step": 113585 + }, + { + "epoch": 0.11440905442059705, + "grad_norm": 11.946990671274653, + "learning_rate": 4.949155452204678e-05, + "loss": 2.1696, + "mean_token_accuracy": 0.4344827592372894, + "step": 113590 + }, + { + "epoch": 0.11441409047370123, + "grad_norm": 10.656620084598478, + "learning_rate": 4.949147530936601e-05, + "loss": 2.6961, + "mean_token_accuracy": 0.4068965375423431, + "step": 113595 + }, + { + "epoch": 0.1144191265268054, + "grad_norm": 10.741289191887997, + "learning_rate": 4.949139609058581e-05, + "loss": 2.6364, + "mean_token_accuracy": 0.4228675127029419, + "step": 113600 + }, + { + "epoch": 0.11442416257990957, + "grad_norm": 17.391155622455074, + "learning_rate": 4.949131686570622e-05, + "loss": 2.4523, + "mean_token_accuracy": 0.41379310488700866, + "step": 113605 + }, + { + "epoch": 0.11442919863301375, + "grad_norm": 15.482805296608134, + "learning_rate": 4.949123763472725e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.4620689690113068, + "step": 113610 + }, + { + "epoch": 0.11443423468611792, + "grad_norm": 8.62625679001273, + "learning_rate": 4.949115839764894e-05, + "loss": 2.4148, + "mean_token_accuracy": 0.4862069010734558, + "step": 113615 + }, + { + "epoch": 0.1144392707392221, + "grad_norm": 11.702000404910105, + "learning_rate": 4.9491079154471285e-05, + "loss": 2.5361, + "mean_token_accuracy": 0.43793103098869324, + "step": 113620 + }, + { + "epoch": 0.11444430679232627, + "grad_norm": 10.346494056259406, + "learning_rate": 4.9490999905194324e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.4344827592372894, + "step": 113625 + }, + { + "epoch": 0.11444934284543043, + "grad_norm": 12.908847049773321, + "learning_rate": 4.949092064981808e-05, + "loss": 2.4551, + "mean_token_accuracy": 0.42413793206214906, + "step": 113630 + }, + { + "epoch": 0.1144543788985346, + "grad_norm": 11.779814119931922, + "learning_rate": 4.949084138834258e-05, + "loss": 2.3351, + "mean_token_accuracy": 0.4137930989265442, + "step": 113635 + }, + { + "epoch": 0.11445941495163878, + "grad_norm": 11.125115793043847, + "learning_rate": 4.949076212076782e-05, + "loss": 2.3582, + "mean_token_accuracy": 0.4310344815254211, + "step": 113640 + }, + { + "epoch": 0.11446445100474295, + "grad_norm": 10.960252828186526, + "learning_rate": 4.9490682847093853e-05, + "loss": 2.1299, + "mean_token_accuracy": 0.482758629322052, + "step": 113645 + }, + { + "epoch": 0.11446948705784712, + "grad_norm": 12.008041702282949, + "learning_rate": 4.949060356732069e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.4482758641242981, + "step": 113650 + }, + { + "epoch": 0.1144745231109513, + "grad_norm": 10.583236393470349, + "learning_rate": 4.949052428144835e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4154869973659515, + "step": 113655 + }, + { + "epoch": 0.11447955916405547, + "grad_norm": 9.57775670723433, + "learning_rate": 4.9490444989476855e-05, + "loss": 1.8755, + "mean_token_accuracy": 0.536721122264862, + "step": 113660 + }, + { + "epoch": 0.11448459521715965, + "grad_norm": 9.15819841022125, + "learning_rate": 4.949036569140623e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.39310343861579894, + "step": 113665 + }, + { + "epoch": 0.11448963127026382, + "grad_norm": 12.42287228126611, + "learning_rate": 4.949028638723649e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.4517241358757019, + "step": 113670 + }, + { + "epoch": 0.114494667323368, + "grad_norm": 14.653872043611278, + "learning_rate": 4.9490207076967656e-05, + "loss": 2.6872, + "mean_token_accuracy": 0.4310344815254211, + "step": 113675 + }, + { + "epoch": 0.11449970337647217, + "grad_norm": 11.294341596083138, + "learning_rate": 4.949012776059977e-05, + "loss": 2.3009, + "mean_token_accuracy": 0.4068965494632721, + "step": 113680 + }, + { + "epoch": 0.11450473942957634, + "grad_norm": 11.918013344220562, + "learning_rate": 4.9490048438132834e-05, + "loss": 2.1516, + "mean_token_accuracy": 0.458620685338974, + "step": 113685 + }, + { + "epoch": 0.11450977548268051, + "grad_norm": 9.69246667312578, + "learning_rate": 4.9489969109566874e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.4776164650917053, + "step": 113690 + }, + { + "epoch": 0.11451481153578469, + "grad_norm": 11.139233038301702, + "learning_rate": 4.9489889774901924e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.43103447556495667, + "step": 113695 + }, + { + "epoch": 0.11451984758888885, + "grad_norm": 9.183912868197861, + "learning_rate": 4.948981043413798e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.39655172228813174, + "step": 113700 + }, + { + "epoch": 0.11452488364199302, + "grad_norm": 12.614316695091405, + "learning_rate": 4.948973108727509e-05, + "loss": 2.6711, + "mean_token_accuracy": 0.4034482777118683, + "step": 113705 + }, + { + "epoch": 0.1145299196950972, + "grad_norm": 9.632480915156544, + "learning_rate": 4.9489651734313266e-05, + "loss": 2.2178, + "mean_token_accuracy": 0.4172413766384125, + "step": 113710 + }, + { + "epoch": 0.11453495574820137, + "grad_norm": 8.296316241171496, + "learning_rate": 4.948957237525253e-05, + "loss": 2.4047, + "mean_token_accuracy": 0.4551724076271057, + "step": 113715 + }, + { + "epoch": 0.11453999180130554, + "grad_norm": 12.759721194862824, + "learning_rate": 4.94894930100929e-05, + "loss": 2.7462, + "mean_token_accuracy": 0.38620689511299133, + "step": 113720 + }, + { + "epoch": 0.11454502785440972, + "grad_norm": 10.036537827581931, + "learning_rate": 4.948941363883441e-05, + "loss": 2.8933, + "mean_token_accuracy": 0.4034482777118683, + "step": 113725 + }, + { + "epoch": 0.11455006390751389, + "grad_norm": 12.447410641465243, + "learning_rate": 4.9489334261477065e-05, + "loss": 2.8365, + "mean_token_accuracy": 0.3655172407627106, + "step": 113730 + }, + { + "epoch": 0.11455509996061806, + "grad_norm": 11.953243997680993, + "learning_rate": 4.9489254878020904e-05, + "loss": 2.0314, + "mean_token_accuracy": 0.49655171632766726, + "step": 113735 + }, + { + "epoch": 0.11456013601372224, + "grad_norm": 11.78610799620108, + "learning_rate": 4.9489175488465936e-05, + "loss": 2.7733, + "mean_token_accuracy": 0.35862069129943847, + "step": 113740 + }, + { + "epoch": 0.11456517206682641, + "grad_norm": 9.406421298271498, + "learning_rate": 4.948909609281218e-05, + "loss": 1.997, + "mean_token_accuracy": 0.5103448331356049, + "step": 113745 + }, + { + "epoch": 0.11457020811993059, + "grad_norm": 10.095220806974822, + "learning_rate": 4.9489016691059674e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.42758620381355283, + "step": 113750 + }, + { + "epoch": 0.11457524417303476, + "grad_norm": 13.388223965868127, + "learning_rate": 4.948893728320844e-05, + "loss": 2.6073, + "mean_token_accuracy": 0.4275862157344818, + "step": 113755 + }, + { + "epoch": 0.11458028022613893, + "grad_norm": 9.471261397869029, + "learning_rate": 4.9488857869258484e-05, + "loss": 2.3186, + "mean_token_accuracy": 0.4310344815254211, + "step": 113760 + }, + { + "epoch": 0.11458531627924311, + "grad_norm": 12.643587075811629, + "learning_rate": 4.9488778449209835e-05, + "loss": 2.1844, + "mean_token_accuracy": 0.4811857283115387, + "step": 113765 + }, + { + "epoch": 0.11459035233234727, + "grad_norm": 11.020482074224862, + "learning_rate": 4.9488699023062515e-05, + "loss": 2.1993, + "mean_token_accuracy": 0.45759225487709043, + "step": 113770 + }, + { + "epoch": 0.11459538838545144, + "grad_norm": 9.02576152182453, + "learning_rate": 4.948861959081656e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.40689654350280763, + "step": 113775 + }, + { + "epoch": 0.11460042443855561, + "grad_norm": 11.790217320470852, + "learning_rate": 4.948854015247197e-05, + "loss": 2.5383, + "mean_token_accuracy": 0.42068964838981626, + "step": 113780 + }, + { + "epoch": 0.11460546049165979, + "grad_norm": 10.833820992894111, + "learning_rate": 4.948846070802878e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.3931034475564957, + "step": 113785 + }, + { + "epoch": 0.11461049654476396, + "grad_norm": 9.15191295838091, + "learning_rate": 4.9488381257487e-05, + "loss": 2.168, + "mean_token_accuracy": 0.4551724076271057, + "step": 113790 + }, + { + "epoch": 0.11461553259786814, + "grad_norm": 12.64919693295713, + "learning_rate": 4.948830180084667e-05, + "loss": 2.4248, + "mean_token_accuracy": 0.42413793206214906, + "step": 113795 + }, + { + "epoch": 0.11462056865097231, + "grad_norm": 12.098311546097689, + "learning_rate": 4.9488222338107795e-05, + "loss": 2.2487, + "mean_token_accuracy": 0.4517241299152374, + "step": 113800 + }, + { + "epoch": 0.11462560470407648, + "grad_norm": 11.590061303108335, + "learning_rate": 4.948814286927041e-05, + "loss": 2.3527, + "mean_token_accuracy": 0.3862069010734558, + "step": 113805 + }, + { + "epoch": 0.11463064075718066, + "grad_norm": 11.089201212737738, + "learning_rate": 4.9488063394334525e-05, + "loss": 1.9798, + "mean_token_accuracy": 0.4931034505367279, + "step": 113810 + }, + { + "epoch": 0.11463567681028483, + "grad_norm": 9.799307407646957, + "learning_rate": 4.9487983913300175e-05, + "loss": 2.0101, + "mean_token_accuracy": 0.5344827592372894, + "step": 113815 + }, + { + "epoch": 0.114640712863389, + "grad_norm": 11.72433125646831, + "learning_rate": 4.948790442616738e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.37931033968925476, + "step": 113820 + }, + { + "epoch": 0.11464574891649318, + "grad_norm": 11.495957315083603, + "learning_rate": 4.9487824932936145e-05, + "loss": 2.5161, + "mean_token_accuracy": 0.4448275864124298, + "step": 113825 + }, + { + "epoch": 0.11465078496959735, + "grad_norm": 9.615827817054202, + "learning_rate": 4.9487745433606507e-05, + "loss": 2.5414, + "mean_token_accuracy": 0.441379314661026, + "step": 113830 + }, + { + "epoch": 0.11465582102270153, + "grad_norm": 11.397016871850619, + "learning_rate": 4.9487665928178495e-05, + "loss": 2.5587, + "mean_token_accuracy": 0.4137930989265442, + "step": 113835 + }, + { + "epoch": 0.11466085707580569, + "grad_norm": 10.278638141718346, + "learning_rate": 4.948758641665212e-05, + "loss": 2.2742, + "mean_token_accuracy": 0.42413793206214906, + "step": 113840 + }, + { + "epoch": 0.11466589312890986, + "grad_norm": 11.024709624385842, + "learning_rate": 4.94875068990274e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.4068965494632721, + "step": 113845 + }, + { + "epoch": 0.11467092918201403, + "grad_norm": 10.649946148412724, + "learning_rate": 4.9487427375304365e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.4137930989265442, + "step": 113850 + }, + { + "epoch": 0.11467596523511821, + "grad_norm": 9.277995238077608, + "learning_rate": 4.948734784548303e-05, + "loss": 2.5528, + "mean_token_accuracy": 0.4517241358757019, + "step": 113855 + }, + { + "epoch": 0.11468100128822238, + "grad_norm": 10.86562718460318, + "learning_rate": 4.9487268309563426e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.46551724076271056, + "step": 113860 + }, + { + "epoch": 0.11468603734132655, + "grad_norm": 11.57718861134086, + "learning_rate": 4.9487188767545576e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.3862068891525269, + "step": 113865 + }, + { + "epoch": 0.11469107339443073, + "grad_norm": 10.33927655828899, + "learning_rate": 4.94871092194295e-05, + "loss": 2.542, + "mean_token_accuracy": 0.38965516686439516, + "step": 113870 + }, + { + "epoch": 0.1146961094475349, + "grad_norm": 12.595536968842474, + "learning_rate": 4.9487029665215206e-05, + "loss": 2.646, + "mean_token_accuracy": 0.3827586233615875, + "step": 113875 + }, + { + "epoch": 0.11470114550063908, + "grad_norm": 10.606889630928459, + "learning_rate": 4.948695010490273e-05, + "loss": 2.5376, + "mean_token_accuracy": 0.4068965554237366, + "step": 113880 + }, + { + "epoch": 0.11470618155374325, + "grad_norm": 11.796607424015194, + "learning_rate": 4.9486870538492096e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.43448275327682495, + "step": 113885 + }, + { + "epoch": 0.11471121760684742, + "grad_norm": 14.262373274703593, + "learning_rate": 4.9486790965983324e-05, + "loss": 2.4766, + "mean_token_accuracy": 0.4931034505367279, + "step": 113890 + }, + { + "epoch": 0.1147162536599516, + "grad_norm": 10.473058102726982, + "learning_rate": 4.948671138737643e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.4310344815254211, + "step": 113895 + }, + { + "epoch": 0.11472128971305577, + "grad_norm": 14.285388653870314, + "learning_rate": 4.948663180267143e-05, + "loss": 2.3645, + "mean_token_accuracy": 0.4620689690113068, + "step": 113900 + }, + { + "epoch": 0.11472632576615995, + "grad_norm": 11.24305873388817, + "learning_rate": 4.948655221186836e-05, + "loss": 2.7835, + "mean_token_accuracy": 0.3517241358757019, + "step": 113905 + }, + { + "epoch": 0.1147313618192641, + "grad_norm": 11.77452984959016, + "learning_rate": 4.948647261496725e-05, + "loss": 2.4833, + "mean_token_accuracy": 0.426013308763504, + "step": 113910 + }, + { + "epoch": 0.11473639787236828, + "grad_norm": 9.78259016733333, + "learning_rate": 4.94863930119681e-05, + "loss": 2.3666, + "mean_token_accuracy": 0.42413793206214906, + "step": 113915 + }, + { + "epoch": 0.11474143392547245, + "grad_norm": 11.90363737609743, + "learning_rate": 4.948631340287095e-05, + "loss": 2.5529, + "mean_token_accuracy": 0.41724138259887694, + "step": 113920 + }, + { + "epoch": 0.11474646997857663, + "grad_norm": 10.411634503372966, + "learning_rate": 4.94862337876758e-05, + "loss": 2.7654, + "mean_token_accuracy": 0.40907440185546873, + "step": 113925 + }, + { + "epoch": 0.1147515060316808, + "grad_norm": 12.423891223336954, + "learning_rate": 4.948615416638269e-05, + "loss": 2.2315, + "mean_token_accuracy": 0.4620689690113068, + "step": 113930 + }, + { + "epoch": 0.11475654208478497, + "grad_norm": 12.018425686425598, + "learning_rate": 4.9486074538991645e-05, + "loss": 2.0233, + "mean_token_accuracy": 0.4482758641242981, + "step": 113935 + }, + { + "epoch": 0.11476157813788915, + "grad_norm": 10.912654916593338, + "learning_rate": 4.948599490550268e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4172413766384125, + "step": 113940 + }, + { + "epoch": 0.11476661419099332, + "grad_norm": 12.06295380848458, + "learning_rate": 4.948591526591581e-05, + "loss": 2.647, + "mean_token_accuracy": 0.39655172228813174, + "step": 113945 + }, + { + "epoch": 0.1147716502440975, + "grad_norm": 9.18634312006316, + "learning_rate": 4.948583562023106e-05, + "loss": 2.3188, + "mean_token_accuracy": 0.4551724076271057, + "step": 113950 + }, + { + "epoch": 0.11477668629720167, + "grad_norm": 13.814035827151605, + "learning_rate": 4.9485755968448476e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.4310344815254211, + "step": 113955 + }, + { + "epoch": 0.11478172235030584, + "grad_norm": 10.84685704303101, + "learning_rate": 4.9485676310568046e-05, + "loss": 2.4725, + "mean_token_accuracy": 0.4241379380226135, + "step": 113960 + }, + { + "epoch": 0.11478675840341002, + "grad_norm": 10.54386286725506, + "learning_rate": 4.948559664658981e-05, + "loss": 2.0323, + "mean_token_accuracy": 0.4448275864124298, + "step": 113965 + }, + { + "epoch": 0.11479179445651419, + "grad_norm": 11.51650669111011, + "learning_rate": 4.948551697651379e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4137930989265442, + "step": 113970 + }, + { + "epoch": 0.11479683050961836, + "grad_norm": 11.172986342949743, + "learning_rate": 4.948543730034e-05, + "loss": 2.3643, + "mean_token_accuracy": 0.44137930274009707, + "step": 113975 + }, + { + "epoch": 0.11480186656272252, + "grad_norm": 10.282606205618496, + "learning_rate": 4.948535761806847e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.39655172228813174, + "step": 113980 + }, + { + "epoch": 0.1148069026158267, + "grad_norm": 8.888747477147502, + "learning_rate": 4.948527792969921e-05, + "loss": 2.6922, + "mean_token_accuracy": 0.4206896543502808, + "step": 113985 + }, + { + "epoch": 0.11481193866893087, + "grad_norm": 8.525829562270047, + "learning_rate": 4.9485198235232264e-05, + "loss": 2.3016, + "mean_token_accuracy": 0.3827586233615875, + "step": 113990 + }, + { + "epoch": 0.11481697472203505, + "grad_norm": 13.810846568031577, + "learning_rate": 4.9485118534667634e-05, + "loss": 2.6228, + "mean_token_accuracy": 0.33103448450565337, + "step": 113995 + }, + { + "epoch": 0.11482201077513922, + "grad_norm": 10.375553307280166, + "learning_rate": 4.948503882800536e-05, + "loss": 2.3311, + "mean_token_accuracy": 0.4137930989265442, + "step": 114000 + }, + { + "epoch": 0.11482704682824339, + "grad_norm": 9.001013142581225, + "learning_rate": 4.948495911524544e-05, + "loss": 2.1663, + "mean_token_accuracy": 0.4551724135875702, + "step": 114005 + }, + { + "epoch": 0.11483208288134757, + "grad_norm": 14.235429279709575, + "learning_rate": 4.948487939638791e-05, + "loss": 3.1092, + "mean_token_accuracy": 0.34137930274009703, + "step": 114010 + }, + { + "epoch": 0.11483711893445174, + "grad_norm": 11.371773245033062, + "learning_rate": 4.94847996714328e-05, + "loss": 2.1699, + "mean_token_accuracy": 0.47586206793785096, + "step": 114015 + }, + { + "epoch": 0.11484215498755591, + "grad_norm": 9.155351957488874, + "learning_rate": 4.948471994038012e-05, + "loss": 2.2043, + "mean_token_accuracy": 0.4448275864124298, + "step": 114020 + }, + { + "epoch": 0.11484719104066009, + "grad_norm": 10.16519489865712, + "learning_rate": 4.9484640203229896e-05, + "loss": 2.236, + "mean_token_accuracy": 0.43103448748588563, + "step": 114025 + }, + { + "epoch": 0.11485222709376426, + "grad_norm": 10.50597979725388, + "learning_rate": 4.948456045998215e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4034482717514038, + "step": 114030 + }, + { + "epoch": 0.11485726314686844, + "grad_norm": 12.404880570502268, + "learning_rate": 4.948448071063691e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.3931034505367279, + "step": 114035 + }, + { + "epoch": 0.11486229919997261, + "grad_norm": 13.37793461164307, + "learning_rate": 4.948440095519418e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.39104658365249634, + "step": 114040 + }, + { + "epoch": 0.11486733525307678, + "grad_norm": 11.847185817857563, + "learning_rate": 4.9484321193654e-05, + "loss": 2.363, + "mean_token_accuracy": 0.441379314661026, + "step": 114045 + }, + { + "epoch": 0.11487237130618094, + "grad_norm": 12.545202002162336, + "learning_rate": 4.948424142601639e-05, + "loss": 2.6762, + "mean_token_accuracy": 0.39310344457626345, + "step": 114050 + }, + { + "epoch": 0.11487740735928512, + "grad_norm": 10.991761816530065, + "learning_rate": 4.948416165228136e-05, + "loss": 2.5085, + "mean_token_accuracy": 0.38275861740112305, + "step": 114055 + }, + { + "epoch": 0.11488244341238929, + "grad_norm": 10.769079708819692, + "learning_rate": 4.948408187244895e-05, + "loss": 2.7193, + "mean_token_accuracy": 0.41379311084747317, + "step": 114060 + }, + { + "epoch": 0.11488747946549346, + "grad_norm": 9.905533680041689, + "learning_rate": 4.9484002086519165e-05, + "loss": 2.5577, + "mean_token_accuracy": 0.42413793206214906, + "step": 114065 + }, + { + "epoch": 0.11489251551859764, + "grad_norm": 9.861688952243156, + "learning_rate": 4.948392229449204e-05, + "loss": 2.045, + "mean_token_accuracy": 0.46896551847457885, + "step": 114070 + }, + { + "epoch": 0.11489755157170181, + "grad_norm": 15.9746567750674, + "learning_rate": 4.948384249636759e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.4000000059604645, + "step": 114075 + }, + { + "epoch": 0.11490258762480599, + "grad_norm": 12.009074934802989, + "learning_rate": 4.948376269214584e-05, + "loss": 2.6382, + "mean_token_accuracy": 0.42068964838981626, + "step": 114080 + }, + { + "epoch": 0.11490762367791016, + "grad_norm": 13.64660522941905, + "learning_rate": 4.9483682881826824e-05, + "loss": 2.0331, + "mean_token_accuracy": 0.4896551728248596, + "step": 114085 + }, + { + "epoch": 0.11491265973101433, + "grad_norm": 9.280593708131645, + "learning_rate": 4.948360306541053e-05, + "loss": 2.3967, + "mean_token_accuracy": 0.42413792610168455, + "step": 114090 + }, + { + "epoch": 0.1149176957841185, + "grad_norm": 8.396777275562203, + "learning_rate": 4.948352324289701e-05, + "loss": 2.6186, + "mean_token_accuracy": 0.417241370677948, + "step": 114095 + }, + { + "epoch": 0.11492273183722268, + "grad_norm": 12.026471867673358, + "learning_rate": 4.948344341428628e-05, + "loss": 2.5018, + "mean_token_accuracy": 0.4068965554237366, + "step": 114100 + }, + { + "epoch": 0.11492776789032685, + "grad_norm": 10.564590504863084, + "learning_rate": 4.9483363579578364e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.4738656938076019, + "step": 114105 + }, + { + "epoch": 0.11493280394343103, + "grad_norm": 12.253368627038808, + "learning_rate": 4.948328373877327e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.42546883821487425, + "step": 114110 + }, + { + "epoch": 0.1149378399965352, + "grad_norm": 10.777913057139038, + "learning_rate": 4.948320389187104e-05, + "loss": 2.5384, + "mean_token_accuracy": 0.41034482717514037, + "step": 114115 + }, + { + "epoch": 0.11494287604963936, + "grad_norm": 11.006712039320778, + "learning_rate": 4.948312403887168e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.44827585816383364, + "step": 114120 + }, + { + "epoch": 0.11494791210274354, + "grad_norm": 10.826713771697541, + "learning_rate": 4.948304417977522e-05, + "loss": 2.4728, + "mean_token_accuracy": 0.42758620977401735, + "step": 114125 + }, + { + "epoch": 0.11495294815584771, + "grad_norm": 11.741902115103413, + "learning_rate": 4.948296431458168e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.4758620738983154, + "step": 114130 + }, + { + "epoch": 0.11495798420895188, + "grad_norm": 9.423203753511673, + "learning_rate": 4.9482884443291084e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.4499697506427765, + "step": 114135 + }, + { + "epoch": 0.11496302026205606, + "grad_norm": 10.127323478544906, + "learning_rate": 4.9482804565903455e-05, + "loss": 2.6661, + "mean_token_accuracy": 0.4068965554237366, + "step": 114140 + }, + { + "epoch": 0.11496805631516023, + "grad_norm": 10.717278265783976, + "learning_rate": 4.948272468241881e-05, + "loss": 2.5826, + "mean_token_accuracy": 0.4, + "step": 114145 + }, + { + "epoch": 0.1149730923682644, + "grad_norm": 11.829556268726973, + "learning_rate": 4.948264479283718e-05, + "loss": 2.4729, + "mean_token_accuracy": 0.4413793087005615, + "step": 114150 + }, + { + "epoch": 0.11497812842136858, + "grad_norm": 10.015714812563699, + "learning_rate": 4.9482564897158576e-05, + "loss": 2.2554, + "mean_token_accuracy": 0.4482758641242981, + "step": 114155 + }, + { + "epoch": 0.11498316447447275, + "grad_norm": 11.024550062713494, + "learning_rate": 4.9482484995383025e-05, + "loss": 2.4351, + "mean_token_accuracy": 0.39655172228813174, + "step": 114160 + }, + { + "epoch": 0.11498820052757693, + "grad_norm": 12.413669346970101, + "learning_rate": 4.948240508751055e-05, + "loss": 2.6537, + "mean_token_accuracy": 0.45862069725990295, + "step": 114165 + }, + { + "epoch": 0.1149932365806811, + "grad_norm": 11.993218426553433, + "learning_rate": 4.9482325173541175e-05, + "loss": 2.5589, + "mean_token_accuracy": 0.4310344815254211, + "step": 114170 + }, + { + "epoch": 0.11499827263378527, + "grad_norm": 11.715414005312931, + "learning_rate": 4.9482245253474916e-05, + "loss": 2.7837, + "mean_token_accuracy": 0.36896551847457887, + "step": 114175 + }, + { + "epoch": 0.11500330868688945, + "grad_norm": 10.1591967873332, + "learning_rate": 4.948216532731181e-05, + "loss": 2.4536, + "mean_token_accuracy": 0.4310344815254211, + "step": 114180 + }, + { + "epoch": 0.11500834473999362, + "grad_norm": 9.471177610560758, + "learning_rate": 4.948208539505186e-05, + "loss": 2.2309, + "mean_token_accuracy": 0.4540834903717041, + "step": 114185 + }, + { + "epoch": 0.11501338079309778, + "grad_norm": 9.983318819801264, + "learning_rate": 4.948200545669509e-05, + "loss": 2.1922, + "mean_token_accuracy": 0.42758620977401735, + "step": 114190 + }, + { + "epoch": 0.11501841684620195, + "grad_norm": 17.922162131705186, + "learning_rate": 4.948192551224154e-05, + "loss": 2.33, + "mean_token_accuracy": 0.5127041757106781, + "step": 114195 + }, + { + "epoch": 0.11502345289930613, + "grad_norm": 11.167165349571508, + "learning_rate": 4.948184556169122e-05, + "loss": 2.4666, + "mean_token_accuracy": 0.4241379380226135, + "step": 114200 + }, + { + "epoch": 0.1150284889524103, + "grad_norm": 9.807124593451773, + "learning_rate": 4.948176560504415e-05, + "loss": 2.3143, + "mean_token_accuracy": 0.4103448212146759, + "step": 114205 + }, + { + "epoch": 0.11503352500551448, + "grad_norm": 9.639815815686905, + "learning_rate": 4.948168564230036e-05, + "loss": 2.6257, + "mean_token_accuracy": 0.36896551251411436, + "step": 114210 + }, + { + "epoch": 0.11503856105861865, + "grad_norm": 12.720133604106408, + "learning_rate": 4.9481605673459855e-05, + "loss": 2.1358, + "mean_token_accuracy": 0.4344827592372894, + "step": 114215 + }, + { + "epoch": 0.11504359711172282, + "grad_norm": 10.955302307421924, + "learning_rate": 4.9481525698522687e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.41724138259887694, + "step": 114220 + }, + { + "epoch": 0.115048633164827, + "grad_norm": 13.360167559560479, + "learning_rate": 4.9481445717488846e-05, + "loss": 2.6689, + "mean_token_accuracy": 0.37586206793785093, + "step": 114225 + }, + { + "epoch": 0.11505366921793117, + "grad_norm": 12.308827627719717, + "learning_rate": 4.9481365730358375e-05, + "loss": 2.5291, + "mean_token_accuracy": 0.41379310488700866, + "step": 114230 + }, + { + "epoch": 0.11505870527103534, + "grad_norm": 10.971639254032365, + "learning_rate": 4.9481285737131285e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.4620689690113068, + "step": 114235 + }, + { + "epoch": 0.11506374132413952, + "grad_norm": 12.218376112623474, + "learning_rate": 4.948120573780761e-05, + "loss": 2.3167, + "mean_token_accuracy": 0.453901994228363, + "step": 114240 + }, + { + "epoch": 0.11506877737724369, + "grad_norm": 9.461375627072245, + "learning_rate": 4.948112573238737e-05, + "loss": 2.7059, + "mean_token_accuracy": 0.3793103516101837, + "step": 114245 + }, + { + "epoch": 0.11507381343034787, + "grad_norm": 10.086943376776528, + "learning_rate": 4.9481045720870574e-05, + "loss": 2.4821, + "mean_token_accuracy": 0.41724138557910917, + "step": 114250 + }, + { + "epoch": 0.11507884948345204, + "grad_norm": 11.246074287412615, + "learning_rate": 4.948096570325726e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.42068964838981626, + "step": 114255 + }, + { + "epoch": 0.1150838855365562, + "grad_norm": 14.01658130832733, + "learning_rate": 4.948088567954744e-05, + "loss": 2.5915, + "mean_token_accuracy": 0.3999999940395355, + "step": 114260 + }, + { + "epoch": 0.11508892158966037, + "grad_norm": 10.665053961858103, + "learning_rate": 4.9480805649741135e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.4655172348022461, + "step": 114265 + }, + { + "epoch": 0.11509395764276455, + "grad_norm": 11.915422266993902, + "learning_rate": 4.948072561383838e-05, + "loss": 2.1998, + "mean_token_accuracy": 0.4620689690113068, + "step": 114270 + }, + { + "epoch": 0.11509899369586872, + "grad_norm": 11.951763763137516, + "learning_rate": 4.948064557183918e-05, + "loss": 2.592, + "mean_token_accuracy": 0.36551723480224607, + "step": 114275 + }, + { + "epoch": 0.1151040297489729, + "grad_norm": 11.06375173601289, + "learning_rate": 4.9480565523743564e-05, + "loss": 2.2002, + "mean_token_accuracy": 0.458620685338974, + "step": 114280 + }, + { + "epoch": 0.11510906580207707, + "grad_norm": 10.857945632590365, + "learning_rate": 4.948048546955156e-05, + "loss": 2.5112, + "mean_token_accuracy": 0.4482758641242981, + "step": 114285 + }, + { + "epoch": 0.11511410185518124, + "grad_norm": 8.79550657736542, + "learning_rate": 4.9480405409263195e-05, + "loss": 2.5516, + "mean_token_accuracy": 0.42758620381355283, + "step": 114290 + }, + { + "epoch": 0.11511913790828542, + "grad_norm": 7.3495390667440414, + "learning_rate": 4.948032534287847e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.41584996581077577, + "step": 114295 + }, + { + "epoch": 0.11512417396138959, + "grad_norm": 11.67739099793903, + "learning_rate": 4.9480245270397424e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.39310344457626345, + "step": 114300 + }, + { + "epoch": 0.11512921001449376, + "grad_norm": 9.329102045504136, + "learning_rate": 4.948016519182008e-05, + "loss": 2.3697, + "mean_token_accuracy": 0.4517241418361664, + "step": 114305 + }, + { + "epoch": 0.11513424606759794, + "grad_norm": 10.41458916255205, + "learning_rate": 4.948008510714645e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.41034482717514037, + "step": 114310 + }, + { + "epoch": 0.11513928212070211, + "grad_norm": 8.793076330776032, + "learning_rate": 4.948000501637657e-05, + "loss": 1.9952, + "mean_token_accuracy": 0.4862068951129913, + "step": 114315 + }, + { + "epoch": 0.11514431817380628, + "grad_norm": 15.360007649938716, + "learning_rate": 4.947992491951044e-05, + "loss": 2.6239, + "mean_token_accuracy": 0.41034482717514037, + "step": 114320 + }, + { + "epoch": 0.11514935422691046, + "grad_norm": 11.677199411865576, + "learning_rate": 4.94798448165481e-05, + "loss": 2.0911, + "mean_token_accuracy": 0.4903940916061401, + "step": 114325 + }, + { + "epoch": 0.11515439028001462, + "grad_norm": 12.196811723782464, + "learning_rate": 4.947976470748958e-05, + "loss": 2.2896, + "mean_token_accuracy": 0.3896551728248596, + "step": 114330 + }, + { + "epoch": 0.11515942633311879, + "grad_norm": 11.117467543743958, + "learning_rate": 4.947968459233488e-05, + "loss": 2.0601, + "mean_token_accuracy": 0.4551724135875702, + "step": 114335 + }, + { + "epoch": 0.11516446238622297, + "grad_norm": 10.42918928673411, + "learning_rate": 4.947960447108403e-05, + "loss": 2.3085, + "mean_token_accuracy": 0.4034482717514038, + "step": 114340 + }, + { + "epoch": 0.11516949843932714, + "grad_norm": 11.630209305619932, + "learning_rate": 4.9479524343737054e-05, + "loss": 3.0507, + "mean_token_accuracy": 0.34482758343219755, + "step": 114345 + }, + { + "epoch": 0.11517453449243131, + "grad_norm": 15.21471754844737, + "learning_rate": 4.947944421029398e-05, + "loss": 2.7611, + "mean_token_accuracy": 0.43103447556495667, + "step": 114350 + }, + { + "epoch": 0.11517957054553549, + "grad_norm": 10.59730367485689, + "learning_rate": 4.9479364070754825e-05, + "loss": 2.1756, + "mean_token_accuracy": 0.43448275327682495, + "step": 114355 + }, + { + "epoch": 0.11518460659863966, + "grad_norm": 10.275831841831076, + "learning_rate": 4.947928392511961e-05, + "loss": 2.4117, + "mean_token_accuracy": 0.41379310488700866, + "step": 114360 + }, + { + "epoch": 0.11518964265174383, + "grad_norm": 11.78705069720452, + "learning_rate": 4.947920377338836e-05, + "loss": 2.0692, + "mean_token_accuracy": 0.5, + "step": 114365 + }, + { + "epoch": 0.11519467870484801, + "grad_norm": 11.146134650809783, + "learning_rate": 4.94791236155611e-05, + "loss": 2.4884, + "mean_token_accuracy": 0.4206896543502808, + "step": 114370 + }, + { + "epoch": 0.11519971475795218, + "grad_norm": 12.941719514604799, + "learning_rate": 4.9479043451637835e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.4379310369491577, + "step": 114375 + }, + { + "epoch": 0.11520475081105636, + "grad_norm": 13.595352655746929, + "learning_rate": 4.947896328161861e-05, + "loss": 2.0218, + "mean_token_accuracy": 0.5024198353290558, + "step": 114380 + }, + { + "epoch": 0.11520978686416053, + "grad_norm": 11.1919901749803, + "learning_rate": 4.9478883105503434e-05, + "loss": 2.5195, + "mean_token_accuracy": 0.4172413766384125, + "step": 114385 + }, + { + "epoch": 0.1152148229172647, + "grad_norm": 11.49490483273503, + "learning_rate": 4.947880292329234e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.4172413766384125, + "step": 114390 + }, + { + "epoch": 0.11521985897036888, + "grad_norm": 11.641513033964513, + "learning_rate": 4.947872273498533e-05, + "loss": 2.1974, + "mean_token_accuracy": 0.4517241358757019, + "step": 114395 + }, + { + "epoch": 0.11522489502347304, + "grad_norm": 15.442837953448356, + "learning_rate": 4.947864254058245e-05, + "loss": 2.5486, + "mean_token_accuracy": 0.4482758641242981, + "step": 114400 + }, + { + "epoch": 0.11522993107657721, + "grad_norm": 11.156768581566247, + "learning_rate": 4.947856234008371e-05, + "loss": 2.2582, + "mean_token_accuracy": 0.460496062040329, + "step": 114405 + }, + { + "epoch": 0.11523496712968138, + "grad_norm": 8.461948292711309, + "learning_rate": 4.9478482133489135e-05, + "loss": 2.1503, + "mean_token_accuracy": 0.4448275864124298, + "step": 114410 + }, + { + "epoch": 0.11524000318278556, + "grad_norm": 11.672629027264625, + "learning_rate": 4.947840192079874e-05, + "loss": 2.1338, + "mean_token_accuracy": 0.5019358813762664, + "step": 114415 + }, + { + "epoch": 0.11524503923588973, + "grad_norm": 8.908164182296904, + "learning_rate": 4.947832170201255e-05, + "loss": 2.2706, + "mean_token_accuracy": 0.47586206197738645, + "step": 114420 + }, + { + "epoch": 0.1152500752889939, + "grad_norm": 10.042469449380288, + "learning_rate": 4.94782414771306e-05, + "loss": 2.0327, + "mean_token_accuracy": 0.4896551609039307, + "step": 114425 + }, + { + "epoch": 0.11525511134209808, + "grad_norm": 11.020448341863673, + "learning_rate": 4.94781612461529e-05, + "loss": 2.8366, + "mean_token_accuracy": 0.32413792312145234, + "step": 114430 + }, + { + "epoch": 0.11526014739520225, + "grad_norm": 10.912518933696612, + "learning_rate": 4.947808100907947e-05, + "loss": 2.5729, + "mean_token_accuracy": 0.3965517282485962, + "step": 114435 + }, + { + "epoch": 0.11526518344830643, + "grad_norm": 10.143547787815832, + "learning_rate": 4.947800076591034e-05, + "loss": 2.2699, + "mean_token_accuracy": 0.4379310369491577, + "step": 114440 + }, + { + "epoch": 0.1152702195014106, + "grad_norm": 9.002677320809923, + "learning_rate": 4.947792051664554e-05, + "loss": 1.9524, + "mean_token_accuracy": 0.4862068951129913, + "step": 114445 + }, + { + "epoch": 0.11527525555451477, + "grad_norm": 11.54999764020228, + "learning_rate": 4.947784026128507e-05, + "loss": 2.4735, + "mean_token_accuracy": 0.46896551847457885, + "step": 114450 + }, + { + "epoch": 0.11528029160761895, + "grad_norm": 12.515416508491509, + "learning_rate": 4.947775999982897e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.39655172526836396, + "step": 114455 + }, + { + "epoch": 0.11528532766072312, + "grad_norm": 10.968169503793305, + "learning_rate": 4.9477679732277254e-05, + "loss": 3.013, + "mean_token_accuracy": 0.37241379022598264, + "step": 114460 + }, + { + "epoch": 0.1152903637138273, + "grad_norm": 10.349380549278491, + "learning_rate": 4.947759945862994e-05, + "loss": 2.1009, + "mean_token_accuracy": 0.49534180760383606, + "step": 114465 + }, + { + "epoch": 0.11529539976693146, + "grad_norm": 10.95108320875787, + "learning_rate": 4.9477519178887064e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4517241418361664, + "step": 114470 + }, + { + "epoch": 0.11530043582003563, + "grad_norm": 9.61673932105015, + "learning_rate": 4.947743889304864e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.3965517282485962, + "step": 114475 + }, + { + "epoch": 0.1153054718731398, + "grad_norm": 14.447229623405963, + "learning_rate": 4.94773586011147e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.4310344815254211, + "step": 114480 + }, + { + "epoch": 0.11531050792624398, + "grad_norm": 10.146372300552864, + "learning_rate": 4.947727830308524e-05, + "loss": 2.4017, + "mean_token_accuracy": 0.4413793087005615, + "step": 114485 + }, + { + "epoch": 0.11531554397934815, + "grad_norm": 8.930959553256985, + "learning_rate": 4.9477197998960314e-05, + "loss": 2.3884, + "mean_token_accuracy": 0.3950998157262802, + "step": 114490 + }, + { + "epoch": 0.11532058003245232, + "grad_norm": 16.218202745912052, + "learning_rate": 4.947711768873992e-05, + "loss": 2.6574, + "mean_token_accuracy": 0.4517241299152374, + "step": 114495 + }, + { + "epoch": 0.1153256160855565, + "grad_norm": 9.89118439570471, + "learning_rate": 4.94770373724241e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.36206896901130675, + "step": 114500 + }, + { + "epoch": 0.11533065213866067, + "grad_norm": 9.691230429043946, + "learning_rate": 4.9476957050012854e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.44343618154525755, + "step": 114505 + }, + { + "epoch": 0.11533568819176485, + "grad_norm": 11.510495907441681, + "learning_rate": 4.947687672150623e-05, + "loss": 2.1139, + "mean_token_accuracy": 0.4379310250282288, + "step": 114510 + }, + { + "epoch": 0.11534072424486902, + "grad_norm": 8.964848206860545, + "learning_rate": 4.947679638690423e-05, + "loss": 2.053, + "mean_token_accuracy": 0.4601935803890228, + "step": 114515 + }, + { + "epoch": 0.1153457602979732, + "grad_norm": 10.682343061870995, + "learning_rate": 4.9476716046206885e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.4379310369491577, + "step": 114520 + }, + { + "epoch": 0.11535079635107737, + "grad_norm": 9.618198043858225, + "learning_rate": 4.9476635699414216e-05, + "loss": 2.3808, + "mean_token_accuracy": 0.44137930274009707, + "step": 114525 + }, + { + "epoch": 0.11535583240418154, + "grad_norm": 8.806456485991411, + "learning_rate": 4.947655534652625e-05, + "loss": 1.991, + "mean_token_accuracy": 0.5156684815883636, + "step": 114530 + }, + { + "epoch": 0.11536086845728571, + "grad_norm": 9.785695649715745, + "learning_rate": 4.947647498754299e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.4517241358757019, + "step": 114535 + }, + { + "epoch": 0.11536590451038987, + "grad_norm": 10.563434986961429, + "learning_rate": 4.947639462246449e-05, + "loss": 2.7464, + "mean_token_accuracy": 0.38620689511299133, + "step": 114540 + }, + { + "epoch": 0.11537094056349405, + "grad_norm": 10.008861512931677, + "learning_rate": 4.9476314251290746e-05, + "loss": 2.6529, + "mean_token_accuracy": 0.38753780722618103, + "step": 114545 + }, + { + "epoch": 0.11537597661659822, + "grad_norm": 11.674184383647587, + "learning_rate": 4.9476233874021785e-05, + "loss": 2.7439, + "mean_token_accuracy": 0.3620689630508423, + "step": 114550 + }, + { + "epoch": 0.1153810126697024, + "grad_norm": 11.682666554977915, + "learning_rate": 4.947615349065764e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.3862068891525269, + "step": 114555 + }, + { + "epoch": 0.11538604872280657, + "grad_norm": 14.130804620692176, + "learning_rate": 4.947607310119833e-05, + "loss": 2.4434, + "mean_token_accuracy": 0.40344826579093934, + "step": 114560 + }, + { + "epoch": 0.11539108477591074, + "grad_norm": 11.545471822807887, + "learning_rate": 4.9475992705643867e-05, + "loss": 2.308, + "mean_token_accuracy": 0.493103438615799, + "step": 114565 + }, + { + "epoch": 0.11539612082901492, + "grad_norm": 9.137790354232546, + "learning_rate": 4.947591230399429e-05, + "loss": 2.154, + "mean_token_accuracy": 0.4517241418361664, + "step": 114570 + }, + { + "epoch": 0.11540115688211909, + "grad_norm": 10.369850857638635, + "learning_rate": 4.947583189624961e-05, + "loss": 2.3209, + "mean_token_accuracy": 0.4344827592372894, + "step": 114575 + }, + { + "epoch": 0.11540619293522326, + "grad_norm": 11.400631360061109, + "learning_rate": 4.947575148240984e-05, + "loss": 2.4818, + "mean_token_accuracy": 0.4206896543502808, + "step": 114580 + }, + { + "epoch": 0.11541122898832744, + "grad_norm": 10.777133922965536, + "learning_rate": 4.947567106247502e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.4379310429096222, + "step": 114585 + }, + { + "epoch": 0.11541626504143161, + "grad_norm": 9.033371463432728, + "learning_rate": 4.9475590636445165e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.43793103098869324, + "step": 114590 + }, + { + "epoch": 0.11542130109453579, + "grad_norm": 10.609364242310876, + "learning_rate": 4.94755102043203e-05, + "loss": 2.4796, + "mean_token_accuracy": 0.3931034505367279, + "step": 114595 + }, + { + "epoch": 0.11542633714763996, + "grad_norm": 10.80417511154219, + "learning_rate": 4.9475429766100437e-05, + "loss": 2.8225, + "mean_token_accuracy": 0.3758620619773865, + "step": 114600 + }, + { + "epoch": 0.11543137320074413, + "grad_norm": 10.556849137695398, + "learning_rate": 4.947534932178562e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.4517241418361664, + "step": 114605 + }, + { + "epoch": 0.1154364092538483, + "grad_norm": 8.778695121711372, + "learning_rate": 4.947526887137585e-05, + "loss": 2.3347, + "mean_token_accuracy": 0.4517241299152374, + "step": 114610 + }, + { + "epoch": 0.11544144530695247, + "grad_norm": 14.78203657973234, + "learning_rate": 4.947518841487116e-05, + "loss": 2.2739, + "mean_token_accuracy": 0.47931034564971925, + "step": 114615 + }, + { + "epoch": 0.11544648136005664, + "grad_norm": 11.10012417945893, + "learning_rate": 4.947510795227156e-05, + "loss": 2.1409, + "mean_token_accuracy": 0.47241379618644713, + "step": 114620 + }, + { + "epoch": 0.11545151741316081, + "grad_norm": 8.722344282943816, + "learning_rate": 4.94750274835771e-05, + "loss": 2.0505, + "mean_token_accuracy": 0.4732758581638336, + "step": 114625 + }, + { + "epoch": 0.11545655346626499, + "grad_norm": 8.160706046546673, + "learning_rate": 4.947494700878777e-05, + "loss": 2.1153, + "mean_token_accuracy": 0.482758617401123, + "step": 114630 + }, + { + "epoch": 0.11546158951936916, + "grad_norm": 11.082409385852314, + "learning_rate": 4.9474866527903604e-05, + "loss": 2.5476, + "mean_token_accuracy": 0.4482758641242981, + "step": 114635 + }, + { + "epoch": 0.11546662557247334, + "grad_norm": 9.689819310465234, + "learning_rate": 4.9474786040924645e-05, + "loss": 2.1592, + "mean_token_accuracy": 0.42758620977401735, + "step": 114640 + }, + { + "epoch": 0.11547166162557751, + "grad_norm": 10.02147786714863, + "learning_rate": 4.947470554785088e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.4413793087005615, + "step": 114645 + }, + { + "epoch": 0.11547669767868168, + "grad_norm": 10.179254784402412, + "learning_rate": 4.9474625048682355e-05, + "loss": 2.367, + "mean_token_accuracy": 0.441379314661026, + "step": 114650 + }, + { + "epoch": 0.11548173373178586, + "grad_norm": 10.90891773673151, + "learning_rate": 4.9474544543419086e-05, + "loss": 2.9072, + "mean_token_accuracy": 0.3837870478630066, + "step": 114655 + }, + { + "epoch": 0.11548676978489003, + "grad_norm": 12.081832697686217, + "learning_rate": 4.947446403206109e-05, + "loss": 2.9445, + "mean_token_accuracy": 0.36896551847457887, + "step": 114660 + }, + { + "epoch": 0.1154918058379942, + "grad_norm": 9.856248224185135, + "learning_rate": 4.94743835146084e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.4862068831920624, + "step": 114665 + }, + { + "epoch": 0.11549684189109838, + "grad_norm": 10.526721312771814, + "learning_rate": 4.947430299106103e-05, + "loss": 2.8332, + "mean_token_accuracy": 0.3793103456497192, + "step": 114670 + }, + { + "epoch": 0.11550187794420255, + "grad_norm": 10.40351915739031, + "learning_rate": 4.947422246141901e-05, + "loss": 2.6716, + "mean_token_accuracy": 0.4034482717514038, + "step": 114675 + }, + { + "epoch": 0.11550691399730671, + "grad_norm": 9.859906992833814, + "learning_rate": 4.947414192568235e-05, + "loss": 1.9472, + "mean_token_accuracy": 0.482758617401123, + "step": 114680 + }, + { + "epoch": 0.11551195005041089, + "grad_norm": 10.731568345069613, + "learning_rate": 4.947406138385108e-05, + "loss": 2.2379, + "mean_token_accuracy": 0.42256503105163573, + "step": 114685 + }, + { + "epoch": 0.11551698610351506, + "grad_norm": 12.068249906238519, + "learning_rate": 4.947398083592523e-05, + "loss": 2.3637, + "mean_token_accuracy": 0.4448275864124298, + "step": 114690 + }, + { + "epoch": 0.11552202215661923, + "grad_norm": 10.369265759046108, + "learning_rate": 4.9473900281904804e-05, + "loss": 2.26, + "mean_token_accuracy": 0.4724137902259827, + "step": 114695 + }, + { + "epoch": 0.11552705820972341, + "grad_norm": 12.092107678789406, + "learning_rate": 4.947381972178984e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4137930989265442, + "step": 114700 + }, + { + "epoch": 0.11553209426282758, + "grad_norm": 9.981810315288632, + "learning_rate": 4.947373915558036e-05, + "loss": 2.9392, + "mean_token_accuracy": 0.4034482717514038, + "step": 114705 + }, + { + "epoch": 0.11553713031593175, + "grad_norm": 8.897771367701681, + "learning_rate": 4.947365858327638e-05, + "loss": 2.156, + "mean_token_accuracy": 0.4758620738983154, + "step": 114710 + }, + { + "epoch": 0.11554216636903593, + "grad_norm": 14.925776088147948, + "learning_rate": 4.947357800487792e-05, + "loss": 2.5908, + "mean_token_accuracy": 0.4379310250282288, + "step": 114715 + }, + { + "epoch": 0.1155472024221401, + "grad_norm": 10.433349035700555, + "learning_rate": 4.9473497420385e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.420689657330513, + "step": 114720 + }, + { + "epoch": 0.11555223847524428, + "grad_norm": 12.290843267835486, + "learning_rate": 4.947341682979765e-05, + "loss": 2.4987, + "mean_token_accuracy": 0.41379310488700866, + "step": 114725 + }, + { + "epoch": 0.11555727452834845, + "grad_norm": 9.367805269568239, + "learning_rate": 4.9473336233115906e-05, + "loss": 2.675, + "mean_token_accuracy": 0.417241370677948, + "step": 114730 + }, + { + "epoch": 0.11556231058145262, + "grad_norm": 9.307414056999395, + "learning_rate": 4.9473255630339764e-05, + "loss": 2.8124, + "mean_token_accuracy": 0.4103448331356049, + "step": 114735 + }, + { + "epoch": 0.1155673466345568, + "grad_norm": 9.033892750017497, + "learning_rate": 4.947317502146926e-05, + "loss": 2.0454, + "mean_token_accuracy": 0.48965516686439514, + "step": 114740 + }, + { + "epoch": 0.11557238268766097, + "grad_norm": 11.683163522492425, + "learning_rate": 4.947309440650441e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.41724138259887694, + "step": 114745 + }, + { + "epoch": 0.11557741874076513, + "grad_norm": 8.234765699547703, + "learning_rate": 4.947301378544525e-05, + "loss": 2.007, + "mean_token_accuracy": 0.5436781585216522, + "step": 114750 + }, + { + "epoch": 0.1155824547938693, + "grad_norm": 11.987082736154791, + "learning_rate": 4.947293315829178e-05, + "loss": 2.2801, + "mean_token_accuracy": 0.4598911166191101, + "step": 114755 + }, + { + "epoch": 0.11558749084697348, + "grad_norm": 9.134477323977766, + "learning_rate": 4.947285252504404e-05, + "loss": 2.5214, + "mean_token_accuracy": 0.3931034505367279, + "step": 114760 + }, + { + "epoch": 0.11559252690007765, + "grad_norm": 10.96523923994237, + "learning_rate": 4.9472771885702054e-05, + "loss": 2.2035, + "mean_token_accuracy": 0.38275861740112305, + "step": 114765 + }, + { + "epoch": 0.11559756295318183, + "grad_norm": 10.794217381560557, + "learning_rate": 4.947269124026583e-05, + "loss": 2.427, + "mean_token_accuracy": 0.43448275327682495, + "step": 114770 + }, + { + "epoch": 0.115602599006286, + "grad_norm": 11.50113186908765, + "learning_rate": 4.94726105887354e-05, + "loss": 2.6883, + "mean_token_accuracy": 0.4083484590053558, + "step": 114775 + }, + { + "epoch": 0.11560763505939017, + "grad_norm": 10.102937652965693, + "learning_rate": 4.947252993111078e-05, + "loss": 2.6221, + "mean_token_accuracy": 0.39655172228813174, + "step": 114780 + }, + { + "epoch": 0.11561267111249435, + "grad_norm": 13.240633773539358, + "learning_rate": 4.947244926739201e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.4620689690113068, + "step": 114785 + }, + { + "epoch": 0.11561770716559852, + "grad_norm": 12.086622987868527, + "learning_rate": 4.947236859757909e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.42068966031074523, + "step": 114790 + }, + { + "epoch": 0.1156227432187027, + "grad_norm": 34.74912174700879, + "learning_rate": 4.947228792167205e-05, + "loss": 2.958, + "mean_token_accuracy": 0.42413792908191683, + "step": 114795 + }, + { + "epoch": 0.11562777927180687, + "grad_norm": 8.655295521346384, + "learning_rate": 4.9472207239670916e-05, + "loss": 2.3412, + "mean_token_accuracy": 0.4361161470413208, + "step": 114800 + }, + { + "epoch": 0.11563281532491104, + "grad_norm": 10.327715624395896, + "learning_rate": 4.947212655157571e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.42758620977401735, + "step": 114805 + }, + { + "epoch": 0.11563785137801522, + "grad_norm": 10.001276555685791, + "learning_rate": 4.947204585738645e-05, + "loss": 1.8739, + "mean_token_accuracy": 0.5159709513187408, + "step": 114810 + }, + { + "epoch": 0.11564288743111939, + "grad_norm": 10.655800717031573, + "learning_rate": 4.947196515710316e-05, + "loss": 2.656, + "mean_token_accuracy": 0.3931034505367279, + "step": 114815 + }, + { + "epoch": 0.11564792348422355, + "grad_norm": 9.787162743551132, + "learning_rate": 4.947188445072587e-05, + "loss": 2.7723, + "mean_token_accuracy": 0.4206896424293518, + "step": 114820 + }, + { + "epoch": 0.11565295953732772, + "grad_norm": 15.857705156140165, + "learning_rate": 4.9471803738254595e-05, + "loss": 2.3892, + "mean_token_accuracy": 0.42413793206214906, + "step": 114825 + }, + { + "epoch": 0.1156579955904319, + "grad_norm": 9.104147080791181, + "learning_rate": 4.9471723019689354e-05, + "loss": 2.5101, + "mean_token_accuracy": 0.3793103456497192, + "step": 114830 + }, + { + "epoch": 0.11566303164353607, + "grad_norm": 10.824231104091838, + "learning_rate": 4.947164229503017e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.4413793087005615, + "step": 114835 + }, + { + "epoch": 0.11566806769664024, + "grad_norm": 10.108348420436315, + "learning_rate": 4.9471561564277075e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.4068965494632721, + "step": 114840 + }, + { + "epoch": 0.11567310374974442, + "grad_norm": 10.209690995148106, + "learning_rate": 4.9471480827430085e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.42413792610168455, + "step": 114845 + }, + { + "epoch": 0.11567813980284859, + "grad_norm": 12.259702109827607, + "learning_rate": 4.947140008448922e-05, + "loss": 2.4142, + "mean_token_accuracy": 0.3965517282485962, + "step": 114850 + }, + { + "epoch": 0.11568317585595277, + "grad_norm": 9.608136061964927, + "learning_rate": 4.947131933545451e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.4172413766384125, + "step": 114855 + }, + { + "epoch": 0.11568821190905694, + "grad_norm": 10.668309501766641, + "learning_rate": 4.947123858032597e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.41724137365818026, + "step": 114860 + }, + { + "epoch": 0.11569324796216111, + "grad_norm": 10.617106029159755, + "learning_rate": 4.947115781910363e-05, + "loss": 2.2898, + "mean_token_accuracy": 0.4586206912994385, + "step": 114865 + }, + { + "epoch": 0.11569828401526529, + "grad_norm": 9.878663582166649, + "learning_rate": 4.94710770517875e-05, + "loss": 2.6638, + "mean_token_accuracy": 0.4344827592372894, + "step": 114870 + }, + { + "epoch": 0.11570332006836946, + "grad_norm": 10.096062908600633, + "learning_rate": 4.947099627837761e-05, + "loss": 2.4537, + "mean_token_accuracy": 0.4, + "step": 114875 + }, + { + "epoch": 0.11570835612147364, + "grad_norm": 10.109360426562704, + "learning_rate": 4.947091549887399e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.41034482717514037, + "step": 114880 + }, + { + "epoch": 0.11571339217457781, + "grad_norm": 11.870642270027597, + "learning_rate": 4.947083471327665e-05, + "loss": 3.1018, + "mean_token_accuracy": 0.3517241358757019, + "step": 114885 + }, + { + "epoch": 0.11571842822768197, + "grad_norm": 13.896298910578459, + "learning_rate": 4.947075392158561e-05, + "loss": 2.8808, + "mean_token_accuracy": 0.39655172526836396, + "step": 114890 + }, + { + "epoch": 0.11572346428078614, + "grad_norm": 15.534732701661298, + "learning_rate": 4.9470673123800906e-05, + "loss": 3.1769, + "mean_token_accuracy": 0.3517241358757019, + "step": 114895 + }, + { + "epoch": 0.11572850033389032, + "grad_norm": 12.422139315432718, + "learning_rate": 4.947059231992256e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.4275861978530884, + "step": 114900 + }, + { + "epoch": 0.11573353638699449, + "grad_norm": 11.289629133522578, + "learning_rate": 4.947051150995057e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.458620685338974, + "step": 114905 + }, + { + "epoch": 0.11573857244009866, + "grad_norm": 12.187650052563535, + "learning_rate": 4.9470430693884995e-05, + "loss": 2.3475, + "mean_token_accuracy": 0.42413793206214906, + "step": 114910 + }, + { + "epoch": 0.11574360849320284, + "grad_norm": 12.530995904598273, + "learning_rate": 4.947034987172583e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.39310343861579894, + "step": 114915 + }, + { + "epoch": 0.11574864454630701, + "grad_norm": 8.385574642579826, + "learning_rate": 4.947026904347311e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.4344827622175217, + "step": 114920 + }, + { + "epoch": 0.11575368059941119, + "grad_norm": 10.98471409710384, + "learning_rate": 4.947018820912685e-05, + "loss": 2.5151, + "mean_token_accuracy": 0.37586206793785093, + "step": 114925 + }, + { + "epoch": 0.11575871665251536, + "grad_norm": 12.494020152037292, + "learning_rate": 4.947010736868709e-05, + "loss": 2.6162, + "mean_token_accuracy": 0.37241379022598264, + "step": 114930 + }, + { + "epoch": 0.11576375270561953, + "grad_norm": 9.994918790582947, + "learning_rate": 4.947002652215382e-05, + "loss": 2.1625, + "mean_token_accuracy": 0.44137930274009707, + "step": 114935 + }, + { + "epoch": 0.1157687887587237, + "grad_norm": 9.349052996898859, + "learning_rate": 4.9469945669527085e-05, + "loss": 2.1796, + "mean_token_accuracy": 0.4689655125141144, + "step": 114940 + }, + { + "epoch": 0.11577382481182788, + "grad_norm": 11.128489157331199, + "learning_rate": 4.946986481080691e-05, + "loss": 2.501, + "mean_token_accuracy": 0.42921960949897764, + "step": 114945 + }, + { + "epoch": 0.11577886086493205, + "grad_norm": 11.22896045822213, + "learning_rate": 4.9469783945993316e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.43103447556495667, + "step": 114950 + }, + { + "epoch": 0.11578389691803623, + "grad_norm": 34.76976321431227, + "learning_rate": 4.946970307508631e-05, + "loss": 3.1037, + "mean_token_accuracy": 0.37241379022598264, + "step": 114955 + }, + { + "epoch": 0.11578893297114039, + "grad_norm": 9.948230086104374, + "learning_rate": 4.9469622198085924e-05, + "loss": 2.5616, + "mean_token_accuracy": 0.3999999940395355, + "step": 114960 + }, + { + "epoch": 0.11579396902424456, + "grad_norm": 10.366545295723576, + "learning_rate": 4.946954131499219e-05, + "loss": 2.0939, + "mean_token_accuracy": 0.45246305465698244, + "step": 114965 + }, + { + "epoch": 0.11579900507734874, + "grad_norm": 9.506954420751157, + "learning_rate": 4.946946042580511e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.4620689630508423, + "step": 114970 + }, + { + "epoch": 0.11580404113045291, + "grad_norm": 10.374595171185781, + "learning_rate": 4.946937953052473e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.4655172348022461, + "step": 114975 + }, + { + "epoch": 0.11580907718355708, + "grad_norm": 11.622030695526126, + "learning_rate": 4.9469298629151054e-05, + "loss": 2.1895, + "mean_token_accuracy": 0.4016333997249603, + "step": 114980 + }, + { + "epoch": 0.11581411323666126, + "grad_norm": 9.645352718793392, + "learning_rate": 4.9469217721684116e-05, + "loss": 2.1136, + "mean_token_accuracy": 0.47931033968925474, + "step": 114985 + }, + { + "epoch": 0.11581914928976543, + "grad_norm": 13.168528483976742, + "learning_rate": 4.946913680812393e-05, + "loss": 2.5168, + "mean_token_accuracy": 0.4068965494632721, + "step": 114990 + }, + { + "epoch": 0.1158241853428696, + "grad_norm": 11.922130695032525, + "learning_rate": 4.9469055888470525e-05, + "loss": 2.644, + "mean_token_accuracy": 0.3620689630508423, + "step": 114995 + }, + { + "epoch": 0.11582922139597378, + "grad_norm": 12.839271749319044, + "learning_rate": 4.946897496272392e-05, + "loss": 2.5436, + "mean_token_accuracy": 0.39655172228813174, + "step": 115000 + }, + { + "epoch": 0.11583425744907795, + "grad_norm": 10.1785852050693, + "learning_rate": 4.9468894030884136e-05, + "loss": 2.6784, + "mean_token_accuracy": 0.38275861740112305, + "step": 115005 + }, + { + "epoch": 0.11583929350218213, + "grad_norm": 10.980216295560634, + "learning_rate": 4.946881309295119e-05, + "loss": 2.757, + "mean_token_accuracy": 0.3896551728248596, + "step": 115010 + }, + { + "epoch": 0.1158443295552863, + "grad_norm": 10.226675146828429, + "learning_rate": 4.9468732148925127e-05, + "loss": 2.1552, + "mean_token_accuracy": 0.4620689630508423, + "step": 115015 + }, + { + "epoch": 0.11584936560839047, + "grad_norm": 14.026351250308208, + "learning_rate": 4.946865119880594e-05, + "loss": 2.5151, + "mean_token_accuracy": 0.38620689511299133, + "step": 115020 + }, + { + "epoch": 0.11585440166149465, + "grad_norm": 12.45389663486251, + "learning_rate": 4.946857024259368e-05, + "loss": 2.733, + "mean_token_accuracy": 0.4206896543502808, + "step": 115025 + }, + { + "epoch": 0.1158594377145988, + "grad_norm": 9.442534667269339, + "learning_rate": 4.946848928028834e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.3827586233615875, + "step": 115030 + }, + { + "epoch": 0.11586447376770298, + "grad_norm": 9.083497313303834, + "learning_rate": 4.946840831188996e-05, + "loss": 2.5459, + "mean_token_accuracy": 0.4034482777118683, + "step": 115035 + }, + { + "epoch": 0.11586950982080715, + "grad_norm": 9.124452232725604, + "learning_rate": 4.9468327337398575e-05, + "loss": 2.3336, + "mean_token_accuracy": 0.4620689630508423, + "step": 115040 + }, + { + "epoch": 0.11587454587391133, + "grad_norm": 12.295081768215281, + "learning_rate": 4.9468246356814175e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.4379310369491577, + "step": 115045 + }, + { + "epoch": 0.1158795819270155, + "grad_norm": 8.93200602151638, + "learning_rate": 4.946816537013681e-05, + "loss": 2.1948, + "mean_token_accuracy": 0.41379310488700866, + "step": 115050 + }, + { + "epoch": 0.11588461798011968, + "grad_norm": 10.026883581061169, + "learning_rate": 4.9468084377366484e-05, + "loss": 2.2993, + "mean_token_accuracy": 0.4551724135875702, + "step": 115055 + }, + { + "epoch": 0.11588965403322385, + "grad_norm": 12.689090336341405, + "learning_rate": 4.9468003378503235e-05, + "loss": 2.9292, + "mean_token_accuracy": 0.36896551251411436, + "step": 115060 + }, + { + "epoch": 0.11589469008632802, + "grad_norm": 9.541070340947421, + "learning_rate": 4.9467922373547074e-05, + "loss": 2.0635, + "mean_token_accuracy": 0.44482759237289426, + "step": 115065 + }, + { + "epoch": 0.1158997261394322, + "grad_norm": 9.35622604691093, + "learning_rate": 4.9467841362498036e-05, + "loss": 2.4309, + "mean_token_accuracy": 0.4379310369491577, + "step": 115070 + }, + { + "epoch": 0.11590476219253637, + "grad_norm": 10.691137842513843, + "learning_rate": 4.946776034535612e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.42413793206214906, + "step": 115075 + }, + { + "epoch": 0.11590979824564054, + "grad_norm": 9.829862440545714, + "learning_rate": 4.946767932212139e-05, + "loss": 2.5544, + "mean_token_accuracy": 0.4, + "step": 115080 + }, + { + "epoch": 0.11591483429874472, + "grad_norm": 9.145867329156193, + "learning_rate": 4.946759829279381e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.4034482777118683, + "step": 115085 + }, + { + "epoch": 0.11591987035184889, + "grad_norm": 7.6897273959218975, + "learning_rate": 4.9467517257373455e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.4551724076271057, + "step": 115090 + }, + { + "epoch": 0.11592490640495307, + "grad_norm": 10.741907085339285, + "learning_rate": 4.946743621586032e-05, + "loss": 2.6791, + "mean_token_accuracy": 0.39183302521705626, + "step": 115095 + }, + { + "epoch": 0.11592994245805723, + "grad_norm": 9.84487328448593, + "learning_rate": 4.946735516825444e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.4379310369491577, + "step": 115100 + }, + { + "epoch": 0.1159349785111614, + "grad_norm": 10.878252234993955, + "learning_rate": 4.946727411455583e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.4379310250282288, + "step": 115105 + }, + { + "epoch": 0.11594001456426557, + "grad_norm": 11.409014987691457, + "learning_rate": 4.946719305476451e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.4793103337287903, + "step": 115110 + }, + { + "epoch": 0.11594505061736975, + "grad_norm": 11.28540886150633, + "learning_rate": 4.946711198888051e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.3862068891525269, + "step": 115115 + }, + { + "epoch": 0.11595008667047392, + "grad_norm": 11.082297894509734, + "learning_rate": 4.946703091690385e-05, + "loss": 2.9201, + "mean_token_accuracy": 0.42583181858062746, + "step": 115120 + }, + { + "epoch": 0.1159551227235781, + "grad_norm": 9.514869502614246, + "learning_rate": 4.9466949838834556e-05, + "loss": 3.0083, + "mean_token_accuracy": 0.4, + "step": 115125 + }, + { + "epoch": 0.11596015877668227, + "grad_norm": 10.541619586285215, + "learning_rate": 4.946686875467264e-05, + "loss": 2.2433, + "mean_token_accuracy": 0.4655172348022461, + "step": 115130 + }, + { + "epoch": 0.11596519482978644, + "grad_norm": 10.34254713848287, + "learning_rate": 4.946678766441813e-05, + "loss": 2.4655, + "mean_token_accuracy": 0.38275861740112305, + "step": 115135 + }, + { + "epoch": 0.11597023088289062, + "grad_norm": 12.565393072279965, + "learning_rate": 4.946670656807105e-05, + "loss": 2.5526, + "mean_token_accuracy": 0.43103447556495667, + "step": 115140 + }, + { + "epoch": 0.11597526693599479, + "grad_norm": 9.367707549708788, + "learning_rate": 4.9466625465631426e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.35862069129943847, + "step": 115145 + }, + { + "epoch": 0.11598030298909896, + "grad_norm": 9.053484530661889, + "learning_rate": 4.946654435709928e-05, + "loss": 2.0614, + "mean_token_accuracy": 0.4724137902259827, + "step": 115150 + }, + { + "epoch": 0.11598533904220314, + "grad_norm": 10.290453044422614, + "learning_rate": 4.946646324247462e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.4379310369491577, + "step": 115155 + }, + { + "epoch": 0.11599037509530731, + "grad_norm": 10.530823031091657, + "learning_rate": 4.946638212175749e-05, + "loss": 2.2961, + "mean_token_accuracy": 0.4482758641242981, + "step": 115160 + }, + { + "epoch": 0.11599541114841148, + "grad_norm": 13.568082956372091, + "learning_rate": 4.9466300994947896e-05, + "loss": 2.9254, + "mean_token_accuracy": 0.3551724135875702, + "step": 115165 + }, + { + "epoch": 0.11600044720151564, + "grad_norm": 12.82807777098334, + "learning_rate": 4.946621986204587e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.441379314661026, + "step": 115170 + }, + { + "epoch": 0.11600548325461982, + "grad_norm": 14.738920314987807, + "learning_rate": 4.946613872305143e-05, + "loss": 2.6168, + "mean_token_accuracy": 0.38965516686439516, + "step": 115175 + }, + { + "epoch": 0.11601051930772399, + "grad_norm": 8.541220965909346, + "learning_rate": 4.9466057577964596e-05, + "loss": 2.1948, + "mean_token_accuracy": 0.4931034505367279, + "step": 115180 + }, + { + "epoch": 0.11601555536082817, + "grad_norm": 9.648440456121552, + "learning_rate": 4.9465976426785395e-05, + "loss": 2.5241, + "mean_token_accuracy": 0.4034482717514038, + "step": 115185 + }, + { + "epoch": 0.11602059141393234, + "grad_norm": 10.92054789620148, + "learning_rate": 4.946589526951386e-05, + "loss": 2.0495, + "mean_token_accuracy": 0.4965517342090607, + "step": 115190 + }, + { + "epoch": 0.11602562746703651, + "grad_norm": 11.155178262817554, + "learning_rate": 4.946581410614999e-05, + "loss": 2.609, + "mean_token_accuracy": 0.38275861740112305, + "step": 115195 + }, + { + "epoch": 0.11603066352014069, + "grad_norm": 23.46945464633818, + "learning_rate": 4.946573293669382e-05, + "loss": 2.6794, + "mean_token_accuracy": 0.39310343861579894, + "step": 115200 + }, + { + "epoch": 0.11603569957324486, + "grad_norm": 9.472567503498016, + "learning_rate": 4.946565176114537e-05, + "loss": 2.4833, + "mean_token_accuracy": 0.4344827592372894, + "step": 115205 + }, + { + "epoch": 0.11604073562634903, + "grad_norm": 9.744768122377197, + "learning_rate": 4.9465570579504666e-05, + "loss": 2.4249, + "mean_token_accuracy": 0.413793095946312, + "step": 115210 + }, + { + "epoch": 0.11604577167945321, + "grad_norm": 10.369140622678696, + "learning_rate": 4.946548939177174e-05, + "loss": 2.7524, + "mean_token_accuracy": 0.3862069010734558, + "step": 115215 + }, + { + "epoch": 0.11605080773255738, + "grad_norm": 11.05828993811328, + "learning_rate": 4.9465408197946596e-05, + "loss": 2.447, + "mean_token_accuracy": 0.43272837400436404, + "step": 115220 + }, + { + "epoch": 0.11605584378566156, + "grad_norm": 19.4444165493529, + "learning_rate": 4.946532699802926e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.4221415638923645, + "step": 115225 + }, + { + "epoch": 0.11606087983876573, + "grad_norm": 10.784234333579297, + "learning_rate": 4.946524579201976e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.4517241299152374, + "step": 115230 + }, + { + "epoch": 0.1160659158918699, + "grad_norm": 12.178963322160445, + "learning_rate": 4.9465164579918116e-05, + "loss": 2.7118, + "mean_token_accuracy": 0.4172413766384125, + "step": 115235 + }, + { + "epoch": 0.11607095194497406, + "grad_norm": 10.549543156394241, + "learning_rate": 4.946508336172436e-05, + "loss": 1.9286, + "mean_token_accuracy": 0.4915305495262146, + "step": 115240 + }, + { + "epoch": 0.11607598799807824, + "grad_norm": 10.319450282173989, + "learning_rate": 4.9465002137438496e-05, + "loss": 2.1127, + "mean_token_accuracy": 0.4172413766384125, + "step": 115245 + }, + { + "epoch": 0.11608102405118241, + "grad_norm": 10.528320450768115, + "learning_rate": 4.946492090706057e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.48275862336158754, + "step": 115250 + }, + { + "epoch": 0.11608606010428658, + "grad_norm": 14.61231232918794, + "learning_rate": 4.946483967059057e-05, + "loss": 2.5036, + "mean_token_accuracy": 0.45172413289546964, + "step": 115255 + }, + { + "epoch": 0.11609109615739076, + "grad_norm": 9.441541989977045, + "learning_rate": 4.946475842802856e-05, + "loss": 2.1945, + "mean_token_accuracy": 0.4689655125141144, + "step": 115260 + }, + { + "epoch": 0.11609613221049493, + "grad_norm": 10.398852625841755, + "learning_rate": 4.946467717937454e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.4793103516101837, + "step": 115265 + }, + { + "epoch": 0.1161011682635991, + "grad_norm": 11.306175116284312, + "learning_rate": 4.946459592462852e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.37931033968925476, + "step": 115270 + }, + { + "epoch": 0.11610620431670328, + "grad_norm": 12.044385339473058, + "learning_rate": 4.946451466379055e-05, + "loss": 2.4445, + "mean_token_accuracy": 0.4517241358757019, + "step": 115275 + }, + { + "epoch": 0.11611124036980745, + "grad_norm": 9.809964191586662, + "learning_rate": 4.9464433396860635e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.40689656138420105, + "step": 115280 + }, + { + "epoch": 0.11611627642291163, + "grad_norm": 11.045390611267464, + "learning_rate": 4.94643521238388e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.44827587008476255, + "step": 115285 + }, + { + "epoch": 0.1161213124760158, + "grad_norm": 11.545014940079167, + "learning_rate": 4.946427084472508e-05, + "loss": 2.7855, + "mean_token_accuracy": 0.38620689511299133, + "step": 115290 + }, + { + "epoch": 0.11612634852911997, + "grad_norm": 10.423737383038201, + "learning_rate": 4.946418955951949e-05, + "loss": 2.3702, + "mean_token_accuracy": 0.441379314661026, + "step": 115295 + }, + { + "epoch": 0.11613138458222415, + "grad_norm": 11.287713678219394, + "learning_rate": 4.946410826822204e-05, + "loss": 2.6133, + "mean_token_accuracy": 0.4344827651977539, + "step": 115300 + }, + { + "epoch": 0.11613642063532832, + "grad_norm": 11.538449547221573, + "learning_rate": 4.946402697083276e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.4241379380226135, + "step": 115305 + }, + { + "epoch": 0.11614145668843248, + "grad_norm": 11.15145853769805, + "learning_rate": 4.946394566735168e-05, + "loss": 2.5075, + "mean_token_accuracy": 0.4206896543502808, + "step": 115310 + }, + { + "epoch": 0.11614649274153666, + "grad_norm": 12.004308088461892, + "learning_rate": 4.9463864357778817e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.47931033968925474, + "step": 115315 + }, + { + "epoch": 0.11615152879464083, + "grad_norm": 10.836842826315328, + "learning_rate": 4.946378304211419e-05, + "loss": 2.3953, + "mean_token_accuracy": 0.44827585816383364, + "step": 115320 + }, + { + "epoch": 0.116156564847745, + "grad_norm": 12.454193753984839, + "learning_rate": 4.946370172035783e-05, + "loss": 2.0635, + "mean_token_accuracy": 0.47931033968925474, + "step": 115325 + }, + { + "epoch": 0.11616160090084918, + "grad_norm": 9.981426029421588, + "learning_rate": 4.946362039250976e-05, + "loss": 2.1319, + "mean_token_accuracy": 0.48275862336158754, + "step": 115330 + }, + { + "epoch": 0.11616663695395335, + "grad_norm": 9.968584080082733, + "learning_rate": 4.946353905856999e-05, + "loss": 2.3514, + "mean_token_accuracy": 0.38965516686439516, + "step": 115335 + }, + { + "epoch": 0.11617167300705752, + "grad_norm": 12.891631154553258, + "learning_rate": 4.9463457718538565e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4241379380226135, + "step": 115340 + }, + { + "epoch": 0.1161767090601617, + "grad_norm": 12.53300376786482, + "learning_rate": 4.9463376372415476e-05, + "loss": 2.9471, + "mean_token_accuracy": 0.33103448152542114, + "step": 115345 + }, + { + "epoch": 0.11618174511326587, + "grad_norm": 7.86987531860855, + "learning_rate": 4.946329502020077e-05, + "loss": 2.647, + "mean_token_accuracy": 0.4379310369491577, + "step": 115350 + }, + { + "epoch": 0.11618678116637005, + "grad_norm": 10.889868247753515, + "learning_rate": 4.9463213661894455e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.4551724135875702, + "step": 115355 + }, + { + "epoch": 0.11619181721947422, + "grad_norm": 9.311672817113127, + "learning_rate": 4.946313229749657e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.4241379380226135, + "step": 115360 + }, + { + "epoch": 0.1161968532725784, + "grad_norm": 8.51992237007899, + "learning_rate": 4.946305092700713e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.44670296311378477, + "step": 115365 + }, + { + "epoch": 0.11620188932568257, + "grad_norm": 11.49364806349511, + "learning_rate": 4.9462969550426145e-05, + "loss": 2.681, + "mean_token_accuracy": 0.42758620977401735, + "step": 115370 + }, + { + "epoch": 0.11620692537878674, + "grad_norm": 9.994100106065499, + "learning_rate": 4.946288816775366e-05, + "loss": 2.2989, + "mean_token_accuracy": 0.42413792610168455, + "step": 115375 + }, + { + "epoch": 0.1162119614318909, + "grad_norm": 10.836512539711663, + "learning_rate": 4.946280677898968e-05, + "loss": 2.7114, + "mean_token_accuracy": 0.4000000059604645, + "step": 115380 + }, + { + "epoch": 0.11621699748499507, + "grad_norm": 9.85786582514832, + "learning_rate": 4.946272538413424e-05, + "loss": 2.5756, + "mean_token_accuracy": 0.4413793087005615, + "step": 115385 + }, + { + "epoch": 0.11622203353809925, + "grad_norm": 9.700186948577992, + "learning_rate": 4.946264398318735e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.4, + "step": 115390 + }, + { + "epoch": 0.11622706959120342, + "grad_norm": 12.330438731043174, + "learning_rate": 4.946256257614904e-05, + "loss": 2.2164, + "mean_token_accuracy": 0.4000000059604645, + "step": 115395 + }, + { + "epoch": 0.1162321056443076, + "grad_norm": 9.762070619396953, + "learning_rate": 4.946248116301933e-05, + "loss": 2.0498, + "mean_token_accuracy": 0.5172413766384125, + "step": 115400 + }, + { + "epoch": 0.11623714169741177, + "grad_norm": 11.941247389040686, + "learning_rate": 4.946239974379824e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.40471869707107544, + "step": 115405 + }, + { + "epoch": 0.11624217775051594, + "grad_norm": 13.135468872935595, + "learning_rate": 4.946231831848581e-05, + "loss": 2.6166, + "mean_token_accuracy": 0.3896551728248596, + "step": 115410 + }, + { + "epoch": 0.11624721380362012, + "grad_norm": 10.318823286145287, + "learning_rate": 4.9462236887082034e-05, + "loss": 2.2466, + "mean_token_accuracy": 0.4569872975349426, + "step": 115415 + }, + { + "epoch": 0.11625224985672429, + "grad_norm": 11.915735426719996, + "learning_rate": 4.9462155449586954e-05, + "loss": 2.4182, + "mean_token_accuracy": 0.3724137872457504, + "step": 115420 + }, + { + "epoch": 0.11625728590982846, + "grad_norm": 14.011478759940081, + "learning_rate": 4.946207400600059e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.4103448331356049, + "step": 115425 + }, + { + "epoch": 0.11626232196293264, + "grad_norm": 10.498706248395369, + "learning_rate": 4.946199255632296e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.4068965554237366, + "step": 115430 + }, + { + "epoch": 0.11626735801603681, + "grad_norm": 9.147964912638376, + "learning_rate": 4.946191110055409e-05, + "loss": 3.1358, + "mean_token_accuracy": 0.35692680180072783, + "step": 115435 + }, + { + "epoch": 0.11627239406914099, + "grad_norm": 11.739749886125201, + "learning_rate": 4.9461829638694014e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.42413793206214906, + "step": 115440 + }, + { + "epoch": 0.11627743012224516, + "grad_norm": 10.159315269181814, + "learning_rate": 4.9461748170742725e-05, + "loss": 2.4216, + "mean_token_accuracy": 0.4620689690113068, + "step": 115445 + }, + { + "epoch": 0.11628246617534932, + "grad_norm": 9.585271059352436, + "learning_rate": 4.9461666696700274e-05, + "loss": 2.193, + "mean_token_accuracy": 0.48275862336158754, + "step": 115450 + }, + { + "epoch": 0.1162875022284535, + "grad_norm": 11.422969018296149, + "learning_rate": 4.946158521656667e-05, + "loss": 2.581, + "mean_token_accuracy": 0.37586206793785093, + "step": 115455 + }, + { + "epoch": 0.11629253828155767, + "grad_norm": 12.990905244431511, + "learning_rate": 4.946150373034194e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.43103448748588563, + "step": 115460 + }, + { + "epoch": 0.11629757433466184, + "grad_norm": 9.992660761729882, + "learning_rate": 4.94614222380261e-05, + "loss": 1.8641, + "mean_token_accuracy": 0.5261343002319336, + "step": 115465 + }, + { + "epoch": 0.11630261038776601, + "grad_norm": 9.603350942751467, + "learning_rate": 4.9461340739619184e-05, + "loss": 2.9338, + "mean_token_accuracy": 0.4206896543502808, + "step": 115470 + }, + { + "epoch": 0.11630764644087019, + "grad_norm": 11.694132810592606, + "learning_rate": 4.946125923512121e-05, + "loss": 2.4882, + "mean_token_accuracy": 0.3896551728248596, + "step": 115475 + }, + { + "epoch": 0.11631268249397436, + "grad_norm": 9.080885273015994, + "learning_rate": 4.946117772453219e-05, + "loss": 2.4411, + "mean_token_accuracy": 0.42758620381355283, + "step": 115480 + }, + { + "epoch": 0.11631771854707854, + "grad_norm": 10.452452824649617, + "learning_rate": 4.946109620785216e-05, + "loss": 2.8492, + "mean_token_accuracy": 0.39999998807907106, + "step": 115485 + }, + { + "epoch": 0.11632275460018271, + "grad_norm": 11.34839882166613, + "learning_rate": 4.946101468508114e-05, + "loss": 2.2454, + "mean_token_accuracy": 0.4586206912994385, + "step": 115490 + }, + { + "epoch": 0.11632779065328688, + "grad_norm": 8.657593349849884, + "learning_rate": 4.946093315621915e-05, + "loss": 2.2743, + "mean_token_accuracy": 0.43448275327682495, + "step": 115495 + }, + { + "epoch": 0.11633282670639106, + "grad_norm": 9.247033890336532, + "learning_rate": 4.946085162126621e-05, + "loss": 2.258, + "mean_token_accuracy": 0.42413792610168455, + "step": 115500 + }, + { + "epoch": 0.11633786275949523, + "grad_norm": 12.083708733895131, + "learning_rate": 4.946077008022234e-05, + "loss": 2.1065, + "mean_token_accuracy": 0.5052631616592407, + "step": 115505 + }, + { + "epoch": 0.1163428988125994, + "grad_norm": 11.385202146190863, + "learning_rate": 4.946068853308758e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.4275861978530884, + "step": 115510 + }, + { + "epoch": 0.11634793486570358, + "grad_norm": 11.251132100476541, + "learning_rate": 4.946060697986193e-05, + "loss": 2.5211, + "mean_token_accuracy": 0.38620689511299133, + "step": 115515 + }, + { + "epoch": 0.11635297091880774, + "grad_norm": 9.929897116976612, + "learning_rate": 4.946052542054544e-05, + "loss": 2.4848, + "mean_token_accuracy": 0.43793103098869324, + "step": 115520 + }, + { + "epoch": 0.11635800697191191, + "grad_norm": 12.82217655212544, + "learning_rate": 4.9460443855138107e-05, + "loss": 2.8325, + "mean_token_accuracy": 0.3862068891525269, + "step": 115525 + }, + { + "epoch": 0.11636304302501609, + "grad_norm": 11.12107264207888, + "learning_rate": 4.946036228363996e-05, + "loss": 2.4833, + "mean_token_accuracy": 0.3965517163276672, + "step": 115530 + }, + { + "epoch": 0.11636807907812026, + "grad_norm": 9.190720710973812, + "learning_rate": 4.9460280706051025e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.4304900109767914, + "step": 115535 + }, + { + "epoch": 0.11637311513122443, + "grad_norm": 11.576107026425028, + "learning_rate": 4.946019912237133e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.4275862067937851, + "step": 115540 + }, + { + "epoch": 0.11637815118432861, + "grad_norm": 12.33257781253891, + "learning_rate": 4.946011753260089e-05, + "loss": 2.5197, + "mean_token_accuracy": 0.403448274731636, + "step": 115545 + }, + { + "epoch": 0.11638318723743278, + "grad_norm": 10.178551352018037, + "learning_rate": 4.9460035936739726e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.42758620381355283, + "step": 115550 + }, + { + "epoch": 0.11638822329053695, + "grad_norm": 8.431372736083754, + "learning_rate": 4.945995433478787e-05, + "loss": 2.4723, + "mean_token_accuracy": 0.43103448748588563, + "step": 115555 + }, + { + "epoch": 0.11639325934364113, + "grad_norm": 10.924898393733491, + "learning_rate": 4.945987272674534e-05, + "loss": 2.144, + "mean_token_accuracy": 0.4379310369491577, + "step": 115560 + }, + { + "epoch": 0.1163982953967453, + "grad_norm": 12.096936335973517, + "learning_rate": 4.945979111261215e-05, + "loss": 2.3004, + "mean_token_accuracy": 0.4482758641242981, + "step": 115565 + }, + { + "epoch": 0.11640333144984948, + "grad_norm": 10.105374753845368, + "learning_rate": 4.9459709492388324e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.41379310488700866, + "step": 115570 + }, + { + "epoch": 0.11640836750295365, + "grad_norm": 11.453520102704662, + "learning_rate": 4.9459627866073906e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.36709014177322385, + "step": 115575 + }, + { + "epoch": 0.11641340355605782, + "grad_norm": 10.354748191361663, + "learning_rate": 4.945954623366889e-05, + "loss": 2.3209, + "mean_token_accuracy": 0.441379314661026, + "step": 115580 + }, + { + "epoch": 0.116418439609162, + "grad_norm": 6.707900197516155, + "learning_rate": 4.945946459517332e-05, + "loss": 2.2203, + "mean_token_accuracy": 0.47241380214691164, + "step": 115585 + }, + { + "epoch": 0.11642347566226616, + "grad_norm": 11.404918589719339, + "learning_rate": 4.945938295058721e-05, + "loss": 1.9056, + "mean_token_accuracy": 0.49655171632766726, + "step": 115590 + }, + { + "epoch": 0.11642851171537033, + "grad_norm": 9.888018090877482, + "learning_rate": 4.945930129991059e-05, + "loss": 2.2519, + "mean_token_accuracy": 0.4586206912994385, + "step": 115595 + }, + { + "epoch": 0.1164335477684745, + "grad_norm": 9.483823781135168, + "learning_rate": 4.945921964314346e-05, + "loss": 2.2268, + "mean_token_accuracy": 0.42758620381355283, + "step": 115600 + }, + { + "epoch": 0.11643858382157868, + "grad_norm": 9.930794360241201, + "learning_rate": 4.945913798028587e-05, + "loss": 2.2112, + "mean_token_accuracy": 0.4813067078590393, + "step": 115605 + }, + { + "epoch": 0.11644361987468285, + "grad_norm": 11.083984495867913, + "learning_rate": 4.9459056311337824e-05, + "loss": 2.4094, + "mean_token_accuracy": 0.42413793206214906, + "step": 115610 + }, + { + "epoch": 0.11644865592778703, + "grad_norm": 9.934613486360456, + "learning_rate": 4.9458974636299353e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.42413793206214906, + "step": 115615 + }, + { + "epoch": 0.1164536919808912, + "grad_norm": 10.699417389567897, + "learning_rate": 4.9458892955170486e-05, + "loss": 2.2118, + "mean_token_accuracy": 0.4813672125339508, + "step": 115620 + }, + { + "epoch": 0.11645872803399537, + "grad_norm": 10.542226740099181, + "learning_rate": 4.945881126795124e-05, + "loss": 2.766, + "mean_token_accuracy": 0.4222625494003296, + "step": 115625 + }, + { + "epoch": 0.11646376408709955, + "grad_norm": 15.216529993859794, + "learning_rate": 4.945872957464162e-05, + "loss": 2.768, + "mean_token_accuracy": 0.37241379022598264, + "step": 115630 + }, + { + "epoch": 0.11646880014020372, + "grad_norm": 9.670434014276903, + "learning_rate": 4.9458647875241675e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.441379314661026, + "step": 115635 + }, + { + "epoch": 0.1164738361933079, + "grad_norm": 12.776683054562769, + "learning_rate": 4.9458566169751415e-05, + "loss": 2.6635, + "mean_token_accuracy": 0.39655172228813174, + "step": 115640 + }, + { + "epoch": 0.11647887224641207, + "grad_norm": 13.176645976549592, + "learning_rate": 4.9458484458170865e-05, + "loss": 2.399, + "mean_token_accuracy": 0.42413793206214906, + "step": 115645 + }, + { + "epoch": 0.11648390829951624, + "grad_norm": 8.316682104635019, + "learning_rate": 4.945840274050005e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.48620688915252686, + "step": 115650 + }, + { + "epoch": 0.11648894435262042, + "grad_norm": 10.106033476581214, + "learning_rate": 4.945832101673898e-05, + "loss": 2.2859, + "mean_token_accuracy": 0.46896551847457885, + "step": 115655 + }, + { + "epoch": 0.11649398040572458, + "grad_norm": 8.671452747323778, + "learning_rate": 4.94582392868877e-05, + "loss": 2.1955, + "mean_token_accuracy": 0.475862056016922, + "step": 115660 + }, + { + "epoch": 0.11649901645882875, + "grad_norm": 12.048126220549499, + "learning_rate": 4.9458157550946213e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.42413793206214906, + "step": 115665 + }, + { + "epoch": 0.11650405251193292, + "grad_norm": 10.216266725546491, + "learning_rate": 4.945807580891456e-05, + "loss": 2.5016, + "mean_token_accuracy": 0.42758620977401735, + "step": 115670 + }, + { + "epoch": 0.1165090885650371, + "grad_norm": 9.176206072741564, + "learning_rate": 4.9457994060792744e-05, + "loss": 2.2731, + "mean_token_accuracy": 0.42413792610168455, + "step": 115675 + }, + { + "epoch": 0.11651412461814127, + "grad_norm": 8.96944403741141, + "learning_rate": 4.9457912306580796e-05, + "loss": 2.1557, + "mean_token_accuracy": 0.4172413766384125, + "step": 115680 + }, + { + "epoch": 0.11651916067124544, + "grad_norm": 18.97427530744383, + "learning_rate": 4.9457830546278736e-05, + "loss": 2.7075, + "mean_token_accuracy": 0.37241379022598264, + "step": 115685 + }, + { + "epoch": 0.11652419672434962, + "grad_norm": 10.410101995569148, + "learning_rate": 4.9457748779886596e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.39310344457626345, + "step": 115690 + }, + { + "epoch": 0.11652923277745379, + "grad_norm": 10.027134632980392, + "learning_rate": 4.945766700740439e-05, + "loss": 2.1855, + "mean_token_accuracy": 0.4310344815254211, + "step": 115695 + }, + { + "epoch": 0.11653426883055797, + "grad_norm": 8.591661744075036, + "learning_rate": 4.945758522883215e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.46551724076271056, + "step": 115700 + }, + { + "epoch": 0.11653930488366214, + "grad_norm": 9.923454356588389, + "learning_rate": 4.9457503444169875e-05, + "loss": 2.4699, + "mean_token_accuracy": 0.4344827651977539, + "step": 115705 + }, + { + "epoch": 0.11654434093676631, + "grad_norm": 9.345009795333038, + "learning_rate": 4.945742165341762e-05, + "loss": 1.7995, + "mean_token_accuracy": 0.5241379380226135, + "step": 115710 + }, + { + "epoch": 0.11654937698987049, + "grad_norm": 10.789109957377846, + "learning_rate": 4.945733985657539e-05, + "loss": 2.3099, + "mean_token_accuracy": 0.4517241418361664, + "step": 115715 + }, + { + "epoch": 0.11655441304297466, + "grad_norm": 9.785067243177775, + "learning_rate": 4.945725805364321e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.42758620977401735, + "step": 115720 + }, + { + "epoch": 0.11655944909607883, + "grad_norm": 8.163219949610674, + "learning_rate": 4.945717624462109e-05, + "loss": 2.0798, + "mean_token_accuracy": 0.4747731387615204, + "step": 115725 + }, + { + "epoch": 0.116564485149183, + "grad_norm": 13.167250983399748, + "learning_rate": 4.945709442950908e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.4241379380226135, + "step": 115730 + }, + { + "epoch": 0.11656952120228717, + "grad_norm": 11.350491317908327, + "learning_rate": 4.945701260830718e-05, + "loss": 2.1833, + "mean_token_accuracy": 0.4310344815254211, + "step": 115735 + }, + { + "epoch": 0.11657455725539134, + "grad_norm": 10.45625902204729, + "learning_rate": 4.945693078101542e-05, + "loss": 2.4926, + "mean_token_accuracy": 0.4117362439632416, + "step": 115740 + }, + { + "epoch": 0.11657959330849552, + "grad_norm": 10.596470722984007, + "learning_rate": 4.9456848947633826e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.37241379022598264, + "step": 115745 + }, + { + "epoch": 0.11658462936159969, + "grad_norm": 12.026924708159525, + "learning_rate": 4.945676710816242e-05, + "loss": 2.6786, + "mean_token_accuracy": 0.42758620381355283, + "step": 115750 + }, + { + "epoch": 0.11658966541470386, + "grad_norm": 10.180229389458578, + "learning_rate": 4.945668526260122e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.4517241299152374, + "step": 115755 + }, + { + "epoch": 0.11659470146780804, + "grad_norm": 10.909569803329962, + "learning_rate": 4.945660341095026e-05, + "loss": 2.3922, + "mean_token_accuracy": 0.4655172348022461, + "step": 115760 + }, + { + "epoch": 0.11659973752091221, + "grad_norm": 9.356114418358679, + "learning_rate": 4.9456521553209545e-05, + "loss": 2.2703, + "mean_token_accuracy": 0.47241379618644713, + "step": 115765 + }, + { + "epoch": 0.11660477357401638, + "grad_norm": 18.25112101732361, + "learning_rate": 4.945643968937911e-05, + "loss": 2.7905, + "mean_token_accuracy": 0.4068965524435043, + "step": 115770 + }, + { + "epoch": 0.11660980962712056, + "grad_norm": 11.722689611003101, + "learning_rate": 4.945635781945897e-05, + "loss": 2.6798, + "mean_token_accuracy": 0.4034482777118683, + "step": 115775 + }, + { + "epoch": 0.11661484568022473, + "grad_norm": 9.051407925823062, + "learning_rate": 4.945627594344916e-05, + "loss": 2.4164, + "mean_token_accuracy": 0.42068966031074523, + "step": 115780 + }, + { + "epoch": 0.1166198817333289, + "grad_norm": 14.681051897934866, + "learning_rate": 4.945619406134969e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.42758620381355283, + "step": 115785 + }, + { + "epoch": 0.11662491778643308, + "grad_norm": 11.922758460906103, + "learning_rate": 4.945611217316059e-05, + "loss": 2.14, + "mean_token_accuracy": 0.4517241418361664, + "step": 115790 + }, + { + "epoch": 0.11662995383953725, + "grad_norm": 11.468697062484884, + "learning_rate": 4.9456030278881875e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.4517241358757019, + "step": 115795 + }, + { + "epoch": 0.11663498989264141, + "grad_norm": 10.450657000528347, + "learning_rate": 4.945594837851358e-05, + "loss": 2.2488, + "mean_token_accuracy": 0.441379314661026, + "step": 115800 + }, + { + "epoch": 0.11664002594574559, + "grad_norm": 11.42373818839804, + "learning_rate": 4.945586647205571e-05, + "loss": 2.6074, + "mean_token_accuracy": 0.39655172228813174, + "step": 115805 + }, + { + "epoch": 0.11664506199884976, + "grad_norm": 10.016274459032315, + "learning_rate": 4.945578455950831e-05, + "loss": 2.0037, + "mean_token_accuracy": 0.4758620738983154, + "step": 115810 + }, + { + "epoch": 0.11665009805195393, + "grad_norm": 11.579618902275154, + "learning_rate": 4.945570264087139e-05, + "loss": 2.2197, + "mean_token_accuracy": 0.43103447556495667, + "step": 115815 + }, + { + "epoch": 0.11665513410505811, + "grad_norm": 12.47962708618119, + "learning_rate": 4.9455620716144974e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.3620689630508423, + "step": 115820 + }, + { + "epoch": 0.11666017015816228, + "grad_norm": 11.375758142208007, + "learning_rate": 4.945553878532908e-05, + "loss": 2.6813, + "mean_token_accuracy": 0.39836660623550413, + "step": 115825 + }, + { + "epoch": 0.11666520621126646, + "grad_norm": 11.643283017714696, + "learning_rate": 4.9455456848423735e-05, + "loss": 2.4689, + "mean_token_accuracy": 0.35862069129943847, + "step": 115830 + }, + { + "epoch": 0.11667024226437063, + "grad_norm": 13.874526331426166, + "learning_rate": 4.945537490542897e-05, + "loss": 2.2901, + "mean_token_accuracy": 0.4413793087005615, + "step": 115835 + }, + { + "epoch": 0.1166752783174748, + "grad_norm": 10.737342865805761, + "learning_rate": 4.945529295634479e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.37241379022598264, + "step": 115840 + }, + { + "epoch": 0.11668031437057898, + "grad_norm": 11.20551990751828, + "learning_rate": 4.9455211001171234e-05, + "loss": 2.6037, + "mean_token_accuracy": 0.38275861740112305, + "step": 115845 + }, + { + "epoch": 0.11668535042368315, + "grad_norm": 11.744746618391442, + "learning_rate": 4.9455129039908316e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.38620689511299133, + "step": 115850 + }, + { + "epoch": 0.11669038647678733, + "grad_norm": 28.02986587397832, + "learning_rate": 4.945504707255606e-05, + "loss": 2.743, + "mean_token_accuracy": 0.4034482777118683, + "step": 115855 + }, + { + "epoch": 0.1166954225298915, + "grad_norm": 10.984878470524434, + "learning_rate": 4.9454965099114494e-05, + "loss": 2.2535, + "mean_token_accuracy": 0.4068965494632721, + "step": 115860 + }, + { + "epoch": 0.11670045858299567, + "grad_norm": 9.60325715319281, + "learning_rate": 4.945488311958364e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.4862068831920624, + "step": 115865 + }, + { + "epoch": 0.11670549463609983, + "grad_norm": 10.512226715408875, + "learning_rate": 4.94548011339635e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.4206896543502808, + "step": 115870 + }, + { + "epoch": 0.116710530689204, + "grad_norm": 12.691413563055008, + "learning_rate": 4.945471914225413e-05, + "loss": 2.4049, + "mean_token_accuracy": 0.4000000059604645, + "step": 115875 + }, + { + "epoch": 0.11671556674230818, + "grad_norm": 9.86549905467702, + "learning_rate": 4.945463714445553e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.44482759237289426, + "step": 115880 + }, + { + "epoch": 0.11672060279541235, + "grad_norm": 11.167283535735207, + "learning_rate": 4.945455514056773e-05, + "loss": 2.5128, + "mean_token_accuracy": 0.4117362380027771, + "step": 115885 + }, + { + "epoch": 0.11672563884851653, + "grad_norm": 13.239678191295784, + "learning_rate": 4.945447313059075e-05, + "loss": 2.4661, + "mean_token_accuracy": 0.39310345649719236, + "step": 115890 + }, + { + "epoch": 0.1167306749016207, + "grad_norm": 13.349667705713188, + "learning_rate": 4.9454391114524615e-05, + "loss": 2.4421, + "mean_token_accuracy": 0.4206896543502808, + "step": 115895 + }, + { + "epoch": 0.11673571095472488, + "grad_norm": 7.313308269894281, + "learning_rate": 4.945430909236936e-05, + "loss": 1.9481, + "mean_token_accuracy": 0.49999999403953554, + "step": 115900 + }, + { + "epoch": 0.11674074700782905, + "grad_norm": 13.024430158237502, + "learning_rate": 4.945422706412499e-05, + "loss": 3.0536, + "mean_token_accuracy": 0.3849969744682312, + "step": 115905 + }, + { + "epoch": 0.11674578306093322, + "grad_norm": 13.575749639406023, + "learning_rate": 4.9454145029791526e-05, + "loss": 2.5119, + "mean_token_accuracy": 0.4379310369491577, + "step": 115910 + }, + { + "epoch": 0.1167508191140374, + "grad_norm": 13.48809560332994, + "learning_rate": 4.9454062989369e-05, + "loss": 2.824, + "mean_token_accuracy": 0.353780996799469, + "step": 115915 + }, + { + "epoch": 0.11675585516714157, + "grad_norm": 10.725665005337913, + "learning_rate": 4.9453980942857435e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.44137930274009707, + "step": 115920 + }, + { + "epoch": 0.11676089122024574, + "grad_norm": 13.271258282021748, + "learning_rate": 4.9453898890256854e-05, + "loss": 2.9137, + "mean_token_accuracy": 0.4344827592372894, + "step": 115925 + }, + { + "epoch": 0.11676592727334992, + "grad_norm": 11.570208199295593, + "learning_rate": 4.945381683156727e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.34482758641242983, + "step": 115930 + }, + { + "epoch": 0.11677096332645409, + "grad_norm": 11.486074291200067, + "learning_rate": 4.9453734766788715e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.417241370677948, + "step": 115935 + }, + { + "epoch": 0.11677599937955825, + "grad_norm": 12.672968983714828, + "learning_rate": 4.945365269592122e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.42068964838981626, + "step": 115940 + }, + { + "epoch": 0.11678103543266243, + "grad_norm": 10.465695061299856, + "learning_rate": 4.945357061896478e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.43103448748588563, + "step": 115945 + }, + { + "epoch": 0.1167860714857666, + "grad_norm": 10.520903876469106, + "learning_rate": 4.945348853591946e-05, + "loss": 2.1098, + "mean_token_accuracy": 0.5000000059604645, + "step": 115950 + }, + { + "epoch": 0.11679110753887077, + "grad_norm": 11.154555243478825, + "learning_rate": 4.9453406446785234e-05, + "loss": 2.7848, + "mean_token_accuracy": 0.38620689511299133, + "step": 115955 + }, + { + "epoch": 0.11679614359197495, + "grad_norm": 9.618912115411533, + "learning_rate": 4.945332435156215e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.43297035694122316, + "step": 115960 + }, + { + "epoch": 0.11680117964507912, + "grad_norm": 9.948535689632957, + "learning_rate": 4.945324225025025e-05, + "loss": 2.1097, + "mean_token_accuracy": 0.46896552443504336, + "step": 115965 + }, + { + "epoch": 0.1168062156981833, + "grad_norm": 8.899589222745341, + "learning_rate": 4.945316014284952e-05, + "loss": 1.9782, + "mean_token_accuracy": 0.4896551728248596, + "step": 115970 + }, + { + "epoch": 0.11681125175128747, + "grad_norm": 12.360207828206821, + "learning_rate": 4.945307802936e-05, + "loss": 2.4752, + "mean_token_accuracy": 0.39310344457626345, + "step": 115975 + }, + { + "epoch": 0.11681628780439164, + "grad_norm": 10.672202597674643, + "learning_rate": 4.945299590978172e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.4620689690113068, + "step": 115980 + }, + { + "epoch": 0.11682132385749582, + "grad_norm": 10.185671768148804, + "learning_rate": 4.945291378411469e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.49165154099464414, + "step": 115985 + }, + { + "epoch": 0.11682635991059999, + "grad_norm": 10.105402541411513, + "learning_rate": 4.945283165235893e-05, + "loss": 2.2887, + "mean_token_accuracy": 0.4206896543502808, + "step": 115990 + }, + { + "epoch": 0.11683139596370416, + "grad_norm": 10.593841295959065, + "learning_rate": 4.9452749514514486e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.4724137902259827, + "step": 115995 + }, + { + "epoch": 0.11683643201680834, + "grad_norm": 11.053590106659037, + "learning_rate": 4.945266737058135e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.4103448212146759, + "step": 116000 + }, + { + "epoch": 0.11684146806991251, + "grad_norm": 11.681979189068993, + "learning_rate": 4.945258522055957e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.4448275864124298, + "step": 116005 + }, + { + "epoch": 0.11684650412301667, + "grad_norm": 9.559378969716334, + "learning_rate": 4.9452503064449154e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.44827587008476255, + "step": 116010 + }, + { + "epoch": 0.11685154017612084, + "grad_norm": 10.755371691121825, + "learning_rate": 4.9452420902250126e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.42758620977401735, + "step": 116015 + }, + { + "epoch": 0.11685657622922502, + "grad_norm": 10.7181291881355, + "learning_rate": 4.945233873396252e-05, + "loss": 2.076, + "mean_token_accuracy": 0.510344821214676, + "step": 116020 + }, + { + "epoch": 0.11686161228232919, + "grad_norm": 9.338306548513417, + "learning_rate": 4.945225655958635e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.4310344696044922, + "step": 116025 + }, + { + "epoch": 0.11686664833543337, + "grad_norm": 10.759471829250641, + "learning_rate": 4.945217437912163e-05, + "loss": 2.4652, + "mean_token_accuracy": 0.38965516686439516, + "step": 116030 + }, + { + "epoch": 0.11687168438853754, + "grad_norm": 10.963118704539715, + "learning_rate": 4.94520921925684e-05, + "loss": 2.3406, + "mean_token_accuracy": 0.4137930989265442, + "step": 116035 + }, + { + "epoch": 0.11687672044164171, + "grad_norm": 12.000032890893863, + "learning_rate": 4.9452009999926675e-05, + "loss": 2.644, + "mean_token_accuracy": 0.41724138855934145, + "step": 116040 + }, + { + "epoch": 0.11688175649474589, + "grad_norm": 9.52798022326986, + "learning_rate": 4.945192780119648e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.44730792939662933, + "step": 116045 + }, + { + "epoch": 0.11688679254785006, + "grad_norm": 11.530358754824915, + "learning_rate": 4.9451845596377836e-05, + "loss": 2.546, + "mean_token_accuracy": 0.41554749608039854, + "step": 116050 + }, + { + "epoch": 0.11689182860095423, + "grad_norm": 9.943993307005176, + "learning_rate": 4.945176338547076e-05, + "loss": 2.5577, + "mean_token_accuracy": 0.4379310369491577, + "step": 116055 + }, + { + "epoch": 0.11689686465405841, + "grad_norm": 12.576407512955814, + "learning_rate": 4.945168116847529e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.42068964838981626, + "step": 116060 + }, + { + "epoch": 0.11690190070716258, + "grad_norm": 10.263905879947119, + "learning_rate": 4.945159894539142e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.42758620977401735, + "step": 116065 + }, + { + "epoch": 0.11690693676026676, + "grad_norm": 12.035905441718675, + "learning_rate": 4.9451516716219215e-05, + "loss": 2.7625, + "mean_token_accuracy": 0.4344827592372894, + "step": 116070 + }, + { + "epoch": 0.11691197281337093, + "grad_norm": 11.323575787580229, + "learning_rate": 4.945143448095866e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.44137930274009707, + "step": 116075 + }, + { + "epoch": 0.11691700886647509, + "grad_norm": 10.026153568842135, + "learning_rate": 4.9451352239609806e-05, + "loss": 2.3371, + "mean_token_accuracy": 0.4206896543502808, + "step": 116080 + }, + { + "epoch": 0.11692204491957926, + "grad_norm": 10.440528336379558, + "learning_rate": 4.9451269992172646e-05, + "loss": 2.6709, + "mean_token_accuracy": 0.42758620381355283, + "step": 116085 + }, + { + "epoch": 0.11692708097268344, + "grad_norm": 10.734080592238982, + "learning_rate": 4.945118773864723e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.42758620381355283, + "step": 116090 + }, + { + "epoch": 0.11693211702578761, + "grad_norm": 10.27241678816356, + "learning_rate": 4.945110547903357e-05, + "loss": 2.7337, + "mean_token_accuracy": 0.35862069129943847, + "step": 116095 + }, + { + "epoch": 0.11693715307889178, + "grad_norm": 9.808677096690635, + "learning_rate": 4.9451023213331686e-05, + "loss": 2.268, + "mean_token_accuracy": 0.44482759237289426, + "step": 116100 + }, + { + "epoch": 0.11694218913199596, + "grad_norm": 14.870292691563819, + "learning_rate": 4.9450940941541604e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.4482758641242981, + "step": 116105 + }, + { + "epoch": 0.11694722518510013, + "grad_norm": 8.867048741685338, + "learning_rate": 4.9450858663663355e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.4862069010734558, + "step": 116110 + }, + { + "epoch": 0.1169522612382043, + "grad_norm": 9.519703216701137, + "learning_rate": 4.9450776379696944e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.44827585816383364, + "step": 116115 + }, + { + "epoch": 0.11695729729130848, + "grad_norm": 10.869691303841805, + "learning_rate": 4.9450694089642406e-05, + "loss": 2.2337, + "mean_token_accuracy": 0.458620685338974, + "step": 116120 + }, + { + "epoch": 0.11696233334441265, + "grad_norm": 11.765809329502606, + "learning_rate": 4.9450611793499754e-05, + "loss": 2.208, + "mean_token_accuracy": 0.40810647010803225, + "step": 116125 + }, + { + "epoch": 0.11696736939751683, + "grad_norm": 15.498511979054731, + "learning_rate": 4.945052949126903e-05, + "loss": 2.6611, + "mean_token_accuracy": 0.4517241299152374, + "step": 116130 + }, + { + "epoch": 0.116972405450621, + "grad_norm": 10.976151391005148, + "learning_rate": 4.945044718295024e-05, + "loss": 2.1054, + "mean_token_accuracy": 0.4896551787853241, + "step": 116135 + }, + { + "epoch": 0.11697744150372517, + "grad_norm": 12.076626805056296, + "learning_rate": 4.9450364868543415e-05, + "loss": 2.2408, + "mean_token_accuracy": 0.46551724672317507, + "step": 116140 + }, + { + "epoch": 0.11698247755682935, + "grad_norm": 15.360905749949245, + "learning_rate": 4.9450282548048566e-05, + "loss": 2.3533, + "mean_token_accuracy": 0.46551724672317507, + "step": 116145 + }, + { + "epoch": 0.11698751360993351, + "grad_norm": 13.149931175719512, + "learning_rate": 4.945020022146573e-05, + "loss": 2.6351, + "mean_token_accuracy": 0.403448274731636, + "step": 116150 + }, + { + "epoch": 0.11699254966303768, + "grad_norm": 9.83808949999587, + "learning_rate": 4.945011788879492e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.41724138259887694, + "step": 116155 + }, + { + "epoch": 0.11699758571614186, + "grad_norm": 10.78087795454106, + "learning_rate": 4.945003555003617e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.4379310369491577, + "step": 116160 + }, + { + "epoch": 0.11700262176924603, + "grad_norm": 12.384688224747618, + "learning_rate": 4.944995320518949e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.42758620381355283, + "step": 116165 + }, + { + "epoch": 0.1170076578223502, + "grad_norm": 11.947188904828211, + "learning_rate": 4.9449870854254906e-05, + "loss": 2.0403, + "mean_token_accuracy": 0.48275862336158754, + "step": 116170 + }, + { + "epoch": 0.11701269387545438, + "grad_norm": 11.194985142313664, + "learning_rate": 4.944978849723245e-05, + "loss": 2.488, + "mean_token_accuracy": 0.4641863167285919, + "step": 116175 + }, + { + "epoch": 0.11701772992855855, + "grad_norm": 9.875553562382589, + "learning_rate": 4.9449706134122136e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.4862069010734558, + "step": 116180 + }, + { + "epoch": 0.11702276598166272, + "grad_norm": 11.698830232013327, + "learning_rate": 4.944962376492399e-05, + "loss": 2.3978, + "mean_token_accuracy": 0.4206896543502808, + "step": 116185 + }, + { + "epoch": 0.1170278020347669, + "grad_norm": 9.022639123999628, + "learning_rate": 4.944954138963803e-05, + "loss": 2.3598, + "mean_token_accuracy": 0.4068965494632721, + "step": 116190 + }, + { + "epoch": 0.11703283808787107, + "grad_norm": 10.105603856025704, + "learning_rate": 4.9449459008264286e-05, + "loss": 2.3072, + "mean_token_accuracy": 0.44827587008476255, + "step": 116195 + }, + { + "epoch": 0.11703787414097525, + "grad_norm": 13.839522445052427, + "learning_rate": 4.9449376620802775e-05, + "loss": 2.8676, + "mean_token_accuracy": 0.36896551847457887, + "step": 116200 + }, + { + "epoch": 0.11704291019407942, + "grad_norm": 12.252244272753927, + "learning_rate": 4.944929422725353e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.4000000059604645, + "step": 116205 + }, + { + "epoch": 0.11704794624718359, + "grad_norm": 13.399723841664212, + "learning_rate": 4.944921182761656e-05, + "loss": 2.6766, + "mean_token_accuracy": 0.4068965554237366, + "step": 116210 + }, + { + "epoch": 0.11705298230028777, + "grad_norm": 10.358785604996916, + "learning_rate": 4.944912942189189e-05, + "loss": 2.1308, + "mean_token_accuracy": 0.5103448271751404, + "step": 116215 + }, + { + "epoch": 0.11705801835339193, + "grad_norm": 10.699659395828462, + "learning_rate": 4.944904701007956e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.3827586114406586, + "step": 116220 + }, + { + "epoch": 0.1170630544064961, + "grad_norm": 8.28110972307634, + "learning_rate": 4.944896459217957e-05, + "loss": 2.3129, + "mean_token_accuracy": 0.43448275327682495, + "step": 116225 + }, + { + "epoch": 0.11706809045960027, + "grad_norm": 11.220316342260155, + "learning_rate": 4.9448882168191955e-05, + "loss": 2.6096, + "mean_token_accuracy": 0.42068966627120974, + "step": 116230 + }, + { + "epoch": 0.11707312651270445, + "grad_norm": 11.924521016067692, + "learning_rate": 4.944879973811674e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.4, + "step": 116235 + }, + { + "epoch": 0.11707816256580862, + "grad_norm": 11.030082688305972, + "learning_rate": 4.944871730195394e-05, + "loss": 2.4011, + "mean_token_accuracy": 0.41379310488700866, + "step": 116240 + }, + { + "epoch": 0.1170831986189128, + "grad_norm": 9.670925299909234, + "learning_rate": 4.944863485970358e-05, + "loss": 2.0029, + "mean_token_accuracy": 0.5103448331356049, + "step": 116245 + }, + { + "epoch": 0.11708823467201697, + "grad_norm": 8.693764624460925, + "learning_rate": 4.9448552411365686e-05, + "loss": 2.13, + "mean_token_accuracy": 0.42758620977401735, + "step": 116250 + }, + { + "epoch": 0.11709327072512114, + "grad_norm": 10.138147437805983, + "learning_rate": 4.944846995694027e-05, + "loss": 2.5662, + "mean_token_accuracy": 0.38965516686439516, + "step": 116255 + }, + { + "epoch": 0.11709830677822532, + "grad_norm": 12.428578656213613, + "learning_rate": 4.944838749642738e-05, + "loss": 2.1952, + "mean_token_accuracy": 0.44827585816383364, + "step": 116260 + }, + { + "epoch": 0.11710334283132949, + "grad_norm": 12.939888660894074, + "learning_rate": 4.944830502982701e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.42413792610168455, + "step": 116265 + }, + { + "epoch": 0.11710837888443366, + "grad_norm": 10.034471661828219, + "learning_rate": 4.94482225571392e-05, + "loss": 2.7153, + "mean_token_accuracy": 0.37931033968925476, + "step": 116270 + }, + { + "epoch": 0.11711341493753784, + "grad_norm": 11.213729992659832, + "learning_rate": 4.944814007836397e-05, + "loss": 2.057, + "mean_token_accuracy": 0.44482758045196535, + "step": 116275 + }, + { + "epoch": 0.11711845099064201, + "grad_norm": 7.821901811106521, + "learning_rate": 4.9448057593501343e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.39509981870651245, + "step": 116280 + }, + { + "epoch": 0.11712348704374619, + "grad_norm": 13.529311609703427, + "learning_rate": 4.944797510255134e-05, + "loss": 2.7912, + "mean_token_accuracy": 0.3482758641242981, + "step": 116285 + }, + { + "epoch": 0.11712852309685035, + "grad_norm": 13.182477130985573, + "learning_rate": 4.944789260551398e-05, + "loss": 2.6032, + "mean_token_accuracy": 0.3655172437429428, + "step": 116290 + }, + { + "epoch": 0.11713355914995452, + "grad_norm": 9.372159615134265, + "learning_rate": 4.9447810102389294e-05, + "loss": 2.1468, + "mean_token_accuracy": 0.5090744018554687, + "step": 116295 + }, + { + "epoch": 0.11713859520305869, + "grad_norm": 9.787342582369819, + "learning_rate": 4.94477275931773e-05, + "loss": 2.1813, + "mean_token_accuracy": 0.4431941986083984, + "step": 116300 + }, + { + "epoch": 0.11714363125616287, + "grad_norm": 9.58095525109613, + "learning_rate": 4.944764507787802e-05, + "loss": 1.9698, + "mean_token_accuracy": 0.46551724076271056, + "step": 116305 + }, + { + "epoch": 0.11714866730926704, + "grad_norm": 12.768645829501542, + "learning_rate": 4.944756255649148e-05, + "loss": 2.9265, + "mean_token_accuracy": 0.38275861740112305, + "step": 116310 + }, + { + "epoch": 0.11715370336237121, + "grad_norm": 9.881912939677031, + "learning_rate": 4.94474800290177e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4000000059604645, + "step": 116315 + }, + { + "epoch": 0.11715873941547539, + "grad_norm": 13.848996644139467, + "learning_rate": 4.9447397495456705e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.3999999940395355, + "step": 116320 + }, + { + "epoch": 0.11716377546857956, + "grad_norm": 10.713600679017972, + "learning_rate": 4.944731495580852e-05, + "loss": 2.3341, + "mean_token_accuracy": 0.4034482717514038, + "step": 116325 + }, + { + "epoch": 0.11716881152168374, + "grad_norm": 10.595057932568483, + "learning_rate": 4.9447232410073164e-05, + "loss": 2.4671, + "mean_token_accuracy": 0.46963096857070924, + "step": 116330 + }, + { + "epoch": 0.11717384757478791, + "grad_norm": 11.199938322306465, + "learning_rate": 4.944714985825066e-05, + "loss": 2.3577, + "mean_token_accuracy": 0.42413792610168455, + "step": 116335 + }, + { + "epoch": 0.11717888362789208, + "grad_norm": 12.241815179422435, + "learning_rate": 4.944706730034104e-05, + "loss": 2.2017, + "mean_token_accuracy": 0.4396249234676361, + "step": 116340 + }, + { + "epoch": 0.11718391968099626, + "grad_norm": 10.908901025234922, + "learning_rate": 4.944698473634431e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.41034482717514037, + "step": 116345 + }, + { + "epoch": 0.11718895573410043, + "grad_norm": 11.218768136939213, + "learning_rate": 4.94469021662605e-05, + "loss": 2.6828, + "mean_token_accuracy": 0.38620689511299133, + "step": 116350 + }, + { + "epoch": 0.1171939917872046, + "grad_norm": 11.210801240564177, + "learning_rate": 4.944681959008964e-05, + "loss": 2.2302, + "mean_token_accuracy": 0.48820326924324037, + "step": 116355 + }, + { + "epoch": 0.11719902784030876, + "grad_norm": 8.064764567798221, + "learning_rate": 4.944673700783175e-05, + "loss": 2.25, + "mean_token_accuracy": 0.47586207985877993, + "step": 116360 + }, + { + "epoch": 0.11720406389341294, + "grad_norm": 11.448064656910768, + "learning_rate": 4.944665441948684e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.36896551251411436, + "step": 116365 + }, + { + "epoch": 0.11720909994651711, + "grad_norm": 10.418409702827855, + "learning_rate": 4.944657182505495e-05, + "loss": 2.5253, + "mean_token_accuracy": 0.44827585220336913, + "step": 116370 + }, + { + "epoch": 0.11721413599962129, + "grad_norm": 11.40192024260826, + "learning_rate": 4.94464892245361e-05, + "loss": 2.2117, + "mean_token_accuracy": 0.4517241299152374, + "step": 116375 + }, + { + "epoch": 0.11721917205272546, + "grad_norm": 12.788808913068152, + "learning_rate": 4.9446406617930306e-05, + "loss": 2.7074, + "mean_token_accuracy": 0.3517241358757019, + "step": 116380 + }, + { + "epoch": 0.11722420810582963, + "grad_norm": 8.897991442329726, + "learning_rate": 4.94463240052376e-05, + "loss": 2.0492, + "mean_token_accuracy": 0.4517241358757019, + "step": 116385 + }, + { + "epoch": 0.1172292441589338, + "grad_norm": 11.43191693620906, + "learning_rate": 4.944624138645799e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.46908867359161377, + "step": 116390 + }, + { + "epoch": 0.11723428021203798, + "grad_norm": 11.563658486798383, + "learning_rate": 4.9446158761591514e-05, + "loss": 2.2644, + "mean_token_accuracy": 0.41584996581077577, + "step": 116395 + }, + { + "epoch": 0.11723931626514215, + "grad_norm": 21.739944846020986, + "learning_rate": 4.9446076130638185e-05, + "loss": 3.0435, + "mean_token_accuracy": 0.3896551728248596, + "step": 116400 + }, + { + "epoch": 0.11724435231824633, + "grad_norm": 10.567984691030203, + "learning_rate": 4.944599349359803e-05, + "loss": 2.7275, + "mean_token_accuracy": 0.43103447556495667, + "step": 116405 + }, + { + "epoch": 0.1172493883713505, + "grad_norm": 12.491885799705903, + "learning_rate": 4.944591085047108e-05, + "loss": 2.4462, + "mean_token_accuracy": 0.443315190076828, + "step": 116410 + }, + { + "epoch": 0.11725442442445468, + "grad_norm": 11.112751849615425, + "learning_rate": 4.944582820125734e-05, + "loss": 2.6486, + "mean_token_accuracy": 0.3793103456497192, + "step": 116415 + }, + { + "epoch": 0.11725946047755885, + "grad_norm": 13.281479164905049, + "learning_rate": 4.944574554595685e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.44482758045196535, + "step": 116420 + }, + { + "epoch": 0.11726449653066302, + "grad_norm": 16.47375911351208, + "learning_rate": 4.944566288456962e-05, + "loss": 2.6882, + "mean_token_accuracy": 0.38275861740112305, + "step": 116425 + }, + { + "epoch": 0.11726953258376718, + "grad_norm": 10.59301812283459, + "learning_rate": 4.944558021709568e-05, + "loss": 2.6771, + "mean_token_accuracy": 0.36896551251411436, + "step": 116430 + }, + { + "epoch": 0.11727456863687136, + "grad_norm": 9.041556915038434, + "learning_rate": 4.944549754353506e-05, + "loss": 2.7566, + "mean_token_accuracy": 0.41034482717514037, + "step": 116435 + }, + { + "epoch": 0.11727960468997553, + "grad_norm": 15.713302346805873, + "learning_rate": 4.9445414863887755e-05, + "loss": 2.4818, + "mean_token_accuracy": 0.42758620381355283, + "step": 116440 + }, + { + "epoch": 0.1172846407430797, + "grad_norm": 8.74296217736429, + "learning_rate": 4.9445332178153824e-05, + "loss": 2.0144, + "mean_token_accuracy": 0.44482758045196535, + "step": 116445 + }, + { + "epoch": 0.11728967679618388, + "grad_norm": 9.159036261888977, + "learning_rate": 4.944524948633327e-05, + "loss": 2.0715, + "mean_token_accuracy": 0.47931034564971925, + "step": 116450 + }, + { + "epoch": 0.11729471284928805, + "grad_norm": 9.915648138392548, + "learning_rate": 4.944516678842611e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.4219600737094879, + "step": 116455 + }, + { + "epoch": 0.11729974890239223, + "grad_norm": 9.037395980035706, + "learning_rate": 4.9445084084432395e-05, + "loss": 2.6635, + "mean_token_accuracy": 0.3896551728248596, + "step": 116460 + }, + { + "epoch": 0.1173047849554964, + "grad_norm": 9.669724716283866, + "learning_rate": 4.9445001374352114e-05, + "loss": 2.0881, + "mean_token_accuracy": 0.4586206912994385, + "step": 116465 + }, + { + "epoch": 0.11730982100860057, + "grad_norm": 14.121449160891514, + "learning_rate": 4.94449186581853e-05, + "loss": 3.1865, + "mean_token_accuracy": 0.35172412991523744, + "step": 116470 + }, + { + "epoch": 0.11731485706170475, + "grad_norm": 10.16106422488777, + "learning_rate": 4.9444835935931996e-05, + "loss": 2.8576, + "mean_token_accuracy": 0.37586206793785093, + "step": 116475 + }, + { + "epoch": 0.11731989311480892, + "grad_norm": 14.003053310333689, + "learning_rate": 4.9444753207592206e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.4000000059604645, + "step": 116480 + }, + { + "epoch": 0.1173249291679131, + "grad_norm": 9.659221885652371, + "learning_rate": 4.944467047316595e-05, + "loss": 2.0901, + "mean_token_accuracy": 0.4551724135875702, + "step": 116485 + }, + { + "epoch": 0.11732996522101727, + "grad_norm": 8.91509779253529, + "learning_rate": 4.9444587732653266e-05, + "loss": 2.4517, + "mean_token_accuracy": 0.4517241418361664, + "step": 116490 + }, + { + "epoch": 0.11733500127412144, + "grad_norm": 20.58438867076416, + "learning_rate": 4.9444504986054156e-05, + "loss": 2.551, + "mean_token_accuracy": 0.39655172228813174, + "step": 116495 + }, + { + "epoch": 0.1173400373272256, + "grad_norm": 13.882471124924116, + "learning_rate": 4.944442223336867e-05, + "loss": 2.2383, + "mean_token_accuracy": 0.4310344815254211, + "step": 116500 + }, + { + "epoch": 0.11734507338032978, + "grad_norm": 10.622615345071779, + "learning_rate": 4.944433947459681e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.36551723480224607, + "step": 116505 + }, + { + "epoch": 0.11735010943343395, + "grad_norm": 13.143523544054066, + "learning_rate": 4.94442567097386e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.41724138259887694, + "step": 116510 + }, + { + "epoch": 0.11735514548653812, + "grad_norm": 9.70575261608229, + "learning_rate": 4.944417393879408e-05, + "loss": 2.3677, + "mean_token_accuracy": 0.4379310250282288, + "step": 116515 + }, + { + "epoch": 0.1173601815396423, + "grad_norm": 10.95023956229262, + "learning_rate": 4.9444091161763254e-05, + "loss": 2.2053, + "mean_token_accuracy": 0.48275862336158754, + "step": 116520 + }, + { + "epoch": 0.11736521759274647, + "grad_norm": 9.224374933290502, + "learning_rate": 4.9444008378646155e-05, + "loss": 2.4481, + "mean_token_accuracy": 0.4241379380226135, + "step": 116525 + }, + { + "epoch": 0.11737025364585064, + "grad_norm": 12.865728933357365, + "learning_rate": 4.9443925589442806e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.4689655125141144, + "step": 116530 + }, + { + "epoch": 0.11737528969895482, + "grad_norm": 11.155486633358318, + "learning_rate": 4.9443842794153227e-05, + "loss": 1.961, + "mean_token_accuracy": 0.5052026689052582, + "step": 116535 + }, + { + "epoch": 0.11738032575205899, + "grad_norm": 11.38635000140854, + "learning_rate": 4.944375999277743e-05, + "loss": 2.3602, + "mean_token_accuracy": 0.45972906351089476, + "step": 116540 + }, + { + "epoch": 0.11738536180516317, + "grad_norm": 11.078220699198326, + "learning_rate": 4.944367718531546e-05, + "loss": 3.0634, + "mean_token_accuracy": 0.4206896543502808, + "step": 116545 + }, + { + "epoch": 0.11739039785826734, + "grad_norm": 10.766764198575192, + "learning_rate": 4.9443594371767333e-05, + "loss": 2.0095, + "mean_token_accuracy": 0.5103448331356049, + "step": 116550 + }, + { + "epoch": 0.11739543391137151, + "grad_norm": 11.078317938177184, + "learning_rate": 4.9443511552133065e-05, + "loss": 2.5383, + "mean_token_accuracy": 0.38620689511299133, + "step": 116555 + }, + { + "epoch": 0.11740046996447569, + "grad_norm": 13.535648154996771, + "learning_rate": 4.9443428726412675e-05, + "loss": 2.1587, + "mean_token_accuracy": 0.4344827592372894, + "step": 116560 + }, + { + "epoch": 0.11740550601757986, + "grad_norm": 9.799503329525264, + "learning_rate": 4.94433458946062e-05, + "loss": 3.0303, + "mean_token_accuracy": 0.36206896901130675, + "step": 116565 + }, + { + "epoch": 0.11741054207068402, + "grad_norm": 10.916761000971757, + "learning_rate": 4.9443263056713654e-05, + "loss": 2.2982, + "mean_token_accuracy": 0.4310344815254211, + "step": 116570 + }, + { + "epoch": 0.1174155781237882, + "grad_norm": 12.05348692217127, + "learning_rate": 4.944318021273506e-05, + "loss": 2.2204, + "mean_token_accuracy": 0.45015124678611756, + "step": 116575 + }, + { + "epoch": 0.11742061417689237, + "grad_norm": 13.874551247473775, + "learning_rate": 4.944309736267045e-05, + "loss": 2.6612, + "mean_token_accuracy": 0.43793103098869324, + "step": 116580 + }, + { + "epoch": 0.11742565022999654, + "grad_norm": 15.31809047807715, + "learning_rate": 4.944301450651983e-05, + "loss": 2.7279, + "mean_token_accuracy": 0.38620689511299133, + "step": 116585 + }, + { + "epoch": 0.11743068628310072, + "grad_norm": 8.023310194004289, + "learning_rate": 4.944293164428324e-05, + "loss": 2.6278, + "mean_token_accuracy": 0.3793103456497192, + "step": 116590 + }, + { + "epoch": 0.11743572233620489, + "grad_norm": 10.891732738447331, + "learning_rate": 4.94428487759607e-05, + "loss": 2.206, + "mean_token_accuracy": 0.48620688915252686, + "step": 116595 + }, + { + "epoch": 0.11744075838930906, + "grad_norm": 10.954299021734673, + "learning_rate": 4.944276590155222e-05, + "loss": 2.1167, + "mean_token_accuracy": 0.46551724672317507, + "step": 116600 + }, + { + "epoch": 0.11744579444241324, + "grad_norm": 11.97448311872732, + "learning_rate": 4.9442683021057836e-05, + "loss": 2.4581, + "mean_token_accuracy": 0.42758620977401735, + "step": 116605 + }, + { + "epoch": 0.11745083049551741, + "grad_norm": 12.149972506785112, + "learning_rate": 4.9442600134477566e-05, + "loss": 2.7285, + "mean_token_accuracy": 0.36896551847457887, + "step": 116610 + }, + { + "epoch": 0.11745586654862158, + "grad_norm": 8.21473441148617, + "learning_rate": 4.944251724181144e-05, + "loss": 2.389, + "mean_token_accuracy": 0.4206896543502808, + "step": 116615 + }, + { + "epoch": 0.11746090260172576, + "grad_norm": 10.914441827126826, + "learning_rate": 4.944243434305947e-05, + "loss": 2.5348, + "mean_token_accuracy": 0.4068965494632721, + "step": 116620 + }, + { + "epoch": 0.11746593865482993, + "grad_norm": 10.157873643056975, + "learning_rate": 4.9442351438221676e-05, + "loss": 2.2388, + "mean_token_accuracy": 0.4517241358757019, + "step": 116625 + }, + { + "epoch": 0.1174709747079341, + "grad_norm": 9.81894238236103, + "learning_rate": 4.94422685272981e-05, + "loss": 2.1986, + "mean_token_accuracy": 0.4344827592372894, + "step": 116630 + }, + { + "epoch": 0.11747601076103828, + "grad_norm": 11.247209605334845, + "learning_rate": 4.9442185610288754e-05, + "loss": 2.5353, + "mean_token_accuracy": 0.4034482717514038, + "step": 116635 + }, + { + "epoch": 0.11748104681414244, + "grad_norm": 8.861290669821091, + "learning_rate": 4.9442102687193654e-05, + "loss": 2.5188, + "mean_token_accuracy": 0.42413792610168455, + "step": 116640 + }, + { + "epoch": 0.11748608286724661, + "grad_norm": 9.831821329559252, + "learning_rate": 4.944201975801284e-05, + "loss": 2.3204, + "mean_token_accuracy": 0.4, + "step": 116645 + }, + { + "epoch": 0.11749111892035079, + "grad_norm": 10.197027567787785, + "learning_rate": 4.9441936822746314e-05, + "loss": 2.2504, + "mean_token_accuracy": 0.4744101524353027, + "step": 116650 + }, + { + "epoch": 0.11749615497345496, + "grad_norm": 10.09618015134596, + "learning_rate": 4.944185388139411e-05, + "loss": 2.316, + "mean_token_accuracy": 0.47007388472557066, + "step": 116655 + }, + { + "epoch": 0.11750119102655913, + "grad_norm": 12.26876481277808, + "learning_rate": 4.9441770933956264e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.4103448331356049, + "step": 116660 + }, + { + "epoch": 0.11750622707966331, + "grad_norm": 8.819438306540784, + "learning_rate": 4.944168798043277e-05, + "loss": 2.1074, + "mean_token_accuracy": 0.4279556632041931, + "step": 116665 + }, + { + "epoch": 0.11751126313276748, + "grad_norm": 11.377646003569401, + "learning_rate": 4.944160502082368e-05, + "loss": 2.5739, + "mean_token_accuracy": 0.38620689511299133, + "step": 116670 + }, + { + "epoch": 0.11751629918587166, + "grad_norm": 9.965306354696677, + "learning_rate": 4.9441522055128995e-05, + "loss": 2.2015, + "mean_token_accuracy": 0.46896552443504336, + "step": 116675 + }, + { + "epoch": 0.11752133523897583, + "grad_norm": 8.746778140833316, + "learning_rate": 4.944143908334875e-05, + "loss": 2.2589, + "mean_token_accuracy": 0.4068965554237366, + "step": 116680 + }, + { + "epoch": 0.11752637129208, + "grad_norm": 10.380934340358964, + "learning_rate": 4.944135610548297e-05, + "loss": 2.122, + "mean_token_accuracy": 0.4517241358757019, + "step": 116685 + }, + { + "epoch": 0.11753140734518418, + "grad_norm": 11.553585604385464, + "learning_rate": 4.944127312153167e-05, + "loss": 2.0439, + "mean_token_accuracy": 0.4758620738983154, + "step": 116690 + }, + { + "epoch": 0.11753644339828835, + "grad_norm": 8.502544671889812, + "learning_rate": 4.944119013149487e-05, + "loss": 2.0998, + "mean_token_accuracy": 0.4931034505367279, + "step": 116695 + }, + { + "epoch": 0.11754147945139252, + "grad_norm": 12.520537765873877, + "learning_rate": 4.9441107135372605e-05, + "loss": 2.604, + "mean_token_accuracy": 0.42413793206214906, + "step": 116700 + }, + { + "epoch": 0.1175465155044967, + "grad_norm": 9.74614436118029, + "learning_rate": 4.944102413316489e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4448275834321976, + "step": 116705 + }, + { + "epoch": 0.11755155155760086, + "grad_norm": 10.250991341635984, + "learning_rate": 4.9440941124871754e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.4586206912994385, + "step": 116710 + }, + { + "epoch": 0.11755658761070503, + "grad_norm": 11.448313482341357, + "learning_rate": 4.944085811049321e-05, + "loss": 2.5401, + "mean_token_accuracy": 0.4137930989265442, + "step": 116715 + }, + { + "epoch": 0.1175616236638092, + "grad_norm": 11.042596152989422, + "learning_rate": 4.944077509002929e-05, + "loss": 2.3346, + "mean_token_accuracy": 0.4551724135875702, + "step": 116720 + }, + { + "epoch": 0.11756665971691338, + "grad_norm": 12.343340410772536, + "learning_rate": 4.944069206348002e-05, + "loss": 2.6679, + "mean_token_accuracy": 0.4068965554237366, + "step": 116725 + }, + { + "epoch": 0.11757169577001755, + "grad_norm": 10.807518279498833, + "learning_rate": 4.944060903084541e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.40689654350280763, + "step": 116730 + }, + { + "epoch": 0.11757673182312173, + "grad_norm": 10.609633143030225, + "learning_rate": 4.944052599212549e-05, + "loss": 2.313, + "mean_token_accuracy": 0.4379310369491577, + "step": 116735 + }, + { + "epoch": 0.1175817678762259, + "grad_norm": 10.04981050905842, + "learning_rate": 4.944044294732028e-05, + "loss": 2.2392, + "mean_token_accuracy": 0.44482758045196535, + "step": 116740 + }, + { + "epoch": 0.11758680392933007, + "grad_norm": 13.130474002526809, + "learning_rate": 4.944035989642981e-05, + "loss": 2.5623, + "mean_token_accuracy": 0.41034482717514037, + "step": 116745 + }, + { + "epoch": 0.11759183998243425, + "grad_norm": 9.840445667958704, + "learning_rate": 4.9440276839454106e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.4931034445762634, + "step": 116750 + }, + { + "epoch": 0.11759687603553842, + "grad_norm": 10.62719303654517, + "learning_rate": 4.944019377639319e-05, + "loss": 2.399, + "mean_token_accuracy": 0.45517241954803467, + "step": 116755 + }, + { + "epoch": 0.1176019120886426, + "grad_norm": 11.567061175900406, + "learning_rate": 4.944011070724706e-05, + "loss": 3.0578, + "mean_token_accuracy": 0.37586206793785093, + "step": 116760 + }, + { + "epoch": 0.11760694814174677, + "grad_norm": 12.958909484683227, + "learning_rate": 4.944002763201577e-05, + "loss": 2.5405, + "mean_token_accuracy": 0.40689654648303986, + "step": 116765 + }, + { + "epoch": 0.11761198419485094, + "grad_norm": 9.29276585877999, + "learning_rate": 4.943994455069932e-05, + "loss": 2.1623, + "mean_token_accuracy": 0.482758617401123, + "step": 116770 + }, + { + "epoch": 0.11761702024795512, + "grad_norm": 10.16086692536918, + "learning_rate": 4.9439861463297756e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.4241379380226135, + "step": 116775 + }, + { + "epoch": 0.11762205630105928, + "grad_norm": 11.966958024976213, + "learning_rate": 4.943977836981109e-05, + "loss": 3.0058, + "mean_token_accuracy": 0.36551723480224607, + "step": 116780 + }, + { + "epoch": 0.11762709235416345, + "grad_norm": 10.14445576897235, + "learning_rate": 4.9439695270239336e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.47931034564971925, + "step": 116785 + }, + { + "epoch": 0.11763212840726762, + "grad_norm": 10.99127178405551, + "learning_rate": 4.9439612164582526e-05, + "loss": 2.4016, + "mean_token_accuracy": 0.4206896543502808, + "step": 116790 + }, + { + "epoch": 0.1176371644603718, + "grad_norm": 13.946121313919505, + "learning_rate": 4.9439529052840686e-05, + "loss": 2.5516, + "mean_token_accuracy": 0.45862067937850953, + "step": 116795 + }, + { + "epoch": 0.11764220051347597, + "grad_norm": 10.770480312769632, + "learning_rate": 4.943944593501384e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.42413792610168455, + "step": 116800 + }, + { + "epoch": 0.11764723656658015, + "grad_norm": 9.747945798982782, + "learning_rate": 4.943936281110201e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.40443349480628965, + "step": 116805 + }, + { + "epoch": 0.11765227261968432, + "grad_norm": 8.63774571230082, + "learning_rate": 4.943927968110521e-05, + "loss": 2.2843, + "mean_token_accuracy": 0.42413793206214906, + "step": 116810 + }, + { + "epoch": 0.1176573086727885, + "grad_norm": 14.275075114798376, + "learning_rate": 4.943919654502347e-05, + "loss": 2.7727, + "mean_token_accuracy": 0.4103448152542114, + "step": 116815 + }, + { + "epoch": 0.11766234472589267, + "grad_norm": 10.896224554118199, + "learning_rate": 4.9439113402856806e-05, + "loss": 2.2958, + "mean_token_accuracy": 0.4448275864124298, + "step": 116820 + }, + { + "epoch": 0.11766738077899684, + "grad_norm": 10.215495976486038, + "learning_rate": 4.943903025460526e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.4482758641242981, + "step": 116825 + }, + { + "epoch": 0.11767241683210101, + "grad_norm": 11.09979579138643, + "learning_rate": 4.943894710026883e-05, + "loss": 2.638, + "mean_token_accuracy": 0.38620689511299133, + "step": 116830 + }, + { + "epoch": 0.11767745288520519, + "grad_norm": 6.843723155900329, + "learning_rate": 4.943886393984755e-05, + "loss": 2.2339, + "mean_token_accuracy": 0.4985221564769745, + "step": 116835 + }, + { + "epoch": 0.11768248893830936, + "grad_norm": 12.085533955454624, + "learning_rate": 4.943878077334146e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.3965517282485962, + "step": 116840 + }, + { + "epoch": 0.11768752499141354, + "grad_norm": 10.032777712656817, + "learning_rate": 4.943869760075055e-05, + "loss": 2.0214, + "mean_token_accuracy": 0.4965517222881317, + "step": 116845 + }, + { + "epoch": 0.1176925610445177, + "grad_norm": 10.75534712392264, + "learning_rate": 4.943861442207486e-05, + "loss": 2.7358, + "mean_token_accuracy": 0.3999999940395355, + "step": 116850 + }, + { + "epoch": 0.11769759709762187, + "grad_norm": 11.894288871575446, + "learning_rate": 4.943853123731443e-05, + "loss": 2.298, + "mean_token_accuracy": 0.4206896543502808, + "step": 116855 + }, + { + "epoch": 0.11770263315072604, + "grad_norm": 10.914205775975462, + "learning_rate": 4.943844804646925e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.44827585816383364, + "step": 116860 + }, + { + "epoch": 0.11770766920383022, + "grad_norm": 12.210442613010017, + "learning_rate": 4.943836484953937e-05, + "loss": 1.9752, + "mean_token_accuracy": 0.5295825660228729, + "step": 116865 + }, + { + "epoch": 0.11771270525693439, + "grad_norm": 9.530681257972288, + "learning_rate": 4.94382816465248e-05, + "loss": 2.1066, + "mean_token_accuracy": 0.4620689630508423, + "step": 116870 + }, + { + "epoch": 0.11771774131003857, + "grad_norm": 11.82857876694248, + "learning_rate": 4.943819843742557e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.4517241358757019, + "step": 116875 + }, + { + "epoch": 0.11772277736314274, + "grad_norm": 11.550901837025753, + "learning_rate": 4.943811522224169e-05, + "loss": 2.4304, + "mean_token_accuracy": 0.39655172228813174, + "step": 116880 + }, + { + "epoch": 0.11772781341624691, + "grad_norm": 11.051989524457058, + "learning_rate": 4.94380320009732e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.4255293428897858, + "step": 116885 + }, + { + "epoch": 0.11773284946935109, + "grad_norm": 8.099628719990488, + "learning_rate": 4.943794877362011e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.46896551847457885, + "step": 116890 + }, + { + "epoch": 0.11773788552245526, + "grad_norm": 11.378584371753458, + "learning_rate": 4.943786554018245e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.45862067937850953, + "step": 116895 + }, + { + "epoch": 0.11774292157555943, + "grad_norm": 10.332208158184434, + "learning_rate": 4.9437782300660234e-05, + "loss": 2.254, + "mean_token_accuracy": 0.47931033968925474, + "step": 116900 + }, + { + "epoch": 0.11774795762866361, + "grad_norm": 13.736697877725698, + "learning_rate": 4.94376990550535e-05, + "loss": 2.9497, + "mean_token_accuracy": 0.3586206942796707, + "step": 116905 + }, + { + "epoch": 0.11775299368176778, + "grad_norm": 10.67860134953222, + "learning_rate": 4.943761580336226e-05, + "loss": 2.746, + "mean_token_accuracy": 0.3517241418361664, + "step": 116910 + }, + { + "epoch": 0.11775802973487196, + "grad_norm": 12.495174976682623, + "learning_rate": 4.943753254558655e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.37931033968925476, + "step": 116915 + }, + { + "epoch": 0.11776306578797612, + "grad_norm": 7.76158216386975, + "learning_rate": 4.943744928172637e-05, + "loss": 2.1637, + "mean_token_accuracy": 0.4884452521800995, + "step": 116920 + }, + { + "epoch": 0.11776810184108029, + "grad_norm": 9.517951481718757, + "learning_rate": 4.943736601178176e-05, + "loss": 2.2113, + "mean_token_accuracy": 0.4434361755847931, + "step": 116925 + }, + { + "epoch": 0.11777313789418446, + "grad_norm": 13.937153001581242, + "learning_rate": 4.9437282735752746e-05, + "loss": 2.9494, + "mean_token_accuracy": 0.3310344785451889, + "step": 116930 + }, + { + "epoch": 0.11777817394728864, + "grad_norm": 12.714448270026521, + "learning_rate": 4.9437199453639336e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.4620689630508423, + "step": 116935 + }, + { + "epoch": 0.11778321000039281, + "grad_norm": 10.614352109131126, + "learning_rate": 4.9437116165441566e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.4000000059604645, + "step": 116940 + }, + { + "epoch": 0.11778824605349698, + "grad_norm": 11.56157409273577, + "learning_rate": 4.943703287115946e-05, + "loss": 2.6419, + "mean_token_accuracy": 0.4068965554237366, + "step": 116945 + }, + { + "epoch": 0.11779328210660116, + "grad_norm": 10.751747902612573, + "learning_rate": 4.943694957079303e-05, + "loss": 2.1691, + "mean_token_accuracy": 0.47586206793785096, + "step": 116950 + }, + { + "epoch": 0.11779831815970533, + "grad_norm": 9.273145195229677, + "learning_rate": 4.943686626434231e-05, + "loss": 2.2331, + "mean_token_accuracy": 0.43611615896224976, + "step": 116955 + }, + { + "epoch": 0.1178033542128095, + "grad_norm": 11.099075430529211, + "learning_rate": 4.9436782951807316e-05, + "loss": 2.2157, + "mean_token_accuracy": 0.43103447556495667, + "step": 116960 + }, + { + "epoch": 0.11780839026591368, + "grad_norm": 11.020795811539765, + "learning_rate": 4.9436699633188066e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.4206896543502808, + "step": 116965 + }, + { + "epoch": 0.11781342631901785, + "grad_norm": 11.039962977281919, + "learning_rate": 4.94366163084846e-05, + "loss": 2.6009, + "mean_token_accuracy": 0.37586206793785093, + "step": 116970 + }, + { + "epoch": 0.11781846237212203, + "grad_norm": 9.696212176677967, + "learning_rate": 4.943653297769693e-05, + "loss": 2.0978, + "mean_token_accuracy": 0.4689655125141144, + "step": 116975 + }, + { + "epoch": 0.1178234984252262, + "grad_norm": 11.797569378567681, + "learning_rate": 4.943644964082508e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.41379310488700866, + "step": 116980 + }, + { + "epoch": 0.11782853447833037, + "grad_norm": 10.673206046986072, + "learning_rate": 4.943636629786907e-05, + "loss": 2.7213, + "mean_token_accuracy": 0.3655172407627106, + "step": 116985 + }, + { + "epoch": 0.11783357053143453, + "grad_norm": 10.94125678942213, + "learning_rate": 4.943628294882893e-05, + "loss": 3.3002, + "mean_token_accuracy": 0.3931034475564957, + "step": 116990 + }, + { + "epoch": 0.11783860658453871, + "grad_norm": 11.685919606684527, + "learning_rate": 4.943619959370469e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.44827585816383364, + "step": 116995 + }, + { + "epoch": 0.11784364263764288, + "grad_norm": 9.678789269292205, + "learning_rate": 4.943611623249635e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.42758620381355283, + "step": 117000 + }, + { + "epoch": 0.11784867869074706, + "grad_norm": 8.748330517344156, + "learning_rate": 4.9436032865203945e-05, + "loss": 2.655, + "mean_token_accuracy": 0.42758620381355283, + "step": 117005 + }, + { + "epoch": 0.11785371474385123, + "grad_norm": 17.70168653896884, + "learning_rate": 4.943594949182751e-05, + "loss": 2.7592, + "mean_token_accuracy": 0.4310344785451889, + "step": 117010 + }, + { + "epoch": 0.1178587507969554, + "grad_norm": 10.709325792840621, + "learning_rate": 4.9435866112367044e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.45172412395477296, + "step": 117015 + }, + { + "epoch": 0.11786378685005958, + "grad_norm": 10.410880206000954, + "learning_rate": 4.9435782726822597e-05, + "loss": 2.1926, + "mean_token_accuracy": 0.4467029690742493, + "step": 117020 + }, + { + "epoch": 0.11786882290316375, + "grad_norm": 10.399901059749748, + "learning_rate": 4.9435699335194175e-05, + "loss": 2.0198, + "mean_token_accuracy": 0.4724137902259827, + "step": 117025 + }, + { + "epoch": 0.11787385895626792, + "grad_norm": 9.679408712776091, + "learning_rate": 4.94356159374818e-05, + "loss": 2.2997, + "mean_token_accuracy": 0.4034482717514038, + "step": 117030 + }, + { + "epoch": 0.1178788950093721, + "grad_norm": 11.190594597219697, + "learning_rate": 4.943553253368551e-05, + "loss": 2.6156, + "mean_token_accuracy": 0.39310344457626345, + "step": 117035 + }, + { + "epoch": 0.11788393106247627, + "grad_norm": 12.247828846290636, + "learning_rate": 4.94354491238053e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.4034482777118683, + "step": 117040 + }, + { + "epoch": 0.11788896711558045, + "grad_norm": 11.19305764561484, + "learning_rate": 4.943536570784123e-05, + "loss": 2.4112, + "mean_token_accuracy": 0.4551724076271057, + "step": 117045 + }, + { + "epoch": 0.11789400316868462, + "grad_norm": 5.6205515226935185, + "learning_rate": 4.94352822857933e-05, + "loss": 2.1552, + "mean_token_accuracy": 0.47426108121871946, + "step": 117050 + }, + { + "epoch": 0.11789903922178879, + "grad_norm": 10.693146134476525, + "learning_rate": 4.9435198857661535e-05, + "loss": 2.5307, + "mean_token_accuracy": 0.3931034505367279, + "step": 117055 + }, + { + "epoch": 0.11790407527489295, + "grad_norm": 11.972952121323944, + "learning_rate": 4.943511542344596e-05, + "loss": 2.1609, + "mean_token_accuracy": 0.46551724076271056, + "step": 117060 + }, + { + "epoch": 0.11790911132799713, + "grad_norm": 10.978410036891553, + "learning_rate": 4.9435031983146603e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.4259528160095215, + "step": 117065 + }, + { + "epoch": 0.1179141473811013, + "grad_norm": 17.227658551889444, + "learning_rate": 4.943494853676348e-05, + "loss": 2.5962, + "mean_token_accuracy": 0.37241379022598264, + "step": 117070 + }, + { + "epoch": 0.11791918343420547, + "grad_norm": 8.61719860457293, + "learning_rate": 4.943486508429661e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.44984875321388246, + "step": 117075 + }, + { + "epoch": 0.11792421948730965, + "grad_norm": 13.023991930899264, + "learning_rate": 4.943478162574604e-05, + "loss": 3.0853, + "mean_token_accuracy": 0.34827586114406583, + "step": 117080 + }, + { + "epoch": 0.11792925554041382, + "grad_norm": 10.756462401450111, + "learning_rate": 4.943469816111176e-05, + "loss": 2.1238, + "mean_token_accuracy": 0.4517241418361664, + "step": 117085 + }, + { + "epoch": 0.117934291593518, + "grad_norm": 10.580216514491761, + "learning_rate": 4.9434614690393815e-05, + "loss": 2.592, + "mean_token_accuracy": 0.4634236454963684, + "step": 117090 + }, + { + "epoch": 0.11793932764662217, + "grad_norm": 9.200668956704202, + "learning_rate": 4.9434531213592226e-05, + "loss": 2.357, + "mean_token_accuracy": 0.4517241299152374, + "step": 117095 + }, + { + "epoch": 0.11794436369972634, + "grad_norm": 11.751319282570229, + "learning_rate": 4.943444773070701e-05, + "loss": 2.3512, + "mean_token_accuracy": 0.47586206793785096, + "step": 117100 + }, + { + "epoch": 0.11794939975283052, + "grad_norm": 9.95946497456793, + "learning_rate": 4.943436424173819e-05, + "loss": 2.2289, + "mean_token_accuracy": 0.4448275864124298, + "step": 117105 + }, + { + "epoch": 0.11795443580593469, + "grad_norm": 14.116453248101966, + "learning_rate": 4.943428074668581e-05, + "loss": 2.6454, + "mean_token_accuracy": 0.39310343861579894, + "step": 117110 + }, + { + "epoch": 0.11795947185903886, + "grad_norm": 11.529620933914135, + "learning_rate": 4.943419724554986e-05, + "loss": 2.4068, + "mean_token_accuracy": 0.46206897497177124, + "step": 117115 + }, + { + "epoch": 0.11796450791214304, + "grad_norm": 11.08098099742401, + "learning_rate": 4.9434113738330374e-05, + "loss": 3.1405, + "mean_token_accuracy": 0.4, + "step": 117120 + }, + { + "epoch": 0.11796954396524721, + "grad_norm": 10.394389551154124, + "learning_rate": 4.94340302250274e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.4172413766384125, + "step": 117125 + }, + { + "epoch": 0.11797458001835137, + "grad_norm": 10.161269927461255, + "learning_rate": 4.9433946705640916e-05, + "loss": 2.6113, + "mean_token_accuracy": 0.4068965494632721, + "step": 117130 + }, + { + "epoch": 0.11797961607145555, + "grad_norm": 10.075927651983134, + "learning_rate": 4.943386318017099e-05, + "loss": 2.1861, + "mean_token_accuracy": 0.47931033968925474, + "step": 117135 + }, + { + "epoch": 0.11798465212455972, + "grad_norm": 10.388338409910235, + "learning_rate": 4.943377964861761e-05, + "loss": 2.2907, + "mean_token_accuracy": 0.4413793087005615, + "step": 117140 + }, + { + "epoch": 0.11798968817766389, + "grad_norm": 10.3047240369664, + "learning_rate": 4.943369611098082e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.41034482717514037, + "step": 117145 + }, + { + "epoch": 0.11799472423076807, + "grad_norm": 9.922034938102792, + "learning_rate": 4.943361256726064e-05, + "loss": 2.5867, + "mean_token_accuracy": 0.4034482777118683, + "step": 117150 + }, + { + "epoch": 0.11799976028387224, + "grad_norm": 9.57198006769421, + "learning_rate": 4.9433529017457084e-05, + "loss": 2.6722, + "mean_token_accuracy": 0.3655172407627106, + "step": 117155 + }, + { + "epoch": 0.11800479633697641, + "grad_norm": 12.418429027404768, + "learning_rate": 4.943344546157019e-05, + "loss": 2.5296, + "mean_token_accuracy": 0.45172414779663084, + "step": 117160 + }, + { + "epoch": 0.11800983239008059, + "grad_norm": 11.285319776402508, + "learning_rate": 4.9433361899599975e-05, + "loss": 2.5, + "mean_token_accuracy": 0.41724138259887694, + "step": 117165 + }, + { + "epoch": 0.11801486844318476, + "grad_norm": 10.226897098447497, + "learning_rate": 4.9433278331546456e-05, + "loss": 2.1124, + "mean_token_accuracy": 0.4931034445762634, + "step": 117170 + }, + { + "epoch": 0.11801990449628894, + "grad_norm": 12.192806240284487, + "learning_rate": 4.9433194757409655e-05, + "loss": 2.1581, + "mean_token_accuracy": 0.5241379320621491, + "step": 117175 + }, + { + "epoch": 0.11802494054939311, + "grad_norm": 8.217158210169593, + "learning_rate": 4.943311117718961e-05, + "loss": 2.4065, + "mean_token_accuracy": 0.4103448331356049, + "step": 117180 + }, + { + "epoch": 0.11802997660249728, + "grad_norm": 10.021513594986747, + "learning_rate": 4.9433027590886335e-05, + "loss": 2.4971, + "mean_token_accuracy": 0.44827585816383364, + "step": 117185 + }, + { + "epoch": 0.11803501265560146, + "grad_norm": 10.032821865296821, + "learning_rate": 4.9432943998499836e-05, + "loss": 2.7352, + "mean_token_accuracy": 0.4364791214466095, + "step": 117190 + }, + { + "epoch": 0.11804004870870563, + "grad_norm": 10.687139729635792, + "learning_rate": 4.943286040003017e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.43793103098869324, + "step": 117195 + }, + { + "epoch": 0.11804508476180979, + "grad_norm": 10.806512773438895, + "learning_rate": 4.9432776795477345e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.45862069725990295, + "step": 117200 + }, + { + "epoch": 0.11805012081491396, + "grad_norm": 10.987334124490186, + "learning_rate": 4.943269318484137e-05, + "loss": 2.7004, + "mean_token_accuracy": 0.4034482777118683, + "step": 117205 + }, + { + "epoch": 0.11805515686801814, + "grad_norm": 10.861353474319626, + "learning_rate": 4.943260956812229e-05, + "loss": 2.7197, + "mean_token_accuracy": 0.35172412991523744, + "step": 117210 + }, + { + "epoch": 0.11806019292112231, + "grad_norm": 9.601023839327263, + "learning_rate": 4.9432525945320116e-05, + "loss": 2.7831, + "mean_token_accuracy": 0.3758620619773865, + "step": 117215 + }, + { + "epoch": 0.11806522897422649, + "grad_norm": 10.853734434122517, + "learning_rate": 4.943244231643487e-05, + "loss": 2.5222, + "mean_token_accuracy": 0.40514216423034666, + "step": 117220 + }, + { + "epoch": 0.11807026502733066, + "grad_norm": 10.527451331740146, + "learning_rate": 4.9432358681466585e-05, + "loss": 2.5907, + "mean_token_accuracy": 0.3482758581638336, + "step": 117225 + }, + { + "epoch": 0.11807530108043483, + "grad_norm": 10.195285015841613, + "learning_rate": 4.943227504041528e-05, + "loss": 2.1806, + "mean_token_accuracy": 0.4882637679576874, + "step": 117230 + }, + { + "epoch": 0.118080337133539, + "grad_norm": 9.334626967617538, + "learning_rate": 4.9432191393280975e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.5022988557815552, + "step": 117235 + }, + { + "epoch": 0.11808537318664318, + "grad_norm": 12.024272279862098, + "learning_rate": 4.943210774006369e-05, + "loss": 1.8735, + "mean_token_accuracy": 0.5379310369491577, + "step": 117240 + }, + { + "epoch": 0.11809040923974735, + "grad_norm": 10.52909907742804, + "learning_rate": 4.9432024080763456e-05, + "loss": 2.376, + "mean_token_accuracy": 0.42068964838981626, + "step": 117245 + }, + { + "epoch": 0.11809544529285153, + "grad_norm": 12.035014513484528, + "learning_rate": 4.9431940415380295e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.4413793087005615, + "step": 117250 + }, + { + "epoch": 0.1181004813459557, + "grad_norm": 9.578910746855403, + "learning_rate": 4.9431856743914224e-05, + "loss": 2.2358, + "mean_token_accuracy": 0.45517241954803467, + "step": 117255 + }, + { + "epoch": 0.11810551739905988, + "grad_norm": 10.68387916337257, + "learning_rate": 4.943177306636527e-05, + "loss": 2.233, + "mean_token_accuracy": 0.4366001129150391, + "step": 117260 + }, + { + "epoch": 0.11811055345216404, + "grad_norm": 10.3284038067129, + "learning_rate": 4.943168938273347e-05, + "loss": 2.0098, + "mean_token_accuracy": 0.47931034564971925, + "step": 117265 + }, + { + "epoch": 0.11811558950526821, + "grad_norm": 10.892552848245662, + "learning_rate": 4.943160569301882e-05, + "loss": 2.449, + "mean_token_accuracy": 0.42068966031074523, + "step": 117270 + }, + { + "epoch": 0.11812062555837238, + "grad_norm": 10.6957681776383, + "learning_rate": 4.943152199722137e-05, + "loss": 2.693, + "mean_token_accuracy": 0.40689654350280763, + "step": 117275 + }, + { + "epoch": 0.11812566161147656, + "grad_norm": 11.610918461643395, + "learning_rate": 4.9431438295341123e-05, + "loss": 2.7006, + "mean_token_accuracy": 0.3793103456497192, + "step": 117280 + }, + { + "epoch": 0.11813069766458073, + "grad_norm": 13.23784821915778, + "learning_rate": 4.9431354587378104e-05, + "loss": 2.9336, + "mean_token_accuracy": 0.37241379618644715, + "step": 117285 + }, + { + "epoch": 0.1181357337176849, + "grad_norm": 11.482730018297387, + "learning_rate": 4.943127087333235e-05, + "loss": 3.1177, + "mean_token_accuracy": 0.4085299432277679, + "step": 117290 + }, + { + "epoch": 0.11814076977078908, + "grad_norm": 13.683177575608765, + "learning_rate": 4.943118715320388e-05, + "loss": 2.5691, + "mean_token_accuracy": 0.42758620381355283, + "step": 117295 + }, + { + "epoch": 0.11814580582389325, + "grad_norm": 10.762445831827403, + "learning_rate": 4.94311034269927e-05, + "loss": 2.3604, + "mean_token_accuracy": 0.44827585816383364, + "step": 117300 + }, + { + "epoch": 0.11815084187699743, + "grad_norm": 9.783928643717415, + "learning_rate": 4.9431019694698856e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.43448275327682495, + "step": 117305 + }, + { + "epoch": 0.1181558779301016, + "grad_norm": 9.329378896819275, + "learning_rate": 4.943093595632236e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.458620685338974, + "step": 117310 + }, + { + "epoch": 0.11816091398320577, + "grad_norm": 11.55787108820168, + "learning_rate": 4.943085221186323e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.4103448212146759, + "step": 117315 + }, + { + "epoch": 0.11816595003630995, + "grad_norm": 11.236494829853308, + "learning_rate": 4.943076846132151e-05, + "loss": 2.642, + "mean_token_accuracy": 0.4116757333278656, + "step": 117320 + }, + { + "epoch": 0.11817098608941412, + "grad_norm": 10.767562652970813, + "learning_rate": 4.94306847046972e-05, + "loss": 2.4351, + "mean_token_accuracy": 0.4068965554237366, + "step": 117325 + }, + { + "epoch": 0.1181760221425183, + "grad_norm": 14.274857540631332, + "learning_rate": 4.943060094199034e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.4671506345272064, + "step": 117330 + }, + { + "epoch": 0.11818105819562245, + "grad_norm": 10.392036178706148, + "learning_rate": 4.943051717320094e-05, + "loss": 2.2696, + "mean_token_accuracy": 0.4413793087005615, + "step": 117335 + }, + { + "epoch": 0.11818609424872663, + "grad_norm": 11.054529643493035, + "learning_rate": 4.943043339832903e-05, + "loss": 2.4521, + "mean_token_accuracy": 0.35862069129943847, + "step": 117340 + }, + { + "epoch": 0.1181911303018308, + "grad_norm": 10.887870657814348, + "learning_rate": 4.9430349617374635e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.41034482717514037, + "step": 117345 + }, + { + "epoch": 0.11819616635493498, + "grad_norm": 11.916027275345636, + "learning_rate": 4.9430265830337776e-05, + "loss": 2.6069, + "mean_token_accuracy": 0.417241370677948, + "step": 117350 + }, + { + "epoch": 0.11820120240803915, + "grad_norm": 12.681660600658997, + "learning_rate": 4.9430182037218476e-05, + "loss": 2.1111, + "mean_token_accuracy": 0.5194797456264496, + "step": 117355 + }, + { + "epoch": 0.11820623846114332, + "grad_norm": 11.660412767454138, + "learning_rate": 4.943009823801676e-05, + "loss": 2.2791, + "mean_token_accuracy": 0.4366606116294861, + "step": 117360 + }, + { + "epoch": 0.1182112745142475, + "grad_norm": 12.597235078045887, + "learning_rate": 4.9430014432732644e-05, + "loss": 2.5437, + "mean_token_accuracy": 0.4413793087005615, + "step": 117365 + }, + { + "epoch": 0.11821631056735167, + "grad_norm": 10.116875910105714, + "learning_rate": 4.942993062136615e-05, + "loss": 2.8002, + "mean_token_accuracy": 0.44137930274009707, + "step": 117370 + }, + { + "epoch": 0.11822134662045584, + "grad_norm": 9.784215524689278, + "learning_rate": 4.9429846803917315e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.458620685338974, + "step": 117375 + }, + { + "epoch": 0.11822638267356002, + "grad_norm": 11.574574742106151, + "learning_rate": 4.942976298038616e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.46551724076271056, + "step": 117380 + }, + { + "epoch": 0.11823141872666419, + "grad_norm": 10.16596423409803, + "learning_rate": 4.94296791507727e-05, + "loss": 2.009, + "mean_token_accuracy": 0.482758617401123, + "step": 117385 + }, + { + "epoch": 0.11823645477976837, + "grad_norm": 11.13949715530086, + "learning_rate": 4.942959531507696e-05, + "loss": 2.5864, + "mean_token_accuracy": 0.41724138259887694, + "step": 117390 + }, + { + "epoch": 0.11824149083287254, + "grad_norm": 11.017517344902009, + "learning_rate": 4.9429511473298973e-05, + "loss": 3.1637, + "mean_token_accuracy": 0.3758620709180832, + "step": 117395 + }, + { + "epoch": 0.11824652688597671, + "grad_norm": 11.264072718340433, + "learning_rate": 4.9429427625438746e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.3827586233615875, + "step": 117400 + }, + { + "epoch": 0.11825156293908087, + "grad_norm": 10.684505736246438, + "learning_rate": 4.942934377149631e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.44827585816383364, + "step": 117405 + }, + { + "epoch": 0.11825659899218505, + "grad_norm": 8.699446510608043, + "learning_rate": 4.942925991147169e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.4413793087005615, + "step": 117410 + }, + { + "epoch": 0.11826163504528922, + "grad_norm": 9.259399237925509, + "learning_rate": 4.9429176045364915e-05, + "loss": 2.0575, + "mean_token_accuracy": 0.4620689570903778, + "step": 117415 + }, + { + "epoch": 0.1182666710983934, + "grad_norm": 11.25169555740929, + "learning_rate": 4.942909217317599e-05, + "loss": 2.6031, + "mean_token_accuracy": 0.3862069010734558, + "step": 117420 + }, + { + "epoch": 0.11827170715149757, + "grad_norm": 8.369906799755208, + "learning_rate": 4.942900829490496e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.4620689690113068, + "step": 117425 + }, + { + "epoch": 0.11827674320460174, + "grad_norm": 12.00864604398708, + "learning_rate": 4.942892441055183e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.3862068891525269, + "step": 117430 + }, + { + "epoch": 0.11828177925770592, + "grad_norm": 13.424480548541231, + "learning_rate": 4.942884052011663e-05, + "loss": 2.7027, + "mean_token_accuracy": 0.39310343861579894, + "step": 117435 + }, + { + "epoch": 0.11828681531081009, + "grad_norm": 11.532548787149036, + "learning_rate": 4.9428756623599394e-05, + "loss": 2.7056, + "mean_token_accuracy": 0.33448276221752166, + "step": 117440 + }, + { + "epoch": 0.11829185136391426, + "grad_norm": 15.206613147743617, + "learning_rate": 4.942867272100013e-05, + "loss": 2.7865, + "mean_token_accuracy": 0.40344826579093934, + "step": 117445 + }, + { + "epoch": 0.11829688741701844, + "grad_norm": 9.009886833345593, + "learning_rate": 4.9428588812318864e-05, + "loss": 2.6155, + "mean_token_accuracy": 0.4137930989265442, + "step": 117450 + }, + { + "epoch": 0.11830192347012261, + "grad_norm": 9.085238104629758, + "learning_rate": 4.942850489755563e-05, + "loss": 2.1024, + "mean_token_accuracy": 0.5275862038135528, + "step": 117455 + }, + { + "epoch": 0.11830695952322678, + "grad_norm": 10.336552955561045, + "learning_rate": 4.942842097671043e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.3931034505367279, + "step": 117460 + }, + { + "epoch": 0.11831199557633096, + "grad_norm": 10.179359474179078, + "learning_rate": 4.942833704978331e-05, + "loss": 2.4306, + "mean_token_accuracy": 0.43793103098869324, + "step": 117465 + }, + { + "epoch": 0.11831703162943513, + "grad_norm": 13.22413514023442, + "learning_rate": 4.942825311677428e-05, + "loss": 2.9233, + "mean_token_accuracy": 0.3827586233615875, + "step": 117470 + }, + { + "epoch": 0.11832206768253929, + "grad_norm": 9.05398544786203, + "learning_rate": 4.942816917768337e-05, + "loss": 2.3322, + "mean_token_accuracy": 0.4344827592372894, + "step": 117475 + }, + { + "epoch": 0.11832710373564347, + "grad_norm": 13.285777591973783, + "learning_rate": 4.94280852325106e-05, + "loss": 2.1749, + "mean_token_accuracy": 0.43793103098869324, + "step": 117480 + }, + { + "epoch": 0.11833213978874764, + "grad_norm": 11.856534996611979, + "learning_rate": 4.9428001281255994e-05, + "loss": 2.1179, + "mean_token_accuracy": 0.4671182245016098, + "step": 117485 + }, + { + "epoch": 0.11833717584185181, + "grad_norm": 7.868162655609711, + "learning_rate": 4.9427917323919576e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.4589231610298157, + "step": 117490 + }, + { + "epoch": 0.11834221189495599, + "grad_norm": 9.978699211146864, + "learning_rate": 4.942783336050136e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4206896424293518, + "step": 117495 + }, + { + "epoch": 0.11834724794806016, + "grad_norm": 9.328487121688646, + "learning_rate": 4.942774939100139e-05, + "loss": 2.6718, + "mean_token_accuracy": 0.36896551251411436, + "step": 117500 + }, + { + "epoch": 0.11835228400116433, + "grad_norm": 10.846124902863966, + "learning_rate": 4.942766541541967e-05, + "loss": 2.1751, + "mean_token_accuracy": 0.4241379380226135, + "step": 117505 + }, + { + "epoch": 0.11835732005426851, + "grad_norm": 10.879567136373531, + "learning_rate": 4.942758143375623e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4137930989265442, + "step": 117510 + }, + { + "epoch": 0.11836235610737268, + "grad_norm": 9.427494804127166, + "learning_rate": 4.94274974460111e-05, + "loss": 2.2597, + "mean_token_accuracy": 0.3999999940395355, + "step": 117515 + }, + { + "epoch": 0.11836739216047686, + "grad_norm": 9.78666787103167, + "learning_rate": 4.942741345218429e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.42758620977401735, + "step": 117520 + }, + { + "epoch": 0.11837242821358103, + "grad_norm": 10.318343540428916, + "learning_rate": 4.942732945227583e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.4034482777118683, + "step": 117525 + }, + { + "epoch": 0.1183774642666852, + "grad_norm": 9.002988706579933, + "learning_rate": 4.9427245446285746e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.42068964838981626, + "step": 117530 + }, + { + "epoch": 0.11838250031978938, + "grad_norm": 9.600980834146766, + "learning_rate": 4.9427161434214066e-05, + "loss": 2.2218, + "mean_token_accuracy": 0.4551724076271057, + "step": 117535 + }, + { + "epoch": 0.11838753637289355, + "grad_norm": 13.096164722424598, + "learning_rate": 4.942707741606079e-05, + "loss": 2.4372, + "mean_token_accuracy": 0.4034482717514038, + "step": 117540 + }, + { + "epoch": 0.11839257242599771, + "grad_norm": 10.407059675104497, + "learning_rate": 4.942699339182596e-05, + "loss": 2.534, + "mean_token_accuracy": 0.4, + "step": 117545 + }, + { + "epoch": 0.11839760847910188, + "grad_norm": 11.990428529160273, + "learning_rate": 4.942690936150961e-05, + "loss": 2.8298, + "mean_token_accuracy": 0.37241379022598264, + "step": 117550 + }, + { + "epoch": 0.11840264453220606, + "grad_norm": 10.56402549219765, + "learning_rate": 4.9426825325111744e-05, + "loss": 2.8074, + "mean_token_accuracy": 0.3482758581638336, + "step": 117555 + }, + { + "epoch": 0.11840768058531023, + "grad_norm": 10.793341755636009, + "learning_rate": 4.942674128263239e-05, + "loss": 2.5085, + "mean_token_accuracy": 0.4448275864124298, + "step": 117560 + }, + { + "epoch": 0.1184127166384144, + "grad_norm": 8.77722281746387, + "learning_rate": 4.9426657234071566e-05, + "loss": 2.4511, + "mean_token_accuracy": 0.3896551787853241, + "step": 117565 + }, + { + "epoch": 0.11841775269151858, + "grad_norm": 10.761729214711456, + "learning_rate": 4.942657317942931e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.43793103098869324, + "step": 117570 + }, + { + "epoch": 0.11842278874462275, + "grad_norm": 20.087335454805636, + "learning_rate": 4.942648911870563e-05, + "loss": 2.4982, + "mean_token_accuracy": 0.43266788125038147, + "step": 117575 + }, + { + "epoch": 0.11842782479772693, + "grad_norm": 9.513108457658975, + "learning_rate": 4.942640505190057e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.42758620381355283, + "step": 117580 + }, + { + "epoch": 0.1184328608508311, + "grad_norm": 13.829662132590203, + "learning_rate": 4.942632097901413e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.4068965524435043, + "step": 117585 + }, + { + "epoch": 0.11843789690393527, + "grad_norm": 11.000197874428952, + "learning_rate": 4.9426236900046344e-05, + "loss": 2.404, + "mean_token_accuracy": 0.4103448331356049, + "step": 117590 + }, + { + "epoch": 0.11844293295703945, + "grad_norm": 12.16239773865491, + "learning_rate": 4.9426152814997226e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.4448275864124298, + "step": 117595 + }, + { + "epoch": 0.11844796901014362, + "grad_norm": 12.171884762290903, + "learning_rate": 4.942606872386681e-05, + "loss": 2.8885, + "mean_token_accuracy": 0.41034482717514037, + "step": 117600 + }, + { + "epoch": 0.1184530050632478, + "grad_norm": 10.067866480557523, + "learning_rate": 4.9425984626655134e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.3827586233615875, + "step": 117605 + }, + { + "epoch": 0.11845804111635197, + "grad_norm": 11.085957033618337, + "learning_rate": 4.942590052336219e-05, + "loss": 2.2385, + "mean_token_accuracy": 0.4448275864124298, + "step": 117610 + }, + { + "epoch": 0.11846307716945613, + "grad_norm": 12.088177247722781, + "learning_rate": 4.942581641398802e-05, + "loss": 2.6982, + "mean_token_accuracy": 0.4137930989265442, + "step": 117615 + }, + { + "epoch": 0.1184681132225603, + "grad_norm": 9.04376959477072, + "learning_rate": 4.942573229853264e-05, + "loss": 2.3519, + "mean_token_accuracy": 0.441379314661026, + "step": 117620 + }, + { + "epoch": 0.11847314927566448, + "grad_norm": 8.741414380852834, + "learning_rate": 4.942564817699608e-05, + "loss": 2.42, + "mean_token_accuracy": 0.3827586203813553, + "step": 117625 + }, + { + "epoch": 0.11847818532876865, + "grad_norm": 11.81567649432294, + "learning_rate": 4.942556404937836e-05, + "loss": 2.0234, + "mean_token_accuracy": 0.4862068951129913, + "step": 117630 + }, + { + "epoch": 0.11848322138187282, + "grad_norm": 12.698237896206757, + "learning_rate": 4.94254799156795e-05, + "loss": 2.447, + "mean_token_accuracy": 0.4137930929660797, + "step": 117635 + }, + { + "epoch": 0.118488257434977, + "grad_norm": 7.251000999991585, + "learning_rate": 4.942539577589952e-05, + "loss": 2.1174, + "mean_token_accuracy": 0.47586206793785096, + "step": 117640 + }, + { + "epoch": 0.11849329348808117, + "grad_norm": 13.115054723603714, + "learning_rate": 4.942531163003846e-05, + "loss": 2.4662, + "mean_token_accuracy": 0.42758620977401735, + "step": 117645 + }, + { + "epoch": 0.11849832954118535, + "grad_norm": 11.066373482963813, + "learning_rate": 4.942522747809633e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.4344827622175217, + "step": 117650 + }, + { + "epoch": 0.11850336559428952, + "grad_norm": 11.485186188581066, + "learning_rate": 4.9425143320073164e-05, + "loss": 2.371, + "mean_token_accuracy": 0.4344827592372894, + "step": 117655 + }, + { + "epoch": 0.1185084016473937, + "grad_norm": 10.44216665598516, + "learning_rate": 4.9425059155968964e-05, + "loss": 2.3079, + "mean_token_accuracy": 0.4640653312206268, + "step": 117660 + }, + { + "epoch": 0.11851343770049787, + "grad_norm": 11.114178055563702, + "learning_rate": 4.942497498578377e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.4275861978530884, + "step": 117665 + }, + { + "epoch": 0.11851847375360204, + "grad_norm": 10.757586622165972, + "learning_rate": 4.94248908095176e-05, + "loss": 2.4873, + "mean_token_accuracy": 0.4172413766384125, + "step": 117670 + }, + { + "epoch": 0.11852350980670621, + "grad_norm": 10.001279349511824, + "learning_rate": 4.9424806627170494e-05, + "loss": 2.103, + "mean_token_accuracy": 0.5206896424293518, + "step": 117675 + }, + { + "epoch": 0.11852854585981039, + "grad_norm": 11.860519748174326, + "learning_rate": 4.9424722438742455e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.3808832406997681, + "step": 117680 + }, + { + "epoch": 0.11853358191291455, + "grad_norm": 13.110566053895788, + "learning_rate": 4.942463824423351e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.4344827592372894, + "step": 117685 + }, + { + "epoch": 0.11853861796601872, + "grad_norm": 13.694202556480725, + "learning_rate": 4.9424554043643686e-05, + "loss": 2.8296, + "mean_token_accuracy": 0.3655172407627106, + "step": 117690 + }, + { + "epoch": 0.1185436540191229, + "grad_norm": 11.035358788701384, + "learning_rate": 4.9424469836973004e-05, + "loss": 2.5899, + "mean_token_accuracy": 0.38275861740112305, + "step": 117695 + }, + { + "epoch": 0.11854869007222707, + "grad_norm": 10.123509006224824, + "learning_rate": 4.9424385624221486e-05, + "loss": 2.0671, + "mean_token_accuracy": 0.4896551787853241, + "step": 117700 + }, + { + "epoch": 0.11855372612533124, + "grad_norm": 9.839207093390044, + "learning_rate": 4.942430140538916e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.4068965494632721, + "step": 117705 + }, + { + "epoch": 0.11855876217843542, + "grad_norm": 11.423751810183264, + "learning_rate": 4.942421718047605e-05, + "loss": 2.2464, + "mean_token_accuracy": 0.4310344815254211, + "step": 117710 + }, + { + "epoch": 0.11856379823153959, + "grad_norm": 10.284298461641127, + "learning_rate": 4.942413294948217e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.3862069010734558, + "step": 117715 + }, + { + "epoch": 0.11856883428464376, + "grad_norm": 12.764844554029859, + "learning_rate": 4.9424048712407554e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.3965517282485962, + "step": 117720 + }, + { + "epoch": 0.11857387033774794, + "grad_norm": 12.938438140374915, + "learning_rate": 4.942396446925222e-05, + "loss": 2.4212, + "mean_token_accuracy": 0.44137929677963256, + "step": 117725 + }, + { + "epoch": 0.11857890639085211, + "grad_norm": 11.029249170841775, + "learning_rate": 4.9423880220016196e-05, + "loss": 2.6026, + "mean_token_accuracy": 0.4, + "step": 117730 + }, + { + "epoch": 0.11858394244395629, + "grad_norm": 9.138235091635813, + "learning_rate": 4.94237959646995e-05, + "loss": 2.5885, + "mean_token_accuracy": 0.3310344785451889, + "step": 117735 + }, + { + "epoch": 0.11858897849706046, + "grad_norm": 9.311674742558143, + "learning_rate": 4.942371170330216e-05, + "loss": 2.2073, + "mean_token_accuracy": 0.4689655125141144, + "step": 117740 + }, + { + "epoch": 0.11859401455016463, + "grad_norm": 10.121723998478167, + "learning_rate": 4.942362743582419e-05, + "loss": 2.5153, + "mean_token_accuracy": 0.43635813891887665, + "step": 117745 + }, + { + "epoch": 0.11859905060326881, + "grad_norm": 11.286087147679487, + "learning_rate": 4.9423543162265624e-05, + "loss": 2.3436, + "mean_token_accuracy": 0.4620689630508423, + "step": 117750 + }, + { + "epoch": 0.11860408665637297, + "grad_norm": 9.099147944692174, + "learning_rate": 4.942345888262649e-05, + "loss": 2.5682, + "mean_token_accuracy": 0.3931034505367279, + "step": 117755 + }, + { + "epoch": 0.11860912270947714, + "grad_norm": 10.335274921440492, + "learning_rate": 4.942337459690679e-05, + "loss": 1.9992, + "mean_token_accuracy": 0.4862069010734558, + "step": 117760 + }, + { + "epoch": 0.11861415876258131, + "grad_norm": 9.88447416569921, + "learning_rate": 4.9423290305106564e-05, + "loss": 1.995, + "mean_token_accuracy": 0.5310344815254211, + "step": 117765 + }, + { + "epoch": 0.11861919481568549, + "grad_norm": 10.34007925251923, + "learning_rate": 4.942320600722583e-05, + "loss": 2.1413, + "mean_token_accuracy": 0.452873569726944, + "step": 117770 + }, + { + "epoch": 0.11862423086878966, + "grad_norm": 10.102537843597744, + "learning_rate": 4.942312170326461e-05, + "loss": 2.4418, + "mean_token_accuracy": 0.41034482419490814, + "step": 117775 + }, + { + "epoch": 0.11862926692189384, + "grad_norm": 8.945151886829414, + "learning_rate": 4.942303739322294e-05, + "loss": 2.1897, + "mean_token_accuracy": 0.4517241358757019, + "step": 117780 + }, + { + "epoch": 0.11863430297499801, + "grad_norm": 8.971994918877126, + "learning_rate": 4.942295307710083e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.4931034505367279, + "step": 117785 + }, + { + "epoch": 0.11863933902810218, + "grad_norm": 11.420219573034966, + "learning_rate": 4.942286875489831e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.49999999403953554, + "step": 117790 + }, + { + "epoch": 0.11864437508120636, + "grad_norm": 11.171050385377344, + "learning_rate": 4.94227844266154e-05, + "loss": 2.7947, + "mean_token_accuracy": 0.3738656997680664, + "step": 117795 + }, + { + "epoch": 0.11864941113431053, + "grad_norm": 13.69795820936663, + "learning_rate": 4.942270009225212e-05, + "loss": 2.4582, + "mean_token_accuracy": 0.3965517163276672, + "step": 117800 + }, + { + "epoch": 0.1186544471874147, + "grad_norm": 11.448656436044338, + "learning_rate": 4.94226157518085e-05, + "loss": 2.1673, + "mean_token_accuracy": 0.4482758641242981, + "step": 117805 + }, + { + "epoch": 0.11865948324051888, + "grad_norm": 8.190796980509887, + "learning_rate": 4.942253140528456e-05, + "loss": 2.1994, + "mean_token_accuracy": 0.4896551787853241, + "step": 117810 + }, + { + "epoch": 0.11866451929362305, + "grad_norm": 11.821608159128315, + "learning_rate": 4.942244705268032e-05, + "loss": 2.7316, + "mean_token_accuracy": 0.38082274198532107, + "step": 117815 + }, + { + "epoch": 0.11866955534672723, + "grad_norm": 10.125852585041727, + "learning_rate": 4.942236269399581e-05, + "loss": 2.4308, + "mean_token_accuracy": 0.39655172228813174, + "step": 117820 + }, + { + "epoch": 0.11867459139983139, + "grad_norm": 10.849466719655965, + "learning_rate": 4.942227832923105e-05, + "loss": 2.6407, + "mean_token_accuracy": 0.37241379618644715, + "step": 117825 + }, + { + "epoch": 0.11867962745293556, + "grad_norm": 12.085639964009205, + "learning_rate": 4.942219395838608e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.4181034445762634, + "step": 117830 + }, + { + "epoch": 0.11868466350603973, + "grad_norm": 11.125941227921238, + "learning_rate": 4.942210958146089e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.38965516686439516, + "step": 117835 + }, + { + "epoch": 0.11868969955914391, + "grad_norm": 10.139745065097202, + "learning_rate": 4.942202519845552e-05, + "loss": 2.4342, + "mean_token_accuracy": 0.42068966031074523, + "step": 117840 + }, + { + "epoch": 0.11869473561224808, + "grad_norm": 11.923264426840769, + "learning_rate": 4.942194080937e-05, + "loss": 2.4822, + "mean_token_accuracy": 0.39655172228813174, + "step": 117845 + }, + { + "epoch": 0.11869977166535225, + "grad_norm": 11.146289536686972, + "learning_rate": 4.942185641420435e-05, + "loss": 2.1482, + "mean_token_accuracy": 0.4655172526836395, + "step": 117850 + }, + { + "epoch": 0.11870480771845643, + "grad_norm": 9.368933974324925, + "learning_rate": 4.9421772012958586e-05, + "loss": 2.7444, + "mean_token_accuracy": 0.4034482777118683, + "step": 117855 + }, + { + "epoch": 0.1187098437715606, + "grad_norm": 10.941368930278218, + "learning_rate": 4.942168760563274e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.42068966031074523, + "step": 117860 + }, + { + "epoch": 0.11871487982466478, + "grad_norm": 12.32080165588529, + "learning_rate": 4.942160319222683e-05, + "loss": 2.5743, + "mean_token_accuracy": 0.39310344457626345, + "step": 117865 + }, + { + "epoch": 0.11871991587776895, + "grad_norm": 10.220284658427996, + "learning_rate": 4.942151877274089e-05, + "loss": 2.5447, + "mean_token_accuracy": 0.3862068891525269, + "step": 117870 + }, + { + "epoch": 0.11872495193087312, + "grad_norm": 9.61090673585351, + "learning_rate": 4.942143434717492e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5179802894592285, + "step": 117875 + }, + { + "epoch": 0.1187299879839773, + "grad_norm": 10.631401272861194, + "learning_rate": 4.942134991552897e-05, + "loss": 2.0979, + "mean_token_accuracy": 0.47586206197738645, + "step": 117880 + }, + { + "epoch": 0.11873502403708147, + "grad_norm": 10.53666213770821, + "learning_rate": 4.942126547780305e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.39310345649719236, + "step": 117885 + }, + { + "epoch": 0.11874006009018565, + "grad_norm": 16.50154761240891, + "learning_rate": 4.942118103399719e-05, + "loss": 2.3266, + "mean_token_accuracy": 0.4620689630508423, + "step": 117890 + }, + { + "epoch": 0.1187450961432898, + "grad_norm": 10.434859789789655, + "learning_rate": 4.9421096584111396e-05, + "loss": 2.5767, + "mean_token_accuracy": 0.4068965554237366, + "step": 117895 + }, + { + "epoch": 0.11875013219639398, + "grad_norm": 9.674231512248724, + "learning_rate": 4.942101212814572e-05, + "loss": 2.326, + "mean_token_accuracy": 0.4413793087005615, + "step": 117900 + }, + { + "epoch": 0.11875516824949815, + "grad_norm": 9.049887517878883, + "learning_rate": 4.942092766610016e-05, + "loss": 3.1305, + "mean_token_accuracy": 0.4019963666796684, + "step": 117905 + }, + { + "epoch": 0.11876020430260233, + "grad_norm": 9.117668473827552, + "learning_rate": 4.9420843197974745e-05, + "loss": 2.3043, + "mean_token_accuracy": 0.42413792610168455, + "step": 117910 + }, + { + "epoch": 0.1187652403557065, + "grad_norm": 17.52025506560329, + "learning_rate": 4.9420758723769506e-05, + "loss": 2.7826, + "mean_token_accuracy": 0.42758620381355283, + "step": 117915 + }, + { + "epoch": 0.11877027640881067, + "grad_norm": 9.853902032540526, + "learning_rate": 4.942067424348447e-05, + "loss": 2.7749, + "mean_token_accuracy": 0.43103447556495667, + "step": 117920 + }, + { + "epoch": 0.11877531246191485, + "grad_norm": 8.682800140539221, + "learning_rate": 4.9420589757119637e-05, + "loss": 3.1315, + "mean_token_accuracy": 0.3551724135875702, + "step": 117925 + }, + { + "epoch": 0.11878034851501902, + "grad_norm": 9.935166064346944, + "learning_rate": 4.942050526467506e-05, + "loss": 2.3773, + "mean_token_accuracy": 0.42413792610168455, + "step": 117930 + }, + { + "epoch": 0.1187853845681232, + "grad_norm": 9.531610144893307, + "learning_rate": 4.942042076615074e-05, + "loss": 2.2526, + "mean_token_accuracy": 0.46551724076271056, + "step": 117935 + }, + { + "epoch": 0.11879042062122737, + "grad_norm": 9.064302013788339, + "learning_rate": 4.9420336261546715e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.4586207032203674, + "step": 117940 + }, + { + "epoch": 0.11879545667433154, + "grad_norm": 10.62548195728122, + "learning_rate": 4.9420251750863006e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.4620689690113068, + "step": 117945 + }, + { + "epoch": 0.11880049272743572, + "grad_norm": 11.34577112182543, + "learning_rate": 4.9420167234099627e-05, + "loss": 2.6009, + "mean_token_accuracy": 0.38275861740112305, + "step": 117950 + }, + { + "epoch": 0.11880552878053989, + "grad_norm": 10.812351520362958, + "learning_rate": 4.942008271125662e-05, + "loss": 2.5879, + "mean_token_accuracy": 0.4122806966304779, + "step": 117955 + }, + { + "epoch": 0.11881056483364406, + "grad_norm": 10.109293415462771, + "learning_rate": 4.9419998182333985e-05, + "loss": 2.4105, + "mean_token_accuracy": 0.4068965554237366, + "step": 117960 + }, + { + "epoch": 0.11881560088674822, + "grad_norm": 10.850329048416622, + "learning_rate": 4.941991364733176e-05, + "loss": 2.538, + "mean_token_accuracy": 0.3655172407627106, + "step": 117965 + }, + { + "epoch": 0.1188206369398524, + "grad_norm": 10.164670770448964, + "learning_rate": 4.941982910624996e-05, + "loss": 2.1548, + "mean_token_accuracy": 0.5379310369491577, + "step": 117970 + }, + { + "epoch": 0.11882567299295657, + "grad_norm": 10.978882418264059, + "learning_rate": 4.9419744559088614e-05, + "loss": 2.5366, + "mean_token_accuracy": 0.43448275327682495, + "step": 117975 + }, + { + "epoch": 0.11883070904606075, + "grad_norm": 7.394035147359939, + "learning_rate": 4.9419660005847754e-05, + "loss": 1.9128, + "mean_token_accuracy": 0.5013546824455262, + "step": 117980 + }, + { + "epoch": 0.11883574509916492, + "grad_norm": 10.371970959692444, + "learning_rate": 4.941957544652739e-05, + "loss": 2.9192, + "mean_token_accuracy": 0.29655171632766725, + "step": 117985 + }, + { + "epoch": 0.11884078115226909, + "grad_norm": 10.143870509519564, + "learning_rate": 4.941949088112755e-05, + "loss": 2.4329, + "mean_token_accuracy": 0.41034482717514037, + "step": 117990 + }, + { + "epoch": 0.11884581720537327, + "grad_norm": 10.791012924625877, + "learning_rate": 4.9419406309648255e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.40344826579093934, + "step": 117995 + }, + { + "epoch": 0.11885085325847744, + "grad_norm": 11.779664692385717, + "learning_rate": 4.941932173208953e-05, + "loss": 2.6919, + "mean_token_accuracy": 0.3655172407627106, + "step": 118000 + }, + { + "epoch": 0.11885588931158161, + "grad_norm": 10.210680346041746, + "learning_rate": 4.9419237148451405e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.38275861740112305, + "step": 118005 + }, + { + "epoch": 0.11886092536468579, + "grad_norm": 11.861469167534205, + "learning_rate": 4.94191525587339e-05, + "loss": 2.5803, + "mean_token_accuracy": 0.3827586233615875, + "step": 118010 + }, + { + "epoch": 0.11886596141778996, + "grad_norm": 7.677105491402499, + "learning_rate": 4.941906796293703e-05, + "loss": 1.9268, + "mean_token_accuracy": 0.49534180760383606, + "step": 118015 + }, + { + "epoch": 0.11887099747089414, + "grad_norm": 10.084756005372782, + "learning_rate": 4.941898336106083e-05, + "loss": 2.146, + "mean_token_accuracy": 0.4724137902259827, + "step": 118020 + }, + { + "epoch": 0.11887603352399831, + "grad_norm": 8.677621099266126, + "learning_rate": 4.941889875310531e-05, + "loss": 2.0427, + "mean_token_accuracy": 0.47586206793785096, + "step": 118025 + }, + { + "epoch": 0.11888106957710248, + "grad_norm": 9.983090439914552, + "learning_rate": 4.9418814139070504e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.42758620381355283, + "step": 118030 + }, + { + "epoch": 0.11888610563020664, + "grad_norm": 9.523396300499368, + "learning_rate": 4.9418729518956434e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.37586207389831544, + "step": 118035 + }, + { + "epoch": 0.11889114168331082, + "grad_norm": 9.439288814639417, + "learning_rate": 4.9418644892763126e-05, + "loss": 2.3616, + "mean_token_accuracy": 0.43254687786102297, + "step": 118040 + }, + { + "epoch": 0.11889617773641499, + "grad_norm": 9.97860023703339, + "learning_rate": 4.9418560260490596e-05, + "loss": 2.5781, + "mean_token_accuracy": 0.41379310488700866, + "step": 118045 + }, + { + "epoch": 0.11890121378951916, + "grad_norm": 9.273839606992649, + "learning_rate": 4.9418475622138876e-05, + "loss": 2.0821, + "mean_token_accuracy": 0.3793103456497192, + "step": 118050 + }, + { + "epoch": 0.11890624984262334, + "grad_norm": 9.248461754615594, + "learning_rate": 4.941839097770798e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.3827586233615875, + "step": 118055 + }, + { + "epoch": 0.11891128589572751, + "grad_norm": 10.561315303283921, + "learning_rate": 4.9418306327197935e-05, + "loss": 2.4693, + "mean_token_accuracy": 0.41379310488700866, + "step": 118060 + }, + { + "epoch": 0.11891632194883169, + "grad_norm": 10.537358804239489, + "learning_rate": 4.9418221670608776e-05, + "loss": 2.6695, + "mean_token_accuracy": 0.37931033968925476, + "step": 118065 + }, + { + "epoch": 0.11892135800193586, + "grad_norm": 11.036825697979381, + "learning_rate": 4.941813700794051e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.4482758641242981, + "step": 118070 + }, + { + "epoch": 0.11892639405504003, + "grad_norm": 8.878387545852805, + "learning_rate": 4.941805233919317e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.4172413766384125, + "step": 118075 + }, + { + "epoch": 0.1189314301081442, + "grad_norm": 9.625659053447226, + "learning_rate": 4.941796766436677e-05, + "loss": 2.6818, + "mean_token_accuracy": 0.4103448212146759, + "step": 118080 + }, + { + "epoch": 0.11893646616124838, + "grad_norm": 12.540810744088365, + "learning_rate": 4.941788298346134e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.41379311084747317, + "step": 118085 + }, + { + "epoch": 0.11894150221435255, + "grad_norm": 11.855896783512279, + "learning_rate": 4.9417798296476915e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.41724137365818026, + "step": 118090 + }, + { + "epoch": 0.11894653826745673, + "grad_norm": 12.714311260357253, + "learning_rate": 4.941771360341349e-05, + "loss": 2.9778, + "mean_token_accuracy": 0.34137930870056155, + "step": 118095 + }, + { + "epoch": 0.1189515743205609, + "grad_norm": 11.921585351966051, + "learning_rate": 4.941762890427112e-05, + "loss": 2.5337, + "mean_token_accuracy": 0.358620685338974, + "step": 118100 + }, + { + "epoch": 0.11895661037366506, + "grad_norm": 10.638178479169765, + "learning_rate": 4.94175441990498e-05, + "loss": 2.8544, + "mean_token_accuracy": 0.4068965584039688, + "step": 118105 + }, + { + "epoch": 0.11896164642676924, + "grad_norm": 9.211244094789771, + "learning_rate": 4.941745948774958e-05, + "loss": 2.2945, + "mean_token_accuracy": 0.4379310369491577, + "step": 118110 + }, + { + "epoch": 0.11896668247987341, + "grad_norm": 9.227315085861271, + "learning_rate": 4.941737477037047e-05, + "loss": 2.492, + "mean_token_accuracy": 0.42758620381355283, + "step": 118115 + }, + { + "epoch": 0.11897171853297758, + "grad_norm": 9.721460869628173, + "learning_rate": 4.941729004691249e-05, + "loss": 2.3486, + "mean_token_accuracy": 0.4034482777118683, + "step": 118120 + }, + { + "epoch": 0.11897675458608176, + "grad_norm": 11.817939790871243, + "learning_rate": 4.941720531737566e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.45722927451133727, + "step": 118125 + }, + { + "epoch": 0.11898179063918593, + "grad_norm": 9.594371120965361, + "learning_rate": 4.941712058176003e-05, + "loss": 2.2015, + "mean_token_accuracy": 0.4517241299152374, + "step": 118130 + }, + { + "epoch": 0.1189868266922901, + "grad_norm": 10.57940980313601, + "learning_rate": 4.941703584006559e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.42915910482406616, + "step": 118135 + }, + { + "epoch": 0.11899186274539428, + "grad_norm": 10.19369821741318, + "learning_rate": 4.941695109229238e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.358620685338974, + "step": 118140 + }, + { + "epoch": 0.11899689879849845, + "grad_norm": 12.708986970676188, + "learning_rate": 4.9416866338440435e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.4435571670532227, + "step": 118145 + }, + { + "epoch": 0.11900193485160263, + "grad_norm": 15.577878330836903, + "learning_rate": 4.9416781578509746e-05, + "loss": 2.5312, + "mean_token_accuracy": 0.42068966031074523, + "step": 118150 + }, + { + "epoch": 0.1190069709047068, + "grad_norm": 11.072975299928592, + "learning_rate": 4.941669681250036e-05, + "loss": 2.673, + "mean_token_accuracy": 0.3517241418361664, + "step": 118155 + }, + { + "epoch": 0.11901200695781097, + "grad_norm": 12.045619258627642, + "learning_rate": 4.9416612040412304e-05, + "loss": 2.6572, + "mean_token_accuracy": 0.3793103456497192, + "step": 118160 + }, + { + "epoch": 0.11901704301091515, + "grad_norm": 12.090685709215084, + "learning_rate": 4.94165272622456e-05, + "loss": 2.3707, + "mean_token_accuracy": 0.4551724135875702, + "step": 118165 + }, + { + "epoch": 0.11902207906401932, + "grad_norm": 10.277842844948426, + "learning_rate": 4.9416442478000253e-05, + "loss": 1.8944, + "mean_token_accuracy": 0.5283251285552979, + "step": 118170 + }, + { + "epoch": 0.11902711511712348, + "grad_norm": 9.381176932969831, + "learning_rate": 4.94163576876763e-05, + "loss": 2.2185, + "mean_token_accuracy": 0.42413793206214906, + "step": 118175 + }, + { + "epoch": 0.11903215117022765, + "grad_norm": 11.36013866152452, + "learning_rate": 4.941627289127377e-05, + "loss": 2.6036, + "mean_token_accuracy": 0.4034482717514038, + "step": 118180 + }, + { + "epoch": 0.11903718722333183, + "grad_norm": 11.077589817940073, + "learning_rate": 4.941618808879268e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.441379314661026, + "step": 118185 + }, + { + "epoch": 0.119042223276436, + "grad_norm": 10.959590634225602, + "learning_rate": 4.941610328023304e-05, + "loss": 2.2719, + "mean_token_accuracy": 0.42758620977401735, + "step": 118190 + }, + { + "epoch": 0.11904725932954018, + "grad_norm": 9.667822891809388, + "learning_rate": 4.94160184655949e-05, + "loss": 2.3571, + "mean_token_accuracy": 0.4482758641242981, + "step": 118195 + }, + { + "epoch": 0.11905229538264435, + "grad_norm": 13.392846605407586, + "learning_rate": 4.941593364487827e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.4551724076271057, + "step": 118200 + }, + { + "epoch": 0.11905733143574852, + "grad_norm": 10.827498649861308, + "learning_rate": 4.9415848818083164e-05, + "loss": 2.6113, + "mean_token_accuracy": 0.3896551728248596, + "step": 118205 + }, + { + "epoch": 0.1190623674888527, + "grad_norm": 11.830284374903114, + "learning_rate": 4.941576398520962e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.4448275864124298, + "step": 118210 + }, + { + "epoch": 0.11906740354195687, + "grad_norm": 10.33120865007257, + "learning_rate": 4.941567914625767e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.44137930274009707, + "step": 118215 + }, + { + "epoch": 0.11907243959506104, + "grad_norm": 14.411637735979626, + "learning_rate": 4.941559430122731e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.41724138259887694, + "step": 118220 + }, + { + "epoch": 0.11907747564816522, + "grad_norm": 15.249387732309263, + "learning_rate": 4.941550945011858e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.4551724076271057, + "step": 118225 + }, + { + "epoch": 0.11908251170126939, + "grad_norm": 10.272365709143001, + "learning_rate": 4.94154245929315e-05, + "loss": 2.1131, + "mean_token_accuracy": 0.48457351326942444, + "step": 118230 + }, + { + "epoch": 0.11908754775437357, + "grad_norm": 9.873290752968993, + "learning_rate": 4.9415339729666105e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.4310344815254211, + "step": 118235 + }, + { + "epoch": 0.11909258380747774, + "grad_norm": 9.918590101541952, + "learning_rate": 4.94152548603224e-05, + "loss": 2.6277, + "mean_token_accuracy": 0.4034482777118683, + "step": 118240 + }, + { + "epoch": 0.1190976198605819, + "grad_norm": 9.343003725989234, + "learning_rate": 4.9415169984900423e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.4272837281227112, + "step": 118245 + }, + { + "epoch": 0.11910265591368607, + "grad_norm": 9.381508685954094, + "learning_rate": 4.941508510340019e-05, + "loss": 2.6887, + "mean_token_accuracy": 0.3999999940395355, + "step": 118250 + }, + { + "epoch": 0.11910769196679025, + "grad_norm": 11.231169027324897, + "learning_rate": 4.9415000215821725e-05, + "loss": 2.4338, + "mean_token_accuracy": 0.3896551728248596, + "step": 118255 + }, + { + "epoch": 0.11911272801989442, + "grad_norm": 9.980491772279954, + "learning_rate": 4.941491532216505e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.4344827592372894, + "step": 118260 + }, + { + "epoch": 0.1191177640729986, + "grad_norm": 9.940889878598394, + "learning_rate": 4.94148304224302e-05, + "loss": 2.0441, + "mean_token_accuracy": 0.47586206793785096, + "step": 118265 + }, + { + "epoch": 0.11912280012610277, + "grad_norm": 10.171271102125502, + "learning_rate": 4.941474551661718e-05, + "loss": 2.2372, + "mean_token_accuracy": 0.458620685338974, + "step": 118270 + }, + { + "epoch": 0.11912783617920694, + "grad_norm": 9.684442713136876, + "learning_rate": 4.941466060472603e-05, + "loss": 2.4832, + "mean_token_accuracy": 0.4399273991584778, + "step": 118275 + }, + { + "epoch": 0.11913287223231112, + "grad_norm": 11.457882577223433, + "learning_rate": 4.941457568675677e-05, + "loss": 2.479, + "mean_token_accuracy": 0.3931034505367279, + "step": 118280 + }, + { + "epoch": 0.11913790828541529, + "grad_norm": 9.625692035660764, + "learning_rate": 4.941449076270941e-05, + "loss": 2.3688, + "mean_token_accuracy": 0.41034482717514037, + "step": 118285 + }, + { + "epoch": 0.11914294433851946, + "grad_norm": 15.243137908281236, + "learning_rate": 4.9414405832584e-05, + "loss": 2.7049, + "mean_token_accuracy": 0.4196007251739502, + "step": 118290 + }, + { + "epoch": 0.11914798039162364, + "grad_norm": 11.12678717048002, + "learning_rate": 4.941432089638054e-05, + "loss": 2.7158, + "mean_token_accuracy": 0.37241379618644715, + "step": 118295 + }, + { + "epoch": 0.11915301644472781, + "grad_norm": 13.054541526482632, + "learning_rate": 4.941423595409905e-05, + "loss": 2.528, + "mean_token_accuracy": 0.4, + "step": 118300 + }, + { + "epoch": 0.11915805249783198, + "grad_norm": 10.794845815368852, + "learning_rate": 4.941415100573958e-05, + "loss": 1.9352, + "mean_token_accuracy": 0.47434966564178466, + "step": 118305 + }, + { + "epoch": 0.11916308855093616, + "grad_norm": 9.005065862407985, + "learning_rate": 4.9414066051302135e-05, + "loss": 2.8789, + "mean_token_accuracy": 0.3724137872457504, + "step": 118310 + }, + { + "epoch": 0.11916812460404032, + "grad_norm": 10.549657818791939, + "learning_rate": 4.941398109078674e-05, + "loss": 2.8535, + "mean_token_accuracy": 0.36896551251411436, + "step": 118315 + }, + { + "epoch": 0.11917316065714449, + "grad_norm": 13.709032557563544, + "learning_rate": 4.941389612419342e-05, + "loss": 2.8147, + "mean_token_accuracy": 0.3758620619773865, + "step": 118320 + }, + { + "epoch": 0.11917819671024867, + "grad_norm": 10.870110386109967, + "learning_rate": 4.94138111515222e-05, + "loss": 2.8652, + "mean_token_accuracy": 0.39655172228813174, + "step": 118325 + }, + { + "epoch": 0.11918323276335284, + "grad_norm": 11.334745620308894, + "learning_rate": 4.94137261727731e-05, + "loss": 2.669, + "mean_token_accuracy": 0.33448274731636046, + "step": 118330 + }, + { + "epoch": 0.11918826881645701, + "grad_norm": 13.40262837600971, + "learning_rate": 4.941364118794615e-05, + "loss": 2.2534, + "mean_token_accuracy": 0.4275861978530884, + "step": 118335 + }, + { + "epoch": 0.11919330486956119, + "grad_norm": 11.110444206783198, + "learning_rate": 4.941355619704137e-05, + "loss": 2.4351, + "mean_token_accuracy": 0.42068964838981626, + "step": 118340 + }, + { + "epoch": 0.11919834092266536, + "grad_norm": 12.597115681514273, + "learning_rate": 4.9413471200058785e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.4257713258266449, + "step": 118345 + }, + { + "epoch": 0.11920337697576953, + "grad_norm": 9.62017124710925, + "learning_rate": 4.9413386196998415e-05, + "loss": 2.492, + "mean_token_accuracy": 0.4, + "step": 118350 + }, + { + "epoch": 0.11920841302887371, + "grad_norm": 11.268950610158683, + "learning_rate": 4.9413301187860286e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.40689654648303986, + "step": 118355 + }, + { + "epoch": 0.11921344908197788, + "grad_norm": 10.656016022940861, + "learning_rate": 4.941321617264442e-05, + "loss": 2.538, + "mean_token_accuracy": 0.3793103456497192, + "step": 118360 + }, + { + "epoch": 0.11921848513508206, + "grad_norm": 12.183203114827036, + "learning_rate": 4.9413131151350836e-05, + "loss": 2.7715, + "mean_token_accuracy": 0.4260738015174866, + "step": 118365 + }, + { + "epoch": 0.11922352118818623, + "grad_norm": 9.321545495686983, + "learning_rate": 4.9413046123979564e-05, + "loss": 2.346, + "mean_token_accuracy": 0.4448275864124298, + "step": 118370 + }, + { + "epoch": 0.1192285572412904, + "grad_norm": 29.997487817204984, + "learning_rate": 4.9412961090530633e-05, + "loss": 3.2, + "mean_token_accuracy": 0.4310344815254211, + "step": 118375 + }, + { + "epoch": 0.11923359329439458, + "grad_norm": 8.918160088236391, + "learning_rate": 4.9412876051004065e-05, + "loss": 2.5311, + "mean_token_accuracy": 0.4172413766384125, + "step": 118380 + }, + { + "epoch": 0.11923862934749874, + "grad_norm": 11.781419307822668, + "learning_rate": 4.941279100539988e-05, + "loss": 2.481, + "mean_token_accuracy": 0.41917725205421447, + "step": 118385 + }, + { + "epoch": 0.11924366540060291, + "grad_norm": 10.94782684171135, + "learning_rate": 4.941270595371809e-05, + "loss": 2.6994, + "mean_token_accuracy": 0.44482759237289426, + "step": 118390 + }, + { + "epoch": 0.11924870145370708, + "grad_norm": 11.8883192673968, + "learning_rate": 4.941262089595873e-05, + "loss": 2.47, + "mean_token_accuracy": 0.4448275864124298, + "step": 118395 + }, + { + "epoch": 0.11925373750681126, + "grad_norm": 9.984444725726966, + "learning_rate": 4.941253583212183e-05, + "loss": 2.7977, + "mean_token_accuracy": 0.4, + "step": 118400 + }, + { + "epoch": 0.11925877355991543, + "grad_norm": 11.55064352432013, + "learning_rate": 4.94124507622074e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.4482758641242981, + "step": 118405 + }, + { + "epoch": 0.1192638096130196, + "grad_norm": 10.443933013103306, + "learning_rate": 4.941236568621548e-05, + "loss": 2.1891, + "mean_token_accuracy": 0.47931033968925474, + "step": 118410 + }, + { + "epoch": 0.11926884566612378, + "grad_norm": 10.56321911840976, + "learning_rate": 4.941228060414608e-05, + "loss": 2.5508, + "mean_token_accuracy": 0.44482758045196535, + "step": 118415 + }, + { + "epoch": 0.11927388171922795, + "grad_norm": 12.413423540846772, + "learning_rate": 4.941219551599922e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.43103447556495667, + "step": 118420 + }, + { + "epoch": 0.11927891777233213, + "grad_norm": 9.739692962291747, + "learning_rate": 4.9412110421774935e-05, + "loss": 2.2191, + "mean_token_accuracy": 0.47931033968925474, + "step": 118425 + }, + { + "epoch": 0.1192839538254363, + "grad_norm": 10.278708269236052, + "learning_rate": 4.941202532147324e-05, + "loss": 2.6529, + "mean_token_accuracy": 0.42413792610168455, + "step": 118430 + }, + { + "epoch": 0.11928898987854047, + "grad_norm": 12.280283579430765, + "learning_rate": 4.9411940215094174e-05, + "loss": 2.31, + "mean_token_accuracy": 0.42758620381355283, + "step": 118435 + }, + { + "epoch": 0.11929402593164465, + "grad_norm": 9.466449749542441, + "learning_rate": 4.941185510263774e-05, + "loss": 2.391, + "mean_token_accuracy": 0.4310344815254211, + "step": 118440 + }, + { + "epoch": 0.11929906198474882, + "grad_norm": 11.407865215387272, + "learning_rate": 4.941176998410397e-05, + "loss": 2.1453, + "mean_token_accuracy": 0.4655172348022461, + "step": 118445 + }, + { + "epoch": 0.119304098037853, + "grad_norm": 12.10325482713896, + "learning_rate": 4.941168485949289e-05, + "loss": 2.9337, + "mean_token_accuracy": 0.37241379022598264, + "step": 118450 + }, + { + "epoch": 0.11930913409095716, + "grad_norm": 10.93154155786285, + "learning_rate": 4.941159972880453e-05, + "loss": 2.2945, + "mean_token_accuracy": 0.43103447556495667, + "step": 118455 + }, + { + "epoch": 0.11931417014406133, + "grad_norm": 11.786331351668094, + "learning_rate": 4.94115145920389e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.4344827592372894, + "step": 118460 + }, + { + "epoch": 0.1193192061971655, + "grad_norm": 12.373428329367554, + "learning_rate": 4.941142944919603e-05, + "loss": 2.383, + "mean_token_accuracy": 0.4344827592372894, + "step": 118465 + }, + { + "epoch": 0.11932424225026968, + "grad_norm": 8.579469214844448, + "learning_rate": 4.9411344300275944e-05, + "loss": 3.0894, + "mean_token_accuracy": 0.4103448331356049, + "step": 118470 + }, + { + "epoch": 0.11932927830337385, + "grad_norm": 12.034593023354848, + "learning_rate": 4.9411259145278666e-05, + "loss": 2.2998, + "mean_token_accuracy": 0.46896552443504336, + "step": 118475 + }, + { + "epoch": 0.11933431435647802, + "grad_norm": 10.751329233718838, + "learning_rate": 4.941117398420422e-05, + "loss": 2.2987, + "mean_token_accuracy": 0.4137930989265442, + "step": 118480 + }, + { + "epoch": 0.1193393504095822, + "grad_norm": 12.21522408430805, + "learning_rate": 4.941108881705263e-05, + "loss": 2.4447, + "mean_token_accuracy": 0.4275861978530884, + "step": 118485 + }, + { + "epoch": 0.11934438646268637, + "grad_norm": 12.249848250650917, + "learning_rate": 4.941100364382391e-05, + "loss": 2.862, + "mean_token_accuracy": 0.38620689511299133, + "step": 118490 + }, + { + "epoch": 0.11934942251579055, + "grad_norm": 13.804779691914534, + "learning_rate": 4.9410918464518094e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.4448275864124298, + "step": 118495 + }, + { + "epoch": 0.11935445856889472, + "grad_norm": 10.03047140658655, + "learning_rate": 4.94108332791352e-05, + "loss": 2.7607, + "mean_token_accuracy": 0.4310344934463501, + "step": 118500 + }, + { + "epoch": 0.1193594946219989, + "grad_norm": 9.0664881963372, + "learning_rate": 4.9410748087675265e-05, + "loss": 2.1456, + "mean_token_accuracy": 0.4413793087005615, + "step": 118505 + }, + { + "epoch": 0.11936453067510307, + "grad_norm": 10.215796004246767, + "learning_rate": 4.941066289013829e-05, + "loss": 2.8482, + "mean_token_accuracy": 0.41379311084747317, + "step": 118510 + }, + { + "epoch": 0.11936956672820724, + "grad_norm": 10.41967782928123, + "learning_rate": 4.9410577686524325e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.4034482777118683, + "step": 118515 + }, + { + "epoch": 0.11937460278131141, + "grad_norm": 11.804039453028354, + "learning_rate": 4.941049247683337e-05, + "loss": 2.6252, + "mean_token_accuracy": 0.41566848158836367, + "step": 118520 + }, + { + "epoch": 0.11937963883441557, + "grad_norm": 10.249753588947353, + "learning_rate": 4.941040726106546e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.41034482717514037, + "step": 118525 + }, + { + "epoch": 0.11938467488751975, + "grad_norm": 11.667652439456687, + "learning_rate": 4.941032203922061e-05, + "loss": 2.4205, + "mean_token_accuracy": 0.4068965494632721, + "step": 118530 + }, + { + "epoch": 0.11938971094062392, + "grad_norm": 9.204732011029918, + "learning_rate": 4.9410236811298865e-05, + "loss": 2.1406, + "mean_token_accuracy": 0.4517241418361664, + "step": 118535 + }, + { + "epoch": 0.1193947469937281, + "grad_norm": 10.816973514215896, + "learning_rate": 4.941015157730022e-05, + "loss": 2.1132, + "mean_token_accuracy": 0.4931034505367279, + "step": 118540 + }, + { + "epoch": 0.11939978304683227, + "grad_norm": 10.342434586976344, + "learning_rate": 4.941006633722472e-05, + "loss": 2.422, + "mean_token_accuracy": 0.42413793206214906, + "step": 118545 + }, + { + "epoch": 0.11940481909993644, + "grad_norm": 9.624978956037522, + "learning_rate": 4.9409981091072384e-05, + "loss": 2.3141, + "mean_token_accuracy": 0.46551724672317507, + "step": 118550 + }, + { + "epoch": 0.11940985515304062, + "grad_norm": 9.520798323294272, + "learning_rate": 4.9409895838843225e-05, + "loss": 2.215, + "mean_token_accuracy": 0.47241379618644713, + "step": 118555 + }, + { + "epoch": 0.11941489120614479, + "grad_norm": 9.925576465575528, + "learning_rate": 4.940981058053728e-05, + "loss": 2.3774, + "mean_token_accuracy": 0.4310344815254211, + "step": 118560 + }, + { + "epoch": 0.11941992725924896, + "grad_norm": 11.098376916598838, + "learning_rate": 4.9409725316154564e-05, + "loss": 2.5175, + "mean_token_accuracy": 0.3999999940395355, + "step": 118565 + }, + { + "epoch": 0.11942496331235314, + "grad_norm": 10.26660720285367, + "learning_rate": 4.940964004569511e-05, + "loss": 2.3985, + "mean_token_accuracy": 0.4034482777118683, + "step": 118570 + }, + { + "epoch": 0.11942999936545731, + "grad_norm": 10.316315067683968, + "learning_rate": 4.940955476915893e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.4344827651977539, + "step": 118575 + }, + { + "epoch": 0.11943503541856149, + "grad_norm": 11.201046746577662, + "learning_rate": 4.9409469486546056e-05, + "loss": 2.8227, + "mean_token_accuracy": 0.4034482717514038, + "step": 118580 + }, + { + "epoch": 0.11944007147166566, + "grad_norm": 9.27490743014243, + "learning_rate": 4.9409384197856506e-05, + "loss": 2.5329, + "mean_token_accuracy": 0.4090139091014862, + "step": 118585 + }, + { + "epoch": 0.11944510752476983, + "grad_norm": 9.621219968579341, + "learning_rate": 4.940929890309031e-05, + "loss": 2.0157, + "mean_token_accuracy": 0.482758617401123, + "step": 118590 + }, + { + "epoch": 0.119450143577874, + "grad_norm": 8.885591944982341, + "learning_rate": 4.940921360224749e-05, + "loss": 2.6776, + "mean_token_accuracy": 0.37241379618644715, + "step": 118595 + }, + { + "epoch": 0.11945517963097817, + "grad_norm": 11.219647294579152, + "learning_rate": 4.9409128295328063e-05, + "loss": 2.554, + "mean_token_accuracy": 0.4, + "step": 118600 + }, + { + "epoch": 0.11946021568408234, + "grad_norm": 10.447968297480196, + "learning_rate": 4.9409042982332057e-05, + "loss": 2.4538, + "mean_token_accuracy": 0.41724138259887694, + "step": 118605 + }, + { + "epoch": 0.11946525173718651, + "grad_norm": 10.576691868508107, + "learning_rate": 4.9408957663259496e-05, + "loss": 2.3196, + "mean_token_accuracy": 0.46551724076271056, + "step": 118610 + }, + { + "epoch": 0.11947028779029069, + "grad_norm": 11.610870927902063, + "learning_rate": 4.940887233811041e-05, + "loss": 2.6224, + "mean_token_accuracy": 0.4310344934463501, + "step": 118615 + }, + { + "epoch": 0.11947532384339486, + "grad_norm": 10.370765177794075, + "learning_rate": 4.940878700688481e-05, + "loss": 2.4932, + "mean_token_accuracy": 0.4296430706977844, + "step": 118620 + }, + { + "epoch": 0.11948035989649904, + "grad_norm": 10.941744439570284, + "learning_rate": 4.940870166958273e-05, + "loss": 2.1503, + "mean_token_accuracy": 0.46896551847457885, + "step": 118625 + }, + { + "epoch": 0.11948539594960321, + "grad_norm": 9.865619061932657, + "learning_rate": 4.940861632620419e-05, + "loss": 2.1389, + "mean_token_accuracy": 0.47065940499305725, + "step": 118630 + }, + { + "epoch": 0.11949043200270738, + "grad_norm": 10.012562084091808, + "learning_rate": 4.9408530976749214e-05, + "loss": 2.3268, + "mean_token_accuracy": 0.4137930989265442, + "step": 118635 + }, + { + "epoch": 0.11949546805581156, + "grad_norm": 12.204932619089906, + "learning_rate": 4.940844562121782e-05, + "loss": 2.3256, + "mean_token_accuracy": 0.4493842303752899, + "step": 118640 + }, + { + "epoch": 0.11950050410891573, + "grad_norm": 10.940336844614546, + "learning_rate": 4.940836025961004e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.39310344457626345, + "step": 118645 + }, + { + "epoch": 0.1195055401620199, + "grad_norm": 9.467157973653636, + "learning_rate": 4.940827489192589e-05, + "loss": 2.5546, + "mean_token_accuracy": 0.42413793206214906, + "step": 118650 + }, + { + "epoch": 0.11951057621512408, + "grad_norm": 13.484174544539952, + "learning_rate": 4.940818951816541e-05, + "loss": 2.9886, + "mean_token_accuracy": 0.38275861740112305, + "step": 118655 + }, + { + "epoch": 0.11951561226822825, + "grad_norm": 9.145809618443721, + "learning_rate": 4.94081041383286e-05, + "loss": 2.1042, + "mean_token_accuracy": 0.4655172348022461, + "step": 118660 + }, + { + "epoch": 0.11952064832133241, + "grad_norm": 9.716263629843706, + "learning_rate": 4.9408018752415506e-05, + "loss": 2.0296, + "mean_token_accuracy": 0.5543859660625458, + "step": 118665 + }, + { + "epoch": 0.11952568437443659, + "grad_norm": 8.783392975159177, + "learning_rate": 4.9407933360426135e-05, + "loss": 2.4244, + "mean_token_accuracy": 0.4586206912994385, + "step": 118670 + }, + { + "epoch": 0.11953072042754076, + "grad_norm": 9.89286492434891, + "learning_rate": 4.9407847962360515e-05, + "loss": 2.0519, + "mean_token_accuracy": 0.4965517222881317, + "step": 118675 + }, + { + "epoch": 0.11953575648064493, + "grad_norm": 10.68375550297062, + "learning_rate": 4.9407762558218674e-05, + "loss": 2.3839, + "mean_token_accuracy": 0.4637023627758026, + "step": 118680 + }, + { + "epoch": 0.11954079253374911, + "grad_norm": 10.995806811562966, + "learning_rate": 4.940767714800064e-05, + "loss": 2.6065, + "mean_token_accuracy": 0.3965517163276672, + "step": 118685 + }, + { + "epoch": 0.11954582858685328, + "grad_norm": 12.235988455435903, + "learning_rate": 4.940759173170642e-05, + "loss": 2.0908, + "mean_token_accuracy": 0.5, + "step": 118690 + }, + { + "epoch": 0.11955086463995745, + "grad_norm": 10.52725905394331, + "learning_rate": 4.940750630933605e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.3793103516101837, + "step": 118695 + }, + { + "epoch": 0.11955590069306163, + "grad_norm": 10.095836068033115, + "learning_rate": 4.940742088088956e-05, + "loss": 2.3236, + "mean_token_accuracy": 0.42226254343986513, + "step": 118700 + }, + { + "epoch": 0.1195609367461658, + "grad_norm": 10.743249271216033, + "learning_rate": 4.9407335446366955e-05, + "loss": 2.1919, + "mean_token_accuracy": 0.4517241358757019, + "step": 118705 + }, + { + "epoch": 0.11956597279926998, + "grad_norm": 9.192319439405217, + "learning_rate": 4.9407250005768274e-05, + "loss": 2.2704, + "mean_token_accuracy": 0.46745312213897705, + "step": 118710 + }, + { + "epoch": 0.11957100885237415, + "grad_norm": 12.105264217980421, + "learning_rate": 4.940716455909353e-05, + "loss": 2.4528, + "mean_token_accuracy": 0.4620689690113068, + "step": 118715 + }, + { + "epoch": 0.11957604490547832, + "grad_norm": 12.509984843605038, + "learning_rate": 4.940707910634275e-05, + "loss": 2.8933, + "mean_token_accuracy": 0.3620689630508423, + "step": 118720 + }, + { + "epoch": 0.1195810809585825, + "grad_norm": 10.673559763475271, + "learning_rate": 4.940699364751597e-05, + "loss": 2.257, + "mean_token_accuracy": 0.4620689630508423, + "step": 118725 + }, + { + "epoch": 0.11958611701168667, + "grad_norm": 11.563129849200525, + "learning_rate": 4.94069081826132e-05, + "loss": 2.6905, + "mean_token_accuracy": 0.41034482717514037, + "step": 118730 + }, + { + "epoch": 0.11959115306479083, + "grad_norm": 13.04224262956906, + "learning_rate": 4.940682271163446e-05, + "loss": 2.1283, + "mean_token_accuracy": 0.4551724135875702, + "step": 118735 + }, + { + "epoch": 0.119596189117895, + "grad_norm": 10.060225573720642, + "learning_rate": 4.9406737234579795e-05, + "loss": 2.5173, + "mean_token_accuracy": 0.3931034505367279, + "step": 118740 + }, + { + "epoch": 0.11960122517099918, + "grad_norm": 11.091293881696107, + "learning_rate": 4.9406651751449204e-05, + "loss": 2.1876, + "mean_token_accuracy": 0.4689655125141144, + "step": 118745 + }, + { + "epoch": 0.11960626122410335, + "grad_norm": 9.678496445382857, + "learning_rate": 4.9406566262242724e-05, + "loss": 2.619, + "mean_token_accuracy": 0.3655172407627106, + "step": 118750 + }, + { + "epoch": 0.11961129727720753, + "grad_norm": 10.843439663023569, + "learning_rate": 4.9406480766960376e-05, + "loss": 2.3407, + "mean_token_accuracy": 0.4517241299152374, + "step": 118755 + }, + { + "epoch": 0.1196163333303117, + "grad_norm": 10.073501246121298, + "learning_rate": 4.9406395265602185e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.4103448212146759, + "step": 118760 + }, + { + "epoch": 0.11962136938341587, + "grad_norm": 9.748001097700424, + "learning_rate": 4.940630975816817e-05, + "loss": 2.3229, + "mean_token_accuracy": 0.42413793206214906, + "step": 118765 + }, + { + "epoch": 0.11962640543652005, + "grad_norm": 7.7739619930482515, + "learning_rate": 4.9406224244658366e-05, + "loss": 2.2761, + "mean_token_accuracy": 0.5113300383090973, + "step": 118770 + }, + { + "epoch": 0.11963144148962422, + "grad_norm": 19.792673048276622, + "learning_rate": 4.940613872507278e-05, + "loss": 2.449, + "mean_token_accuracy": 0.4344827592372894, + "step": 118775 + }, + { + "epoch": 0.1196364775427284, + "grad_norm": 14.19393826632409, + "learning_rate": 4.940605319941146e-05, + "loss": 2.6144, + "mean_token_accuracy": 0.4034482717514038, + "step": 118780 + }, + { + "epoch": 0.11964151359583257, + "grad_norm": 10.715374569230379, + "learning_rate": 4.940596766767439e-05, + "loss": 2.6072, + "mean_token_accuracy": 0.41034482717514037, + "step": 118785 + }, + { + "epoch": 0.11964654964893674, + "grad_norm": 12.56496972200216, + "learning_rate": 4.9405882129861635e-05, + "loss": 2.7369, + "mean_token_accuracy": 0.41034482717514037, + "step": 118790 + }, + { + "epoch": 0.11965158570204092, + "grad_norm": 10.184012183619025, + "learning_rate": 4.9405796585973195e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.42758620381355283, + "step": 118795 + }, + { + "epoch": 0.11965662175514509, + "grad_norm": 10.170626396109657, + "learning_rate": 4.940571103600911e-05, + "loss": 2.4599, + "mean_token_accuracy": 0.4620689630508423, + "step": 118800 + }, + { + "epoch": 0.11966165780824925, + "grad_norm": 9.405842261791056, + "learning_rate": 4.940562547996939e-05, + "loss": 2.4974, + "mean_token_accuracy": 0.4068965494632721, + "step": 118805 + }, + { + "epoch": 0.11966669386135342, + "grad_norm": 10.300422134341396, + "learning_rate": 4.9405539917854056e-05, + "loss": 2.7065, + "mean_token_accuracy": 0.4464004814624786, + "step": 118810 + }, + { + "epoch": 0.1196717299144576, + "grad_norm": 10.21148583836709, + "learning_rate": 4.940545434966314e-05, + "loss": 2.6322, + "mean_token_accuracy": 0.41851180195808413, + "step": 118815 + }, + { + "epoch": 0.11967676596756177, + "grad_norm": 11.207322937327852, + "learning_rate": 4.940536877539667e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.42068966031074523, + "step": 118820 + }, + { + "epoch": 0.11968180202066594, + "grad_norm": 11.636949975235481, + "learning_rate": 4.940528319505467e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.42068964838981626, + "step": 118825 + }, + { + "epoch": 0.11968683807377012, + "grad_norm": 11.32954740889244, + "learning_rate": 4.940519760863714e-05, + "loss": 2.358, + "mean_token_accuracy": 0.39310343861579894, + "step": 118830 + }, + { + "epoch": 0.11969187412687429, + "grad_norm": 10.159525081342787, + "learning_rate": 4.940511201614414e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.42413793206214906, + "step": 118835 + }, + { + "epoch": 0.11969691017997847, + "grad_norm": 10.902030583230793, + "learning_rate": 4.9405026417575666e-05, + "loss": 2.379, + "mean_token_accuracy": 0.4482758641242981, + "step": 118840 + }, + { + "epoch": 0.11970194623308264, + "grad_norm": 10.27155430843561, + "learning_rate": 4.9404940812931755e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.42413793206214906, + "step": 118845 + }, + { + "epoch": 0.11970698228618681, + "grad_norm": 9.977628237820763, + "learning_rate": 4.940485520221242e-05, + "loss": 2.6085, + "mean_token_accuracy": 0.36896551549434664, + "step": 118850 + }, + { + "epoch": 0.11971201833929099, + "grad_norm": 15.665022171625505, + "learning_rate": 4.94047695854177e-05, + "loss": 3.0928, + "mean_token_accuracy": 0.37241379022598264, + "step": 118855 + }, + { + "epoch": 0.11971705439239516, + "grad_norm": 11.904276053349044, + "learning_rate": 4.940468396254761e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.4482758641242981, + "step": 118860 + }, + { + "epoch": 0.11972209044549934, + "grad_norm": 10.897861802254642, + "learning_rate": 4.940459833360217e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.4965517222881317, + "step": 118865 + }, + { + "epoch": 0.11972712649860351, + "grad_norm": 8.331129149671176, + "learning_rate": 4.940451269858141e-05, + "loss": 2.0369, + "mean_token_accuracy": 0.5241379320621491, + "step": 118870 + }, + { + "epoch": 0.11973216255170767, + "grad_norm": 8.700785293168776, + "learning_rate": 4.9404427057485346e-05, + "loss": 2.4496, + "mean_token_accuracy": 0.3620689630508423, + "step": 118875 + }, + { + "epoch": 0.11973719860481184, + "grad_norm": 10.286123215787304, + "learning_rate": 4.940434141031402e-05, + "loss": 2.7102, + "mean_token_accuracy": 0.42413793206214906, + "step": 118880 + }, + { + "epoch": 0.11974223465791602, + "grad_norm": 9.920525488813636, + "learning_rate": 4.940425575706744e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.47931033968925474, + "step": 118885 + }, + { + "epoch": 0.11974727071102019, + "grad_norm": 11.999261667656052, + "learning_rate": 4.9404170097745625e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.47822141647338867, + "step": 118890 + }, + { + "epoch": 0.11975230676412436, + "grad_norm": 12.354776235962683, + "learning_rate": 4.940408443234861e-05, + "loss": 2.493, + "mean_token_accuracy": 0.43448275327682495, + "step": 118895 + }, + { + "epoch": 0.11975734281722854, + "grad_norm": 9.104268609076932, + "learning_rate": 4.940399876087641e-05, + "loss": 2.4663, + "mean_token_accuracy": 0.4517241299152374, + "step": 118900 + }, + { + "epoch": 0.11976237887033271, + "grad_norm": 16.04227384387449, + "learning_rate": 4.940391308332906e-05, + "loss": 2.74, + "mean_token_accuracy": 0.41034482717514037, + "step": 118905 + }, + { + "epoch": 0.11976741492343689, + "grad_norm": 15.218781341166528, + "learning_rate": 4.9403827399706584e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.4586206912994385, + "step": 118910 + }, + { + "epoch": 0.11977245097654106, + "grad_norm": 12.214424445694076, + "learning_rate": 4.940374171000899e-05, + "loss": 2.6897, + "mean_token_accuracy": 0.38620689511299133, + "step": 118915 + }, + { + "epoch": 0.11977748702964523, + "grad_norm": 12.256669135965447, + "learning_rate": 4.9403656014236315e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.4103448212146759, + "step": 118920 + }, + { + "epoch": 0.1197825230827494, + "grad_norm": 10.130158485571638, + "learning_rate": 4.9403570312388584e-05, + "loss": 2.0252, + "mean_token_accuracy": 0.4551724135875702, + "step": 118925 + }, + { + "epoch": 0.11978755913585358, + "grad_norm": 12.852546942699673, + "learning_rate": 4.940348460446581e-05, + "loss": 2.5184, + "mean_token_accuracy": 0.3965517282485962, + "step": 118930 + }, + { + "epoch": 0.11979259518895775, + "grad_norm": 10.17935657611278, + "learning_rate": 4.940339889046802e-05, + "loss": 2.2801, + "mean_token_accuracy": 0.4517241358757019, + "step": 118935 + }, + { + "epoch": 0.11979763124206193, + "grad_norm": 10.456700518548788, + "learning_rate": 4.9403313170395244e-05, + "loss": 2.3518, + "mean_token_accuracy": 0.4034482717514038, + "step": 118940 + }, + { + "epoch": 0.11980266729516609, + "grad_norm": 10.000661376662674, + "learning_rate": 4.94032274442475e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.4344827473163605, + "step": 118945 + }, + { + "epoch": 0.11980770334827026, + "grad_norm": 11.3251079797568, + "learning_rate": 4.940314171202483e-05, + "loss": 2.0831, + "mean_token_accuracy": 0.47931033968925474, + "step": 118950 + }, + { + "epoch": 0.11981273940137444, + "grad_norm": 9.703495039658216, + "learning_rate": 4.940305597372723e-05, + "loss": 2.5393, + "mean_token_accuracy": 0.4034482777118683, + "step": 118955 + }, + { + "epoch": 0.11981777545447861, + "grad_norm": 9.292965117055608, + "learning_rate": 4.940297022935473e-05, + "loss": 1.9019, + "mean_token_accuracy": 0.5253694593906403, + "step": 118960 + }, + { + "epoch": 0.11982281150758278, + "grad_norm": 11.01403846788282, + "learning_rate": 4.940288447890737e-05, + "loss": 2.2578, + "mean_token_accuracy": 0.49999998807907103, + "step": 118965 + }, + { + "epoch": 0.11982784756068696, + "grad_norm": 9.243486076187825, + "learning_rate": 4.940279872238515e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.4896551728248596, + "step": 118970 + }, + { + "epoch": 0.11983288361379113, + "grad_norm": 12.92874074726818, + "learning_rate": 4.9402712959788116e-05, + "loss": 2.547, + "mean_token_accuracy": 0.4052026689052582, + "step": 118975 + }, + { + "epoch": 0.1198379196668953, + "grad_norm": 10.171625556908468, + "learning_rate": 4.940262719111628e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.47931033968925474, + "step": 118980 + }, + { + "epoch": 0.11984295571999948, + "grad_norm": 9.600413939387508, + "learning_rate": 4.940254141636968e-05, + "loss": 2.3366, + "mean_token_accuracy": 0.4172413766384125, + "step": 118985 + }, + { + "epoch": 0.11984799177310365, + "grad_norm": 9.832874507392205, + "learning_rate": 4.940245563554831e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.4103448331356049, + "step": 118990 + }, + { + "epoch": 0.11985302782620783, + "grad_norm": 10.908752166369093, + "learning_rate": 4.9402369848652225e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.4527093589305878, + "step": 118995 + }, + { + "epoch": 0.119858063879312, + "grad_norm": 11.510245501421815, + "learning_rate": 4.940228405568143e-05, + "loss": 2.9673, + "mean_token_accuracy": 0.3620689630508423, + "step": 119000 + }, + { + "epoch": 0.11986309993241617, + "grad_norm": 10.230521188245369, + "learning_rate": 4.940219825663596e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.4379310369491577, + "step": 119005 + }, + { + "epoch": 0.11986813598552035, + "grad_norm": 12.91677881265146, + "learning_rate": 4.940211245151583e-05, + "loss": 2.3097, + "mean_token_accuracy": 0.44812030494213106, + "step": 119010 + }, + { + "epoch": 0.1198731720386245, + "grad_norm": 12.252074748502762, + "learning_rate": 4.940202664032107e-05, + "loss": 2.3363, + "mean_token_accuracy": 0.46896552443504336, + "step": 119015 + }, + { + "epoch": 0.11987820809172868, + "grad_norm": 13.843651949500092, + "learning_rate": 4.94019408230517e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.44482758045196535, + "step": 119020 + }, + { + "epoch": 0.11988324414483285, + "grad_norm": 10.150970044974168, + "learning_rate": 4.940185499970774e-05, + "loss": 2.2829, + "mean_token_accuracy": 0.4482758641242981, + "step": 119025 + }, + { + "epoch": 0.11988828019793703, + "grad_norm": 9.82948620935528, + "learning_rate": 4.9401769170289225e-05, + "loss": 2.0981, + "mean_token_accuracy": 0.46551724076271056, + "step": 119030 + }, + { + "epoch": 0.1198933162510412, + "grad_norm": 11.337032245245409, + "learning_rate": 4.9401683334796174e-05, + "loss": 2.3292, + "mean_token_accuracy": 0.3896551728248596, + "step": 119035 + }, + { + "epoch": 0.11989835230414538, + "grad_norm": 13.153124209937824, + "learning_rate": 4.94015974932286e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.413793095946312, + "step": 119040 + }, + { + "epoch": 0.11990338835724955, + "grad_norm": 14.768692772045624, + "learning_rate": 4.9401511645586545e-05, + "loss": 2.3267, + "mean_token_accuracy": 0.3827586233615875, + "step": 119045 + }, + { + "epoch": 0.11990842441035372, + "grad_norm": 10.708134388991471, + "learning_rate": 4.940142579187002e-05, + "loss": 2.6283, + "mean_token_accuracy": 0.37931033968925476, + "step": 119050 + }, + { + "epoch": 0.1199134604634579, + "grad_norm": 10.565276953590889, + "learning_rate": 4.940133993207906e-05, + "loss": 2.021, + "mean_token_accuracy": 0.4586206912994385, + "step": 119055 + }, + { + "epoch": 0.11991849651656207, + "grad_norm": 9.436655558648924, + "learning_rate": 4.940125406621367e-05, + "loss": 2.4049, + "mean_token_accuracy": 0.4448275864124298, + "step": 119060 + }, + { + "epoch": 0.11992353256966624, + "grad_norm": 9.163636747168164, + "learning_rate": 4.9401168194273894e-05, + "loss": 2.12, + "mean_token_accuracy": 0.4916515350341797, + "step": 119065 + }, + { + "epoch": 0.11992856862277042, + "grad_norm": 9.974203461871152, + "learning_rate": 4.940108231625974e-05, + "loss": 2.6935, + "mean_token_accuracy": 0.37586206793785093, + "step": 119070 + }, + { + "epoch": 0.11993360467587459, + "grad_norm": 11.032710511938467, + "learning_rate": 4.940099643217125e-05, + "loss": 2.4679, + "mean_token_accuracy": 0.4551724135875702, + "step": 119075 + }, + { + "epoch": 0.11993864072897877, + "grad_norm": 11.328955800219799, + "learning_rate": 4.940091054200843e-05, + "loss": 2.2894, + "mean_token_accuracy": 0.46551724076271056, + "step": 119080 + }, + { + "epoch": 0.11994367678208293, + "grad_norm": 9.538276974271273, + "learning_rate": 4.940082464577131e-05, + "loss": 1.9611, + "mean_token_accuracy": 0.4724137902259827, + "step": 119085 + }, + { + "epoch": 0.1199487128351871, + "grad_norm": 9.495828542190486, + "learning_rate": 4.940073874345992e-05, + "loss": 2.188, + "mean_token_accuracy": 0.4517241418361664, + "step": 119090 + }, + { + "epoch": 0.11995374888829127, + "grad_norm": 12.40922566963609, + "learning_rate": 4.940065283507428e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.34137930870056155, + "step": 119095 + }, + { + "epoch": 0.11995878494139545, + "grad_norm": 8.266268253995737, + "learning_rate": 4.94005669206144e-05, + "loss": 2.0969, + "mean_token_accuracy": 0.4517241358757019, + "step": 119100 + }, + { + "epoch": 0.11996382099449962, + "grad_norm": 10.477201096847546, + "learning_rate": 4.940048100008032e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.4655172348022461, + "step": 119105 + }, + { + "epoch": 0.1199688570476038, + "grad_norm": 10.795744390895273, + "learning_rate": 4.940039507347207e-05, + "loss": 2.1984, + "mean_token_accuracy": 0.4620689570903778, + "step": 119110 + }, + { + "epoch": 0.11997389310070797, + "grad_norm": 12.252935225346146, + "learning_rate": 4.940030914078965e-05, + "loss": 2.7286, + "mean_token_accuracy": 0.35862069129943847, + "step": 119115 + }, + { + "epoch": 0.11997892915381214, + "grad_norm": 10.224078113513775, + "learning_rate": 4.9400223202033104e-05, + "loss": 2.0383, + "mean_token_accuracy": 0.4034482777118683, + "step": 119120 + }, + { + "epoch": 0.11998396520691632, + "grad_norm": 11.5327247403299, + "learning_rate": 4.940013725720245e-05, + "loss": 2.3646, + "mean_token_accuracy": 0.4344827651977539, + "step": 119125 + }, + { + "epoch": 0.11998900126002049, + "grad_norm": 8.676333968411784, + "learning_rate": 4.9400051306297706e-05, + "loss": 2.3536, + "mean_token_accuracy": 0.42068966031074523, + "step": 119130 + }, + { + "epoch": 0.11999403731312466, + "grad_norm": 10.610228992034166, + "learning_rate": 4.9399965349318914e-05, + "loss": 2.1975, + "mean_token_accuracy": 0.46551724076271056, + "step": 119135 + }, + { + "epoch": 0.11999907336622884, + "grad_norm": 10.964166810150786, + "learning_rate": 4.9399879386266076e-05, + "loss": 1.8283, + "mean_token_accuracy": 0.5090744078159333, + "step": 119140 + }, + { + "epoch": 0.12000410941933301, + "grad_norm": 11.34789436059137, + "learning_rate": 4.9399793417139225e-05, + "loss": 2.4062, + "mean_token_accuracy": 0.4358741700649261, + "step": 119145 + }, + { + "epoch": 0.12000914547243718, + "grad_norm": 13.675601557621595, + "learning_rate": 4.939970744193838e-05, + "loss": 2.1142, + "mean_token_accuracy": 0.42068964540958403, + "step": 119150 + }, + { + "epoch": 0.12001418152554134, + "grad_norm": 16.563869672257503, + "learning_rate": 4.939962146066358e-05, + "loss": 2.7509, + "mean_token_accuracy": 0.4034482717514038, + "step": 119155 + }, + { + "epoch": 0.12001921757864552, + "grad_norm": 9.714801161384313, + "learning_rate": 4.939953547331483e-05, + "loss": 2.1133, + "mean_token_accuracy": 0.5, + "step": 119160 + }, + { + "epoch": 0.12002425363174969, + "grad_norm": 8.702701082102617, + "learning_rate": 4.939944947989217e-05, + "loss": 2.6492, + "mean_token_accuracy": 0.36896551847457887, + "step": 119165 + }, + { + "epoch": 0.12002928968485387, + "grad_norm": 11.956838965722106, + "learning_rate": 4.93993634803956e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.4034482777118683, + "step": 119170 + }, + { + "epoch": 0.12003432573795804, + "grad_norm": 10.97510252987617, + "learning_rate": 4.939927747482518e-05, + "loss": 2.4492, + "mean_token_accuracy": 0.4551724135875702, + "step": 119175 + }, + { + "epoch": 0.12003936179106221, + "grad_norm": 9.048682434985329, + "learning_rate": 4.93991914631809e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.4434361755847931, + "step": 119180 + }, + { + "epoch": 0.12004439784416639, + "grad_norm": 10.984697320878448, + "learning_rate": 4.93991054454628e-05, + "loss": 2.4599, + "mean_token_accuracy": 0.42758620381355283, + "step": 119185 + }, + { + "epoch": 0.12004943389727056, + "grad_norm": 10.492211584079056, + "learning_rate": 4.939901942167091e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.42758620381355283, + "step": 119190 + }, + { + "epoch": 0.12005446995037473, + "grad_norm": 15.40266057087019, + "learning_rate": 4.9398933391805236e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.4379310429096222, + "step": 119195 + }, + { + "epoch": 0.12005950600347891, + "grad_norm": 9.522418932194364, + "learning_rate": 4.939884735586581e-05, + "loss": 2.0445, + "mean_token_accuracy": 0.4344827592372894, + "step": 119200 + }, + { + "epoch": 0.12006454205658308, + "grad_norm": 14.270578034997587, + "learning_rate": 4.939876131385266e-05, + "loss": 2.0061, + "mean_token_accuracy": 0.4482758641242981, + "step": 119205 + }, + { + "epoch": 0.12006957810968726, + "grad_norm": 12.579368981392484, + "learning_rate": 4.9398675265765806e-05, + "loss": 2.5752, + "mean_token_accuracy": 0.3931034505367279, + "step": 119210 + }, + { + "epoch": 0.12007461416279143, + "grad_norm": 10.493216865759225, + "learning_rate": 4.9398589211605285e-05, + "loss": 2.4452, + "mean_token_accuracy": 0.4448275864124298, + "step": 119215 + }, + { + "epoch": 0.1200796502158956, + "grad_norm": 9.083767363237282, + "learning_rate": 4.9398503151371096e-05, + "loss": 2.0128, + "mean_token_accuracy": 0.4655172348022461, + "step": 119220 + }, + { + "epoch": 0.12008468626899976, + "grad_norm": 8.68131762824565, + "learning_rate": 4.939841708506327e-05, + "loss": 1.985, + "mean_token_accuracy": 0.4586206912994385, + "step": 119225 + }, + { + "epoch": 0.12008972232210394, + "grad_norm": 9.060833273217225, + "learning_rate": 4.939833101268185e-05, + "loss": 2.2175, + "mean_token_accuracy": 0.5, + "step": 119230 + }, + { + "epoch": 0.12009475837520811, + "grad_norm": 10.658697829405641, + "learning_rate": 4.9398244934226835e-05, + "loss": 2.547, + "mean_token_accuracy": 0.37241379618644715, + "step": 119235 + }, + { + "epoch": 0.12009979442831228, + "grad_norm": 11.165869247559232, + "learning_rate": 4.939815884969827e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.44137930274009707, + "step": 119240 + }, + { + "epoch": 0.12010483048141646, + "grad_norm": 9.00918724149998, + "learning_rate": 4.939807275909616e-05, + "loss": 2.3159, + "mean_token_accuracy": 0.44827585220336913, + "step": 119245 + }, + { + "epoch": 0.12010986653452063, + "grad_norm": 8.59740497651537, + "learning_rate": 4.939798666242054e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.4259528160095215, + "step": 119250 + }, + { + "epoch": 0.1201149025876248, + "grad_norm": 12.549208054763874, + "learning_rate": 4.939790055967144e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.4034482717514038, + "step": 119255 + }, + { + "epoch": 0.12011993864072898, + "grad_norm": 10.297792320242346, + "learning_rate": 4.9397814450848866e-05, + "loss": 2.378, + "mean_token_accuracy": 0.3999999940395355, + "step": 119260 + }, + { + "epoch": 0.12012497469383315, + "grad_norm": 13.728203886537814, + "learning_rate": 4.939772833595286e-05, + "loss": 2.1032, + "mean_token_accuracy": 0.47241379618644713, + "step": 119265 + }, + { + "epoch": 0.12013001074693733, + "grad_norm": 9.196916049079698, + "learning_rate": 4.939764221498343e-05, + "loss": 2.2051, + "mean_token_accuracy": 0.44482759237289426, + "step": 119270 + }, + { + "epoch": 0.1201350468000415, + "grad_norm": 8.360078704652105, + "learning_rate": 4.93975560879406e-05, + "loss": 2.1462, + "mean_token_accuracy": 0.47931034564971925, + "step": 119275 + }, + { + "epoch": 0.12014008285314567, + "grad_norm": 10.720533760981393, + "learning_rate": 4.939746995482441e-05, + "loss": 2.229, + "mean_token_accuracy": 0.4551724076271057, + "step": 119280 + }, + { + "epoch": 0.12014511890624985, + "grad_norm": 12.01617068733847, + "learning_rate": 4.939738381563488e-05, + "loss": 2.8512, + "mean_token_accuracy": 0.37586206793785093, + "step": 119285 + }, + { + "epoch": 0.12015015495935402, + "grad_norm": 10.344463691227435, + "learning_rate": 4.939729767037203e-05, + "loss": 2.7412, + "mean_token_accuracy": 0.39310345649719236, + "step": 119290 + }, + { + "epoch": 0.12015519101245818, + "grad_norm": 10.8528720313763, + "learning_rate": 4.9397211519035876e-05, + "loss": 2.5764, + "mean_token_accuracy": 0.4034482717514038, + "step": 119295 + }, + { + "epoch": 0.12016022706556236, + "grad_norm": 10.249421101168654, + "learning_rate": 4.939712536162645e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.46551724076271056, + "step": 119300 + }, + { + "epoch": 0.12016526311866653, + "grad_norm": 9.908293084728589, + "learning_rate": 4.939703919814377e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.4689655125141144, + "step": 119305 + }, + { + "epoch": 0.1201702991717707, + "grad_norm": 11.284976380655607, + "learning_rate": 4.939695302858787e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.3896551728248596, + "step": 119310 + }, + { + "epoch": 0.12017533522487488, + "grad_norm": 11.631437179094759, + "learning_rate": 4.9396866852958764e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.38620689511299133, + "step": 119315 + }, + { + "epoch": 0.12018037127797905, + "grad_norm": 9.041907164640111, + "learning_rate": 4.9396780671256494e-05, + "loss": 2.4526, + "mean_token_accuracy": 0.4241379380226135, + "step": 119320 + }, + { + "epoch": 0.12018540733108322, + "grad_norm": 10.280150140189098, + "learning_rate": 4.939669448348105e-05, + "loss": 2.1474, + "mean_token_accuracy": 0.44482759237289426, + "step": 119325 + }, + { + "epoch": 0.1201904433841874, + "grad_norm": 27.760202456618853, + "learning_rate": 4.9396608289632483e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.4448275864124298, + "step": 119330 + }, + { + "epoch": 0.12019547943729157, + "grad_norm": 11.669155123272887, + "learning_rate": 4.939652208971082e-05, + "loss": 2.6354, + "mean_token_accuracy": 0.39812461137771604, + "step": 119335 + }, + { + "epoch": 0.12020051549039575, + "grad_norm": 7.567567153558938, + "learning_rate": 4.939643588371607e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.5019963622093201, + "step": 119340 + }, + { + "epoch": 0.12020555154349992, + "grad_norm": 11.94060441776718, + "learning_rate": 4.939634967164826e-05, + "loss": 2.4958, + "mean_token_accuracy": 0.3862069010734558, + "step": 119345 + }, + { + "epoch": 0.1202105875966041, + "grad_norm": 11.562754340638174, + "learning_rate": 4.9396263453507415e-05, + "loss": 2.7727, + "mean_token_accuracy": 0.37586206793785093, + "step": 119350 + }, + { + "epoch": 0.12021562364970827, + "grad_norm": 10.803906235923012, + "learning_rate": 4.939617722929356e-05, + "loss": 2.815, + "mean_token_accuracy": 0.37586206793785093, + "step": 119355 + }, + { + "epoch": 0.12022065970281244, + "grad_norm": 10.604244322832534, + "learning_rate": 4.939609099900671e-05, + "loss": 2.2045, + "mean_token_accuracy": 0.4862068951129913, + "step": 119360 + }, + { + "epoch": 0.1202256957559166, + "grad_norm": 10.367510210182056, + "learning_rate": 4.939600476264691e-05, + "loss": 2.286, + "mean_token_accuracy": 0.4620689690113068, + "step": 119365 + }, + { + "epoch": 0.12023073180902077, + "grad_norm": 8.71436870723413, + "learning_rate": 4.939591852021417e-05, + "loss": 2.2074, + "mean_token_accuracy": 0.4586206912994385, + "step": 119370 + }, + { + "epoch": 0.12023576786212495, + "grad_norm": 11.10623928914366, + "learning_rate": 4.939583227170851e-05, + "loss": 3.0237, + "mean_token_accuracy": 0.3448275804519653, + "step": 119375 + }, + { + "epoch": 0.12024080391522912, + "grad_norm": 11.043683362963579, + "learning_rate": 4.9395746017129965e-05, + "loss": 2.4732, + "mean_token_accuracy": 0.40689656138420105, + "step": 119380 + }, + { + "epoch": 0.1202458399683333, + "grad_norm": 13.284203050957663, + "learning_rate": 4.9395659756478545e-05, + "loss": 3.2988, + "mean_token_accuracy": 0.36896551251411436, + "step": 119385 + }, + { + "epoch": 0.12025087602143747, + "grad_norm": 9.54136856788621, + "learning_rate": 4.939557348975429e-05, + "loss": 2.2835, + "mean_token_accuracy": 0.4172413766384125, + "step": 119390 + }, + { + "epoch": 0.12025591207454164, + "grad_norm": 10.81992871715047, + "learning_rate": 4.939548721695721e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.4517241358757019, + "step": 119395 + }, + { + "epoch": 0.12026094812764582, + "grad_norm": 10.651737103546157, + "learning_rate": 4.939540093808734e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.36206896901130675, + "step": 119400 + }, + { + "epoch": 0.12026598418074999, + "grad_norm": 9.287539398613724, + "learning_rate": 4.9395314653144696e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.42758620381355283, + "step": 119405 + }, + { + "epoch": 0.12027102023385416, + "grad_norm": 9.869456143179955, + "learning_rate": 4.9395228362129313e-05, + "loss": 2.6165, + "mean_token_accuracy": 0.42758620381355283, + "step": 119410 + }, + { + "epoch": 0.12027605628695834, + "grad_norm": 32.52525249723404, + "learning_rate": 4.939514206504119e-05, + "loss": 3.0794, + "mean_token_accuracy": 0.3965517163276672, + "step": 119415 + }, + { + "epoch": 0.12028109234006251, + "grad_norm": 12.64574993353301, + "learning_rate": 4.939505576188038e-05, + "loss": 2.3077, + "mean_token_accuracy": 0.4517241358757019, + "step": 119420 + }, + { + "epoch": 0.12028612839316669, + "grad_norm": 9.992750336531746, + "learning_rate": 4.939496945264689e-05, + "loss": 2.5118, + "mean_token_accuracy": 0.4448275864124298, + "step": 119425 + }, + { + "epoch": 0.12029116444627086, + "grad_norm": 10.348380643279022, + "learning_rate": 4.939488313734076e-05, + "loss": 2.0726, + "mean_token_accuracy": 0.4862069010734558, + "step": 119430 + }, + { + "epoch": 0.12029620049937502, + "grad_norm": 11.011436839827741, + "learning_rate": 4.939479681596199e-05, + "loss": 2.5894, + "mean_token_accuracy": 0.4206896543502808, + "step": 119435 + }, + { + "epoch": 0.1203012365524792, + "grad_norm": 9.745717444248355, + "learning_rate": 4.939471048851062e-05, + "loss": 2.4751, + "mean_token_accuracy": 0.39655172228813174, + "step": 119440 + }, + { + "epoch": 0.12030627260558337, + "grad_norm": 11.221352869419286, + "learning_rate": 4.939462415498666e-05, + "loss": 2.1377, + "mean_token_accuracy": 0.44827585816383364, + "step": 119445 + }, + { + "epoch": 0.12031130865868754, + "grad_norm": 12.30248396059743, + "learning_rate": 4.9394537815390165e-05, + "loss": 2.4484, + "mean_token_accuracy": 0.3896551728248596, + "step": 119450 + }, + { + "epoch": 0.12031634471179171, + "grad_norm": 7.836043482022656, + "learning_rate": 4.939445146972112e-05, + "loss": 2.1376, + "mean_token_accuracy": 0.44137929677963256, + "step": 119455 + }, + { + "epoch": 0.12032138076489589, + "grad_norm": 10.88343759550977, + "learning_rate": 4.939436511797958e-05, + "loss": 2.1031, + "mean_token_accuracy": 0.4620689630508423, + "step": 119460 + }, + { + "epoch": 0.12032641681800006, + "grad_norm": 16.91187662092988, + "learning_rate": 4.939427876016555e-05, + "loss": 2.9889, + "mean_token_accuracy": 0.4172413766384125, + "step": 119465 + }, + { + "epoch": 0.12033145287110424, + "grad_norm": 11.636216546632317, + "learning_rate": 4.939419239627906e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.4569267988204956, + "step": 119470 + }, + { + "epoch": 0.12033648892420841, + "grad_norm": 10.63139269103073, + "learning_rate": 4.939410602632013e-05, + "loss": 1.991, + "mean_token_accuracy": 0.5090139031410217, + "step": 119475 + }, + { + "epoch": 0.12034152497731258, + "grad_norm": 9.018000626298614, + "learning_rate": 4.93940196502888e-05, + "loss": 2.4226, + "mean_token_accuracy": 0.3896551728248596, + "step": 119480 + }, + { + "epoch": 0.12034656103041676, + "grad_norm": 8.911382714420135, + "learning_rate": 4.939393326818507e-05, + "loss": 1.8531, + "mean_token_accuracy": 0.5034482717514038, + "step": 119485 + }, + { + "epoch": 0.12035159708352093, + "grad_norm": 11.54642673046286, + "learning_rate": 4.9393846880008984e-05, + "loss": 2.3367, + "mean_token_accuracy": 0.43793103098869324, + "step": 119490 + }, + { + "epoch": 0.1203566331366251, + "grad_norm": 10.832706700787105, + "learning_rate": 4.939376048576055e-05, + "loss": 2.2838, + "mean_token_accuracy": 0.4137930989265442, + "step": 119495 + }, + { + "epoch": 0.12036166918972928, + "grad_norm": 9.819163447589506, + "learning_rate": 4.939367408543981e-05, + "loss": 2.1329, + "mean_token_accuracy": 0.44482759237289426, + "step": 119500 + }, + { + "epoch": 0.12036670524283344, + "grad_norm": 9.918766033231737, + "learning_rate": 4.939358767904677e-05, + "loss": 2.2703, + "mean_token_accuracy": 0.42413792610168455, + "step": 119505 + }, + { + "epoch": 0.12037174129593761, + "grad_norm": 11.972368695892213, + "learning_rate": 4.9393501266581465e-05, + "loss": 2.3409, + "mean_token_accuracy": 0.4620689690113068, + "step": 119510 + }, + { + "epoch": 0.12037677734904179, + "grad_norm": 11.767434629251118, + "learning_rate": 4.939341484804391e-05, + "loss": 2.6007, + "mean_token_accuracy": 0.38275861740112305, + "step": 119515 + }, + { + "epoch": 0.12038181340214596, + "grad_norm": 8.979492131450424, + "learning_rate": 4.9393328423434146e-05, + "loss": 2.5495, + "mean_token_accuracy": 0.4413793087005615, + "step": 119520 + }, + { + "epoch": 0.12038684945525013, + "grad_norm": 9.724939695225908, + "learning_rate": 4.939324199275218e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.44482758045196535, + "step": 119525 + }, + { + "epoch": 0.12039188550835431, + "grad_norm": 12.575643828864438, + "learning_rate": 4.9393155555998034e-05, + "loss": 2.6445, + "mean_token_accuracy": 0.37931033968925476, + "step": 119530 + }, + { + "epoch": 0.12039692156145848, + "grad_norm": 11.776350353140675, + "learning_rate": 4.939306911317175e-05, + "loss": 2.3675, + "mean_token_accuracy": 0.39655172526836396, + "step": 119535 + }, + { + "epoch": 0.12040195761456265, + "grad_norm": 9.704736849706698, + "learning_rate": 4.939298266427334e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.4570477962493896, + "step": 119540 + }, + { + "epoch": 0.12040699366766683, + "grad_norm": 10.293264739354548, + "learning_rate": 4.9392896209302826e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.4137930989265442, + "step": 119545 + }, + { + "epoch": 0.120412029720771, + "grad_norm": 11.01948440028967, + "learning_rate": 4.9392809748260236e-05, + "loss": 2.0536, + "mean_token_accuracy": 0.4918935298919678, + "step": 119550 + }, + { + "epoch": 0.12041706577387518, + "grad_norm": 10.314883741051242, + "learning_rate": 4.939272328114559e-05, + "loss": 2.0789, + "mean_token_accuracy": 0.4896551728248596, + "step": 119555 + }, + { + "epoch": 0.12042210182697935, + "grad_norm": 9.714599254518184, + "learning_rate": 4.939263680795892e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.42068966031074523, + "step": 119560 + }, + { + "epoch": 0.12042713788008352, + "grad_norm": 7.991958419485984, + "learning_rate": 4.939255032870025e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.4435571670532227, + "step": 119565 + }, + { + "epoch": 0.1204321739331877, + "grad_norm": 9.614107222669023, + "learning_rate": 4.9392463843369595e-05, + "loss": 2.3379, + "mean_token_accuracy": 0.40344828367233276, + "step": 119570 + }, + { + "epoch": 0.12043720998629186, + "grad_norm": 8.794686040579972, + "learning_rate": 4.9392377351966986e-05, + "loss": 2.1446, + "mean_token_accuracy": 0.4620689630508423, + "step": 119575 + }, + { + "epoch": 0.12044224603939603, + "grad_norm": 12.737362241231565, + "learning_rate": 4.939229085449243e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.42758620381355283, + "step": 119580 + }, + { + "epoch": 0.1204472820925002, + "grad_norm": 12.01372889612714, + "learning_rate": 4.939220435094599e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.441379314661026, + "step": 119585 + }, + { + "epoch": 0.12045231814560438, + "grad_norm": 9.591463992722597, + "learning_rate": 4.939211784132765e-05, + "loss": 2.0362, + "mean_token_accuracy": 0.4878402888774872, + "step": 119590 + }, + { + "epoch": 0.12045735419870855, + "grad_norm": 12.380487522905705, + "learning_rate": 4.939203132563745e-05, + "loss": 2.5136, + "mean_token_accuracy": 0.4448275864124298, + "step": 119595 + }, + { + "epoch": 0.12046239025181273, + "grad_norm": 10.470770068110566, + "learning_rate": 4.939194480387542e-05, + "loss": 2.5423, + "mean_token_accuracy": 0.4137930929660797, + "step": 119600 + }, + { + "epoch": 0.1204674263049169, + "grad_norm": 12.650502820147741, + "learning_rate": 4.939185827604157e-05, + "loss": 2.3645, + "mean_token_accuracy": 0.4034482717514038, + "step": 119605 + }, + { + "epoch": 0.12047246235802107, + "grad_norm": 9.570650699452537, + "learning_rate": 4.939177174213593e-05, + "loss": 2.2257, + "mean_token_accuracy": 0.4206896543502808, + "step": 119610 + }, + { + "epoch": 0.12047749841112525, + "grad_norm": 11.739066180743068, + "learning_rate": 4.939168520215854e-05, + "loss": 2.2903, + "mean_token_accuracy": 0.417241370677948, + "step": 119615 + }, + { + "epoch": 0.12048253446422942, + "grad_norm": 11.658804802019306, + "learning_rate": 4.9391598656109394e-05, + "loss": 2.0974, + "mean_token_accuracy": 0.43103447556495667, + "step": 119620 + }, + { + "epoch": 0.1204875705173336, + "grad_norm": 8.22395582134829, + "learning_rate": 4.9391512103988543e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.5160314559936523, + "step": 119625 + }, + { + "epoch": 0.12049260657043777, + "grad_norm": 10.632937136180853, + "learning_rate": 4.939142554579599e-05, + "loss": 2.2095, + "mean_token_accuracy": 0.4586206912994385, + "step": 119630 + }, + { + "epoch": 0.12049764262354194, + "grad_norm": 9.372745707739217, + "learning_rate": 4.939133898153178e-05, + "loss": 3.4135, + "mean_token_accuracy": 0.3620689660310745, + "step": 119635 + }, + { + "epoch": 0.12050267867664612, + "grad_norm": 12.383556932427554, + "learning_rate": 4.9391252411195917e-05, + "loss": 2.5519, + "mean_token_accuracy": 0.41554749608039854, + "step": 119640 + }, + { + "epoch": 0.12050771472975028, + "grad_norm": 14.570918422761398, + "learning_rate": 4.9391165834788435e-05, + "loss": 2.2136, + "mean_token_accuracy": 0.47931034564971925, + "step": 119645 + }, + { + "epoch": 0.12051275078285445, + "grad_norm": 12.057980017328477, + "learning_rate": 4.939107925230935e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.37586206793785093, + "step": 119650 + }, + { + "epoch": 0.12051778683595862, + "grad_norm": 11.984541302396252, + "learning_rate": 4.9390992663758694e-05, + "loss": 3.0031, + "mean_token_accuracy": 0.3620689630508423, + "step": 119655 + }, + { + "epoch": 0.1205228228890628, + "grad_norm": 8.9438285946296, + "learning_rate": 4.9390906069136494e-05, + "loss": 2.102, + "mean_token_accuracy": 0.47241378426551817, + "step": 119660 + }, + { + "epoch": 0.12052785894216697, + "grad_norm": 9.183306602436094, + "learning_rate": 4.9390819468442776e-05, + "loss": 2.3076, + "mean_token_accuracy": 0.4551724135875702, + "step": 119665 + }, + { + "epoch": 0.12053289499527114, + "grad_norm": 17.02493412871042, + "learning_rate": 4.9390732861677546e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.44137930274009707, + "step": 119670 + }, + { + "epoch": 0.12053793104837532, + "grad_norm": 10.495945609886947, + "learning_rate": 4.9390646248840845e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.4344827592372894, + "step": 119675 + }, + { + "epoch": 0.12054296710147949, + "grad_norm": 10.71761475111207, + "learning_rate": 4.9390559629932686e-05, + "loss": 2.618, + "mean_token_accuracy": 0.3999999940395355, + "step": 119680 + }, + { + "epoch": 0.12054800315458367, + "grad_norm": 10.270847984584083, + "learning_rate": 4.939047300495311e-05, + "loss": 2.0535, + "mean_token_accuracy": 0.46551724076271056, + "step": 119685 + }, + { + "epoch": 0.12055303920768784, + "grad_norm": 11.069210295224085, + "learning_rate": 4.9390386373902124e-05, + "loss": 2.1289, + "mean_token_accuracy": 0.4620689630508423, + "step": 119690 + }, + { + "epoch": 0.12055807526079201, + "grad_norm": 12.26007824166588, + "learning_rate": 4.9390299736779755e-05, + "loss": 2.1972, + "mean_token_accuracy": 0.42758620381355283, + "step": 119695 + }, + { + "epoch": 0.12056311131389619, + "grad_norm": 12.172665597580234, + "learning_rate": 4.939021309358603e-05, + "loss": 2.5059, + "mean_token_accuracy": 0.41034482717514037, + "step": 119700 + }, + { + "epoch": 0.12056814736700036, + "grad_norm": 10.495685601158664, + "learning_rate": 4.9390126444320976e-05, + "loss": 2.4284, + "mean_token_accuracy": 0.42758620381355283, + "step": 119705 + }, + { + "epoch": 0.12057318342010453, + "grad_norm": 14.974502979744939, + "learning_rate": 4.939003978898461e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4206896543502808, + "step": 119710 + }, + { + "epoch": 0.1205782194732087, + "grad_norm": 10.603599990063904, + "learning_rate": 4.938995312757696e-05, + "loss": 2.0775, + "mean_token_accuracy": 0.47586206793785096, + "step": 119715 + }, + { + "epoch": 0.12058325552631287, + "grad_norm": 10.915449558780491, + "learning_rate": 4.938986646009806e-05, + "loss": 2.5771, + "mean_token_accuracy": 0.47755595445632937, + "step": 119720 + }, + { + "epoch": 0.12058829157941704, + "grad_norm": 8.697697814021558, + "learning_rate": 4.9389779786547905e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.4517241418361664, + "step": 119725 + }, + { + "epoch": 0.12059332763252122, + "grad_norm": 9.949494820778709, + "learning_rate": 4.938969310692655e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.44827585816383364, + "step": 119730 + }, + { + "epoch": 0.12059836368562539, + "grad_norm": 7.897445737047752, + "learning_rate": 4.9389606421234006e-05, + "loss": 1.9606, + "mean_token_accuracy": 0.49516030550003054, + "step": 119735 + }, + { + "epoch": 0.12060339973872956, + "grad_norm": 8.208998202976456, + "learning_rate": 4.93895197294703e-05, + "loss": 2.2496, + "mean_token_accuracy": 0.47241379618644713, + "step": 119740 + }, + { + "epoch": 0.12060843579183374, + "grad_norm": 10.776803221210308, + "learning_rate": 4.9389433031635456e-05, + "loss": 2.4423, + "mean_token_accuracy": 0.42413793206214906, + "step": 119745 + }, + { + "epoch": 0.12061347184493791, + "grad_norm": 10.536352296005909, + "learning_rate": 4.938934632772949e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.4344827592372894, + "step": 119750 + }, + { + "epoch": 0.12061850789804208, + "grad_norm": 10.659730998003429, + "learning_rate": 4.938925961775243e-05, + "loss": 2.2688, + "mean_token_accuracy": 0.44827587008476255, + "step": 119755 + }, + { + "epoch": 0.12062354395114626, + "grad_norm": 10.37958100675098, + "learning_rate": 4.938917290170431e-05, + "loss": 2.218, + "mean_token_accuracy": 0.44827585816383364, + "step": 119760 + }, + { + "epoch": 0.12062858000425043, + "grad_norm": 11.495114283050215, + "learning_rate": 4.938908617958514e-05, + "loss": 2.1074, + "mean_token_accuracy": 0.5026013255119324, + "step": 119765 + }, + { + "epoch": 0.1206336160573546, + "grad_norm": 11.40299953602754, + "learning_rate": 4.938899945139496e-05, + "loss": 2.265, + "mean_token_accuracy": 0.4689655065536499, + "step": 119770 + }, + { + "epoch": 0.12063865211045878, + "grad_norm": 10.071225830748558, + "learning_rate": 4.9388912717133774e-05, + "loss": 2.3125, + "mean_token_accuracy": 0.46551724672317507, + "step": 119775 + }, + { + "epoch": 0.12064368816356295, + "grad_norm": 10.156495815418682, + "learning_rate": 4.938882597680162e-05, + "loss": 2.5974, + "mean_token_accuracy": 0.4, + "step": 119780 + }, + { + "epoch": 0.12064872421666711, + "grad_norm": 10.775821589432335, + "learning_rate": 4.9388739230398515e-05, + "loss": 2.5367, + "mean_token_accuracy": 0.4068965494632721, + "step": 119785 + }, + { + "epoch": 0.12065376026977129, + "grad_norm": 11.932509695464233, + "learning_rate": 4.938865247792449e-05, + "loss": 2.3439, + "mean_token_accuracy": 0.4560344874858856, + "step": 119790 + }, + { + "epoch": 0.12065879632287546, + "grad_norm": 10.289316619823074, + "learning_rate": 4.938856571937956e-05, + "loss": 2.5038, + "mean_token_accuracy": 0.43103447556495667, + "step": 119795 + }, + { + "epoch": 0.12066383237597963, + "grad_norm": 10.219116144507854, + "learning_rate": 4.9388478954763764e-05, + "loss": 2.2071, + "mean_token_accuracy": 0.4918935298919678, + "step": 119800 + }, + { + "epoch": 0.12066886842908381, + "grad_norm": 19.40126364999409, + "learning_rate": 4.9388392184077106e-05, + "loss": 2.546, + "mean_token_accuracy": 0.4310344815254211, + "step": 119805 + }, + { + "epoch": 0.12067390448218798, + "grad_norm": 11.096735218936674, + "learning_rate": 4.938830540731963e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.4931034564971924, + "step": 119810 + }, + { + "epoch": 0.12067894053529216, + "grad_norm": 11.279328338358503, + "learning_rate": 4.938821862449135e-05, + "loss": 2.6181, + "mean_token_accuracy": 0.42758620977401735, + "step": 119815 + }, + { + "epoch": 0.12068397658839633, + "grad_norm": 10.461129544658773, + "learning_rate": 4.9388131835592284e-05, + "loss": 2.448, + "mean_token_accuracy": 0.3931034505367279, + "step": 119820 + }, + { + "epoch": 0.1206890126415005, + "grad_norm": 15.349564432918356, + "learning_rate": 4.938804504062247e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.4344827592372894, + "step": 119825 + }, + { + "epoch": 0.12069404869460468, + "grad_norm": 10.463945414669572, + "learning_rate": 4.938795823958192e-05, + "loss": 2.7383, + "mean_token_accuracy": 0.42758620977401735, + "step": 119830 + }, + { + "epoch": 0.12069908474770885, + "grad_norm": 11.141218834272754, + "learning_rate": 4.9387871432470665e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.4206896543502808, + "step": 119835 + }, + { + "epoch": 0.12070412080081303, + "grad_norm": 9.729841806931288, + "learning_rate": 4.938778461928874e-05, + "loss": 2.3785, + "mean_token_accuracy": 0.4586206912994385, + "step": 119840 + }, + { + "epoch": 0.1207091568539172, + "grad_norm": 9.286168167246696, + "learning_rate": 4.938769780003614e-05, + "loss": 2.0795, + "mean_token_accuracy": 0.5000000059604645, + "step": 119845 + }, + { + "epoch": 0.12071419290702137, + "grad_norm": 10.922441992397061, + "learning_rate": 4.938761097471291e-05, + "loss": 2.6727, + "mean_token_accuracy": 0.36551723778247835, + "step": 119850 + }, + { + "epoch": 0.12071922896012553, + "grad_norm": 13.17136120137017, + "learning_rate": 4.938752414331906e-05, + "loss": 2.2017, + "mean_token_accuracy": 0.49999998807907103, + "step": 119855 + }, + { + "epoch": 0.1207242650132297, + "grad_norm": 12.032808538737187, + "learning_rate": 4.9387437305854634e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.3758620619773865, + "step": 119860 + }, + { + "epoch": 0.12072930106633388, + "grad_norm": 11.144776189053026, + "learning_rate": 4.9387350462319647e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.42413792610168455, + "step": 119865 + }, + { + "epoch": 0.12073433711943805, + "grad_norm": 11.093021360465691, + "learning_rate": 4.9387263612714116e-05, + "loss": 2.6446, + "mean_token_accuracy": 0.3931034505367279, + "step": 119870 + }, + { + "epoch": 0.12073937317254223, + "grad_norm": 10.068329441708558, + "learning_rate": 4.9387176757038077e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.47586206793785096, + "step": 119875 + }, + { + "epoch": 0.1207444092256464, + "grad_norm": 11.692878504966988, + "learning_rate": 4.938708989529154e-05, + "loss": 2.7517, + "mean_token_accuracy": 0.38965516686439516, + "step": 119880 + }, + { + "epoch": 0.12074944527875058, + "grad_norm": 9.56749402782208, + "learning_rate": 4.9387003027474545e-05, + "loss": 2.5233, + "mean_token_accuracy": 0.38620689511299133, + "step": 119885 + }, + { + "epoch": 0.12075448133185475, + "grad_norm": 9.429646089478142, + "learning_rate": 4.93869161535871e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.43448275327682495, + "step": 119890 + }, + { + "epoch": 0.12075951738495892, + "grad_norm": 11.126281550131901, + "learning_rate": 4.938682927362925e-05, + "loss": 2.3736, + "mean_token_accuracy": 0.467271625995636, + "step": 119895 + }, + { + "epoch": 0.1207645534380631, + "grad_norm": 12.089545852863473, + "learning_rate": 4.938674238760099e-05, + "loss": 2.7572, + "mean_token_accuracy": 0.38275861740112305, + "step": 119900 + }, + { + "epoch": 0.12076958949116727, + "grad_norm": 11.928383775352179, + "learning_rate": 4.938665549550238e-05, + "loss": 2.8054, + "mean_token_accuracy": 0.37241379618644715, + "step": 119905 + }, + { + "epoch": 0.12077462554427144, + "grad_norm": 13.112245779094343, + "learning_rate": 4.9386568597333406e-05, + "loss": 2.8366, + "mean_token_accuracy": 0.4, + "step": 119910 + }, + { + "epoch": 0.12077966159737562, + "grad_norm": 11.663632670937336, + "learning_rate": 4.9386481693094115e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.38965516686439516, + "step": 119915 + }, + { + "epoch": 0.12078469765047979, + "grad_norm": 12.139700838623277, + "learning_rate": 4.938639478278453e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.4517241418361664, + "step": 119920 + }, + { + "epoch": 0.12078973370358395, + "grad_norm": 10.141972203242688, + "learning_rate": 4.938630786640467e-05, + "loss": 2.7594, + "mean_token_accuracy": 0.3655172407627106, + "step": 119925 + }, + { + "epoch": 0.12079476975668813, + "grad_norm": 9.07689169863671, + "learning_rate": 4.938622094395456e-05, + "loss": 2.3086, + "mean_token_accuracy": 0.46551724076271056, + "step": 119930 + }, + { + "epoch": 0.1207998058097923, + "grad_norm": 9.862111892403567, + "learning_rate": 4.9386134015434225e-05, + "loss": 2.5544, + "mean_token_accuracy": 0.41379310488700866, + "step": 119935 + }, + { + "epoch": 0.12080484186289647, + "grad_norm": 10.557784555330421, + "learning_rate": 4.93860470808437e-05, + "loss": 2.6732, + "mean_token_accuracy": 0.43103447556495667, + "step": 119940 + }, + { + "epoch": 0.12080987791600065, + "grad_norm": 9.70815899929393, + "learning_rate": 4.9385960140182986e-05, + "loss": 2.3673, + "mean_token_accuracy": 0.4413793087005615, + "step": 119945 + }, + { + "epoch": 0.12081491396910482, + "grad_norm": 9.443494375477073, + "learning_rate": 4.9385873193452124e-05, + "loss": 2.6375, + "mean_token_accuracy": 0.41034482419490814, + "step": 119950 + }, + { + "epoch": 0.120819950022209, + "grad_norm": 10.138093443799047, + "learning_rate": 4.9385786240651126e-05, + "loss": 2.3327, + "mean_token_accuracy": 0.4620689690113068, + "step": 119955 + }, + { + "epoch": 0.12082498607531317, + "grad_norm": 13.190709410072337, + "learning_rate": 4.9385699281780026e-05, + "loss": 3.5428, + "mean_token_accuracy": 0.3793103456497192, + "step": 119960 + }, + { + "epoch": 0.12083002212841734, + "grad_norm": 10.022703388811944, + "learning_rate": 4.938561231683885e-05, + "loss": 2.1006, + "mean_token_accuracy": 0.4551724076271057, + "step": 119965 + }, + { + "epoch": 0.12083505818152152, + "grad_norm": 17.57306208010187, + "learning_rate": 4.938552534582762e-05, + "loss": 2.576, + "mean_token_accuracy": 0.41379311084747317, + "step": 119970 + }, + { + "epoch": 0.12084009423462569, + "grad_norm": 9.476523944353834, + "learning_rate": 4.938543836874635e-05, + "loss": 2.0163, + "mean_token_accuracy": 0.4655172318220139, + "step": 119975 + }, + { + "epoch": 0.12084513028772986, + "grad_norm": 11.95481414916916, + "learning_rate": 4.938535138559508e-05, + "loss": 2.4738, + "mean_token_accuracy": 0.4896551728248596, + "step": 119980 + }, + { + "epoch": 0.12085016634083404, + "grad_norm": 8.58745794557383, + "learning_rate": 4.938526439637383e-05, + "loss": 2.0912, + "mean_token_accuracy": 0.45862069725990295, + "step": 119985 + }, + { + "epoch": 0.12085520239393821, + "grad_norm": 19.392899335778417, + "learning_rate": 4.938517740108261e-05, + "loss": 2.6767, + "mean_token_accuracy": 0.4517241418361664, + "step": 119990 + }, + { + "epoch": 0.12086023844704237, + "grad_norm": 10.971620664565677, + "learning_rate": 4.9385090399721457e-05, + "loss": 3.3046, + "mean_token_accuracy": 0.3448275774717331, + "step": 119995 + }, + { + "epoch": 0.12086527450014654, + "grad_norm": 12.210256226935813, + "learning_rate": 4.938500339229039e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.38275861740112305, + "step": 120000 + }, + { + "epoch": 0.12087031055325072, + "grad_norm": 12.116814794560296, + "learning_rate": 4.938491637878943e-05, + "loss": 2.7835, + "mean_token_accuracy": 0.4, + "step": 120005 + }, + { + "epoch": 0.12087534660635489, + "grad_norm": 9.982573184528107, + "learning_rate": 4.9384829359218624e-05, + "loss": 2.0066, + "mean_token_accuracy": 0.47931034564971925, + "step": 120010 + }, + { + "epoch": 0.12088038265945907, + "grad_norm": 9.237819035520596, + "learning_rate": 4.938474233357797e-05, + "loss": 2.2866, + "mean_token_accuracy": 0.46896551847457885, + "step": 120015 + }, + { + "epoch": 0.12088541871256324, + "grad_norm": 11.201955113769582, + "learning_rate": 4.9384655301867504e-05, + "loss": 2.7916, + "mean_token_accuracy": 0.4, + "step": 120020 + }, + { + "epoch": 0.12089045476566741, + "grad_norm": 10.385106638494413, + "learning_rate": 4.938456826408724e-05, + "loss": 2.1148, + "mean_token_accuracy": 0.47931034564971925, + "step": 120025 + }, + { + "epoch": 0.12089549081877159, + "grad_norm": 13.535292564317503, + "learning_rate": 4.9384481220237215e-05, + "loss": 2.4539, + "mean_token_accuracy": 0.3827586233615875, + "step": 120030 + }, + { + "epoch": 0.12090052687187576, + "grad_norm": 17.069054210040363, + "learning_rate": 4.938439417031745e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.4, + "step": 120035 + }, + { + "epoch": 0.12090556292497993, + "grad_norm": 8.657831299887144, + "learning_rate": 4.938430711432796e-05, + "loss": 2.1443, + "mean_token_accuracy": 0.47586206793785096, + "step": 120040 + }, + { + "epoch": 0.12091059897808411, + "grad_norm": 11.789636023570297, + "learning_rate": 4.938422005226878e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.4814277052879333, + "step": 120045 + }, + { + "epoch": 0.12091563503118828, + "grad_norm": 12.160959023374206, + "learning_rate": 4.9384132984139936e-05, + "loss": 2.3242, + "mean_token_accuracy": 0.3808832347393036, + "step": 120050 + }, + { + "epoch": 0.12092067108429246, + "grad_norm": 11.746523812364865, + "learning_rate": 4.938404590994144e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.3999999940395355, + "step": 120055 + }, + { + "epoch": 0.12092570713739663, + "grad_norm": 11.00083660213163, + "learning_rate": 4.938395882967332e-05, + "loss": 2.1925, + "mean_token_accuracy": 0.4413793087005615, + "step": 120060 + }, + { + "epoch": 0.12093074319050079, + "grad_norm": 10.349146856917724, + "learning_rate": 4.9383871743335597e-05, + "loss": 2.0897, + "mean_token_accuracy": 0.4344827592372894, + "step": 120065 + }, + { + "epoch": 0.12093577924360496, + "grad_norm": 10.460742382898891, + "learning_rate": 4.938378465092831e-05, + "loss": 2.8577, + "mean_token_accuracy": 0.4275862157344818, + "step": 120070 + }, + { + "epoch": 0.12094081529670914, + "grad_norm": 12.77132394219928, + "learning_rate": 4.938369755245147e-05, + "loss": 2.1322, + "mean_token_accuracy": 0.4862068951129913, + "step": 120075 + }, + { + "epoch": 0.12094585134981331, + "grad_norm": 13.448642945068208, + "learning_rate": 4.938361044790511e-05, + "loss": 2.7866, + "mean_token_accuracy": 0.4103448331356049, + "step": 120080 + }, + { + "epoch": 0.12095088740291748, + "grad_norm": 9.766243083882312, + "learning_rate": 4.938352333728924e-05, + "loss": 2.6188, + "mean_token_accuracy": 0.43103448748588563, + "step": 120085 + }, + { + "epoch": 0.12095592345602166, + "grad_norm": 9.926836684641998, + "learning_rate": 4.9383436220603894e-05, + "loss": 2.3454, + "mean_token_accuracy": 0.4379310250282288, + "step": 120090 + }, + { + "epoch": 0.12096095950912583, + "grad_norm": 9.773049601919867, + "learning_rate": 4.93833490978491e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.4137930989265442, + "step": 120095 + }, + { + "epoch": 0.12096599556223, + "grad_norm": 9.190953412641445, + "learning_rate": 4.938326196902488e-05, + "loss": 2.1136, + "mean_token_accuracy": 0.482758617401123, + "step": 120100 + }, + { + "epoch": 0.12097103161533418, + "grad_norm": 11.33808946977182, + "learning_rate": 4.9383174834131254e-05, + "loss": 2.4902, + "mean_token_accuracy": 0.4366606116294861, + "step": 120105 + }, + { + "epoch": 0.12097606766843835, + "grad_norm": 10.95698627837282, + "learning_rate": 4.938308769316824e-05, + "loss": 2.1453, + "mean_token_accuracy": 0.4517241358757019, + "step": 120110 + }, + { + "epoch": 0.12098110372154253, + "grad_norm": 11.447831193440226, + "learning_rate": 4.9383000546135875e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.4379310369491577, + "step": 120115 + }, + { + "epoch": 0.1209861397746467, + "grad_norm": 13.032614444825704, + "learning_rate": 4.938291339303418e-05, + "loss": 2.5661, + "mean_token_accuracy": 0.42413792610168455, + "step": 120120 + }, + { + "epoch": 0.12099117582775087, + "grad_norm": 10.607033673857114, + "learning_rate": 4.9382826233863176e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.47416818141937256, + "step": 120125 + }, + { + "epoch": 0.12099621188085505, + "grad_norm": 10.928678481999963, + "learning_rate": 4.938273906862289e-05, + "loss": 2.188, + "mean_token_accuracy": 0.43793103098869324, + "step": 120130 + }, + { + "epoch": 0.12100124793395921, + "grad_norm": 8.716496323919689, + "learning_rate": 4.938265189731335e-05, + "loss": 2.1433, + "mean_token_accuracy": 0.4464004814624786, + "step": 120135 + }, + { + "epoch": 0.12100628398706338, + "grad_norm": 9.82406300903284, + "learning_rate": 4.938256471993457e-05, + "loss": 1.7709, + "mean_token_accuracy": 0.5068965554237366, + "step": 120140 + }, + { + "epoch": 0.12101132004016756, + "grad_norm": 12.211974652897986, + "learning_rate": 4.938247753648658e-05, + "loss": 2.6315, + "mean_token_accuracy": 0.3896551728248596, + "step": 120145 + }, + { + "epoch": 0.12101635609327173, + "grad_norm": 10.989976602793591, + "learning_rate": 4.938239034696939e-05, + "loss": 2.4892, + "mean_token_accuracy": 0.3947973370552063, + "step": 120150 + }, + { + "epoch": 0.1210213921463759, + "grad_norm": 7.8069879239806905, + "learning_rate": 4.938230315138305e-05, + "loss": 3.1158, + "mean_token_accuracy": 0.3655172407627106, + "step": 120155 + }, + { + "epoch": 0.12102642819948008, + "grad_norm": 10.614186702616331, + "learning_rate": 4.938221594972757e-05, + "loss": 2.252, + "mean_token_accuracy": 0.4413793087005615, + "step": 120160 + }, + { + "epoch": 0.12103146425258425, + "grad_norm": 12.378579465273567, + "learning_rate": 4.938212874200298e-05, + "loss": 2.154, + "mean_token_accuracy": 0.4562807857990265, + "step": 120165 + }, + { + "epoch": 0.12103650030568842, + "grad_norm": 8.096885142546059, + "learning_rate": 4.938204152820929e-05, + "loss": 2.1504, + "mean_token_accuracy": 0.4379310369491577, + "step": 120170 + }, + { + "epoch": 0.1210415363587926, + "grad_norm": 6.968283989863469, + "learning_rate": 4.938195430834654e-05, + "loss": 2.1627, + "mean_token_accuracy": 0.5124016880989075, + "step": 120175 + }, + { + "epoch": 0.12104657241189677, + "grad_norm": 9.825609365282302, + "learning_rate": 4.938186708241475e-05, + "loss": 2.096, + "mean_token_accuracy": 0.4620689630508423, + "step": 120180 + }, + { + "epoch": 0.12105160846500095, + "grad_norm": 9.305946906054047, + "learning_rate": 4.938177985041394e-05, + "loss": 2.0768, + "mean_token_accuracy": 0.4551724135875702, + "step": 120185 + }, + { + "epoch": 0.12105664451810512, + "grad_norm": 11.453103384495998, + "learning_rate": 4.938169261234414e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.4103448212146759, + "step": 120190 + }, + { + "epoch": 0.12106168057120929, + "grad_norm": 10.977990146632083, + "learning_rate": 4.9381605368205366e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.48275862336158754, + "step": 120195 + }, + { + "epoch": 0.12106671662431347, + "grad_norm": 11.151734148888854, + "learning_rate": 4.938151811799765e-05, + "loss": 2.5549, + "mean_token_accuracy": 0.4263157844543457, + "step": 120200 + }, + { + "epoch": 0.12107175267741763, + "grad_norm": 13.424320880941268, + "learning_rate": 4.938143086172101e-05, + "loss": 2.8915, + "mean_token_accuracy": 0.4034482777118683, + "step": 120205 + }, + { + "epoch": 0.1210767887305218, + "grad_norm": 11.285235111386482, + "learning_rate": 4.9381343599375474e-05, + "loss": 2.7379, + "mean_token_accuracy": 0.42413792610168455, + "step": 120210 + }, + { + "epoch": 0.12108182478362597, + "grad_norm": 10.718240452057701, + "learning_rate": 4.9381256330961076e-05, + "loss": 2.492, + "mean_token_accuracy": 0.39655172228813174, + "step": 120215 + }, + { + "epoch": 0.12108686083673015, + "grad_norm": 13.86921907798621, + "learning_rate": 4.938116905647782e-05, + "loss": 2.3665, + "mean_token_accuracy": 0.4137930989265442, + "step": 120220 + }, + { + "epoch": 0.12109189688983432, + "grad_norm": 11.210185715616591, + "learning_rate": 4.938108177592574e-05, + "loss": 2.2386, + "mean_token_accuracy": 0.4571082890033722, + "step": 120225 + }, + { + "epoch": 0.1210969329429385, + "grad_norm": 9.603845382617719, + "learning_rate": 4.938099448930486e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.44137930274009707, + "step": 120230 + }, + { + "epoch": 0.12110196899604267, + "grad_norm": 10.672811966930382, + "learning_rate": 4.9380907196615205e-05, + "loss": 2.659, + "mean_token_accuracy": 0.4206896543502808, + "step": 120235 + }, + { + "epoch": 0.12110700504914684, + "grad_norm": 9.996195800708499, + "learning_rate": 4.9380819897856806e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.4068965494632721, + "step": 120240 + }, + { + "epoch": 0.12111204110225102, + "grad_norm": 10.361795962037187, + "learning_rate": 4.9380732593029675e-05, + "loss": 2.6602, + "mean_token_accuracy": 0.4, + "step": 120245 + }, + { + "epoch": 0.12111707715535519, + "grad_norm": 12.399482218678184, + "learning_rate": 4.9380645282133836e-05, + "loss": 2.8659, + "mean_token_accuracy": 0.3620689570903778, + "step": 120250 + }, + { + "epoch": 0.12112211320845936, + "grad_norm": 10.285962165738681, + "learning_rate": 4.9380557965169325e-05, + "loss": 2.495, + "mean_token_accuracy": 0.42068966031074523, + "step": 120255 + }, + { + "epoch": 0.12112714926156354, + "grad_norm": 9.784592649551751, + "learning_rate": 4.938047064213616e-05, + "loss": 2.4922, + "mean_token_accuracy": 0.4, + "step": 120260 + }, + { + "epoch": 0.12113218531466771, + "grad_norm": 12.99517676049138, + "learning_rate": 4.938038331303436e-05, + "loss": 2.4436, + "mean_token_accuracy": 0.45517240166664125, + "step": 120265 + }, + { + "epoch": 0.12113722136777189, + "grad_norm": 9.92430422859366, + "learning_rate": 4.9380295977863954e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.48802178502082827, + "step": 120270 + }, + { + "epoch": 0.12114225742087605, + "grad_norm": 9.64559470193739, + "learning_rate": 4.9380208636624976e-05, + "loss": 2.5619, + "mean_token_accuracy": 0.4, + "step": 120275 + }, + { + "epoch": 0.12114729347398022, + "grad_norm": 9.997121108832845, + "learning_rate": 4.938012128931743e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.45221675634384156, + "step": 120280 + }, + { + "epoch": 0.1211523295270844, + "grad_norm": 9.814901312998252, + "learning_rate": 4.9380033935941356e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.4172413766384125, + "step": 120285 + }, + { + "epoch": 0.12115736558018857, + "grad_norm": 10.38207030718613, + "learning_rate": 4.9379946576496774e-05, + "loss": 2.351, + "mean_token_accuracy": 0.41724138259887694, + "step": 120290 + }, + { + "epoch": 0.12116240163329274, + "grad_norm": 15.741537450101857, + "learning_rate": 4.93798592109837e-05, + "loss": 2.8712, + "mean_token_accuracy": 0.3793103516101837, + "step": 120295 + }, + { + "epoch": 0.12116743768639691, + "grad_norm": 10.194906733545208, + "learning_rate": 4.9379771839402175e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.47931034564971925, + "step": 120300 + }, + { + "epoch": 0.12117247373950109, + "grad_norm": 11.166133493141777, + "learning_rate": 4.9379684461752205e-05, + "loss": 2.4095, + "mean_token_accuracy": 0.39310344457626345, + "step": 120305 + }, + { + "epoch": 0.12117750979260526, + "grad_norm": 11.285412342761997, + "learning_rate": 4.937959707803382e-05, + "loss": 2.3926, + "mean_token_accuracy": 0.4103448212146759, + "step": 120310 + }, + { + "epoch": 0.12118254584570944, + "grad_norm": 9.47469002559648, + "learning_rate": 4.937950968824706e-05, + "loss": 2.3099, + "mean_token_accuracy": 0.42758620381355283, + "step": 120315 + }, + { + "epoch": 0.12118758189881361, + "grad_norm": 11.423265110601163, + "learning_rate": 4.937942229239192e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.441379314661026, + "step": 120320 + }, + { + "epoch": 0.12119261795191778, + "grad_norm": 11.600232200419542, + "learning_rate": 4.937933489046846e-05, + "loss": 2.6292, + "mean_token_accuracy": 0.3931034505367279, + "step": 120325 + }, + { + "epoch": 0.12119765400502196, + "grad_norm": 9.398905621716544, + "learning_rate": 4.9379247482476674e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.38620689511299133, + "step": 120330 + }, + { + "epoch": 0.12120269005812613, + "grad_norm": 9.29751415571977, + "learning_rate": 4.93791600684166e-05, + "loss": 2.1308, + "mean_token_accuracy": 0.4620689630508423, + "step": 120335 + }, + { + "epoch": 0.1212077261112303, + "grad_norm": 10.86078431887358, + "learning_rate": 4.937907264828825e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.4137930989265442, + "step": 120340 + }, + { + "epoch": 0.12121276216433446, + "grad_norm": 10.236869872636051, + "learning_rate": 4.9378985222091675e-05, + "loss": 2.4435, + "mean_token_accuracy": 0.43103448748588563, + "step": 120345 + }, + { + "epoch": 0.12121779821743864, + "grad_norm": 9.665608131417388, + "learning_rate": 4.937889778982687e-05, + "loss": 2.4599, + "mean_token_accuracy": 0.4310344785451889, + "step": 120350 + }, + { + "epoch": 0.12122283427054281, + "grad_norm": 14.653260246763328, + "learning_rate": 4.937881035149387e-05, + "loss": 2.489, + "mean_token_accuracy": 0.46551724076271056, + "step": 120355 + }, + { + "epoch": 0.12122787032364699, + "grad_norm": 8.42060325037053, + "learning_rate": 4.937872290709271e-05, + "loss": 2.8, + "mean_token_accuracy": 0.4551724076271057, + "step": 120360 + }, + { + "epoch": 0.12123290637675116, + "grad_norm": 10.600841404884385, + "learning_rate": 4.93786354566234e-05, + "loss": 2.4621, + "mean_token_accuracy": 0.4103448331356049, + "step": 120365 + }, + { + "epoch": 0.12123794242985533, + "grad_norm": 12.088405929896956, + "learning_rate": 4.937854800008596e-05, + "loss": 2.3426, + "mean_token_accuracy": 0.38620689511299133, + "step": 120370 + }, + { + "epoch": 0.12124297848295951, + "grad_norm": 11.0242749881951, + "learning_rate": 4.937846053748044e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.46896551847457885, + "step": 120375 + }, + { + "epoch": 0.12124801453606368, + "grad_norm": 9.603891355691717, + "learning_rate": 4.937837306880683e-05, + "loss": 2.9046, + "mean_token_accuracy": 0.4034482717514038, + "step": 120380 + }, + { + "epoch": 0.12125305058916785, + "grad_norm": 11.782482720778125, + "learning_rate": 4.937828559406518e-05, + "loss": 2.7371, + "mean_token_accuracy": 0.4068965554237366, + "step": 120385 + }, + { + "epoch": 0.12125808664227203, + "grad_norm": 9.161066812798278, + "learning_rate": 4.9378198113255504e-05, + "loss": 2.5352, + "mean_token_accuracy": 0.44827585220336913, + "step": 120390 + }, + { + "epoch": 0.1212631226953762, + "grad_norm": 11.593063145764928, + "learning_rate": 4.937811062637783e-05, + "loss": 2.522, + "mean_token_accuracy": 0.4103448212146759, + "step": 120395 + }, + { + "epoch": 0.12126815874848038, + "grad_norm": 12.426460319688541, + "learning_rate": 4.937802313343217e-05, + "loss": 2.65, + "mean_token_accuracy": 0.38620689511299133, + "step": 120400 + }, + { + "epoch": 0.12127319480158455, + "grad_norm": 9.873844750451001, + "learning_rate": 4.937793563441858e-05, + "loss": 2.4676, + "mean_token_accuracy": 0.41724138259887694, + "step": 120405 + }, + { + "epoch": 0.12127823085468872, + "grad_norm": 11.272156999710116, + "learning_rate": 4.937784812933705e-05, + "loss": 2.3113, + "mean_token_accuracy": 0.48965516686439514, + "step": 120410 + }, + { + "epoch": 0.12128326690779288, + "grad_norm": 11.720457982146641, + "learning_rate": 4.9377760618187614e-05, + "loss": 2.2135, + "mean_token_accuracy": 0.4655172348022461, + "step": 120415 + }, + { + "epoch": 0.12128830296089706, + "grad_norm": 10.601317848607376, + "learning_rate": 4.9377673100970304e-05, + "loss": 2.4268, + "mean_token_accuracy": 0.458620685338974, + "step": 120420 + }, + { + "epoch": 0.12129333901400123, + "grad_norm": 15.489294899875768, + "learning_rate": 4.9377585577685136e-05, + "loss": 2.7776, + "mean_token_accuracy": 0.4344827592372894, + "step": 120425 + }, + { + "epoch": 0.1212983750671054, + "grad_norm": 10.375758239062606, + "learning_rate": 4.937749804833214e-05, + "loss": 2.5071, + "mean_token_accuracy": 0.4068965494632721, + "step": 120430 + }, + { + "epoch": 0.12130341112020958, + "grad_norm": 10.828522449640985, + "learning_rate": 4.937741051291135e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4103448331356049, + "step": 120435 + }, + { + "epoch": 0.12130844717331375, + "grad_norm": 9.255624322829284, + "learning_rate": 4.9377322971422764e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.4344827592372894, + "step": 120440 + }, + { + "epoch": 0.12131348322641793, + "grad_norm": 11.375533692867764, + "learning_rate": 4.937723542386643e-05, + "loss": 2.543, + "mean_token_accuracy": 0.37931033968925476, + "step": 120445 + }, + { + "epoch": 0.1213185192795221, + "grad_norm": 9.163183192069422, + "learning_rate": 4.937714787024235e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.38620689511299133, + "step": 120450 + }, + { + "epoch": 0.12132355533262627, + "grad_norm": 12.600403676226422, + "learning_rate": 4.937706031055057e-05, + "loss": 2.7572, + "mean_token_accuracy": 0.3703569293022156, + "step": 120455 + }, + { + "epoch": 0.12132859138573045, + "grad_norm": 14.277540634614661, + "learning_rate": 4.9376972744791105e-05, + "loss": 2.491, + "mean_token_accuracy": 0.4034482717514038, + "step": 120460 + }, + { + "epoch": 0.12133362743883462, + "grad_norm": 11.17707346836547, + "learning_rate": 4.937688517296399e-05, + "loss": 2.571, + "mean_token_accuracy": 0.4034482717514038, + "step": 120465 + }, + { + "epoch": 0.1213386634919388, + "grad_norm": 10.11725155944679, + "learning_rate": 4.9376797595069224e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.417241370677948, + "step": 120470 + }, + { + "epoch": 0.12134369954504297, + "grad_norm": 10.42665940094799, + "learning_rate": 4.937671001110685e-05, + "loss": 2.559, + "mean_token_accuracy": 0.39655172228813174, + "step": 120475 + }, + { + "epoch": 0.12134873559814714, + "grad_norm": 10.256120201551772, + "learning_rate": 4.9376622421076894e-05, + "loss": 2.4898, + "mean_token_accuracy": 0.4344827651977539, + "step": 120480 + }, + { + "epoch": 0.1213537716512513, + "grad_norm": 16.923836450998838, + "learning_rate": 4.937653482497938e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.4517241299152374, + "step": 120485 + }, + { + "epoch": 0.12135880770435548, + "grad_norm": 12.059392930689311, + "learning_rate": 4.937644722281431e-05, + "loss": 2.5091, + "mean_token_accuracy": 0.41893526911735535, + "step": 120490 + }, + { + "epoch": 0.12136384375745965, + "grad_norm": 9.94589110734491, + "learning_rate": 4.937635961458174e-05, + "loss": 2.4979, + "mean_token_accuracy": 0.4655172348022461, + "step": 120495 + }, + { + "epoch": 0.12136887981056382, + "grad_norm": 10.345628469984126, + "learning_rate": 4.9376272000281676e-05, + "loss": 2.5816, + "mean_token_accuracy": 0.41203871965408323, + "step": 120500 + }, + { + "epoch": 0.121373915863668, + "grad_norm": 11.58355614348508, + "learning_rate": 4.9376184379914144e-05, + "loss": 2.5957, + "mean_token_accuracy": 0.41379310488700866, + "step": 120505 + }, + { + "epoch": 0.12137895191677217, + "grad_norm": 10.063103209053208, + "learning_rate": 4.9376096753479173e-05, + "loss": 2.1287, + "mean_token_accuracy": 0.44827585220336913, + "step": 120510 + }, + { + "epoch": 0.12138398796987634, + "grad_norm": 10.785798079029975, + "learning_rate": 4.9376009120976784e-05, + "loss": 2.3553, + "mean_token_accuracy": 0.39655172228813174, + "step": 120515 + }, + { + "epoch": 0.12138902402298052, + "grad_norm": 9.27298247722596, + "learning_rate": 4.937592148240701e-05, + "loss": 2.5951, + "mean_token_accuracy": 0.3931034505367279, + "step": 120520 + }, + { + "epoch": 0.12139406007608469, + "grad_norm": 10.36050684981949, + "learning_rate": 4.9375833837769855e-05, + "loss": 2.213, + "mean_token_accuracy": 0.4448275864124298, + "step": 120525 + }, + { + "epoch": 0.12139909612918887, + "grad_norm": 11.145933623528881, + "learning_rate": 4.9375746187065364e-05, + "loss": 2.2229, + "mean_token_accuracy": 0.44137930274009707, + "step": 120530 + }, + { + "epoch": 0.12140413218229304, + "grad_norm": 11.326628551346337, + "learning_rate": 4.937565853029355e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.3793103456497192, + "step": 120535 + }, + { + "epoch": 0.12140916823539721, + "grad_norm": 9.883880007910863, + "learning_rate": 4.937557086745444e-05, + "loss": 2.3097, + "mean_token_accuracy": 0.47241378426551817, + "step": 120540 + }, + { + "epoch": 0.12141420428850139, + "grad_norm": 8.91231419260317, + "learning_rate": 4.9375483198548055e-05, + "loss": 2.299, + "mean_token_accuracy": 0.45892316699028013, + "step": 120545 + }, + { + "epoch": 0.12141924034160556, + "grad_norm": 11.438918901227437, + "learning_rate": 4.937539552357443e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.4137930989265442, + "step": 120550 + }, + { + "epoch": 0.12142427639470972, + "grad_norm": 10.001018463477065, + "learning_rate": 4.937530784253358e-05, + "loss": 2.399, + "mean_token_accuracy": 0.4310344815254211, + "step": 120555 + }, + { + "epoch": 0.1214293124478139, + "grad_norm": 11.553979131488429, + "learning_rate": 4.9375220155425534e-05, + "loss": 2.5057, + "mean_token_accuracy": 0.4103448212146759, + "step": 120560 + }, + { + "epoch": 0.12143434850091807, + "grad_norm": 9.547920778014074, + "learning_rate": 4.937513246225031e-05, + "loss": 2.1815, + "mean_token_accuracy": 0.44137930274009707, + "step": 120565 + }, + { + "epoch": 0.12143938455402224, + "grad_norm": 11.02842244577163, + "learning_rate": 4.9375044763007944e-05, + "loss": 2.3135, + "mean_token_accuracy": 0.39848760068416594, + "step": 120570 + }, + { + "epoch": 0.12144442060712642, + "grad_norm": 10.233708769251557, + "learning_rate": 4.9374957057698444e-05, + "loss": 2.5839, + "mean_token_accuracy": 0.41379310488700866, + "step": 120575 + }, + { + "epoch": 0.12144945666023059, + "grad_norm": 9.63693952197964, + "learning_rate": 4.937486934632185e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.43103448748588563, + "step": 120580 + }, + { + "epoch": 0.12145449271333476, + "grad_norm": 11.712512949514247, + "learning_rate": 4.937478162887818e-05, + "loss": 2.3935, + "mean_token_accuracy": 0.37241379022598264, + "step": 120585 + }, + { + "epoch": 0.12145952876643894, + "grad_norm": 10.196705029413163, + "learning_rate": 4.937469390536744e-05, + "loss": 3.0547, + "mean_token_accuracy": 0.3862069010734558, + "step": 120590 + }, + { + "epoch": 0.12146456481954311, + "grad_norm": 11.673843445773649, + "learning_rate": 4.937460617578969e-05, + "loss": 2.4666, + "mean_token_accuracy": 0.47339901328086853, + "step": 120595 + }, + { + "epoch": 0.12146960087264728, + "grad_norm": 11.051134910077542, + "learning_rate": 4.937451844014493e-05, + "loss": 2.3782, + "mean_token_accuracy": 0.4137930989265442, + "step": 120600 + }, + { + "epoch": 0.12147463692575146, + "grad_norm": 9.662490137264092, + "learning_rate": 4.9374430698433195e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.4310344815254211, + "step": 120605 + }, + { + "epoch": 0.12147967297885563, + "grad_norm": 10.594896887822555, + "learning_rate": 4.93743429506545e-05, + "loss": 2.1138, + "mean_token_accuracy": 0.4586206912994385, + "step": 120610 + }, + { + "epoch": 0.1214847090319598, + "grad_norm": 10.164091822738284, + "learning_rate": 4.937425519680888e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4137930989265442, + "step": 120615 + }, + { + "epoch": 0.12148974508506398, + "grad_norm": 8.900498666653574, + "learning_rate": 4.937416743689636e-05, + "loss": 2.0881, + "mean_token_accuracy": 0.4344827651977539, + "step": 120620 + }, + { + "epoch": 0.12149478113816814, + "grad_norm": 9.72999431506868, + "learning_rate": 4.937407967091694e-05, + "loss": 2.4121, + "mean_token_accuracy": 0.44482758045196535, + "step": 120625 + }, + { + "epoch": 0.12149981719127231, + "grad_norm": 10.054439081019352, + "learning_rate": 4.9373991898870666e-05, + "loss": 2.8045, + "mean_token_accuracy": 0.3655172407627106, + "step": 120630 + }, + { + "epoch": 0.12150485324437649, + "grad_norm": 10.00976193556926, + "learning_rate": 4.937390412075757e-05, + "loss": 2.0481, + "mean_token_accuracy": 0.5122807025909424, + "step": 120635 + }, + { + "epoch": 0.12150988929748066, + "grad_norm": 11.67198068390328, + "learning_rate": 4.937381633657766e-05, + "loss": 2.1867, + "mean_token_accuracy": 0.4862068951129913, + "step": 120640 + }, + { + "epoch": 0.12151492535058483, + "grad_norm": 9.697137386991486, + "learning_rate": 4.937372854633097e-05, + "loss": 2.7594, + "mean_token_accuracy": 0.3551724076271057, + "step": 120645 + }, + { + "epoch": 0.12151996140368901, + "grad_norm": 10.94089020360911, + "learning_rate": 4.937364075001751e-05, + "loss": 2.2663, + "mean_token_accuracy": 0.4620689630508423, + "step": 120650 + }, + { + "epoch": 0.12152499745679318, + "grad_norm": 12.503431871236701, + "learning_rate": 4.937355294763732e-05, + "loss": 2.2608, + "mean_token_accuracy": 0.4517241358757019, + "step": 120655 + }, + { + "epoch": 0.12153003350989736, + "grad_norm": 11.678741885218328, + "learning_rate": 4.937346513919042e-05, + "loss": 2.9036, + "mean_token_accuracy": 0.4068965494632721, + "step": 120660 + }, + { + "epoch": 0.12153506956300153, + "grad_norm": 10.005761581058362, + "learning_rate": 4.9373377324676823e-05, + "loss": 2.2571, + "mean_token_accuracy": 0.43793103098869324, + "step": 120665 + }, + { + "epoch": 0.1215401056161057, + "grad_norm": 7.548786495964204, + "learning_rate": 4.937328950409657e-05, + "loss": 2.2689, + "mean_token_accuracy": 0.49546279907226565, + "step": 120670 + }, + { + "epoch": 0.12154514166920988, + "grad_norm": 9.387003846558114, + "learning_rate": 4.937320167744968e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.40689654648303986, + "step": 120675 + }, + { + "epoch": 0.12155017772231405, + "grad_norm": 14.687873187352, + "learning_rate": 4.937311384473617e-05, + "loss": 2.1185, + "mean_token_accuracy": 0.5, + "step": 120680 + }, + { + "epoch": 0.12155521377541822, + "grad_norm": 9.67346853750333, + "learning_rate": 4.937302600595608e-05, + "loss": 2.4354, + "mean_token_accuracy": 0.42413792610168455, + "step": 120685 + }, + { + "epoch": 0.1215602498285224, + "grad_norm": 8.824522417738727, + "learning_rate": 4.9372938161109414e-05, + "loss": 1.9263, + "mean_token_accuracy": 0.5103448390960693, + "step": 120690 + }, + { + "epoch": 0.12156528588162656, + "grad_norm": 9.415608002719981, + "learning_rate": 4.937285031019622e-05, + "loss": 2.2339, + "mean_token_accuracy": 0.4499092519283295, + "step": 120695 + }, + { + "epoch": 0.12157032193473073, + "grad_norm": 13.767608234370705, + "learning_rate": 4.93727624532165e-05, + "loss": 2.7323, + "mean_token_accuracy": 0.3793103456497192, + "step": 120700 + }, + { + "epoch": 0.1215753579878349, + "grad_norm": 10.245752921240404, + "learning_rate": 4.9372674590170285e-05, + "loss": 2.534, + "mean_token_accuracy": 0.3793103456497192, + "step": 120705 + }, + { + "epoch": 0.12158039404093908, + "grad_norm": 10.983964394106494, + "learning_rate": 4.937258672105761e-05, + "loss": 2.1828, + "mean_token_accuracy": 0.4551724076271057, + "step": 120710 + }, + { + "epoch": 0.12158543009404325, + "grad_norm": 11.822016746933091, + "learning_rate": 4.937249884587849e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.39310343861579894, + "step": 120715 + }, + { + "epoch": 0.12159046614714743, + "grad_norm": 10.601322210612762, + "learning_rate": 4.937241096463295e-05, + "loss": 2.9515, + "mean_token_accuracy": 0.36551723778247835, + "step": 120720 + }, + { + "epoch": 0.1215955022002516, + "grad_norm": 12.6458548777165, + "learning_rate": 4.937232307732101e-05, + "loss": 2.7574, + "mean_token_accuracy": 0.4, + "step": 120725 + }, + { + "epoch": 0.12160053825335577, + "grad_norm": 11.763389772036374, + "learning_rate": 4.937223518394271e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.4413793087005615, + "step": 120730 + }, + { + "epoch": 0.12160557430645995, + "grad_norm": 9.92799858890616, + "learning_rate": 4.9372147284498055e-05, + "loss": 2.4223, + "mean_token_accuracy": 0.37931033968925476, + "step": 120735 + }, + { + "epoch": 0.12161061035956412, + "grad_norm": 12.086966002312767, + "learning_rate": 4.9372059378987087e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.4275862157344818, + "step": 120740 + }, + { + "epoch": 0.1216156464126683, + "grad_norm": 8.773783071522493, + "learning_rate": 4.937197146740981e-05, + "loss": 2.1319, + "mean_token_accuracy": 0.441379314661026, + "step": 120745 + }, + { + "epoch": 0.12162068246577247, + "grad_norm": 10.49362533066179, + "learning_rate": 4.937188354976626e-05, + "loss": 2.3509, + "mean_token_accuracy": 0.4137930989265442, + "step": 120750 + }, + { + "epoch": 0.12162571851887664, + "grad_norm": 9.880684153305749, + "learning_rate": 4.937179562605648e-05, + "loss": 2.3882, + "mean_token_accuracy": 0.4068965524435043, + "step": 120755 + }, + { + "epoch": 0.12163075457198082, + "grad_norm": 8.57741112309702, + "learning_rate": 4.9371707696280455e-05, + "loss": 1.9318, + "mean_token_accuracy": 0.49866907596588134, + "step": 120760 + }, + { + "epoch": 0.12163579062508498, + "grad_norm": 10.615805398260516, + "learning_rate": 4.937161976043824e-05, + "loss": 2.8547, + "mean_token_accuracy": 0.358620685338974, + "step": 120765 + }, + { + "epoch": 0.12164082667818915, + "grad_norm": 9.87325862359283, + "learning_rate": 4.9371531818529846e-05, + "loss": 2.6827, + "mean_token_accuracy": 0.3896551728248596, + "step": 120770 + }, + { + "epoch": 0.12164586273129332, + "grad_norm": 9.93872256032969, + "learning_rate": 4.937144387055531e-05, + "loss": 2.2824, + "mean_token_accuracy": 0.4655172348022461, + "step": 120775 + }, + { + "epoch": 0.1216508987843975, + "grad_norm": 10.41750432024745, + "learning_rate": 4.9371355916514636e-05, + "loss": 2.307, + "mean_token_accuracy": 0.4517241358757019, + "step": 120780 + }, + { + "epoch": 0.12165593483750167, + "grad_norm": 12.950178847478162, + "learning_rate": 4.937126795640786e-05, + "loss": 2.5854, + "mean_token_accuracy": 0.3793103456497192, + "step": 120785 + }, + { + "epoch": 0.12166097089060585, + "grad_norm": 9.266068436255633, + "learning_rate": 4.9371179990235016e-05, + "loss": 1.9095, + "mean_token_accuracy": 0.5310344874858857, + "step": 120790 + }, + { + "epoch": 0.12166600694371002, + "grad_norm": 11.248566923948843, + "learning_rate": 4.937109201799611e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.4068965494632721, + "step": 120795 + }, + { + "epoch": 0.1216710429968142, + "grad_norm": 12.076005330854338, + "learning_rate": 4.937100403969118e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.4137930929660797, + "step": 120800 + }, + { + "epoch": 0.12167607904991837, + "grad_norm": 12.589601539914614, + "learning_rate": 4.9370916055320246e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.42068966031074523, + "step": 120805 + }, + { + "epoch": 0.12168111510302254, + "grad_norm": 12.587271314276455, + "learning_rate": 4.937082806488333e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.4103448212146759, + "step": 120810 + }, + { + "epoch": 0.12168615115612672, + "grad_norm": 14.264993183318627, + "learning_rate": 4.937074006838046e-05, + "loss": 2.5165, + "mean_token_accuracy": 0.44277071952819824, + "step": 120815 + }, + { + "epoch": 0.12169118720923089, + "grad_norm": 10.280872429740363, + "learning_rate": 4.9370652065811655e-05, + "loss": 2.3239, + "mean_token_accuracy": 0.43448275327682495, + "step": 120820 + }, + { + "epoch": 0.12169622326233506, + "grad_norm": 10.382145245856712, + "learning_rate": 4.9370564057176946e-05, + "loss": 2.4227, + "mean_token_accuracy": 0.4034482777118683, + "step": 120825 + }, + { + "epoch": 0.12170125931543924, + "grad_norm": 11.046465818948418, + "learning_rate": 4.937047604247635e-05, + "loss": 2.5073, + "mean_token_accuracy": 0.41034482717514037, + "step": 120830 + }, + { + "epoch": 0.1217062953685434, + "grad_norm": 11.645857120861283, + "learning_rate": 4.9370388021709906e-05, + "loss": 2.6629, + "mean_token_accuracy": 0.38275861740112305, + "step": 120835 + }, + { + "epoch": 0.12171133142164757, + "grad_norm": 14.515156698114708, + "learning_rate": 4.9370299994877616e-05, + "loss": 2.9101, + "mean_token_accuracy": 0.37241379022598264, + "step": 120840 + }, + { + "epoch": 0.12171636747475174, + "grad_norm": 9.620447827615553, + "learning_rate": 4.937021196197952e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.4172413766384125, + "step": 120845 + }, + { + "epoch": 0.12172140352785592, + "grad_norm": 11.891240101430462, + "learning_rate": 4.9370123923015645e-05, + "loss": 2.389, + "mean_token_accuracy": 0.4103448212146759, + "step": 120850 + }, + { + "epoch": 0.12172643958096009, + "grad_norm": 12.24415331286597, + "learning_rate": 4.9370035877986004e-05, + "loss": 2.6654, + "mean_token_accuracy": 0.43793103098869324, + "step": 120855 + }, + { + "epoch": 0.12173147563406427, + "grad_norm": 16.10458497593291, + "learning_rate": 4.936994782689063e-05, + "loss": 2.8543, + "mean_token_accuracy": 0.39310344457626345, + "step": 120860 + }, + { + "epoch": 0.12173651168716844, + "grad_norm": 9.974693272520863, + "learning_rate": 4.936985976972955e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.42068966031074523, + "step": 120865 + }, + { + "epoch": 0.12174154774027261, + "grad_norm": 12.418056434408932, + "learning_rate": 4.936977170650277e-05, + "loss": 2.375, + "mean_token_accuracy": 0.4034482777118683, + "step": 120870 + }, + { + "epoch": 0.12174658379337679, + "grad_norm": 12.80926923393782, + "learning_rate": 4.9369683637210326e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.4517241299152374, + "step": 120875 + }, + { + "epoch": 0.12175161984648096, + "grad_norm": 11.305023198755583, + "learning_rate": 4.9369595561852264e-05, + "loss": 2.505, + "mean_token_accuracy": 0.43448275327682495, + "step": 120880 + }, + { + "epoch": 0.12175665589958513, + "grad_norm": 12.939012358247384, + "learning_rate": 4.9369507480428565e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4931034505367279, + "step": 120885 + }, + { + "epoch": 0.12176169195268931, + "grad_norm": 11.262067885166859, + "learning_rate": 4.936941939293929e-05, + "loss": 2.814, + "mean_token_accuracy": 0.37931033968925476, + "step": 120890 + }, + { + "epoch": 0.12176672800579348, + "grad_norm": 11.422960089008283, + "learning_rate": 4.9369331299384446e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.4310344815254211, + "step": 120895 + }, + { + "epoch": 0.12177176405889764, + "grad_norm": 11.016005044908395, + "learning_rate": 4.9369243199764066e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.4068965494632721, + "step": 120900 + }, + { + "epoch": 0.12177680011200182, + "grad_norm": 11.228697001756135, + "learning_rate": 4.9369155094078166e-05, + "loss": 2.3851, + "mean_token_accuracy": 0.42413793206214906, + "step": 120905 + }, + { + "epoch": 0.12178183616510599, + "grad_norm": 9.543767393004085, + "learning_rate": 4.936906698232677e-05, + "loss": 2.0445, + "mean_token_accuracy": 0.49999998807907103, + "step": 120910 + }, + { + "epoch": 0.12178687221821016, + "grad_norm": 10.888401323398396, + "learning_rate": 4.936897886450991e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.4413793087005615, + "step": 120915 + }, + { + "epoch": 0.12179190827131434, + "grad_norm": 9.677847150006269, + "learning_rate": 4.9368890740627605e-05, + "loss": 2.2958, + "mean_token_accuracy": 0.4398064136505127, + "step": 120920 + }, + { + "epoch": 0.12179694432441851, + "grad_norm": 10.724600558176078, + "learning_rate": 4.936880261067989e-05, + "loss": 2.3988, + "mean_token_accuracy": 0.44827587008476255, + "step": 120925 + }, + { + "epoch": 0.12180198037752268, + "grad_norm": 11.078514878068315, + "learning_rate": 4.936871447466677e-05, + "loss": 2.0681, + "mean_token_accuracy": 0.47447065711021424, + "step": 120930 + }, + { + "epoch": 0.12180701643062686, + "grad_norm": 10.307876550711665, + "learning_rate": 4.936862633258829e-05, + "loss": 2.3099, + "mean_token_accuracy": 0.41034482419490814, + "step": 120935 + }, + { + "epoch": 0.12181205248373103, + "grad_norm": 15.823716962417205, + "learning_rate": 4.936853818444446e-05, + "loss": 2.4597, + "mean_token_accuracy": 0.43103448748588563, + "step": 120940 + }, + { + "epoch": 0.1218170885368352, + "grad_norm": 10.768261005979381, + "learning_rate": 4.9368450030235305e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.4379310369491577, + "step": 120945 + }, + { + "epoch": 0.12182212458993938, + "grad_norm": 12.907811499072487, + "learning_rate": 4.936836186996087e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.4482758641242981, + "step": 120950 + }, + { + "epoch": 0.12182716064304355, + "grad_norm": 56.72053711103278, + "learning_rate": 4.936827370362115e-05, + "loss": 2.285, + "mean_token_accuracy": 0.4413793087005615, + "step": 120955 + }, + { + "epoch": 0.12183219669614773, + "grad_norm": 14.416886640099078, + "learning_rate": 4.9368185531216176e-05, + "loss": 2.6549, + "mean_token_accuracy": 0.46727163791656495, + "step": 120960 + }, + { + "epoch": 0.1218372327492519, + "grad_norm": 9.846233034514107, + "learning_rate": 4.9368097352745994e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.4655172348022461, + "step": 120965 + }, + { + "epoch": 0.12184226880235606, + "grad_norm": 10.118920275099352, + "learning_rate": 4.936800916821061e-05, + "loss": 2.6429, + "mean_token_accuracy": 0.42068966031074523, + "step": 120970 + }, + { + "epoch": 0.12184730485546023, + "grad_norm": 12.882912051116485, + "learning_rate": 4.936792097761005e-05, + "loss": 2.8345, + "mean_token_accuracy": 0.4034482777118683, + "step": 120975 + }, + { + "epoch": 0.12185234090856441, + "grad_norm": 10.87623865508724, + "learning_rate": 4.936783278094433e-05, + "loss": 2.2185, + "mean_token_accuracy": 0.47586206197738645, + "step": 120980 + }, + { + "epoch": 0.12185737696166858, + "grad_norm": 11.412181938900813, + "learning_rate": 4.93677445782135e-05, + "loss": 2.2828, + "mean_token_accuracy": 0.4517241358757019, + "step": 120985 + }, + { + "epoch": 0.12186241301477276, + "grad_norm": 9.268336075017007, + "learning_rate": 4.936765636941757e-05, + "loss": 2.323, + "mean_token_accuracy": 0.41379310488700866, + "step": 120990 + }, + { + "epoch": 0.12186744906787693, + "grad_norm": 8.857694394723612, + "learning_rate": 4.9367568154556554e-05, + "loss": 2.5201, + "mean_token_accuracy": 0.4413793087005615, + "step": 120995 + }, + { + "epoch": 0.1218724851209811, + "grad_norm": 10.560603745111912, + "learning_rate": 4.936747993363049e-05, + "loss": 2.5293, + "mean_token_accuracy": 0.4379310369491577, + "step": 121000 + }, + { + "epoch": 0.12187752117408528, + "grad_norm": 10.309961578868663, + "learning_rate": 4.93673917066394e-05, + "loss": 2.0209, + "mean_token_accuracy": 0.48965516686439514, + "step": 121005 + }, + { + "epoch": 0.12188255722718945, + "grad_norm": 11.298540414880438, + "learning_rate": 4.93673034735833e-05, + "loss": 2.6075, + "mean_token_accuracy": 0.36206896901130675, + "step": 121010 + }, + { + "epoch": 0.12188759328029362, + "grad_norm": 15.477266608422392, + "learning_rate": 4.9367215234462235e-05, + "loss": 2.8664, + "mean_token_accuracy": 0.3793103516101837, + "step": 121015 + }, + { + "epoch": 0.1218926293333978, + "grad_norm": 11.486352501534608, + "learning_rate": 4.9367126989276215e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.39655172228813174, + "step": 121020 + }, + { + "epoch": 0.12189766538650197, + "grad_norm": 8.983045730980086, + "learning_rate": 4.936703873802526e-05, + "loss": 2.614, + "mean_token_accuracy": 0.4034482717514038, + "step": 121025 + }, + { + "epoch": 0.12190270143960615, + "grad_norm": 10.376790430148095, + "learning_rate": 4.9366950480709405e-05, + "loss": 2.5675, + "mean_token_accuracy": 0.38965516686439516, + "step": 121030 + }, + { + "epoch": 0.12190773749271032, + "grad_norm": 11.162068342335486, + "learning_rate": 4.936686221732866e-05, + "loss": 1.9516, + "mean_token_accuracy": 0.5068965494632721, + "step": 121035 + }, + { + "epoch": 0.12191277354581448, + "grad_norm": 10.600636211328618, + "learning_rate": 4.9366773947883064e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.4275861978530884, + "step": 121040 + }, + { + "epoch": 0.12191780959891865, + "grad_norm": 10.652840561018301, + "learning_rate": 4.936668567237264e-05, + "loss": 2.4576, + "mean_token_accuracy": 0.38275861740112305, + "step": 121045 + }, + { + "epoch": 0.12192284565202283, + "grad_norm": 9.656538319058857, + "learning_rate": 4.93665973907974e-05, + "loss": 2.1529, + "mean_token_accuracy": 0.47931034564971925, + "step": 121050 + }, + { + "epoch": 0.121927881705127, + "grad_norm": 12.661305047453597, + "learning_rate": 4.936650910315739e-05, + "loss": 2.1475, + "mean_token_accuracy": 0.4758620738983154, + "step": 121055 + }, + { + "epoch": 0.12193291775823117, + "grad_norm": 11.217138411052392, + "learning_rate": 4.936642080945262e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.3827586203813553, + "step": 121060 + }, + { + "epoch": 0.12193795381133535, + "grad_norm": 9.37308839789053, + "learning_rate": 4.936633250968311e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.4241379380226135, + "step": 121065 + }, + { + "epoch": 0.12194298986443952, + "grad_norm": 8.51889995827723, + "learning_rate": 4.9366244203848896e-05, + "loss": 2.2586, + "mean_token_accuracy": 0.45172414779663084, + "step": 121070 + }, + { + "epoch": 0.1219480259175437, + "grad_norm": 9.516899425169772, + "learning_rate": 4.936615589194999e-05, + "loss": 2.337, + "mean_token_accuracy": 0.4931034445762634, + "step": 121075 + }, + { + "epoch": 0.12195306197064787, + "grad_norm": 10.179137119713541, + "learning_rate": 4.936606757398643e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.5119177162647247, + "step": 121080 + }, + { + "epoch": 0.12195809802375204, + "grad_norm": 11.19400002726952, + "learning_rate": 4.936597924995824e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.4034482777118683, + "step": 121085 + }, + { + "epoch": 0.12196313407685622, + "grad_norm": 10.478852863741832, + "learning_rate": 4.9365890919865435e-05, + "loss": 2.0486, + "mean_token_accuracy": 0.47586206793785096, + "step": 121090 + }, + { + "epoch": 0.12196817012996039, + "grad_norm": 12.88581354153339, + "learning_rate": 4.936580258370804e-05, + "loss": 2.7266, + "mean_token_accuracy": 0.4137930929660797, + "step": 121095 + }, + { + "epoch": 0.12197320618306456, + "grad_norm": 12.614571098767058, + "learning_rate": 4.936571424148609e-05, + "loss": 2.8028, + "mean_token_accuracy": 0.37586206793785093, + "step": 121100 + }, + { + "epoch": 0.12197824223616874, + "grad_norm": 11.257747289112984, + "learning_rate": 4.93656258931996e-05, + "loss": 2.2517, + "mean_token_accuracy": 0.43103448748588563, + "step": 121105 + }, + { + "epoch": 0.1219832782892729, + "grad_norm": 9.710058035522483, + "learning_rate": 4.9365537538848596e-05, + "loss": 2.5217, + "mean_token_accuracy": 0.4034482717514038, + "step": 121110 + }, + { + "epoch": 0.12198831434237707, + "grad_norm": 12.99042040873905, + "learning_rate": 4.93654491784331e-05, + "loss": 2.7372, + "mean_token_accuracy": 0.39310344457626345, + "step": 121115 + }, + { + "epoch": 0.12199335039548125, + "grad_norm": 12.102373886652407, + "learning_rate": 4.936536081195314e-05, + "loss": 2.2319, + "mean_token_accuracy": 0.5034482717514038, + "step": 121120 + }, + { + "epoch": 0.12199838644858542, + "grad_norm": 8.671251500205502, + "learning_rate": 4.9365272439408746e-05, + "loss": 2.7123, + "mean_token_accuracy": 0.3793103486299515, + "step": 121125 + }, + { + "epoch": 0.12200342250168959, + "grad_norm": 10.510099727228706, + "learning_rate": 4.936518406079993e-05, + "loss": 2.5184, + "mean_token_accuracy": 0.4, + "step": 121130 + }, + { + "epoch": 0.12200845855479377, + "grad_norm": 9.75872879010189, + "learning_rate": 4.9365095676126736e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.3931034505367279, + "step": 121135 + }, + { + "epoch": 0.12201349460789794, + "grad_norm": 9.514115811381586, + "learning_rate": 4.9365007285389156e-05, + "loss": 2.163, + "mean_token_accuracy": 0.44482758045196535, + "step": 121140 + }, + { + "epoch": 0.12201853066100211, + "grad_norm": 11.194827149128853, + "learning_rate": 4.9364918888587254e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.41379310488700866, + "step": 121145 + }, + { + "epoch": 0.12202356671410629, + "grad_norm": 9.179973665571547, + "learning_rate": 4.936483048572103e-05, + "loss": 1.9788, + "mean_token_accuracy": 0.4707804024219513, + "step": 121150 + }, + { + "epoch": 0.12202860276721046, + "grad_norm": 10.146628749200255, + "learning_rate": 4.936474207679051e-05, + "loss": 1.9078, + "mean_token_accuracy": 0.4983666121959686, + "step": 121155 + }, + { + "epoch": 0.12203363882031464, + "grad_norm": 9.934749783353913, + "learning_rate": 4.936465366179572e-05, + "loss": 2.5568, + "mean_token_accuracy": 0.4379310369491577, + "step": 121160 + }, + { + "epoch": 0.12203867487341881, + "grad_norm": 10.040597958827709, + "learning_rate": 4.93645652407367e-05, + "loss": 3.0408, + "mean_token_accuracy": 0.33236539363861084, + "step": 121165 + }, + { + "epoch": 0.12204371092652298, + "grad_norm": 12.860581249828853, + "learning_rate": 4.9364476813613444e-05, + "loss": 2.8723, + "mean_token_accuracy": 0.37586206793785093, + "step": 121170 + }, + { + "epoch": 0.12204874697962716, + "grad_norm": 9.970452458169836, + "learning_rate": 4.936438838042601e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.42413793206214906, + "step": 121175 + }, + { + "epoch": 0.12205378303273132, + "grad_norm": 10.359453306483196, + "learning_rate": 4.93642999411744e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.4206896543502808, + "step": 121180 + }, + { + "epoch": 0.12205881908583549, + "grad_norm": 9.92680492800865, + "learning_rate": 4.936421149585864e-05, + "loss": 2.2162, + "mean_token_accuracy": 0.441379314661026, + "step": 121185 + }, + { + "epoch": 0.12206385513893966, + "grad_norm": 9.15110478953717, + "learning_rate": 4.936412304447876e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.4137930989265442, + "step": 121190 + }, + { + "epoch": 0.12206889119204384, + "grad_norm": 10.021739379620538, + "learning_rate": 4.936403458703479e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.42413793206214906, + "step": 121195 + }, + { + "epoch": 0.12207392724514801, + "grad_norm": 17.75240690086342, + "learning_rate": 4.936394612352674e-05, + "loss": 2.6913, + "mean_token_accuracy": 0.4172413766384125, + "step": 121200 + }, + { + "epoch": 0.12207896329825219, + "grad_norm": 8.899132038704407, + "learning_rate": 4.936385765395465e-05, + "loss": 1.9303, + "mean_token_accuracy": 0.4870689630508423, + "step": 121205 + }, + { + "epoch": 0.12208399935135636, + "grad_norm": 11.503835137540108, + "learning_rate": 4.936376917831853e-05, + "loss": 2.5138, + "mean_token_accuracy": 0.4172413766384125, + "step": 121210 + }, + { + "epoch": 0.12208903540446053, + "grad_norm": 12.009769717857852, + "learning_rate": 4.936368069661842e-05, + "loss": 2.3726, + "mean_token_accuracy": 0.4379310369491577, + "step": 121215 + }, + { + "epoch": 0.1220940714575647, + "grad_norm": 14.429799926429205, + "learning_rate": 4.936359220885434e-05, + "loss": 2.7104, + "mean_token_accuracy": 0.35862068831920624, + "step": 121220 + }, + { + "epoch": 0.12209910751066888, + "grad_norm": 9.638666670722689, + "learning_rate": 4.93635037150263e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.4379310369491577, + "step": 121225 + }, + { + "epoch": 0.12210414356377305, + "grad_norm": 12.674910190507333, + "learning_rate": 4.936341521513434e-05, + "loss": 2.6197, + "mean_token_accuracy": 0.41034482717514037, + "step": 121230 + }, + { + "epoch": 0.12210917961687723, + "grad_norm": 14.569840522821229, + "learning_rate": 4.9363326709178484e-05, + "loss": 2.6321, + "mean_token_accuracy": 0.36896551251411436, + "step": 121235 + }, + { + "epoch": 0.1221142156699814, + "grad_norm": 10.48847662719086, + "learning_rate": 4.936323819715874e-05, + "loss": 2.5415, + "mean_token_accuracy": 0.42413792610168455, + "step": 121240 + }, + { + "epoch": 0.12211925172308558, + "grad_norm": 11.810558738847785, + "learning_rate": 4.9363149679075166e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.3896551728248596, + "step": 121245 + }, + { + "epoch": 0.12212428777618974, + "grad_norm": 10.183504769037581, + "learning_rate": 4.9363061154927756e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.37931033968925476, + "step": 121250 + }, + { + "epoch": 0.12212932382929391, + "grad_norm": 12.193471720602403, + "learning_rate": 4.9362972624716545e-05, + "loss": 2.3607, + "mean_token_accuracy": 0.4344827473163605, + "step": 121255 + }, + { + "epoch": 0.12213435988239808, + "grad_norm": 12.938878458011072, + "learning_rate": 4.9362884088441554e-05, + "loss": 2.2493, + "mean_token_accuracy": 0.46896551847457885, + "step": 121260 + }, + { + "epoch": 0.12213939593550226, + "grad_norm": 11.36122923191146, + "learning_rate": 4.936279554610281e-05, + "loss": 2.7301, + "mean_token_accuracy": 0.4206896543502808, + "step": 121265 + }, + { + "epoch": 0.12214443198860643, + "grad_norm": 9.776324450531078, + "learning_rate": 4.936270699770034e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.4620689690113068, + "step": 121270 + }, + { + "epoch": 0.1221494680417106, + "grad_norm": 14.679221488187956, + "learning_rate": 4.9362618443234164e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.4034482717514038, + "step": 121275 + }, + { + "epoch": 0.12215450409481478, + "grad_norm": 11.263835252959014, + "learning_rate": 4.936252988270431e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.4260738015174866, + "step": 121280 + }, + { + "epoch": 0.12215954014791895, + "grad_norm": 10.71129766485962, + "learning_rate": 4.9362441316110804e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.42413793206214906, + "step": 121285 + }, + { + "epoch": 0.12216457620102313, + "grad_norm": 8.405130328536725, + "learning_rate": 4.9362352743453675e-05, + "loss": 2.0884, + "mean_token_accuracy": 0.4517241358757019, + "step": 121290 + }, + { + "epoch": 0.1221696122541273, + "grad_norm": 11.797503653762023, + "learning_rate": 4.936226416473293e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.4517241418361664, + "step": 121295 + }, + { + "epoch": 0.12217464830723147, + "grad_norm": 13.954412993068757, + "learning_rate": 4.936217557994861e-05, + "loss": 2.7041, + "mean_token_accuracy": 0.40689654350280763, + "step": 121300 + }, + { + "epoch": 0.12217968436033565, + "grad_norm": 12.415919735727012, + "learning_rate": 4.936208698910073e-05, + "loss": 2.5593, + "mean_token_accuracy": 0.42413792610168455, + "step": 121305 + }, + { + "epoch": 0.12218472041343982, + "grad_norm": 10.828909108272393, + "learning_rate": 4.9361998392189327e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.4275862157344818, + "step": 121310 + }, + { + "epoch": 0.122189756466544, + "grad_norm": 13.159279686413846, + "learning_rate": 4.936190978921441e-05, + "loss": 2.4755, + "mean_token_accuracy": 0.4461584985256195, + "step": 121315 + }, + { + "epoch": 0.12219479251964815, + "grad_norm": 8.356807728290756, + "learning_rate": 4.9361821180176007e-05, + "loss": 1.9142, + "mean_token_accuracy": 0.4965517342090607, + "step": 121320 + }, + { + "epoch": 0.12219982857275233, + "grad_norm": 13.356515919549372, + "learning_rate": 4.936173256507415e-05, + "loss": 2.7456, + "mean_token_accuracy": 0.4241379201412201, + "step": 121325 + }, + { + "epoch": 0.1222048646258565, + "grad_norm": 15.580783028937352, + "learning_rate": 4.936164394390887e-05, + "loss": 2.2568, + "mean_token_accuracy": 0.4206896543502808, + "step": 121330 + }, + { + "epoch": 0.12220990067896068, + "grad_norm": 9.987052801556084, + "learning_rate": 4.936155531668017e-05, + "loss": 2.4574, + "mean_token_accuracy": 0.4034482777118683, + "step": 121335 + }, + { + "epoch": 0.12221493673206485, + "grad_norm": 13.054548295401002, + "learning_rate": 4.9361466683388085e-05, + "loss": 2.2296, + "mean_token_accuracy": 0.46551724076271056, + "step": 121340 + }, + { + "epoch": 0.12221997278516902, + "grad_norm": 10.620575354067812, + "learning_rate": 4.9361378044032646e-05, + "loss": 2.4148, + "mean_token_accuracy": 0.4310344815254211, + "step": 121345 + }, + { + "epoch": 0.1222250088382732, + "grad_norm": 12.219362256712568, + "learning_rate": 4.936128939861387e-05, + "loss": 2.5817, + "mean_token_accuracy": 0.37931033968925476, + "step": 121350 + }, + { + "epoch": 0.12223004489137737, + "grad_norm": 10.844771906941629, + "learning_rate": 4.9361200747131785e-05, + "loss": 2.597, + "mean_token_accuracy": 0.43448275327682495, + "step": 121355 + }, + { + "epoch": 0.12223508094448154, + "grad_norm": 11.191620068907104, + "learning_rate": 4.9361112089586416e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.4034482777118683, + "step": 121360 + }, + { + "epoch": 0.12224011699758572, + "grad_norm": 10.12592926822746, + "learning_rate": 4.936102342597778e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.41379310488700866, + "step": 121365 + }, + { + "epoch": 0.12224515305068989, + "grad_norm": 10.03368269690278, + "learning_rate": 4.936093475630592e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.44482759237289426, + "step": 121370 + }, + { + "epoch": 0.12225018910379407, + "grad_norm": 9.109986488930303, + "learning_rate": 4.9360846080570834e-05, + "loss": 1.9092, + "mean_token_accuracy": 0.5036902606487275, + "step": 121375 + }, + { + "epoch": 0.12225522515689824, + "grad_norm": 11.97127112414645, + "learning_rate": 4.936075739877257e-05, + "loss": 2.3626, + "mean_token_accuracy": 0.42758620381355283, + "step": 121380 + }, + { + "epoch": 0.12226026121000241, + "grad_norm": 12.70064289269278, + "learning_rate": 4.9360668710911137e-05, + "loss": 3.175, + "mean_token_accuracy": 0.3594676285982132, + "step": 121385 + }, + { + "epoch": 0.12226529726310657, + "grad_norm": 10.032753153666066, + "learning_rate": 4.936058001698657e-05, + "loss": 2.4278, + "mean_token_accuracy": 0.42068964838981626, + "step": 121390 + }, + { + "epoch": 0.12227033331621075, + "grad_norm": 12.213514883895792, + "learning_rate": 4.936049131699889e-05, + "loss": 2.5609, + "mean_token_accuracy": 0.4068965494632721, + "step": 121395 + }, + { + "epoch": 0.12227536936931492, + "grad_norm": 11.153800724567695, + "learning_rate": 4.9360402610948115e-05, + "loss": 2.422, + "mean_token_accuracy": 0.38620689511299133, + "step": 121400 + }, + { + "epoch": 0.1222804054224191, + "grad_norm": 11.833366417666083, + "learning_rate": 4.936031389883428e-05, + "loss": 2.5362, + "mean_token_accuracy": 0.4, + "step": 121405 + }, + { + "epoch": 0.12228544147552327, + "grad_norm": 9.554982204496632, + "learning_rate": 4.936022518065741e-05, + "loss": 2.3373, + "mean_token_accuracy": 0.4310344815254211, + "step": 121410 + }, + { + "epoch": 0.12229047752862744, + "grad_norm": 9.903573236209526, + "learning_rate": 4.9360136456417524e-05, + "loss": 2.6248, + "mean_token_accuracy": 0.42758620381355283, + "step": 121415 + }, + { + "epoch": 0.12229551358173162, + "grad_norm": 9.361240432603488, + "learning_rate": 4.936004772611464e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.41724138259887694, + "step": 121420 + }, + { + "epoch": 0.12230054963483579, + "grad_norm": 14.207082360371928, + "learning_rate": 4.935995898974879e-05, + "loss": 2.6811, + "mean_token_accuracy": 0.37241379022598264, + "step": 121425 + }, + { + "epoch": 0.12230558568793996, + "grad_norm": 11.99375107440776, + "learning_rate": 4.9359870247320007e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.4261947989463806, + "step": 121430 + }, + { + "epoch": 0.12231062174104414, + "grad_norm": 8.80324942000633, + "learning_rate": 4.9359781498828303e-05, + "loss": 2.1206, + "mean_token_accuracy": 0.482758617401123, + "step": 121435 + }, + { + "epoch": 0.12231565779414831, + "grad_norm": 11.111146703953166, + "learning_rate": 4.935969274427371e-05, + "loss": 2.2333, + "mean_token_accuracy": 0.4655172348022461, + "step": 121440 + }, + { + "epoch": 0.12232069384725248, + "grad_norm": 9.926311242152007, + "learning_rate": 4.935960398365625e-05, + "loss": 2.1255, + "mean_token_accuracy": 0.4310344815254211, + "step": 121445 + }, + { + "epoch": 0.12232572990035666, + "grad_norm": 10.450654889452851, + "learning_rate": 4.935951521697594e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.4862068951129913, + "step": 121450 + }, + { + "epoch": 0.12233076595346083, + "grad_norm": 10.294057870232526, + "learning_rate": 4.935942644423282e-05, + "loss": 2.1529, + "mean_token_accuracy": 0.4692679882049561, + "step": 121455 + }, + { + "epoch": 0.12233580200656499, + "grad_norm": 15.892870821234443, + "learning_rate": 4.93593376654269e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.4689655125141144, + "step": 121460 + }, + { + "epoch": 0.12234083805966917, + "grad_norm": 10.353767881472717, + "learning_rate": 4.935924888055822e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.46424682140350343, + "step": 121465 + }, + { + "epoch": 0.12234587411277334, + "grad_norm": 10.278426076973306, + "learning_rate": 4.935916008962679e-05, + "loss": 2.546, + "mean_token_accuracy": 0.42758620381355283, + "step": 121470 + }, + { + "epoch": 0.12235091016587751, + "grad_norm": 13.428204139539321, + "learning_rate": 4.935907129263264e-05, + "loss": 2.4375, + "mean_token_accuracy": 0.38965516686439516, + "step": 121475 + }, + { + "epoch": 0.12235594621898169, + "grad_norm": 9.050061318899274, + "learning_rate": 4.93589824895758e-05, + "loss": 2.4069, + "mean_token_accuracy": 0.38620689511299133, + "step": 121480 + }, + { + "epoch": 0.12236098227208586, + "grad_norm": 9.360855229320544, + "learning_rate": 4.935889368045628e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.4551724076271057, + "step": 121485 + }, + { + "epoch": 0.12236601832519003, + "grad_norm": 9.60718879025586, + "learning_rate": 4.9358804865274124e-05, + "loss": 2.693, + "mean_token_accuracy": 0.42758620381355283, + "step": 121490 + }, + { + "epoch": 0.12237105437829421, + "grad_norm": 11.53511541930201, + "learning_rate": 4.935871604402934e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.41724138855934145, + "step": 121495 + }, + { + "epoch": 0.12237609043139838, + "grad_norm": 11.885172029189246, + "learning_rate": 4.9358627216721964e-05, + "loss": 2.2745, + "mean_token_accuracy": 0.4413793087005615, + "step": 121500 + }, + { + "epoch": 0.12238112648450256, + "grad_norm": 9.970289511704793, + "learning_rate": 4.935853838335202e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.4379310369491577, + "step": 121505 + }, + { + "epoch": 0.12238616253760673, + "grad_norm": 9.820630526731277, + "learning_rate": 4.935844954391951e-05, + "loss": 2.165, + "mean_token_accuracy": 0.493103438615799, + "step": 121510 + }, + { + "epoch": 0.1223911985907109, + "grad_norm": 17.48422252904585, + "learning_rate": 4.9358360698424495e-05, + "loss": 2.5834, + "mean_token_accuracy": 0.4206896543502808, + "step": 121515 + }, + { + "epoch": 0.12239623464381508, + "grad_norm": 12.956607920809779, + "learning_rate": 4.935827184686698e-05, + "loss": 2.9828, + "mean_token_accuracy": 0.4103448212146759, + "step": 121520 + }, + { + "epoch": 0.12240127069691925, + "grad_norm": 10.05435123886727, + "learning_rate": 4.935818298924699e-05, + "loss": 2.091, + "mean_token_accuracy": 0.47084089517593386, + "step": 121525 + }, + { + "epoch": 0.12240630675002341, + "grad_norm": 9.384768454592837, + "learning_rate": 4.935809412556455e-05, + "loss": 2.3141, + "mean_token_accuracy": 0.441379314661026, + "step": 121530 + }, + { + "epoch": 0.12241134280312758, + "grad_norm": 12.261951383600177, + "learning_rate": 4.935800525581969e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.4689655125141144, + "step": 121535 + }, + { + "epoch": 0.12241637885623176, + "grad_norm": 13.864205712724058, + "learning_rate": 4.935791638001243e-05, + "loss": 1.9623, + "mean_token_accuracy": 0.4620689690113068, + "step": 121540 + }, + { + "epoch": 0.12242141490933593, + "grad_norm": 8.363041382122102, + "learning_rate": 4.9357827498142784e-05, + "loss": 2.0423, + "mean_token_accuracy": 0.4689655125141144, + "step": 121545 + }, + { + "epoch": 0.1224264509624401, + "grad_norm": 8.964253345151192, + "learning_rate": 4.93577386102108e-05, + "loss": 2.6198, + "mean_token_accuracy": 0.44482759237289426, + "step": 121550 + }, + { + "epoch": 0.12243148701554428, + "grad_norm": 11.927986116234097, + "learning_rate": 4.9357649716216485e-05, + "loss": 2.3706, + "mean_token_accuracy": 0.4689655125141144, + "step": 121555 + }, + { + "epoch": 0.12243652306864845, + "grad_norm": 9.108694779143375, + "learning_rate": 4.935756081615988e-05, + "loss": 2.4559, + "mean_token_accuracy": 0.4413793087005615, + "step": 121560 + }, + { + "epoch": 0.12244155912175263, + "grad_norm": 10.995882469029445, + "learning_rate": 4.935747191004099e-05, + "loss": 2.3059, + "mean_token_accuracy": 0.46896551847457885, + "step": 121565 + }, + { + "epoch": 0.1224465951748568, + "grad_norm": 12.557856584590972, + "learning_rate": 4.935738299785984e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.44827585816383364, + "step": 121570 + }, + { + "epoch": 0.12245163122796097, + "grad_norm": 7.766123913482772, + "learning_rate": 4.9357294079616475e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.44827585816383364, + "step": 121575 + }, + { + "epoch": 0.12245666728106515, + "grad_norm": 11.862250227676634, + "learning_rate": 4.935720515531091e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.4034482717514038, + "step": 121580 + }, + { + "epoch": 0.12246170333416932, + "grad_norm": 9.576481949485492, + "learning_rate": 4.9357116224943164e-05, + "loss": 2.2323, + "mean_token_accuracy": 0.44694494605064394, + "step": 121585 + }, + { + "epoch": 0.1224667393872735, + "grad_norm": 11.820432407760167, + "learning_rate": 4.935702728851326e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.4569872975349426, + "step": 121590 + }, + { + "epoch": 0.12247177544037767, + "grad_norm": 10.79248982777244, + "learning_rate": 4.935693834602124e-05, + "loss": 2.233, + "mean_token_accuracy": 0.441379314661026, + "step": 121595 + }, + { + "epoch": 0.12247681149348183, + "grad_norm": 26.80401715911033, + "learning_rate": 4.935684939746711e-05, + "loss": 2.7738, + "mean_token_accuracy": 0.39310344457626345, + "step": 121600 + }, + { + "epoch": 0.122481847546586, + "grad_norm": 9.832309485629386, + "learning_rate": 4.9356760442850894e-05, + "loss": 2.0483, + "mean_token_accuracy": 0.47931033968925474, + "step": 121605 + }, + { + "epoch": 0.12248688359969018, + "grad_norm": 9.752849769286588, + "learning_rate": 4.935667148217263e-05, + "loss": 2.3149, + "mean_token_accuracy": 0.4413793087005615, + "step": 121610 + }, + { + "epoch": 0.12249191965279435, + "grad_norm": 9.915835315350268, + "learning_rate": 4.935658251543234e-05, + "loss": 2.1885, + "mean_token_accuracy": 0.4, + "step": 121615 + }, + { + "epoch": 0.12249695570589852, + "grad_norm": 17.210321293211422, + "learning_rate": 4.935649354263004e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.42546883821487425, + "step": 121620 + }, + { + "epoch": 0.1225019917590027, + "grad_norm": 10.719012778819799, + "learning_rate": 4.935640456376577e-05, + "loss": 2.5969, + "mean_token_accuracy": 0.4, + "step": 121625 + }, + { + "epoch": 0.12250702781210687, + "grad_norm": 13.328770440093699, + "learning_rate": 4.9356315578839536e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.4310344815254211, + "step": 121630 + }, + { + "epoch": 0.12251206386521105, + "grad_norm": 10.454093315816003, + "learning_rate": 4.935622658785137e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.41034482717514037, + "step": 121635 + }, + { + "epoch": 0.12251709991831522, + "grad_norm": 11.469231157751647, + "learning_rate": 4.935613759080131e-05, + "loss": 2.7981, + "mean_token_accuracy": 0.3965517282485962, + "step": 121640 + }, + { + "epoch": 0.1225221359714194, + "grad_norm": 11.811339025463111, + "learning_rate": 4.935604858768935e-05, + "loss": 2.2884, + "mean_token_accuracy": 0.43297035694122316, + "step": 121645 + }, + { + "epoch": 0.12252717202452357, + "grad_norm": 10.91494700715504, + "learning_rate": 4.935595957851554e-05, + "loss": 2.407, + "mean_token_accuracy": 0.4034482777118683, + "step": 121650 + }, + { + "epoch": 0.12253220807762774, + "grad_norm": 9.602808461567404, + "learning_rate": 4.935587056327991e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.3965517282485962, + "step": 121655 + }, + { + "epoch": 0.12253724413073191, + "grad_norm": 9.886630898940915, + "learning_rate": 4.9355781541982455e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.46733213067054746, + "step": 121660 + }, + { + "epoch": 0.12254228018383609, + "grad_norm": 15.719746930413635, + "learning_rate": 4.935569251462323e-05, + "loss": 3.0132, + "mean_token_accuracy": 0.31034483313560485, + "step": 121665 + }, + { + "epoch": 0.12254731623694025, + "grad_norm": 9.711687822022741, + "learning_rate": 4.9355603481202243e-05, + "loss": 2.427, + "mean_token_accuracy": 0.38275861740112305, + "step": 121670 + }, + { + "epoch": 0.12255235229004442, + "grad_norm": 14.215228459804647, + "learning_rate": 4.935551444171952e-05, + "loss": 3.0844, + "mean_token_accuracy": 0.3947368383407593, + "step": 121675 + }, + { + "epoch": 0.1225573883431486, + "grad_norm": 8.346107333980514, + "learning_rate": 4.935542539617509e-05, + "loss": 2.9142, + "mean_token_accuracy": 0.3758620619773865, + "step": 121680 + }, + { + "epoch": 0.12256242439625277, + "grad_norm": 10.163008809713059, + "learning_rate": 4.935533634456898e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.46896551847457885, + "step": 121685 + }, + { + "epoch": 0.12256746044935694, + "grad_norm": 10.430847811801907, + "learning_rate": 4.9355247286901206e-05, + "loss": 2.3228, + "mean_token_accuracy": 0.43448275327682495, + "step": 121690 + }, + { + "epoch": 0.12257249650246112, + "grad_norm": 10.771167818701835, + "learning_rate": 4.9355158223171806e-05, + "loss": 2.4486, + "mean_token_accuracy": 0.4103448152542114, + "step": 121695 + }, + { + "epoch": 0.12257753255556529, + "grad_norm": 10.034137743810472, + "learning_rate": 4.935506915338079e-05, + "loss": 2.6503, + "mean_token_accuracy": 0.4068965554237366, + "step": 121700 + }, + { + "epoch": 0.12258256860866946, + "grad_norm": 10.4542404728085, + "learning_rate": 4.935498007752819e-05, + "loss": 2.3909, + "mean_token_accuracy": 0.41379310488700866, + "step": 121705 + }, + { + "epoch": 0.12258760466177364, + "grad_norm": 10.983530671415469, + "learning_rate": 4.935489099561403e-05, + "loss": 2.7458, + "mean_token_accuracy": 0.41724138557910917, + "step": 121710 + }, + { + "epoch": 0.12259264071487781, + "grad_norm": 8.880514656150925, + "learning_rate": 4.935480190763833e-05, + "loss": 2.5023, + "mean_token_accuracy": 0.42413792908191683, + "step": 121715 + }, + { + "epoch": 0.12259767676798199, + "grad_norm": 8.189523926450258, + "learning_rate": 4.935471281360112e-05, + "loss": 2.1778, + "mean_token_accuracy": 0.44482758045196535, + "step": 121720 + }, + { + "epoch": 0.12260271282108616, + "grad_norm": 11.20940686563691, + "learning_rate": 4.935462371350244e-05, + "loss": 2.4978, + "mean_token_accuracy": 0.38620689511299133, + "step": 121725 + }, + { + "epoch": 0.12260774887419033, + "grad_norm": 10.86722975107101, + "learning_rate": 4.935453460734228e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.41034482717514037, + "step": 121730 + }, + { + "epoch": 0.12261278492729451, + "grad_norm": 8.845563554866562, + "learning_rate": 4.935444549512068e-05, + "loss": 2.4495, + "mean_token_accuracy": 0.44827585816383364, + "step": 121735 + }, + { + "epoch": 0.12261782098039867, + "grad_norm": 9.751217644349351, + "learning_rate": 4.9354356376837676e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.458620685338974, + "step": 121740 + }, + { + "epoch": 0.12262285703350284, + "grad_norm": 10.078829783583458, + "learning_rate": 4.935426725249329e-05, + "loss": 2.405, + "mean_token_accuracy": 0.41379310488700866, + "step": 121745 + }, + { + "epoch": 0.12262789308660701, + "grad_norm": 10.797611699898967, + "learning_rate": 4.935417812208754e-05, + "loss": 2.2163, + "mean_token_accuracy": 0.4448275864124298, + "step": 121750 + }, + { + "epoch": 0.12263292913971119, + "grad_norm": 12.4847417045229, + "learning_rate": 4.935408898562045e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.37586206793785093, + "step": 121755 + }, + { + "epoch": 0.12263796519281536, + "grad_norm": 9.386764998884157, + "learning_rate": 4.935399984309205e-05, + "loss": 2.3367, + "mean_token_accuracy": 0.4620689690113068, + "step": 121760 + }, + { + "epoch": 0.12264300124591954, + "grad_norm": 10.206212944884228, + "learning_rate": 4.935391069450235e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.43793103098869324, + "step": 121765 + }, + { + "epoch": 0.12264803729902371, + "grad_norm": 8.367255113049477, + "learning_rate": 4.93538215398514e-05, + "loss": 2.3965, + "mean_token_accuracy": 0.4034482717514038, + "step": 121770 + }, + { + "epoch": 0.12265307335212788, + "grad_norm": 12.882973084132638, + "learning_rate": 4.935373237913921e-05, + "loss": 2.1428, + "mean_token_accuracy": 0.47931033968925474, + "step": 121775 + }, + { + "epoch": 0.12265810940523206, + "grad_norm": 12.253773367498395, + "learning_rate": 4.935364321236579e-05, + "loss": 2.527, + "mean_token_accuracy": 0.42413792610168455, + "step": 121780 + }, + { + "epoch": 0.12266314545833623, + "grad_norm": 10.887786789705851, + "learning_rate": 4.935355403953119e-05, + "loss": 2.4306, + "mean_token_accuracy": 0.43793103098869324, + "step": 121785 + }, + { + "epoch": 0.1226681815114404, + "grad_norm": 12.070078026633132, + "learning_rate": 4.935346486063543e-05, + "loss": 2.3688, + "mean_token_accuracy": 0.44482758045196535, + "step": 121790 + }, + { + "epoch": 0.12267321756454458, + "grad_norm": 12.326080088770455, + "learning_rate": 4.935337567567853e-05, + "loss": 2.456, + "mean_token_accuracy": 0.42068966031074523, + "step": 121795 + }, + { + "epoch": 0.12267825361764875, + "grad_norm": 9.437294848283038, + "learning_rate": 4.9353286484660505e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.43103448748588563, + "step": 121800 + }, + { + "epoch": 0.12268328967075293, + "grad_norm": 9.5412196916285, + "learning_rate": 4.9353197287581406e-05, + "loss": 2.4068, + "mean_token_accuracy": 0.4413793087005615, + "step": 121805 + }, + { + "epoch": 0.12268832572385709, + "grad_norm": 10.883449010273933, + "learning_rate": 4.935310808444123e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.4896551728248596, + "step": 121810 + }, + { + "epoch": 0.12269336177696126, + "grad_norm": 10.060457538465636, + "learning_rate": 4.935301887524001e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.441379314661026, + "step": 121815 + }, + { + "epoch": 0.12269839783006543, + "grad_norm": 9.876891216365344, + "learning_rate": 4.9352929659977775e-05, + "loss": 2.386, + "mean_token_accuracy": 0.4172413766384125, + "step": 121820 + }, + { + "epoch": 0.12270343388316961, + "grad_norm": 11.566010370984657, + "learning_rate": 4.9352840438654554e-05, + "loss": 2.5215, + "mean_token_accuracy": 0.4103448331356049, + "step": 121825 + }, + { + "epoch": 0.12270846993627378, + "grad_norm": 10.526634497768276, + "learning_rate": 4.9352751211270365e-05, + "loss": 2.2711, + "mean_token_accuracy": 0.47931034564971925, + "step": 121830 + }, + { + "epoch": 0.12271350598937796, + "grad_norm": 9.865047703071046, + "learning_rate": 4.9352661977825235e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4034482777118683, + "step": 121835 + }, + { + "epoch": 0.12271854204248213, + "grad_norm": 10.104994077691904, + "learning_rate": 4.935257273831918e-05, + "loss": 2.7055, + "mean_token_accuracy": 0.36896551847457887, + "step": 121840 + }, + { + "epoch": 0.1227235780955863, + "grad_norm": 10.495909520130299, + "learning_rate": 4.935248349275224e-05, + "loss": 2.5719, + "mean_token_accuracy": 0.4034482777118683, + "step": 121845 + }, + { + "epoch": 0.12272861414869048, + "grad_norm": 9.977928245229942, + "learning_rate": 4.9352394241124424e-05, + "loss": 2.1629, + "mean_token_accuracy": 0.4586206912994385, + "step": 121850 + }, + { + "epoch": 0.12273365020179465, + "grad_norm": 8.983168042489476, + "learning_rate": 4.935230498343577e-05, + "loss": 1.9745, + "mean_token_accuracy": 0.4604960739612579, + "step": 121855 + }, + { + "epoch": 0.12273868625489882, + "grad_norm": 10.463266512923543, + "learning_rate": 4.935221571968629e-05, + "loss": 2.2338, + "mean_token_accuracy": 0.4310344815254211, + "step": 121860 + }, + { + "epoch": 0.122743722308003, + "grad_norm": 10.715422570772402, + "learning_rate": 4.935212644987603e-05, + "loss": 2.6194, + "mean_token_accuracy": 0.4172413766384125, + "step": 121865 + }, + { + "epoch": 0.12274875836110717, + "grad_norm": 12.817112697024978, + "learning_rate": 4.9352037174005e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.458620685338974, + "step": 121870 + }, + { + "epoch": 0.12275379441421135, + "grad_norm": 11.155629682729474, + "learning_rate": 4.935194789207322e-05, + "loss": 2.6317, + "mean_token_accuracy": 0.41379310488700866, + "step": 121875 + }, + { + "epoch": 0.1227588304673155, + "grad_norm": 11.608936094961171, + "learning_rate": 4.9351858604080726e-05, + "loss": 2.7478, + "mean_token_accuracy": 0.458620685338974, + "step": 121880 + }, + { + "epoch": 0.12276386652041968, + "grad_norm": 9.982194832615862, + "learning_rate": 4.9351769310027536e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.4482758641242981, + "step": 121885 + }, + { + "epoch": 0.12276890257352385, + "grad_norm": 9.183322155753466, + "learning_rate": 4.935168000991367e-05, + "loss": 2.2612, + "mean_token_accuracy": 0.4310344815254211, + "step": 121890 + }, + { + "epoch": 0.12277393862662803, + "grad_norm": 11.314825603512896, + "learning_rate": 4.935159070373917e-05, + "loss": 2.6206, + "mean_token_accuracy": 0.38275861740112305, + "step": 121895 + }, + { + "epoch": 0.1227789746797322, + "grad_norm": 9.898057998260635, + "learning_rate": 4.9351501391504035e-05, + "loss": 1.8034, + "mean_token_accuracy": 0.517241370677948, + "step": 121900 + }, + { + "epoch": 0.12278401073283637, + "grad_norm": 11.2381078925426, + "learning_rate": 4.9351412073208315e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.3793103456497192, + "step": 121905 + }, + { + "epoch": 0.12278904678594055, + "grad_norm": 10.566624335440231, + "learning_rate": 4.935132274885202e-05, + "loss": 2.377, + "mean_token_accuracy": 0.41034482419490814, + "step": 121910 + }, + { + "epoch": 0.12279408283904472, + "grad_norm": 9.181959502235571, + "learning_rate": 4.935123341843519e-05, + "loss": 2.3739, + "mean_token_accuracy": 0.4103448301553726, + "step": 121915 + }, + { + "epoch": 0.1227991188921489, + "grad_norm": 11.317684730500437, + "learning_rate": 4.935114408195782e-05, + "loss": 2.62, + "mean_token_accuracy": 0.3896551728248596, + "step": 121920 + }, + { + "epoch": 0.12280415494525307, + "grad_norm": 9.163524177015958, + "learning_rate": 4.935105473941997e-05, + "loss": 2.1259, + "mean_token_accuracy": 0.47586206793785096, + "step": 121925 + }, + { + "epoch": 0.12280919099835724, + "grad_norm": 8.947398990806773, + "learning_rate": 4.935096539082164e-05, + "loss": 2.0649, + "mean_token_accuracy": 0.46551724672317507, + "step": 121930 + }, + { + "epoch": 0.12281422705146142, + "grad_norm": 12.908806992140438, + "learning_rate": 4.935087603616287e-05, + "loss": 2.4393, + "mean_token_accuracy": 0.4517241358757019, + "step": 121935 + }, + { + "epoch": 0.12281926310456559, + "grad_norm": 8.906935245391251, + "learning_rate": 4.9350786675443666e-05, + "loss": 2.1313, + "mean_token_accuracy": 0.42758620381355283, + "step": 121940 + }, + { + "epoch": 0.12282429915766976, + "grad_norm": 9.254257509208076, + "learning_rate": 4.935069730866407e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.43448275327682495, + "step": 121945 + }, + { + "epoch": 0.12282933521077392, + "grad_norm": 12.868984027175689, + "learning_rate": 4.935060793582411e-05, + "loss": 2.8082, + "mean_token_accuracy": 0.358620685338974, + "step": 121950 + }, + { + "epoch": 0.1228343712638781, + "grad_norm": 9.684376588769746, + "learning_rate": 4.9350518556923794e-05, + "loss": 2.196, + "mean_token_accuracy": 0.4413793087005615, + "step": 121955 + }, + { + "epoch": 0.12283940731698227, + "grad_norm": 32.1243764826919, + "learning_rate": 4.935042917196316e-05, + "loss": 3.1854, + "mean_token_accuracy": 0.36551723778247835, + "step": 121960 + }, + { + "epoch": 0.12284444337008645, + "grad_norm": 14.493461315580515, + "learning_rate": 4.935033978094222e-05, + "loss": 2.697, + "mean_token_accuracy": 0.42758620381355283, + "step": 121965 + }, + { + "epoch": 0.12284947942319062, + "grad_norm": 11.240579427875884, + "learning_rate": 4.935025038386101e-05, + "loss": 2.487, + "mean_token_accuracy": 0.3896551728248596, + "step": 121970 + }, + { + "epoch": 0.12285451547629479, + "grad_norm": 10.173161328823317, + "learning_rate": 4.935016098071956e-05, + "loss": 2.4905, + "mean_token_accuracy": 0.4586206912994385, + "step": 121975 + }, + { + "epoch": 0.12285955152939897, + "grad_norm": 12.41371511141704, + "learning_rate": 4.935007157151788e-05, + "loss": 2.1636, + "mean_token_accuracy": 0.4877797901630402, + "step": 121980 + }, + { + "epoch": 0.12286458758250314, + "grad_norm": 9.607398853428961, + "learning_rate": 4.9349982156255994e-05, + "loss": 2.5129, + "mean_token_accuracy": 0.4448275864124298, + "step": 121985 + }, + { + "epoch": 0.12286962363560731, + "grad_norm": 8.732479306056428, + "learning_rate": 4.934989273493395e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.41724138259887694, + "step": 121990 + }, + { + "epoch": 0.12287465968871149, + "grad_norm": 11.018997964429701, + "learning_rate": 4.934980330755174e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.3999999940395355, + "step": 121995 + }, + { + "epoch": 0.12287969574181566, + "grad_norm": 9.88367912052159, + "learning_rate": 4.934971387410942e-05, + "loss": 2.3735, + "mean_token_accuracy": 0.4206896543502808, + "step": 122000 + }, + { + "epoch": 0.12288473179491984, + "grad_norm": 11.125284335688404, + "learning_rate": 4.934962443460699e-05, + "loss": 2.4272, + "mean_token_accuracy": 0.4310344934463501, + "step": 122005 + }, + { + "epoch": 0.12288976784802401, + "grad_norm": 9.200645577064995, + "learning_rate": 4.934953498904449e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.4448275864124298, + "step": 122010 + }, + { + "epoch": 0.12289480390112818, + "grad_norm": 11.973062038301448, + "learning_rate": 4.934944553742194e-05, + "loss": 2.3666, + "mean_token_accuracy": 0.41034482717514037, + "step": 122015 + }, + { + "epoch": 0.12289983995423234, + "grad_norm": 11.3152855482902, + "learning_rate": 4.934935607973936e-05, + "loss": 2.8278, + "mean_token_accuracy": 0.4013309061527252, + "step": 122020 + }, + { + "epoch": 0.12290487600733652, + "grad_norm": 9.861293063168498, + "learning_rate": 4.934926661599679e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.4620689630508423, + "step": 122025 + }, + { + "epoch": 0.12290991206044069, + "grad_norm": 12.701298497665615, + "learning_rate": 4.9349177146194236e-05, + "loss": 2.7552, + "mean_token_accuracy": 0.40145190358161925, + "step": 122030 + }, + { + "epoch": 0.12291494811354486, + "grad_norm": 11.370258313665408, + "learning_rate": 4.9349087670331726e-05, + "loss": 2.3378, + "mean_token_accuracy": 0.4034482777118683, + "step": 122035 + }, + { + "epoch": 0.12291998416664904, + "grad_norm": 9.157821819141166, + "learning_rate": 4.93489981884093e-05, + "loss": 2.0949, + "mean_token_accuracy": 0.48275861144065857, + "step": 122040 + }, + { + "epoch": 0.12292502021975321, + "grad_norm": 11.709418180204672, + "learning_rate": 4.934890870042697e-05, + "loss": 2.8257, + "mean_token_accuracy": 0.3655172407627106, + "step": 122045 + }, + { + "epoch": 0.12293005627285739, + "grad_norm": 12.674235237540847, + "learning_rate": 4.9348819206384764e-05, + "loss": 2.3904, + "mean_token_accuracy": 0.4620689690113068, + "step": 122050 + }, + { + "epoch": 0.12293509232596156, + "grad_norm": 7.832461608738996, + "learning_rate": 4.9348729706282706e-05, + "loss": 2.2918, + "mean_token_accuracy": 0.4482758641242981, + "step": 122055 + }, + { + "epoch": 0.12294012837906573, + "grad_norm": 11.729691535342516, + "learning_rate": 4.9348640200120824e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.41785714626312254, + "step": 122060 + }, + { + "epoch": 0.1229451644321699, + "grad_norm": 12.683163604003601, + "learning_rate": 4.934855068789913e-05, + "loss": 2.1076, + "mean_token_accuracy": 0.5137931108474731, + "step": 122065 + }, + { + "epoch": 0.12295020048527408, + "grad_norm": 10.10668550733948, + "learning_rate": 4.9348461169617674e-05, + "loss": 2.3285, + "mean_token_accuracy": 0.4241379201412201, + "step": 122070 + }, + { + "epoch": 0.12295523653837825, + "grad_norm": 9.60832629517651, + "learning_rate": 4.934837164527645e-05, + "loss": 2.0307, + "mean_token_accuracy": 0.5020568549633027, + "step": 122075 + }, + { + "epoch": 0.12296027259148243, + "grad_norm": 10.753520562126957, + "learning_rate": 4.934828211487551e-05, + "loss": 2.6678, + "mean_token_accuracy": 0.486699515581131, + "step": 122080 + }, + { + "epoch": 0.1229653086445866, + "grad_norm": 10.864081625897514, + "learning_rate": 4.934819257841487e-05, + "loss": 2.5528, + "mean_token_accuracy": 0.42068966031074523, + "step": 122085 + }, + { + "epoch": 0.12297034469769076, + "grad_norm": 10.425084734217068, + "learning_rate": 4.934810303589454e-05, + "loss": 2.579, + "mean_token_accuracy": 0.41034482717514037, + "step": 122090 + }, + { + "epoch": 0.12297538075079494, + "grad_norm": 9.165616166946, + "learning_rate": 4.9348013487314566e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.44827587008476255, + "step": 122095 + }, + { + "epoch": 0.12298041680389911, + "grad_norm": 12.455719235845594, + "learning_rate": 4.934792393267496e-05, + "loss": 2.6802, + "mean_token_accuracy": 0.3758620709180832, + "step": 122100 + }, + { + "epoch": 0.12298545285700328, + "grad_norm": 12.10342353058217, + "learning_rate": 4.9347834371975754e-05, + "loss": 2.6783, + "mean_token_accuracy": 0.4137930989265442, + "step": 122105 + }, + { + "epoch": 0.12299048891010746, + "grad_norm": 10.266020780838815, + "learning_rate": 4.9347744805216966e-05, + "loss": 2.1128, + "mean_token_accuracy": 0.47586206793785096, + "step": 122110 + }, + { + "epoch": 0.12299552496321163, + "grad_norm": 10.073603447435751, + "learning_rate": 4.934765523239863e-05, + "loss": 2.1167, + "mean_token_accuracy": 0.5034482717514038, + "step": 122115 + }, + { + "epoch": 0.1230005610163158, + "grad_norm": 10.152533713681779, + "learning_rate": 4.934756565352076e-05, + "loss": 2.794, + "mean_token_accuracy": 0.40526315569877625, + "step": 122120 + }, + { + "epoch": 0.12300559706941998, + "grad_norm": 10.1068134619733, + "learning_rate": 4.9347476068583386e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.3896551728248596, + "step": 122125 + }, + { + "epoch": 0.12301063312252415, + "grad_norm": 10.609370120959813, + "learning_rate": 4.9347386477586533e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.4689655125141144, + "step": 122130 + }, + { + "epoch": 0.12301566917562833, + "grad_norm": 9.742480497713437, + "learning_rate": 4.934729688053023e-05, + "loss": 2.6343, + "mean_token_accuracy": 0.4206896543502808, + "step": 122135 + }, + { + "epoch": 0.1230207052287325, + "grad_norm": 68.44647623926005, + "learning_rate": 4.934720727741449e-05, + "loss": 2.9402, + "mean_token_accuracy": 0.3965517312288284, + "step": 122140 + }, + { + "epoch": 0.12302574128183667, + "grad_norm": 10.335090569920583, + "learning_rate": 4.934711766823935e-05, + "loss": 1.9034, + "mean_token_accuracy": 0.5137930989265442, + "step": 122145 + }, + { + "epoch": 0.12303077733494085, + "grad_norm": 10.942605504454992, + "learning_rate": 4.934702805300484e-05, + "loss": 2.4326, + "mean_token_accuracy": 0.4776769459247589, + "step": 122150 + }, + { + "epoch": 0.12303581338804502, + "grad_norm": 10.417116363783542, + "learning_rate": 4.9346938431710966e-05, + "loss": 1.9756, + "mean_token_accuracy": 0.4974591672420502, + "step": 122155 + }, + { + "epoch": 0.12304084944114918, + "grad_norm": 14.272467142322608, + "learning_rate": 4.934684880435776e-05, + "loss": 2.4885, + "mean_token_accuracy": 0.3931034505367279, + "step": 122160 + }, + { + "epoch": 0.12304588549425335, + "grad_norm": 10.231096807610847, + "learning_rate": 4.934675917094525e-05, + "loss": 2.567, + "mean_token_accuracy": 0.4000000059604645, + "step": 122165 + }, + { + "epoch": 0.12305092154735753, + "grad_norm": 10.613035074623335, + "learning_rate": 4.934666953147347e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.4172413766384125, + "step": 122170 + }, + { + "epoch": 0.1230559576004617, + "grad_norm": 11.604897608991482, + "learning_rate": 4.934657988594242e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.44827585816383364, + "step": 122175 + }, + { + "epoch": 0.12306099365356588, + "grad_norm": 9.442446180596152, + "learning_rate": 4.934649023435215e-05, + "loss": 2.4477, + "mean_token_accuracy": 0.39310344457626345, + "step": 122180 + }, + { + "epoch": 0.12306602970667005, + "grad_norm": 14.862346094080515, + "learning_rate": 4.9346400576702666e-05, + "loss": 2.6597, + "mean_token_accuracy": 0.3793103456497192, + "step": 122185 + }, + { + "epoch": 0.12307106575977422, + "grad_norm": 9.237671255463706, + "learning_rate": 4.9346310912994004e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.4, + "step": 122190 + }, + { + "epoch": 0.1230761018128784, + "grad_norm": 9.426723561084556, + "learning_rate": 4.9346221243226184e-05, + "loss": 2.4332, + "mean_token_accuracy": 0.44827585220336913, + "step": 122195 + }, + { + "epoch": 0.12308113786598257, + "grad_norm": 11.685295995903461, + "learning_rate": 4.934613156739924e-05, + "loss": 2.5591, + "mean_token_accuracy": 0.4517241358757019, + "step": 122200 + }, + { + "epoch": 0.12308617391908674, + "grad_norm": 9.24584563280647, + "learning_rate": 4.93460418855132e-05, + "loss": 2.0428, + "mean_token_accuracy": 0.4862069010734558, + "step": 122205 + }, + { + "epoch": 0.12309120997219092, + "grad_norm": 9.845804357507516, + "learning_rate": 4.934595219756805e-05, + "loss": 2.4044, + "mean_token_accuracy": 0.43793103098869324, + "step": 122210 + }, + { + "epoch": 0.12309624602529509, + "grad_norm": 9.59392011588664, + "learning_rate": 4.934586250356386e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.3793103516101837, + "step": 122215 + }, + { + "epoch": 0.12310128207839927, + "grad_norm": 10.38199541003397, + "learning_rate": 4.934577280350064e-05, + "loss": 2.5265, + "mean_token_accuracy": 0.4517241358757019, + "step": 122220 + }, + { + "epoch": 0.12310631813150344, + "grad_norm": 11.495513406816896, + "learning_rate": 4.934568309737841e-05, + "loss": 2.3304, + "mean_token_accuracy": 0.42413792610168455, + "step": 122225 + }, + { + "epoch": 0.1231113541846076, + "grad_norm": 11.851411438700698, + "learning_rate": 4.93455933851972e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.5034482777118683, + "step": 122230 + }, + { + "epoch": 0.12311639023771177, + "grad_norm": 14.31969473723742, + "learning_rate": 4.9345503666957035e-05, + "loss": 2.5795, + "mean_token_accuracy": 0.4137930989265442, + "step": 122235 + }, + { + "epoch": 0.12312142629081595, + "grad_norm": 10.698692759965065, + "learning_rate": 4.934541394265794e-05, + "loss": 3.2477, + "mean_token_accuracy": 0.37241379618644715, + "step": 122240 + }, + { + "epoch": 0.12312646234392012, + "grad_norm": 10.50615280000507, + "learning_rate": 4.9345324212299934e-05, + "loss": 2.1866, + "mean_token_accuracy": 0.4758620738983154, + "step": 122245 + }, + { + "epoch": 0.1231314983970243, + "grad_norm": 11.09216276946314, + "learning_rate": 4.934523447588304e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.41034482717514037, + "step": 122250 + }, + { + "epoch": 0.12313653445012847, + "grad_norm": 13.996994909452836, + "learning_rate": 4.9345144733407296e-05, + "loss": 2.1654, + "mean_token_accuracy": 0.4620689630508423, + "step": 122255 + }, + { + "epoch": 0.12314157050323264, + "grad_norm": 10.064488265327634, + "learning_rate": 4.934505498487271e-05, + "loss": 2.2446, + "mean_token_accuracy": 0.4482758641242981, + "step": 122260 + }, + { + "epoch": 0.12314660655633682, + "grad_norm": 12.082994841482064, + "learning_rate": 4.934496523027933e-05, + "loss": 2.8551, + "mean_token_accuracy": 0.37586206793785093, + "step": 122265 + }, + { + "epoch": 0.12315164260944099, + "grad_norm": 10.886210630920107, + "learning_rate": 4.9344875469627166e-05, + "loss": 2.3093, + "mean_token_accuracy": 0.4551724076271057, + "step": 122270 + }, + { + "epoch": 0.12315667866254516, + "grad_norm": 10.62891950233889, + "learning_rate": 4.934478570291624e-05, + "loss": 2.4382, + "mean_token_accuracy": 0.40139141082763674, + "step": 122275 + }, + { + "epoch": 0.12316171471564934, + "grad_norm": 11.946527016663094, + "learning_rate": 4.934469593014658e-05, + "loss": 2.5413, + "mean_token_accuracy": 0.45172414779663084, + "step": 122280 + }, + { + "epoch": 0.12316675076875351, + "grad_norm": 10.377185977853642, + "learning_rate": 4.934460615131821e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.41034482717514037, + "step": 122285 + }, + { + "epoch": 0.12317178682185768, + "grad_norm": 8.429718218233317, + "learning_rate": 4.934451636643116e-05, + "loss": 2.0345, + "mean_token_accuracy": 0.49655171632766726, + "step": 122290 + }, + { + "epoch": 0.12317682287496186, + "grad_norm": 11.707822099177465, + "learning_rate": 4.934442657548546e-05, + "loss": 2.1893, + "mean_token_accuracy": 0.48275862336158754, + "step": 122295 + }, + { + "epoch": 0.12318185892806602, + "grad_norm": 10.599966945278034, + "learning_rate": 4.9344336778481114e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.44827585816383364, + "step": 122300 + }, + { + "epoch": 0.12318689498117019, + "grad_norm": 10.3228864071613, + "learning_rate": 4.9344246975418165e-05, + "loss": 2.4709, + "mean_token_accuracy": 0.4137930989265442, + "step": 122305 + }, + { + "epoch": 0.12319193103427437, + "grad_norm": 19.561972299995883, + "learning_rate": 4.934415716629663e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.4670901417732239, + "step": 122310 + }, + { + "epoch": 0.12319696708737854, + "grad_norm": 12.098653544273036, + "learning_rate": 4.934406735111653e-05, + "loss": 2.6018, + "mean_token_accuracy": 0.37241379022598264, + "step": 122315 + }, + { + "epoch": 0.12320200314048271, + "grad_norm": 13.267338246173694, + "learning_rate": 4.934397752987791e-05, + "loss": 2.8484, + "mean_token_accuracy": 0.3999999940395355, + "step": 122320 + }, + { + "epoch": 0.12320703919358689, + "grad_norm": 11.88545734147257, + "learning_rate": 4.934388770258078e-05, + "loss": 2.6183, + "mean_token_accuracy": 0.4000000059604645, + "step": 122325 + }, + { + "epoch": 0.12321207524669106, + "grad_norm": 9.549587396754854, + "learning_rate": 4.934379786922516e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.458620685338974, + "step": 122330 + }, + { + "epoch": 0.12321711129979523, + "grad_norm": 10.598679680548079, + "learning_rate": 4.934370802981109e-05, + "loss": 2.2729, + "mean_token_accuracy": 0.441379314661026, + "step": 122335 + }, + { + "epoch": 0.12322214735289941, + "grad_norm": 10.873357950340468, + "learning_rate": 4.9343618184338576e-05, + "loss": 2.3628, + "mean_token_accuracy": 0.4257108271121979, + "step": 122340 + }, + { + "epoch": 0.12322718340600358, + "grad_norm": 8.808599840763735, + "learning_rate": 4.934352833280766e-05, + "loss": 2.5232, + "mean_token_accuracy": 0.39655171930789945, + "step": 122345 + }, + { + "epoch": 0.12323221945910776, + "grad_norm": 11.236695446009245, + "learning_rate": 4.934343847521835e-05, + "loss": 2.6449, + "mean_token_accuracy": 0.4310344815254211, + "step": 122350 + }, + { + "epoch": 0.12323725551221193, + "grad_norm": 9.154052568938097, + "learning_rate": 4.9343348611570694e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.42068966031074523, + "step": 122355 + }, + { + "epoch": 0.1232422915653161, + "grad_norm": 12.884911901827888, + "learning_rate": 4.9343258741864704e-05, + "loss": 2.7091, + "mean_token_accuracy": 0.3551724076271057, + "step": 122360 + }, + { + "epoch": 0.12324732761842028, + "grad_norm": 9.59221002167568, + "learning_rate": 4.934316886610039e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.4551724135875702, + "step": 122365 + }, + { + "epoch": 0.12325236367152444, + "grad_norm": 10.957045855982939, + "learning_rate": 4.93430789842778e-05, + "loss": 2.7854, + "mean_token_accuracy": 0.41034482419490814, + "step": 122370 + }, + { + "epoch": 0.12325739972462861, + "grad_norm": 8.685332501064916, + "learning_rate": 4.934298909639695e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.44827587008476255, + "step": 122375 + }, + { + "epoch": 0.12326243577773278, + "grad_norm": 12.684931364529685, + "learning_rate": 4.9342899202457865e-05, + "loss": 2.6232, + "mean_token_accuracy": 0.40344828367233276, + "step": 122380 + }, + { + "epoch": 0.12326747183083696, + "grad_norm": 10.700377474508583, + "learning_rate": 4.9342809302460574e-05, + "loss": 2.478, + "mean_token_accuracy": 0.39310344457626345, + "step": 122385 + }, + { + "epoch": 0.12327250788394113, + "grad_norm": 10.669205270590549, + "learning_rate": 4.934271939640509e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.44827585220336913, + "step": 122390 + }, + { + "epoch": 0.1232775439370453, + "grad_norm": 8.964201599565843, + "learning_rate": 4.934262948429146e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4537809997797012, + "step": 122395 + }, + { + "epoch": 0.12328257999014948, + "grad_norm": 10.237126865968039, + "learning_rate": 4.9342539566119685e-05, + "loss": 2.1953, + "mean_token_accuracy": 0.46551724076271056, + "step": 122400 + }, + { + "epoch": 0.12328761604325365, + "grad_norm": 10.495606421078685, + "learning_rate": 4.93424496418898e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.43793103396892546, + "step": 122405 + }, + { + "epoch": 0.12329265209635783, + "grad_norm": 11.78599740399031, + "learning_rate": 4.934235971160183e-05, + "loss": 2.7297, + "mean_token_accuracy": 0.42068964838981626, + "step": 122410 + }, + { + "epoch": 0.123297688149462, + "grad_norm": 11.006839787611197, + "learning_rate": 4.934226977525581e-05, + "loss": 2.2737, + "mean_token_accuracy": 0.4551724135875702, + "step": 122415 + }, + { + "epoch": 0.12330272420256617, + "grad_norm": 8.10602968583033, + "learning_rate": 4.934217983285175e-05, + "loss": 2.431, + "mean_token_accuracy": 0.44609800577163694, + "step": 122420 + }, + { + "epoch": 0.12330776025567035, + "grad_norm": 10.681590818867242, + "learning_rate": 4.934208988438967e-05, + "loss": 2.5981, + "mean_token_accuracy": 0.4000000059604645, + "step": 122425 + }, + { + "epoch": 0.12331279630877452, + "grad_norm": 9.501809449986075, + "learning_rate": 4.934199992986961e-05, + "loss": 2.1827, + "mean_token_accuracy": 0.4862069010734558, + "step": 122430 + }, + { + "epoch": 0.1233178323618787, + "grad_norm": 8.0031911329358, + "learning_rate": 4.93419099692916e-05, + "loss": 2.1435, + "mean_token_accuracy": 0.4379310250282288, + "step": 122435 + }, + { + "epoch": 0.12332286841498286, + "grad_norm": 18.725685310228762, + "learning_rate": 4.934182000265564e-05, + "loss": 2.4854, + "mean_token_accuracy": 0.4448275864124298, + "step": 122440 + }, + { + "epoch": 0.12332790446808703, + "grad_norm": 10.02171009051166, + "learning_rate": 4.934173002996179e-05, + "loss": 2.0844, + "mean_token_accuracy": 0.4344827651977539, + "step": 122445 + }, + { + "epoch": 0.1233329405211912, + "grad_norm": 15.272123740620883, + "learning_rate": 4.934164005121004e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.37241379022598264, + "step": 122450 + }, + { + "epoch": 0.12333797657429538, + "grad_norm": 9.492198916098461, + "learning_rate": 4.934155006640042e-05, + "loss": 2.1095, + "mean_token_accuracy": 0.43103447556495667, + "step": 122455 + }, + { + "epoch": 0.12334301262739955, + "grad_norm": 11.532721926617688, + "learning_rate": 4.934146007553298e-05, + "loss": 2.8655, + "mean_token_accuracy": 0.34482758343219755, + "step": 122460 + }, + { + "epoch": 0.12334804868050372, + "grad_norm": 9.475005782651309, + "learning_rate": 4.934137007860773e-05, + "loss": 2.2627, + "mean_token_accuracy": 0.41379311084747317, + "step": 122465 + }, + { + "epoch": 0.1233530847336079, + "grad_norm": 14.421185428687581, + "learning_rate": 4.934128007562469e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.4620689630508423, + "step": 122470 + }, + { + "epoch": 0.12335812078671207, + "grad_norm": 9.488506030382721, + "learning_rate": 4.934119006658388e-05, + "loss": 1.9484, + "mean_token_accuracy": 0.47931033968925474, + "step": 122475 + }, + { + "epoch": 0.12336315683981625, + "grad_norm": 11.20149929380574, + "learning_rate": 4.9341100051485354e-05, + "loss": 2.9896, + "mean_token_accuracy": 0.3241379290819168, + "step": 122480 + }, + { + "epoch": 0.12336819289292042, + "grad_norm": 10.523035609920209, + "learning_rate": 4.93410100303291e-05, + "loss": 2.3309, + "mean_token_accuracy": 0.41379310488700866, + "step": 122485 + }, + { + "epoch": 0.1233732289460246, + "grad_norm": 10.057987553972398, + "learning_rate": 4.9340920003115165e-05, + "loss": 2.0617, + "mean_token_accuracy": 0.4413793087005615, + "step": 122490 + }, + { + "epoch": 0.12337826499912877, + "grad_norm": 30.662806239973115, + "learning_rate": 4.9340829969843576e-05, + "loss": 2.5439, + "mean_token_accuracy": 0.4586206912994385, + "step": 122495 + }, + { + "epoch": 0.12338330105223294, + "grad_norm": 10.517830905827733, + "learning_rate": 4.9340739930514346e-05, + "loss": 2.6503, + "mean_token_accuracy": 0.37241379618644715, + "step": 122500 + }, + { + "epoch": 0.12338833710533711, + "grad_norm": 12.188794124120367, + "learning_rate": 4.934064988512751e-05, + "loss": 2.4389, + "mean_token_accuracy": 0.4551724135875702, + "step": 122505 + }, + { + "epoch": 0.12339337315844127, + "grad_norm": 14.063493944951889, + "learning_rate": 4.934055983368308e-05, + "loss": 2.9226, + "mean_token_accuracy": 0.42068966031074523, + "step": 122510 + }, + { + "epoch": 0.12339840921154545, + "grad_norm": 11.751157732192395, + "learning_rate": 4.9340469776181096e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.47931033968925474, + "step": 122515 + }, + { + "epoch": 0.12340344526464962, + "grad_norm": 14.59810801429352, + "learning_rate": 4.9340379712621573e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.3931034505367279, + "step": 122520 + }, + { + "epoch": 0.1234084813177538, + "grad_norm": 9.975519892782028, + "learning_rate": 4.9340289643004546e-05, + "loss": 2.3315, + "mean_token_accuracy": 0.4551724135875702, + "step": 122525 + }, + { + "epoch": 0.12341351737085797, + "grad_norm": 9.167926122192553, + "learning_rate": 4.9340199567330027e-05, + "loss": 1.7595, + "mean_token_accuracy": 0.517241370677948, + "step": 122530 + }, + { + "epoch": 0.12341855342396214, + "grad_norm": 10.63017394951189, + "learning_rate": 4.934010948559804e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.3517241358757019, + "step": 122535 + }, + { + "epoch": 0.12342358947706632, + "grad_norm": 10.82094683116335, + "learning_rate": 4.934001939780863e-05, + "loss": 2.2388, + "mean_token_accuracy": 0.4379310369491577, + "step": 122540 + }, + { + "epoch": 0.12342862553017049, + "grad_norm": 8.789235422813926, + "learning_rate": 4.93399293039618e-05, + "loss": 2.9268, + "mean_token_accuracy": 0.3776769578456879, + "step": 122545 + }, + { + "epoch": 0.12343366158327466, + "grad_norm": 9.11318163434481, + "learning_rate": 4.9339839204057594e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.3931034505367279, + "step": 122550 + }, + { + "epoch": 0.12343869763637884, + "grad_norm": 10.645606718158037, + "learning_rate": 4.933974909809602e-05, + "loss": 2.4794, + "mean_token_accuracy": 0.4034482777118683, + "step": 122555 + }, + { + "epoch": 0.12344373368948301, + "grad_norm": 12.97868220946382, + "learning_rate": 4.9339658986077115e-05, + "loss": 2.6555, + "mean_token_accuracy": 0.41034482717514037, + "step": 122560 + }, + { + "epoch": 0.12344876974258719, + "grad_norm": 11.268035981086955, + "learning_rate": 4.93395688680009e-05, + "loss": 2.6434, + "mean_token_accuracy": 0.3793103456497192, + "step": 122565 + }, + { + "epoch": 0.12345380579569136, + "grad_norm": 10.363461799977566, + "learning_rate": 4.933947874386739e-05, + "loss": 2.1606, + "mean_token_accuracy": 0.46551724076271056, + "step": 122570 + }, + { + "epoch": 0.12345884184879553, + "grad_norm": 9.597729992356948, + "learning_rate": 4.933938861367662e-05, + "loss": 2.5874, + "mean_token_accuracy": 0.43793103098869324, + "step": 122575 + }, + { + "epoch": 0.1234638779018997, + "grad_norm": 10.214094880742476, + "learning_rate": 4.933929847742862e-05, + "loss": 2.0039, + "mean_token_accuracy": 0.4551724135875702, + "step": 122580 + }, + { + "epoch": 0.12346891395500387, + "grad_norm": 21.97856787739723, + "learning_rate": 4.933920833512341e-05, + "loss": 2.4996, + "mean_token_accuracy": 0.4517241299152374, + "step": 122585 + }, + { + "epoch": 0.12347395000810804, + "grad_norm": 9.073023721521006, + "learning_rate": 4.9339118186761006e-05, + "loss": 2.0413, + "mean_token_accuracy": 0.4988505721092224, + "step": 122590 + }, + { + "epoch": 0.12347898606121221, + "grad_norm": 11.45995196448123, + "learning_rate": 4.9339028032341446e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.42758620381355283, + "step": 122595 + }, + { + "epoch": 0.12348402211431639, + "grad_norm": 10.166830509264564, + "learning_rate": 4.933893787186475e-05, + "loss": 2.0681, + "mean_token_accuracy": 0.4551724135875702, + "step": 122600 + }, + { + "epoch": 0.12348905816742056, + "grad_norm": 15.125531378775012, + "learning_rate": 4.933884770533094e-05, + "loss": 2.8791, + "mean_token_accuracy": 0.3551724135875702, + "step": 122605 + }, + { + "epoch": 0.12349409422052474, + "grad_norm": 11.559263634711028, + "learning_rate": 4.9338757532740056e-05, + "loss": 2.833, + "mean_token_accuracy": 0.358620685338974, + "step": 122610 + }, + { + "epoch": 0.12349913027362891, + "grad_norm": 8.377145186880623, + "learning_rate": 4.93386673540921e-05, + "loss": 2.0139, + "mean_token_accuracy": 0.5034482777118683, + "step": 122615 + }, + { + "epoch": 0.12350416632673308, + "grad_norm": 14.01875907962331, + "learning_rate": 4.9338577169387114e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.4620689630508423, + "step": 122620 + }, + { + "epoch": 0.12350920237983726, + "grad_norm": 9.439122293341352, + "learning_rate": 4.933848697862511e-05, + "loss": 2.6415, + "mean_token_accuracy": 0.3310344755649567, + "step": 122625 + }, + { + "epoch": 0.12351423843294143, + "grad_norm": 10.854702700968856, + "learning_rate": 4.933839678180612e-05, + "loss": 2.2741, + "mean_token_accuracy": 0.4413793087005615, + "step": 122630 + }, + { + "epoch": 0.1235192744860456, + "grad_norm": 11.606268337319495, + "learning_rate": 4.933830657893018e-05, + "loss": 2.6366, + "mean_token_accuracy": 0.3931034505367279, + "step": 122635 + }, + { + "epoch": 0.12352431053914978, + "grad_norm": 11.934936294638902, + "learning_rate": 4.9338216369997305e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.42758620381355283, + "step": 122640 + }, + { + "epoch": 0.12352934659225395, + "grad_norm": 10.6581479301926, + "learning_rate": 4.93381261550075e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.5034482717514038, + "step": 122645 + }, + { + "epoch": 0.12353438264535811, + "grad_norm": 10.98119753348063, + "learning_rate": 4.933803593396083e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4517241358757019, + "step": 122650 + }, + { + "epoch": 0.12353941869846229, + "grad_norm": 9.386481536184336, + "learning_rate": 4.9337945706857286e-05, + "loss": 2.2538, + "mean_token_accuracy": 0.44482758045196535, + "step": 122655 + }, + { + "epoch": 0.12354445475156646, + "grad_norm": 8.275189126747991, + "learning_rate": 4.933785547369691e-05, + "loss": 2.0559, + "mean_token_accuracy": 0.4758620738983154, + "step": 122660 + }, + { + "epoch": 0.12354949080467063, + "grad_norm": 12.947507515004895, + "learning_rate": 4.9337765234479724e-05, + "loss": 2.4047, + "mean_token_accuracy": 0.4068965554237366, + "step": 122665 + }, + { + "epoch": 0.12355452685777481, + "grad_norm": 11.591014604155436, + "learning_rate": 4.9337674989205756e-05, + "loss": 2.6539, + "mean_token_accuracy": 0.3793103456497192, + "step": 122670 + }, + { + "epoch": 0.12355956291087898, + "grad_norm": 11.831730717192517, + "learning_rate": 4.933758473787502e-05, + "loss": 2.614, + "mean_token_accuracy": 0.3517241358757019, + "step": 122675 + }, + { + "epoch": 0.12356459896398315, + "grad_norm": 8.255610433265508, + "learning_rate": 4.933749448048755e-05, + "loss": 2.0614, + "mean_token_accuracy": 0.5034482836723327, + "step": 122680 + }, + { + "epoch": 0.12356963501708733, + "grad_norm": 9.238959475696083, + "learning_rate": 4.933740421704337e-05, + "loss": 3.0759, + "mean_token_accuracy": 0.3758620619773865, + "step": 122685 + }, + { + "epoch": 0.1235746710701915, + "grad_norm": 9.687465701072114, + "learning_rate": 4.933731394754251e-05, + "loss": 2.4747, + "mean_token_accuracy": 0.40344826877117157, + "step": 122690 + }, + { + "epoch": 0.12357970712329568, + "grad_norm": 10.552977557162263, + "learning_rate": 4.933722367198499e-05, + "loss": 2.3345, + "mean_token_accuracy": 0.4367816150188446, + "step": 122695 + }, + { + "epoch": 0.12358474317639985, + "grad_norm": 9.247735368272822, + "learning_rate": 4.9337133390370823e-05, + "loss": 2.3351, + "mean_token_accuracy": 0.43448275327682495, + "step": 122700 + }, + { + "epoch": 0.12358977922950402, + "grad_norm": 9.510902667002675, + "learning_rate": 4.933704310270005e-05, + "loss": 2.187, + "mean_token_accuracy": 0.458620685338974, + "step": 122705 + }, + { + "epoch": 0.1235948152826082, + "grad_norm": 9.1942552925599, + "learning_rate": 4.9336952808972695e-05, + "loss": 2.1377, + "mean_token_accuracy": 0.5034482717514038, + "step": 122710 + }, + { + "epoch": 0.12359985133571237, + "grad_norm": 12.30740033421167, + "learning_rate": 4.933686250918879e-05, + "loss": 2.0976, + "mean_token_accuracy": 0.5, + "step": 122715 + }, + { + "epoch": 0.12360488738881653, + "grad_norm": 10.833060155889019, + "learning_rate": 4.933677220334833e-05, + "loss": 2.1524, + "mean_token_accuracy": 0.441379314661026, + "step": 122720 + }, + { + "epoch": 0.1236099234419207, + "grad_norm": 9.470699214866597, + "learning_rate": 4.9336681891451375e-05, + "loss": 2.2198, + "mean_token_accuracy": 0.4413793087005615, + "step": 122725 + }, + { + "epoch": 0.12361495949502488, + "grad_norm": 11.043474440367223, + "learning_rate": 4.933659157349793e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.38112522959709166, + "step": 122730 + }, + { + "epoch": 0.12361999554812905, + "grad_norm": 11.205923135653073, + "learning_rate": 4.933650124948801e-05, + "loss": 2.5146, + "mean_token_accuracy": 0.3965517282485962, + "step": 122735 + }, + { + "epoch": 0.12362503160123323, + "grad_norm": 13.127696901634849, + "learning_rate": 4.9336410919421675e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.475862056016922, + "step": 122740 + }, + { + "epoch": 0.1236300676543374, + "grad_norm": 12.189552039999455, + "learning_rate": 4.933632058329893e-05, + "loss": 2.5459, + "mean_token_accuracy": 0.4344827592372894, + "step": 122745 + }, + { + "epoch": 0.12363510370744157, + "grad_norm": 11.750928280873806, + "learning_rate": 4.933623024111979e-05, + "loss": 2.7862, + "mean_token_accuracy": 0.36896551847457887, + "step": 122750 + }, + { + "epoch": 0.12364013976054575, + "grad_norm": 11.644090559075515, + "learning_rate": 4.9336139892884294e-05, + "loss": 2.2513, + "mean_token_accuracy": 0.4655172348022461, + "step": 122755 + }, + { + "epoch": 0.12364517581364992, + "grad_norm": 10.814084565507269, + "learning_rate": 4.9336049538592456e-05, + "loss": 2.3283, + "mean_token_accuracy": 0.4620689630508423, + "step": 122760 + }, + { + "epoch": 0.1236502118667541, + "grad_norm": 12.056941159123008, + "learning_rate": 4.933595917824431e-05, + "loss": 2.4188, + "mean_token_accuracy": 0.4517241418361664, + "step": 122765 + }, + { + "epoch": 0.12365524791985827, + "grad_norm": 8.035789403161033, + "learning_rate": 4.933586881183989e-05, + "loss": 2.117, + "mean_token_accuracy": 0.47931034564971925, + "step": 122770 + }, + { + "epoch": 0.12366028397296244, + "grad_norm": 9.921849775718766, + "learning_rate": 4.93357784393792e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.43448275327682495, + "step": 122775 + }, + { + "epoch": 0.12366532002606662, + "grad_norm": 12.974777475857632, + "learning_rate": 4.933568806086228e-05, + "loss": 2.6382, + "mean_token_accuracy": 0.3931034505367279, + "step": 122780 + }, + { + "epoch": 0.12367035607917079, + "grad_norm": 9.115591857408452, + "learning_rate": 4.933559767628915e-05, + "loss": 2.1798, + "mean_token_accuracy": 0.47586206197738645, + "step": 122785 + }, + { + "epoch": 0.12367539213227495, + "grad_norm": 12.00099751559129, + "learning_rate": 4.9335507285659844e-05, + "loss": 2.3544, + "mean_token_accuracy": 0.4068965494632721, + "step": 122790 + }, + { + "epoch": 0.12368042818537912, + "grad_norm": 12.223195474719459, + "learning_rate": 4.933541688897436e-05, + "loss": 2.5474, + "mean_token_accuracy": 0.4114531993865967, + "step": 122795 + }, + { + "epoch": 0.1236854642384833, + "grad_norm": 8.883663320839975, + "learning_rate": 4.933532648623276e-05, + "loss": 1.9578, + "mean_token_accuracy": 0.4793103575706482, + "step": 122800 + }, + { + "epoch": 0.12369050029158747, + "grad_norm": 9.001479448267245, + "learning_rate": 4.933523607743504e-05, + "loss": 2.163, + "mean_token_accuracy": 0.4517241358757019, + "step": 122805 + }, + { + "epoch": 0.12369553634469165, + "grad_norm": 10.619966449266244, + "learning_rate": 4.933514566258124e-05, + "loss": 2.3223, + "mean_token_accuracy": 0.38965516686439516, + "step": 122810 + }, + { + "epoch": 0.12370057239779582, + "grad_norm": 9.622159652964532, + "learning_rate": 4.933505524167138e-05, + "loss": 2.0597, + "mean_token_accuracy": 0.4517241299152374, + "step": 122815 + }, + { + "epoch": 0.12370560845089999, + "grad_norm": 14.221265806155078, + "learning_rate": 4.9334964814705483e-05, + "loss": 2.8977, + "mean_token_accuracy": 0.3896551728248596, + "step": 122820 + }, + { + "epoch": 0.12371064450400417, + "grad_norm": 10.585106992767665, + "learning_rate": 4.933487438168358e-05, + "loss": 2.8013, + "mean_token_accuracy": 0.4034482777118683, + "step": 122825 + }, + { + "epoch": 0.12371568055710834, + "grad_norm": 9.244865405063791, + "learning_rate": 4.933478394260569e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.43103448748588563, + "step": 122830 + }, + { + "epoch": 0.12372071661021251, + "grad_norm": 8.728101480524408, + "learning_rate": 4.933469349747184e-05, + "loss": 2.0607, + "mean_token_accuracy": 0.4862068951129913, + "step": 122835 + }, + { + "epoch": 0.12372575266331669, + "grad_norm": 11.918679743896949, + "learning_rate": 4.933460304628207e-05, + "loss": 2.4647, + "mean_token_accuracy": 0.42413792610168455, + "step": 122840 + }, + { + "epoch": 0.12373078871642086, + "grad_norm": 14.893946559527697, + "learning_rate": 4.9334512589036374e-05, + "loss": 2.8687, + "mean_token_accuracy": 0.3896551728248596, + "step": 122845 + }, + { + "epoch": 0.12373582476952504, + "grad_norm": 13.07274882812411, + "learning_rate": 4.9334422125734804e-05, + "loss": 2.6378, + "mean_token_accuracy": 0.3862069010734558, + "step": 122850 + }, + { + "epoch": 0.12374086082262921, + "grad_norm": 9.309205331551194, + "learning_rate": 4.933433165637737e-05, + "loss": 2.2724, + "mean_token_accuracy": 0.43793103098869324, + "step": 122855 + }, + { + "epoch": 0.12374589687573337, + "grad_norm": 9.830650399023668, + "learning_rate": 4.933424118096411e-05, + "loss": 2.5416, + "mean_token_accuracy": 0.425952810049057, + "step": 122860 + }, + { + "epoch": 0.12375093292883754, + "grad_norm": 11.682765027821395, + "learning_rate": 4.9334150699495044e-05, + "loss": 2.6916, + "mean_token_accuracy": 0.3379310369491577, + "step": 122865 + }, + { + "epoch": 0.12375596898194172, + "grad_norm": 12.274278011156381, + "learning_rate": 4.933406021197018e-05, + "loss": 2.666, + "mean_token_accuracy": 0.4379310369491577, + "step": 122870 + }, + { + "epoch": 0.12376100503504589, + "grad_norm": 9.517303409525839, + "learning_rate": 4.933396971838957e-05, + "loss": 1.9496, + "mean_token_accuracy": 0.5034482777118683, + "step": 122875 + }, + { + "epoch": 0.12376604108815006, + "grad_norm": 11.183388915112685, + "learning_rate": 4.933387921875322e-05, + "loss": 2.171, + "mean_token_accuracy": 0.4137930989265442, + "step": 122880 + }, + { + "epoch": 0.12377107714125424, + "grad_norm": 10.102851830475673, + "learning_rate": 4.933378871306116e-05, + "loss": 2.4229, + "mean_token_accuracy": 0.4068965494632721, + "step": 122885 + }, + { + "epoch": 0.12377611319435841, + "grad_norm": 10.549116762332543, + "learning_rate": 4.9333698201313426e-05, + "loss": 2.6748, + "mean_token_accuracy": 0.3655172407627106, + "step": 122890 + }, + { + "epoch": 0.12378114924746259, + "grad_norm": 10.412924194872431, + "learning_rate": 4.933360768351003e-05, + "loss": 2.6036, + "mean_token_accuracy": 0.4517241418361664, + "step": 122895 + }, + { + "epoch": 0.12378618530056676, + "grad_norm": 9.597784032188406, + "learning_rate": 4.9333517159651004e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.46896551847457885, + "step": 122900 + }, + { + "epoch": 0.12379122135367093, + "grad_norm": 9.62977868901446, + "learning_rate": 4.933342662973637e-05, + "loss": 2.1076, + "mean_token_accuracy": 0.482758629322052, + "step": 122905 + }, + { + "epoch": 0.1237962574067751, + "grad_norm": 14.18589321538855, + "learning_rate": 4.933333609376616e-05, + "loss": 2.6625, + "mean_token_accuracy": 0.3862068891525269, + "step": 122910 + }, + { + "epoch": 0.12380129345987928, + "grad_norm": 10.620738254493059, + "learning_rate": 4.933324555174038e-05, + "loss": 2.3512, + "mean_token_accuracy": 0.4206896543502808, + "step": 122915 + }, + { + "epoch": 0.12380632951298345, + "grad_norm": 12.553274848792418, + "learning_rate": 4.9333155003659075e-05, + "loss": 2.7671, + "mean_token_accuracy": 0.38620689511299133, + "step": 122920 + }, + { + "epoch": 0.12381136556608763, + "grad_norm": 11.177973919066197, + "learning_rate": 4.933306444952227e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.4137930989265442, + "step": 122925 + }, + { + "epoch": 0.12381640161919179, + "grad_norm": 15.734728497837217, + "learning_rate": 4.933297388932997e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4137930989265442, + "step": 122930 + }, + { + "epoch": 0.12382143767229596, + "grad_norm": 18.686827346883888, + "learning_rate": 4.9332883323082215e-05, + "loss": 2.2393, + "mean_token_accuracy": 0.4206896543502808, + "step": 122935 + }, + { + "epoch": 0.12382647372540014, + "grad_norm": 11.696466187630268, + "learning_rate": 4.933279275077903e-05, + "loss": 2.4513, + "mean_token_accuracy": 0.42413793206214906, + "step": 122940 + }, + { + "epoch": 0.12383150977850431, + "grad_norm": 14.387234670852038, + "learning_rate": 4.933270217242044e-05, + "loss": 2.0492, + "mean_token_accuracy": 0.4758620738983154, + "step": 122945 + }, + { + "epoch": 0.12383654583160848, + "grad_norm": 9.64707468667973, + "learning_rate": 4.933261158800647e-05, + "loss": 2.4413, + "mean_token_accuracy": 0.39655172228813174, + "step": 122950 + }, + { + "epoch": 0.12384158188471266, + "grad_norm": 12.283068189896253, + "learning_rate": 4.9332520997537147e-05, + "loss": 2.5172, + "mean_token_accuracy": 0.4034482717514038, + "step": 122955 + }, + { + "epoch": 0.12384661793781683, + "grad_norm": 9.61040748359341, + "learning_rate": 4.933243040101248e-05, + "loss": 2.8036, + "mean_token_accuracy": 0.4034482717514038, + "step": 122960 + }, + { + "epoch": 0.123851653990921, + "grad_norm": 9.911546105077123, + "learning_rate": 4.933233979843252e-05, + "loss": 2.1647, + "mean_token_accuracy": 0.45862067937850953, + "step": 122965 + }, + { + "epoch": 0.12385669004402518, + "grad_norm": 10.78625900590272, + "learning_rate": 4.933224918979727e-05, + "loss": 2.1092, + "mean_token_accuracy": 0.4551724135875702, + "step": 122970 + }, + { + "epoch": 0.12386172609712935, + "grad_norm": 10.395474681117163, + "learning_rate": 4.933215857510677e-05, + "loss": 2.5441, + "mean_token_accuracy": 0.4172413766384125, + "step": 122975 + }, + { + "epoch": 0.12386676215023353, + "grad_norm": 8.471404191387302, + "learning_rate": 4.933206795436104e-05, + "loss": 2.1045, + "mean_token_accuracy": 0.4620689630508423, + "step": 122980 + }, + { + "epoch": 0.1238717982033377, + "grad_norm": 10.138903641802825, + "learning_rate": 4.93319773275601e-05, + "loss": 2.6696, + "mean_token_accuracy": 0.34827586114406583, + "step": 122985 + }, + { + "epoch": 0.12387683425644187, + "grad_norm": 9.879582244721448, + "learning_rate": 4.933188669470399e-05, + "loss": 2.0361, + "mean_token_accuracy": 0.5137930989265442, + "step": 122990 + }, + { + "epoch": 0.12388187030954605, + "grad_norm": 9.295481233671834, + "learning_rate": 4.933179605579271e-05, + "loss": 2.3187, + "mean_token_accuracy": 0.42758620977401735, + "step": 122995 + }, + { + "epoch": 0.1238869063626502, + "grad_norm": 11.421037056117518, + "learning_rate": 4.933170541082631e-05, + "loss": 2.6755, + "mean_token_accuracy": 0.3482758641242981, + "step": 123000 + }, + { + "epoch": 0.12389194241575438, + "grad_norm": 10.437684618484582, + "learning_rate": 4.93316147598048e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.4413793087005615, + "step": 123005 + }, + { + "epoch": 0.12389697846885855, + "grad_norm": 10.476839742944929, + "learning_rate": 4.933152410272821e-05, + "loss": 2.7695, + "mean_token_accuracy": 0.41034482717514037, + "step": 123010 + }, + { + "epoch": 0.12390201452196273, + "grad_norm": 10.838014396490431, + "learning_rate": 4.933143343959657e-05, + "loss": 2.1852, + "mean_token_accuracy": 0.4379310369491577, + "step": 123015 + }, + { + "epoch": 0.1239070505750669, + "grad_norm": 11.5087287255968, + "learning_rate": 4.9331342770409894e-05, + "loss": 2.3872, + "mean_token_accuracy": 0.4068965494632721, + "step": 123020 + }, + { + "epoch": 0.12391208662817108, + "grad_norm": 9.389222967909676, + "learning_rate": 4.933125209516822e-05, + "loss": 2.7633, + "mean_token_accuracy": 0.408771938085556, + "step": 123025 + }, + { + "epoch": 0.12391712268127525, + "grad_norm": 10.308981962959685, + "learning_rate": 4.933116141387156e-05, + "loss": 2.3194, + "mean_token_accuracy": 0.42413793206214906, + "step": 123030 + }, + { + "epoch": 0.12392215873437942, + "grad_norm": 9.561279221556552, + "learning_rate": 4.9331070726519956e-05, + "loss": 2.9644, + "mean_token_accuracy": 0.43448275327682495, + "step": 123035 + }, + { + "epoch": 0.1239271947874836, + "grad_norm": 10.0573677422884, + "learning_rate": 4.933098003311341e-05, + "loss": 2.4854, + "mean_token_accuracy": 0.42413793206214906, + "step": 123040 + }, + { + "epoch": 0.12393223084058777, + "grad_norm": 10.431477206753401, + "learning_rate": 4.933088933365197e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.37241379022598264, + "step": 123045 + }, + { + "epoch": 0.12393726689369194, + "grad_norm": 10.867068217849392, + "learning_rate": 4.933079862813565e-05, + "loss": 2.0881, + "mean_token_accuracy": 0.5172413647174835, + "step": 123050 + }, + { + "epoch": 0.12394230294679612, + "grad_norm": 9.542101754828474, + "learning_rate": 4.933070791656448e-05, + "loss": 2.2698, + "mean_token_accuracy": 0.47931033968925474, + "step": 123055 + }, + { + "epoch": 0.12394733899990029, + "grad_norm": 9.421025680142774, + "learning_rate": 4.933061719893848e-05, + "loss": 2.5452, + "mean_token_accuracy": 0.44137930274009707, + "step": 123060 + }, + { + "epoch": 0.12395237505300447, + "grad_norm": 11.830683477115663, + "learning_rate": 4.9330526475257674e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.3655172407627106, + "step": 123065 + }, + { + "epoch": 0.12395741110610863, + "grad_norm": 8.705518853903019, + "learning_rate": 4.9330435745522086e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.4517241358757019, + "step": 123070 + }, + { + "epoch": 0.1239624471592128, + "grad_norm": 9.681590034965595, + "learning_rate": 4.933034500973175e-05, + "loss": 2.1431, + "mean_token_accuracy": 0.4655172348022461, + "step": 123075 + }, + { + "epoch": 0.12396748321231697, + "grad_norm": 11.509248704375837, + "learning_rate": 4.9330254267886684e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.4206896543502808, + "step": 123080 + }, + { + "epoch": 0.12397251926542115, + "grad_norm": 10.983950387277721, + "learning_rate": 4.9330163519986916e-05, + "loss": 2.4705, + "mean_token_accuracy": 0.4586206912994385, + "step": 123085 + }, + { + "epoch": 0.12397755531852532, + "grad_norm": 9.778208279643033, + "learning_rate": 4.933007276603247e-05, + "loss": 2.3174, + "mean_token_accuracy": 0.4103448212146759, + "step": 123090 + }, + { + "epoch": 0.1239825913716295, + "grad_norm": 12.598905892946542, + "learning_rate": 4.9329982006023374e-05, + "loss": 2.2357, + "mean_token_accuracy": 0.4137930989265442, + "step": 123095 + }, + { + "epoch": 0.12398762742473367, + "grad_norm": 12.948293171149169, + "learning_rate": 4.9329891239959656e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.4931034445762634, + "step": 123100 + }, + { + "epoch": 0.12399266347783784, + "grad_norm": 12.287551972168124, + "learning_rate": 4.932980046784133e-05, + "loss": 2.1795, + "mean_token_accuracy": 0.4608590483665466, + "step": 123105 + }, + { + "epoch": 0.12399769953094202, + "grad_norm": 10.708187140126693, + "learning_rate": 4.932970968966842e-05, + "loss": 2.518, + "mean_token_accuracy": 0.3999999940395355, + "step": 123110 + }, + { + "epoch": 0.12400273558404619, + "grad_norm": 11.383412162028849, + "learning_rate": 4.932961890544097e-05, + "loss": 2.5119, + "mean_token_accuracy": 0.39655172228813174, + "step": 123115 + }, + { + "epoch": 0.12400777163715036, + "grad_norm": 10.48135826335423, + "learning_rate": 4.9329528115158994e-05, + "loss": 2.4048, + "mean_token_accuracy": 0.4275862157344818, + "step": 123120 + }, + { + "epoch": 0.12401280769025454, + "grad_norm": 11.668377404142335, + "learning_rate": 4.932943731882252e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.4620689690113068, + "step": 123125 + }, + { + "epoch": 0.12401784374335871, + "grad_norm": 18.12678406395544, + "learning_rate": 4.9329346516431564e-05, + "loss": 2.7186, + "mean_token_accuracy": 0.334482753276825, + "step": 123130 + }, + { + "epoch": 0.12402287979646288, + "grad_norm": 10.52793786279384, + "learning_rate": 4.932925570798615e-05, + "loss": 2.1032, + "mean_token_accuracy": 0.44827585816383364, + "step": 123135 + }, + { + "epoch": 0.12402791584956704, + "grad_norm": 12.860449506042098, + "learning_rate": 4.932916489348632e-05, + "loss": 2.5461, + "mean_token_accuracy": 0.38275861740112305, + "step": 123140 + }, + { + "epoch": 0.12403295190267122, + "grad_norm": 10.48567017880704, + "learning_rate": 4.932907407293209e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.47931034564971925, + "step": 123145 + }, + { + "epoch": 0.12403798795577539, + "grad_norm": 10.094704907297531, + "learning_rate": 4.932898324632348e-05, + "loss": 2.1186, + "mean_token_accuracy": 0.5137930929660797, + "step": 123150 + }, + { + "epoch": 0.12404302400887957, + "grad_norm": 11.368293399874839, + "learning_rate": 4.932889241366052e-05, + "loss": 2.3863, + "mean_token_accuracy": 0.4172413766384125, + "step": 123155 + }, + { + "epoch": 0.12404806006198374, + "grad_norm": 9.861717710956027, + "learning_rate": 4.932880157494324e-05, + "loss": 2.1843, + "mean_token_accuracy": 0.4, + "step": 123160 + }, + { + "epoch": 0.12405309611508791, + "grad_norm": 11.714800578611175, + "learning_rate": 4.9328710730171655e-05, + "loss": 2.5869, + "mean_token_accuracy": 0.4069570451974869, + "step": 123165 + }, + { + "epoch": 0.12405813216819209, + "grad_norm": 10.010907623165348, + "learning_rate": 4.9328619879345797e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.38275861740112305, + "step": 123170 + }, + { + "epoch": 0.12406316822129626, + "grad_norm": 16.96044715898209, + "learning_rate": 4.93285290224657e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4551724135875702, + "step": 123175 + }, + { + "epoch": 0.12406820427440043, + "grad_norm": 10.72564551454885, + "learning_rate": 4.932843815953136e-05, + "loss": 2.5379, + "mean_token_accuracy": 0.42413792610168455, + "step": 123180 + }, + { + "epoch": 0.12407324032750461, + "grad_norm": 12.05064641159864, + "learning_rate": 4.932834729054284e-05, + "loss": 2.3526, + "mean_token_accuracy": 0.39999999701976774, + "step": 123185 + }, + { + "epoch": 0.12407827638060878, + "grad_norm": 9.475588833538952, + "learning_rate": 4.932825641550013e-05, + "loss": 2.034, + "mean_token_accuracy": 0.5, + "step": 123190 + }, + { + "epoch": 0.12408331243371296, + "grad_norm": 10.048130366180112, + "learning_rate": 4.932816553440329e-05, + "loss": 2.758, + "mean_token_accuracy": 0.36896551549434664, + "step": 123195 + }, + { + "epoch": 0.12408834848681713, + "grad_norm": 21.957440087304068, + "learning_rate": 4.9328074647252314e-05, + "loss": 2.641, + "mean_token_accuracy": 0.43436177372932433, + "step": 123200 + }, + { + "epoch": 0.1240933845399213, + "grad_norm": 9.74255369131871, + "learning_rate": 4.932798375404724e-05, + "loss": 2.1832, + "mean_token_accuracy": 0.47241379618644713, + "step": 123205 + }, + { + "epoch": 0.12409842059302546, + "grad_norm": 13.61766829837327, + "learning_rate": 4.9327892854788094e-05, + "loss": 2.3567, + "mean_token_accuracy": 0.4103448212146759, + "step": 123210 + }, + { + "epoch": 0.12410345664612964, + "grad_norm": 14.615082523035545, + "learning_rate": 4.93278019494749e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.38965516686439516, + "step": 123215 + }, + { + "epoch": 0.12410849269923381, + "grad_norm": 8.428534949801909, + "learning_rate": 4.9327711038107696e-05, + "loss": 2.0244, + "mean_token_accuracy": 0.48374384045600893, + "step": 123220 + }, + { + "epoch": 0.12411352875233798, + "grad_norm": 10.762917444876706, + "learning_rate": 4.932762012068648e-05, + "loss": 2.419, + "mean_token_accuracy": 0.4517241418361664, + "step": 123225 + }, + { + "epoch": 0.12411856480544216, + "grad_norm": 10.747234994252992, + "learning_rate": 4.9327529197211304e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.4087114453315735, + "step": 123230 + }, + { + "epoch": 0.12412360085854633, + "grad_norm": 9.57360757696649, + "learning_rate": 4.9327438267682166e-05, + "loss": 2.1839, + "mean_token_accuracy": 0.4620689630508423, + "step": 123235 + }, + { + "epoch": 0.1241286369116505, + "grad_norm": 8.406422987196738, + "learning_rate": 4.932734733209912e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.43103448748588563, + "step": 123240 + }, + { + "epoch": 0.12413367296475468, + "grad_norm": 12.941132001325842, + "learning_rate": 4.9327256390462176e-05, + "loss": 2.7547, + "mean_token_accuracy": 0.38965516686439516, + "step": 123245 + }, + { + "epoch": 0.12413870901785885, + "grad_norm": 9.752449969786012, + "learning_rate": 4.932716544277135e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.42413793206214906, + "step": 123250 + }, + { + "epoch": 0.12414374507096303, + "grad_norm": 13.615187740349615, + "learning_rate": 4.9327074489026686e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.458620685338974, + "step": 123255 + }, + { + "epoch": 0.1241487811240672, + "grad_norm": 10.040319267060978, + "learning_rate": 4.93269835292282e-05, + "loss": 2.373, + "mean_token_accuracy": 0.43103447556495667, + "step": 123260 + }, + { + "epoch": 0.12415381717717137, + "grad_norm": 10.079910092066099, + "learning_rate": 4.932689256337592e-05, + "loss": 2.6837, + "mean_token_accuracy": 0.36896551251411436, + "step": 123265 + }, + { + "epoch": 0.12415885323027555, + "grad_norm": 10.38608072450349, + "learning_rate": 4.932680159146987e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.4570477962493896, + "step": 123270 + }, + { + "epoch": 0.12416388928337972, + "grad_norm": 10.729621282416062, + "learning_rate": 4.932671061351008e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.42068966031074523, + "step": 123275 + }, + { + "epoch": 0.12416892533648388, + "grad_norm": 11.276463566834492, + "learning_rate": 4.9326619629496565e-05, + "loss": 2.602, + "mean_token_accuracy": 0.3965517282485962, + "step": 123280 + }, + { + "epoch": 0.12417396138958806, + "grad_norm": 10.133533547966623, + "learning_rate": 4.932652863942936e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.4172413766384125, + "step": 123285 + }, + { + "epoch": 0.12417899744269223, + "grad_norm": 13.013234062376828, + "learning_rate": 4.932643764330848e-05, + "loss": 2.6983, + "mean_token_accuracy": 0.3655172407627106, + "step": 123290 + }, + { + "epoch": 0.1241840334957964, + "grad_norm": 11.833425017667839, + "learning_rate": 4.932634664113396e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.4344827651977539, + "step": 123295 + }, + { + "epoch": 0.12418906954890058, + "grad_norm": 11.757532075886633, + "learning_rate": 4.932625563290582e-05, + "loss": 2.8312, + "mean_token_accuracy": 0.3862068891525269, + "step": 123300 + }, + { + "epoch": 0.12419410560200475, + "grad_norm": 11.451683336274522, + "learning_rate": 4.9326164618624086e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.4586206942796707, + "step": 123305 + }, + { + "epoch": 0.12419914165510892, + "grad_norm": 11.259269763179793, + "learning_rate": 4.932607359828879e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.40689654350280763, + "step": 123310 + }, + { + "epoch": 0.1242041777082131, + "grad_norm": 11.918246905982889, + "learning_rate": 4.932598257189994e-05, + "loss": 2.6922, + "mean_token_accuracy": 0.36896551847457887, + "step": 123315 + }, + { + "epoch": 0.12420921376131727, + "grad_norm": 10.53536661795408, + "learning_rate": 4.9325891539457575e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.4344827592372894, + "step": 123320 + }, + { + "epoch": 0.12421424981442145, + "grad_norm": 8.986325722805827, + "learning_rate": 4.932580050096172e-05, + "loss": 2.5828, + "mean_token_accuracy": 0.417241370677948, + "step": 123325 + }, + { + "epoch": 0.12421928586752562, + "grad_norm": 9.110786002475594, + "learning_rate": 4.9325709456412407e-05, + "loss": 2.6606, + "mean_token_accuracy": 0.4310344815254211, + "step": 123330 + }, + { + "epoch": 0.1242243219206298, + "grad_norm": 19.228952943598653, + "learning_rate": 4.932561840580964e-05, + "loss": 2.6777, + "mean_token_accuracy": 0.4344827651977539, + "step": 123335 + }, + { + "epoch": 0.12422935797373397, + "grad_norm": 10.83522742378952, + "learning_rate": 4.932552734915345e-05, + "loss": 2.6914, + "mean_token_accuracy": 0.3931034505367279, + "step": 123340 + }, + { + "epoch": 0.12423439402683814, + "grad_norm": 10.136030393349158, + "learning_rate": 4.932543628644388e-05, + "loss": 2.1136, + "mean_token_accuracy": 0.46049606800079346, + "step": 123345 + }, + { + "epoch": 0.1242394300799423, + "grad_norm": 9.226142623918228, + "learning_rate": 4.9325345217680945e-05, + "loss": 2.0239, + "mean_token_accuracy": 0.4603750824928284, + "step": 123350 + }, + { + "epoch": 0.12424446613304647, + "grad_norm": 12.026389118068465, + "learning_rate": 4.932525414286467e-05, + "loss": 2.127, + "mean_token_accuracy": 0.4851784646511078, + "step": 123355 + }, + { + "epoch": 0.12424950218615065, + "grad_norm": 10.800474833113938, + "learning_rate": 4.932516306199507e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.441379314661026, + "step": 123360 + }, + { + "epoch": 0.12425453823925482, + "grad_norm": 11.772811955549438, + "learning_rate": 4.932507197507219e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.4206896543502808, + "step": 123365 + }, + { + "epoch": 0.124259574292359, + "grad_norm": 12.381118087819438, + "learning_rate": 4.932498088209604e-05, + "loss": 2.582, + "mean_token_accuracy": 0.42758620381355283, + "step": 123370 + }, + { + "epoch": 0.12426461034546317, + "grad_norm": 23.817773174982808, + "learning_rate": 4.932488978306665e-05, + "loss": 2.4687, + "mean_token_accuracy": 0.4310344815254211, + "step": 123375 + }, + { + "epoch": 0.12426964639856734, + "grad_norm": 10.785328934366714, + "learning_rate": 4.932479867798404e-05, + "loss": 2.3527, + "mean_token_accuracy": 0.44482758045196535, + "step": 123380 + }, + { + "epoch": 0.12427468245167152, + "grad_norm": 7.588134819157633, + "learning_rate": 4.932470756684825e-05, + "loss": 2.3163, + "mean_token_accuracy": 0.45583788156509397, + "step": 123385 + }, + { + "epoch": 0.12427971850477569, + "grad_norm": 9.970622714855644, + "learning_rate": 4.932461644965929e-05, + "loss": 2.4369, + "mean_token_accuracy": 0.4172413766384125, + "step": 123390 + }, + { + "epoch": 0.12428475455787986, + "grad_norm": 10.703460641311072, + "learning_rate": 4.932452532641719e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.42413793206214906, + "step": 123395 + }, + { + "epoch": 0.12428979061098404, + "grad_norm": 11.057351957639316, + "learning_rate": 4.932443419712198e-05, + "loss": 2.4941, + "mean_token_accuracy": 0.4344827592372894, + "step": 123400 + }, + { + "epoch": 0.12429482666408821, + "grad_norm": 12.06777601976704, + "learning_rate": 4.932434306177368e-05, + "loss": 2.5108, + "mean_token_accuracy": 0.44482758045196535, + "step": 123405 + }, + { + "epoch": 0.12429986271719239, + "grad_norm": 9.745327129200179, + "learning_rate": 4.932425192037232e-05, + "loss": 2.5553, + "mean_token_accuracy": 0.4323048949241638, + "step": 123410 + }, + { + "epoch": 0.12430489877029656, + "grad_norm": 12.494454986223051, + "learning_rate": 4.932416077291792e-05, + "loss": 2.7016, + "mean_token_accuracy": 0.3793103516101837, + "step": 123415 + }, + { + "epoch": 0.12430993482340072, + "grad_norm": 10.671681937722541, + "learning_rate": 4.9324069619410515e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.42413792610168455, + "step": 123420 + }, + { + "epoch": 0.1243149708765049, + "grad_norm": 9.71090263272129, + "learning_rate": 4.932397845985011e-05, + "loss": 2.7505, + "mean_token_accuracy": 0.3999999940395355, + "step": 123425 + }, + { + "epoch": 0.12432000692960907, + "grad_norm": 9.547100984754968, + "learning_rate": 4.932388729423675e-05, + "loss": 2.267, + "mean_token_accuracy": 0.4413793087005615, + "step": 123430 + }, + { + "epoch": 0.12432504298271324, + "grad_norm": 28.65387918051316, + "learning_rate": 4.932379612257045e-05, + "loss": 2.7288, + "mean_token_accuracy": 0.41034482717514037, + "step": 123435 + }, + { + "epoch": 0.12433007903581741, + "grad_norm": 9.559524436937783, + "learning_rate": 4.932370494485125e-05, + "loss": 2.3979, + "mean_token_accuracy": 0.3862069010734558, + "step": 123440 + }, + { + "epoch": 0.12433511508892159, + "grad_norm": 8.859240559581517, + "learning_rate": 4.932361376107915e-05, + "loss": 1.9736, + "mean_token_accuracy": 0.5124016880989075, + "step": 123445 + }, + { + "epoch": 0.12434015114202576, + "grad_norm": 13.439340088597277, + "learning_rate": 4.9323522571254195e-05, + "loss": 2.5032, + "mean_token_accuracy": 0.41724138259887694, + "step": 123450 + }, + { + "epoch": 0.12434518719512994, + "grad_norm": 14.494201727076739, + "learning_rate": 4.932343137537641e-05, + "loss": 2.0793, + "mean_token_accuracy": 0.49999999403953554, + "step": 123455 + }, + { + "epoch": 0.12435022324823411, + "grad_norm": 13.525405972438469, + "learning_rate": 4.9323340173445804e-05, + "loss": 2.6097, + "mean_token_accuracy": 0.41724138259887694, + "step": 123460 + }, + { + "epoch": 0.12435525930133828, + "grad_norm": 10.659670098934646, + "learning_rate": 4.932324896546242e-05, + "loss": 2.1852, + "mean_token_accuracy": 0.47586206793785096, + "step": 123465 + }, + { + "epoch": 0.12436029535444246, + "grad_norm": 13.545564239805788, + "learning_rate": 4.9323157751426277e-05, + "loss": 2.5635, + "mean_token_accuracy": 0.40344826579093934, + "step": 123470 + }, + { + "epoch": 0.12436533140754663, + "grad_norm": 12.404792712341317, + "learning_rate": 4.9323066531337394e-05, + "loss": 2.5232, + "mean_token_accuracy": 0.42068966031074523, + "step": 123475 + }, + { + "epoch": 0.1243703674606508, + "grad_norm": 11.971366749414326, + "learning_rate": 4.9322975305195803e-05, + "loss": 2.4219, + "mean_token_accuracy": 0.47241378426551817, + "step": 123480 + }, + { + "epoch": 0.12437540351375498, + "grad_norm": 10.491526317177453, + "learning_rate": 4.932288407300153e-05, + "loss": 2.0759, + "mean_token_accuracy": 0.4793103516101837, + "step": 123485 + }, + { + "epoch": 0.12438043956685914, + "grad_norm": 14.121802508591614, + "learning_rate": 4.932279283475461e-05, + "loss": 2.8156, + "mean_token_accuracy": 0.3862068891525269, + "step": 123490 + }, + { + "epoch": 0.12438547561996331, + "grad_norm": 8.981840407513966, + "learning_rate": 4.932270159045504e-05, + "loss": 2.1874, + "mean_token_accuracy": 0.4379310369491577, + "step": 123495 + }, + { + "epoch": 0.12439051167306749, + "grad_norm": 10.878311789490034, + "learning_rate": 4.932261034010287e-05, + "loss": 2.2241, + "mean_token_accuracy": 0.4310344815254211, + "step": 123500 + }, + { + "epoch": 0.12439554772617166, + "grad_norm": 14.448846822573158, + "learning_rate": 4.932251908369812e-05, + "loss": 2.5753, + "mean_token_accuracy": 0.41379310488700866, + "step": 123505 + }, + { + "epoch": 0.12440058377927583, + "grad_norm": 9.363857420735751, + "learning_rate": 4.932242782124081e-05, + "loss": 2.1954, + "mean_token_accuracy": 0.42758620977401735, + "step": 123510 + }, + { + "epoch": 0.12440561983238001, + "grad_norm": 12.516416195348718, + "learning_rate": 4.932233655273096e-05, + "loss": 2.1983, + "mean_token_accuracy": 0.4604355752468109, + "step": 123515 + }, + { + "epoch": 0.12441065588548418, + "grad_norm": 11.037564589852316, + "learning_rate": 4.932224527816861e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.4034482777118683, + "step": 123520 + }, + { + "epoch": 0.12441569193858835, + "grad_norm": 12.07528768859811, + "learning_rate": 4.932215399755379e-05, + "loss": 2.8248, + "mean_token_accuracy": 0.38620689511299133, + "step": 123525 + }, + { + "epoch": 0.12442072799169253, + "grad_norm": 10.434405104204275, + "learning_rate": 4.9322062710886505e-05, + "loss": 2.1271, + "mean_token_accuracy": 0.4896551728248596, + "step": 123530 + }, + { + "epoch": 0.1244257640447967, + "grad_norm": 11.304892198752592, + "learning_rate": 4.932197141816678e-05, + "loss": 2.4779, + "mean_token_accuracy": 0.4379310369491577, + "step": 123535 + }, + { + "epoch": 0.12443080009790088, + "grad_norm": 9.920643442686885, + "learning_rate": 4.9321880119394666e-05, + "loss": 2.0384, + "mean_token_accuracy": 0.482758617401123, + "step": 123540 + }, + { + "epoch": 0.12443583615100505, + "grad_norm": 11.921665429899907, + "learning_rate": 4.9321788814570165e-05, + "loss": 2.6862, + "mean_token_accuracy": 0.3827586114406586, + "step": 123545 + }, + { + "epoch": 0.12444087220410922, + "grad_norm": 14.894451234447269, + "learning_rate": 4.93216975036933e-05, + "loss": 2.2954, + "mean_token_accuracy": 0.4849364757537842, + "step": 123550 + }, + { + "epoch": 0.1244459082572134, + "grad_norm": 10.677267425999865, + "learning_rate": 4.9321606186764126e-05, + "loss": 2.759, + "mean_token_accuracy": 0.39310344457626345, + "step": 123555 + }, + { + "epoch": 0.12445094431031756, + "grad_norm": 11.453531964760659, + "learning_rate": 4.932151486378263e-05, + "loss": 2.448, + "mean_token_accuracy": 0.42068966031074523, + "step": 123560 + }, + { + "epoch": 0.12445598036342173, + "grad_norm": 12.149339625850565, + "learning_rate": 4.932142353474887e-05, + "loss": 2.5669, + "mean_token_accuracy": 0.37586206793785093, + "step": 123565 + }, + { + "epoch": 0.1244610164165259, + "grad_norm": 14.387005558601993, + "learning_rate": 4.932133219966285e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.4896551609039307, + "step": 123570 + }, + { + "epoch": 0.12446605246963008, + "grad_norm": 9.610242161842296, + "learning_rate": 4.9321240858524597e-05, + "loss": 1.9729, + "mean_token_accuracy": 0.46551724672317507, + "step": 123575 + }, + { + "epoch": 0.12447108852273425, + "grad_norm": 10.729990638589701, + "learning_rate": 4.9321149511334144e-05, + "loss": 1.9673, + "mean_token_accuracy": 0.4871921122074127, + "step": 123580 + }, + { + "epoch": 0.12447612457583843, + "grad_norm": 12.314099501332782, + "learning_rate": 4.932105815809152e-05, + "loss": 2.6641, + "mean_token_accuracy": 0.4103448331356049, + "step": 123585 + }, + { + "epoch": 0.1244811606289426, + "grad_norm": 11.820789226694464, + "learning_rate": 4.9320966798796736e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.47931034564971925, + "step": 123590 + }, + { + "epoch": 0.12448619668204677, + "grad_norm": 10.065627258347645, + "learning_rate": 4.9320875433449835e-05, + "loss": 2.3747, + "mean_token_accuracy": 0.45517241954803467, + "step": 123595 + }, + { + "epoch": 0.12449123273515095, + "grad_norm": 9.663683918750145, + "learning_rate": 4.932078406205083e-05, + "loss": 2.3449, + "mean_token_accuracy": 0.42068964838981626, + "step": 123600 + }, + { + "epoch": 0.12449626878825512, + "grad_norm": 9.791134050849395, + "learning_rate": 4.9320692684599745e-05, + "loss": 2.5731, + "mean_token_accuracy": 0.36551723480224607, + "step": 123605 + }, + { + "epoch": 0.1245013048413593, + "grad_norm": 10.175382639824074, + "learning_rate": 4.932060130109661e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.4344827592372894, + "step": 123610 + }, + { + "epoch": 0.12450634089446347, + "grad_norm": 10.568509304617983, + "learning_rate": 4.9320509911541446e-05, + "loss": 2.331, + "mean_token_accuracy": 0.46896552443504336, + "step": 123615 + }, + { + "epoch": 0.12451137694756764, + "grad_norm": 10.804222047710022, + "learning_rate": 4.932041851593429e-05, + "loss": 2.434, + "mean_token_accuracy": 0.4206896543502808, + "step": 123620 + }, + { + "epoch": 0.12451641300067182, + "grad_norm": 10.42270728526008, + "learning_rate": 4.9320327114275156e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.4153659999370575, + "step": 123625 + }, + { + "epoch": 0.12452144905377598, + "grad_norm": 12.493308769494055, + "learning_rate": 4.932023570656407e-05, + "loss": 2.4953, + "mean_token_accuracy": 0.3931034505367279, + "step": 123630 + }, + { + "epoch": 0.12452648510688015, + "grad_norm": 9.52786883902121, + "learning_rate": 4.932014429280107e-05, + "loss": 2.2242, + "mean_token_accuracy": 0.4482758641242981, + "step": 123635 + }, + { + "epoch": 0.12453152115998432, + "grad_norm": 10.849111844048021, + "learning_rate": 4.932005287298616e-05, + "loss": 2.4742, + "mean_token_accuracy": 0.4344827592372894, + "step": 123640 + }, + { + "epoch": 0.1245365572130885, + "grad_norm": 10.348852917568902, + "learning_rate": 4.931996144711938e-05, + "loss": 2.4332, + "mean_token_accuracy": 0.4517241299152374, + "step": 123645 + }, + { + "epoch": 0.12454159326619267, + "grad_norm": 9.367672234804413, + "learning_rate": 4.931987001520076e-05, + "loss": 2.4281, + "mean_token_accuracy": 0.4517241299152374, + "step": 123650 + }, + { + "epoch": 0.12454662931929684, + "grad_norm": 9.344785450827878, + "learning_rate": 4.931977857723031e-05, + "loss": 2.0822, + "mean_token_accuracy": 0.48965516686439514, + "step": 123655 + }, + { + "epoch": 0.12455166537240102, + "grad_norm": 11.600768022953147, + "learning_rate": 4.9319687133208066e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.43793103098869324, + "step": 123660 + }, + { + "epoch": 0.12455670142550519, + "grad_norm": 10.30137558229247, + "learning_rate": 4.9319595683134045e-05, + "loss": 2.341, + "mean_token_accuracy": 0.3965517282485962, + "step": 123665 + }, + { + "epoch": 0.12456173747860937, + "grad_norm": 9.901523654434826, + "learning_rate": 4.9319504227008284e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.39655172526836396, + "step": 123670 + }, + { + "epoch": 0.12456677353171354, + "grad_norm": 12.095347484241078, + "learning_rate": 4.9319412764830806e-05, + "loss": 2.3354, + "mean_token_accuracy": 0.42758620381355283, + "step": 123675 + }, + { + "epoch": 0.12457180958481771, + "grad_norm": 9.619847395745683, + "learning_rate": 4.931932129660162e-05, + "loss": 2.4084, + "mean_token_accuracy": 0.44482758045196535, + "step": 123680 + }, + { + "epoch": 0.12457684563792189, + "grad_norm": 9.395549674515834, + "learning_rate": 4.9319229822320774e-05, + "loss": 2.1872, + "mean_token_accuracy": 0.44827585816383364, + "step": 123685 + }, + { + "epoch": 0.12458188169102606, + "grad_norm": 10.512749270574066, + "learning_rate": 4.931913834198828e-05, + "loss": 2.4194, + "mean_token_accuracy": 0.3620689630508423, + "step": 123690 + }, + { + "epoch": 0.12458691774413024, + "grad_norm": 11.996159906718693, + "learning_rate": 4.931904685560416e-05, + "loss": 3.0487, + "mean_token_accuracy": 0.3551724135875702, + "step": 123695 + }, + { + "epoch": 0.1245919537972344, + "grad_norm": 10.081222082945459, + "learning_rate": 4.9318955363168456e-05, + "loss": 2.545, + "mean_token_accuracy": 0.38620689511299133, + "step": 123700 + }, + { + "epoch": 0.12459698985033857, + "grad_norm": 9.23894621754333, + "learning_rate": 4.931886386468118e-05, + "loss": 2.2581, + "mean_token_accuracy": 0.44827585816383364, + "step": 123705 + }, + { + "epoch": 0.12460202590344274, + "grad_norm": 13.542145021256491, + "learning_rate": 4.931877236014236e-05, + "loss": 2.8101, + "mean_token_accuracy": 0.39310344457626345, + "step": 123710 + }, + { + "epoch": 0.12460706195654692, + "grad_norm": 11.581129341966413, + "learning_rate": 4.931868084955203e-05, + "loss": 2.7297, + "mean_token_accuracy": 0.37586206793785093, + "step": 123715 + }, + { + "epoch": 0.12461209800965109, + "grad_norm": 11.798533917299554, + "learning_rate": 4.9318589332910195e-05, + "loss": 2.9002, + "mean_token_accuracy": 0.4241379380226135, + "step": 123720 + }, + { + "epoch": 0.12461713406275526, + "grad_norm": 9.640623293799536, + "learning_rate": 4.93184978102169e-05, + "loss": 2.0202, + "mean_token_accuracy": 0.5206896543502808, + "step": 123725 + }, + { + "epoch": 0.12462217011585944, + "grad_norm": 9.617962068195393, + "learning_rate": 4.9318406281472154e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.36551723480224607, + "step": 123730 + }, + { + "epoch": 0.12462720616896361, + "grad_norm": 9.604813469913651, + "learning_rate": 4.9318314746676006e-05, + "loss": 2.3539, + "mean_token_accuracy": 0.4344827592372894, + "step": 123735 + }, + { + "epoch": 0.12463224222206779, + "grad_norm": 9.077493884672778, + "learning_rate": 4.931822320582846e-05, + "loss": 2.2066, + "mean_token_accuracy": 0.42068964838981626, + "step": 123740 + }, + { + "epoch": 0.12463727827517196, + "grad_norm": 10.867110172558268, + "learning_rate": 4.9318131658929545e-05, + "loss": 2.8088, + "mean_token_accuracy": 0.4068965494632721, + "step": 123745 + }, + { + "epoch": 0.12464231432827613, + "grad_norm": 9.392779864386263, + "learning_rate": 4.931804010597929e-05, + "loss": 2.4038, + "mean_token_accuracy": 0.4620689690113068, + "step": 123750 + }, + { + "epoch": 0.1246473503813803, + "grad_norm": 9.77315457119676, + "learning_rate": 4.931794854697773e-05, + "loss": 2.7775, + "mean_token_accuracy": 0.417241370677948, + "step": 123755 + }, + { + "epoch": 0.12465238643448448, + "grad_norm": 8.83455450371274, + "learning_rate": 4.931785698192487e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.44827585816383364, + "step": 123760 + }, + { + "epoch": 0.12465742248758865, + "grad_norm": 12.050637375279532, + "learning_rate": 4.9317765410820746e-05, + "loss": 2.2942, + "mean_token_accuracy": 0.4344827651977539, + "step": 123765 + }, + { + "epoch": 0.12466245854069281, + "grad_norm": 11.707590265022704, + "learning_rate": 4.931767383366539e-05, + "loss": 2.2418, + "mean_token_accuracy": 0.44694494605064394, + "step": 123770 + }, + { + "epoch": 0.12466749459379699, + "grad_norm": 9.189279777389158, + "learning_rate": 4.9317582250458814e-05, + "loss": 2.6886, + "mean_token_accuracy": 0.43998789191246035, + "step": 123775 + }, + { + "epoch": 0.12467253064690116, + "grad_norm": 15.773591276214393, + "learning_rate": 4.931749066120106e-05, + "loss": 2.4981, + "mean_token_accuracy": 0.4, + "step": 123780 + }, + { + "epoch": 0.12467756670000534, + "grad_norm": 8.863097339242998, + "learning_rate": 4.931739906589214e-05, + "loss": 2.3278, + "mean_token_accuracy": 0.47114336490631104, + "step": 123785 + }, + { + "epoch": 0.12468260275310951, + "grad_norm": 12.017006178921458, + "learning_rate": 4.931730746453208e-05, + "loss": 2.3017, + "mean_token_accuracy": 0.4586206912994385, + "step": 123790 + }, + { + "epoch": 0.12468763880621368, + "grad_norm": 10.598740583238024, + "learning_rate": 4.9317215857120903e-05, + "loss": 2.1881, + "mean_token_accuracy": 0.4551724076271057, + "step": 123795 + }, + { + "epoch": 0.12469267485931786, + "grad_norm": 11.231135437901454, + "learning_rate": 4.9317124243658644e-05, + "loss": 2.8643, + "mean_token_accuracy": 0.4034482717514038, + "step": 123800 + }, + { + "epoch": 0.12469771091242203, + "grad_norm": 11.894096075202533, + "learning_rate": 4.931703262414533e-05, + "loss": 2.7349, + "mean_token_accuracy": 0.3517241358757019, + "step": 123805 + }, + { + "epoch": 0.1247027469655262, + "grad_norm": 10.344794595110173, + "learning_rate": 4.931694099858098e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.42758620381355283, + "step": 123810 + }, + { + "epoch": 0.12470778301863038, + "grad_norm": 11.748590401556678, + "learning_rate": 4.931684936696561e-05, + "loss": 2.0791, + "mean_token_accuracy": 0.4620689630508423, + "step": 123815 + }, + { + "epoch": 0.12471281907173455, + "grad_norm": 9.635866922146365, + "learning_rate": 4.931675772929926e-05, + "loss": 1.8313, + "mean_token_accuracy": 0.5344827592372894, + "step": 123820 + }, + { + "epoch": 0.12471785512483873, + "grad_norm": 10.695390381346565, + "learning_rate": 4.931666608558196e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.44827585816383364, + "step": 123825 + }, + { + "epoch": 0.1247228911779429, + "grad_norm": 12.305061301430387, + "learning_rate": 4.931657443581371e-05, + "loss": 2.7296, + "mean_token_accuracy": 0.3965517282485962, + "step": 123830 + }, + { + "epoch": 0.12472792723104707, + "grad_norm": 11.57831007506338, + "learning_rate": 4.931648277999457e-05, + "loss": 2.5923, + "mean_token_accuracy": 0.4137930989265442, + "step": 123835 + }, + { + "epoch": 0.12473296328415123, + "grad_norm": 11.896689409926472, + "learning_rate": 4.9316391118124535e-05, + "loss": 2.1688, + "mean_token_accuracy": 0.43793103098869324, + "step": 123840 + }, + { + "epoch": 0.1247379993372554, + "grad_norm": 10.781291657583932, + "learning_rate": 4.931629945020364e-05, + "loss": 2.1756, + "mean_token_accuracy": 0.44827587008476255, + "step": 123845 + }, + { + "epoch": 0.12474303539035958, + "grad_norm": 6.429427710512313, + "learning_rate": 4.931620777623192e-05, + "loss": 1.7935, + "mean_token_accuracy": 0.500738924741745, + "step": 123850 + }, + { + "epoch": 0.12474807144346375, + "grad_norm": 10.912106315867291, + "learning_rate": 4.9316116096209394e-05, + "loss": 2.6199, + "mean_token_accuracy": 0.38620689511299133, + "step": 123855 + }, + { + "epoch": 0.12475310749656793, + "grad_norm": 11.403112544925696, + "learning_rate": 4.931602441013608e-05, + "loss": 2.5271, + "mean_token_accuracy": 0.4413793087005615, + "step": 123860 + }, + { + "epoch": 0.1247581435496721, + "grad_norm": 10.326074684639051, + "learning_rate": 4.9315932718012015e-05, + "loss": 1.9635, + "mean_token_accuracy": 0.4586206912994385, + "step": 123865 + }, + { + "epoch": 0.12476317960277628, + "grad_norm": 11.579898695866243, + "learning_rate": 4.9315841019837226e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.4172413766384125, + "step": 123870 + }, + { + "epoch": 0.12476821565588045, + "grad_norm": 12.279130019615577, + "learning_rate": 4.931574931561172e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.44482758045196535, + "step": 123875 + }, + { + "epoch": 0.12477325170898462, + "grad_norm": 11.004844148836165, + "learning_rate": 4.931565760533555e-05, + "loss": 2.486, + "mean_token_accuracy": 0.36896551251411436, + "step": 123880 + }, + { + "epoch": 0.1247782877620888, + "grad_norm": 10.991898578040617, + "learning_rate": 4.931556588900871e-05, + "loss": 2.5265, + "mean_token_accuracy": 0.4344827592372894, + "step": 123885 + }, + { + "epoch": 0.12478332381519297, + "grad_norm": 10.865961846740692, + "learning_rate": 4.931547416663125e-05, + "loss": 2.2762, + "mean_token_accuracy": 0.43248639106750486, + "step": 123890 + }, + { + "epoch": 0.12478835986829714, + "grad_norm": 11.425831672949833, + "learning_rate": 4.931538243820318e-05, + "loss": 2.376, + "mean_token_accuracy": 0.4620689630508423, + "step": 123895 + }, + { + "epoch": 0.12479339592140132, + "grad_norm": 11.182051213736225, + "learning_rate": 4.931529070372454e-05, + "loss": 2.5771, + "mean_token_accuracy": 0.4103448212146759, + "step": 123900 + }, + { + "epoch": 0.12479843197450549, + "grad_norm": 10.699551092116875, + "learning_rate": 4.9315198963195343e-05, + "loss": 2.304, + "mean_token_accuracy": 0.42758620977401735, + "step": 123905 + }, + { + "epoch": 0.12480346802760965, + "grad_norm": 9.59465081003794, + "learning_rate": 4.931510721661562e-05, + "loss": 2.2041, + "mean_token_accuracy": 0.4551724255084991, + "step": 123910 + }, + { + "epoch": 0.12480850408071383, + "grad_norm": 11.962476885429458, + "learning_rate": 4.931501546398539e-05, + "loss": 2.441, + "mean_token_accuracy": 0.3896551728248596, + "step": 123915 + }, + { + "epoch": 0.124813540133818, + "grad_norm": 10.79373016696175, + "learning_rate": 4.93149237053047e-05, + "loss": 1.9983, + "mean_token_accuracy": 0.482758617401123, + "step": 123920 + }, + { + "epoch": 0.12481857618692217, + "grad_norm": 12.24720353234997, + "learning_rate": 4.931483194057355e-05, + "loss": 2.6165, + "mean_token_accuracy": 0.40508167147636415, + "step": 123925 + }, + { + "epoch": 0.12482361224002635, + "grad_norm": 10.906314179567266, + "learning_rate": 4.9314740169791976e-05, + "loss": 2.8854, + "mean_token_accuracy": 0.39310344457626345, + "step": 123930 + }, + { + "epoch": 0.12482864829313052, + "grad_norm": 8.57757051376491, + "learning_rate": 4.931464839296e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4551724076271057, + "step": 123935 + }, + { + "epoch": 0.1248336843462347, + "grad_norm": 9.317363075229544, + "learning_rate": 4.931455661007765e-05, + "loss": 2.1262, + "mean_token_accuracy": 0.48965516686439514, + "step": 123940 + }, + { + "epoch": 0.12483872039933887, + "grad_norm": 9.020899315593375, + "learning_rate": 4.931446482114495e-05, + "loss": 2.1771, + "mean_token_accuracy": 0.43103447556495667, + "step": 123945 + }, + { + "epoch": 0.12484375645244304, + "grad_norm": 11.216994947024483, + "learning_rate": 4.931437302616193e-05, + "loss": 2.6884, + "mean_token_accuracy": 0.42068966031074523, + "step": 123950 + }, + { + "epoch": 0.12484879250554722, + "grad_norm": 13.577766699237666, + "learning_rate": 4.931428122512861e-05, + "loss": 2.2794, + "mean_token_accuracy": 0.4607380509376526, + "step": 123955 + }, + { + "epoch": 0.12485382855865139, + "grad_norm": 8.915431276976955, + "learning_rate": 4.9314189418045027e-05, + "loss": 1.9651, + "mean_token_accuracy": 0.5034482836723327, + "step": 123960 + }, + { + "epoch": 0.12485886461175556, + "grad_norm": 10.207325715047853, + "learning_rate": 4.931409760491118e-05, + "loss": 2.4525, + "mean_token_accuracy": 0.4517241358757019, + "step": 123965 + }, + { + "epoch": 0.12486390066485974, + "grad_norm": 10.3904599476029, + "learning_rate": 4.9314005785727125e-05, + "loss": 2.1991, + "mean_token_accuracy": 0.4551724076271057, + "step": 123970 + }, + { + "epoch": 0.12486893671796391, + "grad_norm": 12.275869292310661, + "learning_rate": 4.9313913960492866e-05, + "loss": 2.4438, + "mean_token_accuracy": 0.41560798287391665, + "step": 123975 + }, + { + "epoch": 0.12487397277106807, + "grad_norm": 11.654252935057961, + "learning_rate": 4.931382212920844e-05, + "loss": 2.6634, + "mean_token_accuracy": 0.41724138259887694, + "step": 123980 + }, + { + "epoch": 0.12487900882417224, + "grad_norm": 10.64454127005554, + "learning_rate": 4.931373029187387e-05, + "loss": 1.859, + "mean_token_accuracy": 0.44482757449150084, + "step": 123985 + }, + { + "epoch": 0.12488404487727642, + "grad_norm": 9.17738753882329, + "learning_rate": 4.9313638448489186e-05, + "loss": 2.1325, + "mean_token_accuracy": 0.4225045442581177, + "step": 123990 + }, + { + "epoch": 0.12488908093038059, + "grad_norm": 8.155930147871105, + "learning_rate": 4.93135465990544e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.4379310369491577, + "step": 123995 + }, + { + "epoch": 0.12489411698348477, + "grad_norm": 13.913259919205482, + "learning_rate": 4.931345474356955e-05, + "loss": 2.3732, + "mean_token_accuracy": 0.42758620977401735, + "step": 124000 + }, + { + "epoch": 0.12489915303658894, + "grad_norm": 9.275250141998018, + "learning_rate": 4.931336288203465e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.4586206912994385, + "step": 124005 + }, + { + "epoch": 0.12490418908969311, + "grad_norm": 11.45198936517712, + "learning_rate": 4.931327101444974e-05, + "loss": 2.5811, + "mean_token_accuracy": 0.4103448212146759, + "step": 124010 + }, + { + "epoch": 0.12490922514279729, + "grad_norm": 9.764758178106982, + "learning_rate": 4.931317914081483e-05, + "loss": 2.7661, + "mean_token_accuracy": 0.4551724135875702, + "step": 124015 + }, + { + "epoch": 0.12491426119590146, + "grad_norm": 11.54259689616539, + "learning_rate": 4.931308726112996e-05, + "loss": 2.6193, + "mean_token_accuracy": 0.437931028008461, + "step": 124020 + }, + { + "epoch": 0.12491929724900563, + "grad_norm": 10.856792115832969, + "learning_rate": 4.931299537539515e-05, + "loss": 2.476, + "mean_token_accuracy": 0.43103448748588563, + "step": 124025 + }, + { + "epoch": 0.12492433330210981, + "grad_norm": 10.82719994541945, + "learning_rate": 4.931290348361043e-05, + "loss": 2.2318, + "mean_token_accuracy": 0.43103448748588563, + "step": 124030 + }, + { + "epoch": 0.12492936935521398, + "grad_norm": 11.730860017126803, + "learning_rate": 4.93128115857758e-05, + "loss": 2.4042, + "mean_token_accuracy": 0.4034482717514038, + "step": 124035 + }, + { + "epoch": 0.12493440540831816, + "grad_norm": 12.511839415280349, + "learning_rate": 4.931271968189132e-05, + "loss": 2.5389, + "mean_token_accuracy": 0.41724138259887694, + "step": 124040 + }, + { + "epoch": 0.12493944146142233, + "grad_norm": 11.472001760787498, + "learning_rate": 4.9312627771956995e-05, + "loss": 2.4237, + "mean_token_accuracy": 0.40344828367233276, + "step": 124045 + }, + { + "epoch": 0.12494447751452649, + "grad_norm": 9.800059066481898, + "learning_rate": 4.931253585597287e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4310344815254211, + "step": 124050 + }, + { + "epoch": 0.12494951356763066, + "grad_norm": 12.022123126888289, + "learning_rate": 4.931244393393894e-05, + "loss": 2.9999, + "mean_token_accuracy": 0.37931033968925476, + "step": 124055 + }, + { + "epoch": 0.12495454962073484, + "grad_norm": 9.869224805735533, + "learning_rate": 4.9312352005855255e-05, + "loss": 1.8331, + "mean_token_accuracy": 0.5228675186634064, + "step": 124060 + }, + { + "epoch": 0.12495958567383901, + "grad_norm": 8.535249595321716, + "learning_rate": 4.9312260071721826e-05, + "loss": 1.9784, + "mean_token_accuracy": 0.4862068951129913, + "step": 124065 + }, + { + "epoch": 0.12496462172694318, + "grad_norm": 13.554037841516994, + "learning_rate": 4.931216813153869e-05, + "loss": 2.4976, + "mean_token_accuracy": 0.39655172228813174, + "step": 124070 + }, + { + "epoch": 0.12496965778004736, + "grad_norm": 8.215018749067184, + "learning_rate": 4.931207618530588e-05, + "loss": 2.3306, + "mean_token_accuracy": 0.42413793206214906, + "step": 124075 + }, + { + "epoch": 0.12497469383315153, + "grad_norm": 10.58593383937158, + "learning_rate": 4.9311984233023386e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.4551724076271057, + "step": 124080 + }, + { + "epoch": 0.1249797298862557, + "grad_norm": 15.386159952830395, + "learning_rate": 4.9311892274691276e-05, + "loss": 2.7046, + "mean_token_accuracy": 0.3827586233615875, + "step": 124085 + }, + { + "epoch": 0.12498476593935988, + "grad_norm": 8.918774474895576, + "learning_rate": 4.931180031030955e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.4172413766384125, + "step": 124090 + }, + { + "epoch": 0.12498980199246405, + "grad_norm": 11.663443381614654, + "learning_rate": 4.931170833987823e-05, + "loss": 2.3922, + "mean_token_accuracy": 0.4358374387025833, + "step": 124095 + }, + { + "epoch": 0.12499483804556823, + "grad_norm": 10.212163572717571, + "learning_rate": 4.931161636339737e-05, + "loss": 2.4685, + "mean_token_accuracy": 0.4206896543502808, + "step": 124100 + }, + { + "epoch": 0.1249998740986724, + "grad_norm": 10.752643245639343, + "learning_rate": 4.931152438086696e-05, + "loss": 2.26, + "mean_token_accuracy": 0.4361766457557678, + "step": 124105 + }, + { + "epoch": 0.12500491015177656, + "grad_norm": 9.792429972102655, + "learning_rate": 4.931143239228705e-05, + "loss": 2.4898, + "mean_token_accuracy": 0.4186932861804962, + "step": 124110 + }, + { + "epoch": 0.12500994620488073, + "grad_norm": 12.945059019387644, + "learning_rate": 4.9311340397657655e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.4103448331356049, + "step": 124115 + }, + { + "epoch": 0.1250149822579849, + "grad_norm": 10.903339494566122, + "learning_rate": 4.9311248396978804e-05, + "loss": 2.4642, + "mean_token_accuracy": 0.417241370677948, + "step": 124120 + }, + { + "epoch": 0.12502001831108908, + "grad_norm": 9.756111801907657, + "learning_rate": 4.9311156390250525e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.44827585816383364, + "step": 124125 + }, + { + "epoch": 0.12502505436419326, + "grad_norm": 21.707479036469287, + "learning_rate": 4.931106437747284e-05, + "loss": 2.4704, + "mean_token_accuracy": 0.43793103098869324, + "step": 124130 + }, + { + "epoch": 0.12503009041729743, + "grad_norm": 9.421060213031076, + "learning_rate": 4.931097235864577e-05, + "loss": 2.4808, + "mean_token_accuracy": 0.44137930274009707, + "step": 124135 + }, + { + "epoch": 0.1250351264704016, + "grad_norm": 11.522459449156207, + "learning_rate": 4.931088033376935e-05, + "loss": 2.7515, + "mean_token_accuracy": 0.3793103486299515, + "step": 124140 + }, + { + "epoch": 0.12504016252350578, + "grad_norm": 10.196326021331092, + "learning_rate": 4.93107883028436e-05, + "loss": 2.306, + "mean_token_accuracy": 0.4689655125141144, + "step": 124145 + }, + { + "epoch": 0.12504519857660995, + "grad_norm": 11.545179991732585, + "learning_rate": 4.931069626586855e-05, + "loss": 1.9724, + "mean_token_accuracy": 0.5295297801494598, + "step": 124150 + }, + { + "epoch": 0.12505023462971412, + "grad_norm": 14.09801574966443, + "learning_rate": 4.931060422284422e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.3896551728248596, + "step": 124155 + }, + { + "epoch": 0.1250552706828183, + "grad_norm": 10.258868445131919, + "learning_rate": 4.9310512173770626e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.41379311084747317, + "step": 124160 + }, + { + "epoch": 0.12506030673592247, + "grad_norm": 9.476993458056684, + "learning_rate": 4.931042011864782e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.41724138259887694, + "step": 124165 + }, + { + "epoch": 0.12506534278902665, + "grad_norm": 11.347618134338601, + "learning_rate": 4.9310328057475805e-05, + "loss": 2.6114, + "mean_token_accuracy": 0.42413793206214906, + "step": 124170 + }, + { + "epoch": 0.12507037884213082, + "grad_norm": 10.38833527401101, + "learning_rate": 4.9310235990254614e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.4655172348022461, + "step": 124175 + }, + { + "epoch": 0.125075414895235, + "grad_norm": 9.497734017409657, + "learning_rate": 4.931014391698428e-05, + "loss": 2.242, + "mean_token_accuracy": 0.4448275864124298, + "step": 124180 + }, + { + "epoch": 0.12508045094833917, + "grad_norm": 14.710155365013705, + "learning_rate": 4.931005183766482e-05, + "loss": 2.6883, + "mean_token_accuracy": 0.4344827592372894, + "step": 124185 + }, + { + "epoch": 0.12508548700144334, + "grad_norm": 11.80418681181357, + "learning_rate": 4.930995975229625e-05, + "loss": 3.2515, + "mean_token_accuracy": 0.34137930870056155, + "step": 124190 + }, + { + "epoch": 0.12509052305454751, + "grad_norm": 14.032108909157538, + "learning_rate": 4.9309867660878616e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.42068966031074523, + "step": 124195 + }, + { + "epoch": 0.1250955591076517, + "grad_norm": 13.390933802810478, + "learning_rate": 4.930977556341194e-05, + "loss": 2.4829, + "mean_token_accuracy": 0.42413792610168455, + "step": 124200 + }, + { + "epoch": 0.12510059516075586, + "grad_norm": 12.24163069549035, + "learning_rate": 4.930968345989622e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.4154264986515045, + "step": 124205 + }, + { + "epoch": 0.12510563121386004, + "grad_norm": 10.36573334788406, + "learning_rate": 4.930959135033153e-05, + "loss": 2.7618, + "mean_token_accuracy": 0.39310344457626345, + "step": 124210 + }, + { + "epoch": 0.1251106672669642, + "grad_norm": 10.286740525438994, + "learning_rate": 4.930949923471785e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.4586206912994385, + "step": 124215 + }, + { + "epoch": 0.12511570332006838, + "grad_norm": 10.726593501320407, + "learning_rate": 4.930940711305522e-05, + "loss": 2.2779, + "mean_token_accuracy": 0.46551724672317507, + "step": 124220 + }, + { + "epoch": 0.12512073937317253, + "grad_norm": 10.501920826463534, + "learning_rate": 4.930931498534368e-05, + "loss": 2.1967, + "mean_token_accuracy": 0.4396854221820831, + "step": 124225 + }, + { + "epoch": 0.1251257754262767, + "grad_norm": 9.10472974801024, + "learning_rate": 4.930922285158324e-05, + "loss": 1.8772, + "mean_token_accuracy": 0.5137930989265442, + "step": 124230 + }, + { + "epoch": 0.12513081147938088, + "grad_norm": 11.832084788221232, + "learning_rate": 4.9309130711773936e-05, + "loss": 2.7032, + "mean_token_accuracy": 0.43103448748588563, + "step": 124235 + }, + { + "epoch": 0.12513584753248505, + "grad_norm": 10.79371384366645, + "learning_rate": 4.9309038565915786e-05, + "loss": 2.7187, + "mean_token_accuracy": 0.41724138259887694, + "step": 124240 + }, + { + "epoch": 0.12514088358558922, + "grad_norm": 15.330319638646554, + "learning_rate": 4.930894641400882e-05, + "loss": 2.374, + "mean_token_accuracy": 0.4000000059604645, + "step": 124245 + }, + { + "epoch": 0.1251459196386934, + "grad_norm": 9.256353586588736, + "learning_rate": 4.930885425605306e-05, + "loss": 2.1746, + "mean_token_accuracy": 0.4931034445762634, + "step": 124250 + }, + { + "epoch": 0.12515095569179757, + "grad_norm": 9.520127108943822, + "learning_rate": 4.9308762092048535e-05, + "loss": 2.1476, + "mean_token_accuracy": 0.4811857223510742, + "step": 124255 + }, + { + "epoch": 0.12515599174490175, + "grad_norm": 12.72107528571997, + "learning_rate": 4.930866992199527e-05, + "loss": 2.418, + "mean_token_accuracy": 0.4517241358757019, + "step": 124260 + }, + { + "epoch": 0.12516102779800592, + "grad_norm": 11.166375304005644, + "learning_rate": 4.930857774589327e-05, + "loss": 2.5324, + "mean_token_accuracy": 0.3999999940395355, + "step": 124265 + }, + { + "epoch": 0.1251660638511101, + "grad_norm": 10.106412227290502, + "learning_rate": 4.93084855637426e-05, + "loss": 2.4309, + "mean_token_accuracy": 0.4034482777118683, + "step": 124270 + }, + { + "epoch": 0.12517109990421427, + "grad_norm": 9.806830703261674, + "learning_rate": 4.930839337554326e-05, + "loss": 2.3157, + "mean_token_accuracy": 0.3999999940395355, + "step": 124275 + }, + { + "epoch": 0.12517613595731844, + "grad_norm": 12.062694964583635, + "learning_rate": 4.930830118129528e-05, + "loss": 2.1704, + "mean_token_accuracy": 0.5103448271751404, + "step": 124280 + }, + { + "epoch": 0.12518117201042261, + "grad_norm": 11.332874025154965, + "learning_rate": 4.930820898099869e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.45517241954803467, + "step": 124285 + }, + { + "epoch": 0.1251862080635268, + "grad_norm": 10.850540724342629, + "learning_rate": 4.93081167746535e-05, + "loss": 2.2788, + "mean_token_accuracy": 0.4413793087005615, + "step": 124290 + }, + { + "epoch": 0.12519124411663096, + "grad_norm": 10.745253987207922, + "learning_rate": 4.930802456225976e-05, + "loss": 1.9893, + "mean_token_accuracy": 0.47126436829566953, + "step": 124295 + }, + { + "epoch": 0.12519628016973514, + "grad_norm": 12.566878173001326, + "learning_rate": 4.930793234381748e-05, + "loss": 2.5113, + "mean_token_accuracy": 0.3793103456497192, + "step": 124300 + }, + { + "epoch": 0.1252013162228393, + "grad_norm": 9.90817146343094, + "learning_rate": 4.9307840119326687e-05, + "loss": 2.2328, + "mean_token_accuracy": 0.47241378426551817, + "step": 124305 + }, + { + "epoch": 0.12520635227594348, + "grad_norm": 8.875456236458364, + "learning_rate": 4.930774788878741e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.39310344457626345, + "step": 124310 + }, + { + "epoch": 0.12521138832904766, + "grad_norm": 8.679191996897456, + "learning_rate": 4.930765565219967e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.4551724076271057, + "step": 124315 + }, + { + "epoch": 0.12521642438215183, + "grad_norm": 12.42731798861218, + "learning_rate": 4.930756340956349e-05, + "loss": 2.538, + "mean_token_accuracy": 0.4435571640729904, + "step": 124320 + }, + { + "epoch": 0.125221460435256, + "grad_norm": 11.811407222849237, + "learning_rate": 4.930747116087891e-05, + "loss": 2.7656, + "mean_token_accuracy": 0.37241379022598264, + "step": 124325 + }, + { + "epoch": 0.12522649648836018, + "grad_norm": 8.669276835630525, + "learning_rate": 4.930737890614595e-05, + "loss": 2.0491, + "mean_token_accuracy": 0.45862067937850953, + "step": 124330 + }, + { + "epoch": 0.12523153254146435, + "grad_norm": 11.280309414632294, + "learning_rate": 4.9307286645364624e-05, + "loss": 2.314, + "mean_token_accuracy": 0.4655172348022461, + "step": 124335 + }, + { + "epoch": 0.12523656859456853, + "grad_norm": 9.16049354730678, + "learning_rate": 4.930719437853496e-05, + "loss": 2.4316, + "mean_token_accuracy": 0.4068965494632721, + "step": 124340 + }, + { + "epoch": 0.1252416046476727, + "grad_norm": 10.812821904576646, + "learning_rate": 4.9307102105657006e-05, + "loss": 2.2273, + "mean_token_accuracy": 0.4137930989265442, + "step": 124345 + }, + { + "epoch": 0.12524664070077687, + "grad_norm": 10.701569128505094, + "learning_rate": 4.930700982673076e-05, + "loss": 2.2037, + "mean_token_accuracy": 0.4620689630508423, + "step": 124350 + }, + { + "epoch": 0.12525167675388105, + "grad_norm": 12.091054923666615, + "learning_rate": 4.9306917541756256e-05, + "loss": 2.5064, + "mean_token_accuracy": 0.43103447556495667, + "step": 124355 + }, + { + "epoch": 0.12525671280698522, + "grad_norm": 10.86908834460403, + "learning_rate": 4.930682525073353e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.42068966031074523, + "step": 124360 + }, + { + "epoch": 0.12526174886008937, + "grad_norm": 11.236026503204053, + "learning_rate": 4.930673295366259e-05, + "loss": 2.2179, + "mean_token_accuracy": 0.4137930989265442, + "step": 124365 + }, + { + "epoch": 0.12526678491319354, + "grad_norm": 8.872651251415567, + "learning_rate": 4.930664065054349e-05, + "loss": 2.1659, + "mean_token_accuracy": 0.4379310369491577, + "step": 124370 + }, + { + "epoch": 0.12527182096629771, + "grad_norm": 11.003687844283842, + "learning_rate": 4.930654834137622e-05, + "loss": 2.7956, + "mean_token_accuracy": 0.4344827592372894, + "step": 124375 + }, + { + "epoch": 0.1252768570194019, + "grad_norm": 11.387695870284572, + "learning_rate": 4.930645602616082e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.4482758641242981, + "step": 124380 + }, + { + "epoch": 0.12528189307250606, + "grad_norm": 9.0184280692249, + "learning_rate": 4.930636370489733e-05, + "loss": 2.3452, + "mean_token_accuracy": 0.44827585220336913, + "step": 124385 + }, + { + "epoch": 0.12528692912561024, + "grad_norm": 10.555524392227948, + "learning_rate": 4.930627137758576e-05, + "loss": 2.416, + "mean_token_accuracy": 0.3793103456497192, + "step": 124390 + }, + { + "epoch": 0.1252919651787144, + "grad_norm": 12.984583476416342, + "learning_rate": 4.930617904422613e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.4448275864124298, + "step": 124395 + }, + { + "epoch": 0.12529700123181858, + "grad_norm": 9.799760564074004, + "learning_rate": 4.930608670481849e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.4482758641242981, + "step": 124400 + }, + { + "epoch": 0.12530203728492276, + "grad_norm": 9.708679615962772, + "learning_rate": 4.9305994359362844e-05, + "loss": 2.1101, + "mean_token_accuracy": 0.4848154902458191, + "step": 124405 + }, + { + "epoch": 0.12530707333802693, + "grad_norm": 13.767479336738107, + "learning_rate": 4.930590200785922e-05, + "loss": 2.4934, + "mean_token_accuracy": 0.43103448748588563, + "step": 124410 + }, + { + "epoch": 0.1253121093911311, + "grad_norm": 10.372723597984143, + "learning_rate": 4.930580965030766e-05, + "loss": 2.624, + "mean_token_accuracy": 0.42068964838981626, + "step": 124415 + }, + { + "epoch": 0.12531714544423528, + "grad_norm": 9.657501886421322, + "learning_rate": 4.930571728670816e-05, + "loss": 1.95, + "mean_token_accuracy": 0.5103448152542114, + "step": 124420 + }, + { + "epoch": 0.12532218149733945, + "grad_norm": 22.344905047164442, + "learning_rate": 4.930562491706077e-05, + "loss": 2.1606, + "mean_token_accuracy": 0.4896551609039307, + "step": 124425 + }, + { + "epoch": 0.12532721755044363, + "grad_norm": 9.981920369758555, + "learning_rate": 4.9305532541365514e-05, + "loss": 2.3096, + "mean_token_accuracy": 0.43793103098869324, + "step": 124430 + }, + { + "epoch": 0.1253322536035478, + "grad_norm": 11.536663812063374, + "learning_rate": 4.930544015962241e-05, + "loss": 2.64, + "mean_token_accuracy": 0.3620689570903778, + "step": 124435 + }, + { + "epoch": 0.12533728965665197, + "grad_norm": 13.416887935152237, + "learning_rate": 4.9305347771831484e-05, + "loss": 2.6735, + "mean_token_accuracy": 0.39310344457626345, + "step": 124440 + }, + { + "epoch": 0.12534232570975615, + "grad_norm": 11.819190433372986, + "learning_rate": 4.9305255377992756e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.4551724135875702, + "step": 124445 + }, + { + "epoch": 0.12534736176286032, + "grad_norm": 10.157211201410727, + "learning_rate": 4.9305162978106274e-05, + "loss": 2.2673, + "mean_token_accuracy": 0.4640048503875732, + "step": 124450 + }, + { + "epoch": 0.1253523978159645, + "grad_norm": 10.378394586946198, + "learning_rate": 4.930507057217204e-05, + "loss": 2.298, + "mean_token_accuracy": 0.42758620977401735, + "step": 124455 + }, + { + "epoch": 0.12535743386906867, + "grad_norm": 12.713838081300732, + "learning_rate": 4.930497816019009e-05, + "loss": 2.7325, + "mean_token_accuracy": 0.36551724672317504, + "step": 124460 + }, + { + "epoch": 0.12536246992217284, + "grad_norm": 9.907453395459001, + "learning_rate": 4.9304885742160445e-05, + "loss": 2.0588, + "mean_token_accuracy": 0.45517241954803467, + "step": 124465 + }, + { + "epoch": 0.12536750597527702, + "grad_norm": 11.534492418789997, + "learning_rate": 4.9304793318083135e-05, + "loss": 2.1591, + "mean_token_accuracy": 0.46009852886199953, + "step": 124470 + }, + { + "epoch": 0.1253725420283812, + "grad_norm": 10.900154180569425, + "learning_rate": 4.930470088795819e-05, + "loss": 2.1837, + "mean_token_accuracy": 0.42068964838981626, + "step": 124475 + }, + { + "epoch": 0.12537757808148536, + "grad_norm": 10.898282998411263, + "learning_rate": 4.930460845178562e-05, + "loss": 2.7508, + "mean_token_accuracy": 0.3827586233615875, + "step": 124480 + }, + { + "epoch": 0.12538261413458954, + "grad_norm": 12.61184354425924, + "learning_rate": 4.9304516009565474e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.4034482777118683, + "step": 124485 + }, + { + "epoch": 0.1253876501876937, + "grad_norm": 9.62049562423281, + "learning_rate": 4.930442356129775e-05, + "loss": 2.7313, + "mean_token_accuracy": 0.3862069010734558, + "step": 124490 + }, + { + "epoch": 0.12539268624079788, + "grad_norm": 9.27471690784353, + "learning_rate": 4.930433110698249e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.42934058904647826, + "step": 124495 + }, + { + "epoch": 0.12539772229390206, + "grad_norm": 12.108371796427354, + "learning_rate": 4.9304238646619726e-05, + "loss": 2.2935, + "mean_token_accuracy": 0.4551724076271057, + "step": 124500 + }, + { + "epoch": 0.1254027583470062, + "grad_norm": 13.159421005456878, + "learning_rate": 4.930414618020947e-05, + "loss": 2.5133, + "mean_token_accuracy": 0.4758620738983154, + "step": 124505 + }, + { + "epoch": 0.12540779440011038, + "grad_norm": 10.628612301447237, + "learning_rate": 4.9304053707751755e-05, + "loss": 2.4069, + "mean_token_accuracy": 0.44827587008476255, + "step": 124510 + }, + { + "epoch": 0.12541283045321455, + "grad_norm": 11.661209873602834, + "learning_rate": 4.93039612292466e-05, + "loss": 2.2758, + "mean_token_accuracy": 0.45704780220985414, + "step": 124515 + }, + { + "epoch": 0.12541786650631873, + "grad_norm": 11.928862655309173, + "learning_rate": 4.930386874469404e-05, + "loss": 2.3238, + "mean_token_accuracy": 0.43641863465309144, + "step": 124520 + }, + { + "epoch": 0.1254229025594229, + "grad_norm": 11.161303972046165, + "learning_rate": 4.930377625409409e-05, + "loss": 2.8901, + "mean_token_accuracy": 0.43448275327682495, + "step": 124525 + }, + { + "epoch": 0.12542793861252707, + "grad_norm": 11.51545051422143, + "learning_rate": 4.9303683757446786e-05, + "loss": 2.3218, + "mean_token_accuracy": 0.42758620381355283, + "step": 124530 + }, + { + "epoch": 0.12543297466563125, + "grad_norm": 11.795054491219824, + "learning_rate": 4.930359125475215e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.41034482717514037, + "step": 124535 + }, + { + "epoch": 0.12543801071873542, + "grad_norm": 11.97099122066195, + "learning_rate": 4.93034987460102e-05, + "loss": 2.6628, + "mean_token_accuracy": 0.39999999701976774, + "step": 124540 + }, + { + "epoch": 0.1254430467718396, + "grad_norm": 9.952526010113953, + "learning_rate": 4.930340623122097e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.3551724076271057, + "step": 124545 + }, + { + "epoch": 0.12544808282494377, + "grad_norm": 9.958362861784732, + "learning_rate": 4.930331371038449e-05, + "loss": 2.396, + "mean_token_accuracy": 0.4620689690113068, + "step": 124550 + }, + { + "epoch": 0.12545311887804794, + "grad_norm": 9.934306306390218, + "learning_rate": 4.9303221183500774e-05, + "loss": 2.7908, + "mean_token_accuracy": 0.3793103456497192, + "step": 124555 + }, + { + "epoch": 0.12545815493115212, + "grad_norm": 14.938526734115726, + "learning_rate": 4.930312865056985e-05, + "loss": 2.8463, + "mean_token_accuracy": 0.42758620977401735, + "step": 124560 + }, + { + "epoch": 0.1254631909842563, + "grad_norm": 9.940380520462925, + "learning_rate": 4.930303611159176e-05, + "loss": 2.1645, + "mean_token_accuracy": 0.44827585816383364, + "step": 124565 + }, + { + "epoch": 0.12546822703736046, + "grad_norm": 11.730876966483073, + "learning_rate": 4.9302943566566504e-05, + "loss": 2.6275, + "mean_token_accuracy": 0.3827586203813553, + "step": 124570 + }, + { + "epoch": 0.12547326309046464, + "grad_norm": 10.734082551112081, + "learning_rate": 4.930285101549413e-05, + "loss": 2.6104, + "mean_token_accuracy": 0.4, + "step": 124575 + }, + { + "epoch": 0.1254782991435688, + "grad_norm": 11.250192412186836, + "learning_rate": 4.930275845837465e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.4724137783050537, + "step": 124580 + }, + { + "epoch": 0.12548333519667298, + "grad_norm": 12.33069578576654, + "learning_rate": 4.930266589520809e-05, + "loss": 2.8329, + "mean_token_accuracy": 0.4103448212146759, + "step": 124585 + }, + { + "epoch": 0.12548837124977716, + "grad_norm": 11.16268165731229, + "learning_rate": 4.930257332599448e-05, + "loss": 2.4668, + "mean_token_accuracy": 0.39999999701976774, + "step": 124590 + }, + { + "epoch": 0.12549340730288133, + "grad_norm": 16.9342047945548, + "learning_rate": 4.930248075073385e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.43793103098869324, + "step": 124595 + }, + { + "epoch": 0.1254984433559855, + "grad_norm": 8.674582059178801, + "learning_rate": 4.930238816942621e-05, + "loss": 2.1273, + "mean_token_accuracy": 0.45862067937850953, + "step": 124600 + }, + { + "epoch": 0.12550347940908968, + "grad_norm": 9.246244899952336, + "learning_rate": 4.9302295582071606e-05, + "loss": 2.3476, + "mean_token_accuracy": 0.39310344457626345, + "step": 124605 + }, + { + "epoch": 0.12550851546219385, + "grad_norm": 13.81039599773027, + "learning_rate": 4.930220298867005e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.4448275864124298, + "step": 124610 + }, + { + "epoch": 0.12551355151529803, + "grad_norm": 10.68915342013985, + "learning_rate": 4.930211038922157e-05, + "loss": 2.5478, + "mean_token_accuracy": 0.4156684875488281, + "step": 124615 + }, + { + "epoch": 0.1255185875684022, + "grad_norm": 12.119654851182881, + "learning_rate": 4.93020177837262e-05, + "loss": 2.5325, + "mean_token_accuracy": 0.43448275327682495, + "step": 124620 + }, + { + "epoch": 0.12552362362150637, + "grad_norm": 10.375049626621571, + "learning_rate": 4.9301925172183954e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.46551724076271056, + "step": 124625 + }, + { + "epoch": 0.12552865967461055, + "grad_norm": 7.42633046238009, + "learning_rate": 4.930183255459486e-05, + "loss": 2.3654, + "mean_token_accuracy": 0.4551724076271057, + "step": 124630 + }, + { + "epoch": 0.12553369572771472, + "grad_norm": 10.70101445203032, + "learning_rate": 4.930173993095896e-05, + "loss": 2.2136, + "mean_token_accuracy": 0.42758620977401735, + "step": 124635 + }, + { + "epoch": 0.12553873178081887, + "grad_norm": 11.388928004284875, + "learning_rate": 4.9301647301276254e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4241379201412201, + "step": 124640 + }, + { + "epoch": 0.12554376783392304, + "grad_norm": 14.501089363676828, + "learning_rate": 4.9301554665546784e-05, + "loss": 2.8524, + "mean_token_accuracy": 0.4000000059604645, + "step": 124645 + }, + { + "epoch": 0.12554880388702722, + "grad_norm": 12.210052158703052, + "learning_rate": 4.9301462023770575e-05, + "loss": 2.691, + "mean_token_accuracy": 0.44482759237289426, + "step": 124650 + }, + { + "epoch": 0.1255538399401314, + "grad_norm": 12.062600488128512, + "learning_rate": 4.9301369375947636e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.37241379618644715, + "step": 124655 + }, + { + "epoch": 0.12555887599323556, + "grad_norm": 9.623788881445236, + "learning_rate": 4.930127672207801e-05, + "loss": 2.6817, + "mean_token_accuracy": 0.41034482717514037, + "step": 124660 + }, + { + "epoch": 0.12556391204633974, + "grad_norm": 10.865980085018004, + "learning_rate": 4.930118406216173e-05, + "loss": 2.1651, + "mean_token_accuracy": 0.47931033968925474, + "step": 124665 + }, + { + "epoch": 0.1255689480994439, + "grad_norm": 10.824042084364352, + "learning_rate": 4.9301091396198804e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.44827585816383364, + "step": 124670 + }, + { + "epoch": 0.12557398415254808, + "grad_norm": 15.09230145508599, + "learning_rate": 4.9300998724189266e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.38620689511299133, + "step": 124675 + }, + { + "epoch": 0.12557902020565226, + "grad_norm": 13.185264547451006, + "learning_rate": 4.930090604613313e-05, + "loss": 1.9523, + "mean_token_accuracy": 0.48275861144065857, + "step": 124680 + }, + { + "epoch": 0.12558405625875643, + "grad_norm": 14.11166633829688, + "learning_rate": 4.930081336203044e-05, + "loss": 2.3738, + "mean_token_accuracy": 0.42413793206214906, + "step": 124685 + }, + { + "epoch": 0.1255890923118606, + "grad_norm": 9.379666024803575, + "learning_rate": 4.930072067188121e-05, + "loss": 2.1793, + "mean_token_accuracy": 0.48275862336158754, + "step": 124690 + }, + { + "epoch": 0.12559412836496478, + "grad_norm": 11.276653192053859, + "learning_rate": 4.930062797568547e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.41724138259887694, + "step": 124695 + }, + { + "epoch": 0.12559916441806895, + "grad_norm": 10.60472390318333, + "learning_rate": 4.9300535273443244e-05, + "loss": 2.2812, + "mean_token_accuracy": 0.43793103098869324, + "step": 124700 + }, + { + "epoch": 0.12560420047117313, + "grad_norm": 10.09012991853654, + "learning_rate": 4.930044256515456e-05, + "loss": 2.267, + "mean_token_accuracy": 0.39310343861579894, + "step": 124705 + }, + { + "epoch": 0.1256092365242773, + "grad_norm": 15.513915624820415, + "learning_rate": 4.9300349850819436e-05, + "loss": 2.5037, + "mean_token_accuracy": 0.4517241418361664, + "step": 124710 + }, + { + "epoch": 0.12561427257738148, + "grad_norm": 13.02613968089426, + "learning_rate": 4.930025713043791e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.3999999940395355, + "step": 124715 + }, + { + "epoch": 0.12561930863048565, + "grad_norm": 9.649172974368424, + "learning_rate": 4.930016440401e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.482758629322052, + "step": 124720 + }, + { + "epoch": 0.12562434468358982, + "grad_norm": 9.343040850393614, + "learning_rate": 4.930007167153574e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.44482758045196535, + "step": 124725 + }, + { + "epoch": 0.125629380736694, + "grad_norm": 11.10436546194367, + "learning_rate": 4.929997893301513e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.4498487651348114, + "step": 124730 + }, + { + "epoch": 0.12563441678979817, + "grad_norm": 10.163303957350657, + "learning_rate": 4.929988618844823e-05, + "loss": 2.4618, + "mean_token_accuracy": 0.4206896543502808, + "step": 124735 + }, + { + "epoch": 0.12563945284290234, + "grad_norm": 9.585957170919789, + "learning_rate": 4.929979343783505e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.42758620381355283, + "step": 124740 + }, + { + "epoch": 0.12564448889600652, + "grad_norm": 14.754339119992299, + "learning_rate": 4.929970068117561e-05, + "loss": 2.5607, + "mean_token_accuracy": 0.4275862157344818, + "step": 124745 + }, + { + "epoch": 0.1256495249491107, + "grad_norm": 11.08677609217498, + "learning_rate": 4.929960791846994e-05, + "loss": 2.4467, + "mean_token_accuracy": 0.43448275327682495, + "step": 124750 + }, + { + "epoch": 0.12565456100221487, + "grad_norm": 10.362906917031145, + "learning_rate": 4.929951514971807e-05, + "loss": 2.3062, + "mean_token_accuracy": 0.4172413766384125, + "step": 124755 + }, + { + "epoch": 0.12565959705531904, + "grad_norm": 13.472080325072039, + "learning_rate": 4.929942237492002e-05, + "loss": 2.1709, + "mean_token_accuracy": 0.4862068951129913, + "step": 124760 + }, + { + "epoch": 0.1256646331084232, + "grad_norm": 17.291678219400506, + "learning_rate": 4.929932959407583e-05, + "loss": 2.9935, + "mean_token_accuracy": 0.3482758641242981, + "step": 124765 + }, + { + "epoch": 0.1256696691615274, + "grad_norm": 9.183559872479305, + "learning_rate": 4.9299236807185504e-05, + "loss": 2.1417, + "mean_token_accuracy": 0.4620689690113068, + "step": 124770 + }, + { + "epoch": 0.12567470521463156, + "grad_norm": 11.532906173862996, + "learning_rate": 4.929914401424909e-05, + "loss": 2.276, + "mean_token_accuracy": 0.4724137902259827, + "step": 124775 + }, + { + "epoch": 0.1256797412677357, + "grad_norm": 11.780900114022156, + "learning_rate": 4.9299051215266596e-05, + "loss": 2.226, + "mean_token_accuracy": 0.41034482717514037, + "step": 124780 + }, + { + "epoch": 0.12568477732083988, + "grad_norm": 9.762984920155311, + "learning_rate": 4.929895841023805e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.42758620381355283, + "step": 124785 + }, + { + "epoch": 0.12568981337394405, + "grad_norm": 11.369020675362362, + "learning_rate": 4.929886559916349e-05, + "loss": 2.2596, + "mean_token_accuracy": 0.42758620381355283, + "step": 124790 + }, + { + "epoch": 0.12569484942704823, + "grad_norm": 10.077714583183026, + "learning_rate": 4.929877278204292e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.4458128035068512, + "step": 124795 + }, + { + "epoch": 0.1256998854801524, + "grad_norm": 10.542518272892835, + "learning_rate": 4.929867995887638e-05, + "loss": 2.1669, + "mean_token_accuracy": 0.4637023627758026, + "step": 124800 + }, + { + "epoch": 0.12570492153325658, + "grad_norm": 11.245086223219726, + "learning_rate": 4.929858712966391e-05, + "loss": 2.3923, + "mean_token_accuracy": 0.4793103337287903, + "step": 124805 + }, + { + "epoch": 0.12570995758636075, + "grad_norm": 11.32158876893631, + "learning_rate": 4.9298494294405504e-05, + "loss": 2.251, + "mean_token_accuracy": 0.48275862336158754, + "step": 124810 + }, + { + "epoch": 0.12571499363946492, + "grad_norm": 9.263053814466303, + "learning_rate": 4.929840145310121e-05, + "loss": 3.0557, + "mean_token_accuracy": 0.3551724076271057, + "step": 124815 + }, + { + "epoch": 0.1257200296925691, + "grad_norm": 11.393053418786424, + "learning_rate": 4.9298308605751056e-05, + "loss": 2.505, + "mean_token_accuracy": 0.4379310369491577, + "step": 124820 + }, + { + "epoch": 0.12572506574567327, + "grad_norm": 10.776018376078913, + "learning_rate": 4.9298215752355056e-05, + "loss": 2.3024, + "mean_token_accuracy": 0.42758620977401735, + "step": 124825 + }, + { + "epoch": 0.12573010179877744, + "grad_norm": 10.63415756330727, + "learning_rate": 4.9298122892913234e-05, + "loss": 2.3193, + "mean_token_accuracy": 0.47767695784568787, + "step": 124830 + }, + { + "epoch": 0.12573513785188162, + "grad_norm": 10.115554482214815, + "learning_rate": 4.929803002742562e-05, + "loss": 2.3358, + "mean_token_accuracy": 0.4275862157344818, + "step": 124835 + }, + { + "epoch": 0.1257401739049858, + "grad_norm": 9.617740132383002, + "learning_rate": 4.9297937155892256e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.41530550122261045, + "step": 124840 + }, + { + "epoch": 0.12574520995808997, + "grad_norm": 11.469696881723728, + "learning_rate": 4.9297844278313134e-05, + "loss": 2.6002, + "mean_token_accuracy": 0.3931034505367279, + "step": 124845 + }, + { + "epoch": 0.12575024601119414, + "grad_norm": 12.607824152460605, + "learning_rate": 4.929775139468831e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.4517241418361664, + "step": 124850 + }, + { + "epoch": 0.1257552820642983, + "grad_norm": 10.04156455023881, + "learning_rate": 4.929765850501779e-05, + "loss": 2.303, + "mean_token_accuracy": 0.47931033968925474, + "step": 124855 + }, + { + "epoch": 0.1257603181174025, + "grad_norm": 13.204351552175593, + "learning_rate": 4.9297565609301615e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.3793103456497192, + "step": 124860 + }, + { + "epoch": 0.12576535417050666, + "grad_norm": 14.151111459345502, + "learning_rate": 4.9297472707539804e-05, + "loss": 2.5039, + "mean_token_accuracy": 0.43103447258472444, + "step": 124865 + }, + { + "epoch": 0.12577039022361083, + "grad_norm": 10.287347106515607, + "learning_rate": 4.929737979973238e-05, + "loss": 2.6358, + "mean_token_accuracy": 0.4401088833808899, + "step": 124870 + }, + { + "epoch": 0.125775426276715, + "grad_norm": 9.898973397022498, + "learning_rate": 4.929728688587937e-05, + "loss": 2.1612, + "mean_token_accuracy": 0.47241380214691164, + "step": 124875 + }, + { + "epoch": 0.12578046232981918, + "grad_norm": 12.015950662774873, + "learning_rate": 4.92971939659808e-05, + "loss": 2.102, + "mean_token_accuracy": 0.4655172348022461, + "step": 124880 + }, + { + "epoch": 0.12578549838292336, + "grad_norm": 9.199675212517754, + "learning_rate": 4.92971010400367e-05, + "loss": 2.1101, + "mean_token_accuracy": 0.45862067937850953, + "step": 124885 + }, + { + "epoch": 0.12579053443602753, + "grad_norm": 10.71574399802233, + "learning_rate": 4.92970081080471e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.44482759237289426, + "step": 124890 + }, + { + "epoch": 0.1257955704891317, + "grad_norm": 11.570246221300328, + "learning_rate": 4.929691517001201e-05, + "loss": 2.5286, + "mean_token_accuracy": 0.3896551728248596, + "step": 124895 + }, + { + "epoch": 0.12580060654223588, + "grad_norm": 12.693753011582569, + "learning_rate": 4.929682222593147e-05, + "loss": 2.3181, + "mean_token_accuracy": 0.4448275864124298, + "step": 124900 + }, + { + "epoch": 0.12580564259534005, + "grad_norm": 10.029822738281057, + "learning_rate": 4.929672927580549e-05, + "loss": 2.2635, + "mean_token_accuracy": 0.42758620977401735, + "step": 124905 + }, + { + "epoch": 0.12581067864844422, + "grad_norm": 11.466763209368024, + "learning_rate": 4.929663631963411e-05, + "loss": 2.0675, + "mean_token_accuracy": 0.4551724135875702, + "step": 124910 + }, + { + "epoch": 0.1258157147015484, + "grad_norm": 10.077935727122965, + "learning_rate": 4.929654335741736e-05, + "loss": 2.1194, + "mean_token_accuracy": 0.48965516686439514, + "step": 124915 + }, + { + "epoch": 0.12582075075465254, + "grad_norm": 12.88302234832316, + "learning_rate": 4.929645038915525e-05, + "loss": 2.7777, + "mean_token_accuracy": 0.38620689511299133, + "step": 124920 + }, + { + "epoch": 0.12582578680775672, + "grad_norm": 10.336974789921443, + "learning_rate": 4.929635741484781e-05, + "loss": 2.2853, + "mean_token_accuracy": 0.46551724076271056, + "step": 124925 + }, + { + "epoch": 0.1258308228608609, + "grad_norm": 10.71216876597711, + "learning_rate": 4.929626443449507e-05, + "loss": 2.6768, + "mean_token_accuracy": 0.4413793087005615, + "step": 124930 + }, + { + "epoch": 0.12583585891396507, + "grad_norm": 10.596842791236762, + "learning_rate": 4.9296171448097056e-05, + "loss": 2.4019, + "mean_token_accuracy": 0.4758620738983154, + "step": 124935 + }, + { + "epoch": 0.12584089496706924, + "grad_norm": 12.31550445280161, + "learning_rate": 4.92960784556538e-05, + "loss": 2.3664, + "mean_token_accuracy": 0.4103448212146759, + "step": 124940 + }, + { + "epoch": 0.1258459310201734, + "grad_norm": 9.933005697531689, + "learning_rate": 4.9295985457165314e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.4068965554237366, + "step": 124945 + }, + { + "epoch": 0.1258509670732776, + "grad_norm": 10.145829888384782, + "learning_rate": 4.929589245263163e-05, + "loss": 2.7587, + "mean_token_accuracy": 0.34137930870056155, + "step": 124950 + }, + { + "epoch": 0.12585600312638176, + "grad_norm": 12.329540906777055, + "learning_rate": 4.929579944205277e-05, + "loss": 2.4852, + "mean_token_accuracy": 0.420689657330513, + "step": 124955 + }, + { + "epoch": 0.12586103917948593, + "grad_norm": 8.66417948719421, + "learning_rate": 4.9295706425428775e-05, + "loss": 2.1594, + "mean_token_accuracy": 0.4448275864124298, + "step": 124960 + }, + { + "epoch": 0.1258660752325901, + "grad_norm": 8.781055132942296, + "learning_rate": 4.9295613402759645e-05, + "loss": 2.0965, + "mean_token_accuracy": 0.4689655065536499, + "step": 124965 + }, + { + "epoch": 0.12587111128569428, + "grad_norm": 10.36706656518254, + "learning_rate": 4.929552037404544e-05, + "loss": 2.1067, + "mean_token_accuracy": 0.4848154842853546, + "step": 124970 + }, + { + "epoch": 0.12587614733879846, + "grad_norm": 9.672273847009492, + "learning_rate": 4.929542733928615e-05, + "loss": 2.553, + "mean_token_accuracy": 0.41034482717514037, + "step": 124975 + }, + { + "epoch": 0.12588118339190263, + "grad_norm": 10.864068710218195, + "learning_rate": 4.9295334298481815e-05, + "loss": 2.2643, + "mean_token_accuracy": 0.4655172348022461, + "step": 124980 + }, + { + "epoch": 0.1258862194450068, + "grad_norm": 10.403609574552823, + "learning_rate": 4.9295241251632476e-05, + "loss": 2.2046, + "mean_token_accuracy": 0.441379314661026, + "step": 124985 + }, + { + "epoch": 0.12589125549811098, + "grad_norm": 10.583532540482432, + "learning_rate": 4.9295148198738134e-05, + "loss": 2.5579, + "mean_token_accuracy": 0.4137930989265442, + "step": 124990 + }, + { + "epoch": 0.12589629155121515, + "grad_norm": 10.344399787512279, + "learning_rate": 4.929505513979883e-05, + "loss": 2.2059, + "mean_token_accuracy": 0.4724137902259827, + "step": 124995 + }, + { + "epoch": 0.12590132760431932, + "grad_norm": 8.62941402833875, + "learning_rate": 4.929496207481459e-05, + "loss": 2.9367, + "mean_token_accuracy": 0.41034482717514037, + "step": 125000 + }, + { + "epoch": 0.1259063636574235, + "grad_norm": 11.420862615541132, + "learning_rate": 4.929486900378543e-05, + "loss": 2.5274, + "mean_token_accuracy": 0.4310344815254211, + "step": 125005 + }, + { + "epoch": 0.12591139971052767, + "grad_norm": 9.742107056323428, + "learning_rate": 4.929477592671138e-05, + "loss": 1.8625, + "mean_token_accuracy": 0.4845130205154419, + "step": 125010 + }, + { + "epoch": 0.12591643576363185, + "grad_norm": 9.898257793177857, + "learning_rate": 4.9294682843592474e-05, + "loss": 2.3977, + "mean_token_accuracy": 0.41034482717514037, + "step": 125015 + }, + { + "epoch": 0.12592147181673602, + "grad_norm": 10.394117451096738, + "learning_rate": 4.929458975442873e-05, + "loss": 2.4513, + "mean_token_accuracy": 0.4517241299152374, + "step": 125020 + }, + { + "epoch": 0.1259265078698402, + "grad_norm": 14.143012275106507, + "learning_rate": 4.929449665922017e-05, + "loss": 2.4695, + "mean_token_accuracy": 0.4620689630508423, + "step": 125025 + }, + { + "epoch": 0.12593154392294437, + "grad_norm": 13.89995037920458, + "learning_rate": 4.9294403557966835e-05, + "loss": 2.5268, + "mean_token_accuracy": 0.3999999940395355, + "step": 125030 + }, + { + "epoch": 0.12593657997604854, + "grad_norm": 11.73473090263776, + "learning_rate": 4.929431045066873e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.3793103456497192, + "step": 125035 + }, + { + "epoch": 0.12594161602915271, + "grad_norm": 14.821295776122207, + "learning_rate": 4.92942173373259e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.35172413289546967, + "step": 125040 + }, + { + "epoch": 0.1259466520822569, + "grad_norm": 10.112050581830612, + "learning_rate": 4.929412421793836e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.3793103516101837, + "step": 125045 + }, + { + "epoch": 0.12595168813536106, + "grad_norm": 10.959863838491733, + "learning_rate": 4.929403109250614e-05, + "loss": 2.4967, + "mean_token_accuracy": 0.46551724076271056, + "step": 125050 + }, + { + "epoch": 0.12595672418846524, + "grad_norm": 10.8585325509035, + "learning_rate": 4.9293937961029264e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.4034482777118683, + "step": 125055 + }, + { + "epoch": 0.12596176024156938, + "grad_norm": 10.824946314020456, + "learning_rate": 4.929384482350775e-05, + "loss": 2.491, + "mean_token_accuracy": 0.44482758045196535, + "step": 125060 + }, + { + "epoch": 0.12596679629467356, + "grad_norm": 11.690886979181048, + "learning_rate": 4.9293751679941636e-05, + "loss": 2.3739, + "mean_token_accuracy": 0.46382335424423216, + "step": 125065 + }, + { + "epoch": 0.12597183234777773, + "grad_norm": 11.006663606558181, + "learning_rate": 4.929365853033096e-05, + "loss": 2.187, + "mean_token_accuracy": 0.42413792610168455, + "step": 125070 + }, + { + "epoch": 0.1259768684008819, + "grad_norm": 9.46632307608489, + "learning_rate": 4.929356537467571e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.42068966031074523, + "step": 125075 + }, + { + "epoch": 0.12598190445398608, + "grad_norm": 12.498023763067543, + "learning_rate": 4.929347221297594e-05, + "loss": 2.6123, + "mean_token_accuracy": 0.4034482777118683, + "step": 125080 + }, + { + "epoch": 0.12598694050709025, + "grad_norm": 13.837864983827453, + "learning_rate": 4.929337904523167e-05, + "loss": 2.345, + "mean_token_accuracy": 0.4379310429096222, + "step": 125085 + }, + { + "epoch": 0.12599197656019442, + "grad_norm": 11.005831427290232, + "learning_rate": 4.929328587144292e-05, + "loss": 2.44, + "mean_token_accuracy": 0.4068965554237366, + "step": 125090 + }, + { + "epoch": 0.1259970126132986, + "grad_norm": 10.100789607065176, + "learning_rate": 4.929319269160972e-05, + "loss": 2.4461, + "mean_token_accuracy": 0.41724138259887694, + "step": 125095 + }, + { + "epoch": 0.12600204866640277, + "grad_norm": 12.183378943133926, + "learning_rate": 4.929309950573211e-05, + "loss": 2.639, + "mean_token_accuracy": 0.44827585816383364, + "step": 125100 + }, + { + "epoch": 0.12600708471950695, + "grad_norm": 10.780438410034664, + "learning_rate": 4.9293006313810095e-05, + "loss": 2.1722, + "mean_token_accuracy": 0.4448275864124298, + "step": 125105 + }, + { + "epoch": 0.12601212077261112, + "grad_norm": 10.16260196750849, + "learning_rate": 4.929291311584371e-05, + "loss": 2.1611, + "mean_token_accuracy": 0.47586206793785096, + "step": 125110 + }, + { + "epoch": 0.1260171568257153, + "grad_norm": 10.302701842205792, + "learning_rate": 4.9292819911832976e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.4172413766384125, + "step": 125115 + }, + { + "epoch": 0.12602219287881947, + "grad_norm": 9.628048275734054, + "learning_rate": 4.9292726701777924e-05, + "loss": 2.144, + "mean_token_accuracy": 0.4482758641242981, + "step": 125120 + }, + { + "epoch": 0.12602722893192364, + "grad_norm": 8.94019705587569, + "learning_rate": 4.9292633485678575e-05, + "loss": 1.9248, + "mean_token_accuracy": 0.5401088833808899, + "step": 125125 + }, + { + "epoch": 0.12603226498502781, + "grad_norm": 10.65335797697744, + "learning_rate": 4.929254026353496e-05, + "loss": 2.6787, + "mean_token_accuracy": 0.37931033968925476, + "step": 125130 + }, + { + "epoch": 0.126037301038132, + "grad_norm": 10.931902697564423, + "learning_rate": 4.929244703534711e-05, + "loss": 2.6358, + "mean_token_accuracy": 0.3965517282485962, + "step": 125135 + }, + { + "epoch": 0.12604233709123616, + "grad_norm": 10.615660359355696, + "learning_rate": 4.9292353801115034e-05, + "loss": 2.5811, + "mean_token_accuracy": 0.39310344457626345, + "step": 125140 + }, + { + "epoch": 0.12604737314434034, + "grad_norm": 11.283569643290294, + "learning_rate": 4.929226056083876e-05, + "loss": 2.4803, + "mean_token_accuracy": 0.42758620381355283, + "step": 125145 + }, + { + "epoch": 0.1260524091974445, + "grad_norm": 21.173688337041085, + "learning_rate": 4.929216731451834e-05, + "loss": 2.5651, + "mean_token_accuracy": 0.40344828069210054, + "step": 125150 + }, + { + "epoch": 0.12605744525054868, + "grad_norm": 10.645529630885825, + "learning_rate": 4.929207406215377e-05, + "loss": 2.5582, + "mean_token_accuracy": 0.4034482777118683, + "step": 125155 + }, + { + "epoch": 0.12606248130365286, + "grad_norm": 10.613206518269015, + "learning_rate": 4.92919808037451e-05, + "loss": 2.6919, + "mean_token_accuracy": 0.38275861740112305, + "step": 125160 + }, + { + "epoch": 0.12606751735675703, + "grad_norm": 9.177729620095056, + "learning_rate": 4.9291887539292324e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.4586206912994385, + "step": 125165 + }, + { + "epoch": 0.1260725534098612, + "grad_norm": 10.18131320489993, + "learning_rate": 4.92917942687955e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.4206896543502808, + "step": 125170 + }, + { + "epoch": 0.12607758946296538, + "grad_norm": 9.84210706704598, + "learning_rate": 4.9291700992254626e-05, + "loss": 2.2631, + "mean_token_accuracy": 0.43968542814254763, + "step": 125175 + }, + { + "epoch": 0.12608262551606955, + "grad_norm": 12.288679026138501, + "learning_rate": 4.9291607709669756e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.38620689511299133, + "step": 125180 + }, + { + "epoch": 0.12608766156917373, + "grad_norm": 9.6532293476981, + "learning_rate": 4.929151442104089e-05, + "loss": 2.3215, + "mean_token_accuracy": 0.41379311084747317, + "step": 125185 + }, + { + "epoch": 0.1260926976222779, + "grad_norm": 10.20791223464611, + "learning_rate": 4.9291421126368084e-05, + "loss": 2.0508, + "mean_token_accuracy": 0.47586206197738645, + "step": 125190 + }, + { + "epoch": 0.12609773367538207, + "grad_norm": 10.628761295181024, + "learning_rate": 4.9291327825651336e-05, + "loss": 2.5726, + "mean_token_accuracy": 0.4310344815254211, + "step": 125195 + }, + { + "epoch": 0.12610276972848622, + "grad_norm": 9.70843707323825, + "learning_rate": 4.929123451889068e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.44827585816383364, + "step": 125200 + }, + { + "epoch": 0.1261078057815904, + "grad_norm": 8.989997491979045, + "learning_rate": 4.929114120608615e-05, + "loss": 2.0401, + "mean_token_accuracy": 0.47241379618644713, + "step": 125205 + }, + { + "epoch": 0.12611284183469457, + "grad_norm": 10.665395672430744, + "learning_rate": 4.9291047887237754e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.4310344815254211, + "step": 125210 + }, + { + "epoch": 0.12611787788779874, + "grad_norm": 10.825138193270469, + "learning_rate": 4.929095456234554e-05, + "loss": 2.367, + "mean_token_accuracy": 0.42758620977401735, + "step": 125215 + }, + { + "epoch": 0.12612291394090291, + "grad_norm": 11.750569379245496, + "learning_rate": 4.929086123140952e-05, + "loss": 2.2446, + "mean_token_accuracy": 0.4517241358757019, + "step": 125220 + }, + { + "epoch": 0.1261279499940071, + "grad_norm": 10.331386419265293, + "learning_rate": 4.929076789442973e-05, + "loss": 2.1454, + "mean_token_accuracy": 0.4551724076271057, + "step": 125225 + }, + { + "epoch": 0.12613298604711126, + "grad_norm": 9.907021810398078, + "learning_rate": 4.929067455140618e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.4068965554237366, + "step": 125230 + }, + { + "epoch": 0.12613802210021544, + "grad_norm": 9.801439438960116, + "learning_rate": 4.929058120233891e-05, + "loss": 2.8114, + "mean_token_accuracy": 0.4034482777118683, + "step": 125235 + }, + { + "epoch": 0.1261430581533196, + "grad_norm": 10.784841116248522, + "learning_rate": 4.929048784722794e-05, + "loss": 2.1726, + "mean_token_accuracy": 0.47931033968925474, + "step": 125240 + }, + { + "epoch": 0.12614809420642378, + "grad_norm": 10.47706261029374, + "learning_rate": 4.9290394486073294e-05, + "loss": 2.4178, + "mean_token_accuracy": 0.44137930274009707, + "step": 125245 + }, + { + "epoch": 0.12615313025952796, + "grad_norm": 9.606461496733194, + "learning_rate": 4.9290301118875003e-05, + "loss": 2.3324, + "mean_token_accuracy": 0.4724137902259827, + "step": 125250 + }, + { + "epoch": 0.12615816631263213, + "grad_norm": 11.004724308533243, + "learning_rate": 4.929020774563309e-05, + "loss": 2.591, + "mean_token_accuracy": 0.358620697259903, + "step": 125255 + }, + { + "epoch": 0.1261632023657363, + "grad_norm": 9.406589197659763, + "learning_rate": 4.929011436634759e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.4551724076271057, + "step": 125260 + }, + { + "epoch": 0.12616823841884048, + "grad_norm": 16.60781137389467, + "learning_rate": 4.929002098101851e-05, + "loss": 2.735, + "mean_token_accuracy": 0.3896551638841629, + "step": 125265 + }, + { + "epoch": 0.12617327447194465, + "grad_norm": 11.919541753701893, + "learning_rate": 4.9289927589645883e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.4068965554237366, + "step": 125270 + }, + { + "epoch": 0.12617831052504883, + "grad_norm": 11.030222231153909, + "learning_rate": 4.928983419222975e-05, + "loss": 2.2932, + "mean_token_accuracy": 0.4551724076271057, + "step": 125275 + }, + { + "epoch": 0.126183346578153, + "grad_norm": 9.509624470021091, + "learning_rate": 4.928974078877012e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.382758629322052, + "step": 125280 + }, + { + "epoch": 0.12618838263125717, + "grad_norm": 9.999143345040915, + "learning_rate": 4.9289647379267024e-05, + "loss": 2.7804, + "mean_token_accuracy": 0.4, + "step": 125285 + }, + { + "epoch": 0.12619341868436135, + "grad_norm": 10.630657938917718, + "learning_rate": 4.9289553963720485e-05, + "loss": 2.4329, + "mean_token_accuracy": 0.4206896543502808, + "step": 125290 + }, + { + "epoch": 0.12619845473746552, + "grad_norm": 10.706565739502814, + "learning_rate": 4.9289460542130536e-05, + "loss": 2.504, + "mean_token_accuracy": 0.3827586114406586, + "step": 125295 + }, + { + "epoch": 0.1262034907905697, + "grad_norm": 11.87256073857122, + "learning_rate": 4.92893671144972e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.4000000059604645, + "step": 125300 + }, + { + "epoch": 0.12620852684367387, + "grad_norm": 10.824801603564477, + "learning_rate": 4.9289273680820494e-05, + "loss": 2.3819, + "mean_token_accuracy": 0.4068965494632721, + "step": 125305 + }, + { + "epoch": 0.12621356289677804, + "grad_norm": 15.291171512413726, + "learning_rate": 4.9289180241100455e-05, + "loss": 2.7953, + "mean_token_accuracy": 0.41379310488700866, + "step": 125310 + }, + { + "epoch": 0.12621859894988222, + "grad_norm": 8.183961806346213, + "learning_rate": 4.9289086795337114e-05, + "loss": 2.1634, + "mean_token_accuracy": 0.43793103098869324, + "step": 125315 + }, + { + "epoch": 0.1262236350029864, + "grad_norm": 9.322994671475339, + "learning_rate": 4.9288993343530484e-05, + "loss": 2.2329, + "mean_token_accuracy": 0.43793103098869324, + "step": 125320 + }, + { + "epoch": 0.12622867105609056, + "grad_norm": 12.536703364082399, + "learning_rate": 4.9288899885680586e-05, + "loss": 2.631, + "mean_token_accuracy": 0.42758620381355283, + "step": 125325 + }, + { + "epoch": 0.12623370710919474, + "grad_norm": 13.493922756640167, + "learning_rate": 4.928880642178747e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.4344827592372894, + "step": 125330 + }, + { + "epoch": 0.1262387431622989, + "grad_norm": 11.094542147196364, + "learning_rate": 4.928871295185114e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.39655172228813174, + "step": 125335 + }, + { + "epoch": 0.12624377921540306, + "grad_norm": 8.895421205279188, + "learning_rate": 4.928861947587163e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.42068964838981626, + "step": 125340 + }, + { + "epoch": 0.12624881526850723, + "grad_norm": 9.610954266568175, + "learning_rate": 4.928852599384896e-05, + "loss": 2.5595, + "mean_token_accuracy": 0.4344827592372894, + "step": 125345 + }, + { + "epoch": 0.1262538513216114, + "grad_norm": 10.962175878236517, + "learning_rate": 4.9288432505783175e-05, + "loss": 2.0971, + "mean_token_accuracy": 0.47586206197738645, + "step": 125350 + }, + { + "epoch": 0.12625888737471558, + "grad_norm": 14.625286193677272, + "learning_rate": 4.9288339011674276e-05, + "loss": 2.6845, + "mean_token_accuracy": 0.37241379022598264, + "step": 125355 + }, + { + "epoch": 0.12626392342781975, + "grad_norm": 12.327264770028144, + "learning_rate": 4.92882455115223e-05, + "loss": 2.7291, + "mean_token_accuracy": 0.38620689511299133, + "step": 125360 + }, + { + "epoch": 0.12626895948092393, + "grad_norm": 11.019882851579643, + "learning_rate": 4.928815200532727e-05, + "loss": 2.4473, + "mean_token_accuracy": 0.4379310369491577, + "step": 125365 + }, + { + "epoch": 0.1262739955340281, + "grad_norm": 10.90551572072697, + "learning_rate": 4.928805849308922e-05, + "loss": 2.2083, + "mean_token_accuracy": 0.4724137902259827, + "step": 125370 + }, + { + "epoch": 0.12627903158713227, + "grad_norm": 8.654918445428313, + "learning_rate": 4.928796497480817e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.4551724135875702, + "step": 125375 + }, + { + "epoch": 0.12628406764023645, + "grad_norm": 14.226194401493888, + "learning_rate": 4.9287871450484146e-05, + "loss": 2.368, + "mean_token_accuracy": 0.4172413766384125, + "step": 125380 + }, + { + "epoch": 0.12628910369334062, + "grad_norm": 9.501190271952192, + "learning_rate": 4.9287777920117176e-05, + "loss": 2.5419, + "mean_token_accuracy": 0.42068964838981626, + "step": 125385 + }, + { + "epoch": 0.1262941397464448, + "grad_norm": 14.654793063213871, + "learning_rate": 4.928768438370728e-05, + "loss": 2.5638, + "mean_token_accuracy": 0.441379314661026, + "step": 125390 + }, + { + "epoch": 0.12629917579954897, + "grad_norm": 7.9908159369372385, + "learning_rate": 4.9287590841254493e-05, + "loss": 2.2217, + "mean_token_accuracy": 0.4517241358757019, + "step": 125395 + }, + { + "epoch": 0.12630421185265314, + "grad_norm": 11.148710354915991, + "learning_rate": 4.9287497292758836e-05, + "loss": 2.4963, + "mean_token_accuracy": 0.4186932921409607, + "step": 125400 + }, + { + "epoch": 0.12630924790575732, + "grad_norm": 10.61996935053498, + "learning_rate": 4.928740373822034e-05, + "loss": 2.538, + "mean_token_accuracy": 0.4468844473361969, + "step": 125405 + }, + { + "epoch": 0.1263142839588615, + "grad_norm": 11.302727946872293, + "learning_rate": 4.928731017763902e-05, + "loss": 1.9627, + "mean_token_accuracy": 0.44646098613739016, + "step": 125410 + }, + { + "epoch": 0.12631932001196566, + "grad_norm": 12.82476906672126, + "learning_rate": 4.92872166110149e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4413793087005615, + "step": 125415 + }, + { + "epoch": 0.12632435606506984, + "grad_norm": 10.035710143545849, + "learning_rate": 4.928712303834803e-05, + "loss": 2.4011, + "mean_token_accuracy": 0.43103447556495667, + "step": 125420 + }, + { + "epoch": 0.126329392118174, + "grad_norm": 10.420871992617379, + "learning_rate": 4.9287029459638414e-05, + "loss": 2.2528, + "mean_token_accuracy": 0.4288566291332245, + "step": 125425 + }, + { + "epoch": 0.12633442817127818, + "grad_norm": 11.096351566011899, + "learning_rate": 4.928693587488608e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.46551724672317507, + "step": 125430 + }, + { + "epoch": 0.12633946422438236, + "grad_norm": 14.962162675219886, + "learning_rate": 4.928684228409107e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.37241379022598264, + "step": 125435 + }, + { + "epoch": 0.12634450027748653, + "grad_norm": 9.744584065333749, + "learning_rate": 4.9286748687253384e-05, + "loss": 2.0041, + "mean_token_accuracy": 0.4778325021266937, + "step": 125440 + }, + { + "epoch": 0.1263495363305907, + "grad_norm": 9.188978448825495, + "learning_rate": 4.928665508437307e-05, + "loss": 2.1169, + "mean_token_accuracy": 0.4896551728248596, + "step": 125445 + }, + { + "epoch": 0.12635457238369488, + "grad_norm": 9.679030618989241, + "learning_rate": 4.928656147545014e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.4367211163043976, + "step": 125450 + }, + { + "epoch": 0.12635960843679905, + "grad_norm": 9.980158435699172, + "learning_rate": 4.928646786048463e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.46358137130737304, + "step": 125455 + }, + { + "epoch": 0.12636464448990323, + "grad_norm": 10.314676378958742, + "learning_rate": 4.9286374239476555e-05, + "loss": 2.7281, + "mean_token_accuracy": 0.4034482777118683, + "step": 125460 + }, + { + "epoch": 0.1263696805430074, + "grad_norm": 11.940209851895457, + "learning_rate": 4.9286280612425964e-05, + "loss": 2.5419, + "mean_token_accuracy": 0.39655172228813174, + "step": 125465 + }, + { + "epoch": 0.12637471659611157, + "grad_norm": 11.328375333734817, + "learning_rate": 4.928618697933286e-05, + "loss": 2.5356, + "mean_token_accuracy": 0.36896551549434664, + "step": 125470 + }, + { + "epoch": 0.12637975264921575, + "grad_norm": 9.765939095629442, + "learning_rate": 4.928609334019727e-05, + "loss": 2.39, + "mean_token_accuracy": 0.482758617401123, + "step": 125475 + }, + { + "epoch": 0.1263847887023199, + "grad_norm": 11.57498601297852, + "learning_rate": 4.928599969501922e-05, + "loss": 2.0353, + "mean_token_accuracy": 0.4931034445762634, + "step": 125480 + }, + { + "epoch": 0.12638982475542407, + "grad_norm": 12.448634704061009, + "learning_rate": 4.928590604379875e-05, + "loss": 2.6564, + "mean_token_accuracy": 0.4275862157344818, + "step": 125485 + }, + { + "epoch": 0.12639486080852824, + "grad_norm": 14.055194407238002, + "learning_rate": 4.9285812386535884e-05, + "loss": 2.3279, + "mean_token_accuracy": 0.4448275864124298, + "step": 125490 + }, + { + "epoch": 0.12639989686163242, + "grad_norm": 11.505836065007093, + "learning_rate": 4.9285718723230636e-05, + "loss": 2.6071, + "mean_token_accuracy": 0.4247537016868591, + "step": 125495 + }, + { + "epoch": 0.1264049329147366, + "grad_norm": 11.697771985600228, + "learning_rate": 4.928562505388304e-05, + "loss": 2.153, + "mean_token_accuracy": 0.43103448748588563, + "step": 125500 + }, + { + "epoch": 0.12640996896784076, + "grad_norm": 9.868091003841489, + "learning_rate": 4.928553137849311e-05, + "loss": 2.6435, + "mean_token_accuracy": 0.3689655244350433, + "step": 125505 + }, + { + "epoch": 0.12641500502094494, + "grad_norm": 11.493527042101217, + "learning_rate": 4.928543769706089e-05, + "loss": 2.6694, + "mean_token_accuracy": 0.4103448152542114, + "step": 125510 + }, + { + "epoch": 0.1264200410740491, + "grad_norm": 9.167007892768117, + "learning_rate": 4.92853440095864e-05, + "loss": 2.2183, + "mean_token_accuracy": 0.46551724076271056, + "step": 125515 + }, + { + "epoch": 0.12642507712715328, + "grad_norm": 21.044905122756113, + "learning_rate": 4.928525031606966e-05, + "loss": 2.4057, + "mean_token_accuracy": 0.42758620977401735, + "step": 125520 + }, + { + "epoch": 0.12643011318025746, + "grad_norm": 9.62231005158025, + "learning_rate": 4.9285156616510694e-05, + "loss": 2.578, + "mean_token_accuracy": 0.4413793087005615, + "step": 125525 + }, + { + "epoch": 0.12643514923336163, + "grad_norm": 9.952977222937193, + "learning_rate": 4.928506291090954e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.4379310369491577, + "step": 125530 + }, + { + "epoch": 0.1264401852864658, + "grad_norm": 11.778850765784465, + "learning_rate": 4.928496919926622e-05, + "loss": 2.8928, + "mean_token_accuracy": 0.37241379022598264, + "step": 125535 + }, + { + "epoch": 0.12644522133956998, + "grad_norm": 11.751936236388296, + "learning_rate": 4.928487548158075e-05, + "loss": 2.6896, + "mean_token_accuracy": 0.3999999940395355, + "step": 125540 + }, + { + "epoch": 0.12645025739267415, + "grad_norm": 11.833655397496972, + "learning_rate": 4.928478175785317e-05, + "loss": 2.681, + "mean_token_accuracy": 0.39310344457626345, + "step": 125545 + }, + { + "epoch": 0.12645529344577833, + "grad_norm": 8.958547432045819, + "learning_rate": 4.928468802808349e-05, + "loss": 1.9739, + "mean_token_accuracy": 0.5034482836723327, + "step": 125550 + }, + { + "epoch": 0.1264603294988825, + "grad_norm": 11.790643284422298, + "learning_rate": 4.928459429227176e-05, + "loss": 2.6511, + "mean_token_accuracy": 0.334482753276825, + "step": 125555 + }, + { + "epoch": 0.12646536555198667, + "grad_norm": 12.113374873844297, + "learning_rate": 4.9284500550417986e-05, + "loss": 2.6445, + "mean_token_accuracy": 0.4, + "step": 125560 + }, + { + "epoch": 0.12647040160509085, + "grad_norm": 9.572315590352302, + "learning_rate": 4.9284406802522194e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.4379310369491577, + "step": 125565 + }, + { + "epoch": 0.12647543765819502, + "grad_norm": 17.7582823476348, + "learning_rate": 4.928431304858442e-05, + "loss": 3.2016, + "mean_token_accuracy": 0.3999999940395355, + "step": 125570 + }, + { + "epoch": 0.1264804737112992, + "grad_norm": 9.815793387767407, + "learning_rate": 4.928421928860468e-05, + "loss": 2.3778, + "mean_token_accuracy": 0.4551724135875702, + "step": 125575 + }, + { + "epoch": 0.12648550976440337, + "grad_norm": 11.28466485145402, + "learning_rate": 4.928412552258301e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.48275861144065857, + "step": 125580 + }, + { + "epoch": 0.12649054581750754, + "grad_norm": 12.078761570883694, + "learning_rate": 4.9284031750519434e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.44827585816383364, + "step": 125585 + }, + { + "epoch": 0.12649558187061172, + "grad_norm": 10.881890212445043, + "learning_rate": 4.928393797241397e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.4344827592372894, + "step": 125590 + }, + { + "epoch": 0.1265006179237159, + "grad_norm": 10.144265724688271, + "learning_rate": 4.9283844188266657e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.47586206793785096, + "step": 125595 + }, + { + "epoch": 0.12650565397682006, + "grad_norm": 9.992885385037306, + "learning_rate": 4.9283750398077507e-05, + "loss": 2.138, + "mean_token_accuracy": 0.4517241418361664, + "step": 125600 + }, + { + "epoch": 0.12651069002992424, + "grad_norm": 11.102061507671314, + "learning_rate": 4.928365660184656e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.3896551728248596, + "step": 125605 + }, + { + "epoch": 0.1265157260830284, + "grad_norm": 10.846652540957106, + "learning_rate": 4.9283562799573826e-05, + "loss": 2.3766, + "mean_token_accuracy": 0.46551724076271056, + "step": 125610 + }, + { + "epoch": 0.1265207621361326, + "grad_norm": 10.072721302397126, + "learning_rate": 4.9283468991259344e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.42413792610168455, + "step": 125615 + }, + { + "epoch": 0.12652579818923673, + "grad_norm": 11.758240459043593, + "learning_rate": 4.9283375176903124e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.4, + "step": 125620 + }, + { + "epoch": 0.1265308342423409, + "grad_norm": 10.5553537837584, + "learning_rate": 4.9283281356505215e-05, + "loss": 2.0944, + "mean_token_accuracy": 0.44482758045196535, + "step": 125625 + }, + { + "epoch": 0.12653587029544508, + "grad_norm": 20.73416552728234, + "learning_rate": 4.9283187530065636e-05, + "loss": 2.3979, + "mean_token_accuracy": 0.3896551728248596, + "step": 125630 + }, + { + "epoch": 0.12654090634854925, + "grad_norm": 12.130696116556242, + "learning_rate": 4.92830936975844e-05, + "loss": 2.6901, + "mean_token_accuracy": 0.42758620977401735, + "step": 125635 + }, + { + "epoch": 0.12654594240165343, + "grad_norm": 11.034498458822736, + "learning_rate": 4.928299985906155e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.4310344934463501, + "step": 125640 + }, + { + "epoch": 0.1265509784547576, + "grad_norm": 10.19478699878888, + "learning_rate": 4.9282906014497096e-05, + "loss": 2.0181, + "mean_token_accuracy": 0.4862068951129913, + "step": 125645 + }, + { + "epoch": 0.12655601450786177, + "grad_norm": 10.351644092984687, + "learning_rate": 4.928281216389107e-05, + "loss": 2.1446, + "mean_token_accuracy": 0.5034482836723327, + "step": 125650 + }, + { + "epoch": 0.12656105056096595, + "grad_norm": 9.809617944629604, + "learning_rate": 4.9282718307243506e-05, + "loss": 2.2823, + "mean_token_accuracy": 0.46560150384902954, + "step": 125655 + }, + { + "epoch": 0.12656608661407012, + "grad_norm": 10.746901483546806, + "learning_rate": 4.928262444455443e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.4310344815254211, + "step": 125660 + }, + { + "epoch": 0.1265711226671743, + "grad_norm": 10.798428574052325, + "learning_rate": 4.928253057582385e-05, + "loss": 2.5561, + "mean_token_accuracy": 0.43793103098869324, + "step": 125665 + }, + { + "epoch": 0.12657615872027847, + "grad_norm": 10.845569390647395, + "learning_rate": 4.9282436701051806e-05, + "loss": 2.5372, + "mean_token_accuracy": 0.42068966031074523, + "step": 125670 + }, + { + "epoch": 0.12658119477338264, + "grad_norm": 10.442403129634714, + "learning_rate": 4.928234282023832e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.41379310488700866, + "step": 125675 + }, + { + "epoch": 0.12658623082648682, + "grad_norm": 9.695632224657807, + "learning_rate": 4.928224893338342e-05, + "loss": 1.9068, + "mean_token_accuracy": 0.5310344696044922, + "step": 125680 + }, + { + "epoch": 0.126591266879591, + "grad_norm": 11.866102981580292, + "learning_rate": 4.928215504048714e-05, + "loss": 2.6371, + "mean_token_accuracy": 0.42413793206214906, + "step": 125685 + }, + { + "epoch": 0.12659630293269516, + "grad_norm": 14.304503777762402, + "learning_rate": 4.92820611415495e-05, + "loss": 2.6484, + "mean_token_accuracy": 0.43309134244918823, + "step": 125690 + }, + { + "epoch": 0.12660133898579934, + "grad_norm": 10.683574373781973, + "learning_rate": 4.928196723657051e-05, + "loss": 2.1932, + "mean_token_accuracy": 0.4344827592372894, + "step": 125695 + }, + { + "epoch": 0.1266063750389035, + "grad_norm": 9.647674244008142, + "learning_rate": 4.928187332555023e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.45517241954803467, + "step": 125700 + }, + { + "epoch": 0.1266114110920077, + "grad_norm": 11.951268303104852, + "learning_rate": 4.928177940848865e-05, + "loss": 2.3583, + "mean_token_accuracy": 0.41034482419490814, + "step": 125705 + }, + { + "epoch": 0.12661644714511186, + "grad_norm": 11.867609282460064, + "learning_rate": 4.928168548538581e-05, + "loss": 2.466, + "mean_token_accuracy": 0.36551723480224607, + "step": 125710 + }, + { + "epoch": 0.12662148319821603, + "grad_norm": 10.59241334516758, + "learning_rate": 4.9281591556241754e-05, + "loss": 2.4093, + "mean_token_accuracy": 0.4034482777118683, + "step": 125715 + }, + { + "epoch": 0.1266265192513202, + "grad_norm": 9.858919147885919, + "learning_rate": 4.9281497621056474e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.42413792610168455, + "step": 125720 + }, + { + "epoch": 0.12663155530442438, + "grad_norm": 12.625832124887527, + "learning_rate": 4.9281403679830026e-05, + "loss": 2.8458, + "mean_token_accuracy": 0.41034482717514037, + "step": 125725 + }, + { + "epoch": 0.12663659135752856, + "grad_norm": 10.127678075054163, + "learning_rate": 4.928130973256242e-05, + "loss": 2.0577, + "mean_token_accuracy": 0.46551724076271056, + "step": 125730 + }, + { + "epoch": 0.12664162741063273, + "grad_norm": 13.32731949070172, + "learning_rate": 4.9281215779253695e-05, + "loss": 3.0282, + "mean_token_accuracy": 0.40000000298023225, + "step": 125735 + }, + { + "epoch": 0.1266466634637369, + "grad_norm": 11.92925612436529, + "learning_rate": 4.928112181990386e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.41034482717514037, + "step": 125740 + }, + { + "epoch": 0.12665169951684108, + "grad_norm": 15.30487666368016, + "learning_rate": 4.9281027854512945e-05, + "loss": 2.6929, + "mean_token_accuracy": 0.43448275327682495, + "step": 125745 + }, + { + "epoch": 0.12665673556994525, + "grad_norm": 8.888813138442623, + "learning_rate": 4.928093388308099e-05, + "loss": 2.357, + "mean_token_accuracy": 0.42758620977401735, + "step": 125750 + }, + { + "epoch": 0.12666177162304942, + "grad_norm": 11.469065745541219, + "learning_rate": 4.928083990560801e-05, + "loss": 2.2921, + "mean_token_accuracy": 0.44137930274009707, + "step": 125755 + }, + { + "epoch": 0.12666680767615357, + "grad_norm": 9.732309707326875, + "learning_rate": 4.928074592209403e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.3965517282485962, + "step": 125760 + }, + { + "epoch": 0.12667184372925774, + "grad_norm": 12.14781775355426, + "learning_rate": 4.928065193253908e-05, + "loss": 2.7583, + "mean_token_accuracy": 0.41034482717514037, + "step": 125765 + }, + { + "epoch": 0.12667687978236192, + "grad_norm": 9.858763586687843, + "learning_rate": 4.928055793694319e-05, + "loss": 2.2642, + "mean_token_accuracy": 0.4813672125339508, + "step": 125770 + }, + { + "epoch": 0.1266819158354661, + "grad_norm": 10.476294704595, + "learning_rate": 4.928046393530637e-05, + "loss": 2.4076, + "mean_token_accuracy": 0.44482757449150084, + "step": 125775 + }, + { + "epoch": 0.12668695188857027, + "grad_norm": 9.384607881273848, + "learning_rate": 4.9280369927628666e-05, + "loss": 2.2854, + "mean_token_accuracy": 0.47241379618644713, + "step": 125780 + }, + { + "epoch": 0.12669198794167444, + "grad_norm": 10.989054703167026, + "learning_rate": 4.928027591391009e-05, + "loss": 2.4529, + "mean_token_accuracy": 0.43103447556495667, + "step": 125785 + }, + { + "epoch": 0.1266970239947786, + "grad_norm": 11.787620489354616, + "learning_rate": 4.928018189415068e-05, + "loss": 2.2382, + "mean_token_accuracy": 0.4517241418361664, + "step": 125790 + }, + { + "epoch": 0.1267020600478828, + "grad_norm": 13.057123464911285, + "learning_rate": 4.928008786835045e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.4483968555927277, + "step": 125795 + }, + { + "epoch": 0.12670709610098696, + "grad_norm": 11.90658399725102, + "learning_rate": 4.9279993836509434e-05, + "loss": 2.5213, + "mean_token_accuracy": 0.46751360297203065, + "step": 125800 + }, + { + "epoch": 0.12671213215409113, + "grad_norm": 11.202323943168969, + "learning_rate": 4.9279899798627657e-05, + "loss": 3.0484, + "mean_token_accuracy": 0.42413793206214906, + "step": 125805 + }, + { + "epoch": 0.1267171682071953, + "grad_norm": 11.892098534929213, + "learning_rate": 4.927980575470513e-05, + "loss": 2.4668, + "mean_token_accuracy": 0.3896551728248596, + "step": 125810 + }, + { + "epoch": 0.12672220426029948, + "grad_norm": 10.765266469873094, + "learning_rate": 4.927971170474191e-05, + "loss": 2.5185, + "mean_token_accuracy": 0.4241379380226135, + "step": 125815 + }, + { + "epoch": 0.12672724031340366, + "grad_norm": 11.553713110870778, + "learning_rate": 4.9279617648738e-05, + "loss": 2.1007, + "mean_token_accuracy": 0.482758617401123, + "step": 125820 + }, + { + "epoch": 0.12673227636650783, + "grad_norm": 9.135975429448365, + "learning_rate": 4.9279523586693425e-05, + "loss": 1.9383, + "mean_token_accuracy": 0.4965517222881317, + "step": 125825 + }, + { + "epoch": 0.126737312419612, + "grad_norm": 11.53238773394369, + "learning_rate": 4.927942951860823e-05, + "loss": 2.6553, + "mean_token_accuracy": 0.4172413766384125, + "step": 125830 + }, + { + "epoch": 0.12674234847271618, + "grad_norm": 11.28761532049412, + "learning_rate": 4.927933544448242e-05, + "loss": 2.4189, + "mean_token_accuracy": 0.47586206793785096, + "step": 125835 + }, + { + "epoch": 0.12674738452582035, + "grad_norm": 9.666054691713788, + "learning_rate": 4.927924136431603e-05, + "loss": 2.0141, + "mean_token_accuracy": 0.517241382598877, + "step": 125840 + }, + { + "epoch": 0.12675242057892452, + "grad_norm": 10.877253332283415, + "learning_rate": 4.9279147278109095e-05, + "loss": 2.3106, + "mean_token_accuracy": 0.4103448212146759, + "step": 125845 + }, + { + "epoch": 0.1267574566320287, + "grad_norm": 10.64344925255087, + "learning_rate": 4.927905318586162e-05, + "loss": 2.3265, + "mean_token_accuracy": 0.4206896543502808, + "step": 125850 + }, + { + "epoch": 0.12676249268513287, + "grad_norm": 12.002022163480463, + "learning_rate": 4.9278959087573654e-05, + "loss": 2.4491, + "mean_token_accuracy": 0.3896551728248596, + "step": 125855 + }, + { + "epoch": 0.12676752873823705, + "grad_norm": 10.491001765712781, + "learning_rate": 4.9278864983245205e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.4172413766384125, + "step": 125860 + }, + { + "epoch": 0.12677256479134122, + "grad_norm": 8.231952688544519, + "learning_rate": 4.927877087287631e-05, + "loss": 2.2266, + "mean_token_accuracy": 0.4620689570903778, + "step": 125865 + }, + { + "epoch": 0.1267776008444454, + "grad_norm": 12.772871663660506, + "learning_rate": 4.927867675646699e-05, + "loss": 2.5991, + "mean_token_accuracy": 0.3689655214548111, + "step": 125870 + }, + { + "epoch": 0.12678263689754957, + "grad_norm": 10.57896307688069, + "learning_rate": 4.927858263401727e-05, + "loss": 2.2857, + "mean_token_accuracy": 0.42758620977401735, + "step": 125875 + }, + { + "epoch": 0.12678767295065374, + "grad_norm": 9.537771282310928, + "learning_rate": 4.9278488505527186e-05, + "loss": 2.2944, + "mean_token_accuracy": 0.47241379618644713, + "step": 125880 + }, + { + "epoch": 0.12679270900375791, + "grad_norm": 12.42032507431532, + "learning_rate": 4.927839437099675e-05, + "loss": 2.7396, + "mean_token_accuracy": 0.4068965494632721, + "step": 125885 + }, + { + "epoch": 0.1267977450568621, + "grad_norm": 11.367172733771271, + "learning_rate": 4.927830023042601e-05, + "loss": 2.7015, + "mean_token_accuracy": 0.38620689511299133, + "step": 125890 + }, + { + "epoch": 0.12680278110996626, + "grad_norm": 8.617554411637004, + "learning_rate": 4.927820608381496e-05, + "loss": 2.1608, + "mean_token_accuracy": 0.4586206912994385, + "step": 125895 + }, + { + "epoch": 0.1268078171630704, + "grad_norm": 12.765817488545972, + "learning_rate": 4.927811193116364e-05, + "loss": 2.8886, + "mean_token_accuracy": 0.3758620709180832, + "step": 125900 + }, + { + "epoch": 0.12681285321617458, + "grad_norm": 12.779562712700741, + "learning_rate": 4.927801777247209e-05, + "loss": 1.9414, + "mean_token_accuracy": 0.5331518352031708, + "step": 125905 + }, + { + "epoch": 0.12681788926927876, + "grad_norm": 10.848155771989825, + "learning_rate": 4.9277923607740326e-05, + "loss": 2.7266, + "mean_token_accuracy": 0.3931034505367279, + "step": 125910 + }, + { + "epoch": 0.12682292532238293, + "grad_norm": 15.294449732672954, + "learning_rate": 4.927782943696838e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.46551724076271056, + "step": 125915 + }, + { + "epoch": 0.1268279613754871, + "grad_norm": 10.61165736207434, + "learning_rate": 4.927773526015626e-05, + "loss": 2.274, + "mean_token_accuracy": 0.4780399203300476, + "step": 125920 + }, + { + "epoch": 0.12683299742859128, + "grad_norm": 10.924617714631845, + "learning_rate": 4.9277641077304004e-05, + "loss": 2.7693, + "mean_token_accuracy": 0.44295220375061034, + "step": 125925 + }, + { + "epoch": 0.12683803348169545, + "grad_norm": 8.537161209579299, + "learning_rate": 4.927754688841164e-05, + "loss": 1.8594, + "mean_token_accuracy": 0.5310344874858857, + "step": 125930 + }, + { + "epoch": 0.12684306953479962, + "grad_norm": 10.52313436943759, + "learning_rate": 4.9277452693479195e-05, + "loss": 2.4275, + "mean_token_accuracy": 0.441379314661026, + "step": 125935 + }, + { + "epoch": 0.1268481055879038, + "grad_norm": 11.018736252574792, + "learning_rate": 4.927735849250669e-05, + "loss": 2.6516, + "mean_token_accuracy": 0.4137930989265442, + "step": 125940 + }, + { + "epoch": 0.12685314164100797, + "grad_norm": 11.05401076865327, + "learning_rate": 4.927726428549416e-05, + "loss": 2.5095, + "mean_token_accuracy": 0.43448275327682495, + "step": 125945 + }, + { + "epoch": 0.12685817769411215, + "grad_norm": 10.578709637466218, + "learning_rate": 4.927717007244161e-05, + "loss": 2.379, + "mean_token_accuracy": 0.4862069010734558, + "step": 125950 + }, + { + "epoch": 0.12686321374721632, + "grad_norm": 9.160598917720971, + "learning_rate": 4.927707585334909e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.4275862008333206, + "step": 125955 + }, + { + "epoch": 0.1268682498003205, + "grad_norm": 12.49224326408125, + "learning_rate": 4.9276981628216626e-05, + "loss": 2.8845, + "mean_token_accuracy": 0.3655172407627106, + "step": 125960 + }, + { + "epoch": 0.12687328585342467, + "grad_norm": 9.7104464524469, + "learning_rate": 4.927688739704423e-05, + "loss": 2.0831, + "mean_token_accuracy": 0.47931034564971925, + "step": 125965 + }, + { + "epoch": 0.12687832190652884, + "grad_norm": 11.040785223097275, + "learning_rate": 4.927679315983192e-05, + "loss": 2.6082, + "mean_token_accuracy": 0.43103448748588563, + "step": 125970 + }, + { + "epoch": 0.12688335795963301, + "grad_norm": 12.631956779654836, + "learning_rate": 4.9276698916579736e-05, + "loss": 2.0813, + "mean_token_accuracy": 0.5, + "step": 125975 + }, + { + "epoch": 0.1268883940127372, + "grad_norm": 10.699209321960875, + "learning_rate": 4.927660466728772e-05, + "loss": 2.5479, + "mean_token_accuracy": 0.4413793087005615, + "step": 125980 + }, + { + "epoch": 0.12689343006584136, + "grad_norm": 11.853747468741728, + "learning_rate": 4.9276510411955865e-05, + "loss": 2.1796, + "mean_token_accuracy": 0.4551724135875702, + "step": 125985 + }, + { + "epoch": 0.12689846611894554, + "grad_norm": 11.635591701468837, + "learning_rate": 4.927641615058422e-05, + "loss": 2.0917, + "mean_token_accuracy": 0.48965516686439514, + "step": 125990 + }, + { + "epoch": 0.1269035021720497, + "grad_norm": 12.007231326699971, + "learning_rate": 4.927632188317281e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.43448275327682495, + "step": 125995 + }, + { + "epoch": 0.12690853822515388, + "grad_norm": 10.520497902007655, + "learning_rate": 4.9276227609721646e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.4689655125141144, + "step": 126000 + }, + { + "epoch": 0.12691357427825806, + "grad_norm": 15.20270641096983, + "learning_rate": 4.9276133330230774e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.42758620977401735, + "step": 126005 + }, + { + "epoch": 0.12691861033136223, + "grad_norm": 9.32013585696496, + "learning_rate": 4.9276039044700206e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.4310344815254211, + "step": 126010 + }, + { + "epoch": 0.1269236463844664, + "grad_norm": 11.859055222617217, + "learning_rate": 4.9275944753129974e-05, + "loss": 2.4756, + "mean_token_accuracy": 0.4172413766384125, + "step": 126015 + }, + { + "epoch": 0.12692868243757058, + "grad_norm": 8.500413637966444, + "learning_rate": 4.927585045552009e-05, + "loss": 2.1482, + "mean_token_accuracy": 0.46551724076271056, + "step": 126020 + }, + { + "epoch": 0.12693371849067475, + "grad_norm": 10.795854196155297, + "learning_rate": 4.927575615187061e-05, + "loss": 2.2955, + "mean_token_accuracy": 0.4310344815254211, + "step": 126025 + }, + { + "epoch": 0.12693875454377893, + "grad_norm": 11.676153816267155, + "learning_rate": 4.9275661842181535e-05, + "loss": 2.6862, + "mean_token_accuracy": 0.41034482717514037, + "step": 126030 + }, + { + "epoch": 0.1269437905968831, + "grad_norm": 10.209753627457854, + "learning_rate": 4.92755675264529e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.4103448212146759, + "step": 126035 + }, + { + "epoch": 0.12694882664998725, + "grad_norm": 9.752392978485547, + "learning_rate": 4.927547320468473e-05, + "loss": 2.8925, + "mean_token_accuracy": 0.3379310339689255, + "step": 126040 + }, + { + "epoch": 0.12695386270309142, + "grad_norm": 11.263508757835163, + "learning_rate": 4.927537887687705e-05, + "loss": 2.2759, + "mean_token_accuracy": 0.4482758641242981, + "step": 126045 + }, + { + "epoch": 0.1269588987561956, + "grad_norm": 11.100882632609176, + "learning_rate": 4.927528454302989e-05, + "loss": 2.34, + "mean_token_accuracy": 0.4034482717514038, + "step": 126050 + }, + { + "epoch": 0.12696393480929977, + "grad_norm": 10.708809656733335, + "learning_rate": 4.927519020314327e-05, + "loss": 1.9983, + "mean_token_accuracy": 0.4862069070339203, + "step": 126055 + }, + { + "epoch": 0.12696897086240394, + "grad_norm": 9.836202796063645, + "learning_rate": 4.927509585721722e-05, + "loss": 2.183, + "mean_token_accuracy": 0.46382336020469667, + "step": 126060 + }, + { + "epoch": 0.12697400691550811, + "grad_norm": 13.299203499541346, + "learning_rate": 4.927500150525177e-05, + "loss": 3.0111, + "mean_token_accuracy": 0.3931034505367279, + "step": 126065 + }, + { + "epoch": 0.1269790429686123, + "grad_norm": 11.356182211108925, + "learning_rate": 4.927490714724694e-05, + "loss": 2.5998, + "mean_token_accuracy": 0.36896551251411436, + "step": 126070 + }, + { + "epoch": 0.12698407902171646, + "grad_norm": 10.975556575741773, + "learning_rate": 4.927481278320276e-05, + "loss": 2.3455, + "mean_token_accuracy": 0.4379310429096222, + "step": 126075 + }, + { + "epoch": 0.12698911507482064, + "grad_norm": 13.378187000547074, + "learning_rate": 4.927471841311925e-05, + "loss": 2.3857, + "mean_token_accuracy": 0.4517241299152374, + "step": 126080 + }, + { + "epoch": 0.1269941511279248, + "grad_norm": 12.900145296320545, + "learning_rate": 4.9274624036996444e-05, + "loss": 2.591, + "mean_token_accuracy": 0.39655172228813174, + "step": 126085 + }, + { + "epoch": 0.12699918718102898, + "grad_norm": 10.271377444815963, + "learning_rate": 4.9274529654834365e-05, + "loss": 2.9004, + "mean_token_accuracy": 0.3655172437429428, + "step": 126090 + }, + { + "epoch": 0.12700422323413316, + "grad_norm": 9.310323407261366, + "learning_rate": 4.927443526663304e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.4551724135875702, + "step": 126095 + }, + { + "epoch": 0.12700925928723733, + "grad_norm": 11.23852980567283, + "learning_rate": 4.927434087239249e-05, + "loss": 2.2402, + "mean_token_accuracy": 0.4329098641872406, + "step": 126100 + }, + { + "epoch": 0.1270142953403415, + "grad_norm": 11.554499275540394, + "learning_rate": 4.927424647211274e-05, + "loss": 3.3686, + "mean_token_accuracy": 0.3482758581638336, + "step": 126105 + }, + { + "epoch": 0.12701933139344568, + "grad_norm": 12.759030625814802, + "learning_rate": 4.927415206579384e-05, + "loss": 2.4493, + "mean_token_accuracy": 0.3896551728248596, + "step": 126110 + }, + { + "epoch": 0.12702436744654985, + "grad_norm": 8.250148875832812, + "learning_rate": 4.927405765343579e-05, + "loss": 2.4197, + "mean_token_accuracy": 0.4310344815254211, + "step": 126115 + }, + { + "epoch": 0.12702940349965403, + "grad_norm": 9.495345883736034, + "learning_rate": 4.9273963235038614e-05, + "loss": 2.4691, + "mean_token_accuracy": 0.4310344815254211, + "step": 126120 + }, + { + "epoch": 0.1270344395527582, + "grad_norm": 10.927679756824796, + "learning_rate": 4.927386881060236e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.39655172228813174, + "step": 126125 + }, + { + "epoch": 0.12703947560586237, + "grad_norm": 11.57393533609862, + "learning_rate": 4.927377438012703e-05, + "loss": 2.4389, + "mean_token_accuracy": 0.4413793087005615, + "step": 126130 + }, + { + "epoch": 0.12704451165896655, + "grad_norm": 11.680836305810228, + "learning_rate": 4.927367994361267e-05, + "loss": 2.9498, + "mean_token_accuracy": 0.4172413766384125, + "step": 126135 + }, + { + "epoch": 0.12704954771207072, + "grad_norm": 8.774422747857063, + "learning_rate": 4.927358550105929e-05, + "loss": 2.2867, + "mean_token_accuracy": 0.4586206912994385, + "step": 126140 + }, + { + "epoch": 0.1270545837651749, + "grad_norm": 9.592288354276853, + "learning_rate": 4.9273491052466946e-05, + "loss": 2.5243, + "mean_token_accuracy": 0.4034482717514038, + "step": 126145 + }, + { + "epoch": 0.12705961981827907, + "grad_norm": 10.222573659945471, + "learning_rate": 4.927339659783563e-05, + "loss": 2.3097, + "mean_token_accuracy": 0.4620689690113068, + "step": 126150 + }, + { + "epoch": 0.12706465587138324, + "grad_norm": 7.7184142181124855, + "learning_rate": 4.927330213716537e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.441379314661026, + "step": 126155 + }, + { + "epoch": 0.12706969192448742, + "grad_norm": 8.746859491145443, + "learning_rate": 4.927320767045622e-05, + "loss": 2.3161, + "mean_token_accuracy": 0.4551724076271057, + "step": 126160 + }, + { + "epoch": 0.1270747279775916, + "grad_norm": 10.099152115326639, + "learning_rate": 4.927311319770818e-05, + "loss": 1.9294, + "mean_token_accuracy": 0.5115763545036316, + "step": 126165 + }, + { + "epoch": 0.12707976403069576, + "grad_norm": 12.579689390553792, + "learning_rate": 4.927301871892129e-05, + "loss": 2.7328, + "mean_token_accuracy": 0.4, + "step": 126170 + }, + { + "epoch": 0.12708480008379994, + "grad_norm": 10.084073752844548, + "learning_rate": 4.927292423409557e-05, + "loss": 2.2837, + "mean_token_accuracy": 0.4172413766384125, + "step": 126175 + }, + { + "epoch": 0.12708983613690408, + "grad_norm": 12.628416672666575, + "learning_rate": 4.927282974323105e-05, + "loss": 2.4772, + "mean_token_accuracy": 0.42068966627120974, + "step": 126180 + }, + { + "epoch": 0.12709487219000826, + "grad_norm": 11.732637109279727, + "learning_rate": 4.927273524632776e-05, + "loss": 2.5584, + "mean_token_accuracy": 0.37241379022598264, + "step": 126185 + }, + { + "epoch": 0.12709990824311243, + "grad_norm": 11.225813804845815, + "learning_rate": 4.9272640743385716e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.39655172526836396, + "step": 126190 + }, + { + "epoch": 0.1271049442962166, + "grad_norm": 7.941056891899211, + "learning_rate": 4.927254623440495e-05, + "loss": 2.2199, + "mean_token_accuracy": 0.47096188068389894, + "step": 126195 + }, + { + "epoch": 0.12710998034932078, + "grad_norm": 8.890275672568116, + "learning_rate": 4.927245171938549e-05, + "loss": 2.1447, + "mean_token_accuracy": 0.46412582993507384, + "step": 126200 + }, + { + "epoch": 0.12711501640242495, + "grad_norm": 9.62779900760923, + "learning_rate": 4.9272357198327345e-05, + "loss": 2.0407, + "mean_token_accuracy": 0.47803992629051206, + "step": 126205 + }, + { + "epoch": 0.12712005245552913, + "grad_norm": 12.390036725574676, + "learning_rate": 4.927226267123057e-05, + "loss": 2.6304, + "mean_token_accuracy": 0.41379310488700866, + "step": 126210 + }, + { + "epoch": 0.1271250885086333, + "grad_norm": 11.363172950723811, + "learning_rate": 4.927216813809517e-05, + "loss": 2.5063, + "mean_token_accuracy": 0.44827585220336913, + "step": 126215 + }, + { + "epoch": 0.12713012456173747, + "grad_norm": 10.612965705980761, + "learning_rate": 4.927207359892117e-05, + "loss": 2.4224, + "mean_token_accuracy": 0.37241379022598264, + "step": 126220 + }, + { + "epoch": 0.12713516061484165, + "grad_norm": 10.421657051158945, + "learning_rate": 4.9271979053708626e-05, + "loss": 2.668, + "mean_token_accuracy": 0.4034482717514038, + "step": 126225 + }, + { + "epoch": 0.12714019666794582, + "grad_norm": 13.430643635519768, + "learning_rate": 4.9271884502457526e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.4137930989265442, + "step": 126230 + }, + { + "epoch": 0.12714523272105, + "grad_norm": 11.092229422373455, + "learning_rate": 4.927178994516792e-05, + "loss": 2.4333, + "mean_token_accuracy": 0.3965517163276672, + "step": 126235 + }, + { + "epoch": 0.12715026877415417, + "grad_norm": 12.403814249020611, + "learning_rate": 4.927169538183982e-05, + "loss": 2.1577, + "mean_token_accuracy": 0.4551724076271057, + "step": 126240 + }, + { + "epoch": 0.12715530482725834, + "grad_norm": 9.75285635893045, + "learning_rate": 4.927160081247327e-05, + "loss": 2.3119, + "mean_token_accuracy": 0.4551724135875702, + "step": 126245 + }, + { + "epoch": 0.12716034088036252, + "grad_norm": 13.412757811935899, + "learning_rate": 4.927150623706827e-05, + "loss": 2.4692, + "mean_token_accuracy": 0.458620685338974, + "step": 126250 + }, + { + "epoch": 0.1271653769334667, + "grad_norm": 8.598081282392796, + "learning_rate": 4.9271411655624874e-05, + "loss": 2.879, + "mean_token_accuracy": 0.3793103516101837, + "step": 126255 + }, + { + "epoch": 0.12717041298657086, + "grad_norm": 9.855782686600524, + "learning_rate": 4.927131706814309e-05, + "loss": 2.8979, + "mean_token_accuracy": 0.39310345351696013, + "step": 126260 + }, + { + "epoch": 0.12717544903967504, + "grad_norm": 9.936308898051733, + "learning_rate": 4.9271222474622956e-05, + "loss": 2.0479, + "mean_token_accuracy": 0.47931034564971925, + "step": 126265 + }, + { + "epoch": 0.1271804850927792, + "grad_norm": 10.129650455500053, + "learning_rate": 4.927112787506448e-05, + "loss": 2.0105, + "mean_token_accuracy": 0.4882637679576874, + "step": 126270 + }, + { + "epoch": 0.12718552114588338, + "grad_norm": 11.483971175291135, + "learning_rate": 4.9271033269467716e-05, + "loss": 2.5026, + "mean_token_accuracy": 0.42068966031074523, + "step": 126275 + }, + { + "epoch": 0.12719055719898756, + "grad_norm": 10.138688224848085, + "learning_rate": 4.927093865783266e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.41379310488700866, + "step": 126280 + }, + { + "epoch": 0.12719559325209173, + "grad_norm": 9.909470183148706, + "learning_rate": 4.927084404015936e-05, + "loss": 2.4916, + "mean_token_accuracy": 0.41379310488700866, + "step": 126285 + }, + { + "epoch": 0.1272006293051959, + "grad_norm": 10.510144087976744, + "learning_rate": 4.927074941644784e-05, + "loss": 2.2887, + "mean_token_accuracy": 0.48275862336158754, + "step": 126290 + }, + { + "epoch": 0.12720566535830008, + "grad_norm": 10.479268481324404, + "learning_rate": 4.927065478669811e-05, + "loss": 2.4535, + "mean_token_accuracy": 0.41379310488700866, + "step": 126295 + }, + { + "epoch": 0.12721070141140425, + "grad_norm": 8.061558553294134, + "learning_rate": 4.9270560150910215e-05, + "loss": 2.1166, + "mean_token_accuracy": 0.5051421761512757, + "step": 126300 + }, + { + "epoch": 0.12721573746450843, + "grad_norm": 9.899583273039967, + "learning_rate": 4.9270465509084175e-05, + "loss": 1.7906, + "mean_token_accuracy": 0.5068965494632721, + "step": 126305 + }, + { + "epoch": 0.1272207735176126, + "grad_norm": 10.573630394345253, + "learning_rate": 4.927037086122001e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.41034482717514037, + "step": 126310 + }, + { + "epoch": 0.12722580957071677, + "grad_norm": 10.924930949289928, + "learning_rate": 4.9270276207317754e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.37241379022598264, + "step": 126315 + }, + { + "epoch": 0.12723084562382092, + "grad_norm": 9.901593932786334, + "learning_rate": 4.9270181547377433e-05, + "loss": 2.664, + "mean_token_accuracy": 0.4206896543502808, + "step": 126320 + }, + { + "epoch": 0.1272358816769251, + "grad_norm": 9.58202496628284, + "learning_rate": 4.927008688139907e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.43448275327682495, + "step": 126325 + }, + { + "epoch": 0.12724091773002927, + "grad_norm": 11.105729245350968, + "learning_rate": 4.9269992209382695e-05, + "loss": 2.5745, + "mean_token_accuracy": 0.38112522661685944, + "step": 126330 + }, + { + "epoch": 0.12724595378313344, + "grad_norm": 10.55666765294462, + "learning_rate": 4.926989753132833e-05, + "loss": 2.2971, + "mean_token_accuracy": 0.42758620977401735, + "step": 126335 + }, + { + "epoch": 0.12725098983623762, + "grad_norm": 11.385150936882395, + "learning_rate": 4.926980284723599e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.4482758641242981, + "step": 126340 + }, + { + "epoch": 0.1272560258893418, + "grad_norm": 8.739827855266293, + "learning_rate": 4.926970815710573e-05, + "loss": 2.2255, + "mean_token_accuracy": 0.44827585816383364, + "step": 126345 + }, + { + "epoch": 0.12726106194244596, + "grad_norm": 9.726817323603358, + "learning_rate": 4.926961346093756e-05, + "loss": 2.2247, + "mean_token_accuracy": 0.4517241358757019, + "step": 126350 + }, + { + "epoch": 0.12726609799555014, + "grad_norm": 12.34747528232104, + "learning_rate": 4.92695187587315e-05, + "loss": 2.5598, + "mean_token_accuracy": 0.41034482717514037, + "step": 126355 + }, + { + "epoch": 0.1272711340486543, + "grad_norm": 8.80877887276747, + "learning_rate": 4.926942405048758e-05, + "loss": 2.334, + "mean_token_accuracy": 0.4344827592372894, + "step": 126360 + }, + { + "epoch": 0.12727617010175848, + "grad_norm": 11.165485289462055, + "learning_rate": 4.926932933620584e-05, + "loss": 2.2251, + "mean_token_accuracy": 0.43103448748588563, + "step": 126365 + }, + { + "epoch": 0.12728120615486266, + "grad_norm": 10.46804713058949, + "learning_rate": 4.926923461588628e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.4294615924358368, + "step": 126370 + }, + { + "epoch": 0.12728624220796683, + "grad_norm": 10.98617950668715, + "learning_rate": 4.926913988952895e-05, + "loss": 2.3449, + "mean_token_accuracy": 0.4517241358757019, + "step": 126375 + }, + { + "epoch": 0.127291278261071, + "grad_norm": 10.645373637133252, + "learning_rate": 4.926904515713387e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.48814277052879335, + "step": 126380 + }, + { + "epoch": 0.12729631431417518, + "grad_norm": 12.310859565002135, + "learning_rate": 4.9268950418701066e-05, + "loss": 2.2172, + "mean_token_accuracy": 0.4793103337287903, + "step": 126385 + }, + { + "epoch": 0.12730135036727935, + "grad_norm": 10.674351654876757, + "learning_rate": 4.926885567423055e-05, + "loss": 2.4508, + "mean_token_accuracy": 0.4275861978530884, + "step": 126390 + }, + { + "epoch": 0.12730638642038353, + "grad_norm": 15.355195517702839, + "learning_rate": 4.926876092372237e-05, + "loss": 2.5824, + "mean_token_accuracy": 0.42068964838981626, + "step": 126395 + }, + { + "epoch": 0.1273114224734877, + "grad_norm": 9.354645539103366, + "learning_rate": 4.9268666167176547e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.4724137902259827, + "step": 126400 + }, + { + "epoch": 0.12731645852659187, + "grad_norm": 10.575320321669027, + "learning_rate": 4.926857140459309e-05, + "loss": 1.8653, + "mean_token_accuracy": 0.5206896603107453, + "step": 126405 + }, + { + "epoch": 0.12732149457969605, + "grad_norm": 10.266682275879177, + "learning_rate": 4.9268476635972054e-05, + "loss": 2.7302, + "mean_token_accuracy": 0.3931034505367279, + "step": 126410 + }, + { + "epoch": 0.12732653063280022, + "grad_norm": 10.495592614842995, + "learning_rate": 4.926838186131344e-05, + "loss": 2.1113, + "mean_token_accuracy": 0.4881427764892578, + "step": 126415 + }, + { + "epoch": 0.1273315666859044, + "grad_norm": 10.83245621046197, + "learning_rate": 4.9268287080617284e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.4206896543502808, + "step": 126420 + }, + { + "epoch": 0.12733660273900857, + "grad_norm": 10.577581259228142, + "learning_rate": 4.926819229388362e-05, + "loss": 2.2124, + "mean_token_accuracy": 0.4379310369491577, + "step": 126425 + }, + { + "epoch": 0.12734163879211274, + "grad_norm": 11.524831788194373, + "learning_rate": 4.926809750111246e-05, + "loss": 2.6795, + "mean_token_accuracy": 0.4241379380226135, + "step": 126430 + }, + { + "epoch": 0.12734667484521692, + "grad_norm": 9.555904086448859, + "learning_rate": 4.926800270230383e-05, + "loss": 2.4417, + "mean_token_accuracy": 0.44827585220336913, + "step": 126435 + }, + { + "epoch": 0.1273517108983211, + "grad_norm": 10.767659136283019, + "learning_rate": 4.926790789745778e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.42068964838981626, + "step": 126440 + }, + { + "epoch": 0.12735674695142526, + "grad_norm": 10.637516448783286, + "learning_rate": 4.926781308657431e-05, + "loss": 2.7729, + "mean_token_accuracy": 0.3827586233615875, + "step": 126445 + }, + { + "epoch": 0.12736178300452944, + "grad_norm": 13.003034501984684, + "learning_rate": 4.926771826965346e-05, + "loss": 2.4798, + "mean_token_accuracy": 0.4103448212146759, + "step": 126450 + }, + { + "epoch": 0.1273668190576336, + "grad_norm": 10.043682963341322, + "learning_rate": 4.9267623446695244e-05, + "loss": 2.5028, + "mean_token_accuracy": 0.44137930274009707, + "step": 126455 + }, + { + "epoch": 0.12737185511073776, + "grad_norm": 9.288098370214811, + "learning_rate": 4.92675286176997e-05, + "loss": 2.4132, + "mean_token_accuracy": 0.48275862336158754, + "step": 126460 + }, + { + "epoch": 0.12737689116384193, + "grad_norm": 15.561254059001973, + "learning_rate": 4.926743378266686e-05, + "loss": 2.3064, + "mean_token_accuracy": 0.4620689630508423, + "step": 126465 + }, + { + "epoch": 0.1273819272169461, + "grad_norm": 10.657211722902776, + "learning_rate": 4.926733894159673e-05, + "loss": 2.5582, + "mean_token_accuracy": 0.3915305495262146, + "step": 126470 + }, + { + "epoch": 0.12738696327005028, + "grad_norm": 10.108846251623126, + "learning_rate": 4.9267244094489346e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.4482758641242981, + "step": 126475 + }, + { + "epoch": 0.12739199932315445, + "grad_norm": 9.68266235903544, + "learning_rate": 4.926714924134475e-05, + "loss": 2.7632, + "mean_token_accuracy": 0.40344826579093934, + "step": 126480 + }, + { + "epoch": 0.12739703537625863, + "grad_norm": 8.813745223752033, + "learning_rate": 4.926705438216294e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4448275864124298, + "step": 126485 + }, + { + "epoch": 0.1274020714293628, + "grad_norm": 11.138610458056178, + "learning_rate": 4.926695951694396e-05, + "loss": 2.4439, + "mean_token_accuracy": 0.4206896543502808, + "step": 126490 + }, + { + "epoch": 0.12740710748246697, + "grad_norm": 12.766977686041347, + "learning_rate": 4.926686464568783e-05, + "loss": 2.697, + "mean_token_accuracy": 0.4379310250282288, + "step": 126495 + }, + { + "epoch": 0.12741214353557115, + "grad_norm": 9.7252725679758, + "learning_rate": 4.926676976839459e-05, + "loss": 2.0266, + "mean_token_accuracy": 0.4793103516101837, + "step": 126500 + }, + { + "epoch": 0.12741717958867532, + "grad_norm": 13.430226307482679, + "learning_rate": 4.926667488506425e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.43448275327682495, + "step": 126505 + }, + { + "epoch": 0.1274222156417795, + "grad_norm": 9.432532705921734, + "learning_rate": 4.9266579995696834e-05, + "loss": 2.1758, + "mean_token_accuracy": 0.49655171632766726, + "step": 126510 + }, + { + "epoch": 0.12742725169488367, + "grad_norm": 17.202132188727933, + "learning_rate": 4.926648510029237e-05, + "loss": 2.3422, + "mean_token_accuracy": 0.4379310369491577, + "step": 126515 + }, + { + "epoch": 0.12743228774798784, + "grad_norm": 10.708867121674936, + "learning_rate": 4.926639019885091e-05, + "loss": 2.379, + "mean_token_accuracy": 0.42068966031074523, + "step": 126520 + }, + { + "epoch": 0.12743732380109202, + "grad_norm": 8.986516714934993, + "learning_rate": 4.926629529137245e-05, + "loss": 2.1604, + "mean_token_accuracy": 0.4586206912994385, + "step": 126525 + }, + { + "epoch": 0.1274423598541962, + "grad_norm": 8.583356398084181, + "learning_rate": 4.9266200377857026e-05, + "loss": 2.1423, + "mean_token_accuracy": 0.5160919666290283, + "step": 126530 + }, + { + "epoch": 0.12744739590730036, + "grad_norm": 11.232109148260191, + "learning_rate": 4.9266105458304665e-05, + "loss": 2.1981, + "mean_token_accuracy": 0.5034482717514038, + "step": 126535 + }, + { + "epoch": 0.12745243196040454, + "grad_norm": 11.077759855187457, + "learning_rate": 4.92660105327154e-05, + "loss": 2.2887, + "mean_token_accuracy": 0.41034482717514037, + "step": 126540 + }, + { + "epoch": 0.1274574680135087, + "grad_norm": 11.363657292745028, + "learning_rate": 4.9265915601089246e-05, + "loss": 2.6995, + "mean_token_accuracy": 0.4034482777118683, + "step": 126545 + }, + { + "epoch": 0.12746250406661289, + "grad_norm": 10.641174029367203, + "learning_rate": 4.926582066342623e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.41034482717514037, + "step": 126550 + }, + { + "epoch": 0.12746754011971706, + "grad_norm": 10.112679768361234, + "learning_rate": 4.926572571972638e-05, + "loss": 2.3635, + "mean_token_accuracy": 0.41034482717514037, + "step": 126555 + }, + { + "epoch": 0.12747257617282123, + "grad_norm": 8.072071754023415, + "learning_rate": 4.9265630769989744e-05, + "loss": 1.9905, + "mean_token_accuracy": 0.46551724076271056, + "step": 126560 + }, + { + "epoch": 0.1274776122259254, + "grad_norm": 8.702500347073224, + "learning_rate": 4.926553581421631e-05, + "loss": 2.2123, + "mean_token_accuracy": 0.4793103337287903, + "step": 126565 + }, + { + "epoch": 0.12748264827902958, + "grad_norm": 13.993208313287704, + "learning_rate": 4.926544085240614e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.4482758641242981, + "step": 126570 + }, + { + "epoch": 0.12748768433213375, + "grad_norm": 10.31942398466338, + "learning_rate": 4.9265345884559225e-05, + "loss": 2.2075, + "mean_token_accuracy": 0.4620689690113068, + "step": 126575 + }, + { + "epoch": 0.12749272038523793, + "grad_norm": 13.51433793724202, + "learning_rate": 4.926525091067562e-05, + "loss": 2.8164, + "mean_token_accuracy": 0.42758620381355283, + "step": 126580 + }, + { + "epoch": 0.1274977564383421, + "grad_norm": 10.586864729474502, + "learning_rate": 4.926515593075534e-05, + "loss": 2.2226, + "mean_token_accuracy": 0.4896551787853241, + "step": 126585 + }, + { + "epoch": 0.12750279249144628, + "grad_norm": 10.929018940348854, + "learning_rate": 4.926506094479842e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.4310344815254211, + "step": 126590 + }, + { + "epoch": 0.12750782854455045, + "grad_norm": 9.495089533920298, + "learning_rate": 4.926496595280487e-05, + "loss": 1.7149, + "mean_token_accuracy": 0.4948578417301178, + "step": 126595 + }, + { + "epoch": 0.1275128645976546, + "grad_norm": 9.64151516723201, + "learning_rate": 4.9264870954774726e-05, + "loss": 1.9267, + "mean_token_accuracy": 0.5310344874858857, + "step": 126600 + }, + { + "epoch": 0.12751790065075877, + "grad_norm": 8.772590219093928, + "learning_rate": 4.926477595070802e-05, + "loss": 2.3717, + "mean_token_accuracy": 0.4119782209396362, + "step": 126605 + }, + { + "epoch": 0.12752293670386294, + "grad_norm": 12.856256732661985, + "learning_rate": 4.9264680940604766e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.4172413766384125, + "step": 126610 + }, + { + "epoch": 0.12752797275696712, + "grad_norm": 12.366155219429201, + "learning_rate": 4.9264585924464994e-05, + "loss": 2.1575, + "mean_token_accuracy": 0.4620689630508423, + "step": 126615 + }, + { + "epoch": 0.1275330088100713, + "grad_norm": 8.931818705571665, + "learning_rate": 4.9264490902288743e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.3531155467033386, + "step": 126620 + }, + { + "epoch": 0.12753804486317546, + "grad_norm": 10.560233131796643, + "learning_rate": 4.926439587407603e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.441379314661026, + "step": 126625 + }, + { + "epoch": 0.12754308091627964, + "grad_norm": 11.466093898814218, + "learning_rate": 4.926430083982687e-05, + "loss": 2.2441, + "mean_token_accuracy": 0.42280701398849485, + "step": 126630 + }, + { + "epoch": 0.1275481169693838, + "grad_norm": 11.523090129326764, + "learning_rate": 4.926420579954131e-05, + "loss": 2.2938, + "mean_token_accuracy": 0.43793103098869324, + "step": 126635 + }, + { + "epoch": 0.127553153022488, + "grad_norm": 8.508769059063956, + "learning_rate": 4.926411075321935e-05, + "loss": 2.1088, + "mean_token_accuracy": 0.4793103516101837, + "step": 126640 + }, + { + "epoch": 0.12755818907559216, + "grad_norm": 10.607255769094087, + "learning_rate": 4.926401570086105e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.44137930274009707, + "step": 126645 + }, + { + "epoch": 0.12756322512869633, + "grad_norm": 10.100091149557151, + "learning_rate": 4.926392064246641e-05, + "loss": 2.369, + "mean_token_accuracy": 0.4344827592372894, + "step": 126650 + }, + { + "epoch": 0.1275682611818005, + "grad_norm": 13.376922680986132, + "learning_rate": 4.9263825578035466e-05, + "loss": 2.8752, + "mean_token_accuracy": 0.4, + "step": 126655 + }, + { + "epoch": 0.12757329723490468, + "grad_norm": 11.641374353681552, + "learning_rate": 4.926373050756825e-05, + "loss": 2.5629, + "mean_token_accuracy": 0.4379310250282288, + "step": 126660 + }, + { + "epoch": 0.12757833328800885, + "grad_norm": 8.761074886129105, + "learning_rate": 4.926363543106477e-05, + "loss": 1.8578, + "mean_token_accuracy": 0.5101266205310822, + "step": 126665 + }, + { + "epoch": 0.12758336934111303, + "grad_norm": 9.25209989931402, + "learning_rate": 4.926354034852507e-05, + "loss": 2.2834, + "mean_token_accuracy": 0.44670296609401705, + "step": 126670 + }, + { + "epoch": 0.1275884053942172, + "grad_norm": 12.2968119486088, + "learning_rate": 4.926344525994918e-05, + "loss": 2.52, + "mean_token_accuracy": 0.42758620381355283, + "step": 126675 + }, + { + "epoch": 0.12759344144732138, + "grad_norm": 15.028851253985428, + "learning_rate": 4.9263350165337114e-05, + "loss": 2.8971, + "mean_token_accuracy": 0.3482758581638336, + "step": 126680 + }, + { + "epoch": 0.12759847750042555, + "grad_norm": 11.65526991793121, + "learning_rate": 4.92632550646889e-05, + "loss": 2.4411, + "mean_token_accuracy": 0.4068965554237366, + "step": 126685 + }, + { + "epoch": 0.12760351355352972, + "grad_norm": 13.435179148738603, + "learning_rate": 4.9263159958004564e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.43448275327682495, + "step": 126690 + }, + { + "epoch": 0.1276085496066339, + "grad_norm": 11.142575610417355, + "learning_rate": 4.926306484528414e-05, + "loss": 2.2504, + "mean_token_accuracy": 0.4537810027599335, + "step": 126695 + }, + { + "epoch": 0.12761358565973807, + "grad_norm": 11.366193941869742, + "learning_rate": 4.926296972652763e-05, + "loss": 2.2988, + "mean_token_accuracy": 0.42413792610168455, + "step": 126700 + }, + { + "epoch": 0.12761862171284225, + "grad_norm": 9.492350808900062, + "learning_rate": 4.92628746017351e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.4172413766384125, + "step": 126705 + }, + { + "epoch": 0.12762365776594642, + "grad_norm": 9.410108035231657, + "learning_rate": 4.9262779470906545e-05, + "loss": 2.5687, + "mean_token_accuracy": 0.41379310488700866, + "step": 126710 + }, + { + "epoch": 0.1276286938190506, + "grad_norm": 10.048063691775335, + "learning_rate": 4.9262684334042006e-05, + "loss": 2.3475, + "mean_token_accuracy": 0.44827585816383364, + "step": 126715 + }, + { + "epoch": 0.12763372987215477, + "grad_norm": 9.91279451156627, + "learning_rate": 4.9262589191141504e-05, + "loss": 2.2665, + "mean_token_accuracy": 0.45517241954803467, + "step": 126720 + }, + { + "epoch": 0.12763876592525894, + "grad_norm": 11.505168197035678, + "learning_rate": 4.926249404220507e-05, + "loss": 2.7829, + "mean_token_accuracy": 0.3379310369491577, + "step": 126725 + }, + { + "epoch": 0.12764380197836311, + "grad_norm": 10.275022579429226, + "learning_rate": 4.9262398887232717e-05, + "loss": 2.796, + "mean_token_accuracy": 0.3827586233615875, + "step": 126730 + }, + { + "epoch": 0.1276488380314673, + "grad_norm": 8.43177887188339, + "learning_rate": 4.926230372622449e-05, + "loss": 2.0841, + "mean_token_accuracy": 0.4586206912994385, + "step": 126735 + }, + { + "epoch": 0.12765387408457143, + "grad_norm": 9.833103960504232, + "learning_rate": 4.9262208559180414e-05, + "loss": 2.3915, + "mean_token_accuracy": 0.42758620977401735, + "step": 126740 + }, + { + "epoch": 0.1276589101376756, + "grad_norm": 8.668301392831404, + "learning_rate": 4.9262113386100493e-05, + "loss": 2.3824, + "mean_token_accuracy": 0.4, + "step": 126745 + }, + { + "epoch": 0.12766394619077978, + "grad_norm": 14.967785717041187, + "learning_rate": 4.926201820698477e-05, + "loss": 2.627, + "mean_token_accuracy": 0.3931034505367279, + "step": 126750 + }, + { + "epoch": 0.12766898224388395, + "grad_norm": 8.005346880875743, + "learning_rate": 4.9261923021833284e-05, + "loss": 2.2352, + "mean_token_accuracy": 0.4689655125141144, + "step": 126755 + }, + { + "epoch": 0.12767401829698813, + "grad_norm": 13.11089051448881, + "learning_rate": 4.9261827830646035e-05, + "loss": 3.1929, + "mean_token_accuracy": 0.3758620619773865, + "step": 126760 + }, + { + "epoch": 0.1276790543500923, + "grad_norm": 11.198795776855427, + "learning_rate": 4.9261732633423066e-05, + "loss": 2.725, + "mean_token_accuracy": 0.3896551728248596, + "step": 126765 + }, + { + "epoch": 0.12768409040319648, + "grad_norm": 11.893137380648179, + "learning_rate": 4.92616374301644e-05, + "loss": 2.1802, + "mean_token_accuracy": 0.4620689630508423, + "step": 126770 + }, + { + "epoch": 0.12768912645630065, + "grad_norm": 9.040045397192271, + "learning_rate": 4.926154222087006e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.46896551847457885, + "step": 126775 + }, + { + "epoch": 0.12769416250940482, + "grad_norm": 12.05815741875097, + "learning_rate": 4.926144700554008e-05, + "loss": 2.3568, + "mean_token_accuracy": 0.4241379380226135, + "step": 126780 + }, + { + "epoch": 0.127699198562509, + "grad_norm": 10.663360429660948, + "learning_rate": 4.926135178417448e-05, + "loss": 2.8555, + "mean_token_accuracy": 0.37586207389831544, + "step": 126785 + }, + { + "epoch": 0.12770423461561317, + "grad_norm": 10.7243196619777, + "learning_rate": 4.9261256556773284e-05, + "loss": 2.3048, + "mean_token_accuracy": 0.458620685338974, + "step": 126790 + }, + { + "epoch": 0.12770927066871735, + "grad_norm": 12.378873060511623, + "learning_rate": 4.926116132333653e-05, + "loss": 2.2625, + "mean_token_accuracy": 0.4275861978530884, + "step": 126795 + }, + { + "epoch": 0.12771430672182152, + "grad_norm": 11.267963760878676, + "learning_rate": 4.9261066083864226e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.4206896543502808, + "step": 126800 + }, + { + "epoch": 0.1277193427749257, + "grad_norm": 10.039140044980758, + "learning_rate": 4.9260970838356415e-05, + "loss": 2.2729, + "mean_token_accuracy": 0.4551724135875702, + "step": 126805 + }, + { + "epoch": 0.12772437882802987, + "grad_norm": 11.310061161728843, + "learning_rate": 4.9260875586813116e-05, + "loss": 2.2114, + "mean_token_accuracy": 0.4517241418361664, + "step": 126810 + }, + { + "epoch": 0.12772941488113404, + "grad_norm": 9.95515123383782, + "learning_rate": 4.9260780329234354e-05, + "loss": 2.3266, + "mean_token_accuracy": 0.4379310369491577, + "step": 126815 + }, + { + "epoch": 0.12773445093423821, + "grad_norm": 8.628315622092826, + "learning_rate": 4.926068506562016e-05, + "loss": 2.1622, + "mean_token_accuracy": 0.4517241418361664, + "step": 126820 + }, + { + "epoch": 0.1277394869873424, + "grad_norm": 11.195149438029354, + "learning_rate": 4.9260589795970566e-05, + "loss": 2.3587, + "mean_token_accuracy": 0.42413793206214906, + "step": 126825 + }, + { + "epoch": 0.12774452304044656, + "grad_norm": 8.914357616401052, + "learning_rate": 4.926049452028559e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.4137930929660797, + "step": 126830 + }, + { + "epoch": 0.12774955909355074, + "grad_norm": 11.50655367157371, + "learning_rate": 4.9260399238565255e-05, + "loss": 2.2437, + "mean_token_accuracy": 0.4310344815254211, + "step": 126835 + }, + { + "epoch": 0.1277545951466549, + "grad_norm": 9.787988754799539, + "learning_rate": 4.9260303950809596e-05, + "loss": 2.1003, + "mean_token_accuracy": 0.4586206912994385, + "step": 126840 + }, + { + "epoch": 0.12775963119975908, + "grad_norm": 9.306597994197702, + "learning_rate": 4.926020865701863e-05, + "loss": 2.1581, + "mean_token_accuracy": 0.44482759237289426, + "step": 126845 + }, + { + "epoch": 0.12776466725286326, + "grad_norm": 12.729434732711708, + "learning_rate": 4.926011335719239e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.4498487591743469, + "step": 126850 + }, + { + "epoch": 0.12776970330596743, + "grad_norm": 11.276623286836196, + "learning_rate": 4.92600180513309e-05, + "loss": 2.6583, + "mean_token_accuracy": 0.37241379618644715, + "step": 126855 + }, + { + "epoch": 0.1277747393590716, + "grad_norm": 10.831769414058755, + "learning_rate": 4.9259922739434195e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.42758620977401735, + "step": 126860 + }, + { + "epoch": 0.12777977541217578, + "grad_norm": 10.698054881529341, + "learning_rate": 4.925982742150228e-05, + "loss": 2.7509, + "mean_token_accuracy": 0.39310344457626345, + "step": 126865 + }, + { + "epoch": 0.12778481146527995, + "grad_norm": 9.19272532564477, + "learning_rate": 4.925973209753521e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.4206896543502808, + "step": 126870 + }, + { + "epoch": 0.12778984751838413, + "grad_norm": 12.02608796593253, + "learning_rate": 4.925963676753299e-05, + "loss": 2.7366, + "mean_token_accuracy": 0.4586206912994385, + "step": 126875 + }, + { + "epoch": 0.12779488357148827, + "grad_norm": 10.731068961003015, + "learning_rate": 4.9259541431495657e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.3620689660310745, + "step": 126880 + }, + { + "epoch": 0.12779991962459245, + "grad_norm": 10.631990481014354, + "learning_rate": 4.9259446089423244e-05, + "loss": 2.375, + "mean_token_accuracy": 0.4310344815254211, + "step": 126885 + }, + { + "epoch": 0.12780495567769662, + "grad_norm": 11.33762696506454, + "learning_rate": 4.9259350741315756e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.4172413766384125, + "step": 126890 + }, + { + "epoch": 0.1278099917308008, + "grad_norm": 11.22839614285331, + "learning_rate": 4.925925538717323e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.39310344457626345, + "step": 126895 + }, + { + "epoch": 0.12781502778390497, + "grad_norm": 12.456939660159644, + "learning_rate": 4.92591600269957e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.458620685338974, + "step": 126900 + }, + { + "epoch": 0.12782006383700914, + "grad_norm": 13.257324270212553, + "learning_rate": 4.9259064660783174e-05, + "loss": 2.7723, + "mean_token_accuracy": 0.3999999940395355, + "step": 126905 + }, + { + "epoch": 0.12782509989011331, + "grad_norm": 10.561204754156838, + "learning_rate": 4.92589692885357e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.3965517282485962, + "step": 126910 + }, + { + "epoch": 0.1278301359432175, + "grad_norm": 11.138798056264237, + "learning_rate": 4.925887391025329e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.4310344696044922, + "step": 126915 + }, + { + "epoch": 0.12783517199632166, + "grad_norm": 12.146522636450808, + "learning_rate": 4.925877852593598e-05, + "loss": 2.4301, + "mean_token_accuracy": 0.4, + "step": 126920 + }, + { + "epoch": 0.12784020804942584, + "grad_norm": 11.319515796797806, + "learning_rate": 4.925868313558379e-05, + "loss": 2.533, + "mean_token_accuracy": 0.4206896543502808, + "step": 126925 + }, + { + "epoch": 0.12784524410253, + "grad_norm": 11.103742563527515, + "learning_rate": 4.925858773919674e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.42068966031074523, + "step": 126930 + }, + { + "epoch": 0.12785028015563418, + "grad_norm": 9.783547542750082, + "learning_rate": 4.925849233677487e-05, + "loss": 2.1932, + "mean_token_accuracy": 0.4965517342090607, + "step": 126935 + }, + { + "epoch": 0.12785531620873836, + "grad_norm": 12.345273388940988, + "learning_rate": 4.925839692831821e-05, + "loss": 2.3496, + "mean_token_accuracy": 0.43653962314128875, + "step": 126940 + }, + { + "epoch": 0.12786035226184253, + "grad_norm": 10.654256450051767, + "learning_rate": 4.9258301513826766e-05, + "loss": 2.2947, + "mean_token_accuracy": 0.4, + "step": 126945 + }, + { + "epoch": 0.1278653883149467, + "grad_norm": 13.083685128526781, + "learning_rate": 4.925820609330058e-05, + "loss": 2.3788, + "mean_token_accuracy": 0.4, + "step": 126950 + }, + { + "epoch": 0.12787042436805088, + "grad_norm": 11.51521000744791, + "learning_rate": 4.9258110666739676e-05, + "loss": 2.4277, + "mean_token_accuracy": 0.45517241954803467, + "step": 126955 + }, + { + "epoch": 0.12787546042115505, + "grad_norm": 12.263774892039638, + "learning_rate": 4.9258015234144074e-05, + "loss": 2.3009, + "mean_token_accuracy": 0.4620689630508423, + "step": 126960 + }, + { + "epoch": 0.12788049647425923, + "grad_norm": 10.367202593000918, + "learning_rate": 4.925791979551381e-05, + "loss": 2.2849, + "mean_token_accuracy": 0.441379314661026, + "step": 126965 + }, + { + "epoch": 0.1278855325273634, + "grad_norm": 11.010811145056024, + "learning_rate": 4.92578243508489e-05, + "loss": 2.527, + "mean_token_accuracy": 0.3517241418361664, + "step": 126970 + }, + { + "epoch": 0.12789056858046757, + "grad_norm": 11.879036947544455, + "learning_rate": 4.9257728900149385e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.41724138259887694, + "step": 126975 + }, + { + "epoch": 0.12789560463357175, + "grad_norm": 11.817416531026135, + "learning_rate": 4.9257633443415274e-05, + "loss": 2.7877, + "mean_token_accuracy": 0.3738657057285309, + "step": 126980 + }, + { + "epoch": 0.12790064068667592, + "grad_norm": 8.819305555394017, + "learning_rate": 4.925753798064661e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.45698729157447815, + "step": 126985 + }, + { + "epoch": 0.1279056767397801, + "grad_norm": 10.167271536189954, + "learning_rate": 4.925744251184341e-05, + "loss": 2.6804, + "mean_token_accuracy": 0.3827586114406586, + "step": 126990 + }, + { + "epoch": 0.12791071279288427, + "grad_norm": 11.95473369184682, + "learning_rate": 4.925734703700569e-05, + "loss": 2.1363, + "mean_token_accuracy": 0.42758620381355283, + "step": 126995 + }, + { + "epoch": 0.12791574884598844, + "grad_norm": 10.837953761188341, + "learning_rate": 4.925725155613351e-05, + "loss": 2.418, + "mean_token_accuracy": 0.3793103516101837, + "step": 127000 + }, + { + "epoch": 0.12792078489909262, + "grad_norm": 8.522922760905878, + "learning_rate": 4.9257156069226864e-05, + "loss": 2.5517, + "mean_token_accuracy": 0.4310344815254211, + "step": 127005 + }, + { + "epoch": 0.1279258209521968, + "grad_norm": 10.661776153818293, + "learning_rate": 4.925706057628579e-05, + "loss": 2.7257, + "mean_token_accuracy": 0.3793103516101837, + "step": 127010 + }, + { + "epoch": 0.12793085700530096, + "grad_norm": 10.759453543875233, + "learning_rate": 4.9256965077310314e-05, + "loss": 2.0764, + "mean_token_accuracy": 0.42758620977401735, + "step": 127015 + }, + { + "epoch": 0.1279358930584051, + "grad_norm": 10.132624634933931, + "learning_rate": 4.9256869572300455e-05, + "loss": 2.4544, + "mean_token_accuracy": 0.42413793206214906, + "step": 127020 + }, + { + "epoch": 0.12794092911150928, + "grad_norm": 9.65406052034216, + "learning_rate": 4.925677406125626e-05, + "loss": 2.2545, + "mean_token_accuracy": 0.44482759237289426, + "step": 127025 + }, + { + "epoch": 0.12794596516461346, + "grad_norm": 12.313267183044637, + "learning_rate": 4.925667854417773e-05, + "loss": 3.0173, + "mean_token_accuracy": 0.4000000059604645, + "step": 127030 + }, + { + "epoch": 0.12795100121771763, + "grad_norm": 9.452212010042398, + "learning_rate": 4.925658302106491e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.4344827592372894, + "step": 127035 + }, + { + "epoch": 0.1279560372708218, + "grad_norm": 9.19184639950459, + "learning_rate": 4.925648749191782e-05, + "loss": 2.122, + "mean_token_accuracy": 0.441379314661026, + "step": 127040 + }, + { + "epoch": 0.12796107332392598, + "grad_norm": 11.684138239985195, + "learning_rate": 4.9256391956736494e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.46551724076271056, + "step": 127045 + }, + { + "epoch": 0.12796610937703015, + "grad_norm": 11.67704504528358, + "learning_rate": 4.9256296415520944e-05, + "loss": 2.3009, + "mean_token_accuracy": 0.4379310369491577, + "step": 127050 + }, + { + "epoch": 0.12797114543013433, + "grad_norm": 11.540006463307481, + "learning_rate": 4.9256200868271205e-05, + "loss": 2.2267, + "mean_token_accuracy": 0.4172413766384125, + "step": 127055 + }, + { + "epoch": 0.1279761814832385, + "grad_norm": 9.412826689091045, + "learning_rate": 4.9256105314987306e-05, + "loss": 2.7954, + "mean_token_accuracy": 0.41379310488700866, + "step": 127060 + }, + { + "epoch": 0.12798121753634267, + "grad_norm": 11.417912369468148, + "learning_rate": 4.925600975566927e-05, + "loss": 2.4841, + "mean_token_accuracy": 0.36896551549434664, + "step": 127065 + }, + { + "epoch": 0.12798625358944685, + "grad_norm": 12.139551665347785, + "learning_rate": 4.925591419031712e-05, + "loss": 2.2353, + "mean_token_accuracy": 0.4413793087005615, + "step": 127070 + }, + { + "epoch": 0.12799128964255102, + "grad_norm": 10.12006375810203, + "learning_rate": 4.9255818618930884e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.46436781287193296, + "step": 127075 + }, + { + "epoch": 0.1279963256956552, + "grad_norm": 12.493811221751345, + "learning_rate": 4.925572304151059e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.4344827592372894, + "step": 127080 + }, + { + "epoch": 0.12800136174875937, + "grad_norm": 12.31332505515964, + "learning_rate": 4.925562745805627e-05, + "loss": 2.853, + "mean_token_accuracy": 0.341379314661026, + "step": 127085 + }, + { + "epoch": 0.12800639780186354, + "grad_norm": 9.675071818223804, + "learning_rate": 4.925553186856795e-05, + "loss": 2.3168, + "mean_token_accuracy": 0.41724138259887694, + "step": 127090 + }, + { + "epoch": 0.12801143385496772, + "grad_norm": 14.779468129722208, + "learning_rate": 4.925543627304563e-05, + "loss": 2.4308, + "mean_token_accuracy": 0.4034482777118683, + "step": 127095 + }, + { + "epoch": 0.1280164699080719, + "grad_norm": 9.866213196686568, + "learning_rate": 4.9255340671489386e-05, + "loss": 2.2729, + "mean_token_accuracy": 0.39655172228813174, + "step": 127100 + }, + { + "epoch": 0.12802150596117606, + "grad_norm": 13.929116392266403, + "learning_rate": 4.92552450638992e-05, + "loss": 2.5871, + "mean_token_accuracy": 0.47586206197738645, + "step": 127105 + }, + { + "epoch": 0.12802654201428024, + "grad_norm": 12.175500117205303, + "learning_rate": 4.925514945027512e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.43103448748588563, + "step": 127110 + }, + { + "epoch": 0.1280315780673844, + "grad_norm": 12.50221174596488, + "learning_rate": 4.925505383061717e-05, + "loss": 2.3325, + "mean_token_accuracy": 0.47791893482208253, + "step": 127115 + }, + { + "epoch": 0.12803661412048858, + "grad_norm": 9.112202940582087, + "learning_rate": 4.925495820492538e-05, + "loss": 2.4781, + "mean_token_accuracy": 0.3655172407627106, + "step": 127120 + }, + { + "epoch": 0.12804165017359276, + "grad_norm": 10.764015915914927, + "learning_rate": 4.925486257319977e-05, + "loss": 2.567, + "mean_token_accuracy": 0.45517241954803467, + "step": 127125 + }, + { + "epoch": 0.12804668622669693, + "grad_norm": 13.662902081936021, + "learning_rate": 4.925476693544035e-05, + "loss": 2.5529, + "mean_token_accuracy": 0.3931034505367279, + "step": 127130 + }, + { + "epoch": 0.1280517222798011, + "grad_norm": 12.774570797529282, + "learning_rate": 4.925467129164719e-05, + "loss": 2.3579, + "mean_token_accuracy": 0.42758620381355283, + "step": 127135 + }, + { + "epoch": 0.12805675833290528, + "grad_norm": 10.175613935564426, + "learning_rate": 4.925457564182027e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.42413793206214906, + "step": 127140 + }, + { + "epoch": 0.12806179438600945, + "grad_norm": 11.607022208415822, + "learning_rate": 4.925447998595965e-05, + "loss": 2.8177, + "mean_token_accuracy": 0.40205685794353485, + "step": 127145 + }, + { + "epoch": 0.12806683043911363, + "grad_norm": 7.464077015132231, + "learning_rate": 4.9254384324065336e-05, + "loss": 1.9185, + "mean_token_accuracy": 0.5225369393825531, + "step": 127150 + }, + { + "epoch": 0.1280718664922178, + "grad_norm": 15.118912837057882, + "learning_rate": 4.925428865613736e-05, + "loss": 2.3115, + "mean_token_accuracy": 0.4689655125141144, + "step": 127155 + }, + { + "epoch": 0.12807690254532195, + "grad_norm": 10.826731697966773, + "learning_rate": 4.9254192982175765e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.4344827592372894, + "step": 127160 + }, + { + "epoch": 0.12808193859842612, + "grad_norm": 12.394609339012014, + "learning_rate": 4.9254097302180554e-05, + "loss": 2.8275, + "mean_token_accuracy": 0.3999999940395355, + "step": 127165 + }, + { + "epoch": 0.1280869746515303, + "grad_norm": 9.514137556998087, + "learning_rate": 4.9254001616151765e-05, + "loss": 2.2196, + "mean_token_accuracy": 0.42758620977401735, + "step": 127170 + }, + { + "epoch": 0.12809201070463447, + "grad_norm": 9.329028999210804, + "learning_rate": 4.9253905924089425e-05, + "loss": 1.9966, + "mean_token_accuracy": 0.45517240166664125, + "step": 127175 + }, + { + "epoch": 0.12809704675773864, + "grad_norm": 9.595595575056784, + "learning_rate": 4.9253810225993554e-05, + "loss": 2.225, + "mean_token_accuracy": 0.4448275864124298, + "step": 127180 + }, + { + "epoch": 0.12810208281084282, + "grad_norm": 11.336584626123788, + "learning_rate": 4.925371452186418e-05, + "loss": 2.2194, + "mean_token_accuracy": 0.4931034445762634, + "step": 127185 + }, + { + "epoch": 0.128107118863947, + "grad_norm": 11.50290157788579, + "learning_rate": 4.9253618811701334e-05, + "loss": 2.5734, + "mean_token_accuracy": 0.39999999701976774, + "step": 127190 + }, + { + "epoch": 0.12811215491705116, + "grad_norm": 10.304893162485786, + "learning_rate": 4.925352309550504e-05, + "loss": 2.431, + "mean_token_accuracy": 0.44827587008476255, + "step": 127195 + }, + { + "epoch": 0.12811719097015534, + "grad_norm": 10.3719513078457, + "learning_rate": 4.925342737327533e-05, + "loss": 2.5209, + "mean_token_accuracy": 0.42413792610168455, + "step": 127200 + }, + { + "epoch": 0.1281222270232595, + "grad_norm": 10.42465515173153, + "learning_rate": 4.925333164501222e-05, + "loss": 2.6396, + "mean_token_accuracy": 0.36896551251411436, + "step": 127205 + }, + { + "epoch": 0.12812726307636368, + "grad_norm": 9.702067531280344, + "learning_rate": 4.9253235910715744e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.36896551251411436, + "step": 127210 + }, + { + "epoch": 0.12813229912946786, + "grad_norm": 10.16428887813001, + "learning_rate": 4.925314017038593e-05, + "loss": 2.5293, + "mean_token_accuracy": 0.4295825779438019, + "step": 127215 + }, + { + "epoch": 0.12813733518257203, + "grad_norm": 7.453574434938719, + "learning_rate": 4.92530444240228e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.48965516686439514, + "step": 127220 + }, + { + "epoch": 0.1281423712356762, + "grad_norm": 11.525993832832416, + "learning_rate": 4.925294867162638e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.3965517163276672, + "step": 127225 + }, + { + "epoch": 0.12814740728878038, + "grad_norm": 8.571556145362914, + "learning_rate": 4.9252852913196696e-05, + "loss": 1.9015, + "mean_token_accuracy": 0.5310344755649566, + "step": 127230 + }, + { + "epoch": 0.12815244334188455, + "grad_norm": 10.611404609252613, + "learning_rate": 4.925275714873379e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.4310344815254211, + "step": 127235 + }, + { + "epoch": 0.12815747939498873, + "grad_norm": 21.901660959596388, + "learning_rate": 4.925266137823766e-05, + "loss": 2.6081, + "mean_token_accuracy": 0.4644088625907898, + "step": 127240 + }, + { + "epoch": 0.1281625154480929, + "grad_norm": 11.028341137836424, + "learning_rate": 4.9252565601708354e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.41379310488700866, + "step": 127245 + }, + { + "epoch": 0.12816755150119707, + "grad_norm": 11.373256720423647, + "learning_rate": 4.92524698191459e-05, + "loss": 2.3173, + "mean_token_accuracy": 0.4344827592372894, + "step": 127250 + }, + { + "epoch": 0.12817258755430125, + "grad_norm": 10.668431034298811, + "learning_rate": 4.925237403055031e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.3827586233615875, + "step": 127255 + }, + { + "epoch": 0.12817762360740542, + "grad_norm": 10.611157214423176, + "learning_rate": 4.9252278235921616e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4517241418361664, + "step": 127260 + }, + { + "epoch": 0.1281826596605096, + "grad_norm": 11.746070389120103, + "learning_rate": 4.925218243525985e-05, + "loss": 2.4977, + "mean_token_accuracy": 0.44137930274009707, + "step": 127265 + }, + { + "epoch": 0.12818769571361377, + "grad_norm": 11.253197243240193, + "learning_rate": 4.9252086628565034e-05, + "loss": 2.3282, + "mean_token_accuracy": 0.4034482717514038, + "step": 127270 + }, + { + "epoch": 0.12819273176671794, + "grad_norm": 10.671099711682324, + "learning_rate": 4.925199081583719e-05, + "loss": 2.0988, + "mean_token_accuracy": 0.46896552443504336, + "step": 127275 + }, + { + "epoch": 0.12819776781982212, + "grad_norm": 9.405020583357254, + "learning_rate": 4.925189499707636e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.4620689690113068, + "step": 127280 + }, + { + "epoch": 0.1282028038729263, + "grad_norm": 9.644305615183036, + "learning_rate": 4.925179917228256e-05, + "loss": 2.1338, + "mean_token_accuracy": 0.47931034564971925, + "step": 127285 + }, + { + "epoch": 0.12820783992603046, + "grad_norm": 12.086707061678714, + "learning_rate": 4.925170334145581e-05, + "loss": 2.1114, + "mean_token_accuracy": 0.46067755818367007, + "step": 127290 + }, + { + "epoch": 0.12821287597913464, + "grad_norm": 9.009217015545866, + "learning_rate": 4.925160750459615e-05, + "loss": 2.1778, + "mean_token_accuracy": 0.4517241418361664, + "step": 127295 + }, + { + "epoch": 0.12821791203223878, + "grad_norm": 12.85407971067005, + "learning_rate": 4.92515116617036e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.4724137902259827, + "step": 127300 + }, + { + "epoch": 0.12822294808534296, + "grad_norm": 10.309255889606387, + "learning_rate": 4.9251415812778185e-05, + "loss": 2.3609, + "mean_token_accuracy": 0.42758620381355283, + "step": 127305 + }, + { + "epoch": 0.12822798413844713, + "grad_norm": 20.933573288781254, + "learning_rate": 4.9251319957819936e-05, + "loss": 2.3028, + "mean_token_accuracy": 0.4482758641242981, + "step": 127310 + }, + { + "epoch": 0.1282330201915513, + "grad_norm": 13.329691281463843, + "learning_rate": 4.9251224096828876e-05, + "loss": 2.413, + "mean_token_accuracy": 0.47586206793785096, + "step": 127315 + }, + { + "epoch": 0.12823805624465548, + "grad_norm": 15.943843930149265, + "learning_rate": 4.925112822980503e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.39068360924720763, + "step": 127320 + }, + { + "epoch": 0.12824309229775965, + "grad_norm": 10.436887443574433, + "learning_rate": 4.925103235674843e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.4034482777118683, + "step": 127325 + }, + { + "epoch": 0.12824812835086383, + "grad_norm": 9.44659820454716, + "learning_rate": 4.92509364776591e-05, + "loss": 2.4543, + "mean_token_accuracy": 0.44482759237289426, + "step": 127330 + }, + { + "epoch": 0.128253164403968, + "grad_norm": 10.18558018446586, + "learning_rate": 4.9250840592537076e-05, + "loss": 2.0392, + "mean_token_accuracy": 0.4758620738983154, + "step": 127335 + }, + { + "epoch": 0.12825820045707217, + "grad_norm": 10.845008242974112, + "learning_rate": 4.925074470138236e-05, + "loss": 2.8803, + "mean_token_accuracy": 0.341379314661026, + "step": 127340 + }, + { + "epoch": 0.12826323651017635, + "grad_norm": 9.280594036668052, + "learning_rate": 4.9250648804195e-05, + "loss": 2.171, + "mean_token_accuracy": 0.482758617401123, + "step": 127345 + }, + { + "epoch": 0.12826827256328052, + "grad_norm": 8.348892104554656, + "learning_rate": 4.9250552900975015e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.4620689630508423, + "step": 127350 + }, + { + "epoch": 0.1282733086163847, + "grad_norm": 12.328074801651551, + "learning_rate": 4.925045699172243e-05, + "loss": 2.4891, + "mean_token_accuracy": 0.43793103098869324, + "step": 127355 + }, + { + "epoch": 0.12827834466948887, + "grad_norm": 11.849255652131053, + "learning_rate": 4.925036107643728e-05, + "loss": 2.5749, + "mean_token_accuracy": 0.3793103456497192, + "step": 127360 + }, + { + "epoch": 0.12828338072259304, + "grad_norm": 10.870795805309061, + "learning_rate": 4.9250265155119586e-05, + "loss": 2.3152, + "mean_token_accuracy": 0.45341803431510924, + "step": 127365 + }, + { + "epoch": 0.12828841677569722, + "grad_norm": 13.493678797309856, + "learning_rate": 4.925016922776937e-05, + "loss": 2.7042, + "mean_token_accuracy": 0.43793101906776427, + "step": 127370 + }, + { + "epoch": 0.1282934528288014, + "grad_norm": 9.2207436766657, + "learning_rate": 4.925007329438667e-05, + "loss": 1.938, + "mean_token_accuracy": 0.49655172824859617, + "step": 127375 + }, + { + "epoch": 0.12829848888190556, + "grad_norm": 10.288247749415964, + "learning_rate": 4.9249977354971506e-05, + "loss": 2.051, + "mean_token_accuracy": 0.4778584361076355, + "step": 127380 + }, + { + "epoch": 0.12830352493500974, + "grad_norm": 10.173543011809986, + "learning_rate": 4.924988140952391e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.43793103098869324, + "step": 127385 + }, + { + "epoch": 0.1283085609881139, + "grad_norm": 10.800836878000327, + "learning_rate": 4.924978545804389e-05, + "loss": 2.0077, + "mean_token_accuracy": 0.4931034564971924, + "step": 127390 + }, + { + "epoch": 0.12831359704121809, + "grad_norm": 9.555954246154421, + "learning_rate": 4.924968950053149e-05, + "loss": 2.191, + "mean_token_accuracy": 0.4931034445762634, + "step": 127395 + }, + { + "epoch": 0.12831863309432226, + "grad_norm": 9.3459721359432, + "learning_rate": 4.924959353698673e-05, + "loss": 2.2685, + "mean_token_accuracy": 0.4586206912994385, + "step": 127400 + }, + { + "epoch": 0.12832366914742643, + "grad_norm": 16.841221492904037, + "learning_rate": 4.924949756740964e-05, + "loss": 2.3733, + "mean_token_accuracy": 0.47241379618644713, + "step": 127405 + }, + { + "epoch": 0.1283287052005306, + "grad_norm": 10.528813251349805, + "learning_rate": 4.9249401591800256e-05, + "loss": 2.4338, + "mean_token_accuracy": 0.42758620381355283, + "step": 127410 + }, + { + "epoch": 0.12833374125363478, + "grad_norm": 12.323400203574582, + "learning_rate": 4.924930561015859e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.3793103456497192, + "step": 127415 + }, + { + "epoch": 0.12833877730673895, + "grad_norm": 10.683197484944763, + "learning_rate": 4.924920962248466e-05, + "loss": 2.5163, + "mean_token_accuracy": 0.36896551251411436, + "step": 127420 + }, + { + "epoch": 0.12834381335984313, + "grad_norm": 11.927005848611316, + "learning_rate": 4.924911362877852e-05, + "loss": 2.3026, + "mean_token_accuracy": 0.43103448748588563, + "step": 127425 + }, + { + "epoch": 0.1283488494129473, + "grad_norm": 10.382088964590611, + "learning_rate": 4.924901762904018e-05, + "loss": 2.1833, + "mean_token_accuracy": 0.45517241954803467, + "step": 127430 + }, + { + "epoch": 0.12835388546605148, + "grad_norm": 10.912613604648959, + "learning_rate": 4.9248921623269665e-05, + "loss": 2.7045, + "mean_token_accuracy": 0.43103448748588563, + "step": 127435 + }, + { + "epoch": 0.12835892151915562, + "grad_norm": 8.791477757425053, + "learning_rate": 4.9248825611467e-05, + "loss": 2.3226, + "mean_token_accuracy": 0.41724138259887694, + "step": 127440 + }, + { + "epoch": 0.1283639575722598, + "grad_norm": 9.801801918771966, + "learning_rate": 4.9248729593632234e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.4965517222881317, + "step": 127445 + }, + { + "epoch": 0.12836899362536397, + "grad_norm": 10.980425861458698, + "learning_rate": 4.9248633569765364e-05, + "loss": 2.0667, + "mean_token_accuracy": 0.47241380214691164, + "step": 127450 + }, + { + "epoch": 0.12837402967846814, + "grad_norm": 10.18514598274303, + "learning_rate": 4.924853753986643e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.458620685338974, + "step": 127455 + }, + { + "epoch": 0.12837906573157232, + "grad_norm": 15.872264392054353, + "learning_rate": 4.924844150393546e-05, + "loss": 2.6809, + "mean_token_accuracy": 0.3965517282485962, + "step": 127460 + }, + { + "epoch": 0.1283841017846765, + "grad_norm": 9.711729365108138, + "learning_rate": 4.924834546197248e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.42068964838981626, + "step": 127465 + }, + { + "epoch": 0.12838913783778066, + "grad_norm": 11.688967012013288, + "learning_rate": 4.9248249413977515e-05, + "loss": 2.504, + "mean_token_accuracy": 0.4081669747829437, + "step": 127470 + }, + { + "epoch": 0.12839417389088484, + "grad_norm": 12.05582258052738, + "learning_rate": 4.9248153359950587e-05, + "loss": 2.6158, + "mean_token_accuracy": 0.4068965524435043, + "step": 127475 + }, + { + "epoch": 0.128399209943989, + "grad_norm": 11.699874378755295, + "learning_rate": 4.924805729989173e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.43793103098869324, + "step": 127480 + }, + { + "epoch": 0.12840424599709319, + "grad_norm": 11.294930048293406, + "learning_rate": 4.924796123380097e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.40344828367233276, + "step": 127485 + }, + { + "epoch": 0.12840928205019736, + "grad_norm": 12.327184659029557, + "learning_rate": 4.924786516167833e-05, + "loss": 2.5591, + "mean_token_accuracy": 0.4009679317474365, + "step": 127490 + }, + { + "epoch": 0.12841431810330153, + "grad_norm": 8.143868523905455, + "learning_rate": 4.924776908352384e-05, + "loss": 2.4144, + "mean_token_accuracy": 0.4137930989265442, + "step": 127495 + }, + { + "epoch": 0.1284193541564057, + "grad_norm": 10.218091600680188, + "learning_rate": 4.9247672999337524e-05, + "loss": 2.0457, + "mean_token_accuracy": 0.4746521532535553, + "step": 127500 + }, + { + "epoch": 0.12842439020950988, + "grad_norm": 9.665013529497648, + "learning_rate": 4.924757690911941e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.3896551728248596, + "step": 127505 + }, + { + "epoch": 0.12842942626261405, + "grad_norm": 9.824078209991308, + "learning_rate": 4.924748081286953e-05, + "loss": 2.293, + "mean_token_accuracy": 0.48620688915252686, + "step": 127510 + }, + { + "epoch": 0.12843446231571823, + "grad_norm": 9.967185374003213, + "learning_rate": 4.92473847105879e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.4551724076271057, + "step": 127515 + }, + { + "epoch": 0.1284394983688224, + "grad_norm": 9.646116556289856, + "learning_rate": 4.924728860227455e-05, + "loss": 2.31, + "mean_token_accuracy": 0.4172413766384125, + "step": 127520 + }, + { + "epoch": 0.12844453442192658, + "grad_norm": 10.727863685857805, + "learning_rate": 4.924719248792951e-05, + "loss": 2.473, + "mean_token_accuracy": 0.4000000059604645, + "step": 127525 + }, + { + "epoch": 0.12844957047503075, + "grad_norm": 10.4072670042468, + "learning_rate": 4.9247096367552804e-05, + "loss": 2.3919, + "mean_token_accuracy": 0.43448275327682495, + "step": 127530 + }, + { + "epoch": 0.12845460652813492, + "grad_norm": 13.119713129747444, + "learning_rate": 4.924700024114446e-05, + "loss": 2.5632, + "mean_token_accuracy": 0.3551724135875702, + "step": 127535 + }, + { + "epoch": 0.1284596425812391, + "grad_norm": 11.06028328417857, + "learning_rate": 4.924690410870451e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.4206896543502808, + "step": 127540 + }, + { + "epoch": 0.12846467863434327, + "grad_norm": 11.149996919638442, + "learning_rate": 4.9246807970232964e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.45680580735206605, + "step": 127545 + }, + { + "epoch": 0.12846971468744744, + "grad_norm": 12.446012580055408, + "learning_rate": 4.9246711825729866e-05, + "loss": 2.2609, + "mean_token_accuracy": 0.46896551847457885, + "step": 127550 + }, + { + "epoch": 0.12847475074055162, + "grad_norm": 8.857469839756094, + "learning_rate": 4.924661567519524e-05, + "loss": 2.3958, + "mean_token_accuracy": 0.4206896543502808, + "step": 127555 + }, + { + "epoch": 0.1284797867936558, + "grad_norm": 17.366067394988328, + "learning_rate": 4.92465195186291e-05, + "loss": 2.8502, + "mean_token_accuracy": 0.4068965494632721, + "step": 127560 + }, + { + "epoch": 0.12848482284675997, + "grad_norm": 9.524688744223372, + "learning_rate": 4.924642335603149e-05, + "loss": 2.2596, + "mean_token_accuracy": 0.4379310369491577, + "step": 127565 + }, + { + "epoch": 0.12848985889986414, + "grad_norm": 8.954131992880237, + "learning_rate": 4.924632718740243e-05, + "loss": 2.0301, + "mean_token_accuracy": 0.47931034564971925, + "step": 127570 + }, + { + "epoch": 0.1284948949529683, + "grad_norm": 10.572252217444298, + "learning_rate": 4.924623101274193e-05, + "loss": 2.051, + "mean_token_accuracy": 0.47586206197738645, + "step": 127575 + }, + { + "epoch": 0.12849993100607246, + "grad_norm": 9.972141032575564, + "learning_rate": 4.924613483205005e-05, + "loss": 2.5891, + "mean_token_accuracy": 0.38620689511299133, + "step": 127580 + }, + { + "epoch": 0.12850496705917663, + "grad_norm": 23.634134233052833, + "learning_rate": 4.924603864532679e-05, + "loss": 2.5493, + "mean_token_accuracy": 0.4620689690113068, + "step": 127585 + }, + { + "epoch": 0.1285100031122808, + "grad_norm": 9.920526659620677, + "learning_rate": 4.924594245257219e-05, + "loss": 2.1956, + "mean_token_accuracy": 0.4862068951129913, + "step": 127590 + }, + { + "epoch": 0.12851503916538498, + "grad_norm": 9.685879530127387, + "learning_rate": 4.9245846253786265e-05, + "loss": 2.0457, + "mean_token_accuracy": 0.5054446458816528, + "step": 127595 + }, + { + "epoch": 0.12852007521848915, + "grad_norm": 9.778513673376692, + "learning_rate": 4.924575004896905e-05, + "loss": 2.2707, + "mean_token_accuracy": 0.4551724076271057, + "step": 127600 + }, + { + "epoch": 0.12852511127159333, + "grad_norm": 11.891404312162072, + "learning_rate": 4.924565383812057e-05, + "loss": 2.4989, + "mean_token_accuracy": 0.37586206793785093, + "step": 127605 + }, + { + "epoch": 0.1285301473246975, + "grad_norm": 10.75039502544218, + "learning_rate": 4.9245557621240856e-05, + "loss": 2.5416, + "mean_token_accuracy": 0.3862068891525269, + "step": 127610 + }, + { + "epoch": 0.12853518337780168, + "grad_norm": 9.612919729099593, + "learning_rate": 4.924546139832992e-05, + "loss": 2.7252, + "mean_token_accuracy": 0.40514217019081117, + "step": 127615 + }, + { + "epoch": 0.12854021943090585, + "grad_norm": 10.037783219321234, + "learning_rate": 4.924536516938781e-05, + "loss": 2.2765, + "mean_token_accuracy": 0.4344827651977539, + "step": 127620 + }, + { + "epoch": 0.12854525548401002, + "grad_norm": 10.355716639114506, + "learning_rate": 4.924526893441454e-05, + "loss": 2.578, + "mean_token_accuracy": 0.37931033968925476, + "step": 127625 + }, + { + "epoch": 0.1285502915371142, + "grad_norm": 11.434283329777282, + "learning_rate": 4.9245172693410137e-05, + "loss": 2.3593, + "mean_token_accuracy": 0.42413792610168455, + "step": 127630 + }, + { + "epoch": 0.12855532759021837, + "grad_norm": 9.883753883517219, + "learning_rate": 4.924507644637462e-05, + "loss": 2.4999, + "mean_token_accuracy": 0.37586207389831544, + "step": 127635 + }, + { + "epoch": 0.12856036364332254, + "grad_norm": 9.85825155963918, + "learning_rate": 4.9244980193308035e-05, + "loss": 2.1288, + "mean_token_accuracy": 0.41034482717514037, + "step": 127640 + }, + { + "epoch": 0.12856539969642672, + "grad_norm": 9.490900552391723, + "learning_rate": 4.92448839342104e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.40689654350280763, + "step": 127645 + }, + { + "epoch": 0.1285704357495309, + "grad_norm": 10.392967148786031, + "learning_rate": 4.924478766908174e-05, + "loss": 2.3756, + "mean_token_accuracy": 0.42758620381355283, + "step": 127650 + }, + { + "epoch": 0.12857547180263507, + "grad_norm": 10.196652029877601, + "learning_rate": 4.924469139792208e-05, + "loss": 2.4218, + "mean_token_accuracy": 0.4151845157146454, + "step": 127655 + }, + { + "epoch": 0.12858050785573924, + "grad_norm": 10.106740909292347, + "learning_rate": 4.9244595120731454e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.40889291763305663, + "step": 127660 + }, + { + "epoch": 0.1285855439088434, + "grad_norm": 8.746469829851762, + "learning_rate": 4.9244498837509865e-05, + "loss": 2.1263, + "mean_token_accuracy": 0.4848759710788727, + "step": 127665 + }, + { + "epoch": 0.1285905799619476, + "grad_norm": 11.96363592722122, + "learning_rate": 4.9244402548257374e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.4517241299152374, + "step": 127670 + }, + { + "epoch": 0.12859561601505176, + "grad_norm": 12.873178156372564, + "learning_rate": 4.9244306252973995e-05, + "loss": 2.1929, + "mean_token_accuracy": 0.4620689690113068, + "step": 127675 + }, + { + "epoch": 0.12860065206815594, + "grad_norm": 12.555515286965361, + "learning_rate": 4.924420995165974e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.42068966031074523, + "step": 127680 + }, + { + "epoch": 0.1286056881212601, + "grad_norm": 11.257758453877543, + "learning_rate": 4.924411364431465e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.46551724076271056, + "step": 127685 + }, + { + "epoch": 0.12861072417436428, + "grad_norm": 9.524178075990802, + "learning_rate": 4.924401733093875e-05, + "loss": 2.2319, + "mean_token_accuracy": 0.43448275327682495, + "step": 127690 + }, + { + "epoch": 0.12861576022746846, + "grad_norm": 10.73306407693994, + "learning_rate": 4.924392101153207e-05, + "loss": 2.8543, + "mean_token_accuracy": 0.4, + "step": 127695 + }, + { + "epoch": 0.12862079628057263, + "grad_norm": 12.358542560353868, + "learning_rate": 4.924382468609463e-05, + "loss": 2.5391, + "mean_token_accuracy": 0.41034482717514037, + "step": 127700 + }, + { + "epoch": 0.1286258323336768, + "grad_norm": 12.803602730367725, + "learning_rate": 4.9243728354626465e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.4665024638175964, + "step": 127705 + }, + { + "epoch": 0.12863086838678098, + "grad_norm": 11.912318131331174, + "learning_rate": 4.924363201712758e-05, + "loss": 2.0926, + "mean_token_accuracy": 0.482758617401123, + "step": 127710 + }, + { + "epoch": 0.12863590443988515, + "grad_norm": 8.778746103534983, + "learning_rate": 4.924353567359804e-05, + "loss": 2.2093, + "mean_token_accuracy": 0.4517241358757019, + "step": 127715 + }, + { + "epoch": 0.1286409404929893, + "grad_norm": 11.294805026590723, + "learning_rate": 4.9243439324037824e-05, + "loss": 2.9956, + "mean_token_accuracy": 0.3517241358757019, + "step": 127720 + }, + { + "epoch": 0.12864597654609347, + "grad_norm": 12.377306215613103, + "learning_rate": 4.9243342968447e-05, + "loss": 2.149, + "mean_token_accuracy": 0.4551724076271057, + "step": 127725 + }, + { + "epoch": 0.12865101259919764, + "grad_norm": 12.352788698625478, + "learning_rate": 4.924324660682558e-05, + "loss": 2.0403, + "mean_token_accuracy": 0.4689655125141144, + "step": 127730 + }, + { + "epoch": 0.12865604865230182, + "grad_norm": 12.446138862570537, + "learning_rate": 4.924315023917358e-05, + "loss": 2.3995, + "mean_token_accuracy": 0.4482758641242981, + "step": 127735 + }, + { + "epoch": 0.128661084705406, + "grad_norm": 9.314516141434398, + "learning_rate": 4.924305386549104e-05, + "loss": 2.4982, + "mean_token_accuracy": 0.44827585816383364, + "step": 127740 + }, + { + "epoch": 0.12866612075851017, + "grad_norm": 13.349780516521868, + "learning_rate": 4.924295748577799e-05, + "loss": 2.5256, + "mean_token_accuracy": 0.39848759174346926, + "step": 127745 + }, + { + "epoch": 0.12867115681161434, + "grad_norm": 9.868787205010163, + "learning_rate": 4.924286110003444e-05, + "loss": 2.539, + "mean_token_accuracy": 0.43103448748588563, + "step": 127750 + }, + { + "epoch": 0.1286761928647185, + "grad_norm": 8.303075561032642, + "learning_rate": 4.9242764708260426e-05, + "loss": 2.2235, + "mean_token_accuracy": 0.46049606800079346, + "step": 127755 + }, + { + "epoch": 0.1286812289178227, + "grad_norm": 8.700129701718794, + "learning_rate": 4.924266831045598e-05, + "loss": 2.3138, + "mean_token_accuracy": 0.43448275327682495, + "step": 127760 + }, + { + "epoch": 0.12868626497092686, + "grad_norm": 10.867260357626947, + "learning_rate": 4.924257190662112e-05, + "loss": 2.4015, + "mean_token_accuracy": 0.41863279342651366, + "step": 127765 + }, + { + "epoch": 0.12869130102403104, + "grad_norm": 9.941276875340995, + "learning_rate": 4.924247549675588e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.4241379201412201, + "step": 127770 + }, + { + "epoch": 0.1286963370771352, + "grad_norm": 11.761954858608897, + "learning_rate": 4.924237908086028e-05, + "loss": 2.4607, + "mean_token_accuracy": 0.42413792610168455, + "step": 127775 + }, + { + "epoch": 0.12870137313023938, + "grad_norm": 11.724174152208045, + "learning_rate": 4.924228265893436e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.4379310369491577, + "step": 127780 + }, + { + "epoch": 0.12870640918334356, + "grad_norm": 12.11535240758572, + "learning_rate": 4.924218623097813e-05, + "loss": 2.705, + "mean_token_accuracy": 0.42413793206214906, + "step": 127785 + }, + { + "epoch": 0.12871144523644773, + "grad_norm": 11.900259854091827, + "learning_rate": 4.924208979699162e-05, + "loss": 2.699, + "mean_token_accuracy": 0.37241379022598264, + "step": 127790 + }, + { + "epoch": 0.1287164812895519, + "grad_norm": 9.320682189089904, + "learning_rate": 4.924199335697486e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.4482758641242981, + "step": 127795 + }, + { + "epoch": 0.12872151734265608, + "grad_norm": 10.173876403857657, + "learning_rate": 4.9241896910927884e-05, + "loss": 2.077, + "mean_token_accuracy": 0.4620689570903778, + "step": 127800 + }, + { + "epoch": 0.12872655339576025, + "grad_norm": 12.429764339461054, + "learning_rate": 4.9241800458850704e-05, + "loss": 2.3958, + "mean_token_accuracy": 0.4206896543502808, + "step": 127805 + }, + { + "epoch": 0.12873158944886443, + "grad_norm": 9.717841497686875, + "learning_rate": 4.9241704000743354e-05, + "loss": 2.219, + "mean_token_accuracy": 0.4206896543502808, + "step": 127810 + }, + { + "epoch": 0.1287366255019686, + "grad_norm": 8.179142170925058, + "learning_rate": 4.9241607536605874e-05, + "loss": 2.0514, + "mean_token_accuracy": 0.47931034564971925, + "step": 127815 + }, + { + "epoch": 0.12874166155507277, + "grad_norm": 9.36774110027047, + "learning_rate": 4.9241511066438265e-05, + "loss": 2.1227, + "mean_token_accuracy": 0.4517241418361664, + "step": 127820 + }, + { + "epoch": 0.12874669760817695, + "grad_norm": 16.963848948166888, + "learning_rate": 4.9241414590240575e-05, + "loss": 2.8065, + "mean_token_accuracy": 0.3793103456497192, + "step": 127825 + }, + { + "epoch": 0.12875173366128112, + "grad_norm": 9.067236700654885, + "learning_rate": 4.9241318108012816e-05, + "loss": 2.5271, + "mean_token_accuracy": 0.42758620381355283, + "step": 127830 + }, + { + "epoch": 0.1287567697143853, + "grad_norm": 10.033844406102103, + "learning_rate": 4.924122161975502e-05, + "loss": 2.0759, + "mean_token_accuracy": 0.4620689690113068, + "step": 127835 + }, + { + "epoch": 0.12876180576748947, + "grad_norm": 11.480512654263155, + "learning_rate": 4.924112512546722e-05, + "loss": 2.5, + "mean_token_accuracy": 0.44827587008476255, + "step": 127840 + }, + { + "epoch": 0.12876684182059364, + "grad_norm": 10.490263782530084, + "learning_rate": 4.924102862514945e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.3551724076271057, + "step": 127845 + }, + { + "epoch": 0.12877187787369782, + "grad_norm": 11.952263836096163, + "learning_rate": 4.92409321188017e-05, + "loss": 2.5803, + "mean_token_accuracy": 0.4137930989265442, + "step": 127850 + }, + { + "epoch": 0.128776913926802, + "grad_norm": 12.367147219303193, + "learning_rate": 4.9240835606424044e-05, + "loss": 2.6585, + "mean_token_accuracy": 0.3896551787853241, + "step": 127855 + }, + { + "epoch": 0.12878194997990614, + "grad_norm": 8.913225504028858, + "learning_rate": 4.924073908801647e-05, + "loss": 2.8537, + "mean_token_accuracy": 0.42413792610168455, + "step": 127860 + }, + { + "epoch": 0.1287869860330103, + "grad_norm": 13.54135211834433, + "learning_rate": 4.924064256357903e-05, + "loss": 2.4155, + "mean_token_accuracy": 0.38620689511299133, + "step": 127865 + }, + { + "epoch": 0.12879202208611448, + "grad_norm": 11.067944021256205, + "learning_rate": 4.9240546033111734e-05, + "loss": 2.4104, + "mean_token_accuracy": 0.4137930989265442, + "step": 127870 + }, + { + "epoch": 0.12879705813921866, + "grad_norm": 8.913873877160082, + "learning_rate": 4.924044949661462e-05, + "loss": 2.3584, + "mean_token_accuracy": 0.4482758641242981, + "step": 127875 + }, + { + "epoch": 0.12880209419232283, + "grad_norm": 10.619948518487105, + "learning_rate": 4.924035295408771e-05, + "loss": 2.016, + "mean_token_accuracy": 0.4620689690113068, + "step": 127880 + }, + { + "epoch": 0.128807130245427, + "grad_norm": 8.548220761346169, + "learning_rate": 4.924025640553104e-05, + "loss": 2.2873, + "mean_token_accuracy": 0.41724138855934145, + "step": 127885 + }, + { + "epoch": 0.12881216629853118, + "grad_norm": 10.81221961159857, + "learning_rate": 4.9240159850944624e-05, + "loss": 2.9849, + "mean_token_accuracy": 0.3758620619773865, + "step": 127890 + }, + { + "epoch": 0.12881720235163535, + "grad_norm": 10.749276371479498, + "learning_rate": 4.9240063290328486e-05, + "loss": 2.7106, + "mean_token_accuracy": 0.3896551698446274, + "step": 127895 + }, + { + "epoch": 0.12882223840473953, + "grad_norm": 9.089119627198698, + "learning_rate": 4.9239966723682666e-05, + "loss": 2.377, + "mean_token_accuracy": 0.4379310369491577, + "step": 127900 + }, + { + "epoch": 0.1288272744578437, + "grad_norm": 7.1446871415420885, + "learning_rate": 4.9239870151007184e-05, + "loss": 2.2417, + "mean_token_accuracy": 0.4551724135875702, + "step": 127905 + }, + { + "epoch": 0.12883231051094787, + "grad_norm": 10.187841017023604, + "learning_rate": 4.9239773572302075e-05, + "loss": 2.2897, + "mean_token_accuracy": 0.4517241418361664, + "step": 127910 + }, + { + "epoch": 0.12883734656405205, + "grad_norm": 10.403386119245795, + "learning_rate": 4.9239676987567345e-05, + "loss": 2.6126, + "mean_token_accuracy": 0.4241379350423813, + "step": 127915 + }, + { + "epoch": 0.12884238261715622, + "grad_norm": 12.326672261273545, + "learning_rate": 4.9239580396803054e-05, + "loss": 2.734, + "mean_token_accuracy": 0.3551724135875702, + "step": 127920 + }, + { + "epoch": 0.1288474186702604, + "grad_norm": 9.206492008968038, + "learning_rate": 4.923948380000919e-05, + "loss": 2.032, + "mean_token_accuracy": 0.458620685338974, + "step": 127925 + }, + { + "epoch": 0.12885245472336457, + "grad_norm": 10.773759247858782, + "learning_rate": 4.923938719718581e-05, + "loss": 2.1891, + "mean_token_accuracy": 0.4517241418361664, + "step": 127930 + }, + { + "epoch": 0.12885749077646874, + "grad_norm": 9.995627006310333, + "learning_rate": 4.923929058833293e-05, + "loss": 1.9157, + "mean_token_accuracy": 0.46551724672317507, + "step": 127935 + }, + { + "epoch": 0.12886252682957292, + "grad_norm": 9.945657078055119, + "learning_rate": 4.923919397345057e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.47586206793785096, + "step": 127940 + }, + { + "epoch": 0.1288675628826771, + "grad_norm": 9.875888206298198, + "learning_rate": 4.9239097352538764e-05, + "loss": 2.3374, + "mean_token_accuracy": 0.42413793206214906, + "step": 127945 + }, + { + "epoch": 0.12887259893578126, + "grad_norm": 12.603182960831314, + "learning_rate": 4.923900072559755e-05, + "loss": 2.4499, + "mean_token_accuracy": 0.4034482717514038, + "step": 127950 + }, + { + "epoch": 0.12887763498888544, + "grad_norm": 6.387231426374068, + "learning_rate": 4.9238904092626934e-05, + "loss": 2.1974, + "mean_token_accuracy": 0.5034482717514038, + "step": 127955 + }, + { + "epoch": 0.1288826710419896, + "grad_norm": 12.448383390707571, + "learning_rate": 4.923880745362694e-05, + "loss": 2.4921, + "mean_token_accuracy": 0.41379311084747317, + "step": 127960 + }, + { + "epoch": 0.12888770709509378, + "grad_norm": 10.895570328460016, + "learning_rate": 4.923871080859763e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.3896551728248596, + "step": 127965 + }, + { + "epoch": 0.12889274314819796, + "grad_norm": 9.671907138873527, + "learning_rate": 4.923861415753899e-05, + "loss": 2.376, + "mean_token_accuracy": 0.4344827592372894, + "step": 127970 + }, + { + "epoch": 0.12889777920130213, + "grad_norm": 10.7373867442119, + "learning_rate": 4.923851750045107e-05, + "loss": 2.0071, + "mean_token_accuracy": 0.5151477873325347, + "step": 127975 + }, + { + "epoch": 0.1289028152544063, + "grad_norm": 10.500383645785941, + "learning_rate": 4.923842083733389e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.43974592089653014, + "step": 127980 + }, + { + "epoch": 0.12890785130751048, + "grad_norm": 15.368089437734245, + "learning_rate": 4.923832416818748e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.4137930929660797, + "step": 127985 + }, + { + "epoch": 0.12891288736061465, + "grad_norm": 9.960802926068677, + "learning_rate": 4.923822749301186e-05, + "loss": 2.6472, + "mean_token_accuracy": 0.38965516686439516, + "step": 127990 + }, + { + "epoch": 0.12891792341371883, + "grad_norm": 10.09790831119215, + "learning_rate": 4.923813081180707e-05, + "loss": 2.5491, + "mean_token_accuracy": 0.42413792610168455, + "step": 127995 + }, + { + "epoch": 0.12892295946682297, + "grad_norm": 9.149688846576023, + "learning_rate": 4.923803412457313e-05, + "loss": 2.0772, + "mean_token_accuracy": 0.45716878175735476, + "step": 128000 + }, + { + "epoch": 0.12892799551992715, + "grad_norm": 9.896589071965902, + "learning_rate": 4.923793743131006e-05, + "loss": 2.3349, + "mean_token_accuracy": 0.41379311084747317, + "step": 128005 + }, + { + "epoch": 0.12893303157303132, + "grad_norm": 9.850544263452704, + "learning_rate": 4.923784073201788e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.41379310488700866, + "step": 128010 + }, + { + "epoch": 0.1289380676261355, + "grad_norm": 11.280189944010093, + "learning_rate": 4.923774402669664e-05, + "loss": 2.6709, + "mean_token_accuracy": 0.4428917050361633, + "step": 128015 + }, + { + "epoch": 0.12894310367923967, + "grad_norm": 10.265939459215152, + "learning_rate": 4.923764731534636e-05, + "loss": 2.373, + "mean_token_accuracy": 0.4448275864124298, + "step": 128020 + }, + { + "epoch": 0.12894813973234384, + "grad_norm": 10.446819118202718, + "learning_rate": 4.923755059796706e-05, + "loss": 2.4208, + "mean_token_accuracy": 0.44827585220336913, + "step": 128025 + }, + { + "epoch": 0.12895317578544802, + "grad_norm": 8.906470300846742, + "learning_rate": 4.923745387455876e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.4068965554237366, + "step": 128030 + }, + { + "epoch": 0.1289582118385522, + "grad_norm": 8.416743895264542, + "learning_rate": 4.9237357145121504e-05, + "loss": 2.3328, + "mean_token_accuracy": 0.4103448212146759, + "step": 128035 + }, + { + "epoch": 0.12896324789165636, + "grad_norm": 12.411916065591448, + "learning_rate": 4.9237260409655316e-05, + "loss": 2.4392, + "mean_token_accuracy": 0.44373866319656374, + "step": 128040 + }, + { + "epoch": 0.12896828394476054, + "grad_norm": 11.547835284403805, + "learning_rate": 4.923716366816021e-05, + "loss": 2.5093, + "mean_token_accuracy": 0.39310344457626345, + "step": 128045 + }, + { + "epoch": 0.1289733199978647, + "grad_norm": 13.68837922765251, + "learning_rate": 4.9237066920636224e-05, + "loss": 2.8612, + "mean_token_accuracy": 0.4118572294712067, + "step": 128050 + }, + { + "epoch": 0.12897835605096888, + "grad_norm": 9.466380823989482, + "learning_rate": 4.923697016708338e-05, + "loss": 2.134, + "mean_token_accuracy": 0.5103448331356049, + "step": 128055 + }, + { + "epoch": 0.12898339210407306, + "grad_norm": 10.352341153421657, + "learning_rate": 4.9236873407501705e-05, + "loss": 2.5447, + "mean_token_accuracy": 0.3896551728248596, + "step": 128060 + }, + { + "epoch": 0.12898842815717723, + "grad_norm": 14.339941788551553, + "learning_rate": 4.923677664189123e-05, + "loss": 2.0564, + "mean_token_accuracy": 0.5217980325222016, + "step": 128065 + }, + { + "epoch": 0.1289934642102814, + "grad_norm": 10.45788698802218, + "learning_rate": 4.9236679870251975e-05, + "loss": 2.4836, + "mean_token_accuracy": 0.42758620381355283, + "step": 128070 + }, + { + "epoch": 0.12899850026338558, + "grad_norm": 11.187185664102419, + "learning_rate": 4.923658309258397e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.43793103098869324, + "step": 128075 + }, + { + "epoch": 0.12900353631648975, + "grad_norm": 10.299184261023209, + "learning_rate": 4.923648630888725e-05, + "loss": 2.341, + "mean_token_accuracy": 0.45517241954803467, + "step": 128080 + }, + { + "epoch": 0.12900857236959393, + "grad_norm": 9.825751820106026, + "learning_rate": 4.9236389519161826e-05, + "loss": 2.3219, + "mean_token_accuracy": 0.4413793087005615, + "step": 128085 + }, + { + "epoch": 0.1290136084226981, + "grad_norm": 11.382378327600449, + "learning_rate": 4.9236292723407736e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.44482758045196535, + "step": 128090 + }, + { + "epoch": 0.12901864447580227, + "grad_norm": 12.597371176736944, + "learning_rate": 4.923619592162501e-05, + "loss": 2.1234, + "mean_token_accuracy": 0.4620689630508423, + "step": 128095 + }, + { + "epoch": 0.12902368052890645, + "grad_norm": 11.599556806224502, + "learning_rate": 4.923609911381366e-05, + "loss": 2.3214, + "mean_token_accuracy": 0.43103447556495667, + "step": 128100 + }, + { + "epoch": 0.12902871658201062, + "grad_norm": 10.661955965787193, + "learning_rate": 4.923600229997373e-05, + "loss": 1.9721, + "mean_token_accuracy": 0.5241379320621491, + "step": 128105 + }, + { + "epoch": 0.1290337526351148, + "grad_norm": 9.6460281938407, + "learning_rate": 4.9235905480105235e-05, + "loss": 2.4721, + "mean_token_accuracy": 0.43647912740707395, + "step": 128110 + }, + { + "epoch": 0.12903878868821897, + "grad_norm": 13.656770520630891, + "learning_rate": 4.92358086542082e-05, + "loss": 2.7494, + "mean_token_accuracy": 0.3655172407627106, + "step": 128115 + }, + { + "epoch": 0.12904382474132314, + "grad_norm": 10.38199837686001, + "learning_rate": 4.923571182228267e-05, + "loss": 2.6349, + "mean_token_accuracy": 0.3931034505367279, + "step": 128120 + }, + { + "epoch": 0.12904886079442732, + "grad_norm": 12.340078572625343, + "learning_rate": 4.923561498432865e-05, + "loss": 2.1448, + "mean_token_accuracy": 0.4551724076271057, + "step": 128125 + }, + { + "epoch": 0.1290538968475315, + "grad_norm": 9.270954449059742, + "learning_rate": 4.923551814034618e-05, + "loss": 2.5246, + "mean_token_accuracy": 0.44137929677963256, + "step": 128130 + }, + { + "epoch": 0.12905893290063566, + "grad_norm": 9.518150641211344, + "learning_rate": 4.923542129033528e-05, + "loss": 2.1757, + "mean_token_accuracy": 0.4620689570903778, + "step": 128135 + }, + { + "epoch": 0.1290639689537398, + "grad_norm": 13.75313540086422, + "learning_rate": 4.9235324434295976e-05, + "loss": 2.2751, + "mean_token_accuracy": 0.4534785389900208, + "step": 128140 + }, + { + "epoch": 0.12906900500684398, + "grad_norm": 10.193098893792, + "learning_rate": 4.9235227572228306e-05, + "loss": 2.0801, + "mean_token_accuracy": 0.4551724076271057, + "step": 128145 + }, + { + "epoch": 0.12907404105994816, + "grad_norm": 13.76885370745627, + "learning_rate": 4.923513070413228e-05, + "loss": 2.7368, + "mean_token_accuracy": 0.3896551728248596, + "step": 128150 + }, + { + "epoch": 0.12907907711305233, + "grad_norm": 10.137959076144442, + "learning_rate": 4.9235033830007947e-05, + "loss": 2.3066, + "mean_token_accuracy": 0.4620689630508423, + "step": 128155 + }, + { + "epoch": 0.1290841131661565, + "grad_norm": 9.497434500079118, + "learning_rate": 4.923493694985532e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.493103438615799, + "step": 128160 + }, + { + "epoch": 0.12908914921926068, + "grad_norm": 10.618999431179382, + "learning_rate": 4.923484006367442e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.36551723480224607, + "step": 128165 + }, + { + "epoch": 0.12909418527236485, + "grad_norm": 23.14589823289881, + "learning_rate": 4.923474317146528e-05, + "loss": 2.4004, + "mean_token_accuracy": 0.4310344815254211, + "step": 128170 + }, + { + "epoch": 0.12909922132546903, + "grad_norm": 11.869449548702043, + "learning_rate": 4.9234646273227937e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.4551724076271057, + "step": 128175 + }, + { + "epoch": 0.1291042573785732, + "grad_norm": 17.95330533625835, + "learning_rate": 4.92345493689624e-05, + "loss": 3.1248, + "mean_token_accuracy": 0.3793103456497192, + "step": 128180 + }, + { + "epoch": 0.12910929343167737, + "grad_norm": 11.767762272627898, + "learning_rate": 4.923445245866871e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.42413793206214906, + "step": 128185 + }, + { + "epoch": 0.12911432948478155, + "grad_norm": 11.455829641652464, + "learning_rate": 4.923435554234688e-05, + "loss": 2.5565, + "mean_token_accuracy": 0.3931034505367279, + "step": 128190 + }, + { + "epoch": 0.12911936553788572, + "grad_norm": 18.73248184651721, + "learning_rate": 4.9234258619996956e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.47931034564971925, + "step": 128195 + }, + { + "epoch": 0.1291244015909899, + "grad_norm": 12.882458843004732, + "learning_rate": 4.923416169161895e-05, + "loss": 2.194, + "mean_token_accuracy": 0.44827585220336913, + "step": 128200 + }, + { + "epoch": 0.12912943764409407, + "grad_norm": 11.47918219496123, + "learning_rate": 4.9234064757212894e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.42413793206214906, + "step": 128205 + }, + { + "epoch": 0.12913447369719824, + "grad_norm": 10.204767922131863, + "learning_rate": 4.9233967816778814e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.4344827651977539, + "step": 128210 + }, + { + "epoch": 0.12913950975030242, + "grad_norm": 11.021061055877462, + "learning_rate": 4.923387087031674e-05, + "loss": 2.3792, + "mean_token_accuracy": 0.4, + "step": 128215 + }, + { + "epoch": 0.1291445458034066, + "grad_norm": 11.659137608060712, + "learning_rate": 4.923377391782669e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.4586206912994385, + "step": 128220 + }, + { + "epoch": 0.12914958185651076, + "grad_norm": 12.511110755537983, + "learning_rate": 4.9233676959308706e-05, + "loss": 2.6203, + "mean_token_accuracy": 0.4172413766384125, + "step": 128225 + }, + { + "epoch": 0.12915461790961494, + "grad_norm": 10.143849413269466, + "learning_rate": 4.923357999476279e-05, + "loss": 2.1678, + "mean_token_accuracy": 0.4689655065536499, + "step": 128230 + }, + { + "epoch": 0.1291596539627191, + "grad_norm": 12.378513577454244, + "learning_rate": 4.923348302418899e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.4275861978530884, + "step": 128235 + }, + { + "epoch": 0.12916469001582329, + "grad_norm": 9.238350809807386, + "learning_rate": 4.923338604758734e-05, + "loss": 2.3772, + "mean_token_accuracy": 0.38965516686439516, + "step": 128240 + }, + { + "epoch": 0.12916972606892746, + "grad_norm": 10.017038917496683, + "learning_rate": 4.9233289064957844e-05, + "loss": 2.6635, + "mean_token_accuracy": 0.4068965405225754, + "step": 128245 + }, + { + "epoch": 0.12917476212203163, + "grad_norm": 13.79515467311658, + "learning_rate": 4.923319207630054e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.4689655125141144, + "step": 128250 + }, + { + "epoch": 0.1291797981751358, + "grad_norm": 10.141504044822554, + "learning_rate": 4.9233095081615454e-05, + "loss": 2.4224, + "mean_token_accuracy": 0.46551724672317507, + "step": 128255 + }, + { + "epoch": 0.12918483422823998, + "grad_norm": 9.756983224290494, + "learning_rate": 4.923299808090261e-05, + "loss": 2.4208, + "mean_token_accuracy": 0.4103448212146759, + "step": 128260 + }, + { + "epoch": 0.12918987028134415, + "grad_norm": 10.809049172746894, + "learning_rate": 4.923290107416204e-05, + "loss": 2.396, + "mean_token_accuracy": 0.4068965494632721, + "step": 128265 + }, + { + "epoch": 0.12919490633444833, + "grad_norm": 12.873062680950348, + "learning_rate": 4.923280406139377e-05, + "loss": 2.4719, + "mean_token_accuracy": 0.4172413796186447, + "step": 128270 + }, + { + "epoch": 0.12919994238755247, + "grad_norm": 12.02306164832703, + "learning_rate": 4.923270704259783e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.4517241358757019, + "step": 128275 + }, + { + "epoch": 0.12920497844065665, + "grad_norm": 10.543216943325918, + "learning_rate": 4.9232610017774235e-05, + "loss": 2.429, + "mean_token_accuracy": 0.4359346628189087, + "step": 128280 + }, + { + "epoch": 0.12921001449376082, + "grad_norm": 9.285387787629254, + "learning_rate": 4.9232512986923027e-05, + "loss": 2.3374, + "mean_token_accuracy": 0.3965517163276672, + "step": 128285 + }, + { + "epoch": 0.129215050546865, + "grad_norm": 8.288132963426827, + "learning_rate": 4.9232415950044216e-05, + "loss": 2.1738, + "mean_token_accuracy": 0.4344827592372894, + "step": 128290 + }, + { + "epoch": 0.12922008659996917, + "grad_norm": 9.100106675981571, + "learning_rate": 4.9232318907137845e-05, + "loss": 2.3022, + "mean_token_accuracy": 0.4310344815254211, + "step": 128295 + }, + { + "epoch": 0.12922512265307334, + "grad_norm": 8.591850562778362, + "learning_rate": 4.923222185820393e-05, + "loss": 2.2747, + "mean_token_accuracy": 0.4620689630508423, + "step": 128300 + }, + { + "epoch": 0.12923015870617752, + "grad_norm": 11.711904980572218, + "learning_rate": 4.923212480324251e-05, + "loss": 2.5069, + "mean_token_accuracy": 0.417241370677948, + "step": 128305 + }, + { + "epoch": 0.1292351947592817, + "grad_norm": 10.441602356776599, + "learning_rate": 4.92320277422536e-05, + "loss": 2.4624, + "mean_token_accuracy": 0.44827585220336913, + "step": 128310 + }, + { + "epoch": 0.12924023081238586, + "grad_norm": 8.936264956571058, + "learning_rate": 4.9231930675237224e-05, + "loss": 2.0904, + "mean_token_accuracy": 0.4620689511299133, + "step": 128315 + }, + { + "epoch": 0.12924526686549004, + "grad_norm": 11.696122010133827, + "learning_rate": 4.923183360219343e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.46268472671508787, + "step": 128320 + }, + { + "epoch": 0.1292503029185942, + "grad_norm": 9.49282278617059, + "learning_rate": 4.9231736523122226e-05, + "loss": 2.339, + "mean_token_accuracy": 0.4781004250049591, + "step": 128325 + }, + { + "epoch": 0.12925533897169839, + "grad_norm": 10.560907837869582, + "learning_rate": 4.923163943802364e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.4931034445762634, + "step": 128330 + }, + { + "epoch": 0.12926037502480256, + "grad_norm": 7.868158711451674, + "learning_rate": 4.92315423468977e-05, + "loss": 1.82, + "mean_token_accuracy": 0.5434361755847931, + "step": 128335 + }, + { + "epoch": 0.12926541107790673, + "grad_norm": 11.559445731840162, + "learning_rate": 4.923144524974444e-05, + "loss": 2.3789, + "mean_token_accuracy": 0.4448275864124298, + "step": 128340 + }, + { + "epoch": 0.1292704471310109, + "grad_norm": 8.62185157778288, + "learning_rate": 4.9231348146563875e-05, + "loss": 2.5222, + "mean_token_accuracy": 0.4517241418361664, + "step": 128345 + }, + { + "epoch": 0.12927548318411508, + "grad_norm": 10.499650567420986, + "learning_rate": 4.9231251037356054e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.4, + "step": 128350 + }, + { + "epoch": 0.12928051923721925, + "grad_norm": 9.987244560985339, + "learning_rate": 4.9231153922120983e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.42758620381355283, + "step": 128355 + }, + { + "epoch": 0.12928555529032343, + "grad_norm": 11.609162890519203, + "learning_rate": 4.923105680085869e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.44827587008476255, + "step": 128360 + }, + { + "epoch": 0.1292905913434276, + "grad_norm": 10.785544627357782, + "learning_rate": 4.923095967356922e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.3793103456497192, + "step": 128365 + }, + { + "epoch": 0.12929562739653178, + "grad_norm": 10.05384063989405, + "learning_rate": 4.9230862540252575e-05, + "loss": 2.3357, + "mean_token_accuracy": 0.4482758462429047, + "step": 128370 + }, + { + "epoch": 0.12930066344963595, + "grad_norm": 10.077603193180943, + "learning_rate": 4.92307654009088e-05, + "loss": 2.6173, + "mean_token_accuracy": 0.42413793206214906, + "step": 128375 + }, + { + "epoch": 0.12930569950274012, + "grad_norm": 12.94194483584215, + "learning_rate": 4.923066825553791e-05, + "loss": 2.4918, + "mean_token_accuracy": 0.40689654648303986, + "step": 128380 + }, + { + "epoch": 0.1293107355558443, + "grad_norm": 8.653185343151973, + "learning_rate": 4.9230571104139946e-05, + "loss": 1.9475, + "mean_token_accuracy": 0.5137931048870087, + "step": 128385 + }, + { + "epoch": 0.12931577160894847, + "grad_norm": 10.093317038606145, + "learning_rate": 4.923047394671492e-05, + "loss": 2.7248, + "mean_token_accuracy": 0.35172413289546967, + "step": 128390 + }, + { + "epoch": 0.12932080766205264, + "grad_norm": 9.015370340150174, + "learning_rate": 4.923037678326287e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.43448275327682495, + "step": 128395 + }, + { + "epoch": 0.12932584371515682, + "grad_norm": 12.727613415682727, + "learning_rate": 4.9230279613783815e-05, + "loss": 2.718, + "mean_token_accuracy": 0.37241379022598264, + "step": 128400 + }, + { + "epoch": 0.129330879768261, + "grad_norm": 12.893233089636619, + "learning_rate": 4.923018243827779e-05, + "loss": 2.6024, + "mean_token_accuracy": 0.38620689511299133, + "step": 128405 + }, + { + "epoch": 0.12933591582136517, + "grad_norm": 9.769822701325634, + "learning_rate": 4.923008525674482e-05, + "loss": 2.3737, + "mean_token_accuracy": 0.42758620977401735, + "step": 128410 + }, + { + "epoch": 0.1293409518744693, + "grad_norm": 11.093732285389486, + "learning_rate": 4.922998806918493e-05, + "loss": 2.2606, + "mean_token_accuracy": 0.44827587008476255, + "step": 128415 + }, + { + "epoch": 0.12934598792757349, + "grad_norm": 11.831503662923112, + "learning_rate": 4.922989087559814e-05, + "loss": 2.3646, + "mean_token_accuracy": 0.44482759237289426, + "step": 128420 + }, + { + "epoch": 0.12935102398067766, + "grad_norm": 10.932611173680359, + "learning_rate": 4.9229793675984484e-05, + "loss": 2.516, + "mean_token_accuracy": 0.4, + "step": 128425 + }, + { + "epoch": 0.12935606003378183, + "grad_norm": 10.720429181341594, + "learning_rate": 4.9229696470343995e-05, + "loss": 2.5235, + "mean_token_accuracy": 0.4103448212146759, + "step": 128430 + }, + { + "epoch": 0.129361096086886, + "grad_norm": 9.057200822809468, + "learning_rate": 4.922959925867669e-05, + "loss": 2.3166, + "mean_token_accuracy": 0.3999999940395355, + "step": 128435 + }, + { + "epoch": 0.12936613213999018, + "grad_norm": 11.393283278613834, + "learning_rate": 4.92295020409826e-05, + "loss": 2.3571, + "mean_token_accuracy": 0.42758620977401735, + "step": 128440 + }, + { + "epoch": 0.12937116819309435, + "grad_norm": 9.417508024936371, + "learning_rate": 4.922940481726175e-05, + "loss": 2.2085, + "mean_token_accuracy": 0.4172413766384125, + "step": 128445 + }, + { + "epoch": 0.12937620424619853, + "grad_norm": 10.641598745260088, + "learning_rate": 4.9229307587514174e-05, + "loss": 2.605, + "mean_token_accuracy": 0.4000000059604645, + "step": 128450 + }, + { + "epoch": 0.1293812402993027, + "grad_norm": 12.391252616413993, + "learning_rate": 4.9229210351739884e-05, + "loss": 2.7849, + "mean_token_accuracy": 0.3724137991666794, + "step": 128455 + }, + { + "epoch": 0.12938627635240688, + "grad_norm": 10.609599197597293, + "learning_rate": 4.922911310993892e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.4137930989265442, + "step": 128460 + }, + { + "epoch": 0.12939131240551105, + "grad_norm": 11.084332974920564, + "learning_rate": 4.922901586211131e-05, + "loss": 2.2999, + "mean_token_accuracy": 0.4241379380226135, + "step": 128465 + }, + { + "epoch": 0.12939634845861522, + "grad_norm": 9.929504039439461, + "learning_rate": 4.922891860825707e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4586206912994385, + "step": 128470 + }, + { + "epoch": 0.1294013845117194, + "grad_norm": 15.988994757873366, + "learning_rate": 4.9228821348376245e-05, + "loss": 2.6338, + "mean_token_accuracy": 0.4379310250282288, + "step": 128475 + }, + { + "epoch": 0.12940642056482357, + "grad_norm": 10.086236902092688, + "learning_rate": 4.9228724082468846e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.4448275864124298, + "step": 128480 + }, + { + "epoch": 0.12941145661792774, + "grad_norm": 9.919850608237327, + "learning_rate": 4.92286268105349e-05, + "loss": 2.4092, + "mean_token_accuracy": 0.42068966031074523, + "step": 128485 + }, + { + "epoch": 0.12941649267103192, + "grad_norm": 10.36151074613392, + "learning_rate": 4.9228529532574436e-05, + "loss": 2.6474, + "mean_token_accuracy": 0.41034482717514037, + "step": 128490 + }, + { + "epoch": 0.1294215287241361, + "grad_norm": 10.814078549595507, + "learning_rate": 4.9228432248587486e-05, + "loss": 1.9901, + "mean_token_accuracy": 0.46388384103775027, + "step": 128495 + }, + { + "epoch": 0.12942656477724027, + "grad_norm": 9.628282644193899, + "learning_rate": 4.922833495857408e-05, + "loss": 2.3803, + "mean_token_accuracy": 0.42649728059768677, + "step": 128500 + }, + { + "epoch": 0.12943160083034444, + "grad_norm": 10.190468686052826, + "learning_rate": 4.922823766253423e-05, + "loss": 2.1543, + "mean_token_accuracy": 0.4085904359817505, + "step": 128505 + }, + { + "epoch": 0.1294366368834486, + "grad_norm": 12.736714565369475, + "learning_rate": 4.922814036046797e-05, + "loss": 2.2055, + "mean_token_accuracy": 0.47241379618644713, + "step": 128510 + }, + { + "epoch": 0.1294416729365528, + "grad_norm": 13.81866931657363, + "learning_rate": 4.922804305237534e-05, + "loss": 2.1993, + "mean_token_accuracy": 0.4551724135875702, + "step": 128515 + }, + { + "epoch": 0.12944670898965696, + "grad_norm": 11.332984657417704, + "learning_rate": 4.922794573825635e-05, + "loss": 2.291, + "mean_token_accuracy": 0.43448275327682495, + "step": 128520 + }, + { + "epoch": 0.12945174504276113, + "grad_norm": 10.612774116665511, + "learning_rate": 4.9227848418111036e-05, + "loss": 2.2284, + "mean_token_accuracy": 0.41034482717514037, + "step": 128525 + }, + { + "epoch": 0.1294567810958653, + "grad_norm": 10.647990408439938, + "learning_rate": 4.9227751091939425e-05, + "loss": 2.6217, + "mean_token_accuracy": 0.3793103456497192, + "step": 128530 + }, + { + "epoch": 0.12946181714896948, + "grad_norm": 10.215387089311125, + "learning_rate": 4.922765375974154e-05, + "loss": 2.0988, + "mean_token_accuracy": 0.4517241358757019, + "step": 128535 + }, + { + "epoch": 0.12946685320207366, + "grad_norm": 9.50565017360137, + "learning_rate": 4.9227556421517404e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.48275862336158754, + "step": 128540 + }, + { + "epoch": 0.12947188925517783, + "grad_norm": 12.570121742660366, + "learning_rate": 4.922745907726705e-05, + "loss": 2.3682, + "mean_token_accuracy": 0.4206896543502808, + "step": 128545 + }, + { + "epoch": 0.129476925308282, + "grad_norm": 10.587107295593276, + "learning_rate": 4.9227361726990505e-05, + "loss": 2.6492, + "mean_token_accuracy": 0.3999999940395355, + "step": 128550 + }, + { + "epoch": 0.12948196136138615, + "grad_norm": 10.889903996097548, + "learning_rate": 4.92272643706878e-05, + "loss": 2.325, + "mean_token_accuracy": 0.4551724135875702, + "step": 128555 + }, + { + "epoch": 0.12948699741449032, + "grad_norm": 11.250836260352825, + "learning_rate": 4.9227167008358946e-05, + "loss": 2.2902, + "mean_token_accuracy": 0.3931034505367279, + "step": 128560 + }, + { + "epoch": 0.1294920334675945, + "grad_norm": 9.692144480822643, + "learning_rate": 4.9227069640003984e-05, + "loss": 2.6993, + "mean_token_accuracy": 0.46182698011398315, + "step": 128565 + }, + { + "epoch": 0.12949706952069867, + "grad_norm": 10.364059605856148, + "learning_rate": 4.922697226562295e-05, + "loss": 2.3901, + "mean_token_accuracy": 0.40689654350280763, + "step": 128570 + }, + { + "epoch": 0.12950210557380284, + "grad_norm": 10.945650258632602, + "learning_rate": 4.9226874885215854e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.42068964838981626, + "step": 128575 + }, + { + "epoch": 0.12950714162690702, + "grad_norm": 13.475264468376844, + "learning_rate": 4.922677749878272e-05, + "loss": 2.6797, + "mean_token_accuracy": 0.34137930572032926, + "step": 128580 + }, + { + "epoch": 0.1295121776800112, + "grad_norm": 9.29288252708925, + "learning_rate": 4.922668010632359e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.4068965524435043, + "step": 128585 + }, + { + "epoch": 0.12951721373311537, + "grad_norm": 15.144942365167076, + "learning_rate": 4.922658270783848e-05, + "loss": 2.2094, + "mean_token_accuracy": 0.4517241358757019, + "step": 128590 + }, + { + "epoch": 0.12952224978621954, + "grad_norm": 14.56149968575802, + "learning_rate": 4.922648530332743e-05, + "loss": 2.5125, + "mean_token_accuracy": 0.43103448748588563, + "step": 128595 + }, + { + "epoch": 0.1295272858393237, + "grad_norm": 9.788288204012805, + "learning_rate": 4.922638789279045e-05, + "loss": 2.5252, + "mean_token_accuracy": 0.4310344815254211, + "step": 128600 + }, + { + "epoch": 0.1295323218924279, + "grad_norm": 11.313123335277009, + "learning_rate": 4.922629047622758e-05, + "loss": 2.375, + "mean_token_accuracy": 0.41379310488700866, + "step": 128605 + }, + { + "epoch": 0.12953735794553206, + "grad_norm": 11.983241871420145, + "learning_rate": 4.922619305363884e-05, + "loss": 2.4844, + "mean_token_accuracy": 0.4, + "step": 128610 + }, + { + "epoch": 0.12954239399863623, + "grad_norm": 9.775363386181654, + "learning_rate": 4.9226095625024256e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.3655172407627106, + "step": 128615 + }, + { + "epoch": 0.1295474300517404, + "grad_norm": 10.321132760324106, + "learning_rate": 4.922599819038386e-05, + "loss": 2.2395, + "mean_token_accuracy": 0.42068966031074523, + "step": 128620 + }, + { + "epoch": 0.12955246610484458, + "grad_norm": 13.021030834541213, + "learning_rate": 4.9225900749717683e-05, + "loss": 2.515, + "mean_token_accuracy": 0.36896551251411436, + "step": 128625 + }, + { + "epoch": 0.12955750215794876, + "grad_norm": 10.368196936865282, + "learning_rate": 4.922580330302574e-05, + "loss": 2.3935, + "mean_token_accuracy": 0.4206896543502808, + "step": 128630 + }, + { + "epoch": 0.12956253821105293, + "grad_norm": 31.510578625425886, + "learning_rate": 4.922570585030808e-05, + "loss": 3.061, + "mean_token_accuracy": 0.3379310369491577, + "step": 128635 + }, + { + "epoch": 0.1295675742641571, + "grad_norm": 14.779858693807187, + "learning_rate": 4.922560839156469e-05, + "loss": 2.7028, + "mean_token_accuracy": 0.37241379618644715, + "step": 128640 + }, + { + "epoch": 0.12957261031726128, + "grad_norm": 11.210140774520562, + "learning_rate": 4.922551092679564e-05, + "loss": 2.3217, + "mean_token_accuracy": 0.4413793087005615, + "step": 128645 + }, + { + "epoch": 0.12957764637036545, + "grad_norm": 10.49835099290548, + "learning_rate": 4.922541345600093e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.4502722263336182, + "step": 128650 + }, + { + "epoch": 0.12958268242346963, + "grad_norm": 9.933881739230081, + "learning_rate": 4.922531597918059e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.43448275327682495, + "step": 128655 + }, + { + "epoch": 0.1295877184765738, + "grad_norm": 9.02386435036657, + "learning_rate": 4.922521849633467e-05, + "loss": 2.3226, + "mean_token_accuracy": 0.42758620381355283, + "step": 128660 + }, + { + "epoch": 0.12959275452967797, + "grad_norm": 9.422806619540497, + "learning_rate": 4.922512100746316e-05, + "loss": 2.2151, + "mean_token_accuracy": 0.45172412395477296, + "step": 128665 + }, + { + "epoch": 0.12959779058278215, + "grad_norm": 10.513136882419335, + "learning_rate": 4.922502351256612e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.358620685338974, + "step": 128670 + }, + { + "epoch": 0.12960282663588632, + "grad_norm": 10.102674252622704, + "learning_rate": 4.922492601164356e-05, + "loss": 2.3054, + "mean_token_accuracy": 0.4551724135875702, + "step": 128675 + }, + { + "epoch": 0.1296078626889905, + "grad_norm": 10.341627093599968, + "learning_rate": 4.922482850469551e-05, + "loss": 2.5214, + "mean_token_accuracy": 0.42068966031074523, + "step": 128680 + }, + { + "epoch": 0.12961289874209467, + "grad_norm": 9.825369302983255, + "learning_rate": 4.9224730991721996e-05, + "loss": 2.2416, + "mean_token_accuracy": 0.4448275864124298, + "step": 128685 + }, + { + "epoch": 0.12961793479519884, + "grad_norm": 8.92783688411926, + "learning_rate": 4.922463347272305e-05, + "loss": 2.0701, + "mean_token_accuracy": 0.47241379618644713, + "step": 128690 + }, + { + "epoch": 0.129622970848303, + "grad_norm": 13.58763556782782, + "learning_rate": 4.92245359476987e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.4103448212146759, + "step": 128695 + }, + { + "epoch": 0.12962800690140716, + "grad_norm": 10.210042775066235, + "learning_rate": 4.922443841664896e-05, + "loss": 2.3501, + "mean_token_accuracy": 0.39310344457626345, + "step": 128700 + }, + { + "epoch": 0.12963304295451133, + "grad_norm": 14.199492185502088, + "learning_rate": 4.9224340879573864e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.45985221266746523, + "step": 128705 + }, + { + "epoch": 0.1296380790076155, + "grad_norm": 9.968943710280945, + "learning_rate": 4.922424333647345e-05, + "loss": 2.1674, + "mean_token_accuracy": 0.46551724672317507, + "step": 128710 + }, + { + "epoch": 0.12964311506071968, + "grad_norm": 11.371724280845331, + "learning_rate": 4.922414578734773e-05, + "loss": 2.8526, + "mean_token_accuracy": 0.3896551728248596, + "step": 128715 + }, + { + "epoch": 0.12964815111382386, + "grad_norm": 9.973817709465926, + "learning_rate": 4.922404823219674e-05, + "loss": 2.3886, + "mean_token_accuracy": 0.4365396320819855, + "step": 128720 + }, + { + "epoch": 0.12965318716692803, + "grad_norm": 10.75277288340718, + "learning_rate": 4.92239506710205e-05, + "loss": 2.295, + "mean_token_accuracy": 0.4068965554237366, + "step": 128725 + }, + { + "epoch": 0.1296582232200322, + "grad_norm": 11.649506136148458, + "learning_rate": 4.922385310381906e-05, + "loss": 3.0828, + "mean_token_accuracy": 0.3689655065536499, + "step": 128730 + }, + { + "epoch": 0.12966325927313638, + "grad_norm": 10.35002744805028, + "learning_rate": 4.922375553059241e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.41034482717514037, + "step": 128735 + }, + { + "epoch": 0.12966829532624055, + "grad_norm": 14.850963433565857, + "learning_rate": 4.922365795134059e-05, + "loss": 2.3013, + "mean_token_accuracy": 0.43793103098869324, + "step": 128740 + }, + { + "epoch": 0.12967333137934473, + "grad_norm": 10.138083259821439, + "learning_rate": 4.922356036606365e-05, + "loss": 2.6505, + "mean_token_accuracy": 0.39310344457626345, + "step": 128745 + }, + { + "epoch": 0.1296783674324489, + "grad_norm": 13.169020782694234, + "learning_rate": 4.922346277476159e-05, + "loss": 2.683, + "mean_token_accuracy": 0.4015124022960663, + "step": 128750 + }, + { + "epoch": 0.12968340348555307, + "grad_norm": 14.682999815421061, + "learning_rate": 4.922336517743445e-05, + "loss": 2.7534, + "mean_token_accuracy": 0.3862069010734558, + "step": 128755 + }, + { + "epoch": 0.12968843953865725, + "grad_norm": 12.406982092579819, + "learning_rate": 4.922326757408224e-05, + "loss": 2.5751, + "mean_token_accuracy": 0.4034482717514038, + "step": 128760 + }, + { + "epoch": 0.12969347559176142, + "grad_norm": 9.289541207141683, + "learning_rate": 4.9223169964705016e-05, + "loss": 2.1228, + "mean_token_accuracy": 0.532758629322052, + "step": 128765 + }, + { + "epoch": 0.1296985116448656, + "grad_norm": 14.905165619608505, + "learning_rate": 4.9223072349302784e-05, + "loss": 2.479, + "mean_token_accuracy": 0.41034482717514037, + "step": 128770 + }, + { + "epoch": 0.12970354769796977, + "grad_norm": 10.54707094637327, + "learning_rate": 4.922297472787558e-05, + "loss": 2.369, + "mean_token_accuracy": 0.4068965554237366, + "step": 128775 + }, + { + "epoch": 0.12970858375107394, + "grad_norm": 9.267326814110223, + "learning_rate": 4.922287710042342e-05, + "loss": 2.0204, + "mean_token_accuracy": 0.46551724076271056, + "step": 128780 + }, + { + "epoch": 0.12971361980417812, + "grad_norm": 11.6991069401333, + "learning_rate": 4.922277946694635e-05, + "loss": 2.5079, + "mean_token_accuracy": 0.45039322376251223, + "step": 128785 + }, + { + "epoch": 0.1297186558572823, + "grad_norm": 11.040960812283831, + "learning_rate": 4.922268182744438e-05, + "loss": 2.5548, + "mean_token_accuracy": 0.4103448331356049, + "step": 128790 + }, + { + "epoch": 0.12972369191038646, + "grad_norm": 11.821578228411786, + "learning_rate": 4.9222584181917545e-05, + "loss": 2.429, + "mean_token_accuracy": 0.3724137842655182, + "step": 128795 + }, + { + "epoch": 0.12972872796349064, + "grad_norm": 14.745171348941362, + "learning_rate": 4.922248653036587e-05, + "loss": 2.1999, + "mean_token_accuracy": 0.482758629322052, + "step": 128800 + }, + { + "epoch": 0.1297337640165948, + "grad_norm": 9.803592162819138, + "learning_rate": 4.922238887278938e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.4344827592372894, + "step": 128805 + }, + { + "epoch": 0.12973880006969898, + "grad_norm": 14.078533489124007, + "learning_rate": 4.922229120918811e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.41379310488700866, + "step": 128810 + }, + { + "epoch": 0.12974383612280316, + "grad_norm": 9.427708135493718, + "learning_rate": 4.922219353956208e-05, + "loss": 2.7344, + "mean_token_accuracy": 0.3620689630508423, + "step": 128815 + }, + { + "epoch": 0.12974887217590733, + "grad_norm": 10.202429176037935, + "learning_rate": 4.922209586391131e-05, + "loss": 2.244, + "mean_token_accuracy": 0.4586206912994385, + "step": 128820 + }, + { + "epoch": 0.1297539082290115, + "grad_norm": 7.656981448783427, + "learning_rate": 4.9221998182235844e-05, + "loss": 1.965, + "mean_token_accuracy": 0.4517241358757019, + "step": 128825 + }, + { + "epoch": 0.12975894428211568, + "grad_norm": 14.059614221938608, + "learning_rate": 4.9221900494535704e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.4068965494632721, + "step": 128830 + }, + { + "epoch": 0.12976398033521983, + "grad_norm": 9.42369608083215, + "learning_rate": 4.9221802800810904e-05, + "loss": 2.6977, + "mean_token_accuracy": 0.43793103098869324, + "step": 128835 + }, + { + "epoch": 0.129769016388324, + "grad_norm": 10.432590511032222, + "learning_rate": 4.922170510106149e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.44482759237289426, + "step": 128840 + }, + { + "epoch": 0.12977405244142817, + "grad_norm": 10.572904177778087, + "learning_rate": 4.922160739528747e-05, + "loss": 2.4216, + "mean_token_accuracy": 0.4068965554237366, + "step": 128845 + }, + { + "epoch": 0.12977908849453235, + "grad_norm": 9.81605639781412, + "learning_rate": 4.9221509683488884e-05, + "loss": 1.6755, + "mean_token_accuracy": 0.545976996421814, + "step": 128850 + }, + { + "epoch": 0.12978412454763652, + "grad_norm": 13.612671088942882, + "learning_rate": 4.9221411965665766e-05, + "loss": 2.2121, + "mean_token_accuracy": 0.43448275327682495, + "step": 128855 + }, + { + "epoch": 0.1297891606007407, + "grad_norm": 13.363442660999057, + "learning_rate": 4.922131424181813e-05, + "loss": 2.74, + "mean_token_accuracy": 0.3793103456497192, + "step": 128860 + }, + { + "epoch": 0.12979419665384487, + "grad_norm": 12.847790319177335, + "learning_rate": 4.9221216511945994e-05, + "loss": 2.8904, + "mean_token_accuracy": 0.39782213866710664, + "step": 128865 + }, + { + "epoch": 0.12979923270694904, + "grad_norm": 11.075422676408175, + "learning_rate": 4.9221118776049416e-05, + "loss": 2.5751, + "mean_token_accuracy": 0.4034482777118683, + "step": 128870 + }, + { + "epoch": 0.12980426876005322, + "grad_norm": 9.254314635761203, + "learning_rate": 4.922102103412839e-05, + "loss": 2.442, + "mean_token_accuracy": 0.44700543880462645, + "step": 128875 + }, + { + "epoch": 0.1298093048131574, + "grad_norm": 9.202192513016714, + "learning_rate": 4.922092328618297e-05, + "loss": 2.4319, + "mean_token_accuracy": 0.4482758641242981, + "step": 128880 + }, + { + "epoch": 0.12981434086626156, + "grad_norm": 9.269608318216155, + "learning_rate": 4.922082553221315e-05, + "loss": 2.1075, + "mean_token_accuracy": 0.47931034564971925, + "step": 128885 + }, + { + "epoch": 0.12981937691936574, + "grad_norm": 9.99024636474826, + "learning_rate": 4.9220727772219e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.4275861978530884, + "step": 128890 + }, + { + "epoch": 0.1298244129724699, + "grad_norm": 10.679935001270755, + "learning_rate": 4.922063000620052e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.4448275864124298, + "step": 128895 + }, + { + "epoch": 0.12982944902557408, + "grad_norm": 15.057453091172635, + "learning_rate": 4.9220532234157733e-05, + "loss": 2.2004, + "mean_token_accuracy": 0.43793103098869324, + "step": 128900 + }, + { + "epoch": 0.12983448507867826, + "grad_norm": 8.823157014977081, + "learning_rate": 4.9220434456090684e-05, + "loss": 2.7488, + "mean_token_accuracy": 0.4, + "step": 128905 + }, + { + "epoch": 0.12983952113178243, + "grad_norm": 9.661226208373158, + "learning_rate": 4.922033667199939e-05, + "loss": 2.1303, + "mean_token_accuracy": 0.4931034445762634, + "step": 128910 + }, + { + "epoch": 0.1298445571848866, + "grad_norm": 11.7028942971968, + "learning_rate": 4.9220238881883885e-05, + "loss": 2.2658, + "mean_token_accuracy": 0.44482758045196535, + "step": 128915 + }, + { + "epoch": 0.12984959323799078, + "grad_norm": 15.302319056815664, + "learning_rate": 4.9220141085744175e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.36551724672317504, + "step": 128920 + }, + { + "epoch": 0.12985462929109495, + "grad_norm": 9.824674646383706, + "learning_rate": 4.922004328358032e-05, + "loss": 2.2326, + "mean_token_accuracy": 0.4620689630508423, + "step": 128925 + }, + { + "epoch": 0.12985966534419913, + "grad_norm": 9.23863390844651, + "learning_rate": 4.921994547539232e-05, + "loss": 2.0856, + "mean_token_accuracy": 0.47586206197738645, + "step": 128930 + }, + { + "epoch": 0.1298647013973033, + "grad_norm": 11.617189621281785, + "learning_rate": 4.9219847661180205e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.3896551787853241, + "step": 128935 + }, + { + "epoch": 0.12986973745040747, + "grad_norm": 7.843181029924144, + "learning_rate": 4.921974984094403e-05, + "loss": 2.0132, + "mean_token_accuracy": 0.46896551847457885, + "step": 128940 + }, + { + "epoch": 0.12987477350351165, + "grad_norm": 9.467589054048226, + "learning_rate": 4.9219652014683785e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.4103448331356049, + "step": 128945 + }, + { + "epoch": 0.12987980955661582, + "grad_norm": 9.934286509569239, + "learning_rate": 4.9219554182399526e-05, + "loss": 2.7643, + "mean_token_accuracy": 0.4044162094593048, + "step": 128950 + }, + { + "epoch": 0.12988484560972, + "grad_norm": 12.713532099531973, + "learning_rate": 4.9219456344091256e-05, + "loss": 2.6595, + "mean_token_accuracy": 0.35172413289546967, + "step": 128955 + }, + { + "epoch": 0.12988988166282417, + "grad_norm": 9.237348765735163, + "learning_rate": 4.921935849975902e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.39310345649719236, + "step": 128960 + }, + { + "epoch": 0.12989491771592834, + "grad_norm": 10.1279886098419, + "learning_rate": 4.921926064940283e-05, + "loss": 2.0792, + "mean_token_accuracy": 0.47931033968925474, + "step": 128965 + }, + { + "epoch": 0.12989995376903252, + "grad_norm": 10.802393616726349, + "learning_rate": 4.921916279302273e-05, + "loss": 2.7915, + "mean_token_accuracy": 0.3862069010734558, + "step": 128970 + }, + { + "epoch": 0.12990498982213666, + "grad_norm": 11.344827411072202, + "learning_rate": 4.9219064930618734e-05, + "loss": 1.9055, + "mean_token_accuracy": 0.4804187178611755, + "step": 128975 + }, + { + "epoch": 0.12991002587524084, + "grad_norm": 10.613188935967989, + "learning_rate": 4.921896706219088e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.4034482777118683, + "step": 128980 + }, + { + "epoch": 0.129915061928345, + "grad_norm": 15.929356254626578, + "learning_rate": 4.921886918773919e-05, + "loss": 2.7692, + "mean_token_accuracy": 0.3931034505367279, + "step": 128985 + }, + { + "epoch": 0.12992009798144918, + "grad_norm": 11.052685889955178, + "learning_rate": 4.921877130726369e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.4448275864124298, + "step": 128990 + }, + { + "epoch": 0.12992513403455336, + "grad_norm": 9.754025641991657, + "learning_rate": 4.9218673420764406e-05, + "loss": 2.495, + "mean_token_accuracy": 0.4103448212146759, + "step": 128995 + }, + { + "epoch": 0.12993017008765753, + "grad_norm": 9.099272999375952, + "learning_rate": 4.9218575528241366e-05, + "loss": 2.4259, + "mean_token_accuracy": 0.3913490653038025, + "step": 129000 + }, + { + "epoch": 0.1299352061407617, + "grad_norm": 11.197552428919481, + "learning_rate": 4.92184776296946e-05, + "loss": 2.3226, + "mean_token_accuracy": 0.4551724076271057, + "step": 129005 + }, + { + "epoch": 0.12994024219386588, + "grad_norm": 10.507743180889685, + "learning_rate": 4.921837972512413e-05, + "loss": 2.2834, + "mean_token_accuracy": 0.41724138259887694, + "step": 129010 + }, + { + "epoch": 0.12994527824697005, + "grad_norm": 9.535505812762104, + "learning_rate": 4.921828181452999e-05, + "loss": 2.5572, + "mean_token_accuracy": 0.4620689630508423, + "step": 129015 + }, + { + "epoch": 0.12995031430007423, + "grad_norm": 12.289841951176568, + "learning_rate": 4.92181838979122e-05, + "loss": 2.629, + "mean_token_accuracy": 0.39310344457626345, + "step": 129020 + }, + { + "epoch": 0.1299553503531784, + "grad_norm": 9.17240638367847, + "learning_rate": 4.92180859752708e-05, + "loss": 2.196, + "mean_token_accuracy": 0.42068966031074523, + "step": 129025 + }, + { + "epoch": 0.12996038640628257, + "grad_norm": 8.65741313496406, + "learning_rate": 4.9217988046605795e-05, + "loss": 2.2303, + "mean_token_accuracy": 0.4551724135875702, + "step": 129030 + }, + { + "epoch": 0.12996542245938675, + "grad_norm": 8.179782684164124, + "learning_rate": 4.921789011191723e-05, + "loss": 2.0353, + "mean_token_accuracy": 0.5034482717514038, + "step": 129035 + }, + { + "epoch": 0.12997045851249092, + "grad_norm": 9.164897471708702, + "learning_rate": 4.921779217120513e-05, + "loss": 2.8426, + "mean_token_accuracy": 0.36896551847457887, + "step": 129040 + }, + { + "epoch": 0.1299754945655951, + "grad_norm": 10.418627358706344, + "learning_rate": 4.9217694224469516e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.43103447556495667, + "step": 129045 + }, + { + "epoch": 0.12998053061869927, + "grad_norm": 11.44265481778297, + "learning_rate": 4.921759627171042e-05, + "loss": 2.1793, + "mean_token_accuracy": 0.49999999403953554, + "step": 129050 + }, + { + "epoch": 0.12998556667180344, + "grad_norm": 9.587542874032676, + "learning_rate": 4.921749831292787e-05, + "loss": 2.2143, + "mean_token_accuracy": 0.493103438615799, + "step": 129055 + }, + { + "epoch": 0.12999060272490762, + "grad_norm": 10.181623158865825, + "learning_rate": 4.921740034812189e-05, + "loss": 2.4807, + "mean_token_accuracy": 0.36896551847457887, + "step": 129060 + }, + { + "epoch": 0.1299956387780118, + "grad_norm": 14.319432190892446, + "learning_rate": 4.92173023772925e-05, + "loss": 2.6834, + "mean_token_accuracy": 0.39310343861579894, + "step": 129065 + }, + { + "epoch": 0.13000067483111596, + "grad_norm": 9.642575956353483, + "learning_rate": 4.921720440043974e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.47931034564971925, + "step": 129070 + }, + { + "epoch": 0.13000571088422014, + "grad_norm": 11.05650215824258, + "learning_rate": 4.921710641756364e-05, + "loss": 2.0048, + "mean_token_accuracy": 0.4931034564971924, + "step": 129075 + }, + { + "epoch": 0.1300107469373243, + "grad_norm": 11.343850295285575, + "learning_rate": 4.9217008428664206e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.4344827592372894, + "step": 129080 + }, + { + "epoch": 0.13001578299042849, + "grad_norm": 13.143462895810625, + "learning_rate": 4.921691043374149e-05, + "loss": 2.685, + "mean_token_accuracy": 0.4034482777118683, + "step": 129085 + }, + { + "epoch": 0.13002081904353266, + "grad_norm": 9.681457221954199, + "learning_rate": 4.92168124327955e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.42413792610168455, + "step": 129090 + }, + { + "epoch": 0.13002585509663683, + "grad_norm": 12.861691871039671, + "learning_rate": 4.921671442582628e-05, + "loss": 2.6104, + "mean_token_accuracy": 0.3896551728248596, + "step": 129095 + }, + { + "epoch": 0.130030891149741, + "grad_norm": 9.534868523340778, + "learning_rate": 4.921661641283384e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.41034483909606934, + "step": 129100 + }, + { + "epoch": 0.13003592720284518, + "grad_norm": 13.538707796803724, + "learning_rate": 4.9216518393818226e-05, + "loss": 2.5987, + "mean_token_accuracy": 0.43103447556495667, + "step": 129105 + }, + { + "epoch": 0.13004096325594935, + "grad_norm": 8.648761549755044, + "learning_rate": 4.9216420368779436e-05, + "loss": 2.171, + "mean_token_accuracy": 0.48275862336158754, + "step": 129110 + }, + { + "epoch": 0.1300459993090535, + "grad_norm": 11.340521407737254, + "learning_rate": 4.921632233771753e-05, + "loss": 2.1855, + "mean_token_accuracy": 0.47241379618644713, + "step": 129115 + }, + { + "epoch": 0.13005103536215767, + "grad_norm": 8.28950862814899, + "learning_rate": 4.9216224300632515e-05, + "loss": 2.0552, + "mean_token_accuracy": 0.4655172526836395, + "step": 129120 + }, + { + "epoch": 0.13005607141526185, + "grad_norm": 10.91781274078189, + "learning_rate": 4.921612625752443e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.4034482717514038, + "step": 129125 + }, + { + "epoch": 0.13006110746836602, + "grad_norm": 8.519807181961898, + "learning_rate": 4.9216028208393294e-05, + "loss": 2.5394, + "mean_token_accuracy": 0.4068965554237366, + "step": 129130 + }, + { + "epoch": 0.1300661435214702, + "grad_norm": 10.888882690432926, + "learning_rate": 4.9215930153239136e-05, + "loss": 2.4095, + "mean_token_accuracy": 0.4344827592372894, + "step": 129135 + }, + { + "epoch": 0.13007117957457437, + "grad_norm": 10.000760322978257, + "learning_rate": 4.921583209206198e-05, + "loss": 2.3519, + "mean_token_accuracy": 0.4620689570903778, + "step": 129140 + }, + { + "epoch": 0.13007621562767854, + "grad_norm": 11.844827946635355, + "learning_rate": 4.921573402486187e-05, + "loss": 2.3979, + "mean_token_accuracy": 0.4103448331356049, + "step": 129145 + }, + { + "epoch": 0.13008125168078272, + "grad_norm": 10.55951584015606, + "learning_rate": 4.9215635951638805e-05, + "loss": 2.6713, + "mean_token_accuracy": 0.4000000059604645, + "step": 129150 + }, + { + "epoch": 0.1300862877338869, + "grad_norm": 10.040452606788156, + "learning_rate": 4.9215537872392834e-05, + "loss": 2.3228, + "mean_token_accuracy": 0.4517241358757019, + "step": 129155 + }, + { + "epoch": 0.13009132378699106, + "grad_norm": 9.823654898282637, + "learning_rate": 4.921543978712398e-05, + "loss": 2.213, + "mean_token_accuracy": 0.4931034505367279, + "step": 129160 + }, + { + "epoch": 0.13009635984009524, + "grad_norm": 11.925671672230084, + "learning_rate": 4.921534169583226e-05, + "loss": 2.3796, + "mean_token_accuracy": 0.44827585816383364, + "step": 129165 + }, + { + "epoch": 0.1301013958931994, + "grad_norm": 10.751464219240567, + "learning_rate": 4.9215243598517716e-05, + "loss": 2.1919, + "mean_token_accuracy": 0.42068964838981626, + "step": 129170 + }, + { + "epoch": 0.13010643194630359, + "grad_norm": 10.406064063478205, + "learning_rate": 4.9215145495180366e-05, + "loss": 2.2861, + "mean_token_accuracy": 0.4310344815254211, + "step": 129175 + }, + { + "epoch": 0.13011146799940776, + "grad_norm": 9.81013586051914, + "learning_rate": 4.921504738582024e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.39655172228813174, + "step": 129180 + }, + { + "epoch": 0.13011650405251193, + "grad_norm": 10.003018902046872, + "learning_rate": 4.921494927043737e-05, + "loss": 2.2703, + "mean_token_accuracy": 0.482758617401123, + "step": 129185 + }, + { + "epoch": 0.1301215401056161, + "grad_norm": 10.69242808223807, + "learning_rate": 4.921485114903176e-05, + "loss": 2.7214, + "mean_token_accuracy": 0.3655172437429428, + "step": 129190 + }, + { + "epoch": 0.13012657615872028, + "grad_norm": 9.312606287242122, + "learning_rate": 4.9214753021603474e-05, + "loss": 2.3152, + "mean_token_accuracy": 0.3911675691604614, + "step": 129195 + }, + { + "epoch": 0.13013161221182445, + "grad_norm": 9.997138202707541, + "learning_rate": 4.92146548881525e-05, + "loss": 2.1418, + "mean_token_accuracy": 0.47586206793785096, + "step": 129200 + }, + { + "epoch": 0.13013664826492863, + "grad_norm": 8.934825362531758, + "learning_rate": 4.921455674867891e-05, + "loss": 2.7794, + "mean_token_accuracy": 0.3965517163276672, + "step": 129205 + }, + { + "epoch": 0.1301416843180328, + "grad_norm": 12.049984574977294, + "learning_rate": 4.921445860318269e-05, + "loss": 2.7606, + "mean_token_accuracy": 0.42758620977401735, + "step": 129210 + }, + { + "epoch": 0.13014672037113698, + "grad_norm": 6.646942027711644, + "learning_rate": 4.9214360451663885e-05, + "loss": 2.0259, + "mean_token_accuracy": 0.4862069010734558, + "step": 129215 + }, + { + "epoch": 0.13015175642424115, + "grad_norm": 13.214860662286243, + "learning_rate": 4.921426229412253e-05, + "loss": 2.6543, + "mean_token_accuracy": 0.358620685338974, + "step": 129220 + }, + { + "epoch": 0.13015679247734532, + "grad_norm": 10.64313424656918, + "learning_rate": 4.921416413055864e-05, + "loss": 2.6381, + "mean_token_accuracy": 0.4068965554237366, + "step": 129225 + }, + { + "epoch": 0.1301618285304495, + "grad_norm": 10.069920428691292, + "learning_rate": 4.9214065960972246e-05, + "loss": 2.5657, + "mean_token_accuracy": 0.42413793206214906, + "step": 129230 + }, + { + "epoch": 0.13016686458355367, + "grad_norm": 11.161183774699586, + "learning_rate": 4.921396778536337e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.4413793087005615, + "step": 129235 + }, + { + "epoch": 0.13017190063665784, + "grad_norm": 11.080384450565381, + "learning_rate": 4.9213869603732046e-05, + "loss": 2.5605, + "mean_token_accuracy": 0.4034482717514038, + "step": 129240 + }, + { + "epoch": 0.13017693668976202, + "grad_norm": 11.70509229179508, + "learning_rate": 4.92137714160783e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.4586206912994385, + "step": 129245 + }, + { + "epoch": 0.1301819727428662, + "grad_norm": 12.776876263891452, + "learning_rate": 4.921367322240216e-05, + "loss": 2.758, + "mean_token_accuracy": 0.37241379022598264, + "step": 129250 + }, + { + "epoch": 0.13018700879597034, + "grad_norm": 9.298334534560004, + "learning_rate": 4.9213575022703645e-05, + "loss": 2.4465, + "mean_token_accuracy": 0.45862069725990295, + "step": 129255 + }, + { + "epoch": 0.1301920448490745, + "grad_norm": 11.343326549216577, + "learning_rate": 4.92134768169828e-05, + "loss": 2.3055, + "mean_token_accuracy": 0.4551724135875702, + "step": 129260 + }, + { + "epoch": 0.13019708090217869, + "grad_norm": 10.341818039911217, + "learning_rate": 4.921337860523963e-05, + "loss": 2.2726, + "mean_token_accuracy": 0.3896551787853241, + "step": 129265 + }, + { + "epoch": 0.13020211695528286, + "grad_norm": 9.315649677002215, + "learning_rate": 4.921328038747418e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.4482758641242981, + "step": 129270 + }, + { + "epoch": 0.13020715300838703, + "grad_norm": 10.38477189795735, + "learning_rate": 4.921318216368646e-05, + "loss": 2.6443, + "mean_token_accuracy": 0.37586206793785093, + "step": 129275 + }, + { + "epoch": 0.1302121890614912, + "grad_norm": 10.051838724946624, + "learning_rate": 4.921308393387653e-05, + "loss": 2.4293, + "mean_token_accuracy": 0.4241379201412201, + "step": 129280 + }, + { + "epoch": 0.13021722511459538, + "grad_norm": 9.464562921840644, + "learning_rate": 4.921298569804438e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.42413792610168455, + "step": 129285 + }, + { + "epoch": 0.13022226116769955, + "grad_norm": 12.47707481414817, + "learning_rate": 4.9212887456190053e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.41379310488700866, + "step": 129290 + }, + { + "epoch": 0.13022729722080373, + "grad_norm": 8.405989613412654, + "learning_rate": 4.921278920831358e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.41379310488700866, + "step": 129295 + }, + { + "epoch": 0.1302323332739079, + "grad_norm": 9.65346990567041, + "learning_rate": 4.921269095441498e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.4365396171808243, + "step": 129300 + }, + { + "epoch": 0.13023736932701208, + "grad_norm": 9.624865426897893, + "learning_rate": 4.921259269449429e-05, + "loss": 2.4789, + "mean_token_accuracy": 0.41857229471206664, + "step": 129305 + }, + { + "epoch": 0.13024240538011625, + "grad_norm": 8.909263640046639, + "learning_rate": 4.921249442855152e-05, + "loss": 2.6475, + "mean_token_accuracy": 0.42758620977401735, + "step": 129310 + }, + { + "epoch": 0.13024744143322042, + "grad_norm": 18.5774035363204, + "learning_rate": 4.921239615658672e-05, + "loss": 2.7813, + "mean_token_accuracy": 0.4551724076271057, + "step": 129315 + }, + { + "epoch": 0.1302524774863246, + "grad_norm": 11.055270515815687, + "learning_rate": 4.92122978785999e-05, + "loss": 2.5187, + "mean_token_accuracy": 0.47931033968925474, + "step": 129320 + }, + { + "epoch": 0.13025751353942877, + "grad_norm": 9.245289731657552, + "learning_rate": 4.9212199594591095e-05, + "loss": 2.2389, + "mean_token_accuracy": 0.43103447556495667, + "step": 129325 + }, + { + "epoch": 0.13026254959253294, + "grad_norm": 12.670729568976498, + "learning_rate": 4.921210130456033e-05, + "loss": 2.9651, + "mean_token_accuracy": 0.34827586114406583, + "step": 129330 + }, + { + "epoch": 0.13026758564563712, + "grad_norm": 9.6831610186234, + "learning_rate": 4.921200300850764e-05, + "loss": 2.4141, + "mean_token_accuracy": 0.42413793206214906, + "step": 129335 + }, + { + "epoch": 0.1302726216987413, + "grad_norm": 11.13431813272363, + "learning_rate": 4.921190470643303e-05, + "loss": 2.2295, + "mean_token_accuracy": 0.4482758641242981, + "step": 129340 + }, + { + "epoch": 0.13027765775184547, + "grad_norm": 9.657109050767806, + "learning_rate": 4.9211806398336554e-05, + "loss": 2.0867, + "mean_token_accuracy": 0.482758629322052, + "step": 129345 + }, + { + "epoch": 0.13028269380494964, + "grad_norm": 9.460780956134226, + "learning_rate": 4.921170808421822e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.4257713258266449, + "step": 129350 + }, + { + "epoch": 0.1302877298580538, + "grad_norm": 10.926793093908415, + "learning_rate": 4.921160976407807e-05, + "loss": 2.5411, + "mean_token_accuracy": 0.41034483909606934, + "step": 129355 + }, + { + "epoch": 0.130292765911158, + "grad_norm": 10.43138692876934, + "learning_rate": 4.921151143791612e-05, + "loss": 2.1316, + "mean_token_accuracy": 0.4379310369491577, + "step": 129360 + }, + { + "epoch": 0.13029780196426216, + "grad_norm": 10.042449375688033, + "learning_rate": 4.92114131057324e-05, + "loss": 2.1914, + "mean_token_accuracy": 0.4689655125141144, + "step": 129365 + }, + { + "epoch": 0.13030283801736633, + "grad_norm": 10.90227374136877, + "learning_rate": 4.9211314767526944e-05, + "loss": 2.0421, + "mean_token_accuracy": 0.5054446518421173, + "step": 129370 + }, + { + "epoch": 0.1303078740704705, + "grad_norm": 10.4911757872581, + "learning_rate": 4.921121642329977e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.4068965494632721, + "step": 129375 + }, + { + "epoch": 0.13031291012357468, + "grad_norm": 10.41582666642685, + "learning_rate": 4.92111180730509e-05, + "loss": 2.075, + "mean_token_accuracy": 0.510344821214676, + "step": 129380 + }, + { + "epoch": 0.13031794617667886, + "grad_norm": 16.958841846099634, + "learning_rate": 4.921101971678039e-05, + "loss": 2.8366, + "mean_token_accuracy": 0.4068965494632721, + "step": 129385 + }, + { + "epoch": 0.13032298222978303, + "grad_norm": 10.44170471139477, + "learning_rate": 4.921092135448824e-05, + "loss": 2.4146, + "mean_token_accuracy": 0.4034482777118683, + "step": 129390 + }, + { + "epoch": 0.13032801828288718, + "grad_norm": 9.98467170556745, + "learning_rate": 4.9210822986174474e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.43793103098869324, + "step": 129395 + }, + { + "epoch": 0.13033305433599135, + "grad_norm": 8.795114003306088, + "learning_rate": 4.9210724611839144e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.46206897497177124, + "step": 129400 + }, + { + "epoch": 0.13033809038909552, + "grad_norm": 12.788349839012314, + "learning_rate": 4.921062623148226e-05, + "loss": 2.145, + "mean_token_accuracy": 0.49655172824859617, + "step": 129405 + }, + { + "epoch": 0.1303431264421997, + "grad_norm": 11.071766114295322, + "learning_rate": 4.9210527845103846e-05, + "loss": 2.484, + "mean_token_accuracy": 0.4034482717514038, + "step": 129410 + }, + { + "epoch": 0.13034816249530387, + "grad_norm": 9.672154944824332, + "learning_rate": 4.9210429452703947e-05, + "loss": 2.6607, + "mean_token_accuracy": 0.40689654350280763, + "step": 129415 + }, + { + "epoch": 0.13035319854840804, + "grad_norm": 10.651940626290392, + "learning_rate": 4.921033105428257e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.44137930274009707, + "step": 129420 + }, + { + "epoch": 0.13035823460151222, + "grad_norm": 9.630757351472075, + "learning_rate": 4.921023264983975e-05, + "loss": 2.4826, + "mean_token_accuracy": 0.41379311084747317, + "step": 129425 + }, + { + "epoch": 0.1303632706546164, + "grad_norm": 14.629111292803264, + "learning_rate": 4.921013423937553e-05, + "loss": 2.8851, + "mean_token_accuracy": 0.3862068891525269, + "step": 129430 + }, + { + "epoch": 0.13036830670772057, + "grad_norm": 11.588360137664067, + "learning_rate": 4.921003582288991e-05, + "loss": 2.6196, + "mean_token_accuracy": 0.3827586233615875, + "step": 129435 + }, + { + "epoch": 0.13037334276082474, + "grad_norm": 7.840971141067827, + "learning_rate": 4.920993740038293e-05, + "loss": 2.1574, + "mean_token_accuracy": 0.5344827592372894, + "step": 129440 + }, + { + "epoch": 0.1303783788139289, + "grad_norm": 11.674542533545212, + "learning_rate": 4.9209838971854624e-05, + "loss": 2.1892, + "mean_token_accuracy": 0.47586206793785096, + "step": 129445 + }, + { + "epoch": 0.1303834148670331, + "grad_norm": 9.30807409605611, + "learning_rate": 4.9209740537305017e-05, + "loss": 2.1998, + "mean_token_accuracy": 0.46551724672317507, + "step": 129450 + }, + { + "epoch": 0.13038845092013726, + "grad_norm": 12.180083801125372, + "learning_rate": 4.920964209673413e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.4, + "step": 129455 + }, + { + "epoch": 0.13039348697324143, + "grad_norm": 12.088921490412227, + "learning_rate": 4.920954365014199e-05, + "loss": 2.5126, + "mean_token_accuracy": 0.4344827592372894, + "step": 129460 + }, + { + "epoch": 0.1303985230263456, + "grad_norm": 9.06604965626044, + "learning_rate": 4.920944519752863e-05, + "loss": 2.6583, + "mean_token_accuracy": 0.3896551728248596, + "step": 129465 + }, + { + "epoch": 0.13040355907944978, + "grad_norm": 10.64590763756691, + "learning_rate": 4.920934673889406e-05, + "loss": 2.6717, + "mean_token_accuracy": 0.4034482777118683, + "step": 129470 + }, + { + "epoch": 0.13040859513255396, + "grad_norm": 10.184118682889142, + "learning_rate": 4.920924827423834e-05, + "loss": 2.3753, + "mean_token_accuracy": 0.44482759237289426, + "step": 129475 + }, + { + "epoch": 0.13041363118565813, + "grad_norm": 10.17231426013366, + "learning_rate": 4.920914980356147e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.4103448212146759, + "step": 129480 + }, + { + "epoch": 0.1304186672387623, + "grad_norm": 12.910015412580528, + "learning_rate": 4.92090513268635e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.4068965554237366, + "step": 129485 + }, + { + "epoch": 0.13042370329186648, + "grad_norm": 12.850317633386826, + "learning_rate": 4.920895284414442e-05, + "loss": 2.3701, + "mean_token_accuracy": 0.4034482777118683, + "step": 129490 + }, + { + "epoch": 0.13042873934497065, + "grad_norm": 8.44974869565826, + "learning_rate": 4.92088543554043e-05, + "loss": 2.1333, + "mean_token_accuracy": 0.47586206197738645, + "step": 129495 + }, + { + "epoch": 0.13043377539807482, + "grad_norm": 9.698224933233899, + "learning_rate": 4.920875586064314e-05, + "loss": 2.5498, + "mean_token_accuracy": 0.42413793206214906, + "step": 129500 + }, + { + "epoch": 0.130438811451179, + "grad_norm": 11.745868023296493, + "learning_rate": 4.9208657359860976e-05, + "loss": 2.1325, + "mean_token_accuracy": 0.4862069010734558, + "step": 129505 + }, + { + "epoch": 0.13044384750428317, + "grad_norm": 18.42893924893373, + "learning_rate": 4.920855885305784e-05, + "loss": 2.6479, + "mean_token_accuracy": 0.42068964838981626, + "step": 129510 + }, + { + "epoch": 0.13044888355738735, + "grad_norm": 8.951450479716442, + "learning_rate": 4.920846034023375e-05, + "loss": 2.6923, + "mean_token_accuracy": 0.4172413766384125, + "step": 129515 + }, + { + "epoch": 0.13045391961049152, + "grad_norm": 10.470889349126548, + "learning_rate": 4.9208361821388735e-05, + "loss": 2.088, + "mean_token_accuracy": 0.5125831842422486, + "step": 129520 + }, + { + "epoch": 0.1304589556635957, + "grad_norm": 11.878940122235473, + "learning_rate": 4.920826329652283e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.4454930365085602, + "step": 129525 + }, + { + "epoch": 0.13046399171669987, + "grad_norm": 12.421954377669323, + "learning_rate": 4.920816476563606e-05, + "loss": 2.9375, + "mean_token_accuracy": 0.37586206793785093, + "step": 129530 + }, + { + "epoch": 0.130469027769804, + "grad_norm": 17.224319106689666, + "learning_rate": 4.920806622872844e-05, + "loss": 2.3543, + "mean_token_accuracy": 0.47382464408874514, + "step": 129535 + }, + { + "epoch": 0.1304740638229082, + "grad_norm": 10.859901545378836, + "learning_rate": 4.920796768580001e-05, + "loss": 2.5708, + "mean_token_accuracy": 0.4103448212146759, + "step": 129540 + }, + { + "epoch": 0.13047909987601236, + "grad_norm": 11.065226287882751, + "learning_rate": 4.920786913685079e-05, + "loss": 2.5147, + "mean_token_accuracy": 0.37241379022598264, + "step": 129545 + }, + { + "epoch": 0.13048413592911653, + "grad_norm": 15.267922582902608, + "learning_rate": 4.920777058188082e-05, + "loss": 2.4677, + "mean_token_accuracy": 0.4137930989265442, + "step": 129550 + }, + { + "epoch": 0.1304891719822207, + "grad_norm": 9.361456865913672, + "learning_rate": 4.920767202089012e-05, + "loss": 2.44, + "mean_token_accuracy": 0.4310344815254211, + "step": 129555 + }, + { + "epoch": 0.13049420803532488, + "grad_norm": 11.411162104666378, + "learning_rate": 4.920757345387871e-05, + "loss": 2.3057, + "mean_token_accuracy": 0.4517241418361664, + "step": 129560 + }, + { + "epoch": 0.13049924408842906, + "grad_norm": 13.428229385735968, + "learning_rate": 4.9207474880846626e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.41724138259887694, + "step": 129565 + }, + { + "epoch": 0.13050428014153323, + "grad_norm": 11.694987075875211, + "learning_rate": 4.9207376301793896e-05, + "loss": 1.9535, + "mean_token_accuracy": 0.5190886616706848, + "step": 129570 + }, + { + "epoch": 0.1305093161946374, + "grad_norm": 10.633994817693539, + "learning_rate": 4.920727771672054e-05, + "loss": 2.1742, + "mean_token_accuracy": 0.43448275327682495, + "step": 129575 + }, + { + "epoch": 0.13051435224774158, + "grad_norm": 19.100883530220017, + "learning_rate": 4.920717912562658e-05, + "loss": 2.3931, + "mean_token_accuracy": 0.4950393199920654, + "step": 129580 + }, + { + "epoch": 0.13051938830084575, + "grad_norm": 9.3374759327297, + "learning_rate": 4.920708052851207e-05, + "loss": 2.0964, + "mean_token_accuracy": 0.4689655065536499, + "step": 129585 + }, + { + "epoch": 0.13052442435394992, + "grad_norm": 10.15109235166133, + "learning_rate": 4.9206981925377016e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.4482758641242981, + "step": 129590 + }, + { + "epoch": 0.1305294604070541, + "grad_norm": 16.952555359938135, + "learning_rate": 4.920688331622144e-05, + "loss": 2.6903, + "mean_token_accuracy": 0.4172413766384125, + "step": 129595 + }, + { + "epoch": 0.13053449646015827, + "grad_norm": 11.76023223390273, + "learning_rate": 4.920678470104539e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.4517241358757019, + "step": 129600 + }, + { + "epoch": 0.13053953251326245, + "grad_norm": 9.660428406496449, + "learning_rate": 4.920668607984888e-05, + "loss": 2.3671, + "mean_token_accuracy": 0.43944343328475954, + "step": 129605 + }, + { + "epoch": 0.13054456856636662, + "grad_norm": 12.941687181262658, + "learning_rate": 4.920658745263194e-05, + "loss": 2.436, + "mean_token_accuracy": 0.39812461733818055, + "step": 129610 + }, + { + "epoch": 0.1305496046194708, + "grad_norm": 12.624723468066707, + "learning_rate": 4.9206488819394585e-05, + "loss": 2.516, + "mean_token_accuracy": 0.4034482717514038, + "step": 129615 + }, + { + "epoch": 0.13055464067257497, + "grad_norm": 10.474768786916417, + "learning_rate": 4.920639018013687e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.4551724076271057, + "step": 129620 + }, + { + "epoch": 0.13055967672567914, + "grad_norm": 14.267385471219535, + "learning_rate": 4.92062915348588e-05, + "loss": 2.2539, + "mean_token_accuracy": 0.48965516686439514, + "step": 129625 + }, + { + "epoch": 0.13056471277878332, + "grad_norm": 10.69116743341844, + "learning_rate": 4.920619288356041e-05, + "loss": 2.4402, + "mean_token_accuracy": 0.4344827592372894, + "step": 129630 + }, + { + "epoch": 0.1305697488318875, + "grad_norm": 11.170199085196069, + "learning_rate": 4.920609422624173e-05, + "loss": 2.321, + "mean_token_accuracy": 0.46551724076271056, + "step": 129635 + }, + { + "epoch": 0.13057478488499166, + "grad_norm": 9.977782510464031, + "learning_rate": 4.920599556290278e-05, + "loss": 2.0961, + "mean_token_accuracy": 0.49854809045791626, + "step": 129640 + }, + { + "epoch": 0.13057982093809584, + "grad_norm": 12.156757647579397, + "learning_rate": 4.920589689354359e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.4724137902259827, + "step": 129645 + }, + { + "epoch": 0.1305848569912, + "grad_norm": 8.98511433843367, + "learning_rate": 4.920579821816418e-05, + "loss": 2.0213, + "mean_token_accuracy": 0.4931034445762634, + "step": 129650 + }, + { + "epoch": 0.13058989304430418, + "grad_norm": 14.459676711731003, + "learning_rate": 4.92056995367646e-05, + "loss": 2.6627, + "mean_token_accuracy": 0.3965517282485962, + "step": 129655 + }, + { + "epoch": 0.13059492909740836, + "grad_norm": 9.037490198170193, + "learning_rate": 4.920560084934486e-05, + "loss": 2.7547, + "mean_token_accuracy": 0.3793103516101837, + "step": 129660 + }, + { + "epoch": 0.13059996515051253, + "grad_norm": 7.480979262369426, + "learning_rate": 4.920550215590499e-05, + "loss": 2.4778, + "mean_token_accuracy": 0.46031458377838136, + "step": 129665 + }, + { + "epoch": 0.1306050012036167, + "grad_norm": 12.20311132509539, + "learning_rate": 4.920540345644502e-05, + "loss": 2.4602, + "mean_token_accuracy": 0.4103448331356049, + "step": 129670 + }, + { + "epoch": 0.13061003725672085, + "grad_norm": 9.041003906303553, + "learning_rate": 4.9205304750964977e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.3931034505367279, + "step": 129675 + }, + { + "epoch": 0.13061507330982502, + "grad_norm": 13.422866558312972, + "learning_rate": 4.920520603946488e-05, + "loss": 2.3834, + "mean_token_accuracy": 0.4053236603736877, + "step": 129680 + }, + { + "epoch": 0.1306201093629292, + "grad_norm": 16.872525776779753, + "learning_rate": 4.920510732194477e-05, + "loss": 2.6042, + "mean_token_accuracy": 0.4, + "step": 129685 + }, + { + "epoch": 0.13062514541603337, + "grad_norm": 11.430652407173223, + "learning_rate": 4.9205008598404656e-05, + "loss": 2.5181, + "mean_token_accuracy": 0.482758617401123, + "step": 129690 + }, + { + "epoch": 0.13063018146913755, + "grad_norm": 10.1883215431587, + "learning_rate": 4.920490986884459e-05, + "loss": 1.9856, + "mean_token_accuracy": 0.4973667740821838, + "step": 129695 + }, + { + "epoch": 0.13063521752224172, + "grad_norm": 9.697056299902146, + "learning_rate": 4.920481113326458e-05, + "loss": 2.2099, + "mean_token_accuracy": 0.4482758641242981, + "step": 129700 + }, + { + "epoch": 0.1306402535753459, + "grad_norm": 12.189435485388746, + "learning_rate": 4.920471239166466e-05, + "loss": 2.4748, + "mean_token_accuracy": 0.4620689630508423, + "step": 129705 + }, + { + "epoch": 0.13064528962845007, + "grad_norm": 14.042526472262566, + "learning_rate": 4.920461364404486e-05, + "loss": 2.8522, + "mean_token_accuracy": 0.38620689511299133, + "step": 129710 + }, + { + "epoch": 0.13065032568155424, + "grad_norm": 9.390108999891527, + "learning_rate": 4.920451489040519e-05, + "loss": 2.4159, + "mean_token_accuracy": 0.38965516686439516, + "step": 129715 + }, + { + "epoch": 0.13065536173465842, + "grad_norm": 10.82204494570798, + "learning_rate": 4.920441613074571e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.4068965554237366, + "step": 129720 + }, + { + "epoch": 0.1306603977877626, + "grad_norm": 10.495526618004677, + "learning_rate": 4.920431736506643e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.38275861740112305, + "step": 129725 + }, + { + "epoch": 0.13066543384086676, + "grad_norm": 18.14833919057225, + "learning_rate": 4.9204218593367364e-05, + "loss": 3.1358, + "mean_token_accuracy": 0.3827586233615875, + "step": 129730 + }, + { + "epoch": 0.13067046989397094, + "grad_norm": 13.721473082093036, + "learning_rate": 4.920411981564856e-05, + "loss": 2.6488, + "mean_token_accuracy": 0.3551724076271057, + "step": 129735 + }, + { + "epoch": 0.1306755059470751, + "grad_norm": 8.712896202315477, + "learning_rate": 4.920402103191003e-05, + "loss": 2.2153, + "mean_token_accuracy": 0.4620689570903778, + "step": 129740 + }, + { + "epoch": 0.13068054200017928, + "grad_norm": 11.133789461992814, + "learning_rate": 4.920392224215181e-05, + "loss": 2.083, + "mean_token_accuracy": 0.47931034564971925, + "step": 129745 + }, + { + "epoch": 0.13068557805328346, + "grad_norm": 9.878926789994928, + "learning_rate": 4.9203823446373937e-05, + "loss": 2.0918, + "mean_token_accuracy": 0.4931034505367279, + "step": 129750 + }, + { + "epoch": 0.13069061410638763, + "grad_norm": 11.12451114204712, + "learning_rate": 4.920372464457642e-05, + "loss": 2.3229, + "mean_token_accuracy": 0.4724137902259827, + "step": 129755 + }, + { + "epoch": 0.1306956501594918, + "grad_norm": 9.630222230347455, + "learning_rate": 4.920362583675929e-05, + "loss": 2.7839, + "mean_token_accuracy": 0.37586206793785093, + "step": 129760 + }, + { + "epoch": 0.13070068621259598, + "grad_norm": 9.794447815635563, + "learning_rate": 4.9203527022922587e-05, + "loss": 2.1764, + "mean_token_accuracy": 0.4517241299152374, + "step": 129765 + }, + { + "epoch": 0.13070572226570015, + "grad_norm": 9.887450821671035, + "learning_rate": 4.9203428203066324e-05, + "loss": 2.2153, + "mean_token_accuracy": 0.4586206912994385, + "step": 129770 + }, + { + "epoch": 0.13071075831880433, + "grad_norm": 9.964614323649075, + "learning_rate": 4.920332937719054e-05, + "loss": 2.4447, + "mean_token_accuracy": 0.44482758045196535, + "step": 129775 + }, + { + "epoch": 0.1307157943719085, + "grad_norm": 16.12490333612492, + "learning_rate": 4.920323054529525e-05, + "loss": 2.0431, + "mean_token_accuracy": 0.4954023063182831, + "step": 129780 + }, + { + "epoch": 0.13072083042501267, + "grad_norm": 11.820050219581136, + "learning_rate": 4.920313170738049e-05, + "loss": 2.7879, + "mean_token_accuracy": 0.4551724135875702, + "step": 129785 + }, + { + "epoch": 0.13072586647811685, + "grad_norm": 10.997605829381802, + "learning_rate": 4.920303286344628e-05, + "loss": 2.2698, + "mean_token_accuracy": 0.4034482777118683, + "step": 129790 + }, + { + "epoch": 0.13073090253122102, + "grad_norm": 10.624101015425934, + "learning_rate": 4.920293401349266e-05, + "loss": 2.3461, + "mean_token_accuracy": 0.4068965494632721, + "step": 129795 + }, + { + "epoch": 0.1307359385843252, + "grad_norm": 11.01763601336403, + "learning_rate": 4.920283515751965e-05, + "loss": 2.3293, + "mean_token_accuracy": 0.42758620977401735, + "step": 129800 + }, + { + "epoch": 0.13074097463742937, + "grad_norm": 16.38627217214197, + "learning_rate": 4.920273629552728e-05, + "loss": 2.1937, + "mean_token_accuracy": 0.46382335424423216, + "step": 129805 + }, + { + "epoch": 0.13074601069053354, + "grad_norm": 9.930145538067137, + "learning_rate": 4.920263742751557e-05, + "loss": 2.1672, + "mean_token_accuracy": 0.48275862336158754, + "step": 129810 + }, + { + "epoch": 0.1307510467436377, + "grad_norm": 11.596086022056923, + "learning_rate": 4.920253855348456e-05, + "loss": 2.28, + "mean_token_accuracy": 0.4482758641242981, + "step": 129815 + }, + { + "epoch": 0.13075608279674186, + "grad_norm": 12.876303231028354, + "learning_rate": 4.920243967343426e-05, + "loss": 2.6879, + "mean_token_accuracy": 0.43448275327682495, + "step": 129820 + }, + { + "epoch": 0.13076111884984604, + "grad_norm": 9.588606886560406, + "learning_rate": 4.920234078736471e-05, + "loss": 2.44, + "mean_token_accuracy": 0.4034482777118683, + "step": 129825 + }, + { + "epoch": 0.1307661549029502, + "grad_norm": 12.695248496820787, + "learning_rate": 4.920224189527594e-05, + "loss": 2.6157, + "mean_token_accuracy": 0.37241379618644715, + "step": 129830 + }, + { + "epoch": 0.13077119095605438, + "grad_norm": 13.50772015646374, + "learning_rate": 4.920214299716797e-05, + "loss": 2.6908, + "mean_token_accuracy": 0.35517241060733795, + "step": 129835 + }, + { + "epoch": 0.13077622700915856, + "grad_norm": 12.512866530151031, + "learning_rate": 4.920204409304083e-05, + "loss": 2.396, + "mean_token_accuracy": 0.43793103098869324, + "step": 129840 + }, + { + "epoch": 0.13078126306226273, + "grad_norm": 9.551203519074571, + "learning_rate": 4.920194518289454e-05, + "loss": 2.4663, + "mean_token_accuracy": 0.4310344815254211, + "step": 129845 + }, + { + "epoch": 0.1307862991153669, + "grad_norm": 20.901340638137732, + "learning_rate": 4.920184626672914e-05, + "loss": 2.7596, + "mean_token_accuracy": 0.41724138259887694, + "step": 129850 + }, + { + "epoch": 0.13079133516847108, + "grad_norm": 15.215228871466978, + "learning_rate": 4.920174734454465e-05, + "loss": 2.6496, + "mean_token_accuracy": 0.4068965554237366, + "step": 129855 + }, + { + "epoch": 0.13079637122157525, + "grad_norm": 11.02966613076287, + "learning_rate": 4.9201648416341106e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.39310343861579894, + "step": 129860 + }, + { + "epoch": 0.13080140727467943, + "grad_norm": 8.354521930561766, + "learning_rate": 4.920154948211852e-05, + "loss": 1.989, + "mean_token_accuracy": 0.4931034445762634, + "step": 129865 + }, + { + "epoch": 0.1308064433277836, + "grad_norm": 11.08835571405055, + "learning_rate": 4.9201450541876944e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.4103448331356049, + "step": 129870 + }, + { + "epoch": 0.13081147938088777, + "grad_norm": 12.010531214067022, + "learning_rate": 4.9201351595616376e-05, + "loss": 2.269, + "mean_token_accuracy": 0.44827587008476255, + "step": 129875 + }, + { + "epoch": 0.13081651543399195, + "grad_norm": 15.212630984228232, + "learning_rate": 4.920125264333685e-05, + "loss": 2.5735, + "mean_token_accuracy": 0.4137931078672409, + "step": 129880 + }, + { + "epoch": 0.13082155148709612, + "grad_norm": 11.780594254963692, + "learning_rate": 4.920115368503842e-05, + "loss": 2.4857, + "mean_token_accuracy": 0.42758620381355283, + "step": 129885 + }, + { + "epoch": 0.1308265875402003, + "grad_norm": 12.696101235434234, + "learning_rate": 4.9201054720721076e-05, + "loss": 2.2825, + "mean_token_accuracy": 0.48275862336158754, + "step": 129890 + }, + { + "epoch": 0.13083162359330447, + "grad_norm": 13.458449655521985, + "learning_rate": 4.920095575038488e-05, + "loss": 2.2646, + "mean_token_accuracy": 0.48275862336158754, + "step": 129895 + }, + { + "epoch": 0.13083665964640864, + "grad_norm": 10.629783900166613, + "learning_rate": 4.920085677402983e-05, + "loss": 2.2409, + "mean_token_accuracy": 0.47586206197738645, + "step": 129900 + }, + { + "epoch": 0.13084169569951282, + "grad_norm": 7.100971966958274, + "learning_rate": 4.9200757791655973e-05, + "loss": 1.7576, + "mean_token_accuracy": 0.5264367878437042, + "step": 129905 + }, + { + "epoch": 0.130846731752617, + "grad_norm": 10.72771873207638, + "learning_rate": 4.920065880326333e-05, + "loss": 2.6802, + "mean_token_accuracy": 0.42068964838981626, + "step": 129910 + }, + { + "epoch": 0.13085176780572116, + "grad_norm": 14.13485711279724, + "learning_rate": 4.920055980885192e-05, + "loss": 2.2795, + "mean_token_accuracy": 0.45517241954803467, + "step": 129915 + }, + { + "epoch": 0.13085680385882534, + "grad_norm": 10.808975715900987, + "learning_rate": 4.920046080842179e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.4068965494632721, + "step": 129920 + }, + { + "epoch": 0.1308618399119295, + "grad_norm": 9.810444991375585, + "learning_rate": 4.9200361801972953e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.43103447556495667, + "step": 129925 + }, + { + "epoch": 0.13086687596503369, + "grad_norm": 10.527248837170186, + "learning_rate": 4.920026278950544e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.4620689690113068, + "step": 129930 + }, + { + "epoch": 0.13087191201813786, + "grad_norm": 10.518438006204596, + "learning_rate": 4.920016377101928e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.4705989181995392, + "step": 129935 + }, + { + "epoch": 0.13087694807124203, + "grad_norm": 9.189124120088655, + "learning_rate": 4.920006474651448e-05, + "loss": 2.549, + "mean_token_accuracy": 0.38965516686439516, + "step": 129940 + }, + { + "epoch": 0.1308819841243462, + "grad_norm": 11.018723028212438, + "learning_rate": 4.919996571599111e-05, + "loss": 2.0742, + "mean_token_accuracy": 0.4413793087005615, + "step": 129945 + }, + { + "epoch": 0.13088702017745038, + "grad_norm": 11.193656102346607, + "learning_rate": 4.919986667944917e-05, + "loss": 2.9236, + "mean_token_accuracy": 0.3896551728248596, + "step": 129950 + }, + { + "epoch": 0.13089205623055453, + "grad_norm": 9.719023521199578, + "learning_rate": 4.919976763688868e-05, + "loss": 2.1043, + "mean_token_accuracy": 0.41379310488700866, + "step": 129955 + }, + { + "epoch": 0.1308970922836587, + "grad_norm": 13.20713095654652, + "learning_rate": 4.919966858830969e-05, + "loss": 2.4281, + "mean_token_accuracy": 0.458620685338974, + "step": 129960 + }, + { + "epoch": 0.13090212833676287, + "grad_norm": 10.419874370579727, + "learning_rate": 4.9199569533712205e-05, + "loss": 2.7523, + "mean_token_accuracy": 0.3827586144208908, + "step": 129965 + }, + { + "epoch": 0.13090716438986705, + "grad_norm": 12.196057513270569, + "learning_rate": 4.919947047309627e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.3655172407627106, + "step": 129970 + }, + { + "epoch": 0.13091220044297122, + "grad_norm": 9.877991174780144, + "learning_rate": 4.919937140646191e-05, + "loss": 2.1484, + "mean_token_accuracy": 0.42758620381355283, + "step": 129975 + }, + { + "epoch": 0.1309172364960754, + "grad_norm": 8.838912440528503, + "learning_rate": 4.919927233380913e-05, + "loss": 2.3966, + "mean_token_accuracy": 0.4344827592372894, + "step": 129980 + }, + { + "epoch": 0.13092227254917957, + "grad_norm": 9.436291529241307, + "learning_rate": 4.9199173255138e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.49546279907226565, + "step": 129985 + }, + { + "epoch": 0.13092730860228374, + "grad_norm": 11.91555615954252, + "learning_rate": 4.9199074170448505e-05, + "loss": 2.4139, + "mean_token_accuracy": 0.4018753796815872, + "step": 129990 + }, + { + "epoch": 0.13093234465538792, + "grad_norm": 9.73781943289101, + "learning_rate": 4.91989750797407e-05, + "loss": 2.8005, + "mean_token_accuracy": 0.417241370677948, + "step": 129995 + }, + { + "epoch": 0.1309373807084921, + "grad_norm": 10.608270433801268, + "learning_rate": 4.91988759830146e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.5195402264595032, + "step": 130000 + }, + { + "epoch": 0.13094241676159626, + "grad_norm": 10.102706112430658, + "learning_rate": 4.919877688027024e-05, + "loss": 2.2082, + "mean_token_accuracy": 0.4379310369491577, + "step": 130005 + }, + { + "epoch": 0.13094745281470044, + "grad_norm": 10.551981193913814, + "learning_rate": 4.919867777150764e-05, + "loss": 2.3321, + "mean_token_accuracy": 0.4413793087005615, + "step": 130010 + }, + { + "epoch": 0.1309524888678046, + "grad_norm": 11.997734580941797, + "learning_rate": 4.919857865672683e-05, + "loss": 2.4767, + "mean_token_accuracy": 0.3931034505367279, + "step": 130015 + }, + { + "epoch": 0.13095752492090879, + "grad_norm": 10.408937049720626, + "learning_rate": 4.919847953592784e-05, + "loss": 2.5107, + "mean_token_accuracy": 0.4172413766384125, + "step": 130020 + }, + { + "epoch": 0.13096256097401296, + "grad_norm": 14.602571703561534, + "learning_rate": 4.91983804091107e-05, + "loss": 2.2086, + "mean_token_accuracy": 0.43793103098869324, + "step": 130025 + }, + { + "epoch": 0.13096759702711713, + "grad_norm": 15.360068161103989, + "learning_rate": 4.919828127627543e-05, + "loss": 2.2283, + "mean_token_accuracy": 0.47931034564971925, + "step": 130030 + }, + { + "epoch": 0.1309726330802213, + "grad_norm": 16.08048085303682, + "learning_rate": 4.919818213742205e-05, + "loss": 2.4826, + "mean_token_accuracy": 0.42758620977401735, + "step": 130035 + }, + { + "epoch": 0.13097766913332548, + "grad_norm": 9.753755326232202, + "learning_rate": 4.91980829925506e-05, + "loss": 2.3319, + "mean_token_accuracy": 0.42413793206214906, + "step": 130040 + }, + { + "epoch": 0.13098270518642965, + "grad_norm": 10.062689479623907, + "learning_rate": 4.9197983841661124e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.4, + "step": 130045 + }, + { + "epoch": 0.13098774123953383, + "grad_norm": 9.066139539157533, + "learning_rate": 4.919788468475362e-05, + "loss": 2.39, + "mean_token_accuracy": 0.46206897497177124, + "step": 130050 + }, + { + "epoch": 0.130992777292638, + "grad_norm": 8.655193501532251, + "learning_rate": 4.919778552182813e-05, + "loss": 2.606, + "mean_token_accuracy": 0.45172414779663084, + "step": 130055 + }, + { + "epoch": 0.13099781334574218, + "grad_norm": 9.63673942337131, + "learning_rate": 4.919768635288467e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.39147005677223207, + "step": 130060 + }, + { + "epoch": 0.13100284939884635, + "grad_norm": 11.109108598482244, + "learning_rate": 4.9197587177923285e-05, + "loss": 3.0937, + "mean_token_accuracy": 0.35172413289546967, + "step": 130065 + }, + { + "epoch": 0.13100788545195052, + "grad_norm": 14.704660697402826, + "learning_rate": 4.9197487996943985e-05, + "loss": 2.161, + "mean_token_accuracy": 0.49999999403953554, + "step": 130070 + }, + { + "epoch": 0.1310129215050547, + "grad_norm": 7.641346943298125, + "learning_rate": 4.919738880994681e-05, + "loss": 2.129, + "mean_token_accuracy": 0.44827585816383364, + "step": 130075 + }, + { + "epoch": 0.13101795755815887, + "grad_norm": 9.729897206417885, + "learning_rate": 4.9197289616931786e-05, + "loss": 2.3801, + "mean_token_accuracy": 0.4620689630508423, + "step": 130080 + }, + { + "epoch": 0.13102299361126304, + "grad_norm": 11.985453882281405, + "learning_rate": 4.9197190417898934e-05, + "loss": 2.8645, + "mean_token_accuracy": 0.38620689511299133, + "step": 130085 + }, + { + "epoch": 0.13102802966436722, + "grad_norm": 10.392842210404202, + "learning_rate": 4.9197091212848286e-05, + "loss": 2.0525, + "mean_token_accuracy": 0.45517241954803467, + "step": 130090 + }, + { + "epoch": 0.13103306571747136, + "grad_norm": 11.203783979656587, + "learning_rate": 4.9196992001779875e-05, + "loss": 2.5856, + "mean_token_accuracy": 0.4068965554237366, + "step": 130095 + }, + { + "epoch": 0.13103810177057554, + "grad_norm": 9.506109214061613, + "learning_rate": 4.9196892784693716e-05, + "loss": 2.6634, + "mean_token_accuracy": 0.3965517282485962, + "step": 130100 + }, + { + "epoch": 0.1310431378236797, + "grad_norm": 9.016042048450963, + "learning_rate": 4.919679356158984e-05, + "loss": 2.1301, + "mean_token_accuracy": 0.4310344815254211, + "step": 130105 + }, + { + "epoch": 0.13104817387678389, + "grad_norm": 12.493404828163342, + "learning_rate": 4.919669433246828e-05, + "loss": 2.7075, + "mean_token_accuracy": 0.4103448331356049, + "step": 130110 + }, + { + "epoch": 0.13105320992988806, + "grad_norm": 11.324293391799031, + "learning_rate": 4.919659509732906e-05, + "loss": 2.3695, + "mean_token_accuracy": 0.4448275864124298, + "step": 130115 + }, + { + "epoch": 0.13105824598299223, + "grad_norm": 12.893193860867884, + "learning_rate": 4.919649585617221e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.39655172228813174, + "step": 130120 + }, + { + "epoch": 0.1310632820360964, + "grad_norm": 11.579970701681532, + "learning_rate": 4.9196396608997754e-05, + "loss": 2.5701, + "mean_token_accuracy": 0.4395644307136536, + "step": 130125 + }, + { + "epoch": 0.13106831808920058, + "grad_norm": 9.39088508223922, + "learning_rate": 4.919629735580573e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.42758620381355283, + "step": 130130 + }, + { + "epoch": 0.13107335414230475, + "grad_norm": 10.649754037464291, + "learning_rate": 4.919619809659615e-05, + "loss": 2.5375, + "mean_token_accuracy": 0.4034482717514038, + "step": 130135 + }, + { + "epoch": 0.13107839019540893, + "grad_norm": 11.553762091100667, + "learning_rate": 4.919609883136905e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.4034482717514038, + "step": 130140 + }, + { + "epoch": 0.1310834262485131, + "grad_norm": 13.648557332774484, + "learning_rate": 4.9195999560124455e-05, + "loss": 2.5649, + "mean_token_accuracy": 0.458620685338974, + "step": 130145 + }, + { + "epoch": 0.13108846230161728, + "grad_norm": 11.224593578664763, + "learning_rate": 4.91959002828624e-05, + "loss": 2.3424, + "mean_token_accuracy": 0.44827585816383364, + "step": 130150 + }, + { + "epoch": 0.13109349835472145, + "grad_norm": 9.896415497485727, + "learning_rate": 4.919580099958289e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.38965516686439516, + "step": 130155 + }, + { + "epoch": 0.13109853440782562, + "grad_norm": 9.984960096864834, + "learning_rate": 4.919570171028598e-05, + "loss": 2.4295, + "mean_token_accuracy": 0.46551724672317507, + "step": 130160 + }, + { + "epoch": 0.1311035704609298, + "grad_norm": 9.852320387332838, + "learning_rate": 4.919560241497168e-05, + "loss": 2.6723, + "mean_token_accuracy": 0.41209921836853025, + "step": 130165 + }, + { + "epoch": 0.13110860651403397, + "grad_norm": 10.516934470168565, + "learning_rate": 4.9195503113640036e-05, + "loss": 2.3592, + "mean_token_accuracy": 0.4551724076271057, + "step": 130170 + }, + { + "epoch": 0.13111364256713814, + "grad_norm": 10.079109944616036, + "learning_rate": 4.919540380629105e-05, + "loss": 2.3045, + "mean_token_accuracy": 0.4482758641242981, + "step": 130175 + }, + { + "epoch": 0.13111867862024232, + "grad_norm": 11.082892332421725, + "learning_rate": 4.919530449292477e-05, + "loss": 2.4649, + "mean_token_accuracy": 0.3793103456497192, + "step": 130180 + }, + { + "epoch": 0.1311237146733465, + "grad_norm": 11.958995615805087, + "learning_rate": 4.919520517354121e-05, + "loss": 2.6175, + "mean_token_accuracy": 0.4, + "step": 130185 + }, + { + "epoch": 0.13112875072645067, + "grad_norm": 9.698069856827663, + "learning_rate": 4.919510584814042e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.42413793206214906, + "step": 130190 + }, + { + "epoch": 0.13113378677955484, + "grad_norm": 11.555025109812906, + "learning_rate": 4.9195006516722396e-05, + "loss": 2.5379, + "mean_token_accuracy": 0.42413793206214906, + "step": 130195 + }, + { + "epoch": 0.131138822832659, + "grad_norm": 10.856936120673042, + "learning_rate": 4.919490717928718e-05, + "loss": 2.7043, + "mean_token_accuracy": 0.4344827473163605, + "step": 130200 + }, + { + "epoch": 0.1311438588857632, + "grad_norm": 9.153556648880983, + "learning_rate": 4.919480783583481e-05, + "loss": 2.9457, + "mean_token_accuracy": 0.44343616962432864, + "step": 130205 + }, + { + "epoch": 0.13114889493886736, + "grad_norm": 10.072519646850195, + "learning_rate": 4.9194708486365295e-05, + "loss": 2.2067, + "mean_token_accuracy": 0.458620685338974, + "step": 130210 + }, + { + "epoch": 0.13115393099197153, + "grad_norm": 13.775635477351994, + "learning_rate": 4.9194609130878675e-05, + "loss": 3.1039, + "mean_token_accuracy": 0.3379310369491577, + "step": 130215 + }, + { + "epoch": 0.1311589670450757, + "grad_norm": 10.124892147619125, + "learning_rate": 4.919450976937498e-05, + "loss": 2.3458, + "mean_token_accuracy": 0.3862069010734558, + "step": 130220 + }, + { + "epoch": 0.13116400309817988, + "grad_norm": 8.653004527765933, + "learning_rate": 4.919441040185423e-05, + "loss": 2.0558, + "mean_token_accuracy": 0.5, + "step": 130225 + }, + { + "epoch": 0.13116903915128406, + "grad_norm": 10.051151348138543, + "learning_rate": 4.919431102831645e-05, + "loss": 2.3128, + "mean_token_accuracy": 0.42583181858062746, + "step": 130230 + }, + { + "epoch": 0.1311740752043882, + "grad_norm": 9.678135859494779, + "learning_rate": 4.919421164876167e-05, + "loss": 2.0366, + "mean_token_accuracy": 0.5034482717514038, + "step": 130235 + }, + { + "epoch": 0.13117911125749238, + "grad_norm": 13.76658112564712, + "learning_rate": 4.919411226318993e-05, + "loss": 2.9472, + "mean_token_accuracy": 0.3620689630508423, + "step": 130240 + }, + { + "epoch": 0.13118414731059655, + "grad_norm": 9.82820005691906, + "learning_rate": 4.9194012871601236e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.42413793206214906, + "step": 130245 + }, + { + "epoch": 0.13118918336370072, + "grad_norm": 10.867473851137854, + "learning_rate": 4.919391347399563e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.4310344815254211, + "step": 130250 + }, + { + "epoch": 0.1311942194168049, + "grad_norm": 8.560602894269996, + "learning_rate": 4.9193814070373145e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.4034482777118683, + "step": 130255 + }, + { + "epoch": 0.13119925546990907, + "grad_norm": 13.70526708950858, + "learning_rate": 4.919371466073378e-05, + "loss": 2.6405, + "mean_token_accuracy": 0.4, + "step": 130260 + }, + { + "epoch": 0.13120429152301324, + "grad_norm": 11.987382850126089, + "learning_rate": 4.91936152450776e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.37586206793785093, + "step": 130265 + }, + { + "epoch": 0.13120932757611742, + "grad_norm": 12.218585208855114, + "learning_rate": 4.9193515823404614e-05, + "loss": 2.7804, + "mean_token_accuracy": 0.42068964838981626, + "step": 130270 + }, + { + "epoch": 0.1312143636292216, + "grad_norm": 10.83111027273831, + "learning_rate": 4.919341639571484e-05, + "loss": 2.3029, + "mean_token_accuracy": 0.4206896543502808, + "step": 130275 + }, + { + "epoch": 0.13121939968232577, + "grad_norm": 10.80114190478455, + "learning_rate": 4.9193316962008325e-05, + "loss": 2.7782, + "mean_token_accuracy": 0.3965517282485962, + "step": 130280 + }, + { + "epoch": 0.13122443573542994, + "grad_norm": 11.532907977641091, + "learning_rate": 4.919321752228508e-05, + "loss": 2.3239, + "mean_token_accuracy": 0.43103448748588563, + "step": 130285 + }, + { + "epoch": 0.1312294717885341, + "grad_norm": 11.346798519587363, + "learning_rate": 4.919311807654514e-05, + "loss": 2.461, + "mean_token_accuracy": 0.41379311084747317, + "step": 130290 + }, + { + "epoch": 0.1312345078416383, + "grad_norm": 14.882312863508448, + "learning_rate": 4.919301862478855e-05, + "loss": 2.7366, + "mean_token_accuracy": 0.4122202038764954, + "step": 130295 + }, + { + "epoch": 0.13123954389474246, + "grad_norm": 11.56274694643577, + "learning_rate": 4.91929191670153e-05, + "loss": 2.457, + "mean_token_accuracy": 0.4000000059604645, + "step": 130300 + }, + { + "epoch": 0.13124457994784663, + "grad_norm": 9.154972583480959, + "learning_rate": 4.919281970322545e-05, + "loss": 2.1285, + "mean_token_accuracy": 0.4448275864124298, + "step": 130305 + }, + { + "epoch": 0.1312496160009508, + "grad_norm": 12.123011151756938, + "learning_rate": 4.919272023341901e-05, + "loss": 2.6897, + "mean_token_accuracy": 0.4551724135875702, + "step": 130310 + }, + { + "epoch": 0.13125465205405498, + "grad_norm": 14.504716702752743, + "learning_rate": 4.919262075759601e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.4172413766384125, + "step": 130315 + }, + { + "epoch": 0.13125968810715916, + "grad_norm": 11.232959252033986, + "learning_rate": 4.9192521275756487e-05, + "loss": 2.4201, + "mean_token_accuracy": 0.3931034505367279, + "step": 130320 + }, + { + "epoch": 0.13126472416026333, + "grad_norm": 14.172836088682345, + "learning_rate": 4.9192421787900463e-05, + "loss": 2.932, + "mean_token_accuracy": 0.3655172407627106, + "step": 130325 + }, + { + "epoch": 0.1312697602133675, + "grad_norm": 11.092964364126177, + "learning_rate": 4.919232229402796e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.4517241358757019, + "step": 130330 + }, + { + "epoch": 0.13127479626647168, + "grad_norm": 9.144583078035716, + "learning_rate": 4.919222279413901e-05, + "loss": 2.3342, + "mean_token_accuracy": 0.4413793087005615, + "step": 130335 + }, + { + "epoch": 0.13127983231957585, + "grad_norm": 11.247149482203813, + "learning_rate": 4.919212328823365e-05, + "loss": 2.5313, + "mean_token_accuracy": 0.4206896543502808, + "step": 130340 + }, + { + "epoch": 0.13128486837268002, + "grad_norm": 22.56215493686229, + "learning_rate": 4.9192023776311885e-05, + "loss": 2.6521, + "mean_token_accuracy": 0.44827585816383364, + "step": 130345 + }, + { + "epoch": 0.1312899044257842, + "grad_norm": 11.4417838074006, + "learning_rate": 4.9191924258373765e-05, + "loss": 2.2865, + "mean_token_accuracy": 0.4586206912994385, + "step": 130350 + }, + { + "epoch": 0.13129494047888837, + "grad_norm": 10.750042411820706, + "learning_rate": 4.9191824734419305e-05, + "loss": 2.936, + "mean_token_accuracy": 0.38977832198143003, + "step": 130355 + }, + { + "epoch": 0.13129997653199255, + "grad_norm": 8.960459162968913, + "learning_rate": 4.919172520444854e-05, + "loss": 2.3058, + "mean_token_accuracy": 0.4568663060665131, + "step": 130360 + }, + { + "epoch": 0.13130501258509672, + "grad_norm": 10.906366735402356, + "learning_rate": 4.9191625668461495e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.42577131986618044, + "step": 130365 + }, + { + "epoch": 0.1313100486382009, + "grad_norm": 10.94864501521933, + "learning_rate": 4.919152612645819e-05, + "loss": 2.2908, + "mean_token_accuracy": 0.4310344815254211, + "step": 130370 + }, + { + "epoch": 0.13131508469130504, + "grad_norm": 13.608887585731715, + "learning_rate": 4.919142657843867e-05, + "loss": 2.7443, + "mean_token_accuracy": 0.35862069129943847, + "step": 130375 + }, + { + "epoch": 0.1313201207444092, + "grad_norm": 13.200972101879383, + "learning_rate": 4.919132702440294e-05, + "loss": 2.3799, + "mean_token_accuracy": 0.43448275327682495, + "step": 130380 + }, + { + "epoch": 0.1313251567975134, + "grad_norm": 10.049640929350957, + "learning_rate": 4.919122746435105e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.4344827592372894, + "step": 130385 + }, + { + "epoch": 0.13133019285061756, + "grad_norm": 11.199253034482947, + "learning_rate": 4.919112789828301e-05, + "loss": 2.418, + "mean_token_accuracy": 0.38620689511299133, + "step": 130390 + }, + { + "epoch": 0.13133522890372173, + "grad_norm": 10.338739335677511, + "learning_rate": 4.919102832619886e-05, + "loss": 2.2746, + "mean_token_accuracy": 0.5088929176330567, + "step": 130395 + }, + { + "epoch": 0.1313402649568259, + "grad_norm": 10.99225191177493, + "learning_rate": 4.919092874809862e-05, + "loss": 2.355, + "mean_token_accuracy": 0.46358135938644407, + "step": 130400 + }, + { + "epoch": 0.13134530100993008, + "grad_norm": 8.993524183841702, + "learning_rate": 4.9190829163982315e-05, + "loss": 2.3243, + "mean_token_accuracy": 0.42232305407524107, + "step": 130405 + }, + { + "epoch": 0.13135033706303426, + "grad_norm": 16.099817940591038, + "learning_rate": 4.9190729573849984e-05, + "loss": 2.7001, + "mean_token_accuracy": 0.3931034505367279, + "step": 130410 + }, + { + "epoch": 0.13135537311613843, + "grad_norm": 12.36970136626534, + "learning_rate": 4.9190629977701646e-05, + "loss": 2.4489, + "mean_token_accuracy": 0.40895341634750365, + "step": 130415 + }, + { + "epoch": 0.1313604091692426, + "grad_norm": 12.385607328677422, + "learning_rate": 4.919053037553733e-05, + "loss": 2.2225, + "mean_token_accuracy": 0.47447065711021424, + "step": 130420 + }, + { + "epoch": 0.13136544522234678, + "grad_norm": 12.861950985129017, + "learning_rate": 4.919043076735706e-05, + "loss": 2.3408, + "mean_token_accuracy": 0.4379310369491577, + "step": 130425 + }, + { + "epoch": 0.13137048127545095, + "grad_norm": 11.269090545533686, + "learning_rate": 4.9190331153160884e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.4107142806053162, + "step": 130430 + }, + { + "epoch": 0.13137551732855512, + "grad_norm": 11.35043243277724, + "learning_rate": 4.919023153294881e-05, + "loss": 2.3926, + "mean_token_accuracy": 0.4586206912994385, + "step": 130435 + }, + { + "epoch": 0.1313805533816593, + "grad_norm": 10.005446994077566, + "learning_rate": 4.919013190672086e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4551724135875702, + "step": 130440 + }, + { + "epoch": 0.13138558943476347, + "grad_norm": 11.845036976035948, + "learning_rate": 4.919003227447707e-05, + "loss": 2.247, + "mean_token_accuracy": 0.4551724135875702, + "step": 130445 + }, + { + "epoch": 0.13139062548786765, + "grad_norm": 9.652152332640988, + "learning_rate": 4.918993263621747e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.420689657330513, + "step": 130450 + }, + { + "epoch": 0.13139566154097182, + "grad_norm": 17.78756715625396, + "learning_rate": 4.9189832991942086e-05, + "loss": 2.4968, + "mean_token_accuracy": 0.42068966031074523, + "step": 130455 + }, + { + "epoch": 0.131400697594076, + "grad_norm": 11.840060615482374, + "learning_rate": 4.9189733341650954e-05, + "loss": 2.2451, + "mean_token_accuracy": 0.4689655125141144, + "step": 130460 + }, + { + "epoch": 0.13140573364718017, + "grad_norm": 11.038203201361574, + "learning_rate": 4.9189633685344086e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.41379310488700866, + "step": 130465 + }, + { + "epoch": 0.13141076970028434, + "grad_norm": 11.539909051254734, + "learning_rate": 4.9189534023021516e-05, + "loss": 2.507, + "mean_token_accuracy": 0.4034482777118683, + "step": 130470 + }, + { + "epoch": 0.13141580575338851, + "grad_norm": 10.396043621501276, + "learning_rate": 4.918943435468328e-05, + "loss": 2.5884, + "mean_token_accuracy": 0.3896551728248596, + "step": 130475 + }, + { + "epoch": 0.1314208418064927, + "grad_norm": 8.713508039669044, + "learning_rate": 4.9189334680329396e-05, + "loss": 2.0885, + "mean_token_accuracy": 0.5068965435028077, + "step": 130480 + }, + { + "epoch": 0.13142587785959686, + "grad_norm": 10.718424930014715, + "learning_rate": 4.91892349999599e-05, + "loss": 2.3036, + "mean_token_accuracy": 0.43793103098869324, + "step": 130485 + }, + { + "epoch": 0.13143091391270104, + "grad_norm": 10.499580053717974, + "learning_rate": 4.9189135313574804e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.3931034505367279, + "step": 130490 + }, + { + "epoch": 0.1314359499658052, + "grad_norm": 11.643215205789456, + "learning_rate": 4.918903562117415e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.4241379380226135, + "step": 130495 + }, + { + "epoch": 0.13144098601890938, + "grad_norm": 11.751402074726798, + "learning_rate": 4.918893592275795e-05, + "loss": 2.4656, + "mean_token_accuracy": 0.4068965494632721, + "step": 130500 + }, + { + "epoch": 0.13144602207201356, + "grad_norm": 10.315020227886011, + "learning_rate": 4.9188836218326257e-05, + "loss": 2.5402, + "mean_token_accuracy": 0.4000000059604645, + "step": 130505 + }, + { + "epoch": 0.13145105812511773, + "grad_norm": 8.686696753774982, + "learning_rate": 4.918873650787908e-05, + "loss": 1.9837, + "mean_token_accuracy": 0.46073804795742035, + "step": 130510 + }, + { + "epoch": 0.13145609417822188, + "grad_norm": 10.651359576102672, + "learning_rate": 4.918863679141645e-05, + "loss": 2.2277, + "mean_token_accuracy": 0.47241379618644713, + "step": 130515 + }, + { + "epoch": 0.13146113023132605, + "grad_norm": 9.191772021659512, + "learning_rate": 4.91885370689384e-05, + "loss": 2.1276, + "mean_token_accuracy": 0.45662432312965395, + "step": 130520 + }, + { + "epoch": 0.13146616628443022, + "grad_norm": 9.553214576997286, + "learning_rate": 4.918843734044495e-05, + "loss": 2.5742, + "mean_token_accuracy": 0.417241370677948, + "step": 130525 + }, + { + "epoch": 0.1314712023375344, + "grad_norm": 12.024252790485859, + "learning_rate": 4.9188337605936124e-05, + "loss": 2.6667, + "mean_token_accuracy": 0.391349059343338, + "step": 130530 + }, + { + "epoch": 0.13147623839063857, + "grad_norm": 9.581341588144074, + "learning_rate": 4.918823786541197e-05, + "loss": 2.6407, + "mean_token_accuracy": 0.3965517282485962, + "step": 130535 + }, + { + "epoch": 0.13148127444374275, + "grad_norm": 10.291594235321346, + "learning_rate": 4.918813811887249e-05, + "loss": 2.3599, + "mean_token_accuracy": 0.4172413766384125, + "step": 130540 + }, + { + "epoch": 0.13148631049684692, + "grad_norm": 8.77402733319208, + "learning_rate": 4.918803836631773e-05, + "loss": 2.2768, + "mean_token_accuracy": 0.45735026597976686, + "step": 130545 + }, + { + "epoch": 0.1314913465499511, + "grad_norm": 11.694059760426102, + "learning_rate": 4.9187938607747706e-05, + "loss": 2.471, + "mean_token_accuracy": 0.4534180223941803, + "step": 130550 + }, + { + "epoch": 0.13149638260305527, + "grad_norm": 9.272513535748267, + "learning_rate": 4.918783884316246e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.4620689690113068, + "step": 130555 + }, + { + "epoch": 0.13150141865615944, + "grad_norm": 12.489392368658415, + "learning_rate": 4.9187739072562e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.43103448748588563, + "step": 130560 + }, + { + "epoch": 0.13150645470926361, + "grad_norm": 10.428182844686333, + "learning_rate": 4.9187639295946376e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.4448275864124298, + "step": 130565 + }, + { + "epoch": 0.1315114907623678, + "grad_norm": 14.591326633318399, + "learning_rate": 4.91875395133156e-05, + "loss": 2.4757, + "mean_token_accuracy": 0.40550513863563536, + "step": 130570 + }, + { + "epoch": 0.13151652681547196, + "grad_norm": 10.509598139622824, + "learning_rate": 4.918743972466971e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.44137930274009707, + "step": 130575 + }, + { + "epoch": 0.13152156286857614, + "grad_norm": 10.638161909813379, + "learning_rate": 4.918733993000872e-05, + "loss": 2.4599, + "mean_token_accuracy": 0.42413793206214906, + "step": 130580 + }, + { + "epoch": 0.1315265989216803, + "grad_norm": 11.120027321832138, + "learning_rate": 4.9187240129332666e-05, + "loss": 2.1883, + "mean_token_accuracy": 0.4626134276390076, + "step": 130585 + }, + { + "epoch": 0.13153163497478448, + "grad_norm": 12.451789154309804, + "learning_rate": 4.9187140322641577e-05, + "loss": 2.568, + "mean_token_accuracy": 0.43448275327682495, + "step": 130590 + }, + { + "epoch": 0.13153667102788866, + "grad_norm": 10.354979661515424, + "learning_rate": 4.9187040509935476e-05, + "loss": 2.5616, + "mean_token_accuracy": 0.42758620977401735, + "step": 130595 + }, + { + "epoch": 0.13154170708099283, + "grad_norm": 9.640153981685936, + "learning_rate": 4.91869406912144e-05, + "loss": 2.6799, + "mean_token_accuracy": 0.3999999940395355, + "step": 130600 + }, + { + "epoch": 0.131546743134097, + "grad_norm": 8.968165629848603, + "learning_rate": 4.918684086647836e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.4137930989265442, + "step": 130605 + }, + { + "epoch": 0.13155177918720118, + "grad_norm": 11.945135896989738, + "learning_rate": 4.918674103572741e-05, + "loss": 2.0696, + "mean_token_accuracy": 0.4344827592372894, + "step": 130610 + }, + { + "epoch": 0.13155681524030535, + "grad_norm": 10.251090533862238, + "learning_rate": 4.918664119896154e-05, + "loss": 2.384, + "mean_token_accuracy": 0.42758620977401735, + "step": 130615 + }, + { + "epoch": 0.13156185129340953, + "grad_norm": 10.679151359396819, + "learning_rate": 4.9186541356180817e-05, + "loss": 2.2952, + "mean_token_accuracy": 0.4068965494632721, + "step": 130620 + }, + { + "epoch": 0.1315668873465137, + "grad_norm": 10.267037817347953, + "learning_rate": 4.918644150738524e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.42068966031074523, + "step": 130625 + }, + { + "epoch": 0.13157192339961787, + "grad_norm": 11.9522428317448, + "learning_rate": 4.918634165257485e-05, + "loss": 2.0382, + "mean_token_accuracy": 0.46896551847457885, + "step": 130630 + }, + { + "epoch": 0.13157695945272205, + "grad_norm": 10.786578199414956, + "learning_rate": 4.9186241791749674e-05, + "loss": 2.4337, + "mean_token_accuracy": 0.4068965554237366, + "step": 130635 + }, + { + "epoch": 0.13158199550582622, + "grad_norm": 9.12774338031861, + "learning_rate": 4.918614192490974e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.43103447556495667, + "step": 130640 + }, + { + "epoch": 0.1315870315589304, + "grad_norm": 10.972530098819206, + "learning_rate": 4.9186042052055075e-05, + "loss": 2.3279, + "mean_token_accuracy": 0.41724138259887694, + "step": 130645 + }, + { + "epoch": 0.13159206761203457, + "grad_norm": 11.419850076009672, + "learning_rate": 4.91859421731857e-05, + "loss": 2.2267, + "mean_token_accuracy": 0.4517241418361664, + "step": 130650 + }, + { + "epoch": 0.13159710366513871, + "grad_norm": 9.51567772329566, + "learning_rate": 4.9185842288301644e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.46551724672317507, + "step": 130655 + }, + { + "epoch": 0.1316021397182429, + "grad_norm": 9.419136121885083, + "learning_rate": 4.918574239740294e-05, + "loss": 2.2573, + "mean_token_accuracy": 0.41034482717514037, + "step": 130660 + }, + { + "epoch": 0.13160717577134706, + "grad_norm": 10.858073589711925, + "learning_rate": 4.9185642500489624e-05, + "loss": 2.4129, + "mean_token_accuracy": 0.4, + "step": 130665 + }, + { + "epoch": 0.13161221182445124, + "grad_norm": 10.099936893414954, + "learning_rate": 4.9185542597561714e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.38620689511299133, + "step": 130670 + }, + { + "epoch": 0.1316172478775554, + "grad_norm": 8.753919941448983, + "learning_rate": 4.918544268861923e-05, + "loss": 2.4327, + "mean_token_accuracy": 0.4586206912994385, + "step": 130675 + }, + { + "epoch": 0.13162228393065958, + "grad_norm": 10.99113104929302, + "learning_rate": 4.9185342773662214e-05, + "loss": 2.262, + "mean_token_accuracy": 0.4448275864124298, + "step": 130680 + }, + { + "epoch": 0.13162731998376376, + "grad_norm": 9.908806389853094, + "learning_rate": 4.918524285269068e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.4689655125141144, + "step": 130685 + }, + { + "epoch": 0.13163235603686793, + "grad_norm": 12.39326867247831, + "learning_rate": 4.918514292570466e-05, + "loss": 2.7155, + "mean_token_accuracy": 0.38965516686439516, + "step": 130690 + }, + { + "epoch": 0.1316373920899721, + "grad_norm": 10.27760616865546, + "learning_rate": 4.9185042992704195e-05, + "loss": 2.4212, + "mean_token_accuracy": 0.40344828367233276, + "step": 130695 + }, + { + "epoch": 0.13164242814307628, + "grad_norm": 9.075318294268499, + "learning_rate": 4.91849430536893e-05, + "loss": 2.0368, + "mean_token_accuracy": 0.46896551847457885, + "step": 130700 + }, + { + "epoch": 0.13164746419618045, + "grad_norm": 10.629530341777205, + "learning_rate": 4.918484310866001e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.4310344815254211, + "step": 130705 + }, + { + "epoch": 0.13165250024928463, + "grad_norm": 12.129961913344362, + "learning_rate": 4.918474315761634e-05, + "loss": 2.5669, + "mean_token_accuracy": 0.39655172526836396, + "step": 130710 + }, + { + "epoch": 0.1316575363023888, + "grad_norm": 9.811054599151198, + "learning_rate": 4.918464320055833e-05, + "loss": 2.3931, + "mean_token_accuracy": 0.48275862336158754, + "step": 130715 + }, + { + "epoch": 0.13166257235549297, + "grad_norm": 10.855580974194785, + "learning_rate": 4.9184543237485996e-05, + "loss": 2.555, + "mean_token_accuracy": 0.4034482717514038, + "step": 130720 + }, + { + "epoch": 0.13166760840859715, + "grad_norm": 10.0511608861002, + "learning_rate": 4.918444326839938e-05, + "loss": 2.1263, + "mean_token_accuracy": 0.43448275327682495, + "step": 130725 + }, + { + "epoch": 0.13167264446170132, + "grad_norm": 11.743041098757228, + "learning_rate": 4.91843432932985e-05, + "loss": 2.9244, + "mean_token_accuracy": 0.38620689511299133, + "step": 130730 + }, + { + "epoch": 0.1316776805148055, + "grad_norm": 10.961361444324625, + "learning_rate": 4.918424331218338e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.4448275864124298, + "step": 130735 + }, + { + "epoch": 0.13168271656790967, + "grad_norm": 11.201072956082047, + "learning_rate": 4.9184143325054064e-05, + "loss": 2.5289, + "mean_token_accuracy": 0.3896551728248596, + "step": 130740 + }, + { + "epoch": 0.13168775262101384, + "grad_norm": 11.143533927783068, + "learning_rate": 4.918404333191057e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.44482759237289426, + "step": 130745 + }, + { + "epoch": 0.13169278867411802, + "grad_norm": 12.110253306948275, + "learning_rate": 4.918394333275292e-05, + "loss": 2.3231, + "mean_token_accuracy": 0.4724137902259827, + "step": 130750 + }, + { + "epoch": 0.1316978247272222, + "grad_norm": 31.580804448961075, + "learning_rate": 4.9183843327581155e-05, + "loss": 2.398, + "mean_token_accuracy": 0.4482758641242981, + "step": 130755 + }, + { + "epoch": 0.13170286078032636, + "grad_norm": 11.051335679790999, + "learning_rate": 4.918374331639529e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.4034482777118683, + "step": 130760 + }, + { + "epoch": 0.13170789683343054, + "grad_norm": 11.582452527757654, + "learning_rate": 4.918364329919536e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.3999999940395355, + "step": 130765 + }, + { + "epoch": 0.1317129328865347, + "grad_norm": 12.362602207543476, + "learning_rate": 4.9183543275981385e-05, + "loss": 2.4198, + "mean_token_accuracy": 0.4172413766384125, + "step": 130770 + }, + { + "epoch": 0.13171796893963889, + "grad_norm": 11.07896659049019, + "learning_rate": 4.918344324675341e-05, + "loss": 2.5332, + "mean_token_accuracy": 0.4398669064044952, + "step": 130775 + }, + { + "epoch": 0.13172300499274306, + "grad_norm": 9.869163444998218, + "learning_rate": 4.918334321151144e-05, + "loss": 2.0117, + "mean_token_accuracy": 0.4344827592372894, + "step": 130780 + }, + { + "epoch": 0.13172804104584723, + "grad_norm": 10.216398764595747, + "learning_rate": 4.918324317025552e-05, + "loss": 2.4519, + "mean_token_accuracy": 0.4206896543502808, + "step": 130785 + }, + { + "epoch": 0.1317330770989514, + "grad_norm": 10.647137697604052, + "learning_rate": 4.9183143122985665e-05, + "loss": 2.664, + "mean_token_accuracy": 0.3689655065536499, + "step": 130790 + }, + { + "epoch": 0.13173811315205555, + "grad_norm": 12.942946391372622, + "learning_rate": 4.918304306970191e-05, + "loss": 2.0517, + "mean_token_accuracy": 0.48275861144065857, + "step": 130795 + }, + { + "epoch": 0.13174314920515973, + "grad_norm": 12.145405589499571, + "learning_rate": 4.9182943010404286e-05, + "loss": 2.7871, + "mean_token_accuracy": 0.37241379022598264, + "step": 130800 + }, + { + "epoch": 0.1317481852582639, + "grad_norm": 10.703535198521816, + "learning_rate": 4.9182842945092814e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.43448275327682495, + "step": 130805 + }, + { + "epoch": 0.13175322131136807, + "grad_norm": 9.716822014926146, + "learning_rate": 4.918274287376753e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.38965516686439516, + "step": 130810 + }, + { + "epoch": 0.13175825736447225, + "grad_norm": 9.543300299718846, + "learning_rate": 4.9182642796428455e-05, + "loss": 2.2674, + "mean_token_accuracy": 0.4294615864753723, + "step": 130815 + }, + { + "epoch": 0.13176329341757642, + "grad_norm": 13.564384799143454, + "learning_rate": 4.918254271307562e-05, + "loss": 2.7759, + "mean_token_accuracy": 0.4379310250282288, + "step": 130820 + }, + { + "epoch": 0.1317683294706806, + "grad_norm": 10.11796654090193, + "learning_rate": 4.9182442623709045e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.493103438615799, + "step": 130825 + }, + { + "epoch": 0.13177336552378477, + "grad_norm": 10.51085136920519, + "learning_rate": 4.9182342528328764e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.4068965494632721, + "step": 130830 + }, + { + "epoch": 0.13177840157688894, + "grad_norm": 10.877044530946895, + "learning_rate": 4.918224242693481e-05, + "loss": 2.1231, + "mean_token_accuracy": 0.4896551787853241, + "step": 130835 + }, + { + "epoch": 0.13178343762999312, + "grad_norm": 9.954786739638214, + "learning_rate": 4.9182142319527194e-05, + "loss": 2.3727, + "mean_token_accuracy": 0.43448275327682495, + "step": 130840 + }, + { + "epoch": 0.1317884736830973, + "grad_norm": 10.956913633085835, + "learning_rate": 4.918204220610597e-05, + "loss": 2.2265, + "mean_token_accuracy": 0.44482758045196535, + "step": 130845 + }, + { + "epoch": 0.13179350973620146, + "grad_norm": 10.389404041662878, + "learning_rate": 4.9181942086671134e-05, + "loss": 2.1054, + "mean_token_accuracy": 0.46896551847457885, + "step": 130850 + }, + { + "epoch": 0.13179854578930564, + "grad_norm": 13.511752163760171, + "learning_rate": 4.918184196122275e-05, + "loss": 2.6279, + "mean_token_accuracy": 0.38275861740112305, + "step": 130855 + }, + { + "epoch": 0.1318035818424098, + "grad_norm": 10.996592771240188, + "learning_rate": 4.9181741829760816e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.36896551847457887, + "step": 130860 + }, + { + "epoch": 0.13180861789551399, + "grad_norm": 12.723959178336107, + "learning_rate": 4.918164169228536e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.4241379201412201, + "step": 130865 + }, + { + "epoch": 0.13181365394861816, + "grad_norm": 11.729989690647935, + "learning_rate": 4.918154154879644e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.4379310369491577, + "step": 130870 + }, + { + "epoch": 0.13181869000172233, + "grad_norm": 10.219886363999361, + "learning_rate": 4.918144139929405e-05, + "loss": 2.6907, + "mean_token_accuracy": 0.4396249294281006, + "step": 130875 + }, + { + "epoch": 0.1318237260548265, + "grad_norm": 12.509321280105091, + "learning_rate": 4.918134124377824e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.4413793087005615, + "step": 130880 + }, + { + "epoch": 0.13182876210793068, + "grad_norm": 10.991233959975574, + "learning_rate": 4.918124108224902e-05, + "loss": 2.8331, + "mean_token_accuracy": 0.3896551728248596, + "step": 130885 + }, + { + "epoch": 0.13183379816103485, + "grad_norm": 11.226964045819216, + "learning_rate": 4.9181140914706426e-05, + "loss": 2.6919, + "mean_token_accuracy": 0.4361161530017853, + "step": 130890 + }, + { + "epoch": 0.13183883421413903, + "grad_norm": 10.09364755610323, + "learning_rate": 4.91810407411505e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.38965516686439516, + "step": 130895 + }, + { + "epoch": 0.1318438702672432, + "grad_norm": 18.228300204087997, + "learning_rate": 4.918094056158124e-05, + "loss": 3.0076, + "mean_token_accuracy": 0.34482758641242983, + "step": 130900 + }, + { + "epoch": 0.13184890632034738, + "grad_norm": 11.023487326657166, + "learning_rate": 4.91808403759987e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.3793103456497192, + "step": 130905 + }, + { + "epoch": 0.13185394237345155, + "grad_norm": 10.180432916863133, + "learning_rate": 4.91807401844029e-05, + "loss": 2.3274, + "mean_token_accuracy": 0.4379310369491577, + "step": 130910 + }, + { + "epoch": 0.13185897842655572, + "grad_norm": 11.584152702809524, + "learning_rate": 4.918063998679386e-05, + "loss": 2.552, + "mean_token_accuracy": 0.38965517580509185, + "step": 130915 + }, + { + "epoch": 0.1318640144796599, + "grad_norm": 11.616318357888956, + "learning_rate": 4.918053978317161e-05, + "loss": 2.2036, + "mean_token_accuracy": 0.4448275864124298, + "step": 130920 + }, + { + "epoch": 0.13186905053276407, + "grad_norm": 10.43237190405584, + "learning_rate": 4.9180439573536194e-05, + "loss": 2.2806, + "mean_token_accuracy": 0.4862068831920624, + "step": 130925 + }, + { + "epoch": 0.13187408658586824, + "grad_norm": 11.56454662836667, + "learning_rate": 4.9180339357887616e-05, + "loss": 2.352, + "mean_token_accuracy": 0.3862069010734558, + "step": 130930 + }, + { + "epoch": 0.1318791226389724, + "grad_norm": 8.755932696871785, + "learning_rate": 4.918023913622592e-05, + "loss": 2.1958, + "mean_token_accuracy": 0.4862068951129913, + "step": 130935 + }, + { + "epoch": 0.13188415869207656, + "grad_norm": 11.669919603291996, + "learning_rate": 4.9180138908551125e-05, + "loss": 2.088, + "mean_token_accuracy": 0.49999999403953554, + "step": 130940 + }, + { + "epoch": 0.13188919474518074, + "grad_norm": 16.694921306766254, + "learning_rate": 4.9180038674863266e-05, + "loss": 3.0236, + "mean_token_accuracy": 0.3827586114406586, + "step": 130945 + }, + { + "epoch": 0.1318942307982849, + "grad_norm": 9.622264401455466, + "learning_rate": 4.917993843516237e-05, + "loss": 2.5885, + "mean_token_accuracy": 0.3758620619773865, + "step": 130950 + }, + { + "epoch": 0.13189926685138909, + "grad_norm": 10.367114962281956, + "learning_rate": 4.9179838189448457e-05, + "loss": 2.1693, + "mean_token_accuracy": 0.48275862336158754, + "step": 130955 + }, + { + "epoch": 0.13190430290449326, + "grad_norm": 8.965918179172675, + "learning_rate": 4.917973793772157e-05, + "loss": 1.8551, + "mean_token_accuracy": 0.541379302740097, + "step": 130960 + }, + { + "epoch": 0.13190933895759743, + "grad_norm": 9.200192226062848, + "learning_rate": 4.917963767998171e-05, + "loss": 2.4657, + "mean_token_accuracy": 0.3827586114406586, + "step": 130965 + }, + { + "epoch": 0.1319143750107016, + "grad_norm": 11.525048352643964, + "learning_rate": 4.917953741622894e-05, + "loss": 2.8565, + "mean_token_accuracy": 0.39310344457626345, + "step": 130970 + }, + { + "epoch": 0.13191941106380578, + "grad_norm": 11.234872839272798, + "learning_rate": 4.917943714646326e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.3275861978530884, + "step": 130975 + }, + { + "epoch": 0.13192444711690995, + "grad_norm": 9.656233619424537, + "learning_rate": 4.91793368706847e-05, + "loss": 2.575, + "mean_token_accuracy": 0.4206896543502808, + "step": 130980 + }, + { + "epoch": 0.13192948317001413, + "grad_norm": 10.807322796840618, + "learning_rate": 4.9179236588893304e-05, + "loss": 2.1792, + "mean_token_accuracy": 0.441379314661026, + "step": 130985 + }, + { + "epoch": 0.1319345192231183, + "grad_norm": 10.264219868380904, + "learning_rate": 4.917913630108909e-05, + "loss": 2.1323, + "mean_token_accuracy": 0.4586206912994385, + "step": 130990 + }, + { + "epoch": 0.13193955527622248, + "grad_norm": 12.58639345930477, + "learning_rate": 4.9179036007272086e-05, + "loss": 2.6361, + "mean_token_accuracy": 0.4, + "step": 130995 + }, + { + "epoch": 0.13194459132932665, + "grad_norm": 13.160175907894914, + "learning_rate": 4.917893570744232e-05, + "loss": 2.5628, + "mean_token_accuracy": 0.4103448212146759, + "step": 131000 + }, + { + "epoch": 0.13194962738243082, + "grad_norm": 10.335783733791828, + "learning_rate": 4.917883540159982e-05, + "loss": 2.3135, + "mean_token_accuracy": 0.45747126936912536, + "step": 131005 + }, + { + "epoch": 0.131954663435535, + "grad_norm": 10.232142277331928, + "learning_rate": 4.917873508974462e-05, + "loss": 2.4727, + "mean_token_accuracy": 0.44482758045196535, + "step": 131010 + }, + { + "epoch": 0.13195969948863917, + "grad_norm": 13.111616940367243, + "learning_rate": 4.9178634771876734e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.34482757449150087, + "step": 131015 + }, + { + "epoch": 0.13196473554174334, + "grad_norm": 10.46645009032725, + "learning_rate": 4.9178534447996197e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.4241379380226135, + "step": 131020 + }, + { + "epoch": 0.13196977159484752, + "grad_norm": 22.755171977644668, + "learning_rate": 4.917843411810305e-05, + "loss": 2.8994, + "mean_token_accuracy": 0.39310345649719236, + "step": 131025 + }, + { + "epoch": 0.1319748076479517, + "grad_norm": 12.08302287552268, + "learning_rate": 4.917833378219729e-05, + "loss": 2.44, + "mean_token_accuracy": 0.4034482777118683, + "step": 131030 + }, + { + "epoch": 0.13197984370105587, + "grad_norm": 10.741653749099575, + "learning_rate": 4.917823344027898e-05, + "loss": 2.3395, + "mean_token_accuracy": 0.44827585220336913, + "step": 131035 + }, + { + "epoch": 0.13198487975416004, + "grad_norm": 12.285779944372209, + "learning_rate": 4.917813309234812e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4206896543502808, + "step": 131040 + }, + { + "epoch": 0.1319899158072642, + "grad_norm": 9.717539691535709, + "learning_rate": 4.917803273840475e-05, + "loss": 2.1824, + "mean_token_accuracy": 0.47931033968925474, + "step": 131045 + }, + { + "epoch": 0.1319949518603684, + "grad_norm": 10.720397785005064, + "learning_rate": 4.91779323784489e-05, + "loss": 2.0308, + "mean_token_accuracy": 0.5103448271751404, + "step": 131050 + }, + { + "epoch": 0.13199998791347256, + "grad_norm": 8.744973254593255, + "learning_rate": 4.917783201248059e-05, + "loss": 2.5705, + "mean_token_accuracy": 0.41034482717514037, + "step": 131055 + }, + { + "epoch": 0.13200502396657673, + "grad_norm": 11.02182176009723, + "learning_rate": 4.917773164049986e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.45287356376647947, + "step": 131060 + }, + { + "epoch": 0.1320100600196809, + "grad_norm": 9.546201881254037, + "learning_rate": 4.917763126250673e-05, + "loss": 2.5843, + "mean_token_accuracy": 0.4448275864124298, + "step": 131065 + }, + { + "epoch": 0.13201509607278508, + "grad_norm": 11.119891023409128, + "learning_rate": 4.917753087850123e-05, + "loss": 2.7079, + "mean_token_accuracy": 0.3482758581638336, + "step": 131070 + }, + { + "epoch": 0.13202013212588923, + "grad_norm": 10.586922478488056, + "learning_rate": 4.9177430488483375e-05, + "loss": 2.2008, + "mean_token_accuracy": 0.46896552443504336, + "step": 131075 + }, + { + "epoch": 0.1320251681789934, + "grad_norm": 9.587490736970123, + "learning_rate": 4.917733009245321e-05, + "loss": 2.8213, + "mean_token_accuracy": 0.3551724135875702, + "step": 131080 + }, + { + "epoch": 0.13203020423209758, + "grad_norm": 11.598202276377595, + "learning_rate": 4.917722969041076e-05, + "loss": 2.8273, + "mean_token_accuracy": 0.3793103456497192, + "step": 131085 + }, + { + "epoch": 0.13203524028520175, + "grad_norm": 11.115493501480103, + "learning_rate": 4.917712928235604e-05, + "loss": 3.114, + "mean_token_accuracy": 0.42413792610168455, + "step": 131090 + }, + { + "epoch": 0.13204027633830592, + "grad_norm": 9.492574454602126, + "learning_rate": 4.9177028868289094e-05, + "loss": 2.3009, + "mean_token_accuracy": 0.45517241954803467, + "step": 131095 + }, + { + "epoch": 0.1320453123914101, + "grad_norm": 11.23168719622025, + "learning_rate": 4.917692844820994e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.417241370677948, + "step": 131100 + }, + { + "epoch": 0.13205034844451427, + "grad_norm": 9.585405627488765, + "learning_rate": 4.9176828022118615e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.41034482717514037, + "step": 131105 + }, + { + "epoch": 0.13205538449761844, + "grad_norm": 8.178961082310304, + "learning_rate": 4.917672759001514e-05, + "loss": 2.2924, + "mean_token_accuracy": 0.4344827592372894, + "step": 131110 + }, + { + "epoch": 0.13206042055072262, + "grad_norm": 9.714175817509103, + "learning_rate": 4.917662715189954e-05, + "loss": 2.2799, + "mean_token_accuracy": 0.46551724672317507, + "step": 131115 + }, + { + "epoch": 0.1320654566038268, + "grad_norm": 9.612685652369812, + "learning_rate": 4.9176526707771844e-05, + "loss": 1.8617, + "mean_token_accuracy": 0.4724137902259827, + "step": 131120 + }, + { + "epoch": 0.13207049265693097, + "grad_norm": 12.597026104451066, + "learning_rate": 4.917642625763209e-05, + "loss": 2.2525, + "mean_token_accuracy": 0.39310344457626345, + "step": 131125 + }, + { + "epoch": 0.13207552871003514, + "grad_norm": 8.632268085849248, + "learning_rate": 4.9176325801480295e-05, + "loss": 2.2023, + "mean_token_accuracy": 0.46896551847457885, + "step": 131130 + }, + { + "epoch": 0.1320805647631393, + "grad_norm": 10.383354543780289, + "learning_rate": 4.9176225339316486e-05, + "loss": 2.9107, + "mean_token_accuracy": 0.3931034505367279, + "step": 131135 + }, + { + "epoch": 0.1320856008162435, + "grad_norm": 11.693272698879378, + "learning_rate": 4.91761248711407e-05, + "loss": 2.6274, + "mean_token_accuracy": 0.38965516686439516, + "step": 131140 + }, + { + "epoch": 0.13209063686934766, + "grad_norm": 11.731994477902825, + "learning_rate": 4.9176024396952955e-05, + "loss": 2.4591, + "mean_token_accuracy": 0.4018148839473724, + "step": 131145 + }, + { + "epoch": 0.13209567292245183, + "grad_norm": 9.602340126816566, + "learning_rate": 4.9175923916753294e-05, + "loss": 2.2571, + "mean_token_accuracy": 0.4517241299152374, + "step": 131150 + }, + { + "epoch": 0.132100708975556, + "grad_norm": 11.349152587114629, + "learning_rate": 4.9175823430541725e-05, + "loss": 2.26, + "mean_token_accuracy": 0.3931034505367279, + "step": 131155 + }, + { + "epoch": 0.13210574502866018, + "grad_norm": 8.832727124403188, + "learning_rate": 4.917572293831829e-05, + "loss": 2.6766, + "mean_token_accuracy": 0.42413793206214906, + "step": 131160 + }, + { + "epoch": 0.13211078108176436, + "grad_norm": 11.520790035382664, + "learning_rate": 4.9175622440083006e-05, + "loss": 2.4084, + "mean_token_accuracy": 0.42413793206214906, + "step": 131165 + }, + { + "epoch": 0.13211581713486853, + "grad_norm": 11.453158788052189, + "learning_rate": 4.9175521935835916e-05, + "loss": 2.3295, + "mean_token_accuracy": 0.4068965494632721, + "step": 131170 + }, + { + "epoch": 0.1321208531879727, + "grad_norm": 11.026000468198363, + "learning_rate": 4.9175421425577035e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.3758620619773865, + "step": 131175 + }, + { + "epoch": 0.13212588924107688, + "grad_norm": 9.229372105944885, + "learning_rate": 4.9175320909306394e-05, + "loss": 2.0608, + "mean_token_accuracy": 0.4586206912994385, + "step": 131180 + }, + { + "epoch": 0.13213092529418105, + "grad_norm": 15.977849725511692, + "learning_rate": 4.917522038702402e-05, + "loss": 3.0136, + "mean_token_accuracy": 0.36206897497177126, + "step": 131185 + }, + { + "epoch": 0.13213596134728522, + "grad_norm": 11.309395976283522, + "learning_rate": 4.917511985872995e-05, + "loss": 2.5063, + "mean_token_accuracy": 0.3827586233615875, + "step": 131190 + }, + { + "epoch": 0.1321409974003894, + "grad_norm": 10.662406744295948, + "learning_rate": 4.91750193244242e-05, + "loss": 2.7202, + "mean_token_accuracy": 0.34827585220336915, + "step": 131195 + }, + { + "epoch": 0.13214603345349357, + "grad_norm": 9.929448598498977, + "learning_rate": 4.917491878410681e-05, + "loss": 2.667, + "mean_token_accuracy": 0.38275861740112305, + "step": 131200 + }, + { + "epoch": 0.13215106950659775, + "grad_norm": 10.571578543823838, + "learning_rate": 4.9174818237777786e-05, + "loss": 2.584, + "mean_token_accuracy": 0.3931034475564957, + "step": 131205 + }, + { + "epoch": 0.13215610555970192, + "grad_norm": 14.077657892704604, + "learning_rate": 4.9174717685437176e-05, + "loss": 2.7506, + "mean_token_accuracy": 0.37586206793785093, + "step": 131210 + }, + { + "epoch": 0.13216114161280607, + "grad_norm": 9.96251433581295, + "learning_rate": 4.9174617127085e-05, + "loss": 2.1662, + "mean_token_accuracy": 0.4517241418361664, + "step": 131215 + }, + { + "epoch": 0.13216617766591024, + "grad_norm": 10.087131419183478, + "learning_rate": 4.9174516562721296e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.4294615864753723, + "step": 131220 + }, + { + "epoch": 0.1321712137190144, + "grad_norm": 10.337666296090887, + "learning_rate": 4.917441599234608e-05, + "loss": 2.9218, + "mean_token_accuracy": 0.4206896543502808, + "step": 131225 + }, + { + "epoch": 0.1321762497721186, + "grad_norm": 11.760898393300893, + "learning_rate": 4.917431541595939e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.4569268047809601, + "step": 131230 + }, + { + "epoch": 0.13218128582522276, + "grad_norm": 11.750066054010318, + "learning_rate": 4.917421483356124e-05, + "loss": 2.2726, + "mean_token_accuracy": 0.4586206912994385, + "step": 131235 + }, + { + "epoch": 0.13218632187832693, + "grad_norm": 11.684625591762964, + "learning_rate": 4.917411424515166e-05, + "loss": 2.6757, + "mean_token_accuracy": 0.4000000059604645, + "step": 131240 + }, + { + "epoch": 0.1321913579314311, + "grad_norm": 9.368007996693185, + "learning_rate": 4.917401365073069e-05, + "loss": 2.0041, + "mean_token_accuracy": 0.44506957530975344, + "step": 131245 + }, + { + "epoch": 0.13219639398453528, + "grad_norm": 12.579201731817781, + "learning_rate": 4.917391305029836e-05, + "loss": 2.1639, + "mean_token_accuracy": 0.458620685338974, + "step": 131250 + }, + { + "epoch": 0.13220143003763946, + "grad_norm": 10.211357839688405, + "learning_rate": 4.917381244385468e-05, + "loss": 2.6896, + "mean_token_accuracy": 0.4, + "step": 131255 + }, + { + "epoch": 0.13220646609074363, + "grad_norm": 13.016531313607894, + "learning_rate": 4.9173711831399684e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.4137930989265442, + "step": 131260 + }, + { + "epoch": 0.1322115021438478, + "grad_norm": 11.52024239764897, + "learning_rate": 4.9173611212933413e-05, + "loss": 2.3147, + "mean_token_accuracy": 0.38753780722618103, + "step": 131265 + }, + { + "epoch": 0.13221653819695198, + "grad_norm": 10.485724964378825, + "learning_rate": 4.917351058845587e-05, + "loss": 2.6046, + "mean_token_accuracy": 0.4379310429096222, + "step": 131270 + }, + { + "epoch": 0.13222157425005615, + "grad_norm": 10.100919214957443, + "learning_rate": 4.9173409957967115e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.4344827592372894, + "step": 131275 + }, + { + "epoch": 0.13222661030316032, + "grad_norm": 10.351577044309916, + "learning_rate": 4.9173309321467154e-05, + "loss": 2.3228, + "mean_token_accuracy": 0.41034482717514037, + "step": 131280 + }, + { + "epoch": 0.1322316463562645, + "grad_norm": 9.561083520264361, + "learning_rate": 4.917320867895602e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.4986085891723633, + "step": 131285 + }, + { + "epoch": 0.13223668240936867, + "grad_norm": 13.970629077734946, + "learning_rate": 4.917310803043374e-05, + "loss": 2.14, + "mean_token_accuracy": 0.49153055548667907, + "step": 131290 + }, + { + "epoch": 0.13224171846247285, + "grad_norm": 9.138960642554114, + "learning_rate": 4.9173007375900345e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.42413793206214906, + "step": 131295 + }, + { + "epoch": 0.13224675451557702, + "grad_norm": 10.315801795330843, + "learning_rate": 4.917290671535585e-05, + "loss": 2.5188, + "mean_token_accuracy": 0.37586206793785093, + "step": 131300 + }, + { + "epoch": 0.1322517905686812, + "grad_norm": 8.279586333882575, + "learning_rate": 4.91728060488003e-05, + "loss": 2.4168, + "mean_token_accuracy": 0.42413793206214906, + "step": 131305 + }, + { + "epoch": 0.13225682662178537, + "grad_norm": 10.277069416620028, + "learning_rate": 4.917270537623372e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.5172413647174835, + "step": 131310 + }, + { + "epoch": 0.13226186267488954, + "grad_norm": 9.939770771114727, + "learning_rate": 4.917260469765613e-05, + "loss": 2.0263, + "mean_token_accuracy": 0.4398064136505127, + "step": 131315 + }, + { + "epoch": 0.13226689872799371, + "grad_norm": 9.271120978803843, + "learning_rate": 4.917250401306756e-05, + "loss": 2.2392, + "mean_token_accuracy": 0.4620689570903778, + "step": 131320 + }, + { + "epoch": 0.1322719347810979, + "grad_norm": 15.96267272538953, + "learning_rate": 4.9172403322468044e-05, + "loss": 2.3658, + "mean_token_accuracy": 0.4379310369491577, + "step": 131325 + }, + { + "epoch": 0.13227697083420206, + "grad_norm": 13.79683631607779, + "learning_rate": 4.917230262585761e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.4034482717514038, + "step": 131330 + }, + { + "epoch": 0.13228200688730624, + "grad_norm": 10.343560728145343, + "learning_rate": 4.917220192323628e-05, + "loss": 2.3659, + "mean_token_accuracy": 0.41724138259887694, + "step": 131335 + }, + { + "epoch": 0.1322870429404104, + "grad_norm": 9.719834097177923, + "learning_rate": 4.917210121460407e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.4310344815254211, + "step": 131340 + }, + { + "epoch": 0.13229207899351458, + "grad_norm": 21.46685489085881, + "learning_rate": 4.917200049996104e-05, + "loss": 2.8414, + "mean_token_accuracy": 0.3931034505367279, + "step": 131345 + }, + { + "epoch": 0.13229711504661876, + "grad_norm": 11.96503878559186, + "learning_rate": 4.917189977930719e-05, + "loss": 2.3628, + "mean_token_accuracy": 0.4137930989265442, + "step": 131350 + }, + { + "epoch": 0.1323021510997229, + "grad_norm": 12.497098217018177, + "learning_rate": 4.917179905264256e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.4206896543502808, + "step": 131355 + }, + { + "epoch": 0.13230718715282708, + "grad_norm": 10.17702784969293, + "learning_rate": 4.917169831996718e-05, + "loss": 2.3237, + "mean_token_accuracy": 0.4137930929660797, + "step": 131360 + }, + { + "epoch": 0.13231222320593125, + "grad_norm": 12.070017585259182, + "learning_rate": 4.917159758128106e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.4655172348022461, + "step": 131365 + }, + { + "epoch": 0.13231725925903542, + "grad_norm": 10.332457917331118, + "learning_rate": 4.9171496836584265e-05, + "loss": 2.0801, + "mean_token_accuracy": 0.4724137902259827, + "step": 131370 + }, + { + "epoch": 0.1323222953121396, + "grad_norm": 13.063408071261948, + "learning_rate": 4.917139608587678e-05, + "loss": 2.5882, + "mean_token_accuracy": 0.41548699140548706, + "step": 131375 + }, + { + "epoch": 0.13232733136524377, + "grad_norm": 10.314802861097107, + "learning_rate": 4.917129532915866e-05, + "loss": 2.3084, + "mean_token_accuracy": 0.4, + "step": 131380 + }, + { + "epoch": 0.13233236741834795, + "grad_norm": 10.439205836624764, + "learning_rate": 4.9171194566429924e-05, + "loss": 2.078, + "mean_token_accuracy": 0.4551724076271057, + "step": 131385 + }, + { + "epoch": 0.13233740347145212, + "grad_norm": 10.37218801985173, + "learning_rate": 4.91710937976906e-05, + "loss": 1.9862, + "mean_token_accuracy": 0.42413792610168455, + "step": 131390 + }, + { + "epoch": 0.1323424395245563, + "grad_norm": 36.31546879822128, + "learning_rate": 4.917099302294072e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.43980641961097716, + "step": 131395 + }, + { + "epoch": 0.13234747557766047, + "grad_norm": 10.501689614303995, + "learning_rate": 4.917089224218031e-05, + "loss": 2.2748, + "mean_token_accuracy": 0.42758620977401735, + "step": 131400 + }, + { + "epoch": 0.13235251163076464, + "grad_norm": 8.661146613634248, + "learning_rate": 4.9170791455409394e-05, + "loss": 2.3483, + "mean_token_accuracy": 0.42413792610168455, + "step": 131405 + }, + { + "epoch": 0.13235754768386881, + "grad_norm": 11.843981967257625, + "learning_rate": 4.9170690662628e-05, + "loss": 2.0483, + "mean_token_accuracy": 0.46551724076271056, + "step": 131410 + }, + { + "epoch": 0.132362583736973, + "grad_norm": 12.1326545456674, + "learning_rate": 4.9170589863836166e-05, + "loss": 2.5534, + "mean_token_accuracy": 0.42758620977401735, + "step": 131415 + }, + { + "epoch": 0.13236761979007716, + "grad_norm": 8.759100106322363, + "learning_rate": 4.917048905903391e-05, + "loss": 2.3965, + "mean_token_accuracy": 0.4137930989265442, + "step": 131420 + }, + { + "epoch": 0.13237265584318134, + "grad_norm": 8.406203514615466, + "learning_rate": 4.9170388248221264e-05, + "loss": 1.8667, + "mean_token_accuracy": 0.5459359467029572, + "step": 131425 + }, + { + "epoch": 0.1323776918962855, + "grad_norm": 10.691088865635312, + "learning_rate": 4.9170287431398254e-05, + "loss": 2.3505, + "mean_token_accuracy": 0.43103447556495667, + "step": 131430 + }, + { + "epoch": 0.13238272794938968, + "grad_norm": 15.15634085761613, + "learning_rate": 4.917018660856491e-05, + "loss": 2.6865, + "mean_token_accuracy": 0.3999999940395355, + "step": 131435 + }, + { + "epoch": 0.13238776400249386, + "grad_norm": 10.948326879346327, + "learning_rate": 4.9170085779721256e-05, + "loss": 2.2749, + "mean_token_accuracy": 0.43103448748588563, + "step": 131440 + }, + { + "epoch": 0.13239280005559803, + "grad_norm": 11.414183825778988, + "learning_rate": 4.9169984944867334e-05, + "loss": 2.509, + "mean_token_accuracy": 0.4206896543502808, + "step": 131445 + }, + { + "epoch": 0.1323978361087022, + "grad_norm": 12.886774756888196, + "learning_rate": 4.916988410400314e-05, + "loss": 2.7913, + "mean_token_accuracy": 0.4068965554237366, + "step": 131450 + }, + { + "epoch": 0.13240287216180638, + "grad_norm": 10.089831696112617, + "learning_rate": 4.9169783257128735e-05, + "loss": 1.9596, + "mean_token_accuracy": 0.5071428477764129, + "step": 131455 + }, + { + "epoch": 0.13240790821491055, + "grad_norm": 10.369503562693835, + "learning_rate": 4.916968240424414e-05, + "loss": 2.4564, + "mean_token_accuracy": 0.4034482777118683, + "step": 131460 + }, + { + "epoch": 0.13241294426801473, + "grad_norm": 9.905948068893506, + "learning_rate": 4.916958154534937e-05, + "loss": 2.3357, + "mean_token_accuracy": 0.4586207032203674, + "step": 131465 + }, + { + "epoch": 0.1324179803211189, + "grad_norm": 10.701725071365345, + "learning_rate": 4.9169480680444466e-05, + "loss": 2.747, + "mean_token_accuracy": 0.3862068891525269, + "step": 131470 + }, + { + "epoch": 0.13242301637422307, + "grad_norm": 8.490209353266549, + "learning_rate": 4.9169379809529445e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.43793103098869324, + "step": 131475 + }, + { + "epoch": 0.13242805242732725, + "grad_norm": 10.229179170868987, + "learning_rate": 4.916927893260434e-05, + "loss": 2.3276, + "mean_token_accuracy": 0.4448275864124298, + "step": 131480 + }, + { + "epoch": 0.13243308848043142, + "grad_norm": 10.496374428250446, + "learning_rate": 4.916917804966918e-05, + "loss": 2.4068, + "mean_token_accuracy": 0.37241379022598264, + "step": 131485 + }, + { + "epoch": 0.1324381245335356, + "grad_norm": 10.760596030963063, + "learning_rate": 4.9169077160724e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.46412583589553835, + "step": 131490 + }, + { + "epoch": 0.13244316058663974, + "grad_norm": 9.846684539687933, + "learning_rate": 4.9168976265768816e-05, + "loss": 2.3506, + "mean_token_accuracy": 0.4344827592372894, + "step": 131495 + }, + { + "epoch": 0.13244819663974391, + "grad_norm": 9.88283171136494, + "learning_rate": 4.916887536480366e-05, + "loss": 2.6195, + "mean_token_accuracy": 0.4068965554237366, + "step": 131500 + }, + { + "epoch": 0.1324532326928481, + "grad_norm": 10.896891512713259, + "learning_rate": 4.9168774457828563e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.3999999940395355, + "step": 131505 + }, + { + "epoch": 0.13245826874595226, + "grad_norm": 9.659845829241906, + "learning_rate": 4.916867354484355e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.43103447556495667, + "step": 131510 + }, + { + "epoch": 0.13246330479905644, + "grad_norm": 11.485363420803353, + "learning_rate": 4.9168572625848656e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.3896551728248596, + "step": 131515 + }, + { + "epoch": 0.1324683408521606, + "grad_norm": 11.92137294081313, + "learning_rate": 4.916847170084389e-05, + "loss": 2.6137, + "mean_token_accuracy": 0.4068965554237366, + "step": 131520 + }, + { + "epoch": 0.13247337690526478, + "grad_norm": 9.675054214550553, + "learning_rate": 4.9168370769829295e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.4034482717514038, + "step": 131525 + }, + { + "epoch": 0.13247841295836896, + "grad_norm": 11.477417646564005, + "learning_rate": 4.91682698328049e-05, + "loss": 2.2241, + "mean_token_accuracy": 0.47773743867874147, + "step": 131530 + }, + { + "epoch": 0.13248344901147313, + "grad_norm": 9.78207054101505, + "learning_rate": 4.916816888977073e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.4811857283115387, + "step": 131535 + }, + { + "epoch": 0.1324884850645773, + "grad_norm": 11.22467178490561, + "learning_rate": 4.9168067940726817e-05, + "loss": 2.4509, + "mean_token_accuracy": 0.443254691362381, + "step": 131540 + }, + { + "epoch": 0.13249352111768148, + "grad_norm": 13.435615470633008, + "learning_rate": 4.9167966985673174e-05, + "loss": 2.9686, + "mean_token_accuracy": 0.34482758641242983, + "step": 131545 + }, + { + "epoch": 0.13249855717078565, + "grad_norm": 8.849383088208135, + "learning_rate": 4.9167866024609845e-05, + "loss": 2.4739, + "mean_token_accuracy": 0.40532365441322327, + "step": 131550 + }, + { + "epoch": 0.13250359322388983, + "grad_norm": 11.395979086494675, + "learning_rate": 4.916776505753685e-05, + "loss": 2.2397, + "mean_token_accuracy": 0.4620689630508423, + "step": 131555 + }, + { + "epoch": 0.132508629276994, + "grad_norm": 9.820701868560276, + "learning_rate": 4.916766408445422e-05, + "loss": 2.1531, + "mean_token_accuracy": 0.47586206197738645, + "step": 131560 + }, + { + "epoch": 0.13251366533009817, + "grad_norm": 9.71007132106587, + "learning_rate": 4.916756310536199e-05, + "loss": 2.1673, + "mean_token_accuracy": 0.4448275864124298, + "step": 131565 + }, + { + "epoch": 0.13251870138320235, + "grad_norm": 11.118668292212806, + "learning_rate": 4.916746212026017e-05, + "loss": 2.0053, + "mean_token_accuracy": 0.4896551728248596, + "step": 131570 + }, + { + "epoch": 0.13252373743630652, + "grad_norm": 12.048046714378241, + "learning_rate": 4.91673611291488e-05, + "loss": 2.579, + "mean_token_accuracy": 0.39310344457626345, + "step": 131575 + }, + { + "epoch": 0.1325287734894107, + "grad_norm": 15.87882610048152, + "learning_rate": 4.916726013202791e-05, + "loss": 2.22, + "mean_token_accuracy": 0.41379310488700866, + "step": 131580 + }, + { + "epoch": 0.13253380954251487, + "grad_norm": 12.182246906187553, + "learning_rate": 4.9167159128897524e-05, + "loss": 2.6226, + "mean_token_accuracy": 0.4034482777118683, + "step": 131585 + }, + { + "epoch": 0.13253884559561904, + "grad_norm": 9.862150161993853, + "learning_rate": 4.916705811975767e-05, + "loss": 2.0312, + "mean_token_accuracy": 0.4379310429096222, + "step": 131590 + }, + { + "epoch": 0.13254388164872322, + "grad_norm": 19.43909647021108, + "learning_rate": 4.916695710460838e-05, + "loss": 2.6867, + "mean_token_accuracy": 0.40689656138420105, + "step": 131595 + }, + { + "epoch": 0.1325489177018274, + "grad_norm": 9.562658268984745, + "learning_rate": 4.916685608344968e-05, + "loss": 2.5226, + "mean_token_accuracy": 0.39140955805778505, + "step": 131600 + }, + { + "epoch": 0.13255395375493156, + "grad_norm": 10.78529389997785, + "learning_rate": 4.916675505628159e-05, + "loss": 2.0993, + "mean_token_accuracy": 0.44827585220336913, + "step": 131605 + }, + { + "epoch": 0.13255898980803574, + "grad_norm": 10.670135376599857, + "learning_rate": 4.9166654023104144e-05, + "loss": 2.2091, + "mean_token_accuracy": 0.43793103098869324, + "step": 131610 + }, + { + "epoch": 0.1325640258611399, + "grad_norm": 9.393311037102201, + "learning_rate": 4.9166552983917375e-05, + "loss": 2.5331, + "mean_token_accuracy": 0.4068965494632721, + "step": 131615 + }, + { + "epoch": 0.13256906191424409, + "grad_norm": 10.847580722863094, + "learning_rate": 4.9166451938721306e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.40689654350280763, + "step": 131620 + }, + { + "epoch": 0.13257409796734826, + "grad_norm": 8.923688101012566, + "learning_rate": 4.9166350887515966e-05, + "loss": 1.9499, + "mean_token_accuracy": 0.5206896483898162, + "step": 131625 + }, + { + "epoch": 0.13257913402045243, + "grad_norm": 9.482095439516355, + "learning_rate": 4.916624983030138e-05, + "loss": 2.4504, + "mean_token_accuracy": 0.4344827592372894, + "step": 131630 + }, + { + "epoch": 0.13258417007355658, + "grad_norm": 8.73390809723528, + "learning_rate": 4.916614876707758e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.42413793206214906, + "step": 131635 + }, + { + "epoch": 0.13258920612666075, + "grad_norm": 11.26250179756786, + "learning_rate": 4.91660476978446e-05, + "loss": 2.0022, + "mean_token_accuracy": 0.5086509406566619, + "step": 131640 + }, + { + "epoch": 0.13259424217976493, + "grad_norm": 11.449484641269287, + "learning_rate": 4.9165946622602456e-05, + "loss": 2.3139, + "mean_token_accuracy": 0.44482758045196535, + "step": 131645 + }, + { + "epoch": 0.1325992782328691, + "grad_norm": 8.657478646736775, + "learning_rate": 4.916584554135118e-05, + "loss": 2.367, + "mean_token_accuracy": 0.44137930274009707, + "step": 131650 + }, + { + "epoch": 0.13260431428597327, + "grad_norm": 11.247643761071732, + "learning_rate": 4.916574445409079e-05, + "loss": 2.2148, + "mean_token_accuracy": 0.4172413766384125, + "step": 131655 + }, + { + "epoch": 0.13260935033907745, + "grad_norm": 13.109814885357773, + "learning_rate": 4.916564336082134e-05, + "loss": 2.8525, + "mean_token_accuracy": 0.3275861978530884, + "step": 131660 + }, + { + "epoch": 0.13261438639218162, + "grad_norm": 11.0417158925555, + "learning_rate": 4.916554226154283e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.45862067937850953, + "step": 131665 + }, + { + "epoch": 0.1326194224452858, + "grad_norm": 11.743411025304352, + "learning_rate": 4.916544115625531e-05, + "loss": 2.4164, + "mean_token_accuracy": 0.4206896543502808, + "step": 131670 + }, + { + "epoch": 0.13262445849838997, + "grad_norm": 15.62222504259958, + "learning_rate": 4.9165340044958806e-05, + "loss": 2.8422, + "mean_token_accuracy": 0.3896551728248596, + "step": 131675 + }, + { + "epoch": 0.13262949455149414, + "grad_norm": 9.31799944861975, + "learning_rate": 4.916523892765333e-05, + "loss": 2.4405, + "mean_token_accuracy": 0.44827587008476255, + "step": 131680 + }, + { + "epoch": 0.13263453060459832, + "grad_norm": 9.377359860755403, + "learning_rate": 4.916513780433892e-05, + "loss": 2.2141, + "mean_token_accuracy": 0.49741379618644715, + "step": 131685 + }, + { + "epoch": 0.1326395666577025, + "grad_norm": 10.512934130986027, + "learning_rate": 4.9165036675015605e-05, + "loss": 2.6758, + "mean_token_accuracy": 0.3999999940395355, + "step": 131690 + }, + { + "epoch": 0.13264460271080666, + "grad_norm": 10.28861878063264, + "learning_rate": 4.9164935539683414e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.43992740511894224, + "step": 131695 + }, + { + "epoch": 0.13264963876391084, + "grad_norm": 11.21309032444019, + "learning_rate": 4.916483439834236e-05, + "loss": 2.6913, + "mean_token_accuracy": 0.40000000298023225, + "step": 131700 + }, + { + "epoch": 0.132654674817015, + "grad_norm": 10.510568610255131, + "learning_rate": 4.9164733250992495e-05, + "loss": 2.3078, + "mean_token_accuracy": 0.41379310488700866, + "step": 131705 + }, + { + "epoch": 0.13265971087011919, + "grad_norm": 14.342943222275748, + "learning_rate": 4.9164632097633837e-05, + "loss": 2.3003, + "mean_token_accuracy": 0.48004926443099977, + "step": 131710 + }, + { + "epoch": 0.13266474692322336, + "grad_norm": 10.761889931504381, + "learning_rate": 4.91645309382664e-05, + "loss": 2.1374, + "mean_token_accuracy": 0.4551724076271057, + "step": 131715 + }, + { + "epoch": 0.13266978297632753, + "grad_norm": 11.984663810654952, + "learning_rate": 4.9164429772890235e-05, + "loss": 2.4102, + "mean_token_accuracy": 0.4655172348022461, + "step": 131720 + }, + { + "epoch": 0.1326748190294317, + "grad_norm": 12.920931275325259, + "learning_rate": 4.916432860150535e-05, + "loss": 2.5328, + "mean_token_accuracy": 0.46376285552978513, + "step": 131725 + }, + { + "epoch": 0.13267985508253588, + "grad_norm": 9.875292559232992, + "learning_rate": 4.916422742411179e-05, + "loss": 2.4135, + "mean_token_accuracy": 0.4482758641242981, + "step": 131730 + }, + { + "epoch": 0.13268489113564005, + "grad_norm": 12.410993239805087, + "learning_rate": 4.916412624070958e-05, + "loss": 2.433, + "mean_token_accuracy": 0.3896551728248596, + "step": 131735 + }, + { + "epoch": 0.13268992718874423, + "grad_norm": 9.917737556210058, + "learning_rate": 4.916402505129874e-05, + "loss": 2.4898, + "mean_token_accuracy": 0.43793103098869324, + "step": 131740 + }, + { + "epoch": 0.1326949632418484, + "grad_norm": 12.867834643911216, + "learning_rate": 4.91639238558793e-05, + "loss": 2.4387, + "mean_token_accuracy": 0.41034482717514037, + "step": 131745 + }, + { + "epoch": 0.13269999929495258, + "grad_norm": 9.632127676682542, + "learning_rate": 4.91638226544513e-05, + "loss": 2.233, + "mean_token_accuracy": 0.458620685338974, + "step": 131750 + }, + { + "epoch": 0.13270503534805675, + "grad_norm": 9.573333116567206, + "learning_rate": 4.916372144701474e-05, + "loss": 2.0884, + "mean_token_accuracy": 0.48965516686439514, + "step": 131755 + }, + { + "epoch": 0.13271007140116092, + "grad_norm": 9.435793480784792, + "learning_rate": 4.916362023356968e-05, + "loss": 2.0749, + "mean_token_accuracy": 0.4551724135875702, + "step": 131760 + }, + { + "epoch": 0.1327151074542651, + "grad_norm": 10.638511432362334, + "learning_rate": 4.916351901411612e-05, + "loss": 2.365, + "mean_token_accuracy": 0.4413793087005615, + "step": 131765 + }, + { + "epoch": 0.13272014350736927, + "grad_norm": 19.767062646244298, + "learning_rate": 4.916341778865411e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.3999999940395355, + "step": 131770 + }, + { + "epoch": 0.13272517956047342, + "grad_norm": 10.474225381550344, + "learning_rate": 4.9163316557183675e-05, + "loss": 2.11, + "mean_token_accuracy": 0.5126436829566956, + "step": 131775 + }, + { + "epoch": 0.1327302156135776, + "grad_norm": 11.657439438173649, + "learning_rate": 4.916321531970484e-05, + "loss": 2.7211, + "mean_token_accuracy": 0.4068965494632721, + "step": 131780 + }, + { + "epoch": 0.13273525166668176, + "grad_norm": 13.246943746822842, + "learning_rate": 4.916311407621762e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.44827585816383364, + "step": 131785 + }, + { + "epoch": 0.13274028771978594, + "grad_norm": 8.023523552751579, + "learning_rate": 4.916301282672206e-05, + "loss": 2.049, + "mean_token_accuracy": 0.5275862038135528, + "step": 131790 + }, + { + "epoch": 0.1327453237728901, + "grad_norm": 8.637507836645831, + "learning_rate": 4.916291157121819e-05, + "loss": 2.228, + "mean_token_accuracy": 0.4931034505367279, + "step": 131795 + }, + { + "epoch": 0.13275035982599429, + "grad_norm": 11.564754129719343, + "learning_rate": 4.916281030970602e-05, + "loss": 2.9449, + "mean_token_accuracy": 0.3862069010734558, + "step": 131800 + }, + { + "epoch": 0.13275539587909846, + "grad_norm": 9.99793168000728, + "learning_rate": 4.9162709042185596e-05, + "loss": 2.1108, + "mean_token_accuracy": 0.4689655125141144, + "step": 131805 + }, + { + "epoch": 0.13276043193220263, + "grad_norm": 11.47606025338248, + "learning_rate": 4.916260776865693e-05, + "loss": 2.536, + "mean_token_accuracy": 0.42413793206214906, + "step": 131810 + }, + { + "epoch": 0.1327654679853068, + "grad_norm": 10.340388348556395, + "learning_rate": 4.916250648912006e-05, + "loss": 2.8799, + "mean_token_accuracy": 0.34482758939266206, + "step": 131815 + }, + { + "epoch": 0.13277050403841098, + "grad_norm": 11.270559860737144, + "learning_rate": 4.916240520357503e-05, + "loss": 2.5084, + "mean_token_accuracy": 0.42068964838981626, + "step": 131820 + }, + { + "epoch": 0.13277554009151515, + "grad_norm": 10.355598985787058, + "learning_rate": 4.916230391202183e-05, + "loss": 2.0765, + "mean_token_accuracy": 0.5034482777118683, + "step": 131825 + }, + { + "epoch": 0.13278057614461933, + "grad_norm": 10.490987947046968, + "learning_rate": 4.9162202614460516e-05, + "loss": 2.6774, + "mean_token_accuracy": 0.3896551728248596, + "step": 131830 + }, + { + "epoch": 0.1327856121977235, + "grad_norm": 11.359052761441204, + "learning_rate": 4.9162101310891115e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.46551724076271056, + "step": 131835 + }, + { + "epoch": 0.13279064825082768, + "grad_norm": 8.805525345810198, + "learning_rate": 4.916200000131364e-05, + "loss": 2.418, + "mean_token_accuracy": 0.42758620381355283, + "step": 131840 + }, + { + "epoch": 0.13279568430393185, + "grad_norm": 11.485304325040717, + "learning_rate": 4.9161898685728135e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.41379311084747317, + "step": 131845 + }, + { + "epoch": 0.13280072035703602, + "grad_norm": 11.85380995143541, + "learning_rate": 4.916179736413462e-05, + "loss": 2.7883, + "mean_token_accuracy": 0.41724138259887694, + "step": 131850 + }, + { + "epoch": 0.1328057564101402, + "grad_norm": 11.79408619798211, + "learning_rate": 4.916169603653313e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.43448275327682495, + "step": 131855 + }, + { + "epoch": 0.13281079246324437, + "grad_norm": 10.814319003709935, + "learning_rate": 4.916159470292368e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.42758620381355283, + "step": 131860 + }, + { + "epoch": 0.13281582851634854, + "grad_norm": 12.36461339934935, + "learning_rate": 4.916149336330631e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.4758620738983154, + "step": 131865 + }, + { + "epoch": 0.13282086456945272, + "grad_norm": 10.251104530062156, + "learning_rate": 4.916139201768104e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.42413793206214906, + "step": 131870 + }, + { + "epoch": 0.1328259006225569, + "grad_norm": 8.484113929065005, + "learning_rate": 4.9161290666047904e-05, + "loss": 2.647, + "mean_token_accuracy": 0.4275861918926239, + "step": 131875 + }, + { + "epoch": 0.13283093667566107, + "grad_norm": 10.543575917388496, + "learning_rate": 4.916118930840693e-05, + "loss": 2.9202, + "mean_token_accuracy": 0.35862069129943847, + "step": 131880 + }, + { + "epoch": 0.13283597272876524, + "grad_norm": 8.667227564300891, + "learning_rate": 4.916108794475815e-05, + "loss": 1.9443, + "mean_token_accuracy": 0.4984271049499512, + "step": 131885 + }, + { + "epoch": 0.1328410087818694, + "grad_norm": 8.661460774879252, + "learning_rate": 4.916098657510157e-05, + "loss": 2.2425, + "mean_token_accuracy": 0.4896551728248596, + "step": 131890 + }, + { + "epoch": 0.1328460448349736, + "grad_norm": 12.274384618143488, + "learning_rate": 4.9160885199437246e-05, + "loss": 2.6273, + "mean_token_accuracy": 0.4137930989265442, + "step": 131895 + }, + { + "epoch": 0.13285108088807776, + "grad_norm": 11.894362730075114, + "learning_rate": 4.91607838177652e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.4620689570903778, + "step": 131900 + }, + { + "epoch": 0.13285611694118193, + "grad_norm": 13.058002187582895, + "learning_rate": 4.916068243008544e-05, + "loss": 2.5928, + "mean_token_accuracy": 0.4413793206214905, + "step": 131905 + }, + { + "epoch": 0.13286115299428608, + "grad_norm": 9.359054182187394, + "learning_rate": 4.9160581036398015e-05, + "loss": 2.0785, + "mean_token_accuracy": 0.46551724076271056, + "step": 131910 + }, + { + "epoch": 0.13286618904739025, + "grad_norm": 9.193580598433886, + "learning_rate": 4.916047963670295e-05, + "loss": 2.5178, + "mean_token_accuracy": 0.44361767172813416, + "step": 131915 + }, + { + "epoch": 0.13287122510049443, + "grad_norm": 9.438929849363703, + "learning_rate": 4.9160378231000275e-05, + "loss": 2.6763, + "mean_token_accuracy": 0.3793103456497192, + "step": 131920 + }, + { + "epoch": 0.1328762611535986, + "grad_norm": 10.195353714241149, + "learning_rate": 4.916027681929001e-05, + "loss": 2.5948, + "mean_token_accuracy": 0.4390199601650238, + "step": 131925 + }, + { + "epoch": 0.13288129720670278, + "grad_norm": 11.975394004266729, + "learning_rate": 4.916017540157218e-05, + "loss": 2.5113, + "mean_token_accuracy": 0.4034482777118683, + "step": 131930 + }, + { + "epoch": 0.13288633325980695, + "grad_norm": 10.119138938993203, + "learning_rate": 4.916007397784682e-05, + "loss": 2.6704, + "mean_token_accuracy": 0.4, + "step": 131935 + }, + { + "epoch": 0.13289136931291112, + "grad_norm": 11.243474446267884, + "learning_rate": 4.915997254811396e-05, + "loss": 2.1211, + "mean_token_accuracy": 0.4780399203300476, + "step": 131940 + }, + { + "epoch": 0.1328964053660153, + "grad_norm": 9.085515380993764, + "learning_rate": 4.915987111237363e-05, + "loss": 1.8056, + "mean_token_accuracy": 0.5103448331356049, + "step": 131945 + }, + { + "epoch": 0.13290144141911947, + "grad_norm": 9.815287074696679, + "learning_rate": 4.915976967062585e-05, + "loss": 2.2832, + "mean_token_accuracy": 0.4534180223941803, + "step": 131950 + }, + { + "epoch": 0.13290647747222364, + "grad_norm": 10.185282238549654, + "learning_rate": 4.9159668222870654e-05, + "loss": 2.7654, + "mean_token_accuracy": 0.39655172526836396, + "step": 131955 + }, + { + "epoch": 0.13291151352532782, + "grad_norm": 10.863188943194942, + "learning_rate": 4.915956676910806e-05, + "loss": 2.4454, + "mean_token_accuracy": 0.4034482777118683, + "step": 131960 + }, + { + "epoch": 0.132916549578432, + "grad_norm": 10.67823818133062, + "learning_rate": 4.915946530933812e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.4310344815254211, + "step": 131965 + }, + { + "epoch": 0.13292158563153617, + "grad_norm": 15.1265342494139, + "learning_rate": 4.915936384356084e-05, + "loss": 2.3431, + "mean_token_accuracy": 0.4103448212146759, + "step": 131970 + }, + { + "epoch": 0.13292662168464034, + "grad_norm": 22.052109568977116, + "learning_rate": 4.9159262371776243e-05, + "loss": 2.46, + "mean_token_accuracy": 0.441379314661026, + "step": 131975 + }, + { + "epoch": 0.1329316577377445, + "grad_norm": 10.76489942420142, + "learning_rate": 4.915916089398438e-05, + "loss": 1.9348, + "mean_token_accuracy": 0.5085299372673034, + "step": 131980 + }, + { + "epoch": 0.1329366937908487, + "grad_norm": 8.798894477178559, + "learning_rate": 4.915905941018527e-05, + "loss": 2.3728, + "mean_token_accuracy": 0.441379314661026, + "step": 131985 + }, + { + "epoch": 0.13294172984395286, + "grad_norm": 12.647294383361563, + "learning_rate": 4.915895792037893e-05, + "loss": 2.7153, + "mean_token_accuracy": 0.41034482717514037, + "step": 131990 + }, + { + "epoch": 0.13294676589705703, + "grad_norm": 10.796091961172817, + "learning_rate": 4.9158856424565406e-05, + "loss": 2.7037, + "mean_token_accuracy": 0.39310344457626345, + "step": 131995 + }, + { + "epoch": 0.1329518019501612, + "grad_norm": 10.039218075877924, + "learning_rate": 4.915875492274472e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.46376286149024964, + "step": 132000 + }, + { + "epoch": 0.13295683800326538, + "grad_norm": 11.396729238236093, + "learning_rate": 4.9158653414916886e-05, + "loss": 2.1541, + "mean_token_accuracy": 0.482758629322052, + "step": 132005 + }, + { + "epoch": 0.13296187405636956, + "grad_norm": 11.646207327810686, + "learning_rate": 4.915855190108194e-05, + "loss": 2.1585, + "mean_token_accuracy": 0.4777374446392059, + "step": 132010 + }, + { + "epoch": 0.13296691010947373, + "grad_norm": 10.653717123464158, + "learning_rate": 4.915845038123993e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.4275861978530884, + "step": 132015 + }, + { + "epoch": 0.1329719461625779, + "grad_norm": 12.564776851315061, + "learning_rate": 4.915834885539086e-05, + "loss": 2.018, + "mean_token_accuracy": 0.4620689690113068, + "step": 132020 + }, + { + "epoch": 0.13297698221568208, + "grad_norm": 10.207974851854551, + "learning_rate": 4.9158247323534766e-05, + "loss": 2.4124, + "mean_token_accuracy": 0.44827587008476255, + "step": 132025 + }, + { + "epoch": 0.13298201826878625, + "grad_norm": 12.089901391488828, + "learning_rate": 4.915814578567168e-05, + "loss": 2.7537, + "mean_token_accuracy": 0.44482758045196535, + "step": 132030 + }, + { + "epoch": 0.13298705432189042, + "grad_norm": 8.938536101347493, + "learning_rate": 4.915804424180163e-05, + "loss": 2.4471, + "mean_token_accuracy": 0.42068966031074523, + "step": 132035 + }, + { + "epoch": 0.1329920903749946, + "grad_norm": 12.077924313112794, + "learning_rate": 4.915794269192463e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.4206896543502808, + "step": 132040 + }, + { + "epoch": 0.13299712642809877, + "grad_norm": 11.852353616291612, + "learning_rate": 4.915784113604072e-05, + "loss": 2.2313, + "mean_token_accuracy": 0.4655172348022461, + "step": 132045 + }, + { + "epoch": 0.13300216248120292, + "grad_norm": 13.399834003605365, + "learning_rate": 4.915773957414993e-05, + "loss": 2.3031, + "mean_token_accuracy": 0.4448275864124298, + "step": 132050 + }, + { + "epoch": 0.1330071985343071, + "grad_norm": 11.966392271391191, + "learning_rate": 4.9157638006252285e-05, + "loss": 2.5341, + "mean_token_accuracy": 0.38620689511299133, + "step": 132055 + }, + { + "epoch": 0.13301223458741127, + "grad_norm": 9.349772230467618, + "learning_rate": 4.915753643234782e-05, + "loss": 2.7031, + "mean_token_accuracy": 0.3965517282485962, + "step": 132060 + }, + { + "epoch": 0.13301727064051544, + "grad_norm": 11.353400416253692, + "learning_rate": 4.915743485243655e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.40459770560264585, + "step": 132065 + }, + { + "epoch": 0.1330223066936196, + "grad_norm": 9.116405140053319, + "learning_rate": 4.9157333266518505e-05, + "loss": 2.3034, + "mean_token_accuracy": 0.4620689630508423, + "step": 132070 + }, + { + "epoch": 0.1330273427467238, + "grad_norm": 11.168078417540226, + "learning_rate": 4.9157231674593725e-05, + "loss": 2.0119, + "mean_token_accuracy": 0.5103448092937469, + "step": 132075 + }, + { + "epoch": 0.13303237879982796, + "grad_norm": 10.151547722963047, + "learning_rate": 4.915713007666223e-05, + "loss": 2.2252, + "mean_token_accuracy": 0.43793103098869324, + "step": 132080 + }, + { + "epoch": 0.13303741485293213, + "grad_norm": 11.479746197267716, + "learning_rate": 4.915702847272404e-05, + "loss": 2.5537, + "mean_token_accuracy": 0.4103448182344437, + "step": 132085 + }, + { + "epoch": 0.1330424509060363, + "grad_norm": 7.962346591684328, + "learning_rate": 4.915692686277921e-05, + "loss": 2.6326, + "mean_token_accuracy": 0.44827585220336913, + "step": 132090 + }, + { + "epoch": 0.13304748695914048, + "grad_norm": 8.297299082817306, + "learning_rate": 4.915682524682773e-05, + "loss": 2.5865, + "mean_token_accuracy": 0.4344827651977539, + "step": 132095 + }, + { + "epoch": 0.13305252301224466, + "grad_norm": 11.443677323384353, + "learning_rate": 4.915672362486967e-05, + "loss": 2.9198, + "mean_token_accuracy": 0.37931033968925476, + "step": 132100 + }, + { + "epoch": 0.13305755906534883, + "grad_norm": 10.802605073644573, + "learning_rate": 4.915662199690502e-05, + "loss": 2.1064, + "mean_token_accuracy": 0.46551724076271056, + "step": 132105 + }, + { + "epoch": 0.133062595118453, + "grad_norm": 9.45052455665339, + "learning_rate": 4.9156520362933835e-05, + "loss": 2.7062, + "mean_token_accuracy": 0.35172412991523744, + "step": 132110 + }, + { + "epoch": 0.13306763117155718, + "grad_norm": 9.23547982077421, + "learning_rate": 4.9156418722956124e-05, + "loss": 2.1828, + "mean_token_accuracy": 0.4586206912994385, + "step": 132115 + }, + { + "epoch": 0.13307266722466135, + "grad_norm": 9.673633147922375, + "learning_rate": 4.915631707697193e-05, + "loss": 2.3341, + "mean_token_accuracy": 0.4206896543502808, + "step": 132120 + }, + { + "epoch": 0.13307770327776552, + "grad_norm": 13.885664281892273, + "learning_rate": 4.9156215424981275e-05, + "loss": 2.1458, + "mean_token_accuracy": 0.47931033968925474, + "step": 132125 + }, + { + "epoch": 0.1330827393308697, + "grad_norm": 11.176150266962713, + "learning_rate": 4.915611376698419e-05, + "loss": 2.6637, + "mean_token_accuracy": 0.4034482777118683, + "step": 132130 + }, + { + "epoch": 0.13308777538397387, + "grad_norm": 10.051840013391441, + "learning_rate": 4.91560121029807e-05, + "loss": 2.4634, + "mean_token_accuracy": 0.38620689511299133, + "step": 132135 + }, + { + "epoch": 0.13309281143707805, + "grad_norm": 11.134289399222892, + "learning_rate": 4.9155910432970834e-05, + "loss": 2.216, + "mean_token_accuracy": 0.4310344815254211, + "step": 132140 + }, + { + "epoch": 0.13309784749018222, + "grad_norm": 12.018013637770206, + "learning_rate": 4.915580875695462e-05, + "loss": 2.6949, + "mean_token_accuracy": 0.42413793206214906, + "step": 132145 + }, + { + "epoch": 0.1331028835432864, + "grad_norm": 9.42623565566413, + "learning_rate": 4.915570707493209e-05, + "loss": 2.1757, + "mean_token_accuracy": 0.43103448748588563, + "step": 132150 + }, + { + "epoch": 0.13310791959639057, + "grad_norm": 11.750737887213594, + "learning_rate": 4.9155605386903266e-05, + "loss": 3.0971, + "mean_token_accuracy": 0.36551724672317504, + "step": 132155 + }, + { + "epoch": 0.13311295564949474, + "grad_norm": 12.535646438558016, + "learning_rate": 4.9155503692868176e-05, + "loss": 2.2651, + "mean_token_accuracy": 0.4034482777118683, + "step": 132160 + }, + { + "epoch": 0.13311799170259891, + "grad_norm": 11.135125656488361, + "learning_rate": 4.915540199282686e-05, + "loss": 2.5473, + "mean_token_accuracy": 0.4068965494632721, + "step": 132165 + }, + { + "epoch": 0.1331230277557031, + "grad_norm": 10.803154192200502, + "learning_rate": 4.915530028677933e-05, + "loss": 2.3341, + "mean_token_accuracy": 0.42758620977401735, + "step": 132170 + }, + { + "epoch": 0.13312806380880726, + "grad_norm": 10.538160831708433, + "learning_rate": 4.9155198574725625e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.3896551728248596, + "step": 132175 + }, + { + "epoch": 0.13313309986191144, + "grad_norm": 11.32589980018265, + "learning_rate": 4.915509685666577e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.43793103098869324, + "step": 132180 + }, + { + "epoch": 0.1331381359150156, + "grad_norm": 11.263135811665544, + "learning_rate": 4.915499513259979e-05, + "loss": 2.3971, + "mean_token_accuracy": 0.44379915595054625, + "step": 132185 + }, + { + "epoch": 0.13314317196811976, + "grad_norm": 9.510455347950607, + "learning_rate": 4.9154893402527716e-05, + "loss": 2.3048, + "mean_token_accuracy": 0.42413793206214906, + "step": 132190 + }, + { + "epoch": 0.13314820802122393, + "grad_norm": 10.397219407345082, + "learning_rate": 4.9154791666449576e-05, + "loss": 2.8361, + "mean_token_accuracy": 0.3482758581638336, + "step": 132195 + }, + { + "epoch": 0.1331532440743281, + "grad_norm": 9.519707912151798, + "learning_rate": 4.9154689924365404e-05, + "loss": 2.1436, + "mean_token_accuracy": 0.44482759237289426, + "step": 132200 + }, + { + "epoch": 0.13315828012743228, + "grad_norm": 10.703701834386429, + "learning_rate": 4.915458817627522e-05, + "loss": 2.211, + "mean_token_accuracy": 0.4620689630508423, + "step": 132205 + }, + { + "epoch": 0.13316331618053645, + "grad_norm": 10.87082224789843, + "learning_rate": 4.915448642217905e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.4068965494632721, + "step": 132210 + }, + { + "epoch": 0.13316835223364062, + "grad_norm": 10.222340345317583, + "learning_rate": 4.9154384662076935e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.4481548726558685, + "step": 132215 + }, + { + "epoch": 0.1331733882867448, + "grad_norm": 9.966450625379919, + "learning_rate": 4.9154282895968896e-05, + "loss": 2.2086, + "mean_token_accuracy": 0.4620689690113068, + "step": 132220 + }, + { + "epoch": 0.13317842433984897, + "grad_norm": 9.901856998448434, + "learning_rate": 4.915418112385495e-05, + "loss": 2.1241, + "mean_token_accuracy": 0.42068964838981626, + "step": 132225 + }, + { + "epoch": 0.13318346039295315, + "grad_norm": 10.362412351414692, + "learning_rate": 4.9154079345735146e-05, + "loss": 2.1566, + "mean_token_accuracy": 0.4570477962493896, + "step": 132230 + }, + { + "epoch": 0.13318849644605732, + "grad_norm": 11.946812552390723, + "learning_rate": 4.91539775616095e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.4537205040454865, + "step": 132235 + }, + { + "epoch": 0.1331935324991615, + "grad_norm": 8.963898442849015, + "learning_rate": 4.9153875771478044e-05, + "loss": 2.1656, + "mean_token_accuracy": 0.4704779267311096, + "step": 132240 + }, + { + "epoch": 0.13319856855226567, + "grad_norm": 11.230198414813245, + "learning_rate": 4.91537739753408e-05, + "loss": 2.992, + "mean_token_accuracy": 0.334482753276825, + "step": 132245 + }, + { + "epoch": 0.13320360460536984, + "grad_norm": 11.137909716423819, + "learning_rate": 4.9153672173197814e-05, + "loss": 2.6267, + "mean_token_accuracy": 0.4034482777118683, + "step": 132250 + }, + { + "epoch": 0.13320864065847401, + "grad_norm": 9.633426638705155, + "learning_rate": 4.915357036504909e-05, + "loss": 2.2067, + "mean_token_accuracy": 0.45862069725990295, + "step": 132255 + }, + { + "epoch": 0.1332136767115782, + "grad_norm": 10.24122116036434, + "learning_rate": 4.915346855089467e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.4774349570274353, + "step": 132260 + }, + { + "epoch": 0.13321871276468236, + "grad_norm": 9.826265850620526, + "learning_rate": 4.915336673073458e-05, + "loss": 2.4983, + "mean_token_accuracy": 0.4137930989265442, + "step": 132265 + }, + { + "epoch": 0.13322374881778654, + "grad_norm": 9.7760541309109, + "learning_rate": 4.915326490456884e-05, + "loss": 2.2561, + "mean_token_accuracy": 0.4620689570903778, + "step": 132270 + }, + { + "epoch": 0.1332287848708907, + "grad_norm": 16.81265450181346, + "learning_rate": 4.91531630723975e-05, + "loss": 2.3172, + "mean_token_accuracy": 0.5121921122074127, + "step": 132275 + }, + { + "epoch": 0.13323382092399488, + "grad_norm": 10.115995813429034, + "learning_rate": 4.915306123422056e-05, + "loss": 2.4699, + "mean_token_accuracy": 0.4344827592372894, + "step": 132280 + }, + { + "epoch": 0.13323885697709906, + "grad_norm": 10.663772783600143, + "learning_rate": 4.9152959390038075e-05, + "loss": 2.4444, + "mean_token_accuracy": 0.39310344457626345, + "step": 132285 + }, + { + "epoch": 0.13324389303020323, + "grad_norm": 10.845725373986284, + "learning_rate": 4.915285753985006e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.4724137902259827, + "step": 132290 + }, + { + "epoch": 0.1332489290833074, + "grad_norm": 10.506011958517517, + "learning_rate": 4.915275568365654e-05, + "loss": 2.608, + "mean_token_accuracy": 0.4034482777118683, + "step": 132295 + }, + { + "epoch": 0.13325396513641158, + "grad_norm": 8.927997535448005, + "learning_rate": 4.9152653821457546e-05, + "loss": 2.2032, + "mean_token_accuracy": 0.47586206793785096, + "step": 132300 + }, + { + "epoch": 0.13325900118951575, + "grad_norm": 9.539594807060489, + "learning_rate": 4.9152551953253115e-05, + "loss": 2.4096, + "mean_token_accuracy": 0.4206896543502808, + "step": 132305 + }, + { + "epoch": 0.13326403724261993, + "grad_norm": 10.19393566965759, + "learning_rate": 4.9152450079043255e-05, + "loss": 2.2502, + "mean_token_accuracy": 0.4, + "step": 132310 + }, + { + "epoch": 0.1332690732957241, + "grad_norm": 10.76437977440662, + "learning_rate": 4.915234819882802e-05, + "loss": 2.0744, + "mean_token_accuracy": 0.4344827651977539, + "step": 132315 + }, + { + "epoch": 0.13327410934882827, + "grad_norm": 12.426593182115818, + "learning_rate": 4.915224631260742e-05, + "loss": 2.3423, + "mean_token_accuracy": 0.4482758641242981, + "step": 132320 + }, + { + "epoch": 0.13327914540193245, + "grad_norm": 10.371151253711096, + "learning_rate": 4.915214442038148e-05, + "loss": 2.5384, + "mean_token_accuracy": 0.4103448212146759, + "step": 132325 + }, + { + "epoch": 0.1332841814550366, + "grad_norm": 12.081050002121174, + "learning_rate": 4.9152042522150256e-05, + "loss": 2.625, + "mean_token_accuracy": 0.35517241060733795, + "step": 132330 + }, + { + "epoch": 0.13328921750814077, + "grad_norm": 11.575247314112485, + "learning_rate": 4.915194061791374e-05, + "loss": 2.4546, + "mean_token_accuracy": 0.4192377507686615, + "step": 132335 + }, + { + "epoch": 0.13329425356124494, + "grad_norm": 12.513990260424526, + "learning_rate": 4.915183870767199e-05, + "loss": 2.6065, + "mean_token_accuracy": 0.38620689511299133, + "step": 132340 + }, + { + "epoch": 0.13329928961434911, + "grad_norm": 12.074422198381475, + "learning_rate": 4.9151736791425015e-05, + "loss": 2.7445, + "mean_token_accuracy": 0.38275861740112305, + "step": 132345 + }, + { + "epoch": 0.1333043256674533, + "grad_norm": 11.81352053095203, + "learning_rate": 4.9151634869172856e-05, + "loss": 2.2128, + "mean_token_accuracy": 0.4413793087005615, + "step": 132350 + }, + { + "epoch": 0.13330936172055746, + "grad_norm": 10.157109635385419, + "learning_rate": 4.915153294091553e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.46551724076271056, + "step": 132355 + }, + { + "epoch": 0.13331439777366164, + "grad_norm": 11.08906866905451, + "learning_rate": 4.9151431006653074e-05, + "loss": 2.1999, + "mean_token_accuracy": 0.4517241358757019, + "step": 132360 + }, + { + "epoch": 0.1333194338267658, + "grad_norm": 9.874767953637017, + "learning_rate": 4.915132906638551e-05, + "loss": 2.1974, + "mean_token_accuracy": 0.4551724135875702, + "step": 132365 + }, + { + "epoch": 0.13332446987986998, + "grad_norm": 8.889791975857879, + "learning_rate": 4.915122712011286e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.4607380509376526, + "step": 132370 + }, + { + "epoch": 0.13332950593297416, + "grad_norm": 9.579326306040423, + "learning_rate": 4.915112516783518e-05, + "loss": 2.2347, + "mean_token_accuracy": 0.48620688915252686, + "step": 132375 + }, + { + "epoch": 0.13333454198607833, + "grad_norm": 7.44213485909985, + "learning_rate": 4.915102320955247e-05, + "loss": 2.4027, + "mean_token_accuracy": 0.47477314472198484, + "step": 132380 + }, + { + "epoch": 0.1333395780391825, + "grad_norm": 11.016213690426166, + "learning_rate": 4.915092124526476e-05, + "loss": 2.2751, + "mean_token_accuracy": 0.4413793206214905, + "step": 132385 + }, + { + "epoch": 0.13334461409228668, + "grad_norm": 9.429713145680276, + "learning_rate": 4.9150819274972105e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.41034482717514037, + "step": 132390 + }, + { + "epoch": 0.13334965014539085, + "grad_norm": 10.412893022096712, + "learning_rate": 4.91507172986745e-05, + "loss": 2.3192, + "mean_token_accuracy": 0.4241379380226135, + "step": 132395 + }, + { + "epoch": 0.13335468619849503, + "grad_norm": 12.683025104184283, + "learning_rate": 4.9150615316371994e-05, + "loss": 2.3743, + "mean_token_accuracy": 0.4517241299152374, + "step": 132400 + }, + { + "epoch": 0.1333597222515992, + "grad_norm": 10.08946744621511, + "learning_rate": 4.915051332806461e-05, + "loss": 2.3287, + "mean_token_accuracy": 0.4379310369491577, + "step": 132405 + }, + { + "epoch": 0.13336475830470337, + "grad_norm": 12.311138825663695, + "learning_rate": 4.915041133375237e-05, + "loss": 2.4812, + "mean_token_accuracy": 0.47931033968925474, + "step": 132410 + }, + { + "epoch": 0.13336979435780755, + "grad_norm": 9.841007098414352, + "learning_rate": 4.9150309333435306e-05, + "loss": 1.9557, + "mean_token_accuracy": 0.5206896603107453, + "step": 132415 + }, + { + "epoch": 0.13337483041091172, + "grad_norm": 10.44645064500164, + "learning_rate": 4.915020732711345e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.44313369393348695, + "step": 132420 + }, + { + "epoch": 0.1333798664640159, + "grad_norm": 12.76287741334063, + "learning_rate": 4.9150105314786834e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.4379310369491577, + "step": 132425 + }, + { + "epoch": 0.13338490251712007, + "grad_norm": 9.703420109395488, + "learning_rate": 4.915000329645548e-05, + "loss": 2.6288, + "mean_token_accuracy": 0.4206896543502808, + "step": 132430 + }, + { + "epoch": 0.13338993857022424, + "grad_norm": 11.359975340461137, + "learning_rate": 4.914990127211941e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.4586206912994385, + "step": 132435 + }, + { + "epoch": 0.13339497462332842, + "grad_norm": 9.972858105994659, + "learning_rate": 4.914979924177866e-05, + "loss": 2.5992, + "mean_token_accuracy": 0.3896551728248596, + "step": 132440 + }, + { + "epoch": 0.1334000106764326, + "grad_norm": 10.273852217690463, + "learning_rate": 4.914969720543326e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.41724138259887694, + "step": 132445 + }, + { + "epoch": 0.13340504672953676, + "grad_norm": 11.041777510228668, + "learning_rate": 4.914959516308324e-05, + "loss": 2.1707, + "mean_token_accuracy": 0.4793103516101837, + "step": 132450 + }, + { + "epoch": 0.13341008278264094, + "grad_norm": 10.355198465854153, + "learning_rate": 4.9149493114728616e-05, + "loss": 2.3933, + "mean_token_accuracy": 0.46896552443504336, + "step": 132455 + }, + { + "epoch": 0.1334151188357451, + "grad_norm": 9.969261327728704, + "learning_rate": 4.914939106036943e-05, + "loss": 2.2307, + "mean_token_accuracy": 0.4902709424495697, + "step": 132460 + }, + { + "epoch": 0.13342015488884928, + "grad_norm": 10.715074201412383, + "learning_rate": 4.91492890000057e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.3862069010734558, + "step": 132465 + }, + { + "epoch": 0.13342519094195343, + "grad_norm": 8.290038760557643, + "learning_rate": 4.914918693363747e-05, + "loss": 1.934, + "mean_token_accuracy": 0.48759830594062803, + "step": 132470 + }, + { + "epoch": 0.1334302269950576, + "grad_norm": 9.115469342357398, + "learning_rate": 4.9149084861264744e-05, + "loss": 2.0923, + "mean_token_accuracy": 0.47931033968925474, + "step": 132475 + }, + { + "epoch": 0.13343526304816178, + "grad_norm": 9.930231112784956, + "learning_rate": 4.9148982782887566e-05, + "loss": 2.5001, + "mean_token_accuracy": 0.41379311084747317, + "step": 132480 + }, + { + "epoch": 0.13344029910126595, + "grad_norm": 12.289848910541782, + "learning_rate": 4.914888069850596e-05, + "loss": 2.6816, + "mean_token_accuracy": 0.43448275327682495, + "step": 132485 + }, + { + "epoch": 0.13344533515437013, + "grad_norm": 10.923444466219964, + "learning_rate": 4.9148778608119965e-05, + "loss": 2.1846, + "mean_token_accuracy": 0.4137930989265442, + "step": 132490 + }, + { + "epoch": 0.1334503712074743, + "grad_norm": 9.219428885511377, + "learning_rate": 4.9148676511729595e-05, + "loss": 2.6184, + "mean_token_accuracy": 0.40344826579093934, + "step": 132495 + }, + { + "epoch": 0.13345540726057847, + "grad_norm": 12.167766426389326, + "learning_rate": 4.914857440933489e-05, + "loss": 2.0861, + "mean_token_accuracy": 0.5159709632396698, + "step": 132500 + }, + { + "epoch": 0.13346044331368265, + "grad_norm": 8.495593411891136, + "learning_rate": 4.914847230093586e-05, + "loss": 2.3632, + "mean_token_accuracy": 0.4310344815254211, + "step": 132505 + }, + { + "epoch": 0.13346547936678682, + "grad_norm": 9.673650213929653, + "learning_rate": 4.9148370186532555e-05, + "loss": 2.2874, + "mean_token_accuracy": 0.4172413766384125, + "step": 132510 + }, + { + "epoch": 0.133470515419891, + "grad_norm": 9.903761315307902, + "learning_rate": 4.914826806612499e-05, + "loss": 2.5553, + "mean_token_accuracy": 0.3896551787853241, + "step": 132515 + }, + { + "epoch": 0.13347555147299517, + "grad_norm": 11.253200742936842, + "learning_rate": 4.91481659397132e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.4517241358757019, + "step": 132520 + }, + { + "epoch": 0.13348058752609934, + "grad_norm": 9.606242510285352, + "learning_rate": 4.9148063807297205e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.4068965494632721, + "step": 132525 + }, + { + "epoch": 0.13348562357920352, + "grad_norm": 11.074393237953196, + "learning_rate": 4.914796166887705e-05, + "loss": 2.5716, + "mean_token_accuracy": 0.39310344457626345, + "step": 132530 + }, + { + "epoch": 0.1334906596323077, + "grad_norm": 14.032823941949458, + "learning_rate": 4.914785952445275e-05, + "loss": 2.6413, + "mean_token_accuracy": 0.37241379618644715, + "step": 132535 + }, + { + "epoch": 0.13349569568541186, + "grad_norm": 10.692836874100095, + "learning_rate": 4.9147757374024325e-05, + "loss": 2.5335, + "mean_token_accuracy": 0.4517241358757019, + "step": 132540 + }, + { + "epoch": 0.13350073173851604, + "grad_norm": 11.480399136903326, + "learning_rate": 4.9147655217591814e-05, + "loss": 2.2775, + "mean_token_accuracy": 0.4206896543502808, + "step": 132545 + }, + { + "epoch": 0.1335057677916202, + "grad_norm": 9.498456804950347, + "learning_rate": 4.914755305515526e-05, + "loss": 2.791, + "mean_token_accuracy": 0.36551723778247835, + "step": 132550 + }, + { + "epoch": 0.13351080384472439, + "grad_norm": 17.17535357230016, + "learning_rate": 4.914745088671465e-05, + "loss": 2.6421, + "mean_token_accuracy": 0.4206896543502808, + "step": 132555 + }, + { + "epoch": 0.13351583989782856, + "grad_norm": 10.396980808087871, + "learning_rate": 4.914734871227006e-05, + "loss": 2.1902, + "mean_token_accuracy": 0.44482757449150084, + "step": 132560 + }, + { + "epoch": 0.13352087595093273, + "grad_norm": 10.728842336884252, + "learning_rate": 4.914724653182149e-05, + "loss": 2.1188, + "mean_token_accuracy": 0.46896551847457885, + "step": 132565 + }, + { + "epoch": 0.1335259120040369, + "grad_norm": 12.136619877507176, + "learning_rate": 4.914714434536898e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.4777374565601349, + "step": 132570 + }, + { + "epoch": 0.13353094805714108, + "grad_norm": 11.312670255868811, + "learning_rate": 4.914704215291255e-05, + "loss": 2.6016, + "mean_token_accuracy": 0.38620689511299133, + "step": 132575 + }, + { + "epoch": 0.13353598411024525, + "grad_norm": 10.134706433020293, + "learning_rate": 4.914693995445224e-05, + "loss": 2.3442, + "mean_token_accuracy": 0.4379310250282288, + "step": 132580 + }, + { + "epoch": 0.13354102016334943, + "grad_norm": 9.175375443598146, + "learning_rate": 4.9146837749988054e-05, + "loss": 1.9364, + "mean_token_accuracy": 0.43448275327682495, + "step": 132585 + }, + { + "epoch": 0.1335460562164536, + "grad_norm": 10.585431626375978, + "learning_rate": 4.914673553952005e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.3965517282485962, + "step": 132590 + }, + { + "epoch": 0.13355109226955778, + "grad_norm": 8.579939257555077, + "learning_rate": 4.9146633323048234e-05, + "loss": 2.265, + "mean_token_accuracy": 0.44827585816383364, + "step": 132595 + }, + { + "epoch": 0.13355612832266195, + "grad_norm": 10.832529687475509, + "learning_rate": 4.914653110057265e-05, + "loss": 2.3328, + "mean_token_accuracy": 0.41034482717514037, + "step": 132600 + }, + { + "epoch": 0.13356116437576612, + "grad_norm": 10.723135066863131, + "learning_rate": 4.914642887209332e-05, + "loss": 2.5505, + "mean_token_accuracy": 0.41034482717514037, + "step": 132605 + }, + { + "epoch": 0.13356620042887027, + "grad_norm": 11.354202567108759, + "learning_rate": 4.914632663761028e-05, + "loss": 2.184, + "mean_token_accuracy": 0.4698275804519653, + "step": 132610 + }, + { + "epoch": 0.13357123648197444, + "grad_norm": 13.082194537811986, + "learning_rate": 4.9146224397123536e-05, + "loss": 2.6288, + "mean_token_accuracy": 0.3793103516101837, + "step": 132615 + }, + { + "epoch": 0.13357627253507862, + "grad_norm": 10.610177877711255, + "learning_rate": 4.9146122150633135e-05, + "loss": 2.301, + "mean_token_accuracy": 0.45172414779663084, + "step": 132620 + }, + { + "epoch": 0.1335813085881828, + "grad_norm": 11.398095866296075, + "learning_rate": 4.9146019898139106e-05, + "loss": 2.278, + "mean_token_accuracy": 0.4034482717514038, + "step": 132625 + }, + { + "epoch": 0.13358634464128696, + "grad_norm": 9.617024180196179, + "learning_rate": 4.914591763964147e-05, + "loss": 2.4793, + "mean_token_accuracy": 0.4517241358757019, + "step": 132630 + }, + { + "epoch": 0.13359138069439114, + "grad_norm": 12.898190848755679, + "learning_rate": 4.9145815375140254e-05, + "loss": 2.7774, + "mean_token_accuracy": 0.35862069129943847, + "step": 132635 + }, + { + "epoch": 0.1335964167474953, + "grad_norm": 10.424689137327166, + "learning_rate": 4.91457131046355e-05, + "loss": 2.3163, + "mean_token_accuracy": 0.4275861978530884, + "step": 132640 + }, + { + "epoch": 0.13360145280059949, + "grad_norm": 10.616311205440159, + "learning_rate": 4.914561082812721e-05, + "loss": 2.6066, + "mean_token_accuracy": 0.38965516686439516, + "step": 132645 + }, + { + "epoch": 0.13360648885370366, + "grad_norm": 8.816876301880235, + "learning_rate": 4.914550854561544e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.4793103575706482, + "step": 132650 + }, + { + "epoch": 0.13361152490680783, + "grad_norm": 10.065134558069312, + "learning_rate": 4.914540625710021e-05, + "loss": 2.4567, + "mean_token_accuracy": 0.42758620977401735, + "step": 132655 + }, + { + "epoch": 0.133616560959912, + "grad_norm": 17.385560365900922, + "learning_rate": 4.914530396258154e-05, + "loss": 2.6183, + "mean_token_accuracy": 0.39655172526836396, + "step": 132660 + }, + { + "epoch": 0.13362159701301618, + "grad_norm": 12.352481916083004, + "learning_rate": 4.914520166205947e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.4206896543502808, + "step": 132665 + }, + { + "epoch": 0.13362663306612035, + "grad_norm": 12.818845359529771, + "learning_rate": 4.914509935553402e-05, + "loss": 2.496, + "mean_token_accuracy": 0.43103448748588563, + "step": 132670 + }, + { + "epoch": 0.13363166911922453, + "grad_norm": 11.51858791853334, + "learning_rate": 4.914499704300522e-05, + "loss": 2.1524, + "mean_token_accuracy": 0.48275861144065857, + "step": 132675 + }, + { + "epoch": 0.1336367051723287, + "grad_norm": 22.79409224571709, + "learning_rate": 4.91448947244731e-05, + "loss": 2.8767, + "mean_token_accuracy": 0.42413793206214906, + "step": 132680 + }, + { + "epoch": 0.13364174122543288, + "grad_norm": 10.669982224306079, + "learning_rate": 4.914479239993769e-05, + "loss": 2.4179, + "mean_token_accuracy": 0.41379311084747317, + "step": 132685 + }, + { + "epoch": 0.13364677727853705, + "grad_norm": 10.827332537692554, + "learning_rate": 4.914469006939901e-05, + "loss": 2.5588, + "mean_token_accuracy": 0.43103448748588563, + "step": 132690 + }, + { + "epoch": 0.13365181333164122, + "grad_norm": 12.321758019302035, + "learning_rate": 4.91445877328571e-05, + "loss": 2.7483, + "mean_token_accuracy": 0.37586206793785093, + "step": 132695 + }, + { + "epoch": 0.1336568493847454, + "grad_norm": 9.985538986564807, + "learning_rate": 4.9144485390311985e-05, + "loss": 2.5958, + "mean_token_accuracy": 0.4, + "step": 132700 + }, + { + "epoch": 0.13366188543784957, + "grad_norm": 12.781752086945525, + "learning_rate": 4.914438304176369e-05, + "loss": 2.8808, + "mean_token_accuracy": 0.3551724016666412, + "step": 132705 + }, + { + "epoch": 0.13366692149095374, + "grad_norm": 10.376259500940364, + "learning_rate": 4.914428068721224e-05, + "loss": 2.5545, + "mean_token_accuracy": 0.4000000059604645, + "step": 132710 + }, + { + "epoch": 0.13367195754405792, + "grad_norm": 8.191527774801951, + "learning_rate": 4.914417832665767e-05, + "loss": 2.1893, + "mean_token_accuracy": 0.43866994976997375, + "step": 132715 + }, + { + "epoch": 0.1336769935971621, + "grad_norm": 14.34974612829401, + "learning_rate": 4.91440759601e-05, + "loss": 2.6576, + "mean_token_accuracy": 0.37586206793785093, + "step": 132720 + }, + { + "epoch": 0.13368202965026627, + "grad_norm": 10.427514085603141, + "learning_rate": 4.914397358753927e-05, + "loss": 2.6231, + "mean_token_accuracy": 0.41724138259887694, + "step": 132725 + }, + { + "epoch": 0.13368706570337044, + "grad_norm": 11.5500148420489, + "learning_rate": 4.914387120897551e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.42413793206214906, + "step": 132730 + }, + { + "epoch": 0.1336921017564746, + "grad_norm": 10.878118902282123, + "learning_rate": 4.914376882440874e-05, + "loss": 2.6356, + "mean_token_accuracy": 0.4, + "step": 132735 + }, + { + "epoch": 0.1336971378095788, + "grad_norm": 13.983115964926723, + "learning_rate": 4.914366643383898e-05, + "loss": 2.8121, + "mean_token_accuracy": 0.3620689630508423, + "step": 132740 + }, + { + "epoch": 0.13370217386268296, + "grad_norm": 10.155528925921091, + "learning_rate": 4.9143564037266275e-05, + "loss": 2.7115, + "mean_token_accuracy": 0.41379310488700866, + "step": 132745 + }, + { + "epoch": 0.1337072099157871, + "grad_norm": 11.784421946494257, + "learning_rate": 4.9143461634690645e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.43103448748588563, + "step": 132750 + }, + { + "epoch": 0.13371224596889128, + "grad_norm": 12.584000791093423, + "learning_rate": 4.914335922611212e-05, + "loss": 2.17, + "mean_token_accuracy": 0.458620685338974, + "step": 132755 + }, + { + "epoch": 0.13371728202199545, + "grad_norm": 11.062782696755132, + "learning_rate": 4.9143256811530734e-05, + "loss": 3.0922, + "mean_token_accuracy": 0.3551724076271057, + "step": 132760 + }, + { + "epoch": 0.13372231807509963, + "grad_norm": 9.852571150636424, + "learning_rate": 4.91431543909465e-05, + "loss": 2.2072, + "mean_token_accuracy": 0.4068965494632721, + "step": 132765 + }, + { + "epoch": 0.1337273541282038, + "grad_norm": 9.508538450273738, + "learning_rate": 4.914305196435947e-05, + "loss": 2.3233, + "mean_token_accuracy": 0.42413792610168455, + "step": 132770 + }, + { + "epoch": 0.13373239018130798, + "grad_norm": 12.218598375822111, + "learning_rate": 4.914294953176965e-05, + "loss": 2.262, + "mean_token_accuracy": 0.4551724076271057, + "step": 132775 + }, + { + "epoch": 0.13373742623441215, + "grad_norm": 12.766986084677166, + "learning_rate": 4.9142847093177075e-05, + "loss": 1.9585, + "mean_token_accuracy": 0.49848759174346924, + "step": 132780 + }, + { + "epoch": 0.13374246228751632, + "grad_norm": 8.570092817951632, + "learning_rate": 4.914274464858178e-05, + "loss": 2.173, + "mean_token_accuracy": 0.458620685338974, + "step": 132785 + }, + { + "epoch": 0.1337474983406205, + "grad_norm": 10.904169385376413, + "learning_rate": 4.914264219798379e-05, + "loss": 1.9838, + "mean_token_accuracy": 0.47931034564971925, + "step": 132790 + }, + { + "epoch": 0.13375253439372467, + "grad_norm": 11.971272960783534, + "learning_rate": 4.914253974138312e-05, + "loss": 2.4261, + "mean_token_accuracy": 0.41724138259887694, + "step": 132795 + }, + { + "epoch": 0.13375757044682884, + "grad_norm": 11.458978331242289, + "learning_rate": 4.914243727877983e-05, + "loss": 2.3362, + "mean_token_accuracy": 0.41724138259887694, + "step": 132800 + }, + { + "epoch": 0.13376260649993302, + "grad_norm": 8.854863267870993, + "learning_rate": 4.914233481017392e-05, + "loss": 2.0155, + "mean_token_accuracy": 0.45862067937850953, + "step": 132805 + }, + { + "epoch": 0.1337676425530372, + "grad_norm": 17.31446553650992, + "learning_rate": 4.9142232335565424e-05, + "loss": 2.819, + "mean_token_accuracy": 0.3689655065536499, + "step": 132810 + }, + { + "epoch": 0.13377267860614137, + "grad_norm": 10.870273783181668, + "learning_rate": 4.9142129854954383e-05, + "loss": 2.321, + "mean_token_accuracy": 0.4103448212146759, + "step": 132815 + }, + { + "epoch": 0.13377771465924554, + "grad_norm": 10.124824159533858, + "learning_rate": 4.914202736834081e-05, + "loss": 2.4832, + "mean_token_accuracy": 0.4413793087005615, + "step": 132820 + }, + { + "epoch": 0.1337827507123497, + "grad_norm": 11.811036420889828, + "learning_rate": 4.914192487572474e-05, + "loss": 1.9942, + "mean_token_accuracy": 0.47931033968925474, + "step": 132825 + }, + { + "epoch": 0.1337877867654539, + "grad_norm": 15.224906291430523, + "learning_rate": 4.914182237710621e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.4379310429096222, + "step": 132830 + }, + { + "epoch": 0.13379282281855806, + "grad_norm": 11.504207770620662, + "learning_rate": 4.9141719872485226e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.4379310369491577, + "step": 132835 + }, + { + "epoch": 0.13379785887166223, + "grad_norm": 11.40332274990979, + "learning_rate": 4.914161736186184e-05, + "loss": 2.8495, + "mean_token_accuracy": 0.3896551728248596, + "step": 132840 + }, + { + "epoch": 0.1338028949247664, + "grad_norm": 8.67422066018454, + "learning_rate": 4.914151484523607e-05, + "loss": 2.4108, + "mean_token_accuracy": 0.42413792610168455, + "step": 132845 + }, + { + "epoch": 0.13380793097787058, + "grad_norm": 10.836801792395475, + "learning_rate": 4.914141232260794e-05, + "loss": 2.1388, + "mean_token_accuracy": 0.44827587008476255, + "step": 132850 + }, + { + "epoch": 0.13381296703097476, + "grad_norm": 9.0048891014409, + "learning_rate": 4.914130979397749e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.44827585816383364, + "step": 132855 + }, + { + "epoch": 0.13381800308407893, + "grad_norm": 13.409097072022995, + "learning_rate": 4.914120725934473e-05, + "loss": 2.8783, + "mean_token_accuracy": 0.4, + "step": 132860 + }, + { + "epoch": 0.1338230391371831, + "grad_norm": 11.00959267113664, + "learning_rate": 4.914110471870971e-05, + "loss": 2.471, + "mean_token_accuracy": 0.42068966031074523, + "step": 132865 + }, + { + "epoch": 0.13382807519028728, + "grad_norm": 8.653477752527861, + "learning_rate": 4.9141002172072454e-05, + "loss": 2.0179, + "mean_token_accuracy": 0.47586206197738645, + "step": 132870 + }, + { + "epoch": 0.13383311124339145, + "grad_norm": 16.228999588675798, + "learning_rate": 4.914089961943298e-05, + "loss": 2.8427, + "mean_token_accuracy": 0.38965516686439516, + "step": 132875 + }, + { + "epoch": 0.13383814729649562, + "grad_norm": 8.061341592807446, + "learning_rate": 4.914079706079132e-05, + "loss": 1.9456, + "mean_token_accuracy": 0.482103967666626, + "step": 132880 + }, + { + "epoch": 0.1338431833495998, + "grad_norm": 10.451054794891656, + "learning_rate": 4.91406944961475e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.42068964838981626, + "step": 132885 + }, + { + "epoch": 0.13384821940270394, + "grad_norm": 10.63925203095825, + "learning_rate": 4.9140591925501555e-05, + "loss": 2.558, + "mean_token_accuracy": 0.39655172228813174, + "step": 132890 + }, + { + "epoch": 0.13385325545580812, + "grad_norm": 8.208218346940836, + "learning_rate": 4.9140489348853526e-05, + "loss": 2.1901, + "mean_token_accuracy": 0.420689657330513, + "step": 132895 + }, + { + "epoch": 0.1338582915089123, + "grad_norm": 11.74804805769207, + "learning_rate": 4.914038676620341e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.42758620381355283, + "step": 132900 + }, + { + "epoch": 0.13386332756201647, + "grad_norm": 10.075976236252403, + "learning_rate": 4.9140284177551256e-05, + "loss": 2.256, + "mean_token_accuracy": 0.4758620738983154, + "step": 132905 + }, + { + "epoch": 0.13386836361512064, + "grad_norm": 9.139071813359875, + "learning_rate": 4.914018158289709e-05, + "loss": 1.9682, + "mean_token_accuracy": 0.4620689630508423, + "step": 132910 + }, + { + "epoch": 0.1338733996682248, + "grad_norm": 10.810455610549674, + "learning_rate": 4.9140078982240934e-05, + "loss": 2.0084, + "mean_token_accuracy": 0.47931033968925474, + "step": 132915 + }, + { + "epoch": 0.133878435721329, + "grad_norm": 8.57428280495784, + "learning_rate": 4.9139976375582826e-05, + "loss": 2.1659, + "mean_token_accuracy": 0.458620685338974, + "step": 132920 + }, + { + "epoch": 0.13388347177443316, + "grad_norm": 11.581032813967136, + "learning_rate": 4.913987376292279e-05, + "loss": 2.9114, + "mean_token_accuracy": 0.34137930274009703, + "step": 132925 + }, + { + "epoch": 0.13388850782753733, + "grad_norm": 11.804490014022837, + "learning_rate": 4.913977114426085e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.43448275327682495, + "step": 132930 + }, + { + "epoch": 0.1338935438806415, + "grad_norm": 11.689207118688163, + "learning_rate": 4.9139668519597046e-05, + "loss": 2.7253, + "mean_token_accuracy": 0.4344827473163605, + "step": 132935 + }, + { + "epoch": 0.13389857993374568, + "grad_norm": 8.85195806845111, + "learning_rate": 4.9139565888931395e-05, + "loss": 2.2928, + "mean_token_accuracy": 0.4517241418361664, + "step": 132940 + }, + { + "epoch": 0.13390361598684986, + "grad_norm": 10.658283975668535, + "learning_rate": 4.9139463252263926e-05, + "loss": 2.7722, + "mean_token_accuracy": 0.3827586233615875, + "step": 132945 + }, + { + "epoch": 0.13390865203995403, + "grad_norm": 9.51333075158609, + "learning_rate": 4.9139360609594674e-05, + "loss": 2.4664, + "mean_token_accuracy": 0.41234118938446046, + "step": 132950 + }, + { + "epoch": 0.1339136880930582, + "grad_norm": 9.145077594692667, + "learning_rate": 4.913925796092367e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.45311554670333865, + "step": 132955 + }, + { + "epoch": 0.13391872414616238, + "grad_norm": 10.269115618329831, + "learning_rate": 4.913915530625093e-05, + "loss": 2.4945, + "mean_token_accuracy": 0.4310344815254211, + "step": 132960 + }, + { + "epoch": 0.13392376019926655, + "grad_norm": 10.771952804465455, + "learning_rate": 4.9139052645576495e-05, + "loss": 2.2464, + "mean_token_accuracy": 0.3655172407627106, + "step": 132965 + }, + { + "epoch": 0.13392879625237072, + "grad_norm": 12.203569634118466, + "learning_rate": 4.9138949978900384e-05, + "loss": 2.6665, + "mean_token_accuracy": 0.3965517282485962, + "step": 132970 + }, + { + "epoch": 0.1339338323054749, + "grad_norm": 9.92652646975939, + "learning_rate": 4.913884730622263e-05, + "loss": 2.7056, + "mean_token_accuracy": 0.32413792610168457, + "step": 132975 + }, + { + "epoch": 0.13393886835857907, + "grad_norm": 9.99964857963376, + "learning_rate": 4.913874462754326e-05, + "loss": 2.6125, + "mean_token_accuracy": 0.42758620381355283, + "step": 132980 + }, + { + "epoch": 0.13394390441168325, + "grad_norm": 10.101567371639872, + "learning_rate": 4.913864194286231e-05, + "loss": 1.7469, + "mean_token_accuracy": 0.5586206972599029, + "step": 132985 + }, + { + "epoch": 0.13394894046478742, + "grad_norm": 8.377959689537375, + "learning_rate": 4.913853925217979e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.48054187297821044, + "step": 132990 + }, + { + "epoch": 0.1339539765178916, + "grad_norm": 9.44531952152318, + "learning_rate": 4.9138436555495756e-05, + "loss": 2.544, + "mean_token_accuracy": 0.4034482777118683, + "step": 132995 + }, + { + "epoch": 0.13395901257099577, + "grad_norm": 10.000169829196041, + "learning_rate": 4.913833385281021e-05, + "loss": 2.1524, + "mean_token_accuracy": 0.458620685338974, + "step": 133000 + }, + { + "epoch": 0.13396404862409994, + "grad_norm": 13.629602265393356, + "learning_rate": 4.913823114412319e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.36896551847457887, + "step": 133005 + }, + { + "epoch": 0.13396908467720411, + "grad_norm": 10.51377362786883, + "learning_rate": 4.913812842943473e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.45517241954803467, + "step": 133010 + }, + { + "epoch": 0.1339741207303083, + "grad_norm": 10.584606563915406, + "learning_rate": 4.9138025708744855e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.40689654350280763, + "step": 133015 + }, + { + "epoch": 0.13397915678341246, + "grad_norm": 9.472146976716418, + "learning_rate": 4.913792298205359e-05, + "loss": 2.2788, + "mean_token_accuracy": 0.40689654350280763, + "step": 133020 + }, + { + "epoch": 0.13398419283651664, + "grad_norm": 8.030218433539074, + "learning_rate": 4.9137820249360974e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.4344827651977539, + "step": 133025 + }, + { + "epoch": 0.13398922888962078, + "grad_norm": 9.976727152180056, + "learning_rate": 4.913771751066702e-05, + "loss": 2.6853, + "mean_token_accuracy": 0.38275861740112305, + "step": 133030 + }, + { + "epoch": 0.13399426494272496, + "grad_norm": 12.819014328147253, + "learning_rate": 4.913761476597176e-05, + "loss": 1.8869, + "mean_token_accuracy": 0.5034482836723327, + "step": 133035 + }, + { + "epoch": 0.13399930099582913, + "grad_norm": 11.847882179186286, + "learning_rate": 4.9137512015275244e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.42068964838981626, + "step": 133040 + }, + { + "epoch": 0.1340043370489333, + "grad_norm": 11.053580448375211, + "learning_rate": 4.9137409258577474e-05, + "loss": 2.5285, + "mean_token_accuracy": 0.4068965554237366, + "step": 133045 + }, + { + "epoch": 0.13400937310203748, + "grad_norm": 11.451463147876439, + "learning_rate": 4.913730649587848e-05, + "loss": 2.2927, + "mean_token_accuracy": 0.458620685338974, + "step": 133050 + }, + { + "epoch": 0.13401440915514165, + "grad_norm": 9.146146665604276, + "learning_rate": 4.91372037271783e-05, + "loss": 2.6785, + "mean_token_accuracy": 0.417241370677948, + "step": 133055 + }, + { + "epoch": 0.13401944520824582, + "grad_norm": 10.681153017267379, + "learning_rate": 4.913710095247697e-05, + "loss": 2.3833, + "mean_token_accuracy": 0.4620689630508423, + "step": 133060 + }, + { + "epoch": 0.13402448126135, + "grad_norm": 10.369229621960262, + "learning_rate": 4.91369981717745e-05, + "loss": 2.2033, + "mean_token_accuracy": 0.47586206197738645, + "step": 133065 + }, + { + "epoch": 0.13402951731445417, + "grad_norm": 9.313391799992058, + "learning_rate": 4.913689538507094e-05, + "loss": 2.2578, + "mean_token_accuracy": 0.43103447556495667, + "step": 133070 + }, + { + "epoch": 0.13403455336755835, + "grad_norm": 12.514210450104544, + "learning_rate": 4.9136792592366296e-05, + "loss": 2.1865, + "mean_token_accuracy": 0.5091954052448273, + "step": 133075 + }, + { + "epoch": 0.13403958942066252, + "grad_norm": 9.37192116552471, + "learning_rate": 4.913668979366061e-05, + "loss": 2.309, + "mean_token_accuracy": 0.4724137902259827, + "step": 133080 + }, + { + "epoch": 0.1340446254737667, + "grad_norm": 11.55364010225787, + "learning_rate": 4.913658698895391e-05, + "loss": 2.2739, + "mean_token_accuracy": 0.46811856627464293, + "step": 133085 + }, + { + "epoch": 0.13404966152687087, + "grad_norm": 9.139603808094293, + "learning_rate": 4.9136484178246214e-05, + "loss": 2.3969, + "mean_token_accuracy": 0.41724138855934145, + "step": 133090 + }, + { + "epoch": 0.13405469757997504, + "grad_norm": 11.068345412109831, + "learning_rate": 4.913638136153756e-05, + "loss": 2.2119, + "mean_token_accuracy": 0.4775559604167938, + "step": 133095 + }, + { + "epoch": 0.13405973363307921, + "grad_norm": 11.89108967900816, + "learning_rate": 4.913627853882799e-05, + "loss": 2.5425, + "mean_token_accuracy": 0.47931033968925474, + "step": 133100 + }, + { + "epoch": 0.1340647696861834, + "grad_norm": 12.545874715113227, + "learning_rate": 4.91361757101175e-05, + "loss": 2.6121, + "mean_token_accuracy": 0.3896551728248596, + "step": 133105 + }, + { + "epoch": 0.13406980573928756, + "grad_norm": 8.566067221791846, + "learning_rate": 4.913607287540615e-05, + "loss": 2.3286, + "mean_token_accuracy": 0.44827585816383364, + "step": 133110 + }, + { + "epoch": 0.13407484179239174, + "grad_norm": 10.49852594089317, + "learning_rate": 4.913597003469394e-05, + "loss": 2.3743, + "mean_token_accuracy": 0.42068966031074523, + "step": 133115 + }, + { + "epoch": 0.1340798778454959, + "grad_norm": 13.509442755498323, + "learning_rate": 4.913586718798092e-05, + "loss": 2.9966, + "mean_token_accuracy": 0.3758620709180832, + "step": 133120 + }, + { + "epoch": 0.13408491389860008, + "grad_norm": 9.345885190793043, + "learning_rate": 4.913576433526711e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.36551723480224607, + "step": 133125 + }, + { + "epoch": 0.13408994995170426, + "grad_norm": 9.824789875846834, + "learning_rate": 4.913566147655254e-05, + "loss": 2.459, + "mean_token_accuracy": 0.44137930274009707, + "step": 133130 + }, + { + "epoch": 0.13409498600480843, + "grad_norm": 9.11876444051404, + "learning_rate": 4.913555861183724e-05, + "loss": 2.2755, + "mean_token_accuracy": 0.4879007875919342, + "step": 133135 + }, + { + "epoch": 0.1341000220579126, + "grad_norm": 9.697784821845543, + "learning_rate": 4.913545574112124e-05, + "loss": 1.8646, + "mean_token_accuracy": 0.5068965494632721, + "step": 133140 + }, + { + "epoch": 0.13410505811101678, + "grad_norm": 11.780024562371858, + "learning_rate": 4.913535286440456e-05, + "loss": 2.3058, + "mean_token_accuracy": 0.44827585816383364, + "step": 133145 + }, + { + "epoch": 0.13411009416412095, + "grad_norm": 11.082344507478709, + "learning_rate": 4.9135249981687236e-05, + "loss": 2.3147, + "mean_token_accuracy": 0.4189957737922668, + "step": 133150 + }, + { + "epoch": 0.13411513021722513, + "grad_norm": 11.679361567906682, + "learning_rate": 4.91351470929693e-05, + "loss": 2.3547, + "mean_token_accuracy": 0.4379310369491577, + "step": 133155 + }, + { + "epoch": 0.1341201662703293, + "grad_norm": 12.86433525259886, + "learning_rate": 4.913504419825077e-05, + "loss": 2.5741, + "mean_token_accuracy": 0.3827586114406586, + "step": 133160 + }, + { + "epoch": 0.13412520232343347, + "grad_norm": 8.872623305573056, + "learning_rate": 4.9134941297531675e-05, + "loss": 2.3257, + "mean_token_accuracy": 0.47241379618644713, + "step": 133165 + }, + { + "epoch": 0.13413023837653762, + "grad_norm": 14.146418841040349, + "learning_rate": 4.913483839081206e-05, + "loss": 2.6141, + "mean_token_accuracy": 0.4672111332416534, + "step": 133170 + }, + { + "epoch": 0.1341352744296418, + "grad_norm": 10.232165953638365, + "learning_rate": 4.913473547809193e-05, + "loss": 2.2596, + "mean_token_accuracy": 0.42413793206214906, + "step": 133175 + }, + { + "epoch": 0.13414031048274597, + "grad_norm": 13.307303938397402, + "learning_rate": 4.913463255937133e-05, + "loss": 1.992, + "mean_token_accuracy": 0.5207501649856567, + "step": 133180 + }, + { + "epoch": 0.13414534653585014, + "grad_norm": 11.607179444353685, + "learning_rate": 4.9134529634650286e-05, + "loss": 2.4738, + "mean_token_accuracy": 0.47586206197738645, + "step": 133185 + }, + { + "epoch": 0.13415038258895431, + "grad_norm": 10.685020322988358, + "learning_rate": 4.9134426703928824e-05, + "loss": 2.5088, + "mean_token_accuracy": 0.37241379618644715, + "step": 133190 + }, + { + "epoch": 0.1341554186420585, + "grad_norm": 11.727129026799751, + "learning_rate": 4.913432376720697e-05, + "loss": 3.1191, + "mean_token_accuracy": 0.35862068831920624, + "step": 133195 + }, + { + "epoch": 0.13416045469516266, + "grad_norm": 10.339170376160103, + "learning_rate": 4.9134220824484763e-05, + "loss": 2.4782, + "mean_token_accuracy": 0.4137930989265442, + "step": 133200 + }, + { + "epoch": 0.13416549074826684, + "grad_norm": 9.180894110905147, + "learning_rate": 4.913411787576221e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.41034482717514037, + "step": 133205 + }, + { + "epoch": 0.134170526801371, + "grad_norm": 12.360519081212592, + "learning_rate": 4.913401492103937e-05, + "loss": 2.3694, + "mean_token_accuracy": 0.4502117395401001, + "step": 133210 + }, + { + "epoch": 0.13417556285447518, + "grad_norm": 13.12219190977595, + "learning_rate": 4.913391196031625e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.3448275804519653, + "step": 133215 + }, + { + "epoch": 0.13418059890757936, + "grad_norm": 9.795439270401314, + "learning_rate": 4.913380899359288e-05, + "loss": 2.0729, + "mean_token_accuracy": 0.4551724076271057, + "step": 133220 + }, + { + "epoch": 0.13418563496068353, + "grad_norm": 9.787602510419227, + "learning_rate": 4.9133706020869296e-05, + "loss": 2.1653, + "mean_token_accuracy": 0.4586206912994385, + "step": 133225 + }, + { + "epoch": 0.1341906710137877, + "grad_norm": 12.946466812453222, + "learning_rate": 4.9133603042145515e-05, + "loss": 2.8084, + "mean_token_accuracy": 0.3896551728248596, + "step": 133230 + }, + { + "epoch": 0.13419570706689188, + "grad_norm": 10.131712109489373, + "learning_rate": 4.913350005742158e-05, + "loss": 2.1511, + "mean_token_accuracy": 0.4517241358757019, + "step": 133235 + }, + { + "epoch": 0.13420074311999605, + "grad_norm": 12.390586016617839, + "learning_rate": 4.9133397066697514e-05, + "loss": 2.2503, + "mean_token_accuracy": 0.4551724135875702, + "step": 133240 + }, + { + "epoch": 0.13420577917310023, + "grad_norm": 13.206889327990059, + "learning_rate": 4.913329406997335e-05, + "loss": 2.6561, + "mean_token_accuracy": 0.41149425506591797, + "step": 133245 + }, + { + "epoch": 0.1342108152262044, + "grad_norm": 11.139734694336255, + "learning_rate": 4.9133191067249105e-05, + "loss": 2.4348, + "mean_token_accuracy": 0.4448275864124298, + "step": 133250 + }, + { + "epoch": 0.13421585127930857, + "grad_norm": 14.013888397976796, + "learning_rate": 4.913308805852481e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.42068966031074523, + "step": 133255 + }, + { + "epoch": 0.13422088733241275, + "grad_norm": 11.405764248954883, + "learning_rate": 4.91329850438005e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.47241379618644713, + "step": 133260 + }, + { + "epoch": 0.13422592338551692, + "grad_norm": 9.94753541247845, + "learning_rate": 4.9132882023076196e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.42758620977401735, + "step": 133265 + }, + { + "epoch": 0.1342309594386211, + "grad_norm": 12.12423042332564, + "learning_rate": 4.913277899635194e-05, + "loss": 2.5722, + "mean_token_accuracy": 0.42758620977401735, + "step": 133270 + }, + { + "epoch": 0.13423599549172527, + "grad_norm": 8.784623124555536, + "learning_rate": 4.913267596362775e-05, + "loss": 2.1155, + "mean_token_accuracy": 0.46551724076271056, + "step": 133275 + }, + { + "epoch": 0.13424103154482944, + "grad_norm": 8.420739386644287, + "learning_rate": 4.9132572924903655e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.4586206912994385, + "step": 133280 + }, + { + "epoch": 0.13424606759793362, + "grad_norm": 10.118032240411473, + "learning_rate": 4.913246988017969e-05, + "loss": 2.1197, + "mean_token_accuracy": 0.4172413766384125, + "step": 133285 + }, + { + "epoch": 0.1342511036510378, + "grad_norm": 17.003970341571257, + "learning_rate": 4.913236682945587e-05, + "loss": 2.64, + "mean_token_accuracy": 0.41034482717514037, + "step": 133290 + }, + { + "epoch": 0.13425613970414196, + "grad_norm": 9.666119406403576, + "learning_rate": 4.913226377273224e-05, + "loss": 2.0674, + "mean_token_accuracy": 0.4689655125141144, + "step": 133295 + }, + { + "epoch": 0.13426117575724614, + "grad_norm": 10.483624931438063, + "learning_rate": 4.9132160710008815e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.44827585816383364, + "step": 133300 + }, + { + "epoch": 0.1342662118103503, + "grad_norm": 20.50268935468113, + "learning_rate": 4.913205764128563e-05, + "loss": 2.789, + "mean_token_accuracy": 0.39310344457626345, + "step": 133305 + }, + { + "epoch": 0.13427124786345446, + "grad_norm": 11.510033586954078, + "learning_rate": 4.9131954566562724e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.4622504532337189, + "step": 133310 + }, + { + "epoch": 0.13427628391655863, + "grad_norm": 8.661591766418606, + "learning_rate": 4.91318514858401e-05, + "loss": 2.1883, + "mean_token_accuracy": 0.441379314661026, + "step": 133315 + }, + { + "epoch": 0.1342813199696628, + "grad_norm": 9.308058952299053, + "learning_rate": 4.9131748399117805e-05, + "loss": 2.7824, + "mean_token_accuracy": 0.41379311084747317, + "step": 133320 + }, + { + "epoch": 0.13428635602276698, + "grad_norm": 10.865454672814817, + "learning_rate": 4.913164530639587e-05, + "loss": 2.5983, + "mean_token_accuracy": 0.3896551728248596, + "step": 133325 + }, + { + "epoch": 0.13429139207587115, + "grad_norm": 9.760613271791762, + "learning_rate": 4.9131542207674315e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.4689655125141144, + "step": 133330 + }, + { + "epoch": 0.13429642812897533, + "grad_norm": 10.06067540029772, + "learning_rate": 4.913143910295317e-05, + "loss": 2.3667, + "mean_token_accuracy": 0.4413793087005615, + "step": 133335 + }, + { + "epoch": 0.1343014641820795, + "grad_norm": 9.752870812481735, + "learning_rate": 4.9131335992232464e-05, + "loss": 2.3194, + "mean_token_accuracy": 0.44343616664409635, + "step": 133340 + }, + { + "epoch": 0.13430650023518367, + "grad_norm": 9.521570759914928, + "learning_rate": 4.9131232875512223e-05, + "loss": 2.2088, + "mean_token_accuracy": 0.44137930274009707, + "step": 133345 + }, + { + "epoch": 0.13431153628828785, + "grad_norm": 9.595652112515525, + "learning_rate": 4.913112975279249e-05, + "loss": 2.0212, + "mean_token_accuracy": 0.4931034505367279, + "step": 133350 + }, + { + "epoch": 0.13431657234139202, + "grad_norm": 11.017809863357463, + "learning_rate": 4.9131026624073275e-05, + "loss": 2.4975, + "mean_token_accuracy": 0.41724138259887694, + "step": 133355 + }, + { + "epoch": 0.1343216083944962, + "grad_norm": 12.118249733207724, + "learning_rate": 4.913092348935461e-05, + "loss": 2.2868, + "mean_token_accuracy": 0.41724138259887694, + "step": 133360 + }, + { + "epoch": 0.13432664444760037, + "grad_norm": 10.7971842295323, + "learning_rate": 4.9130820348636534e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.3827586233615875, + "step": 133365 + }, + { + "epoch": 0.13433168050070454, + "grad_norm": 12.3888643519639, + "learning_rate": 4.913071720191906e-05, + "loss": 2.5238, + "mean_token_accuracy": 0.3915305554866791, + "step": 133370 + }, + { + "epoch": 0.13433671655380872, + "grad_norm": 10.022165617186296, + "learning_rate": 4.913061404920223e-05, + "loss": 2.2466, + "mean_token_accuracy": 0.4965517222881317, + "step": 133375 + }, + { + "epoch": 0.1343417526069129, + "grad_norm": 10.572371980290658, + "learning_rate": 4.9130510890486084e-05, + "loss": 2.0121, + "mean_token_accuracy": 0.4655172348022461, + "step": 133380 + }, + { + "epoch": 0.13434678866001706, + "grad_norm": 14.280286013821122, + "learning_rate": 4.913040772577062e-05, + "loss": 2.4849, + "mean_token_accuracy": 0.41034482717514037, + "step": 133385 + }, + { + "epoch": 0.13435182471312124, + "grad_norm": 12.001847560340789, + "learning_rate": 4.913030455505588e-05, + "loss": 2.8677, + "mean_token_accuracy": 0.39310344457626345, + "step": 133390 + }, + { + "epoch": 0.1343568607662254, + "grad_norm": 11.483911533121626, + "learning_rate": 4.91302013783419e-05, + "loss": 2.679, + "mean_token_accuracy": 0.41724138259887694, + "step": 133395 + }, + { + "epoch": 0.13436189681932958, + "grad_norm": 7.9937897646191685, + "learning_rate": 4.91300981956287e-05, + "loss": 2.335, + "mean_token_accuracy": 0.43103448748588563, + "step": 133400 + }, + { + "epoch": 0.13436693287243376, + "grad_norm": 10.207440022254842, + "learning_rate": 4.912999500691632e-05, + "loss": 2.0766, + "mean_token_accuracy": 0.4931034445762634, + "step": 133405 + }, + { + "epoch": 0.13437196892553793, + "grad_norm": 10.57464206836519, + "learning_rate": 4.912989181220476e-05, + "loss": 2.5347, + "mean_token_accuracy": 0.4034482777118683, + "step": 133410 + }, + { + "epoch": 0.1343770049786421, + "grad_norm": 11.706299070904699, + "learning_rate": 4.912978861149409e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.4206896543502808, + "step": 133415 + }, + { + "epoch": 0.13438204103174628, + "grad_norm": 10.15425674813988, + "learning_rate": 4.912968540478431e-05, + "loss": 2.4497, + "mean_token_accuracy": 0.41724138259887694, + "step": 133420 + }, + { + "epoch": 0.13438707708485045, + "grad_norm": 13.635647965013494, + "learning_rate": 4.9129582192075457e-05, + "loss": 2.7606, + "mean_token_accuracy": 0.3551724135875702, + "step": 133425 + }, + { + "epoch": 0.13439211313795463, + "grad_norm": 10.139692543827582, + "learning_rate": 4.9129478973367557e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.358620685338974, + "step": 133430 + }, + { + "epoch": 0.1343971491910588, + "grad_norm": 9.172812265900287, + "learning_rate": 4.9129375748660644e-05, + "loss": 2.1545, + "mean_token_accuracy": 0.4551724135875702, + "step": 133435 + }, + { + "epoch": 0.13440218524416297, + "grad_norm": 11.754630218774727, + "learning_rate": 4.912927251795473e-05, + "loss": 2.424, + "mean_token_accuracy": 0.42413792610168455, + "step": 133440 + }, + { + "epoch": 0.13440722129726715, + "grad_norm": 12.444782696408742, + "learning_rate": 4.9129169281249876e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.48275862336158754, + "step": 133445 + }, + { + "epoch": 0.1344122573503713, + "grad_norm": 9.00634132468982, + "learning_rate": 4.912906603854608e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.47931033968925474, + "step": 133450 + }, + { + "epoch": 0.13441729340347547, + "grad_norm": 12.523626858454488, + "learning_rate": 4.912896278984338e-05, + "loss": 2.6274, + "mean_token_accuracy": 0.3999999940395355, + "step": 133455 + }, + { + "epoch": 0.13442232945657964, + "grad_norm": 11.513513783155293, + "learning_rate": 4.9128859535141814e-05, + "loss": 2.3317, + "mean_token_accuracy": 0.4586206912994385, + "step": 133460 + }, + { + "epoch": 0.13442736550968382, + "grad_norm": 10.452896530261441, + "learning_rate": 4.9128756274441395e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.4551724135875702, + "step": 133465 + }, + { + "epoch": 0.134432401562788, + "grad_norm": 10.323327562640388, + "learning_rate": 4.9128653007742167e-05, + "loss": 1.9003, + "mean_token_accuracy": 0.5502117335796356, + "step": 133470 + }, + { + "epoch": 0.13443743761589216, + "grad_norm": 9.41760246078116, + "learning_rate": 4.912854973504414e-05, + "loss": 2.6979, + "mean_token_accuracy": 0.4172413766384125, + "step": 133475 + }, + { + "epoch": 0.13444247366899634, + "grad_norm": 10.679260859463167, + "learning_rate": 4.9128446456347364e-05, + "loss": 2.586, + "mean_token_accuracy": 0.4137930989265442, + "step": 133480 + }, + { + "epoch": 0.1344475097221005, + "grad_norm": 9.938032536234672, + "learning_rate": 4.9128343171651856e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.42758620977401735, + "step": 133485 + }, + { + "epoch": 0.13445254577520468, + "grad_norm": 10.792187770235449, + "learning_rate": 4.9128239880957655e-05, + "loss": 2.5092, + "mean_token_accuracy": 0.37731397747993467, + "step": 133490 + }, + { + "epoch": 0.13445758182830886, + "grad_norm": 11.27454788544269, + "learning_rate": 4.912813658426477e-05, + "loss": 2.216, + "mean_token_accuracy": 0.4413793087005615, + "step": 133495 + }, + { + "epoch": 0.13446261788141303, + "grad_norm": 8.325130653367347, + "learning_rate": 4.9128033281573245e-05, + "loss": 2.2849, + "mean_token_accuracy": 0.4448275864124298, + "step": 133500 + }, + { + "epoch": 0.1344676539345172, + "grad_norm": 19.603952016840513, + "learning_rate": 4.9127929972883104e-05, + "loss": 2.6118, + "mean_token_accuracy": 0.4000000059604645, + "step": 133505 + }, + { + "epoch": 0.13447268998762138, + "grad_norm": 9.258050918716755, + "learning_rate": 4.9127826658194363e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.4034482717514038, + "step": 133510 + }, + { + "epoch": 0.13447772604072555, + "grad_norm": 14.102883500042001, + "learning_rate": 4.912772333750708e-05, + "loss": 2.6154, + "mean_token_accuracy": 0.38620689511299133, + "step": 133515 + }, + { + "epoch": 0.13448276209382973, + "grad_norm": 9.074882647934036, + "learning_rate": 4.912762001082126e-05, + "loss": 2.5697, + "mean_token_accuracy": 0.43103447556495667, + "step": 133520 + }, + { + "epoch": 0.1344877981469339, + "grad_norm": 12.398591908673087, + "learning_rate": 4.912751667813694e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.45517241954803467, + "step": 133525 + }, + { + "epoch": 0.13449283420003807, + "grad_norm": 10.843185465278435, + "learning_rate": 4.9127413339454155e-05, + "loss": 2.8635, + "mean_token_accuracy": 0.3379310339689255, + "step": 133530 + }, + { + "epoch": 0.13449787025314225, + "grad_norm": 10.031724284400868, + "learning_rate": 4.912730999477292e-05, + "loss": 2.8421, + "mean_token_accuracy": 0.3724137872457504, + "step": 133535 + }, + { + "epoch": 0.13450290630624642, + "grad_norm": 10.031076282944703, + "learning_rate": 4.912720664409327e-05, + "loss": 2.3495, + "mean_token_accuracy": 0.43309134244918823, + "step": 133540 + }, + { + "epoch": 0.1345079423593506, + "grad_norm": 5.70606433663581, + "learning_rate": 4.912710328741523e-05, + "loss": 1.9283, + "mean_token_accuracy": 0.5120689630508423, + "step": 133545 + }, + { + "epoch": 0.13451297841245477, + "grad_norm": 10.097182019474523, + "learning_rate": 4.912699992473883e-05, + "loss": 2.2718, + "mean_token_accuracy": 0.4482758641242981, + "step": 133550 + }, + { + "epoch": 0.13451801446555894, + "grad_norm": 8.15183901205883, + "learning_rate": 4.9126896556064114e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.4517241358757019, + "step": 133555 + }, + { + "epoch": 0.13452305051866312, + "grad_norm": 12.916751120344587, + "learning_rate": 4.912679318139109e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.41034482717514037, + "step": 133560 + }, + { + "epoch": 0.1345280865717673, + "grad_norm": 10.76104074360416, + "learning_rate": 4.912668980071979e-05, + "loss": 2.2546, + "mean_token_accuracy": 0.42068964838981626, + "step": 133565 + }, + { + "epoch": 0.13453312262487147, + "grad_norm": 10.253601639588439, + "learning_rate": 4.912658641405025e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.38433151245117186, + "step": 133570 + }, + { + "epoch": 0.13453815867797564, + "grad_norm": 12.134093752749203, + "learning_rate": 4.91264830213825e-05, + "loss": 2.4463, + "mean_token_accuracy": 0.4344827651977539, + "step": 133575 + }, + { + "epoch": 0.1345431947310798, + "grad_norm": 11.176038787645842, + "learning_rate": 4.912637962271655e-05, + "loss": 2.3772, + "mean_token_accuracy": 0.4344827651977539, + "step": 133580 + }, + { + "epoch": 0.134548230784184, + "grad_norm": 10.813119999477912, + "learning_rate": 4.912627621805246e-05, + "loss": 2.353, + "mean_token_accuracy": 0.3793103456497192, + "step": 133585 + }, + { + "epoch": 0.13455326683728813, + "grad_norm": 10.582459540899668, + "learning_rate": 4.912617280739024e-05, + "loss": 2.3769, + "mean_token_accuracy": 0.42413793206214906, + "step": 133590 + }, + { + "epoch": 0.1345583028903923, + "grad_norm": 9.138121368722187, + "learning_rate": 4.9126069390729914e-05, + "loss": 2.179, + "mean_token_accuracy": 0.39310344457626345, + "step": 133595 + }, + { + "epoch": 0.13456333894349648, + "grad_norm": 9.011560157753548, + "learning_rate": 4.912596596807152e-05, + "loss": 2.3317, + "mean_token_accuracy": 0.4344827592372894, + "step": 133600 + }, + { + "epoch": 0.13456837499660065, + "grad_norm": 9.607758436900355, + "learning_rate": 4.9125862539415076e-05, + "loss": 2.0898, + "mean_token_accuracy": 0.46551724076271056, + "step": 133605 + }, + { + "epoch": 0.13457341104970483, + "grad_norm": 11.648362120287345, + "learning_rate": 4.9125759104760626e-05, + "loss": 2.9579, + "mean_token_accuracy": 0.37931033670902253, + "step": 133610 + }, + { + "epoch": 0.134578447102809, + "grad_norm": 9.75640295240588, + "learning_rate": 4.912565566410819e-05, + "loss": 1.9985, + "mean_token_accuracy": 0.4620689570903778, + "step": 133615 + }, + { + "epoch": 0.13458348315591318, + "grad_norm": 10.37540080001233, + "learning_rate": 4.91255522174578e-05, + "loss": 2.3242, + "mean_token_accuracy": 0.42413792610168455, + "step": 133620 + }, + { + "epoch": 0.13458851920901735, + "grad_norm": 10.583097536956135, + "learning_rate": 4.912544876480948e-05, + "loss": 2.3466, + "mean_token_accuracy": 0.4482758641242981, + "step": 133625 + }, + { + "epoch": 0.13459355526212152, + "grad_norm": 13.876431571158918, + "learning_rate": 4.9125345306163256e-05, + "loss": 2.5999, + "mean_token_accuracy": 0.43103448748588563, + "step": 133630 + }, + { + "epoch": 0.1345985913152257, + "grad_norm": 6.694327059811454, + "learning_rate": 4.912524184151917e-05, + "loss": 2.4814, + "mean_token_accuracy": 0.46049606800079346, + "step": 133635 + }, + { + "epoch": 0.13460362736832987, + "grad_norm": 10.402479532906442, + "learning_rate": 4.9125138370877244e-05, + "loss": 2.892, + "mean_token_accuracy": 0.3482758581638336, + "step": 133640 + }, + { + "epoch": 0.13460866342143404, + "grad_norm": 12.186872258273448, + "learning_rate": 4.912503489423749e-05, + "loss": 2.3716, + "mean_token_accuracy": 0.4379310369491577, + "step": 133645 + }, + { + "epoch": 0.13461369947453822, + "grad_norm": 15.448822356552942, + "learning_rate": 4.9124931411599966e-05, + "loss": 2.5409, + "mean_token_accuracy": 0.417241370677948, + "step": 133650 + }, + { + "epoch": 0.1346187355276424, + "grad_norm": 10.154175173796041, + "learning_rate": 4.912482792296468e-05, + "loss": 2.9022, + "mean_token_accuracy": 0.38275861740112305, + "step": 133655 + }, + { + "epoch": 0.13462377158074657, + "grad_norm": 11.723209325468458, + "learning_rate": 4.912472442833167e-05, + "loss": 2.6422, + "mean_token_accuracy": 0.4172413766384125, + "step": 133660 + }, + { + "epoch": 0.13462880763385074, + "grad_norm": 12.18209481165935, + "learning_rate": 4.9124620927700965e-05, + "loss": 2.8385, + "mean_token_accuracy": 0.3931034505367279, + "step": 133665 + }, + { + "epoch": 0.1346338436869549, + "grad_norm": 8.411865562125689, + "learning_rate": 4.9124517421072594e-05, + "loss": 2.4184, + "mean_token_accuracy": 0.46896551847457885, + "step": 133670 + }, + { + "epoch": 0.1346388797400591, + "grad_norm": 9.677528228758433, + "learning_rate": 4.912441390844658e-05, + "loss": 2.277, + "mean_token_accuracy": 0.4635813653469086, + "step": 133675 + }, + { + "epoch": 0.13464391579316326, + "grad_norm": 14.922613398793022, + "learning_rate": 4.912431038982294e-05, + "loss": 2.5145, + "mean_token_accuracy": 0.4310344815254211, + "step": 133680 + }, + { + "epoch": 0.13464895184626743, + "grad_norm": 13.401940994145589, + "learning_rate": 4.912420686520173e-05, + "loss": 2.5887, + "mean_token_accuracy": 0.3999999940395355, + "step": 133685 + }, + { + "epoch": 0.1346539878993716, + "grad_norm": 11.99587020235577, + "learning_rate": 4.912410333458297e-05, + "loss": 2.3474, + "mean_token_accuracy": 0.43103447556495667, + "step": 133690 + }, + { + "epoch": 0.13465902395247578, + "grad_norm": 8.685505027949565, + "learning_rate": 4.912399979796667e-05, + "loss": 2.1243, + "mean_token_accuracy": 0.5068965375423431, + "step": 133695 + }, + { + "epoch": 0.13466406000557996, + "grad_norm": 9.51198468992629, + "learning_rate": 4.912389625535288e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.4517241418361664, + "step": 133700 + }, + { + "epoch": 0.13466909605868413, + "grad_norm": 28.386604242370172, + "learning_rate": 4.912379270674162e-05, + "loss": 2.7739, + "mean_token_accuracy": 0.4137930989265442, + "step": 133705 + }, + { + "epoch": 0.1346741321117883, + "grad_norm": 10.066417902762476, + "learning_rate": 4.9123689152132926e-05, + "loss": 2.0011, + "mean_token_accuracy": 0.458620685338974, + "step": 133710 + }, + { + "epoch": 0.13467916816489248, + "grad_norm": 10.132296576308173, + "learning_rate": 4.912358559152682e-05, + "loss": 2.6321, + "mean_token_accuracy": 0.3896551728248596, + "step": 133715 + }, + { + "epoch": 0.13468420421799665, + "grad_norm": 13.253826024454707, + "learning_rate": 4.912348202492333e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.4724137902259827, + "step": 133720 + }, + { + "epoch": 0.13468924027110082, + "grad_norm": 11.019678721869447, + "learning_rate": 4.912337845232248e-05, + "loss": 2.281, + "mean_token_accuracy": 0.44482759237289426, + "step": 133725 + }, + { + "epoch": 0.13469427632420497, + "grad_norm": 9.313991698671483, + "learning_rate": 4.912327487372431e-05, + "loss": 2.2266, + "mean_token_accuracy": 0.47779794335365294, + "step": 133730 + }, + { + "epoch": 0.13469931237730914, + "grad_norm": 12.398574198450524, + "learning_rate": 4.912317128912885e-05, + "loss": 2.4346, + "mean_token_accuracy": 0.3810042321681976, + "step": 133735 + }, + { + "epoch": 0.13470434843041332, + "grad_norm": 10.5935220502372, + "learning_rate": 4.912306769853612e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4551724135875702, + "step": 133740 + }, + { + "epoch": 0.1347093844835175, + "grad_norm": 11.175962196455592, + "learning_rate": 4.9122964101946147e-05, + "loss": 2.6502, + "mean_token_accuracy": 0.4034482777118683, + "step": 133745 + }, + { + "epoch": 0.13471442053662167, + "grad_norm": 12.101065987946038, + "learning_rate": 4.9122860499358964e-05, + "loss": 2.3451, + "mean_token_accuracy": 0.4344827473163605, + "step": 133750 + }, + { + "epoch": 0.13471945658972584, + "grad_norm": 9.007486248400172, + "learning_rate": 4.91227568907746e-05, + "loss": 2.3273, + "mean_token_accuracy": 0.4413793087005615, + "step": 133755 + }, + { + "epoch": 0.13472449264283, + "grad_norm": 9.892949173219353, + "learning_rate": 4.912265327619309e-05, + "loss": 2.2623, + "mean_token_accuracy": 0.43103448748588563, + "step": 133760 + }, + { + "epoch": 0.1347295286959342, + "grad_norm": 10.927627703182745, + "learning_rate": 4.9122549655614455e-05, + "loss": 2.4706, + "mean_token_accuracy": 0.42413793206214906, + "step": 133765 + }, + { + "epoch": 0.13473456474903836, + "grad_norm": 9.446514498577935, + "learning_rate": 4.9122446029038724e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.43793103098869324, + "step": 133770 + }, + { + "epoch": 0.13473960080214253, + "grad_norm": 11.530480000539573, + "learning_rate": 4.9122342396465925e-05, + "loss": 3.0848, + "mean_token_accuracy": 0.34482758641242983, + "step": 133775 + }, + { + "epoch": 0.1347446368552467, + "grad_norm": 11.046254420366234, + "learning_rate": 4.912223875789609e-05, + "loss": 2.3711, + "mean_token_accuracy": 0.41034482717514037, + "step": 133780 + }, + { + "epoch": 0.13474967290835088, + "grad_norm": 8.752395082828878, + "learning_rate": 4.9122135113329245e-05, + "loss": 2.7309, + "mean_token_accuracy": 0.3965517163276672, + "step": 133785 + }, + { + "epoch": 0.13475470896145506, + "grad_norm": 10.541089007514666, + "learning_rate": 4.912203146276542e-05, + "loss": 2.53, + "mean_token_accuracy": 0.43103447556495667, + "step": 133790 + }, + { + "epoch": 0.13475974501455923, + "grad_norm": 9.581735740937182, + "learning_rate": 4.912192780620464e-05, + "loss": 2.485, + "mean_token_accuracy": 0.4034482777118683, + "step": 133795 + }, + { + "epoch": 0.1347647810676634, + "grad_norm": 9.41235586227464, + "learning_rate": 4.912182414364694e-05, + "loss": 2.2543, + "mean_token_accuracy": 0.4413793087005615, + "step": 133800 + }, + { + "epoch": 0.13476981712076758, + "grad_norm": 10.810401897623253, + "learning_rate": 4.912172047509235e-05, + "loss": 2.4189, + "mean_token_accuracy": 0.43793103098869324, + "step": 133805 + }, + { + "epoch": 0.13477485317387175, + "grad_norm": 9.53372088423735, + "learning_rate": 4.9121616800540906e-05, + "loss": 2.4075, + "mean_token_accuracy": 0.42413793206214906, + "step": 133810 + }, + { + "epoch": 0.13477988922697592, + "grad_norm": 9.738706491641206, + "learning_rate": 4.912151311999261e-05, + "loss": 1.8531, + "mean_token_accuracy": 0.49999999403953554, + "step": 133815 + }, + { + "epoch": 0.1347849252800801, + "grad_norm": 10.071647612939412, + "learning_rate": 4.912140943344751e-05, + "loss": 2.3993, + "mean_token_accuracy": 0.4068965554237366, + "step": 133820 + }, + { + "epoch": 0.13478996133318427, + "grad_norm": 9.60063380760036, + "learning_rate": 4.9121305740905635e-05, + "loss": 2.1425, + "mean_token_accuracy": 0.4379310250282288, + "step": 133825 + }, + { + "epoch": 0.13479499738628845, + "grad_norm": 11.734080148261107, + "learning_rate": 4.9121202042367e-05, + "loss": 2.7414, + "mean_token_accuracy": 0.4034482717514038, + "step": 133830 + }, + { + "epoch": 0.13480003343939262, + "grad_norm": 10.67872495327807, + "learning_rate": 4.912109833783166e-05, + "loss": 2.318, + "mean_token_accuracy": 0.42758620977401735, + "step": 133835 + }, + { + "epoch": 0.1348050694924968, + "grad_norm": 8.361095471035858, + "learning_rate": 4.9120994627299614e-05, + "loss": 2.0856, + "mean_token_accuracy": 0.4793103516101837, + "step": 133840 + }, + { + "epoch": 0.13481010554560097, + "grad_norm": 8.280312432846882, + "learning_rate": 4.9120890910770915e-05, + "loss": 2.0511, + "mean_token_accuracy": 0.47241378426551817, + "step": 133845 + }, + { + "epoch": 0.13481514159870514, + "grad_norm": 15.400048115235153, + "learning_rate": 4.9120787188245575e-05, + "loss": 2.5305, + "mean_token_accuracy": 0.41724138259887694, + "step": 133850 + }, + { + "epoch": 0.13482017765180931, + "grad_norm": 12.298536895995761, + "learning_rate": 4.912068345972363e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.36206896901130675, + "step": 133855 + }, + { + "epoch": 0.1348252137049135, + "grad_norm": 11.97210348239016, + "learning_rate": 4.912057972520511e-05, + "loss": 2.6808, + "mean_token_accuracy": 0.35862069129943847, + "step": 133860 + }, + { + "epoch": 0.13483024975801766, + "grad_norm": 12.5364236790175, + "learning_rate": 4.9120475984690035e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.441379314661026, + "step": 133865 + }, + { + "epoch": 0.1348352858111218, + "grad_norm": 10.551063878686245, + "learning_rate": 4.912037223817844e-05, + "loss": 2.4076, + "mean_token_accuracy": 0.41034482717514037, + "step": 133870 + }, + { + "epoch": 0.13484032186422598, + "grad_norm": 10.174315704925045, + "learning_rate": 4.9120268485670366e-05, + "loss": 3.0154, + "mean_token_accuracy": 0.34482758939266206, + "step": 133875 + }, + { + "epoch": 0.13484535791733016, + "grad_norm": 11.781305791097932, + "learning_rate": 4.912016472716582e-05, + "loss": 2.4743, + "mean_token_accuracy": 0.41379310488700866, + "step": 133880 + }, + { + "epoch": 0.13485039397043433, + "grad_norm": 11.96023562107951, + "learning_rate": 4.912006096266485e-05, + "loss": 2.2782, + "mean_token_accuracy": 0.44827585816383364, + "step": 133885 + }, + { + "epoch": 0.1348554300235385, + "grad_norm": 15.998734954490622, + "learning_rate": 4.911995719216746e-05, + "loss": 2.3395, + "mean_token_accuracy": 0.4620689630508423, + "step": 133890 + }, + { + "epoch": 0.13486046607664268, + "grad_norm": 11.012662040539741, + "learning_rate": 4.91198534156737e-05, + "loss": 2.3062, + "mean_token_accuracy": 0.43103448748588563, + "step": 133895 + }, + { + "epoch": 0.13486550212974685, + "grad_norm": 11.237657636424517, + "learning_rate": 4.911974963318361e-05, + "loss": 3.0287, + "mean_token_accuracy": 0.3862068891525269, + "step": 133900 + }, + { + "epoch": 0.13487053818285102, + "grad_norm": 10.118485586805253, + "learning_rate": 4.9119645844697176e-05, + "loss": 2.4654, + "mean_token_accuracy": 0.4620689570903778, + "step": 133905 + }, + { + "epoch": 0.1348755742359552, + "grad_norm": 10.27909446657505, + "learning_rate": 4.911954205021447e-05, + "loss": 2.4954, + "mean_token_accuracy": 0.417241370677948, + "step": 133910 + }, + { + "epoch": 0.13488061028905937, + "grad_norm": 16.774118091077757, + "learning_rate": 4.91194382497355e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.40344828069210054, + "step": 133915 + }, + { + "epoch": 0.13488564634216355, + "grad_norm": 11.323490016084147, + "learning_rate": 4.911933444326029e-05, + "loss": 2.2893, + "mean_token_accuracy": 0.4551724135875702, + "step": 133920 + }, + { + "epoch": 0.13489068239526772, + "grad_norm": 9.348215625439757, + "learning_rate": 4.911923063078889e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4034482777118683, + "step": 133925 + }, + { + "epoch": 0.1348957184483719, + "grad_norm": 10.055446909814775, + "learning_rate": 4.9119126812321307e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.48620688915252686, + "step": 133930 + }, + { + "epoch": 0.13490075450147607, + "grad_norm": 10.862648239989293, + "learning_rate": 4.911902298785758e-05, + "loss": 2.7677, + "mean_token_accuracy": 0.4137930989265442, + "step": 133935 + }, + { + "epoch": 0.13490579055458024, + "grad_norm": 10.300920681506383, + "learning_rate": 4.9118919157397736e-05, + "loss": 3.0812, + "mean_token_accuracy": 0.39655172228813174, + "step": 133940 + }, + { + "epoch": 0.13491082660768441, + "grad_norm": 10.489518210733586, + "learning_rate": 4.91188153209418e-05, + "loss": 2.5897, + "mean_token_accuracy": 0.4, + "step": 133945 + }, + { + "epoch": 0.1349158626607886, + "grad_norm": 16.287307495748056, + "learning_rate": 4.911871147848982e-05, + "loss": 2.4924, + "mean_token_accuracy": 0.4206896543502808, + "step": 133950 + }, + { + "epoch": 0.13492089871389276, + "grad_norm": 8.915601218879473, + "learning_rate": 4.911860763004179e-05, + "loss": 2.0285, + "mean_token_accuracy": 0.4724137902259827, + "step": 133955 + }, + { + "epoch": 0.13492593476699694, + "grad_norm": 10.430567495711408, + "learning_rate": 4.911850377559777e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.47241378426551817, + "step": 133960 + }, + { + "epoch": 0.1349309708201011, + "grad_norm": 19.03416432676315, + "learning_rate": 4.911839991515778e-05, + "loss": 2.569, + "mean_token_accuracy": 0.39310345649719236, + "step": 133965 + }, + { + "epoch": 0.13493600687320528, + "grad_norm": 9.22705765416522, + "learning_rate": 4.911829604872183e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.4310344815254211, + "step": 133970 + }, + { + "epoch": 0.13494104292630946, + "grad_norm": 12.047863230369853, + "learning_rate": 4.911819217628998e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.4, + "step": 133975 + }, + { + "epoch": 0.13494607897941363, + "grad_norm": 10.8520032882447, + "learning_rate": 4.911808829786224e-05, + "loss": 2.1134, + "mean_token_accuracy": 0.4896551787853241, + "step": 133980 + }, + { + "epoch": 0.1349511150325178, + "grad_norm": 13.608625923537215, + "learning_rate": 4.911798441343865e-05, + "loss": 2.324, + "mean_token_accuracy": 0.4496672749519348, + "step": 133985 + }, + { + "epoch": 0.13495615108562198, + "grad_norm": 10.309724646163778, + "learning_rate": 4.911788052301922e-05, + "loss": 2.2157, + "mean_token_accuracy": 0.44827585816383364, + "step": 133990 + }, + { + "epoch": 0.13496118713872615, + "grad_norm": 12.901230017291423, + "learning_rate": 4.911777662660399e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.4310344815254211, + "step": 133995 + }, + { + "epoch": 0.13496622319183033, + "grad_norm": 10.571001115092859, + "learning_rate": 4.911767272419299e-05, + "loss": 2.5293, + "mean_token_accuracy": 0.4068965554237366, + "step": 134000 + }, + { + "epoch": 0.1349712592449345, + "grad_norm": 12.118835239395823, + "learning_rate": 4.911756881578626e-05, + "loss": 2.5977, + "mean_token_accuracy": 0.3862068891525269, + "step": 134005 + }, + { + "epoch": 0.13497629529803865, + "grad_norm": 10.838426177390122, + "learning_rate": 4.9117464901383806e-05, + "loss": 2.218, + "mean_token_accuracy": 0.5241379201412201, + "step": 134010 + }, + { + "epoch": 0.13498133135114282, + "grad_norm": 11.946303971191362, + "learning_rate": 4.911736098098567e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.4689655125141144, + "step": 134015 + }, + { + "epoch": 0.134986367404247, + "grad_norm": 10.835532176075716, + "learning_rate": 4.911725705459188e-05, + "loss": 2.3557, + "mean_token_accuracy": 0.4689655065536499, + "step": 134020 + }, + { + "epoch": 0.13499140345735117, + "grad_norm": 12.084261370210807, + "learning_rate": 4.9117153122202456e-05, + "loss": 2.6956, + "mean_token_accuracy": 0.4379310250282288, + "step": 134025 + }, + { + "epoch": 0.13499643951045534, + "grad_norm": 10.109992058669187, + "learning_rate": 4.911704918381744e-05, + "loss": 2.3221, + "mean_token_accuracy": 0.47241379618644713, + "step": 134030 + }, + { + "epoch": 0.13500147556355951, + "grad_norm": 11.427131594917999, + "learning_rate": 4.911694523943685e-05, + "loss": 2.4006, + "mean_token_accuracy": 0.4068965494632721, + "step": 134035 + }, + { + "epoch": 0.1350065116166637, + "grad_norm": 11.1950678910402, + "learning_rate": 4.911684128906072e-05, + "loss": 2.3739, + "mean_token_accuracy": 0.42068964838981626, + "step": 134040 + }, + { + "epoch": 0.13501154766976786, + "grad_norm": 9.522524461318765, + "learning_rate": 4.911673733268908e-05, + "loss": 2.0028, + "mean_token_accuracy": 0.540411365032196, + "step": 134045 + }, + { + "epoch": 0.13501658372287204, + "grad_norm": 8.764797279627862, + "learning_rate": 4.911663337032197e-05, + "loss": 2.2466, + "mean_token_accuracy": 0.42413793206214906, + "step": 134050 + }, + { + "epoch": 0.1350216197759762, + "grad_norm": 9.267684411076148, + "learning_rate": 4.9116529401959386e-05, + "loss": 2.0835, + "mean_token_accuracy": 0.5034482657909394, + "step": 134055 + }, + { + "epoch": 0.13502665582908038, + "grad_norm": 10.663144303020138, + "learning_rate": 4.911642542760138e-05, + "loss": 2.3304, + "mean_token_accuracy": 0.43448275327682495, + "step": 134060 + }, + { + "epoch": 0.13503169188218456, + "grad_norm": 11.30183683156277, + "learning_rate": 4.9116321447247986e-05, + "loss": 2.5074, + "mean_token_accuracy": 0.42413793206214906, + "step": 134065 + }, + { + "epoch": 0.13503672793528873, + "grad_norm": 14.325452294640524, + "learning_rate": 4.9116217460899225e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.47241380214691164, + "step": 134070 + }, + { + "epoch": 0.1350417639883929, + "grad_norm": 12.3677009720883, + "learning_rate": 4.9116113468555124e-05, + "loss": 2.0885, + "mean_token_accuracy": 0.48275862336158754, + "step": 134075 + }, + { + "epoch": 0.13504680004149708, + "grad_norm": 10.453095009381897, + "learning_rate": 4.9116009470215704e-05, + "loss": 2.0955, + "mean_token_accuracy": 0.4517241299152374, + "step": 134080 + }, + { + "epoch": 0.13505183609460125, + "grad_norm": 10.177434958845236, + "learning_rate": 4.911590546588101e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.5034482717514038, + "step": 134085 + }, + { + "epoch": 0.13505687214770543, + "grad_norm": 14.466640415489486, + "learning_rate": 4.9115801455551074e-05, + "loss": 2.449, + "mean_token_accuracy": 0.441379314661026, + "step": 134090 + }, + { + "epoch": 0.1350619082008096, + "grad_norm": 8.71599529256436, + "learning_rate": 4.91156974392259e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.4103448212146759, + "step": 134095 + }, + { + "epoch": 0.13506694425391377, + "grad_norm": 9.964051426015743, + "learning_rate": 4.911559341690554e-05, + "loss": 2.4314, + "mean_token_accuracy": 0.38965516686439516, + "step": 134100 + }, + { + "epoch": 0.13507198030701795, + "grad_norm": 8.982877355933086, + "learning_rate": 4.9115489388590016e-05, + "loss": 2.3002, + "mean_token_accuracy": 0.4551724076271057, + "step": 134105 + }, + { + "epoch": 0.13507701636012212, + "grad_norm": 10.68853928492849, + "learning_rate": 4.911538535427935e-05, + "loss": 2.2288, + "mean_token_accuracy": 0.4137930989265442, + "step": 134110 + }, + { + "epoch": 0.1350820524132263, + "grad_norm": 12.064505048445348, + "learning_rate": 4.9115281313973574e-05, + "loss": 2.313, + "mean_token_accuracy": 0.4758620738983154, + "step": 134115 + }, + { + "epoch": 0.13508708846633047, + "grad_norm": 10.3681680363652, + "learning_rate": 4.9115177267672715e-05, + "loss": 2.04, + "mean_token_accuracy": 0.48620688915252686, + "step": 134120 + }, + { + "epoch": 0.13509212451943464, + "grad_norm": 15.009625634737692, + "learning_rate": 4.911507321537682e-05, + "loss": 2.3651, + "mean_token_accuracy": 0.4103448331356049, + "step": 134125 + }, + { + "epoch": 0.13509716057253882, + "grad_norm": 11.979159361815714, + "learning_rate": 4.91149691570859e-05, + "loss": 2.1158, + "mean_token_accuracy": 0.4517241418361664, + "step": 134130 + }, + { + "epoch": 0.135102196625643, + "grad_norm": 10.048725524509766, + "learning_rate": 4.9114865092799985e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.42413793206214906, + "step": 134135 + }, + { + "epoch": 0.13510723267874716, + "grad_norm": 9.584548277364538, + "learning_rate": 4.911476102251911e-05, + "loss": 2.2622, + "mean_token_accuracy": 0.43448275327682495, + "step": 134140 + }, + { + "epoch": 0.13511226873185134, + "grad_norm": 10.146499888779887, + "learning_rate": 4.9114656946243286e-05, + "loss": 2.1218, + "mean_token_accuracy": 0.4430732071399689, + "step": 134145 + }, + { + "epoch": 0.13511730478495548, + "grad_norm": 8.822927197041379, + "learning_rate": 4.911455286397257e-05, + "loss": 2.6094, + "mean_token_accuracy": 0.4068965494632721, + "step": 134150 + }, + { + "epoch": 0.13512234083805966, + "grad_norm": 8.816105331622458, + "learning_rate": 4.9114448775706976e-05, + "loss": 2.1758, + "mean_token_accuracy": 0.4551724076271057, + "step": 134155 + }, + { + "epoch": 0.13512737689116383, + "grad_norm": 12.561072336316514, + "learning_rate": 4.9114344681446536e-05, + "loss": 2.6445, + "mean_token_accuracy": 0.4172413766384125, + "step": 134160 + }, + { + "epoch": 0.135132412944268, + "grad_norm": 11.068561780759016, + "learning_rate": 4.911424058119127e-05, + "loss": 2.6803, + "mean_token_accuracy": 0.41379310488700866, + "step": 134165 + }, + { + "epoch": 0.13513744899737218, + "grad_norm": 10.560662256423516, + "learning_rate": 4.9114136474941216e-05, + "loss": 2.4919, + "mean_token_accuracy": 0.4551724135875702, + "step": 134170 + }, + { + "epoch": 0.13514248505047635, + "grad_norm": 13.962793709724243, + "learning_rate": 4.9114032362696396e-05, + "loss": 2.7213, + "mean_token_accuracy": 0.4, + "step": 134175 + }, + { + "epoch": 0.13514752110358053, + "grad_norm": 9.784722099460133, + "learning_rate": 4.911392824445686e-05, + "loss": 2.288, + "mean_token_accuracy": 0.5103448331356049, + "step": 134180 + }, + { + "epoch": 0.1351525571566847, + "grad_norm": 11.529774238402076, + "learning_rate": 4.91138241202226e-05, + "loss": 2.4712, + "mean_token_accuracy": 0.43793103098869324, + "step": 134185 + }, + { + "epoch": 0.13515759320978887, + "grad_norm": 7.804580903429626, + "learning_rate": 4.911371998999368e-05, + "loss": 2.5191, + "mean_token_accuracy": 0.41379310488700866, + "step": 134190 + }, + { + "epoch": 0.13516262926289305, + "grad_norm": 11.342987471364093, + "learning_rate": 4.911361585377011e-05, + "loss": 1.9117, + "mean_token_accuracy": 0.5310344815254211, + "step": 134195 + }, + { + "epoch": 0.13516766531599722, + "grad_norm": 12.48097639164439, + "learning_rate": 4.911351171155192e-05, + "loss": 2.1229, + "mean_token_accuracy": 0.4502117455005646, + "step": 134200 + }, + { + "epoch": 0.1351727013691014, + "grad_norm": 14.526926243458428, + "learning_rate": 4.9113407563339145e-05, + "loss": 2.2314, + "mean_token_accuracy": 0.4310344815254211, + "step": 134205 + }, + { + "epoch": 0.13517773742220557, + "grad_norm": 10.12012098430208, + "learning_rate": 4.9113303409131805e-05, + "loss": 2.3848, + "mean_token_accuracy": 0.4068965494632721, + "step": 134210 + }, + { + "epoch": 0.13518277347530974, + "grad_norm": 10.63263857675329, + "learning_rate": 4.911319924892994e-05, + "loss": 2.6857, + "mean_token_accuracy": 0.39655172228813174, + "step": 134215 + }, + { + "epoch": 0.13518780952841392, + "grad_norm": 11.56514580180841, + "learning_rate": 4.911309508273357e-05, + "loss": 2.2822, + "mean_token_accuracy": 0.41724138259887694, + "step": 134220 + }, + { + "epoch": 0.1351928455815181, + "grad_norm": 11.57157120705022, + "learning_rate": 4.911299091054273e-05, + "loss": 2.307, + "mean_token_accuracy": 0.48275862336158754, + "step": 134225 + }, + { + "epoch": 0.13519788163462226, + "grad_norm": 11.579526536382897, + "learning_rate": 4.911288673235745e-05, + "loss": 2.4915, + "mean_token_accuracy": 0.4379310369491577, + "step": 134230 + }, + { + "epoch": 0.13520291768772644, + "grad_norm": 9.282754933043174, + "learning_rate": 4.911278254817775e-05, + "loss": 2.2027, + "mean_token_accuracy": 0.4517241358757019, + "step": 134235 + }, + { + "epoch": 0.1352079537408306, + "grad_norm": 11.68654468548344, + "learning_rate": 4.911267835800367e-05, + "loss": 2.5102, + "mean_token_accuracy": 0.4413793087005615, + "step": 134240 + }, + { + "epoch": 0.13521298979393478, + "grad_norm": 11.696958310195383, + "learning_rate": 4.911257416183523e-05, + "loss": 2.2067, + "mean_token_accuracy": 0.44137930274009707, + "step": 134245 + }, + { + "epoch": 0.13521802584703896, + "grad_norm": 7.7341016996194885, + "learning_rate": 4.9112469959672456e-05, + "loss": 2.285, + "mean_token_accuracy": 0.5275862157344818, + "step": 134250 + }, + { + "epoch": 0.13522306190014313, + "grad_norm": 8.26806514487324, + "learning_rate": 4.9112365751515385e-05, + "loss": 2.2244, + "mean_token_accuracy": 0.45603448152542114, + "step": 134255 + }, + { + "epoch": 0.1352280979532473, + "grad_norm": 9.052513778662904, + "learning_rate": 4.9112261537364045e-05, + "loss": 2.7097, + "mean_token_accuracy": 0.37586206793785093, + "step": 134260 + }, + { + "epoch": 0.13523313400635148, + "grad_norm": 10.358952667644225, + "learning_rate": 4.911215731721846e-05, + "loss": 2.6853, + "mean_token_accuracy": 0.4103448331356049, + "step": 134265 + }, + { + "epoch": 0.13523817005945565, + "grad_norm": 10.84652823674153, + "learning_rate": 4.911205309107867e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.38620689511299133, + "step": 134270 + }, + { + "epoch": 0.13524320611255983, + "grad_norm": 9.661898795137935, + "learning_rate": 4.91119488589447e-05, + "loss": 2.4458, + "mean_token_accuracy": 0.4137930989265442, + "step": 134275 + }, + { + "epoch": 0.135248242165664, + "grad_norm": 10.578013512358094, + "learning_rate": 4.911184462081657e-05, + "loss": 1.57, + "mean_token_accuracy": 0.5782819092273712, + "step": 134280 + }, + { + "epoch": 0.13525327821876817, + "grad_norm": 9.214234654981272, + "learning_rate": 4.911174037669431e-05, + "loss": 2.4946, + "mean_token_accuracy": 0.41379310190677643, + "step": 134285 + }, + { + "epoch": 0.13525831427187232, + "grad_norm": 10.377155604179269, + "learning_rate": 4.911163612657796e-05, + "loss": 2.7323, + "mean_token_accuracy": 0.35517241060733795, + "step": 134290 + }, + { + "epoch": 0.1352633503249765, + "grad_norm": 11.791831056460001, + "learning_rate": 4.9111531870467534e-05, + "loss": 2.6479, + "mean_token_accuracy": 0.4369026005268097, + "step": 134295 + }, + { + "epoch": 0.13526838637808067, + "grad_norm": 8.321679894250863, + "learning_rate": 4.911142760836308e-05, + "loss": 2.3718, + "mean_token_accuracy": 0.4482758641242981, + "step": 134300 + }, + { + "epoch": 0.13527342243118484, + "grad_norm": 12.363095154415507, + "learning_rate": 4.911132334026461e-05, + "loss": 2.3877, + "mean_token_accuracy": 0.4206896543502808, + "step": 134305 + }, + { + "epoch": 0.13527845848428902, + "grad_norm": 10.960700157617692, + "learning_rate": 4.9111219066172156e-05, + "loss": 2.7451, + "mean_token_accuracy": 0.3793103456497192, + "step": 134310 + }, + { + "epoch": 0.1352834945373932, + "grad_norm": 9.551276207656139, + "learning_rate": 4.911111478608575e-05, + "loss": 2.1222, + "mean_token_accuracy": 0.4448275864124298, + "step": 134315 + }, + { + "epoch": 0.13528853059049736, + "grad_norm": 8.959557224852793, + "learning_rate": 4.9111010500005424e-05, + "loss": 2.1304, + "mean_token_accuracy": 0.4758620738983154, + "step": 134320 + }, + { + "epoch": 0.13529356664360154, + "grad_norm": 10.394042482921195, + "learning_rate": 4.91109062079312e-05, + "loss": 2.3457, + "mean_token_accuracy": 0.45862067937850953, + "step": 134325 + }, + { + "epoch": 0.1352986026967057, + "grad_norm": 10.347218849399173, + "learning_rate": 4.9110801909863114e-05, + "loss": 2.4632, + "mean_token_accuracy": 0.37241379022598264, + "step": 134330 + }, + { + "epoch": 0.13530363874980988, + "grad_norm": 9.816100206324814, + "learning_rate": 4.911069760580119e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.4322444021701813, + "step": 134335 + }, + { + "epoch": 0.13530867480291406, + "grad_norm": 11.386703863016669, + "learning_rate": 4.911059329574546e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.4172413766384125, + "step": 134340 + }, + { + "epoch": 0.13531371085601823, + "grad_norm": 9.30169926119461, + "learning_rate": 4.911048897969595e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.4172413766384125, + "step": 134345 + }, + { + "epoch": 0.1353187469091224, + "grad_norm": 9.707239272392467, + "learning_rate": 4.911038465765269e-05, + "loss": 2.3368, + "mean_token_accuracy": 0.47931034564971925, + "step": 134350 + }, + { + "epoch": 0.13532378296222658, + "grad_norm": 9.892065496286596, + "learning_rate": 4.911028032961571e-05, + "loss": 2.347, + "mean_token_accuracy": 0.4413793087005615, + "step": 134355 + }, + { + "epoch": 0.13532881901533075, + "grad_norm": 14.069196258933996, + "learning_rate": 4.9110175995585045e-05, + "loss": 2.7075, + "mean_token_accuracy": 0.42413793206214906, + "step": 134360 + }, + { + "epoch": 0.13533385506843493, + "grad_norm": 11.455966748293225, + "learning_rate": 4.91100716555607e-05, + "loss": 2.2488, + "mean_token_accuracy": 0.48511797189712524, + "step": 134365 + }, + { + "epoch": 0.1353388911215391, + "grad_norm": 9.070914228591613, + "learning_rate": 4.9109967309542734e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.3965517163276672, + "step": 134370 + }, + { + "epoch": 0.13534392717464327, + "grad_norm": 8.638662378730041, + "learning_rate": 4.9109862957531165e-05, + "loss": 2.1407, + "mean_token_accuracy": 0.49999998807907103, + "step": 134375 + }, + { + "epoch": 0.13534896322774745, + "grad_norm": 11.116117841010515, + "learning_rate": 4.910975859952601e-05, + "loss": 2.6601, + "mean_token_accuracy": 0.3793103456497192, + "step": 134380 + }, + { + "epoch": 0.13535399928085162, + "grad_norm": 12.160637588656488, + "learning_rate": 4.910965423552731e-05, + "loss": 2.4647, + "mean_token_accuracy": 0.4068965554237366, + "step": 134385 + }, + { + "epoch": 0.1353590353339558, + "grad_norm": 10.68550088546402, + "learning_rate": 4.9109549865535097e-05, + "loss": 2.6795, + "mean_token_accuracy": 0.3965517282485962, + "step": 134390 + }, + { + "epoch": 0.13536407138705997, + "grad_norm": 9.187802779646592, + "learning_rate": 4.910944548954939e-05, + "loss": 2.1286, + "mean_token_accuracy": 0.5034482717514038, + "step": 134395 + }, + { + "epoch": 0.13536910744016414, + "grad_norm": 9.571506840992566, + "learning_rate": 4.910934110757023e-05, + "loss": 2.3671, + "mean_token_accuracy": 0.39310344457626345, + "step": 134400 + }, + { + "epoch": 0.13537414349326832, + "grad_norm": 11.61446372423744, + "learning_rate": 4.910923671959763e-05, + "loss": 2.4063, + "mean_token_accuracy": 0.4, + "step": 134405 + }, + { + "epoch": 0.1353791795463725, + "grad_norm": 9.787973356697972, + "learning_rate": 4.910913232563163e-05, + "loss": 2.339, + "mean_token_accuracy": 0.45765275359153745, + "step": 134410 + }, + { + "epoch": 0.13538421559947666, + "grad_norm": 9.809484587417113, + "learning_rate": 4.9109027925672266e-05, + "loss": 1.9537, + "mean_token_accuracy": 0.5571082711219788, + "step": 134415 + }, + { + "epoch": 0.13538925165258084, + "grad_norm": 9.226670387197863, + "learning_rate": 4.910892351971954e-05, + "loss": 2.294, + "mean_token_accuracy": 0.44482759237289426, + "step": 134420 + }, + { + "epoch": 0.135394287705685, + "grad_norm": 10.579580986271335, + "learning_rate": 4.910881910777351e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.3896551728248596, + "step": 134425 + }, + { + "epoch": 0.13539932375878916, + "grad_norm": 10.590151771080215, + "learning_rate": 4.9108714689834194e-05, + "loss": 2.1947, + "mean_token_accuracy": 0.44827585816383364, + "step": 134430 + }, + { + "epoch": 0.13540435981189333, + "grad_norm": 7.895620978827685, + "learning_rate": 4.910861026590162e-05, + "loss": 2.0078, + "mean_token_accuracy": 0.4896551787853241, + "step": 134435 + }, + { + "epoch": 0.1354093958649975, + "grad_norm": 10.37713905080901, + "learning_rate": 4.910850583597581e-05, + "loss": 2.9234, + "mean_token_accuracy": 0.35172412991523744, + "step": 134440 + }, + { + "epoch": 0.13541443191810168, + "grad_norm": 11.385826271765035, + "learning_rate": 4.910840140005681e-05, + "loss": 2.3887, + "mean_token_accuracy": 0.47586206793785096, + "step": 134445 + }, + { + "epoch": 0.13541946797120585, + "grad_norm": 14.109973597538097, + "learning_rate": 4.910829695814463e-05, + "loss": 2.613, + "mean_token_accuracy": 0.35862069129943847, + "step": 134450 + }, + { + "epoch": 0.13542450402431003, + "grad_norm": 6.602300073144044, + "learning_rate": 4.9108192510239315e-05, + "loss": 2.2262, + "mean_token_accuracy": 0.4629310369491577, + "step": 134455 + }, + { + "epoch": 0.1354295400774142, + "grad_norm": 12.075685160737626, + "learning_rate": 4.910808805634088e-05, + "loss": 2.2568, + "mean_token_accuracy": 0.4517241358757019, + "step": 134460 + }, + { + "epoch": 0.13543457613051837, + "grad_norm": 13.61166427562624, + "learning_rate": 4.910798359644936e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.4275862157344818, + "step": 134465 + }, + { + "epoch": 0.13543961218362255, + "grad_norm": 10.295834218076493, + "learning_rate": 4.91078791305648e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.4724137902259827, + "step": 134470 + }, + { + "epoch": 0.13544464823672672, + "grad_norm": 9.673106805131535, + "learning_rate": 4.91077746586872e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.39310344457626345, + "step": 134475 + }, + { + "epoch": 0.1354496842898309, + "grad_norm": 14.141919511802351, + "learning_rate": 4.910767018081661e-05, + "loss": 2.8227, + "mean_token_accuracy": 0.3862068891525269, + "step": 134480 + }, + { + "epoch": 0.13545472034293507, + "grad_norm": 10.452408266692883, + "learning_rate": 4.910756569695305e-05, + "loss": 2.3405, + "mean_token_accuracy": 0.4678765952587128, + "step": 134485 + }, + { + "epoch": 0.13545975639603924, + "grad_norm": 10.644654044171334, + "learning_rate": 4.910746120709656e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.5192377507686615, + "step": 134490 + }, + { + "epoch": 0.13546479244914342, + "grad_norm": 9.15269330776536, + "learning_rate": 4.910735671124715e-05, + "loss": 2.2452, + "mean_token_accuracy": 0.4620689690113068, + "step": 134495 + }, + { + "epoch": 0.1354698285022476, + "grad_norm": 9.76447611091786, + "learning_rate": 4.910725220940486e-05, + "loss": 2.3601, + "mean_token_accuracy": 0.41724138259887694, + "step": 134500 + }, + { + "epoch": 0.13547486455535176, + "grad_norm": 20.65541071571421, + "learning_rate": 4.9107147701569714e-05, + "loss": 2.5985, + "mean_token_accuracy": 0.4206896543502808, + "step": 134505 + }, + { + "epoch": 0.13547990060845594, + "grad_norm": 10.858618480034215, + "learning_rate": 4.910704318774175e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.39655172228813174, + "step": 134510 + }, + { + "epoch": 0.1354849366615601, + "grad_norm": 11.148422083990829, + "learning_rate": 4.910693866792099e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.4172413766384125, + "step": 134515 + }, + { + "epoch": 0.1354899727146643, + "grad_norm": 8.316215149826675, + "learning_rate": 4.9106834142107465e-05, + "loss": 2.2563, + "mean_token_accuracy": 0.46206897497177124, + "step": 134520 + }, + { + "epoch": 0.13549500876776846, + "grad_norm": 12.269140773848898, + "learning_rate": 4.9106729610301206e-05, + "loss": 2.237, + "mean_token_accuracy": 0.4620689690113068, + "step": 134525 + }, + { + "epoch": 0.13550004482087263, + "grad_norm": 13.925802520228363, + "learning_rate": 4.9106625072502236e-05, + "loss": 2.7348, + "mean_token_accuracy": 0.4159104585647583, + "step": 134530 + }, + { + "epoch": 0.1355050808739768, + "grad_norm": 13.488833469103525, + "learning_rate": 4.9106520528710596e-05, + "loss": 2.4526, + "mean_token_accuracy": 0.39655172228813174, + "step": 134535 + }, + { + "epoch": 0.13551011692708098, + "grad_norm": 9.790118483949719, + "learning_rate": 4.9106415978926307e-05, + "loss": 2.2722, + "mean_token_accuracy": 0.48275862336158754, + "step": 134540 + }, + { + "epoch": 0.13551515298018516, + "grad_norm": 9.33258444305644, + "learning_rate": 4.910631142314939e-05, + "loss": 2.0748, + "mean_token_accuracy": 0.44827587008476255, + "step": 134545 + }, + { + "epoch": 0.13552018903328933, + "grad_norm": 10.74609256888069, + "learning_rate": 4.910620686137989e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.46551724672317507, + "step": 134550 + }, + { + "epoch": 0.1355252250863935, + "grad_norm": 9.812625818012902, + "learning_rate": 4.910610229361782e-05, + "loss": 2.2271, + "mean_token_accuracy": 0.5034482717514038, + "step": 134555 + }, + { + "epoch": 0.13553026113949768, + "grad_norm": 9.15525460646177, + "learning_rate": 4.9105997719863214e-05, + "loss": 2.0133, + "mean_token_accuracy": 0.458620685338974, + "step": 134560 + }, + { + "epoch": 0.13553529719260185, + "grad_norm": 9.560462453324432, + "learning_rate": 4.910589314011611e-05, + "loss": 2.2022, + "mean_token_accuracy": 0.44137930274009707, + "step": 134565 + }, + { + "epoch": 0.135540333245706, + "grad_norm": 11.123479346124654, + "learning_rate": 4.910578855437653e-05, + "loss": 2.2477, + "mean_token_accuracy": 0.44482759237289426, + "step": 134570 + }, + { + "epoch": 0.13554536929881017, + "grad_norm": 17.165124226894974, + "learning_rate": 4.910568396264451e-05, + "loss": 2.4816, + "mean_token_accuracy": 0.41034482717514037, + "step": 134575 + }, + { + "epoch": 0.13555040535191434, + "grad_norm": 10.294722633833748, + "learning_rate": 4.9105579364920065e-05, + "loss": 2.3182, + "mean_token_accuracy": 0.4551724135875702, + "step": 134580 + }, + { + "epoch": 0.13555544140501852, + "grad_norm": 12.547537538882024, + "learning_rate": 4.9105474761203236e-05, + "loss": 2.8974, + "mean_token_accuracy": 0.41034482717514037, + "step": 134585 + }, + { + "epoch": 0.1355604774581227, + "grad_norm": 9.824400181034228, + "learning_rate": 4.910537015149405e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.42413792610168455, + "step": 134590 + }, + { + "epoch": 0.13556551351122686, + "grad_norm": 12.099075409569291, + "learning_rate": 4.910526553579253e-05, + "loss": 2.1965, + "mean_token_accuracy": 0.45172412395477296, + "step": 134595 + }, + { + "epoch": 0.13557054956433104, + "grad_norm": 17.399589723914314, + "learning_rate": 4.91051609140987e-05, + "loss": 2.7991, + "mean_token_accuracy": 0.4467634618282318, + "step": 134600 + }, + { + "epoch": 0.1355755856174352, + "grad_norm": 11.818633242135931, + "learning_rate": 4.9105056286412615e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.4310344815254211, + "step": 134605 + }, + { + "epoch": 0.1355806216705394, + "grad_norm": 10.398285796101677, + "learning_rate": 4.910495165273427e-05, + "loss": 2.3367, + "mean_token_accuracy": 0.4361766457557678, + "step": 134610 + }, + { + "epoch": 0.13558565772364356, + "grad_norm": 10.334115957871711, + "learning_rate": 4.910484701306373e-05, + "loss": 2.3425, + "mean_token_accuracy": 0.45172412395477296, + "step": 134615 + }, + { + "epoch": 0.13559069377674773, + "grad_norm": 10.292044808807947, + "learning_rate": 4.9104742367400994e-05, + "loss": 2.6356, + "mean_token_accuracy": 0.3965517282485962, + "step": 134620 + }, + { + "epoch": 0.1355957298298519, + "grad_norm": 12.10726224563903, + "learning_rate": 4.9104637715746105e-05, + "loss": 2.0444, + "mean_token_accuracy": 0.5, + "step": 134625 + }, + { + "epoch": 0.13560076588295608, + "grad_norm": 11.887122461123523, + "learning_rate": 4.910453305809908e-05, + "loss": 2.3411, + "mean_token_accuracy": 0.4344827592372894, + "step": 134630 + }, + { + "epoch": 0.13560580193606026, + "grad_norm": 12.776616194512602, + "learning_rate": 4.910442839445997e-05, + "loss": 2.4799, + "mean_token_accuracy": 0.4482758641242981, + "step": 134635 + }, + { + "epoch": 0.13561083798916443, + "grad_norm": 9.073238643883903, + "learning_rate": 4.9104323724828786e-05, + "loss": 2.0792, + "mean_token_accuracy": 0.5034482836723327, + "step": 134640 + }, + { + "epoch": 0.1356158740422686, + "grad_norm": 12.28018019401208, + "learning_rate": 4.910421904920556e-05, + "loss": 2.4638, + "mean_token_accuracy": 0.41379310488700866, + "step": 134645 + }, + { + "epoch": 0.13562091009537278, + "grad_norm": 12.020921971274877, + "learning_rate": 4.9104114367590325e-05, + "loss": 2.5349, + "mean_token_accuracy": 0.3909255862236023, + "step": 134650 + }, + { + "epoch": 0.13562594614847695, + "grad_norm": 9.879299725274887, + "learning_rate": 4.9104009679983116e-05, + "loss": 2.3308, + "mean_token_accuracy": 0.44827585816383364, + "step": 134655 + }, + { + "epoch": 0.13563098220158112, + "grad_norm": 12.79554918928031, + "learning_rate": 4.910390498638394e-05, + "loss": 2.4381, + "mean_token_accuracy": 0.42413793206214906, + "step": 134660 + }, + { + "epoch": 0.1356360182546853, + "grad_norm": 10.033473108875548, + "learning_rate": 4.910380028679285e-05, + "loss": 2.384, + "mean_token_accuracy": 0.4034482717514038, + "step": 134665 + }, + { + "epoch": 0.13564105430778947, + "grad_norm": 11.407401730491923, + "learning_rate": 4.9103695581209866e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.41034482717514037, + "step": 134670 + }, + { + "epoch": 0.13564609036089365, + "grad_norm": 10.269367932254575, + "learning_rate": 4.910359086963501e-05, + "loss": 2.5202, + "mean_token_accuracy": 0.42413792610168455, + "step": 134675 + }, + { + "epoch": 0.13565112641399782, + "grad_norm": 9.017545695758873, + "learning_rate": 4.910348615206832e-05, + "loss": 2.4904, + "mean_token_accuracy": 0.458620685338974, + "step": 134680 + }, + { + "epoch": 0.135656162467102, + "grad_norm": 11.758824284667863, + "learning_rate": 4.9103381428509824e-05, + "loss": 2.4863, + "mean_token_accuracy": 0.36896551847457887, + "step": 134685 + }, + { + "epoch": 0.13566119852020617, + "grad_norm": 10.324570422707733, + "learning_rate": 4.910327669895955e-05, + "loss": 2.2392, + "mean_token_accuracy": 0.4607380509376526, + "step": 134690 + }, + { + "epoch": 0.13566623457331034, + "grad_norm": 12.703443822856327, + "learning_rate": 4.9103171963417526e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.3931034505367279, + "step": 134695 + }, + { + "epoch": 0.13567127062641451, + "grad_norm": 8.540405010803717, + "learning_rate": 4.910306722188378e-05, + "loss": 2.3059, + "mean_token_accuracy": 0.4793103516101837, + "step": 134700 + }, + { + "epoch": 0.1356763066795187, + "grad_norm": 7.75841584452314, + "learning_rate": 4.910296247435835e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.4068965554237366, + "step": 134705 + }, + { + "epoch": 0.13568134273262283, + "grad_norm": 7.011809069499588, + "learning_rate": 4.910285772084124e-05, + "loss": 2.175, + "mean_token_accuracy": 0.405686628818512, + "step": 134710 + }, + { + "epoch": 0.135686378785727, + "grad_norm": 11.657648535725755, + "learning_rate": 4.9102752961332524e-05, + "loss": 2.2094, + "mean_token_accuracy": 0.4551724135875702, + "step": 134715 + }, + { + "epoch": 0.13569141483883118, + "grad_norm": 11.19370054381177, + "learning_rate": 4.9102648195832186e-05, + "loss": 2.0656, + "mean_token_accuracy": 0.4620689690113068, + "step": 134720 + }, + { + "epoch": 0.13569645089193536, + "grad_norm": 10.396463003001184, + "learning_rate": 4.910254342434027e-05, + "loss": 2.2045, + "mean_token_accuracy": 0.4344827651977539, + "step": 134725 + }, + { + "epoch": 0.13570148694503953, + "grad_norm": 11.166109500554299, + "learning_rate": 4.910243864685681e-05, + "loss": 2.2489, + "mean_token_accuracy": 0.4517241358757019, + "step": 134730 + }, + { + "epoch": 0.1357065229981437, + "grad_norm": 10.078022418014237, + "learning_rate": 4.910233386338184e-05, + "loss": 2.9287, + "mean_token_accuracy": 0.35862069129943847, + "step": 134735 + }, + { + "epoch": 0.13571155905124788, + "grad_norm": 11.472873720029453, + "learning_rate": 4.910222907391538e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.39655172228813174, + "step": 134740 + }, + { + "epoch": 0.13571659510435205, + "grad_norm": 9.687737765961836, + "learning_rate": 4.9102124278457464e-05, + "loss": 2.2483, + "mean_token_accuracy": 0.44137930274009707, + "step": 134745 + }, + { + "epoch": 0.13572163115745622, + "grad_norm": 11.2223644952304, + "learning_rate": 4.910201947700811e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.42758620977401735, + "step": 134750 + }, + { + "epoch": 0.1357266672105604, + "grad_norm": 10.330103789203898, + "learning_rate": 4.9101914669567364e-05, + "loss": 2.2052, + "mean_token_accuracy": 0.4965517222881317, + "step": 134755 + }, + { + "epoch": 0.13573170326366457, + "grad_norm": 11.792452838435404, + "learning_rate": 4.9101809856135234e-05, + "loss": 2.6672, + "mean_token_accuracy": 0.4401088893413544, + "step": 134760 + }, + { + "epoch": 0.13573673931676875, + "grad_norm": 10.416671559807702, + "learning_rate": 4.910170503671177e-05, + "loss": 2.6069, + "mean_token_accuracy": 0.41724138259887694, + "step": 134765 + }, + { + "epoch": 0.13574177536987292, + "grad_norm": 8.319861882176228, + "learning_rate": 4.9101600211296994e-05, + "loss": 2.2543, + "mean_token_accuracy": 0.4275862157344818, + "step": 134770 + }, + { + "epoch": 0.1357468114229771, + "grad_norm": 8.967706717234528, + "learning_rate": 4.910149537989093e-05, + "loss": 2.3003, + "mean_token_accuracy": 0.4517241418361664, + "step": 134775 + }, + { + "epoch": 0.13575184747608127, + "grad_norm": 10.727135679483629, + "learning_rate": 4.910139054249361e-05, + "loss": 2.5893, + "mean_token_accuracy": 0.43793103098869324, + "step": 134780 + }, + { + "epoch": 0.13575688352918544, + "grad_norm": 8.71511367396217, + "learning_rate": 4.910128569910507e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5172413766384125, + "step": 134785 + }, + { + "epoch": 0.13576191958228961, + "grad_norm": 10.20220617606134, + "learning_rate": 4.910118084972533e-05, + "loss": 2.325, + "mean_token_accuracy": 0.441379314661026, + "step": 134790 + }, + { + "epoch": 0.1357669556353938, + "grad_norm": 14.654835845579635, + "learning_rate": 4.9101075994354415e-05, + "loss": 2.0335, + "mean_token_accuracy": 0.4620689570903778, + "step": 134795 + }, + { + "epoch": 0.13577199168849796, + "grad_norm": 9.107487045661019, + "learning_rate": 4.910097113299237e-05, + "loss": 2.5072, + "mean_token_accuracy": 0.48620688915252686, + "step": 134800 + }, + { + "epoch": 0.13577702774160214, + "grad_norm": 10.256392372117176, + "learning_rate": 4.910086626563921e-05, + "loss": 2.4318, + "mean_token_accuracy": 0.42758620381355283, + "step": 134805 + }, + { + "epoch": 0.1357820637947063, + "grad_norm": 12.465447984058482, + "learning_rate": 4.910076139229498e-05, + "loss": 2.4414, + "mean_token_accuracy": 0.417241370677948, + "step": 134810 + }, + { + "epoch": 0.13578709984781048, + "grad_norm": 20.737231958196922, + "learning_rate": 4.910065651295967e-05, + "loss": 2.5157, + "mean_token_accuracy": 0.46551724672317507, + "step": 134815 + }, + { + "epoch": 0.13579213590091466, + "grad_norm": 11.527708214182006, + "learning_rate": 4.9100551627633363e-05, + "loss": 2.4518, + "mean_token_accuracy": 0.3931034505367279, + "step": 134820 + }, + { + "epoch": 0.13579717195401883, + "grad_norm": 8.989011306114987, + "learning_rate": 4.9100446736316065e-05, + "loss": 2.0133, + "mean_token_accuracy": 0.4948578476905823, + "step": 134825 + }, + { + "epoch": 0.135802208007123, + "grad_norm": 8.810941986319946, + "learning_rate": 4.9100341839007785e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4310344815254211, + "step": 134830 + }, + { + "epoch": 0.13580724406022718, + "grad_norm": 11.451202896023153, + "learning_rate": 4.9100236935708576e-05, + "loss": 2.9063, + "mean_token_accuracy": 0.42589232325553894, + "step": 134835 + }, + { + "epoch": 0.13581228011333135, + "grad_norm": 11.448326912683768, + "learning_rate": 4.910013202641847e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.4724137902259827, + "step": 134840 + }, + { + "epoch": 0.13581731616643553, + "grad_norm": 9.809057552147612, + "learning_rate": 4.910002711113747e-05, + "loss": 2.5392, + "mean_token_accuracy": 0.3999999940395355, + "step": 134845 + }, + { + "epoch": 0.13582235221953967, + "grad_norm": 10.625769521623344, + "learning_rate": 4.909992218986564e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.441379314661026, + "step": 134850 + }, + { + "epoch": 0.13582738827264385, + "grad_norm": 9.318727028269022, + "learning_rate": 4.9099817262602975e-05, + "loss": 2.3223, + "mean_token_accuracy": 0.4655172348022461, + "step": 134855 + }, + { + "epoch": 0.13583242432574802, + "grad_norm": 9.37571888139062, + "learning_rate": 4.909971232934953e-05, + "loss": 1.9659, + "mean_token_accuracy": 0.48620688915252686, + "step": 134860 + }, + { + "epoch": 0.1358374603788522, + "grad_norm": 10.43704680094581, + "learning_rate": 4.9099607390105326e-05, + "loss": 2.3609, + "mean_token_accuracy": 0.42068964838981626, + "step": 134865 + }, + { + "epoch": 0.13584249643195637, + "grad_norm": 10.526898917299087, + "learning_rate": 4.909950244487038e-05, + "loss": 3.1729, + "mean_token_accuracy": 0.3655172407627106, + "step": 134870 + }, + { + "epoch": 0.13584753248506054, + "grad_norm": 9.628162157020952, + "learning_rate": 4.909939749364474e-05, + "loss": 2.034, + "mean_token_accuracy": 0.5088929176330567, + "step": 134875 + }, + { + "epoch": 0.13585256853816471, + "grad_norm": 10.378508311716494, + "learning_rate": 4.909929253642843e-05, + "loss": 2.8286, + "mean_token_accuracy": 0.4, + "step": 134880 + }, + { + "epoch": 0.1358576045912689, + "grad_norm": 10.18549868262699, + "learning_rate": 4.909918757322146e-05, + "loss": 2.0197, + "mean_token_accuracy": 0.47586206793785096, + "step": 134885 + }, + { + "epoch": 0.13586264064437306, + "grad_norm": 9.705192629733032, + "learning_rate": 4.909908260402389e-05, + "loss": 2.1531, + "mean_token_accuracy": 0.4620689630508423, + "step": 134890 + }, + { + "epoch": 0.13586767669747724, + "grad_norm": 10.650840882695144, + "learning_rate": 4.9098977628835724e-05, + "loss": 2.4431, + "mean_token_accuracy": 0.46418632864952086, + "step": 134895 + }, + { + "epoch": 0.1358727127505814, + "grad_norm": 12.034846807645062, + "learning_rate": 4.9098872647657e-05, + "loss": 3.1024, + "mean_token_accuracy": 0.3206896513700485, + "step": 134900 + }, + { + "epoch": 0.13587774880368558, + "grad_norm": 13.285080532283546, + "learning_rate": 4.909876766048776e-05, + "loss": 2.5358, + "mean_token_accuracy": 0.4296430677175522, + "step": 134905 + }, + { + "epoch": 0.13588278485678976, + "grad_norm": 9.350627589899336, + "learning_rate": 4.909866266732802e-05, + "loss": 2.7844, + "mean_token_accuracy": 0.4068965554237366, + "step": 134910 + }, + { + "epoch": 0.13588782090989393, + "grad_norm": 9.76162932119752, + "learning_rate": 4.90985576681778e-05, + "loss": 2.4129, + "mean_token_accuracy": 0.4379310369491577, + "step": 134915 + }, + { + "epoch": 0.1358928569629981, + "grad_norm": 10.339488160598401, + "learning_rate": 4.909845266303714e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.4068965494632721, + "step": 134920 + }, + { + "epoch": 0.13589789301610228, + "grad_norm": 9.575103478744335, + "learning_rate": 4.909834765190607e-05, + "loss": 1.9268, + "mean_token_accuracy": 0.48275862336158754, + "step": 134925 + }, + { + "epoch": 0.13590292906920645, + "grad_norm": 10.300009171430993, + "learning_rate": 4.909824263478462e-05, + "loss": 2.6719, + "mean_token_accuracy": 0.41379310488700866, + "step": 134930 + }, + { + "epoch": 0.13590796512231063, + "grad_norm": 10.701821422309203, + "learning_rate": 4.9098137611672826e-05, + "loss": 2.385, + "mean_token_accuracy": 0.4206896543502808, + "step": 134935 + }, + { + "epoch": 0.1359130011754148, + "grad_norm": 10.26987192395177, + "learning_rate": 4.9098032582570705e-05, + "loss": 2.2756, + "mean_token_accuracy": 0.46896552443504336, + "step": 134940 + }, + { + "epoch": 0.13591803722851897, + "grad_norm": 10.122385395617751, + "learning_rate": 4.909792754747828e-05, + "loss": 2.3364, + "mean_token_accuracy": 0.43448275327682495, + "step": 134945 + }, + { + "epoch": 0.13592307328162315, + "grad_norm": 11.025132976003075, + "learning_rate": 4.909782250639559e-05, + "loss": 2.3811, + "mean_token_accuracy": 0.4137930989265442, + "step": 134950 + }, + { + "epoch": 0.13592810933472732, + "grad_norm": 10.602909905138443, + "learning_rate": 4.909771745932266e-05, + "loss": 2.1733, + "mean_token_accuracy": 0.5083484590053559, + "step": 134955 + }, + { + "epoch": 0.1359331453878315, + "grad_norm": 10.751343204639257, + "learning_rate": 4.9097612406259534e-05, + "loss": 2.4136, + "mean_token_accuracy": 0.4172413766384125, + "step": 134960 + }, + { + "epoch": 0.13593818144093567, + "grad_norm": 10.857986457624833, + "learning_rate": 4.909750734720622e-05, + "loss": 2.4537, + "mean_token_accuracy": 0.44137930274009707, + "step": 134965 + }, + { + "epoch": 0.13594321749403984, + "grad_norm": 12.320225899263486, + "learning_rate": 4.909740228216276e-05, + "loss": 2.7086, + "mean_token_accuracy": 0.3655172407627106, + "step": 134970 + }, + { + "epoch": 0.13594825354714402, + "grad_norm": 8.57234693096351, + "learning_rate": 4.909729721112918e-05, + "loss": 2.1371, + "mean_token_accuracy": 0.47465215921401976, + "step": 134975 + }, + { + "epoch": 0.1359532896002482, + "grad_norm": 8.60113866560614, + "learning_rate": 4.9097192134105515e-05, + "loss": 2.3393, + "mean_token_accuracy": 0.42068966031074523, + "step": 134980 + }, + { + "epoch": 0.13595832565335236, + "grad_norm": 11.203779607274168, + "learning_rate": 4.9097087051091786e-05, + "loss": 2.2498, + "mean_token_accuracy": 0.4996975243091583, + "step": 134985 + }, + { + "epoch": 0.1359633617064565, + "grad_norm": 10.384319456705752, + "learning_rate": 4.909698196208802e-05, + "loss": 2.4609, + "mean_token_accuracy": 0.4537205040454865, + "step": 134990 + }, + { + "epoch": 0.13596839775956068, + "grad_norm": 9.850889807999696, + "learning_rate": 4.909687686709425e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.42758620977401735, + "step": 134995 + }, + { + "epoch": 0.13597343381266486, + "grad_norm": 9.919318033324638, + "learning_rate": 4.9096771766110504e-05, + "loss": 2.5861, + "mean_token_accuracy": 0.44827585816383364, + "step": 135000 + }, + { + "epoch": 0.13597846986576903, + "grad_norm": 10.834208506090366, + "learning_rate": 4.909666665913682e-05, + "loss": 2.5082, + "mean_token_accuracy": 0.4275861978530884, + "step": 135005 + }, + { + "epoch": 0.1359835059188732, + "grad_norm": 9.994035391413979, + "learning_rate": 4.9096561546173214e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.3999999940395355, + "step": 135010 + }, + { + "epoch": 0.13598854197197738, + "grad_norm": 9.037913822541128, + "learning_rate": 4.909645642721972e-05, + "loss": 2.194, + "mean_token_accuracy": 0.4344827651977539, + "step": 135015 + }, + { + "epoch": 0.13599357802508155, + "grad_norm": 9.476318552434709, + "learning_rate": 4.909635130227637e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.4534180283546448, + "step": 135020 + }, + { + "epoch": 0.13599861407818573, + "grad_norm": 9.534823880227412, + "learning_rate": 4.9096246171343197e-05, + "loss": 2.3333, + "mean_token_accuracy": 0.42068964838981626, + "step": 135025 + }, + { + "epoch": 0.1360036501312899, + "grad_norm": 8.61040708087451, + "learning_rate": 4.909614103442022e-05, + "loss": 2.3254, + "mean_token_accuracy": 0.4620689690113068, + "step": 135030 + }, + { + "epoch": 0.13600868618439407, + "grad_norm": 12.376448307067244, + "learning_rate": 4.909603589150747e-05, + "loss": 3.1811, + "mean_token_accuracy": 0.3275862067937851, + "step": 135035 + }, + { + "epoch": 0.13601372223749825, + "grad_norm": 11.254993594381054, + "learning_rate": 4.909593074260499e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.4379310369491577, + "step": 135040 + }, + { + "epoch": 0.13601875829060242, + "grad_norm": 11.655562213876975, + "learning_rate": 4.909582558771279e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.38620689511299133, + "step": 135045 + }, + { + "epoch": 0.1360237943437066, + "grad_norm": 9.426336308367542, + "learning_rate": 4.90957204268309e-05, + "loss": 2.3741, + "mean_token_accuracy": 0.4068965494632721, + "step": 135050 + }, + { + "epoch": 0.13602883039681077, + "grad_norm": 9.828631522931131, + "learning_rate": 4.909561525995937e-05, + "loss": 2.4961, + "mean_token_accuracy": 0.3827586233615875, + "step": 135055 + }, + { + "epoch": 0.13603386644991494, + "grad_norm": 10.165825869598894, + "learning_rate": 4.909551008709821e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.4310344815254211, + "step": 135060 + }, + { + "epoch": 0.13603890250301912, + "grad_norm": 11.310326970354806, + "learning_rate": 4.909540490824745e-05, + "loss": 2.3369, + "mean_token_accuracy": 0.41379310488700866, + "step": 135065 + }, + { + "epoch": 0.1360439385561233, + "grad_norm": 11.554730150391544, + "learning_rate": 4.909529972340713e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.4813672065734863, + "step": 135070 + }, + { + "epoch": 0.13604897460922746, + "grad_norm": 10.888892245926671, + "learning_rate": 4.9095194532577265e-05, + "loss": 2.7959, + "mean_token_accuracy": 0.3655172407627106, + "step": 135075 + }, + { + "epoch": 0.13605401066233164, + "grad_norm": 14.301049862934988, + "learning_rate": 4.90950893357579e-05, + "loss": 2.1327, + "mean_token_accuracy": 0.46896552443504336, + "step": 135080 + }, + { + "epoch": 0.1360590467154358, + "grad_norm": 9.637814314957314, + "learning_rate": 4.909498413294906e-05, + "loss": 2.0183, + "mean_token_accuracy": 0.4744101583957672, + "step": 135085 + }, + { + "epoch": 0.13606408276853998, + "grad_norm": 11.596876911292185, + "learning_rate": 4.909487892415076e-05, + "loss": 2.2781, + "mean_token_accuracy": 0.43793103098869324, + "step": 135090 + }, + { + "epoch": 0.13606911882164416, + "grad_norm": 10.564066701047258, + "learning_rate": 4.9094773709363045e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.45862067937850953, + "step": 135095 + }, + { + "epoch": 0.13607415487474833, + "grad_norm": 10.893480891659905, + "learning_rate": 4.909466848858594e-05, + "loss": 2.7744, + "mean_token_accuracy": 0.38620689511299133, + "step": 135100 + }, + { + "epoch": 0.1360791909278525, + "grad_norm": 11.894073803883426, + "learning_rate": 4.909456326181947e-05, + "loss": 2.5722, + "mean_token_accuracy": 0.42758620381355283, + "step": 135105 + }, + { + "epoch": 0.13608422698095668, + "grad_norm": 11.103981380465429, + "learning_rate": 4.9094458029063666e-05, + "loss": 2.2399, + "mean_token_accuracy": 0.4379310369491577, + "step": 135110 + }, + { + "epoch": 0.13608926303406085, + "grad_norm": 11.7665019787653, + "learning_rate": 4.9094352790318566e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.38620689511299133, + "step": 135115 + }, + { + "epoch": 0.13609429908716503, + "grad_norm": 10.386723034479058, + "learning_rate": 4.909424754558419e-05, + "loss": 2.4262, + "mean_token_accuracy": 0.4103448331356049, + "step": 135120 + }, + { + "epoch": 0.1360993351402692, + "grad_norm": 9.795442890473675, + "learning_rate": 4.909414229486057e-05, + "loss": 2.2112, + "mean_token_accuracy": 0.49655171632766726, + "step": 135125 + }, + { + "epoch": 0.13610437119337335, + "grad_norm": 10.543278520003819, + "learning_rate": 4.909403703814773e-05, + "loss": 2.7072, + "mean_token_accuracy": 0.43448275327682495, + "step": 135130 + }, + { + "epoch": 0.13610940724647752, + "grad_norm": 12.18317096866015, + "learning_rate": 4.909393177544569e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.4123411953449249, + "step": 135135 + }, + { + "epoch": 0.1361144432995817, + "grad_norm": 11.567600636890907, + "learning_rate": 4.9093826506754515e-05, + "loss": 2.2595, + "mean_token_accuracy": 0.4517241358757019, + "step": 135140 + }, + { + "epoch": 0.13611947935268587, + "grad_norm": 10.271591629195022, + "learning_rate": 4.9093721232074205e-05, + "loss": 2.3277, + "mean_token_accuracy": 0.46206897497177124, + "step": 135145 + }, + { + "epoch": 0.13612451540579004, + "grad_norm": 11.858334251233378, + "learning_rate": 4.909361595140479e-05, + "loss": 2.5765, + "mean_token_accuracy": 0.42758620381355283, + "step": 135150 + }, + { + "epoch": 0.13612955145889422, + "grad_norm": 12.613837023524765, + "learning_rate": 4.9093510664746315e-05, + "loss": 2.1171, + "mean_token_accuracy": 0.4758620738983154, + "step": 135155 + }, + { + "epoch": 0.1361345875119984, + "grad_norm": 11.531968914418643, + "learning_rate": 4.909340537209879e-05, + "loss": 3.0951, + "mean_token_accuracy": 0.41524500250816343, + "step": 135160 + }, + { + "epoch": 0.13613962356510256, + "grad_norm": 8.685575388182794, + "learning_rate": 4.909330007346226e-05, + "loss": 2.1483, + "mean_token_accuracy": 0.45662432312965395, + "step": 135165 + }, + { + "epoch": 0.13614465961820674, + "grad_norm": 14.525316624975638, + "learning_rate": 4.909319476883674e-05, + "loss": 2.6061, + "mean_token_accuracy": 0.37931033968925476, + "step": 135170 + }, + { + "epoch": 0.1361496956713109, + "grad_norm": 8.766679044006592, + "learning_rate": 4.909308945822227e-05, + "loss": 2.0469, + "mean_token_accuracy": 0.4871921181678772, + "step": 135175 + }, + { + "epoch": 0.13615473172441508, + "grad_norm": 10.384038159544508, + "learning_rate": 4.9092984141618876e-05, + "loss": 2.3653, + "mean_token_accuracy": 0.47586206793785096, + "step": 135180 + }, + { + "epoch": 0.13615976777751926, + "grad_norm": 10.983669825097552, + "learning_rate": 4.909287881902659e-05, + "loss": 2.3395, + "mean_token_accuracy": 0.3896551698446274, + "step": 135185 + }, + { + "epoch": 0.13616480383062343, + "grad_norm": 8.911640785485202, + "learning_rate": 4.909277349044543e-05, + "loss": 2.0558, + "mean_token_accuracy": 0.47931033968925474, + "step": 135190 + }, + { + "epoch": 0.1361698398837276, + "grad_norm": 10.144545324285271, + "learning_rate": 4.9092668155875446e-05, + "loss": 2.414, + "mean_token_accuracy": 0.46551724672317507, + "step": 135195 + }, + { + "epoch": 0.13617487593683178, + "grad_norm": 10.71539302123256, + "learning_rate": 4.909256281531665e-05, + "loss": 2.5, + "mean_token_accuracy": 0.4068965494632721, + "step": 135200 + }, + { + "epoch": 0.13617991198993595, + "grad_norm": 8.199437387665153, + "learning_rate": 4.9092457468769074e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.4206896543502808, + "step": 135205 + }, + { + "epoch": 0.13618494804304013, + "grad_norm": 7.531103572492438, + "learning_rate": 4.909235211623275e-05, + "loss": 2.4469, + "mean_token_accuracy": 0.43793103098869324, + "step": 135210 + }, + { + "epoch": 0.1361899840961443, + "grad_norm": 11.20982693060921, + "learning_rate": 4.909224675770771e-05, + "loss": 2.7481, + "mean_token_accuracy": 0.38620689511299133, + "step": 135215 + }, + { + "epoch": 0.13619502014924847, + "grad_norm": 11.903465358504354, + "learning_rate": 4.909214139319398e-05, + "loss": 2.2895, + "mean_token_accuracy": 0.42758620977401735, + "step": 135220 + }, + { + "epoch": 0.13620005620235265, + "grad_norm": 8.71958800570729, + "learning_rate": 4.909203602269158e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.4931034505367279, + "step": 135225 + }, + { + "epoch": 0.13620509225545682, + "grad_norm": 8.298410295578147, + "learning_rate": 4.909193064620056e-05, + "loss": 2.0886, + "mean_token_accuracy": 0.5103448331356049, + "step": 135230 + }, + { + "epoch": 0.136210128308561, + "grad_norm": 13.469718728551308, + "learning_rate": 4.909182526372093e-05, + "loss": 2.6954, + "mean_token_accuracy": 0.39655172228813174, + "step": 135235 + }, + { + "epoch": 0.13621516436166517, + "grad_norm": 10.44142830007144, + "learning_rate": 4.9091719875252726e-05, + "loss": 2.4475, + "mean_token_accuracy": 0.4655172348022461, + "step": 135240 + }, + { + "epoch": 0.13622020041476934, + "grad_norm": 11.184724697943007, + "learning_rate": 4.909161448079599e-05, + "loss": 2.4728, + "mean_token_accuracy": 0.4517241418361664, + "step": 135245 + }, + { + "epoch": 0.13622523646787352, + "grad_norm": 11.599710357012317, + "learning_rate": 4.9091509080350725e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.44670296311378477, + "step": 135250 + }, + { + "epoch": 0.1362302725209777, + "grad_norm": 10.982550594858601, + "learning_rate": 4.909140367391698e-05, + "loss": 2.3269, + "mean_token_accuracy": 0.47586206793785096, + "step": 135255 + }, + { + "epoch": 0.13623530857408186, + "grad_norm": 10.924649200313599, + "learning_rate": 4.9091298261494775e-05, + "loss": 2.7371, + "mean_token_accuracy": 0.4172413766384125, + "step": 135260 + }, + { + "epoch": 0.13624034462718604, + "grad_norm": 9.707937050177211, + "learning_rate": 4.909119284308415e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.42758620977401735, + "step": 135265 + }, + { + "epoch": 0.13624538068029018, + "grad_norm": 11.195725706103861, + "learning_rate": 4.909108741868512e-05, + "loss": 2.586, + "mean_token_accuracy": 0.4413793087005615, + "step": 135270 + }, + { + "epoch": 0.13625041673339436, + "grad_norm": 11.198289722582283, + "learning_rate": 4.909098198829773e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.3965517282485962, + "step": 135275 + }, + { + "epoch": 0.13625545278649853, + "grad_norm": 9.953590525112904, + "learning_rate": 4.9090876551921995e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.39655172228813174, + "step": 135280 + }, + { + "epoch": 0.1362604888396027, + "grad_norm": 12.190293475508124, + "learning_rate": 4.909077110955795e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.46896551847457885, + "step": 135285 + }, + { + "epoch": 0.13626552489270688, + "grad_norm": 10.203274644735158, + "learning_rate": 4.909066566120562e-05, + "loss": 2.3389, + "mean_token_accuracy": 0.4206896424293518, + "step": 135290 + }, + { + "epoch": 0.13627056094581105, + "grad_norm": 12.484476917394169, + "learning_rate": 4.909056020686505e-05, + "loss": 2.491, + "mean_token_accuracy": 0.37586206793785093, + "step": 135295 + }, + { + "epoch": 0.13627559699891523, + "grad_norm": 12.140425491162, + "learning_rate": 4.909045474653624e-05, + "loss": 2.056, + "mean_token_accuracy": 0.5137930929660797, + "step": 135300 + }, + { + "epoch": 0.1362806330520194, + "grad_norm": 13.025325655031022, + "learning_rate": 4.9090349280219255e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.4344827592372894, + "step": 135305 + }, + { + "epoch": 0.13628566910512357, + "grad_norm": 8.521537603635023, + "learning_rate": 4.90902438079141e-05, + "loss": 2.1226, + "mean_token_accuracy": 0.4758620738983154, + "step": 135310 + }, + { + "epoch": 0.13629070515822775, + "grad_norm": 8.889512194189532, + "learning_rate": 4.909013832962081e-05, + "loss": 2.5257, + "mean_token_accuracy": 0.42068964838981626, + "step": 135315 + }, + { + "epoch": 0.13629574121133192, + "grad_norm": 11.299305237901116, + "learning_rate": 4.90900328453394e-05, + "loss": 2.5716, + "mean_token_accuracy": 0.36896551251411436, + "step": 135320 + }, + { + "epoch": 0.1363007772644361, + "grad_norm": 12.740960495417967, + "learning_rate": 4.9089927355069936e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.441379314661026, + "step": 135325 + }, + { + "epoch": 0.13630581331754027, + "grad_norm": 8.901009574069738, + "learning_rate": 4.9089821858812414e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.47586206793785096, + "step": 135330 + }, + { + "epoch": 0.13631084937064444, + "grad_norm": 10.156291374049633, + "learning_rate": 4.908971635656687e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.358620685338974, + "step": 135335 + }, + { + "epoch": 0.13631588542374862, + "grad_norm": 10.876251908827907, + "learning_rate": 4.9089610848333345e-05, + "loss": 2.6118, + "mean_token_accuracy": 0.37586206793785093, + "step": 135340 + }, + { + "epoch": 0.1363209214768528, + "grad_norm": 11.496870807405713, + "learning_rate": 4.908950533411186e-05, + "loss": 2.4697, + "mean_token_accuracy": 0.42413792610168455, + "step": 135345 + }, + { + "epoch": 0.13632595752995696, + "grad_norm": 10.053602593276302, + "learning_rate": 4.908939981390244e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.44482758045196535, + "step": 135350 + }, + { + "epoch": 0.13633099358306114, + "grad_norm": 10.651505026473655, + "learning_rate": 4.908929428770514e-05, + "loss": 2.125, + "mean_token_accuracy": 0.49304295182228086, + "step": 135355 + }, + { + "epoch": 0.1363360296361653, + "grad_norm": 8.486358133970912, + "learning_rate": 4.9089188755519946e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.3896551728248596, + "step": 135360 + }, + { + "epoch": 0.13634106568926949, + "grad_norm": 10.595594961370118, + "learning_rate": 4.908908321734692e-05, + "loss": 2.2611, + "mean_token_accuracy": 0.4689655065536499, + "step": 135365 + }, + { + "epoch": 0.13634610174237366, + "grad_norm": 9.77888335773004, + "learning_rate": 4.908897767318607e-05, + "loss": 2.6426, + "mean_token_accuracy": 0.40344826579093934, + "step": 135370 + }, + { + "epoch": 0.13635113779547783, + "grad_norm": 14.029751904359943, + "learning_rate": 4.908887212303745e-05, + "loss": 2.0935, + "mean_token_accuracy": 0.493103438615799, + "step": 135375 + }, + { + "epoch": 0.136356173848582, + "grad_norm": 10.018733048680547, + "learning_rate": 4.908876656690107e-05, + "loss": 2.5488, + "mean_token_accuracy": 0.42068964838981626, + "step": 135380 + }, + { + "epoch": 0.13636120990168618, + "grad_norm": 11.48659306883941, + "learning_rate": 4.9088661004776964e-05, + "loss": 2.4371, + "mean_token_accuracy": 0.42758620977401735, + "step": 135385 + }, + { + "epoch": 0.13636624595479035, + "grad_norm": 12.345266104832206, + "learning_rate": 4.908855543666516e-05, + "loss": 2.345, + "mean_token_accuracy": 0.47586206793785096, + "step": 135390 + }, + { + "epoch": 0.13637128200789453, + "grad_norm": 11.708150446160678, + "learning_rate": 4.908844986256569e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.40689654350280763, + "step": 135395 + }, + { + "epoch": 0.1363763180609987, + "grad_norm": 9.933684622114361, + "learning_rate": 4.908834428247859e-05, + "loss": 1.8625, + "mean_token_accuracy": 0.4931034445762634, + "step": 135400 + }, + { + "epoch": 0.13638135411410288, + "grad_norm": 10.180946165121135, + "learning_rate": 4.9088238696403885e-05, + "loss": 2.4931, + "mean_token_accuracy": 0.4034482717514038, + "step": 135405 + }, + { + "epoch": 0.13638639016720702, + "grad_norm": 12.944302460802872, + "learning_rate": 4.9088133104341597e-05, + "loss": 2.4866, + "mean_token_accuracy": 0.4034482777118683, + "step": 135410 + }, + { + "epoch": 0.1363914262203112, + "grad_norm": 11.160175260513713, + "learning_rate": 4.908802750629175e-05, + "loss": 3.2161, + "mean_token_accuracy": 0.3068965464830399, + "step": 135415 + }, + { + "epoch": 0.13639646227341537, + "grad_norm": 10.115774536303306, + "learning_rate": 4.908792190225439e-05, + "loss": 2.378, + "mean_token_accuracy": 0.4534180283546448, + "step": 135420 + }, + { + "epoch": 0.13640149832651954, + "grad_norm": 10.24903239010195, + "learning_rate": 4.908781629222954e-05, + "loss": 2.3047, + "mean_token_accuracy": 0.41379310488700866, + "step": 135425 + }, + { + "epoch": 0.13640653437962372, + "grad_norm": 12.24430658822229, + "learning_rate": 4.908771067621723e-05, + "loss": 2.7355, + "mean_token_accuracy": 0.4448275864124298, + "step": 135430 + }, + { + "epoch": 0.1364115704327279, + "grad_norm": 9.75682316947418, + "learning_rate": 4.908760505421748e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.3551724076271057, + "step": 135435 + }, + { + "epoch": 0.13641660648583206, + "grad_norm": 10.84387583303431, + "learning_rate": 4.9087499426230345e-05, + "loss": 2.134, + "mean_token_accuracy": 0.482758617401123, + "step": 135440 + }, + { + "epoch": 0.13642164253893624, + "grad_norm": 7.781143301341315, + "learning_rate": 4.908739379225582e-05, + "loss": 2.7764, + "mean_token_accuracy": 0.4103448331356049, + "step": 135445 + }, + { + "epoch": 0.1364266785920404, + "grad_norm": 9.715167308356557, + "learning_rate": 4.9087288152293956e-05, + "loss": 2.4715, + "mean_token_accuracy": 0.41034482419490814, + "step": 135450 + }, + { + "epoch": 0.1364317146451446, + "grad_norm": 10.339507817014834, + "learning_rate": 4.9087182506344773e-05, + "loss": 2.1105, + "mean_token_accuracy": 0.4620689690113068, + "step": 135455 + }, + { + "epoch": 0.13643675069824876, + "grad_norm": 11.267792314345673, + "learning_rate": 4.9087076854408315e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.4344827651977539, + "step": 135460 + }, + { + "epoch": 0.13644178675135293, + "grad_norm": 9.922119316205647, + "learning_rate": 4.908697119648459e-05, + "loss": 2.4497, + "mean_token_accuracy": 0.4896551787853241, + "step": 135465 + }, + { + "epoch": 0.1364468228044571, + "grad_norm": 15.993875423042248, + "learning_rate": 4.908686553257364e-05, + "loss": 2.8003, + "mean_token_accuracy": 0.3827586144208908, + "step": 135470 + }, + { + "epoch": 0.13645185885756128, + "grad_norm": 7.624076825591366, + "learning_rate": 4.9086759862675505e-05, + "loss": 2.1068, + "mean_token_accuracy": 0.47586206793785096, + "step": 135475 + }, + { + "epoch": 0.13645689491066545, + "grad_norm": 8.058550841209156, + "learning_rate": 4.9086654186790185e-05, + "loss": 2.2171, + "mean_token_accuracy": 0.4344827651977539, + "step": 135480 + }, + { + "epoch": 0.13646193096376963, + "grad_norm": 9.978191751112645, + "learning_rate": 4.908654850491773e-05, + "loss": 2.7062, + "mean_token_accuracy": 0.3551724076271057, + "step": 135485 + }, + { + "epoch": 0.1364669670168738, + "grad_norm": 11.453235491368275, + "learning_rate": 4.908644281705817e-05, + "loss": 2.9978, + "mean_token_accuracy": 0.3703569233417511, + "step": 135490 + }, + { + "epoch": 0.13647200306997798, + "grad_norm": 10.123248518917839, + "learning_rate": 4.9086337123211526e-05, + "loss": 2.6042, + "mean_token_accuracy": 0.4103448212146759, + "step": 135495 + }, + { + "epoch": 0.13647703912308215, + "grad_norm": 12.399090320080028, + "learning_rate": 4.908623142337784e-05, + "loss": 2.579, + "mean_token_accuracy": 0.4137930989265442, + "step": 135500 + }, + { + "epoch": 0.13648207517618632, + "grad_norm": 10.293412898312734, + "learning_rate": 4.9086125717557116e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.46896551847457885, + "step": 135505 + }, + { + "epoch": 0.1364871112292905, + "grad_norm": 11.832399098214186, + "learning_rate": 4.908602000574941e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.4068965494632721, + "step": 135510 + }, + { + "epoch": 0.13649214728239467, + "grad_norm": 10.30583612612166, + "learning_rate": 4.9085914287954734e-05, + "loss": 2.805, + "mean_token_accuracy": 0.36896551847457887, + "step": 135515 + }, + { + "epoch": 0.13649718333549885, + "grad_norm": 12.42619622049391, + "learning_rate": 4.908580856417314e-05, + "loss": 2.2246, + "mean_token_accuracy": 0.482758629322052, + "step": 135520 + }, + { + "epoch": 0.13650221938860302, + "grad_norm": 10.099012146219438, + "learning_rate": 4.908570283440462e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4241379380226135, + "step": 135525 + }, + { + "epoch": 0.1365072554417072, + "grad_norm": 21.67459570613669, + "learning_rate": 4.908559709864924e-05, + "loss": 2.7892, + "mean_token_accuracy": 0.39655172228813174, + "step": 135530 + }, + { + "epoch": 0.13651229149481137, + "grad_norm": 9.609873011820277, + "learning_rate": 4.908549135690701e-05, + "loss": 2.6407, + "mean_token_accuracy": 0.4172413766384125, + "step": 135535 + }, + { + "epoch": 0.13651732754791554, + "grad_norm": 10.385783878801465, + "learning_rate": 4.9085385609177966e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.46551724672317507, + "step": 135540 + }, + { + "epoch": 0.13652236360101971, + "grad_norm": 9.631095652365897, + "learning_rate": 4.908527985546213e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.4689655065536499, + "step": 135545 + }, + { + "epoch": 0.13652739965412386, + "grad_norm": 9.58935937001955, + "learning_rate": 4.908517409575953e-05, + "loss": 1.9288, + "mean_token_accuracy": 0.4965517222881317, + "step": 135550 + }, + { + "epoch": 0.13653243570722803, + "grad_norm": 9.380781398888034, + "learning_rate": 4.908506833007021e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.45517241954803467, + "step": 135555 + }, + { + "epoch": 0.1365374717603322, + "grad_norm": 11.484186725657539, + "learning_rate": 4.908496255839419e-05, + "loss": 2.5599, + "mean_token_accuracy": 0.4310344815254211, + "step": 135560 + }, + { + "epoch": 0.13654250781343638, + "grad_norm": 8.893936903036693, + "learning_rate": 4.90848567807315e-05, + "loss": 2.0975, + "mean_token_accuracy": 0.4517241358757019, + "step": 135565 + }, + { + "epoch": 0.13654754386654055, + "grad_norm": 8.709901800691684, + "learning_rate": 4.908475099708217e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.4551724135875702, + "step": 135570 + }, + { + "epoch": 0.13655257991964473, + "grad_norm": 10.24766166134284, + "learning_rate": 4.9084645207446227e-05, + "loss": 2.1881, + "mean_token_accuracy": 0.47931034564971925, + "step": 135575 + }, + { + "epoch": 0.1365576159727489, + "grad_norm": 10.756631531153516, + "learning_rate": 4.9084539411823706e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.3896551787853241, + "step": 135580 + }, + { + "epoch": 0.13656265202585308, + "grad_norm": 9.200164999961094, + "learning_rate": 4.908443361021463e-05, + "loss": 2.1445, + "mean_token_accuracy": 0.46896551847457885, + "step": 135585 + }, + { + "epoch": 0.13656768807895725, + "grad_norm": 12.552491377522674, + "learning_rate": 4.9084327802619026e-05, + "loss": 2.219, + "mean_token_accuracy": 0.441379314661026, + "step": 135590 + }, + { + "epoch": 0.13657272413206142, + "grad_norm": 15.657541026704987, + "learning_rate": 4.908422198903693e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.4275862157344818, + "step": 135595 + }, + { + "epoch": 0.1365777601851656, + "grad_norm": 11.511882973868785, + "learning_rate": 4.908411616946837e-05, + "loss": 2.5325, + "mean_token_accuracy": 0.45680580735206605, + "step": 135600 + }, + { + "epoch": 0.13658279623826977, + "grad_norm": 11.333998279274018, + "learning_rate": 4.908401034391338e-05, + "loss": 2.4372, + "mean_token_accuracy": 0.42413792610168455, + "step": 135605 + }, + { + "epoch": 0.13658783229137395, + "grad_norm": 11.38586177226549, + "learning_rate": 4.908390451237198e-05, + "loss": 2.1006, + "mean_token_accuracy": 0.45517241954803467, + "step": 135610 + }, + { + "epoch": 0.13659286834447812, + "grad_norm": 10.296699962451797, + "learning_rate": 4.908379867484421e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.4103448212146759, + "step": 135615 + }, + { + "epoch": 0.1365979043975823, + "grad_norm": 14.543260602105354, + "learning_rate": 4.9083692831330085e-05, + "loss": 2.5569, + "mean_token_accuracy": 0.45862067937850953, + "step": 135620 + }, + { + "epoch": 0.13660294045068647, + "grad_norm": 12.509600024158017, + "learning_rate": 4.908358698182964e-05, + "loss": 2.5598, + "mean_token_accuracy": 0.4332123339176178, + "step": 135625 + }, + { + "epoch": 0.13660797650379064, + "grad_norm": 9.296517733305533, + "learning_rate": 4.908348112634292e-05, + "loss": 2.067, + "mean_token_accuracy": 0.48481547832489014, + "step": 135630 + }, + { + "epoch": 0.13661301255689481, + "grad_norm": 10.609535722223866, + "learning_rate": 4.908337526486993e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.4551724135875702, + "step": 135635 + }, + { + "epoch": 0.136618048609999, + "grad_norm": 11.191355586371886, + "learning_rate": 4.908326939741071e-05, + "loss": 2.6168, + "mean_token_accuracy": 0.3896551728248596, + "step": 135640 + }, + { + "epoch": 0.13662308466310316, + "grad_norm": 9.410607713723122, + "learning_rate": 4.908316352396529e-05, + "loss": 2.1183, + "mean_token_accuracy": 0.458620685338974, + "step": 135645 + }, + { + "epoch": 0.13662812071620734, + "grad_norm": 11.713432375302173, + "learning_rate": 4.908305764453371e-05, + "loss": 2.4753, + "mean_token_accuracy": 0.41724138259887694, + "step": 135650 + }, + { + "epoch": 0.1366331567693115, + "grad_norm": 10.460759250891593, + "learning_rate": 4.908295175911597e-05, + "loss": 2.2784, + "mean_token_accuracy": 0.4620689630508423, + "step": 135655 + }, + { + "epoch": 0.13663819282241568, + "grad_norm": 10.518883913201156, + "learning_rate": 4.908284586771213e-05, + "loss": 2.647, + "mean_token_accuracy": 0.36896551847457887, + "step": 135660 + }, + { + "epoch": 0.13664322887551986, + "grad_norm": 10.219987220055971, + "learning_rate": 4.908273997032221e-05, + "loss": 2.3712, + "mean_token_accuracy": 0.41379310488700866, + "step": 135665 + }, + { + "epoch": 0.13664826492862403, + "grad_norm": 11.685848597574024, + "learning_rate": 4.908263406694623e-05, + "loss": 2.5529, + "mean_token_accuracy": 0.4068965554237366, + "step": 135670 + }, + { + "epoch": 0.1366533009817282, + "grad_norm": 12.678774976506544, + "learning_rate": 4.9082528157584225e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.4448275864124298, + "step": 135675 + }, + { + "epoch": 0.13665833703483238, + "grad_norm": 10.059582428515721, + "learning_rate": 4.9082422242236234e-05, + "loss": 2.5278, + "mean_token_accuracy": 0.4676950991153717, + "step": 135680 + }, + { + "epoch": 0.13666337308793652, + "grad_norm": 11.103417334819177, + "learning_rate": 4.908231632090227e-05, + "loss": 2.5446, + "mean_token_accuracy": 0.3988505780696869, + "step": 135685 + }, + { + "epoch": 0.1366684091410407, + "grad_norm": 11.509387331554962, + "learning_rate": 4.9082210393582366e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.39655172228813174, + "step": 135690 + }, + { + "epoch": 0.13667344519414487, + "grad_norm": 10.843360639643176, + "learning_rate": 4.9082104460276565e-05, + "loss": 2.5729, + "mean_token_accuracy": 0.4068965494632721, + "step": 135695 + }, + { + "epoch": 0.13667848124724905, + "grad_norm": 12.800319408649564, + "learning_rate": 4.9081998520984877e-05, + "loss": 2.5334, + "mean_token_accuracy": 0.42413793206214906, + "step": 135700 + }, + { + "epoch": 0.13668351730035322, + "grad_norm": 8.56434919795878, + "learning_rate": 4.908189257570735e-05, + "loss": 1.879, + "mean_token_accuracy": 0.5379310309886932, + "step": 135705 + }, + { + "epoch": 0.1366885533534574, + "grad_norm": 10.938561242395497, + "learning_rate": 4.9081786624444e-05, + "loss": 2.5053, + "mean_token_accuracy": 0.3758620619773865, + "step": 135710 + }, + { + "epoch": 0.13669358940656157, + "grad_norm": 10.368933791296365, + "learning_rate": 4.908168066719486e-05, + "loss": 2.6473, + "mean_token_accuracy": 0.41149425506591797, + "step": 135715 + }, + { + "epoch": 0.13669862545966574, + "grad_norm": 16.352253388168233, + "learning_rate": 4.908157470395997e-05, + "loss": 2.7023, + "mean_token_accuracy": 0.43103447556495667, + "step": 135720 + }, + { + "epoch": 0.13670366151276991, + "grad_norm": 8.160908149039999, + "learning_rate": 4.9081468734739336e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.44482758045196535, + "step": 135725 + }, + { + "epoch": 0.1367086975658741, + "grad_norm": 10.729100064138116, + "learning_rate": 4.908136275953301e-05, + "loss": 1.7761, + "mean_token_accuracy": 0.5401693999767303, + "step": 135730 + }, + { + "epoch": 0.13671373361897826, + "grad_norm": 12.256194451201003, + "learning_rate": 4.908125677834101e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.4068965554237366, + "step": 135735 + }, + { + "epoch": 0.13671876967208244, + "grad_norm": 10.080527200540951, + "learning_rate": 4.908115079116337e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.46394434571266174, + "step": 135740 + }, + { + "epoch": 0.1367238057251866, + "grad_norm": 8.503494370737384, + "learning_rate": 4.9081044798000114e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.47931033968925474, + "step": 135745 + }, + { + "epoch": 0.13672884177829078, + "grad_norm": 12.39321165001928, + "learning_rate": 4.908093879885128e-05, + "loss": 2.4709, + "mean_token_accuracy": 0.41379311084747317, + "step": 135750 + }, + { + "epoch": 0.13673387783139496, + "grad_norm": 10.89923452419791, + "learning_rate": 4.908083279371688e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.4137930929660797, + "step": 135755 + }, + { + "epoch": 0.13673891388449913, + "grad_norm": 11.085570439455482, + "learning_rate": 4.9080726782596966e-05, + "loss": 2.0162, + "mean_token_accuracy": 0.5034482717514038, + "step": 135760 + }, + { + "epoch": 0.1367439499376033, + "grad_norm": 10.47726377049291, + "learning_rate": 4.908062076549155e-05, + "loss": 2.1781, + "mean_token_accuracy": 0.4793103516101837, + "step": 135765 + }, + { + "epoch": 0.13674898599070748, + "grad_norm": 12.95445687510898, + "learning_rate": 4.908051474240068e-05, + "loss": 2.4998, + "mean_token_accuracy": 0.4241379380226135, + "step": 135770 + }, + { + "epoch": 0.13675402204381165, + "grad_norm": 9.112381986491494, + "learning_rate": 4.9080408713324365e-05, + "loss": 1.922, + "mean_token_accuracy": 0.5034482717514038, + "step": 135775 + }, + { + "epoch": 0.13675905809691583, + "grad_norm": 8.486942946990604, + "learning_rate": 4.9080302678262646e-05, + "loss": 2.7234, + "mean_token_accuracy": 0.477918928861618, + "step": 135780 + }, + { + "epoch": 0.13676409415002, + "grad_norm": 10.81563756380048, + "learning_rate": 4.9080196637215555e-05, + "loss": 2.2595, + "mean_token_accuracy": 0.4662561535835266, + "step": 135785 + }, + { + "epoch": 0.13676913020312417, + "grad_norm": 12.023642180534015, + "learning_rate": 4.9080090590183105e-05, + "loss": 2.6414, + "mean_token_accuracy": 0.43103448748588563, + "step": 135790 + }, + { + "epoch": 0.13677416625622835, + "grad_norm": 11.336626388819141, + "learning_rate": 4.907998453716534e-05, + "loss": 2.1673, + "mean_token_accuracy": 0.4551724135875702, + "step": 135795 + }, + { + "epoch": 0.13677920230933252, + "grad_norm": 9.939559783102764, + "learning_rate": 4.9079878478162286e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.4189957737922668, + "step": 135800 + }, + { + "epoch": 0.1367842383624367, + "grad_norm": 9.15824023565933, + "learning_rate": 4.907977241317397e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.38965516686439516, + "step": 135805 + }, + { + "epoch": 0.13678927441554087, + "grad_norm": 10.574219774939547, + "learning_rate": 4.9079666342200434e-05, + "loss": 2.7353, + "mean_token_accuracy": 0.4137930989265442, + "step": 135810 + }, + { + "epoch": 0.13679431046864504, + "grad_norm": 10.913841034023124, + "learning_rate": 4.907956026524169e-05, + "loss": 2.9818, + "mean_token_accuracy": 0.3862068891525269, + "step": 135815 + }, + { + "epoch": 0.13679934652174922, + "grad_norm": 9.492111242102373, + "learning_rate": 4.9079454182297764e-05, + "loss": 2.1097, + "mean_token_accuracy": 0.4896551728248596, + "step": 135820 + }, + { + "epoch": 0.13680438257485336, + "grad_norm": 11.172785300326586, + "learning_rate": 4.9079348093368714e-05, + "loss": 2.2607, + "mean_token_accuracy": 0.4655172348022461, + "step": 135825 + }, + { + "epoch": 0.13680941862795754, + "grad_norm": 12.09329985211726, + "learning_rate": 4.907924199845454e-05, + "loss": 2.5087, + "mean_token_accuracy": 0.4379310429096222, + "step": 135830 + }, + { + "epoch": 0.1368144546810617, + "grad_norm": 12.114663388722757, + "learning_rate": 4.907913589755528e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.3620689630508423, + "step": 135835 + }, + { + "epoch": 0.13681949073416588, + "grad_norm": 12.065124300662013, + "learning_rate": 4.907902979067097e-05, + "loss": 2.3438, + "mean_token_accuracy": 0.4448275864124298, + "step": 135840 + }, + { + "epoch": 0.13682452678727006, + "grad_norm": 9.494257844873646, + "learning_rate": 4.9078923677801644e-05, + "loss": 2.1547, + "mean_token_accuracy": 0.441379314661026, + "step": 135845 + }, + { + "epoch": 0.13682956284037423, + "grad_norm": 8.251131335075153, + "learning_rate": 4.907881755894732e-05, + "loss": 2.2371, + "mean_token_accuracy": 0.44827585220336913, + "step": 135850 + }, + { + "epoch": 0.1368345988934784, + "grad_norm": 10.496312156606233, + "learning_rate": 4.907871143410802e-05, + "loss": 2.45, + "mean_token_accuracy": 0.4551724135875702, + "step": 135855 + }, + { + "epoch": 0.13683963494658258, + "grad_norm": 10.704950881211056, + "learning_rate": 4.9078605303283794e-05, + "loss": 2.7004, + "mean_token_accuracy": 0.41724138259887694, + "step": 135860 + }, + { + "epoch": 0.13684467099968675, + "grad_norm": 11.511399896788891, + "learning_rate": 4.907849916647466e-05, + "loss": 2.4837, + "mean_token_accuracy": 0.417241370677948, + "step": 135865 + }, + { + "epoch": 0.13684970705279093, + "grad_norm": 15.016114437666047, + "learning_rate": 4.9078393023680644e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.4655172348022461, + "step": 135870 + }, + { + "epoch": 0.1368547431058951, + "grad_norm": 10.0134265843284, + "learning_rate": 4.907828687490178e-05, + "loss": 2.4789, + "mean_token_accuracy": 0.4206896543502808, + "step": 135875 + }, + { + "epoch": 0.13685977915899927, + "grad_norm": 9.225585068947067, + "learning_rate": 4.907818072013811e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.4241379380226135, + "step": 135880 + }, + { + "epoch": 0.13686481521210345, + "grad_norm": 9.947984305211662, + "learning_rate": 4.907807455938964e-05, + "loss": 2.3211, + "mean_token_accuracy": 0.4551724135875702, + "step": 135885 + }, + { + "epoch": 0.13686985126520762, + "grad_norm": 10.864506555676126, + "learning_rate": 4.9077968392656406e-05, + "loss": 2.1528, + "mean_token_accuracy": 0.4620689630508423, + "step": 135890 + }, + { + "epoch": 0.1368748873183118, + "grad_norm": 11.536300710192116, + "learning_rate": 4.907786221993845e-05, + "loss": 2.8137, + "mean_token_accuracy": 0.3551724135875702, + "step": 135895 + }, + { + "epoch": 0.13687992337141597, + "grad_norm": 11.514153017691001, + "learning_rate": 4.907775604123579e-05, + "loss": 2.2797, + "mean_token_accuracy": 0.4517241418361664, + "step": 135900 + }, + { + "epoch": 0.13688495942452014, + "grad_norm": 9.5683650319034, + "learning_rate": 4.907764985654846e-05, + "loss": 2.1909, + "mean_token_accuracy": 0.4482758641242981, + "step": 135905 + }, + { + "epoch": 0.13688999547762432, + "grad_norm": 11.143735614080876, + "learning_rate": 4.907754366587649e-05, + "loss": 2.4838, + "mean_token_accuracy": 0.4344827651977539, + "step": 135910 + }, + { + "epoch": 0.1368950315307285, + "grad_norm": 10.645569250971965, + "learning_rate": 4.907743746921991e-05, + "loss": 2.2982, + "mean_token_accuracy": 0.4935269236564636, + "step": 135915 + }, + { + "epoch": 0.13690006758383266, + "grad_norm": 13.127273468234945, + "learning_rate": 4.907733126657874e-05, + "loss": 2.4373, + "mean_token_accuracy": 0.41379311084747317, + "step": 135920 + }, + { + "epoch": 0.13690510363693684, + "grad_norm": 9.508404774728922, + "learning_rate": 4.9077225057953025e-05, + "loss": 2.5469, + "mean_token_accuracy": 0.39310344457626345, + "step": 135925 + }, + { + "epoch": 0.136910139690041, + "grad_norm": 11.144393319841221, + "learning_rate": 4.9077118843342785e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.44271021485328677, + "step": 135930 + }, + { + "epoch": 0.13691517574314518, + "grad_norm": 13.097881625104053, + "learning_rate": 4.9077012622748046e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.4354679763317108, + "step": 135935 + }, + { + "epoch": 0.13692021179624936, + "grad_norm": 9.62386253153449, + "learning_rate": 4.9076906396168843e-05, + "loss": 2.1662, + "mean_token_accuracy": 0.39655172228813174, + "step": 135940 + }, + { + "epoch": 0.13692524784935353, + "grad_norm": 9.067579986071935, + "learning_rate": 4.9076800163605204e-05, + "loss": 2.6508, + "mean_token_accuracy": 0.4379310250282288, + "step": 135945 + }, + { + "epoch": 0.1369302839024577, + "grad_norm": 9.645249132035138, + "learning_rate": 4.907669392505716e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.43103448748588563, + "step": 135950 + }, + { + "epoch": 0.13693531995556188, + "grad_norm": 11.687398053373709, + "learning_rate": 4.907658768052474e-05, + "loss": 2.1456, + "mean_token_accuracy": 0.5068965435028077, + "step": 135955 + }, + { + "epoch": 0.13694035600866605, + "grad_norm": 9.432984148265975, + "learning_rate": 4.9076481430007976e-05, + "loss": 2.3607, + "mean_token_accuracy": 0.4034482777118683, + "step": 135960 + }, + { + "epoch": 0.1369453920617702, + "grad_norm": 8.841447470051627, + "learning_rate": 4.907637517350688e-05, + "loss": 2.1675, + "mean_token_accuracy": 0.44482759237289426, + "step": 135965 + }, + { + "epoch": 0.13695042811487437, + "grad_norm": 12.175639530744665, + "learning_rate": 4.907626891102152e-05, + "loss": 2.5576, + "mean_token_accuracy": 0.4068965494632721, + "step": 135970 + }, + { + "epoch": 0.13695546416797855, + "grad_norm": 14.827176017455397, + "learning_rate": 4.907616264255188e-05, + "loss": 3.1147, + "mean_token_accuracy": 0.3758620619773865, + "step": 135975 + }, + { + "epoch": 0.13696050022108272, + "grad_norm": 11.234839425148808, + "learning_rate": 4.907605636809802e-05, + "loss": 3.0248, + "mean_token_accuracy": 0.3896551728248596, + "step": 135980 + }, + { + "epoch": 0.1369655362741869, + "grad_norm": 12.574025625243491, + "learning_rate": 4.9075950087659954e-05, + "loss": 2.5381, + "mean_token_accuracy": 0.37241379022598264, + "step": 135985 + }, + { + "epoch": 0.13697057232729107, + "grad_norm": 7.804242184151293, + "learning_rate": 4.907584380123773e-05, + "loss": 2.0614, + "mean_token_accuracy": 0.44482758045196535, + "step": 135990 + }, + { + "epoch": 0.13697560838039524, + "grad_norm": 9.780675150070724, + "learning_rate": 4.9075737508831356e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4448275864124298, + "step": 135995 + }, + { + "epoch": 0.13698064443349942, + "grad_norm": 9.572254974878895, + "learning_rate": 4.907563121044088e-05, + "loss": 2.2885, + "mean_token_accuracy": 0.47586206793785096, + "step": 136000 + }, + { + "epoch": 0.1369856804866036, + "grad_norm": 10.042028799340086, + "learning_rate": 4.9075524906066306e-05, + "loss": 2.3757, + "mean_token_accuracy": 0.4137930989265442, + "step": 136005 + }, + { + "epoch": 0.13699071653970776, + "grad_norm": 11.119454788986793, + "learning_rate": 4.907541859570769e-05, + "loss": 2.1566, + "mean_token_accuracy": 0.4551724135875702, + "step": 136010 + }, + { + "epoch": 0.13699575259281194, + "grad_norm": 11.495457951447007, + "learning_rate": 4.9075312279365046e-05, + "loss": 2.1571, + "mean_token_accuracy": 0.46551724076271056, + "step": 136015 + }, + { + "epoch": 0.1370007886459161, + "grad_norm": 10.886745299393255, + "learning_rate": 4.907520595703841e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.4241379201412201, + "step": 136020 + }, + { + "epoch": 0.13700582469902028, + "grad_norm": 11.340735266264652, + "learning_rate": 4.9075099628727814e-05, + "loss": 2.088, + "mean_token_accuracy": 0.46382335424423216, + "step": 136025 + }, + { + "epoch": 0.13701086075212446, + "grad_norm": 9.706314110154384, + "learning_rate": 4.907499329443329e-05, + "loss": 2.3392, + "mean_token_accuracy": 0.3965517282485962, + "step": 136030 + }, + { + "epoch": 0.13701589680522863, + "grad_norm": 8.527116252600417, + "learning_rate": 4.907488695415485e-05, + "loss": 2.4207, + "mean_token_accuracy": 0.441379314661026, + "step": 136035 + }, + { + "epoch": 0.1370209328583328, + "grad_norm": 11.010879816877619, + "learning_rate": 4.907478060789253e-05, + "loss": 2.4459, + "mean_token_accuracy": 0.4517241299152374, + "step": 136040 + }, + { + "epoch": 0.13702596891143698, + "grad_norm": 12.25871214004453, + "learning_rate": 4.907467425564638e-05, + "loss": 2.4374, + "mean_token_accuracy": 0.4068965494632721, + "step": 136045 + }, + { + "epoch": 0.13703100496454115, + "grad_norm": 9.928917428190028, + "learning_rate": 4.90745678974164e-05, + "loss": 2.2436, + "mean_token_accuracy": 0.4172413766384125, + "step": 136050 + }, + { + "epoch": 0.13703604101764533, + "grad_norm": 11.114040014995396, + "learning_rate": 4.9074461533202645e-05, + "loss": 2.2276, + "mean_token_accuracy": 0.42413793206214906, + "step": 136055 + }, + { + "epoch": 0.1370410770707495, + "grad_norm": 9.267364456394278, + "learning_rate": 4.9074355163005124e-05, + "loss": 2.2421, + "mean_token_accuracy": 0.44482758045196535, + "step": 136060 + }, + { + "epoch": 0.13704611312385367, + "grad_norm": 10.816579715589292, + "learning_rate": 4.907424878682388e-05, + "loss": 2.1085, + "mean_token_accuracy": 0.4862068951129913, + "step": 136065 + }, + { + "epoch": 0.13705114917695785, + "grad_norm": 9.816683275266303, + "learning_rate": 4.907414240465893e-05, + "loss": 2.5691, + "mean_token_accuracy": 0.3517241358757019, + "step": 136070 + }, + { + "epoch": 0.13705618523006202, + "grad_norm": 9.462703960918113, + "learning_rate": 4.9074036016510315e-05, + "loss": 2.2923, + "mean_token_accuracy": 0.48620688915252686, + "step": 136075 + }, + { + "epoch": 0.1370612212831662, + "grad_norm": 11.611679605605419, + "learning_rate": 4.907392962237807e-05, + "loss": 2.5785, + "mean_token_accuracy": 0.39310344457626345, + "step": 136080 + }, + { + "epoch": 0.13706625733627037, + "grad_norm": 10.557758545073588, + "learning_rate": 4.9073823222262205e-05, + "loss": 2.3001, + "mean_token_accuracy": 0.44137930274009707, + "step": 136085 + }, + { + "epoch": 0.13707129338937454, + "grad_norm": 9.433129599599724, + "learning_rate": 4.9073716816162765e-05, + "loss": 2.2114, + "mean_token_accuracy": 0.441379314661026, + "step": 136090 + }, + { + "epoch": 0.13707632944247872, + "grad_norm": 13.690244724965458, + "learning_rate": 4.907361040407977e-05, + "loss": 2.6409, + "mean_token_accuracy": 0.4206896543502808, + "step": 136095 + }, + { + "epoch": 0.1370813654955829, + "grad_norm": 8.524433838375618, + "learning_rate": 4.9073503986013244e-05, + "loss": 2.3354, + "mean_token_accuracy": 0.4620689630508423, + "step": 136100 + }, + { + "epoch": 0.13708640154868704, + "grad_norm": 11.07018864312266, + "learning_rate": 4.9073397561963245e-05, + "loss": 2.3782, + "mean_token_accuracy": 0.4413793087005615, + "step": 136105 + }, + { + "epoch": 0.1370914376017912, + "grad_norm": 12.046695789768876, + "learning_rate": 4.9073291131929764e-05, + "loss": 2.3822, + "mean_token_accuracy": 0.42068966031074523, + "step": 136110 + }, + { + "epoch": 0.13709647365489538, + "grad_norm": 12.816600202262455, + "learning_rate": 4.907318469591286e-05, + "loss": 2.4926, + "mean_token_accuracy": 0.41724137365818026, + "step": 136115 + }, + { + "epoch": 0.13710150970799956, + "grad_norm": 10.342952437291279, + "learning_rate": 4.907307825391256e-05, + "loss": 2.5264, + "mean_token_accuracy": 0.42413793206214906, + "step": 136120 + }, + { + "epoch": 0.13710654576110373, + "grad_norm": 11.30941781606829, + "learning_rate": 4.907297180592889e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.4344827592372894, + "step": 136125 + }, + { + "epoch": 0.1371115818142079, + "grad_norm": 9.542453894523126, + "learning_rate": 4.907286535196186e-05, + "loss": 2.0848, + "mean_token_accuracy": 0.458620685338974, + "step": 136130 + }, + { + "epoch": 0.13711661786731208, + "grad_norm": 9.272627075370915, + "learning_rate": 4.907275889201152e-05, + "loss": 2.2037, + "mean_token_accuracy": 0.4620689690113068, + "step": 136135 + }, + { + "epoch": 0.13712165392041625, + "grad_norm": 12.109606398340768, + "learning_rate": 4.9072652426077905e-05, + "loss": 2.7207, + "mean_token_accuracy": 0.38965516090393065, + "step": 136140 + }, + { + "epoch": 0.13712668997352043, + "grad_norm": 12.705608967016135, + "learning_rate": 4.907254595416102e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.40344828367233276, + "step": 136145 + }, + { + "epoch": 0.1371317260266246, + "grad_norm": 10.686500966298913, + "learning_rate": 4.907243947626092e-05, + "loss": 2.2968, + "mean_token_accuracy": 0.44827585220336913, + "step": 136150 + }, + { + "epoch": 0.13713676207972877, + "grad_norm": 11.47754341045721, + "learning_rate": 4.907233299237761e-05, + "loss": 2.501, + "mean_token_accuracy": 0.38965516686439516, + "step": 136155 + }, + { + "epoch": 0.13714179813283295, + "grad_norm": 8.142258236729274, + "learning_rate": 4.9072226502511144e-05, + "loss": 2.2272, + "mean_token_accuracy": 0.4931034505367279, + "step": 136160 + }, + { + "epoch": 0.13714683418593712, + "grad_norm": 10.340973576528922, + "learning_rate": 4.907212000666154e-05, + "loss": 2.1338, + "mean_token_accuracy": 0.4482758641242981, + "step": 136165 + }, + { + "epoch": 0.1371518702390413, + "grad_norm": 9.353108202392926, + "learning_rate": 4.907201350482882e-05, + "loss": 2.15, + "mean_token_accuracy": 0.4517241299152374, + "step": 136170 + }, + { + "epoch": 0.13715690629214547, + "grad_norm": 10.242239453726352, + "learning_rate": 4.907190699701303e-05, + "loss": 2.0592, + "mean_token_accuracy": 0.49655172824859617, + "step": 136175 + }, + { + "epoch": 0.13716194234524964, + "grad_norm": 9.42138167629758, + "learning_rate": 4.9071800483214183e-05, + "loss": 2.404, + "mean_token_accuracy": 0.4206896424293518, + "step": 136180 + }, + { + "epoch": 0.13716697839835382, + "grad_norm": 9.708814390341226, + "learning_rate": 4.907169396343233e-05, + "loss": 2.5426, + "mean_token_accuracy": 0.4068965494632721, + "step": 136185 + }, + { + "epoch": 0.137172014451458, + "grad_norm": 16.2530589417657, + "learning_rate": 4.9071587437667484e-05, + "loss": 2.5089, + "mean_token_accuracy": 0.403448286652565, + "step": 136190 + }, + { + "epoch": 0.13717705050456216, + "grad_norm": 14.518727612001474, + "learning_rate": 4.907148090591967e-05, + "loss": 2.3871, + "mean_token_accuracy": 0.4689655065536499, + "step": 136195 + }, + { + "epoch": 0.13718208655766634, + "grad_norm": 9.524263509919313, + "learning_rate": 4.907137436818893e-05, + "loss": 2.6311, + "mean_token_accuracy": 0.42238354682922363, + "step": 136200 + }, + { + "epoch": 0.1371871226107705, + "grad_norm": 9.362826799482935, + "learning_rate": 4.907126782447528e-05, + "loss": 2.2835, + "mean_token_accuracy": 0.38275861740112305, + "step": 136205 + }, + { + "epoch": 0.13719215866387469, + "grad_norm": 7.482705876092103, + "learning_rate": 4.907116127477877e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.4745916426181793, + "step": 136210 + }, + { + "epoch": 0.13719719471697886, + "grad_norm": 8.38619159968426, + "learning_rate": 4.9071054719099414e-05, + "loss": 2.2208, + "mean_token_accuracy": 0.4620689630508423, + "step": 136215 + }, + { + "epoch": 0.13720223077008303, + "grad_norm": 12.394507897619473, + "learning_rate": 4.9070948157437244e-05, + "loss": 2.4786, + "mean_token_accuracy": 0.3793103516101837, + "step": 136220 + }, + { + "epoch": 0.1372072668231872, + "grad_norm": 9.408778307993797, + "learning_rate": 4.907084158979229e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.41724138855934145, + "step": 136225 + }, + { + "epoch": 0.13721230287629138, + "grad_norm": 10.934104177259798, + "learning_rate": 4.907073501616459e-05, + "loss": 2.4992, + "mean_token_accuracy": 0.41034482717514037, + "step": 136230 + }, + { + "epoch": 0.13721733892939555, + "grad_norm": 11.972905825989562, + "learning_rate": 4.9070628436554155e-05, + "loss": 2.2579, + "mean_token_accuracy": 0.4482758641242981, + "step": 136235 + }, + { + "epoch": 0.13722237498249973, + "grad_norm": 9.550701547007698, + "learning_rate": 4.9070521850961027e-05, + "loss": 2.1434, + "mean_token_accuracy": 0.49570477604866026, + "step": 136240 + }, + { + "epoch": 0.13722741103560387, + "grad_norm": 9.009356377795768, + "learning_rate": 4.9070415259385244e-05, + "loss": 2.4399, + "mean_token_accuracy": 0.43103447556495667, + "step": 136245 + }, + { + "epoch": 0.13723244708870805, + "grad_norm": 10.337365230336474, + "learning_rate": 4.907030866182683e-05, + "loss": 2.6053, + "mean_token_accuracy": 0.38275861740112305, + "step": 136250 + }, + { + "epoch": 0.13723748314181222, + "grad_norm": 12.734087246568043, + "learning_rate": 4.907020205828579e-05, + "loss": 2.4927, + "mean_token_accuracy": 0.43623715043067934, + "step": 136255 + }, + { + "epoch": 0.1372425191949164, + "grad_norm": 12.035540202230168, + "learning_rate": 4.9070095448762185e-05, + "loss": 2.5144, + "mean_token_accuracy": 0.41034482717514037, + "step": 136260 + }, + { + "epoch": 0.13724755524802057, + "grad_norm": 11.692116725575898, + "learning_rate": 4.9069988833256034e-05, + "loss": 2.5743, + "mean_token_accuracy": 0.44482759237289426, + "step": 136265 + }, + { + "epoch": 0.13725259130112474, + "grad_norm": 10.151920373961394, + "learning_rate": 4.9069882211767364e-05, + "loss": 3.0067, + "mean_token_accuracy": 0.3620689630508423, + "step": 136270 + }, + { + "epoch": 0.13725762735422892, + "grad_norm": 11.77063418645305, + "learning_rate": 4.906977558429621e-05, + "loss": 2.5996, + "mean_token_accuracy": 0.41379310488700866, + "step": 136275 + }, + { + "epoch": 0.1372626634073331, + "grad_norm": 14.13691016072988, + "learning_rate": 4.9069668950842596e-05, + "loss": 2.5965, + "mean_token_accuracy": 0.43448275327682495, + "step": 136280 + }, + { + "epoch": 0.13726769946043726, + "grad_norm": 11.029586573691278, + "learning_rate": 4.906956231140656e-05, + "loss": 2.4327, + "mean_token_accuracy": 0.38275861740112305, + "step": 136285 + }, + { + "epoch": 0.13727273551354144, + "grad_norm": 8.751896540793833, + "learning_rate": 4.906945566598812e-05, + "loss": 2.2932, + "mean_token_accuracy": 0.44827585816383364, + "step": 136290 + }, + { + "epoch": 0.1372777715666456, + "grad_norm": 10.977420043212653, + "learning_rate": 4.9069349014587305e-05, + "loss": 1.9976, + "mean_token_accuracy": 0.5034482717514038, + "step": 136295 + }, + { + "epoch": 0.13728280761974979, + "grad_norm": 13.143539726540567, + "learning_rate": 4.9069242357204164e-05, + "loss": 2.8354, + "mean_token_accuracy": 0.41034482717514037, + "step": 136300 + }, + { + "epoch": 0.13728784367285396, + "grad_norm": 13.561305320310327, + "learning_rate": 4.90691356938387e-05, + "loss": 2.6335, + "mean_token_accuracy": 0.43103448748588563, + "step": 136305 + }, + { + "epoch": 0.13729287972595813, + "grad_norm": 11.021052340221079, + "learning_rate": 4.906902902449096e-05, + "loss": 2.0098, + "mean_token_accuracy": 0.5124016880989075, + "step": 136310 + }, + { + "epoch": 0.1372979157790623, + "grad_norm": 8.86168839029277, + "learning_rate": 4.9068922349160974e-05, + "loss": 2.021, + "mean_token_accuracy": 0.4620689630508423, + "step": 136315 + }, + { + "epoch": 0.13730295183216648, + "grad_norm": 11.391912999731778, + "learning_rate": 4.9068815667848766e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.493103438615799, + "step": 136320 + }, + { + "epoch": 0.13730798788527065, + "grad_norm": 14.714220771487728, + "learning_rate": 4.9068708980554364e-05, + "loss": 2.6496, + "mean_token_accuracy": 0.3985480904579163, + "step": 136325 + }, + { + "epoch": 0.13731302393837483, + "grad_norm": 9.94891545761921, + "learning_rate": 4.90686022872778e-05, + "loss": 2.3421, + "mean_token_accuracy": 0.44827585816383364, + "step": 136330 + }, + { + "epoch": 0.137318059991479, + "grad_norm": 12.7171115352992, + "learning_rate": 4.90684955880191e-05, + "loss": 2.46, + "mean_token_accuracy": 0.4137930989265442, + "step": 136335 + }, + { + "epoch": 0.13732309604458318, + "grad_norm": 10.990955599204508, + "learning_rate": 4.906838888277831e-05, + "loss": 2.5245, + "mean_token_accuracy": 0.4034482777118683, + "step": 136340 + }, + { + "epoch": 0.13732813209768735, + "grad_norm": 10.387655015171548, + "learning_rate": 4.9068282171555434e-05, + "loss": 2.5266, + "mean_token_accuracy": 0.3965517282485962, + "step": 136345 + }, + { + "epoch": 0.13733316815079152, + "grad_norm": 12.952437233026501, + "learning_rate": 4.9068175454350516e-05, + "loss": 2.0946, + "mean_token_accuracy": 0.43793103098869324, + "step": 136350 + }, + { + "epoch": 0.1373382042038957, + "grad_norm": 8.274698151445026, + "learning_rate": 4.906806873116359e-05, + "loss": 2.3992, + "mean_token_accuracy": 0.41724138259887694, + "step": 136355 + }, + { + "epoch": 0.13734324025699987, + "grad_norm": 10.989499560781473, + "learning_rate": 4.9067962001994676e-05, + "loss": 2.0467, + "mean_token_accuracy": 0.49999999403953554, + "step": 136360 + }, + { + "epoch": 0.13734827631010404, + "grad_norm": 12.520432452312585, + "learning_rate": 4.906785526684381e-05, + "loss": 2.8514, + "mean_token_accuracy": 0.379310342669487, + "step": 136365 + }, + { + "epoch": 0.13735331236320822, + "grad_norm": 11.070790068735144, + "learning_rate": 4.9067748525711015e-05, + "loss": 2.4084, + "mean_token_accuracy": 0.39655172228813174, + "step": 136370 + }, + { + "epoch": 0.1373583484163124, + "grad_norm": 9.792638919538321, + "learning_rate": 4.906764177859633e-05, + "loss": 2.3621, + "mean_token_accuracy": 0.4775559604167938, + "step": 136375 + }, + { + "epoch": 0.13736338446941657, + "grad_norm": 10.601379319490368, + "learning_rate": 4.9067535025499764e-05, + "loss": 2.405, + "mean_token_accuracy": 0.4310344815254211, + "step": 136380 + }, + { + "epoch": 0.1373684205225207, + "grad_norm": 12.32227299803304, + "learning_rate": 4.9067428266421387e-05, + "loss": 2.7521, + "mean_token_accuracy": 0.36896551847457887, + "step": 136385 + }, + { + "epoch": 0.13737345657562489, + "grad_norm": 12.86883440409392, + "learning_rate": 4.906732150136119e-05, + "loss": 2.359, + "mean_token_accuracy": 0.42068966031074523, + "step": 136390 + }, + { + "epoch": 0.13737849262872906, + "grad_norm": 10.581922677303655, + "learning_rate": 4.906721473031921e-05, + "loss": 2.4041, + "mean_token_accuracy": 0.38620689511299133, + "step": 136395 + }, + { + "epoch": 0.13738352868183323, + "grad_norm": 12.51682623500794, + "learning_rate": 4.906710795329549e-05, + "loss": 2.2267, + "mean_token_accuracy": 0.47586206793785096, + "step": 136400 + }, + { + "epoch": 0.1373885647349374, + "grad_norm": 11.540444447718269, + "learning_rate": 4.906700117029005e-05, + "loss": 2.614, + "mean_token_accuracy": 0.42565032839775085, + "step": 136405 + }, + { + "epoch": 0.13739360078804158, + "grad_norm": 12.363266223349033, + "learning_rate": 4.906689438130292e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.42413793206214906, + "step": 136410 + }, + { + "epoch": 0.13739863684114575, + "grad_norm": 9.81090070323206, + "learning_rate": 4.906678758633413e-05, + "loss": 2.922, + "mean_token_accuracy": 0.35862069129943847, + "step": 136415 + }, + { + "epoch": 0.13740367289424993, + "grad_norm": 11.301183879033823, + "learning_rate": 4.9066680785383715e-05, + "loss": 2.1516, + "mean_token_accuracy": 0.46551724672317507, + "step": 136420 + }, + { + "epoch": 0.1374087089473541, + "grad_norm": 10.273330544556877, + "learning_rate": 4.90665739784517e-05, + "loss": 2.8287, + "mean_token_accuracy": 0.3965517282485962, + "step": 136425 + }, + { + "epoch": 0.13741374500045828, + "grad_norm": 10.776364942481337, + "learning_rate": 4.906646716553811e-05, + "loss": 2.5586, + "mean_token_accuracy": 0.3862068891525269, + "step": 136430 + }, + { + "epoch": 0.13741878105356245, + "grad_norm": 11.984528575711225, + "learning_rate": 4.9066360346642995e-05, + "loss": 2.3482, + "mean_token_accuracy": 0.4, + "step": 136435 + }, + { + "epoch": 0.13742381710666662, + "grad_norm": 11.540234106329256, + "learning_rate": 4.906625352176635e-05, + "loss": 2.3558, + "mean_token_accuracy": 0.45862067937850953, + "step": 136440 + }, + { + "epoch": 0.1374288531597708, + "grad_norm": 10.722915835629925, + "learning_rate": 4.906614669090824e-05, + "loss": 2.28, + "mean_token_accuracy": 0.47586206793785096, + "step": 136445 + }, + { + "epoch": 0.13743388921287497, + "grad_norm": 9.645872253154693, + "learning_rate": 4.9066039854068675e-05, + "loss": 2.63, + "mean_token_accuracy": 0.43103448748588563, + "step": 136450 + }, + { + "epoch": 0.13743892526597914, + "grad_norm": 11.53571113142397, + "learning_rate": 4.906593301124768e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.42413793206214906, + "step": 136455 + }, + { + "epoch": 0.13744396131908332, + "grad_norm": 11.719508133002138, + "learning_rate": 4.90658261624453e-05, + "loss": 2.3318, + "mean_token_accuracy": 0.4758620738983154, + "step": 136460 + }, + { + "epoch": 0.1374489973721875, + "grad_norm": 11.831339310424726, + "learning_rate": 4.9065719307661556e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.42232305407524107, + "step": 136465 + }, + { + "epoch": 0.13745403342529167, + "grad_norm": 12.740673870858636, + "learning_rate": 4.906561244689648e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.4310344815254211, + "step": 136470 + }, + { + "epoch": 0.13745906947839584, + "grad_norm": 8.703729263294038, + "learning_rate": 4.90655055801501e-05, + "loss": 2.0985, + "mean_token_accuracy": 0.47241379618644713, + "step": 136475 + }, + { + "epoch": 0.1374641055315, + "grad_norm": 10.575471902962864, + "learning_rate": 4.906539870742245e-05, + "loss": 2.0685, + "mean_token_accuracy": 0.482758617401123, + "step": 136480 + }, + { + "epoch": 0.1374691415846042, + "grad_norm": 9.629168613369925, + "learning_rate": 4.906529182871355e-05, + "loss": 1.9937, + "mean_token_accuracy": 0.482758617401123, + "step": 136485 + }, + { + "epoch": 0.13747417763770836, + "grad_norm": 9.411305289169983, + "learning_rate": 4.906518494402344e-05, + "loss": 1.9733, + "mean_token_accuracy": 0.4912885665893555, + "step": 136490 + }, + { + "epoch": 0.13747921369081254, + "grad_norm": 10.55569999939673, + "learning_rate": 4.9065078053352144e-05, + "loss": 2.2655, + "mean_token_accuracy": 0.4551724135875702, + "step": 136495 + }, + { + "epoch": 0.1374842497439167, + "grad_norm": 10.880735978588028, + "learning_rate": 4.90649711566997e-05, + "loss": 2.4775, + "mean_token_accuracy": 0.4034482777118683, + "step": 136500 + }, + { + "epoch": 0.13748928579702088, + "grad_norm": 11.323541153133842, + "learning_rate": 4.9064864254066116e-05, + "loss": 2.3076, + "mean_token_accuracy": 0.4275861978530884, + "step": 136505 + }, + { + "epoch": 0.13749432185012506, + "grad_norm": 14.17729734090147, + "learning_rate": 4.9064757345451444e-05, + "loss": 2.5106, + "mean_token_accuracy": 0.41034482717514037, + "step": 136510 + }, + { + "epoch": 0.13749935790322923, + "grad_norm": 8.401545497778573, + "learning_rate": 4.906465043085571e-05, + "loss": 2.2, + "mean_token_accuracy": 0.42413792610168455, + "step": 136515 + }, + { + "epoch": 0.1375043939563334, + "grad_norm": 10.427659484019577, + "learning_rate": 4.9064543510278934e-05, + "loss": 2.6843, + "mean_token_accuracy": 0.36551724672317504, + "step": 136520 + }, + { + "epoch": 0.13750943000943755, + "grad_norm": 10.422607359600912, + "learning_rate": 4.9064436583721164e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.42758620977401735, + "step": 136525 + }, + { + "epoch": 0.13751446606254172, + "grad_norm": 12.272840518022374, + "learning_rate": 4.9064329651182406e-05, + "loss": 2.7378, + "mean_token_accuracy": 0.37241379022598264, + "step": 136530 + }, + { + "epoch": 0.1375195021156459, + "grad_norm": 9.081743187149343, + "learning_rate": 4.90642227126627e-05, + "loss": 2.2047, + "mean_token_accuracy": 0.46551724076271056, + "step": 136535 + }, + { + "epoch": 0.13752453816875007, + "grad_norm": 10.462039705229992, + "learning_rate": 4.906411576816208e-05, + "loss": 2.2901, + "mean_token_accuracy": 0.43605564832687377, + "step": 136540 + }, + { + "epoch": 0.13752957422185424, + "grad_norm": 12.466515237867482, + "learning_rate": 4.906400881768057e-05, + "loss": 2.562, + "mean_token_accuracy": 0.4103448331356049, + "step": 136545 + }, + { + "epoch": 0.13753461027495842, + "grad_norm": 9.902118063692782, + "learning_rate": 4.90639018612182e-05, + "loss": 2.2668, + "mean_token_accuracy": 0.43103448748588563, + "step": 136550 + }, + { + "epoch": 0.1375396463280626, + "grad_norm": 13.30530957149223, + "learning_rate": 4.9063794898775e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.43103448748588563, + "step": 136555 + }, + { + "epoch": 0.13754468238116677, + "grad_norm": 9.003939930118364, + "learning_rate": 4.906368793035102e-05, + "loss": 2.8386, + "mean_token_accuracy": 0.41724138259887694, + "step": 136560 + }, + { + "epoch": 0.13754971843427094, + "grad_norm": 12.764952823437417, + "learning_rate": 4.906358095594625e-05, + "loss": 2.4504, + "mean_token_accuracy": 0.4379310369491577, + "step": 136565 + }, + { + "epoch": 0.1375547544873751, + "grad_norm": 9.731918624674412, + "learning_rate": 4.906347397556074e-05, + "loss": 2.2048, + "mean_token_accuracy": 0.46551724076271056, + "step": 136570 + }, + { + "epoch": 0.1375597905404793, + "grad_norm": 14.460179150725013, + "learning_rate": 4.906336698919453e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.4068965494632721, + "step": 136575 + }, + { + "epoch": 0.13756482659358346, + "grad_norm": 10.504726128188349, + "learning_rate": 4.9063259996847634e-05, + "loss": 2.7979, + "mean_token_accuracy": 0.36206896007061007, + "step": 136580 + }, + { + "epoch": 0.13756986264668764, + "grad_norm": 10.493296448473458, + "learning_rate": 4.906315299852009e-05, + "loss": 3.068, + "mean_token_accuracy": 0.36896551847457887, + "step": 136585 + }, + { + "epoch": 0.1375748986997918, + "grad_norm": 10.148421421011829, + "learning_rate": 4.906304599421192e-05, + "loss": 2.6677, + "mean_token_accuracy": 0.3896551728248596, + "step": 136590 + }, + { + "epoch": 0.13757993475289598, + "grad_norm": 8.9807198983443, + "learning_rate": 4.9062938983923166e-05, + "loss": 2.1835, + "mean_token_accuracy": 0.49134907126426697, + "step": 136595 + }, + { + "epoch": 0.13758497080600016, + "grad_norm": 10.801109995988671, + "learning_rate": 4.9062831967653846e-05, + "loss": 1.8324, + "mean_token_accuracy": 0.5334543228149414, + "step": 136600 + }, + { + "epoch": 0.13759000685910433, + "grad_norm": 9.405679141414362, + "learning_rate": 4.9062724945404e-05, + "loss": 2.3966, + "mean_token_accuracy": 0.44827587008476255, + "step": 136605 + }, + { + "epoch": 0.1375950429122085, + "grad_norm": 9.845748829689436, + "learning_rate": 4.9062617917173645e-05, + "loss": 2.2054, + "mean_token_accuracy": 0.4655172348022461, + "step": 136610 + }, + { + "epoch": 0.13760007896531268, + "grad_norm": 11.187357914483874, + "learning_rate": 4.906251088296281e-05, + "loss": 2.7507, + "mean_token_accuracy": 0.3758620619773865, + "step": 136615 + }, + { + "epoch": 0.13760511501841685, + "grad_norm": 10.445948104660134, + "learning_rate": 4.9062403842771546e-05, + "loss": 2.7209, + "mean_token_accuracy": 0.44827585220336913, + "step": 136620 + }, + { + "epoch": 0.13761015107152103, + "grad_norm": 11.01636669908, + "learning_rate": 4.906229679659987e-05, + "loss": 2.2845, + "mean_token_accuracy": 0.3931034505367279, + "step": 136625 + }, + { + "epoch": 0.1376151871246252, + "grad_norm": 15.111603290554621, + "learning_rate": 4.906218974444781e-05, + "loss": 2.4377, + "mean_token_accuracy": 0.4551724076271057, + "step": 136630 + }, + { + "epoch": 0.13762022317772937, + "grad_norm": 8.375800745313047, + "learning_rate": 4.906208268631539e-05, + "loss": 2.1765, + "mean_token_accuracy": 0.43448275327682495, + "step": 136635 + }, + { + "epoch": 0.13762525923083355, + "grad_norm": 11.653389826858136, + "learning_rate": 4.9061975622202645e-05, + "loss": 2.3983, + "mean_token_accuracy": 0.3793103516101837, + "step": 136640 + }, + { + "epoch": 0.13763029528393772, + "grad_norm": 11.276637213463118, + "learning_rate": 4.9061868552109605e-05, + "loss": 2.0715, + "mean_token_accuracy": 0.4931034505367279, + "step": 136645 + }, + { + "epoch": 0.1376353313370419, + "grad_norm": 10.104326211257732, + "learning_rate": 4.90617614760363e-05, + "loss": 2.4408, + "mean_token_accuracy": 0.4137930989265442, + "step": 136650 + }, + { + "epoch": 0.13764036739014607, + "grad_norm": 11.396492953972677, + "learning_rate": 4.906165439398277e-05, + "loss": 2.6152, + "mean_token_accuracy": 0.4620689570903778, + "step": 136655 + }, + { + "epoch": 0.13764540344325024, + "grad_norm": 11.79919644382028, + "learning_rate": 4.906154730594903e-05, + "loss": 2.2641, + "mean_token_accuracy": 0.42413793206214906, + "step": 136660 + }, + { + "epoch": 0.1376504394963544, + "grad_norm": 12.052208431850582, + "learning_rate": 4.906144021193511e-05, + "loss": 2.6911, + "mean_token_accuracy": 0.4310344815254211, + "step": 136665 + }, + { + "epoch": 0.13765547554945856, + "grad_norm": 10.716860197718113, + "learning_rate": 4.906133311194105e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.4713853597640991, + "step": 136670 + }, + { + "epoch": 0.13766051160256274, + "grad_norm": 8.845636419514264, + "learning_rate": 4.906122600596687e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.45172412395477296, + "step": 136675 + }, + { + "epoch": 0.1376655476556669, + "grad_norm": 9.832655404320112, + "learning_rate": 4.9061118894012606e-05, + "loss": 2.3503, + "mean_token_accuracy": 0.4034482777118683, + "step": 136680 + }, + { + "epoch": 0.13767058370877108, + "grad_norm": 10.656940961050061, + "learning_rate": 4.906101177607828e-05, + "loss": 2.5194, + "mean_token_accuracy": 0.4103448331356049, + "step": 136685 + }, + { + "epoch": 0.13767561976187526, + "grad_norm": 13.850997173002858, + "learning_rate": 4.906090465216393e-05, + "loss": 2.0968, + "mean_token_accuracy": 0.45722927451133727, + "step": 136690 + }, + { + "epoch": 0.13768065581497943, + "grad_norm": 12.083859191690681, + "learning_rate": 4.9060797522269587e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.4413793087005615, + "step": 136695 + }, + { + "epoch": 0.1376856918680836, + "grad_norm": 10.55002487815266, + "learning_rate": 4.9060690386395264e-05, + "loss": 2.2371, + "mean_token_accuracy": 0.44137930274009707, + "step": 136700 + }, + { + "epoch": 0.13769072792118778, + "grad_norm": 7.593834996570759, + "learning_rate": 4.906058324454102e-05, + "loss": 1.8884, + "mean_token_accuracy": 0.4896551728248596, + "step": 136705 + }, + { + "epoch": 0.13769576397429195, + "grad_norm": 9.774646881517143, + "learning_rate": 4.906047609670685e-05, + "loss": 1.947, + "mean_token_accuracy": 0.48275862336158754, + "step": 136710 + }, + { + "epoch": 0.13770080002739613, + "grad_norm": 9.924265967868068, + "learning_rate": 4.906036894289281e-05, + "loss": 2.6218, + "mean_token_accuracy": 0.334482753276825, + "step": 136715 + }, + { + "epoch": 0.1377058360805003, + "grad_norm": 10.602625830938189, + "learning_rate": 4.906026178309893e-05, + "loss": 2.5839, + "mean_token_accuracy": 0.37241379022598264, + "step": 136720 + }, + { + "epoch": 0.13771087213360447, + "grad_norm": 8.495630254069749, + "learning_rate": 4.906015461732522e-05, + "loss": 2.3646, + "mean_token_accuracy": 0.45015124678611756, + "step": 136725 + }, + { + "epoch": 0.13771590818670865, + "grad_norm": 9.61102893210028, + "learning_rate": 4.9060047445571724e-05, + "loss": 2.379, + "mean_token_accuracy": 0.42413792610168455, + "step": 136730 + }, + { + "epoch": 0.13772094423981282, + "grad_norm": 12.124575524206698, + "learning_rate": 4.905994026783846e-05, + "loss": 2.4611, + "mean_token_accuracy": 0.4379310250282288, + "step": 136735 + }, + { + "epoch": 0.137725980292917, + "grad_norm": 11.709566039554073, + "learning_rate": 4.905983308412548e-05, + "loss": 2.5627, + "mean_token_accuracy": 0.41034482717514037, + "step": 136740 + }, + { + "epoch": 0.13773101634602117, + "grad_norm": 9.630544866211713, + "learning_rate": 4.90597258944328e-05, + "loss": 2.2565, + "mean_token_accuracy": 0.441379314661026, + "step": 136745 + }, + { + "epoch": 0.13773605239912534, + "grad_norm": 13.858902192140379, + "learning_rate": 4.9059618698760436e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.44482758045196535, + "step": 136750 + }, + { + "epoch": 0.13774108845222952, + "grad_norm": 9.291180180520113, + "learning_rate": 4.905951149710844e-05, + "loss": 2.1934, + "mean_token_accuracy": 0.4310344815254211, + "step": 136755 + }, + { + "epoch": 0.1377461245053337, + "grad_norm": 14.002157040737972, + "learning_rate": 4.905940428947683e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.47434967160224917, + "step": 136760 + }, + { + "epoch": 0.13775116055843786, + "grad_norm": 8.820057836259314, + "learning_rate": 4.905929707586564e-05, + "loss": 2.6486, + "mean_token_accuracy": 0.45051422119140627, + "step": 136765 + }, + { + "epoch": 0.13775619661154204, + "grad_norm": 12.042484893992597, + "learning_rate": 4.9059189856274906e-05, + "loss": 2.5097, + "mean_token_accuracy": 0.4310344815254211, + "step": 136770 + }, + { + "epoch": 0.1377612326646462, + "grad_norm": 9.591091031643348, + "learning_rate": 4.905908263070464e-05, + "loss": 1.8931, + "mean_token_accuracy": 0.5090139091014863, + "step": 136775 + }, + { + "epoch": 0.13776626871775038, + "grad_norm": 13.975887059718948, + "learning_rate": 4.905897539915488e-05, + "loss": 2.7371, + "mean_token_accuracy": 0.38620689511299133, + "step": 136780 + }, + { + "epoch": 0.13777130477085456, + "grad_norm": 11.277756565453252, + "learning_rate": 4.905886816162567e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.38275861740112305, + "step": 136785 + }, + { + "epoch": 0.13777634082395873, + "grad_norm": 10.185410510737633, + "learning_rate": 4.905876091811702e-05, + "loss": 2.6091, + "mean_token_accuracy": 0.3931034505367279, + "step": 136790 + }, + { + "epoch": 0.1377813768770629, + "grad_norm": 8.0069155507688, + "learning_rate": 4.905865366862897e-05, + "loss": 2.2221, + "mean_token_accuracy": 0.458620685338974, + "step": 136795 + }, + { + "epoch": 0.13778641293016708, + "grad_norm": 10.834240940441203, + "learning_rate": 4.905854641316154e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.4413793087005615, + "step": 136800 + }, + { + "epoch": 0.13779144898327123, + "grad_norm": 12.09209203205288, + "learning_rate": 4.905843915171477e-05, + "loss": 2.3944, + "mean_token_accuracy": 0.44137930274009707, + "step": 136805 + }, + { + "epoch": 0.1377964850363754, + "grad_norm": 10.38534993805814, + "learning_rate": 4.905833188428869e-05, + "loss": 2.1349, + "mean_token_accuracy": 0.47241379618644713, + "step": 136810 + }, + { + "epoch": 0.13780152108947957, + "grad_norm": 7.594934501084621, + "learning_rate": 4.905822461088333e-05, + "loss": 2.2327, + "mean_token_accuracy": 0.4862069010734558, + "step": 136815 + }, + { + "epoch": 0.13780655714258375, + "grad_norm": 14.194466932647156, + "learning_rate": 4.905811733149871e-05, + "loss": 2.6584, + "mean_token_accuracy": 0.3999999940395355, + "step": 136820 + }, + { + "epoch": 0.13781159319568792, + "grad_norm": 12.965053641913252, + "learning_rate": 4.905801004613486e-05, + "loss": 2.4713, + "mean_token_accuracy": 0.42413792610168455, + "step": 136825 + }, + { + "epoch": 0.1378166292487921, + "grad_norm": 12.280817252737299, + "learning_rate": 4.905790275479182e-05, + "loss": 2.377, + "mean_token_accuracy": 0.4413793087005615, + "step": 136830 + }, + { + "epoch": 0.13782166530189627, + "grad_norm": 11.878873899025091, + "learning_rate": 4.905779545746962e-05, + "loss": 2.5405, + "mean_token_accuracy": 0.3896551728248596, + "step": 136835 + }, + { + "epoch": 0.13782670135500044, + "grad_norm": 9.630215387534745, + "learning_rate": 4.905768815416829e-05, + "loss": 2.1913, + "mean_token_accuracy": 0.4448275864124298, + "step": 136840 + }, + { + "epoch": 0.13783173740810462, + "grad_norm": 10.875049910242822, + "learning_rate": 4.9057580844887834e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.4448275864124298, + "step": 136845 + }, + { + "epoch": 0.1378367734612088, + "grad_norm": 11.19522485683445, + "learning_rate": 4.905747352962832e-05, + "loss": 2.5162, + "mean_token_accuracy": 0.4254688322544098, + "step": 136850 + }, + { + "epoch": 0.13784180951431296, + "grad_norm": 8.350857277435006, + "learning_rate": 4.905736620838975e-05, + "loss": 2.4651, + "mean_token_accuracy": 0.39655172526836396, + "step": 136855 + }, + { + "epoch": 0.13784684556741714, + "grad_norm": 10.284784564587305, + "learning_rate": 4.905725888117218e-05, + "loss": 2.4822, + "mean_token_accuracy": 0.4172413766384125, + "step": 136860 + }, + { + "epoch": 0.1378518816205213, + "grad_norm": 10.669307444808718, + "learning_rate": 4.905715154797561e-05, + "loss": 2.064, + "mean_token_accuracy": 0.5, + "step": 136865 + }, + { + "epoch": 0.13785691767362548, + "grad_norm": 13.866455836440284, + "learning_rate": 4.905704420880009e-05, + "loss": 2.5525, + "mean_token_accuracy": 0.4137930989265442, + "step": 136870 + }, + { + "epoch": 0.13786195372672966, + "grad_norm": 12.248044318795838, + "learning_rate": 4.9056936863645635e-05, + "loss": 2.1595, + "mean_token_accuracy": 0.48411330580711365, + "step": 136875 + }, + { + "epoch": 0.13786698977983383, + "grad_norm": 8.668135042266153, + "learning_rate": 4.9056829512512294e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.429824560880661, + "step": 136880 + }, + { + "epoch": 0.137872025832938, + "grad_norm": 11.294755298022483, + "learning_rate": 4.9056722155400074e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.33793103992938994, + "step": 136885 + }, + { + "epoch": 0.13787706188604218, + "grad_norm": 10.386652912984934, + "learning_rate": 4.905661479230903e-05, + "loss": 2.1511, + "mean_token_accuracy": 0.49655172824859617, + "step": 136890 + }, + { + "epoch": 0.13788209793914635, + "grad_norm": 9.792388121877957, + "learning_rate": 4.905650742323917e-05, + "loss": 2.2602, + "mean_token_accuracy": 0.44482759237289426, + "step": 136895 + }, + { + "epoch": 0.13788713399225053, + "grad_norm": 15.028538542117635, + "learning_rate": 4.905640004819053e-05, + "loss": 2.3416, + "mean_token_accuracy": 0.4586206912994385, + "step": 136900 + }, + { + "epoch": 0.1378921700453547, + "grad_norm": 13.425048102203833, + "learning_rate": 4.905629266716315e-05, + "loss": 2.5431, + "mean_token_accuracy": 0.39310344457626345, + "step": 136905 + }, + { + "epoch": 0.13789720609845887, + "grad_norm": 10.908821719862164, + "learning_rate": 4.905618528015705e-05, + "loss": 2.3338, + "mean_token_accuracy": 0.4344827651977539, + "step": 136910 + }, + { + "epoch": 0.13790224215156305, + "grad_norm": 9.89787586598569, + "learning_rate": 4.905607788717225e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.39310344457626345, + "step": 136915 + }, + { + "epoch": 0.13790727820466722, + "grad_norm": 12.073112708228308, + "learning_rate": 4.90559704882088e-05, + "loss": 2.2725, + "mean_token_accuracy": 0.4034482717514038, + "step": 136920 + }, + { + "epoch": 0.1379123142577714, + "grad_norm": 13.285635492773977, + "learning_rate": 4.905586308326673e-05, + "loss": 3.2935, + "mean_token_accuracy": 0.3241379290819168, + "step": 136925 + }, + { + "epoch": 0.13791735031087557, + "grad_norm": 14.378967800372177, + "learning_rate": 4.905575567234605e-05, + "loss": 2.6202, + "mean_token_accuracy": 0.3862069010734558, + "step": 136930 + }, + { + "epoch": 0.13792238636397974, + "grad_norm": 10.900127823284995, + "learning_rate": 4.9055648255446803e-05, + "loss": 1.9515, + "mean_token_accuracy": 0.482758617401123, + "step": 136935 + }, + { + "epoch": 0.13792742241708392, + "grad_norm": 9.970344111401257, + "learning_rate": 4.9055540832569014e-05, + "loss": 2.0349, + "mean_token_accuracy": 0.49655172824859617, + "step": 136940 + }, + { + "epoch": 0.13793245847018806, + "grad_norm": 10.066737277256141, + "learning_rate": 4.905543340371272e-05, + "loss": 2.0653, + "mean_token_accuracy": 0.4551724135875702, + "step": 136945 + }, + { + "epoch": 0.13793749452329224, + "grad_norm": 15.82429436231584, + "learning_rate": 4.9055325968877945e-05, + "loss": 1.9315, + "mean_token_accuracy": 0.4724137783050537, + "step": 136950 + }, + { + "epoch": 0.1379425305763964, + "grad_norm": 11.176959868858997, + "learning_rate": 4.905521852806472e-05, + "loss": 2.3429, + "mean_token_accuracy": 0.47241379618644713, + "step": 136955 + }, + { + "epoch": 0.13794756662950058, + "grad_norm": 10.920548319080318, + "learning_rate": 4.9055111081273076e-05, + "loss": 2.7859, + "mean_token_accuracy": 0.3827586233615875, + "step": 136960 + }, + { + "epoch": 0.13795260268260476, + "grad_norm": 9.071779718795359, + "learning_rate": 4.905500362850304e-05, + "loss": 2.3313, + "mean_token_accuracy": 0.43793103098869324, + "step": 136965 + }, + { + "epoch": 0.13795763873570893, + "grad_norm": 14.804829078159091, + "learning_rate": 4.9054896169754645e-05, + "loss": 2.3547, + "mean_token_accuracy": 0.4206896543502808, + "step": 136970 + }, + { + "epoch": 0.1379626747888131, + "grad_norm": 9.867255189960712, + "learning_rate": 4.905478870502792e-05, + "loss": 2.4791, + "mean_token_accuracy": 0.38275861740112305, + "step": 136975 + }, + { + "epoch": 0.13796771084191728, + "grad_norm": 11.315643617224785, + "learning_rate": 4.905468123432288e-05, + "loss": 2.4736, + "mean_token_accuracy": 0.4137930989265442, + "step": 136980 + }, + { + "epoch": 0.13797274689502145, + "grad_norm": 8.824749949082534, + "learning_rate": 4.9054573757639586e-05, + "loss": 2.9322, + "mean_token_accuracy": 0.38052027225494384, + "step": 136985 + }, + { + "epoch": 0.13797778294812563, + "grad_norm": 12.474583745039066, + "learning_rate": 4.905446627497804e-05, + "loss": 2.484, + "mean_token_accuracy": 0.3670296400785446, + "step": 136990 + }, + { + "epoch": 0.1379828190012298, + "grad_norm": 10.503463242314194, + "learning_rate": 4.905435878633829e-05, + "loss": 2.258, + "mean_token_accuracy": 0.42068964838981626, + "step": 136995 + }, + { + "epoch": 0.13798785505433397, + "grad_norm": 14.520077099238978, + "learning_rate": 4.905425129172036e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.4379310369491577, + "step": 137000 + }, + { + "epoch": 0.13799289110743815, + "grad_norm": 12.288976383106434, + "learning_rate": 4.9054143791124266e-05, + "loss": 2.3049, + "mean_token_accuracy": 0.4517241299152374, + "step": 137005 + }, + { + "epoch": 0.13799792716054232, + "grad_norm": 10.870148067752572, + "learning_rate": 4.905403628455006e-05, + "loss": 2.0616, + "mean_token_accuracy": 0.5401088893413544, + "step": 137010 + }, + { + "epoch": 0.1380029632136465, + "grad_norm": 11.381628355192376, + "learning_rate": 4.905392877199776e-05, + "loss": 2.1135, + "mean_token_accuracy": 0.49999999403953554, + "step": 137015 + }, + { + "epoch": 0.13800799926675067, + "grad_norm": 12.398975859255172, + "learning_rate": 4.90538212534674e-05, + "loss": 2.266, + "mean_token_accuracy": 0.441379314661026, + "step": 137020 + }, + { + "epoch": 0.13801303531985484, + "grad_norm": 8.669295632679196, + "learning_rate": 4.9053713728959e-05, + "loss": 2.14, + "mean_token_accuracy": 0.47241379618644713, + "step": 137025 + }, + { + "epoch": 0.13801807137295902, + "grad_norm": 11.444708069676741, + "learning_rate": 4.90536061984726e-05, + "loss": 2.3678, + "mean_token_accuracy": 0.43793103098869324, + "step": 137030 + }, + { + "epoch": 0.1380231074260632, + "grad_norm": 14.32177627152805, + "learning_rate": 4.9053498662008224e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.4344827592372894, + "step": 137035 + }, + { + "epoch": 0.13802814347916736, + "grad_norm": 10.144860758809681, + "learning_rate": 4.9053391119565906e-05, + "loss": 2.5075, + "mean_token_accuracy": 0.4206896543502808, + "step": 137040 + }, + { + "epoch": 0.13803317953227154, + "grad_norm": 11.131856876310675, + "learning_rate": 4.9053283571145684e-05, + "loss": 2.4843, + "mean_token_accuracy": 0.3793103456497192, + "step": 137045 + }, + { + "epoch": 0.1380382155853757, + "grad_norm": 12.656420751038938, + "learning_rate": 4.9053176016747563e-05, + "loss": 2.4822, + "mean_token_accuracy": 0.4517241358757019, + "step": 137050 + }, + { + "epoch": 0.13804325163847989, + "grad_norm": 9.106570207860083, + "learning_rate": 4.90530684563716e-05, + "loss": 2.1144, + "mean_token_accuracy": 0.4931034445762634, + "step": 137055 + }, + { + "epoch": 0.13804828769158406, + "grad_norm": 10.8556618101491, + "learning_rate": 4.905296089001781e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.4604355812072754, + "step": 137060 + }, + { + "epoch": 0.13805332374468823, + "grad_norm": 13.90595170214158, + "learning_rate": 4.905285331768622e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.43448275327682495, + "step": 137065 + }, + { + "epoch": 0.1380583597977924, + "grad_norm": 10.359932887432198, + "learning_rate": 4.905274573937688e-05, + "loss": 2.809, + "mean_token_accuracy": 0.3758620619773865, + "step": 137070 + }, + { + "epoch": 0.13806339585089658, + "grad_norm": 11.624453058993755, + "learning_rate": 4.9052638155089784e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.46551724076271056, + "step": 137075 + }, + { + "epoch": 0.13806843190400075, + "grad_norm": 10.360263526165383, + "learning_rate": 4.9052530564825e-05, + "loss": 2.1491, + "mean_token_accuracy": 0.4620689690113068, + "step": 137080 + }, + { + "epoch": 0.1380734679571049, + "grad_norm": 9.680510268385348, + "learning_rate": 4.905242296858254e-05, + "loss": 2.1831, + "mean_token_accuracy": 0.39655172228813174, + "step": 137085 + }, + { + "epoch": 0.13807850401020907, + "grad_norm": 10.689559689993407, + "learning_rate": 4.9052315366362437e-05, + "loss": 2.4747, + "mean_token_accuracy": 0.42758620381355283, + "step": 137090 + }, + { + "epoch": 0.13808354006331325, + "grad_norm": 9.599386529997076, + "learning_rate": 4.905220775816471e-05, + "loss": 2.247, + "mean_token_accuracy": 0.4602540791034698, + "step": 137095 + }, + { + "epoch": 0.13808857611641742, + "grad_norm": 9.433341354868118, + "learning_rate": 4.90521001439894e-05, + "loss": 2.4138, + "mean_token_accuracy": 0.41917724609375, + "step": 137100 + }, + { + "epoch": 0.1380936121695216, + "grad_norm": 11.290225540078543, + "learning_rate": 4.905199252383654e-05, + "loss": 2.4215, + "mean_token_accuracy": 0.41034482717514037, + "step": 137105 + }, + { + "epoch": 0.13809864822262577, + "grad_norm": 9.879474403676081, + "learning_rate": 4.905188489770615e-05, + "loss": 2.2323, + "mean_token_accuracy": 0.44482758045196535, + "step": 137110 + }, + { + "epoch": 0.13810368427572994, + "grad_norm": 12.747891207018581, + "learning_rate": 4.905177726559827e-05, + "loss": 2.41, + "mean_token_accuracy": 0.45517240166664125, + "step": 137115 + }, + { + "epoch": 0.13810872032883412, + "grad_norm": 12.10589362761177, + "learning_rate": 4.9051669627512906e-05, + "loss": 2.2764, + "mean_token_accuracy": 0.441379314661026, + "step": 137120 + }, + { + "epoch": 0.1381137563819383, + "grad_norm": 10.312404845477976, + "learning_rate": 4.905156198345013e-05, + "loss": 2.3761, + "mean_token_accuracy": 0.44016939401626587, + "step": 137125 + }, + { + "epoch": 0.13811879243504246, + "grad_norm": 9.25616159943411, + "learning_rate": 4.905145433340993e-05, + "loss": 2.0753, + "mean_token_accuracy": 0.4448275864124298, + "step": 137130 + }, + { + "epoch": 0.13812382848814664, + "grad_norm": 15.540410244376028, + "learning_rate": 4.905134667739236e-05, + "loss": 3.0296, + "mean_token_accuracy": 0.34482758641242983, + "step": 137135 + }, + { + "epoch": 0.1381288645412508, + "grad_norm": 10.735319805341923, + "learning_rate": 4.905123901539745e-05, + "loss": 2.2277, + "mean_token_accuracy": 0.47816091775894165, + "step": 137140 + }, + { + "epoch": 0.13813390059435499, + "grad_norm": 12.175245379993324, + "learning_rate": 4.9051131347425214e-05, + "loss": 2.4507, + "mean_token_accuracy": 0.41724138259887694, + "step": 137145 + }, + { + "epoch": 0.13813893664745916, + "grad_norm": 18.14650242142119, + "learning_rate": 4.90510236734757e-05, + "loss": 2.5692, + "mean_token_accuracy": 0.441379314661026, + "step": 137150 + }, + { + "epoch": 0.13814397270056333, + "grad_norm": 10.868349348383948, + "learning_rate": 4.905091599354892e-05, + "loss": 2.3217, + "mean_token_accuracy": 0.43103447556495667, + "step": 137155 + }, + { + "epoch": 0.1381490087536675, + "grad_norm": 13.34380480243648, + "learning_rate": 4.905080830764492e-05, + "loss": 2.6578, + "mean_token_accuracy": 0.3862069010734558, + "step": 137160 + }, + { + "epoch": 0.13815404480677168, + "grad_norm": 8.348855998744924, + "learning_rate": 4.9050700615763714e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.45674530863761903, + "step": 137165 + }, + { + "epoch": 0.13815908085987585, + "grad_norm": 10.714009487202686, + "learning_rate": 4.905059291790536e-05, + "loss": 2.0102, + "mean_token_accuracy": 0.4931034505367279, + "step": 137170 + }, + { + "epoch": 0.13816411691298003, + "grad_norm": 14.481448321359927, + "learning_rate": 4.905048521406985e-05, + "loss": 2.1652, + "mean_token_accuracy": 0.4730295598506927, + "step": 137175 + }, + { + "epoch": 0.1381691529660842, + "grad_norm": 10.231914992411706, + "learning_rate": 4.905037750425724e-05, + "loss": 2.026, + "mean_token_accuracy": 0.4620689690113068, + "step": 137180 + }, + { + "epoch": 0.13817418901918838, + "grad_norm": 15.689019094250472, + "learning_rate": 4.905026978846755e-05, + "loss": 2.6332, + "mean_token_accuracy": 0.46896552443504336, + "step": 137185 + }, + { + "epoch": 0.13817922507229255, + "grad_norm": 9.469104299702643, + "learning_rate": 4.905016206670081e-05, + "loss": 2.3308, + "mean_token_accuracy": 0.4413793087005615, + "step": 137190 + }, + { + "epoch": 0.13818426112539672, + "grad_norm": 10.221052666376501, + "learning_rate": 4.905005433895706e-05, + "loss": 2.2697, + "mean_token_accuracy": 0.4551724076271057, + "step": 137195 + }, + { + "epoch": 0.1381892971785009, + "grad_norm": 9.591216125438017, + "learning_rate": 4.904994660523631e-05, + "loss": 2.4653, + "mean_token_accuracy": 0.4068965494632721, + "step": 137200 + }, + { + "epoch": 0.13819433323160507, + "grad_norm": 10.852272926656907, + "learning_rate": 4.904983886553862e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.40000000298023225, + "step": 137205 + }, + { + "epoch": 0.13819936928470924, + "grad_norm": 10.475173500781013, + "learning_rate": 4.904973111986399e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.41034482717514037, + "step": 137210 + }, + { + "epoch": 0.13820440533781342, + "grad_norm": 11.55150213426642, + "learning_rate": 4.904962336821246e-05, + "loss": 2.3126, + "mean_token_accuracy": 0.4482758641242981, + "step": 137215 + }, + { + "epoch": 0.1382094413909176, + "grad_norm": 11.281230794346442, + "learning_rate": 4.9049515610584066e-05, + "loss": 2.707, + "mean_token_accuracy": 0.42758620381355283, + "step": 137220 + }, + { + "epoch": 0.13821447744402174, + "grad_norm": 11.021125983677178, + "learning_rate": 4.904940784697883e-05, + "loss": 2.3097, + "mean_token_accuracy": 0.4551724135875702, + "step": 137225 + }, + { + "epoch": 0.1382195134971259, + "grad_norm": 11.119004590427968, + "learning_rate": 4.904930007739679e-05, + "loss": 2.6087, + "mean_token_accuracy": 0.33793103098869326, + "step": 137230 + }, + { + "epoch": 0.13822454955023009, + "grad_norm": 10.573175022213283, + "learning_rate": 4.9049192301837964e-05, + "loss": 2.1897, + "mean_token_accuracy": 0.45172414779663084, + "step": 137235 + }, + { + "epoch": 0.13822958560333426, + "grad_norm": 16.90987496133507, + "learning_rate": 4.9049084520302396e-05, + "loss": 2.9156, + "mean_token_accuracy": 0.36896551847457887, + "step": 137240 + }, + { + "epoch": 0.13823462165643843, + "grad_norm": 16.466814730736886, + "learning_rate": 4.904897673279011e-05, + "loss": 2.767, + "mean_token_accuracy": 0.4085299432277679, + "step": 137245 + }, + { + "epoch": 0.1382396577095426, + "grad_norm": 10.317154033298207, + "learning_rate": 4.904886893930113e-05, + "loss": 2.353, + "mean_token_accuracy": 0.4586206912994385, + "step": 137250 + }, + { + "epoch": 0.13824469376264678, + "grad_norm": 14.731924504778434, + "learning_rate": 4.9048761139835494e-05, + "loss": 2.5058, + "mean_token_accuracy": 0.4310344815254211, + "step": 137255 + }, + { + "epoch": 0.13824972981575095, + "grad_norm": 9.495680848055532, + "learning_rate": 4.904865333439323e-05, + "loss": 2.2217, + "mean_token_accuracy": 0.36896551251411436, + "step": 137260 + }, + { + "epoch": 0.13825476586885513, + "grad_norm": 10.395401397049287, + "learning_rate": 4.904854552297436e-05, + "loss": 2.2706, + "mean_token_accuracy": 0.4482758641242981, + "step": 137265 + }, + { + "epoch": 0.1382598019219593, + "grad_norm": 9.271198117956585, + "learning_rate": 4.9048437705578924e-05, + "loss": 1.9969, + "mean_token_accuracy": 0.48275861144065857, + "step": 137270 + }, + { + "epoch": 0.13826483797506348, + "grad_norm": 10.250941744286214, + "learning_rate": 4.904832988220695e-05, + "loss": 2.3267, + "mean_token_accuracy": 0.4151845097541809, + "step": 137275 + }, + { + "epoch": 0.13826987402816765, + "grad_norm": 9.250033277470592, + "learning_rate": 4.9048222052858464e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.42758620381355283, + "step": 137280 + }, + { + "epoch": 0.13827491008127182, + "grad_norm": 11.554099944772243, + "learning_rate": 4.90481142175335e-05, + "loss": 2.3868, + "mean_token_accuracy": 0.4551724076271057, + "step": 137285 + }, + { + "epoch": 0.138279946134376, + "grad_norm": 9.451505387463778, + "learning_rate": 4.904800637623208e-05, + "loss": 2.217, + "mean_token_accuracy": 0.40344828367233276, + "step": 137290 + }, + { + "epoch": 0.13828498218748017, + "grad_norm": 7.950810557745291, + "learning_rate": 4.9047898528954243e-05, + "loss": 2.0858, + "mean_token_accuracy": 0.5000000059604645, + "step": 137295 + }, + { + "epoch": 0.13829001824058434, + "grad_norm": 11.068630256193153, + "learning_rate": 4.904779067570002e-05, + "loss": 2.3589, + "mean_token_accuracy": 0.41034482717514037, + "step": 137300 + }, + { + "epoch": 0.13829505429368852, + "grad_norm": 11.382220404431587, + "learning_rate": 4.904768281646944e-05, + "loss": 2.5385, + "mean_token_accuracy": 0.42758620977401735, + "step": 137305 + }, + { + "epoch": 0.1383000903467927, + "grad_norm": 12.88338565696784, + "learning_rate": 4.904757495126252e-05, + "loss": 2.5518, + "mean_token_accuracy": 0.4034482777118683, + "step": 137310 + }, + { + "epoch": 0.13830512639989687, + "grad_norm": 12.480782662638976, + "learning_rate": 4.9047467080079305e-05, + "loss": 2.8154, + "mean_token_accuracy": 0.39310344457626345, + "step": 137315 + }, + { + "epoch": 0.13831016245300104, + "grad_norm": 11.82498137813083, + "learning_rate": 4.904735920291982e-05, + "loss": 2.4048, + "mean_token_accuracy": 0.41724138259887694, + "step": 137320 + }, + { + "epoch": 0.1383151985061052, + "grad_norm": 9.58618007083539, + "learning_rate": 4.9047251319784096e-05, + "loss": 2.5648, + "mean_token_accuracy": 0.403448274731636, + "step": 137325 + }, + { + "epoch": 0.1383202345592094, + "grad_norm": 9.01771748780737, + "learning_rate": 4.904714343067216e-05, + "loss": 2.1572, + "mean_token_accuracy": 0.46751360297203065, + "step": 137330 + }, + { + "epoch": 0.13832527061231356, + "grad_norm": 9.15640355929264, + "learning_rate": 4.904703553558404e-05, + "loss": 3.0106, + "mean_token_accuracy": 0.3827586114406586, + "step": 137335 + }, + { + "epoch": 0.13833030666541773, + "grad_norm": 11.72502150114695, + "learning_rate": 4.9046927634519774e-05, + "loss": 2.3864, + "mean_token_accuracy": 0.40344828367233276, + "step": 137340 + }, + { + "epoch": 0.1383353427185219, + "grad_norm": 9.52159232524416, + "learning_rate": 4.904681972747938e-05, + "loss": 2.1858, + "mean_token_accuracy": 0.4413793087005615, + "step": 137345 + }, + { + "epoch": 0.13834037877162608, + "grad_norm": 9.218164867696823, + "learning_rate": 4.904671181446291e-05, + "loss": 2.3578, + "mean_token_accuracy": 0.4482758641242981, + "step": 137350 + }, + { + "epoch": 0.13834541482473026, + "grad_norm": 11.812019112137193, + "learning_rate": 4.9046603895470364e-05, + "loss": 2.6194, + "mean_token_accuracy": 0.3931034505367279, + "step": 137355 + }, + { + "epoch": 0.13835045087783443, + "grad_norm": 11.781836200897756, + "learning_rate": 4.9046495970501794e-05, + "loss": 2.6645, + "mean_token_accuracy": 0.42558982968330383, + "step": 137360 + }, + { + "epoch": 0.13835548693093858, + "grad_norm": 11.276375512533276, + "learning_rate": 4.904638803955722e-05, + "loss": 2.1741, + "mean_token_accuracy": 0.43061100840568545, + "step": 137365 + }, + { + "epoch": 0.13836052298404275, + "grad_norm": 10.405169205644825, + "learning_rate": 4.904628010263668e-05, + "loss": 2.6281, + "mean_token_accuracy": 0.39310344457626345, + "step": 137370 + }, + { + "epoch": 0.13836555903714692, + "grad_norm": 12.194100580645351, + "learning_rate": 4.904617215974019e-05, + "loss": 2.1313, + "mean_token_accuracy": 0.4482758641242981, + "step": 137375 + }, + { + "epoch": 0.1383705950902511, + "grad_norm": 10.833087115932075, + "learning_rate": 4.9046064210867795e-05, + "loss": 2.3863, + "mean_token_accuracy": 0.4379310250282288, + "step": 137380 + }, + { + "epoch": 0.13837563114335527, + "grad_norm": 7.916665057757423, + "learning_rate": 4.904595625601952e-05, + "loss": 2.2082, + "mean_token_accuracy": 0.47447065711021424, + "step": 137385 + }, + { + "epoch": 0.13838066719645944, + "grad_norm": 8.663216830766238, + "learning_rate": 4.904584829519539e-05, + "loss": 1.9562, + "mean_token_accuracy": 0.4965517222881317, + "step": 137390 + }, + { + "epoch": 0.13838570324956362, + "grad_norm": 11.622604955985764, + "learning_rate": 4.904574032839544e-05, + "loss": 2.5407, + "mean_token_accuracy": 0.39310344457626345, + "step": 137395 + }, + { + "epoch": 0.1383907393026678, + "grad_norm": 11.697562640288808, + "learning_rate": 4.90456323556197e-05, + "loss": 3.1473, + "mean_token_accuracy": 0.34827586114406583, + "step": 137400 + }, + { + "epoch": 0.13839577535577197, + "grad_norm": 10.577042874043594, + "learning_rate": 4.9045524376868196e-05, + "loss": 3.3001, + "mean_token_accuracy": 0.34137930870056155, + "step": 137405 + }, + { + "epoch": 0.13840081140887614, + "grad_norm": 12.746065946383407, + "learning_rate": 4.904541639214096e-05, + "loss": 2.4203, + "mean_token_accuracy": 0.43448275327682495, + "step": 137410 + }, + { + "epoch": 0.1384058474619803, + "grad_norm": 10.261736686311684, + "learning_rate": 4.904530840143803e-05, + "loss": 2.3571, + "mean_token_accuracy": 0.4606170654296875, + "step": 137415 + }, + { + "epoch": 0.1384108835150845, + "grad_norm": 13.541318582829687, + "learning_rate": 4.904520040475942e-05, + "loss": 2.6215, + "mean_token_accuracy": 0.4206896543502808, + "step": 137420 + }, + { + "epoch": 0.13841591956818866, + "grad_norm": 10.280934054611498, + "learning_rate": 4.904509240210517e-05, + "loss": 2.1093, + "mean_token_accuracy": 0.43103447556495667, + "step": 137425 + }, + { + "epoch": 0.13842095562129283, + "grad_norm": 11.157989834070339, + "learning_rate": 4.904498439347531e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.4379310369491577, + "step": 137430 + }, + { + "epoch": 0.138425991674397, + "grad_norm": 11.620784354362506, + "learning_rate": 4.9044876378869865e-05, + "loss": 2.7407, + "mean_token_accuracy": 0.42068966031074523, + "step": 137435 + }, + { + "epoch": 0.13843102772750118, + "grad_norm": 11.273182659546192, + "learning_rate": 4.9044768358288866e-05, + "loss": 2.6857, + "mean_token_accuracy": 0.37586206793785093, + "step": 137440 + }, + { + "epoch": 0.13843606378060536, + "grad_norm": 10.910585631493031, + "learning_rate": 4.904466033173235e-05, + "loss": 2.4133, + "mean_token_accuracy": 0.4433151870965958, + "step": 137445 + }, + { + "epoch": 0.13844109983370953, + "grad_norm": 10.726100672630558, + "learning_rate": 4.9044552299200345e-05, + "loss": 2.4185, + "mean_token_accuracy": 0.4206896543502808, + "step": 137450 + }, + { + "epoch": 0.1384461358868137, + "grad_norm": 10.660147194562361, + "learning_rate": 4.9044444260692876e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.43793103098869324, + "step": 137455 + }, + { + "epoch": 0.13845117193991788, + "grad_norm": 10.308886504176655, + "learning_rate": 4.904433621620997e-05, + "loss": 2.1821, + "mean_token_accuracy": 0.44627949595451355, + "step": 137460 + }, + { + "epoch": 0.13845620799302205, + "grad_norm": 11.023020700375163, + "learning_rate": 4.9044228165751666e-05, + "loss": 1.9921, + "mean_token_accuracy": 0.5089534163475037, + "step": 137465 + }, + { + "epoch": 0.13846124404612623, + "grad_norm": 9.615908699486756, + "learning_rate": 4.9044120109317986e-05, + "loss": 2.471, + "mean_token_accuracy": 0.3999999940395355, + "step": 137470 + }, + { + "epoch": 0.1384662800992304, + "grad_norm": 14.119461677050118, + "learning_rate": 4.904401204690897e-05, + "loss": 2.4644, + "mean_token_accuracy": 0.44827585816383364, + "step": 137475 + }, + { + "epoch": 0.13847131615233457, + "grad_norm": 10.701577690105434, + "learning_rate": 4.9043903978524636e-05, + "loss": 2.3724, + "mean_token_accuracy": 0.44827587008476255, + "step": 137480 + }, + { + "epoch": 0.13847635220543875, + "grad_norm": 11.080186665236422, + "learning_rate": 4.904379590416503e-05, + "loss": 2.5163, + "mean_token_accuracy": 0.45862067937850953, + "step": 137485 + }, + { + "epoch": 0.13848138825854292, + "grad_norm": 9.603316987386908, + "learning_rate": 4.904368782383016e-05, + "loss": 2.8909, + "mean_token_accuracy": 0.34482758343219755, + "step": 137490 + }, + { + "epoch": 0.1384864243116471, + "grad_norm": 12.258062794740598, + "learning_rate": 4.904357973752007e-05, + "loss": 2.7883, + "mean_token_accuracy": 0.3103448212146759, + "step": 137495 + }, + { + "epoch": 0.13849146036475127, + "grad_norm": 12.035741627762773, + "learning_rate": 4.9043471645234794e-05, + "loss": 2.4714, + "mean_token_accuracy": 0.4379310250282288, + "step": 137500 + }, + { + "epoch": 0.1384964964178554, + "grad_norm": 10.244816802233077, + "learning_rate": 4.9043363546974356e-05, + "loss": 2.4702, + "mean_token_accuracy": 0.4551724076271057, + "step": 137505 + }, + { + "epoch": 0.1385015324709596, + "grad_norm": 11.16001390284774, + "learning_rate": 4.904325544273878e-05, + "loss": 2.1344, + "mean_token_accuracy": 0.4551724135875702, + "step": 137510 + }, + { + "epoch": 0.13850656852406376, + "grad_norm": 11.881230684968784, + "learning_rate": 4.90431473325281e-05, + "loss": 2.1808, + "mean_token_accuracy": 0.44827585220336913, + "step": 137515 + }, + { + "epoch": 0.13851160457716793, + "grad_norm": 10.628101455439015, + "learning_rate": 4.904303921634235e-05, + "loss": 2.2778, + "mean_token_accuracy": 0.4448275864124298, + "step": 137520 + }, + { + "epoch": 0.1385166406302721, + "grad_norm": 11.438640988404217, + "learning_rate": 4.9042931094181555e-05, + "loss": 2.5684, + "mean_token_accuracy": 0.37586206793785093, + "step": 137525 + }, + { + "epoch": 0.13852167668337628, + "grad_norm": 10.576739010811345, + "learning_rate": 4.9042822966045756e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.41379310488700866, + "step": 137530 + }, + { + "epoch": 0.13852671273648046, + "grad_norm": 10.152419746810168, + "learning_rate": 4.9042714831934973e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.44482759237289426, + "step": 137535 + }, + { + "epoch": 0.13853174878958463, + "grad_norm": 12.276746517424135, + "learning_rate": 4.904260669184923e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.4551724135875702, + "step": 137540 + }, + { + "epoch": 0.1385367848426888, + "grad_norm": 10.528237674010498, + "learning_rate": 4.904249854578857e-05, + "loss": 2.5224, + "mean_token_accuracy": 0.4609800338745117, + "step": 137545 + }, + { + "epoch": 0.13854182089579298, + "grad_norm": 11.036238445347475, + "learning_rate": 4.904239039375302e-05, + "loss": 2.8143, + "mean_token_accuracy": 0.36896551847457887, + "step": 137550 + }, + { + "epoch": 0.13854685694889715, + "grad_norm": 9.231108976476456, + "learning_rate": 4.904228223574261e-05, + "loss": 2.3346, + "mean_token_accuracy": 0.4413793087005615, + "step": 137555 + }, + { + "epoch": 0.13855189300200133, + "grad_norm": 10.60910131750133, + "learning_rate": 4.904217407175736e-05, + "loss": 2.0112, + "mean_token_accuracy": 0.48965516686439514, + "step": 137560 + }, + { + "epoch": 0.1385569290551055, + "grad_norm": 11.169699435349395, + "learning_rate": 4.9042065901797314e-05, + "loss": 2.2149, + "mean_token_accuracy": 0.44827585816383364, + "step": 137565 + }, + { + "epoch": 0.13856196510820967, + "grad_norm": 8.9183196385182, + "learning_rate": 4.904195772586249e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.40344826579093934, + "step": 137570 + }, + { + "epoch": 0.13856700116131385, + "grad_norm": 11.64262218747763, + "learning_rate": 4.904184954395292e-05, + "loss": 2.3683, + "mean_token_accuracy": 0.4862068951129913, + "step": 137575 + }, + { + "epoch": 0.13857203721441802, + "grad_norm": 9.93335293716104, + "learning_rate": 4.904174135606865e-05, + "loss": 2.1072, + "mean_token_accuracy": 0.47241379618644713, + "step": 137580 + }, + { + "epoch": 0.1385770732675222, + "grad_norm": 8.922987126211607, + "learning_rate": 4.904163316220969e-05, + "loss": 2.1256, + "mean_token_accuracy": 0.4793103516101837, + "step": 137585 + }, + { + "epoch": 0.13858210932062637, + "grad_norm": 11.512434339345575, + "learning_rate": 4.9041524962376073e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.47586206793785096, + "step": 137590 + }, + { + "epoch": 0.13858714537373054, + "grad_norm": 11.645284226868005, + "learning_rate": 4.904141675656784e-05, + "loss": 2.503, + "mean_token_accuracy": 0.4172413766384125, + "step": 137595 + }, + { + "epoch": 0.13859218142683472, + "grad_norm": 9.744035686352326, + "learning_rate": 4.904130854478501e-05, + "loss": 2.1334, + "mean_token_accuracy": 0.458620685338974, + "step": 137600 + }, + { + "epoch": 0.1385972174799389, + "grad_norm": 13.01820589789826, + "learning_rate": 4.904120032702763e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.40689654350280763, + "step": 137605 + }, + { + "epoch": 0.13860225353304306, + "grad_norm": 9.849513515092672, + "learning_rate": 4.904109210329571e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.458620685338974, + "step": 137610 + }, + { + "epoch": 0.13860728958614724, + "grad_norm": 11.121458827373726, + "learning_rate": 4.904098387358929e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.4034482777118683, + "step": 137615 + }, + { + "epoch": 0.1386123256392514, + "grad_norm": 15.226103610918562, + "learning_rate": 4.904087563790839e-05, + "loss": 2.4234, + "mean_token_accuracy": 0.47931033968925474, + "step": 137620 + }, + { + "epoch": 0.13861736169235558, + "grad_norm": 9.868087293165326, + "learning_rate": 4.9040767396253054e-05, + "loss": 2.6737, + "mean_token_accuracy": 0.3999999940395355, + "step": 137625 + }, + { + "epoch": 0.13862239774545976, + "grad_norm": 9.515130093003481, + "learning_rate": 4.90406591486233e-05, + "loss": 1.9356, + "mean_token_accuracy": 0.48965516686439514, + "step": 137630 + }, + { + "epoch": 0.13862743379856393, + "grad_norm": 9.175991263659336, + "learning_rate": 4.904055089501917e-05, + "loss": 2.2071, + "mean_token_accuracy": 0.5, + "step": 137635 + }, + { + "epoch": 0.1386324698516681, + "grad_norm": 11.396941962001137, + "learning_rate": 4.904044263544069e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4862068951129913, + "step": 137640 + }, + { + "epoch": 0.13863750590477225, + "grad_norm": 9.99291140903006, + "learning_rate": 4.904033436988789e-05, + "loss": 2.4109, + "mean_token_accuracy": 0.4034482717514038, + "step": 137645 + }, + { + "epoch": 0.13864254195787643, + "grad_norm": 10.707973807977968, + "learning_rate": 4.904022609836078e-05, + "loss": 2.3892, + "mean_token_accuracy": 0.4310344815254211, + "step": 137650 + }, + { + "epoch": 0.1386475780109806, + "grad_norm": 10.665935932947045, + "learning_rate": 4.9040117820859424e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.404718691110611, + "step": 137655 + }, + { + "epoch": 0.13865261406408477, + "grad_norm": 10.163328041771319, + "learning_rate": 4.904000953738383e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.4206896543502808, + "step": 137660 + }, + { + "epoch": 0.13865765011718895, + "grad_norm": 9.364856610750643, + "learning_rate": 4.9039901247934046e-05, + "loss": 2.204, + "mean_token_accuracy": 0.4620689570903778, + "step": 137665 + }, + { + "epoch": 0.13866268617029312, + "grad_norm": 9.30278864850231, + "learning_rate": 4.903979295251008e-05, + "loss": 2.3598, + "mean_token_accuracy": 0.3758620709180832, + "step": 137670 + }, + { + "epoch": 0.1386677222233973, + "grad_norm": 11.83659438420772, + "learning_rate": 4.9039684651111964e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.4551724076271057, + "step": 137675 + }, + { + "epoch": 0.13867275827650147, + "grad_norm": 12.75266470224908, + "learning_rate": 4.9039576343739744e-05, + "loss": 2.5158, + "mean_token_accuracy": 0.4310344815254211, + "step": 137680 + }, + { + "epoch": 0.13867779432960564, + "grad_norm": 12.959357670909712, + "learning_rate": 4.903946803039344e-05, + "loss": 2.2067, + "mean_token_accuracy": 0.4534180283546448, + "step": 137685 + }, + { + "epoch": 0.13868283038270982, + "grad_norm": 13.650449006551257, + "learning_rate": 4.903935971107309e-05, + "loss": 2.7519, + "mean_token_accuracy": 0.4137930989265442, + "step": 137690 + }, + { + "epoch": 0.138687866435814, + "grad_norm": 10.036240247186441, + "learning_rate": 4.903925138577872e-05, + "loss": 2.1192, + "mean_token_accuracy": 0.4793103337287903, + "step": 137695 + }, + { + "epoch": 0.13869290248891816, + "grad_norm": 10.471331265088, + "learning_rate": 4.903914305451035e-05, + "loss": 2.4165, + "mean_token_accuracy": 0.4482758641242981, + "step": 137700 + }, + { + "epoch": 0.13869793854202234, + "grad_norm": 9.794905224518436, + "learning_rate": 4.9039034717268024e-05, + "loss": 2.2063, + "mean_token_accuracy": 0.4586206912994385, + "step": 137705 + }, + { + "epoch": 0.1387029745951265, + "grad_norm": 8.86632227156554, + "learning_rate": 4.903892637405176e-05, + "loss": 2.5154, + "mean_token_accuracy": 0.4344827592372894, + "step": 137710 + }, + { + "epoch": 0.13870801064823068, + "grad_norm": 9.762438746553263, + "learning_rate": 4.90388180248616e-05, + "loss": 2.411, + "mean_token_accuracy": 0.39655172228813174, + "step": 137715 + }, + { + "epoch": 0.13871304670133486, + "grad_norm": 10.925161537819086, + "learning_rate": 4.903870966969756e-05, + "loss": 2.251, + "mean_token_accuracy": 0.42758620977401735, + "step": 137720 + }, + { + "epoch": 0.13871808275443903, + "grad_norm": 14.417647549775682, + "learning_rate": 4.903860130855969e-05, + "loss": 2.8337, + "mean_token_accuracy": 0.4000000059604645, + "step": 137725 + }, + { + "epoch": 0.1387231188075432, + "grad_norm": 15.668986893551715, + "learning_rate": 4.9038492941448e-05, + "loss": 2.7359, + "mean_token_accuracy": 0.3482758581638336, + "step": 137730 + }, + { + "epoch": 0.13872815486064738, + "grad_norm": 8.754833916991052, + "learning_rate": 4.903838456836254e-05, + "loss": 2.4601, + "mean_token_accuracy": 0.4448275864124298, + "step": 137735 + }, + { + "epoch": 0.13873319091375155, + "grad_norm": 10.782162986464106, + "learning_rate": 4.9038276189303314e-05, + "loss": 2.6377, + "mean_token_accuracy": 0.38965516686439516, + "step": 137740 + }, + { + "epoch": 0.13873822696685573, + "grad_norm": 13.224611530697665, + "learning_rate": 4.903816780427038e-05, + "loss": 2.2805, + "mean_token_accuracy": 0.4586207032203674, + "step": 137745 + }, + { + "epoch": 0.1387432630199599, + "grad_norm": 10.385233817571228, + "learning_rate": 4.9038059413263745e-05, + "loss": 2.5225, + "mean_token_accuracy": 0.4310344815254211, + "step": 137750 + }, + { + "epoch": 0.13874829907306407, + "grad_norm": 7.90224873690731, + "learning_rate": 4.903795101628345e-05, + "loss": 2.4298, + "mean_token_accuracy": 0.3986085832118988, + "step": 137755 + }, + { + "epoch": 0.13875333512616825, + "grad_norm": 8.550517658032902, + "learning_rate": 4.9037842613329525e-05, + "loss": 1.7479, + "mean_token_accuracy": 0.5620689570903779, + "step": 137760 + }, + { + "epoch": 0.13875837117927242, + "grad_norm": 9.039573614767058, + "learning_rate": 4.9037734204402006e-05, + "loss": 2.4587, + "mean_token_accuracy": 0.37586206793785093, + "step": 137765 + }, + { + "epoch": 0.1387634072323766, + "grad_norm": 10.29357716208612, + "learning_rate": 4.9037625789500905e-05, + "loss": 2.3634, + "mean_token_accuracy": 0.42565032839775085, + "step": 137770 + }, + { + "epoch": 0.13876844328548077, + "grad_norm": 11.245347297824466, + "learning_rate": 4.903751736862627e-05, + "loss": 2.4103, + "mean_token_accuracy": 0.4034482777118683, + "step": 137775 + }, + { + "epoch": 0.13877347933858494, + "grad_norm": 10.203285206720572, + "learning_rate": 4.903740894177813e-05, + "loss": 2.6202, + "mean_token_accuracy": 0.4206896543502808, + "step": 137780 + }, + { + "epoch": 0.1387785153916891, + "grad_norm": 12.423792938302471, + "learning_rate": 4.9037300508956496e-05, + "loss": 2.2398, + "mean_token_accuracy": 0.46896552443504336, + "step": 137785 + }, + { + "epoch": 0.13878355144479326, + "grad_norm": 10.2837849655862, + "learning_rate": 4.903719207016142e-05, + "loss": 2.4378, + "mean_token_accuracy": 0.4517241418361664, + "step": 137790 + }, + { + "epoch": 0.13878858749789744, + "grad_norm": 15.958778176556656, + "learning_rate": 4.903708362539292e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.4923775017261505, + "step": 137795 + }, + { + "epoch": 0.1387936235510016, + "grad_norm": 10.346261953761873, + "learning_rate": 4.9036975174651034e-05, + "loss": 2.5646, + "mean_token_accuracy": 0.4413793087005615, + "step": 137800 + }, + { + "epoch": 0.13879865960410578, + "grad_norm": 11.895805338116347, + "learning_rate": 4.903686671793578e-05, + "loss": 2.459, + "mean_token_accuracy": 0.3896551728248596, + "step": 137805 + }, + { + "epoch": 0.13880369565720996, + "grad_norm": 9.555228511309338, + "learning_rate": 4.90367582552472e-05, + "loss": 2.3317, + "mean_token_accuracy": 0.4482758641242981, + "step": 137810 + }, + { + "epoch": 0.13880873171031413, + "grad_norm": 13.278667190565079, + "learning_rate": 4.903664978658532e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.3896551728248596, + "step": 137815 + }, + { + "epoch": 0.1388137677634183, + "grad_norm": 10.344080922683894, + "learning_rate": 4.903654131195017e-05, + "loss": 2.5725, + "mean_token_accuracy": 0.43103448748588563, + "step": 137820 + }, + { + "epoch": 0.13881880381652248, + "grad_norm": 11.180681779655693, + "learning_rate": 4.9036432831341775e-05, + "loss": 2.4851, + "mean_token_accuracy": 0.4379310369491577, + "step": 137825 + }, + { + "epoch": 0.13882383986962665, + "grad_norm": 11.311234262064488, + "learning_rate": 4.903632434476018e-05, + "loss": 2.2905, + "mean_token_accuracy": 0.44827587008476255, + "step": 137830 + }, + { + "epoch": 0.13882887592273083, + "grad_norm": 10.161867191477869, + "learning_rate": 4.90362158522054e-05, + "loss": 2.4287, + "mean_token_accuracy": 0.42413793206214906, + "step": 137835 + }, + { + "epoch": 0.138833911975835, + "grad_norm": 9.990398835478857, + "learning_rate": 4.903610735367747e-05, + "loss": 2.1889, + "mean_token_accuracy": 0.4517241418361664, + "step": 137840 + }, + { + "epoch": 0.13883894802893917, + "grad_norm": 12.112970488862109, + "learning_rate": 4.903599884917642e-05, + "loss": 2.4917, + "mean_token_accuracy": 0.3931034475564957, + "step": 137845 + }, + { + "epoch": 0.13884398408204335, + "grad_norm": 10.24034146128821, + "learning_rate": 4.903589033870228e-05, + "loss": 2.1452, + "mean_token_accuracy": 0.4628078818321228, + "step": 137850 + }, + { + "epoch": 0.13884902013514752, + "grad_norm": 14.382979694102247, + "learning_rate": 4.903578182225508e-05, + "loss": 2.3549, + "mean_token_accuracy": 0.4517241418361664, + "step": 137855 + }, + { + "epoch": 0.1388540561882517, + "grad_norm": 11.666553817110822, + "learning_rate": 4.903567329983485e-05, + "loss": 2.6477, + "mean_token_accuracy": 0.3999999940395355, + "step": 137860 + }, + { + "epoch": 0.13885909224135587, + "grad_norm": 10.193437754289564, + "learning_rate": 4.903556477144162e-05, + "loss": 2.2737, + "mean_token_accuracy": 0.4448275864124298, + "step": 137865 + }, + { + "epoch": 0.13886412829446004, + "grad_norm": 10.590477856530862, + "learning_rate": 4.903545623707542e-05, + "loss": 3.1103, + "mean_token_accuracy": 0.3482758581638336, + "step": 137870 + }, + { + "epoch": 0.13886916434756422, + "grad_norm": 9.432163206937203, + "learning_rate": 4.903534769673629e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.3793103456497192, + "step": 137875 + }, + { + "epoch": 0.1388742004006684, + "grad_norm": 11.306132357714635, + "learning_rate": 4.903523915042424e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.4778584361076355, + "step": 137880 + }, + { + "epoch": 0.13887923645377256, + "grad_norm": 9.86446661995922, + "learning_rate": 4.903513059813932e-05, + "loss": 2.3377, + "mean_token_accuracy": 0.4586206912994385, + "step": 137885 + }, + { + "epoch": 0.13888427250687674, + "grad_norm": 10.780617319385092, + "learning_rate": 4.9035022039881544e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.43793103098869324, + "step": 137890 + }, + { + "epoch": 0.1388893085599809, + "grad_norm": 9.198797385726362, + "learning_rate": 4.9034913475650954e-05, + "loss": 2.0886, + "mean_token_accuracy": 0.4758620738983154, + "step": 137895 + }, + { + "epoch": 0.13889434461308509, + "grad_norm": 8.012888687964251, + "learning_rate": 4.9034804905447576e-05, + "loss": 2.7181, + "mean_token_accuracy": 0.40344828367233276, + "step": 137900 + }, + { + "epoch": 0.13889938066618926, + "grad_norm": 12.746106942797661, + "learning_rate": 4.9034696329271436e-05, + "loss": 2.8771, + "mean_token_accuracy": 0.37241379022598264, + "step": 137905 + }, + { + "epoch": 0.13890441671929343, + "grad_norm": 9.153222914600978, + "learning_rate": 4.903458774712257e-05, + "loss": 2.4741, + "mean_token_accuracy": 0.4413793087005615, + "step": 137910 + }, + { + "epoch": 0.1389094527723976, + "grad_norm": 9.589653414713865, + "learning_rate": 4.903447915900101e-05, + "loss": 2.3311, + "mean_token_accuracy": 0.4103448331356049, + "step": 137915 + }, + { + "epoch": 0.13891448882550178, + "grad_norm": 12.465493931272226, + "learning_rate": 4.903437056490677e-05, + "loss": 2.7182, + "mean_token_accuracy": 0.3862069010734558, + "step": 137920 + }, + { + "epoch": 0.13891952487860593, + "grad_norm": 12.194614443620884, + "learning_rate": 4.90342619648399e-05, + "loss": 2.8387, + "mean_token_accuracy": 0.38620689511299133, + "step": 137925 + }, + { + "epoch": 0.1389245609317101, + "grad_norm": 12.483580817677456, + "learning_rate": 4.9034153358800416e-05, + "loss": 2.7213, + "mean_token_accuracy": 0.34137930870056155, + "step": 137930 + }, + { + "epoch": 0.13892959698481427, + "grad_norm": 11.274087490412493, + "learning_rate": 4.9034044746788365e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.5019963681697845, + "step": 137935 + }, + { + "epoch": 0.13893463303791845, + "grad_norm": 9.183380510443397, + "learning_rate": 4.903393612880376e-05, + "loss": 2.2159, + "mean_token_accuracy": 0.45862067937850953, + "step": 137940 + }, + { + "epoch": 0.13893966909102262, + "grad_norm": 12.420203105639043, + "learning_rate": 4.903382750484663e-05, + "loss": 2.4579, + "mean_token_accuracy": 0.41034482717514037, + "step": 137945 + }, + { + "epoch": 0.1389447051441268, + "grad_norm": 8.71773347303741, + "learning_rate": 4.903371887491703e-05, + "loss": 2.5388, + "mean_token_accuracy": 0.4034482777118683, + "step": 137950 + }, + { + "epoch": 0.13894974119723097, + "grad_norm": 12.522384658642308, + "learning_rate": 4.9033610239014956e-05, + "loss": 2.5564, + "mean_token_accuracy": 0.38965516686439516, + "step": 137955 + }, + { + "epoch": 0.13895477725033514, + "grad_norm": 7.66552044937926, + "learning_rate": 4.903350159714046e-05, + "loss": 2.0543, + "mean_token_accuracy": 0.48275861144065857, + "step": 137960 + }, + { + "epoch": 0.13895981330343932, + "grad_norm": 11.261656770615696, + "learning_rate": 4.903339294929357e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.41034482717514037, + "step": 137965 + }, + { + "epoch": 0.1389648493565435, + "grad_norm": 12.070572764705537, + "learning_rate": 4.903328429547431e-05, + "loss": 2.314, + "mean_token_accuracy": 0.46400484442710876, + "step": 137970 + }, + { + "epoch": 0.13896988540964766, + "grad_norm": 9.969717366675551, + "learning_rate": 4.903317563568272e-05, + "loss": 2.4393, + "mean_token_accuracy": 0.4275861978530884, + "step": 137975 + }, + { + "epoch": 0.13897492146275184, + "grad_norm": 10.244218385183077, + "learning_rate": 4.903306696991882e-05, + "loss": 2.103, + "mean_token_accuracy": 0.4413793087005615, + "step": 137980 + }, + { + "epoch": 0.138979957515856, + "grad_norm": 9.91108807615884, + "learning_rate": 4.903295829818264e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.42413793206214906, + "step": 137985 + }, + { + "epoch": 0.13898499356896019, + "grad_norm": 9.70166862965431, + "learning_rate": 4.9032849620474216e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.4206896543502808, + "step": 137990 + }, + { + "epoch": 0.13899002962206436, + "grad_norm": 12.324521566578316, + "learning_rate": 4.9032740936793576e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.4344827473163605, + "step": 137995 + }, + { + "epoch": 0.13899506567516853, + "grad_norm": 8.910904160651258, + "learning_rate": 4.9032632247140744e-05, + "loss": 2.3252, + "mean_token_accuracy": 0.4000000059604645, + "step": 138000 + }, + { + "epoch": 0.1390001017282727, + "grad_norm": 10.912184101473503, + "learning_rate": 4.903252355151577e-05, + "loss": 2.4278, + "mean_token_accuracy": 0.37241379022598264, + "step": 138005 + }, + { + "epoch": 0.13900513778137688, + "grad_norm": 8.401878709833023, + "learning_rate": 4.903241484991866e-05, + "loss": 2.6433, + "mean_token_accuracy": 0.4241379380226135, + "step": 138010 + }, + { + "epoch": 0.13901017383448105, + "grad_norm": 11.411948926553926, + "learning_rate": 4.9032306142349455e-05, + "loss": 2.1714, + "mean_token_accuracy": 0.4655172288417816, + "step": 138015 + }, + { + "epoch": 0.13901520988758523, + "grad_norm": 9.002902173648769, + "learning_rate": 4.9032197428808186e-05, + "loss": 2.2608, + "mean_token_accuracy": 0.44827585220336913, + "step": 138020 + }, + { + "epoch": 0.1390202459406894, + "grad_norm": 11.468546986488565, + "learning_rate": 4.903208870929488e-05, + "loss": 2.1164, + "mean_token_accuracy": 0.4689655125141144, + "step": 138025 + }, + { + "epoch": 0.13902528199379358, + "grad_norm": 10.21250185102687, + "learning_rate": 4.9031979983809574e-05, + "loss": 2.523, + "mean_token_accuracy": 0.44827585816383364, + "step": 138030 + }, + { + "epoch": 0.13903031804689775, + "grad_norm": 9.20307640474596, + "learning_rate": 4.9031871252352285e-05, + "loss": 2.2648, + "mean_token_accuracy": 0.458620685338974, + "step": 138035 + }, + { + "epoch": 0.13903535410000192, + "grad_norm": 11.237508977536352, + "learning_rate": 4.903176251492306e-05, + "loss": 2.689, + "mean_token_accuracy": 0.3999999940395355, + "step": 138040 + }, + { + "epoch": 0.1390403901531061, + "grad_norm": 10.444415930032232, + "learning_rate": 4.9031653771521917e-05, + "loss": 2.1158, + "mean_token_accuracy": 0.4586206912994385, + "step": 138045 + }, + { + "epoch": 0.13904542620621027, + "grad_norm": 12.991128060419946, + "learning_rate": 4.9031545022148884e-05, + "loss": 2.1717, + "mean_token_accuracy": 0.42413793206214906, + "step": 138050 + }, + { + "epoch": 0.13905046225931444, + "grad_norm": 10.21366718347926, + "learning_rate": 4.9031436266804006e-05, + "loss": 2.5931, + "mean_token_accuracy": 0.4551724135875702, + "step": 138055 + }, + { + "epoch": 0.13905549831241862, + "grad_norm": 9.507030179771352, + "learning_rate": 4.90313275054873e-05, + "loss": 2.3662, + "mean_token_accuracy": 0.42928009629249575, + "step": 138060 + }, + { + "epoch": 0.13906053436552276, + "grad_norm": 9.136395429138256, + "learning_rate": 4.903121873819879e-05, + "loss": 2.6565, + "mean_token_accuracy": 0.39310345649719236, + "step": 138065 + }, + { + "epoch": 0.13906557041862694, + "grad_norm": 11.72924407157271, + "learning_rate": 4.9031109964938534e-05, + "loss": 2.1958, + "mean_token_accuracy": 0.48517847061157227, + "step": 138070 + }, + { + "epoch": 0.1390706064717311, + "grad_norm": 9.94878574891501, + "learning_rate": 4.9031001185706534e-05, + "loss": 1.9967, + "mean_token_accuracy": 0.4896551728248596, + "step": 138075 + }, + { + "epoch": 0.13907564252483529, + "grad_norm": 10.061181207790982, + "learning_rate": 4.903089240050283e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.4206896543502808, + "step": 138080 + }, + { + "epoch": 0.13908067857793946, + "grad_norm": 10.56597023375056, + "learning_rate": 4.903078360932746e-05, + "loss": 2.3996, + "mean_token_accuracy": 0.39655173420906065, + "step": 138085 + }, + { + "epoch": 0.13908571463104363, + "grad_norm": 10.378029170450773, + "learning_rate": 4.9030674812180435e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.4448275864124298, + "step": 138090 + }, + { + "epoch": 0.1390907506841478, + "grad_norm": 8.75736642418929, + "learning_rate": 4.90305660090618e-05, + "loss": 2.0634, + "mean_token_accuracy": 0.4896551787853241, + "step": 138095 + }, + { + "epoch": 0.13909578673725198, + "grad_norm": 9.659099767311268, + "learning_rate": 4.90304571999716e-05, + "loss": 2.5228, + "mean_token_accuracy": 0.42758620977401735, + "step": 138100 + }, + { + "epoch": 0.13910082279035615, + "grad_norm": 11.432933599256998, + "learning_rate": 4.9030348384909825e-05, + "loss": 2.6788, + "mean_token_accuracy": 0.38965516686439516, + "step": 138105 + }, + { + "epoch": 0.13910585884346033, + "grad_norm": 11.906547700122951, + "learning_rate": 4.903023956387654e-05, + "loss": 2.2792, + "mean_token_accuracy": 0.45009074807167054, + "step": 138110 + }, + { + "epoch": 0.1391108948965645, + "grad_norm": 13.552862035027253, + "learning_rate": 4.903013073687175e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4448275864124298, + "step": 138115 + }, + { + "epoch": 0.13911593094966868, + "grad_norm": 10.987791871745515, + "learning_rate": 4.903002190389551e-05, + "loss": 2.622, + "mean_token_accuracy": 0.4413793087005615, + "step": 138120 + }, + { + "epoch": 0.13912096700277285, + "grad_norm": 11.633542299620593, + "learning_rate": 4.9029913064947835e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.41034482717514037, + "step": 138125 + }, + { + "epoch": 0.13912600305587702, + "grad_norm": 8.137984642986344, + "learning_rate": 4.9029804220028755e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.5, + "step": 138130 + }, + { + "epoch": 0.1391310391089812, + "grad_norm": 13.88857811975663, + "learning_rate": 4.902969536913831e-05, + "loss": 2.452, + "mean_token_accuracy": 0.4034482777118683, + "step": 138135 + }, + { + "epoch": 0.13913607516208537, + "grad_norm": 10.689321482944598, + "learning_rate": 4.902958651227652e-05, + "loss": 2.4192, + "mean_token_accuracy": 0.4448275864124298, + "step": 138140 + }, + { + "epoch": 0.13914111121518954, + "grad_norm": 15.883226788609294, + "learning_rate": 4.9029477649443416e-05, + "loss": 2.857, + "mean_token_accuracy": 0.38620689511299133, + "step": 138145 + }, + { + "epoch": 0.13914614726829372, + "grad_norm": 10.444189513819644, + "learning_rate": 4.9029368780639036e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.44827587008476255, + "step": 138150 + }, + { + "epoch": 0.1391511833213979, + "grad_norm": 10.125392677487993, + "learning_rate": 4.90292599058634e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.44319420456886294, + "step": 138155 + }, + { + "epoch": 0.13915621937450207, + "grad_norm": 9.904978871022061, + "learning_rate": 4.902915102511655e-05, + "loss": 2.899, + "mean_token_accuracy": 0.36896551251411436, + "step": 138160 + }, + { + "epoch": 0.13916125542760624, + "grad_norm": 9.515434606826695, + "learning_rate": 4.9029042138398505e-05, + "loss": 2.2028, + "mean_token_accuracy": 0.44482758045196535, + "step": 138165 + }, + { + "epoch": 0.1391662914807104, + "grad_norm": 9.369317643402482, + "learning_rate": 4.90289332457093e-05, + "loss": 2.2061, + "mean_token_accuracy": 0.44948577880859375, + "step": 138170 + }, + { + "epoch": 0.1391713275338146, + "grad_norm": 9.407218242209533, + "learning_rate": 4.9028824347048975e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.44137930274009707, + "step": 138175 + }, + { + "epoch": 0.13917636358691876, + "grad_norm": 11.928753865683907, + "learning_rate": 4.902871544241754e-05, + "loss": 2.2869, + "mean_token_accuracy": 0.4471869349479675, + "step": 138180 + }, + { + "epoch": 0.13918139964002293, + "grad_norm": 9.101652122933654, + "learning_rate": 4.902860653181503e-05, + "loss": 2.1599, + "mean_token_accuracy": 0.49999999403953554, + "step": 138185 + }, + { + "epoch": 0.1391864356931271, + "grad_norm": 10.007759430917048, + "learning_rate": 4.902849761524149e-05, + "loss": 2.659, + "mean_token_accuracy": 0.4275861978530884, + "step": 138190 + }, + { + "epoch": 0.13919147174623128, + "grad_norm": 11.164013256194261, + "learning_rate": 4.902838869269694e-05, + "loss": 2.46, + "mean_token_accuracy": 0.41379310488700866, + "step": 138195 + }, + { + "epoch": 0.13919650779933546, + "grad_norm": 8.214313629134498, + "learning_rate": 4.902827976418141e-05, + "loss": 2.1199, + "mean_token_accuracy": 0.5068965554237366, + "step": 138200 + }, + { + "epoch": 0.1392015438524396, + "grad_norm": 10.420779353360977, + "learning_rate": 4.902817082969493e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.4689655125141144, + "step": 138205 + }, + { + "epoch": 0.13920657990554378, + "grad_norm": 13.41093416659383, + "learning_rate": 4.902806188923753e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.3379310369491577, + "step": 138210 + }, + { + "epoch": 0.13921161595864795, + "grad_norm": 8.564914610140596, + "learning_rate": 4.902795294280925e-05, + "loss": 2.6871, + "mean_token_accuracy": 0.4398669064044952, + "step": 138215 + }, + { + "epoch": 0.13921665201175212, + "grad_norm": 10.542582119819189, + "learning_rate": 4.90278439904101e-05, + "loss": 2.2545, + "mean_token_accuracy": 0.43103447556495667, + "step": 138220 + }, + { + "epoch": 0.1392216880648563, + "grad_norm": 8.494785645191097, + "learning_rate": 4.9027735032040126e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.4379310369491577, + "step": 138225 + }, + { + "epoch": 0.13922672411796047, + "grad_norm": 9.554576544302856, + "learning_rate": 4.9027626067699355e-05, + "loss": 2.3332, + "mean_token_accuracy": 0.4379310369491577, + "step": 138230 + }, + { + "epoch": 0.13923176017106464, + "grad_norm": 8.897587687713937, + "learning_rate": 4.902751709738782e-05, + "loss": 2.6564, + "mean_token_accuracy": 0.44137930274009707, + "step": 138235 + }, + { + "epoch": 0.13923679622416882, + "grad_norm": 10.74437103532797, + "learning_rate": 4.902740812110555e-05, + "loss": 2.7377, + "mean_token_accuracy": 0.37931033968925476, + "step": 138240 + }, + { + "epoch": 0.139241832277273, + "grad_norm": 13.989450030941116, + "learning_rate": 4.902729913885257e-05, + "loss": 2.8632, + "mean_token_accuracy": 0.3275862067937851, + "step": 138245 + }, + { + "epoch": 0.13924686833037717, + "grad_norm": 10.747075860146865, + "learning_rate": 4.902719015062891e-05, + "loss": 2.219, + "mean_token_accuracy": 0.447126442193985, + "step": 138250 + }, + { + "epoch": 0.13925190438348134, + "grad_norm": 8.964149179309548, + "learning_rate": 4.90270811564346e-05, + "loss": 2.2549, + "mean_token_accuracy": 0.4535995125770569, + "step": 138255 + }, + { + "epoch": 0.1392569404365855, + "grad_norm": 11.30206386376445, + "learning_rate": 4.902697215626968e-05, + "loss": 2.3434, + "mean_token_accuracy": 0.4344827562570572, + "step": 138260 + }, + { + "epoch": 0.1392619764896897, + "grad_norm": 13.915997808145846, + "learning_rate": 4.902686315013418e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.47241378426551817, + "step": 138265 + }, + { + "epoch": 0.13926701254279386, + "grad_norm": 13.101239805499006, + "learning_rate": 4.9026754138028115e-05, + "loss": 2.5622, + "mean_token_accuracy": 0.4448275864124298, + "step": 138270 + }, + { + "epoch": 0.13927204859589803, + "grad_norm": 10.882545307473345, + "learning_rate": 4.9026645119951526e-05, + "loss": 2.0387, + "mean_token_accuracy": 0.4507561981678009, + "step": 138275 + }, + { + "epoch": 0.1392770846490022, + "grad_norm": 8.803099259557577, + "learning_rate": 4.9026536095904446e-05, + "loss": 2.3447, + "mean_token_accuracy": 0.4206896543502808, + "step": 138280 + }, + { + "epoch": 0.13928212070210638, + "grad_norm": 11.073421275329984, + "learning_rate": 4.9026427065886894e-05, + "loss": 2.1487, + "mean_token_accuracy": 0.5068965554237366, + "step": 138285 + }, + { + "epoch": 0.13928715675521056, + "grad_norm": 10.946873592636605, + "learning_rate": 4.902631802989891e-05, + "loss": 2.3964, + "mean_token_accuracy": 0.42758620977401735, + "step": 138290 + }, + { + "epoch": 0.13929219280831473, + "grad_norm": 11.61481953076501, + "learning_rate": 4.902620898794053e-05, + "loss": 2.5463, + "mean_token_accuracy": 0.4517241358757019, + "step": 138295 + }, + { + "epoch": 0.1392972288614189, + "grad_norm": 8.827576466973946, + "learning_rate": 4.902609994001176e-05, + "loss": 2.186, + "mean_token_accuracy": 0.4324258863925934, + "step": 138300 + }, + { + "epoch": 0.13930226491452308, + "grad_norm": 11.596319817919353, + "learning_rate": 4.9025990886112656e-05, + "loss": 2.6384, + "mean_token_accuracy": 0.4034482777118683, + "step": 138305 + }, + { + "epoch": 0.13930730096762725, + "grad_norm": 11.188737093216742, + "learning_rate": 4.902588182624324e-05, + "loss": 2.4005, + "mean_token_accuracy": 0.4379310369491577, + "step": 138310 + }, + { + "epoch": 0.13931233702073142, + "grad_norm": 10.877356751000981, + "learning_rate": 4.902577276040354e-05, + "loss": 2.1877, + "mean_token_accuracy": 0.4811857223510742, + "step": 138315 + }, + { + "epoch": 0.1393173730738356, + "grad_norm": 9.001370454213625, + "learning_rate": 4.902566368859357e-05, + "loss": 2.1878, + "mean_token_accuracy": 0.46896551847457885, + "step": 138320 + }, + { + "epoch": 0.13932240912693977, + "grad_norm": 11.585411080556945, + "learning_rate": 4.9025554610813396e-05, + "loss": 2.2194, + "mean_token_accuracy": 0.42758620977401735, + "step": 138325 + }, + { + "epoch": 0.13932744518004395, + "grad_norm": 9.698754410718333, + "learning_rate": 4.902544552706302e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.47931033968925474, + "step": 138330 + }, + { + "epoch": 0.13933248123314812, + "grad_norm": 9.79765868913099, + "learning_rate": 4.902533643734248e-05, + "loss": 2.3342, + "mean_token_accuracy": 0.4482758641242981, + "step": 138335 + }, + { + "epoch": 0.1393375172862523, + "grad_norm": 11.372303806391345, + "learning_rate": 4.902522734165182e-05, + "loss": 2.6344, + "mean_token_accuracy": 0.37931033968925476, + "step": 138340 + }, + { + "epoch": 0.13934255333935644, + "grad_norm": 10.733291449885284, + "learning_rate": 4.9025118239991046e-05, + "loss": 2.282, + "mean_token_accuracy": 0.4620689570903778, + "step": 138345 + }, + { + "epoch": 0.1393475893924606, + "grad_norm": 11.032773316002121, + "learning_rate": 4.9025009132360205e-05, + "loss": 2.4531, + "mean_token_accuracy": 0.43793103098869324, + "step": 138350 + }, + { + "epoch": 0.1393526254455648, + "grad_norm": 10.410188478437263, + "learning_rate": 4.9024900018759326e-05, + "loss": 2.5367, + "mean_token_accuracy": 0.3965517282485962, + "step": 138355 + }, + { + "epoch": 0.13935766149866896, + "grad_norm": 10.81220914775972, + "learning_rate": 4.902479089918842e-05, + "loss": 2.7992, + "mean_token_accuracy": 0.3793103456497192, + "step": 138360 + }, + { + "epoch": 0.13936269755177313, + "grad_norm": 10.646893498768199, + "learning_rate": 4.902468177364755e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.4259528160095215, + "step": 138365 + }, + { + "epoch": 0.1393677336048773, + "grad_norm": 10.169692944861206, + "learning_rate": 4.902457264213672e-05, + "loss": 2.8302, + "mean_token_accuracy": 0.41034482717514037, + "step": 138370 + }, + { + "epoch": 0.13937276965798148, + "grad_norm": 11.67320683246617, + "learning_rate": 4.902446350465598e-05, + "loss": 2.3626, + "mean_token_accuracy": 0.41724138259887694, + "step": 138375 + }, + { + "epoch": 0.13937780571108566, + "grad_norm": 12.382190024465723, + "learning_rate": 4.902435436120534e-05, + "loss": 2.2152, + "mean_token_accuracy": 0.42561576664447787, + "step": 138380 + }, + { + "epoch": 0.13938284176418983, + "grad_norm": 10.763645388359468, + "learning_rate": 4.902424521178484e-05, + "loss": 2.2482, + "mean_token_accuracy": 0.46206897497177124, + "step": 138385 + }, + { + "epoch": 0.139387877817294, + "grad_norm": 12.41934744040569, + "learning_rate": 4.9024136056394514e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.44482758045196535, + "step": 138390 + }, + { + "epoch": 0.13939291387039818, + "grad_norm": 12.094651792852375, + "learning_rate": 4.902402689503439e-05, + "loss": 2.1624, + "mean_token_accuracy": 0.482758617401123, + "step": 138395 + }, + { + "epoch": 0.13939794992350235, + "grad_norm": 10.181590765726154, + "learning_rate": 4.9023917727704494e-05, + "loss": 2.4502, + "mean_token_accuracy": 0.4103448331356049, + "step": 138400 + }, + { + "epoch": 0.13940298597660652, + "grad_norm": 10.98334490657734, + "learning_rate": 4.9023808554404855e-05, + "loss": 2.3567, + "mean_token_accuracy": 0.458620685338974, + "step": 138405 + }, + { + "epoch": 0.1394080220297107, + "grad_norm": 11.351928047402858, + "learning_rate": 4.902369937513551e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.4551724135875702, + "step": 138410 + }, + { + "epoch": 0.13941305808281487, + "grad_norm": 11.2812501949377, + "learning_rate": 4.902359018989649e-05, + "loss": 2.3831, + "mean_token_accuracy": 0.4344827651977539, + "step": 138415 + }, + { + "epoch": 0.13941809413591905, + "grad_norm": 11.043802477745736, + "learning_rate": 4.902348099868783e-05, + "loss": 2.8233, + "mean_token_accuracy": 0.3931034475564957, + "step": 138420 + }, + { + "epoch": 0.13942313018902322, + "grad_norm": 9.935437741146034, + "learning_rate": 4.902337180150954e-05, + "loss": 2.1206, + "mean_token_accuracy": 0.48275862336158754, + "step": 138425 + }, + { + "epoch": 0.1394281662421274, + "grad_norm": 10.799092022856922, + "learning_rate": 4.902326259836166e-05, + "loss": 2.3676, + "mean_token_accuracy": 0.4517241358757019, + "step": 138430 + }, + { + "epoch": 0.13943320229523157, + "grad_norm": 9.067445162586733, + "learning_rate": 4.9023153389244226e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.4000000059604645, + "step": 138435 + }, + { + "epoch": 0.13943823834833574, + "grad_norm": 9.11153063017215, + "learning_rate": 4.902304417415727e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.4172413766384125, + "step": 138440 + }, + { + "epoch": 0.13944327440143992, + "grad_norm": 12.161098585142575, + "learning_rate": 4.902293495310081e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.4517241299152374, + "step": 138445 + }, + { + "epoch": 0.1394483104545441, + "grad_norm": 11.321213945096785, + "learning_rate": 4.90228257260749e-05, + "loss": 2.5568, + "mean_token_accuracy": 0.37931033968925476, + "step": 138450 + }, + { + "epoch": 0.13945334650764826, + "grad_norm": 8.778706998273167, + "learning_rate": 4.902271649307953e-05, + "loss": 2.493, + "mean_token_accuracy": 0.4275861978530884, + "step": 138455 + }, + { + "epoch": 0.13945838256075244, + "grad_norm": 13.006123335979375, + "learning_rate": 4.902260725411477e-05, + "loss": 2.2237, + "mean_token_accuracy": 0.4724137902259827, + "step": 138460 + }, + { + "epoch": 0.1394634186138566, + "grad_norm": 11.11006556785583, + "learning_rate": 4.902249800918063e-05, + "loss": 2.5023, + "mean_token_accuracy": 0.3827586144208908, + "step": 138465 + }, + { + "epoch": 0.13946845466696078, + "grad_norm": 9.32610739204872, + "learning_rate": 4.902238875827715e-05, + "loss": 2.1111, + "mean_token_accuracy": 0.4879007875919342, + "step": 138470 + }, + { + "epoch": 0.13947349072006496, + "grad_norm": 12.701999385316975, + "learning_rate": 4.902227950140435e-05, + "loss": 2.3897, + "mean_token_accuracy": 0.4103448212146759, + "step": 138475 + }, + { + "epoch": 0.13947852677316913, + "grad_norm": 18.756392609940214, + "learning_rate": 4.902217023856227e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.47586207985877993, + "step": 138480 + }, + { + "epoch": 0.13948356282627328, + "grad_norm": 8.76499009037372, + "learning_rate": 4.902206096975093e-05, + "loss": 2.2433, + "mean_token_accuracy": 0.47241379618644713, + "step": 138485 + }, + { + "epoch": 0.13948859887937745, + "grad_norm": 10.424806807924217, + "learning_rate": 4.9021951694970366e-05, + "loss": 2.2282, + "mean_token_accuracy": 0.4366001129150391, + "step": 138490 + }, + { + "epoch": 0.13949363493248162, + "grad_norm": 9.530232678047204, + "learning_rate": 4.902184241422061e-05, + "loss": 2.3043, + "mean_token_accuracy": 0.4241379380226135, + "step": 138495 + }, + { + "epoch": 0.1394986709855858, + "grad_norm": 14.184248034622462, + "learning_rate": 4.902173312750169e-05, + "loss": 3.0517, + "mean_token_accuracy": 0.3896551728248596, + "step": 138500 + }, + { + "epoch": 0.13950370703868997, + "grad_norm": 8.646129628138572, + "learning_rate": 4.902162383481364e-05, + "loss": 2.1395, + "mean_token_accuracy": 0.482758617401123, + "step": 138505 + }, + { + "epoch": 0.13950874309179415, + "grad_norm": 8.954026470766216, + "learning_rate": 4.902151453615649e-05, + "loss": 2.5184, + "mean_token_accuracy": 0.4310344815254211, + "step": 138510 + }, + { + "epoch": 0.13951377914489832, + "grad_norm": 9.92052424405013, + "learning_rate": 4.902140523153026e-05, + "loss": 2.3178, + "mean_token_accuracy": 0.43103448748588563, + "step": 138515 + }, + { + "epoch": 0.1395188151980025, + "grad_norm": 10.608029590972818, + "learning_rate": 4.902129592093499e-05, + "loss": 2.4826, + "mean_token_accuracy": 0.3620689630508423, + "step": 138520 + }, + { + "epoch": 0.13952385125110667, + "grad_norm": 9.38824613635307, + "learning_rate": 4.902118660437071e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.38275861740112305, + "step": 138525 + }, + { + "epoch": 0.13952888730421084, + "grad_norm": 11.012334879282674, + "learning_rate": 4.9021077281837455e-05, + "loss": 3.1635, + "mean_token_accuracy": 0.34137930870056155, + "step": 138530 + }, + { + "epoch": 0.13953392335731502, + "grad_norm": 9.412759144174226, + "learning_rate": 4.902096795333524e-05, + "loss": 2.1925, + "mean_token_accuracy": 0.48802177906036376, + "step": 138535 + }, + { + "epoch": 0.1395389594104192, + "grad_norm": 10.633446858388059, + "learning_rate": 4.9020858618864106e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.43103448748588563, + "step": 138540 + }, + { + "epoch": 0.13954399546352336, + "grad_norm": 8.375857380693052, + "learning_rate": 4.9020749278424084e-05, + "loss": 2.3735, + "mean_token_accuracy": 0.46763460636138915, + "step": 138545 + }, + { + "epoch": 0.13954903151662754, + "grad_norm": 11.612892044901223, + "learning_rate": 4.9020639932015197e-05, + "loss": 2.1501, + "mean_token_accuracy": 0.4620689690113068, + "step": 138550 + }, + { + "epoch": 0.1395540675697317, + "grad_norm": 9.97141359396388, + "learning_rate": 4.902053057963748e-05, + "loss": 1.9055, + "mean_token_accuracy": 0.5135511159896851, + "step": 138555 + }, + { + "epoch": 0.13955910362283588, + "grad_norm": 13.345035026753036, + "learning_rate": 4.9020421221290976e-05, + "loss": 3.0173, + "mean_token_accuracy": 0.34482758939266206, + "step": 138560 + }, + { + "epoch": 0.13956413967594006, + "grad_norm": 11.707339745097064, + "learning_rate": 4.902031185697569e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.4034482777118683, + "step": 138565 + }, + { + "epoch": 0.13956917572904423, + "grad_norm": 11.758542923246482, + "learning_rate": 4.902020248669167e-05, + "loss": 2.0067, + "mean_token_accuracy": 0.4517241418361664, + "step": 138570 + }, + { + "epoch": 0.1395742117821484, + "grad_norm": 11.196569265510686, + "learning_rate": 4.902009311043894e-05, + "loss": 2.2605, + "mean_token_accuracy": 0.43448275327682495, + "step": 138575 + }, + { + "epoch": 0.13957924783525258, + "grad_norm": 10.052913751080675, + "learning_rate": 4.901998372821753e-05, + "loss": 2.2127, + "mean_token_accuracy": 0.4793103575706482, + "step": 138580 + }, + { + "epoch": 0.13958428388835675, + "grad_norm": 10.325772128783614, + "learning_rate": 4.9019874340027475e-05, + "loss": 2.7954, + "mean_token_accuracy": 0.4068965494632721, + "step": 138585 + }, + { + "epoch": 0.13958931994146093, + "grad_norm": 10.84032970806072, + "learning_rate": 4.9019764945868806e-05, + "loss": 2.5035, + "mean_token_accuracy": 0.4172413766384125, + "step": 138590 + }, + { + "epoch": 0.1395943559945651, + "grad_norm": 10.6851604910485, + "learning_rate": 4.901965554574154e-05, + "loss": 2.2395, + "mean_token_accuracy": 0.47931033968925474, + "step": 138595 + }, + { + "epoch": 0.13959939204766927, + "grad_norm": 13.432105152064153, + "learning_rate": 4.901954613964573e-05, + "loss": 2.3655, + "mean_token_accuracy": 0.4413793087005615, + "step": 138600 + }, + { + "epoch": 0.13960442810077345, + "grad_norm": 10.057474841407988, + "learning_rate": 4.9019436727581385e-05, + "loss": 2.5073, + "mean_token_accuracy": 0.4206896543502808, + "step": 138605 + }, + { + "epoch": 0.13960946415387762, + "grad_norm": 9.650911306282936, + "learning_rate": 4.9019327309548544e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.4599515974521637, + "step": 138610 + }, + { + "epoch": 0.1396145002069818, + "grad_norm": 11.248147183678558, + "learning_rate": 4.901921788554725e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.42413793206214906, + "step": 138615 + }, + { + "epoch": 0.13961953626008597, + "grad_norm": 10.12306111273883, + "learning_rate": 4.90191084555775e-05, + "loss": 2.1577, + "mean_token_accuracy": 0.43103448748588563, + "step": 138620 + }, + { + "epoch": 0.13962457231319012, + "grad_norm": 13.757092797154547, + "learning_rate": 4.901899901963936e-05, + "loss": 2.786, + "mean_token_accuracy": 0.3931034505367279, + "step": 138625 + }, + { + "epoch": 0.1396296083662943, + "grad_norm": 9.97554070946242, + "learning_rate": 4.901888957773284e-05, + "loss": 2.6034, + "mean_token_accuracy": 0.34482758641242983, + "step": 138630 + }, + { + "epoch": 0.13963464441939846, + "grad_norm": 16.11802176669795, + "learning_rate": 4.9018780129857975e-05, + "loss": 3.1878, + "mean_token_accuracy": 0.32413793802261354, + "step": 138635 + }, + { + "epoch": 0.13963968047250264, + "grad_norm": 12.025073633421036, + "learning_rate": 4.9018670676014796e-05, + "loss": 2.8853, + "mean_token_accuracy": 0.40000000298023225, + "step": 138640 + }, + { + "epoch": 0.1396447165256068, + "grad_norm": 9.608657618001468, + "learning_rate": 4.9018561216203335e-05, + "loss": 2.4033, + "mean_token_accuracy": 0.3965517282485962, + "step": 138645 + }, + { + "epoch": 0.13964975257871098, + "grad_norm": 10.923076100713418, + "learning_rate": 4.901845175042362e-05, + "loss": 2.6882, + "mean_token_accuracy": 0.36896551847457887, + "step": 138650 + }, + { + "epoch": 0.13965478863181516, + "grad_norm": 9.891167464380397, + "learning_rate": 4.901834227867568e-05, + "loss": 2.1082, + "mean_token_accuracy": 0.4501512348651886, + "step": 138655 + }, + { + "epoch": 0.13965982468491933, + "grad_norm": 11.83539559042224, + "learning_rate": 4.901823280095955e-05, + "loss": 2.3404, + "mean_token_accuracy": 0.4401088893413544, + "step": 138660 + }, + { + "epoch": 0.1396648607380235, + "grad_norm": 9.50602837051987, + "learning_rate": 4.901812331727525e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.47241379618644713, + "step": 138665 + }, + { + "epoch": 0.13966989679112768, + "grad_norm": 9.991968010804602, + "learning_rate": 4.9018013827622834e-05, + "loss": 2.4565, + "mean_token_accuracy": 0.41034482717514037, + "step": 138670 + }, + { + "epoch": 0.13967493284423185, + "grad_norm": 12.534533423459216, + "learning_rate": 4.901790433200231e-05, + "loss": 2.2622, + "mean_token_accuracy": 0.47241378426551817, + "step": 138675 + }, + { + "epoch": 0.13967996889733603, + "grad_norm": 6.9494080248452645, + "learning_rate": 4.9017794830413714e-05, + "loss": 2.339, + "mean_token_accuracy": 0.4748336374759674, + "step": 138680 + }, + { + "epoch": 0.1396850049504402, + "grad_norm": 8.24727363396836, + "learning_rate": 4.901768532285708e-05, + "loss": 2.259, + "mean_token_accuracy": 0.4498487591743469, + "step": 138685 + }, + { + "epoch": 0.13969004100354437, + "grad_norm": 10.413743122868981, + "learning_rate": 4.9017575809332434e-05, + "loss": 2.3511, + "mean_token_accuracy": 0.4482758641242981, + "step": 138690 + }, + { + "epoch": 0.13969507705664855, + "grad_norm": 9.514875476680462, + "learning_rate": 4.9017466289839804e-05, + "loss": 1.8696, + "mean_token_accuracy": 0.49999999403953554, + "step": 138695 + }, + { + "epoch": 0.13970011310975272, + "grad_norm": 15.61218891489284, + "learning_rate": 4.9017356764379224e-05, + "loss": 2.1779, + "mean_token_accuracy": 0.44137930274009707, + "step": 138700 + }, + { + "epoch": 0.1397051491628569, + "grad_norm": 11.5958430694819, + "learning_rate": 4.901724723295073e-05, + "loss": 2.7642, + "mean_token_accuracy": 0.3965517282485962, + "step": 138705 + }, + { + "epoch": 0.13971018521596107, + "grad_norm": 10.743450446058946, + "learning_rate": 4.901713769555434e-05, + "loss": 2.0948, + "mean_token_accuracy": 0.4620689690113068, + "step": 138710 + }, + { + "epoch": 0.13971522126906524, + "grad_norm": 11.391153197258028, + "learning_rate": 4.90170281521901e-05, + "loss": 2.4385, + "mean_token_accuracy": 0.46896551847457885, + "step": 138715 + }, + { + "epoch": 0.13972025732216942, + "grad_norm": 8.123152879525938, + "learning_rate": 4.901691860285803e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4379310369491577, + "step": 138720 + }, + { + "epoch": 0.1397252933752736, + "grad_norm": 9.04099305298166, + "learning_rate": 4.901680904755816e-05, + "loss": 2.5904, + "mean_token_accuracy": 0.3517241358757019, + "step": 138725 + }, + { + "epoch": 0.13973032942837776, + "grad_norm": 11.197689852313406, + "learning_rate": 4.901669948629052e-05, + "loss": 2.7276, + "mean_token_accuracy": 0.41379310488700866, + "step": 138730 + }, + { + "epoch": 0.13973536548148194, + "grad_norm": 8.543191133511975, + "learning_rate": 4.901658991905515e-05, + "loss": 2.1756, + "mean_token_accuracy": 0.4413793087005615, + "step": 138735 + }, + { + "epoch": 0.1397404015345861, + "grad_norm": 9.259519840888386, + "learning_rate": 4.901648034585207e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.4413793087005615, + "step": 138740 + }, + { + "epoch": 0.13974543758769029, + "grad_norm": 10.094875051719807, + "learning_rate": 4.901637076668132e-05, + "loss": 2.8997, + "mean_token_accuracy": 0.3482758551836014, + "step": 138745 + }, + { + "epoch": 0.13975047364079446, + "grad_norm": 11.52518806306133, + "learning_rate": 4.901626118154292e-05, + "loss": 2.4085, + "mean_token_accuracy": 0.4068965554237366, + "step": 138750 + }, + { + "epoch": 0.13975550969389863, + "grad_norm": 9.924997927173873, + "learning_rate": 4.90161515904369e-05, + "loss": 2.5045, + "mean_token_accuracy": 0.41034482717514037, + "step": 138755 + }, + { + "epoch": 0.1397605457470028, + "grad_norm": 10.322435487405864, + "learning_rate": 4.90160419933633e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.41034482717514037, + "step": 138760 + }, + { + "epoch": 0.13976558180010695, + "grad_norm": 10.984497541881616, + "learning_rate": 4.901593239032214e-05, + "loss": 2.5877, + "mean_token_accuracy": 0.44482758045196535, + "step": 138765 + }, + { + "epoch": 0.13977061785321113, + "grad_norm": 9.985500438638363, + "learning_rate": 4.9015822781313456e-05, + "loss": 2.6733, + "mean_token_accuracy": 0.3862068891525269, + "step": 138770 + }, + { + "epoch": 0.1397756539063153, + "grad_norm": 9.839280189869713, + "learning_rate": 4.901571316633729e-05, + "loss": 2.227, + "mean_token_accuracy": 0.41034482717514037, + "step": 138775 + }, + { + "epoch": 0.13978068995941947, + "grad_norm": 11.619182377787622, + "learning_rate": 4.9015603545393646e-05, + "loss": 2.7114, + "mean_token_accuracy": 0.37241379618644715, + "step": 138780 + }, + { + "epoch": 0.13978572601252365, + "grad_norm": 9.720204505650978, + "learning_rate": 4.901549391848258e-05, + "loss": 2.1535, + "mean_token_accuracy": 0.4344827592372894, + "step": 138785 + }, + { + "epoch": 0.13979076206562782, + "grad_norm": 8.880793317753692, + "learning_rate": 4.90153842856041e-05, + "loss": 2.4879, + "mean_token_accuracy": 0.41724138259887694, + "step": 138790 + }, + { + "epoch": 0.139795798118732, + "grad_norm": 9.848995127445994, + "learning_rate": 4.9015274646758255e-05, + "loss": 2.1515, + "mean_token_accuracy": 0.45517241954803467, + "step": 138795 + }, + { + "epoch": 0.13980083417183617, + "grad_norm": 9.391197121756159, + "learning_rate": 4.9015165001945074e-05, + "loss": 2.8079, + "mean_token_accuracy": 0.3379310369491577, + "step": 138800 + }, + { + "epoch": 0.13980587022494034, + "grad_norm": 12.512222154983283, + "learning_rate": 4.9015055351164574e-05, + "loss": 2.4856, + "mean_token_accuracy": 0.4206896543502808, + "step": 138805 + }, + { + "epoch": 0.13981090627804452, + "grad_norm": 13.156955501515759, + "learning_rate": 4.9014945694416794e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.43103447556495667, + "step": 138810 + }, + { + "epoch": 0.1398159423311487, + "grad_norm": 14.232943009717248, + "learning_rate": 4.901483603170177e-05, + "loss": 2.7672, + "mean_token_accuracy": 0.3931034505367279, + "step": 138815 + }, + { + "epoch": 0.13982097838425286, + "grad_norm": 10.095733539290586, + "learning_rate": 4.901472636301952e-05, + "loss": 2.3585, + "mean_token_accuracy": 0.41724138259887694, + "step": 138820 + }, + { + "epoch": 0.13982601443735704, + "grad_norm": 11.012987496322, + "learning_rate": 4.901461668837008e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.42601331472396853, + "step": 138825 + }, + { + "epoch": 0.1398310504904612, + "grad_norm": 12.154427601217643, + "learning_rate": 4.901450700775349e-05, + "loss": 2.7411, + "mean_token_accuracy": 0.4068965494632721, + "step": 138830 + }, + { + "epoch": 0.13983608654356539, + "grad_norm": 9.369549210602742, + "learning_rate": 4.9014397321169755e-05, + "loss": 2.4032, + "mean_token_accuracy": 0.4724137902259827, + "step": 138835 + }, + { + "epoch": 0.13984112259666956, + "grad_norm": 11.779058860201115, + "learning_rate": 4.901428762861893e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.44482758045196535, + "step": 138840 + }, + { + "epoch": 0.13984615864977373, + "grad_norm": 13.923338633110752, + "learning_rate": 4.901417793010104e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.4137930989265442, + "step": 138845 + }, + { + "epoch": 0.1398511947028779, + "grad_norm": 10.72745288436101, + "learning_rate": 4.901406822561611e-05, + "loss": 2.7415, + "mean_token_accuracy": 0.38457350730895995, + "step": 138850 + }, + { + "epoch": 0.13985623075598208, + "grad_norm": 9.111101787912652, + "learning_rate": 4.901395851516417e-05, + "loss": 2.1251, + "mean_token_accuracy": 0.46896552443504336, + "step": 138855 + }, + { + "epoch": 0.13986126680908625, + "grad_norm": 11.183073884929263, + "learning_rate": 4.901384879874526e-05, + "loss": 2.465, + "mean_token_accuracy": 0.37586206793785093, + "step": 138860 + }, + { + "epoch": 0.13986630286219043, + "grad_norm": 10.552658930336557, + "learning_rate": 4.90137390763594e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.42758620977401735, + "step": 138865 + }, + { + "epoch": 0.1398713389152946, + "grad_norm": 10.24059635533299, + "learning_rate": 4.901362934800662e-05, + "loss": 2.5229, + "mean_token_accuracy": 0.41034482717514037, + "step": 138870 + }, + { + "epoch": 0.13987637496839878, + "grad_norm": 18.121937558094515, + "learning_rate": 4.901351961368696e-05, + "loss": 3.0135, + "mean_token_accuracy": 0.39655172228813174, + "step": 138875 + }, + { + "epoch": 0.13988141102150295, + "grad_norm": 12.192624920447315, + "learning_rate": 4.901340987340044e-05, + "loss": 2.7795, + "mean_token_accuracy": 0.382758629322052, + "step": 138880 + }, + { + "epoch": 0.13988644707460712, + "grad_norm": 14.448744215219861, + "learning_rate": 4.9013300127147107e-05, + "loss": 2.4568, + "mean_token_accuracy": 0.3793103456497192, + "step": 138885 + }, + { + "epoch": 0.1398914831277113, + "grad_norm": 8.602885960596431, + "learning_rate": 4.901319037492697e-05, + "loss": 2.9684, + "mean_token_accuracy": 0.3965517282485962, + "step": 138890 + }, + { + "epoch": 0.13989651918081547, + "grad_norm": 14.829126514133309, + "learning_rate": 4.901308061674007e-05, + "loss": 2.7345, + "mean_token_accuracy": 0.39310344457626345, + "step": 138895 + }, + { + "epoch": 0.13990155523391964, + "grad_norm": 12.299415496261407, + "learning_rate": 4.901297085258644e-05, + "loss": 2.3936, + "mean_token_accuracy": 0.5000000119209289, + "step": 138900 + }, + { + "epoch": 0.1399065912870238, + "grad_norm": 9.759059418694848, + "learning_rate": 4.9012861082466104e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.38620689511299133, + "step": 138905 + }, + { + "epoch": 0.13991162734012796, + "grad_norm": 10.638240956294572, + "learning_rate": 4.9012751306379093e-05, + "loss": 2.4646, + "mean_token_accuracy": 0.4620689690113068, + "step": 138910 + }, + { + "epoch": 0.13991666339323214, + "grad_norm": 9.398956662430583, + "learning_rate": 4.901264152432545e-05, + "loss": 2.5795, + "mean_token_accuracy": 0.43103448748588563, + "step": 138915 + }, + { + "epoch": 0.1399216994463363, + "grad_norm": 12.626573288276347, + "learning_rate": 4.901253173630519e-05, + "loss": 2.4614, + "mean_token_accuracy": 0.46551724076271056, + "step": 138920 + }, + { + "epoch": 0.13992673549944049, + "grad_norm": 12.691360376781272, + "learning_rate": 4.901242194231835e-05, + "loss": 2.2126, + "mean_token_accuracy": 0.4724137902259827, + "step": 138925 + }, + { + "epoch": 0.13993177155254466, + "grad_norm": 8.703075945727914, + "learning_rate": 4.901231214236495e-05, + "loss": 2.3335, + "mean_token_accuracy": 0.4172413766384125, + "step": 138930 + }, + { + "epoch": 0.13993680760564883, + "grad_norm": 9.258698468487308, + "learning_rate": 4.901220233644505e-05, + "loss": 2.0679, + "mean_token_accuracy": 0.4931034445762634, + "step": 138935 + }, + { + "epoch": 0.139941843658753, + "grad_norm": 11.939088530866028, + "learning_rate": 4.9012092524558644e-05, + "loss": 2.1829, + "mean_token_accuracy": 0.467332124710083, + "step": 138940 + }, + { + "epoch": 0.13994687971185718, + "grad_norm": 9.44370795039286, + "learning_rate": 4.901198270670579e-05, + "loss": 1.9512, + "mean_token_accuracy": 0.47241379618644713, + "step": 138945 + }, + { + "epoch": 0.13995191576496135, + "grad_norm": 10.626942016479017, + "learning_rate": 4.90118728828865e-05, + "loss": 2.4886, + "mean_token_accuracy": 0.41724138259887694, + "step": 138950 + }, + { + "epoch": 0.13995695181806553, + "grad_norm": 11.54772783675832, + "learning_rate": 4.901176305310081e-05, + "loss": 2.2523, + "mean_token_accuracy": 0.4793103516101837, + "step": 138955 + }, + { + "epoch": 0.1399619878711697, + "grad_norm": 10.63693418340612, + "learning_rate": 4.901165321734875e-05, + "loss": 2.5565, + "mean_token_accuracy": 0.4551724135875702, + "step": 138960 + }, + { + "epoch": 0.13996702392427388, + "grad_norm": 10.503411659562344, + "learning_rate": 4.901154337563036e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.4344827592372894, + "step": 138965 + }, + { + "epoch": 0.13997205997737805, + "grad_norm": 11.670286438156529, + "learning_rate": 4.901143352794566e-05, + "loss": 2.4988, + "mean_token_accuracy": 0.4137930989265442, + "step": 138970 + }, + { + "epoch": 0.13997709603048222, + "grad_norm": 10.191622304656846, + "learning_rate": 4.9011323674294685e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.47586206197738645, + "step": 138975 + }, + { + "epoch": 0.1399821320835864, + "grad_norm": 9.512826429027188, + "learning_rate": 4.9011213814677466e-05, + "loss": 3.29, + "mean_token_accuracy": 0.35172413289546967, + "step": 138980 + }, + { + "epoch": 0.13998716813669057, + "grad_norm": 19.334430252992227, + "learning_rate": 4.901110394909403e-05, + "loss": 2.5925, + "mean_token_accuracy": 0.4586206942796707, + "step": 138985 + }, + { + "epoch": 0.13999220418979474, + "grad_norm": 8.090313246728336, + "learning_rate": 4.9010994077544415e-05, + "loss": 2.2273, + "mean_token_accuracy": 0.46551724076271056, + "step": 138990 + }, + { + "epoch": 0.13999724024289892, + "grad_norm": 10.2247295561546, + "learning_rate": 4.901088420002863e-05, + "loss": 2.4429, + "mean_token_accuracy": 0.4290381193161011, + "step": 138995 + }, + { + "epoch": 0.1400022762960031, + "grad_norm": 10.51347882502283, + "learning_rate": 4.901077431654673e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.4206896543502808, + "step": 139000 + }, + { + "epoch": 0.14000731234910727, + "grad_norm": 19.928081727414696, + "learning_rate": 4.901066442709874e-05, + "loss": 2.3488, + "mean_token_accuracy": 0.46787658929824827, + "step": 139005 + }, + { + "epoch": 0.14001234840221144, + "grad_norm": 13.073458969460921, + "learning_rate": 4.9010554531684684e-05, + "loss": 2.8096, + "mean_token_accuracy": 0.3827586114406586, + "step": 139010 + }, + { + "epoch": 0.1400173844553156, + "grad_norm": 13.611862322043228, + "learning_rate": 4.90104446303046e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.37586207389831544, + "step": 139015 + }, + { + "epoch": 0.1400224205084198, + "grad_norm": 9.21132323967406, + "learning_rate": 4.90103347229585e-05, + "loss": 2.1327, + "mean_token_accuracy": 0.4862069010734558, + "step": 139020 + }, + { + "epoch": 0.14002745656152396, + "grad_norm": 7.535234153586982, + "learning_rate": 4.901022480964644e-05, + "loss": 2.2996, + "mean_token_accuracy": 0.4712038815021515, + "step": 139025 + }, + { + "epoch": 0.14003249261462813, + "grad_norm": 15.50829161788053, + "learning_rate": 4.901011489036844e-05, + "loss": 2.6368, + "mean_token_accuracy": 0.35862069129943847, + "step": 139030 + }, + { + "epoch": 0.1400375286677323, + "grad_norm": 15.411876217041295, + "learning_rate": 4.901000496512452e-05, + "loss": 2.4964, + "mean_token_accuracy": 0.43103448748588563, + "step": 139035 + }, + { + "epoch": 0.14004256472083648, + "grad_norm": 9.1592915126872, + "learning_rate": 4.9009895033914724e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.4172413766384125, + "step": 139040 + }, + { + "epoch": 0.14004760077394063, + "grad_norm": 14.251832508038001, + "learning_rate": 4.900978509673908e-05, + "loss": 2.7483, + "mean_token_accuracy": 0.4172413796186447, + "step": 139045 + }, + { + "epoch": 0.1400526368270448, + "grad_norm": 10.521493549543232, + "learning_rate": 4.9009675153597625e-05, + "loss": 2.3302, + "mean_token_accuracy": 0.4103448212146759, + "step": 139050 + }, + { + "epoch": 0.14005767288014898, + "grad_norm": 9.139566987242326, + "learning_rate": 4.900956520449038e-05, + "loss": 2.3065, + "mean_token_accuracy": 0.4467634618282318, + "step": 139055 + }, + { + "epoch": 0.14006270893325315, + "grad_norm": 9.848211533087522, + "learning_rate": 4.9009455249417365e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.43448275327682495, + "step": 139060 + }, + { + "epoch": 0.14006774498635732, + "grad_norm": 9.027539234516688, + "learning_rate": 4.900934528837862e-05, + "loss": 2.2721, + "mean_token_accuracy": 0.44349666833877566, + "step": 139065 + }, + { + "epoch": 0.1400727810394615, + "grad_norm": 12.458266542839068, + "learning_rate": 4.9009235321374194e-05, + "loss": 2.9422, + "mean_token_accuracy": 0.3758620619773865, + "step": 139070 + }, + { + "epoch": 0.14007781709256567, + "grad_norm": 9.372321142066122, + "learning_rate": 4.900912534840409e-05, + "loss": 2.5158, + "mean_token_accuracy": 0.41379310488700866, + "step": 139075 + }, + { + "epoch": 0.14008285314566984, + "grad_norm": 10.2786625992754, + "learning_rate": 4.900901536946835e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.38620689511299133, + "step": 139080 + }, + { + "epoch": 0.14008788919877402, + "grad_norm": 8.855288560786711, + "learning_rate": 4.9008905384567e-05, + "loss": 1.855, + "mean_token_accuracy": 0.4965517222881317, + "step": 139085 + }, + { + "epoch": 0.1400929252518782, + "grad_norm": 12.105421089854316, + "learning_rate": 4.900879539370009e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.4206896543502808, + "step": 139090 + }, + { + "epoch": 0.14009796130498237, + "grad_norm": 10.585312496330332, + "learning_rate": 4.9008685396867634e-05, + "loss": 2.0317, + "mean_token_accuracy": 0.5206896603107453, + "step": 139095 + }, + { + "epoch": 0.14010299735808654, + "grad_norm": 8.981563536695814, + "learning_rate": 4.900857539406965e-05, + "loss": 2.2813, + "mean_token_accuracy": 0.4586206912994385, + "step": 139100 + }, + { + "epoch": 0.1401080334111907, + "grad_norm": 13.201874204328215, + "learning_rate": 4.900846538530619e-05, + "loss": 2.3025, + "mean_token_accuracy": 0.415366005897522, + "step": 139105 + }, + { + "epoch": 0.1401130694642949, + "grad_norm": 9.933358970549161, + "learning_rate": 4.9008355370577274e-05, + "loss": 2.1694, + "mean_token_accuracy": 0.4517241358757019, + "step": 139110 + }, + { + "epoch": 0.14011810551739906, + "grad_norm": 12.062571233864, + "learning_rate": 4.9008245349882937e-05, + "loss": 2.6266, + "mean_token_accuracy": 0.3793103486299515, + "step": 139115 + }, + { + "epoch": 0.14012314157050323, + "grad_norm": 10.41651913250454, + "learning_rate": 4.900813532322322e-05, + "loss": 2.0123, + "mean_token_accuracy": 0.43448275327682495, + "step": 139120 + }, + { + "epoch": 0.1401281776236074, + "grad_norm": 16.591361418564762, + "learning_rate": 4.900802529059812e-05, + "loss": 2.1933, + "mean_token_accuracy": 0.48275862336158754, + "step": 139125 + }, + { + "epoch": 0.14013321367671158, + "grad_norm": 12.727940082014657, + "learning_rate": 4.90079152520077e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.42758620381355283, + "step": 139130 + }, + { + "epoch": 0.14013824972981576, + "grad_norm": 9.073764428447051, + "learning_rate": 4.900780520745198e-05, + "loss": 2.1091, + "mean_token_accuracy": 0.4517241358757019, + "step": 139135 + }, + { + "epoch": 0.14014328578291993, + "grad_norm": 12.034877253069277, + "learning_rate": 4.900769515693099e-05, + "loss": 2.09, + "mean_token_accuracy": 0.5241379201412201, + "step": 139140 + }, + { + "epoch": 0.1401483218360241, + "grad_norm": 9.312586739437604, + "learning_rate": 4.900758510044476e-05, + "loss": 2.1223, + "mean_token_accuracy": 0.4689655065536499, + "step": 139145 + }, + { + "epoch": 0.14015335788912828, + "grad_norm": 16.291483471752958, + "learning_rate": 4.900747503799332e-05, + "loss": 1.8691, + "mean_token_accuracy": 0.5263762891292572, + "step": 139150 + }, + { + "epoch": 0.14015839394223245, + "grad_norm": 11.517378736431523, + "learning_rate": 4.90073649695767e-05, + "loss": 2.3893, + "mean_token_accuracy": 0.4, + "step": 139155 + }, + { + "epoch": 0.14016342999533662, + "grad_norm": 12.058350532068006, + "learning_rate": 4.900725489519493e-05, + "loss": 2.3427, + "mean_token_accuracy": 0.4724137902259827, + "step": 139160 + }, + { + "epoch": 0.1401684660484408, + "grad_norm": 12.247367859836316, + "learning_rate": 4.900714481484805e-05, + "loss": 2.4289, + "mean_token_accuracy": 0.42068964540958403, + "step": 139165 + }, + { + "epoch": 0.14017350210154497, + "grad_norm": 11.664116711394353, + "learning_rate": 4.900703472853608e-05, + "loss": 2.0739, + "mean_token_accuracy": 0.46551724076271056, + "step": 139170 + }, + { + "epoch": 0.14017853815464915, + "grad_norm": 8.84188175726541, + "learning_rate": 4.9006924636259057e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.45517241954803467, + "step": 139175 + }, + { + "epoch": 0.14018357420775332, + "grad_norm": 11.66205435050764, + "learning_rate": 4.9006814538016996e-05, + "loss": 2.42, + "mean_token_accuracy": 0.4206896543502808, + "step": 139180 + }, + { + "epoch": 0.14018861026085747, + "grad_norm": 15.69878509604065, + "learning_rate": 4.900670443380995e-05, + "loss": 2.3147, + "mean_token_accuracy": 0.41034482717514037, + "step": 139185 + }, + { + "epoch": 0.14019364631396164, + "grad_norm": 10.017662401771489, + "learning_rate": 4.9006594323637936e-05, + "loss": 2.1387, + "mean_token_accuracy": 0.4655172348022461, + "step": 139190 + }, + { + "epoch": 0.1401986823670658, + "grad_norm": 10.124190116692123, + "learning_rate": 4.900648420750099e-05, + "loss": 2.7362, + "mean_token_accuracy": 0.4103448331356049, + "step": 139195 + }, + { + "epoch": 0.14020371842017, + "grad_norm": 9.662283841152613, + "learning_rate": 4.9006374085399144e-05, + "loss": 2.2973, + "mean_token_accuracy": 0.4344827651977539, + "step": 139200 + }, + { + "epoch": 0.14020875447327416, + "grad_norm": 11.179549673853787, + "learning_rate": 4.9006263957332416e-05, + "loss": 2.4211, + "mean_token_accuracy": 0.42413792610168455, + "step": 139205 + }, + { + "epoch": 0.14021379052637833, + "grad_norm": 10.283234753001329, + "learning_rate": 4.9006153823300856e-05, + "loss": 2.391, + "mean_token_accuracy": 0.42758620977401735, + "step": 139210 + }, + { + "epoch": 0.1402188265794825, + "grad_norm": 11.171105789881556, + "learning_rate": 4.9006043683304476e-05, + "loss": 2.4757, + "mean_token_accuracy": 0.42413792610168455, + "step": 139215 + }, + { + "epoch": 0.14022386263258668, + "grad_norm": 8.097784619175115, + "learning_rate": 4.900593353734332e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.4448275864124298, + "step": 139220 + }, + { + "epoch": 0.14022889868569086, + "grad_norm": 11.955975766076095, + "learning_rate": 4.9005823385417407e-05, + "loss": 2.5614, + "mean_token_accuracy": 0.41724138259887694, + "step": 139225 + }, + { + "epoch": 0.14023393473879503, + "grad_norm": 10.223153127774294, + "learning_rate": 4.900571322752677e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.4241379380226135, + "step": 139230 + }, + { + "epoch": 0.1402389707918992, + "grad_norm": 11.52589749687505, + "learning_rate": 4.900560306367145e-05, + "loss": 2.2384, + "mean_token_accuracy": 0.4517241358757019, + "step": 139235 + }, + { + "epoch": 0.14024400684500338, + "grad_norm": 10.124743661857542, + "learning_rate": 4.9005492893851475e-05, + "loss": 2.3875, + "mean_token_accuracy": 0.43793103098869324, + "step": 139240 + }, + { + "epoch": 0.14024904289810755, + "grad_norm": 10.44462942717473, + "learning_rate": 4.900538271806687e-05, + "loss": 2.3663, + "mean_token_accuracy": 0.42758620977401735, + "step": 139245 + }, + { + "epoch": 0.14025407895121172, + "grad_norm": 9.322598172616475, + "learning_rate": 4.900527253631766e-05, + "loss": 2.16, + "mean_token_accuracy": 0.47586206793785096, + "step": 139250 + }, + { + "epoch": 0.1402591150043159, + "grad_norm": 9.584306030148369, + "learning_rate": 4.900516234860388e-05, + "loss": 2.3789, + "mean_token_accuracy": 0.46442831158638, + "step": 139255 + }, + { + "epoch": 0.14026415105742007, + "grad_norm": 9.659192377417122, + "learning_rate": 4.900505215492558e-05, + "loss": 2.8023, + "mean_token_accuracy": 0.3965517282485962, + "step": 139260 + }, + { + "epoch": 0.14026918711052425, + "grad_norm": 12.828217970272565, + "learning_rate": 4.9004941955282756e-05, + "loss": 2.9303, + "mean_token_accuracy": 0.3241379290819168, + "step": 139265 + }, + { + "epoch": 0.14027422316362842, + "grad_norm": 12.259214896672813, + "learning_rate": 4.900483174967546e-05, + "loss": 2.6622, + "mean_token_accuracy": 0.4137930989265442, + "step": 139270 + }, + { + "epoch": 0.1402792592167326, + "grad_norm": 11.350557603293797, + "learning_rate": 4.900472153810372e-05, + "loss": 1.9712, + "mean_token_accuracy": 0.48965516686439514, + "step": 139275 + }, + { + "epoch": 0.14028429526983677, + "grad_norm": 10.581251705602003, + "learning_rate": 4.9004611320567566e-05, + "loss": 2.017, + "mean_token_accuracy": 0.47586206793785096, + "step": 139280 + }, + { + "epoch": 0.14028933132294094, + "grad_norm": 10.82034167271544, + "learning_rate": 4.900450109706703e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.4068965494632721, + "step": 139285 + }, + { + "epoch": 0.14029436737604511, + "grad_norm": 13.203422437323873, + "learning_rate": 4.900439086760214e-05, + "loss": 2.1883, + "mean_token_accuracy": 0.4689655125141144, + "step": 139290 + }, + { + "epoch": 0.1402994034291493, + "grad_norm": 9.205727895805992, + "learning_rate": 4.900428063217293e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.45172414779663084, + "step": 139295 + }, + { + "epoch": 0.14030443948225346, + "grad_norm": 19.21318897273395, + "learning_rate": 4.900417039077942e-05, + "loss": 2.9463, + "mean_token_accuracy": 0.3344827562570572, + "step": 139300 + }, + { + "epoch": 0.14030947553535764, + "grad_norm": 16.47565831909942, + "learning_rate": 4.9004060143421656e-05, + "loss": 2.9311, + "mean_token_accuracy": 0.3896551698446274, + "step": 139305 + }, + { + "epoch": 0.1403145115884618, + "grad_norm": 9.05375693288172, + "learning_rate": 4.9003949890099654e-05, + "loss": 2.4132, + "mean_token_accuracy": 0.4156079888343811, + "step": 139310 + }, + { + "epoch": 0.14031954764156598, + "grad_norm": 11.980634945575275, + "learning_rate": 4.9003839630813456e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4517241418361664, + "step": 139315 + }, + { + "epoch": 0.14032458369467013, + "grad_norm": 9.041239592318718, + "learning_rate": 4.900372936556308e-05, + "loss": 2.5154, + "mean_token_accuracy": 0.41034482717514037, + "step": 139320 + }, + { + "epoch": 0.1403296197477743, + "grad_norm": 10.722671620746567, + "learning_rate": 4.900361909434857e-05, + "loss": 2.5416, + "mean_token_accuracy": 0.39655172228813174, + "step": 139325 + }, + { + "epoch": 0.14033465580087848, + "grad_norm": 9.231609839942674, + "learning_rate": 4.9003508817169955e-05, + "loss": 2.1098, + "mean_token_accuracy": 0.4206896543502808, + "step": 139330 + }, + { + "epoch": 0.14033969185398265, + "grad_norm": 10.24275957869904, + "learning_rate": 4.9003398534027257e-05, + "loss": 2.2875, + "mean_token_accuracy": 0.441379314661026, + "step": 139335 + }, + { + "epoch": 0.14034472790708682, + "grad_norm": 11.761517503972227, + "learning_rate": 4.900328824492051e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.4482758641242981, + "step": 139340 + }, + { + "epoch": 0.140349763960191, + "grad_norm": 11.761358809799896, + "learning_rate": 4.9003177949849746e-05, + "loss": 2.6772, + "mean_token_accuracy": 0.4000000059604645, + "step": 139345 + }, + { + "epoch": 0.14035480001329517, + "grad_norm": 10.137150080451988, + "learning_rate": 4.9003067648814994e-05, + "loss": 2.6645, + "mean_token_accuracy": 0.40865094065666197, + "step": 139350 + }, + { + "epoch": 0.14035983606639935, + "grad_norm": 9.896264781417404, + "learning_rate": 4.900295734181629e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.44827587008476255, + "step": 139355 + }, + { + "epoch": 0.14036487211950352, + "grad_norm": 8.832964560311275, + "learning_rate": 4.900284702885366e-05, + "loss": 2.2254, + "mean_token_accuracy": 0.4812462151050568, + "step": 139360 + }, + { + "epoch": 0.1403699081726077, + "grad_norm": 11.209053106571305, + "learning_rate": 4.9002736709927135e-05, + "loss": 2.2229, + "mean_token_accuracy": 0.45862069725990295, + "step": 139365 + }, + { + "epoch": 0.14037494422571187, + "grad_norm": 8.675804603529198, + "learning_rate": 4.9002626385036744e-05, + "loss": 2.2057, + "mean_token_accuracy": 0.47931034564971925, + "step": 139370 + }, + { + "epoch": 0.14037998027881604, + "grad_norm": 9.656935528038789, + "learning_rate": 4.900251605418252e-05, + "loss": 2.5236, + "mean_token_accuracy": 0.43103448748588563, + "step": 139375 + }, + { + "epoch": 0.14038501633192021, + "grad_norm": 10.335045491477471, + "learning_rate": 4.900240571736449e-05, + "loss": 2.42, + "mean_token_accuracy": 0.3793103456497192, + "step": 139380 + }, + { + "epoch": 0.1403900523850244, + "grad_norm": 10.545376150839555, + "learning_rate": 4.900229537458269e-05, + "loss": 2.7233, + "mean_token_accuracy": 0.4103448212146759, + "step": 139385 + }, + { + "epoch": 0.14039508843812856, + "grad_norm": 10.054170697550973, + "learning_rate": 4.900218502583715e-05, + "loss": 2.0616, + "mean_token_accuracy": 0.46551724076271056, + "step": 139390 + }, + { + "epoch": 0.14040012449123274, + "grad_norm": 9.992570489486635, + "learning_rate": 4.90020746711279e-05, + "loss": 2.295, + "mean_token_accuracy": 0.45517241954803467, + "step": 139395 + }, + { + "epoch": 0.1404051605443369, + "grad_norm": 12.087619958634065, + "learning_rate": 4.900196431045497e-05, + "loss": 2.5257, + "mean_token_accuracy": 0.41379310488700866, + "step": 139400 + }, + { + "epoch": 0.14041019659744108, + "grad_norm": 11.297483629008509, + "learning_rate": 4.900185394381839e-05, + "loss": 2.7386, + "mean_token_accuracy": 0.3862069010734558, + "step": 139405 + }, + { + "epoch": 0.14041523265054526, + "grad_norm": 12.593812614576546, + "learning_rate": 4.900174357121818e-05, + "loss": 2.4246, + "mean_token_accuracy": 0.37586206793785093, + "step": 139410 + }, + { + "epoch": 0.14042026870364943, + "grad_norm": 8.909897318422171, + "learning_rate": 4.900163319265439e-05, + "loss": 2.4036, + "mean_token_accuracy": 0.4398064136505127, + "step": 139415 + }, + { + "epoch": 0.1404253047567536, + "grad_norm": 10.338116893786037, + "learning_rate": 4.9001522808127045e-05, + "loss": 2.3046, + "mean_token_accuracy": 0.48275861144065857, + "step": 139420 + }, + { + "epoch": 0.14043034080985778, + "grad_norm": 10.942972624665288, + "learning_rate": 4.900141241763617e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.43103447556495667, + "step": 139425 + }, + { + "epoch": 0.14043537686296195, + "grad_norm": 11.438862790844516, + "learning_rate": 4.9001302021181794e-05, + "loss": 2.258, + "mean_token_accuracy": 0.40514216423034666, + "step": 139430 + }, + { + "epoch": 0.14044041291606613, + "grad_norm": 10.664454673071734, + "learning_rate": 4.9001191618763954e-05, + "loss": 2.1797, + "mean_token_accuracy": 0.4984271049499512, + "step": 139435 + }, + { + "epoch": 0.1404454489691703, + "grad_norm": 9.828506200140152, + "learning_rate": 4.9001081210382685e-05, + "loss": 2.0698, + "mean_token_accuracy": 0.4396854221820831, + "step": 139440 + }, + { + "epoch": 0.14045048502227447, + "grad_norm": 11.173908697940576, + "learning_rate": 4.9000970796038e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.4517241299152374, + "step": 139445 + }, + { + "epoch": 0.14045552107537865, + "grad_norm": 11.476234922586118, + "learning_rate": 4.900086037572995e-05, + "loss": 2.4118, + "mean_token_accuracy": 0.4344827651977539, + "step": 139450 + }, + { + "epoch": 0.14046055712848282, + "grad_norm": 10.761930333336347, + "learning_rate": 4.9000749949458544e-05, + "loss": 2.5639, + "mean_token_accuracy": 0.4310344815254211, + "step": 139455 + }, + { + "epoch": 0.14046559318158697, + "grad_norm": 12.552787006342987, + "learning_rate": 4.900063951722384e-05, + "loss": 2.6124, + "mean_token_accuracy": 0.4344827592372894, + "step": 139460 + }, + { + "epoch": 0.14047062923469114, + "grad_norm": 10.376070024977015, + "learning_rate": 4.9000529079025835e-05, + "loss": 2.3629, + "mean_token_accuracy": 0.4793103516101837, + "step": 139465 + }, + { + "epoch": 0.14047566528779531, + "grad_norm": 11.142843012201803, + "learning_rate": 4.900041863486459e-05, + "loss": 1.9052, + "mean_token_accuracy": 0.5137930929660797, + "step": 139470 + }, + { + "epoch": 0.1404807013408995, + "grad_norm": 14.60091052510591, + "learning_rate": 4.900030818474012e-05, + "loss": 2.979, + "mean_token_accuracy": 0.4172413796186447, + "step": 139475 + }, + { + "epoch": 0.14048573739400366, + "grad_norm": 12.897988935225726, + "learning_rate": 4.900019772865247e-05, + "loss": 2.7057, + "mean_token_accuracy": 0.3862068921327591, + "step": 139480 + }, + { + "epoch": 0.14049077344710784, + "grad_norm": 10.324823458658424, + "learning_rate": 4.9000087266601645e-05, + "loss": 2.502, + "mean_token_accuracy": 0.4068965494632721, + "step": 139485 + }, + { + "epoch": 0.140495809500212, + "grad_norm": 10.953638221385665, + "learning_rate": 4.899997679858769e-05, + "loss": 2.27, + "mean_token_accuracy": 0.4275862157344818, + "step": 139490 + }, + { + "epoch": 0.14050084555331618, + "grad_norm": 11.062164752672775, + "learning_rate": 4.899986632461065e-05, + "loss": 2.0976, + "mean_token_accuracy": 0.4758620738983154, + "step": 139495 + }, + { + "epoch": 0.14050588160642036, + "grad_norm": 10.402952569081437, + "learning_rate": 4.899975584467053e-05, + "loss": 2.6015, + "mean_token_accuracy": 0.4206896543502808, + "step": 139500 + }, + { + "epoch": 0.14051091765952453, + "grad_norm": 9.00585432002654, + "learning_rate": 4.8999645358767385e-05, + "loss": 2.0235, + "mean_token_accuracy": 0.4931034505367279, + "step": 139505 + }, + { + "epoch": 0.1405159537126287, + "grad_norm": 11.48408208839091, + "learning_rate": 4.899953486690123e-05, + "loss": 2.6746, + "mean_token_accuracy": 0.3931034475564957, + "step": 139510 + }, + { + "epoch": 0.14052098976573288, + "grad_norm": 8.903935284368469, + "learning_rate": 4.899942436907209e-05, + "loss": 2.4534, + "mean_token_accuracy": 0.43793103098869324, + "step": 139515 + }, + { + "epoch": 0.14052602581883705, + "grad_norm": 10.829057253173971, + "learning_rate": 4.899931386528001e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.4862068951129913, + "step": 139520 + }, + { + "epoch": 0.14053106187194123, + "grad_norm": 8.982386851741925, + "learning_rate": 4.899920335552501e-05, + "loss": 2.0607, + "mean_token_accuracy": 0.4862069010734558, + "step": 139525 + }, + { + "epoch": 0.1405360979250454, + "grad_norm": 10.897197379718845, + "learning_rate": 4.8999092839807134e-05, + "loss": 2.345, + "mean_token_accuracy": 0.4517241358757019, + "step": 139530 + }, + { + "epoch": 0.14054113397814957, + "grad_norm": 11.031823700677897, + "learning_rate": 4.89989823181264e-05, + "loss": 2.2833, + "mean_token_accuracy": 0.45366001725196836, + "step": 139535 + }, + { + "epoch": 0.14054617003125375, + "grad_norm": 12.70552698639148, + "learning_rate": 4.8998871790482836e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.4137930989265442, + "step": 139540 + }, + { + "epoch": 0.14055120608435792, + "grad_norm": 9.040965860315774, + "learning_rate": 4.899876125687649e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.4517241358757019, + "step": 139545 + }, + { + "epoch": 0.1405562421374621, + "grad_norm": 9.875195562904292, + "learning_rate": 4.8998650717307384e-05, + "loss": 2.4734, + "mean_token_accuracy": 0.4206896543502808, + "step": 139550 + }, + { + "epoch": 0.14056127819056627, + "grad_norm": 10.370181418695568, + "learning_rate": 4.8998540171775535e-05, + "loss": 2.2798, + "mean_token_accuracy": 0.44827585816383364, + "step": 139555 + }, + { + "epoch": 0.14056631424367044, + "grad_norm": 9.325861809152086, + "learning_rate": 4.8998429620280995e-05, + "loss": 2.1956, + "mean_token_accuracy": 0.4724137902259827, + "step": 139560 + }, + { + "epoch": 0.14057135029677462, + "grad_norm": 9.799924933096444, + "learning_rate": 4.899831906282378e-05, + "loss": 2.6542, + "mean_token_accuracy": 0.4068965554237366, + "step": 139565 + }, + { + "epoch": 0.1405763863498788, + "grad_norm": 10.422712060934366, + "learning_rate": 4.8998208499403934e-05, + "loss": 2.3175, + "mean_token_accuracy": 0.4689655125141144, + "step": 139570 + }, + { + "epoch": 0.14058142240298296, + "grad_norm": 9.22770587468454, + "learning_rate": 4.8998097930021474e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.4206896543502808, + "step": 139575 + }, + { + "epoch": 0.14058645845608714, + "grad_norm": 9.77450462307289, + "learning_rate": 4.899798735467643e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4, + "step": 139580 + }, + { + "epoch": 0.1405914945091913, + "grad_norm": 14.29418619099334, + "learning_rate": 4.899787677336886e-05, + "loss": 2.6012, + "mean_token_accuracy": 0.4344827592372894, + "step": 139585 + }, + { + "epoch": 0.14059653056229549, + "grad_norm": 10.946760185547165, + "learning_rate": 4.899776618609876e-05, + "loss": 2.751, + "mean_token_accuracy": 0.44137930274009707, + "step": 139590 + }, + { + "epoch": 0.14060156661539966, + "grad_norm": 9.114994358870264, + "learning_rate": 4.899765559286617e-05, + "loss": 2.3317, + "mean_token_accuracy": 0.4517241418361664, + "step": 139595 + }, + { + "epoch": 0.1406066026685038, + "grad_norm": 10.06857655927627, + "learning_rate": 4.899754499367113e-05, + "loss": 2.4197, + "mean_token_accuracy": 0.4430127084255219, + "step": 139600 + }, + { + "epoch": 0.14061163872160798, + "grad_norm": 12.347033308199482, + "learning_rate": 4.899743438851366e-05, + "loss": 2.2558, + "mean_token_accuracy": 0.4413793087005615, + "step": 139605 + }, + { + "epoch": 0.14061667477471215, + "grad_norm": 12.026445723120974, + "learning_rate": 4.89973237773938e-05, + "loss": 2.2505, + "mean_token_accuracy": 0.4413793087005615, + "step": 139610 + }, + { + "epoch": 0.14062171082781633, + "grad_norm": 9.958994714436637, + "learning_rate": 4.8997213160311585e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.4620689570903778, + "step": 139615 + }, + { + "epoch": 0.1406267468809205, + "grad_norm": 8.968294497628191, + "learning_rate": 4.8997102537267026e-05, + "loss": 2.2862, + "mean_token_accuracy": 0.4103448331356049, + "step": 139620 + }, + { + "epoch": 0.14063178293402467, + "grad_norm": 10.250974065133866, + "learning_rate": 4.899699190826017e-05, + "loss": 2.1489, + "mean_token_accuracy": 0.44827585220336913, + "step": 139625 + }, + { + "epoch": 0.14063681898712885, + "grad_norm": 11.022246682922955, + "learning_rate": 4.899688127329105e-05, + "loss": 2.4209, + "mean_token_accuracy": 0.4620689690113068, + "step": 139630 + }, + { + "epoch": 0.14064185504023302, + "grad_norm": 11.751815057017549, + "learning_rate": 4.899677063235968e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.4103448331356049, + "step": 139635 + }, + { + "epoch": 0.1406468910933372, + "grad_norm": 9.93093762813725, + "learning_rate": 4.89966599854661e-05, + "loss": 2.3763, + "mean_token_accuracy": 0.46551724672317507, + "step": 139640 + }, + { + "epoch": 0.14065192714644137, + "grad_norm": 12.325785950215145, + "learning_rate": 4.899654933261035e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.4103448212146759, + "step": 139645 + }, + { + "epoch": 0.14065696319954554, + "grad_norm": 10.450418428944218, + "learning_rate": 4.8996438673792444e-05, + "loss": 2.3352, + "mean_token_accuracy": 0.42068964838981626, + "step": 139650 + }, + { + "epoch": 0.14066199925264972, + "grad_norm": 11.979308607118154, + "learning_rate": 4.8996328009012424e-05, + "loss": 2.2246, + "mean_token_accuracy": 0.4655172348022461, + "step": 139655 + }, + { + "epoch": 0.1406670353057539, + "grad_norm": 9.971127001869693, + "learning_rate": 4.899621733827032e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.39655171930789945, + "step": 139660 + }, + { + "epoch": 0.14067207135885806, + "grad_norm": 10.396666104581882, + "learning_rate": 4.899610666156615e-05, + "loss": 2.1845, + "mean_token_accuracy": 0.4689655125141144, + "step": 139665 + }, + { + "epoch": 0.14067710741196224, + "grad_norm": 9.985233472204856, + "learning_rate": 4.899599597889997e-05, + "loss": 2.1423, + "mean_token_accuracy": 0.4862068951129913, + "step": 139670 + }, + { + "epoch": 0.1406821434650664, + "grad_norm": 12.209251818893916, + "learning_rate": 4.899588529027178e-05, + "loss": 2.2087, + "mean_token_accuracy": 0.43793103098869324, + "step": 139675 + }, + { + "epoch": 0.14068717951817059, + "grad_norm": 10.066478584763802, + "learning_rate": 4.899577459568164e-05, + "loss": 2.1785, + "mean_token_accuracy": 0.45015124082565305, + "step": 139680 + }, + { + "epoch": 0.14069221557127476, + "grad_norm": 14.365214932951366, + "learning_rate": 4.899566389512956e-05, + "loss": 2.0712, + "mean_token_accuracy": 0.4896551728248596, + "step": 139685 + }, + { + "epoch": 0.14069725162437893, + "grad_norm": 12.85979836097937, + "learning_rate": 4.899555318861557e-05, + "loss": 1.9809, + "mean_token_accuracy": 0.4724137902259827, + "step": 139690 + }, + { + "epoch": 0.1407022876774831, + "grad_norm": 11.075975719662246, + "learning_rate": 4.8995442476139714e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.44827585816383364, + "step": 139695 + }, + { + "epoch": 0.14070732373058728, + "grad_norm": 9.748092925762462, + "learning_rate": 4.899533175770202e-05, + "loss": 2.6427, + "mean_token_accuracy": 0.38275861740112305, + "step": 139700 + }, + { + "epoch": 0.14071235978369145, + "grad_norm": 10.502854651784915, + "learning_rate": 4.8995221033302515e-05, + "loss": 2.3565, + "mean_token_accuracy": 0.43103448748588563, + "step": 139705 + }, + { + "epoch": 0.14071739583679563, + "grad_norm": 19.017710038931856, + "learning_rate": 4.899511030294123e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.458620685338974, + "step": 139710 + }, + { + "epoch": 0.1407224318898998, + "grad_norm": 9.037820447294438, + "learning_rate": 4.8994999566618196e-05, + "loss": 2.1866, + "mean_token_accuracy": 0.4965517222881317, + "step": 139715 + }, + { + "epoch": 0.14072746794300398, + "grad_norm": 9.972292385846975, + "learning_rate": 4.899488882433344e-05, + "loss": 2.193, + "mean_token_accuracy": 0.4517241418361664, + "step": 139720 + }, + { + "epoch": 0.14073250399610815, + "grad_norm": 11.05430110814426, + "learning_rate": 4.899477807608699e-05, + "loss": 2.7626, + "mean_token_accuracy": 0.36896551847457887, + "step": 139725 + }, + { + "epoch": 0.14073754004921232, + "grad_norm": 10.639225865633867, + "learning_rate": 4.89946673218789e-05, + "loss": 2.323, + "mean_token_accuracy": 0.45862069725990295, + "step": 139730 + }, + { + "epoch": 0.1407425761023165, + "grad_norm": 8.9008135600385, + "learning_rate": 4.8994556561709174e-05, + "loss": 2.833, + "mean_token_accuracy": 0.3965517282485962, + "step": 139735 + }, + { + "epoch": 0.14074761215542064, + "grad_norm": 11.262624325821289, + "learning_rate": 4.8994445795577853e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.42068966031074523, + "step": 139740 + }, + { + "epoch": 0.14075264820852482, + "grad_norm": 8.759702841218077, + "learning_rate": 4.899433502348497e-05, + "loss": 2.3394, + "mean_token_accuracy": 0.43103447556495667, + "step": 139745 + }, + { + "epoch": 0.140757684261629, + "grad_norm": 9.990617824041623, + "learning_rate": 4.899422424543055e-05, + "loss": 2.0778, + "mean_token_accuracy": 0.4676345944404602, + "step": 139750 + }, + { + "epoch": 0.14076272031473316, + "grad_norm": 11.990691204399099, + "learning_rate": 4.899411346141463e-05, + "loss": 2.411, + "mean_token_accuracy": 0.441379314661026, + "step": 139755 + }, + { + "epoch": 0.14076775636783734, + "grad_norm": 8.14239853760547, + "learning_rate": 4.899400267143724e-05, + "loss": 2.2878, + "mean_token_accuracy": 0.5000000059604645, + "step": 139760 + }, + { + "epoch": 0.1407727924209415, + "grad_norm": 7.304990894467067, + "learning_rate": 4.89938918754984e-05, + "loss": 2.3353, + "mean_token_accuracy": 0.4560344874858856, + "step": 139765 + }, + { + "epoch": 0.14077782847404569, + "grad_norm": 9.711098969676604, + "learning_rate": 4.899378107359815e-05, + "loss": 2.4662, + "mean_token_accuracy": 0.47931033968925474, + "step": 139770 + }, + { + "epoch": 0.14078286452714986, + "grad_norm": 11.452736443222044, + "learning_rate": 4.8993670265736526e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.41923774480819703, + "step": 139775 + }, + { + "epoch": 0.14078790058025403, + "grad_norm": 11.65883598095452, + "learning_rate": 4.8993559451913544e-05, + "loss": 2.6439, + "mean_token_accuracy": 0.417241370677948, + "step": 139780 + }, + { + "epoch": 0.1407929366333582, + "grad_norm": 8.965376572137226, + "learning_rate": 4.899344863212925e-05, + "loss": 2.1841, + "mean_token_accuracy": 0.47586206793785096, + "step": 139785 + }, + { + "epoch": 0.14079797268646238, + "grad_norm": 10.979349000585401, + "learning_rate": 4.8993337806383665e-05, + "loss": 2.337, + "mean_token_accuracy": 0.47586206793785096, + "step": 139790 + }, + { + "epoch": 0.14080300873956655, + "grad_norm": 10.040133198237875, + "learning_rate": 4.899322697467682e-05, + "loss": 2.5308, + "mean_token_accuracy": 0.4034482717514038, + "step": 139795 + }, + { + "epoch": 0.14080804479267073, + "grad_norm": 12.413311189740595, + "learning_rate": 4.899311613700875e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.36551723480224607, + "step": 139800 + }, + { + "epoch": 0.1408130808457749, + "grad_norm": 9.630840695022773, + "learning_rate": 4.8993005293379494e-05, + "loss": 2.7143, + "mean_token_accuracy": 0.4448275864124298, + "step": 139805 + }, + { + "epoch": 0.14081811689887908, + "grad_norm": 11.555177692253574, + "learning_rate": 4.899289444378906e-05, + "loss": 2.3341, + "mean_token_accuracy": 0.3862068891525269, + "step": 139810 + }, + { + "epoch": 0.14082315295198325, + "grad_norm": 9.979713772713549, + "learning_rate": 4.8992783588237504e-05, + "loss": 2.3791, + "mean_token_accuracy": 0.39310344457626345, + "step": 139815 + }, + { + "epoch": 0.14082818900508742, + "grad_norm": 9.296548387463963, + "learning_rate": 4.899267272672483e-05, + "loss": 2.7434, + "mean_token_accuracy": 0.4068965494632721, + "step": 139820 + }, + { + "epoch": 0.1408332250581916, + "grad_norm": 10.655820474670318, + "learning_rate": 4.899256185925109e-05, + "loss": 2.3232, + "mean_token_accuracy": 0.45862069725990295, + "step": 139825 + }, + { + "epoch": 0.14083826111129577, + "grad_norm": 13.862068093895052, + "learning_rate": 4.8992450985816306e-05, + "loss": 2.8521, + "mean_token_accuracy": 0.3793103516101837, + "step": 139830 + }, + { + "epoch": 0.14084329716439994, + "grad_norm": 8.981484648042969, + "learning_rate": 4.8992340106420506e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.49999999403953554, + "step": 139835 + }, + { + "epoch": 0.14084833321750412, + "grad_norm": 10.667547079564688, + "learning_rate": 4.8992229221063735e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.3862069010734558, + "step": 139840 + }, + { + "epoch": 0.1408533692706083, + "grad_norm": 9.57885945424305, + "learning_rate": 4.899211832974602e-05, + "loss": 2.629, + "mean_token_accuracy": 0.4413793087005615, + "step": 139845 + }, + { + "epoch": 0.14085840532371247, + "grad_norm": 11.856917455682977, + "learning_rate": 4.899200743246736e-05, + "loss": 2.8038, + "mean_token_accuracy": 0.43793103098869324, + "step": 139850 + }, + { + "epoch": 0.14086344137681664, + "grad_norm": 9.610660169698743, + "learning_rate": 4.899189652922783e-05, + "loss": 2.2006, + "mean_token_accuracy": 0.42758620381355283, + "step": 139855 + }, + { + "epoch": 0.1408684774299208, + "grad_norm": 9.77261766777269, + "learning_rate": 4.899178562002744e-05, + "loss": 2.3198, + "mean_token_accuracy": 0.4310344815254211, + "step": 139860 + }, + { + "epoch": 0.140873513483025, + "grad_norm": 9.766768902788696, + "learning_rate": 4.899167470486622e-05, + "loss": 2.1202, + "mean_token_accuracy": 0.4931034564971924, + "step": 139865 + }, + { + "epoch": 0.14087854953612916, + "grad_norm": 11.156741153298752, + "learning_rate": 4.899156378374422e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.3984875977039337, + "step": 139870 + }, + { + "epoch": 0.14088358558923333, + "grad_norm": 8.550468581124601, + "learning_rate": 4.899145285666144e-05, + "loss": 2.6242, + "mean_token_accuracy": 0.38965517580509185, + "step": 139875 + }, + { + "epoch": 0.14088862164233748, + "grad_norm": 11.476300785924476, + "learning_rate": 4.8991341923617924e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.39310345649719236, + "step": 139880 + }, + { + "epoch": 0.14089365769544165, + "grad_norm": 9.637903713103686, + "learning_rate": 4.89912309846137e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.38620689511299133, + "step": 139885 + }, + { + "epoch": 0.14089869374854583, + "grad_norm": 11.384715198003379, + "learning_rate": 4.899112003964881e-05, + "loss": 2.5156, + "mean_token_accuracy": 0.4137930989265442, + "step": 139890 + }, + { + "epoch": 0.14090372980165, + "grad_norm": 10.143126159734122, + "learning_rate": 4.899100908872328e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.4344827592372894, + "step": 139895 + }, + { + "epoch": 0.14090876585475418, + "grad_norm": 9.208608954645548, + "learning_rate": 4.8990898131837134e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.42413793206214906, + "step": 139900 + }, + { + "epoch": 0.14091380190785835, + "grad_norm": 10.489483594896107, + "learning_rate": 4.899078716899041e-05, + "loss": 2.1731, + "mean_token_accuracy": 0.46551724076271056, + "step": 139905 + }, + { + "epoch": 0.14091883796096252, + "grad_norm": 9.781124148368592, + "learning_rate": 4.899067620018314e-05, + "loss": 2.4242, + "mean_token_accuracy": 0.4503932178020477, + "step": 139910 + }, + { + "epoch": 0.1409238740140667, + "grad_norm": 10.323881929166419, + "learning_rate": 4.8990565225415345e-05, + "loss": 2.7946, + "mean_token_accuracy": 0.41034482717514037, + "step": 139915 + }, + { + "epoch": 0.14092891006717087, + "grad_norm": 10.819996630872469, + "learning_rate": 4.899045424468706e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.41379310488700866, + "step": 139920 + }, + { + "epoch": 0.14093394612027504, + "grad_norm": 11.843478603817722, + "learning_rate": 4.8990343257998316e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.4172413766384125, + "step": 139925 + }, + { + "epoch": 0.14093898217337922, + "grad_norm": 10.319134352011824, + "learning_rate": 4.8990232265349156e-05, + "loss": 2.338, + "mean_token_accuracy": 0.44482758045196535, + "step": 139930 + }, + { + "epoch": 0.1409440182264834, + "grad_norm": 10.518440036706624, + "learning_rate": 4.8990121266739594e-05, + "loss": 2.2685, + "mean_token_accuracy": 0.41724138259887694, + "step": 139935 + }, + { + "epoch": 0.14094905427958757, + "grad_norm": 10.728684087341568, + "learning_rate": 4.899001026216966e-05, + "loss": 2.6778, + "mean_token_accuracy": 0.4103448331356049, + "step": 139940 + }, + { + "epoch": 0.14095409033269174, + "grad_norm": 10.516534104407803, + "learning_rate": 4.89898992516394e-05, + "loss": 2.4708, + "mean_token_accuracy": 0.47586206793785096, + "step": 139945 + }, + { + "epoch": 0.1409591263857959, + "grad_norm": 9.390443545951973, + "learning_rate": 4.898978823514883e-05, + "loss": 2.2734, + "mean_token_accuracy": 0.4620689690113068, + "step": 139950 + }, + { + "epoch": 0.1409641624389001, + "grad_norm": 9.4896109536185, + "learning_rate": 4.8989677212697996e-05, + "loss": 2.3454, + "mean_token_accuracy": 0.4517241358757019, + "step": 139955 + }, + { + "epoch": 0.14096919849200426, + "grad_norm": 9.213443594944872, + "learning_rate": 4.898956618428691e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.4517241358757019, + "step": 139960 + }, + { + "epoch": 0.14097423454510843, + "grad_norm": 11.50410476388823, + "learning_rate": 4.8989455149915615e-05, + "loss": 2.3556, + "mean_token_accuracy": 0.4103448331356049, + "step": 139965 + }, + { + "epoch": 0.1409792705982126, + "grad_norm": 14.455722093071376, + "learning_rate": 4.898934410958414e-05, + "loss": 2.4618, + "mean_token_accuracy": 0.4413793087005615, + "step": 139970 + }, + { + "epoch": 0.14098430665131678, + "grad_norm": 14.396036685765093, + "learning_rate": 4.8989233063292516e-05, + "loss": 2.5183, + "mean_token_accuracy": 0.4068965494632721, + "step": 139975 + }, + { + "epoch": 0.14098934270442096, + "grad_norm": 10.747526328631206, + "learning_rate": 4.898912201104077e-05, + "loss": 2.3234, + "mean_token_accuracy": 0.47241378426551817, + "step": 139980 + }, + { + "epoch": 0.14099437875752513, + "grad_norm": 10.97783751007184, + "learning_rate": 4.898901095282894e-05, + "loss": 2.6459, + "mean_token_accuracy": 0.35862069129943847, + "step": 139985 + }, + { + "epoch": 0.1409994148106293, + "grad_norm": 9.044167778940276, + "learning_rate": 4.898889988865706e-05, + "loss": 1.7793, + "mean_token_accuracy": 0.5793103396892547, + "step": 139990 + }, + { + "epoch": 0.14100445086373348, + "grad_norm": 10.707788793871304, + "learning_rate": 4.898878881852514e-05, + "loss": 2.7049, + "mean_token_accuracy": 0.3965517163276672, + "step": 139995 + }, + { + "epoch": 0.14100948691683765, + "grad_norm": 10.898108399224432, + "learning_rate": 4.898867774243323e-05, + "loss": 2.8714, + "mean_token_accuracy": 0.40859044194221494, + "step": 140000 + }, + { + "epoch": 0.14101452296994182, + "grad_norm": 8.18986887852621, + "learning_rate": 4.898856666038136e-05, + "loss": 2.2691, + "mean_token_accuracy": 0.46896552443504336, + "step": 140005 + }, + { + "epoch": 0.141019559023046, + "grad_norm": 12.161511605561584, + "learning_rate": 4.898845557236955e-05, + "loss": 2.5507, + "mean_token_accuracy": 0.3896551728248596, + "step": 140010 + }, + { + "epoch": 0.14102459507615017, + "grad_norm": 10.658953642251863, + "learning_rate": 4.898834447839783e-05, + "loss": 2.294, + "mean_token_accuracy": 0.42413793206214906, + "step": 140015 + }, + { + "epoch": 0.14102963112925432, + "grad_norm": 10.485906680599228, + "learning_rate": 4.898823337846625e-05, + "loss": 2.537, + "mean_token_accuracy": 0.4068965494632721, + "step": 140020 + }, + { + "epoch": 0.1410346671823585, + "grad_norm": 11.290276387902704, + "learning_rate": 4.898812227257482e-05, + "loss": 2.3955, + "mean_token_accuracy": 0.40689656138420105, + "step": 140025 + }, + { + "epoch": 0.14103970323546267, + "grad_norm": 8.670751035579551, + "learning_rate": 4.898801116072358e-05, + "loss": 2.0738, + "mean_token_accuracy": 0.47586206197738645, + "step": 140030 + }, + { + "epoch": 0.14104473928856684, + "grad_norm": 10.153570688196185, + "learning_rate": 4.898790004291257e-05, + "loss": 2.2202, + "mean_token_accuracy": 0.4551724135875702, + "step": 140035 + }, + { + "epoch": 0.141049775341671, + "grad_norm": 9.6228233344508, + "learning_rate": 4.8987788919141796e-05, + "loss": 2.1784, + "mean_token_accuracy": 0.4586206912994385, + "step": 140040 + }, + { + "epoch": 0.1410548113947752, + "grad_norm": 11.342022243938711, + "learning_rate": 4.898767778941132e-05, + "loss": 2.0338, + "mean_token_accuracy": 0.47586206197738645, + "step": 140045 + }, + { + "epoch": 0.14105984744787936, + "grad_norm": 13.364459776718911, + "learning_rate": 4.898756665372114e-05, + "loss": 2.2932, + "mean_token_accuracy": 0.4770935893058777, + "step": 140050 + }, + { + "epoch": 0.14106488350098353, + "grad_norm": 10.27067917033712, + "learning_rate": 4.898745551207131e-05, + "loss": 2.1734, + "mean_token_accuracy": 0.4551724135875702, + "step": 140055 + }, + { + "epoch": 0.1410699195540877, + "grad_norm": 8.861173507432301, + "learning_rate": 4.8987344364461856e-05, + "loss": 1.9017, + "mean_token_accuracy": 0.48620688915252686, + "step": 140060 + }, + { + "epoch": 0.14107495560719188, + "grad_norm": 9.07117296886931, + "learning_rate": 4.898723321089281e-05, + "loss": 2.1245, + "mean_token_accuracy": 0.506896561384201, + "step": 140065 + }, + { + "epoch": 0.14107999166029606, + "grad_norm": 10.99698495734675, + "learning_rate": 4.8987122051364184e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.4379310369491577, + "step": 140070 + }, + { + "epoch": 0.14108502771340023, + "grad_norm": 11.00143540255811, + "learning_rate": 4.898701088587604e-05, + "loss": 2.5042, + "mean_token_accuracy": 0.41034482717514037, + "step": 140075 + }, + { + "epoch": 0.1410900637665044, + "grad_norm": 9.113838658528676, + "learning_rate": 4.898689971442839e-05, + "loss": 2.4596, + "mean_token_accuracy": 0.3896551728248596, + "step": 140080 + }, + { + "epoch": 0.14109509981960858, + "grad_norm": 14.643268076382094, + "learning_rate": 4.8986788537021264e-05, + "loss": 2.8461, + "mean_token_accuracy": 0.379310342669487, + "step": 140085 + }, + { + "epoch": 0.14110013587271275, + "grad_norm": 8.575846274495344, + "learning_rate": 4.89866773536547e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.39310344457626345, + "step": 140090 + }, + { + "epoch": 0.14110517192581692, + "grad_norm": 9.03748019875443, + "learning_rate": 4.898656616432873e-05, + "loss": 2.5223, + "mean_token_accuracy": 0.4482758641242981, + "step": 140095 + }, + { + "epoch": 0.1411102079789211, + "grad_norm": 10.441266561970286, + "learning_rate": 4.898645496904337e-05, + "loss": 2.4091, + "mean_token_accuracy": 0.42068966031074523, + "step": 140100 + }, + { + "epoch": 0.14111524403202527, + "grad_norm": 8.736722222990101, + "learning_rate": 4.898634376779867e-05, + "loss": 2.7521, + "mean_token_accuracy": 0.4034482777118683, + "step": 140105 + }, + { + "epoch": 0.14112028008512945, + "grad_norm": 12.10196233947874, + "learning_rate": 4.898623256059466e-05, + "loss": 2.6441, + "mean_token_accuracy": 0.3896551728248596, + "step": 140110 + }, + { + "epoch": 0.14112531613823362, + "grad_norm": 11.653963721379736, + "learning_rate": 4.8986121347431345e-05, + "loss": 2.3006, + "mean_token_accuracy": 0.4172413766384125, + "step": 140115 + }, + { + "epoch": 0.1411303521913378, + "grad_norm": 12.20022416476889, + "learning_rate": 4.8986010128308784e-05, + "loss": 2.5637, + "mean_token_accuracy": 0.3793103516101837, + "step": 140120 + }, + { + "epoch": 0.14113538824444197, + "grad_norm": 14.65555960243942, + "learning_rate": 4.8985898903227e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.4448275864124298, + "step": 140125 + }, + { + "epoch": 0.14114042429754614, + "grad_norm": 12.230146963168526, + "learning_rate": 4.898578767218601e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.4344827592372894, + "step": 140130 + }, + { + "epoch": 0.14114546035065031, + "grad_norm": 10.025759506542542, + "learning_rate": 4.898567643518587e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.42722323536872864, + "step": 140135 + }, + { + "epoch": 0.1411504964037545, + "grad_norm": 9.653743575317753, + "learning_rate": 4.89855651922266e-05, + "loss": 2.2445, + "mean_token_accuracy": 0.482758617401123, + "step": 140140 + }, + { + "epoch": 0.14115553245685866, + "grad_norm": 10.933280997987332, + "learning_rate": 4.8985453943308215e-05, + "loss": 2.3478, + "mean_token_accuracy": 0.4676950931549072, + "step": 140145 + }, + { + "epoch": 0.14116056850996284, + "grad_norm": 9.70933537446517, + "learning_rate": 4.8985342688430766e-05, + "loss": 2.4706, + "mean_token_accuracy": 0.42068966031074523, + "step": 140150 + }, + { + "epoch": 0.141165604563067, + "grad_norm": 12.193108437295077, + "learning_rate": 4.898523142759428e-05, + "loss": 2.6538, + "mean_token_accuracy": 0.3758620709180832, + "step": 140155 + }, + { + "epoch": 0.14117064061617116, + "grad_norm": 10.52239724888491, + "learning_rate": 4.8985120160798784e-05, + "loss": 2.1904, + "mean_token_accuracy": 0.4551724135875702, + "step": 140160 + }, + { + "epoch": 0.14117567666927533, + "grad_norm": 8.414153533225, + "learning_rate": 4.89850088880443e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.44676345586776733, + "step": 140165 + }, + { + "epoch": 0.1411807127223795, + "grad_norm": 12.635108441515952, + "learning_rate": 4.8984897609330875e-05, + "loss": 2.3956, + "mean_token_accuracy": 0.3862069010734558, + "step": 140170 + }, + { + "epoch": 0.14118574877548368, + "grad_norm": 10.341726897217049, + "learning_rate": 4.898478632465854e-05, + "loss": 2.2588, + "mean_token_accuracy": 0.4482758641242981, + "step": 140175 + }, + { + "epoch": 0.14119078482858785, + "grad_norm": 9.47824428022073, + "learning_rate": 4.89846750340273e-05, + "loss": 2.1953, + "mean_token_accuracy": 0.46896551847457885, + "step": 140180 + }, + { + "epoch": 0.14119582088169202, + "grad_norm": 17.177649131223404, + "learning_rate": 4.898456373743723e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4517241358757019, + "step": 140185 + }, + { + "epoch": 0.1412008569347962, + "grad_norm": 11.872364210819864, + "learning_rate": 4.8984452434888325e-05, + "loss": 2.5291, + "mean_token_accuracy": 0.3999999940395355, + "step": 140190 + }, + { + "epoch": 0.14120589298790037, + "grad_norm": 9.727180919595572, + "learning_rate": 4.8984341126380626e-05, + "loss": 2.4045, + "mean_token_accuracy": 0.43448275327682495, + "step": 140195 + }, + { + "epoch": 0.14121092904100455, + "grad_norm": 10.716446686141841, + "learning_rate": 4.898422981191416e-05, + "loss": 2.3554, + "mean_token_accuracy": 0.41724138259887694, + "step": 140200 + }, + { + "epoch": 0.14121596509410872, + "grad_norm": 9.746793355315841, + "learning_rate": 4.898411849148897e-05, + "loss": 2.2881, + "mean_token_accuracy": 0.44482758045196535, + "step": 140205 + }, + { + "epoch": 0.1412210011472129, + "grad_norm": 10.276787786817653, + "learning_rate": 4.8984007165105075e-05, + "loss": 2.1275, + "mean_token_accuracy": 0.43448275327682495, + "step": 140210 + }, + { + "epoch": 0.14122603720031707, + "grad_norm": 13.52914833486473, + "learning_rate": 4.898389583276252e-05, + "loss": 2.7987, + "mean_token_accuracy": 0.35862069129943847, + "step": 140215 + }, + { + "epoch": 0.14123107325342124, + "grad_norm": 10.313252126485908, + "learning_rate": 4.898378449446132e-05, + "loss": 2.6326, + "mean_token_accuracy": 0.38965517580509185, + "step": 140220 + }, + { + "epoch": 0.14123610930652541, + "grad_norm": 10.09076594803918, + "learning_rate": 4.898367315020151e-05, + "loss": 2.2662, + "mean_token_accuracy": 0.4551724135875702, + "step": 140225 + }, + { + "epoch": 0.1412411453596296, + "grad_norm": 9.925232496000808, + "learning_rate": 4.898356179998312e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.4103448301553726, + "step": 140230 + }, + { + "epoch": 0.14124618141273376, + "grad_norm": 11.172973133006497, + "learning_rate": 4.898345044380619e-05, + "loss": 2.351, + "mean_token_accuracy": 0.43793103098869324, + "step": 140235 + }, + { + "epoch": 0.14125121746583794, + "grad_norm": 9.712642526144881, + "learning_rate": 4.898333908167075e-05, + "loss": 2.2714, + "mean_token_accuracy": 0.4551724135875702, + "step": 140240 + }, + { + "epoch": 0.1412562535189421, + "grad_norm": 10.373756526269561, + "learning_rate": 4.898322771357682e-05, + "loss": 2.3885, + "mean_token_accuracy": 0.493103438615799, + "step": 140245 + }, + { + "epoch": 0.14126128957204628, + "grad_norm": 11.621263239404584, + "learning_rate": 4.8983116339524435e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.4034482777118683, + "step": 140250 + }, + { + "epoch": 0.14126632562515046, + "grad_norm": 10.09838222890984, + "learning_rate": 4.898300495951362e-05, + "loss": 2.2988, + "mean_token_accuracy": 0.4172413766384125, + "step": 140255 + }, + { + "epoch": 0.14127136167825463, + "grad_norm": 10.525047106236844, + "learning_rate": 4.898289357354442e-05, + "loss": 2.3908, + "mean_token_accuracy": 0.4154264986515045, + "step": 140260 + }, + { + "epoch": 0.1412763977313588, + "grad_norm": 9.381272791933913, + "learning_rate": 4.898278218161687e-05, + "loss": 2.123, + "mean_token_accuracy": 0.4551724135875702, + "step": 140265 + }, + { + "epoch": 0.14128143378446298, + "grad_norm": 13.257994199044582, + "learning_rate": 4.898267078373098e-05, + "loss": 2.3994, + "mean_token_accuracy": 0.4137930989265442, + "step": 140270 + }, + { + "epoch": 0.14128646983756715, + "grad_norm": 10.18273344471407, + "learning_rate": 4.89825593798868e-05, + "loss": 2.3609, + "mean_token_accuracy": 0.42413792610168455, + "step": 140275 + }, + { + "epoch": 0.14129150589067133, + "grad_norm": 10.144082618994162, + "learning_rate": 4.8982447970084346e-05, + "loss": 2.1577, + "mean_token_accuracy": 0.4758620738983154, + "step": 140280 + }, + { + "epoch": 0.1412965419437755, + "grad_norm": 12.126970683517973, + "learning_rate": 4.898233655432365e-05, + "loss": 2.7429, + "mean_token_accuracy": 0.4379310369491577, + "step": 140285 + }, + { + "epoch": 0.14130157799687967, + "grad_norm": 11.139577454047947, + "learning_rate": 4.898222513260475e-05, + "loss": 2.388, + "mean_token_accuracy": 0.42758620381355283, + "step": 140290 + }, + { + "epoch": 0.14130661404998385, + "grad_norm": 13.314973888332993, + "learning_rate": 4.898211370492768e-05, + "loss": 2.6831, + "mean_token_accuracy": 0.42413793206214906, + "step": 140295 + }, + { + "epoch": 0.141311650103088, + "grad_norm": 9.660334531709022, + "learning_rate": 4.898200227129246e-05, + "loss": 2.3828, + "mean_token_accuracy": 0.4517241418361664, + "step": 140300 + }, + { + "epoch": 0.14131668615619217, + "grad_norm": 11.606781396537196, + "learning_rate": 4.898189083169913e-05, + "loss": 2.2712, + "mean_token_accuracy": 0.4896551728248596, + "step": 140305 + }, + { + "epoch": 0.14132172220929634, + "grad_norm": 10.287606596183796, + "learning_rate": 4.898177938614771e-05, + "loss": 2.0992, + "mean_token_accuracy": 0.4965517222881317, + "step": 140310 + }, + { + "epoch": 0.14132675826240051, + "grad_norm": 12.174701373617077, + "learning_rate": 4.898166793463825e-05, + "loss": 2.0341, + "mean_token_accuracy": 0.4620689630508423, + "step": 140315 + }, + { + "epoch": 0.1413317943155047, + "grad_norm": 8.855311388685147, + "learning_rate": 4.8981556477170764e-05, + "loss": 2.1052, + "mean_token_accuracy": 0.43448275327682495, + "step": 140320 + }, + { + "epoch": 0.14133683036860886, + "grad_norm": 10.731210668084534, + "learning_rate": 4.8981445013745286e-05, + "loss": 2.5581, + "mean_token_accuracy": 0.4068965494632721, + "step": 140325 + }, + { + "epoch": 0.14134186642171304, + "grad_norm": 11.96474676320133, + "learning_rate": 4.898133354436185e-05, + "loss": 2.5796, + "mean_token_accuracy": 0.43448275327682495, + "step": 140330 + }, + { + "epoch": 0.1413469024748172, + "grad_norm": 12.604994495618143, + "learning_rate": 4.898122206902049e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.4413793087005615, + "step": 140335 + }, + { + "epoch": 0.14135193852792138, + "grad_norm": 9.624686642942736, + "learning_rate": 4.898111058772122e-05, + "loss": 2.2435, + "mean_token_accuracy": 0.3896551787853241, + "step": 140340 + }, + { + "epoch": 0.14135697458102556, + "grad_norm": 12.948842647200312, + "learning_rate": 4.89809991004641e-05, + "loss": 2.4847, + "mean_token_accuracy": 0.4068965554237366, + "step": 140345 + }, + { + "epoch": 0.14136201063412973, + "grad_norm": 11.421800298599992, + "learning_rate": 4.8980887607249136e-05, + "loss": 2.8777, + "mean_token_accuracy": 0.3758620649576187, + "step": 140350 + }, + { + "epoch": 0.1413670466872339, + "grad_norm": 13.720396630523153, + "learning_rate": 4.898077610807637e-05, + "loss": 2.7729, + "mean_token_accuracy": 0.358620685338974, + "step": 140355 + }, + { + "epoch": 0.14137208274033808, + "grad_norm": 11.030175853799784, + "learning_rate": 4.898066460294583e-05, + "loss": 2.123, + "mean_token_accuracy": 0.4517241418361664, + "step": 140360 + }, + { + "epoch": 0.14137711879344225, + "grad_norm": 11.493065712468525, + "learning_rate": 4.898055309185755e-05, + "loss": 2.1129, + "mean_token_accuracy": 0.4793103337287903, + "step": 140365 + }, + { + "epoch": 0.14138215484654643, + "grad_norm": 10.992088475638084, + "learning_rate": 4.8980441574811556e-05, + "loss": 2.3463, + "mean_token_accuracy": 0.42758620381355283, + "step": 140370 + }, + { + "epoch": 0.1413871908996506, + "grad_norm": 12.328007538954466, + "learning_rate": 4.898033005180789e-05, + "loss": 2.7487, + "mean_token_accuracy": 0.38965516686439516, + "step": 140375 + }, + { + "epoch": 0.14139222695275477, + "grad_norm": 11.117514349694028, + "learning_rate": 4.898021852284656e-05, + "loss": 2.3066, + "mean_token_accuracy": 0.4172413766384125, + "step": 140380 + }, + { + "epoch": 0.14139726300585895, + "grad_norm": 10.827037726596515, + "learning_rate": 4.898010698792762e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.4206896543502808, + "step": 140385 + }, + { + "epoch": 0.14140229905896312, + "grad_norm": 10.78278508323296, + "learning_rate": 4.897999544705108e-05, + "loss": 1.923, + "mean_token_accuracy": 0.46551724076271056, + "step": 140390 + }, + { + "epoch": 0.1414073351120673, + "grad_norm": 13.052926176205943, + "learning_rate": 4.8979883900217e-05, + "loss": 2.2951, + "mean_token_accuracy": 0.43278887271881106, + "step": 140395 + }, + { + "epoch": 0.14141237116517147, + "grad_norm": 11.414591722506675, + "learning_rate": 4.8979772347425395e-05, + "loss": 2.3205, + "mean_token_accuracy": 0.42413792610168455, + "step": 140400 + }, + { + "epoch": 0.14141740721827564, + "grad_norm": 10.634271188252404, + "learning_rate": 4.8979660788676286e-05, + "loss": 2.4997, + "mean_token_accuracy": 0.42413793206214906, + "step": 140405 + }, + { + "epoch": 0.14142244327137982, + "grad_norm": 9.796247610264974, + "learning_rate": 4.897954922396971e-05, + "loss": 2.4256, + "mean_token_accuracy": 0.4, + "step": 140410 + }, + { + "epoch": 0.141427479324484, + "grad_norm": 9.402012681689591, + "learning_rate": 4.8979437653305706e-05, + "loss": 2.4828, + "mean_token_accuracy": 0.46406533718109133, + "step": 140415 + }, + { + "epoch": 0.14143251537758816, + "grad_norm": 10.008120138217288, + "learning_rate": 4.89793260766843e-05, + "loss": 2.8613, + "mean_token_accuracy": 0.3448275953531265, + "step": 140420 + }, + { + "epoch": 0.14143755143069234, + "grad_norm": 8.451043286737669, + "learning_rate": 4.897921449410552e-05, + "loss": 2.5674, + "mean_token_accuracy": 0.4310344815254211, + "step": 140425 + }, + { + "epoch": 0.1414425874837965, + "grad_norm": 9.344740315998466, + "learning_rate": 4.89791029055694e-05, + "loss": 2.4221, + "mean_token_accuracy": 0.4206896543502808, + "step": 140430 + }, + { + "epoch": 0.14144762353690069, + "grad_norm": 11.750915121098629, + "learning_rate": 4.8978991311075976e-05, + "loss": 2.19, + "mean_token_accuracy": 0.4758620738983154, + "step": 140435 + }, + { + "epoch": 0.14145265959000483, + "grad_norm": 13.94916707577565, + "learning_rate": 4.897887971062527e-05, + "loss": 2.6946, + "mean_token_accuracy": 0.4034482777118683, + "step": 140440 + }, + { + "epoch": 0.141457695643109, + "grad_norm": 9.422062147159924, + "learning_rate": 4.897876810421731e-05, + "loss": 2.3551, + "mean_token_accuracy": 0.43793103098869324, + "step": 140445 + }, + { + "epoch": 0.14146273169621318, + "grad_norm": 10.45485081064072, + "learning_rate": 4.897865649185214e-05, + "loss": 3.1192, + "mean_token_accuracy": 0.37241379618644715, + "step": 140450 + }, + { + "epoch": 0.14146776774931735, + "grad_norm": 8.97546546694762, + "learning_rate": 4.8978544873529784e-05, + "loss": 2.5455, + "mean_token_accuracy": 0.4517241358757019, + "step": 140455 + }, + { + "epoch": 0.14147280380242153, + "grad_norm": 9.723024655199998, + "learning_rate": 4.8978433249250275e-05, + "loss": 2.13, + "mean_token_accuracy": 0.49655172824859617, + "step": 140460 + }, + { + "epoch": 0.1414778398555257, + "grad_norm": 9.354849215674433, + "learning_rate": 4.8978321619013645e-05, + "loss": 2.0469, + "mean_token_accuracy": 0.49655171036720275, + "step": 140465 + }, + { + "epoch": 0.14148287590862987, + "grad_norm": 9.501963620531358, + "learning_rate": 4.8978209982819914e-05, + "loss": 1.9376, + "mean_token_accuracy": 0.5034482657909394, + "step": 140470 + }, + { + "epoch": 0.14148791196173405, + "grad_norm": 10.944631798962858, + "learning_rate": 4.897809834066913e-05, + "loss": 3.0087, + "mean_token_accuracy": 0.4068965494632721, + "step": 140475 + }, + { + "epoch": 0.14149294801483822, + "grad_norm": 9.022146326981478, + "learning_rate": 4.897798669256131e-05, + "loss": 2.233, + "mean_token_accuracy": 0.42758620977401735, + "step": 140480 + }, + { + "epoch": 0.1414979840679424, + "grad_norm": 15.865809505163934, + "learning_rate": 4.897787503849649e-05, + "loss": 2.4466, + "mean_token_accuracy": 0.42413792610168455, + "step": 140485 + }, + { + "epoch": 0.14150302012104657, + "grad_norm": 9.63268504884066, + "learning_rate": 4.89777633784747e-05, + "loss": 2.2981, + "mean_token_accuracy": 0.4551724135875702, + "step": 140490 + }, + { + "epoch": 0.14150805617415074, + "grad_norm": 10.635618866900481, + "learning_rate": 4.897765171249598e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.43793103098869324, + "step": 140495 + }, + { + "epoch": 0.14151309222725492, + "grad_norm": 8.329877176197359, + "learning_rate": 4.897754004056034e-05, + "loss": 2.033, + "mean_token_accuracy": 0.5172413766384125, + "step": 140500 + }, + { + "epoch": 0.1415181282803591, + "grad_norm": 10.562381120803575, + "learning_rate": 4.897742836266784e-05, + "loss": 2.4704, + "mean_token_accuracy": 0.4172413766384125, + "step": 140505 + }, + { + "epoch": 0.14152316433346326, + "grad_norm": 11.061384329355556, + "learning_rate": 4.8977316678818485e-05, + "loss": 2.3914, + "mean_token_accuracy": 0.4206896543502808, + "step": 140510 + }, + { + "epoch": 0.14152820038656744, + "grad_norm": 10.317934170653393, + "learning_rate": 4.897720498901232e-05, + "loss": 2.2801, + "mean_token_accuracy": 0.42256503701210024, + "step": 140515 + }, + { + "epoch": 0.1415332364396716, + "grad_norm": 12.259346922344315, + "learning_rate": 4.897709329324937e-05, + "loss": 2.77, + "mean_token_accuracy": 0.3999999940395355, + "step": 140520 + }, + { + "epoch": 0.14153827249277579, + "grad_norm": 11.834567606563937, + "learning_rate": 4.897698159152967e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.4344827592372894, + "step": 140525 + }, + { + "epoch": 0.14154330854587996, + "grad_norm": 10.065975099062419, + "learning_rate": 4.897686988385325e-05, + "loss": 2.3026, + "mean_token_accuracy": 0.42068966031074523, + "step": 140530 + }, + { + "epoch": 0.14154834459898413, + "grad_norm": 12.308906671307767, + "learning_rate": 4.8976758170220134e-05, + "loss": 2.9094, + "mean_token_accuracy": 0.4124621868133545, + "step": 140535 + }, + { + "epoch": 0.1415533806520883, + "grad_norm": 8.744237157228927, + "learning_rate": 4.8976646450630365e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.4517241358757019, + "step": 140540 + }, + { + "epoch": 0.14155841670519248, + "grad_norm": 9.012054690036376, + "learning_rate": 4.897653472508396e-05, + "loss": 2.7568, + "mean_token_accuracy": 0.3999999940395355, + "step": 140545 + }, + { + "epoch": 0.14156345275829665, + "grad_norm": 13.739321139411189, + "learning_rate": 4.897642299358097e-05, + "loss": 2.6432, + "mean_token_accuracy": 0.37586207389831544, + "step": 140550 + }, + { + "epoch": 0.14156848881140083, + "grad_norm": 9.658993374433193, + "learning_rate": 4.89763112561214e-05, + "loss": 2.3521, + "mean_token_accuracy": 0.46412582993507384, + "step": 140555 + }, + { + "epoch": 0.141573524864505, + "grad_norm": 10.756090730949467, + "learning_rate": 4.897619951270531e-05, + "loss": 2.1926, + "mean_token_accuracy": 0.4517241299152374, + "step": 140560 + }, + { + "epoch": 0.14157856091760918, + "grad_norm": 9.69602627991262, + "learning_rate": 4.897608776333271e-05, + "loss": 2.2113, + "mean_token_accuracy": 0.4535390198230743, + "step": 140565 + }, + { + "epoch": 0.14158359697071335, + "grad_norm": 10.914551114022528, + "learning_rate": 4.897597600800363e-05, + "loss": 2.248, + "mean_token_accuracy": 0.4482758641242981, + "step": 140570 + }, + { + "epoch": 0.14158863302381752, + "grad_norm": 12.221413371806607, + "learning_rate": 4.8975864246718114e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.4034482717514038, + "step": 140575 + }, + { + "epoch": 0.14159366907692167, + "grad_norm": 8.552375192417331, + "learning_rate": 4.897575247947618e-05, + "loss": 1.8452, + "mean_token_accuracy": 0.5448275923728942, + "step": 140580 + }, + { + "epoch": 0.14159870513002584, + "grad_norm": 10.830551814681563, + "learning_rate": 4.897564070627788e-05, + "loss": 2.4263, + "mean_token_accuracy": 0.38620689511299133, + "step": 140585 + }, + { + "epoch": 0.14160374118313002, + "grad_norm": 10.372813144746793, + "learning_rate": 4.897552892712322e-05, + "loss": 2.7543, + "mean_token_accuracy": 0.382758629322052, + "step": 140590 + }, + { + "epoch": 0.1416087772362342, + "grad_norm": 11.76858045557601, + "learning_rate": 4.897541714201224e-05, + "loss": 2.6237, + "mean_token_accuracy": 0.3655172407627106, + "step": 140595 + }, + { + "epoch": 0.14161381328933836, + "grad_norm": 10.338713370355235, + "learning_rate": 4.897530535094498e-05, + "loss": 2.333, + "mean_token_accuracy": 0.458620685338974, + "step": 140600 + }, + { + "epoch": 0.14161884934244254, + "grad_norm": 10.054601672940803, + "learning_rate": 4.8975193553921464e-05, + "loss": 2.1073, + "mean_token_accuracy": 0.4620689570903778, + "step": 140605 + }, + { + "epoch": 0.1416238853955467, + "grad_norm": 10.043036507454254, + "learning_rate": 4.897508175094172e-05, + "loss": 2.5921, + "mean_token_accuracy": 0.4344827651977539, + "step": 140610 + }, + { + "epoch": 0.14162892144865089, + "grad_norm": 9.512135524580431, + "learning_rate": 4.897496994200579e-05, + "loss": 2.229, + "mean_token_accuracy": 0.4379310429096222, + "step": 140615 + }, + { + "epoch": 0.14163395750175506, + "grad_norm": 10.612037186521766, + "learning_rate": 4.897485812711368e-05, + "loss": 2.5287, + "mean_token_accuracy": 0.36551724672317504, + "step": 140620 + }, + { + "epoch": 0.14163899355485923, + "grad_norm": 10.980393505520793, + "learning_rate": 4.897474630626545e-05, + "loss": 2.237, + "mean_token_accuracy": 0.46551724672317507, + "step": 140625 + }, + { + "epoch": 0.1416440296079634, + "grad_norm": 11.188049625385073, + "learning_rate": 4.897463447946112e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.36551723480224607, + "step": 140630 + }, + { + "epoch": 0.14164906566106758, + "grad_norm": 11.796677818491794, + "learning_rate": 4.8974522646700714e-05, + "loss": 2.1772, + "mean_token_accuracy": 0.5068965435028077, + "step": 140635 + }, + { + "epoch": 0.14165410171417175, + "grad_norm": 11.863186522702293, + "learning_rate": 4.897441080798428e-05, + "loss": 2.3496, + "mean_token_accuracy": 0.42413792610168455, + "step": 140640 + }, + { + "epoch": 0.14165913776727593, + "grad_norm": 8.572983100789882, + "learning_rate": 4.897429896331182e-05, + "loss": 2.0626, + "mean_token_accuracy": 0.47586206197738645, + "step": 140645 + }, + { + "epoch": 0.1416641738203801, + "grad_norm": 10.05207531341534, + "learning_rate": 4.8974187112683396e-05, + "loss": 2.338, + "mean_token_accuracy": 0.4379310369491577, + "step": 140650 + }, + { + "epoch": 0.14166920987348428, + "grad_norm": 10.126779680806836, + "learning_rate": 4.897407525609903e-05, + "loss": 2.7301, + "mean_token_accuracy": 0.3862068891525269, + "step": 140655 + }, + { + "epoch": 0.14167424592658845, + "grad_norm": 9.008011981512118, + "learning_rate": 4.8973963393558735e-05, + "loss": 2.1246, + "mean_token_accuracy": 0.5081280767917633, + "step": 140660 + }, + { + "epoch": 0.14167928197969262, + "grad_norm": 10.93459614300818, + "learning_rate": 4.8973851525062566e-05, + "loss": 2.5802, + "mean_token_accuracy": 0.42413793206214906, + "step": 140665 + }, + { + "epoch": 0.1416843180327968, + "grad_norm": 10.600525510421074, + "learning_rate": 4.897373965061054e-05, + "loss": 2.5237, + "mean_token_accuracy": 0.45027223229408264, + "step": 140670 + }, + { + "epoch": 0.14168935408590097, + "grad_norm": 11.35846045729706, + "learning_rate": 4.89736277702027e-05, + "loss": 2.1242, + "mean_token_accuracy": 0.4137930989265442, + "step": 140675 + }, + { + "epoch": 0.14169439013900514, + "grad_norm": 9.047634473736348, + "learning_rate": 4.897351588383905e-05, + "loss": 2.687, + "mean_token_accuracy": 0.37931033968925476, + "step": 140680 + }, + { + "epoch": 0.14169942619210932, + "grad_norm": 9.971295003993943, + "learning_rate": 4.897340399151966e-05, + "loss": 2.498, + "mean_token_accuracy": 0.39655172228813174, + "step": 140685 + }, + { + "epoch": 0.1417044622452135, + "grad_norm": 10.846671202121541, + "learning_rate": 4.897329209324453e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.4034482717514038, + "step": 140690 + }, + { + "epoch": 0.14170949829831767, + "grad_norm": 9.840885510986078, + "learning_rate": 4.8973180189013714e-05, + "loss": 2.7341, + "mean_token_accuracy": 0.4068965554237366, + "step": 140695 + }, + { + "epoch": 0.14171453435142184, + "grad_norm": 9.388392577875948, + "learning_rate": 4.897306827882722e-05, + "loss": 2.3073, + "mean_token_accuracy": 0.4689655065536499, + "step": 140700 + }, + { + "epoch": 0.141719570404526, + "grad_norm": 9.677848832983646, + "learning_rate": 4.8972956362685095e-05, + "loss": 2.2022, + "mean_token_accuracy": 0.4363581418991089, + "step": 140705 + }, + { + "epoch": 0.1417246064576302, + "grad_norm": 8.924277262526973, + "learning_rate": 4.8972844440587366e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.44482758045196535, + "step": 140710 + }, + { + "epoch": 0.14172964251073436, + "grad_norm": 8.81836687695403, + "learning_rate": 4.897273251253407e-05, + "loss": 2.1631, + "mean_token_accuracy": 0.5103448271751404, + "step": 140715 + }, + { + "epoch": 0.1417346785638385, + "grad_norm": 9.532537457285772, + "learning_rate": 4.897262057852522e-05, + "loss": 2.4825, + "mean_token_accuracy": 0.36896551847457887, + "step": 140720 + }, + { + "epoch": 0.14173971461694268, + "grad_norm": 10.598305056216368, + "learning_rate": 4.897250863856086e-05, + "loss": 2.331, + "mean_token_accuracy": 0.46551724672317507, + "step": 140725 + }, + { + "epoch": 0.14174475067004685, + "grad_norm": 11.849648045054636, + "learning_rate": 4.897239669264102e-05, + "loss": 2.3966, + "mean_token_accuracy": 0.4034482717514038, + "step": 140730 + }, + { + "epoch": 0.14174978672315103, + "grad_norm": 9.875225871715957, + "learning_rate": 4.897228474076574e-05, + "loss": 2.3271, + "mean_token_accuracy": 0.4551724076271057, + "step": 140735 + }, + { + "epoch": 0.1417548227762552, + "grad_norm": 9.090098733677863, + "learning_rate": 4.897217278293503e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.44827585816383364, + "step": 140740 + }, + { + "epoch": 0.14175985882935938, + "grad_norm": 10.361103920331118, + "learning_rate": 4.897206081914894e-05, + "loss": 2.3218, + "mean_token_accuracy": 0.41379310488700866, + "step": 140745 + }, + { + "epoch": 0.14176489488246355, + "grad_norm": 9.454296164036574, + "learning_rate": 4.897194884940749e-05, + "loss": 2.2077, + "mean_token_accuracy": 0.4620689690113068, + "step": 140750 + }, + { + "epoch": 0.14176993093556772, + "grad_norm": 9.522459822864715, + "learning_rate": 4.8971836873710714e-05, + "loss": 2.3656, + "mean_token_accuracy": 0.41724138259887694, + "step": 140755 + }, + { + "epoch": 0.1417749669886719, + "grad_norm": 12.117112242135113, + "learning_rate": 4.897172489205865e-05, + "loss": 2.7839, + "mean_token_accuracy": 0.38463400602340697, + "step": 140760 + }, + { + "epoch": 0.14178000304177607, + "grad_norm": 10.290577897456522, + "learning_rate": 4.897161290445132e-05, + "loss": 2.4051, + "mean_token_accuracy": 0.41034482717514037, + "step": 140765 + }, + { + "epoch": 0.14178503909488024, + "grad_norm": 10.597945414836664, + "learning_rate": 4.897150091088875e-05, + "loss": 2.5502, + "mean_token_accuracy": 0.46751360297203065, + "step": 140770 + }, + { + "epoch": 0.14179007514798442, + "grad_norm": 8.834526259676629, + "learning_rate": 4.897138891137099e-05, + "loss": 2.2364, + "mean_token_accuracy": 0.4517241418361664, + "step": 140775 + }, + { + "epoch": 0.1417951112010886, + "grad_norm": 9.268951993418353, + "learning_rate": 4.8971276905898055e-05, + "loss": 2.5022, + "mean_token_accuracy": 0.4034482717514038, + "step": 140780 + }, + { + "epoch": 0.14180014725419277, + "grad_norm": 15.94133768404537, + "learning_rate": 4.897116489446998e-05, + "loss": 2.7268, + "mean_token_accuracy": 0.36896551847457887, + "step": 140785 + }, + { + "epoch": 0.14180518330729694, + "grad_norm": 9.713797860014633, + "learning_rate": 4.8971052877086806e-05, + "loss": 2.292, + "mean_token_accuracy": 0.49655171632766726, + "step": 140790 + }, + { + "epoch": 0.1418102193604011, + "grad_norm": 9.255380320866411, + "learning_rate": 4.897094085374855e-05, + "loss": 2.2783, + "mean_token_accuracy": 0.4344827592372894, + "step": 140795 + }, + { + "epoch": 0.1418152554135053, + "grad_norm": 10.894301802127519, + "learning_rate": 4.897082882445524e-05, + "loss": 2.4727, + "mean_token_accuracy": 0.4724137902259827, + "step": 140800 + }, + { + "epoch": 0.14182029146660946, + "grad_norm": 10.18585837496387, + "learning_rate": 4.897071678920693e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.46896551847457885, + "step": 140805 + }, + { + "epoch": 0.14182532751971363, + "grad_norm": 14.05805239361398, + "learning_rate": 4.8970604748003634e-05, + "loss": 2.4313, + "mean_token_accuracy": 0.4275861978530884, + "step": 140810 + }, + { + "epoch": 0.1418303635728178, + "grad_norm": 9.655686560449553, + "learning_rate": 4.897049270084538e-05, + "loss": 2.612, + "mean_token_accuracy": 0.42589232325553894, + "step": 140815 + }, + { + "epoch": 0.14183539962592198, + "grad_norm": 11.071090820043537, + "learning_rate": 4.897038064773221e-05, + "loss": 2.5678, + "mean_token_accuracy": 0.4068965554237366, + "step": 140820 + }, + { + "epoch": 0.14184043567902616, + "grad_norm": 10.058146522739435, + "learning_rate": 4.8970268588664134e-05, + "loss": 2.6578, + "mean_token_accuracy": 0.3862068891525269, + "step": 140825 + }, + { + "epoch": 0.14184547173213033, + "grad_norm": 9.119488496681655, + "learning_rate": 4.8970156523641217e-05, + "loss": 2.0372, + "mean_token_accuracy": 0.4931034445762634, + "step": 140830 + }, + { + "epoch": 0.1418505077852345, + "grad_norm": 10.037200280132705, + "learning_rate": 4.8970044452663474e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.44482759237289426, + "step": 140835 + }, + { + "epoch": 0.14185554383833868, + "grad_norm": 10.46885876160073, + "learning_rate": 4.896993237573092e-05, + "loss": 2.0095, + "mean_token_accuracy": 0.5335148334503174, + "step": 140840 + }, + { + "epoch": 0.14186057989144285, + "grad_norm": 11.322084224315978, + "learning_rate": 4.8969820292843604e-05, + "loss": 2.4778, + "mean_token_accuracy": 0.41379311084747317, + "step": 140845 + }, + { + "epoch": 0.14186561594454702, + "grad_norm": 11.422134024762268, + "learning_rate": 4.896970820400156e-05, + "loss": 2.2072, + "mean_token_accuracy": 0.44827587008476255, + "step": 140850 + }, + { + "epoch": 0.1418706519976512, + "grad_norm": 9.873205871418836, + "learning_rate": 4.896959610920481e-05, + "loss": 2.2568, + "mean_token_accuracy": 0.46896551847457885, + "step": 140855 + }, + { + "epoch": 0.14187568805075534, + "grad_norm": 10.120730953598713, + "learning_rate": 4.8969484008453384e-05, + "loss": 2.7048, + "mean_token_accuracy": 0.3896551638841629, + "step": 140860 + }, + { + "epoch": 0.14188072410385952, + "grad_norm": 11.311150174074205, + "learning_rate": 4.896937190174731e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.48844525814056394, + "step": 140865 + }, + { + "epoch": 0.1418857601569637, + "grad_norm": 9.899761657734727, + "learning_rate": 4.896925978908664e-05, + "loss": 2.331, + "mean_token_accuracy": 0.46896551847457885, + "step": 140870 + }, + { + "epoch": 0.14189079621006787, + "grad_norm": 10.912927801440418, + "learning_rate": 4.896914767047138e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.3620689630508423, + "step": 140875 + }, + { + "epoch": 0.14189583226317204, + "grad_norm": 9.32575424118185, + "learning_rate": 4.8969035545901575e-05, + "loss": 2.4505, + "mean_token_accuracy": 0.3862068891525269, + "step": 140880 + }, + { + "epoch": 0.1419008683162762, + "grad_norm": 11.095413021837498, + "learning_rate": 4.896892341537726e-05, + "loss": 2.0422, + "mean_token_accuracy": 0.43793103098869324, + "step": 140885 + }, + { + "epoch": 0.1419059043693804, + "grad_norm": 9.911363407725052, + "learning_rate": 4.8968811278898454e-05, + "loss": 2.0976, + "mean_token_accuracy": 0.5034482717514038, + "step": 140890 + }, + { + "epoch": 0.14191094042248456, + "grad_norm": 10.368251280404074, + "learning_rate": 4.896869913646518e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.4379310429096222, + "step": 140895 + }, + { + "epoch": 0.14191597647558873, + "grad_norm": 9.285587338149734, + "learning_rate": 4.8968586988077505e-05, + "loss": 2.3245, + "mean_token_accuracy": 0.44482758045196535, + "step": 140900 + }, + { + "epoch": 0.1419210125286929, + "grad_norm": 9.070771564999916, + "learning_rate": 4.896847483373543e-05, + "loss": 2.0363, + "mean_token_accuracy": 0.4310344815254211, + "step": 140905 + }, + { + "epoch": 0.14192604858179708, + "grad_norm": 14.42470625561916, + "learning_rate": 4.8968362673438986e-05, + "loss": 3.1887, + "mean_token_accuracy": 0.358620685338974, + "step": 140910 + }, + { + "epoch": 0.14193108463490126, + "grad_norm": 9.513326418077009, + "learning_rate": 4.896825050718821e-05, + "loss": 2.4314, + "mean_token_accuracy": 0.44482759237289426, + "step": 140915 + }, + { + "epoch": 0.14193612068800543, + "grad_norm": 11.541750092723131, + "learning_rate": 4.896813833498315e-05, + "loss": 2.6271, + "mean_token_accuracy": 0.4413793087005615, + "step": 140920 + }, + { + "epoch": 0.1419411567411096, + "grad_norm": 11.632716192325871, + "learning_rate": 4.896802615682381e-05, + "loss": 2.5468, + "mean_token_accuracy": 0.4241379380226135, + "step": 140925 + }, + { + "epoch": 0.14194619279421378, + "grad_norm": 8.007504897940324, + "learning_rate": 4.896791397271023e-05, + "loss": 2.08, + "mean_token_accuracy": 0.4361161530017853, + "step": 140930 + }, + { + "epoch": 0.14195122884731795, + "grad_norm": 10.710936697031785, + "learning_rate": 4.896780178264245e-05, + "loss": 2.3661, + "mean_token_accuracy": 0.4241379380226135, + "step": 140935 + }, + { + "epoch": 0.14195626490042212, + "grad_norm": 8.92219122155108, + "learning_rate": 4.89676895866205e-05, + "loss": 2.4784, + "mean_token_accuracy": 0.38275861740112305, + "step": 140940 + }, + { + "epoch": 0.1419613009535263, + "grad_norm": 10.811052885337256, + "learning_rate": 4.8967577384644396e-05, + "loss": 2.2442, + "mean_token_accuracy": 0.4448275864124298, + "step": 140945 + }, + { + "epoch": 0.14196633700663047, + "grad_norm": 7.148822758124012, + "learning_rate": 4.896746517671418e-05, + "loss": 1.8096, + "mean_token_accuracy": 0.5016333878040313, + "step": 140950 + }, + { + "epoch": 0.14197137305973465, + "grad_norm": 10.426764610385678, + "learning_rate": 4.896735296282989e-05, + "loss": 2.9665, + "mean_token_accuracy": 0.3482758641242981, + "step": 140955 + }, + { + "epoch": 0.14197640911283882, + "grad_norm": 10.146766630414817, + "learning_rate": 4.896724074299154e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.4551724135875702, + "step": 140960 + }, + { + "epoch": 0.141981445165943, + "grad_norm": 9.96404973689838, + "learning_rate": 4.896712851719918e-05, + "loss": 2.4104, + "mean_token_accuracy": 0.4000000059604645, + "step": 140965 + }, + { + "epoch": 0.14198648121904717, + "grad_norm": 10.211723997932223, + "learning_rate": 4.8967016285452825e-05, + "loss": 2.1196, + "mean_token_accuracy": 0.42413793206214906, + "step": 140970 + }, + { + "epoch": 0.14199151727215134, + "grad_norm": 12.061668823075768, + "learning_rate": 4.896690404775252e-05, + "loss": 2.2852, + "mean_token_accuracy": 0.44482759237289426, + "step": 140975 + }, + { + "epoch": 0.14199655332525551, + "grad_norm": 12.303384484024459, + "learning_rate": 4.8966791804098286e-05, + "loss": 2.387, + "mean_token_accuracy": 0.3620689630508423, + "step": 140980 + }, + { + "epoch": 0.1420015893783597, + "grad_norm": 9.525906890567837, + "learning_rate": 4.896667955449015e-05, + "loss": 2.2701, + "mean_token_accuracy": 0.4, + "step": 140985 + }, + { + "epoch": 0.14200662543146386, + "grad_norm": 11.858504179675501, + "learning_rate": 4.8966567298928155e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.47241379618644713, + "step": 140990 + }, + { + "epoch": 0.14201166148456804, + "grad_norm": 8.844314648514631, + "learning_rate": 4.896645503741234e-05, + "loss": 2.4965, + "mean_token_accuracy": 0.4275861978530884, + "step": 140995 + }, + { + "epoch": 0.14201669753767218, + "grad_norm": 14.983828707664037, + "learning_rate": 4.8966342769942704e-05, + "loss": 2.7224, + "mean_token_accuracy": 0.36896551847457887, + "step": 141000 + }, + { + "epoch": 0.14202173359077636, + "grad_norm": 10.077346424562178, + "learning_rate": 4.8966230496519303e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.43623714447021483, + "step": 141005 + }, + { + "epoch": 0.14202676964388053, + "grad_norm": 13.948258456025338, + "learning_rate": 4.896611821714217e-05, + "loss": 2.9053, + "mean_token_accuracy": 0.3655172407627106, + "step": 141010 + }, + { + "epoch": 0.1420318056969847, + "grad_norm": 11.388061491974396, + "learning_rate": 4.896600593181133e-05, + "loss": 2.932, + "mean_token_accuracy": 0.32758620083332063, + "step": 141015 + }, + { + "epoch": 0.14203684175008888, + "grad_norm": 10.037468793346019, + "learning_rate": 4.896589364052681e-05, + "loss": 2.2387, + "mean_token_accuracy": 0.4172413766384125, + "step": 141020 + }, + { + "epoch": 0.14204187780319305, + "grad_norm": 10.974305205920933, + "learning_rate": 4.8965781343288636e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.4862068951129913, + "step": 141025 + }, + { + "epoch": 0.14204691385629722, + "grad_norm": 10.59801725174759, + "learning_rate": 4.896566904009686e-05, + "loss": 2.1937, + "mean_token_accuracy": 0.4911070704460144, + "step": 141030 + }, + { + "epoch": 0.1420519499094014, + "grad_norm": 10.657638654456118, + "learning_rate": 4.896555673095149e-05, + "loss": 2.1561, + "mean_token_accuracy": 0.4620689690113068, + "step": 141035 + }, + { + "epoch": 0.14205698596250557, + "grad_norm": 11.44847654385035, + "learning_rate": 4.896544441585257e-05, + "loss": 2.171, + "mean_token_accuracy": 0.4862069010734558, + "step": 141040 + }, + { + "epoch": 0.14206202201560975, + "grad_norm": 13.137746523735641, + "learning_rate": 4.8965332094800135e-05, + "loss": 2.2783, + "mean_token_accuracy": 0.4705665111541748, + "step": 141045 + }, + { + "epoch": 0.14206705806871392, + "grad_norm": 16.95535342428976, + "learning_rate": 4.896521976779421e-05, + "loss": 2.398, + "mean_token_accuracy": 0.39310344457626345, + "step": 141050 + }, + { + "epoch": 0.1420720941218181, + "grad_norm": 10.212740368942896, + "learning_rate": 4.8965107434834815e-05, + "loss": 2.1489, + "mean_token_accuracy": 0.47586206197738645, + "step": 141055 + }, + { + "epoch": 0.14207713017492227, + "grad_norm": 10.045407358934472, + "learning_rate": 4.8964995095922e-05, + "loss": 2.0879, + "mean_token_accuracy": 0.4517241358757019, + "step": 141060 + }, + { + "epoch": 0.14208216622802644, + "grad_norm": 10.77785501540689, + "learning_rate": 4.896488275105579e-05, + "loss": 2.2385, + "mean_token_accuracy": 0.458620685338974, + "step": 141065 + }, + { + "epoch": 0.14208720228113061, + "grad_norm": 8.825761376826689, + "learning_rate": 4.8964770400236214e-05, + "loss": 2.2815, + "mean_token_accuracy": 0.43793103098869324, + "step": 141070 + }, + { + "epoch": 0.1420922383342348, + "grad_norm": 10.668019924457385, + "learning_rate": 4.896465804346331e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.42068966031074523, + "step": 141075 + }, + { + "epoch": 0.14209727438733896, + "grad_norm": 9.77261111577999, + "learning_rate": 4.8964545680737095e-05, + "loss": 2.1173, + "mean_token_accuracy": 0.4931034564971924, + "step": 141080 + }, + { + "epoch": 0.14210231044044314, + "grad_norm": 10.185101362941314, + "learning_rate": 4.896443331205761e-05, + "loss": 2.9549, + "mean_token_accuracy": 0.36896551847457887, + "step": 141085 + }, + { + "epoch": 0.1421073464935473, + "grad_norm": 13.254612313178525, + "learning_rate": 4.896432093742488e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.45862069725990295, + "step": 141090 + }, + { + "epoch": 0.14211238254665148, + "grad_norm": 13.987393775269584, + "learning_rate": 4.896420855683895e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.4034482777118683, + "step": 141095 + }, + { + "epoch": 0.14211741859975566, + "grad_norm": 10.93681104332027, + "learning_rate": 4.896409617029983e-05, + "loss": 2.7626, + "mean_token_accuracy": 0.4068965494632721, + "step": 141100 + }, + { + "epoch": 0.14212245465285983, + "grad_norm": 10.272301624324763, + "learning_rate": 4.896398377780757e-05, + "loss": 2.9075, + "mean_token_accuracy": 0.37586206793785093, + "step": 141105 + }, + { + "epoch": 0.142127490705964, + "grad_norm": 13.508000210201565, + "learning_rate": 4.896387137936219e-05, + "loss": 2.9287, + "mean_token_accuracy": 0.358620685338974, + "step": 141110 + }, + { + "epoch": 0.14213252675906818, + "grad_norm": 10.26811277042134, + "learning_rate": 4.8963758974963734e-05, + "loss": 2.2963, + "mean_token_accuracy": 0.3965517312288284, + "step": 141115 + }, + { + "epoch": 0.14213756281217235, + "grad_norm": 9.513383640491488, + "learning_rate": 4.896364656461221e-05, + "loss": 2.2715, + "mean_token_accuracy": 0.420689657330513, + "step": 141120 + }, + { + "epoch": 0.14214259886527653, + "grad_norm": 14.05237692527733, + "learning_rate": 4.8963534148307676e-05, + "loss": 2.882, + "mean_token_accuracy": 0.42413792610168455, + "step": 141125 + }, + { + "epoch": 0.1421476349183807, + "grad_norm": 11.168760733997585, + "learning_rate": 4.8963421726050144e-05, + "loss": 2.3859, + "mean_token_accuracy": 0.44482758045196535, + "step": 141130 + }, + { + "epoch": 0.14215267097148487, + "grad_norm": 10.366485561412672, + "learning_rate": 4.896330929783966e-05, + "loss": 2.4547, + "mean_token_accuracy": 0.4413793087005615, + "step": 141135 + }, + { + "epoch": 0.14215770702458902, + "grad_norm": 10.309330661182006, + "learning_rate": 4.8963196863676245e-05, + "loss": 2.2219, + "mean_token_accuracy": 0.44827585816383364, + "step": 141140 + }, + { + "epoch": 0.1421627430776932, + "grad_norm": 10.613685918498128, + "learning_rate": 4.8963084423559925e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.4448275864124298, + "step": 141145 + }, + { + "epoch": 0.14216777913079737, + "grad_norm": 11.271630130401839, + "learning_rate": 4.896297197749074e-05, + "loss": 2.3566, + "mean_token_accuracy": 0.4586206912994385, + "step": 141150 + }, + { + "epoch": 0.14217281518390154, + "grad_norm": 9.019749163164834, + "learning_rate": 4.8962859525468727e-05, + "loss": 2.4052, + "mean_token_accuracy": 0.4413793087005615, + "step": 141155 + }, + { + "epoch": 0.14217785123700571, + "grad_norm": 8.435157768006238, + "learning_rate": 4.89627470674939e-05, + "loss": 2.4991, + "mean_token_accuracy": 0.4586206912994385, + "step": 141160 + }, + { + "epoch": 0.1421828872901099, + "grad_norm": 10.295988183147786, + "learning_rate": 4.896263460356631e-05, + "loss": 2.3865, + "mean_token_accuracy": 0.44827585816383364, + "step": 141165 + }, + { + "epoch": 0.14218792334321406, + "grad_norm": 10.145669246967278, + "learning_rate": 4.8962522133685974e-05, + "loss": 2.4595, + "mean_token_accuracy": 0.4261947929859161, + "step": 141170 + }, + { + "epoch": 0.14219295939631824, + "grad_norm": 9.663443038021436, + "learning_rate": 4.8962409657852925e-05, + "loss": 2.2954, + "mean_token_accuracy": 0.4413793087005615, + "step": 141175 + }, + { + "epoch": 0.1421979954494224, + "grad_norm": 11.220140350573832, + "learning_rate": 4.89622971760672e-05, + "loss": 2.0092, + "mean_token_accuracy": 0.4620689630508423, + "step": 141180 + }, + { + "epoch": 0.14220303150252658, + "grad_norm": 11.483491156430729, + "learning_rate": 4.896218468832882e-05, + "loss": 2.5964, + "mean_token_accuracy": 0.39818512201309203, + "step": 141185 + }, + { + "epoch": 0.14220806755563076, + "grad_norm": 8.145439930215328, + "learning_rate": 4.896207219463783e-05, + "loss": 2.1218, + "mean_token_accuracy": 0.4344827592372894, + "step": 141190 + }, + { + "epoch": 0.14221310360873493, + "grad_norm": 9.869774171982263, + "learning_rate": 4.896195969499425e-05, + "loss": 2.3997, + "mean_token_accuracy": 0.37931033968925476, + "step": 141195 + }, + { + "epoch": 0.1422181396618391, + "grad_norm": 14.60416652870884, + "learning_rate": 4.8961847189398123e-05, + "loss": 2.2127, + "mean_token_accuracy": 0.4689655125141144, + "step": 141200 + }, + { + "epoch": 0.14222317571494328, + "grad_norm": 10.039637809862839, + "learning_rate": 4.896173467784947e-05, + "loss": 2.2846, + "mean_token_accuracy": 0.417241370677948, + "step": 141205 + }, + { + "epoch": 0.14222821176804745, + "grad_norm": 11.299360646116535, + "learning_rate": 4.896162216034832e-05, + "loss": 2.25, + "mean_token_accuracy": 0.4517241358757019, + "step": 141210 + }, + { + "epoch": 0.14223324782115163, + "grad_norm": 9.687789816232236, + "learning_rate": 4.8961509636894705e-05, + "loss": 2.2678, + "mean_token_accuracy": 0.4103448331356049, + "step": 141215 + }, + { + "epoch": 0.1422382838742558, + "grad_norm": 9.846454307770834, + "learning_rate": 4.896139710748867e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.4310344815254211, + "step": 141220 + }, + { + "epoch": 0.14224331992735997, + "grad_norm": 10.14602689265939, + "learning_rate": 4.8961284572130225e-05, + "loss": 2.2042, + "mean_token_accuracy": 0.4655172288417816, + "step": 141225 + }, + { + "epoch": 0.14224835598046415, + "grad_norm": 9.220046500442663, + "learning_rate": 4.896117203081943e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.40689654350280763, + "step": 141230 + }, + { + "epoch": 0.14225339203356832, + "grad_norm": 9.086221933721525, + "learning_rate": 4.896105948355629e-05, + "loss": 2.2105, + "mean_token_accuracy": 0.4896551787853241, + "step": 141235 + }, + { + "epoch": 0.1422584280866725, + "grad_norm": 10.703765692638385, + "learning_rate": 4.8960946930340847e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.4206896543502808, + "step": 141240 + }, + { + "epoch": 0.14226346413977667, + "grad_norm": 10.550652540569354, + "learning_rate": 4.8960834371173124e-05, + "loss": 2.1626, + "mean_token_accuracy": 0.47931033968925474, + "step": 141245 + }, + { + "epoch": 0.14226850019288084, + "grad_norm": 11.790994265025699, + "learning_rate": 4.896072180605316e-05, + "loss": 2.5608, + "mean_token_accuracy": 0.4359951615333557, + "step": 141250 + }, + { + "epoch": 0.14227353624598502, + "grad_norm": 10.668019039878052, + "learning_rate": 4.8960609234980985e-05, + "loss": 2.3075, + "mean_token_accuracy": 0.4551724076271057, + "step": 141255 + }, + { + "epoch": 0.1422785722990892, + "grad_norm": 11.173237362527454, + "learning_rate": 4.8960496657956636e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.44016939401626587, + "step": 141260 + }, + { + "epoch": 0.14228360835219336, + "grad_norm": 10.792219036543454, + "learning_rate": 4.896038407498013e-05, + "loss": 2.4937, + "mean_token_accuracy": 0.39655172228813174, + "step": 141265 + }, + { + "epoch": 0.14228864440529754, + "grad_norm": 8.952040291371814, + "learning_rate": 4.8960271486051514e-05, + "loss": 2.0192, + "mean_token_accuracy": 0.4689655125141144, + "step": 141270 + }, + { + "epoch": 0.1422936804584017, + "grad_norm": 13.450481730834248, + "learning_rate": 4.896015889117081e-05, + "loss": 2.5972, + "mean_token_accuracy": 0.41379310488700866, + "step": 141275 + }, + { + "epoch": 0.14229871651150586, + "grad_norm": 9.566379926835328, + "learning_rate": 4.8960046290338044e-05, + "loss": 2.4572, + "mean_token_accuracy": 0.47586206793785096, + "step": 141280 + }, + { + "epoch": 0.14230375256461003, + "grad_norm": 13.285743698725172, + "learning_rate": 4.8959933683553266e-05, + "loss": 2.8435, + "mean_token_accuracy": 0.356745308637619, + "step": 141285 + }, + { + "epoch": 0.1423087886177142, + "grad_norm": 14.17482383744256, + "learning_rate": 4.8959821070816483e-05, + "loss": 2.2732, + "mean_token_accuracy": 0.47586206197738645, + "step": 141290 + }, + { + "epoch": 0.14231382467081838, + "grad_norm": 20.14986715247233, + "learning_rate": 4.895970845212775e-05, + "loss": 3.0606, + "mean_token_accuracy": 0.36551723778247835, + "step": 141295 + }, + { + "epoch": 0.14231886072392255, + "grad_norm": 15.76744823165706, + "learning_rate": 4.8959595827487084e-05, + "loss": 2.5924, + "mean_token_accuracy": 0.3896551787853241, + "step": 141300 + }, + { + "epoch": 0.14232389677702673, + "grad_norm": 10.286503459414803, + "learning_rate": 4.8959483196894514e-05, + "loss": 2.8485, + "mean_token_accuracy": 0.3827586233615875, + "step": 141305 + }, + { + "epoch": 0.1423289328301309, + "grad_norm": 13.26480661680959, + "learning_rate": 4.8959370560350074e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.43103448748588563, + "step": 141310 + }, + { + "epoch": 0.14233396888323507, + "grad_norm": 10.900787353088155, + "learning_rate": 4.89592579178538e-05, + "loss": 2.5457, + "mean_token_accuracy": 0.39310344457626345, + "step": 141315 + }, + { + "epoch": 0.14233900493633925, + "grad_norm": 9.448613502341418, + "learning_rate": 4.895914526940573e-05, + "loss": 2.467, + "mean_token_accuracy": 0.4689655125141144, + "step": 141320 + }, + { + "epoch": 0.14234404098944342, + "grad_norm": 9.36175886338441, + "learning_rate": 4.8959032615005874e-05, + "loss": 2.1982, + "mean_token_accuracy": 0.4586206912994385, + "step": 141325 + }, + { + "epoch": 0.1423490770425476, + "grad_norm": 10.516834633718496, + "learning_rate": 4.8958919954654285e-05, + "loss": 2.071, + "mean_token_accuracy": 0.4689655125141144, + "step": 141330 + }, + { + "epoch": 0.14235411309565177, + "grad_norm": 9.838949492684728, + "learning_rate": 4.8958807288350976e-05, + "loss": 2.6374, + "mean_token_accuracy": 0.37931033968925476, + "step": 141335 + }, + { + "epoch": 0.14235914914875594, + "grad_norm": 12.854416776601228, + "learning_rate": 4.8958694616095994e-05, + "loss": 2.6133, + "mean_token_accuracy": 0.398124623298645, + "step": 141340 + }, + { + "epoch": 0.14236418520186012, + "grad_norm": 10.656888231225333, + "learning_rate": 4.895858193788936e-05, + "loss": 2.3674, + "mean_token_accuracy": 0.4758620738983154, + "step": 141345 + }, + { + "epoch": 0.1423692212549643, + "grad_norm": 9.194759405948771, + "learning_rate": 4.89584692537311e-05, + "loss": 2.4147, + "mean_token_accuracy": 0.42413793206214906, + "step": 141350 + }, + { + "epoch": 0.14237425730806846, + "grad_norm": 10.72545074738459, + "learning_rate": 4.895835656362127e-05, + "loss": 2.2215, + "mean_token_accuracy": 0.4620689630508423, + "step": 141355 + }, + { + "epoch": 0.14237929336117264, + "grad_norm": 11.605693258750737, + "learning_rate": 4.895824386755987e-05, + "loss": 2.4759, + "mean_token_accuracy": 0.4172413766384125, + "step": 141360 + }, + { + "epoch": 0.1423843294142768, + "grad_norm": 9.310789496504128, + "learning_rate": 4.895813116554696e-05, + "loss": 2.3005, + "mean_token_accuracy": 0.44827585816383364, + "step": 141365 + }, + { + "epoch": 0.14238936546738098, + "grad_norm": 11.613144257311893, + "learning_rate": 4.895801845758254e-05, + "loss": 2.4648, + "mean_token_accuracy": 0.3896551787853241, + "step": 141370 + }, + { + "epoch": 0.14239440152048516, + "grad_norm": 9.475547683301434, + "learning_rate": 4.8957905743666665e-05, + "loss": 2.452, + "mean_token_accuracy": 0.441379314661026, + "step": 141375 + }, + { + "epoch": 0.14239943757358933, + "grad_norm": 9.899667098061835, + "learning_rate": 4.8957793023799364e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.40859044790267945, + "step": 141380 + }, + { + "epoch": 0.1424044736266935, + "grad_norm": 11.097587615629124, + "learning_rate": 4.895768029798066e-05, + "loss": 2.5797, + "mean_token_accuracy": 0.38620689511299133, + "step": 141385 + }, + { + "epoch": 0.14240950967979768, + "grad_norm": 12.721831261273225, + "learning_rate": 4.8957567566210595e-05, + "loss": 2.6485, + "mean_token_accuracy": 0.41379310488700866, + "step": 141390 + }, + { + "epoch": 0.14241454573290185, + "grad_norm": 10.538936985079907, + "learning_rate": 4.895745482848919e-05, + "loss": 2.3958, + "mean_token_accuracy": 0.4241379380226135, + "step": 141395 + }, + { + "epoch": 0.14241958178600603, + "grad_norm": 11.422928366408936, + "learning_rate": 4.895734208481648e-05, + "loss": 2.6473, + "mean_token_accuracy": 0.43103448748588563, + "step": 141400 + }, + { + "epoch": 0.1424246178391102, + "grad_norm": 10.579202051044097, + "learning_rate": 4.895722933519249e-05, + "loss": 2.6898, + "mean_token_accuracy": 0.4172413766384125, + "step": 141405 + }, + { + "epoch": 0.14242965389221438, + "grad_norm": 9.195161677949168, + "learning_rate": 4.895711657961726e-05, + "loss": 2.8399, + "mean_token_accuracy": 0.41724138855934145, + "step": 141410 + }, + { + "epoch": 0.14243468994531855, + "grad_norm": 11.884989064117356, + "learning_rate": 4.895700381809083e-05, + "loss": 2.3041, + "mean_token_accuracy": 0.43448275327682495, + "step": 141415 + }, + { + "epoch": 0.1424397259984227, + "grad_norm": 12.612884798410827, + "learning_rate": 4.8956891050613215e-05, + "loss": 2.1731, + "mean_token_accuracy": 0.4724137902259827, + "step": 141420 + }, + { + "epoch": 0.14244476205152687, + "grad_norm": 10.313498016522264, + "learning_rate": 4.895677827718444e-05, + "loss": 2.5638, + "mean_token_accuracy": 0.3931034505367279, + "step": 141425 + }, + { + "epoch": 0.14244979810463104, + "grad_norm": 10.475617810084003, + "learning_rate": 4.895666549780455e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.42068965137004855, + "step": 141430 + }, + { + "epoch": 0.14245483415773522, + "grad_norm": 11.950850135254194, + "learning_rate": 4.8956552712473585e-05, + "loss": 2.5198, + "mean_token_accuracy": 0.4172413766384125, + "step": 141435 + }, + { + "epoch": 0.1424598702108394, + "grad_norm": 14.038478340352409, + "learning_rate": 4.8956439921191554e-05, + "loss": 2.3094, + "mean_token_accuracy": 0.46551724076271056, + "step": 141440 + }, + { + "epoch": 0.14246490626394356, + "grad_norm": 10.737440516268325, + "learning_rate": 4.8956327123958506e-05, + "loss": 2.2301, + "mean_token_accuracy": 0.4517241418361664, + "step": 141445 + }, + { + "epoch": 0.14246994231704774, + "grad_norm": 10.05330475387196, + "learning_rate": 4.895621432077446e-05, + "loss": 2.8713, + "mean_token_accuracy": 0.3551724076271057, + "step": 141450 + }, + { + "epoch": 0.1424749783701519, + "grad_norm": 10.875036855316745, + "learning_rate": 4.8956101511639464e-05, + "loss": 2.5182, + "mean_token_accuracy": 0.43793103098869324, + "step": 141455 + }, + { + "epoch": 0.14248001442325609, + "grad_norm": 9.225976812589868, + "learning_rate": 4.895598869655353e-05, + "loss": 2.6044, + "mean_token_accuracy": 0.39655172228813174, + "step": 141460 + }, + { + "epoch": 0.14248505047636026, + "grad_norm": 12.412333450213206, + "learning_rate": 4.8955875875516695e-05, + "loss": 2.5763, + "mean_token_accuracy": 0.4, + "step": 141465 + }, + { + "epoch": 0.14249008652946443, + "grad_norm": 11.264765658813884, + "learning_rate": 4.8955763048529e-05, + "loss": 2.3981, + "mean_token_accuracy": 0.40344828367233276, + "step": 141470 + }, + { + "epoch": 0.1424951225825686, + "grad_norm": 11.90655647059862, + "learning_rate": 4.8955650215590464e-05, + "loss": 2.6728, + "mean_token_accuracy": 0.4137930989265442, + "step": 141475 + }, + { + "epoch": 0.14250015863567278, + "grad_norm": 15.634529799107476, + "learning_rate": 4.895553737670113e-05, + "loss": 2.7791, + "mean_token_accuracy": 0.39655172228813174, + "step": 141480 + }, + { + "epoch": 0.14250519468877695, + "grad_norm": 19.621687256677887, + "learning_rate": 4.8955424531861015e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.4978826344013214, + "step": 141485 + }, + { + "epoch": 0.14251023074188113, + "grad_norm": 9.88725786825097, + "learning_rate": 4.895531168107016e-05, + "loss": 2.5174, + "mean_token_accuracy": 0.41034482717514037, + "step": 141490 + }, + { + "epoch": 0.1425152667949853, + "grad_norm": 10.004261285613284, + "learning_rate": 4.895519882432859e-05, + "loss": 1.9665, + "mean_token_accuracy": 0.47586206197738645, + "step": 141495 + }, + { + "epoch": 0.14252030284808948, + "grad_norm": 10.025632276024881, + "learning_rate": 4.895508596163635e-05, + "loss": 2.6842, + "mean_token_accuracy": 0.3448275804519653, + "step": 141500 + }, + { + "epoch": 0.14252533890119365, + "grad_norm": 11.662455053865623, + "learning_rate": 4.895497309299346e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.3827586233615875, + "step": 141505 + }, + { + "epoch": 0.14253037495429782, + "grad_norm": 9.685060655803396, + "learning_rate": 4.895486021839994e-05, + "loss": 1.8858, + "mean_token_accuracy": 0.5424682378768921, + "step": 141510 + }, + { + "epoch": 0.142535411007402, + "grad_norm": 15.323721565962153, + "learning_rate": 4.895474733785585e-05, + "loss": 2.1344, + "mean_token_accuracy": 0.48475498557090757, + "step": 141515 + }, + { + "epoch": 0.14254044706050617, + "grad_norm": 9.633800734797438, + "learning_rate": 4.8954634451361205e-05, + "loss": 2.251, + "mean_token_accuracy": 0.46896552443504336, + "step": 141520 + }, + { + "epoch": 0.14254548311361034, + "grad_norm": 11.132390898904177, + "learning_rate": 4.895452155891603e-05, + "loss": 2.6389, + "mean_token_accuracy": 0.39655171930789945, + "step": 141525 + }, + { + "epoch": 0.14255051916671452, + "grad_norm": 11.897015722219962, + "learning_rate": 4.895440866052036e-05, + "loss": 2.2948, + "mean_token_accuracy": 0.47931033968925474, + "step": 141530 + }, + { + "epoch": 0.1425555552198187, + "grad_norm": 15.912652194894457, + "learning_rate": 4.8954295756174237e-05, + "loss": 2.5154, + "mean_token_accuracy": 0.4448275864124298, + "step": 141535 + }, + { + "epoch": 0.14256059127292287, + "grad_norm": 9.229246553208593, + "learning_rate": 4.8954182845877686e-05, + "loss": 2.1304, + "mean_token_accuracy": 0.4344827592372894, + "step": 141540 + }, + { + "epoch": 0.14256562732602704, + "grad_norm": 12.401056819916683, + "learning_rate": 4.8954069929630736e-05, + "loss": 2.2053, + "mean_token_accuracy": 0.5241379380226135, + "step": 141545 + }, + { + "epoch": 0.1425706633791312, + "grad_norm": 11.106357915030257, + "learning_rate": 4.8953957007433414e-05, + "loss": 2.6212, + "mean_token_accuracy": 0.38620689511299133, + "step": 141550 + }, + { + "epoch": 0.1425756994322354, + "grad_norm": 14.16393875869589, + "learning_rate": 4.8953844079285766e-05, + "loss": 2.8523, + "mean_token_accuracy": 0.3862069010734558, + "step": 141555 + }, + { + "epoch": 0.14258073548533953, + "grad_norm": 9.021376015537998, + "learning_rate": 4.895373114518781e-05, + "loss": 2.232, + "mean_token_accuracy": 0.42758620381355283, + "step": 141560 + }, + { + "epoch": 0.1425857715384437, + "grad_norm": 10.94063795694331, + "learning_rate": 4.895361820513958e-05, + "loss": 2.0968, + "mean_token_accuracy": 0.458620685338974, + "step": 141565 + }, + { + "epoch": 0.14259080759154788, + "grad_norm": 9.38740314338868, + "learning_rate": 4.8953505259141116e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.46444128155708314, + "step": 141570 + }, + { + "epoch": 0.14259584364465205, + "grad_norm": 10.899526411405446, + "learning_rate": 4.895339230719243e-05, + "loss": 2.1431, + "mean_token_accuracy": 0.44845736026763916, + "step": 141575 + }, + { + "epoch": 0.14260087969775623, + "grad_norm": 11.584637535459546, + "learning_rate": 4.895327934929357e-05, + "loss": 2.127, + "mean_token_accuracy": 0.4862069010734558, + "step": 141580 + }, + { + "epoch": 0.1426059157508604, + "grad_norm": 12.549067034394517, + "learning_rate": 4.895316638544457e-05, + "loss": 2.477, + "mean_token_accuracy": 0.4034482777118683, + "step": 141585 + }, + { + "epoch": 0.14261095180396458, + "grad_norm": 10.435713558871418, + "learning_rate": 4.895305341564545e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4379310429096222, + "step": 141590 + }, + { + "epoch": 0.14261598785706875, + "grad_norm": 9.502680736548118, + "learning_rate": 4.895294043989625e-05, + "loss": 1.8999, + "mean_token_accuracy": 0.5206896543502808, + "step": 141595 + }, + { + "epoch": 0.14262102391017292, + "grad_norm": 11.779035599684613, + "learning_rate": 4.895282745819699e-05, + "loss": 2.4356, + "mean_token_accuracy": 0.43448275327682495, + "step": 141600 + }, + { + "epoch": 0.1426260599632771, + "grad_norm": 8.814478921204374, + "learning_rate": 4.895271447054771e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.4517241358757019, + "step": 141605 + }, + { + "epoch": 0.14263109601638127, + "grad_norm": 10.207233498975144, + "learning_rate": 4.895260147694844e-05, + "loss": 2.1742, + "mean_token_accuracy": 0.42758620381355283, + "step": 141610 + }, + { + "epoch": 0.14263613206948544, + "grad_norm": 10.744285534654255, + "learning_rate": 4.895248847739921e-05, + "loss": 2.6359, + "mean_token_accuracy": 0.3724137932062149, + "step": 141615 + }, + { + "epoch": 0.14264116812258962, + "grad_norm": 8.952708532882186, + "learning_rate": 4.895237547190005e-05, + "loss": 2.2507, + "mean_token_accuracy": 0.46031457781791685, + "step": 141620 + }, + { + "epoch": 0.1426462041756938, + "grad_norm": 10.828567856076612, + "learning_rate": 4.8952262460451005e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.3862069010734558, + "step": 141625 + }, + { + "epoch": 0.14265124022879797, + "grad_norm": 8.939414661333021, + "learning_rate": 4.8952149443052085e-05, + "loss": 2.2078, + "mean_token_accuracy": 0.4517241358757019, + "step": 141630 + }, + { + "epoch": 0.14265627628190214, + "grad_norm": 12.20278928812962, + "learning_rate": 4.8952036419703335e-05, + "loss": 2.2184, + "mean_token_accuracy": 0.43103447556495667, + "step": 141635 + }, + { + "epoch": 0.1426613123350063, + "grad_norm": 10.665320726712025, + "learning_rate": 4.8951923390404774e-05, + "loss": 2.3348, + "mean_token_accuracy": 0.42068966031074523, + "step": 141640 + }, + { + "epoch": 0.1426663483881105, + "grad_norm": 8.682706110536937, + "learning_rate": 4.895181035515645e-05, + "loss": 2.5533, + "mean_token_accuracy": 0.43103448748588563, + "step": 141645 + }, + { + "epoch": 0.14267138444121466, + "grad_norm": 10.984521591415628, + "learning_rate": 4.895169731395839e-05, + "loss": 2.8462, + "mean_token_accuracy": 0.4056866317987442, + "step": 141650 + }, + { + "epoch": 0.14267642049431883, + "grad_norm": 14.628061707971717, + "learning_rate": 4.895158426681062e-05, + "loss": 2.7121, + "mean_token_accuracy": 0.417241370677948, + "step": 141655 + }, + { + "epoch": 0.142681456547423, + "grad_norm": 8.964339430464383, + "learning_rate": 4.895147121371316e-05, + "loss": 2.3723, + "mean_token_accuracy": 0.3999999940395355, + "step": 141660 + }, + { + "epoch": 0.14268649260052718, + "grad_norm": 11.136021743360747, + "learning_rate": 4.8951358154666074e-05, + "loss": 2.1417, + "mean_token_accuracy": 0.46551724076271056, + "step": 141665 + }, + { + "epoch": 0.14269152865363136, + "grad_norm": 8.182646104682481, + "learning_rate": 4.895124508966936e-05, + "loss": 2.1598, + "mean_token_accuracy": 0.47586206793785096, + "step": 141670 + }, + { + "epoch": 0.14269656470673553, + "grad_norm": 12.637494700926819, + "learning_rate": 4.895113201872307e-05, + "loss": 2.1847, + "mean_token_accuracy": 0.5068965494632721, + "step": 141675 + }, + { + "epoch": 0.1427016007598397, + "grad_norm": 8.989064363423491, + "learning_rate": 4.895101894182722e-05, + "loss": 2.4409, + "mean_token_accuracy": 0.43448275327682495, + "step": 141680 + }, + { + "epoch": 0.14270663681294388, + "grad_norm": 10.214071476566891, + "learning_rate": 4.895090585898186e-05, + "loss": 2.2494, + "mean_token_accuracy": 0.482758617401123, + "step": 141685 + }, + { + "epoch": 0.14271167286604805, + "grad_norm": 11.091593225008213, + "learning_rate": 4.8950792770187005e-05, + "loss": 2.7683, + "mean_token_accuracy": 0.39655172228813174, + "step": 141690 + }, + { + "epoch": 0.14271670891915222, + "grad_norm": 10.315438827310407, + "learning_rate": 4.8950679675442695e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.4758620738983154, + "step": 141695 + }, + { + "epoch": 0.14272174497225637, + "grad_norm": 9.15601974817395, + "learning_rate": 4.895056657474896e-05, + "loss": 2.5103, + "mean_token_accuracy": 0.3862069010734558, + "step": 141700 + }, + { + "epoch": 0.14272678102536054, + "grad_norm": 11.300411610102584, + "learning_rate": 4.895045346810583e-05, + "loss": 2.9462, + "mean_token_accuracy": 0.3724137991666794, + "step": 141705 + }, + { + "epoch": 0.14273181707846472, + "grad_norm": 8.528404433474439, + "learning_rate": 4.895034035551333e-05, + "loss": 2.4233, + "mean_token_accuracy": 0.4482758641242981, + "step": 141710 + }, + { + "epoch": 0.1427368531315689, + "grad_norm": 11.10031752440764, + "learning_rate": 4.8950227236971506e-05, + "loss": 2.5772, + "mean_token_accuracy": 0.43103448748588563, + "step": 141715 + }, + { + "epoch": 0.14274188918467307, + "grad_norm": 9.284322448936308, + "learning_rate": 4.895011411248038e-05, + "loss": 2.3742, + "mean_token_accuracy": 0.45517241060733793, + "step": 141720 + }, + { + "epoch": 0.14274692523777724, + "grad_norm": 8.671194285909623, + "learning_rate": 4.895000098203999e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.44137930274009707, + "step": 141725 + }, + { + "epoch": 0.1427519612908814, + "grad_norm": 9.944315520966793, + "learning_rate": 4.894988784565035e-05, + "loss": 2.1421, + "mean_token_accuracy": 0.48275862336158754, + "step": 141730 + }, + { + "epoch": 0.1427569973439856, + "grad_norm": 10.457472562897905, + "learning_rate": 4.894977470331151e-05, + "loss": 2.0252, + "mean_token_accuracy": 0.47586206793785096, + "step": 141735 + }, + { + "epoch": 0.14276203339708976, + "grad_norm": 13.507136978983162, + "learning_rate": 4.89496615550235e-05, + "loss": 2.3075, + "mean_token_accuracy": 0.4448275864124298, + "step": 141740 + }, + { + "epoch": 0.14276706945019393, + "grad_norm": 10.092187187820274, + "learning_rate": 4.894954840078634e-05, + "loss": 2.7683, + "mean_token_accuracy": 0.41379311084747317, + "step": 141745 + }, + { + "epoch": 0.1427721055032981, + "grad_norm": 9.605043920109585, + "learning_rate": 4.8949435240600066e-05, + "loss": 2.3649, + "mean_token_accuracy": 0.4620689690113068, + "step": 141750 + }, + { + "epoch": 0.14277714155640228, + "grad_norm": 13.04330234819312, + "learning_rate": 4.894932207446472e-05, + "loss": 2.504, + "mean_token_accuracy": 0.40514216423034666, + "step": 141755 + }, + { + "epoch": 0.14278217760950646, + "grad_norm": 10.000405101912865, + "learning_rate": 4.8949208902380314e-05, + "loss": 2.4243, + "mean_token_accuracy": 0.4103448331356049, + "step": 141760 + }, + { + "epoch": 0.14278721366261063, + "grad_norm": 10.681181248950077, + "learning_rate": 4.8949095724346894e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.4379310369491577, + "step": 141765 + }, + { + "epoch": 0.1427922497157148, + "grad_norm": 10.58088004630637, + "learning_rate": 4.894898254036449e-05, + "loss": 2.5083, + "mean_token_accuracy": 0.42758620381355283, + "step": 141770 + }, + { + "epoch": 0.14279728576881898, + "grad_norm": 8.403765330799994, + "learning_rate": 4.8948869350433126e-05, + "loss": 2.1921, + "mean_token_accuracy": 0.45378100872039795, + "step": 141775 + }, + { + "epoch": 0.14280232182192315, + "grad_norm": 11.229487922552703, + "learning_rate": 4.8948756154552844e-05, + "loss": 2.4494, + "mean_token_accuracy": 0.441379314661026, + "step": 141780 + }, + { + "epoch": 0.14280735787502732, + "grad_norm": 10.276600902528685, + "learning_rate": 4.894864295272367e-05, + "loss": 2.2468, + "mean_token_accuracy": 0.4517241418361664, + "step": 141785 + }, + { + "epoch": 0.1428123939281315, + "grad_norm": 9.962363912546921, + "learning_rate": 4.894852974494562e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.3758620619773865, + "step": 141790 + }, + { + "epoch": 0.14281742998123567, + "grad_norm": 8.784905427274058, + "learning_rate": 4.894841653121876e-05, + "loss": 2.5422, + "mean_token_accuracy": 0.42413793206214906, + "step": 141795 + }, + { + "epoch": 0.14282246603433985, + "grad_norm": 11.91506185260996, + "learning_rate": 4.8948303311543084e-05, + "loss": 2.1749, + "mean_token_accuracy": 0.47586207985877993, + "step": 141800 + }, + { + "epoch": 0.14282750208744402, + "grad_norm": 8.563560731086097, + "learning_rate": 4.894819008591866e-05, + "loss": 2.1818, + "mean_token_accuracy": 0.46896551847457885, + "step": 141805 + }, + { + "epoch": 0.1428325381405482, + "grad_norm": 8.086693389918691, + "learning_rate": 4.894807685434548e-05, + "loss": 2.4703, + "mean_token_accuracy": 0.4437386512756348, + "step": 141810 + }, + { + "epoch": 0.14283757419365237, + "grad_norm": 9.376110628291343, + "learning_rate": 4.8947963616823614e-05, + "loss": 2.1402, + "mean_token_accuracy": 0.4620689690113068, + "step": 141815 + }, + { + "epoch": 0.14284261024675654, + "grad_norm": 10.167587644893137, + "learning_rate": 4.894785037335306e-05, + "loss": 2.4622, + "mean_token_accuracy": 0.4655172288417816, + "step": 141820 + }, + { + "epoch": 0.14284764629986071, + "grad_norm": 12.146371620836993, + "learning_rate": 4.8947737123933877e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.4551724135875702, + "step": 141825 + }, + { + "epoch": 0.1428526823529649, + "grad_norm": 10.11365606599262, + "learning_rate": 4.894762386856608e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.4344827592372894, + "step": 141830 + }, + { + "epoch": 0.14285771840606906, + "grad_norm": 9.250691631376075, + "learning_rate": 4.89475106072497e-05, + "loss": 1.9619, + "mean_token_accuracy": 0.5206896543502808, + "step": 141835 + }, + { + "epoch": 0.1428627544591732, + "grad_norm": 9.84959232650346, + "learning_rate": 4.8947397339984785e-05, + "loss": 2.1061, + "mean_token_accuracy": 0.4655172348022461, + "step": 141840 + }, + { + "epoch": 0.14286779051227738, + "grad_norm": 10.034749605122135, + "learning_rate": 4.8947284066771344e-05, + "loss": 2.4274, + "mean_token_accuracy": 0.4344827592372894, + "step": 141845 + }, + { + "epoch": 0.14287282656538156, + "grad_norm": 10.79047950197894, + "learning_rate": 4.894717078760942e-05, + "loss": 2.4638, + "mean_token_accuracy": 0.3965517282485962, + "step": 141850 + }, + { + "epoch": 0.14287786261848573, + "grad_norm": 9.555681721030963, + "learning_rate": 4.8947057502499037e-05, + "loss": 2.6829, + "mean_token_accuracy": 0.4221415638923645, + "step": 141855 + }, + { + "epoch": 0.1428828986715899, + "grad_norm": 10.026774264029338, + "learning_rate": 4.894694421144025e-05, + "loss": 2.1912, + "mean_token_accuracy": 0.42413792610168455, + "step": 141860 + }, + { + "epoch": 0.14288793472469408, + "grad_norm": 11.657065929371347, + "learning_rate": 4.894683091443306e-05, + "loss": 2.4754, + "mean_token_accuracy": 0.42413793206214906, + "step": 141865 + }, + { + "epoch": 0.14289297077779825, + "grad_norm": 14.326566556699827, + "learning_rate": 4.894671761147751e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.4517241418361664, + "step": 141870 + }, + { + "epoch": 0.14289800683090242, + "grad_norm": 10.12768213909908, + "learning_rate": 4.894660430257364e-05, + "loss": 2.4187, + "mean_token_accuracy": 0.4344827592372894, + "step": 141875 + }, + { + "epoch": 0.1429030428840066, + "grad_norm": 18.994702174819135, + "learning_rate": 4.894649098772147e-05, + "loss": 2.5575, + "mean_token_accuracy": 0.42413792610168455, + "step": 141880 + }, + { + "epoch": 0.14290807893711077, + "grad_norm": 9.9006838276388, + "learning_rate": 4.894637766692104e-05, + "loss": 2.1047, + "mean_token_accuracy": 0.4781609117984772, + "step": 141885 + }, + { + "epoch": 0.14291311499021495, + "grad_norm": 9.174694897367916, + "learning_rate": 4.894626434017237e-05, + "loss": 2.1653, + "mean_token_accuracy": 0.46013309359550475, + "step": 141890 + }, + { + "epoch": 0.14291815104331912, + "grad_norm": 9.485597095521344, + "learning_rate": 4.894615100747551e-05, + "loss": 2.0575, + "mean_token_accuracy": 0.5034482777118683, + "step": 141895 + }, + { + "epoch": 0.1429231870964233, + "grad_norm": 13.328185000393646, + "learning_rate": 4.8946037668830466e-05, + "loss": 2.2879, + "mean_token_accuracy": 0.46551724672317507, + "step": 141900 + }, + { + "epoch": 0.14292822314952747, + "grad_norm": 11.38363801832749, + "learning_rate": 4.8945924324237294e-05, + "loss": 2.1266, + "mean_token_accuracy": 0.4744101583957672, + "step": 141905 + }, + { + "epoch": 0.14293325920263164, + "grad_norm": 11.097844435500429, + "learning_rate": 4.894581097369601e-05, + "loss": 2.2888, + "mean_token_accuracy": 0.4586206912994385, + "step": 141910 + }, + { + "epoch": 0.14293829525573581, + "grad_norm": 10.024183880059436, + "learning_rate": 4.894569761720665e-05, + "loss": 2.3068, + "mean_token_accuracy": 0.44137930274009707, + "step": 141915 + }, + { + "epoch": 0.14294333130884, + "grad_norm": 13.216345590116223, + "learning_rate": 4.8945584254769246e-05, + "loss": 2.7632, + "mean_token_accuracy": 0.4068965554237366, + "step": 141920 + }, + { + "epoch": 0.14294836736194416, + "grad_norm": 9.509369766761399, + "learning_rate": 4.8945470886383834e-05, + "loss": 2.2577, + "mean_token_accuracy": 0.46551724672317507, + "step": 141925 + }, + { + "epoch": 0.14295340341504834, + "grad_norm": 10.317496259801056, + "learning_rate": 4.894535751205043e-05, + "loss": 2.2601, + "mean_token_accuracy": 0.4206896543502808, + "step": 141930 + }, + { + "epoch": 0.1429584394681525, + "grad_norm": 15.33368799426953, + "learning_rate": 4.894524413176909e-05, + "loss": 2.702, + "mean_token_accuracy": 0.3655172407627106, + "step": 141935 + }, + { + "epoch": 0.14296347552125668, + "grad_norm": 9.533239394331332, + "learning_rate": 4.894513074553982e-05, + "loss": 1.9119, + "mean_token_accuracy": 0.5019963681697845, + "step": 141940 + }, + { + "epoch": 0.14296851157436086, + "grad_norm": 12.1401776094745, + "learning_rate": 4.8945017353362663e-05, + "loss": 2.868, + "mean_token_accuracy": 0.3655172407627106, + "step": 141945 + }, + { + "epoch": 0.14297354762746503, + "grad_norm": 11.291174179479324, + "learning_rate": 4.894490395523767e-05, + "loss": 2.2961, + "mean_token_accuracy": 0.4724137902259827, + "step": 141950 + }, + { + "epoch": 0.1429785836805692, + "grad_norm": 8.72942414483867, + "learning_rate": 4.894479055116483e-05, + "loss": 2.6986, + "mean_token_accuracy": 0.42413792610168455, + "step": 141955 + }, + { + "epoch": 0.14298361973367338, + "grad_norm": 10.589229903088244, + "learning_rate": 4.8944677141144206e-05, + "loss": 2.39, + "mean_token_accuracy": 0.3913490533828735, + "step": 141960 + }, + { + "epoch": 0.14298865578677755, + "grad_norm": 8.648879054593172, + "learning_rate": 4.894456372517582e-05, + "loss": 2.2915, + "mean_token_accuracy": 0.45628078281879425, + "step": 141965 + }, + { + "epoch": 0.14299369183988173, + "grad_norm": 9.6028357468956, + "learning_rate": 4.89444503032597e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.42068964838981626, + "step": 141970 + }, + { + "epoch": 0.1429987278929859, + "grad_norm": 16.6827246438967, + "learning_rate": 4.894433687539588e-05, + "loss": 2.4573, + "mean_token_accuracy": 0.4344827592372894, + "step": 141975 + }, + { + "epoch": 0.14300376394609005, + "grad_norm": 11.24473563606688, + "learning_rate": 4.8944223441584406e-05, + "loss": 2.2142, + "mean_token_accuracy": 0.43793103098869324, + "step": 141980 + }, + { + "epoch": 0.14300879999919422, + "grad_norm": 10.181697928473278, + "learning_rate": 4.894411000182529e-05, + "loss": 2.5375, + "mean_token_accuracy": 0.3965517282485962, + "step": 141985 + }, + { + "epoch": 0.1430138360522984, + "grad_norm": 10.622712703749242, + "learning_rate": 4.8943996556118574e-05, + "loss": 2.3641, + "mean_token_accuracy": 0.4344827592372894, + "step": 141990 + }, + { + "epoch": 0.14301887210540257, + "grad_norm": 10.66806468324331, + "learning_rate": 4.894388310446428e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.4620689690113068, + "step": 141995 + }, + { + "epoch": 0.14302390815850674, + "grad_norm": 11.58854942185488, + "learning_rate": 4.894376964686244e-05, + "loss": 2.6412, + "mean_token_accuracy": 0.4172413766384125, + "step": 142000 + }, + { + "epoch": 0.14302894421161091, + "grad_norm": 11.034475025489074, + "learning_rate": 4.89436561833131e-05, + "loss": 2.1873, + "mean_token_accuracy": 0.4655172288417816, + "step": 142005 + }, + { + "epoch": 0.1430339802647151, + "grad_norm": 10.761635140303058, + "learning_rate": 4.894354271381628e-05, + "loss": 2.3423, + "mean_token_accuracy": 0.37586207389831544, + "step": 142010 + }, + { + "epoch": 0.14303901631781926, + "grad_norm": 10.175484263393212, + "learning_rate": 4.8943429238372014e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.41034482717514037, + "step": 142015 + }, + { + "epoch": 0.14304405237092344, + "grad_norm": 12.468903387806174, + "learning_rate": 4.894331575698033e-05, + "loss": 2.5247, + "mean_token_accuracy": 0.4262552857398987, + "step": 142020 + }, + { + "epoch": 0.1430490884240276, + "grad_norm": 9.768972183838297, + "learning_rate": 4.8943202269641265e-05, + "loss": 2.624, + "mean_token_accuracy": 0.4206896543502808, + "step": 142025 + }, + { + "epoch": 0.14305412447713178, + "grad_norm": 8.723564683420744, + "learning_rate": 4.894308877635485e-05, + "loss": 2.2417, + "mean_token_accuracy": 0.4586206912994385, + "step": 142030 + }, + { + "epoch": 0.14305916053023596, + "grad_norm": 12.44844643459062, + "learning_rate": 4.894297527712111e-05, + "loss": 2.5936, + "mean_token_accuracy": 0.42758620381355283, + "step": 142035 + }, + { + "epoch": 0.14306419658334013, + "grad_norm": 12.31040023863623, + "learning_rate": 4.894286177194008e-05, + "loss": 2.7149, + "mean_token_accuracy": 0.39086509346961973, + "step": 142040 + }, + { + "epoch": 0.1430692326364443, + "grad_norm": 10.27872409693468, + "learning_rate": 4.894274826081179e-05, + "loss": 2.1609, + "mean_token_accuracy": 0.5, + "step": 142045 + }, + { + "epoch": 0.14307426868954848, + "grad_norm": 11.683319826760895, + "learning_rate": 4.894263474373628e-05, + "loss": 2.5845, + "mean_token_accuracy": 0.4103448331356049, + "step": 142050 + }, + { + "epoch": 0.14307930474265265, + "grad_norm": 11.309545943273221, + "learning_rate": 4.8942521220713576e-05, + "loss": 2.3464, + "mean_token_accuracy": 0.482758617401123, + "step": 142055 + }, + { + "epoch": 0.14308434079575683, + "grad_norm": 10.663402743574363, + "learning_rate": 4.89424076917437e-05, + "loss": 2.3397, + "mean_token_accuracy": 0.3931034505367279, + "step": 142060 + }, + { + "epoch": 0.143089376848861, + "grad_norm": 14.732504452074618, + "learning_rate": 4.8942294156826704e-05, + "loss": 2.8556, + "mean_token_accuracy": 0.3482758551836014, + "step": 142065 + }, + { + "epoch": 0.14309441290196517, + "grad_norm": 9.293152140553792, + "learning_rate": 4.8942180615962595e-05, + "loss": 2.3359, + "mean_token_accuracy": 0.4103448331356049, + "step": 142070 + }, + { + "epoch": 0.14309944895506935, + "grad_norm": 10.314799527898382, + "learning_rate": 4.894206706915143e-05, + "loss": 2.2959, + "mean_token_accuracy": 0.39655172228813174, + "step": 142075 + }, + { + "epoch": 0.14310448500817352, + "grad_norm": 12.020262661884685, + "learning_rate": 4.894195351639322e-05, + "loss": 2.9017, + "mean_token_accuracy": 0.32413792610168457, + "step": 142080 + }, + { + "epoch": 0.1431095210612777, + "grad_norm": 9.32319227505478, + "learning_rate": 4.8941839957688e-05, + "loss": 2.4485, + "mean_token_accuracy": 0.4068965554237366, + "step": 142085 + }, + { + "epoch": 0.14311455711438187, + "grad_norm": 12.616994441756953, + "learning_rate": 4.8941726393035814e-05, + "loss": 2.3747, + "mean_token_accuracy": 0.41724138259887694, + "step": 142090 + }, + { + "epoch": 0.14311959316748604, + "grad_norm": 9.696162821039412, + "learning_rate": 4.894161282243668e-05, + "loss": 2.4629, + "mean_token_accuracy": 0.46551724672317507, + "step": 142095 + }, + { + "epoch": 0.14312462922059022, + "grad_norm": 11.866414190036519, + "learning_rate": 4.894149924589064e-05, + "loss": 2.5934, + "mean_token_accuracy": 0.43103448748588563, + "step": 142100 + }, + { + "epoch": 0.1431296652736944, + "grad_norm": 8.865310754608231, + "learning_rate": 4.894138566339771e-05, + "loss": 2.106, + "mean_token_accuracy": 0.4689655125141144, + "step": 142105 + }, + { + "epoch": 0.14313470132679856, + "grad_norm": 10.18423949656946, + "learning_rate": 4.8941272074957945e-05, + "loss": 2.602, + "mean_token_accuracy": 0.38620689511299133, + "step": 142110 + }, + { + "epoch": 0.14313973737990274, + "grad_norm": 12.771904845471733, + "learning_rate": 4.8941158480571356e-05, + "loss": 2.4986, + "mean_token_accuracy": 0.41379310488700866, + "step": 142115 + }, + { + "epoch": 0.14314477343300688, + "grad_norm": 14.332773844458725, + "learning_rate": 4.8941044880237985e-05, + "loss": 2.2115, + "mean_token_accuracy": 0.4344827592372894, + "step": 142120 + }, + { + "epoch": 0.14314980948611106, + "grad_norm": 12.679406207532665, + "learning_rate": 4.894093127395786e-05, + "loss": 2.3028, + "mean_token_accuracy": 0.39310345351696013, + "step": 142125 + }, + { + "epoch": 0.14315484553921523, + "grad_norm": 10.211925230908582, + "learning_rate": 4.8940817661731016e-05, + "loss": 2.1921, + "mean_token_accuracy": 0.38620689511299133, + "step": 142130 + }, + { + "epoch": 0.1431598815923194, + "grad_norm": 11.023294846962461, + "learning_rate": 4.894070404355748e-05, + "loss": 2.4552, + "mean_token_accuracy": 0.4137930989265442, + "step": 142135 + }, + { + "epoch": 0.14316491764542358, + "grad_norm": 12.815710717691665, + "learning_rate": 4.8940590419437274e-05, + "loss": 2.4909, + "mean_token_accuracy": 0.45517241954803467, + "step": 142140 + }, + { + "epoch": 0.14316995369852775, + "grad_norm": 10.20301371367607, + "learning_rate": 4.8940476789370445e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.4931034564971924, + "step": 142145 + }, + { + "epoch": 0.14317498975163193, + "grad_norm": 11.292461760416774, + "learning_rate": 4.8940363153357025e-05, + "loss": 2.3221, + "mean_token_accuracy": 0.482758617401123, + "step": 142150 + }, + { + "epoch": 0.1431800258047361, + "grad_norm": 11.77007366153293, + "learning_rate": 4.894024951139703e-05, + "loss": 2.3725, + "mean_token_accuracy": 0.4034482717514038, + "step": 142155 + }, + { + "epoch": 0.14318506185784027, + "grad_norm": 10.2591162277197, + "learning_rate": 4.894013586349052e-05, + "loss": 2.1751, + "mean_token_accuracy": 0.46551724076271056, + "step": 142160 + }, + { + "epoch": 0.14319009791094445, + "grad_norm": 8.648655269257274, + "learning_rate": 4.894002220963749e-05, + "loss": 1.9065, + "mean_token_accuracy": 0.5068965494632721, + "step": 142165 + }, + { + "epoch": 0.14319513396404862, + "grad_norm": 8.565182878262995, + "learning_rate": 4.8939908549838e-05, + "loss": 2.029, + "mean_token_accuracy": 0.49999998807907103, + "step": 142170 + }, + { + "epoch": 0.1432001700171528, + "grad_norm": 13.197079025667072, + "learning_rate": 4.893979488409207e-05, + "loss": 2.6033, + "mean_token_accuracy": 0.39655172228813174, + "step": 142175 + }, + { + "epoch": 0.14320520607025697, + "grad_norm": 10.69538313057205, + "learning_rate": 4.8939681212399735e-05, + "loss": 2.4933, + "mean_token_accuracy": 0.3896551787853241, + "step": 142180 + }, + { + "epoch": 0.14321024212336114, + "grad_norm": 11.014820888254176, + "learning_rate": 4.8939567534761015e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.43793103098869324, + "step": 142185 + }, + { + "epoch": 0.14321527817646532, + "grad_norm": 10.308990344874427, + "learning_rate": 4.893945385117596e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.4172413766384125, + "step": 142190 + }, + { + "epoch": 0.1432203142295695, + "grad_norm": 11.021667571167184, + "learning_rate": 4.8939340161644586e-05, + "loss": 2.5997, + "mean_token_accuracy": 0.4068965494632721, + "step": 142195 + }, + { + "epoch": 0.14322535028267366, + "grad_norm": 9.243108800555603, + "learning_rate": 4.893922646616694e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.4137930989265442, + "step": 142200 + }, + { + "epoch": 0.14323038633577784, + "grad_norm": 10.51774654850775, + "learning_rate": 4.893911276474304e-05, + "loss": 2.2999, + "mean_token_accuracy": 0.47586206793785096, + "step": 142205 + }, + { + "epoch": 0.143235422388882, + "grad_norm": 9.34607383792057, + "learning_rate": 4.893899905737292e-05, + "loss": 2.5204, + "mean_token_accuracy": 0.39310344457626345, + "step": 142210 + }, + { + "epoch": 0.14324045844198618, + "grad_norm": 10.980732864523231, + "learning_rate": 4.893888534405662e-05, + "loss": 2.4498, + "mean_token_accuracy": 0.39655172228813174, + "step": 142215 + }, + { + "epoch": 0.14324549449509036, + "grad_norm": 9.370142588671659, + "learning_rate": 4.8938771624794155e-05, + "loss": 2.4707, + "mean_token_accuracy": 0.4379310369491577, + "step": 142220 + }, + { + "epoch": 0.14325053054819453, + "grad_norm": 10.171151405433928, + "learning_rate": 4.893865789958557e-05, + "loss": 2.1027, + "mean_token_accuracy": 0.4517241358757019, + "step": 142225 + }, + { + "epoch": 0.1432555666012987, + "grad_norm": 10.034188907682145, + "learning_rate": 4.8938544168430895e-05, + "loss": 2.4175, + "mean_token_accuracy": 0.42068964838981626, + "step": 142230 + }, + { + "epoch": 0.14326060265440288, + "grad_norm": 10.59538714056528, + "learning_rate": 4.893843043133016e-05, + "loss": 2.4987, + "mean_token_accuracy": 0.4189957737922668, + "step": 142235 + }, + { + "epoch": 0.14326563870750705, + "grad_norm": 10.253105010731302, + "learning_rate": 4.89383166882834e-05, + "loss": 2.3388, + "mean_token_accuracy": 0.41379311084747317, + "step": 142240 + }, + { + "epoch": 0.14327067476061123, + "grad_norm": 8.801381718030939, + "learning_rate": 4.8938202939290635e-05, + "loss": 2.1775, + "mean_token_accuracy": 0.4310344815254211, + "step": 142245 + }, + { + "epoch": 0.1432757108137154, + "grad_norm": 10.40045671004918, + "learning_rate": 4.893808918435192e-05, + "loss": 2.6706, + "mean_token_accuracy": 0.39999999701976774, + "step": 142250 + }, + { + "epoch": 0.14328074686681957, + "grad_norm": 12.647901745772565, + "learning_rate": 4.8937975423467254e-05, + "loss": 2.2626, + "mean_token_accuracy": 0.4344827651977539, + "step": 142255 + }, + { + "epoch": 0.14328578291992372, + "grad_norm": 10.663679437221587, + "learning_rate": 4.893786165663668e-05, + "loss": 2.5321, + "mean_token_accuracy": 0.39655172228813174, + "step": 142260 + }, + { + "epoch": 0.1432908189730279, + "grad_norm": 9.456632714736738, + "learning_rate": 4.893774788386025e-05, + "loss": 2.2809, + "mean_token_accuracy": 0.4034482777118683, + "step": 142265 + }, + { + "epoch": 0.14329585502613207, + "grad_norm": 10.69034084326508, + "learning_rate": 4.893763410513798e-05, + "loss": 2.5798, + "mean_token_accuracy": 0.4034482777118683, + "step": 142270 + }, + { + "epoch": 0.14330089107923624, + "grad_norm": 8.105797660799526, + "learning_rate": 4.89375203204699e-05, + "loss": 2.1371, + "mean_token_accuracy": 0.4482758641242981, + "step": 142275 + }, + { + "epoch": 0.14330592713234042, + "grad_norm": 10.02794028244597, + "learning_rate": 4.893740652985604e-05, + "loss": 2.2947, + "mean_token_accuracy": 0.4137930989265442, + "step": 142280 + }, + { + "epoch": 0.1433109631854446, + "grad_norm": 11.6287036775017, + "learning_rate": 4.893729273329644e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4448275864124298, + "step": 142285 + }, + { + "epoch": 0.14331599923854876, + "grad_norm": 11.493178326797263, + "learning_rate": 4.8937178930791125e-05, + "loss": 2.0035, + "mean_token_accuracy": 0.4918330252170563, + "step": 142290 + }, + { + "epoch": 0.14332103529165294, + "grad_norm": 9.74977521948768, + "learning_rate": 4.893706512234013e-05, + "loss": 2.1981, + "mean_token_accuracy": 0.48275861144065857, + "step": 142295 + }, + { + "epoch": 0.1433260713447571, + "grad_norm": 12.018917736964728, + "learning_rate": 4.893695130794348e-05, + "loss": 2.4145, + "mean_token_accuracy": 0.43448275327682495, + "step": 142300 + }, + { + "epoch": 0.14333110739786128, + "grad_norm": 10.532603538100084, + "learning_rate": 4.8936837487601214e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.38620689511299133, + "step": 142305 + }, + { + "epoch": 0.14333614345096546, + "grad_norm": 9.027791921491541, + "learning_rate": 4.893672366131337e-05, + "loss": 1.9632, + "mean_token_accuracy": 0.5156684815883636, + "step": 142310 + }, + { + "epoch": 0.14334117950406963, + "grad_norm": 9.032475741494851, + "learning_rate": 4.893660982907996e-05, + "loss": 2.4655, + "mean_token_accuracy": 0.39310344457626345, + "step": 142315 + }, + { + "epoch": 0.1433462155571738, + "grad_norm": 11.448570660489818, + "learning_rate": 4.893649599090103e-05, + "loss": 2.5471, + "mean_token_accuracy": 0.4, + "step": 142320 + }, + { + "epoch": 0.14335125161027798, + "grad_norm": 10.220177076975029, + "learning_rate": 4.8936382146776605e-05, + "loss": 2.2138, + "mean_token_accuracy": 0.42413793206214906, + "step": 142325 + }, + { + "epoch": 0.14335628766338215, + "grad_norm": 9.753661335563747, + "learning_rate": 4.893626829670672e-05, + "loss": 2.5121, + "mean_token_accuracy": 0.40508167147636415, + "step": 142330 + }, + { + "epoch": 0.14336132371648633, + "grad_norm": 9.36079811304816, + "learning_rate": 4.893615444069141e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.46400484442710876, + "step": 142335 + }, + { + "epoch": 0.1433663597695905, + "grad_norm": 10.56503968391867, + "learning_rate": 4.8936040578730704e-05, + "loss": 2.6566, + "mean_token_accuracy": 0.39655172228813174, + "step": 142340 + }, + { + "epoch": 0.14337139582269467, + "grad_norm": 11.57958333115118, + "learning_rate": 4.893592671082462e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.4156079888343811, + "step": 142345 + }, + { + "epoch": 0.14337643187579885, + "grad_norm": 13.011856053615718, + "learning_rate": 4.893581283697322e-05, + "loss": 2.3316, + "mean_token_accuracy": 0.458620685338974, + "step": 142350 + }, + { + "epoch": 0.14338146792890302, + "grad_norm": 9.904553765279603, + "learning_rate": 4.89356989571765e-05, + "loss": 2.7929, + "mean_token_accuracy": 0.3620689660310745, + "step": 142355 + }, + { + "epoch": 0.1433865039820072, + "grad_norm": 11.753921076166261, + "learning_rate": 4.893558507143452e-05, + "loss": 2.5901, + "mean_token_accuracy": 0.4, + "step": 142360 + }, + { + "epoch": 0.14339154003511137, + "grad_norm": 10.497688320923682, + "learning_rate": 4.8935471179747296e-05, + "loss": 2.5919, + "mean_token_accuracy": 0.42758620381355283, + "step": 142365 + }, + { + "epoch": 0.14339657608821554, + "grad_norm": 8.736465012408782, + "learning_rate": 4.893535728211486e-05, + "loss": 2.4362, + "mean_token_accuracy": 0.4103448331356049, + "step": 142370 + }, + { + "epoch": 0.14340161214131972, + "grad_norm": 7.743734806437265, + "learning_rate": 4.893524337853725e-05, + "loss": 2.2784, + "mean_token_accuracy": 0.4620689690113068, + "step": 142375 + }, + { + "epoch": 0.1434066481944239, + "grad_norm": 11.084259441332001, + "learning_rate": 4.89351294690145e-05, + "loss": 2.6369, + "mean_token_accuracy": 0.4172413766384125, + "step": 142380 + }, + { + "epoch": 0.14341168424752807, + "grad_norm": 9.398699408113318, + "learning_rate": 4.893501555354664e-05, + "loss": 2.231, + "mean_token_accuracy": 0.43103448748588563, + "step": 142385 + }, + { + "epoch": 0.14341672030063224, + "grad_norm": 10.144258018426022, + "learning_rate": 4.893490163213369e-05, + "loss": 2.5378, + "mean_token_accuracy": 0.443254691362381, + "step": 142390 + }, + { + "epoch": 0.1434217563537364, + "grad_norm": 8.438241155200776, + "learning_rate": 4.893478770477569e-05, + "loss": 2.6303, + "mean_token_accuracy": 0.4413793087005615, + "step": 142395 + }, + { + "epoch": 0.14342679240684056, + "grad_norm": 10.585559060276362, + "learning_rate": 4.893467377147268e-05, + "loss": 2.4211, + "mean_token_accuracy": 0.4500302493572235, + "step": 142400 + }, + { + "epoch": 0.14343182845994473, + "grad_norm": 7.276441687843659, + "learning_rate": 4.893455983222468e-05, + "loss": 2.1778, + "mean_token_accuracy": 0.5183908104896545, + "step": 142405 + }, + { + "epoch": 0.1434368645130489, + "grad_norm": 10.72973024094628, + "learning_rate": 4.893444588703172e-05, + "loss": 2.5671, + "mean_token_accuracy": 0.4206896543502808, + "step": 142410 + }, + { + "epoch": 0.14344190056615308, + "grad_norm": 14.851634928703682, + "learning_rate": 4.893433193589385e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.4172413766384125, + "step": 142415 + }, + { + "epoch": 0.14344693661925725, + "grad_norm": 10.997921866015664, + "learning_rate": 4.893421797881107e-05, + "loss": 2.3297, + "mean_token_accuracy": 0.43950393199920657, + "step": 142420 + }, + { + "epoch": 0.14345197267236143, + "grad_norm": 8.909375251276662, + "learning_rate": 4.8934104015783436e-05, + "loss": 2.4451, + "mean_token_accuracy": 0.41554749608039854, + "step": 142425 + }, + { + "epoch": 0.1434570087254656, + "grad_norm": 10.02553404993454, + "learning_rate": 4.893399004681097e-05, + "loss": 2.2527, + "mean_token_accuracy": 0.42413793206214906, + "step": 142430 + }, + { + "epoch": 0.14346204477856977, + "grad_norm": 11.292403071647975, + "learning_rate": 4.8933876071893714e-05, + "loss": 2.1369, + "mean_token_accuracy": 0.4862068951129913, + "step": 142435 + }, + { + "epoch": 0.14346708083167395, + "grad_norm": 10.521073119759935, + "learning_rate": 4.89337620910317e-05, + "loss": 2.5993, + "mean_token_accuracy": 0.4103448331356049, + "step": 142440 + }, + { + "epoch": 0.14347211688477812, + "grad_norm": 12.589572489070424, + "learning_rate": 4.893364810422493e-05, + "loss": 2.7581, + "mean_token_accuracy": 0.40344826579093934, + "step": 142445 + }, + { + "epoch": 0.1434771529378823, + "grad_norm": 9.386092869301782, + "learning_rate": 4.893353411147348e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.42413793206214906, + "step": 142450 + }, + { + "epoch": 0.14348218899098647, + "grad_norm": 18.646650802756803, + "learning_rate": 4.8933420112777344e-05, + "loss": 2.3946, + "mean_token_accuracy": 0.4068965554237366, + "step": 142455 + }, + { + "epoch": 0.14348722504409064, + "grad_norm": 12.281833304853626, + "learning_rate": 4.8933306108136574e-05, + "loss": 2.4013, + "mean_token_accuracy": 0.39310344457626345, + "step": 142460 + }, + { + "epoch": 0.14349226109719482, + "grad_norm": 10.939944439176033, + "learning_rate": 4.89331920975512e-05, + "loss": 2.5234, + "mean_token_accuracy": 0.41034482717514037, + "step": 142465 + }, + { + "epoch": 0.143497297150299, + "grad_norm": 10.605068418150696, + "learning_rate": 4.893307808102124e-05, + "loss": 2.1838, + "mean_token_accuracy": 0.46896551847457885, + "step": 142470 + }, + { + "epoch": 0.14350233320340317, + "grad_norm": 9.507031802556483, + "learning_rate": 4.893296405854676e-05, + "loss": 2.0857, + "mean_token_accuracy": 0.4635813653469086, + "step": 142475 + }, + { + "epoch": 0.14350736925650734, + "grad_norm": 10.335730599836609, + "learning_rate": 4.893285003012774e-05, + "loss": 2.2415, + "mean_token_accuracy": 0.4586206912994385, + "step": 142480 + }, + { + "epoch": 0.1435124053096115, + "grad_norm": 8.428254249613822, + "learning_rate": 4.893273599576425e-05, + "loss": 1.8031, + "mean_token_accuracy": 0.5158499658107758, + "step": 142485 + }, + { + "epoch": 0.1435174413627157, + "grad_norm": 10.457251660945689, + "learning_rate": 4.893262195545631e-05, + "loss": 2.5364, + "mean_token_accuracy": 0.41034482717514037, + "step": 142490 + }, + { + "epoch": 0.14352247741581986, + "grad_norm": 11.449481289769475, + "learning_rate": 4.893250790920396e-05, + "loss": 2.381, + "mean_token_accuracy": 0.42758620381355283, + "step": 142495 + }, + { + "epoch": 0.14352751346892403, + "grad_norm": 13.515358398855035, + "learning_rate": 4.893239385700721e-05, + "loss": 2.6162, + "mean_token_accuracy": 0.3999999940395355, + "step": 142500 + }, + { + "epoch": 0.1435325495220282, + "grad_norm": 8.921441336596908, + "learning_rate": 4.893227979886611e-05, + "loss": 2.3353, + "mean_token_accuracy": 0.4379310369491577, + "step": 142505 + }, + { + "epoch": 0.14353758557513238, + "grad_norm": 10.055709108630017, + "learning_rate": 4.893216573478069e-05, + "loss": 2.6292, + "mean_token_accuracy": 0.39310344457626345, + "step": 142510 + }, + { + "epoch": 0.14354262162823656, + "grad_norm": 12.05665796282248, + "learning_rate": 4.893205166475098e-05, + "loss": 2.4443, + "mean_token_accuracy": 0.4068965524435043, + "step": 142515 + }, + { + "epoch": 0.14354765768134073, + "grad_norm": 11.963735281836508, + "learning_rate": 4.893193758877701e-05, + "loss": 2.9896, + "mean_token_accuracy": 0.36551724672317504, + "step": 142520 + }, + { + "epoch": 0.1435526937344449, + "grad_norm": 10.565423138164762, + "learning_rate": 4.893182350685881e-05, + "loss": 2.4077, + "mean_token_accuracy": 0.41724138259887694, + "step": 142525 + }, + { + "epoch": 0.14355772978754908, + "grad_norm": 10.716848205079073, + "learning_rate": 4.893170941899641e-05, + "loss": 2.3145, + "mean_token_accuracy": 0.4689655125141144, + "step": 142530 + }, + { + "epoch": 0.14356276584065325, + "grad_norm": 12.29720545120543, + "learning_rate": 4.893159532518986e-05, + "loss": 2.7395, + "mean_token_accuracy": 0.37586206793785093, + "step": 142535 + }, + { + "epoch": 0.1435678018937574, + "grad_norm": 11.653353411762163, + "learning_rate": 4.8931481225439166e-05, + "loss": 2.8983, + "mean_token_accuracy": 0.4186932861804962, + "step": 142540 + }, + { + "epoch": 0.14357283794686157, + "grad_norm": 9.83492647673044, + "learning_rate": 4.893136711974437e-05, + "loss": 2.3948, + "mean_token_accuracy": 0.4172413766384125, + "step": 142545 + }, + { + "epoch": 0.14357787399996574, + "grad_norm": 10.976336079313233, + "learning_rate": 4.89312530081055e-05, + "loss": 2.515, + "mean_token_accuracy": 0.40344828367233276, + "step": 142550 + }, + { + "epoch": 0.14358291005306992, + "grad_norm": 10.00846077115079, + "learning_rate": 4.89311388905226e-05, + "loss": 2.507, + "mean_token_accuracy": 0.4413793087005615, + "step": 142555 + }, + { + "epoch": 0.1435879461061741, + "grad_norm": 11.00399921509478, + "learning_rate": 4.8931024766995695e-05, + "loss": 2.1564, + "mean_token_accuracy": 0.4517241358757019, + "step": 142560 + }, + { + "epoch": 0.14359298215927827, + "grad_norm": 14.913380959347803, + "learning_rate": 4.893091063752481e-05, + "loss": 2.5114, + "mean_token_accuracy": 0.44022988677024844, + "step": 142565 + }, + { + "epoch": 0.14359801821238244, + "grad_norm": 12.594649962353268, + "learning_rate": 4.893079650210998e-05, + "loss": 2.94, + "mean_token_accuracy": 0.3793103516101837, + "step": 142570 + }, + { + "epoch": 0.1436030542654866, + "grad_norm": 9.047690944809839, + "learning_rate": 4.893068236075125e-05, + "loss": 2.1825, + "mean_token_accuracy": 0.48130671977996825, + "step": 142575 + }, + { + "epoch": 0.1436080903185908, + "grad_norm": 9.057478762686248, + "learning_rate": 4.893056821344863e-05, + "loss": 2.2233, + "mean_token_accuracy": 0.43448275327682495, + "step": 142580 + }, + { + "epoch": 0.14361312637169496, + "grad_norm": 10.349138760724113, + "learning_rate": 4.8930454060202165e-05, + "loss": 2.7152, + "mean_token_accuracy": 0.38965516686439516, + "step": 142585 + }, + { + "epoch": 0.14361816242479913, + "grad_norm": 9.973013972395451, + "learning_rate": 4.8930339901011885e-05, + "loss": 2.5617, + "mean_token_accuracy": 0.42758620977401735, + "step": 142590 + }, + { + "epoch": 0.1436231984779033, + "grad_norm": 9.956689188971596, + "learning_rate": 4.893022573587782e-05, + "loss": 2.6338, + "mean_token_accuracy": 0.3931034505367279, + "step": 142595 + }, + { + "epoch": 0.14362823453100748, + "grad_norm": 15.062488479970959, + "learning_rate": 4.893011156480001e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.4517241358757019, + "step": 142600 + }, + { + "epoch": 0.14363327058411166, + "grad_norm": 10.167529023060766, + "learning_rate": 4.8929997387778466e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4620689570903778, + "step": 142605 + }, + { + "epoch": 0.14363830663721583, + "grad_norm": 10.327946073253557, + "learning_rate": 4.892988320481324e-05, + "loss": 2.6832, + "mean_token_accuracy": 0.39655172228813174, + "step": 142610 + }, + { + "epoch": 0.14364334269032, + "grad_norm": 14.274125365171392, + "learning_rate": 4.892976901590435e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.40689656138420105, + "step": 142615 + }, + { + "epoch": 0.14364837874342418, + "grad_norm": 10.082656262496846, + "learning_rate": 4.892965482105183e-05, + "loss": 2.3299, + "mean_token_accuracy": 0.4206896543502808, + "step": 142620 + }, + { + "epoch": 0.14365341479652835, + "grad_norm": 12.980735696920775, + "learning_rate": 4.892954062025572e-05, + "loss": 2.131, + "mean_token_accuracy": 0.48965516686439514, + "step": 142625 + }, + { + "epoch": 0.14365845084963252, + "grad_norm": 9.569055479685993, + "learning_rate": 4.892942641351605e-05, + "loss": 1.8868, + "mean_token_accuracy": 0.4979064047336578, + "step": 142630 + }, + { + "epoch": 0.1436634869027367, + "grad_norm": 10.08435068373998, + "learning_rate": 4.8929312200832845e-05, + "loss": 2.5762, + "mean_token_accuracy": 0.4330308556556702, + "step": 142635 + }, + { + "epoch": 0.14366852295584087, + "grad_norm": 8.307796679513682, + "learning_rate": 4.8929197982206135e-05, + "loss": 2.203, + "mean_token_accuracy": 0.45728976726531984, + "step": 142640 + }, + { + "epoch": 0.14367355900894505, + "grad_norm": 15.810018936395103, + "learning_rate": 4.892908375763597e-05, + "loss": 3.0555, + "mean_token_accuracy": 0.41379310488700866, + "step": 142645 + }, + { + "epoch": 0.14367859506204922, + "grad_norm": 16.86658739806709, + "learning_rate": 4.8928969527122366e-05, + "loss": 2.5404, + "mean_token_accuracy": 0.441379314661026, + "step": 142650 + }, + { + "epoch": 0.1436836311151534, + "grad_norm": 8.742553063071579, + "learning_rate": 4.8928855290665355e-05, + "loss": 2.3121, + "mean_token_accuracy": 0.44482758045196535, + "step": 142655 + }, + { + "epoch": 0.14368866716825757, + "grad_norm": 11.062227354150226, + "learning_rate": 4.8928741048264964e-05, + "loss": 2.3729, + "mean_token_accuracy": 0.3896551728248596, + "step": 142660 + }, + { + "epoch": 0.14369370322136174, + "grad_norm": 8.749707763812081, + "learning_rate": 4.892862679992124e-05, + "loss": 2.2242, + "mean_token_accuracy": 0.43103447556495667, + "step": 142665 + }, + { + "epoch": 0.14369873927446591, + "grad_norm": 8.784119900665745, + "learning_rate": 4.892851254563421e-05, + "loss": 2.1423, + "mean_token_accuracy": 0.47791893482208253, + "step": 142670 + }, + { + "epoch": 0.1437037753275701, + "grad_norm": 8.644596631490936, + "learning_rate": 4.8928398285403894e-05, + "loss": 2.2741, + "mean_token_accuracy": 0.49183303117752075, + "step": 142675 + }, + { + "epoch": 0.14370881138067423, + "grad_norm": 13.500314469573821, + "learning_rate": 4.892828401923033e-05, + "loss": 2.4512, + "mean_token_accuracy": 0.4068965554237366, + "step": 142680 + }, + { + "epoch": 0.1437138474337784, + "grad_norm": 12.742102927445767, + "learning_rate": 4.892816974711356e-05, + "loss": 2.6951, + "mean_token_accuracy": 0.3551724135875702, + "step": 142685 + }, + { + "epoch": 0.14371888348688258, + "grad_norm": 9.636528134294212, + "learning_rate": 4.8928055469053595e-05, + "loss": 2.2518, + "mean_token_accuracy": 0.44482757449150084, + "step": 142690 + }, + { + "epoch": 0.14372391953998676, + "grad_norm": 14.573304839801267, + "learning_rate": 4.892794118505049e-05, + "loss": 2.4239, + "mean_token_accuracy": 0.4413793087005615, + "step": 142695 + }, + { + "epoch": 0.14372895559309093, + "grad_norm": 10.123872182408846, + "learning_rate": 4.892782689510427e-05, + "loss": 2.2216, + "mean_token_accuracy": 0.43103447556495667, + "step": 142700 + }, + { + "epoch": 0.1437339916461951, + "grad_norm": 9.28094841795952, + "learning_rate": 4.892771259921495e-05, + "loss": 2.433, + "mean_token_accuracy": 0.40689654350280763, + "step": 142705 + }, + { + "epoch": 0.14373902769929928, + "grad_norm": 9.579510297904683, + "learning_rate": 4.892759829738258e-05, + "loss": 2.1139, + "mean_token_accuracy": 0.4379310369491577, + "step": 142710 + }, + { + "epoch": 0.14374406375240345, + "grad_norm": 12.808606537632743, + "learning_rate": 4.892748398960718e-05, + "loss": 2.5815, + "mean_token_accuracy": 0.4103448331356049, + "step": 142715 + }, + { + "epoch": 0.14374909980550762, + "grad_norm": 12.547084600884551, + "learning_rate": 4.892736967588879e-05, + "loss": 2.7051, + "mean_token_accuracy": 0.3965517282485962, + "step": 142720 + }, + { + "epoch": 0.1437541358586118, + "grad_norm": 10.806085269547168, + "learning_rate": 4.892725535622744e-05, + "loss": 2.5653, + "mean_token_accuracy": 0.42413792610168455, + "step": 142725 + }, + { + "epoch": 0.14375917191171597, + "grad_norm": 9.315899254453186, + "learning_rate": 4.892714103062317e-05, + "loss": 2.548, + "mean_token_accuracy": 0.3965517282485962, + "step": 142730 + }, + { + "epoch": 0.14376420796482015, + "grad_norm": 9.085792076205212, + "learning_rate": 4.892702669907599e-05, + "loss": 2.1217, + "mean_token_accuracy": 0.43793103098869324, + "step": 142735 + }, + { + "epoch": 0.14376924401792432, + "grad_norm": 10.004012546130024, + "learning_rate": 4.8926912361585945e-05, + "loss": 2.3576, + "mean_token_accuracy": 0.4, + "step": 142740 + }, + { + "epoch": 0.1437742800710285, + "grad_norm": 12.469702844078748, + "learning_rate": 4.892679801815307e-05, + "loss": 2.9837, + "mean_token_accuracy": 0.37931033968925476, + "step": 142745 + }, + { + "epoch": 0.14377931612413267, + "grad_norm": 12.871215290354318, + "learning_rate": 4.892668366877739e-05, + "loss": 2.583, + "mean_token_accuracy": 0.43103448748588563, + "step": 142750 + }, + { + "epoch": 0.14378435217723684, + "grad_norm": 10.421704312204175, + "learning_rate": 4.8926569313458944e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.41379311084747317, + "step": 142755 + }, + { + "epoch": 0.14378938823034101, + "grad_norm": 9.895373798535891, + "learning_rate": 4.892645495219775e-05, + "loss": 2.2403, + "mean_token_accuracy": 0.4413793087005615, + "step": 142760 + }, + { + "epoch": 0.1437944242834452, + "grad_norm": 11.124880046749746, + "learning_rate": 4.8926340584993854e-05, + "loss": 2.489, + "mean_token_accuracy": 0.42068964838981626, + "step": 142765 + }, + { + "epoch": 0.14379946033654936, + "grad_norm": 9.161263075594407, + "learning_rate": 4.892622621184729e-05, + "loss": 2.1112, + "mean_token_accuracy": 0.458620685338974, + "step": 142770 + }, + { + "epoch": 0.14380449638965354, + "grad_norm": 8.468632696864248, + "learning_rate": 4.892611183275807e-05, + "loss": 1.9238, + "mean_token_accuracy": 0.5034482777118683, + "step": 142775 + }, + { + "epoch": 0.1438095324427577, + "grad_norm": 8.991145199280211, + "learning_rate": 4.8925997447726245e-05, + "loss": 2.2855, + "mean_token_accuracy": 0.4275861978530884, + "step": 142780 + }, + { + "epoch": 0.14381456849586188, + "grad_norm": 11.383034455708005, + "learning_rate": 4.892588305675184e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.4448275864124298, + "step": 142785 + }, + { + "epoch": 0.14381960454896606, + "grad_norm": 11.491862019309762, + "learning_rate": 4.892576865983489e-05, + "loss": 2.502, + "mean_token_accuracy": 0.3931034505367279, + "step": 142790 + }, + { + "epoch": 0.14382464060207023, + "grad_norm": 8.651065048269503, + "learning_rate": 4.892565425697541e-05, + "loss": 2.5239, + "mean_token_accuracy": 0.4724137902259827, + "step": 142795 + }, + { + "epoch": 0.1438296766551744, + "grad_norm": 10.464849541224504, + "learning_rate": 4.892553984817346e-05, + "loss": 2.0151, + "mean_token_accuracy": 0.4767241358757019, + "step": 142800 + }, + { + "epoch": 0.14383471270827858, + "grad_norm": 10.842183693315686, + "learning_rate": 4.892542543342904e-05, + "loss": 2.1677, + "mean_token_accuracy": 0.42758620977401735, + "step": 142805 + }, + { + "epoch": 0.14383974876138275, + "grad_norm": 9.730482362758144, + "learning_rate": 4.8925311012742216e-05, + "loss": 2.3562, + "mean_token_accuracy": 0.45862069725990295, + "step": 142810 + }, + { + "epoch": 0.14384478481448693, + "grad_norm": 10.228069904042458, + "learning_rate": 4.892519658611299e-05, + "loss": 2.367, + "mean_token_accuracy": 0.41379310488700866, + "step": 142815 + }, + { + "epoch": 0.14384982086759107, + "grad_norm": 10.667346683059518, + "learning_rate": 4.8925082153541406e-05, + "loss": 2.3034, + "mean_token_accuracy": 0.42758620977401735, + "step": 142820 + }, + { + "epoch": 0.14385485692069525, + "grad_norm": 11.759707994048476, + "learning_rate": 4.89249677150275e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.42413792610168455, + "step": 142825 + }, + { + "epoch": 0.14385989297379942, + "grad_norm": 13.097501186563894, + "learning_rate": 4.89248532705713e-05, + "loss": 2.5067, + "mean_token_accuracy": 0.382758629322052, + "step": 142830 + }, + { + "epoch": 0.1438649290269036, + "grad_norm": 9.467002903542522, + "learning_rate": 4.892473882017284e-05, + "loss": 2.1393, + "mean_token_accuracy": 0.4689655125141144, + "step": 142835 + }, + { + "epoch": 0.14386996508000777, + "grad_norm": 10.855511156032291, + "learning_rate": 4.892462436383214e-05, + "loss": 2.3535, + "mean_token_accuracy": 0.4448275864124298, + "step": 142840 + }, + { + "epoch": 0.14387500113311194, + "grad_norm": 14.699044491647461, + "learning_rate": 4.892450990154924e-05, + "loss": 2.7592, + "mean_token_accuracy": 0.4344827592372894, + "step": 142845 + }, + { + "epoch": 0.14388003718621611, + "grad_norm": 17.951133107832828, + "learning_rate": 4.892439543332418e-05, + "loss": 2.2737, + "mean_token_accuracy": 0.44137930274009707, + "step": 142850 + }, + { + "epoch": 0.1438850732393203, + "grad_norm": 11.935975563885311, + "learning_rate": 4.892428095915698e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.482758617401123, + "step": 142855 + }, + { + "epoch": 0.14389010929242446, + "grad_norm": 11.93193313017178, + "learning_rate": 4.892416647904767e-05, + "loss": 2.6294, + "mean_token_accuracy": 0.39655172228813174, + "step": 142860 + }, + { + "epoch": 0.14389514534552864, + "grad_norm": 11.509025055382219, + "learning_rate": 4.89240519929963e-05, + "loss": 2.2546, + "mean_token_accuracy": 0.45517241954803467, + "step": 142865 + }, + { + "epoch": 0.1439001813986328, + "grad_norm": 12.35155089295012, + "learning_rate": 4.892393750100289e-05, + "loss": 2.5757, + "mean_token_accuracy": 0.4379310369491577, + "step": 142870 + }, + { + "epoch": 0.14390521745173698, + "grad_norm": 9.107493141827403, + "learning_rate": 4.892382300306746e-05, + "loss": 2.3494, + "mean_token_accuracy": 0.4, + "step": 142875 + }, + { + "epoch": 0.14391025350484116, + "grad_norm": 9.706155659895366, + "learning_rate": 4.892370849919006e-05, + "loss": 2.5125, + "mean_token_accuracy": 0.4137930989265442, + "step": 142880 + }, + { + "epoch": 0.14391528955794533, + "grad_norm": 10.051711676076456, + "learning_rate": 4.892359398937071e-05, + "loss": 2.0437, + "mean_token_accuracy": 0.4724137902259827, + "step": 142885 + }, + { + "epoch": 0.1439203256110495, + "grad_norm": 9.271766836140744, + "learning_rate": 4.892347947360945e-05, + "loss": 2.3653, + "mean_token_accuracy": 0.45172414779663084, + "step": 142890 + }, + { + "epoch": 0.14392536166415368, + "grad_norm": 14.211886455223294, + "learning_rate": 4.89233649519063e-05, + "loss": 2.3437, + "mean_token_accuracy": 0.44137930274009707, + "step": 142895 + }, + { + "epoch": 0.14393039771725785, + "grad_norm": 12.761658982227075, + "learning_rate": 4.892325042426131e-05, + "loss": 2.2547, + "mean_token_accuracy": 0.4517241358757019, + "step": 142900 + }, + { + "epoch": 0.14393543377036203, + "grad_norm": 13.60360436706123, + "learning_rate": 4.892313589067449e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.41379310488700866, + "step": 142905 + }, + { + "epoch": 0.1439404698234662, + "grad_norm": 10.97842570300436, + "learning_rate": 4.8923021351145894e-05, + "loss": 2.1077, + "mean_token_accuracy": 0.44827587008476255, + "step": 142910 + }, + { + "epoch": 0.14394550587657037, + "grad_norm": 10.144580239775088, + "learning_rate": 4.892290680567554e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.4620689630508423, + "step": 142915 + }, + { + "epoch": 0.14395054192967455, + "grad_norm": 8.362098187957232, + "learning_rate": 4.8922792254263465e-05, + "loss": 2.5756, + "mean_token_accuracy": 0.4344827592372894, + "step": 142920 + }, + { + "epoch": 0.14395557798277872, + "grad_norm": 11.499226014986755, + "learning_rate": 4.8922677696909695e-05, + "loss": 2.5759, + "mean_token_accuracy": 0.4068965554237366, + "step": 142925 + }, + { + "epoch": 0.1439606140358829, + "grad_norm": 10.976927907102782, + "learning_rate": 4.892256313361426e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.4620689630508423, + "step": 142930 + }, + { + "epoch": 0.14396565008898707, + "grad_norm": 9.320894159894085, + "learning_rate": 4.892244856437721e-05, + "loss": 2.2437, + "mean_token_accuracy": 0.4586206912994385, + "step": 142935 + }, + { + "epoch": 0.14397068614209124, + "grad_norm": 10.061830760009846, + "learning_rate": 4.892233398919855e-05, + "loss": 2.569, + "mean_token_accuracy": 0.4068965554237366, + "step": 142940 + }, + { + "epoch": 0.14397572219519542, + "grad_norm": 9.423061277116496, + "learning_rate": 4.892221940807834e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.43998789191246035, + "step": 142945 + }, + { + "epoch": 0.1439807582482996, + "grad_norm": 9.72547668072557, + "learning_rate": 4.892210482101659e-05, + "loss": 2.6852, + "mean_token_accuracy": 0.42758620977401735, + "step": 142950 + }, + { + "epoch": 0.14398579430140374, + "grad_norm": 11.770839052130134, + "learning_rate": 4.8921990228013344e-05, + "loss": 2.0784, + "mean_token_accuracy": 0.4344827592372894, + "step": 142955 + }, + { + "epoch": 0.1439908303545079, + "grad_norm": 9.130040836127705, + "learning_rate": 4.8921875629068624e-05, + "loss": 2.7847, + "mean_token_accuracy": 0.358620685338974, + "step": 142960 + }, + { + "epoch": 0.14399586640761208, + "grad_norm": 12.249613521821624, + "learning_rate": 4.8921761024182475e-05, + "loss": 2.8651, + "mean_token_accuracy": 0.40175439715385436, + "step": 142965 + }, + { + "epoch": 0.14400090246071626, + "grad_norm": 10.401410801803454, + "learning_rate": 4.892164641335491e-05, + "loss": 2.4592, + "mean_token_accuracy": 0.4344827592372894, + "step": 142970 + }, + { + "epoch": 0.14400593851382043, + "grad_norm": 12.195850219459128, + "learning_rate": 4.8921531796585976e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.4448275864124298, + "step": 142975 + }, + { + "epoch": 0.1440109745669246, + "grad_norm": 11.443995089878216, + "learning_rate": 4.892141717387569e-05, + "loss": 2.5288, + "mean_token_accuracy": 0.39310344457626345, + "step": 142980 + }, + { + "epoch": 0.14401601062002878, + "grad_norm": 10.035048004010267, + "learning_rate": 4.892130254522411e-05, + "loss": 2.4465, + "mean_token_accuracy": 0.4517241418361664, + "step": 142985 + }, + { + "epoch": 0.14402104667313295, + "grad_norm": 8.892666886622552, + "learning_rate": 4.892118791063125e-05, + "loss": 2.1489, + "mean_token_accuracy": 0.46896551847457885, + "step": 142990 + }, + { + "epoch": 0.14402608272623713, + "grad_norm": 8.853451102214152, + "learning_rate": 4.892107327009714e-05, + "loss": 2.0947, + "mean_token_accuracy": 0.5228675007820129, + "step": 142995 + }, + { + "epoch": 0.1440311187793413, + "grad_norm": 10.36582724648141, + "learning_rate": 4.8920958623621814e-05, + "loss": 2.3684, + "mean_token_accuracy": 0.42413792610168455, + "step": 143000 + }, + { + "epoch": 0.14403615483244547, + "grad_norm": 10.178254030609075, + "learning_rate": 4.8920843971205306e-05, + "loss": 2.2843, + "mean_token_accuracy": 0.441379314661026, + "step": 143005 + }, + { + "epoch": 0.14404119088554965, + "grad_norm": 10.352139427298674, + "learning_rate": 4.892072931284765e-05, + "loss": 2.0948, + "mean_token_accuracy": 0.43103448748588563, + "step": 143010 + }, + { + "epoch": 0.14404622693865382, + "grad_norm": 9.310362326069008, + "learning_rate": 4.892061464854887e-05, + "loss": 2.3319, + "mean_token_accuracy": 0.4448275864124298, + "step": 143015 + }, + { + "epoch": 0.144051262991758, + "grad_norm": 10.919517886956424, + "learning_rate": 4.8920499978309e-05, + "loss": 2.4419, + "mean_token_accuracy": 0.4325468838214874, + "step": 143020 + }, + { + "epoch": 0.14405629904486217, + "grad_norm": 10.886104098074167, + "learning_rate": 4.892038530212808e-05, + "loss": 2.5615, + "mean_token_accuracy": 0.42068964838981626, + "step": 143025 + }, + { + "epoch": 0.14406133509796634, + "grad_norm": 11.708641266032057, + "learning_rate": 4.892027062000614e-05, + "loss": 2.6762, + "mean_token_accuracy": 0.42758620977401735, + "step": 143030 + }, + { + "epoch": 0.14406637115107052, + "grad_norm": 9.038643535728255, + "learning_rate": 4.89201559319432e-05, + "loss": 2.2711, + "mean_token_accuracy": 0.46896551847457885, + "step": 143035 + }, + { + "epoch": 0.1440714072041747, + "grad_norm": 10.023533905000063, + "learning_rate": 4.89200412379393e-05, + "loss": 2.3468, + "mean_token_accuracy": 0.39655172228813174, + "step": 143040 + }, + { + "epoch": 0.14407644325727886, + "grad_norm": 9.515347929939889, + "learning_rate": 4.891992653799447e-05, + "loss": 1.9717, + "mean_token_accuracy": 0.4931034505367279, + "step": 143045 + }, + { + "epoch": 0.14408147931038304, + "grad_norm": 9.57465746427421, + "learning_rate": 4.891981183210875e-05, + "loss": 2.2037, + "mean_token_accuracy": 0.4310344815254211, + "step": 143050 + }, + { + "epoch": 0.1440865153634872, + "grad_norm": 9.275227159154056, + "learning_rate": 4.8919697120282165e-05, + "loss": 2.2079, + "mean_token_accuracy": 0.4744101583957672, + "step": 143055 + }, + { + "epoch": 0.14409155141659138, + "grad_norm": 10.82387792293545, + "learning_rate": 4.8919582402514755e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.43793103098869324, + "step": 143060 + }, + { + "epoch": 0.14409658746969556, + "grad_norm": 10.027555293206424, + "learning_rate": 4.891946767880653e-05, + "loss": 2.2664, + "mean_token_accuracy": 0.43103448748588563, + "step": 143065 + }, + { + "epoch": 0.14410162352279973, + "grad_norm": 10.302555316224726, + "learning_rate": 4.8919352949157534e-05, + "loss": 2.1611, + "mean_token_accuracy": 0.4344827651977539, + "step": 143070 + }, + { + "epoch": 0.1441066595759039, + "grad_norm": 8.502397602443207, + "learning_rate": 4.8919238213567806e-05, + "loss": 2.0091, + "mean_token_accuracy": 0.5034482777118683, + "step": 143075 + }, + { + "epoch": 0.14411169562900808, + "grad_norm": 9.34425559516383, + "learning_rate": 4.891912347203738e-05, + "loss": 2.5947, + "mean_token_accuracy": 0.43103447556495667, + "step": 143080 + }, + { + "epoch": 0.14411673168211225, + "grad_norm": 10.170930713389188, + "learning_rate": 4.8919008724566266e-05, + "loss": 2.5939, + "mean_token_accuracy": 0.4068965554237366, + "step": 143085 + }, + { + "epoch": 0.14412176773521643, + "grad_norm": 8.503277582441921, + "learning_rate": 4.8918893971154524e-05, + "loss": 2.432, + "mean_token_accuracy": 0.4275861978530884, + "step": 143090 + }, + { + "epoch": 0.14412680378832057, + "grad_norm": 9.46354659690624, + "learning_rate": 4.891877921180216e-05, + "loss": 2.1359, + "mean_token_accuracy": 0.47931034564971925, + "step": 143095 + }, + { + "epoch": 0.14413183984142475, + "grad_norm": 10.137913563625476, + "learning_rate": 4.891866444650922e-05, + "loss": 2.2744, + "mean_token_accuracy": 0.4638838529586792, + "step": 143100 + }, + { + "epoch": 0.14413687589452892, + "grad_norm": 16.099043559733598, + "learning_rate": 4.8918549675275736e-05, + "loss": 2.7884, + "mean_token_accuracy": 0.4206896543502808, + "step": 143105 + }, + { + "epoch": 0.1441419119476331, + "grad_norm": 8.901882095246567, + "learning_rate": 4.891843489810174e-05, + "loss": 2.2721, + "mean_token_accuracy": 0.5178463518619537, + "step": 143110 + }, + { + "epoch": 0.14414694800073727, + "grad_norm": 13.038421232099818, + "learning_rate": 4.891832011498726e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.3965517163276672, + "step": 143115 + }, + { + "epoch": 0.14415198405384144, + "grad_norm": 11.266259422140152, + "learning_rate": 4.8918205325932324e-05, + "loss": 2.6415, + "mean_token_accuracy": 0.41034482717514037, + "step": 143120 + }, + { + "epoch": 0.14415702010694562, + "grad_norm": 9.480887745959103, + "learning_rate": 4.8918090530936976e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.4137930989265442, + "step": 143125 + }, + { + "epoch": 0.1441620561600498, + "grad_norm": 13.243234792372652, + "learning_rate": 4.891797573000124e-05, + "loss": 2.312, + "mean_token_accuracy": 0.40689656138420105, + "step": 143130 + }, + { + "epoch": 0.14416709221315396, + "grad_norm": 11.138285367638243, + "learning_rate": 4.891786092312514e-05, + "loss": 2.6573, + "mean_token_accuracy": 0.37241379022598264, + "step": 143135 + }, + { + "epoch": 0.14417212826625814, + "grad_norm": 10.895345954382462, + "learning_rate": 4.891774611030873e-05, + "loss": 2.4479, + "mean_token_accuracy": 0.44283121824264526, + "step": 143140 + }, + { + "epoch": 0.1441771643193623, + "grad_norm": 9.091524903573514, + "learning_rate": 4.891763129155201e-05, + "loss": 2.3169, + "mean_token_accuracy": 0.45862069725990295, + "step": 143145 + }, + { + "epoch": 0.14418220037246648, + "grad_norm": 10.69980679483711, + "learning_rate": 4.891751646685505e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.4344827651977539, + "step": 143150 + }, + { + "epoch": 0.14418723642557066, + "grad_norm": 11.215358222077949, + "learning_rate": 4.8917401636217854e-05, + "loss": 2.6825, + "mean_token_accuracy": 0.4172413766384125, + "step": 143155 + }, + { + "epoch": 0.14419227247867483, + "grad_norm": 10.291894865198412, + "learning_rate": 4.891728679964046e-05, + "loss": 2.2819, + "mean_token_accuracy": 0.4, + "step": 143160 + }, + { + "epoch": 0.144197308531779, + "grad_norm": 9.485089848847432, + "learning_rate": 4.89171719571229e-05, + "loss": 2.0843, + "mean_token_accuracy": 0.47586206793785096, + "step": 143165 + }, + { + "epoch": 0.14420234458488318, + "grad_norm": 12.620909235561669, + "learning_rate": 4.891705710866521e-05, + "loss": 2.5386, + "mean_token_accuracy": 0.417241370677948, + "step": 143170 + }, + { + "epoch": 0.14420738063798735, + "grad_norm": 8.761753680024981, + "learning_rate": 4.891694225426742e-05, + "loss": 2.0999, + "mean_token_accuracy": 0.4620689690113068, + "step": 143175 + }, + { + "epoch": 0.14421241669109153, + "grad_norm": 11.052827646988586, + "learning_rate": 4.891682739392955e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.43793103098869324, + "step": 143180 + }, + { + "epoch": 0.1442174527441957, + "grad_norm": 7.482576051375531, + "learning_rate": 4.891671252765165e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.4413793087005615, + "step": 143185 + }, + { + "epoch": 0.14422248879729987, + "grad_norm": 11.638067613836233, + "learning_rate": 4.891659765543376e-05, + "loss": 2.5494, + "mean_token_accuracy": 0.4068965554237366, + "step": 143190 + }, + { + "epoch": 0.14422752485040405, + "grad_norm": 10.08076645440741, + "learning_rate": 4.8916482777275877e-05, + "loss": 2.4693, + "mean_token_accuracy": 0.4189352750778198, + "step": 143195 + }, + { + "epoch": 0.14423256090350822, + "grad_norm": 9.752454165354598, + "learning_rate": 4.891636789317806e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.42413793206214906, + "step": 143200 + }, + { + "epoch": 0.1442375969566124, + "grad_norm": 17.415005957185635, + "learning_rate": 4.891625300314033e-05, + "loss": 2.61, + "mean_token_accuracy": 0.4379310369491577, + "step": 143205 + }, + { + "epoch": 0.14424263300971657, + "grad_norm": 14.367878434977694, + "learning_rate": 4.891613810716273e-05, + "loss": 2.4245, + "mean_token_accuracy": 0.4709618926048279, + "step": 143210 + }, + { + "epoch": 0.14424766906282074, + "grad_norm": 14.849141981857443, + "learning_rate": 4.891602320524528e-05, + "loss": 2.3179, + "mean_token_accuracy": 0.4689655125141144, + "step": 143215 + }, + { + "epoch": 0.14425270511592492, + "grad_norm": 9.688235532458037, + "learning_rate": 4.891590829738802e-05, + "loss": 2.1415, + "mean_token_accuracy": 0.4620689630508423, + "step": 143220 + }, + { + "epoch": 0.1442577411690291, + "grad_norm": 10.787323860969947, + "learning_rate": 4.891579338359098e-05, + "loss": 2.2664, + "mean_token_accuracy": 0.4413793087005615, + "step": 143225 + }, + { + "epoch": 0.14426277722213326, + "grad_norm": 10.068157699710326, + "learning_rate": 4.891567846385418e-05, + "loss": 2.3069, + "mean_token_accuracy": 0.44482759237289426, + "step": 143230 + }, + { + "epoch": 0.1442678132752374, + "grad_norm": 11.02357286740229, + "learning_rate": 4.891556353817766e-05, + "loss": 2.7127, + "mean_token_accuracy": 0.4034482717514038, + "step": 143235 + }, + { + "epoch": 0.14427284932834158, + "grad_norm": 10.844049314339482, + "learning_rate": 4.891544860656146e-05, + "loss": 2.5795, + "mean_token_accuracy": 0.3931034505367279, + "step": 143240 + }, + { + "epoch": 0.14427788538144576, + "grad_norm": 11.516562104779565, + "learning_rate": 4.89153336690056e-05, + "loss": 2.5312, + "mean_token_accuracy": 0.43448275327682495, + "step": 143245 + }, + { + "epoch": 0.14428292143454993, + "grad_norm": 9.863996749709504, + "learning_rate": 4.8915218725510124e-05, + "loss": 2.5991, + "mean_token_accuracy": 0.33793103098869326, + "step": 143250 + }, + { + "epoch": 0.1442879574876541, + "grad_norm": 12.275832968848517, + "learning_rate": 4.8915103776075056e-05, + "loss": 2.2932, + "mean_token_accuracy": 0.4068965494632721, + "step": 143255 + }, + { + "epoch": 0.14429299354075828, + "grad_norm": 10.95577215644058, + "learning_rate": 4.891498882070043e-05, + "loss": 2.0273, + "mean_token_accuracy": 0.5241379380226135, + "step": 143260 + }, + { + "epoch": 0.14429802959386245, + "grad_norm": 8.574616501875061, + "learning_rate": 4.891487385938627e-05, + "loss": 2.5562, + "mean_token_accuracy": 0.4034482777118683, + "step": 143265 + }, + { + "epoch": 0.14430306564696663, + "grad_norm": 10.340551851702875, + "learning_rate": 4.891475889213263e-05, + "loss": 2.4589, + "mean_token_accuracy": 0.4, + "step": 143270 + }, + { + "epoch": 0.1443081017000708, + "grad_norm": 9.705779604379604, + "learning_rate": 4.891464391893951e-05, + "loss": 1.9368, + "mean_token_accuracy": 0.5206896483898162, + "step": 143275 + }, + { + "epoch": 0.14431313775317497, + "grad_norm": 8.623271610185295, + "learning_rate": 4.8914528939806966e-05, + "loss": 2.1983, + "mean_token_accuracy": 0.43793103098869324, + "step": 143280 + }, + { + "epoch": 0.14431817380627915, + "grad_norm": 9.591571117723104, + "learning_rate": 4.891441395473502e-05, + "loss": 2.3306, + "mean_token_accuracy": 0.4379310369491577, + "step": 143285 + }, + { + "epoch": 0.14432320985938332, + "grad_norm": 8.771184680138322, + "learning_rate": 4.891429896372371e-05, + "loss": 2.482, + "mean_token_accuracy": 0.3896551787853241, + "step": 143290 + }, + { + "epoch": 0.1443282459124875, + "grad_norm": 10.690560367218001, + "learning_rate": 4.891418396677306e-05, + "loss": 2.1146, + "mean_token_accuracy": 0.4517241418361664, + "step": 143295 + }, + { + "epoch": 0.14433328196559167, + "grad_norm": 10.843560529973189, + "learning_rate": 4.8914068963883106e-05, + "loss": 2.5281, + "mean_token_accuracy": 0.4172413766384125, + "step": 143300 + }, + { + "epoch": 0.14433831801869584, + "grad_norm": 10.618059285439788, + "learning_rate": 4.8913953955053884e-05, + "loss": 2.2213, + "mean_token_accuracy": 0.44482759237289426, + "step": 143305 + }, + { + "epoch": 0.14434335407180002, + "grad_norm": 9.805842091284974, + "learning_rate": 4.891383894028543e-05, + "loss": 2.0299, + "mean_token_accuracy": 0.47931034564971925, + "step": 143310 + }, + { + "epoch": 0.1443483901249042, + "grad_norm": 10.48552327669885, + "learning_rate": 4.891372391957775e-05, + "loss": 2.6893, + "mean_token_accuracy": 0.42758620381355283, + "step": 143315 + }, + { + "epoch": 0.14435342617800836, + "grad_norm": 12.89758142964133, + "learning_rate": 4.8913608892930905e-05, + "loss": 2.6489, + "mean_token_accuracy": 0.3931034505367279, + "step": 143320 + }, + { + "epoch": 0.14435846223111254, + "grad_norm": 12.64000100251207, + "learning_rate": 4.8913493860344916e-05, + "loss": 2.2976, + "mean_token_accuracy": 0.3793103456497192, + "step": 143325 + }, + { + "epoch": 0.1443634982842167, + "grad_norm": 9.952823769657149, + "learning_rate": 4.891337882181981e-05, + "loss": 2.4684, + "mean_token_accuracy": 0.37241379022598264, + "step": 143330 + }, + { + "epoch": 0.1443685343373209, + "grad_norm": 12.520595778832488, + "learning_rate": 4.891326377735562e-05, + "loss": 2.6512, + "mean_token_accuracy": 0.38620689511299133, + "step": 143335 + }, + { + "epoch": 0.14437357039042506, + "grad_norm": 11.066279583864217, + "learning_rate": 4.891314872695239e-05, + "loss": 2.6658, + "mean_token_accuracy": 0.41034482717514037, + "step": 143340 + }, + { + "epoch": 0.14437860644352923, + "grad_norm": 13.380214678805146, + "learning_rate": 4.891303367061015e-05, + "loss": 3.1223, + "mean_token_accuracy": 0.3517241418361664, + "step": 143345 + }, + { + "epoch": 0.1443836424966334, + "grad_norm": 7.514151261822749, + "learning_rate": 4.8912918608328905e-05, + "loss": 2.4334, + "mean_token_accuracy": 0.43793103098869324, + "step": 143350 + }, + { + "epoch": 0.14438867854973758, + "grad_norm": 9.585838792894119, + "learning_rate": 4.891280354010872e-05, + "loss": 2.1383, + "mean_token_accuracy": 0.4241379380226135, + "step": 143355 + }, + { + "epoch": 0.14439371460284176, + "grad_norm": 11.104781221037275, + "learning_rate": 4.891268846594961e-05, + "loss": 2.234, + "mean_token_accuracy": 0.458620685338974, + "step": 143360 + }, + { + "epoch": 0.14439875065594593, + "grad_norm": 11.944601811366978, + "learning_rate": 4.891257338585161e-05, + "loss": 2.9673, + "mean_token_accuracy": 0.35862069129943847, + "step": 143365 + }, + { + "epoch": 0.1444037867090501, + "grad_norm": 10.148460206790432, + "learning_rate": 4.8912458299814754e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.441379314661026, + "step": 143370 + }, + { + "epoch": 0.14440882276215425, + "grad_norm": 10.721689126599045, + "learning_rate": 4.8912343207839077e-05, + "loss": 1.9903, + "mean_token_accuracy": 0.5103448271751404, + "step": 143375 + }, + { + "epoch": 0.14441385881525842, + "grad_norm": 13.132127896281627, + "learning_rate": 4.891222810992461e-05, + "loss": 2.1801, + "mean_token_accuracy": 0.48275862336158754, + "step": 143380 + }, + { + "epoch": 0.1444188948683626, + "grad_norm": 10.862661004551356, + "learning_rate": 4.891211300607137e-05, + "loss": 2.7413, + "mean_token_accuracy": 0.4620689630508423, + "step": 143385 + }, + { + "epoch": 0.14442393092146677, + "grad_norm": 11.849320444589557, + "learning_rate": 4.89119978962794e-05, + "loss": 2.1309, + "mean_token_accuracy": 0.43103447556495667, + "step": 143390 + }, + { + "epoch": 0.14442896697457094, + "grad_norm": 11.592519995421126, + "learning_rate": 4.891188278054875e-05, + "loss": 2.4221, + "mean_token_accuracy": 0.4517241358757019, + "step": 143395 + }, + { + "epoch": 0.14443400302767512, + "grad_norm": 11.637374528984818, + "learning_rate": 4.8911767658879416e-05, + "loss": 2.4152, + "mean_token_accuracy": 0.4586206912994385, + "step": 143400 + }, + { + "epoch": 0.1444390390807793, + "grad_norm": 10.958413941240806, + "learning_rate": 4.8911652531271455e-05, + "loss": 2.4273, + "mean_token_accuracy": 0.42758620977401735, + "step": 143405 + }, + { + "epoch": 0.14444407513388346, + "grad_norm": 12.207092557902424, + "learning_rate": 4.891153739772489e-05, + "loss": 2.801, + "mean_token_accuracy": 0.3517241388559341, + "step": 143410 + }, + { + "epoch": 0.14444911118698764, + "grad_norm": 9.957504072861148, + "learning_rate": 4.891142225823977e-05, + "loss": 2.3038, + "mean_token_accuracy": 0.4137930989265442, + "step": 143415 + }, + { + "epoch": 0.1444541472400918, + "grad_norm": 10.479548449903048, + "learning_rate": 4.891130711281609e-05, + "loss": 2.7249, + "mean_token_accuracy": 0.3758620619773865, + "step": 143420 + }, + { + "epoch": 0.144459183293196, + "grad_norm": 11.93547655899798, + "learning_rate": 4.8911191961453914e-05, + "loss": 2.7033, + "mean_token_accuracy": 0.37586206793785093, + "step": 143425 + }, + { + "epoch": 0.14446421934630016, + "grad_norm": 9.575876754599406, + "learning_rate": 4.8911076804153274e-05, + "loss": 2.3689, + "mean_token_accuracy": 0.44827586114406587, + "step": 143430 + }, + { + "epoch": 0.14446925539940433, + "grad_norm": 8.01727140145651, + "learning_rate": 4.8910961640914174e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.4468844473361969, + "step": 143435 + }, + { + "epoch": 0.1444742914525085, + "grad_norm": 11.87748897012932, + "learning_rate": 4.891084647173667e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.4413793087005615, + "step": 143440 + }, + { + "epoch": 0.14447932750561268, + "grad_norm": 10.887153348004146, + "learning_rate": 4.891073129662079e-05, + "loss": 2.3559, + "mean_token_accuracy": 0.4878078818321228, + "step": 143445 + }, + { + "epoch": 0.14448436355871686, + "grad_norm": 9.895765928188267, + "learning_rate": 4.891061611556657e-05, + "loss": 2.2234, + "mean_token_accuracy": 0.46896551847457885, + "step": 143450 + }, + { + "epoch": 0.14448939961182103, + "grad_norm": 9.411800088793186, + "learning_rate": 4.891050092857403e-05, + "loss": 2.2195, + "mean_token_accuracy": 0.40889292359352114, + "step": 143455 + }, + { + "epoch": 0.1444944356649252, + "grad_norm": 17.251257754668526, + "learning_rate": 4.8910385735643206e-05, + "loss": 3.2276, + "mean_token_accuracy": 0.3827586203813553, + "step": 143460 + }, + { + "epoch": 0.14449947171802938, + "grad_norm": 11.387152471798426, + "learning_rate": 4.8910270536774126e-05, + "loss": 2.4489, + "mean_token_accuracy": 0.42068964838981626, + "step": 143465 + }, + { + "epoch": 0.14450450777113355, + "grad_norm": 8.928583937612485, + "learning_rate": 4.891015533196684e-05, + "loss": 2.0579, + "mean_token_accuracy": 0.49655171632766726, + "step": 143470 + }, + { + "epoch": 0.14450954382423772, + "grad_norm": 9.385929338101583, + "learning_rate": 4.891004012122136e-05, + "loss": 2.1213, + "mean_token_accuracy": 0.4620689570903778, + "step": 143475 + }, + { + "epoch": 0.1445145798773419, + "grad_norm": 9.638575526377231, + "learning_rate": 4.890992490453773e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.4467634618282318, + "step": 143480 + }, + { + "epoch": 0.14451961593044607, + "grad_norm": 11.146512895679482, + "learning_rate": 4.890980968191597e-05, + "loss": 2.3323, + "mean_token_accuracy": 0.3896551728248596, + "step": 143485 + }, + { + "epoch": 0.14452465198355025, + "grad_norm": 10.041846521572744, + "learning_rate": 4.890969445335613e-05, + "loss": 2.7848, + "mean_token_accuracy": 0.4068965494632721, + "step": 143490 + }, + { + "epoch": 0.14452968803665442, + "grad_norm": 11.61979238003141, + "learning_rate": 4.890957921885823e-05, + "loss": 2.4216, + "mean_token_accuracy": 0.43103448748588563, + "step": 143495 + }, + { + "epoch": 0.1445347240897586, + "grad_norm": 12.021135306229978, + "learning_rate": 4.8909463978422294e-05, + "loss": 2.1835, + "mean_token_accuracy": 0.4882637560367584, + "step": 143500 + }, + { + "epoch": 0.14453976014286277, + "grad_norm": 13.628813098609225, + "learning_rate": 4.890934873204837e-05, + "loss": 2.3797, + "mean_token_accuracy": 0.4482758641242981, + "step": 143505 + }, + { + "epoch": 0.14454479619596694, + "grad_norm": 8.37439364426545, + "learning_rate": 4.8909233479736476e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.4137930989265442, + "step": 143510 + }, + { + "epoch": 0.1445498322490711, + "grad_norm": 10.731539105938554, + "learning_rate": 4.890911822148666e-05, + "loss": 2.2245, + "mean_token_accuracy": 0.42413792610168455, + "step": 143515 + }, + { + "epoch": 0.14455486830217526, + "grad_norm": 10.665186522741884, + "learning_rate": 4.890900295729894e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.38620689809322356, + "step": 143520 + }, + { + "epoch": 0.14455990435527943, + "grad_norm": 10.553039706236603, + "learning_rate": 4.8908887687173354e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.4534180283546448, + "step": 143525 + }, + { + "epoch": 0.1445649404083836, + "grad_norm": 7.852568444741093, + "learning_rate": 4.890877241110994e-05, + "loss": 2.7851, + "mean_token_accuracy": 0.40532365441322327, + "step": 143530 + }, + { + "epoch": 0.14456997646148778, + "grad_norm": 12.041815270921463, + "learning_rate": 4.890865712910871e-05, + "loss": 2.8099, + "mean_token_accuracy": 0.39655172228813174, + "step": 143535 + }, + { + "epoch": 0.14457501251459196, + "grad_norm": 10.350282734708696, + "learning_rate": 4.890854184116972e-05, + "loss": 2.3552, + "mean_token_accuracy": 0.42758620977401735, + "step": 143540 + }, + { + "epoch": 0.14458004856769613, + "grad_norm": 10.244674767421802, + "learning_rate": 4.890842654729299e-05, + "loss": 2.1739, + "mean_token_accuracy": 0.4310344815254211, + "step": 143545 + }, + { + "epoch": 0.1445850846208003, + "grad_norm": 10.646964327069766, + "learning_rate": 4.890831124747855e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.49165154695510865, + "step": 143550 + }, + { + "epoch": 0.14459012067390448, + "grad_norm": 11.933970967377196, + "learning_rate": 4.890819594172644e-05, + "loss": 2.4161, + "mean_token_accuracy": 0.44137930274009707, + "step": 143555 + }, + { + "epoch": 0.14459515672700865, + "grad_norm": 9.8570826445566, + "learning_rate": 4.890808063003667e-05, + "loss": 2.2871, + "mean_token_accuracy": 0.4344827592372894, + "step": 143560 + }, + { + "epoch": 0.14460019278011282, + "grad_norm": 9.144568269671431, + "learning_rate": 4.890796531240931e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.45317604541778567, + "step": 143565 + }, + { + "epoch": 0.144605228833217, + "grad_norm": 8.426753187654638, + "learning_rate": 4.890784998884436e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.5172413766384125, + "step": 143570 + }, + { + "epoch": 0.14461026488632117, + "grad_norm": 9.789786819455657, + "learning_rate": 4.890773465934187e-05, + "loss": 2.4364, + "mean_token_accuracy": 0.43793103098869324, + "step": 143575 + }, + { + "epoch": 0.14461530093942535, + "grad_norm": 11.07853969471019, + "learning_rate": 4.890761932390186e-05, + "loss": 2.3389, + "mean_token_accuracy": 0.4517241299152374, + "step": 143580 + }, + { + "epoch": 0.14462033699252952, + "grad_norm": 9.86452615311759, + "learning_rate": 4.890750398252436e-05, + "loss": 2.1072, + "mean_token_accuracy": 0.4793103337287903, + "step": 143585 + }, + { + "epoch": 0.1446253730456337, + "grad_norm": 10.310264323735417, + "learning_rate": 4.890738863520942e-05, + "loss": 2.2695, + "mean_token_accuracy": 0.458620685338974, + "step": 143590 + }, + { + "epoch": 0.14463040909873787, + "grad_norm": 11.954127932678544, + "learning_rate": 4.890727328195706e-05, + "loss": 2.0617, + "mean_token_accuracy": 0.4724137902259827, + "step": 143595 + }, + { + "epoch": 0.14463544515184204, + "grad_norm": 10.064799958434044, + "learning_rate": 4.890715792276731e-05, + "loss": 2.9538, + "mean_token_accuracy": 0.3655172407627106, + "step": 143600 + }, + { + "epoch": 0.14464048120494621, + "grad_norm": 11.880906390614564, + "learning_rate": 4.8907042557640196e-05, + "loss": 2.1926, + "mean_token_accuracy": 0.4206896543502808, + "step": 143605 + }, + { + "epoch": 0.1446455172580504, + "grad_norm": 9.074700355558805, + "learning_rate": 4.8906927186575776e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.4655172288417816, + "step": 143610 + }, + { + "epoch": 0.14465055331115456, + "grad_norm": 14.580498351142044, + "learning_rate": 4.8906811809574054e-05, + "loss": 2.2088, + "mean_token_accuracy": 0.458620685338974, + "step": 143615 + }, + { + "epoch": 0.14465558936425874, + "grad_norm": 7.764915631679862, + "learning_rate": 4.890669642663507e-05, + "loss": 2.3413, + "mean_token_accuracy": 0.4222625494003296, + "step": 143620 + }, + { + "epoch": 0.1446606254173629, + "grad_norm": 15.547314601621624, + "learning_rate": 4.8906581037758864e-05, + "loss": 2.3682, + "mean_token_accuracy": 0.40689654350280763, + "step": 143625 + }, + { + "epoch": 0.14466566147046708, + "grad_norm": 10.311099846790047, + "learning_rate": 4.890646564294547e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.4620689630508423, + "step": 143630 + }, + { + "epoch": 0.14467069752357126, + "grad_norm": 13.303215093196087, + "learning_rate": 4.89063502421949e-05, + "loss": 2.4223, + "mean_token_accuracy": 0.4379310250282288, + "step": 143635 + }, + { + "epoch": 0.14467573357667543, + "grad_norm": 11.191849086810171, + "learning_rate": 4.890623483550721e-05, + "loss": 2.3015, + "mean_token_accuracy": 0.44827585220336913, + "step": 143640 + }, + { + "epoch": 0.1446807696297796, + "grad_norm": 9.40574544389425, + "learning_rate": 4.890611942288241e-05, + "loss": 2.3691, + "mean_token_accuracy": 0.4793103337287903, + "step": 143645 + }, + { + "epoch": 0.14468580568288378, + "grad_norm": 13.39674112459057, + "learning_rate": 4.890600400432055e-05, + "loss": 1.9823, + "mean_token_accuracy": 0.4931034445762634, + "step": 143650 + }, + { + "epoch": 0.14469084173598792, + "grad_norm": 10.63053464737138, + "learning_rate": 4.8905888579821655e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.4517241418361664, + "step": 143655 + }, + { + "epoch": 0.1446958777890921, + "grad_norm": 11.041860587795977, + "learning_rate": 4.890577314938576e-05, + "loss": 2.4835, + "mean_token_accuracy": 0.42758620977401735, + "step": 143660 + }, + { + "epoch": 0.14470091384219627, + "grad_norm": 7.713130534594883, + "learning_rate": 4.890565771301289e-05, + "loss": 2.0691, + "mean_token_accuracy": 0.5229280233383179, + "step": 143665 + }, + { + "epoch": 0.14470594989530045, + "grad_norm": 12.0256046617136, + "learning_rate": 4.8905542270703086e-05, + "loss": 2.8295, + "mean_token_accuracy": 0.4482758641242981, + "step": 143670 + }, + { + "epoch": 0.14471098594840462, + "grad_norm": 11.197485021689355, + "learning_rate": 4.890542682245636e-05, + "loss": 2.5196, + "mean_token_accuracy": 0.4034482717514038, + "step": 143675 + }, + { + "epoch": 0.1447160220015088, + "grad_norm": 10.002284322971597, + "learning_rate": 4.890531136827277e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.47931034564971925, + "step": 143680 + }, + { + "epoch": 0.14472105805461297, + "grad_norm": 13.844670141628352, + "learning_rate": 4.890519590815234e-05, + "loss": 2.8464, + "mean_token_accuracy": 0.41034482717514037, + "step": 143685 + }, + { + "epoch": 0.14472609410771714, + "grad_norm": 10.90476468155278, + "learning_rate": 4.890508044209509e-05, + "loss": 2.6711, + "mean_token_accuracy": 0.39310344457626345, + "step": 143690 + }, + { + "epoch": 0.14473113016082131, + "grad_norm": 7.351219742039671, + "learning_rate": 4.890496497010107e-05, + "loss": 2.3591, + "mean_token_accuracy": 0.4413793087005615, + "step": 143695 + }, + { + "epoch": 0.1447361662139255, + "grad_norm": 8.678514876001069, + "learning_rate": 4.89048494921703e-05, + "loss": 2.5466, + "mean_token_accuracy": 0.43448275327682495, + "step": 143700 + }, + { + "epoch": 0.14474120226702966, + "grad_norm": 8.508233171386774, + "learning_rate": 4.890473400830282e-05, + "loss": 1.9673, + "mean_token_accuracy": 0.5158499598503112, + "step": 143705 + }, + { + "epoch": 0.14474623832013384, + "grad_norm": 9.973779322066303, + "learning_rate": 4.8904618518498644e-05, + "loss": 2.104, + "mean_token_accuracy": 0.49352691173553465, + "step": 143710 + }, + { + "epoch": 0.144751274373238, + "grad_norm": 13.70444786587233, + "learning_rate": 4.8904503022757824e-05, + "loss": 2.6227, + "mean_token_accuracy": 0.4172413766384125, + "step": 143715 + }, + { + "epoch": 0.14475631042634218, + "grad_norm": 9.773112128442369, + "learning_rate": 4.890438752108039e-05, + "loss": 2.3243, + "mean_token_accuracy": 0.43448275327682495, + "step": 143720 + }, + { + "epoch": 0.14476134647944636, + "grad_norm": 10.901284331543764, + "learning_rate": 4.890427201346636e-05, + "loss": 2.3806, + "mean_token_accuracy": 0.4275862157344818, + "step": 143725 + }, + { + "epoch": 0.14476638253255053, + "grad_norm": 10.053272252718848, + "learning_rate": 4.890415649991578e-05, + "loss": 2.1494, + "mean_token_accuracy": 0.4586206912994385, + "step": 143730 + }, + { + "epoch": 0.1447714185856547, + "grad_norm": 8.806981634197653, + "learning_rate": 4.890404098042868e-05, + "loss": 2.4275, + "mean_token_accuracy": 0.4551724135875702, + "step": 143735 + }, + { + "epoch": 0.14477645463875888, + "grad_norm": 9.854604057972457, + "learning_rate": 4.890392545500509e-05, + "loss": 2.1196, + "mean_token_accuracy": 0.4965517222881317, + "step": 143740 + }, + { + "epoch": 0.14478149069186305, + "grad_norm": 9.044237419350468, + "learning_rate": 4.890380992364503e-05, + "loss": 2.2412, + "mean_token_accuracy": 0.47586206793785096, + "step": 143745 + }, + { + "epoch": 0.14478652674496723, + "grad_norm": 9.760579061885334, + "learning_rate": 4.890369438634856e-05, + "loss": 2.3011, + "mean_token_accuracy": 0.41724138259887694, + "step": 143750 + }, + { + "epoch": 0.1447915627980714, + "grad_norm": 9.526871137559967, + "learning_rate": 4.890357884311569e-05, + "loss": 2.0646, + "mean_token_accuracy": 0.47586207985877993, + "step": 143755 + }, + { + "epoch": 0.14479659885117557, + "grad_norm": 9.34023446067502, + "learning_rate": 4.890346329394645e-05, + "loss": 2.427, + "mean_token_accuracy": 0.44137930274009707, + "step": 143760 + }, + { + "epoch": 0.14480163490427975, + "grad_norm": 10.966220555941506, + "learning_rate": 4.8903347738840885e-05, + "loss": 2.6758, + "mean_token_accuracy": 0.35862068831920624, + "step": 143765 + }, + { + "epoch": 0.14480667095738392, + "grad_norm": 9.180049688747873, + "learning_rate": 4.8903232177799016e-05, + "loss": 2.6232, + "mean_token_accuracy": 0.4310344815254211, + "step": 143770 + }, + { + "epoch": 0.1448117070104881, + "grad_norm": 9.935572026306854, + "learning_rate": 4.890311661082089e-05, + "loss": 2.2243, + "mean_token_accuracy": 0.4344827592372894, + "step": 143775 + }, + { + "epoch": 0.14481674306359227, + "grad_norm": 10.180242047202107, + "learning_rate": 4.890300103790653e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.4620689630508423, + "step": 143780 + }, + { + "epoch": 0.14482177911669644, + "grad_norm": 10.296106936531059, + "learning_rate": 4.890288545905596e-05, + "loss": 2.6595, + "mean_token_accuracy": 0.4068965494632721, + "step": 143785 + }, + { + "epoch": 0.14482681516980062, + "grad_norm": 10.871421274096532, + "learning_rate": 4.890276987426923e-05, + "loss": 2.3724, + "mean_token_accuracy": 0.44827587008476255, + "step": 143790 + }, + { + "epoch": 0.14483185122290476, + "grad_norm": 11.610330271156743, + "learning_rate": 4.890265428354635e-05, + "loss": 2.5323, + "mean_token_accuracy": 0.3999999940395355, + "step": 143795 + }, + { + "epoch": 0.14483688727600894, + "grad_norm": 9.325752887287116, + "learning_rate": 4.8902538686887374e-05, + "loss": 2.2223, + "mean_token_accuracy": 0.4689655125141144, + "step": 143800 + }, + { + "epoch": 0.1448419233291131, + "grad_norm": 10.764415738498748, + "learning_rate": 4.890242308429231e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.4103448331356049, + "step": 143805 + }, + { + "epoch": 0.14484695938221728, + "grad_norm": 11.105961830874552, + "learning_rate": 4.890230747576122e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.46551724076271056, + "step": 143810 + }, + { + "epoch": 0.14485199543532146, + "grad_norm": 9.875217497119914, + "learning_rate": 4.890219186129412e-05, + "loss": 2.1929, + "mean_token_accuracy": 0.47241379618644713, + "step": 143815 + }, + { + "epoch": 0.14485703148842563, + "grad_norm": 8.880986339448448, + "learning_rate": 4.890207624089103e-05, + "loss": 2.303, + "mean_token_accuracy": 0.47931034564971925, + "step": 143820 + }, + { + "epoch": 0.1448620675415298, + "grad_norm": 10.336485399460186, + "learning_rate": 4.8901960614552e-05, + "loss": 2.492, + "mean_token_accuracy": 0.4344827651977539, + "step": 143825 + }, + { + "epoch": 0.14486710359463398, + "grad_norm": 10.687870028820543, + "learning_rate": 4.890184498227706e-05, + "loss": 2.5015, + "mean_token_accuracy": 0.358620685338974, + "step": 143830 + }, + { + "epoch": 0.14487213964773815, + "grad_norm": 10.55102736755888, + "learning_rate": 4.8901729344066226e-05, + "loss": 2.3304, + "mean_token_accuracy": 0.4068965494632721, + "step": 143835 + }, + { + "epoch": 0.14487717570084233, + "grad_norm": 8.726308751882007, + "learning_rate": 4.8901613699919555e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.44827585816383364, + "step": 143840 + }, + { + "epoch": 0.1448822117539465, + "grad_norm": 8.48515825576751, + "learning_rate": 4.890149804983707e-05, + "loss": 2.4447, + "mean_token_accuracy": 0.35862069129943847, + "step": 143845 + }, + { + "epoch": 0.14488724780705067, + "grad_norm": 9.840174812097512, + "learning_rate": 4.8901382393818783e-05, + "loss": 2.3096, + "mean_token_accuracy": 0.4379310369491577, + "step": 143850 + }, + { + "epoch": 0.14489228386015485, + "grad_norm": 10.298639407294521, + "learning_rate": 4.890126673186476e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.42758620977401735, + "step": 143855 + }, + { + "epoch": 0.14489731991325902, + "grad_norm": 12.306937003980694, + "learning_rate": 4.8901151063975e-05, + "loss": 2.5047, + "mean_token_accuracy": 0.3655172407627106, + "step": 143860 + }, + { + "epoch": 0.1449023559663632, + "grad_norm": 8.951201298382005, + "learning_rate": 4.890103539014957e-05, + "loss": 2.1799, + "mean_token_accuracy": 0.4745916426181793, + "step": 143865 + }, + { + "epoch": 0.14490739201946737, + "grad_norm": 8.840214351787456, + "learning_rate": 4.8900919710388464e-05, + "loss": 2.7421, + "mean_token_accuracy": 0.3620689660310745, + "step": 143870 + }, + { + "epoch": 0.14491242807257154, + "grad_norm": 10.873226613590463, + "learning_rate": 4.8900804024691744e-05, + "loss": 2.2906, + "mean_token_accuracy": 0.4034482717514038, + "step": 143875 + }, + { + "epoch": 0.14491746412567572, + "grad_norm": 9.147833652217198, + "learning_rate": 4.890068833305943e-05, + "loss": 2.1011, + "mean_token_accuracy": 0.47931034564971925, + "step": 143880 + }, + { + "epoch": 0.1449225001787799, + "grad_norm": 8.761168710927961, + "learning_rate": 4.890057263549155e-05, + "loss": 2.6739, + "mean_token_accuracy": 0.37931033968925476, + "step": 143885 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 10.937823363854655, + "learning_rate": 4.890045693198815e-05, + "loss": 2.4387, + "mean_token_accuracy": 0.39655172228813174, + "step": 143890 + }, + { + "epoch": 0.14493257228498824, + "grad_norm": 9.293145269453115, + "learning_rate": 4.890034122254925e-05, + "loss": 2.1434, + "mean_token_accuracy": 0.4862068951129913, + "step": 143895 + }, + { + "epoch": 0.1449376083380924, + "grad_norm": 10.425609436580242, + "learning_rate": 4.890022550717489e-05, + "loss": 2.3657, + "mean_token_accuracy": 0.4310344815254211, + "step": 143900 + }, + { + "epoch": 0.14494264439119658, + "grad_norm": 9.38570091352509, + "learning_rate": 4.890010978586509e-05, + "loss": 2.411, + "mean_token_accuracy": 0.41034482717514037, + "step": 143905 + }, + { + "epoch": 0.14494768044430076, + "grad_norm": 8.32616852429847, + "learning_rate": 4.8899994058619884e-05, + "loss": 2.4168, + "mean_token_accuracy": 0.39655172228813174, + "step": 143910 + }, + { + "epoch": 0.14495271649740493, + "grad_norm": 9.779628102895474, + "learning_rate": 4.8899878325439324e-05, + "loss": 2.1989, + "mean_token_accuracy": 0.4379310369491577, + "step": 143915 + }, + { + "epoch": 0.1449577525505091, + "grad_norm": 10.694421346236242, + "learning_rate": 4.889976258632342e-05, + "loss": 2.5396, + "mean_token_accuracy": 0.3793103456497192, + "step": 143920 + }, + { + "epoch": 0.14496278860361328, + "grad_norm": 8.627128455322516, + "learning_rate": 4.889964684127221e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.4620689630508423, + "step": 143925 + }, + { + "epoch": 0.14496782465671745, + "grad_norm": 15.435647028289457, + "learning_rate": 4.889953109028573e-05, + "loss": 2.7457, + "mean_token_accuracy": 0.4344827592372894, + "step": 143930 + }, + { + "epoch": 0.1449728607098216, + "grad_norm": 14.648393878955599, + "learning_rate": 4.8899415333364015e-05, + "loss": 2.563, + "mean_token_accuracy": 0.42758620381355283, + "step": 143935 + }, + { + "epoch": 0.14497789676292577, + "grad_norm": 10.800983892020598, + "learning_rate": 4.889929957050709e-05, + "loss": 2.5798, + "mean_token_accuracy": 0.40344828367233276, + "step": 143940 + }, + { + "epoch": 0.14498293281602995, + "grad_norm": 11.240491480234054, + "learning_rate": 4.889918380171499e-05, + "loss": 2.0261, + "mean_token_accuracy": 0.48965516686439514, + "step": 143945 + }, + { + "epoch": 0.14498796886913412, + "grad_norm": 12.04618851131278, + "learning_rate": 4.889906802698775e-05, + "loss": 2.5256, + "mean_token_accuracy": 0.40344828367233276, + "step": 143950 + }, + { + "epoch": 0.1449930049222383, + "grad_norm": 10.037136549439175, + "learning_rate": 4.889895224632539e-05, + "loss": 2.4872, + "mean_token_accuracy": 0.4241379380226135, + "step": 143955 + }, + { + "epoch": 0.14499804097534247, + "grad_norm": 10.301120866236207, + "learning_rate": 4.889883645972795e-05, + "loss": 2.7662, + "mean_token_accuracy": 0.3876587986946106, + "step": 143960 + }, + { + "epoch": 0.14500307702844664, + "grad_norm": 10.856067419009127, + "learning_rate": 4.889872066719547e-05, + "loss": 2.3838, + "mean_token_accuracy": 0.4, + "step": 143965 + }, + { + "epoch": 0.14500811308155082, + "grad_norm": 9.641385640304472, + "learning_rate": 4.889860486872797e-05, + "loss": 2.4189, + "mean_token_accuracy": 0.4068965554237366, + "step": 143970 + }, + { + "epoch": 0.145013149134655, + "grad_norm": 11.699035947852236, + "learning_rate": 4.8898489064325494e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.44827585220336913, + "step": 143975 + }, + { + "epoch": 0.14501818518775916, + "grad_norm": 11.057039924185045, + "learning_rate": 4.889837325398806e-05, + "loss": 2.1407, + "mean_token_accuracy": 0.4620689690113068, + "step": 143980 + }, + { + "epoch": 0.14502322124086334, + "grad_norm": 10.45350702650619, + "learning_rate": 4.889825743771571e-05, + "loss": 2.2872, + "mean_token_accuracy": 0.4551724135875702, + "step": 143985 + }, + { + "epoch": 0.1450282572939675, + "grad_norm": 12.202902585242608, + "learning_rate": 4.8898141615508476e-05, + "loss": 2.8517, + "mean_token_accuracy": 0.41034482717514037, + "step": 143990 + }, + { + "epoch": 0.14503329334707168, + "grad_norm": 14.710312929269715, + "learning_rate": 4.889802578736638e-05, + "loss": 2.2663, + "mean_token_accuracy": 0.4551724076271057, + "step": 143995 + }, + { + "epoch": 0.14503832940017586, + "grad_norm": 12.836859931946204, + "learning_rate": 4.8897909953289466e-05, + "loss": 2.4801, + "mean_token_accuracy": 0.3896551728248596, + "step": 144000 + }, + { + "epoch": 0.14504336545328003, + "grad_norm": 9.863589158517167, + "learning_rate": 4.889779411327777e-05, + "loss": 2.4554, + "mean_token_accuracy": 0.4034482717514038, + "step": 144005 + }, + { + "epoch": 0.1450484015063842, + "grad_norm": 10.612544796577074, + "learning_rate": 4.889767826733131e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.41379310488700866, + "step": 144010 + }, + { + "epoch": 0.14505343755948838, + "grad_norm": 10.389338223339522, + "learning_rate": 4.889756241545012e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.3793103456497192, + "step": 144015 + }, + { + "epoch": 0.14505847361259255, + "grad_norm": 9.35086172850557, + "learning_rate": 4.889744655763424e-05, + "loss": 2.1159, + "mean_token_accuracy": 0.4379310369491577, + "step": 144020 + }, + { + "epoch": 0.14506350966569673, + "grad_norm": 13.299832752644116, + "learning_rate": 4.88973306938837e-05, + "loss": 2.2956, + "mean_token_accuracy": 0.5, + "step": 144025 + }, + { + "epoch": 0.1450685457188009, + "grad_norm": 10.985377744809222, + "learning_rate": 4.889721482419852e-05, + "loss": 2.6731, + "mean_token_accuracy": 0.37931033968925476, + "step": 144030 + }, + { + "epoch": 0.14507358177190507, + "grad_norm": 19.253025299708447, + "learning_rate": 4.889709894857875e-05, + "loss": 2.1133, + "mean_token_accuracy": 0.42413792610168455, + "step": 144035 + }, + { + "epoch": 0.14507861782500925, + "grad_norm": 13.273572952704141, + "learning_rate": 4.889698306702442e-05, + "loss": 3.144, + "mean_token_accuracy": 0.34137930274009703, + "step": 144040 + }, + { + "epoch": 0.14508365387811342, + "grad_norm": 11.204478365249356, + "learning_rate": 4.889686717953556e-05, + "loss": 2.5541, + "mean_token_accuracy": 0.37586206793785093, + "step": 144045 + }, + { + "epoch": 0.1450886899312176, + "grad_norm": 9.790070463210167, + "learning_rate": 4.889675128611218e-05, + "loss": 2.7196, + "mean_token_accuracy": 0.4188142716884613, + "step": 144050 + }, + { + "epoch": 0.14509372598432177, + "grad_norm": 9.169571866707972, + "learning_rate": 4.889663538675435e-05, + "loss": 2.6706, + "mean_token_accuracy": 0.4, + "step": 144055 + }, + { + "epoch": 0.14509876203742594, + "grad_norm": 9.731395408579983, + "learning_rate": 4.889651948146207e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.4310344815254211, + "step": 144060 + }, + { + "epoch": 0.14510379809053012, + "grad_norm": 10.655534593442033, + "learning_rate": 4.88964035702354e-05, + "loss": 2.2562, + "mean_token_accuracy": 0.4534785211086273, + "step": 144065 + }, + { + "epoch": 0.1451088341436343, + "grad_norm": 9.43224057953019, + "learning_rate": 4.889628765307435e-05, + "loss": 2.0552, + "mean_token_accuracy": 0.4931034445762634, + "step": 144070 + }, + { + "epoch": 0.14511387019673844, + "grad_norm": 11.139666092470963, + "learning_rate": 4.889617172997895e-05, + "loss": 2.4151, + "mean_token_accuracy": 0.4586206912994385, + "step": 144075 + }, + { + "epoch": 0.1451189062498426, + "grad_norm": 10.036144004922507, + "learning_rate": 4.889605580094925e-05, + "loss": 2.6416, + "mean_token_accuracy": 0.4449485898017883, + "step": 144080 + }, + { + "epoch": 0.14512394230294678, + "grad_norm": 10.019564781152859, + "learning_rate": 4.8895939865985276e-05, + "loss": 2.4254, + "mean_token_accuracy": 0.4068965494632721, + "step": 144085 + }, + { + "epoch": 0.14512897835605096, + "grad_norm": 11.033667837102554, + "learning_rate": 4.889582392508705e-05, + "loss": 2.3985, + "mean_token_accuracy": 0.44482758045196535, + "step": 144090 + }, + { + "epoch": 0.14513401440915513, + "grad_norm": 12.277800521706919, + "learning_rate": 4.889570797825463e-05, + "loss": 2.2557, + "mean_token_accuracy": 0.4344827592372894, + "step": 144095 + }, + { + "epoch": 0.1451390504622593, + "grad_norm": 10.14419244234258, + "learning_rate": 4.889559202548801e-05, + "loss": 2.2994, + "mean_token_accuracy": 0.39655172228813174, + "step": 144100 + }, + { + "epoch": 0.14514408651536348, + "grad_norm": 14.973001143199081, + "learning_rate": 4.889547606678726e-05, + "loss": 2.7306, + "mean_token_accuracy": 0.3965517163276672, + "step": 144105 + }, + { + "epoch": 0.14514912256846765, + "grad_norm": 9.416034107534177, + "learning_rate": 4.889536010215238e-05, + "loss": 2.1685, + "mean_token_accuracy": 0.4586206912994385, + "step": 144110 + }, + { + "epoch": 0.14515415862157183, + "grad_norm": 11.049515826058766, + "learning_rate": 4.889524413158343e-05, + "loss": 2.6506, + "mean_token_accuracy": 0.334482753276825, + "step": 144115 + }, + { + "epoch": 0.145159194674676, + "grad_norm": 9.46568139107262, + "learning_rate": 4.8895128155080425e-05, + "loss": 2.3446, + "mean_token_accuracy": 0.4286751329898834, + "step": 144120 + }, + { + "epoch": 0.14516423072778017, + "grad_norm": 10.525009490374117, + "learning_rate": 4.88950121726434e-05, + "loss": 2.5606, + "mean_token_accuracy": 0.4344827592372894, + "step": 144125 + }, + { + "epoch": 0.14516926678088435, + "grad_norm": 12.206420934587465, + "learning_rate": 4.889489618427239e-05, + "loss": 2.2318, + "mean_token_accuracy": 0.44482758045196535, + "step": 144130 + }, + { + "epoch": 0.14517430283398852, + "grad_norm": 8.235072848658808, + "learning_rate": 4.889478018996742e-05, + "loss": 2.4855, + "mean_token_accuracy": 0.40199636816978457, + "step": 144135 + }, + { + "epoch": 0.1451793388870927, + "grad_norm": 15.38345435333593, + "learning_rate": 4.889466418972854e-05, + "loss": 2.3603, + "mean_token_accuracy": 0.482758617401123, + "step": 144140 + }, + { + "epoch": 0.14518437494019687, + "grad_norm": 10.226437540212723, + "learning_rate": 4.889454818355576e-05, + "loss": 2.2697, + "mean_token_accuracy": 0.495099812746048, + "step": 144145 + }, + { + "epoch": 0.14518941099330104, + "grad_norm": 10.255810840952867, + "learning_rate": 4.889443217144913e-05, + "loss": 2.3935, + "mean_token_accuracy": 0.4, + "step": 144150 + }, + { + "epoch": 0.14519444704640522, + "grad_norm": 9.170845774254225, + "learning_rate": 4.8894316153408665e-05, + "loss": 2.5126, + "mean_token_accuracy": 0.4172413766384125, + "step": 144155 + }, + { + "epoch": 0.1451994830995094, + "grad_norm": 13.151268072543482, + "learning_rate": 4.889420012943441e-05, + "loss": 2.662, + "mean_token_accuracy": 0.4206896543502808, + "step": 144160 + }, + { + "epoch": 0.14520451915261356, + "grad_norm": 8.869687218031247, + "learning_rate": 4.889408409952639e-05, + "loss": 2.1596, + "mean_token_accuracy": 0.44827585816383364, + "step": 144165 + }, + { + "epoch": 0.14520955520571774, + "grad_norm": 9.361657593129753, + "learning_rate": 4.889396806368465e-05, + "loss": 2.6107, + "mean_token_accuracy": 0.4082274615764618, + "step": 144170 + }, + { + "epoch": 0.1452145912588219, + "grad_norm": 11.213247843621472, + "learning_rate": 4.889385202190921e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.4413793087005615, + "step": 144175 + }, + { + "epoch": 0.14521962731192609, + "grad_norm": 21.232124236132805, + "learning_rate": 4.88937359742001e-05, + "loss": 3.0354, + "mean_token_accuracy": 0.44137930274009707, + "step": 144180 + }, + { + "epoch": 0.14522466336503026, + "grad_norm": 9.486034007939356, + "learning_rate": 4.8893619920557366e-05, + "loss": 2.5019, + "mean_token_accuracy": 0.41034482717514037, + "step": 144185 + }, + { + "epoch": 0.14522969941813443, + "grad_norm": 11.67197113763365, + "learning_rate": 4.889350386098103e-05, + "loss": 2.3774, + "mean_token_accuracy": 0.4294010937213898, + "step": 144190 + }, + { + "epoch": 0.1452347354712386, + "grad_norm": 10.244797816806349, + "learning_rate": 4.889338779547112e-05, + "loss": 2.4288, + "mean_token_accuracy": 0.4034482717514038, + "step": 144195 + }, + { + "epoch": 0.14523977152434278, + "grad_norm": 9.340278528944898, + "learning_rate": 4.8893271724027686e-05, + "loss": 2.3846, + "mean_token_accuracy": 0.40508167147636415, + "step": 144200 + }, + { + "epoch": 0.14524480757744695, + "grad_norm": 9.9208010306362, + "learning_rate": 4.8893155646650734e-05, + "loss": 2.325, + "mean_token_accuracy": 0.42068966031074523, + "step": 144205 + }, + { + "epoch": 0.14524984363055113, + "grad_norm": 10.508248707737343, + "learning_rate": 4.889303956334032e-05, + "loss": 2.3219, + "mean_token_accuracy": 0.41724138259887694, + "step": 144210 + }, + { + "epoch": 0.14525487968365527, + "grad_norm": 10.969547268367997, + "learning_rate": 4.889292347409646e-05, + "loss": 2.5017, + "mean_token_accuracy": 0.4068965494632721, + "step": 144215 + }, + { + "epoch": 0.14525991573675945, + "grad_norm": 8.92339435587438, + "learning_rate": 4.889280737891919e-05, + "loss": 2.3111, + "mean_token_accuracy": 0.42758620381355283, + "step": 144220 + }, + { + "epoch": 0.14526495178986362, + "grad_norm": 12.869771875676484, + "learning_rate": 4.8892691277808554e-05, + "loss": 2.8821, + "mean_token_accuracy": 0.4103448212146759, + "step": 144225 + }, + { + "epoch": 0.1452699878429678, + "grad_norm": 9.532101150508655, + "learning_rate": 4.889257517076458e-05, + "loss": 2.202, + "mean_token_accuracy": 0.4275861978530884, + "step": 144230 + }, + { + "epoch": 0.14527502389607197, + "grad_norm": 9.401701099697474, + "learning_rate": 4.889245905778728e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.41034482717514037, + "step": 144235 + }, + { + "epoch": 0.14528005994917614, + "grad_norm": 10.289291533082476, + "learning_rate": 4.8892342938876704e-05, + "loss": 2.3804, + "mean_token_accuracy": 0.44827585816383364, + "step": 144240 + }, + { + "epoch": 0.14528509600228032, + "grad_norm": 9.365251462868741, + "learning_rate": 4.889222681403289e-05, + "loss": 2.5318, + "mean_token_accuracy": 0.41240169703960416, + "step": 144245 + }, + { + "epoch": 0.1452901320553845, + "grad_norm": 14.171632730720827, + "learning_rate": 4.889211068325587e-05, + "loss": 2.3127, + "mean_token_accuracy": 0.48275862336158754, + "step": 144250 + }, + { + "epoch": 0.14529516810848866, + "grad_norm": 27.186534065406306, + "learning_rate": 4.889199454654565e-05, + "loss": 3.1015, + "mean_token_accuracy": 0.38965516686439516, + "step": 144255 + }, + { + "epoch": 0.14530020416159284, + "grad_norm": 12.761725413589406, + "learning_rate": 4.8891878403902286e-05, + "loss": 2.1759, + "mean_token_accuracy": 0.47241379618644713, + "step": 144260 + }, + { + "epoch": 0.145305240214697, + "grad_norm": 12.483021539479228, + "learning_rate": 4.889176225532581e-05, + "loss": 2.8166, + "mean_token_accuracy": 0.3840290367603302, + "step": 144265 + }, + { + "epoch": 0.14531027626780119, + "grad_norm": 8.647452654036163, + "learning_rate": 4.8891646100816244e-05, + "loss": 2.0301, + "mean_token_accuracy": 0.4862069010734558, + "step": 144270 + }, + { + "epoch": 0.14531531232090536, + "grad_norm": 8.676734337797246, + "learning_rate": 4.8891529940373624e-05, + "loss": 2.4508, + "mean_token_accuracy": 0.44137930274009707, + "step": 144275 + }, + { + "epoch": 0.14532034837400953, + "grad_norm": 10.376473478992587, + "learning_rate": 4.8891413773997985e-05, + "loss": 2.3951, + "mean_token_accuracy": 0.404779189825058, + "step": 144280 + }, + { + "epoch": 0.1453253844271137, + "grad_norm": 11.34289837909769, + "learning_rate": 4.889129760168936e-05, + "loss": 2.6268, + "mean_token_accuracy": 0.38620689511299133, + "step": 144285 + }, + { + "epoch": 0.14533042048021788, + "grad_norm": 11.995882775810296, + "learning_rate": 4.889118142344778e-05, + "loss": 2.4344, + "mean_token_accuracy": 0.4379310250282288, + "step": 144290 + }, + { + "epoch": 0.14533545653332205, + "grad_norm": 9.681560242490411, + "learning_rate": 4.8891065239273277e-05, + "loss": 2.5085, + "mean_token_accuracy": 0.42413792610168455, + "step": 144295 + }, + { + "epoch": 0.14534049258642623, + "grad_norm": 12.58500341296998, + "learning_rate": 4.889094904916588e-05, + "loss": 2.6026, + "mean_token_accuracy": 0.41724138259887694, + "step": 144300 + }, + { + "epoch": 0.1453455286395304, + "grad_norm": 11.11766631688563, + "learning_rate": 4.889083285312562e-05, + "loss": 2.4082, + "mean_token_accuracy": 0.45517240166664125, + "step": 144305 + }, + { + "epoch": 0.14535056469263458, + "grad_norm": 13.12004566989439, + "learning_rate": 4.889071665115253e-05, + "loss": 2.0867, + "mean_token_accuracy": 0.47241379618644713, + "step": 144310 + }, + { + "epoch": 0.14535560074573875, + "grad_norm": 9.25192071354174, + "learning_rate": 4.889060044324665e-05, + "loss": 2.2359, + "mean_token_accuracy": 0.4517241299152374, + "step": 144315 + }, + { + "epoch": 0.14536063679884292, + "grad_norm": 8.62132500190601, + "learning_rate": 4.889048422940801e-05, + "loss": 2.434, + "mean_token_accuracy": 0.42413793206214906, + "step": 144320 + }, + { + "epoch": 0.1453656728519471, + "grad_norm": 10.00444366000446, + "learning_rate": 4.889036800963664e-05, + "loss": 2.359, + "mean_token_accuracy": 0.42413793206214906, + "step": 144325 + }, + { + "epoch": 0.14537070890505127, + "grad_norm": 10.381742255197999, + "learning_rate": 4.8890251783932565e-05, + "loss": 2.7726, + "mean_token_accuracy": 0.3793103516101837, + "step": 144330 + }, + { + "epoch": 0.14537574495815545, + "grad_norm": 10.554536801910256, + "learning_rate": 4.889013555229583e-05, + "loss": 2.0251, + "mean_token_accuracy": 0.4931034505367279, + "step": 144335 + }, + { + "epoch": 0.14538078101125962, + "grad_norm": 10.825861935288485, + "learning_rate": 4.8890019314726455e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4275862127542496, + "step": 144340 + }, + { + "epoch": 0.1453858170643638, + "grad_norm": 10.392230735549658, + "learning_rate": 4.888990307122448e-05, + "loss": 2.3777, + "mean_token_accuracy": 0.4, + "step": 144345 + }, + { + "epoch": 0.14539085311746797, + "grad_norm": 11.413451660256577, + "learning_rate": 4.888978682178994e-05, + "loss": 2.4745, + "mean_token_accuracy": 0.3620689570903778, + "step": 144350 + }, + { + "epoch": 0.1453958891705721, + "grad_norm": 11.464395644345553, + "learning_rate": 4.8889670566422856e-05, + "loss": 2.1961, + "mean_token_accuracy": 0.4482758641242981, + "step": 144355 + }, + { + "epoch": 0.1454009252236763, + "grad_norm": 10.265047272305177, + "learning_rate": 4.888955430512328e-05, + "loss": 2.168, + "mean_token_accuracy": 0.482758629322052, + "step": 144360 + }, + { + "epoch": 0.14540596127678046, + "grad_norm": 12.192577203677107, + "learning_rate": 4.8889438037891216e-05, + "loss": 2.2678, + "mean_token_accuracy": 0.4620689690113068, + "step": 144365 + }, + { + "epoch": 0.14541099732988463, + "grad_norm": 10.837831977032371, + "learning_rate": 4.888932176472672e-05, + "loss": 2.2325, + "mean_token_accuracy": 0.4173645317554474, + "step": 144370 + }, + { + "epoch": 0.1454160333829888, + "grad_norm": 7.604900852444863, + "learning_rate": 4.8889205485629806e-05, + "loss": 2.3525, + "mean_token_accuracy": 0.43793103098869324, + "step": 144375 + }, + { + "epoch": 0.14542106943609298, + "grad_norm": 9.625856543534868, + "learning_rate": 4.888908920060053e-05, + "loss": 2.7043, + "mean_token_accuracy": 0.43103447556495667, + "step": 144380 + }, + { + "epoch": 0.14542610548919715, + "grad_norm": 12.043598692983185, + "learning_rate": 4.88889729096389e-05, + "loss": 2.0682, + "mean_token_accuracy": 0.48275862336158754, + "step": 144385 + }, + { + "epoch": 0.14543114154230133, + "grad_norm": 11.3587931051436, + "learning_rate": 4.8888856612744974e-05, + "loss": 2.5668, + "mean_token_accuracy": 0.35862069129943847, + "step": 144390 + }, + { + "epoch": 0.1454361775954055, + "grad_norm": 9.963673080854955, + "learning_rate": 4.888874030991875e-05, + "loss": 2.5672, + "mean_token_accuracy": 0.47586206793785096, + "step": 144395 + }, + { + "epoch": 0.14544121364850968, + "grad_norm": 12.708974744406197, + "learning_rate": 4.888862400116028e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.42068966031074523, + "step": 144400 + }, + { + "epoch": 0.14544624970161385, + "grad_norm": 9.616563885099296, + "learning_rate": 4.88885076864696e-05, + "loss": 2.7389, + "mean_token_accuracy": 0.36896551251411436, + "step": 144405 + }, + { + "epoch": 0.14545128575471802, + "grad_norm": 9.197532718057804, + "learning_rate": 4.888839136584675e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.5068965375423431, + "step": 144410 + }, + { + "epoch": 0.1454563218078222, + "grad_norm": 11.832377091990546, + "learning_rate": 4.888827503929174e-05, + "loss": 2.6893, + "mean_token_accuracy": 0.4517241358757019, + "step": 144415 + }, + { + "epoch": 0.14546135786092637, + "grad_norm": 9.884544140312132, + "learning_rate": 4.888815870680461e-05, + "loss": 2.376, + "mean_token_accuracy": 0.42758620381355283, + "step": 144420 + }, + { + "epoch": 0.14546639391403055, + "grad_norm": 11.07355858223955, + "learning_rate": 4.8888042368385395e-05, + "loss": 2.3433, + "mean_token_accuracy": 0.4172413766384125, + "step": 144425 + }, + { + "epoch": 0.14547142996713472, + "grad_norm": 10.661454421438036, + "learning_rate": 4.8887926024034134e-05, + "loss": 2.2484, + "mean_token_accuracy": 0.39310344457626345, + "step": 144430 + }, + { + "epoch": 0.1454764660202389, + "grad_norm": 10.424891466571518, + "learning_rate": 4.888780967375084e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.4655172288417816, + "step": 144435 + }, + { + "epoch": 0.14548150207334307, + "grad_norm": 15.509448158895806, + "learning_rate": 4.888769331753556e-05, + "loss": 2.6166, + "mean_token_accuracy": 0.41724138259887694, + "step": 144440 + }, + { + "epoch": 0.14548653812644724, + "grad_norm": 10.743288438907072, + "learning_rate": 4.888757695538833e-05, + "loss": 2.4881, + "mean_token_accuracy": 0.4551724135875702, + "step": 144445 + }, + { + "epoch": 0.14549157417955141, + "grad_norm": 10.242524086885805, + "learning_rate": 4.888746058730917e-05, + "loss": 2.2962, + "mean_token_accuracy": 0.42758620381355283, + "step": 144450 + }, + { + "epoch": 0.1454966102326556, + "grad_norm": 11.292890722852055, + "learning_rate": 4.888734421329812e-05, + "loss": 2.3614, + "mean_token_accuracy": 0.4448275983333588, + "step": 144455 + }, + { + "epoch": 0.14550164628575976, + "grad_norm": 10.741908329471995, + "learning_rate": 4.888722783335521e-05, + "loss": 2.2343, + "mean_token_accuracy": 0.4965517222881317, + "step": 144460 + }, + { + "epoch": 0.14550668233886394, + "grad_norm": 9.362354509511386, + "learning_rate": 4.888711144748047e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4724137902259827, + "step": 144465 + }, + { + "epoch": 0.1455117183919681, + "grad_norm": 12.36193028359177, + "learning_rate": 4.888699505567394e-05, + "loss": 2.1978, + "mean_token_accuracy": 0.47931033968925474, + "step": 144470 + }, + { + "epoch": 0.14551675444507228, + "grad_norm": 10.337179575992268, + "learning_rate": 4.888687865793565e-05, + "loss": 2.4558, + "mean_token_accuracy": 0.36896551847457887, + "step": 144475 + }, + { + "epoch": 0.14552179049817646, + "grad_norm": 10.519706713246107, + "learning_rate": 4.888676225426562e-05, + "loss": 2.3247, + "mean_token_accuracy": 0.44482759237289426, + "step": 144480 + }, + { + "epoch": 0.14552682655128063, + "grad_norm": 10.951551038644812, + "learning_rate": 4.8886645844663895e-05, + "loss": 2.2377, + "mean_token_accuracy": 0.4379310369491577, + "step": 144485 + }, + { + "epoch": 0.1455318626043848, + "grad_norm": 12.437850575141454, + "learning_rate": 4.8886529429130506e-05, + "loss": 2.4488, + "mean_token_accuracy": 0.4620689690113068, + "step": 144490 + }, + { + "epoch": 0.14553689865748895, + "grad_norm": 16.517742840874035, + "learning_rate": 4.888641300766549e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.4862069010734558, + "step": 144495 + }, + { + "epoch": 0.14554193471059312, + "grad_norm": 10.909007851881578, + "learning_rate": 4.888629658026886e-05, + "loss": 2.5681, + "mean_token_accuracy": 0.4172413766384125, + "step": 144500 + }, + { + "epoch": 0.1455469707636973, + "grad_norm": 10.717016654914923, + "learning_rate": 4.8886180146940665e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.44313369393348695, + "step": 144505 + }, + { + "epoch": 0.14555200681680147, + "grad_norm": 10.697570069704058, + "learning_rate": 4.888606370768093e-05, + "loss": 2.4675, + "mean_token_accuracy": 0.43448275327682495, + "step": 144510 + }, + { + "epoch": 0.14555704286990565, + "grad_norm": 9.545413177348333, + "learning_rate": 4.8885947262489704e-05, + "loss": 2.2167, + "mean_token_accuracy": 0.4344827592372894, + "step": 144515 + }, + { + "epoch": 0.14556207892300982, + "grad_norm": 9.894923804973, + "learning_rate": 4.8885830811366986e-05, + "loss": 2.0823, + "mean_token_accuracy": 0.4517241418361664, + "step": 144520 + }, + { + "epoch": 0.145567114976114, + "grad_norm": 11.294093033446769, + "learning_rate": 4.888571435431284e-05, + "loss": 2.065, + "mean_token_accuracy": 0.4896551728248596, + "step": 144525 + }, + { + "epoch": 0.14557215102921817, + "grad_norm": 14.935364140106937, + "learning_rate": 4.8885597891327285e-05, + "loss": 2.6554, + "mean_token_accuracy": 0.3931034505367279, + "step": 144530 + }, + { + "epoch": 0.14557718708232234, + "grad_norm": 10.080596441750812, + "learning_rate": 4.888548142241035e-05, + "loss": 2.432, + "mean_token_accuracy": 0.4137930989265442, + "step": 144535 + }, + { + "epoch": 0.14558222313542651, + "grad_norm": 10.142654123478007, + "learning_rate": 4.888536494756208e-05, + "loss": 2.2946, + "mean_token_accuracy": 0.4586206912994385, + "step": 144540 + }, + { + "epoch": 0.1455872591885307, + "grad_norm": 9.648018681217982, + "learning_rate": 4.88852484667825e-05, + "loss": 2.0938, + "mean_token_accuracy": 0.458620685338974, + "step": 144545 + }, + { + "epoch": 0.14559229524163486, + "grad_norm": 10.32925051710693, + "learning_rate": 4.888513198007164e-05, + "loss": 2.2811, + "mean_token_accuracy": 0.4620689690113068, + "step": 144550 + }, + { + "epoch": 0.14559733129473904, + "grad_norm": 10.24128686709177, + "learning_rate": 4.888501548742952e-05, + "loss": 2.3273, + "mean_token_accuracy": 0.4551724076271057, + "step": 144555 + }, + { + "epoch": 0.1456023673478432, + "grad_norm": 10.271126002448788, + "learning_rate": 4.8884898988856205e-05, + "loss": 2.5657, + "mean_token_accuracy": 0.41724138259887694, + "step": 144560 + }, + { + "epoch": 0.14560740340094738, + "grad_norm": 10.905717947339431, + "learning_rate": 4.8884782484351695e-05, + "loss": 2.6367, + "mean_token_accuracy": 0.3931034505367279, + "step": 144565 + }, + { + "epoch": 0.14561243945405156, + "grad_norm": 8.531675511345025, + "learning_rate": 4.888466597391604e-05, + "loss": 2.0644, + "mean_token_accuracy": 0.4931034445762634, + "step": 144570 + }, + { + "epoch": 0.14561747550715573, + "grad_norm": 9.93483498824128, + "learning_rate": 4.8884549457549265e-05, + "loss": 2.3304, + "mean_token_accuracy": 0.4379310369491577, + "step": 144575 + }, + { + "epoch": 0.1456225115602599, + "grad_norm": 11.004631565552463, + "learning_rate": 4.888443293525141e-05, + "loss": 2.1023, + "mean_token_accuracy": 0.441379314661026, + "step": 144580 + }, + { + "epoch": 0.14562754761336408, + "grad_norm": 8.735859471364368, + "learning_rate": 4.8884316407022505e-05, + "loss": 2.4805, + "mean_token_accuracy": 0.4551724076271057, + "step": 144585 + }, + { + "epoch": 0.14563258366646825, + "grad_norm": 10.749132653151824, + "learning_rate": 4.888419987286257e-05, + "loss": 2.4813, + "mean_token_accuracy": 0.4310344815254211, + "step": 144590 + }, + { + "epoch": 0.14563761971957243, + "grad_norm": 9.67260251110979, + "learning_rate": 4.888408333277166e-05, + "loss": 2.0654, + "mean_token_accuracy": 0.46206897497177124, + "step": 144595 + }, + { + "epoch": 0.1456426557726766, + "grad_norm": 10.701243916762587, + "learning_rate": 4.88839667867498e-05, + "loss": 2.3127, + "mean_token_accuracy": 0.4551724076271057, + "step": 144600 + }, + { + "epoch": 0.14564769182578077, + "grad_norm": 9.96710520524778, + "learning_rate": 4.8883850234797e-05, + "loss": 2.6524, + "mean_token_accuracy": 0.36896551847457887, + "step": 144605 + }, + { + "epoch": 0.14565272787888495, + "grad_norm": 9.511383630915285, + "learning_rate": 4.8883733676913316e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4482758641242981, + "step": 144610 + }, + { + "epoch": 0.14565776393198912, + "grad_norm": 10.993620650344084, + "learning_rate": 4.8883617113098774e-05, + "loss": 2.1842, + "mean_token_accuracy": 0.47056649923324584, + "step": 144615 + }, + { + "epoch": 0.1456627999850933, + "grad_norm": 13.003419172956475, + "learning_rate": 4.888350054335341e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.4344827502965927, + "step": 144620 + }, + { + "epoch": 0.14566783603819747, + "grad_norm": 8.647210635726747, + "learning_rate": 4.8883383967677246e-05, + "loss": 1.8081, + "mean_token_accuracy": 0.5329703629016876, + "step": 144625 + }, + { + "epoch": 0.14567287209130164, + "grad_norm": 12.14729915501484, + "learning_rate": 4.888326738607032e-05, + "loss": 2.6107, + "mean_token_accuracy": 0.39310344457626345, + "step": 144630 + }, + { + "epoch": 0.1456779081444058, + "grad_norm": 10.757932704802165, + "learning_rate": 4.888315079853267e-05, + "loss": 2.602, + "mean_token_accuracy": 0.4172413796186447, + "step": 144635 + }, + { + "epoch": 0.14568294419750996, + "grad_norm": 9.619088775702318, + "learning_rate": 4.8883034205064324e-05, + "loss": 2.5713, + "mean_token_accuracy": 0.42952207922935487, + "step": 144640 + }, + { + "epoch": 0.14568798025061414, + "grad_norm": 10.68771651191752, + "learning_rate": 4.8882917605665304e-05, + "loss": 2.4639, + "mean_token_accuracy": 0.41724138259887694, + "step": 144645 + }, + { + "epoch": 0.1456930163037183, + "grad_norm": 10.225864230620648, + "learning_rate": 4.888280100033566e-05, + "loss": 2.1057, + "mean_token_accuracy": 0.4882637619972229, + "step": 144650 + }, + { + "epoch": 0.14569805235682248, + "grad_norm": 8.644768773421884, + "learning_rate": 4.888268438907541e-05, + "loss": 2.1361, + "mean_token_accuracy": 0.5034482717514038, + "step": 144655 + }, + { + "epoch": 0.14570308840992666, + "grad_norm": 10.463296715261665, + "learning_rate": 4.88825677718846e-05, + "loss": 2.43, + "mean_token_accuracy": 0.42068964838981626, + "step": 144660 + }, + { + "epoch": 0.14570812446303083, + "grad_norm": 8.812853379726597, + "learning_rate": 4.8882451148763256e-05, + "loss": 2.3339, + "mean_token_accuracy": 0.4896551728248596, + "step": 144665 + }, + { + "epoch": 0.145713160516135, + "grad_norm": 11.800741116160715, + "learning_rate": 4.888233451971141e-05, + "loss": 2.1268, + "mean_token_accuracy": 0.505626130104065, + "step": 144670 + }, + { + "epoch": 0.14571819656923918, + "grad_norm": 10.695167028861896, + "learning_rate": 4.8882217884729096e-05, + "loss": 2.5146, + "mean_token_accuracy": 0.4379310369491577, + "step": 144675 + }, + { + "epoch": 0.14572323262234335, + "grad_norm": 14.041231694539492, + "learning_rate": 4.8882101243816333e-05, + "loss": 2.7504, + "mean_token_accuracy": 0.3862069010734558, + "step": 144680 + }, + { + "epoch": 0.14572826867544753, + "grad_norm": 11.26069328809716, + "learning_rate": 4.888198459697317e-05, + "loss": 2.2443, + "mean_token_accuracy": 0.4517241299152374, + "step": 144685 + }, + { + "epoch": 0.1457333047285517, + "grad_norm": 11.164173089689157, + "learning_rate": 4.8881867944199634e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.3862069010734558, + "step": 144690 + }, + { + "epoch": 0.14573834078165587, + "grad_norm": 9.87908515702691, + "learning_rate": 4.888175128549575e-05, + "loss": 2.2186, + "mean_token_accuracy": 0.44827585220336913, + "step": 144695 + }, + { + "epoch": 0.14574337683476005, + "grad_norm": 9.334249511607466, + "learning_rate": 4.888163462086157e-05, + "loss": 2.2677, + "mean_token_accuracy": 0.4206896543502808, + "step": 144700 + }, + { + "epoch": 0.14574841288786422, + "grad_norm": 10.264404103126378, + "learning_rate": 4.888151795029711e-05, + "loss": 2.6023, + "mean_token_accuracy": 0.42758620381355283, + "step": 144705 + }, + { + "epoch": 0.1457534489409684, + "grad_norm": 14.628177256114146, + "learning_rate": 4.88814012738024e-05, + "loss": 2.8311, + "mean_token_accuracy": 0.3448275804519653, + "step": 144710 + }, + { + "epoch": 0.14575848499407257, + "grad_norm": 10.255341114933445, + "learning_rate": 4.8881284591377484e-05, + "loss": 2.4064, + "mean_token_accuracy": 0.4103448212146759, + "step": 144715 + }, + { + "epoch": 0.14576352104717674, + "grad_norm": 9.817811501160612, + "learning_rate": 4.888116790302239e-05, + "loss": 2.2874, + "mean_token_accuracy": 0.4034482777118683, + "step": 144720 + }, + { + "epoch": 0.14576855710028092, + "grad_norm": 14.262689213451118, + "learning_rate": 4.888105120873714e-05, + "loss": 2.5467, + "mean_token_accuracy": 0.43793103098869324, + "step": 144725 + }, + { + "epoch": 0.1457735931533851, + "grad_norm": 11.520927008923671, + "learning_rate": 4.888093450852179e-05, + "loss": 2.1516, + "mean_token_accuracy": 0.46551724076271056, + "step": 144730 + }, + { + "epoch": 0.14577862920648926, + "grad_norm": 10.016385601931507, + "learning_rate": 4.888081780237635e-05, + "loss": 2.1276, + "mean_token_accuracy": 0.4327283680438995, + "step": 144735 + }, + { + "epoch": 0.14578366525959344, + "grad_norm": 8.022945322771255, + "learning_rate": 4.8880701090300865e-05, + "loss": 2.4319, + "mean_token_accuracy": 0.46067755222320556, + "step": 144740 + }, + { + "epoch": 0.1457887013126976, + "grad_norm": 10.68885753251828, + "learning_rate": 4.888058437229535e-05, + "loss": 2.1898, + "mean_token_accuracy": 0.44827585816383364, + "step": 144745 + }, + { + "epoch": 0.14579373736580178, + "grad_norm": 16.14074096211515, + "learning_rate": 4.888046764835987e-05, + "loss": 2.1014, + "mean_token_accuracy": 0.4586206912994385, + "step": 144750 + }, + { + "epoch": 0.14579877341890596, + "grad_norm": 14.552204652856918, + "learning_rate": 4.8880350918494424e-05, + "loss": 2.7088, + "mean_token_accuracy": 0.35862069129943847, + "step": 144755 + }, + { + "epoch": 0.14580380947201013, + "grad_norm": 9.37388644784603, + "learning_rate": 4.888023418269906e-05, + "loss": 2.4195, + "mean_token_accuracy": 0.4344827592372894, + "step": 144760 + }, + { + "epoch": 0.1458088455251143, + "grad_norm": 10.50407241743583, + "learning_rate": 4.88801174409738e-05, + "loss": 2.7224, + "mean_token_accuracy": 0.39310344457626345, + "step": 144765 + }, + { + "epoch": 0.14581388157821848, + "grad_norm": 13.889496809619446, + "learning_rate": 4.88800006933187e-05, + "loss": 2.5856, + "mean_token_accuracy": 0.39310344457626345, + "step": 144770 + }, + { + "epoch": 0.14581891763132263, + "grad_norm": 10.190710151364954, + "learning_rate": 4.887988393973377e-05, + "loss": 2.0719, + "mean_token_accuracy": 0.4586206912994385, + "step": 144775 + }, + { + "epoch": 0.1458239536844268, + "grad_norm": 9.850653234987464, + "learning_rate": 4.887976718021905e-05, + "loss": 2.4285, + "mean_token_accuracy": 0.4551724135875702, + "step": 144780 + }, + { + "epoch": 0.14582898973753097, + "grad_norm": 12.893192656296447, + "learning_rate": 4.887965041477457e-05, + "loss": 2.3197, + "mean_token_accuracy": 0.44827585220336913, + "step": 144785 + }, + { + "epoch": 0.14583402579063515, + "grad_norm": 9.405295492733737, + "learning_rate": 4.8879533643400374e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.4206896543502808, + "step": 144790 + }, + { + "epoch": 0.14583906184373932, + "grad_norm": 9.637295185609057, + "learning_rate": 4.887941686609647e-05, + "loss": 2.7076, + "mean_token_accuracy": 0.40145190954208376, + "step": 144795 + }, + { + "epoch": 0.1458440978968435, + "grad_norm": 16.44437324800028, + "learning_rate": 4.8879300082862914e-05, + "loss": 2.939, + "mean_token_accuracy": 0.4517241358757019, + "step": 144800 + }, + { + "epoch": 0.14584913394994767, + "grad_norm": 10.621185130230701, + "learning_rate": 4.8879183293699735e-05, + "loss": 2.7376, + "mean_token_accuracy": 0.36206896901130675, + "step": 144805 + }, + { + "epoch": 0.14585417000305184, + "grad_norm": 11.449473763842597, + "learning_rate": 4.8879066498606945e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.47241379618644713, + "step": 144810 + }, + { + "epoch": 0.14585920605615602, + "grad_norm": 16.575242993507, + "learning_rate": 4.88789496975846e-05, + "loss": 2.5262, + "mean_token_accuracy": 0.42068966031074523, + "step": 144815 + }, + { + "epoch": 0.1458642421092602, + "grad_norm": 9.991273353251882, + "learning_rate": 4.887883289063272e-05, + "loss": 2.0831, + "mean_token_accuracy": 0.4640048384666443, + "step": 144820 + }, + { + "epoch": 0.14586927816236436, + "grad_norm": 12.521648153428837, + "learning_rate": 4.8878716077751344e-05, + "loss": 2.697, + "mean_token_accuracy": 0.4172413766384125, + "step": 144825 + }, + { + "epoch": 0.14587431421546854, + "grad_norm": 9.451919881371168, + "learning_rate": 4.887859925894049e-05, + "loss": 2.5282, + "mean_token_accuracy": 0.4103448212146759, + "step": 144830 + }, + { + "epoch": 0.1458793502685727, + "grad_norm": 15.12196023674418, + "learning_rate": 4.8878482434200216e-05, + "loss": 2.7586, + "mean_token_accuracy": 0.42758620381355283, + "step": 144835 + }, + { + "epoch": 0.14588438632167688, + "grad_norm": 11.906774201948894, + "learning_rate": 4.887836560353053e-05, + "loss": 2.7351, + "mean_token_accuracy": 0.44482758045196535, + "step": 144840 + }, + { + "epoch": 0.14588942237478106, + "grad_norm": 9.744268153766406, + "learning_rate": 4.887824876693149e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.41724138259887694, + "step": 144845 + }, + { + "epoch": 0.14589445842788523, + "grad_norm": 9.31832115400844, + "learning_rate": 4.8878131924403105e-05, + "loss": 2.2482, + "mean_token_accuracy": 0.4482758641242981, + "step": 144850 + }, + { + "epoch": 0.1458994944809894, + "grad_norm": 12.447226022350334, + "learning_rate": 4.887801507594541e-05, + "loss": 2.2735, + "mean_token_accuracy": 0.43793103098869324, + "step": 144855 + }, + { + "epoch": 0.14590453053409358, + "grad_norm": 10.5553289273563, + "learning_rate": 4.8877898221558445e-05, + "loss": 2.5304, + "mean_token_accuracy": 0.4, + "step": 144860 + }, + { + "epoch": 0.14590956658719775, + "grad_norm": 10.332532340072381, + "learning_rate": 4.887778136124224e-05, + "loss": 2.3174, + "mean_token_accuracy": 0.5, + "step": 144865 + }, + { + "epoch": 0.14591460264030193, + "grad_norm": 11.227655717835669, + "learning_rate": 4.887766449499684e-05, + "loss": 2.9671, + "mean_token_accuracy": 0.3655172437429428, + "step": 144870 + }, + { + "epoch": 0.1459196386934061, + "grad_norm": 10.788963230355808, + "learning_rate": 4.887754762282225e-05, + "loss": 2.2361, + "mean_token_accuracy": 0.45862069725990295, + "step": 144875 + }, + { + "epoch": 0.14592467474651027, + "grad_norm": 9.199285460307284, + "learning_rate": 4.887743074471852e-05, + "loss": 2.3151, + "mean_token_accuracy": 0.4344827592372894, + "step": 144880 + }, + { + "epoch": 0.14592971079961445, + "grad_norm": 11.87326953583054, + "learning_rate": 4.887731386068568e-05, + "loss": 2.686, + "mean_token_accuracy": 0.3931034505367279, + "step": 144885 + }, + { + "epoch": 0.14593474685271862, + "grad_norm": 9.187627287430418, + "learning_rate": 4.887719697072376e-05, + "loss": 2.4483, + "mean_token_accuracy": 0.4689655065536499, + "step": 144890 + }, + { + "epoch": 0.1459397829058228, + "grad_norm": 10.03346699047246, + "learning_rate": 4.8877080074832805e-05, + "loss": 2.2087, + "mean_token_accuracy": 0.4586206912994385, + "step": 144895 + }, + { + "epoch": 0.14594481895892697, + "grad_norm": 10.606343132149872, + "learning_rate": 4.887696317301283e-05, + "loss": 2.5578, + "mean_token_accuracy": 0.3913490653038025, + "step": 144900 + }, + { + "epoch": 0.14594985501203114, + "grad_norm": 8.970144784164436, + "learning_rate": 4.887684626526387e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.42413793206214906, + "step": 144905 + }, + { + "epoch": 0.14595489106513532, + "grad_norm": 10.741852925020835, + "learning_rate": 4.887672935158597e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.43793103098869324, + "step": 144910 + }, + { + "epoch": 0.14595992711823946, + "grad_norm": 10.67657215363258, + "learning_rate": 4.887661243197915e-05, + "loss": 2.284, + "mean_token_accuracy": 0.4210591077804565, + "step": 144915 + }, + { + "epoch": 0.14596496317134364, + "grad_norm": 12.2909904199333, + "learning_rate": 4.887649550644345e-05, + "loss": 2.5811, + "mean_token_accuracy": 0.4000000059604645, + "step": 144920 + }, + { + "epoch": 0.1459699992244478, + "grad_norm": 10.135351060230445, + "learning_rate": 4.88763785749789e-05, + "loss": 2.5141, + "mean_token_accuracy": 0.42413793206214906, + "step": 144925 + }, + { + "epoch": 0.14597503527755198, + "grad_norm": 9.41953744451291, + "learning_rate": 4.887626163758552e-05, + "loss": 2.4457, + "mean_token_accuracy": 0.4034482777118683, + "step": 144930 + }, + { + "epoch": 0.14598007133065616, + "grad_norm": 12.382012870046914, + "learning_rate": 4.887614469426337e-05, + "loss": 2.4097, + "mean_token_accuracy": 0.4965517222881317, + "step": 144935 + }, + { + "epoch": 0.14598510738376033, + "grad_norm": 9.502633008980666, + "learning_rate": 4.887602774501246e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.45517241954803467, + "step": 144940 + }, + { + "epoch": 0.1459901434368645, + "grad_norm": 10.56196352739203, + "learning_rate": 4.887591078983283e-05, + "loss": 2.3738, + "mean_token_accuracy": 0.441379314661026, + "step": 144945 + }, + { + "epoch": 0.14599517948996868, + "grad_norm": 9.595458511014769, + "learning_rate": 4.8875793828724504e-05, + "loss": 2.2889, + "mean_token_accuracy": 0.38620689511299133, + "step": 144950 + }, + { + "epoch": 0.14600021554307285, + "grad_norm": 11.25543358036961, + "learning_rate": 4.887567686168753e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.42413792610168455, + "step": 144955 + }, + { + "epoch": 0.14600525159617703, + "grad_norm": 10.655611753061484, + "learning_rate": 4.8875559888721935e-05, + "loss": 2.6358, + "mean_token_accuracy": 0.3896551787853241, + "step": 144960 + }, + { + "epoch": 0.1460102876492812, + "grad_norm": 9.803499990848563, + "learning_rate": 4.887544290982774e-05, + "loss": 1.8953, + "mean_token_accuracy": 0.5482758462429047, + "step": 144965 + }, + { + "epoch": 0.14601532370238537, + "grad_norm": 12.138479923900167, + "learning_rate": 4.887532592500499e-05, + "loss": 1.9609, + "mean_token_accuracy": 0.49999998807907103, + "step": 144970 + }, + { + "epoch": 0.14602035975548955, + "grad_norm": 12.150081441851995, + "learning_rate": 4.8875208934253716e-05, + "loss": 2.7599, + "mean_token_accuracy": 0.43793103098869324, + "step": 144975 + }, + { + "epoch": 0.14602539580859372, + "grad_norm": 10.643119612542845, + "learning_rate": 4.887509193757394e-05, + "loss": 2.5193, + "mean_token_accuracy": 0.42758620977401735, + "step": 144980 + }, + { + "epoch": 0.1460304318616979, + "grad_norm": 10.235546829888639, + "learning_rate": 4.8874974934965705e-05, + "loss": 2.5595, + "mean_token_accuracy": 0.43103447556495667, + "step": 144985 + }, + { + "epoch": 0.14603546791480207, + "grad_norm": 8.764113881085224, + "learning_rate": 4.887485792642905e-05, + "loss": 2.547, + "mean_token_accuracy": 0.4137930929660797, + "step": 144990 + }, + { + "epoch": 0.14604050396790624, + "grad_norm": 9.833380073682754, + "learning_rate": 4.887474091196399e-05, + "loss": 2.604, + "mean_token_accuracy": 0.41379310488700866, + "step": 144995 + }, + { + "epoch": 0.14604554002101042, + "grad_norm": 12.703148089164713, + "learning_rate": 4.887462389157057e-05, + "loss": 2.3965, + "mean_token_accuracy": 0.43103448748588563, + "step": 145000 + }, + { + "epoch": 0.1460505760741146, + "grad_norm": 10.74192104244049, + "learning_rate": 4.8874506865248815e-05, + "loss": 2.6119, + "mean_token_accuracy": 0.3862069010734558, + "step": 145005 + }, + { + "epoch": 0.14605561212721876, + "grad_norm": 9.778281601497044, + "learning_rate": 4.887438983299876e-05, + "loss": 2.1423, + "mean_token_accuracy": 0.5034482717514038, + "step": 145010 + }, + { + "epoch": 0.14606064818032294, + "grad_norm": 10.005973435299799, + "learning_rate": 4.8874272794820445e-05, + "loss": 2.4001, + "mean_token_accuracy": 0.4206896424293518, + "step": 145015 + }, + { + "epoch": 0.1460656842334271, + "grad_norm": 8.16984249525536, + "learning_rate": 4.8874155750713894e-05, + "loss": 1.981, + "mean_token_accuracy": 0.4482758641242981, + "step": 145020 + }, + { + "epoch": 0.14607072028653129, + "grad_norm": 9.253231203084932, + "learning_rate": 4.8874038700679136e-05, + "loss": 2.4178, + "mean_token_accuracy": 0.47241379618644713, + "step": 145025 + }, + { + "epoch": 0.14607575633963546, + "grad_norm": 9.520827044277427, + "learning_rate": 4.887392164471621e-05, + "loss": 2.2402, + "mean_token_accuracy": 0.47761645913124084, + "step": 145030 + }, + { + "epoch": 0.14608079239273963, + "grad_norm": 8.605783949684087, + "learning_rate": 4.887380458282515e-05, + "loss": 2.2401, + "mean_token_accuracy": 0.44482759237289426, + "step": 145035 + }, + { + "epoch": 0.1460858284458438, + "grad_norm": 12.92517872220332, + "learning_rate": 4.887368751500598e-05, + "loss": 2.4023, + "mean_token_accuracy": 0.42413792610168455, + "step": 145040 + }, + { + "epoch": 0.14609086449894798, + "grad_norm": 11.228400260967698, + "learning_rate": 4.887357044125874e-05, + "loss": 2.3198, + "mean_token_accuracy": 0.4379310369491577, + "step": 145045 + }, + { + "epoch": 0.14609590055205215, + "grad_norm": 11.77582828047089, + "learning_rate": 4.887345336158347e-05, + "loss": 2.6586, + "mean_token_accuracy": 0.3965517282485962, + "step": 145050 + }, + { + "epoch": 0.1461009366051563, + "grad_norm": 9.257639507098217, + "learning_rate": 4.8873336275980184e-05, + "loss": 2.2383, + "mean_token_accuracy": 0.43793103098869324, + "step": 145055 + }, + { + "epoch": 0.14610597265826047, + "grad_norm": 15.881600711056246, + "learning_rate": 4.887321918444893e-05, + "loss": 2.9169, + "mean_token_accuracy": 0.3482758581638336, + "step": 145060 + }, + { + "epoch": 0.14611100871136465, + "grad_norm": 10.521979159805525, + "learning_rate": 4.8873102086989725e-05, + "loss": 2.0545, + "mean_token_accuracy": 0.47586206197738645, + "step": 145065 + }, + { + "epoch": 0.14611604476446882, + "grad_norm": 6.059787968589027, + "learning_rate": 4.887298498360261e-05, + "loss": 2.011, + "mean_token_accuracy": 0.4782819151878357, + "step": 145070 + }, + { + "epoch": 0.146121080817573, + "grad_norm": 10.82478389979442, + "learning_rate": 4.887286787428762e-05, + "loss": 2.254, + "mean_token_accuracy": 0.4413793087005615, + "step": 145075 + }, + { + "epoch": 0.14612611687067717, + "grad_norm": 12.594626774951399, + "learning_rate": 4.887275075904479e-05, + "loss": 2.6804, + "mean_token_accuracy": 0.4034482777118683, + "step": 145080 + }, + { + "epoch": 0.14613115292378134, + "grad_norm": 10.213882292348739, + "learning_rate": 4.887263363787415e-05, + "loss": 2.462, + "mean_token_accuracy": 0.4517241418361664, + "step": 145085 + }, + { + "epoch": 0.14613618897688552, + "grad_norm": 16.243449315093645, + "learning_rate": 4.887251651077573e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.43103448748588563, + "step": 145090 + }, + { + "epoch": 0.1461412250299897, + "grad_norm": 9.518482863534286, + "learning_rate": 4.887239937774955e-05, + "loss": 2.5031, + "mean_token_accuracy": 0.4068965494632721, + "step": 145095 + }, + { + "epoch": 0.14614626108309386, + "grad_norm": 14.651437891454007, + "learning_rate": 4.8872282238795664e-05, + "loss": 2.6737, + "mean_token_accuracy": 0.4137930989265442, + "step": 145100 + }, + { + "epoch": 0.14615129713619804, + "grad_norm": 15.969765915014376, + "learning_rate": 4.8872165093914094e-05, + "loss": 2.143, + "mean_token_accuracy": 0.4620689630508423, + "step": 145105 + }, + { + "epoch": 0.1461563331893022, + "grad_norm": 9.623776600143229, + "learning_rate": 4.887204794310488e-05, + "loss": 2.5268, + "mean_token_accuracy": 0.4310344815254211, + "step": 145110 + }, + { + "epoch": 0.14616136924240639, + "grad_norm": 11.175351583446634, + "learning_rate": 4.8871930786368045e-05, + "loss": 2.288, + "mean_token_accuracy": 0.4620689690113068, + "step": 145115 + }, + { + "epoch": 0.14616640529551056, + "grad_norm": 12.608747568334623, + "learning_rate": 4.887181362370362e-05, + "loss": 2.6211, + "mean_token_accuracy": 0.3944948613643646, + "step": 145120 + }, + { + "epoch": 0.14617144134861473, + "grad_norm": 9.880089663522657, + "learning_rate": 4.8871696455111654e-05, + "loss": 2.7392, + "mean_token_accuracy": 0.39310344457626345, + "step": 145125 + }, + { + "epoch": 0.1461764774017189, + "grad_norm": 10.977726237919411, + "learning_rate": 4.887157928059216e-05, + "loss": 2.407, + "mean_token_accuracy": 0.4206896424293518, + "step": 145130 + }, + { + "epoch": 0.14618151345482308, + "grad_norm": 10.771524191081141, + "learning_rate": 4.887146210014518e-05, + "loss": 2.3254, + "mean_token_accuracy": 0.4517241358757019, + "step": 145135 + }, + { + "epoch": 0.14618654950792725, + "grad_norm": 12.72612007111849, + "learning_rate": 4.887134491377074e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.4379310369491577, + "step": 145140 + }, + { + "epoch": 0.14619158556103143, + "grad_norm": 10.386076914076511, + "learning_rate": 4.887122772146889e-05, + "loss": 2.7472, + "mean_token_accuracy": 0.3965517282485962, + "step": 145145 + }, + { + "epoch": 0.1461966216141356, + "grad_norm": 12.509018692820693, + "learning_rate": 4.8871110523239635e-05, + "loss": 2.5605, + "mean_token_accuracy": 0.417241370677948, + "step": 145150 + }, + { + "epoch": 0.14620165766723978, + "grad_norm": 8.319202162844528, + "learning_rate": 4.887099331908304e-05, + "loss": 2.2244, + "mean_token_accuracy": 0.4620689690113068, + "step": 145155 + }, + { + "epoch": 0.14620669372034395, + "grad_norm": 9.336887161305897, + "learning_rate": 4.8870876108999114e-05, + "loss": 2.6074, + "mean_token_accuracy": 0.417241370677948, + "step": 145160 + }, + { + "epoch": 0.14621172977344812, + "grad_norm": 9.818742450836854, + "learning_rate": 4.887075889298789e-05, + "loss": 2.3112, + "mean_token_accuracy": 0.4137930989265442, + "step": 145165 + }, + { + "epoch": 0.1462167658265523, + "grad_norm": 14.405686081070389, + "learning_rate": 4.887064167104941e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.4517241299152374, + "step": 145170 + }, + { + "epoch": 0.14622180187965647, + "grad_norm": 8.62197203962883, + "learning_rate": 4.88705244431837e-05, + "loss": 2.2324, + "mean_token_accuracy": 0.4689655125141144, + "step": 145175 + }, + { + "epoch": 0.14622683793276064, + "grad_norm": 10.482808664392508, + "learning_rate": 4.88704072093908e-05, + "loss": 2.2475, + "mean_token_accuracy": 0.441379314661026, + "step": 145180 + }, + { + "epoch": 0.14623187398586482, + "grad_norm": 10.305381296142238, + "learning_rate": 4.887028996967074e-05, + "loss": 1.9466, + "mean_token_accuracy": 0.46896551847457885, + "step": 145185 + }, + { + "epoch": 0.146236910038969, + "grad_norm": 9.108531661038388, + "learning_rate": 4.887017272402355e-05, + "loss": 2.3282, + "mean_token_accuracy": 0.43448275327682495, + "step": 145190 + }, + { + "epoch": 0.14624194609207314, + "grad_norm": 11.23526797433356, + "learning_rate": 4.887005547244925e-05, + "loss": 2.0589, + "mean_token_accuracy": 0.49534180760383606, + "step": 145195 + }, + { + "epoch": 0.1462469821451773, + "grad_norm": 9.714520385387342, + "learning_rate": 4.8869938214947894e-05, + "loss": 2.1993, + "mean_token_accuracy": 0.4398064136505127, + "step": 145200 + }, + { + "epoch": 0.14625201819828149, + "grad_norm": 9.586311871662996, + "learning_rate": 4.8869820951519515e-05, + "loss": 2.5142, + "mean_token_accuracy": 0.44827585816383364, + "step": 145205 + }, + { + "epoch": 0.14625705425138566, + "grad_norm": 8.767572791243037, + "learning_rate": 4.886970368216412e-05, + "loss": 2.3565, + "mean_token_accuracy": 0.43793103098869324, + "step": 145210 + }, + { + "epoch": 0.14626209030448983, + "grad_norm": 9.161731312713252, + "learning_rate": 4.886958640688177e-05, + "loss": 2.3168, + "mean_token_accuracy": 0.4448275864124298, + "step": 145215 + }, + { + "epoch": 0.146267126357594, + "grad_norm": 10.536170174216142, + "learning_rate": 4.886946912567248e-05, + "loss": 2.2404, + "mean_token_accuracy": 0.47241379618644713, + "step": 145220 + }, + { + "epoch": 0.14627216241069818, + "grad_norm": 9.092385023752758, + "learning_rate": 4.8869351838536295e-05, + "loss": 2.7206, + "mean_token_accuracy": 0.36896550953388213, + "step": 145225 + }, + { + "epoch": 0.14627719846380235, + "grad_norm": 11.271421972496517, + "learning_rate": 4.886923454547323e-05, + "loss": 2.2669, + "mean_token_accuracy": 0.458620685338974, + "step": 145230 + }, + { + "epoch": 0.14628223451690653, + "grad_norm": 9.986911204398492, + "learning_rate": 4.886911724648333e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.4000000059604645, + "step": 145235 + }, + { + "epoch": 0.1462872705700107, + "grad_norm": 12.166151040017963, + "learning_rate": 4.886899994156663e-05, + "loss": 2.37, + "mean_token_accuracy": 0.4724137902259827, + "step": 145240 + }, + { + "epoch": 0.14629230662311488, + "grad_norm": 10.630490326639443, + "learning_rate": 4.886888263072316e-05, + "loss": 2.0936, + "mean_token_accuracy": 0.5034482836723327, + "step": 145245 + }, + { + "epoch": 0.14629734267621905, + "grad_norm": 11.499656341197936, + "learning_rate": 4.8868765313952945e-05, + "loss": 2.0978, + "mean_token_accuracy": 0.4707804024219513, + "step": 145250 + }, + { + "epoch": 0.14630237872932322, + "grad_norm": 11.46836326496535, + "learning_rate": 4.886864799125603e-05, + "loss": 2.4239, + "mean_token_accuracy": 0.4, + "step": 145255 + }, + { + "epoch": 0.1463074147824274, + "grad_norm": 10.375279649780268, + "learning_rate": 4.886853066263244e-05, + "loss": 2.7977, + "mean_token_accuracy": 0.44137930274009707, + "step": 145260 + }, + { + "epoch": 0.14631245083553157, + "grad_norm": 12.346465022982581, + "learning_rate": 4.88684133280822e-05, + "loss": 2.4361, + "mean_token_accuracy": 0.4103448331356049, + "step": 145265 + }, + { + "epoch": 0.14631748688863574, + "grad_norm": 8.116920205755989, + "learning_rate": 4.886829598760536e-05, + "loss": 2.1886, + "mean_token_accuracy": 0.47586206197738645, + "step": 145270 + }, + { + "epoch": 0.14632252294173992, + "grad_norm": 8.748390537776741, + "learning_rate": 4.8868178641201936e-05, + "loss": 2.4177, + "mean_token_accuracy": 0.417241370677948, + "step": 145275 + }, + { + "epoch": 0.1463275589948441, + "grad_norm": 9.844327932894803, + "learning_rate": 4.886806128887197e-05, + "loss": 2.3415, + "mean_token_accuracy": 0.4154264986515045, + "step": 145280 + }, + { + "epoch": 0.14633259504794827, + "grad_norm": 9.610179290266823, + "learning_rate": 4.886794393061549e-05, + "loss": 2.3172, + "mean_token_accuracy": 0.42758620977401735, + "step": 145285 + }, + { + "epoch": 0.14633763110105244, + "grad_norm": 9.9467020217488, + "learning_rate": 4.886782656643254e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.4471869349479675, + "step": 145290 + }, + { + "epoch": 0.1463426671541566, + "grad_norm": 13.027939965778941, + "learning_rate": 4.886770919632313e-05, + "loss": 2.4696, + "mean_token_accuracy": 0.38275861740112305, + "step": 145295 + }, + { + "epoch": 0.1463477032072608, + "grad_norm": 9.822032466540874, + "learning_rate": 4.886759182028732e-05, + "loss": 2.6223, + "mean_token_accuracy": 0.41379310488700866, + "step": 145300 + }, + { + "epoch": 0.14635273926036496, + "grad_norm": 8.86115774028444, + "learning_rate": 4.8867474438325125e-05, + "loss": 2.1155, + "mean_token_accuracy": 0.4379310369491577, + "step": 145305 + }, + { + "epoch": 0.14635777531346914, + "grad_norm": 9.15420892047723, + "learning_rate": 4.886735705043658e-05, + "loss": 2.0348, + "mean_token_accuracy": 0.4862068831920624, + "step": 145310 + }, + { + "epoch": 0.1463628113665733, + "grad_norm": 13.90180387713896, + "learning_rate": 4.886723965662171e-05, + "loss": 2.5824, + "mean_token_accuracy": 0.3999999940395355, + "step": 145315 + }, + { + "epoch": 0.14636784741967748, + "grad_norm": 9.560134711230804, + "learning_rate": 4.886712225688057e-05, + "loss": 2.6392, + "mean_token_accuracy": 0.3862068891525269, + "step": 145320 + }, + { + "epoch": 0.14637288347278166, + "grad_norm": 9.257689002483183, + "learning_rate": 4.886700485121318e-05, + "loss": 2.3795, + "mean_token_accuracy": 0.4379310369491577, + "step": 145325 + }, + { + "epoch": 0.14637791952588583, + "grad_norm": 9.88636912998103, + "learning_rate": 4.8866887439619555e-05, + "loss": 2.5104, + "mean_token_accuracy": 0.39310344457626345, + "step": 145330 + }, + { + "epoch": 0.14638295557898998, + "grad_norm": 12.641085148222354, + "learning_rate": 4.8866770022099756e-05, + "loss": 2.6555, + "mean_token_accuracy": 0.38965516686439516, + "step": 145335 + }, + { + "epoch": 0.14638799163209415, + "grad_norm": 10.587093023030333, + "learning_rate": 4.88666525986538e-05, + "loss": 2.385, + "mean_token_accuracy": 0.42413792610168455, + "step": 145340 + }, + { + "epoch": 0.14639302768519832, + "grad_norm": 9.942878377791454, + "learning_rate": 4.886653516928173e-05, + "loss": 3.044, + "mean_token_accuracy": 0.3758620619773865, + "step": 145345 + }, + { + "epoch": 0.1463980637383025, + "grad_norm": 10.328591194376648, + "learning_rate": 4.886641773398356e-05, + "loss": 2.4966, + "mean_token_accuracy": 0.4137930989265442, + "step": 145350 + }, + { + "epoch": 0.14640309979140667, + "grad_norm": 11.040157081548655, + "learning_rate": 4.886630029275935e-05, + "loss": 2.4591, + "mean_token_accuracy": 0.45716878175735476, + "step": 145355 + }, + { + "epoch": 0.14640813584451084, + "grad_norm": 9.512107579349054, + "learning_rate": 4.88661828456091e-05, + "loss": 2.3314, + "mean_token_accuracy": 0.44482759237289426, + "step": 145360 + }, + { + "epoch": 0.14641317189761502, + "grad_norm": 10.014396220280643, + "learning_rate": 4.886606539253287e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.42758620977401735, + "step": 145365 + }, + { + "epoch": 0.1464182079507192, + "grad_norm": 9.076690843344158, + "learning_rate": 4.886594793353068e-05, + "loss": 2.2153, + "mean_token_accuracy": 0.4344827651977539, + "step": 145370 + }, + { + "epoch": 0.14642324400382337, + "grad_norm": 10.510217708281013, + "learning_rate": 4.8865830468602565e-05, + "loss": 2.7783, + "mean_token_accuracy": 0.334482753276825, + "step": 145375 + }, + { + "epoch": 0.14642828005692754, + "grad_norm": 9.385462644882063, + "learning_rate": 4.886571299774855e-05, + "loss": 2.1743, + "mean_token_accuracy": 0.441379314661026, + "step": 145380 + }, + { + "epoch": 0.1464333161100317, + "grad_norm": 11.103320864408245, + "learning_rate": 4.886559552096868e-05, + "loss": 2.6643, + "mean_token_accuracy": 0.42413793206214906, + "step": 145385 + }, + { + "epoch": 0.1464383521631359, + "grad_norm": 10.882895083002902, + "learning_rate": 4.886547803826299e-05, + "loss": 2.3235, + "mean_token_accuracy": 0.4903940916061401, + "step": 145390 + }, + { + "epoch": 0.14644338821624006, + "grad_norm": 10.353339572409475, + "learning_rate": 4.886536054963149e-05, + "loss": 2.5862, + "mean_token_accuracy": 0.38620689511299133, + "step": 145395 + }, + { + "epoch": 0.14644842426934424, + "grad_norm": 10.53692987441267, + "learning_rate": 4.886524305507424e-05, + "loss": 2.4441, + "mean_token_accuracy": 0.3517241358757019, + "step": 145400 + }, + { + "epoch": 0.1464534603224484, + "grad_norm": 9.812351582450763, + "learning_rate": 4.8865125554591254e-05, + "loss": 2.2427, + "mean_token_accuracy": 0.4172413766384125, + "step": 145405 + }, + { + "epoch": 0.14645849637555258, + "grad_norm": 11.825106032235364, + "learning_rate": 4.886500804818258e-05, + "loss": 2.0443, + "mean_token_accuracy": 0.4896551728248596, + "step": 145410 + }, + { + "epoch": 0.14646353242865676, + "grad_norm": 11.246363289145366, + "learning_rate": 4.886489053584823e-05, + "loss": 2.281, + "mean_token_accuracy": 0.4793103337287903, + "step": 145415 + }, + { + "epoch": 0.14646856848176093, + "grad_norm": 10.221547869805566, + "learning_rate": 4.886477301758825e-05, + "loss": 2.2731, + "mean_token_accuracy": 0.4551724135875702, + "step": 145420 + }, + { + "epoch": 0.1464736045348651, + "grad_norm": 9.076963328962508, + "learning_rate": 4.886465549340267e-05, + "loss": 2.1997, + "mean_token_accuracy": 0.43448275327682495, + "step": 145425 + }, + { + "epoch": 0.14647864058796928, + "grad_norm": 10.908653531218096, + "learning_rate": 4.886453796329153e-05, + "loss": 2.0624, + "mean_token_accuracy": 0.4551724135875702, + "step": 145430 + }, + { + "epoch": 0.14648367664107345, + "grad_norm": 8.602774489269136, + "learning_rate": 4.886442042725485e-05, + "loss": 2.4516, + "mean_token_accuracy": 0.4500302493572235, + "step": 145435 + }, + { + "epoch": 0.14648871269417763, + "grad_norm": 9.171289380464234, + "learning_rate": 4.886430288529267e-05, + "loss": 2.4143, + "mean_token_accuracy": 0.4137930989265442, + "step": 145440 + }, + { + "epoch": 0.1464937487472818, + "grad_norm": 9.473873284077241, + "learning_rate": 4.8864185337405015e-05, + "loss": 2.3762, + "mean_token_accuracy": 0.4655172348022461, + "step": 145445 + }, + { + "epoch": 0.14649878480038597, + "grad_norm": 9.450784426802032, + "learning_rate": 4.886406778359193e-05, + "loss": 2.2168, + "mean_token_accuracy": 0.49879008531570435, + "step": 145450 + }, + { + "epoch": 0.14650382085349015, + "grad_norm": 10.045406209960491, + "learning_rate": 4.886395022385344e-05, + "loss": 2.4078, + "mean_token_accuracy": 0.441379314661026, + "step": 145455 + }, + { + "epoch": 0.14650885690659432, + "grad_norm": 12.911688311279304, + "learning_rate": 4.8863832658189586e-05, + "loss": 2.5789, + "mean_token_accuracy": 0.3896551638841629, + "step": 145460 + }, + { + "epoch": 0.1465138929596985, + "grad_norm": 10.831772057316453, + "learning_rate": 4.886371508660038e-05, + "loss": 2.6407, + "mean_token_accuracy": 0.4034482717514038, + "step": 145465 + }, + { + "epoch": 0.14651892901280267, + "grad_norm": 12.57706631841123, + "learning_rate": 4.886359750908588e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.4586206912994385, + "step": 145470 + }, + { + "epoch": 0.1465239650659068, + "grad_norm": 14.186897043768306, + "learning_rate": 4.8863479925646095e-05, + "loss": 2.5375, + "mean_token_accuracy": 0.4068965494632721, + "step": 145475 + }, + { + "epoch": 0.146529001119011, + "grad_norm": 14.189834806165358, + "learning_rate": 4.886336233628108e-05, + "loss": 2.7029, + "mean_token_accuracy": 0.36206896901130675, + "step": 145480 + }, + { + "epoch": 0.14653403717211516, + "grad_norm": 11.351023344231312, + "learning_rate": 4.886324474099085e-05, + "loss": 2.2815, + "mean_token_accuracy": 0.47586206793785096, + "step": 145485 + }, + { + "epoch": 0.14653907322521934, + "grad_norm": 9.745486790013976, + "learning_rate": 4.886312713977544e-05, + "loss": 2.4366, + "mean_token_accuracy": 0.441379314661026, + "step": 145490 + }, + { + "epoch": 0.1465441092783235, + "grad_norm": 9.986840580878154, + "learning_rate": 4.8863009532634904e-05, + "loss": 2.4682, + "mean_token_accuracy": 0.41542649269104004, + "step": 145495 + }, + { + "epoch": 0.14654914533142768, + "grad_norm": 9.979332401274913, + "learning_rate": 4.886289191956924e-05, + "loss": 2.2858, + "mean_token_accuracy": 0.4359951615333557, + "step": 145500 + }, + { + "epoch": 0.14655418138453186, + "grad_norm": 11.922447717112854, + "learning_rate": 4.886277430057851e-05, + "loss": 2.6458, + "mean_token_accuracy": 0.3931034505367279, + "step": 145505 + }, + { + "epoch": 0.14655921743763603, + "grad_norm": 11.247768707166953, + "learning_rate": 4.886265667566272e-05, + "loss": 2.4043, + "mean_token_accuracy": 0.42758620977401735, + "step": 145510 + }, + { + "epoch": 0.1465642534907402, + "grad_norm": 11.350808532624301, + "learning_rate": 4.886253904482193e-05, + "loss": 2.2584, + "mean_token_accuracy": 0.4413793206214905, + "step": 145515 + }, + { + "epoch": 0.14656928954384438, + "grad_norm": 11.445006644942893, + "learning_rate": 4.8862421408056156e-05, + "loss": 2.4112, + "mean_token_accuracy": 0.4601330876350403, + "step": 145520 + }, + { + "epoch": 0.14657432559694855, + "grad_norm": 10.538342848766543, + "learning_rate": 4.8862303765365435e-05, + "loss": 2.156, + "mean_token_accuracy": 0.4379310369491577, + "step": 145525 + }, + { + "epoch": 0.14657936165005273, + "grad_norm": 13.66871592578065, + "learning_rate": 4.88621861167498e-05, + "loss": 2.6588, + "mean_token_accuracy": 0.37586206793785093, + "step": 145530 + }, + { + "epoch": 0.1465843977031569, + "grad_norm": 10.509664315299132, + "learning_rate": 4.8862068462209284e-05, + "loss": 2.2062, + "mean_token_accuracy": 0.44827585816383364, + "step": 145535 + }, + { + "epoch": 0.14658943375626107, + "grad_norm": 13.571823734696153, + "learning_rate": 4.886195080174392e-05, + "loss": 2.7454, + "mean_token_accuracy": 0.3965517163276672, + "step": 145540 + }, + { + "epoch": 0.14659446980936525, + "grad_norm": 9.084225900083837, + "learning_rate": 4.8861833135353734e-05, + "loss": 2.5542, + "mean_token_accuracy": 0.4034482777118683, + "step": 145545 + }, + { + "epoch": 0.14659950586246942, + "grad_norm": 10.65159439266977, + "learning_rate": 4.886171546303877e-05, + "loss": 2.6409, + "mean_token_accuracy": 0.4068965554237366, + "step": 145550 + }, + { + "epoch": 0.1466045419155736, + "grad_norm": 10.186836341788988, + "learning_rate": 4.886159778479905e-05, + "loss": 2.2854, + "mean_token_accuracy": 0.47586206793785096, + "step": 145555 + }, + { + "epoch": 0.14660957796867777, + "grad_norm": 11.683359317168403, + "learning_rate": 4.886148010063461e-05, + "loss": 2.2593, + "mean_token_accuracy": 0.47931034564971925, + "step": 145560 + }, + { + "epoch": 0.14661461402178194, + "grad_norm": 10.465072147948156, + "learning_rate": 4.886136241054549e-05, + "loss": 1.9789, + "mean_token_accuracy": 0.5285714328289032, + "step": 145565 + }, + { + "epoch": 0.14661965007488612, + "grad_norm": 10.995216407366447, + "learning_rate": 4.886124471453171e-05, + "loss": 2.3427, + "mean_token_accuracy": 0.45862067937850953, + "step": 145570 + }, + { + "epoch": 0.1466246861279903, + "grad_norm": 11.83054520627998, + "learning_rate": 4.886112701259331e-05, + "loss": 2.5649, + "mean_token_accuracy": 0.3946158468723297, + "step": 145575 + }, + { + "epoch": 0.14662972218109446, + "grad_norm": 9.973110372983491, + "learning_rate": 4.8861009304730325e-05, + "loss": 2.8555, + "mean_token_accuracy": 0.42413792610168455, + "step": 145580 + }, + { + "epoch": 0.14663475823419864, + "grad_norm": 10.28904497414514, + "learning_rate": 4.886089159094278e-05, + "loss": 2.4667, + "mean_token_accuracy": 0.42758620977401735, + "step": 145585 + }, + { + "epoch": 0.1466397942873028, + "grad_norm": 12.150565256179517, + "learning_rate": 4.886077387123072e-05, + "loss": 2.5299, + "mean_token_accuracy": 0.42758620381355283, + "step": 145590 + }, + { + "epoch": 0.14664483034040698, + "grad_norm": 9.636097363438672, + "learning_rate": 4.8860656145594167e-05, + "loss": 2.0032, + "mean_token_accuracy": 0.5103448331356049, + "step": 145595 + }, + { + "epoch": 0.14664986639351116, + "grad_norm": 11.035740832358089, + "learning_rate": 4.8860538414033155e-05, + "loss": 2.4901, + "mean_token_accuracy": 0.42413792610168455, + "step": 145600 + }, + { + "epoch": 0.14665490244661533, + "grad_norm": 11.133286192020643, + "learning_rate": 4.886042067654771e-05, + "loss": 2.165, + "mean_token_accuracy": 0.46436781883239747, + "step": 145605 + }, + { + "epoch": 0.1466599384997195, + "grad_norm": 11.052990380521088, + "learning_rate": 4.886030293313788e-05, + "loss": 2.3456, + "mean_token_accuracy": 0.4413793087005615, + "step": 145610 + }, + { + "epoch": 0.14666497455282365, + "grad_norm": 11.031521595009053, + "learning_rate": 4.886018518380369e-05, + "loss": 2.1885, + "mean_token_accuracy": 0.43103448748588563, + "step": 145615 + }, + { + "epoch": 0.14667001060592783, + "grad_norm": 10.377146410079444, + "learning_rate": 4.886006742854517e-05, + "loss": 2.4718, + "mean_token_accuracy": 0.38275861740112305, + "step": 145620 + }, + { + "epoch": 0.146675046659032, + "grad_norm": 8.90540535417392, + "learning_rate": 4.885994966736236e-05, + "loss": 2.8863, + "mean_token_accuracy": 0.38620689511299133, + "step": 145625 + }, + { + "epoch": 0.14668008271213617, + "grad_norm": 15.74599136085124, + "learning_rate": 4.8859831900255294e-05, + "loss": 2.3255, + "mean_token_accuracy": 0.40344828367233276, + "step": 145630 + }, + { + "epoch": 0.14668511876524035, + "grad_norm": 14.888125979975138, + "learning_rate": 4.885971412722399e-05, + "loss": 2.3624, + "mean_token_accuracy": 0.4310344815254211, + "step": 145635 + }, + { + "epoch": 0.14669015481834452, + "grad_norm": 9.107663751917197, + "learning_rate": 4.885959634826849e-05, + "loss": 2.5915, + "mean_token_accuracy": 0.4586206912994385, + "step": 145640 + }, + { + "epoch": 0.1466951908714487, + "grad_norm": 10.089719794649529, + "learning_rate": 4.885947856338883e-05, + "loss": 2.4501, + "mean_token_accuracy": 0.4501512348651886, + "step": 145645 + }, + { + "epoch": 0.14670022692455287, + "grad_norm": 9.851325316151765, + "learning_rate": 4.885936077258504e-05, + "loss": 1.8812, + "mean_token_accuracy": 0.5379310250282288, + "step": 145650 + }, + { + "epoch": 0.14670526297765704, + "grad_norm": 11.715360924264887, + "learning_rate": 4.885924297585715e-05, + "loss": 2.679, + "mean_token_accuracy": 0.41899576783180237, + "step": 145655 + }, + { + "epoch": 0.14671029903076122, + "grad_norm": 10.347225279342785, + "learning_rate": 4.885912517320518e-05, + "loss": 2.4853, + "mean_token_accuracy": 0.42413793206214906, + "step": 145660 + }, + { + "epoch": 0.1467153350838654, + "grad_norm": 12.14598176067242, + "learning_rate": 4.88590073646292e-05, + "loss": 2.7475, + "mean_token_accuracy": 0.3655172407627106, + "step": 145665 + }, + { + "epoch": 0.14672037113696956, + "grad_norm": 9.64587504699479, + "learning_rate": 4.88588895501292e-05, + "loss": 2.1623, + "mean_token_accuracy": 0.44301270246505736, + "step": 145670 + }, + { + "epoch": 0.14672540719007374, + "grad_norm": 11.070236644950853, + "learning_rate": 4.8858771729705245e-05, + "loss": 2.4456, + "mean_token_accuracy": 0.4379310369491577, + "step": 145675 + }, + { + "epoch": 0.1467304432431779, + "grad_norm": 10.066350406119934, + "learning_rate": 4.885865390335735e-05, + "loss": 2.211, + "mean_token_accuracy": 0.47241380214691164, + "step": 145680 + }, + { + "epoch": 0.14673547929628208, + "grad_norm": 9.692268990257364, + "learning_rate": 4.885853607108556e-05, + "loss": 2.2728, + "mean_token_accuracy": 0.4068965494632721, + "step": 145685 + }, + { + "epoch": 0.14674051534938626, + "grad_norm": 9.183747003832968, + "learning_rate": 4.885841823288989e-05, + "loss": 2.355, + "mean_token_accuracy": 0.4103448301553726, + "step": 145690 + }, + { + "epoch": 0.14674555140249043, + "grad_norm": 10.9120006558695, + "learning_rate": 4.885830038877039e-05, + "loss": 2.407, + "mean_token_accuracy": 0.41034482717514037, + "step": 145695 + }, + { + "epoch": 0.1467505874555946, + "grad_norm": 11.854291828484142, + "learning_rate": 4.885818253872708e-05, + "loss": 2.4213, + "mean_token_accuracy": 0.47071990966796873, + "step": 145700 + }, + { + "epoch": 0.14675562350869878, + "grad_norm": 9.838840960091375, + "learning_rate": 4.885806468276e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.441379314661026, + "step": 145705 + }, + { + "epoch": 0.14676065956180295, + "grad_norm": 10.682206779740259, + "learning_rate": 4.885794682086919e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.5021173596382141, + "step": 145710 + }, + { + "epoch": 0.14676569561490713, + "grad_norm": 12.44005279101634, + "learning_rate": 4.885782895305466e-05, + "loss": 2.3435, + "mean_token_accuracy": 0.4137930989265442, + "step": 145715 + }, + { + "epoch": 0.1467707316680113, + "grad_norm": 6.374985827492658, + "learning_rate": 4.885771107931646e-05, + "loss": 1.9341, + "mean_token_accuracy": 0.5034482777118683, + "step": 145720 + }, + { + "epoch": 0.14677576772111547, + "grad_norm": 10.600125199136711, + "learning_rate": 4.885759319965463e-05, + "loss": 2.2459, + "mean_token_accuracy": 0.482758629322052, + "step": 145725 + }, + { + "epoch": 0.14678080377421965, + "grad_norm": 9.128536649817926, + "learning_rate": 4.885747531406919e-05, + "loss": 2.233, + "mean_token_accuracy": 0.4344827592372894, + "step": 145730 + }, + { + "epoch": 0.14678583982732382, + "grad_norm": 8.463474046444865, + "learning_rate": 4.885735742256016e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.4013309180736542, + "step": 145735 + }, + { + "epoch": 0.146790875880428, + "grad_norm": 12.409515188585273, + "learning_rate": 4.885723952512759e-05, + "loss": 2.5447, + "mean_token_accuracy": 0.3896551728248596, + "step": 145740 + }, + { + "epoch": 0.14679591193353217, + "grad_norm": 9.474995442155242, + "learning_rate": 4.8857121621771515e-05, + "loss": 2.7443, + "mean_token_accuracy": 0.3931034505367279, + "step": 145745 + }, + { + "epoch": 0.14680094798663634, + "grad_norm": 9.516640948538726, + "learning_rate": 4.885700371249197e-05, + "loss": 2.1318, + "mean_token_accuracy": 0.42413793206214906, + "step": 145750 + }, + { + "epoch": 0.1468059840397405, + "grad_norm": 11.653741616179131, + "learning_rate": 4.885688579728897e-05, + "loss": 2.5068, + "mean_token_accuracy": 0.37241379022598264, + "step": 145755 + }, + { + "epoch": 0.14681102009284466, + "grad_norm": 12.80646141498987, + "learning_rate": 4.8856767876162565e-05, + "loss": 2.2815, + "mean_token_accuracy": 0.45317604541778567, + "step": 145760 + }, + { + "epoch": 0.14681605614594884, + "grad_norm": 10.876172713804136, + "learning_rate": 4.8856649949112774e-05, + "loss": 2.5727, + "mean_token_accuracy": 0.36896551251411436, + "step": 145765 + }, + { + "epoch": 0.146821092199053, + "grad_norm": 12.618811538450247, + "learning_rate": 4.885653201613964e-05, + "loss": 2.3381, + "mean_token_accuracy": 0.4068965494632721, + "step": 145770 + }, + { + "epoch": 0.14682612825215718, + "grad_norm": 11.41519940743099, + "learning_rate": 4.885641407724319e-05, + "loss": 2.1336, + "mean_token_accuracy": 0.4862069010734558, + "step": 145775 + }, + { + "epoch": 0.14683116430526136, + "grad_norm": 12.008356128631757, + "learning_rate": 4.885629613242346e-05, + "loss": 2.907, + "mean_token_accuracy": 0.3793103516101837, + "step": 145780 + }, + { + "epoch": 0.14683620035836553, + "grad_norm": 11.428787394343868, + "learning_rate": 4.8856178181680485e-05, + "loss": 2.601, + "mean_token_accuracy": 0.4310344934463501, + "step": 145785 + }, + { + "epoch": 0.1468412364114697, + "grad_norm": 13.890208340176272, + "learning_rate": 4.8856060225014294e-05, + "loss": 2.2851, + "mean_token_accuracy": 0.48965516686439514, + "step": 145790 + }, + { + "epoch": 0.14684627246457388, + "grad_norm": 9.546459224088771, + "learning_rate": 4.885594226242491e-05, + "loss": 2.2061, + "mean_token_accuracy": 0.4172413766384125, + "step": 145795 + }, + { + "epoch": 0.14685130851767805, + "grad_norm": 9.459151425908313, + "learning_rate": 4.885582429391239e-05, + "loss": 2.4398, + "mean_token_accuracy": 0.4, + "step": 145800 + }, + { + "epoch": 0.14685634457078223, + "grad_norm": 9.159487169346878, + "learning_rate": 4.885570631947674e-05, + "loss": 2.3402, + "mean_token_accuracy": 0.43793103098869324, + "step": 145805 + }, + { + "epoch": 0.1468613806238864, + "grad_norm": 9.331179555651648, + "learning_rate": 4.885558833911802e-05, + "loss": 2.1356, + "mean_token_accuracy": 0.4379310369491577, + "step": 145810 + }, + { + "epoch": 0.14686641667699057, + "grad_norm": 11.26027385746331, + "learning_rate": 4.8855470352836234e-05, + "loss": 2.2584, + "mean_token_accuracy": 0.4793103516101837, + "step": 145815 + }, + { + "epoch": 0.14687145273009475, + "grad_norm": 11.14431777521666, + "learning_rate": 4.885535236063144e-05, + "loss": 2.3869, + "mean_token_accuracy": 0.4221415638923645, + "step": 145820 + }, + { + "epoch": 0.14687648878319892, + "grad_norm": 9.738854023127862, + "learning_rate": 4.8855234362503646e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.42068966031074523, + "step": 145825 + }, + { + "epoch": 0.1468815248363031, + "grad_norm": 8.808123157843095, + "learning_rate": 4.885511635845291e-05, + "loss": 1.9636, + "mean_token_accuracy": 0.5402903914451599, + "step": 145830 + }, + { + "epoch": 0.14688656088940727, + "grad_norm": 13.935103884008377, + "learning_rate": 4.885499834847925e-05, + "loss": 2.3731, + "mean_token_accuracy": 0.4379310429096222, + "step": 145835 + }, + { + "epoch": 0.14689159694251144, + "grad_norm": 12.446096841380216, + "learning_rate": 4.885488033258269e-05, + "loss": 2.1414, + "mean_token_accuracy": 0.4655172526836395, + "step": 145840 + }, + { + "epoch": 0.14689663299561562, + "grad_norm": 9.523884079967656, + "learning_rate": 4.8854762310763284e-05, + "loss": 2.7792, + "mean_token_accuracy": 0.4430127084255219, + "step": 145845 + }, + { + "epoch": 0.1469016690487198, + "grad_norm": 12.15296460653672, + "learning_rate": 4.8854644283021055e-05, + "loss": 2.2877, + "mean_token_accuracy": 0.45517241954803467, + "step": 145850 + }, + { + "epoch": 0.14690670510182396, + "grad_norm": 11.280378439355024, + "learning_rate": 4.885452624935604e-05, + "loss": 2.4809, + "mean_token_accuracy": 0.4034482717514038, + "step": 145855 + }, + { + "epoch": 0.14691174115492814, + "grad_norm": 10.045969162404205, + "learning_rate": 4.885440820976826e-05, + "loss": 2.3391, + "mean_token_accuracy": 0.4497882664203644, + "step": 145860 + }, + { + "epoch": 0.1469167772080323, + "grad_norm": 10.652652667077671, + "learning_rate": 4.885429016425776e-05, + "loss": 2.5874, + "mean_token_accuracy": 0.43248639106750486, + "step": 145865 + }, + { + "epoch": 0.14692181326113649, + "grad_norm": 8.126211499967068, + "learning_rate": 4.8854172112824566e-05, + "loss": 2.0861, + "mean_token_accuracy": 0.4551724076271057, + "step": 145870 + }, + { + "epoch": 0.14692684931424066, + "grad_norm": 11.053502152155357, + "learning_rate": 4.885405405546871e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.4482758641242981, + "step": 145875 + }, + { + "epoch": 0.14693188536734483, + "grad_norm": 8.014216011502475, + "learning_rate": 4.8853935992190236e-05, + "loss": 2.8637, + "mean_token_accuracy": 0.39038113355636594, + "step": 145880 + }, + { + "epoch": 0.146936921420449, + "grad_norm": 8.073890692666042, + "learning_rate": 4.885381792298916e-05, + "loss": 2.3527, + "mean_token_accuracy": 0.43793103098869324, + "step": 145885 + }, + { + "epoch": 0.14694195747355318, + "grad_norm": 10.47429765825648, + "learning_rate": 4.8853699847865527e-05, + "loss": 2.1991, + "mean_token_accuracy": 0.4551724135875702, + "step": 145890 + }, + { + "epoch": 0.14694699352665733, + "grad_norm": 9.686457165065384, + "learning_rate": 4.8853581766819365e-05, + "loss": 2.2451, + "mean_token_accuracy": 0.4482758641242981, + "step": 145895 + }, + { + "epoch": 0.1469520295797615, + "grad_norm": 9.966345895629013, + "learning_rate": 4.885346367985071e-05, + "loss": 2.2633, + "mean_token_accuracy": 0.4620689690113068, + "step": 145900 + }, + { + "epoch": 0.14695706563286567, + "grad_norm": 10.145368446867176, + "learning_rate": 4.8853345586959584e-05, + "loss": 2.3222, + "mean_token_accuracy": 0.4344827592372894, + "step": 145905 + }, + { + "epoch": 0.14696210168596985, + "grad_norm": 11.920314836844916, + "learning_rate": 4.885322748814603e-05, + "loss": 2.5712, + "mean_token_accuracy": 0.4034482777118683, + "step": 145910 + }, + { + "epoch": 0.14696713773907402, + "grad_norm": 9.37710871569525, + "learning_rate": 4.8853109383410086e-05, + "loss": 2.2676, + "mean_token_accuracy": 0.45668481588363646, + "step": 145915 + }, + { + "epoch": 0.1469721737921782, + "grad_norm": 11.302681313322084, + "learning_rate": 4.885299127275177e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.4433756828308105, + "step": 145920 + }, + { + "epoch": 0.14697720984528237, + "grad_norm": 9.407473182492451, + "learning_rate": 4.8852873156171124e-05, + "loss": 2.3374, + "mean_token_accuracy": 0.42487684488296507, + "step": 145925 + }, + { + "epoch": 0.14698224589838654, + "grad_norm": 9.681881942910286, + "learning_rate": 4.885275503366818e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.4517241418361664, + "step": 145930 + }, + { + "epoch": 0.14698728195149072, + "grad_norm": 9.536498020615863, + "learning_rate": 4.885263690524297e-05, + "loss": 2.2131, + "mean_token_accuracy": 0.4655172288417816, + "step": 145935 + }, + { + "epoch": 0.1469923180045949, + "grad_norm": 16.169762331169718, + "learning_rate": 4.885251877089553e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.38620689511299133, + "step": 145940 + }, + { + "epoch": 0.14699735405769906, + "grad_norm": 15.228956956296214, + "learning_rate": 4.8852400630625875e-05, + "loss": 2.5596, + "mean_token_accuracy": 0.4034482717514038, + "step": 145945 + }, + { + "epoch": 0.14700239011080324, + "grad_norm": 9.122632604535788, + "learning_rate": 4.885228248443407e-05, + "loss": 2.2286, + "mean_token_accuracy": 0.4671506345272064, + "step": 145950 + }, + { + "epoch": 0.1470074261639074, + "grad_norm": 12.105011885731672, + "learning_rate": 4.8852164332320124e-05, + "loss": 2.9517, + "mean_token_accuracy": 0.3413793116807938, + "step": 145955 + }, + { + "epoch": 0.14701246221701159, + "grad_norm": 10.893302459486996, + "learning_rate": 4.8852046174284075e-05, + "loss": 2.3943, + "mean_token_accuracy": 0.4517241358757019, + "step": 145960 + }, + { + "epoch": 0.14701749827011576, + "grad_norm": 11.925568275375268, + "learning_rate": 4.885192801032595e-05, + "loss": 2.6216, + "mean_token_accuracy": 0.37586206793785093, + "step": 145965 + }, + { + "epoch": 0.14702253432321993, + "grad_norm": 13.107476373766335, + "learning_rate": 4.885180984044579e-05, + "loss": 2.3008, + "mean_token_accuracy": 0.4361161530017853, + "step": 145970 + }, + { + "epoch": 0.1470275703763241, + "grad_norm": 9.874422450074443, + "learning_rate": 4.885169166464364e-05, + "loss": 2.4107, + "mean_token_accuracy": 0.43103448748588563, + "step": 145975 + }, + { + "epoch": 0.14703260642942828, + "grad_norm": 10.381301587209133, + "learning_rate": 4.88515734829195e-05, + "loss": 2.2267, + "mean_token_accuracy": 0.4379310369491577, + "step": 145980 + }, + { + "epoch": 0.14703764248253245, + "grad_norm": 9.437146978980236, + "learning_rate": 4.885145529527343e-05, + "loss": 2.0833, + "mean_token_accuracy": 0.4965517222881317, + "step": 145985 + }, + { + "epoch": 0.14704267853563663, + "grad_norm": 11.392550468317465, + "learning_rate": 4.885133710170546e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.482758617401123, + "step": 145990 + }, + { + "epoch": 0.1470477145887408, + "grad_norm": 8.982177489988265, + "learning_rate": 4.8851218902215606e-05, + "loss": 2.2162, + "mean_token_accuracy": 0.4689655125141144, + "step": 145995 + }, + { + "epoch": 0.14705275064184498, + "grad_norm": 8.216930060179326, + "learning_rate": 4.885110069680391e-05, + "loss": 2.0107, + "mean_token_accuracy": 0.4517241299152374, + "step": 146000 + }, + { + "epoch": 0.14705778669494915, + "grad_norm": 11.541375977142929, + "learning_rate": 4.885098248547042e-05, + "loss": 2.6444, + "mean_token_accuracy": 0.3965517163276672, + "step": 146005 + }, + { + "epoch": 0.14706282274805332, + "grad_norm": 9.540545112429314, + "learning_rate": 4.885086426821515e-05, + "loss": 2.0853, + "mean_token_accuracy": 0.4551724135875702, + "step": 146010 + }, + { + "epoch": 0.1470678588011575, + "grad_norm": 10.405333466756243, + "learning_rate": 4.885074604503814e-05, + "loss": 2.4269, + "mean_token_accuracy": 0.41379310488700866, + "step": 146015 + }, + { + "epoch": 0.14707289485426167, + "grad_norm": 10.129672544050004, + "learning_rate": 4.8850627815939415e-05, + "loss": 2.2649, + "mean_token_accuracy": 0.4931034505367279, + "step": 146020 + }, + { + "epoch": 0.14707793090736584, + "grad_norm": 9.660299529419826, + "learning_rate": 4.8850509580919014e-05, + "loss": 2.561, + "mean_token_accuracy": 0.3999999940395355, + "step": 146025 + }, + { + "epoch": 0.14708296696047002, + "grad_norm": 14.01725811037699, + "learning_rate": 4.885039133997697e-05, + "loss": 2.4122, + "mean_token_accuracy": 0.441379314661026, + "step": 146030 + }, + { + "epoch": 0.14708800301357416, + "grad_norm": 9.192443450348641, + "learning_rate": 4.885027309311331e-05, + "loss": 2.1291, + "mean_token_accuracy": 0.4586206912994385, + "step": 146035 + }, + { + "epoch": 0.14709303906667834, + "grad_norm": 9.589854701168147, + "learning_rate": 4.8850154840328095e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.42758620977401735, + "step": 146040 + }, + { + "epoch": 0.1470980751197825, + "grad_norm": 11.26730163418241, + "learning_rate": 4.885003658162132e-05, + "loss": 2.5889, + "mean_token_accuracy": 0.3517241358757019, + "step": 146045 + }, + { + "epoch": 0.14710311117288669, + "grad_norm": 9.548168623782244, + "learning_rate": 4.8849918316993025e-05, + "loss": 2.7101, + "mean_token_accuracy": 0.39655172228813174, + "step": 146050 + }, + { + "epoch": 0.14710814722599086, + "grad_norm": 9.345142666263232, + "learning_rate": 4.884980004644326e-05, + "loss": 2.3462, + "mean_token_accuracy": 0.42758620977401735, + "step": 146055 + }, + { + "epoch": 0.14711318327909503, + "grad_norm": 10.592801790736106, + "learning_rate": 4.884968176997205e-05, + "loss": 2.3286, + "mean_token_accuracy": 0.4379310369491577, + "step": 146060 + }, + { + "epoch": 0.1471182193321992, + "grad_norm": 11.60479820521365, + "learning_rate": 4.884956348757941e-05, + "loss": 2.0803, + "mean_token_accuracy": 0.4931034445762634, + "step": 146065 + }, + { + "epoch": 0.14712325538530338, + "grad_norm": 9.607710392518346, + "learning_rate": 4.8849445199265405e-05, + "loss": 2.2281, + "mean_token_accuracy": 0.47749546766281126, + "step": 146070 + }, + { + "epoch": 0.14712829143840755, + "grad_norm": 9.143813328211191, + "learning_rate": 4.884932690503004e-05, + "loss": 2.2761, + "mean_token_accuracy": 0.47241378426551817, + "step": 146075 + }, + { + "epoch": 0.14713332749151173, + "grad_norm": 10.92389833516743, + "learning_rate": 4.884920860487337e-05, + "loss": 2.3137, + "mean_token_accuracy": 0.4103448212146759, + "step": 146080 + }, + { + "epoch": 0.1471383635446159, + "grad_norm": 10.549786870677622, + "learning_rate": 4.884909029879542e-05, + "loss": 2.3712, + "mean_token_accuracy": 0.3965517163276672, + "step": 146085 + }, + { + "epoch": 0.14714339959772008, + "grad_norm": 9.228650300356833, + "learning_rate": 4.884897198679621e-05, + "loss": 2.7161, + "mean_token_accuracy": 0.4344827592372894, + "step": 146090 + }, + { + "epoch": 0.14714843565082425, + "grad_norm": 9.961927474969645, + "learning_rate": 4.8848853668875786e-05, + "loss": 2.544, + "mean_token_accuracy": 0.42758620977401735, + "step": 146095 + }, + { + "epoch": 0.14715347170392842, + "grad_norm": 9.235307517937771, + "learning_rate": 4.884873534503418e-05, + "loss": 2.6298, + "mean_token_accuracy": 0.37931033968925476, + "step": 146100 + }, + { + "epoch": 0.1471585077570326, + "grad_norm": 12.61601264092315, + "learning_rate": 4.8848617015271426e-05, + "loss": 2.627, + "mean_token_accuracy": 0.40689656138420105, + "step": 146105 + }, + { + "epoch": 0.14716354381013677, + "grad_norm": 12.235489887095952, + "learning_rate": 4.8848498679587543e-05, + "loss": 2.445, + "mean_token_accuracy": 0.4137930989265442, + "step": 146110 + }, + { + "epoch": 0.14716857986324094, + "grad_norm": 11.280128927197522, + "learning_rate": 4.884838033798259e-05, + "loss": 2.5774, + "mean_token_accuracy": 0.41379310488700866, + "step": 146115 + }, + { + "epoch": 0.14717361591634512, + "grad_norm": 10.699662333630153, + "learning_rate": 4.884826199045657e-05, + "loss": 2.0909, + "mean_token_accuracy": 0.42413792610168455, + "step": 146120 + }, + { + "epoch": 0.1471786519694493, + "grad_norm": 12.644917554089782, + "learning_rate": 4.884814363700954e-05, + "loss": 2.3941, + "mean_token_accuracy": 0.4137930989265442, + "step": 146125 + }, + { + "epoch": 0.14718368802255347, + "grad_norm": 11.07824378501426, + "learning_rate": 4.884802527764151e-05, + "loss": 2.5755, + "mean_token_accuracy": 0.43103448748588563, + "step": 146130 + }, + { + "epoch": 0.14718872407565764, + "grad_norm": 10.704466925862716, + "learning_rate": 4.884790691235253e-05, + "loss": 2.2047, + "mean_token_accuracy": 0.41724138259887694, + "step": 146135 + }, + { + "epoch": 0.1471937601287618, + "grad_norm": 11.065507394242646, + "learning_rate": 4.8847788541142635e-05, + "loss": 2.3789, + "mean_token_accuracy": 0.4896551728248596, + "step": 146140 + }, + { + "epoch": 0.147198796181866, + "grad_norm": 11.552009885028353, + "learning_rate": 4.884767016401185e-05, + "loss": 2.6443, + "mean_token_accuracy": 0.3931034505367279, + "step": 146145 + }, + { + "epoch": 0.14720383223497016, + "grad_norm": 13.544766515680962, + "learning_rate": 4.8847551780960206e-05, + "loss": 2.7922, + "mean_token_accuracy": 0.41379310488700866, + "step": 146150 + }, + { + "epoch": 0.14720886828807433, + "grad_norm": 13.099892066700825, + "learning_rate": 4.884743339198773e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.4551724135875702, + "step": 146155 + }, + { + "epoch": 0.1472139043411785, + "grad_norm": 9.914650598937762, + "learning_rate": 4.884731499709448e-05, + "loss": 2.6797, + "mean_token_accuracy": 0.3965517282485962, + "step": 146160 + }, + { + "epoch": 0.14721894039428268, + "grad_norm": 10.885627993540094, + "learning_rate": 4.884719659628046e-05, + "loss": 2.3055, + "mean_token_accuracy": 0.4310344815254211, + "step": 146165 + }, + { + "epoch": 0.14722397644738686, + "grad_norm": 10.279930308818976, + "learning_rate": 4.884707818954573e-05, + "loss": 2.0381, + "mean_token_accuracy": 0.4862069010734558, + "step": 146170 + }, + { + "epoch": 0.147229012500491, + "grad_norm": 14.802490336619384, + "learning_rate": 4.8846959776890294e-05, + "loss": 2.4012, + "mean_token_accuracy": 0.4034482777118683, + "step": 146175 + }, + { + "epoch": 0.14723404855359518, + "grad_norm": 18.691997385468763, + "learning_rate": 4.88468413583142e-05, + "loss": 2.9284, + "mean_token_accuracy": 0.3896551698446274, + "step": 146180 + }, + { + "epoch": 0.14723908460669935, + "grad_norm": 10.363947018254711, + "learning_rate": 4.8846722933817485e-05, + "loss": 2.0783, + "mean_token_accuracy": 0.46412582993507384, + "step": 146185 + }, + { + "epoch": 0.14724412065980352, + "grad_norm": 12.29520099921546, + "learning_rate": 4.8846604503400176e-05, + "loss": 2.4025, + "mean_token_accuracy": 0.4480943739414215, + "step": 146190 + }, + { + "epoch": 0.1472491567129077, + "grad_norm": 10.836766934614507, + "learning_rate": 4.884648606706231e-05, + "loss": 2.3418, + "mean_token_accuracy": 0.4206896543502808, + "step": 146195 + }, + { + "epoch": 0.14725419276601187, + "grad_norm": 9.770898605575736, + "learning_rate": 4.884636762480391e-05, + "loss": 2.5394, + "mean_token_accuracy": 0.3999999940395355, + "step": 146200 + }, + { + "epoch": 0.14725922881911604, + "grad_norm": 10.929832378382041, + "learning_rate": 4.884624917662502e-05, + "loss": 2.2056, + "mean_token_accuracy": 0.38275861740112305, + "step": 146205 + }, + { + "epoch": 0.14726426487222022, + "grad_norm": 10.414347683380713, + "learning_rate": 4.8846130722525665e-05, + "loss": 2.3572, + "mean_token_accuracy": 0.42413792610168455, + "step": 146210 + }, + { + "epoch": 0.1472693009253244, + "grad_norm": 12.836375186931496, + "learning_rate": 4.8846012262505884e-05, + "loss": 2.5569, + "mean_token_accuracy": 0.4448275864124298, + "step": 146215 + }, + { + "epoch": 0.14727433697842857, + "grad_norm": 11.862338603483455, + "learning_rate": 4.884589379656571e-05, + "loss": 2.1429, + "mean_token_accuracy": 0.4918330252170563, + "step": 146220 + }, + { + "epoch": 0.14727937303153274, + "grad_norm": 14.677319642046886, + "learning_rate": 4.8845775324705165e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.39655172228813174, + "step": 146225 + }, + { + "epoch": 0.1472844090846369, + "grad_norm": 12.325516327666973, + "learning_rate": 4.8845656846924295e-05, + "loss": 2.2689, + "mean_token_accuracy": 0.42413793206214906, + "step": 146230 + }, + { + "epoch": 0.1472894451377411, + "grad_norm": 11.34140828625138, + "learning_rate": 4.884553836322312e-05, + "loss": 2.2706, + "mean_token_accuracy": 0.47586206793785096, + "step": 146235 + }, + { + "epoch": 0.14729448119084526, + "grad_norm": 10.034739788075274, + "learning_rate": 4.884541987360169e-05, + "loss": 2.2461, + "mean_token_accuracy": 0.44827585816383364, + "step": 146240 + }, + { + "epoch": 0.14729951724394943, + "grad_norm": 8.317131215685407, + "learning_rate": 4.884530137806002e-05, + "loss": 1.7991, + "mean_token_accuracy": 0.5400483965873718, + "step": 146245 + }, + { + "epoch": 0.1473045532970536, + "grad_norm": 9.722296539234446, + "learning_rate": 4.884518287659815e-05, + "loss": 2.7371, + "mean_token_accuracy": 0.3586206793785095, + "step": 146250 + }, + { + "epoch": 0.14730958935015778, + "grad_norm": 9.472312115737532, + "learning_rate": 4.884506436921612e-05, + "loss": 2.1124, + "mean_token_accuracy": 0.4620689630508423, + "step": 146255 + }, + { + "epoch": 0.14731462540326196, + "grad_norm": 10.0180877102542, + "learning_rate": 4.884494585591395e-05, + "loss": 2.2207, + "mean_token_accuracy": 0.46551724076271056, + "step": 146260 + }, + { + "epoch": 0.14731966145636613, + "grad_norm": 13.030258134888165, + "learning_rate": 4.884482733669169e-05, + "loss": 2.2696, + "mean_token_accuracy": 0.4332123279571533, + "step": 146265 + }, + { + "epoch": 0.1473246975094703, + "grad_norm": 11.729951151696431, + "learning_rate": 4.884470881154935e-05, + "loss": 2.8376, + "mean_token_accuracy": 0.39310344457626345, + "step": 146270 + }, + { + "epoch": 0.14732973356257448, + "grad_norm": 12.155701166412321, + "learning_rate": 4.884459028048699e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.44827585220336913, + "step": 146275 + }, + { + "epoch": 0.14733476961567865, + "grad_norm": 8.762977626177326, + "learning_rate": 4.884447174350462e-05, + "loss": 2.0821, + "mean_token_accuracy": 0.4862068951129913, + "step": 146280 + }, + { + "epoch": 0.14733980566878283, + "grad_norm": 8.71140092842227, + "learning_rate": 4.8844353200602285e-05, + "loss": 2.2939, + "mean_token_accuracy": 0.4448275864124298, + "step": 146285 + }, + { + "epoch": 0.147344841721887, + "grad_norm": 10.090694337174211, + "learning_rate": 4.884423465178001e-05, + "loss": 2.4566, + "mean_token_accuracy": 0.458620685338974, + "step": 146290 + }, + { + "epoch": 0.14734987777499117, + "grad_norm": 11.481517029902541, + "learning_rate": 4.8844116097037835e-05, + "loss": 2.285, + "mean_token_accuracy": 0.42758620381355283, + "step": 146295 + }, + { + "epoch": 0.14735491382809535, + "grad_norm": 10.278153610245294, + "learning_rate": 4.884399753637579e-05, + "loss": 2.1368, + "mean_token_accuracy": 0.41379311084747317, + "step": 146300 + }, + { + "epoch": 0.14735994988119952, + "grad_norm": 10.90852096163775, + "learning_rate": 4.88438789697939e-05, + "loss": 2.3939, + "mean_token_accuracy": 0.49183303117752075, + "step": 146305 + }, + { + "epoch": 0.1473649859343037, + "grad_norm": 10.705909895860405, + "learning_rate": 4.884376039729222e-05, + "loss": 2.4323, + "mean_token_accuracy": 0.42068966031074523, + "step": 146310 + }, + { + "epoch": 0.14737002198740784, + "grad_norm": 9.015219956092839, + "learning_rate": 4.884364181887075e-05, + "loss": 2.2666, + "mean_token_accuracy": 0.4517241418361664, + "step": 146315 + }, + { + "epoch": 0.147375058040512, + "grad_norm": 11.08313096632233, + "learning_rate": 4.884352323452955e-05, + "loss": 2.5189, + "mean_token_accuracy": 0.4068965494632721, + "step": 146320 + }, + { + "epoch": 0.1473800940936162, + "grad_norm": 10.33400878703134, + "learning_rate": 4.884340464426865e-05, + "loss": 2.3309, + "mean_token_accuracy": 0.4517241418361664, + "step": 146325 + }, + { + "epoch": 0.14738513014672036, + "grad_norm": 12.154108546770908, + "learning_rate": 4.884328604808807e-05, + "loss": 2.332, + "mean_token_accuracy": 0.4448275864124298, + "step": 146330 + }, + { + "epoch": 0.14739016619982453, + "grad_norm": 8.830048582363991, + "learning_rate": 4.8843167445987854e-05, + "loss": 2.7378, + "mean_token_accuracy": 0.3896551728248596, + "step": 146335 + }, + { + "epoch": 0.1473952022529287, + "grad_norm": 9.68747061073109, + "learning_rate": 4.8843048837968025e-05, + "loss": 2.5257, + "mean_token_accuracy": 0.4103448331356049, + "step": 146340 + }, + { + "epoch": 0.14740023830603288, + "grad_norm": 10.260987762594217, + "learning_rate": 4.884293022402863e-05, + "loss": 2.1342, + "mean_token_accuracy": 0.4448275864124298, + "step": 146345 + }, + { + "epoch": 0.14740527435913706, + "grad_norm": 11.132578723949782, + "learning_rate": 4.884281160416969e-05, + "loss": 2.2275, + "mean_token_accuracy": 0.44568965435028074, + "step": 146350 + }, + { + "epoch": 0.14741031041224123, + "grad_norm": 10.54870335403427, + "learning_rate": 4.884269297839124e-05, + "loss": 2.3216, + "mean_token_accuracy": 0.4551724135875702, + "step": 146355 + }, + { + "epoch": 0.1474153464653454, + "grad_norm": 11.433128773766368, + "learning_rate": 4.884257434669332e-05, + "loss": 2.5821, + "mean_token_accuracy": 0.4103448212146759, + "step": 146360 + }, + { + "epoch": 0.14742038251844958, + "grad_norm": 10.30787475567201, + "learning_rate": 4.884245570907595e-05, + "loss": 2.2467, + "mean_token_accuracy": 0.44827587008476255, + "step": 146365 + }, + { + "epoch": 0.14742541857155375, + "grad_norm": 10.083902013749096, + "learning_rate": 4.884233706553917e-05, + "loss": 2.4386, + "mean_token_accuracy": 0.41034482717514037, + "step": 146370 + }, + { + "epoch": 0.14743045462465793, + "grad_norm": 8.56388368351227, + "learning_rate": 4.8842218416083015e-05, + "loss": 2.3002, + "mean_token_accuracy": 0.4537205100059509, + "step": 146375 + }, + { + "epoch": 0.1474354906777621, + "grad_norm": 10.110050960454688, + "learning_rate": 4.884209976070752e-05, + "loss": 2.4973, + "mean_token_accuracy": 0.4294010877609253, + "step": 146380 + }, + { + "epoch": 0.14744052673086627, + "grad_norm": 8.274268190874139, + "learning_rate": 4.884198109941271e-05, + "loss": 2.2145, + "mean_token_accuracy": 0.46551724076271056, + "step": 146385 + }, + { + "epoch": 0.14744556278397045, + "grad_norm": 8.71122633345135, + "learning_rate": 4.884186243219862e-05, + "loss": 2.6268, + "mean_token_accuracy": 0.460556560754776, + "step": 146390 + }, + { + "epoch": 0.14745059883707462, + "grad_norm": 8.68519901573476, + "learning_rate": 4.884174375906529e-05, + "loss": 2.2366, + "mean_token_accuracy": 0.46896552443504336, + "step": 146395 + }, + { + "epoch": 0.1474556348901788, + "grad_norm": 12.495651136904074, + "learning_rate": 4.884162508001274e-05, + "loss": 2.2491, + "mean_token_accuracy": 0.4327283799648285, + "step": 146400 + }, + { + "epoch": 0.14746067094328297, + "grad_norm": 9.91038335120461, + "learning_rate": 4.884150639504103e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.441379314661026, + "step": 146405 + }, + { + "epoch": 0.14746570699638714, + "grad_norm": 9.188514823770724, + "learning_rate": 4.884138770415015e-05, + "loss": 2.1695, + "mean_token_accuracy": 0.5206896483898162, + "step": 146410 + }, + { + "epoch": 0.14747074304949132, + "grad_norm": 9.237549188900097, + "learning_rate": 4.884126900734017e-05, + "loss": 2.0063, + "mean_token_accuracy": 0.49655172824859617, + "step": 146415 + }, + { + "epoch": 0.1474757791025955, + "grad_norm": 10.956473396832555, + "learning_rate": 4.8841150304611106e-05, + "loss": 2.1587, + "mean_token_accuracy": 0.49522080421447756, + "step": 146420 + }, + { + "epoch": 0.14748081515569966, + "grad_norm": 10.815094748530132, + "learning_rate": 4.8841031595963e-05, + "loss": 2.2209, + "mean_token_accuracy": 0.4206896543502808, + "step": 146425 + }, + { + "epoch": 0.14748585120880384, + "grad_norm": 9.17025455778591, + "learning_rate": 4.884091288139587e-05, + "loss": 2.2416, + "mean_token_accuracy": 0.4724137902259827, + "step": 146430 + }, + { + "epoch": 0.147490887261908, + "grad_norm": 9.225252261706242, + "learning_rate": 4.8840794160909757e-05, + "loss": 2.2493, + "mean_token_accuracy": 0.458620685338974, + "step": 146435 + }, + { + "epoch": 0.14749592331501218, + "grad_norm": 10.240961357357124, + "learning_rate": 4.88406754345047e-05, + "loss": 2.4335, + "mean_token_accuracy": 0.44978825449943544, + "step": 146440 + }, + { + "epoch": 0.14750095936811636, + "grad_norm": 11.787256924838278, + "learning_rate": 4.8840556702180726e-05, + "loss": 2.1962, + "mean_token_accuracy": 0.5091349065303803, + "step": 146445 + }, + { + "epoch": 0.14750599542122053, + "grad_norm": 10.038616450124481, + "learning_rate": 4.8840437963937875e-05, + "loss": 2.3103, + "mean_token_accuracy": 0.43448275327682495, + "step": 146450 + }, + { + "epoch": 0.14751103147432468, + "grad_norm": 5.7273493980233185, + "learning_rate": 4.884031921977617e-05, + "loss": 1.8326, + "mean_token_accuracy": 0.5257053256034852, + "step": 146455 + }, + { + "epoch": 0.14751606752742885, + "grad_norm": 10.79087451912435, + "learning_rate": 4.8840200469695646e-05, + "loss": 2.2156, + "mean_token_accuracy": 0.48275862336158754, + "step": 146460 + }, + { + "epoch": 0.14752110358053303, + "grad_norm": 11.136187144581514, + "learning_rate": 4.8840081713696336e-05, + "loss": 2.3738, + "mean_token_accuracy": 0.4344827651977539, + "step": 146465 + }, + { + "epoch": 0.1475261396336372, + "grad_norm": 9.369930552973205, + "learning_rate": 4.883996295177828e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.4137930989265442, + "step": 146470 + }, + { + "epoch": 0.14753117568674137, + "grad_norm": 11.28572459734448, + "learning_rate": 4.8839844183941505e-05, + "loss": 2.467, + "mean_token_accuracy": 0.44482757449150084, + "step": 146475 + }, + { + "epoch": 0.14753621173984555, + "grad_norm": 17.614949550329765, + "learning_rate": 4.883972541018604e-05, + "loss": 2.8033, + "mean_token_accuracy": 0.3517241358757019, + "step": 146480 + }, + { + "epoch": 0.14754124779294972, + "grad_norm": 10.475821291730258, + "learning_rate": 4.883960663051193e-05, + "loss": 2.7628, + "mean_token_accuracy": 0.41724138259887694, + "step": 146485 + }, + { + "epoch": 0.1475462838460539, + "grad_norm": 9.951961565298, + "learning_rate": 4.883948784491919e-05, + "loss": 2.4026, + "mean_token_accuracy": 0.44482759237289426, + "step": 146490 + }, + { + "epoch": 0.14755131989915807, + "grad_norm": 14.591603339776848, + "learning_rate": 4.8839369053407874e-05, + "loss": 2.4288, + "mean_token_accuracy": 0.37241379022598264, + "step": 146495 + }, + { + "epoch": 0.14755635595226224, + "grad_norm": 10.326592833186215, + "learning_rate": 4.8839250255978e-05, + "loss": 2.3553, + "mean_token_accuracy": 0.4310344815254211, + "step": 146500 + }, + { + "epoch": 0.14756139200536642, + "grad_norm": 10.85624867571091, + "learning_rate": 4.88391314526296e-05, + "loss": 2.7313, + "mean_token_accuracy": 0.36896551847457887, + "step": 146505 + }, + { + "epoch": 0.1475664280584706, + "grad_norm": 10.579138715553771, + "learning_rate": 4.883901264336272e-05, + "loss": 2.402, + "mean_token_accuracy": 0.4379310369491577, + "step": 146510 + }, + { + "epoch": 0.14757146411157476, + "grad_norm": 8.566260735839993, + "learning_rate": 4.8838893828177376e-05, + "loss": 2.2108, + "mean_token_accuracy": 0.47586206793785096, + "step": 146515 + }, + { + "epoch": 0.14757650016467894, + "grad_norm": 12.388594840787238, + "learning_rate": 4.883877500707362e-05, + "loss": 2.6501, + "mean_token_accuracy": 0.4137930929660797, + "step": 146520 + }, + { + "epoch": 0.1475815362177831, + "grad_norm": 9.676457560637665, + "learning_rate": 4.883865618005147e-05, + "loss": 2.3913, + "mean_token_accuracy": 0.48275862336158754, + "step": 146525 + }, + { + "epoch": 0.14758657227088728, + "grad_norm": 9.493631459136834, + "learning_rate": 4.883853734711097e-05, + "loss": 2.4776, + "mean_token_accuracy": 0.39655172228813174, + "step": 146530 + }, + { + "epoch": 0.14759160832399146, + "grad_norm": 11.880314317638344, + "learning_rate": 4.8838418508252144e-05, + "loss": 2.4365, + "mean_token_accuracy": 0.44137929677963256, + "step": 146535 + }, + { + "epoch": 0.14759664437709563, + "grad_norm": 10.421708421766668, + "learning_rate": 4.8838299663475026e-05, + "loss": 2.1948, + "mean_token_accuracy": 0.47586206793785096, + "step": 146540 + }, + { + "epoch": 0.1476016804301998, + "grad_norm": 10.640010068441317, + "learning_rate": 4.8838180812779655e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.4068965494632721, + "step": 146545 + }, + { + "epoch": 0.14760671648330398, + "grad_norm": 12.38843265282111, + "learning_rate": 4.883806195616605e-05, + "loss": 2.7118, + "mean_token_accuracy": 0.341379314661026, + "step": 146550 + }, + { + "epoch": 0.14761175253640815, + "grad_norm": 10.730498109941115, + "learning_rate": 4.883794309363426e-05, + "loss": 2.767, + "mean_token_accuracy": 0.3620689630508423, + "step": 146555 + }, + { + "epoch": 0.14761678858951233, + "grad_norm": 9.462429444011796, + "learning_rate": 4.883782422518431e-05, + "loss": 2.1872, + "mean_token_accuracy": 0.44972777366638184, + "step": 146560 + }, + { + "epoch": 0.1476218246426165, + "grad_norm": 10.733736294996554, + "learning_rate": 4.883770535081624e-05, + "loss": 2.2791, + "mean_token_accuracy": 0.4620689690113068, + "step": 146565 + }, + { + "epoch": 0.14762686069572067, + "grad_norm": 10.149472125156075, + "learning_rate": 4.883758647053008e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.4620689690113068, + "step": 146570 + }, + { + "epoch": 0.14763189674882485, + "grad_norm": 11.807214800314915, + "learning_rate": 4.883746758432586e-05, + "loss": 1.9094, + "mean_token_accuracy": 0.4984271049499512, + "step": 146575 + }, + { + "epoch": 0.14763693280192902, + "grad_norm": 9.609306155537865, + "learning_rate": 4.8837348692203604e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.4517241299152374, + "step": 146580 + }, + { + "epoch": 0.1476419688550332, + "grad_norm": 10.27693372920149, + "learning_rate": 4.883722979416336e-05, + "loss": 2.5637, + "mean_token_accuracy": 0.41179673075675965, + "step": 146585 + }, + { + "epoch": 0.14764700490813734, + "grad_norm": 11.134791560753781, + "learning_rate": 4.8837110890205154e-05, + "loss": 2.7382, + "mean_token_accuracy": 0.3896551728248596, + "step": 146590 + }, + { + "epoch": 0.14765204096124152, + "grad_norm": 9.8899887104632, + "learning_rate": 4.883699198032903e-05, + "loss": 2.5688, + "mean_token_accuracy": 0.43448275327682495, + "step": 146595 + }, + { + "epoch": 0.1476570770143457, + "grad_norm": 9.55192809333942, + "learning_rate": 4.8836873064535e-05, + "loss": 2.6242, + "mean_token_accuracy": 0.35977011919021606, + "step": 146600 + }, + { + "epoch": 0.14766211306744986, + "grad_norm": 13.373182946888855, + "learning_rate": 4.883675414282312e-05, + "loss": 2.8299, + "mean_token_accuracy": 0.4344827651977539, + "step": 146605 + }, + { + "epoch": 0.14766714912055404, + "grad_norm": 9.577822422674908, + "learning_rate": 4.88366352151934e-05, + "loss": 2.5368, + "mean_token_accuracy": 0.42758620381355283, + "step": 146610 + }, + { + "epoch": 0.1476721851736582, + "grad_norm": 11.56112300602276, + "learning_rate": 4.883651628164589e-05, + "loss": 2.5942, + "mean_token_accuracy": 0.41379310488700866, + "step": 146615 + }, + { + "epoch": 0.14767722122676238, + "grad_norm": 10.63430778923677, + "learning_rate": 4.883639734218062e-05, + "loss": 2.4569, + "mean_token_accuracy": 0.4068965494632721, + "step": 146620 + }, + { + "epoch": 0.14768225727986656, + "grad_norm": 9.847355802926, + "learning_rate": 4.883627839679761e-05, + "loss": 2.0555, + "mean_token_accuracy": 0.47586206793785096, + "step": 146625 + }, + { + "epoch": 0.14768729333297073, + "grad_norm": 10.602198340302646, + "learning_rate": 4.883615944549692e-05, + "loss": 2.2781, + "mean_token_accuracy": 0.46551724076271056, + "step": 146630 + }, + { + "epoch": 0.1476923293860749, + "grad_norm": 9.208961841963491, + "learning_rate": 4.8836040488278554e-05, + "loss": 2.1805, + "mean_token_accuracy": 0.5021173596382141, + "step": 146635 + }, + { + "epoch": 0.14769736543917908, + "grad_norm": 9.608708423595962, + "learning_rate": 4.883592152514256e-05, + "loss": 2.5949, + "mean_token_accuracy": 0.3862068921327591, + "step": 146640 + }, + { + "epoch": 0.14770240149228325, + "grad_norm": 27.001126321356463, + "learning_rate": 4.883580255608897e-05, + "loss": 2.4959, + "mean_token_accuracy": 0.4206896543502808, + "step": 146645 + }, + { + "epoch": 0.14770743754538743, + "grad_norm": 11.383972108827463, + "learning_rate": 4.883568358111781e-05, + "loss": 2.2764, + "mean_token_accuracy": 0.45517241954803467, + "step": 146650 + }, + { + "epoch": 0.1477124735984916, + "grad_norm": 9.810501025851796, + "learning_rate": 4.8835564600229134e-05, + "loss": 2.2377, + "mean_token_accuracy": 0.5, + "step": 146655 + }, + { + "epoch": 0.14771750965159577, + "grad_norm": 13.543985645032281, + "learning_rate": 4.883544561342295e-05, + "loss": 2.1642, + "mean_token_accuracy": 0.45172414779663084, + "step": 146660 + }, + { + "epoch": 0.14772254570469995, + "grad_norm": 10.319474986798374, + "learning_rate": 4.88353266206993e-05, + "loss": 2.5935, + "mean_token_accuracy": 0.37586206793785093, + "step": 146665 + }, + { + "epoch": 0.14772758175780412, + "grad_norm": 10.089699163416679, + "learning_rate": 4.883520762205822e-05, + "loss": 2.2842, + "mean_token_accuracy": 0.4586206912994385, + "step": 146670 + }, + { + "epoch": 0.1477326178109083, + "grad_norm": 8.722409595277039, + "learning_rate": 4.883508861749973e-05, + "loss": 2.2236, + "mean_token_accuracy": 0.4379310369491577, + "step": 146675 + }, + { + "epoch": 0.14773765386401247, + "grad_norm": 9.33123581133712, + "learning_rate": 4.883496960702389e-05, + "loss": 2.1117, + "mean_token_accuracy": 0.4448275864124298, + "step": 146680 + }, + { + "epoch": 0.14774268991711664, + "grad_norm": 8.994285269822763, + "learning_rate": 4.883485059063071e-05, + "loss": 2.3298, + "mean_token_accuracy": 0.4137930989265442, + "step": 146685 + }, + { + "epoch": 0.14774772597022082, + "grad_norm": 9.849933635004636, + "learning_rate": 4.8834731568320225e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.4620689630508423, + "step": 146690 + }, + { + "epoch": 0.147752762023325, + "grad_norm": 11.920533190500235, + "learning_rate": 4.883461254009248e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.5227465212345124, + "step": 146695 + }, + { + "epoch": 0.14775779807642916, + "grad_norm": 8.508531091560226, + "learning_rate": 4.883449350594749e-05, + "loss": 2.1337, + "mean_token_accuracy": 0.46551724076271056, + "step": 146700 + }, + { + "epoch": 0.14776283412953334, + "grad_norm": 11.148575961078116, + "learning_rate": 4.883437446588531e-05, + "loss": 2.1781, + "mean_token_accuracy": 0.4172413766384125, + "step": 146705 + }, + { + "epoch": 0.1477678701826375, + "grad_norm": 11.104648109231968, + "learning_rate": 4.8834255419905956e-05, + "loss": 2.4606, + "mean_token_accuracy": 0.41724138259887694, + "step": 146710 + }, + { + "epoch": 0.14777290623574169, + "grad_norm": 10.183326127010416, + "learning_rate": 4.883413636800946e-05, + "loss": 2.2684, + "mean_token_accuracy": 0.4137930989265442, + "step": 146715 + }, + { + "epoch": 0.14777794228884586, + "grad_norm": 10.157512761626103, + "learning_rate": 4.883401731019587e-05, + "loss": 2.4628, + "mean_token_accuracy": 0.41379311084747317, + "step": 146720 + }, + { + "epoch": 0.14778297834195003, + "grad_norm": 12.331485137323705, + "learning_rate": 4.883389824646522e-05, + "loss": 2.2155, + "mean_token_accuracy": 0.4620689690113068, + "step": 146725 + }, + { + "epoch": 0.14778801439505418, + "grad_norm": 8.691006152056522, + "learning_rate": 4.883377917681752e-05, + "loss": 2.2292, + "mean_token_accuracy": 0.44827585816383364, + "step": 146730 + }, + { + "epoch": 0.14779305044815835, + "grad_norm": 9.33166202423798, + "learning_rate": 4.883366010125282e-05, + "loss": 1.9815, + "mean_token_accuracy": 0.5129461526870728, + "step": 146735 + }, + { + "epoch": 0.14779808650126253, + "grad_norm": 8.82905311199546, + "learning_rate": 4.883354101977115e-05, + "loss": 2.1317, + "mean_token_accuracy": 0.47931033968925474, + "step": 146740 + }, + { + "epoch": 0.1478031225543667, + "grad_norm": 8.156492362756103, + "learning_rate": 4.883342193237254e-05, + "loss": 2.0015, + "mean_token_accuracy": 0.4896551728248596, + "step": 146745 + }, + { + "epoch": 0.14780815860747087, + "grad_norm": 8.553112541562346, + "learning_rate": 4.883330283905703e-05, + "loss": 2.6608, + "mean_token_accuracy": 0.38620689511299133, + "step": 146750 + }, + { + "epoch": 0.14781319466057505, + "grad_norm": 10.045827060021612, + "learning_rate": 4.883318373982464e-05, + "loss": 2.3827, + "mean_token_accuracy": 0.38620689511299133, + "step": 146755 + }, + { + "epoch": 0.14781823071367922, + "grad_norm": 11.670589655730081, + "learning_rate": 4.883306463467543e-05, + "loss": 2.1676, + "mean_token_accuracy": 0.4602216809988022, + "step": 146760 + }, + { + "epoch": 0.1478232667667834, + "grad_norm": 9.917873117249139, + "learning_rate": 4.88329455236094e-05, + "loss": 2.4888, + "mean_token_accuracy": 0.42758620977401735, + "step": 146765 + }, + { + "epoch": 0.14782830281988757, + "grad_norm": 11.94839686292455, + "learning_rate": 4.8832826406626605e-05, + "loss": 2.4104, + "mean_token_accuracy": 0.42758620381355283, + "step": 146770 + }, + { + "epoch": 0.14783333887299174, + "grad_norm": 9.446728488471768, + "learning_rate": 4.883270728372707e-05, + "loss": 2.7662, + "mean_token_accuracy": 0.3842710226774216, + "step": 146775 + }, + { + "epoch": 0.14783837492609592, + "grad_norm": 11.060741567921166, + "learning_rate": 4.883258815491083e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.41379310488700866, + "step": 146780 + }, + { + "epoch": 0.1478434109792001, + "grad_norm": 9.97709318867642, + "learning_rate": 4.883246902017791e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.417241370677948, + "step": 146785 + }, + { + "epoch": 0.14784844703230426, + "grad_norm": 7.888987031662764, + "learning_rate": 4.8832349879528354e-05, + "loss": 2.3099, + "mean_token_accuracy": 0.48275861144065857, + "step": 146790 + }, + { + "epoch": 0.14785348308540844, + "grad_norm": 10.081654616337394, + "learning_rate": 4.88322307329622e-05, + "loss": 2.1395, + "mean_token_accuracy": 0.4671506345272064, + "step": 146795 + }, + { + "epoch": 0.1478585191385126, + "grad_norm": 9.190143550317845, + "learning_rate": 4.8832111580479464e-05, + "loss": 2.6588, + "mean_token_accuracy": 0.45015124082565305, + "step": 146800 + }, + { + "epoch": 0.14786355519161679, + "grad_norm": 11.854260742078802, + "learning_rate": 4.883199242208018e-05, + "loss": 2.3756, + "mean_token_accuracy": 0.46551724076271056, + "step": 146805 + }, + { + "epoch": 0.14786859124472096, + "grad_norm": 9.971496348896805, + "learning_rate": 4.88318732577644e-05, + "loss": 2.4286, + "mean_token_accuracy": 0.4068965494632721, + "step": 146810 + }, + { + "epoch": 0.14787362729782513, + "grad_norm": 13.351636449077631, + "learning_rate": 4.883175408753214e-05, + "loss": 2.3931, + "mean_token_accuracy": 0.4011494219303131, + "step": 146815 + }, + { + "epoch": 0.1478786633509293, + "grad_norm": 13.151232499274933, + "learning_rate": 4.883163491138343e-05, + "loss": 2.3639, + "mean_token_accuracy": 0.4620689630508423, + "step": 146820 + }, + { + "epoch": 0.14788369940403348, + "grad_norm": 10.853120985157206, + "learning_rate": 4.883151572931833e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.3862069010734558, + "step": 146825 + }, + { + "epoch": 0.14788873545713765, + "grad_norm": 10.810063094173854, + "learning_rate": 4.883139654133684e-05, + "loss": 2.6214, + "mean_token_accuracy": 0.4137930989265442, + "step": 146830 + }, + { + "epoch": 0.14789377151024183, + "grad_norm": 11.68174636967568, + "learning_rate": 4.883127734743902e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.4448275864124298, + "step": 146835 + }, + { + "epoch": 0.147898807563346, + "grad_norm": 10.452539981283827, + "learning_rate": 4.8831158147624886e-05, + "loss": 2.2912, + "mean_token_accuracy": 0.43793103098869324, + "step": 146840 + }, + { + "epoch": 0.14790384361645018, + "grad_norm": 11.457516000585034, + "learning_rate": 4.883103894189447e-05, + "loss": 2.5971, + "mean_token_accuracy": 0.4318965435028076, + "step": 146845 + }, + { + "epoch": 0.14790887966955435, + "grad_norm": 10.724010091830898, + "learning_rate": 4.883091973024782e-05, + "loss": 2.1616, + "mean_token_accuracy": 0.4517241358757019, + "step": 146850 + }, + { + "epoch": 0.14791391572265852, + "grad_norm": 14.631399793116891, + "learning_rate": 4.883080051268495e-05, + "loss": 2.3651, + "mean_token_accuracy": 0.42413793206214906, + "step": 146855 + }, + { + "epoch": 0.1479189517757627, + "grad_norm": 7.779416434873946, + "learning_rate": 4.88306812892059e-05, + "loss": 1.8556, + "mean_token_accuracy": 0.5, + "step": 146860 + }, + { + "epoch": 0.14792398782886687, + "grad_norm": 12.32688825140249, + "learning_rate": 4.883056205981072e-05, + "loss": 2.5505, + "mean_token_accuracy": 0.4052026569843292, + "step": 146865 + }, + { + "epoch": 0.14792902388197102, + "grad_norm": 10.818373112727803, + "learning_rate": 4.8830442824499425e-05, + "loss": 2.2349, + "mean_token_accuracy": 0.41724138259887694, + "step": 146870 + }, + { + "epoch": 0.1479340599350752, + "grad_norm": 19.448318614458486, + "learning_rate": 4.8830323583272046e-05, + "loss": 2.4397, + "mean_token_accuracy": 0.3896551728248596, + "step": 146875 + }, + { + "epoch": 0.14793909598817936, + "grad_norm": 8.917478137375324, + "learning_rate": 4.883020433612863e-05, + "loss": 2.3465, + "mean_token_accuracy": 0.4379310429096222, + "step": 146880 + }, + { + "epoch": 0.14794413204128354, + "grad_norm": 10.562110699761524, + "learning_rate": 4.8830085083069196e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.4310344815254211, + "step": 146885 + }, + { + "epoch": 0.1479491680943877, + "grad_norm": 12.2661494371973, + "learning_rate": 4.882996582409379e-05, + "loss": 2.232, + "mean_token_accuracy": 0.4137930989265442, + "step": 146890 + }, + { + "epoch": 0.14795420414749189, + "grad_norm": 10.656975402327985, + "learning_rate": 4.882984655920243e-05, + "loss": 2.2246, + "mean_token_accuracy": 0.4379310250282288, + "step": 146895 + }, + { + "epoch": 0.14795924020059606, + "grad_norm": 9.811381690529911, + "learning_rate": 4.8829727288395154e-05, + "loss": 2.355, + "mean_token_accuracy": 0.4482758641242981, + "step": 146900 + }, + { + "epoch": 0.14796427625370023, + "grad_norm": 10.749886293706474, + "learning_rate": 4.8829608011672005e-05, + "loss": 2.6515, + "mean_token_accuracy": 0.4068965554237366, + "step": 146905 + }, + { + "epoch": 0.1479693123068044, + "grad_norm": 10.051160330449358, + "learning_rate": 4.8829488729033016e-05, + "loss": 2.1394, + "mean_token_accuracy": 0.48620688915252686, + "step": 146910 + }, + { + "epoch": 0.14797434835990858, + "grad_norm": 9.063474302065995, + "learning_rate": 4.8829369440478206e-05, + "loss": 2.207, + "mean_token_accuracy": 0.4517241358757019, + "step": 146915 + }, + { + "epoch": 0.14797938441301275, + "grad_norm": 14.468489261088424, + "learning_rate": 4.8829250146007616e-05, + "loss": 2.6316, + "mean_token_accuracy": 0.4517241418361664, + "step": 146920 + }, + { + "epoch": 0.14798442046611693, + "grad_norm": 9.873052513519497, + "learning_rate": 4.882913084562129e-05, + "loss": 2.1174, + "mean_token_accuracy": 0.48275861144065857, + "step": 146925 + }, + { + "epoch": 0.1479894565192211, + "grad_norm": 11.33735986014298, + "learning_rate": 4.882901153931923e-05, + "loss": 2.1657, + "mean_token_accuracy": 0.4896551728248596, + "step": 146930 + }, + { + "epoch": 0.14799449257232528, + "grad_norm": 13.303743717214939, + "learning_rate": 4.8828892227101494e-05, + "loss": 2.3075, + "mean_token_accuracy": 0.4586206912994385, + "step": 146935 + }, + { + "epoch": 0.14799952862542945, + "grad_norm": 11.751778294002708, + "learning_rate": 4.8828772908968124e-05, + "loss": 2.5029, + "mean_token_accuracy": 0.4068965554237366, + "step": 146940 + }, + { + "epoch": 0.14800456467853362, + "grad_norm": 13.795411416776913, + "learning_rate": 4.882865358491913e-05, + "loss": 2.4626, + "mean_token_accuracy": 0.43448275327682495, + "step": 146945 + }, + { + "epoch": 0.1480096007316378, + "grad_norm": 11.153811359712817, + "learning_rate": 4.8828534254954554e-05, + "loss": 2.5096, + "mean_token_accuracy": 0.3793103456497192, + "step": 146950 + }, + { + "epoch": 0.14801463678474197, + "grad_norm": 10.91217027596112, + "learning_rate": 4.882841491907444e-05, + "loss": 2.2654, + "mean_token_accuracy": 0.4465819776058197, + "step": 146955 + }, + { + "epoch": 0.14801967283784614, + "grad_norm": 12.347349310664852, + "learning_rate": 4.882829557727879e-05, + "loss": 2.3912, + "mean_token_accuracy": 0.4344827592372894, + "step": 146960 + }, + { + "epoch": 0.14802470889095032, + "grad_norm": 12.046115658967606, + "learning_rate": 4.8828176229567676e-05, + "loss": 2.347, + "mean_token_accuracy": 0.4068965554237366, + "step": 146965 + }, + { + "epoch": 0.1480297449440545, + "grad_norm": 8.633677325080502, + "learning_rate": 4.8828056875941105e-05, + "loss": 2.1154, + "mean_token_accuracy": 0.46551724076271056, + "step": 146970 + }, + { + "epoch": 0.14803478099715867, + "grad_norm": 9.70258487075281, + "learning_rate": 4.882793751639912e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.4206896543502808, + "step": 146975 + }, + { + "epoch": 0.14803981705026284, + "grad_norm": 9.357207390730874, + "learning_rate": 4.882781815094175e-05, + "loss": 1.975, + "mean_token_accuracy": 0.4931034505367279, + "step": 146980 + }, + { + "epoch": 0.148044853103367, + "grad_norm": 10.092193929985303, + "learning_rate": 4.882769877956903e-05, + "loss": 2.3329, + "mean_token_accuracy": 0.4758620738983154, + "step": 146985 + }, + { + "epoch": 0.1480498891564712, + "grad_norm": 9.337911435620743, + "learning_rate": 4.8827579402280994e-05, + "loss": 2.4896, + "mean_token_accuracy": 0.42068966031074523, + "step": 146990 + }, + { + "epoch": 0.14805492520957536, + "grad_norm": 13.956530833603086, + "learning_rate": 4.882746001907767e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.4379310369491577, + "step": 146995 + }, + { + "epoch": 0.14805996126267953, + "grad_norm": 9.694196148159097, + "learning_rate": 4.88273406299591e-05, + "loss": 2.0896, + "mean_token_accuracy": 0.4886267364025116, + "step": 147000 + }, + { + "epoch": 0.1480649973157837, + "grad_norm": 10.14038209671101, + "learning_rate": 4.8827221234925316e-05, + "loss": 2.3088, + "mean_token_accuracy": 0.4310344815254211, + "step": 147005 + }, + { + "epoch": 0.14807003336888785, + "grad_norm": 8.47962060171844, + "learning_rate": 4.882710183397634e-05, + "loss": 2.231, + "mean_token_accuracy": 0.4724137902259827, + "step": 147010 + }, + { + "epoch": 0.14807506942199203, + "grad_norm": 9.375338667017155, + "learning_rate": 4.882698242711222e-05, + "loss": 3.0113, + "mean_token_accuracy": 0.4310344934463501, + "step": 147015 + }, + { + "epoch": 0.1480801054750962, + "grad_norm": 13.394190069173844, + "learning_rate": 4.882686301433298e-05, + "loss": 2.5559, + "mean_token_accuracy": 0.4482758641242981, + "step": 147020 + }, + { + "epoch": 0.14808514152820038, + "grad_norm": 10.340004248478332, + "learning_rate": 4.882674359563865e-05, + "loss": 2.5679, + "mean_token_accuracy": 0.4172413766384125, + "step": 147025 + }, + { + "epoch": 0.14809017758130455, + "grad_norm": 10.805249286955643, + "learning_rate": 4.882662417102928e-05, + "loss": 2.6067, + "mean_token_accuracy": 0.4172413766384125, + "step": 147030 + }, + { + "epoch": 0.14809521363440872, + "grad_norm": 10.958884524553648, + "learning_rate": 4.8826504740504875e-05, + "loss": 2.4093, + "mean_token_accuracy": 0.42413792610168455, + "step": 147035 + }, + { + "epoch": 0.1481002496875129, + "grad_norm": 10.55947653101469, + "learning_rate": 4.882638530406549e-05, + "loss": 2.6572, + "mean_token_accuracy": 0.3965517163276672, + "step": 147040 + }, + { + "epoch": 0.14810528574061707, + "grad_norm": 8.80993214292461, + "learning_rate": 4.882626586171116e-05, + "loss": 2.1816, + "mean_token_accuracy": 0.4620689630508423, + "step": 147045 + }, + { + "epoch": 0.14811032179372124, + "grad_norm": 8.988300597427417, + "learning_rate": 4.88261464134419e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.46896551847457885, + "step": 147050 + }, + { + "epoch": 0.14811535784682542, + "grad_norm": 11.325289023431656, + "learning_rate": 4.882602695925776e-05, + "loss": 2.6253, + "mean_token_accuracy": 0.3724137872457504, + "step": 147055 + }, + { + "epoch": 0.1481203938999296, + "grad_norm": 11.427566943905516, + "learning_rate": 4.882590749915877e-05, + "loss": 2.0728, + "mean_token_accuracy": 0.47931034564971925, + "step": 147060 + }, + { + "epoch": 0.14812542995303377, + "grad_norm": 9.773811031514485, + "learning_rate": 4.882578803314496e-05, + "loss": 2.5847, + "mean_token_accuracy": 0.4068965494632721, + "step": 147065 + }, + { + "epoch": 0.14813046600613794, + "grad_norm": 9.22098351003182, + "learning_rate": 4.882566856121636e-05, + "loss": 2.4532, + "mean_token_accuracy": 0.4620689570903778, + "step": 147070 + }, + { + "epoch": 0.1481355020592421, + "grad_norm": 10.794101241930523, + "learning_rate": 4.8825549083372995e-05, + "loss": 2.2344, + "mean_token_accuracy": 0.48965516686439514, + "step": 147075 + }, + { + "epoch": 0.1481405381123463, + "grad_norm": 11.782254365490648, + "learning_rate": 4.8825429599614916e-05, + "loss": 2.1795, + "mean_token_accuracy": 0.458620685338974, + "step": 147080 + }, + { + "epoch": 0.14814557416545046, + "grad_norm": 12.516758655723082, + "learning_rate": 4.882531010994216e-05, + "loss": 2.5111, + "mean_token_accuracy": 0.43793103098869324, + "step": 147085 + }, + { + "epoch": 0.14815061021855463, + "grad_norm": 9.292017853443491, + "learning_rate": 4.8825190614354746e-05, + "loss": 2.2597, + "mean_token_accuracy": 0.43793103098869324, + "step": 147090 + }, + { + "epoch": 0.1481556462716588, + "grad_norm": 11.525909744506324, + "learning_rate": 4.88250711128527e-05, + "loss": 2.7411, + "mean_token_accuracy": 0.3793103516101837, + "step": 147095 + }, + { + "epoch": 0.14816068232476298, + "grad_norm": 8.434245892908029, + "learning_rate": 4.8824951605436085e-05, + "loss": 2.376, + "mean_token_accuracy": 0.3896551728248596, + "step": 147100 + }, + { + "epoch": 0.14816571837786716, + "grad_norm": 11.081474491162624, + "learning_rate": 4.88248320921049e-05, + "loss": 2.3704, + "mean_token_accuracy": 0.37586206793785093, + "step": 147105 + }, + { + "epoch": 0.14817075443097133, + "grad_norm": 11.118349859095188, + "learning_rate": 4.88247125728592e-05, + "loss": 2.2949, + "mean_token_accuracy": 0.4137930989265442, + "step": 147110 + }, + { + "epoch": 0.1481757904840755, + "grad_norm": 17.346127663866344, + "learning_rate": 4.8824593047699e-05, + "loss": 2.5886, + "mean_token_accuracy": 0.441379314661026, + "step": 147115 + }, + { + "epoch": 0.14818082653717968, + "grad_norm": 11.41250087747657, + "learning_rate": 4.8824473516624354e-05, + "loss": 2.5483, + "mean_token_accuracy": 0.4413793087005615, + "step": 147120 + }, + { + "epoch": 0.14818586259028385, + "grad_norm": 10.469391016498895, + "learning_rate": 4.882435397963528e-05, + "loss": 2.5192, + "mean_token_accuracy": 0.37241379618644715, + "step": 147125 + }, + { + "epoch": 0.14819089864338802, + "grad_norm": 10.485920367909888, + "learning_rate": 4.882423443673183e-05, + "loss": 2.2307, + "mean_token_accuracy": 0.4551724076271057, + "step": 147130 + }, + { + "epoch": 0.1481959346964922, + "grad_norm": 12.344392980457553, + "learning_rate": 4.8824114887914014e-05, + "loss": 2.5895, + "mean_token_accuracy": 0.4068965494632721, + "step": 147135 + }, + { + "epoch": 0.14820097074959637, + "grad_norm": 9.396068885652161, + "learning_rate": 4.8823995333181885e-05, + "loss": 2.2102, + "mean_token_accuracy": 0.44996975660324096, + "step": 147140 + }, + { + "epoch": 0.14820600680270055, + "grad_norm": 9.656948972456169, + "learning_rate": 4.882387577253545e-05, + "loss": 2.5612, + "mean_token_accuracy": 0.4137930989265442, + "step": 147145 + }, + { + "epoch": 0.1482110428558047, + "grad_norm": 9.092816251556293, + "learning_rate": 4.882375620597476e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.4551724135875702, + "step": 147150 + }, + { + "epoch": 0.14821607890890887, + "grad_norm": 9.671967396975852, + "learning_rate": 4.882363663349986e-05, + "loss": 2.1739, + "mean_token_accuracy": 0.44482758045196535, + "step": 147155 + }, + { + "epoch": 0.14822111496201304, + "grad_norm": 9.750344720450771, + "learning_rate": 4.882351705511076e-05, + "loss": 2.3524, + "mean_token_accuracy": 0.42068966031074523, + "step": 147160 + }, + { + "epoch": 0.1482261510151172, + "grad_norm": 9.83591948492884, + "learning_rate": 4.882339747080751e-05, + "loss": 2.6674, + "mean_token_accuracy": 0.39655173420906065, + "step": 147165 + }, + { + "epoch": 0.1482311870682214, + "grad_norm": 7.771731488091805, + "learning_rate": 4.882327788059014e-05, + "loss": 2.1133, + "mean_token_accuracy": 0.4862068951129913, + "step": 147170 + }, + { + "epoch": 0.14823622312132556, + "grad_norm": 9.491716710061436, + "learning_rate": 4.8823158284458664e-05, + "loss": 2.3641, + "mean_token_accuracy": 0.4586206912994385, + "step": 147175 + }, + { + "epoch": 0.14824125917442973, + "grad_norm": 10.11588444594482, + "learning_rate": 4.882303868241314e-05, + "loss": 1.9879, + "mean_token_accuracy": 0.4862068951129913, + "step": 147180 + }, + { + "epoch": 0.1482462952275339, + "grad_norm": 8.953449011958325, + "learning_rate": 4.882291907445359e-05, + "loss": 1.87, + "mean_token_accuracy": 0.5436176657676697, + "step": 147185 + }, + { + "epoch": 0.14825133128063808, + "grad_norm": 12.349147779226927, + "learning_rate": 4.882279946058004e-05, + "loss": 2.3262, + "mean_token_accuracy": 0.4379310369491577, + "step": 147190 + }, + { + "epoch": 0.14825636733374226, + "grad_norm": 9.809502896048327, + "learning_rate": 4.882267984079255e-05, + "loss": 2.1363, + "mean_token_accuracy": 0.4448275864124298, + "step": 147195 + }, + { + "epoch": 0.14826140338684643, + "grad_norm": 9.520689599131728, + "learning_rate": 4.882256021509112e-05, + "loss": 2.4987, + "mean_token_accuracy": 0.42758620977401735, + "step": 147200 + }, + { + "epoch": 0.1482664394399506, + "grad_norm": 10.98789466847955, + "learning_rate": 4.8822440583475804e-05, + "loss": 2.3156, + "mean_token_accuracy": 0.3793103456497192, + "step": 147205 + }, + { + "epoch": 0.14827147549305478, + "grad_norm": 7.906718049284113, + "learning_rate": 4.882232094594663e-05, + "loss": 2.284, + "mean_token_accuracy": 0.4275861978530884, + "step": 147210 + }, + { + "epoch": 0.14827651154615895, + "grad_norm": 10.278386647021943, + "learning_rate": 4.882220130250363e-05, + "loss": 2.2462, + "mean_token_accuracy": 0.4842710256576538, + "step": 147215 + }, + { + "epoch": 0.14828154759926312, + "grad_norm": 11.365946559692356, + "learning_rate": 4.8822081653146844e-05, + "loss": 2.3849, + "mean_token_accuracy": 0.4551724135875702, + "step": 147220 + }, + { + "epoch": 0.1482865836523673, + "grad_norm": 11.38413584827024, + "learning_rate": 4.882196199787629e-05, + "loss": 3.0469, + "mean_token_accuracy": 0.34827586114406583, + "step": 147225 + }, + { + "epoch": 0.14829161970547147, + "grad_norm": 9.04667339586641, + "learning_rate": 4.8821842336692014e-05, + "loss": 2.7558, + "mean_token_accuracy": 0.38620689511299133, + "step": 147230 + }, + { + "epoch": 0.14829665575857565, + "grad_norm": 11.260133601294315, + "learning_rate": 4.882172266959405e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.4275862157344818, + "step": 147235 + }, + { + "epoch": 0.14830169181167982, + "grad_norm": 10.54253472660178, + "learning_rate": 4.8821602996582414e-05, + "loss": 3.0929, + "mean_token_accuracy": 0.3379310369491577, + "step": 147240 + }, + { + "epoch": 0.148306727864784, + "grad_norm": 10.29737561459594, + "learning_rate": 4.882148331765717e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.4551724076271057, + "step": 147245 + }, + { + "epoch": 0.14831176391788817, + "grad_norm": 15.701734282528049, + "learning_rate": 4.882136363281832e-05, + "loss": 2.6464, + "mean_token_accuracy": 0.358620685338974, + "step": 147250 + }, + { + "epoch": 0.14831679997099234, + "grad_norm": 11.015287121419911, + "learning_rate": 4.882124394206591e-05, + "loss": 2.0684, + "mean_token_accuracy": 0.42068964838981626, + "step": 147255 + }, + { + "epoch": 0.14832183602409652, + "grad_norm": 9.730699479997783, + "learning_rate": 4.882112424539998e-05, + "loss": 2.5134, + "mean_token_accuracy": 0.3999999940395355, + "step": 147260 + }, + { + "epoch": 0.1483268720772007, + "grad_norm": 8.858295651404818, + "learning_rate": 4.882100454282055e-05, + "loss": 2.1884, + "mean_token_accuracy": 0.45517241954803467, + "step": 147265 + }, + { + "epoch": 0.14833190813030486, + "grad_norm": 11.316933747512023, + "learning_rate": 4.8820884834327665e-05, + "loss": 2.3365, + "mean_token_accuracy": 0.43103448748588563, + "step": 147270 + }, + { + "epoch": 0.14833694418340904, + "grad_norm": 12.726977749976244, + "learning_rate": 4.8820765119921355e-05, + "loss": 2.3813, + "mean_token_accuracy": 0.43103448748588563, + "step": 147275 + }, + { + "epoch": 0.1483419802365132, + "grad_norm": 19.096433258255583, + "learning_rate": 4.8820645399601645e-05, + "loss": 2.3175, + "mean_token_accuracy": 0.46945813298225403, + "step": 147280 + }, + { + "epoch": 0.14834701628961738, + "grad_norm": 11.658912463993541, + "learning_rate": 4.8820525673368573e-05, + "loss": 2.5151, + "mean_token_accuracy": 0.4068965494632721, + "step": 147285 + }, + { + "epoch": 0.14835205234272153, + "grad_norm": 10.908673894126805, + "learning_rate": 4.882040594122218e-05, + "loss": 2.0161, + "mean_token_accuracy": 0.4517241358757019, + "step": 147290 + }, + { + "epoch": 0.1483570883958257, + "grad_norm": 12.597855863039289, + "learning_rate": 4.882028620316249e-05, + "loss": 2.5975, + "mean_token_accuracy": 0.4068965554237366, + "step": 147295 + }, + { + "epoch": 0.14836212444892988, + "grad_norm": 10.314037616979181, + "learning_rate": 4.8820166459189535e-05, + "loss": 2.7618, + "mean_token_accuracy": 0.441379314661026, + "step": 147300 + }, + { + "epoch": 0.14836716050203405, + "grad_norm": 10.347742383963164, + "learning_rate": 4.882004670930336e-05, + "loss": 2.6356, + "mean_token_accuracy": 0.4068965494632721, + "step": 147305 + }, + { + "epoch": 0.14837219655513822, + "grad_norm": 10.840890410064661, + "learning_rate": 4.881992695350399e-05, + "loss": 2.3133, + "mean_token_accuracy": 0.42758620381355283, + "step": 147310 + }, + { + "epoch": 0.1483772326082424, + "grad_norm": 11.786699244765305, + "learning_rate": 4.881980719179146e-05, + "loss": 3.2202, + "mean_token_accuracy": 0.36206896901130675, + "step": 147315 + }, + { + "epoch": 0.14838226866134657, + "grad_norm": 8.944905191616261, + "learning_rate": 4.8819687424165796e-05, + "loss": 2.0835, + "mean_token_accuracy": 0.48620688915252686, + "step": 147320 + }, + { + "epoch": 0.14838730471445075, + "grad_norm": 10.845567889878728, + "learning_rate": 4.881956765062703e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.4068965554237366, + "step": 147325 + }, + { + "epoch": 0.14839234076755492, + "grad_norm": 8.877828491981624, + "learning_rate": 4.881944787117522e-05, + "loss": 2.302, + "mean_token_accuracy": 0.47586206197738645, + "step": 147330 + }, + { + "epoch": 0.1483973768206591, + "grad_norm": 9.054392925784292, + "learning_rate": 4.881932808581037e-05, + "loss": 2.3116, + "mean_token_accuracy": 0.4344827473163605, + "step": 147335 + }, + { + "epoch": 0.14840241287376327, + "grad_norm": 10.744226897523658, + "learning_rate": 4.881920829453253e-05, + "loss": 2.3018, + "mean_token_accuracy": 0.43793103098869324, + "step": 147340 + }, + { + "epoch": 0.14840744892686744, + "grad_norm": 10.581016193136323, + "learning_rate": 4.8819088497341726e-05, + "loss": 2.2516, + "mean_token_accuracy": 0.4517241418361664, + "step": 147345 + }, + { + "epoch": 0.14841248497997162, + "grad_norm": 8.957900175853252, + "learning_rate": 4.8818968694237996e-05, + "loss": 2.0733, + "mean_token_accuracy": 0.4551724135875702, + "step": 147350 + }, + { + "epoch": 0.1484175210330758, + "grad_norm": 12.499291710417538, + "learning_rate": 4.8818848885221366e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.4689655125141144, + "step": 147355 + }, + { + "epoch": 0.14842255708617996, + "grad_norm": 10.814685269998908, + "learning_rate": 4.8818729070291876e-05, + "loss": 2.1931, + "mean_token_accuracy": 0.4379310369491577, + "step": 147360 + }, + { + "epoch": 0.14842759313928414, + "grad_norm": 10.544726952120056, + "learning_rate": 4.8818609249449555e-05, + "loss": 2.2425, + "mean_token_accuracy": 0.4327888786792755, + "step": 147365 + }, + { + "epoch": 0.1484326291923883, + "grad_norm": 10.876655510084825, + "learning_rate": 4.881848942269445e-05, + "loss": 2.1023, + "mean_token_accuracy": 0.4620689570903778, + "step": 147370 + }, + { + "epoch": 0.14843766524549248, + "grad_norm": 10.004284273292003, + "learning_rate": 4.8818369590026565e-05, + "loss": 2.8315, + "mean_token_accuracy": 0.39655172228813174, + "step": 147375 + }, + { + "epoch": 0.14844270129859666, + "grad_norm": 13.002677917979186, + "learning_rate": 4.881824975144596e-05, + "loss": 2.3526, + "mean_token_accuracy": 0.4583787083625793, + "step": 147380 + }, + { + "epoch": 0.14844773735170083, + "grad_norm": 12.207589444904194, + "learning_rate": 4.881812990695265e-05, + "loss": 2.6505, + "mean_token_accuracy": 0.37398669123649597, + "step": 147385 + }, + { + "epoch": 0.148452773404805, + "grad_norm": 8.983110729782561, + "learning_rate": 4.8818010056546686e-05, + "loss": 2.0905, + "mean_token_accuracy": 0.5137931048870087, + "step": 147390 + }, + { + "epoch": 0.14845780945790918, + "grad_norm": 10.12616103653861, + "learning_rate": 4.881789020022809e-05, + "loss": 1.9616, + "mean_token_accuracy": 0.4931034505367279, + "step": 147395 + }, + { + "epoch": 0.14846284551101335, + "grad_norm": 10.835814003903316, + "learning_rate": 4.88177703379969e-05, + "loss": 2.1957, + "mean_token_accuracy": 0.4551724076271057, + "step": 147400 + }, + { + "epoch": 0.14846788156411753, + "grad_norm": 9.695695453402482, + "learning_rate": 4.881765046985315e-05, + "loss": 2.2653, + "mean_token_accuracy": 0.42758620977401735, + "step": 147405 + }, + { + "epoch": 0.1484729176172217, + "grad_norm": 10.379687579898073, + "learning_rate": 4.8817530595796856e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.3827586233615875, + "step": 147410 + }, + { + "epoch": 0.14847795367032587, + "grad_norm": 11.628701836231295, + "learning_rate": 4.881741071582808e-05, + "loss": 2.4247, + "mean_token_accuracy": 0.4034482777118683, + "step": 147415 + }, + { + "epoch": 0.14848298972343005, + "grad_norm": 8.51684260282957, + "learning_rate": 4.881729082994683e-05, + "loss": 1.9461, + "mean_token_accuracy": 0.482758617401123, + "step": 147420 + }, + { + "epoch": 0.14848802577653422, + "grad_norm": 12.550463696593551, + "learning_rate": 4.8817170938153156e-05, + "loss": 2.5477, + "mean_token_accuracy": 0.3862068891525269, + "step": 147425 + }, + { + "epoch": 0.14849306182963837, + "grad_norm": 11.278154452532258, + "learning_rate": 4.881705104044709e-05, + "loss": 2.1288, + "mean_token_accuracy": 0.47241378426551817, + "step": 147430 + }, + { + "epoch": 0.14849809788274254, + "grad_norm": 11.713783084567902, + "learning_rate": 4.8816931136828654e-05, + "loss": 2.3778, + "mean_token_accuracy": 0.42758620381355283, + "step": 147435 + }, + { + "epoch": 0.14850313393584672, + "grad_norm": 9.540428320584386, + "learning_rate": 4.881681122729789e-05, + "loss": 2.2388, + "mean_token_accuracy": 0.43448275327682495, + "step": 147440 + }, + { + "epoch": 0.1485081699889509, + "grad_norm": 11.725206211368194, + "learning_rate": 4.881669131185482e-05, + "loss": 2.1806, + "mean_token_accuracy": 0.4551724135875702, + "step": 147445 + }, + { + "epoch": 0.14851320604205506, + "grad_norm": 11.10041759910945, + "learning_rate": 4.881657139049949e-05, + "loss": 2.4277, + "mean_token_accuracy": 0.42413793206214906, + "step": 147450 + }, + { + "epoch": 0.14851824209515924, + "grad_norm": 11.611295353632105, + "learning_rate": 4.881645146323194e-05, + "loss": 2.1882, + "mean_token_accuracy": 0.4896551787853241, + "step": 147455 + }, + { + "epoch": 0.1485232781482634, + "grad_norm": 12.018790977597904, + "learning_rate": 4.881633153005218e-05, + "loss": 2.1082, + "mean_token_accuracy": 0.4776164650917053, + "step": 147460 + }, + { + "epoch": 0.14852831420136758, + "grad_norm": 9.018404724864865, + "learning_rate": 4.881621159096026e-05, + "loss": 2.249, + "mean_token_accuracy": 0.46551724076271056, + "step": 147465 + }, + { + "epoch": 0.14853335025447176, + "grad_norm": 10.485857512990242, + "learning_rate": 4.881609164595621e-05, + "loss": 2.3322, + "mean_token_accuracy": 0.4068965494632721, + "step": 147470 + }, + { + "epoch": 0.14853838630757593, + "grad_norm": 10.614656554974275, + "learning_rate": 4.881597169504006e-05, + "loss": 2.818, + "mean_token_accuracy": 0.4275861978530884, + "step": 147475 + }, + { + "epoch": 0.1485434223606801, + "grad_norm": 9.60084873716458, + "learning_rate": 4.8815851738211854e-05, + "loss": 2.3064, + "mean_token_accuracy": 0.43793103098869324, + "step": 147480 + }, + { + "epoch": 0.14854845841378428, + "grad_norm": 13.41067614510491, + "learning_rate": 4.881573177547161e-05, + "loss": 2.21, + "mean_token_accuracy": 0.5158499777317047, + "step": 147485 + }, + { + "epoch": 0.14855349446688845, + "grad_norm": 10.319058287197134, + "learning_rate": 4.8815611806819364e-05, + "loss": 2.2256, + "mean_token_accuracy": 0.4379310429096222, + "step": 147490 + }, + { + "epoch": 0.14855853051999263, + "grad_norm": 10.532452207198254, + "learning_rate": 4.8815491832255166e-05, + "loss": 2.3208, + "mean_token_accuracy": 0.441379314661026, + "step": 147495 + }, + { + "epoch": 0.1485635665730968, + "grad_norm": 10.398734760351038, + "learning_rate": 4.881537185177903e-05, + "loss": 2.5989, + "mean_token_accuracy": 0.358620685338974, + "step": 147500 + }, + { + "epoch": 0.14856860262620097, + "grad_norm": 10.077617237512756, + "learning_rate": 4.8815251865390997e-05, + "loss": 2.4949, + "mean_token_accuracy": 0.4, + "step": 147505 + }, + { + "epoch": 0.14857363867930515, + "grad_norm": 9.634292263149101, + "learning_rate": 4.8815131873091094e-05, + "loss": 2.2713, + "mean_token_accuracy": 0.4034482717514038, + "step": 147510 + }, + { + "epoch": 0.14857867473240932, + "grad_norm": 12.523660076574277, + "learning_rate": 4.881501187487936e-05, + "loss": 2.4412, + "mean_token_accuracy": 0.39655171930789945, + "step": 147515 + }, + { + "epoch": 0.1485837107855135, + "grad_norm": 12.938800069132133, + "learning_rate": 4.8814891870755834e-05, + "loss": 2.5707, + "mean_token_accuracy": 0.4298850536346436, + "step": 147520 + }, + { + "epoch": 0.14858874683861767, + "grad_norm": 9.960086648719676, + "learning_rate": 4.8814771860720545e-05, + "loss": 2.4222, + "mean_token_accuracy": 0.40689654350280763, + "step": 147525 + }, + { + "epoch": 0.14859378289172184, + "grad_norm": 9.432934319798065, + "learning_rate": 4.8814651844773515e-05, + "loss": 2.2201, + "mean_token_accuracy": 0.47126437425613404, + "step": 147530 + }, + { + "epoch": 0.14859881894482602, + "grad_norm": 11.86340934981577, + "learning_rate": 4.88145318229148e-05, + "loss": 2.5709, + "mean_token_accuracy": 0.429764062166214, + "step": 147535 + }, + { + "epoch": 0.1486038549979302, + "grad_norm": 10.92149632769888, + "learning_rate": 4.881441179514441e-05, + "loss": 2.743, + "mean_token_accuracy": 0.3793103456497192, + "step": 147540 + }, + { + "epoch": 0.14860889105103436, + "grad_norm": 9.907376755441344, + "learning_rate": 4.8814291761462386e-05, + "loss": 2.2283, + "mean_token_accuracy": 0.4931034445762634, + "step": 147545 + }, + { + "epoch": 0.14861392710413854, + "grad_norm": 11.77960396762107, + "learning_rate": 4.881417172186877e-05, + "loss": 2.4171, + "mean_token_accuracy": 0.3896551728248596, + "step": 147550 + }, + { + "epoch": 0.1486189631572427, + "grad_norm": 10.121882667042206, + "learning_rate": 4.881405167636359e-05, + "loss": 2.0961, + "mean_token_accuracy": 0.4620689630508423, + "step": 147555 + }, + { + "epoch": 0.14862399921034689, + "grad_norm": 8.929683759149132, + "learning_rate": 4.881393162494688e-05, + "loss": 2.0055, + "mean_token_accuracy": 0.5034482657909394, + "step": 147560 + }, + { + "epoch": 0.14862903526345106, + "grad_norm": 12.849172693828802, + "learning_rate": 4.8813811567618665e-05, + "loss": 2.4908, + "mean_token_accuracy": 0.40344828367233276, + "step": 147565 + }, + { + "epoch": 0.1486340713165552, + "grad_norm": 10.906387330924511, + "learning_rate": 4.881369150437899e-05, + "loss": 2.3693, + "mean_token_accuracy": 0.43950392603874205, + "step": 147570 + }, + { + "epoch": 0.14863910736965938, + "grad_norm": 8.816824411742006, + "learning_rate": 4.881357143522788e-05, + "loss": 2.0319, + "mean_token_accuracy": 0.4793103516101837, + "step": 147575 + }, + { + "epoch": 0.14864414342276355, + "grad_norm": 15.477416527170242, + "learning_rate": 4.881345136016538e-05, + "loss": 2.8386, + "mean_token_accuracy": 0.4068965494632721, + "step": 147580 + }, + { + "epoch": 0.14864917947586773, + "grad_norm": 11.423052016586015, + "learning_rate": 4.88133312791915e-05, + "loss": 2.4312, + "mean_token_accuracy": 0.4034482717514038, + "step": 147585 + }, + { + "epoch": 0.1486542155289719, + "grad_norm": 10.382182083059218, + "learning_rate": 4.8813211192306293e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.4310344815254211, + "step": 147590 + }, + { + "epoch": 0.14865925158207607, + "grad_norm": 10.319707353086667, + "learning_rate": 4.881309109950978e-05, + "loss": 2.7897, + "mean_token_accuracy": 0.44137929677963256, + "step": 147595 + }, + { + "epoch": 0.14866428763518025, + "grad_norm": 9.990199109384305, + "learning_rate": 4.8812971000802014e-05, + "loss": 2.4214, + "mean_token_accuracy": 0.4310344815254211, + "step": 147600 + }, + { + "epoch": 0.14866932368828442, + "grad_norm": 14.841897328334882, + "learning_rate": 4.8812850896183015e-05, + "loss": 2.8444, + "mean_token_accuracy": 0.3655172407627106, + "step": 147605 + }, + { + "epoch": 0.1486743597413886, + "grad_norm": 9.43993592075201, + "learning_rate": 4.8812730785652816e-05, + "loss": 2.4449, + "mean_token_accuracy": 0.41724138259887694, + "step": 147610 + }, + { + "epoch": 0.14867939579449277, + "grad_norm": 9.578414853263228, + "learning_rate": 4.881261066921145e-05, + "loss": 2.5002, + "mean_token_accuracy": 0.41724138259887694, + "step": 147615 + }, + { + "epoch": 0.14868443184759694, + "grad_norm": 8.461088875133578, + "learning_rate": 4.881249054685896e-05, + "loss": 2.0822, + "mean_token_accuracy": 0.4862068951129913, + "step": 147620 + }, + { + "epoch": 0.14868946790070112, + "grad_norm": 12.254562617019001, + "learning_rate": 4.881237041859536e-05, + "loss": 2.7349, + "mean_token_accuracy": 0.3896551698446274, + "step": 147625 + }, + { + "epoch": 0.1486945039538053, + "grad_norm": 10.02094343171751, + "learning_rate": 4.88122502844207e-05, + "loss": 2.5769, + "mean_token_accuracy": 0.3793103456497192, + "step": 147630 + }, + { + "epoch": 0.14869954000690946, + "grad_norm": 10.701606304690337, + "learning_rate": 4.8812130144335e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.3793103456497192, + "step": 147635 + }, + { + "epoch": 0.14870457606001364, + "grad_norm": 9.376950709664813, + "learning_rate": 4.881200999833831e-05, + "loss": 2.4436, + "mean_token_accuracy": 0.3896551728248596, + "step": 147640 + }, + { + "epoch": 0.1487096121131178, + "grad_norm": 10.074649854714757, + "learning_rate": 4.8811889846430655e-05, + "loss": 2.4225, + "mean_token_accuracy": 0.46551724672317507, + "step": 147645 + }, + { + "epoch": 0.14871464816622199, + "grad_norm": 13.419942909230395, + "learning_rate": 4.8811769688612064e-05, + "loss": 2.4955, + "mean_token_accuracy": 0.41379310488700866, + "step": 147650 + }, + { + "epoch": 0.14871968421932616, + "grad_norm": 10.717162758198729, + "learning_rate": 4.881164952488257e-05, + "loss": 2.6211, + "mean_token_accuracy": 0.3482758641242981, + "step": 147655 + }, + { + "epoch": 0.14872472027243033, + "grad_norm": 8.995823963517623, + "learning_rate": 4.881152935524222e-05, + "loss": 2.2135, + "mean_token_accuracy": 0.46551724076271056, + "step": 147660 + }, + { + "epoch": 0.1487297563255345, + "grad_norm": 10.786590980877957, + "learning_rate": 4.881140917969103e-05, + "loss": 2.5346, + "mean_token_accuracy": 0.4241379380226135, + "step": 147665 + }, + { + "epoch": 0.14873479237863868, + "grad_norm": 9.489613952739157, + "learning_rate": 4.8811288998229045e-05, + "loss": 2.1175, + "mean_token_accuracy": 0.47241379618644713, + "step": 147670 + }, + { + "epoch": 0.14873982843174285, + "grad_norm": 12.306436383054477, + "learning_rate": 4.8811168810856296e-05, + "loss": 2.3714, + "mean_token_accuracy": 0.4379310369491577, + "step": 147675 + }, + { + "epoch": 0.14874486448484703, + "grad_norm": 8.856816820285616, + "learning_rate": 4.8811048617572815e-05, + "loss": 2.3793, + "mean_token_accuracy": 0.41379310488700866, + "step": 147680 + }, + { + "epoch": 0.1487499005379512, + "grad_norm": 15.026180005863395, + "learning_rate": 4.881092841837863e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.43103448748588563, + "step": 147685 + }, + { + "epoch": 0.14875493659105538, + "grad_norm": 8.865992710769632, + "learning_rate": 4.881080821327379e-05, + "loss": 1.9951, + "mean_token_accuracy": 0.519116747379303, + "step": 147690 + }, + { + "epoch": 0.14875997264415955, + "grad_norm": 7.730155789677263, + "learning_rate": 4.88106880022583e-05, + "loss": 2.2018, + "mean_token_accuracy": 0.5137930870056152, + "step": 147695 + }, + { + "epoch": 0.14876500869726372, + "grad_norm": 8.94501605541947, + "learning_rate": 4.881056778533222e-05, + "loss": 2.249, + "mean_token_accuracy": 0.47931034564971925, + "step": 147700 + }, + { + "epoch": 0.1487700447503679, + "grad_norm": 11.35672634689044, + "learning_rate": 4.8810447562495584e-05, + "loss": 2.6019, + "mean_token_accuracy": 0.4034482717514038, + "step": 147705 + }, + { + "epoch": 0.14877508080347204, + "grad_norm": 10.510078978084575, + "learning_rate": 4.8810327333748406e-05, + "loss": 2.4527, + "mean_token_accuracy": 0.37931033968925476, + "step": 147710 + }, + { + "epoch": 0.14878011685657622, + "grad_norm": 9.062219936452191, + "learning_rate": 4.881020709909073e-05, + "loss": 2.0534, + "mean_token_accuracy": 0.482758617401123, + "step": 147715 + }, + { + "epoch": 0.1487851529096804, + "grad_norm": 8.528278659335324, + "learning_rate": 4.8810086858522594e-05, + "loss": 2.1196, + "mean_token_accuracy": 0.48275862336158754, + "step": 147720 + }, + { + "epoch": 0.14879018896278456, + "grad_norm": 10.355541838456608, + "learning_rate": 4.880996661204402e-05, + "loss": 2.6029, + "mean_token_accuracy": 0.4310344815254211, + "step": 147725 + }, + { + "epoch": 0.14879522501588874, + "grad_norm": 10.253693849275912, + "learning_rate": 4.880984635965504e-05, + "loss": 2.4976, + "mean_token_accuracy": 0.4186327874660492, + "step": 147730 + }, + { + "epoch": 0.1488002610689929, + "grad_norm": 11.489037045231054, + "learning_rate": 4.8809726101355704e-05, + "loss": 2.2262, + "mean_token_accuracy": 0.4551724076271057, + "step": 147735 + }, + { + "epoch": 0.14880529712209709, + "grad_norm": 9.50023536174181, + "learning_rate": 4.880960583714604e-05, + "loss": 2.1036, + "mean_token_accuracy": 0.517241370677948, + "step": 147740 + }, + { + "epoch": 0.14881033317520126, + "grad_norm": 10.366810459896795, + "learning_rate": 4.880948556702607e-05, + "loss": 2.5006, + "mean_token_accuracy": 0.4068965494632721, + "step": 147745 + }, + { + "epoch": 0.14881536922830543, + "grad_norm": 9.746488774058648, + "learning_rate": 4.880936529099584e-05, + "loss": 2.5077, + "mean_token_accuracy": 0.3724137842655182, + "step": 147750 + }, + { + "epoch": 0.1488204052814096, + "grad_norm": 12.204200318487382, + "learning_rate": 4.8809245009055375e-05, + "loss": 2.7431, + "mean_token_accuracy": 0.3551724195480347, + "step": 147755 + }, + { + "epoch": 0.14882544133451378, + "grad_norm": 11.305479314792676, + "learning_rate": 4.880912472120472e-05, + "loss": 2.4984, + "mean_token_accuracy": 0.42758620381355283, + "step": 147760 + }, + { + "epoch": 0.14883047738761795, + "grad_norm": 10.101441916861413, + "learning_rate": 4.8809004427443885e-05, + "loss": 2.4116, + "mean_token_accuracy": 0.4758620738983154, + "step": 147765 + }, + { + "epoch": 0.14883551344072213, + "grad_norm": 10.319224736819612, + "learning_rate": 4.8808884127772924e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4068965554237366, + "step": 147770 + }, + { + "epoch": 0.1488405494938263, + "grad_norm": 9.112383205884079, + "learning_rate": 4.880876382219186e-05, + "loss": 2.1193, + "mean_token_accuracy": 0.4344827651977539, + "step": 147775 + }, + { + "epoch": 0.14884558554693048, + "grad_norm": 12.345336425615708, + "learning_rate": 4.880864351070074e-05, + "loss": 2.2056, + "mean_token_accuracy": 0.39310343861579894, + "step": 147780 + }, + { + "epoch": 0.14885062160003465, + "grad_norm": 12.790394877488497, + "learning_rate": 4.880852319329959e-05, + "loss": 2.6085, + "mean_token_accuracy": 0.4137930989265442, + "step": 147785 + }, + { + "epoch": 0.14885565765313882, + "grad_norm": 16.995596870202775, + "learning_rate": 4.880840286998843e-05, + "loss": 2.6637, + "mean_token_accuracy": 0.37931033968925476, + "step": 147790 + }, + { + "epoch": 0.148860693706243, + "grad_norm": 11.57785781847776, + "learning_rate": 4.880828254076732e-05, + "loss": 2.576, + "mean_token_accuracy": 0.4034482777118683, + "step": 147795 + }, + { + "epoch": 0.14886572975934717, + "grad_norm": 8.066931882905035, + "learning_rate": 4.880816220563626e-05, + "loss": 2.2702, + "mean_token_accuracy": 0.5030464053153991, + "step": 147800 + }, + { + "epoch": 0.14887076581245134, + "grad_norm": 10.112780067315288, + "learning_rate": 4.880804186459532e-05, + "loss": 2.1863, + "mean_token_accuracy": 0.4689655125141144, + "step": 147805 + }, + { + "epoch": 0.14887580186555552, + "grad_norm": 11.163048650329113, + "learning_rate": 4.88079215176445e-05, + "loss": 2.2771, + "mean_token_accuracy": 0.46358135938644407, + "step": 147810 + }, + { + "epoch": 0.1488808379186597, + "grad_norm": 8.09102862164424, + "learning_rate": 4.8807801164783856e-05, + "loss": 1.9657, + "mean_token_accuracy": 0.4862068951129913, + "step": 147815 + }, + { + "epoch": 0.14888587397176387, + "grad_norm": 11.350811064503564, + "learning_rate": 4.8807680806013414e-05, + "loss": 2.1794, + "mean_token_accuracy": 0.4655172348022461, + "step": 147820 + }, + { + "epoch": 0.14889091002486804, + "grad_norm": 10.94535222072632, + "learning_rate": 4.88075604413332e-05, + "loss": 2.4754, + "mean_token_accuracy": 0.4103448212146759, + "step": 147825 + }, + { + "epoch": 0.1488959460779722, + "grad_norm": 8.66256721180295, + "learning_rate": 4.880744007074326e-05, + "loss": 2.2151, + "mean_token_accuracy": 0.44827585816383364, + "step": 147830 + }, + { + "epoch": 0.1489009821310764, + "grad_norm": 9.559076743415517, + "learning_rate": 4.880731969424363e-05, + "loss": 2.4939, + "mean_token_accuracy": 0.4448275864124298, + "step": 147835 + }, + { + "epoch": 0.14890601818418056, + "grad_norm": 8.925513148237549, + "learning_rate": 4.8807199311834326e-05, + "loss": 2.0131, + "mean_token_accuracy": 0.47586206793785096, + "step": 147840 + }, + { + "epoch": 0.14891105423728473, + "grad_norm": 11.116767062339065, + "learning_rate": 4.880707892351539e-05, + "loss": 2.3807, + "mean_token_accuracy": 0.44827585816383364, + "step": 147845 + }, + { + "epoch": 0.14891609029038888, + "grad_norm": 9.692581156856463, + "learning_rate": 4.8806958529286854e-05, + "loss": 2.4221, + "mean_token_accuracy": 0.4068965554237366, + "step": 147850 + }, + { + "epoch": 0.14892112634349305, + "grad_norm": 10.206136318908854, + "learning_rate": 4.8806838129148766e-05, + "loss": 2.3195, + "mean_token_accuracy": 0.42413793206214906, + "step": 147855 + }, + { + "epoch": 0.14892616239659723, + "grad_norm": 13.058360767419309, + "learning_rate": 4.880671772310114e-05, + "loss": 2.5849, + "mean_token_accuracy": 0.4137930989265442, + "step": 147860 + }, + { + "epoch": 0.1489311984497014, + "grad_norm": 10.876942210023405, + "learning_rate": 4.880659731114401e-05, + "loss": 2.4157, + "mean_token_accuracy": 0.4448275864124298, + "step": 147865 + }, + { + "epoch": 0.14893623450280558, + "grad_norm": 13.05876552219092, + "learning_rate": 4.880647689327742e-05, + "loss": 2.9669, + "mean_token_accuracy": 0.35862069129943847, + "step": 147870 + }, + { + "epoch": 0.14894127055590975, + "grad_norm": 13.758972929381663, + "learning_rate": 4.8806356469501405e-05, + "loss": 2.2748, + "mean_token_accuracy": 0.4326073855161667, + "step": 147875 + }, + { + "epoch": 0.14894630660901392, + "grad_norm": 9.031203565914096, + "learning_rate": 4.880623603981599e-05, + "loss": 2.4215, + "mean_token_accuracy": 0.4482758641242981, + "step": 147880 + }, + { + "epoch": 0.1489513426621181, + "grad_norm": 8.93043135600148, + "learning_rate": 4.88061156042212e-05, + "loss": 2.3501, + "mean_token_accuracy": 0.4034482717514038, + "step": 147885 + }, + { + "epoch": 0.14895637871522227, + "grad_norm": 12.75446161689368, + "learning_rate": 4.8805995162717096e-05, + "loss": 2.7954, + "mean_token_accuracy": 0.3896551728248596, + "step": 147890 + }, + { + "epoch": 0.14896141476832644, + "grad_norm": 12.473579975919137, + "learning_rate": 4.8805874715303686e-05, + "loss": 2.7843, + "mean_token_accuracy": 0.4, + "step": 147895 + }, + { + "epoch": 0.14896645082143062, + "grad_norm": 10.831228544433472, + "learning_rate": 4.880575426198101e-05, + "loss": 2.217, + "mean_token_accuracy": 0.4206896543502808, + "step": 147900 + }, + { + "epoch": 0.1489714868745348, + "grad_norm": 9.928650113786528, + "learning_rate": 4.880563380274911e-05, + "loss": 2.6684, + "mean_token_accuracy": 0.3931034505367279, + "step": 147905 + }, + { + "epoch": 0.14897652292763897, + "grad_norm": 10.192390372796485, + "learning_rate": 4.880551333760801e-05, + "loss": 2.4841, + "mean_token_accuracy": 0.43448275327682495, + "step": 147910 + }, + { + "epoch": 0.14898155898074314, + "grad_norm": 10.904328943273141, + "learning_rate": 4.8805392866557745e-05, + "loss": 2.3218, + "mean_token_accuracy": 0.45862069725990295, + "step": 147915 + }, + { + "epoch": 0.1489865950338473, + "grad_norm": 9.474393123707559, + "learning_rate": 4.8805272389598356e-05, + "loss": 2.4437, + "mean_token_accuracy": 0.4448275864124298, + "step": 147920 + }, + { + "epoch": 0.1489916310869515, + "grad_norm": 10.103336800914134, + "learning_rate": 4.880515190672986e-05, + "loss": 2.4376, + "mean_token_accuracy": 0.4034482777118683, + "step": 147925 + }, + { + "epoch": 0.14899666714005566, + "grad_norm": 11.397077837596717, + "learning_rate": 4.880503141795231e-05, + "loss": 2.4252, + "mean_token_accuracy": 0.4310344815254211, + "step": 147930 + }, + { + "epoch": 0.14900170319315983, + "grad_norm": 11.372523411108572, + "learning_rate": 4.8804910923265714e-05, + "loss": 2.2694, + "mean_token_accuracy": 0.42758620381355283, + "step": 147935 + }, + { + "epoch": 0.149006739246264, + "grad_norm": 10.13739814350393, + "learning_rate": 4.880479042267014e-05, + "loss": 2.4235, + "mean_token_accuracy": 0.4310344815254211, + "step": 147940 + }, + { + "epoch": 0.14901177529936818, + "grad_norm": 9.563234727808558, + "learning_rate": 4.8804669916165593e-05, + "loss": 2.0983, + "mean_token_accuracy": 0.4849364757537842, + "step": 147945 + }, + { + "epoch": 0.14901681135247236, + "grad_norm": 14.129655277679928, + "learning_rate": 4.880454940375212e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.4310344815254211, + "step": 147950 + }, + { + "epoch": 0.14902184740557653, + "grad_norm": 9.535677167662188, + "learning_rate": 4.880442888542976e-05, + "loss": 2.3972, + "mean_token_accuracy": 0.42758620381355283, + "step": 147955 + }, + { + "epoch": 0.1490268834586807, + "grad_norm": 8.742882719173116, + "learning_rate": 4.8804308361198516e-05, + "loss": 1.9176, + "mean_token_accuracy": 0.524137943983078, + "step": 147960 + }, + { + "epoch": 0.14903191951178488, + "grad_norm": 12.473851344487406, + "learning_rate": 4.8804187831058455e-05, + "loss": 2.7512, + "mean_token_accuracy": 0.3551724135875702, + "step": 147965 + }, + { + "epoch": 0.14903695556488905, + "grad_norm": 10.01582926439821, + "learning_rate": 4.88040672950096e-05, + "loss": 2.2561, + "mean_token_accuracy": 0.5196007251739502, + "step": 147970 + }, + { + "epoch": 0.14904199161799322, + "grad_norm": 9.899810880791657, + "learning_rate": 4.880394675305198e-05, + "loss": 2.2497, + "mean_token_accuracy": 0.39999999701976774, + "step": 147975 + }, + { + "epoch": 0.1490470276710974, + "grad_norm": 9.77434742918475, + "learning_rate": 4.8803826205185633e-05, + "loss": 2.0911, + "mean_token_accuracy": 0.4586206912994385, + "step": 147980 + }, + { + "epoch": 0.14905206372420157, + "grad_norm": 15.18536553971495, + "learning_rate": 4.880370565141059e-05, + "loss": 2.2278, + "mean_token_accuracy": 0.48620688915252686, + "step": 147985 + }, + { + "epoch": 0.14905709977730572, + "grad_norm": 10.029070090872178, + "learning_rate": 4.8803585091726886e-05, + "loss": 2.0095, + "mean_token_accuracy": 0.5034482717514038, + "step": 147990 + }, + { + "epoch": 0.1490621358304099, + "grad_norm": 11.370103732242823, + "learning_rate": 4.880346452613454e-05, + "loss": 2.3296, + "mean_token_accuracy": 0.44482759237289426, + "step": 147995 + }, + { + "epoch": 0.14906717188351407, + "grad_norm": 11.39297337300726, + "learning_rate": 4.880334395463361e-05, + "loss": 2.4859, + "mean_token_accuracy": 0.45517241954803467, + "step": 148000 + }, + { + "epoch": 0.14907220793661824, + "grad_norm": 10.198390889706975, + "learning_rate": 4.880322337722412e-05, + "loss": 2.1748, + "mean_token_accuracy": 0.4482758641242981, + "step": 148005 + }, + { + "epoch": 0.1490772439897224, + "grad_norm": 13.72151304920062, + "learning_rate": 4.88031027939061e-05, + "loss": 2.5864, + "mean_token_accuracy": 0.4206896543502808, + "step": 148010 + }, + { + "epoch": 0.1490822800428266, + "grad_norm": 12.757458801764914, + "learning_rate": 4.880298220467957e-05, + "loss": 2.3495, + "mean_token_accuracy": 0.4758620738983154, + "step": 148015 + }, + { + "epoch": 0.14908731609593076, + "grad_norm": 15.292942184564707, + "learning_rate": 4.88028616095446e-05, + "loss": 2.3623, + "mean_token_accuracy": 0.4433151841163635, + "step": 148020 + }, + { + "epoch": 0.14909235214903493, + "grad_norm": 10.3950684465651, + "learning_rate": 4.8802741008501194e-05, + "loss": 2.3249, + "mean_token_accuracy": 0.41034482717514037, + "step": 148025 + }, + { + "epoch": 0.1490973882021391, + "grad_norm": 12.299271022344156, + "learning_rate": 4.880262040154939e-05, + "loss": 2.7619, + "mean_token_accuracy": 0.3655172407627106, + "step": 148030 + }, + { + "epoch": 0.14910242425524328, + "grad_norm": 9.655104205759264, + "learning_rate": 4.880249978868923e-05, + "loss": 1.987, + "mean_token_accuracy": 0.4862068951129913, + "step": 148035 + }, + { + "epoch": 0.14910746030834746, + "grad_norm": 11.344340266457227, + "learning_rate": 4.880237916992074e-05, + "loss": 2.5613, + "mean_token_accuracy": 0.3758620649576187, + "step": 148040 + }, + { + "epoch": 0.14911249636145163, + "grad_norm": 10.600874858143706, + "learning_rate": 4.8802258545243955e-05, + "loss": 2.423, + "mean_token_accuracy": 0.37241379022598264, + "step": 148045 + }, + { + "epoch": 0.1491175324145558, + "grad_norm": 12.347228527759603, + "learning_rate": 4.880213791465891e-05, + "loss": 2.1328, + "mean_token_accuracy": 0.44482759237289426, + "step": 148050 + }, + { + "epoch": 0.14912256846765998, + "grad_norm": 11.719124955892893, + "learning_rate": 4.880201727816564e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.39310344457626345, + "step": 148055 + }, + { + "epoch": 0.14912760452076415, + "grad_norm": 10.685215262592372, + "learning_rate": 4.880189663576417e-05, + "loss": 2.078, + "mean_token_accuracy": 0.49655172824859617, + "step": 148060 + }, + { + "epoch": 0.14913264057386832, + "grad_norm": 11.196217155355464, + "learning_rate": 4.8801775987454546e-05, + "loss": 2.2808, + "mean_token_accuracy": 0.41724138259887694, + "step": 148065 + }, + { + "epoch": 0.1491376766269725, + "grad_norm": 12.032237079586652, + "learning_rate": 4.880165533323679e-05, + "loss": 2.4513, + "mean_token_accuracy": 0.4068965494632721, + "step": 148070 + }, + { + "epoch": 0.14914271268007667, + "grad_norm": 7.982271596568263, + "learning_rate": 4.880153467311094e-05, + "loss": 2.18, + "mean_token_accuracy": 0.4517241358757019, + "step": 148075 + }, + { + "epoch": 0.14914774873318085, + "grad_norm": 10.324089178687746, + "learning_rate": 4.8801414007077036e-05, + "loss": 2.3984, + "mean_token_accuracy": 0.4586206912994385, + "step": 148080 + }, + { + "epoch": 0.14915278478628502, + "grad_norm": 9.682314737900265, + "learning_rate": 4.8801293335135095e-05, + "loss": 2.2261, + "mean_token_accuracy": 0.48275861144065857, + "step": 148085 + }, + { + "epoch": 0.1491578208393892, + "grad_norm": 10.53415016224788, + "learning_rate": 4.880117265728517e-05, + "loss": 2.3294, + "mean_token_accuracy": 0.43629764318466185, + "step": 148090 + }, + { + "epoch": 0.14916285689249337, + "grad_norm": 10.695377573487987, + "learning_rate": 4.8801051973527284e-05, + "loss": 2.1814, + "mean_token_accuracy": 0.4586206912994385, + "step": 148095 + }, + { + "epoch": 0.14916789294559754, + "grad_norm": 11.201467064617074, + "learning_rate": 4.880093128386147e-05, + "loss": 2.1282, + "mean_token_accuracy": 0.4551724076271057, + "step": 148100 + }, + { + "epoch": 0.14917292899870171, + "grad_norm": 9.098638102134485, + "learning_rate": 4.880081058828776e-05, + "loss": 2.3743, + "mean_token_accuracy": 0.458620685338974, + "step": 148105 + }, + { + "epoch": 0.1491779650518059, + "grad_norm": 12.150650350062113, + "learning_rate": 4.88006898868062e-05, + "loss": 2.4131, + "mean_token_accuracy": 0.43103447556495667, + "step": 148110 + }, + { + "epoch": 0.14918300110491006, + "grad_norm": 9.611284191717868, + "learning_rate": 4.880056917941681e-05, + "loss": 2.4236, + "mean_token_accuracy": 0.40344826579093934, + "step": 148115 + }, + { + "epoch": 0.14918803715801424, + "grad_norm": 13.619647064722711, + "learning_rate": 4.880044846611962e-05, + "loss": 2.6716, + "mean_token_accuracy": 0.4034482717514038, + "step": 148120 + }, + { + "epoch": 0.1491930732111184, + "grad_norm": 11.68044294464124, + "learning_rate": 4.880032774691467e-05, + "loss": 2.5673, + "mean_token_accuracy": 0.3448275804519653, + "step": 148125 + }, + { + "epoch": 0.14919810926422256, + "grad_norm": 10.596723446342825, + "learning_rate": 4.880020702180201e-05, + "loss": 2.4072, + "mean_token_accuracy": 0.39655172228813174, + "step": 148130 + }, + { + "epoch": 0.14920314531732673, + "grad_norm": 14.501522636336029, + "learning_rate": 4.880008629078164e-05, + "loss": 2.6094, + "mean_token_accuracy": 0.41034482717514037, + "step": 148135 + }, + { + "epoch": 0.1492081813704309, + "grad_norm": 10.564000987435252, + "learning_rate": 4.8799965553853626e-05, + "loss": 2.2041, + "mean_token_accuracy": 0.4689655125141144, + "step": 148140 + }, + { + "epoch": 0.14921321742353508, + "grad_norm": 12.94808724437348, + "learning_rate": 4.879984481101799e-05, + "loss": 2.52, + "mean_token_accuracy": 0.4172413766384125, + "step": 148145 + }, + { + "epoch": 0.14921825347663925, + "grad_norm": 11.230065534700563, + "learning_rate": 4.879972406227475e-05, + "loss": 2.293, + "mean_token_accuracy": 0.44990925788879393, + "step": 148150 + }, + { + "epoch": 0.14922328952974342, + "grad_norm": 9.76289953961644, + "learning_rate": 4.8799603307623956e-05, + "loss": 2.3523, + "mean_token_accuracy": 0.42413793206214906, + "step": 148155 + }, + { + "epoch": 0.1492283255828476, + "grad_norm": 10.300957512961741, + "learning_rate": 4.8799482547065645e-05, + "loss": 2.3979, + "mean_token_accuracy": 0.4586206912994385, + "step": 148160 + }, + { + "epoch": 0.14923336163595177, + "grad_norm": 11.172738754575848, + "learning_rate": 4.879936178059983e-05, + "loss": 2.3189, + "mean_token_accuracy": 0.417241370677948, + "step": 148165 + }, + { + "epoch": 0.14923839768905595, + "grad_norm": 12.392856946602523, + "learning_rate": 4.8799241008226564e-05, + "loss": 2.3853, + "mean_token_accuracy": 0.4206896543502808, + "step": 148170 + }, + { + "epoch": 0.14924343374216012, + "grad_norm": 13.787329741753858, + "learning_rate": 4.879912022994588e-05, + "loss": 2.4156, + "mean_token_accuracy": 0.4310344815254211, + "step": 148175 + }, + { + "epoch": 0.1492484697952643, + "grad_norm": 10.937126774578767, + "learning_rate": 4.87989994457578e-05, + "loss": 2.9653, + "mean_token_accuracy": 0.34482758939266206, + "step": 148180 + }, + { + "epoch": 0.14925350584836847, + "grad_norm": 8.180717057531883, + "learning_rate": 4.8798878655662364e-05, + "loss": 2.1, + "mean_token_accuracy": 0.4689655125141144, + "step": 148185 + }, + { + "epoch": 0.14925854190147264, + "grad_norm": 10.266403607283968, + "learning_rate": 4.87987578596596e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.3793103456497192, + "step": 148190 + }, + { + "epoch": 0.14926357795457681, + "grad_norm": 14.077174798230866, + "learning_rate": 4.8798637057749546e-05, + "loss": 2.1807, + "mean_token_accuracy": 0.4413793087005615, + "step": 148195 + }, + { + "epoch": 0.149268614007681, + "grad_norm": 10.008264155026232, + "learning_rate": 4.8798516249932244e-05, + "loss": 2.1304, + "mean_token_accuracy": 0.458620685338974, + "step": 148200 + }, + { + "epoch": 0.14927365006078516, + "grad_norm": 9.375009881029253, + "learning_rate": 4.8798395436207714e-05, + "loss": 2.1686, + "mean_token_accuracy": 0.46317734122276305, + "step": 148205 + }, + { + "epoch": 0.14927868611388934, + "grad_norm": 7.737577122020157, + "learning_rate": 4.879827461657599e-05, + "loss": 2.4191, + "mean_token_accuracy": 0.46733213067054746, + "step": 148210 + }, + { + "epoch": 0.1492837221669935, + "grad_norm": 10.915379533960568, + "learning_rate": 4.879815379103712e-05, + "loss": 2.5869, + "mean_token_accuracy": 0.41724138855934145, + "step": 148215 + }, + { + "epoch": 0.14928875822009768, + "grad_norm": 9.97862536975699, + "learning_rate": 4.8798032959591114e-05, + "loss": 2.4009, + "mean_token_accuracy": 0.43272837400436404, + "step": 148220 + }, + { + "epoch": 0.14929379427320186, + "grad_norm": 10.10233242277236, + "learning_rate": 4.879791212223803e-05, + "loss": 2.3473, + "mean_token_accuracy": 0.4068965554237366, + "step": 148225 + }, + { + "epoch": 0.14929883032630603, + "grad_norm": 8.989365844841098, + "learning_rate": 4.8797791278977895e-05, + "loss": 2.3237, + "mean_token_accuracy": 0.4551724076271057, + "step": 148230 + }, + { + "epoch": 0.1493038663794102, + "grad_norm": 10.403670656294587, + "learning_rate": 4.879767042981073e-05, + "loss": 2.5127, + "mean_token_accuracy": 0.4310344815254211, + "step": 148235 + }, + { + "epoch": 0.14930890243251438, + "grad_norm": 11.875467960287512, + "learning_rate": 4.879754957473657e-05, + "loss": 2.389, + "mean_token_accuracy": 0.4448275864124298, + "step": 148240 + }, + { + "epoch": 0.14931393848561855, + "grad_norm": 10.850534147072421, + "learning_rate": 4.879742871375547e-05, + "loss": 2.0959, + "mean_token_accuracy": 0.49655171632766726, + "step": 148245 + }, + { + "epoch": 0.14931897453872273, + "grad_norm": 15.019775288432555, + "learning_rate": 4.879730784686744e-05, + "loss": 2.5207, + "mean_token_accuracy": 0.4241379380226135, + "step": 148250 + }, + { + "epoch": 0.1493240105918269, + "grad_norm": 9.466701582848748, + "learning_rate": 4.8797186974072523e-05, + "loss": 2.3528, + "mean_token_accuracy": 0.46896552443504336, + "step": 148255 + }, + { + "epoch": 0.14932904664493107, + "grad_norm": 9.708752915221913, + "learning_rate": 4.879706609537076e-05, + "loss": 2.4561, + "mean_token_accuracy": 0.4068965494632721, + "step": 148260 + }, + { + "epoch": 0.14933408269803525, + "grad_norm": 11.789880332533471, + "learning_rate": 4.8796945210762163e-05, + "loss": 2.8097, + "mean_token_accuracy": 0.37586207389831544, + "step": 148265 + }, + { + "epoch": 0.1493391187511394, + "grad_norm": 12.661856126260206, + "learning_rate": 4.879682432024679e-05, + "loss": 2.9115, + "mean_token_accuracy": 0.379310342669487, + "step": 148270 + }, + { + "epoch": 0.14934415480424357, + "grad_norm": 9.00311771537564, + "learning_rate": 4.879670342382466e-05, + "loss": 1.8237, + "mean_token_accuracy": 0.510344821214676, + "step": 148275 + }, + { + "epoch": 0.14934919085734774, + "grad_norm": 9.195596600836756, + "learning_rate": 4.87965825214958e-05, + "loss": 2.3685, + "mean_token_accuracy": 0.3931034505367279, + "step": 148280 + }, + { + "epoch": 0.14935422691045191, + "grad_norm": 10.939414970578273, + "learning_rate": 4.879646161326026e-05, + "loss": 2.6741, + "mean_token_accuracy": 0.4294010937213898, + "step": 148285 + }, + { + "epoch": 0.1493592629635561, + "grad_norm": 11.24508777724285, + "learning_rate": 4.879634069911808e-05, + "loss": 2.5377, + "mean_token_accuracy": 0.4172413766384125, + "step": 148290 + }, + { + "epoch": 0.14936429901666026, + "grad_norm": 11.252053841601317, + "learning_rate": 4.8796219779069264e-05, + "loss": 2.4785, + "mean_token_accuracy": 0.43103448748588563, + "step": 148295 + }, + { + "epoch": 0.14936933506976444, + "grad_norm": 11.025939425139963, + "learning_rate": 4.8796098853113864e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.46551724076271056, + "step": 148300 + }, + { + "epoch": 0.1493743711228686, + "grad_norm": 11.297055593307292, + "learning_rate": 4.8795977921251914e-05, + "loss": 2.1477, + "mean_token_accuracy": 0.4482758641242981, + "step": 148305 + }, + { + "epoch": 0.14937940717597278, + "grad_norm": 10.265772440832475, + "learning_rate": 4.8795856983483454e-05, + "loss": 2.5669, + "mean_token_accuracy": 0.4620689570903778, + "step": 148310 + }, + { + "epoch": 0.14938444322907696, + "grad_norm": 14.469361902827545, + "learning_rate": 4.879573603980849e-05, + "loss": 2.7066, + "mean_token_accuracy": 0.4, + "step": 148315 + }, + { + "epoch": 0.14938947928218113, + "grad_norm": 9.65081543193366, + "learning_rate": 4.879561509022708e-05, + "loss": 2.3258, + "mean_token_accuracy": 0.4379310369491577, + "step": 148320 + }, + { + "epoch": 0.1493945153352853, + "grad_norm": 11.969685504550256, + "learning_rate": 4.879549413473926e-05, + "loss": 2.5683, + "mean_token_accuracy": 0.3827586233615875, + "step": 148325 + }, + { + "epoch": 0.14939955138838948, + "grad_norm": 10.111833002790497, + "learning_rate": 4.879537317334505e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.47931034564971925, + "step": 148330 + }, + { + "epoch": 0.14940458744149365, + "grad_norm": 8.73186656743287, + "learning_rate": 4.879525220604449e-05, + "loss": 2.4074, + "mean_token_accuracy": 0.44482758045196535, + "step": 148335 + }, + { + "epoch": 0.14940962349459783, + "grad_norm": 9.168036204906091, + "learning_rate": 4.879513123283762e-05, + "loss": 2.2334, + "mean_token_accuracy": 0.46896552443504336, + "step": 148340 + }, + { + "epoch": 0.149414659547702, + "grad_norm": 11.846550094738497, + "learning_rate": 4.879501025372445e-05, + "loss": 2.5143, + "mean_token_accuracy": 0.41724138259887694, + "step": 148345 + }, + { + "epoch": 0.14941969560080617, + "grad_norm": 10.590952234268762, + "learning_rate": 4.879488926870504e-05, + "loss": 2.3816, + "mean_token_accuracy": 0.46551724672317507, + "step": 148350 + }, + { + "epoch": 0.14942473165391035, + "grad_norm": 9.803524960239452, + "learning_rate": 4.8794768277779415e-05, + "loss": 1.9604, + "mean_token_accuracy": 0.5241379320621491, + "step": 148355 + }, + { + "epoch": 0.14942976770701452, + "grad_norm": 14.868326483510446, + "learning_rate": 4.87946472809476e-05, + "loss": 2.6741, + "mean_token_accuracy": 0.4, + "step": 148360 + }, + { + "epoch": 0.1494348037601187, + "grad_norm": 7.333397551595195, + "learning_rate": 4.879452627820963e-05, + "loss": 2.1696, + "mean_token_accuracy": 0.4551724135875702, + "step": 148365 + }, + { + "epoch": 0.14943983981322287, + "grad_norm": 9.724838965693143, + "learning_rate": 4.879440526956556e-05, + "loss": 2.436, + "mean_token_accuracy": 0.42758620977401735, + "step": 148370 + }, + { + "epoch": 0.14944487586632704, + "grad_norm": 10.310872502385898, + "learning_rate": 4.87942842550154e-05, + "loss": 2.1634, + "mean_token_accuracy": 0.460496062040329, + "step": 148375 + }, + { + "epoch": 0.14944991191943122, + "grad_norm": 10.677030462892514, + "learning_rate": 4.8794163234559185e-05, + "loss": 2.5883, + "mean_token_accuracy": 0.40344826579093934, + "step": 148380 + }, + { + "epoch": 0.1494549479725354, + "grad_norm": 9.613311474343398, + "learning_rate": 4.879404220819696e-05, + "loss": 2.0175, + "mean_token_accuracy": 0.5206896483898162, + "step": 148385 + }, + { + "epoch": 0.14945998402563956, + "grad_norm": 13.896996971768822, + "learning_rate": 4.8793921175928755e-05, + "loss": 2.9477, + "mean_token_accuracy": 0.4117967367172241, + "step": 148390 + }, + { + "epoch": 0.14946502007874374, + "grad_norm": 11.309013381608043, + "learning_rate": 4.8793800137754595e-05, + "loss": 1.8786, + "mean_token_accuracy": 0.5551724135875702, + "step": 148395 + }, + { + "epoch": 0.1494700561318479, + "grad_norm": 10.927716761509304, + "learning_rate": 4.879367909367452e-05, + "loss": 2.1021, + "mean_token_accuracy": 0.4620689630508423, + "step": 148400 + }, + { + "epoch": 0.14947509218495209, + "grad_norm": 9.130444915693413, + "learning_rate": 4.879355804368857e-05, + "loss": 2.277, + "mean_token_accuracy": 0.4379310369491577, + "step": 148405 + }, + { + "epoch": 0.14948012823805623, + "grad_norm": 8.930308734693071, + "learning_rate": 4.879343698779676e-05, + "loss": 2.3452, + "mean_token_accuracy": 0.42758620381355283, + "step": 148410 + }, + { + "epoch": 0.1494851642911604, + "grad_norm": 10.4499694150736, + "learning_rate": 4.879331592599915e-05, + "loss": 2.1978, + "mean_token_accuracy": 0.4931034505367279, + "step": 148415 + }, + { + "epoch": 0.14949020034426458, + "grad_norm": 11.749447215994785, + "learning_rate": 4.879319485829576e-05, + "loss": 2.5527, + "mean_token_accuracy": 0.3896551728248596, + "step": 148420 + }, + { + "epoch": 0.14949523639736875, + "grad_norm": 10.884922832677695, + "learning_rate": 4.8793073784686625e-05, + "loss": 2.6011, + "mean_token_accuracy": 0.4379310369491577, + "step": 148425 + }, + { + "epoch": 0.14950027245047293, + "grad_norm": 10.015806435975714, + "learning_rate": 4.8792952705171755e-05, + "loss": 2.2615, + "mean_token_accuracy": 0.4068965554237366, + "step": 148430 + }, + { + "epoch": 0.1495053085035771, + "grad_norm": 10.630052758406004, + "learning_rate": 4.879283161975122e-05, + "loss": 2.0222, + "mean_token_accuracy": 0.4965517222881317, + "step": 148435 + }, + { + "epoch": 0.14951034455668127, + "grad_norm": 10.98089015132552, + "learning_rate": 4.879271052842504e-05, + "loss": 1.9883, + "mean_token_accuracy": 0.4896551728248596, + "step": 148440 + }, + { + "epoch": 0.14951538060978545, + "grad_norm": 8.067609525428704, + "learning_rate": 4.879258943119325e-05, + "loss": 2.7161, + "mean_token_accuracy": 0.384633994102478, + "step": 148445 + }, + { + "epoch": 0.14952041666288962, + "grad_norm": 10.345105820653858, + "learning_rate": 4.879246832805587e-05, + "loss": 2.5786, + "mean_token_accuracy": 0.4103448331356049, + "step": 148450 + }, + { + "epoch": 0.1495254527159938, + "grad_norm": 10.10041364528631, + "learning_rate": 4.879234721901295e-05, + "loss": 2.1977, + "mean_token_accuracy": 0.44827585816383364, + "step": 148455 + }, + { + "epoch": 0.14953048876909797, + "grad_norm": 9.71605378251738, + "learning_rate": 4.8792226104064524e-05, + "loss": 2.3384, + "mean_token_accuracy": 0.46721113920211793, + "step": 148460 + }, + { + "epoch": 0.14953552482220214, + "grad_norm": 16.277063381384682, + "learning_rate": 4.879210498321061e-05, + "loss": 2.5945, + "mean_token_accuracy": 0.47791893482208253, + "step": 148465 + }, + { + "epoch": 0.14954056087530632, + "grad_norm": 12.194419489531699, + "learning_rate": 4.879198385645126e-05, + "loss": 2.1959, + "mean_token_accuracy": 0.441379314661026, + "step": 148470 + }, + { + "epoch": 0.1495455969284105, + "grad_norm": 9.71061939736646, + "learning_rate": 4.879186272378649e-05, + "loss": 1.931, + "mean_token_accuracy": 0.5020568609237671, + "step": 148475 + }, + { + "epoch": 0.14955063298151466, + "grad_norm": 13.272642274508467, + "learning_rate": 4.879174158521635e-05, + "loss": 2.5405, + "mean_token_accuracy": 0.3620689630508423, + "step": 148480 + }, + { + "epoch": 0.14955566903461884, + "grad_norm": 14.746158025462833, + "learning_rate": 4.879162044074086e-05, + "loss": 2.8331, + "mean_token_accuracy": 0.37241379618644715, + "step": 148485 + }, + { + "epoch": 0.149560705087723, + "grad_norm": 11.632739596647642, + "learning_rate": 4.879149929036006e-05, + "loss": 2.5489, + "mean_token_accuracy": 0.4465819776058197, + "step": 148490 + }, + { + "epoch": 0.14956574114082719, + "grad_norm": 9.695752195287898, + "learning_rate": 4.879137813407399e-05, + "loss": 1.9983, + "mean_token_accuracy": 0.5, + "step": 148495 + }, + { + "epoch": 0.14957077719393136, + "grad_norm": 10.519858993061627, + "learning_rate": 4.879125697188267e-05, + "loss": 2.488, + "mean_token_accuracy": 0.41379311084747317, + "step": 148500 + }, + { + "epoch": 0.14957581324703553, + "grad_norm": 10.993756191527162, + "learning_rate": 4.8791135803786146e-05, + "loss": 2.3292, + "mean_token_accuracy": 0.46551724076271056, + "step": 148505 + }, + { + "epoch": 0.1495808493001397, + "grad_norm": 8.359580108318214, + "learning_rate": 4.879101462978444e-05, + "loss": 2.3767, + "mean_token_accuracy": 0.4310344815254211, + "step": 148510 + }, + { + "epoch": 0.14958588535324388, + "grad_norm": 9.309462509988045, + "learning_rate": 4.87908934498776e-05, + "loss": 2.382, + "mean_token_accuracy": 0.4206896543502808, + "step": 148515 + }, + { + "epoch": 0.14959092140634805, + "grad_norm": 10.295602515321162, + "learning_rate": 4.879077226406564e-05, + "loss": 2.3324, + "mean_token_accuracy": 0.4551724135875702, + "step": 148520 + }, + { + "epoch": 0.14959595745945223, + "grad_norm": 10.38944902971831, + "learning_rate": 4.8790651072348614e-05, + "loss": 2.7604, + "mean_token_accuracy": 0.4206896543502808, + "step": 148525 + }, + { + "epoch": 0.1496009935125564, + "grad_norm": 9.178152618584285, + "learning_rate": 4.879052987472654e-05, + "loss": 2.3404, + "mean_token_accuracy": 0.4068965554237366, + "step": 148530 + }, + { + "epoch": 0.14960602956566058, + "grad_norm": 10.901861484426453, + "learning_rate": 4.879040867119946e-05, + "loss": 2.4274, + "mean_token_accuracy": 0.4068965494632721, + "step": 148535 + }, + { + "epoch": 0.14961106561876475, + "grad_norm": 9.739387451810444, + "learning_rate": 4.879028746176741e-05, + "loss": 2.3493, + "mean_token_accuracy": 0.43103448748588563, + "step": 148540 + }, + { + "epoch": 0.14961610167186892, + "grad_norm": 9.446004471544143, + "learning_rate": 4.879016624643041e-05, + "loss": 2.2514, + "mean_token_accuracy": 0.45517241954803467, + "step": 148545 + }, + { + "epoch": 0.14962113772497307, + "grad_norm": 10.526794921908618, + "learning_rate": 4.879004502518852e-05, + "loss": 2.1232, + "mean_token_accuracy": 0.4784029066562653, + "step": 148550 + }, + { + "epoch": 0.14962617377807724, + "grad_norm": 11.175589238857892, + "learning_rate": 4.878992379804174e-05, + "loss": 2.6477, + "mean_token_accuracy": 0.44482759237289426, + "step": 148555 + }, + { + "epoch": 0.14963120983118142, + "grad_norm": 9.543428036272735, + "learning_rate": 4.878980256499013e-05, + "loss": 2.2817, + "mean_token_accuracy": 0.44137930274009707, + "step": 148560 + }, + { + "epoch": 0.1496362458842856, + "grad_norm": 9.415113350912609, + "learning_rate": 4.878968132603371e-05, + "loss": 2.2891, + "mean_token_accuracy": 0.42413793206214906, + "step": 148565 + }, + { + "epoch": 0.14964128193738976, + "grad_norm": 11.36001779401205, + "learning_rate": 4.878956008117251e-05, + "loss": 2.667, + "mean_token_accuracy": 0.40344828367233276, + "step": 148570 + }, + { + "epoch": 0.14964631799049394, + "grad_norm": 14.881006271886568, + "learning_rate": 4.8789438830406586e-05, + "loss": 2.6217, + "mean_token_accuracy": 0.441379314661026, + "step": 148575 + }, + { + "epoch": 0.1496513540435981, + "grad_norm": 9.865572530580646, + "learning_rate": 4.878931757373595e-05, + "loss": 2.1517, + "mean_token_accuracy": 0.443254691362381, + "step": 148580 + }, + { + "epoch": 0.14965639009670229, + "grad_norm": 11.906719238054274, + "learning_rate": 4.878919631116064e-05, + "loss": 2.5137, + "mean_token_accuracy": 0.3724137842655182, + "step": 148585 + }, + { + "epoch": 0.14966142614980646, + "grad_norm": 9.502577276013145, + "learning_rate": 4.878907504268069e-05, + "loss": 2.2702, + "mean_token_accuracy": 0.43793103098869324, + "step": 148590 + }, + { + "epoch": 0.14966646220291063, + "grad_norm": 10.653360898934892, + "learning_rate": 4.878895376829614e-05, + "loss": 2.2837, + "mean_token_accuracy": 0.4517241358757019, + "step": 148595 + }, + { + "epoch": 0.1496714982560148, + "grad_norm": 10.891731410940247, + "learning_rate": 4.878883248800702e-05, + "loss": 2.7295, + "mean_token_accuracy": 0.324137932062149, + "step": 148600 + }, + { + "epoch": 0.14967653430911898, + "grad_norm": 9.086316416203646, + "learning_rate": 4.8788711201813365e-05, + "loss": 2.7321, + "mean_token_accuracy": 0.4068965494632721, + "step": 148605 + }, + { + "epoch": 0.14968157036222315, + "grad_norm": 10.601148418755962, + "learning_rate": 4.878858990971521e-05, + "loss": 2.6769, + "mean_token_accuracy": 0.41724138259887694, + "step": 148610 + }, + { + "epoch": 0.14968660641532733, + "grad_norm": 9.004200206360139, + "learning_rate": 4.8788468611712566e-05, + "loss": 1.7701, + "mean_token_accuracy": 0.5379310190677643, + "step": 148615 + }, + { + "epoch": 0.1496916424684315, + "grad_norm": 10.74539967391511, + "learning_rate": 4.87883473078055e-05, + "loss": 2.1114, + "mean_token_accuracy": 0.45517241954803467, + "step": 148620 + }, + { + "epoch": 0.14969667852153568, + "grad_norm": 10.052218926308193, + "learning_rate": 4.878822599799403e-05, + "loss": 2.34, + "mean_token_accuracy": 0.4586206912994385, + "step": 148625 + }, + { + "epoch": 0.14970171457463985, + "grad_norm": 10.529776402739493, + "learning_rate": 4.8788104682278194e-05, + "loss": 2.2907, + "mean_token_accuracy": 0.43103448748588563, + "step": 148630 + }, + { + "epoch": 0.14970675062774402, + "grad_norm": 11.32648192754511, + "learning_rate": 4.878798336065802e-05, + "loss": 2.508, + "mean_token_accuracy": 0.42413793206214906, + "step": 148635 + }, + { + "epoch": 0.1497117866808482, + "grad_norm": 10.448057935138856, + "learning_rate": 4.878786203313354e-05, + "loss": 2.3618, + "mean_token_accuracy": 0.44482757449150084, + "step": 148640 + }, + { + "epoch": 0.14971682273395237, + "grad_norm": 8.313293761136984, + "learning_rate": 4.878774069970479e-05, + "loss": 1.7163, + "mean_token_accuracy": 0.5586206793785096, + "step": 148645 + }, + { + "epoch": 0.14972185878705654, + "grad_norm": 10.557855113077451, + "learning_rate": 4.878761936037182e-05, + "loss": 2.2664, + "mean_token_accuracy": 0.4379310369491577, + "step": 148650 + }, + { + "epoch": 0.14972689484016072, + "grad_norm": 9.361030906470665, + "learning_rate": 4.8787498015134644e-05, + "loss": 2.1178, + "mean_token_accuracy": 0.4793103516101837, + "step": 148655 + }, + { + "epoch": 0.1497319308932649, + "grad_norm": 10.295678940027424, + "learning_rate": 4.8787376663993296e-05, + "loss": 2.6094, + "mean_token_accuracy": 0.43236538767814636, + "step": 148660 + }, + { + "epoch": 0.14973696694636907, + "grad_norm": 11.960768941097838, + "learning_rate": 4.878725530694781e-05, + "loss": 2.4738, + "mean_token_accuracy": 0.39310343861579894, + "step": 148665 + }, + { + "epoch": 0.14974200299947324, + "grad_norm": 9.959753009658037, + "learning_rate": 4.878713394399823e-05, + "loss": 2.4047, + "mean_token_accuracy": 0.42068964838981626, + "step": 148670 + }, + { + "epoch": 0.1497470390525774, + "grad_norm": 9.535012635855178, + "learning_rate": 4.8787012575144585e-05, + "loss": 2.5496, + "mean_token_accuracy": 0.4310344815254211, + "step": 148675 + }, + { + "epoch": 0.1497520751056816, + "grad_norm": 10.72258053360446, + "learning_rate": 4.878689120038691e-05, + "loss": 2.5148, + "mean_token_accuracy": 0.3931034505367279, + "step": 148680 + }, + { + "epoch": 0.14975711115878576, + "grad_norm": 11.355359417416457, + "learning_rate": 4.878676981972523e-05, + "loss": 2.313, + "mean_token_accuracy": 0.422202056646347, + "step": 148685 + }, + { + "epoch": 0.1497621472118899, + "grad_norm": 9.342657878267854, + "learning_rate": 4.878664843315959e-05, + "loss": 2.0304, + "mean_token_accuracy": 0.5160314619541169, + "step": 148690 + }, + { + "epoch": 0.14976718326499408, + "grad_norm": 11.44283549233883, + "learning_rate": 4.8786527040690025e-05, + "loss": 2.7469, + "mean_token_accuracy": 0.4, + "step": 148695 + }, + { + "epoch": 0.14977221931809825, + "grad_norm": 10.724944970993485, + "learning_rate": 4.8786405642316554e-05, + "loss": 2.0304, + "mean_token_accuracy": 0.44827585220336913, + "step": 148700 + }, + { + "epoch": 0.14977725537120243, + "grad_norm": 10.729697271669913, + "learning_rate": 4.878628423803922e-05, + "loss": 2.4441, + "mean_token_accuracy": 0.4241379201412201, + "step": 148705 + }, + { + "epoch": 0.1497822914243066, + "grad_norm": 11.515331184858699, + "learning_rate": 4.878616282785805e-05, + "loss": 2.5693, + "mean_token_accuracy": 0.3965517282485962, + "step": 148710 + }, + { + "epoch": 0.14978732747741078, + "grad_norm": 9.19058191607933, + "learning_rate": 4.878604141177309e-05, + "loss": 2.345, + "mean_token_accuracy": 0.46896552443504336, + "step": 148715 + }, + { + "epoch": 0.14979236353051495, + "grad_norm": 12.720433430729209, + "learning_rate": 4.878591998978436e-05, + "loss": 2.3439, + "mean_token_accuracy": 0.4395644307136536, + "step": 148720 + }, + { + "epoch": 0.14979739958361912, + "grad_norm": 9.643130474407748, + "learning_rate": 4.878579856189191e-05, + "loss": 2.3975, + "mean_token_accuracy": 0.4103448212146759, + "step": 148725 + }, + { + "epoch": 0.1498024356367233, + "grad_norm": 9.539707176881274, + "learning_rate": 4.8785677128095756e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.3896551728248596, + "step": 148730 + }, + { + "epoch": 0.14980747168982747, + "grad_norm": 10.989628210174091, + "learning_rate": 4.8785555688395954e-05, + "loss": 2.4258, + "mean_token_accuracy": 0.41379310488700866, + "step": 148735 + }, + { + "epoch": 0.14981250774293164, + "grad_norm": 11.014661865481312, + "learning_rate": 4.878543424279251e-05, + "loss": 2.2241, + "mean_token_accuracy": 0.41379310488700866, + "step": 148740 + }, + { + "epoch": 0.14981754379603582, + "grad_norm": 9.115445852981367, + "learning_rate": 4.878531279128548e-05, + "loss": 2.8028, + "mean_token_accuracy": 0.37931033968925476, + "step": 148745 + }, + { + "epoch": 0.14982257984914, + "grad_norm": 9.10019966275961, + "learning_rate": 4.878519133387488e-05, + "loss": 1.9476, + "mean_token_accuracy": 0.47586206793785096, + "step": 148750 + }, + { + "epoch": 0.14982761590224417, + "grad_norm": 9.209248147045804, + "learning_rate": 4.878506987056076e-05, + "loss": 2.266, + "mean_token_accuracy": 0.4310344815254211, + "step": 148755 + }, + { + "epoch": 0.14983265195534834, + "grad_norm": 8.635951368430888, + "learning_rate": 4.878494840134314e-05, + "loss": 2.2878, + "mean_token_accuracy": 0.39794313311576845, + "step": 148760 + }, + { + "epoch": 0.1498376880084525, + "grad_norm": 11.425567283496296, + "learning_rate": 4.878482692622206e-05, + "loss": 2.8391, + "mean_token_accuracy": 0.3896551728248596, + "step": 148765 + }, + { + "epoch": 0.1498427240615567, + "grad_norm": 11.610636106879273, + "learning_rate": 4.8784705445197564e-05, + "loss": 2.4379, + "mean_token_accuracy": 0.4896551787853241, + "step": 148770 + }, + { + "epoch": 0.14984776011466086, + "grad_norm": 12.747569915168574, + "learning_rate": 4.878458395826967e-05, + "loss": 2.2274, + "mean_token_accuracy": 0.45517241954803467, + "step": 148775 + }, + { + "epoch": 0.14985279616776503, + "grad_norm": 9.773138455861067, + "learning_rate": 4.878446246543842e-05, + "loss": 2.2831, + "mean_token_accuracy": 0.4724137902259827, + "step": 148780 + }, + { + "epoch": 0.1498578322208692, + "grad_norm": 13.892887420402237, + "learning_rate": 4.878434096670384e-05, + "loss": 2.6799, + "mean_token_accuracy": 0.4448275864124298, + "step": 148785 + }, + { + "epoch": 0.14986286827397338, + "grad_norm": 11.442342608476883, + "learning_rate": 4.878421946206597e-05, + "loss": 2.9005, + "mean_token_accuracy": 0.43103448748588563, + "step": 148790 + }, + { + "epoch": 0.14986790432707756, + "grad_norm": 12.133394612618105, + "learning_rate": 4.878409795152484e-05, + "loss": 2.7557, + "mean_token_accuracy": 0.43581366539001465, + "step": 148795 + }, + { + "epoch": 0.14987294038018173, + "grad_norm": 11.272833980475172, + "learning_rate": 4.878397643508049e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.4068965494632721, + "step": 148800 + }, + { + "epoch": 0.1498779764332859, + "grad_norm": 8.786581817368605, + "learning_rate": 4.878385491273295e-05, + "loss": 2.6901, + "mean_token_accuracy": 0.43103447556495667, + "step": 148805 + }, + { + "epoch": 0.14988301248639008, + "grad_norm": 10.254010243849295, + "learning_rate": 4.878373338448226e-05, + "loss": 2.2083, + "mean_token_accuracy": 0.4413793087005615, + "step": 148810 + }, + { + "epoch": 0.14988804853949425, + "grad_norm": 10.231022723134345, + "learning_rate": 4.8783611850328436e-05, + "loss": 2.6185, + "mean_token_accuracy": 0.3931034505367279, + "step": 148815 + }, + { + "epoch": 0.14989308459259842, + "grad_norm": 14.05618257685569, + "learning_rate": 4.878349031027153e-05, + "loss": 2.5343, + "mean_token_accuracy": 0.3999999940395355, + "step": 148820 + }, + { + "epoch": 0.1498981206457026, + "grad_norm": 10.071231512045767, + "learning_rate": 4.878336876431156e-05, + "loss": 2.2721, + "mean_token_accuracy": 0.4068965494632721, + "step": 148825 + }, + { + "epoch": 0.14990315669880674, + "grad_norm": 9.069083442024345, + "learning_rate": 4.8783247212448575e-05, + "loss": 1.8482, + "mean_token_accuracy": 0.5034482657909394, + "step": 148830 + }, + { + "epoch": 0.14990819275191092, + "grad_norm": 9.367930012235796, + "learning_rate": 4.87831256546826e-05, + "loss": 1.9385, + "mean_token_accuracy": 0.5059286117553711, + "step": 148835 + }, + { + "epoch": 0.1499132288050151, + "grad_norm": 11.94402885022271, + "learning_rate": 4.878300409101367e-05, + "loss": 2.3734, + "mean_token_accuracy": 0.42413793206214906, + "step": 148840 + }, + { + "epoch": 0.14991826485811927, + "grad_norm": 12.019428941739706, + "learning_rate": 4.878288252144181e-05, + "loss": 3.0016, + "mean_token_accuracy": 0.3977011501789093, + "step": 148845 + }, + { + "epoch": 0.14992330091122344, + "grad_norm": 10.197775634114915, + "learning_rate": 4.878276094596708e-05, + "loss": 2.5506, + "mean_token_accuracy": 0.39655172228813174, + "step": 148850 + }, + { + "epoch": 0.1499283369643276, + "grad_norm": 9.764477539721625, + "learning_rate": 4.8782639364589485e-05, + "loss": 2.2504, + "mean_token_accuracy": 0.46896551847457885, + "step": 148855 + }, + { + "epoch": 0.1499333730174318, + "grad_norm": 14.046883612809316, + "learning_rate": 4.878251777730908e-05, + "loss": 2.3325, + "mean_token_accuracy": 0.46551724672317507, + "step": 148860 + }, + { + "epoch": 0.14993840907053596, + "grad_norm": 9.04202385634362, + "learning_rate": 4.878239618412589e-05, + "loss": 2.0593, + "mean_token_accuracy": 0.4517241299152374, + "step": 148865 + }, + { + "epoch": 0.14994344512364013, + "grad_norm": 8.243663159017458, + "learning_rate": 4.878227458503994e-05, + "loss": 2.6628, + "mean_token_accuracy": 0.4068965494632721, + "step": 148870 + }, + { + "epoch": 0.1499484811767443, + "grad_norm": 11.426014531084967, + "learning_rate": 4.878215298005127e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.4034482777118683, + "step": 148875 + }, + { + "epoch": 0.14995351722984848, + "grad_norm": 10.020135155018252, + "learning_rate": 4.878203136915992e-05, + "loss": 2.1685, + "mean_token_accuracy": 0.46896551847457885, + "step": 148880 + }, + { + "epoch": 0.14995855328295266, + "grad_norm": 11.532289313172079, + "learning_rate": 4.878190975236592e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.4551724135875702, + "step": 148885 + }, + { + "epoch": 0.14996358933605683, + "grad_norm": 12.9859053630927, + "learning_rate": 4.878178812966931e-05, + "loss": 2.2446, + "mean_token_accuracy": 0.48275862336158754, + "step": 148890 + }, + { + "epoch": 0.149968625389161, + "grad_norm": 11.543856647959993, + "learning_rate": 4.878166650107011e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.4, + "step": 148895 + }, + { + "epoch": 0.14997366144226518, + "grad_norm": 11.8968978190887, + "learning_rate": 4.8781544866568354e-05, + "loss": 2.6225, + "mean_token_accuracy": 0.42758620977401735, + "step": 148900 + }, + { + "epoch": 0.14997869749536935, + "grad_norm": 9.076593654845928, + "learning_rate": 4.878142322616409e-05, + "loss": 2.5203, + "mean_token_accuracy": 0.4103448331356049, + "step": 148905 + }, + { + "epoch": 0.14998373354847352, + "grad_norm": 11.38870506646091, + "learning_rate": 4.878130157985734e-05, + "loss": 2.1623, + "mean_token_accuracy": 0.4206896543502808, + "step": 148910 + }, + { + "epoch": 0.1499887696015777, + "grad_norm": 9.964757215398352, + "learning_rate": 4.878117992764814e-05, + "loss": 2.6638, + "mean_token_accuracy": 0.42758620381355283, + "step": 148915 + }, + { + "epoch": 0.14999380565468187, + "grad_norm": 11.51807322376013, + "learning_rate": 4.878105826953653e-05, + "loss": 2.6976, + "mean_token_accuracy": 0.3793103456497192, + "step": 148920 + }, + { + "epoch": 0.14999884170778605, + "grad_norm": 10.919341263421053, + "learning_rate": 4.878093660552254e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.3965517163276672, + "step": 148925 + }, + { + "epoch": 0.15000387776089022, + "grad_norm": 9.707162326772812, + "learning_rate": 4.87808149356062e-05, + "loss": 2.3382, + "mean_token_accuracy": 0.43103448748588563, + "step": 148930 + }, + { + "epoch": 0.1500089138139944, + "grad_norm": 8.820594368317725, + "learning_rate": 4.878069325978755e-05, + "loss": 2.1495, + "mean_token_accuracy": 0.4517241418361664, + "step": 148935 + }, + { + "epoch": 0.15001394986709857, + "grad_norm": 9.614218955216476, + "learning_rate": 4.8780571578066606e-05, + "loss": 2.7179, + "mean_token_accuracy": 0.42413793206214906, + "step": 148940 + }, + { + "epoch": 0.15001898592020274, + "grad_norm": 7.759849633117516, + "learning_rate": 4.878044989044343e-05, + "loss": 2.4532, + "mean_token_accuracy": 0.4379310250282288, + "step": 148945 + }, + { + "epoch": 0.15002402197330691, + "grad_norm": 10.271266963285328, + "learning_rate": 4.8780328196918045e-05, + "loss": 2.3861, + "mean_token_accuracy": 0.4448275864124298, + "step": 148950 + }, + { + "epoch": 0.1500290580264111, + "grad_norm": 7.403448989981989, + "learning_rate": 4.878020649749048e-05, + "loss": 2.2682, + "mean_token_accuracy": 0.4862069010734558, + "step": 148955 + }, + { + "epoch": 0.15003409407951526, + "grad_norm": 9.291017232304474, + "learning_rate": 4.8780084792160764e-05, + "loss": 2.275, + "mean_token_accuracy": 0.4379310250282288, + "step": 148960 + }, + { + "epoch": 0.15003913013261944, + "grad_norm": 9.578114053205415, + "learning_rate": 4.877996308092893e-05, + "loss": 2.3154, + "mean_token_accuracy": 0.42413793206214906, + "step": 148965 + }, + { + "epoch": 0.15004416618572358, + "grad_norm": 9.919018049511516, + "learning_rate": 4.877984136379503e-05, + "loss": 2.4315, + "mean_token_accuracy": 0.4413793087005615, + "step": 148970 + }, + { + "epoch": 0.15004920223882776, + "grad_norm": 9.391851167547143, + "learning_rate": 4.8779719640759095e-05, + "loss": 2.0621, + "mean_token_accuracy": 0.493103438615799, + "step": 148975 + }, + { + "epoch": 0.15005423829193193, + "grad_norm": 9.44248432454044, + "learning_rate": 4.877959791182113e-05, + "loss": 3.3055, + "mean_token_accuracy": 0.37586206793785093, + "step": 148980 + }, + { + "epoch": 0.1500592743450361, + "grad_norm": 10.491930120988131, + "learning_rate": 4.87794761769812e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.39310344457626345, + "step": 148985 + }, + { + "epoch": 0.15006431039814028, + "grad_norm": 10.74666710167593, + "learning_rate": 4.8779354436239325e-05, + "loss": 2.3343, + "mean_token_accuracy": 0.4517241299152374, + "step": 148990 + }, + { + "epoch": 0.15006934645124445, + "grad_norm": 9.267816190884194, + "learning_rate": 4.877923268959555e-05, + "loss": 2.1378, + "mean_token_accuracy": 0.4862069010734558, + "step": 148995 + }, + { + "epoch": 0.15007438250434862, + "grad_norm": 9.858592186187275, + "learning_rate": 4.877911093704989e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.4655172348022461, + "step": 149000 + }, + { + "epoch": 0.1500794185574528, + "grad_norm": 10.757871962043437, + "learning_rate": 4.87789891786024e-05, + "loss": 2.1928, + "mean_token_accuracy": 0.49999999403953554, + "step": 149005 + }, + { + "epoch": 0.15008445461055697, + "grad_norm": 9.408616508959676, + "learning_rate": 4.8778867414253096e-05, + "loss": 2.4488, + "mean_token_accuracy": 0.4103448212146759, + "step": 149010 + }, + { + "epoch": 0.15008949066366115, + "grad_norm": 12.575984067721732, + "learning_rate": 4.8778745644002014e-05, + "loss": 2.5434, + "mean_token_accuracy": 0.41034482717514037, + "step": 149015 + }, + { + "epoch": 0.15009452671676532, + "grad_norm": 12.11592356626097, + "learning_rate": 4.87786238678492e-05, + "loss": 2.379, + "mean_token_accuracy": 0.4172413766384125, + "step": 149020 + }, + { + "epoch": 0.1500995627698695, + "grad_norm": 8.501044669073574, + "learning_rate": 4.877850208579468e-05, + "loss": 2.5428, + "mean_token_accuracy": 0.4172413796186447, + "step": 149025 + }, + { + "epoch": 0.15010459882297367, + "grad_norm": 10.395452406559038, + "learning_rate": 4.877838029783848e-05, + "loss": 2.3063, + "mean_token_accuracy": 0.44827585816383364, + "step": 149030 + }, + { + "epoch": 0.15010963487607784, + "grad_norm": 11.122363805470826, + "learning_rate": 4.877825850398065e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.42758620977401735, + "step": 149035 + }, + { + "epoch": 0.15011467092918201, + "grad_norm": 11.790340827487478, + "learning_rate": 4.877813670422121e-05, + "loss": 2.8304, + "mean_token_accuracy": 0.37241379618644715, + "step": 149040 + }, + { + "epoch": 0.1501197069822862, + "grad_norm": 10.961885638719535, + "learning_rate": 4.87780148985602e-05, + "loss": 2.3959, + "mean_token_accuracy": 0.4482758641242981, + "step": 149045 + }, + { + "epoch": 0.15012474303539036, + "grad_norm": 10.516540892468106, + "learning_rate": 4.877789308699766e-05, + "loss": 2.0377, + "mean_token_accuracy": 0.4551724076271057, + "step": 149050 + }, + { + "epoch": 0.15012977908849454, + "grad_norm": 10.10650256499464, + "learning_rate": 4.8777771269533615e-05, + "loss": 2.4774, + "mean_token_accuracy": 0.4758620738983154, + "step": 149055 + }, + { + "epoch": 0.1501348151415987, + "grad_norm": 11.354013503005183, + "learning_rate": 4.87776494461681e-05, + "loss": 2.5554, + "mean_token_accuracy": 0.37931033968925476, + "step": 149060 + }, + { + "epoch": 0.15013985119470288, + "grad_norm": 10.048751204023914, + "learning_rate": 4.8777527616901155e-05, + "loss": 2.1382, + "mean_token_accuracy": 0.44482759237289426, + "step": 149065 + }, + { + "epoch": 0.15014488724780706, + "grad_norm": 12.242922317850937, + "learning_rate": 4.87774057817328e-05, + "loss": 2.4445, + "mean_token_accuracy": 0.42068966031074523, + "step": 149070 + }, + { + "epoch": 0.15014992330091123, + "grad_norm": 12.636820440373466, + "learning_rate": 4.877728394066308e-05, + "loss": 2.6001, + "mean_token_accuracy": 0.38620689511299133, + "step": 149075 + }, + { + "epoch": 0.1501549593540154, + "grad_norm": 9.973016619717942, + "learning_rate": 4.877716209369202e-05, + "loss": 2.1032, + "mean_token_accuracy": 0.4758620738983154, + "step": 149080 + }, + { + "epoch": 0.15015999540711958, + "grad_norm": 11.203408280948786, + "learning_rate": 4.877704024081966e-05, + "loss": 2.6174, + "mean_token_accuracy": 0.3862068891525269, + "step": 149085 + }, + { + "epoch": 0.15016503146022375, + "grad_norm": 10.333547039096494, + "learning_rate": 4.877691838204604e-05, + "loss": 2.6543, + "mean_token_accuracy": 0.36551723480224607, + "step": 149090 + }, + { + "epoch": 0.15017006751332793, + "grad_norm": 10.845263814813272, + "learning_rate": 4.8776796517371186e-05, + "loss": 2.3034, + "mean_token_accuracy": 0.44652147889137267, + "step": 149095 + }, + { + "epoch": 0.1501751035664321, + "grad_norm": 10.777521911190956, + "learning_rate": 4.877667464679514e-05, + "loss": 2.4951, + "mean_token_accuracy": 0.4034482717514038, + "step": 149100 + }, + { + "epoch": 0.15018013961953627, + "grad_norm": 9.960942792422149, + "learning_rate": 4.877655277031792e-05, + "loss": 2.5793, + "mean_token_accuracy": 0.4103448301553726, + "step": 149105 + }, + { + "epoch": 0.15018517567264042, + "grad_norm": 11.364894197839963, + "learning_rate": 4.877643088793957e-05, + "loss": 2.1251, + "mean_token_accuracy": 0.41034482419490814, + "step": 149110 + }, + { + "epoch": 0.1501902117257446, + "grad_norm": 13.104924108626037, + "learning_rate": 4.877630899966012e-05, + "loss": 3.132, + "mean_token_accuracy": 0.29655172526836393, + "step": 149115 + }, + { + "epoch": 0.15019524777884877, + "grad_norm": 10.310989905943005, + "learning_rate": 4.877618710547961e-05, + "loss": 2.5062, + "mean_token_accuracy": 0.3551724135875702, + "step": 149120 + }, + { + "epoch": 0.15020028383195294, + "grad_norm": 9.967542614058262, + "learning_rate": 4.877606520539807e-05, + "loss": 2.1026, + "mean_token_accuracy": 0.4586206912994385, + "step": 149125 + }, + { + "epoch": 0.15020531988505711, + "grad_norm": 10.845694508080364, + "learning_rate": 4.877594329941553e-05, + "loss": 2.4612, + "mean_token_accuracy": 0.3896551787853241, + "step": 149130 + }, + { + "epoch": 0.1502103559381613, + "grad_norm": 10.533629399924388, + "learning_rate": 4.877582138753203e-05, + "loss": 2.9102, + "mean_token_accuracy": 0.3793103516101837, + "step": 149135 + }, + { + "epoch": 0.15021539199126546, + "grad_norm": 9.404855071504524, + "learning_rate": 4.8775699469747605e-05, + "loss": 2.7323, + "mean_token_accuracy": 0.37241379022598264, + "step": 149140 + }, + { + "epoch": 0.15022042804436964, + "grad_norm": 8.008902234627529, + "learning_rate": 4.877557754606227e-05, + "loss": 2.2674, + "mean_token_accuracy": 0.47241379618644713, + "step": 149145 + }, + { + "epoch": 0.1502254640974738, + "grad_norm": 8.82197542943708, + "learning_rate": 4.877545561647609e-05, + "loss": 2.5263, + "mean_token_accuracy": 0.4448275864124298, + "step": 149150 + }, + { + "epoch": 0.15023050015057798, + "grad_norm": 8.347223630394668, + "learning_rate": 4.877533368098908e-05, + "loss": 2.3304, + "mean_token_accuracy": 0.4329703599214554, + "step": 149155 + }, + { + "epoch": 0.15023553620368216, + "grad_norm": 10.424946909484468, + "learning_rate": 4.877521173960128e-05, + "loss": 2.4824, + "mean_token_accuracy": 0.3931034505367279, + "step": 149160 + }, + { + "epoch": 0.15024057225678633, + "grad_norm": 11.320850131146, + "learning_rate": 4.8775089792312725e-05, + "loss": 2.5967, + "mean_token_accuracy": 0.41379310488700866, + "step": 149165 + }, + { + "epoch": 0.1502456083098905, + "grad_norm": 11.00300938683333, + "learning_rate": 4.877496783912343e-05, + "loss": 2.2146, + "mean_token_accuracy": 0.4496672749519348, + "step": 149170 + }, + { + "epoch": 0.15025064436299468, + "grad_norm": 10.470230954318321, + "learning_rate": 4.8774845880033455e-05, + "loss": 2.6855, + "mean_token_accuracy": 0.42068966031074523, + "step": 149175 + }, + { + "epoch": 0.15025568041609885, + "grad_norm": 13.7454723795735, + "learning_rate": 4.877472391504282e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.42758620977401735, + "step": 149180 + }, + { + "epoch": 0.15026071646920303, + "grad_norm": 10.745877053318678, + "learning_rate": 4.877460194415155e-05, + "loss": 2.5242, + "mean_token_accuracy": 0.4344827651977539, + "step": 149185 + }, + { + "epoch": 0.1502657525223072, + "grad_norm": 10.848949143388364, + "learning_rate": 4.87744799673597e-05, + "loss": 2.3681, + "mean_token_accuracy": 0.4482758641242981, + "step": 149190 + }, + { + "epoch": 0.15027078857541137, + "grad_norm": 10.287839339477767, + "learning_rate": 4.87743579846673e-05, + "loss": 2.3982, + "mean_token_accuracy": 0.458620685338974, + "step": 149195 + }, + { + "epoch": 0.15027582462851555, + "grad_norm": 9.315448651543322, + "learning_rate": 4.8774235996074365e-05, + "loss": 2.3174, + "mean_token_accuracy": 0.4275861978530884, + "step": 149200 + }, + { + "epoch": 0.15028086068161972, + "grad_norm": 9.761662336587126, + "learning_rate": 4.877411400158094e-05, + "loss": 2.4101, + "mean_token_accuracy": 0.41724138259887694, + "step": 149205 + }, + { + "epoch": 0.1502858967347239, + "grad_norm": 11.528203017076272, + "learning_rate": 4.877399200118707e-05, + "loss": 2.3096, + "mean_token_accuracy": 0.4379310429096222, + "step": 149210 + }, + { + "epoch": 0.15029093278782807, + "grad_norm": 10.440056460581612, + "learning_rate": 4.877386999489277e-05, + "loss": 2.4287, + "mean_token_accuracy": 0.3931034505367279, + "step": 149215 + }, + { + "epoch": 0.15029596884093224, + "grad_norm": 8.533533323501887, + "learning_rate": 4.8773747982698086e-05, + "loss": 2.2564, + "mean_token_accuracy": 0.4779189318418503, + "step": 149220 + }, + { + "epoch": 0.15030100489403642, + "grad_norm": 10.388192434978636, + "learning_rate": 4.877362596460305e-05, + "loss": 2.3945, + "mean_token_accuracy": 0.41034482717514037, + "step": 149225 + }, + { + "epoch": 0.1503060409471406, + "grad_norm": 10.86507885932944, + "learning_rate": 4.87735039406077e-05, + "loss": 2.8181, + "mean_token_accuracy": 0.41034482717514037, + "step": 149230 + }, + { + "epoch": 0.15031107700024476, + "grad_norm": 10.066701372831748, + "learning_rate": 4.877338191071205e-05, + "loss": 2.9566, + "mean_token_accuracy": 0.42758620977401735, + "step": 149235 + }, + { + "epoch": 0.15031611305334894, + "grad_norm": 10.812436793850436, + "learning_rate": 4.8773259874916165e-05, + "loss": 2.5275, + "mean_token_accuracy": 0.39310344457626345, + "step": 149240 + }, + { + "epoch": 0.1503211491064531, + "grad_norm": 16.360855448456203, + "learning_rate": 4.877313783322006e-05, + "loss": 2.4582, + "mean_token_accuracy": 0.4448275983333588, + "step": 149245 + }, + { + "epoch": 0.15032618515955726, + "grad_norm": 11.22783409596567, + "learning_rate": 4.877301578562375e-05, + "loss": 2.7584, + "mean_token_accuracy": 0.35862069129943847, + "step": 149250 + }, + { + "epoch": 0.15033122121266143, + "grad_norm": 16.856161186586128, + "learning_rate": 4.877289373212731e-05, + "loss": 2.1153, + "mean_token_accuracy": 0.5034482777118683, + "step": 149255 + }, + { + "epoch": 0.1503362572657656, + "grad_norm": 8.961445332797537, + "learning_rate": 4.877277167273075e-05, + "loss": 2.3328, + "mean_token_accuracy": 0.41724138259887694, + "step": 149260 + }, + { + "epoch": 0.15034129331886978, + "grad_norm": 10.668934313505392, + "learning_rate": 4.877264960743411e-05, + "loss": 2.609, + "mean_token_accuracy": 0.3827586233615875, + "step": 149265 + }, + { + "epoch": 0.15034632937197395, + "grad_norm": 15.418782403901512, + "learning_rate": 4.877252753623742e-05, + "loss": 2.9705, + "mean_token_accuracy": 0.3931034505367279, + "step": 149270 + }, + { + "epoch": 0.15035136542507813, + "grad_norm": 9.610150668532713, + "learning_rate": 4.87724054591407e-05, + "loss": 1.9062, + "mean_token_accuracy": 0.5034482657909394, + "step": 149275 + }, + { + "epoch": 0.1503564014781823, + "grad_norm": 9.16564707574316, + "learning_rate": 4.8772283376144015e-05, + "loss": 2.26, + "mean_token_accuracy": 0.4482758641242981, + "step": 149280 + }, + { + "epoch": 0.15036143753128647, + "grad_norm": 9.912865727384233, + "learning_rate": 4.877216128724738e-05, + "loss": 2.5221, + "mean_token_accuracy": 0.3896551787853241, + "step": 149285 + }, + { + "epoch": 0.15036647358439065, + "grad_norm": 9.90997671815578, + "learning_rate": 4.8772039192450825e-05, + "loss": 2.3044, + "mean_token_accuracy": 0.45015124082565305, + "step": 149290 + }, + { + "epoch": 0.15037150963749482, + "grad_norm": 11.336666980570953, + "learning_rate": 4.87719170917544e-05, + "loss": 2.3481, + "mean_token_accuracy": 0.4034482717514038, + "step": 149295 + }, + { + "epoch": 0.150376545690599, + "grad_norm": 11.430705908547983, + "learning_rate": 4.877179498515812e-05, + "loss": 2.4726, + "mean_token_accuracy": 0.40344828367233276, + "step": 149300 + }, + { + "epoch": 0.15038158174370317, + "grad_norm": 11.413600274057302, + "learning_rate": 4.877167287266204e-05, + "loss": 2.2024, + "mean_token_accuracy": 0.482758629322052, + "step": 149305 + }, + { + "epoch": 0.15038661779680734, + "grad_norm": 10.866667473949668, + "learning_rate": 4.877155075426617e-05, + "loss": 2.4666, + "mean_token_accuracy": 0.42413793206214906, + "step": 149310 + }, + { + "epoch": 0.15039165384991152, + "grad_norm": 8.734110171638065, + "learning_rate": 4.877142862997056e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.4172413647174835, + "step": 149315 + }, + { + "epoch": 0.1503966899030157, + "grad_norm": 9.512535632327548, + "learning_rate": 4.8771306499775244e-05, + "loss": 2.5164, + "mean_token_accuracy": 0.35862069129943847, + "step": 149320 + }, + { + "epoch": 0.15040172595611986, + "grad_norm": 8.81103727049511, + "learning_rate": 4.877118436368025e-05, + "loss": 2.798, + "mean_token_accuracy": 0.3482758641242981, + "step": 149325 + }, + { + "epoch": 0.15040676200922404, + "grad_norm": 11.204882573941534, + "learning_rate": 4.877106222168562e-05, + "loss": 2.4417, + "mean_token_accuracy": 0.4034482777118683, + "step": 149330 + }, + { + "epoch": 0.1504117980623282, + "grad_norm": 11.05799001645349, + "learning_rate": 4.877094007379137e-05, + "loss": 2.2249, + "mean_token_accuracy": 0.4517241358757019, + "step": 149335 + }, + { + "epoch": 0.15041683411543239, + "grad_norm": 10.010865410151064, + "learning_rate": 4.877081791999755e-05, + "loss": 2.1427, + "mean_token_accuracy": 0.458620685338974, + "step": 149340 + }, + { + "epoch": 0.15042187016853656, + "grad_norm": 11.702188432915667, + "learning_rate": 4.8770695760304183e-05, + "loss": 2.459, + "mean_token_accuracy": 0.4034482717514038, + "step": 149345 + }, + { + "epoch": 0.15042690622164073, + "grad_norm": 10.4820106833545, + "learning_rate": 4.877057359471133e-05, + "loss": 2.4436, + "mean_token_accuracy": 0.3999999940395355, + "step": 149350 + }, + { + "epoch": 0.1504319422747449, + "grad_norm": 11.9252868977379, + "learning_rate": 4.8770451423218985e-05, + "loss": 2.641, + "mean_token_accuracy": 0.4206896543502808, + "step": 149355 + }, + { + "epoch": 0.15043697832784908, + "grad_norm": 10.841181273833318, + "learning_rate": 4.87703292458272e-05, + "loss": 3.0287, + "mean_token_accuracy": 0.30689655244350433, + "step": 149360 + }, + { + "epoch": 0.15044201438095325, + "grad_norm": 11.468360403874234, + "learning_rate": 4.8770207062536024e-05, + "loss": 2.526, + "mean_token_accuracy": 0.41379310488700866, + "step": 149365 + }, + { + "epoch": 0.15044705043405743, + "grad_norm": 14.506096313675567, + "learning_rate": 4.877008487334546e-05, + "loss": 2.3074, + "mean_token_accuracy": 0.42758620381355283, + "step": 149370 + }, + { + "epoch": 0.1504520864871616, + "grad_norm": 10.6157349502536, + "learning_rate": 4.876996267825557e-05, + "loss": 2.4079, + "mean_token_accuracy": 0.4330913364887238, + "step": 149375 + }, + { + "epoch": 0.15045712254026578, + "grad_norm": 13.243265727732807, + "learning_rate": 4.876984047726638e-05, + "loss": 2.4476, + "mean_token_accuracy": 0.41379310488700866, + "step": 149380 + }, + { + "epoch": 0.15046215859336995, + "grad_norm": 9.444903195673113, + "learning_rate": 4.876971827037791e-05, + "loss": 2.2499, + "mean_token_accuracy": 0.46551724672317507, + "step": 149385 + }, + { + "epoch": 0.1504671946464741, + "grad_norm": 9.185852543320232, + "learning_rate": 4.876959605759021e-05, + "loss": 2.5892, + "mean_token_accuracy": 0.37586206793785093, + "step": 149390 + }, + { + "epoch": 0.15047223069957827, + "grad_norm": 10.788080496086017, + "learning_rate": 4.876947383890331e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.48620688915252686, + "step": 149395 + }, + { + "epoch": 0.15047726675268244, + "grad_norm": 13.227738234343674, + "learning_rate": 4.876935161431723e-05, + "loss": 2.6437, + "mean_token_accuracy": 0.4, + "step": 149400 + }, + { + "epoch": 0.15048230280578662, + "grad_norm": 12.452748156590717, + "learning_rate": 4.8769229383832024e-05, + "loss": 2.7981, + "mean_token_accuracy": 0.3965517163276672, + "step": 149405 + }, + { + "epoch": 0.1504873388588908, + "grad_norm": 10.493279317078986, + "learning_rate": 4.876910714744772e-05, + "loss": 2.4059, + "mean_token_accuracy": 0.41034482717514037, + "step": 149410 + }, + { + "epoch": 0.15049237491199496, + "grad_norm": 9.929553696538715, + "learning_rate": 4.876898490516435e-05, + "loss": 2.1125, + "mean_token_accuracy": 0.4517241358757019, + "step": 149415 + }, + { + "epoch": 0.15049741096509914, + "grad_norm": 10.224033646320988, + "learning_rate": 4.8768862656981945e-05, + "loss": 2.3911, + "mean_token_accuracy": 0.42758620977401735, + "step": 149420 + }, + { + "epoch": 0.1505024470182033, + "grad_norm": 13.87025510305605, + "learning_rate": 4.8768740402900534e-05, + "loss": 2.2135, + "mean_token_accuracy": 0.5160314619541169, + "step": 149425 + }, + { + "epoch": 0.15050748307130749, + "grad_norm": 12.424796039630738, + "learning_rate": 4.8768618142920176e-05, + "loss": 2.5655, + "mean_token_accuracy": 0.42068966031074523, + "step": 149430 + }, + { + "epoch": 0.15051251912441166, + "grad_norm": 11.188931679050908, + "learning_rate": 4.876849587704088e-05, + "loss": 2.6041, + "mean_token_accuracy": 0.4103448212146759, + "step": 149435 + }, + { + "epoch": 0.15051755517751583, + "grad_norm": 11.335121738830493, + "learning_rate": 4.876837360526268e-05, + "loss": 2.4056, + "mean_token_accuracy": 0.4724137902259827, + "step": 149440 + }, + { + "epoch": 0.15052259123062, + "grad_norm": 9.028648895221707, + "learning_rate": 4.8768251327585626e-05, + "loss": 2.2789, + "mean_token_accuracy": 0.4344827592372894, + "step": 149445 + }, + { + "epoch": 0.15052762728372418, + "grad_norm": 12.847073322829422, + "learning_rate": 4.8768129044009736e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.47586206197738645, + "step": 149450 + }, + { + "epoch": 0.15053266333682835, + "grad_norm": 9.599561657725832, + "learning_rate": 4.876800675453505e-05, + "loss": 2.0743, + "mean_token_accuracy": 0.4810042381286621, + "step": 149455 + }, + { + "epoch": 0.15053769938993253, + "grad_norm": 9.955343765867251, + "learning_rate": 4.876788445916162e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.43793103098869324, + "step": 149460 + }, + { + "epoch": 0.1505427354430367, + "grad_norm": 9.338307927577622, + "learning_rate": 4.876776215788945e-05, + "loss": 2.3352, + "mean_token_accuracy": 0.46551724672317507, + "step": 149465 + }, + { + "epoch": 0.15054777149614088, + "grad_norm": 11.521789616475244, + "learning_rate": 4.876763985071858e-05, + "loss": 2.5044, + "mean_token_accuracy": 0.4379310369491577, + "step": 149470 + }, + { + "epoch": 0.15055280754924505, + "grad_norm": 9.05530161827276, + "learning_rate": 4.8767517537649064e-05, + "loss": 2.419, + "mean_token_accuracy": 0.42758620381355283, + "step": 149475 + }, + { + "epoch": 0.15055784360234922, + "grad_norm": 8.388553656012549, + "learning_rate": 4.876739521868092e-05, + "loss": 2.4173, + "mean_token_accuracy": 0.41494253277778625, + "step": 149480 + }, + { + "epoch": 0.1505628796554534, + "grad_norm": 10.079270759903098, + "learning_rate": 4.876727289381418e-05, + "loss": 2.3905, + "mean_token_accuracy": 0.42758620381355283, + "step": 149485 + }, + { + "epoch": 0.15056791570855757, + "grad_norm": 10.73941934643547, + "learning_rate": 4.876715056304888e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.3862069010734558, + "step": 149490 + }, + { + "epoch": 0.15057295176166174, + "grad_norm": 10.55671533343616, + "learning_rate": 4.8767028226385065e-05, + "loss": 2.4625, + "mean_token_accuracy": 0.42758620381355283, + "step": 149495 + }, + { + "epoch": 0.15057798781476592, + "grad_norm": 10.201375510332802, + "learning_rate": 4.876690588382275e-05, + "loss": 2.3459, + "mean_token_accuracy": 0.47241378426551817, + "step": 149500 + }, + { + "epoch": 0.1505830238678701, + "grad_norm": 10.905500776732705, + "learning_rate": 4.8766783535361986e-05, + "loss": 2.8893, + "mean_token_accuracy": 0.41379310488700866, + "step": 149505 + }, + { + "epoch": 0.15058805992097427, + "grad_norm": 10.895544190398885, + "learning_rate": 4.87666611810028e-05, + "loss": 2.1634, + "mean_token_accuracy": 0.4896551787853241, + "step": 149510 + }, + { + "epoch": 0.15059309597407844, + "grad_norm": 23.294735431575923, + "learning_rate": 4.876653882074523e-05, + "loss": 2.5955, + "mean_token_accuracy": 0.4413793087005615, + "step": 149515 + }, + { + "epoch": 0.1505981320271826, + "grad_norm": 8.261015692328558, + "learning_rate": 4.87664164545893e-05, + "loss": 2.3241, + "mean_token_accuracy": 0.4862068951129913, + "step": 149520 + }, + { + "epoch": 0.1506031680802868, + "grad_norm": 13.469207705946964, + "learning_rate": 4.876629408253505e-05, + "loss": 2.6341, + "mean_token_accuracy": 0.408771938085556, + "step": 149525 + }, + { + "epoch": 0.15060820413339093, + "grad_norm": 10.451827611209271, + "learning_rate": 4.876617170458252e-05, + "loss": 2.6457, + "mean_token_accuracy": 0.3517241358757019, + "step": 149530 + }, + { + "epoch": 0.1506132401864951, + "grad_norm": 11.676499626880178, + "learning_rate": 4.876604932073173e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.3862069010734558, + "step": 149535 + }, + { + "epoch": 0.15061827623959928, + "grad_norm": 15.888734576950432, + "learning_rate": 4.8765926930982726e-05, + "loss": 2.2606, + "mean_token_accuracy": 0.48275862336158754, + "step": 149540 + }, + { + "epoch": 0.15062331229270345, + "grad_norm": 10.140698797615885, + "learning_rate": 4.876580453533553e-05, + "loss": 2.3532, + "mean_token_accuracy": 0.3965517163276672, + "step": 149545 + }, + { + "epoch": 0.15062834834580763, + "grad_norm": 10.246921303055457, + "learning_rate": 4.87656821337902e-05, + "loss": 2.2139, + "mean_token_accuracy": 0.46406533718109133, + "step": 149550 + }, + { + "epoch": 0.1506333843989118, + "grad_norm": 13.345602807923415, + "learning_rate": 4.876555972634675e-05, + "loss": 2.5852, + "mean_token_accuracy": 0.3999999940395355, + "step": 149555 + }, + { + "epoch": 0.15063842045201598, + "grad_norm": 11.97479059377392, + "learning_rate": 4.876543731300521e-05, + "loss": 2.371, + "mean_token_accuracy": 0.44482758045196535, + "step": 149560 + }, + { + "epoch": 0.15064345650512015, + "grad_norm": 9.914794722641435, + "learning_rate": 4.876531489376562e-05, + "loss": 2.2839, + "mean_token_accuracy": 0.43793103098869324, + "step": 149565 + }, + { + "epoch": 0.15064849255822432, + "grad_norm": 10.36707524348504, + "learning_rate": 4.8765192468628015e-05, + "loss": 2.4503, + "mean_token_accuracy": 0.4172413796186447, + "step": 149570 + }, + { + "epoch": 0.1506535286113285, + "grad_norm": 7.952800023192369, + "learning_rate": 4.876507003759244e-05, + "loss": 2.3974, + "mean_token_accuracy": 0.45517241954803467, + "step": 149575 + }, + { + "epoch": 0.15065856466443267, + "grad_norm": 13.575020087122866, + "learning_rate": 4.876494760065892e-05, + "loss": 2.737, + "mean_token_accuracy": 0.36896551847457887, + "step": 149580 + }, + { + "epoch": 0.15066360071753684, + "grad_norm": 11.941980383149899, + "learning_rate": 4.876482515782748e-05, + "loss": 2.7966, + "mean_token_accuracy": 0.35862069129943847, + "step": 149585 + }, + { + "epoch": 0.15066863677064102, + "grad_norm": 9.96491244454929, + "learning_rate": 4.876470270909816e-05, + "loss": 2.2438, + "mean_token_accuracy": 0.3517241388559341, + "step": 149590 + }, + { + "epoch": 0.1506736728237452, + "grad_norm": 10.509370502643945, + "learning_rate": 4.8764580254470994e-05, + "loss": 2.3021, + "mean_token_accuracy": 0.4172413766384125, + "step": 149595 + }, + { + "epoch": 0.15067870887684937, + "grad_norm": 9.433669914356763, + "learning_rate": 4.8764457793946024e-05, + "loss": 2.5857, + "mean_token_accuracy": 0.3862069010734558, + "step": 149600 + }, + { + "epoch": 0.15068374492995354, + "grad_norm": 9.068087399692464, + "learning_rate": 4.876433532752327e-05, + "loss": 2.3172, + "mean_token_accuracy": 0.43793103098869324, + "step": 149605 + }, + { + "epoch": 0.1506887809830577, + "grad_norm": 9.853031068514168, + "learning_rate": 4.8764212855202786e-05, + "loss": 2.2964, + "mean_token_accuracy": 0.44827585816383364, + "step": 149610 + }, + { + "epoch": 0.1506938170361619, + "grad_norm": 8.26916054596924, + "learning_rate": 4.876409037698458e-05, + "loss": 2.5484, + "mean_token_accuracy": 0.43781003952026365, + "step": 149615 + }, + { + "epoch": 0.15069885308926606, + "grad_norm": 13.388403559134366, + "learning_rate": 4.8763967892868704e-05, + "loss": 2.6676, + "mean_token_accuracy": 0.37241379022598264, + "step": 149620 + }, + { + "epoch": 0.15070388914237023, + "grad_norm": 8.793423965927898, + "learning_rate": 4.8763845402855184e-05, + "loss": 2.7327, + "mean_token_accuracy": 0.38965518176555636, + "step": 149625 + }, + { + "epoch": 0.1507089251954744, + "grad_norm": 11.60805081152104, + "learning_rate": 4.8763722906944064e-05, + "loss": 2.5524, + "mean_token_accuracy": 0.42413793206214906, + "step": 149630 + }, + { + "epoch": 0.15071396124857858, + "grad_norm": 8.912183412341369, + "learning_rate": 4.8763600405135366e-05, + "loss": 2.2035, + "mean_token_accuracy": 0.43559112548828127, + "step": 149635 + }, + { + "epoch": 0.15071899730168276, + "grad_norm": 9.571437507568355, + "learning_rate": 4.8763477897429136e-05, + "loss": 2.6076, + "mean_token_accuracy": 0.39655173420906065, + "step": 149640 + }, + { + "epoch": 0.15072403335478693, + "grad_norm": 10.763953085530634, + "learning_rate": 4.876335538382539e-05, + "loss": 2.5354, + "mean_token_accuracy": 0.39655172228813174, + "step": 149645 + }, + { + "epoch": 0.1507290694078911, + "grad_norm": 10.4601301709877, + "learning_rate": 4.876323286432418e-05, + "loss": 2.179, + "mean_token_accuracy": 0.43793103098869324, + "step": 149650 + }, + { + "epoch": 0.15073410546099528, + "grad_norm": 19.890295078568048, + "learning_rate": 4.8763110338925524e-05, + "loss": 2.7891, + "mean_token_accuracy": 0.4068965494632721, + "step": 149655 + }, + { + "epoch": 0.15073914151409945, + "grad_norm": 12.478393594624785, + "learning_rate": 4.876298780762948e-05, + "loss": 2.7203, + "mean_token_accuracy": 0.37241379022598264, + "step": 149660 + }, + { + "epoch": 0.15074417756720362, + "grad_norm": 11.159453248831037, + "learning_rate": 4.8762865270436056e-05, + "loss": 2.8838, + "mean_token_accuracy": 0.358620685338974, + "step": 149665 + }, + { + "epoch": 0.15074921362030777, + "grad_norm": 8.064070902507428, + "learning_rate": 4.87627427273453e-05, + "loss": 2.2971, + "mean_token_accuracy": 0.40689656138420105, + "step": 149670 + }, + { + "epoch": 0.15075424967341194, + "grad_norm": 11.76727276919041, + "learning_rate": 4.876262017835724e-05, + "loss": 2.209, + "mean_token_accuracy": 0.45862067937850953, + "step": 149675 + }, + { + "epoch": 0.15075928572651612, + "grad_norm": 13.916195826948814, + "learning_rate": 4.876249762347193e-05, + "loss": 2.6081, + "mean_token_accuracy": 0.441379314661026, + "step": 149680 + }, + { + "epoch": 0.1507643217796203, + "grad_norm": 10.686591713634101, + "learning_rate": 4.876237506268937e-05, + "loss": 2.264, + "mean_token_accuracy": 0.4379310369491577, + "step": 149685 + }, + { + "epoch": 0.15076935783272447, + "grad_norm": 9.979529297950961, + "learning_rate": 4.8762252496009607e-05, + "loss": 2.2003, + "mean_token_accuracy": 0.4862069010734558, + "step": 149690 + }, + { + "epoch": 0.15077439388582864, + "grad_norm": 9.614156118623567, + "learning_rate": 4.876212992343269e-05, + "loss": 2.0433, + "mean_token_accuracy": 0.4603448212146759, + "step": 149695 + }, + { + "epoch": 0.1507794299389328, + "grad_norm": 11.738415442388206, + "learning_rate": 4.876200734495864e-05, + "loss": 2.2681, + "mean_token_accuracy": 0.4503932178020477, + "step": 149700 + }, + { + "epoch": 0.150784465992037, + "grad_norm": 11.02224262017473, + "learning_rate": 4.876188476058749e-05, + "loss": 2.2197, + "mean_token_accuracy": 0.4655172348022461, + "step": 149705 + }, + { + "epoch": 0.15078950204514116, + "grad_norm": 9.407998096189218, + "learning_rate": 4.876176217031928e-05, + "loss": 2.4105, + "mean_token_accuracy": 0.41379310488700866, + "step": 149710 + }, + { + "epoch": 0.15079453809824533, + "grad_norm": 13.314234277823637, + "learning_rate": 4.876163957415404e-05, + "loss": 2.6917, + "mean_token_accuracy": 0.3896551728248596, + "step": 149715 + }, + { + "epoch": 0.1507995741513495, + "grad_norm": 10.472740319930455, + "learning_rate": 4.87615169720918e-05, + "loss": 3.0116, + "mean_token_accuracy": 0.4, + "step": 149720 + }, + { + "epoch": 0.15080461020445368, + "grad_norm": 9.369210092147018, + "learning_rate": 4.87613943641326e-05, + "loss": 2.4317, + "mean_token_accuracy": 0.4862068951129913, + "step": 149725 + }, + { + "epoch": 0.15080964625755786, + "grad_norm": 10.097844247755216, + "learning_rate": 4.876127175027648e-05, + "loss": 2.7347, + "mean_token_accuracy": 0.41724138259887694, + "step": 149730 + }, + { + "epoch": 0.15081468231066203, + "grad_norm": 11.913214865902967, + "learning_rate": 4.876114913052346e-05, + "loss": 2.3606, + "mean_token_accuracy": 0.4068965554237366, + "step": 149735 + }, + { + "epoch": 0.1508197183637662, + "grad_norm": 10.96368015813979, + "learning_rate": 4.876102650487359e-05, + "loss": 2.1533, + "mean_token_accuracy": 0.4620689630508423, + "step": 149740 + }, + { + "epoch": 0.15082475441687038, + "grad_norm": 9.163423053906138, + "learning_rate": 4.876090387332689e-05, + "loss": 2.3668, + "mean_token_accuracy": 0.41034482717514037, + "step": 149745 + }, + { + "epoch": 0.15082979046997455, + "grad_norm": 12.09859929102039, + "learning_rate": 4.876078123588339e-05, + "loss": 2.5033, + "mean_token_accuracy": 0.3793103456497192, + "step": 149750 + }, + { + "epoch": 0.15083482652307872, + "grad_norm": 11.01466926116127, + "learning_rate": 4.8760658592543146e-05, + "loss": 2.0059, + "mean_token_accuracy": 0.517241370677948, + "step": 149755 + }, + { + "epoch": 0.1508398625761829, + "grad_norm": 9.827046095619751, + "learning_rate": 4.8760535943306174e-05, + "loss": 2.208, + "mean_token_accuracy": 0.4310344815254211, + "step": 149760 + }, + { + "epoch": 0.15084489862928707, + "grad_norm": 11.32186809045248, + "learning_rate": 4.8760413288172505e-05, + "loss": 2.0528, + "mean_token_accuracy": 0.5, + "step": 149765 + }, + { + "epoch": 0.15084993468239125, + "grad_norm": 11.874814258453007, + "learning_rate": 4.87602906271422e-05, + "loss": 2.3954, + "mean_token_accuracy": 0.45359951853752134, + "step": 149770 + }, + { + "epoch": 0.15085497073549542, + "grad_norm": 10.613733684072402, + "learning_rate": 4.8760167960215256e-05, + "loss": 2.5054, + "mean_token_accuracy": 0.42758620977401735, + "step": 149775 + }, + { + "epoch": 0.1508600067885996, + "grad_norm": 11.072250712893451, + "learning_rate": 4.876004528739173e-05, + "loss": 2.351, + "mean_token_accuracy": 0.44827587008476255, + "step": 149780 + }, + { + "epoch": 0.15086504284170377, + "grad_norm": 12.871378237029303, + "learning_rate": 4.8759922608671656e-05, + "loss": 2.4521, + "mean_token_accuracy": 0.4379310369491577, + "step": 149785 + }, + { + "epoch": 0.15087007889480794, + "grad_norm": 11.573208833020342, + "learning_rate": 4.875979992405506e-05, + "loss": 2.4368, + "mean_token_accuracy": 0.4103448212146759, + "step": 149790 + }, + { + "epoch": 0.15087511494791211, + "grad_norm": 10.250447725908355, + "learning_rate": 4.875967723354198e-05, + "loss": 2.3031, + "mean_token_accuracy": 0.42413793206214906, + "step": 149795 + }, + { + "epoch": 0.1508801510010163, + "grad_norm": 13.781562912727232, + "learning_rate": 4.875955453713245e-05, + "loss": 2.6232, + "mean_token_accuracy": 0.3999999940395355, + "step": 149800 + }, + { + "epoch": 0.15088518705412046, + "grad_norm": 9.86087302515152, + "learning_rate": 4.87594318348265e-05, + "loss": 2.2988, + "mean_token_accuracy": 0.44827587008476255, + "step": 149805 + }, + { + "epoch": 0.1508902231072246, + "grad_norm": 10.43181599833497, + "learning_rate": 4.875930912662417e-05, + "loss": 2.5119, + "mean_token_accuracy": 0.5172413766384125, + "step": 149810 + }, + { + "epoch": 0.15089525916032878, + "grad_norm": 10.752435901438233, + "learning_rate": 4.875918641252549e-05, + "loss": 2.7333, + "mean_token_accuracy": 0.4206896543502808, + "step": 149815 + }, + { + "epoch": 0.15090029521343296, + "grad_norm": 12.751389213004384, + "learning_rate": 4.8759063692530496e-05, + "loss": 2.243, + "mean_token_accuracy": 0.44827587008476255, + "step": 149820 + }, + { + "epoch": 0.15090533126653713, + "grad_norm": 11.992010897769077, + "learning_rate": 4.8758940966639225e-05, + "loss": 2.699, + "mean_token_accuracy": 0.40490018129348754, + "step": 149825 + }, + { + "epoch": 0.1509103673196413, + "grad_norm": 10.151150603179934, + "learning_rate": 4.87588182348517e-05, + "loss": 2.2279, + "mean_token_accuracy": 0.46551724076271056, + "step": 149830 + }, + { + "epoch": 0.15091540337274548, + "grad_norm": 10.177737530621329, + "learning_rate": 4.8758695497167965e-05, + "loss": 2.2841, + "mean_token_accuracy": 0.4241379380226135, + "step": 149835 + }, + { + "epoch": 0.15092043942584965, + "grad_norm": 9.183236179182716, + "learning_rate": 4.875857275358806e-05, + "loss": 2.2407, + "mean_token_accuracy": 0.47586206793785096, + "step": 149840 + }, + { + "epoch": 0.15092547547895382, + "grad_norm": 10.34303708360322, + "learning_rate": 4.875845000411201e-05, + "loss": 2.7097, + "mean_token_accuracy": 0.38620689511299133, + "step": 149845 + }, + { + "epoch": 0.150930511532058, + "grad_norm": 8.966914420759421, + "learning_rate": 4.8758327248739835e-05, + "loss": 2.1153, + "mean_token_accuracy": 0.48275861144065857, + "step": 149850 + }, + { + "epoch": 0.15093554758516217, + "grad_norm": 11.799186822662449, + "learning_rate": 4.875820448747159e-05, + "loss": 2.5937, + "mean_token_accuracy": 0.39655172228813174, + "step": 149855 + }, + { + "epoch": 0.15094058363826635, + "grad_norm": 8.60768745881344, + "learning_rate": 4.87580817203073e-05, + "loss": 2.072, + "mean_token_accuracy": 0.47241379618644713, + "step": 149860 + }, + { + "epoch": 0.15094561969137052, + "grad_norm": 13.147817082903778, + "learning_rate": 4.875795894724701e-05, + "loss": 2.8686, + "mean_token_accuracy": 0.4344827592372894, + "step": 149865 + }, + { + "epoch": 0.1509506557444747, + "grad_norm": 10.146111595814933, + "learning_rate": 4.8757836168290736e-05, + "loss": 2.7214, + "mean_token_accuracy": 0.4172413766384125, + "step": 149870 + }, + { + "epoch": 0.15095569179757887, + "grad_norm": 11.741144842181695, + "learning_rate": 4.875771338343854e-05, + "loss": 2.5815, + "mean_token_accuracy": 0.3655172407627106, + "step": 149875 + }, + { + "epoch": 0.15096072785068304, + "grad_norm": 10.74812188369407, + "learning_rate": 4.875759059269042e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.4379310429096222, + "step": 149880 + }, + { + "epoch": 0.15096576390378721, + "grad_norm": 11.23036703299972, + "learning_rate": 4.875746779604643e-05, + "loss": 2.9112, + "mean_token_accuracy": 0.35862069129943847, + "step": 149885 + }, + { + "epoch": 0.1509707999568914, + "grad_norm": 10.52249335232662, + "learning_rate": 4.875734499350661e-05, + "loss": 2.2026, + "mean_token_accuracy": 0.4551724135875702, + "step": 149890 + }, + { + "epoch": 0.15097583600999556, + "grad_norm": 9.957793294884176, + "learning_rate": 4.875722218507098e-05, + "loss": 2.2617, + "mean_token_accuracy": 0.44827585816383364, + "step": 149895 + }, + { + "epoch": 0.15098087206309974, + "grad_norm": 22.814203705122754, + "learning_rate": 4.875709937073958e-05, + "loss": 3.1332, + "mean_token_accuracy": 0.38620689511299133, + "step": 149900 + }, + { + "epoch": 0.1509859081162039, + "grad_norm": 10.12616944077804, + "learning_rate": 4.875697655051245e-05, + "loss": 2.2424, + "mean_token_accuracy": 0.46037507653236387, + "step": 149905 + }, + { + "epoch": 0.15099094416930808, + "grad_norm": 11.989940470325239, + "learning_rate": 4.875685372438961e-05, + "loss": 2.3053, + "mean_token_accuracy": 0.43678161799907683, + "step": 149910 + }, + { + "epoch": 0.15099598022241226, + "grad_norm": 11.498788668223176, + "learning_rate": 4.875673089237111e-05, + "loss": 2.2171, + "mean_token_accuracy": 0.43103448748588563, + "step": 149915 + }, + { + "epoch": 0.15100101627551643, + "grad_norm": 8.686268021095499, + "learning_rate": 4.875660805445697e-05, + "loss": 2.2469, + "mean_token_accuracy": 0.4413793087005615, + "step": 149920 + }, + { + "epoch": 0.1510060523286206, + "grad_norm": 18.64208592614061, + "learning_rate": 4.875648521064723e-05, + "loss": 2.5046, + "mean_token_accuracy": 0.39310344457626345, + "step": 149925 + }, + { + "epoch": 0.15101108838172478, + "grad_norm": 10.94239788937068, + "learning_rate": 4.875636236094193e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.4310344815254211, + "step": 149930 + }, + { + "epoch": 0.15101612443482895, + "grad_norm": 11.725273404964776, + "learning_rate": 4.8756239505341096e-05, + "loss": 2.2754, + "mean_token_accuracy": 0.4448275864124298, + "step": 149935 + }, + { + "epoch": 0.15102116048793313, + "grad_norm": 10.708183403675019, + "learning_rate": 4.875611664384477e-05, + "loss": 2.5445, + "mean_token_accuracy": 0.4517241358757019, + "step": 149940 + }, + { + "epoch": 0.1510261965410373, + "grad_norm": 11.75507898168539, + "learning_rate": 4.875599377645297e-05, + "loss": 2.2224, + "mean_token_accuracy": 0.42413793206214906, + "step": 149945 + }, + { + "epoch": 0.15103123259414145, + "grad_norm": 9.294074764167819, + "learning_rate": 4.875587090316575e-05, + "loss": 2.4482, + "mean_token_accuracy": 0.4310344815254211, + "step": 149950 + }, + { + "epoch": 0.15103626864724562, + "grad_norm": 10.191946578781396, + "learning_rate": 4.875574802398313e-05, + "loss": 2.5094, + "mean_token_accuracy": 0.3981851160526276, + "step": 149955 + }, + { + "epoch": 0.1510413047003498, + "grad_norm": 9.686523551946502, + "learning_rate": 4.8755625138905156e-05, + "loss": 2.3312, + "mean_token_accuracy": 0.4896551787853241, + "step": 149960 + }, + { + "epoch": 0.15104634075345397, + "grad_norm": 11.592236597962085, + "learning_rate": 4.8755502247931846e-05, + "loss": 2.5748, + "mean_token_accuracy": 0.4503327250480652, + "step": 149965 + }, + { + "epoch": 0.15105137680655814, + "grad_norm": 10.545175741716223, + "learning_rate": 4.875537935106325e-05, + "loss": 2.3918, + "mean_token_accuracy": 0.4068965494632721, + "step": 149970 + }, + { + "epoch": 0.15105641285966231, + "grad_norm": 9.994698799653452, + "learning_rate": 4.8755256448299394e-05, + "loss": 2.1932, + "mean_token_accuracy": 0.46551724076271056, + "step": 149975 + }, + { + "epoch": 0.1510614489127665, + "grad_norm": 9.878210308961304, + "learning_rate": 4.875513353964031e-05, + "loss": 2.4291, + "mean_token_accuracy": 0.42413792610168455, + "step": 149980 + }, + { + "epoch": 0.15106648496587066, + "grad_norm": 9.891552626954644, + "learning_rate": 4.8755010625086026e-05, + "loss": 1.9938, + "mean_token_accuracy": 0.5137931048870087, + "step": 149985 + }, + { + "epoch": 0.15107152101897484, + "grad_norm": 9.439133080119229, + "learning_rate": 4.8754887704636606e-05, + "loss": 2.626, + "mean_token_accuracy": 0.42226255536079405, + "step": 149990 + }, + { + "epoch": 0.151076557072079, + "grad_norm": 12.202766034641444, + "learning_rate": 4.875476477829205e-05, + "loss": 2.2253, + "mean_token_accuracy": 0.44482757449150084, + "step": 149995 + }, + { + "epoch": 0.15108159312518318, + "grad_norm": 11.572475729870138, + "learning_rate": 4.875464184605241e-05, + "loss": 2.3586, + "mean_token_accuracy": 0.43793103098869324, + "step": 150000 + }, + { + "epoch": 0.15108662917828736, + "grad_norm": 11.804893872908302, + "learning_rate": 4.875451890791772e-05, + "loss": 1.9869, + "mean_token_accuracy": 0.5000000059604645, + "step": 150005 + }, + { + "epoch": 0.15109166523139153, + "grad_norm": 8.704480657737932, + "learning_rate": 4.8754395963888e-05, + "loss": 2.4238, + "mean_token_accuracy": 0.517241382598877, + "step": 150010 + }, + { + "epoch": 0.1510967012844957, + "grad_norm": 10.845416016884803, + "learning_rate": 4.87542730139633e-05, + "loss": 2.5662, + "mean_token_accuracy": 0.3999999940395355, + "step": 150015 + }, + { + "epoch": 0.15110173733759988, + "grad_norm": 10.044540464393616, + "learning_rate": 4.8754150058143646e-05, + "loss": 2.6184, + "mean_token_accuracy": 0.4344827592372894, + "step": 150020 + }, + { + "epoch": 0.15110677339070405, + "grad_norm": 9.008269068807397, + "learning_rate": 4.875402709642908e-05, + "loss": 2.3114, + "mean_token_accuracy": 0.4540229856967926, + "step": 150025 + }, + { + "epoch": 0.15111180944380823, + "grad_norm": 9.409736321952405, + "learning_rate": 4.875390412881962e-05, + "loss": 2.3293, + "mean_token_accuracy": 0.4275862157344818, + "step": 150030 + }, + { + "epoch": 0.1511168454969124, + "grad_norm": 10.192921760209092, + "learning_rate": 4.875378115531532e-05, + "loss": 2.468, + "mean_token_accuracy": 0.4000000059604645, + "step": 150035 + }, + { + "epoch": 0.15112188155001657, + "grad_norm": 9.628749607775681, + "learning_rate": 4.8753658175916206e-05, + "loss": 2.3601, + "mean_token_accuracy": 0.4172413766384125, + "step": 150040 + }, + { + "epoch": 0.15112691760312075, + "grad_norm": 10.18243906386138, + "learning_rate": 4.87535351906223e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.4689655125141144, + "step": 150045 + }, + { + "epoch": 0.15113195365622492, + "grad_norm": 10.183179336879729, + "learning_rate": 4.875341219943365e-05, + "loss": 2.2614, + "mean_token_accuracy": 0.4482758641242981, + "step": 150050 + }, + { + "epoch": 0.1511369897093291, + "grad_norm": 12.149520292714604, + "learning_rate": 4.875328920235029e-05, + "loss": 2.4328, + "mean_token_accuracy": 0.44827585816383364, + "step": 150055 + }, + { + "epoch": 0.15114202576243327, + "grad_norm": 9.441114482622565, + "learning_rate": 4.875316619937225e-05, + "loss": 2.0619, + "mean_token_accuracy": 0.4620689630508423, + "step": 150060 + }, + { + "epoch": 0.15114706181553744, + "grad_norm": 11.023222362568502, + "learning_rate": 4.875304319049956e-05, + "loss": 2.392, + "mean_token_accuracy": 0.42413793206214906, + "step": 150065 + }, + { + "epoch": 0.15115209786864162, + "grad_norm": 8.160654604508869, + "learning_rate": 4.875292017573227e-05, + "loss": 2.1346, + "mean_token_accuracy": 0.4848154842853546, + "step": 150070 + }, + { + "epoch": 0.1511571339217458, + "grad_norm": 10.713756748241426, + "learning_rate": 4.87527971550704e-05, + "loss": 2.2277, + "mean_token_accuracy": 0.4275861978530884, + "step": 150075 + }, + { + "epoch": 0.15116216997484996, + "grad_norm": 12.92849714902062, + "learning_rate": 4.875267412851398e-05, + "loss": 2.8905, + "mean_token_accuracy": 0.3793103456497192, + "step": 150080 + }, + { + "epoch": 0.15116720602795414, + "grad_norm": 10.12466140362177, + "learning_rate": 4.875255109606306e-05, + "loss": 2.3144, + "mean_token_accuracy": 0.4379310369491577, + "step": 150085 + }, + { + "epoch": 0.15117224208105828, + "grad_norm": 12.040803136784405, + "learning_rate": 4.8752428057717666e-05, + "loss": 2.5957, + "mean_token_accuracy": 0.417241370677948, + "step": 150090 + }, + { + "epoch": 0.15117727813416246, + "grad_norm": 9.918949842160067, + "learning_rate": 4.875230501347782e-05, + "loss": 2.3446, + "mean_token_accuracy": 0.41034482717514037, + "step": 150095 + }, + { + "epoch": 0.15118231418726663, + "grad_norm": 8.825961285039142, + "learning_rate": 4.875218196334359e-05, + "loss": 2.5387, + "mean_token_accuracy": 0.4379310369491577, + "step": 150100 + }, + { + "epoch": 0.1511873502403708, + "grad_norm": 10.521101926381077, + "learning_rate": 4.875205890731497e-05, + "loss": 2.8611, + "mean_token_accuracy": 0.37241379618644715, + "step": 150105 + }, + { + "epoch": 0.15119238629347498, + "grad_norm": 11.364583726901614, + "learning_rate": 4.875193584539201e-05, + "loss": 2.3789, + "mean_token_accuracy": 0.4137930989265442, + "step": 150110 + }, + { + "epoch": 0.15119742234657915, + "grad_norm": 12.29302657449953, + "learning_rate": 4.875181277757475e-05, + "loss": 2.9767, + "mean_token_accuracy": 0.3931034505367279, + "step": 150115 + }, + { + "epoch": 0.15120245839968333, + "grad_norm": 11.157772317481305, + "learning_rate": 4.875168970386322e-05, + "loss": 2.6389, + "mean_token_accuracy": 0.3931034505367279, + "step": 150120 + }, + { + "epoch": 0.1512074944527875, + "grad_norm": 9.61915305788299, + "learning_rate": 4.8751566624257464e-05, + "loss": 2.451, + "mean_token_accuracy": 0.4206896543502808, + "step": 150125 + }, + { + "epoch": 0.15121253050589167, + "grad_norm": 8.398807625686924, + "learning_rate": 4.8751443538757495e-05, + "loss": 2.2748, + "mean_token_accuracy": 0.45862069725990295, + "step": 150130 + }, + { + "epoch": 0.15121756655899585, + "grad_norm": 12.167023287774294, + "learning_rate": 4.875132044736337e-05, + "loss": 2.9361, + "mean_token_accuracy": 0.3879612863063812, + "step": 150135 + }, + { + "epoch": 0.15122260261210002, + "grad_norm": 11.881947019372472, + "learning_rate": 4.8751197350075106e-05, + "loss": 2.2916, + "mean_token_accuracy": 0.4862068951129913, + "step": 150140 + }, + { + "epoch": 0.1512276386652042, + "grad_norm": 10.945645136623938, + "learning_rate": 4.8751074246892734e-05, + "loss": 2.2487, + "mean_token_accuracy": 0.4172413766384125, + "step": 150145 + }, + { + "epoch": 0.15123267471830837, + "grad_norm": 12.90750095286037, + "learning_rate": 4.875095113781631e-05, + "loss": 2.3537, + "mean_token_accuracy": 0.42758620977401735, + "step": 150150 + }, + { + "epoch": 0.15123771077141254, + "grad_norm": 10.971493191383479, + "learning_rate": 4.875082802284586e-05, + "loss": 2.5497, + "mean_token_accuracy": 0.4088324248790741, + "step": 150155 + }, + { + "epoch": 0.15124274682451672, + "grad_norm": 9.815691802977403, + "learning_rate": 4.87507049019814e-05, + "loss": 1.9228, + "mean_token_accuracy": 0.5241379320621491, + "step": 150160 + }, + { + "epoch": 0.1512477828776209, + "grad_norm": 11.939367120445292, + "learning_rate": 4.875058177522298e-05, + "loss": 2.6031, + "mean_token_accuracy": 0.43448275327682495, + "step": 150165 + }, + { + "epoch": 0.15125281893072506, + "grad_norm": 10.768429806105537, + "learning_rate": 4.875045864257064e-05, + "loss": 2.2736, + "mean_token_accuracy": 0.48275862336158754, + "step": 150170 + }, + { + "epoch": 0.15125785498382924, + "grad_norm": 9.313215074885422, + "learning_rate": 4.8750335504024396e-05, + "loss": 2.2369, + "mean_token_accuracy": 0.4620689630508423, + "step": 150175 + }, + { + "epoch": 0.1512628910369334, + "grad_norm": 10.156919514341073, + "learning_rate": 4.87502123595843e-05, + "loss": 2.3537, + "mean_token_accuracy": 0.44482758045196535, + "step": 150180 + }, + { + "epoch": 0.15126792709003758, + "grad_norm": 11.079845153322289, + "learning_rate": 4.875008920925037e-05, + "loss": 2.5276, + "mean_token_accuracy": 0.4068965494632721, + "step": 150185 + }, + { + "epoch": 0.15127296314314176, + "grad_norm": 10.225464559437409, + "learning_rate": 4.8749966053022656e-05, + "loss": 2.7613, + "mean_token_accuracy": 0.3793103456497192, + "step": 150190 + }, + { + "epoch": 0.15127799919624593, + "grad_norm": 11.020912950327778, + "learning_rate": 4.874984289090119e-05, + "loss": 2.0749, + "mean_token_accuracy": 0.5241379380226135, + "step": 150195 + }, + { + "epoch": 0.1512830352493501, + "grad_norm": 10.386691523749422, + "learning_rate": 4.874971972288599e-05, + "loss": 2.515, + "mean_token_accuracy": 0.4517241418361664, + "step": 150200 + }, + { + "epoch": 0.15128807130245428, + "grad_norm": 8.70204916094324, + "learning_rate": 4.8749596548977104e-05, + "loss": 2.252, + "mean_token_accuracy": 0.47931033968925474, + "step": 150205 + }, + { + "epoch": 0.15129310735555845, + "grad_norm": 13.673377437268389, + "learning_rate": 4.8749473369174564e-05, + "loss": 2.0869, + "mean_token_accuracy": 0.4735221564769745, + "step": 150210 + }, + { + "epoch": 0.15129814340866263, + "grad_norm": 10.088551272021586, + "learning_rate": 4.87493501834784e-05, + "loss": 2.2799, + "mean_token_accuracy": 0.46896551847457885, + "step": 150215 + }, + { + "epoch": 0.1513031794617668, + "grad_norm": 9.158333291736222, + "learning_rate": 4.874922699188866e-05, + "loss": 2.0271, + "mean_token_accuracy": 0.45862067937850953, + "step": 150220 + }, + { + "epoch": 0.15130821551487095, + "grad_norm": 12.101635829202934, + "learning_rate": 4.874910379440535e-05, + "loss": 2.3815, + "mean_token_accuracy": 0.42413793206214906, + "step": 150225 + }, + { + "epoch": 0.15131325156797512, + "grad_norm": 12.373608911713731, + "learning_rate": 4.8748980591028544e-05, + "loss": 2.8683, + "mean_token_accuracy": 0.420689657330513, + "step": 150230 + }, + { + "epoch": 0.1513182876210793, + "grad_norm": 7.8917931854661765, + "learning_rate": 4.8748857381758236e-05, + "loss": 2.2322, + "mean_token_accuracy": 0.4689655065536499, + "step": 150235 + }, + { + "epoch": 0.15132332367418347, + "grad_norm": 12.627721669148663, + "learning_rate": 4.8748734166594483e-05, + "loss": 2.5509, + "mean_token_accuracy": 0.42758620977401735, + "step": 150240 + }, + { + "epoch": 0.15132835972728764, + "grad_norm": 10.519164249965797, + "learning_rate": 4.874861094553731e-05, + "loss": 2.3548, + "mean_token_accuracy": 0.3896551728248596, + "step": 150245 + }, + { + "epoch": 0.15133339578039182, + "grad_norm": 11.91811160381917, + "learning_rate": 4.8748487718586764e-05, + "loss": 2.2428, + "mean_token_accuracy": 0.46551724672317507, + "step": 150250 + }, + { + "epoch": 0.151338431833496, + "grad_norm": 10.283878205986278, + "learning_rate": 4.874836448574287e-05, + "loss": 2.2604, + "mean_token_accuracy": 0.42413793206214906, + "step": 150255 + }, + { + "epoch": 0.15134346788660016, + "grad_norm": 11.426007052094397, + "learning_rate": 4.874824124700566e-05, + "loss": 2.4846, + "mean_token_accuracy": 0.42413792610168455, + "step": 150260 + }, + { + "epoch": 0.15134850393970434, + "grad_norm": 9.572970064106837, + "learning_rate": 4.874811800237517e-05, + "loss": 2.2192, + "mean_token_accuracy": 0.4275861978530884, + "step": 150265 + }, + { + "epoch": 0.1513535399928085, + "grad_norm": 9.315929302011174, + "learning_rate": 4.874799475185144e-05, + "loss": 2.3851, + "mean_token_accuracy": 0.4344827592372894, + "step": 150270 + }, + { + "epoch": 0.15135857604591268, + "grad_norm": 10.571683100886027, + "learning_rate": 4.874787149543449e-05, + "loss": 2.2536, + "mean_token_accuracy": 0.4584392011165619, + "step": 150275 + }, + { + "epoch": 0.15136361209901686, + "grad_norm": 8.96391113802866, + "learning_rate": 4.874774823312437e-05, + "loss": 2.564, + "mean_token_accuracy": 0.4344827592372894, + "step": 150280 + }, + { + "epoch": 0.15136864815212103, + "grad_norm": 25.055465272034354, + "learning_rate": 4.874762496492111e-05, + "loss": 2.7606, + "mean_token_accuracy": 0.4275861978530884, + "step": 150285 + }, + { + "epoch": 0.1513736842052252, + "grad_norm": 10.58333950914564, + "learning_rate": 4.874750169082474e-05, + "loss": 2.4669, + "mean_token_accuracy": 0.441379314661026, + "step": 150290 + }, + { + "epoch": 0.15137872025832938, + "grad_norm": 8.555513739456336, + "learning_rate": 4.874737841083529e-05, + "loss": 2.2696, + "mean_token_accuracy": 0.46206897497177124, + "step": 150295 + }, + { + "epoch": 0.15138375631143355, + "grad_norm": 9.887168120500515, + "learning_rate": 4.874725512495281e-05, + "loss": 2.257, + "mean_token_accuracy": 0.4931034564971924, + "step": 150300 + }, + { + "epoch": 0.15138879236453773, + "grad_norm": 8.356905069660419, + "learning_rate": 4.8747131833177315e-05, + "loss": 2.1606, + "mean_token_accuracy": 0.5034482657909394, + "step": 150305 + }, + { + "epoch": 0.1513938284176419, + "grad_norm": 10.579210387552152, + "learning_rate": 4.874700853550886e-05, + "loss": 2.4557, + "mean_token_accuracy": 0.4034482717514038, + "step": 150310 + }, + { + "epoch": 0.15139886447074608, + "grad_norm": 9.465618154448595, + "learning_rate": 4.8746885231947455e-05, + "loss": 2.2165, + "mean_token_accuracy": 0.44482758045196535, + "step": 150315 + }, + { + "epoch": 0.15140390052385025, + "grad_norm": 8.761453061524728, + "learning_rate": 4.874676192249315e-05, + "loss": 2.5343, + "mean_token_accuracy": 0.4000000059604645, + "step": 150320 + }, + { + "epoch": 0.15140893657695442, + "grad_norm": 9.581804774231564, + "learning_rate": 4.874663860714598e-05, + "loss": 2.1224, + "mean_token_accuracy": 0.5068965554237366, + "step": 150325 + }, + { + "epoch": 0.1514139726300586, + "grad_norm": 9.945106232319056, + "learning_rate": 4.874651528590599e-05, + "loss": 2.3541, + "mean_token_accuracy": 0.4448275864124298, + "step": 150330 + }, + { + "epoch": 0.15141900868316277, + "grad_norm": 10.719467867636174, + "learning_rate": 4.874639195877318e-05, + "loss": 2.3878, + "mean_token_accuracy": 0.44482757449150084, + "step": 150335 + }, + { + "epoch": 0.15142404473626694, + "grad_norm": 12.203019194067696, + "learning_rate": 4.8746268625747606e-05, + "loss": 3.027, + "mean_token_accuracy": 0.35317604541778563, + "step": 150340 + }, + { + "epoch": 0.15142908078937112, + "grad_norm": 9.766598134636835, + "learning_rate": 4.874614528682931e-05, + "loss": 2.2568, + "mean_token_accuracy": 0.4310344815254211, + "step": 150345 + }, + { + "epoch": 0.1514341168424753, + "grad_norm": 10.817124358248575, + "learning_rate": 4.87460219420183e-05, + "loss": 2.2315, + "mean_token_accuracy": 0.45710829496383665, + "step": 150350 + }, + { + "epoch": 0.15143915289557947, + "grad_norm": 10.33077888464912, + "learning_rate": 4.874589859131464e-05, + "loss": 2.1726, + "mean_token_accuracy": 0.44482758045196535, + "step": 150355 + }, + { + "epoch": 0.15144418894868364, + "grad_norm": 9.617127992856574, + "learning_rate": 4.874577523471834e-05, + "loss": 2.3214, + "mean_token_accuracy": 0.4379310369491577, + "step": 150360 + }, + { + "epoch": 0.15144922500178779, + "grad_norm": 11.254211020887224, + "learning_rate": 4.8745651872229456e-05, + "loss": 2.4388, + "mean_token_accuracy": 0.441379314661026, + "step": 150365 + }, + { + "epoch": 0.15145426105489196, + "grad_norm": 17.469372191003337, + "learning_rate": 4.874552850384801e-05, + "loss": 2.1248, + "mean_token_accuracy": 0.4896551609039307, + "step": 150370 + }, + { + "epoch": 0.15145929710799613, + "grad_norm": 10.259845077846446, + "learning_rate": 4.8745405129574035e-05, + "loss": 2.1766, + "mean_token_accuracy": 0.43793103098869324, + "step": 150375 + }, + { + "epoch": 0.1514643331611003, + "grad_norm": 13.819621397696476, + "learning_rate": 4.8745281749407566e-05, + "loss": 2.9662, + "mean_token_accuracy": 0.41724137365818026, + "step": 150380 + }, + { + "epoch": 0.15146936921420448, + "grad_norm": 10.156105766218072, + "learning_rate": 4.8745158363348634e-05, + "loss": 2.4862, + "mean_token_accuracy": 0.43448275327682495, + "step": 150385 + }, + { + "epoch": 0.15147440526730865, + "grad_norm": 12.076582965522295, + "learning_rate": 4.8745034971397294e-05, + "loss": 2.3162, + "mean_token_accuracy": 0.4551724135875702, + "step": 150390 + }, + { + "epoch": 0.15147944132041283, + "grad_norm": 10.684685970012838, + "learning_rate": 4.8744911573553546e-05, + "loss": 2.7113, + "mean_token_accuracy": 0.3793103456497192, + "step": 150395 + }, + { + "epoch": 0.151484477373517, + "grad_norm": 12.158531400115896, + "learning_rate": 4.8744788169817456e-05, + "loss": 2.7915, + "mean_token_accuracy": 0.44827585816383364, + "step": 150400 + }, + { + "epoch": 0.15148951342662118, + "grad_norm": 9.360065953940515, + "learning_rate": 4.874466476018904e-05, + "loss": 2.1124, + "mean_token_accuracy": 0.48275861144065857, + "step": 150405 + }, + { + "epoch": 0.15149454947972535, + "grad_norm": 11.331226905079259, + "learning_rate": 4.874454134466834e-05, + "loss": 2.0816, + "mean_token_accuracy": 0.4655172348022461, + "step": 150410 + }, + { + "epoch": 0.15149958553282952, + "grad_norm": 9.679648592540959, + "learning_rate": 4.874441792325538e-05, + "loss": 2.3184, + "mean_token_accuracy": 0.37586206793785093, + "step": 150415 + }, + { + "epoch": 0.1515046215859337, + "grad_norm": 9.308470284668466, + "learning_rate": 4.874429449595021e-05, + "loss": 2.1625, + "mean_token_accuracy": 0.45517241954803467, + "step": 150420 + }, + { + "epoch": 0.15150965763903787, + "grad_norm": 9.659529000544817, + "learning_rate": 4.874417106275285e-05, + "loss": 2.321, + "mean_token_accuracy": 0.4310344815254211, + "step": 150425 + }, + { + "epoch": 0.15151469369214204, + "grad_norm": 17.556991282622622, + "learning_rate": 4.874404762366334e-05, + "loss": 2.564, + "mean_token_accuracy": 0.4344827592372894, + "step": 150430 + }, + { + "epoch": 0.15151972974524622, + "grad_norm": 10.803726976284512, + "learning_rate": 4.8743924178681714e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.4082274615764618, + "step": 150435 + }, + { + "epoch": 0.1515247657983504, + "grad_norm": 9.338568646056933, + "learning_rate": 4.874380072780801e-05, + "loss": 2.6222, + "mean_token_accuracy": 0.3793103516101837, + "step": 150440 + }, + { + "epoch": 0.15152980185145457, + "grad_norm": 8.165962261337153, + "learning_rate": 4.874367727104225e-05, + "loss": 2.2757, + "mean_token_accuracy": 0.44482759237289426, + "step": 150445 + }, + { + "epoch": 0.15153483790455874, + "grad_norm": 9.43841886394729, + "learning_rate": 4.8743553808384484e-05, + "loss": 2.08, + "mean_token_accuracy": 0.510344821214676, + "step": 150450 + }, + { + "epoch": 0.1515398739576629, + "grad_norm": 14.072186571701389, + "learning_rate": 4.874343033983474e-05, + "loss": 2.5124, + "mean_token_accuracy": 0.3896551728248596, + "step": 150455 + }, + { + "epoch": 0.1515449100107671, + "grad_norm": 11.394049111164001, + "learning_rate": 4.874330686539305e-05, + "loss": 2.2264, + "mean_token_accuracy": 0.4068965554237366, + "step": 150460 + }, + { + "epoch": 0.15154994606387126, + "grad_norm": 10.346039073101949, + "learning_rate": 4.874318338505945e-05, + "loss": 2.3938, + "mean_token_accuracy": 0.42758620381355283, + "step": 150465 + }, + { + "epoch": 0.15155498211697543, + "grad_norm": 9.453980304303089, + "learning_rate": 4.874305989883398e-05, + "loss": 2.5905, + "mean_token_accuracy": 0.4344827592372894, + "step": 150470 + }, + { + "epoch": 0.1515600181700796, + "grad_norm": 13.23978943954671, + "learning_rate": 4.874293640671665e-05, + "loss": 2.349, + "mean_token_accuracy": 0.4620689690113068, + "step": 150475 + }, + { + "epoch": 0.15156505422318378, + "grad_norm": 10.258650735743732, + "learning_rate": 4.874281290870753e-05, + "loss": 2.4331, + "mean_token_accuracy": 0.46896551847457885, + "step": 150480 + }, + { + "epoch": 0.15157009027628796, + "grad_norm": 8.817047781918076, + "learning_rate": 4.874268940480663e-05, + "loss": 2.3779, + "mean_token_accuracy": 0.42758620977401735, + "step": 150485 + }, + { + "epoch": 0.15157512632939213, + "grad_norm": 8.672893836478407, + "learning_rate": 4.874256589501398e-05, + "loss": 2.4349, + "mean_token_accuracy": 0.45517241954803467, + "step": 150490 + }, + { + "epoch": 0.1515801623824963, + "grad_norm": 11.903649789741818, + "learning_rate": 4.874244237932964e-05, + "loss": 2.2391, + "mean_token_accuracy": 0.4482758641242981, + "step": 150495 + }, + { + "epoch": 0.15158519843560048, + "grad_norm": 10.746453281167769, + "learning_rate": 4.8742318857753625e-05, + "loss": 2.0424, + "mean_token_accuracy": 0.47803992629051206, + "step": 150500 + }, + { + "epoch": 0.15159023448870462, + "grad_norm": 13.562185438651952, + "learning_rate": 4.874219533028598e-05, + "loss": 2.3888, + "mean_token_accuracy": 0.458620685338974, + "step": 150505 + }, + { + "epoch": 0.1515952705418088, + "grad_norm": 9.790192076271353, + "learning_rate": 4.874207179692672e-05, + "loss": 2.5233, + "mean_token_accuracy": 0.39655172228813174, + "step": 150510 + }, + { + "epoch": 0.15160030659491297, + "grad_norm": 8.458841449175747, + "learning_rate": 4.87419482576759e-05, + "loss": 2.6556, + "mean_token_accuracy": 0.40344828367233276, + "step": 150515 + }, + { + "epoch": 0.15160534264801714, + "grad_norm": 10.722364764240057, + "learning_rate": 4.874182471253355e-05, + "loss": 2.4542, + "mean_token_accuracy": 0.42068966031074523, + "step": 150520 + }, + { + "epoch": 0.15161037870112132, + "grad_norm": 8.994659434697269, + "learning_rate": 4.874170116149969e-05, + "loss": 2.4861, + "mean_token_accuracy": 0.4, + "step": 150525 + }, + { + "epoch": 0.1516154147542255, + "grad_norm": 10.693536127125087, + "learning_rate": 4.874157760457438e-05, + "loss": 2.3542, + "mean_token_accuracy": 0.4344827592372894, + "step": 150530 + }, + { + "epoch": 0.15162045080732967, + "grad_norm": 9.007398111677427, + "learning_rate": 4.874145404175762e-05, + "loss": 2.2279, + "mean_token_accuracy": 0.458620685338974, + "step": 150535 + }, + { + "epoch": 0.15162548686043384, + "grad_norm": 9.22207650987322, + "learning_rate": 4.874133047304948e-05, + "loss": 2.3779, + "mean_token_accuracy": 0.41379310488700866, + "step": 150540 + }, + { + "epoch": 0.151630522913538, + "grad_norm": 8.807127213952073, + "learning_rate": 4.874120689844997e-05, + "loss": 2.2532, + "mean_token_accuracy": 0.4310344815254211, + "step": 150545 + }, + { + "epoch": 0.1516355589666422, + "grad_norm": 11.007965670423662, + "learning_rate": 4.874108331795913e-05, + "loss": 2.2826, + "mean_token_accuracy": 0.4551724135875702, + "step": 150550 + }, + { + "epoch": 0.15164059501974636, + "grad_norm": 10.27759008419283, + "learning_rate": 4.8740959731577004e-05, + "loss": 2.5944, + "mean_token_accuracy": 0.4275861978530884, + "step": 150555 + }, + { + "epoch": 0.15164563107285053, + "grad_norm": 8.178922110203978, + "learning_rate": 4.874083613930361e-05, + "loss": 2.343, + "mean_token_accuracy": 0.4747731387615204, + "step": 150560 + }, + { + "epoch": 0.1516506671259547, + "grad_norm": 11.133960785633366, + "learning_rate": 4.8740712541139e-05, + "loss": 2.4533, + "mean_token_accuracy": 0.42413792610168455, + "step": 150565 + }, + { + "epoch": 0.15165570317905888, + "grad_norm": 9.986264461845304, + "learning_rate": 4.8740588937083194e-05, + "loss": 2.6498, + "mean_token_accuracy": 0.42068966031074523, + "step": 150570 + }, + { + "epoch": 0.15166073923216306, + "grad_norm": 10.280098156367536, + "learning_rate": 4.874046532713623e-05, + "loss": 2.1286, + "mean_token_accuracy": 0.475862056016922, + "step": 150575 + }, + { + "epoch": 0.15166577528526723, + "grad_norm": 13.494117498302932, + "learning_rate": 4.8740341711298144e-05, + "loss": 2.8961, + "mean_token_accuracy": 0.39310344457626345, + "step": 150580 + }, + { + "epoch": 0.1516708113383714, + "grad_norm": 11.958428598222532, + "learning_rate": 4.874021808956897e-05, + "loss": 2.1979, + "mean_token_accuracy": 0.4344827592372894, + "step": 150585 + }, + { + "epoch": 0.15167584739147558, + "grad_norm": 12.82460550507251, + "learning_rate": 4.8740094461948745e-05, + "loss": 2.9441, + "mean_token_accuracy": 0.35517241060733795, + "step": 150590 + }, + { + "epoch": 0.15168088344457975, + "grad_norm": 10.587126171189466, + "learning_rate": 4.87399708284375e-05, + "loss": 2.0716, + "mean_token_accuracy": 0.4655172348022461, + "step": 150595 + }, + { + "epoch": 0.15168591949768392, + "grad_norm": 9.923382689729799, + "learning_rate": 4.873984718903527e-05, + "loss": 2.3636, + "mean_token_accuracy": 0.4068965494632721, + "step": 150600 + }, + { + "epoch": 0.1516909555507881, + "grad_norm": 13.47038608821393, + "learning_rate": 4.873972354374208e-05, + "loss": 2.5455, + "mean_token_accuracy": 0.4344827592372894, + "step": 150605 + }, + { + "epoch": 0.15169599160389227, + "grad_norm": 12.812706379878655, + "learning_rate": 4.873959989255799e-05, + "loss": 2.37, + "mean_token_accuracy": 0.44694494009017943, + "step": 150610 + }, + { + "epoch": 0.15170102765699645, + "grad_norm": 8.120043157794253, + "learning_rate": 4.8739476235483004e-05, + "loss": 2.5319, + "mean_token_accuracy": 0.42413792908191683, + "step": 150615 + }, + { + "epoch": 0.15170606371010062, + "grad_norm": 11.281864042182796, + "learning_rate": 4.873935257251718e-05, + "loss": 2.3366, + "mean_token_accuracy": 0.46896551847457885, + "step": 150620 + }, + { + "epoch": 0.1517110997632048, + "grad_norm": 12.529714941157007, + "learning_rate": 4.8739228903660534e-05, + "loss": 2.3125, + "mean_token_accuracy": 0.47438423037528993, + "step": 150625 + }, + { + "epoch": 0.15171613581630897, + "grad_norm": 8.640803934901799, + "learning_rate": 4.8739105228913116e-05, + "loss": 1.9594, + "mean_token_accuracy": 0.47931033968925474, + "step": 150630 + }, + { + "epoch": 0.15172117186941314, + "grad_norm": 10.169405481963791, + "learning_rate": 4.873898154827495e-05, + "loss": 2.172, + "mean_token_accuracy": 0.48965516686439514, + "step": 150635 + }, + { + "epoch": 0.15172620792251731, + "grad_norm": 10.757194128750637, + "learning_rate": 4.873885786174607e-05, + "loss": 2.2302, + "mean_token_accuracy": 0.4779794216156006, + "step": 150640 + }, + { + "epoch": 0.15173124397562146, + "grad_norm": 17.585003030503234, + "learning_rate": 4.8738734169326515e-05, + "loss": 2.0028, + "mean_token_accuracy": 0.44482758045196535, + "step": 150645 + }, + { + "epoch": 0.15173628002872563, + "grad_norm": 11.042385067666803, + "learning_rate": 4.873861047101632e-05, + "loss": 2.2474, + "mean_token_accuracy": 0.4344827592372894, + "step": 150650 + }, + { + "epoch": 0.1517413160818298, + "grad_norm": 8.833257855649816, + "learning_rate": 4.873848676681551e-05, + "loss": 2.4657, + "mean_token_accuracy": 0.4344827592372894, + "step": 150655 + }, + { + "epoch": 0.15174635213493398, + "grad_norm": 10.68783756984192, + "learning_rate": 4.873836305672414e-05, + "loss": 2.3579, + "mean_token_accuracy": 0.42577131986618044, + "step": 150660 + }, + { + "epoch": 0.15175138818803816, + "grad_norm": 15.986663323270122, + "learning_rate": 4.8738239340742226e-05, + "loss": 2.5879, + "mean_token_accuracy": 0.4, + "step": 150665 + }, + { + "epoch": 0.15175642424114233, + "grad_norm": 11.721582755047603, + "learning_rate": 4.87381156188698e-05, + "loss": 2.219, + "mean_token_accuracy": 0.42068966031074523, + "step": 150670 + }, + { + "epoch": 0.1517614602942465, + "grad_norm": 11.87661260623941, + "learning_rate": 4.873799189110691e-05, + "loss": 2.314, + "mean_token_accuracy": 0.4379310250282288, + "step": 150675 + }, + { + "epoch": 0.15176649634735068, + "grad_norm": 13.004632324647494, + "learning_rate": 4.873786815745358e-05, + "loss": 2.0984, + "mean_token_accuracy": 0.5243366956710815, + "step": 150680 + }, + { + "epoch": 0.15177153240045485, + "grad_norm": 9.618445936924376, + "learning_rate": 4.873774441790985e-05, + "loss": 2.1615, + "mean_token_accuracy": 0.4773744702339172, + "step": 150685 + }, + { + "epoch": 0.15177656845355902, + "grad_norm": 11.501936129113819, + "learning_rate": 4.8737620672475754e-05, + "loss": 2.4036, + "mean_token_accuracy": 0.46551724076271056, + "step": 150690 + }, + { + "epoch": 0.1517816045066632, + "grad_norm": 10.638031260349974, + "learning_rate": 4.873749692115132e-05, + "loss": 2.5078, + "mean_token_accuracy": 0.3827586233615875, + "step": 150695 + }, + { + "epoch": 0.15178664055976737, + "grad_norm": 10.828222783567954, + "learning_rate": 4.87373731639366e-05, + "loss": 2.4322, + "mean_token_accuracy": 0.4551724135875702, + "step": 150700 + }, + { + "epoch": 0.15179167661287155, + "grad_norm": 6.954232589315808, + "learning_rate": 4.8737249400831595e-05, + "loss": 2.1544, + "mean_token_accuracy": 0.4655172348022461, + "step": 150705 + }, + { + "epoch": 0.15179671266597572, + "grad_norm": 9.608329515350112, + "learning_rate": 4.8737125631836366e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.42413793206214906, + "step": 150710 + }, + { + "epoch": 0.1518017487190799, + "grad_norm": 11.374239213659573, + "learning_rate": 4.8737001856950945e-05, + "loss": 2.3526, + "mean_token_accuracy": 0.4551724135875702, + "step": 150715 + }, + { + "epoch": 0.15180678477218407, + "grad_norm": 11.184258517742835, + "learning_rate": 4.8736878076175364e-05, + "loss": 2.3387, + "mean_token_accuracy": 0.4413793087005615, + "step": 150720 + }, + { + "epoch": 0.15181182082528824, + "grad_norm": 12.413705025847536, + "learning_rate": 4.873675428950966e-05, + "loss": 2.5459, + "mean_token_accuracy": 0.4034482717514038, + "step": 150725 + }, + { + "epoch": 0.15181685687839241, + "grad_norm": 10.689350988924502, + "learning_rate": 4.873663049695385e-05, + "loss": 2.2883, + "mean_token_accuracy": 0.47749546766281126, + "step": 150730 + }, + { + "epoch": 0.1518218929314966, + "grad_norm": 11.819130754000316, + "learning_rate": 4.873650669850799e-05, + "loss": 2.2928, + "mean_token_accuracy": 0.39310344457626345, + "step": 150735 + }, + { + "epoch": 0.15182692898460076, + "grad_norm": 13.166761966945314, + "learning_rate": 4.873638289417211e-05, + "loss": 2.5584, + "mean_token_accuracy": 0.4504537105560303, + "step": 150740 + }, + { + "epoch": 0.15183196503770494, + "grad_norm": 10.09324875520783, + "learning_rate": 4.873625908394622e-05, + "loss": 2.5139, + "mean_token_accuracy": 0.43103448748588563, + "step": 150745 + }, + { + "epoch": 0.1518370010908091, + "grad_norm": 9.236224476920444, + "learning_rate": 4.873613526783039e-05, + "loss": 2.4757, + "mean_token_accuracy": 0.3793103456497192, + "step": 150750 + }, + { + "epoch": 0.15184203714391328, + "grad_norm": 11.282594566292891, + "learning_rate": 4.873601144582464e-05, + "loss": 2.5157, + "mean_token_accuracy": 0.4707199037075043, + "step": 150755 + }, + { + "epoch": 0.15184707319701746, + "grad_norm": 12.387120700263967, + "learning_rate": 4.873588761792899e-05, + "loss": 2.6317, + "mean_token_accuracy": 0.4344827592372894, + "step": 150760 + }, + { + "epoch": 0.15185210925012163, + "grad_norm": 13.11763911520989, + "learning_rate": 4.873576378414351e-05, + "loss": 2.6105, + "mean_token_accuracy": 0.441379314661026, + "step": 150765 + }, + { + "epoch": 0.1518571453032258, + "grad_norm": 10.153568637920841, + "learning_rate": 4.87356399444682e-05, + "loss": 2.2433, + "mean_token_accuracy": 0.44482759237289426, + "step": 150770 + }, + { + "epoch": 0.15186218135632998, + "grad_norm": 10.007439944301805, + "learning_rate": 4.8735516098903096e-05, + "loss": 2.1689, + "mean_token_accuracy": 0.4241379380226135, + "step": 150775 + }, + { + "epoch": 0.15186721740943415, + "grad_norm": 11.40487302493936, + "learning_rate": 4.8735392247448254e-05, + "loss": 2.4577, + "mean_token_accuracy": 0.3999999940395355, + "step": 150780 + }, + { + "epoch": 0.1518722534625383, + "grad_norm": 10.81069004641041, + "learning_rate": 4.873526839010369e-05, + "loss": 2.3155, + "mean_token_accuracy": 0.4620689690113068, + "step": 150785 + }, + { + "epoch": 0.15187728951564247, + "grad_norm": 9.283858758743062, + "learning_rate": 4.8735144526869456e-05, + "loss": 2.6422, + "mean_token_accuracy": 0.4344827592372894, + "step": 150790 + }, + { + "epoch": 0.15188232556874665, + "grad_norm": 11.166442509161339, + "learning_rate": 4.873502065774556e-05, + "loss": 2.203, + "mean_token_accuracy": 0.46551724672317507, + "step": 150795 + }, + { + "epoch": 0.15188736162185082, + "grad_norm": 9.578904333460258, + "learning_rate": 4.873489678273206e-05, + "loss": 2.2315, + "mean_token_accuracy": 0.46896551847457885, + "step": 150800 + }, + { + "epoch": 0.151892397674955, + "grad_norm": 8.835839406234053, + "learning_rate": 4.873477290182898e-05, + "loss": 2.5571, + "mean_token_accuracy": 0.4344827592372894, + "step": 150805 + }, + { + "epoch": 0.15189743372805917, + "grad_norm": 12.044151655830165, + "learning_rate": 4.8734649015036366e-05, + "loss": 2.5559, + "mean_token_accuracy": 0.4344827592372894, + "step": 150810 + }, + { + "epoch": 0.15190246978116334, + "grad_norm": 9.899572861756353, + "learning_rate": 4.873452512235423e-05, + "loss": 2.4713, + "mean_token_accuracy": 0.4379310369491577, + "step": 150815 + }, + { + "epoch": 0.15190750583426751, + "grad_norm": 9.46910806652318, + "learning_rate": 4.873440122378263e-05, + "loss": 2.5389, + "mean_token_accuracy": 0.38275861740112305, + "step": 150820 + }, + { + "epoch": 0.1519125418873717, + "grad_norm": 11.552822444880906, + "learning_rate": 4.873427731932158e-05, + "loss": 2.2074, + "mean_token_accuracy": 0.4620689690113068, + "step": 150825 + }, + { + "epoch": 0.15191757794047586, + "grad_norm": 10.46456787412755, + "learning_rate": 4.8734153408971125e-05, + "loss": 2.3814, + "mean_token_accuracy": 0.4, + "step": 150830 + }, + { + "epoch": 0.15192261399358004, + "grad_norm": 10.183681970068573, + "learning_rate": 4.873402949273131e-05, + "loss": 2.2787, + "mean_token_accuracy": 0.4931034505367279, + "step": 150835 + }, + { + "epoch": 0.1519276500466842, + "grad_norm": 9.653474408396317, + "learning_rate": 4.8733905570602144e-05, + "loss": 2.2426, + "mean_token_accuracy": 0.48965516686439514, + "step": 150840 + }, + { + "epoch": 0.15193268609978838, + "grad_norm": 9.065102843908237, + "learning_rate": 4.873378164258368e-05, + "loss": 2.67, + "mean_token_accuracy": 0.4034482777118683, + "step": 150845 + }, + { + "epoch": 0.15193772215289256, + "grad_norm": 9.49958958881616, + "learning_rate": 4.873365770867595e-05, + "loss": 2.2855, + "mean_token_accuracy": 0.4620689630508423, + "step": 150850 + }, + { + "epoch": 0.15194275820599673, + "grad_norm": 10.16470988860959, + "learning_rate": 4.873353376887898e-05, + "loss": 2.0291, + "mean_token_accuracy": 0.4620689630508423, + "step": 150855 + }, + { + "epoch": 0.1519477942591009, + "grad_norm": 19.174208245390435, + "learning_rate": 4.873340982319281e-05, + "loss": 2.881, + "mean_token_accuracy": 0.3965517163276672, + "step": 150860 + }, + { + "epoch": 0.15195283031220508, + "grad_norm": 9.756136747844288, + "learning_rate": 4.873328587161749e-05, + "loss": 2.2686, + "mean_token_accuracy": 0.4793103337287903, + "step": 150865 + }, + { + "epoch": 0.15195786636530925, + "grad_norm": 9.376287546385894, + "learning_rate": 4.873316191415303e-05, + "loss": 2.5359, + "mean_token_accuracy": 0.4172413766384125, + "step": 150870 + }, + { + "epoch": 0.15196290241841343, + "grad_norm": 9.290025809661005, + "learning_rate": 4.873303795079947e-05, + "loss": 2.4794, + "mean_token_accuracy": 0.38275861740112305, + "step": 150875 + }, + { + "epoch": 0.1519679384715176, + "grad_norm": 9.66248497997102, + "learning_rate": 4.873291398155685e-05, + "loss": 2.1645, + "mean_token_accuracy": 0.4344827651977539, + "step": 150880 + }, + { + "epoch": 0.15197297452462177, + "grad_norm": 9.37619109700161, + "learning_rate": 4.8732790006425193e-05, + "loss": 2.0703, + "mean_token_accuracy": 0.4620689690113068, + "step": 150885 + }, + { + "epoch": 0.15197801057772595, + "grad_norm": 8.552158677026144, + "learning_rate": 4.8732666025404546e-05, + "loss": 2.4804, + "mean_token_accuracy": 0.441379314661026, + "step": 150890 + }, + { + "epoch": 0.15198304663083012, + "grad_norm": 11.416381163478356, + "learning_rate": 4.873254203849495e-05, + "loss": 2.718, + "mean_token_accuracy": 0.4225045442581177, + "step": 150895 + }, + { + "epoch": 0.1519880826839343, + "grad_norm": 9.7506027903717, + "learning_rate": 4.873241804569642e-05, + "loss": 1.977, + "mean_token_accuracy": 0.42068964838981626, + "step": 150900 + }, + { + "epoch": 0.15199311873703847, + "grad_norm": 9.691227767635663, + "learning_rate": 4.8732294047009005e-05, + "loss": 2.4689, + "mean_token_accuracy": 0.4172413766384125, + "step": 150905 + }, + { + "epoch": 0.15199815479014264, + "grad_norm": 8.736319444334372, + "learning_rate": 4.873217004243274e-05, + "loss": 2.4549, + "mean_token_accuracy": 0.458620685338974, + "step": 150910 + }, + { + "epoch": 0.15200319084324682, + "grad_norm": 12.25224135947201, + "learning_rate": 4.873204603196764e-05, + "loss": 2.7741, + "mean_token_accuracy": 0.37241379618644715, + "step": 150915 + }, + { + "epoch": 0.152008226896351, + "grad_norm": 9.598961387935718, + "learning_rate": 4.873192201561376e-05, + "loss": 2.5233, + "mean_token_accuracy": 0.441379314661026, + "step": 150920 + }, + { + "epoch": 0.15201326294945514, + "grad_norm": 10.988610621069487, + "learning_rate": 4.873179799337113e-05, + "loss": 2.524, + "mean_token_accuracy": 0.4068965494632721, + "step": 150925 + }, + { + "epoch": 0.1520182990025593, + "grad_norm": 8.425051200285985, + "learning_rate": 4.873167396523977e-05, + "loss": 2.1215, + "mean_token_accuracy": 0.46551724672317507, + "step": 150930 + }, + { + "epoch": 0.15202333505566348, + "grad_norm": 10.023100736401778, + "learning_rate": 4.8731549931219735e-05, + "loss": 2.4643, + "mean_token_accuracy": 0.4103448212146759, + "step": 150935 + }, + { + "epoch": 0.15202837110876766, + "grad_norm": 12.066617312862238, + "learning_rate": 4.8731425891311046e-05, + "loss": 2.7199, + "mean_token_accuracy": 0.41379311084747317, + "step": 150940 + }, + { + "epoch": 0.15203340716187183, + "grad_norm": 9.97821526886476, + "learning_rate": 4.8731301845513744e-05, + "loss": 2.169, + "mean_token_accuracy": 0.46896551847457885, + "step": 150945 + }, + { + "epoch": 0.152038443214976, + "grad_norm": 8.222836564518998, + "learning_rate": 4.873117779382786e-05, + "loss": 2.1493, + "mean_token_accuracy": 0.4689655125141144, + "step": 150950 + }, + { + "epoch": 0.15204347926808018, + "grad_norm": 12.529976976440432, + "learning_rate": 4.873105373625343e-05, + "loss": 2.4912, + "mean_token_accuracy": 0.4137930929660797, + "step": 150955 + }, + { + "epoch": 0.15204851532118435, + "grad_norm": 11.377235120576106, + "learning_rate": 4.8730929672790495e-05, + "loss": 2.5131, + "mean_token_accuracy": 0.3999999940395355, + "step": 150960 + }, + { + "epoch": 0.15205355137428853, + "grad_norm": 11.110446166584714, + "learning_rate": 4.873080560343907e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.4068965494632721, + "step": 150965 + }, + { + "epoch": 0.1520585874273927, + "grad_norm": 11.856457678003226, + "learning_rate": 4.8730681528199214e-05, + "loss": 2.2479, + "mean_token_accuracy": 0.4172413766384125, + "step": 150970 + }, + { + "epoch": 0.15206362348049687, + "grad_norm": 10.168747734603619, + "learning_rate": 4.873055744707094e-05, + "loss": 2.0031, + "mean_token_accuracy": 0.4724137902259827, + "step": 150975 + }, + { + "epoch": 0.15206865953360105, + "grad_norm": 10.540583976378606, + "learning_rate": 4.873043336005429e-05, + "loss": 2.3692, + "mean_token_accuracy": 0.43793103098869324, + "step": 150980 + }, + { + "epoch": 0.15207369558670522, + "grad_norm": 9.55927376836404, + "learning_rate": 4.873030926714931e-05, + "loss": 2.3414, + "mean_token_accuracy": 0.45862069725990295, + "step": 150985 + }, + { + "epoch": 0.1520787316398094, + "grad_norm": 8.231008157035912, + "learning_rate": 4.873018516835603e-05, + "loss": 1.9673, + "mean_token_accuracy": 0.5034482836723327, + "step": 150990 + }, + { + "epoch": 0.15208376769291357, + "grad_norm": 8.640598046638395, + "learning_rate": 4.8730061063674465e-05, + "loss": 2.2573, + "mean_token_accuracy": 0.4517241299152374, + "step": 150995 + }, + { + "epoch": 0.15208880374601774, + "grad_norm": 7.973262380196957, + "learning_rate": 4.872993695310466e-05, + "loss": 2.0267, + "mean_token_accuracy": 0.48620688915252686, + "step": 151000 + }, + { + "epoch": 0.15209383979912192, + "grad_norm": 8.263058231180564, + "learning_rate": 4.872981283664666e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.441379314661026, + "step": 151005 + }, + { + "epoch": 0.1520988758522261, + "grad_norm": 9.160556789446183, + "learning_rate": 4.8729688714300495e-05, + "loss": 2.4617, + "mean_token_accuracy": 0.4206896543502808, + "step": 151010 + }, + { + "epoch": 0.15210391190533026, + "grad_norm": 11.601005730049268, + "learning_rate": 4.872956458606619e-05, + "loss": 2.4511, + "mean_token_accuracy": 0.3862068891525269, + "step": 151015 + }, + { + "epoch": 0.15210894795843444, + "grad_norm": 8.937949429738923, + "learning_rate": 4.87294404519438e-05, + "loss": 2.4401, + "mean_token_accuracy": 0.41034482717514037, + "step": 151020 + }, + { + "epoch": 0.1521139840115386, + "grad_norm": 9.105207038852074, + "learning_rate": 4.872931631193333e-05, + "loss": 1.9159, + "mean_token_accuracy": 0.5597290694713593, + "step": 151025 + }, + { + "epoch": 0.15211902006464278, + "grad_norm": 19.48967502517362, + "learning_rate": 4.8729192166034835e-05, + "loss": 2.5433, + "mean_token_accuracy": 0.4551724135875702, + "step": 151030 + }, + { + "epoch": 0.15212405611774696, + "grad_norm": 10.36361148041307, + "learning_rate": 4.872906801424834e-05, + "loss": 2.272, + "mean_token_accuracy": 0.48620688915252686, + "step": 151035 + }, + { + "epoch": 0.15212909217085113, + "grad_norm": 11.153884189177703, + "learning_rate": 4.872894385657389e-05, + "loss": 2.3786, + "mean_token_accuracy": 0.3965517282485962, + "step": 151040 + }, + { + "epoch": 0.1521341282239553, + "grad_norm": 8.448549559316971, + "learning_rate": 4.872881969301152e-05, + "loss": 2.5053, + "mean_token_accuracy": 0.417241370677948, + "step": 151045 + }, + { + "epoch": 0.15213916427705948, + "grad_norm": 10.269903253532002, + "learning_rate": 4.872869552356125e-05, + "loss": 2.3708, + "mean_token_accuracy": 0.41379310488700866, + "step": 151050 + }, + { + "epoch": 0.15214420033016365, + "grad_norm": 9.53397820847752, + "learning_rate": 4.872857134822312e-05, + "loss": 2.2478, + "mean_token_accuracy": 0.5034482717514038, + "step": 151055 + }, + { + "epoch": 0.15214923638326783, + "grad_norm": 10.586792599406287, + "learning_rate": 4.872844716699717e-05, + "loss": 2.4764, + "mean_token_accuracy": 0.4137930989265442, + "step": 151060 + }, + { + "epoch": 0.15215427243637197, + "grad_norm": 11.179751580697559, + "learning_rate": 4.8728322979883423e-05, + "loss": 2.3748, + "mean_token_accuracy": 0.441379314661026, + "step": 151065 + }, + { + "epoch": 0.15215930848947615, + "grad_norm": 9.961983738649938, + "learning_rate": 4.8728198786881926e-05, + "loss": 2.4995, + "mean_token_accuracy": 0.4137930989265442, + "step": 151070 + }, + { + "epoch": 0.15216434454258032, + "grad_norm": 10.571941493260121, + "learning_rate": 4.8728074587992715e-05, + "loss": 2.3962, + "mean_token_accuracy": 0.4896551728248596, + "step": 151075 + }, + { + "epoch": 0.1521693805956845, + "grad_norm": 9.008727603390826, + "learning_rate": 4.8727950383215806e-05, + "loss": 2.156, + "mean_token_accuracy": 0.4689655125141144, + "step": 151080 + }, + { + "epoch": 0.15217441664878867, + "grad_norm": 9.753150163864436, + "learning_rate": 4.872782617255125e-05, + "loss": 2.2142, + "mean_token_accuracy": 0.47241378426551817, + "step": 151085 + }, + { + "epoch": 0.15217945270189284, + "grad_norm": 11.141523331250118, + "learning_rate": 4.8727701955999095e-05, + "loss": 2.4837, + "mean_token_accuracy": 0.4517241418361664, + "step": 151090 + }, + { + "epoch": 0.15218448875499702, + "grad_norm": 11.792727770775645, + "learning_rate": 4.872757773355934e-05, + "loss": 2.543, + "mean_token_accuracy": 0.35862069129943847, + "step": 151095 + }, + { + "epoch": 0.1521895248081012, + "grad_norm": 11.545932494197213, + "learning_rate": 4.872745350523204e-05, + "loss": 2.4839, + "mean_token_accuracy": 0.40689654350280763, + "step": 151100 + }, + { + "epoch": 0.15219456086120536, + "grad_norm": 9.85121842225319, + "learning_rate": 4.872732927101723e-05, + "loss": 2.5074, + "mean_token_accuracy": 0.44137930274009707, + "step": 151105 + }, + { + "epoch": 0.15219959691430954, + "grad_norm": 10.956718908528908, + "learning_rate": 4.872720503091493e-05, + "loss": 2.2291, + "mean_token_accuracy": 0.48275861144065857, + "step": 151110 + }, + { + "epoch": 0.1522046329674137, + "grad_norm": 11.484814754127292, + "learning_rate": 4.872708078492519e-05, + "loss": 2.3697, + "mean_token_accuracy": 0.42758620977401735, + "step": 151115 + }, + { + "epoch": 0.15220966902051788, + "grad_norm": 8.172030638190998, + "learning_rate": 4.8726956533048055e-05, + "loss": 2.0658, + "mean_token_accuracy": 0.47241380214691164, + "step": 151120 + }, + { + "epoch": 0.15221470507362206, + "grad_norm": 8.869491910141184, + "learning_rate": 4.872683227528353e-05, + "loss": 2.4065, + "mean_token_accuracy": 0.4413793087005615, + "step": 151125 + }, + { + "epoch": 0.15221974112672623, + "grad_norm": 13.832355468500554, + "learning_rate": 4.8726708011631664e-05, + "loss": 2.516, + "mean_token_accuracy": 0.36896551847457887, + "step": 151130 + }, + { + "epoch": 0.1522247771798304, + "grad_norm": 9.766679517702944, + "learning_rate": 4.87265837420925e-05, + "loss": 2.514, + "mean_token_accuracy": 0.4137930989265442, + "step": 151135 + }, + { + "epoch": 0.15222981323293458, + "grad_norm": 12.753550298149598, + "learning_rate": 4.872645946666605e-05, + "loss": 2.2012, + "mean_token_accuracy": 0.42413792610168455, + "step": 151140 + }, + { + "epoch": 0.15223484928603875, + "grad_norm": 9.571916920301375, + "learning_rate": 4.8726335185352375e-05, + "loss": 2.1147, + "mean_token_accuracy": 0.46418632864952086, + "step": 151145 + }, + { + "epoch": 0.15223988533914293, + "grad_norm": 12.480189635331628, + "learning_rate": 4.872621089815149e-05, + "loss": 2.4292, + "mean_token_accuracy": 0.39655172228813174, + "step": 151150 + }, + { + "epoch": 0.1522449213922471, + "grad_norm": 10.937704722682419, + "learning_rate": 4.872608660506344e-05, + "loss": 2.0482, + "mean_token_accuracy": 0.4689655065536499, + "step": 151155 + }, + { + "epoch": 0.15224995744535127, + "grad_norm": 11.07659338745059, + "learning_rate": 4.8725962306088254e-05, + "loss": 2.4514, + "mean_token_accuracy": 0.4206896543502808, + "step": 151160 + }, + { + "epoch": 0.15225499349845545, + "grad_norm": 11.37615556389797, + "learning_rate": 4.872583800122597e-05, + "loss": 2.2952, + "mean_token_accuracy": 0.39310344457626345, + "step": 151165 + }, + { + "epoch": 0.15226002955155962, + "grad_norm": 9.381166374574759, + "learning_rate": 4.872571369047662e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.4413793087005615, + "step": 151170 + }, + { + "epoch": 0.1522650656046638, + "grad_norm": 10.054925547489749, + "learning_rate": 4.872558937384024e-05, + "loss": 2.4361, + "mean_token_accuracy": 0.458620685338974, + "step": 151175 + }, + { + "epoch": 0.15227010165776797, + "grad_norm": 11.124754873244454, + "learning_rate": 4.872546505131686e-05, + "loss": 2.3083, + "mean_token_accuracy": 0.45862069725990295, + "step": 151180 + }, + { + "epoch": 0.15227513771087214, + "grad_norm": 10.610146354795342, + "learning_rate": 4.872534072290651e-05, + "loss": 2.7605, + "mean_token_accuracy": 0.37241379618644715, + "step": 151185 + }, + { + "epoch": 0.15228017376397632, + "grad_norm": 8.559116716687027, + "learning_rate": 4.872521638860925e-05, + "loss": 2.2421, + "mean_token_accuracy": 0.44984875321388246, + "step": 151190 + }, + { + "epoch": 0.1522852098170805, + "grad_norm": 11.246755108386017, + "learning_rate": 4.8725092048425094e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.4517241358757019, + "step": 151195 + }, + { + "epoch": 0.15229024587018467, + "grad_norm": 10.395795161932568, + "learning_rate": 4.872496770235407e-05, + "loss": 2.4224, + "mean_token_accuracy": 0.42413793206214906, + "step": 151200 + }, + { + "epoch": 0.1522952819232888, + "grad_norm": 8.56066293132436, + "learning_rate": 4.872484335039623e-05, + "loss": 2.3127, + "mean_token_accuracy": 0.4344827592372894, + "step": 151205 + }, + { + "epoch": 0.15230031797639298, + "grad_norm": 10.069502260910408, + "learning_rate": 4.8724718992551595e-05, + "loss": 2.1555, + "mean_token_accuracy": 0.46721113920211793, + "step": 151210 + }, + { + "epoch": 0.15230535402949716, + "grad_norm": 10.05625330771739, + "learning_rate": 4.8724594628820205e-05, + "loss": 2.5231, + "mean_token_accuracy": 0.4517241358757019, + "step": 151215 + }, + { + "epoch": 0.15231039008260133, + "grad_norm": 11.665651200013338, + "learning_rate": 4.87244702592021e-05, + "loss": 2.4187, + "mean_token_accuracy": 0.45862067937850953, + "step": 151220 + }, + { + "epoch": 0.1523154261357055, + "grad_norm": 12.054411229911091, + "learning_rate": 4.8724345883697304e-05, + "loss": 2.5488, + "mean_token_accuracy": 0.3620689630508423, + "step": 151225 + }, + { + "epoch": 0.15232046218880968, + "grad_norm": 8.949540897361521, + "learning_rate": 4.872422150230585e-05, + "loss": 2.3183, + "mean_token_accuracy": 0.42758620381355283, + "step": 151230 + }, + { + "epoch": 0.15232549824191385, + "grad_norm": 12.423064094683197, + "learning_rate": 4.8724097115027794e-05, + "loss": 2.7452, + "mean_token_accuracy": 0.39655172228813174, + "step": 151235 + }, + { + "epoch": 0.15233053429501803, + "grad_norm": 9.963564813301332, + "learning_rate": 4.8723972721863155e-05, + "loss": 2.6117, + "mean_token_accuracy": 0.3965517282485962, + "step": 151240 + }, + { + "epoch": 0.1523355703481222, + "grad_norm": 11.463395785147187, + "learning_rate": 4.8723848322811955e-05, + "loss": 2.4206, + "mean_token_accuracy": 0.4206896543502808, + "step": 151245 + }, + { + "epoch": 0.15234060640122637, + "grad_norm": 9.204748131351378, + "learning_rate": 4.8723723917874255e-05, + "loss": 2.3752, + "mean_token_accuracy": 0.41379310488700866, + "step": 151250 + }, + { + "epoch": 0.15234564245433055, + "grad_norm": 10.609299581236924, + "learning_rate": 4.872359950705006e-05, + "loss": 2.4623, + "mean_token_accuracy": 0.4517241358757019, + "step": 151255 + }, + { + "epoch": 0.15235067850743472, + "grad_norm": 9.018425755908083, + "learning_rate": 4.872347509033944e-05, + "loss": 2.2345, + "mean_token_accuracy": 0.4709618926048279, + "step": 151260 + }, + { + "epoch": 0.1523557145605389, + "grad_norm": 9.136870786321776, + "learning_rate": 4.8723350667742396e-05, + "loss": 2.3615, + "mean_token_accuracy": 0.4172413766384125, + "step": 151265 + }, + { + "epoch": 0.15236075061364307, + "grad_norm": 9.148837411963486, + "learning_rate": 4.8723226239258977e-05, + "loss": 2.1876, + "mean_token_accuracy": 0.47241380214691164, + "step": 151270 + }, + { + "epoch": 0.15236578666674724, + "grad_norm": 10.463293262802559, + "learning_rate": 4.872310180488922e-05, + "loss": 2.385, + "mean_token_accuracy": 0.44482758045196535, + "step": 151275 + }, + { + "epoch": 0.15237082271985142, + "grad_norm": 15.458488770555565, + "learning_rate": 4.872297736463316e-05, + "loss": 2.5012, + "mean_token_accuracy": 0.4379310429096222, + "step": 151280 + }, + { + "epoch": 0.1523758587729556, + "grad_norm": 10.883732114438944, + "learning_rate": 4.872285291849083e-05, + "loss": 2.8245, + "mean_token_accuracy": 0.3896551698446274, + "step": 151285 + }, + { + "epoch": 0.15238089482605977, + "grad_norm": 11.37264110961484, + "learning_rate": 4.8722728466462254e-05, + "loss": 2.6627, + "mean_token_accuracy": 0.38275861740112305, + "step": 151290 + }, + { + "epoch": 0.15238593087916394, + "grad_norm": 8.531408550975884, + "learning_rate": 4.872260400854748e-05, + "loss": 2.0808, + "mean_token_accuracy": 0.47785844206809996, + "step": 151295 + }, + { + "epoch": 0.1523909669322681, + "grad_norm": 9.829915672435417, + "learning_rate": 4.8722479544746535e-05, + "loss": 2.5631, + "mean_token_accuracy": 0.4724137902259827, + "step": 151300 + }, + { + "epoch": 0.1523960029853723, + "grad_norm": 10.240994779451803, + "learning_rate": 4.8722355075059455e-05, + "loss": 2.4636, + "mean_token_accuracy": 0.358620685338974, + "step": 151305 + }, + { + "epoch": 0.15240103903847646, + "grad_norm": 9.286018997890258, + "learning_rate": 4.872223059948628e-05, + "loss": 2.5043, + "mean_token_accuracy": 0.4034482777118683, + "step": 151310 + }, + { + "epoch": 0.15240607509158063, + "grad_norm": 7.98614348933282, + "learning_rate": 4.8722106118027037e-05, + "loss": 2.2528, + "mean_token_accuracy": 0.3965517282485962, + "step": 151315 + }, + { + "epoch": 0.1524111111446848, + "grad_norm": 10.576783960652094, + "learning_rate": 4.8721981630681766e-05, + "loss": 2.8685, + "mean_token_accuracy": 0.3793103456497192, + "step": 151320 + }, + { + "epoch": 0.15241614719778898, + "grad_norm": 9.47566822415473, + "learning_rate": 4.87218571374505e-05, + "loss": 2.3843, + "mean_token_accuracy": 0.44482758045196535, + "step": 151325 + }, + { + "epoch": 0.15242118325089316, + "grad_norm": 12.70702678932557, + "learning_rate": 4.8721732638333264e-05, + "loss": 2.3833, + "mean_token_accuracy": 0.37586207389831544, + "step": 151330 + }, + { + "epoch": 0.15242621930399733, + "grad_norm": 9.559229378472281, + "learning_rate": 4.872160813333011e-05, + "loss": 2.3079, + "mean_token_accuracy": 0.41724138259887694, + "step": 151335 + }, + { + "epoch": 0.1524312553571015, + "grad_norm": 11.21058177240816, + "learning_rate": 4.872148362244107e-05, + "loss": 2.6711, + "mean_token_accuracy": 0.39655171036720277, + "step": 151340 + }, + { + "epoch": 0.15243629141020565, + "grad_norm": 6.865840075983419, + "learning_rate": 4.872135910566616e-05, + "loss": 2.1928, + "mean_token_accuracy": 0.45051421523094176, + "step": 151345 + }, + { + "epoch": 0.15244132746330982, + "grad_norm": 8.012826219050568, + "learning_rate": 4.872123458300543e-05, + "loss": 1.794, + "mean_token_accuracy": 0.4990385353565216, + "step": 151350 + }, + { + "epoch": 0.152446363516414, + "grad_norm": 12.407393461241666, + "learning_rate": 4.872111005445891e-05, + "loss": 2.7145, + "mean_token_accuracy": 0.3724137842655182, + "step": 151355 + }, + { + "epoch": 0.15245139956951817, + "grad_norm": 10.486275789575275, + "learning_rate": 4.872098552002664e-05, + "loss": 2.7942, + "mean_token_accuracy": 0.37586206793785093, + "step": 151360 + }, + { + "epoch": 0.15245643562262234, + "grad_norm": 11.080908282816205, + "learning_rate": 4.872086097970865e-05, + "loss": 2.4046, + "mean_token_accuracy": 0.4206896543502808, + "step": 151365 + }, + { + "epoch": 0.15246147167572652, + "grad_norm": 10.730315708466055, + "learning_rate": 4.872073643350498e-05, + "loss": 2.3672, + "mean_token_accuracy": 0.41379310488700866, + "step": 151370 + }, + { + "epoch": 0.1524665077288307, + "grad_norm": 9.226592529268325, + "learning_rate": 4.872061188141565e-05, + "loss": 2.6013, + "mean_token_accuracy": 0.4103448331356049, + "step": 151375 + }, + { + "epoch": 0.15247154378193487, + "grad_norm": 9.685964780924254, + "learning_rate": 4.87204873234407e-05, + "loss": 2.1737, + "mean_token_accuracy": 0.482758617401123, + "step": 151380 + }, + { + "epoch": 0.15247657983503904, + "grad_norm": 10.196151693032158, + "learning_rate": 4.872036275958018e-05, + "loss": 2.3903, + "mean_token_accuracy": 0.4413793087005615, + "step": 151385 + }, + { + "epoch": 0.1524816158881432, + "grad_norm": 12.470920953611174, + "learning_rate": 4.8720238189834116e-05, + "loss": 2.4314, + "mean_token_accuracy": 0.41379310488700866, + "step": 151390 + }, + { + "epoch": 0.1524866519412474, + "grad_norm": 10.18988932451362, + "learning_rate": 4.8720113614202527e-05, + "loss": 2.4184, + "mean_token_accuracy": 0.4241379380226135, + "step": 151395 + }, + { + "epoch": 0.15249168799435156, + "grad_norm": 9.970728859771793, + "learning_rate": 4.8719989032685474e-05, + "loss": 2.6618, + "mean_token_accuracy": 0.38275861740112305, + "step": 151400 + }, + { + "epoch": 0.15249672404745573, + "grad_norm": 11.779190485255194, + "learning_rate": 4.871986444528297e-05, + "loss": 2.393, + "mean_token_accuracy": 0.4448275864124298, + "step": 151405 + }, + { + "epoch": 0.1525017601005599, + "grad_norm": 10.41819503820325, + "learning_rate": 4.8719739851995056e-05, + "loss": 2.1337, + "mean_token_accuracy": 0.4551724135875702, + "step": 151410 + }, + { + "epoch": 0.15250679615366408, + "grad_norm": 10.956776170226467, + "learning_rate": 4.871961525282177e-05, + "loss": 2.3852, + "mean_token_accuracy": 0.41379310488700866, + "step": 151415 + }, + { + "epoch": 0.15251183220676826, + "grad_norm": 9.908217228212681, + "learning_rate": 4.871949064776315e-05, + "loss": 2.6243, + "mean_token_accuracy": 0.3999999910593033, + "step": 151420 + }, + { + "epoch": 0.15251686825987243, + "grad_norm": 11.650979512008755, + "learning_rate": 4.871936603681922e-05, + "loss": 2.5913, + "mean_token_accuracy": 0.39310344457626345, + "step": 151425 + }, + { + "epoch": 0.1525219043129766, + "grad_norm": 10.779053234591526, + "learning_rate": 4.8719241419990016e-05, + "loss": 2.6788, + "mean_token_accuracy": 0.4137930989265442, + "step": 151430 + }, + { + "epoch": 0.15252694036608078, + "grad_norm": 9.177385145937727, + "learning_rate": 4.871911679727558e-05, + "loss": 2.4672, + "mean_token_accuracy": 0.42758620977401735, + "step": 151435 + }, + { + "epoch": 0.15253197641918495, + "grad_norm": 11.801793588825042, + "learning_rate": 4.871899216867594e-05, + "loss": 2.5814, + "mean_token_accuracy": 0.4206896543502808, + "step": 151440 + }, + { + "epoch": 0.15253701247228912, + "grad_norm": 10.252536901418951, + "learning_rate": 4.8718867534191135e-05, + "loss": 2.0177, + "mean_token_accuracy": 0.4781609296798706, + "step": 151445 + }, + { + "epoch": 0.1525420485253933, + "grad_norm": 14.765049214663536, + "learning_rate": 4.8718742893821204e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.39147005677223207, + "step": 151450 + }, + { + "epoch": 0.15254708457849747, + "grad_norm": 13.904018329227313, + "learning_rate": 4.871861824756617e-05, + "loss": 2.3932, + "mean_token_accuracy": 0.44482758045196535, + "step": 151455 + }, + { + "epoch": 0.15255212063160165, + "grad_norm": 12.408712836858651, + "learning_rate": 4.871849359542607e-05, + "loss": 2.5695, + "mean_token_accuracy": 0.38620689511299133, + "step": 151460 + }, + { + "epoch": 0.15255715668470582, + "grad_norm": 9.650186047103047, + "learning_rate": 4.871836893740095e-05, + "loss": 2.4711, + "mean_token_accuracy": 0.4103448212146759, + "step": 151465 + }, + { + "epoch": 0.15256219273781, + "grad_norm": 13.717501718422508, + "learning_rate": 4.871824427349083e-05, + "loss": 2.4142, + "mean_token_accuracy": 0.458620685338974, + "step": 151470 + }, + { + "epoch": 0.15256722879091417, + "grad_norm": 11.141813123222663, + "learning_rate": 4.871811960369575e-05, + "loss": 2.3301, + "mean_token_accuracy": 0.41379310488700866, + "step": 151475 + }, + { + "epoch": 0.15257226484401834, + "grad_norm": 10.193002637409508, + "learning_rate": 4.871799492801574e-05, + "loss": 2.0523, + "mean_token_accuracy": 0.49854809045791626, + "step": 151480 + }, + { + "epoch": 0.1525773008971225, + "grad_norm": 9.021868523805702, + "learning_rate": 4.871787024645085e-05, + "loss": 2.3067, + "mean_token_accuracy": 0.4103448212146759, + "step": 151485 + }, + { + "epoch": 0.15258233695022666, + "grad_norm": 10.015896053840377, + "learning_rate": 4.871774555900111e-05, + "loss": 2.4058, + "mean_token_accuracy": 0.4758620738983154, + "step": 151490 + }, + { + "epoch": 0.15258737300333083, + "grad_norm": 10.121598657624293, + "learning_rate": 4.8717620865666544e-05, + "loss": 2.0974, + "mean_token_accuracy": 0.46896552443504336, + "step": 151495 + }, + { + "epoch": 0.152592409056435, + "grad_norm": 9.691344769911515, + "learning_rate": 4.8717496166447176e-05, + "loss": 2.3171, + "mean_token_accuracy": 0.4068965554237366, + "step": 151500 + }, + { + "epoch": 0.15259744510953918, + "grad_norm": 11.183142971157983, + "learning_rate": 4.871737146134308e-05, + "loss": 2.3376, + "mean_token_accuracy": 0.44482759237289426, + "step": 151505 + }, + { + "epoch": 0.15260248116264336, + "grad_norm": 9.510514166036772, + "learning_rate": 4.871724675035425e-05, + "loss": 2.1412, + "mean_token_accuracy": 0.4379310369491577, + "step": 151510 + }, + { + "epoch": 0.15260751721574753, + "grad_norm": 8.786903969199543, + "learning_rate": 4.871712203348074e-05, + "loss": 2.6494, + "mean_token_accuracy": 0.4517241418361664, + "step": 151515 + }, + { + "epoch": 0.1526125532688517, + "grad_norm": 10.04746745391184, + "learning_rate": 4.871699731072259e-05, + "loss": 2.1131, + "mean_token_accuracy": 0.46551724672317507, + "step": 151520 + }, + { + "epoch": 0.15261758932195588, + "grad_norm": 11.178736741802812, + "learning_rate": 4.871687258207982e-05, + "loss": 2.2215, + "mean_token_accuracy": 0.4172413766384125, + "step": 151525 + }, + { + "epoch": 0.15262262537506005, + "grad_norm": 9.301607131628963, + "learning_rate": 4.871674784755247e-05, + "loss": 2.639, + "mean_token_accuracy": 0.36551723778247835, + "step": 151530 + }, + { + "epoch": 0.15262766142816422, + "grad_norm": 9.529478469282866, + "learning_rate": 4.8716623107140584e-05, + "loss": 2.054, + "mean_token_accuracy": 0.48275861144065857, + "step": 151535 + }, + { + "epoch": 0.1526326974812684, + "grad_norm": 9.029804860064633, + "learning_rate": 4.871649836084419e-05, + "loss": 2.2715, + "mean_token_accuracy": 0.43103447258472444, + "step": 151540 + }, + { + "epoch": 0.15263773353437257, + "grad_norm": 11.3498084336476, + "learning_rate": 4.871637360866331e-05, + "loss": 2.4037, + "mean_token_accuracy": 0.4068965494632721, + "step": 151545 + }, + { + "epoch": 0.15264276958747675, + "grad_norm": 12.478862459655357, + "learning_rate": 4.871624885059799e-05, + "loss": 2.1751, + "mean_token_accuracy": 0.441379314661026, + "step": 151550 + }, + { + "epoch": 0.15264780564058092, + "grad_norm": 9.739873710484195, + "learning_rate": 4.871612408664827e-05, + "loss": 2.0861, + "mean_token_accuracy": 0.4517241418361664, + "step": 151555 + }, + { + "epoch": 0.1526528416936851, + "grad_norm": 12.934551084407365, + "learning_rate": 4.871599931681418e-05, + "loss": 2.7575, + "mean_token_accuracy": 0.4034482717514038, + "step": 151560 + }, + { + "epoch": 0.15265787774678927, + "grad_norm": 11.334478729623061, + "learning_rate": 4.871587454109575e-05, + "loss": 2.4468, + "mean_token_accuracy": 0.4172413766384125, + "step": 151565 + }, + { + "epoch": 0.15266291379989344, + "grad_norm": 12.927983394788198, + "learning_rate": 4.871574975949302e-05, + "loss": 2.3165, + "mean_token_accuracy": 0.43793103098869324, + "step": 151570 + }, + { + "epoch": 0.15266794985299761, + "grad_norm": 6.886275198753878, + "learning_rate": 4.871562497200602e-05, + "loss": 1.9703, + "mean_token_accuracy": 0.5232304811477662, + "step": 151575 + }, + { + "epoch": 0.1526729859061018, + "grad_norm": 13.185844511468096, + "learning_rate": 4.871550017863479e-05, + "loss": 2.2847, + "mean_token_accuracy": 0.4517241418361664, + "step": 151580 + }, + { + "epoch": 0.15267802195920596, + "grad_norm": 9.347999062056264, + "learning_rate": 4.871537537937936e-05, + "loss": 2.1035, + "mean_token_accuracy": 0.47586206793785096, + "step": 151585 + }, + { + "epoch": 0.15268305801231014, + "grad_norm": 10.548593734462095, + "learning_rate": 4.871525057423976e-05, + "loss": 2.4976, + "mean_token_accuracy": 0.43103448748588563, + "step": 151590 + }, + { + "epoch": 0.1526880940654143, + "grad_norm": 10.416995581507157, + "learning_rate": 4.871512576321604e-05, + "loss": 2.2351, + "mean_token_accuracy": 0.4379310250282288, + "step": 151595 + }, + { + "epoch": 0.15269313011851848, + "grad_norm": 11.387295603028846, + "learning_rate": 4.8715000946308235e-05, + "loss": 2.5229, + "mean_token_accuracy": 0.38620689511299133, + "step": 151600 + }, + { + "epoch": 0.15269816617162266, + "grad_norm": 11.667339614137786, + "learning_rate": 4.871487612351635e-05, + "loss": 2.0419, + "mean_token_accuracy": 0.5241379380226135, + "step": 151605 + }, + { + "epoch": 0.15270320222472683, + "grad_norm": 9.405809663694358, + "learning_rate": 4.871475129484045e-05, + "loss": 2.5109, + "mean_token_accuracy": 0.441379314661026, + "step": 151610 + }, + { + "epoch": 0.152708238277831, + "grad_norm": 11.530587124899066, + "learning_rate": 4.871462646028056e-05, + "loss": 2.3764, + "mean_token_accuracy": 0.38275861740112305, + "step": 151615 + }, + { + "epoch": 0.15271327433093518, + "grad_norm": 9.513767262623915, + "learning_rate": 4.8714501619836714e-05, + "loss": 2.181, + "mean_token_accuracy": 0.4586207032203674, + "step": 151620 + }, + { + "epoch": 0.15271831038403932, + "grad_norm": 9.29868485534484, + "learning_rate": 4.871437677350894e-05, + "loss": 2.4578, + "mean_token_accuracy": 0.39999999701976774, + "step": 151625 + }, + { + "epoch": 0.1527233464371435, + "grad_norm": 10.265234570587927, + "learning_rate": 4.8714251921297294e-05, + "loss": 2.5704, + "mean_token_accuracy": 0.3827586114406586, + "step": 151630 + }, + { + "epoch": 0.15272838249024767, + "grad_norm": 9.709357851501359, + "learning_rate": 4.871412706320178e-05, + "loss": 2.2481, + "mean_token_accuracy": 0.441379314661026, + "step": 151635 + }, + { + "epoch": 0.15273341854335185, + "grad_norm": 8.000562747492522, + "learning_rate": 4.871400219922246e-05, + "loss": 2.036, + "mean_token_accuracy": 0.5091349124908447, + "step": 151640 + }, + { + "epoch": 0.15273845459645602, + "grad_norm": 12.837839701697442, + "learning_rate": 4.8713877329359354e-05, + "loss": 2.5302, + "mean_token_accuracy": 0.3999999940395355, + "step": 151645 + }, + { + "epoch": 0.1527434906495602, + "grad_norm": 10.61756748720087, + "learning_rate": 4.8713752453612506e-05, + "loss": 2.7302, + "mean_token_accuracy": 0.39655172228813174, + "step": 151650 + }, + { + "epoch": 0.15274852670266437, + "grad_norm": 7.246577094775593, + "learning_rate": 4.871362757198193e-05, + "loss": 2.3921, + "mean_token_accuracy": 0.4206896543502808, + "step": 151655 + }, + { + "epoch": 0.15275356275576854, + "grad_norm": 10.725187477502589, + "learning_rate": 4.8713502684467686e-05, + "loss": 2.5052, + "mean_token_accuracy": 0.4172413766384125, + "step": 151660 + }, + { + "epoch": 0.15275859880887271, + "grad_norm": 10.952326455724945, + "learning_rate": 4.871337779106979e-05, + "loss": 2.3419, + "mean_token_accuracy": 0.4620689690113068, + "step": 151665 + }, + { + "epoch": 0.1527636348619769, + "grad_norm": 11.94202973151706, + "learning_rate": 4.87132528917883e-05, + "loss": 2.8008, + "mean_token_accuracy": 0.4206896543502808, + "step": 151670 + }, + { + "epoch": 0.15276867091508106, + "grad_norm": 9.051026554822105, + "learning_rate": 4.871312798662322e-05, + "loss": 2.1124, + "mean_token_accuracy": 0.4793103516101837, + "step": 151675 + }, + { + "epoch": 0.15277370696818524, + "grad_norm": 10.633870038916406, + "learning_rate": 4.8713003075574605e-05, + "loss": 3.0386, + "mean_token_accuracy": 0.4068965494632721, + "step": 151680 + }, + { + "epoch": 0.1527787430212894, + "grad_norm": 11.788718539534917, + "learning_rate": 4.871287815864248e-05, + "loss": 2.2269, + "mean_token_accuracy": 0.4620689690113068, + "step": 151685 + }, + { + "epoch": 0.15278377907439358, + "grad_norm": 11.175608296512673, + "learning_rate": 4.871275323582689e-05, + "loss": 2.3596, + "mean_token_accuracy": 0.432667875289917, + "step": 151690 + }, + { + "epoch": 0.15278881512749776, + "grad_norm": 12.892527846812667, + "learning_rate": 4.871262830712786e-05, + "loss": 2.3894, + "mean_token_accuracy": 0.4464004814624786, + "step": 151695 + }, + { + "epoch": 0.15279385118060193, + "grad_norm": 9.815879617138426, + "learning_rate": 4.871250337254543e-05, + "loss": 2.2856, + "mean_token_accuracy": 0.42413793206214906, + "step": 151700 + }, + { + "epoch": 0.1527988872337061, + "grad_norm": 9.977656180084065, + "learning_rate": 4.871237843207964e-05, + "loss": 2.2053, + "mean_token_accuracy": 0.42413793206214906, + "step": 151705 + }, + { + "epoch": 0.15280392328681028, + "grad_norm": 9.166777625200304, + "learning_rate": 4.871225348573051e-05, + "loss": 2.1497, + "mean_token_accuracy": 0.5068965435028077, + "step": 151710 + }, + { + "epoch": 0.15280895933991445, + "grad_norm": 9.282678263058903, + "learning_rate": 4.871212853349808e-05, + "loss": 1.9627, + "mean_token_accuracy": 0.47586206793785096, + "step": 151715 + }, + { + "epoch": 0.15281399539301863, + "grad_norm": 11.166826866246458, + "learning_rate": 4.871200357538239e-05, + "loss": 2.4416, + "mean_token_accuracy": 0.42413792908191683, + "step": 151720 + }, + { + "epoch": 0.1528190314461228, + "grad_norm": 13.727064013421689, + "learning_rate": 4.871187861138348e-05, + "loss": 3.2137, + "mean_token_accuracy": 0.3862069070339203, + "step": 151725 + }, + { + "epoch": 0.15282406749922697, + "grad_norm": 8.18788724744751, + "learning_rate": 4.8711753641501366e-05, + "loss": 2.2336, + "mean_token_accuracy": 0.46551724076271056, + "step": 151730 + }, + { + "epoch": 0.15282910355233115, + "grad_norm": 21.35450454651548, + "learning_rate": 4.8711628665736095e-05, + "loss": 2.5557, + "mean_token_accuracy": 0.4413792997598648, + "step": 151735 + }, + { + "epoch": 0.15283413960543532, + "grad_norm": 10.299272229502446, + "learning_rate": 4.87115036840877e-05, + "loss": 2.4002, + "mean_token_accuracy": 0.4103448212146759, + "step": 151740 + }, + { + "epoch": 0.1528391756585395, + "grad_norm": 11.849195906220306, + "learning_rate": 4.871137869655621e-05, + "loss": 2.2997, + "mean_token_accuracy": 0.4464609742164612, + "step": 151745 + }, + { + "epoch": 0.15284421171164367, + "grad_norm": 10.549046259772638, + "learning_rate": 4.8711253703141674e-05, + "loss": 2.4816, + "mean_token_accuracy": 0.44482758045196535, + "step": 151750 + }, + { + "epoch": 0.15284924776474784, + "grad_norm": 7.691927733556517, + "learning_rate": 4.871112870384411e-05, + "loss": 2.6441, + "mean_token_accuracy": 0.42758620977401735, + "step": 151755 + }, + { + "epoch": 0.15285428381785202, + "grad_norm": 8.574594418168173, + "learning_rate": 4.871100369866357e-05, + "loss": 2.386, + "mean_token_accuracy": 0.4310344815254211, + "step": 151760 + }, + { + "epoch": 0.15285931987095616, + "grad_norm": 9.111693549388136, + "learning_rate": 4.871087868760008e-05, + "loss": 2.5299, + "mean_token_accuracy": 0.39310344457626345, + "step": 151765 + }, + { + "epoch": 0.15286435592406034, + "grad_norm": 8.608842666669304, + "learning_rate": 4.8710753670653656e-05, + "loss": 2.4598, + "mean_token_accuracy": 0.38965516686439516, + "step": 151770 + }, + { + "epoch": 0.1528693919771645, + "grad_norm": 8.51081389050932, + "learning_rate": 4.871062864782437e-05, + "loss": 2.0984, + "mean_token_accuracy": 0.4987900912761688, + "step": 151775 + }, + { + "epoch": 0.15287442803026868, + "grad_norm": 10.581574312767009, + "learning_rate": 4.871050361911223e-05, + "loss": 2.5342, + "mean_token_accuracy": 0.38965516686439516, + "step": 151780 + }, + { + "epoch": 0.15287946408337286, + "grad_norm": 10.184842033287048, + "learning_rate": 4.871037858451727e-05, + "loss": 2.7637, + "mean_token_accuracy": 0.3793103456497192, + "step": 151785 + }, + { + "epoch": 0.15288450013647703, + "grad_norm": 8.89618753294905, + "learning_rate": 4.871025354403954e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.506896561384201, + "step": 151790 + }, + { + "epoch": 0.1528895361895812, + "grad_norm": 11.40291612077752, + "learning_rate": 4.871012849767906e-05, + "loss": 2.8271, + "mean_token_accuracy": 0.36896551549434664, + "step": 151795 + }, + { + "epoch": 0.15289457224268538, + "grad_norm": 10.671323069098571, + "learning_rate": 4.8710003445435885e-05, + "loss": 2.2116, + "mean_token_accuracy": 0.5050816655158996, + "step": 151800 + }, + { + "epoch": 0.15289960829578955, + "grad_norm": 11.042671145085544, + "learning_rate": 4.870987838731003e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.42413793206214906, + "step": 151805 + }, + { + "epoch": 0.15290464434889373, + "grad_norm": 11.431738991206663, + "learning_rate": 4.8709753323301536e-05, + "loss": 2.3597, + "mean_token_accuracy": 0.47586206197738645, + "step": 151810 + }, + { + "epoch": 0.1529096804019979, + "grad_norm": 9.870418688214734, + "learning_rate": 4.8709628253410433e-05, + "loss": 2.4948, + "mean_token_accuracy": 0.4586206912994385, + "step": 151815 + }, + { + "epoch": 0.15291471645510207, + "grad_norm": 10.262012387535739, + "learning_rate": 4.870950317763677e-05, + "loss": 2.4061, + "mean_token_accuracy": 0.41034482717514037, + "step": 151820 + }, + { + "epoch": 0.15291975250820625, + "grad_norm": 9.90876377918367, + "learning_rate": 4.870937809598056e-05, + "loss": 2.3683, + "mean_token_accuracy": 0.4620689570903778, + "step": 151825 + }, + { + "epoch": 0.15292478856131042, + "grad_norm": 10.241089527551376, + "learning_rate": 4.870925300844186e-05, + "loss": 2.5011, + "mean_token_accuracy": 0.41905626058578493, + "step": 151830 + }, + { + "epoch": 0.1529298246144146, + "grad_norm": 13.773002869600292, + "learning_rate": 4.870912791502069e-05, + "loss": 2.4792, + "mean_token_accuracy": 0.41379310488700866, + "step": 151835 + }, + { + "epoch": 0.15293486066751877, + "grad_norm": 10.888111416290574, + "learning_rate": 4.8709002815717085e-05, + "loss": 2.3794, + "mean_token_accuracy": 0.41724138259887694, + "step": 151840 + }, + { + "epoch": 0.15293989672062294, + "grad_norm": 10.939115903693443, + "learning_rate": 4.8708877710531095e-05, + "loss": 2.3927, + "mean_token_accuracy": 0.46061705946922304, + "step": 151845 + }, + { + "epoch": 0.15294493277372712, + "grad_norm": 10.748434161322365, + "learning_rate": 4.870875259946274e-05, + "loss": 2.5535, + "mean_token_accuracy": 0.38275861740112305, + "step": 151850 + }, + { + "epoch": 0.1529499688268313, + "grad_norm": 9.78189905866777, + "learning_rate": 4.870862748251205e-05, + "loss": 2.2492, + "mean_token_accuracy": 0.46896552443504336, + "step": 151855 + }, + { + "epoch": 0.15295500487993546, + "grad_norm": 12.131439172240118, + "learning_rate": 4.870850235967908e-05, + "loss": 2.0645, + "mean_token_accuracy": 0.5103448152542114, + "step": 151860 + }, + { + "epoch": 0.15296004093303964, + "grad_norm": 11.884658880384821, + "learning_rate": 4.870837723096385e-05, + "loss": 2.7462, + "mean_token_accuracy": 0.3655172407627106, + "step": 151865 + }, + { + "epoch": 0.1529650769861438, + "grad_norm": 10.468612699040388, + "learning_rate": 4.870825209636639e-05, + "loss": 2.3633, + "mean_token_accuracy": 0.4482758641242981, + "step": 151870 + }, + { + "epoch": 0.15297011303924798, + "grad_norm": 9.603526905565396, + "learning_rate": 4.8708126955886744e-05, + "loss": 2.062, + "mean_token_accuracy": 0.48275862336158754, + "step": 151875 + }, + { + "epoch": 0.15297514909235216, + "grad_norm": 11.437606666558919, + "learning_rate": 4.870800180952495e-05, + "loss": 2.7419, + "mean_token_accuracy": 0.379310342669487, + "step": 151880 + }, + { + "epoch": 0.15298018514545633, + "grad_norm": 8.09579107786234, + "learning_rate": 4.870787665728103e-05, + "loss": 2.2339, + "mean_token_accuracy": 0.4, + "step": 151885 + }, + { + "epoch": 0.1529852211985605, + "grad_norm": 10.162420187244974, + "learning_rate": 4.870775149915503e-05, + "loss": 2.4522, + "mean_token_accuracy": 0.4206896543502808, + "step": 151890 + }, + { + "epoch": 0.15299025725166468, + "grad_norm": 10.801585831118802, + "learning_rate": 4.870762633514699e-05, + "loss": 2.4288, + "mean_token_accuracy": 0.44482758045196535, + "step": 151895 + }, + { + "epoch": 0.15299529330476885, + "grad_norm": 11.21144892146897, + "learning_rate": 4.870750116525693e-05, + "loss": 2.3875, + "mean_token_accuracy": 0.42413793206214906, + "step": 151900 + }, + { + "epoch": 0.153000329357873, + "grad_norm": 9.285829202834849, + "learning_rate": 4.870737598948488e-05, + "loss": 2.3471, + "mean_token_accuracy": 0.441379314661026, + "step": 151905 + }, + { + "epoch": 0.15300536541097717, + "grad_norm": 8.504852708628952, + "learning_rate": 4.8707250807830904e-05, + "loss": 2.2572, + "mean_token_accuracy": 0.4620689630508423, + "step": 151910 + }, + { + "epoch": 0.15301040146408135, + "grad_norm": 8.427991066045669, + "learning_rate": 4.8707125620295e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.42413792610168455, + "step": 151915 + }, + { + "epoch": 0.15301543751718552, + "grad_norm": 15.864187315162367, + "learning_rate": 4.870700042687723e-05, + "loss": 2.6463, + "mean_token_accuracy": 0.3984271019697189, + "step": 151920 + }, + { + "epoch": 0.1530204735702897, + "grad_norm": 10.667663313154023, + "learning_rate": 4.870687522757761e-05, + "loss": 2.3237, + "mean_token_accuracy": 0.4586206912994385, + "step": 151925 + }, + { + "epoch": 0.15302550962339387, + "grad_norm": 9.73024947682569, + "learning_rate": 4.87067500223962e-05, + "loss": 2.1833, + "mean_token_accuracy": 0.43103448748588563, + "step": 151930 + }, + { + "epoch": 0.15303054567649804, + "grad_norm": 11.319871254495444, + "learning_rate": 4.8706624811333014e-05, + "loss": 2.3869, + "mean_token_accuracy": 0.43448275327682495, + "step": 151935 + }, + { + "epoch": 0.15303558172960222, + "grad_norm": 8.559160719969734, + "learning_rate": 4.870649959438808e-05, + "loss": 2.0895, + "mean_token_accuracy": 0.5088324248790741, + "step": 151940 + }, + { + "epoch": 0.1530406177827064, + "grad_norm": 22.24676302890798, + "learning_rate": 4.870637437156145e-05, + "loss": 2.6164, + "mean_token_accuracy": 0.40344828367233276, + "step": 151945 + }, + { + "epoch": 0.15304565383581056, + "grad_norm": 9.468613409389205, + "learning_rate": 4.8706249142853156e-05, + "loss": 1.9933, + "mean_token_accuracy": 0.4655172526836395, + "step": 151950 + }, + { + "epoch": 0.15305068988891474, + "grad_norm": 12.216585862813039, + "learning_rate": 4.870612390826323e-05, + "loss": 2.8181, + "mean_token_accuracy": 0.4586206912994385, + "step": 151955 + }, + { + "epoch": 0.1530557259420189, + "grad_norm": 10.829345589024582, + "learning_rate": 4.87059986677917e-05, + "loss": 2.1054, + "mean_token_accuracy": 0.5137930989265442, + "step": 151960 + }, + { + "epoch": 0.15306076199512308, + "grad_norm": 9.20775051237787, + "learning_rate": 4.870587342143861e-05, + "loss": 2.5326, + "mean_token_accuracy": 0.47586206793785096, + "step": 151965 + }, + { + "epoch": 0.15306579804822726, + "grad_norm": 11.45067653964252, + "learning_rate": 4.8705748169204e-05, + "loss": 2.179, + "mean_token_accuracy": 0.4819116771221161, + "step": 151970 + }, + { + "epoch": 0.15307083410133143, + "grad_norm": 10.047627535541823, + "learning_rate": 4.870562291108789e-05, + "loss": 2.0545, + "mean_token_accuracy": 0.44482758045196535, + "step": 151975 + }, + { + "epoch": 0.1530758701544356, + "grad_norm": 10.258015097278069, + "learning_rate": 4.870549764709033e-05, + "loss": 2.5542, + "mean_token_accuracy": 0.38275861740112305, + "step": 151980 + }, + { + "epoch": 0.15308090620753978, + "grad_norm": 11.667519297988106, + "learning_rate": 4.870537237721133e-05, + "loss": 2.69, + "mean_token_accuracy": 0.36551723480224607, + "step": 151985 + }, + { + "epoch": 0.15308594226064395, + "grad_norm": 9.342234785050787, + "learning_rate": 4.870524710145095e-05, + "loss": 2.5104, + "mean_token_accuracy": 0.43448275327682495, + "step": 151990 + }, + { + "epoch": 0.15309097831374813, + "grad_norm": 11.208457899320862, + "learning_rate": 4.870512181980922e-05, + "loss": 3.254, + "mean_token_accuracy": 0.40550513863563536, + "step": 151995 + }, + { + "epoch": 0.1530960143668523, + "grad_norm": 12.167625315276945, + "learning_rate": 4.870499653228617e-05, + "loss": 3.1626, + "mean_token_accuracy": 0.3275862067937851, + "step": 152000 + }, + { + "epoch": 0.15310105041995647, + "grad_norm": 11.197789786158246, + "learning_rate": 4.870487123888182e-05, + "loss": 2.3781, + "mean_token_accuracy": 0.4551724135875702, + "step": 152005 + }, + { + "epoch": 0.15310608647306065, + "grad_norm": 11.009016812366454, + "learning_rate": 4.870474593959623e-05, + "loss": 2.616, + "mean_token_accuracy": 0.4172413766384125, + "step": 152010 + }, + { + "epoch": 0.15311112252616482, + "grad_norm": 9.40820274933862, + "learning_rate": 4.870462063442943e-05, + "loss": 1.9719, + "mean_token_accuracy": 0.5491228044033051, + "step": 152015 + }, + { + "epoch": 0.153116158579269, + "grad_norm": 10.726778665828386, + "learning_rate": 4.870449532338144e-05, + "loss": 2.4751, + "mean_token_accuracy": 0.4586206912994385, + "step": 152020 + }, + { + "epoch": 0.15312119463237317, + "grad_norm": 11.481276961778825, + "learning_rate": 4.870437000645231e-05, + "loss": 2.861, + "mean_token_accuracy": 0.39655172228813174, + "step": 152025 + }, + { + "epoch": 0.15312623068547734, + "grad_norm": 10.022286534324389, + "learning_rate": 4.870424468364206e-05, + "loss": 2.2702, + "mean_token_accuracy": 0.4862068951129913, + "step": 152030 + }, + { + "epoch": 0.15313126673858152, + "grad_norm": 11.400913619322484, + "learning_rate": 4.870411935495075e-05, + "loss": 2.4162, + "mean_token_accuracy": 0.4551724135875702, + "step": 152035 + }, + { + "epoch": 0.1531363027916857, + "grad_norm": 9.920515834152631, + "learning_rate": 4.870399402037838e-05, + "loss": 2.4407, + "mean_token_accuracy": 0.42068966031074523, + "step": 152040 + }, + { + "epoch": 0.15314133884478984, + "grad_norm": 8.676121970435092, + "learning_rate": 4.870386867992502e-05, + "loss": 2.4858, + "mean_token_accuracy": 0.441379314661026, + "step": 152045 + }, + { + "epoch": 0.153146374897894, + "grad_norm": 12.247268065948628, + "learning_rate": 4.870374333359068e-05, + "loss": 2.4024, + "mean_token_accuracy": 0.4379310369491577, + "step": 152050 + }, + { + "epoch": 0.15315141095099818, + "grad_norm": 10.583086772042593, + "learning_rate": 4.87036179813754e-05, + "loss": 2.7123, + "mean_token_accuracy": 0.36896551847457887, + "step": 152055 + }, + { + "epoch": 0.15315644700410236, + "grad_norm": 20.987639824287957, + "learning_rate": 4.8703492623279216e-05, + "loss": 2.6867, + "mean_token_accuracy": 0.4241379201412201, + "step": 152060 + }, + { + "epoch": 0.15316148305720653, + "grad_norm": 10.593143472784043, + "learning_rate": 4.870336725930217e-05, + "loss": 2.5997, + "mean_token_accuracy": 0.3931034505367279, + "step": 152065 + }, + { + "epoch": 0.1531665191103107, + "grad_norm": 9.389721465870304, + "learning_rate": 4.87032418894443e-05, + "loss": 2.8018, + "mean_token_accuracy": 0.4103448212146759, + "step": 152070 + }, + { + "epoch": 0.15317155516341488, + "grad_norm": 14.340618558359449, + "learning_rate": 4.870311651370561e-05, + "loss": 2.6436, + "mean_token_accuracy": 0.4137930989265442, + "step": 152075 + }, + { + "epoch": 0.15317659121651905, + "grad_norm": 10.887951600550396, + "learning_rate": 4.8702991132086175e-05, + "loss": 2.3393, + "mean_token_accuracy": 0.4137930989265442, + "step": 152080 + }, + { + "epoch": 0.15318162726962323, + "grad_norm": 9.179446741229333, + "learning_rate": 4.8702865744586e-05, + "loss": 2.7852, + "mean_token_accuracy": 0.4965517222881317, + "step": 152085 + }, + { + "epoch": 0.1531866633227274, + "grad_norm": 12.785418896397752, + "learning_rate": 4.8702740351205135e-05, + "loss": 2.5041, + "mean_token_accuracy": 0.4189352720975876, + "step": 152090 + }, + { + "epoch": 0.15319169937583157, + "grad_norm": 10.168351661429892, + "learning_rate": 4.870261495194361e-05, + "loss": 2.3832, + "mean_token_accuracy": 0.43103448748588563, + "step": 152095 + }, + { + "epoch": 0.15319673542893575, + "grad_norm": 9.741432202081342, + "learning_rate": 4.870248954680146e-05, + "loss": 2.3221, + "mean_token_accuracy": 0.4896551728248596, + "step": 152100 + }, + { + "epoch": 0.15320177148203992, + "grad_norm": 9.075597757831215, + "learning_rate": 4.870236413577873e-05, + "loss": 2.363, + "mean_token_accuracy": 0.4068965554237366, + "step": 152105 + }, + { + "epoch": 0.1532068075351441, + "grad_norm": 9.398835256050344, + "learning_rate": 4.870223871887543e-05, + "loss": 2.5737, + "mean_token_accuracy": 0.42758620381355283, + "step": 152110 + }, + { + "epoch": 0.15321184358824827, + "grad_norm": 13.762839246389571, + "learning_rate": 4.870211329609162e-05, + "loss": 2.451, + "mean_token_accuracy": 0.41379311084747317, + "step": 152115 + }, + { + "epoch": 0.15321687964135244, + "grad_norm": 10.456048215453778, + "learning_rate": 4.8701987867427325e-05, + "loss": 2.2277, + "mean_token_accuracy": 0.46551724672317507, + "step": 152120 + }, + { + "epoch": 0.15322191569445662, + "grad_norm": 13.652380420118053, + "learning_rate": 4.870186243288257e-05, + "loss": 2.5853, + "mean_token_accuracy": 0.4103448331356049, + "step": 152125 + }, + { + "epoch": 0.1532269517475608, + "grad_norm": 9.178461136234572, + "learning_rate": 4.870173699245741e-05, + "loss": 2.5205, + "mean_token_accuracy": 0.42413793206214906, + "step": 152130 + }, + { + "epoch": 0.15323198780066496, + "grad_norm": 9.548187819708897, + "learning_rate": 4.870161154615187e-05, + "loss": 1.8927, + "mean_token_accuracy": 0.48965516686439514, + "step": 152135 + }, + { + "epoch": 0.15323702385376914, + "grad_norm": 10.156394428915025, + "learning_rate": 4.8701486093965976e-05, + "loss": 2.2144, + "mean_token_accuracy": 0.4068965494632721, + "step": 152140 + }, + { + "epoch": 0.1532420599068733, + "grad_norm": 12.787612252849316, + "learning_rate": 4.8701360635899765e-05, + "loss": 2.1762, + "mean_token_accuracy": 0.4879007816314697, + "step": 152145 + }, + { + "epoch": 0.1532470959599775, + "grad_norm": 8.952782330368324, + "learning_rate": 4.8701235171953295e-05, + "loss": 2.3771, + "mean_token_accuracy": 0.4482758641242981, + "step": 152150 + }, + { + "epoch": 0.15325213201308166, + "grad_norm": 10.151318056025973, + "learning_rate": 4.8701109702126574e-05, + "loss": 2.3134, + "mean_token_accuracy": 0.4, + "step": 152155 + }, + { + "epoch": 0.15325716806618583, + "grad_norm": 10.935120848655497, + "learning_rate": 4.870098422641964e-05, + "loss": 2.4237, + "mean_token_accuracy": 0.3931034505367279, + "step": 152160 + }, + { + "epoch": 0.15326220411929, + "grad_norm": 11.7656470658916, + "learning_rate": 4.8700858744832545e-05, + "loss": 2.5806, + "mean_token_accuracy": 0.4, + "step": 152165 + }, + { + "epoch": 0.15326724017239418, + "grad_norm": 10.255123983177315, + "learning_rate": 4.870073325736531e-05, + "loss": 2.381, + "mean_token_accuracy": 0.42758620977401735, + "step": 152170 + }, + { + "epoch": 0.15327227622549836, + "grad_norm": 10.36466229374279, + "learning_rate": 4.870060776401797e-05, + "loss": 2.3818, + "mean_token_accuracy": 0.4379310369491577, + "step": 152175 + }, + { + "epoch": 0.15327731227860253, + "grad_norm": 9.550894712758543, + "learning_rate": 4.870048226479056e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.45320196747779845, + "step": 152180 + }, + { + "epoch": 0.15328234833170667, + "grad_norm": 10.255987789739978, + "learning_rate": 4.870035675968313e-05, + "loss": 2.3522, + "mean_token_accuracy": 0.44827587008476255, + "step": 152185 + }, + { + "epoch": 0.15328738438481085, + "grad_norm": 9.95569579867558, + "learning_rate": 4.870023124869569e-05, + "loss": 2.2056, + "mean_token_accuracy": 0.43448275327682495, + "step": 152190 + }, + { + "epoch": 0.15329242043791502, + "grad_norm": 8.818753028987645, + "learning_rate": 4.8700105731828286e-05, + "loss": 2.1763, + "mean_token_accuracy": 0.45741077959537507, + "step": 152195 + }, + { + "epoch": 0.1532974564910192, + "grad_norm": 11.142784617952936, + "learning_rate": 4.869998020908095e-05, + "loss": 1.9844, + "mean_token_accuracy": 0.48275862336158754, + "step": 152200 + }, + { + "epoch": 0.15330249254412337, + "grad_norm": 9.150038123525317, + "learning_rate": 4.869985468045373e-05, + "loss": 2.4487, + "mean_token_accuracy": 0.4586206912994385, + "step": 152205 + }, + { + "epoch": 0.15330752859722754, + "grad_norm": 12.007596270976562, + "learning_rate": 4.869972914594665e-05, + "loss": 2.5659, + "mean_token_accuracy": 0.3999999940395355, + "step": 152210 + }, + { + "epoch": 0.15331256465033172, + "grad_norm": 9.242306005622929, + "learning_rate": 4.869960360555974e-05, + "loss": 1.9762, + "mean_token_accuracy": 0.4896551787853241, + "step": 152215 + }, + { + "epoch": 0.1533176007034359, + "grad_norm": 11.780202907530558, + "learning_rate": 4.8699478059293055e-05, + "loss": 2.5922, + "mean_token_accuracy": 0.37241379022598264, + "step": 152220 + }, + { + "epoch": 0.15332263675654006, + "grad_norm": 11.240414962369448, + "learning_rate": 4.86993525071466e-05, + "loss": 2.3012, + "mean_token_accuracy": 0.45517241954803467, + "step": 152225 + }, + { + "epoch": 0.15332767280964424, + "grad_norm": 8.947508511221614, + "learning_rate": 4.869922694912044e-05, + "loss": 2.2212, + "mean_token_accuracy": 0.43793103098869324, + "step": 152230 + }, + { + "epoch": 0.1533327088627484, + "grad_norm": 10.289988593945553, + "learning_rate": 4.8699101385214584e-05, + "loss": 2.3248, + "mean_token_accuracy": 0.4310344815254211, + "step": 152235 + }, + { + "epoch": 0.1533377449158526, + "grad_norm": 9.24756338131609, + "learning_rate": 4.869897581542908e-05, + "loss": 2.4678, + "mean_token_accuracy": 0.4379310429096222, + "step": 152240 + }, + { + "epoch": 0.15334278096895676, + "grad_norm": 10.167413129838256, + "learning_rate": 4.869885023976396e-05, + "loss": 2.2453, + "mean_token_accuracy": 0.4551724135875702, + "step": 152245 + }, + { + "epoch": 0.15334781702206093, + "grad_norm": 10.324287830962248, + "learning_rate": 4.869872465821927e-05, + "loss": 2.4204, + "mean_token_accuracy": 0.42413793206214906, + "step": 152250 + }, + { + "epoch": 0.1533528530751651, + "grad_norm": 10.911024951241847, + "learning_rate": 4.869859907079501e-05, + "loss": 2.1514, + "mean_token_accuracy": 0.48475499749183654, + "step": 152255 + }, + { + "epoch": 0.15335788912826928, + "grad_norm": 9.945288471955433, + "learning_rate": 4.869847347749127e-05, + "loss": 2.0171, + "mean_token_accuracy": 0.4551724076271057, + "step": 152260 + }, + { + "epoch": 0.15336292518137346, + "grad_norm": 10.274558276005083, + "learning_rate": 4.8698347878308036e-05, + "loss": 2.4321, + "mean_token_accuracy": 0.4289776086807251, + "step": 152265 + }, + { + "epoch": 0.15336796123447763, + "grad_norm": 12.1782921480261, + "learning_rate": 4.869822227324536e-05, + "loss": 2.8446, + "mean_token_accuracy": 0.4137930989265442, + "step": 152270 + }, + { + "epoch": 0.1533729972875818, + "grad_norm": 18.565229509773516, + "learning_rate": 4.869809666230329e-05, + "loss": 2.5896, + "mean_token_accuracy": 0.4379310250282288, + "step": 152275 + }, + { + "epoch": 0.15337803334068598, + "grad_norm": 9.744418386786226, + "learning_rate": 4.869797104548184e-05, + "loss": 2.3869, + "mean_token_accuracy": 0.4344827592372894, + "step": 152280 + }, + { + "epoch": 0.15338306939379015, + "grad_norm": 11.295196495161099, + "learning_rate": 4.8697845422781056e-05, + "loss": 2.2613, + "mean_token_accuracy": 0.4361766457557678, + "step": 152285 + }, + { + "epoch": 0.15338810544689432, + "grad_norm": 10.025867934039303, + "learning_rate": 4.869771979420097e-05, + "loss": 2.5179, + "mean_token_accuracy": 0.4310344815254211, + "step": 152290 + }, + { + "epoch": 0.1533931414999985, + "grad_norm": 13.330858239719818, + "learning_rate": 4.8697594159741615e-05, + "loss": 2.7727, + "mean_token_accuracy": 0.3999999940395355, + "step": 152295 + }, + { + "epoch": 0.15339817755310267, + "grad_norm": 7.797643653476521, + "learning_rate": 4.8697468519403034e-05, + "loss": 1.9521, + "mean_token_accuracy": 0.5505142211914062, + "step": 152300 + }, + { + "epoch": 0.15340321360620685, + "grad_norm": 10.075622810958173, + "learning_rate": 4.869734287318525e-05, + "loss": 2.1191, + "mean_token_accuracy": 0.4448275864124298, + "step": 152305 + }, + { + "epoch": 0.15340824965931102, + "grad_norm": 9.390315734396285, + "learning_rate": 4.869721722108831e-05, + "loss": 2.2317, + "mean_token_accuracy": 0.48620688915252686, + "step": 152310 + }, + { + "epoch": 0.1534132857124152, + "grad_norm": 8.566844396187701, + "learning_rate": 4.869709156311224e-05, + "loss": 2.1328, + "mean_token_accuracy": 0.42758620977401735, + "step": 152315 + }, + { + "epoch": 0.15341832176551937, + "grad_norm": 9.24133103355712, + "learning_rate": 4.869696589925708e-05, + "loss": 2.4533, + "mean_token_accuracy": 0.42068964838981626, + "step": 152320 + }, + { + "epoch": 0.1534233578186235, + "grad_norm": 8.391797864011842, + "learning_rate": 4.8696840229522856e-05, + "loss": 2.1019, + "mean_token_accuracy": 0.48275861144065857, + "step": 152325 + }, + { + "epoch": 0.1534283938717277, + "grad_norm": 8.234138451324727, + "learning_rate": 4.8696714553909616e-05, + "loss": 2.3108, + "mean_token_accuracy": 0.4379310429096222, + "step": 152330 + }, + { + "epoch": 0.15343342992483186, + "grad_norm": 9.849756764974499, + "learning_rate": 4.8696588872417384e-05, + "loss": 2.4796, + "mean_token_accuracy": 0.42413793206214906, + "step": 152335 + }, + { + "epoch": 0.15343846597793603, + "grad_norm": 8.894894113432109, + "learning_rate": 4.86964631850462e-05, + "loss": 2.2184, + "mean_token_accuracy": 0.49836660623550416, + "step": 152340 + }, + { + "epoch": 0.1534435020310402, + "grad_norm": 11.793901180383713, + "learning_rate": 4.869633749179609e-05, + "loss": 2.7683, + "mean_token_accuracy": 0.3551724135875702, + "step": 152345 + }, + { + "epoch": 0.15344853808414438, + "grad_norm": 11.082730339311247, + "learning_rate": 4.8696211792667106e-05, + "loss": 2.6311, + "mean_token_accuracy": 0.441379314661026, + "step": 152350 + }, + { + "epoch": 0.15345357413724856, + "grad_norm": 9.583236600994754, + "learning_rate": 4.8696086087659275e-05, + "loss": 2.4572, + "mean_token_accuracy": 0.45359950661659243, + "step": 152355 + }, + { + "epoch": 0.15345861019035273, + "grad_norm": 20.854441692991973, + "learning_rate": 4.869596037677263e-05, + "loss": 2.8506, + "mean_token_accuracy": 0.40000000298023225, + "step": 152360 + }, + { + "epoch": 0.1534636462434569, + "grad_norm": 10.36072750664657, + "learning_rate": 4.86958346600072e-05, + "loss": 2.8644, + "mean_token_accuracy": 0.38620689511299133, + "step": 152365 + }, + { + "epoch": 0.15346868229656108, + "grad_norm": 11.308472932885493, + "learning_rate": 4.869570893736303e-05, + "loss": 2.3472, + "mean_token_accuracy": 0.44646098613739016, + "step": 152370 + }, + { + "epoch": 0.15347371834966525, + "grad_norm": 9.414311120096766, + "learning_rate": 4.869558320884015e-05, + "loss": 2.4039, + "mean_token_accuracy": 0.441379314661026, + "step": 152375 + }, + { + "epoch": 0.15347875440276942, + "grad_norm": 12.742787954971101, + "learning_rate": 4.86954574744386e-05, + "loss": 2.5986, + "mean_token_accuracy": 0.3862069010734558, + "step": 152380 + }, + { + "epoch": 0.1534837904558736, + "grad_norm": 10.168780960808778, + "learning_rate": 4.869533173415841e-05, + "loss": 2.1009, + "mean_token_accuracy": 0.45686630010604856, + "step": 152385 + }, + { + "epoch": 0.15348882650897777, + "grad_norm": 10.078630702096303, + "learning_rate": 4.8695205987999614e-05, + "loss": 2.3917, + "mean_token_accuracy": 0.4172413766384125, + "step": 152390 + }, + { + "epoch": 0.15349386256208195, + "grad_norm": 9.863909397321065, + "learning_rate": 4.869508023596224e-05, + "loss": 2.5403, + "mean_token_accuracy": 0.42068966031074523, + "step": 152395 + }, + { + "epoch": 0.15349889861518612, + "grad_norm": 10.669192221008938, + "learning_rate": 4.869495447804634e-05, + "loss": 2.2612, + "mean_token_accuracy": 0.4896551787853241, + "step": 152400 + }, + { + "epoch": 0.1535039346682903, + "grad_norm": 11.188126843425506, + "learning_rate": 4.869482871425194e-05, + "loss": 2.0641, + "mean_token_accuracy": 0.4699507415294647, + "step": 152405 + }, + { + "epoch": 0.15350897072139447, + "grad_norm": 10.829038445867432, + "learning_rate": 4.8694702944579076e-05, + "loss": 2.6204, + "mean_token_accuracy": 0.36896551847457887, + "step": 152410 + }, + { + "epoch": 0.15351400677449864, + "grad_norm": 10.089142619140494, + "learning_rate": 4.8694577169027786e-05, + "loss": 2.0657, + "mean_token_accuracy": 0.46896551847457885, + "step": 152415 + }, + { + "epoch": 0.15351904282760281, + "grad_norm": 9.504665646203092, + "learning_rate": 4.8694451387598094e-05, + "loss": 2.3952, + "mean_token_accuracy": 0.4551724076271057, + "step": 152420 + }, + { + "epoch": 0.153524078880707, + "grad_norm": 8.25398137991359, + "learning_rate": 4.869432560029004e-05, + "loss": 2.2875, + "mean_token_accuracy": 0.48620688915252686, + "step": 152425 + }, + { + "epoch": 0.15352911493381116, + "grad_norm": 8.733203931560963, + "learning_rate": 4.8694199807103675e-05, + "loss": 2.2285, + "mean_token_accuracy": 0.4137930929660797, + "step": 152430 + }, + { + "epoch": 0.15353415098691534, + "grad_norm": 11.304386946503627, + "learning_rate": 4.8694074008039e-05, + "loss": 2.2537, + "mean_token_accuracy": 0.47241378426551817, + "step": 152435 + }, + { + "epoch": 0.1535391870400195, + "grad_norm": 10.675690583665103, + "learning_rate": 4.869394820309609e-05, + "loss": 2.404, + "mean_token_accuracy": 0.41379310488700866, + "step": 152440 + }, + { + "epoch": 0.15354422309312368, + "grad_norm": 9.103331158613084, + "learning_rate": 4.869382239227494e-05, + "loss": 2.455, + "mean_token_accuracy": 0.4034482777118683, + "step": 152445 + }, + { + "epoch": 0.15354925914622786, + "grad_norm": 9.924707943636589, + "learning_rate": 4.869369657557561e-05, + "loss": 2.3058, + "mean_token_accuracy": 0.48451300859451296, + "step": 152450 + }, + { + "epoch": 0.15355429519933203, + "grad_norm": 10.28258156921134, + "learning_rate": 4.8693570752998137e-05, + "loss": 2.3101, + "mean_token_accuracy": 0.4517241358757019, + "step": 152455 + }, + { + "epoch": 0.1535593312524362, + "grad_norm": 9.38510570628702, + "learning_rate": 4.8693444924542544e-05, + "loss": 2.3647, + "mean_token_accuracy": 0.42758620977401735, + "step": 152460 + }, + { + "epoch": 0.15356436730554035, + "grad_norm": 9.419868744950376, + "learning_rate": 4.869331909020886e-05, + "loss": 2.3943, + "mean_token_accuracy": 0.4793103516101837, + "step": 152465 + }, + { + "epoch": 0.15356940335864452, + "grad_norm": 9.43359204304903, + "learning_rate": 4.8693193249997144e-05, + "loss": 2.6575, + "mean_token_accuracy": 0.42607380747795104, + "step": 152470 + }, + { + "epoch": 0.1535744394117487, + "grad_norm": 9.171541503348404, + "learning_rate": 4.869306740390741e-05, + "loss": 2.0849, + "mean_token_accuracy": 0.4413793087005615, + "step": 152475 + }, + { + "epoch": 0.15357947546485287, + "grad_norm": 9.651402615198204, + "learning_rate": 4.86929415519397e-05, + "loss": 2.3366, + "mean_token_accuracy": 0.43629764318466185, + "step": 152480 + }, + { + "epoch": 0.15358451151795705, + "grad_norm": 9.652629114021362, + "learning_rate": 4.8692815694094044e-05, + "loss": 2.4831, + "mean_token_accuracy": 0.4344827592372894, + "step": 152485 + }, + { + "epoch": 0.15358954757106122, + "grad_norm": 8.241141436201081, + "learning_rate": 4.869268983037049e-05, + "loss": 2.4756, + "mean_token_accuracy": 0.4275861978530884, + "step": 152490 + }, + { + "epoch": 0.1535945836241654, + "grad_norm": 9.494076878201364, + "learning_rate": 4.869256396076906e-05, + "loss": 2.2705, + "mean_token_accuracy": 0.4413793087005615, + "step": 152495 + }, + { + "epoch": 0.15359961967726957, + "grad_norm": 10.374822030694109, + "learning_rate": 4.86924380852898e-05, + "loss": 2.3682, + "mean_token_accuracy": 0.38965516686439516, + "step": 152500 + }, + { + "epoch": 0.15360465573037374, + "grad_norm": 9.93031576148973, + "learning_rate": 4.869231220393273e-05, + "loss": 2.3026, + "mean_token_accuracy": 0.4517241299152374, + "step": 152505 + }, + { + "epoch": 0.15360969178347791, + "grad_norm": 9.856929197692208, + "learning_rate": 4.86921863166979e-05, + "loss": 2.5612, + "mean_token_accuracy": 0.441379314661026, + "step": 152510 + }, + { + "epoch": 0.1536147278365821, + "grad_norm": 11.199658291288697, + "learning_rate": 4.869206042358533e-05, + "loss": 2.5062, + "mean_token_accuracy": 0.43611615896224976, + "step": 152515 + }, + { + "epoch": 0.15361976388968626, + "grad_norm": 11.68128292572958, + "learning_rate": 4.869193452459507e-05, + "loss": 2.1489, + "mean_token_accuracy": 0.458620685338974, + "step": 152520 + }, + { + "epoch": 0.15362479994279044, + "grad_norm": 10.499709392686029, + "learning_rate": 4.8691808619727145e-05, + "loss": 2.1773, + "mean_token_accuracy": 0.4758620738983154, + "step": 152525 + }, + { + "epoch": 0.1536298359958946, + "grad_norm": 16.147385061442417, + "learning_rate": 4.869168270898159e-05, + "loss": 2.494, + "mean_token_accuracy": 0.4222625494003296, + "step": 152530 + }, + { + "epoch": 0.15363487204899878, + "grad_norm": 12.015926411021631, + "learning_rate": 4.869155679235845e-05, + "loss": 2.6662, + "mean_token_accuracy": 0.38965516090393065, + "step": 152535 + }, + { + "epoch": 0.15363990810210296, + "grad_norm": 8.673566776580444, + "learning_rate": 4.869143086985775e-05, + "loss": 2.4433, + "mean_token_accuracy": 0.4448275864124298, + "step": 152540 + }, + { + "epoch": 0.15364494415520713, + "grad_norm": 10.714984107264097, + "learning_rate": 4.869130494147952e-05, + "loss": 2.4263, + "mean_token_accuracy": 0.41724138259887694, + "step": 152545 + }, + { + "epoch": 0.1536499802083113, + "grad_norm": 12.028512055636313, + "learning_rate": 4.869117900722382e-05, + "loss": 2.4154, + "mean_token_accuracy": 0.41034482717514037, + "step": 152550 + }, + { + "epoch": 0.15365501626141548, + "grad_norm": 9.3523473085196, + "learning_rate": 4.8691053067090644e-05, + "loss": 2.2619, + "mean_token_accuracy": 0.44137930274009707, + "step": 152555 + }, + { + "epoch": 0.15366005231451965, + "grad_norm": 8.842601992176446, + "learning_rate": 4.8690927121080065e-05, + "loss": 2.3186, + "mean_token_accuracy": 0.4620689570903778, + "step": 152560 + }, + { + "epoch": 0.15366508836762383, + "grad_norm": 9.489756434870225, + "learning_rate": 4.86908011691921e-05, + "loss": 2.2818, + "mean_token_accuracy": 0.4517241358757019, + "step": 152565 + }, + { + "epoch": 0.153670124420728, + "grad_norm": 10.603547930469745, + "learning_rate": 4.869067521142679e-05, + "loss": 2.2164, + "mean_token_accuracy": 0.47023593783378603, + "step": 152570 + }, + { + "epoch": 0.15367516047383217, + "grad_norm": 9.965384134069337, + "learning_rate": 4.8690549247784166e-05, + "loss": 2.8012, + "mean_token_accuracy": 0.4034482717514038, + "step": 152575 + }, + { + "epoch": 0.15368019652693635, + "grad_norm": 11.217262137012245, + "learning_rate": 4.869042327826426e-05, + "loss": 2.5843, + "mean_token_accuracy": 0.4000000059604645, + "step": 152580 + }, + { + "epoch": 0.15368523258004052, + "grad_norm": 9.858179119866252, + "learning_rate": 4.8690297302867115e-05, + "loss": 2.2966, + "mean_token_accuracy": 0.46896551847457885, + "step": 152585 + }, + { + "epoch": 0.1536902686331447, + "grad_norm": 12.404693301576899, + "learning_rate": 4.8690171321592766e-05, + "loss": 2.3784, + "mean_token_accuracy": 0.4137930989265442, + "step": 152590 + }, + { + "epoch": 0.15369530468624887, + "grad_norm": 10.284968920607268, + "learning_rate": 4.8690045334441235e-05, + "loss": 2.3856, + "mean_token_accuracy": 0.4620689570903778, + "step": 152595 + }, + { + "epoch": 0.15370034073935304, + "grad_norm": 9.776008089751452, + "learning_rate": 4.8689919341412575e-05, + "loss": 2.5279, + "mean_token_accuracy": 0.41560798287391665, + "step": 152600 + }, + { + "epoch": 0.1537053767924572, + "grad_norm": 8.931952165722855, + "learning_rate": 4.868979334250681e-05, + "loss": 2.0005, + "mean_token_accuracy": 0.5034482777118683, + "step": 152605 + }, + { + "epoch": 0.15371041284556136, + "grad_norm": 11.407742997655527, + "learning_rate": 4.868966733772397e-05, + "loss": 2.6623, + "mean_token_accuracy": 0.42413792610168455, + "step": 152610 + }, + { + "epoch": 0.15371544889866554, + "grad_norm": 9.301814285796274, + "learning_rate": 4.86895413270641e-05, + "loss": 1.9567, + "mean_token_accuracy": 0.4689655125141144, + "step": 152615 + }, + { + "epoch": 0.1537204849517697, + "grad_norm": 9.466167801782776, + "learning_rate": 4.868941531052723e-05, + "loss": 2.2248, + "mean_token_accuracy": 0.47586206793785096, + "step": 152620 + }, + { + "epoch": 0.15372552100487388, + "grad_norm": 10.341738617332876, + "learning_rate": 4.8689289288113396e-05, + "loss": 2.3502, + "mean_token_accuracy": 0.41379311084747317, + "step": 152625 + }, + { + "epoch": 0.15373055705797806, + "grad_norm": 8.762419214356752, + "learning_rate": 4.868916325982264e-05, + "loss": 2.4355, + "mean_token_accuracy": 0.4310344815254211, + "step": 152630 + }, + { + "epoch": 0.15373559311108223, + "grad_norm": 11.003419269371657, + "learning_rate": 4.868903722565499e-05, + "loss": 2.6732, + "mean_token_accuracy": 0.40514216423034666, + "step": 152635 + }, + { + "epoch": 0.1537406291641864, + "grad_norm": 10.187576999514388, + "learning_rate": 4.8688911185610476e-05, + "loss": 2.4305, + "mean_token_accuracy": 0.3827586233615875, + "step": 152640 + }, + { + "epoch": 0.15374566521729058, + "grad_norm": 9.901928476456222, + "learning_rate": 4.8688785139689136e-05, + "loss": 2.7794, + "mean_token_accuracy": 0.4068965554237366, + "step": 152645 + }, + { + "epoch": 0.15375070127039475, + "grad_norm": 9.43769074776903, + "learning_rate": 4.868865908789101e-05, + "loss": 2.6365, + "mean_token_accuracy": 0.3931034505367279, + "step": 152650 + }, + { + "epoch": 0.15375573732349893, + "grad_norm": 11.635205249043453, + "learning_rate": 4.8688533030216135e-05, + "loss": 2.5248, + "mean_token_accuracy": 0.417241370677948, + "step": 152655 + }, + { + "epoch": 0.1537607733766031, + "grad_norm": 10.615360249817746, + "learning_rate": 4.868840696666454e-05, + "loss": 2.6765, + "mean_token_accuracy": 0.4172413766384125, + "step": 152660 + }, + { + "epoch": 0.15376580942970727, + "grad_norm": 9.41859958477984, + "learning_rate": 4.868828089723626e-05, + "loss": 2.141, + "mean_token_accuracy": 0.4517241358757019, + "step": 152665 + }, + { + "epoch": 0.15377084548281145, + "grad_norm": 12.239592175748397, + "learning_rate": 4.868815482193133e-05, + "loss": 2.26, + "mean_token_accuracy": 0.4517241358757019, + "step": 152670 + }, + { + "epoch": 0.15377588153591562, + "grad_norm": 9.621273044316686, + "learning_rate": 4.868802874074979e-05, + "loss": 2.4762, + "mean_token_accuracy": 0.4137931138277054, + "step": 152675 + }, + { + "epoch": 0.1537809175890198, + "grad_norm": 15.60293971843449, + "learning_rate": 4.8687902653691664e-05, + "loss": 2.8578, + "mean_token_accuracy": 0.4068965524435043, + "step": 152680 + }, + { + "epoch": 0.15378595364212397, + "grad_norm": 10.757539670076417, + "learning_rate": 4.8687776560757007e-05, + "loss": 2.5652, + "mean_token_accuracy": 0.3999999940395355, + "step": 152685 + }, + { + "epoch": 0.15379098969522814, + "grad_norm": 11.638909722825764, + "learning_rate": 4.8687650461945824e-05, + "loss": 2.1667, + "mean_token_accuracy": 0.4896551787853241, + "step": 152690 + }, + { + "epoch": 0.15379602574833232, + "grad_norm": 12.017498981453661, + "learning_rate": 4.868752435725818e-05, + "loss": 2.1646, + "mean_token_accuracy": 0.46896551847457885, + "step": 152695 + }, + { + "epoch": 0.1538010618014365, + "grad_norm": 11.106545452130167, + "learning_rate": 4.8687398246694096e-05, + "loss": 2.45, + "mean_token_accuracy": 0.41379310488700866, + "step": 152700 + }, + { + "epoch": 0.15380609785454066, + "grad_norm": 11.697980917622623, + "learning_rate": 4.8687272130253603e-05, + "loss": 2.2245, + "mean_token_accuracy": 0.41724138259887694, + "step": 152705 + }, + { + "epoch": 0.15381113390764484, + "grad_norm": 8.418259308360632, + "learning_rate": 4.868714600793675e-05, + "loss": 2.3768, + "mean_token_accuracy": 0.44827587008476255, + "step": 152710 + }, + { + "epoch": 0.153816169960749, + "grad_norm": 9.492873589694012, + "learning_rate": 4.868701987974355e-05, + "loss": 2.3504, + "mean_token_accuracy": 0.4034482777118683, + "step": 152715 + }, + { + "epoch": 0.15382120601385318, + "grad_norm": 8.892302044764094, + "learning_rate": 4.8686893745674055e-05, + "loss": 2.2816, + "mean_token_accuracy": 0.42758620381355283, + "step": 152720 + }, + { + "epoch": 0.15382624206695736, + "grad_norm": 10.67497047776544, + "learning_rate": 4.8686767605728296e-05, + "loss": 2.5715, + "mean_token_accuracy": 0.4, + "step": 152725 + }, + { + "epoch": 0.15383127812006153, + "grad_norm": 9.75752235937745, + "learning_rate": 4.868664145990631e-05, + "loss": 2.7417, + "mean_token_accuracy": 0.4000000059604645, + "step": 152730 + }, + { + "epoch": 0.1538363141731657, + "grad_norm": 9.128268119289027, + "learning_rate": 4.8686515308208133e-05, + "loss": 1.8361, + "mean_token_accuracy": 0.5448275864124298, + "step": 152735 + }, + { + "epoch": 0.15384135022626988, + "grad_norm": 9.693243609545528, + "learning_rate": 4.868638915063379e-05, + "loss": 2.2404, + "mean_token_accuracy": 0.47241379618644713, + "step": 152740 + }, + { + "epoch": 0.15384638627937403, + "grad_norm": 10.447568259107445, + "learning_rate": 4.8686262987183335e-05, + "loss": 2.4169, + "mean_token_accuracy": 0.4344827592372894, + "step": 152745 + }, + { + "epoch": 0.1538514223324782, + "grad_norm": 9.82108706086436, + "learning_rate": 4.868613681785678e-05, + "loss": 2.3081, + "mean_token_accuracy": 0.46551724076271056, + "step": 152750 + }, + { + "epoch": 0.15385645838558237, + "grad_norm": 10.268381460801237, + "learning_rate": 4.868601064265417e-05, + "loss": 2.5582, + "mean_token_accuracy": 0.4551724135875702, + "step": 152755 + }, + { + "epoch": 0.15386149443868655, + "grad_norm": 12.650362365762081, + "learning_rate": 4.868588446157555e-05, + "loss": 3.0257, + "mean_token_accuracy": 0.34482758641242983, + "step": 152760 + }, + { + "epoch": 0.15386653049179072, + "grad_norm": 7.7637909385303985, + "learning_rate": 4.868575827462094e-05, + "loss": 2.3698, + "mean_token_accuracy": 0.45862067937850953, + "step": 152765 + }, + { + "epoch": 0.1538715665448949, + "grad_norm": 10.666801660482077, + "learning_rate": 4.868563208179038e-05, + "loss": 2.304, + "mean_token_accuracy": 0.42758620977401735, + "step": 152770 + }, + { + "epoch": 0.15387660259799907, + "grad_norm": 11.625868395731853, + "learning_rate": 4.868550588308391e-05, + "loss": 2.2665, + "mean_token_accuracy": 0.4448275864124298, + "step": 152775 + }, + { + "epoch": 0.15388163865110324, + "grad_norm": 14.713889818267283, + "learning_rate": 4.8685379678501555e-05, + "loss": 2.5167, + "mean_token_accuracy": 0.4620689690113068, + "step": 152780 + }, + { + "epoch": 0.15388667470420742, + "grad_norm": 11.527496518353503, + "learning_rate": 4.868525346804336e-05, + "loss": 2.2512, + "mean_token_accuracy": 0.43103447556495667, + "step": 152785 + }, + { + "epoch": 0.1538917107573116, + "grad_norm": 11.332514694420142, + "learning_rate": 4.868512725170936e-05, + "loss": 2.369, + "mean_token_accuracy": 0.382758629322052, + "step": 152790 + }, + { + "epoch": 0.15389674681041576, + "grad_norm": 16.29999719625112, + "learning_rate": 4.868500102949958e-05, + "loss": 2.173, + "mean_token_accuracy": 0.40689654648303986, + "step": 152795 + }, + { + "epoch": 0.15390178286351994, + "grad_norm": 8.796989125108604, + "learning_rate": 4.868487480141406e-05, + "loss": 2.0905, + "mean_token_accuracy": 0.46896551847457885, + "step": 152800 + }, + { + "epoch": 0.1539068189166241, + "grad_norm": 10.951496891262385, + "learning_rate": 4.868474856745284e-05, + "loss": 2.3651, + "mean_token_accuracy": 0.47241379618644713, + "step": 152805 + }, + { + "epoch": 0.15391185496972828, + "grad_norm": 9.474121639424538, + "learning_rate": 4.8684622327615944e-05, + "loss": 2.2049, + "mean_token_accuracy": 0.510344821214676, + "step": 152810 + }, + { + "epoch": 0.15391689102283246, + "grad_norm": 13.570609595405084, + "learning_rate": 4.868449608190342e-05, + "loss": 2.312, + "mean_token_accuracy": 0.49999999403953554, + "step": 152815 + }, + { + "epoch": 0.15392192707593663, + "grad_norm": 12.135781499796394, + "learning_rate": 4.8684369830315295e-05, + "loss": 2.6659, + "mean_token_accuracy": 0.38275861740112305, + "step": 152820 + }, + { + "epoch": 0.1539269631290408, + "grad_norm": 15.067170887591697, + "learning_rate": 4.868424357285161e-05, + "loss": 2.2609, + "mean_token_accuracy": 0.4344827592372894, + "step": 152825 + }, + { + "epoch": 0.15393199918214498, + "grad_norm": 16.406711908220593, + "learning_rate": 4.8684117309512395e-05, + "loss": 2.7508, + "mean_token_accuracy": 0.38275861740112305, + "step": 152830 + }, + { + "epoch": 0.15393703523524915, + "grad_norm": 8.737784514812487, + "learning_rate": 4.8683991040297686e-05, + "loss": 2.2995, + "mean_token_accuracy": 0.41034482717514037, + "step": 152835 + }, + { + "epoch": 0.15394207128835333, + "grad_norm": 9.350975427811166, + "learning_rate": 4.868386476520751e-05, + "loss": 1.9568, + "mean_token_accuracy": 0.49999999403953554, + "step": 152840 + }, + { + "epoch": 0.1539471073414575, + "grad_norm": 11.035349567187088, + "learning_rate": 4.868373848424191e-05, + "loss": 2.6929, + "mean_token_accuracy": 0.37586206793785093, + "step": 152845 + }, + { + "epoch": 0.15395214339456167, + "grad_norm": 12.079743759526588, + "learning_rate": 4.8683612197400936e-05, + "loss": 2.3261, + "mean_token_accuracy": 0.43103448748588563, + "step": 152850 + }, + { + "epoch": 0.15395717944766585, + "grad_norm": 9.01501846629349, + "learning_rate": 4.868348590468459e-05, + "loss": 2.2341, + "mean_token_accuracy": 0.4896551787853241, + "step": 152855 + }, + { + "epoch": 0.15396221550077002, + "grad_norm": 12.127958586624136, + "learning_rate": 4.8683359606092945e-05, + "loss": 2.3126, + "mean_token_accuracy": 0.4344827592372894, + "step": 152860 + }, + { + "epoch": 0.1539672515538742, + "grad_norm": 10.334268476539346, + "learning_rate": 4.8683233301626005e-05, + "loss": 2.3268, + "mean_token_accuracy": 0.4448275864124298, + "step": 152865 + }, + { + "epoch": 0.15397228760697837, + "grad_norm": 11.005755890460767, + "learning_rate": 4.8683106991283815e-05, + "loss": 2.2639, + "mean_token_accuracy": 0.4379310429096222, + "step": 152870 + }, + { + "epoch": 0.15397732366008254, + "grad_norm": 9.102217154011607, + "learning_rate": 4.8682980675066414e-05, + "loss": 2.0473, + "mean_token_accuracy": 0.517241370677948, + "step": 152875 + }, + { + "epoch": 0.15398235971318672, + "grad_norm": 9.966187980874091, + "learning_rate": 4.868285435297383e-05, + "loss": 2.5421, + "mean_token_accuracy": 0.42758620977401735, + "step": 152880 + }, + { + "epoch": 0.15398739576629086, + "grad_norm": 10.82624557080725, + "learning_rate": 4.868272802500611e-05, + "loss": 2.3233, + "mean_token_accuracy": 0.4620689690113068, + "step": 152885 + }, + { + "epoch": 0.15399243181939504, + "grad_norm": 11.412644996767245, + "learning_rate": 4.8682601691163276e-05, + "loss": 2.3123, + "mean_token_accuracy": 0.4206896543502808, + "step": 152890 + }, + { + "epoch": 0.1539974678724992, + "grad_norm": 11.551899775696123, + "learning_rate": 4.8682475351445366e-05, + "loss": 2.5273, + "mean_token_accuracy": 0.38965516686439516, + "step": 152895 + }, + { + "epoch": 0.15400250392560338, + "grad_norm": 10.24828059053627, + "learning_rate": 4.868234900585242e-05, + "loss": 2.3307, + "mean_token_accuracy": 0.42413792610168455, + "step": 152900 + }, + { + "epoch": 0.15400753997870756, + "grad_norm": 9.266559997527391, + "learning_rate": 4.868222265438447e-05, + "loss": 2.3876, + "mean_token_accuracy": 0.47931034564971925, + "step": 152905 + }, + { + "epoch": 0.15401257603181173, + "grad_norm": 9.33744208582358, + "learning_rate": 4.868209629704156e-05, + "loss": 2.1558, + "mean_token_accuracy": 0.43448275327682495, + "step": 152910 + }, + { + "epoch": 0.1540176120849159, + "grad_norm": 12.2049612651845, + "learning_rate": 4.868196993382371e-05, + "loss": 2.244, + "mean_token_accuracy": 0.4601935833692551, + "step": 152915 + }, + { + "epoch": 0.15402264813802008, + "grad_norm": 9.968679309644138, + "learning_rate": 4.8681843564730964e-05, + "loss": 2.1679, + "mean_token_accuracy": 0.4809437394142151, + "step": 152920 + }, + { + "epoch": 0.15402768419112425, + "grad_norm": 11.445931883220927, + "learning_rate": 4.868171718976335e-05, + "loss": 2.4562, + "mean_token_accuracy": 0.441379314661026, + "step": 152925 + }, + { + "epoch": 0.15403272024422843, + "grad_norm": 13.253691668134923, + "learning_rate": 4.8681590808920914e-05, + "loss": 2.882, + "mean_token_accuracy": 0.34482757449150087, + "step": 152930 + }, + { + "epoch": 0.1540377562973326, + "grad_norm": 12.277239813837044, + "learning_rate": 4.8681464422203674e-05, + "loss": 2.4737, + "mean_token_accuracy": 0.41379310488700866, + "step": 152935 + }, + { + "epoch": 0.15404279235043677, + "grad_norm": 11.189818215633903, + "learning_rate": 4.8681338029611684e-05, + "loss": 2.4426, + "mean_token_accuracy": 0.44827585816383364, + "step": 152940 + }, + { + "epoch": 0.15404782840354095, + "grad_norm": 11.866710622062639, + "learning_rate": 4.868121163114497e-05, + "loss": 2.2985, + "mean_token_accuracy": 0.4034482777118683, + "step": 152945 + }, + { + "epoch": 0.15405286445664512, + "grad_norm": 10.561943385692002, + "learning_rate": 4.8681085226803564e-05, + "loss": 2.2836, + "mean_token_accuracy": 0.42758620977401735, + "step": 152950 + }, + { + "epoch": 0.1540579005097493, + "grad_norm": 11.644118161482014, + "learning_rate": 4.868095881658751e-05, + "loss": 2.0437, + "mean_token_accuracy": 0.5137931108474731, + "step": 152955 + }, + { + "epoch": 0.15406293656285347, + "grad_norm": 12.155367878415287, + "learning_rate": 4.868083240049683e-05, + "loss": 2.3164, + "mean_token_accuracy": 0.4344827473163605, + "step": 152960 + }, + { + "epoch": 0.15406797261595764, + "grad_norm": 11.531993944287722, + "learning_rate": 4.868070597853158e-05, + "loss": 2.3746, + "mean_token_accuracy": 0.4379310369491577, + "step": 152965 + }, + { + "epoch": 0.15407300866906182, + "grad_norm": 12.077769907358492, + "learning_rate": 4.8680579550691774e-05, + "loss": 2.3574, + "mean_token_accuracy": 0.43986691236495973, + "step": 152970 + }, + { + "epoch": 0.154078044722166, + "grad_norm": 11.647663613278928, + "learning_rate": 4.868045311697746e-05, + "loss": 2.3383, + "mean_token_accuracy": 0.4655172348022461, + "step": 152975 + }, + { + "epoch": 0.15408308077527016, + "grad_norm": 10.795617295775722, + "learning_rate": 4.868032667738866e-05, + "loss": 2.112, + "mean_token_accuracy": 0.4724138081073761, + "step": 152980 + }, + { + "epoch": 0.15408811682837434, + "grad_norm": 12.275849309866002, + "learning_rate": 4.868020023192542e-05, + "loss": 2.1301, + "mean_token_accuracy": 0.4551724135875702, + "step": 152985 + }, + { + "epoch": 0.1540931528814785, + "grad_norm": 9.635435076566564, + "learning_rate": 4.8680073780587774e-05, + "loss": 1.9659, + "mean_token_accuracy": 0.4862068951129913, + "step": 152990 + }, + { + "epoch": 0.15409818893458269, + "grad_norm": 37.426696374871334, + "learning_rate": 4.8679947323375756e-05, + "loss": 2.8081, + "mean_token_accuracy": 0.4655172348022461, + "step": 152995 + }, + { + "epoch": 0.15410322498768686, + "grad_norm": 8.485265836781963, + "learning_rate": 4.86798208602894e-05, + "loss": 2.2371, + "mean_token_accuracy": 0.493103438615799, + "step": 153000 + }, + { + "epoch": 0.15410322498768686, + "step": 153000, + "total_flos": 3941084160000.0, + "train_loss": 0.0, + "train_runtime": 2.3821, + "train_samples_per_second": 4198.041, + "train_steps_per_second": 16792.163 + } + ], + "logging_steps": 5, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3941084160000.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}