diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21763 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12059, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.292561572269674e-05, + "grad_norm": 4.255775451660156, + "learning_rate": 0.0, + "loss": 1.7068, + "mean_token_accuracy": 0.6242363229393959, + "num_tokens": 32768.0, + "step": 1 + }, + { + "epoch": 0.0004146280786134837, + "grad_norm": 4.439743995666504, + "learning_rate": 3.316749585406302e-07, + "loss": 1.6681, + "mean_token_accuracy": 0.6322473715990782, + "num_tokens": 163840.0, + "step": 5 + }, + { + "epoch": 0.0008292561572269675, + "grad_norm": 4.379396915435791, + "learning_rate": 7.462686567164179e-07, + "loss": 1.6459, + "mean_token_accuracy": 0.6347201891243458, + "num_tokens": 327680.0, + "step": 10 + }, + { + "epoch": 0.0012438842358404511, + "grad_norm": 3.9361932277679443, + "learning_rate": 1.1608623548922056e-06, + "loss": 1.6593, + "mean_token_accuracy": 0.6337243393063545, + "num_tokens": 491520.0, + "step": 15 + }, + { + "epoch": 0.001658512314453935, + "grad_norm": 3.2223989963531494, + "learning_rate": 1.5754560530679936e-06, + "loss": 1.6504, + "mean_token_accuracy": 0.6342436477541924, + "num_tokens": 655360.0, + "step": 20 + }, + { + "epoch": 0.0020731403930674187, + "grad_norm": 3.0505387783050537, + "learning_rate": 1.9900497512437813e-06, + "loss": 1.6054, + "mean_token_accuracy": 0.642295940220356, + "num_tokens": 819200.0, + "step": 25 + }, + { + "epoch": 0.0024877684716809022, + "grad_norm": 2.7383947372436523, + "learning_rate": 2.404643449419569e-06, + "loss": 1.5684, + "mean_token_accuracy": 0.641458946466446, + "num_tokens": 983040.0, + "step": 30 + }, + { + "epoch": 0.002902396550294386, + "grad_norm": 2.6609301567077637, + "learning_rate": 2.819237147595357e-06, + "loss": 1.5282, + "mean_token_accuracy": 0.6450574293732643, + "num_tokens": 1146880.0, + "step": 35 + }, + { + "epoch": 0.00331702462890787, + "grad_norm": 2.531510829925537, + "learning_rate": 3.233830845771144e-06, + "loss": 1.5194, + "mean_token_accuracy": 0.6455400750041008, + "num_tokens": 1310720.0, + "step": 40 + }, + { + "epoch": 0.0037316527075213534, + "grad_norm": 2.3891067504882812, + "learning_rate": 3.6484245439469323e-06, + "loss": 1.5524, + "mean_token_accuracy": 0.6400598779320716, + "num_tokens": 1474560.0, + "step": 45 + }, + { + "epoch": 0.004146280786134837, + "grad_norm": 2.4584314823150635, + "learning_rate": 4.0630182421227205e-06, + "loss": 1.5161, + "mean_token_accuracy": 0.6443792775273323, + "num_tokens": 1638400.0, + "step": 50 + }, + { + "epoch": 0.0045609088647483205, + "grad_norm": 2.7202205657958984, + "learning_rate": 4.477611940298508e-06, + "loss": 1.451, + "mean_token_accuracy": 0.6507514685392379, + "num_tokens": 1802240.0, + "step": 55 + }, + { + "epoch": 0.0049755369433618045, + "grad_norm": 2.1469480991363525, + "learning_rate": 4.892205638474295e-06, + "loss": 1.4507, + "mean_token_accuracy": 0.6539161786437034, + "num_tokens": 1966080.0, + "step": 60 + }, + { + "epoch": 0.0053901650219752885, + "grad_norm": 2.370760679244995, + "learning_rate": 5.306799336650083e-06, + "loss": 1.4258, + "mean_token_accuracy": 0.6556940346956253, + "num_tokens": 2129920.0, + "step": 65 + }, + { + "epoch": 0.005804793100588772, + "grad_norm": 2.206936836242676, + "learning_rate": 5.7213930348258714e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.665120966732502, + "num_tokens": 2293760.0, + "step": 70 + }, + { + "epoch": 0.006219421179202256, + "grad_norm": 2.31195330619812, + "learning_rate": 6.135986733001659e-06, + "loss": 1.4497, + "mean_token_accuracy": 0.6530730664730072, + "num_tokens": 2457600.0, + "step": 75 + }, + { + "epoch": 0.00663404925781574, + "grad_norm": 2.3220083713531494, + "learning_rate": 6.550580431177446e-06, + "loss": 1.3777, + "mean_token_accuracy": 0.6623583763837815, + "num_tokens": 2621056.0, + "step": 80 + }, + { + "epoch": 0.007048677336429223, + "grad_norm": 2.3705880641937256, + "learning_rate": 6.965174129353234e-06, + "loss": 1.4263, + "mean_token_accuracy": 0.6539833784103394, + "num_tokens": 2784896.0, + "step": 85 + }, + { + "epoch": 0.007463305415042707, + "grad_norm": 2.4472298622131348, + "learning_rate": 7.3797678275290215e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6581011697649956, + "num_tokens": 2948736.0, + "step": 90 + }, + { + "epoch": 0.00787793349365619, + "grad_norm": 2.323025941848755, + "learning_rate": 7.79436152570481e-06, + "loss": 1.3931, + "mean_token_accuracy": 0.6656957253813743, + "num_tokens": 3111601.0, + "step": 95 + }, + { + "epoch": 0.008292561572269675, + "grad_norm": 2.3700666427612305, + "learning_rate": 8.208955223880597e-06, + "loss": 1.4302, + "mean_token_accuracy": 0.6556268364191056, + "num_tokens": 3275441.0, + "step": 100 + }, + { + "epoch": 0.008707189650883157, + "grad_norm": 2.247762441635132, + "learning_rate": 8.623548922056384e-06, + "loss": 1.3676, + "mean_token_accuracy": 0.6675586551427841, + "num_tokens": 3439281.0, + "step": 105 + }, + { + "epoch": 0.009121817729496641, + "grad_norm": 2.2979612350463867, + "learning_rate": 9.038142620232173e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6681207224726677, + "num_tokens": 3603121.0, + "step": 110 + }, + { + "epoch": 0.009536445808110125, + "grad_norm": 2.7338132858276367, + "learning_rate": 9.45273631840796e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6656109973788261, + "num_tokens": 3766936.0, + "step": 115 + }, + { + "epoch": 0.009951073886723609, + "grad_norm": 2.2777035236358643, + "learning_rate": 9.867330016583748e-06, + "loss": 1.3427, + "mean_token_accuracy": 0.6667766377329827, + "num_tokens": 3930776.0, + "step": 120 + }, + { + "epoch": 0.010365701965337093, + "grad_norm": 2.358052968978882, + "learning_rate": 1.0281923714759537e-05, + "loss": 1.3618, + "mean_token_accuracy": 0.6643939435482025, + "num_tokens": 4094616.0, + "step": 125 + }, + { + "epoch": 0.010780330043950577, + "grad_norm": 2.2697947025299072, + "learning_rate": 1.0696517412935324e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6709677398204803, + "num_tokens": 4258456.0, + "step": 130 + }, + { + "epoch": 0.01119495812256406, + "grad_norm": 2.2408878803253174, + "learning_rate": 1.1111111111111112e-05, + "loss": 1.3369, + "mean_token_accuracy": 0.6672592878341674, + "num_tokens": 4422296.0, + "step": 135 + }, + { + "epoch": 0.011609586201177543, + "grad_norm": 2.094745397567749, + "learning_rate": 1.1525704809286899e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6627565950155259, + "num_tokens": 4586136.0, + "step": 140 + }, + { + "epoch": 0.012024214279791027, + "grad_norm": 2.171077251434326, + "learning_rate": 1.1940298507462686e-05, + "loss": 1.3199, + "mean_token_accuracy": 0.6675586476922035, + "num_tokens": 4749976.0, + "step": 145 + }, + { + "epoch": 0.012438842358404511, + "grad_norm": 2.0367488861083984, + "learning_rate": 1.2354892205638475e-05, + "loss": 1.2787, + "mean_token_accuracy": 0.6763868525624275, + "num_tokens": 4913816.0, + "step": 150 + }, + { + "epoch": 0.012853470437017995, + "grad_norm": 2.0931756496429443, + "learning_rate": 1.2769485903814263e-05, + "loss": 1.2735, + "mean_token_accuracy": 0.6738025419414043, + "num_tokens": 5077656.0, + "step": 155 + }, + { + "epoch": 0.01326809851563148, + "grad_norm": 2.0900051593780518, + "learning_rate": 1.3184079601990052e-05, + "loss": 1.3317, + "mean_token_accuracy": 0.6669599190354347, + "num_tokens": 5241496.0, + "step": 160 + }, + { + "epoch": 0.013682726594244961, + "grad_norm": 2.108306407928467, + "learning_rate": 1.3598673300165837e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6558712109923363, + "num_tokens": 5405336.0, + "step": 165 + }, + { + "epoch": 0.014097354672858445, + "grad_norm": 2.012080669403076, + "learning_rate": 1.4013266998341626e-05, + "loss": 1.2891, + "mean_token_accuracy": 0.6758186653256416, + "num_tokens": 5569176.0, + "step": 170 + }, + { + "epoch": 0.01451198275147193, + "grad_norm": 2.0759711265563965, + "learning_rate": 1.4427860696517415e-05, + "loss": 1.2816, + "mean_token_accuracy": 0.6724951103329658, + "num_tokens": 5733016.0, + "step": 175 + }, + { + "epoch": 0.014926610830085413, + "grad_norm": 1.8789936304092407, + "learning_rate": 1.4842454394693201e-05, + "loss": 1.2954, + "mean_token_accuracy": 0.677889783680439, + "num_tokens": 5896856.0, + "step": 180 + }, + { + "epoch": 0.015341238908698897, + "grad_norm": 1.902726650238037, + "learning_rate": 1.525704809286899e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6745784431695938, + "num_tokens": 6060696.0, + "step": 185 + }, + { + "epoch": 0.01575586698731238, + "grad_norm": 2.1739706993103027, + "learning_rate": 1.5671641791044777e-05, + "loss": 1.2684, + "mean_token_accuracy": 0.6763257533311844, + "num_tokens": 6224536.0, + "step": 190 + }, + { + "epoch": 0.016170495065925865, + "grad_norm": 1.9013311862945557, + "learning_rate": 1.6086235489220563e-05, + "loss": 1.3092, + "mean_token_accuracy": 0.6740285888314247, + "num_tokens": 6388376.0, + "step": 195 + }, + { + "epoch": 0.01658512314453935, + "grad_norm": 1.9905920028686523, + "learning_rate": 1.6500829187396352e-05, + "loss": 1.2969, + "mean_token_accuracy": 0.6714259505271911, + "num_tokens": 6552216.0, + "step": 200 + }, + { + "epoch": 0.016999751223152833, + "grad_norm": 2.1580097675323486, + "learning_rate": 1.691542288557214e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6779753148555756, + "num_tokens": 6716056.0, + "step": 205 + }, + { + "epoch": 0.017414379301766314, + "grad_norm": 2.1075289249420166, + "learning_rate": 1.7330016583747926e-05, + "loss": 1.3148, + "mean_token_accuracy": 0.6719086065888404, + "num_tokens": 6879896.0, + "step": 210 + }, + { + "epoch": 0.017829007380379798, + "grad_norm": 2.0454044342041016, + "learning_rate": 1.7744610281923716e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6674425765872002, + "num_tokens": 7043736.0, + "step": 215 + }, + { + "epoch": 0.018243635458993282, + "grad_norm": 1.9608289003372192, + "learning_rate": 1.8159203980099505e-05, + "loss": 1.2862, + "mean_token_accuracy": 0.6722079649567604, + "num_tokens": 7207576.0, + "step": 220 + }, + { + "epoch": 0.018658263537606766, + "grad_norm": 1.9292097091674805, + "learning_rate": 1.857379767827529e-05, + "loss": 1.3089, + "mean_token_accuracy": 0.6710471615195275, + "num_tokens": 7371416.0, + "step": 225 + }, + { + "epoch": 0.01907289161622025, + "grad_norm": 1.9487251043319702, + "learning_rate": 1.898839137645108e-05, + "loss": 1.268, + "mean_token_accuracy": 0.6755315259099006, + "num_tokens": 7535256.0, + "step": 230 + }, + { + "epoch": 0.019487519694833734, + "grad_norm": 1.9679298400878906, + "learning_rate": 1.9402985074626868e-05, + "loss": 1.2515, + "mean_token_accuracy": 0.6827223837375641, + "num_tokens": 7699096.0, + "step": 235 + }, + { + "epoch": 0.019902147773447218, + "grad_norm": 2.0813894271850586, + "learning_rate": 1.9817578772802657e-05, + "loss": 1.2744, + "mean_token_accuracy": 0.6776331901550293, + "num_tokens": 7862936.0, + "step": 240 + }, + { + "epoch": 0.020316775852060702, + "grad_norm": 1.8269994258880615, + "learning_rate": 2.0232172470978443e-05, + "loss": 1.2968, + "mean_token_accuracy": 0.6754276618361473, + "num_tokens": 8026776.0, + "step": 245 + }, + { + "epoch": 0.020731403930674186, + "grad_norm": 1.8057606220245361, + "learning_rate": 2.0646766169154232e-05, + "loss": 1.329, + "mean_token_accuracy": 0.668071848154068, + "num_tokens": 8190616.0, + "step": 250 + }, + { + "epoch": 0.02114603200928767, + "grad_norm": 1.9886025190353394, + "learning_rate": 2.1061359867330017e-05, + "loss": 1.2449, + "mean_token_accuracy": 0.6797592893242836, + "num_tokens": 8354456.0, + "step": 255 + }, + { + "epoch": 0.021560660087901154, + "grad_norm": 1.895953893661499, + "learning_rate": 2.1475953565505803e-05, + "loss": 1.2596, + "mean_token_accuracy": 0.6817754164338112, + "num_tokens": 8518296.0, + "step": 260 + }, + { + "epoch": 0.021975288166514638, + "grad_norm": 1.98335599899292, + "learning_rate": 2.1890547263681592e-05, + "loss": 1.2828, + "mean_token_accuracy": 0.6719269320368767, + "num_tokens": 8682136.0, + "step": 265 + }, + { + "epoch": 0.02238991624512812, + "grad_norm": 2.0442817211151123, + "learning_rate": 2.230514096185738e-05, + "loss": 1.2699, + "mean_token_accuracy": 0.6768572837114334, + "num_tokens": 8845976.0, + "step": 270 + }, + { + "epoch": 0.022804544323741602, + "grad_norm": 1.8405953645706177, + "learning_rate": 2.2719734660033167e-05, + "loss": 1.2689, + "mean_token_accuracy": 0.6766678869724274, + "num_tokens": 9009816.0, + "step": 275 + }, + { + "epoch": 0.023219172402355086, + "grad_norm": 1.8921599388122559, + "learning_rate": 2.3134328358208956e-05, + "loss": 1.2757, + "mean_token_accuracy": 0.6725439906120301, + "num_tokens": 9173656.0, + "step": 280 + }, + { + "epoch": 0.02363380048096857, + "grad_norm": 1.9423072338104248, + "learning_rate": 2.3548922056384745e-05, + "loss": 1.2914, + "mean_token_accuracy": 0.6712365537881851, + "num_tokens": 9337496.0, + "step": 285 + }, + { + "epoch": 0.024048428559582054, + "grad_norm": 1.901416540145874, + "learning_rate": 2.396351575456053e-05, + "loss": 1.2725, + "mean_token_accuracy": 0.6756964817643165, + "num_tokens": 9501336.0, + "step": 290 + }, + { + "epoch": 0.02446305663819554, + "grad_norm": 1.8767505884170532, + "learning_rate": 2.437810945273632e-05, + "loss": 1.2286, + "mean_token_accuracy": 0.6843169555068016, + "num_tokens": 9665176.0, + "step": 295 + }, + { + "epoch": 0.024877684716809022, + "grad_norm": 1.8018405437469482, + "learning_rate": 2.479270315091211e-05, + "loss": 1.3213, + "mean_token_accuracy": 0.6664894863963127, + "num_tokens": 9829016.0, + "step": 300 + }, + { + "epoch": 0.025292312795422506, + "grad_norm": 1.8564913272857666, + "learning_rate": 2.5207296849087897e-05, + "loss": 1.2648, + "mean_token_accuracy": 0.6779142245650291, + "num_tokens": 9992856.0, + "step": 305 + }, + { + "epoch": 0.02570694087403599, + "grad_norm": 1.8388084173202515, + "learning_rate": 2.5621890547263683e-05, + "loss": 1.2181, + "mean_token_accuracy": 0.6881781488656997, + "num_tokens": 10156696.0, + "step": 310 + }, + { + "epoch": 0.026121568952649474, + "grad_norm": 1.929056167602539, + "learning_rate": 2.603648424543947e-05, + "loss": 1.353, + "mean_token_accuracy": 0.6692035496234894, + "num_tokens": 10319872.0, + "step": 315 + }, + { + "epoch": 0.02653619703126296, + "grad_norm": 1.7754048109054565, + "learning_rate": 2.645107794361526e-05, + "loss": 1.2762, + "mean_token_accuracy": 0.6776442378759384, + "num_tokens": 10482899.0, + "step": 320 + }, + { + "epoch": 0.026950825109876442, + "grad_norm": 1.938942313194275, + "learning_rate": 2.6865671641791047e-05, + "loss": 1.2157, + "mean_token_accuracy": 0.6928702339529991, + "num_tokens": 10646739.0, + "step": 325 + }, + { + "epoch": 0.027365453188489923, + "grad_norm": 2.127412796020508, + "learning_rate": 2.7280265339966832e-05, + "loss": 1.3557, + "mean_token_accuracy": 0.6633675456047058, + "num_tokens": 10810579.0, + "step": 330 + }, + { + "epoch": 0.027780081267103407, + "grad_norm": 1.875630497932434, + "learning_rate": 2.7694859038142625e-05, + "loss": 1.2634, + "mean_token_accuracy": 0.6779936462640762, + "num_tokens": 10974419.0, + "step": 335 + }, + { + "epoch": 0.02819470934571689, + "grad_norm": 1.8406985998153687, + "learning_rate": 2.810945273631841e-05, + "loss": 1.2822, + "mean_token_accuracy": 0.6770466819405556, + "num_tokens": 11138259.0, + "step": 340 + }, + { + "epoch": 0.028609337424330375, + "grad_norm": 1.8575828075408936, + "learning_rate": 2.8524046434494196e-05, + "loss": 1.256, + "mean_token_accuracy": 0.6802602678537368, + "num_tokens": 11302099.0, + "step": 345 + }, + { + "epoch": 0.02902396550294386, + "grad_norm": 1.8851755857467651, + "learning_rate": 2.8938640132669985e-05, + "loss": 1.3098, + "mean_token_accuracy": 0.6703384682536125, + "num_tokens": 11465939.0, + "step": 350 + }, + { + "epoch": 0.029438593581557343, + "grad_norm": 1.8496495485305786, + "learning_rate": 2.935323383084577e-05, + "loss": 1.279, + "mean_token_accuracy": 0.6763624161481857, + "num_tokens": 11629779.0, + "step": 355 + }, + { + "epoch": 0.029853221660170827, + "grad_norm": 1.8364508152008057, + "learning_rate": 2.976782752902156e-05, + "loss": 1.2982, + "mean_token_accuracy": 0.6732404701411724, + "num_tokens": 11793619.0, + "step": 360 + }, + { + "epoch": 0.03026784973878431, + "grad_norm": 1.8799529075622559, + "learning_rate": 3.018242122719735e-05, + "loss": 1.2816, + "mean_token_accuracy": 0.6801197469234467, + "num_tokens": 11957459.0, + "step": 365 + }, + { + "epoch": 0.030682477817397795, + "grad_norm": 1.8871667385101318, + "learning_rate": 3.059701492537314e-05, + "loss": 1.2468, + "mean_token_accuracy": 0.6795088008046151, + "num_tokens": 12121299.0, + "step": 370 + }, + { + "epoch": 0.03109710589601128, + "grad_norm": 1.9109357595443726, + "learning_rate": 3.101160862354892e-05, + "loss": 1.3207, + "mean_token_accuracy": 0.6695869967341423, + "num_tokens": 12285139.0, + "step": 375 + }, + { + "epoch": 0.03151173397462476, + "grad_norm": 1.798118233680725, + "learning_rate": 3.1426202321724716e-05, + "loss": 1.3022, + "mean_token_accuracy": 0.6710410594940186, + "num_tokens": 12448979.0, + "step": 380 + }, + { + "epoch": 0.03192636205323825, + "grad_norm": 1.9242483377456665, + "learning_rate": 3.18407960199005e-05, + "loss": 1.2831, + "mean_token_accuracy": 0.674749507009983, + "num_tokens": 12612819.0, + "step": 385 + }, + { + "epoch": 0.03234099013185173, + "grad_norm": 1.8487025499343872, + "learning_rate": 3.225538971807629e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6759653016924858, + "num_tokens": 12776659.0, + "step": 390 + }, + { + "epoch": 0.032755618210465215, + "grad_norm": 1.8383572101593018, + "learning_rate": 3.266998341625207e-05, + "loss": 1.283, + "mean_token_accuracy": 0.6767656400799751, + "num_tokens": 12940499.0, + "step": 395 + }, + { + "epoch": 0.0331702462890787, + "grad_norm": 1.8775842189788818, + "learning_rate": 3.3084577114427865e-05, + "loss": 1.2318, + "mean_token_accuracy": 0.6872495085000991, + "num_tokens": 13104339.0, + "step": 400 + }, + { + "epoch": 0.03358487436769218, + "grad_norm": 1.8343756198883057, + "learning_rate": 3.349917081260365e-05, + "loss": 1.2764, + "mean_token_accuracy": 0.6762924410402775, + "num_tokens": 13267633.0, + "step": 405 + }, + { + "epoch": 0.03399950244630567, + "grad_norm": 1.866855263710022, + "learning_rate": 3.3913764510779436e-05, + "loss": 1.2361, + "mean_token_accuracy": 0.6795393496751785, + "num_tokens": 13431473.0, + "step": 410 + }, + { + "epoch": 0.03441413052491915, + "grad_norm": 2.01704478263855, + "learning_rate": 3.432835820895522e-05, + "loss": 1.2635, + "mean_token_accuracy": 0.6738086491823196, + "num_tokens": 13595313.0, + "step": 415 + }, + { + "epoch": 0.03482875860353263, + "grad_norm": 2.0374269485473633, + "learning_rate": 3.474295190713101e-05, + "loss": 1.3077, + "mean_token_accuracy": 0.6713587492704391, + "num_tokens": 13759153.0, + "step": 420 + }, + { + "epoch": 0.03524338668214611, + "grad_norm": 1.7698193788528442, + "learning_rate": 3.51575456053068e-05, + "loss": 1.2407, + "mean_token_accuracy": 0.6825207769870758, + "num_tokens": 13922993.0, + "step": 425 + }, + { + "epoch": 0.035658014760759596, + "grad_norm": 1.7802177667617798, + "learning_rate": 3.5572139303482585e-05, + "loss": 1.2346, + "mean_token_accuracy": 0.6825635403394699, + "num_tokens": 14086833.0, + "step": 430 + }, + { + "epoch": 0.03607264283937308, + "grad_norm": 1.8050049543380737, + "learning_rate": 3.598673300165838e-05, + "loss": 1.2194, + "mean_token_accuracy": 0.6875488728284835, + "num_tokens": 14250673.0, + "step": 435 + }, + { + "epoch": 0.036487270917986564, + "grad_norm": 1.7095775604248047, + "learning_rate": 3.6401326699834163e-05, + "loss": 1.2356, + "mean_token_accuracy": 0.6833883225917816, + "num_tokens": 14414513.0, + "step": 440 + }, + { + "epoch": 0.03690189899660005, + "grad_norm": 1.748965859413147, + "learning_rate": 3.681592039800995e-05, + "loss": 1.2422, + "mean_token_accuracy": 0.6845124676823616, + "num_tokens": 14578353.0, + "step": 445 + }, + { + "epoch": 0.03731652707521353, + "grad_norm": 1.8167507648468018, + "learning_rate": 3.723051409618574e-05, + "loss": 1.3053, + "mean_token_accuracy": 0.6701429590582848, + "num_tokens": 14742193.0, + "step": 450 + }, + { + "epoch": 0.037731155153827016, + "grad_norm": 1.798095464706421, + "learning_rate": 3.764510779436153e-05, + "loss": 1.3032, + "mean_token_accuracy": 0.6737719923257828, + "num_tokens": 14906033.0, + "step": 455 + }, + { + "epoch": 0.0381457832324405, + "grad_norm": 1.8302721977233887, + "learning_rate": 3.805970149253731e-05, + "loss": 1.3093, + "mean_token_accuracy": 0.6685470998287201, + "num_tokens": 15069125.0, + "step": 460 + }, + { + "epoch": 0.038560411311053984, + "grad_norm": 1.8639119863510132, + "learning_rate": 3.8474295190713105e-05, + "loss": 1.3691, + "mean_token_accuracy": 0.6577162772417069, + "num_tokens": 15232965.0, + "step": 465 + }, + { + "epoch": 0.03897503938966747, + "grad_norm": 1.7261921167373657, + "learning_rate": 3.888888888888889e-05, + "loss": 1.2696, + "mean_token_accuracy": 0.6784152060747146, + "num_tokens": 15396805.0, + "step": 470 + }, + { + "epoch": 0.03938966746828095, + "grad_norm": 1.8879308700561523, + "learning_rate": 3.9303482587064676e-05, + "loss": 1.2801, + "mean_token_accuracy": 0.6760569319128991, + "num_tokens": 15560645.0, + "step": 475 + }, + { + "epoch": 0.039804295546894436, + "grad_norm": 1.7602691650390625, + "learning_rate": 3.971807628524047e-05, + "loss": 1.3729, + "mean_token_accuracy": 0.6612170025706291, + "num_tokens": 15724485.0, + "step": 480 + }, + { + "epoch": 0.04021892362550792, + "grad_norm": 1.8454163074493408, + "learning_rate": 4.0132669983416254e-05, + "loss": 1.3278, + "mean_token_accuracy": 0.6671309858560562, + "num_tokens": 15888325.0, + "step": 485 + }, + { + "epoch": 0.040633551704121404, + "grad_norm": 1.7032504081726074, + "learning_rate": 4.054726368159204e-05, + "loss": 1.235, + "mean_token_accuracy": 0.6836302936077118, + "num_tokens": 16051794.0, + "step": 490 + }, + { + "epoch": 0.04104817978273489, + "grad_norm": 1.6901429891586304, + "learning_rate": 4.096185737976783e-05, + "loss": 1.2892, + "mean_token_accuracy": 0.673714742064476, + "num_tokens": 16215039.0, + "step": 495 + }, + { + "epoch": 0.04146280786134837, + "grad_norm": 1.7194161415100098, + "learning_rate": 4.137645107794362e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6696969702839851, + "num_tokens": 16378879.0, + "step": 500 + }, + { + "epoch": 0.041877435939961856, + "grad_norm": 1.787503719329834, + "learning_rate": 4.1791044776119404e-05, + "loss": 1.2836, + "mean_token_accuracy": 0.6777657330036163, + "num_tokens": 16541966.0, + "step": 505 + }, + { + "epoch": 0.04229206401857534, + "grad_norm": 1.6372019052505493, + "learning_rate": 4.2205638474295196e-05, + "loss": 1.2472, + "mean_token_accuracy": 0.6835716024041176, + "num_tokens": 16705806.0, + "step": 510 + }, + { + "epoch": 0.042706692097188824, + "grad_norm": 1.7618612051010132, + "learning_rate": 4.262023217247098e-05, + "loss": 1.3188, + "mean_token_accuracy": 0.6690554767847061, + "num_tokens": 16869646.0, + "step": 515 + }, + { + "epoch": 0.04312132017580231, + "grad_norm": 2.250345468521118, + "learning_rate": 4.303482587064677e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6570992156863212, + "num_tokens": 17033486.0, + "step": 520 + }, + { + "epoch": 0.04353594825441579, + "grad_norm": 1.7955857515335083, + "learning_rate": 4.344941956882256e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.6765945747494697, + "num_tokens": 17197326.0, + "step": 525 + }, + { + "epoch": 0.043950576333029276, + "grad_norm": 1.7054221630096436, + "learning_rate": 4.3864013266998345e-05, + "loss": 1.2634, + "mean_token_accuracy": 0.6766556695103645, + "num_tokens": 17361166.0, + "step": 530 + }, + { + "epoch": 0.04436520441164276, + "grad_norm": 1.677278757095337, + "learning_rate": 4.427860696517413e-05, + "loss": 1.2543, + "mean_token_accuracy": 0.6794110462069511, + "num_tokens": 17525006.0, + "step": 535 + }, + { + "epoch": 0.04477983249025624, + "grad_norm": 1.789715051651001, + "learning_rate": 4.469320066334992e-05, + "loss": 1.27, + "mean_token_accuracy": 0.6728861182928085, + "num_tokens": 17688846.0, + "step": 540 + }, + { + "epoch": 0.04519446056886972, + "grad_norm": 1.7444549798965454, + "learning_rate": 4.510779436152571e-05, + "loss": 1.2851, + "mean_token_accuracy": 0.6725989744067192, + "num_tokens": 17852686.0, + "step": 545 + }, + { + "epoch": 0.045609088647483205, + "grad_norm": 1.7071665525436401, + "learning_rate": 4.5522388059701495e-05, + "loss": 1.3174, + "mean_token_accuracy": 0.6670454546809197, + "num_tokens": 18016526.0, + "step": 550 + }, + { + "epoch": 0.04602371672609669, + "grad_norm": 1.6631416082382202, + "learning_rate": 4.593698175787729e-05, + "loss": 1.3353, + "mean_token_accuracy": 0.6682978942990303, + "num_tokens": 18180366.0, + "step": 555 + }, + { + "epoch": 0.04643834480471017, + "grad_norm": 1.747498869895935, + "learning_rate": 4.635157545605307e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6825391009449959, + "num_tokens": 18344206.0, + "step": 560 + }, + { + "epoch": 0.04685297288332366, + "grad_norm": 1.735395908355713, + "learning_rate": 4.676616915422886e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6739538997411728, + "num_tokens": 18507084.0, + "step": 565 + }, + { + "epoch": 0.04726760096193714, + "grad_norm": 1.7586103677749634, + "learning_rate": 4.718076285240465e-05, + "loss": 1.4349, + "mean_token_accuracy": 0.6478922292590141, + "num_tokens": 18670924.0, + "step": 570 + }, + { + "epoch": 0.047682229040550625, + "grad_norm": 1.7093908786773682, + "learning_rate": 4.7595356550580436e-05, + "loss": 1.3027, + "mean_token_accuracy": 0.6697977557778358, + "num_tokens": 18834639.0, + "step": 575 + }, + { + "epoch": 0.04809685711916411, + "grad_norm": 1.7219830751419067, + "learning_rate": 4.800995024875622e-05, + "loss": 1.3248, + "mean_token_accuracy": 0.6675891973078251, + "num_tokens": 18998479.0, + "step": 580 + }, + { + "epoch": 0.04851148519777759, + "grad_norm": 1.7310324907302856, + "learning_rate": 4.842454394693201e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6795957133173942, + "num_tokens": 19162276.0, + "step": 585 + }, + { + "epoch": 0.04892611327639108, + "grad_norm": 1.7623659372329712, + "learning_rate": 4.883913764510779e-05, + "loss": 1.2349, + "mean_token_accuracy": 0.6812561109662056, + "num_tokens": 19326116.0, + "step": 590 + }, + { + "epoch": 0.04934074135500456, + "grad_norm": 1.6814851760864258, + "learning_rate": 4.9253731343283586e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6650232136249542, + "num_tokens": 19489956.0, + "step": 595 + }, + { + "epoch": 0.049755369433618045, + "grad_norm": 1.9025373458862305, + "learning_rate": 4.966832504145937e-05, + "loss": 1.3641, + "mean_token_accuracy": 0.6606243863701821, + "num_tokens": 19653796.0, + "step": 600 + }, + { + "epoch": 0.05016999751223153, + "grad_norm": 1.6884503364562988, + "learning_rate": 5.0082918739635164e-05, + "loss": 1.3043, + "mean_token_accuracy": 0.6688844054937363, + "num_tokens": 19817636.0, + "step": 605 + }, + { + "epoch": 0.05058462559084501, + "grad_norm": 1.7554152011871338, + "learning_rate": 5.049751243781094e-05, + "loss": 1.2706, + "mean_token_accuracy": 0.6801991656422615, + "num_tokens": 19981476.0, + "step": 610 + }, + { + "epoch": 0.0509992536694585, + "grad_norm": 1.6515464782714844, + "learning_rate": 5.0912106135986735e-05, + "loss": 1.3007, + "mean_token_accuracy": 0.6692387565970421, + "num_tokens": 20145316.0, + "step": 615 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 1.6192206144332886, + "learning_rate": 5.132669983416253e-05, + "loss": 1.2865, + "mean_token_accuracy": 0.6751038610935212, + "num_tokens": 20309156.0, + "step": 620 + }, + { + "epoch": 0.051828509826685465, + "grad_norm": 1.6686915159225464, + "learning_rate": 5.1741293532338306e-05, + "loss": 1.2957, + "mean_token_accuracy": 0.6767473071813583, + "num_tokens": 20472996.0, + "step": 625 + }, + { + "epoch": 0.05224313790529895, + "grad_norm": 1.667669653892517, + "learning_rate": 5.21558872305141e-05, + "loss": 1.2578, + "mean_token_accuracy": 0.679050587117672, + "num_tokens": 20636836.0, + "step": 630 + }, + { + "epoch": 0.05265776598391243, + "grad_norm": 1.6532917022705078, + "learning_rate": 5.257048092868989e-05, + "loss": 1.2807, + "mean_token_accuracy": 0.6721774160861969, + "num_tokens": 20800676.0, + "step": 635 + }, + { + "epoch": 0.05307239406252592, + "grad_norm": 1.7098535299301147, + "learning_rate": 5.298507462686567e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6653653442859649, + "num_tokens": 20964516.0, + "step": 640 + }, + { + "epoch": 0.0534870221411394, + "grad_norm": 1.7239949703216553, + "learning_rate": 5.339966832504146e-05, + "loss": 1.313, + "mean_token_accuracy": 0.671446581184864, + "num_tokens": 21127831.0, + "step": 645 + }, + { + "epoch": 0.053901650219752885, + "grad_norm": 1.6677320003509521, + "learning_rate": 5.3814262023217255e-05, + "loss": 1.2705, + "mean_token_accuracy": 0.6744562536478043, + "num_tokens": 21291671.0, + "step": 650 + }, + { + "epoch": 0.05431627829836636, + "grad_norm": 1.6505001783370972, + "learning_rate": 5.422885572139303e-05, + "loss": 1.3399, + "mean_token_accuracy": 0.6627688139677048, + "num_tokens": 21455511.0, + "step": 655 + }, + { + "epoch": 0.054730906376979846, + "grad_norm": 1.6301279067993164, + "learning_rate": 5.4643449419568826e-05, + "loss": 1.2612, + "mean_token_accuracy": 0.6786168172955513, + "num_tokens": 21619351.0, + "step": 660 + }, + { + "epoch": 0.05514553445559333, + "grad_norm": 1.6679112911224365, + "learning_rate": 5.505804311774462e-05, + "loss": 1.2431, + "mean_token_accuracy": 0.682655180990696, + "num_tokens": 21783191.0, + "step": 665 + }, + { + "epoch": 0.055560162534206814, + "grad_norm": 1.6891405582427979, + "learning_rate": 5.54726368159204e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6664772793650627, + "num_tokens": 21947031.0, + "step": 670 + }, + { + "epoch": 0.0559747906128203, + "grad_norm": 1.7086944580078125, + "learning_rate": 5.588723051409619e-05, + "loss": 1.2677, + "mean_token_accuracy": 0.6777737095952034, + "num_tokens": 22110871.0, + "step": 675 + }, + { + "epoch": 0.05638941869143378, + "grad_norm": 1.6492773294448853, + "learning_rate": 5.630182421227198e-05, + "loss": 1.2703, + "mean_token_accuracy": 0.6764601692557335, + "num_tokens": 22274711.0, + "step": 680 + }, + { + "epoch": 0.056804046770047266, + "grad_norm": 1.843117356300354, + "learning_rate": 5.671641791044776e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6737047895789147, + "num_tokens": 22438551.0, + "step": 685 + }, + { + "epoch": 0.05721867484866075, + "grad_norm": 1.6733990907669067, + "learning_rate": 5.713101160862355e-05, + "loss": 1.1973, + "mean_token_accuracy": 0.6911718010902405, + "num_tokens": 22602391.0, + "step": 690 + }, + { + "epoch": 0.057633302927274234, + "grad_norm": 1.5508854389190674, + "learning_rate": 5.7545605306799345e-05, + "loss": 1.3249, + "mean_token_accuracy": 0.6713893011212348, + "num_tokens": 22766231.0, + "step": 695 + }, + { + "epoch": 0.05804793100588772, + "grad_norm": 1.62466299533844, + "learning_rate": 5.7960199004975124e-05, + "loss": 1.3918, + "mean_token_accuracy": 0.6578445717692375, + "num_tokens": 22930071.0, + "step": 700 + }, + { + "epoch": 0.0584625590845012, + "grad_norm": 1.4997344017028809, + "learning_rate": 5.837479270315092e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6705156370997429, + "num_tokens": 23093911.0, + "step": 705 + }, + { + "epoch": 0.058877187163114686, + "grad_norm": 1.654841423034668, + "learning_rate": 5.878938640132671e-05, + "loss": 1.2856, + "mean_token_accuracy": 0.6701063022017479, + "num_tokens": 23257751.0, + "step": 710 + }, + { + "epoch": 0.05929181524172817, + "grad_norm": 1.640665888786316, + "learning_rate": 5.920398009950249e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6653836756944657, + "num_tokens": 23421591.0, + "step": 715 + }, + { + "epoch": 0.059706443320341654, + "grad_norm": 1.6757763624191284, + "learning_rate": 5.961857379767828e-05, + "loss": 1.3558, + "mean_token_accuracy": 0.6623839154839516, + "num_tokens": 23585431.0, + "step": 720 + }, + { + "epoch": 0.06012107139895514, + "grad_norm": 1.6322523355484009, + "learning_rate": 6.003316749585407e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6597018599510193, + "num_tokens": 23749271.0, + "step": 725 + }, + { + "epoch": 0.06053569947756862, + "grad_norm": 1.6678824424743652, + "learning_rate": 6.044776119402985e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6591336786746979, + "num_tokens": 23913111.0, + "step": 730 + }, + { + "epoch": 0.060950327556182106, + "grad_norm": 1.4809238910675049, + "learning_rate": 6.0862354892205644e-05, + "loss": 1.2666, + "mean_token_accuracy": 0.6795271247625351, + "num_tokens": 24076951.0, + "step": 735 + }, + { + "epoch": 0.06136495563479559, + "grad_norm": 1.533839225769043, + "learning_rate": 6.127694859038143e-05, + "loss": 1.2866, + "mean_token_accuracy": 0.6745417892932892, + "num_tokens": 24240791.0, + "step": 740 + }, + { + "epoch": 0.061779583713409074, + "grad_norm": 1.6793161630630493, + "learning_rate": 6.169154228855722e-05, + "loss": 1.2913, + "mean_token_accuracy": 0.6742424249649048, + "num_tokens": 24404631.0, + "step": 745 + }, + { + "epoch": 0.06219421179202256, + "grad_norm": 2.060741424560547, + "learning_rate": 6.2106135986733e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6664833813905716, + "num_tokens": 24568471.0, + "step": 750 + }, + { + "epoch": 0.06260883987063603, + "grad_norm": 1.731775164604187, + "learning_rate": 6.25207296849088e-05, + "loss": 1.2995, + "mean_token_accuracy": 0.6714076220989227, + "num_tokens": 24732311.0, + "step": 755 + }, + { + "epoch": 0.06302346794924953, + "grad_norm": 1.576099157333374, + "learning_rate": 6.293532338308457e-05, + "loss": 1.3214, + "mean_token_accuracy": 0.6706133902072906, + "num_tokens": 24896151.0, + "step": 760 + }, + { + "epoch": 0.063438096027863, + "grad_norm": 1.6460028886795044, + "learning_rate": 6.334991708126037e-05, + "loss": 1.2541, + "mean_token_accuracy": 0.6741141244769097, + "num_tokens": 25059991.0, + "step": 765 + }, + { + "epoch": 0.0638527241064765, + "grad_norm": 1.728865146636963, + "learning_rate": 6.376451077943616e-05, + "loss": 1.3508, + "mean_token_accuracy": 0.6589809343218803, + "num_tokens": 25223831.0, + "step": 770 + }, + { + "epoch": 0.06426735218508997, + "grad_norm": 1.5584203004837036, + "learning_rate": 6.417910447761194e-05, + "loss": 1.2995, + "mean_token_accuracy": 0.6665994688868523, + "num_tokens": 25387671.0, + "step": 775 + }, + { + "epoch": 0.06468198026370346, + "grad_norm": 1.5956830978393555, + "learning_rate": 6.459369817578773e-05, + "loss": 1.2532, + "mean_token_accuracy": 0.6797776117920875, + "num_tokens": 25551511.0, + "step": 780 + }, + { + "epoch": 0.06509660834231694, + "grad_norm": 1.5926156044006348, + "learning_rate": 6.500829187396353e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6667155444622039, + "num_tokens": 25715351.0, + "step": 785 + }, + { + "epoch": 0.06551123642093043, + "grad_norm": 1.5243552923202515, + "learning_rate": 6.54228855721393e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6727578222751618, + "num_tokens": 25879191.0, + "step": 790 + }, + { + "epoch": 0.0659258644995439, + "grad_norm": 1.619012475013733, + "learning_rate": 6.58374792703151e-05, + "loss": 1.3561, + "mean_token_accuracy": 0.658205033838749, + "num_tokens": 26043031.0, + "step": 795 + }, + { + "epoch": 0.0663404925781574, + "grad_norm": 1.593612790107727, + "learning_rate": 6.625207296849088e-05, + "loss": 1.2844, + "mean_token_accuracy": 0.673313781619072, + "num_tokens": 26206871.0, + "step": 800 + }, + { + "epoch": 0.06675512065677087, + "grad_norm": 1.5031630992889404, + "learning_rate": 6.666666666666667e-05, + "loss": 1.2926, + "mean_token_accuracy": 0.6722629576921463, + "num_tokens": 26370711.0, + "step": 805 + }, + { + "epoch": 0.06716974873538437, + "grad_norm": 1.5605660676956177, + "learning_rate": 6.708126036484246e-05, + "loss": 1.2939, + "mean_token_accuracy": 0.6696425586938858, + "num_tokens": 26533925.0, + "step": 810 + }, + { + "epoch": 0.06758437681399784, + "grad_norm": 1.4447792768478394, + "learning_rate": 6.749585406301825e-05, + "loss": 1.2417, + "mean_token_accuracy": 0.6836693555116653, + "num_tokens": 26697765.0, + "step": 815 + }, + { + "epoch": 0.06799900489261133, + "grad_norm": 1.5722301006317139, + "learning_rate": 6.791044776119403e-05, + "loss": 1.2685, + "mean_token_accuracy": 0.6784824058413506, + "num_tokens": 26861605.0, + "step": 820 + }, + { + "epoch": 0.06841363297122481, + "grad_norm": 1.5380804538726807, + "learning_rate": 6.832504145936983e-05, + "loss": 1.3755, + "mean_token_accuracy": 0.6574107989668846, + "num_tokens": 27025445.0, + "step": 825 + }, + { + "epoch": 0.0688282610498383, + "grad_norm": 1.5342949628829956, + "learning_rate": 6.873963515754561e-05, + "loss": 1.2748, + "mean_token_accuracy": 0.6746456444263458, + "num_tokens": 27189285.0, + "step": 830 + }, + { + "epoch": 0.06924288912845178, + "grad_norm": 1.448546290397644, + "learning_rate": 6.91542288557214e-05, + "loss": 1.3013, + "mean_token_accuracy": 0.6716947659850121, + "num_tokens": 27353125.0, + "step": 835 + }, + { + "epoch": 0.06965751720706526, + "grad_norm": 1.6198995113372803, + "learning_rate": 6.956882255389718e-05, + "loss": 1.2998, + "mean_token_accuracy": 0.6699169114232063, + "num_tokens": 27516965.0, + "step": 840 + }, + { + "epoch": 0.07007214528567875, + "grad_norm": 1.6602299213409424, + "learning_rate": 6.998341625207298e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6774743407964706, + "num_tokens": 27680805.0, + "step": 845 + }, + { + "epoch": 0.07048677336429222, + "grad_norm": 1.5138729810714722, + "learning_rate": 7.039800995024875e-05, + "loss": 1.3361, + "mean_token_accuracy": 0.6645833343267441, + "num_tokens": 27844645.0, + "step": 850 + }, + { + "epoch": 0.07090140144290571, + "grad_norm": 1.5389212369918823, + "learning_rate": 7.081260364842455e-05, + "loss": 1.2749, + "mean_token_accuracy": 0.6735459432005882, + "num_tokens": 28008485.0, + "step": 855 + }, + { + "epoch": 0.07131602952151919, + "grad_norm": 1.3996397256851196, + "learning_rate": 7.122719734660034e-05, + "loss": 1.2249, + "mean_token_accuracy": 0.682807919383049, + "num_tokens": 28172325.0, + "step": 860 + }, + { + "epoch": 0.07173065760013268, + "grad_norm": 1.5532928705215454, + "learning_rate": 7.164179104477612e-05, + "loss": 1.3725, + "mean_token_accuracy": 0.6573436006903648, + "num_tokens": 28336165.0, + "step": 865 + }, + { + "epoch": 0.07214528567874616, + "grad_norm": 1.6004377603530884, + "learning_rate": 7.205638474295191e-05, + "loss": 1.4084, + "mean_token_accuracy": 0.6534824058413505, + "num_tokens": 28500005.0, + "step": 870 + }, + { + "epoch": 0.07255991375735965, + "grad_norm": 1.4392030239105225, + "learning_rate": 7.24709784411277e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6682306960225105, + "num_tokens": 28663845.0, + "step": 875 + }, + { + "epoch": 0.07297454183597313, + "grad_norm": 1.471176266670227, + "learning_rate": 7.288557213930348e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6698924705386162, + "num_tokens": 28827685.0, + "step": 880 + }, + { + "epoch": 0.07338916991458662, + "grad_norm": 1.5112273693084717, + "learning_rate": 7.330016583747927e-05, + "loss": 1.3454, + "mean_token_accuracy": 0.668340665102005, + "num_tokens": 28991525.0, + "step": 885 + }, + { + "epoch": 0.0738037979932001, + "grad_norm": 1.4726061820983887, + "learning_rate": 7.371475953565507e-05, + "loss": 1.29, + "mean_token_accuracy": 0.6737292274832726, + "num_tokens": 29155365.0, + "step": 890 + }, + { + "epoch": 0.07421842607181359, + "grad_norm": 1.5442107915878296, + "learning_rate": 7.412935323383084e-05, + "loss": 1.2527, + "mean_token_accuracy": 0.6790017127990723, + "num_tokens": 29319205.0, + "step": 895 + }, + { + "epoch": 0.07463305415042706, + "grad_norm": 1.7195465564727783, + "learning_rate": 7.454394693200664e-05, + "loss": 1.3696, + "mean_token_accuracy": 0.6660129517316818, + "num_tokens": 29483045.0, + "step": 900 + }, + { + "epoch": 0.07504768222904055, + "grad_norm": 1.5677791833877563, + "learning_rate": 7.495854063018242e-05, + "loss": 1.3951, + "mean_token_accuracy": 0.6553331576287746, + "num_tokens": 29645952.0, + "step": 905 + }, + { + "epoch": 0.07546231030765403, + "grad_norm": 1.5033553838729858, + "learning_rate": 7.537313432835821e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.666281770169735, + "num_tokens": 29809792.0, + "step": 910 + }, + { + "epoch": 0.07587693838626752, + "grad_norm": 1.4631794691085815, + "learning_rate": 7.5787728026534e-05, + "loss": 1.3955, + "mean_token_accuracy": 0.6548264935612679, + "num_tokens": 29973632.0, + "step": 915 + }, + { + "epoch": 0.076291566464881, + "grad_norm": 1.6084941625595093, + "learning_rate": 7.62023217247098e-05, + "loss": 1.361, + "mean_token_accuracy": 0.6567937433719635, + "num_tokens": 30137472.0, + "step": 920 + }, + { + "epoch": 0.07670619454349449, + "grad_norm": 1.395077109336853, + "learning_rate": 7.661691542288557e-05, + "loss": 1.3181, + "mean_token_accuracy": 0.668242909014225, + "num_tokens": 30301312.0, + "step": 925 + }, + { + "epoch": 0.07712082262210797, + "grad_norm": 1.754188895225525, + "learning_rate": 7.703150912106136e-05, + "loss": 1.3594, + "mean_token_accuracy": 0.6625899627804757, + "num_tokens": 30465139.0, + "step": 930 + }, + { + "epoch": 0.07753545070072146, + "grad_norm": 1.4886492490768433, + "learning_rate": 7.744610281923715e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.6706049293279648, + "num_tokens": 30628200.0, + "step": 935 + }, + { + "epoch": 0.07795007877933494, + "grad_norm": 1.5473731756210327, + "learning_rate": 7.786069651741294e-05, + "loss": 1.33, + "mean_token_accuracy": 0.6673020482063293, + "num_tokens": 30792040.0, + "step": 940 + }, + { + "epoch": 0.07836470685794843, + "grad_norm": 1.4740445613861084, + "learning_rate": 7.827529021558872e-05, + "loss": 1.2766, + "mean_token_accuracy": 0.6745784506201744, + "num_tokens": 30955880.0, + "step": 945 + }, + { + "epoch": 0.0787793349365619, + "grad_norm": 1.6856474876403809, + "learning_rate": 7.868988391376452e-05, + "loss": 1.3747, + "mean_token_accuracy": 0.6588622033596039, + "num_tokens": 31119579.0, + "step": 950 + }, + { + "epoch": 0.07919396301517538, + "grad_norm": 2.418442964553833, + "learning_rate": 7.910447761194029e-05, + "loss": 1.3778, + "mean_token_accuracy": 0.6559934005141258, + "num_tokens": 31283419.0, + "step": 955 + }, + { + "epoch": 0.07960859109378887, + "grad_norm": 1.7516529560089111, + "learning_rate": 7.951907131011609e-05, + "loss": 1.4087, + "mean_token_accuracy": 0.6597629532217979, + "num_tokens": 31447259.0, + "step": 960 + }, + { + "epoch": 0.08002321917240235, + "grad_norm": 1.4395791292190552, + "learning_rate": 7.993366500829188e-05, + "loss": 1.2647, + "mean_token_accuracy": 0.6777813985943795, + "num_tokens": 31610323.0, + "step": 965 + }, + { + "epoch": 0.08043784725101584, + "grad_norm": 1.554725170135498, + "learning_rate": 8.034825870646766e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6536779120564461, + "num_tokens": 31774163.0, + "step": 970 + }, + { + "epoch": 0.08085247532962932, + "grad_norm": 1.4651646614074707, + "learning_rate": 8.076285240464345e-05, + "loss": 1.3478, + "mean_token_accuracy": 0.666422289609909, + "num_tokens": 31938003.0, + "step": 975 + }, + { + "epoch": 0.08126710340824281, + "grad_norm": 1.5143450498580933, + "learning_rate": 8.117744610281925e-05, + "loss": 1.3087, + "mean_token_accuracy": 0.6691593304276466, + "num_tokens": 32101843.0, + "step": 980 + }, + { + "epoch": 0.08168173148685628, + "grad_norm": 1.4389920234680176, + "learning_rate": 8.159203980099502e-05, + "loss": 1.3589, + "mean_token_accuracy": 0.662383921444416, + "num_tokens": 32265683.0, + "step": 985 + }, + { + "epoch": 0.08209635956546978, + "grad_norm": 1.4907101392745972, + "learning_rate": 8.200663349917082e-05, + "loss": 1.3373, + "mean_token_accuracy": 0.6661595821380615, + "num_tokens": 32429523.0, + "step": 990 + }, + { + "epoch": 0.08251098764408325, + "grad_norm": 1.498844861984253, + "learning_rate": 8.24212271973466e-05, + "loss": 1.4749, + "mean_token_accuracy": 0.6424853324890136, + "num_tokens": 32593363.0, + "step": 995 + }, + { + "epoch": 0.08292561572269674, + "grad_norm": 1.4503613710403442, + "learning_rate": 8.283582089552239e-05, + "loss": 1.3067, + "mean_token_accuracy": 0.6691892817616463, + "num_tokens": 32756973.0, + "step": 1000 + }, + { + "epoch": 0.08334024380131022, + "grad_norm": 1.4826053380966187, + "learning_rate": 8.325041459369818e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6646444290876389, + "num_tokens": 32920813.0, + "step": 1005 + }, + { + "epoch": 0.08375487187992371, + "grad_norm": 1.4513267278671265, + "learning_rate": 8.366500829187398e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6734543010592461, + "num_tokens": 33084653.0, + "step": 1010 + }, + { + "epoch": 0.08416949995853719, + "grad_norm": 1.3998719453811646, + "learning_rate": 8.407960199004975e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6636377297341823, + "num_tokens": 33247540.0, + "step": 1015 + }, + { + "epoch": 0.08458412803715068, + "grad_norm": 1.4850621223449707, + "learning_rate": 8.449419568822555e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6632209196686745, + "num_tokens": 33411380.0, + "step": 1020 + }, + { + "epoch": 0.08499875611576416, + "grad_norm": 1.4333192110061646, + "learning_rate": 8.490878938640133e-05, + "loss": 1.3847, + "mean_token_accuracy": 0.6557184763252735, + "num_tokens": 33575220.0, + "step": 1025 + }, + { + "epoch": 0.08541338419437765, + "grad_norm": 1.4944761991500854, + "learning_rate": 8.532338308457712e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.6646868199110031, + "num_tokens": 33738658.0, + "step": 1030 + }, + { + "epoch": 0.08582801227299112, + "grad_norm": 1.388965129852295, + "learning_rate": 8.57379767827529e-05, + "loss": 1.3129, + "mean_token_accuracy": 0.6730571836233139, + "num_tokens": 33902498.0, + "step": 1035 + }, + { + "epoch": 0.08624264035160462, + "grad_norm": 1.4851020574569702, + "learning_rate": 8.61525704809287e-05, + "loss": 1.2922, + "mean_token_accuracy": 0.6700879752635955, + "num_tokens": 34066338.0, + "step": 1040 + }, + { + "epoch": 0.08665726843021809, + "grad_norm": 1.4334027767181396, + "learning_rate": 8.656716417910447e-05, + "loss": 1.307, + "mean_token_accuracy": 0.673802538216114, + "num_tokens": 34230178.0, + "step": 1045 + }, + { + "epoch": 0.08707189650883158, + "grad_norm": 1.4220467805862427, + "learning_rate": 8.698175787728027e-05, + "loss": 1.4381, + "mean_token_accuracy": 0.6489491671323776, + "num_tokens": 34394018.0, + "step": 1050 + }, + { + "epoch": 0.08748652458744506, + "grad_norm": 1.341352105140686, + "learning_rate": 8.739635157545606e-05, + "loss": 1.215, + "mean_token_accuracy": 0.6873839199542999, + "num_tokens": 34557858.0, + "step": 1055 + }, + { + "epoch": 0.08790115266605855, + "grad_norm": 1.4932255744934082, + "learning_rate": 8.781094527363185e-05, + "loss": 1.299, + "mean_token_accuracy": 0.6708152651786804, + "num_tokens": 34720730.0, + "step": 1060 + }, + { + "epoch": 0.08831578074467203, + "grad_norm": 1.3251454830169678, + "learning_rate": 8.822553897180763e-05, + "loss": 1.3161, + "mean_token_accuracy": 0.6739186197519302, + "num_tokens": 34884570.0, + "step": 1065 + }, + { + "epoch": 0.08873040882328552, + "grad_norm": 1.4369324445724487, + "learning_rate": 8.864013266998342e-05, + "loss": 1.3105, + "mean_token_accuracy": 0.6701320111751556, + "num_tokens": 35047445.0, + "step": 1070 + }, + { + "epoch": 0.089145036901899, + "grad_norm": 1.4724633693695068, + "learning_rate": 8.905472636815922e-05, + "loss": 1.3482, + "mean_token_accuracy": 0.6623900294303894, + "num_tokens": 35211285.0, + "step": 1075 + }, + { + "epoch": 0.08955966498051247, + "grad_norm": 1.3175029754638672, + "learning_rate": 8.946932006633499e-05, + "loss": 1.3482, + "mean_token_accuracy": 0.6598729252815246, + "num_tokens": 35375125.0, + "step": 1080 + }, + { + "epoch": 0.08997429305912596, + "grad_norm": 1.4000898599624634, + "learning_rate": 8.988391376451079e-05, + "loss": 1.3913, + "mean_token_accuracy": 0.6548206344246864, + "num_tokens": 35538041.0, + "step": 1085 + }, + { + "epoch": 0.09038892113773944, + "grad_norm": 1.4432710409164429, + "learning_rate": 9.029850746268657e-05, + "loss": 1.3746, + "mean_token_accuracy": 0.6613330885767936, + "num_tokens": 35701881.0, + "step": 1090 + }, + { + "epoch": 0.09080354921635293, + "grad_norm": 1.5282469987869263, + "learning_rate": 9.071310116086236e-05, + "loss": 1.4372, + "mean_token_accuracy": 0.6472564682364463, + "num_tokens": 35864728.0, + "step": 1095 + }, + { + "epoch": 0.09121817729496641, + "grad_norm": 1.3563272953033447, + "learning_rate": 9.112769485903814e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6677113883197308, + "num_tokens": 36028568.0, + "step": 1100 + }, + { + "epoch": 0.0916328053735799, + "grad_norm": 1.4359475374221802, + "learning_rate": 9.154228855721394e-05, + "loss": 1.2544, + "mean_token_accuracy": 0.6783113405108452, + "num_tokens": 36192408.0, + "step": 1105 + }, + { + "epoch": 0.09204743345219338, + "grad_norm": 1.4146751165390015, + "learning_rate": 9.195688225538971e-05, + "loss": 1.4059, + "mean_token_accuracy": 0.6540689110755921, + "num_tokens": 36356248.0, + "step": 1110 + }, + { + "epoch": 0.09246206153080687, + "grad_norm": 1.501935362815857, + "learning_rate": 9.237147595356551e-05, + "loss": 1.3623, + "mean_token_accuracy": 0.6604533240199089, + "num_tokens": 36520088.0, + "step": 1115 + }, + { + "epoch": 0.09287668960942035, + "grad_norm": 1.3974504470825195, + "learning_rate": 9.27860696517413e-05, + "loss": 1.3582, + "mean_token_accuracy": 0.6621945217251778, + "num_tokens": 36683928.0, + "step": 1120 + }, + { + "epoch": 0.09329131768803384, + "grad_norm": 1.366337537765503, + "learning_rate": 9.320066334991709e-05, + "loss": 1.2823, + "mean_token_accuracy": 0.6725623145699501, + "num_tokens": 36847768.0, + "step": 1125 + }, + { + "epoch": 0.09370594576664731, + "grad_norm": 1.3947283029556274, + "learning_rate": 9.361525704809287e-05, + "loss": 1.4181, + "mean_token_accuracy": 0.6555474132299424, + "num_tokens": 37011608.0, + "step": 1130 + }, + { + "epoch": 0.0941205738452608, + "grad_norm": 1.4116472005844116, + "learning_rate": 9.402985074626867e-05, + "loss": 1.465, + "mean_token_accuracy": 0.6451433047652244, + "num_tokens": 37175231.0, + "step": 1135 + }, + { + "epoch": 0.09453520192387428, + "grad_norm": 1.4221644401550293, + "learning_rate": 9.444444444444444e-05, + "loss": 1.3925, + "mean_token_accuracy": 0.6539161786437034, + "num_tokens": 37339071.0, + "step": 1140 + }, + { + "epoch": 0.09494983000248777, + "grad_norm": 1.4130382537841797, + "learning_rate": 9.485903814262024e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6620478987693786, + "num_tokens": 37502911.0, + "step": 1145 + }, + { + "epoch": 0.09536445808110125, + "grad_norm": 1.3947120904922485, + "learning_rate": 9.527363184079603e-05, + "loss": 1.4183, + "mean_token_accuracy": 0.6529630959033966, + "num_tokens": 37666751.0, + "step": 1150 + }, + { + "epoch": 0.09577908615971474, + "grad_norm": 1.4517793655395508, + "learning_rate": 9.568822553897181e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.6511601343750953, + "num_tokens": 37830361.0, + "step": 1155 + }, + { + "epoch": 0.09619371423832822, + "grad_norm": 1.4411286115646362, + "learning_rate": 9.61028192371476e-05, + "loss": 1.4914, + "mean_token_accuracy": 0.6399132460355759, + "num_tokens": 37994201.0, + "step": 1160 + }, + { + "epoch": 0.09660834231694171, + "grad_norm": 1.7347967624664307, + "learning_rate": 9.65174129353234e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.6705400794744492, + "num_tokens": 38158041.0, + "step": 1165 + }, + { + "epoch": 0.09702297039555519, + "grad_norm": 1.3912692070007324, + "learning_rate": 9.693200663349917e-05, + "loss": 1.3408, + "mean_token_accuracy": 0.6626405164599418, + "num_tokens": 38321881.0, + "step": 1170 + }, + { + "epoch": 0.09743759847416868, + "grad_norm": 1.3747230768203735, + "learning_rate": 9.734660033167497e-05, + "loss": 1.3228, + "mean_token_accuracy": 0.6728983402252198, + "num_tokens": 38485721.0, + "step": 1175 + }, + { + "epoch": 0.09785222655278215, + "grad_norm": 1.4716606140136719, + "learning_rate": 9.776119402985075e-05, + "loss": 1.3058, + "mean_token_accuracy": 0.6680290788412094, + "num_tokens": 38649561.0, + "step": 1180 + }, + { + "epoch": 0.09826685463139564, + "grad_norm": 1.3675380945205688, + "learning_rate": 9.817578772802654e-05, + "loss": 1.2764, + "mean_token_accuracy": 0.672256837785244, + "num_tokens": 38813401.0, + "step": 1185 + }, + { + "epoch": 0.09868148271000912, + "grad_norm": 1.4142637252807617, + "learning_rate": 9.859038142620233e-05, + "loss": 1.401, + "mean_token_accuracy": 0.6586057350039483, + "num_tokens": 38977105.0, + "step": 1190 + }, + { + "epoch": 0.0990961107886226, + "grad_norm": 1.3965309858322144, + "learning_rate": 9.900497512437812e-05, + "loss": 1.358, + "mean_token_accuracy": 0.6610459417104722, + "num_tokens": 39140945.0, + "step": 1195 + }, + { + "epoch": 0.09951073886723609, + "grad_norm": 1.3496406078338623, + "learning_rate": 9.94195688225539e-05, + "loss": 1.3519, + "mean_token_accuracy": 0.6601967245340348, + "num_tokens": 39304785.0, + "step": 1200 + }, + { + "epoch": 0.09992536694584957, + "grad_norm": 1.3891043663024902, + "learning_rate": 9.98341625207297e-05, + "loss": 1.3577, + "mean_token_accuracy": 0.6609848454594612, + "num_tokens": 39468625.0, + "step": 1205 + }, + { + "epoch": 0.10033999502446306, + "grad_norm": 1.4117522239685059, + "learning_rate": 9.999998114690611e-05, + "loss": 1.376, + "mean_token_accuracy": 0.659420820325613, + "num_tokens": 39632465.0, + "step": 1210 + }, + { + "epoch": 0.10075462310307653, + "grad_norm": 1.3661329746246338, + "learning_rate": 9.999986593360611e-05, + "loss": 1.3479, + "mean_token_accuracy": 0.6634530752897263, + "num_tokens": 39796305.0, + "step": 1215 + }, + { + "epoch": 0.10116925118169003, + "grad_norm": 1.3807592391967773, + "learning_rate": 9.999964598118817e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6695197895169258, + "num_tokens": 39960145.0, + "step": 1220 + }, + { + "epoch": 0.1015838792603035, + "grad_norm": 1.3176120519638062, + "learning_rate": 9.999932129011307e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.6680840656161309, + "num_tokens": 40123985.0, + "step": 1225 + }, + { + "epoch": 0.101998507338917, + "grad_norm": 2.028810501098633, + "learning_rate": 9.999889186106097e-05, + "loss": 1.3252, + "mean_token_accuracy": 0.6665505826473236, + "num_tokens": 40287825.0, + "step": 1230 + }, + { + "epoch": 0.10241313541753047, + "grad_norm": 1.3492093086242676, + "learning_rate": 9.999835769493143e-05, + "loss": 1.4348, + "mean_token_accuracy": 0.6501893937587738, + "num_tokens": 40451665.0, + "step": 1235 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.3383077383041382, + "learning_rate": 9.999771879284341e-05, + "loss": 1.3338, + "mean_token_accuracy": 0.6664115741848946, + "num_tokens": 40614717.0, + "step": 1240 + }, + { + "epoch": 0.10324239157475744, + "grad_norm": 1.4968199729919434, + "learning_rate": 9.999697515613528e-05, + "loss": 1.3937, + "mean_token_accuracy": 0.6575207717716693, + "num_tokens": 40778557.0, + "step": 1245 + }, + { + "epoch": 0.10365701965337093, + "grad_norm": 1.3148664236068726, + "learning_rate": 9.999612678636478e-05, + "loss": 1.27, + "mean_token_accuracy": 0.6778225794434547, + "num_tokens": 40942397.0, + "step": 1250 + }, + { + "epoch": 0.1040716477319844, + "grad_norm": 1.4289436340332031, + "learning_rate": 9.99951736853091e-05, + "loss": 1.3333, + "mean_token_accuracy": 0.6689454987645149, + "num_tokens": 41106237.0, + "step": 1255 + }, + { + "epoch": 0.1044862758105979, + "grad_norm": 1.3502683639526367, + "learning_rate": 9.999411585496479e-05, + "loss": 1.3966, + "mean_token_accuracy": 0.6577101618051528, + "num_tokens": 41270077.0, + "step": 1260 + }, + { + "epoch": 0.10490090388921137, + "grad_norm": 1.4543397426605225, + "learning_rate": 9.999295329754773e-05, + "loss": 1.3918, + "mean_token_accuracy": 0.6539956003427505, + "num_tokens": 41433917.0, + "step": 1265 + }, + { + "epoch": 0.10531553196782487, + "grad_norm": 1.4151115417480469, + "learning_rate": 9.999168601549327e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6566471174359322, + "num_tokens": 41597757.0, + "step": 1270 + }, + { + "epoch": 0.10573016004643834, + "grad_norm": 1.315963864326477, + "learning_rate": 9.999031401145609e-05, + "loss": 1.3439, + "mean_token_accuracy": 0.6630020245909691, + "num_tokens": 41760926.0, + "step": 1275 + }, + { + "epoch": 0.10614478812505183, + "grad_norm": 1.3273677825927734, + "learning_rate": 9.998883728831024e-05, + "loss": 1.3293, + "mean_token_accuracy": 0.6641373381018638, + "num_tokens": 41924766.0, + "step": 1280 + }, + { + "epoch": 0.10655941620366531, + "grad_norm": 1.3122780323028564, + "learning_rate": 9.998725584914915e-05, + "loss": 1.3364, + "mean_token_accuracy": 0.6650659814476967, + "num_tokens": 42088606.0, + "step": 1285 + }, + { + "epoch": 0.1069740442822788, + "grad_norm": 1.3338128328323364, + "learning_rate": 9.998556969728559e-05, + "loss": 1.3336, + "mean_token_accuracy": 0.6651763558387757, + "num_tokens": 42251613.0, + "step": 1290 + }, + { + "epoch": 0.10738867236089228, + "grad_norm": 1.5022046566009521, + "learning_rate": 9.99837788362517e-05, + "loss": 1.38, + "mean_token_accuracy": 0.6555290788412094, + "num_tokens": 42415453.0, + "step": 1295 + }, + { + "epoch": 0.10780330043950577, + "grad_norm": 1.3532664775848389, + "learning_rate": 9.998188326979895e-05, + "loss": 1.3507, + "mean_token_accuracy": 0.6577895864844322, + "num_tokens": 42579293.0, + "step": 1300 + }, + { + "epoch": 0.10821792851811925, + "grad_norm": 1.3476784229278564, + "learning_rate": 9.997988300189816e-05, + "loss": 1.3537, + "mean_token_accuracy": 0.6619501486420631, + "num_tokens": 42743133.0, + "step": 1305 + }, + { + "epoch": 0.10863255659673272, + "grad_norm": 1.2422707080841064, + "learning_rate": 9.997777803673944e-05, + "loss": 1.3581, + "mean_token_accuracy": 0.6607506588101387, + "num_tokens": 42905995.0, + "step": 1310 + }, + { + "epoch": 0.10904718467534621, + "grad_norm": 1.3391062021255493, + "learning_rate": 9.997556837873228e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.668255127966404, + "num_tokens": 43069835.0, + "step": 1315 + }, + { + "epoch": 0.10946181275395969, + "grad_norm": 1.22450852394104, + "learning_rate": 9.997325403250541e-05, + "loss": 1.3347, + "mean_token_accuracy": 0.6667644187808037, + "num_tokens": 43233675.0, + "step": 1320 + }, + { + "epoch": 0.10987644083257318, + "grad_norm": 1.3081496953964233, + "learning_rate": 9.997083500290694e-05, + "loss": 1.3801, + "mean_token_accuracy": 0.6562072336673737, + "num_tokens": 43397515.0, + "step": 1325 + }, + { + "epoch": 0.11029106891118666, + "grad_norm": 1.230015516281128, + "learning_rate": 9.99683112950042e-05, + "loss": 1.3381, + "mean_token_accuracy": 0.6677419349551201, + "num_tokens": 43561355.0, + "step": 1330 + }, + { + "epoch": 0.11070569698980015, + "grad_norm": 1.3189239501953125, + "learning_rate": 9.996568291408379e-05, + "loss": 1.3946, + "mean_token_accuracy": 0.6497922793030739, + "num_tokens": 43725195.0, + "step": 1335 + }, + { + "epoch": 0.11112032506841363, + "grad_norm": 1.2687530517578125, + "learning_rate": 9.996294986565166e-05, + "loss": 1.3682, + "mean_token_accuracy": 0.6570442348718644, + "num_tokens": 43889035.0, + "step": 1340 + }, + { + "epoch": 0.11153495314702712, + "grad_norm": 1.2718192338943481, + "learning_rate": 9.996011215543296e-05, + "loss": 1.426, + "mean_token_accuracy": 0.6486681327223778, + "num_tokens": 44052875.0, + "step": 1345 + }, + { + "epoch": 0.1119495812256406, + "grad_norm": 1.4578088521957397, + "learning_rate": 9.995716978937203e-05, + "loss": 1.3986, + "mean_token_accuracy": 0.6559811875224113, + "num_tokens": 44216715.0, + "step": 1350 + }, + { + "epoch": 0.11236420930425409, + "grad_norm": 1.1890809535980225, + "learning_rate": 9.995412277363261e-05, + "loss": 1.336, + "mean_token_accuracy": 0.6617302060127258, + "num_tokens": 44380555.0, + "step": 1355 + }, + { + "epoch": 0.11277883738286756, + "grad_norm": 1.243876338005066, + "learning_rate": 9.995097111459747e-05, + "loss": 1.3838, + "mean_token_accuracy": 0.6524254620075226, + "num_tokens": 44544395.0, + "step": 1360 + }, + { + "epoch": 0.11319346546148105, + "grad_norm": 1.3206053972244263, + "learning_rate": 9.994771481886869e-05, + "loss": 1.3321, + "mean_token_accuracy": 0.6657380282878875, + "num_tokens": 44708235.0, + "step": 1365 + }, + { + "epoch": 0.11360809354009453, + "grad_norm": 1.2924857139587402, + "learning_rate": 9.994435389326753e-05, + "loss": 1.3351, + "mean_token_accuracy": 0.6672470673918725, + "num_tokens": 44872075.0, + "step": 1370 + }, + { + "epoch": 0.11402272161870802, + "grad_norm": 1.2311909198760986, + "learning_rate": 9.99408883448344e-05, + "loss": 1.3773, + "mean_token_accuracy": 0.6586265876889229, + "num_tokens": 45035915.0, + "step": 1375 + }, + { + "epoch": 0.1144373496973215, + "grad_norm": 1.2170907258987427, + "learning_rate": 9.99373181808289e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.659329180419445, + "num_tokens": 45199755.0, + "step": 1380 + }, + { + "epoch": 0.11485197777593499, + "grad_norm": 1.245579481124878, + "learning_rate": 9.993364340872977e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6624366760253906, + "num_tokens": 45362745.0, + "step": 1385 + }, + { + "epoch": 0.11526660585454847, + "grad_norm": 1.2931832075119019, + "learning_rate": 9.992986403623487e-05, + "loss": 1.3698, + "mean_token_accuracy": 0.6562740832567215, + "num_tokens": 45525978.0, + "step": 1390 + }, + { + "epoch": 0.11568123393316196, + "grad_norm": 1.2172212600708008, + "learning_rate": 9.992598007126117e-05, + "loss": 1.3982, + "mean_token_accuracy": 0.6571908593177795, + "num_tokens": 45689818.0, + "step": 1395 + }, + { + "epoch": 0.11609586201177544, + "grad_norm": 1.2430282831192017, + "learning_rate": 9.99219915219448e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6602211631834507, + "num_tokens": 45853658.0, + "step": 1400 + }, + { + "epoch": 0.11651049009038893, + "grad_norm": 1.2654681205749512, + "learning_rate": 9.991789839664087e-05, + "loss": 1.3851, + "mean_token_accuracy": 0.655498529970646, + "num_tokens": 46017498.0, + "step": 1405 + }, + { + "epoch": 0.1169251181690024, + "grad_norm": 1.4644423723220825, + "learning_rate": 9.991370070392363e-05, + "loss": 1.3919, + "mean_token_accuracy": 0.6564149558544159, + "num_tokens": 46181338.0, + "step": 1410 + }, + { + "epoch": 0.1173397462476159, + "grad_norm": 1.237156629562378, + "learning_rate": 9.990939845258638e-05, + "loss": 1.3377, + "mean_token_accuracy": 0.6658663243055344, + "num_tokens": 46345178.0, + "step": 1415 + }, + { + "epoch": 0.11775437432622937, + "grad_norm": 1.3756873607635498, + "learning_rate": 9.990499165164139e-05, + "loss": 1.3564, + "mean_token_accuracy": 0.6600928664207458, + "num_tokens": 46509018.0, + "step": 1420 + }, + { + "epoch": 0.11816900240484286, + "grad_norm": 1.257973551750183, + "learning_rate": 9.990048031031999e-05, + "loss": 1.3685, + "mean_token_accuracy": 0.6592802986502647, + "num_tokens": 46672858.0, + "step": 1425 + }, + { + "epoch": 0.11858363048345634, + "grad_norm": 1.257059097290039, + "learning_rate": 9.989586443807248e-05, + "loss": 1.307, + "mean_token_accuracy": 0.6718108534812928, + "num_tokens": 46836698.0, + "step": 1430 + }, + { + "epoch": 0.11899825856206982, + "grad_norm": 1.324131965637207, + "learning_rate": 9.989114404456814e-05, + "loss": 1.4071, + "mean_token_accuracy": 0.6508858755230904, + "num_tokens": 47000538.0, + "step": 1435 + }, + { + "epoch": 0.11941288664068331, + "grad_norm": 1.2344211339950562, + "learning_rate": 9.988631913969519e-05, + "loss": 1.334, + "mean_token_accuracy": 0.6659274190664292, + "num_tokens": 47164378.0, + "step": 1440 + }, + { + "epoch": 0.11982751471929678, + "grad_norm": 1.2423447370529175, + "learning_rate": 9.988138973356079e-05, + "loss": 1.3058, + "mean_token_accuracy": 0.671522231400013, + "num_tokens": 47327560.0, + "step": 1445 + }, + { + "epoch": 0.12024214279791028, + "grad_norm": 1.4536687135696411, + "learning_rate": 9.987635583649097e-05, + "loss": 1.4061, + "mean_token_accuracy": 0.6506109446287155, + "num_tokens": 47491400.0, + "step": 1450 + }, + { + "epoch": 0.12065677087652375, + "grad_norm": 1.1907180547714233, + "learning_rate": 9.987121745903072e-05, + "loss": 1.378, + "mean_token_accuracy": 0.657317741215229, + "num_tokens": 47654463.0, + "step": 1455 + }, + { + "epoch": 0.12107139895513724, + "grad_norm": 1.2786400318145752, + "learning_rate": 9.986597461194382e-05, + "loss": 1.3535, + "mean_token_accuracy": 0.660117307305336, + "num_tokens": 47818303.0, + "step": 1460 + }, + { + "epoch": 0.12148602703375072, + "grad_norm": 1.2278717756271362, + "learning_rate": 9.986062730621294e-05, + "loss": 1.3666, + "mean_token_accuracy": 0.6596590921282768, + "num_tokens": 47982143.0, + "step": 1465 + }, + { + "epoch": 0.12190065511236421, + "grad_norm": 1.2639421224594116, + "learning_rate": 9.985517555303954e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6585043981671334, + "num_tokens": 48145983.0, + "step": 1470 + }, + { + "epoch": 0.12231528319097769, + "grad_norm": 1.2990895509719849, + "learning_rate": 9.984961936384389e-05, + "loss": 1.2915, + "mean_token_accuracy": 0.6701307401061058, + "num_tokens": 48309823.0, + "step": 1475 + }, + { + "epoch": 0.12272991126959118, + "grad_norm": 1.217031717300415, + "learning_rate": 9.984395875026504e-05, + "loss": 1.4054, + "mean_token_accuracy": 0.6530821815133094, + "num_tokens": 48473287.0, + "step": 1480 + }, + { + "epoch": 0.12314453934820466, + "grad_norm": 1.3979650735855103, + "learning_rate": 9.983819372416077e-05, + "loss": 1.3625, + "mean_token_accuracy": 0.661717988550663, + "num_tokens": 48637127.0, + "step": 1485 + }, + { + "epoch": 0.12355916742681815, + "grad_norm": 1.1983777284622192, + "learning_rate": 9.983232429760756e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6667155444622039, + "num_tokens": 48800967.0, + "step": 1490 + }, + { + "epoch": 0.12397379550543162, + "grad_norm": 1.2083327770233154, + "learning_rate": 9.982635048290065e-05, + "loss": 1.3212, + "mean_token_accuracy": 0.6723729208111763, + "num_tokens": 48964807.0, + "step": 1495 + }, + { + "epoch": 0.12438842358404512, + "grad_norm": 1.2176861763000488, + "learning_rate": 9.98202722925539e-05, + "loss": 1.3341, + "mean_token_accuracy": 0.6651077300310135, + "num_tokens": 49128200.0, + "step": 1500 + }, + { + "epoch": 0.12480305166265859, + "grad_norm": 1.248131513595581, + "learning_rate": 9.981408973929984e-05, + "loss": 1.3594, + "mean_token_accuracy": 0.6569226816296577, + "num_tokens": 49291711.0, + "step": 1505 + }, + { + "epoch": 0.12521767974127207, + "grad_norm": 1.2244327068328857, + "learning_rate": 9.980780283608962e-05, + "loss": 1.3562, + "mean_token_accuracy": 0.6625122189521789, + "num_tokens": 49455551.0, + "step": 1510 + }, + { + "epoch": 0.12563230781988557, + "grad_norm": 1.1873289346694946, + "learning_rate": 9.980141159609292e-05, + "loss": 1.318, + "mean_token_accuracy": 0.6686216980218888, + "num_tokens": 49619391.0, + "step": 1515 + }, + { + "epoch": 0.12604693589849905, + "grad_norm": 1.2468986511230469, + "learning_rate": 9.979491603269807e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6572947204113007, + "num_tokens": 49783231.0, + "step": 1520 + }, + { + "epoch": 0.12646156397711253, + "grad_norm": 1.2199991941452026, + "learning_rate": 9.97883161595119e-05, + "loss": 1.3319, + "mean_token_accuracy": 0.661113141477108, + "num_tokens": 49947071.0, + "step": 1525 + }, + { + "epoch": 0.126876192055726, + "grad_norm": 1.2399156093597412, + "learning_rate": 9.978161199035973e-05, + "loss": 1.3715, + "mean_token_accuracy": 0.6594170108437538, + "num_tokens": 50110092.0, + "step": 1530 + }, + { + "epoch": 0.1272908201343395, + "grad_norm": 1.2146192789077759, + "learning_rate": 9.977480353928537e-05, + "loss": 1.3009, + "mean_token_accuracy": 0.6711876869201661, + "num_tokens": 50273932.0, + "step": 1535 + }, + { + "epoch": 0.127705448212953, + "grad_norm": 1.263193964958191, + "learning_rate": 9.97678908205511e-05, + "loss": 1.4342, + "mean_token_accuracy": 0.6495478972792625, + "num_tokens": 50437772.0, + "step": 1540 + }, + { + "epoch": 0.12812007629156646, + "grad_norm": 1.2597016096115112, + "learning_rate": 9.97608738486376e-05, + "loss": 1.37, + "mean_token_accuracy": 0.6623414978384972, + "num_tokens": 50601325.0, + "step": 1545 + }, + { + "epoch": 0.12853470437017994, + "grad_norm": 1.2344940900802612, + "learning_rate": 9.975375263824392e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6563782960176467, + "num_tokens": 50765165.0, + "step": 1550 + }, + { + "epoch": 0.12894933244879342, + "grad_norm": 1.2537332773208618, + "learning_rate": 9.974652720428747e-05, + "loss": 1.3234, + "mean_token_accuracy": 0.6657013684511185, + "num_tokens": 50929005.0, + "step": 1555 + }, + { + "epoch": 0.12936396052740692, + "grad_norm": 1.1453742980957031, + "learning_rate": 9.973919756190407e-05, + "loss": 1.3031, + "mean_token_accuracy": 0.6742241024971009, + "num_tokens": 51092845.0, + "step": 1560 + }, + { + "epoch": 0.1297785886060204, + "grad_norm": 1.1941858530044556, + "learning_rate": 9.973176372644771e-05, + "loss": 1.3448, + "mean_token_accuracy": 0.6630070850253105, + "num_tokens": 51256685.0, + "step": 1565 + }, + { + "epoch": 0.13019321668463388, + "grad_norm": 1.1861906051635742, + "learning_rate": 9.97242257134907e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6665872409939766, + "num_tokens": 51420525.0, + "step": 1570 + }, + { + "epoch": 0.13060784476324735, + "grad_norm": 1.1945332288742065, + "learning_rate": 9.971658353882359e-05, + "loss": 1.3328, + "mean_token_accuracy": 0.6672470673918725, + "num_tokens": 51584365.0, + "step": 1575 + }, + { + "epoch": 0.13102247284186086, + "grad_norm": 1.3252277374267578, + "learning_rate": 9.970883721845513e-05, + "loss": 1.3837, + "mean_token_accuracy": 0.6575146600604057, + "num_tokens": 51748205.0, + "step": 1580 + }, + { + "epoch": 0.13143710092047434, + "grad_norm": 1.281628966331482, + "learning_rate": 9.97009867686122e-05, + "loss": 1.3115, + "mean_token_accuracy": 0.6694892480969429, + "num_tokens": 51912045.0, + "step": 1585 + }, + { + "epoch": 0.1318517289990878, + "grad_norm": 1.1837189197540283, + "learning_rate": 9.969303220573985e-05, + "loss": 1.2931, + "mean_token_accuracy": 0.6696419835090637, + "num_tokens": 52075885.0, + "step": 1590 + }, + { + "epoch": 0.1322663570777013, + "grad_norm": 1.196635365486145, + "learning_rate": 9.968497354650116e-05, + "loss": 1.344, + "mean_token_accuracy": 0.6630742907524109, + "num_tokens": 52239725.0, + "step": 1595 + }, + { + "epoch": 0.1326809851563148, + "grad_norm": 1.1934854984283447, + "learning_rate": 9.967681080777735e-05, + "loss": 1.4029, + "mean_token_accuracy": 0.6529447734355927, + "num_tokens": 52403565.0, + "step": 1600 + }, + { + "epoch": 0.13309561323492827, + "grad_norm": 1.2019221782684326, + "learning_rate": 9.966854400666762e-05, + "loss": 1.4377, + "mean_token_accuracy": 0.6488391980528831, + "num_tokens": 52567405.0, + "step": 1605 + }, + { + "epoch": 0.13351024131354175, + "grad_norm": 1.150558352470398, + "learning_rate": 9.966017316048917e-05, + "loss": 1.3579, + "mean_token_accuracy": 0.663917401432991, + "num_tokens": 52731245.0, + "step": 1610 + }, + { + "epoch": 0.13392486939215523, + "grad_norm": 1.1604512929916382, + "learning_rate": 9.965169828677711e-05, + "loss": 1.316, + "mean_token_accuracy": 0.6674120202660561, + "num_tokens": 52895085.0, + "step": 1615 + }, + { + "epoch": 0.13433949747076873, + "grad_norm": 1.2017782926559448, + "learning_rate": 9.964311940328456e-05, + "loss": 1.2988, + "mean_token_accuracy": 0.6694525897502899, + "num_tokens": 53058925.0, + "step": 1620 + }, + { + "epoch": 0.1347541255493822, + "grad_norm": 1.1345432996749878, + "learning_rate": 9.963443652798244e-05, + "loss": 1.3817, + "mean_token_accuracy": 0.6574510850012303, + "num_tokens": 53222243.0, + "step": 1625 + }, + { + "epoch": 0.13516875362799569, + "grad_norm": 1.1749430894851685, + "learning_rate": 9.96256496790595e-05, + "loss": 1.3316, + "mean_token_accuracy": 0.6657746791839599, + "num_tokens": 53386083.0, + "step": 1630 + }, + { + "epoch": 0.13558338170660916, + "grad_norm": 1.196176528930664, + "learning_rate": 9.961675887492236e-05, + "loss": 1.3431, + "mean_token_accuracy": 0.6665762454271317, + "num_tokens": 53548930.0, + "step": 1635 + }, + { + "epoch": 0.13599800978522267, + "grad_norm": 1.1467273235321045, + "learning_rate": 9.96077641341954e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.6667259722948075, + "num_tokens": 53711382.0, + "step": 1640 + }, + { + "epoch": 0.13641263786383614, + "grad_norm": 1.1617478132247925, + "learning_rate": 9.959866547572061e-05, + "loss": 1.3364, + "mean_token_accuracy": 0.6599401280283927, + "num_tokens": 53875222.0, + "step": 1645 + }, + { + "epoch": 0.13682726594244962, + "grad_norm": 1.2925304174423218, + "learning_rate": 9.958946291855781e-05, + "loss": 1.4382, + "mean_token_accuracy": 0.6486009255051612, + "num_tokens": 54039062.0, + "step": 1650 + }, + { + "epoch": 0.1372418940210631, + "grad_norm": 1.1700193881988525, + "learning_rate": 9.958015648198441e-05, + "loss": 1.2947, + "mean_token_accuracy": 0.6765765935182572, + "num_tokens": 54202882.0, + "step": 1655 + }, + { + "epoch": 0.1376565220996766, + "grad_norm": 1.3198378086090088, + "learning_rate": 9.95707461854954e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6575268775224685, + "num_tokens": 54366722.0, + "step": 1660 + }, + { + "epoch": 0.13807115017829008, + "grad_norm": 1.2401063442230225, + "learning_rate": 9.956123204880335e-05, + "loss": 1.4184, + "mean_token_accuracy": 0.6541998594999313, + "num_tokens": 54530471.0, + "step": 1665 + }, + { + "epoch": 0.13848577825690356, + "grad_norm": 1.2013182640075684, + "learning_rate": 9.955161409183838e-05, + "loss": 1.426, + "mean_token_accuracy": 0.6528225809335708, + "num_tokens": 54694311.0, + "step": 1670 + }, + { + "epoch": 0.13890040633551703, + "grad_norm": 1.1475543975830078, + "learning_rate": 9.954189233474807e-05, + "loss": 1.3758, + "mean_token_accuracy": 0.6577040582895279, + "num_tokens": 54858151.0, + "step": 1675 + }, + { + "epoch": 0.1393150344141305, + "grad_norm": 1.1979186534881592, + "learning_rate": 9.953206679789742e-05, + "loss": 1.3477, + "mean_token_accuracy": 0.6635973781347275, + "num_tokens": 55020989.0, + "step": 1680 + }, + { + "epoch": 0.13972966249274402, + "grad_norm": 1.1845098733901978, + "learning_rate": 9.952213750186885e-05, + "loss": 1.3626, + "mean_token_accuracy": 0.6599279105663299, + "num_tokens": 55184829.0, + "step": 1685 + }, + { + "epoch": 0.1401442905713575, + "grad_norm": 1.218155026435852, + "learning_rate": 9.951210446746215e-05, + "loss": 1.3181, + "mean_token_accuracy": 0.6701835080981254, + "num_tokens": 55348280.0, + "step": 1690 + }, + { + "epoch": 0.14055891864997097, + "grad_norm": 1.219197154045105, + "learning_rate": 9.950196771569438e-05, + "loss": 1.3093, + "mean_token_accuracy": 0.6754842758178711, + "num_tokens": 55511537.0, + "step": 1695 + }, + { + "epoch": 0.14097354672858445, + "grad_norm": 1.195906400680542, + "learning_rate": 9.94917272677999e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.6552663698792458, + "num_tokens": 55675377.0, + "step": 1700 + }, + { + "epoch": 0.14138817480719795, + "grad_norm": 1.2281585931777954, + "learning_rate": 9.948138314523026e-05, + "loss": 1.3823, + "mean_token_accuracy": 0.6584310859441758, + "num_tokens": 55839217.0, + "step": 1705 + }, + { + "epoch": 0.14180280288581143, + "grad_norm": 1.2239000797271729, + "learning_rate": 9.947093536965422e-05, + "loss": 1.294, + "mean_token_accuracy": 0.6741141244769097, + "num_tokens": 56003057.0, + "step": 1710 + }, + { + "epoch": 0.1422174309644249, + "grad_norm": 1.1687766313552856, + "learning_rate": 9.946038396295765e-05, + "loss": 1.3398, + "mean_token_accuracy": 0.6643632367253304, + "num_tokens": 56166360.0, + "step": 1715 + }, + { + "epoch": 0.14263205904303838, + "grad_norm": 1.099410891532898, + "learning_rate": 9.94497289472435e-05, + "loss": 1.3683, + "mean_token_accuracy": 0.6596774220466614, + "num_tokens": 56330200.0, + "step": 1720 + }, + { + "epoch": 0.1430466871216519, + "grad_norm": 1.2177159786224365, + "learning_rate": 9.943897034483178e-05, + "loss": 1.3449, + "mean_token_accuracy": 0.6664161801338195, + "num_tokens": 56494040.0, + "step": 1725 + }, + { + "epoch": 0.14346131520026537, + "grad_norm": 1.15543532371521, + "learning_rate": 9.942810817825948e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.656199187040329, + "num_tokens": 56657123.0, + "step": 1730 + }, + { + "epoch": 0.14387594327887884, + "grad_norm": 1.1122572422027588, + "learning_rate": 9.941714247028053e-05, + "loss": 1.2874, + "mean_token_accuracy": 0.6738514184951783, + "num_tokens": 56820963.0, + "step": 1735 + }, + { + "epoch": 0.14429057135749232, + "grad_norm": 1.126460075378418, + "learning_rate": 9.940607324386577e-05, + "loss": 1.3429, + "mean_token_accuracy": 0.6653286918997765, + "num_tokens": 56984803.0, + "step": 1740 + }, + { + "epoch": 0.14470519943610582, + "grad_norm": 1.163468360900879, + "learning_rate": 9.939490052220289e-05, + "loss": 1.3763, + "mean_token_accuracy": 0.6573924794793129, + "num_tokens": 57148643.0, + "step": 1745 + }, + { + "epoch": 0.1451198275147193, + "grad_norm": 1.1244533061981201, + "learning_rate": 9.938362432869635e-05, + "loss": 1.2433, + "mean_token_accuracy": 0.6807140037417412, + "num_tokens": 57311792.0, + "step": 1750 + }, + { + "epoch": 0.14553445559333278, + "grad_norm": 1.2314460277557373, + "learning_rate": 9.93722446869674e-05, + "loss": 1.3881, + "mean_token_accuracy": 0.657117547094822, + "num_tokens": 57475632.0, + "step": 1755 + }, + { + "epoch": 0.14594908367194626, + "grad_norm": 1.1483550071716309, + "learning_rate": 9.936076162085397e-05, + "loss": 1.3635, + "mean_token_accuracy": 0.6604166641831398, + "num_tokens": 57639472.0, + "step": 1760 + }, + { + "epoch": 0.14636371175055976, + "grad_norm": 1.1495261192321777, + "learning_rate": 9.934917515441066e-05, + "loss": 1.3488, + "mean_token_accuracy": 0.6656769335269928, + "num_tokens": 57803312.0, + "step": 1765 + }, + { + "epoch": 0.14677833982917324, + "grad_norm": 1.1304582357406616, + "learning_rate": 9.933748531190865e-05, + "loss": 1.4188, + "mean_token_accuracy": 0.6531586021184921, + "num_tokens": 57967152.0, + "step": 1770 + }, + { + "epoch": 0.14719296790778671, + "grad_norm": 1.192624807357788, + "learning_rate": 9.932569211783567e-05, + "loss": 1.3548, + "mean_token_accuracy": 0.6607099235057831, + "num_tokens": 58130992.0, + "step": 1775 + }, + { + "epoch": 0.1476075959864002, + "grad_norm": 1.160388469696045, + "learning_rate": 9.9313795596896e-05, + "loss": 1.3282, + "mean_token_accuracy": 0.6654569894075394, + "num_tokens": 58294832.0, + "step": 1780 + }, + { + "epoch": 0.14802222406501367, + "grad_norm": 1.136030912399292, + "learning_rate": 9.930179577401029e-05, + "loss": 1.3372, + "mean_token_accuracy": 0.6637654319405556, + "num_tokens": 58458483.0, + "step": 1785 + }, + { + "epoch": 0.14843685214362717, + "grad_norm": 1.1632835865020752, + "learning_rate": 9.928969267431564e-05, + "loss": 1.4023, + "mean_token_accuracy": 0.6540853947401046, + "num_tokens": 58621575.0, + "step": 1790 + }, + { + "epoch": 0.14885148022224065, + "grad_norm": 1.1856722831726074, + "learning_rate": 9.927748632316549e-05, + "loss": 1.3775, + "mean_token_accuracy": 0.6556512728333473, + "num_tokens": 58785415.0, + "step": 1795 + }, + { + "epoch": 0.14926610830085413, + "grad_norm": 1.1614913940429688, + "learning_rate": 9.926517674612952e-05, + "loss": 1.2629, + "mean_token_accuracy": 0.6784029841423035, + "num_tokens": 58949255.0, + "step": 1800 + }, + { + "epoch": 0.1496807363794676, + "grad_norm": 1.094176173210144, + "learning_rate": 9.925276396899372e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.6692265391349792, + "num_tokens": 59113095.0, + "step": 1805 + }, + { + "epoch": 0.1500953644580811, + "grad_norm": 1.1450884342193604, + "learning_rate": 9.924024801776022e-05, + "loss": 1.4249, + "mean_token_accuracy": 0.6477822616696358, + "num_tokens": 59276935.0, + "step": 1810 + }, + { + "epoch": 0.1505099925366946, + "grad_norm": 1.1223996877670288, + "learning_rate": 9.922762891864728e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6619623631238938, + "num_tokens": 59440775.0, + "step": 1815 + }, + { + "epoch": 0.15092462061530806, + "grad_norm": 1.2567800283432007, + "learning_rate": 9.921490669808924e-05, + "loss": 1.3815, + "mean_token_accuracy": 0.656121701002121, + "num_tokens": 59604615.0, + "step": 1820 + }, + { + "epoch": 0.15133924869392154, + "grad_norm": 1.17715322971344, + "learning_rate": 9.920208138273644e-05, + "loss": 1.3506, + "mean_token_accuracy": 0.6625794246792793, + "num_tokens": 59768455.0, + "step": 1825 + }, + { + "epoch": 0.15175387677253505, + "grad_norm": 1.162170171737671, + "learning_rate": 9.91891529994552e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6716727122664452, + "num_tokens": 59931303.0, + "step": 1830 + }, + { + "epoch": 0.15216850485114852, + "grad_norm": 6.644440174102783, + "learning_rate": 9.917612157532777e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6615713581442833, + "num_tokens": 60095143.0, + "step": 1835 + }, + { + "epoch": 0.152583132929762, + "grad_norm": 1.140966534614563, + "learning_rate": 9.916298713765219e-05, + "loss": 1.3936, + "mean_token_accuracy": 0.6620845571160316, + "num_tokens": 60258983.0, + "step": 1840 + }, + { + "epoch": 0.15299776100837548, + "grad_norm": 1.184509515762329, + "learning_rate": 9.914974971394233e-05, + "loss": 1.3678, + "mean_token_accuracy": 0.6639601692557335, + "num_tokens": 60422823.0, + "step": 1845 + }, + { + "epoch": 0.15341238908698898, + "grad_norm": 1.1792467832565308, + "learning_rate": 9.913640933192778e-05, + "loss": 1.3296, + "mean_token_accuracy": 0.6620417848229408, + "num_tokens": 60586663.0, + "step": 1850 + }, + { + "epoch": 0.15382701716560246, + "grad_norm": 1.1453526020050049, + "learning_rate": 9.912296601955384e-05, + "loss": 1.3378, + "mean_token_accuracy": 0.6599584549665451, + "num_tokens": 60750503.0, + "step": 1855 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 1.1483008861541748, + "learning_rate": 9.910941980498136e-05, + "loss": 1.3822, + "mean_token_accuracy": 0.6558467775583268, + "num_tokens": 60914343.0, + "step": 1860 + }, + { + "epoch": 0.1546562733228294, + "grad_norm": 1.1345916986465454, + "learning_rate": 9.90957707165868e-05, + "loss": 1.2497, + "mean_token_accuracy": 0.678806209564209, + "num_tokens": 61078183.0, + "step": 1865 + }, + { + "epoch": 0.15507090140144292, + "grad_norm": 1.1539981365203857, + "learning_rate": 9.908201878296212e-05, + "loss": 1.2889, + "mean_token_accuracy": 0.6768389582633972, + "num_tokens": 61242023.0, + "step": 1870 + }, + { + "epoch": 0.1554855294800564, + "grad_norm": 1.1856356859207153, + "learning_rate": 9.906816403291471e-05, + "loss": 1.4199, + "mean_token_accuracy": 0.6503726780414582, + "num_tokens": 61405863.0, + "step": 1875 + }, + { + "epoch": 0.15590015755866987, + "grad_norm": 1.1519221067428589, + "learning_rate": 9.905420649546731e-05, + "loss": 1.3275, + "mean_token_accuracy": 0.6667111247777939, + "num_tokens": 61568850.0, + "step": 1880 + }, + { + "epoch": 0.15631478563728335, + "grad_norm": 1.2015529870986938, + "learning_rate": 9.904014619985802e-05, + "loss": 1.3551, + "mean_token_accuracy": 0.6642350882291794, + "num_tokens": 61732690.0, + "step": 1885 + }, + { + "epoch": 0.15672941371589685, + "grad_norm": 1.0787674188613892, + "learning_rate": 9.902598317554018e-05, + "loss": 1.3456, + "mean_token_accuracy": 0.6618436604738236, + "num_tokens": 61895552.0, + "step": 1890 + }, + { + "epoch": 0.15714404179451033, + "grad_norm": 1.1786774396896362, + "learning_rate": 9.901171745218229e-05, + "loss": 1.3969, + "mean_token_accuracy": 0.6578751221299172, + "num_tokens": 62059392.0, + "step": 1895 + }, + { + "epoch": 0.1575586698731238, + "grad_norm": 1.1005985736846924, + "learning_rate": 9.899734905966804e-05, + "loss": 1.3296, + "mean_token_accuracy": 0.6645833313465118, + "num_tokens": 62223232.0, + "step": 1900 + }, + { + "epoch": 0.15797329795173728, + "grad_norm": 1.1051074266433716, + "learning_rate": 9.898287802809619e-05, + "loss": 1.4501, + "mean_token_accuracy": 0.6514662764966488, + "num_tokens": 62387072.0, + "step": 1905 + }, + { + "epoch": 0.15838792603035076, + "grad_norm": 1.10177481174469, + "learning_rate": 9.896830438778043e-05, + "loss": 1.3444, + "mean_token_accuracy": 0.6648582607507706, + "num_tokens": 62550912.0, + "step": 1910 + }, + { + "epoch": 0.15880255410896427, + "grad_norm": 1.9008870124816895, + "learning_rate": 9.895362816924949e-05, + "loss": 1.3342, + "mean_token_accuracy": 0.6668680176138878, + "num_tokens": 62714058.0, + "step": 1915 + }, + { + "epoch": 0.15921718218757774, + "grad_norm": 1.183345913887024, + "learning_rate": 9.893884940324691e-05, + "loss": 1.3417, + "mean_token_accuracy": 0.6646994158625603, + "num_tokens": 62877898.0, + "step": 1920 + }, + { + "epoch": 0.15963181026619122, + "grad_norm": 1.2330020666122437, + "learning_rate": 9.89239681207311e-05, + "loss": 1.4101, + "mean_token_accuracy": 0.6546187698841095, + "num_tokens": 63041738.0, + "step": 1925 + }, + { + "epoch": 0.1600464383448047, + "grad_norm": 1.7111624479293823, + "learning_rate": 9.890898435287517e-05, + "loss": 1.3232, + "mean_token_accuracy": 0.6709066450595855, + "num_tokens": 63205578.0, + "step": 1930 + }, + { + "epoch": 0.1604610664234182, + "grad_norm": 1.1247528791427612, + "learning_rate": 9.889389813106693e-05, + "loss": 1.3438, + "mean_token_accuracy": 0.6666116818785668, + "num_tokens": 63369418.0, + "step": 1935 + }, + { + "epoch": 0.16087569450203168, + "grad_norm": 1.138061761856079, + "learning_rate": 9.887870948690885e-05, + "loss": 1.2895, + "mean_token_accuracy": 0.674154357612133, + "num_tokens": 63532687.0, + "step": 1940 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.2452406883239746, + "learning_rate": 9.886341845221787e-05, + "loss": 1.3505, + "mean_token_accuracy": 0.6643633976578712, + "num_tokens": 63696527.0, + "step": 1945 + }, + { + "epoch": 0.16170495065925863, + "grad_norm": 1.1234358549118042, + "learning_rate": 9.88480250590255e-05, + "loss": 1.3152, + "mean_token_accuracy": 0.6677847012877465, + "num_tokens": 63860367.0, + "step": 1950 + }, + { + "epoch": 0.16211957873787214, + "grad_norm": 1.212026834487915, + "learning_rate": 9.883252933957763e-05, + "loss": 1.3354, + "mean_token_accuracy": 0.663086511194706, + "num_tokens": 64024207.0, + "step": 1955 + }, + { + "epoch": 0.16253420681648562, + "grad_norm": 1.1730372905731201, + "learning_rate": 9.881693132633449e-05, + "loss": 1.4111, + "mean_token_accuracy": 0.650942749530077, + "num_tokens": 64187687.0, + "step": 1960 + }, + { + "epoch": 0.1629488348950991, + "grad_norm": 1.2520666122436523, + "learning_rate": 9.880123105197065e-05, + "loss": 1.4476, + "mean_token_accuracy": 0.6472551025450229, + "num_tokens": 64351075.0, + "step": 1965 + }, + { + "epoch": 0.16336346297371257, + "grad_norm": 1.1247053146362305, + "learning_rate": 9.878542854937482e-05, + "loss": 1.3724, + "mean_token_accuracy": 0.6632514670491219, + "num_tokens": 64514915.0, + "step": 1970 + }, + { + "epoch": 0.16377809105232607, + "grad_norm": 1.1914619207382202, + "learning_rate": 9.876952385164989e-05, + "loss": 1.4081, + "mean_token_accuracy": 0.6540505856275558, + "num_tokens": 64678755.0, + "step": 1975 + }, + { + "epoch": 0.16419271913093955, + "grad_norm": 1.2277207374572754, + "learning_rate": 9.875351699211285e-05, + "loss": 1.2936, + "mean_token_accuracy": 0.6741715192794799, + "num_tokens": 64842396.0, + "step": 1980 + }, + { + "epoch": 0.16460734720955303, + "grad_norm": 1.0599324703216553, + "learning_rate": 9.873740800429467e-05, + "loss": 1.2859, + "mean_token_accuracy": 0.6765945732593537, + "num_tokens": 65006236.0, + "step": 1985 + }, + { + "epoch": 0.1650219752881665, + "grad_norm": 1.1088241338729858, + "learning_rate": 9.872119692194027e-05, + "loss": 1.3476, + "mean_token_accuracy": 0.6675403237342834, + "num_tokens": 65170076.0, + "step": 1990 + }, + { + "epoch": 0.16543660336678, + "grad_norm": 1.1403690576553345, + "learning_rate": 9.87048837790084e-05, + "loss": 1.3365, + "mean_token_accuracy": 0.6665994673967361, + "num_tokens": 65333916.0, + "step": 1995 + }, + { + "epoch": 0.1658512314453935, + "grad_norm": 1.0955932140350342, + "learning_rate": 9.868846860967167e-05, + "loss": 1.3646, + "mean_token_accuracy": 0.662988756597042, + "num_tokens": 65497756.0, + "step": 2000 + }, + { + "epoch": 0.16626585952400696, + "grad_norm": 1.1397483348846436, + "learning_rate": 9.867195144831636e-05, + "loss": 1.435, + "mean_token_accuracy": 0.6478783056139946, + "num_tokens": 65660803.0, + "step": 2005 + }, + { + "epoch": 0.16668048760262044, + "grad_norm": 1.0877341032028198, + "learning_rate": 9.865533232954245e-05, + "loss": 1.3153, + "mean_token_accuracy": 0.6733687669038773, + "num_tokens": 65824643.0, + "step": 2010 + }, + { + "epoch": 0.16709511568123395, + "grad_norm": 1.2004203796386719, + "learning_rate": 9.863861128816344e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6670149102807045, + "num_tokens": 65988483.0, + "step": 2015 + }, + { + "epoch": 0.16750974375984742, + "grad_norm": 1.2210606336593628, + "learning_rate": 9.862178835920637e-05, + "loss": 1.359, + "mean_token_accuracy": 0.6611253671348095, + "num_tokens": 66152323.0, + "step": 2020 + }, + { + "epoch": 0.1679243718384609, + "grad_norm": 1.1666066646575928, + "learning_rate": 9.860486357791172e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.6706989288330079, + "num_tokens": 66316163.0, + "step": 2025 + }, + { + "epoch": 0.16833899991707438, + "grad_norm": 1.147369623184204, + "learning_rate": 9.85878369797333e-05, + "loss": 1.369, + "mean_token_accuracy": 0.6631170578300953, + "num_tokens": 66480003.0, + "step": 2030 + }, + { + "epoch": 0.16875362799568785, + "grad_norm": 1.1762323379516602, + "learning_rate": 9.857070860033826e-05, + "loss": 1.3241, + "mean_token_accuracy": 0.6665383711457252, + "num_tokens": 66643843.0, + "step": 2035 + }, + { + "epoch": 0.16916825607430136, + "grad_norm": 1.178361177444458, + "learning_rate": 9.855347847560689e-05, + "loss": 1.3434, + "mean_token_accuracy": 0.6594941362738609, + "num_tokens": 66807683.0, + "step": 2040 + }, + { + "epoch": 0.16958288415291484, + "grad_norm": 1.143797516822815, + "learning_rate": 9.853614664163265e-05, + "loss": 1.2285, + "mean_token_accuracy": 0.6801197454333305, + "num_tokens": 66971523.0, + "step": 2045 + }, + { + "epoch": 0.1699975122315283, + "grad_norm": 1.1586716175079346, + "learning_rate": 9.851871313472207e-05, + "loss": 1.3523, + "mean_token_accuracy": 0.6616935506463051, + "num_tokens": 67135363.0, + "step": 2050 + }, + { + "epoch": 0.1704121403101418, + "grad_norm": 1.1401805877685547, + "learning_rate": 9.850117799139464e-05, + "loss": 1.3736, + "mean_token_accuracy": 0.6606549307703972, + "num_tokens": 67299203.0, + "step": 2055 + }, + { + "epoch": 0.1708267683887553, + "grad_norm": 1.1220128536224365, + "learning_rate": 9.84835412483828e-05, + "loss": 1.2944, + "mean_token_accuracy": 0.675079420208931, + "num_tokens": 67463043.0, + "step": 2060 + }, + { + "epoch": 0.17124139646736877, + "grad_norm": 1.1114490032196045, + "learning_rate": 9.846580294263172e-05, + "loss": 1.3091, + "mean_token_accuracy": 0.6686273604631424, + "num_tokens": 67626374.0, + "step": 2065 + }, + { + "epoch": 0.17165602454598225, + "grad_norm": 1.176193118095398, + "learning_rate": 9.844796311129944e-05, + "loss": 1.3406, + "mean_token_accuracy": 0.6674609020352363, + "num_tokens": 67790214.0, + "step": 2070 + }, + { + "epoch": 0.17207065262459573, + "grad_norm": 1.157579779624939, + "learning_rate": 9.843002179175665e-05, + "loss": 1.3304, + "mean_token_accuracy": 0.6642900764942169, + "num_tokens": 67954054.0, + "step": 2075 + }, + { + "epoch": 0.17248528070320923, + "grad_norm": 1.1543059349060059, + "learning_rate": 9.841197902158653e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6732191652059555, + "num_tokens": 68117791.0, + "step": 2080 + }, + { + "epoch": 0.1728999087818227, + "grad_norm": 1.2211623191833496, + "learning_rate": 9.839383483858492e-05, + "loss": 1.3807, + "mean_token_accuracy": 0.6542844593524932, + "num_tokens": 68280791.0, + "step": 2085 + }, + { + "epoch": 0.17331453686043619, + "grad_norm": 1.1212056875228882, + "learning_rate": 9.837558928076003e-05, + "loss": 1.2909, + "mean_token_accuracy": 0.6729288831353187, + "num_tokens": 68444631.0, + "step": 2090 + }, + { + "epoch": 0.17372916493904966, + "grad_norm": 1.0925419330596924, + "learning_rate": 9.83572423863324e-05, + "loss": 1.2163, + "mean_token_accuracy": 0.6872311800718307, + "num_tokens": 68608471.0, + "step": 2095 + }, + { + "epoch": 0.17414379301766317, + "grad_norm": 1.1620479822158813, + "learning_rate": 9.833879419373493e-05, + "loss": 1.2925, + "mean_token_accuracy": 0.6768633931875229, + "num_tokens": 68772311.0, + "step": 2100 + }, + { + "epoch": 0.17455842109627664, + "grad_norm": 1.1429002285003662, + "learning_rate": 9.832024474161263e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6651820585131645, + "num_tokens": 68936151.0, + "step": 2105 + }, + { + "epoch": 0.17497304917489012, + "grad_norm": 1.1326366662979126, + "learning_rate": 9.83015940688227e-05, + "loss": 1.3674, + "mean_token_accuracy": 0.6595857813954353, + "num_tokens": 69099991.0, + "step": 2110 + }, + { + "epoch": 0.1753876772535036, + "grad_norm": 1.0936850309371948, + "learning_rate": 9.828284221443433e-05, + "loss": 1.3882, + "mean_token_accuracy": 0.6589015170931816, + "num_tokens": 69263831.0, + "step": 2115 + }, + { + "epoch": 0.1758023053321171, + "grad_norm": 1.0790454149246216, + "learning_rate": 9.826398921772868e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6558589950203896, + "num_tokens": 69427671.0, + "step": 2120 + }, + { + "epoch": 0.17621693341073058, + "grad_norm": 1.0382895469665527, + "learning_rate": 9.82450351181988e-05, + "loss": 1.3515, + "mean_token_accuracy": 0.6637157872319221, + "num_tokens": 69591511.0, + "step": 2125 + }, + { + "epoch": 0.17663156148934406, + "grad_norm": 1.116068959236145, + "learning_rate": 9.822597995554948e-05, + "loss": 1.336, + "mean_token_accuracy": 0.6685972645878792, + "num_tokens": 69755351.0, + "step": 2130 + }, + { + "epoch": 0.17704618956795753, + "grad_norm": 1.1191227436065674, + "learning_rate": 9.820682376969726e-05, + "loss": 1.3248, + "mean_token_accuracy": 0.6705806404352188, + "num_tokens": 69919049.0, + "step": 2135 + }, + { + "epoch": 0.17746081764657104, + "grad_norm": 1.1419486999511719, + "learning_rate": 9.818756660077029e-05, + "loss": 1.5134, + "mean_token_accuracy": 0.6402981460094452, + "num_tokens": 70082889.0, + "step": 2140 + }, + { + "epoch": 0.17787544572518452, + "grad_norm": 1.7518163919448853, + "learning_rate": 9.816820848910826e-05, + "loss": 1.3682, + "mean_token_accuracy": 0.6609827965497971, + "num_tokens": 70246624.0, + "step": 2145 + }, + { + "epoch": 0.178290073803798, + "grad_norm": 1.0493342876434326, + "learning_rate": 9.81487494752623e-05, + "loss": 1.2596, + "mean_token_accuracy": 0.678048625588417, + "num_tokens": 70410464.0, + "step": 2150 + }, + { + "epoch": 0.17870470188241147, + "grad_norm": 1.1223058700561523, + "learning_rate": 9.81291895999949e-05, + "loss": 1.3444, + "mean_token_accuracy": 0.6666483402252197, + "num_tokens": 70574304.0, + "step": 2155 + }, + { + "epoch": 0.17911932996102495, + "grad_norm": 1.0618618726730347, + "learning_rate": 9.810952890427989e-05, + "loss": 1.3212, + "mean_token_accuracy": 0.6679924234747887, + "num_tokens": 70738144.0, + "step": 2160 + }, + { + "epoch": 0.17953395803963845, + "grad_norm": 1.1819339990615845, + "learning_rate": 9.808976742930224e-05, + "loss": 1.429, + "mean_token_accuracy": 0.6516617774963379, + "num_tokens": 70901984.0, + "step": 2165 + }, + { + "epoch": 0.17994858611825193, + "grad_norm": 1.232933521270752, + "learning_rate": 9.806990521645805e-05, + "loss": 1.3836, + "mean_token_accuracy": 0.6575513169169426, + "num_tokens": 71065824.0, + "step": 2170 + }, + { + "epoch": 0.1803632141968654, + "grad_norm": 1.0888854265213013, + "learning_rate": 9.804994230735444e-05, + "loss": 1.3215, + "mean_token_accuracy": 0.6692053899168968, + "num_tokens": 71229122.0, + "step": 2175 + }, + { + "epoch": 0.18077784227547888, + "grad_norm": 1.100090503692627, + "learning_rate": 9.80298787438095e-05, + "loss": 1.3636, + "mean_token_accuracy": 0.6626466304063797, + "num_tokens": 71392962.0, + "step": 2180 + }, + { + "epoch": 0.1811924703540924, + "grad_norm": 1.1078429222106934, + "learning_rate": 9.800971456785209e-05, + "loss": 1.3774, + "mean_token_accuracy": 0.6605205282568931, + "num_tokens": 71556802.0, + "step": 2185 + }, + { + "epoch": 0.18160709843270587, + "grad_norm": 1.0692031383514404, + "learning_rate": 9.798944982172193e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.665487539768219, + "num_tokens": 71720642.0, + "step": 2190 + }, + { + "epoch": 0.18202172651131934, + "grad_norm": 1.105131983757019, + "learning_rate": 9.796908454786935e-05, + "loss": 1.4053, + "mean_token_accuracy": 0.6570259012281895, + "num_tokens": 71884482.0, + "step": 2195 + }, + { + "epoch": 0.18243635458993282, + "grad_norm": 1.1255451440811157, + "learning_rate": 9.794861878895527e-05, + "loss": 1.3658, + "mean_token_accuracy": 0.6593963801860809, + "num_tokens": 72048322.0, + "step": 2200 + }, + { + "epoch": 0.18285098266854632, + "grad_norm": 1.0730068683624268, + "learning_rate": 9.792805258785114e-05, + "loss": 1.4008, + "mean_token_accuracy": 0.6534824058413505, + "num_tokens": 72212162.0, + "step": 2205 + }, + { + "epoch": 0.1832656107471598, + "grad_norm": 1.0972033739089966, + "learning_rate": 9.790738598763875e-05, + "loss": 1.3791, + "mean_token_accuracy": 0.6571542024612427, + "num_tokens": 72376002.0, + "step": 2210 + }, + { + "epoch": 0.18368023882577328, + "grad_norm": 1.0816172361373901, + "learning_rate": 9.78866190316103e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6688905179500579, + "num_tokens": 72539842.0, + "step": 2215 + }, + { + "epoch": 0.18409486690438676, + "grad_norm": 1.131365180015564, + "learning_rate": 9.786575176326813e-05, + "loss": 1.3574, + "mean_token_accuracy": 0.6613880753517151, + "num_tokens": 72703682.0, + "step": 2220 + }, + { + "epoch": 0.18450949498300026, + "grad_norm": 1.1486268043518066, + "learning_rate": 9.784478422632473e-05, + "loss": 1.377, + "mean_token_accuracy": 0.6570772603154182, + "num_tokens": 72866724.0, + "step": 2225 + }, + { + "epoch": 0.18492412306161374, + "grad_norm": 1.094726324081421, + "learning_rate": 9.782371646470267e-05, + "loss": 1.4077, + "mean_token_accuracy": 0.6569770231842995, + "num_tokens": 73030564.0, + "step": 2230 + }, + { + "epoch": 0.18533875114022721, + "grad_norm": 1.1273894309997559, + "learning_rate": 9.780254852253444e-05, + "loss": 1.3788, + "mean_token_accuracy": 0.6608993165194988, + "num_tokens": 73194404.0, + "step": 2235 + }, + { + "epoch": 0.1857533792188407, + "grad_norm": 1.1158084869384766, + "learning_rate": 9.778128044416236e-05, + "loss": 1.3406, + "mean_token_accuracy": 0.6637341201305389, + "num_tokens": 73358244.0, + "step": 2240 + }, + { + "epoch": 0.1861680072974542, + "grad_norm": 1.0741921663284302, + "learning_rate": 9.77599122741386e-05, + "loss": 1.2999, + "mean_token_accuracy": 0.6719269290566444, + "num_tokens": 73522084.0, + "step": 2245 + }, + { + "epoch": 0.18658263537606767, + "grad_norm": 1.1117743253707886, + "learning_rate": 9.773844405722487e-05, + "loss": 1.3018, + "mean_token_accuracy": 0.6700600832700729, + "num_tokens": 73685472.0, + "step": 2250 + }, + { + "epoch": 0.18699726345468115, + "grad_norm": 1.135411024093628, + "learning_rate": 9.771687583839261e-05, + "loss": 1.285, + "mean_token_accuracy": 0.6762952029705047, + "num_tokens": 73849312.0, + "step": 2255 + }, + { + "epoch": 0.18741189153329463, + "grad_norm": 1.0903639793395996, + "learning_rate": 9.769520766282263e-05, + "loss": 1.3271, + "mean_token_accuracy": 0.6650048926472664, + "num_tokens": 74013152.0, + "step": 2260 + }, + { + "epoch": 0.1878265196119081, + "grad_norm": 1.1023638248443604, + "learning_rate": 9.767343957590516e-05, + "loss": 1.2548, + "mean_token_accuracy": 0.6818181812763214, + "num_tokens": 74176992.0, + "step": 2265 + }, + { + "epoch": 0.1882411476905216, + "grad_norm": 1.0778052806854248, + "learning_rate": 9.765157162323973e-05, + "loss": 1.3384, + "mean_token_accuracy": 0.663483626395464, + "num_tokens": 74340832.0, + "step": 2270 + }, + { + "epoch": 0.1886557757691351, + "grad_norm": 1.110249400138855, + "learning_rate": 9.762960385063506e-05, + "loss": 1.3886, + "mean_token_accuracy": 0.6580768913030625, + "num_tokens": 74504100.0, + "step": 2275 + }, + { + "epoch": 0.18907040384774856, + "grad_norm": 1.146621823310852, + "learning_rate": 9.7607536304109e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.6679115161299706, + "num_tokens": 74667795.0, + "step": 2280 + }, + { + "epoch": 0.18948503192636204, + "grad_norm": 1.2143663167953491, + "learning_rate": 9.758536902988835e-05, + "loss": 1.3847, + "mean_token_accuracy": 0.658492174744606, + "num_tokens": 74831635.0, + "step": 2285 + }, + { + "epoch": 0.18989966000497555, + "grad_norm": 1.1127694845199585, + "learning_rate": 9.756310207440886e-05, + "loss": 1.3139, + "mean_token_accuracy": 0.6696419835090637, + "num_tokens": 74995475.0, + "step": 2290 + }, + { + "epoch": 0.19031428808358902, + "grad_norm": 1.1862319707870483, + "learning_rate": 9.75407354843151e-05, + "loss": 1.2876, + "mean_token_accuracy": 0.6754197210073472, + "num_tokens": 75158878.0, + "step": 2295 + }, + { + "epoch": 0.1907289161622025, + "grad_norm": 1.1235371828079224, + "learning_rate": 9.751826930646031e-05, + "loss": 1.3592, + "mean_token_accuracy": 0.6648093804717063, + "num_tokens": 75322718.0, + "step": 2300 + }, + { + "epoch": 0.19114354424081598, + "grad_norm": 1.1142855882644653, + "learning_rate": 9.749570358790638e-05, + "loss": 1.2707, + "mean_token_accuracy": 0.6775232195854187, + "num_tokens": 75486558.0, + "step": 2305 + }, + { + "epoch": 0.19155817231942948, + "grad_norm": 1.0357214212417603, + "learning_rate": 9.74730383759237e-05, + "loss": 1.2893, + "mean_token_accuracy": 0.6758553296327591, + "num_tokens": 75650398.0, + "step": 2310 + }, + { + "epoch": 0.19197280039804296, + "grad_norm": 1.2712405920028687, + "learning_rate": 9.745027371799107e-05, + "loss": 1.3268, + "mean_token_accuracy": 0.6664589449763298, + "num_tokens": 75814238.0, + "step": 2315 + }, + { + "epoch": 0.19238742847665644, + "grad_norm": 1.0738105773925781, + "learning_rate": 9.742740966179567e-05, + "loss": 1.3649, + "mean_token_accuracy": 0.6583088934421539, + "num_tokens": 75978078.0, + "step": 2320 + }, + { + "epoch": 0.1928020565552699, + "grad_norm": 1.1622228622436523, + "learning_rate": 9.740444625523279e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6688497617840767, + "num_tokens": 76141604.0, + "step": 2325 + }, + { + "epoch": 0.19321668463388342, + "grad_norm": 1.0559049844741821, + "learning_rate": 9.738138354640593e-05, + "loss": 1.2614, + "mean_token_accuracy": 0.6786290287971497, + "num_tokens": 76305444.0, + "step": 2330 + }, + { + "epoch": 0.1936313127124969, + "grad_norm": 1.1456258296966553, + "learning_rate": 9.735822158362657e-05, + "loss": 1.3067, + "mean_token_accuracy": 0.6677236080169677, + "num_tokens": 76469284.0, + "step": 2335 + }, + { + "epoch": 0.19404594079111037, + "grad_norm": 1.1015242338180542, + "learning_rate": 9.733496041541414e-05, + "loss": 1.4018, + "mean_token_accuracy": 0.6553274631500244, + "num_tokens": 76633124.0, + "step": 2340 + }, + { + "epoch": 0.19446056886972385, + "grad_norm": 1.0512968301773071, + "learning_rate": 9.73116000904958e-05, + "loss": 1.2869, + "mean_token_accuracy": 0.6766739979386329, + "num_tokens": 76796964.0, + "step": 2345 + }, + { + "epoch": 0.19487519694833735, + "grad_norm": 1.1637961864471436, + "learning_rate": 9.728814065780651e-05, + "loss": 1.3192, + "mean_token_accuracy": 0.6687282115221024, + "num_tokens": 76959932.0, + "step": 2350 + }, + { + "epoch": 0.19528982502695083, + "grad_norm": 1.1040971279144287, + "learning_rate": 9.72645821664888e-05, + "loss": 1.378, + "mean_token_accuracy": 0.6587915405631065, + "num_tokens": 77123772.0, + "step": 2355 + }, + { + "epoch": 0.1957044531055643, + "grad_norm": 1.0787874460220337, + "learning_rate": 9.724092466589273e-05, + "loss": 1.3312, + "mean_token_accuracy": 0.6668265044689179, + "num_tokens": 77287521.0, + "step": 2360 + }, + { + "epoch": 0.19611908118417778, + "grad_norm": 1.105224609375, + "learning_rate": 9.721716820557573e-05, + "loss": 1.2537, + "mean_token_accuracy": 0.6787023469805717, + "num_tokens": 77451361.0, + "step": 2365 + }, + { + "epoch": 0.1965337092627913, + "grad_norm": 1.1023021936416626, + "learning_rate": 9.719331283530255e-05, + "loss": 1.2594, + "mean_token_accuracy": 0.6774978116154671, + "num_tokens": 77615046.0, + "step": 2370 + }, + { + "epoch": 0.19694833734140477, + "grad_norm": 1.169885516166687, + "learning_rate": 9.716935860504512e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.6575268790125847, + "num_tokens": 77778886.0, + "step": 2375 + }, + { + "epoch": 0.19736296542001824, + "grad_norm": 1.3298619985580444, + "learning_rate": 9.714530556498252e-05, + "loss": 1.3489, + "mean_token_accuracy": 0.661094817519188, + "num_tokens": 77942726.0, + "step": 2380 + }, + { + "epoch": 0.19777759349863172, + "grad_norm": 1.0819296836853027, + "learning_rate": 9.712115376550072e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6615713611245155, + "num_tokens": 78106566.0, + "step": 2385 + }, + { + "epoch": 0.1981922215772452, + "grad_norm": 1.1341317892074585, + "learning_rate": 9.709690325719263e-05, + "loss": 1.3962, + "mean_token_accuracy": 0.6577895864844322, + "num_tokens": 78270406.0, + "step": 2390 + }, + { + "epoch": 0.1986068496558587, + "grad_norm": 1.5062912702560425, + "learning_rate": 9.707255409085793e-05, + "loss": 1.3149, + "mean_token_accuracy": 0.6675708696246148, + "num_tokens": 78434246.0, + "step": 2395 + }, + { + "epoch": 0.19902147773447218, + "grad_norm": 1.6830103397369385, + "learning_rate": 9.704810631750299e-05, + "loss": 1.1792, + "mean_token_accuracy": 0.6939577236771584, + "num_tokens": 78598086.0, + "step": 2400 + }, + { + "epoch": 0.19943610581308566, + "grad_norm": 1.3983255624771118, + "learning_rate": 9.702355998834065e-05, + "loss": 1.3945, + "mean_token_accuracy": 0.6583511427044868, + "num_tokens": 78761846.0, + "step": 2405 + }, + { + "epoch": 0.19985073389169913, + "grad_norm": 1.0804715156555176, + "learning_rate": 9.699891515479031e-05, + "loss": 1.3258, + "mean_token_accuracy": 0.6670332357287407, + "num_tokens": 78925686.0, + "step": 2410 + }, + { + "epoch": 0.20026536197031264, + "grad_norm": 1.0900949239730835, + "learning_rate": 9.697417186847766e-05, + "loss": 1.3864, + "mean_token_accuracy": 0.6587426692247391, + "num_tokens": 79089526.0, + "step": 2415 + }, + { + "epoch": 0.20067999004892612, + "grad_norm": 1.089453101158142, + "learning_rate": 9.694933018123464e-05, + "loss": 1.396, + "mean_token_accuracy": 0.6545210152864456, + "num_tokens": 79253366.0, + "step": 2420 + }, + { + "epoch": 0.2010946181275396, + "grad_norm": 1.11849045753479, + "learning_rate": 9.692439014509931e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6667460948228836, + "num_tokens": 79417206.0, + "step": 2425 + }, + { + "epoch": 0.20150924620615307, + "grad_norm": 1.2045278549194336, + "learning_rate": 9.689935181231575e-05, + "loss": 1.3497, + "mean_token_accuracy": 0.6638746336102486, + "num_tokens": 79581046.0, + "step": 2430 + }, + { + "epoch": 0.20192387428476657, + "grad_norm": 1.1044609546661377, + "learning_rate": 9.6874215235334e-05, + "loss": 1.3491, + "mean_token_accuracy": 0.6666239008307457, + "num_tokens": 79744886.0, + "step": 2435 + }, + { + "epoch": 0.20233850236338005, + "grad_norm": 1.1200894117355347, + "learning_rate": 9.684898046680981e-05, + "loss": 1.3401, + "mean_token_accuracy": 0.6662483721971512, + "num_tokens": 79908147.0, + "step": 2440 + }, + { + "epoch": 0.20275313044199353, + "grad_norm": 1.1056984663009644, + "learning_rate": 9.682364755960468e-05, + "loss": 1.2718, + "mean_token_accuracy": 0.6802785888314247, + "num_tokens": 80071987.0, + "step": 2445 + }, + { + "epoch": 0.203167758520607, + "grad_norm": 1.1164171695709229, + "learning_rate": 9.679821656678572e-05, + "loss": 1.2684, + "mean_token_accuracy": 0.6782441362738609, + "num_tokens": 80235827.0, + "step": 2450 + }, + { + "epoch": 0.2035823865992205, + "grad_norm": 1.1353195905685425, + "learning_rate": 9.677268754162541e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6620967760682106, + "num_tokens": 80399667.0, + "step": 2455 + }, + { + "epoch": 0.203997014677834, + "grad_norm": 1.1183967590332031, + "learning_rate": 9.674706053760169e-05, + "loss": 1.3117, + "mean_token_accuracy": 0.6705278620123863, + "num_tokens": 80563507.0, + "step": 2460 + }, + { + "epoch": 0.20441164275644746, + "grad_norm": 1.030859112739563, + "learning_rate": 9.672133560839768e-05, + "loss": 1.3011, + "mean_token_accuracy": 0.6704667627811431, + "num_tokens": 80727347.0, + "step": 2465 + }, + { + "epoch": 0.20482627083506094, + "grad_norm": 1.0588932037353516, + "learning_rate": 9.669551280790166e-05, + "loss": 1.3074, + "mean_token_accuracy": 0.6711326971650123, + "num_tokens": 80891187.0, + "step": 2470 + }, + { + "epoch": 0.20524089891367445, + "grad_norm": 1.0952879190444946, + "learning_rate": 9.66695921902069e-05, + "loss": 1.3323, + "mean_token_accuracy": 0.6656402736902237, + "num_tokens": 81055027.0, + "step": 2475 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 1.1158757209777832, + "learning_rate": 9.664357380961162e-05, + "loss": 1.3251, + "mean_token_accuracy": 0.6664972737431526, + "num_tokens": 81217910.0, + "step": 2480 + }, + { + "epoch": 0.2060701550709014, + "grad_norm": 0.9855961203575134, + "learning_rate": 9.661745772061881e-05, + "loss": 1.2601, + "mean_token_accuracy": 0.68031525015831, + "num_tokens": 81381750.0, + "step": 2485 + }, + { + "epoch": 0.20648478314951488, + "grad_norm": 1.1344683170318604, + "learning_rate": 9.659124397793613e-05, + "loss": 1.4021, + "mean_token_accuracy": 0.6581928178668022, + "num_tokens": 81545590.0, + "step": 2490 + }, + { + "epoch": 0.20689941122812838, + "grad_norm": 1.0772168636322021, + "learning_rate": 9.656493263647581e-05, + "loss": 1.3634, + "mean_token_accuracy": 0.6626588478684425, + "num_tokens": 81709430.0, + "step": 2495 + }, + { + "epoch": 0.20731403930674186, + "grad_norm": 1.090943455696106, + "learning_rate": 9.653852375135456e-05, + "loss": 1.3196, + "mean_token_accuracy": 0.6680290848016739, + "num_tokens": 81873270.0, + "step": 2500 + }, + { + "epoch": 0.20772866738535534, + "grad_norm": 1.1841354370117188, + "learning_rate": 9.651201737789335e-05, + "loss": 1.4031, + "mean_token_accuracy": 0.6540872484445572, + "num_tokens": 82037110.0, + "step": 2505 + }, + { + "epoch": 0.2081432954639688, + "grad_norm": 1.040313959121704, + "learning_rate": 9.648541357161747e-05, + "loss": 1.2831, + "mean_token_accuracy": 0.6743523925542831, + "num_tokens": 82200950.0, + "step": 2510 + }, + { + "epoch": 0.2085579235425823, + "grad_norm": 1.0221846103668213, + "learning_rate": 9.645871238825619e-05, + "loss": 1.3091, + "mean_token_accuracy": 0.6719657763838768, + "num_tokens": 82364428.0, + "step": 2515 + }, + { + "epoch": 0.2089725516211958, + "grad_norm": 1.0882335901260376, + "learning_rate": 9.643191388374288e-05, + "loss": 1.3205, + "mean_token_accuracy": 0.6690249308943749, + "num_tokens": 82528268.0, + "step": 2520 + }, + { + "epoch": 0.20938717969980927, + "grad_norm": 1.0770103931427002, + "learning_rate": 9.640501811421469e-05, + "loss": 1.3315, + "mean_token_accuracy": 0.6688233181834221, + "num_tokens": 82692108.0, + "step": 2525 + }, + { + "epoch": 0.20980180777842275, + "grad_norm": 1.097158670425415, + "learning_rate": 9.637802513601258e-05, + "loss": 1.3196, + "mean_token_accuracy": 0.6666116788983345, + "num_tokens": 82855948.0, + "step": 2530 + }, + { + "epoch": 0.21021643585703623, + "grad_norm": 1.102793574333191, + "learning_rate": 9.635093500568109e-05, + "loss": 1.3891, + "mean_token_accuracy": 0.6579056680202484, + "num_tokens": 83019788.0, + "step": 2535 + }, + { + "epoch": 0.21063106393564973, + "grad_norm": 1.058576226234436, + "learning_rate": 9.632374777996831e-05, + "loss": 1.3382, + "mean_token_accuracy": 0.6638813644647599, + "num_tokens": 83183076.0, + "step": 2540 + }, + { + "epoch": 0.2110456920142632, + "grad_norm": 1.150504231452942, + "learning_rate": 9.629646351582573e-05, + "loss": 1.3318, + "mean_token_accuracy": 0.6660740464925766, + "num_tokens": 83346916.0, + "step": 2545 + }, + { + "epoch": 0.21146032009287669, + "grad_norm": 1.1130492687225342, + "learning_rate": 9.626908227040808e-05, + "loss": 1.3191, + "mean_token_accuracy": 0.66956866979599, + "num_tokens": 83510756.0, + "step": 2550 + }, + { + "epoch": 0.21187494817149016, + "grad_norm": 1.1053194999694824, + "learning_rate": 9.624160410107326e-05, + "loss": 1.3424, + "mean_token_accuracy": 0.6644733622670174, + "num_tokens": 83674596.0, + "step": 2555 + }, + { + "epoch": 0.21228957625010367, + "grad_norm": 1.0200395584106445, + "learning_rate": 9.621402906538222e-05, + "loss": 1.2404, + "mean_token_accuracy": 0.6816626936197281, + "num_tokens": 83838094.0, + "step": 2560 + }, + { + "epoch": 0.21270420432871714, + "grad_norm": 1.1047829389572144, + "learning_rate": 9.618635722109881e-05, + "loss": 1.4284, + "mean_token_accuracy": 0.6508206129074097, + "num_tokens": 84001749.0, + "step": 2565 + }, + { + "epoch": 0.21311883240733062, + "grad_norm": 0.9829691648483276, + "learning_rate": 9.61585886261897e-05, + "loss": 1.3187, + "mean_token_accuracy": 0.6698619246482849, + "num_tokens": 84165589.0, + "step": 2570 + }, + { + "epoch": 0.2135334604859441, + "grad_norm": 1.0752907991409302, + "learning_rate": 9.613072333882416e-05, + "loss": 1.2852, + "mean_token_accuracy": 0.6733260005712509, + "num_tokens": 84329429.0, + "step": 2575 + }, + { + "epoch": 0.2139480885645576, + "grad_norm": 1.1305135488510132, + "learning_rate": 9.610276141737409e-05, + "loss": 1.3866, + "mean_token_accuracy": 0.6594574764370918, + "num_tokens": 84493269.0, + "step": 2580 + }, + { + "epoch": 0.21436271664317108, + "grad_norm": 1.0816925764083862, + "learning_rate": 9.607470292041379e-05, + "loss": 1.3104, + "mean_token_accuracy": 0.6695931047201157, + "num_tokens": 84657109.0, + "step": 2585 + }, + { + "epoch": 0.21477734472178456, + "grad_norm": 1.08124840259552, + "learning_rate": 9.604654790671985e-05, + "loss": 1.2459, + "mean_token_accuracy": 0.6807917907834053, + "num_tokens": 84820949.0, + "step": 2590 + }, + { + "epoch": 0.21519197280039803, + "grad_norm": 1.0594547986984253, + "learning_rate": 9.601829643527105e-05, + "loss": 1.286, + "mean_token_accuracy": 0.673258799314499, + "num_tokens": 84984789.0, + "step": 2595 + }, + { + "epoch": 0.21560660087901154, + "grad_norm": 0.9991068243980408, + "learning_rate": 9.598994856524826e-05, + "loss": 1.3501, + "mean_token_accuracy": 0.664106796681881, + "num_tokens": 85148629.0, + "step": 2600 + }, + { + "epoch": 0.21602122895762502, + "grad_norm": 1.0182642936706543, + "learning_rate": 9.596150435603422e-05, + "loss": 1.305, + "mean_token_accuracy": 0.6696542009711266, + "num_tokens": 85312469.0, + "step": 2605 + }, + { + "epoch": 0.2164358570362385, + "grad_norm": 1.0205860137939453, + "learning_rate": 9.593296386721353e-05, + "loss": 1.3007, + "mean_token_accuracy": 0.6687988772988319, + "num_tokens": 85476309.0, + "step": 2610 + }, + { + "epoch": 0.21685048511485197, + "grad_norm": 1.0811766386032104, + "learning_rate": 9.59043271585725e-05, + "loss": 1.339, + "mean_token_accuracy": 0.6658479943871498, + "num_tokens": 85640149.0, + "step": 2615 + }, + { + "epoch": 0.21726511319346545, + "grad_norm": 1.0677396059036255, + "learning_rate": 9.587559429009889e-05, + "loss": 1.331, + "mean_token_accuracy": 0.6626160770654679, + "num_tokens": 85803989.0, + "step": 2620 + }, + { + "epoch": 0.21767974127207895, + "grad_norm": 1.0135524272918701, + "learning_rate": 9.584676532198202e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6692815214395523, + "num_tokens": 85967829.0, + "step": 2625 + }, + { + "epoch": 0.21809436935069243, + "grad_norm": 1.0285613536834717, + "learning_rate": 9.581784031461247e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6740224823355675, + "num_tokens": 86131669.0, + "step": 2630 + }, + { + "epoch": 0.2185089974293059, + "grad_norm": 1.0852622985839844, + "learning_rate": 9.578881932858198e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.6628107890486717, + "num_tokens": 86294505.0, + "step": 2635 + }, + { + "epoch": 0.21892362550791938, + "grad_norm": 1.1283034086227417, + "learning_rate": 9.575970242468335e-05, + "loss": 1.3912, + "mean_token_accuracy": 0.6552175015211106, + "num_tokens": 86458345.0, + "step": 2640 + }, + { + "epoch": 0.2193382535865329, + "grad_norm": 1.1609395742416382, + "learning_rate": 9.573048966391034e-05, + "loss": 1.362, + "mean_token_accuracy": 0.6610581636428833, + "num_tokens": 86622185.0, + "step": 2645 + }, + { + "epoch": 0.21975288166514637, + "grad_norm": 1.007416009902954, + "learning_rate": 9.570118110745749e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6743523970246315, + "num_tokens": 86786025.0, + "step": 2650 + }, + { + "epoch": 0.22016750974375984, + "grad_norm": 1.0748475790023804, + "learning_rate": 9.567177681672e-05, + "loss": 1.3899, + "mean_token_accuracy": 0.6591829985380173, + "num_tokens": 86948955.0, + "step": 2655 + }, + { + "epoch": 0.22058213782237332, + "grad_norm": 1.1083078384399414, + "learning_rate": 9.564227685329363e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.6674165606498719, + "num_tokens": 87112703.0, + "step": 2660 + }, + { + "epoch": 0.22099676590098682, + "grad_norm": 1.0616549253463745, + "learning_rate": 9.561268127897457e-05, + "loss": 1.343, + "mean_token_accuracy": 0.6640853062272072, + "num_tokens": 87275623.0, + "step": 2665 + }, + { + "epoch": 0.2214113939796003, + "grad_norm": 1.0404539108276367, + "learning_rate": 9.558299015575922e-05, + "loss": 1.3441, + "mean_token_accuracy": 0.6618218451738358, + "num_tokens": 87439463.0, + "step": 2670 + }, + { + "epoch": 0.22182602205821378, + "grad_norm": 1.0714560747146606, + "learning_rate": 9.555320354584423e-05, + "loss": 1.2658, + "mean_token_accuracy": 0.6778225809335708, + "num_tokens": 87603303.0, + "step": 2675 + }, + { + "epoch": 0.22224065013682726, + "grad_norm": 1.0824095010757446, + "learning_rate": 9.552332151162623e-05, + "loss": 1.2986, + "mean_token_accuracy": 0.671291546523571, + "num_tokens": 87767143.0, + "step": 2680 + }, + { + "epoch": 0.22265527821544076, + "grad_norm": 1.081459403038025, + "learning_rate": 9.549334411570174e-05, + "loss": 1.4601, + "mean_token_accuracy": 0.6446869522333145, + "num_tokens": 87930536.0, + "step": 2685 + }, + { + "epoch": 0.22306990629405424, + "grad_norm": 1.0536558628082275, + "learning_rate": 9.546327142086704e-05, + "loss": 1.3329, + "mean_token_accuracy": 0.6650904163718223, + "num_tokens": 88094376.0, + "step": 2690 + }, + { + "epoch": 0.22348453437266771, + "grad_norm": 1.0550700426101685, + "learning_rate": 9.543310349011805e-05, + "loss": 1.3128, + "mean_token_accuracy": 0.6701490715146065, + "num_tokens": 88258216.0, + "step": 2695 + }, + { + "epoch": 0.2238991624512812, + "grad_norm": 1.101529598236084, + "learning_rate": 9.540284038665022e-05, + "loss": 1.2555, + "mean_token_accuracy": 0.6795943312346935, + "num_tokens": 88422056.0, + "step": 2700 + }, + { + "epoch": 0.2243137905298947, + "grad_norm": 1.0878149271011353, + "learning_rate": 9.537248217385828e-05, + "loss": 1.3549, + "mean_token_accuracy": 0.668823316693306, + "num_tokens": 88585896.0, + "step": 2705 + }, + { + "epoch": 0.22472841860850817, + "grad_norm": 1.0655194520950317, + "learning_rate": 9.53420289153363e-05, + "loss": 1.343, + "mean_token_accuracy": 0.6660740479826928, + "num_tokens": 88749736.0, + "step": 2710 + }, + { + "epoch": 0.22514304668712165, + "grad_norm": 1.0821983814239502, + "learning_rate": 9.531148067487738e-05, + "loss": 1.3206, + "mean_token_accuracy": 0.6682368010282517, + "num_tokens": 88913576.0, + "step": 2715 + }, + { + "epoch": 0.22555767476573513, + "grad_norm": 1.0168461799621582, + "learning_rate": 9.528083751647358e-05, + "loss": 1.2642, + "mean_token_accuracy": 0.6809506341814995, + "num_tokens": 89077416.0, + "step": 2720 + }, + { + "epoch": 0.22597230284434863, + "grad_norm": 1.0486526489257812, + "learning_rate": 9.525009950431588e-05, + "loss": 1.2823, + "mean_token_accuracy": 0.6794293776154519, + "num_tokens": 89241256.0, + "step": 2725 + }, + { + "epoch": 0.2263869309229621, + "grad_norm": 1.0182121992111206, + "learning_rate": 9.521926670279384e-05, + "loss": 1.3659, + "mean_token_accuracy": 0.6572886124253273, + "num_tokens": 89405096.0, + "step": 2730 + }, + { + "epoch": 0.2268015590015756, + "grad_norm": 1.030746579170227, + "learning_rate": 9.518833917649568e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6725348204374313, + "num_tokens": 89568712.0, + "step": 2735 + }, + { + "epoch": 0.22721618708018906, + "grad_norm": 1.0402779579162598, + "learning_rate": 9.5157316990208e-05, + "loss": 1.3271, + "mean_token_accuracy": 0.6652859270572662, + "num_tokens": 89732552.0, + "step": 2740 + }, + { + "epoch": 0.22763081515880254, + "grad_norm": 1.0385247468948364, + "learning_rate": 9.512620020891569e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.6640579134225846, + "num_tokens": 89896392.0, + "step": 2745 + }, + { + "epoch": 0.22804544323741605, + "grad_norm": 1.0232688188552856, + "learning_rate": 9.509498889780182e-05, + "loss": 1.3352, + "mean_token_accuracy": 0.6671554207801819, + "num_tokens": 90060232.0, + "step": 2750 + }, + { + "epoch": 0.22846007131602952, + "grad_norm": 1.022635817527771, + "learning_rate": 9.506368312224746e-05, + "loss": 1.3824, + "mean_token_accuracy": 0.6553824588656425, + "num_tokens": 90224072.0, + "step": 2755 + }, + { + "epoch": 0.228874699394643, + "grad_norm": 1.0279450416564941, + "learning_rate": 9.503228294783158e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6734054297208786, + "num_tokens": 90387912.0, + "step": 2760 + }, + { + "epoch": 0.22928932747325648, + "grad_norm": 0.9623901844024658, + "learning_rate": 9.500078844033089e-05, + "loss": 1.2324, + "mean_token_accuracy": 0.6841581121087075, + "num_tokens": 90551752.0, + "step": 2765 + }, + { + "epoch": 0.22970395555186998, + "grad_norm": 1.0440038442611694, + "learning_rate": 9.496919966571971e-05, + "loss": 1.2384, + "mean_token_accuracy": 0.680700147151947, + "num_tokens": 90715592.0, + "step": 2770 + }, + { + "epoch": 0.23011858363048346, + "grad_norm": 1.0675630569458008, + "learning_rate": 9.493751669016982e-05, + "loss": 1.3354, + "mean_token_accuracy": 0.6638172417879105, + "num_tokens": 90879155.0, + "step": 2775 + }, + { + "epoch": 0.23053321170909694, + "grad_norm": 1.002764344215393, + "learning_rate": 9.490573958005032e-05, + "loss": 1.2405, + "mean_token_accuracy": 0.6804007798433304, + "num_tokens": 91042995.0, + "step": 2780 + }, + { + "epoch": 0.2309478397877104, + "grad_norm": 1.21285080909729, + "learning_rate": 9.487386840192754e-05, + "loss": 1.312, + "mean_token_accuracy": 0.6705889537930488, + "num_tokens": 91206835.0, + "step": 2785 + }, + { + "epoch": 0.23136246786632392, + "grad_norm": 1.0286260843276978, + "learning_rate": 9.484190322256484e-05, + "loss": 1.3475, + "mean_token_accuracy": 0.6628112107515335, + "num_tokens": 91369694.0, + "step": 2790 + }, + { + "epoch": 0.2317770959449374, + "grad_norm": 1.0780329704284668, + "learning_rate": 9.480984410892247e-05, + "loss": 1.2979, + "mean_token_accuracy": 0.6744745880365371, + "num_tokens": 91533534.0, + "step": 2795 + }, + { + "epoch": 0.23219172402355087, + "grad_norm": 1.0212982892990112, + "learning_rate": 9.47776911281575e-05, + "loss": 1.3197, + "mean_token_accuracy": 0.672837245464325, + "num_tokens": 91697374.0, + "step": 2800 + }, + { + "epoch": 0.23260635210216435, + "grad_norm": 1.0043659210205078, + "learning_rate": 9.47454443476236e-05, + "loss": 1.4051, + "mean_token_accuracy": 0.657325267791748, + "num_tokens": 91861214.0, + "step": 2805 + }, + { + "epoch": 0.23302098018077785, + "grad_norm": 1.0502917766571045, + "learning_rate": 9.471310383487096e-05, + "loss": 1.3021, + "mean_token_accuracy": 0.668786658346653, + "num_tokens": 92025054.0, + "step": 2810 + }, + { + "epoch": 0.23343560825939133, + "grad_norm": 1.4406880140304565, + "learning_rate": 9.468066965764603e-05, + "loss": 1.2822, + "mean_token_accuracy": 0.6733932077884675, + "num_tokens": 92188894.0, + "step": 2815 + }, + { + "epoch": 0.2338502363380048, + "grad_norm": 1.0190492868423462, + "learning_rate": 9.464814188389162e-05, + "loss": 1.3089, + "mean_token_accuracy": 0.6734237536787987, + "num_tokens": 92352734.0, + "step": 2820 + }, + { + "epoch": 0.23426486441661828, + "grad_norm": 0.9841130971908569, + "learning_rate": 9.461552058174647e-05, + "loss": 1.3578, + "mean_token_accuracy": 0.6651637375354766, + "num_tokens": 92516574.0, + "step": 2825 + }, + { + "epoch": 0.2346794924952318, + "grad_norm": 1.0100873708724976, + "learning_rate": 9.458280581954528e-05, + "loss": 1.2901, + "mean_token_accuracy": 0.6766862124204636, + "num_tokens": 92680414.0, + "step": 2830 + }, + { + "epoch": 0.23509412057384527, + "grad_norm": 1.1190541982650757, + "learning_rate": 9.454999766581858e-05, + "loss": 1.3889, + "mean_token_accuracy": 0.6534491300582885, + "num_tokens": 92843564.0, + "step": 2835 + }, + { + "epoch": 0.23550874865245874, + "grad_norm": 1.0348269939422607, + "learning_rate": 9.451709618929247e-05, + "loss": 1.288, + "mean_token_accuracy": 0.6731304943561554, + "num_tokens": 93007404.0, + "step": 2840 + }, + { + "epoch": 0.23592337673107222, + "grad_norm": 1.0778911113739014, + "learning_rate": 9.448410145888857e-05, + "loss": 1.3457, + "mean_token_accuracy": 0.6646410763263703, + "num_tokens": 93171029.0, + "step": 2845 + }, + { + "epoch": 0.23633800480968573, + "grad_norm": 1.0126421451568604, + "learning_rate": 9.445101354372385e-05, + "loss": 1.3107, + "mean_token_accuracy": 0.6731671556830406, + "num_tokens": 93334869.0, + "step": 2850 + }, + { + "epoch": 0.2367526328882992, + "grad_norm": 1.005475640296936, + "learning_rate": 9.441783251311049e-05, + "loss": 1.3616, + "mean_token_accuracy": 0.6659518577158451, + "num_tokens": 93498709.0, + "step": 2855 + }, + { + "epoch": 0.23716726096691268, + "grad_norm": 0.9984427690505981, + "learning_rate": 9.438455843655569e-05, + "loss": 1.3184, + "mean_token_accuracy": 0.6687435671687126, + "num_tokens": 93662185.0, + "step": 2860 + }, + { + "epoch": 0.23758188904552616, + "grad_norm": 1.0320781469345093, + "learning_rate": 9.435119138376159e-05, + "loss": 1.2562, + "mean_token_accuracy": 0.6770711153745651, + "num_tokens": 93826025.0, + "step": 2865 + }, + { + "epoch": 0.23799651712413963, + "grad_norm": 1.0141587257385254, + "learning_rate": 9.43177314246251e-05, + "loss": 1.2928, + "mean_token_accuracy": 0.6795943334698678, + "num_tokens": 93989865.0, + "step": 2870 + }, + { + "epoch": 0.23841114520275314, + "grad_norm": 1.0589091777801514, + "learning_rate": 9.428417862923772e-05, + "loss": 1.3665, + "mean_token_accuracy": 0.6603066936135292, + "num_tokens": 94153705.0, + "step": 2875 + }, + { + "epoch": 0.23882577328136662, + "grad_norm": 1.1072170734405518, + "learning_rate": 9.425053306788549e-05, + "loss": 1.2944, + "mean_token_accuracy": 0.6692937418818474, + "num_tokens": 94317545.0, + "step": 2880 + }, + { + "epoch": 0.2392404013599801, + "grad_norm": 1.028734803199768, + "learning_rate": 9.421679481104868e-05, + "loss": 1.3394, + "mean_token_accuracy": 0.6677543595433235, + "num_tokens": 94480139.0, + "step": 2885 + }, + { + "epoch": 0.23965502943859357, + "grad_norm": 1.0725563764572144, + "learning_rate": 9.41829639294018e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.6636730194091797, + "num_tokens": 94643979.0, + "step": 2890 + }, + { + "epoch": 0.24006965751720707, + "grad_norm": 1.009470820426941, + "learning_rate": 9.414904049381336e-05, + "loss": 1.2996, + "mean_token_accuracy": 0.6729166626930236, + "num_tokens": 94807819.0, + "step": 2895 + }, + { + "epoch": 0.24048428559582055, + "grad_norm": 1.0591769218444824, + "learning_rate": 9.41150245753458e-05, + "loss": 1.3459, + "mean_token_accuracy": 0.6637157902121544, + "num_tokens": 94971659.0, + "step": 2900 + }, + { + "epoch": 0.24089891367443403, + "grad_norm": 1.0332934856414795, + "learning_rate": 9.408091624525522e-05, + "loss": 1.2765, + "mean_token_accuracy": 0.677138315141201, + "num_tokens": 95135499.0, + "step": 2905 + }, + { + "epoch": 0.2413135417530475, + "grad_norm": 1.0035442113876343, + "learning_rate": 9.404671557499137e-05, + "loss": 1.2972, + "mean_token_accuracy": 0.6727082923054695, + "num_tokens": 95299132.0, + "step": 2910 + }, + { + "epoch": 0.241728169831661, + "grad_norm": 1.0331671237945557, + "learning_rate": 9.401242263619738e-05, + "loss": 1.4168, + "mean_token_accuracy": 0.6534823991358281, + "num_tokens": 95462972.0, + "step": 2915 + }, + { + "epoch": 0.2421427979102745, + "grad_norm": 1.0751533508300781, + "learning_rate": 9.39780375007097e-05, + "loss": 1.3258, + "mean_token_accuracy": 0.6660618290305138, + "num_tokens": 95626812.0, + "step": 2920 + }, + { + "epoch": 0.24255742598888796, + "grad_norm": 1.0071254968643188, + "learning_rate": 9.394356024055788e-05, + "loss": 1.3109, + "mean_token_accuracy": 0.6700085535645485, + "num_tokens": 95790652.0, + "step": 2925 + }, + { + "epoch": 0.24297205406750144, + "grad_norm": 1.0864005088806152, + "learning_rate": 9.39089909279645e-05, + "loss": 1.2887, + "mean_token_accuracy": 0.6743523955345154, + "num_tokens": 95954492.0, + "step": 2930 + }, + { + "epoch": 0.24338668214611495, + "grad_norm": 1.0048301219940186, + "learning_rate": 9.387432963534492e-05, + "loss": 1.2817, + "mean_token_accuracy": 0.6728433534502983, + "num_tokens": 96118332.0, + "step": 2935 + }, + { + "epoch": 0.24380131022472842, + "grad_norm": 1.0312083959579468, + "learning_rate": 9.383957643530718e-05, + "loss": 1.3182, + "mean_token_accuracy": 0.6667404159903526, + "num_tokens": 96282117.0, + "step": 2940 + }, + { + "epoch": 0.2442159383033419, + "grad_norm": 1.0427964925765991, + "learning_rate": 9.380473140065191e-05, + "loss": 1.3905, + "mean_token_accuracy": 0.6564271792769432, + "num_tokens": 96445957.0, + "step": 2945 + }, + { + "epoch": 0.24463056638195538, + "grad_norm": 1.0681729316711426, + "learning_rate": 9.376979460437205e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6643389567732811, + "num_tokens": 96609797.0, + "step": 2950 + }, + { + "epoch": 0.24504519446056888, + "grad_norm": 1.115302324295044, + "learning_rate": 9.373476611965278e-05, + "loss": 1.3389, + "mean_token_accuracy": 0.6667399793863297, + "num_tokens": 96773637.0, + "step": 2955 + }, + { + "epoch": 0.24545982253918236, + "grad_norm": 1.0743083953857422, + "learning_rate": 9.369964601987132e-05, + "loss": 1.2964, + "mean_token_accuracy": 0.6747861742973328, + "num_tokens": 96937477.0, + "step": 2960 + }, + { + "epoch": 0.24587445061779584, + "grad_norm": 1.0740010738372803, + "learning_rate": 9.366443437859688e-05, + "loss": 1.3728, + "mean_token_accuracy": 0.660416665673256, + "num_tokens": 97101317.0, + "step": 2965 + }, + { + "epoch": 0.2462890786964093, + "grad_norm": 1.0379053354263306, + "learning_rate": 9.362913126959037e-05, + "loss": 1.3167, + "mean_token_accuracy": 0.6735581636428833, + "num_tokens": 97265157.0, + "step": 2970 + }, + { + "epoch": 0.2467037067750228, + "grad_norm": 0.9872788190841675, + "learning_rate": 9.359373676680429e-05, + "loss": 1.2447, + "mean_token_accuracy": 0.683156156539917, + "num_tokens": 97428997.0, + "step": 2975 + }, + { + "epoch": 0.2471183348536363, + "grad_norm": 0.994804859161377, + "learning_rate": 9.355825094438264e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.6704239964485168, + "num_tokens": 97592837.0, + "step": 2980 + }, + { + "epoch": 0.24753296293224977, + "grad_norm": 0.9986344575881958, + "learning_rate": 9.352267387666071e-05, + "loss": 1.3173, + "mean_token_accuracy": 0.6646505400538445, + "num_tokens": 97756677.0, + "step": 2985 + }, + { + "epoch": 0.24794759101086325, + "grad_norm": 1.0404119491577148, + "learning_rate": 9.348700563816488e-05, + "loss": 1.3425, + "mean_token_accuracy": 0.6644794717431068, + "num_tokens": 97920517.0, + "step": 2990 + }, + { + "epoch": 0.24836221908947673, + "grad_norm": 1.0144007205963135, + "learning_rate": 9.345124630361257e-05, + "loss": 1.3814, + "mean_token_accuracy": 0.6638990670442582, + "num_tokens": 98084357.0, + "step": 2995 + }, + { + "epoch": 0.24877684716809023, + "grad_norm": 1.1392951011657715, + "learning_rate": 9.3415395947912e-05, + "loss": 1.3271, + "mean_token_accuracy": 0.6704850926995277, + "num_tokens": 98248197.0, + "step": 3000 + }, + { + "epoch": 0.2491914752467037, + "grad_norm": 1.0668189525604248, + "learning_rate": 9.337945464616207e-05, + "loss": 1.3355, + "mean_token_accuracy": 0.6689821600914001, + "num_tokens": 98412037.0, + "step": 3005 + }, + { + "epoch": 0.24960610332531719, + "grad_norm": 1.0075867176055908, + "learning_rate": 9.334342247365216e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6669354841113091, + "num_tokens": 98575877.0, + "step": 3010 + }, + { + "epoch": 0.25002073140393066, + "grad_norm": 0.9960988759994507, + "learning_rate": 9.330729950586207e-05, + "loss": 1.3354, + "mean_token_accuracy": 0.6663807734847069, + "num_tokens": 98739529.0, + "step": 3015 + }, + { + "epoch": 0.25043535948254414, + "grad_norm": 1.0258535146713257, + "learning_rate": 9.327108581846172e-05, + "loss": 1.2835, + "mean_token_accuracy": 0.6787328958511353, + "num_tokens": 98903369.0, + "step": 3020 + }, + { + "epoch": 0.2508499875611576, + "grad_norm": 1.0559964179992676, + "learning_rate": 9.323478148731112e-05, + "loss": 1.3956, + "mean_token_accuracy": 0.6548631489276886, + "num_tokens": 99067209.0, + "step": 3025 + }, + { + "epoch": 0.25126461563977115, + "grad_norm": 1.0208572149276733, + "learning_rate": 9.319838658846019e-05, + "loss": 1.3445, + "mean_token_accuracy": 0.6640823528170585, + "num_tokens": 99231049.0, + "step": 3030 + }, + { + "epoch": 0.2516792437183846, + "grad_norm": 1.0227625370025635, + "learning_rate": 9.316190119814847e-05, + "loss": 1.2566, + "mean_token_accuracy": 0.6830950632691384, + "num_tokens": 99394889.0, + "step": 3035 + }, + { + "epoch": 0.2520938717969981, + "grad_norm": 0.9901267290115356, + "learning_rate": 9.312532539280512e-05, + "loss": 1.3832, + "mean_token_accuracy": 0.656995353102684, + "num_tokens": 99558729.0, + "step": 3040 + }, + { + "epoch": 0.2525084998756116, + "grad_norm": 1.0293430089950562, + "learning_rate": 9.308865924904873e-05, + "loss": 1.3067, + "mean_token_accuracy": 0.6714381769299507, + "num_tokens": 99722569.0, + "step": 3045 + }, + { + "epoch": 0.25292312795422506, + "grad_norm": 1.020733118057251, + "learning_rate": 9.305190284368706e-05, + "loss": 1.3645, + "mean_token_accuracy": 0.66188904941082, + "num_tokens": 99886409.0, + "step": 3050 + }, + { + "epoch": 0.25333775603283853, + "grad_norm": 1.0120285749435425, + "learning_rate": 9.301505625371702e-05, + "loss": 1.3091, + "mean_token_accuracy": 0.6673692539334297, + "num_tokens": 100050249.0, + "step": 3055 + }, + { + "epoch": 0.253752384111452, + "grad_norm": 1.0258290767669678, + "learning_rate": 9.29781195563244e-05, + "loss": 1.3079, + "mean_token_accuracy": 0.6742057658731937, + "num_tokens": 100214089.0, + "step": 3060 + }, + { + "epoch": 0.2541670121900655, + "grad_norm": 0.9953669905662537, + "learning_rate": 9.294109282888373e-05, + "loss": 1.2476, + "mean_token_accuracy": 0.6814332813024521, + "num_tokens": 100377929.0, + "step": 3065 + }, + { + "epoch": 0.254581640268679, + "grad_norm": 1.0049127340316772, + "learning_rate": 9.290397614895815e-05, + "loss": 1.3179, + "mean_token_accuracy": 0.6699474558234215, + "num_tokens": 100541769.0, + "step": 3070 + }, + { + "epoch": 0.2549962683472925, + "grad_norm": 1.0290277004241943, + "learning_rate": 9.286676959429926e-05, + "loss": 1.275, + "mean_token_accuracy": 0.6747800588607789, + "num_tokens": 100705609.0, + "step": 3075 + }, + { + "epoch": 0.255410896425906, + "grad_norm": 0.9995718002319336, + "learning_rate": 9.282947324284689e-05, + "loss": 1.2793, + "mean_token_accuracy": 0.6762157842516899, + "num_tokens": 100869449.0, + "step": 3080 + }, + { + "epoch": 0.25582552450451945, + "grad_norm": 0.9663996696472168, + "learning_rate": 9.279208717272898e-05, + "loss": 1.2812, + "mean_token_accuracy": 0.6771077737212181, + "num_tokens": 101033289.0, + "step": 3085 + }, + { + "epoch": 0.25624015258313293, + "grad_norm": 1.0330816507339478, + "learning_rate": 9.275461146226143e-05, + "loss": 1.3458, + "mean_token_accuracy": 0.6675525397062302, + "num_tokens": 101197129.0, + "step": 3090 + }, + { + "epoch": 0.2566547806617464, + "grad_norm": 0.9948139786720276, + "learning_rate": 9.271704618994792e-05, + "loss": 1.2211, + "mean_token_accuracy": 0.6863330885767936, + "num_tokens": 101360969.0, + "step": 3095 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 1.0910817384719849, + "learning_rate": 9.26793914344797e-05, + "loss": 1.347, + "mean_token_accuracy": 0.665854100883007, + "num_tokens": 101524809.0, + "step": 3100 + }, + { + "epoch": 0.25748403681897336, + "grad_norm": 1.057950735092163, + "learning_rate": 9.264164727473553e-05, + "loss": 1.2894, + "mean_token_accuracy": 0.67501832395792, + "num_tokens": 101688649.0, + "step": 3105 + }, + { + "epoch": 0.25789866489758684, + "grad_norm": 0.997083306312561, + "learning_rate": 9.26038137897814e-05, + "loss": 1.3129, + "mean_token_accuracy": 0.6704789832234382, + "num_tokens": 101852489.0, + "step": 3110 + }, + { + "epoch": 0.25831329297620037, + "grad_norm": 1.046116590499878, + "learning_rate": 9.256589105887045e-05, + "loss": 1.3677, + "mean_token_accuracy": 0.659909576177597, + "num_tokens": 102016329.0, + "step": 3115 + }, + { + "epoch": 0.25872792105481385, + "grad_norm": 1.0335099697113037, + "learning_rate": 9.252787916144276e-05, + "loss": 1.3387, + "mean_token_accuracy": 0.670869991183281, + "num_tokens": 102180169.0, + "step": 3120 + }, + { + "epoch": 0.2591425491334273, + "grad_norm": 1.0001356601715088, + "learning_rate": 9.248977817712521e-05, + "loss": 1.2951, + "mean_token_accuracy": 0.6754032284021377, + "num_tokens": 102344009.0, + "step": 3125 + }, + { + "epoch": 0.2595571772120408, + "grad_norm": 1.0196658372879028, + "learning_rate": 9.245158818573124e-05, + "loss": 1.3248, + "mean_token_accuracy": 0.6689027354121209, + "num_tokens": 102507849.0, + "step": 3130 + }, + { + "epoch": 0.2599718052906543, + "grad_norm": 1.027083158493042, + "learning_rate": 9.241330926726082e-05, + "loss": 1.3356, + "mean_token_accuracy": 0.6643267348408699, + "num_tokens": 102671689.0, + "step": 3135 + }, + { + "epoch": 0.26038643336926776, + "grad_norm": 1.0170602798461914, + "learning_rate": 9.237494150190017e-05, + "loss": 1.2662, + "mean_token_accuracy": 0.6814943760633468, + "num_tokens": 102835529.0, + "step": 3140 + }, + { + "epoch": 0.26080106144788123, + "grad_norm": 0.9635075330734253, + "learning_rate": 9.233648497002161e-05, + "loss": 1.2548, + "mean_token_accuracy": 0.68298509567976, + "num_tokens": 102999369.0, + "step": 3145 + }, + { + "epoch": 0.2612156895264947, + "grad_norm": 1.0100306272506714, + "learning_rate": 9.229793975218342e-05, + "loss": 1.4041, + "mean_token_accuracy": 0.6569037184119224, + "num_tokens": 103163209.0, + "step": 3150 + }, + { + "epoch": 0.26163031760510824, + "grad_norm": 1.0459941625595093, + "learning_rate": 9.225930592912966e-05, + "loss": 1.3257, + "mean_token_accuracy": 0.6676625102758408, + "num_tokens": 103327049.0, + "step": 3155 + }, + { + "epoch": 0.2620449456837217, + "grad_norm": 1.121048927307129, + "learning_rate": 9.222058358179002e-05, + "loss": 1.3567, + "mean_token_accuracy": 0.6637218981981278, + "num_tokens": 103490889.0, + "step": 3160 + }, + { + "epoch": 0.2624595737623352, + "grad_norm": 1.0036065578460693, + "learning_rate": 9.218177279127958e-05, + "loss": 1.2773, + "mean_token_accuracy": 0.674205768108368, + "num_tokens": 103654729.0, + "step": 3165 + }, + { + "epoch": 0.2628742018409487, + "grad_norm": 1.020117163658142, + "learning_rate": 9.214287363889872e-05, + "loss": 1.3656, + "mean_token_accuracy": 0.6625549823045731, + "num_tokens": 103818569.0, + "step": 3170 + }, + { + "epoch": 0.26328882991956215, + "grad_norm": 0.9855335354804993, + "learning_rate": 9.210388620613293e-05, + "loss": 1.2552, + "mean_token_accuracy": 0.6789345040917396, + "num_tokens": 103982409.0, + "step": 3175 + }, + { + "epoch": 0.2637034579981756, + "grad_norm": 0.9548041224479675, + "learning_rate": 9.20648105746526e-05, + "loss": 1.328, + "mean_token_accuracy": 0.6681791037321091, + "num_tokens": 104145721.0, + "step": 3180 + }, + { + "epoch": 0.2641180860767891, + "grad_norm": 1.0116145610809326, + "learning_rate": 9.202564682631289e-05, + "loss": 1.2607, + "mean_token_accuracy": 0.6777675986289978, + "num_tokens": 104309561.0, + "step": 3185 + }, + { + "epoch": 0.2645327141554026, + "grad_norm": 1.0734248161315918, + "learning_rate": 9.198639504315358e-05, + "loss": 1.2948, + "mean_token_accuracy": 0.6718597277998924, + "num_tokens": 104473401.0, + "step": 3190 + }, + { + "epoch": 0.2649473422340161, + "grad_norm": 1.166406512260437, + "learning_rate": 9.194705530739882e-05, + "loss": 1.3162, + "mean_token_accuracy": 0.6695503443479538, + "num_tokens": 104637241.0, + "step": 3195 + }, + { + "epoch": 0.2653619703126296, + "grad_norm": 1.0191946029663086, + "learning_rate": 9.1907627701457e-05, + "loss": 1.3405, + "mean_token_accuracy": 0.6675891995429992, + "num_tokens": 104801081.0, + "step": 3200 + }, + { + "epoch": 0.26577659839124307, + "grad_norm": 1.0574842691421509, + "learning_rate": 9.186811230792061e-05, + "loss": 1.3374, + "mean_token_accuracy": 0.6671737521886826, + "num_tokens": 104964921.0, + "step": 3205 + }, + { + "epoch": 0.26619122646985655, + "grad_norm": 1.0240511894226074, + "learning_rate": 9.182850920956601e-05, + "loss": 1.2516, + "mean_token_accuracy": 0.6806186750531197, + "num_tokens": 105128442.0, + "step": 3210 + }, + { + "epoch": 0.26660585454847, + "grad_norm": 0.9983618855476379, + "learning_rate": 9.178881848935329e-05, + "loss": 1.2835, + "mean_token_accuracy": 0.6728005856275558, + "num_tokens": 105292282.0, + "step": 3215 + }, + { + "epoch": 0.2670204826270835, + "grad_norm": 1.0017285346984863, + "learning_rate": 9.17490402304261e-05, + "loss": 1.28, + "mean_token_accuracy": 0.6750406816601753, + "num_tokens": 105455380.0, + "step": 3220 + }, + { + "epoch": 0.267435110705697, + "grad_norm": 1.013919711112976, + "learning_rate": 9.170917451611147e-05, + "loss": 1.3516, + "mean_token_accuracy": 0.6645039066672325, + "num_tokens": 105619220.0, + "step": 3225 + }, + { + "epoch": 0.26784973878431045, + "grad_norm": 0.9826188087463379, + "learning_rate": 9.166922142991963e-05, + "loss": 1.3478, + "mean_token_accuracy": 0.6666116788983345, + "num_tokens": 105783060.0, + "step": 3230 + }, + { + "epoch": 0.26826436686292393, + "grad_norm": 1.062046766281128, + "learning_rate": 9.162918105554378e-05, + "loss": 1.3156, + "mean_token_accuracy": 0.6701210156083107, + "num_tokens": 105945803.0, + "step": 3235 + }, + { + "epoch": 0.26867899494153746, + "grad_norm": 1.0021882057189941, + "learning_rate": 9.158905347686005e-05, + "loss": 1.363, + "mean_token_accuracy": 0.6650781989097595, + "num_tokens": 106109643.0, + "step": 3240 + }, + { + "epoch": 0.26909362302015094, + "grad_norm": 0.9552791714668274, + "learning_rate": 9.15488387779272e-05, + "loss": 1.3426, + "mean_token_accuracy": 0.66844452470541, + "num_tokens": 106273483.0, + "step": 3245 + }, + { + "epoch": 0.2695082510987644, + "grad_norm": 1.0006051063537598, + "learning_rate": 9.150853704298648e-05, + "loss": 1.3249, + "mean_token_accuracy": 0.6659518554806709, + "num_tokens": 106437323.0, + "step": 3250 + }, + { + "epoch": 0.2699228791773779, + "grad_norm": 1.0177334547042847, + "learning_rate": 9.146814835646151e-05, + "loss": 1.4038, + "mean_token_accuracy": 0.6590929798781872, + "num_tokens": 106600363.0, + "step": 3255 + }, + { + "epoch": 0.27033750725599137, + "grad_norm": 0.9710380434989929, + "learning_rate": 9.1427672802958e-05, + "loss": 1.2932, + "mean_token_accuracy": 0.6759469673037529, + "num_tokens": 106764203.0, + "step": 3260 + }, + { + "epoch": 0.27075213533460485, + "grad_norm": 1.0061321258544922, + "learning_rate": 9.138711046726367e-05, + "loss": 1.2612, + "mean_token_accuracy": 0.6762341171503067, + "num_tokens": 106928043.0, + "step": 3265 + }, + { + "epoch": 0.2711667634132183, + "grad_norm": 1.4211668968200684, + "learning_rate": 9.134646143434802e-05, + "loss": 1.2949, + "mean_token_accuracy": 0.6718108490109443, + "num_tokens": 107091883.0, + "step": 3270 + }, + { + "epoch": 0.2715813914918318, + "grad_norm": 0.9751541614532471, + "learning_rate": 9.130572578936213e-05, + "loss": 1.2502, + "mean_token_accuracy": 0.6819098234176636, + "num_tokens": 107255723.0, + "step": 3275 + }, + { + "epoch": 0.27199601957044534, + "grad_norm": 0.992137610912323, + "learning_rate": 9.126490361763856e-05, + "loss": 1.3316, + "mean_token_accuracy": 0.6688416451215744, + "num_tokens": 107419563.0, + "step": 3280 + }, + { + "epoch": 0.2724106476490588, + "grad_norm": 0.9852685928344727, + "learning_rate": 9.122399500469107e-05, + "loss": 1.2833, + "mean_token_accuracy": 0.6725562021136284, + "num_tokens": 107583403.0, + "step": 3285 + }, + { + "epoch": 0.2728252757276723, + "grad_norm": 0.9987186789512634, + "learning_rate": 9.118300003621459e-05, + "loss": 1.338, + "mean_token_accuracy": 0.6650476589798927, + "num_tokens": 107747243.0, + "step": 3290 + }, + { + "epoch": 0.27323990380628577, + "grad_norm": 1.0254756212234497, + "learning_rate": 9.114191879808484e-05, + "loss": 1.2657, + "mean_token_accuracy": 0.6778714567422867, + "num_tokens": 107911083.0, + "step": 3295 + }, + { + "epoch": 0.27365453188489924, + "grad_norm": 0.9749782085418701, + "learning_rate": 9.110075137635831e-05, + "loss": 1.3152, + "mean_token_accuracy": 0.6695625618100166, + "num_tokens": 108074923.0, + "step": 3300 + }, + { + "epoch": 0.2740691599635127, + "grad_norm": 0.9420017600059509, + "learning_rate": 9.105949785727203e-05, + "loss": 1.2318, + "mean_token_accuracy": 0.6857771277427673, + "num_tokens": 108238763.0, + "step": 3305 + }, + { + "epoch": 0.2744837880421262, + "grad_norm": 2.3964684009552, + "learning_rate": 9.101815832724338e-05, + "loss": 1.366, + "mean_token_accuracy": 0.6613514140248299, + "num_tokens": 108402603.0, + "step": 3310 + }, + { + "epoch": 0.2748984161207397, + "grad_norm": 0.9832311272621155, + "learning_rate": 9.097673287286991e-05, + "loss": 1.2669, + "mean_token_accuracy": 0.6781463831663131, + "num_tokens": 108566443.0, + "step": 3315 + }, + { + "epoch": 0.2753130441993532, + "grad_norm": 1.0415925979614258, + "learning_rate": 9.093522158092914e-05, + "loss": 1.3111, + "mean_token_accuracy": 0.669195581972599, + "num_tokens": 108729715.0, + "step": 3320 + }, + { + "epoch": 0.2757276722779667, + "grad_norm": 1.0237131118774414, + "learning_rate": 9.089362453837845e-05, + "loss": 1.3479, + "mean_token_accuracy": 0.6657319158315659, + "num_tokens": 108893555.0, + "step": 3325 + }, + { + "epoch": 0.27614230035658016, + "grad_norm": 1.0040159225463867, + "learning_rate": 9.085194183235481e-05, + "loss": 1.2577, + "mean_token_accuracy": 0.6773888096213341, + "num_tokens": 109057395.0, + "step": 3330 + }, + { + "epoch": 0.27655692843519364, + "grad_norm": 1.0257548093795776, + "learning_rate": 9.081017355017467e-05, + "loss": 1.3377, + "mean_token_accuracy": 0.6641862168908119, + "num_tokens": 109221235.0, + "step": 3335 + }, + { + "epoch": 0.2769715565138071, + "grad_norm": 1.0096317529678345, + "learning_rate": 9.07683197793337e-05, + "loss": 1.3428, + "mean_token_accuracy": 0.6685923337936401, + "num_tokens": 109384292.0, + "step": 3340 + }, + { + "epoch": 0.2773861845924206, + "grad_norm": 0.9559937715530396, + "learning_rate": 9.07263806075067e-05, + "loss": 1.3623, + "mean_token_accuracy": 0.665658600628376, + "num_tokens": 109548132.0, + "step": 3345 + }, + { + "epoch": 0.27780081267103407, + "grad_norm": 0.9451937079429626, + "learning_rate": 9.068435612254733e-05, + "loss": 1.3313, + "mean_token_accuracy": 0.6634958505630493, + "num_tokens": 109711972.0, + "step": 3350 + }, + { + "epoch": 0.27821544074964755, + "grad_norm": 0.9886478781700134, + "learning_rate": 9.064224641248798e-05, + "loss": 1.2324, + "mean_token_accuracy": 0.6831625834107399, + "num_tokens": 109875744.0, + "step": 3355 + }, + { + "epoch": 0.278630068828261, + "grad_norm": 1.0173379182815552, + "learning_rate": 9.060005156553955e-05, + "loss": 1.3514, + "mean_token_accuracy": 0.6684750705957413, + "num_tokens": 110039584.0, + "step": 3360 + }, + { + "epoch": 0.27904469690687456, + "grad_norm": 0.9815647602081299, + "learning_rate": 9.055777167009133e-05, + "loss": 1.3492, + "mean_token_accuracy": 0.6633675500750542, + "num_tokens": 110203424.0, + "step": 3365 + }, + { + "epoch": 0.27945932498548803, + "grad_norm": 1.0273895263671875, + "learning_rate": 9.051540681471071e-05, + "loss": 1.2606, + "mean_token_accuracy": 0.6797031410038471, + "num_tokens": 110366989.0, + "step": 3370 + }, + { + "epoch": 0.2798739530641015, + "grad_norm": 0.9994346499443054, + "learning_rate": 9.047295708814307e-05, + "loss": 1.2776, + "mean_token_accuracy": 0.6767400532960892, + "num_tokens": 110529452.0, + "step": 3375 + }, + { + "epoch": 0.280288581142715, + "grad_norm": 0.9704295992851257, + "learning_rate": 9.043042257931163e-05, + "loss": 1.2921, + "mean_token_accuracy": 0.6753299072384834, + "num_tokens": 110693292.0, + "step": 3380 + }, + { + "epoch": 0.28070320922132846, + "grad_norm": 1.0179647207260132, + "learning_rate": 9.038780337731712e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.6659090921282769, + "num_tokens": 110857132.0, + "step": 3385 + }, + { + "epoch": 0.28111783729994194, + "grad_norm": 1.0432519912719727, + "learning_rate": 9.034509957143775e-05, + "loss": 1.3504, + "mean_token_accuracy": 0.6619195982813835, + "num_tokens": 111020972.0, + "step": 3390 + }, + { + "epoch": 0.2815324653785554, + "grad_norm": 1.0100421905517578, + "learning_rate": 9.030231125112896e-05, + "loss": 1.2802, + "mean_token_accuracy": 0.6753299161791801, + "num_tokens": 111184812.0, + "step": 3395 + }, + { + "epoch": 0.2819470934571689, + "grad_norm": 0.9555953741073608, + "learning_rate": 9.025943850602316e-05, + "loss": 1.3427, + "mean_token_accuracy": 0.6632758989930153, + "num_tokens": 111347809.0, + "step": 3400 + }, + { + "epoch": 0.28236172153578243, + "grad_norm": 1.0390369892120361, + "learning_rate": 9.021648142592971e-05, + "loss": 1.3525, + "mean_token_accuracy": 0.6661779060959816, + "num_tokens": 111511649.0, + "step": 3405 + }, + { + "epoch": 0.2827763496143959, + "grad_norm": 1.0647310018539429, + "learning_rate": 9.017344010083457e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6720491155982018, + "num_tokens": 111675489.0, + "step": 3410 + }, + { + "epoch": 0.2831909776930094, + "grad_norm": 1.0379363298416138, + "learning_rate": 9.01303146209002e-05, + "loss": 1.3333, + "mean_token_accuracy": 0.6658174470067024, + "num_tokens": 111839329.0, + "step": 3415 + }, + { + "epoch": 0.28360560577162286, + "grad_norm": 1.0758144855499268, + "learning_rate": 9.008710507646529e-05, + "loss": 1.2527, + "mean_token_accuracy": 0.6785129502415657, + "num_tokens": 112003169.0, + "step": 3420 + }, + { + "epoch": 0.28402023385023634, + "grad_norm": 0.9726275205612183, + "learning_rate": 9.004381155804473e-05, + "loss": 1.3586, + "mean_token_accuracy": 0.6619318142533303, + "num_tokens": 112167009.0, + "step": 3425 + }, + { + "epoch": 0.2844348619288498, + "grad_norm": 1.0184444189071655, + "learning_rate": 9.000043415632923e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.6635813802480698, + "num_tokens": 112330849.0, + "step": 3430 + }, + { + "epoch": 0.2848494900074633, + "grad_norm": 1.0357671976089478, + "learning_rate": 8.995697296218526e-05, + "loss": 1.3327, + "mean_token_accuracy": 0.6685667164623738, + "num_tokens": 112494689.0, + "step": 3435 + }, + { + "epoch": 0.28526411808607677, + "grad_norm": 0.9924004077911377, + "learning_rate": 8.991342806665481e-05, + "loss": 1.2824, + "mean_token_accuracy": 0.6740469232201576, + "num_tokens": 112658529.0, + "step": 3440 + }, + { + "epoch": 0.2856787461646903, + "grad_norm": 0.9633674621582031, + "learning_rate": 8.98697995609552e-05, + "loss": 1.3339, + "mean_token_accuracy": 0.6692316338419915, + "num_tokens": 112821517.0, + "step": 3445 + }, + { + "epoch": 0.2860933742433038, + "grad_norm": 0.9266213774681091, + "learning_rate": 8.982608753647888e-05, + "loss": 1.2695, + "mean_token_accuracy": 0.6778958901762963, + "num_tokens": 112985357.0, + "step": 3450 + }, + { + "epoch": 0.28650800232191725, + "grad_norm": 0.9768882393836975, + "learning_rate": 8.978229208479331e-05, + "loss": 1.2095, + "mean_token_accuracy": 0.6911997899413109, + "num_tokens": 113148805.0, + "step": 3455 + }, + { + "epoch": 0.28692263040053073, + "grad_norm": 1.002746820449829, + "learning_rate": 8.973841329764066e-05, + "loss": 1.2813, + "mean_token_accuracy": 0.6760325044393539, + "num_tokens": 113312645.0, + "step": 3460 + }, + { + "epoch": 0.2873372584791442, + "grad_norm": 0.9470506310462952, + "learning_rate": 8.969445126693768e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.6911351397633553, + "num_tokens": 113476485.0, + "step": 3465 + }, + { + "epoch": 0.2877518865577577, + "grad_norm": 0.9840176105499268, + "learning_rate": 8.965040608477549e-05, + "loss": 1.3362, + "mean_token_accuracy": 0.6656280577182769, + "num_tokens": 113640325.0, + "step": 3470 + }, + { + "epoch": 0.28816651463637116, + "grad_norm": 1.018842101097107, + "learning_rate": 8.960627784341944e-05, + "loss": 1.3188, + "mean_token_accuracy": 0.6690188199281693, + "num_tokens": 113804165.0, + "step": 3475 + }, + { + "epoch": 0.28858114271498464, + "grad_norm": 1.047323226928711, + "learning_rate": 8.956206663530881e-05, + "loss": 1.2952, + "mean_token_accuracy": 0.6749450147151947, + "num_tokens": 113968005.0, + "step": 3480 + }, + { + "epoch": 0.2889957707935981, + "grad_norm": 1.0208772420883179, + "learning_rate": 8.951777255305673e-05, + "loss": 1.3619, + "mean_token_accuracy": 0.6644733607769012, + "num_tokens": 114131845.0, + "step": 3485 + }, + { + "epoch": 0.28941039887221165, + "grad_norm": 0.9302547574043274, + "learning_rate": 8.94733956894499e-05, + "loss": 1.2659, + "mean_token_accuracy": 0.6765823513269424, + "num_tokens": 114295685.0, + "step": 3490 + }, + { + "epoch": 0.2898250269508251, + "grad_norm": 0.9998297691345215, + "learning_rate": 8.942893613744843e-05, + "loss": 1.3208, + "mean_token_accuracy": 0.6723768964409829, + "num_tokens": 114458469.0, + "step": 3495 + }, + { + "epoch": 0.2902396550294386, + "grad_norm": 0.9959949851036072, + "learning_rate": 8.938439399018567e-05, + "loss": 1.3665, + "mean_token_accuracy": 0.6610520526766777, + "num_tokens": 114622309.0, + "step": 3500 + }, + { + "epoch": 0.2906542831080521, + "grad_norm": 1.010382890701294, + "learning_rate": 8.9339769340968e-05, + "loss": 1.2943, + "mean_token_accuracy": 0.6740163698792457, + "num_tokens": 114786149.0, + "step": 3505 + }, + { + "epoch": 0.29106891118666556, + "grad_norm": 0.9420673251152039, + "learning_rate": 8.929506228327453e-05, + "loss": 1.2948, + "mean_token_accuracy": 0.6721407622098923, + "num_tokens": 114949989.0, + "step": 3510 + }, + { + "epoch": 0.29148353926527903, + "grad_norm": 0.9875288605690002, + "learning_rate": 8.925027291075713e-05, + "loss": 1.2495, + "mean_token_accuracy": 0.6839748278260231, + "num_tokens": 115113829.0, + "step": 3515 + }, + { + "epoch": 0.2918981673438925, + "grad_norm": 1.045227289199829, + "learning_rate": 8.920540131724e-05, + "loss": 1.3585, + "mean_token_accuracy": 0.6651698425412178, + "num_tokens": 115277669.0, + "step": 3520 + }, + { + "epoch": 0.292312795422506, + "grad_norm": 0.992067277431488, + "learning_rate": 8.916044759671964e-05, + "loss": 1.2662, + "mean_token_accuracy": 0.6729960918426514, + "num_tokens": 115441509.0, + "step": 3525 + }, + { + "epoch": 0.2927274235011195, + "grad_norm": 0.9711788892745972, + "learning_rate": 8.911541184336455e-05, + "loss": 1.2847, + "mean_token_accuracy": 0.674321848154068, + "num_tokens": 115605349.0, + "step": 3530 + }, + { + "epoch": 0.293142051579733, + "grad_norm": 0.9849469661712646, + "learning_rate": 8.907029415151509e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6654081135988236, + "num_tokens": 115769189.0, + "step": 3535 + }, + { + "epoch": 0.2935566796583465, + "grad_norm": 0.9864558577537537, + "learning_rate": 8.902509461568324e-05, + "loss": 1.3397, + "mean_token_accuracy": 0.6647116288542747, + "num_tokens": 115933029.0, + "step": 3540 + }, + { + "epoch": 0.29397130773695995, + "grad_norm": 1.0209910869598389, + "learning_rate": 8.897981333055249e-05, + "loss": 1.3385, + "mean_token_accuracy": 0.6657764717936516, + "num_tokens": 116096847.0, + "step": 3545 + }, + { + "epoch": 0.29438593581557343, + "grad_norm": 0.9152947068214417, + "learning_rate": 8.893445039097747e-05, + "loss": 1.2298, + "mean_token_accuracy": 0.6811148285865783, + "num_tokens": 116260162.0, + "step": 3550 + }, + { + "epoch": 0.2948005638941869, + "grad_norm": 1.0247241258621216, + "learning_rate": 8.888900589198397e-05, + "loss": 1.3976, + "mean_token_accuracy": 0.6553091421723366, + "num_tokens": 116424002.0, + "step": 3555 + }, + { + "epoch": 0.2952151919728004, + "grad_norm": 1.0255168676376343, + "learning_rate": 8.884347992876856e-05, + "loss": 1.2533, + "mean_token_accuracy": 0.6799609005451203, + "num_tokens": 116587842.0, + "step": 3560 + }, + { + "epoch": 0.29562982005141386, + "grad_norm": 1.0117552280426025, + "learning_rate": 8.879787259669848e-05, + "loss": 1.326, + "mean_token_accuracy": 0.6681744247674942, + "num_tokens": 116751057.0, + "step": 3565 + }, + { + "epoch": 0.29604444813002734, + "grad_norm": 0.9783384203910828, + "learning_rate": 8.875218399131142e-05, + "loss": 1.2761, + "mean_token_accuracy": 0.6763746306300163, + "num_tokens": 116914897.0, + "step": 3570 + }, + { + "epoch": 0.29645907620864087, + "grad_norm": 1.0150396823883057, + "learning_rate": 8.870641420831534e-05, + "loss": 1.224, + "mean_token_accuracy": 0.6861681327223778, + "num_tokens": 117078737.0, + "step": 3575 + }, + { + "epoch": 0.29687370428725435, + "grad_norm": 0.9749643802642822, + "learning_rate": 8.86605633435882e-05, + "loss": 1.2896, + "mean_token_accuracy": 0.6753237992525101, + "num_tokens": 117242577.0, + "step": 3580 + }, + { + "epoch": 0.2972883323658678, + "grad_norm": 1.0319440364837646, + "learning_rate": 8.861463149317786e-05, + "loss": 1.3142, + "mean_token_accuracy": 0.6703567981719971, + "num_tokens": 117406417.0, + "step": 3585 + }, + { + "epoch": 0.2977029604444813, + "grad_norm": 1.0056684017181396, + "learning_rate": 8.85686187533018e-05, + "loss": 1.275, + "mean_token_accuracy": 0.677285224199295, + "num_tokens": 117568404.0, + "step": 3590 + }, + { + "epoch": 0.2981175885230948, + "grad_norm": 0.9867774844169617, + "learning_rate": 8.852252522034697e-05, + "loss": 1.3728, + "mean_token_accuracy": 0.6640212625265122, + "num_tokens": 117732244.0, + "step": 3595 + }, + { + "epoch": 0.29853221660170826, + "grad_norm": 0.9824956059455872, + "learning_rate": 8.847635099086953e-05, + "loss": 1.294, + "mean_token_accuracy": 0.6750061109662056, + "num_tokens": 117896084.0, + "step": 3600 + }, + { + "epoch": 0.29894684468032173, + "grad_norm": 1.0192714929580688, + "learning_rate": 8.84300961615947e-05, + "loss": 1.2946, + "mean_token_accuracy": 0.6741141274571418, + "num_tokens": 118059924.0, + "step": 3605 + }, + { + "epoch": 0.2993614727589352, + "grad_norm": 0.999767541885376, + "learning_rate": 8.838376082941654e-05, + "loss": 1.2483, + "mean_token_accuracy": 0.6844934061169624, + "num_tokens": 118223311.0, + "step": 3610 + }, + { + "epoch": 0.29977610083754874, + "grad_norm": 0.9591719508171082, + "learning_rate": 8.833734509139778e-05, + "loss": 1.2544, + "mean_token_accuracy": 0.6827162742614746, + "num_tokens": 118387151.0, + "step": 3615 + }, + { + "epoch": 0.3001907289161622, + "grad_norm": 0.9481289386749268, + "learning_rate": 8.829084904476949e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.6749938905239106, + "num_tokens": 118550991.0, + "step": 3620 + }, + { + "epoch": 0.3006053569947757, + "grad_norm": 0.9553553462028503, + "learning_rate": 8.824427278693108e-05, + "loss": 1.2809, + "mean_token_accuracy": 0.6812683321535588, + "num_tokens": 118714831.0, + "step": 3625 + }, + { + "epoch": 0.3010199850733892, + "grad_norm": 1.0011934041976929, + "learning_rate": 8.819761641544992e-05, + "loss": 1.3812, + "mean_token_accuracy": 0.659054248034954, + "num_tokens": 118878671.0, + "step": 3630 + }, + { + "epoch": 0.30143461315200265, + "grad_norm": 1.0294073820114136, + "learning_rate": 8.81508800280612e-05, + "loss": 1.353, + "mean_token_accuracy": 0.6636974602937699, + "num_tokens": 119042511.0, + "step": 3635 + }, + { + "epoch": 0.3018492412306161, + "grad_norm": 1.0390793085098267, + "learning_rate": 8.810406372266778e-05, + "loss": 1.284, + "mean_token_accuracy": 0.6743035152554512, + "num_tokens": 119206351.0, + "step": 3640 + }, + { + "epoch": 0.3022638693092296, + "grad_norm": 0.9407466650009155, + "learning_rate": 8.805716759733984e-05, + "loss": 1.2414, + "mean_token_accuracy": 0.679227763414383, + "num_tokens": 119370191.0, + "step": 3645 + }, + { + "epoch": 0.3026784973878431, + "grad_norm": 0.9619264006614685, + "learning_rate": 8.801019175031486e-05, + "loss": 1.308, + "mean_token_accuracy": 0.6739919319748878, + "num_tokens": 119534031.0, + "step": 3650 + }, + { + "epoch": 0.3030931254664566, + "grad_norm": 1.022631049156189, + "learning_rate": 8.796313627999728e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.6755681842565536, + "num_tokens": 119697871.0, + "step": 3655 + }, + { + "epoch": 0.3035077535450701, + "grad_norm": 1.0383825302124023, + "learning_rate": 8.791600128495832e-05, + "loss": 1.3631, + "mean_token_accuracy": 0.6621700882911682, + "num_tokens": 119861711.0, + "step": 3660 + }, + { + "epoch": 0.30392238162368357, + "grad_norm": 0.9666171669960022, + "learning_rate": 8.786878686393579e-05, + "loss": 1.2866, + "mean_token_accuracy": 0.6758003398776055, + "num_tokens": 120025551.0, + "step": 3665 + }, + { + "epoch": 0.30433700970229705, + "grad_norm": 0.9437312483787537, + "learning_rate": 8.78214931158339e-05, + "loss": 1.2863, + "mean_token_accuracy": 0.6766190111637116, + "num_tokens": 120189391.0, + "step": 3670 + }, + { + "epoch": 0.3047516377809105, + "grad_norm": 1.012891173362732, + "learning_rate": 8.777412013972304e-05, + "loss": 1.3006, + "mean_token_accuracy": 0.6769367054104805, + "num_tokens": 120353231.0, + "step": 3675 + }, + { + "epoch": 0.305166265859524, + "grad_norm": 1.033991813659668, + "learning_rate": 8.772666803483956e-05, + "loss": 1.2896, + "mean_token_accuracy": 0.6741019040346146, + "num_tokens": 120517071.0, + "step": 3680 + }, + { + "epoch": 0.3055808939381375, + "grad_norm": 0.9737828373908997, + "learning_rate": 8.767913690058551e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.675384895503521, + "num_tokens": 120680911.0, + "step": 3685 + }, + { + "epoch": 0.30599552201675095, + "grad_norm": 0.9874985218048096, + "learning_rate": 8.763152683652857e-05, + "loss": 1.3349, + "mean_token_accuracy": 0.6681194260716439, + "num_tokens": 120844087.0, + "step": 3690 + }, + { + "epoch": 0.30641015009536443, + "grad_norm": 1.0114009380340576, + "learning_rate": 8.758383794240172e-05, + "loss": 1.3402, + "mean_token_accuracy": 0.6662512198090553, + "num_tokens": 121007927.0, + "step": 3695 + }, + { + "epoch": 0.30682477817397796, + "grad_norm": 0.9285961389541626, + "learning_rate": 8.753607031810312e-05, + "loss": 1.2309, + "mean_token_accuracy": 0.6873839244246482, + "num_tokens": 121171767.0, + "step": 3700 + }, + { + "epoch": 0.30723940625259144, + "grad_norm": 0.9762885570526123, + "learning_rate": 8.748822406369574e-05, + "loss": 1.2427, + "mean_token_accuracy": 0.682667401432991, + "num_tokens": 121335607.0, + "step": 3705 + }, + { + "epoch": 0.3076540343312049, + "grad_norm": 1.032778024673462, + "learning_rate": 8.74402992794074e-05, + "loss": 1.2541, + "mean_token_accuracy": 0.6826001942157746, + "num_tokens": 121499447.0, + "step": 3710 + }, + { + "epoch": 0.3080686624098184, + "grad_norm": 1.0014163255691528, + "learning_rate": 8.739229606563035e-05, + "loss": 1.2626, + "mean_token_accuracy": 0.6812499985098839, + "num_tokens": 121663287.0, + "step": 3715 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.9528563022613525, + "learning_rate": 8.734421452292114e-05, + "loss": 1.2527, + "mean_token_accuracy": 0.6809811815619469, + "num_tokens": 121827127.0, + "step": 3720 + }, + { + "epoch": 0.30889791856704535, + "grad_norm": 1.0677566528320312, + "learning_rate": 8.72960547520004e-05, + "loss": 1.2492, + "mean_token_accuracy": 0.6800281047821045, + "num_tokens": 121990967.0, + "step": 3725 + }, + { + "epoch": 0.3093125466456588, + "grad_norm": 0.9664039611816406, + "learning_rate": 8.724781685375265e-05, + "loss": 1.3207, + "mean_token_accuracy": 0.6690554708242417, + "num_tokens": 122154807.0, + "step": 3730 + }, + { + "epoch": 0.3097271747242723, + "grad_norm": 1.023189902305603, + "learning_rate": 8.719950092922604e-05, + "loss": 1.2411, + "mean_token_accuracy": 0.6794843584299087, + "num_tokens": 122318647.0, + "step": 3735 + }, + { + "epoch": 0.31014180280288584, + "grad_norm": 0.9762243032455444, + "learning_rate": 8.715110707963221e-05, + "loss": 1.3371, + "mean_token_accuracy": 0.6647421836853027, + "num_tokens": 122482487.0, + "step": 3740 + }, + { + "epoch": 0.3105564308814993, + "grad_norm": 0.9874435663223267, + "learning_rate": 8.710263540634602e-05, + "loss": 1.2827, + "mean_token_accuracy": 0.6774367719888688, + "num_tokens": 122646034.0, + "step": 3745 + }, + { + "epoch": 0.3109710589601128, + "grad_norm": 0.9017988443374634, + "learning_rate": 8.705408601090532e-05, + "loss": 1.1893, + "mean_token_accuracy": 0.6915566936135292, + "num_tokens": 122809874.0, + "step": 3750 + }, + { + "epoch": 0.31138568703872627, + "grad_norm": 0.9966082572937012, + "learning_rate": 8.70054589950108e-05, + "loss": 1.3368, + "mean_token_accuracy": 0.6670882239937782, + "num_tokens": 122973714.0, + "step": 3755 + }, + { + "epoch": 0.31180031511733974, + "grad_norm": 1.0282634496688843, + "learning_rate": 8.695675446052579e-05, + "loss": 1.3208, + "mean_token_accuracy": 0.6717497557401657, + "num_tokens": 123137554.0, + "step": 3760 + }, + { + "epoch": 0.3122149431959532, + "grad_norm": 0.9596050381660461, + "learning_rate": 8.690797250947593e-05, + "loss": 1.3403, + "mean_token_accuracy": 0.6624877825379372, + "num_tokens": 123301394.0, + "step": 3765 + }, + { + "epoch": 0.3126295712745667, + "grad_norm": 1.0352023839950562, + "learning_rate": 8.685911324404906e-05, + "loss": 1.3191, + "mean_token_accuracy": 0.6656373083591461, + "num_tokens": 123464324.0, + "step": 3770 + }, + { + "epoch": 0.3130441993531802, + "grad_norm": 0.9956424832344055, + "learning_rate": 8.681017676659499e-05, + "loss": 1.3084, + "mean_token_accuracy": 0.671352642774582, + "num_tokens": 123628164.0, + "step": 3775 + }, + { + "epoch": 0.3134588274317937, + "grad_norm": 0.9463441967964172, + "learning_rate": 8.676116317962528e-05, + "loss": 1.3019, + "mean_token_accuracy": 0.6744090288877487, + "num_tokens": 123791551.0, + "step": 3780 + }, + { + "epoch": 0.3138734555104072, + "grad_norm": 0.963124692440033, + "learning_rate": 8.671207258581298e-05, + "loss": 1.2478, + "mean_token_accuracy": 0.6797104150056839, + "num_tokens": 123955391.0, + "step": 3785 + }, + { + "epoch": 0.31428808358902066, + "grad_norm": 0.9857081174850464, + "learning_rate": 8.666290508799249e-05, + "loss": 1.3029, + "mean_token_accuracy": 0.6718658357858658, + "num_tokens": 124119231.0, + "step": 3790 + }, + { + "epoch": 0.31470271166763414, + "grad_norm": 0.9956610202789307, + "learning_rate": 8.661366078915926e-05, + "loss": 1.3673, + "mean_token_accuracy": 0.6612109035253525, + "num_tokens": 124283071.0, + "step": 3795 + }, + { + "epoch": 0.3151173397462476, + "grad_norm": 0.9534837603569031, + "learning_rate": 8.656433979246972e-05, + "loss": 1.2769, + "mean_token_accuracy": 0.6763257592916488, + "num_tokens": 124446911.0, + "step": 3800 + }, + { + "epoch": 0.3155319678248611, + "grad_norm": 0.9826126098632812, + "learning_rate": 8.651494220124086e-05, + "loss": 1.297, + "mean_token_accuracy": 0.6729411080479621, + "num_tokens": 124610751.0, + "step": 3805 + }, + { + "epoch": 0.31594659590347457, + "grad_norm": 0.9866495132446289, + "learning_rate": 8.646546811895014e-05, + "loss": 1.2753, + "mean_token_accuracy": 0.675342133641243, + "num_tokens": 124774591.0, + "step": 3810 + }, + { + "epoch": 0.31636122398208805, + "grad_norm": 0.9472578763961792, + "learning_rate": 8.641591764923532e-05, + "loss": 1.2422, + "mean_token_accuracy": 0.6868707269430161, + "num_tokens": 124938431.0, + "step": 3815 + }, + { + "epoch": 0.3167758520607015, + "grad_norm": 0.9497211575508118, + "learning_rate": 8.636629089589409e-05, + "loss": 1.2842, + "mean_token_accuracy": 0.6793316245079041, + "num_tokens": 125102271.0, + "step": 3820 + }, + { + "epoch": 0.31719048013931506, + "grad_norm": 1.0438381433486938, + "learning_rate": 8.631658796288399e-05, + "loss": 1.3518, + "mean_token_accuracy": 0.6632453590631485, + "num_tokens": 125266111.0, + "step": 3825 + }, + { + "epoch": 0.31760510821792853, + "grad_norm": 1.0412575006484985, + "learning_rate": 8.626680895432213e-05, + "loss": 1.3068, + "mean_token_accuracy": 0.6738636314868927, + "num_tokens": 125429951.0, + "step": 3830 + }, + { + "epoch": 0.318019736296542, + "grad_norm": 1.0045636892318726, + "learning_rate": 8.621695397448497e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.6792705297470093, + "num_tokens": 125593791.0, + "step": 3835 + }, + { + "epoch": 0.3184343643751555, + "grad_norm": 0.9786563515663147, + "learning_rate": 8.616702312780813e-05, + "loss": 1.3044, + "mean_token_accuracy": 0.6722140803933143, + "num_tokens": 125757631.0, + "step": 3840 + }, + { + "epoch": 0.31884899245376896, + "grad_norm": 1.0160499811172485, + "learning_rate": 8.611701651888616e-05, + "loss": 1.2634, + "mean_token_accuracy": 0.6827040523290634, + "num_tokens": 125921471.0, + "step": 3845 + }, + { + "epoch": 0.31926362053238244, + "grad_norm": 0.9721868634223938, + "learning_rate": 8.606693425247227e-05, + "loss": 1.2613, + "mean_token_accuracy": 0.6813662230968476, + "num_tokens": 126085196.0, + "step": 3850 + }, + { + "epoch": 0.3196782486109959, + "grad_norm": 0.9894058108329773, + "learning_rate": 8.60167764334782e-05, + "loss": 1.3418, + "mean_token_accuracy": 0.6640518069267273, + "num_tokens": 126249036.0, + "step": 3855 + }, + { + "epoch": 0.3200928766896094, + "grad_norm": 1.0008037090301514, + "learning_rate": 8.596654316697397e-05, + "loss": 1.3918, + "mean_token_accuracy": 0.6580717816948891, + "num_tokens": 126410606.0, + "step": 3860 + }, + { + "epoch": 0.32050750476822293, + "grad_norm": 0.9972857236862183, + "learning_rate": 8.591623455818762e-05, + "loss": 1.336, + "mean_token_accuracy": 0.6668804988265038, + "num_tokens": 126574446.0, + "step": 3865 + }, + { + "epoch": 0.3209221328468364, + "grad_norm": 0.9058495759963989, + "learning_rate": 8.586585071250498e-05, + "loss": 1.351, + "mean_token_accuracy": 0.6672898322343827, + "num_tokens": 126738286.0, + "step": 3870 + }, + { + "epoch": 0.3213367609254499, + "grad_norm": 0.9955798387527466, + "learning_rate": 8.581539173546955e-05, + "loss": 1.2712, + "mean_token_accuracy": 0.6773826986551285, + "num_tokens": 126902126.0, + "step": 3875 + }, + { + "epoch": 0.32175138900406336, + "grad_norm": 1.066311001777649, + "learning_rate": 8.57648577327822e-05, + "loss": 1.2406, + "mean_token_accuracy": 0.6785888627171517, + "num_tokens": 127065196.0, + "step": 3880 + }, + { + "epoch": 0.32216601708267684, + "grad_norm": 0.9382798671722412, + "learning_rate": 8.571424881030093e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.6794049352407455, + "num_tokens": 127229036.0, + "step": 3885 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 1.005324125289917, + "learning_rate": 8.566356507404072e-05, + "loss": 1.3124, + "mean_token_accuracy": 0.6711082607507706, + "num_tokens": 127392876.0, + "step": 3890 + }, + { + "epoch": 0.3229952732399038, + "grad_norm": 0.9854289293289185, + "learning_rate": 8.561280663017324e-05, + "loss": 1.2908, + "mean_token_accuracy": 0.66966642588377, + "num_tokens": 127556716.0, + "step": 3895 + }, + { + "epoch": 0.32340990131851727, + "grad_norm": 1.044110655784607, + "learning_rate": 8.556197358502666e-05, + "loss": 1.2587, + "mean_token_accuracy": 0.6809384137392044, + "num_tokens": 127720556.0, + "step": 3900 + }, + { + "epoch": 0.3238245293971308, + "grad_norm": 0.9688938856124878, + "learning_rate": 8.551106604508545e-05, + "loss": 1.2507, + "mean_token_accuracy": 0.6818731635808944, + "num_tokens": 127884396.0, + "step": 3905 + }, + { + "epoch": 0.3242391574757443, + "grad_norm": 0.9877737760543823, + "learning_rate": 8.546008411699009e-05, + "loss": 1.2911, + "mean_token_accuracy": 0.6763318687677383, + "num_tokens": 128048236.0, + "step": 3910 + }, + { + "epoch": 0.32465378555435775, + "grad_norm": 0.9584382176399231, + "learning_rate": 8.540902790753693e-05, + "loss": 1.329, + "mean_token_accuracy": 0.6675403237342834, + "num_tokens": 128212076.0, + "step": 3915 + }, + { + "epoch": 0.32506841363297123, + "grad_norm": 0.9776164889335632, + "learning_rate": 8.535789752367791e-05, + "loss": 1.3346, + "mean_token_accuracy": 0.6679924249649047, + "num_tokens": 128375916.0, + "step": 3920 + }, + { + "epoch": 0.3254830417115847, + "grad_norm": 1.0723118782043457, + "learning_rate": 8.530669307252033e-05, + "loss": 1.266, + "mean_token_accuracy": 0.6777309402823448, + "num_tokens": 128539756.0, + "step": 3925 + }, + { + "epoch": 0.3258976697901982, + "grad_norm": 0.9671254754066467, + "learning_rate": 8.525541466132665e-05, + "loss": 1.2936, + "mean_token_accuracy": 0.6725837871432304, + "num_tokens": 128703039.0, + "step": 3930 + }, + { + "epoch": 0.32631229786881166, + "grad_norm": 0.9290532469749451, + "learning_rate": 8.520406239751429e-05, + "loss": 1.2614, + "mean_token_accuracy": 0.68012585490942, + "num_tokens": 128866879.0, + "step": 3935 + }, + { + "epoch": 0.32672692594742514, + "grad_norm": 0.9890875220298767, + "learning_rate": 8.515263638865533e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6782450526952744, + "num_tokens": 129030618.0, + "step": 3940 + }, + { + "epoch": 0.3271415540260386, + "grad_norm": 0.9538615942001343, + "learning_rate": 8.510113674247636e-05, + "loss": 1.3331, + "mean_token_accuracy": 0.6691410079598427, + "num_tokens": 129194458.0, + "step": 3945 + }, + { + "epoch": 0.32755618210465215, + "grad_norm": 0.995694637298584, + "learning_rate": 8.504956356685825e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.6659946203231811, + "num_tokens": 129358298.0, + "step": 3950 + }, + { + "epoch": 0.3279708101832656, + "grad_norm": 0.9291507005691528, + "learning_rate": 8.499791696983584e-05, + "loss": 1.2219, + "mean_token_accuracy": 0.68574658036232, + "num_tokens": 129522138.0, + "step": 3955 + }, + { + "epoch": 0.3283854382618791, + "grad_norm": 0.9434182047843933, + "learning_rate": 8.494619705959779e-05, + "loss": 1.2882, + "mean_token_accuracy": 0.6761119276285171, + "num_tokens": 129685978.0, + "step": 3960 + }, + { + "epoch": 0.3288000663404926, + "grad_norm": 0.941871702671051, + "learning_rate": 8.489440394448638e-05, + "loss": 1.3127, + "mean_token_accuracy": 0.6678580120205879, + "num_tokens": 129849818.0, + "step": 3965 + }, + { + "epoch": 0.32921469441910606, + "grad_norm": 0.9243893027305603, + "learning_rate": 8.484253773299718e-05, + "loss": 1.2996, + "mean_token_accuracy": 0.6784090906381607, + "num_tokens": 130013658.0, + "step": 3970 + }, + { + "epoch": 0.32962932249771953, + "grad_norm": 0.9161133766174316, + "learning_rate": 8.479059853377892e-05, + "loss": 1.277, + "mean_token_accuracy": 0.6793010771274567, + "num_tokens": 130177498.0, + "step": 3975 + }, + { + "epoch": 0.330043950576333, + "grad_norm": 0.9451107978820801, + "learning_rate": 8.47385864556332e-05, + "loss": 1.3267, + "mean_token_accuracy": 0.671181571483612, + "num_tokens": 130341338.0, + "step": 3980 + }, + { + "epoch": 0.3304585786549465, + "grad_norm": 0.9498744606971741, + "learning_rate": 8.468650160751428e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6695992231369019, + "num_tokens": 130505178.0, + "step": 3985 + }, + { + "epoch": 0.33087320673356, + "grad_norm": 0.9431107044219971, + "learning_rate": 8.463434409852892e-05, + "loss": 1.3873, + "mean_token_accuracy": 0.6567815229296684, + "num_tokens": 130669018.0, + "step": 3990 + }, + { + "epoch": 0.3312878348121735, + "grad_norm": 0.9482833743095398, + "learning_rate": 8.458211403793599e-05, + "loss": 1.2723, + "mean_token_accuracy": 0.6752749234437943, + "num_tokens": 130832858.0, + "step": 3995 + }, + { + "epoch": 0.331702462890787, + "grad_norm": 0.9537649154663086, + "learning_rate": 8.452981153514643e-05, + "loss": 1.2432, + "mean_token_accuracy": 0.6842253148555756, + "num_tokens": 130996698.0, + "step": 4000 + }, + { + "epoch": 0.33211709096940045, + "grad_norm": 0.9574406743049622, + "learning_rate": 8.44774366997229e-05, + "loss": 1.2732, + "mean_token_accuracy": 0.6766739994287491, + "num_tokens": 131160538.0, + "step": 4005 + }, + { + "epoch": 0.33253171904801393, + "grad_norm": 0.9847679734230042, + "learning_rate": 8.442498964137952e-05, + "loss": 1.3391, + "mean_token_accuracy": 0.6670026913285255, + "num_tokens": 131324378.0, + "step": 4010 + }, + { + "epoch": 0.3329463471266274, + "grad_norm": 0.962680995464325, + "learning_rate": 8.437247046998183e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6737597808241844, + "num_tokens": 131488218.0, + "step": 4015 + }, + { + "epoch": 0.3333609752052409, + "grad_norm": 1.0363256931304932, + "learning_rate": 8.431987929554632e-05, + "loss": 1.3114, + "mean_token_accuracy": 0.6710471659898758, + "num_tokens": 131652058.0, + "step": 4020 + }, + { + "epoch": 0.33377560328385436, + "grad_norm": 1.0140748023986816, + "learning_rate": 8.426721622824035e-05, + "loss": 1.2832, + "mean_token_accuracy": 0.6753176897764206, + "num_tokens": 131815898.0, + "step": 4025 + }, + { + "epoch": 0.3341902313624679, + "grad_norm": 0.9625388979911804, + "learning_rate": 8.421448137838186e-05, + "loss": 1.204, + "mean_token_accuracy": 0.6887280084192753, + "num_tokens": 131979738.0, + "step": 4030 + }, + { + "epoch": 0.33460485944108137, + "grad_norm": 0.9752932190895081, + "learning_rate": 8.416167485643923e-05, + "loss": 1.3647, + "mean_token_accuracy": 0.6627566017210483, + "num_tokens": 132143578.0, + "step": 4035 + }, + { + "epoch": 0.33501948751969485, + "grad_norm": 0.9496031999588013, + "learning_rate": 8.410879677303087e-05, + "loss": 1.3112, + "mean_token_accuracy": 0.6705584079027176, + "num_tokens": 132307418.0, + "step": 4040 + }, + { + "epoch": 0.3354341155983083, + "grad_norm": 0.9461378455162048, + "learning_rate": 8.405584723892521e-05, + "loss": 1.1767, + "mean_token_accuracy": 0.6935592070221901, + "num_tokens": 132471091.0, + "step": 4045 + }, + { + "epoch": 0.3358487436769218, + "grad_norm": 0.9161537885665894, + "learning_rate": 8.400282636504027e-05, + "loss": 1.3451, + "mean_token_accuracy": 0.667497555911541, + "num_tokens": 132634931.0, + "step": 4050 + }, + { + "epoch": 0.3362633717555353, + "grad_norm": 0.9966378808021545, + "learning_rate": 8.394973426244352e-05, + "loss": 1.3231, + "mean_token_accuracy": 0.6694892451167107, + "num_tokens": 132798771.0, + "step": 4055 + }, + { + "epoch": 0.33667799983414876, + "grad_norm": 0.9339237809181213, + "learning_rate": 8.38965710423517e-05, + "loss": 1.2202, + "mean_token_accuracy": 0.6860826000571251, + "num_tokens": 132962611.0, + "step": 4060 + }, + { + "epoch": 0.33709262791276223, + "grad_norm": 0.9363735318183899, + "learning_rate": 8.384333681613044e-05, + "loss": 1.2093, + "mean_token_accuracy": 0.68759775608778, + "num_tokens": 133126451.0, + "step": 4065 + }, + { + "epoch": 0.3375072559913757, + "grad_norm": 0.9587835073471069, + "learning_rate": 8.37900316952942e-05, + "loss": 1.254, + "mean_token_accuracy": 0.6832692757248878, + "num_tokens": 133290113.0, + "step": 4070 + }, + { + "epoch": 0.33792188406998924, + "grad_norm": 0.9150619506835938, + "learning_rate": 8.373665579150587e-05, + "loss": 1.3307, + "mean_token_accuracy": 0.6668804973363877, + "num_tokens": 133453953.0, + "step": 4075 + }, + { + "epoch": 0.3383365121486027, + "grad_norm": 0.99213707447052, + "learning_rate": 8.368320921657666e-05, + "loss": 1.2631, + "mean_token_accuracy": 0.6791481450200081, + "num_tokens": 133617318.0, + "step": 4080 + }, + { + "epoch": 0.3387511402272162, + "grad_norm": 0.9785225987434387, + "learning_rate": 8.362969208246582e-05, + "loss": 1.271, + "mean_token_accuracy": 0.67991201877594, + "num_tokens": 133781158.0, + "step": 4085 + }, + { + "epoch": 0.3391657683058297, + "grad_norm": 0.8981971740722656, + "learning_rate": 8.357610450128042e-05, + "loss": 1.3013, + "mean_token_accuracy": 0.673093843460083, + "num_tokens": 133944998.0, + "step": 4090 + }, + { + "epoch": 0.33958039638444315, + "grad_norm": 0.930395245552063, + "learning_rate": 8.352244658527504e-05, + "loss": 1.2701, + "mean_token_accuracy": 0.6777633026242256, + "num_tokens": 134108425.0, + "step": 4095 + }, + { + "epoch": 0.3399950244630566, + "grad_norm": 0.9164111018180847, + "learning_rate": 8.346871844685167e-05, + "loss": 1.226, + "mean_token_accuracy": 0.6855266347527504, + "num_tokens": 134272265.0, + "step": 4100 + }, + { + "epoch": 0.3404096525416701, + "grad_norm": 0.9662408828735352, + "learning_rate": 8.341492019855934e-05, + "loss": 1.3584, + "mean_token_accuracy": 0.6646721988916398, + "num_tokens": 134436081.0, + "step": 4105 + }, + { + "epoch": 0.3408242806202836, + "grad_norm": 0.9960159063339233, + "learning_rate": 8.3361051953094e-05, + "loss": 1.3134, + "mean_token_accuracy": 0.6722201898694038, + "num_tokens": 134599921.0, + "step": 4110 + }, + { + "epoch": 0.3412389086988971, + "grad_norm": 0.8849697113037109, + "learning_rate": 8.330711382329817e-05, + "loss": 1.2717, + "mean_token_accuracy": 0.6770100191235542, + "num_tokens": 134763761.0, + "step": 4115 + }, + { + "epoch": 0.3416535367775106, + "grad_norm": 0.9529863595962524, + "learning_rate": 8.325310592216082e-05, + "loss": 1.2397, + "mean_token_accuracy": 0.6822275161743164, + "num_tokens": 134927601.0, + "step": 4120 + }, + { + "epoch": 0.34206816485612407, + "grad_norm": 0.9730038642883301, + "learning_rate": 8.319902836281706e-05, + "loss": 1.346, + "mean_token_accuracy": 0.6663184270262719, + "num_tokens": 135091441.0, + "step": 4125 + }, + { + "epoch": 0.34248279293473755, + "grad_norm": 0.9488906860351562, + "learning_rate": 8.31448812585479e-05, + "loss": 1.289, + "mean_token_accuracy": 0.6779081113636494, + "num_tokens": 135255281.0, + "step": 4130 + }, + { + "epoch": 0.342897421013351, + "grad_norm": 1.0032751560211182, + "learning_rate": 8.309066472278004e-05, + "loss": 1.3417, + "mean_token_accuracy": 0.6674425736069679, + "num_tokens": 135419121.0, + "step": 4135 + }, + { + "epoch": 0.3433120490919645, + "grad_norm": 0.99687659740448, + "learning_rate": 8.303637886908562e-05, + "loss": 1.2785, + "mean_token_accuracy": 0.6735092848539352, + "num_tokens": 135582961.0, + "step": 4140 + }, + { + "epoch": 0.343726677170578, + "grad_norm": 0.9653187990188599, + "learning_rate": 8.2982023811182e-05, + "loss": 1.3828, + "mean_token_accuracy": 0.6618096277117729, + "num_tokens": 135746801.0, + "step": 4145 + }, + { + "epoch": 0.34414130524919145, + "grad_norm": 0.9359713196754456, + "learning_rate": 8.292759966293152e-05, + "loss": 1.2622, + "mean_token_accuracy": 0.6757636874914169, + "num_tokens": 135910641.0, + "step": 4150 + }, + { + "epoch": 0.344555933327805, + "grad_norm": 0.9051123857498169, + "learning_rate": 8.287310653834121e-05, + "loss": 1.2661, + "mean_token_accuracy": 0.6835622668266297, + "num_tokens": 136073698.0, + "step": 4155 + }, + { + "epoch": 0.34497056140641846, + "grad_norm": 0.9394945502281189, + "learning_rate": 8.281854455156262e-05, + "loss": 1.3008, + "mean_token_accuracy": 0.672403471171856, + "num_tokens": 136237538.0, + "step": 4160 + }, + { + "epoch": 0.34538518948503194, + "grad_norm": 0.9870340824127197, + "learning_rate": 8.276391381689152e-05, + "loss": 1.2776, + "mean_token_accuracy": 0.6784396395087242, + "num_tokens": 136401378.0, + "step": 4165 + }, + { + "epoch": 0.3457998175636454, + "grad_norm": 0.9649732112884521, + "learning_rate": 8.270921444876775e-05, + "loss": 1.323, + "mean_token_accuracy": 0.6687744408845901, + "num_tokens": 136565218.0, + "step": 4170 + }, + { + "epoch": 0.3462144456422589, + "grad_norm": 0.9824701547622681, + "learning_rate": 8.26544465617749e-05, + "loss": 1.3662, + "mean_token_accuracy": 0.6621753796935081, + "num_tokens": 136728291.0, + "step": 4175 + }, + { + "epoch": 0.34662907372087237, + "grad_norm": 0.9233903288841248, + "learning_rate": 8.259961027064003e-05, + "loss": 1.2652, + "mean_token_accuracy": 0.6777920290827751, + "num_tokens": 136892131.0, + "step": 4180 + }, + { + "epoch": 0.34704370179948585, + "grad_norm": 0.8987513780593872, + "learning_rate": 8.254470569023359e-05, + "loss": 1.3039, + "mean_token_accuracy": 0.6722690597176552, + "num_tokens": 137055971.0, + "step": 4185 + }, + { + "epoch": 0.3474583298780993, + "grad_norm": 1.0128217935562134, + "learning_rate": 8.2489732935569e-05, + "loss": 1.3247, + "mean_token_accuracy": 0.6686278089880944, + "num_tokens": 137219811.0, + "step": 4190 + }, + { + "epoch": 0.3478729579567128, + "grad_norm": 0.9634391069412231, + "learning_rate": 8.243469212180254e-05, + "loss": 1.2436, + "mean_token_accuracy": 0.6825207725167275, + "num_tokens": 137383651.0, + "step": 4195 + }, + { + "epoch": 0.34828758603532634, + "grad_norm": 0.9329254031181335, + "learning_rate": 8.237958336423305e-05, + "loss": 1.2822, + "mean_token_accuracy": 0.6733443275094032, + "num_tokens": 137547491.0, + "step": 4200 + }, + { + "epoch": 0.3487022141139398, + "grad_norm": 0.9092962741851807, + "learning_rate": 8.232440677830168e-05, + "loss": 1.3034, + "mean_token_accuracy": 0.6714442819356918, + "num_tokens": 137711331.0, + "step": 4205 + }, + { + "epoch": 0.3491168421925533, + "grad_norm": 1.0242863893508911, + "learning_rate": 8.22691624795917e-05, + "loss": 1.3614, + "mean_token_accuracy": 0.663538607954979, + "num_tokens": 137875171.0, + "step": 4210 + }, + { + "epoch": 0.34953147027116677, + "grad_norm": 2.263258934020996, + "learning_rate": 8.221385058382818e-05, + "loss": 1.3176, + "mean_token_accuracy": 0.6717314258217811, + "num_tokens": 138039011.0, + "step": 4215 + }, + { + "epoch": 0.34994609834978024, + "grad_norm": 0.9587223529815674, + "learning_rate": 8.215847120687783e-05, + "loss": 1.3566, + "mean_token_accuracy": 0.6651515141129494, + "num_tokens": 138202851.0, + "step": 4220 + }, + { + "epoch": 0.3503607264283937, + "grad_norm": 0.9732730388641357, + "learning_rate": 8.210302446474869e-05, + "loss": 1.2507, + "mean_token_accuracy": 0.6835585042834282, + "num_tokens": 138366064.0, + "step": 4225 + }, + { + "epoch": 0.3507753545070072, + "grad_norm": 0.9751377701759338, + "learning_rate": 8.204751047358993e-05, + "loss": 1.3179, + "mean_token_accuracy": 0.6698497042059899, + "num_tokens": 138529904.0, + "step": 4230 + }, + { + "epoch": 0.3511899825856207, + "grad_norm": 0.9684930443763733, + "learning_rate": 8.199192934969163e-05, + "loss": 1.2834, + "mean_token_accuracy": 0.6742214113473892, + "num_tokens": 138693733.0, + "step": 4235 + }, + { + "epoch": 0.3516046106642342, + "grad_norm": 0.9790882468223572, + "learning_rate": 8.19362812094844e-05, + "loss": 1.3213, + "mean_token_accuracy": 0.6715420380234718, + "num_tokens": 138857573.0, + "step": 4240 + }, + { + "epoch": 0.3520192387428477, + "grad_norm": 0.9955540299415588, + "learning_rate": 8.188056616953932e-05, + "loss": 1.3012, + "mean_token_accuracy": 0.6734115317463875, + "num_tokens": 139021413.0, + "step": 4245 + }, + { + "epoch": 0.35243386682146116, + "grad_norm": 0.9717499613761902, + "learning_rate": 8.18247843465676e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6813660755753517, + "num_tokens": 139185253.0, + "step": 4250 + }, + { + "epoch": 0.35284849490007464, + "grad_norm": 0.985129177570343, + "learning_rate": 8.176893585742031e-05, + "loss": 1.2865, + "mean_token_accuracy": 0.6772849515080452, + "num_tokens": 139349093.0, + "step": 4255 + }, + { + "epoch": 0.3532631229786881, + "grad_norm": 0.9623016715049744, + "learning_rate": 8.171302081908819e-05, + "loss": 1.2734, + "mean_token_accuracy": 0.6783895179629326, + "num_tokens": 139512598.0, + "step": 4260 + }, + { + "epoch": 0.3536777510573016, + "grad_norm": 0.9025546312332153, + "learning_rate": 8.165703934870142e-05, + "loss": 1.2964, + "mean_token_accuracy": 0.674847262352705, + "num_tokens": 139676438.0, + "step": 4265 + }, + { + "epoch": 0.35409237913591507, + "grad_norm": 0.9645045399665833, + "learning_rate": 8.160099156352929e-05, + "loss": 1.2387, + "mean_token_accuracy": 0.6831665381789207, + "num_tokens": 139839810.0, + "step": 4270 + }, + { + "epoch": 0.35450700721452855, + "grad_norm": 0.9419810175895691, + "learning_rate": 8.154487758098003e-05, + "loss": 1.2537, + "mean_token_accuracy": 0.6783122837543487, + "num_tokens": 140003379.0, + "step": 4275 + }, + { + "epoch": 0.3549216352931421, + "grad_norm": 0.9562869668006897, + "learning_rate": 8.148869751860053e-05, + "loss": 1.227, + "mean_token_accuracy": 0.6850745335221291, + "num_tokens": 140167219.0, + "step": 4280 + }, + { + "epoch": 0.35533626337175556, + "grad_norm": 1.0040013790130615, + "learning_rate": 8.143245149407612e-05, + "loss": 1.2568, + "mean_token_accuracy": 0.682539102435112, + "num_tokens": 140331059.0, + "step": 4285 + }, + { + "epoch": 0.35575089145036903, + "grad_norm": 0.9768736958503723, + "learning_rate": 8.13761396252303e-05, + "loss": 1.3256, + "mean_token_accuracy": 0.6704789817333221, + "num_tokens": 140494899.0, + "step": 4290 + }, + { + "epoch": 0.3561655195289825, + "grad_norm": 0.9282314777374268, + "learning_rate": 8.131976203002447e-05, + "loss": 1.1742, + "mean_token_accuracy": 0.6952040582895279, + "num_tokens": 140658739.0, + "step": 4295 + }, + { + "epoch": 0.356580147607596, + "grad_norm": 1.0825862884521484, + "learning_rate": 8.126331882655775e-05, + "loss": 1.3217, + "mean_token_accuracy": 0.6707844614982605, + "num_tokens": 140822579.0, + "step": 4300 + }, + { + "epoch": 0.35699477568620946, + "grad_norm": 0.962060809135437, + "learning_rate": 8.12068101330667e-05, + "loss": 1.2716, + "mean_token_accuracy": 0.6770894408226014, + "num_tokens": 140986419.0, + "step": 4305 + }, + { + "epoch": 0.35740940376482294, + "grad_norm": 0.98125821352005, + "learning_rate": 8.115023606792505e-05, + "loss": 1.222, + "mean_token_accuracy": 0.6857465758919716, + "num_tokens": 141150259.0, + "step": 4310 + }, + { + "epoch": 0.3578240318434364, + "grad_norm": 0.9417420029640198, + "learning_rate": 8.109359674964345e-05, + "loss": 1.2183, + "mean_token_accuracy": 0.6839931562542916, + "num_tokens": 141314099.0, + "step": 4315 + }, + { + "epoch": 0.3582386599220499, + "grad_norm": 0.9744819402694702, + "learning_rate": 8.103689229686929e-05, + "loss": 1.2387, + "mean_token_accuracy": 0.6847996070981026, + "num_tokens": 141477939.0, + "step": 4320 + }, + { + "epoch": 0.35865328800066343, + "grad_norm": 0.9578313231468201, + "learning_rate": 8.098012282838634e-05, + "loss": 1.3259, + "mean_token_accuracy": 0.6701979517936707, + "num_tokens": 141641779.0, + "step": 4325 + }, + { + "epoch": 0.3590679160792769, + "grad_norm": 0.9743626713752747, + "learning_rate": 8.092328846311464e-05, + "loss": 1.2282, + "mean_token_accuracy": 0.6806573778390884, + "num_tokens": 141805619.0, + "step": 4330 + }, + { + "epoch": 0.3594825441578904, + "grad_norm": 0.9296370148658752, + "learning_rate": 8.08663893201101e-05, + "loss": 1.2349, + "mean_token_accuracy": 0.6843108460307121, + "num_tokens": 141969459.0, + "step": 4335 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.92148357629776, + "learning_rate": 8.080942551856436e-05, + "loss": 1.2489, + "mean_token_accuracy": 0.685392227768898, + "num_tokens": 142133299.0, + "step": 4340 + }, + { + "epoch": 0.36031180031511734, + "grad_norm": 0.9848425388336182, + "learning_rate": 8.075239717780455e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6786168172955513, + "num_tokens": 142297139.0, + "step": 4345 + }, + { + "epoch": 0.3607264283937308, + "grad_norm": 1.043373465538025, + "learning_rate": 8.069530441729291e-05, + "loss": 1.3162, + "mean_token_accuracy": 0.6734550461173058, + "num_tokens": 142460884.0, + "step": 4350 + }, + { + "epoch": 0.3611410564723443, + "grad_norm": 0.927104651927948, + "learning_rate": 8.06381473566267e-05, + "loss": 1.2539, + "mean_token_accuracy": 0.681940370798111, + "num_tokens": 142624724.0, + "step": 4355 + }, + { + "epoch": 0.36155568455095777, + "grad_norm": 0.9074141383171082, + "learning_rate": 8.058092611553782e-05, + "loss": 1.2578, + "mean_token_accuracy": 0.6808372125029564, + "num_tokens": 142788207.0, + "step": 4360 + }, + { + "epoch": 0.3619703126295713, + "grad_norm": 0.979074239730835, + "learning_rate": 8.052364081389263e-05, + "loss": 1.3162, + "mean_token_accuracy": 0.6724095776677131, + "num_tokens": 142952047.0, + "step": 4365 + }, + { + "epoch": 0.3623849407081848, + "grad_norm": 0.9424482583999634, + "learning_rate": 8.046629157169172e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.6769992977380752, + "num_tokens": 143115013.0, + "step": 4370 + }, + { + "epoch": 0.36279956878679825, + "grad_norm": 0.969394862651825, + "learning_rate": 8.040887850906957e-05, + "loss": 1.2637, + "mean_token_accuracy": 0.6814088448882103, + "num_tokens": 143278853.0, + "step": 4375 + }, + { + "epoch": 0.36321419686541173, + "grad_norm": 0.9611478447914124, + "learning_rate": 8.035140174629438e-05, + "loss": 1.2938, + "mean_token_accuracy": 0.67542155534029, + "num_tokens": 143442693.0, + "step": 4380 + }, + { + "epoch": 0.3636288249440252, + "grad_norm": 0.9548108577728271, + "learning_rate": 8.02938614037678e-05, + "loss": 1.2743, + "mean_token_accuracy": 0.6769916892051697, + "num_tokens": 143606533.0, + "step": 4385 + }, + { + "epoch": 0.3640434530226387, + "grad_norm": 0.9090794920921326, + "learning_rate": 8.023625760202463e-05, + "loss": 1.2429, + "mean_token_accuracy": 0.6857918426394463, + "num_tokens": 143770118.0, + "step": 4390 + }, + { + "epoch": 0.36445808110125216, + "grad_norm": 0.881459653377533, + "learning_rate": 8.01785904617326e-05, + "loss": 1.2454, + "mean_token_accuracy": 0.6801380708813667, + "num_tokens": 143933958.0, + "step": 4395 + }, + { + "epoch": 0.36487270917986564, + "grad_norm": 0.9274210929870605, + "learning_rate": 8.012086010369218e-05, + "loss": 1.2423, + "mean_token_accuracy": 0.6870169594883919, + "num_tokens": 144096876.0, + "step": 4400 + }, + { + "epoch": 0.3652873372584791, + "grad_norm": 0.9701513648033142, + "learning_rate": 8.00630666488362e-05, + "loss": 1.1798, + "mean_token_accuracy": 0.6948985800147056, + "num_tokens": 144260716.0, + "step": 4405 + }, + { + "epoch": 0.36570196533709265, + "grad_norm": 0.9466547966003418, + "learning_rate": 8.000521021822972e-05, + "loss": 1.2971, + "mean_token_accuracy": 0.6743035256862641, + "num_tokens": 144424556.0, + "step": 4410 + }, + { + "epoch": 0.3661165934157061, + "grad_norm": 0.9436784982681274, + "learning_rate": 7.994729093306968e-05, + "loss": 1.2801, + "mean_token_accuracy": 0.6743096321821213, + "num_tokens": 144588396.0, + "step": 4415 + }, + { + "epoch": 0.3665312214943196, + "grad_norm": 0.9617013931274414, + "learning_rate": 7.98893089146847e-05, + "loss": 1.2311, + "mean_token_accuracy": 0.6872229784727096, + "num_tokens": 144751878.0, + "step": 4420 + }, + { + "epoch": 0.3669458495729331, + "grad_norm": 1.0159647464752197, + "learning_rate": 7.983126428453482e-05, + "loss": 1.2889, + "mean_token_accuracy": 0.6737414434552192, + "num_tokens": 144915718.0, + "step": 4425 + }, + { + "epoch": 0.36736047765154656, + "grad_norm": 0.9690437912940979, + "learning_rate": 7.977315716421125e-05, + "loss": 1.271, + "mean_token_accuracy": 0.6776820629835129, + "num_tokens": 145079558.0, + "step": 4430 + }, + { + "epoch": 0.36777510573016003, + "grad_norm": 0.9304080009460449, + "learning_rate": 7.971498767543604e-05, + "loss": 1.2298, + "mean_token_accuracy": 0.684420819580555, + "num_tokens": 145243398.0, + "step": 4435 + }, + { + "epoch": 0.3681897338087735, + "grad_norm": 0.8567568063735962, + "learning_rate": 7.965675594006198e-05, + "loss": 1.2391, + "mean_token_accuracy": 0.6845796659588814, + "num_tokens": 145407238.0, + "step": 4440 + }, + { + "epoch": 0.368604361887387, + "grad_norm": 1.0175611972808838, + "learning_rate": 7.959846208007221e-05, + "loss": 1.2621, + "mean_token_accuracy": 0.6806634843349457, + "num_tokens": 145571078.0, + "step": 4445 + }, + { + "epoch": 0.3690189899660005, + "grad_norm": 1.036781668663025, + "learning_rate": 7.954010621758e-05, + "loss": 1.2196, + "mean_token_accuracy": 0.687646621465683, + "num_tokens": 145734918.0, + "step": 4450 + }, + { + "epoch": 0.369433618044614, + "grad_norm": 0.9784215688705444, + "learning_rate": 7.948168847482846e-05, + "loss": 1.2364, + "mean_token_accuracy": 0.6837732195854187, + "num_tokens": 145898758.0, + "step": 4455 + }, + { + "epoch": 0.3698482461232275, + "grad_norm": 0.9979178309440613, + "learning_rate": 7.942320897419044e-05, + "loss": 1.2061, + "mean_token_accuracy": 0.6896566465497017, + "num_tokens": 146062598.0, + "step": 4460 + }, + { + "epoch": 0.37026287420184095, + "grad_norm": 0.9144283533096313, + "learning_rate": 7.936466783816808e-05, + "loss": 1.2689, + "mean_token_accuracy": 0.6753910094499588, + "num_tokens": 146226438.0, + "step": 4465 + }, + { + "epoch": 0.37067750228045443, + "grad_norm": 0.8742868304252625, + "learning_rate": 7.930606518939261e-05, + "loss": 1.2428, + "mean_token_accuracy": 0.6869623616337777, + "num_tokens": 146390278.0, + "step": 4470 + }, + { + "epoch": 0.3710921303590679, + "grad_norm": 0.9299134612083435, + "learning_rate": 7.924740115062419e-05, + "loss": 1.2078, + "mean_token_accuracy": 0.6866507828235626, + "num_tokens": 146554118.0, + "step": 4475 + }, + { + "epoch": 0.3715067584376814, + "grad_norm": 0.9693060517311096, + "learning_rate": 7.918867584475154e-05, + "loss": 1.2774, + "mean_token_accuracy": 0.6766065344214439, + "num_tokens": 146716945.0, + "step": 4480 + }, + { + "epoch": 0.37192138651629486, + "grad_norm": 0.9707568287849426, + "learning_rate": 7.912988939479174e-05, + "loss": 1.2349, + "mean_token_accuracy": 0.6863880708813668, + "num_tokens": 146880785.0, + "step": 4485 + }, + { + "epoch": 0.3723360145949084, + "grad_norm": 0.9991249442100525, + "learning_rate": 7.90710419238899e-05, + "loss": 1.3343, + "mean_token_accuracy": 0.6660679325461387, + "num_tokens": 147044625.0, + "step": 4490 + }, + { + "epoch": 0.37275064267352187, + "grad_norm": 0.9615370631217957, + "learning_rate": 7.901213355531901e-05, + "loss": 1.3219, + "mean_token_accuracy": 0.6711326971650123, + "num_tokens": 147208465.0, + "step": 4495 + }, + { + "epoch": 0.37316527075213535, + "grad_norm": 0.9403915405273438, + "learning_rate": 7.895316441247962e-05, + "loss": 1.2407, + "mean_token_accuracy": 0.6798142716288567, + "num_tokens": 147372305.0, + "step": 4500 + }, + { + "epoch": 0.3735798988307488, + "grad_norm": 1.0085158348083496, + "learning_rate": 7.889413461889957e-05, + "loss": 1.2442, + "mean_token_accuracy": 0.6817213639616966, + "num_tokens": 147535888.0, + "step": 4505 + }, + { + "epoch": 0.3739945269093623, + "grad_norm": 0.9304342865943909, + "learning_rate": 7.883504429823377e-05, + "loss": 1.2535, + "mean_token_accuracy": 0.6781647145748139, + "num_tokens": 147699728.0, + "step": 4510 + }, + { + "epoch": 0.3744091549879758, + "grad_norm": 0.9755577445030212, + "learning_rate": 7.877589357426392e-05, + "loss": 1.3527, + "mean_token_accuracy": 0.666141252219677, + "num_tokens": 147863568.0, + "step": 4515 + }, + { + "epoch": 0.37482378306658926, + "grad_norm": 0.888272762298584, + "learning_rate": 7.871668257089822e-05, + "loss": 1.2179, + "mean_token_accuracy": 0.6887952148914337, + "num_tokens": 148027408.0, + "step": 4520 + }, + { + "epoch": 0.37523841114520273, + "grad_norm": 0.8954936861991882, + "learning_rate": 7.86574114121712e-05, + "loss": 1.2342, + "mean_token_accuracy": 0.6860153958201408, + "num_tokens": 148191248.0, + "step": 4525 + }, + { + "epoch": 0.3756530392238162, + "grad_norm": 0.9565839767456055, + "learning_rate": 7.859808022224335e-05, + "loss": 1.2451, + "mean_token_accuracy": 0.6833516657352448, + "num_tokens": 148355088.0, + "step": 4530 + }, + { + "epoch": 0.37606766730242974, + "grad_norm": 0.9371515512466431, + "learning_rate": 7.853868912540095e-05, + "loss": 1.2363, + "mean_token_accuracy": 0.684695751965046, + "num_tokens": 148518928.0, + "step": 4535 + }, + { + "epoch": 0.3764822953810432, + "grad_norm": 0.9867259860038757, + "learning_rate": 7.847923824605572e-05, + "loss": 1.3794, + "mean_token_accuracy": 0.6623289346694946, + "num_tokens": 148682768.0, + "step": 4540 + }, + { + "epoch": 0.3768969234596567, + "grad_norm": 0.9817521572113037, + "learning_rate": 7.841972770874469e-05, + "loss": 1.2832, + "mean_token_accuracy": 0.6744990229606629, + "num_tokens": 148846608.0, + "step": 4545 + }, + { + "epoch": 0.3773115515382702, + "grad_norm": 0.9609804153442383, + "learning_rate": 7.836015763812978e-05, + "loss": 1.3478, + "mean_token_accuracy": 0.6655303075909614, + "num_tokens": 149010448.0, + "step": 4550 + }, + { + "epoch": 0.37772617961688365, + "grad_norm": 0.9168370962142944, + "learning_rate": 7.830052815899769e-05, + "loss": 1.4083, + "mean_token_accuracy": 0.6589259445667267, + "num_tokens": 149174288.0, + "step": 4555 + }, + { + "epoch": 0.3781408076954971, + "grad_norm": 0.9401752352714539, + "learning_rate": 7.824083939625953e-05, + "loss": 1.3108, + "mean_token_accuracy": 0.6691959947347641, + "num_tokens": 149338128.0, + "step": 4560 + }, + { + "epoch": 0.3785554357741106, + "grad_norm": 0.9858892560005188, + "learning_rate": 7.818109147495057e-05, + "loss": 1.2495, + "mean_token_accuracy": 0.6821542009711266, + "num_tokens": 149501968.0, + "step": 4565 + }, + { + "epoch": 0.3789700638527241, + "grad_norm": 0.933495283126831, + "learning_rate": 7.812128452023008e-05, + "loss": 1.2661, + "mean_token_accuracy": 0.6834799602627755, + "num_tokens": 149665808.0, + "step": 4570 + }, + { + "epoch": 0.3793846919313376, + "grad_norm": 1.671836495399475, + "learning_rate": 7.806141865738092e-05, + "loss": 1.2599, + "mean_token_accuracy": 0.6817564889788628, + "num_tokens": 149828698.0, + "step": 4575 + }, + { + "epoch": 0.3797993200099511, + "grad_norm": 0.8905841708183289, + "learning_rate": 7.80014940118094e-05, + "loss": 1.2838, + "mean_token_accuracy": 0.6760935962200165, + "num_tokens": 149992538.0, + "step": 4580 + }, + { + "epoch": 0.38021394808856457, + "grad_norm": 0.8741480112075806, + "learning_rate": 7.794151070904492e-05, + "loss": 1.2684, + "mean_token_accuracy": 0.6797715067863465, + "num_tokens": 150156378.0, + "step": 4585 + }, + { + "epoch": 0.38062857616717805, + "grad_norm": 0.9679931998252869, + "learning_rate": 7.788146887473984e-05, + "loss": 1.2268, + "mean_token_accuracy": 0.6868401765823364, + "num_tokens": 150320218.0, + "step": 4590 + }, + { + "epoch": 0.3810432042457915, + "grad_norm": 1.0136057138442993, + "learning_rate": 7.7821368634669e-05, + "loss": 1.264, + "mean_token_accuracy": 0.6758919849991798, + "num_tokens": 150484058.0, + "step": 4595 + }, + { + "epoch": 0.381457832324405, + "grad_norm": 0.8925061821937561, + "learning_rate": 7.77612101147297e-05, + "loss": 1.2511, + "mean_token_accuracy": 0.6838954016566277, + "num_tokens": 150647898.0, + "step": 4600 + }, + { + "epoch": 0.3818724604030185, + "grad_norm": 0.9632482528686523, + "learning_rate": 7.770099344094126e-05, + "loss": 1.2628, + "mean_token_accuracy": 0.67837243527174, + "num_tokens": 150811738.0, + "step": 4605 + }, + { + "epoch": 0.38228708848163195, + "grad_norm": 0.9291356205940247, + "learning_rate": 7.764071873944488e-05, + "loss": 1.2562, + "mean_token_accuracy": 0.6821419820189476, + "num_tokens": 150975578.0, + "step": 4610 + }, + { + "epoch": 0.3827017165602455, + "grad_norm": 0.9034490585327148, + "learning_rate": 7.758038613650325e-05, + "loss": 1.2158, + "mean_token_accuracy": 0.6865102618932724, + "num_tokens": 151139418.0, + "step": 4615 + }, + { + "epoch": 0.38311634463885896, + "grad_norm": 0.9467730522155762, + "learning_rate": 7.75199957585004e-05, + "loss": 1.3079, + "mean_token_accuracy": 0.6715264797210694, + "num_tokens": 151302594.0, + "step": 4620 + }, + { + "epoch": 0.38353097271747244, + "grad_norm": 0.9477065205574036, + "learning_rate": 7.745954773194135e-05, + "loss": 1.2109, + "mean_token_accuracy": 0.6921913221478462, + "num_tokens": 151465897.0, + "step": 4625 + }, + { + "epoch": 0.3839456007960859, + "grad_norm": 0.9852918982505798, + "learning_rate": 7.739904218345192e-05, + "loss": 1.2157, + "mean_token_accuracy": 0.6889140188694001, + "num_tokens": 151629259.0, + "step": 4630 + }, + { + "epoch": 0.3843602288746994, + "grad_norm": 0.9387479424476624, + "learning_rate": 7.733847923977839e-05, + "loss": 1.3291, + "mean_token_accuracy": 0.6675158843398095, + "num_tokens": 151793099.0, + "step": 4635 + }, + { + "epoch": 0.38477485695331287, + "grad_norm": 0.9405195713043213, + "learning_rate": 7.727785902778728e-05, + "loss": 1.2637, + "mean_token_accuracy": 0.6774315729737281, + "num_tokens": 151956939.0, + "step": 4640 + }, + { + "epoch": 0.38518948503192635, + "grad_norm": 0.9244207143783569, + "learning_rate": 7.72171816744651e-05, + "loss": 1.2571, + "mean_token_accuracy": 0.6825879812240601, + "num_tokens": 152120779.0, + "step": 4645 + }, + { + "epoch": 0.3856041131105398, + "grad_norm": 0.9538646936416626, + "learning_rate": 7.715644730691802e-05, + "loss": 1.2506, + "mean_token_accuracy": 0.6821515664458275, + "num_tokens": 152284101.0, + "step": 4650 + }, + { + "epoch": 0.3860187411891533, + "grad_norm": 0.9201942086219788, + "learning_rate": 7.709565605237168e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.6904081106185913, + "num_tokens": 152447941.0, + "step": 4655 + }, + { + "epoch": 0.38643336926776684, + "grad_norm": 1.0343220233917236, + "learning_rate": 7.703480803817087e-05, + "loss": 1.2443, + "mean_token_accuracy": 0.6835288360714913, + "num_tokens": 152611781.0, + "step": 4660 + }, + { + "epoch": 0.3868479973463803, + "grad_norm": 0.9714875221252441, + "learning_rate": 7.697390339177925e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6763013213872909, + "num_tokens": 152775621.0, + "step": 4665 + }, + { + "epoch": 0.3872626254249938, + "grad_norm": 0.9571571350097656, + "learning_rate": 7.691294224077919e-05, + "loss": 1.2557, + "mean_token_accuracy": 0.6802358254790306, + "num_tokens": 152939461.0, + "step": 4670 + }, + { + "epoch": 0.38767725350360727, + "grad_norm": 0.9107460379600525, + "learning_rate": 7.685192471287134e-05, + "loss": 1.2313, + "mean_token_accuracy": 0.6853433534502983, + "num_tokens": 153103301.0, + "step": 4675 + }, + { + "epoch": 0.38809188158222074, + "grad_norm": 0.8951613306999207, + "learning_rate": 7.679085093587449e-05, + "loss": 1.3226, + "mean_token_accuracy": 0.6727455973625183, + "num_tokens": 153267141.0, + "step": 4680 + }, + { + "epoch": 0.3885065096608342, + "grad_norm": 1.0090018510818481, + "learning_rate": 7.672972103772524e-05, + "loss": 1.2378, + "mean_token_accuracy": 0.689778833091259, + "num_tokens": 153430981.0, + "step": 4685 + }, + { + "epoch": 0.3889211377394477, + "grad_norm": 0.94710773229599, + "learning_rate": 7.666853514647781e-05, + "loss": 1.2251, + "mean_token_accuracy": 0.6845307916402816, + "num_tokens": 153594821.0, + "step": 4690 + }, + { + "epoch": 0.3893357658180612, + "grad_norm": 0.9767709970474243, + "learning_rate": 7.660729339030361e-05, + "loss": 1.234, + "mean_token_accuracy": 0.6856488302350044, + "num_tokens": 153758661.0, + "step": 4695 + }, + { + "epoch": 0.3897503938966747, + "grad_norm": 1.0423895120620728, + "learning_rate": 7.654599589749119e-05, + "loss": 1.2757, + "mean_token_accuracy": 0.6767481967806817, + "num_tokens": 153921750.0, + "step": 4700 + }, + { + "epoch": 0.3901650219752882, + "grad_norm": 0.9277583956718445, + "learning_rate": 7.648464279644575e-05, + "loss": 1.2619, + "mean_token_accuracy": 0.6783602133393287, + "num_tokens": 154085590.0, + "step": 4705 + }, + { + "epoch": 0.39057965005390166, + "grad_norm": 0.8938004374504089, + "learning_rate": 7.642323421568906e-05, + "loss": 1.1811, + "mean_token_accuracy": 0.6952956974506378, + "num_tokens": 154249430.0, + "step": 4710 + }, + { + "epoch": 0.39099427813251514, + "grad_norm": 0.9139187335968018, + "learning_rate": 7.636177028385909e-05, + "loss": 1.2321, + "mean_token_accuracy": 0.6850405231118202, + "num_tokens": 154412883.0, + "step": 4715 + }, + { + "epoch": 0.3914089062111286, + "grad_norm": 0.9672526121139526, + "learning_rate": 7.63002511297097e-05, + "loss": 1.2406, + "mean_token_accuracy": 0.6823204308748245, + "num_tokens": 154575713.0, + "step": 4720 + }, + { + "epoch": 0.3918235342897421, + "grad_norm": 0.9284092783927917, + "learning_rate": 7.623867688211053e-05, + "loss": 1.1583, + "mean_token_accuracy": 0.6993890523910522, + "num_tokens": 154739553.0, + "step": 4725 + }, + { + "epoch": 0.39223816236835557, + "grad_norm": 0.920409083366394, + "learning_rate": 7.617704767004653e-05, + "loss": 1.2206, + "mean_token_accuracy": 0.6864186227321625, + "num_tokens": 154903393.0, + "step": 4730 + }, + { + "epoch": 0.39265279044696905, + "grad_norm": 0.9640065431594849, + "learning_rate": 7.611536362261783e-05, + "loss": 1.242, + "mean_token_accuracy": 0.6832172498106956, + "num_tokens": 155067233.0, + "step": 4735 + }, + { + "epoch": 0.3930674185255826, + "grad_norm": 0.9314320683479309, + "learning_rate": 7.605362486903946e-05, + "loss": 1.2077, + "mean_token_accuracy": 0.6876893937587738, + "num_tokens": 155231073.0, + "step": 4740 + }, + { + "epoch": 0.39348204660419606, + "grad_norm": 0.9220605492591858, + "learning_rate": 7.5991831538641e-05, + "loss": 1.2445, + "mean_token_accuracy": 0.6840542554855347, + "num_tokens": 155394913.0, + "step": 4745 + }, + { + "epoch": 0.39389667468280953, + "grad_norm": 0.909056544303894, + "learning_rate": 7.59299837608664e-05, + "loss": 1.2791, + "mean_token_accuracy": 0.6786107078194619, + "num_tokens": 155558753.0, + "step": 4750 + }, + { + "epoch": 0.394311302761423, + "grad_norm": 0.9505742192268372, + "learning_rate": 7.586808166527361e-05, + "loss": 1.241, + "mean_token_accuracy": 0.6831500500440597, + "num_tokens": 155722593.0, + "step": 4755 + }, + { + "epoch": 0.3947259308400365, + "grad_norm": 0.8974282145500183, + "learning_rate": 7.58061253815344e-05, + "loss": 1.2235, + "mean_token_accuracy": 0.6804130047559738, + "num_tokens": 155886433.0, + "step": 4760 + }, + { + "epoch": 0.39514055891864996, + "grad_norm": 0.9585201740264893, + "learning_rate": 7.574411503943406e-05, + "loss": 1.3285, + "mean_token_accuracy": 0.666739983856678, + "num_tokens": 156050273.0, + "step": 4765 + }, + { + "epoch": 0.39555518699726344, + "grad_norm": 0.9355363845825195, + "learning_rate": 7.568205076887109e-05, + "loss": 1.2458, + "mean_token_accuracy": 0.6835899338126182, + "num_tokens": 156214113.0, + "step": 4770 + }, + { + "epoch": 0.3959698150758769, + "grad_norm": 0.9360225200653076, + "learning_rate": 7.561993269985703e-05, + "loss": 1.3031, + "mean_token_accuracy": 0.6800586521625519, + "num_tokens": 156377953.0, + "step": 4775 + }, + { + "epoch": 0.3963844431544904, + "grad_norm": 2.4573659896850586, + "learning_rate": 7.555776096251599e-05, + "loss": 1.3057, + "mean_token_accuracy": 0.6769516438245773, + "num_tokens": 156540933.0, + "step": 4780 + }, + { + "epoch": 0.39679907123310393, + "grad_norm": 0.8595539331436157, + "learning_rate": 7.549553568708462e-05, + "loss": 1.2688, + "mean_token_accuracy": 0.6785801604390145, + "num_tokens": 156704773.0, + "step": 4785 + }, + { + "epoch": 0.3972136993117174, + "grad_norm": 0.9471558928489685, + "learning_rate": 7.543325700391169e-05, + "loss": 1.2423, + "mean_token_accuracy": 0.6851234093308449, + "num_tokens": 156868613.0, + "step": 4790 + }, + { + "epoch": 0.3976283273903309, + "grad_norm": 0.8847547769546509, + "learning_rate": 7.537092504345781e-05, + "loss": 1.1858, + "mean_token_accuracy": 0.6924975574016571, + "num_tokens": 157032453.0, + "step": 4795 + }, + { + "epoch": 0.39804295546894436, + "grad_norm": 0.9858616590499878, + "learning_rate": 7.530853993629524e-05, + "loss": 1.2537, + "mean_token_accuracy": 0.6855498313903808, + "num_tokens": 157195089.0, + "step": 4800 + }, + { + "epoch": 0.39845758354755784, + "grad_norm": 0.9740419387817383, + "learning_rate": 7.524610181310752e-05, + "loss": 1.2528, + "mean_token_accuracy": 0.6816410079598427, + "num_tokens": 157358929.0, + "step": 4805 + }, + { + "epoch": 0.3988722116261713, + "grad_norm": 0.8679446578025818, + "learning_rate": 7.518361080468931e-05, + "loss": 1.2289, + "mean_token_accuracy": 0.6904386594891548, + "num_tokens": 157522769.0, + "step": 4810 + }, + { + "epoch": 0.3992868397047848, + "grad_norm": 0.9794730544090271, + "learning_rate": 7.512106704194602e-05, + "loss": 1.3254, + "mean_token_accuracy": 0.6709757059812546, + "num_tokens": 157685877.0, + "step": 4815 + }, + { + "epoch": 0.39970146778339827, + "grad_norm": 0.9422624707221985, + "learning_rate": 7.505847065589357e-05, + "loss": 1.3384, + "mean_token_accuracy": 0.6665444731712341, + "num_tokens": 157849717.0, + "step": 4820 + }, + { + "epoch": 0.4001160958620118, + "grad_norm": 0.9462253451347351, + "learning_rate": 7.499582177765811e-05, + "loss": 1.2414, + "mean_token_accuracy": 0.6833699867129326, + "num_tokens": 158013557.0, + "step": 4825 + }, + { + "epoch": 0.4005307239406253, + "grad_norm": 0.9390513300895691, + "learning_rate": 7.493312053847578e-05, + "loss": 1.2295, + "mean_token_accuracy": 0.685948196053505, + "num_tokens": 158177397.0, + "step": 4830 + }, + { + "epoch": 0.40094535201923875, + "grad_norm": 0.9818820953369141, + "learning_rate": 7.487036706969234e-05, + "loss": 1.2479, + "mean_token_accuracy": 0.6862170085310936, + "num_tokens": 158341237.0, + "step": 4835 + }, + { + "epoch": 0.40135998009785223, + "grad_norm": 0.9143067002296448, + "learning_rate": 7.480756150276303e-05, + "loss": 1.3002, + "mean_token_accuracy": 0.6738724946975708, + "num_tokens": 158504344.0, + "step": 4840 + }, + { + "epoch": 0.4017746081764657, + "grad_norm": 0.9471181035041809, + "learning_rate": 7.47447039692522e-05, + "loss": 1.3073, + "mean_token_accuracy": 0.6732954531908035, + "num_tokens": 158668184.0, + "step": 4845 + }, + { + "epoch": 0.4021892362550792, + "grad_norm": 0.9081429243087769, + "learning_rate": 7.468179460083302e-05, + "loss": 1.2684, + "mean_token_accuracy": 0.6803519010543824, + "num_tokens": 158832024.0, + "step": 4850 + }, + { + "epoch": 0.40260386433369266, + "grad_norm": 0.93732088804245, + "learning_rate": 7.461883352928734e-05, + "loss": 1.2306, + "mean_token_accuracy": 0.6859359741210938, + "num_tokens": 158995864.0, + "step": 4855 + }, + { + "epoch": 0.40301849241230614, + "grad_norm": 0.9196427464485168, + "learning_rate": 7.455582088650521e-05, + "loss": 1.2741, + "mean_token_accuracy": 0.6800830900669098, + "num_tokens": 159159704.0, + "step": 4860 + }, + { + "epoch": 0.40343312049091967, + "grad_norm": 0.9257554411888123, + "learning_rate": 7.449275680448475e-05, + "loss": 1.2941, + "mean_token_accuracy": 0.6768084064126014, + "num_tokens": 159323544.0, + "step": 4865 + }, + { + "epoch": 0.40384774856953315, + "grad_norm": 0.9372634291648865, + "learning_rate": 7.442964141533187e-05, + "loss": 1.2781, + "mean_token_accuracy": 0.6789235323667526, + "num_tokens": 159487002.0, + "step": 4870 + }, + { + "epoch": 0.4042623766481466, + "grad_norm": 0.920061469078064, + "learning_rate": 7.436647485125993e-05, + "loss": 1.3087, + "mean_token_accuracy": 0.6718169584870338, + "num_tokens": 159650842.0, + "step": 4875 + }, + { + "epoch": 0.4046770047267601, + "grad_norm": 0.9787533283233643, + "learning_rate": 7.430325724458945e-05, + "loss": 1.2569, + "mean_token_accuracy": 0.6805657401680947, + "num_tokens": 159814682.0, + "step": 4880 + }, + { + "epoch": 0.4050916328053736, + "grad_norm": 0.9405059814453125, + "learning_rate": 7.423998872774795e-05, + "loss": 1.2982, + "mean_token_accuracy": 0.6744195967912674, + "num_tokens": 159978522.0, + "step": 4885 + }, + { + "epoch": 0.40550626088398706, + "grad_norm": 0.9164022207260132, + "learning_rate": 7.417666943326954e-05, + "loss": 1.2315, + "mean_token_accuracy": 0.6812695115804672, + "num_tokens": 160142292.0, + "step": 4890 + }, + { + "epoch": 0.40592088896260053, + "grad_norm": 0.896062433719635, + "learning_rate": 7.411329949379473e-05, + "loss": 1.2897, + "mean_token_accuracy": 0.6766129016876221, + "num_tokens": 160306132.0, + "step": 4895 + }, + { + "epoch": 0.406335517041214, + "grad_norm": 0.9773444533348083, + "learning_rate": 7.40498790420701e-05, + "loss": 1.2571, + "mean_token_accuracy": 0.6823374912142753, + "num_tokens": 160469972.0, + "step": 4900 + }, + { + "epoch": 0.4067501451198275, + "grad_norm": 0.9660437107086182, + "learning_rate": 7.398640821094803e-05, + "loss": 1.3766, + "mean_token_accuracy": 0.6599706739187241, + "num_tokens": 160633812.0, + "step": 4905 + }, + { + "epoch": 0.407164773198441, + "grad_norm": 0.9986489415168762, + "learning_rate": 7.39228871333865e-05, + "loss": 1.2598, + "mean_token_accuracy": 0.6785618305206299, + "num_tokens": 160797652.0, + "step": 4910 + }, + { + "epoch": 0.4075794012770545, + "grad_norm": 0.9626627564430237, + "learning_rate": 7.385931594244865e-05, + "loss": 1.2133, + "mean_token_accuracy": 0.6895649999380111, + "num_tokens": 160961492.0, + "step": 4915 + }, + { + "epoch": 0.407994029355668, + "grad_norm": 0.9181817770004272, + "learning_rate": 7.379569477130269e-05, + "loss": 1.179, + "mean_token_accuracy": 0.6955156370997428, + "num_tokens": 161125332.0, + "step": 4920 + }, + { + "epoch": 0.40840865743428145, + "grad_norm": 0.9096157550811768, + "learning_rate": 7.373202375322144e-05, + "loss": 1.2787, + "mean_token_accuracy": 0.6760080680251122, + "num_tokens": 161289172.0, + "step": 4925 + }, + { + "epoch": 0.40882328551289493, + "grad_norm": 0.9276206493377686, + "learning_rate": 7.36683030215822e-05, + "loss": 1.2459, + "mean_token_accuracy": 0.6854533195495606, + "num_tokens": 161453012.0, + "step": 4930 + }, + { + "epoch": 0.4092379135915084, + "grad_norm": 0.9503964185714722, + "learning_rate": 7.360453270986642e-05, + "loss": 1.2499, + "mean_token_accuracy": 0.681518816947937, + "num_tokens": 161616852.0, + "step": 4935 + }, + { + "epoch": 0.4096525416701219, + "grad_norm": 0.943750262260437, + "learning_rate": 7.354071295165936e-05, + "loss": 1.2608, + "mean_token_accuracy": 0.681390517950058, + "num_tokens": 161780692.0, + "step": 4940 + }, + { + "epoch": 0.41006716974873536, + "grad_norm": 0.9272693991661072, + "learning_rate": 7.347684388064987e-05, + "loss": 1.3221, + "mean_token_accuracy": 0.6698435947299004, + "num_tokens": 161944532.0, + "step": 4945 + }, + { + "epoch": 0.4104817978273489, + "grad_norm": 0.8784942626953125, + "learning_rate": 7.341292563063014e-05, + "loss": 1.2676, + "mean_token_accuracy": 0.6795271277427674, + "num_tokens": 162108372.0, + "step": 4950 + }, + { + "epoch": 0.41089642590596237, + "grad_norm": 0.922111451625824, + "learning_rate": 7.334895833549533e-05, + "loss": 1.1891, + "mean_token_accuracy": 0.6913672983646393, + "num_tokens": 162272212.0, + "step": 4955 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.9174176454544067, + "learning_rate": 7.328494212924335e-05, + "loss": 1.2044, + "mean_token_accuracy": 0.6894794717431069, + "num_tokens": 162436052.0, + "step": 4960 + }, + { + "epoch": 0.4117256820631893, + "grad_norm": 0.925289511680603, + "learning_rate": 7.322087714597461e-05, + "loss": 1.2223, + "mean_token_accuracy": 0.6847018539905548, + "num_tokens": 162599892.0, + "step": 4965 + }, + { + "epoch": 0.4121403101418028, + "grad_norm": 0.9320161938667297, + "learning_rate": 7.315676351989164e-05, + "loss": 1.2919, + "mean_token_accuracy": 0.6725500985980034, + "num_tokens": 162763732.0, + "step": 4970 + }, + { + "epoch": 0.4125549382204163, + "grad_norm": 0.918063759803772, + "learning_rate": 7.309260138529892e-05, + "loss": 1.2638, + "mean_token_accuracy": 0.6777309387922287, + "num_tokens": 162927572.0, + "step": 4975 + }, + { + "epoch": 0.41296956629902976, + "grad_norm": 0.9324549436569214, + "learning_rate": 7.302839087660251e-05, + "loss": 1.2293, + "mean_token_accuracy": 0.6880620747804642, + "num_tokens": 163091412.0, + "step": 4980 + }, + { + "epoch": 0.41338419437764323, + "grad_norm": 0.9459344148635864, + "learning_rate": 7.296413212830979e-05, + "loss": 1.296, + "mean_token_accuracy": 0.6782135888934135, + "num_tokens": 163255252.0, + "step": 4985 + }, + { + "epoch": 0.41379882245625677, + "grad_norm": 0.9537709355354309, + "learning_rate": 7.289982527502923e-05, + "loss": 1.2983, + "mean_token_accuracy": 0.6713343113660812, + "num_tokens": 163419092.0, + "step": 4990 + }, + { + "epoch": 0.41421345053487024, + "grad_norm": 0.9475631713867188, + "learning_rate": 7.283547045147005e-05, + "loss": 1.2518, + "mean_token_accuracy": 0.6799449473619461, + "num_tokens": 163582720.0, + "step": 4995 + }, + { + "epoch": 0.4146280786134837, + "grad_norm": 0.9904707074165344, + "learning_rate": 7.277106779244196e-05, + "loss": 1.3101, + "mean_token_accuracy": 0.6728284910321236, + "num_tokens": 163746262.0, + "step": 5000 + }, + { + "epoch": 0.4150427066920972, + "grad_norm": 0.931486964225769, + "learning_rate": 7.270661743285489e-05, + "loss": 1.3136, + "mean_token_accuracy": 0.6726686611771584, + "num_tokens": 163909642.0, + "step": 5005 + }, + { + "epoch": 0.4154573347707107, + "grad_norm": 0.9016469717025757, + "learning_rate": 7.264211950771865e-05, + "loss": 1.2161, + "mean_token_accuracy": 0.6878421351313591, + "num_tokens": 164073482.0, + "step": 5010 + }, + { + "epoch": 0.41587196284932415, + "grad_norm": 0.9519628882408142, + "learning_rate": 7.257757415214275e-05, + "loss": 1.2858, + "mean_token_accuracy": 0.6751099690794945, + "num_tokens": 164237322.0, + "step": 5015 + }, + { + "epoch": 0.4162865909279376, + "grad_norm": 0.9091925621032715, + "learning_rate": 7.251298150133598e-05, + "loss": 1.248, + "mean_token_accuracy": 0.6883980944752693, + "num_tokens": 164401162.0, + "step": 5020 + }, + { + "epoch": 0.4167012190065511, + "grad_norm": 0.90833979845047, + "learning_rate": 7.24483416906063e-05, + "loss": 1.2673, + "mean_token_accuracy": 0.6829545453190804, + "num_tokens": 164565002.0, + "step": 5025 + }, + { + "epoch": 0.4171158470851646, + "grad_norm": 0.8844804763793945, + "learning_rate": 7.238365485536038e-05, + "loss": 1.161, + "mean_token_accuracy": 0.6969330415129662, + "num_tokens": 164728842.0, + "step": 5030 + }, + { + "epoch": 0.4175304751637781, + "grad_norm": 0.9600077867507935, + "learning_rate": 7.231892113110342e-05, + "loss": 1.2595, + "mean_token_accuracy": 0.6765945747494697, + "num_tokens": 164892682.0, + "step": 5035 + }, + { + "epoch": 0.4179451032423916, + "grad_norm": 0.8916535973548889, + "learning_rate": 7.225414065343886e-05, + "loss": 1.2524, + "mean_token_accuracy": 0.6826246321201325, + "num_tokens": 165056522.0, + "step": 5040 + }, + { + "epoch": 0.41835973132100507, + "grad_norm": 0.9368508458137512, + "learning_rate": 7.218931355806808e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.686858506500721, + "num_tokens": 165220362.0, + "step": 5045 + }, + { + "epoch": 0.41877435939961855, + "grad_norm": 0.9774593710899353, + "learning_rate": 7.212443998079006e-05, + "loss": 1.3668, + "mean_token_accuracy": 0.6624511271715164, + "num_tokens": 165384202.0, + "step": 5050 + }, + { + "epoch": 0.419188987478232, + "grad_norm": 0.9264464974403381, + "learning_rate": 7.205952005750121e-05, + "loss": 1.2484, + "mean_token_accuracy": 0.678195258975029, + "num_tokens": 165548042.0, + "step": 5055 + }, + { + "epoch": 0.4196036155568455, + "grad_norm": 0.8728965520858765, + "learning_rate": 7.199455392419502e-05, + "loss": 1.2964, + "mean_token_accuracy": 0.676967254281044, + "num_tokens": 165711882.0, + "step": 5060 + }, + { + "epoch": 0.420018243635459, + "grad_norm": 0.9150432348251343, + "learning_rate": 7.192954171696173e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.6928763419389725, + "num_tokens": 165875722.0, + "step": 5065 + }, + { + "epoch": 0.42043287171407245, + "grad_norm": 0.9414107203483582, + "learning_rate": 7.186448357198819e-05, + "loss": 1.2934, + "mean_token_accuracy": 0.6748655915260315, + "num_tokens": 166039562.0, + "step": 5070 + }, + { + "epoch": 0.420847499792686, + "grad_norm": 0.9863566756248474, + "learning_rate": 7.179937962555734e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.6804740965366364, + "num_tokens": 166203402.0, + "step": 5075 + }, + { + "epoch": 0.42126212787129946, + "grad_norm": 0.9362086057662964, + "learning_rate": 7.173423001404821e-05, + "loss": 1.1731, + "mean_token_accuracy": 0.6968719467520714, + "num_tokens": 166367242.0, + "step": 5080 + }, + { + "epoch": 0.42167675594991294, + "grad_norm": 0.9288221597671509, + "learning_rate": 7.166903487393539e-05, + "loss": 1.305, + "mean_token_accuracy": 0.6745540015399456, + "num_tokens": 166531082.0, + "step": 5085 + }, + { + "epoch": 0.4220913840285264, + "grad_norm": 0.9393354654312134, + "learning_rate": 7.160379434178888e-05, + "loss": 1.282, + "mean_token_accuracy": 0.6779623195528984, + "num_tokens": 166693932.0, + "step": 5090 + }, + { + "epoch": 0.4225060121071399, + "grad_norm": 0.8841371536254883, + "learning_rate": 7.153850855427376e-05, + "loss": 1.2141, + "mean_token_accuracy": 0.6882575780153275, + "num_tokens": 166857772.0, + "step": 5095 + }, + { + "epoch": 0.42292064018575337, + "grad_norm": 0.9394116997718811, + "learning_rate": 7.147317764814992e-05, + "loss": 1.2814, + "mean_token_accuracy": 0.6750122174620629, + "num_tokens": 167021612.0, + "step": 5100 + }, + { + "epoch": 0.42333526826436685, + "grad_norm": 0.8560900092124939, + "learning_rate": 7.140780176027177e-05, + "loss": 1.1895, + "mean_token_accuracy": 0.695643937587738, + "num_tokens": 167185452.0, + "step": 5105 + }, + { + "epoch": 0.4237498963429803, + "grad_norm": 0.9591403603553772, + "learning_rate": 7.13423810275879e-05, + "loss": 1.2349, + "mean_token_accuracy": 0.6861192584037781, + "num_tokens": 167349292.0, + "step": 5110 + }, + { + "epoch": 0.4241645244215938, + "grad_norm": 0.9461228251457214, + "learning_rate": 7.127691558714091e-05, + "loss": 1.3244, + "mean_token_accuracy": 0.6665647983551025, + "num_tokens": 167512669.0, + "step": 5115 + }, + { + "epoch": 0.42457915250020734, + "grad_norm": 0.946561872959137, + "learning_rate": 7.121140557606699e-05, + "loss": 1.2473, + "mean_token_accuracy": 0.6869655027985573, + "num_tokens": 167675955.0, + "step": 5120 + }, + { + "epoch": 0.4249937805788208, + "grad_norm": 0.9129251837730408, + "learning_rate": 7.114585113159571e-05, + "loss": 1.2625, + "mean_token_accuracy": 0.6778080701828003, + "num_tokens": 167839038.0, + "step": 5125 + }, + { + "epoch": 0.4254084086574343, + "grad_norm": 0.9162349700927734, + "learning_rate": 7.108025239104978e-05, + "loss": 1.276, + "mean_token_accuracy": 0.676509042084217, + "num_tokens": 168002878.0, + "step": 5130 + }, + { + "epoch": 0.42582303673604777, + "grad_norm": 0.9409959316253662, + "learning_rate": 7.101460949184464e-05, + "loss": 1.2804, + "mean_token_accuracy": 0.6783541053533554, + "num_tokens": 168166718.0, + "step": 5135 + }, + { + "epoch": 0.42623766481466124, + "grad_norm": 0.9891248941421509, + "learning_rate": 7.094892257148821e-05, + "loss": 1.2819, + "mean_token_accuracy": 0.6758308865129947, + "num_tokens": 168330558.0, + "step": 5140 + }, + { + "epoch": 0.4266522928932747, + "grad_norm": 0.899552583694458, + "learning_rate": 7.088319176758069e-05, + "loss": 1.1743, + "mean_token_accuracy": 0.6964381694793701, + "num_tokens": 168494398.0, + "step": 5145 + }, + { + "epoch": 0.4270669209718882, + "grad_norm": 0.8751155138015747, + "learning_rate": 7.081741721781418e-05, + "loss": 1.2244, + "mean_token_accuracy": 0.6883736550807953, + "num_tokens": 168658238.0, + "step": 5150 + }, + { + "epoch": 0.4274815490505017, + "grad_norm": 0.9431947469711304, + "learning_rate": 7.07515990599724e-05, + "loss": 1.2772, + "mean_token_accuracy": 0.6795393422245979, + "num_tokens": 168822078.0, + "step": 5155 + }, + { + "epoch": 0.4278961771291152, + "grad_norm": 0.9290785789489746, + "learning_rate": 7.068573743193047e-05, + "loss": 1.2116, + "mean_token_accuracy": 0.6866263419389724, + "num_tokens": 168985918.0, + "step": 5160 + }, + { + "epoch": 0.4283108052077287, + "grad_norm": 0.9355618953704834, + "learning_rate": 7.061983247165447e-05, + "loss": 1.2549, + "mean_token_accuracy": 0.6818164244294167, + "num_tokens": 169149728.0, + "step": 5165 + }, + { + "epoch": 0.42872543328634216, + "grad_norm": 0.9035906195640564, + "learning_rate": 7.055388431720139e-05, + "loss": 1.2103, + "mean_token_accuracy": 0.6902064979076385, + "num_tokens": 169313568.0, + "step": 5170 + }, + { + "epoch": 0.42914006136495564, + "grad_norm": 0.9600555300712585, + "learning_rate": 7.048789310671859e-05, + "loss": 1.281, + "mean_token_accuracy": 0.6786229223012924, + "num_tokens": 169477408.0, + "step": 5175 + }, + { + "epoch": 0.4295546894435691, + "grad_norm": 0.9158090353012085, + "learning_rate": 7.042185897844367e-05, + "loss": 1.2422, + "mean_token_accuracy": 0.6817509770393372, + "num_tokens": 169641248.0, + "step": 5180 + }, + { + "epoch": 0.4299693175221826, + "grad_norm": 0.856984555721283, + "learning_rate": 7.035578207070412e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.69148338586092, + "num_tokens": 169805088.0, + "step": 5185 + }, + { + "epoch": 0.43038394560079607, + "grad_norm": 0.9986769556999207, + "learning_rate": 7.028966252191709e-05, + "loss": 1.3264, + "mean_token_accuracy": 0.6711939051747322, + "num_tokens": 169968657.0, + "step": 5190 + }, + { + "epoch": 0.43079857367940955, + "grad_norm": 0.9320401549339294, + "learning_rate": 7.022350047058897e-05, + "loss": 1.2592, + "mean_token_accuracy": 0.6817143216729165, + "num_tokens": 170132497.0, + "step": 5195 + }, + { + "epoch": 0.4312132017580231, + "grad_norm": 0.8438582420349121, + "learning_rate": 7.015729605531526e-05, + "loss": 1.1353, + "mean_token_accuracy": 0.7033174499869347, + "num_tokens": 170296337.0, + "step": 5200 + }, + { + "epoch": 0.43162782983663656, + "grad_norm": 0.9292356371879578, + "learning_rate": 7.009104941478015e-05, + "loss": 1.2829, + "mean_token_accuracy": 0.6764112934470177, + "num_tokens": 170460177.0, + "step": 5205 + }, + { + "epoch": 0.43204245791525003, + "grad_norm": 0.9409948587417603, + "learning_rate": 7.002476068775633e-05, + "loss": 1.2898, + "mean_token_accuracy": 0.6755437485873699, + "num_tokens": 170624017.0, + "step": 5210 + }, + { + "epoch": 0.4324570859938635, + "grad_norm": 0.9407399892807007, + "learning_rate": 6.995843001310463e-05, + "loss": 1.2212, + "mean_token_accuracy": 0.6895344540476799, + "num_tokens": 170787857.0, + "step": 5215 + }, + { + "epoch": 0.432871714072477, + "grad_norm": 0.9206990599632263, + "learning_rate": 6.98920575297737e-05, + "loss": 1.3201, + "mean_token_accuracy": 0.6725439876317978, + "num_tokens": 170951697.0, + "step": 5220 + }, + { + "epoch": 0.43328634215109046, + "grad_norm": 0.9319395422935486, + "learning_rate": 6.982564337679986e-05, + "loss": 1.2724, + "mean_token_accuracy": 0.6791238963603974, + "num_tokens": 171115537.0, + "step": 5225 + }, + { + "epoch": 0.43370097022970394, + "grad_norm": 0.9000293016433716, + "learning_rate": 6.975918769330669e-05, + "loss": 1.1855, + "mean_token_accuracy": 0.6975562095642089, + "num_tokens": 171279377.0, + "step": 5230 + }, + { + "epoch": 0.4341155983083174, + "grad_norm": 0.9152920842170715, + "learning_rate": 6.969269061850474e-05, + "loss": 1.2542, + "mean_token_accuracy": 0.6791566848754883, + "num_tokens": 171443130.0, + "step": 5235 + }, + { + "epoch": 0.4345302263869309, + "grad_norm": 0.906944990158081, + "learning_rate": 6.962615229169129e-05, + "loss": 1.2295, + "mean_token_accuracy": 0.6885508254170418, + "num_tokens": 171606970.0, + "step": 5240 + }, + { + "epoch": 0.43494485446554443, + "grad_norm": 0.9342389702796936, + "learning_rate": 6.955957285225001e-05, + "loss": 1.3235, + "mean_token_accuracy": 0.670918869972229, + "num_tokens": 171770810.0, + "step": 5245 + }, + { + "epoch": 0.4353594825441579, + "grad_norm": 0.9459275007247925, + "learning_rate": 6.949295243965073e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.6794415950775147, + "num_tokens": 171934650.0, + "step": 5250 + }, + { + "epoch": 0.4357741106227714, + "grad_norm": 0.9350090026855469, + "learning_rate": 6.942629119344907e-05, + "loss": 1.1836, + "mean_token_accuracy": 0.6913062110543251, + "num_tokens": 172098490.0, + "step": 5255 + }, + { + "epoch": 0.43618873870138486, + "grad_norm": 0.9977842569351196, + "learning_rate": 6.935958925328622e-05, + "loss": 1.279, + "mean_token_accuracy": 0.6781463846564293, + "num_tokens": 172262330.0, + "step": 5260 + }, + { + "epoch": 0.43660336677999834, + "grad_norm": 0.9499651789665222, + "learning_rate": 6.929284675888859e-05, + "loss": 1.2166, + "mean_token_accuracy": 0.6865652456879616, + "num_tokens": 172426170.0, + "step": 5265 + }, + { + "epoch": 0.4370179948586118, + "grad_norm": 0.9373361468315125, + "learning_rate": 6.922606385006757e-05, + "loss": 1.2732, + "mean_token_accuracy": 0.6814699441194534, + "num_tokens": 172590010.0, + "step": 5270 + }, + { + "epoch": 0.4374326229372253, + "grad_norm": 0.9018722176551819, + "learning_rate": 6.91592406667192e-05, + "loss": 1.2672, + "mean_token_accuracy": 0.6811365976929664, + "num_tokens": 172753579.0, + "step": 5275 + }, + { + "epoch": 0.43784725101583877, + "grad_norm": 0.894157350063324, + "learning_rate": 6.909237734882384e-05, + "loss": 1.2316, + "mean_token_accuracy": 0.684879033267498, + "num_tokens": 172917419.0, + "step": 5280 + }, + { + "epoch": 0.4382618790944523, + "grad_norm": 0.9963103532791138, + "learning_rate": 6.902547403644601e-05, + "loss": 1.3413, + "mean_token_accuracy": 0.6668132960796356, + "num_tokens": 173081259.0, + "step": 5285 + }, + { + "epoch": 0.4386765071730658, + "grad_norm": 0.9677658677101135, + "learning_rate": 6.895853086973395e-05, + "loss": 1.3384, + "mean_token_accuracy": 0.6679078742861748, + "num_tokens": 173245094.0, + "step": 5290 + }, + { + "epoch": 0.43909113525167925, + "grad_norm": 0.8745771646499634, + "learning_rate": 6.88915479889194e-05, + "loss": 1.1847, + "mean_token_accuracy": 0.691142700612545, + "num_tokens": 173407960.0, + "step": 5295 + }, + { + "epoch": 0.43950576333029273, + "grad_norm": 0.9091145992279053, + "learning_rate": 6.882452553431728e-05, + "loss": 1.2406, + "mean_token_accuracy": 0.6878787890076637, + "num_tokens": 173571800.0, + "step": 5300 + }, + { + "epoch": 0.4399203914089062, + "grad_norm": 0.9397043585777283, + "learning_rate": 6.875746364632544e-05, + "loss": 1.2601, + "mean_token_accuracy": 0.6831928133964539, + "num_tokens": 173735640.0, + "step": 5305 + }, + { + "epoch": 0.4403350194875197, + "grad_norm": 0.8953580260276794, + "learning_rate": 6.86903624654243e-05, + "loss": 1.2124, + "mean_token_accuracy": 0.6883370012044907, + "num_tokens": 173899480.0, + "step": 5310 + }, + { + "epoch": 0.44074964756613316, + "grad_norm": 0.9146726727485657, + "learning_rate": 6.862322213217661e-05, + "loss": 1.2162, + "mean_token_accuracy": 0.6873961389064789, + "num_tokens": 174063320.0, + "step": 5315 + }, + { + "epoch": 0.44116427564474664, + "grad_norm": 0.9711309671401978, + "learning_rate": 6.855604278722716e-05, + "loss": 1.2564, + "mean_token_accuracy": 0.682221406698227, + "num_tokens": 174227160.0, + "step": 5320 + }, + { + "epoch": 0.44157890372336017, + "grad_norm": 0.9068252444267273, + "learning_rate": 6.84888245713024e-05, + "loss": 1.231, + "mean_token_accuracy": 0.68525170981884, + "num_tokens": 174391000.0, + "step": 5325 + }, + { + "epoch": 0.44199353180197365, + "grad_norm": 0.9210191369056702, + "learning_rate": 6.842156762521026e-05, + "loss": 1.258, + "mean_token_accuracy": 0.6829973116517067, + "num_tokens": 174554840.0, + "step": 5330 + }, + { + "epoch": 0.4424081598805871, + "grad_norm": 0.9596512913703918, + "learning_rate": 6.835427208983977e-05, + "loss": 1.2529, + "mean_token_accuracy": 0.6870601147413253, + "num_tokens": 174718680.0, + "step": 5335 + }, + { + "epoch": 0.4428227879592006, + "grad_norm": 0.9190239310264587, + "learning_rate": 6.828693810616083e-05, + "loss": 1.2416, + "mean_token_accuracy": 0.6829301044344902, + "num_tokens": 174882520.0, + "step": 5340 + }, + { + "epoch": 0.4432374160378141, + "grad_norm": 0.935261070728302, + "learning_rate": 6.821956581522382e-05, + "loss": 1.2233, + "mean_token_accuracy": 0.6880627810955048, + "num_tokens": 175045555.0, + "step": 5345 + }, + { + "epoch": 0.44365204411642756, + "grad_norm": 0.9144822955131531, + "learning_rate": 6.815215535815944e-05, + "loss": 1.2118, + "mean_token_accuracy": 0.6848851442337036, + "num_tokens": 175209395.0, + "step": 5350 + }, + { + "epoch": 0.44406667219504103, + "grad_norm": 0.9519127607345581, + "learning_rate": 6.80847068761783e-05, + "loss": 1.2238, + "mean_token_accuracy": 0.6834555193781853, + "num_tokens": 175373235.0, + "step": 5355 + }, + { + "epoch": 0.4444813002736545, + "grad_norm": 0.9514979124069214, + "learning_rate": 6.801722051057064e-05, + "loss": 1.163, + "mean_token_accuracy": 0.6971590876579284, + "num_tokens": 175537075.0, + "step": 5360 + }, + { + "epoch": 0.444895928352268, + "grad_norm": 0.9447032809257507, + "learning_rate": 6.794969640270611e-05, + "loss": 1.2814, + "mean_token_accuracy": 0.6770955502986908, + "num_tokens": 175700915.0, + "step": 5365 + }, + { + "epoch": 0.4453105564308815, + "grad_norm": 0.9713386297225952, + "learning_rate": 6.788213469403342e-05, + "loss": 1.2513, + "mean_token_accuracy": 0.6837243407964706, + "num_tokens": 175864755.0, + "step": 5370 + }, + { + "epoch": 0.445725184509495, + "grad_norm": 0.9300346970558167, + "learning_rate": 6.781453552608e-05, + "loss": 1.1998, + "mean_token_accuracy": 0.6904753193259239, + "num_tokens": 176028595.0, + "step": 5375 + }, + { + "epoch": 0.4461398125881085, + "grad_norm": 0.8931049108505249, + "learning_rate": 6.774689904045176e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6878054708242416, + "num_tokens": 176192435.0, + "step": 5380 + }, + { + "epoch": 0.44655444066672195, + "grad_norm": 0.9139026999473572, + "learning_rate": 6.767922537883283e-05, + "loss": 1.1994, + "mean_token_accuracy": 0.6937866598367691, + "num_tokens": 176356275.0, + "step": 5385 + }, + { + "epoch": 0.44696906874533543, + "grad_norm": 0.8814029097557068, + "learning_rate": 6.761151468298514e-05, + "loss": 1.2196, + "mean_token_accuracy": 0.6869806960225106, + "num_tokens": 176520115.0, + "step": 5390 + }, + { + "epoch": 0.4473836968239489, + "grad_norm": 0.8821919560432434, + "learning_rate": 6.75437670947483e-05, + "loss": 1.2221, + "mean_token_accuracy": 0.6869073823094368, + "num_tokens": 176683955.0, + "step": 5395 + }, + { + "epoch": 0.4477983249025624, + "grad_norm": 0.9081880450248718, + "learning_rate": 6.74759827560391e-05, + "loss": 1.2343, + "mean_token_accuracy": 0.6858565524220467, + "num_tokens": 176847795.0, + "step": 5400 + }, + { + "epoch": 0.44821295298117586, + "grad_norm": 0.9185976982116699, + "learning_rate": 6.740816180885135e-05, + "loss": 1.2193, + "mean_token_accuracy": 0.68883186429739, + "num_tokens": 177011635.0, + "step": 5405 + }, + { + "epoch": 0.4486275810597894, + "grad_norm": 0.9622422456741333, + "learning_rate": 6.73403043952556e-05, + "loss": 1.218, + "mean_token_accuracy": 0.689094577729702, + "num_tokens": 177175475.0, + "step": 5410 + }, + { + "epoch": 0.44904220913840287, + "grad_norm": 0.8946992754936218, + "learning_rate": 6.72724106573987e-05, + "loss": 1.305, + "mean_token_accuracy": 0.6759164288640023, + "num_tokens": 177339315.0, + "step": 5415 + }, + { + "epoch": 0.44945683721701635, + "grad_norm": 0.8865119218826294, + "learning_rate": 6.720448073750367e-05, + "loss": 1.1819, + "mean_token_accuracy": 0.6931940361857414, + "num_tokens": 177503155.0, + "step": 5420 + }, + { + "epoch": 0.4498714652956298, + "grad_norm": 0.9959166049957275, + "learning_rate": 6.713651477786926e-05, + "loss": 1.1997, + "mean_token_accuracy": 0.6922165229916573, + "num_tokens": 177666995.0, + "step": 5425 + }, + { + "epoch": 0.4502860933742433, + "grad_norm": 0.9162701964378357, + "learning_rate": 6.706851292086975e-05, + "loss": 1.2072, + "mean_token_accuracy": 0.6892473086714744, + "num_tokens": 177830835.0, + "step": 5430 + }, + { + "epoch": 0.4507007214528568, + "grad_norm": 0.9049626588821411, + "learning_rate": 6.700047530895463e-05, + "loss": 1.1919, + "mean_token_accuracy": 0.6913673028349876, + "num_tokens": 177994675.0, + "step": 5435 + }, + { + "epoch": 0.45111534953147026, + "grad_norm": 0.9806783199310303, + "learning_rate": 6.693240208464827e-05, + "loss": 1.3308, + "mean_token_accuracy": 0.6697458475828171, + "num_tokens": 178158515.0, + "step": 5440 + }, + { + "epoch": 0.45152997761008373, + "grad_norm": 0.9126311540603638, + "learning_rate": 6.686429339054961e-05, + "loss": 1.2807, + "mean_token_accuracy": 0.6787044301629066, + "num_tokens": 178322221.0, + "step": 5445 + }, + { + "epoch": 0.45194460568869727, + "grad_norm": 0.9570632576942444, + "learning_rate": 6.679614936933196e-05, + "loss": 1.2483, + "mean_token_accuracy": 0.6813285246491432, + "num_tokens": 178485110.0, + "step": 5450 + }, + { + "epoch": 0.45235923376731074, + "grad_norm": 0.9193328022956848, + "learning_rate": 6.67279701637426e-05, + "loss": 1.3085, + "mean_token_accuracy": 0.671994136273861, + "num_tokens": 178648950.0, + "step": 5455 + }, + { + "epoch": 0.4527738618459242, + "grad_norm": 0.9053947925567627, + "learning_rate": 6.665975591660247e-05, + "loss": 1.2362, + "mean_token_accuracy": 0.6855449616909027, + "num_tokens": 178812790.0, + "step": 5460 + }, + { + "epoch": 0.4531884899245377, + "grad_norm": 0.9453768730163574, + "learning_rate": 6.659150677080598e-05, + "loss": 1.24, + "mean_token_accuracy": 0.6845857784152031, + "num_tokens": 178976630.0, + "step": 5465 + }, + { + "epoch": 0.4536031180031512, + "grad_norm": 0.9386938810348511, + "learning_rate": 6.652322286932061e-05, + "loss": 1.1507, + "mean_token_accuracy": 0.6952223852276802, + "num_tokens": 179140470.0, + "step": 5470 + }, + { + "epoch": 0.45401774608176465, + "grad_norm": 0.9033142328262329, + "learning_rate": 6.645490435518668e-05, + "loss": 1.2315, + "mean_token_accuracy": 0.685832105576992, + "num_tokens": 179304310.0, + "step": 5475 + }, + { + "epoch": 0.4544323741603781, + "grad_norm": 0.8971104621887207, + "learning_rate": 6.638655137151695e-05, + "loss": 1.1798, + "mean_token_accuracy": 0.6961326941847801, + "num_tokens": 179468150.0, + "step": 5480 + }, + { + "epoch": 0.4548470022389916, + "grad_norm": 0.8761679530143738, + "learning_rate": 6.631816406149648e-05, + "loss": 1.2451, + "mean_token_accuracy": 0.6850073292851449, + "num_tokens": 179631990.0, + "step": 5485 + }, + { + "epoch": 0.4552616303176051, + "grad_norm": 0.9151933193206787, + "learning_rate": 6.624974256838215e-05, + "loss": 1.2634, + "mean_token_accuracy": 0.6781891547143459, + "num_tokens": 179795830.0, + "step": 5490 + }, + { + "epoch": 0.4556762583962186, + "grad_norm": 0.8931605815887451, + "learning_rate": 6.618128703550246e-05, + "loss": 1.1539, + "mean_token_accuracy": 0.6978433474898338, + "num_tokens": 179959670.0, + "step": 5495 + }, + { + "epoch": 0.4560908864748321, + "grad_norm": 0.8990060091018677, + "learning_rate": 6.61127976062573e-05, + "loss": 1.2154, + "mean_token_accuracy": 0.6862170100212097, + "num_tokens": 180123510.0, + "step": 5500 + }, + { + "epoch": 0.45650551455344557, + "grad_norm": 0.8971739411354065, + "learning_rate": 6.604427442411746e-05, + "loss": 1.2498, + "mean_token_accuracy": 0.6818304002285004, + "num_tokens": 180287350.0, + "step": 5505 + }, + { + "epoch": 0.45692014263205905, + "grad_norm": 0.9555619359016418, + "learning_rate": 6.597571763262449e-05, + "loss": 1.2663, + "mean_token_accuracy": 0.6820014685392379, + "num_tokens": 180451190.0, + "step": 5510 + }, + { + "epoch": 0.4573347707106725, + "grad_norm": 0.9414533972740173, + "learning_rate": 6.590712737539031e-05, + "loss": 1.2378, + "mean_token_accuracy": 0.6874816685914993, + "num_tokens": 180615030.0, + "step": 5515 + }, + { + "epoch": 0.457749398789286, + "grad_norm": 0.9812831282615662, + "learning_rate": 6.5838503796097e-05, + "loss": 1.19, + "mean_token_accuracy": 0.6898047953844071, + "num_tokens": 180778332.0, + "step": 5520 + }, + { + "epoch": 0.4581640268678995, + "grad_norm": 0.9169635772705078, + "learning_rate": 6.576984703849639e-05, + "loss": 1.2571, + "mean_token_accuracy": 0.6822153061628342, + "num_tokens": 180942172.0, + "step": 5525 + }, + { + "epoch": 0.45857865494651295, + "grad_norm": 0.9392279982566833, + "learning_rate": 6.570115724640984e-05, + "loss": 1.229, + "mean_token_accuracy": 0.6863025456666947, + "num_tokens": 181106012.0, + "step": 5530 + }, + { + "epoch": 0.4589932830251265, + "grad_norm": 0.9157460927963257, + "learning_rate": 6.563243456372788e-05, + "loss": 1.2381, + "mean_token_accuracy": 0.6850562125444413, + "num_tokens": 181269852.0, + "step": 5535 + }, + { + "epoch": 0.45940791110373996, + "grad_norm": 0.9607328772544861, + "learning_rate": 6.556367913441e-05, + "loss": 1.2847, + "mean_token_accuracy": 0.6747739523649215, + "num_tokens": 181433692.0, + "step": 5540 + }, + { + "epoch": 0.45982253918235344, + "grad_norm": 0.8823652863502502, + "learning_rate": 6.54948911024842e-05, + "loss": 1.2308, + "mean_token_accuracy": 0.6844797477126121, + "num_tokens": 181596791.0, + "step": 5545 + }, + { + "epoch": 0.4602371672609669, + "grad_norm": 0.8790220022201538, + "learning_rate": 6.542607061204683e-05, + "loss": 1.2398, + "mean_token_accuracy": 0.6900232210755348, + "num_tokens": 181760631.0, + "step": 5550 + }, + { + "epoch": 0.4606517953395804, + "grad_norm": 0.9489864110946655, + "learning_rate": 6.535721780726228e-05, + "loss": 1.2872, + "mean_token_accuracy": 0.6787390008568763, + "num_tokens": 181924471.0, + "step": 5555 + }, + { + "epoch": 0.46106642341819387, + "grad_norm": 0.9242657423019409, + "learning_rate": 6.528833283236249e-05, + "loss": 1.2546, + "mean_token_accuracy": 0.6824230194091797, + "num_tokens": 182088311.0, + "step": 5560 + }, + { + "epoch": 0.46148105149680735, + "grad_norm": 1.01797616481781, + "learning_rate": 6.521941583164695e-05, + "loss": 1.3627, + "mean_token_accuracy": 0.6646261021494866, + "num_tokens": 182252151.0, + "step": 5565 + }, + { + "epoch": 0.4618956795754208, + "grad_norm": 0.989361584186554, + "learning_rate": 6.515046694948213e-05, + "loss": 1.2577, + "mean_token_accuracy": 0.6792399823665619, + "num_tokens": 182415991.0, + "step": 5570 + }, + { + "epoch": 0.46231030765403436, + "grad_norm": 0.9516648650169373, + "learning_rate": 6.508148633030132e-05, + "loss": 1.2551, + "mean_token_accuracy": 0.6816052556037903, + "num_tokens": 182579752.0, + "step": 5575 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.9034631252288818, + "learning_rate": 6.501247411860429e-05, + "loss": 1.1766, + "mean_token_accuracy": 0.6908907622098923, + "num_tokens": 182743592.0, + "step": 5580 + }, + { + "epoch": 0.4631395638112613, + "grad_norm": 0.9333795309066772, + "learning_rate": 6.494343045895702e-05, + "loss": 1.2323, + "mean_token_accuracy": 0.6843536138534546, + "num_tokens": 182907432.0, + "step": 5585 + }, + { + "epoch": 0.4635541918898748, + "grad_norm": 0.9616758227348328, + "learning_rate": 6.487435549599132e-05, + "loss": 1.2525, + "mean_token_accuracy": 0.683321113884449, + "num_tokens": 183071272.0, + "step": 5590 + }, + { + "epoch": 0.46396881996848827, + "grad_norm": 0.9114391207695007, + "learning_rate": 6.480524937440456e-05, + "loss": 1.2472, + "mean_token_accuracy": 0.6821236565709115, + "num_tokens": 183235112.0, + "step": 5595 + }, + { + "epoch": 0.46438344804710174, + "grad_norm": 0.9227198958396912, + "learning_rate": 6.473611223895947e-05, + "loss": 1.249, + "mean_token_accuracy": 0.68401148468256, + "num_tokens": 183398952.0, + "step": 5600 + }, + { + "epoch": 0.4647980761257152, + "grad_norm": 0.9138762950897217, + "learning_rate": 6.466694423448365e-05, + "loss": 1.2531, + "mean_token_accuracy": 0.6837206363677979, + "num_tokens": 183562480.0, + "step": 5605 + }, + { + "epoch": 0.4652127042043287, + "grad_norm": 0.8999396562576294, + "learning_rate": 6.459774550586942e-05, + "loss": 1.2405, + "mean_token_accuracy": 0.6830950632691384, + "num_tokens": 183726320.0, + "step": 5610 + }, + { + "epoch": 0.4656273322829422, + "grad_norm": 0.9314269423484802, + "learning_rate": 6.452851619807342e-05, + "loss": 1.2525, + "mean_token_accuracy": 0.6828629061579704, + "num_tokens": 183890160.0, + "step": 5615 + }, + { + "epoch": 0.4660419603615557, + "grad_norm": 0.9825212955474854, + "learning_rate": 6.445925645611641e-05, + "loss": 1.2568, + "mean_token_accuracy": 0.6807856827974319, + "num_tokens": 184054000.0, + "step": 5620 + }, + { + "epoch": 0.4664565884401692, + "grad_norm": 0.9416497945785522, + "learning_rate": 6.438996642508283e-05, + "loss": 1.2582, + "mean_token_accuracy": 0.683064517378807, + "num_tokens": 184217840.0, + "step": 5625 + }, + { + "epoch": 0.46687121651878266, + "grad_norm": 0.9385108947753906, + "learning_rate": 6.432064625012064e-05, + "loss": 1.2555, + "mean_token_accuracy": 0.6771627545356751, + "num_tokens": 184381680.0, + "step": 5630 + }, + { + "epoch": 0.46728584459739614, + "grad_norm": 0.9490394592285156, + "learning_rate": 6.425129607644089e-05, + "loss": 1.2297, + "mean_token_accuracy": 0.684237539768219, + "num_tokens": 184545520.0, + "step": 5635 + }, + { + "epoch": 0.4677004726760096, + "grad_norm": 0.9528844952583313, + "learning_rate": 6.418191604931748e-05, + "loss": 1.1835, + "mean_token_accuracy": 0.6946603089571, + "num_tokens": 184709360.0, + "step": 5640 + }, + { + "epoch": 0.4681151007546231, + "grad_norm": 0.8833242654800415, + "learning_rate": 6.411250631408687e-05, + "loss": 1.2603, + "mean_token_accuracy": 0.6811895027756691, + "num_tokens": 184872732.0, + "step": 5645 + }, + { + "epoch": 0.46852972883323657, + "grad_norm": 0.9160747528076172, + "learning_rate": 6.404306701614773e-05, + "loss": 1.2691, + "mean_token_accuracy": 0.6816104598343372, + "num_tokens": 185036572.0, + "step": 5650 + }, + { + "epoch": 0.46894435691185005, + "grad_norm": 0.9168289303779602, + "learning_rate": 6.397359830096067e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.6794538110494613, + "num_tokens": 185200412.0, + "step": 5655 + }, + { + "epoch": 0.4693589849904636, + "grad_norm": 0.9233760833740234, + "learning_rate": 6.390410031404792e-05, + "loss": 1.2416, + "mean_token_accuracy": 0.6828624308109283, + "num_tokens": 185363403.0, + "step": 5660 + }, + { + "epoch": 0.46977361306907706, + "grad_norm": 0.9137739539146423, + "learning_rate": 6.383457320099303e-05, + "loss": 1.1292, + "mean_token_accuracy": 0.7065860241651535, + "num_tokens": 185527243.0, + "step": 5665 + }, + { + "epoch": 0.47018824114769053, + "grad_norm": 0.9377095699310303, + "learning_rate": 6.376501710744056e-05, + "loss": 1.1629, + "mean_token_accuracy": 0.6961693525314331, + "num_tokens": 185691083.0, + "step": 5670 + }, + { + "epoch": 0.470602869226304, + "grad_norm": 0.8585503101348877, + "learning_rate": 6.369543217909577e-05, + "loss": 1.2209, + "mean_token_accuracy": 0.6888868555426597, + "num_tokens": 185854923.0, + "step": 5675 + }, + { + "epoch": 0.4710174973049175, + "grad_norm": 0.9265078902244568, + "learning_rate": 6.362581856172433e-05, + "loss": 1.2307, + "mean_token_accuracy": 0.6839687183499337, + "num_tokens": 186018763.0, + "step": 5680 + }, + { + "epoch": 0.47143212538353096, + "grad_norm": 0.8388371467590332, + "learning_rate": 6.355617640115203e-05, + "loss": 1.2153, + "mean_token_accuracy": 0.6901798859238625, + "num_tokens": 186182351.0, + "step": 5685 + }, + { + "epoch": 0.47184675346214444, + "grad_norm": 0.898435115814209, + "learning_rate": 6.348650584326439e-05, + "loss": 1.2653, + "mean_token_accuracy": 0.6818548381328583, + "num_tokens": 186346191.0, + "step": 5690 + }, + { + "epoch": 0.4722613815407579, + "grad_norm": 0.9321533441543579, + "learning_rate": 6.341680703400651e-05, + "loss": 1.2397, + "mean_token_accuracy": 0.6852517113089561, + "num_tokens": 186510031.0, + "step": 5695 + }, + { + "epoch": 0.47267600961937145, + "grad_norm": 0.9423805475234985, + "learning_rate": 6.334708011938258e-05, + "loss": 1.2926, + "mean_token_accuracy": 0.6789161786437035, + "num_tokens": 186673871.0, + "step": 5700 + }, + { + "epoch": 0.47309063769798493, + "grad_norm": 0.9340402483940125, + "learning_rate": 6.327732524545571e-05, + "loss": 1.2594, + "mean_token_accuracy": 0.6856793776154518, + "num_tokens": 186837711.0, + "step": 5705 + }, + { + "epoch": 0.4735052657765984, + "grad_norm": 0.9308452010154724, + "learning_rate": 6.320754255834758e-05, + "loss": 1.2601, + "mean_token_accuracy": 0.6822519540786743, + "num_tokens": 187001551.0, + "step": 5710 + }, + { + "epoch": 0.4739198938552119, + "grad_norm": 0.8753827810287476, + "learning_rate": 6.313773220423812e-05, + "loss": 1.1713, + "mean_token_accuracy": 0.69445870667696, + "num_tokens": 187165391.0, + "step": 5715 + }, + { + "epoch": 0.47433452193382536, + "grad_norm": 0.9730857610702515, + "learning_rate": 6.30678943293652e-05, + "loss": 1.3436, + "mean_token_accuracy": 0.6642106577754021, + "num_tokens": 187329231.0, + "step": 5720 + }, + { + "epoch": 0.47474915001243884, + "grad_norm": 0.9346562623977661, + "learning_rate": 6.29980290800244e-05, + "loss": 1.2378, + "mean_token_accuracy": 0.6841153427958488, + "num_tokens": 187493071.0, + "step": 5725 + }, + { + "epoch": 0.4751637780910523, + "grad_norm": 0.9891049861907959, + "learning_rate": 6.292813660256856e-05, + "loss": 1.3049, + "mean_token_accuracy": 0.6780832409858704, + "num_tokens": 187656534.0, + "step": 5730 + }, + { + "epoch": 0.4755784061696658, + "grad_norm": 0.8796224594116211, + "learning_rate": 6.285821704340765e-05, + "loss": 1.2258, + "mean_token_accuracy": 0.6818120688199997, + "num_tokens": 187820374.0, + "step": 5735 + }, + { + "epoch": 0.47599303424827927, + "grad_norm": 0.9198670387268066, + "learning_rate": 6.278827054900828e-05, + "loss": 1.2519, + "mean_token_accuracy": 0.6800464317202568, + "num_tokens": 187984214.0, + "step": 5740 + }, + { + "epoch": 0.4764076623268928, + "grad_norm": 0.9337548613548279, + "learning_rate": 6.271829726589355e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6787939861416816, + "num_tokens": 188148054.0, + "step": 5745 + }, + { + "epoch": 0.4768222904055063, + "grad_norm": 0.9277735948562622, + "learning_rate": 6.264829734064264e-05, + "loss": 1.338, + "mean_token_accuracy": 0.668963835388422, + "num_tokens": 188311894.0, + "step": 5750 + }, + { + "epoch": 0.47723691848411975, + "grad_norm": 0.9833148121833801, + "learning_rate": 6.257827091989055e-05, + "loss": 1.2735, + "mean_token_accuracy": 0.6778641879558563, + "num_tokens": 188474887.0, + "step": 5755 + }, + { + "epoch": 0.47765154656273323, + "grad_norm": 0.9519672989845276, + "learning_rate": 6.250821815032779e-05, + "loss": 1.2091, + "mean_token_accuracy": 0.6879032239317894, + "num_tokens": 188638727.0, + "step": 5760 + }, + { + "epoch": 0.4780661746413467, + "grad_norm": 0.9036309719085693, + "learning_rate": 6.243813917870005e-05, + "loss": 1.2139, + "mean_token_accuracy": 0.6871397942304611, + "num_tokens": 188801674.0, + "step": 5765 + }, + { + "epoch": 0.4784808027199602, + "grad_norm": 0.9152082204818726, + "learning_rate": 6.236803415180792e-05, + "loss": 1.2659, + "mean_token_accuracy": 0.6844013914465904, + "num_tokens": 188964908.0, + "step": 5770 + }, + { + "epoch": 0.47889543079857366, + "grad_norm": 0.9431639313697815, + "learning_rate": 6.229790321650661e-05, + "loss": 1.1818, + "mean_token_accuracy": 0.6987719893455505, + "num_tokens": 189128748.0, + "step": 5775 + }, + { + "epoch": 0.47931005887718714, + "grad_norm": 0.9656767845153809, + "learning_rate": 6.22277465197055e-05, + "loss": 1.303, + "mean_token_accuracy": 0.6747922793030738, + "num_tokens": 189292588.0, + "step": 5780 + }, + { + "epoch": 0.47972468695580067, + "grad_norm": 0.8947068452835083, + "learning_rate": 6.215756420836801e-05, + "loss": 1.2374, + "mean_token_accuracy": 0.6805779531598091, + "num_tokens": 189456428.0, + "step": 5785 + }, + { + "epoch": 0.48013931503441415, + "grad_norm": 0.9972400665283203, + "learning_rate": 6.208735642951121e-05, + "loss": 1.281, + "mean_token_accuracy": 0.6826551795005799, + "num_tokens": 189620268.0, + "step": 5790 + }, + { + "epoch": 0.4805539431130276, + "grad_norm": 0.994186520576477, + "learning_rate": 6.20171233302055e-05, + "loss": 1.2425, + "mean_token_accuracy": 0.6841886594891549, + "num_tokens": 189784108.0, + "step": 5795 + }, + { + "epoch": 0.4809685711916411, + "grad_norm": 0.9338303208351135, + "learning_rate": 6.194686505757437e-05, + "loss": 1.165, + "mean_token_accuracy": 0.6987719908356667, + "num_tokens": 189947948.0, + "step": 5800 + }, + { + "epoch": 0.4813831992702546, + "grad_norm": 0.8793720006942749, + "learning_rate": 6.187658175879397e-05, + "loss": 1.185, + "mean_token_accuracy": 0.6946175426244736, + "num_tokens": 190111788.0, + "step": 5805 + }, + { + "epoch": 0.48179782734886806, + "grad_norm": 0.9103440046310425, + "learning_rate": 6.18062735810929e-05, + "loss": 1.1409, + "mean_token_accuracy": 0.7049303486943245, + "num_tokens": 190275628.0, + "step": 5810 + }, + { + "epoch": 0.48221245542748153, + "grad_norm": 0.9415132403373718, + "learning_rate": 6.173594067175192e-05, + "loss": 1.1824, + "mean_token_accuracy": 0.696218229830265, + "num_tokens": 190439468.0, + "step": 5815 + }, + { + "epoch": 0.482627083506095, + "grad_norm": 0.9324992895126343, + "learning_rate": 6.166558317810353e-05, + "loss": 1.2044, + "mean_token_accuracy": 0.689943790435791, + "num_tokens": 190603308.0, + "step": 5820 + }, + { + "epoch": 0.48304171158470854, + "grad_norm": 0.9188627004623413, + "learning_rate": 6.159520124753179e-05, + "loss": 1.2093, + "mean_token_accuracy": 0.6883675456047058, + "num_tokens": 190767148.0, + "step": 5825 + }, + { + "epoch": 0.483456339663322, + "grad_norm": 0.873873233795166, + "learning_rate": 6.152479502747189e-05, + "loss": 1.2675, + "mean_token_accuracy": 0.6782244190573692, + "num_tokens": 190930407.0, + "step": 5830 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9298220276832581, + "learning_rate": 6.145436466540995e-05, + "loss": 1.2797, + "mean_token_accuracy": 0.6794084221124649, + "num_tokens": 191093428.0, + "step": 5835 + }, + { + "epoch": 0.484285595820549, + "grad_norm": 0.8735694289207458, + "learning_rate": 6.138391030888267e-05, + "loss": 1.2099, + "mean_token_accuracy": 0.6898154929280281, + "num_tokens": 191257268.0, + "step": 5840 + }, + { + "epoch": 0.48470022389916245, + "grad_norm": 0.9457949995994568, + "learning_rate": 6.131343210547694e-05, + "loss": 1.1883, + "mean_token_accuracy": 0.6914406135678292, + "num_tokens": 191421108.0, + "step": 5845 + }, + { + "epoch": 0.48511485197777593, + "grad_norm": 0.909713864326477, + "learning_rate": 6.124293020282969e-05, + "loss": 1.1569, + "mean_token_accuracy": 0.698619256913662, + "num_tokens": 191584948.0, + "step": 5850 + }, + { + "epoch": 0.4855294800563894, + "grad_norm": 0.9126371145248413, + "learning_rate": 6.117240474862743e-05, + "loss": 1.2108, + "mean_token_accuracy": 0.6895588994026184, + "num_tokens": 191748788.0, + "step": 5855 + }, + { + "epoch": 0.4859441081350029, + "grad_norm": 0.8835738301277161, + "learning_rate": 6.110185589060608e-05, + "loss": 1.3032, + "mean_token_accuracy": 0.6763624176383018, + "num_tokens": 191912628.0, + "step": 5860 + }, + { + "epoch": 0.48635873621361636, + "grad_norm": 0.9455219507217407, + "learning_rate": 6.1031283776550475e-05, + "loss": 1.1685, + "mean_token_accuracy": 0.6940554723143577, + "num_tokens": 192076468.0, + "step": 5865 + }, + { + "epoch": 0.4867733642922299, + "grad_norm": 0.9001264572143555, + "learning_rate": 6.096068855429429e-05, + "loss": 1.1785, + "mean_token_accuracy": 0.6976967230439186, + "num_tokens": 192240308.0, + "step": 5870 + }, + { + "epoch": 0.48718799237084337, + "grad_norm": 0.9531750082969666, + "learning_rate": 6.08900703717195e-05, + "loss": 1.1979, + "mean_token_accuracy": 0.6936278119683266, + "num_tokens": 192404148.0, + "step": 5875 + }, + { + "epoch": 0.48760262044945685, + "grad_norm": 0.9018110036849976, + "learning_rate": 6.081942937675625e-05, + "loss": 1.1978, + "mean_token_accuracy": 0.6919415920972825, + "num_tokens": 192567988.0, + "step": 5880 + }, + { + "epoch": 0.4880172485280703, + "grad_norm": 0.9620280861854553, + "learning_rate": 6.074876571738246e-05, + "loss": 1.2369, + "mean_token_accuracy": 0.6848770201206207, + "num_tokens": 192731608.0, + "step": 5885 + }, + { + "epoch": 0.4884318766066838, + "grad_norm": 0.931185781955719, + "learning_rate": 6.0678079541623475e-05, + "loss": 1.1997, + "mean_token_accuracy": 0.6906952559947968, + "num_tokens": 192895448.0, + "step": 5890 + }, + { + "epoch": 0.4888465046852973, + "grad_norm": 0.93744295835495, + "learning_rate": 6.060737099755189e-05, + "loss": 1.2619, + "mean_token_accuracy": 0.6782441407442092, + "num_tokens": 193059288.0, + "step": 5895 + }, + { + "epoch": 0.48926113276391076, + "grad_norm": 0.8988864421844482, + "learning_rate": 6.053664023328708e-05, + "loss": 1.239, + "mean_token_accuracy": 0.6839809373021126, + "num_tokens": 193223128.0, + "step": 5900 + }, + { + "epoch": 0.48967576084252423, + "grad_norm": 0.9602072834968567, + "learning_rate": 6.046588739699502e-05, + "loss": 1.3042, + "mean_token_accuracy": 0.6769978016614914, + "num_tokens": 193386968.0, + "step": 5905 + }, + { + "epoch": 0.49009038892113777, + "grad_norm": 0.9107131361961365, + "learning_rate": 6.039511263688789e-05, + "loss": 1.1768, + "mean_token_accuracy": 0.6962706178426743, + "num_tokens": 193550218.0, + "step": 5910 + }, + { + "epoch": 0.49050501699975124, + "grad_norm": 0.945172905921936, + "learning_rate": 6.0324316101223796e-05, + "loss": 1.2521, + "mean_token_accuracy": 0.6807856783270836, + "num_tokens": 193714058.0, + "step": 5915 + }, + { + "epoch": 0.4909196450783647, + "grad_norm": 0.8420608639717102, + "learning_rate": 6.0253497938306494e-05, + "loss": 1.1644, + "mean_token_accuracy": 0.700329914689064, + "num_tokens": 193877898.0, + "step": 5920 + }, + { + "epoch": 0.4913342731569782, + "grad_norm": 0.8994563221931458, + "learning_rate": 6.0182658296485005e-05, + "loss": 1.2116, + "mean_token_accuracy": 0.691538368165493, + "num_tokens": 194041738.0, + "step": 5925 + }, + { + "epoch": 0.4917489012355917, + "grad_norm": 0.9086576700210571, + "learning_rate": 6.011179732415335e-05, + "loss": 1.1416, + "mean_token_accuracy": 0.6986620262265205, + "num_tokens": 194205578.0, + "step": 5930 + }, + { + "epoch": 0.49216352931420515, + "grad_norm": 0.8597748875617981, + "learning_rate": 6.0040915169750265e-05, + "loss": 1.2576, + "mean_token_accuracy": 0.6861559227108955, + "num_tokens": 194369418.0, + "step": 5935 + }, + { + "epoch": 0.4925781573928186, + "grad_norm": 0.9271917343139648, + "learning_rate": 5.997001198175882e-05, + "loss": 1.2655, + "mean_token_accuracy": 0.6808162242174148, + "num_tokens": 194533258.0, + "step": 5940 + }, + { + "epoch": 0.4929927854714321, + "grad_norm": 0.9166371822357178, + "learning_rate": 5.989908790870616e-05, + "loss": 1.3046, + "mean_token_accuracy": 0.6792445093393326, + "num_tokens": 194696193.0, + "step": 5945 + }, + { + "epoch": 0.4934074135500456, + "grad_norm": 0.9166553616523743, + "learning_rate": 5.9828143099163206e-05, + "loss": 1.27, + "mean_token_accuracy": 0.6806940406560897, + "num_tokens": 194860033.0, + "step": 5950 + }, + { + "epoch": 0.4938220416286591, + "grad_norm": 0.924065351486206, + "learning_rate": 5.975717770174424e-05, + "loss": 1.2329, + "mean_token_accuracy": 0.6834431111812591, + "num_tokens": 195023520.0, + "step": 5955 + }, + { + "epoch": 0.4942366697072726, + "grad_norm": 0.907636284828186, + "learning_rate": 5.968619186510678e-05, + "loss": 1.2479, + "mean_token_accuracy": 0.681549359858036, + "num_tokens": 195187360.0, + "step": 5960 + }, + { + "epoch": 0.49465129778588607, + "grad_norm": 0.9144726991653442, + "learning_rate": 5.961518573795105e-05, + "loss": 1.257, + "mean_token_accuracy": 0.6842314228415489, + "num_tokens": 195351200.0, + "step": 5965 + }, + { + "epoch": 0.49506592586449955, + "grad_norm": 0.9033091068267822, + "learning_rate": 5.9544159469019855e-05, + "loss": 1.2346, + "mean_token_accuracy": 0.6842741966247559, + "num_tokens": 195515040.0, + "step": 5970 + }, + { + "epoch": 0.495480553943113, + "grad_norm": 0.8993896842002869, + "learning_rate": 5.9473113207098194e-05, + "loss": 1.2514, + "mean_token_accuracy": 0.6854166716337204, + "num_tokens": 195678880.0, + "step": 5975 + }, + { + "epoch": 0.4958951820217265, + "grad_norm": 0.8590402603149414, + "learning_rate": 5.940204710101288e-05, + "loss": 1.3325, + "mean_token_accuracy": 0.6693731665611267, + "num_tokens": 195842720.0, + "step": 5980 + }, + { + "epoch": 0.49630981010034, + "grad_norm": 0.8978489637374878, + "learning_rate": 5.933096129963238e-05, + "loss": 1.1963, + "mean_token_accuracy": 0.6939699441194535, + "num_tokens": 196006560.0, + "step": 5985 + }, + { + "epoch": 0.49672443817895345, + "grad_norm": 0.9233525991439819, + "learning_rate": 5.925985595186634e-05, + "loss": 1.1436, + "mean_token_accuracy": 0.7024804502725601, + "num_tokens": 196170400.0, + "step": 5990 + }, + { + "epoch": 0.497139066257567, + "grad_norm": 0.9250538349151611, + "learning_rate": 5.9188731206665396e-05, + "loss": 1.2149, + "mean_token_accuracy": 0.6864369481801986, + "num_tokens": 196334240.0, + "step": 5995 + }, + { + "epoch": 0.49755369433618046, + "grad_norm": 0.8921429514884949, + "learning_rate": 5.911758721302082e-05, + "loss": 1.1808, + "mean_token_accuracy": 0.697421795129776, + "num_tokens": 196498080.0, + "step": 6000 + }, + { + "epoch": 0.49796832241479394, + "grad_norm": 0.9132588505744934, + "learning_rate": 5.904642411996418e-05, + "loss": 1.24, + "mean_token_accuracy": 0.6872372940182686, + "num_tokens": 196661920.0, + "step": 6005 + }, + { + "epoch": 0.4983829504934074, + "grad_norm": 0.906562864780426, + "learning_rate": 5.897524207656708e-05, + "loss": 1.2232, + "mean_token_accuracy": 0.6900476559996604, + "num_tokens": 196825760.0, + "step": 6010 + }, + { + "epoch": 0.4987975785720209, + "grad_norm": 0.9356030821800232, + "learning_rate": 5.890404123194081e-05, + "loss": 1.1835, + "mean_token_accuracy": 0.691752202808857, + "num_tokens": 196989600.0, + "step": 6015 + }, + { + "epoch": 0.49921220665063437, + "grad_norm": 0.9574745893478394, + "learning_rate": 5.883282173523603e-05, + "loss": 1.3666, + "mean_token_accuracy": 0.6597079634666443, + "num_tokens": 197153440.0, + "step": 6020 + }, + { + "epoch": 0.49962683472924785, + "grad_norm": 0.9683645963668823, + "learning_rate": 5.876158373564249e-05, + "loss": 1.2422, + "mean_token_accuracy": 0.6861742407083511, + "num_tokens": 197317280.0, + "step": 6025 + }, + { + "epoch": 0.5000414628078613, + "grad_norm": 0.9355904459953308, + "learning_rate": 5.869032738238871e-05, + "loss": 1.2374, + "mean_token_accuracy": 0.6827791914343834, + "num_tokens": 197480273.0, + "step": 6030 + }, + { + "epoch": 0.5004560908864748, + "grad_norm": 0.9154330492019653, + "learning_rate": 5.861905282474161e-05, + "loss": 1.274, + "mean_token_accuracy": 0.6810056194663048, + "num_tokens": 197644113.0, + "step": 6035 + }, + { + "epoch": 0.5008707189650883, + "grad_norm": 0.9223089814186096, + "learning_rate": 5.854776021200632e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6852827444672585, + "num_tokens": 197807814.0, + "step": 6040 + }, + { + "epoch": 0.5012853470437018, + "grad_norm": 0.8918419480323792, + "learning_rate": 5.847644969352569e-05, + "loss": 1.2313, + "mean_token_accuracy": 0.6858565464615822, + "num_tokens": 197971654.0, + "step": 6045 + }, + { + "epoch": 0.5016999751223152, + "grad_norm": 0.9654561281204224, + "learning_rate": 5.840512141868021e-05, + "loss": 1.243, + "mean_token_accuracy": 0.6822886109352112, + "num_tokens": 198135494.0, + "step": 6050 + }, + { + "epoch": 0.5021146032009288, + "grad_norm": 0.8958101868629456, + "learning_rate": 5.833377553688743e-05, + "loss": 1.251, + "mean_token_accuracy": 0.6814332827925682, + "num_tokens": 198299334.0, + "step": 6055 + }, + { + "epoch": 0.5025292312795423, + "grad_norm": 0.9199835062026978, + "learning_rate": 5.8262412197601856e-05, + "loss": 1.2257, + "mean_token_accuracy": 0.6889723837375641, + "num_tokens": 198463174.0, + "step": 6060 + }, + { + "epoch": 0.5029438593581558, + "grad_norm": 0.931423544883728, + "learning_rate": 5.819103155031459e-05, + "loss": 1.2309, + "mean_token_accuracy": 0.6848666816949844, + "num_tokens": 198626162.0, + "step": 6065 + }, + { + "epoch": 0.5033584874367693, + "grad_norm": 0.9099568724632263, + "learning_rate": 5.811963374455291e-05, + "loss": 1.2458, + "mean_token_accuracy": 0.6816876038908959, + "num_tokens": 198789346.0, + "step": 6070 + }, + { + "epoch": 0.5037731155153827, + "grad_norm": 0.8975064754486084, + "learning_rate": 5.80482189298801e-05, + "loss": 1.1754, + "mean_token_accuracy": 0.6951735079288482, + "num_tokens": 198953186.0, + "step": 6075 + }, + { + "epoch": 0.5041877435939962, + "grad_norm": 0.906932532787323, + "learning_rate": 5.797678725589507e-05, + "loss": 1.2362, + "mean_token_accuracy": 0.6864308372139931, + "num_tokens": 199117026.0, + "step": 6080 + }, + { + "epoch": 0.5046023716726097, + "grad_norm": 0.932145893573761, + "learning_rate": 5.790533887223201e-05, + "loss": 1.2296, + "mean_token_accuracy": 0.6862781062722206, + "num_tokens": 199280866.0, + "step": 6085 + }, + { + "epoch": 0.5050169997512232, + "grad_norm": 0.8927560448646545, + "learning_rate": 5.7833873928560134e-05, + "loss": 1.1698, + "mean_token_accuracy": 0.6978782877326012, + "num_tokens": 199444061.0, + "step": 6090 + }, + { + "epoch": 0.5054316278298366, + "grad_norm": 0.8670108318328857, + "learning_rate": 5.7762392574583356e-05, + "loss": 1.1605, + "mean_token_accuracy": 0.6997556135058403, + "num_tokens": 199607901.0, + "step": 6095 + }, + { + "epoch": 0.5058462559084501, + "grad_norm": 0.8638862371444702, + "learning_rate": 5.7690894960039953e-05, + "loss": 1.2513, + "mean_token_accuracy": 0.6832722336053848, + "num_tokens": 199771741.0, + "step": 6100 + }, + { + "epoch": 0.5062608839870636, + "grad_norm": 0.8774763345718384, + "learning_rate": 5.761938123470227e-05, + "loss": 1.177, + "mean_token_accuracy": 0.6960777178406715, + "num_tokens": 199935581.0, + "step": 6105 + }, + { + "epoch": 0.5066755120656771, + "grad_norm": 0.9620174169540405, + "learning_rate": 5.7547851548376405e-05, + "loss": 1.2632, + "mean_token_accuracy": 0.6802113875746727, + "num_tokens": 200099421.0, + "step": 6110 + }, + { + "epoch": 0.5070901401442905, + "grad_norm": 0.9467723965644836, + "learning_rate": 5.7476306050901876e-05, + "loss": 1.1143, + "mean_token_accuracy": 0.708003418147564, + "num_tokens": 200263261.0, + "step": 6115 + }, + { + "epoch": 0.507504768222904, + "grad_norm": 0.8758918642997742, + "learning_rate": 5.740474489215133e-05, + "loss": 1.1709, + "mean_token_accuracy": 0.6971163272857666, + "num_tokens": 200427101.0, + "step": 6120 + }, + { + "epoch": 0.5079193963015175, + "grad_norm": 0.9225305914878845, + "learning_rate": 5.733316822203022e-05, + "loss": 1.2875, + "mean_token_accuracy": 0.6764418363571167, + "num_tokens": 200590941.0, + "step": 6125 + }, + { + "epoch": 0.508334024380131, + "grad_norm": 0.9075090289115906, + "learning_rate": 5.7261576190476515e-05, + "loss": 1.1457, + "mean_token_accuracy": 0.7037573307752609, + "num_tokens": 200754781.0, + "step": 6130 + }, + { + "epoch": 0.5087486524587445, + "grad_norm": 0.8592643141746521, + "learning_rate": 5.7189968947460316e-05, + "loss": 1.1999, + "mean_token_accuracy": 0.6876955017447471, + "num_tokens": 200918621.0, + "step": 6135 + }, + { + "epoch": 0.509163280537358, + "grad_norm": 0.9491396546363831, + "learning_rate": 5.711834664298362e-05, + "loss": 1.2697, + "mean_token_accuracy": 0.6799853354692459, + "num_tokens": 201082461.0, + "step": 6140 + }, + { + "epoch": 0.5095779086159715, + "grad_norm": 0.9179980158805847, + "learning_rate": 5.704670942707997e-05, + "loss": 1.1817, + "mean_token_accuracy": 0.696865837275982, + "num_tokens": 201246301.0, + "step": 6145 + }, + { + "epoch": 0.509992536694585, + "grad_norm": 0.8991859555244446, + "learning_rate": 5.697505744981415e-05, + "loss": 1.2769, + "mean_token_accuracy": 0.6775598764419556, + "num_tokens": 201410141.0, + "step": 6150 + }, + { + "epoch": 0.5104071647731985, + "grad_norm": 0.8738629817962646, + "learning_rate": 5.690339086128187e-05, + "loss": 1.2654, + "mean_token_accuracy": 0.683230721950531, + "num_tokens": 201573685.0, + "step": 6155 + }, + { + "epoch": 0.510821792851812, + "grad_norm": 0.8947060108184814, + "learning_rate": 5.683170981160941e-05, + "loss": 1.2279, + "mean_token_accuracy": 0.688263687491417, + "num_tokens": 201737525.0, + "step": 6160 + }, + { + "epoch": 0.5112364209304254, + "grad_norm": 0.8794062733650208, + "learning_rate": 5.6760014450953406e-05, + "loss": 1.2145, + "mean_token_accuracy": 0.6862536638975143, + "num_tokens": 201901365.0, + "step": 6165 + }, + { + "epoch": 0.5116510490090389, + "grad_norm": 0.8858832716941833, + "learning_rate": 5.66883049295004e-05, + "loss": 1.1711, + "mean_token_accuracy": 0.6965915784239769, + "num_tokens": 202065168.0, + "step": 6170 + }, + { + "epoch": 0.5120656770876524, + "grad_norm": 0.8891280889511108, + "learning_rate": 5.6616581397466664e-05, + "loss": 1.3551, + "mean_token_accuracy": 0.6664528340101242, + "num_tokens": 202229008.0, + "step": 6175 + }, + { + "epoch": 0.5124803051662659, + "grad_norm": 0.8534021973609924, + "learning_rate": 5.654484400509778e-05, + "loss": 1.1872, + "mean_token_accuracy": 0.6963587433099747, + "num_tokens": 202392848.0, + "step": 6180 + }, + { + "epoch": 0.5128949332448793, + "grad_norm": 0.9156395792961121, + "learning_rate": 5.6473092902668366e-05, + "loss": 1.259, + "mean_token_accuracy": 0.6817326471209526, + "num_tokens": 202556688.0, + "step": 6185 + }, + { + "epoch": 0.5133095613234928, + "grad_norm": 0.9103695154190063, + "learning_rate": 5.640132824048179e-05, + "loss": 1.2786, + "mean_token_accuracy": 0.681138950586319, + "num_tokens": 202719652.0, + "step": 6190 + }, + { + "epoch": 0.5137241894021063, + "grad_norm": 0.9157353639602661, + "learning_rate": 5.632955016886978e-05, + "loss": 1.2264, + "mean_token_accuracy": 0.6895894408226013, + "num_tokens": 202883492.0, + "step": 6195 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.9072819948196411, + "learning_rate": 5.62577588381922e-05, + "loss": 1.2243, + "mean_token_accuracy": 0.6888135358691215, + "num_tokens": 203047332.0, + "step": 6200 + }, + { + "epoch": 0.5145534455593332, + "grad_norm": 0.9278843402862549, + "learning_rate": 5.618595439883664e-05, + "loss": 1.2849, + "mean_token_accuracy": 0.6793351963162422, + "num_tokens": 203210230.0, + "step": 6205 + }, + { + "epoch": 0.5149680736379467, + "grad_norm": 0.8833913207054138, + "learning_rate": 5.61141370012182e-05, + "loss": 1.1722, + "mean_token_accuracy": 0.6963343143463134, + "num_tokens": 203374070.0, + "step": 6210 + }, + { + "epoch": 0.5153827017165602, + "grad_norm": 0.9043241739273071, + "learning_rate": 5.6042306795779085e-05, + "loss": 1.2276, + "mean_token_accuracy": 0.6884836286306382, + "num_tokens": 203537910.0, + "step": 6215 + }, + { + "epoch": 0.5157973297951737, + "grad_norm": 0.9065195322036743, + "learning_rate": 5.597046393298836e-05, + "loss": 1.2534, + "mean_token_accuracy": 0.6853250235319137, + "num_tokens": 203701750.0, + "step": 6220 + }, + { + "epoch": 0.5162119578737873, + "grad_norm": 0.8881427049636841, + "learning_rate": 5.589860856334158e-05, + "loss": 1.204, + "mean_token_accuracy": 0.6931936025619507, + "num_tokens": 203865453.0, + "step": 6225 + }, + { + "epoch": 0.5166265859524007, + "grad_norm": 0.9282435774803162, + "learning_rate": 5.582674083736049e-05, + "loss": 1.2401, + "mean_token_accuracy": 0.6892106533050537, + "num_tokens": 204029293.0, + "step": 6230 + }, + { + "epoch": 0.5170412140310142, + "grad_norm": 0.953029990196228, + "learning_rate": 5.57548609055928e-05, + "loss": 1.3481, + "mean_token_accuracy": 0.6690982446074486, + "num_tokens": 204193133.0, + "step": 6235 + }, + { + "epoch": 0.5174558421096277, + "grad_norm": 0.884660542011261, + "learning_rate": 5.568296891861166e-05, + "loss": 1.1057, + "mean_token_accuracy": 0.7079239949584007, + "num_tokens": 204356973.0, + "step": 6240 + }, + { + "epoch": 0.5178704701882412, + "grad_norm": 0.8669636845588684, + "learning_rate": 5.561106502701557e-05, + "loss": 1.1728, + "mean_token_accuracy": 0.6963220924139023, + "num_tokens": 204520813.0, + "step": 6245 + }, + { + "epoch": 0.5182850982668546, + "grad_norm": 0.9606974720954895, + "learning_rate": 5.5539149381427934e-05, + "loss": 1.2909, + "mean_token_accuracy": 0.6754643246531487, + "num_tokens": 204684653.0, + "step": 6250 + }, + { + "epoch": 0.5186997263454681, + "grad_norm": 0.9595761895179749, + "learning_rate": 5.546722213249678e-05, + "loss": 1.2513, + "mean_token_accuracy": 0.6835227265954018, + "num_tokens": 204848493.0, + "step": 6255 + }, + { + "epoch": 0.5191143544240816, + "grad_norm": 0.8764197826385498, + "learning_rate": 5.539528343089445e-05, + "loss": 1.2467, + "mean_token_accuracy": 0.6843291744589806, + "num_tokens": 205012333.0, + "step": 6260 + }, + { + "epoch": 0.5195289825026951, + "grad_norm": 0.9235277771949768, + "learning_rate": 5.5323333427317256e-05, + "loss": 1.2007, + "mean_token_accuracy": 0.6919721379876137, + "num_tokens": 205176173.0, + "step": 6265 + }, + { + "epoch": 0.5199436105813086, + "grad_norm": 0.9578131437301636, + "learning_rate": 5.525137227248522e-05, + "loss": 1.2815, + "mean_token_accuracy": 0.6770711153745651, + "num_tokens": 205340013.0, + "step": 6270 + }, + { + "epoch": 0.520358238659922, + "grad_norm": 0.9407724738121033, + "learning_rate": 5.51794001171417e-05, + "loss": 1.2213, + "mean_token_accuracy": 0.6865285962820054, + "num_tokens": 205503853.0, + "step": 6275 + }, + { + "epoch": 0.5207728667385355, + "grad_norm": 0.934880256652832, + "learning_rate": 5.5107417112053094e-05, + "loss": 1.1946, + "mean_token_accuracy": 0.6924914509057999, + "num_tokens": 205667693.0, + "step": 6280 + }, + { + "epoch": 0.521187494817149, + "grad_norm": 1.0713592767715454, + "learning_rate": 5.503542340800852e-05, + "loss": 1.1717, + "mean_token_accuracy": 0.6967314258217812, + "num_tokens": 205831533.0, + "step": 6285 + }, + { + "epoch": 0.5216021228957625, + "grad_norm": 0.9055458903312683, + "learning_rate": 5.496341915581957e-05, + "loss": 1.1642, + "mean_token_accuracy": 0.699040810763836, + "num_tokens": 205995373.0, + "step": 6290 + }, + { + "epoch": 0.5220167509743759, + "grad_norm": 0.8946324586868286, + "learning_rate": 5.4891404506319825e-05, + "loss": 1.2236, + "mean_token_accuracy": 0.6841092362999917, + "num_tokens": 206159213.0, + "step": 6295 + }, + { + "epoch": 0.5224313790529894, + "grad_norm": 0.9214036464691162, + "learning_rate": 5.481937961036476e-05, + "loss": 1.293, + "mean_token_accuracy": 0.6736925691366196, + "num_tokens": 206323053.0, + "step": 6300 + }, + { + "epoch": 0.5228460071316029, + "grad_norm": 0.9672065377235413, + "learning_rate": 5.474734461883124e-05, + "loss": 1.2162, + "mean_token_accuracy": 0.6890579178929329, + "num_tokens": 206486893.0, + "step": 6305 + }, + { + "epoch": 0.5232606352102165, + "grad_norm": 0.9097797870635986, + "learning_rate": 5.4675299682617285e-05, + "loss": 1.2221, + "mean_token_accuracy": 0.6881767675280571, + "num_tokens": 206650573.0, + "step": 6310 + }, + { + "epoch": 0.52367526328883, + "grad_norm": 0.8895254731178284, + "learning_rate": 5.460324495264179e-05, + "loss": 1.0871, + "mean_token_accuracy": 0.7105632960796356, + "num_tokens": 206814413.0, + "step": 6315 + }, + { + "epoch": 0.5240898913674434, + "grad_norm": 0.8365790843963623, + "learning_rate": 5.453118057984411e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6855632916092873, + "num_tokens": 206978253.0, + "step": 6320 + }, + { + "epoch": 0.5245045194460569, + "grad_norm": 0.8553270697593689, + "learning_rate": 5.445910671518384e-05, + "loss": 1.1792, + "mean_token_accuracy": 0.6980083122849464, + "num_tokens": 207142093.0, + "step": 6325 + }, + { + "epoch": 0.5249191475246704, + "grad_norm": 0.9196605086326599, + "learning_rate": 5.438702350964044e-05, + "loss": 1.3084, + "mean_token_accuracy": 0.676484601944685, + "num_tokens": 207305933.0, + "step": 6330 + }, + { + "epoch": 0.5253337756032839, + "grad_norm": 0.9052113890647888, + "learning_rate": 5.4314931114212956e-05, + "loss": 1.2114, + "mean_token_accuracy": 0.691580730676651, + "num_tokens": 207468907.0, + "step": 6335 + }, + { + "epoch": 0.5257484036818973, + "grad_norm": 0.9065616130828857, + "learning_rate": 5.424282967991965e-05, + "loss": 1.1778, + "mean_token_accuracy": 0.6990835785865783, + "num_tokens": 207632747.0, + "step": 6340 + }, + { + "epoch": 0.5261630317605108, + "grad_norm": 13.21120548248291, + "learning_rate": 5.4170719357797774e-05, + "loss": 1.2799, + "mean_token_accuracy": 0.6758736535906792, + "num_tokens": 207796587.0, + "step": 6345 + }, + { + "epoch": 0.5265776598391243, + "grad_norm": 0.8727403283119202, + "learning_rate": 5.4098600298903105e-05, + "loss": 1.2515, + "mean_token_accuracy": 0.6862014904618263, + "num_tokens": 207960159.0, + "step": 6350 + }, + { + "epoch": 0.5269922879177378, + "grad_norm": 0.9306917190551758, + "learning_rate": 5.402647265430982e-05, + "loss": 1.2238, + "mean_token_accuracy": 0.6863636314868927, + "num_tokens": 208123999.0, + "step": 6355 + }, + { + "epoch": 0.5274069159963513, + "grad_norm": 0.8838227987289429, + "learning_rate": 5.3954336575110066e-05, + "loss": 1.224, + "mean_token_accuracy": 0.6893084019422531, + "num_tokens": 208287839.0, + "step": 6360 + }, + { + "epoch": 0.5278215440749647, + "grad_norm": 0.904277503490448, + "learning_rate": 5.388219221241357e-05, + "loss": 1.1809, + "mean_token_accuracy": 0.69630376547575, + "num_tokens": 208451679.0, + "step": 6365 + }, + { + "epoch": 0.5282361721535782, + "grad_norm": 0.9271522760391235, + "learning_rate": 5.3810039717347536e-05, + "loss": 1.2546, + "mean_token_accuracy": 0.6841703325510025, + "num_tokens": 208615519.0, + "step": 6370 + }, + { + "epoch": 0.5286508002321917, + "grad_norm": 0.9033722281455994, + "learning_rate": 5.37378792410561e-05, + "loss": 1.1427, + "mean_token_accuracy": 0.6994868069887161, + "num_tokens": 208779359.0, + "step": 6375 + }, + { + "epoch": 0.5290654283108052, + "grad_norm": 0.8846749067306519, + "learning_rate": 5.3665710934700184e-05, + "loss": 1.229, + "mean_token_accuracy": 0.689711631834507, + "num_tokens": 208943199.0, + "step": 6380 + }, + { + "epoch": 0.5294800563894186, + "grad_norm": 0.8874713778495789, + "learning_rate": 5.3593534949457094e-05, + "loss": 1.2066, + "mean_token_accuracy": 0.692864128947258, + "num_tokens": 209107039.0, + "step": 6385 + }, + { + "epoch": 0.5298946844680322, + "grad_norm": 0.8738123774528503, + "learning_rate": 5.352135143652018e-05, + "loss": 1.197, + "mean_token_accuracy": 0.6921893879771233, + "num_tokens": 209270820.0, + "step": 6390 + }, + { + "epoch": 0.5303093125466457, + "grad_norm": 0.9200013875961304, + "learning_rate": 5.344916054709863e-05, + "loss": 1.1546, + "mean_token_accuracy": 0.697586752474308, + "num_tokens": 209434660.0, + "step": 6395 + }, + { + "epoch": 0.5307239406252592, + "grad_norm": 0.9030777812004089, + "learning_rate": 5.3376962432417045e-05, + "loss": 1.2219, + "mean_token_accuracy": 0.6877749294042588, + "num_tokens": 209598500.0, + "step": 6400 + }, + { + "epoch": 0.5311385687038727, + "grad_norm": 0.9108365178108215, + "learning_rate": 5.3304757243715164e-05, + "loss": 1.2661, + "mean_token_accuracy": 0.6783941894769668, + "num_tokens": 209762253.0, + "step": 6405 + }, + { + "epoch": 0.5315531967824861, + "grad_norm": 0.9172254800796509, + "learning_rate": 5.3232545132247544e-05, + "loss": 1.1724, + "mean_token_accuracy": 0.7031769335269928, + "num_tokens": 209926093.0, + "step": 6410 + }, + { + "epoch": 0.5319678248610996, + "grad_norm": 0.9194810390472412, + "learning_rate": 5.316032624928326e-05, + "loss": 1.2083, + "mean_token_accuracy": 0.6897408664226532, + "num_tokens": 210089227.0, + "step": 6415 + }, + { + "epoch": 0.5323824529397131, + "grad_norm": 0.8845224380493164, + "learning_rate": 5.308810074610554e-05, + "loss": 1.2051, + "mean_token_accuracy": 0.6930779546499253, + "num_tokens": 210253067.0, + "step": 6420 + }, + { + "epoch": 0.5327970810183266, + "grad_norm": 0.8977319598197937, + "learning_rate": 5.3015868774011525e-05, + "loss": 1.19, + "mean_token_accuracy": 0.6922536373138428, + "num_tokens": 210416397.0, + "step": 6425 + }, + { + "epoch": 0.53321170909694, + "grad_norm": 0.9427075982093811, + "learning_rate": 5.2943630484311844e-05, + "loss": 1.2702, + "mean_token_accuracy": 0.6780119732022285, + "num_tokens": 210580237.0, + "step": 6430 + }, + { + "epoch": 0.5336263371755535, + "grad_norm": 0.9184347987174988, + "learning_rate": 5.287138602833045e-05, + "loss": 1.2362, + "mean_token_accuracy": 0.6841031283140182, + "num_tokens": 210744077.0, + "step": 6435 + }, + { + "epoch": 0.534040965254167, + "grad_norm": 0.9414148330688477, + "learning_rate": 5.279913555740411e-05, + "loss": 1.1999, + "mean_token_accuracy": 0.6908468291163444, + "num_tokens": 210907643.0, + "step": 6440 + }, + { + "epoch": 0.5344555933327805, + "grad_norm": 0.9243748188018799, + "learning_rate": 5.272687922288227e-05, + "loss": 1.1242, + "mean_token_accuracy": 0.7059323117136955, + "num_tokens": 211071483.0, + "step": 6445 + }, + { + "epoch": 0.534870221411394, + "grad_norm": 0.8637745976448059, + "learning_rate": 5.265461717612663e-05, + "loss": 1.1653, + "mean_token_accuracy": 0.6991996586322784, + "num_tokens": 211235323.0, + "step": 6450 + }, + { + "epoch": 0.5352848494900074, + "grad_norm": 0.8968706727027893, + "learning_rate": 5.2582349568510835e-05, + "loss": 1.2218, + "mean_token_accuracy": 0.6856182813644409, + "num_tokens": 211399163.0, + "step": 6455 + }, + { + "epoch": 0.5356994775686209, + "grad_norm": 0.9248412847518921, + "learning_rate": 5.251007655142024e-05, + "loss": 1.2183, + "mean_token_accuracy": 0.6859115362167358, + "num_tokens": 211563003.0, + "step": 6460 + }, + { + "epoch": 0.5361141056472344, + "grad_norm": 0.8828514814376831, + "learning_rate": 5.243779827625146e-05, + "loss": 1.1561, + "mean_token_accuracy": 0.6985092923045159, + "num_tokens": 211726843.0, + "step": 6465 + }, + { + "epoch": 0.5365287337258479, + "grad_norm": 0.8828794360160828, + "learning_rate": 5.236551489441216e-05, + "loss": 1.1948, + "mean_token_accuracy": 0.6899408712983132, + "num_tokens": 211889870.0, + "step": 6470 + }, + { + "epoch": 0.5369433618044614, + "grad_norm": 0.9275861978530884, + "learning_rate": 5.229322655732071e-05, + "loss": 1.1307, + "mean_token_accuracy": 0.7023704811930657, + "num_tokens": 212053710.0, + "step": 6475 + }, + { + "epoch": 0.5373579898830749, + "grad_norm": 0.913375198841095, + "learning_rate": 5.222093341640584e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.7017656400799751, + "num_tokens": 212217550.0, + "step": 6480 + }, + { + "epoch": 0.5377726179616884, + "grad_norm": 0.8686186671257019, + "learning_rate": 5.214863562310634e-05, + "loss": 1.2431, + "mean_token_accuracy": 0.6821007788181305, + "num_tokens": 212381009.0, + "step": 6485 + }, + { + "epoch": 0.5381872460403019, + "grad_norm": 0.9184940457344055, + "learning_rate": 5.20763333288708e-05, + "loss": 1.228, + "mean_token_accuracy": 0.6859359756112099, + "num_tokens": 212544849.0, + "step": 6490 + }, + { + "epoch": 0.5386018741189154, + "grad_norm": 0.8911036849021912, + "learning_rate": 5.200402668515716e-05, + "loss": 1.1528, + "mean_token_accuracy": 0.699456250667572, + "num_tokens": 212708689.0, + "step": 6495 + }, + { + "epoch": 0.5390165021975288, + "grad_norm": 0.885284960269928, + "learning_rate": 5.1931715843432506e-05, + "loss": 1.2053, + "mean_token_accuracy": 0.6943487286567688, + "num_tokens": 212872529.0, + "step": 6500 + }, + { + "epoch": 0.5394311302761423, + "grad_norm": 0.9448803663253784, + "learning_rate": 5.185940095517274e-05, + "loss": 1.2753, + "mean_token_accuracy": 0.6810500115156174, + "num_tokens": 213035919.0, + "step": 6505 + }, + { + "epoch": 0.5398457583547558, + "grad_norm": 0.8612961173057556, + "learning_rate": 5.178708217186222e-05, + "loss": 1.2534, + "mean_token_accuracy": 0.683718228340149, + "num_tokens": 213199759.0, + "step": 6510 + }, + { + "epoch": 0.5402603864333693, + "grad_norm": 0.8703817129135132, + "learning_rate": 5.171475964499346e-05, + "loss": 1.1027, + "mean_token_accuracy": 0.7086999043822289, + "num_tokens": 213362820.0, + "step": 6515 + }, + { + "epoch": 0.5406750145119827, + "grad_norm": 0.900538444519043, + "learning_rate": 5.164243352606679e-05, + "loss": 1.1833, + "mean_token_accuracy": 0.6918194040656089, + "num_tokens": 213526660.0, + "step": 6520 + }, + { + "epoch": 0.5410896425905962, + "grad_norm": 0.8728988766670227, + "learning_rate": 5.157010396659014e-05, + "loss": 1.1, + "mean_token_accuracy": 0.7105594664812088, + "num_tokens": 213690000.0, + "step": 6525 + }, + { + "epoch": 0.5415042706692097, + "grad_norm": 0.9673345685005188, + "learning_rate": 5.149777111807859e-05, + "loss": 1.2185, + "mean_token_accuracy": 0.6913367554545402, + "num_tokens": 213853840.0, + "step": 6530 + }, + { + "epoch": 0.5419188987478232, + "grad_norm": 0.9376686811447144, + "learning_rate": 5.142543513205409e-05, + "loss": 1.3058, + "mean_token_accuracy": 0.6771444231271744, + "num_tokens": 214017680.0, + "step": 6535 + }, + { + "epoch": 0.5423335268264367, + "grad_norm": 0.8954134583473206, + "learning_rate": 5.135309616004523e-05, + "loss": 1.1879, + "mean_token_accuracy": 0.6946840047836303, + "num_tokens": 214181489.0, + "step": 6540 + }, + { + "epoch": 0.5427481549050501, + "grad_norm": 0.9150399565696716, + "learning_rate": 5.128075435358679e-05, + "loss": 1.2449, + "mean_token_accuracy": 0.6834860697388649, + "num_tokens": 214345329.0, + "step": 6545 + }, + { + "epoch": 0.5431627829836636, + "grad_norm": 0.9100620150566101, + "learning_rate": 5.120840986421955e-05, + "loss": 1.1598, + "mean_token_accuracy": 0.6999737590551376, + "num_tokens": 214509101.0, + "step": 6550 + }, + { + "epoch": 0.5435774110622771, + "grad_norm": 0.8855962157249451, + "learning_rate": 5.113606284348984e-05, + "loss": 1.1788, + "mean_token_accuracy": 0.6972385168075561, + "num_tokens": 214672941.0, + "step": 6555 + }, + { + "epoch": 0.5439920391408907, + "grad_norm": 0.9107732772827148, + "learning_rate": 5.106371344294936e-05, + "loss": 1.2798, + "mean_token_accuracy": 0.6762696966528893, + "num_tokens": 214835862.0, + "step": 6560 + }, + { + "epoch": 0.5444066672195041, + "grad_norm": 0.8324126601219177, + "learning_rate": 5.099136181415475e-05, + "loss": 1.1893, + "mean_token_accuracy": 0.6961143642663956, + "num_tokens": 214999702.0, + "step": 6565 + }, + { + "epoch": 0.5448212952981176, + "grad_norm": 0.8903244137763977, + "learning_rate": 5.091900810866732e-05, + "loss": 1.2255, + "mean_token_accuracy": 0.6866263464093209, + "num_tokens": 215163542.0, + "step": 6570 + }, + { + "epoch": 0.5452359233767311, + "grad_norm": 1.8215476274490356, + "learning_rate": 5.084665247805276e-05, + "loss": 1.2491, + "mean_token_accuracy": 0.6820075765252114, + "num_tokens": 215327382.0, + "step": 6575 + }, + { + "epoch": 0.5456505514553446, + "grad_norm": 0.8984115123748779, + "learning_rate": 5.0774295073880774e-05, + "loss": 1.2231, + "mean_token_accuracy": 0.6867302045226097, + "num_tokens": 215491222.0, + "step": 6580 + }, + { + "epoch": 0.5460651795339581, + "grad_norm": 0.9719600677490234, + "learning_rate": 5.070193604772477e-05, + "loss": 1.238, + "mean_token_accuracy": 0.686223118007183, + "num_tokens": 215655062.0, + "step": 6585 + }, + { + "epoch": 0.5464798076125715, + "grad_norm": 0.9477178454399109, + "learning_rate": 5.062957555116159e-05, + "loss": 1.2902, + "mean_token_accuracy": 0.6786129310727119, + "num_tokens": 215818714.0, + "step": 6590 + }, + { + "epoch": 0.546894435691185, + "grad_norm": 0.8436419367790222, + "learning_rate": 5.055721373577111e-05, + "loss": 1.2024, + "mean_token_accuracy": 0.6898107171058655, + "num_tokens": 215982484.0, + "step": 6595 + }, + { + "epoch": 0.5473090637697985, + "grad_norm": 0.8872175216674805, + "learning_rate": 5.048485075313598e-05, + "loss": 1.2846, + "mean_token_accuracy": 0.6801319614052772, + "num_tokens": 216146324.0, + "step": 6600 + }, + { + "epoch": 0.547723691848412, + "grad_norm": 0.9372970461845398, + "learning_rate": 5.0412486754841347e-05, + "loss": 1.193, + "mean_token_accuracy": 0.6931696027517319, + "num_tokens": 216310164.0, + "step": 6605 + }, + { + "epoch": 0.5481383199270254, + "grad_norm": 0.9212945103645325, + "learning_rate": 5.03401218924744e-05, + "loss": 1.1958, + "mean_token_accuracy": 0.6920821130275726, + "num_tokens": 216474004.0, + "step": 6610 + }, + { + "epoch": 0.5485529480056389, + "grad_norm": 0.8940766453742981, + "learning_rate": 5.0267756317624216e-05, + "loss": 1.1878, + "mean_token_accuracy": 0.6963892981410027, + "num_tokens": 216637844.0, + "step": 6615 + }, + { + "epoch": 0.5489675760842524, + "grad_norm": 0.8963186144828796, + "learning_rate": 5.019539018188132e-05, + "loss": 1.16, + "mean_token_accuracy": 0.6982710182666778, + "num_tokens": 216801684.0, + "step": 6620 + }, + { + "epoch": 0.5493822041628659, + "grad_norm": 0.8630821108818054, + "learning_rate": 5.0123023636837395e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6825696498155593, + "num_tokens": 216965524.0, + "step": 6625 + }, + { + "epoch": 0.5497968322414793, + "grad_norm": 0.9982067942619324, + "learning_rate": 5.005065683408508e-05, + "loss": 1.2151, + "mean_token_accuracy": 0.6896871969103813, + "num_tokens": 217129364.0, + "step": 6630 + }, + { + "epoch": 0.5502114603200928, + "grad_norm": 0.8645791411399841, + "learning_rate": 4.99782899252174e-05, + "loss": 1.1423, + "mean_token_accuracy": 0.7014846056699753, + "num_tokens": 217293204.0, + "step": 6635 + }, + { + "epoch": 0.5506260883987064, + "grad_norm": 0.9290412068367004, + "learning_rate": 4.9905923061827736e-05, + "loss": 1.2004, + "mean_token_accuracy": 0.6902295619249343, + "num_tokens": 217456117.0, + "step": 6640 + }, + { + "epoch": 0.5510407164773199, + "grad_norm": 0.8934392929077148, + "learning_rate": 4.98335563955093e-05, + "loss": 1.1539, + "mean_token_accuracy": 0.6991263449192047, + "num_tokens": 217619957.0, + "step": 6645 + }, + { + "epoch": 0.5514553445559334, + "grad_norm": 0.8861909508705139, + "learning_rate": 4.976119007785494e-05, + "loss": 1.2497, + "mean_token_accuracy": 0.6817570835351944, + "num_tokens": 217783797.0, + "step": 6650 + }, + { + "epoch": 0.5518699726345468, + "grad_norm": 0.8809149861335754, + "learning_rate": 4.9688824260456726e-05, + "loss": 1.1592, + "mean_token_accuracy": 0.7015334829688072, + "num_tokens": 217947637.0, + "step": 6655 + }, + { + "epoch": 0.5522846007131603, + "grad_norm": 0.940298318862915, + "learning_rate": 4.9616459094905715e-05, + "loss": 1.2212, + "mean_token_accuracy": 0.6881417751312255, + "num_tokens": 218111131.0, + "step": 6660 + }, + { + "epoch": 0.5526992287917738, + "grad_norm": 0.8721581101417542, + "learning_rate": 4.954409473279158e-05, + "loss": 1.2098, + "mean_token_accuracy": 0.6854899793863296, + "num_tokens": 218274971.0, + "step": 6665 + }, + { + "epoch": 0.5531138568703873, + "grad_norm": 0.9390237927436829, + "learning_rate": 4.947173132570231e-05, + "loss": 1.2649, + "mean_token_accuracy": 0.681109482049942, + "num_tokens": 218438811.0, + "step": 6670 + }, + { + "epoch": 0.5535284849490008, + "grad_norm": 0.9084700345993042, + "learning_rate": 4.9399369025223905e-05, + "loss": 1.2492, + "mean_token_accuracy": 0.6816715553402901, + "num_tokens": 218602651.0, + "step": 6675 + }, + { + "epoch": 0.5539431130276142, + "grad_norm": 0.9213350415229797, + "learning_rate": 4.932700798294006e-05, + "loss": 1.234, + "mean_token_accuracy": 0.683847026526928, + "num_tokens": 218766420.0, + "step": 6680 + }, + { + "epoch": 0.5543577411062277, + "grad_norm": 0.9205579161643982, + "learning_rate": 4.9254648350431787e-05, + "loss": 1.2252, + "mean_token_accuracy": 0.686002540588379, + "num_tokens": 218929489.0, + "step": 6685 + }, + { + "epoch": 0.5547723691848412, + "grad_norm": 0.8921565413475037, + "learning_rate": 4.9182290279277175e-05, + "loss": 1.1792, + "mean_token_accuracy": 0.694709187746048, + "num_tokens": 219093329.0, + "step": 6690 + }, + { + "epoch": 0.5551869972634547, + "grad_norm": 0.9167972207069397, + "learning_rate": 4.9109933921051076e-05, + "loss": 1.1969, + "mean_token_accuracy": 0.6931573793292045, + "num_tokens": 219257169.0, + "step": 6695 + }, + { + "epoch": 0.5556016253420681, + "grad_norm": 0.914140522480011, + "learning_rate": 4.903757942732469e-05, + "loss": 1.1886, + "mean_token_accuracy": 0.6917277619242668, + "num_tokens": 219421009.0, + "step": 6700 + }, + { + "epoch": 0.5560162534206816, + "grad_norm": 0.8975273966789246, + "learning_rate": 4.896522694966533e-05, + "loss": 1.1615, + "mean_token_accuracy": 0.69662756472826, + "num_tokens": 219584849.0, + "step": 6705 + }, + { + "epoch": 0.5564308814992951, + "grad_norm": 0.8833522796630859, + "learning_rate": 4.8892876639636126e-05, + "loss": 1.218, + "mean_token_accuracy": 0.6935483857989311, + "num_tokens": 219748689.0, + "step": 6710 + }, + { + "epoch": 0.5568455095779086, + "grad_norm": 0.9619110226631165, + "learning_rate": 4.8820528648795634e-05, + "loss": 1.2085, + "mean_token_accuracy": 0.6911351323127747, + "num_tokens": 219912529.0, + "step": 6715 + }, + { + "epoch": 0.557260137656522, + "grad_norm": 0.8885596990585327, + "learning_rate": 4.874818312869753e-05, + "loss": 1.2336, + "mean_token_accuracy": 0.6844024941325187, + "num_tokens": 220076369.0, + "step": 6720 + }, + { + "epoch": 0.5576747657351356, + "grad_norm": 0.9228293895721436, + "learning_rate": 4.8675840230890355e-05, + "loss": 1.2688, + "mean_token_accuracy": 0.6799242347478867, + "num_tokens": 220240209.0, + "step": 6725 + }, + { + "epoch": 0.5580893938137491, + "grad_norm": 0.975376546382904, + "learning_rate": 4.860350010691716e-05, + "loss": 1.2384, + "mean_token_accuracy": 0.6906524926424027, + "num_tokens": 220404049.0, + "step": 6730 + }, + { + "epoch": 0.5585040218923626, + "grad_norm": 0.9415110349655151, + "learning_rate": 4.8531162908315134e-05, + "loss": 1.2271, + "mean_token_accuracy": 0.685777124762535, + "num_tokens": 220567889.0, + "step": 6735 + }, + { + "epoch": 0.5589186499709761, + "grad_norm": 0.9494465589523315, + "learning_rate": 4.845882878661538e-05, + "loss": 1.2185, + "mean_token_accuracy": 0.6860642716288566, + "num_tokens": 220731729.0, + "step": 6740 + }, + { + "epoch": 0.5593332780495895, + "grad_norm": 0.9446563124656677, + "learning_rate": 4.838649789334257e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.6815738022327423, + "num_tokens": 220895569.0, + "step": 6745 + }, + { + "epoch": 0.559747906128203, + "grad_norm": 0.8738687634468079, + "learning_rate": 4.8314170380014546e-05, + "loss": 1.2853, + "mean_token_accuracy": 0.6820564493536949, + "num_tokens": 221059409.0, + "step": 6750 + }, + { + "epoch": 0.5601625342068165, + "grad_norm": 0.9208701848983765, + "learning_rate": 4.824184639814215e-05, + "loss": 1.1718, + "mean_token_accuracy": 0.6970601424574852, + "num_tokens": 221222577.0, + "step": 6755 + }, + { + "epoch": 0.56057716228543, + "grad_norm": 0.8384931683540344, + "learning_rate": 4.816952609922879e-05, + "loss": 1.228, + "mean_token_accuracy": 0.6885202810168266, + "num_tokens": 221386417.0, + "step": 6760 + }, + { + "epoch": 0.5609917903640435, + "grad_norm": 0.8435602188110352, + "learning_rate": 4.809720963477013e-05, + "loss": 1.1915, + "mean_token_accuracy": 0.6923203811049461, + "num_tokens": 221550257.0, + "step": 6765 + }, + { + "epoch": 0.5614064184426569, + "grad_norm": 0.9238464832305908, + "learning_rate": 4.802489715625385e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.6958699896931648, + "num_tokens": 221714097.0, + "step": 6770 + }, + { + "epoch": 0.5618210465212704, + "grad_norm": 0.9219141602516174, + "learning_rate": 4.795258881515925e-05, + "loss": 1.2861, + "mean_token_accuracy": 0.6787390008568763, + "num_tokens": 221877937.0, + "step": 6775 + }, + { + "epoch": 0.5622356745998839, + "grad_norm": 0.9290852546691895, + "learning_rate": 4.7880284762957e-05, + "loss": 1.1626, + "mean_token_accuracy": 0.6970918908715248, + "num_tokens": 222041777.0, + "step": 6780 + }, + { + "epoch": 0.5626503026784974, + "grad_norm": 0.9057936668395996, + "learning_rate": 4.7807985151108726e-05, + "loss": 1.2176, + "mean_token_accuracy": 0.689833813905716, + "num_tokens": 222205617.0, + "step": 6785 + }, + { + "epoch": 0.5630649307571108, + "grad_norm": 0.9059953093528748, + "learning_rate": 4.773569013106679e-05, + "loss": 1.1597, + "mean_token_accuracy": 0.6991935476660729, + "num_tokens": 222369457.0, + "step": 6790 + }, + { + "epoch": 0.5634795588357243, + "grad_norm": 0.8961343765258789, + "learning_rate": 4.766339985427396e-05, + "loss": 1.2886, + "mean_token_accuracy": 0.67652125954628, + "num_tokens": 222533297.0, + "step": 6795 + }, + { + "epoch": 0.5638941869143378, + "grad_norm": 0.9040511250495911, + "learning_rate": 4.759111447216301e-05, + "loss": 1.2778, + "mean_token_accuracy": 0.679087245464325, + "num_tokens": 222697137.0, + "step": 6800 + }, + { + "epoch": 0.5643088149929513, + "grad_norm": 0.9621016979217529, + "learning_rate": 4.7518834136156477e-05, + "loss": 1.1837, + "mean_token_accuracy": 0.6943792745471, + "num_tokens": 222860977.0, + "step": 6805 + }, + { + "epoch": 0.5647234430715649, + "grad_norm": 1.0572618246078491, + "learning_rate": 4.74465589976664e-05, + "loss": 1.1642, + "mean_token_accuracy": 0.6983809888362884, + "num_tokens": 223024817.0, + "step": 6810 + }, + { + "epoch": 0.5651380711501783, + "grad_norm": 0.9216713309288025, + "learning_rate": 4.73742892080938e-05, + "loss": 1.1888, + "mean_token_accuracy": 0.6927174970507621, + "num_tokens": 223188657.0, + "step": 6815 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.919407069683075, + "learning_rate": 4.73020249188286e-05, + "loss": 1.1847, + "mean_token_accuracy": 0.6968169614672661, + "num_tokens": 223352497.0, + "step": 6820 + }, + { + "epoch": 0.5659673273074053, + "grad_norm": 0.924695611000061, + "learning_rate": 4.7229766281249165e-05, + "loss": 1.1871, + "mean_token_accuracy": 0.6970416769385338, + "num_tokens": 223515783.0, + "step": 6825 + }, + { + "epoch": 0.5663819553860188, + "grad_norm": 0.9437134265899658, + "learning_rate": 4.7157513446722e-05, + "loss": 1.1951, + "mean_token_accuracy": 0.6948680371046067, + "num_tokens": 223679623.0, + "step": 6830 + }, + { + "epoch": 0.5667965834646322, + "grad_norm": 0.8854874968528748, + "learning_rate": 4.708526656660148e-05, + "loss": 1.2441, + "mean_token_accuracy": 0.6849584549665451, + "num_tokens": 223843463.0, + "step": 6835 + }, + { + "epoch": 0.5672112115432457, + "grad_norm": 0.918292760848999, + "learning_rate": 4.701302579222952e-05, + "loss": 1.173, + "mean_token_accuracy": 0.6970736175775528, + "num_tokens": 224006590.0, + "step": 6840 + }, + { + "epoch": 0.5676258396218592, + "grad_norm": 0.9339110851287842, + "learning_rate": 4.6940791274935224e-05, + "loss": 1.2188, + "mean_token_accuracy": 0.6935510709881783, + "num_tokens": 224169455.0, + "step": 6845 + }, + { + "epoch": 0.5680404677004727, + "grad_norm": 0.9341229796409607, + "learning_rate": 4.686856316603456e-05, + "loss": 1.1925, + "mean_token_accuracy": 0.6918010711669922, + "num_tokens": 224333295.0, + "step": 6850 + }, + { + "epoch": 0.5684550957790862, + "grad_norm": 0.9337224364280701, + "learning_rate": 4.679634161683012e-05, + "loss": 1.1728, + "mean_token_accuracy": 0.6961937919259071, + "num_tokens": 224497135.0, + "step": 6855 + }, + { + "epoch": 0.5688697238576996, + "grad_norm": 0.9041075706481934, + "learning_rate": 4.672412677861076e-05, + "loss": 1.1957, + "mean_token_accuracy": 0.6941210865974426, + "num_tokens": 224660221.0, + "step": 6860 + }, + { + "epoch": 0.5692843519363131, + "grad_norm": 0.9238066673278809, + "learning_rate": 4.6651918802651215e-05, + "loss": 1.1911, + "mean_token_accuracy": 0.6899743393063545, + "num_tokens": 224824061.0, + "step": 6865 + }, + { + "epoch": 0.5696989800149266, + "grad_norm": 0.956802487373352, + "learning_rate": 4.657971784021189e-05, + "loss": 1.1953, + "mean_token_accuracy": 0.6936889082193375, + "num_tokens": 224987901.0, + "step": 6870 + }, + { + "epoch": 0.5701136080935401, + "grad_norm": 0.905755877494812, + "learning_rate": 4.650752404253853e-05, + "loss": 1.2357, + "mean_token_accuracy": 0.6879765421152115, + "num_tokens": 225151741.0, + "step": 6875 + }, + { + "epoch": 0.5705282361721535, + "grad_norm": 0.8741195797920227, + "learning_rate": 4.6435337560861796e-05, + "loss": 1.1974, + "mean_token_accuracy": 0.6937866553664207, + "num_tokens": 225315581.0, + "step": 6880 + }, + { + "epoch": 0.570942864250767, + "grad_norm": 0.911460280418396, + "learning_rate": 4.636315854639707e-05, + "loss": 1.1911, + "mean_token_accuracy": 0.6958699867129325, + "num_tokens": 225479421.0, + "step": 6885 + }, + { + "epoch": 0.5713574923293806, + "grad_norm": 0.9902095794677734, + "learning_rate": 4.629098715034411e-05, + "loss": 1.2458, + "mean_token_accuracy": 0.6900232106447219, + "num_tokens": 225643261.0, + "step": 6890 + }, + { + "epoch": 0.5717721204079941, + "grad_norm": 0.9339204430580139, + "learning_rate": 4.621882352388665e-05, + "loss": 1.2106, + "mean_token_accuracy": 0.6915994614362717, + "num_tokens": 225807101.0, + "step": 6895 + }, + { + "epoch": 0.5721867484866076, + "grad_norm": 0.8947141170501709, + "learning_rate": 4.61466678181922e-05, + "loss": 1.2181, + "mean_token_accuracy": 0.6909457445144653, + "num_tokens": 225970941.0, + "step": 6900 + }, + { + "epoch": 0.572601376565221, + "grad_norm": 0.881833553314209, + "learning_rate": 4.6074520184411685e-05, + "loss": 1.2302, + "mean_token_accuracy": 0.6894855827093125, + "num_tokens": 226134781.0, + "step": 6905 + }, + { + "epoch": 0.5730160046438345, + "grad_norm": 0.9418825507164001, + "learning_rate": 4.6002380773679064e-05, + "loss": 1.1899, + "mean_token_accuracy": 0.6962304413318634, + "num_tokens": 226298621.0, + "step": 6910 + }, + { + "epoch": 0.573430632722448, + "grad_norm": 0.9378498196601868, + "learning_rate": 4.5930249737111134e-05, + "loss": 1.2173, + "mean_token_accuracy": 0.6883369967341423, + "num_tokens": 226462461.0, + "step": 6915 + }, + { + "epoch": 0.5738452608010615, + "grad_norm": 0.9858556985855103, + "learning_rate": 4.5858127225807126e-05, + "loss": 1.2337, + "mean_token_accuracy": 0.6850562021136284, + "num_tokens": 226626301.0, + "step": 6920 + }, + { + "epoch": 0.5742598888796749, + "grad_norm": 0.9240570068359375, + "learning_rate": 4.5786013390848406e-05, + "loss": 1.2609, + "mean_token_accuracy": 0.6839687198400497, + "num_tokens": 226790141.0, + "step": 6925 + }, + { + "epoch": 0.5746745169582884, + "grad_norm": 0.8796793222427368, + "learning_rate": 4.5713908383298134e-05, + "loss": 1.291, + "mean_token_accuracy": 0.6791238993406296, + "num_tokens": 226953981.0, + "step": 6930 + }, + { + "epoch": 0.5750891450369019, + "grad_norm": 0.878657341003418, + "learning_rate": 4.564181235420106e-05, + "loss": 1.1323, + "mean_token_accuracy": 0.7025233536958695, + "num_tokens": 227116801.0, + "step": 6935 + }, + { + "epoch": 0.5755037731155154, + "grad_norm": 0.8942219018936157, + "learning_rate": 4.556972545458307e-05, + "loss": 1.1995, + "mean_token_accuracy": 0.6889784947037697, + "num_tokens": 227280641.0, + "step": 6940 + }, + { + "epoch": 0.5759184011941288, + "grad_norm": 0.9079373478889465, + "learning_rate": 4.549764783545091e-05, + "loss": 1.1846, + "mean_token_accuracy": 0.6932490170001984, + "num_tokens": 227444481.0, + "step": 6945 + }, + { + "epoch": 0.5763330292727423, + "grad_norm": 0.958938717842102, + "learning_rate": 4.5425579647791916e-05, + "loss": 1.2102, + "mean_token_accuracy": 0.6875183284282684, + "num_tokens": 227608321.0, + "step": 6950 + }, + { + "epoch": 0.5767476573513558, + "grad_norm": 0.9544787406921387, + "learning_rate": 4.535352104257369e-05, + "loss": 1.2189, + "mean_token_accuracy": 0.6896444261074066, + "num_tokens": 227772161.0, + "step": 6955 + }, + { + "epoch": 0.5771622854299693, + "grad_norm": 0.8173982501029968, + "learning_rate": 4.52814721707437e-05, + "loss": 1.1961, + "mean_token_accuracy": 0.6936155915260315, + "num_tokens": 227936001.0, + "step": 6960 + }, + { + "epoch": 0.5775769135085828, + "grad_norm": 0.9662452340126038, + "learning_rate": 4.520943318322907e-05, + "loss": 1.2135, + "mean_token_accuracy": 0.6873044952750206, + "num_tokens": 228099841.0, + "step": 6965 + }, + { + "epoch": 0.5779915415871962, + "grad_norm": 0.9559817910194397, + "learning_rate": 4.5137404230936205e-05, + "loss": 1.237, + "mean_token_accuracy": 0.6828873425722122, + "num_tokens": 228263681.0, + "step": 6970 + }, + { + "epoch": 0.5784061696658098, + "grad_norm": 0.8829718828201294, + "learning_rate": 4.506538546475047e-05, + "loss": 1.151, + "mean_token_accuracy": 0.6998533725738525, + "num_tokens": 228427521.0, + "step": 6975 + }, + { + "epoch": 0.5788207977444233, + "grad_norm": 0.9070400595664978, + "learning_rate": 4.499337703553593e-05, + "loss": 1.2105, + "mean_token_accuracy": 0.6938962921500206, + "num_tokens": 228590809.0, + "step": 6980 + }, + { + "epoch": 0.5792354258230368, + "grad_norm": 0.8562132716178894, + "learning_rate": 4.492137909413497e-05, + "loss": 1.1862, + "mean_token_accuracy": 0.693982158601284, + "num_tokens": 228754649.0, + "step": 6985 + }, + { + "epoch": 0.5796500539016503, + "grad_norm": 0.8953412175178528, + "learning_rate": 4.484939179136804e-05, + "loss": 1.1653, + "mean_token_accuracy": 0.7018756113946438, + "num_tokens": 228918489.0, + "step": 6990 + }, + { + "epoch": 0.5800646819802637, + "grad_norm": 0.9159473776817322, + "learning_rate": 4.477741527803322e-05, + "loss": 1.2287, + "mean_token_accuracy": 0.6866508081555367, + "num_tokens": 229082268.0, + "step": 6995 + }, + { + "epoch": 0.5804793100588772, + "grad_norm": 0.9010199308395386, + "learning_rate": 4.4705449704906085e-05, + "loss": 1.2679, + "mean_token_accuracy": 0.6817631974816323, + "num_tokens": 229246108.0, + "step": 7000 + }, + { + "epoch": 0.5808939381374907, + "grad_norm": 0.8651865720748901, + "learning_rate": 4.463349522273925e-05, + "loss": 1.1858, + "mean_token_accuracy": 0.6942204251885414, + "num_tokens": 229409948.0, + "step": 7005 + }, + { + "epoch": 0.5813085662161042, + "grad_norm": 0.9483944177627563, + "learning_rate": 4.456155198226207e-05, + "loss": 1.205, + "mean_token_accuracy": 0.6894000515341758, + "num_tokens": 229573788.0, + "step": 7010 + }, + { + "epoch": 0.5817231942947176, + "grad_norm": 0.9426966905593872, + "learning_rate": 4.4489620134180424e-05, + "loss": 1.3329, + "mean_token_accuracy": 0.668371208012104, + "num_tokens": 229737628.0, + "step": 7015 + }, + { + "epoch": 0.5821378223733311, + "grad_norm": 0.9136776924133301, + "learning_rate": 4.441769982917626e-05, + "loss": 1.2323, + "mean_token_accuracy": 0.6856359332799912, + "num_tokens": 229900650.0, + "step": 7020 + }, + { + "epoch": 0.5825524504519446, + "grad_norm": 0.9698913097381592, + "learning_rate": 4.434579121790735e-05, + "loss": 1.1771, + "mean_token_accuracy": 0.6947214111685753, + "num_tokens": 230064490.0, + "step": 7025 + }, + { + "epoch": 0.5829670785305581, + "grad_norm": 0.8887882828712463, + "learning_rate": 4.4273894451007e-05, + "loss": 1.2011, + "mean_token_accuracy": 0.6927236124873162, + "num_tokens": 230228330.0, + "step": 7030 + }, + { + "epoch": 0.5833817066091715, + "grad_norm": 0.8867167234420776, + "learning_rate": 4.420200967908373e-05, + "loss": 1.1936, + "mean_token_accuracy": 0.691513928771019, + "num_tokens": 230392170.0, + "step": 7035 + }, + { + "epoch": 0.583796334687785, + "grad_norm": 0.9249410033226013, + "learning_rate": 4.413013705272084e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6920149102807045, + "num_tokens": 230556010.0, + "step": 7040 + }, + { + "epoch": 0.5842109627663985, + "grad_norm": 0.9408177733421326, + "learning_rate": 4.405827672247628e-05, + "loss": 1.2252, + "mean_token_accuracy": 0.6882942304015159, + "num_tokens": 230719850.0, + "step": 7045 + }, + { + "epoch": 0.584625590845012, + "grad_norm": 0.9278060793876648, + "learning_rate": 4.398642883888219e-05, + "loss": 1.2217, + "mean_token_accuracy": 0.6905486330389976, + "num_tokens": 230883690.0, + "step": 7050 + }, + { + "epoch": 0.5850402189236255, + "grad_norm": 0.9284443855285645, + "learning_rate": 4.391459355244464e-05, + "loss": 1.145, + "mean_token_accuracy": 0.7007453590631485, + "num_tokens": 231047530.0, + "step": 7055 + }, + { + "epoch": 0.585454847002239, + "grad_norm": 0.9322700500488281, + "learning_rate": 4.384277101364336e-05, + "loss": 1.2248, + "mean_token_accuracy": 0.6901820629835129, + "num_tokens": 231211370.0, + "step": 7060 + }, + { + "epoch": 0.5858694750808525, + "grad_norm": 0.9473124742507935, + "learning_rate": 4.3770961372931305e-05, + "loss": 1.1841, + "mean_token_accuracy": 0.6927052780985832, + "num_tokens": 231375210.0, + "step": 7065 + }, + { + "epoch": 0.586284103159466, + "grad_norm": 0.9334824085235596, + "learning_rate": 4.369916478073449e-05, + "loss": 1.2233, + "mean_token_accuracy": 0.6873633891344071, + "num_tokens": 231538074.0, + "step": 7070 + }, + { + "epoch": 0.5866987312380795, + "grad_norm": 0.9093325138092041, + "learning_rate": 4.36273813874515e-05, + "loss": 1.2381, + "mean_token_accuracy": 0.6827529326081276, + "num_tokens": 231701914.0, + "step": 7075 + }, + { + "epoch": 0.587113359316693, + "grad_norm": 0.9013286232948303, + "learning_rate": 4.355561134345336e-05, + "loss": 1.1764, + "mean_token_accuracy": 0.6988963022828102, + "num_tokens": 231864747.0, + "step": 7080 + }, + { + "epoch": 0.5875279873953064, + "grad_norm": 0.9422785639762878, + "learning_rate": 4.348385479908309e-05, + "loss": 1.1431, + "mean_token_accuracy": 0.7011178985238076, + "num_tokens": 232028475.0, + "step": 7085 + }, + { + "epoch": 0.5879426154739199, + "grad_norm": 0.881239116191864, + "learning_rate": 4.3412111904655414e-05, + "loss": 1.1635, + "mean_token_accuracy": 0.6978590875864029, + "num_tokens": 232191579.0, + "step": 7090 + }, + { + "epoch": 0.5883572435525334, + "grad_norm": 0.9406387209892273, + "learning_rate": 4.3340382810456506e-05, + "loss": 1.1867, + "mean_token_accuracy": 0.6942754134535789, + "num_tokens": 232355419.0, + "step": 7095 + }, + { + "epoch": 0.5887718716311469, + "grad_norm": 0.949264407157898, + "learning_rate": 4.326866766674362e-05, + "loss": 1.2357, + "mean_token_accuracy": 0.6850134387612343, + "num_tokens": 232519259.0, + "step": 7100 + }, + { + "epoch": 0.5891864997097603, + "grad_norm": 0.9393827319145203, + "learning_rate": 4.3196966623744756e-05, + "loss": 1.1852, + "mean_token_accuracy": 0.6932001456618309, + "num_tokens": 232683099.0, + "step": 7105 + }, + { + "epoch": 0.5896011277883738, + "grad_norm": 0.8733914494514465, + "learning_rate": 4.3125279831658386e-05, + "loss": 1.2229, + "mean_token_accuracy": 0.6882759019732475, + "num_tokens": 232846939.0, + "step": 7110 + }, + { + "epoch": 0.5900157558669873, + "grad_norm": 0.9470304846763611, + "learning_rate": 4.3053607440653187e-05, + "loss": 1.239, + "mean_token_accuracy": 0.6860337257385254, + "num_tokens": 233010779.0, + "step": 7115 + }, + { + "epoch": 0.5904303839456008, + "grad_norm": 0.9031630754470825, + "learning_rate": 4.298194960086758e-05, + "loss": 1.1668, + "mean_token_accuracy": 0.6998717039823532, + "num_tokens": 233174619.0, + "step": 7120 + }, + { + "epoch": 0.5908450120242142, + "grad_norm": 0.9249277114868164, + "learning_rate": 4.291030646240955e-05, + "loss": 1.2993, + "mean_token_accuracy": 0.6730938404798508, + "num_tokens": 233338459.0, + "step": 7125 + }, + { + "epoch": 0.5912596401028277, + "grad_norm": 0.929760217666626, + "learning_rate": 4.2838678175356285e-05, + "loss": 1.2512, + "mean_token_accuracy": 0.6843751162290573, + "num_tokens": 233501361.0, + "step": 7130 + }, + { + "epoch": 0.5916742681814412, + "grad_norm": 0.9114911556243896, + "learning_rate": 4.276706488975388e-05, + "loss": 1.0913, + "mean_token_accuracy": 0.7118707254529, + "num_tokens": 233665201.0, + "step": 7135 + }, + { + "epoch": 0.5920888962600547, + "grad_norm": 0.9281850457191467, + "learning_rate": 4.269546675561697e-05, + "loss": 1.1847, + "mean_token_accuracy": 0.6965053781867028, + "num_tokens": 233829041.0, + "step": 7140 + }, + { + "epoch": 0.5925035243386683, + "grad_norm": 0.9317891001701355, + "learning_rate": 4.262388392292845e-05, + "loss": 1.1841, + "mean_token_accuracy": 0.7003548637032508, + "num_tokens": 233992612.0, + "step": 7145 + }, + { + "epoch": 0.5929181524172817, + "grad_norm": 0.895683765411377, + "learning_rate": 4.2552316541639216e-05, + "loss": 1.1809, + "mean_token_accuracy": 0.6913734123110771, + "num_tokens": 234156452.0, + "step": 7150 + }, + { + "epoch": 0.5933327804958952, + "grad_norm": 0.8942002654075623, + "learning_rate": 4.248076476166771e-05, + "loss": 1.1815, + "mean_token_accuracy": 0.6950024455785752, + "num_tokens": 234320292.0, + "step": 7155 + }, + { + "epoch": 0.5937474085745087, + "grad_norm": 0.9098842144012451, + "learning_rate": 4.240922873289976e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6897869989275932, + "num_tokens": 234484006.0, + "step": 7160 + }, + { + "epoch": 0.5941620366531222, + "grad_norm": 0.9092161059379578, + "learning_rate": 4.233770860518821e-05, + "loss": 1.1719, + "mean_token_accuracy": 0.6982835829257965, + "num_tokens": 234647277.0, + "step": 7165 + }, + { + "epoch": 0.5945766647317356, + "grad_norm": 0.9007692933082581, + "learning_rate": 4.226620452835252e-05, + "loss": 1.1977, + "mean_token_accuracy": 0.6938905209302902, + "num_tokens": 234811117.0, + "step": 7170 + }, + { + "epoch": 0.5949912928103491, + "grad_norm": 0.9069666266441345, + "learning_rate": 4.2194716652178576e-05, + "loss": 1.2028, + "mean_token_accuracy": 0.6928763464093208, + "num_tokens": 234974957.0, + "step": 7175 + }, + { + "epoch": 0.5954059208889626, + "grad_norm": 0.9262700080871582, + "learning_rate": 4.2123245126418346e-05, + "loss": 1.2319, + "mean_token_accuracy": 0.6861192613840104, + "num_tokens": 235138797.0, + "step": 7180 + }, + { + "epoch": 0.5958205489675761, + "grad_norm": 0.8939736485481262, + "learning_rate": 4.20517901007895e-05, + "loss": 1.2209, + "mean_token_accuracy": 0.68668133020401, + "num_tokens": 235302637.0, + "step": 7185 + }, + { + "epoch": 0.5962351770461896, + "grad_norm": 0.983397364616394, + "learning_rate": 4.198035172497517e-05, + "loss": 1.2777, + "mean_token_accuracy": 0.6798264905810356, + "num_tokens": 235466477.0, + "step": 7190 + }, + { + "epoch": 0.596649805124803, + "grad_norm": 0.9213573336601257, + "learning_rate": 4.190893014862362e-05, + "loss": 1.2034, + "mean_token_accuracy": 0.6935972616076469, + "num_tokens": 235630317.0, + "step": 7195 + }, + { + "epoch": 0.5970644332034165, + "grad_norm": 0.9221646189689636, + "learning_rate": 4.183752552134791e-05, + "loss": 1.1677, + "mean_token_accuracy": 0.6981610432267189, + "num_tokens": 235794157.0, + "step": 7200 + }, + { + "epoch": 0.59747906128203, + "grad_norm": 1.1523939371109009, + "learning_rate": 4.1766137992725576e-05, + "loss": 1.1463, + "mean_token_accuracy": 0.699657866358757, + "num_tokens": 235957997.0, + "step": 7205 + }, + { + "epoch": 0.5978936893606435, + "grad_norm": 0.926038384437561, + "learning_rate": 4.169476771229835e-05, + "loss": 1.2636, + "mean_token_accuracy": 0.6807978987693787, + "num_tokens": 236121837.0, + "step": 7210 + }, + { + "epoch": 0.5983083174392569, + "grad_norm": 0.9002109169960022, + "learning_rate": 4.1623414829571875e-05, + "loss": 1.2146, + "mean_token_accuracy": 0.6921409219503403, + "num_tokens": 236285522.0, + "step": 7215 + }, + { + "epoch": 0.5987229455178704, + "grad_norm": 0.8745740056037903, + "learning_rate": 4.155207949401528e-05, + "loss": 1.2097, + "mean_token_accuracy": 0.6918316230177879, + "num_tokens": 236449362.0, + "step": 7220 + }, + { + "epoch": 0.599137573596484, + "grad_norm": 0.8774746060371399, + "learning_rate": 4.1480761855060974e-05, + "loss": 1.1393, + "mean_token_accuracy": 0.7037817686796188, + "num_tokens": 236613202.0, + "step": 7225 + }, + { + "epoch": 0.5995522016750975, + "grad_norm": 0.9291770458221436, + "learning_rate": 4.14094620621043e-05, + "loss": 1.1597, + "mean_token_accuracy": 0.7004643231630325, + "num_tokens": 236777042.0, + "step": 7230 + }, + { + "epoch": 0.599966829753711, + "grad_norm": 0.8415595889091492, + "learning_rate": 4.133818026450318e-05, + "loss": 1.1789, + "mean_token_accuracy": 0.6951625302433968, + "num_tokens": 236940770.0, + "step": 7235 + }, + { + "epoch": 0.6003814578323244, + "grad_norm": 0.9406865239143372, + "learning_rate": 4.1266916611577886e-05, + "loss": 1.1178, + "mean_token_accuracy": 0.7086510300636292, + "num_tokens": 237104610.0, + "step": 7240 + }, + { + "epoch": 0.6007960859109379, + "grad_norm": 0.9170796871185303, + "learning_rate": 4.119567125261069e-05, + "loss": 1.2775, + "mean_token_accuracy": 0.6801114067435264, + "num_tokens": 237267808.0, + "step": 7245 + }, + { + "epoch": 0.6012107139895514, + "grad_norm": 0.9180638790130615, + "learning_rate": 4.112444433684545e-05, + "loss": 1.2122, + "mean_token_accuracy": 0.689168743789196, + "num_tokens": 237431242.0, + "step": 7250 + }, + { + "epoch": 0.6016253420681649, + "grad_norm": 0.9154341220855713, + "learning_rate": 4.105323601348749e-05, + "loss": 1.2155, + "mean_token_accuracy": 0.6859420806169509, + "num_tokens": 237595082.0, + "step": 7255 + }, + { + "epoch": 0.6020399701467783, + "grad_norm": 0.9194984436035156, + "learning_rate": 4.098204643170316e-05, + "loss": 1.1633, + "mean_token_accuracy": 0.7001832857728004, + "num_tokens": 237758922.0, + "step": 7260 + }, + { + "epoch": 0.6024545982253918, + "grad_norm": 0.9072752594947815, + "learning_rate": 4.091087574061952e-05, + "loss": 1.2475, + "mean_token_accuracy": 0.6822566166520119, + "num_tokens": 237921973.0, + "step": 7265 + }, + { + "epoch": 0.6028692263040053, + "grad_norm": 0.8539906740188599, + "learning_rate": 4.083972408932407e-05, + "loss": 1.1633, + "mean_token_accuracy": 0.6977392688393593, + "num_tokens": 238085383.0, + "step": 7270 + }, + { + "epoch": 0.6032838543826188, + "grad_norm": 0.8923770785331726, + "learning_rate": 4.076859162686446e-05, + "loss": 1.1732, + "mean_token_accuracy": 0.6982282474637032, + "num_tokens": 238249223.0, + "step": 7275 + }, + { + "epoch": 0.6036984824612323, + "grad_norm": 0.9751062393188477, + "learning_rate": 4.069747850224811e-05, + "loss": 1.2062, + "mean_token_accuracy": 0.691937729716301, + "num_tokens": 238412664.0, + "step": 7280 + }, + { + "epoch": 0.6041131105398457, + "grad_norm": 0.8665412068367004, + "learning_rate": 4.0626384864441925e-05, + "loss": 1.1271, + "mean_token_accuracy": 0.7078262493014336, + "num_tokens": 238576504.0, + "step": 7285 + }, + { + "epoch": 0.6045277386184592, + "grad_norm": 0.9097188711166382, + "learning_rate": 4.0555310862372e-05, + "loss": 1.2266, + "mean_token_accuracy": 0.6905669540166854, + "num_tokens": 238740344.0, + "step": 7290 + }, + { + "epoch": 0.6049423666970727, + "grad_norm": 0.9177653789520264, + "learning_rate": 4.0484256644923325e-05, + "loss": 1.1188, + "mean_token_accuracy": 0.7084032386541367, + "num_tokens": 238904059.0, + "step": 7295 + }, + { + "epoch": 0.6053569947756862, + "grad_norm": 0.8903054594993591, + "learning_rate": 4.0413222360939395e-05, + "loss": 1.1767, + "mean_token_accuracy": 0.6976111918687821, + "num_tokens": 239067899.0, + "step": 7300 + }, + { + "epoch": 0.6057716228542996, + "grad_norm": 0.8641144037246704, + "learning_rate": 4.034220815922199e-05, + "loss": 1.2182, + "mean_token_accuracy": 0.688984602689743, + "num_tokens": 239231739.0, + "step": 7305 + }, + { + "epoch": 0.6061862509329132, + "grad_norm": 0.9189815521240234, + "learning_rate": 4.0271214188530804e-05, + "loss": 1.1945, + "mean_token_accuracy": 0.6914650112390518, + "num_tokens": 239394817.0, + "step": 7310 + }, + { + "epoch": 0.6066008790115267, + "grad_norm": 0.9175986051559448, + "learning_rate": 4.020024059758313e-05, + "loss": 1.2556, + "mean_token_accuracy": 0.6834799647331238, + "num_tokens": 239558657.0, + "step": 7315 + }, + { + "epoch": 0.6070155070901402, + "grad_norm": 0.9445254802703857, + "learning_rate": 4.012928753505362e-05, + "loss": 1.0622, + "mean_token_accuracy": 0.7175769805908203, + "num_tokens": 239722497.0, + "step": 7320 + }, + { + "epoch": 0.6074301351687537, + "grad_norm": 0.9159888625144958, + "learning_rate": 4.00583551495739e-05, + "loss": 1.2941, + "mean_token_accuracy": 0.6788978517055512, + "num_tokens": 239886337.0, + "step": 7325 + }, + { + "epoch": 0.6078447632473671, + "grad_norm": 0.897580087184906, + "learning_rate": 3.9987443589732256e-05, + "loss": 1.1761, + "mean_token_accuracy": 0.6998594805598259, + "num_tokens": 240050177.0, + "step": 7330 + }, + { + "epoch": 0.6082593913259806, + "grad_norm": 0.9680034518241882, + "learning_rate": 3.9916553004073376e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7039039567112922, + "num_tokens": 240214017.0, + "step": 7335 + }, + { + "epoch": 0.6086740194045941, + "grad_norm": 0.9162238836288452, + "learning_rate": 3.9845683541098013e-05, + "loss": 1.2331, + "mean_token_accuracy": 0.6869929172098637, + "num_tokens": 240377857.0, + "step": 7340 + }, + { + "epoch": 0.6090886474832076, + "grad_norm": 0.8927464485168457, + "learning_rate": 3.977483534926267e-05, + "loss": 1.1504, + "mean_token_accuracy": 0.6996334314346313, + "num_tokens": 240541697.0, + "step": 7345 + }, + { + "epoch": 0.609503275561821, + "grad_norm": 0.9331327080726624, + "learning_rate": 3.970400857697929e-05, + "loss": 1.2533, + "mean_token_accuracy": 0.6825652077794075, + "num_tokens": 240704575.0, + "step": 7350 + }, + { + "epoch": 0.6099179036404345, + "grad_norm": 0.9102768898010254, + "learning_rate": 3.963320337261491e-05, + "loss": 1.2194, + "mean_token_accuracy": 0.6883186668157577, + "num_tokens": 240868415.0, + "step": 7355 + }, + { + "epoch": 0.610332531719048, + "grad_norm": 0.9383589029312134, + "learning_rate": 3.9562419884491466e-05, + "loss": 1.1959, + "mean_token_accuracy": 0.6939088463783264, + "num_tokens": 241032255.0, + "step": 7360 + }, + { + "epoch": 0.6107471597976615, + "grad_norm": 0.9416522979736328, + "learning_rate": 3.949165826088533e-05, + "loss": 1.2391, + "mean_token_accuracy": 0.6832233607769013, + "num_tokens": 241196095.0, + "step": 7365 + }, + { + "epoch": 0.611161787876275, + "grad_norm": 0.8662397265434265, + "learning_rate": 3.94209186500271e-05, + "loss": 1.2237, + "mean_token_accuracy": 0.6945136860013008, + "num_tokens": 241359935.0, + "step": 7370 + }, + { + "epoch": 0.6115764159548884, + "grad_norm": 0.9300234913825989, + "learning_rate": 3.93502012001013e-05, + "loss": 1.1999, + "mean_token_accuracy": 0.6895161271095276, + "num_tokens": 241523775.0, + "step": 7375 + }, + { + "epoch": 0.6119910440335019, + "grad_norm": 0.9370386004447937, + "learning_rate": 3.927950605924593e-05, + "loss": 1.2483, + "mean_token_accuracy": 0.686803525686264, + "num_tokens": 241687615.0, + "step": 7380 + }, + { + "epoch": 0.6124056721121154, + "grad_norm": 0.8926957249641418, + "learning_rate": 3.9208833375552366e-05, + "loss": 1.1554, + "mean_token_accuracy": 0.69775170981884, + "num_tokens": 241851455.0, + "step": 7385 + }, + { + "epoch": 0.6128203001907289, + "grad_norm": 0.9361787438392639, + "learning_rate": 3.9138183297064894e-05, + "loss": 1.247, + "mean_token_accuracy": 0.6879825234413147, + "num_tokens": 242014996.0, + "step": 7390 + }, + { + "epoch": 0.6132349282693424, + "grad_norm": 0.9342592358589172, + "learning_rate": 3.9067555971780425e-05, + "loss": 1.201, + "mean_token_accuracy": 0.6900476559996604, + "num_tokens": 242178836.0, + "step": 7395 + }, + { + "epoch": 0.6136495563479559, + "grad_norm": 0.9027369022369385, + "learning_rate": 3.899695154764825e-05, + "loss": 1.1817, + "mean_token_accuracy": 0.6957172527909279, + "num_tokens": 242342676.0, + "step": 7400 + }, + { + "epoch": 0.6140641844265694, + "grad_norm": 0.890907883644104, + "learning_rate": 3.892637017256967e-05, + "loss": 1.1623, + "mean_token_accuracy": 0.7006231635808945, + "num_tokens": 242506516.0, + "step": 7405 + }, + { + "epoch": 0.6144788125051829, + "grad_norm": 0.8817828297615051, + "learning_rate": 3.88558119943977e-05, + "loss": 1.1495, + "mean_token_accuracy": 0.7012776091694832, + "num_tokens": 242670001.0, + "step": 7410 + }, + { + "epoch": 0.6148934405837964, + "grad_norm": 0.9252811670303345, + "learning_rate": 3.878527716093673e-05, + "loss": 1.2046, + "mean_token_accuracy": 0.6961571365594864, + "num_tokens": 242833841.0, + "step": 7415 + }, + { + "epoch": 0.6153080686624098, + "grad_norm": 0.866140604019165, + "learning_rate": 3.871476581994232e-05, + "loss": 1.088, + "mean_token_accuracy": 0.7118157342076301, + "num_tokens": 242997681.0, + "step": 7420 + }, + { + "epoch": 0.6157226967410233, + "grad_norm": 0.8810722827911377, + "learning_rate": 3.864427811912078e-05, + "loss": 1.1928, + "mean_token_accuracy": 0.6912584364414215, + "num_tokens": 243161071.0, + "step": 7425 + }, + { + "epoch": 0.6161373248196368, + "grad_norm": 0.9337232112884521, + "learning_rate": 3.8573814206128874e-05, + "loss": 1.2626, + "mean_token_accuracy": 0.6843047440052032, + "num_tokens": 243324911.0, + "step": 7430 + }, + { + "epoch": 0.6165519528982503, + "grad_norm": 0.9594038128852844, + "learning_rate": 3.8503374228573566e-05, + "loss": 1.2071, + "mean_token_accuracy": 0.6930954113602639, + "num_tokens": 243488240.0, + "step": 7435 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.9113042950630188, + "learning_rate": 3.8432958334011696e-05, + "loss": 1.1896, + "mean_token_accuracy": 0.6943609498441219, + "num_tokens": 243652080.0, + "step": 7440 + }, + { + "epoch": 0.6173812090554772, + "grad_norm": 0.9461240172386169, + "learning_rate": 3.836256666994961e-05, + "loss": 1.2719, + "mean_token_accuracy": 0.683626589179039, + "num_tokens": 243815920.0, + "step": 7445 + }, + { + "epoch": 0.6177958371340907, + "grad_norm": 0.8677425384521484, + "learning_rate": 3.8292199383842904e-05, + "loss": 1.1222, + "mean_token_accuracy": 0.702608747780323, + "num_tokens": 243979760.0, + "step": 7450 + }, + { + "epoch": 0.6182104652127042, + "grad_norm": 0.8997417688369751, + "learning_rate": 3.8221856623096186e-05, + "loss": 1.2245, + "mean_token_accuracy": 0.6878299131989479, + "num_tokens": 244143600.0, + "step": 7455 + }, + { + "epoch": 0.6186250932913177, + "grad_norm": 0.9174249172210693, + "learning_rate": 3.815153853506255e-05, + "loss": 1.2138, + "mean_token_accuracy": 0.6931146115064621, + "num_tokens": 244307440.0, + "step": 7460 + }, + { + "epoch": 0.6190397213699311, + "grad_norm": 0.8721328973770142, + "learning_rate": 3.808124526704352e-05, + "loss": 1.1536, + "mean_token_accuracy": 0.6994314640760422, + "num_tokens": 244470259.0, + "step": 7465 + }, + { + "epoch": 0.6194543494485446, + "grad_norm": 0.903262197971344, + "learning_rate": 3.801097696628859e-05, + "loss": 1.2394, + "mean_token_accuracy": 0.6869745880365372, + "num_tokens": 244634099.0, + "step": 7470 + }, + { + "epoch": 0.6198689775271582, + "grad_norm": 0.8871456384658813, + "learning_rate": 3.7940733779994936e-05, + "loss": 1.105, + "mean_token_accuracy": 0.708999265730381, + "num_tokens": 244797939.0, + "step": 7475 + }, + { + "epoch": 0.6202836056057717, + "grad_norm": 0.897445559501648, + "learning_rate": 3.7870515855307155e-05, + "loss": 1.1674, + "mean_token_accuracy": 0.6953873410820961, + "num_tokens": 244961779.0, + "step": 7480 + }, + { + "epoch": 0.6206982336843851, + "grad_norm": 0.9039589166641235, + "learning_rate": 3.78003233393169e-05, + "loss": 1.2352, + "mean_token_accuracy": 0.6875746801495553, + "num_tokens": 245125126.0, + "step": 7485 + }, + { + "epoch": 0.6211128617629986, + "grad_norm": 0.9061029553413391, + "learning_rate": 3.773015637906263e-05, + "loss": 1.119, + "mean_token_accuracy": 0.7055691704154015, + "num_tokens": 245288633.0, + "step": 7490 + }, + { + "epoch": 0.6215274898416121, + "grad_norm": 0.889281153678894, + "learning_rate": 3.7660015121529214e-05, + "loss": 1.1886, + "mean_token_accuracy": 0.6930413022637367, + "num_tokens": 245452473.0, + "step": 7495 + }, + { + "epoch": 0.6219421179202256, + "grad_norm": 0.9166731834411621, + "learning_rate": 3.758989971364774e-05, + "loss": 1.2139, + "mean_token_accuracy": 0.6898704841732979, + "num_tokens": 245616313.0, + "step": 7500 + }, + { + "epoch": 0.6223567459988391, + "grad_norm": 0.862187922000885, + "learning_rate": 3.7519810302295136e-05, + "loss": 1.1025, + "mean_token_accuracy": 0.7072091907262802, + "num_tokens": 245780153.0, + "step": 7505 + }, + { + "epoch": 0.6227713740774525, + "grad_norm": 0.8542613387107849, + "learning_rate": 3.744974703429382e-05, + "loss": 1.2232, + "mean_token_accuracy": 0.6925891965627671, + "num_tokens": 245943993.0, + "step": 7510 + }, + { + "epoch": 0.623186002156066, + "grad_norm": 0.9268004894256592, + "learning_rate": 3.737971005641149e-05, + "loss": 1.1749, + "mean_token_accuracy": 0.6948831617832184, + "num_tokens": 246107510.0, + "step": 7515 + }, + { + "epoch": 0.6236006302346795, + "grad_norm": 0.8638924360275269, + "learning_rate": 3.730969951536081e-05, + "loss": 1.2193, + "mean_token_accuracy": 0.6944892466068268, + "num_tokens": 246271350.0, + "step": 7520 + }, + { + "epoch": 0.624015258313293, + "grad_norm": 0.8861833810806274, + "learning_rate": 3.723971555779896e-05, + "loss": 1.248, + "mean_token_accuracy": 0.6873167157173157, + "num_tokens": 246435190.0, + "step": 7525 + }, + { + "epoch": 0.6244298863919064, + "grad_norm": 0.8997588753700256, + "learning_rate": 3.716975833032752e-05, + "loss": 1.2002, + "mean_token_accuracy": 0.69580889493227, + "num_tokens": 246599030.0, + "step": 7530 + }, + { + "epoch": 0.6248445144705199, + "grad_norm": 0.9198819994926453, + "learning_rate": 3.7099827979492075e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6875794202089309, + "num_tokens": 246762870.0, + "step": 7535 + }, + { + "epoch": 0.6252591425491334, + "grad_norm": 0.9142041802406311, + "learning_rate": 3.702992465178182e-05, + "loss": 1.2455, + "mean_token_accuracy": 0.6857679948210716, + "num_tokens": 246926240.0, + "step": 7540 + }, + { + "epoch": 0.6256737706277469, + "grad_norm": 0.9179061651229858, + "learning_rate": 3.696004849362946e-05, + "loss": 1.2106, + "mean_token_accuracy": 0.6883247837424278, + "num_tokens": 247090080.0, + "step": 7545 + }, + { + "epoch": 0.6260883987063603, + "grad_norm": 0.8800495862960815, + "learning_rate": 3.689019965141069e-05, + "loss": 1.1264, + "mean_token_accuracy": 0.7046978026628494, + "num_tokens": 247253601.0, + "step": 7550 + }, + { + "epoch": 0.6265030267849738, + "grad_norm": 0.9119208455085754, + "learning_rate": 3.682037827144409e-05, + "loss": 1.1777, + "mean_token_accuracy": 0.6985948160290718, + "num_tokens": 247417441.0, + "step": 7555 + }, + { + "epoch": 0.6269176548635874, + "grad_norm": 0.8702818751335144, + "learning_rate": 3.675058449999057e-05, + "loss": 1.1369, + "mean_token_accuracy": 0.7018633931875229, + "num_tokens": 247581281.0, + "step": 7560 + }, + { + "epoch": 0.6273322829422009, + "grad_norm": 0.9180110692977905, + "learning_rate": 3.668081848325333e-05, + "loss": 1.2648, + "mean_token_accuracy": 0.6834799602627755, + "num_tokens": 247745121.0, + "step": 7565 + }, + { + "epoch": 0.6277469110208144, + "grad_norm": 0.9151014685630798, + "learning_rate": 3.661108036737737e-05, + "loss": 1.1606, + "mean_token_accuracy": 0.7009775161743164, + "num_tokens": 247908961.0, + "step": 7570 + }, + { + "epoch": 0.6281615390994278, + "grad_norm": 0.94527268409729, + "learning_rate": 3.654137029844924e-05, + "loss": 1.213, + "mean_token_accuracy": 0.6887867733836174, + "num_tokens": 248072066.0, + "step": 7575 + }, + { + "epoch": 0.6285761671780413, + "grad_norm": 0.9195386171340942, + "learning_rate": 3.647168842249679e-05, + "loss": 1.1822, + "mean_token_accuracy": 0.693835535645485, + "num_tokens": 248235906.0, + "step": 7580 + }, + { + "epoch": 0.6289907952566548, + "grad_norm": 0.8834486603736877, + "learning_rate": 3.640203488548876e-05, + "loss": 1.1563, + "mean_token_accuracy": 0.7015090450644493, + "num_tokens": 248399746.0, + "step": 7585 + }, + { + "epoch": 0.6294054233352683, + "grad_norm": 0.9106930494308472, + "learning_rate": 3.633240983333452e-05, + "loss": 1.1919, + "mean_token_accuracy": 0.6935850411653519, + "num_tokens": 248563586.0, + "step": 7590 + }, + { + "epoch": 0.6298200514138818, + "grad_norm": 0.9122629165649414, + "learning_rate": 3.6262813411883814e-05, + "loss": 1.1649, + "mean_token_accuracy": 0.6981880038976669, + "num_tokens": 248726869.0, + "step": 7595 + }, + { + "epoch": 0.6302346794924952, + "grad_norm": 0.896998941898346, + "learning_rate": 3.6193245766926406e-05, + "loss": 1.1874, + "mean_token_accuracy": 0.6955461889505387, + "num_tokens": 248890709.0, + "step": 7600 + }, + { + "epoch": 0.6306493075711087, + "grad_norm": 0.903556764125824, + "learning_rate": 3.612370704419172e-05, + "loss": 1.1902, + "mean_token_accuracy": 0.6891190126538277, + "num_tokens": 249054549.0, + "step": 7605 + }, + { + "epoch": 0.6310639356497222, + "grad_norm": 0.9326213002204895, + "learning_rate": 3.6054197389348665e-05, + "loss": 1.2256, + "mean_token_accuracy": 0.6877138823270798, + "num_tokens": 249216858.0, + "step": 7610 + }, + { + "epoch": 0.6314785637283357, + "grad_norm": 0.9058736562728882, + "learning_rate": 3.598471694800523e-05, + "loss": 1.1588, + "mean_token_accuracy": 0.7015294149518013, + "num_tokens": 249380685.0, + "step": 7615 + }, + { + "epoch": 0.6318931918069491, + "grad_norm": 0.8819628357887268, + "learning_rate": 3.591526586570818e-05, + "loss": 1.1887, + "mean_token_accuracy": 0.6957048639655113, + "num_tokens": 249543792.0, + "step": 7620 + }, + { + "epoch": 0.6323078198855626, + "grad_norm": 0.9046173095703125, + "learning_rate": 3.584584428794284e-05, + "loss": 1.1736, + "mean_token_accuracy": 0.697366812825203, + "num_tokens": 249707632.0, + "step": 7625 + }, + { + "epoch": 0.6327224479641761, + "grad_norm": 0.9316004514694214, + "learning_rate": 3.5776452360132674e-05, + "loss": 1.093, + "mean_token_accuracy": 0.7076551809906959, + "num_tokens": 249871472.0, + "step": 7630 + }, + { + "epoch": 0.6331370760427896, + "grad_norm": 0.8980688452720642, + "learning_rate": 3.57070902276391e-05, + "loss": 1.1912, + "mean_token_accuracy": 0.6930290833115578, + "num_tokens": 250035312.0, + "step": 7635 + }, + { + "epoch": 0.633551704121403, + "grad_norm": 0.8699662685394287, + "learning_rate": 3.563775803576102e-05, + "loss": 1.1337, + "mean_token_accuracy": 0.7062133401632309, + "num_tokens": 250199152.0, + "step": 7640 + }, + { + "epoch": 0.6339663322000166, + "grad_norm": 0.9415832161903381, + "learning_rate": 3.5568455929734703e-05, + "loss": 1.2639, + "mean_token_accuracy": 0.6828629016876221, + "num_tokens": 250362992.0, + "step": 7645 + }, + { + "epoch": 0.6343809602786301, + "grad_norm": 0.9067273139953613, + "learning_rate": 3.549918405473338e-05, + "loss": 1.1426, + "mean_token_accuracy": 0.7032135829329491, + "num_tokens": 250526832.0, + "step": 7650 + }, + { + "epoch": 0.6347955883572436, + "grad_norm": 0.8815768957138062, + "learning_rate": 3.542994255586691e-05, + "loss": 1.1942, + "mean_token_accuracy": 0.6964015141129494, + "num_tokens": 250690672.0, + "step": 7655 + }, + { + "epoch": 0.6352102164358571, + "grad_norm": 0.890948474407196, + "learning_rate": 3.5360731578181586e-05, + "loss": 1.1244, + "mean_token_accuracy": 0.7033846527338028, + "num_tokens": 250854512.0, + "step": 7660 + }, + { + "epoch": 0.6356248445144705, + "grad_norm": 0.8967106342315674, + "learning_rate": 3.529155126665972e-05, + "loss": 1.2503, + "mean_token_accuracy": 0.6870967745780945, + "num_tokens": 251018352.0, + "step": 7665 + }, + { + "epoch": 0.636039472593084, + "grad_norm": 0.9507739543914795, + "learning_rate": 3.522240176621938e-05, + "loss": 1.2102, + "mean_token_accuracy": 0.6890212625265122, + "num_tokens": 251182192.0, + "step": 7670 + }, + { + "epoch": 0.6364541006716975, + "grad_norm": 0.9710748195648193, + "learning_rate": 3.5153283221714114e-05, + "loss": 1.1734, + "mean_token_accuracy": 0.6927552953362465, + "num_tokens": 251345096.0, + "step": 7675 + }, + { + "epoch": 0.636868728750311, + "grad_norm": 0.881964921951294, + "learning_rate": 3.5084195777932655e-05, + "loss": 1.1581, + "mean_token_accuracy": 0.6974370807409287, + "num_tokens": 251508807.0, + "step": 7680 + }, + { + "epoch": 0.6372833568289245, + "grad_norm": 0.8972564339637756, + "learning_rate": 3.5015139579598506e-05, + "loss": 1.2058, + "mean_token_accuracy": 0.6977211624383927, + "num_tokens": 251672647.0, + "step": 7685 + }, + { + "epoch": 0.6376979849075379, + "grad_norm": 0.8787938356399536, + "learning_rate": 3.494611477136978e-05, + "loss": 1.2091, + "mean_token_accuracy": 0.689198437333107, + "num_tokens": 251836487.0, + "step": 7690 + }, + { + "epoch": 0.6381126129861514, + "grad_norm": 0.9102070927619934, + "learning_rate": 3.4877121497838786e-05, + "loss": 1.1796, + "mean_token_accuracy": 0.6987903207540512, + "num_tokens": 252000327.0, + "step": 7695 + }, + { + "epoch": 0.6385272410647649, + "grad_norm": 0.9172378182411194, + "learning_rate": 3.480815990353186e-05, + "loss": 1.2567, + "mean_token_accuracy": 0.6841764420270919, + "num_tokens": 252164167.0, + "step": 7700 + }, + { + "epoch": 0.6389418691433784, + "grad_norm": 0.9535180330276489, + "learning_rate": 3.473923013290887e-05, + "loss": 1.1415, + "mean_token_accuracy": 0.7059567451477051, + "num_tokens": 252328007.0, + "step": 7705 + }, + { + "epoch": 0.6393564972219918, + "grad_norm": 0.8775321841239929, + "learning_rate": 3.467033233036309e-05, + "loss": 1.1707, + "mean_token_accuracy": 0.698680354654789, + "num_tokens": 252491847.0, + "step": 7710 + }, + { + "epoch": 0.6397711253006053, + "grad_norm": 0.9459161162376404, + "learning_rate": 3.4601466640220825e-05, + "loss": 1.1673, + "mean_token_accuracy": 0.6997861638665199, + "num_tokens": 252655687.0, + "step": 7715 + }, + { + "epoch": 0.6401857533792188, + "grad_norm": 0.9370214343070984, + "learning_rate": 3.453263320674105e-05, + "loss": 1.2108, + "mean_token_accuracy": 0.6920271262526512, + "num_tokens": 252819527.0, + "step": 7720 + }, + { + "epoch": 0.6406003814578324, + "grad_norm": 0.955539882183075, + "learning_rate": 3.446383217411526e-05, + "loss": 1.1547, + "mean_token_accuracy": 0.7013013228774071, + "num_tokens": 252983367.0, + "step": 7725 + }, + { + "epoch": 0.6410150095364459, + "grad_norm": 0.8662726283073425, + "learning_rate": 3.439506368646701e-05, + "loss": 1.1857, + "mean_token_accuracy": 0.6947641745209694, + "num_tokens": 253147207.0, + "step": 7730 + }, + { + "epoch": 0.6414296376150593, + "grad_norm": 0.8785839676856995, + "learning_rate": 3.4326327887851686e-05, + "loss": 1.1357, + "mean_token_accuracy": 0.7032488837838173, + "num_tokens": 253310982.0, + "step": 7735 + }, + { + "epoch": 0.6418442656936728, + "grad_norm": 0.9216163754463196, + "learning_rate": 3.4257624922256244e-05, + "loss": 1.1401, + "mean_token_accuracy": 0.7013868510723114, + "num_tokens": 253474822.0, + "step": 7740 + }, + { + "epoch": 0.6422588937722863, + "grad_norm": 0.9228615164756775, + "learning_rate": 3.418895493359882e-05, + "loss": 1.2087, + "mean_token_accuracy": 0.6918560594320298, + "num_tokens": 253638662.0, + "step": 7745 + }, + { + "epoch": 0.6426735218508998, + "grad_norm": 0.8969170451164246, + "learning_rate": 3.412031806572847e-05, + "loss": 1.1717, + "mean_token_accuracy": 0.6972568422555924, + "num_tokens": 253802502.0, + "step": 7750 + }, + { + "epoch": 0.6430881499295132, + "grad_norm": 0.8896551132202148, + "learning_rate": 3.4051714462424874e-05, + "loss": 1.2208, + "mean_token_accuracy": 0.6876038581132888, + "num_tokens": 253966342.0, + "step": 7755 + }, + { + "epoch": 0.6435027780081267, + "grad_norm": 0.8922243118286133, + "learning_rate": 3.398314426739807e-05, + "loss": 1.1533, + "mean_token_accuracy": 0.7022177428007126, + "num_tokens": 254130182.0, + "step": 7760 + }, + { + "epoch": 0.6439174060867402, + "grad_norm": 0.8776208758354187, + "learning_rate": 3.391460762428803e-05, + "loss": 1.1752, + "mean_token_accuracy": 0.7000305414199829, + "num_tokens": 254294022.0, + "step": 7765 + }, + { + "epoch": 0.6443320341653537, + "grad_norm": 0.8624290227890015, + "learning_rate": 3.384610467666453e-05, + "loss": 1.1958, + "mean_token_accuracy": 0.6932917907834053, + "num_tokens": 254457862.0, + "step": 7770 + }, + { + "epoch": 0.6447466622439672, + "grad_norm": 0.8558380603790283, + "learning_rate": 3.377763556802668e-05, + "loss": 1.1781, + "mean_token_accuracy": 0.6946969717741013, + "num_tokens": 254621702.0, + "step": 7775 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.8980141878128052, + "learning_rate": 3.37092004418028e-05, + "loss": 1.0717, + "mean_token_accuracy": 0.7139019802212715, + "num_tokens": 254784915.0, + "step": 7780 + }, + { + "epoch": 0.6455759184011941, + "grad_norm": 0.9248932600021362, + "learning_rate": 3.3640799441349935e-05, + "loss": 1.2491, + "mean_token_accuracy": 0.6859604060649872, + "num_tokens": 254948755.0, + "step": 7785 + }, + { + "epoch": 0.6459905464798076, + "grad_norm": 0.8932434916496277, + "learning_rate": 3.357243270995368e-05, + "loss": 1.1461, + "mean_token_accuracy": 0.7035068452358246, + "num_tokens": 255112595.0, + "step": 7790 + }, + { + "epoch": 0.6464051745584211, + "grad_norm": 0.8438484072685242, + "learning_rate": 3.3504100390827856e-05, + "loss": 1.1113, + "mean_token_accuracy": 0.7079789817333222, + "num_tokens": 255276435.0, + "step": 7795 + }, + { + "epoch": 0.6468198026370345, + "grad_norm": 0.9122107625007629, + "learning_rate": 3.3435802627114146e-05, + "loss": 1.1611, + "mean_token_accuracy": 0.697667233645916, + "num_tokens": 255439429.0, + "step": 7800 + }, + { + "epoch": 0.647234430715648, + "grad_norm": 0.8767837882041931, + "learning_rate": 3.336753956188192e-05, + "loss": 1.1916, + "mean_token_accuracy": 0.6942631945014, + "num_tokens": 255603269.0, + "step": 7805 + }, + { + "epoch": 0.6476490587942616, + "grad_norm": 0.9044637680053711, + "learning_rate": 3.329931133812783e-05, + "loss": 1.2343, + "mean_token_accuracy": 0.6886852413415909, + "num_tokens": 255767109.0, + "step": 7810 + }, + { + "epoch": 0.6480636868728751, + "grad_norm": 0.9283355474472046, + "learning_rate": 3.323111809877552e-05, + "loss": 1.2322, + "mean_token_accuracy": 0.6858137831091881, + "num_tokens": 255930949.0, + "step": 7815 + }, + { + "epoch": 0.6484783149514886, + "grad_norm": 0.8792169690132141, + "learning_rate": 3.3162959986675357e-05, + "loss": 1.2008, + "mean_token_accuracy": 0.6928946733474731, + "num_tokens": 256094789.0, + "step": 7820 + }, + { + "epoch": 0.648892943030102, + "grad_norm": 0.9532812833786011, + "learning_rate": 3.309483714460417e-05, + "loss": 1.1927, + "mean_token_accuracy": 0.6942570865154266, + "num_tokens": 256258629.0, + "step": 7825 + }, + { + "epoch": 0.6493075711087155, + "grad_norm": 0.8612990379333496, + "learning_rate": 3.302674971526485e-05, + "loss": 1.1689, + "mean_token_accuracy": 0.7000855296850205, + "num_tokens": 256422469.0, + "step": 7830 + }, + { + "epoch": 0.649722199187329, + "grad_norm": 0.8923850059509277, + "learning_rate": 3.295869784128611e-05, + "loss": 1.1242, + "mean_token_accuracy": 0.7061583563685417, + "num_tokens": 256586309.0, + "step": 7835 + }, + { + "epoch": 0.6501368272659425, + "grad_norm": 0.9322280287742615, + "learning_rate": 3.2890681665222226e-05, + "loss": 1.2528, + "mean_token_accuracy": 0.6825391009449959, + "num_tokens": 256750149.0, + "step": 7840 + }, + { + "epoch": 0.6505514553445559, + "grad_norm": 0.9143825173377991, + "learning_rate": 3.282270132955266e-05, + "loss": 1.1736, + "mean_token_accuracy": 0.7003849029541016, + "num_tokens": 256913989.0, + "step": 7845 + }, + { + "epoch": 0.6509660834231694, + "grad_norm": 0.919293224811554, + "learning_rate": 3.275475697668178e-05, + "loss": 1.229, + "mean_token_accuracy": 0.6859237551689148, + "num_tokens": 257077829.0, + "step": 7850 + }, + { + "epoch": 0.6513807115017829, + "grad_norm": 0.9100547432899475, + "learning_rate": 3.2686848748938615e-05, + "loss": 1.2084, + "mean_token_accuracy": 0.6919843584299088, + "num_tokens": 257241669.0, + "step": 7855 + }, + { + "epoch": 0.6517953395803964, + "grad_norm": 0.9073319435119629, + "learning_rate": 3.261897678857651e-05, + "loss": 1.2239, + "mean_token_accuracy": 0.6890473529696465, + "num_tokens": 257405149.0, + "step": 7860 + }, + { + "epoch": 0.6522099676590098, + "grad_norm": 0.8998532891273499, + "learning_rate": 3.255114123777282e-05, + "loss": 1.1502, + "mean_token_accuracy": 0.6979288876056671, + "num_tokens": 257568989.0, + "step": 7865 + }, + { + "epoch": 0.6526245957376233, + "grad_norm": 0.9466356635093689, + "learning_rate": 3.2483342238628645e-05, + "loss": 1.2448, + "mean_token_accuracy": 0.6834188640117645, + "num_tokens": 257732829.0, + "step": 7870 + }, + { + "epoch": 0.6530392238162368, + "grad_norm": 0.9228238463401794, + "learning_rate": 3.2415579933168525e-05, + "loss": 1.1342, + "mean_token_accuracy": 0.7012707725167274, + "num_tokens": 257896669.0, + "step": 7875 + }, + { + "epoch": 0.6534538518948503, + "grad_norm": 0.8968457579612732, + "learning_rate": 3.234785446334009e-05, + "loss": 1.1727, + "mean_token_accuracy": 0.6980144158005714, + "num_tokens": 258060509.0, + "step": 7880 + }, + { + "epoch": 0.6538684799734638, + "grad_norm": 0.9200351238250732, + "learning_rate": 3.228016597101387e-05, + "loss": 1.1889, + "mean_token_accuracy": 0.6939210638403892, + "num_tokens": 258224349.0, + "step": 7885 + }, + { + "epoch": 0.6542831080520772, + "grad_norm": 0.9175406694412231, + "learning_rate": 3.221251459798291e-05, + "loss": 1.2323, + "mean_token_accuracy": 0.6847568452358246, + "num_tokens": 258388189.0, + "step": 7890 + }, + { + "epoch": 0.6546977361306908, + "grad_norm": 0.8689358234405518, + "learning_rate": 3.214490048596246e-05, + "loss": 1.1572, + "mean_token_accuracy": 0.700342133641243, + "num_tokens": 258552029.0, + "step": 7895 + }, + { + "epoch": 0.6551123642093043, + "grad_norm": 0.8892029523849487, + "learning_rate": 3.2077323776589766e-05, + "loss": 1.1111, + "mean_token_accuracy": 0.7068059653043747, + "num_tokens": 258715869.0, + "step": 7900 + }, + { + "epoch": 0.6555269922879178, + "grad_norm": 0.8861289024353027, + "learning_rate": 3.200978461142371e-05, + "loss": 1.1604, + "mean_token_accuracy": 0.6996456414461136, + "num_tokens": 258879709.0, + "step": 7905 + }, + { + "epoch": 0.6559416203665313, + "grad_norm": 0.9374545812606812, + "learning_rate": 3.1942283131944525e-05, + "loss": 1.1941, + "mean_token_accuracy": 0.6943059071898461, + "num_tokens": 259042763.0, + "step": 7910 + }, + { + "epoch": 0.6563562484451447, + "grad_norm": 0.889559268951416, + "learning_rate": 3.1874819479553484e-05, + "loss": 1.1385, + "mean_token_accuracy": 0.7040811315178871, + "num_tokens": 259206603.0, + "step": 7915 + }, + { + "epoch": 0.6567708765237582, + "grad_norm": 0.9265528321266174, + "learning_rate": 3.180739379557266e-05, + "loss": 1.1488, + "mean_token_accuracy": 0.7060422763228417, + "num_tokens": 259370443.0, + "step": 7920 + }, + { + "epoch": 0.6571855046023717, + "grad_norm": 0.9090298414230347, + "learning_rate": 3.1740006221244546e-05, + "loss": 1.1003, + "mean_token_accuracy": 0.7120295718312264, + "num_tokens": 259534283.0, + "step": 7925 + }, + { + "epoch": 0.6576001326809852, + "grad_norm": 0.909954845905304, + "learning_rate": 3.1672656897731825e-05, + "loss": 1.0904, + "mean_token_accuracy": 0.7156341642141342, + "num_tokens": 259698123.0, + "step": 7930 + }, + { + "epoch": 0.6580147607595986, + "grad_norm": 0.8400858044624329, + "learning_rate": 3.160534596611704e-05, + "loss": 1.152, + "mean_token_accuracy": 0.7007453545928002, + "num_tokens": 259861963.0, + "step": 7935 + }, + { + "epoch": 0.6584293888382121, + "grad_norm": 0.9129824042320251, + "learning_rate": 3.153807356740235e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7012524455785751, + "num_tokens": 260025803.0, + "step": 7940 + }, + { + "epoch": 0.6588440169168256, + "grad_norm": 0.9285549521446228, + "learning_rate": 3.147083984250914e-05, + "loss": 1.206, + "mean_token_accuracy": 0.6904447689652443, + "num_tokens": 260189643.0, + "step": 7945 + }, + { + "epoch": 0.6592586449954391, + "grad_norm": 0.897851824760437, + "learning_rate": 3.1403644932277814e-05, + "loss": 1.1734, + "mean_token_accuracy": 0.6981427192687988, + "num_tokens": 260353483.0, + "step": 7950 + }, + { + "epoch": 0.6596732730740525, + "grad_norm": 0.8849973082542419, + "learning_rate": 3.1336488977467484e-05, + "loss": 1.1127, + "mean_token_accuracy": 0.7087732166051864, + "num_tokens": 260517323.0, + "step": 7955 + }, + { + "epoch": 0.660087901152666, + "grad_norm": 0.924141526222229, + "learning_rate": 3.126937211875559e-05, + "loss": 1.1991, + "mean_token_accuracy": 0.691159576177597, + "num_tokens": 260681163.0, + "step": 7960 + }, + { + "epoch": 0.6605025292312795, + "grad_norm": 0.912146806716919, + "learning_rate": 3.1202294496737764e-05, + "loss": 1.2283, + "mean_token_accuracy": 0.6835349485278129, + "num_tokens": 260845003.0, + "step": 7965 + }, + { + "epoch": 0.660917157309893, + "grad_norm": 0.9447025656700134, + "learning_rate": 3.113525625192739e-05, + "loss": 1.1328, + "mean_token_accuracy": 0.7048875838518143, + "num_tokens": 261008843.0, + "step": 7970 + }, + { + "epoch": 0.6613317853885065, + "grad_norm": 0.936392068862915, + "learning_rate": 3.106825752475537e-05, + "loss": 1.1901, + "mean_token_accuracy": 0.6934811815619468, + "num_tokens": 261172683.0, + "step": 7975 + }, + { + "epoch": 0.66174641346712, + "grad_norm": 0.9893807172775269, + "learning_rate": 3.100129845556982e-05, + "loss": 1.1741, + "mean_token_accuracy": 0.6976661816239357, + "num_tokens": 261336523.0, + "step": 7980 + }, + { + "epoch": 0.6621610415457335, + "grad_norm": 0.9306498169898987, + "learning_rate": 3.093437918463582e-05, + "loss": 1.1309, + "mean_token_accuracy": 0.7051431089639664, + "num_tokens": 261499679.0, + "step": 7985 + }, + { + "epoch": 0.662575669624347, + "grad_norm": 0.9055891036987305, + "learning_rate": 3.086749985213506e-05, + "loss": 1.1941, + "mean_token_accuracy": 0.6949699714779853, + "num_tokens": 261663044.0, + "step": 7990 + }, + { + "epoch": 0.6629902977029605, + "grad_norm": 0.9195916652679443, + "learning_rate": 3.0800660598165535e-05, + "loss": 1.19, + "mean_token_accuracy": 0.697262954711914, + "num_tokens": 261826884.0, + "step": 7995 + }, + { + "epoch": 0.663404925781574, + "grad_norm": 0.8760239481925964, + "learning_rate": 3.0733861562741294e-05, + "loss": 1.222, + "mean_token_accuracy": 0.6897421807050705, + "num_tokens": 261990724.0, + "step": 8000 + }, + { + "epoch": 0.6638195538601874, + "grad_norm": 0.9419845342636108, + "learning_rate": 3.066710288579221e-05, + "loss": 1.1852, + "mean_token_accuracy": 0.695423997938633, + "num_tokens": 262154564.0, + "step": 8005 + }, + { + "epoch": 0.6642341819388009, + "grad_norm": 0.9906468987464905, + "learning_rate": 3.0600384707163524e-05, + "loss": 1.229, + "mean_token_accuracy": 0.6863575249910354, + "num_tokens": 262318404.0, + "step": 8010 + }, + { + "epoch": 0.6646488100174144, + "grad_norm": 0.8945165276527405, + "learning_rate": 3.053370716661565e-05, + "loss": 1.1661, + "mean_token_accuracy": 0.6972568422555924, + "num_tokens": 262482244.0, + "step": 8015 + }, + { + "epoch": 0.6650634380960279, + "grad_norm": 0.909670352935791, + "learning_rate": 3.046707040382396e-05, + "loss": 1.2078, + "mean_token_accuracy": 0.6953690111637115, + "num_tokens": 262646084.0, + "step": 8020 + }, + { + "epoch": 0.6654780661746413, + "grad_norm": 0.9329404234886169, + "learning_rate": 3.0400474558378278e-05, + "loss": 1.1376, + "mean_token_accuracy": 0.7053213611245155, + "num_tokens": 262809924.0, + "step": 8025 + }, + { + "epoch": 0.6658926942532548, + "grad_norm": 0.9019607305526733, + "learning_rate": 3.033391976978282e-05, + "loss": 1.1496, + "mean_token_accuracy": 0.705315251648426, + "num_tokens": 262973764.0, + "step": 8030 + }, + { + "epoch": 0.6663073223318683, + "grad_norm": 0.9323384165763855, + "learning_rate": 3.0267406177455758e-05, + "loss": 1.1324, + "mean_token_accuracy": 0.7034274145960808, + "num_tokens": 263137604.0, + "step": 8035 + }, + { + "epoch": 0.6667219504104818, + "grad_norm": 0.8958032131195068, + "learning_rate": 3.0200933920728935e-05, + "loss": 1.0873, + "mean_token_accuracy": 0.710343350470066, + "num_tokens": 263301444.0, + "step": 8040 + }, + { + "epoch": 0.6671365784890952, + "grad_norm": 0.9433259963989258, + "learning_rate": 3.013450313884766e-05, + "loss": 1.2628, + "mean_token_accuracy": 0.6852602422237396, + "num_tokens": 263465171.0, + "step": 8045 + }, + { + "epoch": 0.6675512065677087, + "grad_norm": 0.9961844682693481, + "learning_rate": 3.006811397097033e-05, + "loss": 1.169, + "mean_token_accuracy": 0.7004521027207374, + "num_tokens": 263629011.0, + "step": 8050 + }, + { + "epoch": 0.6679658346463222, + "grad_norm": 0.9597964882850647, + "learning_rate": 3.0001766556168188e-05, + "loss": 1.2416, + "mean_token_accuracy": 0.6857465758919716, + "num_tokens": 263792851.0, + "step": 8055 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.8648970723152161, + "learning_rate": 2.9935461033424972e-05, + "loss": 1.1102, + "mean_token_accuracy": 0.713789102435112, + "num_tokens": 263956691.0, + "step": 8060 + }, + { + "epoch": 0.6687950908035493, + "grad_norm": 0.9382487535476685, + "learning_rate": 2.9869197541636713e-05, + "loss": 1.1416, + "mean_token_accuracy": 0.7014784932136535, + "num_tokens": 264120531.0, + "step": 8065 + }, + { + "epoch": 0.6692097188821627, + "grad_norm": 0.9381263852119446, + "learning_rate": 2.9802976219611388e-05, + "loss": 1.2104, + "mean_token_accuracy": 0.6897899955511093, + "num_tokens": 264284175.0, + "step": 8070 + }, + { + "epoch": 0.6696243469607762, + "grad_norm": 0.9110860824584961, + "learning_rate": 2.97367972060686e-05, + "loss": 1.2149, + "mean_token_accuracy": 0.6906097263097764, + "num_tokens": 264448015.0, + "step": 8075 + }, + { + "epoch": 0.6700389750393897, + "grad_norm": 0.9007151126861572, + "learning_rate": 2.9670660639639354e-05, + "loss": 1.1291, + "mean_token_accuracy": 0.7080767348408699, + "num_tokens": 264611855.0, + "step": 8080 + }, + { + "epoch": 0.6704536031180032, + "grad_norm": 0.9092766046524048, + "learning_rate": 2.9604566658865762e-05, + "loss": 1.2117, + "mean_token_accuracy": 0.6890212625265122, + "num_tokens": 264775695.0, + "step": 8085 + }, + { + "epoch": 0.6708682311966166, + "grad_norm": 0.9114570617675781, + "learning_rate": 2.9538515402200672e-05, + "loss": 1.114, + "mean_token_accuracy": 0.707172529399395, + "num_tokens": 264939535.0, + "step": 8090 + }, + { + "epoch": 0.6712828592752301, + "grad_norm": 0.9584678411483765, + "learning_rate": 2.9472507008007462e-05, + "loss": 1.1718, + "mean_token_accuracy": 0.6979105576872826, + "num_tokens": 265103375.0, + "step": 8095 + }, + { + "epoch": 0.6716974873538436, + "grad_norm": 0.8790441155433655, + "learning_rate": 2.9406541614559757e-05, + "loss": 1.2092, + "mean_token_accuracy": 0.6879531264305114, + "num_tokens": 265266705.0, + "step": 8100 + }, + { + "epoch": 0.6721121154324571, + "grad_norm": 0.9176095724105835, + "learning_rate": 2.934061936004102e-05, + "loss": 1.1874, + "mean_token_accuracy": 0.6954189702868462, + "num_tokens": 265430067.0, + "step": 8105 + }, + { + "epoch": 0.6725267435110706, + "grad_norm": 0.9208274483680725, + "learning_rate": 2.927474038254443e-05, + "loss": 1.1584, + "mean_token_accuracy": 0.7010539382696152, + "num_tokens": 265593552.0, + "step": 8110 + }, + { + "epoch": 0.672941371589684, + "grad_norm": 0.9400938153266907, + "learning_rate": 2.920890482007248e-05, + "loss": 1.1014, + "mean_token_accuracy": 0.7097007974982261, + "num_tokens": 265756985.0, + "step": 8115 + }, + { + "epoch": 0.6733559996682975, + "grad_norm": 0.9142690300941467, + "learning_rate": 2.9143112810536688e-05, + "loss": 1.1734, + "mean_token_accuracy": 0.6966886594891548, + "num_tokens": 265920825.0, + "step": 8120 + }, + { + "epoch": 0.673770627746911, + "grad_norm": 0.9190114736557007, + "learning_rate": 2.9077364491757387e-05, + "loss": 1.2588, + "mean_token_accuracy": 0.6858443230390548, + "num_tokens": 266084665.0, + "step": 8125 + }, + { + "epoch": 0.6741852558255245, + "grad_norm": 0.9296127557754517, + "learning_rate": 2.9011660001463327e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7009439036250115, + "num_tokens": 266247905.0, + "step": 8130 + }, + { + "epoch": 0.6745998839041379, + "grad_norm": 1.4097745418548584, + "learning_rate": 2.8945999477291564e-05, + "loss": 1.1725, + "mean_token_accuracy": 0.693255127966404, + "num_tokens": 266411745.0, + "step": 8135 + }, + { + "epoch": 0.6750145119827514, + "grad_norm": 0.8844364881515503, + "learning_rate": 2.8880383056786897e-05, + "loss": 1.1612, + "mean_token_accuracy": 0.6995601192116737, + "num_tokens": 266575585.0, + "step": 8140 + }, + { + "epoch": 0.675429140061365, + "grad_norm": 0.8976459503173828, + "learning_rate": 2.8814810877401828e-05, + "loss": 1.2036, + "mean_token_accuracy": 0.6910068362951278, + "num_tokens": 266739425.0, + "step": 8145 + }, + { + "epoch": 0.6758437681399785, + "grad_norm": 0.9473207592964172, + "learning_rate": 2.8749283076496226e-05, + "loss": 1.209, + "mean_token_accuracy": 0.6893572807312012, + "num_tokens": 266903265.0, + "step": 8150 + }, + { + "epoch": 0.676258396218592, + "grad_norm": 0.8655648827552795, + "learning_rate": 2.8683799791336874e-05, + "loss": 1.0945, + "mean_token_accuracy": 0.7166544482111931, + "num_tokens": 267067105.0, + "step": 8155 + }, + { + "epoch": 0.6766730242972054, + "grad_norm": 0.9129595756530762, + "learning_rate": 2.8618361159097396e-05, + "loss": 1.1446, + "mean_token_accuracy": 0.7012891039252281, + "num_tokens": 267230945.0, + "step": 8160 + }, + { + "epoch": 0.6770876523758189, + "grad_norm": 0.8981407284736633, + "learning_rate": 2.8552967316857847e-05, + "loss": 1.2631, + "mean_token_accuracy": 0.6817509770393372, + "num_tokens": 267394785.0, + "step": 8165 + }, + { + "epoch": 0.6775022804544324, + "grad_norm": 0.9718103408813477, + "learning_rate": 2.848761840160447e-05, + "loss": 1.2535, + "mean_token_accuracy": 0.6781463786959648, + "num_tokens": 267558625.0, + "step": 8170 + }, + { + "epoch": 0.6779169085330459, + "grad_norm": 0.8931992053985596, + "learning_rate": 2.842231455022938e-05, + "loss": 1.1518, + "mean_token_accuracy": 0.7029997572302819, + "num_tokens": 267722465.0, + "step": 8175 + }, + { + "epoch": 0.6783315366116593, + "grad_norm": 0.8879521489143372, + "learning_rate": 2.8357055899530305e-05, + "loss": 1.1829, + "mean_token_accuracy": 0.6984604090452194, + "num_tokens": 267886305.0, + "step": 8180 + }, + { + "epoch": 0.6787461646902728, + "grad_norm": 0.9124054312705994, + "learning_rate": 2.8291842586210284e-05, + "loss": 1.2134, + "mean_token_accuracy": 0.6883736565709114, + "num_tokens": 268050145.0, + "step": 8185 + }, + { + "epoch": 0.6791607927688863, + "grad_norm": 0.9303012490272522, + "learning_rate": 2.8226674746877363e-05, + "loss": 1.2363, + "mean_token_accuracy": 0.6868585079908371, + "num_tokens": 268213985.0, + "step": 8190 + }, + { + "epoch": 0.6795754208474998, + "grad_norm": 0.9055581092834473, + "learning_rate": 2.8161552518044365e-05, + "loss": 1.114, + "mean_token_accuracy": 0.7071786433458328, + "num_tokens": 268377825.0, + "step": 8195 + }, + { + "epoch": 0.6799900489261133, + "grad_norm": 0.9333614706993103, + "learning_rate": 2.809647603612855e-05, + "loss": 1.1981, + "mean_token_accuracy": 0.6952223852276802, + "num_tokens": 268541665.0, + "step": 8200 + }, + { + "epoch": 0.6804046770047267, + "grad_norm": 0.8959496021270752, + "learning_rate": 2.8031445437451352e-05, + "loss": 1.1051, + "mean_token_accuracy": 0.711937926709652, + "num_tokens": 268705505.0, + "step": 8205 + }, + { + "epoch": 0.6808193050833402, + "grad_norm": 0.8913466930389404, + "learning_rate": 2.7966460858238076e-05, + "loss": 1.1174, + "mean_token_accuracy": 0.7062072291970253, + "num_tokens": 268869345.0, + "step": 8210 + }, + { + "epoch": 0.6812339331619537, + "grad_norm": 0.8874464631080627, + "learning_rate": 2.790152243461765e-05, + "loss": 1.1403, + "mean_token_accuracy": 0.7017900750041008, + "num_tokens": 269033185.0, + "step": 8215 + }, + { + "epoch": 0.6816485612405672, + "grad_norm": 0.9019583463668823, + "learning_rate": 2.783663030262229e-05, + "loss": 1.1616, + "mean_token_accuracy": 0.6987719938158989, + "num_tokens": 269197025.0, + "step": 8220 + }, + { + "epoch": 0.6820631893191806, + "grad_norm": 0.8874391913414001, + "learning_rate": 2.7771784598187268e-05, + "loss": 1.156, + "mean_token_accuracy": 0.6987349942326546, + "num_tokens": 269360192.0, + "step": 8225 + }, + { + "epoch": 0.6824778173977942, + "grad_norm": 0.8777316808700562, + "learning_rate": 2.7706985457150597e-05, + "loss": 1.1307, + "mean_token_accuracy": 0.7065371468663215, + "num_tokens": 269524032.0, + "step": 8230 + }, + { + "epoch": 0.6828924454764077, + "grad_norm": 0.9662749171257019, + "learning_rate": 2.7642233015252683e-05, + "loss": 1.2603, + "mean_token_accuracy": 0.6818487271666527, + "num_tokens": 269687872.0, + "step": 8235 + }, + { + "epoch": 0.6833070735550212, + "grad_norm": 0.9199495315551758, + "learning_rate": 2.7577527408136217e-05, + "loss": 1.18, + "mean_token_accuracy": 0.6951735138893127, + "num_tokens": 269851712.0, + "step": 8240 + }, + { + "epoch": 0.6837217016336347, + "grad_norm": 0.9042023420333862, + "learning_rate": 2.7512868771345723e-05, + "loss": 1.1748, + "mean_token_accuracy": 0.7006720416247845, + "num_tokens": 270015552.0, + "step": 8245 + }, + { + "epoch": 0.6841363297122481, + "grad_norm": 0.9222347140312195, + "learning_rate": 2.744825724032731e-05, + "loss": 1.146, + "mean_token_accuracy": 0.698130975663662, + "num_tokens": 270178894.0, + "step": 8250 + }, + { + "epoch": 0.6845509577908616, + "grad_norm": 0.9103004336357117, + "learning_rate": 2.738369295042843e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.6903470143675804, + "num_tokens": 270342734.0, + "step": 8255 + }, + { + "epoch": 0.6849655858694751, + "grad_norm": 0.9190084934234619, + "learning_rate": 2.731917603689763e-05, + "loss": 1.1617, + "mean_token_accuracy": 0.7091275677084923, + "num_tokens": 270506574.0, + "step": 8260 + }, + { + "epoch": 0.6853802139480886, + "grad_norm": 0.9093053936958313, + "learning_rate": 2.7254706634884125e-05, + "loss": 1.1341, + "mean_token_accuracy": 0.7053946748375892, + "num_tokens": 270670414.0, + "step": 8265 + }, + { + "epoch": 0.685794842026702, + "grad_norm": 0.8801981210708618, + "learning_rate": 2.719028487943763e-05, + "loss": 1.1643, + "mean_token_accuracy": 0.7008492186665535, + "num_tokens": 270834254.0, + "step": 8270 + }, + { + "epoch": 0.6862094701053155, + "grad_norm": 0.9740326404571533, + "learning_rate": 2.7125910905508102e-05, + "loss": 1.2164, + "mean_token_accuracy": 0.6922470659017563, + "num_tokens": 270998094.0, + "step": 8275 + }, + { + "epoch": 0.686624098183929, + "grad_norm": 0.8929706811904907, + "learning_rate": 2.7061584847945376e-05, + "loss": 1.0785, + "mean_token_accuracy": 0.7141678869724274, + "num_tokens": 271161934.0, + "step": 8280 + }, + { + "epoch": 0.6870387262625425, + "grad_norm": 0.8799037337303162, + "learning_rate": 2.699730684149886e-05, + "loss": 1.1977, + "mean_token_accuracy": 0.6915811315178871, + "num_tokens": 271325774.0, + "step": 8285 + }, + { + "epoch": 0.687453354341156, + "grad_norm": 0.9009878039360046, + "learning_rate": 2.6933077020817344e-05, + "loss": 1.2165, + "mean_token_accuracy": 0.6927935481071472, + "num_tokens": 271488598.0, + "step": 8290 + }, + { + "epoch": 0.6878679824197694, + "grad_norm": 0.9551199674606323, + "learning_rate": 2.686889552044875e-05, + "loss": 1.1944, + "mean_token_accuracy": 0.6921798631548881, + "num_tokens": 271652438.0, + "step": 8295 + }, + { + "epoch": 0.6882826104983829, + "grad_norm": 0.9601017236709595, + "learning_rate": 2.680476247483965e-05, + "loss": 1.2383, + "mean_token_accuracy": 0.6872434094548225, + "num_tokens": 271816278.0, + "step": 8300 + }, + { + "epoch": 0.6886972385769964, + "grad_norm": 0.9434542059898376, + "learning_rate": 2.6740678018335207e-05, + "loss": 1.1403, + "mean_token_accuracy": 0.7043584361672401, + "num_tokens": 271979753.0, + "step": 8305 + }, + { + "epoch": 0.68911186665561, + "grad_norm": 0.9041075706481934, + "learning_rate": 2.6676642285178754e-05, + "loss": 1.1983, + "mean_token_accuracy": 0.6933589935302734, + "num_tokens": 272143593.0, + "step": 8310 + }, + { + "epoch": 0.6895264947342234, + "grad_norm": 0.9166638255119324, + "learning_rate": 2.6612655409511584e-05, + "loss": 1.2031, + "mean_token_accuracy": 0.6886542037129402, + "num_tokens": 272307373.0, + "step": 8315 + }, + { + "epoch": 0.6899411228128369, + "grad_norm": 0.8909661769866943, + "learning_rate": 2.6548717525372635e-05, + "loss": 1.2413, + "mean_token_accuracy": 0.6819937512278557, + "num_tokens": 272470536.0, + "step": 8320 + }, + { + "epoch": 0.6903557508914504, + "grad_norm": 0.9195128679275513, + "learning_rate": 2.6484828766698212e-05, + "loss": 1.1438, + "mean_token_accuracy": 0.7059733361005783, + "num_tokens": 272633354.0, + "step": 8325 + }, + { + "epoch": 0.6907703789700639, + "grad_norm": 0.8586157560348511, + "learning_rate": 2.642098926732172e-05, + "loss": 1.0929, + "mean_token_accuracy": 0.7127057388424873, + "num_tokens": 272796585.0, + "step": 8330 + }, + { + "epoch": 0.6911850070486774, + "grad_norm": 0.930367648601532, + "learning_rate": 2.6357199160973377e-05, + "loss": 1.2428, + "mean_token_accuracy": 0.6862353429198265, + "num_tokens": 272960425.0, + "step": 8335 + }, + { + "epoch": 0.6915996351272908, + "grad_norm": 0.9563949704170227, + "learning_rate": 2.6293458581279938e-05, + "loss": 1.2148, + "mean_token_accuracy": 0.6948497071862221, + "num_tokens": 273124265.0, + "step": 8340 + }, + { + "epoch": 0.6920142632059043, + "grad_norm": 0.9224836230278015, + "learning_rate": 2.6229767661764392e-05, + "loss": 1.173, + "mean_token_accuracy": 0.6997882291674614, + "num_tokens": 273287618.0, + "step": 8345 + }, + { + "epoch": 0.6924288912845178, + "grad_norm": 0.945375919342041, + "learning_rate": 2.6166126535845715e-05, + "loss": 1.1715, + "mean_token_accuracy": 0.6987353324890136, + "num_tokens": 273451458.0, + "step": 8350 + }, + { + "epoch": 0.6928435193631313, + "grad_norm": 0.9309601187705994, + "learning_rate": 2.6102535336838564e-05, + "loss": 1.1151, + "mean_token_accuracy": 0.7070197939872742, + "num_tokens": 273615298.0, + "step": 8355 + }, + { + "epoch": 0.6932581474417447, + "grad_norm": 0.9397746324539185, + "learning_rate": 2.6038994197953036e-05, + "loss": 1.1602, + "mean_token_accuracy": 0.6988819643855095, + "num_tokens": 273779138.0, + "step": 8360 + }, + { + "epoch": 0.6936727755203582, + "grad_norm": 0.8752930760383606, + "learning_rate": 2.597550325229433e-05, + "loss": 1.1543, + "mean_token_accuracy": 0.6997800603508949, + "num_tokens": 273942978.0, + "step": 8365 + }, + { + "epoch": 0.6940874035989717, + "grad_norm": 0.9306272864341736, + "learning_rate": 2.591206263286252e-05, + "loss": 1.1689, + "mean_token_accuracy": 0.697134654223919, + "num_tokens": 274106818.0, + "step": 8370 + }, + { + "epoch": 0.6945020316775852, + "grad_norm": 0.9521862864494324, + "learning_rate": 2.5848672472552253e-05, + "loss": 1.2119, + "mean_token_accuracy": 0.692894670367241, + "num_tokens": 274270658.0, + "step": 8375 + }, + { + "epoch": 0.6949166597561987, + "grad_norm": 0.8978354930877686, + "learning_rate": 2.5785332904152475e-05, + "loss": 1.1133, + "mean_token_accuracy": 0.7113086462020874, + "num_tokens": 274434498.0, + "step": 8380 + }, + { + "epoch": 0.6953312878348121, + "grad_norm": 0.8853723406791687, + "learning_rate": 2.572204406034615e-05, + "loss": 1.1119, + "mean_token_accuracy": 0.707936218380928, + "num_tokens": 274598338.0, + "step": 8385 + }, + { + "epoch": 0.6957459159134256, + "grad_norm": 0.9029449224472046, + "learning_rate": 2.565880607371002e-05, + "loss": 1.1074, + "mean_token_accuracy": 0.7122495099902153, + "num_tokens": 274762178.0, + "step": 8390 + }, + { + "epoch": 0.6961605439920392, + "grad_norm": 0.9149707555770874, + "learning_rate": 2.5595619076714173e-05, + "loss": 1.1772, + "mean_token_accuracy": 0.6978861212730407, + "num_tokens": 274926018.0, + "step": 8395 + }, + { + "epoch": 0.6965751720706527, + "grad_norm": 0.966590166091919, + "learning_rate": 2.5532483201722052e-05, + "loss": 1.1965, + "mean_token_accuracy": 0.6957539081573486, + "num_tokens": 275089858.0, + "step": 8400 + }, + { + "epoch": 0.6969898001492661, + "grad_norm": 0.9344898462295532, + "learning_rate": 2.5469398580989902e-05, + "loss": 1.1724, + "mean_token_accuracy": 0.6987475574016571, + "num_tokens": 275253698.0, + "step": 8405 + }, + { + "epoch": 0.6974044282278796, + "grad_norm": 0.9215954542160034, + "learning_rate": 2.540636534666664e-05, + "loss": 1.1521, + "mean_token_accuracy": 0.6996761962771416, + "num_tokens": 275417538.0, + "step": 8410 + }, + { + "epoch": 0.6978190563064931, + "grad_norm": 0.8877071142196655, + "learning_rate": 2.534338363079348e-05, + "loss": 1.0889, + "mean_token_accuracy": 0.7142167612910271, + "num_tokens": 275581378.0, + "step": 8415 + }, + { + "epoch": 0.6982336843851066, + "grad_norm": 0.9456600546836853, + "learning_rate": 2.528045356530382e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.69794110506773, + "num_tokens": 275745218.0, + "step": 8420 + }, + { + "epoch": 0.6986483124637201, + "grad_norm": 0.9057779908180237, + "learning_rate": 2.5217575282022803e-05, + "loss": 1.1854, + "mean_token_accuracy": 0.6958027854561806, + "num_tokens": 275909058.0, + "step": 8425 + }, + { + "epoch": 0.6990629405423335, + "grad_norm": 0.8589238524436951, + "learning_rate": 2.5154748912667036e-05, + "loss": 1.111, + "mean_token_accuracy": 0.7077101662755012, + "num_tokens": 276072898.0, + "step": 8430 + }, + { + "epoch": 0.699477568620947, + "grad_norm": 0.9326671957969666, + "learning_rate": 2.5091974588844513e-05, + "loss": 1.2035, + "mean_token_accuracy": 0.6897669479250907, + "num_tokens": 276236612.0, + "step": 8435 + }, + { + "epoch": 0.6998921966995605, + "grad_norm": 0.927275538444519, + "learning_rate": 2.5029252442054118e-05, + "loss": 1.1811, + "mean_token_accuracy": 0.6972201868891716, + "num_tokens": 276400452.0, + "step": 8440 + }, + { + "epoch": 0.700306824778174, + "grad_norm": 0.8658373951911926, + "learning_rate": 2.4966582603685423e-05, + "loss": 1.2059, + "mean_token_accuracy": 0.6914638042449951, + "num_tokens": 276563347.0, + "step": 8445 + }, + { + "epoch": 0.7007214528567874, + "grad_norm": 0.9162196516990662, + "learning_rate": 2.4903965205018448e-05, + "loss": 1.151, + "mean_token_accuracy": 0.6998167142271996, + "num_tokens": 276727187.0, + "step": 8450 + }, + { + "epoch": 0.7011360809354009, + "grad_norm": 0.9280889630317688, + "learning_rate": 2.4841400377223422e-05, + "loss": 1.2022, + "mean_token_accuracy": 0.6918499559164047, + "num_tokens": 276891027.0, + "step": 8455 + }, + { + "epoch": 0.7015507090140144, + "grad_norm": 0.937462329864502, + "learning_rate": 2.477888825136034e-05, + "loss": 1.1621, + "mean_token_accuracy": 0.6965909153223038, + "num_tokens": 277054867.0, + "step": 8460 + }, + { + "epoch": 0.7019653370926279, + "grad_norm": 0.9051955342292786, + "learning_rate": 2.4716428958378866e-05, + "loss": 1.1774, + "mean_token_accuracy": 0.6993951588869095, + "num_tokens": 277218707.0, + "step": 8465 + }, + { + "epoch": 0.7023799651712413, + "grad_norm": 0.9229522347450256, + "learning_rate": 2.4654022629117985e-05, + "loss": 1.1614, + "mean_token_accuracy": 0.6999572351574898, + "num_tokens": 277382547.0, + "step": 8470 + }, + { + "epoch": 0.7027945932498548, + "grad_norm": 0.9330226182937622, + "learning_rate": 2.459166939430571e-05, + "loss": 1.2024, + "mean_token_accuracy": 0.6869904205203057, + "num_tokens": 277546102.0, + "step": 8475 + }, + { + "epoch": 0.7032092213284684, + "grad_norm": 0.8841614127159119, + "learning_rate": 2.4529369384558865e-05, + "loss": 1.165, + "mean_token_accuracy": 0.701994250714779, + "num_tokens": 277709789.0, + "step": 8480 + }, + { + "epoch": 0.7036238494070819, + "grad_norm": 0.8814997673034668, + "learning_rate": 2.4467122730382746e-05, + "loss": 1.1611, + "mean_token_accuracy": 0.7013685226440429, + "num_tokens": 277873629.0, + "step": 8485 + }, + { + "epoch": 0.7040384774856954, + "grad_norm": 0.8852235674858093, + "learning_rate": 2.4404929562170902e-05, + "loss": 1.2209, + "mean_token_accuracy": 0.6892045468091965, + "num_tokens": 278037469.0, + "step": 8490 + }, + { + "epoch": 0.7044531055643088, + "grad_norm": 0.9323071241378784, + "learning_rate": 2.4342790010204842e-05, + "loss": 1.215, + "mean_token_accuracy": 0.687761814892292, + "num_tokens": 278200563.0, + "step": 8495 + }, + { + "epoch": 0.7048677336429223, + "grad_norm": 0.9252358675003052, + "learning_rate": 2.4280704204653738e-05, + "loss": 1.1064, + "mean_token_accuracy": 0.7062988758087159, + "num_tokens": 278364403.0, + "step": 8500 + }, + { + "epoch": 0.7052823617215358, + "grad_norm": 0.8709287643432617, + "learning_rate": 2.4218672275574196e-05, + "loss": 1.1719, + "mean_token_accuracy": 0.6959982931613922, + "num_tokens": 278528243.0, + "step": 8505 + }, + { + "epoch": 0.7056969898001493, + "grad_norm": 0.8557941913604736, + "learning_rate": 2.4156694352909957e-05, + "loss": 1.148, + "mean_token_accuracy": 0.7060178413987159, + "num_tokens": 278692083.0, + "step": 8510 + }, + { + "epoch": 0.7061116178787628, + "grad_norm": 0.9327511787414551, + "learning_rate": 2.4094770566491627e-05, + "loss": 1.1671, + "mean_token_accuracy": 0.7007617250084877, + "num_tokens": 278855386.0, + "step": 8515 + }, + { + "epoch": 0.7065262459573762, + "grad_norm": 0.9525324702262878, + "learning_rate": 2.4032901046036404e-05, + "loss": 1.1744, + "mean_token_accuracy": 0.6984604060649872, + "num_tokens": 279019226.0, + "step": 8520 + }, + { + "epoch": 0.7069408740359897, + "grad_norm": 0.9035742282867432, + "learning_rate": 2.397108592114782e-05, + "loss": 1.1417, + "mean_token_accuracy": 0.7000122129917145, + "num_tokens": 279183066.0, + "step": 8525 + }, + { + "epoch": 0.7073555021146032, + "grad_norm": 0.9326964616775513, + "learning_rate": 2.390932532131545e-05, + "loss": 1.1748, + "mean_token_accuracy": 0.6967069894075394, + "num_tokens": 279346906.0, + "step": 8530 + }, + { + "epoch": 0.7077701301932167, + "grad_norm": 0.9051055312156677, + "learning_rate": 2.3847619375914686e-05, + "loss": 1.1475, + "mean_token_accuracy": 0.7067754149436951, + "num_tokens": 279510746.0, + "step": 8535 + }, + { + "epoch": 0.7081847582718301, + "grad_norm": 0.8472020626068115, + "learning_rate": 2.378596821420634e-05, + "loss": 1.2593, + "mean_token_accuracy": 0.6837243407964706, + "num_tokens": 279674586.0, + "step": 8540 + }, + { + "epoch": 0.7085993863504436, + "grad_norm": 0.9158118367195129, + "learning_rate": 2.3724371965336572e-05, + "loss": 1.2266, + "mean_token_accuracy": 0.6914833828806877, + "num_tokens": 279838426.0, + "step": 8545 + }, + { + "epoch": 0.7090140144290571, + "grad_norm": 0.9467946290969849, + "learning_rate": 2.3662830758336453e-05, + "loss": 1.1529, + "mean_token_accuracy": 0.7002810403704643, + "num_tokens": 280002266.0, + "step": 8550 + }, + { + "epoch": 0.7094286425076706, + "grad_norm": 1.0291293859481812, + "learning_rate": 2.360134472212176e-05, + "loss": 1.2379, + "mean_token_accuracy": 0.6841214552521706, + "num_tokens": 280166106.0, + "step": 8555 + }, + { + "epoch": 0.7098432705862842, + "grad_norm": 0.9345492124557495, + "learning_rate": 2.35399139854927e-05, + "loss": 1.2474, + "mean_token_accuracy": 0.6842130973935128, + "num_tokens": 280329946.0, + "step": 8560 + }, + { + "epoch": 0.7102578986648976, + "grad_norm": 0.9244153499603271, + "learning_rate": 2.347853867713365e-05, + "loss": 1.1364, + "mean_token_accuracy": 0.7075217753648758, + "num_tokens": 280493579.0, + "step": 8565 + }, + { + "epoch": 0.7106725267435111, + "grad_norm": 0.897363543510437, + "learning_rate": 2.3417218925612877e-05, + "loss": 1.1711, + "mean_token_accuracy": 0.7002993687987328, + "num_tokens": 280657419.0, + "step": 8570 + }, + { + "epoch": 0.7110871548221246, + "grad_norm": 0.9287620186805725, + "learning_rate": 2.3355954859382212e-05, + "loss": 1.202, + "mean_token_accuracy": 0.6938294187188149, + "num_tokens": 280821259.0, + "step": 8575 + }, + { + "epoch": 0.7115017829007381, + "grad_norm": 0.9229029417037964, + "learning_rate": 2.329474660677693e-05, + "loss": 1.2341, + "mean_token_accuracy": 0.6841947734355927, + "num_tokens": 280985099.0, + "step": 8580 + }, + { + "epoch": 0.7119164109793515, + "grad_norm": 0.9154161810874939, + "learning_rate": 2.3233594296015353e-05, + "loss": 1.1241, + "mean_token_accuracy": 0.7083195969462395, + "num_tokens": 281148608.0, + "step": 8585 + }, + { + "epoch": 0.712331039057965, + "grad_norm": 0.8730635643005371, + "learning_rate": 2.317249805519856e-05, + "loss": 1.1923, + "mean_token_accuracy": 0.697110216319561, + "num_tokens": 281312448.0, + "step": 8590 + }, + { + "epoch": 0.7127456671365785, + "grad_norm": 0.9459541440010071, + "learning_rate": 2.3111458012310227e-05, + "loss": 1.1998, + "mean_token_accuracy": 0.6961143687367439, + "num_tokens": 281476288.0, + "step": 8595 + }, + { + "epoch": 0.713160295215192, + "grad_norm": 0.9326469302177429, + "learning_rate": 2.3050474295216364e-05, + "loss": 1.1534, + "mean_token_accuracy": 0.700861431658268, + "num_tokens": 281640128.0, + "step": 8600 + }, + { + "epoch": 0.7135749232938055, + "grad_norm": 0.8979618549346924, + "learning_rate": 2.2989547031664856e-05, + "loss": 1.108, + "mean_token_accuracy": 0.7087976559996605, + "num_tokens": 281803968.0, + "step": 8605 + }, + { + "epoch": 0.7139895513724189, + "grad_norm": 0.9168345928192139, + "learning_rate": 2.292867634928541e-05, + "loss": 1.1346, + "mean_token_accuracy": 0.7058467760682106, + "num_tokens": 281967808.0, + "step": 8610 + }, + { + "epoch": 0.7144041794510324, + "grad_norm": 0.9103457927703857, + "learning_rate": 2.286786237558926e-05, + "loss": 1.1073, + "mean_token_accuracy": 0.7085715994238854, + "num_tokens": 282131648.0, + "step": 8615 + }, + { + "epoch": 0.7148188075296459, + "grad_norm": 0.967501699924469, + "learning_rate": 2.2807105237968724e-05, + "loss": 1.094, + "mean_token_accuracy": 0.7083488360047341, + "num_tokens": 282294691.0, + "step": 8620 + }, + { + "epoch": 0.7152334356082594, + "grad_norm": 0.9350996017456055, + "learning_rate": 2.2746405063697145e-05, + "loss": 1.1911, + "mean_token_accuracy": 0.6956928178668023, + "num_tokens": 282458531.0, + "step": 8625 + }, + { + "epoch": 0.7156480636868728, + "grad_norm": 0.9099265336990356, + "learning_rate": 2.2685761979928506e-05, + "loss": 1.2773, + "mean_token_accuracy": 0.6796940915286541, + "num_tokens": 282621927.0, + "step": 8630 + }, + { + "epoch": 0.7160626917654863, + "grad_norm": 0.8858419060707092, + "learning_rate": 2.2625176113697255e-05, + "loss": 1.1896, + "mean_token_accuracy": 0.6928777754306793, + "num_tokens": 282785739.0, + "step": 8635 + }, + { + "epoch": 0.7164773198440998, + "grad_norm": 0.8882941603660583, + "learning_rate": 2.256464759191788e-05, + "loss": 1.1034, + "mean_token_accuracy": 0.7116385638713837, + "num_tokens": 282949579.0, + "step": 8640 + }, + { + "epoch": 0.7168919479227134, + "grad_norm": 0.9124554395675659, + "learning_rate": 2.250417654138483e-05, + "loss": 1.2252, + "mean_token_accuracy": 0.6885630503296852, + "num_tokens": 283113419.0, + "step": 8645 + }, + { + "epoch": 0.7173065760013269, + "grad_norm": 0.885246217250824, + "learning_rate": 2.2443763088772125e-05, + "loss": 1.1348, + "mean_token_accuracy": 0.7050769805908204, + "num_tokens": 283277259.0, + "step": 8650 + }, + { + "epoch": 0.7177212040799403, + "grad_norm": 0.9887017607688904, + "learning_rate": 2.238340736063314e-05, + "loss": 1.176, + "mean_token_accuracy": 0.6959293812513352, + "num_tokens": 283440874.0, + "step": 8655 + }, + { + "epoch": 0.7181358321585538, + "grad_norm": 0.9638161659240723, + "learning_rate": 2.2323109483400335e-05, + "loss": 1.108, + "mean_token_accuracy": 0.7070320084691047, + "num_tokens": 283604714.0, + "step": 8660 + }, + { + "epoch": 0.7185504602371673, + "grad_norm": 0.8681931495666504, + "learning_rate": 2.2262869583384972e-05, + "loss": 1.0963, + "mean_token_accuracy": 0.7093477725982666, + "num_tokens": 283767659.0, + "step": 8665 + }, + { + "epoch": 0.7189650883157808, + "grad_norm": 0.8647438883781433, + "learning_rate": 2.220268778677687e-05, + "loss": 1.0793, + "mean_token_accuracy": 0.7119557321071625, + "num_tokens": 283931453.0, + "step": 8670 + }, + { + "epoch": 0.7193797163943942, + "grad_norm": 0.9072765111923218, + "learning_rate": 2.2142564219644136e-05, + "loss": 1.2217, + "mean_token_accuracy": 0.6886974617838859, + "num_tokens": 284095293.0, + "step": 8675 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.8717702627182007, + "learning_rate": 2.208249900793289e-05, + "loss": 1.176, + "mean_token_accuracy": 0.7004154488444329, + "num_tokens": 284259133.0, + "step": 8680 + }, + { + "epoch": 0.7202089725516212, + "grad_norm": 0.9505860209465027, + "learning_rate": 2.202249227746702e-05, + "loss": 1.1514, + "mean_token_accuracy": 0.7013074263930321, + "num_tokens": 284422973.0, + "step": 8685 + }, + { + "epoch": 0.7206236006302347, + "grad_norm": 0.9417359828948975, + "learning_rate": 2.19625441539479e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.7046844303607941, + "num_tokens": 284586516.0, + "step": 8690 + }, + { + "epoch": 0.7210382287088482, + "grad_norm": 0.8849696516990662, + "learning_rate": 2.1902654762954143e-05, + "loss": 1.1799, + "mean_token_accuracy": 0.6978750191628933, + "num_tokens": 284749368.0, + "step": 8695 + }, + { + "epoch": 0.7214528567874616, + "grad_norm": 0.9590681791305542, + "learning_rate": 2.1842824229941323e-05, + "loss": 1.2275, + "mean_token_accuracy": 0.6849340155720711, + "num_tokens": 284913208.0, + "step": 8700 + }, + { + "epoch": 0.7218674848660751, + "grad_norm": 0.9124518632888794, + "learning_rate": 2.1783052680241718e-05, + "loss": 1.1227, + "mean_token_accuracy": 0.7079606577754021, + "num_tokens": 285077048.0, + "step": 8705 + }, + { + "epoch": 0.7222821129446886, + "grad_norm": 0.9214940667152405, + "learning_rate": 2.1723340239064053e-05, + "loss": 1.1775, + "mean_token_accuracy": 0.695729473233223, + "num_tokens": 285240888.0, + "step": 8710 + }, + { + "epoch": 0.7226967410233021, + "grad_norm": 0.8921946287155151, + "learning_rate": 2.1663687031493253e-05, + "loss": 1.1235, + "mean_token_accuracy": 0.7074291318655014, + "num_tokens": 285404728.0, + "step": 8715 + }, + { + "epoch": 0.7231113691019155, + "grad_norm": 0.8673714995384216, + "learning_rate": 2.160409318249008e-05, + "loss": 1.065, + "mean_token_accuracy": 0.7187194541096688, + "num_tokens": 285568568.0, + "step": 8720 + }, + { + "epoch": 0.723525997180529, + "grad_norm": 0.9239243268966675, + "learning_rate": 2.1544558816891075e-05, + "loss": 1.1746, + "mean_token_accuracy": 0.6981548488140106, + "num_tokens": 285731942.0, + "step": 8725 + }, + { + "epoch": 0.7239406252591426, + "grad_norm": 0.9728556871414185, + "learning_rate": 2.14850840594081e-05, + "loss": 1.127, + "mean_token_accuracy": 0.7067754194140434, + "num_tokens": 285895782.0, + "step": 8730 + }, + { + "epoch": 0.7243552533377561, + "grad_norm": 0.9516790509223938, + "learning_rate": 2.1425669034628122e-05, + "loss": 1.0735, + "mean_token_accuracy": 0.7161901235580445, + "num_tokens": 286059622.0, + "step": 8735 + }, + { + "epoch": 0.7247698814163696, + "grad_norm": 0.9084863662719727, + "learning_rate": 2.136631386701306e-05, + "loss": 1.0954, + "mean_token_accuracy": 0.7144672483205795, + "num_tokens": 286223462.0, + "step": 8740 + }, + { + "epoch": 0.725184509494983, + "grad_norm": 0.9737844467163086, + "learning_rate": 2.130701868089941e-05, + "loss": 1.1406, + "mean_token_accuracy": 0.7053213596343995, + "num_tokens": 286387302.0, + "step": 8745 + }, + { + "epoch": 0.7255991375735965, + "grad_norm": 0.9189149141311646, + "learning_rate": 2.1247783600497984e-05, + "loss": 1.181, + "mean_token_accuracy": 0.6972201809287071, + "num_tokens": 286551142.0, + "step": 8750 + }, + { + "epoch": 0.72601376565221, + "grad_norm": 0.9415842890739441, + "learning_rate": 2.1188608749893712e-05, + "loss": 1.2636, + "mean_token_accuracy": 0.6842436477541923, + "num_tokens": 286714982.0, + "step": 8755 + }, + { + "epoch": 0.7264283937308235, + "grad_norm": 0.9185128808021545, + "learning_rate": 2.1129494253045396e-05, + "loss": 1.1508, + "mean_token_accuracy": 0.7027309387922287, + "num_tokens": 286878822.0, + "step": 8760 + }, + { + "epoch": 0.7268430218094369, + "grad_norm": 0.8949790000915527, + "learning_rate": 2.1070440233785373e-05, + "loss": 1.2114, + "mean_token_accuracy": 0.6933162316679955, + "num_tokens": 287042662.0, + "step": 8765 + }, + { + "epoch": 0.7272576498880504, + "grad_norm": 0.9467188715934753, + "learning_rate": 2.1011446815819257e-05, + "loss": 1.1337, + "mean_token_accuracy": 0.7046248733997345, + "num_tokens": 287206502.0, + "step": 8770 + }, + { + "epoch": 0.7276722779666639, + "grad_norm": 0.9147824048995972, + "learning_rate": 2.0952514122725748e-05, + "loss": 1.1576, + "mean_token_accuracy": 0.7011363670229912, + "num_tokens": 287370342.0, + "step": 8775 + }, + { + "epoch": 0.7280869060452774, + "grad_norm": 0.8961211442947388, + "learning_rate": 2.0893642277956404e-05, + "loss": 1.2025, + "mean_token_accuracy": 0.6929313272237778, + "num_tokens": 287534182.0, + "step": 8780 + }, + { + "epoch": 0.7285015341238908, + "grad_norm": 0.9158641695976257, + "learning_rate": 2.0834831404835193e-05, + "loss": 1.2276, + "mean_token_accuracy": 0.6882025882601738, + "num_tokens": 287698022.0, + "step": 8785 + }, + { + "epoch": 0.7289161622025043, + "grad_norm": 0.9232227206230164, + "learning_rate": 2.0776081626558437e-05, + "loss": 1.2649, + "mean_token_accuracy": 0.6843108534812927, + "num_tokens": 287861862.0, + "step": 8790 + }, + { + "epoch": 0.7293307902811178, + "grad_norm": 0.8580409288406372, + "learning_rate": 2.0717393066194507e-05, + "loss": 1.1158, + "mean_token_accuracy": 0.7081439360976219, + "num_tokens": 288025702.0, + "step": 8795 + }, + { + "epoch": 0.7297454183597313, + "grad_norm": 0.9285319447517395, + "learning_rate": 2.065876584668344e-05, + "loss": 1.1391, + "mean_token_accuracy": 0.7008980959653854, + "num_tokens": 288189542.0, + "step": 8800 + }, + { + "epoch": 0.7301600464383448, + "grad_norm": 0.962729275226593, + "learning_rate": 2.0600200090836863e-05, + "loss": 1.1618, + "mean_token_accuracy": 0.6994929105043411, + "num_tokens": 288353382.0, + "step": 8805 + }, + { + "epoch": 0.7305746745169582, + "grad_norm": 0.9175639748573303, + "learning_rate": 2.0541695921337605e-05, + "loss": 1.1075, + "mean_token_accuracy": 0.7102028340101242, + "num_tokens": 288517222.0, + "step": 8810 + }, + { + "epoch": 0.7309893025955718, + "grad_norm": 0.8901187181472778, + "learning_rate": 2.0483253460739498e-05, + "loss": 1.104, + "mean_token_accuracy": 0.710874879360199, + "num_tokens": 288681062.0, + "step": 8815 + }, + { + "epoch": 0.7314039306741853, + "grad_norm": 0.9340787529945374, + "learning_rate": 2.0424872831467106e-05, + "loss": 1.1506, + "mean_token_accuracy": 0.7019116520881653, + "num_tokens": 288844722.0, + "step": 8820 + }, + { + "epoch": 0.7318185587527988, + "grad_norm": 0.9090827703475952, + "learning_rate": 2.0366554155815475e-05, + "loss": 1.1108, + "mean_token_accuracy": 0.7115530282258987, + "num_tokens": 289008562.0, + "step": 8825 + }, + { + "epoch": 0.7322331868314123, + "grad_norm": 0.8902605175971985, + "learning_rate": 2.0308297555949857e-05, + "loss": 1.1804, + "mean_token_accuracy": 0.6916605569422245, + "num_tokens": 289172402.0, + "step": 8830 + }, + { + "epoch": 0.7326478149100257, + "grad_norm": 0.9276025891304016, + "learning_rate": 2.025010315390548e-05, + "loss": 1.1186, + "mean_token_accuracy": 0.7068426206707954, + "num_tokens": 289336242.0, + "step": 8835 + }, + { + "epoch": 0.7330624429886392, + "grad_norm": 0.9215508103370667, + "learning_rate": 2.0191971071587277e-05, + "loss": 1.1487, + "mean_token_accuracy": 0.7047592863440514, + "num_tokens": 289500082.0, + "step": 8840 + }, + { + "epoch": 0.7334770710672527, + "grad_norm": 0.9058172106742859, + "learning_rate": 2.013390143076964e-05, + "loss": 1.1378, + "mean_token_accuracy": 0.6999755576252937, + "num_tokens": 289663922.0, + "step": 8845 + }, + { + "epoch": 0.7338916991458662, + "grad_norm": 0.9303228259086609, + "learning_rate": 2.007589435309615e-05, + "loss": 1.1803, + "mean_token_accuracy": 0.6976417407393456, + "num_tokens": 289827762.0, + "step": 8850 + }, + { + "epoch": 0.7343063272244796, + "grad_norm": 0.9356986284255981, + "learning_rate": 2.0017949960079334e-05, + "loss": 1.1782, + "mean_token_accuracy": 0.6956195041537285, + "num_tokens": 289991602.0, + "step": 8855 + }, + { + "epoch": 0.7347209553030931, + "grad_norm": 0.9283430576324463, + "learning_rate": 1.9960068373100417e-05, + "loss": 1.2187, + "mean_token_accuracy": 0.6888685241341591, + "num_tokens": 290155442.0, + "step": 8860 + }, + { + "epoch": 0.7351355833817066, + "grad_norm": 0.9447711110115051, + "learning_rate": 1.990224971340904e-05, + "loss": 1.1276, + "mean_token_accuracy": 0.7045149073004723, + "num_tokens": 290319282.0, + "step": 8865 + }, + { + "epoch": 0.7355502114603201, + "grad_norm": 0.9082170128822327, + "learning_rate": 1.9844494102123045e-05, + "loss": 1.0518, + "mean_token_accuracy": 0.7191776633262634, + "num_tokens": 290483122.0, + "step": 8870 + }, + { + "epoch": 0.7359648395389335, + "grad_norm": 0.9077820777893066, + "learning_rate": 1.9786801660228204e-05, + "loss": 1.1465, + "mean_token_accuracy": 0.7008919849991798, + "num_tokens": 290646962.0, + "step": 8875 + }, + { + "epoch": 0.736379467617547, + "grad_norm": 0.9205976128578186, + "learning_rate": 1.9729172508577905e-05, + "loss": 1.2184, + "mean_token_accuracy": 0.6890579193830491, + "num_tokens": 290810802.0, + "step": 8880 + }, + { + "epoch": 0.7367940956961605, + "grad_norm": 0.9017412066459656, + "learning_rate": 1.9671606767893046e-05, + "loss": 1.1275, + "mean_token_accuracy": 0.7070197999477387, + "num_tokens": 290974642.0, + "step": 8885 + }, + { + "epoch": 0.737208723774774, + "grad_norm": 0.8834683895111084, + "learning_rate": 1.961410455876166e-05, + "loss": 1.1985, + "mean_token_accuracy": 0.6970735609531402, + "num_tokens": 291138482.0, + "step": 8890 + }, + { + "epoch": 0.7376233518533876, + "grad_norm": 0.8970118165016174, + "learning_rate": 1.9556666001638635e-05, + "loss": 1.0953, + "mean_token_accuracy": 0.7111803531646729, + "num_tokens": 291302322.0, + "step": 8895 + }, + { + "epoch": 0.738037979932001, + "grad_norm": 0.8732209205627441, + "learning_rate": 1.9499291216845578e-05, + "loss": 1.1188, + "mean_token_accuracy": 0.7073191553354263, + "num_tokens": 291466162.0, + "step": 8900 + }, + { + "epoch": 0.7384526080106145, + "grad_norm": 0.9126554131507874, + "learning_rate": 1.944198032457053e-05, + "loss": 1.1583, + "mean_token_accuracy": 0.6986304759979248, + "num_tokens": 291629209.0, + "step": 8905 + }, + { + "epoch": 0.738867236089228, + "grad_norm": 0.933233380317688, + "learning_rate": 1.9384733444867665e-05, + "loss": 1.1907, + "mean_token_accuracy": 0.696230448782444, + "num_tokens": 291793049.0, + "step": 8910 + }, + { + "epoch": 0.7392818641678415, + "grad_norm": 0.8659321665763855, + "learning_rate": 1.9327550697656994e-05, + "loss": 1.1368, + "mean_token_accuracy": 0.7059353232383728, + "num_tokens": 291956639.0, + "step": 8915 + }, + { + "epoch": 0.739696492246455, + "grad_norm": 0.9404541254043579, + "learning_rate": 1.927043220272431e-05, + "loss": 1.2015, + "mean_token_accuracy": 0.6902857288718224, + "num_tokens": 292120389.0, + "step": 8920 + }, + { + "epoch": 0.7401111203250684, + "grad_norm": 0.9204649925231934, + "learning_rate": 1.9213378079720747e-05, + "loss": 1.1809, + "mean_token_accuracy": 0.6947702825069427, + "num_tokens": 292284229.0, + "step": 8925 + }, + { + "epoch": 0.7405257484036819, + "grad_norm": 0.9498615264892578, + "learning_rate": 1.915638844816256e-05, + "loss": 1.1896, + "mean_token_accuracy": 0.6952529326081276, + "num_tokens": 292448069.0, + "step": 8930 + }, + { + "epoch": 0.7409403764822954, + "grad_norm": 0.835673987865448, + "learning_rate": 1.9099463427430943e-05, + "loss": 1.0706, + "mean_token_accuracy": 0.714534455537796, + "num_tokens": 292611909.0, + "step": 8935 + }, + { + "epoch": 0.7413550045609089, + "grad_norm": 0.9525404572486877, + "learning_rate": 1.9042603136771797e-05, + "loss": 1.1649, + "mean_token_accuracy": 0.6959555223584175, + "num_tokens": 292775749.0, + "step": 8940 + }, + { + "epoch": 0.7417696326395223, + "grad_norm": 0.98179692029953, + "learning_rate": 1.8985807695295332e-05, + "loss": 1.1578, + "mean_token_accuracy": 0.7004763603210449, + "num_tokens": 292939308.0, + "step": 8945 + }, + { + "epoch": 0.7421842607181358, + "grad_norm": 0.9175640940666199, + "learning_rate": 1.892907722197596e-05, + "loss": 1.1706, + "mean_token_accuracy": 0.7012402236461639, + "num_tokens": 293103148.0, + "step": 8950 + }, + { + "epoch": 0.7425988887967493, + "grad_norm": 0.8898962736129761, + "learning_rate": 1.8872411835652005e-05, + "loss": 1.1008, + "mean_token_accuracy": 0.7081744864583015, + "num_tokens": 293266988.0, + "step": 8955 + }, + { + "epoch": 0.7430135168753628, + "grad_norm": 0.9193590879440308, + "learning_rate": 1.881581165502543e-05, + "loss": 1.1883, + "mean_token_accuracy": 0.7016434505581856, + "num_tokens": 293430828.0, + "step": 8960 + }, + { + "epoch": 0.7434281449539762, + "grad_norm": 0.8816352486610413, + "learning_rate": 1.8759276798661612e-05, + "loss": 1.1365, + "mean_token_accuracy": 0.7058162286877632, + "num_tokens": 293594668.0, + "step": 8965 + }, + { + "epoch": 0.7438427730325897, + "grad_norm": 0.8981600999832153, + "learning_rate": 1.870280738498909e-05, + "loss": 1.2026, + "mean_token_accuracy": 0.6937744334340096, + "num_tokens": 293758508.0, + "step": 8970 + }, + { + "epoch": 0.7442574011112032, + "grad_norm": 0.9175354838371277, + "learning_rate": 1.8646403532299316e-05, + "loss": 1.1356, + "mean_token_accuracy": 0.7080461919307709, + "num_tokens": 293922348.0, + "step": 8975 + }, + { + "epoch": 0.7446720291898168, + "grad_norm": 0.9029030799865723, + "learning_rate": 1.8590065358746406e-05, + "loss": 1.1872, + "mean_token_accuracy": 0.6982954576611519, + "num_tokens": 294086188.0, + "step": 8980 + }, + { + "epoch": 0.7450866572684303, + "grad_norm": 0.9207746982574463, + "learning_rate": 1.8533792982346877e-05, + "loss": 1.0657, + "mean_token_accuracy": 0.7148093849420547, + "num_tokens": 294250028.0, + "step": 8985 + }, + { + "epoch": 0.7455012853470437, + "grad_norm": 0.9070084691047668, + "learning_rate": 1.8477586520979435e-05, + "loss": 1.1542, + "mean_token_accuracy": 0.7049059122800827, + "num_tokens": 294413868.0, + "step": 8990 + }, + { + "epoch": 0.7459159134256572, + "grad_norm": 0.9451984167098999, + "learning_rate": 1.8421446092384693e-05, + "loss": 1.14, + "mean_token_accuracy": 0.7041361212730408, + "num_tokens": 294577708.0, + "step": 8995 + }, + { + "epoch": 0.7463305415042707, + "grad_norm": 0.879784882068634, + "learning_rate": 1.836537181416495e-05, + "loss": 1.1751, + "mean_token_accuracy": 0.6972507297992706, + "num_tokens": 294741548.0, + "step": 9000 + }, + { + "epoch": 0.7467451695828842, + "grad_norm": 0.9778941869735718, + "learning_rate": 1.830936380378393e-05, + "loss": 1.1636, + "mean_token_accuracy": 0.7022543981671333, + "num_tokens": 294905388.0, + "step": 9005 + }, + { + "epoch": 0.7471597976614976, + "grad_norm": 0.8739100098609924, + "learning_rate": 1.8253422178566543e-05, + "loss": 1.0982, + "mean_token_accuracy": 0.7134714052081108, + "num_tokens": 295069228.0, + "step": 9010 + }, + { + "epoch": 0.7475744257401111, + "grad_norm": 0.9578571915626526, + "learning_rate": 1.8197547055698622e-05, + "loss": 1.2054, + "mean_token_accuracy": 0.6938294231891632, + "num_tokens": 295233068.0, + "step": 9015 + }, + { + "epoch": 0.7479890538187246, + "grad_norm": 0.8959738612174988, + "learning_rate": 1.814173855222671e-05, + "loss": 1.1437, + "mean_token_accuracy": 0.7040872409939766, + "num_tokens": 295396908.0, + "step": 9020 + }, + { + "epoch": 0.7484036818973381, + "grad_norm": 0.8838170766830444, + "learning_rate": 1.808599678505779e-05, + "loss": 1.1555, + "mean_token_accuracy": 0.7020955502986908, + "num_tokens": 295560748.0, + "step": 9025 + }, + { + "epoch": 0.7488183099759516, + "grad_norm": 0.9568129181861877, + "learning_rate": 1.8030321870959043e-05, + "loss": 1.1724, + "mean_token_accuracy": 0.6985031768679619, + "num_tokens": 295724588.0, + "step": 9030 + }, + { + "epoch": 0.749232938054565, + "grad_norm": 0.9240115284919739, + "learning_rate": 1.797471392655763e-05, + "loss": 1.1411, + "mean_token_accuracy": 0.7033174470067024, + "num_tokens": 295888428.0, + "step": 9035 + }, + { + "epoch": 0.7496475661331785, + "grad_norm": 0.8960933089256287, + "learning_rate": 1.7919173068340345e-05, + "loss": 1.1646, + "mean_token_accuracy": 0.7023196890950203, + "num_tokens": 296051910.0, + "step": 9040 + }, + { + "epoch": 0.750062194211792, + "grad_norm": 0.8688474297523499, + "learning_rate": 1.7863699412653568e-05, + "loss": 1.1728, + "mean_token_accuracy": 0.6987781018018723, + "num_tokens": 296215750.0, + "step": 9045 + }, + { + "epoch": 0.7504768222904055, + "grad_norm": 0.9132707118988037, + "learning_rate": 1.7808293075702832e-05, + "loss": 1.1421, + "mean_token_accuracy": 0.7063782960176468, + "num_tokens": 296379590.0, + "step": 9050 + }, + { + "epoch": 0.7508914503690189, + "grad_norm": 0.9477134346961975, + "learning_rate": 1.7752954173552672e-05, + "loss": 1.1669, + "mean_token_accuracy": 0.6979960918426513, + "num_tokens": 296543430.0, + "step": 9055 + }, + { + "epoch": 0.7513060784476324, + "grad_norm": 0.9153250455856323, + "learning_rate": 1.7697682822126312e-05, + "loss": 1.2199, + "mean_token_accuracy": 0.6940127104520798, + "num_tokens": 296707270.0, + "step": 9060 + }, + { + "epoch": 0.751720706526246, + "grad_norm": 0.9431553483009338, + "learning_rate": 1.764247913720556e-05, + "loss": 1.0696, + "mean_token_accuracy": 0.7168010741472244, + "num_tokens": 296871110.0, + "step": 9065 + }, + { + "epoch": 0.7521353346048595, + "grad_norm": 0.9085553884506226, + "learning_rate": 1.758734323443043e-05, + "loss": 1.2406, + "mean_token_accuracy": 0.6867057695984841, + "num_tokens": 297034950.0, + "step": 9070 + }, + { + "epoch": 0.752549962683473, + "grad_norm": 0.9111844897270203, + "learning_rate": 1.7532275229298927e-05, + "loss": 1.1737, + "mean_token_accuracy": 0.6989797174930572, + "num_tokens": 297198790.0, + "step": 9075 + }, + { + "epoch": 0.7529645907620864, + "grad_norm": 0.9182848334312439, + "learning_rate": 1.7477275237166834e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.7052480444312096, + "num_tokens": 297362630.0, + "step": 9080 + }, + { + "epoch": 0.7533792188406999, + "grad_norm": 0.9506045579910278, + "learning_rate": 1.742234337324753e-05, + "loss": 1.1941, + "mean_token_accuracy": 0.6953690081834794, + "num_tokens": 297526470.0, + "step": 9085 + }, + { + "epoch": 0.7537938469193134, + "grad_norm": 1.0101375579833984, + "learning_rate": 1.7367479752611564e-05, + "loss": 1.1485, + "mean_token_accuracy": 0.7034396350383758, + "num_tokens": 297690310.0, + "step": 9090 + }, + { + "epoch": 0.7542084749979269, + "grad_norm": 0.9124699234962463, + "learning_rate": 1.7312684490186597e-05, + "loss": 1.2135, + "mean_token_accuracy": 0.6892900764942169, + "num_tokens": 297854150.0, + "step": 9095 + }, + { + "epoch": 0.7546231030765403, + "grad_norm": 0.8840845823287964, + "learning_rate": 1.7257957700757132e-05, + "loss": 1.1423, + "mean_token_accuracy": 0.7055901750922203, + "num_tokens": 298017990.0, + "step": 9100 + }, + { + "epoch": 0.7550377311551538, + "grad_norm": 0.9478710293769836, + "learning_rate": 1.7203299498964143e-05, + "loss": 1.2587, + "mean_token_accuracy": 0.6825635373592377, + "num_tokens": 298181830.0, + "step": 9105 + }, + { + "epoch": 0.7554523592337673, + "grad_norm": 0.9232124090194702, + "learning_rate": 1.7148709999304984e-05, + "loss": 1.0747, + "mean_token_accuracy": 0.7147482916712761, + "num_tokens": 298345670.0, + "step": 9110 + }, + { + "epoch": 0.7558669873123808, + "grad_norm": 0.9229820370674133, + "learning_rate": 1.7094189316133075e-05, + "loss": 1.1342, + "mean_token_accuracy": 0.7042583122849464, + "num_tokens": 298509510.0, + "step": 9115 + }, + { + "epoch": 0.7562816153909943, + "grad_norm": 0.9188677668571472, + "learning_rate": 1.7039737563657733e-05, + "loss": 1.1156, + "mean_token_accuracy": 0.705241933465004, + "num_tokens": 298673350.0, + "step": 9120 + }, + { + "epoch": 0.7566962434696077, + "grad_norm": 0.9146133661270142, + "learning_rate": 1.698535485594378e-05, + "loss": 1.0953, + "mean_token_accuracy": 0.7126527383923531, + "num_tokens": 298837190.0, + "step": 9125 + }, + { + "epoch": 0.7571108715482212, + "grad_norm": 0.9183034300804138, + "learning_rate": 1.693104130691148e-05, + "loss": 1.1759, + "mean_token_accuracy": 0.694373169541359, + "num_tokens": 299001030.0, + "step": 9130 + }, + { + "epoch": 0.7575254996268347, + "grad_norm": 0.9065912961959839, + "learning_rate": 1.68767970303362e-05, + "loss": 1.1027, + "mean_token_accuracy": 0.7120051354169845, + "num_tokens": 299164870.0, + "step": 9135 + }, + { + "epoch": 0.7579401277054482, + "grad_norm": 0.9084035754203796, + "learning_rate": 1.68226221398482e-05, + "loss": 1.1369, + "mean_token_accuracy": 0.6996546849608422, + "num_tokens": 299327973.0, + "step": 9140 + }, + { + "epoch": 0.7583547557840618, + "grad_norm": 0.942358136177063, + "learning_rate": 1.6768516748932387e-05, + "loss": 1.1368, + "mean_token_accuracy": 0.702773705124855, + "num_tokens": 299491813.0, + "step": 9145 + }, + { + "epoch": 0.7587693838626752, + "grad_norm": 0.9485670328140259, + "learning_rate": 1.6714480970928086e-05, + "loss": 1.0961, + "mean_token_accuracy": 0.7088871866464614, + "num_tokens": 299655234.0, + "step": 9150 + }, + { + "epoch": 0.7591840119412887, + "grad_norm": 0.9354593753814697, + "learning_rate": 1.6660514919028795e-05, + "loss": 1.1791, + "mean_token_accuracy": 0.6979960888624192, + "num_tokens": 299819074.0, + "step": 9155 + }, + { + "epoch": 0.7595986400199022, + "grad_norm": 0.9166666865348816, + "learning_rate": 1.660661870628195e-05, + "loss": 1.1574, + "mean_token_accuracy": 0.7055657356977463, + "num_tokens": 299982914.0, + "step": 9160 + }, + { + "epoch": 0.7600132680985157, + "grad_norm": 0.9199892282485962, + "learning_rate": 1.655279244558869e-05, + "loss": 1.1039, + "mean_token_accuracy": 0.7136839970946312, + "num_tokens": 300145961.0, + "step": 9165 + }, + { + "epoch": 0.7604278961771291, + "grad_norm": 0.8845294117927551, + "learning_rate": 1.649903624970361e-05, + "loss": 1.0821, + "mean_token_accuracy": 0.714443401992321, + "num_tokens": 300309584.0, + "step": 9170 + }, + { + "epoch": 0.7608425242557426, + "grad_norm": 0.9437231421470642, + "learning_rate": 1.6445350231234557e-05, + "loss": 1.0407, + "mean_token_accuracy": 0.7223851427435874, + "num_tokens": 300473424.0, + "step": 9175 + }, + { + "epoch": 0.7612571523343561, + "grad_norm": 0.9445368647575378, + "learning_rate": 1.6391734502642365e-05, + "loss": 1.1781, + "mean_token_accuracy": 0.699159836769104, + "num_tokens": 300636243.0, + "step": 9180 + }, + { + "epoch": 0.7616717804129696, + "grad_norm": 0.965095043182373, + "learning_rate": 1.6338189176240565e-05, + "loss": 1.0827, + "mean_token_accuracy": 0.7107005223631859, + "num_tokens": 300799759.0, + "step": 9185 + }, + { + "epoch": 0.762086408491583, + "grad_norm": 0.917914628982544, + "learning_rate": 1.628471436419532e-05, + "loss": 1.1011, + "mean_token_accuracy": 0.7113697439432144, + "num_tokens": 300963599.0, + "step": 9190 + }, + { + "epoch": 0.7625010365701965, + "grad_norm": 0.9634137749671936, + "learning_rate": 1.6231310178525006e-05, + "loss": 1.1234, + "mean_token_accuracy": 0.7091825500130653, + "num_tokens": 301127439.0, + "step": 9195 + }, + { + "epoch": 0.76291566464881, + "grad_norm": 0.9900341033935547, + "learning_rate": 1.6177976731100064e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.6953079164028168, + "num_tokens": 301291279.0, + "step": 9200 + }, + { + "epoch": 0.7633302927274235, + "grad_norm": 0.9216656684875488, + "learning_rate": 1.612471413364276e-05, + "loss": 1.2361, + "mean_token_accuracy": 0.6890090435743332, + "num_tokens": 301455119.0, + "step": 9205 + }, + { + "epoch": 0.763744920806037, + "grad_norm": 0.8715385794639587, + "learning_rate": 1.607152249772694e-05, + "loss": 1.0902, + "mean_token_accuracy": 0.7106819450855255, + "num_tokens": 301618391.0, + "step": 9210 + }, + { + "epoch": 0.7641595488846504, + "grad_norm": 0.8799657225608826, + "learning_rate": 1.6018401934777834e-05, + "loss": 1.1037, + "mean_token_accuracy": 0.7106854885816574, + "num_tokens": 301782231.0, + "step": 9215 + }, + { + "epoch": 0.7645741769632639, + "grad_norm": 0.9428990483283997, + "learning_rate": 1.5965352556071695e-05, + "loss": 1.1853, + "mean_token_accuracy": 0.6966214567422867, + "num_tokens": 301946071.0, + "step": 9220 + }, + { + "epoch": 0.7649888050418774, + "grad_norm": 0.9601016640663147, + "learning_rate": 1.5912374472735775e-05, + "loss": 1.1461, + "mean_token_accuracy": 0.7017289862036705, + "num_tokens": 302109911.0, + "step": 9225 + }, + { + "epoch": 0.765403433120491, + "grad_norm": 0.9618821144104004, + "learning_rate": 1.5859467795747924e-05, + "loss": 1.2581, + "mean_token_accuracy": 0.6823558151721955, + "num_tokens": 302273751.0, + "step": 9230 + }, + { + "epoch": 0.7658180611991044, + "grad_norm": 0.9125745892524719, + "learning_rate": 1.5806632635936385e-05, + "loss": 1.196, + "mean_token_accuracy": 0.6952101662755013, + "num_tokens": 302437591.0, + "step": 9235 + }, + { + "epoch": 0.7662326892777179, + "grad_norm": 0.9249573349952698, + "learning_rate": 1.5753869103979617e-05, + "loss": 1.1281, + "mean_token_accuracy": 0.708146370947361, + "num_tokens": 302600764.0, + "step": 9240 + }, + { + "epoch": 0.7666473173563314, + "grad_norm": 0.9272013306617737, + "learning_rate": 1.5701177310406074e-05, + "loss": 1.22, + "mean_token_accuracy": 0.6942634865641594, + "num_tokens": 302763132.0, + "step": 9245 + }, + { + "epoch": 0.7670619454349449, + "grad_norm": 0.8903987407684326, + "learning_rate": 1.5648557365593847e-05, + "loss": 1.1602, + "mean_token_accuracy": 0.7022116348147392, + "num_tokens": 302926972.0, + "step": 9250 + }, + { + "epoch": 0.7674765735135584, + "grad_norm": 0.8905354738235474, + "learning_rate": 1.5596009379770582e-05, + "loss": 1.0595, + "mean_token_accuracy": 0.7200260147452354, + "num_tokens": 303090115.0, + "step": 9255 + }, + { + "epoch": 0.7678912015921718, + "grad_norm": 0.9127805233001709, + "learning_rate": 1.554353346301315e-05, + "loss": 1.1398, + "mean_token_accuracy": 0.7073802530765534, + "num_tokens": 303253955.0, + "step": 9260 + }, + { + "epoch": 0.7683058296707853, + "grad_norm": 0.8905320167541504, + "learning_rate": 1.5491129725247517e-05, + "loss": 1.1412, + "mean_token_accuracy": 0.7063364312052727, + "num_tokens": 303417455.0, + "step": 9265 + }, + { + "epoch": 0.7687204577493988, + "grad_norm": 0.8764287233352661, + "learning_rate": 1.5438798276248357e-05, + "loss": 1.1024, + "mean_token_accuracy": 0.7102937892079353, + "num_tokens": 303580409.0, + "step": 9270 + }, + { + "epoch": 0.7691350858280123, + "grad_norm": 0.9234013557434082, + "learning_rate": 1.538653922563895e-05, + "loss": 1.1668, + "mean_token_accuracy": 0.6989613935351372, + "num_tokens": 303744249.0, + "step": 9275 + }, + { + "epoch": 0.7695497139066257, + "grad_norm": 0.9448634386062622, + "learning_rate": 1.5334352682890995e-05, + "loss": 1.1547, + "mean_token_accuracy": 0.7002504870295525, + "num_tokens": 303908089.0, + "step": 9280 + }, + { + "epoch": 0.7699643419852392, + "grad_norm": 0.9277756214141846, + "learning_rate": 1.528223875732417e-05, + "loss": 1.1288, + "mean_token_accuracy": 0.7041738271713257, + "num_tokens": 304071694.0, + "step": 9285 + }, + { + "epoch": 0.7703789700638527, + "grad_norm": 0.913982629776001, + "learning_rate": 1.5230197558106118e-05, + "loss": 1.0845, + "mean_token_accuracy": 0.7101600661873817, + "num_tokens": 304235534.0, + "step": 9290 + }, + { + "epoch": 0.7707935981424662, + "grad_norm": 0.9111948013305664, + "learning_rate": 1.5178229194252125e-05, + "loss": 1.1944, + "mean_token_accuracy": 0.6978800103068352, + "num_tokens": 304399374.0, + "step": 9295 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.9338374733924866, + "learning_rate": 1.5126333774624884e-05, + "loss": 1.1735, + "mean_token_accuracy": 0.6949230194091797, + "num_tokens": 304563214.0, + "step": 9300 + }, + { + "epoch": 0.7716228542996931, + "grad_norm": 0.8922032713890076, + "learning_rate": 1.5074511407934306e-05, + "loss": 1.0864, + "mean_token_accuracy": 0.7133431106805801, + "num_tokens": 304727054.0, + "step": 9305 + }, + { + "epoch": 0.7720374823783066, + "grad_norm": 0.9665189385414124, + "learning_rate": 1.502276220273725e-05, + "loss": 1.1437, + "mean_token_accuracy": 0.7035278007388115, + "num_tokens": 304890209.0, + "step": 9310 + }, + { + "epoch": 0.7724521104569202, + "grad_norm": 0.9113141894340515, + "learning_rate": 1.497108626743734e-05, + "loss": 1.1442, + "mean_token_accuracy": 0.6997678428888321, + "num_tokens": 305054049.0, + "step": 9315 + }, + { + "epoch": 0.7728667385355337, + "grad_norm": 0.9439712166786194, + "learning_rate": 1.49194837102847e-05, + "loss": 1.2242, + "mean_token_accuracy": 0.6925625205039978, + "num_tokens": 305217143.0, + "step": 9320 + }, + { + "epoch": 0.7732813666141471, + "grad_norm": 0.8958566188812256, + "learning_rate": 1.4867954639375747e-05, + "loss": 1.0832, + "mean_token_accuracy": 0.7134441658854485, + "num_tokens": 305380436.0, + "step": 9325 + }, + { + "epoch": 0.7736959946927606, + "grad_norm": 0.9114380478858948, + "learning_rate": 1.4816499162652952e-05, + "loss": 1.1208, + "mean_token_accuracy": 0.7076368540525436, + "num_tokens": 305544276.0, + "step": 9330 + }, + { + "epoch": 0.7741106227713741, + "grad_norm": 0.9351910352706909, + "learning_rate": 1.4765117387904642e-05, + "loss": 1.1473, + "mean_token_accuracy": 0.7046065524220466, + "num_tokens": 305708116.0, + "step": 9335 + }, + { + "epoch": 0.7745252508499876, + "grad_norm": 0.8757506608963013, + "learning_rate": 1.471380942276473e-05, + "loss": 1.0575, + "mean_token_accuracy": 0.7192754164338112, + "num_tokens": 305871956.0, + "step": 9340 + }, + { + "epoch": 0.7749398789286011, + "grad_norm": 0.8893118500709534, + "learning_rate": 1.4662575374712528e-05, + "loss": 1.1157, + "mean_token_accuracy": 0.7113424986600876, + "num_tokens": 306035652.0, + "step": 9345 + }, + { + "epoch": 0.7753545070072145, + "grad_norm": 0.8937063813209534, + "learning_rate": 1.4611415351072505e-05, + "loss": 1.0923, + "mean_token_accuracy": 0.7138135403394699, + "num_tokens": 306199492.0, + "step": 9350 + }, + { + "epoch": 0.775769135085828, + "grad_norm": 0.9636370539665222, + "learning_rate": 1.456032945901406e-05, + "loss": 1.1465, + "mean_token_accuracy": 0.7007453605532646, + "num_tokens": 306363332.0, + "step": 9355 + }, + { + "epoch": 0.7761837631644415, + "grad_norm": 0.9361772537231445, + "learning_rate": 1.4509317805551326e-05, + "loss": 1.1454, + "mean_token_accuracy": 0.7041972175240516, + "num_tokens": 306527172.0, + "step": 9360 + }, + { + "epoch": 0.776598391243055, + "grad_norm": 0.9029082655906677, + "learning_rate": 1.4458380497542851e-05, + "loss": 1.1787, + "mean_token_accuracy": 0.6970857784152031, + "num_tokens": 306691012.0, + "step": 9365 + }, + { + "epoch": 0.7770130193216684, + "grad_norm": 0.865684449672699, + "learning_rate": 1.4407517641691543e-05, + "loss": 1.1602, + "mean_token_accuracy": 0.7019611462950707, + "num_tokens": 306854852.0, + "step": 9370 + }, + { + "epoch": 0.7774276474002819, + "grad_norm": 0.8580654263496399, + "learning_rate": 1.4356729344544296e-05, + "loss": 1.0224, + "mean_token_accuracy": 0.7245845541357994, + "num_tokens": 307018692.0, + "step": 9375 + }, + { + "epoch": 0.7778422754788954, + "grad_norm": 0.9202730655670166, + "learning_rate": 1.4306015712491788e-05, + "loss": 1.0776, + "mean_token_accuracy": 0.716000734269619, + "num_tokens": 307182532.0, + "step": 9380 + }, + { + "epoch": 0.7782569035575089, + "grad_norm": 0.8806677460670471, + "learning_rate": 1.425537685176836e-05, + "loss": 1.1717, + "mean_token_accuracy": 0.6999755635857582, + "num_tokens": 307346372.0, + "step": 9385 + }, + { + "epoch": 0.7786715316361223, + "grad_norm": 0.9154284596443176, + "learning_rate": 1.42048128684517e-05, + "loss": 1.0968, + "mean_token_accuracy": 0.7116507768630982, + "num_tokens": 307510212.0, + "step": 9390 + }, + { + "epoch": 0.7790861597147359, + "grad_norm": 0.9176647663116455, + "learning_rate": 1.4154323868462593e-05, + "loss": 1.1401, + "mean_token_accuracy": 0.7049975529313087, + "num_tokens": 307674052.0, + "step": 9395 + }, + { + "epoch": 0.7795007877933494, + "grad_norm": 0.916608989238739, + "learning_rate": 1.4103909957564792e-05, + "loss": 1.1176, + "mean_token_accuracy": 0.7100989744067192, + "num_tokens": 307837892.0, + "step": 9400 + }, + { + "epoch": 0.7799154158719629, + "grad_norm": 0.8862534761428833, + "learning_rate": 1.4053571241364787e-05, + "loss": 1.2014, + "mean_token_accuracy": 0.6926545411348343, + "num_tokens": 308001618.0, + "step": 9405 + }, + { + "epoch": 0.7803300439505764, + "grad_norm": 0.888766348361969, + "learning_rate": 1.4003307825311507e-05, + "loss": 1.1785, + "mean_token_accuracy": 0.6983260050415993, + "num_tokens": 308165458.0, + "step": 9410 + }, + { + "epoch": 0.7807446720291898, + "grad_norm": 0.934119701385498, + "learning_rate": 1.3953119814696125e-05, + "loss": 1.1837, + "mean_token_accuracy": 0.6947892278432846, + "num_tokens": 308328216.0, + "step": 9415 + }, + { + "epoch": 0.7811593001078033, + "grad_norm": 0.9300495982170105, + "learning_rate": 1.3903007314651877e-05, + "loss": 1.1776, + "mean_token_accuracy": 0.6941837728023529, + "num_tokens": 308492056.0, + "step": 9420 + }, + { + "epoch": 0.7815739281864168, + "grad_norm": 0.9354580044746399, + "learning_rate": 1.3852970430153884e-05, + "loss": 1.1448, + "mean_token_accuracy": 0.703286899626255, + "num_tokens": 308655896.0, + "step": 9425 + }, + { + "epoch": 0.7819885562650303, + "grad_norm": 0.8823285698890686, + "learning_rate": 1.3803009266018752e-05, + "loss": 1.1344, + "mean_token_accuracy": 0.705547408759594, + "num_tokens": 308819736.0, + "step": 9430 + }, + { + "epoch": 0.7824031843436438, + "grad_norm": 0.901211142539978, + "learning_rate": 1.3753123926904527e-05, + "loss": 1.1336, + "mean_token_accuracy": 0.706414957344532, + "num_tokens": 308983576.0, + "step": 9435 + }, + { + "epoch": 0.7828178124222572, + "grad_norm": 0.9550814628601074, + "learning_rate": 1.3703314517310473e-05, + "loss": 1.1562, + "mean_token_accuracy": 0.699321848154068, + "num_tokens": 309147416.0, + "step": 9440 + }, + { + "epoch": 0.7832324405008707, + "grad_norm": 0.9058507680892944, + "learning_rate": 1.3653581141576687e-05, + "loss": 1.128, + "mean_token_accuracy": 0.709035924077034, + "num_tokens": 309311256.0, + "step": 9445 + }, + { + "epoch": 0.7836470685794842, + "grad_norm": 0.9165582656860352, + "learning_rate": 1.3603923903884069e-05, + "loss": 1.1574, + "mean_token_accuracy": 0.7024167910218239, + "num_tokens": 309474909.0, + "step": 9450 + }, + { + "epoch": 0.7840616966580977, + "grad_norm": 0.9098576307296753, + "learning_rate": 1.3554342908253998e-05, + "loss": 1.0992, + "mean_token_accuracy": 0.712964317202568, + "num_tokens": 309638749.0, + "step": 9455 + }, + { + "epoch": 0.7844763247367111, + "grad_norm": 0.9143006801605225, + "learning_rate": 1.3504838258548148e-05, + "loss": 1.1601, + "mean_token_accuracy": 0.699217988550663, + "num_tokens": 309802589.0, + "step": 9460 + }, + { + "epoch": 0.7848909528153246, + "grad_norm": 0.9183697700500488, + "learning_rate": 1.3455410058468266e-05, + "loss": 1.1909, + "mean_token_accuracy": 0.6964198410511017, + "num_tokens": 309966429.0, + "step": 9465 + }, + { + "epoch": 0.7853055808939381, + "grad_norm": 0.9047302007675171, + "learning_rate": 1.340605841155595e-05, + "loss": 1.1261, + "mean_token_accuracy": 0.7058406680822372, + "num_tokens": 310130269.0, + "step": 9470 + }, + { + "epoch": 0.7857202089725516, + "grad_norm": 0.9284247756004333, + "learning_rate": 1.3356783421192436e-05, + "loss": 1.1477, + "mean_token_accuracy": 0.702303272485733, + "num_tokens": 310294109.0, + "step": 9475 + }, + { + "epoch": 0.7861348370511652, + "grad_norm": 0.9016001224517822, + "learning_rate": 1.3307585190598387e-05, + "loss": 1.1685, + "mean_token_accuracy": 0.6992851883172989, + "num_tokens": 310457949.0, + "step": 9480 + }, + { + "epoch": 0.7865494651297786, + "grad_norm": 0.9556524157524109, + "learning_rate": 1.3258463822833655e-05, + "loss": 1.1535, + "mean_token_accuracy": 0.7016067937016487, + "num_tokens": 310621789.0, + "step": 9485 + }, + { + "epoch": 0.7869640932083921, + "grad_norm": 0.920761227607727, + "learning_rate": 1.3209419420797098e-05, + "loss": 1.1588, + "mean_token_accuracy": 0.7001093402504921, + "num_tokens": 310784417.0, + "step": 9490 + }, + { + "epoch": 0.7873787212870056, + "grad_norm": 0.9084736704826355, + "learning_rate": 1.3160452087226332e-05, + "loss": 1.115, + "mean_token_accuracy": 0.709023705124855, + "num_tokens": 310948257.0, + "step": 9495 + }, + { + "epoch": 0.7877933493656191, + "grad_norm": 0.8487734794616699, + "learning_rate": 1.3111561924697552e-05, + "loss": 1.138, + "mean_token_accuracy": 0.7044367000460625, + "num_tokens": 311111442.0, + "step": 9500 + }, + { + "epoch": 0.7882079774442325, + "grad_norm": 0.9419161081314087, + "learning_rate": 1.306274903562527e-05, + "loss": 1.1104, + "mean_token_accuracy": 0.7091336712241173, + "num_tokens": 311275282.0, + "step": 9505 + }, + { + "epoch": 0.788622605522846, + "grad_norm": 0.9123713970184326, + "learning_rate": 1.3014013522262141e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.7031280517578125, + "num_tokens": 311439122.0, + "step": 9510 + }, + { + "epoch": 0.7890372336014595, + "grad_norm": 0.911185622215271, + "learning_rate": 1.2965355486698738e-05, + "loss": 1.1777, + "mean_token_accuracy": 0.6968230694532395, + "num_tokens": 311602962.0, + "step": 9515 + }, + { + "epoch": 0.789451861680073, + "grad_norm": 0.9397674202919006, + "learning_rate": 1.2916775030863337e-05, + "loss": 1.1689, + "mean_token_accuracy": 0.6983198881149292, + "num_tokens": 311766802.0, + "step": 9520 + }, + { + "epoch": 0.7898664897586865, + "grad_norm": 0.9195385575294495, + "learning_rate": 1.2868272256521657e-05, + "loss": 1.151, + "mean_token_accuracy": 0.7044660344719886, + "num_tokens": 311930642.0, + "step": 9525 + }, + { + "epoch": 0.7902811178372999, + "grad_norm": 0.9403318762779236, + "learning_rate": 1.2819847265276757e-05, + "loss": 1.1681, + "mean_token_accuracy": 0.696065491437912, + "num_tokens": 312094482.0, + "step": 9530 + }, + { + "epoch": 0.7906957459159134, + "grad_norm": 0.9081764221191406, + "learning_rate": 1.2771500158568745e-05, + "loss": 1.2203, + "mean_token_accuracy": 0.6894367069005967, + "num_tokens": 312258322.0, + "step": 9535 + }, + { + "epoch": 0.7911103739945269, + "grad_norm": 0.8648397326469421, + "learning_rate": 1.272323103767451e-05, + "loss": 1.0278, + "mean_token_accuracy": 0.725103859603405, + "num_tokens": 312422162.0, + "step": 9540 + }, + { + "epoch": 0.7915250020731404, + "grad_norm": 0.9654725790023804, + "learning_rate": 1.2675040003707639e-05, + "loss": 1.0897, + "mean_token_accuracy": 0.7063783004879951, + "num_tokens": 312586002.0, + "step": 9545 + }, + { + "epoch": 0.7919396301517538, + "grad_norm": 0.9537203311920166, + "learning_rate": 1.2626927157618157e-05, + "loss": 1.1461, + "mean_token_accuracy": 0.7018206283450127, + "num_tokens": 312749842.0, + "step": 9550 + }, + { + "epoch": 0.7923542582303673, + "grad_norm": 0.9430860280990601, + "learning_rate": 1.2578892600192272e-05, + "loss": 1.1302, + "mean_token_accuracy": 0.7051625117659569, + "num_tokens": 312913682.0, + "step": 9555 + }, + { + "epoch": 0.7927688863089808, + "grad_norm": 0.9268710017204285, + "learning_rate": 1.2530936432052154e-05, + "loss": 1.1222, + "mean_token_accuracy": 0.709597997367382, + "num_tokens": 313077522.0, + "step": 9560 + }, + { + "epoch": 0.7931835143875944, + "grad_norm": 0.9255589246749878, + "learning_rate": 1.2483058753655858e-05, + "loss": 1.1696, + "mean_token_accuracy": 0.6983382239937782, + "num_tokens": 313241362.0, + "step": 9565 + }, + { + "epoch": 0.7935981424662079, + "grad_norm": 0.955251157283783, + "learning_rate": 1.243525966529696e-05, + "loss": 1.1111, + "mean_token_accuracy": 0.7081378310918808, + "num_tokens": 313405202.0, + "step": 9570 + }, + { + "epoch": 0.7940127705448213, + "grad_norm": 0.9550685882568359, + "learning_rate": 1.2387539267104392e-05, + "loss": 1.2153, + "mean_token_accuracy": 0.6888379767537117, + "num_tokens": 313569042.0, + "step": 9575 + }, + { + "epoch": 0.7944273986234348, + "grad_norm": 0.9419229030609131, + "learning_rate": 1.2339897659042266e-05, + "loss": 1.1419, + "mean_token_accuracy": 0.704093350470066, + "num_tokens": 313732882.0, + "step": 9580 + }, + { + "epoch": 0.7948420267020483, + "grad_norm": 0.8668137192726135, + "learning_rate": 1.2292334940909699e-05, + "loss": 1.1148, + "mean_token_accuracy": 0.7130742907524109, + "num_tokens": 313896722.0, + "step": 9585 + }, + { + "epoch": 0.7952566547806618, + "grad_norm": 0.9046217203140259, + "learning_rate": 1.2244851212340453e-05, + "loss": 1.0689, + "mean_token_accuracy": 0.716208453476429, + "num_tokens": 314060562.0, + "step": 9590 + }, + { + "epoch": 0.7956712828592752, + "grad_norm": 0.940582811832428, + "learning_rate": 1.219744657280289e-05, + "loss": 1.2395, + "mean_token_accuracy": 0.6899193555116654, + "num_tokens": 314224402.0, + "step": 9595 + }, + { + "epoch": 0.7960859109378887, + "grad_norm": 0.9407996535301208, + "learning_rate": 1.2150121121599672e-05, + "loss": 1.1317, + "mean_token_accuracy": 0.7041361182928085, + "num_tokens": 314388242.0, + "step": 9600 + }, + { + "epoch": 0.7965005390165022, + "grad_norm": 0.8708174824714661, + "learning_rate": 1.2102874957867587e-05, + "loss": 1.1043, + "mean_token_accuracy": 0.7086939424276352, + "num_tokens": 314551308.0, + "step": 9605 + }, + { + "epoch": 0.7969151670951157, + "grad_norm": 0.891569197177887, + "learning_rate": 1.205570818057734e-05, + "loss": 1.1586, + "mean_token_accuracy": 0.7046187713742256, + "num_tokens": 314715148.0, + "step": 9610 + }, + { + "epoch": 0.7973297951737292, + "grad_norm": 0.9294144511222839, + "learning_rate": 1.2008620888533306e-05, + "loss": 1.1247, + "mean_token_accuracy": 0.7054679840803146, + "num_tokens": 314878988.0, + "step": 9615 + }, + { + "epoch": 0.7977444232523426, + "grad_norm": 0.9487144947052002, + "learning_rate": 1.1961613180373421e-05, + "loss": 1.1141, + "mean_token_accuracy": 0.706329420208931, + "num_tokens": 315042828.0, + "step": 9620 + }, + { + "epoch": 0.7981590513309561, + "grad_norm": 0.941530168056488, + "learning_rate": 1.1914685154568822e-05, + "loss": 1.1742, + "mean_token_accuracy": 0.6969696968793869, + "num_tokens": 315206668.0, + "step": 9625 + }, + { + "epoch": 0.7985736794095696, + "grad_norm": 0.9285726547241211, + "learning_rate": 1.1867836909423797e-05, + "loss": 1.2046, + "mean_token_accuracy": 0.6940921306610107, + "num_tokens": 315370508.0, + "step": 9630 + }, + { + "epoch": 0.7989883074881831, + "grad_norm": 0.9117364287376404, + "learning_rate": 1.1821068543075481e-05, + "loss": 1.1294, + "mean_token_accuracy": 0.7027370512485505, + "num_tokens": 315534348.0, + "step": 9635 + }, + { + "epoch": 0.7994029355667965, + "grad_norm": 0.9249398112297058, + "learning_rate": 1.177438015349368e-05, + "loss": 1.0664, + "mean_token_accuracy": 0.7145039081573487, + "num_tokens": 315698188.0, + "step": 9640 + }, + { + "epoch": 0.79981756364541, + "grad_norm": 0.9028452634811401, + "learning_rate": 1.1727771838480678e-05, + "loss": 1.1752, + "mean_token_accuracy": 0.696138808131218, + "num_tokens": 315862028.0, + "step": 9645 + }, + { + "epoch": 0.8002321917240236, + "grad_norm": 0.9576271772384644, + "learning_rate": 1.1681243695671013e-05, + "loss": 1.156, + "mean_token_accuracy": 0.6987964272499084, + "num_tokens": 316025868.0, + "step": 9650 + }, + { + "epoch": 0.8006468198026371, + "grad_norm": 0.9440005421638489, + "learning_rate": 1.1634795822531275e-05, + "loss": 1.1531, + "mean_token_accuracy": 0.7014662772417068, + "num_tokens": 316189708.0, + "step": 9655 + }, + { + "epoch": 0.8010614478812506, + "grad_norm": 0.9253191351890564, + "learning_rate": 1.1588428316359912e-05, + "loss": 1.0808, + "mean_token_accuracy": 0.7155119732022286, + "num_tokens": 316353548.0, + "step": 9660 + }, + { + "epoch": 0.801476075959864, + "grad_norm": 0.9329873323440552, + "learning_rate": 1.1542141274287032e-05, + "loss": 1.1319, + "mean_token_accuracy": 0.7086255982518196, + "num_tokens": 316516714.0, + "step": 9665 + }, + { + "epoch": 0.8018907040384775, + "grad_norm": 0.9451928734779358, + "learning_rate": 1.1495934793274132e-05, + "loss": 1.2161, + "mean_token_accuracy": 0.69285189807415, + "num_tokens": 316680554.0, + "step": 9670 + }, + { + "epoch": 0.802305332117091, + "grad_norm": 0.9663540124893188, + "learning_rate": 1.144980897011404e-05, + "loss": 1.1314, + "mean_token_accuracy": 0.7076063051819801, + "num_tokens": 316844394.0, + "step": 9675 + }, + { + "epoch": 0.8027199601957045, + "grad_norm": 0.967907190322876, + "learning_rate": 1.140376390143057e-05, + "loss": 1.1521, + "mean_token_accuracy": 0.699883921444416, + "num_tokens": 317008234.0, + "step": 9680 + }, + { + "epoch": 0.8031345882743179, + "grad_norm": 0.9527965784072876, + "learning_rate": 1.1357799683678332e-05, + "loss": 1.1615, + "mean_token_accuracy": 0.7014907151460648, + "num_tokens": 317172074.0, + "step": 9685 + }, + { + "epoch": 0.8035492163529314, + "grad_norm": 0.8742443323135376, + "learning_rate": 1.1311916413142671e-05, + "loss": 1.184, + "mean_token_accuracy": 0.6934322997927665, + "num_tokens": 317335914.0, + "step": 9690 + }, + { + "epoch": 0.8039638444315449, + "grad_norm": 0.9260457158088684, + "learning_rate": 1.1266114185939286e-05, + "loss": 1.1367, + "mean_token_accuracy": 0.7050342127680779, + "num_tokens": 317499754.0, + "step": 9695 + }, + { + "epoch": 0.8043784725101584, + "grad_norm": 0.8898655772209167, + "learning_rate": 1.1220393098014147e-05, + "loss": 1.1426, + "mean_token_accuracy": 0.7027431607246399, + "num_tokens": 317663594.0, + "step": 9700 + }, + { + "epoch": 0.8047931005887718, + "grad_norm": 0.8611737489700317, + "learning_rate": 1.1174753245143205e-05, + "loss": 1.0749, + "mean_token_accuracy": 0.7163734138011932, + "num_tokens": 317827434.0, + "step": 9705 + }, + { + "epoch": 0.8052077286673853, + "grad_norm": 0.9127941131591797, + "learning_rate": 1.1129194722932307e-05, + "loss": 1.1924, + "mean_token_accuracy": 0.6959433034062386, + "num_tokens": 317991274.0, + "step": 9710 + }, + { + "epoch": 0.8056223567459988, + "grad_norm": 0.9587748646736145, + "learning_rate": 1.1083717626816904e-05, + "loss": 1.1248, + "mean_token_accuracy": 0.7029631018638611, + "num_tokens": 318155114.0, + "step": 9715 + }, + { + "epoch": 0.8060369848246123, + "grad_norm": 0.9331362843513489, + "learning_rate": 1.1038322052061834e-05, + "loss": 1.2249, + "mean_token_accuracy": 0.6885263949632645, + "num_tokens": 318318954.0, + "step": 9720 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.9582976698875427, + "learning_rate": 1.0993008093761214e-05, + "loss": 1.2495, + "mean_token_accuracy": 0.6804313257336616, + "num_tokens": 318482794.0, + "step": 9725 + }, + { + "epoch": 0.8068662409818393, + "grad_norm": 0.9009600281715393, + "learning_rate": 1.094777584683821e-05, + "loss": 1.1759, + "mean_token_accuracy": 0.6934811815619468, + "num_tokens": 318646634.0, + "step": 9730 + }, + { + "epoch": 0.8072808690604528, + "grad_norm": 0.9420273303985596, + "learning_rate": 1.0902625406044753e-05, + "loss": 1.0941, + "mean_token_accuracy": 0.7095948219299316, + "num_tokens": 318809703.0, + "step": 9735 + }, + { + "epoch": 0.8076954971390663, + "grad_norm": 0.9314038157463074, + "learning_rate": 1.0857556865961437e-05, + "loss": 1.0884, + "mean_token_accuracy": 0.7102150544524193, + "num_tokens": 318973543.0, + "step": 9740 + }, + { + "epoch": 0.8081101252176798, + "grad_norm": 0.8720875382423401, + "learning_rate": 1.081257032099735e-05, + "loss": 1.0702, + "mean_token_accuracy": 0.7190852329134941, + "num_tokens": 319137036.0, + "step": 9745 + }, + { + "epoch": 0.8085247532962933, + "grad_norm": 0.9193700551986694, + "learning_rate": 1.0767665865389714e-05, + "loss": 1.1556, + "mean_token_accuracy": 0.7058703511953354, + "num_tokens": 319300542.0, + "step": 9750 + }, + { + "epoch": 0.8089393813749067, + "grad_norm": 0.9067955017089844, + "learning_rate": 1.0722843593203862e-05, + "loss": 1.1711, + "mean_token_accuracy": 0.6960166156291961, + "num_tokens": 319464382.0, + "step": 9755 + }, + { + "epoch": 0.8093540094535202, + "grad_norm": 0.9262675642967224, + "learning_rate": 1.0678103598332939e-05, + "loss": 1.2394, + "mean_token_accuracy": 0.6870135754346848, + "num_tokens": 319627902.0, + "step": 9760 + }, + { + "epoch": 0.8097686375321337, + "grad_norm": 0.9415069818496704, + "learning_rate": 1.063344597449778e-05, + "loss": 1.102, + "mean_token_accuracy": 0.7079545423388481, + "num_tokens": 319791742.0, + "step": 9765 + }, + { + "epoch": 0.8101832656107472, + "grad_norm": 0.9466578960418701, + "learning_rate": 1.0588870815246604e-05, + "loss": 1.1496, + "mean_token_accuracy": 0.7043682783842087, + "num_tokens": 319955582.0, + "step": 9770 + }, + { + "epoch": 0.8105978936893606, + "grad_norm": 0.9340462684631348, + "learning_rate": 1.0544378213954935e-05, + "loss": 1.1309, + "mean_token_accuracy": 0.705266372859478, + "num_tokens": 320119422.0, + "step": 9775 + }, + { + "epoch": 0.8110125217679741, + "grad_norm": 0.9780790209770203, + "learning_rate": 1.0499968263825332e-05, + "loss": 1.2006, + "mean_token_accuracy": 0.6927358239889145, + "num_tokens": 320283262.0, + "step": 9780 + }, + { + "epoch": 0.8114271498465876, + "grad_norm": 0.9603679776191711, + "learning_rate": 1.0455641057887229e-05, + "loss": 1.1339, + "mean_token_accuracy": 0.7017289817333221, + "num_tokens": 320447102.0, + "step": 9785 + }, + { + "epoch": 0.8118417779252011, + "grad_norm": 0.9297229051589966, + "learning_rate": 1.0411396688996722e-05, + "loss": 1.1352, + "mean_token_accuracy": 0.7051136359572411, + "num_tokens": 320610942.0, + "step": 9790 + }, + { + "epoch": 0.8122564060038145, + "grad_norm": 0.9165574312210083, + "learning_rate": 1.0367235249836383e-05, + "loss": 1.1071, + "mean_token_accuracy": 0.7109909549355506, + "num_tokens": 320774782.0, + "step": 9795 + }, + { + "epoch": 0.812671034082428, + "grad_norm": 0.9278814792633057, + "learning_rate": 1.0323156832915066e-05, + "loss": 1.1511, + "mean_token_accuracy": 0.7008975803852081, + "num_tokens": 320937748.0, + "step": 9800 + }, + { + "epoch": 0.8130856621610415, + "grad_norm": 0.9207982420921326, + "learning_rate": 1.0279161530567711e-05, + "loss": 1.1416, + "mean_token_accuracy": 0.7050158873200416, + "num_tokens": 321101588.0, + "step": 9805 + }, + { + "epoch": 0.813500290239655, + "grad_norm": 0.9000398516654968, + "learning_rate": 1.0235249434955141e-05, + "loss": 1.1846, + "mean_token_accuracy": 0.6955950632691383, + "num_tokens": 321265428.0, + "step": 9810 + }, + { + "epoch": 0.8139149183182686, + "grad_norm": 0.8642020225524902, + "learning_rate": 1.0191420638063887e-05, + "loss": 1.0791, + "mean_token_accuracy": 0.7153164729475975, + "num_tokens": 321429268.0, + "step": 9815 + }, + { + "epoch": 0.814329546396882, + "grad_norm": 0.9205776453018188, + "learning_rate": 1.0147675231705989e-05, + "loss": 1.2521, + "mean_token_accuracy": 0.6874633371829987, + "num_tokens": 321593108.0, + "step": 9820 + }, + { + "epoch": 0.8147441744754955, + "grad_norm": 0.8823779225349426, + "learning_rate": 1.01040133075188e-05, + "loss": 1.1504, + "mean_token_accuracy": 0.697543989121914, + "num_tokens": 321756948.0, + "step": 9825 + }, + { + "epoch": 0.815158802554109, + "grad_norm": 0.9273008108139038, + "learning_rate": 1.0060434956964792e-05, + "loss": 1.1397, + "mean_token_accuracy": 0.7007820159196854, + "num_tokens": 321920788.0, + "step": 9830 + }, + { + "epoch": 0.8155734306327225, + "grad_norm": 0.9190025925636292, + "learning_rate": 1.0016940271331365e-05, + "loss": 1.1571, + "mean_token_accuracy": 0.7008614405989647, + "num_tokens": 322084628.0, + "step": 9835 + }, + { + "epoch": 0.815988058711336, + "grad_norm": 0.9909818172454834, + "learning_rate": 9.97352934173067e-06, + "loss": 1.1285, + "mean_token_accuracy": 0.705999507009983, + "num_tokens": 322248468.0, + "step": 9840 + }, + { + "epoch": 0.8164026867899494, + "grad_norm": 0.9337329268455505, + "learning_rate": 9.930202259099397e-06, + "loss": 1.1454, + "mean_token_accuracy": 0.7040017083287239, + "num_tokens": 322412308.0, + "step": 9845 + }, + { + "epoch": 0.8168173148685629, + "grad_norm": 0.952125072479248, + "learning_rate": 9.886959114198601e-06, + "loss": 1.1633, + "mean_token_accuracy": 0.7001160800457, + "num_tokens": 322576148.0, + "step": 9850 + }, + { + "epoch": 0.8172319429471764, + "grad_norm": 0.9478073716163635, + "learning_rate": 9.843799997613495e-06, + "loss": 1.1494, + "mean_token_accuracy": 0.7000427633523941, + "num_tokens": 322739988.0, + "step": 9855 + }, + { + "epoch": 0.8176465710257899, + "grad_norm": 0.9033485651016235, + "learning_rate": 9.800724999753298e-06, + "loss": 1.1313, + "mean_token_accuracy": 0.7057612419128418, + "num_tokens": 322903828.0, + "step": 9860 + }, + { + "epoch": 0.8180611991044033, + "grad_norm": 0.8981077075004578, + "learning_rate": 9.757734210850956e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.7132087007164956, + "num_tokens": 323067668.0, + "step": 9865 + }, + { + "epoch": 0.8184758271830168, + "grad_norm": 0.8958315849304199, + "learning_rate": 9.714827720963089e-06, + "loss": 1.0902, + "mean_token_accuracy": 0.7091581091284752, + "num_tokens": 323231508.0, + "step": 9870 + }, + { + "epoch": 0.8188904552616303, + "grad_norm": 0.9041430950164795, + "learning_rate": 9.672005619969705e-06, + "loss": 1.1608, + "mean_token_accuracy": 0.7004704251885414, + "num_tokens": 323395348.0, + "step": 9875 + }, + { + "epoch": 0.8193050833402438, + "grad_norm": 0.9167628884315491, + "learning_rate": 9.629267997573998e-06, + "loss": 1.1195, + "mean_token_accuracy": 0.7073863670229912, + "num_tokens": 323559188.0, + "step": 9880 + }, + { + "epoch": 0.8197197114188572, + "grad_norm": 0.948016881942749, + "learning_rate": 9.586614943302225e-06, + "loss": 1.1701, + "mean_token_accuracy": 0.6989162027835846, + "num_tokens": 323722759.0, + "step": 9885 + }, + { + "epoch": 0.8201343394974707, + "grad_norm": 0.9600058197975159, + "learning_rate": 9.544046546503526e-06, + "loss": 1.1119, + "mean_token_accuracy": 0.7122678458690643, + "num_tokens": 323886599.0, + "step": 9890 + }, + { + "epoch": 0.8205489675760842, + "grad_norm": 0.9428344368934631, + "learning_rate": 9.501562896349636e-06, + "loss": 1.1537, + "mean_token_accuracy": 0.7019672557711601, + "num_tokens": 324050439.0, + "step": 9895 + }, + { + "epoch": 0.8209635956546978, + "grad_norm": 0.8840602040290833, + "learning_rate": 9.459164081834803e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.7209860756993294, + "num_tokens": 324214279.0, + "step": 9900 + }, + { + "epoch": 0.8213782237333113, + "grad_norm": 0.9207726120948792, + "learning_rate": 9.41685019177554e-06, + "loss": 1.1189, + "mean_token_accuracy": 0.7070136874914169, + "num_tokens": 324378119.0, + "step": 9905 + }, + { + "epoch": 0.8217928518119247, + "grad_norm": 0.9566503763198853, + "learning_rate": 9.374621314810517e-06, + "loss": 1.1231, + "mean_token_accuracy": 0.7043621718883515, + "num_tokens": 324541959.0, + "step": 9910 + }, + { + "epoch": 0.8222074798905382, + "grad_norm": 0.9196707606315613, + "learning_rate": 9.332477539400237e-06, + "loss": 1.1587, + "mean_token_accuracy": 0.7004154473543167, + "num_tokens": 324705799.0, + "step": 9915 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.9198716282844543, + "learning_rate": 9.290418953826969e-06, + "loss": 1.1419, + "mean_token_accuracy": 0.6994196027517319, + "num_tokens": 324869639.0, + "step": 9920 + }, + { + "epoch": 0.8230367360477652, + "grad_norm": 0.9049525260925293, + "learning_rate": 9.248445646194575e-06, + "loss": 1.1427, + "mean_token_accuracy": 0.7043212160468102, + "num_tokens": 325032620.0, + "step": 9925 + }, + { + "epoch": 0.8234513641263786, + "grad_norm": 0.8652673959732056, + "learning_rate": 9.206557704428203e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.7168009787797928, + "num_tokens": 325195322.0, + "step": 9930 + }, + { + "epoch": 0.8238659922049921, + "grad_norm": 0.9393934607505798, + "learning_rate": 9.164755216274213e-06, + "loss": 1.119, + "mean_token_accuracy": 0.7064393922686577, + "num_tokens": 325359162.0, + "step": 9935 + }, + { + "epoch": 0.8242806202836056, + "grad_norm": 0.8978515267372131, + "learning_rate": 9.123038269299961e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.7135019555687905, + "num_tokens": 325523002.0, + "step": 9940 + }, + { + "epoch": 0.8246952483622191, + "grad_norm": 0.9161264300346375, + "learning_rate": 9.0814069508936e-06, + "loss": 1.1641, + "mean_token_accuracy": 0.6990469247102737, + "num_tokens": 325686842.0, + "step": 9945 + }, + { + "epoch": 0.8251098764408326, + "grad_norm": 0.97547447681427, + "learning_rate": 9.039861348263916e-06, + "loss": 1.2432, + "mean_token_accuracy": 0.688801321387291, + "num_tokens": 325850682.0, + "step": 9950 + }, + { + "epoch": 0.825524504519446, + "grad_norm": 0.9172536730766296, + "learning_rate": 8.99840154844015e-06, + "loss": 1.1863, + "mean_token_accuracy": 0.700428931415081, + "num_tokens": 326013695.0, + "step": 9955 + }, + { + "epoch": 0.8259391325980595, + "grad_norm": 0.8994693160057068, + "learning_rate": 8.957027638271775e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.7163795173168183, + "num_tokens": 326177535.0, + "step": 9960 + }, + { + "epoch": 0.826353760676673, + "grad_norm": 0.8992630243301392, + "learning_rate": 8.915739704428366e-06, + "loss": 1.1719, + "mean_token_accuracy": 0.6994257062673569, + "num_tokens": 326341375.0, + "step": 9965 + }, + { + "epoch": 0.8267683887552865, + "grad_norm": 0.9343917965888977, + "learning_rate": 8.874537833399399e-06, + "loss": 1.1711, + "mean_token_accuracy": 0.6989876970648765, + "num_tokens": 326504772.0, + "step": 9970 + }, + { + "epoch": 0.8271830168338999, + "grad_norm": 0.9663552045822144, + "learning_rate": 8.833422111494043e-06, + "loss": 1.1592, + "mean_token_accuracy": 0.7007820174098015, + "num_tokens": 326668612.0, + "step": 9975 + }, + { + "epoch": 0.8275976449125135, + "grad_norm": 0.8932210803031921, + "learning_rate": 8.792392624841034e-06, + "loss": 1.1281, + "mean_token_accuracy": 0.7049425706267357, + "num_tokens": 326832452.0, + "step": 9980 + }, + { + "epoch": 0.828012272991127, + "grad_norm": 0.888608992099762, + "learning_rate": 8.751449459388434e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.707203084230423, + "num_tokens": 326996292.0, + "step": 9985 + }, + { + "epoch": 0.8284269010697405, + "grad_norm": 0.915816068649292, + "learning_rate": 8.710592700903496e-06, + "loss": 1.1424, + "mean_token_accuracy": 0.7073291972279548, + "num_tokens": 327159314.0, + "step": 9990 + }, + { + "epoch": 0.828841529148354, + "grad_norm": 0.9738427996635437, + "learning_rate": 8.669822434972474e-06, + "loss": 1.1868, + "mean_token_accuracy": 0.6964626044034958, + "num_tokens": 327323154.0, + "step": 9995 + }, + { + "epoch": 0.8292561572269674, + "grad_norm": 0.9736239314079285, + "learning_rate": 8.629138747000425e-06, + "loss": 1.1859, + "mean_token_accuracy": 0.6947519570589066, + "num_tokens": 327486994.0, + "step": 10000 + }, + { + "epoch": 0.8296707853055809, + "grad_norm": 0.9018882513046265, + "learning_rate": 8.588541722211063e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.7181940361857414, + "num_tokens": 327650834.0, + "step": 10005 + }, + { + "epoch": 0.8300854133841944, + "grad_norm": 0.9313610792160034, + "learning_rate": 8.548031445646509e-06, + "loss": 1.1422, + "mean_token_accuracy": 0.7015029296278954, + "num_tokens": 327814674.0, + "step": 10010 + }, + { + "epoch": 0.8305000414628079, + "grad_norm": 0.9230836033821106, + "learning_rate": 8.507608002167244e-06, + "loss": 1.1125, + "mean_token_accuracy": 0.7078018069267273, + "num_tokens": 327978514.0, + "step": 10015 + }, + { + "epoch": 0.8309146695414213, + "grad_norm": 0.9039062857627869, + "learning_rate": 8.46727147645181e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.7102129101753235, + "num_tokens": 328142017.0, + "step": 10020 + }, + { + "epoch": 0.8313292976200348, + "grad_norm": 0.8902859091758728, + "learning_rate": 8.427021952996633e-06, + "loss": 1.1766, + "mean_token_accuracy": 0.6976967275142669, + "num_tokens": 328305857.0, + "step": 10025 + }, + { + "epoch": 0.8317439256986483, + "grad_norm": 0.9402782917022705, + "learning_rate": 8.386859516115974e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.7144122704863548, + "num_tokens": 328469697.0, + "step": 10030 + }, + { + "epoch": 0.8321585537772618, + "grad_norm": 0.9468030333518982, + "learning_rate": 8.346784249941619e-06, + "loss": 1.1426, + "mean_token_accuracy": 0.7015090376138687, + "num_tokens": 328633537.0, + "step": 10035 + }, + { + "epoch": 0.8325731818558753, + "grad_norm": 0.9220185875892639, + "learning_rate": 8.306796238422736e-06, + "loss": 1.2338, + "mean_token_accuracy": 0.6866507828235626, + "num_tokens": 328797377.0, + "step": 10040 + }, + { + "epoch": 0.8329878099344887, + "grad_norm": 0.9200132489204407, + "learning_rate": 8.266895565325722e-06, + "loss": 1.1577, + "mean_token_accuracy": 0.6988514199852943, + "num_tokens": 328961217.0, + "step": 10045 + }, + { + "epoch": 0.8334024380131022, + "grad_norm": 0.9153368473052979, + "learning_rate": 8.227082314234058e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.7106427147984504, + "num_tokens": 329125057.0, + "step": 10050 + }, + { + "epoch": 0.8338170660917157, + "grad_norm": 0.8476196527481079, + "learning_rate": 8.18735656854806e-06, + "loss": 1.1204, + "mean_token_accuracy": 0.7102761447429657, + "num_tokens": 329288897.0, + "step": 10055 + }, + { + "epoch": 0.8342316941703292, + "grad_norm": 0.8842123746871948, + "learning_rate": 8.147718411484717e-06, + "loss": 1.1358, + "mean_token_accuracy": 0.707978980243206, + "num_tokens": 329452737.0, + "step": 10060 + }, + { + "epoch": 0.8346463222489428, + "grad_norm": 0.9294978976249695, + "learning_rate": 8.10816792607757e-06, + "loss": 1.083, + "mean_token_accuracy": 0.7122824415564537, + "num_tokens": 329615930.0, + "step": 10065 + }, + { + "epoch": 0.8350609503275562, + "grad_norm": 0.915131151676178, + "learning_rate": 8.068705195176535e-06, + "loss": 1.1108, + "mean_token_accuracy": 0.7099340170621872, + "num_tokens": 329779770.0, + "step": 10070 + }, + { + "epoch": 0.8354755784061697, + "grad_norm": 0.9168100357055664, + "learning_rate": 8.029330301447618e-06, + "loss": 1.1126, + "mean_token_accuracy": 0.7066104561090469, + "num_tokens": 329943610.0, + "step": 10075 + }, + { + "epoch": 0.8358902064847832, + "grad_norm": 0.939510703086853, + "learning_rate": 7.990043327372904e-06, + "loss": 1.1494, + "mean_token_accuracy": 0.7025109991431236, + "num_tokens": 330107450.0, + "step": 10080 + }, + { + "epoch": 0.8363048345633967, + "grad_norm": 0.9343424439430237, + "learning_rate": 7.950844355250259e-06, + "loss": 1.1409, + "mean_token_accuracy": 0.7024132460355759, + "num_tokens": 330271290.0, + "step": 10085 + }, + { + "epoch": 0.8367194626420101, + "grad_norm": 0.9184180498123169, + "learning_rate": 7.911733467193227e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.7142900779843331, + "num_tokens": 330435130.0, + "step": 10090 + }, + { + "epoch": 0.8371340907206236, + "grad_norm": 0.9067801833152771, + "learning_rate": 7.872710745130824e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.7128604590892792, + "num_tokens": 330598970.0, + "step": 10095 + }, + { + "epoch": 0.8375487187992371, + "grad_norm": 0.9362291693687439, + "learning_rate": 7.833776270807374e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.7090097352862358, + "num_tokens": 330762716.0, + "step": 10100 + }, + { + "epoch": 0.8379633468778506, + "grad_norm": 0.9227790832519531, + "learning_rate": 7.794930125782352e-06, + "loss": 1.1974, + "mean_token_accuracy": 0.6948191598057747, + "num_tokens": 330926556.0, + "step": 10105 + }, + { + "epoch": 0.838377974956464, + "grad_norm": 0.9292659759521484, + "learning_rate": 7.756172391430188e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.7169965773820877, + "num_tokens": 331090396.0, + "step": 10110 + }, + { + "epoch": 0.8387926030350775, + "grad_norm": 0.9163357615470886, + "learning_rate": 7.717503148940125e-06, + "loss": 1.1436, + "mean_token_accuracy": 0.7077346071600914, + "num_tokens": 331254236.0, + "step": 10115 + }, + { + "epoch": 0.839207231113691, + "grad_norm": 0.9387226104736328, + "learning_rate": 7.678922479316025e-06, + "loss": 1.129, + "mean_token_accuracy": 0.7058895394206047, + "num_tokens": 331418076.0, + "step": 10120 + }, + { + "epoch": 0.8396218591923045, + "grad_norm": 0.9172439575195312, + "learning_rate": 7.640430463376214e-06, + "loss": 1.1417, + "mean_token_accuracy": 0.7037695482373237, + "num_tokens": 331581916.0, + "step": 10125 + }, + { + "epoch": 0.840036487270918, + "grad_norm": 0.8944951891899109, + "learning_rate": 7.602027181753302e-06, + "loss": 1.14, + "mean_token_accuracy": 0.7064210638403893, + "num_tokens": 331745756.0, + "step": 10130 + }, + { + "epoch": 0.8404511153495314, + "grad_norm": 0.9381096959114075, + "learning_rate": 7.5637127148940164e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.7156891539692879, + "num_tokens": 331909596.0, + "step": 10135 + }, + { + "epoch": 0.8408657434281449, + "grad_norm": 0.9678494334220886, + "learning_rate": 7.525487143059046e-06, + "loss": 1.1832, + "mean_token_accuracy": 0.6973851352930069, + "num_tokens": 332073436.0, + "step": 10140 + }, + { + "epoch": 0.8412803715067584, + "grad_norm": 0.9401522874832153, + "learning_rate": 7.487350546322858e-06, + "loss": 1.1335, + "mean_token_accuracy": 0.7039284020662308, + "num_tokens": 332237276.0, + "step": 10145 + }, + { + "epoch": 0.841694999585372, + "grad_norm": 0.9201824069023132, + "learning_rate": 7.449303004573538e-06, + "loss": 1.1038, + "mean_token_accuracy": 0.7088831886649132, + "num_tokens": 332401116.0, + "step": 10150 + }, + { + "epoch": 0.8421096276639854, + "grad_norm": 0.9344897866249084, + "learning_rate": 7.4113445975126205e-06, + "loss": 1.1722, + "mean_token_accuracy": 0.7006353884935379, + "num_tokens": 332564956.0, + "step": 10155 + }, + { + "epoch": 0.8425242557425989, + "grad_norm": 0.9061588644981384, + "learning_rate": 7.373475404654917e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.7113880768418313, + "num_tokens": 332728796.0, + "step": 10160 + }, + { + "epoch": 0.8429388838212124, + "grad_norm": 0.913648784160614, + "learning_rate": 7.335695505328366e-06, + "loss": 1.2308, + "mean_token_accuracy": 0.6860153913497925, + "num_tokens": 332892636.0, + "step": 10165 + }, + { + "epoch": 0.8433535118998259, + "grad_norm": 0.9222296476364136, + "learning_rate": 7.298004978673817e-06, + "loss": 1.1789, + "mean_token_accuracy": 0.6969758063554764, + "num_tokens": 333056476.0, + "step": 10170 + }, + { + "epoch": 0.8437681399784394, + "grad_norm": 0.9109228253364563, + "learning_rate": 7.260403903644969e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.7219635859131813, + "num_tokens": 333220316.0, + "step": 10175 + }, + { + "epoch": 0.8441827680570528, + "grad_norm": 0.9047378301620483, + "learning_rate": 7.222892359008082e-06, + "loss": 1.1158, + "mean_token_accuracy": 0.709744618833065, + "num_tokens": 333384156.0, + "step": 10180 + }, + { + "epoch": 0.8445973961356663, + "grad_norm": 0.8748897910118103, + "learning_rate": 7.185470423341906e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.7150476530194283, + "num_tokens": 333547996.0, + "step": 10185 + }, + { + "epoch": 0.8450120242142798, + "grad_norm": 0.916329026222229, + "learning_rate": 7.148138175037427e-06, + "loss": 1.133, + "mean_token_accuracy": 0.7057490259408951, + "num_tokens": 333711836.0, + "step": 10190 + }, + { + "epoch": 0.8454266522928933, + "grad_norm": 0.8952322006225586, + "learning_rate": 7.110895692297825e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.7132270276546478, + "num_tokens": 333875676.0, + "step": 10195 + }, + { + "epoch": 0.8458412803715067, + "grad_norm": 0.9101585149765015, + "learning_rate": 7.0737430531381984e-06, + "loss": 1.1821, + "mean_token_accuracy": 0.6984176456928253, + "num_tokens": 334039516.0, + "step": 10200 + }, + { + "epoch": 0.8462559084501202, + "grad_norm": 0.9506310820579529, + "learning_rate": 7.036680335385426e-06, + "loss": 1.1783, + "mean_token_accuracy": 0.6965603157877922, + "num_tokens": 334202508.0, + "step": 10205 + }, + { + "epoch": 0.8466705365287337, + "grad_norm": 0.9565568566322327, + "learning_rate": 6.999707616678064e-06, + "loss": 1.1938, + "mean_token_accuracy": 0.693558556586504, + "num_tokens": 334366062.0, + "step": 10210 + }, + { + "epoch": 0.8470851646073472, + "grad_norm": 0.9096312522888184, + "learning_rate": 6.962824974466131e-06, + "loss": 1.0899, + "mean_token_accuracy": 0.7146627560257912, + "num_tokens": 334529902.0, + "step": 10215 + }, + { + "epoch": 0.8474997926859607, + "grad_norm": 0.9432851672172546, + "learning_rate": 6.926032486010909e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.707447449862957, + "num_tokens": 334693742.0, + "step": 10220 + }, + { + "epoch": 0.8479144207645741, + "grad_norm": 0.9380450248718262, + "learning_rate": 6.889330228384872e-06, + "loss": 1.1292, + "mean_token_accuracy": 0.7084066480398178, + "num_tokens": 334857582.0, + "step": 10225 + }, + { + "epoch": 0.8483290488431876, + "grad_norm": 0.9398831725120544, + "learning_rate": 6.8527182784714925e-06, + "loss": 1.1497, + "mean_token_accuracy": 0.7017961874604225, + "num_tokens": 335021422.0, + "step": 10230 + }, + { + "epoch": 0.8487436769218012, + "grad_norm": 0.9589568376541138, + "learning_rate": 6.816196712965012e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.7131842628121376, + "num_tokens": 335185262.0, + "step": 10235 + }, + { + "epoch": 0.8491583050004147, + "grad_norm": 0.9200323820114136, + "learning_rate": 6.779765608370381e-06, + "loss": 1.1777, + "mean_token_accuracy": 0.6959616333246231, + "num_tokens": 335349102.0, + "step": 10240 + }, + { + "epoch": 0.8495729330790281, + "grad_norm": 0.8970324993133545, + "learning_rate": 6.743425041003032e-06, + "loss": 1.1821, + "mean_token_accuracy": 0.6969391494989395, + "num_tokens": 335512942.0, + "step": 10245 + }, + { + "epoch": 0.8499875611576416, + "grad_norm": 0.9432082176208496, + "learning_rate": 6.70717508698876e-06, + "loss": 1.1291, + "mean_token_accuracy": 0.7087915413081646, + "num_tokens": 335676782.0, + "step": 10250 + }, + { + "epoch": 0.8504021892362551, + "grad_norm": 0.9453873038291931, + "learning_rate": 6.6710158222635214e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.71732037961483, + "num_tokens": 335840622.0, + "step": 10255 + }, + { + "epoch": 0.8508168173148686, + "grad_norm": 0.9753464460372925, + "learning_rate": 6.634947322573315e-06, + "loss": 1.1827, + "mean_token_accuracy": 0.6988941878080368, + "num_tokens": 336004462.0, + "step": 10260 + }, + { + "epoch": 0.8512314453934821, + "grad_norm": 0.8873828649520874, + "learning_rate": 6.5989696634739975e-06, + "loss": 1.1724, + "mean_token_accuracy": 0.6980716556310653, + "num_tokens": 336167735.0, + "step": 10265 + }, + { + "epoch": 0.8516460734720955, + "grad_norm": 0.843457818031311, + "learning_rate": 6.563082920331143e-06, + "loss": 1.061, + "mean_token_accuracy": 0.7177908137440682, + "num_tokens": 336331575.0, + "step": 10270 + }, + { + "epoch": 0.852060701550709, + "grad_norm": 0.9610382318496704, + "learning_rate": 6.527287168319857e-06, + "loss": 1.1429, + "mean_token_accuracy": 0.7072542741894722, + "num_tokens": 336494655.0, + "step": 10275 + }, + { + "epoch": 0.8524753296293225, + "grad_norm": 0.9118860960006714, + "learning_rate": 6.491582482424663e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.7077346026897431, + "num_tokens": 336658495.0, + "step": 10280 + }, + { + "epoch": 0.852889957707936, + "grad_norm": 0.9611846208572388, + "learning_rate": 6.455968937439299e-06, + "loss": 1.1314, + "mean_token_accuracy": 0.7065094336867332, + "num_tokens": 336821488.0, + "step": 10285 + }, + { + "epoch": 0.8533045857865494, + "grad_norm": 0.9499552845954895, + "learning_rate": 6.4204466079666016e-06, + "loss": 1.1761, + "mean_token_accuracy": 0.7020955517888069, + "num_tokens": 336985328.0, + "step": 10290 + }, + { + "epoch": 0.8537192138651629, + "grad_norm": 0.8805335164070129, + "learning_rate": 6.385015568418307e-06, + "loss": 1.1255, + "mean_token_accuracy": 0.7115713566541672, + "num_tokens": 337149168.0, + "step": 10295 + }, + { + "epoch": 0.8541338419437764, + "grad_norm": 0.9237547516822815, + "learning_rate": 6.349675893014933e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.6932795748114586, + "num_tokens": 337313008.0, + "step": 10300 + }, + { + "epoch": 0.8545484700223899, + "grad_norm": 0.9319496750831604, + "learning_rate": 6.3144276557856155e-06, + "loss": 1.1413, + "mean_token_accuracy": 0.7031341657042504, + "num_tokens": 337476848.0, + "step": 10305 + }, + { + "epoch": 0.8549630981010033, + "grad_norm": 0.9150850176811218, + "learning_rate": 6.279270930567943e-06, + "loss": 1.1264, + "mean_token_accuracy": 0.7067143246531487, + "num_tokens": 337640688.0, + "step": 10310 + }, + { + "epoch": 0.8553777261796169, + "grad_norm": 0.9051622152328491, + "learning_rate": 6.244205791007768e-06, + "loss": 1.1661, + "mean_token_accuracy": 0.699865597486496, + "num_tokens": 337804528.0, + "step": 10315 + }, + { + "epoch": 0.8557923542582304, + "grad_norm": 0.8876291513442993, + "learning_rate": 6.209232310559149e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.7114613935351372, + "num_tokens": 337968368.0, + "step": 10320 + }, + { + "epoch": 0.8562069823368439, + "grad_norm": 0.9366981983184814, + "learning_rate": 6.1743505624841155e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.719440370798111, + "num_tokens": 338132208.0, + "step": 10325 + }, + { + "epoch": 0.8566216104154574, + "grad_norm": 0.9815709590911865, + "learning_rate": 6.139560619852524e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.7128482386469841, + "num_tokens": 338296048.0, + "step": 10330 + }, + { + "epoch": 0.8570362384940708, + "grad_norm": 0.9242997765541077, + "learning_rate": 6.104862555541935e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.7165017127990723, + "num_tokens": 338459888.0, + "step": 10335 + }, + { + "epoch": 0.8574508665726843, + "grad_norm": 0.9416835308074951, + "learning_rate": 6.070256442237426e-06, + "loss": 1.1271, + "mean_token_accuracy": 0.7058345556259156, + "num_tokens": 338623728.0, + "step": 10340 + }, + { + "epoch": 0.8578654946512978, + "grad_norm": 0.9095317125320435, + "learning_rate": 6.0357423524314896e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.712644311785698, + "num_tokens": 338786808.0, + "step": 10345 + }, + { + "epoch": 0.8582801227299113, + "grad_norm": 0.9168542623519897, + "learning_rate": 6.001320358423784e-06, + "loss": 1.1875, + "mean_token_accuracy": 0.6952815085649491, + "num_tokens": 338950140.0, + "step": 10350 + }, + { + "epoch": 0.8586947508085248, + "grad_norm": 0.9308287501335144, + "learning_rate": 5.966990532321126e-06, + "loss": 1.155, + "mean_token_accuracy": 0.6985923990607261, + "num_tokens": 339113721.0, + "step": 10355 + }, + { + "epoch": 0.8591093788871382, + "grad_norm": 0.8683541417121887, + "learning_rate": 5.932752946037223e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.7197947204113007, + "num_tokens": 339277561.0, + "step": 10360 + }, + { + "epoch": 0.8595240069657517, + "grad_norm": 0.9227983951568604, + "learning_rate": 5.898607671292533e-06, + "loss": 1.2245, + "mean_token_accuracy": 0.68935117572546, + "num_tokens": 339441401.0, + "step": 10365 + }, + { + "epoch": 0.8599386350443652, + "grad_norm": 0.8903299570083618, + "learning_rate": 5.864554779614173e-06, + "loss": 1.1656, + "mean_token_accuracy": 0.7003726795315742, + "num_tokens": 339605241.0, + "step": 10370 + }, + { + "epoch": 0.8603532631229787, + "grad_norm": 0.9057297110557556, + "learning_rate": 5.83059434233576e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.7124938845634461, + "num_tokens": 339769081.0, + "step": 10375 + }, + { + "epoch": 0.8607678912015921, + "grad_norm": 0.9160807728767395, + "learning_rate": 5.796726430597177e-06, + "loss": 1.1551, + "mean_token_accuracy": 0.7034396409988404, + "num_tokens": 339932921.0, + "step": 10380 + }, + { + "epoch": 0.8611825192802056, + "grad_norm": 0.9203912615776062, + "learning_rate": 5.762951115344517e-06, + "loss": 1.1418, + "mean_token_accuracy": 0.7056757092475892, + "num_tokens": 340096761.0, + "step": 10385 + }, + { + "epoch": 0.8615971473588191, + "grad_norm": 0.9417601823806763, + "learning_rate": 5.72926846732994e-06, + "loss": 1.2017, + "mean_token_accuracy": 0.6915078178048134, + "num_tokens": 340260601.0, + "step": 10390 + }, + { + "epoch": 0.8620117754374326, + "grad_norm": 0.9596326351165771, + "learning_rate": 5.695678557111417e-06, + "loss": 1.2143, + "mean_token_accuracy": 0.6903286874294281, + "num_tokens": 340424441.0, + "step": 10395 + }, + { + "epoch": 0.8624264035160462, + "grad_norm": 0.9278862476348877, + "learning_rate": 5.6621814550526955e-06, + "loss": 1.1223, + "mean_token_accuracy": 0.7098545983433724, + "num_tokens": 340588281.0, + "step": 10400 + }, + { + "epoch": 0.8628410315946596, + "grad_norm": 0.9150340557098389, + "learning_rate": 5.628777231323101e-06, + "loss": 1.1154, + "mean_token_accuracy": 0.7060789301991462, + "num_tokens": 340752121.0, + "step": 10405 + }, + { + "epoch": 0.8632556596732731, + "grad_norm": 0.9144976139068604, + "learning_rate": 5.5954659558974275e-06, + "loss": 1.1716, + "mean_token_accuracy": 0.6999450191855431, + "num_tokens": 340915961.0, + "step": 10410 + }, + { + "epoch": 0.8636702877518866, + "grad_norm": 0.960284948348999, + "learning_rate": 5.562247698555695e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.7087146058678627, + "num_tokens": 341079573.0, + "step": 10415 + }, + { + "epoch": 0.8640849158305001, + "grad_norm": 0.8988045454025269, + "learning_rate": 5.52912252888313e-06, + "loss": 1.1377, + "mean_token_accuracy": 0.7064760476350784, + "num_tokens": 341243413.0, + "step": 10420 + }, + { + "epoch": 0.8644995439091135, + "grad_norm": 0.905921459197998, + "learning_rate": 5.496090516269936e-06, + "loss": 1.1608, + "mean_token_accuracy": 0.7029386609792709, + "num_tokens": 341407253.0, + "step": 10425 + }, + { + "epoch": 0.864914171987727, + "grad_norm": 0.901429295539856, + "learning_rate": 5.4631517299111755e-06, + "loss": 1.2206, + "mean_token_accuracy": 0.691397850215435, + "num_tokens": 341571093.0, + "step": 10430 + }, + { + "epoch": 0.8653288000663405, + "grad_norm": 0.8878471255302429, + "learning_rate": 5.430306238806626e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7252299129962921, + "num_tokens": 341734914.0, + "step": 10435 + }, + { + "epoch": 0.865743428144954, + "grad_norm": 0.9485891461372375, + "learning_rate": 5.397554111760617e-06, + "loss": 1.1296, + "mean_token_accuracy": 0.7043988257646561, + "num_tokens": 341898754.0, + "step": 10440 + }, + { + "epoch": 0.8661580562235675, + "grad_norm": 0.8997884392738342, + "learning_rate": 5.364895417381921e-06, + "loss": 1.1871, + "mean_token_accuracy": 0.6963770732283592, + "num_tokens": 342062594.0, + "step": 10445 + }, + { + "epoch": 0.8665726843021809, + "grad_norm": 0.9660028219223022, + "learning_rate": 5.33233022408357e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.7105938419699669, + "num_tokens": 342226434.0, + "step": 10450 + }, + { + "epoch": 0.8669873123807944, + "grad_norm": 0.9538041353225708, + "learning_rate": 5.299858600082752e-06, + "loss": 1.1144, + "mean_token_accuracy": 0.7105266362428665, + "num_tokens": 342390274.0, + "step": 10455 + }, + { + "epoch": 0.8674019404594079, + "grad_norm": 0.8947070240974426, + "learning_rate": 5.267480613400616e-06, + "loss": 1.1334, + "mean_token_accuracy": 0.7077346041798591, + "num_tokens": 342554114.0, + "step": 10460 + }, + { + "epoch": 0.8678165685380214, + "grad_norm": 0.8957281112670898, + "learning_rate": 5.235196331862196e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.7214931592345237, + "num_tokens": 342717954.0, + "step": 10465 + }, + { + "epoch": 0.8682311966166348, + "grad_norm": 0.920192301273346, + "learning_rate": 5.203005823096207e-06, + "loss": 1.1912, + "mean_token_accuracy": 0.6949596747756004, + "num_tokens": 342881794.0, + "step": 10470 + }, + { + "epoch": 0.8686458246952483, + "grad_norm": 0.934984564781189, + "learning_rate": 5.170909154534942e-06, + "loss": 1.1365, + "mean_token_accuracy": 0.7070564493536949, + "num_tokens": 343045634.0, + "step": 10475 + }, + { + "epoch": 0.8690604527738618, + "grad_norm": 0.8826227784156799, + "learning_rate": 5.138906393414123e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.7142350882291794, + "num_tokens": 343209474.0, + "step": 10480 + }, + { + "epoch": 0.8694750808524754, + "grad_norm": 0.9156180024147034, + "learning_rate": 5.106997606772734e-06, + "loss": 1.1517, + "mean_token_accuracy": 0.7029875323176384, + "num_tokens": 343373314.0, + "step": 10485 + }, + { + "epoch": 0.8698897089310889, + "grad_norm": 0.9311761260032654, + "learning_rate": 5.075182861452943e-06, + "loss": 1.1354, + "mean_token_accuracy": 0.7048264876008034, + "num_tokens": 343537154.0, + "step": 10490 + }, + { + "epoch": 0.8703043370097023, + "grad_norm": 0.9276612401008606, + "learning_rate": 5.0434622240998595e-06, + "loss": 1.1191, + "mean_token_accuracy": 0.7078445747494697, + "num_tokens": 343700994.0, + "step": 10495 + }, + { + "epoch": 0.8707189650883158, + "grad_norm": 0.9433877468109131, + "learning_rate": 5.011835761161521e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.7077651530504226, + "num_tokens": 343864834.0, + "step": 10500 + }, + { + "epoch": 0.8711335931669293, + "grad_norm": 0.9424065351486206, + "learning_rate": 4.980303538888664e-06, + "loss": 1.1321, + "mean_token_accuracy": 0.7079362139105797, + "num_tokens": 344028674.0, + "step": 10505 + }, + { + "epoch": 0.8715482212455428, + "grad_norm": 0.9527302384376526, + "learning_rate": 4.948865623334581e-06, + "loss": 1.1602, + "mean_token_accuracy": 0.7004765406250953, + "num_tokens": 344192514.0, + "step": 10510 + }, + { + "epoch": 0.8719628493241562, + "grad_norm": 0.9701392650604248, + "learning_rate": 4.917522080355064e-06, + "loss": 1.1372, + "mean_token_accuracy": 0.7040017083287239, + "num_tokens": 344356354.0, + "step": 10515 + }, + { + "epoch": 0.8723774774027697, + "grad_norm": 0.9226775169372559, + "learning_rate": 4.886272975608197e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.7104777619242668, + "num_tokens": 344520194.0, + "step": 10520 + }, + { + "epoch": 0.8727921054813832, + "grad_norm": 0.9319620728492737, + "learning_rate": 4.855118374554202e-06, + "loss": 1.1621, + "mean_token_accuracy": 0.7017656415700912, + "num_tokens": 344684034.0, + "step": 10525 + }, + { + "epoch": 0.8732067335599967, + "grad_norm": 0.9370831251144409, + "learning_rate": 4.8240583424553674e-06, + "loss": 1.1468, + "mean_token_accuracy": 0.7070992231369019, + "num_tokens": 344847874.0, + "step": 10530 + }, + { + "epoch": 0.8736213616386101, + "grad_norm": 0.9271456599235535, + "learning_rate": 4.7930929443758935e-06, + "loss": 1.1367, + "mean_token_accuracy": 0.7005070865154266, + "num_tokens": 345011714.0, + "step": 10535 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.8970178365707397, + "learning_rate": 4.762222245181719e-06, + "loss": 1.1148, + "mean_token_accuracy": 0.7075757578015327, + "num_tokens": 345175554.0, + "step": 10540 + }, + { + "epoch": 0.8744506177958371, + "grad_norm": 0.9386332035064697, + "learning_rate": 4.7314463095404e-06, + "loss": 1.1563, + "mean_token_accuracy": 0.699101909995079, + "num_tokens": 345339394.0, + "step": 10545 + }, + { + "epoch": 0.8748652458744506, + "grad_norm": 0.9848224520683289, + "learning_rate": 4.700765201920998e-06, + "loss": 1.2436, + "mean_token_accuracy": 0.685410051047802, + "num_tokens": 345502571.0, + "step": 10550 + }, + { + "epoch": 0.8752798739530641, + "grad_norm": 0.929392397403717, + "learning_rate": 4.670178986593948e-06, + "loss": 1.1694, + "mean_token_accuracy": 0.700164957344532, + "num_tokens": 345666411.0, + "step": 10555 + }, + { + "epoch": 0.8756945020316775, + "grad_norm": 0.9089635610580444, + "learning_rate": 4.639687727630865e-06, + "loss": 1.1217, + "mean_token_accuracy": 0.7107526905834675, + "num_tokens": 345830251.0, + "step": 10560 + }, + { + "epoch": 0.8761091301102911, + "grad_norm": 0.9183827638626099, + "learning_rate": 4.609291488904472e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.7111681342124939, + "num_tokens": 345994091.0, + "step": 10565 + }, + { + "epoch": 0.8765237581889046, + "grad_norm": 0.9086915850639343, + "learning_rate": 4.578990334088468e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.7092069864273072, + "num_tokens": 346157931.0, + "step": 10570 + }, + { + "epoch": 0.8769383862675181, + "grad_norm": 0.8890146613121033, + "learning_rate": 4.5487843266573235e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.7138990744948387, + "num_tokens": 346321771.0, + "step": 10575 + }, + { + "epoch": 0.8773530143461316, + "grad_norm": 0.9215315580368042, + "learning_rate": 4.518673529886231e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.7202284976840019, + "num_tokens": 346485611.0, + "step": 10580 + }, + { + "epoch": 0.877767642424745, + "grad_norm": 0.9251281023025513, + "learning_rate": 4.488658006850915e-06, + "loss": 1.1543, + "mean_token_accuracy": 0.7058259814977645, + "num_tokens": 346649398.0, + "step": 10585 + }, + { + "epoch": 0.8781822705033585, + "grad_norm": 0.8954192996025085, + "learning_rate": 4.458737820427545e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.7065554738044739, + "num_tokens": 346813238.0, + "step": 10590 + }, + { + "epoch": 0.878596898581972, + "grad_norm": 0.9286209940910339, + "learning_rate": 4.428913033292559e-06, + "loss": 1.1571, + "mean_token_accuracy": 0.6992729738354683, + "num_tokens": 346977078.0, + "step": 10595 + }, + { + "epoch": 0.8790115266605855, + "grad_norm": 0.9251374006271362, + "learning_rate": 4.399183707922566e-06, + "loss": 1.1796, + "mean_token_accuracy": 0.6961632415652275, + "num_tokens": 347140918.0, + "step": 10600 + }, + { + "epoch": 0.8794261547391989, + "grad_norm": 0.9257077574729919, + "learning_rate": 4.369549906594195e-06, + "loss": 1.1771, + "mean_token_accuracy": 0.6977761521935463, + "num_tokens": 347304758.0, + "step": 10605 + }, + { + "epoch": 0.8798407828178124, + "grad_norm": 0.9449937343597412, + "learning_rate": 4.340011691383983e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.7064149603247643, + "num_tokens": 347468598.0, + "step": 10610 + }, + { + "epoch": 0.8802554108964259, + "grad_norm": 0.9059110283851624, + "learning_rate": 4.310569124168229e-06, + "loss": 1.164, + "mean_token_accuracy": 0.7002016142010689, + "num_tokens": 347632438.0, + "step": 10615 + }, + { + "epoch": 0.8806700389750394, + "grad_norm": 0.9088716506958008, + "learning_rate": 4.281222266622864e-06, + "loss": 1.0961, + "mean_token_accuracy": 0.7130865097045899, + "num_tokens": 347796278.0, + "step": 10620 + }, + { + "epoch": 0.8810846670536528, + "grad_norm": 0.8918694257736206, + "learning_rate": 4.25197118022333e-06, + "loss": 1.1256, + "mean_token_accuracy": 0.7093108505010605, + "num_tokens": 347960118.0, + "step": 10625 + }, + { + "epoch": 0.8814992951322663, + "grad_norm": 0.925315797328949, + "learning_rate": 4.222815926244455e-06, + "loss": 1.2178, + "mean_token_accuracy": 0.690927417576313, + "num_tokens": 348123958.0, + "step": 10630 + }, + { + "epoch": 0.8819139232108798, + "grad_norm": 0.8862684965133667, + "learning_rate": 4.193756565760315e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.711528591811657, + "num_tokens": 348287798.0, + "step": 10635 + }, + { + "epoch": 0.8823285512894933, + "grad_norm": 0.9163962602615356, + "learning_rate": 4.164793159644109e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.7229288846254349, + "num_tokens": 348451638.0, + "step": 10640 + }, + { + "epoch": 0.8827431793681068, + "grad_norm": 0.9359307289123535, + "learning_rate": 4.135925768568028e-06, + "loss": 1.1612, + "mean_token_accuracy": 0.7009225338697433, + "num_tokens": 348615478.0, + "step": 10645 + }, + { + "epoch": 0.8831578074467203, + "grad_norm": 0.9241249561309814, + "learning_rate": 4.107154453003148e-06, + "loss": 1.1954, + "mean_token_accuracy": 0.6945578485727311, + "num_tokens": 348778327.0, + "step": 10650 + }, + { + "epoch": 0.8835724355253338, + "grad_norm": 0.875142514705658, + "learning_rate": 4.078479273219249e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.7108382225036621, + "num_tokens": 348942167.0, + "step": 10655 + }, + { + "epoch": 0.8839870636039473, + "grad_norm": 0.9249883890151978, + "learning_rate": 4.049900289284781e-06, + "loss": 1.1509, + "mean_token_accuracy": 0.7056634902954102, + "num_tokens": 349106007.0, + "step": 10660 + }, + { + "epoch": 0.8844016916825608, + "grad_norm": 0.9233696460723877, + "learning_rate": 4.021417561066649e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.7163734093308449, + "num_tokens": 349269847.0, + "step": 10665 + }, + { + "epoch": 0.8848163197611743, + "grad_norm": 0.8704918026924133, + "learning_rate": 3.993031148230114e-06, + "loss": 1.1255, + "mean_token_accuracy": 0.7117546945810318, + "num_tokens": 349433611.0, + "step": 10670 + }, + { + "epoch": 0.8852309478397877, + "grad_norm": 0.9347735643386841, + "learning_rate": 3.964741110238695e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.7121028825640678, + "num_tokens": 349597451.0, + "step": 10675 + }, + { + "epoch": 0.8856455759184012, + "grad_norm": 0.9115481972694397, + "learning_rate": 3.936547506354038e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.7080034166574478, + "num_tokens": 349761291.0, + "step": 10680 + }, + { + "epoch": 0.8860602039970147, + "grad_norm": 0.92775958776474, + "learning_rate": 3.908450395635771e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.7106427147984504, + "num_tokens": 349925131.0, + "step": 10685 + }, + { + "epoch": 0.8864748320756282, + "grad_norm": 0.8857362270355225, + "learning_rate": 3.880449836941352e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.7175005540251732, + "num_tokens": 350088720.0, + "step": 10690 + }, + { + "epoch": 0.8868894601542416, + "grad_norm": 0.9340742230415344, + "learning_rate": 3.8525458889260454e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.712231183052063, + "num_tokens": 350252560.0, + "step": 10695 + }, + { + "epoch": 0.8873040882328551, + "grad_norm": 0.9503607153892517, + "learning_rate": 3.8247386100427e-06, + "loss": 1.1441, + "mean_token_accuracy": 0.7080706223845482, + "num_tokens": 350416400.0, + "step": 10700 + }, + { + "epoch": 0.8877187163114686, + "grad_norm": 0.9132021069526672, + "learning_rate": 3.7970280585416574e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.7119199618697166, + "num_tokens": 350579808.0, + "step": 10705 + }, + { + "epoch": 0.8881333443900821, + "grad_norm": 0.9462399482727051, + "learning_rate": 3.7694142924706467e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.7143477022647857, + "num_tokens": 350743000.0, + "step": 10710 + }, + { + "epoch": 0.8885479724686955, + "grad_norm": 0.9100886583328247, + "learning_rate": 3.741897369674674e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.7157502487301827, + "num_tokens": 350906840.0, + "step": 10715 + }, + { + "epoch": 0.888962600547309, + "grad_norm": 0.9081605672836304, + "learning_rate": 3.714477347795836e-06, + "loss": 1.1059, + "mean_token_accuracy": 0.7109359741210938, + "num_tokens": 351070680.0, + "step": 10720 + }, + { + "epoch": 0.8893772286259225, + "grad_norm": 0.883823037147522, + "learning_rate": 3.6871542842732755e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.7192143186926842, + "num_tokens": 351234520.0, + "step": 10725 + }, + { + "epoch": 0.889791856704536, + "grad_norm": 0.9276925325393677, + "learning_rate": 3.659928236343013e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.7205584034323692, + "num_tokens": 351398360.0, + "step": 10730 + }, + { + "epoch": 0.8902064847831496, + "grad_norm": 0.9058579802513123, + "learning_rate": 3.6327992610378505e-06, + "loss": 1.139, + "mean_token_accuracy": 0.704841522872448, + "num_tokens": 351561184.0, + "step": 10735 + }, + { + "epoch": 0.890621112861763, + "grad_norm": 0.9753690361976624, + "learning_rate": 3.6057674151872336e-06, + "loss": 1.1236, + "mean_token_accuracy": 0.70912756472826, + "num_tokens": 351725024.0, + "step": 10740 + }, + { + "epoch": 0.8910357409403765, + "grad_norm": 0.9092490673065186, + "learning_rate": 3.578832755417155e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.7153225794434548, + "num_tokens": 351888864.0, + "step": 10745 + }, + { + "epoch": 0.89145036901899, + "grad_norm": 0.8844450116157532, + "learning_rate": 3.5519953381500157e-06, + "loss": 1.1252, + "mean_token_accuracy": 0.7079545482993126, + "num_tokens": 352052704.0, + "step": 10750 + }, + { + "epoch": 0.8918649970976035, + "grad_norm": 0.8999067544937134, + "learning_rate": 3.5252552196045065e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.7133003413677216, + "num_tokens": 352216544.0, + "step": 10755 + }, + { + "epoch": 0.892279625176217, + "grad_norm": 0.9216794967651367, + "learning_rate": 3.4986124557955137e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.7114186227321625, + "num_tokens": 352380384.0, + "step": 10760 + }, + { + "epoch": 0.8926942532548304, + "grad_norm": 0.9328907132148743, + "learning_rate": 3.472067102533977e-06, + "loss": 1.2421, + "mean_token_accuracy": 0.6862288989126683, + "num_tokens": 352543574.0, + "step": 10765 + }, + { + "epoch": 0.8931088813334439, + "grad_norm": 0.9407025575637817, + "learning_rate": 3.445619215426782e-06, + "loss": 1.1621, + "mean_token_accuracy": 0.6999144695699215, + "num_tokens": 352707414.0, + "step": 10770 + }, + { + "epoch": 0.8935235094120574, + "grad_norm": 0.9398549199104309, + "learning_rate": 3.4192688498766444e-06, + "loss": 1.1392, + "mean_token_accuracy": 0.7009042024612426, + "num_tokens": 352871254.0, + "step": 10775 + }, + { + "epoch": 0.8939381374906709, + "grad_norm": 0.9020984768867493, + "learning_rate": 3.3930160610819937e-06, + "loss": 1.196, + "mean_token_accuracy": 0.6935483887791634, + "num_tokens": 353035094.0, + "step": 10780 + }, + { + "epoch": 0.8943527655692843, + "grad_norm": 0.8914738893508911, + "learning_rate": 3.366860904036856e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.7212732180953025, + "num_tokens": 353198934.0, + "step": 10785 + }, + { + "epoch": 0.8947673936478978, + "grad_norm": 0.9360032677650452, + "learning_rate": 3.340803433530737e-06, + "loss": 1.1005, + "mean_token_accuracy": 0.7113025352358818, + "num_tokens": 353362774.0, + "step": 10790 + }, + { + "epoch": 0.8951820217265113, + "grad_norm": 0.9081243872642517, + "learning_rate": 3.3148437041485236e-06, + "loss": 1.1593, + "mean_token_accuracy": 0.7005865097045898, + "num_tokens": 353526614.0, + "step": 10795 + }, + { + "epoch": 0.8955966498051248, + "grad_norm": 0.9637793302536011, + "learning_rate": 3.288981770270333e-06, + "loss": 1.1613, + "mean_token_accuracy": 0.7011241421103478, + "num_tokens": 353690454.0, + "step": 10800 + }, + { + "epoch": 0.8960112778837382, + "grad_norm": 0.9164398908615112, + "learning_rate": 3.263217686071435e-06, + "loss": 1.1734, + "mean_token_accuracy": 0.6994990259408951, + "num_tokens": 353854294.0, + "step": 10805 + }, + { + "epoch": 0.8964259059623517, + "grad_norm": 0.8988476395606995, + "learning_rate": 3.237551505522135e-06, + "loss": 1.1431, + "mean_token_accuracy": 0.7030303031206131, + "num_tokens": 354018134.0, + "step": 10810 + }, + { + "epoch": 0.8968405340409653, + "grad_norm": 0.8833062648773193, + "learning_rate": 3.211983282387615e-06, + "loss": 1.1183, + "mean_token_accuracy": 0.7093902751803398, + "num_tokens": 354181974.0, + "step": 10815 + }, + { + "epoch": 0.8972551621195788, + "grad_norm": 0.9248128533363342, + "learning_rate": 3.1865130702278977e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.7110092908143997, + "num_tokens": 354345814.0, + "step": 10820 + }, + { + "epoch": 0.8976697901981923, + "grad_norm": 0.9079005122184753, + "learning_rate": 3.1611409223976817e-06, + "loss": 1.1326, + "mean_token_accuracy": 0.7085654929280281, + "num_tokens": 354509654.0, + "step": 10825 + }, + { + "epoch": 0.8980844182768057, + "grad_norm": 0.8842004537582397, + "learning_rate": 3.135866892046241e-06, + "loss": 1.1492, + "mean_token_accuracy": 0.7079117774963379, + "num_tokens": 354673494.0, + "step": 10830 + }, + { + "epoch": 0.8984990463554192, + "grad_norm": 0.8912933468818665, + "learning_rate": 3.11069103211728e-06, + "loss": 1.1562, + "mean_token_accuracy": 0.7026881724596024, + "num_tokens": 354837334.0, + "step": 10835 + }, + { + "epoch": 0.8989136744340327, + "grad_norm": 0.9348641633987427, + "learning_rate": 3.0856133953489184e-06, + "loss": 1.233, + "mean_token_accuracy": 0.6882453575730324, + "num_tokens": 355001174.0, + "step": 10840 + }, + { + "epoch": 0.8993283025126462, + "grad_norm": 0.9215148687362671, + "learning_rate": 3.0606340342734853e-06, + "loss": 1.1124, + "mean_token_accuracy": 0.7089076235890388, + "num_tokens": 355165014.0, + "step": 10845 + }, + { + "epoch": 0.8997429305912596, + "grad_norm": 0.9471371173858643, + "learning_rate": 3.035753001217423e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.7102561622858048, + "num_tokens": 355327852.0, + "step": 10850 + }, + { + "epoch": 0.9001575586698731, + "grad_norm": 0.9406538009643555, + "learning_rate": 3.0109703483012452e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.7098362594842911, + "num_tokens": 355491692.0, + "step": 10855 + }, + { + "epoch": 0.9005721867484866, + "grad_norm": 0.8713740110397339, + "learning_rate": 2.9862861274393474e-06, + "loss": 1.1124, + "mean_token_accuracy": 0.7118707180023194, + "num_tokens": 355655532.0, + "step": 10860 + }, + { + "epoch": 0.9009868148271001, + "grad_norm": 0.9907150268554688, + "learning_rate": 2.9617003903399333e-06, + "loss": 1.1423, + "mean_token_accuracy": 0.704325507581234, + "num_tokens": 355819372.0, + "step": 10865 + }, + { + "epoch": 0.9014014429057136, + "grad_norm": 0.9606465101242065, + "learning_rate": 2.9372131885049058e-06, + "loss": 1.1436, + "mean_token_accuracy": 0.7018633931875229, + "num_tokens": 355983212.0, + "step": 10870 + }, + { + "epoch": 0.901816070984327, + "grad_norm": 0.9174510836601257, + "learning_rate": 2.912824573229783e-06, + "loss": 1.1764, + "mean_token_accuracy": 0.7021016642451287, + "num_tokens": 356147052.0, + "step": 10875 + }, + { + "epoch": 0.9022306990629405, + "grad_norm": 0.9237547516822815, + "learning_rate": 2.8885345956035205e-06, + "loss": 1.1415, + "mean_token_accuracy": 0.7037756577134132, + "num_tokens": 356310892.0, + "step": 10880 + }, + { + "epoch": 0.902645327141554, + "grad_norm": 0.929188072681427, + "learning_rate": 2.8643433065084824e-06, + "loss": 1.1255, + "mean_token_accuracy": 0.7043377324938774, + "num_tokens": 356474732.0, + "step": 10885 + }, + { + "epoch": 0.9030599552201675, + "grad_norm": 0.9025934338569641, + "learning_rate": 2.840250756620272e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.704802057147026, + "num_tokens": 356638572.0, + "step": 10890 + }, + { + "epoch": 0.9034745832987809, + "grad_norm": 0.920540988445282, + "learning_rate": 2.816256996407707e-06, + "loss": 1.1484, + "mean_token_accuracy": 0.702421247959137, + "num_tokens": 356801251.0, + "step": 10895 + }, + { + "epoch": 0.9038892113773945, + "grad_norm": 0.935437798500061, + "learning_rate": 2.7923620761325986e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.7161219537258148, + "num_tokens": 356964438.0, + "step": 10900 + }, + { + "epoch": 0.904303839456008, + "grad_norm": 0.9342902898788452, + "learning_rate": 2.768566045849752e-06, + "loss": 1.1382, + "mean_token_accuracy": 0.7036717966198921, + "num_tokens": 357128278.0, + "step": 10905 + }, + { + "epoch": 0.9047184675346215, + "grad_norm": 0.979550302028656, + "learning_rate": 2.7448689554067985e-06, + "loss": 1.2227, + "mean_token_accuracy": 0.6927358254790306, + "num_tokens": 357292118.0, + "step": 10910 + }, + { + "epoch": 0.905133095613235, + "grad_norm": 0.9403480291366577, + "learning_rate": 2.7212708544441244e-06, + "loss": 1.1697, + "mean_token_accuracy": 0.7031769335269928, + "num_tokens": 357455958.0, + "step": 10915 + }, + { + "epoch": 0.9055477236918484, + "grad_norm": 0.8986895084381104, + "learning_rate": 2.697771792394743e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.7077284932136536, + "num_tokens": 357619798.0, + "step": 10920 + }, + { + "epoch": 0.9059623517704619, + "grad_norm": 0.9211082458496094, + "learning_rate": 2.6743718184842058e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.7168255105614663, + "num_tokens": 357783638.0, + "step": 10925 + }, + { + "epoch": 0.9063769798490754, + "grad_norm": 0.8845340609550476, + "learning_rate": 2.6510709817305024e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.7054383754730225, + "num_tokens": 357946818.0, + "step": 10930 + }, + { + "epoch": 0.9067916079276889, + "grad_norm": 0.9751585125923157, + "learning_rate": 2.627869330943944e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.7069648057222366, + "num_tokens": 358110658.0, + "step": 10935 + }, + { + "epoch": 0.9072062360063023, + "grad_norm": 0.9455736875534058, + "learning_rate": 2.6047669147270635e-06, + "loss": 1.1558, + "mean_token_accuracy": 0.7025293216109276, + "num_tokens": 358274498.0, + "step": 10940 + }, + { + "epoch": 0.9076208640849158, + "grad_norm": 0.9326270222663879, + "learning_rate": 2.581763781474533e-06, + "loss": 1.1423, + "mean_token_accuracy": 0.7030913949012756, + "num_tokens": 358438338.0, + "step": 10945 + }, + { + "epoch": 0.9080354921635293, + "grad_norm": 0.9129369258880615, + "learning_rate": 2.5588599793730405e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.7141312330961227, + "num_tokens": 358602178.0, + "step": 10950 + }, + { + "epoch": 0.9084501202421428, + "grad_norm": 0.9351323246955872, + "learning_rate": 2.5360555564011903e-06, + "loss": 1.1645, + "mean_token_accuracy": 0.7004582151770592, + "num_tokens": 358766018.0, + "step": 10955 + }, + { + "epoch": 0.9088647483207563, + "grad_norm": 0.8951199054718018, + "learning_rate": 2.513350560329403e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.7205950617790222, + "num_tokens": 358929858.0, + "step": 10960 + }, + { + "epoch": 0.9092793763993697, + "grad_norm": 0.8859832882881165, + "learning_rate": 2.4907450387198495e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.7139895841479301, + "num_tokens": 359092807.0, + "step": 10965 + }, + { + "epoch": 0.9096940044779832, + "grad_norm": 0.9251822829246521, + "learning_rate": 2.4682390389262956e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.7172715052962303, + "num_tokens": 359256647.0, + "step": 10970 + }, + { + "epoch": 0.9101086325565967, + "grad_norm": 0.9385135173797607, + "learning_rate": 2.4458326080940398e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.7187072306871414, + "num_tokens": 359420487.0, + "step": 10975 + }, + { + "epoch": 0.9105232606352102, + "grad_norm": 0.9257782101631165, + "learning_rate": 2.423525793159809e-06, + "loss": 1.1778, + "mean_token_accuracy": 0.6981915965676307, + "num_tokens": 359584327.0, + "step": 10980 + }, + { + "epoch": 0.9109378887138238, + "grad_norm": 0.9558985233306885, + "learning_rate": 2.401318640851641e-06, + "loss": 1.2147, + "mean_token_accuracy": 0.6936339169740677, + "num_tokens": 359748167.0, + "step": 10985 + }, + { + "epoch": 0.9113525167924372, + "grad_norm": 0.9049248099327087, + "learning_rate": 2.37921119768883e-06, + "loss": 1.1101, + "mean_token_accuracy": 0.7097690656781197, + "num_tokens": 359912007.0, + "step": 10990 + }, + { + "epoch": 0.9117671448710507, + "grad_norm": 0.9582362174987793, + "learning_rate": 2.3572035099817535e-06, + "loss": 1.06, + "mean_token_accuracy": 0.720039102435112, + "num_tokens": 360075847.0, + "step": 10995 + }, + { + "epoch": 0.9121817729496642, + "grad_norm": 0.9491180181503296, + "learning_rate": 2.335295623831868e-06, + "loss": 1.1178, + "mean_token_accuracy": 0.7037878766655922, + "num_tokens": 360239687.0, + "step": 11000 + }, + { + "epoch": 0.9125964010282777, + "grad_norm": 0.9045689702033997, + "learning_rate": 2.313487585131563e-06, + "loss": 1.1507, + "mean_token_accuracy": 0.7044049352407455, + "num_tokens": 360403527.0, + "step": 11005 + }, + { + "epoch": 0.9130110291068911, + "grad_norm": 0.8995153307914734, + "learning_rate": 2.291779439564029e-06, + "loss": 1.1369, + "mean_token_accuracy": 0.7027065053582191, + "num_tokens": 360567367.0, + "step": 11010 + }, + { + "epoch": 0.9134256571855046, + "grad_norm": 0.9227888584136963, + "learning_rate": 2.270171232603241e-06, + "loss": 1.066, + "mean_token_accuracy": 0.7193304002285004, + "num_tokens": 360731207.0, + "step": 11015 + }, + { + "epoch": 0.9138402852641181, + "grad_norm": 0.9289051294326782, + "learning_rate": 2.2486630095138184e-06, + "loss": 1.1494, + "mean_token_accuracy": 0.7053235292434692, + "num_tokens": 360893382.0, + "step": 11020 + }, + { + "epoch": 0.9142549133427316, + "grad_norm": 0.9562925100326538, + "learning_rate": 2.2272548153509155e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.694709187746048, + "num_tokens": 361057222.0, + "step": 11025 + }, + { + "epoch": 0.914669541421345, + "grad_norm": 0.9172911643981934, + "learning_rate": 2.2059466949601594e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.7094469025731087, + "num_tokens": 361220686.0, + "step": 11030 + }, + { + "epoch": 0.9150841694999585, + "grad_norm": 0.9196988940238953, + "learning_rate": 2.184738692977556e-06, + "loss": 1.1029, + "mean_token_accuracy": 0.7084433034062385, + "num_tokens": 361384526.0, + "step": 11035 + }, + { + "epoch": 0.915498797578572, + "grad_norm": 0.959909975528717, + "learning_rate": 2.1636308538293794e-06, + "loss": 1.1153, + "mean_token_accuracy": 0.7096346527338028, + "num_tokens": 361548366.0, + "step": 11040 + }, + { + "epoch": 0.9159134256571855, + "grad_norm": 0.9608067870140076, + "learning_rate": 2.142623221732054e-06, + "loss": 1.1618, + "mean_token_accuracy": 0.698104539513588, + "num_tokens": 361711105.0, + "step": 11045 + }, + { + "epoch": 0.916328053735799, + "grad_norm": 0.8869704604148865, + "learning_rate": 2.1217158406921176e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.7176208943128586, + "num_tokens": 361874425.0, + "step": 11050 + }, + { + "epoch": 0.9167426818144124, + "grad_norm": 0.9388926029205322, + "learning_rate": 2.1009087545061258e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.7119195967912674, + "num_tokens": 362038265.0, + "step": 11055 + }, + { + "epoch": 0.9171573098930259, + "grad_norm": 0.9204244613647461, + "learning_rate": 2.0802020067604843e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.7133064493536949, + "num_tokens": 362202105.0, + "step": 11060 + }, + { + "epoch": 0.9175719379716394, + "grad_norm": 0.9380967617034912, + "learning_rate": 2.059595640831452e-06, + "loss": 1.1215, + "mean_token_accuracy": 0.7061278134584427, + "num_tokens": 362365945.0, + "step": 11065 + }, + { + "epoch": 0.917986566050253, + "grad_norm": 0.8923094868659973, + "learning_rate": 2.0390896998849996e-06, + "loss": 1.1248, + "mean_token_accuracy": 0.7078488484025002, + "num_tokens": 362529202.0, + "step": 11070 + }, + { + "epoch": 0.9184011941288664, + "grad_norm": 0.9815731644630432, + "learning_rate": 2.018684226876716e-06, + "loss": 1.1595, + "mean_token_accuracy": 0.6996273174881935, + "num_tokens": 362693042.0, + "step": 11075 + }, + { + "epoch": 0.9188158222074799, + "grad_norm": 0.9461018443107605, + "learning_rate": 1.9983792645517475e-06, + "loss": 1.1783, + "mean_token_accuracy": 0.6971441462635994, + "num_tokens": 362856493.0, + "step": 11080 + }, + { + "epoch": 0.9192304502860934, + "grad_norm": 0.9883350133895874, + "learning_rate": 1.9781748554446867e-06, + "loss": 1.1202, + "mean_token_accuracy": 0.7085654959082603, + "num_tokens": 363020333.0, + "step": 11085 + }, + { + "epoch": 0.9196450783647069, + "grad_norm": 0.933786928653717, + "learning_rate": 1.958071041879478e-06, + "loss": 1.2037, + "mean_token_accuracy": 0.6949841171503067, + "num_tokens": 363184173.0, + "step": 11090 + }, + { + "epoch": 0.9200597064433204, + "grad_norm": 0.8661555647850037, + "learning_rate": 1.9380678659693563e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.7145405665040017, + "num_tokens": 363348013.0, + "step": 11095 + }, + { + "epoch": 0.9204743345219338, + "grad_norm": 0.8983901143074036, + "learning_rate": 1.9181653696167312e-06, + "loss": 1.1112, + "mean_token_accuracy": 0.7085105136036873, + "num_tokens": 363511853.0, + "step": 11100 + }, + { + "epoch": 0.9208889626005473, + "grad_norm": 0.9139328598976135, + "learning_rate": 1.898363594513114e-06, + "loss": 1.0769, + "mean_token_accuracy": 0.7180901765823364, + "num_tokens": 363675693.0, + "step": 11105 + }, + { + "epoch": 0.9213035906791608, + "grad_norm": 0.9308121800422668, + "learning_rate": 1.8786625821390236e-06, + "loss": 1.1448, + "mean_token_accuracy": 0.7064027398824692, + "num_tokens": 363839533.0, + "step": 11110 + }, + { + "epoch": 0.9217182187577743, + "grad_norm": 0.8937799334526062, + "learning_rate": 1.8590623737639035e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.7109054252505302, + "num_tokens": 364003373.0, + "step": 11115 + }, + { + "epoch": 0.9221328468363877, + "grad_norm": 0.8960309028625488, + "learning_rate": 1.8395630104460327e-06, + "loss": 1.1596, + "mean_token_accuracy": 0.7020527794957161, + "num_tokens": 364167213.0, + "step": 11120 + }, + { + "epoch": 0.9225474749150012, + "grad_norm": 0.954889714717865, + "learning_rate": 1.8201645330324479e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.7125427678227425, + "num_tokens": 364331053.0, + "step": 11125 + }, + { + "epoch": 0.9229621029936147, + "grad_norm": 0.9314802885055542, + "learning_rate": 1.8008669821588497e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.7106121718883515, + "num_tokens": 364494893.0, + "step": 11130 + }, + { + "epoch": 0.9233767310722282, + "grad_norm": 0.9300594329833984, + "learning_rate": 1.7816703982495075e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.7181207224726677, + "num_tokens": 364658733.0, + "step": 11135 + }, + { + "epoch": 0.9237913591508417, + "grad_norm": 0.8461613059043884, + "learning_rate": 1.7625748215171878e-06, + "loss": 1.012, + "mean_token_accuracy": 0.7287390038371087, + "num_tokens": 364822573.0, + "step": 11140 + }, + { + "epoch": 0.9242059872294551, + "grad_norm": 0.9067208766937256, + "learning_rate": 1.7435802919630929e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.7154997497797012, + "num_tokens": 364986413.0, + "step": 11145 + }, + { + "epoch": 0.9246206153080687, + "grad_norm": 0.9463154673576355, + "learning_rate": 1.7246868493767277e-06, + "loss": 1.1697, + "mean_token_accuracy": 0.6987170085310936, + "num_tokens": 365150253.0, + "step": 11150 + }, + { + "epoch": 0.9250352433866822, + "grad_norm": 0.8903513550758362, + "learning_rate": 1.7058945333358388e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.7249511256814003, + "num_tokens": 365314093.0, + "step": 11155 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.9678393006324768, + "learning_rate": 1.6872033832063538e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.7143267408013344, + "num_tokens": 365477933.0, + "step": 11160 + }, + { + "epoch": 0.9258644995439091, + "grad_norm": 0.8709613680839539, + "learning_rate": 1.6686134381422802e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.7199596792459488, + "num_tokens": 365641773.0, + "step": 11165 + }, + { + "epoch": 0.9262791276225226, + "grad_norm": 0.9399612545967102, + "learning_rate": 1.6501247370855844e-06, + "loss": 1.122, + "mean_token_accuracy": 0.7127504914999008, + "num_tokens": 365805613.0, + "step": 11170 + }, + { + "epoch": 0.9266937557011361, + "grad_norm": 0.9037859439849854, + "learning_rate": 1.631737318766191e-06, + "loss": 1.1935, + "mean_token_accuracy": 0.6953629046678543, + "num_tokens": 365969453.0, + "step": 11175 + }, + { + "epoch": 0.9271083837797496, + "grad_norm": 0.925135612487793, + "learning_rate": 1.613451221701845e-06, + "loss": 1.1299, + "mean_token_accuracy": 0.7003421306610107, + "num_tokens": 366133293.0, + "step": 11180 + }, + { + "epoch": 0.9275230118583631, + "grad_norm": 0.9519877433776855, + "learning_rate": 1.5952664841980437e-06, + "loss": 1.2125, + "mean_token_accuracy": 0.6950879722833634, + "num_tokens": 366297133.0, + "step": 11185 + }, + { + "epoch": 0.9279376399369765, + "grad_norm": 0.9566942453384399, + "learning_rate": 1.5771831443479435e-06, + "loss": 1.1731, + "mean_token_accuracy": 0.6997311800718308, + "num_tokens": 366460973.0, + "step": 11190 + }, + { + "epoch": 0.92835226801559, + "grad_norm": 0.9131047129631042, + "learning_rate": 1.5592012400323152e-06, + "loss": 1.1506, + "mean_token_accuracy": 0.7048509269952774, + "num_tokens": 366624813.0, + "step": 11195 + }, + { + "epoch": 0.9287668960942035, + "grad_norm": 0.8717803359031677, + "learning_rate": 1.5413208089194387e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.7235703766345978, + "num_tokens": 366788653.0, + "step": 11200 + }, + { + "epoch": 0.929181524172817, + "grad_norm": 0.9159350991249084, + "learning_rate": 1.5235418884650243e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.7212060108780861, + "num_tokens": 366952493.0, + "step": 11205 + }, + { + "epoch": 0.9295961522514304, + "grad_norm": 0.9394529461860657, + "learning_rate": 1.5058645159121365e-06, + "loss": 1.097, + "mean_token_accuracy": 0.7103250250220299, + "num_tokens": 367116333.0, + "step": 11210 + }, + { + "epoch": 0.9300107803300439, + "grad_norm": 0.8994800448417664, + "learning_rate": 1.4882887282911318e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7267896369099617, + "num_tokens": 367279606.0, + "step": 11215 + }, + { + "epoch": 0.9304254084086574, + "grad_norm": 0.9277415871620178, + "learning_rate": 1.470814562419548e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.7113575249910354, + "num_tokens": 367443446.0, + "step": 11220 + }, + { + "epoch": 0.9308400364872709, + "grad_norm": 0.9330523610115051, + "learning_rate": 1.4534420549020655e-06, + "loss": 1.1591, + "mean_token_accuracy": 0.7000794216990471, + "num_tokens": 367607286.0, + "step": 11225 + }, + { + "epoch": 0.9312546645658843, + "grad_norm": 0.928530216217041, + "learning_rate": 1.4361712421303963e-06, + "loss": 1.1629, + "mean_token_accuracy": 0.7005498617887497, + "num_tokens": 367771126.0, + "step": 11230 + }, + { + "epoch": 0.9316692926444979, + "grad_norm": 0.8880189061164856, + "learning_rate": 1.419002160283245e-06, + "loss": 1.1284, + "mean_token_accuracy": 0.7053824573755264, + "num_tokens": 367934966.0, + "step": 11235 + }, + { + "epoch": 0.9320839207231114, + "grad_norm": 0.8915965557098389, + "learning_rate": 1.4019348453261805e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.7101661771535873, + "num_tokens": 368098806.0, + "step": 11240 + }, + { + "epoch": 0.9324985488017249, + "grad_norm": 0.936050295829773, + "learning_rate": 1.384969333011621e-06, + "loss": 1.1442, + "mean_token_accuracy": 0.7046293050050736, + "num_tokens": 368262527.0, + "step": 11245 + }, + { + "epoch": 0.9329131768803384, + "grad_norm": 0.8871857523918152, + "learning_rate": 1.3681056588787156e-06, + "loss": 1.1522, + "mean_token_accuracy": 0.7038062065839767, + "num_tokens": 368426367.0, + "step": 11250 + }, + { + "epoch": 0.9333278049589518, + "grad_norm": 0.9392543435096741, + "learning_rate": 1.3513438582532844e-06, + "loss": 1.151, + "mean_token_accuracy": 0.7028775662183762, + "num_tokens": 368590207.0, + "step": 11255 + }, + { + "epoch": 0.9337424330375653, + "grad_norm": 0.9720740914344788, + "learning_rate": 1.3346839662477406e-06, + "loss": 1.2145, + "mean_token_accuracy": 0.6947580620646476, + "num_tokens": 368754047.0, + "step": 11260 + }, + { + "epoch": 0.9341570611161788, + "grad_norm": 0.942632257938385, + "learning_rate": 1.31812601776104e-06, + "loss": 1.1088, + "mean_token_accuracy": 0.7081132680177689, + "num_tokens": 368917863.0, + "step": 11265 + }, + { + "epoch": 0.9345716891947923, + "grad_norm": 0.940656304359436, + "learning_rate": 1.3016700474785593e-06, + "loss": 1.1047, + "mean_token_accuracy": 0.7101172998547554, + "num_tokens": 369081703.0, + "step": 11270 + }, + { + "epoch": 0.9349863172734058, + "grad_norm": 0.9205859303474426, + "learning_rate": 1.285316089872074e-06, + "loss": 1.051, + "mean_token_accuracy": 0.7213526412844657, + "num_tokens": 369245543.0, + "step": 11275 + }, + { + "epoch": 0.9354009453520192, + "grad_norm": 0.9171337485313416, + "learning_rate": 1.2690641791996582e-06, + "loss": 1.0978, + "mean_token_accuracy": 0.7125855311751366, + "num_tokens": 369409383.0, + "step": 11280 + }, + { + "epoch": 0.9358155734306327, + "grad_norm": 1.0124074220657349, + "learning_rate": 1.2529143495056183e-06, + "loss": 1.1558, + "mean_token_accuracy": 0.702278833091259, + "num_tokens": 369573223.0, + "step": 11285 + }, + { + "epoch": 0.9362302015092462, + "grad_norm": 0.9083593487739563, + "learning_rate": 1.2368666346204206e-06, + "loss": 1.154, + "mean_token_accuracy": 0.7014479473233223, + "num_tokens": 369737063.0, + "step": 11290 + }, + { + "epoch": 0.9366448295878597, + "grad_norm": 0.9585278630256653, + "learning_rate": 1.2209210681606299e-06, + "loss": 1.1594, + "mean_token_accuracy": 0.7005681827664375, + "num_tokens": 369900903.0, + "step": 11295 + }, + { + "epoch": 0.9370594576664731, + "grad_norm": 0.9256246089935303, + "learning_rate": 1.2050776835288213e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.7068426176905632, + "num_tokens": 370064743.0, + "step": 11300 + }, + { + "epoch": 0.9374740857450866, + "grad_norm": 0.9040131568908691, + "learning_rate": 1.1893365139135303e-06, + "loss": 1.1276, + "mean_token_accuracy": 0.707337486743927, + "num_tokens": 370228583.0, + "step": 11305 + }, + { + "epoch": 0.9378887138237001, + "grad_norm": 0.9167429208755493, + "learning_rate": 1.1736975922891745e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.7120051324367523, + "num_tokens": 370392423.0, + "step": 11310 + }, + { + "epoch": 0.9383033419023136, + "grad_norm": 0.9664468169212341, + "learning_rate": 1.1581609514159653e-06, + "loss": 1.1443, + "mean_token_accuracy": 0.706650273501873, + "num_tokens": 370555558.0, + "step": 11315 + }, + { + "epoch": 0.9387179699809272, + "grad_norm": 0.9230952858924866, + "learning_rate": 1.1427266238398793e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.7179007872939109, + "num_tokens": 370719398.0, + "step": 11320 + }, + { + "epoch": 0.9391325980595406, + "grad_norm": 0.8887954354286194, + "learning_rate": 1.1273946418925651e-06, + "loss": 1.0938, + "mean_token_accuracy": 0.7143939375877381, + "num_tokens": 370883238.0, + "step": 11325 + }, + { + "epoch": 0.9395472261381541, + "grad_norm": 0.9098343253135681, + "learning_rate": 1.1121650376912706e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.7132759064435958, + "num_tokens": 371047078.0, + "step": 11330 + }, + { + "epoch": 0.9399618542167676, + "grad_norm": 0.9233249425888062, + "learning_rate": 1.0970378431387817e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.7183834314346313, + "num_tokens": 371210918.0, + "step": 11335 + }, + { + "epoch": 0.9403764822953811, + "grad_norm": 0.9147783517837524, + "learning_rate": 1.082013089923367e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.7187866598367691, + "num_tokens": 371374758.0, + "step": 11340 + }, + { + "epoch": 0.9407911103739945, + "grad_norm": 0.9527497887611389, + "learning_rate": 1.0670908095187115e-06, + "loss": 1.1868, + "mean_token_accuracy": 0.6949596792459488, + "num_tokens": 371538598.0, + "step": 11345 + }, + { + "epoch": 0.941205738452608, + "grad_norm": 0.926953911781311, + "learning_rate": 1.0522710331838048e-06, + "loss": 1.067, + "mean_token_accuracy": 0.7175158828496933, + "num_tokens": 371702438.0, + "step": 11350 + }, + { + "epoch": 0.9416203665312215, + "grad_norm": 0.9394188523292542, + "learning_rate": 1.037553791962953e-06, + "loss": 1.1346, + "mean_token_accuracy": 0.7028897821903228, + "num_tokens": 371866278.0, + "step": 11355 + }, + { + "epoch": 0.942034994609835, + "grad_norm": 0.9399611353874207, + "learning_rate": 1.022939116685656e-06, + "loss": 1.1925, + "mean_token_accuracy": 0.697794483602047, + "num_tokens": 372030118.0, + "step": 11360 + }, + { + "epoch": 0.9424496226884485, + "grad_norm": 0.9226964712142944, + "learning_rate": 1.0084270379665473e-06, + "loss": 1.1273, + "mean_token_accuracy": 0.7038580939173699, + "num_tokens": 372193521.0, + "step": 11365 + }, + { + "epoch": 0.9428642507670619, + "grad_norm": 0.9384846091270447, + "learning_rate": 9.940175862053703e-07, + "loss": 1.1222, + "mean_token_accuracy": 0.7093412965536118, + "num_tokens": 372356909.0, + "step": 11370 + }, + { + "epoch": 0.9432788788456754, + "grad_norm": 0.9163985252380371, + "learning_rate": 9.797107915868574e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.7222774296998977, + "num_tokens": 372519837.0, + "step": 11375 + }, + { + "epoch": 0.9436935069242889, + "grad_norm": 0.9299659132957458, + "learning_rate": 9.655066840807193e-07, + "loss": 1.1404, + "mean_token_accuracy": 0.7039345040917396, + "num_tokens": 372683677.0, + "step": 11380 + }, + { + "epoch": 0.9441081350029024, + "grad_norm": 0.9414932131767273, + "learning_rate": 9.514052934415485e-07, + "loss": 1.194, + "mean_token_accuracy": 0.7013196542859077, + "num_tokens": 372847517.0, + "step": 11385 + }, + { + "epoch": 0.9445227630815158, + "grad_norm": 0.960216224193573, + "learning_rate": 9.374066492087608e-07, + "loss": 1.1444, + "mean_token_accuracy": 0.7023032769560814, + "num_tokens": 373011357.0, + "step": 11390 + }, + { + "epoch": 0.9449373911601293, + "grad_norm": 0.922818124294281, + "learning_rate": 9.235107807065657e-07, + "loss": 1.1822, + "mean_token_accuracy": 0.7004215568304062, + "num_tokens": 373175197.0, + "step": 11395 + }, + { + "epoch": 0.9453520192387429, + "grad_norm": 0.9108608365058899, + "learning_rate": 9.097177170438453e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.7077773675322533, + "num_tokens": 373339037.0, + "step": 11400 + }, + { + "epoch": 0.9457666473173564, + "grad_norm": 0.9480197429656982, + "learning_rate": 8.960274871141427e-07, + "loss": 1.1504, + "mean_token_accuracy": 0.704331623017788, + "num_tokens": 373502877.0, + "step": 11405 + }, + { + "epoch": 0.9461812753959699, + "grad_norm": 0.886017918586731, + "learning_rate": 8.824401195955956e-07, + "loss": 1.102, + "mean_token_accuracy": 0.7145711123943329, + "num_tokens": 373666717.0, + "step": 11410 + }, + { + "epoch": 0.9465959034745833, + "grad_norm": 0.9021741151809692, + "learning_rate": 8.689556429508583e-07, + "loss": 1.1941, + "mean_token_accuracy": 0.6929985351860524, + "num_tokens": 373830557.0, + "step": 11415 + }, + { + "epoch": 0.9470105315531968, + "grad_norm": 0.9444058537483215, + "learning_rate": 8.555740854270411e-07, + "loss": 1.1666, + "mean_token_accuracy": 0.7004154458642006, + "num_tokens": 373994397.0, + "step": 11420 + }, + { + "epoch": 0.9474251596318103, + "grad_norm": 0.9039308428764343, + "learning_rate": 8.422954750556766e-07, + "loss": 1.0472, + "mean_token_accuracy": 0.7199108004570007, + "num_tokens": 374158237.0, + "step": 11425 + }, + { + "epoch": 0.9478397877104238, + "grad_norm": 0.9307836294174194, + "learning_rate": 8.291198396526368e-07, + "loss": 1.1729, + "mean_token_accuracy": 0.6965970203280449, + "num_tokens": 374322077.0, + "step": 11430 + }, + { + "epoch": 0.9482544157890372, + "grad_norm": 0.9643678665161133, + "learning_rate": 8.16047206818088e-07, + "loss": 1.1457, + "mean_token_accuracy": 0.7040200352668762, + "num_tokens": 374485917.0, + "step": 11435 + }, + { + "epoch": 0.9486690438676507, + "grad_norm": 0.9190859794616699, + "learning_rate": 8.030776039364196e-07, + "loss": 1.0938, + "mean_token_accuracy": 0.7135202825069428, + "num_tokens": 374649757.0, + "step": 11440 + }, + { + "epoch": 0.9490836719462642, + "grad_norm": 0.9101454019546509, + "learning_rate": 7.90211058176199e-07, + "loss": 1.1289, + "mean_token_accuracy": 0.7079789832234382, + "num_tokens": 374813597.0, + "step": 11445 + }, + { + "epoch": 0.9494983000248777, + "grad_norm": 0.9434904456138611, + "learning_rate": 7.774475964901107e-07, + "loss": 1.1151, + "mean_token_accuracy": 0.7092314288020134, + "num_tokens": 374977437.0, + "step": 11450 + }, + { + "epoch": 0.9499129281034911, + "grad_norm": 0.9281089901924133, + "learning_rate": 7.647872456149119e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.7115163713693619, + "num_tokens": 375141277.0, + "step": 11455 + }, + { + "epoch": 0.9503275561821046, + "grad_norm": 0.9672658443450928, + "learning_rate": 7.522300320713382e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.7090237036347389, + "num_tokens": 375305117.0, + "step": 11460 + }, + { + "epoch": 0.9507421842607181, + "grad_norm": 0.9243296980857849, + "learning_rate": 7.397759821640981e-07, + "loss": 1.1414, + "mean_token_accuracy": 0.7054802104830742, + "num_tokens": 375468957.0, + "step": 11465 + }, + { + "epoch": 0.9511568123393316, + "grad_norm": 0.9196622371673584, + "learning_rate": 7.274251219817785e-07, + "loss": 1.1123, + "mean_token_accuracy": 0.7119990184903144, + "num_tokens": 375632797.0, + "step": 11470 + }, + { + "epoch": 0.9515714404179451, + "grad_norm": 0.8937596678733826, + "learning_rate": 7.151774773968278e-07, + "loss": 1.0793, + "mean_token_accuracy": 0.7141190156340599, + "num_tokens": 375796637.0, + "step": 11475 + }, + { + "epoch": 0.9519860684965585, + "grad_norm": 0.9182631969451904, + "learning_rate": 7.030330740654456e-07, + "loss": 1.1309, + "mean_token_accuracy": 0.7044477075338363, + "num_tokens": 375960477.0, + "step": 11480 + }, + { + "epoch": 0.9524006965751721, + "grad_norm": 0.9258006811141968, + "learning_rate": 6.909919374275987e-07, + "loss": 1.0804, + "mean_token_accuracy": 0.716819404065609, + "num_tokens": 376124317.0, + "step": 11485 + }, + { + "epoch": 0.9528153246537856, + "grad_norm": 0.9379932284355164, + "learning_rate": 6.7905409270691e-07, + "loss": 1.147, + "mean_token_accuracy": 0.7027187243103981, + "num_tokens": 376288157.0, + "step": 11490 + }, + { + "epoch": 0.9532299527323991, + "grad_norm": 0.9158267378807068, + "learning_rate": 6.672195649106205e-07, + "loss": 1.1284, + "mean_token_accuracy": 0.7096346557140351, + "num_tokens": 376451997.0, + "step": 11495 + }, + { + "epoch": 0.9536445808110126, + "grad_norm": 0.886914074420929, + "learning_rate": 6.554883788295718e-07, + "loss": 1.0926, + "mean_token_accuracy": 0.7122641801834106, + "num_tokens": 376615813.0, + "step": 11500 + }, + { + "epoch": 0.954059208889626, + "grad_norm": 0.9485747814178467, + "learning_rate": 6.438605590381119e-07, + "loss": 1.1302, + "mean_token_accuracy": 0.7081500500440597, + "num_tokens": 376779653.0, + "step": 11505 + }, + { + "epoch": 0.9544738369682395, + "grad_norm": 0.9509932398796082, + "learning_rate": 6.323361298940455e-07, + "loss": 1.0504, + "mean_token_accuracy": 0.7201918348670006, + "num_tokens": 376943493.0, + "step": 11510 + }, + { + "epoch": 0.954888465046853, + "grad_norm": 1.0838360786437988, + "learning_rate": 6.209151155386173e-07, + "loss": 1.0738, + "mean_token_accuracy": 0.7173374041914939, + "num_tokens": 377106821.0, + "step": 11515 + }, + { + "epoch": 0.9553030931254665, + "grad_norm": 0.9364581108093262, + "learning_rate": 6.095975398964337e-07, + "loss": 1.0641, + "mean_token_accuracy": 0.7187194496393203, + "num_tokens": 377270661.0, + "step": 11520 + }, + { + "epoch": 0.9557177212040799, + "grad_norm": 0.9208771586418152, + "learning_rate": 5.983834266754029e-07, + "loss": 1.1313, + "mean_token_accuracy": 0.7060850396752357, + "num_tokens": 377434501.0, + "step": 11525 + }, + { + "epoch": 0.9561323492826934, + "grad_norm": 0.9197729229927063, + "learning_rate": 5.872727993667282e-07, + "loss": 1.1365, + "mean_token_accuracy": 0.7076735079288483, + "num_tokens": 377598341.0, + "step": 11530 + }, + { + "epoch": 0.9565469773613069, + "grad_norm": 0.9293209910392761, + "learning_rate": 5.762656812448086e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.7135997027158737, + "num_tokens": 377762181.0, + "step": 11535 + }, + { + "epoch": 0.9569616054399204, + "grad_norm": 0.9614062309265137, + "learning_rate": 5.653620953672334e-07, + "loss": 1.1484, + "mean_token_accuracy": 0.7025354325771331, + "num_tokens": 377926021.0, + "step": 11540 + }, + { + "epoch": 0.9573762335185338, + "grad_norm": 0.8375124335289001, + "learning_rate": 5.545620645746985e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.7167216524481773, + "num_tokens": 378089861.0, + "step": 11545 + }, + { + "epoch": 0.9577908615971473, + "grad_norm": 0.9335710406303406, + "learning_rate": 5.438656114909679e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.7122006356716156, + "num_tokens": 378253701.0, + "step": 11550 + }, + { + "epoch": 0.9582054896757608, + "grad_norm": 0.9336086511611938, + "learning_rate": 5.332727585228569e-07, + "loss": 1.2236, + "mean_token_accuracy": 0.6912817701697349, + "num_tokens": 378417541.0, + "step": 11555 + }, + { + "epoch": 0.9586201177543743, + "grad_norm": 0.8826514482498169, + "learning_rate": 5.227835278601379e-07, + "loss": 1.1248, + "mean_token_accuracy": 0.7095991492271423, + "num_tokens": 378581044.0, + "step": 11560 + }, + { + "epoch": 0.9590347458329878, + "grad_norm": 0.940909743309021, + "learning_rate": 5.123979414755343e-07, + "loss": 1.081, + "mean_token_accuracy": 0.7155180796980858, + "num_tokens": 378744884.0, + "step": 11565 + }, + { + "epoch": 0.9594493739116013, + "grad_norm": 0.9472241401672363, + "learning_rate": 5.021160211246378e-07, + "loss": 1.168, + "mean_token_accuracy": 0.6959921777248382, + "num_tokens": 378908724.0, + "step": 11570 + }, + { + "epoch": 0.9598640019902148, + "grad_norm": 0.9369872808456421, + "learning_rate": 4.919377883458975e-07, + "loss": 1.1965, + "mean_token_accuracy": 0.6996044397354126, + "num_tokens": 379072325.0, + "step": 11575 + }, + { + "epoch": 0.9602786300688283, + "grad_norm": 0.9338463544845581, + "learning_rate": 4.81863264460547e-07, + "loss": 1.2003, + "mean_token_accuracy": 0.6928030341863632, + "num_tokens": 379236165.0, + "step": 11580 + }, + { + "epoch": 0.9606932581474418, + "grad_norm": 0.9180485606193542, + "learning_rate": 4.71892470572588e-07, + "loss": 1.2313, + "mean_token_accuracy": 0.6894122660160065, + "num_tokens": 379400005.0, + "step": 11585 + }, + { + "epoch": 0.9611078862260553, + "grad_norm": 0.9087678790092468, + "learning_rate": 4.620254275687075e-07, + "loss": 1.0501, + "mean_token_accuracy": 0.7211693525314331, + "num_tokens": 379563845.0, + "step": 11590 + }, + { + "epoch": 0.9615225143046687, + "grad_norm": 0.9397708177566528, + "learning_rate": 4.522621561182772e-07, + "loss": 1.1268, + "mean_token_accuracy": 0.7062866583466529, + "num_tokens": 379727685.0, + "step": 11595 + }, + { + "epoch": 0.9619371423832822, + "grad_norm": 0.9346681833267212, + "learning_rate": 4.426026766732816e-07, + "loss": 1.0802, + "mean_token_accuracy": 0.7151800289750099, + "num_tokens": 379890601.0, + "step": 11600 + }, + { + "epoch": 0.9623517704618957, + "grad_norm": 0.8871059417724609, + "learning_rate": 4.3304700946827373e-07, + "loss": 1.0478, + "mean_token_accuracy": 0.7215909063816071, + "num_tokens": 380054441.0, + "step": 11605 + }, + { + "epoch": 0.9627663985405092, + "grad_norm": 0.9635825157165527, + "learning_rate": 4.2359517452035815e-07, + "loss": 1.201, + "mean_token_accuracy": 0.6936033710837364, + "num_tokens": 380218281.0, + "step": 11610 + }, + { + "epoch": 0.9631810266191226, + "grad_norm": 0.9270527958869934, + "learning_rate": 4.1424719162912464e-07, + "loss": 1.2035, + "mean_token_accuracy": 0.6924303472042084, + "num_tokens": 380382121.0, + "step": 11615 + }, + { + "epoch": 0.9635956546977361, + "grad_norm": 0.9289128184318542, + "learning_rate": 4.0500308037660915e-07, + "loss": 1.1745, + "mean_token_accuracy": 0.6984359726309777, + "num_tokens": 380545961.0, + "step": 11620 + }, + { + "epoch": 0.9640102827763496, + "grad_norm": 0.9154837131500244, + "learning_rate": 3.958628601272663e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.7014968231320381, + "num_tokens": 380709801.0, + "step": 11625 + }, + { + "epoch": 0.9644249108549631, + "grad_norm": 0.9176326990127563, + "learning_rate": 3.8682655002792446e-07, + "loss": 1.098, + "mean_token_accuracy": 0.7111820951104164, + "num_tokens": 380872907.0, + "step": 11630 + }, + { + "epoch": 0.9648395389335765, + "grad_norm": 0.885033130645752, + "learning_rate": 3.7789416900773647e-07, + "loss": 1.1397, + "mean_token_accuracy": 0.7058101162314415, + "num_tokens": 381036747.0, + "step": 11635 + }, + { + "epoch": 0.96525416701219, + "grad_norm": 0.9226931929588318, + "learning_rate": 3.690657357781402e-07, + "loss": 1.2085, + "mean_token_accuracy": 0.6952407151460648, + "num_tokens": 381200587.0, + "step": 11640 + }, + { + "epoch": 0.9656687950908035, + "grad_norm": 0.9441694617271423, + "learning_rate": 3.603412688328367e-07, + "loss": 1.1136, + "mean_token_accuracy": 0.7085050046443939, + "num_tokens": 381364285.0, + "step": 11645 + }, + { + "epoch": 0.9660834231694171, + "grad_norm": 0.968190610408783, + "learning_rate": 3.517207864477401e-07, + "loss": 1.1527, + "mean_token_accuracy": 0.6979771569371224, + "num_tokens": 381527844.0, + "step": 11650 + }, + { + "epoch": 0.9664980512480306, + "grad_norm": 0.9461910724639893, + "learning_rate": 3.4320430668092206e-07, + "loss": 1.1371, + "mean_token_accuracy": 0.705865104496479, + "num_tokens": 381691684.0, + "step": 11655 + }, + { + "epoch": 0.966912679326644, + "grad_norm": 0.9138695597648621, + "learning_rate": 3.347918473726064e-07, + "loss": 1.0254, + "mean_token_accuracy": 0.726417401432991, + "num_tokens": 381855524.0, + "step": 11660 + }, + { + "epoch": 0.9673273074052575, + "grad_norm": 0.8957282900810242, + "learning_rate": 3.264834261451133e-07, + "loss": 1.1628, + "mean_token_accuracy": 0.7030608490109443, + "num_tokens": 382019364.0, + "step": 11665 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.9572055339813232, + "learning_rate": 3.182790604028263e-07, + "loss": 1.1871, + "mean_token_accuracy": 0.6945931106805802, + "num_tokens": 382183204.0, + "step": 11670 + }, + { + "epoch": 0.9681565635624845, + "grad_norm": 0.9100860357284546, + "learning_rate": 3.101787673321421e-07, + "loss": 1.2264, + "mean_token_accuracy": 0.6919979244470597, + "num_tokens": 382346008.0, + "step": 11675 + }, + { + "epoch": 0.968571191641098, + "grad_norm": 0.9562139511108398, + "learning_rate": 3.0218256390146525e-07, + "loss": 1.1402, + "mean_token_accuracy": 0.7035862639546394, + "num_tokens": 382509848.0, + "step": 11680 + }, + { + "epoch": 0.9689858197197114, + "grad_norm": 0.9326156973838806, + "learning_rate": 2.942904668611468e-07, + "loss": 1.0916, + "mean_token_accuracy": 0.7109237551689148, + "num_tokens": 382673688.0, + "step": 11685 + }, + { + "epoch": 0.9694004477983249, + "grad_norm": 0.9341801404953003, + "learning_rate": 2.865024927434512e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.7055073603987694, + "num_tokens": 382836954.0, + "step": 11690 + }, + { + "epoch": 0.9698150758769384, + "grad_norm": 0.907103955745697, + "learning_rate": 2.788186578625396e-07, + "loss": 1.0982, + "mean_token_accuracy": 0.7117118775844574, + "num_tokens": 383000794.0, + "step": 11695 + }, + { + "epoch": 0.9702297039555519, + "grad_norm": 0.8543350696563721, + "learning_rate": 2.7123897831441427e-07, + "loss": 1.1194, + "mean_token_accuracy": 0.7077773705124855, + "num_tokens": 383164634.0, + "step": 11700 + }, + { + "epoch": 0.9706443320341653, + "grad_norm": 0.9193892478942871, + "learning_rate": 2.637634699768965e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.7232038155198097, + "num_tokens": 383328474.0, + "step": 11705 + }, + { + "epoch": 0.9710589601127788, + "grad_norm": 0.9411940574645996, + "learning_rate": 2.563921485095877e-07, + "loss": 1.202, + "mean_token_accuracy": 0.6897971585392952, + "num_tokens": 383492314.0, + "step": 11710 + }, + { + "epoch": 0.9714735881913923, + "grad_norm": 0.9080677628517151, + "learning_rate": 2.491250293538472e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.7104960888624191, + "num_tokens": 383656154.0, + "step": 11715 + }, + { + "epoch": 0.9718882162700058, + "grad_norm": 0.9072685241699219, + "learning_rate": 2.4196212773274773e-07, + "loss": 1.1119, + "mean_token_accuracy": 0.7090437635779381, + "num_tokens": 383819428.0, + "step": 11720 + }, + { + "epoch": 0.9723028443486192, + "grad_norm": 0.8989502191543579, + "learning_rate": 2.3490345865105344e-07, + "loss": 1.1031, + "mean_token_accuracy": 0.7131559088826179, + "num_tokens": 383982803.0, + "step": 11725 + }, + { + "epoch": 0.9727174724272327, + "grad_norm": 0.9268701672554016, + "learning_rate": 2.2794903689517533e-07, + "loss": 1.1365, + "mean_token_accuracy": 0.7080437660217285, + "num_tokens": 384146211.0, + "step": 11730 + }, + { + "epoch": 0.9731321005058463, + "grad_norm": 0.9161537289619446, + "learning_rate": 2.2109887703315458e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.7007575735449791, + "num_tokens": 384310051.0, + "step": 11735 + }, + { + "epoch": 0.9735467285844598, + "grad_norm": 0.9547184705734253, + "learning_rate": 2.1435299341461822e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.7124229952692985, + "num_tokens": 384473824.0, + "step": 11740 + }, + { + "epoch": 0.9739613566630733, + "grad_norm": 0.9535466432571411, + "learning_rate": 2.0771140017076806e-07, + "loss": 1.2288, + "mean_token_accuracy": 0.6856060579419136, + "num_tokens": 384637664.0, + "step": 11745 + }, + { + "epoch": 0.9743759847416867, + "grad_norm": 0.9268850684165955, + "learning_rate": 2.0117411121433616e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.712927196919918, + "num_tokens": 384801452.0, + "step": 11750 + }, + { + "epoch": 0.9747906128203002, + "grad_norm": 0.9306238889694214, + "learning_rate": 1.9474114023954604e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.7088220864534378, + "num_tokens": 384965292.0, + "step": 11755 + }, + { + "epoch": 0.9752052408989137, + "grad_norm": 0.9298808574676514, + "learning_rate": 1.8841250072211824e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.7143695011734963, + "num_tokens": 385129132.0, + "step": 11760 + }, + { + "epoch": 0.9756198689775272, + "grad_norm": 0.9119917750358582, + "learning_rate": 1.8218820591920372e-07, + "loss": 1.1635, + "mean_token_accuracy": 0.7002871423959732, + "num_tokens": 385292972.0, + "step": 11765 + }, + { + "epoch": 0.9760344970561406, + "grad_norm": 0.9063863158226013, + "learning_rate": 1.7606826886938933e-07, + "loss": 1.0596, + "mean_token_accuracy": 0.7162512198090554, + "num_tokens": 385456812.0, + "step": 11770 + }, + { + "epoch": 0.9764491251347541, + "grad_norm": 0.9217805862426758, + "learning_rate": 1.7005270239263683e-07, + "loss": 1.1521, + "mean_token_accuracy": 0.704728738963604, + "num_tokens": 385620652.0, + "step": 11775 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.9595069289207458, + "learning_rate": 1.641415190902884e-07, + "loss": 1.1743, + "mean_token_accuracy": 0.6987774550914765, + "num_tokens": 385784488.0, + "step": 11780 + }, + { + "epoch": 0.9772783812919811, + "grad_norm": 0.9089632034301758, + "learning_rate": 1.583347313450112e-07, + "loss": 1.1317, + "mean_token_accuracy": 0.7037695482373237, + "num_tokens": 385948328.0, + "step": 11785 + }, + { + "epoch": 0.9776930093705946, + "grad_norm": 0.9190571308135986, + "learning_rate": 1.5263235132080279e-07, + "loss": 1.1105, + "mean_token_accuracy": 0.7096163287758828, + "num_tokens": 386112168.0, + "step": 11790 + }, + { + "epoch": 0.978107637449208, + "grad_norm": 0.899294376373291, + "learning_rate": 1.4703439096294126e-07, + "loss": 1.0591, + "mean_token_accuracy": 0.7189324885606766, + "num_tokens": 386274464.0, + "step": 11795 + }, + { + "epoch": 0.9785222655278215, + "grad_norm": 0.9493892788887024, + "learning_rate": 1.4154086199795747e-07, + "loss": 1.1429, + "mean_token_accuracy": 0.7037820160388947, + "num_tokens": 386437291.0, + "step": 11800 + }, + { + "epoch": 0.978936893606435, + "grad_norm": 0.9160469174385071, + "learning_rate": 1.361517759336406e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.7159518584609031, + "num_tokens": 386601131.0, + "step": 11805 + }, + { + "epoch": 0.9793515216850485, + "grad_norm": 0.9399734735488892, + "learning_rate": 1.3086714405897705e-07, + "loss": 1.1195, + "mean_token_accuracy": 0.7086204752326012, + "num_tokens": 386764971.0, + "step": 11810 + }, + { + "epoch": 0.9797661497636619, + "grad_norm": 0.9285438060760498, + "learning_rate": 1.256869774441505e-07, + "loss": 1.1302, + "mean_token_accuracy": 0.707080890238285, + "num_tokens": 386928811.0, + "step": 11815 + }, + { + "epoch": 0.9801807778422755, + "grad_norm": 0.9658012986183167, + "learning_rate": 1.2061128694050848e-07, + "loss": 1.221, + "mean_token_accuracy": 0.6887096747756004, + "num_tokens": 387092651.0, + "step": 11820 + }, + { + "epoch": 0.980595405920889, + "grad_norm": 0.9252815246582031, + "learning_rate": 1.1564008318055708e-07, + "loss": 1.1807, + "mean_token_accuracy": 0.7000767543911934, + "num_tokens": 387255540.0, + "step": 11825 + }, + { + "epoch": 0.9810100339995025, + "grad_norm": 0.9168857932090759, + "learning_rate": 1.107733765779051e-07, + "loss": 1.057, + "mean_token_accuracy": 0.717179861664772, + "num_tokens": 387419380.0, + "step": 11830 + }, + { + "epoch": 0.981424662078116, + "grad_norm": 0.9354410171508789, + "learning_rate": 1.0601117732727539e-07, + "loss": 1.1421, + "mean_token_accuracy": 0.7040261492133141, + "num_tokens": 387583220.0, + "step": 11835 + }, + { + "epoch": 0.9818392901567294, + "grad_norm": 0.9230213165283203, + "learning_rate": 1.0135349540446038e-07, + "loss": 1.0855, + "mean_token_accuracy": 0.7141923293471336, + "num_tokens": 387747060.0, + "step": 11840 + }, + { + "epoch": 0.9822539182353429, + "grad_norm": 0.9315711259841919, + "learning_rate": 9.680034056632203e-08, + "loss": 1.205, + "mean_token_accuracy": 0.6943181812763214, + "num_tokens": 387910900.0, + "step": 11845 + }, + { + "epoch": 0.9826685463139564, + "grad_norm": 0.882120668888092, + "learning_rate": 9.235172235074752e-08, + "loss": 1.1033, + "mean_token_accuracy": 0.7119501441717148, + "num_tokens": 388074740.0, + "step": 11850 + }, + { + "epoch": 0.9830831743925699, + "grad_norm": 0.8793638944625854, + "learning_rate": 8.800765007665469e-08, + "loss": 1.1175, + "mean_token_accuracy": 0.709995111823082, + "num_tokens": 388238580.0, + "step": 11855 + }, + { + "epoch": 0.9834978024711833, + "grad_norm": 0.9546449780464172, + "learning_rate": 8.376813284395324e-08, + "loss": 1.119, + "mean_token_accuracy": 0.7083150029182435, + "num_tokens": 388402420.0, + "step": 11860 + }, + { + "epoch": 0.9839124305497968, + "grad_norm": 0.9052698016166687, + "learning_rate": 7.963317953353366e-08, + "loss": 1.1325, + "mean_token_accuracy": 0.7061889037489891, + "num_tokens": 388566260.0, + "step": 11865 + }, + { + "epoch": 0.9843270586284103, + "grad_norm": 0.9273753762245178, + "learning_rate": 7.560279880723942e-08, + "loss": 1.1656, + "mean_token_accuracy": 0.7007209196686744, + "num_tokens": 388730100.0, + "step": 11870 + }, + { + "epoch": 0.9847416867070238, + "grad_norm": 0.9197583794593811, + "learning_rate": 7.167699910787251e-08, + "loss": 1.222, + "mean_token_accuracy": 0.6885202795267105, + "num_tokens": 388893940.0, + "step": 11875 + }, + { + "epoch": 0.9851563147856373, + "grad_norm": 0.9500606656074524, + "learning_rate": 6.78557886591491e-08, + "loss": 1.1255, + "mean_token_accuracy": 0.7084921777248383, + "num_tokens": 389057780.0, + "step": 11880 + }, + { + "epoch": 0.9855709428642507, + "grad_norm": 0.9366825819015503, + "learning_rate": 6.413917546569393e-08, + "loss": 1.1387, + "mean_token_accuracy": 0.7038248598575592, + "num_tokens": 389220815.0, + "step": 11885 + }, + { + "epoch": 0.9859855709428642, + "grad_norm": 0.9425362348556519, + "learning_rate": 6.052716731301811e-08, + "loss": 1.1231, + "mean_token_accuracy": 0.7050708711147309, + "num_tokens": 389384655.0, + "step": 11890 + }, + { + "epoch": 0.9864001990214777, + "grad_norm": 0.854095458984375, + "learning_rate": 5.701977176751916e-08, + "loss": 1.0811, + "mean_token_accuracy": 0.7167277589440346, + "num_tokens": 389548495.0, + "step": 11895 + }, + { + "epoch": 0.9868148271000912, + "grad_norm": 0.9569870233535767, + "learning_rate": 5.361699617644211e-08, + "loss": 1.076, + "mean_token_accuracy": 0.7142534211277962, + "num_tokens": 389712335.0, + "step": 11900 + }, + { + "epoch": 0.9872294551787048, + "grad_norm": 0.9416723251342773, + "learning_rate": 5.031884766789064e-08, + "loss": 1.1414, + "mean_token_accuracy": 0.7019489198923111, + "num_tokens": 389876175.0, + "step": 11905 + }, + { + "epoch": 0.9876440832573182, + "grad_norm": 0.894721269607544, + "learning_rate": 4.712533315077705e-08, + "loss": 1.1264, + "mean_token_accuracy": 0.703176936507225, + "num_tokens": 390040015.0, + "step": 11910 + }, + { + "epoch": 0.9880587113359317, + "grad_norm": 0.9371886253356934, + "learning_rate": 4.403645931483902e-08, + "loss": 1.1667, + "mean_token_accuracy": 0.6983622968196869, + "num_tokens": 390203804.0, + "step": 11915 + }, + { + "epoch": 0.9884733394145452, + "grad_norm": 0.9553115963935852, + "learning_rate": 4.105223263061175e-08, + "loss": 1.0813, + "mean_token_accuracy": 0.7137829884886742, + "num_tokens": 390367644.0, + "step": 11920 + }, + { + "epoch": 0.9888879674931587, + "grad_norm": 0.9798563718795776, + "learning_rate": 3.817265934941694e-08, + "loss": 1.1887, + "mean_token_accuracy": 0.6951429590582847, + "num_tokens": 390531484.0, + "step": 11925 + }, + { + "epoch": 0.9893025955717721, + "grad_norm": 0.91167151927948, + "learning_rate": 3.539774550335717e-08, + "loss": 1.1323, + "mean_token_accuracy": 0.7083638802170753, + "num_tokens": 390695324.0, + "step": 11930 + }, + { + "epoch": 0.9897172236503856, + "grad_norm": 0.9019158482551575, + "learning_rate": 3.2727496905282654e-08, + "loss": 1.2026, + "mean_token_accuracy": 0.695654584467411, + "num_tokens": 390859155.0, + "step": 11935 + }, + { + "epoch": 0.9901318517289991, + "grad_norm": 0.9292673468589783, + "learning_rate": 3.0161919148796736e-08, + "loss": 1.1376, + "mean_token_accuracy": 0.7033602118492126, + "num_tokens": 391022995.0, + "step": 11940 + }, + { + "epoch": 0.9905464798076126, + "grad_norm": 0.9041401743888855, + "learning_rate": 2.7701017608239288e-08, + "loss": 1.1124, + "mean_token_accuracy": 0.7084732592105866, + "num_tokens": 391186174.0, + "step": 11945 + }, + { + "epoch": 0.990961107886226, + "grad_norm": 0.9249791502952576, + "learning_rate": 2.5344797438686673e-08, + "loss": 1.1751, + "mean_token_accuracy": 0.7000488787889481, + "num_tokens": 391350014.0, + "step": 11950 + }, + { + "epoch": 0.9913757359648395, + "grad_norm": 0.9407515525817871, + "learning_rate": 2.3093263575912906e-08, + "loss": 1.1524, + "mean_token_accuracy": 0.7073802500963211, + "num_tokens": 391513854.0, + "step": 11955 + }, + { + "epoch": 0.991790364043453, + "grad_norm": 0.9295756220817566, + "learning_rate": 2.094642073640629e-08, + "loss": 1.1698, + "mean_token_accuracy": 0.6980205267667771, + "num_tokens": 391677694.0, + "step": 11960 + }, + { + "epoch": 0.9922049921220665, + "grad_norm": 0.8901556134223938, + "learning_rate": 1.890427341734724e-08, + "loss": 1.1572, + "mean_token_accuracy": 0.7028382167220115, + "num_tokens": 391840868.0, + "step": 11965 + }, + { + "epoch": 0.99261962020068, + "grad_norm": 0.9549967646598816, + "learning_rate": 1.696682589659715e-08, + "loss": 1.1822, + "mean_token_accuracy": 0.6990713581442833, + "num_tokens": 392004708.0, + "step": 11970 + }, + { + "epoch": 0.9930342482792934, + "grad_norm": 0.9046486020088196, + "learning_rate": 1.513408223270396e-08, + "loss": 1.0956, + "mean_token_accuracy": 0.7138257578015328, + "num_tokens": 392168548.0, + "step": 11975 + }, + { + "epoch": 0.9934488763579069, + "grad_norm": 0.9164446592330933, + "learning_rate": 1.3406046264874405e-08, + "loss": 1.156, + "mean_token_accuracy": 0.7049792245030403, + "num_tokens": 392332388.0, + "step": 11980 + }, + { + "epoch": 0.9938635044365205, + "grad_norm": 0.8822214007377625, + "learning_rate": 1.1782721612979553e-08, + "loss": 1.1395, + "mean_token_accuracy": 0.7028164654970169, + "num_tokens": 392496228.0, + "step": 11985 + }, + { + "epoch": 0.994278132515134, + "grad_norm": 0.9024549126625061, + "learning_rate": 1.0264111677538158e-08, + "loss": 1.0855, + "mean_token_accuracy": 0.7141373410820961, + "num_tokens": 392660068.0, + "step": 11990 + }, + { + "epoch": 0.9946927605937474, + "grad_norm": 0.9262228012084961, + "learning_rate": 8.850219639716662e-09, + "loss": 1.1692, + "mean_token_accuracy": 0.6999022468924523, + "num_tokens": 392823908.0, + "step": 11995 + }, + { + "epoch": 0.9951073886723609, + "grad_norm": 0.9171954393386841, + "learning_rate": 7.54104846131809e-09, + "loss": 1.0827, + "mean_token_accuracy": 0.7156158372759819, + "num_tokens": 392987748.0, + "step": 12000 + }, + { + "epoch": 0.9955220167509744, + "grad_norm": 0.9143432974815369, + "learning_rate": 6.3366008847820515e-09, + "loss": 1.0833, + "mean_token_accuracy": 0.7156586021184921, + "num_tokens": 393151588.0, + "step": 12005 + }, + { + "epoch": 0.9959366448295879, + "grad_norm": 0.858116865158081, + "learning_rate": 5.236879433162534e-09, + "loss": 1.0369, + "mean_token_accuracy": 0.7210471659898758, + "num_tokens": 393315428.0, + "step": 12010 + }, + { + "epoch": 0.9963512729082014, + "grad_norm": 0.9210796356201172, + "learning_rate": 4.2418864101501085e-09, + "loss": 1.1426, + "mean_token_accuracy": 0.7030425250530243, + "num_tokens": 393479268.0, + "step": 12015 + }, + { + "epoch": 0.9967659009868148, + "grad_norm": 0.9030014276504517, + "learning_rate": 3.351623900044176e-09, + "loss": 1.1529, + "mean_token_accuracy": 0.7069403648376464, + "num_tokens": 393643108.0, + "step": 12020 + }, + { + "epoch": 0.9971805290654283, + "grad_norm": 0.9101301431655884, + "learning_rate": 2.566093767758515e-09, + "loss": 1.1693, + "mean_token_accuracy": 0.7013722419738769, + "num_tokens": 393806152.0, + "step": 12025 + }, + { + "epoch": 0.9975951571440418, + "grad_norm": 0.9940186738967896, + "learning_rate": 1.88529765880463e-09, + "loss": 1.1615, + "mean_token_accuracy": 0.6981671527028084, + "num_tokens": 393969992.0, + "step": 12030 + }, + { + "epoch": 0.9980097852226553, + "grad_norm": 0.9276906251907349, + "learning_rate": 1.3092369993084052e-09, + "loss": 1.162, + "mean_token_accuracy": 0.7017839699983597, + "num_tokens": 394133832.0, + "step": 12035 + }, + { + "epoch": 0.9984244133012687, + "grad_norm": 0.903628945350647, + "learning_rate": 8.379129959934506e-10, + "loss": 1.1023, + "mean_token_accuracy": 0.7125794261693954, + "num_tokens": 394297672.0, + "step": 12040 + }, + { + "epoch": 0.9988390413798822, + "grad_norm": 0.9416557550430298, + "learning_rate": 4.713266361866531e-10, + "loss": 1.2072, + "mean_token_accuracy": 0.6941043511033058, + "num_tokens": 394461512.0, + "step": 12045 + }, + { + "epoch": 0.9992536694584957, + "grad_norm": 0.9321599006652832, + "learning_rate": 2.0947868781262537e-10, + "loss": 1.0976, + "mean_token_accuracy": 0.7115285903215408, + "num_tokens": 394625352.0, + "step": 12050 + }, + { + "epoch": 0.9996682975371092, + "grad_norm": 0.883375883102417, + "learning_rate": 5.236969937705283e-11, + "loss": 1.0519, + "mean_token_accuracy": 0.7237719938158989, + "num_tokens": 394789192.0, + "step": 12055 + }, + { + "epoch": 1.0, + "eval_loss": 1.127059817314148, + "eval_mean_token_accuracy": 0.707993041371355, + "eval_num_tokens": 394920264.0, + "eval_runtime": 843.2579, + "eval_samples_per_second": 23.955, + "eval_steps_per_second": 5.989, + "step": 12059 + }, + { + "epoch": 1.0, + "mean_token_accuracy": 0.7099141571670771, + "num_tokens": 394920264.0, + "step": 12059, + "total_flos": 1.0443024731807416e+18, + "train_loss": 1.2318119363747622, + "train_runtime": 64706.2591, + "train_samples_per_second": 5.964, + "train_steps_per_second": 0.186 + } + ], + "logging_steps": 5, + "max_steps": 12059, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0443024731807416e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}