{ "best_metric": 0.5537328124046326, "best_model_checkpoint": "gpt_light_model_unpaired/model_outputs/full_new_tokenizer_gpt2_light_seqs_unp_lr_5e-4_wd_0.1_bs_32_epochs_500_/checkpoint-6058816", "epoch": 41.0, "eval_steps": 500, "global_step": 6058816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033834993503681246, "grad_norm": 1.8880629539489746, "learning_rate": 4.9999661650064965e-05, "loss": 1.7484, "step": 500 }, { "epoch": 0.006766998700736249, "grad_norm": 1.0062955617904663, "learning_rate": 4.999932330012993e-05, "loss": 0.7633, "step": 1000 }, { "epoch": 0.010150498051104373, "grad_norm": 0.930011510848999, "learning_rate": 4.9998984950194896e-05, "loss": 0.6464, "step": 1500 }, { "epoch": 0.013533997401472498, "grad_norm": 0.5899236798286438, "learning_rate": 4.999864660025986e-05, "loss": 0.6073, "step": 2000 }, { "epoch": 0.016917496751840625, "grad_norm": 0.6071146726608276, "learning_rate": 4.999830825032481e-05, "loss": 0.587, "step": 2500 }, { "epoch": 0.020300996102208747, "grad_norm": 0.48228690028190613, "learning_rate": 4.999796990038978e-05, "loss": 0.5754, "step": 3000 }, { "epoch": 0.02368449545257687, "grad_norm": 0.46376076340675354, "learning_rate": 4.9997631550454744e-05, "loss": 0.564, "step": 3500 }, { "epoch": 0.027067994802944997, "grad_norm": 0.40634822845458984, "learning_rate": 4.9997293200519706e-05, "loss": 0.5585, "step": 4000 }, { "epoch": 0.03045149415331312, "grad_norm": 0.4346064627170563, "learning_rate": 4.999695485058467e-05, "loss": 0.5528, "step": 4500 }, { "epoch": 0.03383499350368125, "grad_norm": 0.41330981254577637, "learning_rate": 4.999661650064964e-05, "loss": 0.5488, "step": 5000 }, { "epoch": 0.03721849285404937, "grad_norm": 0.40927866101264954, "learning_rate": 4.99962781507146e-05, "loss": 0.5455, "step": 5500 }, { "epoch": 0.04060199220441749, "grad_norm": 0.4025146961212158, "learning_rate": 4.999593980077956e-05, "loss": 0.5395, "step": 6000 }, { "epoch": 0.04398549155478562, "grad_norm": 0.40866559743881226, "learning_rate": 4.9995601450844524e-05, "loss": 0.5381, "step": 6500 }, { "epoch": 0.04736899090515374, "grad_norm": 0.35923299193382263, "learning_rate": 4.999526310090949e-05, "loss": 0.533, "step": 7000 }, { "epoch": 0.05075249025552187, "grad_norm": 0.3413761556148529, "learning_rate": 4.999492475097445e-05, "loss": 0.5299, "step": 7500 }, { "epoch": 0.05413598960588999, "grad_norm": 0.3142264187335968, "learning_rate": 4.999458640103941e-05, "loss": 0.5287, "step": 8000 }, { "epoch": 0.05751948895625812, "grad_norm": 0.31593766808509827, "learning_rate": 4.999424805110437e-05, "loss": 0.5263, "step": 8500 }, { "epoch": 0.06090298830662624, "grad_norm": 0.2999120056629181, "learning_rate": 4.999390970116934e-05, "loss": 0.5227, "step": 9000 }, { "epoch": 0.06428648765699436, "grad_norm": 0.27803242206573486, "learning_rate": 4.99935713512343e-05, "loss": 0.5191, "step": 9500 }, { "epoch": 0.0676699870073625, "grad_norm": 0.2870517671108246, "learning_rate": 4.9993233001299265e-05, "loss": 0.5187, "step": 10000 }, { "epoch": 0.07105348635773062, "grad_norm": 0.28629007935523987, "learning_rate": 4.999289465136423e-05, "loss": 0.5169, "step": 10500 }, { "epoch": 0.07443698570809874, "grad_norm": 0.2766497731208801, "learning_rate": 4.9992556301429196e-05, "loss": 0.5142, "step": 11000 }, { "epoch": 0.07782048505846687, "grad_norm": 0.28909796476364136, "learning_rate": 4.999221795149416e-05, "loss": 0.5113, "step": 11500 }, { "epoch": 0.08120398440883499, "grad_norm": 0.26921311020851135, "learning_rate": 4.9991879601559114e-05, "loss": 0.5107, "step": 12000 }, { "epoch": 0.08458748375920312, "grad_norm": 0.26146847009658813, "learning_rate": 4.999154125162408e-05, "loss": 0.5068, "step": 12500 }, { "epoch": 0.08797098310957124, "grad_norm": 0.27655014395713806, "learning_rate": 4.9991202901689045e-05, "loss": 0.5062, "step": 13000 }, { "epoch": 0.09135448245993937, "grad_norm": 0.28565794229507446, "learning_rate": 4.999086455175401e-05, "loss": 0.5019, "step": 13500 }, { "epoch": 0.09473798181030749, "grad_norm": 0.26864224672317505, "learning_rate": 4.999052620181897e-05, "loss": 0.5038, "step": 14000 }, { "epoch": 0.09812148116067562, "grad_norm": 0.2848644256591797, "learning_rate": 4.999018785188394e-05, "loss": 0.5009, "step": 14500 }, { "epoch": 0.10150498051104374, "grad_norm": 0.27758416533470154, "learning_rate": 4.99898495019489e-05, "loss": 0.5009, "step": 15000 }, { "epoch": 0.10488847986141187, "grad_norm": 0.24842596054077148, "learning_rate": 4.998951115201386e-05, "loss": 0.4975, "step": 15500 }, { "epoch": 0.10827197921177999, "grad_norm": 0.24530164897441864, "learning_rate": 4.9989172802078824e-05, "loss": 0.4981, "step": 16000 }, { "epoch": 0.11165547856214812, "grad_norm": 0.25260645151138306, "learning_rate": 4.998883445214379e-05, "loss": 0.4946, "step": 16500 }, { "epoch": 0.11503897791251624, "grad_norm": 0.2753775119781494, "learning_rate": 4.998849610220875e-05, "loss": 0.4951, "step": 17000 }, { "epoch": 0.11842247726288437, "grad_norm": 0.2859707772731781, "learning_rate": 4.998815775227371e-05, "loss": 0.4932, "step": 17500 }, { "epoch": 0.12180597661325249, "grad_norm": 0.2658611834049225, "learning_rate": 4.998781940233867e-05, "loss": 0.4898, "step": 18000 }, { "epoch": 0.12518947596362062, "grad_norm": 0.2859034836292267, "learning_rate": 4.998748105240364e-05, "loss": 0.4898, "step": 18500 }, { "epoch": 0.12857297531398873, "grad_norm": 0.24946685135364532, "learning_rate": 4.9987142702468604e-05, "loss": 0.4872, "step": 19000 }, { "epoch": 0.13195647466435687, "grad_norm": 0.29766103625297546, "learning_rate": 4.9986804352533566e-05, "loss": 0.484, "step": 19500 }, { "epoch": 0.135339974014725, "grad_norm": 0.26545271277427673, "learning_rate": 4.998646600259853e-05, "loss": 0.4857, "step": 20000 }, { "epoch": 0.1387234733650931, "grad_norm": 0.257906436920166, "learning_rate": 4.99861276526635e-05, "loss": 0.4845, "step": 20500 }, { "epoch": 0.14210697271546124, "grad_norm": 0.3082759976387024, "learning_rate": 4.998578930272846e-05, "loss": 0.4819, "step": 21000 }, { "epoch": 0.14549047206582935, "grad_norm": 0.2744785249233246, "learning_rate": 4.9985450952793414e-05, "loss": 0.4814, "step": 21500 }, { "epoch": 0.1488739714161975, "grad_norm": 0.26781973242759705, "learning_rate": 4.998511260285838e-05, "loss": 0.4814, "step": 22000 }, { "epoch": 0.15225747076656562, "grad_norm": 0.26577427983283997, "learning_rate": 4.9984774252923345e-05, "loss": 0.4794, "step": 22500 }, { "epoch": 0.15564097011693373, "grad_norm": 0.2738116681575775, "learning_rate": 4.998443590298831e-05, "loss": 0.478, "step": 23000 }, { "epoch": 0.15902446946730187, "grad_norm": 0.2903014123439789, "learning_rate": 4.998409755305327e-05, "loss": 0.4771, "step": 23500 }, { "epoch": 0.16240796881766997, "grad_norm": 0.2572200298309326, "learning_rate": 4.998375920311824e-05, "loss": 0.4768, "step": 24000 }, { "epoch": 0.1657914681680381, "grad_norm": 0.2693314254283905, "learning_rate": 4.99834208531832e-05, "loss": 0.4759, "step": 24500 }, { "epoch": 0.16917496751840624, "grad_norm": 0.27925193309783936, "learning_rate": 4.998308250324816e-05, "loss": 0.4748, "step": 25000 }, { "epoch": 0.17255846686877435, "grad_norm": 0.3076331913471222, "learning_rate": 4.9982744153313125e-05, "loss": 0.4731, "step": 25500 }, { "epoch": 0.1759419662191425, "grad_norm": 0.2727000415325165, "learning_rate": 4.9982405803378094e-05, "loss": 0.4719, "step": 26000 }, { "epoch": 0.17932546556951062, "grad_norm": 0.2653730809688568, "learning_rate": 4.998206745344305e-05, "loss": 0.4706, "step": 26500 }, { "epoch": 0.18270896491987873, "grad_norm": 0.25557219982147217, "learning_rate": 4.998172910350801e-05, "loss": 0.4707, "step": 27000 }, { "epoch": 0.18609246427024687, "grad_norm": 0.25986531376838684, "learning_rate": 4.998139075357297e-05, "loss": 0.47, "step": 27500 }, { "epoch": 0.18947596362061497, "grad_norm": 0.25300464034080505, "learning_rate": 4.998105240363794e-05, "loss": 0.4687, "step": 28000 }, { "epoch": 0.1928594629709831, "grad_norm": 0.26704490184783936, "learning_rate": 4.9980714053702904e-05, "loss": 0.4677, "step": 28500 }, { "epoch": 0.19624296232135124, "grad_norm": 0.2561919689178467, "learning_rate": 4.9980375703767866e-05, "loss": 0.4671, "step": 29000 }, { "epoch": 0.19962646167171935, "grad_norm": 0.26999321579933167, "learning_rate": 4.998003735383283e-05, "loss": 0.4669, "step": 29500 }, { "epoch": 0.2030099610220875, "grad_norm": 0.2723313570022583, "learning_rate": 4.99796990038978e-05, "loss": 0.4656, "step": 30000 }, { "epoch": 0.2063934603724556, "grad_norm": 0.28622591495513916, "learning_rate": 4.997936065396276e-05, "loss": 0.4655, "step": 30500 }, { "epoch": 0.20977695972282373, "grad_norm": 0.2644602954387665, "learning_rate": 4.997902230402772e-05, "loss": 0.4636, "step": 31000 }, { "epoch": 0.21316045907319187, "grad_norm": 0.2647627294063568, "learning_rate": 4.9978683954092684e-05, "loss": 0.4619, "step": 31500 }, { "epoch": 0.21654395842355997, "grad_norm": 0.28980907797813416, "learning_rate": 4.9978345604157646e-05, "loss": 0.4635, "step": 32000 }, { "epoch": 0.2199274577739281, "grad_norm": 0.24661865830421448, "learning_rate": 4.997800725422261e-05, "loss": 0.462, "step": 32500 }, { "epoch": 0.22331095712429624, "grad_norm": 0.2632274329662323, "learning_rate": 4.997766890428757e-05, "loss": 0.4606, "step": 33000 }, { "epoch": 0.22669445647466435, "grad_norm": 0.2585084140300751, "learning_rate": 4.997733055435254e-05, "loss": 0.4608, "step": 33500 }, { "epoch": 0.2300779558250325, "grad_norm": 0.2850891053676605, "learning_rate": 4.99769922044175e-05, "loss": 0.4603, "step": 34000 }, { "epoch": 0.2334614551754006, "grad_norm": 0.2871512174606323, "learning_rate": 4.997665385448246e-05, "loss": 0.4604, "step": 34500 }, { "epoch": 0.23684495452576873, "grad_norm": 0.27349814772605896, "learning_rate": 4.9976315504547425e-05, "loss": 0.4582, "step": 35000 }, { "epoch": 0.24022845387613687, "grad_norm": 0.25539854168891907, "learning_rate": 4.9975977154612394e-05, "loss": 0.4581, "step": 35500 }, { "epoch": 0.24361195322650497, "grad_norm": 0.2622606158256531, "learning_rate": 4.997563880467735e-05, "loss": 0.4568, "step": 36000 }, { "epoch": 0.2469954525768731, "grad_norm": 0.27912938594818115, "learning_rate": 4.997530045474231e-05, "loss": 0.4575, "step": 36500 }, { "epoch": 0.25037895192724124, "grad_norm": 0.2705869972705841, "learning_rate": 4.9974962104807274e-05, "loss": 0.456, "step": 37000 }, { "epoch": 0.2537624512776094, "grad_norm": 0.2651195824146271, "learning_rate": 4.997462375487224e-05, "loss": 0.4561, "step": 37500 }, { "epoch": 0.25714595062797746, "grad_norm": 0.2819836735725403, "learning_rate": 4.9974285404937205e-05, "loss": 0.4549, "step": 38000 }, { "epoch": 0.2605294499783456, "grad_norm": 0.27624449133872986, "learning_rate": 4.997394705500217e-05, "loss": 0.4544, "step": 38500 }, { "epoch": 0.26391294932871373, "grad_norm": 0.26522350311279297, "learning_rate": 4.997360870506713e-05, "loss": 0.4544, "step": 39000 }, { "epoch": 0.26729644867908187, "grad_norm": 0.3110775649547577, "learning_rate": 4.99732703551321e-05, "loss": 0.4514, "step": 39500 }, { "epoch": 0.27067994802945, "grad_norm": 0.2998841404914856, "learning_rate": 4.997293200519706e-05, "loss": 0.4505, "step": 40000 }, { "epoch": 0.2740634473798181, "grad_norm": 0.2854876220226288, "learning_rate": 4.997259365526202e-05, "loss": 0.4514, "step": 40500 }, { "epoch": 0.2774469467301862, "grad_norm": 0.2567809522151947, "learning_rate": 4.9972255305326984e-05, "loss": 0.452, "step": 41000 }, { "epoch": 0.28083044608055435, "grad_norm": 0.2791685461997986, "learning_rate": 4.9971916955391947e-05, "loss": 0.4511, "step": 41500 }, { "epoch": 0.2842139454309225, "grad_norm": 0.29572752118110657, "learning_rate": 4.997157860545691e-05, "loss": 0.4517, "step": 42000 }, { "epoch": 0.2875974447812906, "grad_norm": 0.27020877599716187, "learning_rate": 4.997124025552187e-05, "loss": 0.4483, "step": 42500 }, { "epoch": 0.2909809441316587, "grad_norm": 0.2704961597919464, "learning_rate": 4.997090190558684e-05, "loss": 0.4482, "step": 43000 }, { "epoch": 0.29436444348202684, "grad_norm": 0.2908715009689331, "learning_rate": 4.99705635556518e-05, "loss": 0.4477, "step": 43500 }, { "epoch": 0.297747942832395, "grad_norm": 0.28041768074035645, "learning_rate": 4.9970225205716764e-05, "loss": 0.4494, "step": 44000 }, { "epoch": 0.3011314421827631, "grad_norm": 0.23975065350532532, "learning_rate": 4.9969886855781726e-05, "loss": 0.4483, "step": 44500 }, { "epoch": 0.30451494153313124, "grad_norm": 0.2862926125526428, "learning_rate": 4.9969548505846695e-05, "loss": 0.4463, "step": 45000 }, { "epoch": 0.3078984408834993, "grad_norm": 0.26498425006866455, "learning_rate": 4.996921015591165e-05, "loss": 0.4455, "step": 45500 }, { "epoch": 0.31128194023386746, "grad_norm": 0.27339521050453186, "learning_rate": 4.996887180597661e-05, "loss": 0.447, "step": 46000 }, { "epoch": 0.3146654395842356, "grad_norm": 0.25587713718414307, "learning_rate": 4.9968533456041575e-05, "loss": 0.4481, "step": 46500 }, { "epoch": 0.31804893893460373, "grad_norm": 0.290996789932251, "learning_rate": 4.9968195106106543e-05, "loss": 0.4471, "step": 47000 }, { "epoch": 0.32143243828497187, "grad_norm": 0.2602178752422333, "learning_rate": 4.9967856756171506e-05, "loss": 0.4451, "step": 47500 }, { "epoch": 0.32481593763533995, "grad_norm": 0.27312254905700684, "learning_rate": 4.996751840623647e-05, "loss": 0.445, "step": 48000 }, { "epoch": 0.3281994369857081, "grad_norm": 0.29018092155456543, "learning_rate": 4.996718005630143e-05, "loss": 0.4445, "step": 48500 }, { "epoch": 0.3315829363360762, "grad_norm": 0.27921053767204285, "learning_rate": 4.99668417063664e-05, "loss": 0.4418, "step": 49000 }, { "epoch": 0.33496643568644435, "grad_norm": 0.2919737994670868, "learning_rate": 4.996650335643136e-05, "loss": 0.4432, "step": 49500 }, { "epoch": 0.3383499350368125, "grad_norm": 0.22763541340827942, "learning_rate": 4.996616500649632e-05, "loss": 0.4409, "step": 50000 }, { "epoch": 0.3417334343871806, "grad_norm": 0.2642669975757599, "learning_rate": 4.9965826656561285e-05, "loss": 0.4427, "step": 50500 }, { "epoch": 0.3451169337375487, "grad_norm": 0.27928030490875244, "learning_rate": 4.996548830662625e-05, "loss": 0.4444, "step": 51000 }, { "epoch": 0.34850043308791684, "grad_norm": 0.27971315383911133, "learning_rate": 4.996514995669121e-05, "loss": 0.4413, "step": 51500 }, { "epoch": 0.351883932438285, "grad_norm": 0.2908726632595062, "learning_rate": 4.996481160675617e-05, "loss": 0.4409, "step": 52000 }, { "epoch": 0.3552674317886531, "grad_norm": 0.24951176345348358, "learning_rate": 4.996447325682114e-05, "loss": 0.4403, "step": 52500 }, { "epoch": 0.35865093113902125, "grad_norm": 0.2689545452594757, "learning_rate": 4.99641349068861e-05, "loss": 0.4392, "step": 53000 }, { "epoch": 0.3620344304893893, "grad_norm": 0.2819485366344452, "learning_rate": 4.9963796556951065e-05, "loss": 0.4398, "step": 53500 }, { "epoch": 0.36541792983975746, "grad_norm": 0.2707166373729706, "learning_rate": 4.996345820701603e-05, "loss": 0.4395, "step": 54000 }, { "epoch": 0.3688014291901256, "grad_norm": 0.27707841992378235, "learning_rate": 4.996311985708099e-05, "loss": 0.4381, "step": 54500 }, { "epoch": 0.37218492854049373, "grad_norm": 0.2961883842945099, "learning_rate": 4.996278150714595e-05, "loss": 0.4397, "step": 55000 }, { "epoch": 0.37556842789086187, "grad_norm": 0.2749854624271393, "learning_rate": 4.996244315721091e-05, "loss": 0.438, "step": 55500 }, { "epoch": 0.37895192724122995, "grad_norm": 0.2799800932407379, "learning_rate": 4.9962104807275875e-05, "loss": 0.4367, "step": 56000 }, { "epoch": 0.3823354265915981, "grad_norm": 0.27645984292030334, "learning_rate": 4.9961766457340844e-05, "loss": 0.4396, "step": 56500 }, { "epoch": 0.3857189259419662, "grad_norm": 0.2911885678768158, "learning_rate": 4.9961428107405806e-05, "loss": 0.4379, "step": 57000 }, { "epoch": 0.38910242529233435, "grad_norm": 0.29465240240097046, "learning_rate": 4.996108975747077e-05, "loss": 0.4384, "step": 57500 }, { "epoch": 0.3924859246427025, "grad_norm": 0.28913426399230957, "learning_rate": 4.996075140753573e-05, "loss": 0.4376, "step": 58000 }, { "epoch": 0.39586942399307057, "grad_norm": 0.27211660146713257, "learning_rate": 4.99604130576007e-05, "loss": 0.4353, "step": 58500 }, { "epoch": 0.3992529233434387, "grad_norm": 0.2998749017715454, "learning_rate": 4.996007470766566e-05, "loss": 0.4363, "step": 59000 }, { "epoch": 0.40263642269380684, "grad_norm": 0.2559037208557129, "learning_rate": 4.9959736357730624e-05, "loss": 0.4371, "step": 59500 }, { "epoch": 0.406019922044175, "grad_norm": 0.3002610206604004, "learning_rate": 4.9959398007795586e-05, "loss": 0.4357, "step": 60000 }, { "epoch": 0.4094034213945431, "grad_norm": 0.29734936356544495, "learning_rate": 4.995905965786055e-05, "loss": 0.4354, "step": 60500 }, { "epoch": 0.4127869207449112, "grad_norm": 0.2875003218650818, "learning_rate": 4.995872130792551e-05, "loss": 0.435, "step": 61000 }, { "epoch": 0.4161704200952793, "grad_norm": 0.3090741038322449, "learning_rate": 4.995838295799047e-05, "loss": 0.4342, "step": 61500 }, { "epoch": 0.41955391944564746, "grad_norm": 0.2710956931114197, "learning_rate": 4.995804460805544e-05, "loss": 0.4357, "step": 62000 }, { "epoch": 0.4229374187960156, "grad_norm": 0.2595633566379547, "learning_rate": 4.99577062581204e-05, "loss": 0.4342, "step": 62500 }, { "epoch": 0.42632091814638373, "grad_norm": 0.2667919993400574, "learning_rate": 4.9957367908185365e-05, "loss": 0.4326, "step": 63000 }, { "epoch": 0.42970441749675187, "grad_norm": 0.27533194422721863, "learning_rate": 4.995702955825033e-05, "loss": 0.4336, "step": 63500 }, { "epoch": 0.43308791684711995, "grad_norm": 0.26632460951805115, "learning_rate": 4.995669120831529e-05, "loss": 0.4347, "step": 64000 }, { "epoch": 0.4364714161974881, "grad_norm": 0.2682251036167145, "learning_rate": 4.995635285838025e-05, "loss": 0.432, "step": 64500 }, { "epoch": 0.4398549155478562, "grad_norm": 0.2722899317741394, "learning_rate": 4.9956014508445214e-05, "loss": 0.4329, "step": 65000 }, { "epoch": 0.44323841489822435, "grad_norm": 0.28562718629837036, "learning_rate": 4.9955676158510176e-05, "loss": 0.4328, "step": 65500 }, { "epoch": 0.4466219142485925, "grad_norm": 0.2702755630016327, "learning_rate": 4.9955337808575145e-05, "loss": 0.4313, "step": 66000 }, { "epoch": 0.45000541359896057, "grad_norm": 0.30107203125953674, "learning_rate": 4.995499945864011e-05, "loss": 0.4317, "step": 66500 }, { "epoch": 0.4533889129493287, "grad_norm": 0.268795907497406, "learning_rate": 4.995466110870507e-05, "loss": 0.4314, "step": 67000 }, { "epoch": 0.45677241229969684, "grad_norm": 0.2796587646007538, "learning_rate": 4.995432275877003e-05, "loss": 0.4297, "step": 67500 }, { "epoch": 0.460155911650065, "grad_norm": 0.29196685552597046, "learning_rate": 4.9953984408835e-05, "loss": 0.4325, "step": 68000 }, { "epoch": 0.4635394110004331, "grad_norm": 0.26869258284568787, "learning_rate": 4.995364605889996e-05, "loss": 0.4312, "step": 68500 }, { "epoch": 0.4669229103508012, "grad_norm": 0.3043369948863983, "learning_rate": 4.9953307708964924e-05, "loss": 0.432, "step": 69000 }, { "epoch": 0.4703064097011693, "grad_norm": 0.2701905071735382, "learning_rate": 4.9952969359029886e-05, "loss": 0.4307, "step": 69500 }, { "epoch": 0.47368990905153746, "grad_norm": 0.2781127393245697, "learning_rate": 4.995263100909485e-05, "loss": 0.4313, "step": 70000 }, { "epoch": 0.4770734084019056, "grad_norm": 0.25713086128234863, "learning_rate": 4.995229265915981e-05, "loss": 0.4297, "step": 70500 }, { "epoch": 0.48045690775227373, "grad_norm": 0.28825071454048157, "learning_rate": 4.995195430922477e-05, "loss": 0.4311, "step": 71000 }, { "epoch": 0.4838404071026418, "grad_norm": 0.27780482172966003, "learning_rate": 4.9951615959289735e-05, "loss": 0.4293, "step": 71500 }, { "epoch": 0.48722390645300995, "grad_norm": 0.2669151723384857, "learning_rate": 4.9951277609354704e-05, "loss": 0.428, "step": 72000 }, { "epoch": 0.4906074058033781, "grad_norm": 0.28188133239746094, "learning_rate": 4.9950939259419666e-05, "loss": 0.4287, "step": 72500 }, { "epoch": 0.4939909051537462, "grad_norm": 0.29270079731941223, "learning_rate": 4.995060090948463e-05, "loss": 0.4287, "step": 73000 }, { "epoch": 0.49737440450411435, "grad_norm": 0.27843374013900757, "learning_rate": 4.995026255954959e-05, "loss": 0.428, "step": 73500 }, { "epoch": 0.5007579038544825, "grad_norm": 0.28133416175842285, "learning_rate": 4.994992420961455e-05, "loss": 0.4275, "step": 74000 }, { "epoch": 0.5041414032048506, "grad_norm": 0.2791186273097992, "learning_rate": 4.9949585859679514e-05, "loss": 0.4286, "step": 74500 }, { "epoch": 0.5075249025552188, "grad_norm": 0.2686724364757538, "learning_rate": 4.9949247509744476e-05, "loss": 0.427, "step": 75000 }, { "epoch": 0.5109084019055868, "grad_norm": 0.29482367634773254, "learning_rate": 4.9948909159809445e-05, "loss": 0.4269, "step": 75500 }, { "epoch": 0.5142919012559549, "grad_norm": 0.3145529329776764, "learning_rate": 4.994857080987441e-05, "loss": 0.4274, "step": 76000 }, { "epoch": 0.5176754006063231, "grad_norm": 0.28418517112731934, "learning_rate": 4.994823245993937e-05, "loss": 0.4268, "step": 76500 }, { "epoch": 0.5210588999566912, "grad_norm": 0.3213294446468353, "learning_rate": 4.994789411000433e-05, "loss": 0.4273, "step": 77000 }, { "epoch": 0.5244423993070594, "grad_norm": 0.2876501679420471, "learning_rate": 4.99475557600693e-05, "loss": 0.4279, "step": 77500 }, { "epoch": 0.5278258986574275, "grad_norm": 0.2928789258003235, "learning_rate": 4.994721741013426e-05, "loss": 0.4261, "step": 78000 }, { "epoch": 0.5312093980077955, "grad_norm": 0.3037525713443756, "learning_rate": 4.9946879060199225e-05, "loss": 0.4253, "step": 78500 }, { "epoch": 0.5345928973581637, "grad_norm": 0.2588423788547516, "learning_rate": 4.994654071026418e-05, "loss": 0.4259, "step": 79000 }, { "epoch": 0.5379763967085318, "grad_norm": 0.30956318974494934, "learning_rate": 4.994620236032915e-05, "loss": 0.4243, "step": 79500 }, { "epoch": 0.5413598960589, "grad_norm": 0.2818774878978729, "learning_rate": 4.994586401039411e-05, "loss": 0.4262, "step": 80000 }, { "epoch": 0.5447433954092681, "grad_norm": 0.27260732650756836, "learning_rate": 4.994552566045907e-05, "loss": 0.4257, "step": 80500 }, { "epoch": 0.5481268947596362, "grad_norm": 0.28293049335479736, "learning_rate": 4.9945187310524035e-05, "loss": 0.4244, "step": 81000 }, { "epoch": 0.5515103941100044, "grad_norm": 0.299513041973114, "learning_rate": 4.9944848960589004e-05, "loss": 0.4249, "step": 81500 }, { "epoch": 0.5548938934603724, "grad_norm": 0.2706651985645294, "learning_rate": 4.9944510610653966e-05, "loss": 0.4247, "step": 82000 }, { "epoch": 0.5582773928107406, "grad_norm": 0.3018222153186798, "learning_rate": 4.994417226071893e-05, "loss": 0.4256, "step": 82500 }, { "epoch": 0.5616608921611087, "grad_norm": 0.3136172294616699, "learning_rate": 4.994383391078389e-05, "loss": 0.4241, "step": 83000 }, { "epoch": 0.5650443915114768, "grad_norm": 0.27541613578796387, "learning_rate": 4.994349556084885e-05, "loss": 0.4239, "step": 83500 }, { "epoch": 0.568427890861845, "grad_norm": 0.2760767936706543, "learning_rate": 4.9943157210913815e-05, "loss": 0.425, "step": 84000 }, { "epoch": 0.571811390212213, "grad_norm": 0.2719828188419342, "learning_rate": 4.994281886097878e-05, "loss": 0.4239, "step": 84500 }, { "epoch": 0.5751948895625812, "grad_norm": 0.2611558437347412, "learning_rate": 4.9942480511043746e-05, "loss": 0.4215, "step": 85000 }, { "epoch": 0.5785783889129493, "grad_norm": 0.28882431983947754, "learning_rate": 4.994214216110871e-05, "loss": 0.423, "step": 85500 }, { "epoch": 0.5819618882633174, "grad_norm": 0.2750629186630249, "learning_rate": 4.994180381117367e-05, "loss": 0.4211, "step": 86000 }, { "epoch": 0.5853453876136856, "grad_norm": 0.34259527921676636, "learning_rate": 4.994146546123863e-05, "loss": 0.4222, "step": 86500 }, { "epoch": 0.5887288869640537, "grad_norm": 0.28423207998275757, "learning_rate": 4.99411271113036e-05, "loss": 0.424, "step": 87000 }, { "epoch": 0.5921123863144219, "grad_norm": 0.27727535367012024, "learning_rate": 4.994078876136856e-05, "loss": 0.4232, "step": 87500 }, { "epoch": 0.59549588566479, "grad_norm": 0.29033538699150085, "learning_rate": 4.9940450411433525e-05, "loss": 0.4223, "step": 88000 }, { "epoch": 0.598879385015158, "grad_norm": 0.3040529489517212, "learning_rate": 4.994011206149848e-05, "loss": 0.4222, "step": 88500 }, { "epoch": 0.6022628843655262, "grad_norm": 0.29766690731048584, "learning_rate": 4.993977371156345e-05, "loss": 0.4223, "step": 89000 }, { "epoch": 0.6056463837158943, "grad_norm": 0.28429678082466125, "learning_rate": 4.993943536162841e-05, "loss": 0.4233, "step": 89500 }, { "epoch": 0.6090298830662625, "grad_norm": 0.2714273929595947, "learning_rate": 4.9939097011693374e-05, "loss": 0.4216, "step": 90000 }, { "epoch": 0.6124133824166306, "grad_norm": 0.30011844635009766, "learning_rate": 4.9938758661758336e-05, "loss": 0.421, "step": 90500 }, { "epoch": 0.6157968817669986, "grad_norm": 0.28375932574272156, "learning_rate": 4.9938420311823305e-05, "loss": 0.4226, "step": 91000 }, { "epoch": 0.6191803811173668, "grad_norm": 0.27526459097862244, "learning_rate": 4.993808196188827e-05, "loss": 0.422, "step": 91500 }, { "epoch": 0.6225638804677349, "grad_norm": 0.2931526303291321, "learning_rate": 4.993774361195323e-05, "loss": 0.4208, "step": 92000 }, { "epoch": 0.6259473798181031, "grad_norm": 0.27956876158714294, "learning_rate": 4.993740526201819e-05, "loss": 0.4189, "step": 92500 }, { "epoch": 0.6293308791684712, "grad_norm": 0.29397526383399963, "learning_rate": 4.993706691208316e-05, "loss": 0.4211, "step": 93000 }, { "epoch": 0.6327143785188393, "grad_norm": 0.26474645733833313, "learning_rate": 4.9936728562148116e-05, "loss": 0.4212, "step": 93500 }, { "epoch": 0.6360978778692075, "grad_norm": 0.27761274576187134, "learning_rate": 4.993639021221308e-05, "loss": 0.4215, "step": 94000 }, { "epoch": 0.6394813772195755, "grad_norm": 0.2934247553348541, "learning_rate": 4.993605186227805e-05, "loss": 0.4205, "step": 94500 }, { "epoch": 0.6428648765699437, "grad_norm": 0.2508888840675354, "learning_rate": 4.993571351234301e-05, "loss": 0.4183, "step": 95000 }, { "epoch": 0.6462483759203118, "grad_norm": 0.28504717350006104, "learning_rate": 4.993537516240797e-05, "loss": 0.4189, "step": 95500 }, { "epoch": 0.6496318752706799, "grad_norm": 0.27248984575271606, "learning_rate": 4.993503681247293e-05, "loss": 0.4203, "step": 96000 }, { "epoch": 0.6530153746210481, "grad_norm": 0.2794642150402069, "learning_rate": 4.99346984625379e-05, "loss": 0.4215, "step": 96500 }, { "epoch": 0.6563988739714162, "grad_norm": 0.30702054500579834, "learning_rate": 4.9934360112602864e-05, "loss": 0.4185, "step": 97000 }, { "epoch": 0.6597823733217844, "grad_norm": 0.27618154883384705, "learning_rate": 4.9934021762667826e-05, "loss": 0.4195, "step": 97500 }, { "epoch": 0.6631658726721524, "grad_norm": 0.28443071246147156, "learning_rate": 4.993368341273278e-05, "loss": 0.4183, "step": 98000 }, { "epoch": 0.6665493720225206, "grad_norm": 0.2913377583026886, "learning_rate": 4.993334506279775e-05, "loss": 0.4193, "step": 98500 }, { "epoch": 0.6699328713728887, "grad_norm": 0.31212741136550903, "learning_rate": 4.993300671286271e-05, "loss": 0.419, "step": 99000 }, { "epoch": 0.6733163707232568, "grad_norm": 0.28324469923973083, "learning_rate": 4.9932668362927675e-05, "loss": 0.4181, "step": 99500 }, { "epoch": 0.676699870073625, "grad_norm": 0.2820169925689697, "learning_rate": 4.993233001299264e-05, "loss": 0.4192, "step": 100000 }, { "epoch": 0.6800833694239931, "grad_norm": 0.3011641800403595, "learning_rate": 4.9931991663057606e-05, "loss": 0.4182, "step": 100500 }, { "epoch": 0.6834668687743612, "grad_norm": 0.27722039818763733, "learning_rate": 4.993165331312257e-05, "loss": 0.4189, "step": 101000 }, { "epoch": 0.6868503681247293, "grad_norm": 0.28944921493530273, "learning_rate": 4.993131496318753e-05, "loss": 0.4196, "step": 101500 }, { "epoch": 0.6902338674750974, "grad_norm": 0.29599529504776, "learning_rate": 4.993097661325249e-05, "loss": 0.4176, "step": 102000 }, { "epoch": 0.6936173668254656, "grad_norm": 0.2635329067707062, "learning_rate": 4.993063826331746e-05, "loss": 0.4191, "step": 102500 }, { "epoch": 0.6970008661758337, "grad_norm": 0.26696333289146423, "learning_rate": 4.9930299913382416e-05, "loss": 0.4192, "step": 103000 }, { "epoch": 0.7003843655262019, "grad_norm": 0.30064094066619873, "learning_rate": 4.992996156344738e-05, "loss": 0.4176, "step": 103500 }, { "epoch": 0.70376786487657, "grad_norm": 0.27610304951667786, "learning_rate": 4.992962321351235e-05, "loss": 0.4178, "step": 104000 }, { "epoch": 0.707151364226938, "grad_norm": 0.3050728142261505, "learning_rate": 4.992928486357731e-05, "loss": 0.4185, "step": 104500 }, { "epoch": 0.7105348635773062, "grad_norm": 0.28778210282325745, "learning_rate": 4.992894651364227e-05, "loss": 0.4169, "step": 105000 }, { "epoch": 0.7139183629276743, "grad_norm": 0.2820018231868744, "learning_rate": 4.9928608163707234e-05, "loss": 0.4159, "step": 105500 }, { "epoch": 0.7173018622780425, "grad_norm": 0.32361045479774475, "learning_rate": 4.99282698137722e-05, "loss": 0.4166, "step": 106000 }, { "epoch": 0.7206853616284106, "grad_norm": 0.2677634656429291, "learning_rate": 4.9927931463837165e-05, "loss": 0.4156, "step": 106500 }, { "epoch": 0.7240688609787787, "grad_norm": 0.30490702390670776, "learning_rate": 4.992759311390213e-05, "loss": 0.4163, "step": 107000 }, { "epoch": 0.7274523603291468, "grad_norm": 0.2492278665304184, "learning_rate": 4.992725476396708e-05, "loss": 0.4169, "step": 107500 }, { "epoch": 0.7308358596795149, "grad_norm": 0.27243173122406006, "learning_rate": 4.992691641403205e-05, "loss": 0.4169, "step": 108000 }, { "epoch": 0.7342193590298831, "grad_norm": 0.2796129882335663, "learning_rate": 4.992657806409701e-05, "loss": 0.4166, "step": 108500 }, { "epoch": 0.7376028583802512, "grad_norm": 0.2759961783885956, "learning_rate": 4.9926239714161975e-05, "loss": 0.4166, "step": 109000 }, { "epoch": 0.7409863577306193, "grad_norm": 0.2797967195510864, "learning_rate": 4.992590136422694e-05, "loss": 0.4141, "step": 109500 }, { "epoch": 0.7443698570809875, "grad_norm": 0.28543514013290405, "learning_rate": 4.9925563014291906e-05, "loss": 0.4154, "step": 110000 }, { "epoch": 0.7477533564313555, "grad_norm": 0.2752548158168793, "learning_rate": 4.992522466435687e-05, "loss": 0.4149, "step": 110500 }, { "epoch": 0.7511368557817237, "grad_norm": 0.2887478768825531, "learning_rate": 4.992488631442183e-05, "loss": 0.4147, "step": 111000 }, { "epoch": 0.7545203551320918, "grad_norm": 0.2966802716255188, "learning_rate": 4.992454796448679e-05, "loss": 0.4159, "step": 111500 }, { "epoch": 0.7579038544824599, "grad_norm": 0.2996438443660736, "learning_rate": 4.992420961455176e-05, "loss": 0.4165, "step": 112000 }, { "epoch": 0.7612873538328281, "grad_norm": 0.2674398422241211, "learning_rate": 4.992387126461672e-05, "loss": 0.4161, "step": 112500 }, { "epoch": 0.7646708531831962, "grad_norm": 0.2995375990867615, "learning_rate": 4.992353291468168e-05, "loss": 0.4155, "step": 113000 }, { "epoch": 0.7680543525335644, "grad_norm": 0.2994774580001831, "learning_rate": 4.992319456474665e-05, "loss": 0.4135, "step": 113500 }, { "epoch": 0.7714378518839324, "grad_norm": 0.32188859581947327, "learning_rate": 4.992285621481161e-05, "loss": 0.4144, "step": 114000 }, { "epoch": 0.7748213512343005, "grad_norm": 0.26384779810905457, "learning_rate": 4.992251786487657e-05, "loss": 0.4144, "step": 114500 }, { "epoch": 0.7782048505846687, "grad_norm": 0.3245967924594879, "learning_rate": 4.9922179514941534e-05, "loss": 0.4143, "step": 115000 }, { "epoch": 0.7815883499350368, "grad_norm": 0.2749451696872711, "learning_rate": 4.99218411650065e-05, "loss": 0.4157, "step": 115500 }, { "epoch": 0.784971849285405, "grad_norm": 0.2730276882648468, "learning_rate": 4.9921502815071465e-05, "loss": 0.4135, "step": 116000 }, { "epoch": 0.7883553486357731, "grad_norm": 0.3006286919116974, "learning_rate": 4.992116446513643e-05, "loss": 0.4141, "step": 116500 }, { "epoch": 0.7917388479861411, "grad_norm": 0.29308584332466125, "learning_rate": 4.992082611520138e-05, "loss": 0.4148, "step": 117000 }, { "epoch": 0.7951223473365093, "grad_norm": 0.27468234300613403, "learning_rate": 4.992048776526635e-05, "loss": 0.4142, "step": 117500 }, { "epoch": 0.7985058466868774, "grad_norm": 0.31991246342658997, "learning_rate": 4.9920149415331314e-05, "loss": 0.4131, "step": 118000 }, { "epoch": 0.8018893460372456, "grad_norm": 0.2824453115463257, "learning_rate": 4.9919811065396276e-05, "loss": 0.4134, "step": 118500 }, { "epoch": 0.8052728453876137, "grad_norm": 0.27071237564086914, "learning_rate": 4.991947271546124e-05, "loss": 0.414, "step": 119000 }, { "epoch": 0.8086563447379818, "grad_norm": 0.27571454644203186, "learning_rate": 4.991913436552621e-05, "loss": 0.4115, "step": 119500 }, { "epoch": 0.81203984408835, "grad_norm": 0.27670493721961975, "learning_rate": 4.991879601559117e-05, "loss": 0.4125, "step": 120000 }, { "epoch": 0.815423343438718, "grad_norm": 0.2901179790496826, "learning_rate": 4.991845766565613e-05, "loss": 0.4142, "step": 120500 }, { "epoch": 0.8188068427890862, "grad_norm": 0.27062419056892395, "learning_rate": 4.991811931572109e-05, "loss": 0.4154, "step": 121000 }, { "epoch": 0.8221903421394543, "grad_norm": 0.2876355051994324, "learning_rate": 4.991778096578606e-05, "loss": 0.4121, "step": 121500 }, { "epoch": 0.8255738414898224, "grad_norm": 0.27911514043807983, "learning_rate": 4.991744261585102e-05, "loss": 0.4108, "step": 122000 }, { "epoch": 0.8289573408401906, "grad_norm": 0.32272857427597046, "learning_rate": 4.991710426591598e-05, "loss": 0.4114, "step": 122500 }, { "epoch": 0.8323408401905587, "grad_norm": 0.28503257036209106, "learning_rate": 4.991676591598095e-05, "loss": 0.413, "step": 123000 }, { "epoch": 0.8357243395409268, "grad_norm": 0.30182546377182007, "learning_rate": 4.991642756604591e-05, "loss": 0.4115, "step": 123500 }, { "epoch": 0.8391078388912949, "grad_norm": 0.31456178426742554, "learning_rate": 4.991608921611087e-05, "loss": 0.4128, "step": 124000 }, { "epoch": 0.8424913382416631, "grad_norm": 0.2838102877140045, "learning_rate": 4.9915750866175835e-05, "loss": 0.4105, "step": 124500 }, { "epoch": 0.8458748375920312, "grad_norm": 0.3066151738166809, "learning_rate": 4.9915412516240804e-05, "loss": 0.4131, "step": 125000 }, { "epoch": 0.8492583369423993, "grad_norm": 0.2823828458786011, "learning_rate": 4.9915074166305766e-05, "loss": 0.4127, "step": 125500 }, { "epoch": 0.8526418362927675, "grad_norm": 0.27775952219963074, "learning_rate": 4.991473581637073e-05, "loss": 0.4128, "step": 126000 }, { "epoch": 0.8560253356431355, "grad_norm": 0.2724365293979645, "learning_rate": 4.991439746643568e-05, "loss": 0.4124, "step": 126500 }, { "epoch": 0.8594088349935037, "grad_norm": 0.29520806670188904, "learning_rate": 4.991405911650065e-05, "loss": 0.4114, "step": 127000 }, { "epoch": 0.8627923343438718, "grad_norm": 0.2807687819004059, "learning_rate": 4.9913720766565614e-05, "loss": 0.413, "step": 127500 }, { "epoch": 0.8661758336942399, "grad_norm": 0.2868216633796692, "learning_rate": 4.9913382416630577e-05, "loss": 0.4107, "step": 128000 }, { "epoch": 0.8695593330446081, "grad_norm": 0.28542953729629517, "learning_rate": 4.991304406669554e-05, "loss": 0.4128, "step": 128500 }, { "epoch": 0.8729428323949762, "grad_norm": 0.29025155305862427, "learning_rate": 4.991270571676051e-05, "loss": 0.4122, "step": 129000 }, { "epoch": 0.8763263317453444, "grad_norm": 0.29234567284584045, "learning_rate": 4.991236736682547e-05, "loss": 0.4099, "step": 129500 }, { "epoch": 0.8797098310957124, "grad_norm": 0.27568700909614563, "learning_rate": 4.991202901689043e-05, "loss": 0.4118, "step": 130000 }, { "epoch": 0.8830933304460805, "grad_norm": 0.3259067237377167, "learning_rate": 4.9911690666955394e-05, "loss": 0.4123, "step": 130500 }, { "epoch": 0.8864768297964487, "grad_norm": 0.33408623933792114, "learning_rate": 4.991135231702036e-05, "loss": 0.4119, "step": 131000 }, { "epoch": 0.8898603291468168, "grad_norm": 0.2826260030269623, "learning_rate": 4.991101396708532e-05, "loss": 0.411, "step": 131500 }, { "epoch": 0.893243828497185, "grad_norm": 0.27461233735084534, "learning_rate": 4.991067561715028e-05, "loss": 0.4105, "step": 132000 }, { "epoch": 0.8966273278475531, "grad_norm": 0.3100557327270508, "learning_rate": 4.991033726721525e-05, "loss": 0.4121, "step": 132500 }, { "epoch": 0.9000108271979211, "grad_norm": 0.30305910110473633, "learning_rate": 4.990999891728021e-05, "loss": 0.411, "step": 133000 }, { "epoch": 0.9033943265482893, "grad_norm": 0.31239059567451477, "learning_rate": 4.9909660567345173e-05, "loss": 0.4091, "step": 133500 }, { "epoch": 0.9067778258986574, "grad_norm": 0.332537442445755, "learning_rate": 4.9909322217410136e-05, "loss": 0.4092, "step": 134000 }, { "epoch": 0.9101613252490256, "grad_norm": 0.3206341564655304, "learning_rate": 4.99089838674751e-05, "loss": 0.4109, "step": 134500 }, { "epoch": 0.9135448245993937, "grad_norm": 0.27052798867225647, "learning_rate": 4.9908645517540067e-05, "loss": 0.4096, "step": 135000 }, { "epoch": 0.9169283239497618, "grad_norm": 0.31399449706077576, "learning_rate": 4.990830716760503e-05, "loss": 0.4096, "step": 135500 }, { "epoch": 0.92031182330013, "grad_norm": 0.2710965871810913, "learning_rate": 4.9907968817669984e-05, "loss": 0.411, "step": 136000 }, { "epoch": 0.923695322650498, "grad_norm": 0.281416118144989, "learning_rate": 4.990763046773495e-05, "loss": 0.4109, "step": 136500 }, { "epoch": 0.9270788220008662, "grad_norm": 0.29734212160110474, "learning_rate": 4.9907292117799915e-05, "loss": 0.4111, "step": 137000 }, { "epoch": 0.9304623213512343, "grad_norm": 0.28775766491889954, "learning_rate": 4.990695376786488e-05, "loss": 0.4101, "step": 137500 }, { "epoch": 0.9338458207016024, "grad_norm": 0.28395044803619385, "learning_rate": 4.990661541792984e-05, "loss": 0.4093, "step": 138000 }, { "epoch": 0.9372293200519706, "grad_norm": 0.28168410062789917, "learning_rate": 4.990627706799481e-05, "loss": 0.4103, "step": 138500 }, { "epoch": 0.9406128194023387, "grad_norm": 0.2947444021701813, "learning_rate": 4.990593871805977e-05, "loss": 0.4099, "step": 139000 }, { "epoch": 0.9439963187527068, "grad_norm": 0.299245685338974, "learning_rate": 4.990560036812473e-05, "loss": 0.4098, "step": 139500 }, { "epoch": 0.9473798181030749, "grad_norm": 0.28602391481399536, "learning_rate": 4.9905262018189695e-05, "loss": 0.4094, "step": 140000 }, { "epoch": 0.950763317453443, "grad_norm": 0.3072488307952881, "learning_rate": 4.9904923668254663e-05, "loss": 0.4099, "step": 140500 }, { "epoch": 0.9541468168038112, "grad_norm": 0.28334489464759827, "learning_rate": 4.990458531831962e-05, "loss": 0.4078, "step": 141000 }, { "epoch": 0.9575303161541793, "grad_norm": 0.28181710839271545, "learning_rate": 4.990424696838458e-05, "loss": 0.4091, "step": 141500 }, { "epoch": 0.9609138155045475, "grad_norm": 0.285423219203949, "learning_rate": 4.990390861844954e-05, "loss": 0.4104, "step": 142000 }, { "epoch": 0.9642973148549155, "grad_norm": 0.3231546878814697, "learning_rate": 4.990357026851451e-05, "loss": 0.4093, "step": 142500 }, { "epoch": 0.9676808142052836, "grad_norm": 0.2778891324996948, "learning_rate": 4.9903231918579474e-05, "loss": 0.4104, "step": 143000 }, { "epoch": 0.9710643135556518, "grad_norm": 0.31177300214767456, "learning_rate": 4.9902893568644436e-05, "loss": 0.4077, "step": 143500 }, { "epoch": 0.9744478129060199, "grad_norm": 0.2938098907470703, "learning_rate": 4.99025552187094e-05, "loss": 0.4088, "step": 144000 }, { "epoch": 0.9778313122563881, "grad_norm": 0.3233024775981903, "learning_rate": 4.990221686877437e-05, "loss": 0.407, "step": 144500 }, { "epoch": 0.9812148116067562, "grad_norm": 0.31348568201065063, "learning_rate": 4.990187851883933e-05, "loss": 0.4091, "step": 145000 }, { "epoch": 0.9845983109571242, "grad_norm": 0.29528912901878357, "learning_rate": 4.990154016890429e-05, "loss": 0.4101, "step": 145500 }, { "epoch": 0.9879818103074924, "grad_norm": 0.30985161662101746, "learning_rate": 4.9901201818969254e-05, "loss": 0.409, "step": 146000 }, { "epoch": 0.9913653096578605, "grad_norm": 0.3055655360221863, "learning_rate": 4.9900863469034216e-05, "loss": 0.4072, "step": 146500 }, { "epoch": 0.9947488090082287, "grad_norm": 0.2524847090244293, "learning_rate": 4.990052511909918e-05, "loss": 0.4079, "step": 147000 }, { "epoch": 0.9981323083585968, "grad_norm": 0.3176727294921875, "learning_rate": 4.990018676916414e-05, "loss": 0.4089, "step": 147500 }, { "epoch": 1.0, "eval_accuracy": 0.8459713252862398, "eval_loss": 0.6243861317634583, "eval_runtime": 3360.1882, "eval_samples_per_second": 86.526, "eval_steps_per_second": 5.408, "step": 147776 }, { "epoch": 1.001515807708965, "grad_norm": 0.2944328188896179, "learning_rate": 4.989984841922911e-05, "loss": 0.4063, "step": 148000 }, { "epoch": 1.004899307059333, "grad_norm": 0.29024773836135864, "learning_rate": 4.989951006929407e-05, "loss": 0.4039, "step": 148500 }, { "epoch": 1.0082828064097011, "grad_norm": 0.27989470958709717, "learning_rate": 4.989917171935903e-05, "loss": 0.4064, "step": 149000 }, { "epoch": 1.0116663057600692, "grad_norm": 0.28938227891921997, "learning_rate": 4.9898833369423995e-05, "loss": 0.407, "step": 149500 }, { "epoch": 1.0150498051104375, "grad_norm": 0.295976459980011, "learning_rate": 4.9898495019488964e-05, "loss": 0.406, "step": 150000 }, { "epoch": 1.0184333044608056, "grad_norm": 0.31487059593200684, "learning_rate": 4.989815666955392e-05, "loss": 0.4051, "step": 150500 }, { "epoch": 1.0218168038111737, "grad_norm": 0.26871323585510254, "learning_rate": 4.989781831961888e-05, "loss": 0.4059, "step": 151000 }, { "epoch": 1.0252003031615418, "grad_norm": 0.28550779819488525, "learning_rate": 4.9897479969683844e-05, "loss": 0.4039, "step": 151500 }, { "epoch": 1.0285838025119098, "grad_norm": 0.2694486975669861, "learning_rate": 4.989714161974881e-05, "loss": 0.4062, "step": 152000 }, { "epoch": 1.0319673018622781, "grad_norm": 0.2790544629096985, "learning_rate": 4.9896803269813775e-05, "loss": 0.4051, "step": 152500 }, { "epoch": 1.0353508012126462, "grad_norm": 0.2629123628139496, "learning_rate": 4.989646491987874e-05, "loss": 0.4059, "step": 153000 }, { "epoch": 1.0387343005630143, "grad_norm": 0.3039059340953827, "learning_rate": 4.98961265699437e-05, "loss": 0.4053, "step": 153500 }, { "epoch": 1.0421177999133824, "grad_norm": 0.3079100549221039, "learning_rate": 4.989578822000867e-05, "loss": 0.4058, "step": 154000 }, { "epoch": 1.0455012992637505, "grad_norm": 0.3271750807762146, "learning_rate": 4.989544987007363e-05, "loss": 0.405, "step": 154500 }, { "epoch": 1.0488847986141188, "grad_norm": 0.30280107259750366, "learning_rate": 4.989511152013859e-05, "loss": 0.4049, "step": 155000 }, { "epoch": 1.0522682979644868, "grad_norm": 0.3045520484447479, "learning_rate": 4.9894773170203554e-05, "loss": 0.4041, "step": 155500 }, { "epoch": 1.055651797314855, "grad_norm": 0.2965322732925415, "learning_rate": 4.9894434820268516e-05, "loss": 0.4065, "step": 156000 }, { "epoch": 1.059035296665223, "grad_norm": 0.28401070833206177, "learning_rate": 4.989409647033348e-05, "loss": 0.4066, "step": 156500 }, { "epoch": 1.062418796015591, "grad_norm": 0.3080543577671051, "learning_rate": 4.989375812039844e-05, "loss": 0.4074, "step": 157000 }, { "epoch": 1.0658022953659594, "grad_norm": 0.28200146555900574, "learning_rate": 4.989341977046341e-05, "loss": 0.4048, "step": 157500 }, { "epoch": 1.0691857947163275, "grad_norm": 0.28583553433418274, "learning_rate": 4.989308142052837e-05, "loss": 0.4048, "step": 158000 }, { "epoch": 1.0725692940666955, "grad_norm": 0.29166433215141296, "learning_rate": 4.9892743070593334e-05, "loss": 0.405, "step": 158500 }, { "epoch": 1.0759527934170636, "grad_norm": 0.2707166075706482, "learning_rate": 4.9892404720658296e-05, "loss": 0.4047, "step": 159000 }, { "epoch": 1.0793362927674317, "grad_norm": 0.2985897362232208, "learning_rate": 4.9892066370723265e-05, "loss": 0.4048, "step": 159500 }, { "epoch": 1.0827197921178, "grad_norm": 0.32868626713752747, "learning_rate": 4.989172802078822e-05, "loss": 0.4075, "step": 160000 }, { "epoch": 1.086103291468168, "grad_norm": 0.3031218349933624, "learning_rate": 4.989138967085318e-05, "loss": 0.4049, "step": 160500 }, { "epoch": 1.0894867908185362, "grad_norm": 0.277643084526062, "learning_rate": 4.9891051320918144e-05, "loss": 0.4065, "step": 161000 }, { "epoch": 1.0928702901689042, "grad_norm": 0.2969783842563629, "learning_rate": 4.989071297098311e-05, "loss": 0.4044, "step": 161500 }, { "epoch": 1.0962537895192723, "grad_norm": 0.30704718828201294, "learning_rate": 4.9890374621048075e-05, "loss": 0.4035, "step": 162000 }, { "epoch": 1.0996372888696406, "grad_norm": 0.303273469209671, "learning_rate": 4.989003627111304e-05, "loss": 0.4044, "step": 162500 }, { "epoch": 1.1030207882200087, "grad_norm": 0.30890023708343506, "learning_rate": 4.9889697921178e-05, "loss": 0.4044, "step": 163000 }, { "epoch": 1.1064042875703768, "grad_norm": 0.3068098723888397, "learning_rate": 4.988935957124297e-05, "loss": 0.4049, "step": 163500 }, { "epoch": 1.1097877869207449, "grad_norm": 0.2811026871204376, "learning_rate": 4.988902122130793e-05, "loss": 0.4032, "step": 164000 }, { "epoch": 1.113171286271113, "grad_norm": 0.2884727716445923, "learning_rate": 4.988868287137289e-05, "loss": 0.4056, "step": 164500 }, { "epoch": 1.1165547856214812, "grad_norm": 0.296905517578125, "learning_rate": 4.9888344521437855e-05, "loss": 0.4044, "step": 165000 }, { "epoch": 1.1199382849718493, "grad_norm": 0.28965866565704346, "learning_rate": 4.988800617150282e-05, "loss": 0.403, "step": 165500 }, { "epoch": 1.1233217843222174, "grad_norm": 0.2720777094364166, "learning_rate": 4.988766782156778e-05, "loss": 0.4028, "step": 166000 }, { "epoch": 1.1267052836725855, "grad_norm": 0.320305734872818, "learning_rate": 4.988732947163274e-05, "loss": 0.4057, "step": 166500 }, { "epoch": 1.1300887830229536, "grad_norm": 0.33288514614105225, "learning_rate": 4.988699112169771e-05, "loss": 0.4038, "step": 167000 }, { "epoch": 1.1334722823733219, "grad_norm": 0.3267863392829895, "learning_rate": 4.988665277176267e-05, "loss": 0.405, "step": 167500 }, { "epoch": 1.13685578172369, "grad_norm": 0.2968672513961792, "learning_rate": 4.9886314421827634e-05, "loss": 0.4043, "step": 168000 }, { "epoch": 1.140239281074058, "grad_norm": 0.3072211742401123, "learning_rate": 4.9885976071892596e-05, "loss": 0.4031, "step": 168500 }, { "epoch": 1.143622780424426, "grad_norm": 0.28575554490089417, "learning_rate": 4.9885637721957565e-05, "loss": 0.4053, "step": 169000 }, { "epoch": 1.1470062797747942, "grad_norm": 0.3024563193321228, "learning_rate": 4.988529937202252e-05, "loss": 0.4051, "step": 169500 }, { "epoch": 1.1503897791251625, "grad_norm": 0.27619415521621704, "learning_rate": 4.988496102208748e-05, "loss": 0.4046, "step": 170000 }, { "epoch": 1.1537732784755306, "grad_norm": 0.31210121512413025, "learning_rate": 4.9884622672152445e-05, "loss": 0.4035, "step": 170500 }, { "epoch": 1.1571567778258987, "grad_norm": 0.2941502630710602, "learning_rate": 4.9884284322217414e-05, "loss": 0.4045, "step": 171000 }, { "epoch": 1.1605402771762667, "grad_norm": 0.29555612802505493, "learning_rate": 4.9883945972282376e-05, "loss": 0.4059, "step": 171500 }, { "epoch": 1.1639237765266348, "grad_norm": 0.30727556347846985, "learning_rate": 4.988360762234734e-05, "loss": 0.4031, "step": 172000 }, { "epoch": 1.1673072758770031, "grad_norm": 0.274087131023407, "learning_rate": 4.98832692724123e-05, "loss": 0.4037, "step": 172500 }, { "epoch": 1.1706907752273712, "grad_norm": 0.2855257987976074, "learning_rate": 4.988293092247727e-05, "loss": 0.4029, "step": 173000 }, { "epoch": 1.1740742745777393, "grad_norm": 0.29047608375549316, "learning_rate": 4.988259257254223e-05, "loss": 0.4054, "step": 173500 }, { "epoch": 1.1774577739281074, "grad_norm": 0.3169862926006317, "learning_rate": 4.988225422260719e-05, "loss": 0.4039, "step": 174000 }, { "epoch": 1.1808412732784754, "grad_norm": 0.286082923412323, "learning_rate": 4.9881915872672155e-05, "loss": 0.402, "step": 174500 }, { "epoch": 1.1842247726288437, "grad_norm": 0.3077320158481598, "learning_rate": 4.988157752273712e-05, "loss": 0.4038, "step": 175000 }, { "epoch": 1.1876082719792118, "grad_norm": 0.2859615683555603, "learning_rate": 4.988123917280208e-05, "loss": 0.4048, "step": 175500 }, { "epoch": 1.19099177132958, "grad_norm": 0.2765771746635437, "learning_rate": 4.988090082286704e-05, "loss": 0.4033, "step": 176000 }, { "epoch": 1.194375270679948, "grad_norm": 0.2984127700328827, "learning_rate": 4.988056247293201e-05, "loss": 0.4024, "step": 176500 }, { "epoch": 1.1977587700303163, "grad_norm": 0.28113898634910583, "learning_rate": 4.988022412299697e-05, "loss": 0.4023, "step": 177000 }, { "epoch": 1.2011422693806844, "grad_norm": 0.2798595130443573, "learning_rate": 4.9879885773061935e-05, "loss": 0.4034, "step": 177500 }, { "epoch": 1.2045257687310524, "grad_norm": 0.3147580027580261, "learning_rate": 4.98795474231269e-05, "loss": 0.4043, "step": 178000 }, { "epoch": 1.2079092680814205, "grad_norm": 0.316378653049469, "learning_rate": 4.9879209073191866e-05, "loss": 0.4018, "step": 178500 }, { "epoch": 1.2112927674317886, "grad_norm": 0.30602577328681946, "learning_rate": 4.987887072325682e-05, "loss": 0.4039, "step": 179000 }, { "epoch": 1.2146762667821567, "grad_norm": 0.3135761022567749, "learning_rate": 4.9878532373321783e-05, "loss": 0.4022, "step": 179500 }, { "epoch": 1.218059766132525, "grad_norm": 0.3081457018852234, "learning_rate": 4.9878194023386746e-05, "loss": 0.402, "step": 180000 }, { "epoch": 1.221443265482893, "grad_norm": 0.3091464638710022, "learning_rate": 4.9877855673451714e-05, "loss": 0.4013, "step": 180500 }, { "epoch": 1.2248267648332611, "grad_norm": 0.2828048765659332, "learning_rate": 4.9877517323516677e-05, "loss": 0.4016, "step": 181000 }, { "epoch": 1.2282102641836292, "grad_norm": 0.3356885612010956, "learning_rate": 4.987717897358164e-05, "loss": 0.4033, "step": 181500 }, { "epoch": 1.2315937635339975, "grad_norm": 0.2955090403556824, "learning_rate": 4.98768406236466e-05, "loss": 0.4022, "step": 182000 }, { "epoch": 1.2349772628843656, "grad_norm": 0.3026246130466461, "learning_rate": 4.987650227371157e-05, "loss": 0.4013, "step": 182500 }, { "epoch": 1.2383607622347337, "grad_norm": 0.30111822485923767, "learning_rate": 4.987616392377653e-05, "loss": 0.4028, "step": 183000 }, { "epoch": 1.2417442615851018, "grad_norm": 0.30120477080345154, "learning_rate": 4.9875825573841494e-05, "loss": 0.402, "step": 183500 }, { "epoch": 1.2451277609354698, "grad_norm": 0.3235791027545929, "learning_rate": 4.9875487223906456e-05, "loss": 0.4034, "step": 184000 }, { "epoch": 1.248511260285838, "grad_norm": 0.2712317109107971, "learning_rate": 4.987514887397142e-05, "loss": 0.4011, "step": 184500 }, { "epoch": 1.2518947596362062, "grad_norm": 0.2865242063999176, "learning_rate": 4.987481052403638e-05, "loss": 0.4022, "step": 185000 }, { "epoch": 1.2552782589865743, "grad_norm": 0.27379170060157776, "learning_rate": 4.987447217410134e-05, "loss": 0.4007, "step": 185500 }, { "epoch": 1.2586617583369424, "grad_norm": 0.2991056442260742, "learning_rate": 4.987413382416631e-05, "loss": 0.403, "step": 186000 }, { "epoch": 1.2620452576873105, "grad_norm": 0.27812930941581726, "learning_rate": 4.9873795474231273e-05, "loss": 0.4011, "step": 186500 }, { "epoch": 1.2654287570376788, "grad_norm": 0.3062492609024048, "learning_rate": 4.9873457124296236e-05, "loss": 0.4022, "step": 187000 }, { "epoch": 1.2688122563880468, "grad_norm": 0.2890298366546631, "learning_rate": 4.98731187743612e-05, "loss": 0.4017, "step": 187500 }, { "epoch": 1.272195755738415, "grad_norm": 0.3117908835411072, "learning_rate": 4.987278042442616e-05, "loss": 0.4005, "step": 188000 }, { "epoch": 1.275579255088783, "grad_norm": 0.30386489629745483, "learning_rate": 4.987244207449112e-05, "loss": 0.4026, "step": 188500 }, { "epoch": 1.278962754439151, "grad_norm": 0.3161843419075012, "learning_rate": 4.9872103724556084e-05, "loss": 0.401, "step": 189000 }, { "epoch": 1.2823462537895192, "grad_norm": 0.28740108013153076, "learning_rate": 4.9871765374621046e-05, "loss": 0.4016, "step": 189500 }, { "epoch": 1.2857297531398875, "grad_norm": 0.29486459493637085, "learning_rate": 4.9871427024686015e-05, "loss": 0.3998, "step": 190000 }, { "epoch": 1.2891132524902555, "grad_norm": 0.2807752788066864, "learning_rate": 4.987108867475098e-05, "loss": 0.402, "step": 190500 }, { "epoch": 1.2924967518406236, "grad_norm": 0.30910518765449524, "learning_rate": 4.987075032481594e-05, "loss": 0.4004, "step": 191000 }, { "epoch": 1.2958802511909917, "grad_norm": 0.302749365568161, "learning_rate": 4.98704119748809e-05, "loss": 0.4002, "step": 191500 }, { "epoch": 1.29926375054136, "grad_norm": 0.297519713640213, "learning_rate": 4.987007362494587e-05, "loss": 0.4026, "step": 192000 }, { "epoch": 1.302647249891728, "grad_norm": 0.289521187543869, "learning_rate": 4.986973527501083e-05, "loss": 0.4012, "step": 192500 }, { "epoch": 1.3060307492420962, "grad_norm": 0.3190580904483795, "learning_rate": 4.9869396925075795e-05, "loss": 0.4013, "step": 193000 }, { "epoch": 1.3094142485924642, "grad_norm": 0.2650038003921509, "learning_rate": 4.986905857514076e-05, "loss": 0.3998, "step": 193500 }, { "epoch": 1.3127977479428323, "grad_norm": 0.2761973440647125, "learning_rate": 4.986872022520572e-05, "loss": 0.401, "step": 194000 }, { "epoch": 1.3161812472932004, "grad_norm": 0.2967272400856018, "learning_rate": 4.986838187527068e-05, "loss": 0.4031, "step": 194500 }, { "epoch": 1.3195647466435687, "grad_norm": 0.29815414547920227, "learning_rate": 4.986804352533564e-05, "loss": 0.4023, "step": 195000 }, { "epoch": 1.3229482459939368, "grad_norm": 0.3200174868106842, "learning_rate": 4.986770517540061e-05, "loss": 0.401, "step": 195500 }, { "epoch": 1.3263317453443049, "grad_norm": 0.29795876145362854, "learning_rate": 4.9867366825465574e-05, "loss": 0.4003, "step": 196000 }, { "epoch": 1.329715244694673, "grad_norm": 0.2837540805339813, "learning_rate": 4.9867028475530536e-05, "loss": 0.402, "step": 196500 }, { "epoch": 1.3330987440450413, "grad_norm": 0.32125329971313477, "learning_rate": 4.98666901255955e-05, "loss": 0.4018, "step": 197000 }, { "epoch": 1.3364822433954093, "grad_norm": 0.2916601896286011, "learning_rate": 4.986635177566046e-05, "loss": 0.4005, "step": 197500 }, { "epoch": 1.3398657427457774, "grad_norm": 0.3081722557544708, "learning_rate": 4.986601342572542e-05, "loss": 0.3996, "step": 198000 }, { "epoch": 1.3432492420961455, "grad_norm": 0.33962398767471313, "learning_rate": 4.9865675075790385e-05, "loss": 0.4019, "step": 198500 }, { "epoch": 1.3466327414465136, "grad_norm": 0.29512572288513184, "learning_rate": 4.986533672585535e-05, "loss": 0.401, "step": 199000 }, { "epoch": 1.3500162407968817, "grad_norm": 0.3279782831668854, "learning_rate": 4.9864998375920316e-05, "loss": 0.4017, "step": 199500 }, { "epoch": 1.35339974014725, "grad_norm": 0.3008809983730316, "learning_rate": 4.986466002598528e-05, "loss": 0.4008, "step": 200000 }, { "epoch": 1.356783239497618, "grad_norm": 0.2847321629524231, "learning_rate": 4.986432167605024e-05, "loss": 0.3978, "step": 200500 }, { "epoch": 1.3601667388479861, "grad_norm": 0.29812660813331604, "learning_rate": 4.98639833261152e-05, "loss": 0.3993, "step": 201000 }, { "epoch": 1.3635502381983542, "grad_norm": 0.3119332194328308, "learning_rate": 4.986364497618017e-05, "loss": 0.4002, "step": 201500 }, { "epoch": 1.3669337375487225, "grad_norm": 0.29071831703186035, "learning_rate": 4.986330662624513e-05, "loss": 0.399, "step": 202000 }, { "epoch": 1.3703172368990906, "grad_norm": 0.2947494089603424, "learning_rate": 4.9862968276310095e-05, "loss": 0.401, "step": 202500 }, { "epoch": 1.3737007362494587, "grad_norm": 0.3152572810649872, "learning_rate": 4.986262992637506e-05, "loss": 0.4003, "step": 203000 }, { "epoch": 1.3770842355998267, "grad_norm": 0.3083915710449219, "learning_rate": 4.986229157644002e-05, "loss": 0.4017, "step": 203500 }, { "epoch": 1.3804677349501948, "grad_norm": 0.3038440942764282, "learning_rate": 4.986195322650498e-05, "loss": 0.3995, "step": 204000 }, { "epoch": 1.383851234300563, "grad_norm": 0.3062540590763092, "learning_rate": 4.9861614876569944e-05, "loss": 0.401, "step": 204500 }, { "epoch": 1.3872347336509312, "grad_norm": 0.3120565116405487, "learning_rate": 4.9861276526634906e-05, "loss": 0.3996, "step": 205000 }, { "epoch": 1.3906182330012993, "grad_norm": 0.2866579294204712, "learning_rate": 4.9860938176699875e-05, "loss": 0.4006, "step": 205500 }, { "epoch": 1.3940017323516674, "grad_norm": 0.2914845943450928, "learning_rate": 4.986059982676484e-05, "loss": 0.3988, "step": 206000 }, { "epoch": 1.3973852317020354, "grad_norm": 0.2740603983402252, "learning_rate": 4.98602614768298e-05, "loss": 0.399, "step": 206500 }, { "epoch": 1.4007687310524037, "grad_norm": 0.289460152387619, "learning_rate": 4.985992312689476e-05, "loss": 0.4, "step": 207000 }, { "epoch": 1.4041522304027718, "grad_norm": 0.29983991384506226, "learning_rate": 4.985958477695973e-05, "loss": 0.3996, "step": 207500 }, { "epoch": 1.40753572975314, "grad_norm": 0.3190790116786957, "learning_rate": 4.9859246427024685e-05, "loss": 0.4007, "step": 208000 }, { "epoch": 1.410919229103508, "grad_norm": 0.2991366982460022, "learning_rate": 4.985890807708965e-05, "loss": 0.4003, "step": 208500 }, { "epoch": 1.414302728453876, "grad_norm": 0.29199305176734924, "learning_rate": 4.9858569727154616e-05, "loss": 0.3994, "step": 209000 }, { "epoch": 1.4176862278042441, "grad_norm": 0.32636138796806335, "learning_rate": 4.985823137721958e-05, "loss": 0.4004, "step": 209500 }, { "epoch": 1.4210697271546124, "grad_norm": 0.3044842481613159, "learning_rate": 4.985789302728454e-05, "loss": 0.3996, "step": 210000 }, { "epoch": 1.4244532265049805, "grad_norm": 0.28607505559921265, "learning_rate": 4.98575546773495e-05, "loss": 0.3985, "step": 210500 }, { "epoch": 1.4278367258553486, "grad_norm": 0.3226557970046997, "learning_rate": 4.985721632741447e-05, "loss": 0.3993, "step": 211000 }, { "epoch": 1.4312202252057167, "grad_norm": 0.31989920139312744, "learning_rate": 4.9856877977479434e-05, "loss": 0.4005, "step": 211500 }, { "epoch": 1.434603724556085, "grad_norm": 0.2904207408428192, "learning_rate": 4.9856539627544396e-05, "loss": 0.4013, "step": 212000 }, { "epoch": 1.437987223906453, "grad_norm": 0.28892791271209717, "learning_rate": 4.985620127760935e-05, "loss": 0.4, "step": 212500 }, { "epoch": 1.4413707232568211, "grad_norm": 0.2917577028274536, "learning_rate": 4.985586292767432e-05, "loss": 0.3991, "step": 213000 }, { "epoch": 1.4447542226071892, "grad_norm": 0.3064013123512268, "learning_rate": 4.985552457773928e-05, "loss": 0.3971, "step": 213500 }, { "epoch": 1.4481377219575573, "grad_norm": 0.30525708198547363, "learning_rate": 4.9855186227804244e-05, "loss": 0.4017, "step": 214000 }, { "epoch": 1.4515212213079254, "grad_norm": 0.26815348863601685, "learning_rate": 4.9854847877869206e-05, "loss": 0.3983, "step": 214500 }, { "epoch": 1.4549047206582937, "grad_norm": 0.3009331524372101, "learning_rate": 4.9854509527934175e-05, "loss": 0.3989, "step": 215000 }, { "epoch": 1.4582882200086618, "grad_norm": 0.3506813049316406, "learning_rate": 4.985417117799914e-05, "loss": 0.4001, "step": 215500 }, { "epoch": 1.4616717193590298, "grad_norm": 0.30114126205444336, "learning_rate": 4.98538328280641e-05, "loss": 0.4005, "step": 216000 }, { "epoch": 1.4650552187093981, "grad_norm": 0.27080991864204407, "learning_rate": 4.985349447812906e-05, "loss": 0.3972, "step": 216500 }, { "epoch": 1.4684387180597662, "grad_norm": 0.32722705602645874, "learning_rate": 4.985315612819403e-05, "loss": 0.4001, "step": 217000 }, { "epoch": 1.4718222174101343, "grad_norm": 0.32709217071533203, "learning_rate": 4.9852817778258986e-05, "loss": 0.3984, "step": 217500 }, { "epoch": 1.4752057167605024, "grad_norm": 0.30302122235298157, "learning_rate": 4.985247942832395e-05, "loss": 0.3968, "step": 218000 }, { "epoch": 1.4785892161108705, "grad_norm": 0.30743733048439026, "learning_rate": 4.985214107838892e-05, "loss": 0.3999, "step": 218500 }, { "epoch": 1.4819727154612385, "grad_norm": 0.2864611744880676, "learning_rate": 4.985180272845388e-05, "loss": 0.3998, "step": 219000 }, { "epoch": 1.4853562148116066, "grad_norm": 0.33113473653793335, "learning_rate": 4.985146437851884e-05, "loss": 0.3991, "step": 219500 }, { "epoch": 1.488739714161975, "grad_norm": 0.30229973793029785, "learning_rate": 4.98511260285838e-05, "loss": 0.3981, "step": 220000 }, { "epoch": 1.492123213512343, "grad_norm": 0.30289003252983093, "learning_rate": 4.985078767864877e-05, "loss": 0.3991, "step": 220500 }, { "epoch": 1.495506712862711, "grad_norm": 0.32852068543434143, "learning_rate": 4.9850449328713734e-05, "loss": 0.3999, "step": 221000 }, { "epoch": 1.4988902122130794, "grad_norm": 0.3039749562740326, "learning_rate": 4.9850110978778696e-05, "loss": 0.3986, "step": 221500 }, { "epoch": 1.5022737115634475, "grad_norm": 0.30670079588890076, "learning_rate": 4.984977262884365e-05, "loss": 0.3992, "step": 222000 }, { "epoch": 1.5056572109138155, "grad_norm": 0.27520325779914856, "learning_rate": 4.984943427890862e-05, "loss": 0.3998, "step": 222500 }, { "epoch": 1.5090407102641836, "grad_norm": 0.28639331459999084, "learning_rate": 4.984909592897358e-05, "loss": 0.3985, "step": 223000 }, { "epoch": 1.5124242096145517, "grad_norm": 0.3116671144962311, "learning_rate": 4.9848757579038545e-05, "loss": 0.3987, "step": 223500 }, { "epoch": 1.5158077089649198, "grad_norm": 0.31099647283554077, "learning_rate": 4.984841922910351e-05, "loss": 0.3991, "step": 224000 }, { "epoch": 1.5191912083152879, "grad_norm": 0.3141653537750244, "learning_rate": 4.9848080879168476e-05, "loss": 0.3993, "step": 224500 }, { "epoch": 1.5225747076656562, "grad_norm": 0.27565282583236694, "learning_rate": 4.984774252923344e-05, "loss": 0.3973, "step": 225000 }, { "epoch": 1.5259582070160242, "grad_norm": 0.33048033714294434, "learning_rate": 4.98474041792984e-05, "loss": 0.3977, "step": 225500 }, { "epoch": 1.5293417063663923, "grad_norm": 0.3040478527545929, "learning_rate": 4.984706582936336e-05, "loss": 0.3989, "step": 226000 }, { "epoch": 1.5327252057167606, "grad_norm": 0.3027140200138092, "learning_rate": 4.984672747942833e-05, "loss": 0.3993, "step": 226500 }, { "epoch": 1.5361087050671287, "grad_norm": 0.2882574498653412, "learning_rate": 4.9846389129493287e-05, "loss": 0.4007, "step": 227000 }, { "epoch": 1.5394922044174968, "grad_norm": 0.3169984221458435, "learning_rate": 4.984605077955825e-05, "loss": 0.3964, "step": 227500 }, { "epoch": 1.5428757037678649, "grad_norm": 0.26678523421287537, "learning_rate": 4.984571242962322e-05, "loss": 0.3981, "step": 228000 }, { "epoch": 1.546259203118233, "grad_norm": 0.30455484986305237, "learning_rate": 4.984537407968818e-05, "loss": 0.3974, "step": 228500 }, { "epoch": 1.549642702468601, "grad_norm": 0.30882003903388977, "learning_rate": 4.984503572975314e-05, "loss": 0.3983, "step": 229000 }, { "epoch": 1.553026201818969, "grad_norm": 0.29881536960601807, "learning_rate": 4.9844697379818104e-05, "loss": 0.3965, "step": 229500 }, { "epoch": 1.5564097011693374, "grad_norm": 0.30941638350486755, "learning_rate": 4.984435902988307e-05, "loss": 0.3958, "step": 230000 }, { "epoch": 1.5597932005197055, "grad_norm": 0.28484824299812317, "learning_rate": 4.9844020679948035e-05, "loss": 0.3972, "step": 230500 }, { "epoch": 1.5631766998700736, "grad_norm": 0.27541229128837585, "learning_rate": 4.9843682330013e-05, "loss": 0.3974, "step": 231000 }, { "epoch": 1.5665601992204419, "grad_norm": 0.2993621230125427, "learning_rate": 4.984334398007795e-05, "loss": 0.3989, "step": 231500 }, { "epoch": 1.56994369857081, "grad_norm": 0.28241676092147827, "learning_rate": 4.984300563014292e-05, "loss": 0.3983, "step": 232000 }, { "epoch": 1.573327197921178, "grad_norm": 0.2912180423736572, "learning_rate": 4.9842667280207883e-05, "loss": 0.3972, "step": 232500 }, { "epoch": 1.5767106972715461, "grad_norm": 0.26539215445518494, "learning_rate": 4.9842328930272846e-05, "loss": 0.3978, "step": 233000 }, { "epoch": 1.5800941966219142, "grad_norm": 0.29652032256126404, "learning_rate": 4.984199058033781e-05, "loss": 0.3972, "step": 233500 }, { "epoch": 1.5834776959722823, "grad_norm": 0.31173455715179443, "learning_rate": 4.984165223040278e-05, "loss": 0.3987, "step": 234000 }, { "epoch": 1.5868611953226504, "grad_norm": 0.3123958110809326, "learning_rate": 4.984131388046774e-05, "loss": 0.3975, "step": 234500 }, { "epoch": 1.5902446946730187, "grad_norm": 0.3172590732574463, "learning_rate": 4.98409755305327e-05, "loss": 0.3962, "step": 235000 }, { "epoch": 1.5936281940233867, "grad_norm": 0.2888542413711548, "learning_rate": 4.984063718059766e-05, "loss": 0.399, "step": 235500 }, { "epoch": 1.597011693373755, "grad_norm": 0.3058955669403076, "learning_rate": 4.984029883066263e-05, "loss": 0.3965, "step": 236000 }, { "epoch": 1.6003951927241231, "grad_norm": 0.2985432744026184, "learning_rate": 4.983996048072759e-05, "loss": 0.3978, "step": 236500 }, { "epoch": 1.6037786920744912, "grad_norm": 0.3273903429508209, "learning_rate": 4.983962213079255e-05, "loss": 0.398, "step": 237000 }, { "epoch": 1.6071621914248593, "grad_norm": 0.32129883766174316, "learning_rate": 4.983928378085752e-05, "loss": 0.3969, "step": 237500 }, { "epoch": 1.6105456907752274, "grad_norm": 0.30501073598861694, "learning_rate": 4.983894543092248e-05, "loss": 0.3978, "step": 238000 }, { "epoch": 1.6139291901255954, "grad_norm": 0.3106651306152344, "learning_rate": 4.983860708098744e-05, "loss": 0.3971, "step": 238500 }, { "epoch": 1.6173126894759635, "grad_norm": 0.2970840036869049, "learning_rate": 4.9838268731052405e-05, "loss": 0.3965, "step": 239000 }, { "epoch": 1.6206961888263316, "grad_norm": 0.31059491634368896, "learning_rate": 4.9837930381117374e-05, "loss": 0.3998, "step": 239500 }, { "epoch": 1.6240796881767, "grad_norm": 0.3194037675857544, "learning_rate": 4.9837592031182336e-05, "loss": 0.3978, "step": 240000 }, { "epoch": 1.627463187527068, "grad_norm": 0.2921430766582489, "learning_rate": 4.98372536812473e-05, "loss": 0.3974, "step": 240500 }, { "epoch": 1.6308466868774363, "grad_norm": 0.36188748478889465, "learning_rate": 4.983691533131225e-05, "loss": 0.3976, "step": 241000 }, { "epoch": 1.6342301862278044, "grad_norm": 0.266747385263443, "learning_rate": 4.983657698137722e-05, "loss": 0.3961, "step": 241500 }, { "epoch": 1.6376136855781724, "grad_norm": 0.3226276636123657, "learning_rate": 4.9836238631442184e-05, "loss": 0.3961, "step": 242000 }, { "epoch": 1.6409971849285405, "grad_norm": 0.273739218711853, "learning_rate": 4.9835900281507146e-05, "loss": 0.3969, "step": 242500 }, { "epoch": 1.6443806842789086, "grad_norm": 0.2933656871318817, "learning_rate": 4.983556193157211e-05, "loss": 0.3966, "step": 243000 }, { "epoch": 1.6477641836292767, "grad_norm": 0.3296910226345062, "learning_rate": 4.983522358163708e-05, "loss": 0.3973, "step": 243500 }, { "epoch": 1.6511476829796448, "grad_norm": 0.3250204622745514, "learning_rate": 4.983488523170204e-05, "loss": 0.3948, "step": 244000 }, { "epoch": 1.6545311823300128, "grad_norm": 0.30639970302581787, "learning_rate": 4.9834546881767e-05, "loss": 0.3957, "step": 244500 }, { "epoch": 1.6579146816803811, "grad_norm": 0.31183937191963196, "learning_rate": 4.9834208531831964e-05, "loss": 0.3956, "step": 245000 }, { "epoch": 1.6612981810307492, "grad_norm": 0.3072659969329834, "learning_rate": 4.983387018189693e-05, "loss": 0.3954, "step": 245500 }, { "epoch": 1.6646816803811175, "grad_norm": 0.31615200638771057, "learning_rate": 4.983353183196189e-05, "loss": 0.3969, "step": 246000 }, { "epoch": 1.6680651797314856, "grad_norm": 0.292656272649765, "learning_rate": 4.983319348202685e-05, "loss": 0.398, "step": 246500 }, { "epoch": 1.6714486790818537, "grad_norm": 0.27612897753715515, "learning_rate": 4.983285513209182e-05, "loss": 0.3957, "step": 247000 }, { "epoch": 1.6748321784322218, "grad_norm": 0.3081589937210083, "learning_rate": 4.983251678215678e-05, "loss": 0.3952, "step": 247500 }, { "epoch": 1.6782156777825898, "grad_norm": 0.2990529239177704, "learning_rate": 4.983217843222174e-05, "loss": 0.3969, "step": 248000 }, { "epoch": 1.681599177132958, "grad_norm": 0.34440121054649353, "learning_rate": 4.9831840082286705e-05, "loss": 0.3971, "step": 248500 }, { "epoch": 1.684982676483326, "grad_norm": 0.2921518385410309, "learning_rate": 4.9831501732351674e-05, "loss": 0.3959, "step": 249000 }, { "epoch": 1.688366175833694, "grad_norm": 0.296103835105896, "learning_rate": 4.9831163382416636e-05, "loss": 0.3966, "step": 249500 }, { "epoch": 1.6917496751840624, "grad_norm": 0.3215673565864563, "learning_rate": 4.98308250324816e-05, "loss": 0.3963, "step": 250000 }, { "epoch": 1.6951331745344305, "grad_norm": 0.2866440713405609, "learning_rate": 4.9830486682546554e-05, "loss": 0.3959, "step": 250500 }, { "epoch": 1.6985166738847988, "grad_norm": 0.33129727840423584, "learning_rate": 4.983014833261152e-05, "loss": 0.3975, "step": 251000 }, { "epoch": 1.7019001732351668, "grad_norm": 0.2761145830154419, "learning_rate": 4.9829809982676485e-05, "loss": 0.3966, "step": 251500 }, { "epoch": 1.705283672585535, "grad_norm": 0.29251420497894287, "learning_rate": 4.982947163274145e-05, "loss": 0.3984, "step": 252000 }, { "epoch": 1.708667171935903, "grad_norm": 0.34012719988822937, "learning_rate": 4.982913328280641e-05, "loss": 0.3985, "step": 252500 }, { "epoch": 1.712050671286271, "grad_norm": 0.2798214554786682, "learning_rate": 4.982879493287138e-05, "loss": 0.3971, "step": 253000 }, { "epoch": 1.7154341706366392, "grad_norm": 0.30469515919685364, "learning_rate": 4.982845658293634e-05, "loss": 0.3976, "step": 253500 }, { "epoch": 1.7188176699870072, "grad_norm": 0.28688544034957886, "learning_rate": 4.98281182330013e-05, "loss": 0.396, "step": 254000 }, { "epoch": 1.7222011693373753, "grad_norm": 0.2807927429676056, "learning_rate": 4.9827779883066264e-05, "loss": 0.3953, "step": 254500 }, { "epoch": 1.7255846686877436, "grad_norm": 0.3119663596153259, "learning_rate": 4.982744153313123e-05, "loss": 0.3974, "step": 255000 }, { "epoch": 1.7289681680381117, "grad_norm": 0.3375037610530853, "learning_rate": 4.982710318319619e-05, "loss": 0.3963, "step": 255500 }, { "epoch": 1.73235166738848, "grad_norm": 0.30482643842697144, "learning_rate": 4.982676483326115e-05, "loss": 0.3958, "step": 256000 }, { "epoch": 1.735735166738848, "grad_norm": 0.2730487287044525, "learning_rate": 4.982642648332612e-05, "loss": 0.3954, "step": 256500 }, { "epoch": 1.7391186660892162, "grad_norm": 0.2868850529193878, "learning_rate": 4.982608813339108e-05, "loss": 0.3941, "step": 257000 }, { "epoch": 1.7425021654395842, "grad_norm": 0.31031176447868347, "learning_rate": 4.9825749783456044e-05, "loss": 0.3962, "step": 257500 }, { "epoch": 1.7458856647899523, "grad_norm": 0.2912202775478363, "learning_rate": 4.9825411433521006e-05, "loss": 0.3962, "step": 258000 }, { "epoch": 1.7492691641403204, "grad_norm": 0.32004088163375854, "learning_rate": 4.982507308358597e-05, "loss": 0.3966, "step": 258500 }, { "epoch": 1.7526526634906885, "grad_norm": 0.2900611162185669, "learning_rate": 4.982473473365094e-05, "loss": 0.3972, "step": 259000 }, { "epoch": 1.7560361628410566, "grad_norm": 0.2974969446659088, "learning_rate": 4.98243963837159e-05, "loss": 0.3974, "step": 259500 }, { "epoch": 1.7594196621914249, "grad_norm": 0.2999788522720337, "learning_rate": 4.982405803378086e-05, "loss": 0.3964, "step": 260000 }, { "epoch": 1.762803161541793, "grad_norm": 0.3285123407840729, "learning_rate": 4.982371968384582e-05, "loss": 0.3955, "step": 260500 }, { "epoch": 1.7661866608921613, "grad_norm": 0.30693507194519043, "learning_rate": 4.9823381333910785e-05, "loss": 0.395, "step": 261000 }, { "epoch": 1.7695701602425293, "grad_norm": 0.2624507546424866, "learning_rate": 4.982304298397575e-05, "loss": 0.3971, "step": 261500 }, { "epoch": 1.7729536595928974, "grad_norm": 0.3088163137435913, "learning_rate": 4.982270463404071e-05, "loss": 0.3956, "step": 262000 }, { "epoch": 1.7763371589432655, "grad_norm": 0.3004806935787201, "learning_rate": 4.982236628410568e-05, "loss": 0.396, "step": 262500 }, { "epoch": 1.7797206582936336, "grad_norm": 0.3055258095264435, "learning_rate": 4.982202793417064e-05, "loss": 0.3967, "step": 263000 }, { "epoch": 1.7831041576440017, "grad_norm": 0.29731282591819763, "learning_rate": 4.98216895842356e-05, "loss": 0.3951, "step": 263500 }, { "epoch": 1.7864876569943697, "grad_norm": 0.3096189498901367, "learning_rate": 4.9821351234300565e-05, "loss": 0.3968, "step": 264000 }, { "epoch": 1.789871156344738, "grad_norm": 0.30249008536338806, "learning_rate": 4.9821012884365534e-05, "loss": 0.3965, "step": 264500 }, { "epoch": 1.7932546556951061, "grad_norm": 0.28994259238243103, "learning_rate": 4.982067453443049e-05, "loss": 0.3958, "step": 265000 }, { "epoch": 1.7966381550454742, "grad_norm": 0.3007451295852661, "learning_rate": 4.982033618449545e-05, "loss": 0.3953, "step": 265500 }, { "epoch": 1.8000216543958425, "grad_norm": 0.3501751124858856, "learning_rate": 4.981999783456042e-05, "loss": 0.3946, "step": 266000 }, { "epoch": 1.8034051537462106, "grad_norm": 0.33922311663627625, "learning_rate": 4.981965948462538e-05, "loss": 0.3955, "step": 266500 }, { "epoch": 1.8067886530965787, "grad_norm": 0.3255097270011902, "learning_rate": 4.9819321134690344e-05, "loss": 0.3952, "step": 267000 }, { "epoch": 1.8101721524469467, "grad_norm": 0.3020866811275482, "learning_rate": 4.9818982784755307e-05, "loss": 0.3954, "step": 267500 }, { "epoch": 1.8135556517973148, "grad_norm": 0.3044784665107727, "learning_rate": 4.981864443482027e-05, "loss": 0.3946, "step": 268000 }, { "epoch": 1.816939151147683, "grad_norm": 0.3110942840576172, "learning_rate": 4.981830608488524e-05, "loss": 0.3946, "step": 268500 }, { "epoch": 1.820322650498051, "grad_norm": 0.3102254271507263, "learning_rate": 4.98179677349502e-05, "loss": 0.395, "step": 269000 }, { "epoch": 1.8237061498484193, "grad_norm": 0.29582446813583374, "learning_rate": 4.981762938501516e-05, "loss": 0.3942, "step": 269500 }, { "epoch": 1.8270896491987874, "grad_norm": 0.30391183495521545, "learning_rate": 4.9817291035080124e-05, "loss": 0.3962, "step": 270000 }, { "epoch": 1.8304731485491554, "grad_norm": 0.3510082960128784, "learning_rate": 4.9816952685145086e-05, "loss": 0.3936, "step": 270500 }, { "epoch": 1.8338566478995237, "grad_norm": 0.31101343035697937, "learning_rate": 4.981661433521005e-05, "loss": 0.3962, "step": 271000 }, { "epoch": 1.8372401472498918, "grad_norm": 0.295521080493927, "learning_rate": 4.981627598527501e-05, "loss": 0.3963, "step": 271500 }, { "epoch": 1.84062364660026, "grad_norm": 0.2972811758518219, "learning_rate": 4.981593763533998e-05, "loss": 0.3953, "step": 272000 }, { "epoch": 1.844007145950628, "grad_norm": 0.29879501461982727, "learning_rate": 4.981559928540494e-05, "loss": 0.3944, "step": 272500 }, { "epoch": 1.847390645300996, "grad_norm": 0.29525479674339294, "learning_rate": 4.9815260935469903e-05, "loss": 0.395, "step": 273000 }, { "epoch": 1.8507741446513641, "grad_norm": 0.3053932189941406, "learning_rate": 4.9814922585534866e-05, "loss": 0.3944, "step": 273500 }, { "epoch": 1.8541576440017322, "grad_norm": 0.30265629291534424, "learning_rate": 4.9814584235599834e-05, "loss": 0.3969, "step": 274000 }, { "epoch": 1.8575411433521005, "grad_norm": 0.30741703510284424, "learning_rate": 4.981424588566479e-05, "loss": 0.3948, "step": 274500 }, { "epoch": 1.8609246427024686, "grad_norm": 0.33393386006355286, "learning_rate": 4.981390753572975e-05, "loss": 0.3967, "step": 275000 }, { "epoch": 1.8643081420528367, "grad_norm": 0.28982001543045044, "learning_rate": 4.9813569185794714e-05, "loss": 0.3959, "step": 275500 }, { "epoch": 1.867691641403205, "grad_norm": 0.32962051033973694, "learning_rate": 4.981323083585968e-05, "loss": 0.3954, "step": 276000 }, { "epoch": 1.871075140753573, "grad_norm": 0.3257775902748108, "learning_rate": 4.9812892485924645e-05, "loss": 0.395, "step": 276500 }, { "epoch": 1.8744586401039411, "grad_norm": 0.3227112889289856, "learning_rate": 4.981255413598961e-05, "loss": 0.3957, "step": 277000 }, { "epoch": 1.8778421394543092, "grad_norm": 0.3045206665992737, "learning_rate": 4.981221578605457e-05, "loss": 0.3946, "step": 277500 }, { "epoch": 1.8812256388046773, "grad_norm": 0.33415108919143677, "learning_rate": 4.981187743611954e-05, "loss": 0.3946, "step": 278000 }, { "epoch": 1.8846091381550454, "grad_norm": 0.28261253237724304, "learning_rate": 4.98115390861845e-05, "loss": 0.3956, "step": 278500 }, { "epoch": 1.8879926375054135, "grad_norm": 0.29565319418907166, "learning_rate": 4.981120073624946e-05, "loss": 0.3949, "step": 279000 }, { "epoch": 1.8913761368557818, "grad_norm": 0.33516836166381836, "learning_rate": 4.9810862386314425e-05, "loss": 0.394, "step": 279500 }, { "epoch": 1.8947596362061498, "grad_norm": 0.3220950663089752, "learning_rate": 4.981052403637939e-05, "loss": 0.3936, "step": 280000 }, { "epoch": 1.898143135556518, "grad_norm": 0.28635403513908386, "learning_rate": 4.981018568644435e-05, "loss": 0.3956, "step": 280500 }, { "epoch": 1.9015266349068862, "grad_norm": 0.317091703414917, "learning_rate": 4.980984733650931e-05, "loss": 0.3954, "step": 281000 }, { "epoch": 1.9049101342572543, "grad_norm": 0.3175191283226013, "learning_rate": 4.980950898657428e-05, "loss": 0.3944, "step": 281500 }, { "epoch": 1.9082936336076224, "grad_norm": 0.29234206676483154, "learning_rate": 4.980917063663924e-05, "loss": 0.3933, "step": 282000 }, { "epoch": 1.9116771329579905, "grad_norm": 0.30621030926704407, "learning_rate": 4.9808832286704204e-05, "loss": 0.3951, "step": 282500 }, { "epoch": 1.9150606323083585, "grad_norm": 0.30070793628692627, "learning_rate": 4.9808493936769166e-05, "loss": 0.3943, "step": 283000 }, { "epoch": 1.9184441316587266, "grad_norm": 0.3219392001628876, "learning_rate": 4.9808155586834135e-05, "loss": 0.3927, "step": 283500 }, { "epoch": 1.9218276310090947, "grad_norm": 0.31432652473449707, "learning_rate": 4.980781723689909e-05, "loss": 0.3948, "step": 284000 }, { "epoch": 1.925211130359463, "grad_norm": 0.3412748873233795, "learning_rate": 4.980747888696405e-05, "loss": 0.3937, "step": 284500 }, { "epoch": 1.928594629709831, "grad_norm": 0.3151736259460449, "learning_rate": 4.9807140537029015e-05, "loss": 0.3944, "step": 285000 }, { "epoch": 1.9319781290601992, "grad_norm": 0.3102365732192993, "learning_rate": 4.9806802187093984e-05, "loss": 0.3928, "step": 285500 }, { "epoch": 1.9353616284105675, "grad_norm": 0.29650557041168213, "learning_rate": 4.9806463837158946e-05, "loss": 0.3931, "step": 286000 }, { "epoch": 1.9387451277609355, "grad_norm": 0.30163249373435974, "learning_rate": 4.980612548722391e-05, "loss": 0.395, "step": 286500 }, { "epoch": 1.9421286271113036, "grad_norm": 0.3102872967720032, "learning_rate": 4.980578713728887e-05, "loss": 0.3932, "step": 287000 }, { "epoch": 1.9455121264616717, "grad_norm": 0.33082282543182373, "learning_rate": 4.980544878735384e-05, "loss": 0.3938, "step": 287500 }, { "epoch": 1.9488956258120398, "grad_norm": 0.3110976219177246, "learning_rate": 4.98051104374188e-05, "loss": 0.3952, "step": 288000 }, { "epoch": 1.9522791251624079, "grad_norm": 0.33474001288414, "learning_rate": 4.980477208748376e-05, "loss": 0.3951, "step": 288500 }, { "epoch": 1.955662624512776, "grad_norm": 0.30204835534095764, "learning_rate": 4.9804433737548725e-05, "loss": 0.3932, "step": 289000 }, { "epoch": 1.9590461238631443, "grad_norm": 0.2943308651447296, "learning_rate": 4.980409538761369e-05, "loss": 0.393, "step": 289500 }, { "epoch": 1.9624296232135123, "grad_norm": 0.3059576749801636, "learning_rate": 4.980375703767865e-05, "loss": 0.3941, "step": 290000 }, { "epoch": 1.9658131225638806, "grad_norm": 0.2936910390853882, "learning_rate": 4.980341868774361e-05, "loss": 0.3943, "step": 290500 }, { "epoch": 1.9691966219142487, "grad_norm": 0.3129732608795166, "learning_rate": 4.980308033780858e-05, "loss": 0.3943, "step": 291000 }, { "epoch": 1.9725801212646168, "grad_norm": 0.32157155871391296, "learning_rate": 4.980274198787354e-05, "loss": 0.3937, "step": 291500 }, { "epoch": 1.9759636206149849, "grad_norm": 0.30379122495651245, "learning_rate": 4.9802403637938505e-05, "loss": 0.3934, "step": 292000 }, { "epoch": 1.979347119965353, "grad_norm": 0.31935739517211914, "learning_rate": 4.980206528800347e-05, "loss": 0.3946, "step": 292500 }, { "epoch": 1.982730619315721, "grad_norm": 0.3241221010684967, "learning_rate": 4.9801726938068436e-05, "loss": 0.3928, "step": 293000 }, { "epoch": 1.9861141186660891, "grad_norm": 0.30251502990722656, "learning_rate": 4.980138858813339e-05, "loss": 0.3938, "step": 293500 }, { "epoch": 1.9894976180164572, "grad_norm": 0.28550073504447937, "learning_rate": 4.980105023819835e-05, "loss": 0.3943, "step": 294000 }, { "epoch": 1.9928811173668255, "grad_norm": 0.33502620458602905, "learning_rate": 4.9800711888263315e-05, "loss": 0.393, "step": 294500 }, { "epoch": 1.9962646167171936, "grad_norm": 0.30803167819976807, "learning_rate": 4.9800373538328284e-05, "loss": 0.3915, "step": 295000 }, { "epoch": 1.9996481160675619, "grad_norm": 0.3257239758968353, "learning_rate": 4.9800035188393246e-05, "loss": 0.3932, "step": 295500 }, { "epoch": 2.0, "eval_accuracy": 0.8510164531812664, "eval_loss": 0.6058459877967834, "eval_runtime": 3354.2545, "eval_samples_per_second": 86.679, "eval_steps_per_second": 5.418, "step": 295552 }, { "epoch": 2.00303161541793, "grad_norm": 0.33320993185043335, "learning_rate": 4.979969683845821e-05, "loss": 0.3923, "step": 296000 }, { "epoch": 2.006415114768298, "grad_norm": 0.31498461961746216, "learning_rate": 4.979935848852317e-05, "loss": 0.3928, "step": 296500 }, { "epoch": 2.009798614118666, "grad_norm": 0.28952863812446594, "learning_rate": 4.979902013858814e-05, "loss": 0.3926, "step": 297000 }, { "epoch": 2.013182113469034, "grad_norm": 0.272152304649353, "learning_rate": 4.97986817886531e-05, "loss": 0.3907, "step": 297500 }, { "epoch": 2.0165656128194023, "grad_norm": 0.31934404373168945, "learning_rate": 4.9798343438718064e-05, "loss": 0.391, "step": 298000 }, { "epoch": 2.0199491121697704, "grad_norm": 0.3041870594024658, "learning_rate": 4.9798005088783026e-05, "loss": 0.3902, "step": 298500 }, { "epoch": 2.0233326115201384, "grad_norm": 0.3236134350299835, "learning_rate": 4.979766673884799e-05, "loss": 0.3927, "step": 299000 }, { "epoch": 2.0267161108705065, "grad_norm": 0.2894350290298462, "learning_rate": 4.979732838891295e-05, "loss": 0.3906, "step": 299500 }, { "epoch": 2.030099610220875, "grad_norm": 0.2992889881134033, "learning_rate": 4.979699003897791e-05, "loss": 0.3929, "step": 300000 }, { "epoch": 2.033483109571243, "grad_norm": 0.29241079092025757, "learning_rate": 4.979665168904288e-05, "loss": 0.3923, "step": 300500 }, { "epoch": 2.036866608921611, "grad_norm": 0.2949068546295166, "learning_rate": 4.979631333910784e-05, "loss": 0.3917, "step": 301000 }, { "epoch": 2.0402501082719793, "grad_norm": 0.3157097101211548, "learning_rate": 4.9795974989172805e-05, "loss": 0.392, "step": 301500 }, { "epoch": 2.0436336076223474, "grad_norm": 0.31389084458351135, "learning_rate": 4.979563663923777e-05, "loss": 0.3926, "step": 302000 }, { "epoch": 2.0470171069727154, "grad_norm": 0.3211916387081146, "learning_rate": 4.9795298289302736e-05, "loss": 0.3931, "step": 302500 }, { "epoch": 2.0504006063230835, "grad_norm": 0.33857420086860657, "learning_rate": 4.979495993936769e-05, "loss": 0.3892, "step": 303000 }, { "epoch": 2.0537841056734516, "grad_norm": 0.30625852942466736, "learning_rate": 4.9794621589432654e-05, "loss": 0.3897, "step": 303500 }, { "epoch": 2.0571676050238197, "grad_norm": 0.32721972465515137, "learning_rate": 4.9794283239497616e-05, "loss": 0.3922, "step": 304000 }, { "epoch": 2.0605511043741878, "grad_norm": 0.3397035002708435, "learning_rate": 4.9793944889562585e-05, "loss": 0.3926, "step": 304500 }, { "epoch": 2.0639346037245563, "grad_norm": 0.3127745985984802, "learning_rate": 4.979360653962755e-05, "loss": 0.3907, "step": 305000 }, { "epoch": 2.0673181030749244, "grad_norm": 0.3054274618625641, "learning_rate": 4.979326818969251e-05, "loss": 0.3931, "step": 305500 }, { "epoch": 2.0707016024252924, "grad_norm": 0.3376386761665344, "learning_rate": 4.979292983975747e-05, "loss": 0.3932, "step": 306000 }, { "epoch": 2.0740851017756605, "grad_norm": 0.30187106132507324, "learning_rate": 4.979259148982244e-05, "loss": 0.3915, "step": 306500 }, { "epoch": 2.0774686011260286, "grad_norm": 0.2863011956214905, "learning_rate": 4.97922531398874e-05, "loss": 0.3887, "step": 307000 }, { "epoch": 2.0808521004763967, "grad_norm": 0.29724055528640747, "learning_rate": 4.9791914789952364e-05, "loss": 0.3926, "step": 307500 }, { "epoch": 2.0842355998267648, "grad_norm": 0.3097030222415924, "learning_rate": 4.9791576440017326e-05, "loss": 0.3916, "step": 308000 }, { "epoch": 2.087619099177133, "grad_norm": 0.30876824259757996, "learning_rate": 4.979123809008229e-05, "loss": 0.3917, "step": 308500 }, { "epoch": 2.091002598527501, "grad_norm": 0.33456218242645264, "learning_rate": 4.979089974014725e-05, "loss": 0.3917, "step": 309000 }, { "epoch": 2.094386097877869, "grad_norm": 0.33212965726852417, "learning_rate": 4.979056139021221e-05, "loss": 0.3916, "step": 309500 }, { "epoch": 2.0977695972282375, "grad_norm": 0.33890166878700256, "learning_rate": 4.979022304027718e-05, "loss": 0.3933, "step": 310000 }, { "epoch": 2.1011530965786056, "grad_norm": 0.2876567840576172, "learning_rate": 4.9789884690342144e-05, "loss": 0.3922, "step": 310500 }, { "epoch": 2.1045365959289737, "grad_norm": 0.314324289560318, "learning_rate": 4.9789546340407106e-05, "loss": 0.3923, "step": 311000 }, { "epoch": 2.1079200952793418, "grad_norm": 0.3218834698200226, "learning_rate": 4.978920799047207e-05, "loss": 0.393, "step": 311500 }, { "epoch": 2.11130359462971, "grad_norm": 0.3074497878551483, "learning_rate": 4.978886964053704e-05, "loss": 0.3911, "step": 312000 }, { "epoch": 2.114687093980078, "grad_norm": 0.3323233723640442, "learning_rate": 4.978853129060199e-05, "loss": 0.3913, "step": 312500 }, { "epoch": 2.118070593330446, "grad_norm": 0.3084787130355835, "learning_rate": 4.9788192940666954e-05, "loss": 0.3914, "step": 313000 }, { "epoch": 2.121454092680814, "grad_norm": 0.3199438154697418, "learning_rate": 4.9787854590731917e-05, "loss": 0.3918, "step": 313500 }, { "epoch": 2.124837592031182, "grad_norm": 0.2899732291698456, "learning_rate": 4.9787516240796885e-05, "loss": 0.3919, "step": 314000 }, { "epoch": 2.1282210913815502, "grad_norm": 0.3069060146808624, "learning_rate": 4.978717789086185e-05, "loss": 0.3917, "step": 314500 }, { "epoch": 2.1316045907319188, "grad_norm": 0.328224241733551, "learning_rate": 4.978683954092681e-05, "loss": 0.3915, "step": 315000 }, { "epoch": 2.134988090082287, "grad_norm": 0.29223302006721497, "learning_rate": 4.978650119099177e-05, "loss": 0.3912, "step": 315500 }, { "epoch": 2.138371589432655, "grad_norm": 0.297137588262558, "learning_rate": 4.978616284105674e-05, "loss": 0.3896, "step": 316000 }, { "epoch": 2.141755088783023, "grad_norm": 0.3235442042350769, "learning_rate": 4.97858244911217e-05, "loss": 0.3925, "step": 316500 }, { "epoch": 2.145138588133391, "grad_norm": 0.35218313336372375, "learning_rate": 4.9785486141186665e-05, "loss": 0.3908, "step": 317000 }, { "epoch": 2.148522087483759, "grad_norm": 0.32196253538131714, "learning_rate": 4.978514779125163e-05, "loss": 0.3924, "step": 317500 }, { "epoch": 2.1519055868341272, "grad_norm": 0.2944926917552948, "learning_rate": 4.978480944131659e-05, "loss": 0.3916, "step": 318000 }, { "epoch": 2.1552890861844953, "grad_norm": 0.30309149622917175, "learning_rate": 4.978447109138155e-05, "loss": 0.3937, "step": 318500 }, { "epoch": 2.1586725855348634, "grad_norm": 0.3118036389350891, "learning_rate": 4.9784132741446513e-05, "loss": 0.3907, "step": 319000 }, { "epoch": 2.162056084885232, "grad_norm": 0.3191807270050049, "learning_rate": 4.978379439151148e-05, "loss": 0.3922, "step": 319500 }, { "epoch": 2.1654395842356, "grad_norm": 0.32771632075309753, "learning_rate": 4.9783456041576444e-05, "loss": 0.3912, "step": 320000 }, { "epoch": 2.168823083585968, "grad_norm": 0.31377536058425903, "learning_rate": 4.9783117691641407e-05, "loss": 0.392, "step": 320500 }, { "epoch": 2.172206582936336, "grad_norm": 0.3084983825683594, "learning_rate": 4.978277934170637e-05, "loss": 0.3912, "step": 321000 }, { "epoch": 2.1755900822867043, "grad_norm": 0.3182586431503296, "learning_rate": 4.978244099177133e-05, "loss": 0.3922, "step": 321500 }, { "epoch": 2.1789735816370723, "grad_norm": 0.2851950228214264, "learning_rate": 4.97821026418363e-05, "loss": 0.3911, "step": 322000 }, { "epoch": 2.1823570809874404, "grad_norm": 0.31247711181640625, "learning_rate": 4.9781764291901255e-05, "loss": 0.3913, "step": 322500 }, { "epoch": 2.1857405803378085, "grad_norm": 0.30481797456741333, "learning_rate": 4.978142594196622e-05, "loss": 0.3904, "step": 323000 }, { "epoch": 2.1891240796881766, "grad_norm": 0.3104479908943176, "learning_rate": 4.9781087592031186e-05, "loss": 0.3907, "step": 323500 }, { "epoch": 2.1925075790385447, "grad_norm": 0.32999172806739807, "learning_rate": 4.978074924209615e-05, "loss": 0.391, "step": 324000 }, { "epoch": 2.1958910783889127, "grad_norm": 0.32124051451683044, "learning_rate": 4.978041089216111e-05, "loss": 0.3908, "step": 324500 }, { "epoch": 2.1992745777392813, "grad_norm": 0.3054026663303375, "learning_rate": 4.978007254222607e-05, "loss": 0.3899, "step": 325000 }, { "epoch": 2.2026580770896493, "grad_norm": 0.31517350673675537, "learning_rate": 4.977973419229104e-05, "loss": 0.3917, "step": 325500 }, { "epoch": 2.2060415764400174, "grad_norm": 0.3233264684677124, "learning_rate": 4.9779395842356003e-05, "loss": 0.3928, "step": 326000 }, { "epoch": 2.2094250757903855, "grad_norm": 0.3129175007343292, "learning_rate": 4.9779057492420966e-05, "loss": 0.39, "step": 326500 }, { "epoch": 2.2128085751407536, "grad_norm": 0.3183519244194031, "learning_rate": 4.977871914248593e-05, "loss": 0.392, "step": 327000 }, { "epoch": 2.2161920744911217, "grad_norm": 0.30780667066574097, "learning_rate": 4.977838079255089e-05, "loss": 0.3926, "step": 327500 }, { "epoch": 2.2195755738414897, "grad_norm": 0.3175506591796875, "learning_rate": 4.977804244261585e-05, "loss": 0.3917, "step": 328000 }, { "epoch": 2.222959073191858, "grad_norm": 0.3365371823310852, "learning_rate": 4.9777704092680814e-05, "loss": 0.3905, "step": 328500 }, { "epoch": 2.226342572542226, "grad_norm": 0.32077744603157043, "learning_rate": 4.9777365742745776e-05, "loss": 0.3922, "step": 329000 }, { "epoch": 2.2297260718925944, "grad_norm": 0.3266448676586151, "learning_rate": 4.9777027392810745e-05, "loss": 0.3902, "step": 329500 }, { "epoch": 2.2331095712429625, "grad_norm": 0.30548393726348877, "learning_rate": 4.977668904287571e-05, "loss": 0.3906, "step": 330000 }, { "epoch": 2.2364930705933306, "grad_norm": 0.31031087040901184, "learning_rate": 4.977635069294067e-05, "loss": 0.3913, "step": 330500 }, { "epoch": 2.2398765699436987, "grad_norm": 0.2958206534385681, "learning_rate": 4.977601234300563e-05, "loss": 0.3923, "step": 331000 }, { "epoch": 2.2432600692940667, "grad_norm": 0.333795964717865, "learning_rate": 4.97756739930706e-05, "loss": 0.39, "step": 331500 }, { "epoch": 2.246643568644435, "grad_norm": 0.30800214409828186, "learning_rate": 4.9775335643135556e-05, "loss": 0.3914, "step": 332000 }, { "epoch": 2.250027067994803, "grad_norm": 0.31785306334495544, "learning_rate": 4.977499729320052e-05, "loss": 0.3915, "step": 332500 }, { "epoch": 2.253410567345171, "grad_norm": 0.33681735396385193, "learning_rate": 4.977465894326549e-05, "loss": 0.3925, "step": 333000 }, { "epoch": 2.256794066695539, "grad_norm": 0.30837705731391907, "learning_rate": 4.977432059333045e-05, "loss": 0.3906, "step": 333500 }, { "epoch": 2.260177566045907, "grad_norm": 0.3389814794063568, "learning_rate": 4.977398224339541e-05, "loss": 0.3903, "step": 334000 }, { "epoch": 2.263561065396275, "grad_norm": 0.338090717792511, "learning_rate": 4.977364389346037e-05, "loss": 0.3911, "step": 334500 }, { "epoch": 2.2669445647466437, "grad_norm": 0.32688185572624207, "learning_rate": 4.977330554352534e-05, "loss": 0.3912, "step": 335000 }, { "epoch": 2.270328064097012, "grad_norm": 0.35603490471839905, "learning_rate": 4.9772967193590304e-05, "loss": 0.3898, "step": 335500 }, { "epoch": 2.27371156344738, "grad_norm": 0.3290579915046692, "learning_rate": 4.9772628843655266e-05, "loss": 0.3918, "step": 336000 }, { "epoch": 2.277095062797748, "grad_norm": 0.32932424545288086, "learning_rate": 4.977229049372023e-05, "loss": 0.3913, "step": 336500 }, { "epoch": 2.280478562148116, "grad_norm": 0.34741368889808655, "learning_rate": 4.977195214378519e-05, "loss": 0.3899, "step": 337000 }, { "epoch": 2.283862061498484, "grad_norm": 0.33639004826545715, "learning_rate": 4.977161379385015e-05, "loss": 0.3902, "step": 337500 }, { "epoch": 2.287245560848852, "grad_norm": 0.30797648429870605, "learning_rate": 4.9771275443915115e-05, "loss": 0.3905, "step": 338000 }, { "epoch": 2.2906290601992203, "grad_norm": 0.31171470880508423, "learning_rate": 4.977093709398008e-05, "loss": 0.3893, "step": 338500 }, { "epoch": 2.2940125595495884, "grad_norm": 0.34038031101226807, "learning_rate": 4.9770598744045046e-05, "loss": 0.3917, "step": 339000 }, { "epoch": 2.297396058899957, "grad_norm": 0.30429357290267944, "learning_rate": 4.977026039411001e-05, "loss": 0.3902, "step": 339500 }, { "epoch": 2.300779558250325, "grad_norm": 0.30760350823402405, "learning_rate": 4.976992204417497e-05, "loss": 0.3902, "step": 340000 }, { "epoch": 2.304163057600693, "grad_norm": 0.2932508885860443, "learning_rate": 4.976958369423993e-05, "loss": 0.3918, "step": 340500 }, { "epoch": 2.307546556951061, "grad_norm": 0.3048866093158722, "learning_rate": 4.97692453443049e-05, "loss": 0.3913, "step": 341000 }, { "epoch": 2.3109300563014292, "grad_norm": 0.3053203821182251, "learning_rate": 4.9768906994369856e-05, "loss": 0.3903, "step": 341500 }, { "epoch": 2.3143135556517973, "grad_norm": 0.3155381977558136, "learning_rate": 4.976856864443482e-05, "loss": 0.3904, "step": 342000 }, { "epoch": 2.3176970550021654, "grad_norm": 0.3031008243560791, "learning_rate": 4.976823029449979e-05, "loss": 0.3896, "step": 342500 }, { "epoch": 2.3210805543525335, "grad_norm": 0.32542043924331665, "learning_rate": 4.976789194456475e-05, "loss": 0.3896, "step": 343000 }, { "epoch": 2.3244640537029015, "grad_norm": 0.3127189576625824, "learning_rate": 4.976755359462971e-05, "loss": 0.3899, "step": 343500 }, { "epoch": 2.3278475530532696, "grad_norm": 0.327363520860672, "learning_rate": 4.9767215244694674e-05, "loss": 0.3906, "step": 344000 }, { "epoch": 2.3312310524036377, "grad_norm": 0.3157011866569519, "learning_rate": 4.976687689475964e-05, "loss": 0.3912, "step": 344500 }, { "epoch": 2.3346145517540062, "grad_norm": 0.3201100826263428, "learning_rate": 4.9766538544824605e-05, "loss": 0.3908, "step": 345000 }, { "epoch": 2.3379980511043743, "grad_norm": 0.3359414041042328, "learning_rate": 4.976620019488957e-05, "loss": 0.3901, "step": 345500 }, { "epoch": 2.3413815504547424, "grad_norm": 0.3239155411720276, "learning_rate": 4.976586184495452e-05, "loss": 0.3905, "step": 346000 }, { "epoch": 2.3447650498051105, "grad_norm": 0.31146302819252014, "learning_rate": 4.976552349501949e-05, "loss": 0.3897, "step": 346500 }, { "epoch": 2.3481485491554785, "grad_norm": 0.32907629013061523, "learning_rate": 4.976518514508445e-05, "loss": 0.3919, "step": 347000 }, { "epoch": 2.3515320485058466, "grad_norm": 0.3361744284629822, "learning_rate": 4.9764846795149415e-05, "loss": 0.3903, "step": 347500 }, { "epoch": 2.3549155478562147, "grad_norm": 0.319180428981781, "learning_rate": 4.976450844521438e-05, "loss": 0.3905, "step": 348000 }, { "epoch": 2.358299047206583, "grad_norm": 0.3432812988758087, "learning_rate": 4.9764170095279346e-05, "loss": 0.3885, "step": 348500 }, { "epoch": 2.361682546556951, "grad_norm": 0.33523839712142944, "learning_rate": 4.976383174534431e-05, "loss": 0.3898, "step": 349000 }, { "epoch": 2.3650660459073194, "grad_norm": 0.3066440522670746, "learning_rate": 4.976349339540927e-05, "loss": 0.3905, "step": 349500 }, { "epoch": 2.3684495452576875, "grad_norm": 0.3280928134918213, "learning_rate": 4.976315504547423e-05, "loss": 0.3888, "step": 350000 }, { "epoch": 2.3718330446080556, "grad_norm": 0.28813743591308594, "learning_rate": 4.97628166955392e-05, "loss": 0.3909, "step": 350500 }, { "epoch": 2.3752165439584236, "grad_norm": 0.3382049798965454, "learning_rate": 4.976247834560416e-05, "loss": 0.3918, "step": 351000 }, { "epoch": 2.3786000433087917, "grad_norm": 0.32492849230766296, "learning_rate": 4.976213999566912e-05, "loss": 0.3902, "step": 351500 }, { "epoch": 2.38198354265916, "grad_norm": 0.3084128797054291, "learning_rate": 4.976180164573409e-05, "loss": 0.3887, "step": 352000 }, { "epoch": 2.385367042009528, "grad_norm": 0.3256911337375641, "learning_rate": 4.976146329579905e-05, "loss": 0.3897, "step": 352500 }, { "epoch": 2.388750541359896, "grad_norm": 0.29890650510787964, "learning_rate": 4.976112494586401e-05, "loss": 0.3915, "step": 353000 }, { "epoch": 2.392134040710264, "grad_norm": 0.34316855669021606, "learning_rate": 4.9760786595928974e-05, "loss": 0.3901, "step": 353500 }, { "epoch": 2.3955175400606326, "grad_norm": 0.30501681566238403, "learning_rate": 4.976044824599394e-05, "loss": 0.3896, "step": 354000 }, { "epoch": 2.398901039411, "grad_norm": 0.3112809360027313, "learning_rate": 4.9760109896058905e-05, "loss": 0.3895, "step": 354500 }, { "epoch": 2.4022845387613687, "grad_norm": 0.3149002492427826, "learning_rate": 4.975977154612387e-05, "loss": 0.3898, "step": 355000 }, { "epoch": 2.405668038111737, "grad_norm": 0.3182199001312256, "learning_rate": 4.975943319618882e-05, "loss": 0.3887, "step": 355500 }, { "epoch": 2.409051537462105, "grad_norm": 0.3300721049308777, "learning_rate": 4.975909484625379e-05, "loss": 0.3898, "step": 356000 }, { "epoch": 2.412435036812473, "grad_norm": 0.34610435366630554, "learning_rate": 4.9758756496318754e-05, "loss": 0.3897, "step": 356500 }, { "epoch": 2.415818536162841, "grad_norm": 0.29432740807533264, "learning_rate": 4.9758418146383716e-05, "loss": 0.3893, "step": 357000 }, { "epoch": 2.419202035513209, "grad_norm": 0.3141118586063385, "learning_rate": 4.975807979644868e-05, "loss": 0.3912, "step": 357500 }, { "epoch": 2.422585534863577, "grad_norm": 0.2943814992904663, "learning_rate": 4.975774144651365e-05, "loss": 0.3905, "step": 358000 }, { "epoch": 2.4259690342139453, "grad_norm": 0.34122055768966675, "learning_rate": 4.975740309657861e-05, "loss": 0.3906, "step": 358500 }, { "epoch": 2.4293525335643134, "grad_norm": 0.34175580739974976, "learning_rate": 4.975706474664357e-05, "loss": 0.3903, "step": 359000 }, { "epoch": 2.432736032914682, "grad_norm": 0.31737181544303894, "learning_rate": 4.975672639670853e-05, "loss": 0.3881, "step": 359500 }, { "epoch": 2.43611953226505, "grad_norm": 0.2970747947692871, "learning_rate": 4.97563880467735e-05, "loss": 0.3912, "step": 360000 }, { "epoch": 2.439503031615418, "grad_norm": 0.2924173176288605, "learning_rate": 4.975604969683846e-05, "loss": 0.3896, "step": 360500 }, { "epoch": 2.442886530965786, "grad_norm": 0.35092678666114807, "learning_rate": 4.975571134690342e-05, "loss": 0.3885, "step": 361000 }, { "epoch": 2.446270030316154, "grad_norm": 0.3329818844795227, "learning_rate": 4.975537299696839e-05, "loss": 0.3906, "step": 361500 }, { "epoch": 2.4496535296665223, "grad_norm": 0.32808420062065125, "learning_rate": 4.975503464703335e-05, "loss": 0.3923, "step": 362000 }, { "epoch": 2.4530370290168904, "grad_norm": 0.3235306739807129, "learning_rate": 4.975469629709831e-05, "loss": 0.3896, "step": 362500 }, { "epoch": 2.4564205283672584, "grad_norm": 0.3019697666168213, "learning_rate": 4.9754357947163275e-05, "loss": 0.3904, "step": 363000 }, { "epoch": 2.4598040277176265, "grad_norm": 0.2973634600639343, "learning_rate": 4.9754019597228244e-05, "loss": 0.3893, "step": 363500 }, { "epoch": 2.463187527067995, "grad_norm": 0.30464956164360046, "learning_rate": 4.9753681247293206e-05, "loss": 0.3901, "step": 364000 }, { "epoch": 2.466571026418363, "grad_norm": 0.28501152992248535, "learning_rate": 4.975334289735817e-05, "loss": 0.3904, "step": 364500 }, { "epoch": 2.469954525768731, "grad_norm": 0.2966424524784088, "learning_rate": 4.9753004547423123e-05, "loss": 0.3918, "step": 365000 }, { "epoch": 2.4733380251190993, "grad_norm": 0.30633342266082764, "learning_rate": 4.975266619748809e-05, "loss": 0.3905, "step": 365500 }, { "epoch": 2.4767215244694674, "grad_norm": 0.31464701890945435, "learning_rate": 4.9752327847553054e-05, "loss": 0.3895, "step": 366000 }, { "epoch": 2.4801050238198354, "grad_norm": 0.3180805444717407, "learning_rate": 4.9751989497618017e-05, "loss": 0.3879, "step": 366500 }, { "epoch": 2.4834885231702035, "grad_norm": 0.3277004659175873, "learning_rate": 4.975165114768298e-05, "loss": 0.3898, "step": 367000 }, { "epoch": 2.4868720225205716, "grad_norm": 0.3120267987251282, "learning_rate": 4.975131279774795e-05, "loss": 0.3896, "step": 367500 }, { "epoch": 2.4902555218709397, "grad_norm": 0.3504237234592438, "learning_rate": 4.975097444781291e-05, "loss": 0.3889, "step": 368000 }, { "epoch": 2.4936390212213078, "grad_norm": 0.31472906470298767, "learning_rate": 4.975063609787787e-05, "loss": 0.3902, "step": 368500 }, { "epoch": 2.497022520571676, "grad_norm": 0.2980063557624817, "learning_rate": 4.9750297747942834e-05, "loss": 0.3893, "step": 369000 }, { "epoch": 2.5004060199220444, "grad_norm": 0.31815919280052185, "learning_rate": 4.97499593980078e-05, "loss": 0.39, "step": 369500 }, { "epoch": 2.5037895192724124, "grad_norm": 0.34098586440086365, "learning_rate": 4.974962104807276e-05, "loss": 0.3891, "step": 370000 }, { "epoch": 2.5071730186227805, "grad_norm": 0.32201772928237915, "learning_rate": 4.974928269813772e-05, "loss": 0.3883, "step": 370500 }, { "epoch": 2.5105565179731486, "grad_norm": 0.32348373532295227, "learning_rate": 4.974894434820269e-05, "loss": 0.3895, "step": 371000 }, { "epoch": 2.5139400173235167, "grad_norm": 0.32830971479415894, "learning_rate": 4.974860599826765e-05, "loss": 0.3898, "step": 371500 }, { "epoch": 2.5173235166738848, "grad_norm": 0.28930607438087463, "learning_rate": 4.9748267648332613e-05, "loss": 0.3888, "step": 372000 }, { "epoch": 2.520707016024253, "grad_norm": 0.3103958070278168, "learning_rate": 4.9747929298397576e-05, "loss": 0.3899, "step": 372500 }, { "epoch": 2.524090515374621, "grad_norm": 0.3025512099266052, "learning_rate": 4.9747590948462545e-05, "loss": 0.3915, "step": 373000 }, { "epoch": 2.527474014724989, "grad_norm": 0.3118777871131897, "learning_rate": 4.974725259852751e-05, "loss": 0.3903, "step": 373500 }, { "epoch": 2.5308575140753575, "grad_norm": 0.30235418677330017, "learning_rate": 4.974691424859247e-05, "loss": 0.3883, "step": 374000 }, { "epoch": 2.534241013425725, "grad_norm": 0.31868302822113037, "learning_rate": 4.9746575898657424e-05, "loss": 0.3892, "step": 374500 }, { "epoch": 2.5376245127760937, "grad_norm": 0.2960646152496338, "learning_rate": 4.974623754872239e-05, "loss": 0.3906, "step": 375000 }, { "epoch": 2.5410080121264618, "grad_norm": 0.3299707770347595, "learning_rate": 4.9745899198787355e-05, "loss": 0.3909, "step": 375500 }, { "epoch": 2.54439151147683, "grad_norm": 0.3465183675289154, "learning_rate": 4.974556084885232e-05, "loss": 0.3893, "step": 376000 }, { "epoch": 2.547775010827198, "grad_norm": 0.29224497079849243, "learning_rate": 4.974522249891728e-05, "loss": 0.3886, "step": 376500 }, { "epoch": 2.551158510177566, "grad_norm": 0.3109583556652069, "learning_rate": 4.974488414898225e-05, "loss": 0.389, "step": 377000 }, { "epoch": 2.554542009527934, "grad_norm": 0.3018151819705963, "learning_rate": 4.974454579904721e-05, "loss": 0.3884, "step": 377500 }, { "epoch": 2.557925508878302, "grad_norm": 0.2968336045742035, "learning_rate": 4.974420744911217e-05, "loss": 0.3891, "step": 378000 }, { "epoch": 2.5613090082286707, "grad_norm": 0.33824700117111206, "learning_rate": 4.9743869099177135e-05, "loss": 0.3884, "step": 378500 }, { "epoch": 2.5646925075790383, "grad_norm": 0.2871198058128357, "learning_rate": 4.9743530749242104e-05, "loss": 0.3902, "step": 379000 }, { "epoch": 2.568076006929407, "grad_norm": 0.33174142241477966, "learning_rate": 4.974319239930706e-05, "loss": 0.3888, "step": 379500 }, { "epoch": 2.571459506279775, "grad_norm": 0.29837650060653687, "learning_rate": 4.974285404937202e-05, "loss": 0.387, "step": 380000 }, { "epoch": 2.574843005630143, "grad_norm": 0.3502374291419983, "learning_rate": 4.974251569943699e-05, "loss": 0.3907, "step": 380500 }, { "epoch": 2.578226504980511, "grad_norm": 0.34483498334884644, "learning_rate": 4.974217734950195e-05, "loss": 0.3896, "step": 381000 }, { "epoch": 2.581610004330879, "grad_norm": 0.31655412912368774, "learning_rate": 4.9741838999566914e-05, "loss": 0.3902, "step": 381500 }, { "epoch": 2.5849935036812473, "grad_norm": 0.3395337164402008, "learning_rate": 4.9741500649631876e-05, "loss": 0.389, "step": 382000 }, { "epoch": 2.5883770030316153, "grad_norm": 0.3357096016407013, "learning_rate": 4.9741162299696845e-05, "loss": 0.3899, "step": 382500 }, { "epoch": 2.5917605023819834, "grad_norm": 0.306362509727478, "learning_rate": 4.974082394976181e-05, "loss": 0.3884, "step": 383000 }, { "epoch": 2.5951440017323515, "grad_norm": 0.3453750014305115, "learning_rate": 4.974048559982677e-05, "loss": 0.3908, "step": 383500 }, { "epoch": 2.59852750108272, "grad_norm": 0.28299570083618164, "learning_rate": 4.974014724989173e-05, "loss": 0.3898, "step": 384000 }, { "epoch": 2.6019110004330877, "grad_norm": 0.34128639101982117, "learning_rate": 4.9739808899956694e-05, "loss": 0.388, "step": 384500 }, { "epoch": 2.605294499783456, "grad_norm": 0.2998855710029602, "learning_rate": 4.9739470550021656e-05, "loss": 0.3886, "step": 385000 }, { "epoch": 2.6086779991338243, "grad_norm": 0.332438200712204, "learning_rate": 4.973913220008662e-05, "loss": 0.389, "step": 385500 }, { "epoch": 2.6120614984841923, "grad_norm": 0.30915123224258423, "learning_rate": 4.973879385015158e-05, "loss": 0.3885, "step": 386000 }, { "epoch": 2.6154449978345604, "grad_norm": 0.31033918261528015, "learning_rate": 4.973845550021655e-05, "loss": 0.3895, "step": 386500 }, { "epoch": 2.6188284971849285, "grad_norm": 0.29367709159851074, "learning_rate": 4.973811715028151e-05, "loss": 0.3884, "step": 387000 }, { "epoch": 2.6222119965352966, "grad_norm": 0.3094245195388794, "learning_rate": 4.973777880034647e-05, "loss": 0.3898, "step": 387500 }, { "epoch": 2.6255954958856647, "grad_norm": 0.34055420756340027, "learning_rate": 4.9737440450411435e-05, "loss": 0.3884, "step": 388000 }, { "epoch": 2.628978995236033, "grad_norm": 0.3231176733970642, "learning_rate": 4.9737102100476404e-05, "loss": 0.3885, "step": 388500 }, { "epoch": 2.632362494586401, "grad_norm": 0.32180097699165344, "learning_rate": 4.973676375054136e-05, "loss": 0.3882, "step": 389000 }, { "epoch": 2.6357459939367693, "grad_norm": 0.29383426904678345, "learning_rate": 4.973642540060632e-05, "loss": 0.3901, "step": 389500 }, { "epoch": 2.6391294932871374, "grad_norm": 0.31776711344718933, "learning_rate": 4.973608705067129e-05, "loss": 0.388, "step": 390000 }, { "epoch": 2.6425129926375055, "grad_norm": 0.3176041543483734, "learning_rate": 4.973574870073625e-05, "loss": 0.3898, "step": 390500 }, { "epoch": 2.6458964919878736, "grad_norm": 0.3469124734401703, "learning_rate": 4.9735410350801215e-05, "loss": 0.3894, "step": 391000 }, { "epoch": 2.6492799913382417, "grad_norm": 0.3145827651023865, "learning_rate": 4.973507200086618e-05, "loss": 0.3876, "step": 391500 }, { "epoch": 2.6526634906886097, "grad_norm": 0.35601696372032166, "learning_rate": 4.973473365093114e-05, "loss": 0.3893, "step": 392000 }, { "epoch": 2.656046990038978, "grad_norm": 0.326543927192688, "learning_rate": 4.973439530099611e-05, "loss": 0.3902, "step": 392500 }, { "epoch": 2.659430489389346, "grad_norm": 0.3303530216217041, "learning_rate": 4.973405695106107e-05, "loss": 0.3878, "step": 393000 }, { "epoch": 2.662813988739714, "grad_norm": 0.3083018362522125, "learning_rate": 4.973371860112603e-05, "loss": 0.3876, "step": 393500 }, { "epoch": 2.6661974880900825, "grad_norm": 0.31711965799331665, "learning_rate": 4.9733380251190994e-05, "loss": 0.3881, "step": 394000 }, { "epoch": 2.66958098744045, "grad_norm": 0.3374720811843872, "learning_rate": 4.9733041901255956e-05, "loss": 0.3879, "step": 394500 }, { "epoch": 2.6729644867908187, "grad_norm": 0.3124774098396301, "learning_rate": 4.973270355132092e-05, "loss": 0.3875, "step": 395000 }, { "epoch": 2.6763479861411867, "grad_norm": 0.33346521854400635, "learning_rate": 4.973236520138588e-05, "loss": 0.3882, "step": 395500 }, { "epoch": 2.679731485491555, "grad_norm": 0.2966834008693695, "learning_rate": 4.973202685145085e-05, "loss": 0.3883, "step": 396000 }, { "epoch": 2.683114984841923, "grad_norm": 0.3253791332244873, "learning_rate": 4.973168850151581e-05, "loss": 0.3906, "step": 396500 }, { "epoch": 2.686498484192291, "grad_norm": 0.34003323316574097, "learning_rate": 4.9731350151580774e-05, "loss": 0.3887, "step": 397000 }, { "epoch": 2.689881983542659, "grad_norm": 0.31468504667282104, "learning_rate": 4.9731011801645736e-05, "loss": 0.3888, "step": 397500 }, { "epoch": 2.693265482893027, "grad_norm": 0.3268261253833771, "learning_rate": 4.9730673451710705e-05, "loss": 0.389, "step": 398000 }, { "epoch": 2.6966489822433957, "grad_norm": 0.37251192331314087, "learning_rate": 4.973033510177566e-05, "loss": 0.3908, "step": 398500 }, { "epoch": 2.7000324815937633, "grad_norm": 0.3105468451976776, "learning_rate": 4.972999675184062e-05, "loss": 0.3887, "step": 399000 }, { "epoch": 2.703415980944132, "grad_norm": 0.30996009707450867, "learning_rate": 4.972965840190559e-05, "loss": 0.3891, "step": 399500 }, { "epoch": 2.7067994802945, "grad_norm": 0.3323148488998413, "learning_rate": 4.972932005197055e-05, "loss": 0.3878, "step": 400000 }, { "epoch": 2.710182979644868, "grad_norm": 0.3385847508907318, "learning_rate": 4.9728981702035515e-05, "loss": 0.3887, "step": 400500 }, { "epoch": 2.713566478995236, "grad_norm": 0.38123631477355957, "learning_rate": 4.972864335210048e-05, "loss": 0.3894, "step": 401000 }, { "epoch": 2.716949978345604, "grad_norm": 0.35343465209007263, "learning_rate": 4.972830500216544e-05, "loss": 0.3862, "step": 401500 }, { "epoch": 2.7203334776959722, "grad_norm": 0.3679075837135315, "learning_rate": 4.972796665223041e-05, "loss": 0.3872, "step": 402000 }, { "epoch": 2.7237169770463403, "grad_norm": 0.3437575697898865, "learning_rate": 4.972762830229537e-05, "loss": 0.3895, "step": 402500 }, { "epoch": 2.7271004763967084, "grad_norm": 0.35861557722091675, "learning_rate": 4.972728995236033e-05, "loss": 0.3882, "step": 403000 }, { "epoch": 2.7304839757470765, "grad_norm": 0.31313803791999817, "learning_rate": 4.9726951602425295e-05, "loss": 0.3875, "step": 403500 }, { "epoch": 2.733867475097445, "grad_norm": 0.31771916151046753, "learning_rate": 4.972661325249026e-05, "loss": 0.3893, "step": 404000 }, { "epoch": 2.737250974447813, "grad_norm": 0.3310674726963043, "learning_rate": 4.972627490255522e-05, "loss": 0.3879, "step": 404500 }, { "epoch": 2.740634473798181, "grad_norm": 0.32085293531417847, "learning_rate": 4.972593655262018e-05, "loss": 0.39, "step": 405000 }, { "epoch": 2.7440179731485492, "grad_norm": 0.30057665705680847, "learning_rate": 4.972559820268515e-05, "loss": 0.3886, "step": 405500 }, { "epoch": 2.7474014724989173, "grad_norm": 0.31213104724884033, "learning_rate": 4.972525985275011e-05, "loss": 0.3874, "step": 406000 }, { "epoch": 2.7507849718492854, "grad_norm": 0.35481685400009155, "learning_rate": 4.9724921502815074e-05, "loss": 0.389, "step": 406500 }, { "epoch": 2.7541684711996535, "grad_norm": 0.33751875162124634, "learning_rate": 4.9724583152880037e-05, "loss": 0.388, "step": 407000 }, { "epoch": 2.7575519705500215, "grad_norm": 0.32072505354881287, "learning_rate": 4.9724244802945005e-05, "loss": 0.3878, "step": 407500 }, { "epoch": 2.7609354699003896, "grad_norm": 0.29553598165512085, "learning_rate": 4.972390645300996e-05, "loss": 0.3886, "step": 408000 }, { "epoch": 2.764318969250758, "grad_norm": 0.3134274482727051, "learning_rate": 4.972356810307492e-05, "loss": 0.3863, "step": 408500 }, { "epoch": 2.767702468601126, "grad_norm": 0.3523977994918823, "learning_rate": 4.9723229753139885e-05, "loss": 0.39, "step": 409000 }, { "epoch": 2.7710859679514943, "grad_norm": 0.33423638343811035, "learning_rate": 4.9722891403204854e-05, "loss": 0.3879, "step": 409500 }, { "epoch": 2.7744694673018624, "grad_norm": 0.3362995386123657, "learning_rate": 4.9722553053269816e-05, "loss": 0.3883, "step": 410000 }, { "epoch": 2.7778529666522305, "grad_norm": 0.3276742696762085, "learning_rate": 4.972221470333478e-05, "loss": 0.3868, "step": 410500 }, { "epoch": 2.7812364660025986, "grad_norm": 0.323776513338089, "learning_rate": 4.972187635339974e-05, "loss": 0.3875, "step": 411000 }, { "epoch": 2.7846199653529666, "grad_norm": 0.32547879219055176, "learning_rate": 4.972153800346471e-05, "loss": 0.3884, "step": 411500 }, { "epoch": 2.7880034647033347, "grad_norm": 0.31964075565338135, "learning_rate": 4.972119965352967e-05, "loss": 0.3879, "step": 412000 }, { "epoch": 2.791386964053703, "grad_norm": 0.2977871894836426, "learning_rate": 4.9720861303594633e-05, "loss": 0.3893, "step": 412500 }, { "epoch": 2.794770463404071, "grad_norm": 0.3149718642234802, "learning_rate": 4.9720522953659596e-05, "loss": 0.3867, "step": 413000 }, { "epoch": 2.798153962754439, "grad_norm": 0.33092522621154785, "learning_rate": 4.972018460372456e-05, "loss": 0.3881, "step": 413500 }, { "epoch": 2.8015374621048075, "grad_norm": 0.32213321328163147, "learning_rate": 4.971984625378952e-05, "loss": 0.389, "step": 414000 }, { "epoch": 2.8049209614551756, "grad_norm": 0.31457120180130005, "learning_rate": 4.971950790385448e-05, "loss": 0.3865, "step": 414500 }, { "epoch": 2.8083044608055436, "grad_norm": 0.3207741975784302, "learning_rate": 4.971916955391945e-05, "loss": 0.3898, "step": 415000 }, { "epoch": 2.8116879601559117, "grad_norm": 0.3099098801612854, "learning_rate": 4.971883120398441e-05, "loss": 0.388, "step": 415500 }, { "epoch": 2.81507145950628, "grad_norm": 0.33796587586402893, "learning_rate": 4.9718492854049375e-05, "loss": 0.3867, "step": 416000 }, { "epoch": 2.818454958856648, "grad_norm": 0.34027165174484253, "learning_rate": 4.971815450411434e-05, "loss": 0.388, "step": 416500 }, { "epoch": 2.821838458207016, "grad_norm": 0.3038021922111511, "learning_rate": 4.9717816154179306e-05, "loss": 0.3871, "step": 417000 }, { "epoch": 2.825221957557384, "grad_norm": 0.30802983045578003, "learning_rate": 4.971747780424426e-05, "loss": 0.387, "step": 417500 }, { "epoch": 2.828605456907752, "grad_norm": 0.3201408088207245, "learning_rate": 4.9717139454309224e-05, "loss": 0.3882, "step": 418000 }, { "epoch": 2.8319889562581206, "grad_norm": 0.3160160183906555, "learning_rate": 4.9716801104374186e-05, "loss": 0.3883, "step": 418500 }, { "epoch": 2.8353724556084883, "grad_norm": 0.3320191204547882, "learning_rate": 4.9716462754439155e-05, "loss": 0.3867, "step": 419000 }, { "epoch": 2.838755954958857, "grad_norm": 0.29257920384407043, "learning_rate": 4.971612440450412e-05, "loss": 0.3868, "step": 419500 }, { "epoch": 2.842139454309225, "grad_norm": 0.3090723156929016, "learning_rate": 4.971578605456908e-05, "loss": 0.3878, "step": 420000 }, { "epoch": 2.845522953659593, "grad_norm": 0.3091406524181366, "learning_rate": 4.971544770463404e-05, "loss": 0.3885, "step": 420500 }, { "epoch": 2.848906453009961, "grad_norm": 0.31975674629211426, "learning_rate": 4.971510935469901e-05, "loss": 0.3878, "step": 421000 }, { "epoch": 2.852289952360329, "grad_norm": 0.3032001256942749, "learning_rate": 4.971477100476397e-05, "loss": 0.3873, "step": 421500 }, { "epoch": 2.855673451710697, "grad_norm": 0.29951295256614685, "learning_rate": 4.9714432654828934e-05, "loss": 0.3882, "step": 422000 }, { "epoch": 2.8590569510610653, "grad_norm": 0.29293006658554077, "learning_rate": 4.9714094304893896e-05, "loss": 0.3874, "step": 422500 }, { "epoch": 2.8624404504114334, "grad_norm": 0.35580480098724365, "learning_rate": 4.971375595495886e-05, "loss": 0.3883, "step": 423000 }, { "epoch": 2.8658239497618014, "grad_norm": 0.3143445551395416, "learning_rate": 4.971341760502382e-05, "loss": 0.3873, "step": 423500 }, { "epoch": 2.86920744911217, "grad_norm": 0.3491705358028412, "learning_rate": 4.971307925508878e-05, "loss": 0.3868, "step": 424000 }, { "epoch": 2.872590948462538, "grad_norm": 0.3095901310443878, "learning_rate": 4.971274090515375e-05, "loss": 0.3887, "step": 424500 }, { "epoch": 2.875974447812906, "grad_norm": 0.3482496440410614, "learning_rate": 4.9712402555218714e-05, "loss": 0.3881, "step": 425000 }, { "epoch": 2.879357947163274, "grad_norm": 0.3041991889476776, "learning_rate": 4.9712064205283676e-05, "loss": 0.3867, "step": 425500 }, { "epoch": 2.8827414465136423, "grad_norm": 0.2977460026741028, "learning_rate": 4.971172585534864e-05, "loss": 0.388, "step": 426000 }, { "epoch": 2.8861249458640104, "grad_norm": 0.3527410328388214, "learning_rate": 4.971138750541361e-05, "loss": 0.389, "step": 426500 }, { "epoch": 2.8895084452143784, "grad_norm": 0.31773945689201355, "learning_rate": 4.971104915547856e-05, "loss": 0.3875, "step": 427000 }, { "epoch": 2.8928919445647465, "grad_norm": 0.3466269075870514, "learning_rate": 4.9710710805543524e-05, "loss": 0.3864, "step": 427500 }, { "epoch": 2.8962754439151146, "grad_norm": 0.3149116039276123, "learning_rate": 4.9710372455608486e-05, "loss": 0.388, "step": 428000 }, { "epoch": 2.899658943265483, "grad_norm": 0.3208950161933899, "learning_rate": 4.9710034105673455e-05, "loss": 0.3871, "step": 428500 }, { "epoch": 2.9030424426158508, "grad_norm": 0.31378859281539917, "learning_rate": 4.970969575573842e-05, "loss": 0.3877, "step": 429000 }, { "epoch": 2.9064259419662193, "grad_norm": 0.3283815383911133, "learning_rate": 4.970935740580338e-05, "loss": 0.388, "step": 429500 }, { "epoch": 2.9098094413165874, "grad_norm": 0.34402093291282654, "learning_rate": 4.970901905586834e-05, "loss": 0.3861, "step": 430000 }, { "epoch": 2.9131929406669554, "grad_norm": 0.3222994804382324, "learning_rate": 4.970868070593331e-05, "loss": 0.3868, "step": 430500 }, { "epoch": 2.9165764400173235, "grad_norm": 0.349680632352829, "learning_rate": 4.970834235599827e-05, "loss": 0.3874, "step": 431000 }, { "epoch": 2.9199599393676916, "grad_norm": 0.34699881076812744, "learning_rate": 4.9708004006063235e-05, "loss": 0.388, "step": 431500 }, { "epoch": 2.9233434387180597, "grad_norm": 0.29213935136795044, "learning_rate": 4.97076656561282e-05, "loss": 0.3882, "step": 432000 }, { "epoch": 2.9267269380684278, "grad_norm": 0.33256059885025024, "learning_rate": 4.970732730619316e-05, "loss": 0.3877, "step": 432500 }, { "epoch": 2.9301104374187963, "grad_norm": 0.29620301723480225, "learning_rate": 4.970698895625812e-05, "loss": 0.3881, "step": 433000 }, { "epoch": 2.933493936769164, "grad_norm": 0.3183768689632416, "learning_rate": 4.970665060632308e-05, "loss": 0.3871, "step": 433500 }, { "epoch": 2.9368774361195324, "grad_norm": 0.3184598386287689, "learning_rate": 4.970631225638805e-05, "loss": 0.3879, "step": 434000 }, { "epoch": 2.9402609354699005, "grad_norm": 0.33179497718811035, "learning_rate": 4.9705973906453014e-05, "loss": 0.386, "step": 434500 }, { "epoch": 2.9436444348202686, "grad_norm": 0.32176026701927185, "learning_rate": 4.9705635556517976e-05, "loss": 0.3874, "step": 435000 }, { "epoch": 2.9470279341706367, "grad_norm": 0.3503253161907196, "learning_rate": 4.970529720658294e-05, "loss": 0.3884, "step": 435500 }, { "epoch": 2.9504114335210048, "grad_norm": 0.35486873984336853, "learning_rate": 4.970495885664791e-05, "loss": 0.3884, "step": 436000 }, { "epoch": 2.953794932871373, "grad_norm": 0.3227147161960602, "learning_rate": 4.970462050671287e-05, "loss": 0.3862, "step": 436500 }, { "epoch": 2.957178432221741, "grad_norm": 0.31731894612312317, "learning_rate": 4.9704282156777825e-05, "loss": 0.386, "step": 437000 }, { "epoch": 2.960561931572109, "grad_norm": 0.31829655170440674, "learning_rate": 4.970394380684279e-05, "loss": 0.387, "step": 437500 }, { "epoch": 2.963945430922477, "grad_norm": 0.3109760284423828, "learning_rate": 4.9703605456907756e-05, "loss": 0.3866, "step": 438000 }, { "epoch": 2.9673289302728456, "grad_norm": 0.3356626033782959, "learning_rate": 4.970326710697272e-05, "loss": 0.3896, "step": 438500 }, { "epoch": 2.9707124296232132, "grad_norm": 0.2957267761230469, "learning_rate": 4.970292875703768e-05, "loss": 0.3871, "step": 439000 }, { "epoch": 2.9740959289735818, "grad_norm": 0.3141108453273773, "learning_rate": 4.970259040710264e-05, "loss": 0.3869, "step": 439500 }, { "epoch": 2.97747942832395, "grad_norm": 0.30219319462776184, "learning_rate": 4.970225205716761e-05, "loss": 0.3866, "step": 440000 }, { "epoch": 2.980862927674318, "grad_norm": 0.3704169988632202, "learning_rate": 4.970191370723257e-05, "loss": 0.3868, "step": 440500 }, { "epoch": 2.984246427024686, "grad_norm": 0.3687022626399994, "learning_rate": 4.9701575357297535e-05, "loss": 0.3876, "step": 441000 }, { "epoch": 2.987629926375054, "grad_norm": 0.3400663435459137, "learning_rate": 4.97012370073625e-05, "loss": 0.3897, "step": 441500 }, { "epoch": 2.991013425725422, "grad_norm": 0.3268395960330963, "learning_rate": 4.970089865742746e-05, "loss": 0.3875, "step": 442000 }, { "epoch": 2.9943969250757903, "grad_norm": 0.32051482796669006, "learning_rate": 4.970056030749242e-05, "loss": 0.3868, "step": 442500 }, { "epoch": 2.9977804244261588, "grad_norm": 0.35251978039741516, "learning_rate": 4.9700221957557384e-05, "loss": 0.3878, "step": 443000 }, { "epoch": 3.0, "eval_accuracy": 0.8527967040650408, "eval_loss": 0.5995772480964661, "eval_runtime": 3342.0055, "eval_samples_per_second": 86.997, "eval_steps_per_second": 5.437, "step": 443328 }, { "epoch": 3.001163923776527, "grad_norm": 0.3209432065486908, "learning_rate": 4.969988360762235e-05, "loss": 0.3848, "step": 443500 }, { "epoch": 3.004547423126895, "grad_norm": 0.3334997892379761, "learning_rate": 4.9699545257687315e-05, "loss": 0.3848, "step": 444000 }, { "epoch": 3.007930922477263, "grad_norm": 0.3184662163257599, "learning_rate": 4.969920690775228e-05, "loss": 0.3849, "step": 444500 }, { "epoch": 3.011314421827631, "grad_norm": 0.3202681541442871, "learning_rate": 4.969886855781724e-05, "loss": 0.3838, "step": 445000 }, { "epoch": 3.014697921177999, "grad_norm": 0.34588322043418884, "learning_rate": 4.969853020788221e-05, "loss": 0.3854, "step": 445500 }, { "epoch": 3.0180814205283673, "grad_norm": 0.3345364034175873, "learning_rate": 4.969819185794717e-05, "loss": 0.3858, "step": 446000 }, { "epoch": 3.0214649198787353, "grad_norm": 0.32423135638237, "learning_rate": 4.9697853508012125e-05, "loss": 0.385, "step": 446500 }, { "epoch": 3.0248484192291034, "grad_norm": 0.38436517119407654, "learning_rate": 4.969751515807709e-05, "loss": 0.3861, "step": 447000 }, { "epoch": 3.0282319185794715, "grad_norm": 0.34340643882751465, "learning_rate": 4.9697176808142056e-05, "loss": 0.3861, "step": 447500 }, { "epoch": 3.0316154179298396, "grad_norm": 0.3383859395980835, "learning_rate": 4.969683845820702e-05, "loss": 0.3848, "step": 448000 }, { "epoch": 3.034998917280208, "grad_norm": 0.34863921999931335, "learning_rate": 4.969650010827198e-05, "loss": 0.3848, "step": 448500 }, { "epoch": 3.038382416630576, "grad_norm": 0.345042884349823, "learning_rate": 4.969616175833694e-05, "loss": 0.3876, "step": 449000 }, { "epoch": 3.0417659159809443, "grad_norm": 0.3602176904678345, "learning_rate": 4.969582340840191e-05, "loss": 0.385, "step": 449500 }, { "epoch": 3.0451494153313123, "grad_norm": 0.3208034634590149, "learning_rate": 4.9695485058466874e-05, "loss": 0.3865, "step": 450000 }, { "epoch": 3.0485329146816804, "grad_norm": 0.35154417157173157, "learning_rate": 4.9695146708531836e-05, "loss": 0.3854, "step": 450500 }, { "epoch": 3.0519164140320485, "grad_norm": 0.3550427258014679, "learning_rate": 4.96948083585968e-05, "loss": 0.3855, "step": 451000 }, { "epoch": 3.0552999133824166, "grad_norm": 0.3407672643661499, "learning_rate": 4.969447000866176e-05, "loss": 0.3847, "step": 451500 }, { "epoch": 3.0586834127327847, "grad_norm": 0.30612611770629883, "learning_rate": 4.969413165872672e-05, "loss": 0.3849, "step": 452000 }, { "epoch": 3.0620669120831527, "grad_norm": 0.3213786482810974, "learning_rate": 4.9693793308791684e-05, "loss": 0.3855, "step": 452500 }, { "epoch": 3.065450411433521, "grad_norm": 0.36959215998649597, "learning_rate": 4.969345495885665e-05, "loss": 0.3847, "step": 453000 }, { "epoch": 3.0688339107838893, "grad_norm": 0.33155548572540283, "learning_rate": 4.9693116608921615e-05, "loss": 0.3843, "step": 453500 }, { "epoch": 3.0722174101342574, "grad_norm": 0.324441134929657, "learning_rate": 4.969277825898658e-05, "loss": 0.3846, "step": 454000 }, { "epoch": 3.0756009094846255, "grad_norm": 0.3025881350040436, "learning_rate": 4.969243990905154e-05, "loss": 0.3881, "step": 454500 }, { "epoch": 3.0789844088349936, "grad_norm": 0.3309617340564728, "learning_rate": 4.96921015591165e-05, "loss": 0.3838, "step": 455000 }, { "epoch": 3.0823679081853617, "grad_norm": 0.3241705298423767, "learning_rate": 4.969176320918147e-05, "loss": 0.385, "step": 455500 }, { "epoch": 3.0857514075357297, "grad_norm": 0.3889763057231903, "learning_rate": 4.9691424859246426e-05, "loss": 0.3865, "step": 456000 }, { "epoch": 3.089134906886098, "grad_norm": 0.310674250125885, "learning_rate": 4.969108650931139e-05, "loss": 0.3866, "step": 456500 }, { "epoch": 3.092518406236466, "grad_norm": 0.3215824067592621, "learning_rate": 4.969074815937636e-05, "loss": 0.3863, "step": 457000 }, { "epoch": 3.095901905586834, "grad_norm": 0.3177349865436554, "learning_rate": 4.969040980944132e-05, "loss": 0.3854, "step": 457500 }, { "epoch": 3.099285404937202, "grad_norm": 0.3278276026248932, "learning_rate": 4.969007145950628e-05, "loss": 0.385, "step": 458000 }, { "epoch": 3.1026689042875706, "grad_norm": 0.30803415179252625, "learning_rate": 4.9689733109571243e-05, "loss": 0.3843, "step": 458500 }, { "epoch": 3.1060524036379387, "grad_norm": 0.32083410024642944, "learning_rate": 4.968939475963621e-05, "loss": 0.3845, "step": 459000 }, { "epoch": 3.1094359029883067, "grad_norm": 0.3132006824016571, "learning_rate": 4.9689056409701174e-05, "loss": 0.3848, "step": 459500 }, { "epoch": 3.112819402338675, "grad_norm": 0.3279072642326355, "learning_rate": 4.9688718059766137e-05, "loss": 0.3864, "step": 460000 }, { "epoch": 3.116202901689043, "grad_norm": 0.3340642750263214, "learning_rate": 4.96883797098311e-05, "loss": 0.3847, "step": 460500 }, { "epoch": 3.119586401039411, "grad_norm": 0.33182334899902344, "learning_rate": 4.968804135989606e-05, "loss": 0.3864, "step": 461000 }, { "epoch": 3.122969900389779, "grad_norm": 0.34803441166877747, "learning_rate": 4.968770300996102e-05, "loss": 0.3845, "step": 461500 }, { "epoch": 3.126353399740147, "grad_norm": 0.3448106646537781, "learning_rate": 4.9687364660025985e-05, "loss": 0.3842, "step": 462000 }, { "epoch": 3.1297368990905152, "grad_norm": 0.34930822253227234, "learning_rate": 4.968702631009095e-05, "loss": 0.3858, "step": 462500 }, { "epoch": 3.1331203984408833, "grad_norm": 0.3045295774936676, "learning_rate": 4.9686687960155916e-05, "loss": 0.3852, "step": 463000 }, { "epoch": 3.136503897791252, "grad_norm": 0.34032517671585083, "learning_rate": 4.968634961022088e-05, "loss": 0.3864, "step": 463500 }, { "epoch": 3.13988739714162, "grad_norm": 0.32840994000434875, "learning_rate": 4.968601126028584e-05, "loss": 0.3862, "step": 464000 }, { "epoch": 3.143270896491988, "grad_norm": 0.33663713932037354, "learning_rate": 4.96856729103508e-05, "loss": 0.385, "step": 464500 }, { "epoch": 3.146654395842356, "grad_norm": 0.3208543360233307, "learning_rate": 4.968533456041577e-05, "loss": 0.3858, "step": 465000 }, { "epoch": 3.150037895192724, "grad_norm": 0.3440122604370117, "learning_rate": 4.968499621048073e-05, "loss": 0.3853, "step": 465500 }, { "epoch": 3.1534213945430922, "grad_norm": 0.3290427625179291, "learning_rate": 4.968465786054569e-05, "loss": 0.3852, "step": 466000 }, { "epoch": 3.1568048938934603, "grad_norm": 0.3532296121120453, "learning_rate": 4.968431951061066e-05, "loss": 0.3843, "step": 466500 }, { "epoch": 3.1601883932438284, "grad_norm": 0.2814158499240875, "learning_rate": 4.968398116067562e-05, "loss": 0.3851, "step": 467000 }, { "epoch": 3.1635718925941965, "grad_norm": 0.3247326910495758, "learning_rate": 4.968364281074058e-05, "loss": 0.3846, "step": 467500 }, { "epoch": 3.1669553919445645, "grad_norm": 0.3341606855392456, "learning_rate": 4.9683304460805544e-05, "loss": 0.3853, "step": 468000 }, { "epoch": 3.170338891294933, "grad_norm": 0.3080476224422455, "learning_rate": 4.968296611087051e-05, "loss": 0.3871, "step": 468500 }, { "epoch": 3.173722390645301, "grad_norm": 0.36451050639152527, "learning_rate": 4.9682627760935475e-05, "loss": 0.3855, "step": 469000 }, { "epoch": 3.1771058899956692, "grad_norm": 0.31912335753440857, "learning_rate": 4.968228941100044e-05, "loss": 0.385, "step": 469500 }, { "epoch": 3.1804893893460373, "grad_norm": 0.3288278877735138, "learning_rate": 4.96819510610654e-05, "loss": 0.3844, "step": 470000 }, { "epoch": 3.1838728886964054, "grad_norm": 0.3190681040287018, "learning_rate": 4.968161271113036e-05, "loss": 0.383, "step": 470500 }, { "epoch": 3.1872563880467735, "grad_norm": 0.3208576440811157, "learning_rate": 4.9681274361195324e-05, "loss": 0.3859, "step": 471000 }, { "epoch": 3.1906398873971415, "grad_norm": 0.3483971953392029, "learning_rate": 4.9680936011260286e-05, "loss": 0.3843, "step": 471500 }, { "epoch": 3.1940233867475096, "grad_norm": 0.337174654006958, "learning_rate": 4.968059766132525e-05, "loss": 0.3844, "step": 472000 }, { "epoch": 3.1974068860978777, "grad_norm": 0.3170009255409241, "learning_rate": 4.968025931139022e-05, "loss": 0.3841, "step": 472500 }, { "epoch": 3.200790385448246, "grad_norm": 0.342808336019516, "learning_rate": 4.967992096145518e-05, "loss": 0.3842, "step": 473000 }, { "epoch": 3.2041738847986143, "grad_norm": 0.33495399355888367, "learning_rate": 4.967958261152014e-05, "loss": 0.3856, "step": 473500 }, { "epoch": 3.2075573841489824, "grad_norm": 0.317581444978714, "learning_rate": 4.96792442615851e-05, "loss": 0.3853, "step": 474000 }, { "epoch": 3.2109408834993505, "grad_norm": 0.31482943892478943, "learning_rate": 4.967890591165007e-05, "loss": 0.3843, "step": 474500 }, { "epoch": 3.2143243828497186, "grad_norm": 0.3111674189567566, "learning_rate": 4.967856756171503e-05, "loss": 0.3861, "step": 475000 }, { "epoch": 3.2177078822000866, "grad_norm": 0.31386885046958923, "learning_rate": 4.967822921177999e-05, "loss": 0.3849, "step": 475500 }, { "epoch": 3.2210913815504547, "grad_norm": 0.33925771713256836, "learning_rate": 4.967789086184496e-05, "loss": 0.3859, "step": 476000 }, { "epoch": 3.224474880900823, "grad_norm": 0.3127824366092682, "learning_rate": 4.967755251190992e-05, "loss": 0.386, "step": 476500 }, { "epoch": 3.227858380251191, "grad_norm": 0.3213912546634674, "learning_rate": 4.967721416197488e-05, "loss": 0.3845, "step": 477000 }, { "epoch": 3.231241879601559, "grad_norm": 0.31241416931152344, "learning_rate": 4.9676875812039845e-05, "loss": 0.3857, "step": 477500 }, { "epoch": 3.234625378951927, "grad_norm": 0.336775541305542, "learning_rate": 4.9676537462104814e-05, "loss": 0.385, "step": 478000 }, { "epoch": 3.2380088783022956, "grad_norm": 0.34088069200515747, "learning_rate": 4.9676199112169776e-05, "loss": 0.3858, "step": 478500 }, { "epoch": 3.2413923776526636, "grad_norm": 0.340310662984848, "learning_rate": 4.967586076223474e-05, "loss": 0.3853, "step": 479000 }, { "epoch": 3.2447758770030317, "grad_norm": 0.27655091881752014, "learning_rate": 4.967552241229969e-05, "loss": 0.3844, "step": 479500 }, { "epoch": 3.2481593763534, "grad_norm": 0.34276095032691956, "learning_rate": 4.967518406236466e-05, "loss": 0.3848, "step": 480000 }, { "epoch": 3.251542875703768, "grad_norm": 0.33317485451698303, "learning_rate": 4.9674845712429624e-05, "loss": 0.3863, "step": 480500 }, { "epoch": 3.254926375054136, "grad_norm": 0.2947559654712677, "learning_rate": 4.9674507362494586e-05, "loss": 0.3851, "step": 481000 }, { "epoch": 3.258309874404504, "grad_norm": 0.33836454153060913, "learning_rate": 4.967416901255955e-05, "loss": 0.3848, "step": 481500 }, { "epoch": 3.261693373754872, "grad_norm": 0.32393962144851685, "learning_rate": 4.967383066262452e-05, "loss": 0.3865, "step": 482000 }, { "epoch": 3.26507687310524, "grad_norm": 0.34863951802253723, "learning_rate": 4.967349231268948e-05, "loss": 0.3839, "step": 482500 }, { "epoch": 3.2684603724556087, "grad_norm": 0.35544320940971375, "learning_rate": 4.967315396275444e-05, "loss": 0.3859, "step": 483000 }, { "epoch": 3.271843871805977, "grad_norm": 0.32499048113822937, "learning_rate": 4.9672815612819404e-05, "loss": 0.3859, "step": 483500 }, { "epoch": 3.275227371156345, "grad_norm": 0.3342566192150116, "learning_rate": 4.967247726288437e-05, "loss": 0.385, "step": 484000 }, { "epoch": 3.278610870506713, "grad_norm": 0.32709258794784546, "learning_rate": 4.967213891294933e-05, "loss": 0.3852, "step": 484500 }, { "epoch": 3.281994369857081, "grad_norm": 0.34438183903694153, "learning_rate": 4.967180056301429e-05, "loss": 0.3836, "step": 485000 }, { "epoch": 3.285377869207449, "grad_norm": 0.3213898241519928, "learning_rate": 4.967146221307926e-05, "loss": 0.3859, "step": 485500 }, { "epoch": 3.288761368557817, "grad_norm": 0.3340561091899872, "learning_rate": 4.967112386314422e-05, "loss": 0.3853, "step": 486000 }, { "epoch": 3.2921448679081853, "grad_norm": 0.31760358810424805, "learning_rate": 4.967078551320918e-05, "loss": 0.3852, "step": 486500 }, { "epoch": 3.2955283672585534, "grad_norm": 0.31598398089408875, "learning_rate": 4.9670447163274145e-05, "loss": 0.3849, "step": 487000 }, { "epoch": 3.2989118666089214, "grad_norm": 0.326427698135376, "learning_rate": 4.9670108813339114e-05, "loss": 0.3854, "step": 487500 }, { "epoch": 3.3022953659592895, "grad_norm": 0.3454003930091858, "learning_rate": 4.9669770463404076e-05, "loss": 0.3844, "step": 488000 }, { "epoch": 3.305678865309658, "grad_norm": 0.32165205478668213, "learning_rate": 4.966943211346904e-05, "loss": 0.3841, "step": 488500 }, { "epoch": 3.309062364660026, "grad_norm": 0.3449212610721588, "learning_rate": 4.9669093763533994e-05, "loss": 0.3868, "step": 489000 }, { "epoch": 3.312445864010394, "grad_norm": 0.3350136876106262, "learning_rate": 4.966875541359896e-05, "loss": 0.3839, "step": 489500 }, { "epoch": 3.3158293633607623, "grad_norm": 0.30893436074256897, "learning_rate": 4.9668417063663925e-05, "loss": 0.3844, "step": 490000 }, { "epoch": 3.3192128627111304, "grad_norm": 0.33487969636917114, "learning_rate": 4.966807871372889e-05, "loss": 0.386, "step": 490500 }, { "epoch": 3.3225963620614984, "grad_norm": 0.3271050453186035, "learning_rate": 4.966774036379385e-05, "loss": 0.3851, "step": 491000 }, { "epoch": 3.3259798614118665, "grad_norm": 0.3512710928916931, "learning_rate": 4.966740201385882e-05, "loss": 0.3859, "step": 491500 }, { "epoch": 3.3293633607622346, "grad_norm": 0.3383200466632843, "learning_rate": 4.966706366392378e-05, "loss": 0.3838, "step": 492000 }, { "epoch": 3.3327468601126027, "grad_norm": 0.34418985247612, "learning_rate": 4.966672531398874e-05, "loss": 0.3848, "step": 492500 }, { "epoch": 3.336130359462971, "grad_norm": 0.3254546523094177, "learning_rate": 4.9666386964053704e-05, "loss": 0.385, "step": 493000 }, { "epoch": 3.3395138588133393, "grad_norm": 0.3146657645702362, "learning_rate": 4.966604861411867e-05, "loss": 0.386, "step": 493500 }, { "epoch": 3.3428973581637074, "grad_norm": 0.34255629777908325, "learning_rate": 4.966571026418363e-05, "loss": 0.3846, "step": 494000 }, { "epoch": 3.3462808575140754, "grad_norm": 0.33413317799568176, "learning_rate": 4.966537191424859e-05, "loss": 0.3861, "step": 494500 }, { "epoch": 3.3496643568644435, "grad_norm": 0.31402289867401123, "learning_rate": 4.966503356431356e-05, "loss": 0.3848, "step": 495000 }, { "epoch": 3.3530478562148116, "grad_norm": 0.3087741732597351, "learning_rate": 4.966469521437852e-05, "loss": 0.3868, "step": 495500 }, { "epoch": 3.3564313555651797, "grad_norm": 0.2981843054294586, "learning_rate": 4.9664356864443484e-05, "loss": 0.3836, "step": 496000 }, { "epoch": 3.3598148549155478, "grad_norm": 0.3750145137310028, "learning_rate": 4.9664018514508446e-05, "loss": 0.3858, "step": 496500 }, { "epoch": 3.363198354265916, "grad_norm": 0.33225759863853455, "learning_rate": 4.9663680164573415e-05, "loss": 0.3845, "step": 497000 }, { "epoch": 3.366581853616284, "grad_norm": 0.3978593647480011, "learning_rate": 4.966334181463838e-05, "loss": 0.3836, "step": 497500 }, { "epoch": 3.369965352966652, "grad_norm": 0.3188680112361908, "learning_rate": 4.966300346470334e-05, "loss": 0.3846, "step": 498000 }, { "epoch": 3.3733488523170205, "grad_norm": 0.3641412854194641, "learning_rate": 4.96626651147683e-05, "loss": 0.3852, "step": 498500 }, { "epoch": 3.3767323516673886, "grad_norm": 0.3012436628341675, "learning_rate": 4.966232676483326e-05, "loss": 0.387, "step": 499000 }, { "epoch": 3.3801158510177567, "grad_norm": 0.33627980947494507, "learning_rate": 4.9661988414898225e-05, "loss": 0.3851, "step": 499500 }, { "epoch": 3.3834993503681248, "grad_norm": 0.3410860598087311, "learning_rate": 4.966165006496319e-05, "loss": 0.3847, "step": 500000 }, { "epoch": 3.386882849718493, "grad_norm": 0.3147096633911133, "learning_rate": 4.966131171502815e-05, "loss": 0.3857, "step": 500500 }, { "epoch": 3.390266349068861, "grad_norm": 0.3315429389476776, "learning_rate": 4.966097336509312e-05, "loss": 0.3842, "step": 501000 }, { "epoch": 3.393649848419229, "grad_norm": 0.316974014043808, "learning_rate": 4.966063501515808e-05, "loss": 0.3845, "step": 501500 }, { "epoch": 3.397033347769597, "grad_norm": 0.29767683148384094, "learning_rate": 4.966029666522304e-05, "loss": 0.3857, "step": 502000 }, { "epoch": 3.400416847119965, "grad_norm": 0.31953442096710205, "learning_rate": 4.9659958315288005e-05, "loss": 0.3857, "step": 502500 }, { "epoch": 3.4038003464703337, "grad_norm": 0.3329973816871643, "learning_rate": 4.9659619965352974e-05, "loss": 0.3857, "step": 503000 }, { "epoch": 3.4071838458207018, "grad_norm": 0.35081276297569275, "learning_rate": 4.965928161541793e-05, "loss": 0.3842, "step": 503500 }, { "epoch": 3.41056734517107, "grad_norm": 0.34732785820961, "learning_rate": 4.965894326548289e-05, "loss": 0.3859, "step": 504000 }, { "epoch": 3.413950844521438, "grad_norm": 0.3713749647140503, "learning_rate": 4.965860491554786e-05, "loss": 0.3839, "step": 504500 }, { "epoch": 3.417334343871806, "grad_norm": 0.33542150259017944, "learning_rate": 4.965826656561282e-05, "loss": 0.3861, "step": 505000 }, { "epoch": 3.420717843222174, "grad_norm": 0.29569828510284424, "learning_rate": 4.9657928215677784e-05, "loss": 0.3851, "step": 505500 }, { "epoch": 3.424101342572542, "grad_norm": 0.33000361919403076, "learning_rate": 4.965758986574275e-05, "loss": 0.3846, "step": 506000 }, { "epoch": 3.4274848419229103, "grad_norm": 0.3097745180130005, "learning_rate": 4.9657251515807716e-05, "loss": 0.3842, "step": 506500 }, { "epoch": 3.4308683412732783, "grad_norm": 0.33550727367401123, "learning_rate": 4.965691316587268e-05, "loss": 0.3859, "step": 507000 }, { "epoch": 3.4342518406236464, "grad_norm": 0.32614001631736755, "learning_rate": 4.965657481593764e-05, "loss": 0.3856, "step": 507500 }, { "epoch": 3.4376353399740145, "grad_norm": 0.38159069418907166, "learning_rate": 4.96562364660026e-05, "loss": 0.3855, "step": 508000 }, { "epoch": 3.441018839324383, "grad_norm": 0.33263829350471497, "learning_rate": 4.9655898116067564e-05, "loss": 0.3849, "step": 508500 }, { "epoch": 3.444402338674751, "grad_norm": 0.317388653755188, "learning_rate": 4.9655559766132526e-05, "loss": 0.383, "step": 509000 }, { "epoch": 3.447785838025119, "grad_norm": 0.34153813123703003, "learning_rate": 4.965522141619749e-05, "loss": 0.3836, "step": 509500 }, { "epoch": 3.4511693373754873, "grad_norm": 0.3225274384021759, "learning_rate": 4.965488306626245e-05, "loss": 0.384, "step": 510000 }, { "epoch": 3.4545528367258553, "grad_norm": 0.3087396025657654, "learning_rate": 4.965454471632742e-05, "loss": 0.385, "step": 510500 }, { "epoch": 3.4579363360762234, "grad_norm": 0.3249145448207855, "learning_rate": 4.965420636639238e-05, "loss": 0.3857, "step": 511000 }, { "epoch": 3.4613198354265915, "grad_norm": 0.34861987829208374, "learning_rate": 4.9653868016457344e-05, "loss": 0.3838, "step": 511500 }, { "epoch": 3.4647033347769596, "grad_norm": 0.3348096013069153, "learning_rate": 4.9653529666522306e-05, "loss": 0.3846, "step": 512000 }, { "epoch": 3.4680868341273277, "grad_norm": 0.3486187756061554, "learning_rate": 4.9653191316587275e-05, "loss": 0.3833, "step": 512500 }, { "epoch": 3.471470333477696, "grad_norm": 0.31958213448524475, "learning_rate": 4.965285296665223e-05, "loss": 0.3843, "step": 513000 }, { "epoch": 3.4748538328280643, "grad_norm": 0.35458359122276306, "learning_rate": 4.965251461671719e-05, "loss": 0.3845, "step": 513500 }, { "epoch": 3.4782373321784323, "grad_norm": 0.31329068541526794, "learning_rate": 4.965217626678216e-05, "loss": 0.3843, "step": 514000 }, { "epoch": 3.4816208315288004, "grad_norm": 0.3021182715892792, "learning_rate": 4.965183791684712e-05, "loss": 0.3855, "step": 514500 }, { "epoch": 3.4850043308791685, "grad_norm": 0.3269021809101105, "learning_rate": 4.9651499566912085e-05, "loss": 0.3851, "step": 515000 }, { "epoch": 3.4883878302295366, "grad_norm": 0.3389667868614197, "learning_rate": 4.965116121697705e-05, "loss": 0.3859, "step": 515500 }, { "epoch": 3.4917713295799047, "grad_norm": 0.33493050932884216, "learning_rate": 4.9650822867042016e-05, "loss": 0.3839, "step": 516000 }, { "epoch": 3.4951548289302727, "grad_norm": 0.326436311006546, "learning_rate": 4.965048451710698e-05, "loss": 0.3849, "step": 516500 }, { "epoch": 3.498538328280641, "grad_norm": 0.3414977490901947, "learning_rate": 4.965014616717194e-05, "loss": 0.3847, "step": 517000 }, { "epoch": 3.5019218276310093, "grad_norm": 0.31117111444473267, "learning_rate": 4.96498078172369e-05, "loss": 0.384, "step": 517500 }, { "epoch": 3.505305326981377, "grad_norm": 0.31108585000038147, "learning_rate": 4.9649469467301865e-05, "loss": 0.3855, "step": 518000 }, { "epoch": 3.5086888263317455, "grad_norm": 0.3296654522418976, "learning_rate": 4.964913111736683e-05, "loss": 0.3841, "step": 518500 }, { "epoch": 3.5120723256821136, "grad_norm": 0.34887850284576416, "learning_rate": 4.964879276743179e-05, "loss": 0.3848, "step": 519000 }, { "epoch": 3.5154558250324817, "grad_norm": 0.3279761075973511, "learning_rate": 4.964845441749675e-05, "loss": 0.3842, "step": 519500 }, { "epoch": 3.5188393243828497, "grad_norm": 0.3656323254108429, "learning_rate": 4.964811606756172e-05, "loss": 0.3846, "step": 520000 }, { "epoch": 3.522222823733218, "grad_norm": 0.30262666940689087, "learning_rate": 4.964777771762668e-05, "loss": 0.3848, "step": 520500 }, { "epoch": 3.525606323083586, "grad_norm": 0.3520102798938751, "learning_rate": 4.9647439367691644e-05, "loss": 0.3823, "step": 521000 }, { "epoch": 3.528989822433954, "grad_norm": 0.31039148569107056, "learning_rate": 4.9647101017756606e-05, "loss": 0.3834, "step": 521500 }, { "epoch": 3.532373321784322, "grad_norm": 0.36634209752082825, "learning_rate": 4.9646762667821575e-05, "loss": 0.3854, "step": 522000 }, { "epoch": 3.53575682113469, "grad_norm": 0.3379027247428894, "learning_rate": 4.964642431788653e-05, "loss": 0.386, "step": 522500 }, { "epoch": 3.5391403204850587, "grad_norm": 0.37038281559944153, "learning_rate": 4.964608596795149e-05, "loss": 0.3854, "step": 523000 }, { "epoch": 3.5425238198354267, "grad_norm": 0.35718047618865967, "learning_rate": 4.964574761801646e-05, "loss": 0.3852, "step": 523500 }, { "epoch": 3.545907319185795, "grad_norm": 0.3311220705509186, "learning_rate": 4.9645409268081424e-05, "loss": 0.3843, "step": 524000 }, { "epoch": 3.549290818536163, "grad_norm": 0.34186434745788574, "learning_rate": 4.9645070918146386e-05, "loss": 0.3837, "step": 524500 }, { "epoch": 3.552674317886531, "grad_norm": 0.34989869594573975, "learning_rate": 4.964473256821135e-05, "loss": 0.3843, "step": 525000 }, { "epoch": 3.556057817236899, "grad_norm": 0.36629295349121094, "learning_rate": 4.964439421827631e-05, "loss": 0.3852, "step": 525500 }, { "epoch": 3.559441316587267, "grad_norm": 0.2990732192993164, "learning_rate": 4.964405586834128e-05, "loss": 0.3834, "step": 526000 }, { "epoch": 3.5628248159376352, "grad_norm": 0.3365938663482666, "learning_rate": 4.964371751840624e-05, "loss": 0.3839, "step": 526500 }, { "epoch": 3.5662083152880033, "grad_norm": 0.3220049738883972, "learning_rate": 4.96433791684712e-05, "loss": 0.3841, "step": 527000 }, { "epoch": 3.569591814638372, "grad_norm": 0.309627890586853, "learning_rate": 4.9643040818536165e-05, "loss": 0.3862, "step": 527500 }, { "epoch": 3.5729753139887395, "grad_norm": 0.3281790316104889, "learning_rate": 4.964270246860113e-05, "loss": 0.3832, "step": 528000 }, { "epoch": 3.576358813339108, "grad_norm": 0.34404656291007996, "learning_rate": 4.964236411866609e-05, "loss": 0.3846, "step": 528500 }, { "epoch": 3.579742312689476, "grad_norm": 0.3608299791812897, "learning_rate": 4.964202576873105e-05, "loss": 0.3852, "step": 529000 }, { "epoch": 3.583125812039844, "grad_norm": 0.3281520903110504, "learning_rate": 4.964168741879602e-05, "loss": 0.3834, "step": 529500 }, { "epoch": 3.5865093113902122, "grad_norm": 0.3279350697994232, "learning_rate": 4.964134906886098e-05, "loss": 0.3862, "step": 530000 }, { "epoch": 3.5898928107405803, "grad_norm": 0.30105507373809814, "learning_rate": 4.9641010718925945e-05, "loss": 0.3857, "step": 530500 }, { "epoch": 3.5932763100909484, "grad_norm": 0.33526769280433655, "learning_rate": 4.964067236899091e-05, "loss": 0.3844, "step": 531000 }, { "epoch": 3.5966598094413165, "grad_norm": 0.3471981883049011, "learning_rate": 4.9640334019055876e-05, "loss": 0.3844, "step": 531500 }, { "epoch": 3.6000433087916845, "grad_norm": 0.32439425587654114, "learning_rate": 4.963999566912083e-05, "loss": 0.3836, "step": 532000 }, { "epoch": 3.6034268081420526, "grad_norm": 0.28566959500312805, "learning_rate": 4.963965731918579e-05, "loss": 0.3852, "step": 532500 }, { "epoch": 3.606810307492421, "grad_norm": 0.32242149114608765, "learning_rate": 4.9639318969250755e-05, "loss": 0.3842, "step": 533000 }, { "epoch": 3.6101938068427892, "grad_norm": 0.35472533106803894, "learning_rate": 4.9638980619315724e-05, "loss": 0.3819, "step": 533500 }, { "epoch": 3.6135773061931573, "grad_norm": 0.3436533510684967, "learning_rate": 4.9638642269380686e-05, "loss": 0.3846, "step": 534000 }, { "epoch": 3.6169608055435254, "grad_norm": 0.3572445511817932, "learning_rate": 4.963830391944565e-05, "loss": 0.3858, "step": 534500 }, { "epoch": 3.6203443048938935, "grad_norm": 0.3280385434627533, "learning_rate": 4.963796556951061e-05, "loss": 0.385, "step": 535000 }, { "epoch": 3.6237278042442616, "grad_norm": 0.34210699796676636, "learning_rate": 4.963762721957558e-05, "loss": 0.3843, "step": 535500 }, { "epoch": 3.6271113035946296, "grad_norm": 0.312200129032135, "learning_rate": 4.963728886964054e-05, "loss": 0.3861, "step": 536000 }, { "epoch": 3.6304948029449977, "grad_norm": 0.32678160071372986, "learning_rate": 4.9636950519705504e-05, "loss": 0.3859, "step": 536500 }, { "epoch": 3.633878302295366, "grad_norm": 0.3201853930950165, "learning_rate": 4.9636612169770466e-05, "loss": 0.3848, "step": 537000 }, { "epoch": 3.6372618016457343, "grad_norm": 0.33350878953933716, "learning_rate": 4.963627381983543e-05, "loss": 0.3833, "step": 537500 }, { "epoch": 3.640645300996102, "grad_norm": 0.32399922609329224, "learning_rate": 4.963593546990039e-05, "loss": 0.3843, "step": 538000 }, { "epoch": 3.6440288003464705, "grad_norm": 0.3093265891075134, "learning_rate": 4.963559711996535e-05, "loss": 0.3855, "step": 538500 }, { "epoch": 3.6474122996968386, "grad_norm": 0.35990044474601746, "learning_rate": 4.963525877003032e-05, "loss": 0.3828, "step": 539000 }, { "epoch": 3.6507957990472066, "grad_norm": 0.34472528100013733, "learning_rate": 4.963492042009528e-05, "loss": 0.3828, "step": 539500 }, { "epoch": 3.6541792983975747, "grad_norm": 0.3378034830093384, "learning_rate": 4.9634582070160245e-05, "loss": 0.3842, "step": 540000 }, { "epoch": 3.657562797747943, "grad_norm": 0.3017686903476715, "learning_rate": 4.963424372022521e-05, "loss": 0.3845, "step": 540500 }, { "epoch": 3.660946297098311, "grad_norm": 0.3325106203556061, "learning_rate": 4.9633905370290176e-05, "loss": 0.3847, "step": 541000 }, { "epoch": 3.664329796448679, "grad_norm": 0.37856626510620117, "learning_rate": 4.963356702035513e-05, "loss": 0.3842, "step": 541500 }, { "epoch": 3.6677132957990475, "grad_norm": 0.36542338132858276, "learning_rate": 4.9633228670420094e-05, "loss": 0.3855, "step": 542000 }, { "epoch": 3.671096795149415, "grad_norm": 0.3596114218235016, "learning_rate": 4.9632890320485056e-05, "loss": 0.3832, "step": 542500 }, { "epoch": 3.6744802944997836, "grad_norm": 0.344609797000885, "learning_rate": 4.9632551970550025e-05, "loss": 0.3838, "step": 543000 }, { "epoch": 3.6778637938501517, "grad_norm": 0.4544999897480011, "learning_rate": 4.963221362061499e-05, "loss": 0.3851, "step": 543500 }, { "epoch": 3.68124729320052, "grad_norm": 0.3322232961654663, "learning_rate": 4.963187527067995e-05, "loss": 0.3837, "step": 544000 }, { "epoch": 3.684630792550888, "grad_norm": 0.3328924775123596, "learning_rate": 4.963153692074491e-05, "loss": 0.3842, "step": 544500 }, { "epoch": 3.688014291901256, "grad_norm": 0.3372010290622711, "learning_rate": 4.963119857080988e-05, "loss": 0.3856, "step": 545000 }, { "epoch": 3.691397791251624, "grad_norm": 0.29983511567115784, "learning_rate": 4.963086022087484e-05, "loss": 0.3859, "step": 545500 }, { "epoch": 3.694781290601992, "grad_norm": 0.32538631558418274, "learning_rate": 4.9630521870939804e-05, "loss": 0.3855, "step": 546000 }, { "epoch": 3.69816478995236, "grad_norm": 0.3350183963775635, "learning_rate": 4.9630183521004767e-05, "loss": 0.3848, "step": 546500 }, { "epoch": 3.7015482893027283, "grad_norm": 0.3626324534416199, "learning_rate": 4.962984517106973e-05, "loss": 0.3841, "step": 547000 }, { "epoch": 3.704931788653097, "grad_norm": 0.32910341024398804, "learning_rate": 4.962950682113469e-05, "loss": 0.3824, "step": 547500 }, { "epoch": 3.7083152880034644, "grad_norm": 0.3035507798194885, "learning_rate": 4.962916847119965e-05, "loss": 0.3841, "step": 548000 }, { "epoch": 3.711698787353833, "grad_norm": 0.30795642733573914, "learning_rate": 4.962883012126462e-05, "loss": 0.3823, "step": 548500 }, { "epoch": 3.715082286704201, "grad_norm": 0.2978059947490692, "learning_rate": 4.9628491771329584e-05, "loss": 0.384, "step": 549000 }, { "epoch": 3.718465786054569, "grad_norm": 0.38770484924316406, "learning_rate": 4.9628153421394546e-05, "loss": 0.3851, "step": 549500 }, { "epoch": 3.721849285404937, "grad_norm": 0.33769622445106506, "learning_rate": 4.962781507145951e-05, "loss": 0.3843, "step": 550000 }, { "epoch": 3.7252327847553053, "grad_norm": 0.32749757170677185, "learning_rate": 4.962747672152448e-05, "loss": 0.3845, "step": 550500 }, { "epoch": 3.7286162841056734, "grad_norm": 0.30986374616622925, "learning_rate": 4.962713837158944e-05, "loss": 0.3858, "step": 551000 }, { "epoch": 3.7319997834560414, "grad_norm": 0.3359217345714569, "learning_rate": 4.9626800021654395e-05, "loss": 0.3848, "step": 551500 }, { "epoch": 3.73538328280641, "grad_norm": 0.3180493414402008, "learning_rate": 4.962646167171936e-05, "loss": 0.3842, "step": 552000 }, { "epoch": 3.7387667821567776, "grad_norm": 0.34266242384910583, "learning_rate": 4.9626123321784326e-05, "loss": 0.3842, "step": 552500 }, { "epoch": 3.742150281507146, "grad_norm": 0.3216932713985443, "learning_rate": 4.962578497184929e-05, "loss": 0.3834, "step": 553000 }, { "epoch": 3.745533780857514, "grad_norm": 0.3520362079143524, "learning_rate": 4.962544662191425e-05, "loss": 0.3859, "step": 553500 }, { "epoch": 3.7489172802078823, "grad_norm": 0.3680345118045807, "learning_rate": 4.962510827197921e-05, "loss": 0.3844, "step": 554000 }, { "epoch": 3.7523007795582504, "grad_norm": 0.34057295322418213, "learning_rate": 4.962476992204418e-05, "loss": 0.3835, "step": 554500 }, { "epoch": 3.7556842789086184, "grad_norm": 0.3277778625488281, "learning_rate": 4.962443157210914e-05, "loss": 0.3859, "step": 555000 }, { "epoch": 3.7590677782589865, "grad_norm": 0.3285628855228424, "learning_rate": 4.9624093222174105e-05, "loss": 0.3852, "step": 555500 }, { "epoch": 3.7624512776093546, "grad_norm": 0.311026394367218, "learning_rate": 4.962375487223907e-05, "loss": 0.3839, "step": 556000 }, { "epoch": 3.7658347769597227, "grad_norm": 0.33761194348335266, "learning_rate": 4.962341652230403e-05, "loss": 0.3837, "step": 556500 }, { "epoch": 3.7692182763100908, "grad_norm": 0.30678847432136536, "learning_rate": 4.962307817236899e-05, "loss": 0.3831, "step": 557000 }, { "epoch": 3.7726017756604593, "grad_norm": 0.32656219601631165, "learning_rate": 4.9622739822433954e-05, "loss": 0.3839, "step": 557500 }, { "epoch": 3.775985275010827, "grad_norm": 0.3577435314655304, "learning_rate": 4.962240147249892e-05, "loss": 0.3847, "step": 558000 }, { "epoch": 3.7793687743611954, "grad_norm": 0.3245508372783661, "learning_rate": 4.9622063122563885e-05, "loss": 0.3837, "step": 558500 }, { "epoch": 3.7827522737115635, "grad_norm": 0.33766868710517883, "learning_rate": 4.962172477262885e-05, "loss": 0.3849, "step": 559000 }, { "epoch": 3.7861357730619316, "grad_norm": 0.3148922324180603, "learning_rate": 4.962138642269381e-05, "loss": 0.3822, "step": 559500 }, { "epoch": 3.7895192724122997, "grad_norm": 0.34013688564300537, "learning_rate": 4.962104807275878e-05, "loss": 0.3852, "step": 560000 }, { "epoch": 3.7929027717626678, "grad_norm": 0.3306371867656708, "learning_rate": 4.962070972282374e-05, "loss": 0.3835, "step": 560500 }, { "epoch": 3.796286271113036, "grad_norm": 0.30493348836898804, "learning_rate": 4.9620371372888695e-05, "loss": 0.3837, "step": 561000 }, { "epoch": 3.799669770463404, "grad_norm": 0.3560996353626251, "learning_rate": 4.962003302295366e-05, "loss": 0.3833, "step": 561500 }, { "epoch": 3.8030532698137725, "grad_norm": 0.3450899124145508, "learning_rate": 4.9619694673018626e-05, "loss": 0.3852, "step": 562000 }, { "epoch": 3.80643676916414, "grad_norm": 0.3246195912361145, "learning_rate": 4.961935632308359e-05, "loss": 0.3848, "step": 562500 }, { "epoch": 3.8098202685145086, "grad_norm": 0.3165920674800873, "learning_rate": 4.961901797314855e-05, "loss": 0.3836, "step": 563000 }, { "epoch": 3.8132037678648767, "grad_norm": 0.3178715705871582, "learning_rate": 4.961867962321351e-05, "loss": 0.3836, "step": 563500 }, { "epoch": 3.8165872672152448, "grad_norm": 0.3456692397594452, "learning_rate": 4.961834127327848e-05, "loss": 0.3842, "step": 564000 }, { "epoch": 3.819970766565613, "grad_norm": 0.34803831577301025, "learning_rate": 4.9618002923343444e-05, "loss": 0.3841, "step": 564500 }, { "epoch": 3.823354265915981, "grad_norm": 0.3458116948604584, "learning_rate": 4.9617664573408406e-05, "loss": 0.3831, "step": 565000 }, { "epoch": 3.826737765266349, "grad_norm": 0.3486628234386444, "learning_rate": 4.961732622347337e-05, "loss": 0.3867, "step": 565500 }, { "epoch": 3.830121264616717, "grad_norm": 0.34568849205970764, "learning_rate": 4.961698787353833e-05, "loss": 0.3834, "step": 566000 }, { "epoch": 3.833504763967085, "grad_norm": 0.34365609288215637, "learning_rate": 4.961664952360329e-05, "loss": 0.3836, "step": 566500 }, { "epoch": 3.8368882633174533, "grad_norm": 0.5501734614372253, "learning_rate": 4.9616311173668254e-05, "loss": 0.3841, "step": 567000 }, { "epoch": 3.8402717626678218, "grad_norm": 0.3909134268760681, "learning_rate": 4.961597282373322e-05, "loss": 0.3838, "step": 567500 }, { "epoch": 3.84365526201819, "grad_norm": 0.34609946608543396, "learning_rate": 4.9615634473798185e-05, "loss": 0.3841, "step": 568000 }, { "epoch": 3.847038761368558, "grad_norm": 0.33834394812583923, "learning_rate": 4.961529612386315e-05, "loss": 0.384, "step": 568500 }, { "epoch": 3.850422260718926, "grad_norm": 0.32245519757270813, "learning_rate": 4.961495777392811e-05, "loss": 0.3839, "step": 569000 }, { "epoch": 3.853805760069294, "grad_norm": 0.3729417622089386, "learning_rate": 4.961461942399308e-05, "loss": 0.383, "step": 569500 }, { "epoch": 3.857189259419662, "grad_norm": 0.34652623534202576, "learning_rate": 4.961428107405804e-05, "loss": 0.3832, "step": 570000 }, { "epoch": 3.8605727587700303, "grad_norm": 0.33548983931541443, "learning_rate": 4.9613942724122996e-05, "loss": 0.3832, "step": 570500 }, { "epoch": 3.8639562581203983, "grad_norm": 0.3460247814655304, "learning_rate": 4.961360437418796e-05, "loss": 0.3841, "step": 571000 }, { "epoch": 3.8673397574707664, "grad_norm": 0.3622657358646393, "learning_rate": 4.961326602425293e-05, "loss": 0.3851, "step": 571500 }, { "epoch": 3.870723256821135, "grad_norm": 0.32590335607528687, "learning_rate": 4.961292767431789e-05, "loss": 0.3848, "step": 572000 }, { "epoch": 3.8741067561715026, "grad_norm": 0.3382300138473511, "learning_rate": 4.961258932438285e-05, "loss": 0.3819, "step": 572500 }, { "epoch": 3.877490255521871, "grad_norm": 0.33236709237098694, "learning_rate": 4.961225097444781e-05, "loss": 0.3841, "step": 573000 }, { "epoch": 3.880873754872239, "grad_norm": 0.3273625075817108, "learning_rate": 4.961191262451278e-05, "loss": 0.382, "step": 573500 }, { "epoch": 3.8842572542226073, "grad_norm": 0.3772786259651184, "learning_rate": 4.9611574274577744e-05, "loss": 0.383, "step": 574000 }, { "epoch": 3.8876407535729753, "grad_norm": 0.32407230138778687, "learning_rate": 4.9611235924642706e-05, "loss": 0.3854, "step": 574500 }, { "epoch": 3.8910242529233434, "grad_norm": 0.326904296875, "learning_rate": 4.961089757470767e-05, "loss": 0.3824, "step": 575000 }, { "epoch": 3.8944077522737115, "grad_norm": 0.2998929023742676, "learning_rate": 4.961055922477263e-05, "loss": 0.3827, "step": 575500 }, { "epoch": 3.8977912516240796, "grad_norm": 0.3163350224494934, "learning_rate": 4.961022087483759e-05, "loss": 0.3833, "step": 576000 }, { "epoch": 3.9011747509744477, "grad_norm": 0.34520888328552246, "learning_rate": 4.9609882524902555e-05, "loss": 0.3829, "step": 576500 }, { "epoch": 3.9045582503248157, "grad_norm": 0.31207075715065, "learning_rate": 4.9609544174967524e-05, "loss": 0.3825, "step": 577000 }, { "epoch": 3.9079417496751843, "grad_norm": 0.3276323676109314, "learning_rate": 4.9609205825032486e-05, "loss": 0.3828, "step": 577500 }, { "epoch": 3.9113252490255523, "grad_norm": 0.30505678057670593, "learning_rate": 4.960886747509745e-05, "loss": 0.384, "step": 578000 }, { "epoch": 3.9147087483759204, "grad_norm": 0.32663464546203613, "learning_rate": 4.960852912516241e-05, "loss": 0.3835, "step": 578500 }, { "epoch": 3.9180922477262885, "grad_norm": 0.32040658593177795, "learning_rate": 4.960819077522737e-05, "loss": 0.3822, "step": 579000 }, { "epoch": 3.9214757470766566, "grad_norm": 0.32430753111839294, "learning_rate": 4.960785242529234e-05, "loss": 0.3832, "step": 579500 }, { "epoch": 3.9248592464270247, "grad_norm": 0.31457093358039856, "learning_rate": 4.9607514075357296e-05, "loss": 0.3837, "step": 580000 }, { "epoch": 3.9282427457773927, "grad_norm": 0.33417144417762756, "learning_rate": 4.960717572542226e-05, "loss": 0.3838, "step": 580500 }, { "epoch": 3.931626245127761, "grad_norm": 0.3194616436958313, "learning_rate": 4.960683737548723e-05, "loss": 0.3832, "step": 581000 }, { "epoch": 3.935009744478129, "grad_norm": 0.3265133798122406, "learning_rate": 4.960649902555219e-05, "loss": 0.3832, "step": 581500 }, { "epoch": 3.9383932438284974, "grad_norm": 0.3196572959423065, "learning_rate": 4.960616067561715e-05, "loss": 0.3836, "step": 582000 }, { "epoch": 3.941776743178865, "grad_norm": 0.36828818917274475, "learning_rate": 4.9605822325682114e-05, "loss": 0.3837, "step": 582500 }, { "epoch": 3.9451602425292336, "grad_norm": 0.3626387119293213, "learning_rate": 4.960548397574708e-05, "loss": 0.3833, "step": 583000 }, { "epoch": 3.9485437418796017, "grad_norm": 0.3399355709552765, "learning_rate": 4.9605145625812045e-05, "loss": 0.385, "step": 583500 }, { "epoch": 3.9519272412299697, "grad_norm": 0.3553299307823181, "learning_rate": 4.960480727587701e-05, "loss": 0.3848, "step": 584000 }, { "epoch": 3.955310740580338, "grad_norm": 0.3709312379360199, "learning_rate": 4.960446892594197e-05, "loss": 0.3847, "step": 584500 }, { "epoch": 3.958694239930706, "grad_norm": 0.3104972541332245, "learning_rate": 4.960413057600693e-05, "loss": 0.3853, "step": 585000 }, { "epoch": 3.962077739281074, "grad_norm": 0.3424404263496399, "learning_rate": 4.960379222607189e-05, "loss": 0.3837, "step": 585500 }, { "epoch": 3.965461238631442, "grad_norm": 0.3378540277481079, "learning_rate": 4.9603453876136855e-05, "loss": 0.3846, "step": 586000 }, { "epoch": 3.96884473798181, "grad_norm": 0.30796873569488525, "learning_rate": 4.9603115526201824e-05, "loss": 0.3844, "step": 586500 }, { "epoch": 3.9722282373321782, "grad_norm": 0.38140997290611267, "learning_rate": 4.9602777176266786e-05, "loss": 0.3844, "step": 587000 }, { "epoch": 3.9756117366825467, "grad_norm": 0.34148746728897095, "learning_rate": 4.960243882633175e-05, "loss": 0.386, "step": 587500 }, { "epoch": 3.978995236032915, "grad_norm": 0.3145066201686859, "learning_rate": 4.960210047639671e-05, "loss": 0.3833, "step": 588000 }, { "epoch": 3.982378735383283, "grad_norm": 0.3204039931297302, "learning_rate": 4.960176212646167e-05, "loss": 0.3828, "step": 588500 }, { "epoch": 3.985762234733651, "grad_norm": 0.3088099956512451, "learning_rate": 4.960142377652664e-05, "loss": 0.3829, "step": 589000 }, { "epoch": 3.989145734084019, "grad_norm": 0.3536892831325531, "learning_rate": 4.96010854265916e-05, "loss": 0.3833, "step": 589500 }, { "epoch": 3.992529233434387, "grad_norm": 0.36354970932006836, "learning_rate": 4.960074707665656e-05, "loss": 0.3847, "step": 590000 }, { "epoch": 3.9959127327847552, "grad_norm": 0.34147948026657104, "learning_rate": 4.960040872672153e-05, "loss": 0.3833, "step": 590500 }, { "epoch": 3.9992962321351233, "grad_norm": 0.32976600527763367, "learning_rate": 4.960007037678649e-05, "loss": 0.384, "step": 591000 }, { "epoch": 4.0, "eval_accuracy": 0.8540332559426399, "eval_loss": 0.5933773517608643, "eval_runtime": 3392.2025, "eval_samples_per_second": 85.71, "eval_steps_per_second": 5.357, "step": 591104 }, { "epoch": 4.002679731485491, "grad_norm": 0.3177777826786041, "learning_rate": 4.959973202685145e-05, "loss": 0.3825, "step": 591500 }, { "epoch": 4.00606323083586, "grad_norm": 0.3040407598018646, "learning_rate": 4.9599393676916414e-05, "loss": 0.3808, "step": 592000 }, { "epoch": 4.0094467301862275, "grad_norm": 0.3417772948741913, "learning_rate": 4.959905532698138e-05, "loss": 0.3814, "step": 592500 }, { "epoch": 4.012830229536596, "grad_norm": 0.32137414813041687, "learning_rate": 4.9598716977046345e-05, "loss": 0.3813, "step": 593000 }, { "epoch": 4.016213728886964, "grad_norm": 0.3459019362926483, "learning_rate": 4.959837862711131e-05, "loss": 0.3809, "step": 593500 }, { "epoch": 4.019597228237332, "grad_norm": 0.33652251958847046, "learning_rate": 4.959804027717627e-05, "loss": 0.3815, "step": 594000 }, { "epoch": 4.0229807275877, "grad_norm": 0.28225409984588623, "learning_rate": 4.959770192724123e-05, "loss": 0.3816, "step": 594500 }, { "epoch": 4.026364226938068, "grad_norm": 0.32062068581581116, "learning_rate": 4.9597363577306194e-05, "loss": 0.3821, "step": 595000 }, { "epoch": 4.029747726288437, "grad_norm": 0.3513525128364563, "learning_rate": 4.9597025227371156e-05, "loss": 0.3815, "step": 595500 }, { "epoch": 4.0331312256388046, "grad_norm": 0.32809221744537354, "learning_rate": 4.959668687743612e-05, "loss": 0.381, "step": 596000 }, { "epoch": 4.036514724989173, "grad_norm": 0.33766239881515503, "learning_rate": 4.959634852750109e-05, "loss": 0.3808, "step": 596500 }, { "epoch": 4.039898224339541, "grad_norm": 0.311460018157959, "learning_rate": 4.959601017756605e-05, "loss": 0.381, "step": 597000 }, { "epoch": 4.043281723689909, "grad_norm": 0.3221346437931061, "learning_rate": 4.959567182763101e-05, "loss": 0.381, "step": 597500 }, { "epoch": 4.046665223040277, "grad_norm": 0.330782413482666, "learning_rate": 4.9595333477695973e-05, "loss": 0.3805, "step": 598000 }, { "epoch": 4.050048722390645, "grad_norm": 0.33255091309547424, "learning_rate": 4.959499512776094e-05, "loss": 0.3794, "step": 598500 }, { "epoch": 4.053432221741013, "grad_norm": 0.33032065629959106, "learning_rate": 4.95946567778259e-05, "loss": 0.3821, "step": 599000 }, { "epoch": 4.0568157210913816, "grad_norm": 0.35562410950660706, "learning_rate": 4.959431842789086e-05, "loss": 0.3819, "step": 599500 }, { "epoch": 4.06019922044175, "grad_norm": 0.331027626991272, "learning_rate": 4.959398007795583e-05, "loss": 0.383, "step": 600000 }, { "epoch": 4.063582719792118, "grad_norm": 0.31970423460006714, "learning_rate": 4.959364172802079e-05, "loss": 0.383, "step": 600500 }, { "epoch": 4.066966219142486, "grad_norm": 0.36062178015708923, "learning_rate": 4.959330337808575e-05, "loss": 0.3816, "step": 601000 }, { "epoch": 4.070349718492854, "grad_norm": 0.3241523802280426, "learning_rate": 4.9592965028150715e-05, "loss": 0.3824, "step": 601500 }, { "epoch": 4.073733217843222, "grad_norm": 0.3760012686252594, "learning_rate": 4.9592626678215684e-05, "loss": 0.3816, "step": 602000 }, { "epoch": 4.07711671719359, "grad_norm": 0.29808470606803894, "learning_rate": 4.9592288328280646e-05, "loss": 0.3821, "step": 602500 }, { "epoch": 4.080500216543959, "grad_norm": 0.3388458490371704, "learning_rate": 4.959194997834561e-05, "loss": 0.3824, "step": 603000 }, { "epoch": 4.083883715894326, "grad_norm": 0.34288451075553894, "learning_rate": 4.959161162841057e-05, "loss": 0.3826, "step": 603500 }, { "epoch": 4.087267215244695, "grad_norm": 0.35253462195396423, "learning_rate": 4.959127327847553e-05, "loss": 0.3822, "step": 604000 }, { "epoch": 4.090650714595062, "grad_norm": 0.346996009349823, "learning_rate": 4.9590934928540495e-05, "loss": 0.3813, "step": 604500 }, { "epoch": 4.094034213945431, "grad_norm": 0.36608654260635376, "learning_rate": 4.959059657860546e-05, "loss": 0.3808, "step": 605000 }, { "epoch": 4.097417713295799, "grad_norm": 0.38393664360046387, "learning_rate": 4.959025822867042e-05, "loss": 0.3808, "step": 605500 }, { "epoch": 4.100801212646167, "grad_norm": 0.33853015303611755, "learning_rate": 4.958991987873539e-05, "loss": 0.3823, "step": 606000 }, { "epoch": 4.104184711996536, "grad_norm": 0.3143649101257324, "learning_rate": 4.958958152880035e-05, "loss": 0.3818, "step": 606500 }, { "epoch": 4.107568211346903, "grad_norm": 0.365041583776474, "learning_rate": 4.958924317886531e-05, "loss": 0.3809, "step": 607000 }, { "epoch": 4.110951710697272, "grad_norm": 0.36500734090805054, "learning_rate": 4.9588904828930274e-05, "loss": 0.3817, "step": 607500 }, { "epoch": 4.114335210047639, "grad_norm": 0.33343392610549927, "learning_rate": 4.958856647899524e-05, "loss": 0.3812, "step": 608000 }, { "epoch": 4.117718709398008, "grad_norm": 0.2994712293148041, "learning_rate": 4.95882281290602e-05, "loss": 0.3821, "step": 608500 }, { "epoch": 4.1211022087483755, "grad_norm": 0.32676804065704346, "learning_rate": 4.958788977912516e-05, "loss": 0.3823, "step": 609000 }, { "epoch": 4.124485708098744, "grad_norm": 0.3354164659976959, "learning_rate": 4.958755142919013e-05, "loss": 0.3796, "step": 609500 }, { "epoch": 4.127869207449113, "grad_norm": 0.3428443968296051, "learning_rate": 4.958721307925509e-05, "loss": 0.3812, "step": 610000 }, { "epoch": 4.13125270679948, "grad_norm": 0.34081506729125977, "learning_rate": 4.9586874729320054e-05, "loss": 0.3825, "step": 610500 }, { "epoch": 4.134636206149849, "grad_norm": 0.3499056100845337, "learning_rate": 4.9586536379385016e-05, "loss": 0.3819, "step": 611000 }, { "epoch": 4.138019705500216, "grad_norm": 0.32771143317222595, "learning_rate": 4.9586198029449985e-05, "loss": 0.3824, "step": 611500 }, { "epoch": 4.141403204850585, "grad_norm": 0.327528178691864, "learning_rate": 4.958585967951495e-05, "loss": 0.3827, "step": 612000 }, { "epoch": 4.1447867042009525, "grad_norm": 0.3181435465812683, "learning_rate": 4.958552132957991e-05, "loss": 0.381, "step": 612500 }, { "epoch": 4.148170203551321, "grad_norm": 0.3226792514324188, "learning_rate": 4.958518297964487e-05, "loss": 0.3826, "step": 613000 }, { "epoch": 4.151553702901689, "grad_norm": 0.3568819463253021, "learning_rate": 4.958484462970983e-05, "loss": 0.3814, "step": 613500 }, { "epoch": 4.154937202252057, "grad_norm": 0.33348801732063293, "learning_rate": 4.9584506279774795e-05, "loss": 0.3847, "step": 614000 }, { "epoch": 4.158320701602426, "grad_norm": 0.35622504353523254, "learning_rate": 4.958416792983976e-05, "loss": 0.3809, "step": 614500 }, { "epoch": 4.161704200952793, "grad_norm": 0.35121893882751465, "learning_rate": 4.958382957990472e-05, "loss": 0.3818, "step": 615000 }, { "epoch": 4.165087700303162, "grad_norm": 0.34467947483062744, "learning_rate": 4.958349122996969e-05, "loss": 0.3806, "step": 615500 }, { "epoch": 4.1684711996535295, "grad_norm": 0.34680140018463135, "learning_rate": 4.958315288003465e-05, "loss": 0.3813, "step": 616000 }, { "epoch": 4.171854699003898, "grad_norm": 0.31994232535362244, "learning_rate": 4.958281453009961e-05, "loss": 0.3817, "step": 616500 }, { "epoch": 4.175238198354266, "grad_norm": 0.32718199491500854, "learning_rate": 4.9582476180164575e-05, "loss": 0.3822, "step": 617000 }, { "epoch": 4.178621697704634, "grad_norm": 0.31718477606773376, "learning_rate": 4.9582137830229544e-05, "loss": 0.3822, "step": 617500 }, { "epoch": 4.182005197055002, "grad_norm": 0.32777708768844604, "learning_rate": 4.95817994802945e-05, "loss": 0.3817, "step": 618000 }, { "epoch": 4.18538869640537, "grad_norm": 0.32051903009414673, "learning_rate": 4.958146113035946e-05, "loss": 0.381, "step": 618500 }, { "epoch": 4.188772195755738, "grad_norm": 0.33153292536735535, "learning_rate": 4.958112278042443e-05, "loss": 0.3816, "step": 619000 }, { "epoch": 4.1921556951061065, "grad_norm": 0.30980491638183594, "learning_rate": 4.958078443048939e-05, "loss": 0.3821, "step": 619500 }, { "epoch": 4.195539194456475, "grad_norm": 0.3164099454879761, "learning_rate": 4.9580446080554354e-05, "loss": 0.3796, "step": 620000 }, { "epoch": 4.198922693806843, "grad_norm": 0.33522167801856995, "learning_rate": 4.9580107730619316e-05, "loss": 0.382, "step": 620500 }, { "epoch": 4.202306193157211, "grad_norm": 0.3143502175807953, "learning_rate": 4.9579769380684285e-05, "loss": 0.3818, "step": 621000 }, { "epoch": 4.205689692507579, "grad_norm": 0.36703556776046753, "learning_rate": 4.957943103074925e-05, "loss": 0.3823, "step": 621500 }, { "epoch": 4.209073191857947, "grad_norm": 0.32491037249565125, "learning_rate": 4.957909268081421e-05, "loss": 0.3809, "step": 622000 }, { "epoch": 4.212456691208315, "grad_norm": 0.34407299757003784, "learning_rate": 4.957875433087917e-05, "loss": 0.3825, "step": 622500 }, { "epoch": 4.2158401905586835, "grad_norm": 0.3241789937019348, "learning_rate": 4.9578415980944134e-05, "loss": 0.3816, "step": 623000 }, { "epoch": 4.219223689909051, "grad_norm": 0.31797224283218384, "learning_rate": 4.9578077631009096e-05, "loss": 0.3823, "step": 623500 }, { "epoch": 4.22260718925942, "grad_norm": 0.3514759838581085, "learning_rate": 4.957773928107406e-05, "loss": 0.3826, "step": 624000 }, { "epoch": 4.225990688609788, "grad_norm": 0.34980130195617676, "learning_rate": 4.957740093113902e-05, "loss": 0.384, "step": 624500 }, { "epoch": 4.229374187960156, "grad_norm": 0.31096556782722473, "learning_rate": 4.957706258120399e-05, "loss": 0.3817, "step": 625000 }, { "epoch": 4.232757687310524, "grad_norm": 0.34254008531570435, "learning_rate": 4.957672423126895e-05, "loss": 0.3807, "step": 625500 }, { "epoch": 4.236141186660892, "grad_norm": 0.3574650287628174, "learning_rate": 4.957638588133391e-05, "loss": 0.3809, "step": 626000 }, { "epoch": 4.2395246860112605, "grad_norm": 0.3042188882827759, "learning_rate": 4.9576047531398875e-05, "loss": 0.3815, "step": 626500 }, { "epoch": 4.242908185361628, "grad_norm": 0.3633319139480591, "learning_rate": 4.9575709181463844e-05, "loss": 0.3816, "step": 627000 }, { "epoch": 4.246291684711997, "grad_norm": 0.3306909203529358, "learning_rate": 4.95753708315288e-05, "loss": 0.3817, "step": 627500 }, { "epoch": 4.249675184062364, "grad_norm": 0.3724987506866455, "learning_rate": 4.957503248159376e-05, "loss": 0.3822, "step": 628000 }, { "epoch": 4.253058683412733, "grad_norm": 0.36483970284461975, "learning_rate": 4.957469413165873e-05, "loss": 0.3817, "step": 628500 }, { "epoch": 4.2564421827631005, "grad_norm": 0.3343786895275116, "learning_rate": 4.957435578172369e-05, "loss": 0.3822, "step": 629000 }, { "epoch": 4.259825682113469, "grad_norm": 0.3305645287036896, "learning_rate": 4.9574017431788655e-05, "loss": 0.3818, "step": 629500 }, { "epoch": 4.2632091814638375, "grad_norm": 0.32104507088661194, "learning_rate": 4.957367908185362e-05, "loss": 0.3821, "step": 630000 }, { "epoch": 4.266592680814205, "grad_norm": 0.3260861337184906, "learning_rate": 4.9573340731918586e-05, "loss": 0.3824, "step": 630500 }, { "epoch": 4.269976180164574, "grad_norm": 0.31556007266044617, "learning_rate": 4.957300238198355e-05, "loss": 0.3824, "step": 631000 }, { "epoch": 4.273359679514941, "grad_norm": 0.3143795430660248, "learning_rate": 4.957266403204851e-05, "loss": 0.3811, "step": 631500 }, { "epoch": 4.27674317886531, "grad_norm": 0.3627665936946869, "learning_rate": 4.957232568211347e-05, "loss": 0.3836, "step": 632000 }, { "epoch": 4.2801266782156775, "grad_norm": 0.34783482551574707, "learning_rate": 4.9571987332178434e-05, "loss": 0.3831, "step": 632500 }, { "epoch": 4.283510177566046, "grad_norm": 0.3361124098300934, "learning_rate": 4.9571648982243396e-05, "loss": 0.3832, "step": 633000 }, { "epoch": 4.286893676916414, "grad_norm": 0.3182179927825928, "learning_rate": 4.957131063230836e-05, "loss": 0.3819, "step": 633500 }, { "epoch": 4.290277176266782, "grad_norm": 0.33180859684944153, "learning_rate": 4.957097228237332e-05, "loss": 0.3833, "step": 634000 }, { "epoch": 4.293660675617151, "grad_norm": 0.33058416843414307, "learning_rate": 4.957063393243829e-05, "loss": 0.3831, "step": 634500 }, { "epoch": 4.297044174967518, "grad_norm": 0.3244820535182953, "learning_rate": 4.957029558250325e-05, "loss": 0.3803, "step": 635000 }, { "epoch": 4.300427674317887, "grad_norm": 0.3454670011997223, "learning_rate": 4.9569957232568214e-05, "loss": 0.3809, "step": 635500 }, { "epoch": 4.3038111736682545, "grad_norm": 0.3536366820335388, "learning_rate": 4.9569618882633176e-05, "loss": 0.3802, "step": 636000 }, { "epoch": 4.307194673018623, "grad_norm": 0.31699424982070923, "learning_rate": 4.9569280532698145e-05, "loss": 0.3812, "step": 636500 }, { "epoch": 4.310578172368991, "grad_norm": 0.3276674151420593, "learning_rate": 4.95689421827631e-05, "loss": 0.3839, "step": 637000 }, { "epoch": 4.313961671719359, "grad_norm": 0.32476815581321716, "learning_rate": 4.956860383282806e-05, "loss": 0.382, "step": 637500 }, { "epoch": 4.317345171069727, "grad_norm": 0.36478739976882935, "learning_rate": 4.956826548289303e-05, "loss": 0.3825, "step": 638000 }, { "epoch": 4.320728670420095, "grad_norm": 0.39364731311798096, "learning_rate": 4.956792713295799e-05, "loss": 0.3834, "step": 638500 }, { "epoch": 4.324112169770464, "grad_norm": 0.33854612708091736, "learning_rate": 4.9567588783022955e-05, "loss": 0.3816, "step": 639000 }, { "epoch": 4.3274956691208315, "grad_norm": 0.34368497133255005, "learning_rate": 4.956725043308792e-05, "loss": 0.3825, "step": 639500 }, { "epoch": 4.3308791684712, "grad_norm": 0.3576470911502838, "learning_rate": 4.9566912083152887e-05, "loss": 0.3817, "step": 640000 }, { "epoch": 4.334262667821568, "grad_norm": 0.30935782194137573, "learning_rate": 4.956657373321785e-05, "loss": 0.3825, "step": 640500 }, { "epoch": 4.337646167171936, "grad_norm": 0.32550904154777527, "learning_rate": 4.956623538328281e-05, "loss": 0.382, "step": 641000 }, { "epoch": 4.341029666522304, "grad_norm": 0.339445024728775, "learning_rate": 4.956589703334777e-05, "loss": 0.3815, "step": 641500 }, { "epoch": 4.344413165872672, "grad_norm": 0.3440740406513214, "learning_rate": 4.9565558683412735e-05, "loss": 0.3813, "step": 642000 }, { "epoch": 4.34779666522304, "grad_norm": 0.37303054332733154, "learning_rate": 4.95652203334777e-05, "loss": 0.3824, "step": 642500 }, { "epoch": 4.3511801645734085, "grad_norm": 0.31598225235939026, "learning_rate": 4.956488198354266e-05, "loss": 0.3818, "step": 643000 }, { "epoch": 4.354563663923776, "grad_norm": 0.32695379853248596, "learning_rate": 4.956454363360762e-05, "loss": 0.3833, "step": 643500 }, { "epoch": 4.357947163274145, "grad_norm": 0.38666465878486633, "learning_rate": 4.956420528367259e-05, "loss": 0.3804, "step": 644000 }, { "epoch": 4.361330662624513, "grad_norm": 0.31230244040489197, "learning_rate": 4.956386693373755e-05, "loss": 0.3817, "step": 644500 }, { "epoch": 4.364714161974881, "grad_norm": 0.3382791578769684, "learning_rate": 4.9563528583802515e-05, "loss": 0.3808, "step": 645000 }, { "epoch": 4.368097661325249, "grad_norm": 0.3403199315071106, "learning_rate": 4.956319023386748e-05, "loss": 0.3806, "step": 645500 }, { "epoch": 4.371481160675617, "grad_norm": 0.33164751529693604, "learning_rate": 4.9562851883932446e-05, "loss": 0.3832, "step": 646000 }, { "epoch": 4.3748646600259855, "grad_norm": 0.3421335518360138, "learning_rate": 4.95625135339974e-05, "loss": 0.382, "step": 646500 }, { "epoch": 4.378248159376353, "grad_norm": 0.32325634360313416, "learning_rate": 4.956217518406236e-05, "loss": 0.3797, "step": 647000 }, { "epoch": 4.381631658726722, "grad_norm": 0.3200243413448334, "learning_rate": 4.956183683412733e-05, "loss": 0.3803, "step": 647500 }, { "epoch": 4.385015158077089, "grad_norm": 0.320699006319046, "learning_rate": 4.9561498484192294e-05, "loss": 0.381, "step": 648000 }, { "epoch": 4.388398657427458, "grad_norm": 0.32693010568618774, "learning_rate": 4.9561160134257256e-05, "loss": 0.3803, "step": 648500 }, { "epoch": 4.3917821567778255, "grad_norm": 0.317874550819397, "learning_rate": 4.956082178432222e-05, "loss": 0.3809, "step": 649000 }, { "epoch": 4.395165656128194, "grad_norm": 0.34446126222610474, "learning_rate": 4.956048343438719e-05, "loss": 0.3806, "step": 649500 }, { "epoch": 4.3985491554785625, "grad_norm": 0.3517157733440399, "learning_rate": 4.956014508445215e-05, "loss": 0.3819, "step": 650000 }, { "epoch": 4.40193265482893, "grad_norm": 0.35229623317718506, "learning_rate": 4.955980673451711e-05, "loss": 0.3828, "step": 650500 }, { "epoch": 4.405316154179299, "grad_norm": 0.33478739857673645, "learning_rate": 4.9559468384582074e-05, "loss": 0.3833, "step": 651000 }, { "epoch": 4.408699653529666, "grad_norm": 0.324706494808197, "learning_rate": 4.9559130034647036e-05, "loss": 0.3822, "step": 651500 }, { "epoch": 4.412083152880035, "grad_norm": 0.36632421612739563, "learning_rate": 4.9558791684712e-05, "loss": 0.38, "step": 652000 }, { "epoch": 4.4154666522304025, "grad_norm": 0.3096342980861664, "learning_rate": 4.955845333477696e-05, "loss": 0.3823, "step": 652500 }, { "epoch": 4.418850151580771, "grad_norm": 0.35870158672332764, "learning_rate": 4.955811498484192e-05, "loss": 0.3814, "step": 653000 }, { "epoch": 4.422233650931139, "grad_norm": 0.3230837881565094, "learning_rate": 4.955777663490689e-05, "loss": 0.3817, "step": 653500 }, { "epoch": 4.425617150281507, "grad_norm": 0.35571718215942383, "learning_rate": 4.955743828497185e-05, "loss": 0.3805, "step": 654000 }, { "epoch": 4.429000649631876, "grad_norm": 0.32466378808021545, "learning_rate": 4.9557099935036815e-05, "loss": 0.3818, "step": 654500 }, { "epoch": 4.432384148982243, "grad_norm": 0.313757985830307, "learning_rate": 4.955676158510178e-05, "loss": 0.3822, "step": 655000 }, { "epoch": 4.435767648332612, "grad_norm": 0.33871760964393616, "learning_rate": 4.9556423235166746e-05, "loss": 0.3816, "step": 655500 }, { "epoch": 4.4391511476829795, "grad_norm": 0.34044113755226135, "learning_rate": 4.95560848852317e-05, "loss": 0.3828, "step": 656000 }, { "epoch": 4.442534647033348, "grad_norm": 0.33314165472984314, "learning_rate": 4.9555746535296664e-05, "loss": 0.3809, "step": 656500 }, { "epoch": 4.445918146383716, "grad_norm": 0.34168577194213867, "learning_rate": 4.955540818536163e-05, "loss": 0.3824, "step": 657000 }, { "epoch": 4.449301645734084, "grad_norm": 0.3416721522808075, "learning_rate": 4.9555069835426595e-05, "loss": 0.3803, "step": 657500 }, { "epoch": 4.452685145084452, "grad_norm": 0.3214699625968933, "learning_rate": 4.955473148549156e-05, "loss": 0.379, "step": 658000 }, { "epoch": 4.45606864443482, "grad_norm": 0.3384753465652466, "learning_rate": 4.955439313555652e-05, "loss": 0.382, "step": 658500 }, { "epoch": 4.459452143785189, "grad_norm": 0.3610079884529114, "learning_rate": 4.955405478562148e-05, "loss": 0.3822, "step": 659000 }, { "epoch": 4.4628356431355565, "grad_norm": 0.31960728764533997, "learning_rate": 4.955371643568645e-05, "loss": 0.3825, "step": 659500 }, { "epoch": 4.466219142485925, "grad_norm": 0.3292570114135742, "learning_rate": 4.955337808575141e-05, "loss": 0.3806, "step": 660000 }, { "epoch": 4.469602641836293, "grad_norm": 0.32012736797332764, "learning_rate": 4.9553039735816374e-05, "loss": 0.3817, "step": 660500 }, { "epoch": 4.472986141186661, "grad_norm": 0.33299851417541504, "learning_rate": 4.9552701385881336e-05, "loss": 0.3808, "step": 661000 }, { "epoch": 4.476369640537029, "grad_norm": 0.3335835933685303, "learning_rate": 4.95523630359463e-05, "loss": 0.3826, "step": 661500 }, { "epoch": 4.479753139887397, "grad_norm": 0.32194557785987854, "learning_rate": 4.955202468601126e-05, "loss": 0.3832, "step": 662000 }, { "epoch": 4.483136639237765, "grad_norm": 0.3210223615169525, "learning_rate": 4.955168633607622e-05, "loss": 0.3815, "step": 662500 }, { "epoch": 4.4865201385881335, "grad_norm": 0.3858987092971802, "learning_rate": 4.955134798614119e-05, "loss": 0.3813, "step": 663000 }, { "epoch": 4.489903637938501, "grad_norm": 0.3279559314250946, "learning_rate": 4.9551009636206154e-05, "loss": 0.3812, "step": 663500 }, { "epoch": 4.49328713728887, "grad_norm": 0.31878533959388733, "learning_rate": 4.9550671286271116e-05, "loss": 0.3802, "step": 664000 }, { "epoch": 4.496670636639238, "grad_norm": 0.3142237067222595, "learning_rate": 4.955033293633608e-05, "loss": 0.382, "step": 664500 }, { "epoch": 4.500054135989606, "grad_norm": 0.30280283093452454, "learning_rate": 4.954999458640105e-05, "loss": 0.3824, "step": 665000 }, { "epoch": 4.503437635339974, "grad_norm": 0.3333982229232788, "learning_rate": 4.954965623646601e-05, "loss": 0.3809, "step": 665500 }, { "epoch": 4.506821134690342, "grad_norm": 0.34333544969558716, "learning_rate": 4.9549317886530964e-05, "loss": 0.3826, "step": 666000 }, { "epoch": 4.5102046340407105, "grad_norm": 0.3583124577999115, "learning_rate": 4.9548979536595926e-05, "loss": 0.3831, "step": 666500 }, { "epoch": 4.513588133391078, "grad_norm": 0.31179922819137573, "learning_rate": 4.9548641186660895e-05, "loss": 0.3808, "step": 667000 }, { "epoch": 4.516971632741447, "grad_norm": 0.37017133831977844, "learning_rate": 4.954830283672586e-05, "loss": 0.3808, "step": 667500 }, { "epoch": 4.520355132091814, "grad_norm": 0.34011372923851013, "learning_rate": 4.954796448679082e-05, "loss": 0.3815, "step": 668000 }, { "epoch": 4.523738631442183, "grad_norm": 0.33286014199256897, "learning_rate": 4.954762613685578e-05, "loss": 0.382, "step": 668500 }, { "epoch": 4.52712213079255, "grad_norm": 0.3390562832355499, "learning_rate": 4.954728778692075e-05, "loss": 0.3818, "step": 669000 }, { "epoch": 4.530505630142919, "grad_norm": 0.3356575667858124, "learning_rate": 4.954694943698571e-05, "loss": 0.3815, "step": 669500 }, { "epoch": 4.5338891294932875, "grad_norm": 0.34956133365631104, "learning_rate": 4.9546611087050675e-05, "loss": 0.3833, "step": 670000 }, { "epoch": 4.537272628843655, "grad_norm": 0.34817448258399963, "learning_rate": 4.954627273711564e-05, "loss": 0.3799, "step": 670500 }, { "epoch": 4.540656128194024, "grad_norm": 0.36447280645370483, "learning_rate": 4.95459343871806e-05, "loss": 0.3823, "step": 671000 }, { "epoch": 4.544039627544391, "grad_norm": 0.3624734580516815, "learning_rate": 4.954559603724556e-05, "loss": 0.3822, "step": 671500 }, { "epoch": 4.54742312689476, "grad_norm": 0.3400898277759552, "learning_rate": 4.954525768731052e-05, "loss": 0.3806, "step": 672000 }, { "epoch": 4.550806626245127, "grad_norm": 0.3510250747203827, "learning_rate": 4.954491933737549e-05, "loss": 0.3818, "step": 672500 }, { "epoch": 4.554190125595496, "grad_norm": 0.32453998923301697, "learning_rate": 4.9544580987440454e-05, "loss": 0.3825, "step": 673000 }, { "epoch": 4.557573624945864, "grad_norm": 0.3194526731967926, "learning_rate": 4.9544242637505416e-05, "loss": 0.3804, "step": 673500 }, { "epoch": 4.560957124296232, "grad_norm": 0.320198655128479, "learning_rate": 4.954390428757038e-05, "loss": 0.3817, "step": 674000 }, { "epoch": 4.564340623646601, "grad_norm": 0.3281102776527405, "learning_rate": 4.954356593763535e-05, "loss": 0.38, "step": 674500 }, { "epoch": 4.567724122996968, "grad_norm": 0.3195473849773407, "learning_rate": 4.954322758770031e-05, "loss": 0.3822, "step": 675000 }, { "epoch": 4.571107622347337, "grad_norm": 0.31520745158195496, "learning_rate": 4.9542889237765265e-05, "loss": 0.3816, "step": 675500 }, { "epoch": 4.574491121697704, "grad_norm": 0.3593721389770508, "learning_rate": 4.954255088783023e-05, "loss": 0.3809, "step": 676000 }, { "epoch": 4.577874621048073, "grad_norm": 0.3455953598022461, "learning_rate": 4.9542212537895196e-05, "loss": 0.3816, "step": 676500 }, { "epoch": 4.581258120398441, "grad_norm": 0.3602738082408905, "learning_rate": 4.954187418796016e-05, "loss": 0.3813, "step": 677000 }, { "epoch": 4.584641619748809, "grad_norm": 0.33302927017211914, "learning_rate": 4.954153583802512e-05, "loss": 0.3809, "step": 677500 }, { "epoch": 4.588025119099177, "grad_norm": 0.31621211767196655, "learning_rate": 4.954119748809008e-05, "loss": 0.3806, "step": 678000 }, { "epoch": 4.591408618449545, "grad_norm": 0.3371010422706604, "learning_rate": 4.954085913815505e-05, "loss": 0.3813, "step": 678500 }, { "epoch": 4.594792117799914, "grad_norm": 0.3501533269882202, "learning_rate": 4.954052078822001e-05, "loss": 0.3819, "step": 679000 }, { "epoch": 4.5981756171502814, "grad_norm": 0.32012107968330383, "learning_rate": 4.9540182438284975e-05, "loss": 0.3792, "step": 679500 }, { "epoch": 4.60155911650065, "grad_norm": 0.33262699842453003, "learning_rate": 4.953984408834994e-05, "loss": 0.3805, "step": 680000 }, { "epoch": 4.604942615851018, "grad_norm": 0.3417918086051941, "learning_rate": 4.95395057384149e-05, "loss": 0.3801, "step": 680500 }, { "epoch": 4.608326115201386, "grad_norm": 0.3096560537815094, "learning_rate": 4.953916738847986e-05, "loss": 0.3806, "step": 681000 }, { "epoch": 4.611709614551754, "grad_norm": 0.34804731607437134, "learning_rate": 4.9538829038544824e-05, "loss": 0.3811, "step": 681500 }, { "epoch": 4.615093113902122, "grad_norm": 0.3319779932498932, "learning_rate": 4.953849068860979e-05, "loss": 0.3816, "step": 682000 }, { "epoch": 4.61847661325249, "grad_norm": 0.31977346539497375, "learning_rate": 4.9538152338674755e-05, "loss": 0.3818, "step": 682500 }, { "epoch": 4.6218601126028585, "grad_norm": 0.32984334230422974, "learning_rate": 4.953781398873972e-05, "loss": 0.3806, "step": 683000 }, { "epoch": 4.625243611953227, "grad_norm": 0.3309282064437866, "learning_rate": 4.953747563880468e-05, "loss": 0.3818, "step": 683500 }, { "epoch": 4.628627111303595, "grad_norm": 0.3470269441604614, "learning_rate": 4.953713728886965e-05, "loss": 0.3814, "step": 684000 }, { "epoch": 4.632010610653963, "grad_norm": 0.35394591093063354, "learning_rate": 4.953679893893461e-05, "loss": 0.3798, "step": 684500 }, { "epoch": 4.635394110004331, "grad_norm": 0.3420203924179077, "learning_rate": 4.9536460588999566e-05, "loss": 0.381, "step": 685000 }, { "epoch": 4.638777609354699, "grad_norm": 0.32648688554763794, "learning_rate": 4.953612223906453e-05, "loss": 0.3823, "step": 685500 }, { "epoch": 4.642161108705067, "grad_norm": 0.34044334292411804, "learning_rate": 4.9535783889129497e-05, "loss": 0.3815, "step": 686000 }, { "epoch": 4.6455446080554355, "grad_norm": 0.34604334831237793, "learning_rate": 4.953544553919446e-05, "loss": 0.3833, "step": 686500 }, { "epoch": 4.648928107405803, "grad_norm": 0.3389892578125, "learning_rate": 4.953510718925942e-05, "loss": 0.3823, "step": 687000 }, { "epoch": 4.652311606756172, "grad_norm": 0.34680214524269104, "learning_rate": 4.953476883932438e-05, "loss": 0.3818, "step": 687500 }, { "epoch": 4.655695106106539, "grad_norm": 0.3397471308708191, "learning_rate": 4.953443048938935e-05, "loss": 0.3821, "step": 688000 }, { "epoch": 4.659078605456908, "grad_norm": 0.31688883900642395, "learning_rate": 4.9534092139454314e-05, "loss": 0.3824, "step": 688500 }, { "epoch": 4.662462104807275, "grad_norm": 0.32283297181129456, "learning_rate": 4.9533753789519276e-05, "loss": 0.3811, "step": 689000 }, { "epoch": 4.665845604157644, "grad_norm": 0.3226644992828369, "learning_rate": 4.953341543958424e-05, "loss": 0.3808, "step": 689500 }, { "epoch": 4.6692291035080125, "grad_norm": 0.2814815640449524, "learning_rate": 4.95330770896492e-05, "loss": 0.3821, "step": 690000 }, { "epoch": 4.67261260285838, "grad_norm": 0.33735325932502747, "learning_rate": 4.953273873971416e-05, "loss": 0.3805, "step": 690500 }, { "epoch": 4.675996102208749, "grad_norm": 0.3787975013256073, "learning_rate": 4.9532400389779125e-05, "loss": 0.3817, "step": 691000 }, { "epoch": 4.679379601559116, "grad_norm": 0.3654099702835083, "learning_rate": 4.9532062039844093e-05, "loss": 0.3819, "step": 691500 }, { "epoch": 4.682763100909485, "grad_norm": 0.33603522181510925, "learning_rate": 4.9531723689909056e-05, "loss": 0.3818, "step": 692000 }, { "epoch": 4.686146600259852, "grad_norm": 0.3216695487499237, "learning_rate": 4.953138533997402e-05, "loss": 0.3814, "step": 692500 }, { "epoch": 4.689530099610221, "grad_norm": 0.37610581517219543, "learning_rate": 4.953104699003898e-05, "loss": 0.3821, "step": 693000 }, { "epoch": 4.692913598960589, "grad_norm": 0.3684719502925873, "learning_rate": 4.953070864010395e-05, "loss": 0.3813, "step": 693500 }, { "epoch": 4.696297098310957, "grad_norm": 0.32597842812538147, "learning_rate": 4.953037029016891e-05, "loss": 0.3803, "step": 694000 }, { "epoch": 4.699680597661326, "grad_norm": 0.34688663482666016, "learning_rate": 4.9530031940233866e-05, "loss": 0.3814, "step": 694500 }, { "epoch": 4.703064097011693, "grad_norm": 0.32867276668548584, "learning_rate": 4.952969359029883e-05, "loss": 0.3796, "step": 695000 }, { "epoch": 4.706447596362062, "grad_norm": 0.36180058121681213, "learning_rate": 4.95293552403638e-05, "loss": 0.3805, "step": 695500 }, { "epoch": 4.709831095712429, "grad_norm": 0.3275575041770935, "learning_rate": 4.952901689042876e-05, "loss": 0.3819, "step": 696000 }, { "epoch": 4.713214595062798, "grad_norm": 0.3100377917289734, "learning_rate": 4.952867854049372e-05, "loss": 0.3802, "step": 696500 }, { "epoch": 4.716598094413166, "grad_norm": 0.3666843771934509, "learning_rate": 4.9528340190558684e-05, "loss": 0.3811, "step": 697000 }, { "epoch": 4.719981593763534, "grad_norm": 0.34636420011520386, "learning_rate": 4.952800184062365e-05, "loss": 0.3795, "step": 697500 }, { "epoch": 4.723365093113902, "grad_norm": 0.3316555917263031, "learning_rate": 4.9527663490688615e-05, "loss": 0.3804, "step": 698000 }, { "epoch": 4.72674859246427, "grad_norm": 0.3498135209083557, "learning_rate": 4.952732514075358e-05, "loss": 0.3812, "step": 698500 }, { "epoch": 4.730132091814639, "grad_norm": 0.29856714606285095, "learning_rate": 4.952698679081854e-05, "loss": 0.3816, "step": 699000 }, { "epoch": 4.733515591165006, "grad_norm": 0.32720449566841125, "learning_rate": 4.95266484408835e-05, "loss": 0.3817, "step": 699500 }, { "epoch": 4.736899090515375, "grad_norm": 0.2867276668548584, "learning_rate": 4.952631009094846e-05, "loss": 0.3808, "step": 700000 }, { "epoch": 4.740282589865743, "grad_norm": 0.36703354120254517, "learning_rate": 4.9525971741013425e-05, "loss": 0.3811, "step": 700500 }, { "epoch": 4.743666089216111, "grad_norm": 0.3417615592479706, "learning_rate": 4.9525633391078394e-05, "loss": 0.3839, "step": 701000 }, { "epoch": 4.747049588566479, "grad_norm": 0.32866930961608887, "learning_rate": 4.9525295041143356e-05, "loss": 0.3828, "step": 701500 }, { "epoch": 4.750433087916847, "grad_norm": 0.30230823159217834, "learning_rate": 4.952495669120832e-05, "loss": 0.3818, "step": 702000 }, { "epoch": 4.753816587267215, "grad_norm": 0.2984265387058258, "learning_rate": 4.952461834127328e-05, "loss": 0.3819, "step": 702500 }, { "epoch": 4.757200086617583, "grad_norm": 0.3164541721343994, "learning_rate": 4.952427999133825e-05, "loss": 0.38, "step": 703000 }, { "epoch": 4.760583585967952, "grad_norm": 0.3470172882080078, "learning_rate": 4.952394164140321e-05, "loss": 0.3822, "step": 703500 }, { "epoch": 4.76396708531832, "grad_norm": 0.34124675393104553, "learning_rate": 4.952360329146817e-05, "loss": 0.3818, "step": 704000 }, { "epoch": 4.767350584668688, "grad_norm": 0.36501240730285645, "learning_rate": 4.952326494153313e-05, "loss": 0.3804, "step": 704500 }, { "epoch": 4.770734084019056, "grad_norm": 0.3430570363998413, "learning_rate": 4.95229265915981e-05, "loss": 0.3819, "step": 705000 }, { "epoch": 4.774117583369424, "grad_norm": 0.33082473278045654, "learning_rate": 4.952258824166306e-05, "loss": 0.3813, "step": 705500 }, { "epoch": 4.777501082719792, "grad_norm": 0.344489723443985, "learning_rate": 4.952224989172802e-05, "loss": 0.3816, "step": 706000 }, { "epoch": 4.78088458207016, "grad_norm": 0.3429262638092041, "learning_rate": 4.9521911541792984e-05, "loss": 0.382, "step": 706500 }, { "epoch": 4.784268081420528, "grad_norm": 0.32940617203712463, "learning_rate": 4.952157319185795e-05, "loss": 0.3813, "step": 707000 }, { "epoch": 4.787651580770897, "grad_norm": 0.3384110629558563, "learning_rate": 4.9521234841922915e-05, "loss": 0.3819, "step": 707500 }, { "epoch": 4.791035080121265, "grad_norm": 0.3162197172641754, "learning_rate": 4.952089649198788e-05, "loss": 0.3806, "step": 708000 }, { "epoch": 4.794418579471633, "grad_norm": 0.3531859815120697, "learning_rate": 4.952055814205284e-05, "loss": 0.3809, "step": 708500 }, { "epoch": 4.797802078822, "grad_norm": 0.34763363003730774, "learning_rate": 4.95202197921178e-05, "loss": 0.3807, "step": 709000 }, { "epoch": 4.801185578172369, "grad_norm": 0.34678885340690613, "learning_rate": 4.9519881442182764e-05, "loss": 0.3794, "step": 709500 }, { "epoch": 4.804569077522737, "grad_norm": 0.33274465799331665, "learning_rate": 4.9519543092247726e-05, "loss": 0.3799, "step": 710000 }, { "epoch": 4.807952576873105, "grad_norm": 0.339770644903183, "learning_rate": 4.9519204742312695e-05, "loss": 0.3803, "step": 710500 }, { "epoch": 4.811336076223474, "grad_norm": 0.3638891577720642, "learning_rate": 4.951886639237766e-05, "loss": 0.382, "step": 711000 }, { "epoch": 4.814719575573841, "grad_norm": 0.3298690617084503, "learning_rate": 4.951852804244262e-05, "loss": 0.3799, "step": 711500 }, { "epoch": 4.81810307492421, "grad_norm": 0.3221777677536011, "learning_rate": 4.951818969250758e-05, "loss": 0.3823, "step": 712000 }, { "epoch": 4.821486574274577, "grad_norm": 0.3054807186126709, "learning_rate": 4.951785134257254e-05, "loss": 0.3826, "step": 712500 }, { "epoch": 4.824870073624946, "grad_norm": 0.32414326071739197, "learning_rate": 4.951751299263751e-05, "loss": 0.3811, "step": 713000 }, { "epoch": 4.8282535729753135, "grad_norm": 0.3311294615268707, "learning_rate": 4.951717464270247e-05, "loss": 0.382, "step": 713500 }, { "epoch": 4.831637072325682, "grad_norm": 0.3301956355571747, "learning_rate": 4.951683629276743e-05, "loss": 0.3806, "step": 714000 }, { "epoch": 4.835020571676051, "grad_norm": 0.2924399673938751, "learning_rate": 4.95164979428324e-05, "loss": 0.3812, "step": 714500 }, { "epoch": 4.838404071026418, "grad_norm": 0.3669053912162781, "learning_rate": 4.951615959289736e-05, "loss": 0.3808, "step": 715000 }, { "epoch": 4.841787570376787, "grad_norm": 0.3688805401325226, "learning_rate": 4.951582124296232e-05, "loss": 0.381, "step": 715500 }, { "epoch": 4.845171069727154, "grad_norm": 0.3344734311103821, "learning_rate": 4.9515482893027285e-05, "loss": 0.3815, "step": 716000 }, { "epoch": 4.848554569077523, "grad_norm": 0.31872445344924927, "learning_rate": 4.9515144543092254e-05, "loss": 0.3799, "step": 716500 }, { "epoch": 4.8519380684278905, "grad_norm": 0.3189932405948639, "learning_rate": 4.9514806193157216e-05, "loss": 0.3794, "step": 717000 }, { "epoch": 4.855321567778259, "grad_norm": 0.3555068075656891, "learning_rate": 4.951446784322218e-05, "loss": 0.3816, "step": 717500 }, { "epoch": 4.858705067128627, "grad_norm": 0.34589362144470215, "learning_rate": 4.951412949328714e-05, "loss": 0.381, "step": 718000 }, { "epoch": 4.862088566478995, "grad_norm": 0.3822649121284485, "learning_rate": 4.95137911433521e-05, "loss": 0.3804, "step": 718500 }, { "epoch": 4.865472065829364, "grad_norm": 0.3114178478717804, "learning_rate": 4.9513452793417064e-05, "loss": 0.3813, "step": 719000 }, { "epoch": 4.868855565179731, "grad_norm": 0.3315790891647339, "learning_rate": 4.9513114443482026e-05, "loss": 0.3806, "step": 719500 }, { "epoch": 4.8722390645301, "grad_norm": 0.3110921084880829, "learning_rate": 4.9512776093546995e-05, "loss": 0.3812, "step": 720000 }, { "epoch": 4.8756225638804676, "grad_norm": 0.3719049394130707, "learning_rate": 4.951243774361196e-05, "loss": 0.3804, "step": 720500 }, { "epoch": 4.879006063230836, "grad_norm": 0.31962937116622925, "learning_rate": 4.951209939367692e-05, "loss": 0.3824, "step": 721000 }, { "epoch": 4.882389562581204, "grad_norm": 0.34589603543281555, "learning_rate": 4.951176104374188e-05, "loss": 0.3803, "step": 721500 }, { "epoch": 4.885773061931572, "grad_norm": 0.3213373124599457, "learning_rate": 4.9511422693806844e-05, "loss": 0.3817, "step": 722000 }, { "epoch": 4.88915656128194, "grad_norm": 0.33251869678497314, "learning_rate": 4.951108434387181e-05, "loss": 0.379, "step": 722500 }, { "epoch": 4.892540060632308, "grad_norm": 0.3400231897830963, "learning_rate": 4.951074599393677e-05, "loss": 0.3815, "step": 723000 }, { "epoch": 4.895923559982677, "grad_norm": 0.33018866181373596, "learning_rate": 4.951040764400173e-05, "loss": 0.3818, "step": 723500 }, { "epoch": 4.899307059333045, "grad_norm": 0.3175853490829468, "learning_rate": 4.95100692940667e-05, "loss": 0.3809, "step": 724000 }, { "epoch": 4.902690558683413, "grad_norm": 0.3687296509742737, "learning_rate": 4.950973094413166e-05, "loss": 0.3807, "step": 724500 }, { "epoch": 4.906074058033781, "grad_norm": 0.3353497087955475, "learning_rate": 4.950939259419662e-05, "loss": 0.3795, "step": 725000 }, { "epoch": 4.909457557384149, "grad_norm": 0.3679102063179016, "learning_rate": 4.9509054244261585e-05, "loss": 0.3797, "step": 725500 }, { "epoch": 4.912841056734517, "grad_norm": 0.3353082537651062, "learning_rate": 4.9508715894326554e-05, "loss": 0.3807, "step": 726000 }, { "epoch": 4.916224556084885, "grad_norm": 0.332936555147171, "learning_rate": 4.9508377544391516e-05, "loss": 0.381, "step": 726500 }, { "epoch": 4.919608055435253, "grad_norm": 0.33889758586883545, "learning_rate": 4.950803919445648e-05, "loss": 0.3804, "step": 727000 }, { "epoch": 4.922991554785622, "grad_norm": 0.29593414068222046, "learning_rate": 4.950770084452144e-05, "loss": 0.3808, "step": 727500 }, { "epoch": 4.92637505413599, "grad_norm": 0.329371839761734, "learning_rate": 4.95073624945864e-05, "loss": 0.3807, "step": 728000 }, { "epoch": 4.929758553486358, "grad_norm": 0.3529195487499237, "learning_rate": 4.9507024144651365e-05, "loss": 0.3803, "step": 728500 }, { "epoch": 4.933142052836726, "grad_norm": 0.35925522446632385, "learning_rate": 4.950668579471633e-05, "loss": 0.3812, "step": 729000 }, { "epoch": 4.936525552187094, "grad_norm": 0.2887399196624756, "learning_rate": 4.950634744478129e-05, "loss": 0.3813, "step": 729500 }, { "epoch": 4.939909051537462, "grad_norm": 0.3366541564464569, "learning_rate": 4.950600909484626e-05, "loss": 0.3813, "step": 730000 }, { "epoch": 4.94329255088783, "grad_norm": 0.336927592754364, "learning_rate": 4.950567074491122e-05, "loss": 0.3804, "step": 730500 }, { "epoch": 4.946676050238199, "grad_norm": 0.3061399757862091, "learning_rate": 4.950533239497618e-05, "loss": 0.3812, "step": 731000 }, { "epoch": 4.950059549588566, "grad_norm": 0.38005000352859497, "learning_rate": 4.9504994045041144e-05, "loss": 0.3807, "step": 731500 }, { "epoch": 4.953443048938935, "grad_norm": 0.3589371144771576, "learning_rate": 4.950465569510611e-05, "loss": 0.3807, "step": 732000 }, { "epoch": 4.956826548289302, "grad_norm": 0.3375893831253052, "learning_rate": 4.950431734517107e-05, "loss": 0.3817, "step": 732500 }, { "epoch": 4.960210047639671, "grad_norm": 0.31873199343681335, "learning_rate": 4.950397899523603e-05, "loss": 0.3809, "step": 733000 }, { "epoch": 4.9635935469900385, "grad_norm": 0.33864834904670715, "learning_rate": 4.9503640645301e-05, "loss": 0.3794, "step": 733500 }, { "epoch": 4.966977046340407, "grad_norm": 0.3273870646953583, "learning_rate": 4.950330229536596e-05, "loss": 0.3807, "step": 734000 }, { "epoch": 4.970360545690776, "grad_norm": 0.33497095108032227, "learning_rate": 4.9502963945430924e-05, "loss": 0.3824, "step": 734500 }, { "epoch": 4.973744045041143, "grad_norm": 0.33082765340805054, "learning_rate": 4.9502625595495886e-05, "loss": 0.3795, "step": 735000 }, { "epoch": 4.977127544391512, "grad_norm": 0.32591503858566284, "learning_rate": 4.9502287245560855e-05, "loss": 0.3794, "step": 735500 }, { "epoch": 4.980511043741879, "grad_norm": 0.3750314712524414, "learning_rate": 4.950194889562582e-05, "loss": 0.3795, "step": 736000 }, { "epoch": 4.983894543092248, "grad_norm": 0.37751951813697815, "learning_rate": 4.950161054569078e-05, "loss": 0.3816, "step": 736500 }, { "epoch": 4.9872780424426155, "grad_norm": 0.33533382415771484, "learning_rate": 4.950127219575574e-05, "loss": 0.3809, "step": 737000 }, { "epoch": 4.990661541792984, "grad_norm": 0.3547668755054474, "learning_rate": 4.9500933845820703e-05, "loss": 0.3802, "step": 737500 }, { "epoch": 4.994045041143352, "grad_norm": 0.3214530646800995, "learning_rate": 4.9500595495885666e-05, "loss": 0.3816, "step": 738000 }, { "epoch": 4.99742854049372, "grad_norm": 0.3437938392162323, "learning_rate": 4.950025714595063e-05, "loss": 0.3818, "step": 738500 }, { "epoch": 5.0, "eval_accuracy": 0.8552021551800539, "eval_loss": 0.5871427655220032, "eval_runtime": 3408.6048, "eval_samples_per_second": 85.297, "eval_steps_per_second": 5.331, "step": 738880 }, { "epoch": 5.000812039844089, "grad_norm": 0.3535323441028595, "learning_rate": 4.949991879601559e-05, "loss": 0.3779, "step": 739000 }, { "epoch": 5.004195539194456, "grad_norm": 0.34933435916900635, "learning_rate": 4.949958044608056e-05, "loss": 0.3781, "step": 739500 }, { "epoch": 5.007579038544825, "grad_norm": 0.34405067563056946, "learning_rate": 4.949924209614552e-05, "loss": 0.3774, "step": 740000 }, { "epoch": 5.0109625378951925, "grad_norm": 0.3053302764892578, "learning_rate": 4.949890374621048e-05, "loss": 0.3785, "step": 740500 }, { "epoch": 5.014346037245561, "grad_norm": 0.3137829601764679, "learning_rate": 4.9498565396275445e-05, "loss": 0.3784, "step": 741000 }, { "epoch": 5.017729536595929, "grad_norm": 0.35309338569641113, "learning_rate": 4.9498227046340414e-05, "loss": 0.3794, "step": 741500 }, { "epoch": 5.021113035946297, "grad_norm": 0.3512505292892456, "learning_rate": 4.949788869640537e-05, "loss": 0.3785, "step": 742000 }, { "epoch": 5.024496535296665, "grad_norm": 0.33110466599464417, "learning_rate": 4.949755034647033e-05, "loss": 0.3792, "step": 742500 }, { "epoch": 5.027880034647033, "grad_norm": 0.30952101945877075, "learning_rate": 4.94972119965353e-05, "loss": 0.3788, "step": 743000 }, { "epoch": 5.031263533997402, "grad_norm": 0.3484048843383789, "learning_rate": 4.949687364660026e-05, "loss": 0.3799, "step": 743500 }, { "epoch": 5.0346470333477695, "grad_norm": 0.3439270257949829, "learning_rate": 4.9496535296665225e-05, "loss": 0.3795, "step": 744000 }, { "epoch": 5.038030532698138, "grad_norm": 0.3962043821811676, "learning_rate": 4.949619694673019e-05, "loss": 0.3793, "step": 744500 }, { "epoch": 5.041414032048506, "grad_norm": 0.35327818989753723, "learning_rate": 4.9495858596795156e-05, "loss": 0.3789, "step": 745000 }, { "epoch": 5.044797531398874, "grad_norm": 0.3789896070957184, "learning_rate": 4.949552024686012e-05, "loss": 0.3788, "step": 745500 }, { "epoch": 5.048181030749242, "grad_norm": 0.35470277070999146, "learning_rate": 4.949518189692508e-05, "loss": 0.3783, "step": 746000 }, { "epoch": 5.05156453009961, "grad_norm": 0.31829628348350525, "learning_rate": 4.949484354699004e-05, "loss": 0.3789, "step": 746500 }, { "epoch": 5.054948029449978, "grad_norm": 0.3227141499519348, "learning_rate": 4.9494505197055004e-05, "loss": 0.378, "step": 747000 }, { "epoch": 5.0583315288003465, "grad_norm": 0.35483554005622864, "learning_rate": 4.9494166847119966e-05, "loss": 0.3788, "step": 747500 }, { "epoch": 5.061715028150714, "grad_norm": 0.32808616757392883, "learning_rate": 4.949382849718493e-05, "loss": 0.3799, "step": 748000 }, { "epoch": 5.065098527501083, "grad_norm": 0.3302007019519806, "learning_rate": 4.949349014724989e-05, "loss": 0.3773, "step": 748500 }, { "epoch": 5.068482026851451, "grad_norm": 0.336599200963974, "learning_rate": 4.949315179731486e-05, "loss": 0.3788, "step": 749000 }, { "epoch": 5.071865526201819, "grad_norm": 0.32140204310417175, "learning_rate": 4.949281344737982e-05, "loss": 0.3792, "step": 749500 }, { "epoch": 5.075249025552187, "grad_norm": 0.31169798970222473, "learning_rate": 4.9492475097444784e-05, "loss": 0.3789, "step": 750000 }, { "epoch": 5.078632524902555, "grad_norm": 0.36335715651512146, "learning_rate": 4.9492136747509746e-05, "loss": 0.3786, "step": 750500 }, { "epoch": 5.0820160242529235, "grad_norm": 0.3447102904319763, "learning_rate": 4.9491798397574715e-05, "loss": 0.3806, "step": 751000 }, { "epoch": 5.085399523603291, "grad_norm": 0.3479219377040863, "learning_rate": 4.949146004763967e-05, "loss": 0.3796, "step": 751500 }, { "epoch": 5.08878302295366, "grad_norm": 0.3650398850440979, "learning_rate": 4.949112169770463e-05, "loss": 0.3809, "step": 752000 }, { "epoch": 5.092166522304027, "grad_norm": 0.3772086799144745, "learning_rate": 4.94907833477696e-05, "loss": 0.379, "step": 752500 }, { "epoch": 5.095550021654396, "grad_norm": 0.35594046115875244, "learning_rate": 4.949044499783456e-05, "loss": 0.3806, "step": 753000 }, { "epoch": 5.098933521004764, "grad_norm": 0.3071289360523224, "learning_rate": 4.9490106647899525e-05, "loss": 0.3801, "step": 753500 }, { "epoch": 5.102317020355132, "grad_norm": 0.3742023706436157, "learning_rate": 4.948976829796449e-05, "loss": 0.379, "step": 754000 }, { "epoch": 5.1057005197055005, "grad_norm": 0.3852168619632721, "learning_rate": 4.9489429948029456e-05, "loss": 0.3792, "step": 754500 }, { "epoch": 5.109084019055868, "grad_norm": 0.3463808596134186, "learning_rate": 4.948909159809442e-05, "loss": 0.3785, "step": 755000 }, { "epoch": 5.112467518406237, "grad_norm": 0.3639352321624756, "learning_rate": 4.948875324815938e-05, "loss": 0.3786, "step": 755500 }, { "epoch": 5.115851017756604, "grad_norm": 0.33799809217453003, "learning_rate": 4.948841489822434e-05, "loss": 0.3788, "step": 756000 }, { "epoch": 5.119234517106973, "grad_norm": 0.31555867195129395, "learning_rate": 4.9488076548289305e-05, "loss": 0.3795, "step": 756500 }, { "epoch": 5.1226180164573405, "grad_norm": 0.3564879596233368, "learning_rate": 4.948773819835427e-05, "loss": 0.3792, "step": 757000 }, { "epoch": 5.126001515807709, "grad_norm": 0.335040420293808, "learning_rate": 4.948739984841923e-05, "loss": 0.3796, "step": 757500 }, { "epoch": 5.129385015158077, "grad_norm": 0.34456387162208557, "learning_rate": 4.948706149848419e-05, "loss": 0.3798, "step": 758000 }, { "epoch": 5.132768514508445, "grad_norm": 0.36215341091156006, "learning_rate": 4.948672314854916e-05, "loss": 0.3796, "step": 758500 }, { "epoch": 5.136152013858814, "grad_norm": 0.35378921031951904, "learning_rate": 4.948638479861412e-05, "loss": 0.3797, "step": 759000 }, { "epoch": 5.139535513209181, "grad_norm": 0.35382765531539917, "learning_rate": 4.9486046448679084e-05, "loss": 0.3793, "step": 759500 }, { "epoch": 5.14291901255955, "grad_norm": 0.3886372745037079, "learning_rate": 4.9485708098744046e-05, "loss": 0.379, "step": 760000 }, { "epoch": 5.1463025119099175, "grad_norm": 0.3264980912208557, "learning_rate": 4.9485369748809015e-05, "loss": 0.3797, "step": 760500 }, { "epoch": 5.149686011260286, "grad_norm": 0.3303718566894531, "learning_rate": 4.948503139887397e-05, "loss": 0.3791, "step": 761000 }, { "epoch": 5.153069510610654, "grad_norm": 0.35948291420936584, "learning_rate": 4.948469304893893e-05, "loss": 0.379, "step": 761500 }, { "epoch": 5.156453009961022, "grad_norm": 0.3072948157787323, "learning_rate": 4.94843546990039e-05, "loss": 0.3786, "step": 762000 }, { "epoch": 5.15983650931139, "grad_norm": 0.34276217222213745, "learning_rate": 4.9484016349068864e-05, "loss": 0.3791, "step": 762500 }, { "epoch": 5.163220008661758, "grad_norm": 0.3674897253513336, "learning_rate": 4.9483677999133826e-05, "loss": 0.3807, "step": 763000 }, { "epoch": 5.166603508012127, "grad_norm": 0.3624192178249359, "learning_rate": 4.948333964919879e-05, "loss": 0.378, "step": 763500 }, { "epoch": 5.1699870073624945, "grad_norm": 0.3511359691619873, "learning_rate": 4.948300129926376e-05, "loss": 0.3792, "step": 764000 }, { "epoch": 5.173370506712863, "grad_norm": 0.33653759956359863, "learning_rate": 4.948266294932872e-05, "loss": 0.3788, "step": 764500 }, { "epoch": 5.176754006063231, "grad_norm": 0.3576100766658783, "learning_rate": 4.948232459939368e-05, "loss": 0.3806, "step": 765000 }, { "epoch": 5.180137505413599, "grad_norm": 0.33840882778167725, "learning_rate": 4.948198624945864e-05, "loss": 0.3798, "step": 765500 }, { "epoch": 5.183521004763967, "grad_norm": 0.33222416043281555, "learning_rate": 4.9481647899523605e-05, "loss": 0.3788, "step": 766000 }, { "epoch": 5.186904504114335, "grad_norm": 0.3831847310066223, "learning_rate": 4.948130954958857e-05, "loss": 0.3796, "step": 766500 }, { "epoch": 5.190288003464703, "grad_norm": 0.3188963830471039, "learning_rate": 4.948097119965353e-05, "loss": 0.3783, "step": 767000 }, { "epoch": 5.1936715028150715, "grad_norm": 0.3332746624946594, "learning_rate": 4.948063284971849e-05, "loss": 0.3797, "step": 767500 }, { "epoch": 5.19705500216544, "grad_norm": 0.33291539549827576, "learning_rate": 4.948029449978346e-05, "loss": 0.3799, "step": 768000 }, { "epoch": 5.200438501515808, "grad_norm": 0.3115937113761902, "learning_rate": 4.947995614984842e-05, "loss": 0.3793, "step": 768500 }, { "epoch": 5.203822000866176, "grad_norm": 0.3307870030403137, "learning_rate": 4.9479617799913385e-05, "loss": 0.3787, "step": 769000 }, { "epoch": 5.207205500216544, "grad_norm": 0.309565931558609, "learning_rate": 4.947927944997835e-05, "loss": 0.3808, "step": 769500 }, { "epoch": 5.210588999566912, "grad_norm": 0.3607114255428314, "learning_rate": 4.9478941100043316e-05, "loss": 0.3791, "step": 770000 }, { "epoch": 5.21397249891728, "grad_norm": 0.3276194930076599, "learning_rate": 4.947860275010827e-05, "loss": 0.3809, "step": 770500 }, { "epoch": 5.2173559982676485, "grad_norm": 0.3212314546108246, "learning_rate": 4.947826440017323e-05, "loss": 0.3776, "step": 771000 }, { "epoch": 5.220739497618016, "grad_norm": 0.3371840715408325, "learning_rate": 4.94779260502382e-05, "loss": 0.3783, "step": 771500 }, { "epoch": 5.224122996968385, "grad_norm": 0.37716975808143616, "learning_rate": 4.9477587700303164e-05, "loss": 0.3798, "step": 772000 }, { "epoch": 5.227506496318752, "grad_norm": 0.3523920476436615, "learning_rate": 4.9477249350368126e-05, "loss": 0.3795, "step": 772500 }, { "epoch": 5.230889995669121, "grad_norm": 0.31858694553375244, "learning_rate": 4.947691100043309e-05, "loss": 0.3795, "step": 773000 }, { "epoch": 5.234273495019489, "grad_norm": 0.3189753592014313, "learning_rate": 4.947657265049806e-05, "loss": 0.378, "step": 773500 }, { "epoch": 5.237656994369857, "grad_norm": 0.33399447798728943, "learning_rate": 4.947623430056302e-05, "loss": 0.3788, "step": 774000 }, { "epoch": 5.2410404937202255, "grad_norm": 0.34511929750442505, "learning_rate": 4.947589595062798e-05, "loss": 0.3792, "step": 774500 }, { "epoch": 5.244423993070593, "grad_norm": 0.3606366515159607, "learning_rate": 4.9475557600692944e-05, "loss": 0.3822, "step": 775000 }, { "epoch": 5.247807492420962, "grad_norm": 0.32893481850624084, "learning_rate": 4.9475219250757906e-05, "loss": 0.3811, "step": 775500 }, { "epoch": 5.251190991771329, "grad_norm": 0.3491991460323334, "learning_rate": 4.947488090082287e-05, "loss": 0.3787, "step": 776000 }, { "epoch": 5.254574491121698, "grad_norm": 0.3675939440727234, "learning_rate": 4.947454255088783e-05, "loss": 0.3794, "step": 776500 }, { "epoch": 5.2579579904720655, "grad_norm": 0.3883334696292877, "learning_rate": 4.947420420095279e-05, "loss": 0.3795, "step": 777000 }, { "epoch": 5.261341489822434, "grad_norm": 0.34572160243988037, "learning_rate": 4.947386585101776e-05, "loss": 0.3808, "step": 777500 }, { "epoch": 5.264724989172802, "grad_norm": 0.3183426558971405, "learning_rate": 4.947352750108272e-05, "loss": 0.3792, "step": 778000 }, { "epoch": 5.26810848852317, "grad_norm": 0.31833508610725403, "learning_rate": 4.9473189151147686e-05, "loss": 0.3776, "step": 778500 }, { "epoch": 5.271491987873539, "grad_norm": 0.3360603153705597, "learning_rate": 4.947285080121265e-05, "loss": 0.3804, "step": 779000 }, { "epoch": 5.274875487223906, "grad_norm": 0.3228151202201843, "learning_rate": 4.9472512451277617e-05, "loss": 0.3792, "step": 779500 }, { "epoch": 5.278258986574275, "grad_norm": 0.3585045337677002, "learning_rate": 4.947217410134258e-05, "loss": 0.3793, "step": 780000 }, { "epoch": 5.2816424859246425, "grad_norm": 0.3748699426651001, "learning_rate": 4.9471835751407534e-05, "loss": 0.3779, "step": 780500 }, { "epoch": 5.285025985275011, "grad_norm": 0.334375262260437, "learning_rate": 4.94714974014725e-05, "loss": 0.3801, "step": 781000 }, { "epoch": 5.288409484625379, "grad_norm": 0.33239230513572693, "learning_rate": 4.9471159051537465e-05, "loss": 0.3786, "step": 781500 }, { "epoch": 5.291792983975747, "grad_norm": 0.37011659145355225, "learning_rate": 4.947082070160243e-05, "loss": 0.3789, "step": 782000 }, { "epoch": 5.295176483326115, "grad_norm": 0.3369242250919342, "learning_rate": 4.947048235166739e-05, "loss": 0.38, "step": 782500 }, { "epoch": 5.298559982676483, "grad_norm": 0.3385688364505768, "learning_rate": 4.947014400173235e-05, "loss": 0.3783, "step": 783000 }, { "epoch": 5.301943482026852, "grad_norm": 0.3433874249458313, "learning_rate": 4.946980565179732e-05, "loss": 0.3787, "step": 783500 }, { "epoch": 5.3053269813772195, "grad_norm": 0.3598771095275879, "learning_rate": 4.946946730186228e-05, "loss": 0.3788, "step": 784000 }, { "epoch": 5.308710480727588, "grad_norm": 0.2988731861114502, "learning_rate": 4.9469128951927245e-05, "loss": 0.3799, "step": 784500 }, { "epoch": 5.312093980077956, "grad_norm": 0.3294981122016907, "learning_rate": 4.946879060199221e-05, "loss": 0.3793, "step": 785000 }, { "epoch": 5.315477479428324, "grad_norm": 0.32675206661224365, "learning_rate": 4.946845225205717e-05, "loss": 0.3786, "step": 785500 }, { "epoch": 5.318860978778692, "grad_norm": 0.3722259998321533, "learning_rate": 4.946811390212213e-05, "loss": 0.3798, "step": 786000 }, { "epoch": 5.32224447812906, "grad_norm": 0.34133970737457275, "learning_rate": 4.946777555218709e-05, "loss": 0.3803, "step": 786500 }, { "epoch": 5.325627977479428, "grad_norm": 0.3698797821998596, "learning_rate": 4.946743720225206e-05, "loss": 0.3792, "step": 787000 }, { "epoch": 5.3290114768297965, "grad_norm": 0.37338992953300476, "learning_rate": 4.9467098852317024e-05, "loss": 0.3789, "step": 787500 }, { "epoch": 5.332394976180165, "grad_norm": 0.31401678919792175, "learning_rate": 4.9466760502381986e-05, "loss": 0.3787, "step": 788000 }, { "epoch": 5.335778475530533, "grad_norm": 0.338290810585022, "learning_rate": 4.946642215244695e-05, "loss": 0.3793, "step": 788500 }, { "epoch": 5.339161974880901, "grad_norm": 0.32382553815841675, "learning_rate": 4.946608380251192e-05, "loss": 0.3798, "step": 789000 }, { "epoch": 5.342545474231269, "grad_norm": 0.3406978249549866, "learning_rate": 4.946574545257688e-05, "loss": 0.3778, "step": 789500 }, { "epoch": 5.345928973581637, "grad_norm": 0.32253405451774597, "learning_rate": 4.9465407102641835e-05, "loss": 0.3788, "step": 790000 }, { "epoch": 5.349312472932005, "grad_norm": 0.3149709701538086, "learning_rate": 4.9465068752706804e-05, "loss": 0.3784, "step": 790500 }, { "epoch": 5.3526959722823735, "grad_norm": 0.32785189151763916, "learning_rate": 4.9464730402771766e-05, "loss": 0.3807, "step": 791000 }, { "epoch": 5.356079471632741, "grad_norm": 0.36253809928894043, "learning_rate": 4.946439205283673e-05, "loss": 0.3788, "step": 791500 }, { "epoch": 5.35946297098311, "grad_norm": 0.35410332679748535, "learning_rate": 4.946405370290169e-05, "loss": 0.3815, "step": 792000 }, { "epoch": 5.362846470333478, "grad_norm": 0.3683050870895386, "learning_rate": 4.946371535296665e-05, "loss": 0.3799, "step": 792500 }, { "epoch": 5.366229969683846, "grad_norm": 0.3157660961151123, "learning_rate": 4.946337700303162e-05, "loss": 0.3787, "step": 793000 }, { "epoch": 5.369613469034214, "grad_norm": 0.33882051706314087, "learning_rate": 4.946303865309658e-05, "loss": 0.38, "step": 793500 }, { "epoch": 5.372996968384582, "grad_norm": 0.34864357113838196, "learning_rate": 4.9462700303161545e-05, "loss": 0.3782, "step": 794000 }, { "epoch": 5.3763804677349505, "grad_norm": 0.34172624349594116, "learning_rate": 4.946236195322651e-05, "loss": 0.38, "step": 794500 }, { "epoch": 5.379763967085318, "grad_norm": 0.31064727902412415, "learning_rate": 4.946202360329147e-05, "loss": 0.3784, "step": 795000 }, { "epoch": 5.383147466435687, "grad_norm": 0.34635475277900696, "learning_rate": 4.946168525335643e-05, "loss": 0.378, "step": 795500 }, { "epoch": 5.386530965786054, "grad_norm": 0.3327074944972992, "learning_rate": 4.9461346903421394e-05, "loss": 0.3798, "step": 796000 }, { "epoch": 5.389914465136423, "grad_norm": 0.359301894903183, "learning_rate": 4.946100855348636e-05, "loss": 0.3794, "step": 796500 }, { "epoch": 5.39329796448679, "grad_norm": 0.3421958088874817, "learning_rate": 4.9460670203551325e-05, "loss": 0.3796, "step": 797000 }, { "epoch": 5.396681463837159, "grad_norm": 0.337579607963562, "learning_rate": 4.946033185361629e-05, "loss": 0.3795, "step": 797500 }, { "epoch": 5.400064963187527, "grad_norm": 0.4035895764827728, "learning_rate": 4.945999350368125e-05, "loss": 0.3799, "step": 798000 }, { "epoch": 5.403448462537895, "grad_norm": 0.3159906566143036, "learning_rate": 4.945965515374622e-05, "loss": 0.3785, "step": 798500 }, { "epoch": 5.406831961888264, "grad_norm": 0.32930704951286316, "learning_rate": 4.945931680381118e-05, "loss": 0.3795, "step": 799000 }, { "epoch": 5.410215461238631, "grad_norm": 0.3148539066314697, "learning_rate": 4.9458978453876135e-05, "loss": 0.3797, "step": 799500 }, { "epoch": 5.413598960589, "grad_norm": 0.3461363613605499, "learning_rate": 4.94586401039411e-05, "loss": 0.3788, "step": 800000 }, { "epoch": 5.4169824599393674, "grad_norm": 0.3183637261390686, "learning_rate": 4.9458301754006066e-05, "loss": 0.3788, "step": 800500 }, { "epoch": 5.420365959289736, "grad_norm": 0.3579789996147156, "learning_rate": 4.945796340407103e-05, "loss": 0.3792, "step": 801000 }, { "epoch": 5.423749458640104, "grad_norm": 0.3455021381378174, "learning_rate": 4.945762505413599e-05, "loss": 0.3779, "step": 801500 }, { "epoch": 5.427132957990472, "grad_norm": 0.36724910140037537, "learning_rate": 4.945728670420095e-05, "loss": 0.3782, "step": 802000 }, { "epoch": 5.43051645734084, "grad_norm": 0.31053024530410767, "learning_rate": 4.945694835426592e-05, "loss": 0.3785, "step": 802500 }, { "epoch": 5.433899956691208, "grad_norm": 0.40029093623161316, "learning_rate": 4.9456610004330884e-05, "loss": 0.3801, "step": 803000 }, { "epoch": 5.437283456041577, "grad_norm": 0.3455757200717926, "learning_rate": 4.9456271654395846e-05, "loss": 0.3798, "step": 803500 }, { "epoch": 5.4406669553919444, "grad_norm": 0.3255632221698761, "learning_rate": 4.945593330446081e-05, "loss": 0.379, "step": 804000 }, { "epoch": 5.444050454742313, "grad_norm": 0.37028753757476807, "learning_rate": 4.945559495452577e-05, "loss": 0.3789, "step": 804500 }, { "epoch": 5.447433954092681, "grad_norm": 0.3766026496887207, "learning_rate": 4.945525660459073e-05, "loss": 0.3797, "step": 805000 }, { "epoch": 5.450817453443049, "grad_norm": 0.3767787218093872, "learning_rate": 4.9454918254655694e-05, "loss": 0.3803, "step": 805500 }, { "epoch": 5.454200952793417, "grad_norm": 0.33477210998535156, "learning_rate": 4.945457990472066e-05, "loss": 0.3799, "step": 806000 }, { "epoch": 5.457584452143785, "grad_norm": 0.5394200682640076, "learning_rate": 4.9454241554785625e-05, "loss": 0.3804, "step": 806500 }, { "epoch": 5.460967951494153, "grad_norm": 0.3293921947479248, "learning_rate": 4.945390320485059e-05, "loss": 0.3793, "step": 807000 }, { "epoch": 5.4643514508445215, "grad_norm": 0.35536640882492065, "learning_rate": 4.945356485491555e-05, "loss": 0.379, "step": 807500 }, { "epoch": 5.46773495019489, "grad_norm": 0.3433263599872589, "learning_rate": 4.945322650498052e-05, "loss": 0.3798, "step": 808000 }, { "epoch": 5.471118449545258, "grad_norm": 0.34488338232040405, "learning_rate": 4.945288815504548e-05, "loss": 0.3788, "step": 808500 }, { "epoch": 5.474501948895626, "grad_norm": 0.35691148042678833, "learning_rate": 4.9452549805110436e-05, "loss": 0.3781, "step": 809000 }, { "epoch": 5.477885448245994, "grad_norm": 0.3488087058067322, "learning_rate": 4.94522114551754e-05, "loss": 0.3795, "step": 809500 }, { "epoch": 5.481268947596362, "grad_norm": 0.32626432180404663, "learning_rate": 4.945187310524037e-05, "loss": 0.3796, "step": 810000 }, { "epoch": 5.48465244694673, "grad_norm": 0.3325187861919403, "learning_rate": 4.945153475530533e-05, "loss": 0.3793, "step": 810500 }, { "epoch": 5.4880359462970985, "grad_norm": 0.3402871787548065, "learning_rate": 4.945119640537029e-05, "loss": 0.3785, "step": 811000 }, { "epoch": 5.491419445647466, "grad_norm": 0.3130391240119934, "learning_rate": 4.945085805543525e-05, "loss": 0.3797, "step": 811500 }, { "epoch": 5.494802944997835, "grad_norm": 0.3534984290599823, "learning_rate": 4.945051970550022e-05, "loss": 0.3784, "step": 812000 }, { "epoch": 5.498186444348203, "grad_norm": 0.31189948320388794, "learning_rate": 4.9450181355565184e-05, "loss": 0.378, "step": 812500 }, { "epoch": 5.501569943698571, "grad_norm": 0.34218019247055054, "learning_rate": 4.9449843005630146e-05, "loss": 0.3791, "step": 813000 }, { "epoch": 5.504953443048939, "grad_norm": 0.357378751039505, "learning_rate": 4.944950465569511e-05, "loss": 0.3805, "step": 813500 }, { "epoch": 5.508336942399307, "grad_norm": 0.34804004430770874, "learning_rate": 4.944916630576007e-05, "loss": 0.3779, "step": 814000 }, { "epoch": 5.5117204417496755, "grad_norm": 0.36242908239364624, "learning_rate": 4.944882795582503e-05, "loss": 0.3794, "step": 814500 }, { "epoch": 5.515103941100043, "grad_norm": 0.35949084162712097, "learning_rate": 4.9448489605889995e-05, "loss": 0.3783, "step": 815000 }, { "epoch": 5.518487440450412, "grad_norm": 0.3674199879169464, "learning_rate": 4.9448151255954964e-05, "loss": 0.3783, "step": 815500 }, { "epoch": 5.521870939800779, "grad_norm": 0.35882535576820374, "learning_rate": 4.9447812906019926e-05, "loss": 0.3803, "step": 816000 }, { "epoch": 5.525254439151148, "grad_norm": 0.28911158442497253, "learning_rate": 4.944747455608489e-05, "loss": 0.3785, "step": 816500 }, { "epoch": 5.528637938501516, "grad_norm": 0.3618195354938507, "learning_rate": 4.944713620614985e-05, "loss": 0.3804, "step": 817000 }, { "epoch": 5.532021437851884, "grad_norm": 0.34649333357810974, "learning_rate": 4.944679785621482e-05, "loss": 0.3787, "step": 817500 }, { "epoch": 5.535404937202252, "grad_norm": 0.3594622313976288, "learning_rate": 4.944645950627978e-05, "loss": 0.38, "step": 818000 }, { "epoch": 5.53878843655262, "grad_norm": 0.32172903418540955, "learning_rate": 4.9446121156344737e-05, "loss": 0.38, "step": 818500 }, { "epoch": 5.542171935902989, "grad_norm": 0.3536199629306793, "learning_rate": 4.94457828064097e-05, "loss": 0.3797, "step": 819000 }, { "epoch": 5.545555435253356, "grad_norm": 0.3164806663990021, "learning_rate": 4.944544445647467e-05, "loss": 0.3805, "step": 819500 }, { "epoch": 5.548938934603725, "grad_norm": 0.3280503451824188, "learning_rate": 4.944510610653963e-05, "loss": 0.3803, "step": 820000 }, { "epoch": 5.552322433954092, "grad_norm": 0.31813400983810425, "learning_rate": 4.944476775660459e-05, "loss": 0.3803, "step": 820500 }, { "epoch": 5.555705933304461, "grad_norm": 0.3344866633415222, "learning_rate": 4.9444429406669554e-05, "loss": 0.3786, "step": 821000 }, { "epoch": 5.559089432654829, "grad_norm": 0.3649905323982239, "learning_rate": 4.944409105673452e-05, "loss": 0.3782, "step": 821500 }, { "epoch": 5.562472932005197, "grad_norm": 0.3461797833442688, "learning_rate": 4.9443752706799485e-05, "loss": 0.3793, "step": 822000 }, { "epoch": 5.565856431355565, "grad_norm": 0.35041677951812744, "learning_rate": 4.944341435686445e-05, "loss": 0.3789, "step": 822500 }, { "epoch": 5.569239930705933, "grad_norm": 0.31106075644493103, "learning_rate": 4.944307600692941e-05, "loss": 0.3801, "step": 823000 }, { "epoch": 5.572623430056302, "grad_norm": 0.3350198566913605, "learning_rate": 4.944273765699437e-05, "loss": 0.3788, "step": 823500 }, { "epoch": 5.576006929406669, "grad_norm": 0.3699307441711426, "learning_rate": 4.9442399307059333e-05, "loss": 0.3793, "step": 824000 }, { "epoch": 5.579390428757038, "grad_norm": 0.34947749972343445, "learning_rate": 4.9442060957124296e-05, "loss": 0.3795, "step": 824500 }, { "epoch": 5.582773928107406, "grad_norm": 0.320469468832016, "learning_rate": 4.9441722607189264e-05, "loss": 0.3792, "step": 825000 }, { "epoch": 5.586157427457774, "grad_norm": 0.37775009870529175, "learning_rate": 4.9441384257254227e-05, "loss": 0.38, "step": 825500 }, { "epoch": 5.589540926808142, "grad_norm": 0.3509116470813751, "learning_rate": 4.944104590731919e-05, "loss": 0.3784, "step": 826000 }, { "epoch": 5.59292442615851, "grad_norm": 0.3317195475101471, "learning_rate": 4.944070755738415e-05, "loss": 0.3801, "step": 826500 }, { "epoch": 5.596307925508878, "grad_norm": 0.37722665071487427, "learning_rate": 4.944036920744912e-05, "loss": 0.3797, "step": 827000 }, { "epoch": 5.599691424859246, "grad_norm": 0.34442973136901855, "learning_rate": 4.944003085751408e-05, "loss": 0.3789, "step": 827500 }, { "epoch": 5.603074924209615, "grad_norm": 0.37473928928375244, "learning_rate": 4.943969250757904e-05, "loss": 0.3796, "step": 828000 }, { "epoch": 5.606458423559983, "grad_norm": 0.2993427515029907, "learning_rate": 4.9439354157644e-05, "loss": 0.3808, "step": 828500 }, { "epoch": 5.609841922910351, "grad_norm": 0.33846497535705566, "learning_rate": 4.943901580770897e-05, "loss": 0.3789, "step": 829000 }, { "epoch": 5.613225422260719, "grad_norm": 0.34987756609916687, "learning_rate": 4.943867745777393e-05, "loss": 0.3798, "step": 829500 }, { "epoch": 5.616608921611087, "grad_norm": 0.3368557393550873, "learning_rate": 4.943833910783889e-05, "loss": 0.3793, "step": 830000 }, { "epoch": 5.619992420961455, "grad_norm": 0.3821764886379242, "learning_rate": 4.9438000757903855e-05, "loss": 0.3776, "step": 830500 }, { "epoch": 5.623375920311823, "grad_norm": 0.3596230447292328, "learning_rate": 4.9437662407968823e-05, "loss": 0.3798, "step": 831000 }, { "epoch": 5.626759419662191, "grad_norm": 0.35004204511642456, "learning_rate": 4.9437324058033786e-05, "loss": 0.379, "step": 831500 }, { "epoch": 5.63014291901256, "grad_norm": 0.36982133984565735, "learning_rate": 4.943698570809875e-05, "loss": 0.3794, "step": 832000 }, { "epoch": 5.633526418362928, "grad_norm": 0.3210700452327728, "learning_rate": 4.943664735816371e-05, "loss": 0.3781, "step": 832500 }, { "epoch": 5.636909917713296, "grad_norm": 0.3624829351902008, "learning_rate": 4.943630900822867e-05, "loss": 0.3791, "step": 833000 }, { "epoch": 5.640293417063664, "grad_norm": 0.3207123279571533, "learning_rate": 4.9435970658293634e-05, "loss": 0.3776, "step": 833500 }, { "epoch": 5.643676916414032, "grad_norm": 0.33684486150741577, "learning_rate": 4.9435632308358596e-05, "loss": 0.3796, "step": 834000 }, { "epoch": 5.6470604157644, "grad_norm": 0.35455456376075745, "learning_rate": 4.9435293958423565e-05, "loss": 0.3795, "step": 834500 }, { "epoch": 5.650443915114768, "grad_norm": 0.33251824975013733, "learning_rate": 4.943495560848853e-05, "loss": 0.3785, "step": 835000 }, { "epoch": 5.653827414465137, "grad_norm": 0.33744561672210693, "learning_rate": 4.943461725855349e-05, "loss": 0.3786, "step": 835500 }, { "epoch": 5.657210913815504, "grad_norm": 0.3235434591770172, "learning_rate": 4.943427890861845e-05, "loss": 0.3787, "step": 836000 }, { "epoch": 5.660594413165873, "grad_norm": 0.3207637369632721, "learning_rate": 4.943394055868342e-05, "loss": 0.3794, "step": 836500 }, { "epoch": 5.663977912516241, "grad_norm": 0.3127499520778656, "learning_rate": 4.943360220874838e-05, "loss": 0.3792, "step": 837000 }, { "epoch": 5.667361411866609, "grad_norm": 0.3637065589427948, "learning_rate": 4.943326385881334e-05, "loss": 0.3796, "step": 837500 }, { "epoch": 5.6707449112169765, "grad_norm": 0.3559350073337555, "learning_rate": 4.94329255088783e-05, "loss": 0.3787, "step": 838000 }, { "epoch": 5.674128410567345, "grad_norm": 0.41822707653045654, "learning_rate": 4.943258715894327e-05, "loss": 0.3793, "step": 838500 }, { "epoch": 5.677511909917714, "grad_norm": 0.3356015980243683, "learning_rate": 4.943224880900823e-05, "loss": 0.3786, "step": 839000 }, { "epoch": 5.680895409268081, "grad_norm": 0.3275659680366516, "learning_rate": 4.943191045907319e-05, "loss": 0.3781, "step": 839500 }, { "epoch": 5.68427890861845, "grad_norm": 0.32490238547325134, "learning_rate": 4.9431572109138155e-05, "loss": 0.3792, "step": 840000 }, { "epoch": 5.687662407968817, "grad_norm": 0.3571946322917938, "learning_rate": 4.9431233759203124e-05, "loss": 0.379, "step": 840500 }, { "epoch": 5.691045907319186, "grad_norm": 0.3392782211303711, "learning_rate": 4.9430895409268086e-05, "loss": 0.377, "step": 841000 }, { "epoch": 5.6944294066695536, "grad_norm": 0.3352891802787781, "learning_rate": 4.943055705933305e-05, "loss": 0.3813, "step": 841500 }, { "epoch": 5.697812906019922, "grad_norm": 0.3490934371948242, "learning_rate": 4.943021870939801e-05, "loss": 0.3785, "step": 842000 }, { "epoch": 5.70119640537029, "grad_norm": 0.3238186538219452, "learning_rate": 4.942988035946297e-05, "loss": 0.3783, "step": 842500 }, { "epoch": 5.704579904720658, "grad_norm": 0.340578556060791, "learning_rate": 4.9429542009527935e-05, "loss": 0.3805, "step": 843000 }, { "epoch": 5.707963404071027, "grad_norm": 0.32143744826316833, "learning_rate": 4.94292036595929e-05, "loss": 0.3774, "step": 843500 }, { "epoch": 5.711346903421394, "grad_norm": 0.3530731201171875, "learning_rate": 4.9428865309657866e-05, "loss": 0.3793, "step": 844000 }, { "epoch": 5.714730402771763, "grad_norm": 0.3481243848800659, "learning_rate": 4.942852695972283e-05, "loss": 0.3788, "step": 844500 }, { "epoch": 5.7181139021221306, "grad_norm": 0.35261571407318115, "learning_rate": 4.942818860978779e-05, "loss": 0.379, "step": 845000 }, { "epoch": 5.721497401472499, "grad_norm": 0.3564860224723816, "learning_rate": 4.942785025985275e-05, "loss": 0.3783, "step": 845500 }, { "epoch": 5.724880900822867, "grad_norm": 0.3480117917060852, "learning_rate": 4.9427511909917714e-05, "loss": 0.3785, "step": 846000 }, { "epoch": 5.728264400173235, "grad_norm": 0.3417789041996002, "learning_rate": 4.942717355998268e-05, "loss": 0.3795, "step": 846500 }, { "epoch": 5.731647899523603, "grad_norm": 0.3160167932510376, "learning_rate": 4.942683521004764e-05, "loss": 0.3797, "step": 847000 }, { "epoch": 5.735031398873971, "grad_norm": 0.32285985350608826, "learning_rate": 4.94264968601126e-05, "loss": 0.3796, "step": 847500 }, { "epoch": 5.73841489822434, "grad_norm": 0.3177568316459656, "learning_rate": 4.942615851017757e-05, "loss": 0.3794, "step": 848000 }, { "epoch": 5.741798397574708, "grad_norm": 0.3382292687892914, "learning_rate": 4.942582016024253e-05, "loss": 0.3774, "step": 848500 }, { "epoch": 5.745181896925076, "grad_norm": 0.3521333634853363, "learning_rate": 4.9425481810307494e-05, "loss": 0.3801, "step": 849000 }, { "epoch": 5.748565396275444, "grad_norm": 0.3552185595035553, "learning_rate": 4.9425143460372456e-05, "loss": 0.3782, "step": 849500 }, { "epoch": 5.751948895625812, "grad_norm": 0.37381207942962646, "learning_rate": 4.9424805110437425e-05, "loss": 0.3788, "step": 850000 }, { "epoch": 5.75533239497618, "grad_norm": 0.3287741243839264, "learning_rate": 4.942446676050239e-05, "loss": 0.3793, "step": 850500 }, { "epoch": 5.758715894326548, "grad_norm": 0.3196081519126892, "learning_rate": 4.942412841056735e-05, "loss": 0.3785, "step": 851000 }, { "epoch": 5.762099393676916, "grad_norm": 0.31051212549209595, "learning_rate": 4.942379006063231e-05, "loss": 0.379, "step": 851500 }, { "epoch": 5.765482893027285, "grad_norm": 0.3289923667907715, "learning_rate": 4.942345171069727e-05, "loss": 0.3785, "step": 852000 }, { "epoch": 5.768866392377653, "grad_norm": 0.3510921597480774, "learning_rate": 4.9423113360762235e-05, "loss": 0.3798, "step": 852500 }, { "epoch": 5.772249891728021, "grad_norm": 0.32584503293037415, "learning_rate": 4.94227750108272e-05, "loss": 0.3781, "step": 853000 }, { "epoch": 5.775633391078389, "grad_norm": 0.3186558187007904, "learning_rate": 4.9422436660892166e-05, "loss": 0.3799, "step": 853500 }, { "epoch": 5.779016890428757, "grad_norm": 0.34689462184906006, "learning_rate": 4.942209831095713e-05, "loss": 0.3786, "step": 854000 }, { "epoch": 5.782400389779125, "grad_norm": 0.3494212031364441, "learning_rate": 4.942175996102209e-05, "loss": 0.378, "step": 854500 }, { "epoch": 5.785783889129493, "grad_norm": 0.3484705984592438, "learning_rate": 4.942142161108705e-05, "loss": 0.3801, "step": 855000 }, { "epoch": 5.789167388479862, "grad_norm": 0.3759133815765381, "learning_rate": 4.9421083261152015e-05, "loss": 0.3797, "step": 855500 }, { "epoch": 5.792550887830229, "grad_norm": 0.34640076756477356, "learning_rate": 4.9420744911216984e-05, "loss": 0.3809, "step": 856000 }, { "epoch": 5.795934387180598, "grad_norm": 0.3386498689651489, "learning_rate": 4.942040656128194e-05, "loss": 0.3791, "step": 856500 }, { "epoch": 5.799317886530966, "grad_norm": 0.29726603627204895, "learning_rate": 4.94200682113469e-05, "loss": 0.379, "step": 857000 }, { "epoch": 5.802701385881334, "grad_norm": 0.35376840829849243, "learning_rate": 4.941972986141187e-05, "loss": 0.3789, "step": 857500 }, { "epoch": 5.806084885231702, "grad_norm": 0.33722490072250366, "learning_rate": 4.941939151147683e-05, "loss": 0.3788, "step": 858000 }, { "epoch": 5.80946838458207, "grad_norm": 0.36351436376571655, "learning_rate": 4.9419053161541794e-05, "loss": 0.3785, "step": 858500 }, { "epoch": 5.812851883932439, "grad_norm": 0.3795541226863861, "learning_rate": 4.9418714811606756e-05, "loss": 0.3782, "step": 859000 }, { "epoch": 5.816235383282806, "grad_norm": 0.3474687933921814, "learning_rate": 4.9418376461671725e-05, "loss": 0.3783, "step": 859500 }, { "epoch": 5.819618882633175, "grad_norm": 0.3508880138397217, "learning_rate": 4.941803811173669e-05, "loss": 0.3777, "step": 860000 }, { "epoch": 5.823002381983542, "grad_norm": 0.32037851214408875, "learning_rate": 4.941769976180165e-05, "loss": 0.3801, "step": 860500 }, { "epoch": 5.826385881333911, "grad_norm": 0.35430553555488586, "learning_rate": 4.941736141186661e-05, "loss": 0.3777, "step": 861000 }, { "epoch": 5.8297693806842785, "grad_norm": 0.3321610391139984, "learning_rate": 4.9417023061931574e-05, "loss": 0.3774, "step": 861500 }, { "epoch": 5.833152880034647, "grad_norm": 0.31289544701576233, "learning_rate": 4.9416684711996536e-05, "loss": 0.379, "step": 862000 }, { "epoch": 5.836536379385015, "grad_norm": 0.3334125876426697, "learning_rate": 4.94163463620615e-05, "loss": 0.3777, "step": 862500 }, { "epoch": 5.839919878735383, "grad_norm": 0.35061657428741455, "learning_rate": 4.941600801212646e-05, "loss": 0.3774, "step": 863000 }, { "epoch": 5.843303378085752, "grad_norm": 0.3509184420108795, "learning_rate": 4.941566966219143e-05, "loss": 0.3784, "step": 863500 }, { "epoch": 5.846686877436119, "grad_norm": 0.3638246953487396, "learning_rate": 4.941533131225639e-05, "loss": 0.3775, "step": 864000 }, { "epoch": 5.850070376786488, "grad_norm": 0.3070426881313324, "learning_rate": 4.941499296232135e-05, "loss": 0.3773, "step": 864500 }, { "epoch": 5.8534538761368555, "grad_norm": 0.32031184434890747, "learning_rate": 4.9414654612386315e-05, "loss": 0.379, "step": 865000 }, { "epoch": 5.856837375487224, "grad_norm": 0.32593870162963867, "learning_rate": 4.9414316262451284e-05, "loss": 0.3765, "step": 865500 }, { "epoch": 5.860220874837592, "grad_norm": 0.3688930869102478, "learning_rate": 4.941397791251624e-05, "loss": 0.3782, "step": 866000 }, { "epoch": 5.86360437418796, "grad_norm": 0.348285049200058, "learning_rate": 4.94136395625812e-05, "loss": 0.3794, "step": 866500 }, { "epoch": 5.866987873538328, "grad_norm": 0.3707033693790436, "learning_rate": 4.941330121264617e-05, "loss": 0.3792, "step": 867000 }, { "epoch": 5.870371372888696, "grad_norm": 0.38194599747657776, "learning_rate": 4.941296286271113e-05, "loss": 0.3775, "step": 867500 }, { "epoch": 5.873754872239065, "grad_norm": 0.3280165493488312, "learning_rate": 4.9412624512776095e-05, "loss": 0.3796, "step": 868000 }, { "epoch": 5.8771383715894325, "grad_norm": 0.34023532271385193, "learning_rate": 4.941228616284106e-05, "loss": 0.3801, "step": 868500 }, { "epoch": 5.880521870939801, "grad_norm": 0.3545888364315033, "learning_rate": 4.9411947812906026e-05, "loss": 0.3787, "step": 869000 }, { "epoch": 5.883905370290169, "grad_norm": 0.3295728862285614, "learning_rate": 4.941160946297099e-05, "loss": 0.3777, "step": 869500 }, { "epoch": 5.887288869640537, "grad_norm": 0.32660236954689026, "learning_rate": 4.941127111303595e-05, "loss": 0.3797, "step": 870000 }, { "epoch": 5.890672368990905, "grad_norm": 0.36814743280410767, "learning_rate": 4.941093276310091e-05, "loss": 0.38, "step": 870500 }, { "epoch": 5.894055868341273, "grad_norm": 0.3505696952342987, "learning_rate": 4.9410594413165874e-05, "loss": 0.3795, "step": 871000 }, { "epoch": 5.897439367691641, "grad_norm": 0.30137890577316284, "learning_rate": 4.9410256063230837e-05, "loss": 0.3773, "step": 871500 }, { "epoch": 5.9008228670420095, "grad_norm": 0.35418471693992615, "learning_rate": 4.94099177132958e-05, "loss": 0.3791, "step": 872000 }, { "epoch": 5.904206366392378, "grad_norm": 0.3241247236728668, "learning_rate": 4.940957936336076e-05, "loss": 0.3782, "step": 872500 }, { "epoch": 5.907589865742746, "grad_norm": 0.3622310161590576, "learning_rate": 4.940924101342573e-05, "loss": 0.379, "step": 873000 }, { "epoch": 5.910973365093114, "grad_norm": 0.3449992835521698, "learning_rate": 4.940890266349069e-05, "loss": 0.3789, "step": 873500 }, { "epoch": 5.914356864443482, "grad_norm": 0.3535126745700836, "learning_rate": 4.9408564313555654e-05, "loss": 0.3788, "step": 874000 }, { "epoch": 5.91774036379385, "grad_norm": 0.3176920711994171, "learning_rate": 4.9408225963620616e-05, "loss": 0.3779, "step": 874500 }, { "epoch": 5.921123863144218, "grad_norm": 0.34803539514541626, "learning_rate": 4.9407887613685585e-05, "loss": 0.3782, "step": 875000 }, { "epoch": 5.9245073624945865, "grad_norm": 0.31535181403160095, "learning_rate": 4.940754926375054e-05, "loss": 0.3796, "step": 875500 }, { "epoch": 5.927890861844954, "grad_norm": 0.3422606587409973, "learning_rate": 4.94072109138155e-05, "loss": 0.3776, "step": 876000 }, { "epoch": 5.931274361195323, "grad_norm": 0.3442267179489136, "learning_rate": 4.940687256388047e-05, "loss": 0.3779, "step": 876500 }, { "epoch": 5.934657860545691, "grad_norm": 0.3561495840549469, "learning_rate": 4.9406534213945433e-05, "loss": 0.3785, "step": 877000 }, { "epoch": 5.938041359896059, "grad_norm": 0.30205509066581726, "learning_rate": 4.9406195864010396e-05, "loss": 0.3792, "step": 877500 }, { "epoch": 5.941424859246427, "grad_norm": 0.32767045497894287, "learning_rate": 4.940585751407536e-05, "loss": 0.3796, "step": 878000 }, { "epoch": 5.944808358596795, "grad_norm": 0.3319656252861023, "learning_rate": 4.9405519164140327e-05, "loss": 0.3802, "step": 878500 }, { "epoch": 5.9481918579471635, "grad_norm": 0.3690793514251709, "learning_rate": 4.940518081420529e-05, "loss": 0.3787, "step": 879000 }, { "epoch": 5.951575357297531, "grad_norm": 0.32809531688690186, "learning_rate": 4.940484246427025e-05, "loss": 0.3801, "step": 879500 }, { "epoch": 5.9549588566479, "grad_norm": 0.35235708951950073, "learning_rate": 4.940450411433521e-05, "loss": 0.3787, "step": 880000 }, { "epoch": 5.958342355998267, "grad_norm": 0.3218901455402374, "learning_rate": 4.9404165764400175e-05, "loss": 0.381, "step": 880500 }, { "epoch": 5.961725855348636, "grad_norm": 0.34744882583618164, "learning_rate": 4.940382741446514e-05, "loss": 0.3801, "step": 881000 }, { "epoch": 5.965109354699004, "grad_norm": 0.35583585500717163, "learning_rate": 4.94034890645301e-05, "loss": 0.3782, "step": 881500 }, { "epoch": 5.968492854049372, "grad_norm": 0.3385174870491028, "learning_rate": 4.940315071459506e-05, "loss": 0.3798, "step": 882000 }, { "epoch": 5.97187635339974, "grad_norm": 0.3421972393989563, "learning_rate": 4.940281236466003e-05, "loss": 0.3775, "step": 882500 }, { "epoch": 5.975259852750108, "grad_norm": 0.3534969687461853, "learning_rate": 4.940247401472499e-05, "loss": 0.3793, "step": 883000 }, { "epoch": 5.978643352100477, "grad_norm": 0.3438570201396942, "learning_rate": 4.9402135664789955e-05, "loss": 0.3778, "step": 883500 }, { "epoch": 5.982026851450844, "grad_norm": 0.33352580666542053, "learning_rate": 4.940179731485492e-05, "loss": 0.3793, "step": 884000 }, { "epoch": 5.985410350801213, "grad_norm": 0.3402976989746094, "learning_rate": 4.9401458964919886e-05, "loss": 0.3785, "step": 884500 }, { "epoch": 5.9887938501515805, "grad_norm": 0.33715447783470154, "learning_rate": 4.940112061498484e-05, "loss": 0.3801, "step": 885000 }, { "epoch": 5.992177349501949, "grad_norm": 0.38319554924964905, "learning_rate": 4.94007822650498e-05, "loss": 0.3776, "step": 885500 }, { "epoch": 5.995560848852317, "grad_norm": 0.3456675708293915, "learning_rate": 4.940044391511477e-05, "loss": 0.3768, "step": 886000 }, { "epoch": 5.998944348202685, "grad_norm": 0.3558131754398346, "learning_rate": 4.9400105565179734e-05, "loss": 0.38, "step": 886500 }, { "epoch": 6.0, "eval_accuracy": 0.85590746471741, "eval_loss": 0.5863214135169983, "eval_runtime": 3371.7967, "eval_samples_per_second": 86.228, "eval_steps_per_second": 5.389, "step": 886656 }, { "epoch": 6.002327847553054, "grad_norm": 0.38339632749557495, "learning_rate": 4.9399767215244696e-05, "loss": 0.3773, "step": 887000 }, { "epoch": 6.005711346903421, "grad_norm": 0.3409227132797241, "learning_rate": 4.939942886530966e-05, "loss": 0.377, "step": 887500 }, { "epoch": 6.00909484625379, "grad_norm": 0.35882502794265747, "learning_rate": 4.939909051537463e-05, "loss": 0.3754, "step": 888000 }, { "epoch": 6.0124783456041575, "grad_norm": 0.3217174708843231, "learning_rate": 4.939875216543959e-05, "loss": 0.3769, "step": 888500 }, { "epoch": 6.015861844954526, "grad_norm": 0.3731520473957062, "learning_rate": 4.939841381550455e-05, "loss": 0.3763, "step": 889000 }, { "epoch": 6.019245344304894, "grad_norm": 0.34831157326698303, "learning_rate": 4.9398075465569514e-05, "loss": 0.3773, "step": 889500 }, { "epoch": 6.022628843655262, "grad_norm": 0.36046484112739563, "learning_rate": 4.9397737115634476e-05, "loss": 0.3771, "step": 890000 }, { "epoch": 6.02601234300563, "grad_norm": 0.3497769832611084, "learning_rate": 4.939739876569944e-05, "loss": 0.3776, "step": 890500 }, { "epoch": 6.029395842355998, "grad_norm": 0.3198765814304352, "learning_rate": 4.93970604157644e-05, "loss": 0.3771, "step": 891000 }, { "epoch": 6.032779341706366, "grad_norm": 0.3285377323627472, "learning_rate": 4.939672206582936e-05, "loss": 0.3755, "step": 891500 }, { "epoch": 6.0361628410567345, "grad_norm": 0.3330004811286926, "learning_rate": 4.939638371589433e-05, "loss": 0.3773, "step": 892000 }, { "epoch": 6.039546340407103, "grad_norm": 0.3420369029045105, "learning_rate": 4.939604536595929e-05, "loss": 0.3775, "step": 892500 }, { "epoch": 6.042929839757471, "grad_norm": 0.36481571197509766, "learning_rate": 4.9395707016024255e-05, "loss": 0.3773, "step": 893000 }, { "epoch": 6.046313339107839, "grad_norm": 0.34153133630752563, "learning_rate": 4.939536866608922e-05, "loss": 0.3747, "step": 893500 }, { "epoch": 6.049696838458207, "grad_norm": 0.3566543161869049, "learning_rate": 4.9395030316154186e-05, "loss": 0.3758, "step": 894000 }, { "epoch": 6.053080337808575, "grad_norm": 0.3604832887649536, "learning_rate": 4.939469196621914e-05, "loss": 0.3757, "step": 894500 }, { "epoch": 6.056463837158943, "grad_norm": 0.35277846455574036, "learning_rate": 4.9394353616284104e-05, "loss": 0.3771, "step": 895000 }, { "epoch": 6.0598473365093115, "grad_norm": 0.34369081258773804, "learning_rate": 4.939401526634907e-05, "loss": 0.3769, "step": 895500 }, { "epoch": 6.063230835859679, "grad_norm": 0.31082791090011597, "learning_rate": 4.9393676916414035e-05, "loss": 0.3771, "step": 896000 }, { "epoch": 6.066614335210048, "grad_norm": 0.34710177779197693, "learning_rate": 4.9393338566479e-05, "loss": 0.3758, "step": 896500 }, { "epoch": 6.069997834560416, "grad_norm": 0.3685329556465149, "learning_rate": 4.939300021654396e-05, "loss": 0.3777, "step": 897000 }, { "epoch": 6.073381333910784, "grad_norm": 0.30597200989723206, "learning_rate": 4.939266186660893e-05, "loss": 0.3778, "step": 897500 }, { "epoch": 6.076764833261152, "grad_norm": 0.34194767475128174, "learning_rate": 4.939232351667389e-05, "loss": 0.3766, "step": 898000 }, { "epoch": 6.08014833261152, "grad_norm": 0.33248472213745117, "learning_rate": 4.939198516673885e-05, "loss": 0.3761, "step": 898500 }, { "epoch": 6.0835318319618885, "grad_norm": 0.32487326860427856, "learning_rate": 4.9391646816803814e-05, "loss": 0.377, "step": 899000 }, { "epoch": 6.086915331312256, "grad_norm": 0.37924644351005554, "learning_rate": 4.9391308466868776e-05, "loss": 0.3773, "step": 899500 }, { "epoch": 6.090298830662625, "grad_norm": 0.3005082905292511, "learning_rate": 4.939097011693374e-05, "loss": 0.3764, "step": 900000 }, { "epoch": 6.093682330012992, "grad_norm": 0.3714136481285095, "learning_rate": 4.93906317669987e-05, "loss": 0.376, "step": 900500 }, { "epoch": 6.097065829363361, "grad_norm": 0.35402193665504456, "learning_rate": 4.939029341706366e-05, "loss": 0.3782, "step": 901000 }, { "epoch": 6.1004493287137285, "grad_norm": 0.40786847472190857, "learning_rate": 4.938995506712863e-05, "loss": 0.3776, "step": 901500 }, { "epoch": 6.103832828064097, "grad_norm": 0.36258766055107117, "learning_rate": 4.9389616717193594e-05, "loss": 0.3774, "step": 902000 }, { "epoch": 6.1072163274144655, "grad_norm": 0.3297172486782074, "learning_rate": 4.9389278367258556e-05, "loss": 0.3776, "step": 902500 }, { "epoch": 6.110599826764833, "grad_norm": 0.34612369537353516, "learning_rate": 4.938894001732352e-05, "loss": 0.3785, "step": 903000 }, { "epoch": 6.113983326115202, "grad_norm": 0.3194482624530792, "learning_rate": 4.938860166738849e-05, "loss": 0.3777, "step": 903500 }, { "epoch": 6.117366825465569, "grad_norm": 0.3068385720252991, "learning_rate": 4.938826331745345e-05, "loss": 0.3786, "step": 904000 }, { "epoch": 6.120750324815938, "grad_norm": 0.3310864567756653, "learning_rate": 4.9387924967518404e-05, "loss": 0.3769, "step": 904500 }, { "epoch": 6.1241338241663055, "grad_norm": 0.363061785697937, "learning_rate": 4.938758661758337e-05, "loss": 0.3778, "step": 905000 }, { "epoch": 6.127517323516674, "grad_norm": 0.3285543620586395, "learning_rate": 4.9387248267648335e-05, "loss": 0.378, "step": 905500 }, { "epoch": 6.130900822867042, "grad_norm": 0.32650935649871826, "learning_rate": 4.93869099177133e-05, "loss": 0.3756, "step": 906000 }, { "epoch": 6.13428432221741, "grad_norm": 0.36548131704330444, "learning_rate": 4.938657156777826e-05, "loss": 0.3773, "step": 906500 }, { "epoch": 6.137667821567779, "grad_norm": 0.3245476186275482, "learning_rate": 4.938623321784323e-05, "loss": 0.3761, "step": 907000 }, { "epoch": 6.141051320918146, "grad_norm": 0.32007771730422974, "learning_rate": 4.938589486790819e-05, "loss": 0.3772, "step": 907500 }, { "epoch": 6.144434820268515, "grad_norm": 0.3352614939212799, "learning_rate": 4.938555651797315e-05, "loss": 0.3768, "step": 908000 }, { "epoch": 6.1478183196188825, "grad_norm": 0.31153857707977295, "learning_rate": 4.9385218168038115e-05, "loss": 0.376, "step": 908500 }, { "epoch": 6.151201818969251, "grad_norm": 0.36174407601356506, "learning_rate": 4.938487981810308e-05, "loss": 0.3772, "step": 909000 }, { "epoch": 6.154585318319619, "grad_norm": 0.36423182487487793, "learning_rate": 4.938454146816804e-05, "loss": 0.3764, "step": 909500 }, { "epoch": 6.157968817669987, "grad_norm": 0.302528440952301, "learning_rate": 4.9384203118233e-05, "loss": 0.3767, "step": 910000 }, { "epoch": 6.161352317020355, "grad_norm": 0.3159232437610626, "learning_rate": 4.938386476829796e-05, "loss": 0.3763, "step": 910500 }, { "epoch": 6.164735816370723, "grad_norm": 0.3770563304424286, "learning_rate": 4.938352641836293e-05, "loss": 0.3774, "step": 911000 }, { "epoch": 6.168119315721091, "grad_norm": 0.3554304242134094, "learning_rate": 4.9383188068427894e-05, "loss": 0.3776, "step": 911500 }, { "epoch": 6.1715028150714595, "grad_norm": 0.3524431884288788, "learning_rate": 4.9382849718492857e-05, "loss": 0.3774, "step": 912000 }, { "epoch": 6.174886314421828, "grad_norm": 0.3047381639480591, "learning_rate": 4.938251136855782e-05, "loss": 0.3764, "step": 912500 }, { "epoch": 6.178269813772196, "grad_norm": 0.3575003743171692, "learning_rate": 4.938217301862279e-05, "loss": 0.3767, "step": 913000 }, { "epoch": 6.181653313122564, "grad_norm": 0.3069573938846588, "learning_rate": 4.938183466868775e-05, "loss": 0.3755, "step": 913500 }, { "epoch": 6.185036812472932, "grad_norm": 0.3316364884376526, "learning_rate": 4.9381496318752705e-05, "loss": 0.3788, "step": 914000 }, { "epoch": 6.1884203118233, "grad_norm": 0.414703905582428, "learning_rate": 4.9381157968817674e-05, "loss": 0.377, "step": 914500 }, { "epoch": 6.191803811173668, "grad_norm": 0.2995516359806061, "learning_rate": 4.9380819618882636e-05, "loss": 0.3782, "step": 915000 }, { "epoch": 6.1951873105240365, "grad_norm": 0.32982611656188965, "learning_rate": 4.93804812689476e-05, "loss": 0.3773, "step": 915500 }, { "epoch": 6.198570809874404, "grad_norm": 0.3949192464351654, "learning_rate": 4.938014291901256e-05, "loss": 0.3782, "step": 916000 }, { "epoch": 6.201954309224773, "grad_norm": 0.34046921133995056, "learning_rate": 4.937980456907752e-05, "loss": 0.3787, "step": 916500 }, { "epoch": 6.205337808575141, "grad_norm": 0.3586592674255371, "learning_rate": 4.937946621914249e-05, "loss": 0.3768, "step": 917000 }, { "epoch": 6.208721307925509, "grad_norm": 0.31867897510528564, "learning_rate": 4.9379127869207453e-05, "loss": 0.3786, "step": 917500 }, { "epoch": 6.212104807275877, "grad_norm": 0.3539881408214569, "learning_rate": 4.9378789519272416e-05, "loss": 0.3756, "step": 918000 }, { "epoch": 6.215488306626245, "grad_norm": 0.3635047972202301, "learning_rate": 4.937845116933738e-05, "loss": 0.3783, "step": 918500 }, { "epoch": 6.2188718059766135, "grad_norm": 0.3581390082836151, "learning_rate": 4.937811281940234e-05, "loss": 0.3786, "step": 919000 }, { "epoch": 6.222255305326981, "grad_norm": 0.3544600009918213, "learning_rate": 4.93777744694673e-05, "loss": 0.3756, "step": 919500 }, { "epoch": 6.22563880467735, "grad_norm": 0.345814049243927, "learning_rate": 4.9377436119532264e-05, "loss": 0.3783, "step": 920000 }, { "epoch": 6.229022304027717, "grad_norm": 0.3887973129749298, "learning_rate": 4.937709776959723e-05, "loss": 0.3786, "step": 920500 }, { "epoch": 6.232405803378086, "grad_norm": 0.371076762676239, "learning_rate": 4.9376759419662195e-05, "loss": 0.3786, "step": 921000 }, { "epoch": 6.235789302728454, "grad_norm": 0.3317933976650238, "learning_rate": 4.937642106972716e-05, "loss": 0.3763, "step": 921500 }, { "epoch": 6.239172802078822, "grad_norm": 0.36534351110458374, "learning_rate": 4.937608271979212e-05, "loss": 0.3772, "step": 922000 }, { "epoch": 6.2425563014291905, "grad_norm": 0.36879029870033264, "learning_rate": 4.937574436985709e-05, "loss": 0.3769, "step": 922500 }, { "epoch": 6.245939800779558, "grad_norm": 0.36423808336257935, "learning_rate": 4.937540601992205e-05, "loss": 0.3777, "step": 923000 }, { "epoch": 6.249323300129927, "grad_norm": 0.30669546127319336, "learning_rate": 4.9375067669987006e-05, "loss": 0.3779, "step": 923500 }, { "epoch": 6.252706799480294, "grad_norm": 0.38380688428878784, "learning_rate": 4.9374729320051975e-05, "loss": 0.3781, "step": 924000 }, { "epoch": 6.256090298830663, "grad_norm": 0.3103152811527252, "learning_rate": 4.937439097011694e-05, "loss": 0.3781, "step": 924500 }, { "epoch": 6.2594737981810304, "grad_norm": 0.36598241329193115, "learning_rate": 4.93740526201819e-05, "loss": 0.377, "step": 925000 }, { "epoch": 6.262857297531399, "grad_norm": 0.3761679530143738, "learning_rate": 4.937371427024686e-05, "loss": 0.3766, "step": 925500 }, { "epoch": 6.266240796881767, "grad_norm": 0.31851160526275635, "learning_rate": 4.937337592031182e-05, "loss": 0.3769, "step": 926000 }, { "epoch": 6.269624296232135, "grad_norm": 0.32151755690574646, "learning_rate": 4.937303757037679e-05, "loss": 0.3769, "step": 926500 }, { "epoch": 6.273007795582504, "grad_norm": 0.3226669728755951, "learning_rate": 4.9372699220441754e-05, "loss": 0.3786, "step": 927000 }, { "epoch": 6.276391294932871, "grad_norm": 0.35841917991638184, "learning_rate": 4.9372360870506716e-05, "loss": 0.3782, "step": 927500 }, { "epoch": 6.27977479428324, "grad_norm": 0.35071441531181335, "learning_rate": 4.937202252057168e-05, "loss": 0.3771, "step": 928000 }, { "epoch": 6.2831582936336074, "grad_norm": 0.3182418644428253, "learning_rate": 4.937168417063664e-05, "loss": 0.3767, "step": 928500 }, { "epoch": 6.286541792983976, "grad_norm": 0.33331987261772156, "learning_rate": 4.93713458207016e-05, "loss": 0.3777, "step": 929000 }, { "epoch": 6.289925292334344, "grad_norm": 0.3495878577232361, "learning_rate": 4.9371007470766565e-05, "loss": 0.3774, "step": 929500 }, { "epoch": 6.293308791684712, "grad_norm": 0.39855191111564636, "learning_rate": 4.9370669120831534e-05, "loss": 0.3764, "step": 930000 }, { "epoch": 6.29669229103508, "grad_norm": 0.33193016052246094, "learning_rate": 4.9370330770896496e-05, "loss": 0.3778, "step": 930500 }, { "epoch": 6.300075790385448, "grad_norm": 0.32373687624931335, "learning_rate": 4.936999242096146e-05, "loss": 0.377, "step": 931000 }, { "epoch": 6.303459289735816, "grad_norm": 0.32472047209739685, "learning_rate": 4.936965407102642e-05, "loss": 0.3763, "step": 931500 }, { "epoch": 6.3068427890861845, "grad_norm": 0.37571972608566284, "learning_rate": 4.936931572109139e-05, "loss": 0.3768, "step": 932000 }, { "epoch": 6.310226288436553, "grad_norm": 0.3355121910572052, "learning_rate": 4.936897737115635e-05, "loss": 0.3767, "step": 932500 }, { "epoch": 6.313609787786921, "grad_norm": 0.340402752161026, "learning_rate": 4.9368639021221306e-05, "loss": 0.378, "step": 933000 }, { "epoch": 6.316993287137289, "grad_norm": 0.36386340856552124, "learning_rate": 4.936830067128627e-05, "loss": 0.3771, "step": 933500 }, { "epoch": 6.320376786487657, "grad_norm": 0.3460202217102051, "learning_rate": 4.936796232135124e-05, "loss": 0.3762, "step": 934000 }, { "epoch": 6.323760285838025, "grad_norm": 0.3330344557762146, "learning_rate": 4.93676239714162e-05, "loss": 0.3775, "step": 934500 }, { "epoch": 6.327143785188393, "grad_norm": 0.3709957003593445, "learning_rate": 4.936728562148116e-05, "loss": 0.377, "step": 935000 }, { "epoch": 6.3305272845387615, "grad_norm": 0.3179211914539337, "learning_rate": 4.9366947271546124e-05, "loss": 0.3779, "step": 935500 }, { "epoch": 6.333910783889129, "grad_norm": 0.35370567440986633, "learning_rate": 4.936660892161109e-05, "loss": 0.3781, "step": 936000 }, { "epoch": 6.337294283239498, "grad_norm": 0.3791564404964447, "learning_rate": 4.9366270571676055e-05, "loss": 0.3784, "step": 936500 }, { "epoch": 6.340677782589866, "grad_norm": 0.304982453584671, "learning_rate": 4.936593222174102e-05, "loss": 0.3783, "step": 937000 }, { "epoch": 6.344061281940234, "grad_norm": 0.3278897702693939, "learning_rate": 4.936559387180598e-05, "loss": 0.3774, "step": 937500 }, { "epoch": 6.347444781290602, "grad_norm": 0.333046019077301, "learning_rate": 4.936525552187094e-05, "loss": 0.3774, "step": 938000 }, { "epoch": 6.35082828064097, "grad_norm": 0.34410935640335083, "learning_rate": 4.93649171719359e-05, "loss": 0.3776, "step": 938500 }, { "epoch": 6.3542117799913385, "grad_norm": 0.3665798604488373, "learning_rate": 4.9364578822000865e-05, "loss": 0.3796, "step": 939000 }, { "epoch": 6.357595279341706, "grad_norm": 0.31881019473075867, "learning_rate": 4.9364240472065834e-05, "loss": 0.3764, "step": 939500 }, { "epoch": 6.360978778692075, "grad_norm": 0.33913347125053406, "learning_rate": 4.9363902122130796e-05, "loss": 0.3768, "step": 940000 }, { "epoch": 6.364362278042442, "grad_norm": 0.3458954691886902, "learning_rate": 4.936356377219576e-05, "loss": 0.3763, "step": 940500 }, { "epoch": 6.367745777392811, "grad_norm": 0.35770922899246216, "learning_rate": 4.936322542226072e-05, "loss": 0.3779, "step": 941000 }, { "epoch": 6.371129276743179, "grad_norm": 0.3633727431297302, "learning_rate": 4.936288707232569e-05, "loss": 0.3766, "step": 941500 }, { "epoch": 6.374512776093547, "grad_norm": 0.35198426246643066, "learning_rate": 4.936254872239065e-05, "loss": 0.3774, "step": 942000 }, { "epoch": 6.3778962754439155, "grad_norm": 0.3605923652648926, "learning_rate": 4.936221037245561e-05, "loss": 0.3784, "step": 942500 }, { "epoch": 6.381279774794283, "grad_norm": 0.3893333077430725, "learning_rate": 4.936187202252057e-05, "loss": 0.376, "step": 943000 }, { "epoch": 6.384663274144652, "grad_norm": 0.34985458850860596, "learning_rate": 4.936153367258554e-05, "loss": 0.3777, "step": 943500 }, { "epoch": 6.388046773495019, "grad_norm": 0.3702256977558136, "learning_rate": 4.93611953226505e-05, "loss": 0.3772, "step": 944000 }, { "epoch": 6.391430272845388, "grad_norm": 0.337545782327652, "learning_rate": 4.936085697271546e-05, "loss": 0.3775, "step": 944500 }, { "epoch": 6.394813772195755, "grad_norm": 0.3542190194129944, "learning_rate": 4.9360518622780424e-05, "loss": 0.3773, "step": 945000 }, { "epoch": 6.398197271546124, "grad_norm": 0.3288930058479309, "learning_rate": 4.936018027284539e-05, "loss": 0.3761, "step": 945500 }, { "epoch": 6.401580770896492, "grad_norm": 0.3794701397418976, "learning_rate": 4.9359841922910355e-05, "loss": 0.3774, "step": 946000 }, { "epoch": 6.40496427024686, "grad_norm": 0.3350881338119507, "learning_rate": 4.935950357297532e-05, "loss": 0.3784, "step": 946500 }, { "epoch": 6.408347769597229, "grad_norm": 0.3109876811504364, "learning_rate": 4.935916522304028e-05, "loss": 0.3765, "step": 947000 }, { "epoch": 6.411731268947596, "grad_norm": 0.362585186958313, "learning_rate": 4.935882687310524e-05, "loss": 0.3779, "step": 947500 }, { "epoch": 6.415114768297965, "grad_norm": 0.3090763986110687, "learning_rate": 4.9358488523170204e-05, "loss": 0.378, "step": 948000 }, { "epoch": 6.418498267648332, "grad_norm": 0.3567908704280853, "learning_rate": 4.9358150173235166e-05, "loss": 0.3767, "step": 948500 }, { "epoch": 6.421881766998701, "grad_norm": 0.3453218936920166, "learning_rate": 4.9357811823300135e-05, "loss": 0.3772, "step": 949000 }, { "epoch": 6.425265266349069, "grad_norm": 0.365347683429718, "learning_rate": 4.93574734733651e-05, "loss": 0.3777, "step": 949500 }, { "epoch": 6.428648765699437, "grad_norm": 0.3815634846687317, "learning_rate": 4.935713512343006e-05, "loss": 0.3764, "step": 950000 }, { "epoch": 6.432032265049805, "grad_norm": 0.3377106785774231, "learning_rate": 4.935679677349502e-05, "loss": 0.3776, "step": 950500 }, { "epoch": 6.435415764400173, "grad_norm": 0.3434532582759857, "learning_rate": 4.935645842355999e-05, "loss": 0.3775, "step": 951000 }, { "epoch": 6.438799263750541, "grad_norm": 0.37483832240104675, "learning_rate": 4.935612007362495e-05, "loss": 0.3769, "step": 951500 }, { "epoch": 6.442182763100909, "grad_norm": 0.34985026717185974, "learning_rate": 4.935578172368991e-05, "loss": 0.3781, "step": 952000 }, { "epoch": 6.445566262451278, "grad_norm": 0.34416043758392334, "learning_rate": 4.935544337375487e-05, "loss": 0.3774, "step": 952500 }, { "epoch": 6.448949761801646, "grad_norm": 0.37591052055358887, "learning_rate": 4.935510502381984e-05, "loss": 0.3757, "step": 953000 }, { "epoch": 6.452333261152014, "grad_norm": 0.35309016704559326, "learning_rate": 4.93547666738848e-05, "loss": 0.3781, "step": 953500 }, { "epoch": 6.455716760502382, "grad_norm": 0.3455849587917328, "learning_rate": 4.935442832394976e-05, "loss": 0.3775, "step": 954000 }, { "epoch": 6.45910025985275, "grad_norm": 0.30579593777656555, "learning_rate": 4.9354089974014725e-05, "loss": 0.378, "step": 954500 }, { "epoch": 6.462483759203118, "grad_norm": 0.34462663531303406, "learning_rate": 4.9353751624079694e-05, "loss": 0.3782, "step": 955000 }, { "epoch": 6.465867258553486, "grad_norm": 0.35439208149909973, "learning_rate": 4.9353413274144656e-05, "loss": 0.3787, "step": 955500 }, { "epoch": 6.469250757903854, "grad_norm": 0.3425807058811188, "learning_rate": 4.935307492420962e-05, "loss": 0.3787, "step": 956000 }, { "epoch": 6.472634257254223, "grad_norm": 0.33159133791923523, "learning_rate": 4.935273657427458e-05, "loss": 0.3778, "step": 956500 }, { "epoch": 6.476017756604591, "grad_norm": 0.3423265814781189, "learning_rate": 4.935239822433954e-05, "loss": 0.3787, "step": 957000 }, { "epoch": 6.479401255954959, "grad_norm": 0.3442203998565674, "learning_rate": 4.9352059874404504e-05, "loss": 0.3783, "step": 957500 }, { "epoch": 6.482784755305327, "grad_norm": 0.3310578167438507, "learning_rate": 4.9351721524469467e-05, "loss": 0.3781, "step": 958000 }, { "epoch": 6.486168254655695, "grad_norm": 0.37686657905578613, "learning_rate": 4.9351383174534435e-05, "loss": 0.3765, "step": 958500 }, { "epoch": 6.489551754006063, "grad_norm": 0.3256211578845978, "learning_rate": 4.93510448245994e-05, "loss": 0.3777, "step": 959000 }, { "epoch": 6.492935253356431, "grad_norm": 0.33287709951400757, "learning_rate": 4.935070647466436e-05, "loss": 0.3756, "step": 959500 }, { "epoch": 6.4963187527068, "grad_norm": 0.33640360832214355, "learning_rate": 4.935036812472932e-05, "loss": 0.3785, "step": 960000 }, { "epoch": 6.499702252057167, "grad_norm": 0.35884687304496765, "learning_rate": 4.935002977479429e-05, "loss": 0.3765, "step": 960500 }, { "epoch": 6.503085751407536, "grad_norm": 0.3432214856147766, "learning_rate": 4.934969142485925e-05, "loss": 0.3772, "step": 961000 }, { "epoch": 6.506469250757904, "grad_norm": 0.34991124272346497, "learning_rate": 4.934935307492421e-05, "loss": 0.3781, "step": 961500 }, { "epoch": 6.509852750108272, "grad_norm": 0.3342727720737457, "learning_rate": 4.934901472498917e-05, "loss": 0.3767, "step": 962000 }, { "epoch": 6.51323624945864, "grad_norm": 0.35070064663887024, "learning_rate": 4.934867637505414e-05, "loss": 0.3779, "step": 962500 }, { "epoch": 6.516619748809008, "grad_norm": 0.3440904915332794, "learning_rate": 4.93483380251191e-05, "loss": 0.3783, "step": 963000 }, { "epoch": 6.520003248159377, "grad_norm": 0.31607815623283386, "learning_rate": 4.9347999675184063e-05, "loss": 0.3758, "step": 963500 }, { "epoch": 6.523386747509744, "grad_norm": 0.3344024121761322, "learning_rate": 4.9347661325249026e-05, "loss": 0.3784, "step": 964000 }, { "epoch": 6.526770246860113, "grad_norm": 0.33444446325302124, "learning_rate": 4.9347322975313994e-05, "loss": 0.376, "step": 964500 }, { "epoch": 6.53015374621048, "grad_norm": 0.3433579206466675, "learning_rate": 4.9346984625378957e-05, "loss": 0.3776, "step": 965000 }, { "epoch": 6.533537245560849, "grad_norm": 0.3734302520751953, "learning_rate": 4.934664627544392e-05, "loss": 0.3775, "step": 965500 }, { "epoch": 6.536920744911217, "grad_norm": 0.3572101891040802, "learning_rate": 4.934630792550888e-05, "loss": 0.3776, "step": 966000 }, { "epoch": 6.540304244261585, "grad_norm": 0.3306787610054016, "learning_rate": 4.934596957557384e-05, "loss": 0.3778, "step": 966500 }, { "epoch": 6.543687743611954, "grad_norm": 0.3770751357078552, "learning_rate": 4.9345631225638805e-05, "loss": 0.3761, "step": 967000 }, { "epoch": 6.547071242962321, "grad_norm": 0.3891368508338928, "learning_rate": 4.934529287570377e-05, "loss": 0.3779, "step": 967500 }, { "epoch": 6.55045474231269, "grad_norm": 0.3260124921798706, "learning_rate": 4.9344954525768736e-05, "loss": 0.3777, "step": 968000 }, { "epoch": 6.553838241663057, "grad_norm": 0.327788770198822, "learning_rate": 4.93446161758337e-05, "loss": 0.3776, "step": 968500 }, { "epoch": 6.557221741013426, "grad_norm": 0.38705557584762573, "learning_rate": 4.934427782589866e-05, "loss": 0.3769, "step": 969000 }, { "epoch": 6.5606052403637936, "grad_norm": 0.3410389721393585, "learning_rate": 4.934393947596362e-05, "loss": 0.3783, "step": 969500 }, { "epoch": 6.563988739714162, "grad_norm": 0.3226074278354645, "learning_rate": 4.934360112602859e-05, "loss": 0.3792, "step": 970000 }, { "epoch": 6.56737223906453, "grad_norm": 0.3195018172264099, "learning_rate": 4.9343262776093553e-05, "loss": 0.3763, "step": 970500 }, { "epoch": 6.570755738414898, "grad_norm": 0.3643065392971039, "learning_rate": 4.934292442615851e-05, "loss": 0.3768, "step": 971000 }, { "epoch": 6.574139237765266, "grad_norm": 0.3433854281902313, "learning_rate": 4.934258607622347e-05, "loss": 0.3777, "step": 971500 }, { "epoch": 6.577522737115634, "grad_norm": 0.3245162069797516, "learning_rate": 4.934224772628844e-05, "loss": 0.3783, "step": 972000 }, { "epoch": 6.580906236466003, "grad_norm": 0.32305416464805603, "learning_rate": 4.93419093763534e-05, "loss": 0.377, "step": 972500 }, { "epoch": 6.584289735816371, "grad_norm": 0.3494907021522522, "learning_rate": 4.9341571026418364e-05, "loss": 0.3779, "step": 973000 }, { "epoch": 6.587673235166739, "grad_norm": 0.34951290488243103, "learning_rate": 4.9341232676483326e-05, "loss": 0.3786, "step": 973500 }, { "epoch": 6.591056734517107, "grad_norm": 0.33310452103614807, "learning_rate": 4.9340894326548295e-05, "loss": 0.3769, "step": 974000 }, { "epoch": 6.594440233867475, "grad_norm": 0.46549180150032043, "learning_rate": 4.934055597661326e-05, "loss": 0.3767, "step": 974500 }, { "epoch": 6.597823733217843, "grad_norm": 0.35990414023399353, "learning_rate": 4.934021762667822e-05, "loss": 0.3761, "step": 975000 }, { "epoch": 6.601207232568211, "grad_norm": 0.3343123495578766, "learning_rate": 4.933987927674318e-05, "loss": 0.3782, "step": 975500 }, { "epoch": 6.604590731918579, "grad_norm": 0.32567545771598816, "learning_rate": 4.9339540926808144e-05, "loss": 0.3765, "step": 976000 }, { "epoch": 6.607974231268948, "grad_norm": 0.3510096073150635, "learning_rate": 4.9339202576873106e-05, "loss": 0.3768, "step": 976500 }, { "epoch": 6.611357730619316, "grad_norm": 0.32567307353019714, "learning_rate": 4.933886422693807e-05, "loss": 0.378, "step": 977000 }, { "epoch": 6.614741229969684, "grad_norm": 0.33810216188430786, "learning_rate": 4.933852587700304e-05, "loss": 0.3776, "step": 977500 }, { "epoch": 6.618124729320052, "grad_norm": 0.354951947927475, "learning_rate": 4.9338187527068e-05, "loss": 0.378, "step": 978000 }, { "epoch": 6.62150822867042, "grad_norm": 0.32536765933036804, "learning_rate": 4.933784917713296e-05, "loss": 0.3773, "step": 978500 }, { "epoch": 6.624891728020788, "grad_norm": 0.3651494085788727, "learning_rate": 4.933751082719792e-05, "loss": 0.3779, "step": 979000 }, { "epoch": 6.628275227371156, "grad_norm": 0.3602128326892853, "learning_rate": 4.9337172477262885e-05, "loss": 0.3778, "step": 979500 }, { "epoch": 6.631658726721525, "grad_norm": 0.3357810080051422, "learning_rate": 4.9336834127327854e-05, "loss": 0.3767, "step": 980000 }, { "epoch": 6.635042226071892, "grad_norm": 0.31764236092567444, "learning_rate": 4.933649577739281e-05, "loss": 0.3786, "step": 980500 }, { "epoch": 6.638425725422261, "grad_norm": 0.38006776571273804, "learning_rate": 4.933615742745777e-05, "loss": 0.378, "step": 981000 }, { "epoch": 6.641809224772629, "grad_norm": 0.28093838691711426, "learning_rate": 4.933581907752274e-05, "loss": 0.3785, "step": 981500 }, { "epoch": 6.645192724122997, "grad_norm": 0.32980915904045105, "learning_rate": 4.93354807275877e-05, "loss": 0.3763, "step": 982000 }, { "epoch": 6.648576223473365, "grad_norm": 0.34391888976097107, "learning_rate": 4.9335142377652665e-05, "loss": 0.3783, "step": 982500 }, { "epoch": 6.651959722823733, "grad_norm": 0.361083447933197, "learning_rate": 4.933480402771763e-05, "loss": 0.3766, "step": 983000 }, { "epoch": 6.655343222174102, "grad_norm": 0.36109864711761475, "learning_rate": 4.9334465677782596e-05, "loss": 0.3774, "step": 983500 }, { "epoch": 6.658726721524469, "grad_norm": 0.3269791901111603, "learning_rate": 4.933412732784756e-05, "loss": 0.378, "step": 984000 }, { "epoch": 6.662110220874838, "grad_norm": 0.3839172124862671, "learning_rate": 4.933378897791252e-05, "loss": 0.3784, "step": 984500 }, { "epoch": 6.665493720225205, "grad_norm": 0.33352527022361755, "learning_rate": 4.933345062797748e-05, "loss": 0.3783, "step": 985000 }, { "epoch": 6.668877219575574, "grad_norm": 0.36033135652542114, "learning_rate": 4.9333112278042444e-05, "loss": 0.3769, "step": 985500 }, { "epoch": 6.672260718925942, "grad_norm": 0.3391295075416565, "learning_rate": 4.9332773928107406e-05, "loss": 0.3772, "step": 986000 }, { "epoch": 6.67564421827631, "grad_norm": 0.3773353099822998, "learning_rate": 4.933243557817237e-05, "loss": 0.3771, "step": 986500 }, { "epoch": 6.679027717626679, "grad_norm": 0.3952521085739136, "learning_rate": 4.933209722823733e-05, "loss": 0.3763, "step": 987000 }, { "epoch": 6.682411216977046, "grad_norm": 0.37172383069992065, "learning_rate": 4.93317588783023e-05, "loss": 0.3776, "step": 987500 }, { "epoch": 6.685794716327415, "grad_norm": 0.3839188516139984, "learning_rate": 4.933142052836726e-05, "loss": 0.3776, "step": 988000 }, { "epoch": 6.689178215677782, "grad_norm": 0.3491535186767578, "learning_rate": 4.9331082178432224e-05, "loss": 0.378, "step": 988500 }, { "epoch": 6.692561715028151, "grad_norm": 0.32665354013442993, "learning_rate": 4.9330743828497186e-05, "loss": 0.3782, "step": 989000 }, { "epoch": 6.6959452143785185, "grad_norm": 0.3311278223991394, "learning_rate": 4.9330405478562155e-05, "loss": 0.378, "step": 989500 }, { "epoch": 6.699328713728887, "grad_norm": 0.31300148367881775, "learning_rate": 4.933006712862711e-05, "loss": 0.3771, "step": 990000 }, { "epoch": 6.702712213079256, "grad_norm": 0.359619140625, "learning_rate": 4.932972877869207e-05, "loss": 0.3774, "step": 990500 }, { "epoch": 6.706095712429623, "grad_norm": 0.3795474171638489, "learning_rate": 4.932939042875704e-05, "loss": 0.3765, "step": 991000 }, { "epoch": 6.709479211779991, "grad_norm": 0.3238963484764099, "learning_rate": 4.9329052078822e-05, "loss": 0.3775, "step": 991500 }, { "epoch": 6.712862711130359, "grad_norm": 0.35236936807632446, "learning_rate": 4.9328713728886965e-05, "loss": 0.3779, "step": 992000 }, { "epoch": 6.716246210480728, "grad_norm": 0.3102133274078369, "learning_rate": 4.932837537895193e-05, "loss": 0.378, "step": 992500 }, { "epoch": 6.7196297098310955, "grad_norm": 0.33269041776657104, "learning_rate": 4.9328037029016896e-05, "loss": 0.3793, "step": 993000 }, { "epoch": 6.723013209181464, "grad_norm": 0.39434024691581726, "learning_rate": 4.932769867908186e-05, "loss": 0.3766, "step": 993500 }, { "epoch": 6.726396708531832, "grad_norm": 0.3668605387210846, "learning_rate": 4.932736032914682e-05, "loss": 0.3766, "step": 994000 }, { "epoch": 6.7297802078822, "grad_norm": 0.418206125497818, "learning_rate": 4.932702197921178e-05, "loss": 0.3766, "step": 994500 }, { "epoch": 6.733163707232568, "grad_norm": 0.37223225831985474, "learning_rate": 4.9326683629276745e-05, "loss": 0.3766, "step": 995000 }, { "epoch": 6.736547206582936, "grad_norm": 0.36875730752944946, "learning_rate": 4.932634527934171e-05, "loss": 0.3777, "step": 995500 }, { "epoch": 6.739930705933304, "grad_norm": 0.34293901920318604, "learning_rate": 4.932600692940667e-05, "loss": 0.3781, "step": 996000 }, { "epoch": 6.7433142052836725, "grad_norm": 0.3703588545322418, "learning_rate": 4.932566857947163e-05, "loss": 0.3764, "step": 996500 }, { "epoch": 6.746697704634041, "grad_norm": 0.33531859517097473, "learning_rate": 4.93253302295366e-05, "loss": 0.3776, "step": 997000 }, { "epoch": 6.750081203984409, "grad_norm": 0.33256930112838745, "learning_rate": 4.932499187960156e-05, "loss": 0.3776, "step": 997500 }, { "epoch": 6.753464703334777, "grad_norm": 0.34108084440231323, "learning_rate": 4.9324653529666524e-05, "loss": 0.3786, "step": 998000 }, { "epoch": 6.756848202685145, "grad_norm": 0.32973945140838623, "learning_rate": 4.9324315179731486e-05, "loss": 0.3764, "step": 998500 }, { "epoch": 6.760231702035513, "grad_norm": 0.3286641240119934, "learning_rate": 4.9323976829796455e-05, "loss": 0.3773, "step": 999000 }, { "epoch": 6.763615201385881, "grad_norm": 0.3770553767681122, "learning_rate": 4.932363847986141e-05, "loss": 0.376, "step": 999500 }, { "epoch": 6.7669987007362495, "grad_norm": 0.3492177426815033, "learning_rate": 4.932330012992637e-05, "loss": 0.3762, "step": 1000000 }, { "epoch": 6.770382200086617, "grad_norm": 0.3836628496646881, "learning_rate": 4.932296177999134e-05, "loss": 0.3758, "step": 1000500 }, { "epoch": 6.773765699436986, "grad_norm": 0.32772648334503174, "learning_rate": 4.9322623430056304e-05, "loss": 0.379, "step": 1001000 }, { "epoch": 6.777149198787354, "grad_norm": 0.31784480810165405, "learning_rate": 4.9322285080121266e-05, "loss": 0.3769, "step": 1001500 }, { "epoch": 6.780532698137722, "grad_norm": 0.3466293215751648, "learning_rate": 4.932194673018623e-05, "loss": 0.3779, "step": 1002000 }, { "epoch": 6.78391619748809, "grad_norm": 0.3728845715522766, "learning_rate": 4.93216083802512e-05, "loss": 0.3774, "step": 1002500 }, { "epoch": 6.787299696838458, "grad_norm": 0.31999626755714417, "learning_rate": 4.932127003031616e-05, "loss": 0.3764, "step": 1003000 }, { "epoch": 6.7906831961888265, "grad_norm": 0.3349250555038452, "learning_rate": 4.932093168038112e-05, "loss": 0.3764, "step": 1003500 }, { "epoch": 6.794066695539194, "grad_norm": 0.3492509424686432, "learning_rate": 4.932059333044608e-05, "loss": 0.3763, "step": 1004000 }, { "epoch": 6.797450194889563, "grad_norm": 0.31739434599876404, "learning_rate": 4.9320254980511045e-05, "loss": 0.3767, "step": 1004500 }, { "epoch": 6.80083369423993, "grad_norm": 0.3199305534362793, "learning_rate": 4.931991663057601e-05, "loss": 0.3768, "step": 1005000 }, { "epoch": 6.804217193590299, "grad_norm": 0.28688299655914307, "learning_rate": 4.931957828064097e-05, "loss": 0.3771, "step": 1005500 }, { "epoch": 6.807600692940667, "grad_norm": 0.30341842770576477, "learning_rate": 4.931923993070593e-05, "loss": 0.3771, "step": 1006000 }, { "epoch": 6.810984192291035, "grad_norm": 0.34070146083831787, "learning_rate": 4.93189015807709e-05, "loss": 0.3788, "step": 1006500 }, { "epoch": 6.8143676916414035, "grad_norm": 0.3197017014026642, "learning_rate": 4.931856323083586e-05, "loss": 0.378, "step": 1007000 }, { "epoch": 6.817751190991771, "grad_norm": 0.33668169379234314, "learning_rate": 4.9318224880900825e-05, "loss": 0.3768, "step": 1007500 }, { "epoch": 6.82113469034214, "grad_norm": 0.3488815724849701, "learning_rate": 4.931788653096579e-05, "loss": 0.3758, "step": 1008000 }, { "epoch": 6.824518189692507, "grad_norm": 0.35777804255485535, "learning_rate": 4.9317548181030756e-05, "loss": 0.3777, "step": 1008500 }, { "epoch": 6.827901689042876, "grad_norm": 0.31806090474128723, "learning_rate": 4.931720983109571e-05, "loss": 0.3776, "step": 1009000 }, { "epoch": 6.8312851883932435, "grad_norm": 0.339591920375824, "learning_rate": 4.9316871481160673e-05, "loss": 0.3768, "step": 1009500 }, { "epoch": 6.834668687743612, "grad_norm": 0.3289056122303009, "learning_rate": 4.931653313122564e-05, "loss": 0.3771, "step": 1010000 }, { "epoch": 6.8380521870939805, "grad_norm": 0.3581726551055908, "learning_rate": 4.9316194781290604e-05, "loss": 0.3761, "step": 1010500 }, { "epoch": 6.841435686444348, "grad_norm": 0.35697153210639954, "learning_rate": 4.9315856431355567e-05, "loss": 0.3774, "step": 1011000 }, { "epoch": 6.844819185794717, "grad_norm": 0.34281229972839355, "learning_rate": 4.931551808142053e-05, "loss": 0.3766, "step": 1011500 }, { "epoch": 6.848202685145084, "grad_norm": 0.327567994594574, "learning_rate": 4.93151797314855e-05, "loss": 0.3778, "step": 1012000 }, { "epoch": 6.851586184495453, "grad_norm": 0.3587280809879303, "learning_rate": 4.931484138155046e-05, "loss": 0.3795, "step": 1012500 }, { "epoch": 6.8549696838458205, "grad_norm": 0.3584112226963043, "learning_rate": 4.931450303161542e-05, "loss": 0.3769, "step": 1013000 }, { "epoch": 6.858353183196189, "grad_norm": 0.33898597955703735, "learning_rate": 4.9314164681680384e-05, "loss": 0.3759, "step": 1013500 }, { "epoch": 6.861736682546557, "grad_norm": 0.3355078101158142, "learning_rate": 4.9313826331745346e-05, "loss": 0.3786, "step": 1014000 }, { "epoch": 6.865120181896925, "grad_norm": 0.3411511182785034, "learning_rate": 4.931348798181031e-05, "loss": 0.3765, "step": 1014500 }, { "epoch": 6.868503681247293, "grad_norm": 0.3562315106391907, "learning_rate": 4.931314963187527e-05, "loss": 0.3778, "step": 1015000 }, { "epoch": 6.871887180597661, "grad_norm": 0.35385093092918396, "learning_rate": 4.931281128194023e-05, "loss": 0.3774, "step": 1015500 }, { "epoch": 6.875270679948029, "grad_norm": 0.3415220081806183, "learning_rate": 4.93124729320052e-05, "loss": 0.3754, "step": 1016000 }, { "epoch": 6.8786541792983975, "grad_norm": 0.3337489664554596, "learning_rate": 4.9312134582070163e-05, "loss": 0.3768, "step": 1016500 }, { "epoch": 6.882037678648766, "grad_norm": 0.33073464035987854, "learning_rate": 4.9311796232135126e-05, "loss": 0.3766, "step": 1017000 }, { "epoch": 6.885421177999134, "grad_norm": 0.37152549624443054, "learning_rate": 4.931145788220009e-05, "loss": 0.3763, "step": 1017500 }, { "epoch": 6.888804677349502, "grad_norm": 0.34675320982933044, "learning_rate": 4.931111953226506e-05, "loss": 0.3782, "step": 1018000 }, { "epoch": 6.89218817669987, "grad_norm": 0.31108322739601135, "learning_rate": 4.931078118233002e-05, "loss": 0.3774, "step": 1018500 }, { "epoch": 6.895571676050238, "grad_norm": 0.327083557844162, "learning_rate": 4.9310442832394974e-05, "loss": 0.3767, "step": 1019000 }, { "epoch": 6.898955175400606, "grad_norm": 0.34527599811553955, "learning_rate": 4.931010448245994e-05, "loss": 0.3765, "step": 1019500 }, { "epoch": 6.9023386747509745, "grad_norm": 0.33913472294807434, "learning_rate": 4.9309766132524905e-05, "loss": 0.3753, "step": 1020000 }, { "epoch": 6.905722174101342, "grad_norm": 0.3427327275276184, "learning_rate": 4.930942778258987e-05, "loss": 0.3781, "step": 1020500 }, { "epoch": 6.909105673451711, "grad_norm": 0.35090121626853943, "learning_rate": 4.930908943265483e-05, "loss": 0.3766, "step": 1021000 }, { "epoch": 6.912489172802079, "grad_norm": 0.3135407269001007, "learning_rate": 4.93087510827198e-05, "loss": 0.3779, "step": 1021500 }, { "epoch": 6.915872672152447, "grad_norm": 0.34838712215423584, "learning_rate": 4.930841273278476e-05, "loss": 0.3772, "step": 1022000 }, { "epoch": 6.919256171502815, "grad_norm": 0.33714696764945984, "learning_rate": 4.930807438284972e-05, "loss": 0.3778, "step": 1022500 }, { "epoch": 6.922639670853183, "grad_norm": 0.3507121801376343, "learning_rate": 4.9307736032914685e-05, "loss": 0.3775, "step": 1023000 }, { "epoch": 6.9260231702035515, "grad_norm": 0.3211919069290161, "learning_rate": 4.930739768297965e-05, "loss": 0.3765, "step": 1023500 }, { "epoch": 6.929406669553919, "grad_norm": 0.32550227642059326, "learning_rate": 4.930705933304461e-05, "loss": 0.3785, "step": 1024000 }, { "epoch": 6.932790168904288, "grad_norm": 0.3237001597881317, "learning_rate": 4.930672098310957e-05, "loss": 0.3765, "step": 1024500 }, { "epoch": 6.936173668254655, "grad_norm": 0.3335512578487396, "learning_rate": 4.930638263317453e-05, "loss": 0.3767, "step": 1025000 }, { "epoch": 6.939557167605024, "grad_norm": 0.3482947051525116, "learning_rate": 4.93060442832395e-05, "loss": 0.3776, "step": 1025500 }, { "epoch": 6.942940666955392, "grad_norm": 0.30258816480636597, "learning_rate": 4.9305705933304464e-05, "loss": 0.3773, "step": 1026000 }, { "epoch": 6.94632416630576, "grad_norm": 0.3438469469547272, "learning_rate": 4.9305367583369426e-05, "loss": 0.3775, "step": 1026500 }, { "epoch": 6.9497076656561285, "grad_norm": 0.358698308467865, "learning_rate": 4.930502923343439e-05, "loss": 0.3788, "step": 1027000 }, { "epoch": 6.953091165006496, "grad_norm": 0.3043401837348938, "learning_rate": 4.930469088349936e-05, "loss": 0.3765, "step": 1027500 }, { "epoch": 6.956474664356865, "grad_norm": 0.34928059577941895, "learning_rate": 4.930435253356432e-05, "loss": 0.3766, "step": 1028000 }, { "epoch": 6.959858163707232, "grad_norm": 0.38060516119003296, "learning_rate": 4.9304014183629275e-05, "loss": 0.3764, "step": 1028500 }, { "epoch": 6.963241663057601, "grad_norm": 0.3338804841041565, "learning_rate": 4.9303675833694244e-05, "loss": 0.3755, "step": 1029000 }, { "epoch": 6.9666251624079685, "grad_norm": 0.3243337571620941, "learning_rate": 4.9303337483759206e-05, "loss": 0.3774, "step": 1029500 }, { "epoch": 6.970008661758337, "grad_norm": 0.340320348739624, "learning_rate": 4.930299913382417e-05, "loss": 0.377, "step": 1030000 }, { "epoch": 6.9733921611087055, "grad_norm": 0.36838603019714355, "learning_rate": 4.930266078388913e-05, "loss": 0.3792, "step": 1030500 }, { "epoch": 6.976775660459073, "grad_norm": 0.3550949692726135, "learning_rate": 4.93023224339541e-05, "loss": 0.3776, "step": 1031000 }, { "epoch": 6.980159159809442, "grad_norm": 0.337386816740036, "learning_rate": 4.930198408401906e-05, "loss": 0.3769, "step": 1031500 }, { "epoch": 6.983542659159809, "grad_norm": 0.3563162386417389, "learning_rate": 4.930164573408402e-05, "loss": 0.3757, "step": 1032000 }, { "epoch": 6.986926158510178, "grad_norm": 0.37489381432533264, "learning_rate": 4.9301307384148985e-05, "loss": 0.3782, "step": 1032500 }, { "epoch": 6.9903096578605455, "grad_norm": 0.36588630080223083, "learning_rate": 4.930096903421395e-05, "loss": 0.3761, "step": 1033000 }, { "epoch": 6.993693157210914, "grad_norm": 0.3377918601036072, "learning_rate": 4.930063068427891e-05, "loss": 0.3779, "step": 1033500 }, { "epoch": 6.997076656561282, "grad_norm": 0.33746615052223206, "learning_rate": 4.930029233434387e-05, "loss": 0.3765, "step": 1034000 }, { "epoch": 7.0, "eval_accuracy": 0.8563648176791943, "eval_loss": 0.5842289924621582, "eval_runtime": 3394.0562, "eval_samples_per_second": 85.663, "eval_steps_per_second": 5.354, "step": 1034432 }, { "epoch": 7.00046015591165, "grad_norm": 0.371980220079422, "learning_rate": 4.9299953984408834e-05, "loss": 0.3758, "step": 1034500 }, { "epoch": 7.003843655262018, "grad_norm": 0.38439419865608215, "learning_rate": 4.92996156344738e-05, "loss": 0.376, "step": 1035000 }, { "epoch": 7.007227154612386, "grad_norm": 0.38531625270843506, "learning_rate": 4.9299277284538765e-05, "loss": 0.3747, "step": 1035500 }, { "epoch": 7.010610653962755, "grad_norm": 0.31885409355163574, "learning_rate": 4.929893893460373e-05, "loss": 0.3743, "step": 1036000 }, { "epoch": 7.0139941533131225, "grad_norm": 0.3155359625816345, "learning_rate": 4.929860058466869e-05, "loss": 0.3756, "step": 1036500 }, { "epoch": 7.017377652663491, "grad_norm": 0.31193721294403076, "learning_rate": 4.929826223473366e-05, "loss": 0.3759, "step": 1037000 }, { "epoch": 7.020761152013859, "grad_norm": 0.33127009868621826, "learning_rate": 4.929792388479862e-05, "loss": 0.3756, "step": 1037500 }, { "epoch": 7.024144651364227, "grad_norm": 0.36094731092453003, "learning_rate": 4.9297585534863575e-05, "loss": 0.3757, "step": 1038000 }, { "epoch": 7.027528150714595, "grad_norm": 0.3782150447368622, "learning_rate": 4.9297247184928544e-05, "loss": 0.3761, "step": 1038500 }, { "epoch": 7.030911650064963, "grad_norm": 0.3468509614467621, "learning_rate": 4.9296908834993506e-05, "loss": 0.3753, "step": 1039000 }, { "epoch": 7.034295149415331, "grad_norm": 0.33092188835144043, "learning_rate": 4.929657048505847e-05, "loss": 0.3744, "step": 1039500 }, { "epoch": 7.0376786487656995, "grad_norm": 0.3647925555706024, "learning_rate": 4.929623213512343e-05, "loss": 0.3746, "step": 1040000 }, { "epoch": 7.041062148116067, "grad_norm": 0.3791462182998657, "learning_rate": 4.92958937851884e-05, "loss": 0.3753, "step": 1040500 }, { "epoch": 7.044445647466436, "grad_norm": 0.3329277038574219, "learning_rate": 4.929555543525336e-05, "loss": 0.374, "step": 1041000 }, { "epoch": 7.047829146816804, "grad_norm": 0.3497406840324402, "learning_rate": 4.9295217085318324e-05, "loss": 0.3763, "step": 1041500 }, { "epoch": 7.051212646167172, "grad_norm": 0.33520519733428955, "learning_rate": 4.9294878735383286e-05, "loss": 0.3757, "step": 1042000 }, { "epoch": 7.05459614551754, "grad_norm": 0.34984537959098816, "learning_rate": 4.929454038544825e-05, "loss": 0.3753, "step": 1042500 }, { "epoch": 7.057979644867908, "grad_norm": 0.370272696018219, "learning_rate": 4.929420203551321e-05, "loss": 0.3751, "step": 1043000 }, { "epoch": 7.0613631442182765, "grad_norm": 0.3326875567436218, "learning_rate": 4.929386368557817e-05, "loss": 0.3748, "step": 1043500 }, { "epoch": 7.064746643568644, "grad_norm": 0.3748356103897095, "learning_rate": 4.9293525335643134e-05, "loss": 0.3767, "step": 1044000 }, { "epoch": 7.068130142919013, "grad_norm": 0.6138136386871338, "learning_rate": 4.92931869857081e-05, "loss": 0.3744, "step": 1044500 }, { "epoch": 7.07151364226938, "grad_norm": 0.3744707405567169, "learning_rate": 4.9292848635773065e-05, "loss": 0.3752, "step": 1045000 }, { "epoch": 7.074897141619749, "grad_norm": 0.3307333290576935, "learning_rate": 4.929251028583803e-05, "loss": 0.3766, "step": 1045500 }, { "epoch": 7.078280640970117, "grad_norm": 0.3341595530509949, "learning_rate": 4.929217193590299e-05, "loss": 0.3737, "step": 1046000 }, { "epoch": 7.081664140320485, "grad_norm": 0.3514774739742279, "learning_rate": 4.929183358596796e-05, "loss": 0.3746, "step": 1046500 }, { "epoch": 7.0850476396708535, "grad_norm": 0.3368781805038452, "learning_rate": 4.929149523603292e-05, "loss": 0.3743, "step": 1047000 }, { "epoch": 7.088431139021221, "grad_norm": 0.3600742816925049, "learning_rate": 4.9291156886097876e-05, "loss": 0.3751, "step": 1047500 }, { "epoch": 7.09181463837159, "grad_norm": 0.3651391267776489, "learning_rate": 4.9290818536162845e-05, "loss": 0.3766, "step": 1048000 }, { "epoch": 7.095198137721957, "grad_norm": 0.4543780982494354, "learning_rate": 4.929048018622781e-05, "loss": 0.3732, "step": 1048500 }, { "epoch": 7.098581637072326, "grad_norm": 0.32120242714881897, "learning_rate": 4.929014183629277e-05, "loss": 0.3762, "step": 1049000 }, { "epoch": 7.1019651364226934, "grad_norm": 0.32367223501205444, "learning_rate": 4.928980348635773e-05, "loss": 0.3767, "step": 1049500 }, { "epoch": 7.105348635773062, "grad_norm": 0.3515234589576721, "learning_rate": 4.928946513642269e-05, "loss": 0.373, "step": 1050000 }, { "epoch": 7.1087321351234305, "grad_norm": 0.32753437757492065, "learning_rate": 4.928912678648766e-05, "loss": 0.3752, "step": 1050500 }, { "epoch": 7.112115634473798, "grad_norm": 0.3608008027076721, "learning_rate": 4.9288788436552624e-05, "loss": 0.375, "step": 1051000 }, { "epoch": 7.115499133824167, "grad_norm": 0.36846715211868286, "learning_rate": 4.9288450086617587e-05, "loss": 0.3755, "step": 1051500 }, { "epoch": 7.118882633174534, "grad_norm": 0.36164039373397827, "learning_rate": 4.928811173668255e-05, "loss": 0.3766, "step": 1052000 }, { "epoch": 7.122266132524903, "grad_norm": 0.34308990836143494, "learning_rate": 4.928777338674751e-05, "loss": 0.3743, "step": 1052500 }, { "epoch": 7.1256496318752705, "grad_norm": 0.34098562598228455, "learning_rate": 4.928743503681247e-05, "loss": 0.3773, "step": 1053000 }, { "epoch": 7.129033131225639, "grad_norm": 0.35562875866889954, "learning_rate": 4.9287096686877435e-05, "loss": 0.3748, "step": 1053500 }, { "epoch": 7.132416630576007, "grad_norm": 0.38288894295692444, "learning_rate": 4.9286758336942404e-05, "loss": 0.3744, "step": 1054000 }, { "epoch": 7.135800129926375, "grad_norm": 0.33995917439460754, "learning_rate": 4.9286419987007366e-05, "loss": 0.3753, "step": 1054500 }, { "epoch": 7.139183629276743, "grad_norm": 0.3554493486881256, "learning_rate": 4.928608163707233e-05, "loss": 0.3744, "step": 1055000 }, { "epoch": 7.142567128627111, "grad_norm": 0.3418475389480591, "learning_rate": 4.928574328713729e-05, "loss": 0.3766, "step": 1055500 }, { "epoch": 7.14595062797748, "grad_norm": 0.3773251175880432, "learning_rate": 4.928540493720226e-05, "loss": 0.3765, "step": 1056000 }, { "epoch": 7.1493341273278475, "grad_norm": 0.3789617717266083, "learning_rate": 4.928506658726722e-05, "loss": 0.3774, "step": 1056500 }, { "epoch": 7.152717626678216, "grad_norm": 0.348550945520401, "learning_rate": 4.928472823733218e-05, "loss": 0.3749, "step": 1057000 }, { "epoch": 7.156101126028584, "grad_norm": 0.3775283396244049, "learning_rate": 4.9284389887397146e-05, "loss": 0.3755, "step": 1057500 }, { "epoch": 7.159484625378952, "grad_norm": 0.33230406045913696, "learning_rate": 4.928405153746211e-05, "loss": 0.3752, "step": 1058000 }, { "epoch": 7.16286812472932, "grad_norm": 0.36388495564460754, "learning_rate": 4.928371318752707e-05, "loss": 0.3769, "step": 1058500 }, { "epoch": 7.166251624079688, "grad_norm": 0.3778153359889984, "learning_rate": 4.928337483759203e-05, "loss": 0.3743, "step": 1059000 }, { "epoch": 7.169635123430056, "grad_norm": 0.36843207478523254, "learning_rate": 4.9283036487656994e-05, "loss": 0.3751, "step": 1059500 }, { "epoch": 7.1730186227804245, "grad_norm": 0.3462293744087219, "learning_rate": 4.928269813772196e-05, "loss": 0.3749, "step": 1060000 }, { "epoch": 7.176402122130792, "grad_norm": 0.3491370975971222, "learning_rate": 4.9282359787786925e-05, "loss": 0.3753, "step": 1060500 }, { "epoch": 7.179785621481161, "grad_norm": 0.35125911235809326, "learning_rate": 4.928202143785189e-05, "loss": 0.3755, "step": 1061000 }, { "epoch": 7.183169120831529, "grad_norm": 0.3585151433944702, "learning_rate": 4.928168308791685e-05, "loss": 0.3756, "step": 1061500 }, { "epoch": 7.186552620181897, "grad_norm": 0.3339557647705078, "learning_rate": 4.928134473798181e-05, "loss": 0.3772, "step": 1062000 }, { "epoch": 7.189936119532265, "grad_norm": 0.3724111020565033, "learning_rate": 4.9281006388046774e-05, "loss": 0.3764, "step": 1062500 }, { "epoch": 7.193319618882633, "grad_norm": 0.3494477868080139, "learning_rate": 4.9280668038111736e-05, "loss": 0.3753, "step": 1063000 }, { "epoch": 7.1967031182330015, "grad_norm": 0.32152417302131653, "learning_rate": 4.9280329688176705e-05, "loss": 0.3771, "step": 1063500 }, { "epoch": 7.200086617583369, "grad_norm": 0.3638397455215454, "learning_rate": 4.927999133824167e-05, "loss": 0.3761, "step": 1064000 }, { "epoch": 7.203470116933738, "grad_norm": 0.3315892219543457, "learning_rate": 4.927965298830663e-05, "loss": 0.3767, "step": 1064500 }, { "epoch": 7.206853616284105, "grad_norm": 0.3420919179916382, "learning_rate": 4.927931463837159e-05, "loss": 0.3767, "step": 1065000 }, { "epoch": 7.210237115634474, "grad_norm": 0.35963183641433716, "learning_rate": 4.927897628843656e-05, "loss": 0.3754, "step": 1065500 }, { "epoch": 7.213620614984842, "grad_norm": 0.3440105617046356, "learning_rate": 4.927863793850152e-05, "loss": 0.3753, "step": 1066000 }, { "epoch": 7.21700411433521, "grad_norm": 0.3412320017814636, "learning_rate": 4.927829958856648e-05, "loss": 0.3747, "step": 1066500 }, { "epoch": 7.2203876136855785, "grad_norm": 0.37493258714675903, "learning_rate": 4.927796123863144e-05, "loss": 0.3765, "step": 1067000 }, { "epoch": 7.223771113035946, "grad_norm": 0.3456762433052063, "learning_rate": 4.927762288869641e-05, "loss": 0.3749, "step": 1067500 }, { "epoch": 7.227154612386315, "grad_norm": 0.3558253049850464, "learning_rate": 4.927728453876137e-05, "loss": 0.375, "step": 1068000 }, { "epoch": 7.230538111736682, "grad_norm": 0.3644653856754303, "learning_rate": 4.927694618882633e-05, "loss": 0.3751, "step": 1068500 }, { "epoch": 7.233921611087051, "grad_norm": 0.4022010862827301, "learning_rate": 4.9276607838891295e-05, "loss": 0.3752, "step": 1069000 }, { "epoch": 7.237305110437418, "grad_norm": 0.3526616096496582, "learning_rate": 4.9276269488956264e-05, "loss": 0.3765, "step": 1069500 }, { "epoch": 7.240688609787787, "grad_norm": 0.35260966420173645, "learning_rate": 4.9275931139021226e-05, "loss": 0.3764, "step": 1070000 }, { "epoch": 7.2440721091381555, "grad_norm": 0.36006537079811096, "learning_rate": 4.927559278908619e-05, "loss": 0.3745, "step": 1070500 }, { "epoch": 7.247455608488523, "grad_norm": 0.34686923027038574, "learning_rate": 4.927525443915115e-05, "loss": 0.3748, "step": 1071000 }, { "epoch": 7.250839107838892, "grad_norm": 0.3436018228530884, "learning_rate": 4.927491608921611e-05, "loss": 0.3756, "step": 1071500 }, { "epoch": 7.254222607189259, "grad_norm": 0.3261730968952179, "learning_rate": 4.9274577739281074e-05, "loss": 0.3764, "step": 1072000 }, { "epoch": 7.257606106539628, "grad_norm": 0.3459329903125763, "learning_rate": 4.9274239389346036e-05, "loss": 0.3761, "step": 1072500 }, { "epoch": 7.260989605889995, "grad_norm": 0.3929137587547302, "learning_rate": 4.9273901039411005e-05, "loss": 0.3753, "step": 1073000 }, { "epoch": 7.264373105240364, "grad_norm": 0.3764931559562683, "learning_rate": 4.927356268947597e-05, "loss": 0.3772, "step": 1073500 }, { "epoch": 7.267756604590732, "grad_norm": 0.39603105187416077, "learning_rate": 4.927322433954093e-05, "loss": 0.3751, "step": 1074000 }, { "epoch": 7.2711401039411, "grad_norm": 0.3758893311023712, "learning_rate": 4.927288598960589e-05, "loss": 0.3766, "step": 1074500 }, { "epoch": 7.274523603291469, "grad_norm": 0.39224499464035034, "learning_rate": 4.927254763967086e-05, "loss": 0.3757, "step": 1075000 }, { "epoch": 7.277907102641836, "grad_norm": 0.3400898575782776, "learning_rate": 4.927220928973582e-05, "loss": 0.3757, "step": 1075500 }, { "epoch": 7.281290601992205, "grad_norm": 0.32646670937538147, "learning_rate": 4.927187093980078e-05, "loss": 0.374, "step": 1076000 }, { "epoch": 7.284674101342572, "grad_norm": 0.344763845205307, "learning_rate": 4.927153258986574e-05, "loss": 0.3767, "step": 1076500 }, { "epoch": 7.288057600692941, "grad_norm": 0.34435445070266724, "learning_rate": 4.927119423993071e-05, "loss": 0.3748, "step": 1077000 }, { "epoch": 7.291441100043309, "grad_norm": 0.31645989418029785, "learning_rate": 4.927085588999567e-05, "loss": 0.376, "step": 1077500 }, { "epoch": 7.294824599393677, "grad_norm": 0.334351509809494, "learning_rate": 4.927051754006063e-05, "loss": 0.3768, "step": 1078000 }, { "epoch": 7.298208098744045, "grad_norm": 0.31009796261787415, "learning_rate": 4.9270179190125595e-05, "loss": 0.3769, "step": 1078500 }, { "epoch": 7.301591598094413, "grad_norm": 0.3454814553260803, "learning_rate": 4.9269840840190564e-05, "loss": 0.3752, "step": 1079000 }, { "epoch": 7.304975097444781, "grad_norm": 0.34109362959861755, "learning_rate": 4.9269502490255526e-05, "loss": 0.3771, "step": 1079500 }, { "epoch": 7.308358596795149, "grad_norm": 0.38113102316856384, "learning_rate": 4.926916414032049e-05, "loss": 0.3748, "step": 1080000 }, { "epoch": 7.311742096145518, "grad_norm": 0.3147388696670532, "learning_rate": 4.926882579038545e-05, "loss": 0.3769, "step": 1080500 }, { "epoch": 7.315125595495886, "grad_norm": 0.32656314969062805, "learning_rate": 4.926848744045041e-05, "loss": 0.3775, "step": 1081000 }, { "epoch": 7.318509094846254, "grad_norm": 0.34077635407447815, "learning_rate": 4.9268149090515375e-05, "loss": 0.3771, "step": 1081500 }, { "epoch": 7.321892594196622, "grad_norm": 0.37306562066078186, "learning_rate": 4.926781074058034e-05, "loss": 0.376, "step": 1082000 }, { "epoch": 7.32527609354699, "grad_norm": 0.3802151381969452, "learning_rate": 4.9267472390645306e-05, "loss": 0.3764, "step": 1082500 }, { "epoch": 7.328659592897358, "grad_norm": 0.3427773416042328, "learning_rate": 4.926713404071027e-05, "loss": 0.3765, "step": 1083000 }, { "epoch": 7.332043092247726, "grad_norm": 0.30575892329216003, "learning_rate": 4.926679569077523e-05, "loss": 0.3767, "step": 1083500 }, { "epoch": 7.335426591598094, "grad_norm": 0.328872948884964, "learning_rate": 4.926645734084019e-05, "loss": 0.3766, "step": 1084000 }, { "epoch": 7.338810090948463, "grad_norm": 0.38873931765556335, "learning_rate": 4.926611899090516e-05, "loss": 0.3755, "step": 1084500 }, { "epoch": 7.34219359029883, "grad_norm": 0.3008991479873657, "learning_rate": 4.926578064097012e-05, "loss": 0.3757, "step": 1085000 }, { "epoch": 7.345577089649199, "grad_norm": 0.32548144459724426, "learning_rate": 4.926544229103508e-05, "loss": 0.3758, "step": 1085500 }, { "epoch": 7.348960588999567, "grad_norm": 0.3852235674858093, "learning_rate": 4.926510394110004e-05, "loss": 0.3761, "step": 1086000 }, { "epoch": 7.352344088349935, "grad_norm": 0.3904217481613159, "learning_rate": 4.926476559116501e-05, "loss": 0.3754, "step": 1086500 }, { "epoch": 7.355727587700303, "grad_norm": 0.3320090174674988, "learning_rate": 4.926442724122997e-05, "loss": 0.376, "step": 1087000 }, { "epoch": 7.359111087050671, "grad_norm": 0.3738870322704315, "learning_rate": 4.9264088891294934e-05, "loss": 0.3759, "step": 1087500 }, { "epoch": 7.36249458640104, "grad_norm": 0.3391508162021637, "learning_rate": 4.9263750541359896e-05, "loss": 0.376, "step": 1088000 }, { "epoch": 7.365878085751407, "grad_norm": 0.3554190993309021, "learning_rate": 4.9263412191424865e-05, "loss": 0.3757, "step": 1088500 }, { "epoch": 7.369261585101776, "grad_norm": 0.37407541275024414, "learning_rate": 4.926307384148983e-05, "loss": 0.3757, "step": 1089000 }, { "epoch": 7.372645084452143, "grad_norm": 0.3865109384059906, "learning_rate": 4.926273549155479e-05, "loss": 0.3755, "step": 1089500 }, { "epoch": 7.376028583802512, "grad_norm": 0.32073667645454407, "learning_rate": 4.926239714161975e-05, "loss": 0.3749, "step": 1090000 }, { "epoch": 7.37941208315288, "grad_norm": 0.3938956558704376, "learning_rate": 4.926205879168471e-05, "loss": 0.3766, "step": 1090500 }, { "epoch": 7.382795582503248, "grad_norm": 0.35945776104927063, "learning_rate": 4.9261720441749675e-05, "loss": 0.3756, "step": 1091000 }, { "epoch": 7.386179081853617, "grad_norm": 0.35040605068206787, "learning_rate": 4.926138209181464e-05, "loss": 0.3758, "step": 1091500 }, { "epoch": 7.389562581203984, "grad_norm": 0.3531397581100464, "learning_rate": 4.9261043741879606e-05, "loss": 0.3759, "step": 1092000 }, { "epoch": 7.392946080554353, "grad_norm": 0.3244784474372864, "learning_rate": 4.926070539194457e-05, "loss": 0.3756, "step": 1092500 }, { "epoch": 7.39632957990472, "grad_norm": 0.3516186773777008, "learning_rate": 4.926036704200953e-05, "loss": 0.3763, "step": 1093000 }, { "epoch": 7.399713079255089, "grad_norm": 0.3564213514328003, "learning_rate": 4.926002869207449e-05, "loss": 0.3759, "step": 1093500 }, { "epoch": 7.403096578605457, "grad_norm": 0.33187055587768555, "learning_rate": 4.925969034213946e-05, "loss": 0.3757, "step": 1094000 }, { "epoch": 7.406480077955825, "grad_norm": 0.4159274995326996, "learning_rate": 4.9259351992204424e-05, "loss": 0.3758, "step": 1094500 }, { "epoch": 7.409863577306194, "grad_norm": 0.37454167008399963, "learning_rate": 4.925901364226938e-05, "loss": 0.3773, "step": 1095000 }, { "epoch": 7.413247076656561, "grad_norm": 0.32225438952445984, "learning_rate": 4.925867529233434e-05, "loss": 0.3766, "step": 1095500 }, { "epoch": 7.41663057600693, "grad_norm": 0.3051474690437317, "learning_rate": 4.925833694239931e-05, "loss": 0.3775, "step": 1096000 }, { "epoch": 7.420014075357297, "grad_norm": 0.32634198665618896, "learning_rate": 4.925799859246427e-05, "loss": 0.3755, "step": 1096500 }, { "epoch": 7.423397574707666, "grad_norm": 0.338914692401886, "learning_rate": 4.9257660242529234e-05, "loss": 0.3761, "step": 1097000 }, { "epoch": 7.426781074058034, "grad_norm": 0.3681485950946808, "learning_rate": 4.9257321892594197e-05, "loss": 0.3757, "step": 1097500 }, { "epoch": 7.430164573408402, "grad_norm": 0.3372761309146881, "learning_rate": 4.9256983542659165e-05, "loss": 0.3764, "step": 1098000 }, { "epoch": 7.43354807275877, "grad_norm": 0.3378821015357971, "learning_rate": 4.925664519272413e-05, "loss": 0.3765, "step": 1098500 }, { "epoch": 7.436931572109138, "grad_norm": 0.3365425169467926, "learning_rate": 4.925630684278909e-05, "loss": 0.3763, "step": 1099000 }, { "epoch": 7.440315071459506, "grad_norm": 0.3577186167240143, "learning_rate": 4.925596849285405e-05, "loss": 0.3762, "step": 1099500 }, { "epoch": 7.443698570809874, "grad_norm": 0.3601377606391907, "learning_rate": 4.9255630142919014e-05, "loss": 0.377, "step": 1100000 }, { "epoch": 7.447082070160243, "grad_norm": 0.3320385813713074, "learning_rate": 4.9255291792983976e-05, "loss": 0.3759, "step": 1100500 }, { "epoch": 7.450465569510611, "grad_norm": 0.33235663175582886, "learning_rate": 4.925495344304894e-05, "loss": 0.376, "step": 1101000 }, { "epoch": 7.453849068860979, "grad_norm": 0.33040741086006165, "learning_rate": 4.925461509311391e-05, "loss": 0.3756, "step": 1101500 }, { "epoch": 7.457232568211347, "grad_norm": 0.37784650921821594, "learning_rate": 4.925427674317887e-05, "loss": 0.3748, "step": 1102000 }, { "epoch": 7.460616067561715, "grad_norm": 0.35406967997550964, "learning_rate": 4.925393839324383e-05, "loss": 0.3758, "step": 1102500 }, { "epoch": 7.463999566912083, "grad_norm": 0.35439401865005493, "learning_rate": 4.9253600043308793e-05, "loss": 0.376, "step": 1103000 }, { "epoch": 7.467383066262451, "grad_norm": 0.3342632055282593, "learning_rate": 4.925326169337376e-05, "loss": 0.3758, "step": 1103500 }, { "epoch": 7.470766565612819, "grad_norm": 0.32852116227149963, "learning_rate": 4.9252923343438724e-05, "loss": 0.3754, "step": 1104000 }, { "epoch": 7.474150064963188, "grad_norm": 0.3155595064163208, "learning_rate": 4.925258499350368e-05, "loss": 0.3767, "step": 1104500 }, { "epoch": 7.477533564313555, "grad_norm": 0.35901975631713867, "learning_rate": 4.925224664356864e-05, "loss": 0.3764, "step": 1105000 }, { "epoch": 7.480917063663924, "grad_norm": 0.3231262266635895, "learning_rate": 4.925190829363361e-05, "loss": 0.3757, "step": 1105500 }, { "epoch": 7.484300563014292, "grad_norm": 0.37099528312683105, "learning_rate": 4.925156994369857e-05, "loss": 0.3746, "step": 1106000 }, { "epoch": 7.48768406236466, "grad_norm": 0.34101614356040955, "learning_rate": 4.9251231593763535e-05, "loss": 0.3756, "step": 1106500 }, { "epoch": 7.491067561715028, "grad_norm": 0.3836347758769989, "learning_rate": 4.92508932438285e-05, "loss": 0.3785, "step": 1107000 }, { "epoch": 7.494451061065396, "grad_norm": 0.326123982667923, "learning_rate": 4.9250554893893466e-05, "loss": 0.3762, "step": 1107500 }, { "epoch": 7.497834560415765, "grad_norm": 0.3567655086517334, "learning_rate": 4.925021654395843e-05, "loss": 0.3761, "step": 1108000 }, { "epoch": 7.501218059766132, "grad_norm": 0.3998945653438568, "learning_rate": 4.924987819402339e-05, "loss": 0.3764, "step": 1108500 }, { "epoch": 7.504601559116501, "grad_norm": 0.3774033784866333, "learning_rate": 4.924953984408835e-05, "loss": 0.3765, "step": 1109000 }, { "epoch": 7.507985058466868, "grad_norm": 0.3236536979675293, "learning_rate": 4.9249201494153315e-05, "loss": 0.376, "step": 1109500 }, { "epoch": 7.511368557817237, "grad_norm": 0.3851451277732849, "learning_rate": 4.924886314421828e-05, "loss": 0.3744, "step": 1110000 }, { "epoch": 7.514752057167605, "grad_norm": 0.34989050030708313, "learning_rate": 4.924852479428324e-05, "loss": 0.3753, "step": 1110500 }, { "epoch": 7.518135556517973, "grad_norm": 0.33295780420303345, "learning_rate": 4.924818644434821e-05, "loss": 0.3747, "step": 1111000 }, { "epoch": 7.521519055868342, "grad_norm": 0.36218512058258057, "learning_rate": 4.924784809441317e-05, "loss": 0.3744, "step": 1111500 }, { "epoch": 7.524902555218709, "grad_norm": 0.33659127354621887, "learning_rate": 4.924750974447813e-05, "loss": 0.3758, "step": 1112000 }, { "epoch": 7.528286054569078, "grad_norm": 0.3490145206451416, "learning_rate": 4.9247171394543094e-05, "loss": 0.3753, "step": 1112500 }, { "epoch": 7.531669553919445, "grad_norm": 0.3658861219882965, "learning_rate": 4.9246833044608056e-05, "loss": 0.3766, "step": 1113000 }, { "epoch": 7.535053053269814, "grad_norm": 0.33971354365348816, "learning_rate": 4.9246494694673025e-05, "loss": 0.3754, "step": 1113500 }, { "epoch": 7.5384365526201815, "grad_norm": 0.37164369225502014, "learning_rate": 4.924615634473798e-05, "loss": 0.3754, "step": 1114000 }, { "epoch": 7.54182005197055, "grad_norm": 0.3788661062717438, "learning_rate": 4.924581799480294e-05, "loss": 0.3755, "step": 1114500 }, { "epoch": 7.545203551320919, "grad_norm": 0.3288120627403259, "learning_rate": 4.924547964486791e-05, "loss": 0.377, "step": 1115000 }, { "epoch": 7.548587050671286, "grad_norm": 0.356442928314209, "learning_rate": 4.9245141294932874e-05, "loss": 0.378, "step": 1115500 }, { "epoch": 7.551970550021655, "grad_norm": 0.35600316524505615, "learning_rate": 4.9244802944997836e-05, "loss": 0.3749, "step": 1116000 }, { "epoch": 7.555354049372022, "grad_norm": 0.37920600175857544, "learning_rate": 4.92444645950628e-05, "loss": 0.3756, "step": 1116500 }, { "epoch": 7.558737548722391, "grad_norm": 0.32114550471305847, "learning_rate": 4.924412624512777e-05, "loss": 0.3776, "step": 1117000 }, { "epoch": 7.5621210480727585, "grad_norm": 0.33889034390449524, "learning_rate": 4.924378789519273e-05, "loss": 0.3773, "step": 1117500 }, { "epoch": 7.565504547423127, "grad_norm": 0.35345473885536194, "learning_rate": 4.924344954525769e-05, "loss": 0.3747, "step": 1118000 }, { "epoch": 7.568888046773495, "grad_norm": 0.32850387692451477, "learning_rate": 4.924311119532265e-05, "loss": 0.3756, "step": 1118500 }, { "epoch": 7.572271546123863, "grad_norm": 0.3322440981864929, "learning_rate": 4.9242772845387615e-05, "loss": 0.3768, "step": 1119000 }, { "epoch": 7.575655045474232, "grad_norm": 0.354925274848938, "learning_rate": 4.924243449545258e-05, "loss": 0.3751, "step": 1119500 }, { "epoch": 7.579038544824599, "grad_norm": 0.3208562731742859, "learning_rate": 4.924209614551754e-05, "loss": 0.3761, "step": 1120000 }, { "epoch": 7.582422044174968, "grad_norm": 0.3661039173603058, "learning_rate": 4.92417577955825e-05, "loss": 0.3749, "step": 1120500 }, { "epoch": 7.5858055435253355, "grad_norm": 0.3295091986656189, "learning_rate": 4.924141944564747e-05, "loss": 0.375, "step": 1121000 }, { "epoch": 7.589189042875704, "grad_norm": 0.3443576395511627, "learning_rate": 4.924108109571243e-05, "loss": 0.3756, "step": 1121500 }, { "epoch": 7.592572542226072, "grad_norm": 0.345859557390213, "learning_rate": 4.9240742745777395e-05, "loss": 0.3773, "step": 1122000 }, { "epoch": 7.59595604157644, "grad_norm": 0.35861361026763916, "learning_rate": 4.924040439584236e-05, "loss": 0.3753, "step": 1122500 }, { "epoch": 7.599339540926808, "grad_norm": 0.32491734623908997, "learning_rate": 4.9240066045907326e-05, "loss": 0.376, "step": 1123000 }, { "epoch": 7.602723040277176, "grad_norm": 0.3575875759124756, "learning_rate": 4.923972769597228e-05, "loss": 0.3759, "step": 1123500 }, { "epoch": 7.606106539627544, "grad_norm": 0.3628663122653961, "learning_rate": 4.923938934603724e-05, "loss": 0.375, "step": 1124000 }, { "epoch": 7.6094900389779125, "grad_norm": 0.3580170273780823, "learning_rate": 4.923905099610221e-05, "loss": 0.3766, "step": 1124500 }, { "epoch": 7.61287353832828, "grad_norm": 0.3711313307285309, "learning_rate": 4.9238712646167174e-05, "loss": 0.3759, "step": 1125000 }, { "epoch": 7.616257037678649, "grad_norm": 0.320784330368042, "learning_rate": 4.9238374296232136e-05, "loss": 0.3762, "step": 1125500 }, { "epoch": 7.619640537029017, "grad_norm": 0.32204243540763855, "learning_rate": 4.92380359462971e-05, "loss": 0.3777, "step": 1126000 }, { "epoch": 7.623024036379385, "grad_norm": 0.3574248254299164, "learning_rate": 4.923769759636207e-05, "loss": 0.3756, "step": 1126500 }, { "epoch": 7.626407535729753, "grad_norm": 0.3387276828289032, "learning_rate": 4.923735924642703e-05, "loss": 0.3758, "step": 1127000 }, { "epoch": 7.629791035080121, "grad_norm": 0.33094269037246704, "learning_rate": 4.923702089649199e-05, "loss": 0.3761, "step": 1127500 }, { "epoch": 7.6331745344304895, "grad_norm": 0.33912894129753113, "learning_rate": 4.9236682546556954e-05, "loss": 0.3754, "step": 1128000 }, { "epoch": 7.636558033780857, "grad_norm": 0.3694305121898651, "learning_rate": 4.9236344196621916e-05, "loss": 0.3754, "step": 1128500 }, { "epoch": 7.639941533131226, "grad_norm": 0.35075610876083374, "learning_rate": 4.923600584668688e-05, "loss": 0.3757, "step": 1129000 }, { "epoch": 7.643325032481593, "grad_norm": 0.3775167167186737, "learning_rate": 4.923566749675184e-05, "loss": 0.3766, "step": 1129500 }, { "epoch": 7.646708531831962, "grad_norm": 0.3631054759025574, "learning_rate": 4.92353291468168e-05, "loss": 0.3751, "step": 1130000 }, { "epoch": 7.65009203118233, "grad_norm": 0.3584959805011749, "learning_rate": 4.923499079688177e-05, "loss": 0.3763, "step": 1130500 }, { "epoch": 7.653475530532698, "grad_norm": 0.361883282661438, "learning_rate": 4.923465244694673e-05, "loss": 0.3761, "step": 1131000 }, { "epoch": 7.6568590298830665, "grad_norm": 0.3296509087085724, "learning_rate": 4.9234314097011695e-05, "loss": 0.3756, "step": 1131500 }, { "epoch": 7.660242529233434, "grad_norm": 0.34193360805511475, "learning_rate": 4.923397574707666e-05, "loss": 0.3749, "step": 1132000 }, { "epoch": 7.663626028583803, "grad_norm": 0.337868332862854, "learning_rate": 4.9233637397141626e-05, "loss": 0.3769, "step": 1132500 }, { "epoch": 7.66700952793417, "grad_norm": 0.37247219681739807, "learning_rate": 4.923329904720659e-05, "loss": 0.3753, "step": 1133000 }, { "epoch": 7.670393027284539, "grad_norm": 0.3282815217971802, "learning_rate": 4.9232960697271544e-05, "loss": 0.3762, "step": 1133500 }, { "epoch": 7.6737765266349065, "grad_norm": 0.29900139570236206, "learning_rate": 4.923262234733651e-05, "loss": 0.3769, "step": 1134000 }, { "epoch": 7.677160025985275, "grad_norm": 0.31819576025009155, "learning_rate": 4.9232283997401475e-05, "loss": 0.3759, "step": 1134500 }, { "epoch": 7.6805435253356436, "grad_norm": 0.3621688187122345, "learning_rate": 4.923194564746644e-05, "loss": 0.3753, "step": 1135000 }, { "epoch": 7.683927024686011, "grad_norm": 0.35284703969955444, "learning_rate": 4.92316072975314e-05, "loss": 0.3769, "step": 1135500 }, { "epoch": 7.68731052403638, "grad_norm": 0.34793928265571594, "learning_rate": 4.923126894759637e-05, "loss": 0.3764, "step": 1136000 }, { "epoch": 7.690694023386747, "grad_norm": 0.34080761671066284, "learning_rate": 4.923093059766133e-05, "loss": 0.376, "step": 1136500 }, { "epoch": 7.694077522737116, "grad_norm": 0.3793800175189972, "learning_rate": 4.923059224772629e-05, "loss": 0.3763, "step": 1137000 }, { "epoch": 7.6974610220874835, "grad_norm": 0.340820848941803, "learning_rate": 4.9230253897791254e-05, "loss": 0.3752, "step": 1137500 }, { "epoch": 7.700844521437852, "grad_norm": 0.3901347815990448, "learning_rate": 4.9229915547856216e-05, "loss": 0.3761, "step": 1138000 }, { "epoch": 7.70422802078822, "grad_norm": 0.3766717314720154, "learning_rate": 4.922957719792118e-05, "loss": 0.3755, "step": 1138500 }, { "epoch": 7.707611520138588, "grad_norm": 0.33552950620651245, "learning_rate": 4.922923884798614e-05, "loss": 0.3758, "step": 1139000 }, { "epoch": 7.710995019488957, "grad_norm": 0.3720012307167053, "learning_rate": 4.92289004980511e-05, "loss": 0.3747, "step": 1139500 }, { "epoch": 7.714378518839324, "grad_norm": 0.33156847953796387, "learning_rate": 4.922856214811607e-05, "loss": 0.3772, "step": 1140000 }, { "epoch": 7.717762018189693, "grad_norm": 0.3560585081577301, "learning_rate": 4.9228223798181034e-05, "loss": 0.3771, "step": 1140500 }, { "epoch": 7.7211455175400605, "grad_norm": 0.33120888471603394, "learning_rate": 4.9227885448245996e-05, "loss": 0.3751, "step": 1141000 }, { "epoch": 7.724529016890429, "grad_norm": 0.3647676706314087, "learning_rate": 4.922754709831096e-05, "loss": 0.374, "step": 1141500 }, { "epoch": 7.727912516240797, "grad_norm": 0.3265502452850342, "learning_rate": 4.922720874837593e-05, "loss": 0.3761, "step": 1142000 }, { "epoch": 7.731296015591165, "grad_norm": 0.3567357659339905, "learning_rate": 4.922687039844089e-05, "loss": 0.3751, "step": 1142500 }, { "epoch": 7.734679514941533, "grad_norm": 0.380672425031662, "learning_rate": 4.9226532048505844e-05, "loss": 0.3746, "step": 1143000 }, { "epoch": 7.738063014291901, "grad_norm": 0.3651910424232483, "learning_rate": 4.922619369857081e-05, "loss": 0.3771, "step": 1143500 }, { "epoch": 7.74144651364227, "grad_norm": 0.3869389295578003, "learning_rate": 4.9225855348635775e-05, "loss": 0.3763, "step": 1144000 }, { "epoch": 7.7448300129926375, "grad_norm": 0.3407086133956909, "learning_rate": 4.922551699870074e-05, "loss": 0.3777, "step": 1144500 }, { "epoch": 7.748213512343005, "grad_norm": 0.33296847343444824, "learning_rate": 4.92251786487657e-05, "loss": 0.3764, "step": 1145000 }, { "epoch": 7.751597011693374, "grad_norm": 0.3720654547214508, "learning_rate": 4.922484029883067e-05, "loss": 0.3749, "step": 1145500 }, { "epoch": 7.754980511043742, "grad_norm": 0.33162111043930054, "learning_rate": 4.922450194889563e-05, "loss": 0.375, "step": 1146000 }, { "epoch": 7.75836401039411, "grad_norm": 0.36200985312461853, "learning_rate": 4.922416359896059e-05, "loss": 0.3755, "step": 1146500 }, { "epoch": 7.761747509744478, "grad_norm": 0.3476463258266449, "learning_rate": 4.9223825249025555e-05, "loss": 0.3751, "step": 1147000 }, { "epoch": 7.765131009094846, "grad_norm": 0.3678581118583679, "learning_rate": 4.922348689909052e-05, "loss": 0.3755, "step": 1147500 }, { "epoch": 7.7685145084452145, "grad_norm": 0.35781508684158325, "learning_rate": 4.922314854915548e-05, "loss": 0.3754, "step": 1148000 }, { "epoch": 7.771898007795582, "grad_norm": 0.36711201071739197, "learning_rate": 4.922281019922044e-05, "loss": 0.3775, "step": 1148500 }, { "epoch": 7.775281507145951, "grad_norm": 0.33325284719467163, "learning_rate": 4.9222471849285403e-05, "loss": 0.3754, "step": 1149000 }, { "epoch": 7.778665006496318, "grad_norm": 0.33583301305770874, "learning_rate": 4.922213349935037e-05, "loss": 0.376, "step": 1149500 }, { "epoch": 7.782048505846687, "grad_norm": 0.3751732110977173, "learning_rate": 4.9221795149415334e-05, "loss": 0.3751, "step": 1150000 }, { "epoch": 7.785432005197055, "grad_norm": 0.33746105432510376, "learning_rate": 4.9221456799480297e-05, "loss": 0.3773, "step": 1150500 }, { "epoch": 7.788815504547423, "grad_norm": 0.33039334416389465, "learning_rate": 4.922111844954526e-05, "loss": 0.3771, "step": 1151000 }, { "epoch": 7.7921990038977915, "grad_norm": 0.3974260687828064, "learning_rate": 4.922078009961023e-05, "loss": 0.3754, "step": 1151500 }, { "epoch": 7.795582503248159, "grad_norm": 0.35063496232032776, "learning_rate": 4.922044174967519e-05, "loss": 0.374, "step": 1152000 }, { "epoch": 7.798966002598528, "grad_norm": 0.3475898504257202, "learning_rate": 4.9220103399740145e-05, "loss": 0.3754, "step": 1152500 }, { "epoch": 7.802349501948895, "grad_norm": 0.36210474371910095, "learning_rate": 4.9219765049805114e-05, "loss": 0.3767, "step": 1153000 }, { "epoch": 7.805733001299264, "grad_norm": 0.34278056025505066, "learning_rate": 4.9219426699870076e-05, "loss": 0.3763, "step": 1153500 }, { "epoch": 7.8091165006496315, "grad_norm": 0.3511875867843628, "learning_rate": 4.921908834993504e-05, "loss": 0.3756, "step": 1154000 }, { "epoch": 7.8125, "grad_norm": 0.422054260969162, "learning_rate": 4.921875e-05, "loss": 0.3759, "step": 1154500 }, { "epoch": 7.8158834993503685, "grad_norm": 0.3470151126384735, "learning_rate": 4.921841165006497e-05, "loss": 0.3762, "step": 1155000 }, { "epoch": 7.819266998700736, "grad_norm": 0.4256981313228607, "learning_rate": 4.921807330012993e-05, "loss": 0.3767, "step": 1155500 }, { "epoch": 7.822650498051105, "grad_norm": 0.3501984477043152, "learning_rate": 4.9217734950194893e-05, "loss": 0.3768, "step": 1156000 }, { "epoch": 7.826033997401472, "grad_norm": 0.3682548403739929, "learning_rate": 4.9217396600259856e-05, "loss": 0.3758, "step": 1156500 }, { "epoch": 7.829417496751841, "grad_norm": 0.32662200927734375, "learning_rate": 4.921705825032482e-05, "loss": 0.3765, "step": 1157000 }, { "epoch": 7.8328009961022085, "grad_norm": 0.3631592392921448, "learning_rate": 4.921671990038978e-05, "loss": 0.3763, "step": 1157500 }, { "epoch": 7.836184495452577, "grad_norm": 0.3583422899246216, "learning_rate": 4.921638155045474e-05, "loss": 0.3744, "step": 1158000 }, { "epoch": 7.839567994802945, "grad_norm": 0.35009801387786865, "learning_rate": 4.9216043200519704e-05, "loss": 0.3754, "step": 1158500 }, { "epoch": 7.842951494153313, "grad_norm": 0.361879825592041, "learning_rate": 4.921570485058467e-05, "loss": 0.3749, "step": 1159000 }, { "epoch": 7.846334993503682, "grad_norm": 0.3249756693840027, "learning_rate": 4.9215366500649635e-05, "loss": 0.376, "step": 1159500 }, { "epoch": 7.849718492854049, "grad_norm": 0.33673372864723206, "learning_rate": 4.92150281507146e-05, "loss": 0.3763, "step": 1160000 }, { "epoch": 7.853101992204418, "grad_norm": 0.40804365277290344, "learning_rate": 4.921468980077956e-05, "loss": 0.3758, "step": 1160500 }, { "epoch": 7.8564854915547855, "grad_norm": 0.33804696798324585, "learning_rate": 4.921435145084453e-05, "loss": 0.3773, "step": 1161000 }, { "epoch": 7.859868990905154, "grad_norm": 0.3609027862548828, "learning_rate": 4.921401310090949e-05, "loss": 0.3739, "step": 1161500 }, { "epoch": 7.863252490255522, "grad_norm": 0.3798917829990387, "learning_rate": 4.9213674750974446e-05, "loss": 0.3763, "step": 1162000 }, { "epoch": 7.86663598960589, "grad_norm": 0.39154064655303955, "learning_rate": 4.9213336401039415e-05, "loss": 0.3766, "step": 1162500 }, { "epoch": 7.870019488956258, "grad_norm": 0.35580357909202576, "learning_rate": 4.921299805110438e-05, "loss": 0.3762, "step": 1163000 }, { "epoch": 7.873402988306626, "grad_norm": 0.357096791267395, "learning_rate": 4.921265970116934e-05, "loss": 0.3761, "step": 1163500 }, { "epoch": 7.876786487656995, "grad_norm": 0.3507744371891022, "learning_rate": 4.92123213512343e-05, "loss": 0.3755, "step": 1164000 }, { "epoch": 7.8801699870073625, "grad_norm": 0.375263512134552, "learning_rate": 4.921198300129927e-05, "loss": 0.3759, "step": 1164500 }, { "epoch": 7.88355348635773, "grad_norm": 0.3427620530128479, "learning_rate": 4.921164465136423e-05, "loss": 0.3748, "step": 1165000 }, { "epoch": 7.886936985708099, "grad_norm": 0.32488977909088135, "learning_rate": 4.9211306301429194e-05, "loss": 0.3752, "step": 1165500 }, { "epoch": 7.890320485058467, "grad_norm": 0.3429107964038849, "learning_rate": 4.9210967951494156e-05, "loss": 0.3767, "step": 1166000 }, { "epoch": 7.893703984408835, "grad_norm": 0.3646707832813263, "learning_rate": 4.921062960155912e-05, "loss": 0.376, "step": 1166500 }, { "epoch": 7.897087483759203, "grad_norm": 0.34920671582221985, "learning_rate": 4.921029125162408e-05, "loss": 0.3754, "step": 1167000 }, { "epoch": 7.900470983109571, "grad_norm": 0.3300837576389313, "learning_rate": 4.920995290168904e-05, "loss": 0.3758, "step": 1167500 }, { "epoch": 7.9038544824599395, "grad_norm": 0.38310351967811584, "learning_rate": 4.9209614551754005e-05, "loss": 0.3763, "step": 1168000 }, { "epoch": 7.907237981810307, "grad_norm": 0.3786688446998596, "learning_rate": 4.9209276201818974e-05, "loss": 0.3748, "step": 1168500 }, { "epoch": 7.910621481160676, "grad_norm": 0.3455840051174164, "learning_rate": 4.9208937851883936e-05, "loss": 0.3756, "step": 1169000 }, { "epoch": 7.914004980511043, "grad_norm": 0.31133291125297546, "learning_rate": 4.92085995019489e-05, "loss": 0.3746, "step": 1169500 }, { "epoch": 7.917388479861412, "grad_norm": 0.3466230034828186, "learning_rate": 4.920826115201386e-05, "loss": 0.3756, "step": 1170000 }, { "epoch": 7.92077197921178, "grad_norm": 0.35299792885780334, "learning_rate": 4.920792280207883e-05, "loss": 0.3757, "step": 1170500 }, { "epoch": 7.924155478562148, "grad_norm": 0.3923153281211853, "learning_rate": 4.920758445214379e-05, "loss": 0.3765, "step": 1171000 }, { "epoch": 7.9275389779125165, "grad_norm": 0.34024280309677124, "learning_rate": 4.9207246102208746e-05, "loss": 0.3744, "step": 1171500 }, { "epoch": 7.930922477262884, "grad_norm": 0.35224542021751404, "learning_rate": 4.9206907752273715e-05, "loss": 0.3763, "step": 1172000 }, { "epoch": 7.934305976613253, "grad_norm": 0.4136195182800293, "learning_rate": 4.920656940233868e-05, "loss": 0.3747, "step": 1172500 }, { "epoch": 7.93768947596362, "grad_norm": 0.3392421305179596, "learning_rate": 4.920623105240364e-05, "loss": 0.3783, "step": 1173000 }, { "epoch": 7.941072975313989, "grad_norm": 0.3497343063354492, "learning_rate": 4.92058927024686e-05, "loss": 0.3752, "step": 1173500 }, { "epoch": 7.9444564746643564, "grad_norm": 0.34304261207580566, "learning_rate": 4.920555435253357e-05, "loss": 0.3759, "step": 1174000 }, { "epoch": 7.947839974014725, "grad_norm": 0.31310826539993286, "learning_rate": 4.920521600259853e-05, "loss": 0.3775, "step": 1174500 }, { "epoch": 7.9512234733650935, "grad_norm": 0.33917033672332764, "learning_rate": 4.9204877652663495e-05, "loss": 0.3753, "step": 1175000 }, { "epoch": 7.954606972715461, "grad_norm": 0.28793883323669434, "learning_rate": 4.920453930272846e-05, "loss": 0.3757, "step": 1175500 }, { "epoch": 7.95799047206583, "grad_norm": 0.3498222231864929, "learning_rate": 4.920420095279342e-05, "loss": 0.376, "step": 1176000 }, { "epoch": 7.961373971416197, "grad_norm": 0.3331172466278076, "learning_rate": 4.920386260285838e-05, "loss": 0.3764, "step": 1176500 }, { "epoch": 7.964757470766566, "grad_norm": 0.3206407129764557, "learning_rate": 4.920352425292334e-05, "loss": 0.3747, "step": 1177000 }, { "epoch": 7.9681409701169335, "grad_norm": 0.35807299613952637, "learning_rate": 4.9203185902988305e-05, "loss": 0.3746, "step": 1177500 }, { "epoch": 7.971524469467302, "grad_norm": 0.3473316431045532, "learning_rate": 4.9202847553053274e-05, "loss": 0.3765, "step": 1178000 }, { "epoch": 7.97490796881767, "grad_norm": 0.3588818907737732, "learning_rate": 4.9202509203118236e-05, "loss": 0.3775, "step": 1178500 }, { "epoch": 7.978291468168038, "grad_norm": 0.39430102705955505, "learning_rate": 4.92021708531832e-05, "loss": 0.3761, "step": 1179000 }, { "epoch": 7.981674967518407, "grad_norm": 0.34274429082870483, "learning_rate": 4.920183250324816e-05, "loss": 0.3783, "step": 1179500 }, { "epoch": 7.985058466868774, "grad_norm": 0.38946691155433655, "learning_rate": 4.920149415331313e-05, "loss": 0.374, "step": 1180000 }, { "epoch": 7.988441966219143, "grad_norm": 0.35403940081596375, "learning_rate": 4.920115580337809e-05, "loss": 0.3746, "step": 1180500 }, { "epoch": 7.9918254655695105, "grad_norm": 0.36802688241004944, "learning_rate": 4.920081745344305e-05, "loss": 0.3757, "step": 1181000 }, { "epoch": 7.995208964919879, "grad_norm": 0.38939595222473145, "learning_rate": 4.9200479103508016e-05, "loss": 0.3754, "step": 1181500 }, { "epoch": 7.998592464270247, "grad_norm": 0.35823124647140503, "learning_rate": 4.920014075357298e-05, "loss": 0.3733, "step": 1182000 }, { "epoch": 8.0, "eval_accuracy": 0.8572758723371212, "eval_loss": 0.5793046951293945, "eval_runtime": 3396.5385, "eval_samples_per_second": 85.6, "eval_steps_per_second": 5.35, "step": 1182208 }, { "epoch": 8.001975963620614, "grad_norm": 0.3599242568016052, "learning_rate": 4.919980240363794e-05, "loss": 0.374, "step": 1182500 }, { "epoch": 8.005359462970983, "grad_norm": 0.3641102612018585, "learning_rate": 4.91994640537029e-05, "loss": 0.3737, "step": 1183000 }, { "epoch": 8.008742962321351, "grad_norm": 0.30603310465812683, "learning_rate": 4.9199125703767864e-05, "loss": 0.3746, "step": 1183500 }, { "epoch": 8.01212646167172, "grad_norm": 0.3400936722755432, "learning_rate": 4.919878735383283e-05, "loss": 0.3744, "step": 1184000 }, { "epoch": 8.015509961022088, "grad_norm": 0.3894224464893341, "learning_rate": 4.9198449003897795e-05, "loss": 0.3736, "step": 1184500 }, { "epoch": 8.018893460372455, "grad_norm": 0.3433375060558319, "learning_rate": 4.919811065396276e-05, "loss": 0.3734, "step": 1185000 }, { "epoch": 8.022276959722824, "grad_norm": 0.36428365111351013, "learning_rate": 4.919777230402772e-05, "loss": 0.3733, "step": 1185500 }, { "epoch": 8.025660459073192, "grad_norm": 0.3363524377346039, "learning_rate": 4.919743395409268e-05, "loss": 0.373, "step": 1186000 }, { "epoch": 8.02904395842356, "grad_norm": 0.33948156237602234, "learning_rate": 4.9197095604157644e-05, "loss": 0.3736, "step": 1186500 }, { "epoch": 8.032427457773927, "grad_norm": 0.32381463050842285, "learning_rate": 4.9196757254222606e-05, "loss": 0.3733, "step": 1187000 }, { "epoch": 8.035810957124296, "grad_norm": 0.3439604938030243, "learning_rate": 4.9196418904287575e-05, "loss": 0.3735, "step": 1187500 }, { "epoch": 8.039194456474664, "grad_norm": 0.3471587002277374, "learning_rate": 4.919608055435254e-05, "loss": 0.3733, "step": 1188000 }, { "epoch": 8.042577955825033, "grad_norm": 0.344894140958786, "learning_rate": 4.91957422044175e-05, "loss": 0.3726, "step": 1188500 }, { "epoch": 8.0459614551754, "grad_norm": 0.3696286082267761, "learning_rate": 4.919540385448246e-05, "loss": 0.3735, "step": 1189000 }, { "epoch": 8.049344954525768, "grad_norm": 0.39218756556510925, "learning_rate": 4.919506550454743e-05, "loss": 0.375, "step": 1189500 }, { "epoch": 8.052728453876137, "grad_norm": 0.3539004325866699, "learning_rate": 4.919472715461239e-05, "loss": 0.3737, "step": 1190000 }, { "epoch": 8.056111953226505, "grad_norm": 0.3537065088748932, "learning_rate": 4.919438880467735e-05, "loss": 0.373, "step": 1190500 }, { "epoch": 8.059495452576874, "grad_norm": 0.4091944694519043, "learning_rate": 4.919405045474231e-05, "loss": 0.3738, "step": 1191000 }, { "epoch": 8.06287895192724, "grad_norm": 0.2999604642391205, "learning_rate": 4.919371210480728e-05, "loss": 0.3728, "step": 1191500 }, { "epoch": 8.066262451277609, "grad_norm": 0.3378213942050934, "learning_rate": 4.919337375487224e-05, "loss": 0.373, "step": 1192000 }, { "epoch": 8.069645950627978, "grad_norm": 0.38694649934768677, "learning_rate": 4.91930354049372e-05, "loss": 0.3737, "step": 1192500 }, { "epoch": 8.073029449978346, "grad_norm": 0.3591739535331726, "learning_rate": 4.9192697055002165e-05, "loss": 0.3744, "step": 1193000 }, { "epoch": 8.076412949328713, "grad_norm": 0.34628209471702576, "learning_rate": 4.9192358705067134e-05, "loss": 0.3741, "step": 1193500 }, { "epoch": 8.079796448679081, "grad_norm": 0.3418349325656891, "learning_rate": 4.9192020355132096e-05, "loss": 0.3737, "step": 1194000 }, { "epoch": 8.08317994802945, "grad_norm": 0.33706989884376526, "learning_rate": 4.919168200519706e-05, "loss": 0.3731, "step": 1194500 }, { "epoch": 8.086563447379818, "grad_norm": 0.3412095904350281, "learning_rate": 4.919134365526202e-05, "loss": 0.3758, "step": 1195000 }, { "epoch": 8.089946946730187, "grad_norm": 0.3373109698295593, "learning_rate": 4.919100530532698e-05, "loss": 0.3731, "step": 1195500 }, { "epoch": 8.093330446080554, "grad_norm": 0.33834993839263916, "learning_rate": 4.9190666955391945e-05, "loss": 0.3765, "step": 1196000 }, { "epoch": 8.096713945430922, "grad_norm": 0.3101872205734253, "learning_rate": 4.919032860545691e-05, "loss": 0.3758, "step": 1196500 }, { "epoch": 8.10009744478129, "grad_norm": 0.31594258546829224, "learning_rate": 4.9189990255521876e-05, "loss": 0.3751, "step": 1197000 }, { "epoch": 8.10348094413166, "grad_norm": 0.39463159441947937, "learning_rate": 4.918965190558684e-05, "loss": 0.3752, "step": 1197500 }, { "epoch": 8.106864443482026, "grad_norm": 0.36349910497665405, "learning_rate": 4.91893135556518e-05, "loss": 0.374, "step": 1198000 }, { "epoch": 8.110247942832395, "grad_norm": 0.35538309812545776, "learning_rate": 4.918897520571676e-05, "loss": 0.3725, "step": 1198500 }, { "epoch": 8.113631442182763, "grad_norm": 0.3548257350921631, "learning_rate": 4.918863685578173e-05, "loss": 0.3736, "step": 1199000 }, { "epoch": 8.117014941533132, "grad_norm": 0.36630895733833313, "learning_rate": 4.918829850584669e-05, "loss": 0.3733, "step": 1199500 }, { "epoch": 8.1203984408835, "grad_norm": 0.34236612915992737, "learning_rate": 4.918796015591165e-05, "loss": 0.3734, "step": 1200000 }, { "epoch": 8.123781940233867, "grad_norm": 0.37828969955444336, "learning_rate": 4.918762180597661e-05, "loss": 0.3747, "step": 1200500 }, { "epoch": 8.127165439584235, "grad_norm": 0.3627392053604126, "learning_rate": 4.918728345604158e-05, "loss": 0.3736, "step": 1201000 }, { "epoch": 8.130548938934604, "grad_norm": 0.3256118893623352, "learning_rate": 4.918694510610654e-05, "loss": 0.3745, "step": 1201500 }, { "epoch": 8.133932438284972, "grad_norm": 0.3644091486930847, "learning_rate": 4.9186606756171504e-05, "loss": 0.3748, "step": 1202000 }, { "epoch": 8.13731593763534, "grad_norm": 0.34699633717536926, "learning_rate": 4.9186268406236466e-05, "loss": 0.3756, "step": 1202500 }, { "epoch": 8.140699436985708, "grad_norm": 0.4031618535518646, "learning_rate": 4.9185930056301435e-05, "loss": 0.3748, "step": 1203000 }, { "epoch": 8.144082936336076, "grad_norm": 0.4030303955078125, "learning_rate": 4.91855917063664e-05, "loss": 0.3751, "step": 1203500 }, { "epoch": 8.147466435686445, "grad_norm": 0.39308834075927734, "learning_rate": 4.918525335643136e-05, "loss": 0.3763, "step": 1204000 }, { "epoch": 8.150849935036813, "grad_norm": 0.36041557788848877, "learning_rate": 4.918491500649632e-05, "loss": 0.3734, "step": 1204500 }, { "epoch": 8.15423343438718, "grad_norm": 0.3809669017791748, "learning_rate": 4.918457665656128e-05, "loss": 0.3755, "step": 1205000 }, { "epoch": 8.157616933737549, "grad_norm": 0.3428496718406677, "learning_rate": 4.9184238306626245e-05, "loss": 0.3752, "step": 1205500 }, { "epoch": 8.161000433087917, "grad_norm": 0.3226144313812256, "learning_rate": 4.918389995669121e-05, "loss": 0.3734, "step": 1206000 }, { "epoch": 8.164383932438286, "grad_norm": 0.3690769672393799, "learning_rate": 4.9183561606756176e-05, "loss": 0.3749, "step": 1206500 }, { "epoch": 8.167767431788652, "grad_norm": 0.371756911277771, "learning_rate": 4.918322325682114e-05, "loss": 0.3752, "step": 1207000 }, { "epoch": 8.171150931139021, "grad_norm": 0.37059494853019714, "learning_rate": 4.91828849068861e-05, "loss": 0.3751, "step": 1207500 }, { "epoch": 8.17453443048939, "grad_norm": 0.3456904888153076, "learning_rate": 4.918254655695106e-05, "loss": 0.375, "step": 1208000 }, { "epoch": 8.177917929839758, "grad_norm": 0.3586566746234894, "learning_rate": 4.918220820701603e-05, "loss": 0.3748, "step": 1208500 }, { "epoch": 8.181301429190125, "grad_norm": 0.33923983573913574, "learning_rate": 4.9181869857080994e-05, "loss": 0.3739, "step": 1209000 }, { "epoch": 8.184684928540493, "grad_norm": 0.36180174350738525, "learning_rate": 4.918153150714595e-05, "loss": 0.375, "step": 1209500 }, { "epoch": 8.188068427890862, "grad_norm": 0.44262030720710754, "learning_rate": 4.918119315721091e-05, "loss": 0.3736, "step": 1210000 }, { "epoch": 8.19145192724123, "grad_norm": 0.3610953688621521, "learning_rate": 4.918085480727588e-05, "loss": 0.3733, "step": 1210500 }, { "epoch": 8.194835426591599, "grad_norm": 0.34002241492271423, "learning_rate": 4.918051645734084e-05, "loss": 0.3745, "step": 1211000 }, { "epoch": 8.198218925941966, "grad_norm": 0.3462762236595154, "learning_rate": 4.9180178107405804e-05, "loss": 0.3751, "step": 1211500 }, { "epoch": 8.201602425292334, "grad_norm": 0.39004191756248474, "learning_rate": 4.9179839757470766e-05, "loss": 0.3763, "step": 1212000 }, { "epoch": 8.204985924642703, "grad_norm": 0.3487882912158966, "learning_rate": 4.9179501407535735e-05, "loss": 0.3745, "step": 1212500 }, { "epoch": 8.208369423993071, "grad_norm": 0.37315833568573, "learning_rate": 4.91791630576007e-05, "loss": 0.3739, "step": 1213000 }, { "epoch": 8.211752923343438, "grad_norm": 0.3424568176269531, "learning_rate": 4.917882470766566e-05, "loss": 0.375, "step": 1213500 }, { "epoch": 8.215136422693806, "grad_norm": 0.38436761498451233, "learning_rate": 4.917848635773062e-05, "loss": 0.3742, "step": 1214000 }, { "epoch": 8.218519922044175, "grad_norm": 0.36517414450645447, "learning_rate": 4.9178148007795584e-05, "loss": 0.3719, "step": 1214500 }, { "epoch": 8.221903421394543, "grad_norm": 0.329480916261673, "learning_rate": 4.9177809657860546e-05, "loss": 0.374, "step": 1215000 }, { "epoch": 8.225286920744912, "grad_norm": 0.3483608663082123, "learning_rate": 4.917747130792551e-05, "loss": 0.3741, "step": 1215500 }, { "epoch": 8.228670420095279, "grad_norm": 0.32875847816467285, "learning_rate": 4.917713295799048e-05, "loss": 0.3748, "step": 1216000 }, { "epoch": 8.232053919445647, "grad_norm": 0.340464323759079, "learning_rate": 4.917679460805544e-05, "loss": 0.3747, "step": 1216500 }, { "epoch": 8.235437418796016, "grad_norm": 0.3463500738143921, "learning_rate": 4.91764562581204e-05, "loss": 0.3741, "step": 1217000 }, { "epoch": 8.238820918146384, "grad_norm": 0.3442372679710388, "learning_rate": 4.917611790818536e-05, "loss": 0.3757, "step": 1217500 }, { "epoch": 8.242204417496751, "grad_norm": 0.35514867305755615, "learning_rate": 4.917577955825033e-05, "loss": 0.3738, "step": 1218000 }, { "epoch": 8.24558791684712, "grad_norm": 0.3340912163257599, "learning_rate": 4.9175441208315294e-05, "loss": 0.3738, "step": 1218500 }, { "epoch": 8.248971416197488, "grad_norm": 0.3596610426902771, "learning_rate": 4.917510285838025e-05, "loss": 0.3751, "step": 1219000 }, { "epoch": 8.252354915547857, "grad_norm": 0.33829501271247864, "learning_rate": 4.917476450844521e-05, "loss": 0.3736, "step": 1219500 }, { "epoch": 8.255738414898225, "grad_norm": 0.3386409282684326, "learning_rate": 4.917442615851018e-05, "loss": 0.3742, "step": 1220000 }, { "epoch": 8.259121914248592, "grad_norm": 0.3590584397315979, "learning_rate": 4.917408780857514e-05, "loss": 0.3721, "step": 1220500 }, { "epoch": 8.26250541359896, "grad_norm": 0.341302752494812, "learning_rate": 4.9173749458640105e-05, "loss": 0.3739, "step": 1221000 }, { "epoch": 8.265888912949329, "grad_norm": 0.36562225222587585, "learning_rate": 4.917341110870507e-05, "loss": 0.3732, "step": 1221500 }, { "epoch": 8.269272412299697, "grad_norm": 0.34959301352500916, "learning_rate": 4.9173072758770036e-05, "loss": 0.3751, "step": 1222000 }, { "epoch": 8.272655911650064, "grad_norm": 0.3470049500465393, "learning_rate": 4.9172734408835e-05, "loss": 0.3732, "step": 1222500 }, { "epoch": 8.276039411000433, "grad_norm": 0.36257070302963257, "learning_rate": 4.917239605889996e-05, "loss": 0.3749, "step": 1223000 }, { "epoch": 8.279422910350801, "grad_norm": 0.40908801555633545, "learning_rate": 4.917205770896492e-05, "loss": 0.3744, "step": 1223500 }, { "epoch": 8.28280640970117, "grad_norm": 0.3480644226074219, "learning_rate": 4.9171719359029884e-05, "loss": 0.3753, "step": 1224000 }, { "epoch": 8.286189909051538, "grad_norm": 0.33613091707229614, "learning_rate": 4.9171381009094846e-05, "loss": 0.3765, "step": 1224500 }, { "epoch": 8.289573408401905, "grad_norm": 0.36790555715560913, "learning_rate": 4.917104265915981e-05, "loss": 0.3754, "step": 1225000 }, { "epoch": 8.292956907752274, "grad_norm": 0.37085211277008057, "learning_rate": 4.917070430922478e-05, "loss": 0.3743, "step": 1225500 }, { "epoch": 8.296340407102642, "grad_norm": 0.35964706540107727, "learning_rate": 4.917036595928974e-05, "loss": 0.3725, "step": 1226000 }, { "epoch": 8.29972390645301, "grad_norm": 0.357497900724411, "learning_rate": 4.91700276093547e-05, "loss": 0.3737, "step": 1226500 }, { "epoch": 8.303107405803377, "grad_norm": 0.398779034614563, "learning_rate": 4.9169689259419664e-05, "loss": 0.3747, "step": 1227000 }, { "epoch": 8.306490905153746, "grad_norm": 0.3341631293296814, "learning_rate": 4.916935090948463e-05, "loss": 0.3741, "step": 1227500 }, { "epoch": 8.309874404504114, "grad_norm": 0.3211187720298767, "learning_rate": 4.9169012559549595e-05, "loss": 0.374, "step": 1228000 }, { "epoch": 8.313257903854483, "grad_norm": 0.350558876991272, "learning_rate": 4.916867420961455e-05, "loss": 0.3745, "step": 1228500 }, { "epoch": 8.316641403204851, "grad_norm": 0.35707876086235046, "learning_rate": 4.916833585967951e-05, "loss": 0.375, "step": 1229000 }, { "epoch": 8.320024902555218, "grad_norm": 0.36113375425338745, "learning_rate": 4.916799750974448e-05, "loss": 0.3755, "step": 1229500 }, { "epoch": 8.323408401905587, "grad_norm": 0.3182533383369446, "learning_rate": 4.916765915980944e-05, "loss": 0.3748, "step": 1230000 }, { "epoch": 8.326791901255955, "grad_norm": 0.3398992121219635, "learning_rate": 4.9167320809874405e-05, "loss": 0.373, "step": 1230500 }, { "epoch": 8.330175400606324, "grad_norm": 0.34391719102859497, "learning_rate": 4.916698245993937e-05, "loss": 0.3745, "step": 1231000 }, { "epoch": 8.33355889995669, "grad_norm": 0.3682703375816345, "learning_rate": 4.9166644110004336e-05, "loss": 0.3747, "step": 1231500 }, { "epoch": 8.336942399307059, "grad_norm": 0.3944721817970276, "learning_rate": 4.91663057600693e-05, "loss": 0.374, "step": 1232000 }, { "epoch": 8.340325898657428, "grad_norm": 0.34578758478164673, "learning_rate": 4.916596741013426e-05, "loss": 0.3744, "step": 1232500 }, { "epoch": 8.343709398007796, "grad_norm": 0.3377029597759247, "learning_rate": 4.916562906019922e-05, "loss": 0.3749, "step": 1233000 }, { "epoch": 8.347092897358163, "grad_norm": 0.3452177047729492, "learning_rate": 4.9165290710264185e-05, "loss": 0.3727, "step": 1233500 }, { "epoch": 8.350476396708531, "grad_norm": 0.3850466012954712, "learning_rate": 4.916495236032915e-05, "loss": 0.3751, "step": 1234000 }, { "epoch": 8.3538598960589, "grad_norm": 0.3559221029281616, "learning_rate": 4.916461401039411e-05, "loss": 0.3748, "step": 1234500 }, { "epoch": 8.357243395409268, "grad_norm": 0.34174761176109314, "learning_rate": 4.916427566045908e-05, "loss": 0.374, "step": 1235000 }, { "epoch": 8.360626894759637, "grad_norm": 0.35429567098617554, "learning_rate": 4.916393731052404e-05, "loss": 0.3742, "step": 1235500 }, { "epoch": 8.364010394110004, "grad_norm": 0.3283572196960449, "learning_rate": 4.9163598960589e-05, "loss": 0.3743, "step": 1236000 }, { "epoch": 8.367393893460372, "grad_norm": 0.36495864391326904, "learning_rate": 4.9163260610653964e-05, "loss": 0.3754, "step": 1236500 }, { "epoch": 8.37077739281074, "grad_norm": 0.34595993161201477, "learning_rate": 4.9162922260718927e-05, "loss": 0.3741, "step": 1237000 }, { "epoch": 8.37416089216111, "grad_norm": 0.3537779152393341, "learning_rate": 4.9162583910783895e-05, "loss": 0.3753, "step": 1237500 }, { "epoch": 8.377544391511476, "grad_norm": 0.3216964602470398, "learning_rate": 4.916224556084885e-05, "loss": 0.3762, "step": 1238000 }, { "epoch": 8.380927890861845, "grad_norm": 0.3425578773021698, "learning_rate": 4.916190721091381e-05, "loss": 0.3751, "step": 1238500 }, { "epoch": 8.384311390212213, "grad_norm": 0.36288127303123474, "learning_rate": 4.916156886097878e-05, "loss": 0.3753, "step": 1239000 }, { "epoch": 8.387694889562582, "grad_norm": 0.3431650400161743, "learning_rate": 4.9161230511043744e-05, "loss": 0.376, "step": 1239500 }, { "epoch": 8.39107838891295, "grad_norm": 0.34114548563957214, "learning_rate": 4.9160892161108706e-05, "loss": 0.3733, "step": 1240000 }, { "epoch": 8.394461888263317, "grad_norm": 0.3551206588745117, "learning_rate": 4.916055381117367e-05, "loss": 0.374, "step": 1240500 }, { "epoch": 8.397845387613685, "grad_norm": 0.3942086100578308, "learning_rate": 4.916021546123864e-05, "loss": 0.3734, "step": 1241000 }, { "epoch": 8.401228886964054, "grad_norm": 0.3584712743759155, "learning_rate": 4.91598771113036e-05, "loss": 0.3739, "step": 1241500 }, { "epoch": 8.404612386314422, "grad_norm": 0.3915914297103882, "learning_rate": 4.915953876136856e-05, "loss": 0.3739, "step": 1242000 }, { "epoch": 8.40799588566479, "grad_norm": 0.34406983852386475, "learning_rate": 4.9159200411433523e-05, "loss": 0.3759, "step": 1242500 }, { "epoch": 8.411379385015158, "grad_norm": 0.3534091114997864, "learning_rate": 4.9158862061498486e-05, "loss": 0.3734, "step": 1243000 }, { "epoch": 8.414762884365526, "grad_norm": 0.32999998331069946, "learning_rate": 4.915852371156345e-05, "loss": 0.3736, "step": 1243500 }, { "epoch": 8.418146383715895, "grad_norm": 0.3992055356502533, "learning_rate": 4.915818536162841e-05, "loss": 0.3755, "step": 1244000 }, { "epoch": 8.421529883066263, "grad_norm": 0.3797067403793335, "learning_rate": 4.915784701169338e-05, "loss": 0.374, "step": 1244500 }, { "epoch": 8.42491338241663, "grad_norm": 0.35519033670425415, "learning_rate": 4.915750866175834e-05, "loss": 0.3751, "step": 1245000 }, { "epoch": 8.428296881766999, "grad_norm": 0.37754616141319275, "learning_rate": 4.91571703118233e-05, "loss": 0.3757, "step": 1245500 }, { "epoch": 8.431680381117367, "grad_norm": 0.3400075435638428, "learning_rate": 4.9156831961888265e-05, "loss": 0.3744, "step": 1246000 }, { "epoch": 8.435063880467736, "grad_norm": 0.36160823702812195, "learning_rate": 4.915649361195323e-05, "loss": 0.3737, "step": 1246500 }, { "epoch": 8.438447379818102, "grad_norm": 0.4066164791584015, "learning_rate": 4.9156155262018196e-05, "loss": 0.374, "step": 1247000 }, { "epoch": 8.44183087916847, "grad_norm": 0.3818412125110626, "learning_rate": 4.915581691208316e-05, "loss": 0.373, "step": 1247500 }, { "epoch": 8.44521437851884, "grad_norm": 0.33103445172309875, "learning_rate": 4.9155478562148114e-05, "loss": 0.3739, "step": 1248000 }, { "epoch": 8.448597877869208, "grad_norm": 0.3441535532474518, "learning_rate": 4.915514021221308e-05, "loss": 0.3754, "step": 1248500 }, { "epoch": 8.451981377219576, "grad_norm": 0.33515992760658264, "learning_rate": 4.9154801862278045e-05, "loss": 0.3752, "step": 1249000 }, { "epoch": 8.455364876569943, "grad_norm": 0.3669910728931427, "learning_rate": 4.915446351234301e-05, "loss": 0.3738, "step": 1249500 }, { "epoch": 8.458748375920312, "grad_norm": 0.38384097814559937, "learning_rate": 4.915412516240797e-05, "loss": 0.3766, "step": 1250000 }, { "epoch": 8.46213187527068, "grad_norm": 0.3466211259365082, "learning_rate": 4.915378681247294e-05, "loss": 0.3743, "step": 1250500 }, { "epoch": 8.465515374621049, "grad_norm": 0.3566823899745941, "learning_rate": 4.91534484625379e-05, "loss": 0.374, "step": 1251000 }, { "epoch": 8.468898873971415, "grad_norm": 0.36961984634399414, "learning_rate": 4.915311011260286e-05, "loss": 0.3768, "step": 1251500 }, { "epoch": 8.472282373321784, "grad_norm": 0.37993258237838745, "learning_rate": 4.9152771762667824e-05, "loss": 0.3744, "step": 1252000 }, { "epoch": 8.475665872672153, "grad_norm": 0.3526294231414795, "learning_rate": 4.9152433412732786e-05, "loss": 0.3754, "step": 1252500 }, { "epoch": 8.479049372022521, "grad_norm": 0.37616288661956787, "learning_rate": 4.915209506279775e-05, "loss": 0.3761, "step": 1253000 }, { "epoch": 8.48243287137289, "grad_norm": 0.34510985016822815, "learning_rate": 4.915175671286271e-05, "loss": 0.3757, "step": 1253500 }, { "epoch": 8.485816370723256, "grad_norm": 0.3667396605014801, "learning_rate": 4.915141836292767e-05, "loss": 0.3758, "step": 1254000 }, { "epoch": 8.489199870073625, "grad_norm": 0.33316004276275635, "learning_rate": 4.915108001299264e-05, "loss": 0.3734, "step": 1254500 }, { "epoch": 8.492583369423993, "grad_norm": 0.3826241195201874, "learning_rate": 4.9150741663057604e-05, "loss": 0.3758, "step": 1255000 }, { "epoch": 8.495966868774362, "grad_norm": 0.3249056339263916, "learning_rate": 4.9150403313122566e-05, "loss": 0.3743, "step": 1255500 }, { "epoch": 8.499350368124729, "grad_norm": 0.4200061857700348, "learning_rate": 4.915006496318753e-05, "loss": 0.3757, "step": 1256000 }, { "epoch": 8.502733867475097, "grad_norm": 0.3448467552661896, "learning_rate": 4.91497266132525e-05, "loss": 0.3731, "step": 1256500 }, { "epoch": 8.506117366825466, "grad_norm": 0.3419877588748932, "learning_rate": 4.914938826331746e-05, "loss": 0.3744, "step": 1257000 }, { "epoch": 8.509500866175834, "grad_norm": 0.3729589283466339, "learning_rate": 4.9149049913382414e-05, "loss": 0.3739, "step": 1257500 }, { "epoch": 8.512884365526201, "grad_norm": 0.382418155670166, "learning_rate": 4.914871156344738e-05, "loss": 0.3741, "step": 1258000 }, { "epoch": 8.51626786487657, "grad_norm": 0.33111730217933655, "learning_rate": 4.9148373213512345e-05, "loss": 0.3741, "step": 1258500 }, { "epoch": 8.519651364226938, "grad_norm": 0.33443519473075867, "learning_rate": 4.914803486357731e-05, "loss": 0.374, "step": 1259000 }, { "epoch": 8.523034863577307, "grad_norm": 0.30831006169319153, "learning_rate": 4.914769651364227e-05, "loss": 0.3748, "step": 1259500 }, { "epoch": 8.526418362927675, "grad_norm": 0.3246803879737854, "learning_rate": 4.914735816370724e-05, "loss": 0.3752, "step": 1260000 }, { "epoch": 8.529801862278042, "grad_norm": 0.32946979999542236, "learning_rate": 4.91470198137722e-05, "loss": 0.3741, "step": 1260500 }, { "epoch": 8.53318536162841, "grad_norm": 0.33466988801956177, "learning_rate": 4.914668146383716e-05, "loss": 0.3733, "step": 1261000 }, { "epoch": 8.536568860978779, "grad_norm": 0.3437216281890869, "learning_rate": 4.9146343113902125e-05, "loss": 0.376, "step": 1261500 }, { "epoch": 8.539952360329147, "grad_norm": 0.3692063093185425, "learning_rate": 4.914600476396709e-05, "loss": 0.3747, "step": 1262000 }, { "epoch": 8.543335859679514, "grad_norm": 0.33591610193252563, "learning_rate": 4.914566641403205e-05, "loss": 0.3758, "step": 1262500 }, { "epoch": 8.546719359029883, "grad_norm": 0.3632853627204895, "learning_rate": 4.914532806409701e-05, "loss": 0.3751, "step": 1263000 }, { "epoch": 8.550102858380251, "grad_norm": 0.3478579819202423, "learning_rate": 4.914498971416197e-05, "loss": 0.3725, "step": 1263500 }, { "epoch": 8.55348635773062, "grad_norm": 0.35932010412216187, "learning_rate": 4.914465136422694e-05, "loss": 0.3754, "step": 1264000 }, { "epoch": 8.556869857080988, "grad_norm": 0.32846808433532715, "learning_rate": 4.9144313014291904e-05, "loss": 0.374, "step": 1264500 }, { "epoch": 8.560253356431355, "grad_norm": 0.3679615557193756, "learning_rate": 4.9143974664356866e-05, "loss": 0.3764, "step": 1265000 }, { "epoch": 8.563636855781724, "grad_norm": 0.3527098000049591, "learning_rate": 4.914363631442183e-05, "loss": 0.3757, "step": 1265500 }, { "epoch": 8.567020355132092, "grad_norm": 0.32888293266296387, "learning_rate": 4.91432979644868e-05, "loss": 0.3742, "step": 1266000 }, { "epoch": 8.57040385448246, "grad_norm": 0.38354864716529846, "learning_rate": 4.914295961455176e-05, "loss": 0.3743, "step": 1266500 }, { "epoch": 8.573787353832827, "grad_norm": 0.3360845148563385, "learning_rate": 4.9142621264616715e-05, "loss": 0.3744, "step": 1267000 }, { "epoch": 8.577170853183196, "grad_norm": 0.367890864610672, "learning_rate": 4.9142282914681684e-05, "loss": 0.3751, "step": 1267500 }, { "epoch": 8.580554352533564, "grad_norm": 0.3678390383720398, "learning_rate": 4.9141944564746646e-05, "loss": 0.3746, "step": 1268000 }, { "epoch": 8.583937851883933, "grad_norm": 0.36338627338409424, "learning_rate": 4.914160621481161e-05, "loss": 0.3758, "step": 1268500 }, { "epoch": 8.587321351234301, "grad_norm": 0.3547922670841217, "learning_rate": 4.914126786487657e-05, "loss": 0.3739, "step": 1269000 }, { "epoch": 8.590704850584668, "grad_norm": 0.3636038303375244, "learning_rate": 4.914092951494154e-05, "loss": 0.3751, "step": 1269500 }, { "epoch": 8.594088349935037, "grad_norm": 0.35374486446380615, "learning_rate": 4.91405911650065e-05, "loss": 0.3756, "step": 1270000 }, { "epoch": 8.597471849285405, "grad_norm": 0.373153418302536, "learning_rate": 4.914025281507146e-05, "loss": 0.3741, "step": 1270500 }, { "epoch": 8.600855348635774, "grad_norm": 0.35778605937957764, "learning_rate": 4.9139914465136425e-05, "loss": 0.3754, "step": 1271000 }, { "epoch": 8.60423884798614, "grad_norm": 0.39764803647994995, "learning_rate": 4.913957611520139e-05, "loss": 0.3741, "step": 1271500 }, { "epoch": 8.607622347336509, "grad_norm": 0.32927289605140686, "learning_rate": 4.913923776526635e-05, "loss": 0.3747, "step": 1272000 }, { "epoch": 8.611005846686878, "grad_norm": 0.3471881151199341, "learning_rate": 4.913889941533131e-05, "loss": 0.3766, "step": 1272500 }, { "epoch": 8.614389346037246, "grad_norm": 0.3120143711566925, "learning_rate": 4.9138561065396274e-05, "loss": 0.3751, "step": 1273000 }, { "epoch": 8.617772845387613, "grad_norm": 0.34172213077545166, "learning_rate": 4.913822271546124e-05, "loss": 0.3752, "step": 1273500 }, { "epoch": 8.621156344737981, "grad_norm": 0.37015801668167114, "learning_rate": 4.9137884365526205e-05, "loss": 0.3751, "step": 1274000 }, { "epoch": 8.62453984408835, "grad_norm": 0.34678173065185547, "learning_rate": 4.913754601559117e-05, "loss": 0.3748, "step": 1274500 }, { "epoch": 8.627923343438718, "grad_norm": 0.369243323802948, "learning_rate": 4.913720766565613e-05, "loss": 0.3756, "step": 1275000 }, { "epoch": 8.631306842789087, "grad_norm": 0.3433549106121063, "learning_rate": 4.91368693157211e-05, "loss": 0.376, "step": 1275500 }, { "epoch": 8.634690342139454, "grad_norm": 0.3244531452655792, "learning_rate": 4.913653096578606e-05, "loss": 0.3736, "step": 1276000 }, { "epoch": 8.638073841489822, "grad_norm": 0.3575780689716339, "learning_rate": 4.9136192615851015e-05, "loss": 0.3739, "step": 1276500 }, { "epoch": 8.64145734084019, "grad_norm": 0.3672553598880768, "learning_rate": 4.9135854265915984e-05, "loss": 0.3746, "step": 1277000 }, { "epoch": 8.64484084019056, "grad_norm": 0.34356722235679626, "learning_rate": 4.9135515915980946e-05, "loss": 0.3751, "step": 1277500 }, { "epoch": 8.648224339540928, "grad_norm": 0.3406316041946411, "learning_rate": 4.913517756604591e-05, "loss": 0.3733, "step": 1278000 }, { "epoch": 8.651607838891294, "grad_norm": 0.33928433060646057, "learning_rate": 4.913483921611087e-05, "loss": 0.3746, "step": 1278500 }, { "epoch": 8.654991338241663, "grad_norm": 0.3306182026863098, "learning_rate": 4.913450086617584e-05, "loss": 0.3745, "step": 1279000 }, { "epoch": 8.658374837592032, "grad_norm": 0.37345966696739197, "learning_rate": 4.91341625162408e-05, "loss": 0.3748, "step": 1279500 }, { "epoch": 8.6617583369424, "grad_norm": 0.32681286334991455, "learning_rate": 4.9133824166305764e-05, "loss": 0.3737, "step": 1280000 }, { "epoch": 8.665141836292767, "grad_norm": 0.3620678186416626, "learning_rate": 4.9133485816370726e-05, "loss": 0.3725, "step": 1280500 }, { "epoch": 8.668525335643135, "grad_norm": 0.3080102503299713, "learning_rate": 4.913314746643569e-05, "loss": 0.374, "step": 1281000 }, { "epoch": 8.671908834993504, "grad_norm": 0.3523677885532379, "learning_rate": 4.913280911650065e-05, "loss": 0.3755, "step": 1281500 }, { "epoch": 8.675292334343872, "grad_norm": 0.34980931878089905, "learning_rate": 4.913247076656561e-05, "loss": 0.3745, "step": 1282000 }, { "epoch": 8.67867583369424, "grad_norm": 0.3787962794303894, "learning_rate": 4.9132132416630574e-05, "loss": 0.3745, "step": 1282500 }, { "epoch": 8.682059333044608, "grad_norm": 0.3643645942211151, "learning_rate": 4.913179406669554e-05, "loss": 0.376, "step": 1283000 }, { "epoch": 8.685442832394976, "grad_norm": 0.32300788164138794, "learning_rate": 4.9131455716760505e-05, "loss": 0.3748, "step": 1283500 }, { "epoch": 8.688826331745345, "grad_norm": 0.38785240054130554, "learning_rate": 4.913111736682547e-05, "loss": 0.3758, "step": 1284000 }, { "epoch": 8.692209831095713, "grad_norm": 0.3650936186313629, "learning_rate": 4.913077901689043e-05, "loss": 0.3759, "step": 1284500 }, { "epoch": 8.69559333044608, "grad_norm": 0.3315945863723755, "learning_rate": 4.91304406669554e-05, "loss": 0.3732, "step": 1285000 }, { "epoch": 8.698976829796448, "grad_norm": 0.3755415380001068, "learning_rate": 4.913010231702036e-05, "loss": 0.3751, "step": 1285500 }, { "epoch": 8.702360329146817, "grad_norm": 0.3727501332759857, "learning_rate": 4.9129763967085316e-05, "loss": 0.3759, "step": 1286000 }, { "epoch": 8.705743828497186, "grad_norm": 0.3709351420402527, "learning_rate": 4.9129425617150285e-05, "loss": 0.3747, "step": 1286500 }, { "epoch": 8.709127327847552, "grad_norm": 0.37241730093955994, "learning_rate": 4.912908726721525e-05, "loss": 0.3751, "step": 1287000 }, { "epoch": 8.71251082719792, "grad_norm": 0.36001521348953247, "learning_rate": 4.912874891728021e-05, "loss": 0.3753, "step": 1287500 }, { "epoch": 8.71589432654829, "grad_norm": 0.35647907853126526, "learning_rate": 4.912841056734517e-05, "loss": 0.3756, "step": 1288000 }, { "epoch": 8.719277825898658, "grad_norm": 0.45838338136672974, "learning_rate": 4.912807221741014e-05, "loss": 0.3744, "step": 1288500 }, { "epoch": 8.722661325249026, "grad_norm": 0.33726125955581665, "learning_rate": 4.91277338674751e-05, "loss": 0.3747, "step": 1289000 }, { "epoch": 8.726044824599393, "grad_norm": 0.348104327917099, "learning_rate": 4.9127395517540064e-05, "loss": 0.3741, "step": 1289500 }, { "epoch": 8.729428323949762, "grad_norm": 0.34803929924964905, "learning_rate": 4.9127057167605027e-05, "loss": 0.3734, "step": 1290000 }, { "epoch": 8.73281182330013, "grad_norm": 0.37727972865104675, "learning_rate": 4.912671881766999e-05, "loss": 0.376, "step": 1290500 }, { "epoch": 8.736195322650499, "grad_norm": 0.3559610843658447, "learning_rate": 4.912638046773495e-05, "loss": 0.3742, "step": 1291000 }, { "epoch": 8.739578822000865, "grad_norm": 0.357925683259964, "learning_rate": 4.912604211779991e-05, "loss": 0.3748, "step": 1291500 }, { "epoch": 8.742962321351234, "grad_norm": 0.3771865665912628, "learning_rate": 4.9125703767864875e-05, "loss": 0.3758, "step": 1292000 }, { "epoch": 8.746345820701602, "grad_norm": 0.38133761286735535, "learning_rate": 4.9125365417929844e-05, "loss": 0.3769, "step": 1292500 }, { "epoch": 8.749729320051971, "grad_norm": 0.3215438723564148, "learning_rate": 4.9125027067994806e-05, "loss": 0.3741, "step": 1293000 }, { "epoch": 8.75311281940234, "grad_norm": 0.3579214811325073, "learning_rate": 4.912468871805977e-05, "loss": 0.3754, "step": 1293500 }, { "epoch": 8.756496318752706, "grad_norm": 0.38451337814331055, "learning_rate": 4.912435036812473e-05, "loss": 0.3757, "step": 1294000 }, { "epoch": 8.759879818103075, "grad_norm": 0.32298362255096436, "learning_rate": 4.91240120181897e-05, "loss": 0.3743, "step": 1294500 }, { "epoch": 8.763263317453443, "grad_norm": 0.3099938631057739, "learning_rate": 4.912367366825466e-05, "loss": 0.3731, "step": 1295000 }, { "epoch": 8.766646816803812, "grad_norm": 0.3347644507884979, "learning_rate": 4.912333531831962e-05, "loss": 0.3732, "step": 1295500 }, { "epoch": 8.770030316154179, "grad_norm": 0.32155701518058777, "learning_rate": 4.9122996968384586e-05, "loss": 0.3743, "step": 1296000 }, { "epoch": 8.773413815504547, "grad_norm": 0.3333846926689148, "learning_rate": 4.912265861844955e-05, "loss": 0.3753, "step": 1296500 }, { "epoch": 8.776797314854916, "grad_norm": 0.36494186520576477, "learning_rate": 4.912232026851451e-05, "loss": 0.3733, "step": 1297000 }, { "epoch": 8.780180814205284, "grad_norm": 0.33402982354164124, "learning_rate": 4.912198191857947e-05, "loss": 0.3759, "step": 1297500 }, { "epoch": 8.783564313555651, "grad_norm": 0.34107717871665955, "learning_rate": 4.912164356864444e-05, "loss": 0.3743, "step": 1298000 }, { "epoch": 8.78694781290602, "grad_norm": 0.3402135968208313, "learning_rate": 4.91213052187094e-05, "loss": 0.3771, "step": 1298500 }, { "epoch": 8.790331312256388, "grad_norm": 0.3298054337501526, "learning_rate": 4.9120966868774365e-05, "loss": 0.3748, "step": 1299000 }, { "epoch": 8.793714811606756, "grad_norm": 0.3509495258331299, "learning_rate": 4.912062851883933e-05, "loss": 0.3735, "step": 1299500 }, { "epoch": 8.797098310957125, "grad_norm": 0.3410671055316925, "learning_rate": 4.912029016890429e-05, "loss": 0.3735, "step": 1300000 }, { "epoch": 8.800481810307492, "grad_norm": 0.39786043763160706, "learning_rate": 4.911995181896925e-05, "loss": 0.3741, "step": 1300500 }, { "epoch": 8.80386530965786, "grad_norm": 0.35167184472084045, "learning_rate": 4.9119613469034214e-05, "loss": 0.3747, "step": 1301000 }, { "epoch": 8.807248809008229, "grad_norm": 0.30722448229789734, "learning_rate": 4.9119275119099176e-05, "loss": 0.3738, "step": 1301500 }, { "epoch": 8.810632308358597, "grad_norm": 0.3152434825897217, "learning_rate": 4.9118936769164145e-05, "loss": 0.3757, "step": 1302000 }, { "epoch": 8.814015807708964, "grad_norm": 0.3755607306957245, "learning_rate": 4.911859841922911e-05, "loss": 0.3734, "step": 1302500 }, { "epoch": 8.817399307059333, "grad_norm": 0.34447774291038513, "learning_rate": 4.911826006929407e-05, "loss": 0.3742, "step": 1303000 }, { "epoch": 8.820782806409701, "grad_norm": 0.3727530837059021, "learning_rate": 4.911792171935903e-05, "loss": 0.3749, "step": 1303500 }, { "epoch": 8.82416630576007, "grad_norm": 0.37038761377334595, "learning_rate": 4.9117583369424e-05, "loss": 0.3745, "step": 1304000 }, { "epoch": 8.827549805110438, "grad_norm": 0.3163911998271942, "learning_rate": 4.911724501948896e-05, "loss": 0.3756, "step": 1304500 }, { "epoch": 8.830933304460805, "grad_norm": 0.3718319237232208, "learning_rate": 4.911690666955392e-05, "loss": 0.3749, "step": 1305000 }, { "epoch": 8.834316803811173, "grad_norm": 0.36646541953086853, "learning_rate": 4.9116568319618886e-05, "loss": 0.3754, "step": 1305500 }, { "epoch": 8.837700303161542, "grad_norm": 0.33847516775131226, "learning_rate": 4.911622996968385e-05, "loss": 0.3748, "step": 1306000 }, { "epoch": 8.84108380251191, "grad_norm": 0.35955485701560974, "learning_rate": 4.911589161974881e-05, "loss": 0.3728, "step": 1306500 }, { "epoch": 8.844467301862277, "grad_norm": 0.33273687958717346, "learning_rate": 4.911555326981377e-05, "loss": 0.3749, "step": 1307000 }, { "epoch": 8.847850801212646, "grad_norm": 0.35360097885131836, "learning_rate": 4.911521491987874e-05, "loss": 0.3745, "step": 1307500 }, { "epoch": 8.851234300563014, "grad_norm": 0.3549317419528961, "learning_rate": 4.9114876569943704e-05, "loss": 0.3745, "step": 1308000 }, { "epoch": 8.854617799913383, "grad_norm": 0.36261728405952454, "learning_rate": 4.9114538220008666e-05, "loss": 0.3739, "step": 1308500 }, { "epoch": 8.858001299263751, "grad_norm": 0.36524686217308044, "learning_rate": 4.911419987007363e-05, "loss": 0.3738, "step": 1309000 }, { "epoch": 8.861384798614118, "grad_norm": 0.3448386788368225, "learning_rate": 4.911386152013859e-05, "loss": 0.3761, "step": 1309500 }, { "epoch": 8.864768297964487, "grad_norm": 0.3430918753147125, "learning_rate": 4.911352317020355e-05, "loss": 0.3738, "step": 1310000 }, { "epoch": 8.868151797314855, "grad_norm": 0.3501821756362915, "learning_rate": 4.9113184820268514e-05, "loss": 0.3745, "step": 1310500 }, { "epoch": 8.871535296665224, "grad_norm": 0.3411780893802643, "learning_rate": 4.9112846470333476e-05, "loss": 0.375, "step": 1311000 }, { "epoch": 8.87491879601559, "grad_norm": 0.3733043968677521, "learning_rate": 4.9112508120398445e-05, "loss": 0.375, "step": 1311500 }, { "epoch": 8.878302295365959, "grad_norm": 0.3024614751338959, "learning_rate": 4.911216977046341e-05, "loss": 0.3739, "step": 1312000 }, { "epoch": 8.881685794716327, "grad_norm": 0.32841992378234863, "learning_rate": 4.911183142052837e-05, "loss": 0.3744, "step": 1312500 }, { "epoch": 8.885069294066696, "grad_norm": 0.35591748356819153, "learning_rate": 4.911149307059333e-05, "loss": 0.3743, "step": 1313000 }, { "epoch": 8.888452793417063, "grad_norm": 0.31989797949790955, "learning_rate": 4.91111547206583e-05, "loss": 0.3748, "step": 1313500 }, { "epoch": 8.891836292767431, "grad_norm": 0.31331291794776917, "learning_rate": 4.911081637072326e-05, "loss": 0.3735, "step": 1314000 }, { "epoch": 8.8952197921178, "grad_norm": 0.36485719680786133, "learning_rate": 4.911047802078822e-05, "loss": 0.3738, "step": 1314500 }, { "epoch": 8.898603291468168, "grad_norm": 0.3131905496120453, "learning_rate": 4.911013967085319e-05, "loss": 0.373, "step": 1315000 }, { "epoch": 8.901986790818537, "grad_norm": 0.35740169882774353, "learning_rate": 4.910980132091815e-05, "loss": 0.3744, "step": 1315500 }, { "epoch": 8.905370290168904, "grad_norm": 0.37337788939476013, "learning_rate": 4.910946297098311e-05, "loss": 0.3754, "step": 1316000 }, { "epoch": 8.908753789519272, "grad_norm": 0.3863716721534729, "learning_rate": 4.910912462104807e-05, "loss": 0.3742, "step": 1316500 }, { "epoch": 8.91213728886964, "grad_norm": 0.35913145542144775, "learning_rate": 4.9108786271113035e-05, "loss": 0.3746, "step": 1317000 }, { "epoch": 8.91552078822001, "grad_norm": 0.3646251857280731, "learning_rate": 4.9108447921178004e-05, "loss": 0.3732, "step": 1317500 }, { "epoch": 8.918904287570378, "grad_norm": 0.3502126634120941, "learning_rate": 4.9108109571242966e-05, "loss": 0.3738, "step": 1318000 }, { "epoch": 8.922287786920744, "grad_norm": 0.3665471076965332, "learning_rate": 4.910777122130793e-05, "loss": 0.3743, "step": 1318500 }, { "epoch": 8.925671286271113, "grad_norm": 0.3789934813976288, "learning_rate": 4.910743287137289e-05, "loss": 0.3757, "step": 1319000 }, { "epoch": 8.929054785621481, "grad_norm": 0.34552836418151855, "learning_rate": 4.910709452143785e-05, "loss": 0.3752, "step": 1319500 }, { "epoch": 8.93243828497185, "grad_norm": 0.3529386818408966, "learning_rate": 4.9106756171502815e-05, "loss": 0.3752, "step": 1320000 }, { "epoch": 8.935821784322217, "grad_norm": 0.32900169491767883, "learning_rate": 4.910641782156778e-05, "loss": 0.3746, "step": 1320500 }, { "epoch": 8.939205283672585, "grad_norm": 0.3948231041431427, "learning_rate": 4.9106079471632746e-05, "loss": 0.3748, "step": 1321000 }, { "epoch": 8.942588783022954, "grad_norm": 0.3746308386325836, "learning_rate": 4.910574112169771e-05, "loss": 0.3732, "step": 1321500 }, { "epoch": 8.945972282373322, "grad_norm": 0.3531734347343445, "learning_rate": 4.910540277176267e-05, "loss": 0.3742, "step": 1322000 }, { "epoch": 8.949355781723689, "grad_norm": 0.3803749084472656, "learning_rate": 4.910506442182763e-05, "loss": 0.3735, "step": 1322500 }, { "epoch": 8.952739281074058, "grad_norm": 0.3477669358253479, "learning_rate": 4.91047260718926e-05, "loss": 0.375, "step": 1323000 }, { "epoch": 8.956122780424426, "grad_norm": 0.3473570942878723, "learning_rate": 4.910438772195756e-05, "loss": 0.3755, "step": 1323500 }, { "epoch": 8.959506279774795, "grad_norm": 0.3485695719718933, "learning_rate": 4.910404937202252e-05, "loss": 0.3769, "step": 1324000 }, { "epoch": 8.962889779125163, "grad_norm": 0.2922978401184082, "learning_rate": 4.910371102208748e-05, "loss": 0.3735, "step": 1324500 }, { "epoch": 8.96627327847553, "grad_norm": 0.35643815994262695, "learning_rate": 4.910337267215245e-05, "loss": 0.3742, "step": 1325000 }, { "epoch": 8.969656777825898, "grad_norm": 0.340582013130188, "learning_rate": 4.910303432221741e-05, "loss": 0.3754, "step": 1325500 }, { "epoch": 8.973040277176267, "grad_norm": 0.32579490542411804, "learning_rate": 4.9102695972282374e-05, "loss": 0.3752, "step": 1326000 }, { "epoch": 8.976423776526635, "grad_norm": 0.36068543791770935, "learning_rate": 4.9102357622347336e-05, "loss": 0.3737, "step": 1326500 }, { "epoch": 8.979807275877002, "grad_norm": 0.3534857928752899, "learning_rate": 4.9102019272412305e-05, "loss": 0.373, "step": 1327000 }, { "epoch": 8.98319077522737, "grad_norm": 0.34306958317756653, "learning_rate": 4.910168092247727e-05, "loss": 0.3747, "step": 1327500 }, { "epoch": 8.98657427457774, "grad_norm": 0.33797481656074524, "learning_rate": 4.910134257254223e-05, "loss": 0.3735, "step": 1328000 }, { "epoch": 8.989957773928108, "grad_norm": 0.4253041446208954, "learning_rate": 4.910100422260719e-05, "loss": 0.3741, "step": 1328500 }, { "epoch": 8.993341273278476, "grad_norm": 0.36487436294555664, "learning_rate": 4.910066587267215e-05, "loss": 0.3747, "step": 1329000 }, { "epoch": 8.996724772628843, "grad_norm": 0.35783451795578003, "learning_rate": 4.9100327522737116e-05, "loss": 0.3745, "step": 1329500 }, { "epoch": 9.0, "eval_accuracy": 0.8575272885373202, "eval_loss": 0.5787888169288635, "eval_runtime": 3378.0515, "eval_samples_per_second": 86.069, "eval_steps_per_second": 5.379, "step": 1329984 }, { "epoch": 9.000108271979212, "grad_norm": 0.3485052287578583, "learning_rate": 4.909998917280208e-05, "loss": 0.3746, "step": 1330000 }, { "epoch": 9.00349177132958, "grad_norm": 0.338402658700943, "learning_rate": 4.9099650822867047e-05, "loss": 0.3709, "step": 1330500 }, { "epoch": 9.006875270679949, "grad_norm": 0.3309011161327362, "learning_rate": 4.909931247293201e-05, "loss": 0.3711, "step": 1331000 }, { "epoch": 9.010258770030315, "grad_norm": 0.3409023880958557, "learning_rate": 4.909897412299697e-05, "loss": 0.3724, "step": 1331500 }, { "epoch": 9.013642269380684, "grad_norm": 0.354640930891037, "learning_rate": 4.909863577306193e-05, "loss": 0.3721, "step": 1332000 }, { "epoch": 9.017025768731052, "grad_norm": 0.3582461178302765, "learning_rate": 4.90982974231269e-05, "loss": 0.3718, "step": 1332500 }, { "epoch": 9.020409268081421, "grad_norm": 0.3886023163795471, "learning_rate": 4.9097959073191864e-05, "loss": 0.372, "step": 1333000 }, { "epoch": 9.02379276743179, "grad_norm": 0.344675213098526, "learning_rate": 4.909762072325682e-05, "loss": 0.371, "step": 1333500 }, { "epoch": 9.027176266782156, "grad_norm": 0.34601104259490967, "learning_rate": 4.909728237332178e-05, "loss": 0.3737, "step": 1334000 }, { "epoch": 9.030559766132525, "grad_norm": 0.3386719226837158, "learning_rate": 4.909694402338675e-05, "loss": 0.3728, "step": 1334500 }, { "epoch": 9.033943265482893, "grad_norm": 0.33993321657180786, "learning_rate": 4.909660567345171e-05, "loss": 0.3732, "step": 1335000 }, { "epoch": 9.037326764833262, "grad_norm": 0.3747544586658478, "learning_rate": 4.9096267323516675e-05, "loss": 0.3732, "step": 1335500 }, { "epoch": 9.040710264183629, "grad_norm": 0.317266583442688, "learning_rate": 4.909592897358164e-05, "loss": 0.3726, "step": 1336000 }, { "epoch": 9.044093763533997, "grad_norm": 0.44216030836105347, "learning_rate": 4.9095590623646606e-05, "loss": 0.3721, "step": 1336500 }, { "epoch": 9.047477262884366, "grad_norm": 0.3385017216205597, "learning_rate": 4.909525227371157e-05, "loss": 0.3738, "step": 1337000 }, { "epoch": 9.050860762234734, "grad_norm": 0.3921028673648834, "learning_rate": 4.909491392377653e-05, "loss": 0.3716, "step": 1337500 }, { "epoch": 9.054244261585103, "grad_norm": 0.3511488139629364, "learning_rate": 4.909457557384149e-05, "loss": 0.3724, "step": 1338000 }, { "epoch": 9.05762776093547, "grad_norm": 0.3934440612792969, "learning_rate": 4.9094237223906454e-05, "loss": 0.3741, "step": 1338500 }, { "epoch": 9.061011260285838, "grad_norm": 0.32151028513908386, "learning_rate": 4.9093898873971416e-05, "loss": 0.3731, "step": 1339000 }, { "epoch": 9.064394759636206, "grad_norm": 0.33140772581100464, "learning_rate": 4.909356052403638e-05, "loss": 0.3739, "step": 1339500 }, { "epoch": 9.067778258986575, "grad_norm": 0.3499145805835724, "learning_rate": 4.909322217410135e-05, "loss": 0.3738, "step": 1340000 }, { "epoch": 9.071161758336942, "grad_norm": 0.33269986510276794, "learning_rate": 4.909288382416631e-05, "loss": 0.3713, "step": 1340500 }, { "epoch": 9.07454525768731, "grad_norm": 0.3687961995601654, "learning_rate": 4.909254547423127e-05, "loss": 0.373, "step": 1341000 }, { "epoch": 9.077928757037679, "grad_norm": 0.3527079224586487, "learning_rate": 4.9092207124296234e-05, "loss": 0.3733, "step": 1341500 }, { "epoch": 9.081312256388047, "grad_norm": 0.3850629925727844, "learning_rate": 4.90918687743612e-05, "loss": 0.3729, "step": 1342000 }, { "epoch": 9.084695755738414, "grad_norm": 0.3427541255950928, "learning_rate": 4.9091530424426165e-05, "loss": 0.3722, "step": 1342500 }, { "epoch": 9.088079255088783, "grad_norm": 0.3640572428703308, "learning_rate": 4.909119207449112e-05, "loss": 0.3725, "step": 1343000 }, { "epoch": 9.091462754439151, "grad_norm": 0.3413487672805786, "learning_rate": 4.909085372455608e-05, "loss": 0.3728, "step": 1343500 }, { "epoch": 9.09484625378952, "grad_norm": 0.36850154399871826, "learning_rate": 4.909051537462105e-05, "loss": 0.373, "step": 1344000 }, { "epoch": 9.098229753139888, "grad_norm": 0.3488371968269348, "learning_rate": 4.909017702468601e-05, "loss": 0.3747, "step": 1344500 }, { "epoch": 9.101613252490255, "grad_norm": 0.32600492238998413, "learning_rate": 4.9089838674750975e-05, "loss": 0.3725, "step": 1345000 }, { "epoch": 9.104996751840623, "grad_norm": 0.3362772762775421, "learning_rate": 4.908950032481594e-05, "loss": 0.3725, "step": 1345500 }, { "epoch": 9.108380251190992, "grad_norm": 0.3490723967552185, "learning_rate": 4.9089161974880906e-05, "loss": 0.3732, "step": 1346000 }, { "epoch": 9.11176375054136, "grad_norm": 0.39872556924819946, "learning_rate": 4.908882362494587e-05, "loss": 0.3731, "step": 1346500 }, { "epoch": 9.115147249891727, "grad_norm": 0.3392133414745331, "learning_rate": 4.908848527501083e-05, "loss": 0.3722, "step": 1347000 }, { "epoch": 9.118530749242096, "grad_norm": 0.34616315364837646, "learning_rate": 4.908814692507579e-05, "loss": 0.3732, "step": 1347500 }, { "epoch": 9.121914248592464, "grad_norm": 0.34856176376342773, "learning_rate": 4.9087808575140755e-05, "loss": 0.3736, "step": 1348000 }, { "epoch": 9.125297747942833, "grad_norm": 0.3597109317779541, "learning_rate": 4.908747022520572e-05, "loss": 0.3736, "step": 1348500 }, { "epoch": 9.128681247293201, "grad_norm": 0.3455621898174286, "learning_rate": 4.908713187527068e-05, "loss": 0.3724, "step": 1349000 }, { "epoch": 9.132064746643568, "grad_norm": 0.3394932448863983, "learning_rate": 4.908679352533565e-05, "loss": 0.3737, "step": 1349500 }, { "epoch": 9.135448245993937, "grad_norm": 0.3652005195617676, "learning_rate": 4.908645517540061e-05, "loss": 0.3735, "step": 1350000 }, { "epoch": 9.138831745344305, "grad_norm": 0.3541617691516876, "learning_rate": 4.908611682546557e-05, "loss": 0.3733, "step": 1350500 }, { "epoch": 9.142215244694674, "grad_norm": 0.32934844493865967, "learning_rate": 4.9085778475530534e-05, "loss": 0.3722, "step": 1351000 }, { "epoch": 9.14559874404504, "grad_norm": 0.3870023488998413, "learning_rate": 4.90854401255955e-05, "loss": 0.3718, "step": 1351500 }, { "epoch": 9.148982243395409, "grad_norm": 0.36997953057289124, "learning_rate": 4.9085101775660465e-05, "loss": 0.3723, "step": 1352000 }, { "epoch": 9.152365742745777, "grad_norm": 0.34613293409347534, "learning_rate": 4.908476342572542e-05, "loss": 0.3735, "step": 1352500 }, { "epoch": 9.155749242096146, "grad_norm": 0.3669275939464569, "learning_rate": 4.908442507579038e-05, "loss": 0.3735, "step": 1353000 }, { "epoch": 9.159132741446514, "grad_norm": 0.333718478679657, "learning_rate": 4.908408672585535e-05, "loss": 0.3731, "step": 1353500 }, { "epoch": 9.162516240796881, "grad_norm": 0.3390810191631317, "learning_rate": 4.9083748375920314e-05, "loss": 0.3736, "step": 1354000 }, { "epoch": 9.16589974014725, "grad_norm": 0.3680596351623535, "learning_rate": 4.9083410025985276e-05, "loss": 0.3739, "step": 1354500 }, { "epoch": 9.169283239497618, "grad_norm": 0.3484257459640503, "learning_rate": 4.908307167605024e-05, "loss": 0.3715, "step": 1355000 }, { "epoch": 9.172666738847987, "grad_norm": 0.3617357313632965, "learning_rate": 4.908273332611521e-05, "loss": 0.3721, "step": 1355500 }, { "epoch": 9.176050238198354, "grad_norm": 0.3549576997756958, "learning_rate": 4.908239497618017e-05, "loss": 0.3746, "step": 1356000 }, { "epoch": 9.179433737548722, "grad_norm": 0.35443395376205444, "learning_rate": 4.908205662624513e-05, "loss": 0.3726, "step": 1356500 }, { "epoch": 9.18281723689909, "grad_norm": 0.34137991070747375, "learning_rate": 4.908171827631009e-05, "loss": 0.3728, "step": 1357000 }, { "epoch": 9.186200736249459, "grad_norm": 0.40165209770202637, "learning_rate": 4.9081379926375055e-05, "loss": 0.3734, "step": 1357500 }, { "epoch": 9.189584235599828, "grad_norm": 0.35499700903892517, "learning_rate": 4.908104157644002e-05, "loss": 0.372, "step": 1358000 }, { "epoch": 9.192967734950194, "grad_norm": 0.37622034549713135, "learning_rate": 4.908070322650498e-05, "loss": 0.3731, "step": 1358500 }, { "epoch": 9.196351234300563, "grad_norm": 0.37281620502471924, "learning_rate": 4.908036487656995e-05, "loss": 0.3732, "step": 1359000 }, { "epoch": 9.199734733650931, "grad_norm": 0.3736304044723511, "learning_rate": 4.908002652663491e-05, "loss": 0.3724, "step": 1359500 }, { "epoch": 9.2031182330013, "grad_norm": 0.3365495204925537, "learning_rate": 4.907968817669987e-05, "loss": 0.3742, "step": 1360000 }, { "epoch": 9.206501732351667, "grad_norm": 0.3519720435142517, "learning_rate": 4.9079349826764835e-05, "loss": 0.3725, "step": 1360500 }, { "epoch": 9.209885231702035, "grad_norm": 0.37363943457603455, "learning_rate": 4.9079011476829804e-05, "loss": 0.3749, "step": 1361000 }, { "epoch": 9.213268731052404, "grad_norm": 0.37272366881370544, "learning_rate": 4.9078673126894766e-05, "loss": 0.3743, "step": 1361500 }, { "epoch": 9.216652230402772, "grad_norm": 0.39422059059143066, "learning_rate": 4.907833477695973e-05, "loss": 0.3725, "step": 1362000 }, { "epoch": 9.220035729753139, "grad_norm": 0.3420394957065582, "learning_rate": 4.907799642702468e-05, "loss": 0.375, "step": 1362500 }, { "epoch": 9.223419229103508, "grad_norm": 0.34846001863479614, "learning_rate": 4.907765807708965e-05, "loss": 0.3731, "step": 1363000 }, { "epoch": 9.226802728453876, "grad_norm": 0.3821485638618469, "learning_rate": 4.9077319727154614e-05, "loss": 0.3735, "step": 1363500 }, { "epoch": 9.230186227804245, "grad_norm": 0.3564333915710449, "learning_rate": 4.9076981377219576e-05, "loss": 0.3742, "step": 1364000 }, { "epoch": 9.233569727154613, "grad_norm": 0.35378578305244446, "learning_rate": 4.907664302728454e-05, "loss": 0.3734, "step": 1364500 }, { "epoch": 9.23695322650498, "grad_norm": 0.34390419721603394, "learning_rate": 4.907630467734951e-05, "loss": 0.3731, "step": 1365000 }, { "epoch": 9.240336725855348, "grad_norm": 0.35762277245521545, "learning_rate": 4.907596632741447e-05, "loss": 0.3747, "step": 1365500 }, { "epoch": 9.243720225205717, "grad_norm": 0.3782312273979187, "learning_rate": 4.907562797747943e-05, "loss": 0.3738, "step": 1366000 }, { "epoch": 9.247103724556085, "grad_norm": 0.36128950119018555, "learning_rate": 4.9075289627544394e-05, "loss": 0.3739, "step": 1366500 }, { "epoch": 9.250487223906452, "grad_norm": 0.3614073693752289, "learning_rate": 4.9074951277609356e-05, "loss": 0.3733, "step": 1367000 }, { "epoch": 9.25387072325682, "grad_norm": 0.39572808146476746, "learning_rate": 4.907461292767432e-05, "loss": 0.3753, "step": 1367500 }, { "epoch": 9.25725422260719, "grad_norm": 0.3680035173892975, "learning_rate": 4.907427457773928e-05, "loss": 0.3724, "step": 1368000 }, { "epoch": 9.260637721957558, "grad_norm": 0.37925729155540466, "learning_rate": 4.907393622780425e-05, "loss": 0.3724, "step": 1368500 }, { "epoch": 9.264021221307926, "grad_norm": 0.34867802262306213, "learning_rate": 4.907359787786921e-05, "loss": 0.3746, "step": 1369000 }, { "epoch": 9.267404720658293, "grad_norm": 0.36166927218437195, "learning_rate": 4.907325952793417e-05, "loss": 0.3742, "step": 1369500 }, { "epoch": 9.270788220008662, "grad_norm": 0.3401140868663788, "learning_rate": 4.9072921177999135e-05, "loss": 0.3724, "step": 1370000 }, { "epoch": 9.27417171935903, "grad_norm": 0.34102433919906616, "learning_rate": 4.90725828280641e-05, "loss": 0.374, "step": 1370500 }, { "epoch": 9.277555218709399, "grad_norm": 0.3460339605808258, "learning_rate": 4.9072244478129066e-05, "loss": 0.3737, "step": 1371000 }, { "epoch": 9.280938718059765, "grad_norm": 0.40066081285476685, "learning_rate": 4.907190612819403e-05, "loss": 0.373, "step": 1371500 }, { "epoch": 9.284322217410134, "grad_norm": 0.3612765967845917, "learning_rate": 4.9071567778258984e-05, "loss": 0.3735, "step": 1372000 }, { "epoch": 9.287705716760502, "grad_norm": 0.35781562328338623, "learning_rate": 4.907122942832395e-05, "loss": 0.3731, "step": 1372500 }, { "epoch": 9.291089216110871, "grad_norm": 0.3313261568546295, "learning_rate": 4.9070891078388915e-05, "loss": 0.3751, "step": 1373000 }, { "epoch": 9.29447271546124, "grad_norm": 0.39906901121139526, "learning_rate": 4.907055272845388e-05, "loss": 0.3729, "step": 1373500 }, { "epoch": 9.297856214811606, "grad_norm": 0.3768948018550873, "learning_rate": 4.907021437851884e-05, "loss": 0.3721, "step": 1374000 }, { "epoch": 9.301239714161975, "grad_norm": 0.3924950659275055, "learning_rate": 4.906987602858381e-05, "loss": 0.3723, "step": 1374500 }, { "epoch": 9.304623213512343, "grad_norm": 0.3677727282047272, "learning_rate": 4.906953767864877e-05, "loss": 0.3718, "step": 1375000 }, { "epoch": 9.308006712862712, "grad_norm": 0.36460989713668823, "learning_rate": 4.906919932871373e-05, "loss": 0.373, "step": 1375500 }, { "epoch": 9.311390212213078, "grad_norm": 0.31929346919059753, "learning_rate": 4.9068860978778694e-05, "loss": 0.3734, "step": 1376000 }, { "epoch": 9.314773711563447, "grad_norm": 0.3578610122203827, "learning_rate": 4.9068522628843657e-05, "loss": 0.3717, "step": 1376500 }, { "epoch": 9.318157210913816, "grad_norm": 0.32925495505332947, "learning_rate": 4.906818427890862e-05, "loss": 0.374, "step": 1377000 }, { "epoch": 9.321540710264184, "grad_norm": 0.36400106549263, "learning_rate": 4.906784592897358e-05, "loss": 0.3747, "step": 1377500 }, { "epoch": 9.324924209614553, "grad_norm": 0.34628424048423767, "learning_rate": 4.906750757903855e-05, "loss": 0.3734, "step": 1378000 }, { "epoch": 9.32830770896492, "grad_norm": 0.34225183725357056, "learning_rate": 4.906716922910351e-05, "loss": 0.3731, "step": 1378500 }, { "epoch": 9.331691208315288, "grad_norm": 0.3421752452850342, "learning_rate": 4.9066830879168474e-05, "loss": 0.3725, "step": 1379000 }, { "epoch": 9.335074707665656, "grad_norm": 0.37214502692222595, "learning_rate": 4.9066492529233436e-05, "loss": 0.3738, "step": 1379500 }, { "epoch": 9.338458207016025, "grad_norm": 0.3321809470653534, "learning_rate": 4.90661541792984e-05, "loss": 0.3737, "step": 1380000 }, { "epoch": 9.341841706366392, "grad_norm": 0.3678796887397766, "learning_rate": 4.906581582936337e-05, "loss": 0.3724, "step": 1380500 }, { "epoch": 9.34522520571676, "grad_norm": 0.31558719277381897, "learning_rate": 4.906547747942833e-05, "loss": 0.3759, "step": 1381000 }, { "epoch": 9.348608705067129, "grad_norm": 0.3410813510417938, "learning_rate": 4.9065139129493285e-05, "loss": 0.3719, "step": 1381500 }, { "epoch": 9.351992204417497, "grad_norm": 0.3446415662765503, "learning_rate": 4.9064800779558253e-05, "loss": 0.3738, "step": 1382000 }, { "epoch": 9.355375703767866, "grad_norm": 0.37881118059158325, "learning_rate": 4.9064462429623216e-05, "loss": 0.372, "step": 1382500 }, { "epoch": 9.358759203118233, "grad_norm": 0.378154993057251, "learning_rate": 4.906412407968818e-05, "loss": 0.375, "step": 1383000 }, { "epoch": 9.362142702468601, "grad_norm": 0.3477650284767151, "learning_rate": 4.906378572975314e-05, "loss": 0.3746, "step": 1383500 }, { "epoch": 9.36552620181897, "grad_norm": 0.35453009605407715, "learning_rate": 4.906344737981811e-05, "loss": 0.373, "step": 1384000 }, { "epoch": 9.368909701169338, "grad_norm": 0.33059656620025635, "learning_rate": 4.906310902988307e-05, "loss": 0.3755, "step": 1384500 }, { "epoch": 9.372293200519705, "grad_norm": 0.3068912625312805, "learning_rate": 4.906277067994803e-05, "loss": 0.3726, "step": 1385000 }, { "epoch": 9.375676699870073, "grad_norm": 0.3638466000556946, "learning_rate": 4.9062432330012995e-05, "loss": 0.3746, "step": 1385500 }, { "epoch": 9.379060199220442, "grad_norm": 0.35415658354759216, "learning_rate": 4.906209398007796e-05, "loss": 0.3719, "step": 1386000 }, { "epoch": 9.38244369857081, "grad_norm": 0.3766058385372162, "learning_rate": 4.906175563014292e-05, "loss": 0.374, "step": 1386500 }, { "epoch": 9.385827197921177, "grad_norm": 0.3652975857257843, "learning_rate": 4.906141728020788e-05, "loss": 0.3736, "step": 1387000 }, { "epoch": 9.389210697271546, "grad_norm": 0.342379629611969, "learning_rate": 4.9061078930272844e-05, "loss": 0.374, "step": 1387500 }, { "epoch": 9.392594196621914, "grad_norm": 0.3303496241569519, "learning_rate": 4.906074058033781e-05, "loss": 0.3736, "step": 1388000 }, { "epoch": 9.395977695972283, "grad_norm": 0.3351159989833832, "learning_rate": 4.9060402230402775e-05, "loss": 0.3732, "step": 1388500 }, { "epoch": 9.399361195322651, "grad_norm": 0.362427681684494, "learning_rate": 4.906006388046774e-05, "loss": 0.373, "step": 1389000 }, { "epoch": 9.402744694673018, "grad_norm": 0.33641988039016724, "learning_rate": 4.90597255305327e-05, "loss": 0.373, "step": 1389500 }, { "epoch": 9.406128194023387, "grad_norm": 0.3344801068305969, "learning_rate": 4.905938718059767e-05, "loss": 0.3726, "step": 1390000 }, { "epoch": 9.409511693373755, "grad_norm": 0.4045099914073944, "learning_rate": 4.905904883066263e-05, "loss": 0.373, "step": 1390500 }, { "epoch": 9.412895192724124, "grad_norm": 0.3387194573879242, "learning_rate": 4.9058710480727585e-05, "loss": 0.3724, "step": 1391000 }, { "epoch": 9.41627869207449, "grad_norm": 0.33229130506515503, "learning_rate": 4.9058372130792554e-05, "loss": 0.3746, "step": 1391500 }, { "epoch": 9.419662191424859, "grad_norm": 0.4265800714492798, "learning_rate": 4.9058033780857516e-05, "loss": 0.3731, "step": 1392000 }, { "epoch": 9.423045690775227, "grad_norm": 0.38675758242607117, "learning_rate": 4.905769543092248e-05, "loss": 0.3743, "step": 1392500 }, { "epoch": 9.426429190125596, "grad_norm": 0.34786656498908997, "learning_rate": 4.905735708098744e-05, "loss": 0.3748, "step": 1393000 }, { "epoch": 9.429812689475964, "grad_norm": 0.32004114985466003, "learning_rate": 4.905701873105241e-05, "loss": 0.3746, "step": 1393500 }, { "epoch": 9.433196188826331, "grad_norm": 0.35318368673324585, "learning_rate": 4.905668038111737e-05, "loss": 0.3736, "step": 1394000 }, { "epoch": 9.4365796881767, "grad_norm": 0.38677674531936646, "learning_rate": 4.9056342031182334e-05, "loss": 0.3743, "step": 1394500 }, { "epoch": 9.439963187527068, "grad_norm": 0.35570451617240906, "learning_rate": 4.9056003681247296e-05, "loss": 0.3741, "step": 1395000 }, { "epoch": 9.443346686877437, "grad_norm": 0.3725792467594147, "learning_rate": 4.905566533131226e-05, "loss": 0.3741, "step": 1395500 }, { "epoch": 9.446730186227803, "grad_norm": 0.3793668746948242, "learning_rate": 4.905532698137722e-05, "loss": 0.3728, "step": 1396000 }, { "epoch": 9.450113685578172, "grad_norm": 0.3633134961128235, "learning_rate": 4.905498863144218e-05, "loss": 0.3733, "step": 1396500 }, { "epoch": 9.45349718492854, "grad_norm": 0.3719507157802582, "learning_rate": 4.9054650281507144e-05, "loss": 0.3733, "step": 1397000 }, { "epoch": 9.456880684278909, "grad_norm": 0.3579012453556061, "learning_rate": 4.905431193157211e-05, "loss": 0.375, "step": 1397500 }, { "epoch": 9.460264183629278, "grad_norm": 0.40617984533309937, "learning_rate": 4.9053973581637075e-05, "loss": 0.3745, "step": 1398000 }, { "epoch": 9.463647682979644, "grad_norm": 0.33194923400878906, "learning_rate": 4.905363523170204e-05, "loss": 0.3736, "step": 1398500 }, { "epoch": 9.467031182330013, "grad_norm": 0.35460004210472107, "learning_rate": 4.9053296881767e-05, "loss": 0.3746, "step": 1399000 }, { "epoch": 9.470414681680381, "grad_norm": 0.35819530487060547, "learning_rate": 4.905295853183197e-05, "loss": 0.3744, "step": 1399500 }, { "epoch": 9.47379818103075, "grad_norm": 0.35111624002456665, "learning_rate": 4.905262018189693e-05, "loss": 0.374, "step": 1400000 }, { "epoch": 9.477181680381117, "grad_norm": 0.3364504873752594, "learning_rate": 4.9052281831961886e-05, "loss": 0.3748, "step": 1400500 }, { "epoch": 9.480565179731485, "grad_norm": 0.3461628258228302, "learning_rate": 4.9051943482026855e-05, "loss": 0.3731, "step": 1401000 }, { "epoch": 9.483948679081854, "grad_norm": 0.35349273681640625, "learning_rate": 4.905160513209182e-05, "loss": 0.3742, "step": 1401500 }, { "epoch": 9.487332178432222, "grad_norm": 0.35184773802757263, "learning_rate": 4.905126678215678e-05, "loss": 0.3726, "step": 1402000 }, { "epoch": 9.490715677782589, "grad_norm": 0.34152382612228394, "learning_rate": 4.905092843222174e-05, "loss": 0.3724, "step": 1402500 }, { "epoch": 9.494099177132957, "grad_norm": 0.3228163421154022, "learning_rate": 4.905059008228671e-05, "loss": 0.3737, "step": 1403000 }, { "epoch": 9.497482676483326, "grad_norm": 0.3243357837200165, "learning_rate": 4.905025173235167e-05, "loss": 0.3726, "step": 1403500 }, { "epoch": 9.500866175833695, "grad_norm": 0.3643701374530792, "learning_rate": 4.9049913382416634e-05, "loss": 0.3744, "step": 1404000 }, { "epoch": 9.504249675184063, "grad_norm": 0.3463166058063507, "learning_rate": 4.9049575032481596e-05, "loss": 0.3729, "step": 1404500 }, { "epoch": 9.50763317453443, "grad_norm": 0.36135637760162354, "learning_rate": 4.904923668254656e-05, "loss": 0.3739, "step": 1405000 }, { "epoch": 9.511016673884798, "grad_norm": 0.3563440144062042, "learning_rate": 4.904889833261152e-05, "loss": 0.3731, "step": 1405500 }, { "epoch": 9.514400173235167, "grad_norm": 0.3402830958366394, "learning_rate": 4.904855998267648e-05, "loss": 0.3746, "step": 1406000 }, { "epoch": 9.517783672585535, "grad_norm": 0.3417011797428131, "learning_rate": 4.9048221632741445e-05, "loss": 0.3746, "step": 1406500 }, { "epoch": 9.521167171935904, "grad_norm": 0.321426659822464, "learning_rate": 4.9047883282806414e-05, "loss": 0.3736, "step": 1407000 }, { "epoch": 9.52455067128627, "grad_norm": 0.36277860403060913, "learning_rate": 4.9047544932871376e-05, "loss": 0.3743, "step": 1407500 }, { "epoch": 9.52793417063664, "grad_norm": 0.3817081153392792, "learning_rate": 4.904720658293634e-05, "loss": 0.3754, "step": 1408000 }, { "epoch": 9.531317669987008, "grad_norm": 0.3738994598388672, "learning_rate": 4.90468682330013e-05, "loss": 0.3738, "step": 1408500 }, { "epoch": 9.534701169337376, "grad_norm": 0.3876747786998749, "learning_rate": 4.904652988306627e-05, "loss": 0.3736, "step": 1409000 }, { "epoch": 9.538084668687743, "grad_norm": 0.36051487922668457, "learning_rate": 4.904619153313123e-05, "loss": 0.3726, "step": 1409500 }, { "epoch": 9.541468168038111, "grad_norm": 0.3415667414665222, "learning_rate": 4.9045853183196186e-05, "loss": 0.3726, "step": 1410000 }, { "epoch": 9.54485166738848, "grad_norm": 0.37187904119491577, "learning_rate": 4.9045514833261155e-05, "loss": 0.3741, "step": 1410500 }, { "epoch": 9.548235166738849, "grad_norm": 0.34227636456489563, "learning_rate": 4.904517648332612e-05, "loss": 0.374, "step": 1411000 }, { "epoch": 9.551618666089215, "grad_norm": 0.3580974042415619, "learning_rate": 4.904483813339108e-05, "loss": 0.3731, "step": 1411500 }, { "epoch": 9.555002165439584, "grad_norm": 0.355074942111969, "learning_rate": 4.904449978345604e-05, "loss": 0.3733, "step": 1412000 }, { "epoch": 9.558385664789952, "grad_norm": 0.314767986536026, "learning_rate": 4.904416143352101e-05, "loss": 0.3724, "step": 1412500 }, { "epoch": 9.56176916414032, "grad_norm": 0.36553481221199036, "learning_rate": 4.904382308358597e-05, "loss": 0.3729, "step": 1413000 }, { "epoch": 9.56515266349069, "grad_norm": 0.33571797609329224, "learning_rate": 4.9043484733650935e-05, "loss": 0.3744, "step": 1413500 }, { "epoch": 9.568536162841056, "grad_norm": 0.33554765582084656, "learning_rate": 4.90431463837159e-05, "loss": 0.3731, "step": 1414000 }, { "epoch": 9.571919662191425, "grad_norm": 0.37651869654655457, "learning_rate": 4.9042808033780866e-05, "loss": 0.3735, "step": 1414500 }, { "epoch": 9.575303161541793, "grad_norm": 0.35934606194496155, "learning_rate": 4.904246968384582e-05, "loss": 0.3727, "step": 1415000 }, { "epoch": 9.578686660892162, "grad_norm": 0.3884565830230713, "learning_rate": 4.904213133391078e-05, "loss": 0.3736, "step": 1415500 }, { "epoch": 9.582070160242528, "grad_norm": 0.3344481289386749, "learning_rate": 4.9041792983975745e-05, "loss": 0.3743, "step": 1416000 }, { "epoch": 9.585453659592897, "grad_norm": 0.37787380814552307, "learning_rate": 4.9041454634040714e-05, "loss": 0.3713, "step": 1416500 }, { "epoch": 9.588837158943265, "grad_norm": 0.3896807134151459, "learning_rate": 4.9041116284105676e-05, "loss": 0.3737, "step": 1417000 }, { "epoch": 9.592220658293634, "grad_norm": 0.37299489974975586, "learning_rate": 4.904077793417064e-05, "loss": 0.3714, "step": 1417500 }, { "epoch": 9.595604157644003, "grad_norm": 0.3850718140602112, "learning_rate": 4.90404395842356e-05, "loss": 0.3727, "step": 1418000 }, { "epoch": 9.59898765699437, "grad_norm": 0.3553083837032318, "learning_rate": 4.904010123430057e-05, "loss": 0.3734, "step": 1418500 }, { "epoch": 9.602371156344738, "grad_norm": 0.3951491415500641, "learning_rate": 4.903976288436553e-05, "loss": 0.3723, "step": 1419000 }, { "epoch": 9.605754655695106, "grad_norm": 0.4291026294231415, "learning_rate": 4.903942453443049e-05, "loss": 0.3738, "step": 1419500 }, { "epoch": 9.609138155045475, "grad_norm": 0.36827054619789124, "learning_rate": 4.9039086184495456e-05, "loss": 0.3727, "step": 1420000 }, { "epoch": 9.612521654395842, "grad_norm": 0.35370057821273804, "learning_rate": 4.903874783456042e-05, "loss": 0.3721, "step": 1420500 }, { "epoch": 9.61590515374621, "grad_norm": 0.3819619417190552, "learning_rate": 4.903840948462538e-05, "loss": 0.3728, "step": 1421000 }, { "epoch": 9.619288653096579, "grad_norm": 0.335520476102829, "learning_rate": 4.903807113469034e-05, "loss": 0.3746, "step": 1421500 }, { "epoch": 9.622672152446947, "grad_norm": 0.3815540373325348, "learning_rate": 4.903773278475531e-05, "loss": 0.3714, "step": 1422000 }, { "epoch": 9.626055651797316, "grad_norm": 0.3790264427661896, "learning_rate": 4.903739443482027e-05, "loss": 0.3744, "step": 1422500 }, { "epoch": 9.629439151147682, "grad_norm": 0.3546484410762787, "learning_rate": 4.9037056084885235e-05, "loss": 0.3728, "step": 1423000 }, { "epoch": 9.632822650498051, "grad_norm": 0.3164576292037964, "learning_rate": 4.90367177349502e-05, "loss": 0.3723, "step": 1423500 }, { "epoch": 9.63620614984842, "grad_norm": 0.34413549304008484, "learning_rate": 4.9036379385015167e-05, "loss": 0.3717, "step": 1424000 }, { "epoch": 9.639589649198788, "grad_norm": 0.3173050284385681, "learning_rate": 4.903604103508012e-05, "loss": 0.3737, "step": 1424500 }, { "epoch": 9.642973148549155, "grad_norm": 0.3323732614517212, "learning_rate": 4.9035702685145084e-05, "loss": 0.3719, "step": 1425000 }, { "epoch": 9.646356647899523, "grad_norm": 0.3265518546104431, "learning_rate": 4.9035364335210046e-05, "loss": 0.374, "step": 1425500 }, { "epoch": 9.649740147249892, "grad_norm": 0.34177255630493164, "learning_rate": 4.9035025985275015e-05, "loss": 0.3746, "step": 1426000 }, { "epoch": 9.65312364660026, "grad_norm": 0.34831613302230835, "learning_rate": 4.903468763533998e-05, "loss": 0.3724, "step": 1426500 }, { "epoch": 9.656507145950627, "grad_norm": 0.3265458047389984, "learning_rate": 4.903434928540494e-05, "loss": 0.3746, "step": 1427000 }, { "epoch": 9.659890645300996, "grad_norm": 0.3510293662548065, "learning_rate": 4.90340109354699e-05, "loss": 0.3721, "step": 1427500 }, { "epoch": 9.663274144651364, "grad_norm": 0.328610897064209, "learning_rate": 4.903367258553487e-05, "loss": 0.374, "step": 1428000 }, { "epoch": 9.666657644001733, "grad_norm": 0.34514859318733215, "learning_rate": 4.903333423559983e-05, "loss": 0.3733, "step": 1428500 }, { "epoch": 9.670041143352101, "grad_norm": 0.3269490897655487, "learning_rate": 4.903299588566479e-05, "loss": 0.374, "step": 1429000 }, { "epoch": 9.673424642702468, "grad_norm": 0.3199786841869354, "learning_rate": 4.903265753572976e-05, "loss": 0.373, "step": 1429500 }, { "epoch": 9.676808142052836, "grad_norm": 0.31224554777145386, "learning_rate": 4.903231918579472e-05, "loss": 0.3753, "step": 1430000 }, { "epoch": 9.680191641403205, "grad_norm": 0.34504151344299316, "learning_rate": 4.903198083585968e-05, "loss": 0.3738, "step": 1430500 }, { "epoch": 9.683575140753574, "grad_norm": 0.3295120894908905, "learning_rate": 4.903164248592464e-05, "loss": 0.3736, "step": 1431000 }, { "epoch": 9.686958640103942, "grad_norm": 0.34418854117393494, "learning_rate": 4.903130413598961e-05, "loss": 0.3738, "step": 1431500 }, { "epoch": 9.690342139454309, "grad_norm": 0.34600433707237244, "learning_rate": 4.9030965786054574e-05, "loss": 0.3754, "step": 1432000 }, { "epoch": 9.693725638804677, "grad_norm": 0.3856705129146576, "learning_rate": 4.9030627436119536e-05, "loss": 0.3743, "step": 1432500 }, { "epoch": 9.697109138155046, "grad_norm": 0.34834805130958557, "learning_rate": 4.90302890861845e-05, "loss": 0.3739, "step": 1433000 }, { "epoch": 9.700492637505414, "grad_norm": 0.33331626653671265, "learning_rate": 4.902995073624946e-05, "loss": 0.3736, "step": 1433500 }, { "epoch": 9.703876136855781, "grad_norm": 0.3352530300617218, "learning_rate": 4.902961238631442e-05, "loss": 0.3735, "step": 1434000 }, { "epoch": 9.70725963620615, "grad_norm": 0.34129124879837036, "learning_rate": 4.9029274036379385e-05, "loss": 0.3742, "step": 1434500 }, { "epoch": 9.710643135556518, "grad_norm": 0.31555068492889404, "learning_rate": 4.902893568644435e-05, "loss": 0.3719, "step": 1435000 }, { "epoch": 9.714026634906887, "grad_norm": 0.37110719084739685, "learning_rate": 4.9028597336509316e-05, "loss": 0.3743, "step": 1435500 }, { "epoch": 9.717410134257253, "grad_norm": 0.33461055159568787, "learning_rate": 4.902825898657428e-05, "loss": 0.375, "step": 1436000 }, { "epoch": 9.720793633607622, "grad_norm": 0.32992124557495117, "learning_rate": 4.902792063663924e-05, "loss": 0.3737, "step": 1436500 }, { "epoch": 9.72417713295799, "grad_norm": 0.3794945180416107, "learning_rate": 4.90275822867042e-05, "loss": 0.3728, "step": 1437000 }, { "epoch": 9.727560632308359, "grad_norm": 0.341802716255188, "learning_rate": 4.902724393676917e-05, "loss": 0.3728, "step": 1437500 }, { "epoch": 9.730944131658728, "grad_norm": 0.327571302652359, "learning_rate": 4.902690558683413e-05, "loss": 0.3743, "step": 1438000 }, { "epoch": 9.734327631009094, "grad_norm": 0.39465704560279846, "learning_rate": 4.902656723689909e-05, "loss": 0.3733, "step": 1438500 }, { "epoch": 9.737711130359463, "grad_norm": 0.36543476581573486, "learning_rate": 4.902622888696406e-05, "loss": 0.3741, "step": 1439000 }, { "epoch": 9.741094629709831, "grad_norm": 0.35676220059394836, "learning_rate": 4.902589053702902e-05, "loss": 0.3714, "step": 1439500 }, { "epoch": 9.7444781290602, "grad_norm": 0.3575495779514313, "learning_rate": 4.902555218709398e-05, "loss": 0.3737, "step": 1440000 }, { "epoch": 9.747861628410567, "grad_norm": 0.36440545320510864, "learning_rate": 4.9025213837158944e-05, "loss": 0.3738, "step": 1440500 }, { "epoch": 9.751245127760935, "grad_norm": 0.37288621068000793, "learning_rate": 4.9024875487223906e-05, "loss": 0.3727, "step": 1441000 }, { "epoch": 9.754628627111304, "grad_norm": 0.36296847462654114, "learning_rate": 4.9024537137288875e-05, "loss": 0.3728, "step": 1441500 }, { "epoch": 9.758012126461672, "grad_norm": 0.39852529764175415, "learning_rate": 4.902419878735384e-05, "loss": 0.3745, "step": 1442000 }, { "epoch": 9.761395625812039, "grad_norm": 0.3430839776992798, "learning_rate": 4.90238604374188e-05, "loss": 0.3749, "step": 1442500 }, { "epoch": 9.764779125162407, "grad_norm": 0.39759382605552673, "learning_rate": 4.902352208748376e-05, "loss": 0.3743, "step": 1443000 }, { "epoch": 9.768162624512776, "grad_norm": 0.3515682816505432, "learning_rate": 4.902318373754872e-05, "loss": 0.3726, "step": 1443500 }, { "epoch": 9.771546123863144, "grad_norm": 0.3821214437484741, "learning_rate": 4.9022845387613685e-05, "loss": 0.3732, "step": 1444000 }, { "epoch": 9.774929623213513, "grad_norm": 0.36167728900909424, "learning_rate": 4.902250703767865e-05, "loss": 0.3744, "step": 1444500 }, { "epoch": 9.77831312256388, "grad_norm": 0.35349175333976746, "learning_rate": 4.9022168687743616e-05, "loss": 0.3725, "step": 1445000 }, { "epoch": 9.781696621914248, "grad_norm": 0.3603718876838684, "learning_rate": 4.902183033780858e-05, "loss": 0.3728, "step": 1445500 }, { "epoch": 9.785080121264617, "grad_norm": 0.3149004876613617, "learning_rate": 4.902149198787354e-05, "loss": 0.3732, "step": 1446000 }, { "epoch": 9.788463620614985, "grad_norm": 0.35508009791374207, "learning_rate": 4.90211536379385e-05, "loss": 0.3731, "step": 1446500 }, { "epoch": 9.791847119965354, "grad_norm": 0.36445415019989014, "learning_rate": 4.902081528800347e-05, "loss": 0.3738, "step": 1447000 }, { "epoch": 9.79523061931572, "grad_norm": 0.33561834692955017, "learning_rate": 4.9020476938068434e-05, "loss": 0.3733, "step": 1447500 }, { "epoch": 9.79861411866609, "grad_norm": 0.35104507207870483, "learning_rate": 4.902013858813339e-05, "loss": 0.3735, "step": 1448000 }, { "epoch": 9.801997618016458, "grad_norm": 0.3738578259944916, "learning_rate": 4.901980023819836e-05, "loss": 0.3738, "step": 1448500 }, { "epoch": 9.805381117366826, "grad_norm": 0.36810868978500366, "learning_rate": 4.901946188826332e-05, "loss": 0.3731, "step": 1449000 }, { "epoch": 9.808764616717193, "grad_norm": 0.32860609889030457, "learning_rate": 4.901912353832828e-05, "loss": 0.373, "step": 1449500 }, { "epoch": 9.812148116067561, "grad_norm": 0.3360423743724823, "learning_rate": 4.9018785188393244e-05, "loss": 0.3736, "step": 1450000 }, { "epoch": 9.81553161541793, "grad_norm": 0.3495067358016968, "learning_rate": 4.9018446838458206e-05, "loss": 0.3741, "step": 1450500 }, { "epoch": 9.818915114768298, "grad_norm": 0.36783358454704285, "learning_rate": 4.9018108488523175e-05, "loss": 0.3748, "step": 1451000 }, { "epoch": 9.822298614118665, "grad_norm": 0.36060866713523865, "learning_rate": 4.901777013858814e-05, "loss": 0.3733, "step": 1451500 }, { "epoch": 9.825682113469034, "grad_norm": 0.32261085510253906, "learning_rate": 4.90174317886531e-05, "loss": 0.3729, "step": 1452000 }, { "epoch": 9.829065612819402, "grad_norm": 0.3506331145763397, "learning_rate": 4.901709343871806e-05, "loss": 0.3744, "step": 1452500 }, { "epoch": 9.83244911216977, "grad_norm": 0.3443905711174011, "learning_rate": 4.9016755088783024e-05, "loss": 0.3726, "step": 1453000 }, { "epoch": 9.83583261152014, "grad_norm": 0.3584153950214386, "learning_rate": 4.9016416738847986e-05, "loss": 0.3725, "step": 1453500 }, { "epoch": 9.839216110870506, "grad_norm": 0.36786600947380066, "learning_rate": 4.901607838891295e-05, "loss": 0.3724, "step": 1454000 }, { "epoch": 9.842599610220875, "grad_norm": 0.3549017906188965, "learning_rate": 4.901574003897792e-05, "loss": 0.3735, "step": 1454500 }, { "epoch": 9.845983109571243, "grad_norm": 0.39618372917175293, "learning_rate": 4.901540168904288e-05, "loss": 0.3738, "step": 1455000 }, { "epoch": 9.849366608921612, "grad_norm": 0.3646342158317566, "learning_rate": 4.901506333910784e-05, "loss": 0.374, "step": 1455500 }, { "epoch": 9.852750108271978, "grad_norm": 0.3795194923877716, "learning_rate": 4.90147249891728e-05, "loss": 0.3737, "step": 1456000 }, { "epoch": 9.856133607622347, "grad_norm": 0.35680365562438965, "learning_rate": 4.901438663923777e-05, "loss": 0.3729, "step": 1456500 }, { "epoch": 9.859517106972715, "grad_norm": 0.3975049555301666, "learning_rate": 4.9014048289302734e-05, "loss": 0.3737, "step": 1457000 }, { "epoch": 9.862900606323084, "grad_norm": 0.3477362394332886, "learning_rate": 4.901370993936769e-05, "loss": 0.3731, "step": 1457500 }, { "epoch": 9.866284105673452, "grad_norm": 0.3453601896762848, "learning_rate": 4.901337158943265e-05, "loss": 0.3743, "step": 1458000 }, { "epoch": 9.86966760502382, "grad_norm": 0.3613618016242981, "learning_rate": 4.901303323949762e-05, "loss": 0.372, "step": 1458500 }, { "epoch": 9.873051104374188, "grad_norm": 0.34904763102531433, "learning_rate": 4.901269488956258e-05, "loss": 0.3726, "step": 1459000 }, { "epoch": 9.876434603724556, "grad_norm": 0.342383474111557, "learning_rate": 4.9012356539627545e-05, "loss": 0.3739, "step": 1459500 }, { "epoch": 9.879818103074925, "grad_norm": 0.3554176390171051, "learning_rate": 4.901201818969251e-05, "loss": 0.374, "step": 1460000 }, { "epoch": 9.883201602425292, "grad_norm": 0.3256734609603882, "learning_rate": 4.9011679839757476e-05, "loss": 0.3753, "step": 1460500 }, { "epoch": 9.88658510177566, "grad_norm": 0.3433820605278015, "learning_rate": 4.901134148982244e-05, "loss": 0.375, "step": 1461000 }, { "epoch": 9.889968601126029, "grad_norm": 0.36203470826148987, "learning_rate": 4.90110031398874e-05, "loss": 0.3727, "step": 1461500 }, { "epoch": 9.893352100476397, "grad_norm": 0.36855342984199524, "learning_rate": 4.901066478995236e-05, "loss": 0.3738, "step": 1462000 }, { "epoch": 9.896735599826766, "grad_norm": 0.3460084795951843, "learning_rate": 4.9010326440017324e-05, "loss": 0.3734, "step": 1462500 }, { "epoch": 9.900119099177132, "grad_norm": 0.337659627199173, "learning_rate": 4.9009988090082287e-05, "loss": 0.3741, "step": 1463000 }, { "epoch": 9.903502598527501, "grad_norm": 0.3099478781223297, "learning_rate": 4.900964974014725e-05, "loss": 0.3741, "step": 1463500 }, { "epoch": 9.90688609787787, "grad_norm": 0.38407188653945923, "learning_rate": 4.900931139021222e-05, "loss": 0.3738, "step": 1464000 }, { "epoch": 9.910269597228238, "grad_norm": 0.3641258180141449, "learning_rate": 4.900897304027718e-05, "loss": 0.3738, "step": 1464500 }, { "epoch": 9.913653096578605, "grad_norm": 0.33939868211746216, "learning_rate": 4.900863469034214e-05, "loss": 0.3742, "step": 1465000 }, { "epoch": 9.917036595928973, "grad_norm": 0.3442780077457428, "learning_rate": 4.9008296340407104e-05, "loss": 0.3725, "step": 1465500 }, { "epoch": 9.920420095279342, "grad_norm": 0.3463834524154663, "learning_rate": 4.900795799047207e-05, "loss": 0.3733, "step": 1466000 }, { "epoch": 9.92380359462971, "grad_norm": 0.31947022676467896, "learning_rate": 4.9007619640537035e-05, "loss": 0.3724, "step": 1466500 }, { "epoch": 9.927187093980077, "grad_norm": 0.3326902389526367, "learning_rate": 4.900728129060199e-05, "loss": 0.3716, "step": 1467000 }, { "epoch": 9.930570593330446, "grad_norm": 0.3712017238140106, "learning_rate": 4.900694294066695e-05, "loss": 0.3733, "step": 1467500 }, { "epoch": 9.933954092680814, "grad_norm": 0.34310412406921387, "learning_rate": 4.900660459073192e-05, "loss": 0.373, "step": 1468000 }, { "epoch": 9.937337592031183, "grad_norm": 0.36225417256355286, "learning_rate": 4.9006266240796883e-05, "loss": 0.3743, "step": 1468500 }, { "epoch": 9.940721091381551, "grad_norm": 0.34073570370674133, "learning_rate": 4.9005927890861846e-05, "loss": 0.3749, "step": 1469000 }, { "epoch": 9.944104590731918, "grad_norm": 0.38553059101104736, "learning_rate": 4.900558954092681e-05, "loss": 0.3736, "step": 1469500 }, { "epoch": 9.947488090082286, "grad_norm": 0.379245400428772, "learning_rate": 4.9005251190991777e-05, "loss": 0.372, "step": 1470000 }, { "epoch": 9.950871589432655, "grad_norm": 0.3675907850265503, "learning_rate": 4.900491284105674e-05, "loss": 0.3734, "step": 1470500 }, { "epoch": 9.954255088783023, "grad_norm": 0.3889124095439911, "learning_rate": 4.90045744911217e-05, "loss": 0.3748, "step": 1471000 }, { "epoch": 9.957638588133392, "grad_norm": 0.34726428985595703, "learning_rate": 4.900423614118666e-05, "loss": 0.375, "step": 1471500 }, { "epoch": 9.961022087483759, "grad_norm": 0.41922616958618164, "learning_rate": 4.9003897791251625e-05, "loss": 0.3734, "step": 1472000 }, { "epoch": 9.964405586834127, "grad_norm": 0.34421753883361816, "learning_rate": 4.900355944131659e-05, "loss": 0.3732, "step": 1472500 }, { "epoch": 9.967789086184496, "grad_norm": 0.3449467420578003, "learning_rate": 4.900322109138155e-05, "loss": 0.3737, "step": 1473000 }, { "epoch": 9.971172585534864, "grad_norm": 0.37184974551200867, "learning_rate": 4.900288274144652e-05, "loss": 0.373, "step": 1473500 }, { "epoch": 9.974556084885231, "grad_norm": 0.34117862582206726, "learning_rate": 4.900254439151148e-05, "loss": 0.3717, "step": 1474000 }, { "epoch": 9.9779395842356, "grad_norm": 0.35933008790016174, "learning_rate": 4.900220604157644e-05, "loss": 0.373, "step": 1474500 }, { "epoch": 9.981323083585968, "grad_norm": 0.34943458437919617, "learning_rate": 4.9001867691641405e-05, "loss": 0.373, "step": 1475000 }, { "epoch": 9.984706582936337, "grad_norm": 0.3573608100414276, "learning_rate": 4.9001529341706373e-05, "loss": 0.3728, "step": 1475500 }, { "epoch": 9.988090082286703, "grad_norm": 0.3658691346645355, "learning_rate": 4.9001190991771336e-05, "loss": 0.3723, "step": 1476000 }, { "epoch": 9.991473581637072, "grad_norm": 0.35315924882888794, "learning_rate": 4.90008526418363e-05, "loss": 0.3732, "step": 1476500 }, { "epoch": 9.99485708098744, "grad_norm": 0.32430362701416016, "learning_rate": 4.900051429190125e-05, "loss": 0.3725, "step": 1477000 }, { "epoch": 9.998240580337809, "grad_norm": 0.434662789106369, "learning_rate": 4.900017594196622e-05, "loss": 0.374, "step": 1477500 }, { "epoch": 10.0, "eval_accuracy": 0.8576633603817478, "eval_loss": 0.5780515074729919, "eval_runtime": 3398.1128, "eval_samples_per_second": 85.56, "eval_steps_per_second": 5.348, "step": 1477760 }, { "epoch": 10.001624079688177, "grad_norm": 0.35455480217933655, "learning_rate": 4.8999837592031184e-05, "loss": 0.3714, "step": 1478000 }, { "epoch": 10.005007579038544, "grad_norm": 0.3450685739517212, "learning_rate": 4.8999499242096146e-05, "loss": 0.3697, "step": 1478500 }, { "epoch": 10.008391078388913, "grad_norm": 0.34946346282958984, "learning_rate": 4.899916089216111e-05, "loss": 0.3701, "step": 1479000 }, { "epoch": 10.011774577739281, "grad_norm": 0.35094282031059265, "learning_rate": 4.899882254222608e-05, "loss": 0.3721, "step": 1479500 }, { "epoch": 10.01515807708965, "grad_norm": 0.37638944387435913, "learning_rate": 4.899848419229104e-05, "loss": 0.3706, "step": 1480000 }, { "epoch": 10.018541576440017, "grad_norm": 0.3403615951538086, "learning_rate": 4.8998145842356e-05, "loss": 0.3706, "step": 1480500 }, { "epoch": 10.021925075790385, "grad_norm": 0.3257076144218445, "learning_rate": 4.8997807492420964e-05, "loss": 0.3714, "step": 1481000 }, { "epoch": 10.025308575140754, "grad_norm": 0.3220478296279907, "learning_rate": 4.8997469142485926e-05, "loss": 0.372, "step": 1481500 }, { "epoch": 10.028692074491122, "grad_norm": 0.36535710096359253, "learning_rate": 4.899713079255089e-05, "loss": 0.37, "step": 1482000 }, { "epoch": 10.03207557384149, "grad_norm": 0.341636061668396, "learning_rate": 4.899679244261585e-05, "loss": 0.3711, "step": 1482500 }, { "epoch": 10.035459073191857, "grad_norm": 0.3620785176753998, "learning_rate": 4.899645409268082e-05, "loss": 0.3733, "step": 1483000 }, { "epoch": 10.038842572542226, "grad_norm": 0.3525392711162567, "learning_rate": 4.899611574274578e-05, "loss": 0.3713, "step": 1483500 }, { "epoch": 10.042226071892594, "grad_norm": 0.3929446339607239, "learning_rate": 4.899577739281074e-05, "loss": 0.3717, "step": 1484000 }, { "epoch": 10.045609571242963, "grad_norm": 0.3982110023498535, "learning_rate": 4.8995439042875705e-05, "loss": 0.3735, "step": 1484500 }, { "epoch": 10.04899307059333, "grad_norm": 0.3973587453365326, "learning_rate": 4.8995100692940674e-05, "loss": 0.3714, "step": 1485000 }, { "epoch": 10.052376569943698, "grad_norm": 0.33388495445251465, "learning_rate": 4.8994762343005636e-05, "loss": 0.3711, "step": 1485500 }, { "epoch": 10.055760069294067, "grad_norm": 0.3741908073425293, "learning_rate": 4.89944239930706e-05, "loss": 0.373, "step": 1486000 }, { "epoch": 10.059143568644435, "grad_norm": 0.37797847390174866, "learning_rate": 4.8994085643135554e-05, "loss": 0.3721, "step": 1486500 }, { "epoch": 10.062527067994804, "grad_norm": 0.36359724402427673, "learning_rate": 4.899374729320052e-05, "loss": 0.3714, "step": 1487000 }, { "epoch": 10.06591056734517, "grad_norm": 0.36529961228370667, "learning_rate": 4.8993408943265485e-05, "loss": 0.3724, "step": 1487500 }, { "epoch": 10.069294066695539, "grad_norm": 0.439048171043396, "learning_rate": 4.899307059333045e-05, "loss": 0.3737, "step": 1488000 }, { "epoch": 10.072677566045908, "grad_norm": 0.33382806181907654, "learning_rate": 4.899273224339541e-05, "loss": 0.3718, "step": 1488500 }, { "epoch": 10.076061065396276, "grad_norm": 0.3356092870235443, "learning_rate": 4.899239389346038e-05, "loss": 0.3724, "step": 1489000 }, { "epoch": 10.079444564746643, "grad_norm": 0.3422519266605377, "learning_rate": 4.899205554352534e-05, "loss": 0.3728, "step": 1489500 }, { "epoch": 10.082828064097011, "grad_norm": 0.4010469317436218, "learning_rate": 4.89917171935903e-05, "loss": 0.3717, "step": 1490000 }, { "epoch": 10.08621156344738, "grad_norm": 0.37595677375793457, "learning_rate": 4.8991378843655264e-05, "loss": 0.3719, "step": 1490500 }, { "epoch": 10.089595062797748, "grad_norm": 0.3703933656215668, "learning_rate": 4.8991040493720226e-05, "loss": 0.3711, "step": 1491000 }, { "epoch": 10.092978562148115, "grad_norm": 0.3610004782676697, "learning_rate": 4.899070214378519e-05, "loss": 0.3724, "step": 1491500 }, { "epoch": 10.096362061498484, "grad_norm": 0.36778146028518677, "learning_rate": 4.899036379385015e-05, "loss": 0.3726, "step": 1492000 }, { "epoch": 10.099745560848852, "grad_norm": 0.36750367283821106, "learning_rate": 4.899002544391512e-05, "loss": 0.3713, "step": 1492500 }, { "epoch": 10.10312906019922, "grad_norm": 0.3422515094280243, "learning_rate": 4.898968709398008e-05, "loss": 0.3715, "step": 1493000 }, { "epoch": 10.10651255954959, "grad_norm": 0.3872862756252289, "learning_rate": 4.8989348744045044e-05, "loss": 0.3714, "step": 1493500 }, { "epoch": 10.109896058899956, "grad_norm": 0.3537133038043976, "learning_rate": 4.8989010394110006e-05, "loss": 0.3717, "step": 1494000 }, { "epoch": 10.113279558250325, "grad_norm": 0.3469699025154114, "learning_rate": 4.8988672044174975e-05, "loss": 0.3742, "step": 1494500 }, { "epoch": 10.116663057600693, "grad_norm": 0.3984968960285187, "learning_rate": 4.898833369423994e-05, "loss": 0.3712, "step": 1495000 }, { "epoch": 10.120046556951062, "grad_norm": 0.36833515763282776, "learning_rate": 4.89879953443049e-05, "loss": 0.3714, "step": 1495500 }, { "epoch": 10.123430056301428, "grad_norm": 0.44496744871139526, "learning_rate": 4.8987656994369854e-05, "loss": 0.3715, "step": 1496000 }, { "epoch": 10.126813555651797, "grad_norm": 0.366485059261322, "learning_rate": 4.898731864443482e-05, "loss": 0.3706, "step": 1496500 }, { "epoch": 10.130197055002165, "grad_norm": 0.3111550807952881, "learning_rate": 4.8986980294499785e-05, "loss": 0.3718, "step": 1497000 }, { "epoch": 10.133580554352534, "grad_norm": 0.35600733757019043, "learning_rate": 4.898664194456475e-05, "loss": 0.3741, "step": 1497500 }, { "epoch": 10.136964053702902, "grad_norm": 0.33908215165138245, "learning_rate": 4.898630359462971e-05, "loss": 0.3713, "step": 1498000 }, { "epoch": 10.14034755305327, "grad_norm": 0.3269176185131073, "learning_rate": 4.898596524469468e-05, "loss": 0.3716, "step": 1498500 }, { "epoch": 10.143731052403638, "grad_norm": 0.31056517362594604, "learning_rate": 4.898562689475964e-05, "loss": 0.3717, "step": 1499000 }, { "epoch": 10.147114551754006, "grad_norm": 0.3457142114639282, "learning_rate": 4.89852885448246e-05, "loss": 0.3724, "step": 1499500 }, { "epoch": 10.150498051104375, "grad_norm": 0.36891162395477295, "learning_rate": 4.8984950194889565e-05, "loss": 0.3725, "step": 1500000 }, { "epoch": 10.153881550454742, "grad_norm": 0.33141809701919556, "learning_rate": 4.898461184495453e-05, "loss": 0.3742, "step": 1500500 }, { "epoch": 10.15726504980511, "grad_norm": 0.35837483406066895, "learning_rate": 4.898427349501949e-05, "loss": 0.3704, "step": 1501000 }, { "epoch": 10.160648549155479, "grad_norm": 0.37250304222106934, "learning_rate": 4.898393514508445e-05, "loss": 0.3726, "step": 1501500 }, { "epoch": 10.164032048505847, "grad_norm": 0.32248085737228394, "learning_rate": 4.898359679514942e-05, "loss": 0.3704, "step": 1502000 }, { "epoch": 10.167415547856216, "grad_norm": 0.44182881712913513, "learning_rate": 4.898325844521438e-05, "loss": 0.3708, "step": 1502500 }, { "epoch": 10.170799047206582, "grad_norm": 0.3475426137447357, "learning_rate": 4.8982920095279344e-05, "loss": 0.3714, "step": 1503000 }, { "epoch": 10.17418254655695, "grad_norm": 0.3327220678329468, "learning_rate": 4.8982581745344306e-05, "loss": 0.373, "step": 1503500 }, { "epoch": 10.17756604590732, "grad_norm": 0.3623294532299042, "learning_rate": 4.898224339540927e-05, "loss": 0.3721, "step": 1504000 }, { "epoch": 10.180949545257688, "grad_norm": 0.33989912271499634, "learning_rate": 4.898190504547424e-05, "loss": 0.3713, "step": 1504500 }, { "epoch": 10.184333044608055, "grad_norm": 0.32133978605270386, "learning_rate": 4.89815666955392e-05, "loss": 0.3719, "step": 1505000 }, { "epoch": 10.187716543958423, "grad_norm": 0.38162854313850403, "learning_rate": 4.8981228345604155e-05, "loss": 0.3727, "step": 1505500 }, { "epoch": 10.191100043308792, "grad_norm": 0.37406229972839355, "learning_rate": 4.8980889995669124e-05, "loss": 0.3718, "step": 1506000 }, { "epoch": 10.19448354265916, "grad_norm": 0.3452966511249542, "learning_rate": 4.8980551645734086e-05, "loss": 0.3714, "step": 1506500 }, { "epoch": 10.197867042009529, "grad_norm": 0.3505021631717682, "learning_rate": 4.898021329579905e-05, "loss": 0.3707, "step": 1507000 }, { "epoch": 10.201250541359896, "grad_norm": 0.3570825159549713, "learning_rate": 4.897987494586401e-05, "loss": 0.3703, "step": 1507500 }, { "epoch": 10.204634040710264, "grad_norm": 0.3546437919139862, "learning_rate": 4.897953659592898e-05, "loss": 0.371, "step": 1508000 }, { "epoch": 10.208017540060633, "grad_norm": 0.41467759013175964, "learning_rate": 4.897919824599394e-05, "loss": 0.3722, "step": 1508500 }, { "epoch": 10.211401039411001, "grad_norm": 0.39821138978004456, "learning_rate": 4.89788598960589e-05, "loss": 0.371, "step": 1509000 }, { "epoch": 10.214784538761368, "grad_norm": 0.3705928325653076, "learning_rate": 4.8978521546123865e-05, "loss": 0.3718, "step": 1509500 }, { "epoch": 10.218168038111736, "grad_norm": 0.3757786750793457, "learning_rate": 4.897818319618883e-05, "loss": 0.3716, "step": 1510000 }, { "epoch": 10.221551537462105, "grad_norm": 0.3502088785171509, "learning_rate": 4.897784484625379e-05, "loss": 0.3726, "step": 1510500 }, { "epoch": 10.224935036812473, "grad_norm": 0.3413272202014923, "learning_rate": 4.897750649631875e-05, "loss": 0.3707, "step": 1511000 }, { "epoch": 10.228318536162842, "grad_norm": 0.35884177684783936, "learning_rate": 4.8977168146383714e-05, "loss": 0.3721, "step": 1511500 }, { "epoch": 10.231702035513209, "grad_norm": 0.33523499965667725, "learning_rate": 4.897682979644868e-05, "loss": 0.3705, "step": 1512000 }, { "epoch": 10.235085534863577, "grad_norm": 0.40591904520988464, "learning_rate": 4.8976491446513645e-05, "loss": 0.3725, "step": 1512500 }, { "epoch": 10.238469034213946, "grad_norm": 0.3495722711086273, "learning_rate": 4.897615309657861e-05, "loss": 0.3723, "step": 1513000 }, { "epoch": 10.241852533564314, "grad_norm": 0.3645005226135254, "learning_rate": 4.897581474664357e-05, "loss": 0.3725, "step": 1513500 }, { "epoch": 10.245236032914681, "grad_norm": 0.3237664997577667, "learning_rate": 4.897547639670854e-05, "loss": 0.3728, "step": 1514000 }, { "epoch": 10.24861953226505, "grad_norm": 0.3544117510318756, "learning_rate": 4.89751380467735e-05, "loss": 0.3725, "step": 1514500 }, { "epoch": 10.252003031615418, "grad_norm": 0.3753994107246399, "learning_rate": 4.8974799696838456e-05, "loss": 0.374, "step": 1515000 }, { "epoch": 10.255386530965787, "grad_norm": 0.34924355149269104, "learning_rate": 4.8974461346903424e-05, "loss": 0.3733, "step": 1515500 }, { "epoch": 10.258770030316153, "grad_norm": 0.41519322991371155, "learning_rate": 4.8974122996968387e-05, "loss": 0.3718, "step": 1516000 }, { "epoch": 10.262153529666522, "grad_norm": 0.33920472860336304, "learning_rate": 4.897378464703335e-05, "loss": 0.373, "step": 1516500 }, { "epoch": 10.26553702901689, "grad_norm": 0.3762575387954712, "learning_rate": 4.897344629709831e-05, "loss": 0.3723, "step": 1517000 }, { "epoch": 10.268920528367259, "grad_norm": 0.33512699604034424, "learning_rate": 4.897310794716328e-05, "loss": 0.371, "step": 1517500 }, { "epoch": 10.272304027717627, "grad_norm": 0.36007460951805115, "learning_rate": 4.897276959722824e-05, "loss": 0.3722, "step": 1518000 }, { "epoch": 10.275687527067994, "grad_norm": 0.34781354665756226, "learning_rate": 4.8972431247293204e-05, "loss": 0.3732, "step": 1518500 }, { "epoch": 10.279071026418363, "grad_norm": 0.3836442828178406, "learning_rate": 4.8972092897358166e-05, "loss": 0.3719, "step": 1519000 }, { "epoch": 10.282454525768731, "grad_norm": 0.3336435556411743, "learning_rate": 4.897175454742313e-05, "loss": 0.3729, "step": 1519500 }, { "epoch": 10.2858380251191, "grad_norm": 0.3318532705307007, "learning_rate": 4.897141619748809e-05, "loss": 0.3735, "step": 1520000 }, { "epoch": 10.289221524469466, "grad_norm": 0.3705673813819885, "learning_rate": 4.897107784755305e-05, "loss": 0.3723, "step": 1520500 }, { "epoch": 10.292605023819835, "grad_norm": 0.3291831910610199, "learning_rate": 4.8970739497618015e-05, "loss": 0.3729, "step": 1521000 }, { "epoch": 10.295988523170204, "grad_norm": 0.3454623818397522, "learning_rate": 4.8970401147682983e-05, "loss": 0.3723, "step": 1521500 }, { "epoch": 10.299372022520572, "grad_norm": 0.34960824251174927, "learning_rate": 4.8970062797747946e-05, "loss": 0.3724, "step": 1522000 }, { "epoch": 10.30275552187094, "grad_norm": 0.3701937198638916, "learning_rate": 4.896972444781291e-05, "loss": 0.373, "step": 1522500 }, { "epoch": 10.306139021221307, "grad_norm": 0.3673439621925354, "learning_rate": 4.896938609787787e-05, "loss": 0.3737, "step": 1523000 }, { "epoch": 10.309522520571676, "grad_norm": 0.3876641094684601, "learning_rate": 4.896904774794284e-05, "loss": 0.3723, "step": 1523500 }, { "epoch": 10.312906019922044, "grad_norm": 0.3613537847995758, "learning_rate": 4.89687093980078e-05, "loss": 0.371, "step": 1524000 }, { "epoch": 10.316289519272413, "grad_norm": 0.3192806839942932, "learning_rate": 4.8968371048072756e-05, "loss": 0.371, "step": 1524500 }, { "epoch": 10.31967301862278, "grad_norm": 0.35730502009391785, "learning_rate": 4.8968032698137725e-05, "loss": 0.3736, "step": 1525000 }, { "epoch": 10.323056517973148, "grad_norm": 0.35998380184173584, "learning_rate": 4.896769434820269e-05, "loss": 0.3726, "step": 1525500 }, { "epoch": 10.326440017323517, "grad_norm": 0.3771452307701111, "learning_rate": 4.896735599826765e-05, "loss": 0.3705, "step": 1526000 }, { "epoch": 10.329823516673885, "grad_norm": 0.39200085401535034, "learning_rate": 4.896701764833261e-05, "loss": 0.3711, "step": 1526500 }, { "epoch": 10.333207016024254, "grad_norm": 0.3914739191532135, "learning_rate": 4.896667929839758e-05, "loss": 0.3725, "step": 1527000 }, { "epoch": 10.33659051537462, "grad_norm": 0.39595144987106323, "learning_rate": 4.896634094846254e-05, "loss": 0.3732, "step": 1527500 }, { "epoch": 10.339974014724989, "grad_norm": 0.35875624418258667, "learning_rate": 4.8966002598527505e-05, "loss": 0.3728, "step": 1528000 }, { "epoch": 10.343357514075358, "grad_norm": 0.3740348219871521, "learning_rate": 4.896566424859247e-05, "loss": 0.3738, "step": 1528500 }, { "epoch": 10.346741013425726, "grad_norm": 0.3435947597026825, "learning_rate": 4.896532589865743e-05, "loss": 0.3715, "step": 1529000 }, { "epoch": 10.350124512776093, "grad_norm": 0.34385281801223755, "learning_rate": 4.896498754872239e-05, "loss": 0.3719, "step": 1529500 }, { "epoch": 10.353508012126461, "grad_norm": 0.3487713932991028, "learning_rate": 4.896464919878735e-05, "loss": 0.3726, "step": 1530000 }, { "epoch": 10.35689151147683, "grad_norm": 0.36372271180152893, "learning_rate": 4.8964310848852315e-05, "loss": 0.3715, "step": 1530500 }, { "epoch": 10.360275010827198, "grad_norm": 0.377616286277771, "learning_rate": 4.8963972498917284e-05, "loss": 0.3735, "step": 1531000 }, { "epoch": 10.363658510177567, "grad_norm": 0.33630087971687317, "learning_rate": 4.8963634148982246e-05, "loss": 0.3726, "step": 1531500 }, { "epoch": 10.367042009527934, "grad_norm": 0.35733091831207275, "learning_rate": 4.896329579904721e-05, "loss": 0.3725, "step": 1532000 }, { "epoch": 10.370425508878302, "grad_norm": 0.3580639064311981, "learning_rate": 4.896295744911217e-05, "loss": 0.372, "step": 1532500 }, { "epoch": 10.37380900822867, "grad_norm": 0.3302856683731079, "learning_rate": 4.896261909917714e-05, "loss": 0.3712, "step": 1533000 }, { "epoch": 10.37719250757904, "grad_norm": 0.33400455117225647, "learning_rate": 4.89622807492421e-05, "loss": 0.3739, "step": 1533500 }, { "epoch": 10.380576006929406, "grad_norm": 0.38881915807724, "learning_rate": 4.896194239930706e-05, "loss": 0.3717, "step": 1534000 }, { "epoch": 10.383959506279774, "grad_norm": 0.32272690534591675, "learning_rate": 4.8961604049372026e-05, "loss": 0.3717, "step": 1534500 }, { "epoch": 10.387343005630143, "grad_norm": 0.371811181306839, "learning_rate": 4.896126569943699e-05, "loss": 0.3717, "step": 1535000 }, { "epoch": 10.390726504980512, "grad_norm": 0.357614129781723, "learning_rate": 4.896092734950195e-05, "loss": 0.3723, "step": 1535500 }, { "epoch": 10.39411000433088, "grad_norm": 0.3578258454799652, "learning_rate": 4.896058899956691e-05, "loss": 0.3721, "step": 1536000 }, { "epoch": 10.397493503681247, "grad_norm": 0.33020392060279846, "learning_rate": 4.896025064963188e-05, "loss": 0.3725, "step": 1536500 }, { "epoch": 10.400877003031615, "grad_norm": 0.3697195053100586, "learning_rate": 4.895991229969684e-05, "loss": 0.3733, "step": 1537000 }, { "epoch": 10.404260502381984, "grad_norm": 0.3831406533718109, "learning_rate": 4.8959573949761805e-05, "loss": 0.373, "step": 1537500 }, { "epoch": 10.407644001732352, "grad_norm": 0.3610920011997223, "learning_rate": 4.895923559982677e-05, "loss": 0.3715, "step": 1538000 }, { "epoch": 10.41102750108272, "grad_norm": 0.33148255944252014, "learning_rate": 4.8958897249891736e-05, "loss": 0.372, "step": 1538500 }, { "epoch": 10.414411000433088, "grad_norm": 0.3550785183906555, "learning_rate": 4.895855889995669e-05, "loss": 0.3744, "step": 1539000 }, { "epoch": 10.417794499783456, "grad_norm": 0.3774226903915405, "learning_rate": 4.8958220550021654e-05, "loss": 0.372, "step": 1539500 }, { "epoch": 10.421177999133825, "grad_norm": 0.3333495259284973, "learning_rate": 4.8957882200086616e-05, "loss": 0.374, "step": 1540000 }, { "epoch": 10.424561498484191, "grad_norm": 0.3145923316478729, "learning_rate": 4.8957543850151585e-05, "loss": 0.3744, "step": 1540500 }, { "epoch": 10.42794499783456, "grad_norm": 0.37305423617362976, "learning_rate": 4.895720550021655e-05, "loss": 0.3723, "step": 1541000 }, { "epoch": 10.431328497184928, "grad_norm": 0.358360230922699, "learning_rate": 4.895686715028151e-05, "loss": 0.3726, "step": 1541500 }, { "epoch": 10.434711996535297, "grad_norm": 0.3629431426525116, "learning_rate": 4.895652880034647e-05, "loss": 0.3726, "step": 1542000 }, { "epoch": 10.438095495885666, "grad_norm": 0.4073221981525421, "learning_rate": 4.895619045041144e-05, "loss": 0.373, "step": 1542500 }, { "epoch": 10.441478995236032, "grad_norm": 0.39308783411979675, "learning_rate": 4.89558521004764e-05, "loss": 0.3718, "step": 1543000 }, { "epoch": 10.4448624945864, "grad_norm": 0.3807017505168915, "learning_rate": 4.895551375054136e-05, "loss": 0.3739, "step": 1543500 }, { "epoch": 10.44824599393677, "grad_norm": 0.3773409426212311, "learning_rate": 4.8955175400606326e-05, "loss": 0.3721, "step": 1544000 }, { "epoch": 10.451629493287138, "grad_norm": 0.36095383763313293, "learning_rate": 4.895483705067129e-05, "loss": 0.3727, "step": 1544500 }, { "epoch": 10.455012992637505, "grad_norm": 0.3516400456428528, "learning_rate": 4.895449870073625e-05, "loss": 0.371, "step": 1545000 }, { "epoch": 10.458396491987873, "grad_norm": 0.36414170265197754, "learning_rate": 4.895416035080121e-05, "loss": 0.3734, "step": 1545500 }, { "epoch": 10.461779991338242, "grad_norm": 0.3702701926231384, "learning_rate": 4.895382200086618e-05, "loss": 0.3733, "step": 1546000 }, { "epoch": 10.46516349068861, "grad_norm": 0.36798205971717834, "learning_rate": 4.8953483650931144e-05, "loss": 0.3717, "step": 1546500 }, { "epoch": 10.468546990038979, "grad_norm": 0.3716042935848236, "learning_rate": 4.8953145300996106e-05, "loss": 0.3733, "step": 1547000 }, { "epoch": 10.471930489389345, "grad_norm": 0.331015020608902, "learning_rate": 4.895280695106107e-05, "loss": 0.3733, "step": 1547500 }, { "epoch": 10.475313988739714, "grad_norm": 0.3597949147224426, "learning_rate": 4.895246860112604e-05, "loss": 0.3697, "step": 1548000 }, { "epoch": 10.478697488090083, "grad_norm": 0.3553822636604309, "learning_rate": 4.895213025119099e-05, "loss": 0.3741, "step": 1548500 }, { "epoch": 10.482080987440451, "grad_norm": 0.3873803913593292, "learning_rate": 4.8951791901255954e-05, "loss": 0.3729, "step": 1549000 }, { "epoch": 10.485464486790818, "grad_norm": 0.3339466452598572, "learning_rate": 4.8951453551320916e-05, "loss": 0.3728, "step": 1549500 }, { "epoch": 10.488847986141186, "grad_norm": 0.335771769285202, "learning_rate": 4.8951115201385885e-05, "loss": 0.3721, "step": 1550000 }, { "epoch": 10.492231485491555, "grad_norm": 0.37400147318840027, "learning_rate": 4.895077685145085e-05, "loss": 0.3733, "step": 1550500 }, { "epoch": 10.495614984841923, "grad_norm": 0.37101054191589355, "learning_rate": 4.895043850151581e-05, "loss": 0.3729, "step": 1551000 }, { "epoch": 10.498998484192292, "grad_norm": 0.36181971430778503, "learning_rate": 4.895010015158077e-05, "loss": 0.371, "step": 1551500 }, { "epoch": 10.502381983542659, "grad_norm": 0.3649595379829407, "learning_rate": 4.894976180164574e-05, "loss": 0.371, "step": 1552000 }, { "epoch": 10.505765482893027, "grad_norm": 0.3265508711338043, "learning_rate": 4.89494234517107e-05, "loss": 0.3719, "step": 1552500 }, { "epoch": 10.509148982243396, "grad_norm": 0.3649456799030304, "learning_rate": 4.894908510177566e-05, "loss": 0.3709, "step": 1553000 }, { "epoch": 10.512532481593764, "grad_norm": 0.33293768763542175, "learning_rate": 4.894874675184063e-05, "loss": 0.3732, "step": 1553500 }, { "epoch": 10.515915980944131, "grad_norm": 0.34916120767593384, "learning_rate": 4.894840840190559e-05, "loss": 0.3714, "step": 1554000 }, { "epoch": 10.5192994802945, "grad_norm": 0.3865835666656494, "learning_rate": 4.894807005197055e-05, "loss": 0.3726, "step": 1554500 }, { "epoch": 10.522682979644868, "grad_norm": 0.3607036769390106, "learning_rate": 4.894773170203551e-05, "loss": 0.3711, "step": 1555000 }, { "epoch": 10.526066478995237, "grad_norm": 0.36521416902542114, "learning_rate": 4.894739335210048e-05, "loss": 0.3734, "step": 1555500 }, { "epoch": 10.529449978345603, "grad_norm": 0.3339827060699463, "learning_rate": 4.8947055002165444e-05, "loss": 0.3731, "step": 1556000 }, { "epoch": 10.532833477695972, "grad_norm": 0.3377821743488312, "learning_rate": 4.8946716652230406e-05, "loss": 0.3729, "step": 1556500 }, { "epoch": 10.53621697704634, "grad_norm": 0.4511741101741791, "learning_rate": 4.894637830229537e-05, "loss": 0.3713, "step": 1557000 }, { "epoch": 10.539600476396709, "grad_norm": 0.34616196155548096, "learning_rate": 4.894603995236034e-05, "loss": 0.3747, "step": 1557500 }, { "epoch": 10.542983975747077, "grad_norm": 0.3160056471824646, "learning_rate": 4.894570160242529e-05, "loss": 0.3725, "step": 1558000 }, { "epoch": 10.546367475097444, "grad_norm": 0.3685974180698395, "learning_rate": 4.8945363252490255e-05, "loss": 0.372, "step": 1558500 }, { "epoch": 10.549750974447813, "grad_norm": 0.3372802734375, "learning_rate": 4.894502490255522e-05, "loss": 0.3718, "step": 1559000 }, { "epoch": 10.553134473798181, "grad_norm": 0.3723401129245758, "learning_rate": 4.8944686552620186e-05, "loss": 0.3736, "step": 1559500 }, { "epoch": 10.55651797314855, "grad_norm": 0.36342746019363403, "learning_rate": 4.894434820268515e-05, "loss": 0.374, "step": 1560000 }, { "epoch": 10.559901472498918, "grad_norm": 0.34721601009368896, "learning_rate": 4.894400985275011e-05, "loss": 0.374, "step": 1560500 }, { "epoch": 10.563284971849285, "grad_norm": 0.38332539796829224, "learning_rate": 4.894367150281507e-05, "loss": 0.3719, "step": 1561000 }, { "epoch": 10.566668471199653, "grad_norm": 0.35107436776161194, "learning_rate": 4.894333315288004e-05, "loss": 0.3731, "step": 1561500 }, { "epoch": 10.570051970550022, "grad_norm": 0.3253043591976166, "learning_rate": 4.8942994802945e-05, "loss": 0.3727, "step": 1562000 }, { "epoch": 10.57343546990039, "grad_norm": 0.36623555421829224, "learning_rate": 4.894265645300996e-05, "loss": 0.3733, "step": 1562500 }, { "epoch": 10.576818969250757, "grad_norm": 0.35085707902908325, "learning_rate": 4.894231810307493e-05, "loss": 0.3714, "step": 1563000 }, { "epoch": 10.580202468601126, "grad_norm": 0.3474346101284027, "learning_rate": 4.894197975313989e-05, "loss": 0.3736, "step": 1563500 }, { "epoch": 10.583585967951494, "grad_norm": 0.3433658182621002, "learning_rate": 4.894164140320485e-05, "loss": 0.3708, "step": 1564000 }, { "epoch": 10.586969467301863, "grad_norm": 0.38147303462028503, "learning_rate": 4.8941303053269814e-05, "loss": 0.3716, "step": 1564500 }, { "epoch": 10.59035296665223, "grad_norm": 0.33993029594421387, "learning_rate": 4.894096470333478e-05, "loss": 0.3715, "step": 1565000 }, { "epoch": 10.593736466002598, "grad_norm": 0.3724004924297333, "learning_rate": 4.8940626353399745e-05, "loss": 0.3728, "step": 1565500 }, { "epoch": 10.597119965352967, "grad_norm": 0.34733206033706665, "learning_rate": 4.894028800346471e-05, "loss": 0.3709, "step": 1566000 }, { "epoch": 10.600503464703335, "grad_norm": 0.3865257203578949, "learning_rate": 4.893994965352967e-05, "loss": 0.3732, "step": 1566500 }, { "epoch": 10.603886964053704, "grad_norm": 0.33898982405662537, "learning_rate": 4.893961130359463e-05, "loss": 0.3724, "step": 1567000 }, { "epoch": 10.60727046340407, "grad_norm": 0.32820501923561096, "learning_rate": 4.8939272953659593e-05, "loss": 0.3739, "step": 1567500 }, { "epoch": 10.610653962754439, "grad_norm": 0.34335583448410034, "learning_rate": 4.8938934603724556e-05, "loss": 0.3727, "step": 1568000 }, { "epoch": 10.614037462104807, "grad_norm": 0.3512874245643616, "learning_rate": 4.893859625378952e-05, "loss": 0.372, "step": 1568500 }, { "epoch": 10.617420961455176, "grad_norm": 0.3625296652317047, "learning_rate": 4.893825790385449e-05, "loss": 0.3731, "step": 1569000 }, { "epoch": 10.620804460805543, "grad_norm": 0.344817578792572, "learning_rate": 4.893791955391945e-05, "loss": 0.3733, "step": 1569500 }, { "epoch": 10.624187960155911, "grad_norm": 0.398043155670166, "learning_rate": 4.893758120398441e-05, "loss": 0.3726, "step": 1570000 }, { "epoch": 10.62757145950628, "grad_norm": 0.3540467619895935, "learning_rate": 4.893724285404937e-05, "loss": 0.372, "step": 1570500 }, { "epoch": 10.630954958856648, "grad_norm": 0.32507434487342834, "learning_rate": 4.893690450411434e-05, "loss": 0.3727, "step": 1571000 }, { "epoch": 10.634338458207017, "grad_norm": 0.36480745673179626, "learning_rate": 4.8936566154179304e-05, "loss": 0.3732, "step": 1571500 }, { "epoch": 10.637721957557384, "grad_norm": 0.38430601358413696, "learning_rate": 4.893622780424426e-05, "loss": 0.3753, "step": 1572000 }, { "epoch": 10.641105456907752, "grad_norm": 0.327197790145874, "learning_rate": 4.893588945430923e-05, "loss": 0.3727, "step": 1572500 }, { "epoch": 10.64448895625812, "grad_norm": 0.3469175696372986, "learning_rate": 4.893555110437419e-05, "loss": 0.3719, "step": 1573000 }, { "epoch": 10.64787245560849, "grad_norm": 0.30438894033432007, "learning_rate": 4.893521275443915e-05, "loss": 0.3723, "step": 1573500 }, { "epoch": 10.651255954958856, "grad_norm": 0.39085620641708374, "learning_rate": 4.8934874404504115e-05, "loss": 0.3733, "step": 1574000 }, { "epoch": 10.654639454309224, "grad_norm": 0.35249242186546326, "learning_rate": 4.893453605456908e-05, "loss": 0.3744, "step": 1574500 }, { "epoch": 10.658022953659593, "grad_norm": 0.35302841663360596, "learning_rate": 4.8934197704634046e-05, "loss": 0.373, "step": 1575000 }, { "epoch": 10.661406453009961, "grad_norm": 0.3354608118534088, "learning_rate": 4.893385935469901e-05, "loss": 0.3734, "step": 1575500 }, { "epoch": 10.66478995236033, "grad_norm": 0.37679925560951233, "learning_rate": 4.893352100476397e-05, "loss": 0.3724, "step": 1576000 }, { "epoch": 10.668173451710697, "grad_norm": 0.35046711564064026, "learning_rate": 4.893318265482893e-05, "loss": 0.3725, "step": 1576500 }, { "epoch": 10.671556951061065, "grad_norm": 0.34201011061668396, "learning_rate": 4.8932844304893894e-05, "loss": 0.3734, "step": 1577000 }, { "epoch": 10.674940450411434, "grad_norm": 0.3327251970767975, "learning_rate": 4.8932505954958856e-05, "loss": 0.3714, "step": 1577500 }, { "epoch": 10.678323949761802, "grad_norm": 0.31196796894073486, "learning_rate": 4.893216760502382e-05, "loss": 0.3732, "step": 1578000 }, { "epoch": 10.681707449112169, "grad_norm": 0.32061767578125, "learning_rate": 4.893182925508879e-05, "loss": 0.3713, "step": 1578500 }, { "epoch": 10.685090948462538, "grad_norm": 0.33071762323379517, "learning_rate": 4.893149090515375e-05, "loss": 0.3728, "step": 1579000 }, { "epoch": 10.688474447812906, "grad_norm": 0.4266686737537384, "learning_rate": 4.893115255521871e-05, "loss": 0.3721, "step": 1579500 }, { "epoch": 10.691857947163275, "grad_norm": 0.3743648827075958, "learning_rate": 4.8930814205283674e-05, "loss": 0.3735, "step": 1580000 }, { "epoch": 10.695241446513641, "grad_norm": 0.35196825861930847, "learning_rate": 4.893047585534864e-05, "loss": 0.371, "step": 1580500 }, { "epoch": 10.69862494586401, "grad_norm": 0.34265002608299255, "learning_rate": 4.8930137505413605e-05, "loss": 0.3722, "step": 1581000 }, { "epoch": 10.702008445214378, "grad_norm": 0.3608235716819763, "learning_rate": 4.892979915547856e-05, "loss": 0.3725, "step": 1581500 }, { "epoch": 10.705391944564747, "grad_norm": 0.3871472477912903, "learning_rate": 4.892946080554353e-05, "loss": 0.3724, "step": 1582000 }, { "epoch": 10.708775443915115, "grad_norm": 0.5456562638282776, "learning_rate": 4.892912245560849e-05, "loss": 0.3731, "step": 1582500 }, { "epoch": 10.712158943265482, "grad_norm": 0.304875910282135, "learning_rate": 4.892878410567345e-05, "loss": 0.3735, "step": 1583000 }, { "epoch": 10.71554244261585, "grad_norm": 0.3602370023727417, "learning_rate": 4.8928445755738415e-05, "loss": 0.3726, "step": 1583500 }, { "epoch": 10.71892594196622, "grad_norm": 0.36348941922187805, "learning_rate": 4.892810740580338e-05, "loss": 0.3724, "step": 1584000 }, { "epoch": 10.722309441316588, "grad_norm": 0.3065342903137207, "learning_rate": 4.8927769055868346e-05, "loss": 0.3732, "step": 1584500 }, { "epoch": 10.725692940666956, "grad_norm": 0.3403933644294739, "learning_rate": 4.892743070593331e-05, "loss": 0.3733, "step": 1585000 }, { "epoch": 10.729076440017323, "grad_norm": 0.36604762077331543, "learning_rate": 4.892709235599827e-05, "loss": 0.3722, "step": 1585500 }, { "epoch": 10.732459939367692, "grad_norm": 0.36708346009254456, "learning_rate": 4.892675400606323e-05, "loss": 0.3731, "step": 1586000 }, { "epoch": 10.73584343871806, "grad_norm": 0.3771677613258362, "learning_rate": 4.8926415656128195e-05, "loss": 0.3736, "step": 1586500 }, { "epoch": 10.739226938068429, "grad_norm": 0.3557665944099426, "learning_rate": 4.892607730619316e-05, "loss": 0.3724, "step": 1587000 }, { "epoch": 10.742610437418795, "grad_norm": 0.4154599905014038, "learning_rate": 4.892573895625812e-05, "loss": 0.3738, "step": 1587500 }, { "epoch": 10.745993936769164, "grad_norm": 0.34683358669281006, "learning_rate": 4.892540060632309e-05, "loss": 0.3727, "step": 1588000 }, { "epoch": 10.749377436119532, "grad_norm": 0.3479159474372864, "learning_rate": 4.892506225638805e-05, "loss": 0.3741, "step": 1588500 }, { "epoch": 10.752760935469901, "grad_norm": 0.3663256764411926, "learning_rate": 4.892472390645301e-05, "loss": 0.3725, "step": 1589000 }, { "epoch": 10.756144434820268, "grad_norm": 0.3802635669708252, "learning_rate": 4.8924385556517974e-05, "loss": 0.3717, "step": 1589500 }, { "epoch": 10.759527934170636, "grad_norm": 0.35679909586906433, "learning_rate": 4.892404720658294e-05, "loss": 0.3742, "step": 1590000 }, { "epoch": 10.762911433521005, "grad_norm": 0.33733245730400085, "learning_rate": 4.8923708856647905e-05, "loss": 0.3725, "step": 1590500 }, { "epoch": 10.766294932871373, "grad_norm": 0.5832557082176208, "learning_rate": 4.892337050671287e-05, "loss": 0.3715, "step": 1591000 }, { "epoch": 10.769678432221742, "grad_norm": 0.37262704968452454, "learning_rate": 4.892303215677782e-05, "loss": 0.3733, "step": 1591500 }, { "epoch": 10.773061931572109, "grad_norm": 0.3342667818069458, "learning_rate": 4.892269380684279e-05, "loss": 0.3711, "step": 1592000 }, { "epoch": 10.776445430922477, "grad_norm": 0.3682439625263214, "learning_rate": 4.8922355456907754e-05, "loss": 0.3719, "step": 1592500 }, { "epoch": 10.779828930272846, "grad_norm": 0.33057907223701477, "learning_rate": 4.8922017106972716e-05, "loss": 0.3719, "step": 1593000 }, { "epoch": 10.783212429623214, "grad_norm": 0.39824581146240234, "learning_rate": 4.892167875703768e-05, "loss": 0.3719, "step": 1593500 }, { "epoch": 10.78659592897358, "grad_norm": 0.35091519355773926, "learning_rate": 4.892134040710265e-05, "loss": 0.3733, "step": 1594000 }, { "epoch": 10.78997942832395, "grad_norm": 0.37601837515830994, "learning_rate": 4.892100205716761e-05, "loss": 0.3728, "step": 1594500 }, { "epoch": 10.793362927674318, "grad_norm": 0.3370593190193176, "learning_rate": 4.892066370723257e-05, "loss": 0.3723, "step": 1595000 }, { "epoch": 10.796746427024686, "grad_norm": 0.37330934405326843, "learning_rate": 4.892032535729753e-05, "loss": 0.3728, "step": 1595500 }, { "epoch": 10.800129926375053, "grad_norm": 0.3830586075782776, "learning_rate": 4.8919987007362495e-05, "loss": 0.3739, "step": 1596000 }, { "epoch": 10.803513425725422, "grad_norm": 0.3950082063674927, "learning_rate": 4.891964865742746e-05, "loss": 0.3732, "step": 1596500 }, { "epoch": 10.80689692507579, "grad_norm": 0.3372601568698883, "learning_rate": 4.891931030749242e-05, "loss": 0.3723, "step": 1597000 }, { "epoch": 10.810280424426159, "grad_norm": 0.35912057757377625, "learning_rate": 4.891897195755739e-05, "loss": 0.373, "step": 1597500 }, { "epoch": 10.813663923776527, "grad_norm": 0.35594770312309265, "learning_rate": 4.891863360762235e-05, "loss": 0.3732, "step": 1598000 }, { "epoch": 10.817047423126894, "grad_norm": 0.3459462523460388, "learning_rate": 4.891829525768731e-05, "loss": 0.3742, "step": 1598500 }, { "epoch": 10.820430922477263, "grad_norm": 0.33997267484664917, "learning_rate": 4.8917956907752275e-05, "loss": 0.3723, "step": 1599000 }, { "epoch": 10.823814421827631, "grad_norm": 0.34426257014274597, "learning_rate": 4.8917618557817244e-05, "loss": 0.3748, "step": 1599500 }, { "epoch": 10.827197921178, "grad_norm": 0.39141958951950073, "learning_rate": 4.8917280207882206e-05, "loss": 0.3717, "step": 1600000 }, { "epoch": 10.830581420528368, "grad_norm": 0.3831254243850708, "learning_rate": 4.891694185794717e-05, "loss": 0.3728, "step": 1600500 }, { "epoch": 10.833964919878735, "grad_norm": 0.33389633893966675, "learning_rate": 4.891660350801212e-05, "loss": 0.3723, "step": 1601000 }, { "epoch": 10.837348419229103, "grad_norm": 0.3628421425819397, "learning_rate": 4.891626515807709e-05, "loss": 0.3736, "step": 1601500 }, { "epoch": 10.840731918579472, "grad_norm": 0.3639175295829773, "learning_rate": 4.8915926808142054e-05, "loss": 0.3715, "step": 1602000 }, { "epoch": 10.84411541792984, "grad_norm": 0.35927465558052063, "learning_rate": 4.8915588458207017e-05, "loss": 0.3733, "step": 1602500 }, { "epoch": 10.847498917280207, "grad_norm": 0.3583664000034332, "learning_rate": 4.891525010827198e-05, "loss": 0.3731, "step": 1603000 }, { "epoch": 10.850882416630576, "grad_norm": 0.3696195185184479, "learning_rate": 4.891491175833695e-05, "loss": 0.3749, "step": 1603500 }, { "epoch": 10.854265915980944, "grad_norm": 0.367540180683136, "learning_rate": 4.891457340840191e-05, "loss": 0.3734, "step": 1604000 }, { "epoch": 10.857649415331313, "grad_norm": 0.38197025656700134, "learning_rate": 4.891423505846687e-05, "loss": 0.3717, "step": 1604500 }, { "epoch": 10.86103291468168, "grad_norm": 0.3927936255931854, "learning_rate": 4.8913896708531834e-05, "loss": 0.372, "step": 1605000 }, { "epoch": 10.864416414032048, "grad_norm": 0.3550572693347931, "learning_rate": 4.8913558358596796e-05, "loss": 0.372, "step": 1605500 }, { "epoch": 10.867799913382417, "grad_norm": 0.366964727640152, "learning_rate": 4.891322000866176e-05, "loss": 0.3726, "step": 1606000 }, { "epoch": 10.871183412732785, "grad_norm": 0.3368151783943176, "learning_rate": 4.891288165872672e-05, "loss": 0.3723, "step": 1606500 }, { "epoch": 10.874566912083154, "grad_norm": 0.34027695655822754, "learning_rate": 4.891254330879169e-05, "loss": 0.3715, "step": 1607000 }, { "epoch": 10.87795041143352, "grad_norm": 0.3800116181373596, "learning_rate": 4.891220495885665e-05, "loss": 0.3718, "step": 1607500 }, { "epoch": 10.881333910783889, "grad_norm": 0.35723376274108887, "learning_rate": 4.8911866608921613e-05, "loss": 0.3728, "step": 1608000 }, { "epoch": 10.884717410134257, "grad_norm": 0.3366886377334595, "learning_rate": 4.8911528258986576e-05, "loss": 0.3733, "step": 1608500 }, { "epoch": 10.888100909484626, "grad_norm": 0.38351333141326904, "learning_rate": 4.8911189909051544e-05, "loss": 0.3725, "step": 1609000 }, { "epoch": 10.891484408834993, "grad_norm": 0.36331382393836975, "learning_rate": 4.8910851559116507e-05, "loss": 0.373, "step": 1609500 }, { "epoch": 10.894867908185361, "grad_norm": 0.3579769432544708, "learning_rate": 4.891051320918147e-05, "loss": 0.3724, "step": 1610000 }, { "epoch": 10.89825140753573, "grad_norm": 0.3592638671398163, "learning_rate": 4.8910174859246424e-05, "loss": 0.3718, "step": 1610500 }, { "epoch": 10.901634906886098, "grad_norm": 0.37395668029785156, "learning_rate": 4.890983650931139e-05, "loss": 0.3714, "step": 1611000 }, { "epoch": 10.905018406236467, "grad_norm": 0.31781119108200073, "learning_rate": 4.8909498159376355e-05, "loss": 0.3735, "step": 1611500 }, { "epoch": 10.908401905586834, "grad_norm": 0.3627088665962219, "learning_rate": 4.890915980944132e-05, "loss": 0.3733, "step": 1612000 }, { "epoch": 10.911785404937202, "grad_norm": 0.3828943073749542, "learning_rate": 4.890882145950628e-05, "loss": 0.3733, "step": 1612500 }, { "epoch": 10.91516890428757, "grad_norm": 0.3835192322731018, "learning_rate": 4.890848310957125e-05, "loss": 0.3721, "step": 1613000 }, { "epoch": 10.918552403637939, "grad_norm": 0.400499165058136, "learning_rate": 4.890814475963621e-05, "loss": 0.3715, "step": 1613500 }, { "epoch": 10.921935902988306, "grad_norm": 0.348901629447937, "learning_rate": 4.890780640970117e-05, "loss": 0.3721, "step": 1614000 }, { "epoch": 10.925319402338674, "grad_norm": 0.39376094937324524, "learning_rate": 4.8907468059766135e-05, "loss": 0.3718, "step": 1614500 }, { "epoch": 10.928702901689043, "grad_norm": 0.36082133650779724, "learning_rate": 4.89071297098311e-05, "loss": 0.3737, "step": 1615000 }, { "epoch": 10.932086401039411, "grad_norm": 0.35252830386161804, "learning_rate": 4.890679135989606e-05, "loss": 0.3723, "step": 1615500 }, { "epoch": 10.93546990038978, "grad_norm": 0.3130725026130676, "learning_rate": 4.890645300996102e-05, "loss": 0.3712, "step": 1616000 }, { "epoch": 10.938853399740147, "grad_norm": 0.3593806326389313, "learning_rate": 4.890611466002599e-05, "loss": 0.3724, "step": 1616500 }, { "epoch": 10.942236899090515, "grad_norm": 0.3564632833003998, "learning_rate": 4.890577631009095e-05, "loss": 0.3736, "step": 1617000 }, { "epoch": 10.945620398440884, "grad_norm": 0.3367440402507782, "learning_rate": 4.8905437960155914e-05, "loss": 0.3722, "step": 1617500 }, { "epoch": 10.949003897791252, "grad_norm": 0.35689491033554077, "learning_rate": 4.8905099610220876e-05, "loss": 0.3726, "step": 1618000 }, { "epoch": 10.952387397141619, "grad_norm": 0.36531731486320496, "learning_rate": 4.8904761260285845e-05, "loss": 0.3716, "step": 1618500 }, { "epoch": 10.955770896491988, "grad_norm": 0.3991343677043915, "learning_rate": 4.890442291035081e-05, "loss": 0.3728, "step": 1619000 }, { "epoch": 10.959154395842356, "grad_norm": 0.3427871763706207, "learning_rate": 4.890408456041577e-05, "loss": 0.3722, "step": 1619500 }, { "epoch": 10.962537895192725, "grad_norm": 0.3547114133834839, "learning_rate": 4.8903746210480725e-05, "loss": 0.3725, "step": 1620000 }, { "epoch": 10.965921394543091, "grad_norm": 0.4285212457180023, "learning_rate": 4.8903407860545694e-05, "loss": 0.3727, "step": 1620500 }, { "epoch": 10.96930489389346, "grad_norm": 0.37522661685943604, "learning_rate": 4.8903069510610656e-05, "loss": 0.3704, "step": 1621000 }, { "epoch": 10.972688393243828, "grad_norm": 0.36373889446258545, "learning_rate": 4.890273116067562e-05, "loss": 0.3722, "step": 1621500 }, { "epoch": 10.976071892594197, "grad_norm": 0.3494463562965393, "learning_rate": 4.890239281074058e-05, "loss": 0.3731, "step": 1622000 }, { "epoch": 10.979455391944565, "grad_norm": 0.3906720280647278, "learning_rate": 4.890205446080555e-05, "loss": 0.3727, "step": 1622500 }, { "epoch": 10.982838891294932, "grad_norm": 0.3903609812259674, "learning_rate": 4.890171611087051e-05, "loss": 0.3704, "step": 1623000 }, { "epoch": 10.9862223906453, "grad_norm": 0.3798389434814453, "learning_rate": 4.890137776093547e-05, "loss": 0.3722, "step": 1623500 }, { "epoch": 10.98960588999567, "grad_norm": 0.37869197130203247, "learning_rate": 4.8901039411000435e-05, "loss": 0.3725, "step": 1624000 }, { "epoch": 10.992989389346038, "grad_norm": 0.3519867956638336, "learning_rate": 4.89007010610654e-05, "loss": 0.3725, "step": 1624500 }, { "epoch": 10.996372888696406, "grad_norm": 0.34879013895988464, "learning_rate": 4.890036271113036e-05, "loss": 0.3722, "step": 1625000 }, { "epoch": 10.999756388046773, "grad_norm": 0.3999994695186615, "learning_rate": 4.890002436119532e-05, "loss": 0.3723, "step": 1625500 }, { "epoch": 11.0, "eval_accuracy": 0.858302066261313, "eval_loss": 0.5760162472724915, "eval_runtime": 3383.38, "eval_samples_per_second": 85.933, "eval_steps_per_second": 5.371, "step": 1625536 }, { "epoch": 11.003139887397142, "grad_norm": 0.36553969979286194, "learning_rate": 4.889968601126029e-05, "loss": 0.3697, "step": 1626000 }, { "epoch": 11.00652338674751, "grad_norm": 0.34825971722602844, "learning_rate": 4.889934766132525e-05, "loss": 0.3716, "step": 1626500 }, { "epoch": 11.009906886097879, "grad_norm": 0.37449222803115845, "learning_rate": 4.8899009311390215e-05, "loss": 0.3699, "step": 1627000 }, { "epoch": 11.013290385448245, "grad_norm": 0.311702698469162, "learning_rate": 4.889867096145518e-05, "loss": 0.3691, "step": 1627500 }, { "epoch": 11.016673884798614, "grad_norm": 0.38622474670410156, "learning_rate": 4.8898332611520146e-05, "loss": 0.3706, "step": 1628000 }, { "epoch": 11.020057384148982, "grad_norm": 0.35131123661994934, "learning_rate": 4.889799426158511e-05, "loss": 0.37, "step": 1628500 }, { "epoch": 11.023440883499351, "grad_norm": 0.3682219684123993, "learning_rate": 4.889765591165007e-05, "loss": 0.3696, "step": 1629000 }, { "epoch": 11.026824382849718, "grad_norm": 0.3607659339904785, "learning_rate": 4.8897317561715025e-05, "loss": 0.3695, "step": 1629500 }, { "epoch": 11.030207882200086, "grad_norm": 0.34448304772377014, "learning_rate": 4.8896979211779994e-05, "loss": 0.3715, "step": 1630000 }, { "epoch": 11.033591381550455, "grad_norm": 0.37748998403549194, "learning_rate": 4.8896640861844956e-05, "loss": 0.3695, "step": 1630500 }, { "epoch": 11.036974880900823, "grad_norm": 0.3491244316101074, "learning_rate": 4.889630251190992e-05, "loss": 0.3704, "step": 1631000 }, { "epoch": 11.040358380251192, "grad_norm": 0.3739766478538513, "learning_rate": 4.889596416197488e-05, "loss": 0.3706, "step": 1631500 }, { "epoch": 11.043741879601559, "grad_norm": 0.3629169762134552, "learning_rate": 4.889562581203985e-05, "loss": 0.3715, "step": 1632000 }, { "epoch": 11.047125378951927, "grad_norm": 0.32953304052352905, "learning_rate": 4.889528746210481e-05, "loss": 0.3695, "step": 1632500 }, { "epoch": 11.050508878302296, "grad_norm": 0.37410807609558105, "learning_rate": 4.8894949112169774e-05, "loss": 0.3715, "step": 1633000 }, { "epoch": 11.053892377652664, "grad_norm": 0.33794400095939636, "learning_rate": 4.8894610762234736e-05, "loss": 0.3699, "step": 1633500 }, { "epoch": 11.05727587700303, "grad_norm": 0.3270277678966522, "learning_rate": 4.88942724122997e-05, "loss": 0.3697, "step": 1634000 }, { "epoch": 11.0606593763534, "grad_norm": 0.34896576404571533, "learning_rate": 4.889393406236466e-05, "loss": 0.3708, "step": 1634500 }, { "epoch": 11.064042875703768, "grad_norm": 0.38799527287483215, "learning_rate": 4.889359571242962e-05, "loss": 0.3724, "step": 1635000 }, { "epoch": 11.067426375054136, "grad_norm": 0.39045727252960205, "learning_rate": 4.889325736249459e-05, "loss": 0.3715, "step": 1635500 }, { "epoch": 11.070809874404505, "grad_norm": 0.40085986256599426, "learning_rate": 4.889291901255955e-05, "loss": 0.3709, "step": 1636000 }, { "epoch": 11.074193373754872, "grad_norm": 0.4043148159980774, "learning_rate": 4.8892580662624515e-05, "loss": 0.3709, "step": 1636500 }, { "epoch": 11.07757687310524, "grad_norm": 0.3465675413608551, "learning_rate": 4.889224231268948e-05, "loss": 0.3705, "step": 1637000 }, { "epoch": 11.080960372455609, "grad_norm": 0.3407261371612549, "learning_rate": 4.889190396275444e-05, "loss": 0.3705, "step": 1637500 }, { "epoch": 11.084343871805977, "grad_norm": 0.3841465711593628, "learning_rate": 4.889156561281941e-05, "loss": 0.371, "step": 1638000 }, { "epoch": 11.087727371156344, "grad_norm": 0.3267723619937897, "learning_rate": 4.889122726288437e-05, "loss": 0.3722, "step": 1638500 }, { "epoch": 11.091110870506713, "grad_norm": 0.32410913705825806, "learning_rate": 4.8890888912949326e-05, "loss": 0.3707, "step": 1639000 }, { "epoch": 11.094494369857081, "grad_norm": 0.3783022463321686, "learning_rate": 4.8890550563014295e-05, "loss": 0.3705, "step": 1639500 }, { "epoch": 11.09787786920745, "grad_norm": 0.39223384857177734, "learning_rate": 4.889021221307926e-05, "loss": 0.3699, "step": 1640000 }, { "epoch": 11.101261368557818, "grad_norm": 0.3522324562072754, "learning_rate": 4.888987386314422e-05, "loss": 0.3707, "step": 1640500 }, { "epoch": 11.104644867908185, "grad_norm": 0.33530494570732117, "learning_rate": 4.888953551320918e-05, "loss": 0.3698, "step": 1641000 }, { "epoch": 11.108028367258553, "grad_norm": 0.34049347043037415, "learning_rate": 4.888919716327415e-05, "loss": 0.3709, "step": 1641500 }, { "epoch": 11.111411866608922, "grad_norm": 0.34313058853149414, "learning_rate": 4.888885881333911e-05, "loss": 0.3716, "step": 1642000 }, { "epoch": 11.11479536595929, "grad_norm": 0.35914409160614014, "learning_rate": 4.8888520463404074e-05, "loss": 0.3692, "step": 1642500 }, { "epoch": 11.118178865309657, "grad_norm": 0.3764127194881439, "learning_rate": 4.8888182113469036e-05, "loss": 0.37, "step": 1643000 }, { "epoch": 11.121562364660026, "grad_norm": 0.3786813020706177, "learning_rate": 4.8887843763534e-05, "loss": 0.3704, "step": 1643500 }, { "epoch": 11.124945864010394, "grad_norm": 0.3405832052230835, "learning_rate": 4.888750541359896e-05, "loss": 0.3702, "step": 1644000 }, { "epoch": 11.128329363360763, "grad_norm": 0.36560699343681335, "learning_rate": 4.888716706366392e-05, "loss": 0.3723, "step": 1644500 }, { "epoch": 11.13171286271113, "grad_norm": 0.33541402220726013, "learning_rate": 4.8886828713728885e-05, "loss": 0.371, "step": 1645000 }, { "epoch": 11.135096362061498, "grad_norm": 0.3588182330131531, "learning_rate": 4.8886490363793854e-05, "loss": 0.3705, "step": 1645500 }, { "epoch": 11.138479861411867, "grad_norm": 0.36514315009117126, "learning_rate": 4.8886152013858816e-05, "loss": 0.3698, "step": 1646000 }, { "epoch": 11.141863360762235, "grad_norm": 0.35488948225975037, "learning_rate": 4.888581366392378e-05, "loss": 0.371, "step": 1646500 }, { "epoch": 11.145246860112604, "grad_norm": 0.3414875566959381, "learning_rate": 4.888547531398874e-05, "loss": 0.3706, "step": 1647000 }, { "epoch": 11.14863035946297, "grad_norm": 0.3390885889530182, "learning_rate": 4.888513696405371e-05, "loss": 0.3717, "step": 1647500 }, { "epoch": 11.152013858813339, "grad_norm": 0.38793033361434937, "learning_rate": 4.888479861411867e-05, "loss": 0.3706, "step": 1648000 }, { "epoch": 11.155397358163707, "grad_norm": 0.3770737946033478, "learning_rate": 4.8884460264183627e-05, "loss": 0.3713, "step": 1648500 }, { "epoch": 11.158780857514076, "grad_norm": 0.37044626474380493, "learning_rate": 4.8884121914248595e-05, "loss": 0.3705, "step": 1649000 }, { "epoch": 11.162164356864443, "grad_norm": 0.40374863147735596, "learning_rate": 4.888378356431356e-05, "loss": 0.3726, "step": 1649500 }, { "epoch": 11.165547856214811, "grad_norm": 0.348796546459198, "learning_rate": 4.888344521437852e-05, "loss": 0.3715, "step": 1650000 }, { "epoch": 11.16893135556518, "grad_norm": 0.4167865812778473, "learning_rate": 4.888310686444348e-05, "loss": 0.3737, "step": 1650500 }, { "epoch": 11.172314854915548, "grad_norm": 0.3556322753429413, "learning_rate": 4.888276851450845e-05, "loss": 0.3728, "step": 1651000 }, { "epoch": 11.175698354265917, "grad_norm": 0.3516223728656769, "learning_rate": 4.888243016457341e-05, "loss": 0.3697, "step": 1651500 }, { "epoch": 11.179081853616283, "grad_norm": 0.3844006657600403, "learning_rate": 4.8882091814638375e-05, "loss": 0.3704, "step": 1652000 }, { "epoch": 11.182465352966652, "grad_norm": 0.36370837688446045, "learning_rate": 4.888175346470334e-05, "loss": 0.3704, "step": 1652500 }, { "epoch": 11.18584885231702, "grad_norm": 0.37239259481430054, "learning_rate": 4.8881415114768306e-05, "loss": 0.3712, "step": 1653000 }, { "epoch": 11.189232351667389, "grad_norm": 0.365557998418808, "learning_rate": 4.888107676483326e-05, "loss": 0.3735, "step": 1653500 }, { "epoch": 11.192615851017756, "grad_norm": 0.38343822956085205, "learning_rate": 4.8880738414898223e-05, "loss": 0.3718, "step": 1654000 }, { "epoch": 11.195999350368124, "grad_norm": 0.33387985825538635, "learning_rate": 4.8880400064963186e-05, "loss": 0.37, "step": 1654500 }, { "epoch": 11.199382849718493, "grad_norm": 0.3623245656490326, "learning_rate": 4.8880061715028154e-05, "loss": 0.3706, "step": 1655000 }, { "epoch": 11.202766349068861, "grad_norm": 0.38692569732666016, "learning_rate": 4.8879723365093117e-05, "loss": 0.3708, "step": 1655500 }, { "epoch": 11.20614984841923, "grad_norm": 0.33788901567459106, "learning_rate": 4.887938501515808e-05, "loss": 0.3717, "step": 1656000 }, { "epoch": 11.209533347769597, "grad_norm": 0.3440932035446167, "learning_rate": 4.887904666522304e-05, "loss": 0.371, "step": 1656500 }, { "epoch": 11.212916847119965, "grad_norm": 0.3497539758682251, "learning_rate": 4.887870831528801e-05, "loss": 0.3715, "step": 1657000 }, { "epoch": 11.216300346470334, "grad_norm": 0.34109485149383545, "learning_rate": 4.887836996535297e-05, "loss": 0.3731, "step": 1657500 }, { "epoch": 11.219683845820702, "grad_norm": 0.36244407296180725, "learning_rate": 4.887803161541793e-05, "loss": 0.3711, "step": 1658000 }, { "epoch": 11.223067345171069, "grad_norm": 0.3530657887458801, "learning_rate": 4.8877693265482896e-05, "loss": 0.372, "step": 1658500 }, { "epoch": 11.226450844521437, "grad_norm": 0.3541402816772461, "learning_rate": 4.887735491554786e-05, "loss": 0.3708, "step": 1659000 }, { "epoch": 11.229834343871806, "grad_norm": 0.3628634512424469, "learning_rate": 4.887701656561282e-05, "loss": 0.3715, "step": 1659500 }, { "epoch": 11.233217843222175, "grad_norm": 0.3461693823337555, "learning_rate": 4.887667821567778e-05, "loss": 0.3711, "step": 1660000 }, { "epoch": 11.236601342572543, "grad_norm": 0.3520946204662323, "learning_rate": 4.887633986574275e-05, "loss": 0.3725, "step": 1660500 }, { "epoch": 11.23998484192291, "grad_norm": 0.3735743761062622, "learning_rate": 4.8876001515807713e-05, "loss": 0.3723, "step": 1661000 }, { "epoch": 11.243368341273278, "grad_norm": 0.37564146518707275, "learning_rate": 4.8875663165872676e-05, "loss": 0.3722, "step": 1661500 }, { "epoch": 11.246751840623647, "grad_norm": 0.38179755210876465, "learning_rate": 4.887532481593764e-05, "loss": 0.3722, "step": 1662000 }, { "epoch": 11.250135339974015, "grad_norm": 0.36789751052856445, "learning_rate": 4.8874986466002607e-05, "loss": 0.3715, "step": 1662500 }, { "epoch": 11.253518839324382, "grad_norm": 0.3178900182247162, "learning_rate": 4.887464811606756e-05, "loss": 0.3706, "step": 1663000 }, { "epoch": 11.25690233867475, "grad_norm": 0.37493330240249634, "learning_rate": 4.8874309766132524e-05, "loss": 0.3714, "step": 1663500 }, { "epoch": 11.26028583802512, "grad_norm": 0.33544522523880005, "learning_rate": 4.8873971416197486e-05, "loss": 0.3708, "step": 1664000 }, { "epoch": 11.263669337375488, "grad_norm": 0.34790122509002686, "learning_rate": 4.8873633066262455e-05, "loss": 0.3708, "step": 1664500 }, { "epoch": 11.267052836725856, "grad_norm": 0.3806302845478058, "learning_rate": 4.887329471632742e-05, "loss": 0.3706, "step": 1665000 }, { "epoch": 11.270436336076223, "grad_norm": 0.3643375039100647, "learning_rate": 4.887295636639238e-05, "loss": 0.3704, "step": 1665500 }, { "epoch": 11.273819835426591, "grad_norm": 0.41378292441368103, "learning_rate": 4.887261801645734e-05, "loss": 0.3714, "step": 1666000 }, { "epoch": 11.27720333477696, "grad_norm": 0.3954533338546753, "learning_rate": 4.887227966652231e-05, "loss": 0.3712, "step": 1666500 }, { "epoch": 11.280586834127329, "grad_norm": 0.41279372572898865, "learning_rate": 4.887194131658727e-05, "loss": 0.3711, "step": 1667000 }, { "epoch": 11.283970333477695, "grad_norm": 0.32453882694244385, "learning_rate": 4.887160296665223e-05, "loss": 0.3712, "step": 1667500 }, { "epoch": 11.287353832828064, "grad_norm": 0.35816478729248047, "learning_rate": 4.88712646167172e-05, "loss": 0.371, "step": 1668000 }, { "epoch": 11.290737332178432, "grad_norm": 0.3419073820114136, "learning_rate": 4.887092626678216e-05, "loss": 0.3713, "step": 1668500 }, { "epoch": 11.2941208315288, "grad_norm": 0.41529932618141174, "learning_rate": 4.887058791684712e-05, "loss": 0.3701, "step": 1669000 }, { "epoch": 11.297504330879168, "grad_norm": 0.34597066044807434, "learning_rate": 4.887024956691208e-05, "loss": 0.3734, "step": 1669500 }, { "epoch": 11.300887830229536, "grad_norm": 0.3705573081970215, "learning_rate": 4.886991121697705e-05, "loss": 0.3718, "step": 1670000 }, { "epoch": 11.304271329579905, "grad_norm": 0.38146474957466125, "learning_rate": 4.8869572867042014e-05, "loss": 0.3713, "step": 1670500 }, { "epoch": 11.307654828930273, "grad_norm": 0.3561438024044037, "learning_rate": 4.8869234517106976e-05, "loss": 0.3715, "step": 1671000 }, { "epoch": 11.311038328280642, "grad_norm": 0.39705178141593933, "learning_rate": 4.886889616717194e-05, "loss": 0.3707, "step": 1671500 }, { "epoch": 11.314421827631008, "grad_norm": 0.35090193152427673, "learning_rate": 4.886855781723691e-05, "loss": 0.3716, "step": 1672000 }, { "epoch": 11.317805326981377, "grad_norm": 0.3633898198604584, "learning_rate": 4.886821946730186e-05, "loss": 0.3712, "step": 1672500 }, { "epoch": 11.321188826331746, "grad_norm": 0.3485035300254822, "learning_rate": 4.8867881117366825e-05, "loss": 0.3711, "step": 1673000 }, { "epoch": 11.324572325682114, "grad_norm": 0.38166773319244385, "learning_rate": 4.886754276743179e-05, "loss": 0.3707, "step": 1673500 }, { "epoch": 11.32795582503248, "grad_norm": 0.37450724840164185, "learning_rate": 4.8867204417496756e-05, "loss": 0.3699, "step": 1674000 }, { "epoch": 11.33133932438285, "grad_norm": 0.3536973297595978, "learning_rate": 4.886686606756172e-05, "loss": 0.3733, "step": 1674500 }, { "epoch": 11.334722823733218, "grad_norm": 0.3306363821029663, "learning_rate": 4.886652771762668e-05, "loss": 0.3714, "step": 1675000 }, { "epoch": 11.338106323083586, "grad_norm": 0.3833313286304474, "learning_rate": 4.886618936769164e-05, "loss": 0.3705, "step": 1675500 }, { "epoch": 11.341489822433955, "grad_norm": 0.3853326439857483, "learning_rate": 4.886585101775661e-05, "loss": 0.371, "step": 1676000 }, { "epoch": 11.344873321784322, "grad_norm": 0.34179025888442993, "learning_rate": 4.886551266782157e-05, "loss": 0.3721, "step": 1676500 }, { "epoch": 11.34825682113469, "grad_norm": 0.36327168345451355, "learning_rate": 4.886517431788653e-05, "loss": 0.372, "step": 1677000 }, { "epoch": 11.351640320485059, "grad_norm": 0.364208847284317, "learning_rate": 4.88648359679515e-05, "loss": 0.3731, "step": 1677500 }, { "epoch": 11.355023819835427, "grad_norm": 0.3499656319618225, "learning_rate": 4.886449761801646e-05, "loss": 0.3723, "step": 1678000 }, { "epoch": 11.358407319185794, "grad_norm": 0.3366363048553467, "learning_rate": 4.886415926808142e-05, "loss": 0.372, "step": 1678500 }, { "epoch": 11.361790818536162, "grad_norm": 0.4245286285877228, "learning_rate": 4.8863820918146384e-05, "loss": 0.3716, "step": 1679000 }, { "epoch": 11.365174317886531, "grad_norm": 0.36462846398353577, "learning_rate": 4.886348256821135e-05, "loss": 0.3728, "step": 1679500 }, { "epoch": 11.3685578172369, "grad_norm": 0.36596938967704773, "learning_rate": 4.8863144218276315e-05, "loss": 0.3712, "step": 1680000 }, { "epoch": 11.371941316587268, "grad_norm": 0.37903404235839844, "learning_rate": 4.886280586834128e-05, "loss": 0.3726, "step": 1680500 }, { "epoch": 11.375324815937635, "grad_norm": 0.3716667890548706, "learning_rate": 4.886246751840624e-05, "loss": 0.3714, "step": 1681000 }, { "epoch": 11.378708315288003, "grad_norm": 0.3672736585140228, "learning_rate": 4.886212916847121e-05, "loss": 0.3708, "step": 1681500 }, { "epoch": 11.382091814638372, "grad_norm": 0.35480719804763794, "learning_rate": 4.886179081853616e-05, "loss": 0.3703, "step": 1682000 }, { "epoch": 11.38547531398874, "grad_norm": 0.33838438987731934, "learning_rate": 4.8861452468601125e-05, "loss": 0.372, "step": 1682500 }, { "epoch": 11.388858813339107, "grad_norm": 0.38470542430877686, "learning_rate": 4.886111411866609e-05, "loss": 0.3714, "step": 1683000 }, { "epoch": 11.392242312689476, "grad_norm": 0.3572859466075897, "learning_rate": 4.8860775768731056e-05, "loss": 0.3713, "step": 1683500 }, { "epoch": 11.395625812039844, "grad_norm": 0.32588905096054077, "learning_rate": 4.886043741879602e-05, "loss": 0.3712, "step": 1684000 }, { "epoch": 11.399009311390213, "grad_norm": 0.3401735723018646, "learning_rate": 4.886009906886098e-05, "loss": 0.3704, "step": 1684500 }, { "epoch": 11.402392810740581, "grad_norm": 0.3463999330997467, "learning_rate": 4.885976071892594e-05, "loss": 0.3734, "step": 1685000 }, { "epoch": 11.405776310090948, "grad_norm": 0.36956986784935, "learning_rate": 4.885942236899091e-05, "loss": 0.3727, "step": 1685500 }, { "epoch": 11.409159809441316, "grad_norm": 0.3834473788738251, "learning_rate": 4.8859084019055874e-05, "loss": 0.3705, "step": 1686000 }, { "epoch": 11.412543308791685, "grad_norm": 0.3418712615966797, "learning_rate": 4.885874566912083e-05, "loss": 0.3707, "step": 1686500 }, { "epoch": 11.415926808142054, "grad_norm": 0.3438604474067688, "learning_rate": 4.88584073191858e-05, "loss": 0.3725, "step": 1687000 }, { "epoch": 11.41931030749242, "grad_norm": 0.3532719314098358, "learning_rate": 4.885806896925076e-05, "loss": 0.3735, "step": 1687500 }, { "epoch": 11.422693806842789, "grad_norm": 0.357460618019104, "learning_rate": 4.885773061931572e-05, "loss": 0.371, "step": 1688000 }, { "epoch": 11.426077306193157, "grad_norm": 0.33926403522491455, "learning_rate": 4.8857392269380684e-05, "loss": 0.3711, "step": 1688500 }, { "epoch": 11.429460805543526, "grad_norm": 0.3649449944496155, "learning_rate": 4.885705391944565e-05, "loss": 0.372, "step": 1689000 }, { "epoch": 11.432844304893894, "grad_norm": 0.38523754477500916, "learning_rate": 4.8856715569510615e-05, "loss": 0.3724, "step": 1689500 }, { "epoch": 11.436227804244261, "grad_norm": 0.37194523215293884, "learning_rate": 4.885637721957558e-05, "loss": 0.3707, "step": 1690000 }, { "epoch": 11.43961130359463, "grad_norm": 0.32036304473876953, "learning_rate": 4.885603886964054e-05, "loss": 0.3721, "step": 1690500 }, { "epoch": 11.442994802944998, "grad_norm": 0.3410488963127136, "learning_rate": 4.88557005197055e-05, "loss": 0.3706, "step": 1691000 }, { "epoch": 11.446378302295367, "grad_norm": 0.3249223530292511, "learning_rate": 4.8855362169770464e-05, "loss": 0.3708, "step": 1691500 }, { "epoch": 11.449761801645733, "grad_norm": 0.3791833817958832, "learning_rate": 4.8855023819835426e-05, "loss": 0.3709, "step": 1692000 }, { "epoch": 11.453145300996102, "grad_norm": 0.3641836643218994, "learning_rate": 4.885468546990039e-05, "loss": 0.3703, "step": 1692500 }, { "epoch": 11.45652880034647, "grad_norm": 0.40123745799064636, "learning_rate": 4.885434711996536e-05, "loss": 0.3725, "step": 1693000 }, { "epoch": 11.459912299696839, "grad_norm": 0.3453229069709778, "learning_rate": 4.885400877003032e-05, "loss": 0.3713, "step": 1693500 }, { "epoch": 11.463295799047206, "grad_norm": 0.3665013313293457, "learning_rate": 4.885367042009528e-05, "loss": 0.3718, "step": 1694000 }, { "epoch": 11.466679298397574, "grad_norm": 0.3658900856971741, "learning_rate": 4.885333207016024e-05, "loss": 0.373, "step": 1694500 }, { "epoch": 11.470062797747943, "grad_norm": 0.3546628952026367, "learning_rate": 4.885299372022521e-05, "loss": 0.3725, "step": 1695000 }, { "epoch": 11.473446297098311, "grad_norm": 0.3913738429546356, "learning_rate": 4.8852655370290174e-05, "loss": 0.3723, "step": 1695500 }, { "epoch": 11.47682979644868, "grad_norm": 0.36838194727897644, "learning_rate": 4.885231702035513e-05, "loss": 0.3711, "step": 1696000 }, { "epoch": 11.480213295799047, "grad_norm": 0.36029133200645447, "learning_rate": 4.88519786704201e-05, "loss": 0.3716, "step": 1696500 }, { "epoch": 11.483596795149415, "grad_norm": 0.39833101630210876, "learning_rate": 4.885164032048506e-05, "loss": 0.3706, "step": 1697000 }, { "epoch": 11.486980294499784, "grad_norm": 0.3916868567466736, "learning_rate": 4.885130197055002e-05, "loss": 0.3719, "step": 1697500 }, { "epoch": 11.490363793850152, "grad_norm": 0.36675065755844116, "learning_rate": 4.8850963620614985e-05, "loss": 0.3718, "step": 1698000 }, { "epoch": 11.493747293200519, "grad_norm": 0.3780879080295563, "learning_rate": 4.8850625270679954e-05, "loss": 0.3713, "step": 1698500 }, { "epoch": 11.497130792550887, "grad_norm": 0.3343255817890167, "learning_rate": 4.8850286920744916e-05, "loss": 0.3729, "step": 1699000 }, { "epoch": 11.500514291901256, "grad_norm": 0.35851815342903137, "learning_rate": 4.884994857080988e-05, "loss": 0.3722, "step": 1699500 }, { "epoch": 11.503897791251624, "grad_norm": 0.3488004505634308, "learning_rate": 4.884961022087484e-05, "loss": 0.3721, "step": 1700000 }, { "epoch": 11.507281290601993, "grad_norm": 0.3857555091381073, "learning_rate": 4.88492718709398e-05, "loss": 0.3707, "step": 1700500 }, { "epoch": 11.51066478995236, "grad_norm": 0.36618277430534363, "learning_rate": 4.8848933521004764e-05, "loss": 0.371, "step": 1701000 }, { "epoch": 11.514048289302728, "grad_norm": 0.36315736174583435, "learning_rate": 4.8848595171069727e-05, "loss": 0.3732, "step": 1701500 }, { "epoch": 11.517431788653097, "grad_norm": 0.3206866681575775, "learning_rate": 4.884825682113469e-05, "loss": 0.3705, "step": 1702000 }, { "epoch": 11.520815288003465, "grad_norm": 0.3441249430179596, "learning_rate": 4.884791847119966e-05, "loss": 0.3719, "step": 1702500 }, { "epoch": 11.524198787353832, "grad_norm": 0.32257530093193054, "learning_rate": 4.884758012126462e-05, "loss": 0.3704, "step": 1703000 }, { "epoch": 11.5275822867042, "grad_norm": 0.33246949315071106, "learning_rate": 4.884724177132958e-05, "loss": 0.3702, "step": 1703500 }, { "epoch": 11.53096578605457, "grad_norm": 0.3414853811264038, "learning_rate": 4.8846903421394544e-05, "loss": 0.372, "step": 1704000 }, { "epoch": 11.534349285404938, "grad_norm": 0.3186386525630951, "learning_rate": 4.884656507145951e-05, "loss": 0.3718, "step": 1704500 }, { "epoch": 11.537732784755306, "grad_norm": 0.3396872580051422, "learning_rate": 4.8846226721524475e-05, "loss": 0.3716, "step": 1705000 }, { "epoch": 11.541116284105673, "grad_norm": 0.3399006128311157, "learning_rate": 4.884588837158944e-05, "loss": 0.3719, "step": 1705500 }, { "epoch": 11.544499783456041, "grad_norm": 0.3655692934989929, "learning_rate": 4.88455500216544e-05, "loss": 0.372, "step": 1706000 }, { "epoch": 11.54788328280641, "grad_norm": 0.3565117418766022, "learning_rate": 4.884521167171936e-05, "loss": 0.3713, "step": 1706500 }, { "epoch": 11.551266782156778, "grad_norm": 0.34369155764579773, "learning_rate": 4.8844873321784323e-05, "loss": 0.3721, "step": 1707000 }, { "epoch": 11.554650281507145, "grad_norm": 0.41790443658828735, "learning_rate": 4.8844534971849286e-05, "loss": 0.3726, "step": 1707500 }, { "epoch": 11.558033780857514, "grad_norm": 0.4061517119407654, "learning_rate": 4.884419662191425e-05, "loss": 0.3713, "step": 1708000 }, { "epoch": 11.561417280207882, "grad_norm": 0.3356378376483917, "learning_rate": 4.884385827197922e-05, "loss": 0.3708, "step": 1708500 }, { "epoch": 11.56480077955825, "grad_norm": 0.3446858525276184, "learning_rate": 4.884351992204418e-05, "loss": 0.3715, "step": 1709000 }, { "epoch": 11.568184278908618, "grad_norm": 0.3577991724014282, "learning_rate": 4.884318157210914e-05, "loss": 0.3716, "step": 1709500 }, { "epoch": 11.571567778258986, "grad_norm": 0.3579697608947754, "learning_rate": 4.88428432221741e-05, "loss": 0.3712, "step": 1710000 }, { "epoch": 11.574951277609355, "grad_norm": 0.3684149980545044, "learning_rate": 4.8842504872239065e-05, "loss": 0.3732, "step": 1710500 }, { "epoch": 11.578334776959723, "grad_norm": 0.4148857593536377, "learning_rate": 4.884216652230403e-05, "loss": 0.3725, "step": 1711000 }, { "epoch": 11.581718276310092, "grad_norm": 0.4042187035083771, "learning_rate": 4.884182817236899e-05, "loss": 0.3724, "step": 1711500 }, { "epoch": 11.585101775660458, "grad_norm": 0.3650857210159302, "learning_rate": 4.884148982243396e-05, "loss": 0.373, "step": 1712000 }, { "epoch": 11.588485275010827, "grad_norm": 0.35698583722114563, "learning_rate": 4.884115147249892e-05, "loss": 0.3733, "step": 1712500 }, { "epoch": 11.591868774361195, "grad_norm": 0.3720245957374573, "learning_rate": 4.884081312256388e-05, "loss": 0.37, "step": 1713000 }, { "epoch": 11.595252273711564, "grad_norm": 0.37715089321136475, "learning_rate": 4.8840474772628845e-05, "loss": 0.3719, "step": 1713500 }, { "epoch": 11.598635773061932, "grad_norm": 0.3547378182411194, "learning_rate": 4.8840136422693814e-05, "loss": 0.3718, "step": 1714000 }, { "epoch": 11.6020192724123, "grad_norm": 0.3440019488334656, "learning_rate": 4.8839798072758776e-05, "loss": 0.3722, "step": 1714500 }, { "epoch": 11.605402771762668, "grad_norm": 0.3711560368537903, "learning_rate": 4.883945972282374e-05, "loss": 0.3722, "step": 1715000 }, { "epoch": 11.608786271113036, "grad_norm": 0.35923251509666443, "learning_rate": 4.883912137288869e-05, "loss": 0.3708, "step": 1715500 }, { "epoch": 11.612169770463405, "grad_norm": 0.35365545749664307, "learning_rate": 4.883878302295366e-05, "loss": 0.3718, "step": 1716000 }, { "epoch": 11.615553269813772, "grad_norm": 0.3428800404071808, "learning_rate": 4.8838444673018624e-05, "loss": 0.3735, "step": 1716500 }, { "epoch": 11.61893676916414, "grad_norm": 0.315690279006958, "learning_rate": 4.8838106323083586e-05, "loss": 0.3708, "step": 1717000 }, { "epoch": 11.622320268514509, "grad_norm": 0.38900619745254517, "learning_rate": 4.883776797314855e-05, "loss": 0.3715, "step": 1717500 }, { "epoch": 11.625703767864877, "grad_norm": 0.3546205759048462, "learning_rate": 4.883742962321352e-05, "loss": 0.3719, "step": 1718000 }, { "epoch": 11.629087267215244, "grad_norm": 0.30673447251319885, "learning_rate": 4.883709127327848e-05, "loss": 0.3718, "step": 1718500 }, { "epoch": 11.632470766565612, "grad_norm": 0.35959509015083313, "learning_rate": 4.883675292334344e-05, "loss": 0.3722, "step": 1719000 }, { "epoch": 11.635854265915981, "grad_norm": 0.4214160144329071, "learning_rate": 4.8836414573408404e-05, "loss": 0.3709, "step": 1719500 }, { "epoch": 11.63923776526635, "grad_norm": 0.35455048084259033, "learning_rate": 4.8836076223473366e-05, "loss": 0.3714, "step": 1720000 }, { "epoch": 11.642621264616718, "grad_norm": 0.3674977719783783, "learning_rate": 4.883573787353833e-05, "loss": 0.3718, "step": 1720500 }, { "epoch": 11.646004763967085, "grad_norm": 0.37586429715156555, "learning_rate": 4.883539952360329e-05, "loss": 0.3712, "step": 1721000 }, { "epoch": 11.649388263317453, "grad_norm": 0.3308994174003601, "learning_rate": 4.883506117366826e-05, "loss": 0.3734, "step": 1721500 }, { "epoch": 11.652771762667822, "grad_norm": 0.3521401584148407, "learning_rate": 4.883472282373322e-05, "loss": 0.3717, "step": 1722000 }, { "epoch": 11.65615526201819, "grad_norm": 0.3826388418674469, "learning_rate": 4.883438447379818e-05, "loss": 0.3708, "step": 1722500 }, { "epoch": 11.659538761368557, "grad_norm": 0.39649340510368347, "learning_rate": 4.8834046123863145e-05, "loss": 0.3728, "step": 1723000 }, { "epoch": 11.662922260718926, "grad_norm": 0.37863731384277344, "learning_rate": 4.8833707773928114e-05, "loss": 0.371, "step": 1723500 }, { "epoch": 11.666305760069294, "grad_norm": 0.3028692603111267, "learning_rate": 4.8833369423993076e-05, "loss": 0.3713, "step": 1724000 }, { "epoch": 11.669689259419663, "grad_norm": 0.34876948595046997, "learning_rate": 4.883303107405804e-05, "loss": 0.3729, "step": 1724500 }, { "epoch": 11.673072758770031, "grad_norm": 0.3666656017303467, "learning_rate": 4.8832692724122994e-05, "loss": 0.3699, "step": 1725000 }, { "epoch": 11.676456258120398, "grad_norm": 0.344465047121048, "learning_rate": 4.883235437418796e-05, "loss": 0.372, "step": 1725500 }, { "epoch": 11.679839757470766, "grad_norm": 0.3424505591392517, "learning_rate": 4.8832016024252925e-05, "loss": 0.3698, "step": 1726000 }, { "epoch": 11.683223256821135, "grad_norm": 0.4021340608596802, "learning_rate": 4.883167767431789e-05, "loss": 0.371, "step": 1726500 }, { "epoch": 11.686606756171503, "grad_norm": 0.3761267364025116, "learning_rate": 4.883133932438285e-05, "loss": 0.3728, "step": 1727000 }, { "epoch": 11.68999025552187, "grad_norm": 0.3646292984485626, "learning_rate": 4.883100097444782e-05, "loss": 0.3714, "step": 1727500 }, { "epoch": 11.693373754872239, "grad_norm": 0.352817565202713, "learning_rate": 4.883066262451278e-05, "loss": 0.372, "step": 1728000 }, { "epoch": 11.696757254222607, "grad_norm": 0.35847803950309753, "learning_rate": 4.883032427457774e-05, "loss": 0.3699, "step": 1728500 }, { "epoch": 11.700140753572976, "grad_norm": 0.3726541996002197, "learning_rate": 4.8829985924642704e-05, "loss": 0.3708, "step": 1729000 }, { "epoch": 11.703524252923344, "grad_norm": 0.346699595451355, "learning_rate": 4.8829647574707666e-05, "loss": 0.3725, "step": 1729500 }, { "epoch": 11.706907752273711, "grad_norm": 0.359237939119339, "learning_rate": 4.882930922477263e-05, "loss": 0.3721, "step": 1730000 }, { "epoch": 11.71029125162408, "grad_norm": 0.393034964799881, "learning_rate": 4.882897087483759e-05, "loss": 0.3709, "step": 1730500 }, { "epoch": 11.713674750974448, "grad_norm": 0.3590317964553833, "learning_rate": 4.882863252490256e-05, "loss": 0.3721, "step": 1731000 }, { "epoch": 11.717058250324817, "grad_norm": 0.3669711947441101, "learning_rate": 4.882829417496752e-05, "loss": 0.3733, "step": 1731500 }, { "epoch": 11.720441749675183, "grad_norm": 0.36638450622558594, "learning_rate": 4.8827955825032484e-05, "loss": 0.3724, "step": 1732000 }, { "epoch": 11.723825249025552, "grad_norm": 0.3468673527240753, "learning_rate": 4.8827617475097446e-05, "loss": 0.3707, "step": 1732500 }, { "epoch": 11.72720874837592, "grad_norm": 0.3468047082424164, "learning_rate": 4.8827279125162415e-05, "loss": 0.3715, "step": 1733000 }, { "epoch": 11.730592247726289, "grad_norm": 0.3381323516368866, "learning_rate": 4.882694077522738e-05, "loss": 0.373, "step": 1733500 }, { "epoch": 11.733975747076656, "grad_norm": 0.33758747577667236, "learning_rate": 4.882660242529234e-05, "loss": 0.3721, "step": 1734000 }, { "epoch": 11.737359246427024, "grad_norm": 0.33760327100753784, "learning_rate": 4.8826264075357294e-05, "loss": 0.3705, "step": 1734500 }, { "epoch": 11.740742745777393, "grad_norm": 0.36903616786003113, "learning_rate": 4.882592572542226e-05, "loss": 0.3721, "step": 1735000 }, { "epoch": 11.744126245127761, "grad_norm": 0.37238264083862305, "learning_rate": 4.8825587375487225e-05, "loss": 0.3708, "step": 1735500 }, { "epoch": 11.74750974447813, "grad_norm": 0.31929102540016174, "learning_rate": 4.882524902555219e-05, "loss": 0.3725, "step": 1736000 }, { "epoch": 11.750893243828497, "grad_norm": 0.38301795721054077, "learning_rate": 4.882491067561715e-05, "loss": 0.3721, "step": 1736500 }, { "epoch": 11.754276743178865, "grad_norm": 0.3501420319080353, "learning_rate": 4.882457232568212e-05, "loss": 0.3718, "step": 1737000 }, { "epoch": 11.757660242529234, "grad_norm": 0.35756149888038635, "learning_rate": 4.882423397574708e-05, "loss": 0.372, "step": 1737500 }, { "epoch": 11.761043741879602, "grad_norm": 0.3879341185092926, "learning_rate": 4.882389562581204e-05, "loss": 0.3718, "step": 1738000 }, { "epoch": 11.764427241229969, "grad_norm": 0.3338926434516907, "learning_rate": 4.8823557275877005e-05, "loss": 0.3717, "step": 1738500 }, { "epoch": 11.767810740580337, "grad_norm": 0.35573339462280273, "learning_rate": 4.882321892594197e-05, "loss": 0.372, "step": 1739000 }, { "epoch": 11.771194239930706, "grad_norm": 0.3486931025981903, "learning_rate": 4.882288057600693e-05, "loss": 0.3709, "step": 1739500 }, { "epoch": 11.774577739281074, "grad_norm": 0.37727949023246765, "learning_rate": 4.882254222607189e-05, "loss": 0.3716, "step": 1740000 }, { "epoch": 11.777961238631443, "grad_norm": 0.35868674516677856, "learning_rate": 4.882220387613686e-05, "loss": 0.3731, "step": 1740500 }, { "epoch": 11.78134473798181, "grad_norm": 0.3462320566177368, "learning_rate": 4.882186552620182e-05, "loss": 0.3716, "step": 1741000 }, { "epoch": 11.784728237332178, "grad_norm": 0.39170634746551514, "learning_rate": 4.8821527176266784e-05, "loss": 0.3712, "step": 1741500 }, { "epoch": 11.788111736682547, "grad_norm": 0.3877205550670624, "learning_rate": 4.8821188826331747e-05, "loss": 0.3694, "step": 1742000 }, { "epoch": 11.791495236032915, "grad_norm": 0.3896919786930084, "learning_rate": 4.8820850476396715e-05, "loss": 0.3738, "step": 1742500 }, { "epoch": 11.794878735383282, "grad_norm": 0.34968894720077515, "learning_rate": 4.882051212646168e-05, "loss": 0.3705, "step": 1743000 }, { "epoch": 11.79826223473365, "grad_norm": 0.3168765604496002, "learning_rate": 4.882017377652664e-05, "loss": 0.3731, "step": 1743500 }, { "epoch": 11.801645734084019, "grad_norm": 0.39077892899513245, "learning_rate": 4.8819835426591595e-05, "loss": 0.3723, "step": 1744000 }, { "epoch": 11.805029233434388, "grad_norm": 0.3569541871547699, "learning_rate": 4.8819497076656564e-05, "loss": 0.3717, "step": 1744500 }, { "epoch": 11.808412732784756, "grad_norm": 0.3633823096752167, "learning_rate": 4.8819158726721526e-05, "loss": 0.3718, "step": 1745000 }, { "epoch": 11.811796232135123, "grad_norm": 0.365784615278244, "learning_rate": 4.881882037678649e-05, "loss": 0.3716, "step": 1745500 }, { "epoch": 11.815179731485491, "grad_norm": 0.35726240277290344, "learning_rate": 4.881848202685145e-05, "loss": 0.373, "step": 1746000 }, { "epoch": 11.81856323083586, "grad_norm": 0.3863852322101593, "learning_rate": 4.881814367691642e-05, "loss": 0.3708, "step": 1746500 }, { "epoch": 11.821946730186228, "grad_norm": 0.3824382722377777, "learning_rate": 4.881780532698138e-05, "loss": 0.3714, "step": 1747000 }, { "epoch": 11.825330229536595, "grad_norm": 0.37908515334129333, "learning_rate": 4.8817466977046343e-05, "loss": 0.3708, "step": 1747500 }, { "epoch": 11.828713728886964, "grad_norm": 0.3562980890274048, "learning_rate": 4.8817128627111306e-05, "loss": 0.3717, "step": 1748000 }, { "epoch": 11.832097228237332, "grad_norm": 0.3647066652774811, "learning_rate": 4.881679027717627e-05, "loss": 0.3721, "step": 1748500 }, { "epoch": 11.8354807275877, "grad_norm": 0.36721131205558777, "learning_rate": 4.881645192724123e-05, "loss": 0.3717, "step": 1749000 }, { "epoch": 11.838864226938068, "grad_norm": 0.36937791109085083, "learning_rate": 4.881611357730619e-05, "loss": 0.3704, "step": 1749500 }, { "epoch": 11.842247726288436, "grad_norm": 0.3358730673789978, "learning_rate": 4.881577522737116e-05, "loss": 0.3716, "step": 1750000 }, { "epoch": 11.845631225638805, "grad_norm": 0.38302385807037354, "learning_rate": 4.881543687743612e-05, "loss": 0.3718, "step": 1750500 }, { "epoch": 11.849014724989173, "grad_norm": 0.3572796881198883, "learning_rate": 4.8815098527501085e-05, "loss": 0.3709, "step": 1751000 }, { "epoch": 11.852398224339542, "grad_norm": 0.34055548906326294, "learning_rate": 4.881476017756605e-05, "loss": 0.3715, "step": 1751500 }, { "epoch": 11.855781723689908, "grad_norm": 0.3258049190044403, "learning_rate": 4.8814421827631016e-05, "loss": 0.3711, "step": 1752000 }, { "epoch": 11.859165223040277, "grad_norm": 0.35258248448371887, "learning_rate": 4.881408347769598e-05, "loss": 0.3707, "step": 1752500 }, { "epoch": 11.862548722390645, "grad_norm": 0.35839393734931946, "learning_rate": 4.881374512776094e-05, "loss": 0.3718, "step": 1753000 }, { "epoch": 11.865932221741014, "grad_norm": 0.3616470992565155, "learning_rate": 4.8813406777825896e-05, "loss": 0.3721, "step": 1753500 }, { "epoch": 11.869315721091382, "grad_norm": 0.36866047978401184, "learning_rate": 4.8813068427890865e-05, "loss": 0.373, "step": 1754000 }, { "epoch": 11.87269922044175, "grad_norm": 0.3095003068447113, "learning_rate": 4.881273007795583e-05, "loss": 0.3714, "step": 1754500 }, { "epoch": 11.876082719792118, "grad_norm": 0.3345845341682434, "learning_rate": 4.881239172802079e-05, "loss": 0.3733, "step": 1755000 }, { "epoch": 11.879466219142486, "grad_norm": 0.33528879284858704, "learning_rate": 4.881205337808575e-05, "loss": 0.3718, "step": 1755500 }, { "epoch": 11.882849718492855, "grad_norm": 0.3436717092990875, "learning_rate": 4.881171502815072e-05, "loss": 0.3724, "step": 1756000 }, { "epoch": 11.886233217843222, "grad_norm": 0.36002013087272644, "learning_rate": 4.881137667821568e-05, "loss": 0.3713, "step": 1756500 }, { "epoch": 11.88961671719359, "grad_norm": 0.37779897451400757, "learning_rate": 4.8811038328280644e-05, "loss": 0.3703, "step": 1757000 }, { "epoch": 11.893000216543959, "grad_norm": 0.34598466753959656, "learning_rate": 4.8810699978345606e-05, "loss": 0.3713, "step": 1757500 }, { "epoch": 11.896383715894327, "grad_norm": 0.34416958689689636, "learning_rate": 4.881036162841057e-05, "loss": 0.3693, "step": 1758000 }, { "epoch": 11.899767215244694, "grad_norm": 0.3440912067890167, "learning_rate": 4.881002327847553e-05, "loss": 0.3707, "step": 1758500 }, { "epoch": 11.903150714595062, "grad_norm": 0.34557539224624634, "learning_rate": 4.880968492854049e-05, "loss": 0.3724, "step": 1759000 }, { "epoch": 11.90653421394543, "grad_norm": 0.3842761516571045, "learning_rate": 4.880934657860546e-05, "loss": 0.3726, "step": 1759500 }, { "epoch": 11.9099177132958, "grad_norm": 0.37416863441467285, "learning_rate": 4.8809008228670424e-05, "loss": 0.3715, "step": 1760000 }, { "epoch": 11.913301212646168, "grad_norm": 0.3516014516353607, "learning_rate": 4.8808669878735386e-05, "loss": 0.3728, "step": 1760500 }, { "epoch": 11.916684711996535, "grad_norm": 0.36235132813453674, "learning_rate": 4.880833152880035e-05, "loss": 0.3712, "step": 1761000 }, { "epoch": 11.920068211346903, "grad_norm": 0.34364044666290283, "learning_rate": 4.880799317886532e-05, "loss": 0.3719, "step": 1761500 }, { "epoch": 11.923451710697272, "grad_norm": 0.380653977394104, "learning_rate": 4.880765482893028e-05, "loss": 0.3713, "step": 1762000 }, { "epoch": 11.92683521004764, "grad_norm": 0.40505051612854004, "learning_rate": 4.880731647899524e-05, "loss": 0.3727, "step": 1762500 }, { "epoch": 11.930218709398007, "grad_norm": 0.3548690676689148, "learning_rate": 4.8806978129060196e-05, "loss": 0.3725, "step": 1763000 }, { "epoch": 11.933602208748376, "grad_norm": 0.34525832533836365, "learning_rate": 4.8806639779125165e-05, "loss": 0.3727, "step": 1763500 }, { "epoch": 11.936985708098744, "grad_norm": 0.3858516216278076, "learning_rate": 4.880630142919013e-05, "loss": 0.3726, "step": 1764000 }, { "epoch": 11.940369207449113, "grad_norm": 0.3259793817996979, "learning_rate": 4.880596307925509e-05, "loss": 0.3715, "step": 1764500 }, { "epoch": 11.943752706799481, "grad_norm": 0.3813071846961975, "learning_rate": 4.880562472932005e-05, "loss": 0.374, "step": 1765000 }, { "epoch": 11.947136206149848, "grad_norm": 0.379879355430603, "learning_rate": 4.880528637938502e-05, "loss": 0.3727, "step": 1765500 }, { "epoch": 11.950519705500216, "grad_norm": 0.35107338428497314, "learning_rate": 4.880494802944998e-05, "loss": 0.3722, "step": 1766000 }, { "epoch": 11.953903204850585, "grad_norm": 0.33819907903671265, "learning_rate": 4.8804609679514945e-05, "loss": 0.3712, "step": 1766500 }, { "epoch": 11.957286704200953, "grad_norm": 0.37626633048057556, "learning_rate": 4.880427132957991e-05, "loss": 0.3712, "step": 1767000 }, { "epoch": 11.96067020355132, "grad_norm": 0.3538345694541931, "learning_rate": 4.8803932979644876e-05, "loss": 0.3704, "step": 1767500 }, { "epoch": 11.964053702901689, "grad_norm": 0.32092374563217163, "learning_rate": 4.880359462970983e-05, "loss": 0.3711, "step": 1768000 }, { "epoch": 11.967437202252057, "grad_norm": 0.3457582890987396, "learning_rate": 4.880325627977479e-05, "loss": 0.3706, "step": 1768500 }, { "epoch": 11.970820701602426, "grad_norm": 0.350088894367218, "learning_rate": 4.880291792983976e-05, "loss": 0.3712, "step": 1769000 }, { "epoch": 11.974204200952794, "grad_norm": 0.3614156246185303, "learning_rate": 4.8802579579904724e-05, "loss": 0.373, "step": 1769500 }, { "epoch": 11.977587700303161, "grad_norm": 0.36072155833244324, "learning_rate": 4.8802241229969686e-05, "loss": 0.3721, "step": 1770000 }, { "epoch": 11.98097119965353, "grad_norm": 0.38477975130081177, "learning_rate": 4.880190288003465e-05, "loss": 0.3713, "step": 1770500 }, { "epoch": 11.984354699003898, "grad_norm": 0.3696984052658081, "learning_rate": 4.880156453009961e-05, "loss": 0.3723, "step": 1771000 }, { "epoch": 11.987738198354267, "grad_norm": 0.3442613184452057, "learning_rate": 4.880122618016458e-05, "loss": 0.3723, "step": 1771500 }, { "epoch": 11.991121697704633, "grad_norm": 0.34321239590644836, "learning_rate": 4.880088783022954e-05, "loss": 0.3742, "step": 1772000 }, { "epoch": 11.994505197055002, "grad_norm": 0.3402617871761322, "learning_rate": 4.88005494802945e-05, "loss": 0.3712, "step": 1772500 }, { "epoch": 11.99788869640537, "grad_norm": 0.34689807891845703, "learning_rate": 4.8800211130359466e-05, "loss": 0.3707, "step": 1773000 }, { "epoch": 12.0, "eval_accuracy": 0.8585814519632364, "eval_loss": 0.5731452703475952, "eval_runtime": 3384.9935, "eval_samples_per_second": 85.892, "eval_steps_per_second": 5.368, "step": 1773312 }, { "epoch": 12.001272195755739, "grad_norm": 0.37968823313713074, "learning_rate": 4.879987278042443e-05, "loss": 0.3703, "step": 1773500 }, { "epoch": 12.004655695106107, "grad_norm": 0.3329985737800598, "learning_rate": 4.879953443048939e-05, "loss": 0.3683, "step": 1774000 }, { "epoch": 12.008039194456474, "grad_norm": 0.3562377393245697, "learning_rate": 4.879919608055435e-05, "loss": 0.3696, "step": 1774500 }, { "epoch": 12.011422693806843, "grad_norm": 0.37858709692955017, "learning_rate": 4.879885773061932e-05, "loss": 0.37, "step": 1775000 }, { "epoch": 12.014806193157211, "grad_norm": 0.33354243636131287, "learning_rate": 4.879851938068428e-05, "loss": 0.3695, "step": 1775500 }, { "epoch": 12.01818969250758, "grad_norm": 0.36619076132774353, "learning_rate": 4.8798181030749245e-05, "loss": 0.3693, "step": 1776000 }, { "epoch": 12.021573191857946, "grad_norm": 0.332576185464859, "learning_rate": 4.879784268081421e-05, "loss": 0.3706, "step": 1776500 }, { "epoch": 12.024956691208315, "grad_norm": 0.36819761991500854, "learning_rate": 4.8797504330879176e-05, "loss": 0.3701, "step": 1777000 }, { "epoch": 12.028340190558684, "grad_norm": 0.3572423458099365, "learning_rate": 4.879716598094413e-05, "loss": 0.3709, "step": 1777500 }, { "epoch": 12.031723689909052, "grad_norm": 0.3288554847240448, "learning_rate": 4.8796827631009094e-05, "loss": 0.369, "step": 1778000 }, { "epoch": 12.035107189259419, "grad_norm": 0.37519699335098267, "learning_rate": 4.8796489281074056e-05, "loss": 0.3687, "step": 1778500 }, { "epoch": 12.038490688609787, "grad_norm": 0.3955352008342743, "learning_rate": 4.8796150931139025e-05, "loss": 0.3702, "step": 1779000 }, { "epoch": 12.041874187960156, "grad_norm": 0.39312025904655457, "learning_rate": 4.879581258120399e-05, "loss": 0.3703, "step": 1779500 }, { "epoch": 12.045257687310524, "grad_norm": 0.36839836835861206, "learning_rate": 4.879547423126895e-05, "loss": 0.3703, "step": 1780000 }, { "epoch": 12.048641186660893, "grad_norm": 0.36080053448677063, "learning_rate": 4.879513588133391e-05, "loss": 0.3708, "step": 1780500 }, { "epoch": 12.05202468601126, "grad_norm": 0.3614180088043213, "learning_rate": 4.879479753139888e-05, "loss": 0.3695, "step": 1781000 }, { "epoch": 12.055408185361628, "grad_norm": 0.36564359068870544, "learning_rate": 4.879445918146384e-05, "loss": 0.3713, "step": 1781500 }, { "epoch": 12.058791684711997, "grad_norm": 0.3622540831565857, "learning_rate": 4.87941208315288e-05, "loss": 0.37, "step": 1782000 }, { "epoch": 12.062175184062365, "grad_norm": 0.36027461290359497, "learning_rate": 4.8793782481593766e-05, "loss": 0.37, "step": 1782500 }, { "epoch": 12.065558683412732, "grad_norm": 0.34156128764152527, "learning_rate": 4.879344413165873e-05, "loss": 0.3679, "step": 1783000 }, { "epoch": 12.0689421827631, "grad_norm": 0.3399154543876648, "learning_rate": 4.879310578172369e-05, "loss": 0.371, "step": 1783500 }, { "epoch": 12.072325682113469, "grad_norm": 0.34678414463996887, "learning_rate": 4.879276743178865e-05, "loss": 0.3698, "step": 1784000 }, { "epoch": 12.075709181463838, "grad_norm": 0.3499497175216675, "learning_rate": 4.879242908185362e-05, "loss": 0.3715, "step": 1784500 }, { "epoch": 12.079092680814206, "grad_norm": 0.35000520944595337, "learning_rate": 4.8792090731918584e-05, "loss": 0.3699, "step": 1785000 }, { "epoch": 12.082476180164573, "grad_norm": 0.4421665668487549, "learning_rate": 4.8791752381983546e-05, "loss": 0.3698, "step": 1785500 }, { "epoch": 12.085859679514941, "grad_norm": 0.368918776512146, "learning_rate": 4.879141403204851e-05, "loss": 0.3693, "step": 1786000 }, { "epoch": 12.08924317886531, "grad_norm": 0.3288615643978119, "learning_rate": 4.879107568211348e-05, "loss": 0.37, "step": 1786500 }, { "epoch": 12.092626678215678, "grad_norm": 0.35252460837364197, "learning_rate": 4.879073733217843e-05, "loss": 0.3693, "step": 1787000 }, { "epoch": 12.096010177566045, "grad_norm": 0.39786890149116516, "learning_rate": 4.8790398982243394e-05, "loss": 0.3714, "step": 1787500 }, { "epoch": 12.099393676916414, "grad_norm": 0.35984885692596436, "learning_rate": 4.8790060632308357e-05, "loss": 0.3693, "step": 1788000 }, { "epoch": 12.102777176266782, "grad_norm": 0.367716908454895, "learning_rate": 4.8789722282373325e-05, "loss": 0.3693, "step": 1788500 }, { "epoch": 12.10616067561715, "grad_norm": 0.3643549680709839, "learning_rate": 4.878938393243829e-05, "loss": 0.3705, "step": 1789000 }, { "epoch": 12.10954417496752, "grad_norm": 0.3920140862464905, "learning_rate": 4.878904558250325e-05, "loss": 0.3675, "step": 1789500 }, { "epoch": 12.112927674317886, "grad_norm": 0.36466044187545776, "learning_rate": 4.878870723256821e-05, "loss": 0.3684, "step": 1790000 }, { "epoch": 12.116311173668254, "grad_norm": 0.38179928064346313, "learning_rate": 4.878836888263318e-05, "loss": 0.3713, "step": 1790500 }, { "epoch": 12.119694673018623, "grad_norm": 0.35445892810821533, "learning_rate": 4.878803053269814e-05, "loss": 0.3703, "step": 1791000 }, { "epoch": 12.123078172368992, "grad_norm": 0.388711541891098, "learning_rate": 4.87876921827631e-05, "loss": 0.3699, "step": 1791500 }, { "epoch": 12.126461671719358, "grad_norm": 0.3741181790828705, "learning_rate": 4.878735383282807e-05, "loss": 0.3698, "step": 1792000 }, { "epoch": 12.129845171069727, "grad_norm": 0.3510143756866455, "learning_rate": 4.878701548289303e-05, "loss": 0.3714, "step": 1792500 }, { "epoch": 12.133228670420095, "grad_norm": 0.49974462389945984, "learning_rate": 4.878667713295799e-05, "loss": 0.3694, "step": 1793000 }, { "epoch": 12.136612169770464, "grad_norm": 0.3473338782787323, "learning_rate": 4.8786338783022953e-05, "loss": 0.37, "step": 1793500 }, { "epoch": 12.139995669120832, "grad_norm": 0.36775150895118713, "learning_rate": 4.878600043308792e-05, "loss": 0.3713, "step": 1794000 }, { "epoch": 12.1433791684712, "grad_norm": 0.38027772307395935, "learning_rate": 4.8785662083152884e-05, "loss": 0.3685, "step": 1794500 }, { "epoch": 12.146762667821568, "grad_norm": 0.3760400414466858, "learning_rate": 4.8785323733217847e-05, "loss": 0.3713, "step": 1795000 }, { "epoch": 12.150146167171936, "grad_norm": 0.3490118384361267, "learning_rate": 4.878498538328281e-05, "loss": 0.3692, "step": 1795500 }, { "epoch": 12.153529666522305, "grad_norm": 0.3494408130645752, "learning_rate": 4.878464703334778e-05, "loss": 0.371, "step": 1796000 }, { "epoch": 12.156913165872671, "grad_norm": 0.4272332787513733, "learning_rate": 4.878430868341273e-05, "loss": 0.3702, "step": 1796500 }, { "epoch": 12.16029666522304, "grad_norm": 0.35877782106399536, "learning_rate": 4.8783970333477695e-05, "loss": 0.37, "step": 1797000 }, { "epoch": 12.163680164573409, "grad_norm": 0.3769785761833191, "learning_rate": 4.878363198354266e-05, "loss": 0.3702, "step": 1797500 }, { "epoch": 12.167063663923777, "grad_norm": 0.35167497396469116, "learning_rate": 4.8783293633607626e-05, "loss": 0.3726, "step": 1798000 }, { "epoch": 12.170447163274144, "grad_norm": 0.35780346393585205, "learning_rate": 4.878295528367259e-05, "loss": 0.3706, "step": 1798500 }, { "epoch": 12.173830662624512, "grad_norm": 0.33106979727745056, "learning_rate": 4.878261693373755e-05, "loss": 0.3696, "step": 1799000 }, { "epoch": 12.17721416197488, "grad_norm": 0.3905043303966522, "learning_rate": 4.878227858380251e-05, "loss": 0.37, "step": 1799500 }, { "epoch": 12.18059766132525, "grad_norm": 0.367580771446228, "learning_rate": 4.878194023386748e-05, "loss": 0.3702, "step": 1800000 }, { "epoch": 12.183981160675618, "grad_norm": 0.3430144786834717, "learning_rate": 4.8781601883932443e-05, "loss": 0.3712, "step": 1800500 }, { "epoch": 12.187364660025985, "grad_norm": 0.37266072630882263, "learning_rate": 4.87812635339974e-05, "loss": 0.3711, "step": 1801000 }, { "epoch": 12.190748159376353, "grad_norm": 0.36405715346336365, "learning_rate": 4.878092518406237e-05, "loss": 0.3711, "step": 1801500 }, { "epoch": 12.194131658726722, "grad_norm": 0.3532100319862366, "learning_rate": 4.878058683412733e-05, "loss": 0.3703, "step": 1802000 }, { "epoch": 12.19751515807709, "grad_norm": 0.36643290519714355, "learning_rate": 4.878024848419229e-05, "loss": 0.371, "step": 1802500 }, { "epoch": 12.200898657427457, "grad_norm": 0.3668473958969116, "learning_rate": 4.8779910134257254e-05, "loss": 0.3703, "step": 1803000 }, { "epoch": 12.204282156777825, "grad_norm": 0.35221970081329346, "learning_rate": 4.877957178432222e-05, "loss": 0.3706, "step": 1803500 }, { "epoch": 12.207665656128194, "grad_norm": 0.30639296770095825, "learning_rate": 4.8779233434387185e-05, "loss": 0.3721, "step": 1804000 }, { "epoch": 12.211049155478563, "grad_norm": 0.3235289454460144, "learning_rate": 4.877889508445215e-05, "loss": 0.3707, "step": 1804500 }, { "epoch": 12.214432654828931, "grad_norm": 0.3486664593219757, "learning_rate": 4.877855673451711e-05, "loss": 0.3713, "step": 1805000 }, { "epoch": 12.217816154179298, "grad_norm": 0.33962762355804443, "learning_rate": 4.877821838458208e-05, "loss": 0.3695, "step": 1805500 }, { "epoch": 12.221199653529666, "grad_norm": 0.3924519419670105, "learning_rate": 4.8777880034647034e-05, "loss": 0.3721, "step": 1806000 }, { "epoch": 12.224583152880035, "grad_norm": 0.3697432577610016, "learning_rate": 4.8777541684711996e-05, "loss": 0.3702, "step": 1806500 }, { "epoch": 12.227966652230403, "grad_norm": 0.4088262617588043, "learning_rate": 4.877720333477696e-05, "loss": 0.369, "step": 1807000 }, { "epoch": 12.23135015158077, "grad_norm": 0.3665180504322052, "learning_rate": 4.877686498484193e-05, "loss": 0.3706, "step": 1807500 }, { "epoch": 12.234733650931139, "grad_norm": 0.35706061124801636, "learning_rate": 4.877652663490689e-05, "loss": 0.3693, "step": 1808000 }, { "epoch": 12.238117150281507, "grad_norm": 0.3554570972919464, "learning_rate": 4.877618828497185e-05, "loss": 0.3704, "step": 1808500 }, { "epoch": 12.241500649631876, "grad_norm": 0.35334378480911255, "learning_rate": 4.877584993503681e-05, "loss": 0.371, "step": 1809000 }, { "epoch": 12.244884148982244, "grad_norm": 0.41907766461372375, "learning_rate": 4.877551158510178e-05, "loss": 0.3705, "step": 1809500 }, { "epoch": 12.248267648332611, "grad_norm": 0.3961111009120941, "learning_rate": 4.8775173235166744e-05, "loss": 0.3706, "step": 1810000 }, { "epoch": 12.25165114768298, "grad_norm": 0.3752531409263611, "learning_rate": 4.87748348852317e-05, "loss": 0.3714, "step": 1810500 }, { "epoch": 12.255034647033348, "grad_norm": 0.37807732820510864, "learning_rate": 4.877449653529667e-05, "loss": 0.3712, "step": 1811000 }, { "epoch": 12.258418146383717, "grad_norm": 0.370071679353714, "learning_rate": 4.877415818536163e-05, "loss": 0.3699, "step": 1811500 }, { "epoch": 12.261801645734083, "grad_norm": 0.3707692325115204, "learning_rate": 4.877381983542659e-05, "loss": 0.3711, "step": 1812000 }, { "epoch": 12.265185145084452, "grad_norm": 0.35802507400512695, "learning_rate": 4.8773481485491555e-05, "loss": 0.3695, "step": 1812500 }, { "epoch": 12.26856864443482, "grad_norm": 0.39189577102661133, "learning_rate": 4.8773143135556524e-05, "loss": 0.371, "step": 1813000 }, { "epoch": 12.271952143785189, "grad_norm": 0.34621888399124146, "learning_rate": 4.8772804785621486e-05, "loss": 0.3706, "step": 1813500 }, { "epoch": 12.275335643135557, "grad_norm": 0.40879249572753906, "learning_rate": 4.877246643568645e-05, "loss": 0.3698, "step": 1814000 }, { "epoch": 12.278719142485924, "grad_norm": 0.3463546633720398, "learning_rate": 4.877212808575141e-05, "loss": 0.3713, "step": 1814500 }, { "epoch": 12.282102641836293, "grad_norm": 0.35812705755233765, "learning_rate": 4.877178973581638e-05, "loss": 0.3712, "step": 1815000 }, { "epoch": 12.285486141186661, "grad_norm": 0.3232555687427521, "learning_rate": 4.8771451385881334e-05, "loss": 0.3711, "step": 1815500 }, { "epoch": 12.28886964053703, "grad_norm": 0.3548388183116913, "learning_rate": 4.8771113035946296e-05, "loss": 0.3695, "step": 1816000 }, { "epoch": 12.292253139887396, "grad_norm": 0.3918019235134125, "learning_rate": 4.877077468601126e-05, "loss": 0.3708, "step": 1816500 }, { "epoch": 12.295636639237765, "grad_norm": 0.36075350642204285, "learning_rate": 4.877043633607623e-05, "loss": 0.3696, "step": 1817000 }, { "epoch": 12.299020138588133, "grad_norm": 0.364998459815979, "learning_rate": 4.877009798614119e-05, "loss": 0.3703, "step": 1817500 }, { "epoch": 12.302403637938502, "grad_norm": 0.34095925092697144, "learning_rate": 4.876975963620615e-05, "loss": 0.3699, "step": 1818000 }, { "epoch": 12.30578713728887, "grad_norm": 0.33013880252838135, "learning_rate": 4.8769421286271114e-05, "loss": 0.3708, "step": 1818500 }, { "epoch": 12.309170636639237, "grad_norm": 0.3411044180393219, "learning_rate": 4.876908293633608e-05, "loss": 0.3706, "step": 1819000 }, { "epoch": 12.312554135989606, "grad_norm": 0.3632298707962036, "learning_rate": 4.8768744586401045e-05, "loss": 0.3705, "step": 1819500 }, { "epoch": 12.315937635339974, "grad_norm": 0.33505117893218994, "learning_rate": 4.876840623646601e-05, "loss": 0.3713, "step": 1820000 }, { "epoch": 12.319321134690343, "grad_norm": 0.35253992676734924, "learning_rate": 4.876806788653097e-05, "loss": 0.3726, "step": 1820500 }, { "epoch": 12.32270463404071, "grad_norm": 0.32772931456565857, "learning_rate": 4.876772953659593e-05, "loss": 0.3702, "step": 1821000 }, { "epoch": 12.326088133391078, "grad_norm": 0.36878344416618347, "learning_rate": 4.876739118666089e-05, "loss": 0.37, "step": 1821500 }, { "epoch": 12.329471632741447, "grad_norm": 0.4037615656852722, "learning_rate": 4.8767052836725855e-05, "loss": 0.3716, "step": 1822000 }, { "epoch": 12.332855132091815, "grad_norm": 0.42346951365470886, "learning_rate": 4.8766714486790824e-05, "loss": 0.3706, "step": 1822500 }, { "epoch": 12.336238631442182, "grad_norm": 0.36725038290023804, "learning_rate": 4.8766376136855786e-05, "loss": 0.3711, "step": 1823000 }, { "epoch": 12.33962213079255, "grad_norm": 0.3480580449104309, "learning_rate": 4.876603778692075e-05, "loss": 0.3702, "step": 1823500 }, { "epoch": 12.343005630142919, "grad_norm": 0.3734580874443054, "learning_rate": 4.876569943698571e-05, "loss": 0.3706, "step": 1824000 }, { "epoch": 12.346389129493287, "grad_norm": 0.3935319781303406, "learning_rate": 4.876536108705067e-05, "loss": 0.3711, "step": 1824500 }, { "epoch": 12.349772628843656, "grad_norm": 0.3282531797885895, "learning_rate": 4.8765022737115635e-05, "loss": 0.3707, "step": 1825000 }, { "epoch": 12.353156128194023, "grad_norm": 0.4004055857658386, "learning_rate": 4.87646843871806e-05, "loss": 0.3713, "step": 1825500 }, { "epoch": 12.356539627544391, "grad_norm": 0.3112265169620514, "learning_rate": 4.876434603724556e-05, "loss": 0.372, "step": 1826000 }, { "epoch": 12.35992312689476, "grad_norm": 0.3350541889667511, "learning_rate": 4.876400768731053e-05, "loss": 0.3702, "step": 1826500 }, { "epoch": 12.363306626245128, "grad_norm": 0.3925771713256836, "learning_rate": 4.876366933737549e-05, "loss": 0.3708, "step": 1827000 }, { "epoch": 12.366690125595495, "grad_norm": 0.40490618348121643, "learning_rate": 4.876333098744045e-05, "loss": 0.37, "step": 1827500 }, { "epoch": 12.370073624945864, "grad_norm": 0.3550584316253662, "learning_rate": 4.8762992637505414e-05, "loss": 0.3725, "step": 1828000 }, { "epoch": 12.373457124296232, "grad_norm": 0.37563541531562805, "learning_rate": 4.876265428757038e-05, "loss": 0.3702, "step": 1828500 }, { "epoch": 12.3768406236466, "grad_norm": 0.33646509051322937, "learning_rate": 4.8762315937635345e-05, "loss": 0.3712, "step": 1829000 }, { "epoch": 12.38022412299697, "grad_norm": 0.32552415132522583, "learning_rate": 4.876197758770031e-05, "loss": 0.3722, "step": 1829500 }, { "epoch": 12.383607622347336, "grad_norm": 0.36574462056159973, "learning_rate": 4.876163923776527e-05, "loss": 0.3693, "step": 1830000 }, { "epoch": 12.386991121697704, "grad_norm": 0.37173759937286377, "learning_rate": 4.876130088783023e-05, "loss": 0.3693, "step": 1830500 }, { "epoch": 12.390374621048073, "grad_norm": 0.38284623622894287, "learning_rate": 4.8760962537895194e-05, "loss": 0.37, "step": 1831000 }, { "epoch": 12.393758120398441, "grad_norm": 0.3950818181037903, "learning_rate": 4.8760624187960156e-05, "loss": 0.3706, "step": 1831500 }, { "epoch": 12.397141619748808, "grad_norm": 0.42820975184440613, "learning_rate": 4.8760285838025125e-05, "loss": 0.3709, "step": 1832000 }, { "epoch": 12.400525119099177, "grad_norm": 0.4042028486728668, "learning_rate": 4.875994748809009e-05, "loss": 0.371, "step": 1832500 }, { "epoch": 12.403908618449545, "grad_norm": 0.3339955508708954, "learning_rate": 4.875960913815505e-05, "loss": 0.3708, "step": 1833000 }, { "epoch": 12.407292117799914, "grad_norm": 0.34510526061058044, "learning_rate": 4.875927078822001e-05, "loss": 0.3696, "step": 1833500 }, { "epoch": 12.410675617150282, "grad_norm": 0.36796268820762634, "learning_rate": 4.875893243828497e-05, "loss": 0.372, "step": 1834000 }, { "epoch": 12.414059116500649, "grad_norm": 0.38430359959602356, "learning_rate": 4.8758594088349935e-05, "loss": 0.3706, "step": 1834500 }, { "epoch": 12.417442615851018, "grad_norm": 0.39185529947280884, "learning_rate": 4.87582557384149e-05, "loss": 0.3708, "step": 1835000 }, { "epoch": 12.420826115201386, "grad_norm": 0.3595646023750305, "learning_rate": 4.875791738847986e-05, "loss": 0.3729, "step": 1835500 }, { "epoch": 12.424209614551755, "grad_norm": 0.3638467490673065, "learning_rate": 4.875757903854483e-05, "loss": 0.3716, "step": 1836000 }, { "epoch": 12.427593113902121, "grad_norm": 0.37463465332984924, "learning_rate": 4.875724068860979e-05, "loss": 0.371, "step": 1836500 }, { "epoch": 12.43097661325249, "grad_norm": 0.32411718368530273, "learning_rate": 4.875690233867475e-05, "loss": 0.371, "step": 1837000 }, { "epoch": 12.434360112602858, "grad_norm": 0.33750879764556885, "learning_rate": 4.8756563988739715e-05, "loss": 0.3708, "step": 1837500 }, { "epoch": 12.437743611953227, "grad_norm": 0.33264005184173584, "learning_rate": 4.8756225638804684e-05, "loss": 0.3696, "step": 1838000 }, { "epoch": 12.441127111303594, "grad_norm": 0.3644348382949829, "learning_rate": 4.8755887288869646e-05, "loss": 0.3701, "step": 1838500 }, { "epoch": 12.444510610653962, "grad_norm": 0.35615313053131104, "learning_rate": 4.875554893893461e-05, "loss": 0.3707, "step": 1839000 }, { "epoch": 12.44789411000433, "grad_norm": 0.37479841709136963, "learning_rate": 4.875521058899957e-05, "loss": 0.3715, "step": 1839500 }, { "epoch": 12.4512776093547, "grad_norm": 0.3680063784122467, "learning_rate": 4.875487223906453e-05, "loss": 0.372, "step": 1840000 }, { "epoch": 12.454661108705068, "grad_norm": 0.3814094662666321, "learning_rate": 4.8754533889129494e-05, "loss": 0.3724, "step": 1840500 }, { "epoch": 12.458044608055435, "grad_norm": 0.3767112195491791, "learning_rate": 4.875419553919446e-05, "loss": 0.3696, "step": 1841000 }, { "epoch": 12.461428107405803, "grad_norm": 0.34589385986328125, "learning_rate": 4.875385718925942e-05, "loss": 0.3715, "step": 1841500 }, { "epoch": 12.464811606756172, "grad_norm": 0.3726743757724762, "learning_rate": 4.875351883932439e-05, "loss": 0.3696, "step": 1842000 }, { "epoch": 12.46819510610654, "grad_norm": 0.34632807970046997, "learning_rate": 4.875318048938935e-05, "loss": 0.3723, "step": 1842500 }, { "epoch": 12.471578605456909, "grad_norm": 0.3409026861190796, "learning_rate": 4.875284213945431e-05, "loss": 0.3708, "step": 1843000 }, { "epoch": 12.474962104807275, "grad_norm": 0.3455606698989868, "learning_rate": 4.8752503789519274e-05, "loss": 0.3713, "step": 1843500 }, { "epoch": 12.478345604157644, "grad_norm": 0.3865852653980255, "learning_rate": 4.8752165439584236e-05, "loss": 0.3692, "step": 1844000 }, { "epoch": 12.481729103508012, "grad_norm": 0.375196635723114, "learning_rate": 4.87518270896492e-05, "loss": 0.3717, "step": 1844500 }, { "epoch": 12.485112602858381, "grad_norm": 0.3795843720436096, "learning_rate": 4.875148873971416e-05, "loss": 0.3705, "step": 1845000 }, { "epoch": 12.488496102208748, "grad_norm": 0.4108113944530487, "learning_rate": 4.875115038977913e-05, "loss": 0.3706, "step": 1845500 }, { "epoch": 12.491879601559116, "grad_norm": 0.3474077880382538, "learning_rate": 4.875081203984409e-05, "loss": 0.3708, "step": 1846000 }, { "epoch": 12.495263100909485, "grad_norm": 0.3858991861343384, "learning_rate": 4.8750473689909054e-05, "loss": 0.3698, "step": 1846500 }, { "epoch": 12.498646600259853, "grad_norm": 0.3526313006877899, "learning_rate": 4.8750135339974016e-05, "loss": 0.3713, "step": 1847000 }, { "epoch": 12.50203009961022, "grad_norm": 0.38953834772109985, "learning_rate": 4.8749796990038985e-05, "loss": 0.3718, "step": 1847500 }, { "epoch": 12.505413598960589, "grad_norm": 0.4159233868122101, "learning_rate": 4.874945864010395e-05, "loss": 0.371, "step": 1848000 }, { "epoch": 12.508797098310957, "grad_norm": 0.3940291404724121, "learning_rate": 4.874912029016891e-05, "loss": 0.3686, "step": 1848500 }, { "epoch": 12.512180597661326, "grad_norm": 0.35747018456459045, "learning_rate": 4.8748781940233864e-05, "loss": 0.3699, "step": 1849000 }, { "epoch": 12.515564097011694, "grad_norm": 0.35041186213493347, "learning_rate": 4.874844359029883e-05, "loss": 0.3717, "step": 1849500 }, { "epoch": 12.518947596362061, "grad_norm": 0.37993401288986206, "learning_rate": 4.8748105240363795e-05, "loss": 0.3705, "step": 1850000 }, { "epoch": 12.52233109571243, "grad_norm": 0.35090896487236023, "learning_rate": 4.874776689042876e-05, "loss": 0.37, "step": 1850500 }, { "epoch": 12.525714595062798, "grad_norm": 0.3232038915157318, "learning_rate": 4.874742854049372e-05, "loss": 0.3695, "step": 1851000 }, { "epoch": 12.529098094413166, "grad_norm": 0.3876701891422272, "learning_rate": 4.874709019055869e-05, "loss": 0.37, "step": 1851500 }, { "epoch": 12.532481593763533, "grad_norm": 0.3513367474079132, "learning_rate": 4.874675184062365e-05, "loss": 0.3707, "step": 1852000 }, { "epoch": 12.535865093113902, "grad_norm": 0.3564150333404541, "learning_rate": 4.874641349068861e-05, "loss": 0.3717, "step": 1852500 }, { "epoch": 12.53924859246427, "grad_norm": 0.3708456754684448, "learning_rate": 4.8746075140753575e-05, "loss": 0.3713, "step": 1853000 }, { "epoch": 12.542632091814639, "grad_norm": 0.3454448878765106, "learning_rate": 4.874573679081854e-05, "loss": 0.3691, "step": 1853500 }, { "epoch": 12.546015591165007, "grad_norm": 0.34445688128471375, "learning_rate": 4.87453984408835e-05, "loss": 0.3701, "step": 1854000 }, { "epoch": 12.549399090515374, "grad_norm": 0.35106971859931946, "learning_rate": 4.874506009094846e-05, "loss": 0.3706, "step": 1854500 }, { "epoch": 12.552782589865743, "grad_norm": 0.3719109892845154, "learning_rate": 4.874472174101343e-05, "loss": 0.3718, "step": 1855000 }, { "epoch": 12.556166089216111, "grad_norm": 0.3739451766014099, "learning_rate": 4.874438339107839e-05, "loss": 0.3707, "step": 1855500 }, { "epoch": 12.55954958856648, "grad_norm": 0.3855455815792084, "learning_rate": 4.8744045041143354e-05, "loss": 0.3712, "step": 1856000 }, { "epoch": 12.562933087916846, "grad_norm": 0.3706997334957123, "learning_rate": 4.8743706691208316e-05, "loss": 0.372, "step": 1856500 }, { "epoch": 12.566316587267215, "grad_norm": 0.34375709295272827, "learning_rate": 4.8743368341273285e-05, "loss": 0.3708, "step": 1857000 }, { "epoch": 12.569700086617583, "grad_norm": 0.3563523292541504, "learning_rate": 4.874302999133825e-05, "loss": 0.3722, "step": 1857500 }, { "epoch": 12.573083585967952, "grad_norm": 0.35885095596313477, "learning_rate": 4.874269164140321e-05, "loss": 0.3705, "step": 1858000 }, { "epoch": 12.57646708531832, "grad_norm": 0.38173002004623413, "learning_rate": 4.8742353291468165e-05, "loss": 0.3709, "step": 1858500 }, { "epoch": 12.579850584668687, "grad_norm": 0.35054492950439453, "learning_rate": 4.8742014941533134e-05, "loss": 0.3701, "step": 1859000 }, { "epoch": 12.583234084019056, "grad_norm": 0.34447476267814636, "learning_rate": 4.8741676591598096e-05, "loss": 0.3708, "step": 1859500 }, { "epoch": 12.586617583369424, "grad_norm": 0.3595876395702362, "learning_rate": 4.874133824166306e-05, "loss": 0.371, "step": 1860000 }, { "epoch": 12.590001082719793, "grad_norm": 0.37683749198913574, "learning_rate": 4.874099989172802e-05, "loss": 0.3721, "step": 1860500 }, { "epoch": 12.59338458207016, "grad_norm": 0.3663787841796875, "learning_rate": 4.874066154179299e-05, "loss": 0.3711, "step": 1861000 }, { "epoch": 12.596768081420528, "grad_norm": 0.37803715467453003, "learning_rate": 4.874032319185795e-05, "loss": 0.3712, "step": 1861500 }, { "epoch": 12.600151580770897, "grad_norm": 0.4103628695011139, "learning_rate": 4.873998484192291e-05, "loss": 0.3715, "step": 1862000 }, { "epoch": 12.603535080121265, "grad_norm": 0.3537883162498474, "learning_rate": 4.8739646491987875e-05, "loss": 0.3711, "step": 1862500 }, { "epoch": 12.606918579471632, "grad_norm": 0.3754059374332428, "learning_rate": 4.873930814205284e-05, "loss": 0.3702, "step": 1863000 }, { "epoch": 12.610302078822, "grad_norm": 0.35886409878730774, "learning_rate": 4.87389697921178e-05, "loss": 0.3713, "step": 1863500 }, { "epoch": 12.613685578172369, "grad_norm": 0.35821646451950073, "learning_rate": 4.873863144218276e-05, "loss": 0.3712, "step": 1864000 }, { "epoch": 12.617069077522737, "grad_norm": 0.36803847551345825, "learning_rate": 4.873829309224773e-05, "loss": 0.3706, "step": 1864500 }, { "epoch": 12.620452576873106, "grad_norm": 0.3754260838031769, "learning_rate": 4.873795474231269e-05, "loss": 0.3711, "step": 1865000 }, { "epoch": 12.623836076223473, "grad_norm": 0.35917600989341736, "learning_rate": 4.8737616392377655e-05, "loss": 0.372, "step": 1865500 }, { "epoch": 12.627219575573841, "grad_norm": 0.32796338200569153, "learning_rate": 4.873727804244262e-05, "loss": 0.3708, "step": 1866000 }, { "epoch": 12.63060307492421, "grad_norm": 0.45512181520462036, "learning_rate": 4.8736939692507586e-05, "loss": 0.3707, "step": 1866500 }, { "epoch": 12.633986574274578, "grad_norm": 0.36449873447418213, "learning_rate": 4.873660134257255e-05, "loss": 0.3713, "step": 1867000 }, { "epoch": 12.637370073624947, "grad_norm": 0.3368343412876129, "learning_rate": 4.873626299263751e-05, "loss": 0.3704, "step": 1867500 }, { "epoch": 12.640753572975314, "grad_norm": 0.37133774161338806, "learning_rate": 4.8735924642702465e-05, "loss": 0.3706, "step": 1868000 }, { "epoch": 12.644137072325682, "grad_norm": 0.33808013796806335, "learning_rate": 4.8735586292767434e-05, "loss": 0.3692, "step": 1868500 }, { "epoch": 12.64752057167605, "grad_norm": 0.3964240252971649, "learning_rate": 4.8735247942832396e-05, "loss": 0.3728, "step": 1869000 }, { "epoch": 12.65090407102642, "grad_norm": 0.32918646931648254, "learning_rate": 4.873490959289736e-05, "loss": 0.3713, "step": 1869500 }, { "epoch": 12.654287570376786, "grad_norm": 0.38005614280700684, "learning_rate": 4.873457124296232e-05, "loss": 0.3707, "step": 1870000 }, { "epoch": 12.657671069727154, "grad_norm": 0.3748287856578827, "learning_rate": 4.873423289302729e-05, "loss": 0.3711, "step": 1870500 }, { "epoch": 12.661054569077523, "grad_norm": 0.3982534408569336, "learning_rate": 4.873389454309225e-05, "loss": 0.3724, "step": 1871000 }, { "epoch": 12.664438068427891, "grad_norm": 0.3427683115005493, "learning_rate": 4.8733556193157214e-05, "loss": 0.3695, "step": 1871500 }, { "epoch": 12.667821567778258, "grad_norm": 0.3884345293045044, "learning_rate": 4.8733217843222176e-05, "loss": 0.3711, "step": 1872000 }, { "epoch": 12.671205067128627, "grad_norm": 0.3692188858985901, "learning_rate": 4.873287949328714e-05, "loss": 0.3703, "step": 1872500 }, { "epoch": 12.674588566478995, "grad_norm": 0.47177180647850037, "learning_rate": 4.87325411433521e-05, "loss": 0.3716, "step": 1873000 }, { "epoch": 12.677972065829364, "grad_norm": 0.34280335903167725, "learning_rate": 4.873220279341706e-05, "loss": 0.37, "step": 1873500 }, { "epoch": 12.681355565179732, "grad_norm": 0.39825934171676636, "learning_rate": 4.873186444348203e-05, "loss": 0.3704, "step": 1874000 }, { "epoch": 12.684739064530099, "grad_norm": 0.336199551820755, "learning_rate": 4.873152609354699e-05, "loss": 0.3709, "step": 1874500 }, { "epoch": 12.688122563880468, "grad_norm": 0.3393048644065857, "learning_rate": 4.8731187743611955e-05, "loss": 0.3704, "step": 1875000 }, { "epoch": 12.691506063230836, "grad_norm": 0.36187732219696045, "learning_rate": 4.873084939367692e-05, "loss": 0.37, "step": 1875500 }, { "epoch": 12.694889562581205, "grad_norm": 0.5153160095214844, "learning_rate": 4.8730511043741886e-05, "loss": 0.3698, "step": 1876000 }, { "epoch": 12.698273061931571, "grad_norm": 0.3579491376876831, "learning_rate": 4.873017269380685e-05, "loss": 0.3706, "step": 1876500 }, { "epoch": 12.70165656128194, "grad_norm": 0.3689671456813812, "learning_rate": 4.872983434387181e-05, "loss": 0.3711, "step": 1877000 }, { "epoch": 12.705040060632308, "grad_norm": 0.3544732630252838, "learning_rate": 4.8729495993936766e-05, "loss": 0.3707, "step": 1877500 }, { "epoch": 12.708423559982677, "grad_norm": 0.3601408898830414, "learning_rate": 4.8729157644001735e-05, "loss": 0.3708, "step": 1878000 }, { "epoch": 12.711807059333044, "grad_norm": 0.37086576223373413, "learning_rate": 4.87288192940667e-05, "loss": 0.3712, "step": 1878500 }, { "epoch": 12.715190558683412, "grad_norm": 0.378149151802063, "learning_rate": 4.872848094413166e-05, "loss": 0.3716, "step": 1879000 }, { "epoch": 12.71857405803378, "grad_norm": 0.35186734795570374, "learning_rate": 4.872814259419662e-05, "loss": 0.3715, "step": 1879500 }, { "epoch": 12.72195755738415, "grad_norm": 0.3598947525024414, "learning_rate": 4.872780424426159e-05, "loss": 0.3717, "step": 1880000 }, { "epoch": 12.725341056734518, "grad_norm": 0.34927839040756226, "learning_rate": 4.872746589432655e-05, "loss": 0.3696, "step": 1880500 }, { "epoch": 12.728724556084885, "grad_norm": 0.3616214096546173, "learning_rate": 4.8727127544391514e-05, "loss": 0.37, "step": 1881000 }, { "epoch": 12.732108055435253, "grad_norm": 0.3940015435218811, "learning_rate": 4.8726789194456477e-05, "loss": 0.3675, "step": 1881500 }, { "epoch": 12.735491554785622, "grad_norm": 0.35310205817222595, "learning_rate": 4.8726450844521445e-05, "loss": 0.3711, "step": 1882000 }, { "epoch": 12.73887505413599, "grad_norm": 0.3785247802734375, "learning_rate": 4.87261124945864e-05, "loss": 0.3707, "step": 1882500 }, { "epoch": 12.742258553486359, "grad_norm": 0.39114999771118164, "learning_rate": 4.872577414465136e-05, "loss": 0.3717, "step": 1883000 }, { "epoch": 12.745642052836725, "grad_norm": 0.36994487047195435, "learning_rate": 4.872543579471633e-05, "loss": 0.3701, "step": 1883500 }, { "epoch": 12.749025552187094, "grad_norm": 0.3476729989051819, "learning_rate": 4.8725097444781294e-05, "loss": 0.3711, "step": 1884000 }, { "epoch": 12.752409051537462, "grad_norm": 0.38086920976638794, "learning_rate": 4.8724759094846256e-05, "loss": 0.3695, "step": 1884500 }, { "epoch": 12.755792550887831, "grad_norm": 0.4031669497489929, "learning_rate": 4.872442074491122e-05, "loss": 0.3704, "step": 1885000 }, { "epoch": 12.759176050238198, "grad_norm": 0.36528006196022034, "learning_rate": 4.872408239497619e-05, "loss": 0.3696, "step": 1885500 }, { "epoch": 12.762559549588566, "grad_norm": 0.38576963543891907, "learning_rate": 4.872374404504115e-05, "loss": 0.3713, "step": 1886000 }, { "epoch": 12.765943048938935, "grad_norm": 0.3542245626449585, "learning_rate": 4.872340569510611e-05, "loss": 0.3706, "step": 1886500 }, { "epoch": 12.769326548289303, "grad_norm": 0.3419119119644165, "learning_rate": 4.872306734517107e-05, "loss": 0.3704, "step": 1887000 }, { "epoch": 12.77271004763967, "grad_norm": 0.32595545053482056, "learning_rate": 4.8722728995236036e-05, "loss": 0.371, "step": 1887500 }, { "epoch": 12.776093546990039, "grad_norm": 0.3801577687263489, "learning_rate": 4.8722390645301e-05, "loss": 0.3713, "step": 1888000 }, { "epoch": 12.779477046340407, "grad_norm": 0.3445073068141937, "learning_rate": 4.872205229536596e-05, "loss": 0.3713, "step": 1888500 }, { "epoch": 12.782860545690776, "grad_norm": 0.35645556449890137, "learning_rate": 4.872171394543092e-05, "loss": 0.3717, "step": 1889000 }, { "epoch": 12.786244045041144, "grad_norm": 0.35171839594841003, "learning_rate": 4.872137559549589e-05, "loss": 0.3717, "step": 1889500 }, { "epoch": 12.78962754439151, "grad_norm": 0.3259086310863495, "learning_rate": 4.872103724556085e-05, "loss": 0.3713, "step": 1890000 }, { "epoch": 12.79301104374188, "grad_norm": 0.38064926862716675, "learning_rate": 4.8720698895625815e-05, "loss": 0.3721, "step": 1890500 }, { "epoch": 12.796394543092248, "grad_norm": 0.34762662649154663, "learning_rate": 4.872036054569078e-05, "loss": 0.3707, "step": 1891000 }, { "epoch": 12.799778042442616, "grad_norm": 0.3977963328361511, "learning_rate": 4.8720022195755746e-05, "loss": 0.3705, "step": 1891500 }, { "epoch": 12.803161541792983, "grad_norm": 0.3365938067436218, "learning_rate": 4.87196838458207e-05, "loss": 0.3694, "step": 1892000 }, { "epoch": 12.806545041143352, "grad_norm": 0.36340808868408203, "learning_rate": 4.8719345495885664e-05, "loss": 0.3724, "step": 1892500 }, { "epoch": 12.80992854049372, "grad_norm": 0.35641616582870483, "learning_rate": 4.871900714595063e-05, "loss": 0.3727, "step": 1893000 }, { "epoch": 12.813312039844089, "grad_norm": 0.4009360074996948, "learning_rate": 4.8718668796015595e-05, "loss": 0.3692, "step": 1893500 }, { "epoch": 12.816695539194457, "grad_norm": 0.3709418475627899, "learning_rate": 4.871833044608056e-05, "loss": 0.3702, "step": 1894000 }, { "epoch": 12.820079038544824, "grad_norm": 0.36211636662483215, "learning_rate": 4.871799209614552e-05, "loss": 0.3692, "step": 1894500 }, { "epoch": 12.823462537895193, "grad_norm": 0.35264286398887634, "learning_rate": 4.871765374621048e-05, "loss": 0.3713, "step": 1895000 }, { "epoch": 12.826846037245561, "grad_norm": 0.36805665493011475, "learning_rate": 4.871731539627545e-05, "loss": 0.3717, "step": 1895500 }, { "epoch": 12.83022953659593, "grad_norm": 0.35103264451026917, "learning_rate": 4.871697704634041e-05, "loss": 0.3714, "step": 1896000 }, { "epoch": 12.833613035946296, "grad_norm": 0.3711964190006256, "learning_rate": 4.871663869640537e-05, "loss": 0.3718, "step": 1896500 }, { "epoch": 12.836996535296665, "grad_norm": 0.38640648126602173, "learning_rate": 4.8716300346470336e-05, "loss": 0.3704, "step": 1897000 }, { "epoch": 12.840380034647033, "grad_norm": 0.3128819465637207, "learning_rate": 4.87159619965353e-05, "loss": 0.3715, "step": 1897500 }, { "epoch": 12.843763533997402, "grad_norm": 0.3670842945575714, "learning_rate": 4.871562364660026e-05, "loss": 0.3722, "step": 1898000 }, { "epoch": 12.84714703334777, "grad_norm": 0.3582267761230469, "learning_rate": 4.871528529666522e-05, "loss": 0.3711, "step": 1898500 }, { "epoch": 12.850530532698137, "grad_norm": 0.32701876759529114, "learning_rate": 4.871494694673019e-05, "loss": 0.3709, "step": 1899000 }, { "epoch": 12.853914032048506, "grad_norm": 0.35891956090927124, "learning_rate": 4.8714608596795154e-05, "loss": 0.3701, "step": 1899500 }, { "epoch": 12.857297531398874, "grad_norm": 0.3812888562679291, "learning_rate": 4.8714270246860116e-05, "loss": 0.3696, "step": 1900000 }, { "epoch": 12.860681030749243, "grad_norm": 0.3577359914779663, "learning_rate": 4.871393189692508e-05, "loss": 0.3724, "step": 1900500 }, { "epoch": 12.86406453009961, "grad_norm": 0.36038023233413696, "learning_rate": 4.871359354699005e-05, "loss": 0.3695, "step": 1901000 }, { "epoch": 12.867448029449978, "grad_norm": 0.3661651909351349, "learning_rate": 4.8713255197055e-05, "loss": 0.3707, "step": 1901500 }, { "epoch": 12.870831528800347, "grad_norm": 0.3396393954753876, "learning_rate": 4.8712916847119964e-05, "loss": 0.3715, "step": 1902000 }, { "epoch": 12.874215028150715, "grad_norm": 0.38005658984184265, "learning_rate": 4.871257849718493e-05, "loss": 0.3705, "step": 1902500 }, { "epoch": 12.877598527501082, "grad_norm": 0.35202455520629883, "learning_rate": 4.8712240147249895e-05, "loss": 0.371, "step": 1903000 }, { "epoch": 12.88098202685145, "grad_norm": 0.3754706084728241, "learning_rate": 4.871190179731486e-05, "loss": 0.371, "step": 1903500 }, { "epoch": 12.884365526201819, "grad_norm": 0.3649231195449829, "learning_rate": 4.871156344737982e-05, "loss": 0.3707, "step": 1904000 }, { "epoch": 12.887749025552187, "grad_norm": 0.37613940238952637, "learning_rate": 4.871122509744478e-05, "loss": 0.3711, "step": 1904500 }, { "epoch": 12.891132524902556, "grad_norm": 0.39613452553749084, "learning_rate": 4.871088674750975e-05, "loss": 0.3699, "step": 1905000 }, { "epoch": 12.894516024252923, "grad_norm": 0.3531261384487152, "learning_rate": 4.871054839757471e-05, "loss": 0.3718, "step": 1905500 }, { "epoch": 12.897899523603291, "grad_norm": 0.35395076870918274, "learning_rate": 4.871021004763967e-05, "loss": 0.3711, "step": 1906000 }, { "epoch": 12.90128302295366, "grad_norm": 0.3228279948234558, "learning_rate": 4.870987169770464e-05, "loss": 0.3697, "step": 1906500 }, { "epoch": 12.904666522304028, "grad_norm": 0.3967723548412323, "learning_rate": 4.87095333477696e-05, "loss": 0.372, "step": 1907000 }, { "epoch": 12.908050021654397, "grad_norm": 0.3011283576488495, "learning_rate": 4.870919499783456e-05, "loss": 0.3712, "step": 1907500 }, { "epoch": 12.911433521004763, "grad_norm": 0.330438494682312, "learning_rate": 4.870885664789952e-05, "loss": 0.3709, "step": 1908000 }, { "epoch": 12.914817020355132, "grad_norm": 0.34110695123672485, "learning_rate": 4.870851829796449e-05, "loss": 0.3709, "step": 1908500 }, { "epoch": 12.9182005197055, "grad_norm": 0.35048842430114746, "learning_rate": 4.8708179948029454e-05, "loss": 0.3697, "step": 1909000 }, { "epoch": 12.921584019055869, "grad_norm": 0.3407607972621918, "learning_rate": 4.8707841598094416e-05, "loss": 0.3706, "step": 1909500 }, { "epoch": 12.924967518406236, "grad_norm": 0.3866593539714813, "learning_rate": 4.870750324815938e-05, "loss": 0.372, "step": 1910000 }, { "epoch": 12.928351017756604, "grad_norm": 0.3867066502571106, "learning_rate": 4.870716489822435e-05, "loss": 0.3726, "step": 1910500 }, { "epoch": 12.931734517106973, "grad_norm": 0.3892397880554199, "learning_rate": 4.87068265482893e-05, "loss": 0.3711, "step": 1911000 }, { "epoch": 12.935118016457341, "grad_norm": 0.3106846213340759, "learning_rate": 4.8706488198354265e-05, "loss": 0.3718, "step": 1911500 }, { "epoch": 12.938501515807708, "grad_norm": 0.3602654039859772, "learning_rate": 4.870614984841923e-05, "loss": 0.3714, "step": 1912000 }, { "epoch": 12.941885015158077, "grad_norm": 0.3535303771495819, "learning_rate": 4.8705811498484196e-05, "loss": 0.37, "step": 1912500 }, { "epoch": 12.945268514508445, "grad_norm": 0.377194881439209, "learning_rate": 4.870547314854916e-05, "loss": 0.3693, "step": 1913000 }, { "epoch": 12.948652013858814, "grad_norm": 0.35924094915390015, "learning_rate": 4.870513479861412e-05, "loss": 0.3705, "step": 1913500 }, { "epoch": 12.952035513209182, "grad_norm": 0.3477979004383087, "learning_rate": 4.870479644867908e-05, "loss": 0.3705, "step": 1914000 }, { "epoch": 12.955419012559549, "grad_norm": 0.31688621640205383, "learning_rate": 4.870445809874405e-05, "loss": 0.3705, "step": 1914500 }, { "epoch": 12.958802511909917, "grad_norm": 0.35452088713645935, "learning_rate": 4.870411974880901e-05, "loss": 0.3714, "step": 1915000 }, { "epoch": 12.962186011260286, "grad_norm": 0.3695243299007416, "learning_rate": 4.870378139887397e-05, "loss": 0.3708, "step": 1915500 }, { "epoch": 12.965569510610655, "grad_norm": 0.34055230021476746, "learning_rate": 4.870344304893894e-05, "loss": 0.3721, "step": 1916000 }, { "epoch": 12.968953009961021, "grad_norm": 0.3908543586730957, "learning_rate": 4.87031046990039e-05, "loss": 0.3714, "step": 1916500 }, { "epoch": 12.97233650931139, "grad_norm": 0.3592712879180908, "learning_rate": 4.870276634906886e-05, "loss": 0.3723, "step": 1917000 }, { "epoch": 12.975720008661758, "grad_norm": 0.39363932609558105, "learning_rate": 4.8702427999133824e-05, "loss": 0.3709, "step": 1917500 }, { "epoch": 12.979103508012127, "grad_norm": 0.36408188939094543, "learning_rate": 4.870208964919879e-05, "loss": 0.3697, "step": 1918000 }, { "epoch": 12.982487007362495, "grad_norm": 0.312065452337265, "learning_rate": 4.8701751299263755e-05, "loss": 0.371, "step": 1918500 }, { "epoch": 12.985870506712862, "grad_norm": 0.3716977536678314, "learning_rate": 4.870141294932872e-05, "loss": 0.3704, "step": 1919000 }, { "epoch": 12.98925400606323, "grad_norm": 0.3691862225532532, "learning_rate": 4.870107459939368e-05, "loss": 0.3712, "step": 1919500 }, { "epoch": 12.9926375054136, "grad_norm": 0.37253594398498535, "learning_rate": 4.870073624945865e-05, "loss": 0.3723, "step": 1920000 }, { "epoch": 12.996021004763968, "grad_norm": 0.36383548378944397, "learning_rate": 4.87003978995236e-05, "loss": 0.3716, "step": 1920500 }, { "epoch": 12.999404504114334, "grad_norm": 0.34657612442970276, "learning_rate": 4.8700059549588565e-05, "loss": 0.3697, "step": 1921000 }, { "epoch": 13.0, "eval_accuracy": 0.8588460308752098, "eval_loss": 0.5731586217880249, "eval_runtime": 3378.9041, "eval_samples_per_second": 86.047, "eval_steps_per_second": 5.378, "step": 1921088 }, { "epoch": 13.002788003464703, "grad_norm": 0.3782336413860321, "learning_rate": 4.869972119965353e-05, "loss": 0.3712, "step": 1921500 }, { "epoch": 13.006171502815072, "grad_norm": 0.33048200607299805, "learning_rate": 4.8699382849718496e-05, "loss": 0.3693, "step": 1922000 }, { "epoch": 13.00955500216544, "grad_norm": 0.36409974098205566, "learning_rate": 4.869904449978346e-05, "loss": 0.3674, "step": 1922500 }, { "epoch": 13.012938501515809, "grad_norm": 0.40011271834373474, "learning_rate": 4.869870614984842e-05, "loss": 0.3693, "step": 1923000 }, { "epoch": 13.016322000866175, "grad_norm": 0.38334009051322937, "learning_rate": 4.869836779991338e-05, "loss": 0.3686, "step": 1923500 }, { "epoch": 13.019705500216544, "grad_norm": 0.37485963106155396, "learning_rate": 4.869802944997835e-05, "loss": 0.3702, "step": 1924000 }, { "epoch": 13.023088999566912, "grad_norm": 0.3391663730144501, "learning_rate": 4.8697691100043314e-05, "loss": 0.3671, "step": 1924500 }, { "epoch": 13.02647249891728, "grad_norm": 0.38333529233932495, "learning_rate": 4.869735275010827e-05, "loss": 0.3676, "step": 1925000 }, { "epoch": 13.029855998267648, "grad_norm": 0.34279459714889526, "learning_rate": 4.869701440017324e-05, "loss": 0.3694, "step": 1925500 }, { "epoch": 13.033239497618016, "grad_norm": 0.37504786252975464, "learning_rate": 4.86966760502382e-05, "loss": 0.3685, "step": 1926000 }, { "epoch": 13.036622996968385, "grad_norm": 0.35027268528938293, "learning_rate": 4.869633770030316e-05, "loss": 0.368, "step": 1926500 }, { "epoch": 13.040006496318753, "grad_norm": 0.36550816893577576, "learning_rate": 4.8695999350368124e-05, "loss": 0.3692, "step": 1927000 }, { "epoch": 13.04338999566912, "grad_norm": 0.3510855734348297, "learning_rate": 4.869566100043309e-05, "loss": 0.3687, "step": 1927500 }, { "epoch": 13.046773495019488, "grad_norm": 0.3407793939113617, "learning_rate": 4.8695322650498055e-05, "loss": 0.3707, "step": 1928000 }, { "epoch": 13.050156994369857, "grad_norm": 0.38290151953697205, "learning_rate": 4.869498430056302e-05, "loss": 0.3698, "step": 1928500 }, { "epoch": 13.053540493720226, "grad_norm": 0.3659246265888214, "learning_rate": 4.869464595062798e-05, "loss": 0.368, "step": 1929000 }, { "epoch": 13.056923993070594, "grad_norm": 0.36423566937446594, "learning_rate": 4.869430760069295e-05, "loss": 0.37, "step": 1929500 }, { "epoch": 13.06030749242096, "grad_norm": 0.3652285039424896, "learning_rate": 4.8693969250757904e-05, "loss": 0.3699, "step": 1930000 }, { "epoch": 13.06369099177133, "grad_norm": 0.3280698359012604, "learning_rate": 4.8693630900822866e-05, "loss": 0.3685, "step": 1930500 }, { "epoch": 13.067074491121698, "grad_norm": 0.41137126088142395, "learning_rate": 4.869329255088783e-05, "loss": 0.3701, "step": 1931000 }, { "epoch": 13.070457990472066, "grad_norm": 0.3417060375213623, "learning_rate": 4.86929542009528e-05, "loss": 0.3677, "step": 1931500 }, { "epoch": 13.073841489822433, "grad_norm": 0.3309522271156311, "learning_rate": 4.869261585101776e-05, "loss": 0.3697, "step": 1932000 }, { "epoch": 13.077224989172802, "grad_norm": 0.3627679646015167, "learning_rate": 4.869227750108272e-05, "loss": 0.3699, "step": 1932500 }, { "epoch": 13.08060848852317, "grad_norm": 0.34668660163879395, "learning_rate": 4.8691939151147683e-05, "loss": 0.3683, "step": 1933000 }, { "epoch": 13.083991987873539, "grad_norm": 0.36464157700538635, "learning_rate": 4.869160080121265e-05, "loss": 0.3695, "step": 1933500 }, { "epoch": 13.087375487223907, "grad_norm": 0.36991873383522034, "learning_rate": 4.8691262451277614e-05, "loss": 0.3695, "step": 1934000 }, { "epoch": 13.090758986574274, "grad_norm": 0.37532421946525574, "learning_rate": 4.8690924101342577e-05, "loss": 0.3699, "step": 1934500 }, { "epoch": 13.094142485924642, "grad_norm": 0.34363842010498047, "learning_rate": 4.869058575140754e-05, "loss": 0.3706, "step": 1935000 }, { "epoch": 13.097525985275011, "grad_norm": 0.37082090973854065, "learning_rate": 4.86902474014725e-05, "loss": 0.3691, "step": 1935500 }, { "epoch": 13.10090948462538, "grad_norm": 0.36466777324676514, "learning_rate": 4.868990905153746e-05, "loss": 0.37, "step": 1936000 }, { "epoch": 13.104292983975746, "grad_norm": 0.40605488419532776, "learning_rate": 4.8689570701602425e-05, "loss": 0.3708, "step": 1936500 }, { "epoch": 13.107676483326115, "grad_norm": 0.326335608959198, "learning_rate": 4.8689232351667394e-05, "loss": 0.3693, "step": 1937000 }, { "epoch": 13.111059982676483, "grad_norm": 0.39977341890335083, "learning_rate": 4.8688894001732356e-05, "loss": 0.3699, "step": 1937500 }, { "epoch": 13.114443482026852, "grad_norm": 0.3610125780105591, "learning_rate": 4.868855565179732e-05, "loss": 0.3698, "step": 1938000 }, { "epoch": 13.11782698137722, "grad_norm": 0.352734237909317, "learning_rate": 4.868821730186228e-05, "loss": 0.3686, "step": 1938500 }, { "epoch": 13.121210480727587, "grad_norm": 0.42013734579086304, "learning_rate": 4.868787895192725e-05, "loss": 0.3717, "step": 1939000 }, { "epoch": 13.124593980077956, "grad_norm": 0.38568973541259766, "learning_rate": 4.8687540601992205e-05, "loss": 0.3703, "step": 1939500 }, { "epoch": 13.127977479428324, "grad_norm": 0.3538978397846222, "learning_rate": 4.868720225205717e-05, "loss": 0.369, "step": 1940000 }, { "epoch": 13.131360978778693, "grad_norm": 0.34834977984428406, "learning_rate": 4.868686390212213e-05, "loss": 0.3706, "step": 1940500 }, { "epoch": 13.13474447812906, "grad_norm": 0.33336830139160156, "learning_rate": 4.86865255521871e-05, "loss": 0.3697, "step": 1941000 }, { "epoch": 13.138127977479428, "grad_norm": 0.39493104815483093, "learning_rate": 4.868618720225206e-05, "loss": 0.3686, "step": 1941500 }, { "epoch": 13.141511476829796, "grad_norm": 0.3420484662055969, "learning_rate": 4.868584885231702e-05, "loss": 0.3691, "step": 1942000 }, { "epoch": 13.144894976180165, "grad_norm": 0.3717644214630127, "learning_rate": 4.8685510502381984e-05, "loss": 0.3705, "step": 1942500 }, { "epoch": 13.148278475530534, "grad_norm": 0.3905476927757263, "learning_rate": 4.868517215244695e-05, "loss": 0.3688, "step": 1943000 }, { "epoch": 13.1516619748809, "grad_norm": 0.36413437128067017, "learning_rate": 4.8684833802511915e-05, "loss": 0.3692, "step": 1943500 }, { "epoch": 13.155045474231269, "grad_norm": 0.37018147110939026, "learning_rate": 4.868449545257688e-05, "loss": 0.3703, "step": 1944000 }, { "epoch": 13.158428973581637, "grad_norm": 0.3624891936779022, "learning_rate": 4.868415710264184e-05, "loss": 0.37, "step": 1944500 }, { "epoch": 13.161812472932006, "grad_norm": 0.3513813018798828, "learning_rate": 4.86838187527068e-05, "loss": 0.3692, "step": 1945000 }, { "epoch": 13.165195972282373, "grad_norm": 0.35136500000953674, "learning_rate": 4.8683480402771764e-05, "loss": 0.3678, "step": 1945500 }, { "epoch": 13.168579471632741, "grad_norm": 0.34195035696029663, "learning_rate": 4.8683142052836726e-05, "loss": 0.3689, "step": 1946000 }, { "epoch": 13.17196297098311, "grad_norm": 0.353342205286026, "learning_rate": 4.8682803702901695e-05, "loss": 0.3708, "step": 1946500 }, { "epoch": 13.175346470333478, "grad_norm": 0.33236193656921387, "learning_rate": 4.868246535296666e-05, "loss": 0.3697, "step": 1947000 }, { "epoch": 13.178729969683847, "grad_norm": 0.40654829144477844, "learning_rate": 4.868212700303162e-05, "loss": 0.3708, "step": 1947500 }, { "epoch": 13.182113469034213, "grad_norm": 0.38720816373825073, "learning_rate": 4.868178865309658e-05, "loss": 0.3699, "step": 1948000 }, { "epoch": 13.185496968384582, "grad_norm": 0.37693819403648376, "learning_rate": 4.868145030316155e-05, "loss": 0.3685, "step": 1948500 }, { "epoch": 13.18888046773495, "grad_norm": 0.35839956998825073, "learning_rate": 4.8681111953226505e-05, "loss": 0.3704, "step": 1949000 }, { "epoch": 13.192263967085319, "grad_norm": 0.36030447483062744, "learning_rate": 4.868077360329147e-05, "loss": 0.3693, "step": 1949500 }, { "epoch": 13.195647466435686, "grad_norm": 0.37780576944351196, "learning_rate": 4.868043525335643e-05, "loss": 0.3707, "step": 1950000 }, { "epoch": 13.199030965786054, "grad_norm": 0.3539212942123413, "learning_rate": 4.86800969034214e-05, "loss": 0.3703, "step": 1950500 }, { "epoch": 13.202414465136423, "grad_norm": 0.3429529666900635, "learning_rate": 4.867975855348636e-05, "loss": 0.3684, "step": 1951000 }, { "epoch": 13.205797964486791, "grad_norm": 0.3764457106590271, "learning_rate": 4.867942020355132e-05, "loss": 0.3702, "step": 1951500 }, { "epoch": 13.209181463837158, "grad_norm": 0.3694688379764557, "learning_rate": 4.8679081853616285e-05, "loss": 0.3694, "step": 1952000 }, { "epoch": 13.212564963187527, "grad_norm": 0.3935447037220001, "learning_rate": 4.8678743503681254e-05, "loss": 0.3688, "step": 1952500 }, { "epoch": 13.215948462537895, "grad_norm": 0.3366578221321106, "learning_rate": 4.8678405153746216e-05, "loss": 0.3678, "step": 1953000 }, { "epoch": 13.219331961888264, "grad_norm": 0.3674514591693878, "learning_rate": 4.867806680381118e-05, "loss": 0.3702, "step": 1953500 }, { "epoch": 13.222715461238632, "grad_norm": 0.3328007757663727, "learning_rate": 4.867772845387614e-05, "loss": 0.3699, "step": 1954000 }, { "epoch": 13.226098960588999, "grad_norm": 0.3304157853126526, "learning_rate": 4.86773901039411e-05, "loss": 0.3688, "step": 1954500 }, { "epoch": 13.229482459939367, "grad_norm": 0.3459080755710602, "learning_rate": 4.8677051754006064e-05, "loss": 0.3684, "step": 1955000 }, { "epoch": 13.232865959289736, "grad_norm": 0.39367958903312683, "learning_rate": 4.8676713404071026e-05, "loss": 0.3703, "step": 1955500 }, { "epoch": 13.236249458640104, "grad_norm": 0.37636101245880127, "learning_rate": 4.8676375054135995e-05, "loss": 0.3699, "step": 1956000 }, { "epoch": 13.239632957990471, "grad_norm": 0.3529524505138397, "learning_rate": 4.867603670420096e-05, "loss": 0.3693, "step": 1956500 }, { "epoch": 13.24301645734084, "grad_norm": 0.33474189043045044, "learning_rate": 4.867569835426592e-05, "loss": 0.3691, "step": 1957000 }, { "epoch": 13.246399956691208, "grad_norm": 0.3529336750507355, "learning_rate": 4.867536000433088e-05, "loss": 0.371, "step": 1957500 }, { "epoch": 13.249783456041577, "grad_norm": 0.38825368881225586, "learning_rate": 4.8675021654395844e-05, "loss": 0.3695, "step": 1958000 }, { "epoch": 13.253166955391945, "grad_norm": 0.3401729166507721, "learning_rate": 4.8674683304460806e-05, "loss": 0.3701, "step": 1958500 }, { "epoch": 13.256550454742312, "grad_norm": 0.3608418405056, "learning_rate": 4.867434495452577e-05, "loss": 0.369, "step": 1959000 }, { "epoch": 13.25993395409268, "grad_norm": 0.5334931015968323, "learning_rate": 4.867400660459073e-05, "loss": 0.3689, "step": 1959500 }, { "epoch": 13.26331745344305, "grad_norm": 0.35742655396461487, "learning_rate": 4.86736682546557e-05, "loss": 0.3714, "step": 1960000 }, { "epoch": 13.266700952793418, "grad_norm": 0.369232714176178, "learning_rate": 4.867332990472066e-05, "loss": 0.3696, "step": 1960500 }, { "epoch": 13.270084452143784, "grad_norm": 0.34640198945999146, "learning_rate": 4.867299155478562e-05, "loss": 0.37, "step": 1961000 }, { "epoch": 13.273467951494153, "grad_norm": 0.3134201467037201, "learning_rate": 4.8672653204850585e-05, "loss": 0.3702, "step": 1961500 }, { "epoch": 13.276851450844521, "grad_norm": 0.4063735604286194, "learning_rate": 4.8672314854915554e-05, "loss": 0.3702, "step": 1962000 }, { "epoch": 13.28023495019489, "grad_norm": 0.40066906809806824, "learning_rate": 4.8671976504980516e-05, "loss": 0.3707, "step": 1962500 }, { "epoch": 13.283618449545258, "grad_norm": 0.38528531789779663, "learning_rate": 4.867163815504548e-05, "loss": 0.3695, "step": 1963000 }, { "epoch": 13.287001948895625, "grad_norm": 0.38084566593170166, "learning_rate": 4.867129980511044e-05, "loss": 0.3697, "step": 1963500 }, { "epoch": 13.290385448245994, "grad_norm": 0.36008328199386597, "learning_rate": 4.86709614551754e-05, "loss": 0.3694, "step": 1964000 }, { "epoch": 13.293768947596362, "grad_norm": 0.3995845317840576, "learning_rate": 4.8670623105240365e-05, "loss": 0.3701, "step": 1964500 }, { "epoch": 13.29715244694673, "grad_norm": 0.38086098432540894, "learning_rate": 4.867028475530533e-05, "loss": 0.3692, "step": 1965000 }, { "epoch": 13.300535946297098, "grad_norm": 0.3582462668418884, "learning_rate": 4.866994640537029e-05, "loss": 0.3695, "step": 1965500 }, { "epoch": 13.303919445647466, "grad_norm": 0.38480857014656067, "learning_rate": 4.866960805543526e-05, "loss": 0.3686, "step": 1966000 }, { "epoch": 13.307302944997835, "grad_norm": 0.36101385951042175, "learning_rate": 4.866926970550022e-05, "loss": 0.3694, "step": 1966500 }, { "epoch": 13.310686444348203, "grad_norm": 0.3723582625389099, "learning_rate": 4.866893135556518e-05, "loss": 0.3689, "step": 1967000 }, { "epoch": 13.314069943698572, "grad_norm": 0.38096871972084045, "learning_rate": 4.8668593005630144e-05, "loss": 0.37, "step": 1967500 }, { "epoch": 13.317453443048938, "grad_norm": 0.42835044860839844, "learning_rate": 4.8668254655695106e-05, "loss": 0.3703, "step": 1968000 }, { "epoch": 13.320836942399307, "grad_norm": 0.3604111671447754, "learning_rate": 4.866791630576007e-05, "loss": 0.3707, "step": 1968500 }, { "epoch": 13.324220441749675, "grad_norm": 0.33632153272628784, "learning_rate": 4.866757795582503e-05, "loss": 0.3699, "step": 1969000 }, { "epoch": 13.327603941100044, "grad_norm": 0.36421000957489014, "learning_rate": 4.866723960589e-05, "loss": 0.3713, "step": 1969500 }, { "epoch": 13.33098744045041, "grad_norm": 0.37115344405174255, "learning_rate": 4.866690125595496e-05, "loss": 0.3711, "step": 1970000 }, { "epoch": 13.33437093980078, "grad_norm": 0.33895623683929443, "learning_rate": 4.8666562906019924e-05, "loss": 0.3703, "step": 1970500 }, { "epoch": 13.337754439151148, "grad_norm": 0.36736783385276794, "learning_rate": 4.8666224556084886e-05, "loss": 0.3697, "step": 1971000 }, { "epoch": 13.341137938501516, "grad_norm": 0.3676416575908661, "learning_rate": 4.8665886206149855e-05, "loss": 0.3705, "step": 1971500 }, { "epoch": 13.344521437851885, "grad_norm": 0.3712790012359619, "learning_rate": 4.866554785621482e-05, "loss": 0.3692, "step": 1972000 }, { "epoch": 13.347904937202252, "grad_norm": 0.3409649431705475, "learning_rate": 4.866520950627978e-05, "loss": 0.3696, "step": 1972500 }, { "epoch": 13.35128843655262, "grad_norm": 0.4139016568660736, "learning_rate": 4.866487115634474e-05, "loss": 0.3706, "step": 1973000 }, { "epoch": 13.354671935902989, "grad_norm": 0.35674795508384705, "learning_rate": 4.86645328064097e-05, "loss": 0.37, "step": 1973500 }, { "epoch": 13.358055435253357, "grad_norm": 0.3726387023925781, "learning_rate": 4.8664194456474665e-05, "loss": 0.3698, "step": 1974000 }, { "epoch": 13.361438934603724, "grad_norm": 0.3655881881713867, "learning_rate": 4.866385610653963e-05, "loss": 0.3685, "step": 1974500 }, { "epoch": 13.364822433954092, "grad_norm": 0.37517014145851135, "learning_rate": 4.866351775660459e-05, "loss": 0.3716, "step": 1975000 }, { "epoch": 13.368205933304461, "grad_norm": 0.3558104932308197, "learning_rate": 4.866317940666956e-05, "loss": 0.3679, "step": 1975500 }, { "epoch": 13.37158943265483, "grad_norm": 0.35728612542152405, "learning_rate": 4.866284105673452e-05, "loss": 0.3697, "step": 1976000 }, { "epoch": 13.374972932005196, "grad_norm": 0.3204164505004883, "learning_rate": 4.866250270679948e-05, "loss": 0.37, "step": 1976500 }, { "epoch": 13.378356431355565, "grad_norm": 0.37354809045791626, "learning_rate": 4.8662164356864445e-05, "loss": 0.3704, "step": 1977000 }, { "epoch": 13.381739930705933, "grad_norm": 0.32038918137550354, "learning_rate": 4.866182600692941e-05, "loss": 0.3684, "step": 1977500 }, { "epoch": 13.385123430056302, "grad_norm": 0.3417024314403534, "learning_rate": 4.866148765699437e-05, "loss": 0.3701, "step": 1978000 }, { "epoch": 13.38850692940667, "grad_norm": 0.36151084303855896, "learning_rate": 4.866114930705933e-05, "loss": 0.3697, "step": 1978500 }, { "epoch": 13.391890428757037, "grad_norm": 0.3163359761238098, "learning_rate": 4.86608109571243e-05, "loss": 0.3698, "step": 1979000 }, { "epoch": 13.395273928107406, "grad_norm": 0.3717551529407501, "learning_rate": 4.866047260718926e-05, "loss": 0.3707, "step": 1979500 }, { "epoch": 13.398657427457774, "grad_norm": 0.38001298904418945, "learning_rate": 4.8660134257254225e-05, "loss": 0.3697, "step": 1980000 }, { "epoch": 13.402040926808143, "grad_norm": 0.3517325818538666, "learning_rate": 4.865979590731919e-05, "loss": 0.3699, "step": 1980500 }, { "epoch": 13.40542442615851, "grad_norm": 0.33883053064346313, "learning_rate": 4.8659457557384156e-05, "loss": 0.3714, "step": 1981000 }, { "epoch": 13.408807925508878, "grad_norm": 0.3610602617263794, "learning_rate": 4.865911920744912e-05, "loss": 0.3723, "step": 1981500 }, { "epoch": 13.412191424859246, "grad_norm": 0.35467588901519775, "learning_rate": 4.865878085751408e-05, "loss": 0.3699, "step": 1982000 }, { "epoch": 13.415574924209615, "grad_norm": 0.3825504183769226, "learning_rate": 4.8658442507579035e-05, "loss": 0.3719, "step": 1982500 }, { "epoch": 13.418958423559983, "grad_norm": 0.38568857312202454, "learning_rate": 4.8658104157644004e-05, "loss": 0.3688, "step": 1983000 }, { "epoch": 13.42234192291035, "grad_norm": 0.3631773591041565, "learning_rate": 4.8657765807708966e-05, "loss": 0.3697, "step": 1983500 }, { "epoch": 13.425725422260719, "grad_norm": 0.35028496384620667, "learning_rate": 4.865742745777393e-05, "loss": 0.3695, "step": 1984000 }, { "epoch": 13.429108921611087, "grad_norm": 0.3581047058105469, "learning_rate": 4.865708910783889e-05, "loss": 0.3707, "step": 1984500 }, { "epoch": 13.432492420961456, "grad_norm": 0.35143864154815674, "learning_rate": 4.865675075790386e-05, "loss": 0.3699, "step": 1985000 }, { "epoch": 13.435875920311823, "grad_norm": 0.3333075940608978, "learning_rate": 4.865641240796882e-05, "loss": 0.3703, "step": 1985500 }, { "epoch": 13.439259419662191, "grad_norm": 0.35157662630081177, "learning_rate": 4.8656074058033784e-05, "loss": 0.3701, "step": 1986000 }, { "epoch": 13.44264291901256, "grad_norm": 0.36174359917640686, "learning_rate": 4.8655735708098746e-05, "loss": 0.3699, "step": 1986500 }, { "epoch": 13.446026418362928, "grad_norm": 0.36083289980888367, "learning_rate": 4.865539735816371e-05, "loss": 0.3692, "step": 1987000 }, { "epoch": 13.449409917713297, "grad_norm": 0.3417918384075165, "learning_rate": 4.865505900822867e-05, "loss": 0.3702, "step": 1987500 }, { "epoch": 13.452793417063663, "grad_norm": 0.3874722421169281, "learning_rate": 4.865472065829363e-05, "loss": 0.3703, "step": 1988000 }, { "epoch": 13.456176916414032, "grad_norm": 0.36781999468803406, "learning_rate": 4.86543823083586e-05, "loss": 0.3702, "step": 1988500 }, { "epoch": 13.4595604157644, "grad_norm": 0.3900182247161865, "learning_rate": 4.865404395842356e-05, "loss": 0.3708, "step": 1989000 }, { "epoch": 13.462943915114769, "grad_norm": 0.36022305488586426, "learning_rate": 4.8653705608488525e-05, "loss": 0.3699, "step": 1989500 }, { "epoch": 13.466327414465136, "grad_norm": 0.3545489013195038, "learning_rate": 4.865336725855349e-05, "loss": 0.3713, "step": 1990000 }, { "epoch": 13.469710913815504, "grad_norm": 0.3593008518218994, "learning_rate": 4.8653028908618456e-05, "loss": 0.3697, "step": 1990500 }, { "epoch": 13.473094413165873, "grad_norm": 0.3376884460449219, "learning_rate": 4.865269055868342e-05, "loss": 0.3716, "step": 1991000 }, { "epoch": 13.476477912516241, "grad_norm": 0.3358532190322876, "learning_rate": 4.865235220874838e-05, "loss": 0.3691, "step": 1991500 }, { "epoch": 13.479861411866608, "grad_norm": 0.3432691693305969, "learning_rate": 4.8652013858813336e-05, "loss": 0.3709, "step": 1992000 }, { "epoch": 13.483244911216977, "grad_norm": 0.41430553793907166, "learning_rate": 4.8651675508878305e-05, "loss": 0.3699, "step": 1992500 }, { "epoch": 13.486628410567345, "grad_norm": 0.34607023000717163, "learning_rate": 4.865133715894327e-05, "loss": 0.3695, "step": 1993000 }, { "epoch": 13.490011909917714, "grad_norm": 0.3873082101345062, "learning_rate": 4.865099880900823e-05, "loss": 0.3688, "step": 1993500 }, { "epoch": 13.493395409268082, "grad_norm": 0.3736827075481415, "learning_rate": 4.865066045907319e-05, "loss": 0.3709, "step": 1994000 }, { "epoch": 13.496778908618449, "grad_norm": 0.33309677243232727, "learning_rate": 4.865032210913816e-05, "loss": 0.3695, "step": 1994500 }, { "epoch": 13.500162407968817, "grad_norm": 0.3392011523246765, "learning_rate": 4.864998375920312e-05, "loss": 0.3705, "step": 1995000 }, { "epoch": 13.503545907319186, "grad_norm": 0.36364486813545227, "learning_rate": 4.8649645409268084e-05, "loss": 0.37, "step": 1995500 }, { "epoch": 13.506929406669554, "grad_norm": 0.36173754930496216, "learning_rate": 4.8649307059333046e-05, "loss": 0.3712, "step": 1996000 }, { "epoch": 13.510312906019923, "grad_norm": 0.40453898906707764, "learning_rate": 4.8648968709398015e-05, "loss": 0.3699, "step": 1996500 }, { "epoch": 13.51369640537029, "grad_norm": 0.369381308555603, "learning_rate": 4.864863035946297e-05, "loss": 0.3679, "step": 1997000 }, { "epoch": 13.517079904720658, "grad_norm": 0.37291908264160156, "learning_rate": 4.864829200952793e-05, "loss": 0.3714, "step": 1997500 }, { "epoch": 13.520463404071027, "grad_norm": 0.3864448368549347, "learning_rate": 4.86479536595929e-05, "loss": 0.3707, "step": 1998000 }, { "epoch": 13.523846903421395, "grad_norm": 0.39676347374916077, "learning_rate": 4.8647615309657864e-05, "loss": 0.37, "step": 1998500 }, { "epoch": 13.527230402771762, "grad_norm": 0.34468865394592285, "learning_rate": 4.8647276959722826e-05, "loss": 0.3713, "step": 1999000 }, { "epoch": 13.53061390212213, "grad_norm": 0.34684211015701294, "learning_rate": 4.864693860978779e-05, "loss": 0.3702, "step": 1999500 }, { "epoch": 13.533997401472499, "grad_norm": 0.3568776249885559, "learning_rate": 4.864660025985276e-05, "loss": 0.3699, "step": 2000000 }, { "epoch": 13.537380900822868, "grad_norm": 0.36592113971710205, "learning_rate": 4.864626190991772e-05, "loss": 0.3716, "step": 2000500 }, { "epoch": 13.540764400173234, "grad_norm": 0.348983496427536, "learning_rate": 4.864592355998268e-05, "loss": 0.3689, "step": 2001000 }, { "epoch": 13.544147899523603, "grad_norm": 0.3736741542816162, "learning_rate": 4.8645585210047636e-05, "loss": 0.3713, "step": 2001500 }, { "epoch": 13.547531398873971, "grad_norm": 0.33904919028282166, "learning_rate": 4.8645246860112605e-05, "loss": 0.3689, "step": 2002000 }, { "epoch": 13.55091489822434, "grad_norm": 0.3362513482570648, "learning_rate": 4.864490851017757e-05, "loss": 0.3701, "step": 2002500 }, { "epoch": 13.554298397574708, "grad_norm": 0.34666186571121216, "learning_rate": 4.864457016024253e-05, "loss": 0.3702, "step": 2003000 }, { "epoch": 13.557681896925075, "grad_norm": 0.3963722586631775, "learning_rate": 4.864423181030749e-05, "loss": 0.3703, "step": 2003500 }, { "epoch": 13.561065396275444, "grad_norm": 0.3478109538555145, "learning_rate": 4.864389346037246e-05, "loss": 0.3692, "step": 2004000 }, { "epoch": 13.564448895625812, "grad_norm": 0.3706079423427582, "learning_rate": 4.864355511043742e-05, "loss": 0.3685, "step": 2004500 }, { "epoch": 13.56783239497618, "grad_norm": 0.35787704586982727, "learning_rate": 4.8643216760502385e-05, "loss": 0.3693, "step": 2005000 }, { "epoch": 13.571215894326548, "grad_norm": 0.4233979284763336, "learning_rate": 4.864287841056735e-05, "loss": 0.3718, "step": 2005500 }, { "epoch": 13.574599393676916, "grad_norm": 0.3667444586753845, "learning_rate": 4.8642540060632316e-05, "loss": 0.3696, "step": 2006000 }, { "epoch": 13.577982893027285, "grad_norm": 0.34215712547302246, "learning_rate": 4.864220171069727e-05, "loss": 0.3703, "step": 2006500 }, { "epoch": 13.581366392377653, "grad_norm": 0.3960571587085724, "learning_rate": 4.864186336076223e-05, "loss": 0.3697, "step": 2007000 }, { "epoch": 13.584749891728022, "grad_norm": 0.34247100353240967, "learning_rate": 4.86415250108272e-05, "loss": 0.3675, "step": 2007500 }, { "epoch": 13.588133391078388, "grad_norm": 0.3906427323818207, "learning_rate": 4.8641186660892164e-05, "loss": 0.3712, "step": 2008000 }, { "epoch": 13.591516890428757, "grad_norm": 0.36824533343315125, "learning_rate": 4.8640848310957126e-05, "loss": 0.3707, "step": 2008500 }, { "epoch": 13.594900389779125, "grad_norm": 0.37552452087402344, "learning_rate": 4.864050996102209e-05, "loss": 0.3704, "step": 2009000 }, { "epoch": 13.598283889129494, "grad_norm": 0.35757166147232056, "learning_rate": 4.864017161108706e-05, "loss": 0.3696, "step": 2009500 }, { "epoch": 13.60166738847986, "grad_norm": 0.38713139295578003, "learning_rate": 4.863983326115202e-05, "loss": 0.3708, "step": 2010000 }, { "epoch": 13.60505088783023, "grad_norm": 0.3677159547805786, "learning_rate": 4.863949491121698e-05, "loss": 0.3692, "step": 2010500 }, { "epoch": 13.608434387180598, "grad_norm": 0.4078529477119446, "learning_rate": 4.863915656128194e-05, "loss": 0.3709, "step": 2011000 }, { "epoch": 13.611817886530966, "grad_norm": 0.38677313923835754, "learning_rate": 4.8638818211346906e-05, "loss": 0.3703, "step": 2011500 }, { "epoch": 13.615201385881335, "grad_norm": 0.35637909173965454, "learning_rate": 4.863847986141187e-05, "loss": 0.3705, "step": 2012000 }, { "epoch": 13.618584885231702, "grad_norm": 0.3691534101963043, "learning_rate": 4.863814151147683e-05, "loss": 0.3681, "step": 2012500 }, { "epoch": 13.62196838458207, "grad_norm": 0.40598565340042114, "learning_rate": 4.863780316154179e-05, "loss": 0.3702, "step": 2013000 }, { "epoch": 13.625351883932439, "grad_norm": 0.34304094314575195, "learning_rate": 4.863746481160676e-05, "loss": 0.3701, "step": 2013500 }, { "epoch": 13.628735383282807, "grad_norm": 0.3206089735031128, "learning_rate": 4.863712646167172e-05, "loss": 0.3704, "step": 2014000 }, { "epoch": 13.632118882633174, "grad_norm": 0.32081344723701477, "learning_rate": 4.8636788111736685e-05, "loss": 0.3696, "step": 2014500 }, { "epoch": 13.635502381983542, "grad_norm": 0.3509119749069214, "learning_rate": 4.863644976180165e-05, "loss": 0.3717, "step": 2015000 }, { "epoch": 13.63888588133391, "grad_norm": 0.36864274740219116, "learning_rate": 4.8636111411866616e-05, "loss": 0.3698, "step": 2015500 }, { "epoch": 13.64226938068428, "grad_norm": 0.3501897156238556, "learning_rate": 4.863577306193157e-05, "loss": 0.3711, "step": 2016000 }, { "epoch": 13.645652880034646, "grad_norm": 0.4033876657485962, "learning_rate": 4.8635434711996534e-05, "loss": 0.3701, "step": 2016500 }, { "epoch": 13.649036379385015, "grad_norm": 0.335446834564209, "learning_rate": 4.86350963620615e-05, "loss": 0.3679, "step": 2017000 }, { "epoch": 13.652419878735383, "grad_norm": 0.3642156720161438, "learning_rate": 4.8634758012126465e-05, "loss": 0.3701, "step": 2017500 }, { "epoch": 13.655803378085752, "grad_norm": 0.39847907423973083, "learning_rate": 4.863441966219143e-05, "loss": 0.3706, "step": 2018000 }, { "epoch": 13.65918687743612, "grad_norm": 0.3453882038593292, "learning_rate": 4.863408131225639e-05, "loss": 0.3724, "step": 2018500 }, { "epoch": 13.662570376786487, "grad_norm": 0.3945293128490448, "learning_rate": 4.863374296232136e-05, "loss": 0.3697, "step": 2019000 }, { "epoch": 13.665953876136856, "grad_norm": 0.31794705986976624, "learning_rate": 4.863340461238632e-05, "loss": 0.3693, "step": 2019500 }, { "epoch": 13.669337375487224, "grad_norm": 0.33316007256507874, "learning_rate": 4.863306626245128e-05, "loss": 0.3682, "step": 2020000 }, { "epoch": 13.672720874837593, "grad_norm": 0.3561200499534607, "learning_rate": 4.863272791251624e-05, "loss": 0.3703, "step": 2020500 }, { "epoch": 13.676104374187961, "grad_norm": 0.38469579815864563, "learning_rate": 4.8632389562581207e-05, "loss": 0.3722, "step": 2021000 }, { "epoch": 13.679487873538328, "grad_norm": 0.3694473206996918, "learning_rate": 4.863205121264617e-05, "loss": 0.3709, "step": 2021500 }, { "epoch": 13.682871372888696, "grad_norm": 0.39557352662086487, "learning_rate": 4.863171286271113e-05, "loss": 0.3718, "step": 2022000 }, { "epoch": 13.686254872239065, "grad_norm": 0.31979459524154663, "learning_rate": 4.863137451277609e-05, "loss": 0.3715, "step": 2022500 }, { "epoch": 13.689638371589433, "grad_norm": 0.3282807171344757, "learning_rate": 4.863103616284106e-05, "loss": 0.3696, "step": 2023000 }, { "epoch": 13.6930218709398, "grad_norm": 0.3137219250202179, "learning_rate": 4.8630697812906024e-05, "loss": 0.3695, "step": 2023500 }, { "epoch": 13.696405370290169, "grad_norm": 0.390055388212204, "learning_rate": 4.8630359462970986e-05, "loss": 0.3725, "step": 2024000 }, { "epoch": 13.699788869640537, "grad_norm": 0.3728681206703186, "learning_rate": 4.863002111303595e-05, "loss": 0.3699, "step": 2024500 }, { "epoch": 13.703172368990906, "grad_norm": 0.3733259439468384, "learning_rate": 4.862968276310092e-05, "loss": 0.3701, "step": 2025000 }, { "epoch": 13.706555868341272, "grad_norm": 0.4084514379501343, "learning_rate": 4.862934441316587e-05, "loss": 0.3699, "step": 2025500 }, { "epoch": 13.709939367691641, "grad_norm": 0.3510924279689789, "learning_rate": 4.8629006063230835e-05, "loss": 0.3704, "step": 2026000 }, { "epoch": 13.71332286704201, "grad_norm": 0.3545098304748535, "learning_rate": 4.8628667713295803e-05, "loss": 0.3707, "step": 2026500 }, { "epoch": 13.716706366392378, "grad_norm": 0.40483832359313965, "learning_rate": 4.8628329363360766e-05, "loss": 0.3691, "step": 2027000 }, { "epoch": 13.720089865742747, "grad_norm": 0.4002268314361572, "learning_rate": 4.862799101342573e-05, "loss": 0.3711, "step": 2027500 }, { "epoch": 13.723473365093113, "grad_norm": 0.3556060194969177, "learning_rate": 4.862765266349069e-05, "loss": 0.3708, "step": 2028000 }, { "epoch": 13.726856864443482, "grad_norm": 0.3630557358264923, "learning_rate": 4.862731431355565e-05, "loss": 0.3715, "step": 2028500 }, { "epoch": 13.73024036379385, "grad_norm": 0.3628033995628357, "learning_rate": 4.862697596362062e-05, "loss": 0.3716, "step": 2029000 }, { "epoch": 13.733623863144219, "grad_norm": 0.3553501069545746, "learning_rate": 4.862663761368558e-05, "loss": 0.3696, "step": 2029500 }, { "epoch": 13.737007362494586, "grad_norm": 0.3359854817390442, "learning_rate": 4.862629926375054e-05, "loss": 0.3685, "step": 2030000 }, { "epoch": 13.740390861844954, "grad_norm": 0.3862282335758209, "learning_rate": 4.862596091381551e-05, "loss": 0.3711, "step": 2030500 }, { "epoch": 13.743774361195323, "grad_norm": 0.37626171112060547, "learning_rate": 4.862562256388047e-05, "loss": 0.3689, "step": 2031000 }, { "epoch": 13.747157860545691, "grad_norm": 0.3787476718425751, "learning_rate": 4.862528421394543e-05, "loss": 0.3714, "step": 2031500 }, { "epoch": 13.750541359896058, "grad_norm": 0.34771743416786194, "learning_rate": 4.8624945864010394e-05, "loss": 0.3708, "step": 2032000 }, { "epoch": 13.753924859246426, "grad_norm": 0.32347938418388367, "learning_rate": 4.862460751407536e-05, "loss": 0.371, "step": 2032500 }, { "epoch": 13.757308358596795, "grad_norm": 0.35285308957099915, "learning_rate": 4.8624269164140325e-05, "loss": 0.3699, "step": 2033000 }, { "epoch": 13.760691857947164, "grad_norm": 0.38633468747138977, "learning_rate": 4.862393081420529e-05, "loss": 0.3715, "step": 2033500 }, { "epoch": 13.764075357297532, "grad_norm": 0.38737979531288147, "learning_rate": 4.862359246427025e-05, "loss": 0.3687, "step": 2034000 }, { "epoch": 13.767458856647899, "grad_norm": 0.33574140071868896, "learning_rate": 4.862325411433522e-05, "loss": 0.3696, "step": 2034500 }, { "epoch": 13.770842355998267, "grad_norm": 0.3829055726528168, "learning_rate": 4.862291576440017e-05, "loss": 0.3702, "step": 2035000 }, { "epoch": 13.774225855348636, "grad_norm": 0.36047014594078064, "learning_rate": 4.8622577414465135e-05, "loss": 0.3702, "step": 2035500 }, { "epoch": 13.777609354699004, "grad_norm": 0.34504133462905884, "learning_rate": 4.8622239064530104e-05, "loss": 0.37, "step": 2036000 }, { "epoch": 13.780992854049373, "grad_norm": 0.3734630346298218, "learning_rate": 4.8621900714595066e-05, "loss": 0.3705, "step": 2036500 }, { "epoch": 13.78437635339974, "grad_norm": 0.3799772262573242, "learning_rate": 4.862156236466003e-05, "loss": 0.3701, "step": 2037000 }, { "epoch": 13.787759852750108, "grad_norm": 0.35689836740493774, "learning_rate": 4.862122401472499e-05, "loss": 0.3705, "step": 2037500 }, { "epoch": 13.791143352100477, "grad_norm": 0.38452208042144775, "learning_rate": 4.862088566478995e-05, "loss": 0.3695, "step": 2038000 }, { "epoch": 13.794526851450845, "grad_norm": 0.37008896470069885, "learning_rate": 4.862054731485492e-05, "loss": 0.3708, "step": 2038500 }, { "epoch": 13.797910350801212, "grad_norm": 0.3450815677642822, "learning_rate": 4.8620208964919884e-05, "loss": 0.37, "step": 2039000 }, { "epoch": 13.80129385015158, "grad_norm": 0.330727219581604, "learning_rate": 4.861987061498484e-05, "loss": 0.3714, "step": 2039500 }, { "epoch": 13.804677349501949, "grad_norm": 0.37175223231315613, "learning_rate": 4.861953226504981e-05, "loss": 0.37, "step": 2040000 }, { "epoch": 13.808060848852318, "grad_norm": 0.3831532299518585, "learning_rate": 4.861919391511477e-05, "loss": 0.3693, "step": 2040500 }, { "epoch": 13.811444348202684, "grad_norm": 0.33013632893562317, "learning_rate": 4.861885556517973e-05, "loss": 0.3702, "step": 2041000 }, { "epoch": 13.814827847553053, "grad_norm": 0.35006463527679443, "learning_rate": 4.8618517215244694e-05, "loss": 0.3711, "step": 2041500 }, { "epoch": 13.818211346903421, "grad_norm": 0.37648430466651917, "learning_rate": 4.861817886530966e-05, "loss": 0.3702, "step": 2042000 }, { "epoch": 13.82159484625379, "grad_norm": 0.35369744896888733, "learning_rate": 4.8617840515374625e-05, "loss": 0.3702, "step": 2042500 }, { "epoch": 13.824978345604158, "grad_norm": 0.3736339211463928, "learning_rate": 4.861750216543959e-05, "loss": 0.3711, "step": 2043000 }, { "epoch": 13.828361844954525, "grad_norm": 0.3667067885398865, "learning_rate": 4.861716381550455e-05, "loss": 0.3702, "step": 2043500 }, { "epoch": 13.831745344304894, "grad_norm": 0.34860265254974365, "learning_rate": 4.861682546556952e-05, "loss": 0.3707, "step": 2044000 }, { "epoch": 13.835128843655262, "grad_norm": 0.3995159864425659, "learning_rate": 4.8616487115634474e-05, "loss": 0.3717, "step": 2044500 }, { "epoch": 13.83851234300563, "grad_norm": 0.34385403990745544, "learning_rate": 4.8616148765699436e-05, "loss": 0.3699, "step": 2045000 }, { "epoch": 13.841895842355997, "grad_norm": 0.37954169511795044, "learning_rate": 4.86158104157644e-05, "loss": 0.3714, "step": 2045500 }, { "epoch": 13.845279341706366, "grad_norm": 0.3389084041118622, "learning_rate": 4.861547206582937e-05, "loss": 0.3699, "step": 2046000 }, { "epoch": 13.848662841056735, "grad_norm": 0.3550108075141907, "learning_rate": 4.861513371589433e-05, "loss": 0.3708, "step": 2046500 }, { "epoch": 13.852046340407103, "grad_norm": 0.37702715396881104, "learning_rate": 4.861479536595929e-05, "loss": 0.369, "step": 2047000 }, { "epoch": 13.855429839757472, "grad_norm": 0.37218254804611206, "learning_rate": 4.861445701602425e-05, "loss": 0.3706, "step": 2047500 }, { "epoch": 13.858813339107838, "grad_norm": 0.37504270672798157, "learning_rate": 4.861411866608922e-05, "loss": 0.3699, "step": 2048000 }, { "epoch": 13.862196838458207, "grad_norm": 0.3862540125846863, "learning_rate": 4.8613780316154184e-05, "loss": 0.3698, "step": 2048500 }, { "epoch": 13.865580337808575, "grad_norm": 0.36661839485168457, "learning_rate": 4.861344196621914e-05, "loss": 0.3702, "step": 2049000 }, { "epoch": 13.868963837158944, "grad_norm": 0.33589258790016174, "learning_rate": 4.861310361628411e-05, "loss": 0.3699, "step": 2049500 }, { "epoch": 13.87234733650931, "grad_norm": 0.41161099076271057, "learning_rate": 4.861276526634907e-05, "loss": 0.3714, "step": 2050000 }, { "epoch": 13.87573083585968, "grad_norm": 0.35029563307762146, "learning_rate": 4.861242691641403e-05, "loss": 0.3713, "step": 2050500 }, { "epoch": 13.879114335210048, "grad_norm": 0.35823801159858704, "learning_rate": 4.8612088566478995e-05, "loss": 0.3708, "step": 2051000 }, { "epoch": 13.882497834560416, "grad_norm": 0.3975470960140228, "learning_rate": 4.8611750216543964e-05, "loss": 0.3697, "step": 2051500 }, { "epoch": 13.885881333910785, "grad_norm": 0.34708172082901, "learning_rate": 4.8611411866608926e-05, "loss": 0.37, "step": 2052000 }, { "epoch": 13.889264833261151, "grad_norm": 0.390287846326828, "learning_rate": 4.861107351667389e-05, "loss": 0.3705, "step": 2052500 }, { "epoch": 13.89264833261152, "grad_norm": 0.3469938039779663, "learning_rate": 4.861073516673885e-05, "loss": 0.3701, "step": 2053000 }, { "epoch": 13.896031831961889, "grad_norm": 0.38159653544425964, "learning_rate": 4.861039681680382e-05, "loss": 0.3716, "step": 2053500 }, { "epoch": 13.899415331312257, "grad_norm": 0.43335747718811035, "learning_rate": 4.8610058466868774e-05, "loss": 0.3691, "step": 2054000 }, { "epoch": 13.902798830662624, "grad_norm": 0.3556252419948578, "learning_rate": 4.8609720116933736e-05, "loss": 0.3716, "step": 2054500 }, { "epoch": 13.906182330012992, "grad_norm": 0.3873630166053772, "learning_rate": 4.86093817669987e-05, "loss": 0.3703, "step": 2055000 }, { "epoch": 13.90956582936336, "grad_norm": 0.3356965482234955, "learning_rate": 4.860904341706367e-05, "loss": 0.3694, "step": 2055500 }, { "epoch": 13.91294932871373, "grad_norm": 0.37060633301734924, "learning_rate": 4.860870506712863e-05, "loss": 0.3695, "step": 2056000 }, { "epoch": 13.916332828064096, "grad_norm": 0.372291624546051, "learning_rate": 4.860836671719359e-05, "loss": 0.3683, "step": 2056500 }, { "epoch": 13.919716327414465, "grad_norm": 0.34234097599983215, "learning_rate": 4.8608028367258554e-05, "loss": 0.3708, "step": 2057000 }, { "epoch": 13.923099826764833, "grad_norm": 0.37905994057655334, "learning_rate": 4.860769001732352e-05, "loss": 0.3689, "step": 2057500 }, { "epoch": 13.926483326115202, "grad_norm": 0.3877720534801483, "learning_rate": 4.8607351667388485e-05, "loss": 0.3704, "step": 2058000 }, { "epoch": 13.92986682546557, "grad_norm": 0.32991376519203186, "learning_rate": 4.860701331745345e-05, "loss": 0.3694, "step": 2058500 }, { "epoch": 13.933250324815937, "grad_norm": 0.36483174562454224, "learning_rate": 4.860667496751841e-05, "loss": 0.3689, "step": 2059000 }, { "epoch": 13.936633824166305, "grad_norm": 0.4076962172985077, "learning_rate": 4.860633661758337e-05, "loss": 0.3707, "step": 2059500 }, { "epoch": 13.940017323516674, "grad_norm": 0.38313883543014526, "learning_rate": 4.860599826764833e-05, "loss": 0.3684, "step": 2060000 }, { "epoch": 13.943400822867043, "grad_norm": 0.34798476099967957, "learning_rate": 4.8605659917713295e-05, "loss": 0.3713, "step": 2060500 }, { "epoch": 13.946784322217411, "grad_norm": 0.3725389540195465, "learning_rate": 4.8605321567778264e-05, "loss": 0.3698, "step": 2061000 }, { "epoch": 13.950167821567778, "grad_norm": 0.40532901883125305, "learning_rate": 4.8604983217843226e-05, "loss": 0.3695, "step": 2061500 }, { "epoch": 13.953551320918146, "grad_norm": 0.38492247462272644, "learning_rate": 4.860464486790819e-05, "loss": 0.3696, "step": 2062000 }, { "epoch": 13.956934820268515, "grad_norm": 0.34857288002967834, "learning_rate": 4.860430651797315e-05, "loss": 0.369, "step": 2062500 }, { "epoch": 13.960318319618883, "grad_norm": 0.3815152645111084, "learning_rate": 4.860396816803812e-05, "loss": 0.3716, "step": 2063000 }, { "epoch": 13.96370181896925, "grad_norm": 0.35839033126831055, "learning_rate": 4.8603629818103075e-05, "loss": 0.3691, "step": 2063500 }, { "epoch": 13.967085318319619, "grad_norm": 0.36373916268348694, "learning_rate": 4.860329146816804e-05, "loss": 0.3685, "step": 2064000 }, { "epoch": 13.970468817669987, "grad_norm": 0.3275459408760071, "learning_rate": 4.8602953118233e-05, "loss": 0.37, "step": 2064500 }, { "epoch": 13.973852317020356, "grad_norm": 0.3416304886341095, "learning_rate": 4.860261476829797e-05, "loss": 0.3697, "step": 2065000 }, { "epoch": 13.977235816370722, "grad_norm": 0.34164056181907654, "learning_rate": 4.860227641836293e-05, "loss": 0.3706, "step": 2065500 }, { "epoch": 13.980619315721091, "grad_norm": 0.37814369797706604, "learning_rate": 4.860193806842789e-05, "loss": 0.3698, "step": 2066000 }, { "epoch": 13.98400281507146, "grad_norm": 0.395485520362854, "learning_rate": 4.8601599718492854e-05, "loss": 0.3692, "step": 2066500 }, { "epoch": 13.987386314421828, "grad_norm": 0.38372567296028137, "learning_rate": 4.860126136855782e-05, "loss": 0.3712, "step": 2067000 }, { "epoch": 13.990769813772197, "grad_norm": 0.3979295492172241, "learning_rate": 4.8600923018622785e-05, "loss": 0.3688, "step": 2067500 }, { "epoch": 13.994153313122563, "grad_norm": 0.37074190378189087, "learning_rate": 4.860058466868775e-05, "loss": 0.371, "step": 2068000 }, { "epoch": 13.997536812472932, "grad_norm": 0.37351343035697937, "learning_rate": 4.860024631875271e-05, "loss": 0.3691, "step": 2068500 }, { "epoch": 14.0, "eval_accuracy": 0.8588137544938473, "eval_loss": 0.5739654302597046, "eval_runtime": 3398.8028, "eval_samples_per_second": 85.543, "eval_steps_per_second": 5.347, "step": 2068864 }, { "epoch": 14.0009203118233, "grad_norm": 0.3616331219673157, "learning_rate": 4.859990796881767e-05, "loss": 0.369, "step": 2069000 }, { "epoch": 14.004303811173669, "grad_norm": 0.3238430619239807, "learning_rate": 4.8599569618882634e-05, "loss": 0.3689, "step": 2069500 }, { "epoch": 14.007687310524036, "grad_norm": 0.39138802886009216, "learning_rate": 4.8599231268947596e-05, "loss": 0.368, "step": 2070000 }, { "epoch": 14.011070809874404, "grad_norm": 0.3550621569156647, "learning_rate": 4.8598892919012565e-05, "loss": 0.3691, "step": 2070500 }, { "epoch": 14.014454309224773, "grad_norm": 0.3459855914115906, "learning_rate": 4.859855456907753e-05, "loss": 0.3684, "step": 2071000 }, { "epoch": 14.017837808575141, "grad_norm": 0.35226932168006897, "learning_rate": 4.859821621914249e-05, "loss": 0.368, "step": 2071500 }, { "epoch": 14.02122130792551, "grad_norm": 0.3718564808368683, "learning_rate": 4.859787786920745e-05, "loss": 0.369, "step": 2072000 }, { "epoch": 14.024604807275876, "grad_norm": 0.3818054497241974, "learning_rate": 4.859753951927242e-05, "loss": 0.3677, "step": 2072500 }, { "epoch": 14.027988306626245, "grad_norm": 0.35006603598594666, "learning_rate": 4.8597201169337376e-05, "loss": 0.3679, "step": 2073000 }, { "epoch": 14.031371805976613, "grad_norm": 0.387045681476593, "learning_rate": 4.859686281940234e-05, "loss": 0.3694, "step": 2073500 }, { "epoch": 14.034755305326982, "grad_norm": 0.34844982624053955, "learning_rate": 4.85965244694673e-05, "loss": 0.3688, "step": 2074000 }, { "epoch": 14.038138804677349, "grad_norm": 0.3560154139995575, "learning_rate": 4.859618611953227e-05, "loss": 0.3675, "step": 2074500 }, { "epoch": 14.041522304027717, "grad_norm": 0.3833746910095215, "learning_rate": 4.859584776959723e-05, "loss": 0.369, "step": 2075000 }, { "epoch": 14.044905803378086, "grad_norm": 0.33990758657455444, "learning_rate": 4.859550941966219e-05, "loss": 0.3686, "step": 2075500 }, { "epoch": 14.048289302728454, "grad_norm": 0.3707942068576813, "learning_rate": 4.8595171069727155e-05, "loss": 0.3701, "step": 2076000 }, { "epoch": 14.051672802078823, "grad_norm": 0.3842766582965851, "learning_rate": 4.8594832719792124e-05, "loss": 0.3693, "step": 2076500 }, { "epoch": 14.05505630142919, "grad_norm": 0.3341391682624817, "learning_rate": 4.8594494369857086e-05, "loss": 0.3707, "step": 2077000 }, { "epoch": 14.058439800779558, "grad_norm": 0.32680997252464294, "learning_rate": 4.859415601992205e-05, "loss": 0.3681, "step": 2077500 }, { "epoch": 14.061823300129927, "grad_norm": 0.38530468940734863, "learning_rate": 4.859381766998701e-05, "loss": 0.3691, "step": 2078000 }, { "epoch": 14.065206799480295, "grad_norm": 0.3662298321723938, "learning_rate": 4.859347932005197e-05, "loss": 0.3688, "step": 2078500 }, { "epoch": 14.068590298830662, "grad_norm": 0.3635557293891907, "learning_rate": 4.8593140970116935e-05, "loss": 0.3687, "step": 2079000 }, { "epoch": 14.07197379818103, "grad_norm": 0.34785518050193787, "learning_rate": 4.85928026201819e-05, "loss": 0.3696, "step": 2079500 }, { "epoch": 14.075357297531399, "grad_norm": 0.35233691334724426, "learning_rate": 4.8592464270246866e-05, "loss": 0.3688, "step": 2080000 }, { "epoch": 14.078740796881767, "grad_norm": 0.39892759919166565, "learning_rate": 4.859212592031183e-05, "loss": 0.3683, "step": 2080500 }, { "epoch": 14.082124296232134, "grad_norm": 0.37727105617523193, "learning_rate": 4.859178757037679e-05, "loss": 0.3684, "step": 2081000 }, { "epoch": 14.085507795582503, "grad_norm": 0.3854265511035919, "learning_rate": 4.859144922044175e-05, "loss": 0.3686, "step": 2081500 }, { "epoch": 14.088891294932871, "grad_norm": 0.3632075786590576, "learning_rate": 4.859111087050672e-05, "loss": 0.3696, "step": 2082000 }, { "epoch": 14.09227479428324, "grad_norm": 0.3490007221698761, "learning_rate": 4.8590772520571676e-05, "loss": 0.3694, "step": 2082500 }, { "epoch": 14.095658293633608, "grad_norm": 0.385567843914032, "learning_rate": 4.859043417063664e-05, "loss": 0.3691, "step": 2083000 }, { "epoch": 14.099041792983975, "grad_norm": 0.36982715129852295, "learning_rate": 4.85900958207016e-05, "loss": 0.3683, "step": 2083500 }, { "epoch": 14.102425292334344, "grad_norm": 0.35927850008010864, "learning_rate": 4.858975747076657e-05, "loss": 0.3689, "step": 2084000 }, { "epoch": 14.105808791684712, "grad_norm": 0.3497246205806732, "learning_rate": 4.858941912083153e-05, "loss": 0.3676, "step": 2084500 }, { "epoch": 14.10919229103508, "grad_norm": 0.3736872971057892, "learning_rate": 4.8589080770896494e-05, "loss": 0.3699, "step": 2085000 }, { "epoch": 14.112575790385447, "grad_norm": 0.3965182602405548, "learning_rate": 4.8588742420961456e-05, "loss": 0.3681, "step": 2085500 }, { "epoch": 14.115959289735816, "grad_norm": 0.34779226779937744, "learning_rate": 4.8588404071026425e-05, "loss": 0.3699, "step": 2086000 }, { "epoch": 14.119342789086184, "grad_norm": 0.3425229489803314, "learning_rate": 4.858806572109139e-05, "loss": 0.3683, "step": 2086500 }, { "epoch": 14.122726288436553, "grad_norm": 0.39361563324928284, "learning_rate": 4.858772737115635e-05, "loss": 0.3694, "step": 2087000 }, { "epoch": 14.126109787786922, "grad_norm": 0.39007148146629333, "learning_rate": 4.858738902122131e-05, "loss": 0.369, "step": 2087500 }, { "epoch": 14.129493287137288, "grad_norm": 0.3457988202571869, "learning_rate": 4.858705067128627e-05, "loss": 0.3685, "step": 2088000 }, { "epoch": 14.132876786487657, "grad_norm": 0.3679068982601166, "learning_rate": 4.8586712321351235e-05, "loss": 0.3685, "step": 2088500 }, { "epoch": 14.136260285838025, "grad_norm": 0.35563474893569946, "learning_rate": 4.85863739714162e-05, "loss": 0.3691, "step": 2089000 }, { "epoch": 14.139643785188394, "grad_norm": 0.3731387257575989, "learning_rate": 4.8586035621481166e-05, "loss": 0.3689, "step": 2089500 }, { "epoch": 14.14302728453876, "grad_norm": 0.39115670323371887, "learning_rate": 4.858569727154613e-05, "loss": 0.3672, "step": 2090000 }, { "epoch": 14.146410783889129, "grad_norm": 0.4126528203487396, "learning_rate": 4.858535892161109e-05, "loss": 0.3685, "step": 2090500 }, { "epoch": 14.149794283239498, "grad_norm": 0.3236331045627594, "learning_rate": 4.858502057167605e-05, "loss": 0.3679, "step": 2091000 }, { "epoch": 14.153177782589866, "grad_norm": 0.35798412561416626, "learning_rate": 4.8584682221741015e-05, "loss": 0.3687, "step": 2091500 }, { "epoch": 14.156561281940235, "grad_norm": 0.3941161334514618, "learning_rate": 4.858434387180598e-05, "loss": 0.369, "step": 2092000 }, { "epoch": 14.159944781290601, "grad_norm": 0.3767811059951782, "learning_rate": 4.858400552187094e-05, "loss": 0.369, "step": 2092500 }, { "epoch": 14.16332828064097, "grad_norm": 0.3738722801208496, "learning_rate": 4.85836671719359e-05, "loss": 0.3686, "step": 2093000 }, { "epoch": 14.166711779991338, "grad_norm": 0.4057631194591522, "learning_rate": 4.858332882200087e-05, "loss": 0.3702, "step": 2093500 }, { "epoch": 14.170095279341707, "grad_norm": 0.36650463938713074, "learning_rate": 4.858299047206583e-05, "loss": 0.3685, "step": 2094000 }, { "epoch": 14.173478778692074, "grad_norm": 0.3387785851955414, "learning_rate": 4.8582652122130794e-05, "loss": 0.3677, "step": 2094500 }, { "epoch": 14.176862278042442, "grad_norm": 0.3528936803340912, "learning_rate": 4.8582313772195756e-05, "loss": 0.3683, "step": 2095000 }, { "epoch": 14.18024577739281, "grad_norm": 0.3900843560695648, "learning_rate": 4.8581975422260725e-05, "loss": 0.3678, "step": 2095500 }, { "epoch": 14.18362927674318, "grad_norm": 0.3487549424171448, "learning_rate": 4.858163707232569e-05, "loss": 0.3692, "step": 2096000 }, { "epoch": 14.187012776093548, "grad_norm": 0.3960934281349182, "learning_rate": 4.858129872239065e-05, "loss": 0.3697, "step": 2096500 }, { "epoch": 14.190396275443915, "grad_norm": 0.352027028799057, "learning_rate": 4.858096037245561e-05, "loss": 0.3702, "step": 2097000 }, { "epoch": 14.193779774794283, "grad_norm": 0.3669641613960266, "learning_rate": 4.8580622022520574e-05, "loss": 0.3685, "step": 2097500 }, { "epoch": 14.197163274144652, "grad_norm": 0.3863503634929657, "learning_rate": 4.8580283672585536e-05, "loss": 0.3684, "step": 2098000 }, { "epoch": 14.20054677349502, "grad_norm": 0.3422660529613495, "learning_rate": 4.85799453226505e-05, "loss": 0.3679, "step": 2098500 }, { "epoch": 14.203930272845387, "grad_norm": 0.3772645890712738, "learning_rate": 4.857960697271546e-05, "loss": 0.3693, "step": 2099000 }, { "epoch": 14.207313772195755, "grad_norm": 0.3528118431568146, "learning_rate": 4.857926862278043e-05, "loss": 0.3682, "step": 2099500 }, { "epoch": 14.210697271546124, "grad_norm": 0.40165719389915466, "learning_rate": 4.857893027284539e-05, "loss": 0.3695, "step": 2100000 }, { "epoch": 14.214080770896492, "grad_norm": 0.38907134532928467, "learning_rate": 4.857859192291035e-05, "loss": 0.3677, "step": 2100500 }, { "epoch": 14.217464270246861, "grad_norm": 0.38055655360221863, "learning_rate": 4.8578253572975315e-05, "loss": 0.3682, "step": 2101000 }, { "epoch": 14.220847769597228, "grad_norm": 0.40473777055740356, "learning_rate": 4.857791522304028e-05, "loss": 0.369, "step": 2101500 }, { "epoch": 14.224231268947596, "grad_norm": 0.37386074662208557, "learning_rate": 4.857757687310524e-05, "loss": 0.3682, "step": 2102000 }, { "epoch": 14.227614768297965, "grad_norm": 0.3558364808559418, "learning_rate": 4.85772385231702e-05, "loss": 0.3685, "step": 2102500 }, { "epoch": 14.230998267648333, "grad_norm": 0.38514620065689087, "learning_rate": 4.857690017323517e-05, "loss": 0.3713, "step": 2103000 }, { "epoch": 14.2343817669987, "grad_norm": 0.3842502236366272, "learning_rate": 4.857656182330013e-05, "loss": 0.3699, "step": 2103500 }, { "epoch": 14.237765266349069, "grad_norm": 0.398985892534256, "learning_rate": 4.8576223473365095e-05, "loss": 0.3701, "step": 2104000 }, { "epoch": 14.241148765699437, "grad_norm": 0.3588099181652069, "learning_rate": 4.857588512343006e-05, "loss": 0.366, "step": 2104500 }, { "epoch": 14.244532265049806, "grad_norm": 0.3762696087360382, "learning_rate": 4.8575546773495026e-05, "loss": 0.3698, "step": 2105000 }, { "epoch": 14.247915764400172, "grad_norm": 0.34633252024650574, "learning_rate": 4.857520842355999e-05, "loss": 0.3694, "step": 2105500 }, { "epoch": 14.251299263750541, "grad_norm": 0.33246076107025146, "learning_rate": 4.857487007362495e-05, "loss": 0.3684, "step": 2106000 }, { "epoch": 14.25468276310091, "grad_norm": 0.37106186151504517, "learning_rate": 4.857453172368991e-05, "loss": 0.3678, "step": 2106500 }, { "epoch": 14.258066262451278, "grad_norm": 0.342178076505661, "learning_rate": 4.8574193373754874e-05, "loss": 0.3701, "step": 2107000 }, { "epoch": 14.261449761801646, "grad_norm": 0.3419799506664276, "learning_rate": 4.8573855023819836e-05, "loss": 0.3696, "step": 2107500 }, { "epoch": 14.264833261152013, "grad_norm": 0.3432062566280365, "learning_rate": 4.85735166738848e-05, "loss": 0.3706, "step": 2108000 }, { "epoch": 14.268216760502382, "grad_norm": 0.3578495383262634, "learning_rate": 4.857317832394976e-05, "loss": 0.3689, "step": 2108500 }, { "epoch": 14.27160025985275, "grad_norm": 0.34382176399230957, "learning_rate": 4.857283997401473e-05, "loss": 0.3688, "step": 2109000 }, { "epoch": 14.274983759203119, "grad_norm": 0.33303534984588623, "learning_rate": 4.857250162407969e-05, "loss": 0.3691, "step": 2109500 }, { "epoch": 14.278367258553486, "grad_norm": 0.3536013066768646, "learning_rate": 4.8572163274144654e-05, "loss": 0.3709, "step": 2110000 }, { "epoch": 14.281750757903854, "grad_norm": 0.3310689926147461, "learning_rate": 4.8571824924209616e-05, "loss": 0.3704, "step": 2110500 }, { "epoch": 14.285134257254223, "grad_norm": 0.4020843505859375, "learning_rate": 4.8571486574274585e-05, "loss": 0.3699, "step": 2111000 }, { "epoch": 14.288517756604591, "grad_norm": 0.3867824375629425, "learning_rate": 4.857114822433954e-05, "loss": 0.3683, "step": 2111500 }, { "epoch": 14.29190125595496, "grad_norm": 0.3328765332698822, "learning_rate": 4.85708098744045e-05, "loss": 0.3685, "step": 2112000 }, { "epoch": 14.295284755305326, "grad_norm": 0.33964329957962036, "learning_rate": 4.857047152446947e-05, "loss": 0.3702, "step": 2112500 }, { "epoch": 14.298668254655695, "grad_norm": 0.4224208891391754, "learning_rate": 4.857013317453443e-05, "loss": 0.3694, "step": 2113000 }, { "epoch": 14.302051754006063, "grad_norm": 0.4299384653568268, "learning_rate": 4.8569794824599396e-05, "loss": 0.368, "step": 2113500 }, { "epoch": 14.305435253356432, "grad_norm": 0.36000317335128784, "learning_rate": 4.856945647466436e-05, "loss": 0.3698, "step": 2114000 }, { "epoch": 14.308818752706799, "grad_norm": 0.3650577664375305, "learning_rate": 4.8569118124729327e-05, "loss": 0.3683, "step": 2114500 }, { "epoch": 14.312202252057167, "grad_norm": 0.38082313537597656, "learning_rate": 4.856877977479429e-05, "loss": 0.3687, "step": 2115000 }, { "epoch": 14.315585751407536, "grad_norm": 0.3912234604358673, "learning_rate": 4.856844142485925e-05, "loss": 0.3706, "step": 2115500 }, { "epoch": 14.318969250757904, "grad_norm": 0.36391451954841614, "learning_rate": 4.8568103074924206e-05, "loss": 0.3691, "step": 2116000 }, { "epoch": 14.322352750108273, "grad_norm": 0.36006873846054077, "learning_rate": 4.8567764724989175e-05, "loss": 0.3686, "step": 2116500 }, { "epoch": 14.32573624945864, "grad_norm": 0.3425831198692322, "learning_rate": 4.856742637505414e-05, "loss": 0.3687, "step": 2117000 }, { "epoch": 14.329119748809008, "grad_norm": 0.37592950463294983, "learning_rate": 4.85670880251191e-05, "loss": 0.3695, "step": 2117500 }, { "epoch": 14.332503248159377, "grad_norm": 0.32281264662742615, "learning_rate": 4.856674967518406e-05, "loss": 0.3694, "step": 2118000 }, { "epoch": 14.335886747509745, "grad_norm": 0.34113267064094543, "learning_rate": 4.856641132524903e-05, "loss": 0.3694, "step": 2118500 }, { "epoch": 14.339270246860112, "grad_norm": 0.33674532175064087, "learning_rate": 4.856607297531399e-05, "loss": 0.3701, "step": 2119000 }, { "epoch": 14.34265374621048, "grad_norm": 0.35087233781814575, "learning_rate": 4.8565734625378955e-05, "loss": 0.3689, "step": 2119500 }, { "epoch": 14.346037245560849, "grad_norm": 0.3552713096141815, "learning_rate": 4.856539627544392e-05, "loss": 0.3694, "step": 2120000 }, { "epoch": 14.349420744911217, "grad_norm": 0.3678264617919922, "learning_rate": 4.8565057925508886e-05, "loss": 0.3702, "step": 2120500 }, { "epoch": 14.352804244261584, "grad_norm": 0.35346418619155884, "learning_rate": 4.856471957557384e-05, "loss": 0.3673, "step": 2121000 }, { "epoch": 14.356187743611953, "grad_norm": 0.38516688346862793, "learning_rate": 4.85643812256388e-05, "loss": 0.369, "step": 2121500 }, { "epoch": 14.359571242962321, "grad_norm": 0.3606177568435669, "learning_rate": 4.856404287570377e-05, "loss": 0.3725, "step": 2122000 }, { "epoch": 14.36295474231269, "grad_norm": 0.3833155333995819, "learning_rate": 4.8563704525768734e-05, "loss": 0.3685, "step": 2122500 }, { "epoch": 14.366338241663058, "grad_norm": 0.3655698001384735, "learning_rate": 4.8563366175833696e-05, "loss": 0.3677, "step": 2123000 }, { "epoch": 14.369721741013425, "grad_norm": 0.4088969826698303, "learning_rate": 4.856302782589866e-05, "loss": 0.3699, "step": 2123500 }, { "epoch": 14.373105240363794, "grad_norm": 0.34506741166114807, "learning_rate": 4.856268947596363e-05, "loss": 0.3679, "step": 2124000 }, { "epoch": 14.376488739714162, "grad_norm": 0.32661932706832886, "learning_rate": 4.856235112602859e-05, "loss": 0.3687, "step": 2124500 }, { "epoch": 14.37987223906453, "grad_norm": 0.367184042930603, "learning_rate": 4.856201277609355e-05, "loss": 0.3697, "step": 2125000 }, { "epoch": 14.3832557384149, "grad_norm": 0.39475658535957336, "learning_rate": 4.856167442615851e-05, "loss": 0.369, "step": 2125500 }, { "epoch": 14.386639237765266, "grad_norm": 0.38947799801826477, "learning_rate": 4.8561336076223476e-05, "loss": 0.3698, "step": 2126000 }, { "epoch": 14.390022737115634, "grad_norm": 0.403063029050827, "learning_rate": 4.856099772628844e-05, "loss": 0.37, "step": 2126500 }, { "epoch": 14.393406236466003, "grad_norm": 0.39779502153396606, "learning_rate": 4.85606593763534e-05, "loss": 0.3689, "step": 2127000 }, { "epoch": 14.396789735816371, "grad_norm": 0.37461674213409424, "learning_rate": 4.856032102641836e-05, "loss": 0.3718, "step": 2127500 }, { "epoch": 14.400173235166738, "grad_norm": 0.38416680693626404, "learning_rate": 4.855998267648333e-05, "loss": 0.3691, "step": 2128000 }, { "epoch": 14.403556734517107, "grad_norm": 0.3756099343299866, "learning_rate": 4.855964432654829e-05, "loss": 0.3695, "step": 2128500 }, { "epoch": 14.406940233867475, "grad_norm": 0.3770340085029602, "learning_rate": 4.8559305976613255e-05, "loss": 0.3689, "step": 2129000 }, { "epoch": 14.410323733217844, "grad_norm": 0.3530448079109192, "learning_rate": 4.855896762667822e-05, "loss": 0.3697, "step": 2129500 }, { "epoch": 14.41370723256821, "grad_norm": 0.37978580594062805, "learning_rate": 4.8558629276743186e-05, "loss": 0.3691, "step": 2130000 }, { "epoch": 14.417090731918579, "grad_norm": 0.3957188129425049, "learning_rate": 4.855829092680814e-05, "loss": 0.3682, "step": 2130500 }, { "epoch": 14.420474231268948, "grad_norm": 0.3946071267127991, "learning_rate": 4.8557952576873104e-05, "loss": 0.3695, "step": 2131000 }, { "epoch": 14.423857730619316, "grad_norm": 0.3873770236968994, "learning_rate": 4.855761422693807e-05, "loss": 0.3687, "step": 2131500 }, { "epoch": 14.427241229969685, "grad_norm": 0.3992931544780731, "learning_rate": 4.8557275877003035e-05, "loss": 0.3689, "step": 2132000 }, { "epoch": 14.430624729320051, "grad_norm": 0.37176117300987244, "learning_rate": 4.8556937527068e-05, "loss": 0.3694, "step": 2132500 }, { "epoch": 14.43400822867042, "grad_norm": 0.3461103141307831, "learning_rate": 4.855659917713296e-05, "loss": 0.3699, "step": 2133000 }, { "epoch": 14.437391728020788, "grad_norm": 0.34357118606567383, "learning_rate": 4.855626082719793e-05, "loss": 0.3695, "step": 2133500 }, { "epoch": 14.440775227371157, "grad_norm": 0.3594299256801605, "learning_rate": 4.855592247726289e-05, "loss": 0.3684, "step": 2134000 }, { "epoch": 14.444158726721524, "grad_norm": 0.3565590977668762, "learning_rate": 4.855558412732785e-05, "loss": 0.37, "step": 2134500 }, { "epoch": 14.447542226071892, "grad_norm": 0.3601624667644501, "learning_rate": 4.855524577739281e-05, "loss": 0.3694, "step": 2135000 }, { "epoch": 14.45092572542226, "grad_norm": 0.36201897263526917, "learning_rate": 4.8554907427457776e-05, "loss": 0.3694, "step": 2135500 }, { "epoch": 14.45430922477263, "grad_norm": 0.3696894645690918, "learning_rate": 4.855456907752274e-05, "loss": 0.3701, "step": 2136000 }, { "epoch": 14.457692724122998, "grad_norm": 0.4050441086292267, "learning_rate": 4.85542307275877e-05, "loss": 0.3707, "step": 2136500 }, { "epoch": 14.461076223473365, "grad_norm": 0.3355098068714142, "learning_rate": 4.855389237765266e-05, "loss": 0.3688, "step": 2137000 }, { "epoch": 14.464459722823733, "grad_norm": 0.40232527256011963, "learning_rate": 4.855355402771763e-05, "loss": 0.3694, "step": 2137500 }, { "epoch": 14.467843222174102, "grad_norm": 0.368574321269989, "learning_rate": 4.8553215677782594e-05, "loss": 0.3697, "step": 2138000 }, { "epoch": 14.47122672152447, "grad_norm": 0.3811182677745819, "learning_rate": 4.8552877327847556e-05, "loss": 0.3685, "step": 2138500 }, { "epoch": 14.474610220874837, "grad_norm": 0.39627721905708313, "learning_rate": 4.855253897791252e-05, "loss": 0.3691, "step": 2139000 }, { "epoch": 14.477993720225205, "grad_norm": 0.34768345952033997, "learning_rate": 4.855220062797749e-05, "loss": 0.3691, "step": 2139500 }, { "epoch": 14.481377219575574, "grad_norm": 0.38759613037109375, "learning_rate": 4.855186227804244e-05, "loss": 0.37, "step": 2140000 }, { "epoch": 14.484760718925942, "grad_norm": 0.38205355405807495, "learning_rate": 4.8551523928107404e-05, "loss": 0.3697, "step": 2140500 }, { "epoch": 14.488144218276311, "grad_norm": 0.3666626513004303, "learning_rate": 4.855118557817237e-05, "loss": 0.3668, "step": 2141000 }, { "epoch": 14.491527717626678, "grad_norm": 0.3698650598526001, "learning_rate": 4.8550847228237335e-05, "loss": 0.3702, "step": 2141500 }, { "epoch": 14.494911216977046, "grad_norm": 0.3574991524219513, "learning_rate": 4.85505088783023e-05, "loss": 0.3703, "step": 2142000 }, { "epoch": 14.498294716327415, "grad_norm": 0.39458853006362915, "learning_rate": 4.855017052836726e-05, "loss": 0.3687, "step": 2142500 }, { "epoch": 14.501678215677783, "grad_norm": 0.3711014986038208, "learning_rate": 4.854983217843223e-05, "loss": 0.3695, "step": 2143000 }, { "epoch": 14.50506171502815, "grad_norm": 0.3292671740055084, "learning_rate": 4.854949382849719e-05, "loss": 0.3676, "step": 2143500 }, { "epoch": 14.508445214378519, "grad_norm": 0.35296839475631714, "learning_rate": 4.854915547856215e-05, "loss": 0.368, "step": 2144000 }, { "epoch": 14.511828713728887, "grad_norm": 0.37003955245018005, "learning_rate": 4.854881712862711e-05, "loss": 0.3692, "step": 2144500 }, { "epoch": 14.515212213079256, "grad_norm": 0.4041885733604431, "learning_rate": 4.854847877869208e-05, "loss": 0.3697, "step": 2145000 }, { "epoch": 14.518595712429622, "grad_norm": 0.3833051919937134, "learning_rate": 4.854814042875704e-05, "loss": 0.3674, "step": 2145500 }, { "epoch": 14.52197921177999, "grad_norm": 0.3829428553581238, "learning_rate": 4.8547802078822e-05, "loss": 0.3701, "step": 2146000 }, { "epoch": 14.52536271113036, "grad_norm": 0.3993237018585205, "learning_rate": 4.854746372888696e-05, "loss": 0.3693, "step": 2146500 }, { "epoch": 14.528746210480728, "grad_norm": 0.36786895990371704, "learning_rate": 4.854712537895193e-05, "loss": 0.3695, "step": 2147000 }, { "epoch": 14.532129709831096, "grad_norm": 0.34661757946014404, "learning_rate": 4.8546787029016894e-05, "loss": 0.3703, "step": 2147500 }, { "epoch": 14.535513209181463, "grad_norm": 0.40853285789489746, "learning_rate": 4.8546448679081856e-05, "loss": 0.3699, "step": 2148000 }, { "epoch": 14.538896708531832, "grad_norm": 0.3633783757686615, "learning_rate": 4.854611032914682e-05, "loss": 0.3705, "step": 2148500 }, { "epoch": 14.5422802078822, "grad_norm": 0.3499980866909027, "learning_rate": 4.854577197921179e-05, "loss": 0.3685, "step": 2149000 }, { "epoch": 14.545663707232569, "grad_norm": 0.3772374987602234, "learning_rate": 4.854543362927674e-05, "loss": 0.3696, "step": 2149500 }, { "epoch": 14.549047206582937, "grad_norm": 0.3617144227027893, "learning_rate": 4.8545095279341705e-05, "loss": 0.37, "step": 2150000 }, { "epoch": 14.552430705933304, "grad_norm": 0.3383565843105316, "learning_rate": 4.8544756929406674e-05, "loss": 0.3699, "step": 2150500 }, { "epoch": 14.555814205283673, "grad_norm": 0.3805508017539978, "learning_rate": 4.8544418579471636e-05, "loss": 0.3696, "step": 2151000 }, { "epoch": 14.559197704634041, "grad_norm": 0.39522701501846313, "learning_rate": 4.85440802295366e-05, "loss": 0.3694, "step": 2151500 }, { "epoch": 14.56258120398441, "grad_norm": 0.39664092659950256, "learning_rate": 4.854374187960156e-05, "loss": 0.3666, "step": 2152000 }, { "epoch": 14.565964703334776, "grad_norm": 0.3451291620731354, "learning_rate": 4.854340352966653e-05, "loss": 0.37, "step": 2152500 }, { "epoch": 14.569348202685145, "grad_norm": 0.34200799465179443, "learning_rate": 4.854306517973149e-05, "loss": 0.3686, "step": 2153000 }, { "epoch": 14.572731702035513, "grad_norm": 0.3482162356376648, "learning_rate": 4.854272682979645e-05, "loss": 0.3715, "step": 2153500 }, { "epoch": 14.576115201385882, "grad_norm": 0.33394691348075867, "learning_rate": 4.854238847986141e-05, "loss": 0.3691, "step": 2154000 }, { "epoch": 14.579498700736249, "grad_norm": 0.37948983907699585, "learning_rate": 4.854205012992638e-05, "loss": 0.3678, "step": 2154500 }, { "epoch": 14.582882200086617, "grad_norm": 0.3694993257522583, "learning_rate": 4.854171177999134e-05, "loss": 0.3688, "step": 2155000 }, { "epoch": 14.586265699436986, "grad_norm": 0.3592107594013214, "learning_rate": 4.85413734300563e-05, "loss": 0.3685, "step": 2155500 }, { "epoch": 14.589649198787354, "grad_norm": 0.3389520049095154, "learning_rate": 4.8541035080121264e-05, "loss": 0.3698, "step": 2156000 }, { "epoch": 14.593032698137723, "grad_norm": 0.3517674505710602, "learning_rate": 4.854069673018623e-05, "loss": 0.3683, "step": 2156500 }, { "epoch": 14.59641619748809, "grad_norm": 0.32301872968673706, "learning_rate": 4.8540358380251195e-05, "loss": 0.3665, "step": 2157000 }, { "epoch": 14.599799696838458, "grad_norm": 0.3524543046951294, "learning_rate": 4.854002003031616e-05, "loss": 0.3715, "step": 2157500 }, { "epoch": 14.603183196188827, "grad_norm": 0.34795093536376953, "learning_rate": 4.853968168038112e-05, "loss": 0.3682, "step": 2158000 }, { "epoch": 14.606566695539195, "grad_norm": 0.3424944281578064, "learning_rate": 4.853934333044609e-05, "loss": 0.3695, "step": 2158500 }, { "epoch": 14.609950194889562, "grad_norm": 0.36514824628829956, "learning_rate": 4.8539004980511043e-05, "loss": 0.3689, "step": 2159000 }, { "epoch": 14.61333369423993, "grad_norm": 0.3638770878314972, "learning_rate": 4.8538666630576006e-05, "loss": 0.3693, "step": 2159500 }, { "epoch": 14.616717193590299, "grad_norm": 0.3456403911113739, "learning_rate": 4.8538328280640974e-05, "loss": 0.3702, "step": 2160000 }, { "epoch": 14.620100692940667, "grad_norm": 0.3254307806491852, "learning_rate": 4.8537989930705937e-05, "loss": 0.3693, "step": 2160500 }, { "epoch": 14.623484192291036, "grad_norm": 0.3652152121067047, "learning_rate": 4.85376515807709e-05, "loss": 0.3688, "step": 2161000 }, { "epoch": 14.626867691641403, "grad_norm": 0.36455264687538147, "learning_rate": 4.853731323083586e-05, "loss": 0.369, "step": 2161500 }, { "epoch": 14.630251190991771, "grad_norm": 0.37666502594947815, "learning_rate": 4.853697488090082e-05, "loss": 0.3691, "step": 2162000 }, { "epoch": 14.63363469034214, "grad_norm": 0.355785071849823, "learning_rate": 4.853663653096579e-05, "loss": 0.3701, "step": 2162500 }, { "epoch": 14.637018189692508, "grad_norm": 0.3691225051879883, "learning_rate": 4.8536298181030754e-05, "loss": 0.3695, "step": 2163000 }, { "epoch": 14.640401689042875, "grad_norm": 0.3322974145412445, "learning_rate": 4.853595983109571e-05, "loss": 0.3691, "step": 2163500 }, { "epoch": 14.643785188393244, "grad_norm": 0.3645572066307068, "learning_rate": 4.853562148116068e-05, "loss": 0.3697, "step": 2164000 }, { "epoch": 14.647168687743612, "grad_norm": 0.3978794515132904, "learning_rate": 4.853528313122564e-05, "loss": 0.3689, "step": 2164500 }, { "epoch": 14.65055218709398, "grad_norm": 0.33656492829322815, "learning_rate": 4.85349447812906e-05, "loss": 0.3694, "step": 2165000 }, { "epoch": 14.653935686444349, "grad_norm": 0.326246052980423, "learning_rate": 4.8534606431355565e-05, "loss": 0.3688, "step": 2165500 }, { "epoch": 14.657319185794716, "grad_norm": 0.3709580898284912, "learning_rate": 4.8534268081420533e-05, "loss": 0.369, "step": 2166000 }, { "epoch": 14.660702685145084, "grad_norm": 0.39139363169670105, "learning_rate": 4.8533929731485496e-05, "loss": 0.3675, "step": 2166500 }, { "epoch": 14.664086184495453, "grad_norm": 0.3772076964378357, "learning_rate": 4.853359138155046e-05, "loss": 0.3682, "step": 2167000 }, { "epoch": 14.667469683845821, "grad_norm": 0.33439138531684875, "learning_rate": 4.853325303161542e-05, "loss": 0.3686, "step": 2167500 }, { "epoch": 14.670853183196188, "grad_norm": 0.403716504573822, "learning_rate": 4.853291468168039e-05, "loss": 0.3683, "step": 2168000 }, { "epoch": 14.674236682546557, "grad_norm": 0.4087928533554077, "learning_rate": 4.8532576331745344e-05, "loss": 0.3701, "step": 2168500 }, { "epoch": 14.677620181896925, "grad_norm": 0.3306918442249298, "learning_rate": 4.8532237981810306e-05, "loss": 0.3677, "step": 2169000 }, { "epoch": 14.681003681247294, "grad_norm": 0.35591837763786316, "learning_rate": 4.853189963187527e-05, "loss": 0.3686, "step": 2169500 }, { "epoch": 14.68438718059766, "grad_norm": 0.35075968503952026, "learning_rate": 4.853156128194024e-05, "loss": 0.3688, "step": 2170000 }, { "epoch": 14.687770679948029, "grad_norm": 0.35520946979522705, "learning_rate": 4.85312229320052e-05, "loss": 0.3693, "step": 2170500 }, { "epoch": 14.691154179298398, "grad_norm": 0.3742755353450775, "learning_rate": 4.853088458207016e-05, "loss": 0.3685, "step": 2171000 }, { "epoch": 14.694537678648766, "grad_norm": 0.3788643181324005, "learning_rate": 4.8530546232135124e-05, "loss": 0.3689, "step": 2171500 }, { "epoch": 14.697921177999135, "grad_norm": 0.3427134156227112, "learning_rate": 4.853020788220009e-05, "loss": 0.3694, "step": 2172000 }, { "epoch": 14.701304677349501, "grad_norm": 0.38204559683799744, "learning_rate": 4.8529869532265055e-05, "loss": 0.3698, "step": 2172500 }, { "epoch": 14.70468817669987, "grad_norm": 0.38196712732315063, "learning_rate": 4.852953118233002e-05, "loss": 0.3705, "step": 2173000 }, { "epoch": 14.708071676050238, "grad_norm": 0.3562172055244446, "learning_rate": 4.852919283239498e-05, "loss": 0.3688, "step": 2173500 }, { "epoch": 14.711455175400607, "grad_norm": 0.37580806016921997, "learning_rate": 4.852885448245994e-05, "loss": 0.3687, "step": 2174000 }, { "epoch": 14.714838674750974, "grad_norm": 0.30255070328712463, "learning_rate": 4.85285161325249e-05, "loss": 0.3678, "step": 2174500 }, { "epoch": 14.718222174101342, "grad_norm": 0.37820175290107727, "learning_rate": 4.8528177782589865e-05, "loss": 0.3677, "step": 2175000 }, { "epoch": 14.72160567345171, "grad_norm": 0.3466613292694092, "learning_rate": 4.8527839432654834e-05, "loss": 0.3693, "step": 2175500 }, { "epoch": 14.72498917280208, "grad_norm": 0.377681702375412, "learning_rate": 4.8527501082719796e-05, "loss": 0.3687, "step": 2176000 }, { "epoch": 14.728372672152448, "grad_norm": 0.3577273488044739, "learning_rate": 4.852716273278476e-05, "loss": 0.3701, "step": 2176500 }, { "epoch": 14.731756171502814, "grad_norm": 0.3537616729736328, "learning_rate": 4.852682438284972e-05, "loss": 0.3699, "step": 2177000 }, { "epoch": 14.735139670853183, "grad_norm": 0.3725182116031647, "learning_rate": 4.852648603291469e-05, "loss": 0.3699, "step": 2177500 }, { "epoch": 14.738523170203552, "grad_norm": 0.3579446077346802, "learning_rate": 4.8526147682979645e-05, "loss": 0.368, "step": 2178000 }, { "epoch": 14.74190666955392, "grad_norm": 0.35082072019577026, "learning_rate": 4.852580933304461e-05, "loss": 0.3694, "step": 2178500 }, { "epoch": 14.745290168904287, "grad_norm": 0.38058724999427795, "learning_rate": 4.852547098310957e-05, "loss": 0.3704, "step": 2179000 }, { "epoch": 14.748673668254655, "grad_norm": 0.35093605518341064, "learning_rate": 4.852513263317454e-05, "loss": 0.3703, "step": 2179500 }, { "epoch": 14.752057167605024, "grad_norm": 0.37360599637031555, "learning_rate": 4.85247942832395e-05, "loss": 0.3681, "step": 2180000 }, { "epoch": 14.755440666955392, "grad_norm": 0.382112592458725, "learning_rate": 4.852445593330446e-05, "loss": 0.3702, "step": 2180500 }, { "epoch": 14.75882416630576, "grad_norm": 0.3928546905517578, "learning_rate": 4.8524117583369424e-05, "loss": 0.3705, "step": 2181000 }, { "epoch": 14.762207665656128, "grad_norm": 0.3380775451660156, "learning_rate": 4.852377923343439e-05, "loss": 0.3692, "step": 2181500 }, { "epoch": 14.765591165006496, "grad_norm": 0.3672342002391815, "learning_rate": 4.8523440883499355e-05, "loss": 0.3718, "step": 2182000 }, { "epoch": 14.768974664356865, "grad_norm": 0.3561403751373291, "learning_rate": 4.852310253356432e-05, "loss": 0.3681, "step": 2182500 }, { "epoch": 14.772358163707233, "grad_norm": 0.36243200302124023, "learning_rate": 4.852276418362928e-05, "loss": 0.369, "step": 2183000 }, { "epoch": 14.7757416630576, "grad_norm": 0.39097335934638977, "learning_rate": 4.852242583369424e-05, "loss": 0.3683, "step": 2183500 }, { "epoch": 14.779125162407968, "grad_norm": 0.36693957448005676, "learning_rate": 4.8522087483759204e-05, "loss": 0.3694, "step": 2184000 }, { "epoch": 14.782508661758337, "grad_norm": 0.317962646484375, "learning_rate": 4.8521749133824166e-05, "loss": 0.3699, "step": 2184500 }, { "epoch": 14.785892161108706, "grad_norm": 0.3652923107147217, "learning_rate": 4.8521410783889135e-05, "loss": 0.3696, "step": 2185000 }, { "epoch": 14.789275660459072, "grad_norm": 0.3356926739215851, "learning_rate": 4.85210724339541e-05, "loss": 0.3707, "step": 2185500 }, { "epoch": 14.79265915980944, "grad_norm": 0.3495100438594818, "learning_rate": 4.852073408401906e-05, "loss": 0.3709, "step": 2186000 }, { "epoch": 14.79604265915981, "grad_norm": 0.3659546673297882, "learning_rate": 4.852039573408402e-05, "loss": 0.3698, "step": 2186500 }, { "epoch": 14.799426158510178, "grad_norm": 0.40927523374557495, "learning_rate": 4.852005738414899e-05, "loss": 0.3686, "step": 2187000 }, { "epoch": 14.802809657860546, "grad_norm": 0.3406883180141449, "learning_rate": 4.8519719034213945e-05, "loss": 0.3694, "step": 2187500 }, { "epoch": 14.806193157210913, "grad_norm": 0.3734167516231537, "learning_rate": 4.851938068427891e-05, "loss": 0.3691, "step": 2188000 }, { "epoch": 14.809576656561282, "grad_norm": 0.38106903433799744, "learning_rate": 4.851904233434387e-05, "loss": 0.3685, "step": 2188500 }, { "epoch": 14.81296015591165, "grad_norm": 0.3433600068092346, "learning_rate": 4.851870398440884e-05, "loss": 0.3691, "step": 2189000 }, { "epoch": 14.816343655262019, "grad_norm": 0.379891961812973, "learning_rate": 4.85183656344738e-05, "loss": 0.3676, "step": 2189500 }, { "epoch": 14.819727154612387, "grad_norm": 0.3658895790576935, "learning_rate": 4.851802728453876e-05, "loss": 0.3674, "step": 2190000 }, { "epoch": 14.823110653962754, "grad_norm": 0.3627364933490753, "learning_rate": 4.8517688934603725e-05, "loss": 0.3703, "step": 2190500 }, { "epoch": 14.826494153313122, "grad_norm": 0.3883538842201233, "learning_rate": 4.8517350584668694e-05, "loss": 0.3684, "step": 2191000 }, { "epoch": 14.829877652663491, "grad_norm": 0.3807075619697571, "learning_rate": 4.8517012234733656e-05, "loss": 0.37, "step": 2191500 }, { "epoch": 14.83326115201386, "grad_norm": 0.3800852298736572, "learning_rate": 4.851667388479862e-05, "loss": 0.3701, "step": 2192000 }, { "epoch": 14.836644651364226, "grad_norm": 0.365222305059433, "learning_rate": 4.851633553486358e-05, "loss": 0.3688, "step": 2192500 }, { "epoch": 14.840028150714595, "grad_norm": 0.3683980405330658, "learning_rate": 4.851599718492854e-05, "loss": 0.3695, "step": 2193000 }, { "epoch": 14.843411650064963, "grad_norm": 0.3718588054180145, "learning_rate": 4.8515658834993504e-05, "loss": 0.3703, "step": 2193500 }, { "epoch": 14.846795149415332, "grad_norm": 0.36069953441619873, "learning_rate": 4.8515320485058466e-05, "loss": 0.3709, "step": 2194000 }, { "epoch": 14.850178648765699, "grad_norm": 0.35505011677742004, "learning_rate": 4.8514982135123435e-05, "loss": 0.3698, "step": 2194500 }, { "epoch": 14.853562148116067, "grad_norm": 0.37225234508514404, "learning_rate": 4.85146437851884e-05, "loss": 0.3695, "step": 2195000 }, { "epoch": 14.856945647466436, "grad_norm": 0.43268582224845886, "learning_rate": 4.851430543525336e-05, "loss": 0.3705, "step": 2195500 }, { "epoch": 14.860329146816804, "grad_norm": 0.33920058608055115, "learning_rate": 4.851396708531832e-05, "loss": 0.369, "step": 2196000 }, { "epoch": 14.863712646167173, "grad_norm": 0.3335217833518982, "learning_rate": 4.851362873538329e-05, "loss": 0.3711, "step": 2196500 }, { "epoch": 14.86709614551754, "grad_norm": 0.3459427058696747, "learning_rate": 4.8513290385448246e-05, "loss": 0.371, "step": 2197000 }, { "epoch": 14.870479644867908, "grad_norm": 0.3620784878730774, "learning_rate": 4.851295203551321e-05, "loss": 0.3695, "step": 2197500 }, { "epoch": 14.873863144218276, "grad_norm": 0.3312499523162842, "learning_rate": 4.851261368557817e-05, "loss": 0.3693, "step": 2198000 }, { "epoch": 14.877246643568645, "grad_norm": 0.3384900689125061, "learning_rate": 4.851227533564314e-05, "loss": 0.3699, "step": 2198500 }, { "epoch": 14.880630142919012, "grad_norm": 0.35046958923339844, "learning_rate": 4.85119369857081e-05, "loss": 0.3683, "step": 2199000 }, { "epoch": 14.88401364226938, "grad_norm": 0.39812368154525757, "learning_rate": 4.851159863577306e-05, "loss": 0.3698, "step": 2199500 }, { "epoch": 14.887397141619749, "grad_norm": 0.40305712819099426, "learning_rate": 4.8511260285838025e-05, "loss": 0.3705, "step": 2200000 }, { "epoch": 14.890780640970117, "grad_norm": 0.341285765171051, "learning_rate": 4.8510921935902994e-05, "loss": 0.3688, "step": 2200500 }, { "epoch": 14.894164140320486, "grad_norm": 0.39060279726982117, "learning_rate": 4.8510583585967956e-05, "loss": 0.3692, "step": 2201000 }, { "epoch": 14.897547639670853, "grad_norm": 0.3980751633644104, "learning_rate": 4.851024523603292e-05, "loss": 0.3688, "step": 2201500 }, { "epoch": 14.900931139021221, "grad_norm": 0.30645591020584106, "learning_rate": 4.850990688609788e-05, "loss": 0.3682, "step": 2202000 }, { "epoch": 14.90431463837159, "grad_norm": 0.4017775356769562, "learning_rate": 4.850956853616284e-05, "loss": 0.3696, "step": 2202500 }, { "epoch": 14.907698137721958, "grad_norm": 0.35003912448883057, "learning_rate": 4.8509230186227805e-05, "loss": 0.3688, "step": 2203000 }, { "epoch": 14.911081637072325, "grad_norm": 0.34675323963165283, "learning_rate": 4.850889183629277e-05, "loss": 0.3691, "step": 2203500 }, { "epoch": 14.914465136422693, "grad_norm": 0.42045390605926514, "learning_rate": 4.8508553486357736e-05, "loss": 0.3698, "step": 2204000 }, { "epoch": 14.917848635773062, "grad_norm": 0.37545245885849, "learning_rate": 4.85082151364227e-05, "loss": 0.3705, "step": 2204500 }, { "epoch": 14.92123213512343, "grad_norm": 0.3739573657512665, "learning_rate": 4.850787678648766e-05, "loss": 0.3689, "step": 2205000 }, { "epoch": 14.924615634473799, "grad_norm": 0.35323527455329895, "learning_rate": 4.850753843655262e-05, "loss": 0.3689, "step": 2205500 }, { "epoch": 14.927999133824166, "grad_norm": 0.362337589263916, "learning_rate": 4.850720008661759e-05, "loss": 0.3683, "step": 2206000 }, { "epoch": 14.931382633174534, "grad_norm": 0.3333463668823242, "learning_rate": 4.8506861736682547e-05, "loss": 0.3698, "step": 2206500 }, { "epoch": 14.934766132524903, "grad_norm": 0.36068710684776306, "learning_rate": 4.850652338674751e-05, "loss": 0.369, "step": 2207000 }, { "epoch": 14.938149631875271, "grad_norm": 0.34819650650024414, "learning_rate": 4.850618503681247e-05, "loss": 0.3692, "step": 2207500 }, { "epoch": 14.941533131225638, "grad_norm": 0.3557802438735962, "learning_rate": 4.850584668687744e-05, "loss": 0.3693, "step": 2208000 }, { "epoch": 14.944916630576007, "grad_norm": 0.40220189094543457, "learning_rate": 4.85055083369424e-05, "loss": 0.3678, "step": 2208500 }, { "epoch": 14.948300129926375, "grad_norm": 0.3456258773803711, "learning_rate": 4.8505169987007364e-05, "loss": 0.3718, "step": 2209000 }, { "epoch": 14.951683629276744, "grad_norm": 0.36534014344215393, "learning_rate": 4.8504831637072326e-05, "loss": 0.3703, "step": 2209500 }, { "epoch": 14.95506712862711, "grad_norm": 0.3737540543079376, "learning_rate": 4.8504493287137295e-05, "loss": 0.3694, "step": 2210000 }, { "epoch": 14.958450627977479, "grad_norm": 0.3467683494091034, "learning_rate": 4.850415493720226e-05, "loss": 0.3675, "step": 2210500 }, { "epoch": 14.961834127327847, "grad_norm": 0.38319334387779236, "learning_rate": 4.850381658726722e-05, "loss": 0.3689, "step": 2211000 }, { "epoch": 14.965217626678216, "grad_norm": 0.39453256130218506, "learning_rate": 4.850347823733218e-05, "loss": 0.3698, "step": 2211500 }, { "epoch": 14.968601126028585, "grad_norm": 0.3527160882949829, "learning_rate": 4.8503139887397143e-05, "loss": 0.3685, "step": 2212000 }, { "epoch": 14.971984625378951, "grad_norm": 0.3644430935382843, "learning_rate": 4.8502801537462106e-05, "loss": 0.3706, "step": 2212500 }, { "epoch": 14.97536812472932, "grad_norm": 0.3877614736557007, "learning_rate": 4.850246318752707e-05, "loss": 0.3692, "step": 2213000 }, { "epoch": 14.978751624079688, "grad_norm": 0.34349948167800903, "learning_rate": 4.8502124837592037e-05, "loss": 0.3691, "step": 2213500 }, { "epoch": 14.982135123430057, "grad_norm": 0.3915456235408783, "learning_rate": 4.8501786487657e-05, "loss": 0.3691, "step": 2214000 }, { "epoch": 14.985518622780425, "grad_norm": 0.41945990920066833, "learning_rate": 4.850144813772196e-05, "loss": 0.368, "step": 2214500 }, { "epoch": 14.988902122130792, "grad_norm": 0.38330912590026855, "learning_rate": 4.850110978778692e-05, "loss": 0.3693, "step": 2215000 }, { "epoch": 14.99228562148116, "grad_norm": 0.3266220986843109, "learning_rate": 4.8500771437851885e-05, "loss": 0.3691, "step": 2215500 }, { "epoch": 14.99566912083153, "grad_norm": 0.34072598814964294, "learning_rate": 4.850043308791685e-05, "loss": 0.3679, "step": 2216000 }, { "epoch": 14.999052620181898, "grad_norm": 0.36264902353286743, "learning_rate": 4.850009473798181e-05, "loss": 0.3702, "step": 2216500 }, { "epoch": 15.0, "eval_accuracy": 0.8592597939115394, "eval_loss": 0.5718047022819519, "eval_runtime": 3550.6807, "eval_samples_per_second": 81.884, "eval_steps_per_second": 5.118, "step": 2216640 }, { "epoch": 15.002436119532264, "grad_norm": 0.3186267912387848, "learning_rate": 4.849975638804677e-05, "loss": 0.3676, "step": 2217000 }, { "epoch": 15.005819618882633, "grad_norm": 0.3836493492126465, "learning_rate": 4.849941803811174e-05, "loss": 0.3672, "step": 2217500 }, { "epoch": 15.009203118233001, "grad_norm": 0.36411088705062866, "learning_rate": 4.84990796881767e-05, "loss": 0.3662, "step": 2218000 }, { "epoch": 15.01258661758337, "grad_norm": 0.3376358151435852, "learning_rate": 4.8498741338241665e-05, "loss": 0.3674, "step": 2218500 }, { "epoch": 15.015970116933737, "grad_norm": 0.36777323484420776, "learning_rate": 4.849840298830663e-05, "loss": 0.3679, "step": 2219000 }, { "epoch": 15.019353616284105, "grad_norm": 0.3862101435661316, "learning_rate": 4.8498064638371596e-05, "loss": 0.3659, "step": 2219500 }, { "epoch": 15.022737115634474, "grad_norm": 0.38136187195777893, "learning_rate": 4.849772628843656e-05, "loss": 0.3667, "step": 2220000 }, { "epoch": 15.026120614984842, "grad_norm": 0.34917306900024414, "learning_rate": 4.849738793850152e-05, "loss": 0.3655, "step": 2220500 }, { "epoch": 15.02950411433521, "grad_norm": 0.343822717666626, "learning_rate": 4.849704958856648e-05, "loss": 0.366, "step": 2221000 }, { "epoch": 15.032887613685578, "grad_norm": 0.3669893145561218, "learning_rate": 4.8496711238631444e-05, "loss": 0.3674, "step": 2221500 }, { "epoch": 15.036271113035946, "grad_norm": 0.37427571415901184, "learning_rate": 4.8496372888696406e-05, "loss": 0.3667, "step": 2222000 }, { "epoch": 15.039654612386315, "grad_norm": 0.36124661564826965, "learning_rate": 4.849603453876137e-05, "loss": 0.3676, "step": 2222500 }, { "epoch": 15.043038111736683, "grad_norm": 0.32961371541023254, "learning_rate": 4.849569618882634e-05, "loss": 0.3681, "step": 2223000 }, { "epoch": 15.04642161108705, "grad_norm": 0.3570033013820648, "learning_rate": 4.84953578388913e-05, "loss": 0.3676, "step": 2223500 }, { "epoch": 15.049805110437418, "grad_norm": 0.3314495384693146, "learning_rate": 4.849501948895626e-05, "loss": 0.3683, "step": 2224000 }, { "epoch": 15.053188609787787, "grad_norm": 0.3179497718811035, "learning_rate": 4.8494681139021224e-05, "loss": 0.3681, "step": 2224500 }, { "epoch": 15.056572109138155, "grad_norm": 0.3659362494945526, "learning_rate": 4.8494342789086186e-05, "loss": 0.3683, "step": 2225000 }, { "epoch": 15.059955608488524, "grad_norm": 0.36493274569511414, "learning_rate": 4.8494004439151155e-05, "loss": 0.366, "step": 2225500 }, { "epoch": 15.06333910783889, "grad_norm": 0.35716429352760315, "learning_rate": 4.849366608921611e-05, "loss": 0.3668, "step": 2226000 }, { "epoch": 15.06672260718926, "grad_norm": 0.35063108801841736, "learning_rate": 4.849332773928107e-05, "loss": 0.3685, "step": 2226500 }, { "epoch": 15.070106106539628, "grad_norm": 0.3515456020832062, "learning_rate": 4.849298938934604e-05, "loss": 0.3674, "step": 2227000 }, { "epoch": 15.073489605889996, "grad_norm": 0.34294208884239197, "learning_rate": 4.8492651039411e-05, "loss": 0.3673, "step": 2227500 }, { "epoch": 15.076873105240363, "grad_norm": 0.3997062146663666, "learning_rate": 4.8492312689475965e-05, "loss": 0.3675, "step": 2228000 }, { "epoch": 15.080256604590732, "grad_norm": 0.337519109249115, "learning_rate": 4.849197433954093e-05, "loss": 0.3657, "step": 2228500 }, { "epoch": 15.0836401039411, "grad_norm": 0.3725280463695526, "learning_rate": 4.8491635989605896e-05, "loss": 0.3667, "step": 2229000 }, { "epoch": 15.087023603291469, "grad_norm": 0.32875075936317444, "learning_rate": 4.849129763967086e-05, "loss": 0.3693, "step": 2229500 }, { "epoch": 15.090407102641837, "grad_norm": 0.3951806128025055, "learning_rate": 4.849095928973582e-05, "loss": 0.3676, "step": 2230000 }, { "epoch": 15.093790601992204, "grad_norm": 0.41113314032554626, "learning_rate": 4.849062093980078e-05, "loss": 0.368, "step": 2230500 }, { "epoch": 15.097174101342572, "grad_norm": 0.36592134833335876, "learning_rate": 4.8490282589865745e-05, "loss": 0.3675, "step": 2231000 }, { "epoch": 15.100557600692941, "grad_norm": 0.3612339198589325, "learning_rate": 4.848994423993071e-05, "loss": 0.3686, "step": 2231500 }, { "epoch": 15.10394110004331, "grad_norm": 0.34407153725624084, "learning_rate": 4.848960588999567e-05, "loss": 0.3675, "step": 2232000 }, { "epoch": 15.107324599393676, "grad_norm": 0.35251736640930176, "learning_rate": 4.848926754006063e-05, "loss": 0.3669, "step": 2232500 }, { "epoch": 15.110708098744045, "grad_norm": 0.3643178939819336, "learning_rate": 4.84889291901256e-05, "loss": 0.369, "step": 2233000 }, { "epoch": 15.114091598094413, "grad_norm": 0.38118237257003784, "learning_rate": 4.848859084019056e-05, "loss": 0.3677, "step": 2233500 }, { "epoch": 15.117475097444782, "grad_norm": 0.40105199813842773, "learning_rate": 4.8488252490255524e-05, "loss": 0.3693, "step": 2234000 }, { "epoch": 15.120858596795149, "grad_norm": 0.35876044631004333, "learning_rate": 4.8487914140320486e-05, "loss": 0.3681, "step": 2234500 }, { "epoch": 15.124242096145517, "grad_norm": 0.3273409903049469, "learning_rate": 4.8487575790385455e-05, "loss": 0.3664, "step": 2235000 }, { "epoch": 15.127625595495886, "grad_norm": 0.38285356760025024, "learning_rate": 4.848723744045041e-05, "loss": 0.3695, "step": 2235500 }, { "epoch": 15.131009094846254, "grad_norm": 0.34514716267585754, "learning_rate": 4.848689909051537e-05, "loss": 0.3667, "step": 2236000 }, { "epoch": 15.134392594196623, "grad_norm": 0.3504362106323242, "learning_rate": 4.848656074058034e-05, "loss": 0.3701, "step": 2236500 }, { "epoch": 15.13777609354699, "grad_norm": 0.3744419813156128, "learning_rate": 4.8486222390645304e-05, "loss": 0.3685, "step": 2237000 }, { "epoch": 15.141159592897358, "grad_norm": 0.3694138526916504, "learning_rate": 4.8485884040710266e-05, "loss": 0.3671, "step": 2237500 }, { "epoch": 15.144543092247726, "grad_norm": 0.35547423362731934, "learning_rate": 4.848554569077523e-05, "loss": 0.3678, "step": 2238000 }, { "epoch": 15.147926591598095, "grad_norm": 0.3246602416038513, "learning_rate": 4.84852073408402e-05, "loss": 0.3677, "step": 2238500 }, { "epoch": 15.151310090948462, "grad_norm": 0.3738870918750763, "learning_rate": 4.848486899090516e-05, "loss": 0.3687, "step": 2239000 }, { "epoch": 15.15469359029883, "grad_norm": 0.3235304057598114, "learning_rate": 4.848453064097012e-05, "loss": 0.368, "step": 2239500 }, { "epoch": 15.158077089649199, "grad_norm": 0.37630218267440796, "learning_rate": 4.848419229103508e-05, "loss": 0.3673, "step": 2240000 }, { "epoch": 15.161460588999567, "grad_norm": 0.3362449109554291, "learning_rate": 4.8483853941100045e-05, "loss": 0.3666, "step": 2240500 }, { "epoch": 15.164844088349936, "grad_norm": 0.37820273637771606, "learning_rate": 4.848351559116501e-05, "loss": 0.3685, "step": 2241000 }, { "epoch": 15.168227587700303, "grad_norm": 0.3779197931289673, "learning_rate": 4.848317724122997e-05, "loss": 0.3677, "step": 2241500 }, { "epoch": 15.171611087050671, "grad_norm": 0.3281296491622925, "learning_rate": 4.848283889129493e-05, "loss": 0.3668, "step": 2242000 }, { "epoch": 15.17499458640104, "grad_norm": 0.34983110427856445, "learning_rate": 4.84825005413599e-05, "loss": 0.369, "step": 2242500 }, { "epoch": 15.178378085751408, "grad_norm": 0.37141284346580505, "learning_rate": 4.848216219142486e-05, "loss": 0.3682, "step": 2243000 }, { "epoch": 15.181761585101775, "grad_norm": 0.36924445629119873, "learning_rate": 4.8481823841489825e-05, "loss": 0.367, "step": 2243500 }, { "epoch": 15.185145084452143, "grad_norm": 0.36089447140693665, "learning_rate": 4.848148549155479e-05, "loss": 0.3688, "step": 2244000 }, { "epoch": 15.188528583802512, "grad_norm": 0.3452288508415222, "learning_rate": 4.8481147141619756e-05, "loss": 0.3685, "step": 2244500 }, { "epoch": 15.19191208315288, "grad_norm": 0.34450557827949524, "learning_rate": 4.848080879168471e-05, "loss": 0.3685, "step": 2245000 }, { "epoch": 15.195295582503249, "grad_norm": 0.35631123185157776, "learning_rate": 4.848047044174967e-05, "loss": 0.3672, "step": 2245500 }, { "epoch": 15.198679081853616, "grad_norm": 0.3319762349128723, "learning_rate": 4.848013209181464e-05, "loss": 0.3677, "step": 2246000 }, { "epoch": 15.202062581203984, "grad_norm": 0.3290312588214874, "learning_rate": 4.8479793741879604e-05, "loss": 0.3689, "step": 2246500 }, { "epoch": 15.205446080554353, "grad_norm": 0.3664303719997406, "learning_rate": 4.8479455391944567e-05, "loss": 0.3681, "step": 2247000 }, { "epoch": 15.208829579904721, "grad_norm": 0.3830331563949585, "learning_rate": 4.847911704200953e-05, "loss": 0.3695, "step": 2247500 }, { "epoch": 15.212213079255088, "grad_norm": 0.38021984696388245, "learning_rate": 4.84787786920745e-05, "loss": 0.3675, "step": 2248000 }, { "epoch": 15.215596578605457, "grad_norm": 0.4113265872001648, "learning_rate": 4.847844034213946e-05, "loss": 0.3674, "step": 2248500 }, { "epoch": 15.218980077955825, "grad_norm": 0.3703368306159973, "learning_rate": 4.847810199220442e-05, "loss": 0.3688, "step": 2249000 }, { "epoch": 15.222363577306194, "grad_norm": 0.39109688997268677, "learning_rate": 4.847776364226938e-05, "loss": 0.3687, "step": 2249500 }, { "epoch": 15.225747076656562, "grad_norm": 0.37408167123794556, "learning_rate": 4.8477425292334346e-05, "loss": 0.3686, "step": 2250000 }, { "epoch": 15.229130576006929, "grad_norm": 0.35299205780029297, "learning_rate": 4.847708694239931e-05, "loss": 0.369, "step": 2250500 }, { "epoch": 15.232514075357297, "grad_norm": 0.464773565530777, "learning_rate": 4.847674859246427e-05, "loss": 0.3689, "step": 2251000 }, { "epoch": 15.235897574707666, "grad_norm": 0.3952706456184387, "learning_rate": 4.847641024252923e-05, "loss": 0.3669, "step": 2251500 }, { "epoch": 15.239281074058034, "grad_norm": 0.3495103716850281, "learning_rate": 4.84760718925942e-05, "loss": 0.3677, "step": 2252000 }, { "epoch": 15.242664573408401, "grad_norm": 0.3670581877231598, "learning_rate": 4.8475733542659163e-05, "loss": 0.3685, "step": 2252500 }, { "epoch": 15.24604807275877, "grad_norm": 0.37361136078834534, "learning_rate": 4.8475395192724126e-05, "loss": 0.3697, "step": 2253000 }, { "epoch": 15.249431572109138, "grad_norm": 0.34565627574920654, "learning_rate": 4.847505684278909e-05, "loss": 0.3691, "step": 2253500 }, { "epoch": 15.252815071459507, "grad_norm": 0.34091848134994507, "learning_rate": 4.8474718492854057e-05, "loss": 0.3673, "step": 2254000 }, { "epoch": 15.256198570809875, "grad_norm": 0.3778747320175171, "learning_rate": 4.847438014291901e-05, "loss": 0.3674, "step": 2254500 }, { "epoch": 15.259582070160242, "grad_norm": 0.3678615689277649, "learning_rate": 4.8474041792983974e-05, "loss": 0.3682, "step": 2255000 }, { "epoch": 15.26296556951061, "grad_norm": 0.3665476441383362, "learning_rate": 4.847370344304894e-05, "loss": 0.3689, "step": 2255500 }, { "epoch": 15.266349068860979, "grad_norm": 0.3415987491607666, "learning_rate": 4.8473365093113905e-05, "loss": 0.3695, "step": 2256000 }, { "epoch": 15.269732568211348, "grad_norm": 0.3851405084133148, "learning_rate": 4.847302674317887e-05, "loss": 0.3672, "step": 2256500 }, { "epoch": 15.273116067561714, "grad_norm": 0.3553013801574707, "learning_rate": 4.847268839324383e-05, "loss": 0.3683, "step": 2257000 }, { "epoch": 15.276499566912083, "grad_norm": 0.38508784770965576, "learning_rate": 4.84723500433088e-05, "loss": 0.3674, "step": 2257500 }, { "epoch": 15.279883066262451, "grad_norm": 0.36417558789253235, "learning_rate": 4.847201169337376e-05, "loss": 0.3689, "step": 2258000 }, { "epoch": 15.28326656561282, "grad_norm": 0.35827988386154175, "learning_rate": 4.847167334343872e-05, "loss": 0.367, "step": 2258500 }, { "epoch": 15.286650064963187, "grad_norm": 0.36032170057296753, "learning_rate": 4.847133499350368e-05, "loss": 0.3675, "step": 2259000 }, { "epoch": 15.290033564313555, "grad_norm": 0.3674727976322174, "learning_rate": 4.847099664356865e-05, "loss": 0.3676, "step": 2259500 }, { "epoch": 15.293417063663924, "grad_norm": 0.358148455619812, "learning_rate": 4.847065829363361e-05, "loss": 0.3683, "step": 2260000 }, { "epoch": 15.296800563014292, "grad_norm": 0.36358699202537537, "learning_rate": 4.847031994369857e-05, "loss": 0.3677, "step": 2260500 }, { "epoch": 15.30018406236466, "grad_norm": 0.4036157429218292, "learning_rate": 4.846998159376353e-05, "loss": 0.3679, "step": 2261000 }, { "epoch": 15.303567561715028, "grad_norm": 0.38097384572029114, "learning_rate": 4.84696432438285e-05, "loss": 0.3687, "step": 2261500 }, { "epoch": 15.306951061065396, "grad_norm": 0.4017247259616852, "learning_rate": 4.8469304893893464e-05, "loss": 0.3683, "step": 2262000 }, { "epoch": 15.310334560415765, "grad_norm": 0.36650019884109497, "learning_rate": 4.8468966543958426e-05, "loss": 0.3679, "step": 2262500 }, { "epoch": 15.313718059766133, "grad_norm": 0.34185999631881714, "learning_rate": 4.846862819402339e-05, "loss": 0.3686, "step": 2263000 }, { "epoch": 15.3171015591165, "grad_norm": 0.4078949987888336, "learning_rate": 4.846828984408836e-05, "loss": 0.3673, "step": 2263500 }, { "epoch": 15.320485058466868, "grad_norm": 0.3369295299053192, "learning_rate": 4.846795149415331e-05, "loss": 0.3695, "step": 2264000 }, { "epoch": 15.323868557817237, "grad_norm": 0.3768553137779236, "learning_rate": 4.8467613144218275e-05, "loss": 0.3691, "step": 2264500 }, { "epoch": 15.327252057167605, "grad_norm": 0.37157976627349854, "learning_rate": 4.8467274794283244e-05, "loss": 0.37, "step": 2265000 }, { "epoch": 15.330635556517974, "grad_norm": 0.3911592364311218, "learning_rate": 4.8466936444348206e-05, "loss": 0.3686, "step": 2265500 }, { "epoch": 15.33401905586834, "grad_norm": 0.32359597086906433, "learning_rate": 4.846659809441317e-05, "loss": 0.3665, "step": 2266000 }, { "epoch": 15.33740255521871, "grad_norm": 0.33536919951438904, "learning_rate": 4.846625974447813e-05, "loss": 0.3689, "step": 2266500 }, { "epoch": 15.340786054569078, "grad_norm": 0.3743899166584015, "learning_rate": 4.84659213945431e-05, "loss": 0.368, "step": 2267000 }, { "epoch": 15.344169553919446, "grad_norm": 0.36726707220077515, "learning_rate": 4.846558304460806e-05, "loss": 0.3677, "step": 2267500 }, { "epoch": 15.347553053269813, "grad_norm": 0.3498327434062958, "learning_rate": 4.846524469467302e-05, "loss": 0.3688, "step": 2268000 }, { "epoch": 15.350936552620182, "grad_norm": 0.3115338981151581, "learning_rate": 4.846490634473798e-05, "loss": 0.3678, "step": 2268500 }, { "epoch": 15.35432005197055, "grad_norm": 0.3821237087249756, "learning_rate": 4.846456799480295e-05, "loss": 0.3679, "step": 2269000 }, { "epoch": 15.357703551320919, "grad_norm": 0.3773631155490875, "learning_rate": 4.846422964486791e-05, "loss": 0.3678, "step": 2269500 }, { "epoch": 15.361087050671287, "grad_norm": 0.34869384765625, "learning_rate": 4.846389129493287e-05, "loss": 0.3671, "step": 2270000 }, { "epoch": 15.364470550021654, "grad_norm": 0.3591691553592682, "learning_rate": 4.8463552944997834e-05, "loss": 0.3671, "step": 2270500 }, { "epoch": 15.367854049372022, "grad_norm": 0.36942118406295776, "learning_rate": 4.84632145950628e-05, "loss": 0.3679, "step": 2271000 }, { "epoch": 15.371237548722391, "grad_norm": 0.3710452616214752, "learning_rate": 4.8462876245127765e-05, "loss": 0.3668, "step": 2271500 }, { "epoch": 15.37462104807276, "grad_norm": 0.3890515863895416, "learning_rate": 4.846253789519273e-05, "loss": 0.3696, "step": 2272000 }, { "epoch": 15.378004547423126, "grad_norm": 0.3567555844783783, "learning_rate": 4.846219954525769e-05, "loss": 0.3677, "step": 2272500 }, { "epoch": 15.381388046773495, "grad_norm": 0.3835057318210602, "learning_rate": 4.846186119532266e-05, "loss": 0.3691, "step": 2273000 }, { "epoch": 15.384771546123863, "grad_norm": 0.3696700930595398, "learning_rate": 4.846152284538761e-05, "loss": 0.369, "step": 2273500 }, { "epoch": 15.388155045474232, "grad_norm": 0.3335026800632477, "learning_rate": 4.8461184495452575e-05, "loss": 0.3687, "step": 2274000 }, { "epoch": 15.391538544824598, "grad_norm": 0.37481197714805603, "learning_rate": 4.8460846145517544e-05, "loss": 0.37, "step": 2274500 }, { "epoch": 15.394922044174967, "grad_norm": 0.3376085162162781, "learning_rate": 4.8460507795582506e-05, "loss": 0.3694, "step": 2275000 }, { "epoch": 15.398305543525336, "grad_norm": 0.38262149691581726, "learning_rate": 4.846016944564747e-05, "loss": 0.3694, "step": 2275500 }, { "epoch": 15.401689042875704, "grad_norm": 0.36028096079826355, "learning_rate": 4.845983109571243e-05, "loss": 0.3683, "step": 2276000 }, { "epoch": 15.405072542226073, "grad_norm": 0.36225685477256775, "learning_rate": 4.84594927457774e-05, "loss": 0.369, "step": 2276500 }, { "epoch": 15.40845604157644, "grad_norm": 0.361055463552475, "learning_rate": 4.845915439584236e-05, "loss": 0.3691, "step": 2277000 }, { "epoch": 15.411839540926808, "grad_norm": 0.34308624267578125, "learning_rate": 4.8458816045907324e-05, "loss": 0.3685, "step": 2277500 }, { "epoch": 15.415223040277176, "grad_norm": 0.37975791096687317, "learning_rate": 4.845847769597228e-05, "loss": 0.3675, "step": 2278000 }, { "epoch": 15.418606539627545, "grad_norm": 0.3387218713760376, "learning_rate": 4.845813934603725e-05, "loss": 0.3674, "step": 2278500 }, { "epoch": 15.421990038977913, "grad_norm": 0.3455009162425995, "learning_rate": 4.845780099610221e-05, "loss": 0.3689, "step": 2279000 }, { "epoch": 15.42537353832828, "grad_norm": 0.3392178416252136, "learning_rate": 4.845746264616717e-05, "loss": 0.3668, "step": 2279500 }, { "epoch": 15.428757037678649, "grad_norm": 0.37927737832069397, "learning_rate": 4.8457124296232134e-05, "loss": 0.3687, "step": 2280000 }, { "epoch": 15.432140537029017, "grad_norm": 0.3556533455848694, "learning_rate": 4.84567859462971e-05, "loss": 0.3695, "step": 2280500 }, { "epoch": 15.435524036379386, "grad_norm": 0.37136268615722656, "learning_rate": 4.8456447596362065e-05, "loss": 0.3674, "step": 2281000 }, { "epoch": 15.438907535729752, "grad_norm": 0.39647194743156433, "learning_rate": 4.845610924642703e-05, "loss": 0.3677, "step": 2281500 }, { "epoch": 15.442291035080121, "grad_norm": 0.39070430397987366, "learning_rate": 4.845577089649199e-05, "loss": 0.3681, "step": 2282000 }, { "epoch": 15.44567453443049, "grad_norm": 0.34807637333869934, "learning_rate": 4.845543254655696e-05, "loss": 0.3685, "step": 2282500 }, { "epoch": 15.449058033780858, "grad_norm": 0.3645128011703491, "learning_rate": 4.8455094196621914e-05, "loss": 0.3687, "step": 2283000 }, { "epoch": 15.452441533131225, "grad_norm": 0.3826288878917694, "learning_rate": 4.8454755846686876e-05, "loss": 0.369, "step": 2283500 }, { "epoch": 15.455825032481593, "grad_norm": 0.4068937301635742, "learning_rate": 4.8454417496751845e-05, "loss": 0.3673, "step": 2284000 }, { "epoch": 15.459208531831962, "grad_norm": 0.35856378078460693, "learning_rate": 4.845407914681681e-05, "loss": 0.3678, "step": 2284500 }, { "epoch": 15.46259203118233, "grad_norm": 0.4144749045372009, "learning_rate": 4.845374079688177e-05, "loss": 0.3679, "step": 2285000 }, { "epoch": 15.465975530532699, "grad_norm": 0.36407583951950073, "learning_rate": 4.845340244694673e-05, "loss": 0.3676, "step": 2285500 }, { "epoch": 15.469359029883066, "grad_norm": 0.33645474910736084, "learning_rate": 4.84530640970117e-05, "loss": 0.3682, "step": 2286000 }, { "epoch": 15.472742529233434, "grad_norm": 0.36144402623176575, "learning_rate": 4.845272574707666e-05, "loss": 0.3676, "step": 2286500 }, { "epoch": 15.476126028583803, "grad_norm": 0.3633367419242859, "learning_rate": 4.8452387397141624e-05, "loss": 0.3678, "step": 2287000 }, { "epoch": 15.479509527934171, "grad_norm": 0.37776997685432434, "learning_rate": 4.8452049047206586e-05, "loss": 0.3683, "step": 2287500 }, { "epoch": 15.482893027284538, "grad_norm": 0.35414132475852966, "learning_rate": 4.845171069727155e-05, "loss": 0.3685, "step": 2288000 }, { "epoch": 15.486276526634907, "grad_norm": 0.3871769905090332, "learning_rate": 4.845137234733651e-05, "loss": 0.3685, "step": 2288500 }, { "epoch": 15.489660025985275, "grad_norm": 0.37936681509017944, "learning_rate": 4.845103399740147e-05, "loss": 0.3688, "step": 2289000 }, { "epoch": 15.493043525335644, "grad_norm": 0.37192994356155396, "learning_rate": 4.8450695647466435e-05, "loss": 0.3693, "step": 2289500 }, { "epoch": 15.496427024686012, "grad_norm": 0.3410022258758545, "learning_rate": 4.8450357297531404e-05, "loss": 0.3688, "step": 2290000 }, { "epoch": 15.499810524036379, "grad_norm": 0.3851412236690521, "learning_rate": 4.8450018947596366e-05, "loss": 0.369, "step": 2290500 }, { "epoch": 15.503194023386747, "grad_norm": 0.39784419536590576, "learning_rate": 4.844968059766133e-05, "loss": 0.3679, "step": 2291000 }, { "epoch": 15.506577522737116, "grad_norm": 0.37226787209510803, "learning_rate": 4.844934224772629e-05, "loss": 0.3684, "step": 2291500 }, { "epoch": 15.509961022087484, "grad_norm": 0.37065237760543823, "learning_rate": 4.844900389779126e-05, "loss": 0.3684, "step": 2292000 }, { "epoch": 15.513344521437851, "grad_norm": 0.33598220348358154, "learning_rate": 4.8448665547856214e-05, "loss": 0.3686, "step": 2292500 }, { "epoch": 15.51672802078822, "grad_norm": 0.3342346251010895, "learning_rate": 4.8448327197921177e-05, "loss": 0.3668, "step": 2293000 }, { "epoch": 15.520111520138588, "grad_norm": 0.32238996028900146, "learning_rate": 4.8447988847986145e-05, "loss": 0.3681, "step": 2293500 }, { "epoch": 15.523495019488957, "grad_norm": 0.3525577783584595, "learning_rate": 4.844765049805111e-05, "loss": 0.3683, "step": 2294000 }, { "epoch": 15.526878518839325, "grad_norm": 0.3491257429122925, "learning_rate": 4.844731214811607e-05, "loss": 0.368, "step": 2294500 }, { "epoch": 15.530262018189692, "grad_norm": 0.3477993309497833, "learning_rate": 4.844697379818103e-05, "loss": 0.3685, "step": 2295000 }, { "epoch": 15.53364551754006, "grad_norm": 0.35921916365623474, "learning_rate": 4.8446635448245994e-05, "loss": 0.3671, "step": 2295500 }, { "epoch": 15.537029016890429, "grad_norm": 0.3641141951084137, "learning_rate": 4.844629709831096e-05, "loss": 0.3674, "step": 2296000 }, { "epoch": 15.540412516240798, "grad_norm": 0.34084510803222656, "learning_rate": 4.8445958748375925e-05, "loss": 0.3687, "step": 2296500 }, { "epoch": 15.543796015591164, "grad_norm": 0.3691849410533905, "learning_rate": 4.844562039844089e-05, "loss": 0.3685, "step": 2297000 }, { "epoch": 15.547179514941533, "grad_norm": 0.3591060936450958, "learning_rate": 4.844528204850585e-05, "loss": 0.3695, "step": 2297500 }, { "epoch": 15.550563014291901, "grad_norm": 0.3356313109397888, "learning_rate": 4.844494369857081e-05, "loss": 0.3687, "step": 2298000 }, { "epoch": 15.55394651364227, "grad_norm": 0.35305818915367126, "learning_rate": 4.8444605348635773e-05, "loss": 0.3674, "step": 2298500 }, { "epoch": 15.557330012992637, "grad_norm": 0.3080775737762451, "learning_rate": 4.8444266998700736e-05, "loss": 0.3696, "step": 2299000 }, { "epoch": 15.560713512343005, "grad_norm": 0.3750038146972656, "learning_rate": 4.8443928648765704e-05, "loss": 0.3687, "step": 2299500 }, { "epoch": 15.564097011693374, "grad_norm": 0.3531736135482788, "learning_rate": 4.8443590298830667e-05, "loss": 0.3688, "step": 2300000 }, { "epoch": 15.567480511043742, "grad_norm": 0.37959063053131104, "learning_rate": 4.844325194889563e-05, "loss": 0.3695, "step": 2300500 }, { "epoch": 15.57086401039411, "grad_norm": 0.37598180770874023, "learning_rate": 4.844291359896059e-05, "loss": 0.3693, "step": 2301000 }, { "epoch": 15.574247509744477, "grad_norm": 0.3553810119628906, "learning_rate": 4.844257524902556e-05, "loss": 0.3679, "step": 2301500 }, { "epoch": 15.577631009094846, "grad_norm": 0.38904568552970886, "learning_rate": 4.8442236899090515e-05, "loss": 0.3681, "step": 2302000 }, { "epoch": 15.581014508445215, "grad_norm": 0.36272913217544556, "learning_rate": 4.844189854915548e-05, "loss": 0.3683, "step": 2302500 }, { "epoch": 15.584398007795583, "grad_norm": 0.37295016646385193, "learning_rate": 4.844156019922044e-05, "loss": 0.3675, "step": 2303000 }, { "epoch": 15.587781507145952, "grad_norm": 0.402294784784317, "learning_rate": 4.844122184928541e-05, "loss": 0.3685, "step": 2303500 }, { "epoch": 15.591165006496318, "grad_norm": 0.32193323969841003, "learning_rate": 4.844088349935037e-05, "loss": 0.3689, "step": 2304000 }, { "epoch": 15.594548505846687, "grad_norm": 0.35976263880729675, "learning_rate": 4.844054514941533e-05, "loss": 0.3669, "step": 2304500 }, { "epoch": 15.597932005197055, "grad_norm": 0.38775837421417236, "learning_rate": 4.8440206799480295e-05, "loss": 0.3677, "step": 2305000 }, { "epoch": 15.601315504547424, "grad_norm": 0.3899919092655182, "learning_rate": 4.8439868449545263e-05, "loss": 0.3672, "step": 2305500 }, { "epoch": 15.60469900389779, "grad_norm": 0.3507950007915497, "learning_rate": 4.8439530099610226e-05, "loss": 0.3687, "step": 2306000 }, { "epoch": 15.60808250324816, "grad_norm": 0.3544395864009857, "learning_rate": 4.843919174967519e-05, "loss": 0.3691, "step": 2306500 }, { "epoch": 15.611466002598528, "grad_norm": 0.3738711476325989, "learning_rate": 4.843885339974015e-05, "loss": 0.3677, "step": 2307000 }, { "epoch": 15.614849501948896, "grad_norm": 0.38061192631721497, "learning_rate": 4.843851504980511e-05, "loss": 0.3702, "step": 2307500 }, { "epoch": 15.618233001299263, "grad_norm": 0.3381912112236023, "learning_rate": 4.8438176699870074e-05, "loss": 0.3675, "step": 2308000 }, { "epoch": 15.621616500649631, "grad_norm": 0.3231547176837921, "learning_rate": 4.8437838349935036e-05, "loss": 0.3707, "step": 2308500 }, { "epoch": 15.625, "grad_norm": 0.3673042356967926, "learning_rate": 4.8437500000000005e-05, "loss": 0.3693, "step": 2309000 }, { "epoch": 15.628383499350369, "grad_norm": 0.3947595953941345, "learning_rate": 4.843716165006497e-05, "loss": 0.3686, "step": 2309500 }, { "epoch": 15.631766998700737, "grad_norm": 0.4065065383911133, "learning_rate": 4.843682330012993e-05, "loss": 0.3685, "step": 2310000 }, { "epoch": 15.635150498051104, "grad_norm": 0.37687912583351135, "learning_rate": 4.843648495019489e-05, "loss": 0.3673, "step": 2310500 }, { "epoch": 15.638533997401472, "grad_norm": 0.38236334919929504, "learning_rate": 4.843614660025986e-05, "loss": 0.3674, "step": 2311000 }, { "epoch": 15.64191749675184, "grad_norm": 0.3740776479244232, "learning_rate": 4.8435808250324816e-05, "loss": 0.3685, "step": 2311500 }, { "epoch": 15.64530099610221, "grad_norm": 0.3825748562812805, "learning_rate": 4.843546990038978e-05, "loss": 0.3689, "step": 2312000 }, { "epoch": 15.648684495452576, "grad_norm": 0.3572474718093872, "learning_rate": 4.843513155045474e-05, "loss": 0.3685, "step": 2312500 }, { "epoch": 15.652067994802945, "grad_norm": 0.35315316915512085, "learning_rate": 4.843479320051971e-05, "loss": 0.3687, "step": 2313000 }, { "epoch": 15.655451494153313, "grad_norm": 0.32699042558670044, "learning_rate": 4.843445485058467e-05, "loss": 0.3694, "step": 2313500 }, { "epoch": 15.658834993503682, "grad_norm": 0.33419954776763916, "learning_rate": 4.843411650064963e-05, "loss": 0.3685, "step": 2314000 }, { "epoch": 15.662218492854048, "grad_norm": 0.3580189347267151, "learning_rate": 4.8433778150714595e-05, "loss": 0.3693, "step": 2314500 }, { "epoch": 15.665601992204417, "grad_norm": 0.3577320873737335, "learning_rate": 4.8433439800779564e-05, "loss": 0.369, "step": 2315000 }, { "epoch": 15.668985491554785, "grad_norm": 0.4055269956588745, "learning_rate": 4.8433101450844526e-05, "loss": 0.3686, "step": 2315500 }, { "epoch": 15.672368990905154, "grad_norm": 0.3679068088531494, "learning_rate": 4.843276310090949e-05, "loss": 0.3691, "step": 2316000 }, { "epoch": 15.675752490255523, "grad_norm": 0.33523890376091003, "learning_rate": 4.843242475097445e-05, "loss": 0.3696, "step": 2316500 }, { "epoch": 15.67913598960589, "grad_norm": 0.3535049259662628, "learning_rate": 4.843208640103941e-05, "loss": 0.368, "step": 2317000 }, { "epoch": 15.682519488956258, "grad_norm": 0.3424280881881714, "learning_rate": 4.8431748051104375e-05, "loss": 0.3676, "step": 2317500 }, { "epoch": 15.685902988306626, "grad_norm": 0.34147903323173523, "learning_rate": 4.843140970116934e-05, "loss": 0.368, "step": 2318000 }, { "epoch": 15.689286487656995, "grad_norm": 0.4109104573726654, "learning_rate": 4.8431071351234306e-05, "loss": 0.3692, "step": 2318500 }, { "epoch": 15.692669987007363, "grad_norm": 0.32174432277679443, "learning_rate": 4.843073300129927e-05, "loss": 0.3674, "step": 2319000 }, { "epoch": 15.69605348635773, "grad_norm": 0.3250316083431244, "learning_rate": 4.843039465136423e-05, "loss": 0.3676, "step": 2319500 }, { "epoch": 15.699436985708099, "grad_norm": 0.34445062279701233, "learning_rate": 4.843005630142919e-05, "loss": 0.3701, "step": 2320000 }, { "epoch": 15.702820485058467, "grad_norm": 0.37600356340408325, "learning_rate": 4.842971795149416e-05, "loss": 0.3675, "step": 2320500 }, { "epoch": 15.706203984408836, "grad_norm": 0.36351558566093445, "learning_rate": 4.8429379601559116e-05, "loss": 0.3684, "step": 2321000 }, { "epoch": 15.709587483759202, "grad_norm": 0.36934009194374084, "learning_rate": 4.842904125162408e-05, "loss": 0.3696, "step": 2321500 }, { "epoch": 15.712970983109571, "grad_norm": 0.34784290194511414, "learning_rate": 4.842870290168904e-05, "loss": 0.3683, "step": 2322000 }, { "epoch": 15.71635448245994, "grad_norm": 0.36407607793807983, "learning_rate": 4.842836455175401e-05, "loss": 0.3685, "step": 2322500 }, { "epoch": 15.719737981810308, "grad_norm": 0.3845270574092865, "learning_rate": 4.842802620181897e-05, "loss": 0.3685, "step": 2323000 }, { "epoch": 15.723121481160675, "grad_norm": 0.39453384280204773, "learning_rate": 4.8427687851883934e-05, "loss": 0.3692, "step": 2323500 }, { "epoch": 15.726504980511043, "grad_norm": 0.34942588210105896, "learning_rate": 4.8427349501948896e-05, "loss": 0.3684, "step": 2324000 }, { "epoch": 15.729888479861412, "grad_norm": 0.37566226720809937, "learning_rate": 4.8427011152013865e-05, "loss": 0.3675, "step": 2324500 }, { "epoch": 15.73327197921178, "grad_norm": 0.36661645770072937, "learning_rate": 4.842667280207883e-05, "loss": 0.3692, "step": 2325000 }, { "epoch": 15.736655478562149, "grad_norm": 0.381184458732605, "learning_rate": 4.842633445214379e-05, "loss": 0.3693, "step": 2325500 }, { "epoch": 15.740038977912516, "grad_norm": 0.3286978006362915, "learning_rate": 4.842599610220875e-05, "loss": 0.3691, "step": 2326000 }, { "epoch": 15.743422477262884, "grad_norm": 0.3738749623298645, "learning_rate": 4.842565775227371e-05, "loss": 0.3677, "step": 2326500 }, { "epoch": 15.746805976613253, "grad_norm": 0.3820296823978424, "learning_rate": 4.8425319402338675e-05, "loss": 0.3689, "step": 2327000 }, { "epoch": 15.750189475963621, "grad_norm": 0.36072027683258057, "learning_rate": 4.842498105240364e-05, "loss": 0.368, "step": 2327500 }, { "epoch": 15.753572975313988, "grad_norm": 0.3539668619632721, "learning_rate": 4.8424642702468606e-05, "loss": 0.37, "step": 2328000 }, { "epoch": 15.756956474664356, "grad_norm": 0.360319584608078, "learning_rate": 4.842430435253357e-05, "loss": 0.3701, "step": 2328500 }, { "epoch": 15.760339974014725, "grad_norm": 0.3770919442176819, "learning_rate": 4.842396600259853e-05, "loss": 0.3669, "step": 2329000 }, { "epoch": 15.763723473365093, "grad_norm": 0.35376468300819397, "learning_rate": 4.842362765266349e-05, "loss": 0.3698, "step": 2329500 }, { "epoch": 15.767106972715462, "grad_norm": 0.40981361269950867, "learning_rate": 4.842328930272846e-05, "loss": 0.3688, "step": 2330000 }, { "epoch": 15.770490472065829, "grad_norm": 0.34691673517227173, "learning_rate": 4.842295095279342e-05, "loss": 0.3689, "step": 2330500 }, { "epoch": 15.773873971416197, "grad_norm": 0.34768006205558777, "learning_rate": 4.842261260285838e-05, "loss": 0.3692, "step": 2331000 }, { "epoch": 15.777257470766566, "grad_norm": 0.4068549573421478, "learning_rate": 4.842227425292334e-05, "loss": 0.3689, "step": 2331500 }, { "epoch": 15.780640970116934, "grad_norm": 0.3581763803958893, "learning_rate": 4.842193590298831e-05, "loss": 0.368, "step": 2332000 }, { "epoch": 15.784024469467301, "grad_norm": 0.3405434191226959, "learning_rate": 4.842159755305327e-05, "loss": 0.3684, "step": 2332500 }, { "epoch": 15.78740796881767, "grad_norm": 0.3610570728778839, "learning_rate": 4.8421259203118234e-05, "loss": 0.3696, "step": 2333000 }, { "epoch": 15.790791468168038, "grad_norm": 0.3928324282169342, "learning_rate": 4.8420920853183196e-05, "loss": 0.3683, "step": 2333500 }, { "epoch": 15.794174967518407, "grad_norm": 0.3627435266971588, "learning_rate": 4.8420582503248165e-05, "loss": 0.3684, "step": 2334000 }, { "epoch": 15.797558466868775, "grad_norm": 0.35874125361442566, "learning_rate": 4.842024415331313e-05, "loss": 0.369, "step": 2334500 }, { "epoch": 15.800941966219142, "grad_norm": 0.34165313839912415, "learning_rate": 4.841990580337809e-05, "loss": 0.3695, "step": 2335000 }, { "epoch": 15.80432546556951, "grad_norm": 0.33219292759895325, "learning_rate": 4.841956745344305e-05, "loss": 0.3682, "step": 2335500 }, { "epoch": 15.807708964919879, "grad_norm": 0.3279459774494171, "learning_rate": 4.8419229103508014e-05, "loss": 0.37, "step": 2336000 }, { "epoch": 15.811092464270248, "grad_norm": 0.3773161470890045, "learning_rate": 4.8418890753572976e-05, "loss": 0.3679, "step": 2336500 }, { "epoch": 15.814475963620614, "grad_norm": 0.3741438388824463, "learning_rate": 4.841855240363794e-05, "loss": 0.3697, "step": 2337000 }, { "epoch": 15.817859462970983, "grad_norm": 0.3438836932182312, "learning_rate": 4.841821405370291e-05, "loss": 0.3685, "step": 2337500 }, { "epoch": 15.821242962321351, "grad_norm": 0.36285704374313354, "learning_rate": 4.841787570376787e-05, "loss": 0.3682, "step": 2338000 }, { "epoch": 15.82462646167172, "grad_norm": 0.35527315735816956, "learning_rate": 4.841753735383283e-05, "loss": 0.3697, "step": 2338500 }, { "epoch": 15.828009961022087, "grad_norm": 0.38485226035118103, "learning_rate": 4.841719900389779e-05, "loss": 0.3683, "step": 2339000 }, { "epoch": 15.831393460372455, "grad_norm": 0.38870787620544434, "learning_rate": 4.841686065396276e-05, "loss": 0.3682, "step": 2339500 }, { "epoch": 15.834776959722824, "grad_norm": 0.36204108595848083, "learning_rate": 4.8416522304027724e-05, "loss": 0.3687, "step": 2340000 }, { "epoch": 15.838160459073192, "grad_norm": 0.37256479263305664, "learning_rate": 4.841618395409268e-05, "loss": 0.3673, "step": 2340500 }, { "epoch": 15.84154395842356, "grad_norm": 0.37685421109199524, "learning_rate": 4.841584560415764e-05, "loss": 0.3684, "step": 2341000 }, { "epoch": 15.844927457773927, "grad_norm": 0.36085960268974304, "learning_rate": 4.841550725422261e-05, "loss": 0.3691, "step": 2341500 }, { "epoch": 15.848310957124296, "grad_norm": 0.3423760235309601, "learning_rate": 4.841516890428757e-05, "loss": 0.3691, "step": 2342000 }, { "epoch": 15.851694456474664, "grad_norm": 0.383521169424057, "learning_rate": 4.8414830554352535e-05, "loss": 0.3692, "step": 2342500 }, { "epoch": 15.855077955825033, "grad_norm": 0.38384073972702026, "learning_rate": 4.84144922044175e-05, "loss": 0.3678, "step": 2343000 }, { "epoch": 15.858461455175402, "grad_norm": 0.36292794346809387, "learning_rate": 4.8414153854482466e-05, "loss": 0.3697, "step": 2343500 }, { "epoch": 15.861844954525768, "grad_norm": 0.3684341013431549, "learning_rate": 4.841381550454743e-05, "loss": 0.3687, "step": 2344000 }, { "epoch": 15.865228453876137, "grad_norm": 0.3589087128639221, "learning_rate": 4.841347715461239e-05, "loss": 0.368, "step": 2344500 }, { "epoch": 15.868611953226505, "grad_norm": 0.37417420744895935, "learning_rate": 4.841313880467735e-05, "loss": 0.3659, "step": 2345000 }, { "epoch": 15.871995452576874, "grad_norm": 0.3539438247680664, "learning_rate": 4.8412800454742314e-05, "loss": 0.3678, "step": 2345500 }, { "epoch": 15.87537895192724, "grad_norm": 0.3440802991390228, "learning_rate": 4.8412462104807277e-05, "loss": 0.3679, "step": 2346000 }, { "epoch": 15.878762451277609, "grad_norm": 0.3424714207649231, "learning_rate": 4.841212375487224e-05, "loss": 0.3692, "step": 2346500 }, { "epoch": 15.882145950627978, "grad_norm": 0.39180028438568115, "learning_rate": 4.841178540493721e-05, "loss": 0.3677, "step": 2347000 }, { "epoch": 15.885529449978346, "grad_norm": 0.34315669536590576, "learning_rate": 4.841144705500217e-05, "loss": 0.3693, "step": 2347500 }, { "epoch": 15.888912949328713, "grad_norm": 0.3350779414176941, "learning_rate": 4.841110870506713e-05, "loss": 0.3676, "step": 2348000 }, { "epoch": 15.892296448679081, "grad_norm": 0.32782721519470215, "learning_rate": 4.8410770355132094e-05, "loss": 0.3681, "step": 2348500 }, { "epoch": 15.89567994802945, "grad_norm": 0.3709561228752136, "learning_rate": 4.8410432005197056e-05, "loss": 0.3676, "step": 2349000 }, { "epoch": 15.899063447379818, "grad_norm": 0.3544650673866272, "learning_rate": 4.8410093655262025e-05, "loss": 0.3686, "step": 2349500 }, { "epoch": 15.902446946730187, "grad_norm": 0.3804757297039032, "learning_rate": 4.840975530532698e-05, "loss": 0.3702, "step": 2350000 }, { "epoch": 15.905830446080554, "grad_norm": 0.3296406865119934, "learning_rate": 4.840941695539194e-05, "loss": 0.3697, "step": 2350500 }, { "epoch": 15.909213945430922, "grad_norm": 0.36700183153152466, "learning_rate": 4.840907860545691e-05, "loss": 0.3695, "step": 2351000 }, { "epoch": 15.91259744478129, "grad_norm": 0.35151296854019165, "learning_rate": 4.8408740255521873e-05, "loss": 0.3676, "step": 2351500 }, { "epoch": 15.91598094413166, "grad_norm": 0.34182360768318176, "learning_rate": 4.8408401905586836e-05, "loss": 0.3675, "step": 2352000 }, { "epoch": 15.919364443482026, "grad_norm": 0.36365705728530884, "learning_rate": 4.84080635556518e-05, "loss": 0.3704, "step": 2352500 }, { "epoch": 15.922747942832395, "grad_norm": 0.35310786962509155, "learning_rate": 4.840772520571677e-05, "loss": 0.3683, "step": 2353000 }, { "epoch": 15.926131442182763, "grad_norm": 0.3463936448097229, "learning_rate": 4.840738685578173e-05, "loss": 0.3669, "step": 2353500 }, { "epoch": 15.929514941533132, "grad_norm": 0.3467606008052826, "learning_rate": 4.840704850584669e-05, "loss": 0.3686, "step": 2354000 }, { "epoch": 15.9328984408835, "grad_norm": 0.3721490502357483, "learning_rate": 4.840671015591165e-05, "loss": 0.3691, "step": 2354500 }, { "epoch": 15.936281940233867, "grad_norm": 0.3566875457763672, "learning_rate": 4.8406371805976615e-05, "loss": 0.368, "step": 2355000 }, { "epoch": 15.939665439584235, "grad_norm": 0.36238959431648254, "learning_rate": 4.840603345604158e-05, "loss": 0.3681, "step": 2355500 }, { "epoch": 15.943048938934604, "grad_norm": 0.37380194664001465, "learning_rate": 4.840569510610654e-05, "loss": 0.3668, "step": 2356000 }, { "epoch": 15.946432438284972, "grad_norm": 0.3661109209060669, "learning_rate": 4.840535675617151e-05, "loss": 0.3677, "step": 2356500 }, { "epoch": 15.94981593763534, "grad_norm": 0.33019015192985535, "learning_rate": 4.840501840623647e-05, "loss": 0.3682, "step": 2357000 }, { "epoch": 15.953199436985708, "grad_norm": 0.34557926654815674, "learning_rate": 4.840468005630143e-05, "loss": 0.3693, "step": 2357500 }, { "epoch": 15.956582936336076, "grad_norm": 0.3640996813774109, "learning_rate": 4.8404341706366395e-05, "loss": 0.3666, "step": 2358000 }, { "epoch": 15.959966435686445, "grad_norm": 0.3293427526950836, "learning_rate": 4.840400335643136e-05, "loss": 0.3678, "step": 2358500 }, { "epoch": 15.963349935036813, "grad_norm": 0.39549505710601807, "learning_rate": 4.8403665006496326e-05, "loss": 0.3703, "step": 2359000 }, { "epoch": 15.96673343438718, "grad_norm": 0.37706345319747925, "learning_rate": 4.840332665656128e-05, "loss": 0.3687, "step": 2359500 }, { "epoch": 15.970116933737549, "grad_norm": 0.32542896270751953, "learning_rate": 4.840298830662624e-05, "loss": 0.369, "step": 2360000 }, { "epoch": 15.973500433087917, "grad_norm": 0.3571685254573822, "learning_rate": 4.840264995669121e-05, "loss": 0.368, "step": 2360500 }, { "epoch": 15.976883932438286, "grad_norm": 0.3591122627258301, "learning_rate": 4.8402311606756174e-05, "loss": 0.3688, "step": 2361000 }, { "epoch": 15.980267431788652, "grad_norm": 0.38528913259506226, "learning_rate": 4.8401973256821136e-05, "loss": 0.3688, "step": 2361500 }, { "epoch": 15.983650931139021, "grad_norm": 0.38601624965667725, "learning_rate": 4.84016349068861e-05, "loss": 0.3674, "step": 2362000 }, { "epoch": 15.98703443048939, "grad_norm": 0.35946911573410034, "learning_rate": 4.840129655695107e-05, "loss": 0.3677, "step": 2362500 }, { "epoch": 15.990417929839758, "grad_norm": 0.3463689386844635, "learning_rate": 4.840095820701603e-05, "loss": 0.3687, "step": 2363000 }, { "epoch": 15.993801429190125, "grad_norm": 0.3598881661891937, "learning_rate": 4.840061985708099e-05, "loss": 0.3709, "step": 2363500 }, { "epoch": 15.997184928540493, "grad_norm": 0.3293743431568146, "learning_rate": 4.8400281507145954e-05, "loss": 0.3671, "step": 2364000 }, { "epoch": 16.0, "eval_accuracy": 0.8596614364942428, "eval_loss": 0.5696810483932495, "eval_runtime": 3400.6592, "eval_samples_per_second": 85.496, "eval_steps_per_second": 5.344, "step": 2364416 }, { "epoch": 16.00056842789086, "grad_norm": 0.3467516005039215, "learning_rate": 4.8399943157210916e-05, "loss": 0.3687, "step": 2364500 }, { "epoch": 16.00395192724123, "grad_norm": 0.34219473600387573, "learning_rate": 4.839960480727588e-05, "loss": 0.3661, "step": 2365000 }, { "epoch": 16.0073354265916, "grad_norm": 0.33582720160484314, "learning_rate": 4.839926645734084e-05, "loss": 0.3664, "step": 2365500 }, { "epoch": 16.010718925941966, "grad_norm": 0.37527623772621155, "learning_rate": 4.83989281074058e-05, "loss": 0.3668, "step": 2366000 }, { "epoch": 16.014102425292336, "grad_norm": 0.5134567618370056, "learning_rate": 4.839858975747077e-05, "loss": 0.3667, "step": 2366500 }, { "epoch": 16.017485924642703, "grad_norm": 0.3311164081096649, "learning_rate": 4.839825140753573e-05, "loss": 0.3668, "step": 2367000 }, { "epoch": 16.02086942399307, "grad_norm": 0.33676856756210327, "learning_rate": 4.8397913057600695e-05, "loss": 0.366, "step": 2367500 }, { "epoch": 16.02425292334344, "grad_norm": 0.3555416166782379, "learning_rate": 4.839757470766566e-05, "loss": 0.3675, "step": 2368000 }, { "epoch": 16.027636422693806, "grad_norm": 0.36959418654441833, "learning_rate": 4.8397236357730626e-05, "loss": 0.366, "step": 2368500 }, { "epoch": 16.031019922044177, "grad_norm": 0.38383468985557556, "learning_rate": 4.839689800779558e-05, "loss": 0.3667, "step": 2369000 }, { "epoch": 16.034403421394543, "grad_norm": 0.35724765062332153, "learning_rate": 4.8396559657860544e-05, "loss": 0.3681, "step": 2369500 }, { "epoch": 16.03778692074491, "grad_norm": 0.37634825706481934, "learning_rate": 4.839622130792551e-05, "loss": 0.367, "step": 2370000 }, { "epoch": 16.04117042009528, "grad_norm": 0.3529738485813141, "learning_rate": 4.8395882957990475e-05, "loss": 0.366, "step": 2370500 }, { "epoch": 16.044553919445647, "grad_norm": 0.38263168931007385, "learning_rate": 4.839554460805544e-05, "loss": 0.3675, "step": 2371000 }, { "epoch": 16.047937418796014, "grad_norm": 0.40001940727233887, "learning_rate": 4.83952062581204e-05, "loss": 0.3656, "step": 2371500 }, { "epoch": 16.051320918146384, "grad_norm": 0.3931584656238556, "learning_rate": 4.839486790818537e-05, "loss": 0.365, "step": 2372000 }, { "epoch": 16.05470441749675, "grad_norm": 0.38671112060546875, "learning_rate": 4.839452955825033e-05, "loss": 0.3669, "step": 2372500 }, { "epoch": 16.05808791684712, "grad_norm": 0.3262976109981537, "learning_rate": 4.839419120831529e-05, "loss": 0.3669, "step": 2373000 }, { "epoch": 16.061471416197488, "grad_norm": 0.387162983417511, "learning_rate": 4.839385285838025e-05, "loss": 0.3675, "step": 2373500 }, { "epoch": 16.064854915547855, "grad_norm": 0.3413422405719757, "learning_rate": 4.8393514508445216e-05, "loss": 0.3656, "step": 2374000 }, { "epoch": 16.068238414898225, "grad_norm": 0.39504578709602356, "learning_rate": 4.839317615851018e-05, "loss": 0.3666, "step": 2374500 }, { "epoch": 16.071621914248592, "grad_norm": 0.38605839014053345, "learning_rate": 4.839283780857514e-05, "loss": 0.3657, "step": 2375000 }, { "epoch": 16.075005413598962, "grad_norm": 0.3513329327106476, "learning_rate": 4.83924994586401e-05, "loss": 0.3655, "step": 2375500 }, { "epoch": 16.07838891294933, "grad_norm": 0.3624356687068939, "learning_rate": 4.839216110870507e-05, "loss": 0.3661, "step": 2376000 }, { "epoch": 16.081772412299696, "grad_norm": 0.3576614260673523, "learning_rate": 4.8391822758770034e-05, "loss": 0.3684, "step": 2376500 }, { "epoch": 16.085155911650066, "grad_norm": 0.3729521930217743, "learning_rate": 4.8391484408834996e-05, "loss": 0.3679, "step": 2377000 }, { "epoch": 16.088539411000433, "grad_norm": 0.3682872951030731, "learning_rate": 4.839114605889996e-05, "loss": 0.3664, "step": 2377500 }, { "epoch": 16.0919229103508, "grad_norm": 0.34409353137016296, "learning_rate": 4.839080770896493e-05, "loss": 0.365, "step": 2378000 }, { "epoch": 16.09530640970117, "grad_norm": 0.3514534831047058, "learning_rate": 4.839046935902988e-05, "loss": 0.3673, "step": 2378500 }, { "epoch": 16.098689909051537, "grad_norm": 0.38848912715911865, "learning_rate": 4.8390131009094844e-05, "loss": 0.3658, "step": 2379000 }, { "epoch": 16.102073408401907, "grad_norm": 0.3803500235080719, "learning_rate": 4.838979265915981e-05, "loss": 0.3671, "step": 2379500 }, { "epoch": 16.105456907752274, "grad_norm": 0.3359062075614929, "learning_rate": 4.8389454309224775e-05, "loss": 0.3672, "step": 2380000 }, { "epoch": 16.10884040710264, "grad_norm": 0.3859556019306183, "learning_rate": 4.838911595928974e-05, "loss": 0.3668, "step": 2380500 }, { "epoch": 16.11222390645301, "grad_norm": 0.3473733067512512, "learning_rate": 4.83887776093547e-05, "loss": 0.3634, "step": 2381000 }, { "epoch": 16.115607405803377, "grad_norm": 0.3759918212890625, "learning_rate": 4.838843925941967e-05, "loss": 0.3664, "step": 2381500 }, { "epoch": 16.118990905153748, "grad_norm": 0.38334834575653076, "learning_rate": 4.838810090948463e-05, "loss": 0.3673, "step": 2382000 }, { "epoch": 16.122374404504114, "grad_norm": 0.3702244162559509, "learning_rate": 4.838776255954959e-05, "loss": 0.3676, "step": 2382500 }, { "epoch": 16.12575790385448, "grad_norm": 0.35715895891189575, "learning_rate": 4.838742420961455e-05, "loss": 0.3652, "step": 2383000 }, { "epoch": 16.12914140320485, "grad_norm": 0.41490229964256287, "learning_rate": 4.838708585967952e-05, "loss": 0.3674, "step": 2383500 }, { "epoch": 16.132524902555218, "grad_norm": 0.3774898946285248, "learning_rate": 4.838674750974448e-05, "loss": 0.3668, "step": 2384000 }, { "epoch": 16.13590840190559, "grad_norm": 0.40797972679138184, "learning_rate": 4.838640915980944e-05, "loss": 0.3666, "step": 2384500 }, { "epoch": 16.139291901255955, "grad_norm": 0.3960728943347931, "learning_rate": 4.83860708098744e-05, "loss": 0.368, "step": 2385000 }, { "epoch": 16.142675400606322, "grad_norm": 0.36006104946136475, "learning_rate": 4.838573245993937e-05, "loss": 0.3683, "step": 2385500 }, { "epoch": 16.146058899956692, "grad_norm": 0.3449351191520691, "learning_rate": 4.8385394110004334e-05, "loss": 0.3666, "step": 2386000 }, { "epoch": 16.14944239930706, "grad_norm": 0.3677423596382141, "learning_rate": 4.8385055760069297e-05, "loss": 0.3676, "step": 2386500 }, { "epoch": 16.152825898657426, "grad_norm": 0.33477944135665894, "learning_rate": 4.838471741013426e-05, "loss": 0.3684, "step": 2387000 }, { "epoch": 16.156209398007796, "grad_norm": 0.36839577555656433, "learning_rate": 4.838437906019923e-05, "loss": 0.3658, "step": 2387500 }, { "epoch": 16.159592897358163, "grad_norm": 0.3929869830608368, "learning_rate": 4.838404071026418e-05, "loss": 0.3675, "step": 2388000 }, { "epoch": 16.162976396708533, "grad_norm": 0.38416117429733276, "learning_rate": 4.8383702360329145e-05, "loss": 0.3667, "step": 2388500 }, { "epoch": 16.1663598960589, "grad_norm": 0.35896387696266174, "learning_rate": 4.8383364010394114e-05, "loss": 0.3664, "step": 2389000 }, { "epoch": 16.169743395409267, "grad_norm": 0.41448551416397095, "learning_rate": 4.8383025660459076e-05, "loss": 0.3678, "step": 2389500 }, { "epoch": 16.173126894759637, "grad_norm": 0.3747521638870239, "learning_rate": 4.838268731052404e-05, "loss": 0.3668, "step": 2390000 }, { "epoch": 16.176510394110004, "grad_norm": 0.3750576078891754, "learning_rate": 4.8382348960589e-05, "loss": 0.3681, "step": 2390500 }, { "epoch": 16.179893893460374, "grad_norm": 0.3896361291408539, "learning_rate": 4.838201061065397e-05, "loss": 0.368, "step": 2391000 }, { "epoch": 16.18327739281074, "grad_norm": 0.36419275403022766, "learning_rate": 4.838167226071893e-05, "loss": 0.3669, "step": 2391500 }, { "epoch": 16.186660892161107, "grad_norm": 0.3544306457042694, "learning_rate": 4.8381333910783893e-05, "loss": 0.3689, "step": 2392000 }, { "epoch": 16.190044391511478, "grad_norm": 0.38578328490257263, "learning_rate": 4.838099556084885e-05, "loss": 0.3667, "step": 2392500 }, { "epoch": 16.193427890861845, "grad_norm": 0.37322214245796204, "learning_rate": 4.838065721091382e-05, "loss": 0.3673, "step": 2393000 }, { "epoch": 16.196811390212215, "grad_norm": 0.37914374470710754, "learning_rate": 4.838031886097878e-05, "loss": 0.3678, "step": 2393500 }, { "epoch": 16.20019488956258, "grad_norm": 0.3337806761264801, "learning_rate": 4.837998051104374e-05, "loss": 0.3678, "step": 2394000 }, { "epoch": 16.20357838891295, "grad_norm": 0.3879007399082184, "learning_rate": 4.8379642161108704e-05, "loss": 0.3678, "step": 2394500 }, { "epoch": 16.20696188826332, "grad_norm": 0.3810998797416687, "learning_rate": 4.837930381117367e-05, "loss": 0.3675, "step": 2395000 }, { "epoch": 16.210345387613685, "grad_norm": 0.4061427414417267, "learning_rate": 4.8378965461238635e-05, "loss": 0.3664, "step": 2395500 }, { "epoch": 16.213728886964052, "grad_norm": 0.34380391240119934, "learning_rate": 4.83786271113036e-05, "loss": 0.3673, "step": 2396000 }, { "epoch": 16.217112386314422, "grad_norm": 0.38658061623573303, "learning_rate": 4.837828876136856e-05, "loss": 0.3693, "step": 2396500 }, { "epoch": 16.22049588566479, "grad_norm": 0.34300288558006287, "learning_rate": 4.837795041143353e-05, "loss": 0.3661, "step": 2397000 }, { "epoch": 16.22387938501516, "grad_norm": 0.35389596223831177, "learning_rate": 4.8377612061498484e-05, "loss": 0.3669, "step": 2397500 }, { "epoch": 16.227262884365526, "grad_norm": 0.34756699204444885, "learning_rate": 4.8377273711563446e-05, "loss": 0.3671, "step": 2398000 }, { "epoch": 16.230646383715893, "grad_norm": 0.36334916949272156, "learning_rate": 4.8376935361628415e-05, "loss": 0.3666, "step": 2398500 }, { "epoch": 16.234029883066263, "grad_norm": 0.36038801074028015, "learning_rate": 4.837659701169338e-05, "loss": 0.3669, "step": 2399000 }, { "epoch": 16.23741338241663, "grad_norm": 0.38227176666259766, "learning_rate": 4.837625866175834e-05, "loss": 0.3676, "step": 2399500 }, { "epoch": 16.240796881767, "grad_norm": 0.32234764099121094, "learning_rate": 4.83759203118233e-05, "loss": 0.3668, "step": 2400000 }, { "epoch": 16.244180381117367, "grad_norm": 0.37640976905822754, "learning_rate": 4.837558196188827e-05, "loss": 0.3676, "step": 2400500 }, { "epoch": 16.247563880467734, "grad_norm": 0.34978267550468445, "learning_rate": 4.837524361195323e-05, "loss": 0.3668, "step": 2401000 }, { "epoch": 16.250947379818104, "grad_norm": 0.39871707558631897, "learning_rate": 4.8374905262018194e-05, "loss": 0.3687, "step": 2401500 }, { "epoch": 16.25433087916847, "grad_norm": 0.37368524074554443, "learning_rate": 4.8374566912083156e-05, "loss": 0.3681, "step": 2402000 }, { "epoch": 16.257714378518838, "grad_norm": 0.36890408396720886, "learning_rate": 4.837422856214812e-05, "loss": 0.3667, "step": 2402500 }, { "epoch": 16.261097877869208, "grad_norm": 0.3481537103652954, "learning_rate": 4.837389021221308e-05, "loss": 0.3678, "step": 2403000 }, { "epoch": 16.264481377219575, "grad_norm": 0.37822774052619934, "learning_rate": 4.837355186227804e-05, "loss": 0.3681, "step": 2403500 }, { "epoch": 16.267864876569945, "grad_norm": 0.34673652052879333, "learning_rate": 4.8373213512343005e-05, "loss": 0.3662, "step": 2404000 }, { "epoch": 16.27124837592031, "grad_norm": 0.342875212430954, "learning_rate": 4.8372875162407974e-05, "loss": 0.3661, "step": 2404500 }, { "epoch": 16.27463187527068, "grad_norm": 0.3608645498752594, "learning_rate": 4.8372536812472936e-05, "loss": 0.366, "step": 2405000 }, { "epoch": 16.27801537462105, "grad_norm": 0.39905846118927, "learning_rate": 4.83721984625379e-05, "loss": 0.3679, "step": 2405500 }, { "epoch": 16.281398873971415, "grad_norm": 0.37270793318748474, "learning_rate": 4.837186011260286e-05, "loss": 0.368, "step": 2406000 }, { "epoch": 16.284782373321786, "grad_norm": 0.33094844222068787, "learning_rate": 4.837152176266783e-05, "loss": 0.3661, "step": 2406500 }, { "epoch": 16.288165872672153, "grad_norm": 0.3748573958873749, "learning_rate": 4.8371183412732784e-05, "loss": 0.3673, "step": 2407000 }, { "epoch": 16.29154937202252, "grad_norm": 0.3888343870639801, "learning_rate": 4.8370845062797746e-05, "loss": 0.3674, "step": 2407500 }, { "epoch": 16.29493287137289, "grad_norm": 0.3585543632507324, "learning_rate": 4.8370506712862715e-05, "loss": 0.3686, "step": 2408000 }, { "epoch": 16.298316370723256, "grad_norm": 0.3523589074611664, "learning_rate": 4.837016836292768e-05, "loss": 0.3676, "step": 2408500 }, { "epoch": 16.301699870073627, "grad_norm": 0.31586897373199463, "learning_rate": 4.836983001299264e-05, "loss": 0.3689, "step": 2409000 }, { "epoch": 16.305083369423993, "grad_norm": 0.3635201156139374, "learning_rate": 4.83694916630576e-05, "loss": 0.3672, "step": 2409500 }, { "epoch": 16.30846686877436, "grad_norm": 0.36935552954673767, "learning_rate": 4.836915331312257e-05, "loss": 0.367, "step": 2410000 }, { "epoch": 16.31185036812473, "grad_norm": 0.3307409882545471, "learning_rate": 4.836881496318753e-05, "loss": 0.3668, "step": 2410500 }, { "epoch": 16.315233867475097, "grad_norm": 0.36583974957466125, "learning_rate": 4.8368476613252495e-05, "loss": 0.3669, "step": 2411000 }, { "epoch": 16.318617366825464, "grad_norm": 0.4040462076663971, "learning_rate": 4.836813826331746e-05, "loss": 0.3675, "step": 2411500 }, { "epoch": 16.322000866175834, "grad_norm": 0.35600608587265015, "learning_rate": 4.836779991338242e-05, "loss": 0.3666, "step": 2412000 }, { "epoch": 16.3253843655262, "grad_norm": 0.3632058799266815, "learning_rate": 4.836746156344738e-05, "loss": 0.3674, "step": 2412500 }, { "epoch": 16.32876786487657, "grad_norm": 0.376446396112442, "learning_rate": 4.836712321351234e-05, "loss": 0.3676, "step": 2413000 }, { "epoch": 16.332151364226938, "grad_norm": 0.39327868819236755, "learning_rate": 4.8366784863577305e-05, "loss": 0.3667, "step": 2413500 }, { "epoch": 16.335534863577305, "grad_norm": 0.37273505330085754, "learning_rate": 4.8366446513642274e-05, "loss": 0.3685, "step": 2414000 }, { "epoch": 16.338918362927675, "grad_norm": 0.3481425940990448, "learning_rate": 4.8366108163707236e-05, "loss": 0.3668, "step": 2414500 }, { "epoch": 16.342301862278042, "grad_norm": 0.3970904052257538, "learning_rate": 4.83657698137722e-05, "loss": 0.3669, "step": 2415000 }, { "epoch": 16.345685361628412, "grad_norm": 0.38457170128822327, "learning_rate": 4.836543146383716e-05, "loss": 0.3669, "step": 2415500 }, { "epoch": 16.34906886097878, "grad_norm": 0.36586880683898926, "learning_rate": 4.836509311390213e-05, "loss": 0.3686, "step": 2416000 }, { "epoch": 16.352452360329146, "grad_norm": 0.3561878800392151, "learning_rate": 4.8364754763967085e-05, "loss": 0.3678, "step": 2416500 }, { "epoch": 16.355835859679516, "grad_norm": 0.3339352309703827, "learning_rate": 4.836441641403205e-05, "loss": 0.3691, "step": 2417000 }, { "epoch": 16.359219359029883, "grad_norm": 0.33959805965423584, "learning_rate": 4.8364078064097016e-05, "loss": 0.3679, "step": 2417500 }, { "epoch": 16.36260285838025, "grad_norm": 0.38634565472602844, "learning_rate": 4.836373971416198e-05, "loss": 0.37, "step": 2418000 }, { "epoch": 16.36598635773062, "grad_norm": 0.3656438887119293, "learning_rate": 4.836340136422694e-05, "loss": 0.3675, "step": 2418500 }, { "epoch": 16.369369857080986, "grad_norm": 0.393478661775589, "learning_rate": 4.83630630142919e-05, "loss": 0.3671, "step": 2419000 }, { "epoch": 16.372753356431357, "grad_norm": 0.3794505000114441, "learning_rate": 4.8362724664356864e-05, "loss": 0.3674, "step": 2419500 }, { "epoch": 16.376136855781724, "grad_norm": 0.34691929817199707, "learning_rate": 4.836238631442183e-05, "loss": 0.3674, "step": 2420000 }, { "epoch": 16.37952035513209, "grad_norm": 0.3692687749862671, "learning_rate": 4.8362047964486795e-05, "loss": 0.3692, "step": 2420500 }, { "epoch": 16.38290385448246, "grad_norm": 0.3891196846961975, "learning_rate": 4.836170961455176e-05, "loss": 0.3673, "step": 2421000 }, { "epoch": 16.386287353832827, "grad_norm": 0.3694235682487488, "learning_rate": 4.836137126461672e-05, "loss": 0.3674, "step": 2421500 }, { "epoch": 16.389670853183198, "grad_norm": 0.3857375383377075, "learning_rate": 4.836103291468168e-05, "loss": 0.3679, "step": 2422000 }, { "epoch": 16.393054352533564, "grad_norm": 0.3707408905029297, "learning_rate": 4.8360694564746644e-05, "loss": 0.3666, "step": 2422500 }, { "epoch": 16.39643785188393, "grad_norm": 0.30295616388320923, "learning_rate": 4.8360356214811606e-05, "loss": 0.3677, "step": 2423000 }, { "epoch": 16.3998213512343, "grad_norm": 0.3722294270992279, "learning_rate": 4.8360017864876575e-05, "loss": 0.3694, "step": 2423500 }, { "epoch": 16.403204850584668, "grad_norm": 0.37082570791244507, "learning_rate": 4.835967951494154e-05, "loss": 0.3673, "step": 2424000 }, { "epoch": 16.40658834993504, "grad_norm": 0.378887802362442, "learning_rate": 4.83593411650065e-05, "loss": 0.3675, "step": 2424500 }, { "epoch": 16.409971849285405, "grad_norm": 0.3331888020038605, "learning_rate": 4.835900281507146e-05, "loss": 0.3684, "step": 2425000 }, { "epoch": 16.413355348635772, "grad_norm": 0.3269572854042053, "learning_rate": 4.835866446513643e-05, "loss": 0.3676, "step": 2425500 }, { "epoch": 16.416738847986142, "grad_norm": 0.36776217818260193, "learning_rate": 4.8358326115201385e-05, "loss": 0.3668, "step": 2426000 }, { "epoch": 16.42012234733651, "grad_norm": 0.40682822465896606, "learning_rate": 4.835798776526635e-05, "loss": 0.3665, "step": 2426500 }, { "epoch": 16.423505846686876, "grad_norm": 0.37275636196136475, "learning_rate": 4.8357649415331316e-05, "loss": 0.3677, "step": 2427000 }, { "epoch": 16.426889346037246, "grad_norm": 0.35203757882118225, "learning_rate": 4.835731106539628e-05, "loss": 0.3682, "step": 2427500 }, { "epoch": 16.430272845387613, "grad_norm": 0.363955020904541, "learning_rate": 4.835697271546124e-05, "loss": 0.3677, "step": 2428000 }, { "epoch": 16.433656344737983, "grad_norm": 0.3390071988105774, "learning_rate": 4.83566343655262e-05, "loss": 0.37, "step": 2428500 }, { "epoch": 16.43703984408835, "grad_norm": 0.36319419741630554, "learning_rate": 4.8356296015591165e-05, "loss": 0.3693, "step": 2429000 }, { "epoch": 16.440423343438717, "grad_norm": 0.35319650173187256, "learning_rate": 4.8355957665656134e-05, "loss": 0.367, "step": 2429500 }, { "epoch": 16.443806842789087, "grad_norm": 0.37532761693000793, "learning_rate": 4.8355619315721096e-05, "loss": 0.3679, "step": 2430000 }, { "epoch": 16.447190342139454, "grad_norm": 0.3767538070678711, "learning_rate": 4.835528096578606e-05, "loss": 0.3682, "step": 2430500 }, { "epoch": 16.450573841489824, "grad_norm": 0.3882306218147278, "learning_rate": 4.835494261585102e-05, "loss": 0.3688, "step": 2431000 }, { "epoch": 16.45395734084019, "grad_norm": 0.3863130509853363, "learning_rate": 4.835460426591598e-05, "loss": 0.3693, "step": 2431500 }, { "epoch": 16.457340840190557, "grad_norm": 0.3632519543170929, "learning_rate": 4.8354265915980944e-05, "loss": 0.3676, "step": 2432000 }, { "epoch": 16.460724339540928, "grad_norm": 0.34005188941955566, "learning_rate": 4.8353927566045907e-05, "loss": 0.3674, "step": 2432500 }, { "epoch": 16.464107838891294, "grad_norm": 0.33168646693229675, "learning_rate": 4.8353589216110875e-05, "loss": 0.3675, "step": 2433000 }, { "epoch": 16.467491338241665, "grad_norm": 0.35770660638809204, "learning_rate": 4.835325086617584e-05, "loss": 0.3679, "step": 2433500 }, { "epoch": 16.47087483759203, "grad_norm": 0.37699154019355774, "learning_rate": 4.83529125162408e-05, "loss": 0.368, "step": 2434000 }, { "epoch": 16.4742583369424, "grad_norm": 0.3676162362098694, "learning_rate": 4.835257416630576e-05, "loss": 0.3683, "step": 2434500 }, { "epoch": 16.47764183629277, "grad_norm": 0.30709290504455566, "learning_rate": 4.835223581637073e-05, "loss": 0.3688, "step": 2435000 }, { "epoch": 16.481025335643135, "grad_norm": 0.36708635091781616, "learning_rate": 4.8351897466435686e-05, "loss": 0.3679, "step": 2435500 }, { "epoch": 16.484408834993502, "grad_norm": 0.31738120317459106, "learning_rate": 4.835155911650065e-05, "loss": 0.3666, "step": 2436000 }, { "epoch": 16.487792334343872, "grad_norm": 0.3448878228664398, "learning_rate": 4.835122076656561e-05, "loss": 0.368, "step": 2436500 }, { "epoch": 16.49117583369424, "grad_norm": 0.33406171202659607, "learning_rate": 4.835088241663058e-05, "loss": 0.3687, "step": 2437000 }, { "epoch": 16.49455933304461, "grad_norm": 0.3607676923274994, "learning_rate": 4.835054406669554e-05, "loss": 0.3683, "step": 2437500 }, { "epoch": 16.497942832394976, "grad_norm": 0.38623955845832825, "learning_rate": 4.8350205716760503e-05, "loss": 0.3677, "step": 2438000 }, { "epoch": 16.501326331745343, "grad_norm": 0.3449345827102661, "learning_rate": 4.8349867366825466e-05, "loss": 0.368, "step": 2438500 }, { "epoch": 16.504709831095713, "grad_norm": 0.36246615648269653, "learning_rate": 4.8349529016890434e-05, "loss": 0.368, "step": 2439000 }, { "epoch": 16.50809333044608, "grad_norm": 0.343545526266098, "learning_rate": 4.8349190666955397e-05, "loss": 0.3685, "step": 2439500 }, { "epoch": 16.51147682979645, "grad_norm": 0.3407546281814575, "learning_rate": 4.834885231702036e-05, "loss": 0.367, "step": 2440000 }, { "epoch": 16.514860329146817, "grad_norm": 0.417532354593277, "learning_rate": 4.834851396708532e-05, "loss": 0.3684, "step": 2440500 }, { "epoch": 16.518243828497184, "grad_norm": 0.3835320472717285, "learning_rate": 4.834817561715028e-05, "loss": 0.3657, "step": 2441000 }, { "epoch": 16.521627327847554, "grad_norm": 0.37053796648979187, "learning_rate": 4.8347837267215245e-05, "loss": 0.3679, "step": 2441500 }, { "epoch": 16.52501082719792, "grad_norm": 0.34645986557006836, "learning_rate": 4.834749891728021e-05, "loss": 0.3664, "step": 2442000 }, { "epoch": 16.528394326548288, "grad_norm": 0.3670058846473694, "learning_rate": 4.8347160567345176e-05, "loss": 0.3688, "step": 2442500 }, { "epoch": 16.531777825898658, "grad_norm": 0.33533594012260437, "learning_rate": 4.834682221741014e-05, "loss": 0.3687, "step": 2443000 }, { "epoch": 16.535161325249025, "grad_norm": 0.3722659647464752, "learning_rate": 4.83464838674751e-05, "loss": 0.3668, "step": 2443500 }, { "epoch": 16.538544824599395, "grad_norm": 0.3470999300479889, "learning_rate": 4.834614551754006e-05, "loss": 0.3669, "step": 2444000 }, { "epoch": 16.54192832394976, "grad_norm": 0.3363528549671173, "learning_rate": 4.834580716760503e-05, "loss": 0.3668, "step": 2444500 }, { "epoch": 16.54531182330013, "grad_norm": 0.35689017176628113, "learning_rate": 4.834546881766999e-05, "loss": 0.3677, "step": 2445000 }, { "epoch": 16.5486953226505, "grad_norm": 0.3551176190376282, "learning_rate": 4.834513046773495e-05, "loss": 0.3685, "step": 2445500 }, { "epoch": 16.552078822000865, "grad_norm": 0.39262300729751587, "learning_rate": 4.834479211779991e-05, "loss": 0.3675, "step": 2446000 }, { "epoch": 16.555462321351236, "grad_norm": 0.3882499933242798, "learning_rate": 4.834445376786488e-05, "loss": 0.3675, "step": 2446500 }, { "epoch": 16.558845820701602, "grad_norm": 0.3415895700454712, "learning_rate": 4.834411541792984e-05, "loss": 0.3682, "step": 2447000 }, { "epoch": 16.56222932005197, "grad_norm": 0.35380828380584717, "learning_rate": 4.8343777067994804e-05, "loss": 0.3674, "step": 2447500 }, { "epoch": 16.56561281940234, "grad_norm": 0.35650962591171265, "learning_rate": 4.8343438718059766e-05, "loss": 0.3676, "step": 2448000 }, { "epoch": 16.568996318752706, "grad_norm": 0.34146806597709656, "learning_rate": 4.8343100368124735e-05, "loss": 0.3697, "step": 2448500 }, { "epoch": 16.572379818103077, "grad_norm": 0.3345761299133301, "learning_rate": 4.83427620181897e-05, "loss": 0.3672, "step": 2449000 }, { "epoch": 16.575763317453443, "grad_norm": 0.34446024894714355, "learning_rate": 4.834242366825466e-05, "loss": 0.3684, "step": 2449500 }, { "epoch": 16.57914681680381, "grad_norm": 0.47220537066459656, "learning_rate": 4.834208531831962e-05, "loss": 0.3672, "step": 2450000 }, { "epoch": 16.58253031615418, "grad_norm": 0.35243332386016846, "learning_rate": 4.8341746968384584e-05, "loss": 0.3676, "step": 2450500 }, { "epoch": 16.585913815504547, "grad_norm": 0.3121185898780823, "learning_rate": 4.8341408618449546e-05, "loss": 0.3677, "step": 2451000 }, { "epoch": 16.589297314854914, "grad_norm": 0.3816813826560974, "learning_rate": 4.834107026851451e-05, "loss": 0.3666, "step": 2451500 }, { "epoch": 16.592680814205284, "grad_norm": 0.3435656726360321, "learning_rate": 4.834073191857948e-05, "loss": 0.368, "step": 2452000 }, { "epoch": 16.59606431355565, "grad_norm": 0.387119859457016, "learning_rate": 4.834039356864444e-05, "loss": 0.3682, "step": 2452500 }, { "epoch": 16.59944781290602, "grad_norm": 0.39400357007980347, "learning_rate": 4.83400552187094e-05, "loss": 0.3669, "step": 2453000 }, { "epoch": 16.602831312256388, "grad_norm": 0.3987778425216675, "learning_rate": 4.833971686877436e-05, "loss": 0.3669, "step": 2453500 }, { "epoch": 16.606214811606755, "grad_norm": 0.3489267826080322, "learning_rate": 4.833937851883933e-05, "loss": 0.3695, "step": 2454000 }, { "epoch": 16.609598310957125, "grad_norm": 0.3362281918525696, "learning_rate": 4.8339040168904294e-05, "loss": 0.3668, "step": 2454500 }, { "epoch": 16.61298181030749, "grad_norm": 0.3904046416282654, "learning_rate": 4.833870181896925e-05, "loss": 0.3685, "step": 2455000 }, { "epoch": 16.616365309657862, "grad_norm": 0.3561997711658478, "learning_rate": 4.833836346903421e-05, "loss": 0.3683, "step": 2455500 }, { "epoch": 16.61974880900823, "grad_norm": 0.36367231607437134, "learning_rate": 4.833802511909918e-05, "loss": 0.368, "step": 2456000 }, { "epoch": 16.623132308358596, "grad_norm": 0.3647955358028412, "learning_rate": 4.833768676916414e-05, "loss": 0.3669, "step": 2456500 }, { "epoch": 16.626515807708966, "grad_norm": 0.3903414309024811, "learning_rate": 4.8337348419229105e-05, "loss": 0.3678, "step": 2457000 }, { "epoch": 16.629899307059333, "grad_norm": 0.36575034260749817, "learning_rate": 4.833701006929407e-05, "loss": 0.3687, "step": 2457500 }, { "epoch": 16.633282806409703, "grad_norm": 0.3615660071372986, "learning_rate": 4.8336671719359036e-05, "loss": 0.3662, "step": 2458000 }, { "epoch": 16.63666630576007, "grad_norm": 0.38607707619667053, "learning_rate": 4.8336333369424e-05, "loss": 0.3696, "step": 2458500 }, { "epoch": 16.640049805110436, "grad_norm": 0.3894058167934418, "learning_rate": 4.833599501948896e-05, "loss": 0.3656, "step": 2459000 }, { "epoch": 16.643433304460807, "grad_norm": 0.396003395318985, "learning_rate": 4.833565666955392e-05, "loss": 0.369, "step": 2459500 }, { "epoch": 16.646816803811173, "grad_norm": 0.3519524335861206, "learning_rate": 4.8335318319618884e-05, "loss": 0.3685, "step": 2460000 }, { "epoch": 16.65020030316154, "grad_norm": 0.3568274676799774, "learning_rate": 4.8334979969683846e-05, "loss": 0.3694, "step": 2460500 }, { "epoch": 16.65358380251191, "grad_norm": 0.3595027029514313, "learning_rate": 4.833464161974881e-05, "loss": 0.3678, "step": 2461000 }, { "epoch": 16.656967301862277, "grad_norm": 0.40519922971725464, "learning_rate": 4.833430326981378e-05, "loss": 0.3663, "step": 2461500 }, { "epoch": 16.660350801212648, "grad_norm": 0.35902488231658936, "learning_rate": 4.833396491987874e-05, "loss": 0.3669, "step": 2462000 }, { "epoch": 16.663734300563014, "grad_norm": 0.3957083225250244, "learning_rate": 4.83336265699437e-05, "loss": 0.3676, "step": 2462500 }, { "epoch": 16.66711779991338, "grad_norm": 0.4037426710128784, "learning_rate": 4.8333288220008664e-05, "loss": 0.3681, "step": 2463000 }, { "epoch": 16.67050129926375, "grad_norm": 0.3176875412464142, "learning_rate": 4.833294987007363e-05, "loss": 0.3684, "step": 2463500 }, { "epoch": 16.673884798614118, "grad_norm": 0.40276309847831726, "learning_rate": 4.8332611520138595e-05, "loss": 0.3683, "step": 2464000 }, { "epoch": 16.67726829796449, "grad_norm": 0.3713991940021515, "learning_rate": 4.833227317020355e-05, "loss": 0.367, "step": 2464500 }, { "epoch": 16.680651797314855, "grad_norm": 0.3815590441226959, "learning_rate": 4.833193482026851e-05, "loss": 0.369, "step": 2465000 }, { "epoch": 16.684035296665222, "grad_norm": 0.3649478554725647, "learning_rate": 4.833159647033348e-05, "loss": 0.3668, "step": 2465500 }, { "epoch": 16.687418796015592, "grad_norm": 0.3144074082374573, "learning_rate": 4.833125812039844e-05, "loss": 0.3672, "step": 2466000 }, { "epoch": 16.69080229536596, "grad_norm": 0.3706476092338562, "learning_rate": 4.8330919770463405e-05, "loss": 0.3676, "step": 2466500 }, { "epoch": 16.694185794716326, "grad_norm": 0.4185875356197357, "learning_rate": 4.833058142052837e-05, "loss": 0.3683, "step": 2467000 }, { "epoch": 16.697569294066696, "grad_norm": 0.38653209805488586, "learning_rate": 4.8330243070593336e-05, "loss": 0.3682, "step": 2467500 }, { "epoch": 16.700952793417063, "grad_norm": 0.37850135564804077, "learning_rate": 4.83299047206583e-05, "loss": 0.3678, "step": 2468000 }, { "epoch": 16.704336292767433, "grad_norm": 0.3965655267238617, "learning_rate": 4.832956637072326e-05, "loss": 0.3673, "step": 2468500 }, { "epoch": 16.7077197921178, "grad_norm": 0.3785822093486786, "learning_rate": 4.832922802078822e-05, "loss": 0.368, "step": 2469000 }, { "epoch": 16.711103291468167, "grad_norm": 0.36220183968544006, "learning_rate": 4.8328889670853185e-05, "loss": 0.3664, "step": 2469500 }, { "epoch": 16.714486790818537, "grad_norm": 0.34872257709503174, "learning_rate": 4.832855132091815e-05, "loss": 0.3685, "step": 2470000 }, { "epoch": 16.717870290168904, "grad_norm": 0.4009423553943634, "learning_rate": 4.832821297098311e-05, "loss": 0.3687, "step": 2470500 }, { "epoch": 16.721253789519274, "grad_norm": 0.3455296754837036, "learning_rate": 4.832787462104808e-05, "loss": 0.3667, "step": 2471000 }, { "epoch": 16.72463728886964, "grad_norm": 0.3740684986114502, "learning_rate": 4.832753627111304e-05, "loss": 0.3686, "step": 2471500 }, { "epoch": 16.728020788220007, "grad_norm": 0.35268333554267883, "learning_rate": 4.8327197921178e-05, "loss": 0.3693, "step": 2472000 }, { "epoch": 16.731404287570378, "grad_norm": 0.3672443628311157, "learning_rate": 4.8326859571242964e-05, "loss": 0.3679, "step": 2472500 }, { "epoch": 16.734787786920744, "grad_norm": 0.34308382868766785, "learning_rate": 4.832652122130793e-05, "loss": 0.3671, "step": 2473000 }, { "epoch": 16.738171286271115, "grad_norm": 0.3512164056301117, "learning_rate": 4.8326182871372895e-05, "loss": 0.3678, "step": 2473500 }, { "epoch": 16.74155478562148, "grad_norm": 0.3707292377948761, "learning_rate": 4.832584452143785e-05, "loss": 0.3675, "step": 2474000 }, { "epoch": 16.744938284971848, "grad_norm": 0.3685651421546936, "learning_rate": 4.832550617150281e-05, "loss": 0.367, "step": 2474500 }, { "epoch": 16.74832178432222, "grad_norm": 0.3733166754245758, "learning_rate": 4.832516782156778e-05, "loss": 0.3669, "step": 2475000 }, { "epoch": 16.751705283672585, "grad_norm": 0.3869015872478485, "learning_rate": 4.8324829471632744e-05, "loss": 0.3678, "step": 2475500 }, { "epoch": 16.755088783022952, "grad_norm": 0.35414817929267883, "learning_rate": 4.8324491121697706e-05, "loss": 0.3685, "step": 2476000 }, { "epoch": 16.758472282373322, "grad_norm": 0.4135025143623352, "learning_rate": 4.832415277176267e-05, "loss": 0.3693, "step": 2476500 }, { "epoch": 16.76185578172369, "grad_norm": 0.3596659302711487, "learning_rate": 4.832381442182764e-05, "loss": 0.3672, "step": 2477000 }, { "epoch": 16.76523928107406, "grad_norm": 0.35768067836761475, "learning_rate": 4.83234760718926e-05, "loss": 0.3668, "step": 2477500 }, { "epoch": 16.768622780424426, "grad_norm": 0.4043077826499939, "learning_rate": 4.832313772195756e-05, "loss": 0.3664, "step": 2478000 }, { "epoch": 16.772006279774793, "grad_norm": 0.3664889931678772, "learning_rate": 4.832279937202252e-05, "loss": 0.368, "step": 2478500 }, { "epoch": 16.775389779125163, "grad_norm": 0.3519807457923889, "learning_rate": 4.8322461022087485e-05, "loss": 0.3679, "step": 2479000 }, { "epoch": 16.77877327847553, "grad_norm": 0.3843582570552826, "learning_rate": 4.832212267215245e-05, "loss": 0.3675, "step": 2479500 }, { "epoch": 16.7821567778259, "grad_norm": 0.3271946907043457, "learning_rate": 4.832178432221741e-05, "loss": 0.3677, "step": 2480000 }, { "epoch": 16.785540277176267, "grad_norm": 0.389017790555954, "learning_rate": 4.832144597228238e-05, "loss": 0.367, "step": 2480500 }, { "epoch": 16.788923776526634, "grad_norm": 0.34948909282684326, "learning_rate": 4.832110762234734e-05, "loss": 0.367, "step": 2481000 }, { "epoch": 16.792307275877004, "grad_norm": 0.3824010193347931, "learning_rate": 4.83207692724123e-05, "loss": 0.3672, "step": 2481500 }, { "epoch": 16.79569077522737, "grad_norm": 0.3278599679470062, "learning_rate": 4.8320430922477265e-05, "loss": 0.3669, "step": 2482000 }, { "epoch": 16.79907427457774, "grad_norm": 0.35566702485084534, "learning_rate": 4.832009257254223e-05, "loss": 0.3665, "step": 2482500 }, { "epoch": 16.802457773928108, "grad_norm": 0.36212462186813354, "learning_rate": 4.8319754222607196e-05, "loss": 0.3684, "step": 2483000 }, { "epoch": 16.805841273278475, "grad_norm": 0.3928414285182953, "learning_rate": 4.831941587267215e-05, "loss": 0.3678, "step": 2483500 }, { "epoch": 16.809224772628845, "grad_norm": 0.4236493408679962, "learning_rate": 4.8319077522737113e-05, "loss": 0.3681, "step": 2484000 }, { "epoch": 16.81260827197921, "grad_norm": 0.4181719422340393, "learning_rate": 4.831873917280208e-05, "loss": 0.3672, "step": 2484500 }, { "epoch": 16.81599177132958, "grad_norm": 0.338945597410202, "learning_rate": 4.8318400822867044e-05, "loss": 0.3671, "step": 2485000 }, { "epoch": 16.81937527067995, "grad_norm": 0.355094313621521, "learning_rate": 4.8318062472932007e-05, "loss": 0.3686, "step": 2485500 }, { "epoch": 16.822758770030315, "grad_norm": 0.37627944350242615, "learning_rate": 4.831772412299697e-05, "loss": 0.3672, "step": 2486000 }, { "epoch": 16.826142269380686, "grad_norm": 0.3468860983848572, "learning_rate": 4.831738577306194e-05, "loss": 0.3663, "step": 2486500 }, { "epoch": 16.829525768731052, "grad_norm": 0.3257055878639221, "learning_rate": 4.83170474231269e-05, "loss": 0.3667, "step": 2487000 }, { "epoch": 16.83290926808142, "grad_norm": 0.38776686787605286, "learning_rate": 4.831670907319186e-05, "loss": 0.3666, "step": 2487500 }, { "epoch": 16.83629276743179, "grad_norm": 0.35589778423309326, "learning_rate": 4.8316370723256824e-05, "loss": 0.3669, "step": 2488000 }, { "epoch": 16.839676266782156, "grad_norm": 0.3393418490886688, "learning_rate": 4.8316032373321786e-05, "loss": 0.3676, "step": 2488500 }, { "epoch": 16.843059766132527, "grad_norm": 0.38172590732574463, "learning_rate": 4.831569402338675e-05, "loss": 0.3677, "step": 2489000 }, { "epoch": 16.846443265482893, "grad_norm": 0.358637273311615, "learning_rate": 4.831535567345171e-05, "loss": 0.3677, "step": 2489500 }, { "epoch": 16.84982676483326, "grad_norm": 0.34321945905685425, "learning_rate": 4.831501732351668e-05, "loss": 0.3664, "step": 2490000 }, { "epoch": 16.85321026418363, "grad_norm": 0.36980679631233215, "learning_rate": 4.831467897358164e-05, "loss": 0.3673, "step": 2490500 }, { "epoch": 16.856593763533997, "grad_norm": 0.3730469048023224, "learning_rate": 4.8314340623646603e-05, "loss": 0.3684, "step": 2491000 }, { "epoch": 16.859977262884364, "grad_norm": 0.3678283095359802, "learning_rate": 4.8314002273711566e-05, "loss": 0.3677, "step": 2491500 }, { "epoch": 16.863360762234734, "grad_norm": 0.3648795783519745, "learning_rate": 4.831366392377653e-05, "loss": 0.3661, "step": 2492000 }, { "epoch": 16.8667442615851, "grad_norm": 0.3346042335033417, "learning_rate": 4.83133255738415e-05, "loss": 0.3682, "step": 2492500 }, { "epoch": 16.87012776093547, "grad_norm": 0.3473329246044159, "learning_rate": 4.831298722390645e-05, "loss": 0.3669, "step": 2493000 }, { "epoch": 16.873511260285838, "grad_norm": 0.3240760266780853, "learning_rate": 4.8312648873971414e-05, "loss": 0.368, "step": 2493500 }, { "epoch": 16.876894759636205, "grad_norm": 0.36841729283332825, "learning_rate": 4.831231052403638e-05, "loss": 0.3681, "step": 2494000 }, { "epoch": 16.880278258986575, "grad_norm": 0.37722286581993103, "learning_rate": 4.8311972174101345e-05, "loss": 0.3679, "step": 2494500 }, { "epoch": 16.88366175833694, "grad_norm": 0.3509593904018402, "learning_rate": 4.831163382416631e-05, "loss": 0.3664, "step": 2495000 }, { "epoch": 16.887045257687312, "grad_norm": 0.347510427236557, "learning_rate": 4.831129547423127e-05, "loss": 0.3677, "step": 2495500 }, { "epoch": 16.89042875703768, "grad_norm": 0.34216421842575073, "learning_rate": 4.831095712429624e-05, "loss": 0.3684, "step": 2496000 }, { "epoch": 16.893812256388046, "grad_norm": 0.3292529881000519, "learning_rate": 4.83106187743612e-05, "loss": 0.3684, "step": 2496500 }, { "epoch": 16.897195755738416, "grad_norm": 0.3332473337650299, "learning_rate": 4.831028042442616e-05, "loss": 0.3673, "step": 2497000 }, { "epoch": 16.900579255088783, "grad_norm": 0.3342098593711853, "learning_rate": 4.8309942074491125e-05, "loss": 0.3659, "step": 2497500 }, { "epoch": 16.903962754439153, "grad_norm": 0.36247438192367554, "learning_rate": 4.830960372455609e-05, "loss": 0.367, "step": 2498000 }, { "epoch": 16.90734625378952, "grad_norm": 0.3494985103607178, "learning_rate": 4.830926537462105e-05, "loss": 0.3672, "step": 2498500 }, { "epoch": 16.910729753139886, "grad_norm": 0.41263943910598755, "learning_rate": 4.830892702468601e-05, "loss": 0.3685, "step": 2499000 }, { "epoch": 16.914113252490257, "grad_norm": 0.33395126461982727, "learning_rate": 4.830858867475097e-05, "loss": 0.3686, "step": 2499500 }, { "epoch": 16.917496751840623, "grad_norm": 0.3553406298160553, "learning_rate": 4.830825032481594e-05, "loss": 0.3677, "step": 2500000 }, { "epoch": 16.92088025119099, "grad_norm": 0.33235299587249756, "learning_rate": 4.8307911974880904e-05, "loss": 0.3674, "step": 2500500 }, { "epoch": 16.92426375054136, "grad_norm": 0.35224485397338867, "learning_rate": 4.8307573624945866e-05, "loss": 0.3702, "step": 2501000 }, { "epoch": 16.927647249891727, "grad_norm": 0.39313584566116333, "learning_rate": 4.830723527501083e-05, "loss": 0.3684, "step": 2501500 }, { "epoch": 16.931030749242097, "grad_norm": 0.3749732971191406, "learning_rate": 4.83068969250758e-05, "loss": 0.3676, "step": 2502000 }, { "epoch": 16.934414248592464, "grad_norm": 0.34019801020622253, "learning_rate": 4.830655857514075e-05, "loss": 0.3677, "step": 2502500 }, { "epoch": 16.93779774794283, "grad_norm": 0.32792386412620544, "learning_rate": 4.8306220225205715e-05, "loss": 0.3668, "step": 2503000 }, { "epoch": 16.9411812472932, "grad_norm": 0.37263306975364685, "learning_rate": 4.8305881875270684e-05, "loss": 0.3673, "step": 2503500 }, { "epoch": 16.944564746643568, "grad_norm": 0.3549545705318451, "learning_rate": 4.8305543525335646e-05, "loss": 0.3674, "step": 2504000 }, { "epoch": 16.94794824599394, "grad_norm": 0.3565514385700226, "learning_rate": 4.830520517540061e-05, "loss": 0.3675, "step": 2504500 }, { "epoch": 16.951331745344305, "grad_norm": 0.3621446490287781, "learning_rate": 4.830486682546557e-05, "loss": 0.3667, "step": 2505000 }, { "epoch": 16.954715244694672, "grad_norm": 0.3795997202396393, "learning_rate": 4.830452847553054e-05, "loss": 0.3671, "step": 2505500 }, { "epoch": 16.958098744045042, "grad_norm": 0.3907493054866791, "learning_rate": 4.83041901255955e-05, "loss": 0.3692, "step": 2506000 }, { "epoch": 16.96148224339541, "grad_norm": 0.3782593607902527, "learning_rate": 4.830385177566046e-05, "loss": 0.3667, "step": 2506500 }, { "epoch": 16.96486574274578, "grad_norm": 0.31225937604904175, "learning_rate": 4.830351342572542e-05, "loss": 0.3679, "step": 2507000 }, { "epoch": 16.968249242096146, "grad_norm": 0.37021970748901367, "learning_rate": 4.830317507579039e-05, "loss": 0.3705, "step": 2507500 }, { "epoch": 16.971632741446513, "grad_norm": 0.3210141360759735, "learning_rate": 4.830283672585535e-05, "loss": 0.3672, "step": 2508000 }, { "epoch": 16.975016240796883, "grad_norm": 0.3552591800689697, "learning_rate": 4.830249837592031e-05, "loss": 0.3691, "step": 2508500 }, { "epoch": 16.97839974014725, "grad_norm": 0.3715352416038513, "learning_rate": 4.8302160025985274e-05, "loss": 0.3689, "step": 2509000 }, { "epoch": 16.981783239497616, "grad_norm": 0.3898894190788269, "learning_rate": 4.830182167605024e-05, "loss": 0.3682, "step": 2509500 }, { "epoch": 16.985166738847987, "grad_norm": 0.3399691581726074, "learning_rate": 4.8301483326115205e-05, "loss": 0.3672, "step": 2510000 }, { "epoch": 16.988550238198354, "grad_norm": 0.36466071009635925, "learning_rate": 4.830114497618017e-05, "loss": 0.3674, "step": 2510500 }, { "epoch": 16.991933737548724, "grad_norm": 0.38154199719429016, "learning_rate": 4.830080662624513e-05, "loss": 0.3693, "step": 2511000 }, { "epoch": 16.99531723689909, "grad_norm": 0.3445392847061157, "learning_rate": 4.83004682763101e-05, "loss": 0.3676, "step": 2511500 }, { "epoch": 16.998700736249457, "grad_norm": 0.33373844623565674, "learning_rate": 4.830012992637505e-05, "loss": 0.3675, "step": 2512000 }, { "epoch": 17.0, "eval_accuracy": 0.859953776456311, "eval_loss": 0.5690349340438843, "eval_runtime": 3383.2894, "eval_samples_per_second": 85.935, "eval_steps_per_second": 5.371, "step": 2512192 }, { "epoch": 17.002084235599828, "grad_norm": 0.3574970066547394, "learning_rate": 4.8299791576440015e-05, "loss": 0.3654, "step": 2512500 }, { "epoch": 17.005467734950194, "grad_norm": 0.3643225133419037, "learning_rate": 4.8299453226504984e-05, "loss": 0.3667, "step": 2513000 }, { "epoch": 17.008851234300565, "grad_norm": 0.36455971002578735, "learning_rate": 4.8299114876569946e-05, "loss": 0.3638, "step": 2513500 }, { "epoch": 17.01223473365093, "grad_norm": 0.3444270193576813, "learning_rate": 4.829877652663491e-05, "loss": 0.366, "step": 2514000 }, { "epoch": 17.015618233001298, "grad_norm": 0.3640088438987732, "learning_rate": 4.829843817669987e-05, "loss": 0.3669, "step": 2514500 }, { "epoch": 17.01900173235167, "grad_norm": 0.35751673579216003, "learning_rate": 4.829809982676484e-05, "loss": 0.3653, "step": 2515000 }, { "epoch": 17.022385231702035, "grad_norm": 0.38286295533180237, "learning_rate": 4.82977614768298e-05, "loss": 0.3648, "step": 2515500 }, { "epoch": 17.025768731052402, "grad_norm": 0.37575405836105347, "learning_rate": 4.8297423126894764e-05, "loss": 0.3652, "step": 2516000 }, { "epoch": 17.029152230402772, "grad_norm": 0.35940518975257874, "learning_rate": 4.8297084776959726e-05, "loss": 0.3667, "step": 2516500 }, { "epoch": 17.03253572975314, "grad_norm": 0.37399694323539734, "learning_rate": 4.829674642702469e-05, "loss": 0.3644, "step": 2517000 }, { "epoch": 17.03591922910351, "grad_norm": 0.4155278205871582, "learning_rate": 4.829640807708965e-05, "loss": 0.3662, "step": 2517500 }, { "epoch": 17.039302728453876, "grad_norm": 0.3764472007751465, "learning_rate": 4.829606972715461e-05, "loss": 0.3656, "step": 2518000 }, { "epoch": 17.042686227804243, "grad_norm": 0.36024290323257446, "learning_rate": 4.8295731377219574e-05, "loss": 0.3648, "step": 2518500 }, { "epoch": 17.046069727154613, "grad_norm": 0.37095972895622253, "learning_rate": 4.829539302728454e-05, "loss": 0.3655, "step": 2519000 }, { "epoch": 17.04945322650498, "grad_norm": 0.36604204773902893, "learning_rate": 4.8295054677349505e-05, "loss": 0.3653, "step": 2519500 }, { "epoch": 17.05283672585535, "grad_norm": 0.37407541275024414, "learning_rate": 4.829471632741447e-05, "loss": 0.3658, "step": 2520000 }, { "epoch": 17.056220225205717, "grad_norm": 0.3428348898887634, "learning_rate": 4.829437797747943e-05, "loss": 0.3648, "step": 2520500 }, { "epoch": 17.059603724556084, "grad_norm": 0.3618278205394745, "learning_rate": 4.82940396275444e-05, "loss": 0.3655, "step": 2521000 }, { "epoch": 17.062987223906454, "grad_norm": 0.353392094373703, "learning_rate": 4.8293701277609354e-05, "loss": 0.3656, "step": 2521500 }, { "epoch": 17.06637072325682, "grad_norm": 0.403253972530365, "learning_rate": 4.8293362927674316e-05, "loss": 0.3664, "step": 2522000 }, { "epoch": 17.06975422260719, "grad_norm": 0.3712442219257355, "learning_rate": 4.8293024577739285e-05, "loss": 0.3659, "step": 2522500 }, { "epoch": 17.073137721957558, "grad_norm": 0.36106806993484497, "learning_rate": 4.829268622780425e-05, "loss": 0.3656, "step": 2523000 }, { "epoch": 17.076521221307924, "grad_norm": 0.3803187608718872, "learning_rate": 4.829234787786921e-05, "loss": 0.3661, "step": 2523500 }, { "epoch": 17.079904720658295, "grad_norm": 0.3683875799179077, "learning_rate": 4.829200952793417e-05, "loss": 0.3663, "step": 2524000 }, { "epoch": 17.08328822000866, "grad_norm": 0.379422128200531, "learning_rate": 4.829167117799914e-05, "loss": 0.3666, "step": 2524500 }, { "epoch": 17.08667171935903, "grad_norm": 0.3635571300983429, "learning_rate": 4.82913328280641e-05, "loss": 0.3658, "step": 2525000 }, { "epoch": 17.0900552187094, "grad_norm": 0.3490724265575409, "learning_rate": 4.8290994478129064e-05, "loss": 0.3658, "step": 2525500 }, { "epoch": 17.093438718059765, "grad_norm": 0.37665268778800964, "learning_rate": 4.8290656128194027e-05, "loss": 0.3676, "step": 2526000 }, { "epoch": 17.096822217410136, "grad_norm": 0.369052529335022, "learning_rate": 4.829031777825899e-05, "loss": 0.3675, "step": 2526500 }, { "epoch": 17.100205716760502, "grad_norm": 0.34089598059654236, "learning_rate": 4.828997942832395e-05, "loss": 0.3674, "step": 2527000 }, { "epoch": 17.10358921611087, "grad_norm": 0.3988218903541565, "learning_rate": 4.828964107838891e-05, "loss": 0.3665, "step": 2527500 }, { "epoch": 17.10697271546124, "grad_norm": 0.3494698107242584, "learning_rate": 4.8289302728453875e-05, "loss": 0.3673, "step": 2528000 }, { "epoch": 17.110356214811606, "grad_norm": 0.3407789468765259, "learning_rate": 4.8288964378518844e-05, "loss": 0.3663, "step": 2528500 }, { "epoch": 17.113739714161976, "grad_norm": 0.3363821506500244, "learning_rate": 4.8288626028583806e-05, "loss": 0.3662, "step": 2529000 }, { "epoch": 17.117123213512343, "grad_norm": 0.37021976709365845, "learning_rate": 4.828828767864877e-05, "loss": 0.3649, "step": 2529500 }, { "epoch": 17.12050671286271, "grad_norm": 0.3477225601673126, "learning_rate": 4.828794932871373e-05, "loss": 0.3663, "step": 2530000 }, { "epoch": 17.12389021221308, "grad_norm": 0.35830751061439514, "learning_rate": 4.82876109787787e-05, "loss": 0.3673, "step": 2530500 }, { "epoch": 17.127273711563447, "grad_norm": 0.3866226077079773, "learning_rate": 4.8287272628843655e-05, "loss": 0.368, "step": 2531000 }, { "epoch": 17.130657210913814, "grad_norm": 0.36625948548316956, "learning_rate": 4.828693427890862e-05, "loss": 0.3662, "step": 2531500 }, { "epoch": 17.134040710264184, "grad_norm": 0.3588632047176361, "learning_rate": 4.8286595928973586e-05, "loss": 0.3661, "step": 2532000 }, { "epoch": 17.13742420961455, "grad_norm": 0.36328765749931335, "learning_rate": 4.828625757903855e-05, "loss": 0.365, "step": 2532500 }, { "epoch": 17.14080770896492, "grad_norm": 0.37686029076576233, "learning_rate": 4.828591922910351e-05, "loss": 0.368, "step": 2533000 }, { "epoch": 17.144191208315288, "grad_norm": 0.3592396378517151, "learning_rate": 4.828558087916847e-05, "loss": 0.3661, "step": 2533500 }, { "epoch": 17.147574707665655, "grad_norm": 0.35530906915664673, "learning_rate": 4.828524252923344e-05, "loss": 0.3674, "step": 2534000 }, { "epoch": 17.150958207016025, "grad_norm": 0.39262640476226807, "learning_rate": 4.82849041792984e-05, "loss": 0.3652, "step": 2534500 }, { "epoch": 17.15434170636639, "grad_norm": 0.3824649751186371, "learning_rate": 4.8284565829363365e-05, "loss": 0.366, "step": 2535000 }, { "epoch": 17.157725205716762, "grad_norm": 0.3323943316936493, "learning_rate": 4.828422747942833e-05, "loss": 0.3656, "step": 2535500 }, { "epoch": 17.16110870506713, "grad_norm": 0.3842216432094574, "learning_rate": 4.828388912949329e-05, "loss": 0.3655, "step": 2536000 }, { "epoch": 17.164492204417495, "grad_norm": 0.4065781831741333, "learning_rate": 4.828355077955825e-05, "loss": 0.3659, "step": 2536500 }, { "epoch": 17.167875703767866, "grad_norm": 0.35870862007141113, "learning_rate": 4.8283212429623214e-05, "loss": 0.3662, "step": 2537000 }, { "epoch": 17.171259203118233, "grad_norm": 0.3968873620033264, "learning_rate": 4.8282874079688176e-05, "loss": 0.3658, "step": 2537500 }, { "epoch": 17.174642702468603, "grad_norm": 0.35877475142478943, "learning_rate": 4.8282535729753145e-05, "loss": 0.3658, "step": 2538000 }, { "epoch": 17.17802620181897, "grad_norm": 0.37040311098098755, "learning_rate": 4.828219737981811e-05, "loss": 0.3665, "step": 2538500 }, { "epoch": 17.181409701169336, "grad_norm": 0.3944379985332489, "learning_rate": 4.828185902988307e-05, "loss": 0.3668, "step": 2539000 }, { "epoch": 17.184793200519707, "grad_norm": 0.3358082175254822, "learning_rate": 4.828152067994803e-05, "loss": 0.3666, "step": 2539500 }, { "epoch": 17.188176699870073, "grad_norm": 0.35811847448349, "learning_rate": 4.8281182330013e-05, "loss": 0.3667, "step": 2540000 }, { "epoch": 17.19156019922044, "grad_norm": 0.3784216642379761, "learning_rate": 4.8280843980077955e-05, "loss": 0.3668, "step": 2540500 }, { "epoch": 17.19494369857081, "grad_norm": 0.3526269495487213, "learning_rate": 4.828050563014292e-05, "loss": 0.3677, "step": 2541000 }, { "epoch": 17.198327197921177, "grad_norm": 0.36154642701148987, "learning_rate": 4.8280167280207886e-05, "loss": 0.366, "step": 2541500 }, { "epoch": 17.201710697271547, "grad_norm": 0.3573142886161804, "learning_rate": 4.827982893027285e-05, "loss": 0.3664, "step": 2542000 }, { "epoch": 17.205094196621914, "grad_norm": 0.3573852777481079, "learning_rate": 4.827949058033781e-05, "loss": 0.3658, "step": 2542500 }, { "epoch": 17.20847769597228, "grad_norm": 0.3711315393447876, "learning_rate": 4.827915223040277e-05, "loss": 0.3673, "step": 2543000 }, { "epoch": 17.21186119532265, "grad_norm": 0.35421717166900635, "learning_rate": 4.827881388046774e-05, "loss": 0.3667, "step": 2543500 }, { "epoch": 17.215244694673018, "grad_norm": 0.3692740499973297, "learning_rate": 4.8278475530532704e-05, "loss": 0.3665, "step": 2544000 }, { "epoch": 17.21862819402339, "grad_norm": 0.33599853515625, "learning_rate": 4.8278137180597666e-05, "loss": 0.367, "step": 2544500 }, { "epoch": 17.222011693373755, "grad_norm": 0.356893390417099, "learning_rate": 4.827779883066263e-05, "loss": 0.3671, "step": 2545000 }, { "epoch": 17.225395192724122, "grad_norm": 0.33157333731651306, "learning_rate": 4.827746048072759e-05, "loss": 0.3675, "step": 2545500 }, { "epoch": 17.228778692074492, "grad_norm": 0.3692445456981659, "learning_rate": 4.827712213079255e-05, "loss": 0.366, "step": 2546000 }, { "epoch": 17.23216219142486, "grad_norm": 0.35876113176345825, "learning_rate": 4.8276783780857514e-05, "loss": 0.3665, "step": 2546500 }, { "epoch": 17.23554569077523, "grad_norm": 0.37952202558517456, "learning_rate": 4.8276445430922476e-05, "loss": 0.3661, "step": 2547000 }, { "epoch": 17.238929190125596, "grad_norm": 0.34852099418640137, "learning_rate": 4.8276107080987445e-05, "loss": 0.3665, "step": 2547500 }, { "epoch": 17.242312689475963, "grad_norm": 0.32389768958091736, "learning_rate": 4.827576873105241e-05, "loss": 0.367, "step": 2548000 }, { "epoch": 17.245696188826333, "grad_norm": 0.36108294129371643, "learning_rate": 4.827543038111737e-05, "loss": 0.3679, "step": 2548500 }, { "epoch": 17.2490796881767, "grad_norm": 0.3562123775482178, "learning_rate": 4.827509203118233e-05, "loss": 0.3675, "step": 2549000 }, { "epoch": 17.252463187527066, "grad_norm": 0.33895066380500793, "learning_rate": 4.82747536812473e-05, "loss": 0.3666, "step": 2549500 }, { "epoch": 17.255846686877437, "grad_norm": 0.39131903648376465, "learning_rate": 4.8274415331312256e-05, "loss": 0.366, "step": 2550000 }, { "epoch": 17.259230186227803, "grad_norm": 0.3654680848121643, "learning_rate": 4.827407698137722e-05, "loss": 0.3667, "step": 2550500 }, { "epoch": 17.262613685578174, "grad_norm": 0.35606712102890015, "learning_rate": 4.827373863144219e-05, "loss": 0.3661, "step": 2551000 }, { "epoch": 17.26599718492854, "grad_norm": 0.38734468817710876, "learning_rate": 4.827340028150715e-05, "loss": 0.3663, "step": 2551500 }, { "epoch": 17.269380684278907, "grad_norm": 0.35687586665153503, "learning_rate": 4.827306193157211e-05, "loss": 0.3674, "step": 2552000 }, { "epoch": 17.272764183629278, "grad_norm": 0.3550911843776703, "learning_rate": 4.827272358163707e-05, "loss": 0.368, "step": 2552500 }, { "epoch": 17.276147682979644, "grad_norm": 0.3244481384754181, "learning_rate": 4.8272385231702035e-05, "loss": 0.3672, "step": 2553000 }, { "epoch": 17.279531182330015, "grad_norm": 0.35181885957717896, "learning_rate": 4.8272046881767004e-05, "loss": 0.3679, "step": 2553500 }, { "epoch": 17.28291468168038, "grad_norm": 0.37776389718055725, "learning_rate": 4.8271708531831966e-05, "loss": 0.3679, "step": 2554000 }, { "epoch": 17.286298181030748, "grad_norm": 0.36275383830070496, "learning_rate": 4.827137018189693e-05, "loss": 0.3667, "step": 2554500 }, { "epoch": 17.28968168038112, "grad_norm": 0.3706962466239929, "learning_rate": 4.827103183196189e-05, "loss": 0.3683, "step": 2555000 }, { "epoch": 17.293065179731485, "grad_norm": 0.35722899436950684, "learning_rate": 4.827069348202685e-05, "loss": 0.367, "step": 2555500 }, { "epoch": 17.296448679081852, "grad_norm": 0.3421380817890167, "learning_rate": 4.8270355132091815e-05, "loss": 0.3664, "step": 2556000 }, { "epoch": 17.299832178432222, "grad_norm": 0.3910340666770935, "learning_rate": 4.827001678215678e-05, "loss": 0.3669, "step": 2556500 }, { "epoch": 17.30321567778259, "grad_norm": 0.34034547209739685, "learning_rate": 4.8269678432221746e-05, "loss": 0.3681, "step": 2557000 }, { "epoch": 17.30659917713296, "grad_norm": 0.36404237151145935, "learning_rate": 4.826934008228671e-05, "loss": 0.3671, "step": 2557500 }, { "epoch": 17.309982676483326, "grad_norm": 0.3977389931678772, "learning_rate": 4.826900173235167e-05, "loss": 0.3665, "step": 2558000 }, { "epoch": 17.313366175833693, "grad_norm": 0.34738966822624207, "learning_rate": 4.826866338241663e-05, "loss": 0.3667, "step": 2558500 }, { "epoch": 17.316749675184063, "grad_norm": 0.34294068813323975, "learning_rate": 4.82683250324816e-05, "loss": 0.3668, "step": 2559000 }, { "epoch": 17.32013317453443, "grad_norm": 0.38207390904426575, "learning_rate": 4.8267986682546556e-05, "loss": 0.3667, "step": 2559500 }, { "epoch": 17.3235166738848, "grad_norm": 0.36952969431877136, "learning_rate": 4.826764833261152e-05, "loss": 0.3659, "step": 2560000 }, { "epoch": 17.326900173235167, "grad_norm": 0.4127836525440216, "learning_rate": 4.826730998267649e-05, "loss": 0.3674, "step": 2560500 }, { "epoch": 17.330283672585534, "grad_norm": 0.35767117142677307, "learning_rate": 4.826697163274145e-05, "loss": 0.366, "step": 2561000 }, { "epoch": 17.333667171935904, "grad_norm": 0.3588179051876068, "learning_rate": 4.826663328280641e-05, "loss": 0.3661, "step": 2561500 }, { "epoch": 17.33705067128627, "grad_norm": 0.3567115068435669, "learning_rate": 4.8266294932871374e-05, "loss": 0.3669, "step": 2562000 }, { "epoch": 17.34043417063664, "grad_norm": 0.3763863742351532, "learning_rate": 4.8265956582936336e-05, "loss": 0.3652, "step": 2562500 }, { "epoch": 17.343817669987008, "grad_norm": 0.4155043661594391, "learning_rate": 4.8265618233001305e-05, "loss": 0.3667, "step": 2563000 }, { "epoch": 17.347201169337374, "grad_norm": 0.3805868625640869, "learning_rate": 4.826527988306627e-05, "loss": 0.3666, "step": 2563500 }, { "epoch": 17.350584668687745, "grad_norm": 0.36847659945487976, "learning_rate": 4.826494153313123e-05, "loss": 0.3671, "step": 2564000 }, { "epoch": 17.35396816803811, "grad_norm": 0.36805492639541626, "learning_rate": 4.826460318319619e-05, "loss": 0.3665, "step": 2564500 }, { "epoch": 17.35735166738848, "grad_norm": 0.3310345411300659, "learning_rate": 4.826426483326115e-05, "loss": 0.3666, "step": 2565000 }, { "epoch": 17.36073516673885, "grad_norm": 0.38131046295166016, "learning_rate": 4.8263926483326115e-05, "loss": 0.3673, "step": 2565500 }, { "epoch": 17.364118666089215, "grad_norm": 0.3580634891986847, "learning_rate": 4.826358813339108e-05, "loss": 0.3671, "step": 2566000 }, { "epoch": 17.367502165439586, "grad_norm": 0.3496025502681732, "learning_rate": 4.8263249783456046e-05, "loss": 0.3669, "step": 2566500 }, { "epoch": 17.370885664789952, "grad_norm": 0.3704014718532562, "learning_rate": 4.826291143352101e-05, "loss": 0.3651, "step": 2567000 }, { "epoch": 17.37426916414032, "grad_norm": 0.37614890933036804, "learning_rate": 4.826257308358597e-05, "loss": 0.3672, "step": 2567500 }, { "epoch": 17.37765266349069, "grad_norm": 0.37118321657180786, "learning_rate": 4.826223473365093e-05, "loss": 0.3661, "step": 2568000 }, { "epoch": 17.381036162841056, "grad_norm": 0.35460785031318665, "learning_rate": 4.82618963837159e-05, "loss": 0.3661, "step": 2568500 }, { "epoch": 17.384419662191426, "grad_norm": 0.351251482963562, "learning_rate": 4.8261558033780864e-05, "loss": 0.3674, "step": 2569000 }, { "epoch": 17.387803161541793, "grad_norm": 0.3240748345851898, "learning_rate": 4.826121968384582e-05, "loss": 0.3655, "step": 2569500 }, { "epoch": 17.39118666089216, "grad_norm": 0.3724314272403717, "learning_rate": 4.826088133391078e-05, "loss": 0.3679, "step": 2570000 }, { "epoch": 17.39457016024253, "grad_norm": 0.3763544261455536, "learning_rate": 4.826054298397575e-05, "loss": 0.3663, "step": 2570500 }, { "epoch": 17.397953659592897, "grad_norm": 0.42186710238456726, "learning_rate": 4.826020463404071e-05, "loss": 0.3676, "step": 2571000 }, { "epoch": 17.401337158943264, "grad_norm": 0.32255861163139343, "learning_rate": 4.8259866284105674e-05, "loss": 0.367, "step": 2571500 }, { "epoch": 17.404720658293634, "grad_norm": 0.35966476798057556, "learning_rate": 4.8259527934170637e-05, "loss": 0.3661, "step": 2572000 }, { "epoch": 17.408104157644, "grad_norm": 0.39646032452583313, "learning_rate": 4.8259189584235605e-05, "loss": 0.3663, "step": 2572500 }, { "epoch": 17.41148765699437, "grad_norm": 0.3326699435710907, "learning_rate": 4.825885123430057e-05, "loss": 0.367, "step": 2573000 }, { "epoch": 17.414871156344738, "grad_norm": 0.34483450651168823, "learning_rate": 4.825851288436553e-05, "loss": 0.367, "step": 2573500 }, { "epoch": 17.418254655695105, "grad_norm": 0.3655441403388977, "learning_rate": 4.825817453443049e-05, "loss": 0.3675, "step": 2574000 }, { "epoch": 17.421638155045475, "grad_norm": 0.32843220233917236, "learning_rate": 4.8257836184495454e-05, "loss": 0.3682, "step": 2574500 }, { "epoch": 17.42502165439584, "grad_norm": 0.3860529661178589, "learning_rate": 4.8257497834560416e-05, "loss": 0.367, "step": 2575000 }, { "epoch": 17.428405153746212, "grad_norm": 0.3711206614971161, "learning_rate": 4.825715948462538e-05, "loss": 0.3666, "step": 2575500 }, { "epoch": 17.43178865309658, "grad_norm": 0.3526160418987274, "learning_rate": 4.825682113469035e-05, "loss": 0.3671, "step": 2576000 }, { "epoch": 17.435172152446945, "grad_norm": 0.39998453855514526, "learning_rate": 4.825648278475531e-05, "loss": 0.3671, "step": 2576500 }, { "epoch": 17.438555651797316, "grad_norm": 0.3498559296131134, "learning_rate": 4.825614443482027e-05, "loss": 0.3676, "step": 2577000 }, { "epoch": 17.441939151147682, "grad_norm": 0.37197813391685486, "learning_rate": 4.8255806084885233e-05, "loss": 0.3663, "step": 2577500 }, { "epoch": 17.445322650498053, "grad_norm": 0.3847373127937317, "learning_rate": 4.82554677349502e-05, "loss": 0.367, "step": 2578000 }, { "epoch": 17.44870614984842, "grad_norm": 0.4149755835533142, "learning_rate": 4.8255129385015164e-05, "loss": 0.3673, "step": 2578500 }, { "epoch": 17.452089649198786, "grad_norm": 0.37201008200645447, "learning_rate": 4.825479103508012e-05, "loss": 0.3656, "step": 2579000 }, { "epoch": 17.455473148549157, "grad_norm": 0.3404081165790558, "learning_rate": 4.825445268514508e-05, "loss": 0.3684, "step": 2579500 }, { "epoch": 17.458856647899523, "grad_norm": 0.3775407671928406, "learning_rate": 4.825411433521005e-05, "loss": 0.3671, "step": 2580000 }, { "epoch": 17.46224014724989, "grad_norm": 0.316976934671402, "learning_rate": 4.825377598527501e-05, "loss": 0.3654, "step": 2580500 }, { "epoch": 17.46562364660026, "grad_norm": 0.3777703046798706, "learning_rate": 4.8253437635339975e-05, "loss": 0.3667, "step": 2581000 }, { "epoch": 17.469007145950627, "grad_norm": 0.34554705023765564, "learning_rate": 4.825309928540494e-05, "loss": 0.3679, "step": 2581500 }, { "epoch": 17.472390645300997, "grad_norm": 0.36104902625083923, "learning_rate": 4.8252760935469906e-05, "loss": 0.3671, "step": 2582000 }, { "epoch": 17.475774144651364, "grad_norm": 0.3420993983745575, "learning_rate": 4.825242258553487e-05, "loss": 0.3663, "step": 2582500 }, { "epoch": 17.47915764400173, "grad_norm": 0.3704248368740082, "learning_rate": 4.825208423559983e-05, "loss": 0.3674, "step": 2583000 }, { "epoch": 17.4825411433521, "grad_norm": 0.36432328820228577, "learning_rate": 4.825174588566479e-05, "loss": 0.3676, "step": 2583500 }, { "epoch": 17.485924642702468, "grad_norm": 0.3504055440425873, "learning_rate": 4.8251407535729755e-05, "loss": 0.3682, "step": 2584000 }, { "epoch": 17.48930814205284, "grad_norm": 0.37538814544677734, "learning_rate": 4.825106918579472e-05, "loss": 0.3659, "step": 2584500 }, { "epoch": 17.492691641403205, "grad_norm": 0.350363552570343, "learning_rate": 4.825073083585968e-05, "loss": 0.3682, "step": 2585000 }, { "epoch": 17.49607514075357, "grad_norm": 0.3279978930950165, "learning_rate": 4.825039248592465e-05, "loss": 0.3666, "step": 2585500 }, { "epoch": 17.499458640103942, "grad_norm": 0.37351611256599426, "learning_rate": 4.825005413598961e-05, "loss": 0.3669, "step": 2586000 }, { "epoch": 17.50284213945431, "grad_norm": 0.3720766305923462, "learning_rate": 4.824971578605457e-05, "loss": 0.3674, "step": 2586500 }, { "epoch": 17.50622563880468, "grad_norm": 0.38092654943466187, "learning_rate": 4.8249377436119534e-05, "loss": 0.3669, "step": 2587000 }, { "epoch": 17.509609138155046, "grad_norm": 0.3415418267250061, "learning_rate": 4.82490390861845e-05, "loss": 0.3662, "step": 2587500 }, { "epoch": 17.512992637505413, "grad_norm": 0.35386013984680176, "learning_rate": 4.8248700736249465e-05, "loss": 0.3678, "step": 2588000 }, { "epoch": 17.516376136855783, "grad_norm": 0.37551605701446533, "learning_rate": 4.824836238631442e-05, "loss": 0.3669, "step": 2588500 }, { "epoch": 17.51975963620615, "grad_norm": 0.36551395058631897, "learning_rate": 4.824802403637938e-05, "loss": 0.3674, "step": 2589000 }, { "epoch": 17.523143135556516, "grad_norm": 0.3686535656452179, "learning_rate": 4.824768568644435e-05, "loss": 0.3673, "step": 2589500 }, { "epoch": 17.526526634906887, "grad_norm": 0.4045044779777527, "learning_rate": 4.8247347336509314e-05, "loss": 0.3673, "step": 2590000 }, { "epoch": 17.529910134257253, "grad_norm": 0.3256719708442688, "learning_rate": 4.8247008986574276e-05, "loss": 0.3662, "step": 2590500 }, { "epoch": 17.533293633607624, "grad_norm": 0.3593628406524658, "learning_rate": 4.824667063663924e-05, "loss": 0.3675, "step": 2591000 }, { "epoch": 17.53667713295799, "grad_norm": 0.3674556612968445, "learning_rate": 4.824633228670421e-05, "loss": 0.366, "step": 2591500 }, { "epoch": 17.540060632308357, "grad_norm": 0.39123180508613586, "learning_rate": 4.824599393676917e-05, "loss": 0.3659, "step": 2592000 }, { "epoch": 17.543444131658728, "grad_norm": 0.36383578181266785, "learning_rate": 4.824565558683413e-05, "loss": 0.3674, "step": 2592500 }, { "epoch": 17.546827631009094, "grad_norm": 0.3695049583911896, "learning_rate": 4.824531723689909e-05, "loss": 0.3666, "step": 2593000 }, { "epoch": 17.550211130359465, "grad_norm": 0.38377198576927185, "learning_rate": 4.8244978886964055e-05, "loss": 0.368, "step": 2593500 }, { "epoch": 17.55359462970983, "grad_norm": 0.35402801632881165, "learning_rate": 4.824464053702902e-05, "loss": 0.366, "step": 2594000 }, { "epoch": 17.556978129060198, "grad_norm": 0.3696456253528595, "learning_rate": 4.824430218709398e-05, "loss": 0.3675, "step": 2594500 }, { "epoch": 17.56036162841057, "grad_norm": 0.3580005168914795, "learning_rate": 4.824396383715895e-05, "loss": 0.3653, "step": 2595000 }, { "epoch": 17.563745127760935, "grad_norm": 0.3534056842327118, "learning_rate": 4.824362548722391e-05, "loss": 0.3658, "step": 2595500 }, { "epoch": 17.567128627111302, "grad_norm": 0.3255242705345154, "learning_rate": 4.824328713728887e-05, "loss": 0.3667, "step": 2596000 }, { "epoch": 17.570512126461672, "grad_norm": 0.3532315194606781, "learning_rate": 4.8242948787353835e-05, "loss": 0.3673, "step": 2596500 }, { "epoch": 17.57389562581204, "grad_norm": 0.37919074296951294, "learning_rate": 4.8242610437418804e-05, "loss": 0.3668, "step": 2597000 }, { "epoch": 17.57727912516241, "grad_norm": 0.36233749985694885, "learning_rate": 4.8242272087483766e-05, "loss": 0.3649, "step": 2597500 }, { "epoch": 17.580662624512776, "grad_norm": 0.3300325572490692, "learning_rate": 4.824193373754872e-05, "loss": 0.3679, "step": 2598000 }, { "epoch": 17.584046123863143, "grad_norm": 0.3588019609451294, "learning_rate": 4.824159538761368e-05, "loss": 0.3661, "step": 2598500 }, { "epoch": 17.587429623213513, "grad_norm": 0.3422711193561554, "learning_rate": 4.824125703767865e-05, "loss": 0.3669, "step": 2599000 }, { "epoch": 17.59081312256388, "grad_norm": 0.4143356680870056, "learning_rate": 4.8240918687743614e-05, "loss": 0.3669, "step": 2599500 }, { "epoch": 17.59419662191425, "grad_norm": 0.36779430508613586, "learning_rate": 4.8240580337808576e-05, "loss": 0.3681, "step": 2600000 }, { "epoch": 17.597580121264617, "grad_norm": 0.33779770135879517, "learning_rate": 4.824024198787354e-05, "loss": 0.3686, "step": 2600500 }, { "epoch": 17.600963620614984, "grad_norm": 0.3325331509113312, "learning_rate": 4.823990363793851e-05, "loss": 0.3663, "step": 2601000 }, { "epoch": 17.604347119965354, "grad_norm": 0.34794822335243225, "learning_rate": 4.823956528800347e-05, "loss": 0.3684, "step": 2601500 }, { "epoch": 17.60773061931572, "grad_norm": 0.3359128534793854, "learning_rate": 4.823922693806843e-05, "loss": 0.3673, "step": 2602000 }, { "epoch": 17.61111411866609, "grad_norm": 0.3586720824241638, "learning_rate": 4.8238888588133394e-05, "loss": 0.3658, "step": 2602500 }, { "epoch": 17.614497618016458, "grad_norm": 0.3466437757015228, "learning_rate": 4.8238550238198356e-05, "loss": 0.3666, "step": 2603000 }, { "epoch": 17.617881117366824, "grad_norm": 0.3655886948108673, "learning_rate": 4.823821188826332e-05, "loss": 0.3676, "step": 2603500 }, { "epoch": 17.621264616717195, "grad_norm": 0.4002399146556854, "learning_rate": 4.823787353832828e-05, "loss": 0.3677, "step": 2604000 }, { "epoch": 17.62464811606756, "grad_norm": 0.4021463096141815, "learning_rate": 4.823753518839325e-05, "loss": 0.3668, "step": 2604500 }, { "epoch": 17.628031615417928, "grad_norm": 0.34503015875816345, "learning_rate": 4.823719683845821e-05, "loss": 0.3688, "step": 2605000 }, { "epoch": 17.6314151147683, "grad_norm": 0.35058820247650146, "learning_rate": 4.823685848852317e-05, "loss": 0.3682, "step": 2605500 }, { "epoch": 17.634798614118665, "grad_norm": 0.36215049028396606, "learning_rate": 4.8236520138588135e-05, "loss": 0.3677, "step": 2606000 }, { "epoch": 17.638182113469036, "grad_norm": 0.3813348114490509, "learning_rate": 4.8236181788653104e-05, "loss": 0.3666, "step": 2606500 }, { "epoch": 17.641565612819402, "grad_norm": 0.3682672679424286, "learning_rate": 4.8235843438718066e-05, "loss": 0.3684, "step": 2607000 }, { "epoch": 17.64494911216977, "grad_norm": 0.39241448044776917, "learning_rate": 4.823550508878302e-05, "loss": 0.3652, "step": 2607500 }, { "epoch": 17.64833261152014, "grad_norm": 0.32166317105293274, "learning_rate": 4.8235166738847984e-05, "loss": 0.3687, "step": 2608000 }, { "epoch": 17.651716110870506, "grad_norm": 0.3621087074279785, "learning_rate": 4.823482838891295e-05, "loss": 0.3664, "step": 2608500 }, { "epoch": 17.655099610220876, "grad_norm": 0.3711774945259094, "learning_rate": 4.8234490038977915e-05, "loss": 0.3659, "step": 2609000 }, { "epoch": 17.658483109571243, "grad_norm": 0.3362772762775421, "learning_rate": 4.823415168904288e-05, "loss": 0.3681, "step": 2609500 }, { "epoch": 17.66186660892161, "grad_norm": 0.34222641587257385, "learning_rate": 4.823381333910784e-05, "loss": 0.3666, "step": 2610000 }, { "epoch": 17.66525010827198, "grad_norm": 0.34946516156196594, "learning_rate": 4.823347498917281e-05, "loss": 0.3678, "step": 2610500 }, { "epoch": 17.668633607622347, "grad_norm": 0.3598966598510742, "learning_rate": 4.823313663923777e-05, "loss": 0.3682, "step": 2611000 }, { "epoch": 17.672017106972717, "grad_norm": 0.39595934748649597, "learning_rate": 4.823279828930273e-05, "loss": 0.3652, "step": 2611500 }, { "epoch": 17.675400606323084, "grad_norm": 0.38074570894241333, "learning_rate": 4.8232459939367694e-05, "loss": 0.3651, "step": 2612000 }, { "epoch": 17.67878410567345, "grad_norm": 0.37233513593673706, "learning_rate": 4.8232121589432656e-05, "loss": 0.3662, "step": 2612500 }, { "epoch": 17.68216760502382, "grad_norm": 0.33832278847694397, "learning_rate": 4.823178323949762e-05, "loss": 0.3648, "step": 2613000 }, { "epoch": 17.685551104374188, "grad_norm": 0.34705302119255066, "learning_rate": 4.823144488956258e-05, "loss": 0.3664, "step": 2613500 }, { "epoch": 17.688934603724555, "grad_norm": 0.3784879148006439, "learning_rate": 4.823110653962755e-05, "loss": 0.3674, "step": 2614000 }, { "epoch": 17.692318103074925, "grad_norm": 0.4426981210708618, "learning_rate": 4.823076818969251e-05, "loss": 0.3671, "step": 2614500 }, { "epoch": 17.69570160242529, "grad_norm": 0.3958585560321808, "learning_rate": 4.8230429839757474e-05, "loss": 0.3674, "step": 2615000 }, { "epoch": 17.699085101775662, "grad_norm": 0.3677558898925781, "learning_rate": 4.8230091489822436e-05, "loss": 0.3658, "step": 2615500 }, { "epoch": 17.70246860112603, "grad_norm": 0.35928767919540405, "learning_rate": 4.82297531398874e-05, "loss": 0.3667, "step": 2616000 }, { "epoch": 17.705852100476395, "grad_norm": 0.3817287087440491, "learning_rate": 4.822941478995237e-05, "loss": 0.3664, "step": 2616500 }, { "epoch": 17.709235599826766, "grad_norm": 0.3287027180194855, "learning_rate": 4.822907644001732e-05, "loss": 0.3669, "step": 2617000 }, { "epoch": 17.712619099177132, "grad_norm": 0.34923413395881653, "learning_rate": 4.8228738090082284e-05, "loss": 0.3657, "step": 2617500 }, { "epoch": 17.716002598527503, "grad_norm": 0.3792766332626343, "learning_rate": 4.822839974014725e-05, "loss": 0.3679, "step": 2618000 }, { "epoch": 17.71938609787787, "grad_norm": 0.3916323482990265, "learning_rate": 4.8228061390212215e-05, "loss": 0.3668, "step": 2618500 }, { "epoch": 17.722769597228236, "grad_norm": 0.3880055248737335, "learning_rate": 4.822772304027718e-05, "loss": 0.3677, "step": 2619000 }, { "epoch": 17.726153096578606, "grad_norm": 0.3902827799320221, "learning_rate": 4.822738469034214e-05, "loss": 0.3646, "step": 2619500 }, { "epoch": 17.729536595928973, "grad_norm": 0.352867066860199, "learning_rate": 4.822704634040711e-05, "loss": 0.3683, "step": 2620000 }, { "epoch": 17.73292009527934, "grad_norm": 0.37308165431022644, "learning_rate": 4.822670799047207e-05, "loss": 0.3675, "step": 2620500 }, { "epoch": 17.73630359462971, "grad_norm": 0.3646025061607361, "learning_rate": 4.822636964053703e-05, "loss": 0.3675, "step": 2621000 }, { "epoch": 17.739687093980077, "grad_norm": 0.3406023383140564, "learning_rate": 4.8226031290601995e-05, "loss": 0.3677, "step": 2621500 }, { "epoch": 17.743070593330447, "grad_norm": 0.37065035104751587, "learning_rate": 4.822569294066696e-05, "loss": 0.3666, "step": 2622000 }, { "epoch": 17.746454092680814, "grad_norm": 0.32739582657814026, "learning_rate": 4.822535459073192e-05, "loss": 0.3655, "step": 2622500 }, { "epoch": 17.74983759203118, "grad_norm": 0.3511997163295746, "learning_rate": 4.822501624079688e-05, "loss": 0.3664, "step": 2623000 }, { "epoch": 17.75322109138155, "grad_norm": 0.3391745090484619, "learning_rate": 4.8224677890861843e-05, "loss": 0.3678, "step": 2623500 }, { "epoch": 17.756604590731918, "grad_norm": 0.2988252341747284, "learning_rate": 4.822433954092681e-05, "loss": 0.3673, "step": 2624000 }, { "epoch": 17.759988090082288, "grad_norm": 0.36709460616111755, "learning_rate": 4.8224001190991774e-05, "loss": 0.3661, "step": 2624500 }, { "epoch": 17.763371589432655, "grad_norm": 0.36869344115257263, "learning_rate": 4.8223662841056737e-05, "loss": 0.3672, "step": 2625000 }, { "epoch": 17.76675508878302, "grad_norm": 0.3532327711582184, "learning_rate": 4.82233244911217e-05, "loss": 0.3677, "step": 2625500 }, { "epoch": 17.770138588133392, "grad_norm": 0.36017414927482605, "learning_rate": 4.822298614118667e-05, "loss": 0.3687, "step": 2626000 }, { "epoch": 17.77352208748376, "grad_norm": 0.3668579161167145, "learning_rate": 4.822264779125162e-05, "loss": 0.3665, "step": 2626500 }, { "epoch": 17.77690558683413, "grad_norm": 0.3817662000656128, "learning_rate": 4.8222309441316585e-05, "loss": 0.3671, "step": 2627000 }, { "epoch": 17.780289086184496, "grad_norm": 0.3410915434360504, "learning_rate": 4.8221971091381554e-05, "loss": 0.3671, "step": 2627500 }, { "epoch": 17.783672585534863, "grad_norm": 0.36917775869369507, "learning_rate": 4.8221632741446516e-05, "loss": 0.3665, "step": 2628000 }, { "epoch": 17.787056084885233, "grad_norm": 0.3521798253059387, "learning_rate": 4.822129439151148e-05, "loss": 0.3673, "step": 2628500 }, { "epoch": 17.7904395842356, "grad_norm": 0.35278838872909546, "learning_rate": 4.822095604157644e-05, "loss": 0.3668, "step": 2629000 }, { "epoch": 17.793823083585966, "grad_norm": 0.3609314560890198, "learning_rate": 4.822061769164141e-05, "loss": 0.3674, "step": 2629500 }, { "epoch": 17.797206582936337, "grad_norm": 0.3256959021091461, "learning_rate": 4.822027934170637e-05, "loss": 0.367, "step": 2630000 }, { "epoch": 17.800590082286703, "grad_norm": 0.3367217779159546, "learning_rate": 4.8219940991771333e-05, "loss": 0.3672, "step": 2630500 }, { "epoch": 17.803973581637074, "grad_norm": 0.35630953311920166, "learning_rate": 4.8219602641836296e-05, "loss": 0.368, "step": 2631000 }, { "epoch": 17.80735708098744, "grad_norm": 0.3785656690597534, "learning_rate": 4.821926429190126e-05, "loss": 0.3672, "step": 2631500 }, { "epoch": 17.810740580337807, "grad_norm": 0.40297931432724, "learning_rate": 4.821892594196622e-05, "loss": 0.3675, "step": 2632000 }, { "epoch": 17.814124079688177, "grad_norm": 0.39543208479881287, "learning_rate": 4.821858759203118e-05, "loss": 0.367, "step": 2632500 }, { "epoch": 17.817507579038544, "grad_norm": 0.3973096013069153, "learning_rate": 4.8218249242096144e-05, "loss": 0.3665, "step": 2633000 }, { "epoch": 17.820891078388915, "grad_norm": 0.3314948081970215, "learning_rate": 4.821791089216111e-05, "loss": 0.3676, "step": 2633500 }, { "epoch": 17.82427457773928, "grad_norm": 0.34036216139793396, "learning_rate": 4.8217572542226075e-05, "loss": 0.3666, "step": 2634000 }, { "epoch": 17.827658077089648, "grad_norm": 0.3349308669567108, "learning_rate": 4.821723419229104e-05, "loss": 0.3656, "step": 2634500 }, { "epoch": 17.83104157644002, "grad_norm": 0.38641485571861267, "learning_rate": 4.8216895842356e-05, "loss": 0.3682, "step": 2635000 }, { "epoch": 17.834425075790385, "grad_norm": 0.3417682349681854, "learning_rate": 4.821655749242097e-05, "loss": 0.3677, "step": 2635500 }, { "epoch": 17.837808575140755, "grad_norm": 0.37354832887649536, "learning_rate": 4.8216219142485924e-05, "loss": 0.3661, "step": 2636000 }, { "epoch": 17.841192074491122, "grad_norm": 0.3448849022388458, "learning_rate": 4.8215880792550886e-05, "loss": 0.3676, "step": 2636500 }, { "epoch": 17.84457557384149, "grad_norm": 0.3836000859737396, "learning_rate": 4.8215542442615855e-05, "loss": 0.3661, "step": 2637000 }, { "epoch": 17.84795907319186, "grad_norm": 0.3253054618835449, "learning_rate": 4.821520409268082e-05, "loss": 0.3653, "step": 2637500 }, { "epoch": 17.851342572542226, "grad_norm": 0.3134402334690094, "learning_rate": 4.821486574274578e-05, "loss": 0.3652, "step": 2638000 }, { "epoch": 17.854726071892593, "grad_norm": 0.3837721049785614, "learning_rate": 4.821452739281074e-05, "loss": 0.3662, "step": 2638500 }, { "epoch": 17.858109571242963, "grad_norm": 0.36032843589782715, "learning_rate": 4.821418904287571e-05, "loss": 0.3664, "step": 2639000 }, { "epoch": 17.86149307059333, "grad_norm": 0.3683348298072815, "learning_rate": 4.821385069294067e-05, "loss": 0.3665, "step": 2639500 }, { "epoch": 17.8648765699437, "grad_norm": 0.37075766921043396, "learning_rate": 4.8213512343005634e-05, "loss": 0.3676, "step": 2640000 }, { "epoch": 17.868260069294067, "grad_norm": 0.37984272837638855, "learning_rate": 4.8213173993070596e-05, "loss": 0.3673, "step": 2640500 }, { "epoch": 17.871643568644433, "grad_norm": 0.3673933148384094, "learning_rate": 4.821283564313556e-05, "loss": 0.3673, "step": 2641000 }, { "epoch": 17.875027067994804, "grad_norm": 0.315318763256073, "learning_rate": 4.821249729320052e-05, "loss": 0.3677, "step": 2641500 }, { "epoch": 17.87841056734517, "grad_norm": 0.3659175932407379, "learning_rate": 4.821215894326548e-05, "loss": 0.3683, "step": 2642000 }, { "epoch": 17.88179406669554, "grad_norm": 0.35016539692878723, "learning_rate": 4.8211820593330445e-05, "loss": 0.3658, "step": 2642500 }, { "epoch": 17.885177566045908, "grad_norm": 0.3711670935153961, "learning_rate": 4.8211482243395414e-05, "loss": 0.3676, "step": 2643000 }, { "epoch": 17.888561065396274, "grad_norm": 0.3717668056488037, "learning_rate": 4.8211143893460376e-05, "loss": 0.3664, "step": 2643500 }, { "epoch": 17.891944564746645, "grad_norm": 0.3931821286678314, "learning_rate": 4.821080554352534e-05, "loss": 0.3668, "step": 2644000 }, { "epoch": 17.89532806409701, "grad_norm": 0.3632790446281433, "learning_rate": 4.82104671935903e-05, "loss": 0.3674, "step": 2644500 }, { "epoch": 17.898711563447378, "grad_norm": 0.3724515736103058, "learning_rate": 4.821012884365527e-05, "loss": 0.3663, "step": 2645000 }, { "epoch": 17.90209506279775, "grad_norm": 0.33584630489349365, "learning_rate": 4.8209790493720224e-05, "loss": 0.3662, "step": 2645500 }, { "epoch": 17.905478562148115, "grad_norm": 0.321268230676651, "learning_rate": 4.8209452143785186e-05, "loss": 0.3681, "step": 2646000 }, { "epoch": 17.908862061498485, "grad_norm": 0.3851405084133148, "learning_rate": 4.8209113793850155e-05, "loss": 0.3669, "step": 2646500 }, { "epoch": 17.912245560848852, "grad_norm": 0.3688462972640991, "learning_rate": 4.820877544391512e-05, "loss": 0.3674, "step": 2647000 }, { "epoch": 17.91562906019922, "grad_norm": 0.3237268030643463, "learning_rate": 4.820843709398008e-05, "loss": 0.3664, "step": 2647500 }, { "epoch": 17.91901255954959, "grad_norm": 0.36310896277427673, "learning_rate": 4.820809874404504e-05, "loss": 0.3677, "step": 2648000 }, { "epoch": 17.922396058899956, "grad_norm": 0.38640767335891724, "learning_rate": 4.820776039411001e-05, "loss": 0.3666, "step": 2648500 }, { "epoch": 17.925779558250326, "grad_norm": 0.36768364906311035, "learning_rate": 4.820742204417497e-05, "loss": 0.3673, "step": 2649000 }, { "epoch": 17.929163057600693, "grad_norm": 0.3582685589790344, "learning_rate": 4.8207083694239935e-05, "loss": 0.3657, "step": 2649500 }, { "epoch": 17.93254655695106, "grad_norm": 0.3740227520465851, "learning_rate": 4.82067453443049e-05, "loss": 0.3695, "step": 2650000 }, { "epoch": 17.93593005630143, "grad_norm": 0.3254905641078949, "learning_rate": 4.820640699436986e-05, "loss": 0.3678, "step": 2650500 }, { "epoch": 17.939313555651797, "grad_norm": 0.3392098546028137, "learning_rate": 4.820606864443482e-05, "loss": 0.366, "step": 2651000 }, { "epoch": 17.942697055002164, "grad_norm": 0.31783849000930786, "learning_rate": 4.820573029449978e-05, "loss": 0.3673, "step": 2651500 }, { "epoch": 17.946080554352534, "grad_norm": 0.37822237610816956, "learning_rate": 4.8205391944564745e-05, "loss": 0.3669, "step": 2652000 }, { "epoch": 17.9494640537029, "grad_norm": 0.3652763366699219, "learning_rate": 4.8205053594629714e-05, "loss": 0.3684, "step": 2652500 }, { "epoch": 17.95284755305327, "grad_norm": 0.39992907643318176, "learning_rate": 4.8204715244694676e-05, "loss": 0.367, "step": 2653000 }, { "epoch": 17.956231052403638, "grad_norm": 0.34198036789894104, "learning_rate": 4.820437689475964e-05, "loss": 0.3671, "step": 2653500 }, { "epoch": 17.959614551754004, "grad_norm": 0.3602445423603058, "learning_rate": 4.82040385448246e-05, "loss": 0.3675, "step": 2654000 }, { "epoch": 17.962998051104375, "grad_norm": 0.3950427770614624, "learning_rate": 4.820370019488957e-05, "loss": 0.3671, "step": 2654500 }, { "epoch": 17.96638155045474, "grad_norm": 0.3671565055847168, "learning_rate": 4.8203361844954525e-05, "loss": 0.3686, "step": 2655000 }, { "epoch": 17.969765049805112, "grad_norm": 0.36367377638816833, "learning_rate": 4.820302349501949e-05, "loss": 0.3686, "step": 2655500 }, { "epoch": 17.97314854915548, "grad_norm": 0.36010125279426575, "learning_rate": 4.8202685145084456e-05, "loss": 0.3658, "step": 2656000 }, { "epoch": 17.976532048505845, "grad_norm": 0.338571697473526, "learning_rate": 4.820234679514942e-05, "loss": 0.3666, "step": 2656500 }, { "epoch": 17.979915547856216, "grad_norm": 0.3533976972103119, "learning_rate": 4.820200844521438e-05, "loss": 0.3663, "step": 2657000 }, { "epoch": 17.983299047206582, "grad_norm": 0.3369833827018738, "learning_rate": 4.820167009527934e-05, "loss": 0.3677, "step": 2657500 }, { "epoch": 17.986682546556953, "grad_norm": 0.36669063568115234, "learning_rate": 4.820133174534431e-05, "loss": 0.3685, "step": 2658000 }, { "epoch": 17.99006604590732, "grad_norm": 0.3408244550228119, "learning_rate": 4.820099339540927e-05, "loss": 0.3669, "step": 2658500 }, { "epoch": 17.993449545257686, "grad_norm": 0.3309365212917328, "learning_rate": 4.8200655045474235e-05, "loss": 0.3663, "step": 2659000 }, { "epoch": 17.996833044608056, "grad_norm": 0.38393697142601013, "learning_rate": 4.82003166955392e-05, "loss": 0.3675, "step": 2659500 }, { "epoch": 18.0, "eval_accuracy": 0.8601398565191122, "eval_loss": 0.5673038959503174, "eval_runtime": 3700.8712, "eval_samples_per_second": 78.561, "eval_steps_per_second": 4.91, "step": 2659968 }, { "epoch": 18.000216543958423, "grad_norm": 0.3817974925041199, "learning_rate": 4.819997834560416e-05, "loss": 0.3665, "step": 2660000 }, { "epoch": 18.00360004330879, "grad_norm": 0.3546946942806244, "learning_rate": 4.819963999566912e-05, "loss": 0.3656, "step": 2660500 }, { "epoch": 18.00698354265916, "grad_norm": 0.375627338886261, "learning_rate": 4.8199301645734084e-05, "loss": 0.3641, "step": 2661000 }, { "epoch": 18.010367042009527, "grad_norm": 0.37911269068717957, "learning_rate": 4.8198963295799046e-05, "loss": 0.3645, "step": 2661500 }, { "epoch": 18.013750541359897, "grad_norm": 0.3864506185054779, "learning_rate": 4.8198624945864015e-05, "loss": 0.3647, "step": 2662000 }, { "epoch": 18.017134040710264, "grad_norm": 0.40366044640541077, "learning_rate": 4.819828659592898e-05, "loss": 0.3652, "step": 2662500 }, { "epoch": 18.02051754006063, "grad_norm": 0.3945550322532654, "learning_rate": 4.819794824599394e-05, "loss": 0.3658, "step": 2663000 }, { "epoch": 18.023901039411, "grad_norm": 0.3441663980484009, "learning_rate": 4.81976098960589e-05, "loss": 0.3642, "step": 2663500 }, { "epoch": 18.027284538761368, "grad_norm": 0.38696587085723877, "learning_rate": 4.819727154612387e-05, "loss": 0.3652, "step": 2664000 }, { "epoch": 18.030668038111738, "grad_norm": 0.3462171256542206, "learning_rate": 4.8196933196188826e-05, "loss": 0.3646, "step": 2664500 }, { "epoch": 18.034051537462105, "grad_norm": 0.39164283871650696, "learning_rate": 4.819659484625379e-05, "loss": 0.3655, "step": 2665000 }, { "epoch": 18.03743503681247, "grad_norm": 0.3807593584060669, "learning_rate": 4.8196256496318757e-05, "loss": 0.367, "step": 2665500 }, { "epoch": 18.040818536162842, "grad_norm": 0.36958304047584534, "learning_rate": 4.819591814638372e-05, "loss": 0.3652, "step": 2666000 }, { "epoch": 18.04420203551321, "grad_norm": 0.3937586545944214, "learning_rate": 4.819557979644868e-05, "loss": 0.3652, "step": 2666500 }, { "epoch": 18.04758553486358, "grad_norm": 0.3388778269290924, "learning_rate": 4.819524144651364e-05, "loss": 0.3659, "step": 2667000 }, { "epoch": 18.050969034213946, "grad_norm": 0.374830961227417, "learning_rate": 4.819490309657861e-05, "loss": 0.3653, "step": 2667500 }, { "epoch": 18.054352533564312, "grad_norm": 0.39372459053993225, "learning_rate": 4.8194564746643574e-05, "loss": 0.3661, "step": 2668000 }, { "epoch": 18.057736032914683, "grad_norm": 0.36588943004608154, "learning_rate": 4.8194226396708536e-05, "loss": 0.3661, "step": 2668500 }, { "epoch": 18.06111953226505, "grad_norm": 0.4009133577346802, "learning_rate": 4.81938880467735e-05, "loss": 0.3659, "step": 2669000 }, { "epoch": 18.064503031615416, "grad_norm": 0.3981277644634247, "learning_rate": 4.819354969683846e-05, "loss": 0.3633, "step": 2669500 }, { "epoch": 18.067886530965787, "grad_norm": 0.38319653272628784, "learning_rate": 4.819321134690342e-05, "loss": 0.3659, "step": 2670000 }, { "epoch": 18.071270030316153, "grad_norm": 0.38970398902893066, "learning_rate": 4.8192872996968385e-05, "loss": 0.3655, "step": 2670500 }, { "epoch": 18.074653529666524, "grad_norm": 0.3901421129703522, "learning_rate": 4.819253464703335e-05, "loss": 0.3658, "step": 2671000 }, { "epoch": 18.07803702901689, "grad_norm": 0.37622517347335815, "learning_rate": 4.8192196297098316e-05, "loss": 0.3652, "step": 2671500 }, { "epoch": 18.081420528367257, "grad_norm": 0.33386629819869995, "learning_rate": 4.819185794716328e-05, "loss": 0.3662, "step": 2672000 }, { "epoch": 18.084804027717627, "grad_norm": 0.3417004942893982, "learning_rate": 4.819151959722824e-05, "loss": 0.3661, "step": 2672500 }, { "epoch": 18.088187527067994, "grad_norm": 0.35943761467933655, "learning_rate": 4.81911812472932e-05, "loss": 0.3651, "step": 2673000 }, { "epoch": 18.091571026418364, "grad_norm": 0.40275195240974426, "learning_rate": 4.819084289735817e-05, "loss": 0.3656, "step": 2673500 }, { "epoch": 18.09495452576873, "grad_norm": 0.367982417345047, "learning_rate": 4.8190504547423126e-05, "loss": 0.3662, "step": 2674000 }, { "epoch": 18.098338025119098, "grad_norm": 0.35256427526474, "learning_rate": 4.819016619748809e-05, "loss": 0.3649, "step": 2674500 }, { "epoch": 18.10172152446947, "grad_norm": 0.4005764424800873, "learning_rate": 4.818982784755306e-05, "loss": 0.366, "step": 2675000 }, { "epoch": 18.105105023819835, "grad_norm": 0.3787810206413269, "learning_rate": 4.818948949761802e-05, "loss": 0.3648, "step": 2675500 }, { "epoch": 18.108488523170205, "grad_norm": 0.3439185619354248, "learning_rate": 4.818915114768298e-05, "loss": 0.3677, "step": 2676000 }, { "epoch": 18.111872022520572, "grad_norm": 0.38821837306022644, "learning_rate": 4.8188812797747944e-05, "loss": 0.3654, "step": 2676500 }, { "epoch": 18.11525552187094, "grad_norm": 0.37852784991264343, "learning_rate": 4.818847444781291e-05, "loss": 0.3649, "step": 2677000 }, { "epoch": 18.11863902122131, "grad_norm": 0.36914563179016113, "learning_rate": 4.8188136097877875e-05, "loss": 0.3669, "step": 2677500 }, { "epoch": 18.122022520571676, "grad_norm": 0.3899284303188324, "learning_rate": 4.818779774794284e-05, "loss": 0.3661, "step": 2678000 }, { "epoch": 18.125406019922043, "grad_norm": 0.37456607818603516, "learning_rate": 4.81874593980078e-05, "loss": 0.3662, "step": 2678500 }, { "epoch": 18.128789519272413, "grad_norm": 0.34672752022743225, "learning_rate": 4.818712104807276e-05, "loss": 0.3648, "step": 2679000 }, { "epoch": 18.13217301862278, "grad_norm": 0.37629690766334534, "learning_rate": 4.818678269813772e-05, "loss": 0.3654, "step": 2679500 }, { "epoch": 18.13555651797315, "grad_norm": 0.3695160448551178, "learning_rate": 4.8186444348202685e-05, "loss": 0.3671, "step": 2680000 }, { "epoch": 18.138940017323517, "grad_norm": 0.37204939126968384, "learning_rate": 4.818610599826765e-05, "loss": 0.3655, "step": 2680500 }, { "epoch": 18.142323516673883, "grad_norm": 0.3446868658065796, "learning_rate": 4.8185767648332616e-05, "loss": 0.3649, "step": 2681000 }, { "epoch": 18.145707016024254, "grad_norm": 0.3619040846824646, "learning_rate": 4.818542929839758e-05, "loss": 0.3663, "step": 2681500 }, { "epoch": 18.14909051537462, "grad_norm": 0.3531608283519745, "learning_rate": 4.818509094846254e-05, "loss": 0.3654, "step": 2682000 }, { "epoch": 18.15247401472499, "grad_norm": 0.36895817518234253, "learning_rate": 4.81847525985275e-05, "loss": 0.366, "step": 2682500 }, { "epoch": 18.155857514075358, "grad_norm": 0.36233237385749817, "learning_rate": 4.818441424859247e-05, "loss": 0.3669, "step": 2683000 }, { "epoch": 18.159241013425724, "grad_norm": 0.40669670701026917, "learning_rate": 4.818407589865743e-05, "loss": 0.3663, "step": 2683500 }, { "epoch": 18.162624512776095, "grad_norm": 0.42166778445243835, "learning_rate": 4.818373754872239e-05, "loss": 0.3667, "step": 2684000 }, { "epoch": 18.16600801212646, "grad_norm": 0.3649405539035797, "learning_rate": 4.818339919878736e-05, "loss": 0.3652, "step": 2684500 }, { "epoch": 18.169391511476828, "grad_norm": 0.3400033712387085, "learning_rate": 4.818306084885232e-05, "loss": 0.3653, "step": 2685000 }, { "epoch": 18.1727750108272, "grad_norm": 0.35990405082702637, "learning_rate": 4.818272249891728e-05, "loss": 0.3661, "step": 2685500 }, { "epoch": 18.176158510177565, "grad_norm": 0.3466646075248718, "learning_rate": 4.8182384148982244e-05, "loss": 0.3663, "step": 2686000 }, { "epoch": 18.179542009527935, "grad_norm": 0.37089553475379944, "learning_rate": 4.8182045799047206e-05, "loss": 0.3678, "step": 2686500 }, { "epoch": 18.182925508878302, "grad_norm": 0.3663167953491211, "learning_rate": 4.8181707449112175e-05, "loss": 0.3675, "step": 2687000 }, { "epoch": 18.18630900822867, "grad_norm": 0.37911930680274963, "learning_rate": 4.818136909917714e-05, "loss": 0.3657, "step": 2687500 }, { "epoch": 18.18969250757904, "grad_norm": 0.35394448041915894, "learning_rate": 4.81810307492421e-05, "loss": 0.3653, "step": 2688000 }, { "epoch": 18.193076006929406, "grad_norm": 0.37933778762817383, "learning_rate": 4.818069239930706e-05, "loss": 0.3652, "step": 2688500 }, { "epoch": 18.196459506279776, "grad_norm": 0.35863587260246277, "learning_rate": 4.8180354049372024e-05, "loss": 0.366, "step": 2689000 }, { "epoch": 18.199843005630143, "grad_norm": 0.33558905124664307, "learning_rate": 4.8180015699436986e-05, "loss": 0.3654, "step": 2689500 }, { "epoch": 18.20322650498051, "grad_norm": 0.37256914377212524, "learning_rate": 4.817967734950195e-05, "loss": 0.3667, "step": 2690000 }, { "epoch": 18.20661000433088, "grad_norm": 0.387002557516098, "learning_rate": 4.817933899956692e-05, "loss": 0.365, "step": 2690500 }, { "epoch": 18.209993503681247, "grad_norm": 0.36478158831596375, "learning_rate": 4.817900064963188e-05, "loss": 0.3664, "step": 2691000 }, { "epoch": 18.213377003031617, "grad_norm": 0.3580193817615509, "learning_rate": 4.817866229969684e-05, "loss": 0.3653, "step": 2691500 }, { "epoch": 18.216760502381984, "grad_norm": 0.3662140667438507, "learning_rate": 4.81783239497618e-05, "loss": 0.367, "step": 2692000 }, { "epoch": 18.22014400173235, "grad_norm": 0.43125787377357483, "learning_rate": 4.817798559982677e-05, "loss": 0.3661, "step": 2692500 }, { "epoch": 18.22352750108272, "grad_norm": 0.37605369091033936, "learning_rate": 4.8177647249891734e-05, "loss": 0.3671, "step": 2693000 }, { "epoch": 18.226911000433088, "grad_norm": 0.3609592914581299, "learning_rate": 4.817730889995669e-05, "loss": 0.3662, "step": 2693500 }, { "epoch": 18.230294499783454, "grad_norm": 0.3572835326194763, "learning_rate": 4.817697055002166e-05, "loss": 0.3663, "step": 2694000 }, { "epoch": 18.233677999133825, "grad_norm": 0.3766094148159027, "learning_rate": 4.817663220008662e-05, "loss": 0.3659, "step": 2694500 }, { "epoch": 18.23706149848419, "grad_norm": 0.35266968607902527, "learning_rate": 4.817629385015158e-05, "loss": 0.3666, "step": 2695000 }, { "epoch": 18.24044499783456, "grad_norm": 0.386359840631485, "learning_rate": 4.8175955500216545e-05, "loss": 0.3663, "step": 2695500 }, { "epoch": 18.24382849718493, "grad_norm": 0.3682827055454254, "learning_rate": 4.817561715028151e-05, "loss": 0.3668, "step": 2696000 }, { "epoch": 18.247211996535295, "grad_norm": 0.34724515676498413, "learning_rate": 4.8175278800346476e-05, "loss": 0.3641, "step": 2696500 }, { "epoch": 18.250595495885666, "grad_norm": 0.36000531911849976, "learning_rate": 4.817494045041144e-05, "loss": 0.3666, "step": 2697000 }, { "epoch": 18.253978995236032, "grad_norm": 0.3692454397678375, "learning_rate": 4.81746021004764e-05, "loss": 0.3679, "step": 2697500 }, { "epoch": 18.257362494586403, "grad_norm": 0.36783865094184875, "learning_rate": 4.817426375054136e-05, "loss": 0.3648, "step": 2698000 }, { "epoch": 18.26074599393677, "grad_norm": 0.30626559257507324, "learning_rate": 4.8173925400606324e-05, "loss": 0.3648, "step": 2698500 }, { "epoch": 18.264129493287136, "grad_norm": 0.34672948718070984, "learning_rate": 4.8173587050671286e-05, "loss": 0.366, "step": 2699000 }, { "epoch": 18.267512992637506, "grad_norm": 0.3682321012020111, "learning_rate": 4.817324870073625e-05, "loss": 0.368, "step": 2699500 }, { "epoch": 18.270896491987873, "grad_norm": 0.34346839785575867, "learning_rate": 4.817291035080122e-05, "loss": 0.366, "step": 2700000 }, { "epoch": 18.27427999133824, "grad_norm": 0.39888525009155273, "learning_rate": 4.817257200086618e-05, "loss": 0.364, "step": 2700500 }, { "epoch": 18.27766349068861, "grad_norm": 0.3761538863182068, "learning_rate": 4.817223365093114e-05, "loss": 0.3653, "step": 2701000 }, { "epoch": 18.281046990038977, "grad_norm": 0.36459478735923767, "learning_rate": 4.8171895300996104e-05, "loss": 0.3651, "step": 2701500 }, { "epoch": 18.284430489389347, "grad_norm": 0.41893574595451355, "learning_rate": 4.817155695106107e-05, "loss": 0.3662, "step": 2702000 }, { "epoch": 18.287813988739714, "grad_norm": 0.3921719193458557, "learning_rate": 4.8171218601126035e-05, "loss": 0.3664, "step": 2702500 }, { "epoch": 18.29119748809008, "grad_norm": 0.3538754880428314, "learning_rate": 4.817088025119099e-05, "loss": 0.3661, "step": 2703000 }, { "epoch": 18.29458098744045, "grad_norm": 0.3760707974433899, "learning_rate": 4.817054190125595e-05, "loss": 0.3666, "step": 2703500 }, { "epoch": 18.297964486790818, "grad_norm": 0.3739316761493683, "learning_rate": 4.817020355132092e-05, "loss": 0.3674, "step": 2704000 }, { "epoch": 18.301347986141188, "grad_norm": 0.3687141537666321, "learning_rate": 4.816986520138588e-05, "loss": 0.3655, "step": 2704500 }, { "epoch": 18.304731485491555, "grad_norm": 0.3798815608024597, "learning_rate": 4.8169526851450845e-05, "loss": 0.3652, "step": 2705000 }, { "epoch": 18.30811498484192, "grad_norm": 0.38236284255981445, "learning_rate": 4.816918850151581e-05, "loss": 0.3652, "step": 2705500 }, { "epoch": 18.311498484192292, "grad_norm": 0.3626120090484619, "learning_rate": 4.8168850151580776e-05, "loss": 0.3656, "step": 2706000 }, { "epoch": 18.31488198354266, "grad_norm": 0.3783361315727234, "learning_rate": 4.816851180164574e-05, "loss": 0.366, "step": 2706500 }, { "epoch": 18.31826548289303, "grad_norm": 0.39285773038864136, "learning_rate": 4.81681734517107e-05, "loss": 0.3667, "step": 2707000 }, { "epoch": 18.321648982243396, "grad_norm": 0.3508833646774292, "learning_rate": 4.816783510177566e-05, "loss": 0.3663, "step": 2707500 }, { "epoch": 18.325032481593762, "grad_norm": 0.3748737871646881, "learning_rate": 4.8167496751840625e-05, "loss": 0.3653, "step": 2708000 }, { "epoch": 18.328415980944133, "grad_norm": 0.39258819818496704, "learning_rate": 4.816715840190559e-05, "loss": 0.3671, "step": 2708500 }, { "epoch": 18.3317994802945, "grad_norm": 0.3650103807449341, "learning_rate": 4.816682005197055e-05, "loss": 0.3672, "step": 2709000 }, { "epoch": 18.335182979644866, "grad_norm": 0.3767535090446472, "learning_rate": 4.816648170203552e-05, "loss": 0.3671, "step": 2709500 }, { "epoch": 18.338566478995237, "grad_norm": 0.37702834606170654, "learning_rate": 4.816614335210048e-05, "loss": 0.368, "step": 2710000 }, { "epoch": 18.341949978345603, "grad_norm": 0.37456855177879333, "learning_rate": 4.816580500216544e-05, "loss": 0.3672, "step": 2710500 }, { "epoch": 18.345333477695974, "grad_norm": 0.3798463046550751, "learning_rate": 4.8165466652230404e-05, "loss": 0.3658, "step": 2711000 }, { "epoch": 18.34871697704634, "grad_norm": 0.32513168454170227, "learning_rate": 4.816512830229537e-05, "loss": 0.3647, "step": 2711500 }, { "epoch": 18.352100476396707, "grad_norm": 0.37949857115745544, "learning_rate": 4.8164789952360335e-05, "loss": 0.3661, "step": 2712000 }, { "epoch": 18.355483975747077, "grad_norm": 0.32526543736457825, "learning_rate": 4.816445160242529e-05, "loss": 0.3653, "step": 2712500 }, { "epoch": 18.358867475097444, "grad_norm": 0.3609720766544342, "learning_rate": 4.816411325249025e-05, "loss": 0.3653, "step": 2713000 }, { "epoch": 18.362250974447814, "grad_norm": 0.34838616847991943, "learning_rate": 4.816377490255522e-05, "loss": 0.3669, "step": 2713500 }, { "epoch": 18.36563447379818, "grad_norm": 0.3667636513710022, "learning_rate": 4.8163436552620184e-05, "loss": 0.3648, "step": 2714000 }, { "epoch": 18.369017973148548, "grad_norm": 0.37113574147224426, "learning_rate": 4.8163098202685146e-05, "loss": 0.3647, "step": 2714500 }, { "epoch": 18.372401472498918, "grad_norm": 0.37193408608436584, "learning_rate": 4.816275985275011e-05, "loss": 0.3677, "step": 2715000 }, { "epoch": 18.375784971849285, "grad_norm": 0.36754679679870605, "learning_rate": 4.816242150281508e-05, "loss": 0.3657, "step": 2715500 }, { "epoch": 18.379168471199655, "grad_norm": 0.404785692691803, "learning_rate": 4.816208315288004e-05, "loss": 0.3669, "step": 2716000 }, { "epoch": 18.382551970550022, "grad_norm": 0.3446834087371826, "learning_rate": 4.8161744802945e-05, "loss": 0.3655, "step": 2716500 }, { "epoch": 18.38593546990039, "grad_norm": 0.3776063621044159, "learning_rate": 4.8161406453009963e-05, "loss": 0.3664, "step": 2717000 }, { "epoch": 18.38931896925076, "grad_norm": 0.34708327054977417, "learning_rate": 4.8161068103074926e-05, "loss": 0.366, "step": 2717500 }, { "epoch": 18.392702468601126, "grad_norm": 0.35970473289489746, "learning_rate": 4.816072975313989e-05, "loss": 0.3676, "step": 2718000 }, { "epoch": 18.396085967951493, "grad_norm": 0.40329545736312866, "learning_rate": 4.816039140320485e-05, "loss": 0.3653, "step": 2718500 }, { "epoch": 18.399469467301863, "grad_norm": 0.34806376695632935, "learning_rate": 4.816005305326982e-05, "loss": 0.3675, "step": 2719000 }, { "epoch": 18.40285296665223, "grad_norm": 0.362699031829834, "learning_rate": 4.815971470333478e-05, "loss": 0.3684, "step": 2719500 }, { "epoch": 18.4062364660026, "grad_norm": 0.32820168137550354, "learning_rate": 4.815937635339974e-05, "loss": 0.3647, "step": 2720000 }, { "epoch": 18.409619965352967, "grad_norm": 0.33602628111839294, "learning_rate": 4.8159038003464705e-05, "loss": 0.3652, "step": 2720500 }, { "epoch": 18.413003464703333, "grad_norm": 0.3572433888912201, "learning_rate": 4.8158699653529674e-05, "loss": 0.368, "step": 2721000 }, { "epoch": 18.416386964053704, "grad_norm": 0.38326042890548706, "learning_rate": 4.8158361303594636e-05, "loss": 0.3665, "step": 2721500 }, { "epoch": 18.41977046340407, "grad_norm": 0.36697304248809814, "learning_rate": 4.815802295365959e-05, "loss": 0.3656, "step": 2722000 }, { "epoch": 18.42315396275444, "grad_norm": 0.3343600034713745, "learning_rate": 4.8157684603724554e-05, "loss": 0.3661, "step": 2722500 }, { "epoch": 18.426537462104807, "grad_norm": 0.36516448855400085, "learning_rate": 4.815734625378952e-05, "loss": 0.3659, "step": 2723000 }, { "epoch": 18.429920961455174, "grad_norm": 0.4176952540874481, "learning_rate": 4.8157007903854485e-05, "loss": 0.366, "step": 2723500 }, { "epoch": 18.433304460805545, "grad_norm": 0.37972500920295715, "learning_rate": 4.815666955391945e-05, "loss": 0.3656, "step": 2724000 }, { "epoch": 18.43668796015591, "grad_norm": 0.38399654626846313, "learning_rate": 4.815633120398441e-05, "loss": 0.3665, "step": 2724500 }, { "epoch": 18.440071459506278, "grad_norm": 0.3755989968776703, "learning_rate": 4.815599285404938e-05, "loss": 0.3657, "step": 2725000 }, { "epoch": 18.44345495885665, "grad_norm": 0.32659873366355896, "learning_rate": 4.815565450411434e-05, "loss": 0.3656, "step": 2725500 }, { "epoch": 18.446838458207015, "grad_norm": 0.3238185942173004, "learning_rate": 4.81553161541793e-05, "loss": 0.3657, "step": 2726000 }, { "epoch": 18.450221957557385, "grad_norm": 0.3889681100845337, "learning_rate": 4.8154977804244264e-05, "loss": 0.3654, "step": 2726500 }, { "epoch": 18.453605456907752, "grad_norm": 0.38696062564849854, "learning_rate": 4.8154639454309226e-05, "loss": 0.3651, "step": 2727000 }, { "epoch": 18.45698895625812, "grad_norm": 0.35800743103027344, "learning_rate": 4.815430110437419e-05, "loss": 0.3662, "step": 2727500 }, { "epoch": 18.46037245560849, "grad_norm": 0.3667149841785431, "learning_rate": 4.815396275443915e-05, "loss": 0.3665, "step": 2728000 }, { "epoch": 18.463755954958856, "grad_norm": 0.37348583340644836, "learning_rate": 4.815362440450412e-05, "loss": 0.367, "step": 2728500 }, { "epoch": 18.467139454309226, "grad_norm": 0.3520568311214447, "learning_rate": 4.815328605456908e-05, "loss": 0.3678, "step": 2729000 }, { "epoch": 18.470522953659593, "grad_norm": 0.38284870982170105, "learning_rate": 4.8152947704634044e-05, "loss": 0.3672, "step": 2729500 }, { "epoch": 18.47390645300996, "grad_norm": 0.34518930315971375, "learning_rate": 4.8152609354699006e-05, "loss": 0.3655, "step": 2730000 }, { "epoch": 18.47728995236033, "grad_norm": 0.34147876501083374, "learning_rate": 4.8152271004763975e-05, "loss": 0.3666, "step": 2730500 }, { "epoch": 18.480673451710697, "grad_norm": 0.37820854783058167, "learning_rate": 4.815193265482894e-05, "loss": 0.3648, "step": 2731000 }, { "epoch": 18.484056951061067, "grad_norm": 0.372206449508667, "learning_rate": 4.815159430489389e-05, "loss": 0.367, "step": 2731500 }, { "epoch": 18.487440450411434, "grad_norm": 0.36140644550323486, "learning_rate": 4.8151255954958854e-05, "loss": 0.3664, "step": 2732000 }, { "epoch": 18.4908239497618, "grad_norm": 0.366239994764328, "learning_rate": 4.815091760502382e-05, "loss": 0.3672, "step": 2732500 }, { "epoch": 18.49420744911217, "grad_norm": 0.3318920135498047, "learning_rate": 4.8150579255088785e-05, "loss": 0.3668, "step": 2733000 }, { "epoch": 18.497590948462538, "grad_norm": 0.32766205072402954, "learning_rate": 4.815024090515375e-05, "loss": 0.3669, "step": 2733500 }, { "epoch": 18.500974447812904, "grad_norm": 0.3653565049171448, "learning_rate": 4.814990255521871e-05, "loss": 0.3665, "step": 2734000 }, { "epoch": 18.504357947163275, "grad_norm": 0.38248080015182495, "learning_rate": 4.814956420528368e-05, "loss": 0.367, "step": 2734500 }, { "epoch": 18.50774144651364, "grad_norm": 0.3788929283618927, "learning_rate": 4.814922585534864e-05, "loss": 0.3654, "step": 2735000 }, { "epoch": 18.51112494586401, "grad_norm": 0.3633365035057068, "learning_rate": 4.81488875054136e-05, "loss": 0.3667, "step": 2735500 }, { "epoch": 18.51450844521438, "grad_norm": 0.3879588842391968, "learning_rate": 4.8148549155478565e-05, "loss": 0.3664, "step": 2736000 }, { "epoch": 18.517891944564745, "grad_norm": 0.38161173462867737, "learning_rate": 4.814821080554353e-05, "loss": 0.3671, "step": 2736500 }, { "epoch": 18.521275443915115, "grad_norm": 0.36614733934402466, "learning_rate": 4.814787245560849e-05, "loss": 0.3667, "step": 2737000 }, { "epoch": 18.524658943265482, "grad_norm": 0.3792756497859955, "learning_rate": 4.814753410567345e-05, "loss": 0.3666, "step": 2737500 }, { "epoch": 18.528042442615853, "grad_norm": 0.38750314712524414, "learning_rate": 4.814719575573842e-05, "loss": 0.3658, "step": 2738000 }, { "epoch": 18.53142594196622, "grad_norm": 0.3520932197570801, "learning_rate": 4.814685740580338e-05, "loss": 0.3656, "step": 2738500 }, { "epoch": 18.534809441316586, "grad_norm": 0.3735598027706146, "learning_rate": 4.8146519055868344e-05, "loss": 0.3659, "step": 2739000 }, { "epoch": 18.538192940666956, "grad_norm": 0.3451687693595886, "learning_rate": 4.8146180705933306e-05, "loss": 0.3658, "step": 2739500 }, { "epoch": 18.541576440017323, "grad_norm": 0.33136609196662903, "learning_rate": 4.8145842355998275e-05, "loss": 0.3664, "step": 2740000 }, { "epoch": 18.544959939367693, "grad_norm": 0.379218190908432, "learning_rate": 4.814550400606324e-05, "loss": 0.3673, "step": 2740500 }, { "epoch": 18.54834343871806, "grad_norm": 0.3787924647331238, "learning_rate": 4.814516565612819e-05, "loss": 0.3666, "step": 2741000 }, { "epoch": 18.551726938068427, "grad_norm": 0.3593429625034332, "learning_rate": 4.8144827306193155e-05, "loss": 0.3656, "step": 2741500 }, { "epoch": 18.555110437418797, "grad_norm": 0.3921399414539337, "learning_rate": 4.8144488956258124e-05, "loss": 0.3652, "step": 2742000 }, { "epoch": 18.558493936769164, "grad_norm": 0.34136930108070374, "learning_rate": 4.8144150606323086e-05, "loss": 0.3679, "step": 2742500 }, { "epoch": 18.56187743611953, "grad_norm": 0.33634859323501587, "learning_rate": 4.814381225638805e-05, "loss": 0.3658, "step": 2743000 }, { "epoch": 18.5652609354699, "grad_norm": 0.3452155292034149, "learning_rate": 4.814347390645301e-05, "loss": 0.3647, "step": 2743500 }, { "epoch": 18.568644434820268, "grad_norm": 0.3631720542907715, "learning_rate": 4.814313555651798e-05, "loss": 0.3668, "step": 2744000 }, { "epoch": 18.572027934170638, "grad_norm": 0.3850167989730835, "learning_rate": 4.814279720658294e-05, "loss": 0.3664, "step": 2744500 }, { "epoch": 18.575411433521005, "grad_norm": 0.39307355880737305, "learning_rate": 4.81424588566479e-05, "loss": 0.3674, "step": 2745000 }, { "epoch": 18.57879493287137, "grad_norm": 0.36546698212623596, "learning_rate": 4.8142120506712865e-05, "loss": 0.3669, "step": 2745500 }, { "epoch": 18.582178432221742, "grad_norm": 0.3274228870868683, "learning_rate": 4.814178215677783e-05, "loss": 0.366, "step": 2746000 }, { "epoch": 18.58556193157211, "grad_norm": 0.37010127305984497, "learning_rate": 4.814144380684279e-05, "loss": 0.3654, "step": 2746500 }, { "epoch": 18.58894543092248, "grad_norm": 0.36521145701408386, "learning_rate": 4.814110545690775e-05, "loss": 0.3652, "step": 2747000 }, { "epoch": 18.592328930272846, "grad_norm": 0.3667973279953003, "learning_rate": 4.814076710697272e-05, "loss": 0.3666, "step": 2747500 }, { "epoch": 18.595712429623212, "grad_norm": 0.3821118474006653, "learning_rate": 4.814042875703768e-05, "loss": 0.3669, "step": 2748000 }, { "epoch": 18.599095928973583, "grad_norm": 0.34617286920547485, "learning_rate": 4.8140090407102645e-05, "loss": 0.3651, "step": 2748500 }, { "epoch": 18.60247942832395, "grad_norm": 0.349868506193161, "learning_rate": 4.813975205716761e-05, "loss": 0.3654, "step": 2749000 }, { "epoch": 18.605862927674316, "grad_norm": 0.3502577543258667, "learning_rate": 4.813941370723257e-05, "loss": 0.3661, "step": 2749500 }, { "epoch": 18.609246427024686, "grad_norm": 0.34174036979675293, "learning_rate": 4.813907535729754e-05, "loss": 0.3655, "step": 2750000 }, { "epoch": 18.612629926375053, "grad_norm": 0.3498634696006775, "learning_rate": 4.813873700736249e-05, "loss": 0.3649, "step": 2750500 }, { "epoch": 18.616013425725424, "grad_norm": 0.3564428985118866, "learning_rate": 4.8138398657427455e-05, "loss": 0.3681, "step": 2751000 }, { "epoch": 18.61939692507579, "grad_norm": 0.32354769110679626, "learning_rate": 4.8138060307492424e-05, "loss": 0.3672, "step": 2751500 }, { "epoch": 18.622780424426157, "grad_norm": 0.37490779161453247, "learning_rate": 4.8137721957557386e-05, "loss": 0.3666, "step": 2752000 }, { "epoch": 18.626163923776527, "grad_norm": 0.3808179497718811, "learning_rate": 4.813738360762235e-05, "loss": 0.3667, "step": 2752500 }, { "epoch": 18.629547423126894, "grad_norm": 0.34879276156425476, "learning_rate": 4.813704525768731e-05, "loss": 0.3662, "step": 2753000 }, { "epoch": 18.632930922477264, "grad_norm": 0.35445043444633484, "learning_rate": 4.813670690775228e-05, "loss": 0.3673, "step": 2753500 }, { "epoch": 18.63631442182763, "grad_norm": 0.37173983454704285, "learning_rate": 4.813636855781724e-05, "loss": 0.3656, "step": 2754000 }, { "epoch": 18.639697921177998, "grad_norm": 0.33357104659080505, "learning_rate": 4.8136030207882204e-05, "loss": 0.3657, "step": 2754500 }, { "epoch": 18.643081420528368, "grad_norm": 0.3388036787509918, "learning_rate": 4.8135691857947166e-05, "loss": 0.3662, "step": 2755000 }, { "epoch": 18.646464919878735, "grad_norm": 0.3628353476524353, "learning_rate": 4.813535350801213e-05, "loss": 0.3674, "step": 2755500 }, { "epoch": 18.649848419229105, "grad_norm": 0.37852683663368225, "learning_rate": 4.813501515807709e-05, "loss": 0.3661, "step": 2756000 }, { "epoch": 18.653231918579472, "grad_norm": 0.3644959628582001, "learning_rate": 4.813467680814205e-05, "loss": 0.3662, "step": 2756500 }, { "epoch": 18.65661541792984, "grad_norm": 0.371595174074173, "learning_rate": 4.8134338458207014e-05, "loss": 0.3665, "step": 2757000 }, { "epoch": 18.65999891728021, "grad_norm": 0.3815184235572815, "learning_rate": 4.813400010827198e-05, "loss": 0.3663, "step": 2757500 }, { "epoch": 18.663382416630576, "grad_norm": 0.36756274104118347, "learning_rate": 4.8133661758336945e-05, "loss": 0.3657, "step": 2758000 }, { "epoch": 18.666765915980942, "grad_norm": 0.3418610692024231, "learning_rate": 4.813332340840191e-05, "loss": 0.3664, "step": 2758500 }, { "epoch": 18.670149415331313, "grad_norm": 0.36101487278938293, "learning_rate": 4.813298505846687e-05, "loss": 0.3677, "step": 2759000 }, { "epoch": 18.67353291468168, "grad_norm": 0.39173388481140137, "learning_rate": 4.813264670853184e-05, "loss": 0.3649, "step": 2759500 }, { "epoch": 18.67691641403205, "grad_norm": 0.37323135137557983, "learning_rate": 4.8132308358596794e-05, "loss": 0.3657, "step": 2760000 }, { "epoch": 18.680299913382417, "grad_norm": 0.36500459909439087, "learning_rate": 4.8131970008661756e-05, "loss": 0.3652, "step": 2760500 }, { "epoch": 18.683683412732783, "grad_norm": 0.35408034920692444, "learning_rate": 4.8131631658726725e-05, "loss": 0.3656, "step": 2761000 }, { "epoch": 18.687066912083154, "grad_norm": 0.3520357012748718, "learning_rate": 4.813129330879169e-05, "loss": 0.3658, "step": 2761500 }, { "epoch": 18.69045041143352, "grad_norm": 0.35334864258766174, "learning_rate": 4.813095495885665e-05, "loss": 0.3673, "step": 2762000 }, { "epoch": 18.69383391078389, "grad_norm": 0.36210882663726807, "learning_rate": 4.813061660892161e-05, "loss": 0.3659, "step": 2762500 }, { "epoch": 18.697217410134257, "grad_norm": 0.3763342797756195, "learning_rate": 4.813027825898658e-05, "loss": 0.3674, "step": 2763000 }, { "epoch": 18.700600909484624, "grad_norm": 0.33555757999420166, "learning_rate": 4.812993990905154e-05, "loss": 0.3667, "step": 2763500 }, { "epoch": 18.703984408834994, "grad_norm": 0.3518519103527069, "learning_rate": 4.8129601559116504e-05, "loss": 0.3662, "step": 2764000 }, { "epoch": 18.70736790818536, "grad_norm": 0.3487462103366852, "learning_rate": 4.812926320918147e-05, "loss": 0.3674, "step": 2764500 }, { "epoch": 18.71075140753573, "grad_norm": 0.3702090382575989, "learning_rate": 4.812892485924643e-05, "loss": 0.3642, "step": 2765000 }, { "epoch": 18.7141349068861, "grad_norm": 0.348533034324646, "learning_rate": 4.812858650931139e-05, "loss": 0.3669, "step": 2765500 }, { "epoch": 18.717518406236465, "grad_norm": 0.360711008310318, "learning_rate": 4.812824815937635e-05, "loss": 0.3657, "step": 2766000 }, { "epoch": 18.720901905586835, "grad_norm": 0.3330981731414795, "learning_rate": 4.8127909809441315e-05, "loss": 0.366, "step": 2766500 }, { "epoch": 18.724285404937202, "grad_norm": 0.3385585844516754, "learning_rate": 4.8127571459506284e-05, "loss": 0.3654, "step": 2767000 }, { "epoch": 18.72766890428757, "grad_norm": 0.3667827546596527, "learning_rate": 4.8127233109571246e-05, "loss": 0.3645, "step": 2767500 }, { "epoch": 18.73105240363794, "grad_norm": 0.34755051136016846, "learning_rate": 4.812689475963621e-05, "loss": 0.366, "step": 2768000 }, { "epoch": 18.734435902988306, "grad_norm": 0.3477209508419037, "learning_rate": 4.812655640970117e-05, "loss": 0.3666, "step": 2768500 }, { "epoch": 18.737819402338676, "grad_norm": 0.4002993702888489, "learning_rate": 4.812621805976614e-05, "loss": 0.367, "step": 2769000 }, { "epoch": 18.741202901689043, "grad_norm": 0.34906914830207825, "learning_rate": 4.8125879709831095e-05, "loss": 0.3671, "step": 2769500 }, { "epoch": 18.74458640103941, "grad_norm": 0.38176172971725464, "learning_rate": 4.812554135989606e-05, "loss": 0.3663, "step": 2770000 }, { "epoch": 18.74796990038978, "grad_norm": 0.36529573798179626, "learning_rate": 4.8125203009961026e-05, "loss": 0.3669, "step": 2770500 }, { "epoch": 18.751353399740147, "grad_norm": 0.3871072828769684, "learning_rate": 4.812486466002599e-05, "loss": 0.3656, "step": 2771000 }, { "epoch": 18.754736899090517, "grad_norm": 0.37509897351264954, "learning_rate": 4.812452631009095e-05, "loss": 0.3665, "step": 2771500 }, { "epoch": 18.758120398440884, "grad_norm": 0.3503206968307495, "learning_rate": 4.812418796015591e-05, "loss": 0.365, "step": 2772000 }, { "epoch": 18.76150389779125, "grad_norm": 0.38853317499160767, "learning_rate": 4.812384961022088e-05, "loss": 0.3665, "step": 2772500 }, { "epoch": 18.76488739714162, "grad_norm": 0.33158519864082336, "learning_rate": 4.812351126028584e-05, "loss": 0.3657, "step": 2773000 }, { "epoch": 18.768270896491988, "grad_norm": 0.39370110630989075, "learning_rate": 4.8123172910350805e-05, "loss": 0.3664, "step": 2773500 }, { "epoch": 18.771654395842354, "grad_norm": 0.3575500547885895, "learning_rate": 4.812283456041577e-05, "loss": 0.3663, "step": 2774000 }, { "epoch": 18.775037895192725, "grad_norm": 0.373933345079422, "learning_rate": 4.812249621048073e-05, "loss": 0.3667, "step": 2774500 }, { "epoch": 18.77842139454309, "grad_norm": 0.3317835330963135, "learning_rate": 4.812215786054569e-05, "loss": 0.3655, "step": 2775000 }, { "epoch": 18.78180489389346, "grad_norm": 0.3341616690158844, "learning_rate": 4.8121819510610654e-05, "loss": 0.3662, "step": 2775500 }, { "epoch": 18.78518839324383, "grad_norm": 0.39613327383995056, "learning_rate": 4.8121481160675616e-05, "loss": 0.3664, "step": 2776000 }, { "epoch": 18.788571892594195, "grad_norm": 0.36104127764701843, "learning_rate": 4.8121142810740585e-05, "loss": 0.3658, "step": 2776500 }, { "epoch": 18.791955391944565, "grad_norm": 0.34897303581237793, "learning_rate": 4.812080446080555e-05, "loss": 0.367, "step": 2777000 }, { "epoch": 18.795338891294932, "grad_norm": 0.40071144700050354, "learning_rate": 4.812046611087051e-05, "loss": 0.366, "step": 2777500 }, { "epoch": 18.798722390645302, "grad_norm": 0.3589742183685303, "learning_rate": 4.812012776093547e-05, "loss": 0.3656, "step": 2778000 }, { "epoch": 18.80210588999567, "grad_norm": 0.3539280593395233, "learning_rate": 4.811978941100044e-05, "loss": 0.3672, "step": 2778500 }, { "epoch": 18.805489389346036, "grad_norm": 0.3695991635322571, "learning_rate": 4.8119451061065395e-05, "loss": 0.3681, "step": 2779000 }, { "epoch": 18.808872888696406, "grad_norm": 0.3592228293418884, "learning_rate": 4.811911271113036e-05, "loss": 0.3675, "step": 2779500 }, { "epoch": 18.812256388046773, "grad_norm": 0.3480539321899414, "learning_rate": 4.8118774361195326e-05, "loss": 0.3642, "step": 2780000 }, { "epoch": 18.815639887397143, "grad_norm": 0.36234450340270996, "learning_rate": 4.811843601126029e-05, "loss": 0.3668, "step": 2780500 }, { "epoch": 18.81902338674751, "grad_norm": 0.37986135482788086, "learning_rate": 4.811809766132525e-05, "loss": 0.3665, "step": 2781000 }, { "epoch": 18.822406886097877, "grad_norm": 0.3787166476249695, "learning_rate": 4.811775931139021e-05, "loss": 0.365, "step": 2781500 }, { "epoch": 18.825790385448247, "grad_norm": 0.3391520082950592, "learning_rate": 4.811742096145518e-05, "loss": 0.3658, "step": 2782000 }, { "epoch": 18.829173884798614, "grad_norm": 0.37807825207710266, "learning_rate": 4.8117082611520144e-05, "loss": 0.3663, "step": 2782500 }, { "epoch": 18.83255738414898, "grad_norm": 0.36283302307128906, "learning_rate": 4.8116744261585106e-05, "loss": 0.366, "step": 2783000 }, { "epoch": 18.83594088349935, "grad_norm": 0.37602904438972473, "learning_rate": 4.811640591165007e-05, "loss": 0.3669, "step": 2783500 }, { "epoch": 18.839324382849718, "grad_norm": 0.3637160062789917, "learning_rate": 4.811606756171503e-05, "loss": 0.3663, "step": 2784000 }, { "epoch": 18.842707882200088, "grad_norm": 0.35866454243659973, "learning_rate": 4.811572921177999e-05, "loss": 0.3663, "step": 2784500 }, { "epoch": 18.846091381550455, "grad_norm": 0.32873547077178955, "learning_rate": 4.8115390861844954e-05, "loss": 0.3651, "step": 2785000 }, { "epoch": 18.84947488090082, "grad_norm": 0.36401212215423584, "learning_rate": 4.8115052511909916e-05, "loss": 0.365, "step": 2785500 }, { "epoch": 18.85285838025119, "grad_norm": 0.3384600281715393, "learning_rate": 4.8114714161974885e-05, "loss": 0.3655, "step": 2786000 }, { "epoch": 18.85624187960156, "grad_norm": 0.34413713216781616, "learning_rate": 4.811437581203985e-05, "loss": 0.3652, "step": 2786500 }, { "epoch": 18.85962537895193, "grad_norm": 0.3725583255290985, "learning_rate": 4.811403746210481e-05, "loss": 0.3665, "step": 2787000 }, { "epoch": 18.863008878302296, "grad_norm": 0.35329461097717285, "learning_rate": 4.811369911216977e-05, "loss": 0.3659, "step": 2787500 }, { "epoch": 18.866392377652662, "grad_norm": 0.33690500259399414, "learning_rate": 4.811336076223474e-05, "loss": 0.3668, "step": 2788000 }, { "epoch": 18.869775877003033, "grad_norm": 0.3894781470298767, "learning_rate": 4.8113022412299696e-05, "loss": 0.3658, "step": 2788500 }, { "epoch": 18.8731593763534, "grad_norm": 0.38862013816833496, "learning_rate": 4.811268406236466e-05, "loss": 0.365, "step": 2789000 }, { "epoch": 18.87654287570377, "grad_norm": 0.3516341745853424, "learning_rate": 4.811234571242963e-05, "loss": 0.3675, "step": 2789500 }, { "epoch": 18.879926375054136, "grad_norm": 0.3779483735561371, "learning_rate": 4.811200736249459e-05, "loss": 0.367, "step": 2790000 }, { "epoch": 18.883309874404503, "grad_norm": 0.3754669725894928, "learning_rate": 4.811166901255955e-05, "loss": 0.3672, "step": 2790500 }, { "epoch": 18.886693373754873, "grad_norm": 0.38591325283050537, "learning_rate": 4.811133066262451e-05, "loss": 0.3681, "step": 2791000 }, { "epoch": 18.89007687310524, "grad_norm": 0.3474923372268677, "learning_rate": 4.811099231268948e-05, "loss": 0.3664, "step": 2791500 }, { "epoch": 18.893460372455607, "grad_norm": 0.3471725285053253, "learning_rate": 4.8110653962754444e-05, "loss": 0.3652, "step": 2792000 }, { "epoch": 18.896843871805977, "grad_norm": 0.3508303761482239, "learning_rate": 4.8110315612819406e-05, "loss": 0.3644, "step": 2792500 }, { "epoch": 18.900227371156344, "grad_norm": 0.3836657404899597, "learning_rate": 4.810997726288437e-05, "loss": 0.3679, "step": 2793000 }, { "epoch": 18.903610870506714, "grad_norm": 0.37254008650779724, "learning_rate": 4.810963891294933e-05, "loss": 0.3657, "step": 2793500 }, { "epoch": 18.90699436985708, "grad_norm": 0.35349971055984497, "learning_rate": 4.810930056301429e-05, "loss": 0.3657, "step": 2794000 }, { "epoch": 18.910377869207448, "grad_norm": 0.3606826961040497, "learning_rate": 4.8108962213079255e-05, "loss": 0.3673, "step": 2794500 }, { "epoch": 18.913761368557818, "grad_norm": 0.3424413502216339, "learning_rate": 4.810862386314422e-05, "loss": 0.3678, "step": 2795000 }, { "epoch": 18.917144867908185, "grad_norm": 0.3713364601135254, "learning_rate": 4.8108285513209186e-05, "loss": 0.3675, "step": 2795500 }, { "epoch": 18.920528367258555, "grad_norm": 0.36504119634628296, "learning_rate": 4.810794716327415e-05, "loss": 0.3664, "step": 2796000 }, { "epoch": 18.923911866608922, "grad_norm": 0.3482113182544708, "learning_rate": 4.810760881333911e-05, "loss": 0.3657, "step": 2796500 }, { "epoch": 18.92729536595929, "grad_norm": 0.36139917373657227, "learning_rate": 4.810727046340407e-05, "loss": 0.3671, "step": 2797000 }, { "epoch": 18.93067886530966, "grad_norm": 0.3819946050643921, "learning_rate": 4.810693211346904e-05, "loss": 0.3658, "step": 2797500 }, { "epoch": 18.934062364660026, "grad_norm": 0.37418124079704285, "learning_rate": 4.8106593763533997e-05, "loss": 0.3654, "step": 2798000 }, { "epoch": 18.937445864010392, "grad_norm": 0.37340039014816284, "learning_rate": 4.810625541359896e-05, "loss": 0.3667, "step": 2798500 }, { "epoch": 18.940829363360763, "grad_norm": 0.3686635494232178, "learning_rate": 4.810591706366393e-05, "loss": 0.3671, "step": 2799000 }, { "epoch": 18.94421286271113, "grad_norm": 0.34484514594078064, "learning_rate": 4.810557871372889e-05, "loss": 0.3666, "step": 2799500 }, { "epoch": 18.9475963620615, "grad_norm": 0.3572694957256317, "learning_rate": 4.810524036379385e-05, "loss": 0.3643, "step": 2800000 }, { "epoch": 18.950979861411867, "grad_norm": 0.35780635476112366, "learning_rate": 4.8104902013858814e-05, "loss": 0.3677, "step": 2800500 }, { "epoch": 18.954363360762233, "grad_norm": 0.34452852606773376, "learning_rate": 4.810456366392378e-05, "loss": 0.3664, "step": 2801000 }, { "epoch": 18.957746860112604, "grad_norm": 0.3469793498516083, "learning_rate": 4.8104225313988745e-05, "loss": 0.3674, "step": 2801500 }, { "epoch": 18.96113035946297, "grad_norm": 0.39285343885421753, "learning_rate": 4.810388696405371e-05, "loss": 0.3659, "step": 2802000 }, { "epoch": 18.96451385881334, "grad_norm": 0.39713558554649353, "learning_rate": 4.810354861411867e-05, "loss": 0.3672, "step": 2802500 }, { "epoch": 18.967897358163707, "grad_norm": 0.3319869935512543, "learning_rate": 4.810321026418363e-05, "loss": 0.3648, "step": 2803000 }, { "epoch": 18.971280857514074, "grad_norm": 0.39673224091529846, "learning_rate": 4.8102871914248593e-05, "loss": 0.3661, "step": 2803500 }, { "epoch": 18.974664356864444, "grad_norm": 0.3552807867527008, "learning_rate": 4.8102533564313556e-05, "loss": 0.3668, "step": 2804000 }, { "epoch": 18.97804785621481, "grad_norm": 0.3664064109325409, "learning_rate": 4.810219521437852e-05, "loss": 0.3664, "step": 2804500 }, { "epoch": 18.981431355565178, "grad_norm": 0.3757137954235077, "learning_rate": 4.8101856864443487e-05, "loss": 0.3663, "step": 2805000 }, { "epoch": 18.984814854915548, "grad_norm": 0.3732067048549652, "learning_rate": 4.810151851450845e-05, "loss": 0.3664, "step": 2805500 }, { "epoch": 18.988198354265915, "grad_norm": 0.34929749369621277, "learning_rate": 4.810118016457341e-05, "loss": 0.3685, "step": 2806000 }, { "epoch": 18.991581853616285, "grad_norm": 0.3506418764591217, "learning_rate": 4.810084181463837e-05, "loss": 0.3647, "step": 2806500 }, { "epoch": 18.994965352966652, "grad_norm": 0.3531164824962616, "learning_rate": 4.810050346470334e-05, "loss": 0.3657, "step": 2807000 }, { "epoch": 18.99834885231702, "grad_norm": 0.35706275701522827, "learning_rate": 4.8100165114768304e-05, "loss": 0.3659, "step": 2807500 }, { "epoch": 19.0, "eval_accuracy": 0.8607044074124969, "eval_loss": 0.5654244422912598, "eval_runtime": 3394.0031, "eval_samples_per_second": 85.664, "eval_steps_per_second": 5.354, "step": 2807744 }, { "epoch": 19.00173235166739, "grad_norm": 0.34599050879478455, "learning_rate": 4.809982676483326e-05, "loss": 0.3665, "step": 2808000 }, { "epoch": 19.005115851017756, "grad_norm": 0.376055508852005, "learning_rate": 4.809948841489823e-05, "loss": 0.3639, "step": 2808500 }, { "epoch": 19.008499350368126, "grad_norm": 0.39492684602737427, "learning_rate": 4.809915006496319e-05, "loss": 0.3646, "step": 2809000 }, { "epoch": 19.011882849718493, "grad_norm": 0.3591514527797699, "learning_rate": 4.809881171502815e-05, "loss": 0.3638, "step": 2809500 }, { "epoch": 19.01526634906886, "grad_norm": 0.3865591287612915, "learning_rate": 4.8098473365093115e-05, "loss": 0.3626, "step": 2810000 }, { "epoch": 19.01864984841923, "grad_norm": 0.3449145257472992, "learning_rate": 4.8098135015158083e-05, "loss": 0.3637, "step": 2810500 }, { "epoch": 19.022033347769597, "grad_norm": 0.4153270423412323, "learning_rate": 4.8097796665223046e-05, "loss": 0.3634, "step": 2811000 }, { "epoch": 19.025416847119967, "grad_norm": 0.38772618770599365, "learning_rate": 4.809745831528801e-05, "loss": 0.3647, "step": 2811500 }, { "epoch": 19.028800346470334, "grad_norm": 0.34968364238739014, "learning_rate": 4.809711996535297e-05, "loss": 0.3651, "step": 2812000 }, { "epoch": 19.0321838458207, "grad_norm": 0.3845962584018707, "learning_rate": 4.809678161541793e-05, "loss": 0.3647, "step": 2812500 }, { "epoch": 19.03556734517107, "grad_norm": 0.350651353597641, "learning_rate": 4.8096443265482894e-05, "loss": 0.3639, "step": 2813000 }, { "epoch": 19.038950844521437, "grad_norm": 0.3638920187950134, "learning_rate": 4.8096104915547856e-05, "loss": 0.3651, "step": 2813500 }, { "epoch": 19.042334343871804, "grad_norm": 0.3764038681983948, "learning_rate": 4.809576656561282e-05, "loss": 0.3643, "step": 2814000 }, { "epoch": 19.045717843222175, "grad_norm": 0.37085476517677307, "learning_rate": 4.809542821567779e-05, "loss": 0.3635, "step": 2814500 }, { "epoch": 19.04910134257254, "grad_norm": 0.3822312653064728, "learning_rate": 4.809508986574275e-05, "loss": 0.3665, "step": 2815000 }, { "epoch": 19.05248484192291, "grad_norm": 0.34775885939598083, "learning_rate": 4.809475151580771e-05, "loss": 0.3657, "step": 2815500 }, { "epoch": 19.05586834127328, "grad_norm": 0.3567921817302704, "learning_rate": 4.8094413165872674e-05, "loss": 0.3643, "step": 2816000 }, { "epoch": 19.059251840623645, "grad_norm": 0.39007171988487244, "learning_rate": 4.809407481593764e-05, "loss": 0.3645, "step": 2816500 }, { "epoch": 19.062635339974015, "grad_norm": 0.40277397632598877, "learning_rate": 4.8093736466002605e-05, "loss": 0.3656, "step": 2817000 }, { "epoch": 19.066018839324382, "grad_norm": 0.3536837100982666, "learning_rate": 4.809339811606756e-05, "loss": 0.3646, "step": 2817500 }, { "epoch": 19.069402338674752, "grad_norm": 0.36523324251174927, "learning_rate": 4.809305976613253e-05, "loss": 0.3655, "step": 2818000 }, { "epoch": 19.07278583802512, "grad_norm": 0.35634082555770874, "learning_rate": 4.809272141619749e-05, "loss": 0.3645, "step": 2818500 }, { "epoch": 19.076169337375486, "grad_norm": 0.36509403586387634, "learning_rate": 4.809238306626245e-05, "loss": 0.365, "step": 2819000 }, { "epoch": 19.079552836725856, "grad_norm": 0.3636913299560547, "learning_rate": 4.8092044716327415e-05, "loss": 0.3647, "step": 2819500 }, { "epoch": 19.082936336076223, "grad_norm": 0.35474660992622375, "learning_rate": 4.809170636639238e-05, "loss": 0.3653, "step": 2820000 }, { "epoch": 19.086319835426593, "grad_norm": 0.3720208406448364, "learning_rate": 4.8091368016457346e-05, "loss": 0.3631, "step": 2820500 }, { "epoch": 19.08970333477696, "grad_norm": 0.40283653140068054, "learning_rate": 4.809102966652231e-05, "loss": 0.3651, "step": 2821000 }, { "epoch": 19.093086834127327, "grad_norm": 0.36578333377838135, "learning_rate": 4.809069131658727e-05, "loss": 0.3652, "step": 2821500 }, { "epoch": 19.096470333477697, "grad_norm": 0.3494338095188141, "learning_rate": 4.809035296665223e-05, "loss": 0.366, "step": 2822000 }, { "epoch": 19.099853832828064, "grad_norm": 0.39220380783081055, "learning_rate": 4.8090014616717195e-05, "loss": 0.3632, "step": 2822500 }, { "epoch": 19.10323733217843, "grad_norm": 0.39391595125198364, "learning_rate": 4.808967626678216e-05, "loss": 0.3654, "step": 2823000 }, { "epoch": 19.1066208315288, "grad_norm": 0.36776575446128845, "learning_rate": 4.808933791684712e-05, "loss": 0.3645, "step": 2823500 }, { "epoch": 19.110004330879168, "grad_norm": 0.3435305058956146, "learning_rate": 4.808899956691209e-05, "loss": 0.3663, "step": 2824000 }, { "epoch": 19.113387830229538, "grad_norm": 0.412218302488327, "learning_rate": 4.808866121697705e-05, "loss": 0.366, "step": 2824500 }, { "epoch": 19.116771329579905, "grad_norm": 0.3410913944244385, "learning_rate": 4.808832286704201e-05, "loss": 0.3666, "step": 2825000 }, { "epoch": 19.12015482893027, "grad_norm": 0.3647315204143524, "learning_rate": 4.8087984517106974e-05, "loss": 0.3653, "step": 2825500 }, { "epoch": 19.12353832828064, "grad_norm": 0.3654209077358246, "learning_rate": 4.808764616717194e-05, "loss": 0.3651, "step": 2826000 }, { "epoch": 19.12692182763101, "grad_norm": 0.37966057658195496, "learning_rate": 4.8087307817236905e-05, "loss": 0.3647, "step": 2826500 }, { "epoch": 19.13030532698138, "grad_norm": 0.3871251940727234, "learning_rate": 4.808696946730186e-05, "loss": 0.365, "step": 2827000 }, { "epoch": 19.133688826331746, "grad_norm": 0.3365837037563324, "learning_rate": 4.808663111736682e-05, "loss": 0.3656, "step": 2827500 }, { "epoch": 19.137072325682112, "grad_norm": 0.3533678650856018, "learning_rate": 4.808629276743179e-05, "loss": 0.3637, "step": 2828000 }, { "epoch": 19.140455825032483, "grad_norm": 0.3549516499042511, "learning_rate": 4.8085954417496754e-05, "loss": 0.365, "step": 2828500 }, { "epoch": 19.14383932438285, "grad_norm": 0.3680468797683716, "learning_rate": 4.8085616067561716e-05, "loss": 0.3649, "step": 2829000 }, { "epoch": 19.14722282373322, "grad_norm": 0.37863093614578247, "learning_rate": 4.808527771762668e-05, "loss": 0.3652, "step": 2829500 }, { "epoch": 19.150606323083586, "grad_norm": 0.41318458318710327, "learning_rate": 4.808493936769165e-05, "loss": 0.3633, "step": 2830000 }, { "epoch": 19.153989822433953, "grad_norm": 0.37374892830848694, "learning_rate": 4.808460101775661e-05, "loss": 0.3639, "step": 2830500 }, { "epoch": 19.157373321784323, "grad_norm": 0.371120423078537, "learning_rate": 4.808426266782157e-05, "loss": 0.3647, "step": 2831000 }, { "epoch": 19.16075682113469, "grad_norm": 0.3479118347167969, "learning_rate": 4.808392431788653e-05, "loss": 0.3659, "step": 2831500 }, { "epoch": 19.164140320485057, "grad_norm": 0.350894957780838, "learning_rate": 4.8083585967951495e-05, "loss": 0.3657, "step": 2832000 }, { "epoch": 19.167523819835427, "grad_norm": 0.3924980163574219, "learning_rate": 4.808324761801646e-05, "loss": 0.3654, "step": 2832500 }, { "epoch": 19.170907319185794, "grad_norm": 0.349345862865448, "learning_rate": 4.808290926808142e-05, "loss": 0.3649, "step": 2833000 }, { "epoch": 19.174290818536164, "grad_norm": 0.35752519965171814, "learning_rate": 4.808257091814639e-05, "loss": 0.3648, "step": 2833500 }, { "epoch": 19.17767431788653, "grad_norm": 0.36245429515838623, "learning_rate": 4.808223256821135e-05, "loss": 0.3653, "step": 2834000 }, { "epoch": 19.181057817236898, "grad_norm": 0.3374299705028534, "learning_rate": 4.808189421827631e-05, "loss": 0.3647, "step": 2834500 }, { "epoch": 19.184441316587268, "grad_norm": 0.33912578225135803, "learning_rate": 4.8081555868341275e-05, "loss": 0.3641, "step": 2835000 }, { "epoch": 19.187824815937635, "grad_norm": 0.387525349855423, "learning_rate": 4.8081217518406244e-05, "loss": 0.3654, "step": 2835500 }, { "epoch": 19.191208315288005, "grad_norm": 0.32535794377326965, "learning_rate": 4.8080879168471206e-05, "loss": 0.3649, "step": 2836000 }, { "epoch": 19.194591814638372, "grad_norm": 0.38029584288597107, "learning_rate": 4.808054081853616e-05, "loss": 0.3645, "step": 2836500 }, { "epoch": 19.19797531398874, "grad_norm": 0.3509443700313568, "learning_rate": 4.808020246860112e-05, "loss": 0.3646, "step": 2837000 }, { "epoch": 19.20135881333911, "grad_norm": 0.38972824811935425, "learning_rate": 4.807986411866609e-05, "loss": 0.3639, "step": 2837500 }, { "epoch": 19.204742312689476, "grad_norm": 0.38763338327407837, "learning_rate": 4.8079525768731054e-05, "loss": 0.3647, "step": 2838000 }, { "epoch": 19.208125812039842, "grad_norm": 0.35395440459251404, "learning_rate": 4.8079187418796016e-05, "loss": 0.3647, "step": 2838500 }, { "epoch": 19.211509311390213, "grad_norm": 0.36853599548339844, "learning_rate": 4.807884906886098e-05, "loss": 0.3642, "step": 2839000 }, { "epoch": 19.21489281074058, "grad_norm": 0.34220248460769653, "learning_rate": 4.807851071892595e-05, "loss": 0.3656, "step": 2839500 }, { "epoch": 19.21827631009095, "grad_norm": 0.38692814111709595, "learning_rate": 4.807817236899091e-05, "loss": 0.3646, "step": 2840000 }, { "epoch": 19.221659809441316, "grad_norm": 0.3987424969673157, "learning_rate": 4.807783401905587e-05, "loss": 0.3647, "step": 2840500 }, { "epoch": 19.225043308791683, "grad_norm": 0.3585709035396576, "learning_rate": 4.8077495669120834e-05, "loss": 0.366, "step": 2841000 }, { "epoch": 19.228426808142054, "grad_norm": 0.39497193694114685, "learning_rate": 4.8077157319185796e-05, "loss": 0.3672, "step": 2841500 }, { "epoch": 19.23181030749242, "grad_norm": 0.33567896485328674, "learning_rate": 4.807681896925076e-05, "loss": 0.3651, "step": 2842000 }, { "epoch": 19.23519380684279, "grad_norm": 0.34376466274261475, "learning_rate": 4.807648061931572e-05, "loss": 0.3659, "step": 2842500 }, { "epoch": 19.238577306193157, "grad_norm": 0.39341068267822266, "learning_rate": 4.807614226938069e-05, "loss": 0.3652, "step": 2843000 }, { "epoch": 19.241960805543524, "grad_norm": 0.3692178428173065, "learning_rate": 4.807580391944565e-05, "loss": 0.3647, "step": 2843500 }, { "epoch": 19.245344304893894, "grad_norm": 0.350510835647583, "learning_rate": 4.807546556951061e-05, "loss": 0.3657, "step": 2844000 }, { "epoch": 19.24872780424426, "grad_norm": 0.35085275769233704, "learning_rate": 4.8075127219575575e-05, "loss": 0.3645, "step": 2844500 }, { "epoch": 19.25211130359463, "grad_norm": 0.3581259846687317, "learning_rate": 4.8074788869640544e-05, "loss": 0.3664, "step": 2845000 }, { "epoch": 19.255494802944998, "grad_norm": 0.3346034586429596, "learning_rate": 4.8074450519705506e-05, "loss": 0.3654, "step": 2845500 }, { "epoch": 19.258878302295365, "grad_norm": 0.41722241044044495, "learning_rate": 4.807411216977046e-05, "loss": 0.3657, "step": 2846000 }, { "epoch": 19.262261801645735, "grad_norm": 0.33750683069229126, "learning_rate": 4.8073773819835424e-05, "loss": 0.366, "step": 2846500 }, { "epoch": 19.265645300996102, "grad_norm": 0.3772087097167969, "learning_rate": 4.807343546990039e-05, "loss": 0.3657, "step": 2847000 }, { "epoch": 19.26902880034647, "grad_norm": 0.3495555818080902, "learning_rate": 4.8073097119965355e-05, "loss": 0.3654, "step": 2847500 }, { "epoch": 19.27241229969684, "grad_norm": 0.37127164006233215, "learning_rate": 4.807275877003032e-05, "loss": 0.3642, "step": 2848000 }, { "epoch": 19.275795799047206, "grad_norm": 0.3762498199939728, "learning_rate": 4.807242042009528e-05, "loss": 0.3648, "step": 2848500 }, { "epoch": 19.279179298397576, "grad_norm": 0.378359317779541, "learning_rate": 4.807208207016025e-05, "loss": 0.3642, "step": 2849000 }, { "epoch": 19.282562797747943, "grad_norm": 0.38522207736968994, "learning_rate": 4.807174372022521e-05, "loss": 0.366, "step": 2849500 }, { "epoch": 19.28594629709831, "grad_norm": 0.38753610849380493, "learning_rate": 4.807140537029017e-05, "loss": 0.3647, "step": 2850000 }, { "epoch": 19.28932979644868, "grad_norm": 0.3656563460826874, "learning_rate": 4.8071067020355134e-05, "loss": 0.3651, "step": 2850500 }, { "epoch": 19.292713295799047, "grad_norm": 0.3563537001609802, "learning_rate": 4.8070728670420097e-05, "loss": 0.3657, "step": 2851000 }, { "epoch": 19.296096795149417, "grad_norm": 0.3515377342700958, "learning_rate": 4.807039032048506e-05, "loss": 0.3671, "step": 2851500 }, { "epoch": 19.299480294499784, "grad_norm": 0.335602343082428, "learning_rate": 4.807005197055002e-05, "loss": 0.365, "step": 2852000 }, { "epoch": 19.30286379385015, "grad_norm": 0.35789671540260315, "learning_rate": 4.806971362061499e-05, "loss": 0.3666, "step": 2852500 }, { "epoch": 19.30624729320052, "grad_norm": 0.36818790435791016, "learning_rate": 4.806937527067995e-05, "loss": 0.3642, "step": 2853000 }, { "epoch": 19.309630792550887, "grad_norm": 0.34332209825515747, "learning_rate": 4.8069036920744914e-05, "loss": 0.3665, "step": 2853500 }, { "epoch": 19.313014291901254, "grad_norm": 0.3529587388038635, "learning_rate": 4.8068698570809876e-05, "loss": 0.3648, "step": 2854000 }, { "epoch": 19.316397791251624, "grad_norm": 0.3891531527042389, "learning_rate": 4.8068360220874845e-05, "loss": 0.3652, "step": 2854500 }, { "epoch": 19.31978129060199, "grad_norm": 0.3630481958389282, "learning_rate": 4.806802187093981e-05, "loss": 0.3655, "step": 2855000 }, { "epoch": 19.32316478995236, "grad_norm": 0.3677315413951874, "learning_rate": 4.806768352100476e-05, "loss": 0.3662, "step": 2855500 }, { "epoch": 19.32654828930273, "grad_norm": 0.3780626654624939, "learning_rate": 4.8067345171069725e-05, "loss": 0.3641, "step": 2856000 }, { "epoch": 19.329931788653095, "grad_norm": 0.33556923270225525, "learning_rate": 4.8067006821134693e-05, "loss": 0.3641, "step": 2856500 }, { "epoch": 19.333315288003465, "grad_norm": 0.3398982584476471, "learning_rate": 4.8066668471199656e-05, "loss": 0.366, "step": 2857000 }, { "epoch": 19.336698787353832, "grad_norm": 0.33337876200675964, "learning_rate": 4.806633012126462e-05, "loss": 0.3649, "step": 2857500 }, { "epoch": 19.340082286704202, "grad_norm": 0.3427415192127228, "learning_rate": 4.806599177132958e-05, "loss": 0.365, "step": 2858000 }, { "epoch": 19.34346578605457, "grad_norm": 0.33809319138526917, "learning_rate": 4.806565342139455e-05, "loss": 0.364, "step": 2858500 }, { "epoch": 19.346849285404936, "grad_norm": 0.35444408655166626, "learning_rate": 4.806531507145951e-05, "loss": 0.3666, "step": 2859000 }, { "epoch": 19.350232784755306, "grad_norm": 0.3796485364437103, "learning_rate": 4.806497672152447e-05, "loss": 0.3656, "step": 2859500 }, { "epoch": 19.353616284105673, "grad_norm": 0.4037618637084961, "learning_rate": 4.8064638371589435e-05, "loss": 0.3653, "step": 2860000 }, { "epoch": 19.356999783456043, "grad_norm": 0.35624685883522034, "learning_rate": 4.80643000216544e-05, "loss": 0.3633, "step": 2860500 }, { "epoch": 19.36038328280641, "grad_norm": 0.39349284768104553, "learning_rate": 4.806396167171936e-05, "loss": 0.366, "step": 2861000 }, { "epoch": 19.363766782156777, "grad_norm": 0.4229692220687866, "learning_rate": 4.806362332178432e-05, "loss": 0.3653, "step": 2861500 }, { "epoch": 19.367150281507147, "grad_norm": 0.40164726972579956, "learning_rate": 4.806328497184929e-05, "loss": 0.3657, "step": 2862000 }, { "epoch": 19.370533780857514, "grad_norm": 0.37175437808036804, "learning_rate": 4.806294662191425e-05, "loss": 0.365, "step": 2862500 }, { "epoch": 19.37391728020788, "grad_norm": 0.3736785352230072, "learning_rate": 4.8062608271979215e-05, "loss": 0.3643, "step": 2863000 }, { "epoch": 19.37730077955825, "grad_norm": 0.3389967978000641, "learning_rate": 4.806226992204418e-05, "loss": 0.3655, "step": 2863500 }, { "epoch": 19.380684278908618, "grad_norm": 0.35902392864227295, "learning_rate": 4.8061931572109146e-05, "loss": 0.3646, "step": 2864000 }, { "epoch": 19.384067778258988, "grad_norm": 0.37433475255966187, "learning_rate": 4.806159322217411e-05, "loss": 0.3665, "step": 2864500 }, { "epoch": 19.387451277609355, "grad_norm": 0.3930208384990692, "learning_rate": 4.806125487223906e-05, "loss": 0.3655, "step": 2865000 }, { "epoch": 19.39083477695972, "grad_norm": 0.32924091815948486, "learning_rate": 4.8060916522304025e-05, "loss": 0.3652, "step": 2865500 }, { "epoch": 19.39421827631009, "grad_norm": 0.3700878322124481, "learning_rate": 4.8060578172368994e-05, "loss": 0.3646, "step": 2866000 }, { "epoch": 19.39760177566046, "grad_norm": 0.36699455976486206, "learning_rate": 4.8060239822433956e-05, "loss": 0.364, "step": 2866500 }, { "epoch": 19.40098527501083, "grad_norm": 0.3643014132976532, "learning_rate": 4.805990147249892e-05, "loss": 0.3655, "step": 2867000 }, { "epoch": 19.404368774361195, "grad_norm": 0.4105173349380493, "learning_rate": 4.805956312256388e-05, "loss": 0.3665, "step": 2867500 }, { "epoch": 19.407752273711562, "grad_norm": 0.35919809341430664, "learning_rate": 4.805922477262885e-05, "loss": 0.3663, "step": 2868000 }, { "epoch": 19.411135773061932, "grad_norm": 0.3556428551673889, "learning_rate": 4.805888642269381e-05, "loss": 0.3648, "step": 2868500 }, { "epoch": 19.4145192724123, "grad_norm": 0.37149539589881897, "learning_rate": 4.8058548072758774e-05, "loss": 0.3666, "step": 2869000 }, { "epoch": 19.41790277176267, "grad_norm": 0.39399322867393494, "learning_rate": 4.8058209722823736e-05, "loss": 0.3653, "step": 2869500 }, { "epoch": 19.421286271113036, "grad_norm": 0.33901286125183105, "learning_rate": 4.80578713728887e-05, "loss": 0.366, "step": 2870000 }, { "epoch": 19.424669770463403, "grad_norm": 0.33520957827568054, "learning_rate": 4.805753302295366e-05, "loss": 0.3644, "step": 2870500 }, { "epoch": 19.428053269813773, "grad_norm": 0.3945138454437256, "learning_rate": 4.805719467301862e-05, "loss": 0.3666, "step": 2871000 }, { "epoch": 19.43143676916414, "grad_norm": 0.40912050008773804, "learning_rate": 4.805685632308359e-05, "loss": 0.3657, "step": 2871500 }, { "epoch": 19.434820268514507, "grad_norm": 0.3628746271133423, "learning_rate": 4.805651797314855e-05, "loss": 0.3666, "step": 2872000 }, { "epoch": 19.438203767864877, "grad_norm": 0.3478717803955078, "learning_rate": 4.8056179623213515e-05, "loss": 0.3663, "step": 2872500 }, { "epoch": 19.441587267215244, "grad_norm": 0.38110673427581787, "learning_rate": 4.805584127327848e-05, "loss": 0.3654, "step": 2873000 }, { "epoch": 19.444970766565614, "grad_norm": 0.36483538150787354, "learning_rate": 4.805550292334344e-05, "loss": 0.3645, "step": 2873500 }, { "epoch": 19.44835426591598, "grad_norm": 0.41633760929107666, "learning_rate": 4.805516457340841e-05, "loss": 0.3666, "step": 2874000 }, { "epoch": 19.451737765266348, "grad_norm": 0.4003651440143585, "learning_rate": 4.8054826223473364e-05, "loss": 0.3666, "step": 2874500 }, { "epoch": 19.455121264616718, "grad_norm": 0.359640508890152, "learning_rate": 4.8054487873538326e-05, "loss": 0.3648, "step": 2875000 }, { "epoch": 19.458504763967085, "grad_norm": 0.34756699204444885, "learning_rate": 4.8054149523603295e-05, "loss": 0.3643, "step": 2875500 }, { "epoch": 19.461888263317455, "grad_norm": 0.3257155418395996, "learning_rate": 4.805381117366826e-05, "loss": 0.3645, "step": 2876000 }, { "epoch": 19.46527176266782, "grad_norm": 0.3670973479747772, "learning_rate": 4.805347282373322e-05, "loss": 0.3655, "step": 2876500 }, { "epoch": 19.46865526201819, "grad_norm": 0.39196884632110596, "learning_rate": 4.805313447379818e-05, "loss": 0.3677, "step": 2877000 }, { "epoch": 19.47203876136856, "grad_norm": 0.3847583532333374, "learning_rate": 4.805279612386315e-05, "loss": 0.3656, "step": 2877500 }, { "epoch": 19.475422260718926, "grad_norm": 0.3999863862991333, "learning_rate": 4.805245777392811e-05, "loss": 0.3662, "step": 2878000 }, { "epoch": 19.478805760069292, "grad_norm": 0.35730212926864624, "learning_rate": 4.8052119423993074e-05, "loss": 0.3657, "step": 2878500 }, { "epoch": 19.482189259419663, "grad_norm": 0.390097439289093, "learning_rate": 4.8051781074058036e-05, "loss": 0.3641, "step": 2879000 }, { "epoch": 19.48557275877003, "grad_norm": 0.33165496587753296, "learning_rate": 4.8051442724123e-05, "loss": 0.3666, "step": 2879500 }, { "epoch": 19.4889562581204, "grad_norm": 0.396045058965683, "learning_rate": 4.805110437418796e-05, "loss": 0.3648, "step": 2880000 }, { "epoch": 19.492339757470766, "grad_norm": 0.32602620124816895, "learning_rate": 4.805076602425292e-05, "loss": 0.3654, "step": 2880500 }, { "epoch": 19.495723256821133, "grad_norm": 0.362203985452652, "learning_rate": 4.805042767431789e-05, "loss": 0.3661, "step": 2881000 }, { "epoch": 19.499106756171503, "grad_norm": 0.38408005237579346, "learning_rate": 4.8050089324382854e-05, "loss": 0.3656, "step": 2881500 }, { "epoch": 19.50249025552187, "grad_norm": 0.3891410827636719, "learning_rate": 4.8049750974447816e-05, "loss": 0.3658, "step": 2882000 }, { "epoch": 19.50587375487224, "grad_norm": 0.4252306818962097, "learning_rate": 4.804941262451278e-05, "loss": 0.3663, "step": 2882500 }, { "epoch": 19.509257254222607, "grad_norm": 0.3757934868335724, "learning_rate": 4.804907427457774e-05, "loss": 0.3662, "step": 2883000 }, { "epoch": 19.512640753572974, "grad_norm": 0.36446207761764526, "learning_rate": 4.804873592464271e-05, "loss": 0.3638, "step": 2883500 }, { "epoch": 19.516024252923344, "grad_norm": 0.38249266147613525, "learning_rate": 4.8048397574707664e-05, "loss": 0.3657, "step": 2884000 }, { "epoch": 19.51940775227371, "grad_norm": 0.37207794189453125, "learning_rate": 4.8048059224772626e-05, "loss": 0.3671, "step": 2884500 }, { "epoch": 19.52279125162408, "grad_norm": 0.3630044162273407, "learning_rate": 4.8047720874837595e-05, "loss": 0.3663, "step": 2885000 }, { "epoch": 19.526174750974448, "grad_norm": 0.3826695382595062, "learning_rate": 4.804738252490256e-05, "loss": 0.3647, "step": 2885500 }, { "epoch": 19.529558250324815, "grad_norm": 0.4040972590446472, "learning_rate": 4.804704417496752e-05, "loss": 0.3647, "step": 2886000 }, { "epoch": 19.532941749675185, "grad_norm": 0.37629270553588867, "learning_rate": 4.804670582503248e-05, "loss": 0.3667, "step": 2886500 }, { "epoch": 19.536325249025552, "grad_norm": 0.3865208625793457, "learning_rate": 4.804636747509745e-05, "loss": 0.3645, "step": 2887000 }, { "epoch": 19.53970874837592, "grad_norm": 0.3766942024230957, "learning_rate": 4.804602912516241e-05, "loss": 0.3654, "step": 2887500 }, { "epoch": 19.54309224772629, "grad_norm": 0.3663296401500702, "learning_rate": 4.8045690775227375e-05, "loss": 0.3639, "step": 2888000 }, { "epoch": 19.546475747076656, "grad_norm": 0.3530994951725006, "learning_rate": 4.804535242529234e-05, "loss": 0.3671, "step": 2888500 }, { "epoch": 19.549859246427026, "grad_norm": 0.4040890336036682, "learning_rate": 4.80450140753573e-05, "loss": 0.3653, "step": 2889000 }, { "epoch": 19.553242745777393, "grad_norm": 0.33798158168792725, "learning_rate": 4.804467572542226e-05, "loss": 0.3655, "step": 2889500 }, { "epoch": 19.55662624512776, "grad_norm": 0.3921116292476654, "learning_rate": 4.804433737548722e-05, "loss": 0.3654, "step": 2890000 }, { "epoch": 19.56000974447813, "grad_norm": 0.3624851405620575, "learning_rate": 4.8043999025552185e-05, "loss": 0.3671, "step": 2890500 }, { "epoch": 19.563393243828497, "grad_norm": 0.4179016947746277, "learning_rate": 4.8043660675617154e-05, "loss": 0.367, "step": 2891000 }, { "epoch": 19.566776743178867, "grad_norm": 0.34648746252059937, "learning_rate": 4.8043322325682116e-05, "loss": 0.3662, "step": 2891500 }, { "epoch": 19.570160242529234, "grad_norm": 0.3729505240917206, "learning_rate": 4.804298397574708e-05, "loss": 0.3669, "step": 2892000 }, { "epoch": 19.5735437418796, "grad_norm": 0.3661282956600189, "learning_rate": 4.804264562581204e-05, "loss": 0.365, "step": 2892500 }, { "epoch": 19.57692724122997, "grad_norm": 0.38361188769340515, "learning_rate": 4.804230727587701e-05, "loss": 0.3654, "step": 2893000 }, { "epoch": 19.580310740580337, "grad_norm": 0.397332102060318, "learning_rate": 4.8041968925941965e-05, "loss": 0.3665, "step": 2893500 }, { "epoch": 19.583694239930708, "grad_norm": 0.3681514263153076, "learning_rate": 4.804163057600693e-05, "loss": 0.3643, "step": 2894000 }, { "epoch": 19.587077739281074, "grad_norm": 0.3427649736404419, "learning_rate": 4.8041292226071896e-05, "loss": 0.365, "step": 2894500 }, { "epoch": 19.59046123863144, "grad_norm": 0.3386278748512268, "learning_rate": 4.804095387613686e-05, "loss": 0.3677, "step": 2895000 }, { "epoch": 19.59384473798181, "grad_norm": 0.3502010703086853, "learning_rate": 4.804061552620182e-05, "loss": 0.3662, "step": 2895500 }, { "epoch": 19.59722823733218, "grad_norm": 0.34507283568382263, "learning_rate": 4.804027717626678e-05, "loss": 0.3665, "step": 2896000 }, { "epoch": 19.600611736682545, "grad_norm": 0.33213046193122864, "learning_rate": 4.803993882633175e-05, "loss": 0.3658, "step": 2896500 }, { "epoch": 19.603995236032915, "grad_norm": 0.3481467068195343, "learning_rate": 4.803960047639671e-05, "loss": 0.3659, "step": 2897000 }, { "epoch": 19.607378735383282, "grad_norm": 0.3790886402130127, "learning_rate": 4.8039262126461675e-05, "loss": 0.3658, "step": 2897500 }, { "epoch": 19.610762234733652, "grad_norm": 0.35485225915908813, "learning_rate": 4.803892377652664e-05, "loss": 0.3659, "step": 2898000 }, { "epoch": 19.61414573408402, "grad_norm": 0.37360668182373047, "learning_rate": 4.80385854265916e-05, "loss": 0.3668, "step": 2898500 }, { "epoch": 19.617529233434386, "grad_norm": 0.3755984604358673, "learning_rate": 4.803824707665656e-05, "loss": 0.3652, "step": 2899000 }, { "epoch": 19.620912732784756, "grad_norm": 0.3325406014919281, "learning_rate": 4.8037908726721524e-05, "loss": 0.3646, "step": 2899500 }, { "epoch": 19.624296232135123, "grad_norm": 0.38012993335723877, "learning_rate": 4.8037570376786486e-05, "loss": 0.3671, "step": 2900000 }, { "epoch": 19.627679731485493, "grad_norm": 0.39566442370414734, "learning_rate": 4.8037232026851455e-05, "loss": 0.3646, "step": 2900500 }, { "epoch": 19.63106323083586, "grad_norm": 0.35954439640045166, "learning_rate": 4.803689367691642e-05, "loss": 0.3654, "step": 2901000 }, { "epoch": 19.634446730186227, "grad_norm": 0.41132161021232605, "learning_rate": 4.803655532698138e-05, "loss": 0.3667, "step": 2901500 }, { "epoch": 19.637830229536597, "grad_norm": 0.3903202712535858, "learning_rate": 4.803621697704634e-05, "loss": 0.3663, "step": 2902000 }, { "epoch": 19.641213728886964, "grad_norm": 0.35856491327285767, "learning_rate": 4.803587862711131e-05, "loss": 0.3661, "step": 2902500 }, { "epoch": 19.64459722823733, "grad_norm": 0.3773479163646698, "learning_rate": 4.8035540277176266e-05, "loss": 0.3658, "step": 2903000 }, { "epoch": 19.6479807275877, "grad_norm": 0.3696669638156891, "learning_rate": 4.803520192724123e-05, "loss": 0.3672, "step": 2903500 }, { "epoch": 19.651364226938068, "grad_norm": 0.3732285797595978, "learning_rate": 4.80348635773062e-05, "loss": 0.3653, "step": 2904000 }, { "epoch": 19.654747726288438, "grad_norm": 0.367210328578949, "learning_rate": 4.803452522737116e-05, "loss": 0.3669, "step": 2904500 }, { "epoch": 19.658131225638805, "grad_norm": 0.3591116964817047, "learning_rate": 4.803418687743612e-05, "loss": 0.3665, "step": 2905000 }, { "epoch": 19.66151472498917, "grad_norm": 0.3639654815196991, "learning_rate": 4.803384852750108e-05, "loss": 0.3657, "step": 2905500 }, { "epoch": 19.66489822433954, "grad_norm": 0.36325597763061523, "learning_rate": 4.803351017756605e-05, "loss": 0.3672, "step": 2906000 }, { "epoch": 19.66828172368991, "grad_norm": 0.3457540273666382, "learning_rate": 4.8033171827631014e-05, "loss": 0.3655, "step": 2906500 }, { "epoch": 19.67166522304028, "grad_norm": 0.3496015965938568, "learning_rate": 4.8032833477695976e-05, "loss": 0.3639, "step": 2907000 }, { "epoch": 19.675048722390645, "grad_norm": 0.338682621717453, "learning_rate": 4.803249512776094e-05, "loss": 0.3669, "step": 2907500 }, { "epoch": 19.678432221741012, "grad_norm": 0.3519893288612366, "learning_rate": 4.80321567778259e-05, "loss": 0.3663, "step": 2908000 }, { "epoch": 19.681815721091382, "grad_norm": 0.345056414604187, "learning_rate": 4.803181842789086e-05, "loss": 0.3656, "step": 2908500 }, { "epoch": 19.68519922044175, "grad_norm": 0.37303248047828674, "learning_rate": 4.8031480077955825e-05, "loss": 0.3655, "step": 2909000 }, { "epoch": 19.68858271979212, "grad_norm": 0.355040043592453, "learning_rate": 4.803114172802079e-05, "loss": 0.3666, "step": 2909500 }, { "epoch": 19.691966219142486, "grad_norm": 0.3853912949562073, "learning_rate": 4.8030803378085756e-05, "loss": 0.3664, "step": 2910000 }, { "epoch": 19.695349718492853, "grad_norm": 0.3670165240764618, "learning_rate": 4.803046502815072e-05, "loss": 0.3664, "step": 2910500 }, { "epoch": 19.698733217843223, "grad_norm": 0.37371906638145447, "learning_rate": 4.803012667821568e-05, "loss": 0.3643, "step": 2911000 }, { "epoch": 19.70211671719359, "grad_norm": 0.34766170382499695, "learning_rate": 4.802978832828064e-05, "loss": 0.3672, "step": 2911500 }, { "epoch": 19.705500216543957, "grad_norm": 0.3667513430118561, "learning_rate": 4.802944997834561e-05, "loss": 0.3639, "step": 2912000 }, { "epoch": 19.708883715894327, "grad_norm": 0.3334568440914154, "learning_rate": 4.8029111628410566e-05, "loss": 0.3669, "step": 2912500 }, { "epoch": 19.712267215244694, "grad_norm": 0.3696806728839874, "learning_rate": 4.802877327847553e-05, "loss": 0.3657, "step": 2913000 }, { "epoch": 19.715650714595064, "grad_norm": 0.3852570056915283, "learning_rate": 4.80284349285405e-05, "loss": 0.3645, "step": 2913500 }, { "epoch": 19.71903421394543, "grad_norm": 0.3736285865306854, "learning_rate": 4.802809657860546e-05, "loss": 0.3647, "step": 2914000 }, { "epoch": 19.722417713295798, "grad_norm": 0.33064740896224976, "learning_rate": 4.802775822867042e-05, "loss": 0.365, "step": 2914500 }, { "epoch": 19.725801212646168, "grad_norm": 0.3868357837200165, "learning_rate": 4.8027419878735384e-05, "loss": 0.3642, "step": 2915000 }, { "epoch": 19.729184711996535, "grad_norm": 0.39555037021636963, "learning_rate": 4.802708152880035e-05, "loss": 0.3654, "step": 2915500 }, { "epoch": 19.732568211346905, "grad_norm": 0.33659178018569946, "learning_rate": 4.8026743178865315e-05, "loss": 0.3672, "step": 2916000 }, { "epoch": 19.73595171069727, "grad_norm": 0.38515806198120117, "learning_rate": 4.802640482893028e-05, "loss": 0.3665, "step": 2916500 }, { "epoch": 19.73933521004764, "grad_norm": 0.36636361479759216, "learning_rate": 4.802606647899524e-05, "loss": 0.3656, "step": 2917000 }, { "epoch": 19.74271870939801, "grad_norm": 0.3609832227230072, "learning_rate": 4.80257281290602e-05, "loss": 0.3658, "step": 2917500 }, { "epoch": 19.746102208748376, "grad_norm": 0.34398800134658813, "learning_rate": 4.802538977912516e-05, "loss": 0.368, "step": 2918000 }, { "epoch": 19.749485708098746, "grad_norm": 0.3340972363948822, "learning_rate": 4.8025051429190125e-05, "loss": 0.3649, "step": 2918500 }, { "epoch": 19.752869207449113, "grad_norm": 0.35565194487571716, "learning_rate": 4.802471307925509e-05, "loss": 0.3646, "step": 2919000 }, { "epoch": 19.75625270679948, "grad_norm": 0.3764093518257141, "learning_rate": 4.8024374729320056e-05, "loss": 0.3663, "step": 2919500 }, { "epoch": 19.75963620614985, "grad_norm": 0.3450486660003662, "learning_rate": 4.802403637938502e-05, "loss": 0.3665, "step": 2920000 }, { "epoch": 19.763019705500216, "grad_norm": 0.3341296911239624, "learning_rate": 4.802369802944998e-05, "loss": 0.367, "step": 2920500 }, { "epoch": 19.766403204850583, "grad_norm": 0.3815435469150543, "learning_rate": 4.802335967951494e-05, "loss": 0.3658, "step": 2921000 }, { "epoch": 19.769786704200953, "grad_norm": 0.3538355231285095, "learning_rate": 4.802302132957991e-05, "loss": 0.366, "step": 2921500 }, { "epoch": 19.77317020355132, "grad_norm": 0.3608250916004181, "learning_rate": 4.8022682979644874e-05, "loss": 0.3665, "step": 2922000 }, { "epoch": 19.77655370290169, "grad_norm": 0.3507607579231262, "learning_rate": 4.802234462970983e-05, "loss": 0.3648, "step": 2922500 }, { "epoch": 19.779937202252057, "grad_norm": 0.3902914524078369, "learning_rate": 4.80220062797748e-05, "loss": 0.3662, "step": 2923000 }, { "epoch": 19.783320701602424, "grad_norm": 0.37399566173553467, "learning_rate": 4.802166792983976e-05, "loss": 0.3657, "step": 2923500 }, { "epoch": 19.786704200952794, "grad_norm": 0.3999204635620117, "learning_rate": 4.802132957990472e-05, "loss": 0.3642, "step": 2924000 }, { "epoch": 19.79008770030316, "grad_norm": 0.37231069803237915, "learning_rate": 4.8020991229969684e-05, "loss": 0.3668, "step": 2924500 }, { "epoch": 19.79347119965353, "grad_norm": 0.3625221252441406, "learning_rate": 4.802065288003465e-05, "loss": 0.3644, "step": 2925000 }, { "epoch": 19.796854699003898, "grad_norm": 0.3390725255012512, "learning_rate": 4.8020314530099615e-05, "loss": 0.3655, "step": 2925500 }, { "epoch": 19.800238198354265, "grad_norm": 0.3890676498413086, "learning_rate": 4.801997618016458e-05, "loss": 0.3676, "step": 2926000 }, { "epoch": 19.803621697704635, "grad_norm": 0.35610461235046387, "learning_rate": 4.801963783022954e-05, "loss": 0.364, "step": 2926500 }, { "epoch": 19.807005197055002, "grad_norm": 0.3813265264034271, "learning_rate": 4.80192994802945e-05, "loss": 0.3663, "step": 2927000 }, { "epoch": 19.81038869640537, "grad_norm": 0.3406016230583191, "learning_rate": 4.8018961130359464e-05, "loss": 0.3662, "step": 2927500 }, { "epoch": 19.81377219575574, "grad_norm": 0.34513765573501587, "learning_rate": 4.8018622780424426e-05, "loss": 0.3654, "step": 2928000 }, { "epoch": 19.817155695106106, "grad_norm": 0.38791200518608093, "learning_rate": 4.801828443048939e-05, "loss": 0.3656, "step": 2928500 }, { "epoch": 19.820539194456476, "grad_norm": 0.39310234785079956, "learning_rate": 4.801794608055436e-05, "loss": 0.3659, "step": 2929000 }, { "epoch": 19.823922693806843, "grad_norm": 0.3613438606262207, "learning_rate": 4.801760773061932e-05, "loss": 0.3658, "step": 2929500 }, { "epoch": 19.82730619315721, "grad_norm": 0.3606795370578766, "learning_rate": 4.801726938068428e-05, "loss": 0.3651, "step": 2930000 }, { "epoch": 19.83068969250758, "grad_norm": 0.37441298365592957, "learning_rate": 4.801693103074924e-05, "loss": 0.3659, "step": 2930500 }, { "epoch": 19.834073191857946, "grad_norm": 0.3620903789997101, "learning_rate": 4.801659268081421e-05, "loss": 0.3655, "step": 2931000 }, { "epoch": 19.837456691208317, "grad_norm": 0.3679297864437103, "learning_rate": 4.8016254330879174e-05, "loss": 0.3661, "step": 2931500 }, { "epoch": 19.840840190558684, "grad_norm": 0.37004727125167847, "learning_rate": 4.801591598094413e-05, "loss": 0.3663, "step": 2932000 }, { "epoch": 19.84422368990905, "grad_norm": 0.3652530312538147, "learning_rate": 4.80155776310091e-05, "loss": 0.3652, "step": 2932500 }, { "epoch": 19.84760718925942, "grad_norm": 0.35250890254974365, "learning_rate": 4.801523928107406e-05, "loss": 0.3646, "step": 2933000 }, { "epoch": 19.850990688609787, "grad_norm": 0.36383000016212463, "learning_rate": 4.801490093113902e-05, "loss": 0.3659, "step": 2933500 }, { "epoch": 19.854374187960158, "grad_norm": 0.38137707114219666, "learning_rate": 4.8014562581203985e-05, "loss": 0.3663, "step": 2934000 }, { "epoch": 19.857757687310524, "grad_norm": 0.37094008922576904, "learning_rate": 4.8014224231268954e-05, "loss": 0.3646, "step": 2934500 }, { "epoch": 19.86114118666089, "grad_norm": 0.3024352490901947, "learning_rate": 4.8013885881333916e-05, "loss": 0.3663, "step": 2935000 }, { "epoch": 19.86452468601126, "grad_norm": 0.35161951184272766, "learning_rate": 4.801354753139888e-05, "loss": 0.3655, "step": 2935500 }, { "epoch": 19.867908185361628, "grad_norm": 0.37565305829048157, "learning_rate": 4.801320918146384e-05, "loss": 0.3672, "step": 2936000 }, { "epoch": 19.871291684711995, "grad_norm": 0.3331560790538788, "learning_rate": 4.80128708315288e-05, "loss": 0.3673, "step": 2936500 }, { "epoch": 19.874675184062365, "grad_norm": 0.3546275496482849, "learning_rate": 4.8012532481593764e-05, "loss": 0.3664, "step": 2937000 }, { "epoch": 19.878058683412732, "grad_norm": 0.39799919724464417, "learning_rate": 4.8012194131658727e-05, "loss": 0.3662, "step": 2937500 }, { "epoch": 19.881442182763102, "grad_norm": 0.3795362114906311, "learning_rate": 4.801185578172369e-05, "loss": 0.3659, "step": 2938000 }, { "epoch": 19.88482568211347, "grad_norm": 0.3724011480808258, "learning_rate": 4.801151743178866e-05, "loss": 0.3656, "step": 2938500 }, { "epoch": 19.888209181463836, "grad_norm": 0.3911020755767822, "learning_rate": 4.801117908185362e-05, "loss": 0.3666, "step": 2939000 }, { "epoch": 19.891592680814206, "grad_norm": 0.3608606159687042, "learning_rate": 4.801084073191858e-05, "loss": 0.3657, "step": 2939500 }, { "epoch": 19.894976180164573, "grad_norm": 0.35787469148635864, "learning_rate": 4.8010502381983544e-05, "loss": 0.3667, "step": 2940000 }, { "epoch": 19.898359679514943, "grad_norm": 0.40856972336769104, "learning_rate": 4.801016403204851e-05, "loss": 0.3656, "step": 2940500 }, { "epoch": 19.90174317886531, "grad_norm": 0.3542422354221344, "learning_rate": 4.8009825682113475e-05, "loss": 0.3636, "step": 2941000 }, { "epoch": 19.905126678215677, "grad_norm": 0.3684796690940857, "learning_rate": 4.800948733217843e-05, "loss": 0.3663, "step": 2941500 }, { "epoch": 19.908510177566047, "grad_norm": 0.34546706080436707, "learning_rate": 4.80091489822434e-05, "loss": 0.3653, "step": 2942000 }, { "epoch": 19.911893676916414, "grad_norm": 0.3739299476146698, "learning_rate": 4.800881063230836e-05, "loss": 0.3658, "step": 2942500 }, { "epoch": 19.915277176266784, "grad_norm": 0.37413647770881653, "learning_rate": 4.8008472282373323e-05, "loss": 0.3663, "step": 2943000 }, { "epoch": 19.91866067561715, "grad_norm": 0.38979628682136536, "learning_rate": 4.8008133932438286e-05, "loss": 0.3651, "step": 2943500 }, { "epoch": 19.922044174967517, "grad_norm": 0.35113048553466797, "learning_rate": 4.8007795582503254e-05, "loss": 0.3655, "step": 2944000 }, { "epoch": 19.925427674317888, "grad_norm": 0.3997516930103302, "learning_rate": 4.8007457232568217e-05, "loss": 0.3657, "step": 2944500 }, { "epoch": 19.928811173668254, "grad_norm": 0.3631218671798706, "learning_rate": 4.800711888263318e-05, "loss": 0.3655, "step": 2945000 }, { "epoch": 19.93219467301862, "grad_norm": 0.38995862007141113, "learning_rate": 4.800678053269814e-05, "loss": 0.3652, "step": 2945500 }, { "epoch": 19.93557817236899, "grad_norm": 0.36848723888397217, "learning_rate": 4.80064421827631e-05, "loss": 0.3672, "step": 2946000 }, { "epoch": 19.93896167171936, "grad_norm": 0.3414466381072998, "learning_rate": 4.8006103832828065e-05, "loss": 0.3664, "step": 2946500 }, { "epoch": 19.94234517106973, "grad_norm": 0.3705518841743469, "learning_rate": 4.800576548289303e-05, "loss": 0.3662, "step": 2947000 }, { "epoch": 19.945728670420095, "grad_norm": 0.35034966468811035, "learning_rate": 4.800542713295799e-05, "loss": 0.3654, "step": 2947500 }, { "epoch": 19.949112169770462, "grad_norm": 0.41538190841674805, "learning_rate": 4.800508878302296e-05, "loss": 0.3658, "step": 2948000 }, { "epoch": 19.952495669120832, "grad_norm": 0.32482677698135376, "learning_rate": 4.800475043308792e-05, "loss": 0.3678, "step": 2948500 }, { "epoch": 19.9558791684712, "grad_norm": 0.36166203022003174, "learning_rate": 4.800441208315288e-05, "loss": 0.3667, "step": 2949000 }, { "epoch": 19.95926266782157, "grad_norm": 0.38970285654067993, "learning_rate": 4.8004073733217845e-05, "loss": 0.3656, "step": 2949500 }, { "epoch": 19.962646167171936, "grad_norm": 0.35561996698379517, "learning_rate": 4.8003735383282813e-05, "loss": 0.3664, "step": 2950000 }, { "epoch": 19.966029666522303, "grad_norm": 0.34169068932533264, "learning_rate": 4.8003397033347776e-05, "loss": 0.3662, "step": 2950500 }, { "epoch": 19.969413165872673, "grad_norm": 0.3757932484149933, "learning_rate": 4.800305868341273e-05, "loss": 0.3633, "step": 2951000 }, { "epoch": 19.97279666522304, "grad_norm": 0.3738752603530884, "learning_rate": 4.80027203334777e-05, "loss": 0.3663, "step": 2951500 }, { "epoch": 19.976180164573407, "grad_norm": 0.3486514091491699, "learning_rate": 4.800238198354266e-05, "loss": 0.3656, "step": 2952000 }, { "epoch": 19.979563663923777, "grad_norm": 0.3921927809715271, "learning_rate": 4.8002043633607624e-05, "loss": 0.3639, "step": 2952500 }, { "epoch": 19.982947163274144, "grad_norm": 0.3608473241329193, "learning_rate": 4.8001705283672586e-05, "loss": 0.3655, "step": 2953000 }, { "epoch": 19.986330662624514, "grad_norm": 0.3112834095954895, "learning_rate": 4.800136693373755e-05, "loss": 0.3672, "step": 2953500 }, { "epoch": 19.98971416197488, "grad_norm": 0.36548370122909546, "learning_rate": 4.800102858380252e-05, "loss": 0.3642, "step": 2954000 }, { "epoch": 19.993097661325248, "grad_norm": 0.34996843338012695, "learning_rate": 4.800069023386748e-05, "loss": 0.3673, "step": 2954500 }, { "epoch": 19.996481160675618, "grad_norm": 0.3586348295211792, "learning_rate": 4.800035188393244e-05, "loss": 0.3665, "step": 2955000 }, { "epoch": 19.999864660025985, "grad_norm": 0.35456401109695435, "learning_rate": 4.8000013533997404e-05, "loss": 0.3659, "step": 2955500 }, { "epoch": 20.0, "eval_accuracy": 0.8606411793262569, "eval_loss": 0.5660711526870728, "eval_runtime": 3389.5524, "eval_samples_per_second": 85.777, "eval_steps_per_second": 5.361, "step": 2955520 }, { "epoch": 20.003248159376355, "grad_norm": 0.37346386909484863, "learning_rate": 4.7999675184062366e-05, "loss": 0.3626, "step": 2956000 }, { "epoch": 20.00663165872672, "grad_norm": 0.345569908618927, "learning_rate": 4.799933683412733e-05, "loss": 0.3636, "step": 2956500 }, { "epoch": 20.01001515807709, "grad_norm": 0.38589319586753845, "learning_rate": 4.799899848419229e-05, "loss": 0.3638, "step": 2957000 }, { "epoch": 20.01339865742746, "grad_norm": 0.36453381180763245, "learning_rate": 4.799866013425726e-05, "loss": 0.3631, "step": 2957500 }, { "epoch": 20.016782156777825, "grad_norm": 0.38249197602272034, "learning_rate": 4.799832178432222e-05, "loss": 0.3627, "step": 2958000 }, { "epoch": 20.020165656128196, "grad_norm": 0.3455698788166046, "learning_rate": 4.799798343438718e-05, "loss": 0.3636, "step": 2958500 }, { "epoch": 20.023549155478563, "grad_norm": 0.3930585980415344, "learning_rate": 4.7997645084452145e-05, "loss": 0.3642, "step": 2959000 }, { "epoch": 20.02693265482893, "grad_norm": 0.38215509057044983, "learning_rate": 4.7997306734517114e-05, "loss": 0.3632, "step": 2959500 }, { "epoch": 20.0303161541793, "grad_norm": 0.3310193717479706, "learning_rate": 4.7996968384582076e-05, "loss": 0.3644, "step": 2960000 }, { "epoch": 20.033699653529666, "grad_norm": 0.3496049642562866, "learning_rate": 4.799663003464703e-05, "loss": 0.3626, "step": 2960500 }, { "epoch": 20.037083152880033, "grad_norm": 0.3632880449295044, "learning_rate": 4.7996291684711994e-05, "loss": 0.3648, "step": 2961000 }, { "epoch": 20.040466652230403, "grad_norm": 0.4196126461029053, "learning_rate": 4.799595333477696e-05, "loss": 0.3644, "step": 2961500 }, { "epoch": 20.04385015158077, "grad_norm": 0.43219631910324097, "learning_rate": 4.7995614984841925e-05, "loss": 0.3636, "step": 2962000 }, { "epoch": 20.04723365093114, "grad_norm": 0.35603439807891846, "learning_rate": 4.799527663490689e-05, "loss": 0.3657, "step": 2962500 }, { "epoch": 20.050617150281507, "grad_norm": 0.39890220761299133, "learning_rate": 4.799493828497185e-05, "loss": 0.3638, "step": 2963000 }, { "epoch": 20.054000649631874, "grad_norm": 0.351046621799469, "learning_rate": 4.799459993503682e-05, "loss": 0.3647, "step": 2963500 }, { "epoch": 20.057384148982244, "grad_norm": 0.3713405728340149, "learning_rate": 4.799426158510178e-05, "loss": 0.365, "step": 2964000 }, { "epoch": 20.06076764833261, "grad_norm": 0.3425780236721039, "learning_rate": 4.799392323516674e-05, "loss": 0.364, "step": 2964500 }, { "epoch": 20.06415114768298, "grad_norm": 0.359878808259964, "learning_rate": 4.7993584885231704e-05, "loss": 0.3654, "step": 2965000 }, { "epoch": 20.067534647033348, "grad_norm": 0.38747385144233704, "learning_rate": 4.7993246535296666e-05, "loss": 0.3633, "step": 2965500 }, { "epoch": 20.070918146383715, "grad_norm": 0.3746489882469177, "learning_rate": 4.799290818536163e-05, "loss": 0.3642, "step": 2966000 }, { "epoch": 20.074301645734085, "grad_norm": 0.41375428438186646, "learning_rate": 4.799256983542659e-05, "loss": 0.3643, "step": 2966500 }, { "epoch": 20.077685145084452, "grad_norm": 0.4087056517601013, "learning_rate": 4.799223148549156e-05, "loss": 0.365, "step": 2967000 }, { "epoch": 20.08106864443482, "grad_norm": 0.3827906847000122, "learning_rate": 4.799189313555652e-05, "loss": 0.3635, "step": 2967500 }, { "epoch": 20.08445214378519, "grad_norm": 0.34499284625053406, "learning_rate": 4.7991554785621484e-05, "loss": 0.3636, "step": 2968000 }, { "epoch": 20.087835643135556, "grad_norm": 0.3452380895614624, "learning_rate": 4.7991216435686446e-05, "loss": 0.3636, "step": 2968500 }, { "epoch": 20.091219142485926, "grad_norm": 0.3407035171985626, "learning_rate": 4.7990878085751415e-05, "loss": 0.3647, "step": 2969000 }, { "epoch": 20.094602641836293, "grad_norm": 0.37500280141830444, "learning_rate": 4.799053973581638e-05, "loss": 0.3651, "step": 2969500 }, { "epoch": 20.09798614118666, "grad_norm": 0.3894490897655487, "learning_rate": 4.799020138588133e-05, "loss": 0.3635, "step": 2970000 }, { "epoch": 20.10136964053703, "grad_norm": 0.3936595618724823, "learning_rate": 4.7989863035946294e-05, "loss": 0.3644, "step": 2970500 }, { "epoch": 20.104753139887396, "grad_norm": 0.37825560569763184, "learning_rate": 4.798952468601126e-05, "loss": 0.3646, "step": 2971000 }, { "epoch": 20.108136639237767, "grad_norm": 0.3469005823135376, "learning_rate": 4.7989186336076225e-05, "loss": 0.3655, "step": 2971500 }, { "epoch": 20.111520138588133, "grad_norm": 0.37107598781585693, "learning_rate": 4.798884798614119e-05, "loss": 0.3671, "step": 2972000 }, { "epoch": 20.1149036379385, "grad_norm": 0.3491670489311218, "learning_rate": 4.798850963620615e-05, "loss": 0.3643, "step": 2972500 }, { "epoch": 20.11828713728887, "grad_norm": 0.3483146131038666, "learning_rate": 4.798817128627112e-05, "loss": 0.3628, "step": 2973000 }, { "epoch": 20.121670636639237, "grad_norm": 0.35180410742759705, "learning_rate": 4.798783293633608e-05, "loss": 0.3641, "step": 2973500 }, { "epoch": 20.125054135989608, "grad_norm": 0.35226669907569885, "learning_rate": 4.798749458640104e-05, "loss": 0.3652, "step": 2974000 }, { "epoch": 20.128437635339974, "grad_norm": 0.3892536461353302, "learning_rate": 4.7987156236466005e-05, "loss": 0.3654, "step": 2974500 }, { "epoch": 20.13182113469034, "grad_norm": 0.3841457962989807, "learning_rate": 4.798681788653097e-05, "loss": 0.3636, "step": 2975000 }, { "epoch": 20.13520463404071, "grad_norm": 0.3708195090293884, "learning_rate": 4.798647953659593e-05, "loss": 0.3651, "step": 2975500 }, { "epoch": 20.138588133391078, "grad_norm": 0.3810982406139374, "learning_rate": 4.798614118666089e-05, "loss": 0.3646, "step": 2976000 }, { "epoch": 20.141971632741445, "grad_norm": 0.4043063521385193, "learning_rate": 4.798580283672586e-05, "loss": 0.3651, "step": 2976500 }, { "epoch": 20.145355132091815, "grad_norm": 0.345688134431839, "learning_rate": 4.798546448679082e-05, "loss": 0.365, "step": 2977000 }, { "epoch": 20.148738631442182, "grad_norm": 0.3402092456817627, "learning_rate": 4.7985126136855784e-05, "loss": 0.3649, "step": 2977500 }, { "epoch": 20.152122130792552, "grad_norm": 0.3475629985332489, "learning_rate": 4.7984787786920746e-05, "loss": 0.3645, "step": 2978000 }, { "epoch": 20.15550563014292, "grad_norm": 0.38040003180503845, "learning_rate": 4.7984449436985715e-05, "loss": 0.3662, "step": 2978500 }, { "epoch": 20.158889129493286, "grad_norm": 0.3826071619987488, "learning_rate": 4.798411108705068e-05, "loss": 0.3642, "step": 2979000 }, { "epoch": 20.162272628843656, "grad_norm": 0.3706953823566437, "learning_rate": 4.798377273711563e-05, "loss": 0.365, "step": 2979500 }, { "epoch": 20.165656128194023, "grad_norm": 0.3419247269630432, "learning_rate": 4.7983434387180595e-05, "loss": 0.3654, "step": 2980000 }, { "epoch": 20.169039627544393, "grad_norm": 0.4026300013065338, "learning_rate": 4.7983096037245564e-05, "loss": 0.3651, "step": 2980500 }, { "epoch": 20.17242312689476, "grad_norm": 0.39176565408706665, "learning_rate": 4.7982757687310526e-05, "loss": 0.3645, "step": 2981000 }, { "epoch": 20.175806626245127, "grad_norm": 0.3783055245876312, "learning_rate": 4.798241933737549e-05, "loss": 0.3656, "step": 2981500 }, { "epoch": 20.179190125595497, "grad_norm": 0.39230743050575256, "learning_rate": 4.798208098744045e-05, "loss": 0.3649, "step": 2982000 }, { "epoch": 20.182573624945864, "grad_norm": 0.3609783351421356, "learning_rate": 4.798174263750542e-05, "loss": 0.3641, "step": 2982500 }, { "epoch": 20.18595712429623, "grad_norm": 0.34281569719314575, "learning_rate": 4.798140428757038e-05, "loss": 0.3645, "step": 2983000 }, { "epoch": 20.1893406236466, "grad_norm": 0.35994818806648254, "learning_rate": 4.798106593763534e-05, "loss": 0.3657, "step": 2983500 }, { "epoch": 20.192724122996967, "grad_norm": 0.3607349991798401, "learning_rate": 4.7980727587700305e-05, "loss": 0.3642, "step": 2984000 }, { "epoch": 20.196107622347338, "grad_norm": 0.36213696002960205, "learning_rate": 4.798038923776527e-05, "loss": 0.3649, "step": 2984500 }, { "epoch": 20.199491121697704, "grad_norm": 0.37555092573165894, "learning_rate": 4.798005088783023e-05, "loss": 0.3648, "step": 2985000 }, { "epoch": 20.20287462104807, "grad_norm": 0.381056547164917, "learning_rate": 4.797971253789519e-05, "loss": 0.3665, "step": 2985500 }, { "epoch": 20.20625812039844, "grad_norm": 0.375205397605896, "learning_rate": 4.797937418796016e-05, "loss": 0.3645, "step": 2986000 }, { "epoch": 20.20964161974881, "grad_norm": 0.3704332113265991, "learning_rate": 4.797903583802512e-05, "loss": 0.3648, "step": 2986500 }, { "epoch": 20.21302511909918, "grad_norm": 0.359117329120636, "learning_rate": 4.7978697488090085e-05, "loss": 0.3661, "step": 2987000 }, { "epoch": 20.216408618449545, "grad_norm": 0.34534206986427307, "learning_rate": 4.797835913815505e-05, "loss": 0.3656, "step": 2987500 }, { "epoch": 20.219792117799912, "grad_norm": 0.33024686574935913, "learning_rate": 4.7978020788220016e-05, "loss": 0.3643, "step": 2988000 }, { "epoch": 20.223175617150282, "grad_norm": 0.3749881982803345, "learning_rate": 4.797768243828498e-05, "loss": 0.3648, "step": 2988500 }, { "epoch": 20.22655911650065, "grad_norm": 0.35957151651382446, "learning_rate": 4.7977344088349933e-05, "loss": 0.3644, "step": 2989000 }, { "epoch": 20.22994261585102, "grad_norm": 0.3546516001224518, "learning_rate": 4.7977005738414896e-05, "loss": 0.3654, "step": 2989500 }, { "epoch": 20.233326115201386, "grad_norm": 0.4022500514984131, "learning_rate": 4.7976667388479864e-05, "loss": 0.3661, "step": 2990000 }, { "epoch": 20.236709614551753, "grad_norm": 0.3377239406108856, "learning_rate": 4.7976329038544827e-05, "loss": 0.365, "step": 2990500 }, { "epoch": 20.240093113902123, "grad_norm": 0.3646054267883301, "learning_rate": 4.797599068860979e-05, "loss": 0.3636, "step": 2991000 }, { "epoch": 20.24347661325249, "grad_norm": 0.3465125858783722, "learning_rate": 4.797565233867475e-05, "loss": 0.3643, "step": 2991500 }, { "epoch": 20.246860112602857, "grad_norm": 0.38007932901382446, "learning_rate": 4.797531398873972e-05, "loss": 0.3645, "step": 2992000 }, { "epoch": 20.250243611953227, "grad_norm": 0.38943079113960266, "learning_rate": 4.797497563880468e-05, "loss": 0.3646, "step": 2992500 }, { "epoch": 20.253627111303594, "grad_norm": 0.40387022495269775, "learning_rate": 4.7974637288869644e-05, "loss": 0.3644, "step": 2993000 }, { "epoch": 20.257010610653964, "grad_norm": 0.35680386424064636, "learning_rate": 4.7974298938934606e-05, "loss": 0.3647, "step": 2993500 }, { "epoch": 20.26039411000433, "grad_norm": 0.35992133617401123, "learning_rate": 4.797396058899957e-05, "loss": 0.3643, "step": 2994000 }, { "epoch": 20.263777609354698, "grad_norm": 0.3910994529724121, "learning_rate": 4.797362223906453e-05, "loss": 0.3649, "step": 2994500 }, { "epoch": 20.267161108705068, "grad_norm": 0.36142924427986145, "learning_rate": 4.797328388912949e-05, "loss": 0.3655, "step": 2995000 }, { "epoch": 20.270544608055435, "grad_norm": 0.370726078748703, "learning_rate": 4.797294553919446e-05, "loss": 0.3663, "step": 2995500 }, { "epoch": 20.273928107405805, "grad_norm": 0.39392247796058655, "learning_rate": 4.7972607189259423e-05, "loss": 0.3652, "step": 2996000 }, { "epoch": 20.27731160675617, "grad_norm": 0.3471395969390869, "learning_rate": 4.7972268839324386e-05, "loss": 0.3641, "step": 2996500 }, { "epoch": 20.28069510610654, "grad_norm": 0.3578757047653198, "learning_rate": 4.797193048938935e-05, "loss": 0.364, "step": 2997000 }, { "epoch": 20.28407860545691, "grad_norm": 0.3582392930984497, "learning_rate": 4.7971592139454317e-05, "loss": 0.3661, "step": 2997500 }, { "epoch": 20.287462104807275, "grad_norm": 0.3710322380065918, "learning_rate": 4.797125378951928e-05, "loss": 0.3645, "step": 2998000 }, { "epoch": 20.290845604157646, "grad_norm": 0.3882392942905426, "learning_rate": 4.7970915439584234e-05, "loss": 0.3659, "step": 2998500 }, { "epoch": 20.294229103508012, "grad_norm": 0.3817421793937683, "learning_rate": 4.7970577089649196e-05, "loss": 0.3646, "step": 2999000 }, { "epoch": 20.29761260285838, "grad_norm": 0.361188679933548, "learning_rate": 4.7970238739714165e-05, "loss": 0.3632, "step": 2999500 }, { "epoch": 20.30099610220875, "grad_norm": 0.4115106165409088, "learning_rate": 4.796990038977913e-05, "loss": 0.3651, "step": 3000000 }, { "epoch": 20.304379601559116, "grad_norm": 0.37860366702079773, "learning_rate": 4.796956203984409e-05, "loss": 0.3653, "step": 3000500 }, { "epoch": 20.307763100909483, "grad_norm": 0.36756300926208496, "learning_rate": 4.796922368990905e-05, "loss": 0.3647, "step": 3001000 }, { "epoch": 20.311146600259853, "grad_norm": 0.39316004514694214, "learning_rate": 4.796888533997402e-05, "loss": 0.3646, "step": 3001500 }, { "epoch": 20.31453009961022, "grad_norm": 0.39260104298591614, "learning_rate": 4.796854699003898e-05, "loss": 0.3649, "step": 3002000 }, { "epoch": 20.31791359896059, "grad_norm": 0.38872653245925903, "learning_rate": 4.7968208640103945e-05, "loss": 0.3638, "step": 3002500 }, { "epoch": 20.321297098310957, "grad_norm": 0.3718373477458954, "learning_rate": 4.796787029016891e-05, "loss": 0.3653, "step": 3003000 }, { "epoch": 20.324680597661324, "grad_norm": 0.3872230648994446, "learning_rate": 4.796753194023387e-05, "loss": 0.366, "step": 3003500 }, { "epoch": 20.328064097011694, "grad_norm": 0.3719272017478943, "learning_rate": 4.796719359029883e-05, "loss": 0.3657, "step": 3004000 }, { "epoch": 20.33144759636206, "grad_norm": 0.39180269837379456, "learning_rate": 4.796685524036379e-05, "loss": 0.365, "step": 3004500 }, { "epoch": 20.33483109571243, "grad_norm": 0.39898020029067993, "learning_rate": 4.796651689042876e-05, "loss": 0.3644, "step": 3005000 }, { "epoch": 20.338214595062798, "grad_norm": 0.38650649785995483, "learning_rate": 4.7966178540493724e-05, "loss": 0.367, "step": 3005500 }, { "epoch": 20.341598094413165, "grad_norm": 0.3660157024860382, "learning_rate": 4.7965840190558686e-05, "loss": 0.3646, "step": 3006000 }, { "epoch": 20.344981593763535, "grad_norm": 0.32656699419021606, "learning_rate": 4.796550184062365e-05, "loss": 0.3655, "step": 3006500 }, { "epoch": 20.3483650931139, "grad_norm": 0.3373340666294098, "learning_rate": 4.796516349068861e-05, "loss": 0.3666, "step": 3007000 }, { "epoch": 20.35174859246427, "grad_norm": 0.3731749355792999, "learning_rate": 4.796482514075358e-05, "loss": 0.3655, "step": 3007500 }, { "epoch": 20.35513209181464, "grad_norm": 0.33905479311943054, "learning_rate": 4.7964486790818535e-05, "loss": 0.3655, "step": 3008000 }, { "epoch": 20.358515591165006, "grad_norm": 0.36236539483070374, "learning_rate": 4.79641484408835e-05, "loss": 0.3639, "step": 3008500 }, { "epoch": 20.361899090515376, "grad_norm": 0.42300811409950256, "learning_rate": 4.7963810090948466e-05, "loss": 0.3653, "step": 3009000 }, { "epoch": 20.365282589865743, "grad_norm": 0.33264434337615967, "learning_rate": 4.796347174101343e-05, "loss": 0.3645, "step": 3009500 }, { "epoch": 20.36866608921611, "grad_norm": 0.33422377705574036, "learning_rate": 4.796313339107839e-05, "loss": 0.3645, "step": 3010000 }, { "epoch": 20.37204958856648, "grad_norm": 0.390158474445343, "learning_rate": 4.796279504114335e-05, "loss": 0.3637, "step": 3010500 }, { "epoch": 20.375433087916846, "grad_norm": 0.32751229405403137, "learning_rate": 4.796245669120832e-05, "loss": 0.3655, "step": 3011000 }, { "epoch": 20.378816587267217, "grad_norm": 0.3563542366027832, "learning_rate": 4.796211834127328e-05, "loss": 0.3652, "step": 3011500 }, { "epoch": 20.382200086617583, "grad_norm": 0.3355710506439209, "learning_rate": 4.7961779991338245e-05, "loss": 0.3645, "step": 3012000 }, { "epoch": 20.38558358596795, "grad_norm": 0.3966407775878906, "learning_rate": 4.796144164140321e-05, "loss": 0.3646, "step": 3012500 }, { "epoch": 20.38896708531832, "grad_norm": 0.36854368448257446, "learning_rate": 4.796110329146817e-05, "loss": 0.3645, "step": 3013000 }, { "epoch": 20.392350584668687, "grad_norm": 0.3450147807598114, "learning_rate": 4.796076494153313e-05, "loss": 0.3639, "step": 3013500 }, { "epoch": 20.395734084019058, "grad_norm": 0.35542258620262146, "learning_rate": 4.7960426591598094e-05, "loss": 0.3642, "step": 3014000 }, { "epoch": 20.399117583369424, "grad_norm": 0.3457816541194916, "learning_rate": 4.796008824166306e-05, "loss": 0.3646, "step": 3014500 }, { "epoch": 20.40250108271979, "grad_norm": 0.34304627776145935, "learning_rate": 4.7959749891728025e-05, "loss": 0.3653, "step": 3015000 }, { "epoch": 20.40588458207016, "grad_norm": 0.3585280179977417, "learning_rate": 4.795941154179299e-05, "loss": 0.3641, "step": 3015500 }, { "epoch": 20.409268081420528, "grad_norm": 0.3855977952480316, "learning_rate": 4.795907319185795e-05, "loss": 0.3657, "step": 3016000 }, { "epoch": 20.412651580770895, "grad_norm": 0.39115121960639954, "learning_rate": 4.795873484192291e-05, "loss": 0.3647, "step": 3016500 }, { "epoch": 20.416035080121265, "grad_norm": 0.3797233998775482, "learning_rate": 4.795839649198788e-05, "loss": 0.3648, "step": 3017000 }, { "epoch": 20.419418579471632, "grad_norm": 0.4135083556175232, "learning_rate": 4.7958058142052835e-05, "loss": 0.3661, "step": 3017500 }, { "epoch": 20.422802078822002, "grad_norm": 0.3774179518222809, "learning_rate": 4.79577197921178e-05, "loss": 0.3653, "step": 3018000 }, { "epoch": 20.42618557817237, "grad_norm": 0.3676152527332306, "learning_rate": 4.7957381442182766e-05, "loss": 0.3652, "step": 3018500 }, { "epoch": 20.429569077522736, "grad_norm": 0.3528066575527191, "learning_rate": 4.795704309224773e-05, "loss": 0.3651, "step": 3019000 }, { "epoch": 20.432952576873106, "grad_norm": 0.37309351563453674, "learning_rate": 4.795670474231269e-05, "loss": 0.3662, "step": 3019500 }, { "epoch": 20.436336076223473, "grad_norm": 0.3470800518989563, "learning_rate": 4.795636639237765e-05, "loss": 0.3638, "step": 3020000 }, { "epoch": 20.439719575573843, "grad_norm": 0.36600929498672485, "learning_rate": 4.795602804244262e-05, "loss": 0.3652, "step": 3020500 }, { "epoch": 20.44310307492421, "grad_norm": 0.38214531540870667, "learning_rate": 4.7955689692507584e-05, "loss": 0.365, "step": 3021000 }, { "epoch": 20.446486574274576, "grad_norm": 0.3714052140712738, "learning_rate": 4.7955351342572546e-05, "loss": 0.3654, "step": 3021500 }, { "epoch": 20.449870073624947, "grad_norm": 0.38815969228744507, "learning_rate": 4.795501299263751e-05, "loss": 0.367, "step": 3022000 }, { "epoch": 20.453253572975314, "grad_norm": 0.3865346610546112, "learning_rate": 4.795467464270247e-05, "loss": 0.365, "step": 3022500 }, { "epoch": 20.456637072325684, "grad_norm": 0.3771718740463257, "learning_rate": 4.795433629276743e-05, "loss": 0.3642, "step": 3023000 }, { "epoch": 20.46002057167605, "grad_norm": 0.3145259916782379, "learning_rate": 4.7953997942832394e-05, "loss": 0.3649, "step": 3023500 }, { "epoch": 20.463404071026417, "grad_norm": 0.3595036268234253, "learning_rate": 4.7953659592897356e-05, "loss": 0.3642, "step": 3024000 }, { "epoch": 20.466787570376788, "grad_norm": 0.36791813373565674, "learning_rate": 4.7953321242962325e-05, "loss": 0.3657, "step": 3024500 }, { "epoch": 20.470171069727154, "grad_norm": 0.3860068917274475, "learning_rate": 4.795298289302729e-05, "loss": 0.3649, "step": 3025000 }, { "epoch": 20.47355456907752, "grad_norm": 0.379476398229599, "learning_rate": 4.795264454309225e-05, "loss": 0.3651, "step": 3025500 }, { "epoch": 20.47693806842789, "grad_norm": 0.3719246983528137, "learning_rate": 4.795230619315721e-05, "loss": 0.3657, "step": 3026000 }, { "epoch": 20.480321567778258, "grad_norm": 0.3715427815914154, "learning_rate": 4.795196784322218e-05, "loss": 0.3626, "step": 3026500 }, { "epoch": 20.48370506712863, "grad_norm": 0.42802438139915466, "learning_rate": 4.7951629493287136e-05, "loss": 0.3644, "step": 3027000 }, { "epoch": 20.487088566478995, "grad_norm": 0.37511390447616577, "learning_rate": 4.79512911433521e-05, "loss": 0.3648, "step": 3027500 }, { "epoch": 20.490472065829362, "grad_norm": 0.40279248356819153, "learning_rate": 4.795095279341707e-05, "loss": 0.3646, "step": 3028000 }, { "epoch": 20.493855565179732, "grad_norm": 0.37222951650619507, "learning_rate": 4.795061444348203e-05, "loss": 0.3656, "step": 3028500 }, { "epoch": 20.4972390645301, "grad_norm": 0.36907824873924255, "learning_rate": 4.795027609354699e-05, "loss": 0.3651, "step": 3029000 }, { "epoch": 20.50062256388047, "grad_norm": 0.38645726442337036, "learning_rate": 4.794993774361195e-05, "loss": 0.3649, "step": 3029500 }, { "epoch": 20.504006063230836, "grad_norm": 0.34893524646759033, "learning_rate": 4.794959939367692e-05, "loss": 0.364, "step": 3030000 }, { "epoch": 20.507389562581203, "grad_norm": 0.38672298192977905, "learning_rate": 4.7949261043741884e-05, "loss": 0.3656, "step": 3030500 }, { "epoch": 20.510773061931573, "grad_norm": 0.35509294271469116, "learning_rate": 4.7948922693806846e-05, "loss": 0.3647, "step": 3031000 }, { "epoch": 20.51415656128194, "grad_norm": 0.3459559679031372, "learning_rate": 4.794858434387181e-05, "loss": 0.3635, "step": 3031500 }, { "epoch": 20.517540060632307, "grad_norm": 0.3502878248691559, "learning_rate": 4.794824599393677e-05, "loss": 0.3664, "step": 3032000 }, { "epoch": 20.520923559982677, "grad_norm": 0.38622573018074036, "learning_rate": 4.794790764400173e-05, "loss": 0.3642, "step": 3032500 }, { "epoch": 20.524307059333044, "grad_norm": 0.34671613574028015, "learning_rate": 4.7947569294066695e-05, "loss": 0.3654, "step": 3033000 }, { "epoch": 20.527690558683414, "grad_norm": 0.39764460921287537, "learning_rate": 4.794723094413166e-05, "loss": 0.3648, "step": 3033500 }, { "epoch": 20.53107405803378, "grad_norm": 0.36788100004196167, "learning_rate": 4.7946892594196626e-05, "loss": 0.3646, "step": 3034000 }, { "epoch": 20.534457557384147, "grad_norm": 0.3505501449108124, "learning_rate": 4.794655424426159e-05, "loss": 0.3647, "step": 3034500 }, { "epoch": 20.537841056734518, "grad_norm": 0.37173885107040405, "learning_rate": 4.794621589432655e-05, "loss": 0.3658, "step": 3035000 }, { "epoch": 20.541224556084885, "grad_norm": 0.372952401638031, "learning_rate": 4.794587754439151e-05, "loss": 0.3641, "step": 3035500 }, { "epoch": 20.544608055435255, "grad_norm": 0.36666375398635864, "learning_rate": 4.794553919445648e-05, "loss": 0.3642, "step": 3036000 }, { "epoch": 20.54799155478562, "grad_norm": 0.3400515615940094, "learning_rate": 4.794520084452144e-05, "loss": 0.3661, "step": 3036500 }, { "epoch": 20.55137505413599, "grad_norm": 0.3797872066497803, "learning_rate": 4.79448624945864e-05, "loss": 0.3638, "step": 3037000 }, { "epoch": 20.55475855348636, "grad_norm": 0.3844873309135437, "learning_rate": 4.794452414465137e-05, "loss": 0.3652, "step": 3037500 }, { "epoch": 20.558142052836725, "grad_norm": 0.36429187655448914, "learning_rate": 4.794418579471633e-05, "loss": 0.3661, "step": 3038000 }, { "epoch": 20.561525552187096, "grad_norm": 0.3935987949371338, "learning_rate": 4.794384744478129e-05, "loss": 0.3646, "step": 3038500 }, { "epoch": 20.564909051537462, "grad_norm": 0.3462426960468292, "learning_rate": 4.7943509094846254e-05, "loss": 0.3645, "step": 3039000 }, { "epoch": 20.56829255088783, "grad_norm": 0.3837229013442993, "learning_rate": 4.794317074491122e-05, "loss": 0.3644, "step": 3039500 }, { "epoch": 20.5716760502382, "grad_norm": 0.37439703941345215, "learning_rate": 4.7942832394976185e-05, "loss": 0.3663, "step": 3040000 }, { "epoch": 20.575059549588566, "grad_norm": 0.37577924132347107, "learning_rate": 4.794249404504115e-05, "loss": 0.3652, "step": 3040500 }, { "epoch": 20.578443048938933, "grad_norm": 0.35608455538749695, "learning_rate": 4.794215569510611e-05, "loss": 0.3632, "step": 3041000 }, { "epoch": 20.581826548289303, "grad_norm": 0.3622041642665863, "learning_rate": 4.794181734517107e-05, "loss": 0.3674, "step": 3041500 }, { "epoch": 20.58521004763967, "grad_norm": 0.3489411175251007, "learning_rate": 4.7941478995236033e-05, "loss": 0.3655, "step": 3042000 }, { "epoch": 20.58859354699004, "grad_norm": 0.39626896381378174, "learning_rate": 4.7941140645300996e-05, "loss": 0.3648, "step": 3042500 }, { "epoch": 20.591977046340407, "grad_norm": 0.3579977750778198, "learning_rate": 4.794080229536596e-05, "loss": 0.3628, "step": 3043000 }, { "epoch": 20.595360545690774, "grad_norm": 0.3381921947002411, "learning_rate": 4.794046394543093e-05, "loss": 0.3649, "step": 3043500 }, { "epoch": 20.598744045041144, "grad_norm": 0.35242900252342224, "learning_rate": 4.794012559549589e-05, "loss": 0.3641, "step": 3044000 }, { "epoch": 20.60212754439151, "grad_norm": 0.36363518238067627, "learning_rate": 4.793978724556085e-05, "loss": 0.3639, "step": 3044500 }, { "epoch": 20.60551104374188, "grad_norm": 0.40079447627067566, "learning_rate": 4.793944889562581e-05, "loss": 0.3652, "step": 3045000 }, { "epoch": 20.608894543092248, "grad_norm": 0.3724885880947113, "learning_rate": 4.793911054569078e-05, "loss": 0.364, "step": 3045500 }, { "epoch": 20.612278042442615, "grad_norm": 0.3565116226673126, "learning_rate": 4.7938772195755744e-05, "loss": 0.3655, "step": 3046000 }, { "epoch": 20.615661541792985, "grad_norm": 0.329285591840744, "learning_rate": 4.79384338458207e-05, "loss": 0.3657, "step": 3046500 }, { "epoch": 20.61904504114335, "grad_norm": 0.35131943225860596, "learning_rate": 4.793809549588567e-05, "loss": 0.3676, "step": 3047000 }, { "epoch": 20.622428540493722, "grad_norm": 0.37161844968795776, "learning_rate": 4.793775714595063e-05, "loss": 0.364, "step": 3047500 }, { "epoch": 20.62581203984409, "grad_norm": 0.3753008544445038, "learning_rate": 4.793741879601559e-05, "loss": 0.3657, "step": 3048000 }, { "epoch": 20.629195539194455, "grad_norm": 0.3838074207305908, "learning_rate": 4.7937080446080555e-05, "loss": 0.3653, "step": 3048500 }, { "epoch": 20.632579038544826, "grad_norm": 0.39390793442726135, "learning_rate": 4.7936742096145524e-05, "loss": 0.3653, "step": 3049000 }, { "epoch": 20.635962537895193, "grad_norm": 0.38033226132392883, "learning_rate": 4.7936403746210486e-05, "loss": 0.3652, "step": 3049500 }, { "epoch": 20.63934603724556, "grad_norm": 0.35507896542549133, "learning_rate": 4.793606539627545e-05, "loss": 0.3656, "step": 3050000 }, { "epoch": 20.64272953659593, "grad_norm": 0.348247230052948, "learning_rate": 4.793572704634041e-05, "loss": 0.365, "step": 3050500 }, { "epoch": 20.646113035946296, "grad_norm": 0.38723573088645935, "learning_rate": 4.793538869640537e-05, "loss": 0.3659, "step": 3051000 }, { "epoch": 20.649496535296667, "grad_norm": 0.3930349349975586, "learning_rate": 4.7935050346470334e-05, "loss": 0.3634, "step": 3051500 }, { "epoch": 20.652880034647033, "grad_norm": 0.3427135646343231, "learning_rate": 4.7934711996535296e-05, "loss": 0.3661, "step": 3052000 }, { "epoch": 20.6562635339974, "grad_norm": 0.359590619802475, "learning_rate": 4.793437364660026e-05, "loss": 0.3658, "step": 3052500 }, { "epoch": 20.65964703334777, "grad_norm": 0.35423997044563293, "learning_rate": 4.793403529666523e-05, "loss": 0.3663, "step": 3053000 }, { "epoch": 20.663030532698137, "grad_norm": 0.33582478761672974, "learning_rate": 4.793369694673019e-05, "loss": 0.3656, "step": 3053500 }, { "epoch": 20.666414032048507, "grad_norm": 0.3541942536830902, "learning_rate": 4.793335859679515e-05, "loss": 0.3646, "step": 3054000 }, { "epoch": 20.669797531398874, "grad_norm": 0.368379145860672, "learning_rate": 4.7933020246860114e-05, "loss": 0.3657, "step": 3054500 }, { "epoch": 20.67318103074924, "grad_norm": 0.37707754969596863, "learning_rate": 4.793268189692508e-05, "loss": 0.3646, "step": 3055000 }, { "epoch": 20.67656453009961, "grad_norm": 0.37721332907676697, "learning_rate": 4.7932343546990045e-05, "loss": 0.3643, "step": 3055500 }, { "epoch": 20.679948029449978, "grad_norm": 0.3578668236732483, "learning_rate": 4.7932005197055e-05, "loss": 0.3637, "step": 3056000 }, { "epoch": 20.683331528800345, "grad_norm": 0.37248265743255615, "learning_rate": 4.793166684711997e-05, "loss": 0.3658, "step": 3056500 }, { "epoch": 20.686715028150715, "grad_norm": 0.42079028487205505, "learning_rate": 4.793132849718493e-05, "loss": 0.3663, "step": 3057000 }, { "epoch": 20.690098527501082, "grad_norm": 0.35661521553993225, "learning_rate": 4.793099014724989e-05, "loss": 0.3648, "step": 3057500 }, { "epoch": 20.693482026851452, "grad_norm": 0.36780673265457153, "learning_rate": 4.7930651797314855e-05, "loss": 0.3653, "step": 3058000 }, { "epoch": 20.69686552620182, "grad_norm": 0.3804129660129547, "learning_rate": 4.7930313447379824e-05, "loss": 0.3659, "step": 3058500 }, { "epoch": 20.700249025552186, "grad_norm": 0.3678402900695801, "learning_rate": 4.7929975097444786e-05, "loss": 0.3648, "step": 3059000 }, { "epoch": 20.703632524902556, "grad_norm": 0.34290778636932373, "learning_rate": 4.792963674750975e-05, "loss": 0.3639, "step": 3059500 }, { "epoch": 20.707016024252923, "grad_norm": 0.32359635829925537, "learning_rate": 4.792929839757471e-05, "loss": 0.3663, "step": 3060000 }, { "epoch": 20.710399523603293, "grad_norm": 0.3486013114452362, "learning_rate": 4.792896004763967e-05, "loss": 0.3656, "step": 3060500 }, { "epoch": 20.71378302295366, "grad_norm": 0.3553240895271301, "learning_rate": 4.7928621697704635e-05, "loss": 0.364, "step": 3061000 }, { "epoch": 20.717166522304026, "grad_norm": 0.3806537687778473, "learning_rate": 4.79282833477696e-05, "loss": 0.3658, "step": 3061500 }, { "epoch": 20.720550021654397, "grad_norm": 0.43344950675964355, "learning_rate": 4.792794499783456e-05, "loss": 0.3642, "step": 3062000 }, { "epoch": 20.723933521004763, "grad_norm": 0.3625841438770294, "learning_rate": 4.792760664789953e-05, "loss": 0.3652, "step": 3062500 }, { "epoch": 20.727317020355134, "grad_norm": 0.4011187255382538, "learning_rate": 4.792726829796449e-05, "loss": 0.3649, "step": 3063000 }, { "epoch": 20.7307005197055, "grad_norm": 0.38117992877960205, "learning_rate": 4.792692994802945e-05, "loss": 0.3653, "step": 3063500 }, { "epoch": 20.734084019055867, "grad_norm": 0.4144648313522339, "learning_rate": 4.7926591598094414e-05, "loss": 0.3635, "step": 3064000 }, { "epoch": 20.737467518406238, "grad_norm": 0.3855360746383667, "learning_rate": 4.792625324815938e-05, "loss": 0.3667, "step": 3064500 }, { "epoch": 20.740851017756604, "grad_norm": 0.37165096402168274, "learning_rate": 4.7925914898224345e-05, "loss": 0.3652, "step": 3065000 }, { "epoch": 20.74423451710697, "grad_norm": 0.3604426383972168, "learning_rate": 4.79255765482893e-05, "loss": 0.3662, "step": 3065500 }, { "epoch": 20.74761801645734, "grad_norm": 0.3857273757457733, "learning_rate": 4.792523819835427e-05, "loss": 0.3654, "step": 3066000 }, { "epoch": 20.751001515807708, "grad_norm": 0.3289198577404022, "learning_rate": 4.792489984841923e-05, "loss": 0.3633, "step": 3066500 }, { "epoch": 20.75438501515808, "grad_norm": 0.3403366208076477, "learning_rate": 4.7924561498484194e-05, "loss": 0.3654, "step": 3067000 }, { "epoch": 20.757768514508445, "grad_norm": 0.3487418293952942, "learning_rate": 4.7924223148549156e-05, "loss": 0.3638, "step": 3067500 }, { "epoch": 20.761152013858812, "grad_norm": 0.3301853537559509, "learning_rate": 4.7923884798614125e-05, "loss": 0.3647, "step": 3068000 }, { "epoch": 20.764535513209182, "grad_norm": 0.3602818250656128, "learning_rate": 4.792354644867909e-05, "loss": 0.366, "step": 3068500 }, { "epoch": 20.76791901255955, "grad_norm": 0.3686750531196594, "learning_rate": 4.792320809874405e-05, "loss": 0.3658, "step": 3069000 }, { "epoch": 20.77130251190992, "grad_norm": 0.36918964982032776, "learning_rate": 4.792286974880901e-05, "loss": 0.3648, "step": 3069500 }, { "epoch": 20.774686011260286, "grad_norm": 0.3289744257926941, "learning_rate": 4.792253139887397e-05, "loss": 0.3661, "step": 3070000 }, { "epoch": 20.778069510610653, "grad_norm": 0.36303091049194336, "learning_rate": 4.7922193048938935e-05, "loss": 0.3663, "step": 3070500 }, { "epoch": 20.781453009961023, "grad_norm": 0.34788212180137634, "learning_rate": 4.79218546990039e-05, "loss": 0.3651, "step": 3071000 }, { "epoch": 20.78483650931139, "grad_norm": 0.35184186697006226, "learning_rate": 4.792151634906886e-05, "loss": 0.3644, "step": 3071500 }, { "epoch": 20.78822000866176, "grad_norm": 0.3535412549972534, "learning_rate": 4.792117799913383e-05, "loss": 0.3652, "step": 3072000 }, { "epoch": 20.791603508012127, "grad_norm": 0.3471745252609253, "learning_rate": 4.792083964919879e-05, "loss": 0.3658, "step": 3072500 }, { "epoch": 20.794987007362494, "grad_norm": 0.3394944965839386, "learning_rate": 4.792050129926375e-05, "loss": 0.3651, "step": 3073000 }, { "epoch": 20.798370506712864, "grad_norm": 0.3789084851741791, "learning_rate": 4.7920162949328715e-05, "loss": 0.3655, "step": 3073500 }, { "epoch": 20.80175400606323, "grad_norm": 0.34156209230422974, "learning_rate": 4.7919824599393684e-05, "loss": 0.3639, "step": 3074000 }, { "epoch": 20.805137505413597, "grad_norm": 0.3876127302646637, "learning_rate": 4.7919486249458646e-05, "loss": 0.3637, "step": 3074500 }, { "epoch": 20.808521004763968, "grad_norm": 0.3334454298019409, "learning_rate": 4.79191478995236e-05, "loss": 0.367, "step": 3075000 }, { "epoch": 20.811904504114334, "grad_norm": 0.39533373713493347, "learning_rate": 4.791880954958857e-05, "loss": 0.366, "step": 3075500 }, { "epoch": 20.815288003464705, "grad_norm": 0.35846906900405884, "learning_rate": 4.791847119965353e-05, "loss": 0.3665, "step": 3076000 }, { "epoch": 20.81867150281507, "grad_norm": 0.3482748568058014, "learning_rate": 4.7918132849718494e-05, "loss": 0.364, "step": 3076500 }, { "epoch": 20.82205500216544, "grad_norm": 0.37502777576446533, "learning_rate": 4.7917794499783457e-05, "loss": 0.3644, "step": 3077000 }, { "epoch": 20.82543850151581, "grad_norm": 0.3282943069934845, "learning_rate": 4.791745614984842e-05, "loss": 0.3645, "step": 3077500 }, { "epoch": 20.828822000866175, "grad_norm": 0.3873825669288635, "learning_rate": 4.791711779991339e-05, "loss": 0.3653, "step": 3078000 }, { "epoch": 20.832205500216546, "grad_norm": 0.410609632730484, "learning_rate": 4.791677944997835e-05, "loss": 0.3653, "step": 3078500 }, { "epoch": 20.835588999566912, "grad_norm": 0.413093626499176, "learning_rate": 4.791644110004331e-05, "loss": 0.3641, "step": 3079000 }, { "epoch": 20.83897249891728, "grad_norm": 0.33067917823791504, "learning_rate": 4.7916102750108274e-05, "loss": 0.3659, "step": 3079500 }, { "epoch": 20.84235599826765, "grad_norm": 0.36715462803840637, "learning_rate": 4.7915764400173236e-05, "loss": 0.3639, "step": 3080000 }, { "epoch": 20.845739497618016, "grad_norm": 0.3519304692745209, "learning_rate": 4.79154260502382e-05, "loss": 0.364, "step": 3080500 }, { "epoch": 20.849122996968383, "grad_norm": 0.39291125535964966, "learning_rate": 4.791508770030316e-05, "loss": 0.3658, "step": 3081000 }, { "epoch": 20.852506496318753, "grad_norm": 0.40021994709968567, "learning_rate": 4.791474935036813e-05, "loss": 0.3662, "step": 3081500 }, { "epoch": 20.85588999566912, "grad_norm": 0.36764124035835266, "learning_rate": 4.791441100043309e-05, "loss": 0.364, "step": 3082000 }, { "epoch": 20.85927349501949, "grad_norm": 0.3566446304321289, "learning_rate": 4.7914072650498053e-05, "loss": 0.3659, "step": 3082500 }, { "epoch": 20.862656994369857, "grad_norm": 0.3574284017086029, "learning_rate": 4.7913734300563016e-05, "loss": 0.3656, "step": 3083000 }, { "epoch": 20.866040493720224, "grad_norm": 0.35197603702545166, "learning_rate": 4.7913395950627984e-05, "loss": 0.3657, "step": 3083500 }, { "epoch": 20.869423993070594, "grad_norm": 0.34930962324142456, "learning_rate": 4.7913057600692947e-05, "loss": 0.3659, "step": 3084000 }, { "epoch": 20.87280749242096, "grad_norm": 0.35749518871307373, "learning_rate": 4.79127192507579e-05, "loss": 0.3658, "step": 3084500 }, { "epoch": 20.87619099177133, "grad_norm": 0.35430794954299927, "learning_rate": 4.791238090082287e-05, "loss": 0.365, "step": 3085000 }, { "epoch": 20.879574491121698, "grad_norm": 0.37598565220832825, "learning_rate": 4.791204255088783e-05, "loss": 0.3649, "step": 3085500 }, { "epoch": 20.882957990472065, "grad_norm": 0.3943885564804077, "learning_rate": 4.7911704200952795e-05, "loss": 0.3645, "step": 3086000 }, { "epoch": 20.886341489822435, "grad_norm": 0.37297797203063965, "learning_rate": 4.791136585101776e-05, "loss": 0.3637, "step": 3086500 }, { "epoch": 20.8897249891728, "grad_norm": 0.3791876435279846, "learning_rate": 4.791102750108272e-05, "loss": 0.3652, "step": 3087000 }, { "epoch": 20.89310848852317, "grad_norm": 0.3426647484302521, "learning_rate": 4.791068915114769e-05, "loss": 0.3657, "step": 3087500 }, { "epoch": 20.89649198787354, "grad_norm": 0.3604070842266083, "learning_rate": 4.791035080121265e-05, "loss": 0.3653, "step": 3088000 }, { "epoch": 20.899875487223905, "grad_norm": 0.381246417760849, "learning_rate": 4.791001245127761e-05, "loss": 0.3649, "step": 3088500 }, { "epoch": 20.903258986574276, "grad_norm": 0.39292630553245544, "learning_rate": 4.7909674101342575e-05, "loss": 0.3643, "step": 3089000 }, { "epoch": 20.906642485924642, "grad_norm": 0.36294469237327576, "learning_rate": 4.790933575140754e-05, "loss": 0.3632, "step": 3089500 }, { "epoch": 20.91002598527501, "grad_norm": 0.37999242544174194, "learning_rate": 4.79089974014725e-05, "loss": 0.3635, "step": 3090000 }, { "epoch": 20.91340948462538, "grad_norm": 0.3372073769569397, "learning_rate": 4.790865905153746e-05, "loss": 0.3645, "step": 3090500 }, { "epoch": 20.916792983975746, "grad_norm": 0.44187304377555847, "learning_rate": 4.790832070160243e-05, "loss": 0.3648, "step": 3091000 }, { "epoch": 20.920176483326117, "grad_norm": 0.3418973982334137, "learning_rate": 4.790798235166739e-05, "loss": 0.3633, "step": 3091500 }, { "epoch": 20.923559982676483, "grad_norm": 0.3711940050125122, "learning_rate": 4.7907644001732354e-05, "loss": 0.3639, "step": 3092000 }, { "epoch": 20.92694348202685, "grad_norm": 0.33627617359161377, "learning_rate": 4.7907305651797316e-05, "loss": 0.3651, "step": 3092500 }, { "epoch": 20.93032698137722, "grad_norm": 0.37105128169059753, "learning_rate": 4.7906967301862285e-05, "loss": 0.3656, "step": 3093000 }, { "epoch": 20.933710480727587, "grad_norm": 0.3730045557022095, "learning_rate": 4.790662895192725e-05, "loss": 0.3647, "step": 3093500 }, { "epoch": 20.937093980077957, "grad_norm": 0.39295563101768494, "learning_rate": 4.79062906019922e-05, "loss": 0.3654, "step": 3094000 }, { "epoch": 20.940477479428324, "grad_norm": 0.341587096452713, "learning_rate": 4.7905952252057165e-05, "loss": 0.3658, "step": 3094500 }, { "epoch": 20.94386097877869, "grad_norm": 0.3795209527015686, "learning_rate": 4.7905613902122134e-05, "loss": 0.3664, "step": 3095000 }, { "epoch": 20.94724447812906, "grad_norm": 0.3408762812614441, "learning_rate": 4.7905275552187096e-05, "loss": 0.3647, "step": 3095500 }, { "epoch": 20.950627977479428, "grad_norm": 0.34244951605796814, "learning_rate": 4.790493720225206e-05, "loss": 0.3669, "step": 3096000 }, { "epoch": 20.9540114768298, "grad_norm": 0.41083306074142456, "learning_rate": 4.790459885231702e-05, "loss": 0.3661, "step": 3096500 }, { "epoch": 20.957394976180165, "grad_norm": 0.36155903339385986, "learning_rate": 4.790426050238199e-05, "loss": 0.3632, "step": 3097000 }, { "epoch": 20.96077847553053, "grad_norm": 0.36875125765800476, "learning_rate": 4.790392215244695e-05, "loss": 0.3651, "step": 3097500 }, { "epoch": 20.964161974880902, "grad_norm": 0.350536972284317, "learning_rate": 4.790358380251191e-05, "loss": 0.366, "step": 3098000 }, { "epoch": 20.96754547423127, "grad_norm": 0.36575838923454285, "learning_rate": 4.7903245452576875e-05, "loss": 0.3662, "step": 3098500 }, { "epoch": 20.970928973581636, "grad_norm": 0.36057087779045105, "learning_rate": 4.790290710264184e-05, "loss": 0.3666, "step": 3099000 }, { "epoch": 20.974312472932006, "grad_norm": 0.38979464769363403, "learning_rate": 4.79025687527068e-05, "loss": 0.3644, "step": 3099500 }, { "epoch": 20.977695972282373, "grad_norm": 0.33669137954711914, "learning_rate": 4.790223040277176e-05, "loss": 0.367, "step": 3100000 }, { "epoch": 20.981079471632743, "grad_norm": 0.380115270614624, "learning_rate": 4.790189205283673e-05, "loss": 0.3645, "step": 3100500 }, { "epoch": 20.98446297098311, "grad_norm": 0.40111979842185974, "learning_rate": 4.790155370290169e-05, "loss": 0.3652, "step": 3101000 }, { "epoch": 20.987846470333476, "grad_norm": 0.38543492555618286, "learning_rate": 4.7901215352966655e-05, "loss": 0.366, "step": 3101500 }, { "epoch": 20.991229969683847, "grad_norm": 0.35383349657058716, "learning_rate": 4.790087700303162e-05, "loss": 0.3644, "step": 3102000 }, { "epoch": 20.994613469034213, "grad_norm": 0.33337080478668213, "learning_rate": 4.7900538653096586e-05, "loss": 0.363, "step": 3102500 }, { "epoch": 20.997996968384584, "grad_norm": 0.3798995614051819, "learning_rate": 4.790020030316155e-05, "loss": 0.3633, "step": 3103000 }, { "epoch": 21.0, "eval_accuracy": 0.8607153645715448, "eval_loss": 0.5653529763221741, "eval_runtime": 3396.8155, "eval_samples_per_second": 85.593, "eval_steps_per_second": 5.35, "step": 3103296 }, { "epoch": 21.00138046773495, "grad_norm": 0.3778703510761261, "learning_rate": 4.78998619532265e-05, "loss": 0.3644, "step": 3103500 }, { "epoch": 21.004763967085317, "grad_norm": 0.369404137134552, "learning_rate": 4.7899523603291465e-05, "loss": 0.3625, "step": 3104000 }, { "epoch": 21.008147466435688, "grad_norm": 0.36425143480300903, "learning_rate": 4.7899185253356434e-05, "loss": 0.3633, "step": 3104500 }, { "epoch": 21.011530965786054, "grad_norm": 0.3674194812774658, "learning_rate": 4.7898846903421396e-05, "loss": 0.3634, "step": 3105000 }, { "epoch": 21.01491446513642, "grad_norm": 0.3845115303993225, "learning_rate": 4.789850855348636e-05, "loss": 0.3642, "step": 3105500 }, { "epoch": 21.01829796448679, "grad_norm": 0.38932108879089355, "learning_rate": 4.789817020355132e-05, "loss": 0.3611, "step": 3106000 }, { "epoch": 21.021681463837158, "grad_norm": 0.39633065462112427, "learning_rate": 4.789783185361629e-05, "loss": 0.3667, "step": 3106500 }, { "epoch": 21.02506496318753, "grad_norm": 0.39111411571502686, "learning_rate": 4.789749350368125e-05, "loss": 0.3632, "step": 3107000 }, { "epoch": 21.028448462537895, "grad_norm": 0.38436001539230347, "learning_rate": 4.7897155153746214e-05, "loss": 0.3639, "step": 3107500 }, { "epoch": 21.031831961888262, "grad_norm": 0.40379461646080017, "learning_rate": 4.7896816803811176e-05, "loss": 0.3638, "step": 3108000 }, { "epoch": 21.035215461238632, "grad_norm": 0.41419756412506104, "learning_rate": 4.789647845387614e-05, "loss": 0.3638, "step": 3108500 }, { "epoch": 21.038598960589, "grad_norm": 0.3712766766548157, "learning_rate": 4.78961401039411e-05, "loss": 0.3622, "step": 3109000 }, { "epoch": 21.04198245993937, "grad_norm": 0.38608309626579285, "learning_rate": 4.789580175400606e-05, "loss": 0.3614, "step": 3109500 }, { "epoch": 21.045365959289736, "grad_norm": 0.3591993749141693, "learning_rate": 4.789546340407103e-05, "loss": 0.3638, "step": 3110000 }, { "epoch": 21.048749458640103, "grad_norm": 0.3584752678871155, "learning_rate": 4.789512505413599e-05, "loss": 0.3625, "step": 3110500 }, { "epoch": 21.052132957990473, "grad_norm": 0.39413151144981384, "learning_rate": 4.7894786704200955e-05, "loss": 0.3618, "step": 3111000 }, { "epoch": 21.05551645734084, "grad_norm": 0.3555285632610321, "learning_rate": 4.789444835426592e-05, "loss": 0.3626, "step": 3111500 }, { "epoch": 21.05889995669121, "grad_norm": 0.37488609552383423, "learning_rate": 4.7894110004330886e-05, "loss": 0.3642, "step": 3112000 }, { "epoch": 21.062283456041577, "grad_norm": 0.36124318838119507, "learning_rate": 4.789377165439585e-05, "loss": 0.3657, "step": 3112500 }, { "epoch": 21.065666955391944, "grad_norm": 0.35256969928741455, "learning_rate": 4.7893433304460804e-05, "loss": 0.3627, "step": 3113000 }, { "epoch": 21.069050454742314, "grad_norm": 0.3735707104206085, "learning_rate": 4.7893094954525766e-05, "loss": 0.3637, "step": 3113500 }, { "epoch": 21.07243395409268, "grad_norm": 0.4027535021305084, "learning_rate": 4.7892756604590735e-05, "loss": 0.3633, "step": 3114000 }, { "epoch": 21.075817453443047, "grad_norm": 0.3611033856868744, "learning_rate": 4.78924182546557e-05, "loss": 0.3631, "step": 3114500 }, { "epoch": 21.079200952793418, "grad_norm": 0.3534304201602936, "learning_rate": 4.789207990472066e-05, "loss": 0.3635, "step": 3115000 }, { "epoch": 21.082584452143784, "grad_norm": 0.32752498984336853, "learning_rate": 4.789174155478562e-05, "loss": 0.3637, "step": 3115500 }, { "epoch": 21.085967951494155, "grad_norm": 0.34558671712875366, "learning_rate": 4.789140320485059e-05, "loss": 0.3635, "step": 3116000 }, { "epoch": 21.08935145084452, "grad_norm": 0.31938326358795166, "learning_rate": 4.789106485491555e-05, "loss": 0.363, "step": 3116500 }, { "epoch": 21.092734950194888, "grad_norm": 0.4072025716304779, "learning_rate": 4.7890726504980514e-05, "loss": 0.3634, "step": 3117000 }, { "epoch": 21.09611844954526, "grad_norm": 0.33352988958358765, "learning_rate": 4.7890388155045476e-05, "loss": 0.3639, "step": 3117500 }, { "epoch": 21.099501948895625, "grad_norm": 0.42230895161628723, "learning_rate": 4.789004980511044e-05, "loss": 0.3637, "step": 3118000 }, { "epoch": 21.102885448245996, "grad_norm": 0.3735465109348297, "learning_rate": 4.78897114551754e-05, "loss": 0.3639, "step": 3118500 }, { "epoch": 21.106268947596362, "grad_norm": 0.3831036686897278, "learning_rate": 4.788937310524036e-05, "loss": 0.3628, "step": 3119000 }, { "epoch": 21.10965244694673, "grad_norm": 0.3576519191265106, "learning_rate": 4.788903475530533e-05, "loss": 0.3635, "step": 3119500 }, { "epoch": 21.1130359462971, "grad_norm": 0.32627448439598083, "learning_rate": 4.7888696405370294e-05, "loss": 0.3633, "step": 3120000 }, { "epoch": 21.116419445647466, "grad_norm": 0.35932162404060364, "learning_rate": 4.7888358055435256e-05, "loss": 0.3636, "step": 3120500 }, { "epoch": 21.119802944997833, "grad_norm": 0.3643208146095276, "learning_rate": 4.788801970550022e-05, "loss": 0.3632, "step": 3121000 }, { "epoch": 21.123186444348203, "grad_norm": 0.35325881838798523, "learning_rate": 4.788768135556519e-05, "loss": 0.3647, "step": 3121500 }, { "epoch": 21.12656994369857, "grad_norm": 0.34074848890304565, "learning_rate": 4.788734300563015e-05, "loss": 0.3634, "step": 3122000 }, { "epoch": 21.12995344304894, "grad_norm": 0.3535933792591095, "learning_rate": 4.7887004655695104e-05, "loss": 0.3638, "step": 3122500 }, { "epoch": 21.133336942399307, "grad_norm": 0.34740936756134033, "learning_rate": 4.7886666305760067e-05, "loss": 0.3635, "step": 3123000 }, { "epoch": 21.136720441749674, "grad_norm": 0.35919180512428284, "learning_rate": 4.7886327955825035e-05, "loss": 0.365, "step": 3123500 }, { "epoch": 21.140103941100044, "grad_norm": 0.3536825180053711, "learning_rate": 4.788598960589e-05, "loss": 0.363, "step": 3124000 }, { "epoch": 21.14348744045041, "grad_norm": 0.3748694956302643, "learning_rate": 4.788565125595496e-05, "loss": 0.3643, "step": 3124500 }, { "epoch": 21.14687093980078, "grad_norm": 0.3851325511932373, "learning_rate": 4.788531290601992e-05, "loss": 0.3635, "step": 3125000 }, { "epoch": 21.150254439151148, "grad_norm": 0.3734279274940491, "learning_rate": 4.788497455608489e-05, "loss": 0.3631, "step": 3125500 }, { "epoch": 21.153637938501515, "grad_norm": 0.35346266627311707, "learning_rate": 4.788463620614985e-05, "loss": 0.3649, "step": 3126000 }, { "epoch": 21.157021437851885, "grad_norm": 0.3762940764427185, "learning_rate": 4.7884297856214815e-05, "loss": 0.3625, "step": 3126500 }, { "epoch": 21.16040493720225, "grad_norm": 0.34404096007347107, "learning_rate": 4.788395950627978e-05, "loss": 0.3636, "step": 3127000 }, { "epoch": 21.163788436552622, "grad_norm": 0.36609381437301636, "learning_rate": 4.788362115634474e-05, "loss": 0.3631, "step": 3127500 }, { "epoch": 21.16717193590299, "grad_norm": 0.34207746386528015, "learning_rate": 4.78832828064097e-05, "loss": 0.364, "step": 3128000 }, { "epoch": 21.170555435253355, "grad_norm": 0.3563523292541504, "learning_rate": 4.7882944456474663e-05, "loss": 0.3656, "step": 3128500 }, { "epoch": 21.173938934603726, "grad_norm": 0.3344648480415344, "learning_rate": 4.788260610653963e-05, "loss": 0.3625, "step": 3129000 }, { "epoch": 21.177322433954092, "grad_norm": 0.35955458879470825, "learning_rate": 4.7882267756604594e-05, "loss": 0.3648, "step": 3129500 }, { "epoch": 21.18070593330446, "grad_norm": 0.35110658407211304, "learning_rate": 4.7881929406669557e-05, "loss": 0.3645, "step": 3130000 }, { "epoch": 21.18408943265483, "grad_norm": 0.3675309121608734, "learning_rate": 4.788159105673452e-05, "loss": 0.3643, "step": 3130500 }, { "epoch": 21.187472932005196, "grad_norm": 0.36067256331443787, "learning_rate": 4.788125270679949e-05, "loss": 0.3647, "step": 3131000 }, { "epoch": 21.190856431355567, "grad_norm": 0.35494762659072876, "learning_rate": 4.788091435686445e-05, "loss": 0.363, "step": 3131500 }, { "epoch": 21.194239930705933, "grad_norm": 0.38366198539733887, "learning_rate": 4.7880576006929405e-05, "loss": 0.3663, "step": 3132000 }, { "epoch": 21.1976234300563, "grad_norm": 0.3648875057697296, "learning_rate": 4.788023765699437e-05, "loss": 0.3632, "step": 3132500 }, { "epoch": 21.20100692940667, "grad_norm": 0.3924325704574585, "learning_rate": 4.7879899307059336e-05, "loss": 0.3629, "step": 3133000 }, { "epoch": 21.204390428757037, "grad_norm": 0.3698674440383911, "learning_rate": 4.78795609571243e-05, "loss": 0.3621, "step": 3133500 }, { "epoch": 21.207773928107407, "grad_norm": 0.38015779852867126, "learning_rate": 4.787922260718926e-05, "loss": 0.3637, "step": 3134000 }, { "epoch": 21.211157427457774, "grad_norm": 0.36998480558395386, "learning_rate": 4.787888425725422e-05, "loss": 0.3648, "step": 3134500 }, { "epoch": 21.21454092680814, "grad_norm": 0.3666990101337433, "learning_rate": 4.787854590731919e-05, "loss": 0.3644, "step": 3135000 }, { "epoch": 21.21792442615851, "grad_norm": 0.34037184715270996, "learning_rate": 4.7878207557384153e-05, "loss": 0.3651, "step": 3135500 }, { "epoch": 21.221307925508878, "grad_norm": 0.393677294254303, "learning_rate": 4.7877869207449116e-05, "loss": 0.3634, "step": 3136000 }, { "epoch": 21.224691424859245, "grad_norm": 0.37676140666007996, "learning_rate": 4.787753085751408e-05, "loss": 0.3644, "step": 3136500 }, { "epoch": 21.228074924209615, "grad_norm": 0.3752439320087433, "learning_rate": 4.787719250757904e-05, "loss": 0.364, "step": 3137000 }, { "epoch": 21.23145842355998, "grad_norm": 0.3684835433959961, "learning_rate": 4.7876854157644e-05, "loss": 0.3633, "step": 3137500 }, { "epoch": 21.234841922910352, "grad_norm": 0.4008356034755707, "learning_rate": 4.7876515807708964e-05, "loss": 0.364, "step": 3138000 }, { "epoch": 21.23822542226072, "grad_norm": 0.34454894065856934, "learning_rate": 4.787617745777393e-05, "loss": 0.363, "step": 3138500 }, { "epoch": 21.241608921611085, "grad_norm": 0.367489755153656, "learning_rate": 4.7875839107838895e-05, "loss": 0.3652, "step": 3139000 }, { "epoch": 21.244992420961456, "grad_norm": 0.3662697970867157, "learning_rate": 4.787550075790386e-05, "loss": 0.3639, "step": 3139500 }, { "epoch": 21.248375920311823, "grad_norm": 0.3582600951194763, "learning_rate": 4.787516240796882e-05, "loss": 0.3629, "step": 3140000 }, { "epoch": 21.251759419662193, "grad_norm": 0.3587487041950226, "learning_rate": 4.787482405803378e-05, "loss": 0.3629, "step": 3140500 }, { "epoch": 21.25514291901256, "grad_norm": 0.34383293986320496, "learning_rate": 4.787448570809875e-05, "loss": 0.3635, "step": 3141000 }, { "epoch": 21.258526418362926, "grad_norm": 0.34148308634757996, "learning_rate": 4.7874147358163706e-05, "loss": 0.3661, "step": 3141500 }, { "epoch": 21.261909917713297, "grad_norm": 0.3754810690879822, "learning_rate": 4.787380900822867e-05, "loss": 0.3642, "step": 3142000 }, { "epoch": 21.265293417063663, "grad_norm": 0.3658917546272278, "learning_rate": 4.787347065829364e-05, "loss": 0.3654, "step": 3142500 }, { "epoch": 21.268676916414034, "grad_norm": 0.36671942472457886, "learning_rate": 4.78731323083586e-05, "loss": 0.3635, "step": 3143000 }, { "epoch": 21.2720604157644, "grad_norm": 0.3823990821838379, "learning_rate": 4.787279395842356e-05, "loss": 0.3631, "step": 3143500 }, { "epoch": 21.275443915114767, "grad_norm": 0.37779998779296875, "learning_rate": 4.787245560848852e-05, "loss": 0.3643, "step": 3144000 }, { "epoch": 21.278827414465137, "grad_norm": 0.3578616976737976, "learning_rate": 4.787211725855349e-05, "loss": 0.3636, "step": 3144500 }, { "epoch": 21.282210913815504, "grad_norm": 0.37870824337005615, "learning_rate": 4.7871778908618454e-05, "loss": 0.3642, "step": 3145000 }, { "epoch": 21.28559441316587, "grad_norm": 0.3804149031639099, "learning_rate": 4.7871440558683416e-05, "loss": 0.366, "step": 3145500 }, { "epoch": 21.28897791251624, "grad_norm": 0.3497847318649292, "learning_rate": 4.787110220874838e-05, "loss": 0.364, "step": 3146000 }, { "epoch": 21.292361411866608, "grad_norm": 0.36826586723327637, "learning_rate": 4.787076385881334e-05, "loss": 0.3635, "step": 3146500 }, { "epoch": 21.29574491121698, "grad_norm": 0.34804922342300415, "learning_rate": 4.78704255088783e-05, "loss": 0.3651, "step": 3147000 }, { "epoch": 21.299128410567345, "grad_norm": 0.38317447900772095, "learning_rate": 4.7870087158943265e-05, "loss": 0.3643, "step": 3147500 }, { "epoch": 21.302511909917712, "grad_norm": 0.37221434712409973, "learning_rate": 4.7869748809008234e-05, "loss": 0.3644, "step": 3148000 }, { "epoch": 21.305895409268082, "grad_norm": 0.39198219776153564, "learning_rate": 4.7869410459073196e-05, "loss": 0.364, "step": 3148500 }, { "epoch": 21.30927890861845, "grad_norm": 0.37624913454055786, "learning_rate": 4.786907210913816e-05, "loss": 0.3643, "step": 3149000 }, { "epoch": 21.31266240796882, "grad_norm": 0.33953070640563965, "learning_rate": 4.786873375920312e-05, "loss": 0.3646, "step": 3149500 }, { "epoch": 21.316045907319186, "grad_norm": 0.35391950607299805, "learning_rate": 4.786839540926808e-05, "loss": 0.3637, "step": 3150000 }, { "epoch": 21.319429406669553, "grad_norm": 0.38748225569725037, "learning_rate": 4.786805705933305e-05, "loss": 0.3631, "step": 3150500 }, { "epoch": 21.322812906019923, "grad_norm": 0.40156933665275574, "learning_rate": 4.786771870939801e-05, "loss": 0.3657, "step": 3151000 }, { "epoch": 21.32619640537029, "grad_norm": 0.34377700090408325, "learning_rate": 4.786738035946297e-05, "loss": 0.3625, "step": 3151500 }, { "epoch": 21.32957990472066, "grad_norm": 0.3886925280094147, "learning_rate": 4.786704200952794e-05, "loss": 0.3656, "step": 3152000 }, { "epoch": 21.332963404071027, "grad_norm": 0.4017632305622101, "learning_rate": 4.78667036595929e-05, "loss": 0.3648, "step": 3152500 }, { "epoch": 21.336346903421394, "grad_norm": 0.34350860118865967, "learning_rate": 4.786636530965786e-05, "loss": 0.3651, "step": 3153000 }, { "epoch": 21.339730402771764, "grad_norm": 0.36725303530693054, "learning_rate": 4.7866026959722824e-05, "loss": 0.3634, "step": 3153500 }, { "epoch": 21.34311390212213, "grad_norm": 0.354269802570343, "learning_rate": 4.786568860978779e-05, "loss": 0.3643, "step": 3154000 }, { "epoch": 21.346497401472497, "grad_norm": 0.34944888949394226, "learning_rate": 4.7865350259852755e-05, "loss": 0.3641, "step": 3154500 }, { "epoch": 21.349880900822868, "grad_norm": 0.41143280267715454, "learning_rate": 4.786501190991772e-05, "loss": 0.364, "step": 3155000 }, { "epoch": 21.353264400173234, "grad_norm": 0.35492393374443054, "learning_rate": 4.786467355998268e-05, "loss": 0.3633, "step": 3155500 }, { "epoch": 21.356647899523605, "grad_norm": 0.3725531995296478, "learning_rate": 4.786433521004764e-05, "loss": 0.3651, "step": 3156000 }, { "epoch": 21.36003139887397, "grad_norm": 0.3667643070220947, "learning_rate": 4.78639968601126e-05, "loss": 0.3649, "step": 3156500 }, { "epoch": 21.363414898224338, "grad_norm": 0.3814246356487274, "learning_rate": 4.7863658510177565e-05, "loss": 0.3645, "step": 3157000 }, { "epoch": 21.36679839757471, "grad_norm": 0.3644694983959198, "learning_rate": 4.786332016024253e-05, "loss": 0.366, "step": 3157500 }, { "epoch": 21.370181896925075, "grad_norm": 0.37675225734710693, "learning_rate": 4.7862981810307496e-05, "loss": 0.3652, "step": 3158000 }, { "epoch": 21.373565396275445, "grad_norm": 0.36693552136421204, "learning_rate": 4.786264346037246e-05, "loss": 0.3653, "step": 3158500 }, { "epoch": 21.376948895625812, "grad_norm": 0.3457273542881012, "learning_rate": 4.786230511043742e-05, "loss": 0.3639, "step": 3159000 }, { "epoch": 21.38033239497618, "grad_norm": 0.37589481472969055, "learning_rate": 4.786196676050238e-05, "loss": 0.3633, "step": 3159500 }, { "epoch": 21.38371589432655, "grad_norm": 0.3978182077407837, "learning_rate": 4.786162841056735e-05, "loss": 0.3636, "step": 3160000 }, { "epoch": 21.387099393676916, "grad_norm": 0.3442537784576416, "learning_rate": 4.7861290060632314e-05, "loss": 0.3656, "step": 3160500 }, { "epoch": 21.390482893027283, "grad_norm": 0.35906335711479187, "learning_rate": 4.786095171069727e-05, "loss": 0.365, "step": 3161000 }, { "epoch": 21.393866392377653, "grad_norm": 0.37391018867492676, "learning_rate": 4.786061336076224e-05, "loss": 0.3631, "step": 3161500 }, { "epoch": 21.39724989172802, "grad_norm": 0.38614532351493835, "learning_rate": 4.78602750108272e-05, "loss": 0.366, "step": 3162000 }, { "epoch": 21.40063339107839, "grad_norm": 0.39576542377471924, "learning_rate": 4.785993666089216e-05, "loss": 0.3638, "step": 3162500 }, { "epoch": 21.404016890428757, "grad_norm": 0.3507489562034607, "learning_rate": 4.7859598310957124e-05, "loss": 0.3655, "step": 3163000 }, { "epoch": 21.407400389779124, "grad_norm": 0.35442155599594116, "learning_rate": 4.785925996102209e-05, "loss": 0.3659, "step": 3163500 }, { "epoch": 21.410783889129494, "grad_norm": 0.33154821395874023, "learning_rate": 4.7858921611087055e-05, "loss": 0.3653, "step": 3164000 }, { "epoch": 21.41416738847986, "grad_norm": 0.3507884740829468, "learning_rate": 4.785858326115202e-05, "loss": 0.366, "step": 3164500 }, { "epoch": 21.41755088783023, "grad_norm": 0.31205856800079346, "learning_rate": 4.785824491121698e-05, "loss": 0.3643, "step": 3165000 }, { "epoch": 21.420934387180598, "grad_norm": 0.39912527799606323, "learning_rate": 4.785790656128194e-05, "loss": 0.3634, "step": 3165500 }, { "epoch": 21.424317886530964, "grad_norm": 0.37065696716308594, "learning_rate": 4.7857568211346904e-05, "loss": 0.3641, "step": 3166000 }, { "epoch": 21.427701385881335, "grad_norm": 0.38233476877212524, "learning_rate": 4.7857229861411866e-05, "loss": 0.3626, "step": 3166500 }, { "epoch": 21.4310848852317, "grad_norm": 0.34912359714508057, "learning_rate": 4.785689151147683e-05, "loss": 0.362, "step": 3167000 }, { "epoch": 21.434468384582072, "grad_norm": 0.38831716775894165, "learning_rate": 4.78565531615418e-05, "loss": 0.3628, "step": 3167500 }, { "epoch": 21.43785188393244, "grad_norm": 0.40890708565711975, "learning_rate": 4.785621481160676e-05, "loss": 0.3624, "step": 3168000 }, { "epoch": 21.441235383282805, "grad_norm": 0.3991265296936035, "learning_rate": 4.785587646167172e-05, "loss": 0.3654, "step": 3168500 }, { "epoch": 21.444618882633176, "grad_norm": 0.37633535265922546, "learning_rate": 4.785553811173668e-05, "loss": 0.3648, "step": 3169000 }, { "epoch": 21.448002381983542, "grad_norm": 0.36139988899230957, "learning_rate": 4.785519976180165e-05, "loss": 0.3648, "step": 3169500 }, { "epoch": 21.45138588133391, "grad_norm": 0.3636741042137146, "learning_rate": 4.7854861411866614e-05, "loss": 0.365, "step": 3170000 }, { "epoch": 21.45476938068428, "grad_norm": 0.36973482370376587, "learning_rate": 4.785452306193157e-05, "loss": 0.3652, "step": 3170500 }, { "epoch": 21.458152880034646, "grad_norm": 0.36788687109947205, "learning_rate": 4.785418471199654e-05, "loss": 0.3634, "step": 3171000 }, { "epoch": 21.461536379385016, "grad_norm": 0.364843487739563, "learning_rate": 4.78538463620615e-05, "loss": 0.3633, "step": 3171500 }, { "epoch": 21.464919878735383, "grad_norm": 0.3892847001552582, "learning_rate": 4.785350801212646e-05, "loss": 0.3645, "step": 3172000 }, { "epoch": 21.46830337808575, "grad_norm": 0.35123181343078613, "learning_rate": 4.7853169662191425e-05, "loss": 0.3643, "step": 3172500 }, { "epoch": 21.47168687743612, "grad_norm": 0.3675801753997803, "learning_rate": 4.7852831312256394e-05, "loss": 0.3651, "step": 3173000 }, { "epoch": 21.475070376786487, "grad_norm": 0.3755906820297241, "learning_rate": 4.7852492962321356e-05, "loss": 0.3656, "step": 3173500 }, { "epoch": 21.478453876136857, "grad_norm": 0.360592782497406, "learning_rate": 4.785215461238632e-05, "loss": 0.3633, "step": 3174000 }, { "epoch": 21.481837375487224, "grad_norm": 0.3928108811378479, "learning_rate": 4.785181626245128e-05, "loss": 0.3632, "step": 3174500 }, { "epoch": 21.48522087483759, "grad_norm": 0.371901273727417, "learning_rate": 4.785147791251624e-05, "loss": 0.3632, "step": 3175000 }, { "epoch": 21.48860437418796, "grad_norm": 0.3364705443382263, "learning_rate": 4.7851139562581204e-05, "loss": 0.3656, "step": 3175500 }, { "epoch": 21.491987873538328, "grad_norm": 0.3526456952095032, "learning_rate": 4.785080121264617e-05, "loss": 0.3648, "step": 3176000 }, { "epoch": 21.495371372888698, "grad_norm": 0.3650068938732147, "learning_rate": 4.785046286271113e-05, "loss": 0.3643, "step": 3176500 }, { "epoch": 21.498754872239065, "grad_norm": 0.3584924638271332, "learning_rate": 4.78501245127761e-05, "loss": 0.3661, "step": 3177000 }, { "epoch": 21.50213837158943, "grad_norm": 0.36041101813316345, "learning_rate": 4.784978616284106e-05, "loss": 0.3642, "step": 3177500 }, { "epoch": 21.505521870939802, "grad_norm": 0.3904370367527008, "learning_rate": 4.784944781290602e-05, "loss": 0.3657, "step": 3178000 }, { "epoch": 21.50890537029017, "grad_norm": 0.3386889398097992, "learning_rate": 4.7849109462970984e-05, "loss": 0.3641, "step": 3178500 }, { "epoch": 21.512288869640535, "grad_norm": 0.36542385816574097, "learning_rate": 4.784877111303595e-05, "loss": 0.3642, "step": 3179000 }, { "epoch": 21.515672368990906, "grad_norm": 0.3513859212398529, "learning_rate": 4.7848432763100915e-05, "loss": 0.3624, "step": 3179500 }, { "epoch": 21.519055868341272, "grad_norm": 0.39994296431541443, "learning_rate": 4.784809441316587e-05, "loss": 0.3638, "step": 3180000 }, { "epoch": 21.522439367691643, "grad_norm": 0.3714151680469513, "learning_rate": 4.784775606323084e-05, "loss": 0.3641, "step": 3180500 }, { "epoch": 21.52582286704201, "grad_norm": 0.3823555111885071, "learning_rate": 4.78474177132958e-05, "loss": 0.3646, "step": 3181000 }, { "epoch": 21.529206366392376, "grad_norm": 0.37660470604896545, "learning_rate": 4.7847079363360764e-05, "loss": 0.3656, "step": 3181500 }, { "epoch": 21.532589865742747, "grad_norm": 0.3868112862110138, "learning_rate": 4.7846741013425726e-05, "loss": 0.3659, "step": 3182000 }, { "epoch": 21.535973365093113, "grad_norm": 0.37358933687210083, "learning_rate": 4.7846402663490695e-05, "loss": 0.3663, "step": 3182500 }, { "epoch": 21.539356864443484, "grad_norm": 0.3735228180885315, "learning_rate": 4.784606431355566e-05, "loss": 0.3642, "step": 3183000 }, { "epoch": 21.54274036379385, "grad_norm": 0.34915071725845337, "learning_rate": 4.784572596362062e-05, "loss": 0.3649, "step": 3183500 }, { "epoch": 21.546123863144217, "grad_norm": 0.37599048018455505, "learning_rate": 4.784538761368558e-05, "loss": 0.3639, "step": 3184000 }, { "epoch": 21.549507362494587, "grad_norm": 0.3553498387336731, "learning_rate": 4.784504926375054e-05, "loss": 0.3648, "step": 3184500 }, { "epoch": 21.552890861844954, "grad_norm": 0.35894688963890076, "learning_rate": 4.7844710913815505e-05, "loss": 0.3651, "step": 3185000 }, { "epoch": 21.55627436119532, "grad_norm": 0.35583484172821045, "learning_rate": 4.784437256388047e-05, "loss": 0.3632, "step": 3185500 }, { "epoch": 21.55965786054569, "grad_norm": 0.3528789281845093, "learning_rate": 4.784403421394543e-05, "loss": 0.3648, "step": 3186000 }, { "epoch": 21.563041359896058, "grad_norm": 0.36016106605529785, "learning_rate": 4.78436958640104e-05, "loss": 0.3634, "step": 3186500 }, { "epoch": 21.56642485924643, "grad_norm": 0.389413982629776, "learning_rate": 4.784335751407536e-05, "loss": 0.3672, "step": 3187000 }, { "epoch": 21.569808358596795, "grad_norm": 0.4205852150917053, "learning_rate": 4.784301916414032e-05, "loss": 0.3646, "step": 3187500 }, { "epoch": 21.57319185794716, "grad_norm": 0.3701072931289673, "learning_rate": 4.7842680814205285e-05, "loss": 0.3655, "step": 3188000 }, { "epoch": 21.576575357297532, "grad_norm": 0.32102009654045105, "learning_rate": 4.7842342464270254e-05, "loss": 0.3638, "step": 3188500 }, { "epoch": 21.5799588566479, "grad_norm": 0.3219831585884094, "learning_rate": 4.7842004114335216e-05, "loss": 0.3644, "step": 3189000 }, { "epoch": 21.58334235599827, "grad_norm": 0.33970320224761963, "learning_rate": 4.784166576440017e-05, "loss": 0.365, "step": 3189500 }, { "epoch": 21.586725855348636, "grad_norm": 0.3333664834499359, "learning_rate": 4.784132741446514e-05, "loss": 0.3658, "step": 3190000 }, { "epoch": 21.590109354699003, "grad_norm": 0.36616653203964233, "learning_rate": 4.78409890645301e-05, "loss": 0.3644, "step": 3190500 }, { "epoch": 21.593492854049373, "grad_norm": 0.3060896098613739, "learning_rate": 4.7840650714595064e-05, "loss": 0.3631, "step": 3191000 }, { "epoch": 21.59687635339974, "grad_norm": 0.4059158265590668, "learning_rate": 4.7840312364660026e-05, "loss": 0.3638, "step": 3191500 }, { "epoch": 21.60025985275011, "grad_norm": 0.36123108863830566, "learning_rate": 4.7839974014724995e-05, "loss": 0.3646, "step": 3192000 }, { "epoch": 21.603643352100477, "grad_norm": 0.36856579780578613, "learning_rate": 4.783963566478996e-05, "loss": 0.3658, "step": 3192500 }, { "epoch": 21.607026851450843, "grad_norm": 0.35361194610595703, "learning_rate": 4.783929731485492e-05, "loss": 0.365, "step": 3193000 }, { "epoch": 21.610410350801214, "grad_norm": 0.3588424026966095, "learning_rate": 4.783895896491988e-05, "loss": 0.3647, "step": 3193500 }, { "epoch": 21.61379385015158, "grad_norm": 0.3856673836708069, "learning_rate": 4.7838620614984844e-05, "loss": 0.3629, "step": 3194000 }, { "epoch": 21.617177349501947, "grad_norm": 0.3217855989933014, "learning_rate": 4.7838282265049806e-05, "loss": 0.3645, "step": 3194500 }, { "epoch": 21.620560848852318, "grad_norm": 0.3752342462539673, "learning_rate": 4.783794391511477e-05, "loss": 0.364, "step": 3195000 }, { "epoch": 21.623944348202684, "grad_norm": 0.3654584586620331, "learning_rate": 4.783760556517973e-05, "loss": 0.3639, "step": 3195500 }, { "epoch": 21.627327847553055, "grad_norm": 0.3445321023464203, "learning_rate": 4.78372672152447e-05, "loss": 0.3654, "step": 3196000 }, { "epoch": 21.63071134690342, "grad_norm": 0.3452378213405609, "learning_rate": 4.783692886530966e-05, "loss": 0.3656, "step": 3196500 }, { "epoch": 21.634094846253788, "grad_norm": 0.37917622923851013, "learning_rate": 4.783659051537462e-05, "loss": 0.3662, "step": 3197000 }, { "epoch": 21.63747834560416, "grad_norm": 0.35758304595947266, "learning_rate": 4.7836252165439585e-05, "loss": 0.3649, "step": 3197500 }, { "epoch": 21.640861844954525, "grad_norm": 0.3921206593513489, "learning_rate": 4.7835913815504554e-05, "loss": 0.3657, "step": 3198000 }, { "epoch": 21.644245344304895, "grad_norm": 0.34664463996887207, "learning_rate": 4.7835575465569516e-05, "loss": 0.3641, "step": 3198500 }, { "epoch": 21.647628843655262, "grad_norm": 0.3591472804546356, "learning_rate": 4.783523711563447e-05, "loss": 0.3657, "step": 3199000 }, { "epoch": 21.65101234300563, "grad_norm": 0.3513629734516144, "learning_rate": 4.783489876569944e-05, "loss": 0.3647, "step": 3199500 }, { "epoch": 21.654395842356, "grad_norm": 0.37922269105911255, "learning_rate": 4.78345604157644e-05, "loss": 0.3638, "step": 3200000 }, { "epoch": 21.657779341706366, "grad_norm": 0.41598668694496155, "learning_rate": 4.7834222065829365e-05, "loss": 0.366, "step": 3200500 }, { "epoch": 21.661162841056736, "grad_norm": 0.37553656101226807, "learning_rate": 4.783388371589433e-05, "loss": 0.3647, "step": 3201000 }, { "epoch": 21.664546340407103, "grad_norm": 0.31073540449142456, "learning_rate": 4.7833545365959296e-05, "loss": 0.3645, "step": 3201500 }, { "epoch": 21.66792983975747, "grad_norm": 0.3254922926425934, "learning_rate": 4.783320701602426e-05, "loss": 0.363, "step": 3202000 }, { "epoch": 21.67131333910784, "grad_norm": 0.3663785457611084, "learning_rate": 4.783286866608922e-05, "loss": 0.3643, "step": 3202500 }, { "epoch": 21.674696838458207, "grad_norm": 0.35388389229774475, "learning_rate": 4.783253031615418e-05, "loss": 0.3628, "step": 3203000 }, { "epoch": 21.678080337808574, "grad_norm": 0.3668496012687683, "learning_rate": 4.7832191966219144e-05, "loss": 0.3659, "step": 3203500 }, { "epoch": 21.681463837158944, "grad_norm": 0.3565506339073181, "learning_rate": 4.7831853616284106e-05, "loss": 0.3634, "step": 3204000 }, { "epoch": 21.68484733650931, "grad_norm": 0.35518208146095276, "learning_rate": 4.783151526634907e-05, "loss": 0.364, "step": 3204500 }, { "epoch": 21.68823083585968, "grad_norm": 0.4002978801727295, "learning_rate": 4.783117691641403e-05, "loss": 0.3654, "step": 3205000 }, { "epoch": 21.691614335210048, "grad_norm": 0.35866105556488037, "learning_rate": 4.7830838566479e-05, "loss": 0.3643, "step": 3205500 }, { "epoch": 21.694997834560414, "grad_norm": 0.3549555242061615, "learning_rate": 4.783050021654396e-05, "loss": 0.3651, "step": 3206000 }, { "epoch": 21.698381333910785, "grad_norm": 0.3505103588104248, "learning_rate": 4.7830161866608924e-05, "loss": 0.3643, "step": 3206500 }, { "epoch": 21.70176483326115, "grad_norm": 0.373981773853302, "learning_rate": 4.7829823516673886e-05, "loss": 0.3645, "step": 3207000 }, { "epoch": 21.70514833261152, "grad_norm": 0.3849928081035614, "learning_rate": 4.7829485166738855e-05, "loss": 0.3649, "step": 3207500 }, { "epoch": 21.70853183196189, "grad_norm": 0.3909493684768677, "learning_rate": 4.782914681680382e-05, "loss": 0.3634, "step": 3208000 }, { "epoch": 21.711915331312255, "grad_norm": 0.3365996181964874, "learning_rate": 4.782880846686877e-05, "loss": 0.3648, "step": 3208500 }, { "epoch": 21.715298830662626, "grad_norm": 0.35930874943733215, "learning_rate": 4.782847011693374e-05, "loss": 0.3643, "step": 3209000 }, { "epoch": 21.718682330012992, "grad_norm": 0.3305171728134155, "learning_rate": 4.78281317669987e-05, "loss": 0.3655, "step": 3209500 }, { "epoch": 21.72206582936336, "grad_norm": 0.37265220284461975, "learning_rate": 4.7827793417063665e-05, "loss": 0.3665, "step": 3210000 }, { "epoch": 21.72544932871373, "grad_norm": 0.327110230922699, "learning_rate": 4.782745506712863e-05, "loss": 0.3655, "step": 3210500 }, { "epoch": 21.728832828064096, "grad_norm": 0.3678237497806549, "learning_rate": 4.782711671719359e-05, "loss": 0.3646, "step": 3211000 }, { "epoch": 21.732216327414466, "grad_norm": 0.3403486907482147, "learning_rate": 4.782677836725856e-05, "loss": 0.3656, "step": 3211500 }, { "epoch": 21.735599826764833, "grad_norm": 0.3431899845600128, "learning_rate": 4.782644001732352e-05, "loss": 0.3641, "step": 3212000 }, { "epoch": 21.7389833261152, "grad_norm": 0.3963979184627533, "learning_rate": 4.782610166738848e-05, "loss": 0.3651, "step": 3212500 }, { "epoch": 21.74236682546557, "grad_norm": 0.3516821265220642, "learning_rate": 4.7825763317453445e-05, "loss": 0.3636, "step": 3213000 }, { "epoch": 21.745750324815937, "grad_norm": 0.36609870195388794, "learning_rate": 4.782542496751841e-05, "loss": 0.3646, "step": 3213500 }, { "epoch": 21.749133824166307, "grad_norm": 0.3898099660873413, "learning_rate": 4.782508661758337e-05, "loss": 0.3622, "step": 3214000 }, { "epoch": 21.752517323516674, "grad_norm": 0.3062995374202728, "learning_rate": 4.782474826764833e-05, "loss": 0.3646, "step": 3214500 }, { "epoch": 21.75590082286704, "grad_norm": 0.37412673234939575, "learning_rate": 4.78244099177133e-05, "loss": 0.3647, "step": 3215000 }, { "epoch": 21.75928432221741, "grad_norm": 0.38491693139076233, "learning_rate": 4.782407156777826e-05, "loss": 0.3633, "step": 3215500 }, { "epoch": 21.762667821567778, "grad_norm": 0.3784734606742859, "learning_rate": 4.7823733217843224e-05, "loss": 0.3661, "step": 3216000 }, { "epoch": 21.766051320918148, "grad_norm": 0.3548288345336914, "learning_rate": 4.7823394867908187e-05, "loss": 0.3639, "step": 3216500 }, { "epoch": 21.769434820268515, "grad_norm": 0.3605089783668518, "learning_rate": 4.7823056517973155e-05, "loss": 0.3648, "step": 3217000 }, { "epoch": 21.77281831961888, "grad_norm": 0.3627791702747345, "learning_rate": 4.782271816803812e-05, "loss": 0.3644, "step": 3217500 }, { "epoch": 21.776201818969252, "grad_norm": 0.40289151668548584, "learning_rate": 4.782237981810307e-05, "loss": 0.3629, "step": 3218000 }, { "epoch": 21.77958531831962, "grad_norm": 0.36663007736206055, "learning_rate": 4.782204146816804e-05, "loss": 0.3652, "step": 3218500 }, { "epoch": 21.782968817669985, "grad_norm": 0.3509083688259125, "learning_rate": 4.7821703118233004e-05, "loss": 0.3652, "step": 3219000 }, { "epoch": 21.786352317020356, "grad_norm": 0.3642936944961548, "learning_rate": 4.7821364768297966e-05, "loss": 0.3652, "step": 3219500 }, { "epoch": 21.789735816370722, "grad_norm": 0.4193442761898041, "learning_rate": 4.782102641836293e-05, "loss": 0.3651, "step": 3220000 }, { "epoch": 21.793119315721093, "grad_norm": 0.4046929180622101, "learning_rate": 4.782068806842789e-05, "loss": 0.3644, "step": 3220500 }, { "epoch": 21.79650281507146, "grad_norm": 0.37898531556129456, "learning_rate": 4.782034971849286e-05, "loss": 0.366, "step": 3221000 }, { "epoch": 21.799886314421826, "grad_norm": 0.35761505365371704, "learning_rate": 4.782001136855782e-05, "loss": 0.3654, "step": 3221500 }, { "epoch": 21.803269813772197, "grad_norm": 0.3742005527019501, "learning_rate": 4.7819673018622783e-05, "loss": 0.3654, "step": 3222000 }, { "epoch": 21.806653313122563, "grad_norm": 0.3698446452617645, "learning_rate": 4.7819334668687746e-05, "loss": 0.3647, "step": 3222500 }, { "epoch": 21.810036812472934, "grad_norm": 0.35749921202659607, "learning_rate": 4.781899631875271e-05, "loss": 0.3636, "step": 3223000 }, { "epoch": 21.8134203118233, "grad_norm": 0.34802117943763733, "learning_rate": 4.781865796881767e-05, "loss": 0.3662, "step": 3223500 }, { "epoch": 21.816803811173667, "grad_norm": 0.34336793422698975, "learning_rate": 4.781831961888263e-05, "loss": 0.3651, "step": 3224000 }, { "epoch": 21.820187310524037, "grad_norm": 0.38206735253334045, "learning_rate": 4.78179812689476e-05, "loss": 0.3656, "step": 3224500 }, { "epoch": 21.823570809874404, "grad_norm": 0.38054636120796204, "learning_rate": 4.781764291901256e-05, "loss": 0.3644, "step": 3225000 }, { "epoch": 21.826954309224774, "grad_norm": 0.37155601382255554, "learning_rate": 4.7817304569077525e-05, "loss": 0.3648, "step": 3225500 }, { "epoch": 21.83033780857514, "grad_norm": 0.38225722312927246, "learning_rate": 4.781696621914249e-05, "loss": 0.3639, "step": 3226000 }, { "epoch": 21.833721307925508, "grad_norm": 0.39849624037742615, "learning_rate": 4.7816627869207456e-05, "loss": 0.3639, "step": 3226500 }, { "epoch": 21.837104807275878, "grad_norm": 0.3631201982498169, "learning_rate": 4.781628951927242e-05, "loss": 0.3644, "step": 3227000 }, { "epoch": 21.840488306626245, "grad_norm": 0.3547350764274597, "learning_rate": 4.7815951169337374e-05, "loss": 0.3631, "step": 3227500 }, { "epoch": 21.84387180597661, "grad_norm": 0.36984550952911377, "learning_rate": 4.7815612819402336e-05, "loss": 0.3652, "step": 3228000 }, { "epoch": 21.847255305326982, "grad_norm": 0.34281057119369507, "learning_rate": 4.7815274469467305e-05, "loss": 0.3634, "step": 3228500 }, { "epoch": 21.85063880467735, "grad_norm": 0.3575781285762787, "learning_rate": 4.781493611953227e-05, "loss": 0.3654, "step": 3229000 }, { "epoch": 21.85402230402772, "grad_norm": 0.3543434143066406, "learning_rate": 4.781459776959723e-05, "loss": 0.363, "step": 3229500 }, { "epoch": 21.857405803378086, "grad_norm": 0.3873865306377411, "learning_rate": 4.781425941966219e-05, "loss": 0.3636, "step": 3230000 }, { "epoch": 21.860789302728453, "grad_norm": 0.3623385727405548, "learning_rate": 4.781392106972716e-05, "loss": 0.3635, "step": 3230500 }, { "epoch": 21.864172802078823, "grad_norm": 0.35204654932022095, "learning_rate": 4.781358271979212e-05, "loss": 0.3673, "step": 3231000 }, { "epoch": 21.86755630142919, "grad_norm": 0.3785359561443329, "learning_rate": 4.7813244369857084e-05, "loss": 0.3641, "step": 3231500 }, { "epoch": 21.87093980077956, "grad_norm": 0.3396179974079132, "learning_rate": 4.7812906019922046e-05, "loss": 0.3644, "step": 3232000 }, { "epoch": 21.874323300129927, "grad_norm": 0.35946404933929443, "learning_rate": 4.781256766998701e-05, "loss": 0.3657, "step": 3232500 }, { "epoch": 21.877706799480293, "grad_norm": 0.40445196628570557, "learning_rate": 4.781222932005197e-05, "loss": 0.365, "step": 3233000 }, { "epoch": 21.881090298830664, "grad_norm": 0.3674616813659668, "learning_rate": 4.781189097011693e-05, "loss": 0.3652, "step": 3233500 }, { "epoch": 21.88447379818103, "grad_norm": 0.3473931849002838, "learning_rate": 4.78115526201819e-05, "loss": 0.3647, "step": 3234000 }, { "epoch": 21.887857297531397, "grad_norm": 0.37436696887016296, "learning_rate": 4.7811214270246864e-05, "loss": 0.3639, "step": 3234500 }, { "epoch": 21.891240796881767, "grad_norm": 0.3391352891921997, "learning_rate": 4.7810875920311826e-05, "loss": 0.3663, "step": 3235000 }, { "epoch": 21.894624296232134, "grad_norm": 0.40358367562294006, "learning_rate": 4.781053757037679e-05, "loss": 0.3653, "step": 3235500 }, { "epoch": 21.898007795582505, "grad_norm": 0.35122546553611755, "learning_rate": 4.781019922044176e-05, "loss": 0.3647, "step": 3236000 }, { "epoch": 21.90139129493287, "grad_norm": 0.393868625164032, "learning_rate": 4.780986087050672e-05, "loss": 0.3628, "step": 3236500 }, { "epoch": 21.904774794283238, "grad_norm": 0.33418118953704834, "learning_rate": 4.7809522520571674e-05, "loss": 0.3637, "step": 3237000 }, { "epoch": 21.90815829363361, "grad_norm": 0.37154361605644226, "learning_rate": 4.7809184170636636e-05, "loss": 0.3647, "step": 3237500 }, { "epoch": 21.911541792983975, "grad_norm": 0.3812848627567291, "learning_rate": 4.7808845820701605e-05, "loss": 0.3657, "step": 3238000 }, { "epoch": 21.914925292334345, "grad_norm": 0.35401734709739685, "learning_rate": 4.780850747076657e-05, "loss": 0.3634, "step": 3238500 }, { "epoch": 21.918308791684712, "grad_norm": 0.3435650169849396, "learning_rate": 4.780816912083153e-05, "loss": 0.3634, "step": 3239000 }, { "epoch": 21.92169229103508, "grad_norm": 0.37529340386390686, "learning_rate": 4.780783077089649e-05, "loss": 0.3646, "step": 3239500 }, { "epoch": 21.92507579038545, "grad_norm": 0.33938807249069214, "learning_rate": 4.780749242096146e-05, "loss": 0.3647, "step": 3240000 }, { "epoch": 21.928459289735816, "grad_norm": 0.3155749440193176, "learning_rate": 4.780715407102642e-05, "loss": 0.364, "step": 3240500 }, { "epoch": 21.931842789086183, "grad_norm": 0.37478557229042053, "learning_rate": 4.7806815721091385e-05, "loss": 0.3648, "step": 3241000 }, { "epoch": 21.935226288436553, "grad_norm": 0.35621947050094604, "learning_rate": 4.780647737115635e-05, "loss": 0.3657, "step": 3241500 }, { "epoch": 21.93860978778692, "grad_norm": 0.3681904375553131, "learning_rate": 4.780613902122131e-05, "loss": 0.3647, "step": 3242000 }, { "epoch": 21.94199328713729, "grad_norm": 0.37497857213020325, "learning_rate": 4.780580067128627e-05, "loss": 0.3643, "step": 3242500 }, { "epoch": 21.945376786487657, "grad_norm": 0.41352739930152893, "learning_rate": 4.780546232135123e-05, "loss": 0.365, "step": 3243000 }, { "epoch": 21.948760285838024, "grad_norm": 0.386015385389328, "learning_rate": 4.78051239714162e-05, "loss": 0.3654, "step": 3243500 }, { "epoch": 21.952143785188394, "grad_norm": 0.3766820430755615, "learning_rate": 4.7804785621481164e-05, "loss": 0.3657, "step": 3244000 }, { "epoch": 21.95552728453876, "grad_norm": 0.3699408769607544, "learning_rate": 4.7804447271546126e-05, "loss": 0.3641, "step": 3244500 }, { "epoch": 21.95891078388913, "grad_norm": 0.3297472894191742, "learning_rate": 4.780410892161109e-05, "loss": 0.3647, "step": 3245000 }, { "epoch": 21.962294283239498, "grad_norm": 0.36351293325424194, "learning_rate": 4.780377057167606e-05, "loss": 0.3644, "step": 3245500 }, { "epoch": 21.965677782589864, "grad_norm": 0.3939778506755829, "learning_rate": 4.780343222174102e-05, "loss": 0.3661, "step": 3246000 }, { "epoch": 21.969061281940235, "grad_norm": 0.4079965353012085, "learning_rate": 4.7803093871805975e-05, "loss": 0.3646, "step": 3246500 }, { "epoch": 21.9724447812906, "grad_norm": 0.3721138536930084, "learning_rate": 4.780275552187094e-05, "loss": 0.3652, "step": 3247000 }, { "epoch": 21.97582828064097, "grad_norm": 0.3701770603656769, "learning_rate": 4.7802417171935906e-05, "loss": 0.365, "step": 3247500 }, { "epoch": 21.97921177999134, "grad_norm": 0.3543812334537506, "learning_rate": 4.780207882200087e-05, "loss": 0.3631, "step": 3248000 }, { "epoch": 21.982595279341705, "grad_norm": 0.3594103157520294, "learning_rate": 4.780174047206583e-05, "loss": 0.3647, "step": 3248500 }, { "epoch": 21.985978778692076, "grad_norm": 0.3717160224914551, "learning_rate": 4.780140212213079e-05, "loss": 0.3654, "step": 3249000 }, { "epoch": 21.989362278042442, "grad_norm": 0.3792131543159485, "learning_rate": 4.780106377219576e-05, "loss": 0.3639, "step": 3249500 }, { "epoch": 21.992745777392813, "grad_norm": 0.34902265667915344, "learning_rate": 4.780072542226072e-05, "loss": 0.3638, "step": 3250000 }, { "epoch": 21.99612927674318, "grad_norm": 0.3875604569911957, "learning_rate": 4.7800387072325685e-05, "loss": 0.3651, "step": 3250500 }, { "epoch": 21.999512776093546, "grad_norm": 0.3711358308792114, "learning_rate": 4.780004872239065e-05, "loss": 0.3648, "step": 3251000 }, { "epoch": 22.0, "eval_accuracy": 0.8610757404558269, "eval_loss": 0.5639563798904419, "eval_runtime": 3381.8763, "eval_samples_per_second": 85.971, "eval_steps_per_second": 5.373, "step": 3251072 }, { "epoch": 22.002896275443916, "grad_norm": 0.36901411414146423, "learning_rate": 4.779971037245561e-05, "loss": 0.3639, "step": 3251500 }, { "epoch": 22.006279774794283, "grad_norm": 0.37363478541374207, "learning_rate": 4.779937202252057e-05, "loss": 0.3617, "step": 3252000 }, { "epoch": 22.00966327414465, "grad_norm": 0.35945314168930054, "learning_rate": 4.7799033672585534e-05, "loss": 0.3616, "step": 3252500 }, { "epoch": 22.01304677349502, "grad_norm": 0.3384290337562561, "learning_rate": 4.77986953226505e-05, "loss": 0.3638, "step": 3253000 }, { "epoch": 22.016430272845387, "grad_norm": 0.3904561698436737, "learning_rate": 4.7798356972715465e-05, "loss": 0.3633, "step": 3253500 }, { "epoch": 22.019813772195757, "grad_norm": 0.35363084077835083, "learning_rate": 4.779801862278043e-05, "loss": 0.3627, "step": 3254000 }, { "epoch": 22.023197271546124, "grad_norm": 0.3530249297618866, "learning_rate": 4.779768027284539e-05, "loss": 0.3629, "step": 3254500 }, { "epoch": 22.02658077089649, "grad_norm": 0.3707101047039032, "learning_rate": 4.779734192291036e-05, "loss": 0.3631, "step": 3255000 }, { "epoch": 22.02996427024686, "grad_norm": 0.33745312690734863, "learning_rate": 4.779700357297532e-05, "loss": 0.3618, "step": 3255500 }, { "epoch": 22.033347769597228, "grad_norm": 0.37105587124824524, "learning_rate": 4.7796665223040275e-05, "loss": 0.3604, "step": 3256000 }, { "epoch": 22.036731268947598, "grad_norm": 0.36006683111190796, "learning_rate": 4.779632687310524e-05, "loss": 0.3625, "step": 3256500 }, { "epoch": 22.040114768297965, "grad_norm": 0.37026074528694153, "learning_rate": 4.7795988523170206e-05, "loss": 0.363, "step": 3257000 }, { "epoch": 22.04349826764833, "grad_norm": 0.3176872432231903, "learning_rate": 4.779565017323517e-05, "loss": 0.364, "step": 3257500 }, { "epoch": 22.046881766998702, "grad_norm": 0.37873584032058716, "learning_rate": 4.779531182330013e-05, "loss": 0.3629, "step": 3258000 }, { "epoch": 22.05026526634907, "grad_norm": 0.4034285247325897, "learning_rate": 4.779497347336509e-05, "loss": 0.3618, "step": 3258500 }, { "epoch": 22.053648765699435, "grad_norm": 0.35463711619377136, "learning_rate": 4.779463512343006e-05, "loss": 0.3619, "step": 3259000 }, { "epoch": 22.057032265049806, "grad_norm": 0.39048513770103455, "learning_rate": 4.7794296773495024e-05, "loss": 0.3639, "step": 3259500 }, { "epoch": 22.060415764400172, "grad_norm": 0.38548508286476135, "learning_rate": 4.7793958423559986e-05, "loss": 0.3633, "step": 3260000 }, { "epoch": 22.063799263750543, "grad_norm": 0.38834986090660095, "learning_rate": 4.779362007362495e-05, "loss": 0.3642, "step": 3260500 }, { "epoch": 22.06718276310091, "grad_norm": 0.31702256202697754, "learning_rate": 4.779328172368991e-05, "loss": 0.3637, "step": 3261000 }, { "epoch": 22.070566262451276, "grad_norm": 0.35815784335136414, "learning_rate": 4.779294337375487e-05, "loss": 0.3634, "step": 3261500 }, { "epoch": 22.073949761801646, "grad_norm": 0.34262552857398987, "learning_rate": 4.7792605023819834e-05, "loss": 0.3629, "step": 3262000 }, { "epoch": 22.077333261152013, "grad_norm": 0.37976783514022827, "learning_rate": 4.77922666738848e-05, "loss": 0.3625, "step": 3262500 }, { "epoch": 22.080716760502384, "grad_norm": 0.41490501165390015, "learning_rate": 4.7791928323949765e-05, "loss": 0.3637, "step": 3263000 }, { "epoch": 22.08410025985275, "grad_norm": 0.3685232698917389, "learning_rate": 4.779158997401473e-05, "loss": 0.3636, "step": 3263500 }, { "epoch": 22.087483759203117, "grad_norm": 0.39337560534477234, "learning_rate": 4.779125162407969e-05, "loss": 0.3641, "step": 3264000 }, { "epoch": 22.090867258553487, "grad_norm": 0.3464198410511017, "learning_rate": 4.779091327414466e-05, "loss": 0.3634, "step": 3264500 }, { "epoch": 22.094250757903854, "grad_norm": 0.35609114170074463, "learning_rate": 4.779057492420962e-05, "loss": 0.3652, "step": 3265000 }, { "epoch": 22.097634257254224, "grad_norm": 0.3641035854816437, "learning_rate": 4.779023657427458e-05, "loss": 0.3636, "step": 3265500 }, { "epoch": 22.10101775660459, "grad_norm": 0.38921040296554565, "learning_rate": 4.778989822433954e-05, "loss": 0.3639, "step": 3266000 }, { "epoch": 22.104401255954958, "grad_norm": 0.35882940888404846, "learning_rate": 4.778955987440451e-05, "loss": 0.3612, "step": 3266500 }, { "epoch": 22.107784755305328, "grad_norm": 0.36842402815818787, "learning_rate": 4.778922152446947e-05, "loss": 0.3623, "step": 3267000 }, { "epoch": 22.111168254655695, "grad_norm": 0.34856799244880676, "learning_rate": 4.778888317453443e-05, "loss": 0.3641, "step": 3267500 }, { "epoch": 22.11455175400606, "grad_norm": 0.36763256788253784, "learning_rate": 4.7788544824599393e-05, "loss": 0.3627, "step": 3268000 }, { "epoch": 22.117935253356432, "grad_norm": 0.345851331949234, "learning_rate": 4.778820647466436e-05, "loss": 0.3632, "step": 3268500 }, { "epoch": 22.1213187527068, "grad_norm": 0.3884712755680084, "learning_rate": 4.7787868124729324e-05, "loss": 0.3629, "step": 3269000 }, { "epoch": 22.12470225205717, "grad_norm": 0.3752146065235138, "learning_rate": 4.7787529774794287e-05, "loss": 0.3627, "step": 3269500 }, { "epoch": 22.128085751407536, "grad_norm": 0.3530506491661072, "learning_rate": 4.778719142485925e-05, "loss": 0.364, "step": 3270000 }, { "epoch": 22.131469250757903, "grad_norm": 0.3271232843399048, "learning_rate": 4.778685307492421e-05, "loss": 0.3641, "step": 3270500 }, { "epoch": 22.134852750108273, "grad_norm": 0.3788377046585083, "learning_rate": 4.778651472498917e-05, "loss": 0.3629, "step": 3271000 }, { "epoch": 22.13823624945864, "grad_norm": 0.35951292514801025, "learning_rate": 4.7786176375054135e-05, "loss": 0.3628, "step": 3271500 }, { "epoch": 22.14161974880901, "grad_norm": 0.38089001178741455, "learning_rate": 4.7785838025119104e-05, "loss": 0.3641, "step": 3272000 }, { "epoch": 22.145003248159377, "grad_norm": 0.3626417815685272, "learning_rate": 4.7785499675184066e-05, "loss": 0.362, "step": 3272500 }, { "epoch": 22.148386747509743, "grad_norm": 0.34376761317253113, "learning_rate": 4.778516132524903e-05, "loss": 0.3631, "step": 3273000 }, { "epoch": 22.151770246860114, "grad_norm": 0.3726509213447571, "learning_rate": 4.778482297531399e-05, "loss": 0.3635, "step": 3273500 }, { "epoch": 22.15515374621048, "grad_norm": 0.33812475204467773, "learning_rate": 4.778448462537895e-05, "loss": 0.3634, "step": 3274000 }, { "epoch": 22.158537245560847, "grad_norm": 0.40661919116973877, "learning_rate": 4.778414627544392e-05, "loss": 0.3616, "step": 3274500 }, { "epoch": 22.161920744911217, "grad_norm": 0.4099995493888855, "learning_rate": 4.7783807925508883e-05, "loss": 0.3629, "step": 3275000 }, { "epoch": 22.165304244261584, "grad_norm": 0.4026210904121399, "learning_rate": 4.778346957557384e-05, "loss": 0.3651, "step": 3275500 }, { "epoch": 22.168687743611954, "grad_norm": 0.3911675214767456, "learning_rate": 4.778313122563881e-05, "loss": 0.3631, "step": 3276000 }, { "epoch": 22.17207124296232, "grad_norm": 0.3900057375431061, "learning_rate": 4.778279287570377e-05, "loss": 0.3624, "step": 3276500 }, { "epoch": 22.175454742312688, "grad_norm": 0.4155902862548828, "learning_rate": 4.778245452576873e-05, "loss": 0.3648, "step": 3277000 }, { "epoch": 22.17883824166306, "grad_norm": 0.3615865111351013, "learning_rate": 4.7782116175833694e-05, "loss": 0.3642, "step": 3277500 }, { "epoch": 22.182221741013425, "grad_norm": 0.3743878901004791, "learning_rate": 4.778177782589866e-05, "loss": 0.3625, "step": 3278000 }, { "epoch": 22.185605240363795, "grad_norm": 0.3730500638484955, "learning_rate": 4.7781439475963625e-05, "loss": 0.3641, "step": 3278500 }, { "epoch": 22.188988739714162, "grad_norm": 0.3749823272228241, "learning_rate": 4.778110112602859e-05, "loss": 0.3638, "step": 3279000 }, { "epoch": 22.19237223906453, "grad_norm": 0.378461092710495, "learning_rate": 4.778076277609355e-05, "loss": 0.3625, "step": 3279500 }, { "epoch": 22.1957557384149, "grad_norm": 0.3822938799858093, "learning_rate": 4.778042442615851e-05, "loss": 0.3627, "step": 3280000 }, { "epoch": 22.199139237765266, "grad_norm": 0.3303757309913635, "learning_rate": 4.7780086076223474e-05, "loss": 0.3636, "step": 3280500 }, { "epoch": 22.202522737115636, "grad_norm": 0.32777705788612366, "learning_rate": 4.7779747726288436e-05, "loss": 0.3649, "step": 3281000 }, { "epoch": 22.205906236466003, "grad_norm": 0.3365825414657593, "learning_rate": 4.77794093763534e-05, "loss": 0.3657, "step": 3281500 }, { "epoch": 22.20928973581637, "grad_norm": 0.3661050796508789, "learning_rate": 4.777907102641837e-05, "loss": 0.3627, "step": 3282000 }, { "epoch": 22.21267323516674, "grad_norm": 0.3733326494693756, "learning_rate": 4.777873267648333e-05, "loss": 0.3626, "step": 3282500 }, { "epoch": 22.216056734517107, "grad_norm": 0.3773648738861084, "learning_rate": 4.777839432654829e-05, "loss": 0.3632, "step": 3283000 }, { "epoch": 22.219440233867473, "grad_norm": 0.38698533177375793, "learning_rate": 4.777805597661325e-05, "loss": 0.3638, "step": 3283500 }, { "epoch": 22.222823733217844, "grad_norm": 0.357965350151062, "learning_rate": 4.777771762667822e-05, "loss": 0.3635, "step": 3284000 }, { "epoch": 22.22620723256821, "grad_norm": 0.36651939153671265, "learning_rate": 4.7777379276743184e-05, "loss": 0.3635, "step": 3284500 }, { "epoch": 22.22959073191858, "grad_norm": 0.40140554308891296, "learning_rate": 4.777704092680814e-05, "loss": 0.3626, "step": 3285000 }, { "epoch": 22.232974231268948, "grad_norm": 0.3868422508239746, "learning_rate": 4.777670257687311e-05, "loss": 0.364, "step": 3285500 }, { "epoch": 22.236357730619314, "grad_norm": 0.4074952304363251, "learning_rate": 4.777636422693807e-05, "loss": 0.3644, "step": 3286000 }, { "epoch": 22.239741229969685, "grad_norm": 0.32810431718826294, "learning_rate": 4.777602587700303e-05, "loss": 0.3627, "step": 3286500 }, { "epoch": 22.24312472932005, "grad_norm": 0.3723648488521576, "learning_rate": 4.7775687527067995e-05, "loss": 0.3641, "step": 3287000 }, { "epoch": 22.24650822867042, "grad_norm": 0.41280627250671387, "learning_rate": 4.7775349177132964e-05, "loss": 0.3637, "step": 3287500 }, { "epoch": 22.24989172802079, "grad_norm": 0.37163811922073364, "learning_rate": 4.7775010827197926e-05, "loss": 0.3646, "step": 3288000 }, { "epoch": 22.253275227371155, "grad_norm": 0.36624860763549805, "learning_rate": 4.777467247726289e-05, "loss": 0.3624, "step": 3288500 }, { "epoch": 22.256658726721525, "grad_norm": 0.36479902267456055, "learning_rate": 4.777433412732785e-05, "loss": 0.3638, "step": 3289000 }, { "epoch": 22.260042226071892, "grad_norm": 0.3627351224422455, "learning_rate": 4.777399577739281e-05, "loss": 0.3631, "step": 3289500 }, { "epoch": 22.26342572542226, "grad_norm": 0.3495413362979889, "learning_rate": 4.7773657427457774e-05, "loss": 0.3633, "step": 3290000 }, { "epoch": 22.26680922477263, "grad_norm": 0.40336576104164124, "learning_rate": 4.7773319077522736e-05, "loss": 0.3621, "step": 3290500 }, { "epoch": 22.270192724122996, "grad_norm": 0.3687015175819397, "learning_rate": 4.77729807275877e-05, "loss": 0.3638, "step": 3291000 }, { "epoch": 22.273576223473366, "grad_norm": 0.43421855568885803, "learning_rate": 4.777264237765267e-05, "loss": 0.3616, "step": 3291500 }, { "epoch": 22.276959722823733, "grad_norm": 0.37915298342704773, "learning_rate": 4.777230402771763e-05, "loss": 0.3647, "step": 3292000 }, { "epoch": 22.2803432221741, "grad_norm": 0.3645276725292206, "learning_rate": 4.777196567778259e-05, "loss": 0.3623, "step": 3292500 }, { "epoch": 22.28372672152447, "grad_norm": 0.36133942008018494, "learning_rate": 4.7771627327847554e-05, "loss": 0.3634, "step": 3293000 }, { "epoch": 22.287110220874837, "grad_norm": 0.366842120885849, "learning_rate": 4.777128897791252e-05, "loss": 0.3637, "step": 3293500 }, { "epoch": 22.290493720225207, "grad_norm": 0.3590872883796692, "learning_rate": 4.7770950627977485e-05, "loss": 0.3658, "step": 3294000 }, { "epoch": 22.293877219575574, "grad_norm": 0.40072405338287354, "learning_rate": 4.777061227804244e-05, "loss": 0.3639, "step": 3294500 }, { "epoch": 22.29726071892594, "grad_norm": 0.3492056429386139, "learning_rate": 4.777027392810741e-05, "loss": 0.3642, "step": 3295000 }, { "epoch": 22.30064421827631, "grad_norm": 0.3781352639198303, "learning_rate": 4.776993557817237e-05, "loss": 0.3644, "step": 3295500 }, { "epoch": 22.304027717626678, "grad_norm": 0.3554432988166809, "learning_rate": 4.776959722823733e-05, "loss": 0.3637, "step": 3296000 }, { "epoch": 22.307411216977048, "grad_norm": 0.4013362526893616, "learning_rate": 4.7769258878302295e-05, "loss": 0.365, "step": 3296500 }, { "epoch": 22.310794716327415, "grad_norm": 0.37104544043540955, "learning_rate": 4.7768920528367264e-05, "loss": 0.3642, "step": 3297000 }, { "epoch": 22.31417821567778, "grad_norm": 0.37026894092559814, "learning_rate": 4.7768582178432226e-05, "loss": 0.3638, "step": 3297500 }, { "epoch": 22.317561715028152, "grad_norm": 0.39977866411209106, "learning_rate": 4.776824382849719e-05, "loss": 0.3634, "step": 3298000 }, { "epoch": 22.32094521437852, "grad_norm": 0.33193448185920715, "learning_rate": 4.776790547856215e-05, "loss": 0.3612, "step": 3298500 }, { "epoch": 22.324328713728885, "grad_norm": 0.3862043023109436, "learning_rate": 4.776756712862711e-05, "loss": 0.3645, "step": 3299000 }, { "epoch": 22.327712213079256, "grad_norm": 0.35771092772483826, "learning_rate": 4.7767228778692075e-05, "loss": 0.3637, "step": 3299500 }, { "epoch": 22.331095712429622, "grad_norm": 0.39172911643981934, "learning_rate": 4.776689042875704e-05, "loss": 0.3646, "step": 3300000 }, { "epoch": 22.334479211779993, "grad_norm": 0.3653565049171448, "learning_rate": 4.7766552078822e-05, "loss": 0.3642, "step": 3300500 }, { "epoch": 22.33786271113036, "grad_norm": 0.4046486020088196, "learning_rate": 4.776621372888697e-05, "loss": 0.3632, "step": 3301000 }, { "epoch": 22.341246210480726, "grad_norm": 0.381191611289978, "learning_rate": 4.776587537895193e-05, "loss": 0.362, "step": 3301500 }, { "epoch": 22.344629709831096, "grad_norm": 0.3809015452861786, "learning_rate": 4.776553702901689e-05, "loss": 0.3628, "step": 3302000 }, { "epoch": 22.348013209181463, "grad_norm": 0.42356231808662415, "learning_rate": 4.7765198679081854e-05, "loss": 0.3636, "step": 3302500 }, { "epoch": 22.351396708531833, "grad_norm": 0.38534021377563477, "learning_rate": 4.776486032914682e-05, "loss": 0.3642, "step": 3303000 }, { "epoch": 22.3547802078822, "grad_norm": 0.3867948353290558, "learning_rate": 4.7764521979211785e-05, "loss": 0.3636, "step": 3303500 }, { "epoch": 22.358163707232567, "grad_norm": 0.38022705912590027, "learning_rate": 4.776418362927674e-05, "loss": 0.3641, "step": 3304000 }, { "epoch": 22.361547206582937, "grad_norm": 0.38373327255249023, "learning_rate": 4.776384527934171e-05, "loss": 0.3633, "step": 3304500 }, { "epoch": 22.364930705933304, "grad_norm": 0.33922332525253296, "learning_rate": 4.776350692940667e-05, "loss": 0.3626, "step": 3305000 }, { "epoch": 22.368314205283674, "grad_norm": 0.39301684498786926, "learning_rate": 4.7763168579471634e-05, "loss": 0.3645, "step": 3305500 }, { "epoch": 22.37169770463404, "grad_norm": 0.3709513247013092, "learning_rate": 4.7762830229536596e-05, "loss": 0.3639, "step": 3306000 }, { "epoch": 22.375081203984408, "grad_norm": 0.3477535843849182, "learning_rate": 4.7762491879601565e-05, "loss": 0.3636, "step": 3306500 }, { "epoch": 22.378464703334778, "grad_norm": 0.33180704712867737, "learning_rate": 4.776215352966653e-05, "loss": 0.3647, "step": 3307000 }, { "epoch": 22.381848202685145, "grad_norm": 0.34438008069992065, "learning_rate": 4.776181517973149e-05, "loss": 0.3644, "step": 3307500 }, { "epoch": 22.38523170203551, "grad_norm": 0.38011258840560913, "learning_rate": 4.776147682979645e-05, "loss": 0.3634, "step": 3308000 }, { "epoch": 22.388615201385882, "grad_norm": 0.3358522355556488, "learning_rate": 4.776113847986141e-05, "loss": 0.3642, "step": 3308500 }, { "epoch": 22.39199870073625, "grad_norm": 0.38324040174484253, "learning_rate": 4.7760800129926375e-05, "loss": 0.3635, "step": 3309000 }, { "epoch": 22.39538220008662, "grad_norm": 0.3560822904109955, "learning_rate": 4.776046177999134e-05, "loss": 0.3654, "step": 3309500 }, { "epoch": 22.398765699436986, "grad_norm": 0.39798104763031006, "learning_rate": 4.77601234300563e-05, "loss": 0.3626, "step": 3310000 }, { "epoch": 22.402149198787352, "grad_norm": 0.3754647970199585, "learning_rate": 4.775978508012127e-05, "loss": 0.3648, "step": 3310500 }, { "epoch": 22.405532698137723, "grad_norm": 0.3751375675201416, "learning_rate": 4.775944673018623e-05, "loss": 0.3624, "step": 3311000 }, { "epoch": 22.40891619748809, "grad_norm": 0.36246442794799805, "learning_rate": 4.775910838025119e-05, "loss": 0.3642, "step": 3311500 }, { "epoch": 22.41229969683846, "grad_norm": 0.38623011112213135, "learning_rate": 4.7758770030316155e-05, "loss": 0.3638, "step": 3312000 }, { "epoch": 22.415683196188827, "grad_norm": 0.38859832286834717, "learning_rate": 4.7758431680381124e-05, "loss": 0.3633, "step": 3312500 }, { "epoch": 22.419066695539193, "grad_norm": 0.36892005801200867, "learning_rate": 4.7758093330446086e-05, "loss": 0.3625, "step": 3313000 }, { "epoch": 22.422450194889564, "grad_norm": 0.3597000539302826, "learning_rate": 4.775775498051104e-05, "loss": 0.3668, "step": 3313500 }, { "epoch": 22.42583369423993, "grad_norm": 0.32845205068588257, "learning_rate": 4.775741663057601e-05, "loss": 0.3645, "step": 3314000 }, { "epoch": 22.429217193590297, "grad_norm": 0.3746047019958496, "learning_rate": 4.775707828064097e-05, "loss": 0.3642, "step": 3314500 }, { "epoch": 22.432600692940667, "grad_norm": 0.38173362612724304, "learning_rate": 4.7756739930705935e-05, "loss": 0.3638, "step": 3315000 }, { "epoch": 22.435984192291034, "grad_norm": 0.3481561243534088, "learning_rate": 4.77564015807709e-05, "loss": 0.3628, "step": 3315500 }, { "epoch": 22.439367691641404, "grad_norm": 0.3738810122013092, "learning_rate": 4.7756063230835866e-05, "loss": 0.3627, "step": 3316000 }, { "epoch": 22.44275119099177, "grad_norm": 0.35183948278427124, "learning_rate": 4.775572488090083e-05, "loss": 0.3647, "step": 3316500 }, { "epoch": 22.446134690342138, "grad_norm": 0.33670639991760254, "learning_rate": 4.775538653096579e-05, "loss": 0.3643, "step": 3317000 }, { "epoch": 22.44951818969251, "grad_norm": 0.36027422547340393, "learning_rate": 4.775504818103075e-05, "loss": 0.3632, "step": 3317500 }, { "epoch": 22.452901689042875, "grad_norm": 0.3198143541812897, "learning_rate": 4.7754709831095714e-05, "loss": 0.3624, "step": 3318000 }, { "epoch": 22.456285188393245, "grad_norm": 0.3682264983654022, "learning_rate": 4.7754371481160676e-05, "loss": 0.3648, "step": 3318500 }, { "epoch": 22.459668687743612, "grad_norm": 0.3861812353134155, "learning_rate": 4.775403313122564e-05, "loss": 0.3649, "step": 3319000 }, { "epoch": 22.46305218709398, "grad_norm": 0.4011298418045044, "learning_rate": 4.77536947812906e-05, "loss": 0.3647, "step": 3319500 }, { "epoch": 22.46643568644435, "grad_norm": 0.3753091096878052, "learning_rate": 4.775335643135557e-05, "loss": 0.3647, "step": 3320000 }, { "epoch": 22.469819185794716, "grad_norm": 0.3396071791648865, "learning_rate": 4.775301808142053e-05, "loss": 0.3654, "step": 3320500 }, { "epoch": 22.473202685145086, "grad_norm": 0.3864244818687439, "learning_rate": 4.7752679731485494e-05, "loss": 0.3634, "step": 3321000 }, { "epoch": 22.476586184495453, "grad_norm": 0.34095582365989685, "learning_rate": 4.7752341381550456e-05, "loss": 0.3624, "step": 3321500 }, { "epoch": 22.47996968384582, "grad_norm": 0.3696548640727997, "learning_rate": 4.7752003031615425e-05, "loss": 0.3648, "step": 3322000 }, { "epoch": 22.48335318319619, "grad_norm": 0.35707706212997437, "learning_rate": 4.775166468168039e-05, "loss": 0.3626, "step": 3322500 }, { "epoch": 22.486736682546557, "grad_norm": 0.3745119273662567, "learning_rate": 4.775132633174534e-05, "loss": 0.3648, "step": 3323000 }, { "epoch": 22.490120181896923, "grad_norm": 0.3682263493537903, "learning_rate": 4.775098798181031e-05, "loss": 0.3621, "step": 3323500 }, { "epoch": 22.493503681247294, "grad_norm": 0.38439232110977173, "learning_rate": 4.775064963187527e-05, "loss": 0.3629, "step": 3324000 }, { "epoch": 22.49688718059766, "grad_norm": 0.39407601952552795, "learning_rate": 4.7750311281940235e-05, "loss": 0.3632, "step": 3324500 }, { "epoch": 22.50027067994803, "grad_norm": 0.35311228036880493, "learning_rate": 4.77499729320052e-05, "loss": 0.3638, "step": 3325000 }, { "epoch": 22.503654179298398, "grad_norm": 0.3494729995727539, "learning_rate": 4.7749634582070166e-05, "loss": 0.3637, "step": 3325500 }, { "epoch": 22.507037678648764, "grad_norm": 0.335531622171402, "learning_rate": 4.774929623213513e-05, "loss": 0.3643, "step": 3326000 }, { "epoch": 22.510421177999135, "grad_norm": 0.37210074067115784, "learning_rate": 4.774895788220009e-05, "loss": 0.3636, "step": 3326500 }, { "epoch": 22.5138046773495, "grad_norm": 0.3789541721343994, "learning_rate": 4.774861953226505e-05, "loss": 0.3636, "step": 3327000 }, { "epoch": 22.51718817669987, "grad_norm": 0.32330092787742615, "learning_rate": 4.7748281182330015e-05, "loss": 0.3636, "step": 3327500 }, { "epoch": 22.52057167605024, "grad_norm": 0.38846564292907715, "learning_rate": 4.774794283239498e-05, "loss": 0.3647, "step": 3328000 }, { "epoch": 22.523955175400605, "grad_norm": 0.3626689910888672, "learning_rate": 4.774760448245994e-05, "loss": 0.3637, "step": 3328500 }, { "epoch": 22.527338674750975, "grad_norm": 0.3889565169811249, "learning_rate": 4.77472661325249e-05, "loss": 0.3629, "step": 3329000 }, { "epoch": 22.530722174101342, "grad_norm": 0.35343024134635925, "learning_rate": 4.774692778258987e-05, "loss": 0.3635, "step": 3329500 }, { "epoch": 22.534105673451712, "grad_norm": 0.3859838843345642, "learning_rate": 4.774658943265483e-05, "loss": 0.3641, "step": 3330000 }, { "epoch": 22.53748917280208, "grad_norm": 0.38369202613830566, "learning_rate": 4.7746251082719794e-05, "loss": 0.3636, "step": 3330500 }, { "epoch": 22.540872672152446, "grad_norm": 0.34914785623550415, "learning_rate": 4.7745912732784756e-05, "loss": 0.3624, "step": 3331000 }, { "epoch": 22.544256171502816, "grad_norm": 0.41669896245002747, "learning_rate": 4.7745574382849725e-05, "loss": 0.3642, "step": 3331500 }, { "epoch": 22.547639670853183, "grad_norm": 0.3618820309638977, "learning_rate": 4.774523603291469e-05, "loss": 0.364, "step": 3332000 }, { "epoch": 22.55102317020355, "grad_norm": 0.43509700894355774, "learning_rate": 4.774489768297964e-05, "loss": 0.3644, "step": 3332500 }, { "epoch": 22.55440666955392, "grad_norm": 0.37723085284233093, "learning_rate": 4.774455933304461e-05, "loss": 0.3643, "step": 3333000 }, { "epoch": 22.557790168904287, "grad_norm": 0.3719700872898102, "learning_rate": 4.7744220983109574e-05, "loss": 0.3652, "step": 3333500 }, { "epoch": 22.561173668254657, "grad_norm": 0.351824015378952, "learning_rate": 4.7743882633174536e-05, "loss": 0.3631, "step": 3334000 }, { "epoch": 22.564557167605024, "grad_norm": 0.3875691294670105, "learning_rate": 4.77435442832395e-05, "loss": 0.3644, "step": 3334500 }, { "epoch": 22.56794066695539, "grad_norm": 0.36765944957733154, "learning_rate": 4.774320593330447e-05, "loss": 0.3638, "step": 3335000 }, { "epoch": 22.57132416630576, "grad_norm": 0.3525508642196655, "learning_rate": 4.774286758336943e-05, "loss": 0.3647, "step": 3335500 }, { "epoch": 22.574707665656128, "grad_norm": 0.37704595923423767, "learning_rate": 4.774252923343439e-05, "loss": 0.3642, "step": 3336000 }, { "epoch": 22.578091165006498, "grad_norm": 0.3496682345867157, "learning_rate": 4.774219088349935e-05, "loss": 0.3638, "step": 3336500 }, { "epoch": 22.581474664356865, "grad_norm": 0.3411332964897156, "learning_rate": 4.7741852533564315e-05, "loss": 0.3641, "step": 3337000 }, { "epoch": 22.58485816370723, "grad_norm": 0.3448581099510193, "learning_rate": 4.774151418362928e-05, "loss": 0.3637, "step": 3337500 }, { "epoch": 22.5882416630576, "grad_norm": 0.33924153447151184, "learning_rate": 4.774117583369424e-05, "loss": 0.3642, "step": 3338000 }, { "epoch": 22.59162516240797, "grad_norm": 0.3455522358417511, "learning_rate": 4.77408374837592e-05, "loss": 0.364, "step": 3338500 }, { "epoch": 22.595008661758335, "grad_norm": 0.42671269178390503, "learning_rate": 4.774049913382417e-05, "loss": 0.3644, "step": 3339000 }, { "epoch": 22.598392161108706, "grad_norm": 0.32911738753318787, "learning_rate": 4.774016078388913e-05, "loss": 0.364, "step": 3339500 }, { "epoch": 22.601775660459072, "grad_norm": 0.3579391837120056, "learning_rate": 4.7739822433954095e-05, "loss": 0.3634, "step": 3340000 }, { "epoch": 22.605159159809443, "grad_norm": 0.34895071387290955, "learning_rate": 4.773948408401906e-05, "loss": 0.3652, "step": 3340500 }, { "epoch": 22.60854265915981, "grad_norm": 0.3935413956642151, "learning_rate": 4.7739145734084026e-05, "loss": 0.363, "step": 3341000 }, { "epoch": 22.611926158510176, "grad_norm": 0.3797558844089508, "learning_rate": 4.773880738414899e-05, "loss": 0.364, "step": 3341500 }, { "epoch": 22.615309657860546, "grad_norm": 0.35469311475753784, "learning_rate": 4.773846903421394e-05, "loss": 0.3638, "step": 3342000 }, { "epoch": 22.618693157210913, "grad_norm": 0.36201012134552, "learning_rate": 4.773813068427891e-05, "loss": 0.3634, "step": 3342500 }, { "epoch": 22.622076656561283, "grad_norm": 0.3611513078212738, "learning_rate": 4.7737792334343874e-05, "loss": 0.3654, "step": 3343000 }, { "epoch": 22.62546015591165, "grad_norm": 0.4306865930557251, "learning_rate": 4.7737453984408836e-05, "loss": 0.3633, "step": 3343500 }, { "epoch": 22.628843655262017, "grad_norm": 0.3794310390949249, "learning_rate": 4.77371156344738e-05, "loss": 0.3634, "step": 3344000 }, { "epoch": 22.632227154612387, "grad_norm": 0.3547627031803131, "learning_rate": 4.773677728453876e-05, "loss": 0.3635, "step": 3344500 }, { "epoch": 22.635610653962754, "grad_norm": 0.36016029119491577, "learning_rate": 4.773643893460373e-05, "loss": 0.3646, "step": 3345000 }, { "epoch": 22.638994153313124, "grad_norm": 0.3823973536491394, "learning_rate": 4.773610058466869e-05, "loss": 0.3645, "step": 3345500 }, { "epoch": 22.64237765266349, "grad_norm": 0.3654640316963196, "learning_rate": 4.7735762234733654e-05, "loss": 0.3638, "step": 3346000 }, { "epoch": 22.645761152013858, "grad_norm": 0.3841468393802643, "learning_rate": 4.7735423884798616e-05, "loss": 0.3647, "step": 3346500 }, { "epoch": 22.649144651364228, "grad_norm": 0.3642100691795349, "learning_rate": 4.773508553486358e-05, "loss": 0.3642, "step": 3347000 }, { "epoch": 22.652528150714595, "grad_norm": 0.382438600063324, "learning_rate": 4.773474718492854e-05, "loss": 0.3639, "step": 3347500 }, { "epoch": 22.65591165006496, "grad_norm": 0.35496222972869873, "learning_rate": 4.77344088349935e-05, "loss": 0.3635, "step": 3348000 }, { "epoch": 22.659295149415332, "grad_norm": 0.3788909316062927, "learning_rate": 4.773407048505847e-05, "loss": 0.3624, "step": 3348500 }, { "epoch": 22.6626786487657, "grad_norm": 0.3825910687446594, "learning_rate": 4.773373213512343e-05, "loss": 0.365, "step": 3349000 }, { "epoch": 22.66606214811607, "grad_norm": 0.3525485396385193, "learning_rate": 4.7733393785188395e-05, "loss": 0.3657, "step": 3349500 }, { "epoch": 22.669445647466436, "grad_norm": 0.36212408542633057, "learning_rate": 4.773305543525336e-05, "loss": 0.3631, "step": 3350000 }, { "epoch": 22.672829146816802, "grad_norm": 0.34431231021881104, "learning_rate": 4.7732717085318326e-05, "loss": 0.3629, "step": 3350500 }, { "epoch": 22.676212646167173, "grad_norm": 0.36336973309516907, "learning_rate": 4.773237873538329e-05, "loss": 0.3638, "step": 3351000 }, { "epoch": 22.67959614551754, "grad_norm": 0.4125959575176239, "learning_rate": 4.7732040385448244e-05, "loss": 0.3649, "step": 3351500 }, { "epoch": 22.68297964486791, "grad_norm": 0.3445337116718292, "learning_rate": 4.7731702035513206e-05, "loss": 0.3642, "step": 3352000 }, { "epoch": 22.686363144218276, "grad_norm": 0.366926372051239, "learning_rate": 4.7731363685578175e-05, "loss": 0.3624, "step": 3352500 }, { "epoch": 22.689746643568643, "grad_norm": 0.3692866861820221, "learning_rate": 4.773102533564314e-05, "loss": 0.3657, "step": 3353000 }, { "epoch": 22.693130142919014, "grad_norm": 0.32625988125801086, "learning_rate": 4.77306869857081e-05, "loss": 0.3647, "step": 3353500 }, { "epoch": 22.69651364226938, "grad_norm": 0.33408379554748535, "learning_rate": 4.773034863577306e-05, "loss": 0.363, "step": 3354000 }, { "epoch": 22.69989714161975, "grad_norm": 0.39026281237602234, "learning_rate": 4.773001028583803e-05, "loss": 0.3642, "step": 3354500 }, { "epoch": 22.703280640970117, "grad_norm": 0.3677361309528351, "learning_rate": 4.772967193590299e-05, "loss": 0.3653, "step": 3355000 }, { "epoch": 22.706664140320484, "grad_norm": 0.3817291855812073, "learning_rate": 4.7729333585967954e-05, "loss": 0.3637, "step": 3355500 }, { "epoch": 22.710047639670854, "grad_norm": 0.3769967257976532, "learning_rate": 4.7728995236032917e-05, "loss": 0.3641, "step": 3356000 }, { "epoch": 22.71343113902122, "grad_norm": 0.36510559916496277, "learning_rate": 4.772865688609788e-05, "loss": 0.363, "step": 3356500 }, { "epoch": 22.716814638371588, "grad_norm": 0.3357907235622406, "learning_rate": 4.772831853616284e-05, "loss": 0.3638, "step": 3357000 }, { "epoch": 22.720198137721958, "grad_norm": 0.3899648189544678, "learning_rate": 4.77279801862278e-05, "loss": 0.3638, "step": 3357500 }, { "epoch": 22.723581637072325, "grad_norm": 0.3724825084209442, "learning_rate": 4.772764183629277e-05, "loss": 0.3628, "step": 3358000 }, { "epoch": 22.726965136422695, "grad_norm": 0.3447500467300415, "learning_rate": 4.7727303486357734e-05, "loss": 0.3642, "step": 3358500 }, { "epoch": 22.730348635773062, "grad_norm": 0.3528880178928375, "learning_rate": 4.7726965136422696e-05, "loss": 0.364, "step": 3359000 }, { "epoch": 22.73373213512343, "grad_norm": 0.3487587571144104, "learning_rate": 4.772662678648766e-05, "loss": 0.3636, "step": 3359500 }, { "epoch": 22.7371156344738, "grad_norm": 0.3664812445640564, "learning_rate": 4.772628843655263e-05, "loss": 0.3642, "step": 3360000 }, { "epoch": 22.740499133824166, "grad_norm": 0.3146126866340637, "learning_rate": 4.772595008661759e-05, "loss": 0.3632, "step": 3360500 }, { "epoch": 22.743882633174536, "grad_norm": 0.35260069370269775, "learning_rate": 4.7725611736682545e-05, "loss": 0.3657, "step": 3361000 }, { "epoch": 22.747266132524903, "grad_norm": 0.36339813470840454, "learning_rate": 4.772527338674751e-05, "loss": 0.3643, "step": 3361500 }, { "epoch": 22.75064963187527, "grad_norm": 0.3912361264228821, "learning_rate": 4.7724935036812476e-05, "loss": 0.3628, "step": 3362000 }, { "epoch": 22.75403313122564, "grad_norm": 0.34351855516433716, "learning_rate": 4.772459668687744e-05, "loss": 0.3644, "step": 3362500 }, { "epoch": 22.757416630576007, "grad_norm": 0.41395920515060425, "learning_rate": 4.77242583369424e-05, "loss": 0.3644, "step": 3363000 }, { "epoch": 22.760800129926373, "grad_norm": 0.3602873384952545, "learning_rate": 4.772391998700736e-05, "loss": 0.3648, "step": 3363500 }, { "epoch": 22.764183629276744, "grad_norm": 0.36354541778564453, "learning_rate": 4.772358163707233e-05, "loss": 0.3626, "step": 3364000 }, { "epoch": 22.76756712862711, "grad_norm": 0.37120670080184937, "learning_rate": 4.772324328713729e-05, "loss": 0.3644, "step": 3364500 }, { "epoch": 22.77095062797748, "grad_norm": 0.33839091658592224, "learning_rate": 4.7722904937202255e-05, "loss": 0.3636, "step": 3365000 }, { "epoch": 22.774334127327847, "grad_norm": 0.3728088140487671, "learning_rate": 4.772256658726722e-05, "loss": 0.3648, "step": 3365500 }, { "epoch": 22.777717626678214, "grad_norm": 0.35419514775276184, "learning_rate": 4.772222823733218e-05, "loss": 0.3635, "step": 3366000 }, { "epoch": 22.781101126028585, "grad_norm": 0.3702925741672516, "learning_rate": 4.772188988739714e-05, "loss": 0.3637, "step": 3366500 }, { "epoch": 22.78448462537895, "grad_norm": 0.371176540851593, "learning_rate": 4.7721551537462104e-05, "loss": 0.3641, "step": 3367000 }, { "epoch": 22.78786812472932, "grad_norm": 0.3687214255332947, "learning_rate": 4.772121318752707e-05, "loss": 0.3654, "step": 3367500 }, { "epoch": 22.79125162407969, "grad_norm": 0.3298780024051666, "learning_rate": 4.7720874837592035e-05, "loss": 0.3641, "step": 3368000 }, { "epoch": 22.794635123430055, "grad_norm": 0.35434913635253906, "learning_rate": 4.7720536487657e-05, "loss": 0.3653, "step": 3368500 }, { "epoch": 22.798018622780425, "grad_norm": 0.34204936027526855, "learning_rate": 4.772019813772196e-05, "loss": 0.3652, "step": 3369000 }, { "epoch": 22.801402122130792, "grad_norm": 0.3661845624446869, "learning_rate": 4.771985978778693e-05, "loss": 0.364, "step": 3369500 }, { "epoch": 22.804785621481162, "grad_norm": 0.3500819206237793, "learning_rate": 4.771952143785189e-05, "loss": 0.3651, "step": 3370000 }, { "epoch": 22.80816912083153, "grad_norm": 0.4077375829219818, "learning_rate": 4.7719183087916845e-05, "loss": 0.3646, "step": 3370500 }, { "epoch": 22.811552620181896, "grad_norm": 0.38122543692588806, "learning_rate": 4.771884473798181e-05, "loss": 0.3634, "step": 3371000 }, { "epoch": 22.814936119532266, "grad_norm": 0.3645778298377991, "learning_rate": 4.7718506388046776e-05, "loss": 0.3646, "step": 3371500 }, { "epoch": 22.818319618882633, "grad_norm": 0.4079326093196869, "learning_rate": 4.771816803811174e-05, "loss": 0.3662, "step": 3372000 }, { "epoch": 22.821703118233, "grad_norm": 0.34691333770751953, "learning_rate": 4.77178296881767e-05, "loss": 0.3661, "step": 3372500 }, { "epoch": 22.82508661758337, "grad_norm": 0.3471761643886566, "learning_rate": 4.771749133824166e-05, "loss": 0.3644, "step": 3373000 }, { "epoch": 22.828470116933737, "grad_norm": 0.4043777585029602, "learning_rate": 4.771715298830663e-05, "loss": 0.3651, "step": 3373500 }, { "epoch": 22.831853616284107, "grad_norm": 0.41398492455482483, "learning_rate": 4.7716814638371594e-05, "loss": 0.3646, "step": 3374000 }, { "epoch": 22.835237115634474, "grad_norm": 0.3764701783657074, "learning_rate": 4.7716476288436556e-05, "loss": 0.3656, "step": 3374500 }, { "epoch": 22.83862061498484, "grad_norm": 0.3379947543144226, "learning_rate": 4.771613793850152e-05, "loss": 0.3642, "step": 3375000 }, { "epoch": 22.84200411433521, "grad_norm": 0.39115238189697266, "learning_rate": 4.771579958856648e-05, "loss": 0.3634, "step": 3375500 }, { "epoch": 22.845387613685578, "grad_norm": 0.37695080041885376, "learning_rate": 4.771546123863144e-05, "loss": 0.3625, "step": 3376000 }, { "epoch": 22.848771113035948, "grad_norm": 0.3427785038948059, "learning_rate": 4.7715122888696404e-05, "loss": 0.3654, "step": 3376500 }, { "epoch": 22.852154612386315, "grad_norm": 0.3660113215446472, "learning_rate": 4.771478453876137e-05, "loss": 0.3646, "step": 3377000 }, { "epoch": 22.85553811173668, "grad_norm": 0.4021322429180145, "learning_rate": 4.7714446188826335e-05, "loss": 0.3644, "step": 3377500 }, { "epoch": 22.85892161108705, "grad_norm": 0.3710779547691345, "learning_rate": 4.77141078388913e-05, "loss": 0.365, "step": 3378000 }, { "epoch": 22.86230511043742, "grad_norm": 0.38344478607177734, "learning_rate": 4.771376948895626e-05, "loss": 0.3628, "step": 3378500 }, { "epoch": 22.86568860978779, "grad_norm": 0.3572565019130707, "learning_rate": 4.771343113902123e-05, "loss": 0.3642, "step": 3379000 }, { "epoch": 22.869072109138155, "grad_norm": 0.3629647493362427, "learning_rate": 4.771309278908619e-05, "loss": 0.3644, "step": 3379500 }, { "epoch": 22.872455608488522, "grad_norm": 0.3336074948310852, "learning_rate": 4.771275443915115e-05, "loss": 0.3648, "step": 3380000 }, { "epoch": 22.875839107838893, "grad_norm": 0.388048380613327, "learning_rate": 4.771241608921611e-05, "loss": 0.3651, "step": 3380500 }, { "epoch": 22.87922260718926, "grad_norm": 0.3633332848548889, "learning_rate": 4.771207773928108e-05, "loss": 0.3647, "step": 3381000 }, { "epoch": 22.882606106539626, "grad_norm": 0.29492759704589844, "learning_rate": 4.771173938934604e-05, "loss": 0.3648, "step": 3381500 }, { "epoch": 22.885989605889996, "grad_norm": 0.364529013633728, "learning_rate": 4.7711401039411e-05, "loss": 0.3629, "step": 3382000 }, { "epoch": 22.889373105240363, "grad_norm": 0.3595414459705353, "learning_rate": 4.771106268947596e-05, "loss": 0.3644, "step": 3382500 }, { "epoch": 22.892756604590733, "grad_norm": 0.37099790573120117, "learning_rate": 4.771072433954093e-05, "loss": 0.3648, "step": 3383000 }, { "epoch": 22.8961401039411, "grad_norm": 0.3630152940750122, "learning_rate": 4.7710385989605894e-05, "loss": 0.3655, "step": 3383500 }, { "epoch": 22.899523603291467, "grad_norm": 0.33306482434272766, "learning_rate": 4.7710047639670856e-05, "loss": 0.3654, "step": 3384000 }, { "epoch": 22.902907102641837, "grad_norm": 0.37848731875419617, "learning_rate": 4.770970928973582e-05, "loss": 0.3629, "step": 3384500 }, { "epoch": 22.906290601992204, "grad_norm": 0.35071754455566406, "learning_rate": 4.770937093980078e-05, "loss": 0.3645, "step": 3385000 }, { "epoch": 22.909674101342574, "grad_norm": 0.35632824897766113, "learning_rate": 4.770903258986574e-05, "loss": 0.3641, "step": 3385500 }, { "epoch": 22.91305760069294, "grad_norm": 0.37024572491645813, "learning_rate": 4.7708694239930705e-05, "loss": 0.3652, "step": 3386000 }, { "epoch": 22.916441100043308, "grad_norm": 0.3406110405921936, "learning_rate": 4.7708355889995674e-05, "loss": 0.3642, "step": 3386500 }, { "epoch": 22.919824599393678, "grad_norm": 0.36239370703697205, "learning_rate": 4.7708017540060636e-05, "loss": 0.3643, "step": 3387000 }, { "epoch": 22.923208098744045, "grad_norm": 0.35012954473495483, "learning_rate": 4.77076791901256e-05, "loss": 0.3647, "step": 3387500 }, { "epoch": 22.92659159809441, "grad_norm": 0.44089409708976746, "learning_rate": 4.770734084019056e-05, "loss": 0.3651, "step": 3388000 }, { "epoch": 22.929975097444782, "grad_norm": 0.3752581775188446, "learning_rate": 4.770700249025553e-05, "loss": 0.3644, "step": 3388500 }, { "epoch": 22.93335859679515, "grad_norm": 0.3997404873371124, "learning_rate": 4.770666414032049e-05, "loss": 0.3656, "step": 3389000 }, { "epoch": 22.93674209614552, "grad_norm": 0.3542211949825287, "learning_rate": 4.770632579038545e-05, "loss": 0.366, "step": 3389500 }, { "epoch": 22.940125595495886, "grad_norm": 0.34117844700813293, "learning_rate": 4.770598744045041e-05, "loss": 0.3633, "step": 3390000 }, { "epoch": 22.943509094846252, "grad_norm": 0.36265915632247925, "learning_rate": 4.770564909051538e-05, "loss": 0.3646, "step": 3390500 }, { "epoch": 22.946892594196623, "grad_norm": 0.3219507932662964, "learning_rate": 4.770531074058034e-05, "loss": 0.3644, "step": 3391000 }, { "epoch": 22.95027609354699, "grad_norm": 0.36303895711898804, "learning_rate": 4.77049723906453e-05, "loss": 0.3638, "step": 3391500 }, { "epoch": 22.95365959289736, "grad_norm": 0.3494533896446228, "learning_rate": 4.7704634040710264e-05, "loss": 0.3628, "step": 3392000 }, { "epoch": 22.957043092247726, "grad_norm": 0.39607173204421997, "learning_rate": 4.770429569077523e-05, "loss": 0.3634, "step": 3392500 }, { "epoch": 22.960426591598093, "grad_norm": 0.38334110379219055, "learning_rate": 4.7703957340840195e-05, "loss": 0.3638, "step": 3393000 }, { "epoch": 22.963810090948463, "grad_norm": 0.3406641185283661, "learning_rate": 4.770361899090516e-05, "loss": 0.3642, "step": 3393500 }, { "epoch": 22.96719359029883, "grad_norm": 0.3994218707084656, "learning_rate": 4.770328064097012e-05, "loss": 0.3646, "step": 3394000 }, { "epoch": 22.970577089649197, "grad_norm": 0.362558513879776, "learning_rate": 4.770294229103508e-05, "loss": 0.363, "step": 3394500 }, { "epoch": 22.973960588999567, "grad_norm": 0.3812117874622345, "learning_rate": 4.770260394110004e-05, "loss": 0.3634, "step": 3395000 }, { "epoch": 22.977344088349934, "grad_norm": 0.3542080223560333, "learning_rate": 4.7702265591165005e-05, "loss": 0.3635, "step": 3395500 }, { "epoch": 22.980727587700304, "grad_norm": 0.3493613600730896, "learning_rate": 4.7701927241229974e-05, "loss": 0.3647, "step": 3396000 }, { "epoch": 22.98411108705067, "grad_norm": 0.38243380188941956, "learning_rate": 4.7701588891294936e-05, "loss": 0.3648, "step": 3396500 }, { "epoch": 22.987494586401038, "grad_norm": 0.366038054227829, "learning_rate": 4.77012505413599e-05, "loss": 0.3638, "step": 3397000 }, { "epoch": 22.990878085751408, "grad_norm": 0.36321356892585754, "learning_rate": 4.770091219142486e-05, "loss": 0.3632, "step": 3397500 }, { "epoch": 22.994261585101775, "grad_norm": 0.33257806301116943, "learning_rate": 4.770057384148983e-05, "loss": 0.3637, "step": 3398000 }, { "epoch": 22.997645084452145, "grad_norm": 0.38097190856933594, "learning_rate": 4.770023549155479e-05, "loss": 0.3655, "step": 3398500 }, { "epoch": 23.0, "eval_accuracy": 0.8612532908124285, "eval_loss": 0.5633518099784851, "eval_runtime": 3398.3459, "eval_samples_per_second": 85.555, "eval_steps_per_second": 5.347, "step": 3398848 }, { "epoch": 23.001028583802512, "grad_norm": 0.37000569701194763, "learning_rate": 4.7699897141619754e-05, "loss": 0.3635, "step": 3399000 }, { "epoch": 23.00441208315288, "grad_norm": 0.3592061698436737, "learning_rate": 4.769955879168471e-05, "loss": 0.36, "step": 3399500 }, { "epoch": 23.00779558250325, "grad_norm": 0.40653616189956665, "learning_rate": 4.769922044174968e-05, "loss": 0.3636, "step": 3400000 }, { "epoch": 23.011179081853616, "grad_norm": 0.3927628993988037, "learning_rate": 4.769888209181464e-05, "loss": 0.3624, "step": 3400500 }, { "epoch": 23.014562581203986, "grad_norm": 0.35245293378829956, "learning_rate": 4.76985437418796e-05, "loss": 0.3619, "step": 3401000 }, { "epoch": 23.017946080554353, "grad_norm": 0.42338356375694275, "learning_rate": 4.7698205391944564e-05, "loss": 0.3625, "step": 3401500 }, { "epoch": 23.02132957990472, "grad_norm": 0.366515189409256, "learning_rate": 4.769786704200953e-05, "loss": 0.3615, "step": 3402000 }, { "epoch": 23.02471307925509, "grad_norm": 0.3612120449542999, "learning_rate": 4.7697528692074495e-05, "loss": 0.3633, "step": 3402500 }, { "epoch": 23.028096578605457, "grad_norm": 0.36426788568496704, "learning_rate": 4.769719034213946e-05, "loss": 0.3622, "step": 3403000 }, { "epoch": 23.031480077955823, "grad_norm": 0.3689402937889099, "learning_rate": 4.769685199220442e-05, "loss": 0.3619, "step": 3403500 }, { "epoch": 23.034863577306194, "grad_norm": 0.36793333292007446, "learning_rate": 4.769651364226938e-05, "loss": 0.3625, "step": 3404000 }, { "epoch": 23.03824707665656, "grad_norm": 0.3756435811519623, "learning_rate": 4.7696175292334344e-05, "loss": 0.3626, "step": 3404500 }, { "epoch": 23.04163057600693, "grad_norm": 0.404081791639328, "learning_rate": 4.7695836942399306e-05, "loss": 0.3626, "step": 3405000 }, { "epoch": 23.045014075357297, "grad_norm": 0.3988695740699768, "learning_rate": 4.7695498592464275e-05, "loss": 0.3638, "step": 3405500 }, { "epoch": 23.048397574707664, "grad_norm": 0.35908782482147217, "learning_rate": 4.769516024252924e-05, "loss": 0.3634, "step": 3406000 }, { "epoch": 23.051781074058034, "grad_norm": 0.35598117113113403, "learning_rate": 4.76948218925942e-05, "loss": 0.3629, "step": 3406500 }, { "epoch": 23.0551645734084, "grad_norm": 0.3767651319503784, "learning_rate": 4.769448354265916e-05, "loss": 0.3632, "step": 3407000 }, { "epoch": 23.05854807275877, "grad_norm": 0.40991491079330444, "learning_rate": 4.7694145192724123e-05, "loss": 0.3629, "step": 3407500 }, { "epoch": 23.06193157210914, "grad_norm": 0.379182904958725, "learning_rate": 4.769380684278909e-05, "loss": 0.3632, "step": 3408000 }, { "epoch": 23.065315071459505, "grad_norm": 0.35139524936676025, "learning_rate": 4.7693468492854054e-05, "loss": 0.3632, "step": 3408500 }, { "epoch": 23.068698570809875, "grad_norm": 0.36326470971107483, "learning_rate": 4.769313014291901e-05, "loss": 0.3631, "step": 3409000 }, { "epoch": 23.072082070160242, "grad_norm": 0.36097511649131775, "learning_rate": 4.769279179298398e-05, "loss": 0.3641, "step": 3409500 }, { "epoch": 23.075465569510612, "grad_norm": 0.3503509759902954, "learning_rate": 4.769245344304894e-05, "loss": 0.3628, "step": 3410000 }, { "epoch": 23.07884906886098, "grad_norm": 0.40592440962791443, "learning_rate": 4.76921150931139e-05, "loss": 0.3622, "step": 3410500 }, { "epoch": 23.082232568211346, "grad_norm": 0.3593161702156067, "learning_rate": 4.7691776743178865e-05, "loss": 0.3619, "step": 3411000 }, { "epoch": 23.085616067561716, "grad_norm": 0.3567400276660919, "learning_rate": 4.7691438393243834e-05, "loss": 0.3609, "step": 3411500 }, { "epoch": 23.088999566912083, "grad_norm": 0.3718048632144928, "learning_rate": 4.7691100043308796e-05, "loss": 0.3619, "step": 3412000 }, { "epoch": 23.09238306626245, "grad_norm": 0.38009029626846313, "learning_rate": 4.769076169337376e-05, "loss": 0.362, "step": 3412500 }, { "epoch": 23.09576656561282, "grad_norm": 0.35412073135375977, "learning_rate": 4.769042334343872e-05, "loss": 0.3632, "step": 3413000 }, { "epoch": 23.099150064963187, "grad_norm": 0.37574461102485657, "learning_rate": 4.769008499350368e-05, "loss": 0.3618, "step": 3413500 }, { "epoch": 23.102533564313557, "grad_norm": 0.3682039678096771, "learning_rate": 4.7689746643568645e-05, "loss": 0.3608, "step": 3414000 }, { "epoch": 23.105917063663924, "grad_norm": 0.37198999524116516, "learning_rate": 4.768940829363361e-05, "loss": 0.3629, "step": 3414500 }, { "epoch": 23.10930056301429, "grad_norm": 0.3804379999637604, "learning_rate": 4.768906994369857e-05, "loss": 0.3624, "step": 3415000 }, { "epoch": 23.11268406236466, "grad_norm": 0.37129905819892883, "learning_rate": 4.768873159376354e-05, "loss": 0.3636, "step": 3415500 }, { "epoch": 23.116067561715028, "grad_norm": 0.384331613779068, "learning_rate": 4.76883932438285e-05, "loss": 0.3618, "step": 3416000 }, { "epoch": 23.119451061065398, "grad_norm": 0.36952680349349976, "learning_rate": 4.768805489389346e-05, "loss": 0.3618, "step": 3416500 }, { "epoch": 23.122834560415765, "grad_norm": 0.3988458514213562, "learning_rate": 4.7687716543958424e-05, "loss": 0.3627, "step": 3417000 }, { "epoch": 23.12621805976613, "grad_norm": 0.3945951759815216, "learning_rate": 4.768737819402339e-05, "loss": 0.3628, "step": 3417500 }, { "epoch": 23.1296015591165, "grad_norm": 0.39166146516799927, "learning_rate": 4.7687039844088355e-05, "loss": 0.3632, "step": 3418000 }, { "epoch": 23.13298505846687, "grad_norm": 0.3587402403354645, "learning_rate": 4.768670149415331e-05, "loss": 0.3623, "step": 3418500 }, { "epoch": 23.136368557817235, "grad_norm": 0.3880244195461273, "learning_rate": 4.768636314421828e-05, "loss": 0.3624, "step": 3419000 }, { "epoch": 23.139752057167605, "grad_norm": 0.3770838975906372, "learning_rate": 4.768602479428324e-05, "loss": 0.3629, "step": 3419500 }, { "epoch": 23.143135556517972, "grad_norm": 0.35299721360206604, "learning_rate": 4.7685686444348204e-05, "loss": 0.3629, "step": 3420000 }, { "epoch": 23.146519055868342, "grad_norm": 0.37510251998901367, "learning_rate": 4.7685348094413166e-05, "loss": 0.3639, "step": 3420500 }, { "epoch": 23.14990255521871, "grad_norm": 0.3695957362651825, "learning_rate": 4.7685009744478135e-05, "loss": 0.3609, "step": 3421000 }, { "epoch": 23.153286054569076, "grad_norm": 0.4037812650203705, "learning_rate": 4.76846713945431e-05, "loss": 0.3623, "step": 3421500 }, { "epoch": 23.156669553919446, "grad_norm": 0.3722400963306427, "learning_rate": 4.768433304460806e-05, "loss": 0.3625, "step": 3422000 }, { "epoch": 23.160053053269813, "grad_norm": 0.3467889428138733, "learning_rate": 4.768399469467302e-05, "loss": 0.3619, "step": 3422500 }, { "epoch": 23.163436552620183, "grad_norm": 0.4032399356365204, "learning_rate": 4.768365634473798e-05, "loss": 0.3639, "step": 3423000 }, { "epoch": 23.16682005197055, "grad_norm": 0.3691263794898987, "learning_rate": 4.7683317994802945e-05, "loss": 0.3634, "step": 3423500 }, { "epoch": 23.170203551320917, "grad_norm": 0.3949434459209442, "learning_rate": 4.768297964486791e-05, "loss": 0.3638, "step": 3424000 }, { "epoch": 23.173587050671287, "grad_norm": 0.3885261118412018, "learning_rate": 4.768264129493287e-05, "loss": 0.3636, "step": 3424500 }, { "epoch": 23.176970550021654, "grad_norm": 0.3906368017196655, "learning_rate": 4.768230294499784e-05, "loss": 0.364, "step": 3425000 }, { "epoch": 23.180354049372024, "grad_norm": 0.3325451910495758, "learning_rate": 4.76819645950628e-05, "loss": 0.3631, "step": 3425500 }, { "epoch": 23.18373754872239, "grad_norm": 0.3635040819644928, "learning_rate": 4.768162624512776e-05, "loss": 0.362, "step": 3426000 }, { "epoch": 23.187121048072758, "grad_norm": 0.35502421855926514, "learning_rate": 4.7681287895192725e-05, "loss": 0.362, "step": 3426500 }, { "epoch": 23.190504547423128, "grad_norm": 0.384591281414032, "learning_rate": 4.7680949545257694e-05, "loss": 0.364, "step": 3427000 }, { "epoch": 23.193888046773495, "grad_norm": 0.37069278955459595, "learning_rate": 4.7680611195322656e-05, "loss": 0.3633, "step": 3427500 }, { "epoch": 23.19727154612386, "grad_norm": 0.37285947799682617, "learning_rate": 4.768027284538761e-05, "loss": 0.3645, "step": 3428000 }, { "epoch": 23.20065504547423, "grad_norm": 0.37554123997688293, "learning_rate": 4.767993449545258e-05, "loss": 0.363, "step": 3428500 }, { "epoch": 23.2040385448246, "grad_norm": 0.4136617183685303, "learning_rate": 4.767959614551754e-05, "loss": 0.3638, "step": 3429000 }, { "epoch": 23.20742204417497, "grad_norm": 0.3720233738422394, "learning_rate": 4.7679257795582504e-05, "loss": 0.3635, "step": 3429500 }, { "epoch": 23.210805543525336, "grad_norm": 0.393037885427475, "learning_rate": 4.7678919445647466e-05, "loss": 0.3631, "step": 3430000 }, { "epoch": 23.214189042875702, "grad_norm": 0.3909498155117035, "learning_rate": 4.7678581095712435e-05, "loss": 0.363, "step": 3430500 }, { "epoch": 23.217572542226073, "grad_norm": 0.4048287570476532, "learning_rate": 4.76782427457774e-05, "loss": 0.3637, "step": 3431000 }, { "epoch": 23.22095604157644, "grad_norm": 0.33996883034706116, "learning_rate": 4.767790439584236e-05, "loss": 0.3618, "step": 3431500 }, { "epoch": 23.22433954092681, "grad_norm": 0.33059725165367126, "learning_rate": 4.767756604590732e-05, "loss": 0.364, "step": 3432000 }, { "epoch": 23.227723040277176, "grad_norm": 0.370561808347702, "learning_rate": 4.7677227695972284e-05, "loss": 0.362, "step": 3432500 }, { "epoch": 23.231106539627543, "grad_norm": 0.35351186990737915, "learning_rate": 4.7676889346037246e-05, "loss": 0.3622, "step": 3433000 }, { "epoch": 23.234490038977913, "grad_norm": 0.38210129737854004, "learning_rate": 4.767655099610221e-05, "loss": 0.3645, "step": 3433500 }, { "epoch": 23.23787353832828, "grad_norm": 0.40298372507095337, "learning_rate": 4.767621264616717e-05, "loss": 0.3631, "step": 3434000 }, { "epoch": 23.24125703767865, "grad_norm": 0.4220386743545532, "learning_rate": 4.767587429623214e-05, "loss": 0.3621, "step": 3434500 }, { "epoch": 23.244640537029017, "grad_norm": 0.38947874307632446, "learning_rate": 4.76755359462971e-05, "loss": 0.3633, "step": 3435000 }, { "epoch": 23.248024036379384, "grad_norm": 0.35837265849113464, "learning_rate": 4.767519759636206e-05, "loss": 0.3614, "step": 3435500 }, { "epoch": 23.251407535729754, "grad_norm": 0.36074700951576233, "learning_rate": 4.7674859246427025e-05, "loss": 0.3643, "step": 3436000 }, { "epoch": 23.25479103508012, "grad_norm": 0.31319719552993774, "learning_rate": 4.7674520896491994e-05, "loss": 0.3633, "step": 3436500 }, { "epoch": 23.258174534430488, "grad_norm": 0.36703911423683167, "learning_rate": 4.7674182546556956e-05, "loss": 0.3628, "step": 3437000 }, { "epoch": 23.261558033780858, "grad_norm": 0.3402256965637207, "learning_rate": 4.767384419662191e-05, "loss": 0.3626, "step": 3437500 }, { "epoch": 23.264941533131225, "grad_norm": 0.3701879680156708, "learning_rate": 4.767350584668688e-05, "loss": 0.3615, "step": 3438000 }, { "epoch": 23.268325032481595, "grad_norm": 0.4011252820491791, "learning_rate": 4.767316749675184e-05, "loss": 0.363, "step": 3438500 }, { "epoch": 23.271708531831962, "grad_norm": 0.3824266791343689, "learning_rate": 4.7672829146816805e-05, "loss": 0.3635, "step": 3439000 }, { "epoch": 23.27509203118233, "grad_norm": 0.4016554653644562, "learning_rate": 4.767249079688177e-05, "loss": 0.3646, "step": 3439500 }, { "epoch": 23.2784755305327, "grad_norm": 0.3987918794155121, "learning_rate": 4.7672152446946736e-05, "loss": 0.3624, "step": 3440000 }, { "epoch": 23.281859029883066, "grad_norm": 0.3824206590652466, "learning_rate": 4.76718140970117e-05, "loss": 0.363, "step": 3440500 }, { "epoch": 23.285242529233436, "grad_norm": 0.33757832646369934, "learning_rate": 4.767147574707666e-05, "loss": 0.363, "step": 3441000 }, { "epoch": 23.288626028583803, "grad_norm": 0.3646239936351776, "learning_rate": 4.767113739714162e-05, "loss": 0.3629, "step": 3441500 }, { "epoch": 23.29200952793417, "grad_norm": 0.37561193108558655, "learning_rate": 4.767079904720659e-05, "loss": 0.3615, "step": 3442000 }, { "epoch": 23.29539302728454, "grad_norm": 0.3657933473587036, "learning_rate": 4.7670460697271546e-05, "loss": 0.3649, "step": 3442500 }, { "epoch": 23.298776526634907, "grad_norm": 0.33516716957092285, "learning_rate": 4.767012234733651e-05, "loss": 0.3641, "step": 3443000 }, { "epoch": 23.302160025985273, "grad_norm": 0.35404688119888306, "learning_rate": 4.766978399740147e-05, "loss": 0.3638, "step": 3443500 }, { "epoch": 23.305543525335644, "grad_norm": 0.3707813620567322, "learning_rate": 4.766944564746644e-05, "loss": 0.3618, "step": 3444000 }, { "epoch": 23.30892702468601, "grad_norm": 0.3737035393714905, "learning_rate": 4.76691072975314e-05, "loss": 0.3622, "step": 3444500 }, { "epoch": 23.31231052403638, "grad_norm": 0.37302887439727783, "learning_rate": 4.7668768947596364e-05, "loss": 0.3626, "step": 3445000 }, { "epoch": 23.315694023386747, "grad_norm": 0.349115788936615, "learning_rate": 4.7668430597661326e-05, "loss": 0.3639, "step": 3445500 }, { "epoch": 23.319077522737114, "grad_norm": 0.3590487837791443, "learning_rate": 4.7668092247726295e-05, "loss": 0.3629, "step": 3446000 }, { "epoch": 23.322461022087484, "grad_norm": 0.35418424010276794, "learning_rate": 4.766775389779126e-05, "loss": 0.3617, "step": 3446500 }, { "epoch": 23.32584452143785, "grad_norm": 0.3744847774505615, "learning_rate": 4.766741554785621e-05, "loss": 0.3629, "step": 3447000 }, { "epoch": 23.32922802078822, "grad_norm": 0.3934074342250824, "learning_rate": 4.766707719792118e-05, "loss": 0.3632, "step": 3447500 }, { "epoch": 23.332611520138588, "grad_norm": 0.3964332342147827, "learning_rate": 4.766673884798614e-05, "loss": 0.3636, "step": 3448000 }, { "epoch": 23.335995019488955, "grad_norm": 0.37640121579170227, "learning_rate": 4.7666400498051106e-05, "loss": 0.3635, "step": 3448500 }, { "epoch": 23.339378518839325, "grad_norm": 0.43194273114204407, "learning_rate": 4.766606214811607e-05, "loss": 0.3624, "step": 3449000 }, { "epoch": 23.342762018189692, "grad_norm": 0.35745880007743835, "learning_rate": 4.7665723798181037e-05, "loss": 0.3646, "step": 3449500 }, { "epoch": 23.346145517540062, "grad_norm": 0.4166637659072876, "learning_rate": 4.7665385448246e-05, "loss": 0.3635, "step": 3450000 }, { "epoch": 23.34952901689043, "grad_norm": 0.3475643992424011, "learning_rate": 4.766504709831096e-05, "loss": 0.3634, "step": 3450500 }, { "epoch": 23.352912516240796, "grad_norm": 0.4426576793193817, "learning_rate": 4.766470874837592e-05, "loss": 0.3645, "step": 3451000 }, { "epoch": 23.356296015591166, "grad_norm": 0.3714113235473633, "learning_rate": 4.766437039844089e-05, "loss": 0.3638, "step": 3451500 }, { "epoch": 23.359679514941533, "grad_norm": 0.37920984625816345, "learning_rate": 4.766403204850585e-05, "loss": 0.3631, "step": 3452000 }, { "epoch": 23.3630630142919, "grad_norm": 0.35684266686439514, "learning_rate": 4.766369369857081e-05, "loss": 0.364, "step": 3452500 }, { "epoch": 23.36644651364227, "grad_norm": 0.37371116876602173, "learning_rate": 4.766335534863577e-05, "loss": 0.3623, "step": 3453000 }, { "epoch": 23.369830012992637, "grad_norm": 0.37027162313461304, "learning_rate": 4.766301699870074e-05, "loss": 0.3634, "step": 3453500 }, { "epoch": 23.373213512343007, "grad_norm": 0.3794066309928894, "learning_rate": 4.76626786487657e-05, "loss": 0.3634, "step": 3454000 }, { "epoch": 23.376597011693374, "grad_norm": 0.3557523787021637, "learning_rate": 4.7662340298830665e-05, "loss": 0.3638, "step": 3454500 }, { "epoch": 23.37998051104374, "grad_norm": 0.371554970741272, "learning_rate": 4.766200194889563e-05, "loss": 0.3638, "step": 3455000 }, { "epoch": 23.38336401039411, "grad_norm": 0.35758981108665466, "learning_rate": 4.7661663598960596e-05, "loss": 0.3615, "step": 3455500 }, { "epoch": 23.386747509744477, "grad_norm": 0.37202003598213196, "learning_rate": 4.766132524902556e-05, "loss": 0.3643, "step": 3456000 }, { "epoch": 23.390131009094848, "grad_norm": 0.37054094672203064, "learning_rate": 4.766098689909051e-05, "loss": 0.365, "step": 3456500 }, { "epoch": 23.393514508445215, "grad_norm": 0.35302260518074036, "learning_rate": 4.766064854915548e-05, "loss": 0.3629, "step": 3457000 }, { "epoch": 23.39689800779558, "grad_norm": 0.3512248396873474, "learning_rate": 4.7660310199220444e-05, "loss": 0.364, "step": 3457500 }, { "epoch": 23.40028150714595, "grad_norm": 0.377338707447052, "learning_rate": 4.7659971849285406e-05, "loss": 0.3658, "step": 3458000 }, { "epoch": 23.40366500649632, "grad_norm": 0.36699292063713074, "learning_rate": 4.765963349935037e-05, "loss": 0.3649, "step": 3458500 }, { "epoch": 23.40704850584669, "grad_norm": 0.4169010519981384, "learning_rate": 4.765929514941534e-05, "loss": 0.3634, "step": 3459000 }, { "epoch": 23.410432005197055, "grad_norm": 0.35843947529792786, "learning_rate": 4.76589567994803e-05, "loss": 0.3622, "step": 3459500 }, { "epoch": 23.413815504547422, "grad_norm": 0.3619913160800934, "learning_rate": 4.765861844954526e-05, "loss": 0.3638, "step": 3460000 }, { "epoch": 23.417199003897792, "grad_norm": 0.3665013611316681, "learning_rate": 4.7658280099610224e-05, "loss": 0.3611, "step": 3460500 }, { "epoch": 23.42058250324816, "grad_norm": 0.3594473600387573, "learning_rate": 4.7657941749675186e-05, "loss": 0.3648, "step": 3461000 }, { "epoch": 23.423966002598526, "grad_norm": 0.3695002496242523, "learning_rate": 4.765760339974015e-05, "loss": 0.3643, "step": 3461500 }, { "epoch": 23.427349501948896, "grad_norm": 0.4056394398212433, "learning_rate": 4.765726504980511e-05, "loss": 0.3632, "step": 3462000 }, { "epoch": 23.430733001299263, "grad_norm": 0.3681444823741913, "learning_rate": 4.765692669987007e-05, "loss": 0.364, "step": 3462500 }, { "epoch": 23.434116500649633, "grad_norm": 0.3586125671863556, "learning_rate": 4.765658834993504e-05, "loss": 0.363, "step": 3463000 }, { "epoch": 23.4375, "grad_norm": 0.39215049147605896, "learning_rate": 4.765625e-05, "loss": 0.3635, "step": 3463500 }, { "epoch": 23.440883499350367, "grad_norm": 0.32525262236595154, "learning_rate": 4.7655911650064965e-05, "loss": 0.3625, "step": 3464000 }, { "epoch": 23.444266998700737, "grad_norm": 0.4453783333301544, "learning_rate": 4.765557330012993e-05, "loss": 0.363, "step": 3464500 }, { "epoch": 23.447650498051104, "grad_norm": 0.39107051491737366, "learning_rate": 4.7655234950194896e-05, "loss": 0.3653, "step": 3465000 }, { "epoch": 23.451033997401474, "grad_norm": 0.36332428455352783, "learning_rate": 4.765489660025986e-05, "loss": 0.3638, "step": 3465500 }, { "epoch": 23.45441749675184, "grad_norm": 0.3637053966522217, "learning_rate": 4.7654558250324814e-05, "loss": 0.3627, "step": 3466000 }, { "epoch": 23.457800996102208, "grad_norm": 0.3457685708999634, "learning_rate": 4.765421990038978e-05, "loss": 0.3635, "step": 3466500 }, { "epoch": 23.461184495452578, "grad_norm": 0.38191792368888855, "learning_rate": 4.7653881550454745e-05, "loss": 0.3634, "step": 3467000 }, { "epoch": 23.464567994802945, "grad_norm": 0.39992186427116394, "learning_rate": 4.765354320051971e-05, "loss": 0.3645, "step": 3467500 }, { "epoch": 23.46795149415331, "grad_norm": 0.4218098223209381, "learning_rate": 4.765320485058467e-05, "loss": 0.3644, "step": 3468000 }, { "epoch": 23.47133499350368, "grad_norm": 0.3750041425228119, "learning_rate": 4.765286650064964e-05, "loss": 0.3642, "step": 3468500 }, { "epoch": 23.47471849285405, "grad_norm": 0.39663100242614746, "learning_rate": 4.76525281507146e-05, "loss": 0.3635, "step": 3469000 }, { "epoch": 23.47810199220442, "grad_norm": 0.42526987195014954, "learning_rate": 4.765218980077956e-05, "loss": 0.3639, "step": 3469500 }, { "epoch": 23.481485491554785, "grad_norm": 0.353274941444397, "learning_rate": 4.7651851450844524e-05, "loss": 0.3641, "step": 3470000 }, { "epoch": 23.484868990905152, "grad_norm": 0.3648282289505005, "learning_rate": 4.7651513100909486e-05, "loss": 0.3644, "step": 3470500 }, { "epoch": 23.488252490255523, "grad_norm": 0.40002763271331787, "learning_rate": 4.765117475097445e-05, "loss": 0.364, "step": 3471000 }, { "epoch": 23.49163598960589, "grad_norm": 0.36751842498779297, "learning_rate": 4.765083640103941e-05, "loss": 0.3628, "step": 3471500 }, { "epoch": 23.49501948895626, "grad_norm": 0.3347572684288025, "learning_rate": 4.765049805110437e-05, "loss": 0.3622, "step": 3472000 }, { "epoch": 23.498402988306626, "grad_norm": 0.35500243306159973, "learning_rate": 4.765015970116934e-05, "loss": 0.3639, "step": 3472500 }, { "epoch": 23.501786487656993, "grad_norm": 0.3748621344566345, "learning_rate": 4.7649821351234304e-05, "loss": 0.3635, "step": 3473000 }, { "epoch": 23.505169987007363, "grad_norm": 0.3916022777557373, "learning_rate": 4.7649483001299266e-05, "loss": 0.3642, "step": 3473500 }, { "epoch": 23.50855348635773, "grad_norm": 0.3466847240924835, "learning_rate": 4.764914465136423e-05, "loss": 0.3636, "step": 3474000 }, { "epoch": 23.5119369857081, "grad_norm": 0.3665750026702881, "learning_rate": 4.76488063014292e-05, "loss": 0.3639, "step": 3474500 }, { "epoch": 23.515320485058467, "grad_norm": 0.4211607277393341, "learning_rate": 4.764846795149416e-05, "loss": 0.3632, "step": 3475000 }, { "epoch": 23.518703984408834, "grad_norm": 0.3728439509868622, "learning_rate": 4.7648129601559114e-05, "loss": 0.3639, "step": 3475500 }, { "epoch": 23.522087483759204, "grad_norm": 0.3351035416126251, "learning_rate": 4.764779125162408e-05, "loss": 0.3628, "step": 3476000 }, { "epoch": 23.52547098310957, "grad_norm": 0.34060367941856384, "learning_rate": 4.7647452901689045e-05, "loss": 0.364, "step": 3476500 }, { "epoch": 23.528854482459938, "grad_norm": 0.3944180905818939, "learning_rate": 4.764711455175401e-05, "loss": 0.363, "step": 3477000 }, { "epoch": 23.532237981810308, "grad_norm": 0.32537418603897095, "learning_rate": 4.764677620181897e-05, "loss": 0.3633, "step": 3477500 }, { "epoch": 23.535621481160675, "grad_norm": 0.3785341680049896, "learning_rate": 4.764643785188393e-05, "loss": 0.3631, "step": 3478000 }, { "epoch": 23.539004980511045, "grad_norm": 0.36599990725517273, "learning_rate": 4.76460995019489e-05, "loss": 0.3644, "step": 3478500 }, { "epoch": 23.542388479861412, "grad_norm": 0.40464311838150024, "learning_rate": 4.764576115201386e-05, "loss": 0.363, "step": 3479000 }, { "epoch": 23.54577197921178, "grad_norm": 0.34878531098365784, "learning_rate": 4.7645422802078825e-05, "loss": 0.3629, "step": 3479500 }, { "epoch": 23.54915547856215, "grad_norm": 0.36536720395088196, "learning_rate": 4.764508445214379e-05, "loss": 0.3631, "step": 3480000 }, { "epoch": 23.552538977912516, "grad_norm": 0.3720216155052185, "learning_rate": 4.764474610220875e-05, "loss": 0.3631, "step": 3480500 }, { "epoch": 23.555922477262886, "grad_norm": 0.3621642291545868, "learning_rate": 4.764440775227371e-05, "loss": 0.3637, "step": 3481000 }, { "epoch": 23.559305976613253, "grad_norm": 0.35155996680259705, "learning_rate": 4.764406940233867e-05, "loss": 0.363, "step": 3481500 }, { "epoch": 23.56268947596362, "grad_norm": 0.3805035650730133, "learning_rate": 4.764373105240364e-05, "loss": 0.3629, "step": 3482000 }, { "epoch": 23.56607297531399, "grad_norm": 0.4354890286922455, "learning_rate": 4.7643392702468604e-05, "loss": 0.364, "step": 3482500 }, { "epoch": 23.569456474664356, "grad_norm": 0.35975298285484314, "learning_rate": 4.7643054352533566e-05, "loss": 0.3621, "step": 3483000 }, { "epoch": 23.572839974014727, "grad_norm": 0.3824651539325714, "learning_rate": 4.764271600259853e-05, "loss": 0.3645, "step": 3483500 }, { "epoch": 23.576223473365093, "grad_norm": 0.4012623429298401, "learning_rate": 4.76423776526635e-05, "loss": 0.3641, "step": 3484000 }, { "epoch": 23.57960697271546, "grad_norm": 0.36130985617637634, "learning_rate": 4.764203930272846e-05, "loss": 0.3632, "step": 3484500 }, { "epoch": 23.58299047206583, "grad_norm": 0.31537777185440063, "learning_rate": 4.7641700952793415e-05, "loss": 0.3628, "step": 3485000 }, { "epoch": 23.586373971416197, "grad_norm": 0.3515828251838684, "learning_rate": 4.764136260285838e-05, "loss": 0.3649, "step": 3485500 }, { "epoch": 23.589757470766564, "grad_norm": 0.39275023341178894, "learning_rate": 4.7641024252923346e-05, "loss": 0.3639, "step": 3486000 }, { "epoch": 23.593140970116934, "grad_norm": 0.39433401823043823, "learning_rate": 4.764068590298831e-05, "loss": 0.3638, "step": 3486500 }, { "epoch": 23.5965244694673, "grad_norm": 0.3990587294101715, "learning_rate": 4.764034755305327e-05, "loss": 0.3638, "step": 3487000 }, { "epoch": 23.59990796881767, "grad_norm": 0.3309509754180908, "learning_rate": 4.764000920311823e-05, "loss": 0.363, "step": 3487500 }, { "epoch": 23.603291468168038, "grad_norm": 0.3852672576904297, "learning_rate": 4.76396708531832e-05, "loss": 0.3641, "step": 3488000 }, { "epoch": 23.606674967518405, "grad_norm": 0.39036333560943604, "learning_rate": 4.763933250324816e-05, "loss": 0.3637, "step": 3488500 }, { "epoch": 23.610058466868775, "grad_norm": 0.38685497641563416, "learning_rate": 4.7638994153313125e-05, "loss": 0.3635, "step": 3489000 }, { "epoch": 23.613441966219142, "grad_norm": 0.39183852076530457, "learning_rate": 4.763865580337809e-05, "loss": 0.364, "step": 3489500 }, { "epoch": 23.616825465569512, "grad_norm": 0.37505850195884705, "learning_rate": 4.763831745344305e-05, "loss": 0.3618, "step": 3490000 }, { "epoch": 23.62020896491988, "grad_norm": 0.39759066700935364, "learning_rate": 4.763797910350801e-05, "loss": 0.3636, "step": 3490500 }, { "epoch": 23.623592464270246, "grad_norm": 0.38261231780052185, "learning_rate": 4.7637640753572974e-05, "loss": 0.3632, "step": 3491000 }, { "epoch": 23.626975963620616, "grad_norm": 0.3759249448776245, "learning_rate": 4.763730240363794e-05, "loss": 0.3641, "step": 3491500 }, { "epoch": 23.630359462970983, "grad_norm": 0.34230777621269226, "learning_rate": 4.7636964053702905e-05, "loss": 0.364, "step": 3492000 }, { "epoch": 23.63374296232135, "grad_norm": 0.35262158513069153, "learning_rate": 4.763662570376787e-05, "loss": 0.3653, "step": 3492500 }, { "epoch": 23.63712646167172, "grad_norm": 0.3902489244937897, "learning_rate": 4.763628735383283e-05, "loss": 0.3651, "step": 3493000 }, { "epoch": 23.640509961022087, "grad_norm": 0.4088767468929291, "learning_rate": 4.76359490038978e-05, "loss": 0.3634, "step": 3493500 }, { "epoch": 23.643893460372457, "grad_norm": 0.3867037296295166, "learning_rate": 4.763561065396276e-05, "loss": 0.3646, "step": 3494000 }, { "epoch": 23.647276959722824, "grad_norm": 0.39906758069992065, "learning_rate": 4.763527230402772e-05, "loss": 0.3651, "step": 3494500 }, { "epoch": 23.65066045907319, "grad_norm": 0.3372052311897278, "learning_rate": 4.763493395409268e-05, "loss": 0.3641, "step": 3495000 }, { "epoch": 23.65404395842356, "grad_norm": 0.3461311459541321, "learning_rate": 4.7634595604157647e-05, "loss": 0.3648, "step": 3495500 }, { "epoch": 23.657427457773927, "grad_norm": 0.38681378960609436, "learning_rate": 4.763425725422261e-05, "loss": 0.3633, "step": 3496000 }, { "epoch": 23.660810957124298, "grad_norm": 0.3958388566970825, "learning_rate": 4.763391890428757e-05, "loss": 0.3635, "step": 3496500 }, { "epoch": 23.664194456474664, "grad_norm": 0.3645811676979065, "learning_rate": 4.763358055435253e-05, "loss": 0.3639, "step": 3497000 }, { "epoch": 23.66757795582503, "grad_norm": 0.4053094685077667, "learning_rate": 4.76332422044175e-05, "loss": 0.3641, "step": 3497500 }, { "epoch": 23.6709614551754, "grad_norm": 0.3625944256782532, "learning_rate": 4.7632903854482464e-05, "loss": 0.3642, "step": 3498000 }, { "epoch": 23.67434495452577, "grad_norm": 0.38809558749198914, "learning_rate": 4.7632565504547426e-05, "loss": 0.364, "step": 3498500 }, { "epoch": 23.67772845387614, "grad_norm": 0.35745155811309814, "learning_rate": 4.763222715461239e-05, "loss": 0.3631, "step": 3499000 }, { "epoch": 23.681111953226505, "grad_norm": 0.3489069938659668, "learning_rate": 4.763188880467735e-05, "loss": 0.3652, "step": 3499500 }, { "epoch": 23.684495452576872, "grad_norm": 0.3746088445186615, "learning_rate": 4.763155045474231e-05, "loss": 0.3644, "step": 3500000 }, { "epoch": 23.687878951927242, "grad_norm": 0.3664596378803253, "learning_rate": 4.7631212104807275e-05, "loss": 0.364, "step": 3500500 }, { "epoch": 23.69126245127761, "grad_norm": 0.3510581851005554, "learning_rate": 4.7630873754872243e-05, "loss": 0.3638, "step": 3501000 }, { "epoch": 23.694645950627976, "grad_norm": 0.3787493109703064, "learning_rate": 4.7630535404937206e-05, "loss": 0.3664, "step": 3501500 }, { "epoch": 23.698029449978346, "grad_norm": 0.34525835514068604, "learning_rate": 4.763019705500217e-05, "loss": 0.3636, "step": 3502000 }, { "epoch": 23.701412949328713, "grad_norm": 0.39369526505470276, "learning_rate": 4.762985870506713e-05, "loss": 0.3631, "step": 3502500 }, { "epoch": 23.704796448679083, "grad_norm": 0.3621573746204376, "learning_rate": 4.76295203551321e-05, "loss": 0.3648, "step": 3503000 }, { "epoch": 23.70817994802945, "grad_norm": 0.34888139367103577, "learning_rate": 4.762918200519706e-05, "loss": 0.3622, "step": 3503500 }, { "epoch": 23.711563447379817, "grad_norm": 0.36744803190231323, "learning_rate": 4.762884365526202e-05, "loss": 0.363, "step": 3504000 }, { "epoch": 23.714946946730187, "grad_norm": 0.3537336587905884, "learning_rate": 4.762850530532698e-05, "loss": 0.3631, "step": 3504500 }, { "epoch": 23.718330446080554, "grad_norm": 0.35082361102104187, "learning_rate": 4.762816695539195e-05, "loss": 0.3629, "step": 3505000 }, { "epoch": 23.721713945430924, "grad_norm": 0.3593525290489197, "learning_rate": 4.762782860545691e-05, "loss": 0.3627, "step": 3505500 }, { "epoch": 23.72509744478129, "grad_norm": 0.3956252634525299, "learning_rate": 4.762749025552187e-05, "loss": 0.3635, "step": 3506000 }, { "epoch": 23.728480944131658, "grad_norm": 0.3581351637840271, "learning_rate": 4.7627151905586834e-05, "loss": 0.3631, "step": 3506500 }, { "epoch": 23.731864443482028, "grad_norm": 0.3484318256378174, "learning_rate": 4.76268135556518e-05, "loss": 0.3638, "step": 3507000 }, { "epoch": 23.735247942832395, "grad_norm": 0.40121597051620483, "learning_rate": 4.7626475205716765e-05, "loss": 0.3635, "step": 3507500 }, { "epoch": 23.738631442182765, "grad_norm": 0.3527781069278717, "learning_rate": 4.762613685578173e-05, "loss": 0.3634, "step": 3508000 }, { "epoch": 23.74201494153313, "grad_norm": 0.3523523807525635, "learning_rate": 4.762579850584669e-05, "loss": 0.3621, "step": 3508500 }, { "epoch": 23.7453984408835, "grad_norm": 0.37593403458595276, "learning_rate": 4.762546015591165e-05, "loss": 0.3638, "step": 3509000 }, { "epoch": 23.74878194023387, "grad_norm": 0.3803982436656952, "learning_rate": 4.762512180597661e-05, "loss": 0.3639, "step": 3509500 }, { "epoch": 23.752165439584235, "grad_norm": 0.3712470531463623, "learning_rate": 4.7624783456041575e-05, "loss": 0.3644, "step": 3510000 }, { "epoch": 23.755548938934602, "grad_norm": 0.37614697217941284, "learning_rate": 4.7624445106106544e-05, "loss": 0.3648, "step": 3510500 }, { "epoch": 23.758932438284972, "grad_norm": 0.3237117826938629, "learning_rate": 4.7624106756171506e-05, "loss": 0.3642, "step": 3511000 }, { "epoch": 23.76231593763534, "grad_norm": 0.415750652551651, "learning_rate": 4.762376840623647e-05, "loss": 0.3627, "step": 3511500 }, { "epoch": 23.76569943698571, "grad_norm": 0.3945716917514801, "learning_rate": 4.762343005630143e-05, "loss": 0.3625, "step": 3512000 }, { "epoch": 23.769082936336076, "grad_norm": 0.3449915051460266, "learning_rate": 4.76230917063664e-05, "loss": 0.3644, "step": 3512500 }, { "epoch": 23.772466435686443, "grad_norm": 0.38573160767555237, "learning_rate": 4.762275335643136e-05, "loss": 0.3633, "step": 3513000 }, { "epoch": 23.775849935036813, "grad_norm": 0.37552815675735474, "learning_rate": 4.7622415006496324e-05, "loss": 0.3635, "step": 3513500 }, { "epoch": 23.77923343438718, "grad_norm": 0.363090455532074, "learning_rate": 4.762207665656128e-05, "loss": 0.3643, "step": 3514000 }, { "epoch": 23.78261693373755, "grad_norm": 0.3741694390773773, "learning_rate": 4.762173830662625e-05, "loss": 0.3649, "step": 3514500 }, { "epoch": 23.786000433087917, "grad_norm": 0.35938596725463867, "learning_rate": 4.762139995669121e-05, "loss": 0.3634, "step": 3515000 }, { "epoch": 23.789383932438284, "grad_norm": 0.40430352091789246, "learning_rate": 4.762106160675617e-05, "loss": 0.3623, "step": 3515500 }, { "epoch": 23.792767431788654, "grad_norm": 0.3747963309288025, "learning_rate": 4.7620723256821134e-05, "loss": 0.3641, "step": 3516000 }, { "epoch": 23.79615093113902, "grad_norm": 0.35648542642593384, "learning_rate": 4.76203849068861e-05, "loss": 0.3624, "step": 3516500 }, { "epoch": 23.799534430489388, "grad_norm": 0.3804064691066742, "learning_rate": 4.7620046556951065e-05, "loss": 0.3635, "step": 3517000 }, { "epoch": 23.802917929839758, "grad_norm": 0.38011279702186584, "learning_rate": 4.761970820701603e-05, "loss": 0.3644, "step": 3517500 }, { "epoch": 23.806301429190125, "grad_norm": 0.3717729449272156, "learning_rate": 4.761936985708099e-05, "loss": 0.3642, "step": 3518000 }, { "epoch": 23.809684928540495, "grad_norm": 0.3428543508052826, "learning_rate": 4.761903150714595e-05, "loss": 0.3631, "step": 3518500 }, { "epoch": 23.81306842789086, "grad_norm": 0.362511545419693, "learning_rate": 4.7618693157210914e-05, "loss": 0.363, "step": 3519000 }, { "epoch": 23.81645192724123, "grad_norm": 0.4208146929740906, "learning_rate": 4.7618354807275876e-05, "loss": 0.3628, "step": 3519500 }, { "epoch": 23.8198354265916, "grad_norm": 0.37463951110839844, "learning_rate": 4.7618016457340845e-05, "loss": 0.3632, "step": 3520000 }, { "epoch": 23.823218925941966, "grad_norm": 0.3562808334827423, "learning_rate": 4.761767810740581e-05, "loss": 0.3642, "step": 3520500 }, { "epoch": 23.826602425292336, "grad_norm": 0.36782774329185486, "learning_rate": 4.761733975747077e-05, "loss": 0.3649, "step": 3521000 }, { "epoch": 23.829985924642703, "grad_norm": 0.3616153597831726, "learning_rate": 4.761700140753573e-05, "loss": 0.3632, "step": 3521500 }, { "epoch": 23.83336942399307, "grad_norm": 0.36399659514427185, "learning_rate": 4.76166630576007e-05, "loss": 0.3626, "step": 3522000 }, { "epoch": 23.83675292334344, "grad_norm": 0.3577078878879547, "learning_rate": 4.761632470766566e-05, "loss": 0.3651, "step": 3522500 }, { "epoch": 23.840136422693806, "grad_norm": 0.3801499307155609, "learning_rate": 4.7615986357730624e-05, "loss": 0.3629, "step": 3523000 }, { "epoch": 23.843519922044173, "grad_norm": 0.34691569209098816, "learning_rate": 4.761564800779558e-05, "loss": 0.3637, "step": 3523500 }, { "epoch": 23.846903421394543, "grad_norm": 0.38196486234664917, "learning_rate": 4.761530965786055e-05, "loss": 0.3651, "step": 3524000 }, { "epoch": 23.85028692074491, "grad_norm": 0.3814425468444824, "learning_rate": 4.761497130792551e-05, "loss": 0.3634, "step": 3524500 }, { "epoch": 23.85367042009528, "grad_norm": 0.34999626874923706, "learning_rate": 4.761463295799047e-05, "loss": 0.3627, "step": 3525000 }, { "epoch": 23.857053919445647, "grad_norm": 0.3775683045387268, "learning_rate": 4.7614294608055435e-05, "loss": 0.3645, "step": 3525500 }, { "epoch": 23.860437418796014, "grad_norm": 0.38793089985847473, "learning_rate": 4.7613956258120404e-05, "loss": 0.3643, "step": 3526000 }, { "epoch": 23.863820918146384, "grad_norm": 0.33791637420654297, "learning_rate": 4.7613617908185366e-05, "loss": 0.3624, "step": 3526500 }, { "epoch": 23.86720441749675, "grad_norm": 0.3349616527557373, "learning_rate": 4.761327955825033e-05, "loss": 0.3634, "step": 3527000 }, { "epoch": 23.87058791684712, "grad_norm": 0.3802795112133026, "learning_rate": 4.761294120831529e-05, "loss": 0.3637, "step": 3527500 }, { "epoch": 23.873971416197488, "grad_norm": 0.3808208405971527, "learning_rate": 4.761260285838025e-05, "loss": 0.3633, "step": 3528000 }, { "epoch": 23.877354915547855, "grad_norm": 0.3281201720237732, "learning_rate": 4.7612264508445214e-05, "loss": 0.3629, "step": 3528500 }, { "epoch": 23.880738414898225, "grad_norm": 0.3574613928794861, "learning_rate": 4.7611926158510176e-05, "loss": 0.3634, "step": 3529000 }, { "epoch": 23.884121914248592, "grad_norm": 0.32629862427711487, "learning_rate": 4.7611587808575145e-05, "loss": 0.3632, "step": 3529500 }, { "epoch": 23.887505413598962, "grad_norm": 0.39311572909355164, "learning_rate": 4.761124945864011e-05, "loss": 0.3634, "step": 3530000 }, { "epoch": 23.89088891294933, "grad_norm": 0.3443436622619629, "learning_rate": 4.761091110870507e-05, "loss": 0.3637, "step": 3530500 }, { "epoch": 23.894272412299696, "grad_norm": 0.3927950859069824, "learning_rate": 4.761057275877003e-05, "loss": 0.3633, "step": 3531000 }, { "epoch": 23.897655911650066, "grad_norm": 0.3710768520832062, "learning_rate": 4.7610234408834994e-05, "loss": 0.3632, "step": 3531500 }, { "epoch": 23.901039411000433, "grad_norm": 0.3737480342388153, "learning_rate": 4.760989605889996e-05, "loss": 0.3652, "step": 3532000 }, { "epoch": 23.904422910350803, "grad_norm": 0.35131508111953735, "learning_rate": 4.7609557708964925e-05, "loss": 0.3622, "step": 3532500 }, { "epoch": 23.90780640970117, "grad_norm": 0.39177268743515015, "learning_rate": 4.760921935902988e-05, "loss": 0.3645, "step": 3533000 }, { "epoch": 23.911189909051537, "grad_norm": 0.36162006855010986, "learning_rate": 4.760888100909485e-05, "loss": 0.3632, "step": 3533500 }, { "epoch": 23.914573408401907, "grad_norm": 0.36541253328323364, "learning_rate": 4.760854265915981e-05, "loss": 0.3632, "step": 3534000 }, { "epoch": 23.917956907752274, "grad_norm": 0.3805837035179138, "learning_rate": 4.760820430922477e-05, "loss": 0.3646, "step": 3534500 }, { "epoch": 23.92134040710264, "grad_norm": 0.3992142677307129, "learning_rate": 4.7607865959289735e-05, "loss": 0.3631, "step": 3535000 }, { "epoch": 23.92472390645301, "grad_norm": 0.37672194838523865, "learning_rate": 4.7607527609354704e-05, "loss": 0.3636, "step": 3535500 }, { "epoch": 23.928107405803377, "grad_norm": 0.3364548683166504, "learning_rate": 4.7607189259419666e-05, "loss": 0.3641, "step": 3536000 }, { "epoch": 23.931490905153748, "grad_norm": 0.3823467493057251, "learning_rate": 4.760685090948463e-05, "loss": 0.3652, "step": 3536500 }, { "epoch": 23.934874404504114, "grad_norm": 0.41631531715393066, "learning_rate": 4.760651255954959e-05, "loss": 0.3626, "step": 3537000 }, { "epoch": 23.93825790385448, "grad_norm": 0.41096195578575134, "learning_rate": 4.760617420961455e-05, "loss": 0.3637, "step": 3537500 }, { "epoch": 23.94164140320485, "grad_norm": 0.39381927251815796, "learning_rate": 4.7605835859679515e-05, "loss": 0.3639, "step": 3538000 }, { "epoch": 23.945024902555218, "grad_norm": 0.4025476574897766, "learning_rate": 4.760549750974448e-05, "loss": 0.3635, "step": 3538500 }, { "epoch": 23.94840840190559, "grad_norm": 0.33142849802970886, "learning_rate": 4.7605159159809446e-05, "loss": 0.3634, "step": 3539000 }, { "epoch": 23.951791901255955, "grad_norm": 0.35124173760414124, "learning_rate": 4.760482080987441e-05, "loss": 0.3637, "step": 3539500 }, { "epoch": 23.955175400606322, "grad_norm": 0.4197080433368683, "learning_rate": 4.760448245993937e-05, "loss": 0.3638, "step": 3540000 }, { "epoch": 23.958558899956692, "grad_norm": 0.36347100138664246, "learning_rate": 4.760414411000433e-05, "loss": 0.3625, "step": 3540500 }, { "epoch": 23.96194239930706, "grad_norm": 0.36141666769981384, "learning_rate": 4.7603805760069294e-05, "loss": 0.3633, "step": 3541000 }, { "epoch": 23.965325898657426, "grad_norm": 0.35059964656829834, "learning_rate": 4.760346741013426e-05, "loss": 0.3627, "step": 3541500 }, { "epoch": 23.968709398007796, "grad_norm": 0.38797518610954285, "learning_rate": 4.7603129060199225e-05, "loss": 0.3631, "step": 3542000 }, { "epoch": 23.972092897358163, "grad_norm": 0.3420599699020386, "learning_rate": 4.760279071026418e-05, "loss": 0.3648, "step": 3542500 }, { "epoch": 23.975476396708533, "grad_norm": 0.3919691741466522, "learning_rate": 4.760245236032915e-05, "loss": 0.3632, "step": 3543000 }, { "epoch": 23.9788598960589, "grad_norm": 0.44462597370147705, "learning_rate": 4.760211401039411e-05, "loss": 0.3633, "step": 3543500 }, { "epoch": 23.982243395409267, "grad_norm": 0.37594836950302124, "learning_rate": 4.7601775660459074e-05, "loss": 0.3642, "step": 3544000 }, { "epoch": 23.985626894759637, "grad_norm": 0.3407946825027466, "learning_rate": 4.7601437310524036e-05, "loss": 0.3648, "step": 3544500 }, { "epoch": 23.989010394110004, "grad_norm": 0.402271568775177, "learning_rate": 4.7601098960589005e-05, "loss": 0.3626, "step": 3545000 }, { "epoch": 23.992393893460374, "grad_norm": 0.3821162283420563, "learning_rate": 4.760076061065397e-05, "loss": 0.3638, "step": 3545500 }, { "epoch": 23.99577739281074, "grad_norm": 0.36141517758369446, "learning_rate": 4.760042226071893e-05, "loss": 0.364, "step": 3546000 }, { "epoch": 23.999160892161107, "grad_norm": 0.4005153477191925, "learning_rate": 4.760008391078389e-05, "loss": 0.3628, "step": 3546500 }, { "epoch": 24.0, "eval_accuracy": 0.8615310324361763, "eval_loss": 0.5625105500221252, "eval_runtime": 3398.82, "eval_samples_per_second": 85.543, "eval_steps_per_second": 5.347, "step": 3546624 }, { "epoch": 24.002544391511478, "grad_norm": 0.3851083815097809, "learning_rate": 4.7599745560848853e-05, "loss": 0.3627, "step": 3547000 }, { "epoch": 24.005927890861845, "grad_norm": 0.3546355366706848, "learning_rate": 4.7599407210913816e-05, "loss": 0.3611, "step": 3547500 }, { "epoch": 24.009311390212215, "grad_norm": 0.3528681695461273, "learning_rate": 4.759906886097878e-05, "loss": 0.3626, "step": 3548000 }, { "epoch": 24.01269488956258, "grad_norm": 0.3592744767665863, "learning_rate": 4.759873051104374e-05, "loss": 0.3612, "step": 3548500 }, { "epoch": 24.01607838891295, "grad_norm": 0.3767262399196625, "learning_rate": 4.759839216110871e-05, "loss": 0.3613, "step": 3549000 }, { "epoch": 24.01946188826332, "grad_norm": 0.31025248765945435, "learning_rate": 4.759805381117367e-05, "loss": 0.3603, "step": 3549500 }, { "epoch": 24.022845387613685, "grad_norm": 0.3891949951648712, "learning_rate": 4.759771546123863e-05, "loss": 0.3615, "step": 3550000 }, { "epoch": 24.026228886964052, "grad_norm": 0.3731558620929718, "learning_rate": 4.7597377111303595e-05, "loss": 0.361, "step": 3550500 }, { "epoch": 24.029612386314422, "grad_norm": 0.3438579738140106, "learning_rate": 4.7597038761368564e-05, "loss": 0.3603, "step": 3551000 }, { "epoch": 24.03299588566479, "grad_norm": 0.34653955698013306, "learning_rate": 4.7596700411433526e-05, "loss": 0.3618, "step": 3551500 }, { "epoch": 24.03637938501516, "grad_norm": 0.35605496168136597, "learning_rate": 4.759636206149848e-05, "loss": 0.3622, "step": 3552000 }, { "epoch": 24.039762884365526, "grad_norm": 0.3901239335536957, "learning_rate": 4.759602371156345e-05, "loss": 0.3623, "step": 3552500 }, { "epoch": 24.043146383715893, "grad_norm": 0.3623374402523041, "learning_rate": 4.759568536162841e-05, "loss": 0.3605, "step": 3553000 }, { "epoch": 24.046529883066263, "grad_norm": 0.38516634702682495, "learning_rate": 4.7595347011693375e-05, "loss": 0.3612, "step": 3553500 }, { "epoch": 24.04991338241663, "grad_norm": 0.39640727639198303, "learning_rate": 4.759500866175834e-05, "loss": 0.3628, "step": 3554000 }, { "epoch": 24.053296881767, "grad_norm": 0.35433048009872437, "learning_rate": 4.7594670311823306e-05, "loss": 0.3606, "step": 3554500 }, { "epoch": 24.056680381117367, "grad_norm": 0.38493824005126953, "learning_rate": 4.759433196188827e-05, "loss": 0.3618, "step": 3555000 }, { "epoch": 24.060063880467734, "grad_norm": 0.36116093397140503, "learning_rate": 4.759399361195323e-05, "loss": 0.3622, "step": 3555500 }, { "epoch": 24.063447379818104, "grad_norm": 0.34238407015800476, "learning_rate": 4.759365526201819e-05, "loss": 0.3605, "step": 3556000 }, { "epoch": 24.06683087916847, "grad_norm": 0.35663530230522156, "learning_rate": 4.759331691208316e-05, "loss": 0.3628, "step": 3556500 }, { "epoch": 24.070214378518838, "grad_norm": 0.3875925540924072, "learning_rate": 4.7592978562148116e-05, "loss": 0.3624, "step": 3557000 }, { "epoch": 24.073597877869208, "grad_norm": 0.4210719168186188, "learning_rate": 4.759264021221308e-05, "loss": 0.3614, "step": 3557500 }, { "epoch": 24.076981377219575, "grad_norm": 0.3640529215335846, "learning_rate": 4.759230186227804e-05, "loss": 0.3617, "step": 3558000 }, { "epoch": 24.080364876569945, "grad_norm": 0.39767134189605713, "learning_rate": 4.759196351234301e-05, "loss": 0.3632, "step": 3558500 }, { "epoch": 24.08374837592031, "grad_norm": 0.38562124967575073, "learning_rate": 4.759162516240797e-05, "loss": 0.3627, "step": 3559000 }, { "epoch": 24.08713187527068, "grad_norm": 0.34722045063972473, "learning_rate": 4.7591286812472934e-05, "loss": 0.3631, "step": 3559500 }, { "epoch": 24.09051537462105, "grad_norm": 0.35584744811058044, "learning_rate": 4.7590948462537896e-05, "loss": 0.3618, "step": 3560000 }, { "epoch": 24.093898873971415, "grad_norm": 0.36499258875846863, "learning_rate": 4.7590610112602865e-05, "loss": 0.3636, "step": 3560500 }, { "epoch": 24.097282373321786, "grad_norm": 0.3745898902416229, "learning_rate": 4.759027176266783e-05, "loss": 0.3625, "step": 3561000 }, { "epoch": 24.100665872672153, "grad_norm": 0.3334830403327942, "learning_rate": 4.758993341273278e-05, "loss": 0.3618, "step": 3561500 }, { "epoch": 24.10404937202252, "grad_norm": 0.3804187774658203, "learning_rate": 4.758959506279775e-05, "loss": 0.3629, "step": 3562000 }, { "epoch": 24.10743287137289, "grad_norm": 0.37679699063301086, "learning_rate": 4.758925671286271e-05, "loss": 0.3634, "step": 3562500 }, { "epoch": 24.110816370723256, "grad_norm": 0.38330692052841187, "learning_rate": 4.7588918362927675e-05, "loss": 0.3633, "step": 3563000 }, { "epoch": 24.114199870073627, "grad_norm": 0.387416273355484, "learning_rate": 4.758858001299264e-05, "loss": 0.3635, "step": 3563500 }, { "epoch": 24.117583369423993, "grad_norm": 0.3465549349784851, "learning_rate": 4.7588241663057606e-05, "loss": 0.3612, "step": 3564000 }, { "epoch": 24.12096686877436, "grad_norm": 0.34641847014427185, "learning_rate": 4.758790331312257e-05, "loss": 0.3609, "step": 3564500 }, { "epoch": 24.12435036812473, "grad_norm": 0.36127665638923645, "learning_rate": 4.758756496318753e-05, "loss": 0.3612, "step": 3565000 }, { "epoch": 24.127733867475097, "grad_norm": 0.3640176057815552, "learning_rate": 4.758722661325249e-05, "loss": 0.3628, "step": 3565500 }, { "epoch": 24.131117366825464, "grad_norm": 0.35370656847953796, "learning_rate": 4.758688826331746e-05, "loss": 0.3629, "step": 3566000 }, { "epoch": 24.134500866175834, "grad_norm": 0.3724617063999176, "learning_rate": 4.758654991338242e-05, "loss": 0.3617, "step": 3566500 }, { "epoch": 24.1378843655262, "grad_norm": 0.37611472606658936, "learning_rate": 4.758621156344738e-05, "loss": 0.3627, "step": 3567000 }, { "epoch": 24.14126786487657, "grad_norm": 0.40569251775741577, "learning_rate": 4.758587321351234e-05, "loss": 0.3633, "step": 3567500 }, { "epoch": 24.144651364226938, "grad_norm": 0.3774605393409729, "learning_rate": 4.758553486357731e-05, "loss": 0.3617, "step": 3568000 }, { "epoch": 24.148034863577305, "grad_norm": 0.37588009238243103, "learning_rate": 4.758519651364227e-05, "loss": 0.363, "step": 3568500 }, { "epoch": 24.151418362927675, "grad_norm": 0.34325888752937317, "learning_rate": 4.7584858163707234e-05, "loss": 0.3617, "step": 3569000 }, { "epoch": 24.154801862278042, "grad_norm": 0.3552990257740021, "learning_rate": 4.7584519813772196e-05, "loss": 0.3618, "step": 3569500 }, { "epoch": 24.158185361628412, "grad_norm": 0.3599201440811157, "learning_rate": 4.7584181463837165e-05, "loss": 0.363, "step": 3570000 }, { "epoch": 24.16156886097878, "grad_norm": 0.3919810652732849, "learning_rate": 4.758384311390213e-05, "loss": 0.3638, "step": 3570500 }, { "epoch": 24.164952360329146, "grad_norm": 0.3540429174900055, "learning_rate": 4.758350476396708e-05, "loss": 0.3619, "step": 3571000 }, { "epoch": 24.168335859679516, "grad_norm": 0.3905651569366455, "learning_rate": 4.758316641403205e-05, "loss": 0.3621, "step": 3571500 }, { "epoch": 24.171719359029883, "grad_norm": 0.3780975043773651, "learning_rate": 4.7582828064097014e-05, "loss": 0.3621, "step": 3572000 }, { "epoch": 24.17510285838025, "grad_norm": 0.37841475009918213, "learning_rate": 4.7582489714161976e-05, "loss": 0.3609, "step": 3572500 }, { "epoch": 24.17848635773062, "grad_norm": 0.3821147382259369, "learning_rate": 4.758215136422694e-05, "loss": 0.3631, "step": 3573000 }, { "epoch": 24.181869857080986, "grad_norm": 0.39197006821632385, "learning_rate": 4.758181301429191e-05, "loss": 0.3635, "step": 3573500 }, { "epoch": 24.185253356431357, "grad_norm": 0.37433409690856934, "learning_rate": 4.758147466435687e-05, "loss": 0.3625, "step": 3574000 }, { "epoch": 24.188636855781724, "grad_norm": 0.36877191066741943, "learning_rate": 4.758113631442183e-05, "loss": 0.3629, "step": 3574500 }, { "epoch": 24.19202035513209, "grad_norm": 0.38596317172050476, "learning_rate": 4.758079796448679e-05, "loss": 0.3617, "step": 3575000 }, { "epoch": 24.19540385448246, "grad_norm": 0.37200498580932617, "learning_rate": 4.758045961455176e-05, "loss": 0.3621, "step": 3575500 }, { "epoch": 24.198787353832827, "grad_norm": 0.3859347999095917, "learning_rate": 4.758012126461672e-05, "loss": 0.3618, "step": 3576000 }, { "epoch": 24.202170853183198, "grad_norm": 0.3903367817401886, "learning_rate": 4.757978291468168e-05, "loss": 0.3635, "step": 3576500 }, { "epoch": 24.205554352533564, "grad_norm": 0.350730299949646, "learning_rate": 4.757944456474664e-05, "loss": 0.3621, "step": 3577000 }, { "epoch": 24.20893785188393, "grad_norm": 0.360914409160614, "learning_rate": 4.757910621481161e-05, "loss": 0.3629, "step": 3577500 }, { "epoch": 24.2123213512343, "grad_norm": 0.3702397644519806, "learning_rate": 4.757876786487657e-05, "loss": 0.363, "step": 3578000 }, { "epoch": 24.215704850584668, "grad_norm": 0.35700830817222595, "learning_rate": 4.7578429514941535e-05, "loss": 0.3641, "step": 3578500 }, { "epoch": 24.21908834993504, "grad_norm": 0.39207759499549866, "learning_rate": 4.75780911650065e-05, "loss": 0.3635, "step": 3579000 }, { "epoch": 24.222471849285405, "grad_norm": 0.31646469235420227, "learning_rate": 4.7577752815071466e-05, "loss": 0.3627, "step": 3579500 }, { "epoch": 24.225855348635772, "grad_norm": 0.39063090085983276, "learning_rate": 4.757741446513643e-05, "loss": 0.3628, "step": 3580000 }, { "epoch": 24.229238847986142, "grad_norm": 0.3953896760940552, "learning_rate": 4.757707611520138e-05, "loss": 0.3647, "step": 3580500 }, { "epoch": 24.23262234733651, "grad_norm": 0.39057600498199463, "learning_rate": 4.757673776526635e-05, "loss": 0.3627, "step": 3581000 }, { "epoch": 24.236005846686876, "grad_norm": 0.3909989595413208, "learning_rate": 4.7576399415331314e-05, "loss": 0.3639, "step": 3581500 }, { "epoch": 24.239389346037246, "grad_norm": 0.347472220659256, "learning_rate": 4.7576061065396277e-05, "loss": 0.363, "step": 3582000 }, { "epoch": 24.242772845387613, "grad_norm": 0.34311747550964355, "learning_rate": 4.757572271546124e-05, "loss": 0.3619, "step": 3582500 }, { "epoch": 24.246156344737983, "grad_norm": 0.4090639352798462, "learning_rate": 4.757538436552621e-05, "loss": 0.3627, "step": 3583000 }, { "epoch": 24.24953984408835, "grad_norm": 0.37628045678138733, "learning_rate": 4.757504601559117e-05, "loss": 0.3618, "step": 3583500 }, { "epoch": 24.252923343438717, "grad_norm": 0.4039421081542969, "learning_rate": 4.757470766565613e-05, "loss": 0.3619, "step": 3584000 }, { "epoch": 24.256306842789087, "grad_norm": 0.4032013416290283, "learning_rate": 4.7574369315721094e-05, "loss": 0.3629, "step": 3584500 }, { "epoch": 24.259690342139454, "grad_norm": 0.36380964517593384, "learning_rate": 4.757403096578606e-05, "loss": 0.3622, "step": 3585000 }, { "epoch": 24.263073841489824, "grad_norm": 0.34602972865104675, "learning_rate": 4.757369261585102e-05, "loss": 0.3621, "step": 3585500 }, { "epoch": 24.26645734084019, "grad_norm": 0.38954514265060425, "learning_rate": 4.757335426591598e-05, "loss": 0.3634, "step": 3586000 }, { "epoch": 24.269840840190557, "grad_norm": 0.39841580390930176, "learning_rate": 4.757301591598094e-05, "loss": 0.3636, "step": 3586500 }, { "epoch": 24.273224339540928, "grad_norm": 0.3810007572174072, "learning_rate": 4.757267756604591e-05, "loss": 0.3632, "step": 3587000 }, { "epoch": 24.276607838891294, "grad_norm": 0.3691217601299286, "learning_rate": 4.7572339216110873e-05, "loss": 0.3622, "step": 3587500 }, { "epoch": 24.279991338241665, "grad_norm": 0.37574613094329834, "learning_rate": 4.7572000866175836e-05, "loss": 0.3652, "step": 3588000 }, { "epoch": 24.28337483759203, "grad_norm": 0.38883501291275024, "learning_rate": 4.75716625162408e-05, "loss": 0.3625, "step": 3588500 }, { "epoch": 24.2867583369424, "grad_norm": 0.3991695046424866, "learning_rate": 4.7571324166305767e-05, "loss": 0.3633, "step": 3589000 }, { "epoch": 24.29014183629277, "grad_norm": 0.3845086097717285, "learning_rate": 4.757098581637073e-05, "loss": 0.3636, "step": 3589500 }, { "epoch": 24.293525335643135, "grad_norm": 0.35532400012016296, "learning_rate": 4.7570647466435684e-05, "loss": 0.3618, "step": 3590000 }, { "epoch": 24.296908834993502, "grad_norm": 0.3768151104450226, "learning_rate": 4.757030911650065e-05, "loss": 0.3633, "step": 3590500 }, { "epoch": 24.300292334343872, "grad_norm": 0.3618510365486145, "learning_rate": 4.7569970766565615e-05, "loss": 0.3644, "step": 3591000 }, { "epoch": 24.30367583369424, "grad_norm": 0.3861792981624603, "learning_rate": 4.756963241663058e-05, "loss": 0.3624, "step": 3591500 }, { "epoch": 24.30705933304461, "grad_norm": 0.3386296331882477, "learning_rate": 4.756929406669554e-05, "loss": 0.3622, "step": 3592000 }, { "epoch": 24.310442832394976, "grad_norm": 0.38560476899147034, "learning_rate": 4.756895571676051e-05, "loss": 0.3625, "step": 3592500 }, { "epoch": 24.313826331745343, "grad_norm": 0.37961357831954956, "learning_rate": 4.756861736682547e-05, "loss": 0.3629, "step": 3593000 }, { "epoch": 24.317209831095713, "grad_norm": 0.3790128827095032, "learning_rate": 4.756827901689043e-05, "loss": 0.3615, "step": 3593500 }, { "epoch": 24.32059333044608, "grad_norm": 0.4152431786060333, "learning_rate": 4.7567940666955395e-05, "loss": 0.3647, "step": 3594000 }, { "epoch": 24.32397682979645, "grad_norm": 0.3837610185146332, "learning_rate": 4.756760231702036e-05, "loss": 0.3626, "step": 3594500 }, { "epoch": 24.327360329146817, "grad_norm": 0.32097288966178894, "learning_rate": 4.756726396708532e-05, "loss": 0.3634, "step": 3595000 }, { "epoch": 24.330743828497184, "grad_norm": 0.3750414252281189, "learning_rate": 4.756692561715028e-05, "loss": 0.3626, "step": 3595500 }, { "epoch": 24.334127327847554, "grad_norm": 0.41777998208999634, "learning_rate": 4.756658726721524e-05, "loss": 0.3628, "step": 3596000 }, { "epoch": 24.33751082719792, "grad_norm": 0.37119758129119873, "learning_rate": 4.756624891728021e-05, "loss": 0.3626, "step": 3596500 }, { "epoch": 24.340894326548288, "grad_norm": 0.3998216688632965, "learning_rate": 4.7565910567345174e-05, "loss": 0.3625, "step": 3597000 }, { "epoch": 24.344277825898658, "grad_norm": 0.3342391848564148, "learning_rate": 4.7565572217410136e-05, "loss": 0.3621, "step": 3597500 }, { "epoch": 24.347661325249025, "grad_norm": 0.3821890950202942, "learning_rate": 4.75652338674751e-05, "loss": 0.3615, "step": 3598000 }, { "epoch": 24.351044824599395, "grad_norm": 0.3546050190925598, "learning_rate": 4.756489551754007e-05, "loss": 0.3623, "step": 3598500 }, { "epoch": 24.35442832394976, "grad_norm": 0.38674062490463257, "learning_rate": 4.756455716760503e-05, "loss": 0.3619, "step": 3599000 }, { "epoch": 24.35781182330013, "grad_norm": 0.35297295451164246, "learning_rate": 4.7564218817669985e-05, "loss": 0.3631, "step": 3599500 }, { "epoch": 24.3611953226505, "grad_norm": 0.3537425398826599, "learning_rate": 4.7563880467734954e-05, "loss": 0.3637, "step": 3600000 }, { "epoch": 24.364578822000865, "grad_norm": 0.35156282782554626, "learning_rate": 4.7563542117799916e-05, "loss": 0.3631, "step": 3600500 }, { "epoch": 24.367962321351236, "grad_norm": 0.37335723638534546, "learning_rate": 4.756320376786488e-05, "loss": 0.3633, "step": 3601000 }, { "epoch": 24.371345820701602, "grad_norm": 0.3556972146034241, "learning_rate": 4.756286541792984e-05, "loss": 0.3626, "step": 3601500 }, { "epoch": 24.37472932005197, "grad_norm": 0.3641450107097626, "learning_rate": 4.75625270679948e-05, "loss": 0.3645, "step": 3602000 }, { "epoch": 24.37811281940234, "grad_norm": 0.3725520372390747, "learning_rate": 4.756218871805977e-05, "loss": 0.363, "step": 3602500 }, { "epoch": 24.381496318752706, "grad_norm": 0.40755024552345276, "learning_rate": 4.756185036812473e-05, "loss": 0.3632, "step": 3603000 }, { "epoch": 24.384879818103077, "grad_norm": 0.4027111828327179, "learning_rate": 4.7561512018189695e-05, "loss": 0.3647, "step": 3603500 }, { "epoch": 24.388263317453443, "grad_norm": 0.38163647055625916, "learning_rate": 4.756117366825466e-05, "loss": 0.3634, "step": 3604000 }, { "epoch": 24.39164681680381, "grad_norm": 0.36534103751182556, "learning_rate": 4.756083531831962e-05, "loss": 0.3622, "step": 3604500 }, { "epoch": 24.39503031615418, "grad_norm": 0.3960428237915039, "learning_rate": 4.756049696838458e-05, "loss": 0.3636, "step": 3605000 }, { "epoch": 24.398413815504547, "grad_norm": 0.38842570781707764, "learning_rate": 4.7560158618449544e-05, "loss": 0.364, "step": 3605500 }, { "epoch": 24.401797314854914, "grad_norm": 0.39695754647254944, "learning_rate": 4.755982026851451e-05, "loss": 0.363, "step": 3606000 }, { "epoch": 24.405180814205284, "grad_norm": 0.37467947602272034, "learning_rate": 4.7559481918579475e-05, "loss": 0.3621, "step": 3606500 }, { "epoch": 24.40856431355565, "grad_norm": 0.36468884348869324, "learning_rate": 4.755914356864444e-05, "loss": 0.3615, "step": 3607000 }, { "epoch": 24.41194781290602, "grad_norm": 0.3378750681877136, "learning_rate": 4.75588052187094e-05, "loss": 0.3626, "step": 3607500 }, { "epoch": 24.415331312256388, "grad_norm": 0.37069934606552124, "learning_rate": 4.755846686877437e-05, "loss": 0.3634, "step": 3608000 }, { "epoch": 24.418714811606755, "grad_norm": 0.32639679312705994, "learning_rate": 4.755812851883933e-05, "loss": 0.3629, "step": 3608500 }, { "epoch": 24.422098310957125, "grad_norm": 0.3813059628009796, "learning_rate": 4.755779016890429e-05, "loss": 0.3626, "step": 3609000 }, { "epoch": 24.42548181030749, "grad_norm": 0.33321505784988403, "learning_rate": 4.7557451818969254e-05, "loss": 0.363, "step": 3609500 }, { "epoch": 24.428865309657862, "grad_norm": 0.37366393208503723, "learning_rate": 4.7557113469034216e-05, "loss": 0.3641, "step": 3610000 }, { "epoch": 24.43224880900823, "grad_norm": 0.36428675055503845, "learning_rate": 4.755677511909918e-05, "loss": 0.3627, "step": 3610500 }, { "epoch": 24.435632308358596, "grad_norm": 0.3707229793071747, "learning_rate": 4.755643676916414e-05, "loss": 0.3625, "step": 3611000 }, { "epoch": 24.439015807708966, "grad_norm": 0.36802589893341064, "learning_rate": 4.75560984192291e-05, "loss": 0.3638, "step": 3611500 }, { "epoch": 24.442399307059333, "grad_norm": 0.3834385275840759, "learning_rate": 4.755576006929407e-05, "loss": 0.3628, "step": 3612000 }, { "epoch": 24.445782806409703, "grad_norm": 0.36757373809814453, "learning_rate": 4.7555421719359034e-05, "loss": 0.3641, "step": 3612500 }, { "epoch": 24.44916630576007, "grad_norm": 0.3710903823375702, "learning_rate": 4.7555083369423996e-05, "loss": 0.3621, "step": 3613000 }, { "epoch": 24.452549805110436, "grad_norm": 0.37583836913108826, "learning_rate": 4.755474501948896e-05, "loss": 0.363, "step": 3613500 }, { "epoch": 24.455933304460807, "grad_norm": 0.3556283414363861, "learning_rate": 4.755440666955392e-05, "loss": 0.363, "step": 3614000 }, { "epoch": 24.459316803811173, "grad_norm": 0.38675931096076965, "learning_rate": 4.755406831961888e-05, "loss": 0.3636, "step": 3614500 }, { "epoch": 24.46270030316154, "grad_norm": 0.35959991812705994, "learning_rate": 4.7553729969683844e-05, "loss": 0.3632, "step": 3615000 }, { "epoch": 24.46608380251191, "grad_norm": 0.36478689312934875, "learning_rate": 4.755339161974881e-05, "loss": 0.3644, "step": 3615500 }, { "epoch": 24.469467301862277, "grad_norm": 0.37179750204086304, "learning_rate": 4.7553053269813775e-05, "loss": 0.363, "step": 3616000 }, { "epoch": 24.472850801212648, "grad_norm": 0.38893115520477295, "learning_rate": 4.755271491987874e-05, "loss": 0.3631, "step": 3616500 }, { "epoch": 24.476234300563014, "grad_norm": 0.4172806143760681, "learning_rate": 4.75523765699437e-05, "loss": 0.3618, "step": 3617000 }, { "epoch": 24.47961779991338, "grad_norm": 0.38247597217559814, "learning_rate": 4.755203822000867e-05, "loss": 0.3633, "step": 3617500 }, { "epoch": 24.48300129926375, "grad_norm": 0.398297518491745, "learning_rate": 4.755169987007363e-05, "loss": 0.3633, "step": 3618000 }, { "epoch": 24.486384798614118, "grad_norm": 0.37525323033332825, "learning_rate": 4.755136152013859e-05, "loss": 0.3622, "step": 3618500 }, { "epoch": 24.48976829796449, "grad_norm": 0.35158777236938477, "learning_rate": 4.755102317020355e-05, "loss": 0.3628, "step": 3619000 }, { "epoch": 24.493151797314855, "grad_norm": 0.34074023365974426, "learning_rate": 4.755068482026852e-05, "loss": 0.363, "step": 3619500 }, { "epoch": 24.496535296665222, "grad_norm": 0.3309043347835541, "learning_rate": 4.755034647033348e-05, "loss": 0.3624, "step": 3620000 }, { "epoch": 24.499918796015592, "grad_norm": 0.3668690323829651, "learning_rate": 4.755000812039844e-05, "loss": 0.3621, "step": 3620500 }, { "epoch": 24.50330229536596, "grad_norm": 0.3494468033313751, "learning_rate": 4.75496697704634e-05, "loss": 0.364, "step": 3621000 }, { "epoch": 24.506685794716326, "grad_norm": 0.3519150912761688, "learning_rate": 4.754933142052837e-05, "loss": 0.3615, "step": 3621500 }, { "epoch": 24.510069294066696, "grad_norm": 0.4194455146789551, "learning_rate": 4.7548993070593334e-05, "loss": 0.3628, "step": 3622000 }, { "epoch": 24.513452793417063, "grad_norm": 0.3814716935157776, "learning_rate": 4.7548654720658296e-05, "loss": 0.3622, "step": 3622500 }, { "epoch": 24.516836292767433, "grad_norm": 0.392856627702713, "learning_rate": 4.754831637072326e-05, "loss": 0.3624, "step": 3623000 }, { "epoch": 24.5202197921178, "grad_norm": 0.35863417387008667, "learning_rate": 4.754797802078822e-05, "loss": 0.3623, "step": 3623500 }, { "epoch": 24.523603291468167, "grad_norm": 0.38365086913108826, "learning_rate": 4.754763967085318e-05, "loss": 0.3629, "step": 3624000 }, { "epoch": 24.526986790818537, "grad_norm": 0.3537156581878662, "learning_rate": 4.7547301320918145e-05, "loss": 0.3643, "step": 3624500 }, { "epoch": 24.530370290168904, "grad_norm": 0.35854488611221313, "learning_rate": 4.7546962970983114e-05, "loss": 0.364, "step": 3625000 }, { "epoch": 24.533753789519274, "grad_norm": 0.38134366273880005, "learning_rate": 4.7546624621048076e-05, "loss": 0.3637, "step": 3625500 }, { "epoch": 24.53713728886964, "grad_norm": 0.37400737404823303, "learning_rate": 4.754628627111304e-05, "loss": 0.3644, "step": 3626000 }, { "epoch": 24.540520788220007, "grad_norm": 0.38586708903312683, "learning_rate": 4.7545947921178e-05, "loss": 0.3631, "step": 3626500 }, { "epoch": 24.543904287570378, "grad_norm": 0.38755014538764954, "learning_rate": 4.754560957124297e-05, "loss": 0.3633, "step": 3627000 }, { "epoch": 24.547287786920744, "grad_norm": 0.3673875629901886, "learning_rate": 4.754527122130793e-05, "loss": 0.3634, "step": 3627500 }, { "epoch": 24.550671286271115, "grad_norm": 0.38896000385284424, "learning_rate": 4.754493287137289e-05, "loss": 0.3625, "step": 3628000 }, { "epoch": 24.55405478562148, "grad_norm": 0.36808526515960693, "learning_rate": 4.754459452143785e-05, "loss": 0.3627, "step": 3628500 }, { "epoch": 24.557438284971848, "grad_norm": 0.34860363602638245, "learning_rate": 4.754425617150282e-05, "loss": 0.3634, "step": 3629000 }, { "epoch": 24.56082178432222, "grad_norm": 0.3394933044910431, "learning_rate": 4.754391782156778e-05, "loss": 0.3637, "step": 3629500 }, { "epoch": 24.564205283672585, "grad_norm": 0.35248643159866333, "learning_rate": 4.754357947163274e-05, "loss": 0.363, "step": 3630000 }, { "epoch": 24.567588783022952, "grad_norm": 0.35058140754699707, "learning_rate": 4.7543241121697704e-05, "loss": 0.3631, "step": 3630500 }, { "epoch": 24.570972282373322, "grad_norm": 0.3432460427284241, "learning_rate": 4.754290277176267e-05, "loss": 0.3644, "step": 3631000 }, { "epoch": 24.57435578172369, "grad_norm": 0.4002135396003723, "learning_rate": 4.7542564421827635e-05, "loss": 0.3628, "step": 3631500 }, { "epoch": 24.57773928107406, "grad_norm": 0.3974725008010864, "learning_rate": 4.75422260718926e-05, "loss": 0.3623, "step": 3632000 }, { "epoch": 24.581122780424426, "grad_norm": 0.3433004915714264, "learning_rate": 4.754188772195756e-05, "loss": 0.3647, "step": 3632500 }, { "epoch": 24.584506279774793, "grad_norm": 0.4006336033344269, "learning_rate": 4.754154937202252e-05, "loss": 0.3622, "step": 3633000 }, { "epoch": 24.587889779125163, "grad_norm": 0.38487058877944946, "learning_rate": 4.7541211022087483e-05, "loss": 0.3634, "step": 3633500 }, { "epoch": 24.59127327847553, "grad_norm": 0.3639155328273773, "learning_rate": 4.7540872672152446e-05, "loss": 0.3643, "step": 3634000 }, { "epoch": 24.5946567778259, "grad_norm": 0.36571940779685974, "learning_rate": 4.7540534322217414e-05, "loss": 0.3641, "step": 3634500 }, { "epoch": 24.598040277176267, "grad_norm": 0.42414596676826477, "learning_rate": 4.7540195972282377e-05, "loss": 0.3646, "step": 3635000 }, { "epoch": 24.601423776526634, "grad_norm": 0.34833043813705444, "learning_rate": 4.753985762234734e-05, "loss": 0.3624, "step": 3635500 }, { "epoch": 24.604807275877004, "grad_norm": 0.32802829146385193, "learning_rate": 4.75395192724123e-05, "loss": 0.3642, "step": 3636000 }, { "epoch": 24.60819077522737, "grad_norm": 0.36172571778297424, "learning_rate": 4.753918092247727e-05, "loss": 0.3632, "step": 3636500 }, { "epoch": 24.61157427457774, "grad_norm": 0.39942455291748047, "learning_rate": 4.753884257254223e-05, "loss": 0.3637, "step": 3637000 }, { "epoch": 24.614957773928108, "grad_norm": 0.3636648952960968, "learning_rate": 4.7538504222607194e-05, "loss": 0.3632, "step": 3637500 }, { "epoch": 24.618341273278475, "grad_norm": 0.35619986057281494, "learning_rate": 4.753816587267215e-05, "loss": 0.3649, "step": 3638000 }, { "epoch": 24.621724772628845, "grad_norm": 0.36945921182632446, "learning_rate": 4.753782752273712e-05, "loss": 0.363, "step": 3638500 }, { "epoch": 24.62510827197921, "grad_norm": 0.3725719153881073, "learning_rate": 4.753748917280208e-05, "loss": 0.3627, "step": 3639000 }, { "epoch": 24.62849177132958, "grad_norm": 0.3723055422306061, "learning_rate": 4.753715082286704e-05, "loss": 0.3627, "step": 3639500 }, { "epoch": 24.63187527067995, "grad_norm": 0.33416539430618286, "learning_rate": 4.7536812472932005e-05, "loss": 0.3624, "step": 3640000 }, { "epoch": 24.635258770030315, "grad_norm": 0.37129876017570496, "learning_rate": 4.7536474122996973e-05, "loss": 0.3638, "step": 3640500 }, { "epoch": 24.638642269380686, "grad_norm": 0.3299279510974884, "learning_rate": 4.7536135773061936e-05, "loss": 0.3639, "step": 3641000 }, { "epoch": 24.642025768731052, "grad_norm": 0.40114957094192505, "learning_rate": 4.75357974231269e-05, "loss": 0.3633, "step": 3641500 }, { "epoch": 24.64540926808142, "grad_norm": 0.42098936438560486, "learning_rate": 4.753545907319186e-05, "loss": 0.3631, "step": 3642000 }, { "epoch": 24.64879276743179, "grad_norm": 0.38485991954803467, "learning_rate": 4.753512072325682e-05, "loss": 0.3624, "step": 3642500 }, { "epoch": 24.652176266782156, "grad_norm": 0.35971710085868835, "learning_rate": 4.7534782373321784e-05, "loss": 0.3618, "step": 3643000 }, { "epoch": 24.655559766132527, "grad_norm": 0.39405959844589233, "learning_rate": 4.7534444023386746e-05, "loss": 0.3629, "step": 3643500 }, { "epoch": 24.658943265482893, "grad_norm": 0.39238572120666504, "learning_rate": 4.7534105673451715e-05, "loss": 0.3632, "step": 3644000 }, { "epoch": 24.66232676483326, "grad_norm": 0.36738380789756775, "learning_rate": 4.753376732351668e-05, "loss": 0.3645, "step": 3644500 }, { "epoch": 24.66571026418363, "grad_norm": 0.3584558963775635, "learning_rate": 4.753342897358164e-05, "loss": 0.3632, "step": 3645000 }, { "epoch": 24.669093763533997, "grad_norm": 0.3708287477493286, "learning_rate": 4.75330906236466e-05, "loss": 0.3636, "step": 3645500 }, { "epoch": 24.672477262884364, "grad_norm": 0.3938468396663666, "learning_rate": 4.753275227371157e-05, "loss": 0.3624, "step": 3646000 }, { "epoch": 24.675860762234734, "grad_norm": 0.38310766220092773, "learning_rate": 4.753241392377653e-05, "loss": 0.3642, "step": 3646500 }, { "epoch": 24.6792442615851, "grad_norm": 0.3726683259010315, "learning_rate": 4.7532075573841495e-05, "loss": 0.3626, "step": 3647000 }, { "epoch": 24.68262776093547, "grad_norm": 0.35090330243110657, "learning_rate": 4.753173722390645e-05, "loss": 0.3616, "step": 3647500 }, { "epoch": 24.686011260285838, "grad_norm": 0.37750542163848877, "learning_rate": 4.753139887397142e-05, "loss": 0.3625, "step": 3648000 }, { "epoch": 24.689394759636205, "grad_norm": 0.3820444941520691, "learning_rate": 4.753106052403638e-05, "loss": 0.3625, "step": 3648500 }, { "epoch": 24.692778258986575, "grad_norm": 0.3139392137527466, "learning_rate": 4.753072217410134e-05, "loss": 0.3639, "step": 3649000 }, { "epoch": 24.69616175833694, "grad_norm": 0.3789820671081543, "learning_rate": 4.7530383824166305e-05, "loss": 0.3633, "step": 3649500 }, { "epoch": 24.699545257687312, "grad_norm": 0.36827102303504944, "learning_rate": 4.7530045474231274e-05, "loss": 0.3635, "step": 3650000 }, { "epoch": 24.70292875703768, "grad_norm": 0.40081366896629333, "learning_rate": 4.7529707124296236e-05, "loss": 0.3635, "step": 3650500 }, { "epoch": 24.706312256388046, "grad_norm": 0.38157737255096436, "learning_rate": 4.75293687743612e-05, "loss": 0.3642, "step": 3651000 }, { "epoch": 24.709695755738416, "grad_norm": 0.3455936312675476, "learning_rate": 4.752903042442616e-05, "loss": 0.3631, "step": 3651500 }, { "epoch": 24.713079255088783, "grad_norm": 0.4064480662345886, "learning_rate": 4.752869207449112e-05, "loss": 0.3626, "step": 3652000 }, { "epoch": 24.716462754439153, "grad_norm": 0.3554478883743286, "learning_rate": 4.7528353724556085e-05, "loss": 0.3635, "step": 3652500 }, { "epoch": 24.71984625378952, "grad_norm": 0.3989277184009552, "learning_rate": 4.752801537462105e-05, "loss": 0.3642, "step": 3653000 }, { "epoch": 24.723229753139886, "grad_norm": 0.3907890021800995, "learning_rate": 4.7527677024686016e-05, "loss": 0.3617, "step": 3653500 }, { "epoch": 24.726613252490257, "grad_norm": 0.3725045621395111, "learning_rate": 4.752733867475098e-05, "loss": 0.3633, "step": 3654000 }, { "epoch": 24.729996751840623, "grad_norm": 0.3770049512386322, "learning_rate": 4.752700032481594e-05, "loss": 0.3603, "step": 3654500 }, { "epoch": 24.73338025119099, "grad_norm": 0.35805559158325195, "learning_rate": 4.75266619748809e-05, "loss": 0.3629, "step": 3655000 }, { "epoch": 24.73676375054136, "grad_norm": 0.3792979419231415, "learning_rate": 4.752632362494587e-05, "loss": 0.3636, "step": 3655500 }, { "epoch": 24.740147249891727, "grad_norm": 0.40678563714027405, "learning_rate": 4.752598527501083e-05, "loss": 0.3636, "step": 3656000 }, { "epoch": 24.743530749242097, "grad_norm": 0.3942176103591919, "learning_rate": 4.7525646925075795e-05, "loss": 0.3626, "step": 3656500 }, { "epoch": 24.746914248592464, "grad_norm": 0.3723395764827728, "learning_rate": 4.752530857514075e-05, "loss": 0.3611, "step": 3657000 }, { "epoch": 24.75029774794283, "grad_norm": 0.3674887716770172, "learning_rate": 4.752497022520572e-05, "loss": 0.3624, "step": 3657500 }, { "epoch": 24.7536812472932, "grad_norm": 0.37841156125068665, "learning_rate": 4.752463187527068e-05, "loss": 0.3634, "step": 3658000 }, { "epoch": 24.757064746643568, "grad_norm": 0.423721581697464, "learning_rate": 4.7524293525335644e-05, "loss": 0.362, "step": 3658500 }, { "epoch": 24.76044824599394, "grad_norm": 0.37729448080062866, "learning_rate": 4.7523955175400606e-05, "loss": 0.3619, "step": 3659000 }, { "epoch": 24.763831745344305, "grad_norm": 0.3563602566719055, "learning_rate": 4.7523616825465575e-05, "loss": 0.3637, "step": 3659500 }, { "epoch": 24.767215244694672, "grad_norm": 0.3742561638355255, "learning_rate": 4.752327847553054e-05, "loss": 0.364, "step": 3660000 }, { "epoch": 24.770598744045042, "grad_norm": 0.41626352071762085, "learning_rate": 4.75229401255955e-05, "loss": 0.3628, "step": 3660500 }, { "epoch": 24.77398224339541, "grad_norm": 0.3524860143661499, "learning_rate": 4.752260177566046e-05, "loss": 0.3639, "step": 3661000 }, { "epoch": 24.77736574274578, "grad_norm": 0.4287998378276825, "learning_rate": 4.752226342572542e-05, "loss": 0.3628, "step": 3661500 }, { "epoch": 24.780749242096146, "grad_norm": 0.3763847351074219, "learning_rate": 4.7521925075790385e-05, "loss": 0.3627, "step": 3662000 }, { "epoch": 24.784132741446513, "grad_norm": 0.3679342269897461, "learning_rate": 4.752158672585535e-05, "loss": 0.3638, "step": 3662500 }, { "epoch": 24.787516240796883, "grad_norm": 0.365048885345459, "learning_rate": 4.7521248375920316e-05, "loss": 0.3633, "step": 3663000 }, { "epoch": 24.79089974014725, "grad_norm": 0.3907424509525299, "learning_rate": 4.752091002598528e-05, "loss": 0.3643, "step": 3663500 }, { "epoch": 24.794283239497616, "grad_norm": 0.37695226073265076, "learning_rate": 4.752057167605024e-05, "loss": 0.3623, "step": 3664000 }, { "epoch": 24.797666738847987, "grad_norm": 0.40741339325904846, "learning_rate": 4.75202333261152e-05, "loss": 0.3657, "step": 3664500 }, { "epoch": 24.801050238198354, "grad_norm": 0.35198110342025757, "learning_rate": 4.7519894976180165e-05, "loss": 0.3629, "step": 3665000 }, { "epoch": 24.804433737548724, "grad_norm": 0.36765557527542114, "learning_rate": 4.7519556626245134e-05, "loss": 0.3635, "step": 3665500 }, { "epoch": 24.80781723689909, "grad_norm": 0.3759390711784363, "learning_rate": 4.7519218276310096e-05, "loss": 0.3638, "step": 3666000 }, { "epoch": 24.811200736249457, "grad_norm": 0.3992515802383423, "learning_rate": 4.751887992637505e-05, "loss": 0.3633, "step": 3666500 }, { "epoch": 24.814584235599828, "grad_norm": 0.3276924192905426, "learning_rate": 4.751854157644002e-05, "loss": 0.3622, "step": 3667000 }, { "epoch": 24.817967734950194, "grad_norm": 0.3511314392089844, "learning_rate": 4.751820322650498e-05, "loss": 0.3615, "step": 3667500 }, { "epoch": 24.821351234300565, "grad_norm": 0.36177822947502136, "learning_rate": 4.7517864876569944e-05, "loss": 0.3636, "step": 3668000 }, { "epoch": 24.82473473365093, "grad_norm": 0.32816770672798157, "learning_rate": 4.7517526526634906e-05, "loss": 0.3637, "step": 3668500 }, { "epoch": 24.828118233001298, "grad_norm": 0.31602737307548523, "learning_rate": 4.7517188176699875e-05, "loss": 0.3631, "step": 3669000 }, { "epoch": 24.83150173235167, "grad_norm": 0.3635820746421814, "learning_rate": 4.751684982676484e-05, "loss": 0.3645, "step": 3669500 }, { "epoch": 24.834885231702035, "grad_norm": 0.3586866855621338, "learning_rate": 4.75165114768298e-05, "loss": 0.362, "step": 3670000 }, { "epoch": 24.838268731052402, "grad_norm": 0.31392955780029297, "learning_rate": 4.751617312689476e-05, "loss": 0.3641, "step": 3670500 }, { "epoch": 24.841652230402772, "grad_norm": 0.37633511424064636, "learning_rate": 4.751583477695973e-05, "loss": 0.3626, "step": 3671000 }, { "epoch": 24.84503572975314, "grad_norm": 0.3376986086368561, "learning_rate": 4.7515496427024686e-05, "loss": 0.3638, "step": 3671500 }, { "epoch": 24.84841922910351, "grad_norm": 0.3483399450778961, "learning_rate": 4.751515807708965e-05, "loss": 0.3639, "step": 3672000 }, { "epoch": 24.851802728453876, "grad_norm": 0.36665934324264526, "learning_rate": 4.751481972715462e-05, "loss": 0.3636, "step": 3672500 }, { "epoch": 24.855186227804243, "grad_norm": 0.3893853425979614, "learning_rate": 4.751448137721958e-05, "loss": 0.3639, "step": 3673000 }, { "epoch": 24.858569727154613, "grad_norm": 0.3913221061229706, "learning_rate": 4.751414302728454e-05, "loss": 0.362, "step": 3673500 }, { "epoch": 24.86195322650498, "grad_norm": 0.36556679010391235, "learning_rate": 4.75138046773495e-05, "loss": 0.3625, "step": 3674000 }, { "epoch": 24.86533672585535, "grad_norm": 0.40248703956604004, "learning_rate": 4.7513466327414465e-05, "loss": 0.3632, "step": 3674500 }, { "epoch": 24.868720225205717, "grad_norm": 0.35256707668304443, "learning_rate": 4.7513127977479434e-05, "loss": 0.3623, "step": 3675000 }, { "epoch": 24.872103724556084, "grad_norm": 0.3386058807373047, "learning_rate": 4.7512789627544396e-05, "loss": 0.3627, "step": 3675500 }, { "epoch": 24.875487223906454, "grad_norm": 0.36424487829208374, "learning_rate": 4.751245127760935e-05, "loss": 0.365, "step": 3676000 }, { "epoch": 24.87887072325682, "grad_norm": 0.38003039360046387, "learning_rate": 4.751211292767432e-05, "loss": 0.3641, "step": 3676500 }, { "epoch": 24.882254222607187, "grad_norm": 0.40823283791542053, "learning_rate": 4.751177457773928e-05, "loss": 0.3634, "step": 3677000 }, { "epoch": 24.885637721957558, "grad_norm": 0.40488094091415405, "learning_rate": 4.7511436227804245e-05, "loss": 0.3642, "step": 3677500 }, { "epoch": 24.889021221307924, "grad_norm": 0.37072402238845825, "learning_rate": 4.751109787786921e-05, "loss": 0.3629, "step": 3678000 }, { "epoch": 24.892404720658295, "grad_norm": 0.3462226092815399, "learning_rate": 4.7510759527934176e-05, "loss": 0.3617, "step": 3678500 }, { "epoch": 24.89578822000866, "grad_norm": 0.3488467037677765, "learning_rate": 4.751042117799914e-05, "loss": 0.364, "step": 3679000 }, { "epoch": 24.89917171935903, "grad_norm": 0.3277910053730011, "learning_rate": 4.75100828280641e-05, "loss": 0.3638, "step": 3679500 }, { "epoch": 24.9025552187094, "grad_norm": 0.40324801206588745, "learning_rate": 4.750974447812906e-05, "loss": 0.3618, "step": 3680000 }, { "epoch": 24.905938718059765, "grad_norm": 0.34906333684921265, "learning_rate": 4.750940612819403e-05, "loss": 0.3637, "step": 3680500 }, { "epoch": 24.909322217410136, "grad_norm": 0.37106651067733765, "learning_rate": 4.7509067778258987e-05, "loss": 0.3628, "step": 3681000 }, { "epoch": 24.912705716760502, "grad_norm": 0.37618666887283325, "learning_rate": 4.750872942832395e-05, "loss": 0.3655, "step": 3681500 }, { "epoch": 24.91608921611087, "grad_norm": 0.3458239436149597, "learning_rate": 4.750839107838891e-05, "loss": 0.3624, "step": 3682000 }, { "epoch": 24.91947271546124, "grad_norm": 0.43893659114837646, "learning_rate": 4.750805272845388e-05, "loss": 0.3626, "step": 3682500 }, { "epoch": 24.922856214811606, "grad_norm": 0.3353835642337799, "learning_rate": 4.750771437851884e-05, "loss": 0.3653, "step": 3683000 }, { "epoch": 24.926239714161976, "grad_norm": 0.3734031319618225, "learning_rate": 4.7507376028583804e-05, "loss": 0.3636, "step": 3683500 }, { "epoch": 24.929623213512343, "grad_norm": 0.3506411910057068, "learning_rate": 4.7507037678648766e-05, "loss": 0.3631, "step": 3684000 }, { "epoch": 24.93300671286271, "grad_norm": 0.35105693340301514, "learning_rate": 4.7506699328713735e-05, "loss": 0.3631, "step": 3684500 }, { "epoch": 24.93639021221308, "grad_norm": 0.3812028169631958, "learning_rate": 4.75063609787787e-05, "loss": 0.3631, "step": 3685000 }, { "epoch": 24.939773711563447, "grad_norm": 0.38500118255615234, "learning_rate": 4.750602262884365e-05, "loss": 0.363, "step": 3685500 }, { "epoch": 24.943157210913817, "grad_norm": 0.40273165702819824, "learning_rate": 4.750568427890862e-05, "loss": 0.3631, "step": 3686000 }, { "epoch": 24.946540710264184, "grad_norm": 0.34937480092048645, "learning_rate": 4.7505345928973583e-05, "loss": 0.3623, "step": 3686500 }, { "epoch": 24.94992420961455, "grad_norm": 0.3561623990535736, "learning_rate": 4.7505007579038546e-05, "loss": 0.3631, "step": 3687000 }, { "epoch": 24.95330770896492, "grad_norm": 0.3699971139431, "learning_rate": 4.750466922910351e-05, "loss": 0.365, "step": 3687500 }, { "epoch": 24.956691208315288, "grad_norm": 0.36315932869911194, "learning_rate": 4.750433087916848e-05, "loss": 0.3631, "step": 3688000 }, { "epoch": 24.960074707665655, "grad_norm": 0.4144958555698395, "learning_rate": 4.750399252923344e-05, "loss": 0.3622, "step": 3688500 }, { "epoch": 24.963458207016025, "grad_norm": 0.3754120171070099, "learning_rate": 4.75036541792984e-05, "loss": 0.3649, "step": 3689000 }, { "epoch": 24.96684170636639, "grad_norm": 0.3640006184577942, "learning_rate": 4.750331582936336e-05, "loss": 0.3623, "step": 3689500 }, { "epoch": 24.970225205716762, "grad_norm": 0.35096555948257446, "learning_rate": 4.750297747942833e-05, "loss": 0.3619, "step": 3690000 }, { "epoch": 24.97360870506713, "grad_norm": 0.33679062128067017, "learning_rate": 4.750263912949329e-05, "loss": 0.3629, "step": 3690500 }, { "epoch": 24.976992204417495, "grad_norm": 0.35485759377479553, "learning_rate": 4.750230077955825e-05, "loss": 0.3622, "step": 3691000 }, { "epoch": 24.980375703767866, "grad_norm": 0.3367948830127716, "learning_rate": 4.750196242962321e-05, "loss": 0.3641, "step": 3691500 }, { "epoch": 24.983759203118233, "grad_norm": 0.3389408588409424, "learning_rate": 4.750162407968818e-05, "loss": 0.3633, "step": 3692000 }, { "epoch": 24.987142702468603, "grad_norm": 0.37847328186035156, "learning_rate": 4.750128572975314e-05, "loss": 0.363, "step": 3692500 }, { "epoch": 24.99052620181897, "grad_norm": 0.3536604344844818, "learning_rate": 4.7500947379818105e-05, "loss": 0.3629, "step": 3693000 }, { "epoch": 24.993909701169336, "grad_norm": 0.36198484897613525, "learning_rate": 4.750060902988307e-05, "loss": 0.3624, "step": 3693500 }, { "epoch": 24.997293200519707, "grad_norm": 0.3825433850288391, "learning_rate": 4.7500270679948036e-05, "loss": 0.3643, "step": 3694000 }, { "epoch": 25.0, "eval_accuracy": 0.8616033147198132, "eval_loss": 0.5629200339317322, "eval_runtime": 3407.061, "eval_samples_per_second": 85.336, "eval_steps_per_second": 5.334, "step": 3694400 }, { "epoch": 25.000676699870073, "grad_norm": 0.3510105013847351, "learning_rate": 4.7499932330013e-05, "loss": 0.3628, "step": 3694500 }, { "epoch": 25.00406019922044, "grad_norm": 0.40690895915031433, "learning_rate": 4.749959398007795e-05, "loss": 0.3626, "step": 3695000 }, { "epoch": 25.00744369857081, "grad_norm": 0.3398245573043823, "learning_rate": 4.749925563014292e-05, "loss": 0.3613, "step": 3695500 }, { "epoch": 25.010827197921177, "grad_norm": 0.3666698634624481, "learning_rate": 4.7498917280207884e-05, "loss": 0.3625, "step": 3696000 }, { "epoch": 25.014210697271547, "grad_norm": 0.37911367416381836, "learning_rate": 4.7498578930272846e-05, "loss": 0.3618, "step": 3696500 }, { "epoch": 25.017594196621914, "grad_norm": 0.32052409648895264, "learning_rate": 4.749824058033781e-05, "loss": 0.3606, "step": 3697000 }, { "epoch": 25.02097769597228, "grad_norm": 0.31819668412208557, "learning_rate": 4.749790223040278e-05, "loss": 0.3599, "step": 3697500 }, { "epoch": 25.02436119532265, "grad_norm": 0.3609914481639862, "learning_rate": 4.749756388046774e-05, "loss": 0.3605, "step": 3698000 }, { "epoch": 25.027744694673018, "grad_norm": 0.37735220789909363, "learning_rate": 4.74972255305327e-05, "loss": 0.3614, "step": 3698500 }, { "epoch": 25.03112819402339, "grad_norm": 0.38208866119384766, "learning_rate": 4.7496887180597664e-05, "loss": 0.3617, "step": 3699000 }, { "epoch": 25.034511693373755, "grad_norm": 0.352157860994339, "learning_rate": 4.749654883066263e-05, "loss": 0.3613, "step": 3699500 }, { "epoch": 25.037895192724122, "grad_norm": 0.37447309494018555, "learning_rate": 4.749621048072759e-05, "loss": 0.3605, "step": 3700000 }, { "epoch": 25.041278692074492, "grad_norm": 0.3469589650630951, "learning_rate": 4.749587213079255e-05, "loss": 0.361, "step": 3700500 }, { "epoch": 25.04466219142486, "grad_norm": 0.38122642040252686, "learning_rate": 4.749553378085751e-05, "loss": 0.3619, "step": 3701000 }, { "epoch": 25.04804569077523, "grad_norm": 0.3568724989891052, "learning_rate": 4.749519543092248e-05, "loss": 0.3606, "step": 3701500 }, { "epoch": 25.051429190125596, "grad_norm": 0.3496883809566498, "learning_rate": 4.749485708098744e-05, "loss": 0.3612, "step": 3702000 }, { "epoch": 25.054812689475963, "grad_norm": 0.3566105365753174, "learning_rate": 4.7494518731052405e-05, "loss": 0.3612, "step": 3702500 }, { "epoch": 25.058196188826333, "grad_norm": 0.37609434127807617, "learning_rate": 4.749418038111737e-05, "loss": 0.36, "step": 3703000 }, { "epoch": 25.0615796881767, "grad_norm": 0.35694074630737305, "learning_rate": 4.7493842031182336e-05, "loss": 0.3622, "step": 3703500 }, { "epoch": 25.064963187527066, "grad_norm": 0.4110996425151825, "learning_rate": 4.74935036812473e-05, "loss": 0.3622, "step": 3704000 }, { "epoch": 25.068346686877437, "grad_norm": 0.37473252415657043, "learning_rate": 4.7493165331312254e-05, "loss": 0.3622, "step": 3704500 }, { "epoch": 25.071730186227803, "grad_norm": 0.357221782207489, "learning_rate": 4.749282698137722e-05, "loss": 0.3619, "step": 3705000 }, { "epoch": 25.075113685578174, "grad_norm": 0.3608001172542572, "learning_rate": 4.7492488631442185e-05, "loss": 0.3615, "step": 3705500 }, { "epoch": 25.07849718492854, "grad_norm": 0.38230764865875244, "learning_rate": 4.749215028150715e-05, "loss": 0.3616, "step": 3706000 }, { "epoch": 25.081880684278907, "grad_norm": 0.36422863602638245, "learning_rate": 4.749181193157211e-05, "loss": 0.3626, "step": 3706500 }, { "epoch": 25.085264183629278, "grad_norm": 0.3531195819377899, "learning_rate": 4.749147358163708e-05, "loss": 0.3614, "step": 3707000 }, { "epoch": 25.088647682979644, "grad_norm": 0.3735826909542084, "learning_rate": 4.749113523170204e-05, "loss": 0.3614, "step": 3707500 }, { "epoch": 25.092031182330015, "grad_norm": 0.3754367232322693, "learning_rate": 4.7490796881767e-05, "loss": 0.3613, "step": 3708000 }, { "epoch": 25.09541468168038, "grad_norm": 0.38606202602386475, "learning_rate": 4.7490458531831964e-05, "loss": 0.3621, "step": 3708500 }, { "epoch": 25.098798181030748, "grad_norm": 0.39081791043281555, "learning_rate": 4.749012018189693e-05, "loss": 0.3629, "step": 3709000 }, { "epoch": 25.10218168038112, "grad_norm": 0.3802073299884796, "learning_rate": 4.748978183196189e-05, "loss": 0.3625, "step": 3709500 }, { "epoch": 25.105565179731485, "grad_norm": 0.3735586702823639, "learning_rate": 4.748944348202685e-05, "loss": 0.3624, "step": 3710000 }, { "epoch": 25.108948679081852, "grad_norm": 0.3707588315010071, "learning_rate": 4.748910513209181e-05, "loss": 0.3624, "step": 3710500 }, { "epoch": 25.112332178432222, "grad_norm": 0.39641883969306946, "learning_rate": 4.748876678215678e-05, "loss": 0.362, "step": 3711000 }, { "epoch": 25.11571567778259, "grad_norm": 0.4227748513221741, "learning_rate": 4.7488428432221744e-05, "loss": 0.3631, "step": 3711500 }, { "epoch": 25.11909917713296, "grad_norm": 0.37438488006591797, "learning_rate": 4.7488090082286706e-05, "loss": 0.3631, "step": 3712000 }, { "epoch": 25.122482676483326, "grad_norm": 0.36240556836128235, "learning_rate": 4.748775173235167e-05, "loss": 0.3612, "step": 3712500 }, { "epoch": 25.125866175833693, "grad_norm": 0.3844728469848633, "learning_rate": 4.748741338241664e-05, "loss": 0.3611, "step": 3713000 }, { "epoch": 25.129249675184063, "grad_norm": 0.38626983761787415, "learning_rate": 4.74870750324816e-05, "loss": 0.3607, "step": 3713500 }, { "epoch": 25.13263317453443, "grad_norm": 0.3698998987674713, "learning_rate": 4.7486736682546554e-05, "loss": 0.3622, "step": 3714000 }, { "epoch": 25.1360166738848, "grad_norm": 0.3693174421787262, "learning_rate": 4.748639833261152e-05, "loss": 0.3612, "step": 3714500 }, { "epoch": 25.139400173235167, "grad_norm": 0.3790559768676758, "learning_rate": 4.7486059982676485e-05, "loss": 0.3618, "step": 3715000 }, { "epoch": 25.142783672585534, "grad_norm": 0.3717188537120819, "learning_rate": 4.748572163274145e-05, "loss": 0.3639, "step": 3715500 }, { "epoch": 25.146167171935904, "grad_norm": 0.4066247045993805, "learning_rate": 4.748538328280641e-05, "loss": 0.3638, "step": 3716000 }, { "epoch": 25.14955067128627, "grad_norm": 0.3499060869216919, "learning_rate": 4.748504493287138e-05, "loss": 0.3617, "step": 3716500 }, { "epoch": 25.15293417063664, "grad_norm": 0.35585179924964905, "learning_rate": 4.748470658293634e-05, "loss": 0.3623, "step": 3717000 }, { "epoch": 25.156317669987008, "grad_norm": 0.36418354511260986, "learning_rate": 4.74843682330013e-05, "loss": 0.3623, "step": 3717500 }, { "epoch": 25.159701169337374, "grad_norm": 0.3552171587944031, "learning_rate": 4.7484029883066265e-05, "loss": 0.3615, "step": 3718000 }, { "epoch": 25.163084668687745, "grad_norm": 0.3636505901813507, "learning_rate": 4.7483691533131234e-05, "loss": 0.3634, "step": 3718500 }, { "epoch": 25.16646816803811, "grad_norm": 0.3401271104812622, "learning_rate": 4.748335318319619e-05, "loss": 0.3618, "step": 3719000 }, { "epoch": 25.16985166738848, "grad_norm": 0.4107031524181366, "learning_rate": 4.748301483326115e-05, "loss": 0.3637, "step": 3719500 }, { "epoch": 25.17323516673885, "grad_norm": 0.3740893304347992, "learning_rate": 4.748267648332611e-05, "loss": 0.3619, "step": 3720000 }, { "epoch": 25.176618666089215, "grad_norm": 0.3845370411872864, "learning_rate": 4.748233813339108e-05, "loss": 0.3616, "step": 3720500 }, { "epoch": 25.180002165439586, "grad_norm": 0.3863700330257416, "learning_rate": 4.7481999783456044e-05, "loss": 0.3602, "step": 3721000 }, { "epoch": 25.183385664789952, "grad_norm": 0.43995434045791626, "learning_rate": 4.7481661433521007e-05, "loss": 0.3612, "step": 3721500 }, { "epoch": 25.18676916414032, "grad_norm": 0.3989493250846863, "learning_rate": 4.748132308358597e-05, "loss": 0.3626, "step": 3722000 }, { "epoch": 25.19015266349069, "grad_norm": 0.369140625, "learning_rate": 4.748098473365094e-05, "loss": 0.3626, "step": 3722500 }, { "epoch": 25.193536162841056, "grad_norm": 0.38794782757759094, "learning_rate": 4.74806463837159e-05, "loss": 0.3617, "step": 3723000 }, { "epoch": 25.196919662191426, "grad_norm": 0.32923948764801025, "learning_rate": 4.748030803378086e-05, "loss": 0.3638, "step": 3723500 }, { "epoch": 25.200303161541793, "grad_norm": 0.34737542271614075, "learning_rate": 4.7479969683845824e-05, "loss": 0.3632, "step": 3724000 }, { "epoch": 25.20368666089216, "grad_norm": 0.37374576926231384, "learning_rate": 4.7479631333910786e-05, "loss": 0.3632, "step": 3724500 }, { "epoch": 25.20707016024253, "grad_norm": 0.39663198590278625, "learning_rate": 4.747929298397575e-05, "loss": 0.3616, "step": 3725000 }, { "epoch": 25.210453659592897, "grad_norm": 0.38434311747550964, "learning_rate": 4.747895463404071e-05, "loss": 0.3597, "step": 3725500 }, { "epoch": 25.213837158943264, "grad_norm": 0.3597494661808014, "learning_rate": 4.747861628410568e-05, "loss": 0.3623, "step": 3726000 }, { "epoch": 25.217220658293634, "grad_norm": 0.4105066955089569, "learning_rate": 4.747827793417064e-05, "loss": 0.3622, "step": 3726500 }, { "epoch": 25.220604157644, "grad_norm": 0.37283262610435486, "learning_rate": 4.7477939584235603e-05, "loss": 0.3634, "step": 3727000 }, { "epoch": 25.22398765699437, "grad_norm": 0.3647730350494385, "learning_rate": 4.7477601234300566e-05, "loss": 0.3622, "step": 3727500 }, { "epoch": 25.227371156344738, "grad_norm": 0.388548344373703, "learning_rate": 4.747726288436553e-05, "loss": 0.3635, "step": 3728000 }, { "epoch": 25.230754655695105, "grad_norm": 0.4077318012714386, "learning_rate": 4.747692453443049e-05, "loss": 0.362, "step": 3728500 }, { "epoch": 25.234138155045475, "grad_norm": 0.3589150011539459, "learning_rate": 4.747658618449545e-05, "loss": 0.3627, "step": 3729000 }, { "epoch": 25.23752165439584, "grad_norm": 0.38778451085090637, "learning_rate": 4.7476247834560414e-05, "loss": 0.3633, "step": 3729500 }, { "epoch": 25.240905153746212, "grad_norm": 0.408589631319046, "learning_rate": 4.747590948462538e-05, "loss": 0.3628, "step": 3730000 }, { "epoch": 25.24428865309658, "grad_norm": 0.3778936564922333, "learning_rate": 4.7475571134690345e-05, "loss": 0.3631, "step": 3730500 }, { "epoch": 25.247672152446945, "grad_norm": 0.41418513655662537, "learning_rate": 4.747523278475531e-05, "loss": 0.3638, "step": 3731000 }, { "epoch": 25.251055651797316, "grad_norm": 0.36329400539398193, "learning_rate": 4.747489443482027e-05, "loss": 0.3615, "step": 3731500 }, { "epoch": 25.254439151147682, "grad_norm": 0.369795560836792, "learning_rate": 4.747455608488524e-05, "loss": 0.3616, "step": 3732000 }, { "epoch": 25.257822650498053, "grad_norm": 0.3831300437450409, "learning_rate": 4.74742177349502e-05, "loss": 0.3607, "step": 3732500 }, { "epoch": 25.26120614984842, "grad_norm": 0.344378799200058, "learning_rate": 4.747387938501516e-05, "loss": 0.3626, "step": 3733000 }, { "epoch": 25.264589649198786, "grad_norm": 0.3777843415737152, "learning_rate": 4.7473541035080125e-05, "loss": 0.3616, "step": 3733500 }, { "epoch": 25.267973148549157, "grad_norm": 0.3878310024738312, "learning_rate": 4.747320268514509e-05, "loss": 0.3627, "step": 3734000 }, { "epoch": 25.271356647899523, "grad_norm": 0.4041731655597687, "learning_rate": 4.747286433521005e-05, "loss": 0.3617, "step": 3734500 }, { "epoch": 25.27474014724989, "grad_norm": 0.36002233624458313, "learning_rate": 4.747252598527501e-05, "loss": 0.3615, "step": 3735000 }, { "epoch": 25.27812364660026, "grad_norm": 0.3519437611103058, "learning_rate": 4.747218763533997e-05, "loss": 0.3633, "step": 3735500 }, { "epoch": 25.281507145950627, "grad_norm": 0.3840735852718353, "learning_rate": 4.747184928540494e-05, "loss": 0.3627, "step": 3736000 }, { "epoch": 25.284890645300997, "grad_norm": 0.33825719356536865, "learning_rate": 4.7471510935469904e-05, "loss": 0.3605, "step": 3736500 }, { "epoch": 25.288274144651364, "grad_norm": 0.3834848403930664, "learning_rate": 4.7471172585534866e-05, "loss": 0.3636, "step": 3737000 }, { "epoch": 25.29165764400173, "grad_norm": 0.35997170209884644, "learning_rate": 4.747083423559983e-05, "loss": 0.364, "step": 3737500 }, { "epoch": 25.2950411433521, "grad_norm": 0.385812908411026, "learning_rate": 4.747049588566479e-05, "loss": 0.3637, "step": 3738000 }, { "epoch": 25.298424642702468, "grad_norm": 0.34480977058410645, "learning_rate": 4.747015753572975e-05, "loss": 0.3615, "step": 3738500 }, { "epoch": 25.30180814205284, "grad_norm": 0.3738287389278412, "learning_rate": 4.7469819185794715e-05, "loss": 0.3637, "step": 3739000 }, { "epoch": 25.305191641403205, "grad_norm": 0.34847918152809143, "learning_rate": 4.7469480835859684e-05, "loss": 0.3607, "step": 3739500 }, { "epoch": 25.30857514075357, "grad_norm": 0.3540794849395752, "learning_rate": 4.7469142485924646e-05, "loss": 0.3604, "step": 3740000 }, { "epoch": 25.311958640103942, "grad_norm": 0.39001625776290894, "learning_rate": 4.746880413598961e-05, "loss": 0.3628, "step": 3740500 }, { "epoch": 25.31534213945431, "grad_norm": 0.3212912082672119, "learning_rate": 4.746846578605457e-05, "loss": 0.362, "step": 3741000 }, { "epoch": 25.31872563880468, "grad_norm": 0.36917877197265625, "learning_rate": 4.746812743611954e-05, "loss": 0.3616, "step": 3741500 }, { "epoch": 25.322109138155046, "grad_norm": 0.36249247193336487, "learning_rate": 4.74677890861845e-05, "loss": 0.3632, "step": 3742000 }, { "epoch": 25.325492637505413, "grad_norm": 0.30697494745254517, "learning_rate": 4.746745073624946e-05, "loss": 0.3636, "step": 3742500 }, { "epoch": 25.328876136855783, "grad_norm": 0.37793347239494324, "learning_rate": 4.7467112386314425e-05, "loss": 0.3636, "step": 3743000 }, { "epoch": 25.33225963620615, "grad_norm": 0.3809393346309662, "learning_rate": 4.746677403637939e-05, "loss": 0.3618, "step": 3743500 }, { "epoch": 25.335643135556516, "grad_norm": 0.36163750290870667, "learning_rate": 4.746643568644435e-05, "loss": 0.3619, "step": 3744000 }, { "epoch": 25.339026634906887, "grad_norm": 0.4327174723148346, "learning_rate": 4.746609733650931e-05, "loss": 0.3622, "step": 3744500 }, { "epoch": 25.342410134257253, "grad_norm": 0.3379616439342499, "learning_rate": 4.7465758986574274e-05, "loss": 0.3622, "step": 3745000 }, { "epoch": 25.345793633607624, "grad_norm": 0.3480340242385864, "learning_rate": 4.746542063663924e-05, "loss": 0.3651, "step": 3745500 }, { "epoch": 25.34917713295799, "grad_norm": 0.3799471855163574, "learning_rate": 4.7465082286704205e-05, "loss": 0.3615, "step": 3746000 }, { "epoch": 25.352560632308357, "grad_norm": 0.40400072932243347, "learning_rate": 4.746474393676917e-05, "loss": 0.3636, "step": 3746500 }, { "epoch": 25.355944131658728, "grad_norm": 0.3719613552093506, "learning_rate": 4.746440558683413e-05, "loss": 0.3624, "step": 3747000 }, { "epoch": 25.359327631009094, "grad_norm": 0.3671671152114868, "learning_rate": 4.746406723689909e-05, "loss": 0.3623, "step": 3747500 }, { "epoch": 25.362711130359465, "grad_norm": 0.3434518873691559, "learning_rate": 4.746372888696405e-05, "loss": 0.3624, "step": 3748000 }, { "epoch": 25.36609462970983, "grad_norm": 0.36297357082366943, "learning_rate": 4.7463390537029015e-05, "loss": 0.3636, "step": 3748500 }, { "epoch": 25.369478129060198, "grad_norm": 0.3704264760017395, "learning_rate": 4.7463052187093984e-05, "loss": 0.3621, "step": 3749000 }, { "epoch": 25.37286162841057, "grad_norm": 0.36693716049194336, "learning_rate": 4.7462713837158946e-05, "loss": 0.362, "step": 3749500 }, { "epoch": 25.376245127760935, "grad_norm": 0.40354418754577637, "learning_rate": 4.746237548722391e-05, "loss": 0.3613, "step": 3750000 }, { "epoch": 25.379628627111302, "grad_norm": 0.3502896726131439, "learning_rate": 4.746203713728887e-05, "loss": 0.3618, "step": 3750500 }, { "epoch": 25.383012126461672, "grad_norm": 0.381521999835968, "learning_rate": 4.746169878735384e-05, "loss": 0.3638, "step": 3751000 }, { "epoch": 25.38639562581204, "grad_norm": 0.41425198316574097, "learning_rate": 4.74613604374188e-05, "loss": 0.3631, "step": 3751500 }, { "epoch": 25.38977912516241, "grad_norm": 0.3814198076725006, "learning_rate": 4.7461022087483764e-05, "loss": 0.3639, "step": 3752000 }, { "epoch": 25.393162624512776, "grad_norm": 0.3682596683502197, "learning_rate": 4.746068373754872e-05, "loss": 0.3631, "step": 3752500 }, { "epoch": 25.396546123863143, "grad_norm": 0.3883252441883087, "learning_rate": 4.746034538761369e-05, "loss": 0.3621, "step": 3753000 }, { "epoch": 25.399929623213513, "grad_norm": 0.3718891739845276, "learning_rate": 4.746000703767865e-05, "loss": 0.3634, "step": 3753500 }, { "epoch": 25.40331312256388, "grad_norm": 0.37141090631484985, "learning_rate": 4.745966868774361e-05, "loss": 0.3623, "step": 3754000 }, { "epoch": 25.40669662191425, "grad_norm": 0.3447796702384949, "learning_rate": 4.7459330337808574e-05, "loss": 0.3627, "step": 3754500 }, { "epoch": 25.410080121264617, "grad_norm": 0.3654313087463379, "learning_rate": 4.745899198787354e-05, "loss": 0.3621, "step": 3755000 }, { "epoch": 25.413463620614984, "grad_norm": 0.3879503011703491, "learning_rate": 4.7458653637938505e-05, "loss": 0.3623, "step": 3755500 }, { "epoch": 25.416847119965354, "grad_norm": 0.34639325737953186, "learning_rate": 4.745831528800347e-05, "loss": 0.3627, "step": 3756000 }, { "epoch": 25.42023061931572, "grad_norm": 0.3954552710056305, "learning_rate": 4.745797693806843e-05, "loss": 0.3609, "step": 3756500 }, { "epoch": 25.42361411866609, "grad_norm": 0.3754726052284241, "learning_rate": 4.745763858813339e-05, "loss": 0.363, "step": 3757000 }, { "epoch": 25.426997618016458, "grad_norm": 0.39806005358695984, "learning_rate": 4.7457300238198354e-05, "loss": 0.3633, "step": 3757500 }, { "epoch": 25.430381117366824, "grad_norm": 0.3469841480255127, "learning_rate": 4.7456961888263316e-05, "loss": 0.3633, "step": 3758000 }, { "epoch": 25.433764616717195, "grad_norm": 0.3493543863296509, "learning_rate": 4.7456623538328285e-05, "loss": 0.3622, "step": 3758500 }, { "epoch": 25.43714811606756, "grad_norm": 0.37127816677093506, "learning_rate": 4.745628518839325e-05, "loss": 0.3649, "step": 3759000 }, { "epoch": 25.440531615417928, "grad_norm": 0.36104732751846313, "learning_rate": 4.745594683845821e-05, "loss": 0.3634, "step": 3759500 }, { "epoch": 25.4439151147683, "grad_norm": 0.3581537902355194, "learning_rate": 4.745560848852317e-05, "loss": 0.3615, "step": 3760000 }, { "epoch": 25.447298614118665, "grad_norm": 0.34989815950393677, "learning_rate": 4.745527013858814e-05, "loss": 0.3638, "step": 3760500 }, { "epoch": 25.450682113469036, "grad_norm": 0.4142208993434906, "learning_rate": 4.74549317886531e-05, "loss": 0.3622, "step": 3761000 }, { "epoch": 25.454065612819402, "grad_norm": 0.4234132766723633, "learning_rate": 4.7454593438718064e-05, "loss": 0.3631, "step": 3761500 }, { "epoch": 25.45744911216977, "grad_norm": 0.4301721751689911, "learning_rate": 4.745425508878302e-05, "loss": 0.3651, "step": 3762000 }, { "epoch": 25.46083261152014, "grad_norm": 0.38137391209602356, "learning_rate": 4.745391673884799e-05, "loss": 0.3622, "step": 3762500 }, { "epoch": 25.464216110870506, "grad_norm": 0.4020451307296753, "learning_rate": 4.745357838891295e-05, "loss": 0.3631, "step": 3763000 }, { "epoch": 25.467599610220876, "grad_norm": 0.3583117425441742, "learning_rate": 4.745324003897791e-05, "loss": 0.3633, "step": 3763500 }, { "epoch": 25.470983109571243, "grad_norm": 0.34035882353782654, "learning_rate": 4.7452901689042875e-05, "loss": 0.3637, "step": 3764000 }, { "epoch": 25.47436660892161, "grad_norm": 0.3805950880050659, "learning_rate": 4.7452563339107844e-05, "loss": 0.3634, "step": 3764500 }, { "epoch": 25.47775010827198, "grad_norm": 0.36006009578704834, "learning_rate": 4.7452224989172806e-05, "loss": 0.3626, "step": 3765000 }, { "epoch": 25.481133607622347, "grad_norm": 0.3865433931350708, "learning_rate": 4.745188663923777e-05, "loss": 0.3619, "step": 3765500 }, { "epoch": 25.484517106972717, "grad_norm": 0.3628615736961365, "learning_rate": 4.745154828930273e-05, "loss": 0.3615, "step": 3766000 }, { "epoch": 25.487900606323084, "grad_norm": 0.37006327509880066, "learning_rate": 4.745120993936769e-05, "loss": 0.3619, "step": 3766500 }, { "epoch": 25.49128410567345, "grad_norm": 0.36280131340026855, "learning_rate": 4.7450871589432654e-05, "loss": 0.3625, "step": 3767000 }, { "epoch": 25.49466760502382, "grad_norm": 0.363631010055542, "learning_rate": 4.7450533239497617e-05, "loss": 0.3619, "step": 3767500 }, { "epoch": 25.498051104374188, "grad_norm": 0.36467093229293823, "learning_rate": 4.7450194889562585e-05, "loss": 0.362, "step": 3768000 }, { "epoch": 25.501434603724555, "grad_norm": 0.34792622923851013, "learning_rate": 4.744985653962755e-05, "loss": 0.3626, "step": 3768500 }, { "epoch": 25.504818103074925, "grad_norm": 0.35685989260673523, "learning_rate": 4.744951818969251e-05, "loss": 0.3625, "step": 3769000 }, { "epoch": 25.50820160242529, "grad_norm": 0.40018942952156067, "learning_rate": 4.744917983975747e-05, "loss": 0.3626, "step": 3769500 }, { "epoch": 25.511585101775662, "grad_norm": 0.3791816830635071, "learning_rate": 4.744884148982244e-05, "loss": 0.3629, "step": 3770000 }, { "epoch": 25.51496860112603, "grad_norm": 0.3799859881401062, "learning_rate": 4.74485031398874e-05, "loss": 0.3628, "step": 3770500 }, { "epoch": 25.518352100476395, "grad_norm": 0.3531995117664337, "learning_rate": 4.7448164789952365e-05, "loss": 0.3646, "step": 3771000 }, { "epoch": 25.521735599826766, "grad_norm": 0.3656378984451294, "learning_rate": 4.744782644001732e-05, "loss": 0.3633, "step": 3771500 }, { "epoch": 25.525119099177132, "grad_norm": 0.35773876309394836, "learning_rate": 4.744748809008229e-05, "loss": 0.3636, "step": 3772000 }, { "epoch": 25.528502598527503, "grad_norm": 0.40995362401008606, "learning_rate": 4.744714974014725e-05, "loss": 0.3627, "step": 3772500 }, { "epoch": 25.53188609787787, "grad_norm": 0.3314554691314697, "learning_rate": 4.7446811390212213e-05, "loss": 0.3636, "step": 3773000 }, { "epoch": 25.535269597228236, "grad_norm": 0.34793007373809814, "learning_rate": 4.7446473040277176e-05, "loss": 0.3628, "step": 3773500 }, { "epoch": 25.538653096578606, "grad_norm": 0.4217507839202881, "learning_rate": 4.7446134690342144e-05, "loss": 0.362, "step": 3774000 }, { "epoch": 25.542036595928973, "grad_norm": 0.4134977459907532, "learning_rate": 4.7445796340407107e-05, "loss": 0.3629, "step": 3774500 }, { "epoch": 25.54542009527934, "grad_norm": 0.35792210698127747, "learning_rate": 4.744545799047207e-05, "loss": 0.3607, "step": 3775000 }, { "epoch": 25.54880359462971, "grad_norm": 0.4191746711730957, "learning_rate": 4.744511964053703e-05, "loss": 0.3628, "step": 3775500 }, { "epoch": 25.552187093980077, "grad_norm": 0.35209158062934875, "learning_rate": 4.744478129060199e-05, "loss": 0.3615, "step": 3776000 }, { "epoch": 25.555570593330447, "grad_norm": 0.4061746299266815, "learning_rate": 4.7444442940666955e-05, "loss": 0.3636, "step": 3776500 }, { "epoch": 25.558954092680814, "grad_norm": 0.34914374351501465, "learning_rate": 4.744410459073192e-05, "loss": 0.3633, "step": 3777000 }, { "epoch": 25.56233759203118, "grad_norm": 0.38358649611473083, "learning_rate": 4.7443766240796886e-05, "loss": 0.3645, "step": 3777500 }, { "epoch": 25.56572109138155, "grad_norm": 0.4001278877258301, "learning_rate": 4.744342789086185e-05, "loss": 0.3618, "step": 3778000 }, { "epoch": 25.569104590731918, "grad_norm": 0.3768290877342224, "learning_rate": 4.744308954092681e-05, "loss": 0.3636, "step": 3778500 }, { "epoch": 25.572488090082288, "grad_norm": 0.36954954266548157, "learning_rate": 4.744275119099177e-05, "loss": 0.3622, "step": 3779000 }, { "epoch": 25.575871589432655, "grad_norm": 0.38198724389076233, "learning_rate": 4.744241284105674e-05, "loss": 0.3632, "step": 3779500 }, { "epoch": 25.57925508878302, "grad_norm": 0.3659634590148926, "learning_rate": 4.7442074491121703e-05, "loss": 0.362, "step": 3780000 }, { "epoch": 25.582638588133392, "grad_norm": 0.3782658576965332, "learning_rate": 4.7441736141186666e-05, "loss": 0.3611, "step": 3780500 }, { "epoch": 25.58602208748376, "grad_norm": 0.35388150811195374, "learning_rate": 4.744139779125162e-05, "loss": 0.3611, "step": 3781000 }, { "epoch": 25.58940558683413, "grad_norm": 0.3796742558479309, "learning_rate": 4.744105944131659e-05, "loss": 0.3626, "step": 3781500 }, { "epoch": 25.592789086184496, "grad_norm": 0.36054444313049316, "learning_rate": 4.744072109138155e-05, "loss": 0.3612, "step": 3782000 }, { "epoch": 25.596172585534863, "grad_norm": 0.342253714799881, "learning_rate": 4.7440382741446514e-05, "loss": 0.3623, "step": 3782500 }, { "epoch": 25.599556084885233, "grad_norm": 0.35453668236732483, "learning_rate": 4.7440044391511476e-05, "loss": 0.3619, "step": 3783000 }, { "epoch": 25.6029395842356, "grad_norm": 0.37217822670936584, "learning_rate": 4.7439706041576445e-05, "loss": 0.3622, "step": 3783500 }, { "epoch": 25.606323083585966, "grad_norm": 0.3663980960845947, "learning_rate": 4.743936769164141e-05, "loss": 0.3633, "step": 3784000 }, { "epoch": 25.609706582936337, "grad_norm": 0.4085804522037506, "learning_rate": 4.743902934170637e-05, "loss": 0.3623, "step": 3784500 }, { "epoch": 25.613090082286703, "grad_norm": 0.37254148721694946, "learning_rate": 4.743869099177133e-05, "loss": 0.3623, "step": 3785000 }, { "epoch": 25.616473581637074, "grad_norm": 0.370465487241745, "learning_rate": 4.74383526418363e-05, "loss": 0.3619, "step": 3785500 }, { "epoch": 25.61985708098744, "grad_norm": 0.3487185835838318, "learning_rate": 4.7438014291901256e-05, "loss": 0.3619, "step": 3786000 }, { "epoch": 25.623240580337807, "grad_norm": 0.3992827534675598, "learning_rate": 4.743767594196622e-05, "loss": 0.3615, "step": 3786500 }, { "epoch": 25.626624079688177, "grad_norm": 0.35768917202949524, "learning_rate": 4.743733759203119e-05, "loss": 0.362, "step": 3787000 }, { "epoch": 25.630007579038544, "grad_norm": 0.3796229660511017, "learning_rate": 4.743699924209615e-05, "loss": 0.3625, "step": 3787500 }, { "epoch": 25.633391078388915, "grad_norm": 0.38146036863327026, "learning_rate": 4.743666089216111e-05, "loss": 0.3638, "step": 3788000 }, { "epoch": 25.63677457773928, "grad_norm": 0.4110412001609802, "learning_rate": 4.743632254222607e-05, "loss": 0.3623, "step": 3788500 }, { "epoch": 25.640158077089648, "grad_norm": 0.33680954575538635, "learning_rate": 4.743598419229104e-05, "loss": 0.3647, "step": 3789000 }, { "epoch": 25.64354157644002, "grad_norm": 0.3926984369754791, "learning_rate": 4.7435645842356004e-05, "loss": 0.3615, "step": 3789500 }, { "epoch": 25.646925075790385, "grad_norm": 0.3979891836643219, "learning_rate": 4.7435307492420966e-05, "loss": 0.3634, "step": 3790000 }, { "epoch": 25.650308575140755, "grad_norm": 0.4162346124649048, "learning_rate": 4.743496914248592e-05, "loss": 0.3639, "step": 3790500 }, { "epoch": 25.653692074491122, "grad_norm": 0.36416104435920715, "learning_rate": 4.743463079255089e-05, "loss": 0.3641, "step": 3791000 }, { "epoch": 25.65707557384149, "grad_norm": 0.43271806836128235, "learning_rate": 4.743429244261585e-05, "loss": 0.3639, "step": 3791500 }, { "epoch": 25.66045907319186, "grad_norm": 0.3622170686721802, "learning_rate": 4.7433954092680815e-05, "loss": 0.3622, "step": 3792000 }, { "epoch": 25.663842572542226, "grad_norm": 0.4002637565135956, "learning_rate": 4.743361574274578e-05, "loss": 0.3626, "step": 3792500 }, { "epoch": 25.667226071892593, "grad_norm": 0.3458804190158844, "learning_rate": 4.7433277392810746e-05, "loss": 0.3599, "step": 3793000 }, { "epoch": 25.670609571242963, "grad_norm": 0.3337368071079254, "learning_rate": 4.743293904287571e-05, "loss": 0.3619, "step": 3793500 }, { "epoch": 25.67399307059333, "grad_norm": 0.3929619789123535, "learning_rate": 4.743260069294067e-05, "loss": 0.3635, "step": 3794000 }, { "epoch": 25.6773765699437, "grad_norm": 0.3558287024497986, "learning_rate": 4.743226234300563e-05, "loss": 0.363, "step": 3794500 }, { "epoch": 25.680760069294067, "grad_norm": 0.3677956461906433, "learning_rate": 4.74319239930706e-05, "loss": 0.3627, "step": 3795000 }, { "epoch": 25.684143568644433, "grad_norm": 0.3794611394405365, "learning_rate": 4.7431585643135556e-05, "loss": 0.3629, "step": 3795500 }, { "epoch": 25.687527067994804, "grad_norm": 0.35885268449783325, "learning_rate": 4.743124729320052e-05, "loss": 0.3621, "step": 3796000 }, { "epoch": 25.69091056734517, "grad_norm": 0.3457956910133362, "learning_rate": 4.743090894326549e-05, "loss": 0.363, "step": 3796500 }, { "epoch": 25.69429406669554, "grad_norm": 0.37334170937538147, "learning_rate": 4.743057059333045e-05, "loss": 0.3636, "step": 3797000 }, { "epoch": 25.697677566045908, "grad_norm": 0.36617588996887207, "learning_rate": 4.743023224339541e-05, "loss": 0.3635, "step": 3797500 }, { "epoch": 25.701061065396274, "grad_norm": 0.3778825104236603, "learning_rate": 4.7429893893460374e-05, "loss": 0.3626, "step": 3798000 }, { "epoch": 25.704444564746645, "grad_norm": 0.37063291668891907, "learning_rate": 4.7429555543525336e-05, "loss": 0.3632, "step": 3798500 }, { "epoch": 25.70782806409701, "grad_norm": 0.3541485369205475, "learning_rate": 4.7429217193590305e-05, "loss": 0.3629, "step": 3799000 }, { "epoch": 25.711211563447378, "grad_norm": 0.36199647188186646, "learning_rate": 4.742887884365527e-05, "loss": 0.3644, "step": 3799500 }, { "epoch": 25.71459506279775, "grad_norm": 0.36414459347724915, "learning_rate": 4.742854049372022e-05, "loss": 0.3637, "step": 3800000 }, { "epoch": 25.717978562148115, "grad_norm": 0.38215571641921997, "learning_rate": 4.742820214378519e-05, "loss": 0.3628, "step": 3800500 }, { "epoch": 25.721362061498485, "grad_norm": 0.3797191381454468, "learning_rate": 4.742786379385015e-05, "loss": 0.3637, "step": 3801000 }, { "epoch": 25.724745560848852, "grad_norm": 0.3910582959651947, "learning_rate": 4.7427525443915115e-05, "loss": 0.3635, "step": 3801500 }, { "epoch": 25.72812906019922, "grad_norm": 0.3790856599807739, "learning_rate": 4.742718709398008e-05, "loss": 0.3624, "step": 3802000 }, { "epoch": 25.73151255954959, "grad_norm": 0.36054325103759766, "learning_rate": 4.7426848744045046e-05, "loss": 0.3629, "step": 3802500 }, { "epoch": 25.734896058899956, "grad_norm": 0.354313462972641, "learning_rate": 4.742651039411001e-05, "loss": 0.3631, "step": 3803000 }, { "epoch": 25.738279558250326, "grad_norm": 0.37741386890411377, "learning_rate": 4.742617204417497e-05, "loss": 0.3633, "step": 3803500 }, { "epoch": 25.741663057600693, "grad_norm": 0.37754544615745544, "learning_rate": 4.742583369423993e-05, "loss": 0.3631, "step": 3804000 }, { "epoch": 25.74504655695106, "grad_norm": 0.34346863627433777, "learning_rate": 4.74254953443049e-05, "loss": 0.3625, "step": 3804500 }, { "epoch": 25.74843005630143, "grad_norm": 0.31208792328834534, "learning_rate": 4.742515699436986e-05, "loss": 0.3619, "step": 3805000 }, { "epoch": 25.751813555651797, "grad_norm": 0.380153089761734, "learning_rate": 4.742481864443482e-05, "loss": 0.3611, "step": 3805500 }, { "epoch": 25.755197055002164, "grad_norm": 0.3855472803115845, "learning_rate": 4.742448029449978e-05, "loss": 0.3629, "step": 3806000 }, { "epoch": 25.758580554352534, "grad_norm": 0.3726450502872467, "learning_rate": 4.742414194456475e-05, "loss": 0.3638, "step": 3806500 }, { "epoch": 25.7619640537029, "grad_norm": 0.3527815341949463, "learning_rate": 4.742380359462971e-05, "loss": 0.3629, "step": 3807000 }, { "epoch": 25.76534755305327, "grad_norm": 0.35626131296157837, "learning_rate": 4.7423465244694674e-05, "loss": 0.3646, "step": 3807500 }, { "epoch": 25.768731052403638, "grad_norm": 0.36378493905067444, "learning_rate": 4.7423126894759636e-05, "loss": 0.3626, "step": 3808000 }, { "epoch": 25.772114551754004, "grad_norm": 0.41029688715934753, "learning_rate": 4.7422788544824605e-05, "loss": 0.3629, "step": 3808500 }, { "epoch": 25.775498051104375, "grad_norm": 0.3284132778644562, "learning_rate": 4.742245019488957e-05, "loss": 0.3627, "step": 3809000 }, { "epoch": 25.77888155045474, "grad_norm": 0.32602161169052124, "learning_rate": 4.742211184495452e-05, "loss": 0.3626, "step": 3809500 }, { "epoch": 25.782265049805112, "grad_norm": 0.3718428611755371, "learning_rate": 4.742177349501949e-05, "loss": 0.3606, "step": 3810000 }, { "epoch": 25.78564854915548, "grad_norm": 0.35943469405174255, "learning_rate": 4.7421435145084454e-05, "loss": 0.3615, "step": 3810500 }, { "epoch": 25.789032048505845, "grad_norm": 0.38358232378959656, "learning_rate": 4.7421096795149416e-05, "loss": 0.3624, "step": 3811000 }, { "epoch": 25.792415547856216, "grad_norm": 0.37543123960494995, "learning_rate": 4.742075844521438e-05, "loss": 0.3608, "step": 3811500 }, { "epoch": 25.795799047206582, "grad_norm": 0.35307949781417847, "learning_rate": 4.742042009527935e-05, "loss": 0.3625, "step": 3812000 }, { "epoch": 25.799182546556953, "grad_norm": 0.3657030761241913, "learning_rate": 4.742008174534431e-05, "loss": 0.3624, "step": 3812500 }, { "epoch": 25.80256604590732, "grad_norm": 0.3477466404438019, "learning_rate": 4.741974339540927e-05, "loss": 0.3618, "step": 3813000 }, { "epoch": 25.805949545257686, "grad_norm": 0.4210405945777893, "learning_rate": 4.741940504547423e-05, "loss": 0.3631, "step": 3813500 }, { "epoch": 25.809333044608056, "grad_norm": 0.37172731757164, "learning_rate": 4.74190666955392e-05, "loss": 0.3628, "step": 3814000 }, { "epoch": 25.812716543958423, "grad_norm": 0.3678707480430603, "learning_rate": 4.741872834560416e-05, "loss": 0.3634, "step": 3814500 }, { "epoch": 25.816100043308793, "grad_norm": 0.317853182554245, "learning_rate": 4.741838999566912e-05, "loss": 0.3635, "step": 3815000 }, { "epoch": 25.81948354265916, "grad_norm": 0.34395352005958557, "learning_rate": 4.741805164573408e-05, "loss": 0.3632, "step": 3815500 }, { "epoch": 25.822867042009527, "grad_norm": 0.38165464997291565, "learning_rate": 4.741771329579905e-05, "loss": 0.3627, "step": 3816000 }, { "epoch": 25.826250541359897, "grad_norm": 0.35092389583587646, "learning_rate": 4.741737494586401e-05, "loss": 0.3602, "step": 3816500 }, { "epoch": 25.829634040710264, "grad_norm": 0.3503788411617279, "learning_rate": 4.7417036595928975e-05, "loss": 0.3627, "step": 3817000 }, { "epoch": 25.83301754006063, "grad_norm": 0.3763909637928009, "learning_rate": 4.741669824599394e-05, "loss": 0.3617, "step": 3817500 }, { "epoch": 25.836401039411, "grad_norm": 0.3493872284889221, "learning_rate": 4.7416359896058906e-05, "loss": 0.362, "step": 3818000 }, { "epoch": 25.839784538761368, "grad_norm": 0.351111501455307, "learning_rate": 4.741602154612387e-05, "loss": 0.3618, "step": 3818500 }, { "epoch": 25.843168038111738, "grad_norm": 0.35869088768959045, "learning_rate": 4.7415683196188823e-05, "loss": 0.3618, "step": 3819000 }, { "epoch": 25.846551537462105, "grad_norm": 0.3645501136779785, "learning_rate": 4.741534484625379e-05, "loss": 0.3639, "step": 3819500 }, { "epoch": 25.84993503681247, "grad_norm": 0.352455198764801, "learning_rate": 4.7415006496318754e-05, "loss": 0.3605, "step": 3820000 }, { "epoch": 25.853318536162842, "grad_norm": 0.36805835366249084, "learning_rate": 4.7414668146383717e-05, "loss": 0.3611, "step": 3820500 }, { "epoch": 25.85670203551321, "grad_norm": 0.376402348279953, "learning_rate": 4.741432979644868e-05, "loss": 0.3631, "step": 3821000 }, { "epoch": 25.86008553486358, "grad_norm": 0.3744443953037262, "learning_rate": 4.741399144651365e-05, "loss": 0.3627, "step": 3821500 }, { "epoch": 25.863469034213946, "grad_norm": 0.3914426267147064, "learning_rate": 4.741365309657861e-05, "loss": 0.3617, "step": 3822000 }, { "epoch": 25.866852533564312, "grad_norm": 0.3761354684829712, "learning_rate": 4.741331474664357e-05, "loss": 0.3613, "step": 3822500 }, { "epoch": 25.870236032914683, "grad_norm": 0.3577970266342163, "learning_rate": 4.7412976396708534e-05, "loss": 0.3637, "step": 3823000 }, { "epoch": 25.87361953226505, "grad_norm": 0.37655699253082275, "learning_rate": 4.74126380467735e-05, "loss": 0.3649, "step": 3823500 }, { "epoch": 25.877003031615416, "grad_norm": 0.4279420077800751, "learning_rate": 4.741229969683846e-05, "loss": 0.3618, "step": 3824000 }, { "epoch": 25.880386530965787, "grad_norm": 0.36734089255332947, "learning_rate": 4.741196134690342e-05, "loss": 0.3634, "step": 3824500 }, { "epoch": 25.883770030316153, "grad_norm": 0.3726929724216461, "learning_rate": 4.741162299696838e-05, "loss": 0.3638, "step": 3825000 }, { "epoch": 25.887153529666524, "grad_norm": 0.36211109161376953, "learning_rate": 4.741128464703335e-05, "loss": 0.3636, "step": 3825500 }, { "epoch": 25.89053702901689, "grad_norm": 0.3322620689868927, "learning_rate": 4.7410946297098313e-05, "loss": 0.3619, "step": 3826000 }, { "epoch": 25.893920528367257, "grad_norm": 0.39214855432510376, "learning_rate": 4.7410607947163276e-05, "loss": 0.3617, "step": 3826500 }, { "epoch": 25.897304027717627, "grad_norm": 0.3484244644641876, "learning_rate": 4.741026959722824e-05, "loss": 0.3616, "step": 3827000 }, { "epoch": 25.900687527067994, "grad_norm": 0.37596139311790466, "learning_rate": 4.740993124729321e-05, "loss": 0.3646, "step": 3827500 }, { "epoch": 25.904071026418364, "grad_norm": 0.4313521683216095, "learning_rate": 4.740959289735817e-05, "loss": 0.3627, "step": 3828000 }, { "epoch": 25.90745452576873, "grad_norm": 0.37457355856895447, "learning_rate": 4.7409254547423124e-05, "loss": 0.3625, "step": 3828500 }, { "epoch": 25.910838025119098, "grad_norm": 0.33323827385902405, "learning_rate": 4.740891619748809e-05, "loss": 0.3628, "step": 3829000 }, { "epoch": 25.91422152446947, "grad_norm": 0.3745093047618866, "learning_rate": 4.7408577847553055e-05, "loss": 0.362, "step": 3829500 }, { "epoch": 25.917605023819835, "grad_norm": 0.35997912287712097, "learning_rate": 4.740823949761802e-05, "loss": 0.3629, "step": 3830000 }, { "epoch": 25.9209885231702, "grad_norm": 0.34573546051979065, "learning_rate": 4.740790114768298e-05, "loss": 0.3627, "step": 3830500 }, { "epoch": 25.924372022520572, "grad_norm": 0.3673235774040222, "learning_rate": 4.740756279774795e-05, "loss": 0.363, "step": 3831000 }, { "epoch": 25.92775552187094, "grad_norm": 0.39050784707069397, "learning_rate": 4.740722444781291e-05, "loss": 0.3618, "step": 3831500 }, { "epoch": 25.93113902122131, "grad_norm": 0.3835177719593048, "learning_rate": 4.740688609787787e-05, "loss": 0.364, "step": 3832000 }, { "epoch": 25.934522520571676, "grad_norm": 0.39260244369506836, "learning_rate": 4.7406547747942835e-05, "loss": 0.3645, "step": 3832500 }, { "epoch": 25.937906019922043, "grad_norm": 0.38124096393585205, "learning_rate": 4.7406209398007804e-05, "loss": 0.3639, "step": 3833000 }, { "epoch": 25.941289519272413, "grad_norm": 0.3928065598011017, "learning_rate": 4.740587104807276e-05, "loss": 0.3615, "step": 3833500 }, { "epoch": 25.94467301862278, "grad_norm": 0.3781130611896515, "learning_rate": 4.740553269813772e-05, "loss": 0.362, "step": 3834000 }, { "epoch": 25.94805651797315, "grad_norm": 0.34431740641593933, "learning_rate": 4.740519434820268e-05, "loss": 0.3618, "step": 3834500 }, { "epoch": 25.951440017323517, "grad_norm": 0.3865680694580078, "learning_rate": 4.740485599826765e-05, "loss": 0.3628, "step": 3835000 }, { "epoch": 25.954823516673883, "grad_norm": 0.3760943114757538, "learning_rate": 4.7404517648332614e-05, "loss": 0.3633, "step": 3835500 }, { "epoch": 25.958207016024254, "grad_norm": 0.33652690052986145, "learning_rate": 4.7404179298397576e-05, "loss": 0.3612, "step": 3836000 }, { "epoch": 25.96159051537462, "grad_norm": 0.3407208323478699, "learning_rate": 4.740384094846254e-05, "loss": 0.3611, "step": 3836500 }, { "epoch": 25.96497401472499, "grad_norm": 0.3594238758087158, "learning_rate": 4.740350259852751e-05, "loss": 0.3638, "step": 3837000 }, { "epoch": 25.968357514075358, "grad_norm": 0.3876775801181793, "learning_rate": 4.740316424859247e-05, "loss": 0.3644, "step": 3837500 }, { "epoch": 25.971741013425724, "grad_norm": 0.37705734372138977, "learning_rate": 4.7402825898657425e-05, "loss": 0.3629, "step": 3838000 }, { "epoch": 25.975124512776095, "grad_norm": 0.3409097194671631, "learning_rate": 4.7402487548722394e-05, "loss": 0.3624, "step": 3838500 }, { "epoch": 25.97850801212646, "grad_norm": 0.37340086698532104, "learning_rate": 4.7402149198787356e-05, "loss": 0.3636, "step": 3839000 }, { "epoch": 25.98189151147683, "grad_norm": 0.382447749376297, "learning_rate": 4.740181084885232e-05, "loss": 0.3641, "step": 3839500 }, { "epoch": 25.9852750108272, "grad_norm": 0.38271307945251465, "learning_rate": 4.740147249891728e-05, "loss": 0.3629, "step": 3840000 }, { "epoch": 25.988658510177565, "grad_norm": 0.3467562198638916, "learning_rate": 4.740113414898225e-05, "loss": 0.3643, "step": 3840500 }, { "epoch": 25.992042009527935, "grad_norm": 0.3526530861854553, "learning_rate": 4.740079579904721e-05, "loss": 0.3609, "step": 3841000 }, { "epoch": 25.995425508878302, "grad_norm": 0.37061673402786255, "learning_rate": 4.740045744911217e-05, "loss": 0.3629, "step": 3841500 }, { "epoch": 25.99880900822867, "grad_norm": 0.3723934590816498, "learning_rate": 4.7400119099177135e-05, "loss": 0.3633, "step": 3842000 }, { "epoch": 26.0, "eval_accuracy": 0.8616330661467587, "eval_loss": 0.5619011521339417, "eval_runtime": 3404.9323, "eval_samples_per_second": 85.389, "eval_steps_per_second": 5.337, "step": 3842176 }, { "epoch": 26.00219250757904, "grad_norm": 0.370853066444397, "learning_rate": 4.7399780749242104e-05, "loss": 0.3604, "step": 3842500 }, { "epoch": 26.005576006929406, "grad_norm": 0.4079779088497162, "learning_rate": 4.739944239930706e-05, "loss": 0.3601, "step": 3843000 }, { "epoch": 26.008959506279776, "grad_norm": 0.3787406086921692, "learning_rate": 4.739910404937202e-05, "loss": 0.3595, "step": 3843500 }, { "epoch": 26.012343005630143, "grad_norm": 0.37509703636169434, "learning_rate": 4.7398765699436984e-05, "loss": 0.3616, "step": 3844000 }, { "epoch": 26.01572650498051, "grad_norm": 0.4073735773563385, "learning_rate": 4.739842734950195e-05, "loss": 0.3601, "step": 3844500 }, { "epoch": 26.01911000433088, "grad_norm": 0.3762528598308563, "learning_rate": 4.7398088999566915e-05, "loss": 0.3603, "step": 3845000 }, { "epoch": 26.022493503681247, "grad_norm": 0.3731355667114258, "learning_rate": 4.739775064963188e-05, "loss": 0.3625, "step": 3845500 }, { "epoch": 26.025877003031617, "grad_norm": 0.37788233160972595, "learning_rate": 4.739741229969684e-05, "loss": 0.3634, "step": 3846000 }, { "epoch": 26.029260502381984, "grad_norm": 0.38412338495254517, "learning_rate": 4.739707394976181e-05, "loss": 0.3608, "step": 3846500 }, { "epoch": 26.03264400173235, "grad_norm": 0.3652576208114624, "learning_rate": 4.739673559982677e-05, "loss": 0.3609, "step": 3847000 }, { "epoch": 26.03602750108272, "grad_norm": 0.35933735966682434, "learning_rate": 4.739639724989173e-05, "loss": 0.3595, "step": 3847500 }, { "epoch": 26.039411000433088, "grad_norm": 0.3820779025554657, "learning_rate": 4.7396058899956694e-05, "loss": 0.3598, "step": 3848000 }, { "epoch": 26.042794499783454, "grad_norm": 0.4067803621292114, "learning_rate": 4.7395720550021656e-05, "loss": 0.3618, "step": 3848500 }, { "epoch": 26.046177999133825, "grad_norm": 0.359842449426651, "learning_rate": 4.739538220008662e-05, "loss": 0.3614, "step": 3849000 }, { "epoch": 26.04956149848419, "grad_norm": 0.36242035031318665, "learning_rate": 4.739504385015158e-05, "loss": 0.3624, "step": 3849500 }, { "epoch": 26.05294499783456, "grad_norm": 0.39534276723861694, "learning_rate": 4.739470550021655e-05, "loss": 0.3602, "step": 3850000 }, { "epoch": 26.05632849718493, "grad_norm": 0.34640955924987793, "learning_rate": 4.739436715028151e-05, "loss": 0.3612, "step": 3850500 }, { "epoch": 26.059711996535295, "grad_norm": 0.4061083495616913, "learning_rate": 4.7394028800346474e-05, "loss": 0.3606, "step": 3851000 }, { "epoch": 26.063095495885666, "grad_norm": 0.38183504343032837, "learning_rate": 4.7393690450411436e-05, "loss": 0.3616, "step": 3851500 }, { "epoch": 26.066478995236032, "grad_norm": 0.37067651748657227, "learning_rate": 4.73933521004764e-05, "loss": 0.3617, "step": 3852000 }, { "epoch": 26.069862494586403, "grad_norm": 0.34321513772010803, "learning_rate": 4.739301375054136e-05, "loss": 0.3604, "step": 3852500 }, { "epoch": 26.07324599393677, "grad_norm": 0.38992804288864136, "learning_rate": 4.739267540060632e-05, "loss": 0.3613, "step": 3853000 }, { "epoch": 26.076629493287136, "grad_norm": 0.3853125274181366, "learning_rate": 4.7392337050671284e-05, "loss": 0.3617, "step": 3853500 }, { "epoch": 26.080012992637506, "grad_norm": 0.35063523054122925, "learning_rate": 4.739199870073625e-05, "loss": 0.3632, "step": 3854000 }, { "epoch": 26.083396491987873, "grad_norm": 0.3972536623477936, "learning_rate": 4.7391660350801215e-05, "loss": 0.3593, "step": 3854500 }, { "epoch": 26.08677999133824, "grad_norm": 0.3626726269721985, "learning_rate": 4.739132200086618e-05, "loss": 0.3624, "step": 3855000 }, { "epoch": 26.09016349068861, "grad_norm": 0.3531181514263153, "learning_rate": 4.739098365093114e-05, "loss": 0.36, "step": 3855500 }, { "epoch": 26.093546990038977, "grad_norm": 0.3698676824569702, "learning_rate": 4.739064530099611e-05, "loss": 0.3635, "step": 3856000 }, { "epoch": 26.096930489389347, "grad_norm": 0.38867586851119995, "learning_rate": 4.739030695106107e-05, "loss": 0.3624, "step": 3856500 }, { "epoch": 26.100313988739714, "grad_norm": 0.36070555448532104, "learning_rate": 4.738996860112603e-05, "loss": 0.361, "step": 3857000 }, { "epoch": 26.10369748809008, "grad_norm": 0.3562554121017456, "learning_rate": 4.7389630251190995e-05, "loss": 0.3611, "step": 3857500 }, { "epoch": 26.10708098744045, "grad_norm": 0.3982466757297516, "learning_rate": 4.738929190125596e-05, "loss": 0.3601, "step": 3858000 }, { "epoch": 26.110464486790818, "grad_norm": 0.4041410982608795, "learning_rate": 4.738895355132092e-05, "loss": 0.362, "step": 3858500 }, { "epoch": 26.113847986141188, "grad_norm": 0.39795050024986267, "learning_rate": 4.738861520138588e-05, "loss": 0.3618, "step": 3859000 }, { "epoch": 26.117231485491555, "grad_norm": 0.3832455575466156, "learning_rate": 4.738827685145085e-05, "loss": 0.3621, "step": 3859500 }, { "epoch": 26.12061498484192, "grad_norm": 0.3504578173160553, "learning_rate": 4.738793850151581e-05, "loss": 0.3623, "step": 3860000 }, { "epoch": 26.123998484192292, "grad_norm": 0.37978294491767883, "learning_rate": 4.7387600151580774e-05, "loss": 0.3608, "step": 3860500 }, { "epoch": 26.12738198354266, "grad_norm": 0.34524473547935486, "learning_rate": 4.7387261801645737e-05, "loss": 0.3595, "step": 3861000 }, { "epoch": 26.13076548289303, "grad_norm": 0.33990371227264404, "learning_rate": 4.73869234517107e-05, "loss": 0.361, "step": 3861500 }, { "epoch": 26.134148982243396, "grad_norm": 0.41926467418670654, "learning_rate": 4.738658510177566e-05, "loss": 0.3609, "step": 3862000 }, { "epoch": 26.137532481593762, "grad_norm": 0.3623351752758026, "learning_rate": 4.738624675184062e-05, "loss": 0.362, "step": 3862500 }, { "epoch": 26.140915980944133, "grad_norm": 0.36266013979911804, "learning_rate": 4.7385908401905585e-05, "loss": 0.3615, "step": 3863000 }, { "epoch": 26.1442994802945, "grad_norm": 0.3743663430213928, "learning_rate": 4.7385570051970554e-05, "loss": 0.3624, "step": 3863500 }, { "epoch": 26.147682979644866, "grad_norm": 0.3882286846637726, "learning_rate": 4.7385231702035516e-05, "loss": 0.3625, "step": 3864000 }, { "epoch": 26.151066478995237, "grad_norm": 0.3620988130569458, "learning_rate": 4.738489335210048e-05, "loss": 0.3612, "step": 3864500 }, { "epoch": 26.154449978345603, "grad_norm": 0.4201053977012634, "learning_rate": 4.738455500216544e-05, "loss": 0.3618, "step": 3865000 }, { "epoch": 26.157833477695974, "grad_norm": 0.3706298768520355, "learning_rate": 4.738421665223041e-05, "loss": 0.3606, "step": 3865500 }, { "epoch": 26.16121697704634, "grad_norm": 0.36147424578666687, "learning_rate": 4.738387830229537e-05, "loss": 0.3631, "step": 3866000 }, { "epoch": 26.164600476396707, "grad_norm": 0.3444823920726776, "learning_rate": 4.7383539952360333e-05, "loss": 0.3614, "step": 3866500 }, { "epoch": 26.167983975747077, "grad_norm": 0.38151130080223083, "learning_rate": 4.7383201602425296e-05, "loss": 0.3617, "step": 3867000 }, { "epoch": 26.171367475097444, "grad_norm": 0.3470594584941864, "learning_rate": 4.738286325249026e-05, "loss": 0.3624, "step": 3867500 }, { "epoch": 26.174750974447814, "grad_norm": 0.3673975467681885, "learning_rate": 4.738252490255522e-05, "loss": 0.362, "step": 3868000 }, { "epoch": 26.17813447379818, "grad_norm": 0.34868329763412476, "learning_rate": 4.738218655262018e-05, "loss": 0.3622, "step": 3868500 }, { "epoch": 26.181517973148548, "grad_norm": 0.3584482967853546, "learning_rate": 4.7381848202685144e-05, "loss": 0.3614, "step": 3869000 }, { "epoch": 26.184901472498918, "grad_norm": 0.3847140073776245, "learning_rate": 4.738150985275011e-05, "loss": 0.3619, "step": 3869500 }, { "epoch": 26.188284971849285, "grad_norm": 0.36739906668663025, "learning_rate": 4.7381171502815075e-05, "loss": 0.3624, "step": 3870000 }, { "epoch": 26.191668471199655, "grad_norm": 0.36381658911705017, "learning_rate": 4.738083315288004e-05, "loss": 0.3618, "step": 3870500 }, { "epoch": 26.195051970550022, "grad_norm": 0.37314146757125854, "learning_rate": 4.7380494802945e-05, "loss": 0.3605, "step": 3871000 }, { "epoch": 26.19843546990039, "grad_norm": 0.4022788405418396, "learning_rate": 4.738015645300996e-05, "loss": 0.362, "step": 3871500 }, { "epoch": 26.20181896925076, "grad_norm": 0.3680167496204376, "learning_rate": 4.7379818103074924e-05, "loss": 0.3626, "step": 3872000 }, { "epoch": 26.205202468601126, "grad_norm": 0.35963067412376404, "learning_rate": 4.7379479753139886e-05, "loss": 0.3628, "step": 3872500 }, { "epoch": 26.208585967951493, "grad_norm": 0.36652347445487976, "learning_rate": 4.7379141403204855e-05, "loss": 0.3615, "step": 3873000 }, { "epoch": 26.211969467301863, "grad_norm": 0.3470577001571655, "learning_rate": 4.737880305326982e-05, "loss": 0.362, "step": 3873500 }, { "epoch": 26.21535296665223, "grad_norm": 0.37560033798217773, "learning_rate": 4.737846470333478e-05, "loss": 0.3623, "step": 3874000 }, { "epoch": 26.2187364660026, "grad_norm": 0.3894418179988861, "learning_rate": 4.737812635339974e-05, "loss": 0.36, "step": 3874500 }, { "epoch": 26.222119965352967, "grad_norm": 0.3860340118408203, "learning_rate": 4.737778800346471e-05, "loss": 0.3614, "step": 3875000 }, { "epoch": 26.225503464703333, "grad_norm": 0.44323745369911194, "learning_rate": 4.737744965352967e-05, "loss": 0.361, "step": 3875500 }, { "epoch": 26.228886964053704, "grad_norm": 0.3986993432044983, "learning_rate": 4.7377111303594634e-05, "loss": 0.3612, "step": 3876000 }, { "epoch": 26.23227046340407, "grad_norm": 0.3293212056159973, "learning_rate": 4.7376772953659596e-05, "loss": 0.362, "step": 3876500 }, { "epoch": 26.23565396275444, "grad_norm": 0.4050101935863495, "learning_rate": 4.737643460372456e-05, "loss": 0.3604, "step": 3877000 }, { "epoch": 26.239037462104807, "grad_norm": 0.3691443204879761, "learning_rate": 4.737609625378952e-05, "loss": 0.3614, "step": 3877500 }, { "epoch": 26.242420961455174, "grad_norm": 0.3770415186882019, "learning_rate": 4.737575790385448e-05, "loss": 0.3616, "step": 3878000 }, { "epoch": 26.245804460805545, "grad_norm": 0.3994768559932709, "learning_rate": 4.7375419553919445e-05, "loss": 0.3615, "step": 3878500 }, { "epoch": 26.24918796015591, "grad_norm": 0.35937219858169556, "learning_rate": 4.7375081203984414e-05, "loss": 0.3621, "step": 3879000 }, { "epoch": 26.252571459506278, "grad_norm": 0.3573669195175171, "learning_rate": 4.7374742854049376e-05, "loss": 0.3617, "step": 3879500 }, { "epoch": 26.25595495885665, "grad_norm": 0.34345391392707825, "learning_rate": 4.737440450411434e-05, "loss": 0.362, "step": 3880000 }, { "epoch": 26.259338458207015, "grad_norm": 0.36527591943740845, "learning_rate": 4.73740661541793e-05, "loss": 0.3612, "step": 3880500 }, { "epoch": 26.262721957557385, "grad_norm": 0.34206730127334595, "learning_rate": 4.737372780424426e-05, "loss": 0.3603, "step": 3881000 }, { "epoch": 26.266105456907752, "grad_norm": 0.3721388578414917, "learning_rate": 4.7373389454309224e-05, "loss": 0.3597, "step": 3881500 }, { "epoch": 26.26948895625812, "grad_norm": 0.36627134680747986, "learning_rate": 4.7373051104374186e-05, "loss": 0.3606, "step": 3882000 }, { "epoch": 26.27287245560849, "grad_norm": 0.39107099175453186, "learning_rate": 4.7372712754439155e-05, "loss": 0.3623, "step": 3882500 }, { "epoch": 26.276255954958856, "grad_norm": 0.3856317400932312, "learning_rate": 4.737237440450412e-05, "loss": 0.3627, "step": 3883000 }, { "epoch": 26.279639454309226, "grad_norm": 0.36741721630096436, "learning_rate": 4.737203605456908e-05, "loss": 0.362, "step": 3883500 }, { "epoch": 26.283022953659593, "grad_norm": 0.387513130903244, "learning_rate": 4.737169770463404e-05, "loss": 0.363, "step": 3884000 }, { "epoch": 26.28640645300996, "grad_norm": 0.3745093047618866, "learning_rate": 4.737135935469901e-05, "loss": 0.3615, "step": 3884500 }, { "epoch": 26.28978995236033, "grad_norm": 0.3810684382915497, "learning_rate": 4.737102100476397e-05, "loss": 0.3617, "step": 3885000 }, { "epoch": 26.293173451710697, "grad_norm": 0.3833572566509247, "learning_rate": 4.7370682654828935e-05, "loss": 0.3615, "step": 3885500 }, { "epoch": 26.296556951061067, "grad_norm": 0.3960288465023041, "learning_rate": 4.737034430489389e-05, "loss": 0.3624, "step": 3886000 }, { "epoch": 26.299940450411434, "grad_norm": 0.3764936625957489, "learning_rate": 4.737000595495886e-05, "loss": 0.3623, "step": 3886500 }, { "epoch": 26.3033239497618, "grad_norm": 0.388413667678833, "learning_rate": 4.736966760502382e-05, "loss": 0.3618, "step": 3887000 }, { "epoch": 26.30670744911217, "grad_norm": 0.3840676546096802, "learning_rate": 4.736932925508878e-05, "loss": 0.361, "step": 3887500 }, { "epoch": 26.310090948462538, "grad_norm": 0.39145728945732117, "learning_rate": 4.7368990905153745e-05, "loss": 0.3622, "step": 3888000 }, { "epoch": 26.313474447812904, "grad_norm": 0.37844544649124146, "learning_rate": 4.7368652555218714e-05, "loss": 0.363, "step": 3888500 }, { "epoch": 26.316857947163275, "grad_norm": 0.41900765895843506, "learning_rate": 4.7368314205283676e-05, "loss": 0.3615, "step": 3889000 }, { "epoch": 26.32024144651364, "grad_norm": 0.362606406211853, "learning_rate": 4.736797585534864e-05, "loss": 0.3619, "step": 3889500 }, { "epoch": 26.32362494586401, "grad_norm": 0.3716253340244293, "learning_rate": 4.73676375054136e-05, "loss": 0.3608, "step": 3890000 }, { "epoch": 26.32700844521438, "grad_norm": 0.38629764318466187, "learning_rate": 4.736729915547856e-05, "loss": 0.3622, "step": 3890500 }, { "epoch": 26.330391944564745, "grad_norm": 0.3564845323562622, "learning_rate": 4.7366960805543525e-05, "loss": 0.3631, "step": 3891000 }, { "epoch": 26.333775443915115, "grad_norm": 0.3664126992225647, "learning_rate": 4.736662245560849e-05, "loss": 0.362, "step": 3891500 }, { "epoch": 26.337158943265482, "grad_norm": 0.3530741333961487, "learning_rate": 4.7366284105673456e-05, "loss": 0.3626, "step": 3892000 }, { "epoch": 26.340542442615853, "grad_norm": 0.3932848274707794, "learning_rate": 4.736594575573842e-05, "loss": 0.3619, "step": 3892500 }, { "epoch": 26.34392594196622, "grad_norm": 0.35414615273475647, "learning_rate": 4.736560740580338e-05, "loss": 0.3613, "step": 3893000 }, { "epoch": 26.347309441316586, "grad_norm": 0.38742774724960327, "learning_rate": 4.736526905586834e-05, "loss": 0.364, "step": 3893500 }, { "epoch": 26.350692940666956, "grad_norm": 0.42660969495773315, "learning_rate": 4.736493070593331e-05, "loss": 0.3606, "step": 3894000 }, { "epoch": 26.354076440017323, "grad_norm": 0.37752971053123474, "learning_rate": 4.736459235599827e-05, "loss": 0.3618, "step": 3894500 }, { "epoch": 26.357459939367693, "grad_norm": 0.34589022397994995, "learning_rate": 4.7364254006063235e-05, "loss": 0.3612, "step": 3895000 }, { "epoch": 26.36084343871806, "grad_norm": 0.3234190344810486, "learning_rate": 4.736391565612819e-05, "loss": 0.3629, "step": 3895500 }, { "epoch": 26.364226938068427, "grad_norm": 0.36566781997680664, "learning_rate": 4.736357730619316e-05, "loss": 0.3615, "step": 3896000 }, { "epoch": 26.367610437418797, "grad_norm": 0.3908241093158722, "learning_rate": 4.736323895625812e-05, "loss": 0.3616, "step": 3896500 }, { "epoch": 26.370993936769164, "grad_norm": 0.3684042692184448, "learning_rate": 4.7362900606323084e-05, "loss": 0.3626, "step": 3897000 }, { "epoch": 26.37437743611953, "grad_norm": 0.35040467977523804, "learning_rate": 4.7362562256388046e-05, "loss": 0.3622, "step": 3897500 }, { "epoch": 26.3777609354699, "grad_norm": 0.3635048568248749, "learning_rate": 4.7362223906453015e-05, "loss": 0.3613, "step": 3898000 }, { "epoch": 26.381144434820268, "grad_norm": 0.38577190041542053, "learning_rate": 4.736188555651798e-05, "loss": 0.3629, "step": 3898500 }, { "epoch": 26.384527934170638, "grad_norm": 0.34817418456077576, "learning_rate": 4.736154720658294e-05, "loss": 0.362, "step": 3899000 }, { "epoch": 26.387911433521005, "grad_norm": 0.3552338778972626, "learning_rate": 4.73612088566479e-05, "loss": 0.3625, "step": 3899500 }, { "epoch": 26.39129493287137, "grad_norm": 0.38207247853279114, "learning_rate": 4.736087050671287e-05, "loss": 0.3622, "step": 3900000 }, { "epoch": 26.394678432221742, "grad_norm": 0.39675194025039673, "learning_rate": 4.7360532156777825e-05, "loss": 0.3628, "step": 3900500 }, { "epoch": 26.39806193157211, "grad_norm": 0.41346973180770874, "learning_rate": 4.736019380684279e-05, "loss": 0.3614, "step": 3901000 }, { "epoch": 26.40144543092248, "grad_norm": 0.36755767464637756, "learning_rate": 4.7359855456907756e-05, "loss": 0.3626, "step": 3901500 }, { "epoch": 26.404828930272846, "grad_norm": 0.39664730429649353, "learning_rate": 4.735951710697272e-05, "loss": 0.3623, "step": 3902000 }, { "epoch": 26.408212429623212, "grad_norm": 0.37295064330101013, "learning_rate": 4.735917875703768e-05, "loss": 0.3609, "step": 3902500 }, { "epoch": 26.411595928973583, "grad_norm": 0.37683048844337463, "learning_rate": 4.735884040710264e-05, "loss": 0.3619, "step": 3903000 }, { "epoch": 26.41497942832395, "grad_norm": 0.3602394461631775, "learning_rate": 4.735850205716761e-05, "loss": 0.3618, "step": 3903500 }, { "epoch": 26.418362927674316, "grad_norm": 0.4101329743862152, "learning_rate": 4.7358163707232574e-05, "loss": 0.362, "step": 3904000 }, { "epoch": 26.421746427024686, "grad_norm": 0.3093413710594177, "learning_rate": 4.7357825357297536e-05, "loss": 0.3616, "step": 3904500 }, { "epoch": 26.425129926375053, "grad_norm": 0.35834816098213196, "learning_rate": 4.735748700736249e-05, "loss": 0.3614, "step": 3905000 }, { "epoch": 26.428513425725424, "grad_norm": 0.36701199412345886, "learning_rate": 4.735714865742746e-05, "loss": 0.3628, "step": 3905500 }, { "epoch": 26.43189692507579, "grad_norm": 0.3996990919113159, "learning_rate": 4.735681030749242e-05, "loss": 0.3626, "step": 3906000 }, { "epoch": 26.435280424426157, "grad_norm": 0.34733450412750244, "learning_rate": 4.7356471957557384e-05, "loss": 0.3622, "step": 3906500 }, { "epoch": 26.438663923776527, "grad_norm": 0.37848150730133057, "learning_rate": 4.7356133607622347e-05, "loss": 0.3602, "step": 3907000 }, { "epoch": 26.442047423126894, "grad_norm": 0.35177531838417053, "learning_rate": 4.7355795257687315e-05, "loss": 0.3616, "step": 3907500 }, { "epoch": 26.445430922477264, "grad_norm": 0.3411475121974945, "learning_rate": 4.735545690775228e-05, "loss": 0.3629, "step": 3908000 }, { "epoch": 26.44881442182763, "grad_norm": 0.3760984539985657, "learning_rate": 4.735511855781724e-05, "loss": 0.3608, "step": 3908500 }, { "epoch": 26.452197921177998, "grad_norm": 0.3662256598472595, "learning_rate": 4.73547802078822e-05, "loss": 0.3627, "step": 3909000 }, { "epoch": 26.455581420528368, "grad_norm": 0.3671654462814331, "learning_rate": 4.735444185794717e-05, "loss": 0.3629, "step": 3909500 }, { "epoch": 26.458964919878735, "grad_norm": 0.38121622800827026, "learning_rate": 4.7354103508012126e-05, "loss": 0.3633, "step": 3910000 }, { "epoch": 26.462348419229105, "grad_norm": 0.3634631931781769, "learning_rate": 4.735376515807709e-05, "loss": 0.3627, "step": 3910500 }, { "epoch": 26.465731918579472, "grad_norm": 0.38152438402175903, "learning_rate": 4.735342680814206e-05, "loss": 0.3629, "step": 3911000 }, { "epoch": 26.46911541792984, "grad_norm": 0.36148032546043396, "learning_rate": 4.735308845820702e-05, "loss": 0.3616, "step": 3911500 }, { "epoch": 26.47249891728021, "grad_norm": 0.37832167744636536, "learning_rate": 4.735275010827198e-05, "loss": 0.3627, "step": 3912000 }, { "epoch": 26.475882416630576, "grad_norm": 0.3521499037742615, "learning_rate": 4.7352411758336943e-05, "loss": 0.3594, "step": 3912500 }, { "epoch": 26.479265915980942, "grad_norm": 0.32856032252311707, "learning_rate": 4.735207340840191e-05, "loss": 0.3638, "step": 3913000 }, { "epoch": 26.482649415331313, "grad_norm": 0.3490496873855591, "learning_rate": 4.7351735058466874e-05, "loss": 0.3626, "step": 3913500 }, { "epoch": 26.48603291468168, "grad_norm": 0.34898892045021057, "learning_rate": 4.7351396708531837e-05, "loss": 0.3616, "step": 3914000 }, { "epoch": 26.48941641403205, "grad_norm": 0.3551773428916931, "learning_rate": 4.735105835859679e-05, "loss": 0.3631, "step": 3914500 }, { "epoch": 26.492799913382417, "grad_norm": 0.3575814366340637, "learning_rate": 4.735072000866176e-05, "loss": 0.3616, "step": 3915000 }, { "epoch": 26.496183412732783, "grad_norm": 0.3804273009300232, "learning_rate": 4.735038165872672e-05, "loss": 0.3607, "step": 3915500 }, { "epoch": 26.499566912083154, "grad_norm": 0.42856428027153015, "learning_rate": 4.7350043308791685e-05, "loss": 0.3622, "step": 3916000 }, { "epoch": 26.50295041143352, "grad_norm": 0.40299901366233826, "learning_rate": 4.734970495885665e-05, "loss": 0.3621, "step": 3916500 }, { "epoch": 26.50633391078389, "grad_norm": 0.389115571975708, "learning_rate": 4.7349366608921616e-05, "loss": 0.3615, "step": 3917000 }, { "epoch": 26.509717410134257, "grad_norm": 0.3799566626548767, "learning_rate": 4.734902825898658e-05, "loss": 0.3627, "step": 3917500 }, { "epoch": 26.513100909484624, "grad_norm": 0.3813866376876831, "learning_rate": 4.734868990905154e-05, "loss": 0.3633, "step": 3918000 }, { "epoch": 26.516484408834994, "grad_norm": 0.36013519763946533, "learning_rate": 4.73483515591165e-05, "loss": 0.3629, "step": 3918500 }, { "epoch": 26.51986790818536, "grad_norm": 0.370895117521286, "learning_rate": 4.734801320918147e-05, "loss": 0.3617, "step": 3919000 }, { "epoch": 26.52325140753573, "grad_norm": 0.3173573315143585, "learning_rate": 4.734767485924643e-05, "loss": 0.3621, "step": 3919500 }, { "epoch": 26.5266349068861, "grad_norm": 0.3925241529941559, "learning_rate": 4.734733650931139e-05, "loss": 0.363, "step": 3920000 }, { "epoch": 26.530018406236465, "grad_norm": 0.386522114276886, "learning_rate": 4.734699815937636e-05, "loss": 0.3634, "step": 3920500 }, { "epoch": 26.533401905586835, "grad_norm": 0.37653228640556335, "learning_rate": 4.734665980944132e-05, "loss": 0.3632, "step": 3921000 }, { "epoch": 26.536785404937202, "grad_norm": 0.3695538640022278, "learning_rate": 4.734632145950628e-05, "loss": 0.3627, "step": 3921500 }, { "epoch": 26.54016890428757, "grad_norm": 0.34452754259109497, "learning_rate": 4.7345983109571244e-05, "loss": 0.3629, "step": 3922000 }, { "epoch": 26.54355240363794, "grad_norm": 0.4069274365901947, "learning_rate": 4.734564475963621e-05, "loss": 0.3617, "step": 3922500 }, { "epoch": 26.546935902988306, "grad_norm": 0.39067214727401733, "learning_rate": 4.7345306409701175e-05, "loss": 0.3609, "step": 3923000 }, { "epoch": 26.550319402338676, "grad_norm": 0.367840051651001, "learning_rate": 4.734496805976614e-05, "loss": 0.3628, "step": 3923500 }, { "epoch": 26.553702901689043, "grad_norm": 0.3256097137928009, "learning_rate": 4.734462970983109e-05, "loss": 0.3637, "step": 3924000 }, { "epoch": 26.55708640103941, "grad_norm": 0.33790019154548645, "learning_rate": 4.734429135989606e-05, "loss": 0.36, "step": 3924500 }, { "epoch": 26.56046990038978, "grad_norm": 0.38533130288124084, "learning_rate": 4.7343953009961024e-05, "loss": 0.3638, "step": 3925000 }, { "epoch": 26.563853399740147, "grad_norm": 0.36499324440956116, "learning_rate": 4.7343614660025986e-05, "loss": 0.3632, "step": 3925500 }, { "epoch": 26.567236899090517, "grad_norm": 0.3344828188419342, "learning_rate": 4.734327631009095e-05, "loss": 0.3609, "step": 3926000 }, { "epoch": 26.570620398440884, "grad_norm": 0.3918331265449524, "learning_rate": 4.734293796015592e-05, "loss": 0.3623, "step": 3926500 }, { "epoch": 26.57400389779125, "grad_norm": 0.35491883754730225, "learning_rate": 4.734259961022088e-05, "loss": 0.3611, "step": 3927000 }, { "epoch": 26.57738739714162, "grad_norm": 0.363515704870224, "learning_rate": 4.734226126028584e-05, "loss": 0.3621, "step": 3927500 }, { "epoch": 26.580770896491988, "grad_norm": 0.36305171251296997, "learning_rate": 4.73419229103508e-05, "loss": 0.362, "step": 3928000 }, { "epoch": 26.584154395842354, "grad_norm": 0.3938886821269989, "learning_rate": 4.734158456041577e-05, "loss": 0.363, "step": 3928500 }, { "epoch": 26.587537895192725, "grad_norm": 0.3645011782646179, "learning_rate": 4.734124621048073e-05, "loss": 0.363, "step": 3929000 }, { "epoch": 26.59092139454309, "grad_norm": 0.3929731845855713, "learning_rate": 4.734090786054569e-05, "loss": 0.3625, "step": 3929500 }, { "epoch": 26.59430489389346, "grad_norm": 0.3837154507637024, "learning_rate": 4.734056951061066e-05, "loss": 0.3621, "step": 3930000 }, { "epoch": 26.59768839324383, "grad_norm": 0.3836233615875244, "learning_rate": 4.734023116067562e-05, "loss": 0.3619, "step": 3930500 }, { "epoch": 26.601071892594195, "grad_norm": 0.41164737939834595, "learning_rate": 4.733989281074058e-05, "loss": 0.3627, "step": 3931000 }, { "epoch": 26.604455391944565, "grad_norm": 0.4338706135749817, "learning_rate": 4.7339554460805545e-05, "loss": 0.3629, "step": 3931500 }, { "epoch": 26.607838891294932, "grad_norm": 0.3807239234447479, "learning_rate": 4.733921611087051e-05, "loss": 0.3629, "step": 3932000 }, { "epoch": 26.611222390645302, "grad_norm": 0.36886560916900635, "learning_rate": 4.7338877760935476e-05, "loss": 0.3609, "step": 3932500 }, { "epoch": 26.61460588999567, "grad_norm": 0.35011523962020874, "learning_rate": 4.733853941100044e-05, "loss": 0.3626, "step": 3933000 }, { "epoch": 26.617989389346036, "grad_norm": 0.3590085208415985, "learning_rate": 4.733820106106539e-05, "loss": 0.3631, "step": 3933500 }, { "epoch": 26.621372888696406, "grad_norm": 0.39787015318870544, "learning_rate": 4.733786271113036e-05, "loss": 0.3612, "step": 3934000 }, { "epoch": 26.624756388046773, "grad_norm": 0.4101943075656891, "learning_rate": 4.7337524361195324e-05, "loss": 0.3616, "step": 3934500 }, { "epoch": 26.628139887397143, "grad_norm": 0.35002443194389343, "learning_rate": 4.7337186011260286e-05, "loss": 0.3624, "step": 3935000 }, { "epoch": 26.63152338674751, "grad_norm": 0.32612723112106323, "learning_rate": 4.733684766132525e-05, "loss": 0.3603, "step": 3935500 }, { "epoch": 26.634906886097877, "grad_norm": 0.39648929238319397, "learning_rate": 4.733650931139022e-05, "loss": 0.3626, "step": 3936000 }, { "epoch": 26.638290385448247, "grad_norm": 0.3798840045928955, "learning_rate": 4.733617096145518e-05, "loss": 0.3619, "step": 3936500 }, { "epoch": 26.641673884798614, "grad_norm": 0.3551783859729767, "learning_rate": 4.733583261152014e-05, "loss": 0.3609, "step": 3937000 }, { "epoch": 26.64505738414898, "grad_norm": 0.3756074011325836, "learning_rate": 4.7335494261585104e-05, "loss": 0.3626, "step": 3937500 }, { "epoch": 26.64844088349935, "grad_norm": 0.37941914796829224, "learning_rate": 4.733515591165007e-05, "loss": 0.3632, "step": 3938000 }, { "epoch": 26.651824382849718, "grad_norm": 0.3654654324054718, "learning_rate": 4.733481756171503e-05, "loss": 0.3623, "step": 3938500 }, { "epoch": 26.655207882200088, "grad_norm": 0.38069528341293335, "learning_rate": 4.733447921177999e-05, "loss": 0.3621, "step": 3939000 }, { "epoch": 26.658591381550455, "grad_norm": 0.3884636461734772, "learning_rate": 4.733414086184495e-05, "loss": 0.3628, "step": 3939500 }, { "epoch": 26.66197488090082, "grad_norm": 0.3615627586841583, "learning_rate": 4.733380251190992e-05, "loss": 0.3615, "step": 3940000 }, { "epoch": 26.66535838025119, "grad_norm": 0.37071868777275085, "learning_rate": 4.733346416197488e-05, "loss": 0.3627, "step": 3940500 }, { "epoch": 26.66874187960156, "grad_norm": 0.3769175410270691, "learning_rate": 4.7333125812039845e-05, "loss": 0.3634, "step": 3941000 }, { "epoch": 26.67212537895193, "grad_norm": 0.396505206823349, "learning_rate": 4.733278746210481e-05, "loss": 0.3629, "step": 3941500 }, { "epoch": 26.675508878302296, "grad_norm": 0.3847061097621918, "learning_rate": 4.7332449112169776e-05, "loss": 0.3616, "step": 3942000 }, { "epoch": 26.678892377652662, "grad_norm": 0.3411423861980438, "learning_rate": 4.733211076223474e-05, "loss": 0.3627, "step": 3942500 }, { "epoch": 26.682275877003033, "grad_norm": 0.36055833101272583, "learning_rate": 4.7331772412299694e-05, "loss": 0.3618, "step": 3943000 }, { "epoch": 26.6856593763534, "grad_norm": 0.31502601504325867, "learning_rate": 4.733143406236466e-05, "loss": 0.3628, "step": 3943500 }, { "epoch": 26.68904287570377, "grad_norm": 0.3988841772079468, "learning_rate": 4.7331095712429625e-05, "loss": 0.3632, "step": 3944000 }, { "epoch": 26.692426375054136, "grad_norm": 0.3657224476337433, "learning_rate": 4.733075736249459e-05, "loss": 0.362, "step": 3944500 }, { "epoch": 26.695809874404503, "grad_norm": 0.37842491269111633, "learning_rate": 4.733041901255955e-05, "loss": 0.3619, "step": 3945000 }, { "epoch": 26.699193373754873, "grad_norm": 0.321781188249588, "learning_rate": 4.733008066262452e-05, "loss": 0.3605, "step": 3945500 }, { "epoch": 26.70257687310524, "grad_norm": 0.4276789426803589, "learning_rate": 4.732974231268948e-05, "loss": 0.3617, "step": 3946000 }, { "epoch": 26.705960372455607, "grad_norm": 0.3677583336830139, "learning_rate": 4.732940396275444e-05, "loss": 0.362, "step": 3946500 }, { "epoch": 26.709343871805977, "grad_norm": 0.3639909625053406, "learning_rate": 4.7329065612819404e-05, "loss": 0.3617, "step": 3947000 }, { "epoch": 26.712727371156344, "grad_norm": 0.3415442407131195, "learning_rate": 4.732872726288437e-05, "loss": 0.3626, "step": 3947500 }, { "epoch": 26.716110870506714, "grad_norm": 0.3350616693496704, "learning_rate": 4.732838891294933e-05, "loss": 0.3624, "step": 3948000 }, { "epoch": 26.71949436985708, "grad_norm": 0.3643999695777893, "learning_rate": 4.732805056301429e-05, "loss": 0.362, "step": 3948500 }, { "epoch": 26.722877869207448, "grad_norm": 0.33907368779182434, "learning_rate": 4.732771221307925e-05, "loss": 0.3627, "step": 3949000 }, { "epoch": 26.726261368557818, "grad_norm": 0.41405385732650757, "learning_rate": 4.732737386314422e-05, "loss": 0.3626, "step": 3949500 }, { "epoch": 26.729644867908185, "grad_norm": 0.366876482963562, "learning_rate": 4.7327035513209184e-05, "loss": 0.3615, "step": 3950000 }, { "epoch": 26.733028367258555, "grad_norm": 0.3724476993083954, "learning_rate": 4.7326697163274146e-05, "loss": 0.3631, "step": 3950500 }, { "epoch": 26.736411866608922, "grad_norm": 0.37764909863471985, "learning_rate": 4.732635881333911e-05, "loss": 0.3625, "step": 3951000 }, { "epoch": 26.73979536595929, "grad_norm": 0.37439340353012085, "learning_rate": 4.732602046340408e-05, "loss": 0.3633, "step": 3951500 }, { "epoch": 26.74317886530966, "grad_norm": 0.36886876821517944, "learning_rate": 4.732568211346904e-05, "loss": 0.3635, "step": 3952000 }, { "epoch": 26.746562364660026, "grad_norm": 0.37583333253860474, "learning_rate": 4.7325343763533994e-05, "loss": 0.3625, "step": 3952500 }, { "epoch": 26.749945864010392, "grad_norm": 0.33810582756996155, "learning_rate": 4.732500541359896e-05, "loss": 0.3635, "step": 3953000 }, { "epoch": 26.753329363360763, "grad_norm": 0.35692256689071655, "learning_rate": 4.7324667063663925e-05, "loss": 0.3624, "step": 3953500 }, { "epoch": 26.75671286271113, "grad_norm": 0.33927008509635925, "learning_rate": 4.732432871372889e-05, "loss": 0.3632, "step": 3954000 }, { "epoch": 26.7600963620615, "grad_norm": 0.3944675028324127, "learning_rate": 4.732399036379385e-05, "loss": 0.3656, "step": 3954500 }, { "epoch": 26.763479861411867, "grad_norm": 0.35236796736717224, "learning_rate": 4.732365201385882e-05, "loss": 0.3611, "step": 3955000 }, { "epoch": 26.766863360762233, "grad_norm": 0.43203791975975037, "learning_rate": 4.732331366392378e-05, "loss": 0.362, "step": 3955500 }, { "epoch": 26.770246860112604, "grad_norm": 0.4389057457447052, "learning_rate": 4.732297531398874e-05, "loss": 0.362, "step": 3956000 }, { "epoch": 26.77363035946297, "grad_norm": 0.37879079580307007, "learning_rate": 4.7322636964053705e-05, "loss": 0.363, "step": 3956500 }, { "epoch": 26.77701385881334, "grad_norm": 0.35850194096565247, "learning_rate": 4.7322298614118674e-05, "loss": 0.3616, "step": 3957000 }, { "epoch": 26.780397358163707, "grad_norm": 0.35824859142303467, "learning_rate": 4.732196026418363e-05, "loss": 0.3617, "step": 3957500 }, { "epoch": 26.783780857514074, "grad_norm": 0.3858749270439148, "learning_rate": 4.732162191424859e-05, "loss": 0.3629, "step": 3958000 }, { "epoch": 26.787164356864444, "grad_norm": 0.3488672375679016, "learning_rate": 4.7321283564313553e-05, "loss": 0.3626, "step": 3958500 }, { "epoch": 26.79054785621481, "grad_norm": 0.39496272802352905, "learning_rate": 4.732094521437852e-05, "loss": 0.3633, "step": 3959000 }, { "epoch": 26.793931355565178, "grad_norm": 0.40459078550338745, "learning_rate": 4.7320606864443484e-05, "loss": 0.3614, "step": 3959500 }, { "epoch": 26.797314854915548, "grad_norm": 0.3908441364765167, "learning_rate": 4.7320268514508447e-05, "loss": 0.3618, "step": 3960000 }, { "epoch": 26.800698354265915, "grad_norm": 0.3586594760417938, "learning_rate": 4.731993016457341e-05, "loss": 0.3628, "step": 3960500 }, { "epoch": 26.804081853616285, "grad_norm": 0.3698421120643616, "learning_rate": 4.731959181463838e-05, "loss": 0.363, "step": 3961000 }, { "epoch": 26.807465352966652, "grad_norm": 0.3757856488227844, "learning_rate": 4.731925346470334e-05, "loss": 0.3625, "step": 3961500 }, { "epoch": 26.81084885231702, "grad_norm": 0.39045828580856323, "learning_rate": 4.73189151147683e-05, "loss": 0.361, "step": 3962000 }, { "epoch": 26.81423235166739, "grad_norm": 0.3710618019104004, "learning_rate": 4.7318576764833264e-05, "loss": 0.3612, "step": 3962500 }, { "epoch": 26.817615851017756, "grad_norm": 0.39672067761421204, "learning_rate": 4.7318238414898226e-05, "loss": 0.3633, "step": 3963000 }, { "epoch": 26.820999350368126, "grad_norm": 0.3813120424747467, "learning_rate": 4.731790006496319e-05, "loss": 0.3627, "step": 3963500 }, { "epoch": 26.824382849718493, "grad_norm": 0.3530002236366272, "learning_rate": 4.731756171502815e-05, "loss": 0.3638, "step": 3964000 }, { "epoch": 26.82776634906886, "grad_norm": 0.4095313847064972, "learning_rate": 4.731722336509312e-05, "loss": 0.3614, "step": 3964500 }, { "epoch": 26.83114984841923, "grad_norm": 0.3469836413860321, "learning_rate": 4.731688501515808e-05, "loss": 0.3623, "step": 3965000 }, { "epoch": 26.834533347769597, "grad_norm": 0.36245352029800415, "learning_rate": 4.7316546665223043e-05, "loss": 0.3616, "step": 3965500 }, { "epoch": 26.837916847119967, "grad_norm": 0.3662642240524292, "learning_rate": 4.7316208315288006e-05, "loss": 0.363, "step": 3966000 }, { "epoch": 26.841300346470334, "grad_norm": 0.37218987941741943, "learning_rate": 4.7315869965352975e-05, "loss": 0.3615, "step": 3966500 }, { "epoch": 26.8446838458207, "grad_norm": 0.3281077444553375, "learning_rate": 4.731553161541793e-05, "loss": 0.3622, "step": 3967000 }, { "epoch": 26.84806734517107, "grad_norm": 0.3949171006679535, "learning_rate": 4.731519326548289e-05, "loss": 0.3628, "step": 3967500 }, { "epoch": 26.851450844521437, "grad_norm": 0.4136095941066742, "learning_rate": 4.7314854915547854e-05, "loss": 0.3641, "step": 3968000 }, { "epoch": 26.854834343871808, "grad_norm": 0.33874234557151794, "learning_rate": 4.731451656561282e-05, "loss": 0.3622, "step": 3968500 }, { "epoch": 26.858217843222175, "grad_norm": 0.338762491941452, "learning_rate": 4.7314178215677785e-05, "loss": 0.3619, "step": 3969000 }, { "epoch": 26.86160134257254, "grad_norm": 0.3444060683250427, "learning_rate": 4.731383986574275e-05, "loss": 0.3635, "step": 3969500 }, { "epoch": 26.86498484192291, "grad_norm": 0.3588503301143646, "learning_rate": 4.731350151580771e-05, "loss": 0.3612, "step": 3970000 }, { "epoch": 26.86836834127328, "grad_norm": 0.37239402532577515, "learning_rate": 4.731316316587268e-05, "loss": 0.3631, "step": 3970500 }, { "epoch": 26.871751840623645, "grad_norm": 0.3820563852787018, "learning_rate": 4.731282481593764e-05, "loss": 0.364, "step": 3971000 }, { "epoch": 26.875135339974015, "grad_norm": 0.35125070810317993, "learning_rate": 4.73124864660026e-05, "loss": 0.3616, "step": 3971500 }, { "epoch": 26.878518839324382, "grad_norm": 0.36873704195022583, "learning_rate": 4.7312148116067565e-05, "loss": 0.364, "step": 3972000 }, { "epoch": 26.881902338674752, "grad_norm": 0.3618520498275757, "learning_rate": 4.731180976613253e-05, "loss": 0.3629, "step": 3972500 }, { "epoch": 26.88528583802512, "grad_norm": 0.3631375730037689, "learning_rate": 4.731147141619749e-05, "loss": 0.3624, "step": 3973000 }, { "epoch": 26.888669337375486, "grad_norm": 0.3646116256713867, "learning_rate": 4.731113306626245e-05, "loss": 0.3641, "step": 3973500 }, { "epoch": 26.892052836725856, "grad_norm": 0.35803481936454773, "learning_rate": 4.731079471632742e-05, "loss": 0.3615, "step": 3974000 }, { "epoch": 26.895436336076223, "grad_norm": 0.3725024461746216, "learning_rate": 4.731045636639238e-05, "loss": 0.3619, "step": 3974500 }, { "epoch": 26.898819835426593, "grad_norm": 0.36321380734443665, "learning_rate": 4.7310118016457344e-05, "loss": 0.3624, "step": 3975000 }, { "epoch": 26.90220333477696, "grad_norm": 0.33477428555488586, "learning_rate": 4.7309779666522306e-05, "loss": 0.3628, "step": 3975500 }, { "epoch": 26.905586834127327, "grad_norm": 0.3610873818397522, "learning_rate": 4.7309441316587275e-05, "loss": 0.3623, "step": 3976000 }, { "epoch": 26.908970333477697, "grad_norm": 0.3352227210998535, "learning_rate": 4.730910296665223e-05, "loss": 0.3631, "step": 3976500 }, { "epoch": 26.912353832828064, "grad_norm": 0.3139859437942505, "learning_rate": 4.730876461671719e-05, "loss": 0.3632, "step": 3977000 }, { "epoch": 26.91573733217843, "grad_norm": 0.36809584498405457, "learning_rate": 4.7308426266782155e-05, "loss": 0.3613, "step": 3977500 }, { "epoch": 26.9191208315288, "grad_norm": 0.3616427481174469, "learning_rate": 4.7308087916847124e-05, "loss": 0.3639, "step": 3978000 }, { "epoch": 26.922504330879168, "grad_norm": 0.35086822509765625, "learning_rate": 4.7307749566912086e-05, "loss": 0.3615, "step": 3978500 }, { "epoch": 26.925887830229538, "grad_norm": 0.3581278622150421, "learning_rate": 4.730741121697705e-05, "loss": 0.3626, "step": 3979000 }, { "epoch": 26.929271329579905, "grad_norm": 0.3818640410900116, "learning_rate": 4.730707286704201e-05, "loss": 0.3625, "step": 3979500 }, { "epoch": 26.93265482893027, "grad_norm": 0.3971289396286011, "learning_rate": 4.730673451710698e-05, "loss": 0.3623, "step": 3980000 }, { "epoch": 26.93603832828064, "grad_norm": 0.39175015687942505, "learning_rate": 4.730639616717194e-05, "loss": 0.3627, "step": 3980500 }, { "epoch": 26.93942182763101, "grad_norm": 0.40893542766571045, "learning_rate": 4.73060578172369e-05, "loss": 0.3633, "step": 3981000 }, { "epoch": 26.94280532698138, "grad_norm": 0.35594314336776733, "learning_rate": 4.7305719467301865e-05, "loss": 0.3627, "step": 3981500 }, { "epoch": 26.946188826331746, "grad_norm": 0.36786365509033203, "learning_rate": 4.730538111736683e-05, "loss": 0.3612, "step": 3982000 }, { "epoch": 26.949572325682112, "grad_norm": 0.3444536328315735, "learning_rate": 4.730504276743179e-05, "loss": 0.3622, "step": 3982500 }, { "epoch": 26.952955825032483, "grad_norm": 0.35013580322265625, "learning_rate": 4.730470441749675e-05, "loss": 0.3631, "step": 3983000 }, { "epoch": 26.95633932438285, "grad_norm": 0.36221638321876526, "learning_rate": 4.730436606756172e-05, "loss": 0.3624, "step": 3983500 }, { "epoch": 26.959722823733216, "grad_norm": 0.36968088150024414, "learning_rate": 4.730402771762668e-05, "loss": 0.3625, "step": 3984000 }, { "epoch": 26.963106323083586, "grad_norm": 0.3494577705860138, "learning_rate": 4.7303689367691645e-05, "loss": 0.3627, "step": 3984500 }, { "epoch": 26.966489822433953, "grad_norm": 0.3767252266407013, "learning_rate": 4.730335101775661e-05, "loss": 0.3632, "step": 3985000 }, { "epoch": 26.969873321784323, "grad_norm": 0.36130470037460327, "learning_rate": 4.730301266782157e-05, "loss": 0.3628, "step": 3985500 }, { "epoch": 26.97325682113469, "grad_norm": 0.3665076792240143, "learning_rate": 4.730267431788653e-05, "loss": 0.3647, "step": 3986000 }, { "epoch": 26.976640320485057, "grad_norm": 0.37495723366737366, "learning_rate": 4.730233596795149e-05, "loss": 0.3631, "step": 3986500 }, { "epoch": 26.980023819835427, "grad_norm": 0.3862490952014923, "learning_rate": 4.7301997618016455e-05, "loss": 0.3622, "step": 3987000 }, { "epoch": 26.983407319185794, "grad_norm": 0.3855757415294647, "learning_rate": 4.7301659268081424e-05, "loss": 0.3602, "step": 3987500 }, { "epoch": 26.986790818536164, "grad_norm": 0.3551534116268158, "learning_rate": 4.7301320918146386e-05, "loss": 0.3624, "step": 3988000 }, { "epoch": 26.99017431788653, "grad_norm": 0.3630467653274536, "learning_rate": 4.730098256821135e-05, "loss": 0.3615, "step": 3988500 }, { "epoch": 26.993557817236898, "grad_norm": 0.3774111866950989, "learning_rate": 4.730064421827631e-05, "loss": 0.3648, "step": 3989000 }, { "epoch": 26.996941316587268, "grad_norm": 0.35956814885139465, "learning_rate": 4.730030586834128e-05, "loss": 0.3621, "step": 3989500 }, { "epoch": 27.0, "eval_accuracy": 0.8617120390552822, "eval_loss": 0.5606268644332886, "eval_runtime": 3363.4255, "eval_samples_per_second": 86.443, "eval_steps_per_second": 5.403, "step": 3989952 }, { "epoch": 27.000324815937635, "grad_norm": 0.36992666125297546, "learning_rate": 4.729996751840624e-05, "loss": 0.362, "step": 3990000 }, { "epoch": 27.003708315288005, "grad_norm": 0.4187975227832794, "learning_rate": 4.7299629168471204e-05, "loss": 0.3592, "step": 3990500 }, { "epoch": 27.007091814638372, "grad_norm": 0.35702478885650635, "learning_rate": 4.7299290818536166e-05, "loss": 0.3611, "step": 3991000 }, { "epoch": 27.01047531398874, "grad_norm": 0.40064552426338196, "learning_rate": 4.729895246860113e-05, "loss": 0.3586, "step": 3991500 }, { "epoch": 27.01385881333911, "grad_norm": 0.39525461196899414, "learning_rate": 4.729861411866609e-05, "loss": 0.3607, "step": 3992000 }, { "epoch": 27.017242312689476, "grad_norm": 0.3214837312698364, "learning_rate": 4.729827576873105e-05, "loss": 0.3591, "step": 3992500 }, { "epoch": 27.020625812039842, "grad_norm": 0.362924724817276, "learning_rate": 4.729793741879602e-05, "loss": 0.3579, "step": 3993000 }, { "epoch": 27.024009311390213, "grad_norm": 0.418075293302536, "learning_rate": 4.729759906886098e-05, "loss": 0.359, "step": 3993500 }, { "epoch": 27.02739281074058, "grad_norm": 0.39205074310302734, "learning_rate": 4.7297260718925945e-05, "loss": 0.3611, "step": 3994000 }, { "epoch": 27.03077631009095, "grad_norm": 0.341508150100708, "learning_rate": 4.729692236899091e-05, "loss": 0.3607, "step": 3994500 }, { "epoch": 27.034159809441316, "grad_norm": 0.3604142963886261, "learning_rate": 4.729658401905587e-05, "loss": 0.3597, "step": 3995000 }, { "epoch": 27.037543308791683, "grad_norm": 0.39036643505096436, "learning_rate": 4.729624566912083e-05, "loss": 0.3611, "step": 3995500 }, { "epoch": 27.040926808142054, "grad_norm": 0.3812994360923767, "learning_rate": 4.7295907319185794e-05, "loss": 0.3609, "step": 3996000 }, { "epoch": 27.04431030749242, "grad_norm": 0.3937513530254364, "learning_rate": 4.7295568969250756e-05, "loss": 0.3617, "step": 3996500 }, { "epoch": 27.04769380684279, "grad_norm": 0.3918488323688507, "learning_rate": 4.7295230619315725e-05, "loss": 0.3582, "step": 3997000 }, { "epoch": 27.051077306193157, "grad_norm": 0.36170023679733276, "learning_rate": 4.729489226938069e-05, "loss": 0.3612, "step": 3997500 }, { "epoch": 27.054460805543524, "grad_norm": 0.3825434446334839, "learning_rate": 4.729455391944565e-05, "loss": 0.3616, "step": 3998000 }, { "epoch": 27.057844304893894, "grad_norm": 0.37038761377334595, "learning_rate": 4.729421556951061e-05, "loss": 0.361, "step": 3998500 }, { "epoch": 27.06122780424426, "grad_norm": 0.33910655975341797, "learning_rate": 4.729387721957558e-05, "loss": 0.362, "step": 3999000 }, { "epoch": 27.06461130359463, "grad_norm": 0.37690266966819763, "learning_rate": 4.729353886964054e-05, "loss": 0.3625, "step": 3999500 }, { "epoch": 27.067994802944998, "grad_norm": 0.3696511685848236, "learning_rate": 4.7293200519705504e-05, "loss": 0.3615, "step": 4000000 }, { "epoch": 27.071378302295365, "grad_norm": 0.35271111130714417, "learning_rate": 4.7292862169770467e-05, "loss": 0.3619, "step": 4000500 }, { "epoch": 27.074761801645735, "grad_norm": 0.3805581033229828, "learning_rate": 4.729252381983543e-05, "loss": 0.361, "step": 4001000 }, { "epoch": 27.078145300996102, "grad_norm": 0.39928561449050903, "learning_rate": 4.729218546990039e-05, "loss": 0.3592, "step": 4001500 }, { "epoch": 27.08152880034647, "grad_norm": 0.367096871137619, "learning_rate": 4.729184711996535e-05, "loss": 0.3585, "step": 4002000 }, { "epoch": 27.08491229969684, "grad_norm": 0.34236449003219604, "learning_rate": 4.7291508770030315e-05, "loss": 0.3611, "step": 4002500 }, { "epoch": 27.088295799047206, "grad_norm": 0.34770265221595764, "learning_rate": 4.7291170420095284e-05, "loss": 0.3614, "step": 4003000 }, { "epoch": 27.091679298397576, "grad_norm": 0.37283119559288025, "learning_rate": 4.7290832070160246e-05, "loss": 0.3587, "step": 4003500 }, { "epoch": 27.095062797747943, "grad_norm": 0.4142138957977295, "learning_rate": 4.729049372022521e-05, "loss": 0.3606, "step": 4004000 }, { "epoch": 27.09844629709831, "grad_norm": 0.3636082112789154, "learning_rate": 4.729015537029017e-05, "loss": 0.3617, "step": 4004500 }, { "epoch": 27.10182979644868, "grad_norm": 0.34307849407196045, "learning_rate": 4.728981702035513e-05, "loss": 0.3608, "step": 4005000 }, { "epoch": 27.105213295799047, "grad_norm": 0.3708007037639618, "learning_rate": 4.7289478670420095e-05, "loss": 0.361, "step": 4005500 }, { "epoch": 27.108596795149417, "grad_norm": 0.39478373527526855, "learning_rate": 4.728914032048506e-05, "loss": 0.3617, "step": 4006000 }, { "epoch": 27.111980294499784, "grad_norm": 0.3448827564716339, "learning_rate": 4.7288801970550026e-05, "loss": 0.3614, "step": 4006500 }, { "epoch": 27.11536379385015, "grad_norm": 0.35732850432395935, "learning_rate": 4.728846362061499e-05, "loss": 0.3624, "step": 4007000 }, { "epoch": 27.11874729320052, "grad_norm": 0.39854931831359863, "learning_rate": 4.728812527067995e-05, "loss": 0.3613, "step": 4007500 }, { "epoch": 27.122130792550887, "grad_norm": 0.39971092343330383, "learning_rate": 4.728778692074491e-05, "loss": 0.3604, "step": 4008000 }, { "epoch": 27.125514291901254, "grad_norm": 0.36446234583854675, "learning_rate": 4.728744857080988e-05, "loss": 0.3601, "step": 4008500 }, { "epoch": 27.128897791251624, "grad_norm": 0.36135321855545044, "learning_rate": 4.728711022087484e-05, "loss": 0.3626, "step": 4009000 }, { "epoch": 27.13228129060199, "grad_norm": 0.390614777803421, "learning_rate": 4.7286771870939805e-05, "loss": 0.3599, "step": 4009500 }, { "epoch": 27.13566478995236, "grad_norm": 0.40647169947624207, "learning_rate": 4.728643352100476e-05, "loss": 0.3619, "step": 4010000 }, { "epoch": 27.13904828930273, "grad_norm": 0.3296395242214203, "learning_rate": 4.728609517106973e-05, "loss": 0.3619, "step": 4010500 }, { "epoch": 27.142431788653095, "grad_norm": 0.37486401200294495, "learning_rate": 4.728575682113469e-05, "loss": 0.3605, "step": 4011000 }, { "epoch": 27.145815288003465, "grad_norm": 0.3922019898891449, "learning_rate": 4.7285418471199654e-05, "loss": 0.3608, "step": 4011500 }, { "epoch": 27.149198787353832, "grad_norm": 0.32637348771095276, "learning_rate": 4.7285080121264616e-05, "loss": 0.3599, "step": 4012000 }, { "epoch": 27.152582286704202, "grad_norm": 0.3791635036468506, "learning_rate": 4.7284741771329585e-05, "loss": 0.3609, "step": 4012500 }, { "epoch": 27.15596578605457, "grad_norm": 0.36705470085144043, "learning_rate": 4.728440342139455e-05, "loss": 0.3614, "step": 4013000 }, { "epoch": 27.159349285404936, "grad_norm": 0.39378514885902405, "learning_rate": 4.728406507145951e-05, "loss": 0.362, "step": 4013500 }, { "epoch": 27.162732784755306, "grad_norm": 0.35343000292778015, "learning_rate": 4.728372672152447e-05, "loss": 0.3617, "step": 4014000 }, { "epoch": 27.166116284105673, "grad_norm": 0.3905445337295532, "learning_rate": 4.728338837158944e-05, "loss": 0.3611, "step": 4014500 }, { "epoch": 27.169499783456043, "grad_norm": 0.36750927567481995, "learning_rate": 4.7283050021654395e-05, "loss": 0.3583, "step": 4015000 }, { "epoch": 27.17288328280641, "grad_norm": 0.3575628399848938, "learning_rate": 4.728271167171936e-05, "loss": 0.3627, "step": 4015500 }, { "epoch": 27.176266782156777, "grad_norm": 0.35029515624046326, "learning_rate": 4.7282373321784326e-05, "loss": 0.3629, "step": 4016000 }, { "epoch": 27.179650281507147, "grad_norm": 0.4039035439491272, "learning_rate": 4.728203497184929e-05, "loss": 0.3619, "step": 4016500 }, { "epoch": 27.183033780857514, "grad_norm": 0.37368181347846985, "learning_rate": 4.728169662191425e-05, "loss": 0.362, "step": 4017000 }, { "epoch": 27.18641728020788, "grad_norm": 0.38231322169303894, "learning_rate": 4.728135827197921e-05, "loss": 0.3599, "step": 4017500 }, { "epoch": 27.18980077955825, "grad_norm": 0.38161662220954895, "learning_rate": 4.728101992204418e-05, "loss": 0.3628, "step": 4018000 }, { "epoch": 27.193184278908618, "grad_norm": 0.4048933684825897, "learning_rate": 4.7280681572109144e-05, "loss": 0.3604, "step": 4018500 }, { "epoch": 27.196567778258988, "grad_norm": 0.4047812521457672, "learning_rate": 4.7280343222174106e-05, "loss": 0.3602, "step": 4019000 }, { "epoch": 27.199951277609355, "grad_norm": 0.39230266213417053, "learning_rate": 4.728000487223906e-05, "loss": 0.3609, "step": 4019500 }, { "epoch": 27.20333477695972, "grad_norm": 0.3826599717140198, "learning_rate": 4.727966652230403e-05, "loss": 0.3623, "step": 4020000 }, { "epoch": 27.20671827631009, "grad_norm": 0.3624647557735443, "learning_rate": 4.727932817236899e-05, "loss": 0.361, "step": 4020500 }, { "epoch": 27.21010177566046, "grad_norm": 0.3965359330177307, "learning_rate": 4.7278989822433954e-05, "loss": 0.3619, "step": 4021000 }, { "epoch": 27.21348527501083, "grad_norm": 0.3927077353000641, "learning_rate": 4.7278651472498916e-05, "loss": 0.3597, "step": 4021500 }, { "epoch": 27.216868774361195, "grad_norm": 0.3874165117740631, "learning_rate": 4.7278313122563885e-05, "loss": 0.3618, "step": 4022000 }, { "epoch": 27.220252273711562, "grad_norm": 0.36228522658348083, "learning_rate": 4.727797477262885e-05, "loss": 0.3608, "step": 4022500 }, { "epoch": 27.223635773061932, "grad_norm": 0.3947308361530304, "learning_rate": 4.727763642269381e-05, "loss": 0.3606, "step": 4023000 }, { "epoch": 27.2270192724123, "grad_norm": 0.353760302066803, "learning_rate": 4.727729807275877e-05, "loss": 0.3622, "step": 4023500 }, { "epoch": 27.23040277176267, "grad_norm": 0.34535297751426697, "learning_rate": 4.727695972282374e-05, "loss": 0.3618, "step": 4024000 }, { "epoch": 27.233786271113036, "grad_norm": 0.3804129362106323, "learning_rate": 4.7276621372888696e-05, "loss": 0.3609, "step": 4024500 }, { "epoch": 27.237169770463403, "grad_norm": 0.42950165271759033, "learning_rate": 4.727628302295366e-05, "loss": 0.3612, "step": 4025000 }, { "epoch": 27.240553269813773, "grad_norm": 0.3900119662284851, "learning_rate": 4.727594467301863e-05, "loss": 0.3625, "step": 4025500 }, { "epoch": 27.24393676916414, "grad_norm": 0.38278695940971375, "learning_rate": 4.727560632308359e-05, "loss": 0.3621, "step": 4026000 }, { "epoch": 27.247320268514507, "grad_norm": 0.3604544401168823, "learning_rate": 4.727526797314855e-05, "loss": 0.3603, "step": 4026500 }, { "epoch": 27.250703767864877, "grad_norm": 0.30103299021720886, "learning_rate": 4.727492962321351e-05, "loss": 0.3614, "step": 4027000 }, { "epoch": 27.254087267215244, "grad_norm": 0.3802935779094696, "learning_rate": 4.727459127327848e-05, "loss": 0.3635, "step": 4027500 }, { "epoch": 27.257470766565614, "grad_norm": 0.3827607035636902, "learning_rate": 4.7274252923343444e-05, "loss": 0.3614, "step": 4028000 }, { "epoch": 27.26085426591598, "grad_norm": 0.3646654188632965, "learning_rate": 4.7273914573408406e-05, "loss": 0.3617, "step": 4028500 }, { "epoch": 27.264237765266348, "grad_norm": 0.3567125201225281, "learning_rate": 4.727357622347336e-05, "loss": 0.3623, "step": 4029000 }, { "epoch": 27.267621264616718, "grad_norm": 0.3684942424297333, "learning_rate": 4.727323787353833e-05, "loss": 0.3606, "step": 4029500 }, { "epoch": 27.271004763967085, "grad_norm": 0.36184918880462646, "learning_rate": 4.727289952360329e-05, "loss": 0.3615, "step": 4030000 }, { "epoch": 27.274388263317455, "grad_norm": 0.3627530038356781, "learning_rate": 4.7272561173668255e-05, "loss": 0.3611, "step": 4030500 }, { "epoch": 27.27777176266782, "grad_norm": 0.41672393679618835, "learning_rate": 4.727222282373322e-05, "loss": 0.362, "step": 4031000 }, { "epoch": 27.28115526201819, "grad_norm": 0.3685767948627472, "learning_rate": 4.7271884473798186e-05, "loss": 0.3637, "step": 4031500 }, { "epoch": 27.28453876136856, "grad_norm": 0.3790867030620575, "learning_rate": 4.727154612386315e-05, "loss": 0.3612, "step": 4032000 }, { "epoch": 27.287922260718926, "grad_norm": 0.3841542601585388, "learning_rate": 4.727120777392811e-05, "loss": 0.36, "step": 4032500 }, { "epoch": 27.291305760069292, "grad_norm": 0.3759045898914337, "learning_rate": 4.727086942399307e-05, "loss": 0.3607, "step": 4033000 }, { "epoch": 27.294689259419663, "grad_norm": 0.42025455832481384, "learning_rate": 4.727053107405804e-05, "loss": 0.3624, "step": 4033500 }, { "epoch": 27.29807275877003, "grad_norm": 0.4027925729751587, "learning_rate": 4.7270192724122996e-05, "loss": 0.3613, "step": 4034000 }, { "epoch": 27.3014562581204, "grad_norm": 0.4137814939022064, "learning_rate": 4.726985437418796e-05, "loss": 0.3614, "step": 4034500 }, { "epoch": 27.304839757470766, "grad_norm": 0.390674352645874, "learning_rate": 4.726951602425293e-05, "loss": 0.3622, "step": 4035000 }, { "epoch": 27.308223256821133, "grad_norm": 0.39336270093917847, "learning_rate": 4.726917767431789e-05, "loss": 0.3618, "step": 4035500 }, { "epoch": 27.311606756171503, "grad_norm": 0.3716800808906555, "learning_rate": 4.726883932438285e-05, "loss": 0.3613, "step": 4036000 }, { "epoch": 27.31499025552187, "grad_norm": 0.3723650276660919, "learning_rate": 4.7268500974447814e-05, "loss": 0.3622, "step": 4036500 }, { "epoch": 27.31837375487224, "grad_norm": 0.41383472084999084, "learning_rate": 4.726816262451278e-05, "loss": 0.3609, "step": 4037000 }, { "epoch": 27.321757254222607, "grad_norm": 0.38070666790008545, "learning_rate": 4.7267824274577745e-05, "loss": 0.3619, "step": 4037500 }, { "epoch": 27.325140753572974, "grad_norm": 0.3477083444595337, "learning_rate": 4.726748592464271e-05, "loss": 0.3614, "step": 4038000 }, { "epoch": 27.328524252923344, "grad_norm": 0.39822229743003845, "learning_rate": 4.726714757470766e-05, "loss": 0.3622, "step": 4038500 }, { "epoch": 27.33190775227371, "grad_norm": 0.38165462017059326, "learning_rate": 4.726680922477263e-05, "loss": 0.3613, "step": 4039000 }, { "epoch": 27.33529125162408, "grad_norm": 0.37544435262680054, "learning_rate": 4.726647087483759e-05, "loss": 0.3611, "step": 4039500 }, { "epoch": 27.338674750974448, "grad_norm": 0.3443518877029419, "learning_rate": 4.7266132524902555e-05, "loss": 0.3625, "step": 4040000 }, { "epoch": 27.342058250324815, "grad_norm": 0.36943838000297546, "learning_rate": 4.726579417496752e-05, "loss": 0.3617, "step": 4040500 }, { "epoch": 27.345441749675185, "grad_norm": 0.3410223722457886, "learning_rate": 4.7265455825032486e-05, "loss": 0.3622, "step": 4041000 }, { "epoch": 27.348825249025552, "grad_norm": 0.3751671612262726, "learning_rate": 4.726511747509745e-05, "loss": 0.3603, "step": 4041500 }, { "epoch": 27.35220874837592, "grad_norm": 0.41506850719451904, "learning_rate": 4.726477912516241e-05, "loss": 0.3613, "step": 4042000 }, { "epoch": 27.35559224772629, "grad_norm": 0.40068018436431885, "learning_rate": 4.726444077522737e-05, "loss": 0.3627, "step": 4042500 }, { "epoch": 27.358975747076656, "grad_norm": 0.34891965985298157, "learning_rate": 4.726410242529234e-05, "loss": 0.3619, "step": 4043000 }, { "epoch": 27.362359246427026, "grad_norm": 0.38653042912483215, "learning_rate": 4.72637640753573e-05, "loss": 0.3641, "step": 4043500 }, { "epoch": 27.365742745777393, "grad_norm": 0.4033982753753662, "learning_rate": 4.726342572542226e-05, "loss": 0.3619, "step": 4044000 }, { "epoch": 27.36912624512776, "grad_norm": 0.40694281458854675, "learning_rate": 4.726308737548723e-05, "loss": 0.3625, "step": 4044500 }, { "epoch": 27.37250974447813, "grad_norm": 0.37466728687286377, "learning_rate": 4.726274902555219e-05, "loss": 0.3603, "step": 4045000 }, { "epoch": 27.375893243828497, "grad_norm": 0.3674294650554657, "learning_rate": 4.726241067561715e-05, "loss": 0.3609, "step": 4045500 }, { "epoch": 27.379276743178867, "grad_norm": 0.3904021680355072, "learning_rate": 4.7262072325682114e-05, "loss": 0.3618, "step": 4046000 }, { "epoch": 27.382660242529234, "grad_norm": 0.40246257185935974, "learning_rate": 4.726173397574708e-05, "loss": 0.3619, "step": 4046500 }, { "epoch": 27.3860437418796, "grad_norm": 0.3744506239891052, "learning_rate": 4.7261395625812045e-05, "loss": 0.3631, "step": 4047000 }, { "epoch": 27.38942724122997, "grad_norm": 0.33874863386154175, "learning_rate": 4.726105727587701e-05, "loss": 0.3635, "step": 4047500 }, { "epoch": 27.392810740580337, "grad_norm": 0.35687437653541565, "learning_rate": 4.726071892594196e-05, "loss": 0.36, "step": 4048000 }, { "epoch": 27.396194239930708, "grad_norm": 0.38321176171302795, "learning_rate": 4.726038057600693e-05, "loss": 0.3614, "step": 4048500 }, { "epoch": 27.399577739281074, "grad_norm": 0.36645007133483887, "learning_rate": 4.7260042226071894e-05, "loss": 0.362, "step": 4049000 }, { "epoch": 27.40296123863144, "grad_norm": 0.3766949474811554, "learning_rate": 4.7259703876136856e-05, "loss": 0.3615, "step": 4049500 }, { "epoch": 27.40634473798181, "grad_norm": 0.41977280378341675, "learning_rate": 4.725936552620182e-05, "loss": 0.3621, "step": 4050000 }, { "epoch": 27.40972823733218, "grad_norm": 0.380024790763855, "learning_rate": 4.725902717626679e-05, "loss": 0.3623, "step": 4050500 }, { "epoch": 27.413111736682545, "grad_norm": 0.3690486252307892, "learning_rate": 4.725868882633175e-05, "loss": 0.3637, "step": 4051000 }, { "epoch": 27.416495236032915, "grad_norm": 0.4049592614173889, "learning_rate": 4.725835047639671e-05, "loss": 0.3628, "step": 4051500 }, { "epoch": 27.419878735383282, "grad_norm": 0.4195794463157654, "learning_rate": 4.7258012126461673e-05, "loss": 0.3629, "step": 4052000 }, { "epoch": 27.423262234733652, "grad_norm": 0.40863946080207825, "learning_rate": 4.725767377652664e-05, "loss": 0.3619, "step": 4052500 }, { "epoch": 27.42664573408402, "grad_norm": 0.35777702927589417, "learning_rate": 4.72573354265916e-05, "loss": 0.3618, "step": 4053000 }, { "epoch": 27.430029233434386, "grad_norm": 0.3972858786582947, "learning_rate": 4.725699707665656e-05, "loss": 0.3627, "step": 4053500 }, { "epoch": 27.433412732784756, "grad_norm": 0.3517029881477356, "learning_rate": 4.725665872672153e-05, "loss": 0.3622, "step": 4054000 }, { "epoch": 27.436796232135123, "grad_norm": 0.3701731860637665, "learning_rate": 4.725632037678649e-05, "loss": 0.3607, "step": 4054500 }, { "epoch": 27.440179731485493, "grad_norm": 0.3667539954185486, "learning_rate": 4.725598202685145e-05, "loss": 0.362, "step": 4055000 }, { "epoch": 27.44356323083586, "grad_norm": 0.3279666602611542, "learning_rate": 4.7255643676916415e-05, "loss": 0.3612, "step": 4055500 }, { "epoch": 27.446946730186227, "grad_norm": 0.396075040102005, "learning_rate": 4.725530532698138e-05, "loss": 0.3615, "step": 4056000 }, { "epoch": 27.450330229536597, "grad_norm": 0.3535042405128479, "learning_rate": 4.7254966977046346e-05, "loss": 0.3591, "step": 4056500 }, { "epoch": 27.453713728886964, "grad_norm": 0.4029294550418854, "learning_rate": 4.725462862711131e-05, "loss": 0.3615, "step": 4057000 }, { "epoch": 27.45709722823733, "grad_norm": 0.3558937609195709, "learning_rate": 4.7254290277176264e-05, "loss": 0.3614, "step": 4057500 }, { "epoch": 27.4604807275877, "grad_norm": 0.36616647243499756, "learning_rate": 4.725395192724123e-05, "loss": 0.3611, "step": 4058000 }, { "epoch": 27.463864226938068, "grad_norm": 0.36483973264694214, "learning_rate": 4.7253613577306195e-05, "loss": 0.3623, "step": 4058500 }, { "epoch": 27.467247726288438, "grad_norm": 0.3783068060874939, "learning_rate": 4.725327522737116e-05, "loss": 0.3615, "step": 4059000 }, { "epoch": 27.470631225638805, "grad_norm": 0.36561059951782227, "learning_rate": 4.725293687743612e-05, "loss": 0.3627, "step": 4059500 }, { "epoch": 27.47401472498917, "grad_norm": 0.36476603150367737, "learning_rate": 4.725259852750109e-05, "loss": 0.3627, "step": 4060000 }, { "epoch": 27.47739822433954, "grad_norm": 0.35045233368873596, "learning_rate": 4.725226017756605e-05, "loss": 0.3611, "step": 4060500 }, { "epoch": 27.48078172368991, "grad_norm": 0.33032065629959106, "learning_rate": 4.725192182763101e-05, "loss": 0.3627, "step": 4061000 }, { "epoch": 27.48416522304028, "grad_norm": 0.3880566954612732, "learning_rate": 4.7251583477695974e-05, "loss": 0.3609, "step": 4061500 }, { "epoch": 27.487548722390645, "grad_norm": 0.3635287880897522, "learning_rate": 4.725124512776094e-05, "loss": 0.3613, "step": 4062000 }, { "epoch": 27.490932221741012, "grad_norm": 0.4103771150112152, "learning_rate": 4.72509067778259e-05, "loss": 0.3612, "step": 4062500 }, { "epoch": 27.494315721091382, "grad_norm": 0.3320116698741913, "learning_rate": 4.725056842789086e-05, "loss": 0.3621, "step": 4063000 }, { "epoch": 27.49769922044175, "grad_norm": 0.3799278736114502, "learning_rate": 4.725023007795583e-05, "loss": 0.3627, "step": 4063500 }, { "epoch": 27.50108271979212, "grad_norm": 0.3894334137439728, "learning_rate": 4.724989172802079e-05, "loss": 0.3622, "step": 4064000 }, { "epoch": 27.504466219142486, "grad_norm": 0.3801906704902649, "learning_rate": 4.7249553378085754e-05, "loss": 0.3628, "step": 4064500 }, { "epoch": 27.507849718492853, "grad_norm": 0.35581132769584656, "learning_rate": 4.7249215028150716e-05, "loss": 0.3621, "step": 4065000 }, { "epoch": 27.511233217843223, "grad_norm": 0.3754686117172241, "learning_rate": 4.724887667821568e-05, "loss": 0.3621, "step": 4065500 }, { "epoch": 27.51461671719359, "grad_norm": 0.36557772755622864, "learning_rate": 4.724853832828065e-05, "loss": 0.3614, "step": 4066000 }, { "epoch": 27.518000216543957, "grad_norm": 0.37378761172294617, "learning_rate": 4.724819997834561e-05, "loss": 0.3618, "step": 4066500 }, { "epoch": 27.521383715894327, "grad_norm": 0.3569445312023163, "learning_rate": 4.7247861628410564e-05, "loss": 0.3621, "step": 4067000 }, { "epoch": 27.524767215244694, "grad_norm": 0.4252917468547821, "learning_rate": 4.724752327847553e-05, "loss": 0.3605, "step": 4067500 }, { "epoch": 27.528150714595064, "grad_norm": 0.3686460256576538, "learning_rate": 4.7247184928540495e-05, "loss": 0.3623, "step": 4068000 }, { "epoch": 27.53153421394543, "grad_norm": 0.405404269695282, "learning_rate": 4.724684657860546e-05, "loss": 0.3622, "step": 4068500 }, { "epoch": 27.534917713295798, "grad_norm": 0.37391921877861023, "learning_rate": 4.724650822867042e-05, "loss": 0.3615, "step": 4069000 }, { "epoch": 27.538301212646168, "grad_norm": 0.3840598464012146, "learning_rate": 4.724616987873539e-05, "loss": 0.3608, "step": 4069500 }, { "epoch": 27.541684711996535, "grad_norm": 0.36684784293174744, "learning_rate": 4.724583152880035e-05, "loss": 0.3594, "step": 4070000 }, { "epoch": 27.545068211346905, "grad_norm": 0.36803022027015686, "learning_rate": 4.724549317886531e-05, "loss": 0.3617, "step": 4070500 }, { "epoch": 27.54845171069727, "grad_norm": 0.3929801285266876, "learning_rate": 4.7245154828930275e-05, "loss": 0.3611, "step": 4071000 }, { "epoch": 27.55183521004764, "grad_norm": 0.37724795937538147, "learning_rate": 4.7244816478995244e-05, "loss": 0.3625, "step": 4071500 }, { "epoch": 27.55521870939801, "grad_norm": 0.352273553609848, "learning_rate": 4.72444781290602e-05, "loss": 0.3622, "step": 4072000 }, { "epoch": 27.558602208748376, "grad_norm": 0.37165728211402893, "learning_rate": 4.724413977912516e-05, "loss": 0.3624, "step": 4072500 }, { "epoch": 27.561985708098746, "grad_norm": 0.36808037757873535, "learning_rate": 4.724380142919012e-05, "loss": 0.3622, "step": 4073000 }, { "epoch": 27.565369207449113, "grad_norm": 0.4029386341571808, "learning_rate": 4.724346307925509e-05, "loss": 0.3614, "step": 4073500 }, { "epoch": 27.56875270679948, "grad_norm": 0.37487930059432983, "learning_rate": 4.7243124729320054e-05, "loss": 0.3624, "step": 4074000 }, { "epoch": 27.57213620614985, "grad_norm": 0.3402683734893799, "learning_rate": 4.7242786379385016e-05, "loss": 0.363, "step": 4074500 }, { "epoch": 27.575519705500216, "grad_norm": 0.3881003260612488, "learning_rate": 4.724244802944998e-05, "loss": 0.3615, "step": 4075000 }, { "epoch": 27.578903204850583, "grad_norm": 0.34688401222229004, "learning_rate": 4.724210967951495e-05, "loss": 0.3616, "step": 4075500 }, { "epoch": 27.582286704200953, "grad_norm": 0.4068271219730377, "learning_rate": 4.724177132957991e-05, "loss": 0.3599, "step": 4076000 }, { "epoch": 27.58567020355132, "grad_norm": 0.39702337980270386, "learning_rate": 4.724143297964487e-05, "loss": 0.3641, "step": 4076500 }, { "epoch": 27.58905370290169, "grad_norm": 0.31892362236976624, "learning_rate": 4.7241094629709834e-05, "loss": 0.3614, "step": 4077000 }, { "epoch": 27.592437202252057, "grad_norm": 0.38383200764656067, "learning_rate": 4.7240756279774796e-05, "loss": 0.3611, "step": 4077500 }, { "epoch": 27.595820701602424, "grad_norm": 0.3788059651851654, "learning_rate": 4.724041792983976e-05, "loss": 0.3614, "step": 4078000 }, { "epoch": 27.599204200952794, "grad_norm": 0.38105660676956177, "learning_rate": 4.724007957990472e-05, "loss": 0.3601, "step": 4078500 }, { "epoch": 27.60258770030316, "grad_norm": 0.3627316355705261, "learning_rate": 4.723974122996969e-05, "loss": 0.3617, "step": 4079000 }, { "epoch": 27.60597119965353, "grad_norm": 0.4075861871242523, "learning_rate": 4.723940288003465e-05, "loss": 0.3613, "step": 4079500 }, { "epoch": 27.609354699003898, "grad_norm": 0.39518603682518005, "learning_rate": 4.723906453009961e-05, "loss": 0.3621, "step": 4080000 }, { "epoch": 27.612738198354265, "grad_norm": 0.3917335867881775, "learning_rate": 4.7238726180164575e-05, "loss": 0.3633, "step": 4080500 }, { "epoch": 27.616121697704635, "grad_norm": 0.34773170948028564, "learning_rate": 4.7238387830229544e-05, "loss": 0.3626, "step": 4081000 }, { "epoch": 27.619505197055002, "grad_norm": 0.364572674036026, "learning_rate": 4.72380494802945e-05, "loss": 0.3608, "step": 4081500 }, { "epoch": 27.62288869640537, "grad_norm": 0.36159542202949524, "learning_rate": 4.723771113035946e-05, "loss": 0.3606, "step": 4082000 }, { "epoch": 27.62627219575574, "grad_norm": 0.3716708719730377, "learning_rate": 4.7237372780424424e-05, "loss": 0.3614, "step": 4082500 }, { "epoch": 27.629655695106106, "grad_norm": 0.3588971793651581, "learning_rate": 4.723703443048939e-05, "loss": 0.3633, "step": 4083000 }, { "epoch": 27.633039194456476, "grad_norm": 0.37614136934280396, "learning_rate": 4.7236696080554355e-05, "loss": 0.3615, "step": 4083500 }, { "epoch": 27.636422693806843, "grad_norm": 0.42491161823272705, "learning_rate": 4.723635773061932e-05, "loss": 0.3615, "step": 4084000 }, { "epoch": 27.63980619315721, "grad_norm": 0.3415015637874603, "learning_rate": 4.723601938068428e-05, "loss": 0.3619, "step": 4084500 }, { "epoch": 27.64318969250758, "grad_norm": 0.3611709773540497, "learning_rate": 4.723568103074925e-05, "loss": 0.3638, "step": 4085000 }, { "epoch": 27.646573191857946, "grad_norm": 0.3997561037540436, "learning_rate": 4.723534268081421e-05, "loss": 0.3608, "step": 4085500 }, { "epoch": 27.649956691208317, "grad_norm": 0.37621551752090454, "learning_rate": 4.723500433087917e-05, "loss": 0.362, "step": 4086000 }, { "epoch": 27.653340190558684, "grad_norm": 0.3873799443244934, "learning_rate": 4.7234665980944134e-05, "loss": 0.362, "step": 4086500 }, { "epoch": 27.65672368990905, "grad_norm": 0.3721669316291809, "learning_rate": 4.7234327631009096e-05, "loss": 0.3615, "step": 4087000 }, { "epoch": 27.66010718925942, "grad_norm": 0.334337055683136, "learning_rate": 4.723398928107406e-05, "loss": 0.3617, "step": 4087500 }, { "epoch": 27.663490688609787, "grad_norm": 0.3558754622936249, "learning_rate": 4.723365093113902e-05, "loss": 0.362, "step": 4088000 }, { "epoch": 27.666874187960158, "grad_norm": 0.3540439307689667, "learning_rate": 4.723331258120399e-05, "loss": 0.3605, "step": 4088500 }, { "epoch": 27.670257687310524, "grad_norm": 0.39304184913635254, "learning_rate": 4.723297423126895e-05, "loss": 0.3621, "step": 4089000 }, { "epoch": 27.67364118666089, "grad_norm": 0.3880615532398224, "learning_rate": 4.7232635881333914e-05, "loss": 0.3617, "step": 4089500 }, { "epoch": 27.67702468601126, "grad_norm": 0.41420215368270874, "learning_rate": 4.7232297531398876e-05, "loss": 0.3608, "step": 4090000 }, { "epoch": 27.680408185361628, "grad_norm": 0.3796791732311249, "learning_rate": 4.7231959181463845e-05, "loss": 0.3633, "step": 4090500 }, { "epoch": 27.683791684711995, "grad_norm": 0.39305371046066284, "learning_rate": 4.72316208315288e-05, "loss": 0.362, "step": 4091000 }, { "epoch": 27.687175184062365, "grad_norm": 0.3777586817741394, "learning_rate": 4.723128248159376e-05, "loss": 0.3617, "step": 4091500 }, { "epoch": 27.690558683412732, "grad_norm": 0.3716682195663452, "learning_rate": 4.7230944131658724e-05, "loss": 0.3611, "step": 4092000 }, { "epoch": 27.693942182763102, "grad_norm": 0.37100502848625183, "learning_rate": 4.723060578172369e-05, "loss": 0.362, "step": 4092500 }, { "epoch": 27.69732568211347, "grad_norm": 0.3646644651889801, "learning_rate": 4.7230267431788655e-05, "loss": 0.3616, "step": 4093000 }, { "epoch": 27.700709181463836, "grad_norm": 0.41356727480888367, "learning_rate": 4.722992908185362e-05, "loss": 0.3609, "step": 4093500 }, { "epoch": 27.704092680814206, "grad_norm": 0.3958253562450409, "learning_rate": 4.722959073191858e-05, "loss": 0.3619, "step": 4094000 }, { "epoch": 27.707476180164573, "grad_norm": 0.342966765165329, "learning_rate": 4.722925238198355e-05, "loss": 0.3618, "step": 4094500 }, { "epoch": 27.710859679514943, "grad_norm": 0.36626705527305603, "learning_rate": 4.722891403204851e-05, "loss": 0.3613, "step": 4095000 }, { "epoch": 27.71424317886531, "grad_norm": 0.36850035190582275, "learning_rate": 4.722857568211347e-05, "loss": 0.3611, "step": 4095500 }, { "epoch": 27.717626678215677, "grad_norm": 0.4046573340892792, "learning_rate": 4.7228237332178435e-05, "loss": 0.3608, "step": 4096000 }, { "epoch": 27.721010177566047, "grad_norm": 0.3967227637767792, "learning_rate": 4.72278989822434e-05, "loss": 0.362, "step": 4096500 }, { "epoch": 27.724393676916414, "grad_norm": 0.3719586730003357, "learning_rate": 4.722756063230836e-05, "loss": 0.3626, "step": 4097000 }, { "epoch": 27.727777176266784, "grad_norm": 0.36543700098991394, "learning_rate": 4.722722228237332e-05, "loss": 0.3636, "step": 4097500 }, { "epoch": 27.73116067561715, "grad_norm": 0.3697172999382019, "learning_rate": 4.722688393243829e-05, "loss": 0.3599, "step": 4098000 }, { "epoch": 27.734544174967517, "grad_norm": 0.337506502866745, "learning_rate": 4.722654558250325e-05, "loss": 0.3614, "step": 4098500 }, { "epoch": 27.737927674317888, "grad_norm": 0.36323779821395874, "learning_rate": 4.7226207232568214e-05, "loss": 0.3626, "step": 4099000 }, { "epoch": 27.741311173668254, "grad_norm": 0.39795252680778503, "learning_rate": 4.722586888263318e-05, "loss": 0.3616, "step": 4099500 }, { "epoch": 27.74469467301862, "grad_norm": 0.34694918990135193, "learning_rate": 4.7225530532698146e-05, "loss": 0.3634, "step": 4100000 }, { "epoch": 27.74807817236899, "grad_norm": 0.37187129259109497, "learning_rate": 4.72251921827631e-05, "loss": 0.3619, "step": 4100500 }, { "epoch": 27.75146167171936, "grad_norm": 0.38147711753845215, "learning_rate": 4.722485383282806e-05, "loss": 0.3611, "step": 4101000 }, { "epoch": 27.75484517106973, "grad_norm": 0.3710109293460846, "learning_rate": 4.7224515482893025e-05, "loss": 0.3616, "step": 4101500 }, { "epoch": 27.758228670420095, "grad_norm": 0.38604336977005005, "learning_rate": 4.7224177132957994e-05, "loss": 0.3611, "step": 4102000 }, { "epoch": 27.761612169770462, "grad_norm": 0.40243417024612427, "learning_rate": 4.7223838783022956e-05, "loss": 0.3625, "step": 4102500 }, { "epoch": 27.764995669120832, "grad_norm": 0.4230211675167084, "learning_rate": 4.722350043308792e-05, "loss": 0.3612, "step": 4103000 }, { "epoch": 27.7683791684712, "grad_norm": 0.40428245067596436, "learning_rate": 4.722316208315288e-05, "loss": 0.3601, "step": 4103500 }, { "epoch": 27.77176266782157, "grad_norm": 0.374149352312088, "learning_rate": 4.722282373321785e-05, "loss": 0.3618, "step": 4104000 }, { "epoch": 27.775146167171936, "grad_norm": 0.3828555941581726, "learning_rate": 4.722248538328281e-05, "loss": 0.3633, "step": 4104500 }, { "epoch": 27.778529666522303, "grad_norm": 0.38992854952812195, "learning_rate": 4.7222147033347774e-05, "loss": 0.3616, "step": 4105000 }, { "epoch": 27.781913165872673, "grad_norm": 0.3347651958465576, "learning_rate": 4.7221808683412736e-05, "loss": 0.3628, "step": 4105500 }, { "epoch": 27.78529666522304, "grad_norm": 0.3705926537513733, "learning_rate": 4.72214703334777e-05, "loss": 0.3622, "step": 4106000 }, { "epoch": 27.788680164573407, "grad_norm": 0.3586915135383606, "learning_rate": 4.722113198354266e-05, "loss": 0.3618, "step": 4106500 }, { "epoch": 27.792063663923777, "grad_norm": 0.4063935875892639, "learning_rate": 4.722079363360762e-05, "loss": 0.3612, "step": 4107000 }, { "epoch": 27.795447163274144, "grad_norm": 0.36764734983444214, "learning_rate": 4.722045528367259e-05, "loss": 0.3618, "step": 4107500 }, { "epoch": 27.798830662624514, "grad_norm": 0.3634496033191681, "learning_rate": 4.722011693373755e-05, "loss": 0.3618, "step": 4108000 }, { "epoch": 27.80221416197488, "grad_norm": 0.382001668214798, "learning_rate": 4.7219778583802515e-05, "loss": 0.3635, "step": 4108500 }, { "epoch": 27.805597661325248, "grad_norm": 0.38688212633132935, "learning_rate": 4.721944023386748e-05, "loss": 0.3634, "step": 4109000 }, { "epoch": 27.808981160675618, "grad_norm": 0.3695046305656433, "learning_rate": 4.7219101883932446e-05, "loss": 0.3614, "step": 4109500 }, { "epoch": 27.812364660025985, "grad_norm": 0.3804705739021301, "learning_rate": 4.72187635339974e-05, "loss": 0.3618, "step": 4110000 }, { "epoch": 27.815748159376355, "grad_norm": 0.37848877906799316, "learning_rate": 4.7218425184062364e-05, "loss": 0.3629, "step": 4110500 }, { "epoch": 27.81913165872672, "grad_norm": 0.40287455916404724, "learning_rate": 4.7218086834127326e-05, "loss": 0.3618, "step": 4111000 }, { "epoch": 27.82251515807709, "grad_norm": 0.3197984993457794, "learning_rate": 4.7217748484192295e-05, "loss": 0.3627, "step": 4111500 }, { "epoch": 27.82589865742746, "grad_norm": 0.37059545516967773, "learning_rate": 4.721741013425726e-05, "loss": 0.3615, "step": 4112000 }, { "epoch": 27.829282156777825, "grad_norm": 0.38981738686561584, "learning_rate": 4.721707178432222e-05, "loss": 0.3621, "step": 4112500 }, { "epoch": 27.832665656128192, "grad_norm": 0.3608679175376892, "learning_rate": 4.721673343438718e-05, "loss": 0.3622, "step": 4113000 }, { "epoch": 27.836049155478563, "grad_norm": 0.37277328968048096, "learning_rate": 4.721639508445215e-05, "loss": 0.3642, "step": 4113500 }, { "epoch": 27.83943265482893, "grad_norm": 0.389523446559906, "learning_rate": 4.721605673451711e-05, "loss": 0.3622, "step": 4114000 }, { "epoch": 27.8428161541793, "grad_norm": 0.38738763332366943, "learning_rate": 4.7215718384582074e-05, "loss": 0.3632, "step": 4114500 }, { "epoch": 27.846199653529666, "grad_norm": 0.3468678593635559, "learning_rate": 4.7215380034647036e-05, "loss": 0.361, "step": 4115000 }, { "epoch": 27.849583152880033, "grad_norm": 0.38216879963874817, "learning_rate": 4.7215041684712e-05, "loss": 0.3625, "step": 4115500 }, { "epoch": 27.852966652230403, "grad_norm": 0.3636903464794159, "learning_rate": 4.721470333477696e-05, "loss": 0.3623, "step": 4116000 }, { "epoch": 27.85635015158077, "grad_norm": 0.3981262445449829, "learning_rate": 4.721436498484192e-05, "loss": 0.3622, "step": 4116500 }, { "epoch": 27.85973365093114, "grad_norm": 0.36636820435523987, "learning_rate": 4.721402663490689e-05, "loss": 0.3614, "step": 4117000 }, { "epoch": 27.863117150281507, "grad_norm": 0.35542821884155273, "learning_rate": 4.7213688284971854e-05, "loss": 0.3633, "step": 4117500 }, { "epoch": 27.866500649631874, "grad_norm": 0.39117830991744995, "learning_rate": 4.7213349935036816e-05, "loss": 0.3617, "step": 4118000 }, { "epoch": 27.869884148982244, "grad_norm": 0.36367714405059814, "learning_rate": 4.721301158510178e-05, "loss": 0.3612, "step": 4118500 }, { "epoch": 27.87326764833261, "grad_norm": 0.36560484766960144, "learning_rate": 4.721267323516674e-05, "loss": 0.3607, "step": 4119000 }, { "epoch": 27.87665114768298, "grad_norm": 0.39626118540763855, "learning_rate": 4.72123348852317e-05, "loss": 0.3626, "step": 4119500 }, { "epoch": 27.880034647033348, "grad_norm": 0.38299208879470825, "learning_rate": 4.7211996535296664e-05, "loss": 0.3618, "step": 4120000 }, { "epoch": 27.883418146383715, "grad_norm": 0.32079488039016724, "learning_rate": 4.7211658185361626e-05, "loss": 0.3635, "step": 4120500 }, { "epoch": 27.886801645734085, "grad_norm": 0.4020163118839264, "learning_rate": 4.7211319835426595e-05, "loss": 0.3609, "step": 4121000 }, { "epoch": 27.890185145084452, "grad_norm": 0.3395242691040039, "learning_rate": 4.721098148549156e-05, "loss": 0.3606, "step": 4121500 }, { "epoch": 27.893568644434822, "grad_norm": 0.3686038851737976, "learning_rate": 4.721064313555652e-05, "loss": 0.3624, "step": 4122000 }, { "epoch": 27.89695214378519, "grad_norm": 0.37975063920021057, "learning_rate": 4.721030478562148e-05, "loss": 0.3602, "step": 4122500 }, { "epoch": 27.900335643135556, "grad_norm": 0.3796452581882477, "learning_rate": 4.720996643568645e-05, "loss": 0.3642, "step": 4123000 }, { "epoch": 27.903719142485926, "grad_norm": 0.347863107919693, "learning_rate": 4.720962808575141e-05, "loss": 0.3607, "step": 4123500 }, { "epoch": 27.907102641836293, "grad_norm": 0.3953015208244324, "learning_rate": 4.7209289735816375e-05, "loss": 0.3634, "step": 4124000 }, { "epoch": 27.91048614118666, "grad_norm": 0.39113664627075195, "learning_rate": 4.720895138588134e-05, "loss": 0.3618, "step": 4124500 }, { "epoch": 27.91386964053703, "grad_norm": 0.3630425035953522, "learning_rate": 4.72086130359463e-05, "loss": 0.3622, "step": 4125000 }, { "epoch": 27.917253139887396, "grad_norm": 0.40215301513671875, "learning_rate": 4.720827468601126e-05, "loss": 0.3633, "step": 4125500 }, { "epoch": 27.920636639237767, "grad_norm": 0.40930697321891785, "learning_rate": 4.720793633607622e-05, "loss": 0.3604, "step": 4126000 }, { "epoch": 27.924020138588133, "grad_norm": 0.35423481464385986, "learning_rate": 4.720759798614119e-05, "loss": 0.361, "step": 4126500 }, { "epoch": 27.9274036379385, "grad_norm": 0.3465040624141693, "learning_rate": 4.7207259636206154e-05, "loss": 0.362, "step": 4127000 }, { "epoch": 27.93078713728887, "grad_norm": 0.4105958640575409, "learning_rate": 4.7206921286271116e-05, "loss": 0.3623, "step": 4127500 }, { "epoch": 27.934170636639237, "grad_norm": 0.3818987011909485, "learning_rate": 4.720658293633608e-05, "loss": 0.3625, "step": 4128000 }, { "epoch": 27.937554135989608, "grad_norm": 0.3856114149093628, "learning_rate": 4.720624458640104e-05, "loss": 0.3624, "step": 4128500 }, { "epoch": 27.940937635339974, "grad_norm": 0.42435044050216675, "learning_rate": 4.720590623646601e-05, "loss": 0.3614, "step": 4129000 }, { "epoch": 27.94432113469034, "grad_norm": 0.38874831795692444, "learning_rate": 4.7205567886530965e-05, "loss": 0.3609, "step": 4129500 }, { "epoch": 27.94770463404071, "grad_norm": 0.43664172291755676, "learning_rate": 4.720522953659593e-05, "loss": 0.3617, "step": 4130000 }, { "epoch": 27.951088133391078, "grad_norm": 0.3564797043800354, "learning_rate": 4.7204891186660896e-05, "loss": 0.3613, "step": 4130500 }, { "epoch": 27.954471632741445, "grad_norm": 0.36907705664634705, "learning_rate": 4.720455283672586e-05, "loss": 0.3615, "step": 4131000 }, { "epoch": 27.957855132091815, "grad_norm": 0.3607375919818878, "learning_rate": 4.720421448679082e-05, "loss": 0.3616, "step": 4131500 }, { "epoch": 27.961238631442182, "grad_norm": 0.3759130537509918, "learning_rate": 4.720387613685578e-05, "loss": 0.3604, "step": 4132000 }, { "epoch": 27.964622130792552, "grad_norm": 0.4064229130744934, "learning_rate": 4.720353778692075e-05, "loss": 0.3613, "step": 4132500 }, { "epoch": 27.96800563014292, "grad_norm": 0.40052372217178345, "learning_rate": 4.720319943698571e-05, "loss": 0.3624, "step": 4133000 }, { "epoch": 27.971389129493286, "grad_norm": 0.42291682958602905, "learning_rate": 4.7202861087050675e-05, "loss": 0.3634, "step": 4133500 }, { "epoch": 27.974772628843656, "grad_norm": 0.40400055050849915, "learning_rate": 4.720252273711564e-05, "loss": 0.3626, "step": 4134000 }, { "epoch": 27.978156128194023, "grad_norm": 0.38714271783828735, "learning_rate": 4.72021843871806e-05, "loss": 0.3623, "step": 4134500 }, { "epoch": 27.981539627544393, "grad_norm": 0.33237898349761963, "learning_rate": 4.720184603724556e-05, "loss": 0.3625, "step": 4135000 }, { "epoch": 27.98492312689476, "grad_norm": 0.3680419623851776, "learning_rate": 4.7201507687310524e-05, "loss": 0.3623, "step": 4135500 }, { "epoch": 27.988306626245127, "grad_norm": 0.38814762234687805, "learning_rate": 4.7201169337375486e-05, "loss": 0.3625, "step": 4136000 }, { "epoch": 27.991690125595497, "grad_norm": 0.37873101234436035, "learning_rate": 4.7200830987440455e-05, "loss": 0.3617, "step": 4136500 }, { "epoch": 27.995073624945864, "grad_norm": 0.4092039465904236, "learning_rate": 4.720049263750542e-05, "loss": 0.3621, "step": 4137000 }, { "epoch": 27.99845712429623, "grad_norm": 0.3645796775817871, "learning_rate": 4.720015428757038e-05, "loss": 0.3616, "step": 4137500 }, { "epoch": 28.0, "eval_accuracy": 0.8621847770921827, "eval_loss": 0.5612766146659851, "eval_runtime": 3363.4387, "eval_samples_per_second": 86.442, "eval_steps_per_second": 5.403, "step": 4137728 }, { "epoch": 28.0018406236466, "grad_norm": 0.43283811211586, "learning_rate": 4.719981593763534e-05, "loss": 0.3593, "step": 4138000 }, { "epoch": 28.005224122996967, "grad_norm": 0.37070658802986145, "learning_rate": 4.719947758770031e-05, "loss": 0.3594, "step": 4138500 }, { "epoch": 28.008607622347338, "grad_norm": 0.37569138407707214, "learning_rate": 4.7199139237765266e-05, "loss": 0.3597, "step": 4139000 }, { "epoch": 28.011991121697704, "grad_norm": 0.3612668514251709, "learning_rate": 4.719880088783023e-05, "loss": 0.3575, "step": 4139500 }, { "epoch": 28.01537462104807, "grad_norm": 0.38011425733566284, "learning_rate": 4.7198462537895197e-05, "loss": 0.3592, "step": 4140000 }, { "epoch": 28.01875812039844, "grad_norm": 0.3579834997653961, "learning_rate": 4.719812418796016e-05, "loss": 0.3612, "step": 4140500 }, { "epoch": 28.02214161974881, "grad_norm": 0.368886262178421, "learning_rate": 4.719778583802512e-05, "loss": 0.3603, "step": 4141000 }, { "epoch": 28.02552511909918, "grad_norm": 0.37519514560699463, "learning_rate": 4.719744748809008e-05, "loss": 0.3595, "step": 4141500 }, { "epoch": 28.028908618449545, "grad_norm": 0.37708401679992676, "learning_rate": 4.719710913815505e-05, "loss": 0.3607, "step": 4142000 }, { "epoch": 28.032292117799912, "grad_norm": 0.38011103868484497, "learning_rate": 4.7196770788220014e-05, "loss": 0.3596, "step": 4142500 }, { "epoch": 28.035675617150282, "grad_norm": 0.4065256416797638, "learning_rate": 4.7196432438284976e-05, "loss": 0.3593, "step": 4143000 }, { "epoch": 28.03905911650065, "grad_norm": 0.37354305386543274, "learning_rate": 4.719609408834993e-05, "loss": 0.3605, "step": 4143500 }, { "epoch": 28.04244261585102, "grad_norm": 0.38129520416259766, "learning_rate": 4.71957557384149e-05, "loss": 0.3614, "step": 4144000 }, { "epoch": 28.045826115201386, "grad_norm": 0.3501156270503998, "learning_rate": 4.719541738847986e-05, "loss": 0.3579, "step": 4144500 }, { "epoch": 28.049209614551753, "grad_norm": 0.4075695872306824, "learning_rate": 4.7195079038544825e-05, "loss": 0.3613, "step": 4145000 }, { "epoch": 28.052593113902123, "grad_norm": 0.3449297547340393, "learning_rate": 4.719474068860979e-05, "loss": 0.3591, "step": 4145500 }, { "epoch": 28.05597661325249, "grad_norm": 0.3645080327987671, "learning_rate": 4.7194402338674756e-05, "loss": 0.36, "step": 4146000 }, { "epoch": 28.059360112602857, "grad_norm": 0.3833574056625366, "learning_rate": 4.719406398873972e-05, "loss": 0.3594, "step": 4146500 }, { "epoch": 28.062743611953227, "grad_norm": 0.40159744024276733, "learning_rate": 4.719372563880468e-05, "loss": 0.3601, "step": 4147000 }, { "epoch": 28.066127111303594, "grad_norm": 0.38786768913269043, "learning_rate": 4.719338728886964e-05, "loss": 0.3607, "step": 4147500 }, { "epoch": 28.069510610653964, "grad_norm": 0.35397759079933167, "learning_rate": 4.719304893893461e-05, "loss": 0.36, "step": 4148000 }, { "epoch": 28.07289411000433, "grad_norm": 0.3652489483356476, "learning_rate": 4.7192710588999566e-05, "loss": 0.3604, "step": 4148500 }, { "epoch": 28.076277609354698, "grad_norm": 0.4199112057685852, "learning_rate": 4.719237223906453e-05, "loss": 0.3606, "step": 4149000 }, { "epoch": 28.079661108705068, "grad_norm": 0.40466582775115967, "learning_rate": 4.71920338891295e-05, "loss": 0.3602, "step": 4149500 }, { "epoch": 28.083044608055435, "grad_norm": 0.4131191074848175, "learning_rate": 4.719169553919446e-05, "loss": 0.3604, "step": 4150000 }, { "epoch": 28.086428107405805, "grad_norm": 0.38448336720466614, "learning_rate": 4.719135718925942e-05, "loss": 0.3593, "step": 4150500 }, { "epoch": 28.08981160675617, "grad_norm": 0.41221994161605835, "learning_rate": 4.7191018839324384e-05, "loss": 0.3606, "step": 4151000 }, { "epoch": 28.09319510610654, "grad_norm": 0.36168429255485535, "learning_rate": 4.719068048938935e-05, "loss": 0.3619, "step": 4151500 }, { "epoch": 28.09657860545691, "grad_norm": 0.36486026644706726, "learning_rate": 4.7190342139454315e-05, "loss": 0.3619, "step": 4152000 }, { "epoch": 28.099962104807275, "grad_norm": 0.3782370686531067, "learning_rate": 4.719000378951928e-05, "loss": 0.3608, "step": 4152500 }, { "epoch": 28.103345604157646, "grad_norm": 0.3529711067676544, "learning_rate": 4.718966543958423e-05, "loss": 0.3601, "step": 4153000 }, { "epoch": 28.106729103508012, "grad_norm": 0.3651941418647766, "learning_rate": 4.71893270896492e-05, "loss": 0.3595, "step": 4153500 }, { "epoch": 28.11011260285838, "grad_norm": 0.37990447878837585, "learning_rate": 4.718898873971416e-05, "loss": 0.3608, "step": 4154000 }, { "epoch": 28.11349610220875, "grad_norm": 0.3762222230434418, "learning_rate": 4.7188650389779125e-05, "loss": 0.3611, "step": 4154500 }, { "epoch": 28.116879601559116, "grad_norm": 0.3797217607498169, "learning_rate": 4.718831203984409e-05, "loss": 0.3605, "step": 4155000 }, { "epoch": 28.120263100909483, "grad_norm": 0.37965890765190125, "learning_rate": 4.7187973689909056e-05, "loss": 0.3616, "step": 4155500 }, { "epoch": 28.123646600259853, "grad_norm": 0.350475549697876, "learning_rate": 4.718763533997402e-05, "loss": 0.3596, "step": 4156000 }, { "epoch": 28.12703009961022, "grad_norm": 0.42017796635627747, "learning_rate": 4.718729699003898e-05, "loss": 0.3617, "step": 4156500 }, { "epoch": 28.13041359896059, "grad_norm": 0.3585604131221771, "learning_rate": 4.718695864010394e-05, "loss": 0.3606, "step": 4157000 }, { "epoch": 28.133797098310957, "grad_norm": 0.40403875708580017, "learning_rate": 4.718662029016891e-05, "loss": 0.3616, "step": 4157500 }, { "epoch": 28.137180597661324, "grad_norm": 0.3596263527870178, "learning_rate": 4.718628194023387e-05, "loss": 0.3589, "step": 4158000 }, { "epoch": 28.140564097011694, "grad_norm": 0.36147862672805786, "learning_rate": 4.718594359029883e-05, "loss": 0.3627, "step": 4158500 }, { "epoch": 28.14394759636206, "grad_norm": 0.3843778669834137, "learning_rate": 4.71856052403638e-05, "loss": 0.3594, "step": 4159000 }, { "epoch": 28.14733109571243, "grad_norm": 0.3639533817768097, "learning_rate": 4.718526689042876e-05, "loss": 0.3606, "step": 4159500 }, { "epoch": 28.150714595062798, "grad_norm": 0.3854668438434601, "learning_rate": 4.718492854049372e-05, "loss": 0.3591, "step": 4160000 }, { "epoch": 28.154098094413165, "grad_norm": 0.4005782902240753, "learning_rate": 4.7184590190558684e-05, "loss": 0.3603, "step": 4160500 }, { "epoch": 28.157481593763535, "grad_norm": 0.3792315423488617, "learning_rate": 4.718425184062365e-05, "loss": 0.3623, "step": 4161000 }, { "epoch": 28.1608650931139, "grad_norm": 0.38012492656707764, "learning_rate": 4.7183913490688615e-05, "loss": 0.3614, "step": 4161500 }, { "epoch": 28.16424859246427, "grad_norm": 0.35042500495910645, "learning_rate": 4.718357514075358e-05, "loss": 0.3605, "step": 4162000 }, { "epoch": 28.16763209181464, "grad_norm": 0.39792054891586304, "learning_rate": 4.718323679081853e-05, "loss": 0.3603, "step": 4162500 }, { "epoch": 28.171015591165006, "grad_norm": 0.37952131032943726, "learning_rate": 4.71828984408835e-05, "loss": 0.3612, "step": 4163000 }, { "epoch": 28.174399090515376, "grad_norm": 0.3857076168060303, "learning_rate": 4.7182560090948464e-05, "loss": 0.3603, "step": 4163500 }, { "epoch": 28.177782589865743, "grad_norm": 0.3405747413635254, "learning_rate": 4.7182221741013426e-05, "loss": 0.3615, "step": 4164000 }, { "epoch": 28.18116608921611, "grad_norm": 0.32335302233695984, "learning_rate": 4.718188339107839e-05, "loss": 0.3611, "step": 4164500 }, { "epoch": 28.18454958856648, "grad_norm": 0.4017068147659302, "learning_rate": 4.718154504114336e-05, "loss": 0.362, "step": 4165000 }, { "epoch": 28.187933087916846, "grad_norm": 0.3898669183254242, "learning_rate": 4.718120669120832e-05, "loss": 0.362, "step": 4165500 }, { "epoch": 28.191316587267217, "grad_norm": 0.40183600783348083, "learning_rate": 4.718086834127328e-05, "loss": 0.3612, "step": 4166000 }, { "epoch": 28.194700086617583, "grad_norm": 0.37525132298469543, "learning_rate": 4.718052999133824e-05, "loss": 0.3606, "step": 4166500 }, { "epoch": 28.19808358596795, "grad_norm": 0.38127073645591736, "learning_rate": 4.718019164140321e-05, "loss": 0.3613, "step": 4167000 }, { "epoch": 28.20146708531832, "grad_norm": 0.3760685324668884, "learning_rate": 4.717985329146817e-05, "loss": 0.3609, "step": 4167500 }, { "epoch": 28.204850584668687, "grad_norm": 0.361402690410614, "learning_rate": 4.717951494153313e-05, "loss": 0.3613, "step": 4168000 }, { "epoch": 28.208234084019058, "grad_norm": 0.3808455169200897, "learning_rate": 4.71791765915981e-05, "loss": 0.3607, "step": 4168500 }, { "epoch": 28.211617583369424, "grad_norm": 0.37132528424263, "learning_rate": 4.717883824166306e-05, "loss": 0.3615, "step": 4169000 }, { "epoch": 28.21500108271979, "grad_norm": 0.3817844092845917, "learning_rate": 4.717849989172802e-05, "loss": 0.3603, "step": 4169500 }, { "epoch": 28.21838458207016, "grad_norm": 0.35797831416130066, "learning_rate": 4.7178161541792985e-05, "loss": 0.3611, "step": 4170000 }, { "epoch": 28.221768081420528, "grad_norm": 0.41020745038986206, "learning_rate": 4.7177823191857954e-05, "loss": 0.362, "step": 4170500 }, { "epoch": 28.225151580770895, "grad_norm": 0.362379252910614, "learning_rate": 4.7177484841922916e-05, "loss": 0.3616, "step": 4171000 }, { "epoch": 28.228535080121265, "grad_norm": 0.34908509254455566, "learning_rate": 4.717714649198788e-05, "loss": 0.3621, "step": 4171500 }, { "epoch": 28.231918579471632, "grad_norm": 0.3852998614311218, "learning_rate": 4.717680814205283e-05, "loss": 0.3611, "step": 4172000 }, { "epoch": 28.235302078822002, "grad_norm": 0.39981764554977417, "learning_rate": 4.71764697921178e-05, "loss": 0.361, "step": 4172500 }, { "epoch": 28.23868557817237, "grad_norm": 0.3649141192436218, "learning_rate": 4.7176131442182764e-05, "loss": 0.3631, "step": 4173000 }, { "epoch": 28.242069077522736, "grad_norm": 0.3951915204524994, "learning_rate": 4.7175793092247726e-05, "loss": 0.3611, "step": 4173500 }, { "epoch": 28.245452576873106, "grad_norm": 0.39952680468559265, "learning_rate": 4.717545474231269e-05, "loss": 0.3612, "step": 4174000 }, { "epoch": 28.248836076223473, "grad_norm": 0.40479788184165955, "learning_rate": 4.717511639237766e-05, "loss": 0.362, "step": 4174500 }, { "epoch": 28.252219575573843, "grad_norm": 0.37462568283081055, "learning_rate": 4.717477804244262e-05, "loss": 0.3623, "step": 4175000 }, { "epoch": 28.25560307492421, "grad_norm": 0.3540719747543335, "learning_rate": 4.717443969250758e-05, "loss": 0.3613, "step": 4175500 }, { "epoch": 28.258986574274576, "grad_norm": 0.3863060474395752, "learning_rate": 4.7174101342572544e-05, "loss": 0.3619, "step": 4176000 }, { "epoch": 28.262370073624947, "grad_norm": 0.35178685188293457, "learning_rate": 4.717376299263751e-05, "loss": 0.3628, "step": 4176500 }, { "epoch": 28.265753572975314, "grad_norm": 0.37595534324645996, "learning_rate": 4.717342464270247e-05, "loss": 0.3619, "step": 4177000 }, { "epoch": 28.269137072325684, "grad_norm": 0.3724938631057739, "learning_rate": 4.717308629276743e-05, "loss": 0.3599, "step": 4177500 }, { "epoch": 28.27252057167605, "grad_norm": 0.3593493700027466, "learning_rate": 4.71727479428324e-05, "loss": 0.3622, "step": 4178000 }, { "epoch": 28.275904071026417, "grad_norm": 0.39943647384643555, "learning_rate": 4.717240959289736e-05, "loss": 0.3598, "step": 4178500 }, { "epoch": 28.279287570376788, "grad_norm": 0.3845526874065399, "learning_rate": 4.717207124296232e-05, "loss": 0.3605, "step": 4179000 }, { "epoch": 28.282671069727154, "grad_norm": 0.3796115219593048, "learning_rate": 4.7171732893027285e-05, "loss": 0.3631, "step": 4179500 }, { "epoch": 28.28605456907752, "grad_norm": 0.37783804535865784, "learning_rate": 4.7171394543092254e-05, "loss": 0.362, "step": 4180000 }, { "epoch": 28.28943806842789, "grad_norm": 0.3520191013813019, "learning_rate": 4.7171056193157216e-05, "loss": 0.3603, "step": 4180500 }, { "epoch": 28.292821567778258, "grad_norm": 0.39619338512420654, "learning_rate": 4.717071784322218e-05, "loss": 0.3605, "step": 4181000 }, { "epoch": 28.29620506712863, "grad_norm": 0.363335520029068, "learning_rate": 4.7170379493287134e-05, "loss": 0.3614, "step": 4181500 }, { "epoch": 28.299588566478995, "grad_norm": 0.3505692183971405, "learning_rate": 4.71700411433521e-05, "loss": 0.3594, "step": 4182000 }, { "epoch": 28.302972065829362, "grad_norm": 0.3883934020996094, "learning_rate": 4.7169702793417065e-05, "loss": 0.362, "step": 4182500 }, { "epoch": 28.306355565179732, "grad_norm": 0.35601288080215454, "learning_rate": 4.716936444348203e-05, "loss": 0.361, "step": 4183000 }, { "epoch": 28.3097390645301, "grad_norm": 0.3968811333179474, "learning_rate": 4.716902609354699e-05, "loss": 0.3612, "step": 4183500 }, { "epoch": 28.31312256388047, "grad_norm": 0.3789132535457611, "learning_rate": 4.716868774361196e-05, "loss": 0.3595, "step": 4184000 }, { "epoch": 28.316506063230836, "grad_norm": 0.39282628893852234, "learning_rate": 4.716834939367692e-05, "loss": 0.3607, "step": 4184500 }, { "epoch": 28.319889562581203, "grad_norm": 0.37454989552497864, "learning_rate": 4.716801104374188e-05, "loss": 0.3603, "step": 4185000 }, { "epoch": 28.323273061931573, "grad_norm": 0.412693053483963, "learning_rate": 4.7167672693806844e-05, "loss": 0.3603, "step": 4185500 }, { "epoch": 28.32665656128194, "grad_norm": 0.39129626750946045, "learning_rate": 4.716733434387181e-05, "loss": 0.3614, "step": 4186000 }, { "epoch": 28.330040060632307, "grad_norm": 0.3551253378391266, "learning_rate": 4.716699599393677e-05, "loss": 0.3607, "step": 4186500 }, { "epoch": 28.333423559982677, "grad_norm": 0.3845466375350952, "learning_rate": 4.716665764400173e-05, "loss": 0.3612, "step": 4187000 }, { "epoch": 28.336807059333044, "grad_norm": 0.39009878039360046, "learning_rate": 4.71663192940667e-05, "loss": 0.3614, "step": 4187500 }, { "epoch": 28.340190558683414, "grad_norm": 0.3824731409549713, "learning_rate": 4.716598094413166e-05, "loss": 0.3614, "step": 4188000 }, { "epoch": 28.34357405803378, "grad_norm": 0.37005728483200073, "learning_rate": 4.7165642594196624e-05, "loss": 0.3619, "step": 4188500 }, { "epoch": 28.346957557384147, "grad_norm": 0.4180995225906372, "learning_rate": 4.7165304244261586e-05, "loss": 0.3593, "step": 4189000 }, { "epoch": 28.350341056734518, "grad_norm": 0.3632233738899231, "learning_rate": 4.716496589432655e-05, "loss": 0.3615, "step": 4189500 }, { "epoch": 28.353724556084885, "grad_norm": 0.33749160170555115, "learning_rate": 4.716462754439152e-05, "loss": 0.361, "step": 4190000 }, { "epoch": 28.357108055435255, "grad_norm": 0.40541553497314453, "learning_rate": 4.716428919445648e-05, "loss": 0.3612, "step": 4190500 }, { "epoch": 28.36049155478562, "grad_norm": 0.3835623860359192, "learning_rate": 4.716395084452144e-05, "loss": 0.3612, "step": 4191000 }, { "epoch": 28.36387505413599, "grad_norm": 0.36845603585243225, "learning_rate": 4.7163612494586403e-05, "loss": 0.3627, "step": 4191500 }, { "epoch": 28.36725855348636, "grad_norm": 0.3849983215332031, "learning_rate": 4.7163274144651366e-05, "loss": 0.3623, "step": 4192000 }, { "epoch": 28.370642052836725, "grad_norm": 0.35379183292388916, "learning_rate": 4.716293579471633e-05, "loss": 0.3613, "step": 4192500 }, { "epoch": 28.374025552187096, "grad_norm": 0.3451194167137146, "learning_rate": 4.716259744478129e-05, "loss": 0.3609, "step": 4193000 }, { "epoch": 28.377409051537462, "grad_norm": 0.420551598072052, "learning_rate": 4.716225909484626e-05, "loss": 0.3626, "step": 4193500 }, { "epoch": 28.38079255088783, "grad_norm": 0.3557862341403961, "learning_rate": 4.716192074491122e-05, "loss": 0.3604, "step": 4194000 }, { "epoch": 28.3841760502382, "grad_norm": 0.35886773467063904, "learning_rate": 4.716158239497618e-05, "loss": 0.36, "step": 4194500 }, { "epoch": 28.387559549588566, "grad_norm": 0.35705262422561646, "learning_rate": 4.7161244045041145e-05, "loss": 0.3617, "step": 4195000 }, { "epoch": 28.390943048938933, "grad_norm": 0.3624837100505829, "learning_rate": 4.7160905695106114e-05, "loss": 0.3606, "step": 4195500 }, { "epoch": 28.394326548289303, "grad_norm": 0.3872695565223694, "learning_rate": 4.716056734517107e-05, "loss": 0.3625, "step": 4196000 }, { "epoch": 28.39771004763967, "grad_norm": 0.35200896859169006, "learning_rate": 4.716022899523603e-05, "loss": 0.3609, "step": 4196500 }, { "epoch": 28.40109354699004, "grad_norm": 0.3648911714553833, "learning_rate": 4.7159890645301e-05, "loss": 0.362, "step": 4197000 }, { "epoch": 28.404477046340407, "grad_norm": 0.3744070827960968, "learning_rate": 4.715955229536596e-05, "loss": 0.36, "step": 4197500 }, { "epoch": 28.407860545690774, "grad_norm": 0.35949233174324036, "learning_rate": 4.7159213945430925e-05, "loss": 0.3612, "step": 4198000 }, { "epoch": 28.411244045041144, "grad_norm": 0.3645084798336029, "learning_rate": 4.715887559549589e-05, "loss": 0.3608, "step": 4198500 }, { "epoch": 28.41462754439151, "grad_norm": 0.37557467818260193, "learning_rate": 4.715853724556085e-05, "loss": 0.3607, "step": 4199000 }, { "epoch": 28.41801104374188, "grad_norm": 0.39893093705177307, "learning_rate": 4.715819889562582e-05, "loss": 0.3615, "step": 4199500 }, { "epoch": 28.421394543092248, "grad_norm": 0.3806627094745636, "learning_rate": 4.715786054569078e-05, "loss": 0.3632, "step": 4200000 }, { "epoch": 28.424778042442615, "grad_norm": 0.40269455313682556, "learning_rate": 4.715752219575574e-05, "loss": 0.3608, "step": 4200500 }, { "epoch": 28.428161541792985, "grad_norm": 0.34466657042503357, "learning_rate": 4.7157183845820704e-05, "loss": 0.3606, "step": 4201000 }, { "epoch": 28.43154504114335, "grad_norm": 0.41121411323547363, "learning_rate": 4.7156845495885666e-05, "loss": 0.362, "step": 4201500 }, { "epoch": 28.434928540493722, "grad_norm": 0.41764146089553833, "learning_rate": 4.715650714595063e-05, "loss": 0.3622, "step": 4202000 }, { "epoch": 28.43831203984409, "grad_norm": 0.3867233097553253, "learning_rate": 4.715616879601559e-05, "loss": 0.363, "step": 4202500 }, { "epoch": 28.441695539194455, "grad_norm": 0.3862224817276001, "learning_rate": 4.715583044608056e-05, "loss": 0.3617, "step": 4203000 }, { "epoch": 28.445079038544826, "grad_norm": 0.38365626335144043, "learning_rate": 4.715549209614552e-05, "loss": 0.3611, "step": 4203500 }, { "epoch": 28.448462537895193, "grad_norm": 0.3457985520362854, "learning_rate": 4.7155153746210484e-05, "loss": 0.3601, "step": 4204000 }, { "epoch": 28.45184603724556, "grad_norm": 0.35866719484329224, "learning_rate": 4.7154815396275446e-05, "loss": 0.3613, "step": 4204500 }, { "epoch": 28.45522953659593, "grad_norm": 0.3741433024406433, "learning_rate": 4.7154477046340415e-05, "loss": 0.3608, "step": 4205000 }, { "epoch": 28.458613035946296, "grad_norm": 0.42533352971076965, "learning_rate": 4.715413869640537e-05, "loss": 0.3623, "step": 4205500 }, { "epoch": 28.461996535296667, "grad_norm": 0.3592928946018219, "learning_rate": 4.715380034647033e-05, "loss": 0.3619, "step": 4206000 }, { "epoch": 28.465380034647033, "grad_norm": 0.36465469002723694, "learning_rate": 4.7153461996535294e-05, "loss": 0.3634, "step": 4206500 }, { "epoch": 28.4687635339974, "grad_norm": 0.36585113406181335, "learning_rate": 4.715312364660026e-05, "loss": 0.361, "step": 4207000 }, { "epoch": 28.47214703334777, "grad_norm": 0.38405710458755493, "learning_rate": 4.7152785296665225e-05, "loss": 0.3608, "step": 4207500 }, { "epoch": 28.475530532698137, "grad_norm": 0.39637649059295654, "learning_rate": 4.715244694673019e-05, "loss": 0.3617, "step": 4208000 }, { "epoch": 28.478914032048507, "grad_norm": 0.37295740842819214, "learning_rate": 4.715210859679515e-05, "loss": 0.3614, "step": 4208500 }, { "epoch": 28.482297531398874, "grad_norm": 0.3904971480369568, "learning_rate": 4.715177024686012e-05, "loss": 0.3621, "step": 4209000 }, { "epoch": 28.48568103074924, "grad_norm": 0.35155045986175537, "learning_rate": 4.715143189692508e-05, "loss": 0.3609, "step": 4209500 }, { "epoch": 28.48906453009961, "grad_norm": 0.355697363615036, "learning_rate": 4.715109354699004e-05, "loss": 0.3605, "step": 4210000 }, { "epoch": 28.492448029449978, "grad_norm": 0.3459162414073944, "learning_rate": 4.7150755197055005e-05, "loss": 0.3626, "step": 4210500 }, { "epoch": 28.495831528800345, "grad_norm": 0.36261093616485596, "learning_rate": 4.715041684711997e-05, "loss": 0.3612, "step": 4211000 }, { "epoch": 28.499215028150715, "grad_norm": 0.3706207573413849, "learning_rate": 4.715007849718493e-05, "loss": 0.3611, "step": 4211500 }, { "epoch": 28.502598527501082, "grad_norm": 0.39113545417785645, "learning_rate": 4.714974014724989e-05, "loss": 0.3632, "step": 4212000 }, { "epoch": 28.505982026851452, "grad_norm": 0.388141393661499, "learning_rate": 4.714940179731486e-05, "loss": 0.3615, "step": 4212500 }, { "epoch": 28.50936552620182, "grad_norm": 0.35941025614738464, "learning_rate": 4.714906344737982e-05, "loss": 0.3619, "step": 4213000 }, { "epoch": 28.512749025552186, "grad_norm": 0.39670437574386597, "learning_rate": 4.7148725097444784e-05, "loss": 0.3609, "step": 4213500 }, { "epoch": 28.516132524902556, "grad_norm": 0.41994255781173706, "learning_rate": 4.7148386747509746e-05, "loss": 0.3628, "step": 4214000 }, { "epoch": 28.519516024252923, "grad_norm": 0.41873568296432495, "learning_rate": 4.7148048397574715e-05, "loss": 0.3601, "step": 4214500 }, { "epoch": 28.522899523603293, "grad_norm": 0.36655986309051514, "learning_rate": 4.714771004763967e-05, "loss": 0.3627, "step": 4215000 }, { "epoch": 28.52628302295366, "grad_norm": 0.38597121834754944, "learning_rate": 4.714737169770463e-05, "loss": 0.3624, "step": 4215500 }, { "epoch": 28.529666522304026, "grad_norm": 0.34690356254577637, "learning_rate": 4.7147033347769595e-05, "loss": 0.3618, "step": 4216000 }, { "epoch": 28.533050021654397, "grad_norm": 0.3700477182865143, "learning_rate": 4.7146694997834564e-05, "loss": 0.3618, "step": 4216500 }, { "epoch": 28.536433521004763, "grad_norm": 0.38068437576293945, "learning_rate": 4.7146356647899526e-05, "loss": 0.3598, "step": 4217000 }, { "epoch": 28.539817020355134, "grad_norm": 0.4197213053703308, "learning_rate": 4.714601829796449e-05, "loss": 0.3639, "step": 4217500 }, { "epoch": 28.5432005197055, "grad_norm": 0.4043872356414795, "learning_rate": 4.714567994802945e-05, "loss": 0.3612, "step": 4218000 }, { "epoch": 28.546584019055867, "grad_norm": 0.38576143980026245, "learning_rate": 4.714534159809442e-05, "loss": 0.3617, "step": 4218500 }, { "epoch": 28.549967518406238, "grad_norm": 0.3723194897174835, "learning_rate": 4.714500324815938e-05, "loss": 0.3611, "step": 4219000 }, { "epoch": 28.553351017756604, "grad_norm": 0.3344508707523346, "learning_rate": 4.714466489822434e-05, "loss": 0.3618, "step": 4219500 }, { "epoch": 28.55673451710697, "grad_norm": 0.3709132969379425, "learning_rate": 4.7144326548289305e-05, "loss": 0.361, "step": 4220000 }, { "epoch": 28.56011801645734, "grad_norm": 0.3418281078338623, "learning_rate": 4.714398819835427e-05, "loss": 0.3599, "step": 4220500 }, { "epoch": 28.563501515807708, "grad_norm": 0.34958213567733765, "learning_rate": 4.714364984841923e-05, "loss": 0.3613, "step": 4221000 }, { "epoch": 28.56688501515808, "grad_norm": 0.36479049921035767, "learning_rate": 4.714331149848419e-05, "loss": 0.3604, "step": 4221500 }, { "epoch": 28.570268514508445, "grad_norm": 0.4019864797592163, "learning_rate": 4.714297314854916e-05, "loss": 0.361, "step": 4222000 }, { "epoch": 28.573652013858812, "grad_norm": 0.37898576259613037, "learning_rate": 4.714263479861412e-05, "loss": 0.361, "step": 4222500 }, { "epoch": 28.577035513209182, "grad_norm": 0.4045998454093933, "learning_rate": 4.7142296448679085e-05, "loss": 0.3622, "step": 4223000 }, { "epoch": 28.58041901255955, "grad_norm": 0.3739849925041199, "learning_rate": 4.714195809874405e-05, "loss": 0.3619, "step": 4223500 }, { "epoch": 28.58380251190992, "grad_norm": 0.3683318793773651, "learning_rate": 4.7141619748809016e-05, "loss": 0.3617, "step": 4224000 }, { "epoch": 28.587186011260286, "grad_norm": 0.3913784325122833, "learning_rate": 4.714128139887397e-05, "loss": 0.3627, "step": 4224500 }, { "epoch": 28.590569510610653, "grad_norm": 0.39089998602867126, "learning_rate": 4.714094304893893e-05, "loss": 0.3604, "step": 4225000 }, { "epoch": 28.593953009961023, "grad_norm": 0.4031013548374176, "learning_rate": 4.7140604699003895e-05, "loss": 0.3618, "step": 4225500 }, { "epoch": 28.59733650931139, "grad_norm": 0.36095115542411804, "learning_rate": 4.7140266349068864e-05, "loss": 0.3619, "step": 4226000 }, { "epoch": 28.60072000866176, "grad_norm": 0.37524983286857605, "learning_rate": 4.7139927999133826e-05, "loss": 0.3612, "step": 4226500 }, { "epoch": 28.604103508012127, "grad_norm": 0.3565714955329895, "learning_rate": 4.713958964919879e-05, "loss": 0.3608, "step": 4227000 }, { "epoch": 28.607487007362494, "grad_norm": 0.35314592719078064, "learning_rate": 4.713925129926375e-05, "loss": 0.3618, "step": 4227500 }, { "epoch": 28.610870506712864, "grad_norm": 0.3785061538219452, "learning_rate": 4.713891294932872e-05, "loss": 0.3619, "step": 4228000 }, { "epoch": 28.61425400606323, "grad_norm": 0.40104159712791443, "learning_rate": 4.713857459939368e-05, "loss": 0.3612, "step": 4228500 }, { "epoch": 28.617637505413597, "grad_norm": 0.373701274394989, "learning_rate": 4.7138236249458644e-05, "loss": 0.3609, "step": 4229000 }, { "epoch": 28.621021004763968, "grad_norm": 0.3659369647502899, "learning_rate": 4.7137897899523606e-05, "loss": 0.3607, "step": 4229500 }, { "epoch": 28.624404504114334, "grad_norm": 0.3658003807067871, "learning_rate": 4.713755954958857e-05, "loss": 0.3601, "step": 4230000 }, { "epoch": 28.627788003464705, "grad_norm": 0.3933974802494049, "learning_rate": 4.713722119965353e-05, "loss": 0.3629, "step": 4230500 }, { "epoch": 28.63117150281507, "grad_norm": 0.351081907749176, "learning_rate": 4.713688284971849e-05, "loss": 0.3621, "step": 4231000 }, { "epoch": 28.63455500216544, "grad_norm": 0.3646312952041626, "learning_rate": 4.713654449978346e-05, "loss": 0.3607, "step": 4231500 }, { "epoch": 28.63793850151581, "grad_norm": 0.3640258312225342, "learning_rate": 4.713620614984842e-05, "loss": 0.3619, "step": 4232000 }, { "epoch": 28.641322000866175, "grad_norm": 0.38643383979797363, "learning_rate": 4.7135867799913385e-05, "loss": 0.3611, "step": 4232500 }, { "epoch": 28.644705500216546, "grad_norm": 0.3771829903125763, "learning_rate": 4.713552944997835e-05, "loss": 0.3608, "step": 4233000 }, { "epoch": 28.648088999566912, "grad_norm": 0.3571754992008209, "learning_rate": 4.7135191100043317e-05, "loss": 0.3625, "step": 4233500 }, { "epoch": 28.65147249891728, "grad_norm": 0.3494188189506531, "learning_rate": 4.713485275010827e-05, "loss": 0.362, "step": 4234000 }, { "epoch": 28.65485599826765, "grad_norm": 0.3332378566265106, "learning_rate": 4.7134514400173234e-05, "loss": 0.3616, "step": 4234500 }, { "epoch": 28.658239497618016, "grad_norm": 0.3812815248966217, "learning_rate": 4.7134176050238196e-05, "loss": 0.3602, "step": 4235000 }, { "epoch": 28.661622996968383, "grad_norm": 0.3675067722797394, "learning_rate": 4.7133837700303165e-05, "loss": 0.3605, "step": 4235500 }, { "epoch": 28.665006496318753, "grad_norm": 0.38357430696487427, "learning_rate": 4.713349935036813e-05, "loss": 0.3605, "step": 4236000 }, { "epoch": 28.66838999566912, "grad_norm": 0.3515216112136841, "learning_rate": 4.713316100043309e-05, "loss": 0.3624, "step": 4236500 }, { "epoch": 28.67177349501949, "grad_norm": 0.36064037680625916, "learning_rate": 4.713282265049805e-05, "loss": 0.3605, "step": 4237000 }, { "epoch": 28.675156994369857, "grad_norm": 0.40382781624794006, "learning_rate": 4.713248430056302e-05, "loss": 0.3627, "step": 4237500 }, { "epoch": 28.678540493720224, "grad_norm": 0.3234970271587372, "learning_rate": 4.713214595062798e-05, "loss": 0.3633, "step": 4238000 }, { "epoch": 28.681923993070594, "grad_norm": 0.40176334977149963, "learning_rate": 4.7131807600692945e-05, "loss": 0.3616, "step": 4238500 }, { "epoch": 28.68530749242096, "grad_norm": 0.3842661678791046, "learning_rate": 4.713146925075791e-05, "loss": 0.3626, "step": 4239000 }, { "epoch": 28.68869099177133, "grad_norm": 0.4055143892765045, "learning_rate": 4.713113090082287e-05, "loss": 0.3604, "step": 4239500 }, { "epoch": 28.692074491121698, "grad_norm": 0.3751370310783386, "learning_rate": 4.713079255088783e-05, "loss": 0.3598, "step": 4240000 }, { "epoch": 28.695457990472065, "grad_norm": 0.43553754687309265, "learning_rate": 4.713045420095279e-05, "loss": 0.362, "step": 4240500 }, { "epoch": 28.698841489822435, "grad_norm": 0.3492582142353058, "learning_rate": 4.713011585101776e-05, "loss": 0.3609, "step": 4241000 }, { "epoch": 28.7022249891728, "grad_norm": 0.36575913429260254, "learning_rate": 4.7129777501082724e-05, "loss": 0.3612, "step": 4241500 }, { "epoch": 28.70560848852317, "grad_norm": 0.34025856852531433, "learning_rate": 4.7129439151147686e-05, "loss": 0.3612, "step": 4242000 }, { "epoch": 28.70899198787354, "grad_norm": 0.36681750416755676, "learning_rate": 4.712910080121265e-05, "loss": 0.3617, "step": 4242500 }, { "epoch": 28.712375487223905, "grad_norm": 0.3458787798881531, "learning_rate": 4.712876245127762e-05, "loss": 0.3598, "step": 4243000 }, { "epoch": 28.715758986574276, "grad_norm": 0.3240845203399658, "learning_rate": 4.712842410134258e-05, "loss": 0.3609, "step": 4243500 }, { "epoch": 28.719142485924642, "grad_norm": 0.3944730758666992, "learning_rate": 4.7128085751407535e-05, "loss": 0.3633, "step": 4244000 }, { "epoch": 28.72252598527501, "grad_norm": 0.3892706036567688, "learning_rate": 4.71277474014725e-05, "loss": 0.3604, "step": 4244500 }, { "epoch": 28.72590948462538, "grad_norm": 0.3503298759460449, "learning_rate": 4.7127409051537466e-05, "loss": 0.3617, "step": 4245000 }, { "epoch": 28.729292983975746, "grad_norm": 0.3950233459472656, "learning_rate": 4.712707070160243e-05, "loss": 0.3618, "step": 4245500 }, { "epoch": 28.732676483326117, "grad_norm": 0.3659052550792694, "learning_rate": 4.712673235166739e-05, "loss": 0.361, "step": 4246000 }, { "epoch": 28.736059982676483, "grad_norm": 0.37944474816322327, "learning_rate": 4.712639400173235e-05, "loss": 0.3615, "step": 4246500 }, { "epoch": 28.73944348202685, "grad_norm": 0.32573771476745605, "learning_rate": 4.712605565179732e-05, "loss": 0.3617, "step": 4247000 }, { "epoch": 28.74282698137722, "grad_norm": 0.3730095326900482, "learning_rate": 4.712571730186228e-05, "loss": 0.3613, "step": 4247500 }, { "epoch": 28.746210480727587, "grad_norm": 0.3818854093551636, "learning_rate": 4.7125378951927245e-05, "loss": 0.3619, "step": 4248000 }, { "epoch": 28.749593980077957, "grad_norm": 0.3606642186641693, "learning_rate": 4.712504060199221e-05, "loss": 0.3616, "step": 4248500 }, { "epoch": 28.752977479428324, "grad_norm": 0.408157616853714, "learning_rate": 4.712470225205717e-05, "loss": 0.359, "step": 4249000 }, { "epoch": 28.75636097877869, "grad_norm": 0.3866164982318878, "learning_rate": 4.712436390212213e-05, "loss": 0.3625, "step": 4249500 }, { "epoch": 28.75974447812906, "grad_norm": 0.3604215979576111, "learning_rate": 4.7124025552187094e-05, "loss": 0.3594, "step": 4250000 }, { "epoch": 28.763127977479428, "grad_norm": 0.3600050210952759, "learning_rate": 4.712368720225206e-05, "loss": 0.3633, "step": 4250500 }, { "epoch": 28.7665114768298, "grad_norm": 0.39066022634506226, "learning_rate": 4.7123348852317025e-05, "loss": 0.3622, "step": 4251000 }, { "epoch": 28.769894976180165, "grad_norm": 0.41124704480171204, "learning_rate": 4.712301050238199e-05, "loss": 0.3624, "step": 4251500 }, { "epoch": 28.77327847553053, "grad_norm": 0.32621800899505615, "learning_rate": 4.712267215244695e-05, "loss": 0.363, "step": 4252000 }, { "epoch": 28.776661974880902, "grad_norm": 0.3741280436515808, "learning_rate": 4.712233380251191e-05, "loss": 0.3625, "step": 4252500 }, { "epoch": 28.78004547423127, "grad_norm": 0.4018608629703522, "learning_rate": 4.712199545257688e-05, "loss": 0.3612, "step": 4253000 }, { "epoch": 28.783428973581636, "grad_norm": 0.4042356610298157, "learning_rate": 4.7121657102641835e-05, "loss": 0.36, "step": 4253500 }, { "epoch": 28.786812472932006, "grad_norm": 0.34805935621261597, "learning_rate": 4.71213187527068e-05, "loss": 0.3617, "step": 4254000 }, { "epoch": 28.790195972282373, "grad_norm": 0.4501422047615051, "learning_rate": 4.7120980402771766e-05, "loss": 0.3628, "step": 4254500 }, { "epoch": 28.793579471632743, "grad_norm": 0.4037505090236664, "learning_rate": 4.712064205283673e-05, "loss": 0.3604, "step": 4255000 }, { "epoch": 28.79696297098311, "grad_norm": 0.3673800528049469, "learning_rate": 4.712030370290169e-05, "loss": 0.3615, "step": 4255500 }, { "epoch": 28.800346470333476, "grad_norm": 0.35557159781455994, "learning_rate": 4.711996535296665e-05, "loss": 0.3591, "step": 4256000 }, { "epoch": 28.803729969683847, "grad_norm": 0.36397409439086914, "learning_rate": 4.711962700303162e-05, "loss": 0.3614, "step": 4256500 }, { "epoch": 28.807113469034213, "grad_norm": 0.40870311856269836, "learning_rate": 4.7119288653096584e-05, "loss": 0.3626, "step": 4257000 }, { "epoch": 28.810496968384584, "grad_norm": 0.36634010076522827, "learning_rate": 4.7118950303161546e-05, "loss": 0.3618, "step": 4257500 }, { "epoch": 28.81388046773495, "grad_norm": 0.33262279629707336, "learning_rate": 4.711861195322651e-05, "loss": 0.362, "step": 4258000 }, { "epoch": 28.817263967085317, "grad_norm": 0.36668986082077026, "learning_rate": 4.711827360329147e-05, "loss": 0.3614, "step": 4258500 }, { "epoch": 28.820647466435688, "grad_norm": 0.34490931034088135, "learning_rate": 4.711793525335643e-05, "loss": 0.3617, "step": 4259000 }, { "epoch": 28.824030965786054, "grad_norm": 0.3578276038169861, "learning_rate": 4.7117596903421394e-05, "loss": 0.3628, "step": 4259500 }, { "epoch": 28.82741446513642, "grad_norm": 0.3873530924320221, "learning_rate": 4.7117258553486356e-05, "loss": 0.3608, "step": 4260000 }, { "epoch": 28.83079796448679, "grad_norm": 0.3942200541496277, "learning_rate": 4.7116920203551325e-05, "loss": 0.3613, "step": 4260500 }, { "epoch": 28.834181463837158, "grad_norm": 0.3761383295059204, "learning_rate": 4.711658185361629e-05, "loss": 0.3613, "step": 4261000 }, { "epoch": 28.83756496318753, "grad_norm": 0.3999760150909424, "learning_rate": 4.711624350368125e-05, "loss": 0.3618, "step": 4261500 }, { "epoch": 28.840948462537895, "grad_norm": 0.36837732791900635, "learning_rate": 4.711590515374621e-05, "loss": 0.3629, "step": 4262000 }, { "epoch": 28.844331961888262, "grad_norm": 0.3617350459098816, "learning_rate": 4.711556680381118e-05, "loss": 0.361, "step": 4262500 }, { "epoch": 28.847715461238632, "grad_norm": 0.3469606339931488, "learning_rate": 4.7115228453876136e-05, "loss": 0.3617, "step": 4263000 }, { "epoch": 28.851098960589, "grad_norm": 0.3613704442977905, "learning_rate": 4.71148901039411e-05, "loss": 0.3617, "step": 4263500 }, { "epoch": 28.85448245993937, "grad_norm": 0.40201789140701294, "learning_rate": 4.711455175400607e-05, "loss": 0.361, "step": 4264000 }, { "epoch": 28.857865959289736, "grad_norm": 0.44502681493759155, "learning_rate": 4.711421340407103e-05, "loss": 0.3612, "step": 4264500 }, { "epoch": 28.861249458640103, "grad_norm": 0.34542232751846313, "learning_rate": 4.711387505413599e-05, "loss": 0.3618, "step": 4265000 }, { "epoch": 28.864632957990473, "grad_norm": 0.3687632381916046, "learning_rate": 4.711353670420095e-05, "loss": 0.3617, "step": 4265500 }, { "epoch": 28.86801645734084, "grad_norm": 0.3720606565475464, "learning_rate": 4.711319835426592e-05, "loss": 0.3617, "step": 4266000 }, { "epoch": 28.871399956691207, "grad_norm": 0.35069751739501953, "learning_rate": 4.7112860004330884e-05, "loss": 0.3612, "step": 4266500 }, { "epoch": 28.874783456041577, "grad_norm": 0.36152008175849915, "learning_rate": 4.7112521654395846e-05, "loss": 0.3615, "step": 4267000 }, { "epoch": 28.878166955391944, "grad_norm": 0.3989510238170624, "learning_rate": 4.711218330446081e-05, "loss": 0.361, "step": 4267500 }, { "epoch": 28.881550454742314, "grad_norm": 0.41541945934295654, "learning_rate": 4.711184495452577e-05, "loss": 0.3612, "step": 4268000 }, { "epoch": 28.88493395409268, "grad_norm": 0.39647066593170166, "learning_rate": 4.711150660459073e-05, "loss": 0.3614, "step": 4268500 }, { "epoch": 28.888317453443047, "grad_norm": 0.35431623458862305, "learning_rate": 4.7111168254655695e-05, "loss": 0.3627, "step": 4269000 }, { "epoch": 28.891700952793418, "grad_norm": 0.3839551508426666, "learning_rate": 4.711082990472066e-05, "loss": 0.3601, "step": 4269500 }, { "epoch": 28.895084452143784, "grad_norm": 0.3508188724517822, "learning_rate": 4.7110491554785626e-05, "loss": 0.3594, "step": 4270000 }, { "epoch": 28.898467951494155, "grad_norm": 0.36134690046310425, "learning_rate": 4.711015320485059e-05, "loss": 0.3614, "step": 4270500 }, { "epoch": 28.90185145084452, "grad_norm": 0.36609122157096863, "learning_rate": 4.710981485491555e-05, "loss": 0.361, "step": 4271000 }, { "epoch": 28.905234950194888, "grad_norm": 0.3730999231338501, "learning_rate": 4.710947650498051e-05, "loss": 0.3617, "step": 4271500 }, { "epoch": 28.90861844954526, "grad_norm": 0.41123491525650024, "learning_rate": 4.710913815504548e-05, "loss": 0.3614, "step": 4272000 }, { "epoch": 28.912001948895625, "grad_norm": 0.3542698323726654, "learning_rate": 4.7108799805110437e-05, "loss": 0.3612, "step": 4272500 }, { "epoch": 28.915385448245996, "grad_norm": 0.4087660014629364, "learning_rate": 4.71084614551754e-05, "loss": 0.3608, "step": 4273000 }, { "epoch": 28.918768947596362, "grad_norm": 0.37405553460121155, "learning_rate": 4.710812310524037e-05, "loss": 0.3616, "step": 4273500 }, { "epoch": 28.92215244694673, "grad_norm": 0.3734908401966095, "learning_rate": 4.710778475530533e-05, "loss": 0.3616, "step": 4274000 }, { "epoch": 28.9255359462971, "grad_norm": 0.37422364950180054, "learning_rate": 4.710744640537029e-05, "loss": 0.3629, "step": 4274500 }, { "epoch": 28.928919445647466, "grad_norm": 0.3265927731990814, "learning_rate": 4.7107108055435254e-05, "loss": 0.3601, "step": 4275000 }, { "epoch": 28.932302944997836, "grad_norm": 0.3842279314994812, "learning_rate": 4.710676970550022e-05, "loss": 0.3611, "step": 4275500 }, { "epoch": 28.935686444348203, "grad_norm": 0.3847258388996124, "learning_rate": 4.7106431355565185e-05, "loss": 0.3616, "step": 4276000 }, { "epoch": 28.93906994369857, "grad_norm": 0.4233771860599518, "learning_rate": 4.710609300563015e-05, "loss": 0.3609, "step": 4276500 }, { "epoch": 28.94245344304894, "grad_norm": 0.38106000423431396, "learning_rate": 4.71057546556951e-05, "loss": 0.3624, "step": 4277000 }, { "epoch": 28.945836942399307, "grad_norm": 0.3772009015083313, "learning_rate": 4.710541630576007e-05, "loss": 0.3609, "step": 4277500 }, { "epoch": 28.949220441749674, "grad_norm": 0.34100696444511414, "learning_rate": 4.7105077955825033e-05, "loss": 0.362, "step": 4278000 }, { "epoch": 28.952603941100044, "grad_norm": 0.3932490646839142, "learning_rate": 4.7104739605889996e-05, "loss": 0.3628, "step": 4278500 }, { "epoch": 28.95598744045041, "grad_norm": 0.36704206466674805, "learning_rate": 4.710440125595496e-05, "loss": 0.3617, "step": 4279000 }, { "epoch": 28.95937093980078, "grad_norm": 0.393768846988678, "learning_rate": 4.7104062906019927e-05, "loss": 0.3609, "step": 4279500 }, { "epoch": 28.962754439151148, "grad_norm": 0.40044960379600525, "learning_rate": 4.710372455608489e-05, "loss": 0.3613, "step": 4280000 }, { "epoch": 28.966137938501515, "grad_norm": 0.37255147099494934, "learning_rate": 4.710338620614985e-05, "loss": 0.3619, "step": 4280500 }, { "epoch": 28.969521437851885, "grad_norm": 0.4047803580760956, "learning_rate": 4.710304785621481e-05, "loss": 0.3612, "step": 4281000 }, { "epoch": 28.97290493720225, "grad_norm": 0.3823975920677185, "learning_rate": 4.710270950627978e-05, "loss": 0.3605, "step": 4281500 }, { "epoch": 28.976288436552622, "grad_norm": 0.4176645874977112, "learning_rate": 4.710237115634474e-05, "loss": 0.3621, "step": 4282000 }, { "epoch": 28.97967193590299, "grad_norm": 0.3672531843185425, "learning_rate": 4.71020328064097e-05, "loss": 0.3615, "step": 4282500 }, { "epoch": 28.983055435253355, "grad_norm": 0.36267173290252686, "learning_rate": 4.710169445647467e-05, "loss": 0.3616, "step": 4283000 }, { "epoch": 28.986438934603726, "grad_norm": 0.3600069582462311, "learning_rate": 4.710135610653963e-05, "loss": 0.3616, "step": 4283500 }, { "epoch": 28.989822433954092, "grad_norm": 0.3569662272930145, "learning_rate": 4.710101775660459e-05, "loss": 0.3609, "step": 4284000 }, { "epoch": 28.99320593330446, "grad_norm": 0.3485254645347595, "learning_rate": 4.7100679406669555e-05, "loss": 0.3608, "step": 4284500 }, { "epoch": 28.99658943265483, "grad_norm": 0.4217069149017334, "learning_rate": 4.7100341056734523e-05, "loss": 0.362, "step": 4285000 }, { "epoch": 28.999972932005196, "grad_norm": 0.36723482608795166, "learning_rate": 4.7100002706799486e-05, "loss": 0.3631, "step": 4285500 }, { "epoch": 29.0, "eval_accuracy": 0.8623626838338284, "eval_loss": 0.5590023398399353, "eval_runtime": 3353.1542, "eval_samples_per_second": 86.708, "eval_steps_per_second": 5.419, "step": 4285504 }, { "epoch": 29.003356431355567, "grad_norm": 0.37278684973716736, "learning_rate": 4.709966435686445e-05, "loss": 0.3605, "step": 4286000 }, { "epoch": 29.006739930705933, "grad_norm": 0.3531095087528229, "learning_rate": 4.70993260069294e-05, "loss": 0.3574, "step": 4286500 }, { "epoch": 29.0101234300563, "grad_norm": 0.35951194167137146, "learning_rate": 4.709898765699437e-05, "loss": 0.3575, "step": 4287000 }, { "epoch": 29.01350692940667, "grad_norm": 0.4008044898509979, "learning_rate": 4.7098649307059334e-05, "loss": 0.36, "step": 4287500 }, { "epoch": 29.016890428757037, "grad_norm": 0.34333372116088867, "learning_rate": 4.7098310957124296e-05, "loss": 0.3602, "step": 4288000 }, { "epoch": 29.020273928107407, "grad_norm": 0.34748372435569763, "learning_rate": 4.709797260718926e-05, "loss": 0.3594, "step": 4288500 }, { "epoch": 29.023657427457774, "grad_norm": 0.3687286972999573, "learning_rate": 4.709763425725423e-05, "loss": 0.3606, "step": 4289000 }, { "epoch": 29.02704092680814, "grad_norm": 0.37696537375450134, "learning_rate": 4.709729590731919e-05, "loss": 0.3597, "step": 4289500 }, { "epoch": 29.03042442615851, "grad_norm": 0.3552592992782593, "learning_rate": 4.709695755738415e-05, "loss": 0.36, "step": 4290000 }, { "epoch": 29.033807925508878, "grad_norm": 0.3937763571739197, "learning_rate": 4.7096619207449114e-05, "loss": 0.3604, "step": 4290500 }, { "epoch": 29.037191424859245, "grad_norm": 0.41418910026550293, "learning_rate": 4.709628085751408e-05, "loss": 0.3599, "step": 4291000 }, { "epoch": 29.040574924209615, "grad_norm": 0.3835686147212982, "learning_rate": 4.709594250757904e-05, "loss": 0.3587, "step": 4291500 }, { "epoch": 29.04395842355998, "grad_norm": 0.36630573868751526, "learning_rate": 4.7095604157644e-05, "loss": 0.3597, "step": 4292000 }, { "epoch": 29.047341922910352, "grad_norm": 0.32699382305145264, "learning_rate": 4.709526580770897e-05, "loss": 0.3601, "step": 4292500 }, { "epoch": 29.05072542226072, "grad_norm": 0.37527844309806824, "learning_rate": 4.709492745777393e-05, "loss": 0.3595, "step": 4293000 }, { "epoch": 29.054108921611085, "grad_norm": 0.39489516615867615, "learning_rate": 4.709458910783889e-05, "loss": 0.3607, "step": 4293500 }, { "epoch": 29.057492420961456, "grad_norm": 0.3843926191329956, "learning_rate": 4.7094250757903855e-05, "loss": 0.3596, "step": 4294000 }, { "epoch": 29.060875920311823, "grad_norm": 0.3965241014957428, "learning_rate": 4.7093912407968824e-05, "loss": 0.3581, "step": 4294500 }, { "epoch": 29.064259419662193, "grad_norm": 0.393311083316803, "learning_rate": 4.7093574058033786e-05, "loss": 0.3608, "step": 4295000 }, { "epoch": 29.06764291901256, "grad_norm": 0.40740710496902466, "learning_rate": 4.709323570809875e-05, "loss": 0.3594, "step": 4295500 }, { "epoch": 29.071026418362926, "grad_norm": 0.40089091658592224, "learning_rate": 4.7092897358163704e-05, "loss": 0.3596, "step": 4296000 }, { "epoch": 29.074409917713297, "grad_norm": 0.37659919261932373, "learning_rate": 4.709255900822867e-05, "loss": 0.359, "step": 4296500 }, { "epoch": 29.077793417063663, "grad_norm": 0.3679037094116211, "learning_rate": 4.7092220658293635e-05, "loss": 0.3592, "step": 4297000 }, { "epoch": 29.081176916414034, "grad_norm": 0.3677593767642975, "learning_rate": 4.70918823083586e-05, "loss": 0.3608, "step": 4297500 }, { "epoch": 29.0845604157644, "grad_norm": 0.3663621246814728, "learning_rate": 4.709154395842356e-05, "loss": 0.3593, "step": 4298000 }, { "epoch": 29.087943915114767, "grad_norm": 0.39356285333633423, "learning_rate": 4.709120560848853e-05, "loss": 0.3602, "step": 4298500 }, { "epoch": 29.091327414465137, "grad_norm": 0.3937978744506836, "learning_rate": 4.709086725855349e-05, "loss": 0.3601, "step": 4299000 }, { "epoch": 29.094710913815504, "grad_norm": 0.36316731572151184, "learning_rate": 4.709052890861845e-05, "loss": 0.3599, "step": 4299500 }, { "epoch": 29.09809441316587, "grad_norm": 0.38304466009140015, "learning_rate": 4.7090190558683414e-05, "loss": 0.3593, "step": 4300000 }, { "epoch": 29.10147791251624, "grad_norm": 0.35874873399734497, "learning_rate": 4.708985220874838e-05, "loss": 0.3598, "step": 4300500 }, { "epoch": 29.104861411866608, "grad_norm": 0.402009516954422, "learning_rate": 4.708951385881334e-05, "loss": 0.3611, "step": 4301000 }, { "epoch": 29.10824491121698, "grad_norm": 0.39570045471191406, "learning_rate": 4.70891755088783e-05, "loss": 0.3613, "step": 4301500 }, { "epoch": 29.111628410567345, "grad_norm": 0.363852858543396, "learning_rate": 4.708883715894327e-05, "loss": 0.361, "step": 4302000 }, { "epoch": 29.115011909917712, "grad_norm": 0.3463621437549591, "learning_rate": 4.708849880900823e-05, "loss": 0.362, "step": 4302500 }, { "epoch": 29.118395409268082, "grad_norm": 0.3559868335723877, "learning_rate": 4.7088160459073194e-05, "loss": 0.3604, "step": 4303000 }, { "epoch": 29.12177890861845, "grad_norm": 0.4086800813674927, "learning_rate": 4.7087822109138156e-05, "loss": 0.3598, "step": 4303500 }, { "epoch": 29.12516240796882, "grad_norm": 0.3575659692287445, "learning_rate": 4.7087483759203125e-05, "loss": 0.3619, "step": 4304000 }, { "epoch": 29.128545907319186, "grad_norm": 0.36323443055152893, "learning_rate": 4.708714540926809e-05, "loss": 0.3604, "step": 4304500 }, { "epoch": 29.131929406669553, "grad_norm": 0.32497456669807434, "learning_rate": 4.708680705933305e-05, "loss": 0.361, "step": 4305000 }, { "epoch": 29.135312906019923, "grad_norm": 0.4088630974292755, "learning_rate": 4.708646870939801e-05, "loss": 0.3608, "step": 4305500 }, { "epoch": 29.13869640537029, "grad_norm": 0.3804304301738739, "learning_rate": 4.708613035946297e-05, "loss": 0.3601, "step": 4306000 }, { "epoch": 29.14207990472066, "grad_norm": 0.420848548412323, "learning_rate": 4.7085792009527935e-05, "loss": 0.36, "step": 4306500 }, { "epoch": 29.145463404071027, "grad_norm": 0.3965778648853302, "learning_rate": 4.70854536595929e-05, "loss": 0.3608, "step": 4307000 }, { "epoch": 29.148846903421394, "grad_norm": 0.3896230161190033, "learning_rate": 4.708511530965786e-05, "loss": 0.3583, "step": 4307500 }, { "epoch": 29.152230402771764, "grad_norm": 0.4677799344062805, "learning_rate": 4.708477695972283e-05, "loss": 0.3619, "step": 4308000 }, { "epoch": 29.15561390212213, "grad_norm": 0.38909775018692017, "learning_rate": 4.708443860978779e-05, "loss": 0.3618, "step": 4308500 }, { "epoch": 29.158997401472497, "grad_norm": 0.37043771147727966, "learning_rate": 4.708410025985275e-05, "loss": 0.3622, "step": 4309000 }, { "epoch": 29.162380900822868, "grad_norm": 0.3786575496196747, "learning_rate": 4.7083761909917715e-05, "loss": 0.3612, "step": 4309500 }, { "epoch": 29.165764400173234, "grad_norm": 0.3752582371234894, "learning_rate": 4.7083423559982684e-05, "loss": 0.3618, "step": 4310000 }, { "epoch": 29.169147899523605, "grad_norm": 0.37404459714889526, "learning_rate": 4.708308521004764e-05, "loss": 0.359, "step": 4310500 }, { "epoch": 29.17253139887397, "grad_norm": 0.37942689657211304, "learning_rate": 4.70827468601126e-05, "loss": 0.3612, "step": 4311000 }, { "epoch": 29.175914898224338, "grad_norm": 0.3892790973186493, "learning_rate": 4.708240851017757e-05, "loss": 0.3616, "step": 4311500 }, { "epoch": 29.17929839757471, "grad_norm": 0.38474565744400024, "learning_rate": 4.708207016024253e-05, "loss": 0.3597, "step": 4312000 }, { "epoch": 29.182681896925075, "grad_norm": 0.3569866418838501, "learning_rate": 4.7081731810307494e-05, "loss": 0.36, "step": 4312500 }, { "epoch": 29.186065396275445, "grad_norm": 0.41088053584098816, "learning_rate": 4.7081393460372456e-05, "loss": 0.3602, "step": 4313000 }, { "epoch": 29.189448895625812, "grad_norm": 0.3667789697647095, "learning_rate": 4.7081055110437425e-05, "loss": 0.361, "step": 4313500 }, { "epoch": 29.19283239497618, "grad_norm": 0.34484750032424927, "learning_rate": 4.708071676050239e-05, "loss": 0.36, "step": 4314000 }, { "epoch": 29.19621589432655, "grad_norm": 0.3753925859928131, "learning_rate": 4.708037841056735e-05, "loss": 0.3593, "step": 4314500 }, { "epoch": 29.199599393676916, "grad_norm": 0.3755471408367157, "learning_rate": 4.708004006063231e-05, "loss": 0.3607, "step": 4315000 }, { "epoch": 29.202982893027283, "grad_norm": 0.3804199993610382, "learning_rate": 4.7079701710697274e-05, "loss": 0.362, "step": 4315500 }, { "epoch": 29.206366392377653, "grad_norm": 0.4020286798477173, "learning_rate": 4.7079363360762236e-05, "loss": 0.3618, "step": 4316000 }, { "epoch": 29.20974989172802, "grad_norm": 0.4113852381706238, "learning_rate": 4.70790250108272e-05, "loss": 0.3602, "step": 4316500 }, { "epoch": 29.21313339107839, "grad_norm": 0.3878616690635681, "learning_rate": 4.707868666089216e-05, "loss": 0.3618, "step": 4317000 }, { "epoch": 29.216516890428757, "grad_norm": 0.4221148192882538, "learning_rate": 4.707834831095713e-05, "loss": 0.3599, "step": 4317500 }, { "epoch": 29.219900389779124, "grad_norm": 0.3535100817680359, "learning_rate": 4.707800996102209e-05, "loss": 0.3613, "step": 4318000 }, { "epoch": 29.223283889129494, "grad_norm": 0.4416511356830597, "learning_rate": 4.707767161108705e-05, "loss": 0.3613, "step": 4318500 }, { "epoch": 29.22666738847986, "grad_norm": 0.3685159981250763, "learning_rate": 4.7077333261152015e-05, "loss": 0.3607, "step": 4319000 }, { "epoch": 29.23005088783023, "grad_norm": 0.3986661732196808, "learning_rate": 4.7076994911216984e-05, "loss": 0.3607, "step": 4319500 }, { "epoch": 29.233434387180598, "grad_norm": 0.3640308976173401, "learning_rate": 4.707665656128194e-05, "loss": 0.3625, "step": 4320000 }, { "epoch": 29.236817886530964, "grad_norm": 0.36600756645202637, "learning_rate": 4.70763182113469e-05, "loss": 0.3593, "step": 4320500 }, { "epoch": 29.240201385881335, "grad_norm": 0.37366312742233276, "learning_rate": 4.707597986141187e-05, "loss": 0.3609, "step": 4321000 }, { "epoch": 29.2435848852317, "grad_norm": 0.38794785737991333, "learning_rate": 4.707564151147683e-05, "loss": 0.3605, "step": 4321500 }, { "epoch": 29.246968384582072, "grad_norm": 0.3888173997402191, "learning_rate": 4.7075303161541795e-05, "loss": 0.3602, "step": 4322000 }, { "epoch": 29.25035188393244, "grad_norm": 0.3312968313694, "learning_rate": 4.707496481160676e-05, "loss": 0.3605, "step": 4322500 }, { "epoch": 29.253735383282805, "grad_norm": 0.38750284910202026, "learning_rate": 4.707462646167172e-05, "loss": 0.3611, "step": 4323000 }, { "epoch": 29.257118882633176, "grad_norm": 0.3781498670578003, "learning_rate": 4.707428811173669e-05, "loss": 0.3614, "step": 4323500 }, { "epoch": 29.260502381983542, "grad_norm": 0.375742107629776, "learning_rate": 4.707394976180165e-05, "loss": 0.3607, "step": 4324000 }, { "epoch": 29.26388588133391, "grad_norm": 0.3669828176498413, "learning_rate": 4.707361141186661e-05, "loss": 0.3608, "step": 4324500 }, { "epoch": 29.26726938068428, "grad_norm": 0.3698820173740387, "learning_rate": 4.7073273061931574e-05, "loss": 0.36, "step": 4325000 }, { "epoch": 29.270652880034646, "grad_norm": 0.33423253893852234, "learning_rate": 4.7072934711996537e-05, "loss": 0.3597, "step": 4325500 }, { "epoch": 29.274036379385016, "grad_norm": 0.3763883113861084, "learning_rate": 4.70725963620615e-05, "loss": 0.36, "step": 4326000 }, { "epoch": 29.277419878735383, "grad_norm": 0.3575267195701599, "learning_rate": 4.707225801212646e-05, "loss": 0.3584, "step": 4326500 }, { "epoch": 29.28080337808575, "grad_norm": 0.39500075578689575, "learning_rate": 4.707191966219143e-05, "loss": 0.3601, "step": 4327000 }, { "epoch": 29.28418687743612, "grad_norm": 0.46322932839393616, "learning_rate": 4.707158131225639e-05, "loss": 0.3611, "step": 4327500 }, { "epoch": 29.287570376786487, "grad_norm": 0.4118308126926422, "learning_rate": 4.7071242962321354e-05, "loss": 0.3608, "step": 4328000 }, { "epoch": 29.290953876136857, "grad_norm": 0.3777792751789093, "learning_rate": 4.7070904612386316e-05, "loss": 0.3606, "step": 4328500 }, { "epoch": 29.294337375487224, "grad_norm": 0.35416123270988464, "learning_rate": 4.7070566262451285e-05, "loss": 0.3616, "step": 4329000 }, { "epoch": 29.29772087483759, "grad_norm": 0.35928085446357727, "learning_rate": 4.707022791251624e-05, "loss": 0.3603, "step": 4329500 }, { "epoch": 29.30110437418796, "grad_norm": 0.3598766326904297, "learning_rate": 4.70698895625812e-05, "loss": 0.3604, "step": 4330000 }, { "epoch": 29.304487873538328, "grad_norm": 0.3819045424461365, "learning_rate": 4.706955121264617e-05, "loss": 0.361, "step": 4330500 }, { "epoch": 29.307871372888698, "grad_norm": 0.3496757447719574, "learning_rate": 4.7069212862711133e-05, "loss": 0.3598, "step": 4331000 }, { "epoch": 29.311254872239065, "grad_norm": 0.37243950366973877, "learning_rate": 4.7068874512776096e-05, "loss": 0.3612, "step": 4331500 }, { "epoch": 29.31463837158943, "grad_norm": 0.3167635202407837, "learning_rate": 4.706853616284106e-05, "loss": 0.3609, "step": 4332000 }, { "epoch": 29.318021870939802, "grad_norm": 0.3345939815044403, "learning_rate": 4.706819781290602e-05, "loss": 0.3601, "step": 4332500 }, { "epoch": 29.32140537029017, "grad_norm": 0.3902365565299988, "learning_rate": 4.706785946297099e-05, "loss": 0.3621, "step": 4333000 }, { "epoch": 29.324788869640535, "grad_norm": 0.38333427906036377, "learning_rate": 4.706752111303595e-05, "loss": 0.3606, "step": 4333500 }, { "epoch": 29.328172368990906, "grad_norm": 0.34627583622932434, "learning_rate": 4.706718276310091e-05, "loss": 0.3609, "step": 4334000 }, { "epoch": 29.331555868341272, "grad_norm": 0.35351866483688354, "learning_rate": 4.7066844413165875e-05, "loss": 0.3615, "step": 4334500 }, { "epoch": 29.334939367691643, "grad_norm": 0.4443703889846802, "learning_rate": 4.706650606323084e-05, "loss": 0.3614, "step": 4335000 }, { "epoch": 29.33832286704201, "grad_norm": 0.392807275056839, "learning_rate": 4.70661677132958e-05, "loss": 0.3603, "step": 4335500 }, { "epoch": 29.341706366392376, "grad_norm": 0.39465057849884033, "learning_rate": 4.706582936336076e-05, "loss": 0.3621, "step": 4336000 }, { "epoch": 29.345089865742747, "grad_norm": 0.37733396887779236, "learning_rate": 4.706549101342573e-05, "loss": 0.3625, "step": 4336500 }, { "epoch": 29.348473365093113, "grad_norm": 0.3787976801395416, "learning_rate": 4.706515266349069e-05, "loss": 0.3625, "step": 4337000 }, { "epoch": 29.351856864443484, "grad_norm": 0.36785629391670227, "learning_rate": 4.7064814313555655e-05, "loss": 0.3599, "step": 4337500 }, { "epoch": 29.35524036379385, "grad_norm": 0.40150776505470276, "learning_rate": 4.706447596362062e-05, "loss": 0.3614, "step": 4338000 }, { "epoch": 29.358623863144217, "grad_norm": 0.34027591347694397, "learning_rate": 4.7064137613685586e-05, "loss": 0.3612, "step": 4338500 }, { "epoch": 29.362007362494587, "grad_norm": 0.41849809885025024, "learning_rate": 4.706379926375054e-05, "loss": 0.3602, "step": 4339000 }, { "epoch": 29.365390861844954, "grad_norm": 0.3601183593273163, "learning_rate": 4.70634609138155e-05, "loss": 0.3608, "step": 4339500 }, { "epoch": 29.36877436119532, "grad_norm": 0.36231401562690735, "learning_rate": 4.7063122563880465e-05, "loss": 0.3608, "step": 4340000 }, { "epoch": 29.37215786054569, "grad_norm": 0.38266411423683167, "learning_rate": 4.7062784213945434e-05, "loss": 0.3604, "step": 4340500 }, { "epoch": 29.375541359896058, "grad_norm": 0.4335971772670746, "learning_rate": 4.7062445864010396e-05, "loss": 0.3599, "step": 4341000 }, { "epoch": 29.37892485924643, "grad_norm": 0.3665923774242401, "learning_rate": 4.706210751407536e-05, "loss": 0.3608, "step": 4341500 }, { "epoch": 29.382308358596795, "grad_norm": 0.4152335226535797, "learning_rate": 4.706176916414032e-05, "loss": 0.3601, "step": 4342000 }, { "epoch": 29.38569185794716, "grad_norm": 0.3795805871486664, "learning_rate": 4.706143081420529e-05, "loss": 0.361, "step": 4342500 }, { "epoch": 29.389075357297532, "grad_norm": 0.3984297811985016, "learning_rate": 4.706109246427025e-05, "loss": 0.3607, "step": 4343000 }, { "epoch": 29.3924588566479, "grad_norm": 0.39234739542007446, "learning_rate": 4.7060754114335214e-05, "loss": 0.361, "step": 4343500 }, { "epoch": 29.39584235599827, "grad_norm": 0.40547654032707214, "learning_rate": 4.7060415764400176e-05, "loss": 0.3614, "step": 4344000 }, { "epoch": 29.399225855348636, "grad_norm": 0.36906078457832336, "learning_rate": 4.706007741446514e-05, "loss": 0.3617, "step": 4344500 }, { "epoch": 29.402609354699003, "grad_norm": 0.3373365104198456, "learning_rate": 4.70597390645301e-05, "loss": 0.3611, "step": 4345000 }, { "epoch": 29.405992854049373, "grad_norm": 0.3218066096305847, "learning_rate": 4.705940071459506e-05, "loss": 0.3616, "step": 4345500 }, { "epoch": 29.40937635339974, "grad_norm": 0.37889230251312256, "learning_rate": 4.705906236466003e-05, "loss": 0.3618, "step": 4346000 }, { "epoch": 29.41275985275011, "grad_norm": 0.3492085039615631, "learning_rate": 4.705872401472499e-05, "loss": 0.3613, "step": 4346500 }, { "epoch": 29.416143352100477, "grad_norm": 0.36156710982322693, "learning_rate": 4.7058385664789955e-05, "loss": 0.3589, "step": 4347000 }, { "epoch": 29.419526851450843, "grad_norm": 0.3797200620174408, "learning_rate": 4.705804731485492e-05, "loss": 0.3608, "step": 4347500 }, { "epoch": 29.422910350801214, "grad_norm": 0.3567017614841461, "learning_rate": 4.7057708964919886e-05, "loss": 0.3621, "step": 4348000 }, { "epoch": 29.42629385015158, "grad_norm": 0.3498610258102417, "learning_rate": 4.705737061498484e-05, "loss": 0.36, "step": 4348500 }, { "epoch": 29.429677349501947, "grad_norm": 0.376139760017395, "learning_rate": 4.7057032265049804e-05, "loss": 0.3596, "step": 4349000 }, { "epoch": 29.433060848852318, "grad_norm": 0.37960341572761536, "learning_rate": 4.7056693915114766e-05, "loss": 0.3608, "step": 4349500 }, { "epoch": 29.436444348202684, "grad_norm": 0.38942623138427734, "learning_rate": 4.7056355565179735e-05, "loss": 0.3614, "step": 4350000 }, { "epoch": 29.439827847553055, "grad_norm": 0.38209375739097595, "learning_rate": 4.70560172152447e-05, "loss": 0.3608, "step": 4350500 }, { "epoch": 29.44321134690342, "grad_norm": 0.41018059849739075, "learning_rate": 4.705567886530966e-05, "loss": 0.3622, "step": 4351000 }, { "epoch": 29.446594846253788, "grad_norm": 0.39549022912979126, "learning_rate": 4.705534051537462e-05, "loss": 0.359, "step": 4351500 }, { "epoch": 29.44997834560416, "grad_norm": 0.39096882939338684, "learning_rate": 4.705500216543959e-05, "loss": 0.3605, "step": 4352000 }, { "epoch": 29.453361844954525, "grad_norm": 0.36013635993003845, "learning_rate": 4.705466381550455e-05, "loss": 0.3614, "step": 4352500 }, { "epoch": 29.456745344304895, "grad_norm": 0.37079086899757385, "learning_rate": 4.7054325465569514e-05, "loss": 0.3598, "step": 4353000 }, { "epoch": 29.460128843655262, "grad_norm": 0.3588239848613739, "learning_rate": 4.7053987115634476e-05, "loss": 0.3615, "step": 4353500 }, { "epoch": 29.46351234300563, "grad_norm": 0.4153600335121155, "learning_rate": 4.705364876569944e-05, "loss": 0.3611, "step": 4354000 }, { "epoch": 29.466895842356, "grad_norm": 0.35029229521751404, "learning_rate": 4.70533104157644e-05, "loss": 0.3609, "step": 4354500 }, { "epoch": 29.470279341706366, "grad_norm": 0.39438584446907043, "learning_rate": 4.705297206582936e-05, "loss": 0.3603, "step": 4355000 }, { "epoch": 29.473662841056736, "grad_norm": 0.35910433530807495, "learning_rate": 4.705263371589433e-05, "loss": 0.3599, "step": 4355500 }, { "epoch": 29.477046340407103, "grad_norm": 0.3918461501598358, "learning_rate": 4.7052295365959294e-05, "loss": 0.3588, "step": 4356000 }, { "epoch": 29.48042983975747, "grad_norm": 0.38227224349975586, "learning_rate": 4.7051957016024256e-05, "loss": 0.3596, "step": 4356500 }, { "epoch": 29.48381333910784, "grad_norm": 0.3609776496887207, "learning_rate": 4.705161866608922e-05, "loss": 0.362, "step": 4357000 }, { "epoch": 29.487196838458207, "grad_norm": 0.3666316270828247, "learning_rate": 4.705128031615419e-05, "loss": 0.3604, "step": 4357500 }, { "epoch": 29.490580337808574, "grad_norm": 0.40913018584251404, "learning_rate": 4.705094196621914e-05, "loss": 0.3615, "step": 4358000 }, { "epoch": 29.493963837158944, "grad_norm": 0.4122064709663391, "learning_rate": 4.7050603616284104e-05, "loss": 0.3604, "step": 4358500 }, { "epoch": 29.49734733650931, "grad_norm": 0.3761943578720093, "learning_rate": 4.7050265266349066e-05, "loss": 0.3603, "step": 4359000 }, { "epoch": 29.50073083585968, "grad_norm": 0.360890656709671, "learning_rate": 4.7049926916414035e-05, "loss": 0.3629, "step": 4359500 }, { "epoch": 29.504114335210048, "grad_norm": 0.379151850938797, "learning_rate": 4.7049588566479e-05, "loss": 0.3621, "step": 4360000 }, { "epoch": 29.507497834560414, "grad_norm": 0.3611099421977997, "learning_rate": 4.704925021654396e-05, "loss": 0.3605, "step": 4360500 }, { "epoch": 29.510881333910785, "grad_norm": 0.3445838391780853, "learning_rate": 4.704891186660892e-05, "loss": 0.3615, "step": 4361000 }, { "epoch": 29.51426483326115, "grad_norm": 0.38196510076522827, "learning_rate": 4.704857351667389e-05, "loss": 0.36, "step": 4361500 }, { "epoch": 29.51764833261152, "grad_norm": 0.38769465684890747, "learning_rate": 4.704823516673885e-05, "loss": 0.3606, "step": 4362000 }, { "epoch": 29.52103183196189, "grad_norm": 0.3828052878379822, "learning_rate": 4.7047896816803815e-05, "loss": 0.3621, "step": 4362500 }, { "epoch": 29.524415331312255, "grad_norm": 0.40334826707839966, "learning_rate": 4.704755846686878e-05, "loss": 0.3623, "step": 4363000 }, { "epoch": 29.527798830662626, "grad_norm": 0.38683021068573, "learning_rate": 4.704722011693374e-05, "loss": 0.359, "step": 4363500 }, { "epoch": 29.531182330012992, "grad_norm": 0.37075749039649963, "learning_rate": 4.70468817669987e-05, "loss": 0.3612, "step": 4364000 }, { "epoch": 29.53456582936336, "grad_norm": 0.3464714586734772, "learning_rate": 4.704654341706366e-05, "loss": 0.3618, "step": 4364500 }, { "epoch": 29.53794932871373, "grad_norm": 0.3840615749359131, "learning_rate": 4.704620506712863e-05, "loss": 0.3609, "step": 4365000 }, { "epoch": 29.541332828064096, "grad_norm": 0.3820098042488098, "learning_rate": 4.7045866717193594e-05, "loss": 0.3622, "step": 4365500 }, { "epoch": 29.544716327414466, "grad_norm": 0.3600960075855255, "learning_rate": 4.7045528367258556e-05, "loss": 0.3596, "step": 4366000 }, { "epoch": 29.548099826764833, "grad_norm": 0.37287211418151855, "learning_rate": 4.704519001732352e-05, "loss": 0.3618, "step": 4366500 }, { "epoch": 29.5514833261152, "grad_norm": 0.368624746799469, "learning_rate": 4.704485166738849e-05, "loss": 0.3606, "step": 4367000 }, { "epoch": 29.55486682546557, "grad_norm": 0.3669593632221222, "learning_rate": 4.704451331745345e-05, "loss": 0.3615, "step": 4367500 }, { "epoch": 29.558250324815937, "grad_norm": 0.39893969893455505, "learning_rate": 4.7044174967518405e-05, "loss": 0.3618, "step": 4368000 }, { "epoch": 29.561633824166307, "grad_norm": 0.36940744519233704, "learning_rate": 4.704383661758337e-05, "loss": 0.3612, "step": 4368500 }, { "epoch": 29.565017323516674, "grad_norm": 0.35981062054634094, "learning_rate": 4.7043498267648336e-05, "loss": 0.3609, "step": 4369000 }, { "epoch": 29.56840082286704, "grad_norm": 0.394185334444046, "learning_rate": 4.70431599177133e-05, "loss": 0.3604, "step": 4369500 }, { "epoch": 29.57178432221741, "grad_norm": 0.35855773091316223, "learning_rate": 4.704282156777826e-05, "loss": 0.3594, "step": 4370000 }, { "epoch": 29.575167821567778, "grad_norm": 0.3543767035007477, "learning_rate": 4.704248321784322e-05, "loss": 0.3615, "step": 4370500 }, { "epoch": 29.578551320918148, "grad_norm": 0.3909660875797272, "learning_rate": 4.704214486790819e-05, "loss": 0.3593, "step": 4371000 }, { "epoch": 29.581934820268515, "grad_norm": 0.39414045214653015, "learning_rate": 4.704180651797315e-05, "loss": 0.3599, "step": 4371500 }, { "epoch": 29.58531831961888, "grad_norm": 0.36996832489967346, "learning_rate": 4.7041468168038116e-05, "loss": 0.3603, "step": 4372000 }, { "epoch": 29.588701818969252, "grad_norm": 0.35564756393432617, "learning_rate": 4.704112981810308e-05, "loss": 0.3609, "step": 4372500 }, { "epoch": 29.59208531831962, "grad_norm": 0.4340767562389374, "learning_rate": 4.704079146816804e-05, "loss": 0.3608, "step": 4373000 }, { "epoch": 29.595468817669985, "grad_norm": 0.38144609332084656, "learning_rate": 4.7040453118233e-05, "loss": 0.3608, "step": 4373500 }, { "epoch": 29.598852317020356, "grad_norm": 0.3498848080635071, "learning_rate": 4.7040114768297964e-05, "loss": 0.3624, "step": 4374000 }, { "epoch": 29.602235816370722, "grad_norm": 0.4123530387878418, "learning_rate": 4.703977641836293e-05, "loss": 0.3612, "step": 4374500 }, { "epoch": 29.605619315721093, "grad_norm": 0.3861576318740845, "learning_rate": 4.7039438068427895e-05, "loss": 0.3609, "step": 4375000 }, { "epoch": 29.60900281507146, "grad_norm": 0.3890489339828491, "learning_rate": 4.703909971849286e-05, "loss": 0.3617, "step": 4375500 }, { "epoch": 29.612386314421826, "grad_norm": 0.3864743709564209, "learning_rate": 4.703876136855782e-05, "loss": 0.3616, "step": 4376000 }, { "epoch": 29.615769813772197, "grad_norm": 0.34650570154190063, "learning_rate": 4.703842301862279e-05, "loss": 0.361, "step": 4376500 }, { "epoch": 29.619153313122563, "grad_norm": 0.39615970849990845, "learning_rate": 4.703808466868775e-05, "loss": 0.3614, "step": 4377000 }, { "epoch": 29.622536812472934, "grad_norm": 0.4292544722557068, "learning_rate": 4.7037746318752706e-05, "loss": 0.3608, "step": 4377500 }, { "epoch": 29.6259203118233, "grad_norm": 0.4034567177295685, "learning_rate": 4.703740796881767e-05, "loss": 0.3618, "step": 4378000 }, { "epoch": 29.629303811173667, "grad_norm": 0.3960621953010559, "learning_rate": 4.703706961888264e-05, "loss": 0.3606, "step": 4378500 }, { "epoch": 29.632687310524037, "grad_norm": 0.3851377069950104, "learning_rate": 4.70367312689476e-05, "loss": 0.3608, "step": 4379000 }, { "epoch": 29.636070809874404, "grad_norm": 0.32946351170539856, "learning_rate": 4.703639291901256e-05, "loss": 0.361, "step": 4379500 }, { "epoch": 29.639454309224774, "grad_norm": 0.39054372906684875, "learning_rate": 4.703605456907752e-05, "loss": 0.3621, "step": 4380000 }, { "epoch": 29.64283780857514, "grad_norm": 0.39904502034187317, "learning_rate": 4.703571621914249e-05, "loss": 0.3626, "step": 4380500 }, { "epoch": 29.646221307925508, "grad_norm": 0.3671392798423767, "learning_rate": 4.7035377869207454e-05, "loss": 0.3602, "step": 4381000 }, { "epoch": 29.649604807275878, "grad_norm": 0.3751044273376465, "learning_rate": 4.7035039519272416e-05, "loss": 0.3598, "step": 4381500 }, { "epoch": 29.652988306626245, "grad_norm": 0.3547898828983307, "learning_rate": 4.703470116933738e-05, "loss": 0.3634, "step": 4382000 }, { "epoch": 29.65637180597661, "grad_norm": 0.3591013252735138, "learning_rate": 4.703436281940234e-05, "loss": 0.3623, "step": 4382500 }, { "epoch": 29.659755305326982, "grad_norm": 0.42844271659851074, "learning_rate": 4.70340244694673e-05, "loss": 0.361, "step": 4383000 }, { "epoch": 29.66313880467735, "grad_norm": 0.3706211447715759, "learning_rate": 4.7033686119532265e-05, "loss": 0.3616, "step": 4383500 }, { "epoch": 29.66652230402772, "grad_norm": 0.3800713121891022, "learning_rate": 4.7033347769597234e-05, "loss": 0.3604, "step": 4384000 }, { "epoch": 29.669905803378086, "grad_norm": 0.35419294238090515, "learning_rate": 4.7033009419662196e-05, "loss": 0.3608, "step": 4384500 }, { "epoch": 29.673289302728453, "grad_norm": 0.39697718620300293, "learning_rate": 4.703267106972716e-05, "loss": 0.3613, "step": 4385000 }, { "epoch": 29.676672802078823, "grad_norm": 0.388007789850235, "learning_rate": 4.703233271979212e-05, "loss": 0.3597, "step": 4385500 }, { "epoch": 29.68005630142919, "grad_norm": 0.38864538073539734, "learning_rate": 4.703199436985708e-05, "loss": 0.3604, "step": 4386000 }, { "epoch": 29.68343980077956, "grad_norm": 0.36027100682258606, "learning_rate": 4.703165601992205e-05, "loss": 0.3632, "step": 4386500 }, { "epoch": 29.686823300129927, "grad_norm": 0.39807599782943726, "learning_rate": 4.7031317669987006e-05, "loss": 0.3612, "step": 4387000 }, { "epoch": 29.690206799480293, "grad_norm": 0.3704375922679901, "learning_rate": 4.703097932005197e-05, "loss": 0.3611, "step": 4387500 }, { "epoch": 29.693590298830664, "grad_norm": 0.35683444142341614, "learning_rate": 4.703064097011694e-05, "loss": 0.3624, "step": 4388000 }, { "epoch": 29.69697379818103, "grad_norm": 0.38248318433761597, "learning_rate": 4.70303026201819e-05, "loss": 0.3599, "step": 4388500 }, { "epoch": 29.700357297531397, "grad_norm": 0.39218348264694214, "learning_rate": 4.702996427024686e-05, "loss": 0.3601, "step": 4389000 }, { "epoch": 29.703740796881767, "grad_norm": 0.3762940764427185, "learning_rate": 4.7029625920311824e-05, "loss": 0.3618, "step": 4389500 }, { "epoch": 29.707124296232134, "grad_norm": 0.3820953369140625, "learning_rate": 4.702928757037679e-05, "loss": 0.3598, "step": 4390000 }, { "epoch": 29.710507795582505, "grad_norm": 0.38454100489616394, "learning_rate": 4.7028949220441755e-05, "loss": 0.3609, "step": 4390500 }, { "epoch": 29.71389129493287, "grad_norm": 0.40086978673934937, "learning_rate": 4.702861087050672e-05, "loss": 0.3596, "step": 4391000 }, { "epoch": 29.717274794283238, "grad_norm": 0.39777833223342896, "learning_rate": 4.702827252057168e-05, "loss": 0.3609, "step": 4391500 }, { "epoch": 29.72065829363361, "grad_norm": 0.36414191126823425, "learning_rate": 4.702793417063664e-05, "loss": 0.3621, "step": 4392000 }, { "epoch": 29.724041792983975, "grad_norm": 0.4004693031311035, "learning_rate": 4.70275958207016e-05, "loss": 0.3623, "step": 4392500 }, { "epoch": 29.727425292334345, "grad_norm": 0.3868340253829956, "learning_rate": 4.7027257470766565e-05, "loss": 0.3606, "step": 4393000 }, { "epoch": 29.730808791684712, "grad_norm": 0.3559049367904663, "learning_rate": 4.702691912083153e-05, "loss": 0.3602, "step": 4393500 }, { "epoch": 29.73419229103508, "grad_norm": 0.3866305351257324, "learning_rate": 4.7026580770896496e-05, "loss": 0.3592, "step": 4394000 }, { "epoch": 29.73757579038545, "grad_norm": 0.33717986941337585, "learning_rate": 4.702624242096146e-05, "loss": 0.3617, "step": 4394500 }, { "epoch": 29.740959289735816, "grad_norm": 0.3839470148086548, "learning_rate": 4.702590407102642e-05, "loss": 0.3605, "step": 4395000 }, { "epoch": 29.744342789086183, "grad_norm": 0.3991214334964752, "learning_rate": 4.702556572109138e-05, "loss": 0.3593, "step": 4395500 }, { "epoch": 29.747726288436553, "grad_norm": 0.3859218955039978, "learning_rate": 4.702522737115635e-05, "loss": 0.3609, "step": 4396000 }, { "epoch": 29.75110978778692, "grad_norm": 0.3510134220123291, "learning_rate": 4.702488902122131e-05, "loss": 0.362, "step": 4396500 }, { "epoch": 29.75449328713729, "grad_norm": 0.38294655084609985, "learning_rate": 4.702455067128627e-05, "loss": 0.3613, "step": 4397000 }, { "epoch": 29.757876786487657, "grad_norm": 0.3909126818180084, "learning_rate": 4.702421232135124e-05, "loss": 0.3618, "step": 4397500 }, { "epoch": 29.761260285838024, "grad_norm": 0.36475950479507446, "learning_rate": 4.70238739714162e-05, "loss": 0.3612, "step": 4398000 }, { "epoch": 29.764643785188394, "grad_norm": 0.38487258553504944, "learning_rate": 4.702353562148116e-05, "loss": 0.3617, "step": 4398500 }, { "epoch": 29.76802728453876, "grad_norm": 0.35010942816734314, "learning_rate": 4.7023197271546124e-05, "loss": 0.3615, "step": 4399000 }, { "epoch": 29.77141078388913, "grad_norm": 0.34493643045425415, "learning_rate": 4.702285892161109e-05, "loss": 0.3618, "step": 4399500 }, { "epoch": 29.774794283239498, "grad_norm": 0.3537772297859192, "learning_rate": 4.7022520571676055e-05, "loss": 0.3614, "step": 4400000 }, { "epoch": 29.778177782589864, "grad_norm": 0.4162846505641937, "learning_rate": 4.702218222174102e-05, "loss": 0.3609, "step": 4400500 }, { "epoch": 29.781561281940235, "grad_norm": 0.40141090750694275, "learning_rate": 4.702184387180598e-05, "loss": 0.3613, "step": 4401000 }, { "epoch": 29.7849447812906, "grad_norm": 0.3717232048511505, "learning_rate": 4.702150552187094e-05, "loss": 0.3596, "step": 4401500 }, { "epoch": 29.78832828064097, "grad_norm": 0.3388756513595581, "learning_rate": 4.7021167171935904e-05, "loss": 0.3608, "step": 4402000 }, { "epoch": 29.79171177999134, "grad_norm": 0.417555570602417, "learning_rate": 4.7020828822000866e-05, "loss": 0.3615, "step": 4402500 }, { "epoch": 29.795095279341705, "grad_norm": 0.36066934466362, "learning_rate": 4.702049047206583e-05, "loss": 0.3629, "step": 4403000 }, { "epoch": 29.798478778692076, "grad_norm": 0.3946530222892761, "learning_rate": 4.70201521221308e-05, "loss": 0.3619, "step": 4403500 }, { "epoch": 29.801862278042442, "grad_norm": 0.3465481996536255, "learning_rate": 4.701981377219576e-05, "loss": 0.3609, "step": 4404000 }, { "epoch": 29.805245777392813, "grad_norm": 0.34671127796173096, "learning_rate": 4.701947542226072e-05, "loss": 0.3605, "step": 4404500 }, { "epoch": 29.80862927674318, "grad_norm": 0.3607783019542694, "learning_rate": 4.701913707232568e-05, "loss": 0.3611, "step": 4405000 }, { "epoch": 29.812012776093546, "grad_norm": 0.3950929343700409, "learning_rate": 4.701879872239065e-05, "loss": 0.3623, "step": 4405500 }, { "epoch": 29.815396275443916, "grad_norm": 0.3737722933292389, "learning_rate": 4.701846037245561e-05, "loss": 0.3616, "step": 4406000 }, { "epoch": 29.818779774794283, "grad_norm": 0.4050480127334595, "learning_rate": 4.701812202252057e-05, "loss": 0.3619, "step": 4406500 }, { "epoch": 29.82216327414465, "grad_norm": 0.3761073350906372, "learning_rate": 4.701778367258554e-05, "loss": 0.3601, "step": 4407000 }, { "epoch": 29.82554677349502, "grad_norm": 0.38306209444999695, "learning_rate": 4.70174453226505e-05, "loss": 0.3612, "step": 4407500 }, { "epoch": 29.828930272845387, "grad_norm": 0.392617404460907, "learning_rate": 4.701710697271546e-05, "loss": 0.3602, "step": 4408000 }, { "epoch": 29.832313772195757, "grad_norm": 0.3812628388404846, "learning_rate": 4.7016768622780425e-05, "loss": 0.3628, "step": 4408500 }, { "epoch": 29.835697271546124, "grad_norm": 0.3694405257701874, "learning_rate": 4.7016430272845394e-05, "loss": 0.3619, "step": 4409000 }, { "epoch": 29.83908077089649, "grad_norm": 0.36880505084991455, "learning_rate": 4.7016091922910356e-05, "loss": 0.3605, "step": 4409500 }, { "epoch": 29.84246427024686, "grad_norm": 0.37738537788391113, "learning_rate": 4.701575357297532e-05, "loss": 0.3635, "step": 4410000 }, { "epoch": 29.845847769597228, "grad_norm": 0.3428693115711212, "learning_rate": 4.701541522304027e-05, "loss": 0.3616, "step": 4410500 }, { "epoch": 29.849231268947598, "grad_norm": 0.34460675716400146, "learning_rate": 4.701507687310524e-05, "loss": 0.3632, "step": 4411000 }, { "epoch": 29.852614768297965, "grad_norm": 0.35981813073158264, "learning_rate": 4.7014738523170204e-05, "loss": 0.3603, "step": 4411500 }, { "epoch": 29.85599826764833, "grad_norm": 0.4211750030517578, "learning_rate": 4.7014400173235167e-05, "loss": 0.3619, "step": 4412000 }, { "epoch": 29.859381766998702, "grad_norm": 0.40512290596961975, "learning_rate": 4.701406182330013e-05, "loss": 0.3609, "step": 4412500 }, { "epoch": 29.86276526634907, "grad_norm": 0.34953683614730835, "learning_rate": 4.70137234733651e-05, "loss": 0.3609, "step": 4413000 }, { "epoch": 29.866148765699435, "grad_norm": 0.38703393936157227, "learning_rate": 4.701338512343006e-05, "loss": 0.3621, "step": 4413500 }, { "epoch": 29.869532265049806, "grad_norm": 0.35878366231918335, "learning_rate": 4.701304677349502e-05, "loss": 0.3622, "step": 4414000 }, { "epoch": 29.872915764400172, "grad_norm": 0.37528976798057556, "learning_rate": 4.7012708423559984e-05, "loss": 0.3601, "step": 4414500 }, { "epoch": 29.876299263750543, "grad_norm": 0.3553240895271301, "learning_rate": 4.701237007362495e-05, "loss": 0.3617, "step": 4415000 }, { "epoch": 29.87968276310091, "grad_norm": 0.3715384304523468, "learning_rate": 4.701203172368991e-05, "loss": 0.3623, "step": 4415500 }, { "epoch": 29.883066262451276, "grad_norm": 0.3395177721977234, "learning_rate": 4.701169337375487e-05, "loss": 0.3605, "step": 4416000 }, { "epoch": 29.886449761801646, "grad_norm": 0.3491855561733246, "learning_rate": 4.701135502381984e-05, "loss": 0.3601, "step": 4416500 }, { "epoch": 29.889833261152013, "grad_norm": 0.3426545560359955, "learning_rate": 4.70110166738848e-05, "loss": 0.3606, "step": 4417000 }, { "epoch": 29.893216760502384, "grad_norm": 0.42065492272377014, "learning_rate": 4.7010678323949763e-05, "loss": 0.361, "step": 4417500 }, { "epoch": 29.89660025985275, "grad_norm": 0.4317997694015503, "learning_rate": 4.7010339974014726e-05, "loss": 0.3613, "step": 4418000 }, { "epoch": 29.899983759203117, "grad_norm": 0.42664942145347595, "learning_rate": 4.7010001624079694e-05, "loss": 0.3604, "step": 4418500 }, { "epoch": 29.903367258553487, "grad_norm": 0.3807823359966278, "learning_rate": 4.7009663274144657e-05, "loss": 0.3613, "step": 4419000 }, { "epoch": 29.906750757903854, "grad_norm": 0.3887644410133362, "learning_rate": 4.700932492420962e-05, "loss": 0.3608, "step": 4419500 }, { "epoch": 29.91013425725422, "grad_norm": 0.3543424606323242, "learning_rate": 4.700898657427458e-05, "loss": 0.3617, "step": 4420000 }, { "epoch": 29.91351775660459, "grad_norm": 0.4223953187465668, "learning_rate": 4.700864822433954e-05, "loss": 0.3611, "step": 4420500 }, { "epoch": 29.916901255954958, "grad_norm": 0.33534178137779236, "learning_rate": 4.7008309874404505e-05, "loss": 0.3606, "step": 4421000 }, { "epoch": 29.920284755305328, "grad_norm": 0.38698020577430725, "learning_rate": 4.700797152446947e-05, "loss": 0.3597, "step": 4421500 }, { "epoch": 29.923668254655695, "grad_norm": 0.39192628860473633, "learning_rate": 4.700763317453443e-05, "loss": 0.3617, "step": 4422000 }, { "epoch": 29.92705175400606, "grad_norm": 0.4152633249759674, "learning_rate": 4.70072948245994e-05, "loss": 0.3604, "step": 4422500 }, { "epoch": 29.930435253356432, "grad_norm": 0.3698878288269043, "learning_rate": 4.700695647466436e-05, "loss": 0.3625, "step": 4423000 }, { "epoch": 29.9338187527068, "grad_norm": 0.3756166398525238, "learning_rate": 4.700661812472932e-05, "loss": 0.3621, "step": 4423500 }, { "epoch": 29.93720225205717, "grad_norm": 0.395037442445755, "learning_rate": 4.7006279774794285e-05, "loss": 0.3604, "step": 4424000 }, { "epoch": 29.940585751407536, "grad_norm": 0.40361082553863525, "learning_rate": 4.7005941424859253e-05, "loss": 0.3607, "step": 4424500 }, { "epoch": 29.943969250757903, "grad_norm": 0.3730100393295288, "learning_rate": 4.700560307492421e-05, "loss": 0.362, "step": 4425000 }, { "epoch": 29.947352750108273, "grad_norm": 0.3448823094367981, "learning_rate": 4.700526472498917e-05, "loss": 0.3619, "step": 4425500 }, { "epoch": 29.95073624945864, "grad_norm": 0.3748168349266052, "learning_rate": 4.700492637505414e-05, "loss": 0.3613, "step": 4426000 }, { "epoch": 29.95411974880901, "grad_norm": 0.4092998504638672, "learning_rate": 4.70045880251191e-05, "loss": 0.3614, "step": 4426500 }, { "epoch": 29.957503248159377, "grad_norm": 0.37465614080429077, "learning_rate": 4.7004249675184064e-05, "loss": 0.3614, "step": 4427000 }, { "epoch": 29.960886747509743, "grad_norm": 0.3548694849014282, "learning_rate": 4.7003911325249026e-05, "loss": 0.3598, "step": 4427500 }, { "epoch": 29.964270246860114, "grad_norm": 0.3760984539985657, "learning_rate": 4.7003572975313995e-05, "loss": 0.3622, "step": 4428000 }, { "epoch": 29.96765374621048, "grad_norm": 0.37645846605300903, "learning_rate": 4.700323462537896e-05, "loss": 0.3606, "step": 4428500 }, { "epoch": 29.971037245560847, "grad_norm": 0.35915839672088623, "learning_rate": 4.700289627544392e-05, "loss": 0.3619, "step": 4429000 }, { "epoch": 29.974420744911217, "grad_norm": 0.3876221776008606, "learning_rate": 4.700255792550888e-05, "loss": 0.3616, "step": 4429500 }, { "epoch": 29.977804244261584, "grad_norm": 0.3990806043148041, "learning_rate": 4.7002219575573844e-05, "loss": 0.3609, "step": 4430000 }, { "epoch": 29.981187743611954, "grad_norm": 0.40360915660858154, "learning_rate": 4.7001881225638806e-05, "loss": 0.3632, "step": 4430500 }, { "epoch": 29.98457124296232, "grad_norm": 0.39829617738723755, "learning_rate": 4.700154287570377e-05, "loss": 0.3603, "step": 4431000 }, { "epoch": 29.987954742312688, "grad_norm": 0.40149858593940735, "learning_rate": 4.700120452576873e-05, "loss": 0.3615, "step": 4431500 }, { "epoch": 29.99133824166306, "grad_norm": 0.3309660255908966, "learning_rate": 4.70008661758337e-05, "loss": 0.3612, "step": 4432000 }, { "epoch": 29.994721741013425, "grad_norm": 0.3783590495586395, "learning_rate": 4.700052782589866e-05, "loss": 0.3623, "step": 4432500 }, { "epoch": 29.998105240363795, "grad_norm": 0.35361525416374207, "learning_rate": 4.700018947596362e-05, "loss": 0.361, "step": 4433000 }, { "epoch": 30.0, "eval_accuracy": 0.8624615571487356, "eval_loss": 0.5571216940879822, "eval_runtime": 3351.2354, "eval_samples_per_second": 86.757, "eval_steps_per_second": 5.422, "step": 4433280 }, { "epoch": 30.001488739714162, "grad_norm": 0.36750322580337524, "learning_rate": 4.6999851126028585e-05, "loss": 0.3598, "step": 4433500 }, { "epoch": 30.00487223906453, "grad_norm": 0.3880546987056732, "learning_rate": 4.6999512776093554e-05, "loss": 0.3591, "step": 4434000 }, { "epoch": 30.0082557384149, "grad_norm": 0.38808223605155945, "learning_rate": 4.699917442615851e-05, "loss": 0.3582, "step": 4434500 }, { "epoch": 30.011639237765266, "grad_norm": 0.3704867362976074, "learning_rate": 4.699883607622347e-05, "loss": 0.3611, "step": 4435000 }, { "epoch": 30.015022737115636, "grad_norm": 0.3950101435184479, "learning_rate": 4.699849772628844e-05, "loss": 0.3596, "step": 4435500 }, { "epoch": 30.018406236466003, "grad_norm": 0.3647949993610382, "learning_rate": 4.69981593763534e-05, "loss": 0.3585, "step": 4436000 }, { "epoch": 30.02178973581637, "grad_norm": 0.3902396857738495, "learning_rate": 4.6997821026418365e-05, "loss": 0.3582, "step": 4436500 }, { "epoch": 30.02517323516674, "grad_norm": 0.3702434301376343, "learning_rate": 4.699748267648333e-05, "loss": 0.3596, "step": 4437000 }, { "epoch": 30.028556734517107, "grad_norm": 0.4297676682472229, "learning_rate": 4.6997144326548296e-05, "loss": 0.3596, "step": 4437500 }, { "epoch": 30.031940233867473, "grad_norm": 0.39627528190612793, "learning_rate": 4.699680597661326e-05, "loss": 0.3596, "step": 4438000 }, { "epoch": 30.035323733217844, "grad_norm": 0.3526000678539276, "learning_rate": 4.699646762667822e-05, "loss": 0.3583, "step": 4438500 }, { "epoch": 30.03870723256821, "grad_norm": 0.3624178171157837, "learning_rate": 4.699612927674318e-05, "loss": 0.3604, "step": 4439000 }, { "epoch": 30.04209073191858, "grad_norm": 0.40386179089546204, "learning_rate": 4.6995790926808144e-05, "loss": 0.3587, "step": 4439500 }, { "epoch": 30.045474231268948, "grad_norm": 0.3986361622810364, "learning_rate": 4.6995452576873106e-05, "loss": 0.3606, "step": 4440000 }, { "epoch": 30.048857730619314, "grad_norm": 0.3725155293941498, "learning_rate": 4.699511422693807e-05, "loss": 0.3593, "step": 4440500 }, { "epoch": 30.052241229969685, "grad_norm": 0.36626580357551575, "learning_rate": 4.699477587700303e-05, "loss": 0.3589, "step": 4441000 }, { "epoch": 30.05562472932005, "grad_norm": 0.38600772619247437, "learning_rate": 4.6994437527068e-05, "loss": 0.3597, "step": 4441500 }, { "epoch": 30.05900822867042, "grad_norm": 0.38901904225349426, "learning_rate": 4.699409917713296e-05, "loss": 0.3606, "step": 4442000 }, { "epoch": 30.06239172802079, "grad_norm": 0.36698004603385925, "learning_rate": 4.6993760827197924e-05, "loss": 0.3615, "step": 4442500 }, { "epoch": 30.065775227371155, "grad_norm": 0.3775622844696045, "learning_rate": 4.6993422477262886e-05, "loss": 0.3597, "step": 4443000 }, { "epoch": 30.069158726721525, "grad_norm": 0.3821042478084564, "learning_rate": 4.6993084127327855e-05, "loss": 0.3594, "step": 4443500 }, { "epoch": 30.072542226071892, "grad_norm": 0.3540192246437073, "learning_rate": 4.699274577739281e-05, "loss": 0.3604, "step": 4444000 }, { "epoch": 30.07592572542226, "grad_norm": 0.3810232877731323, "learning_rate": 4.699240742745777e-05, "loss": 0.3587, "step": 4444500 }, { "epoch": 30.07930922477263, "grad_norm": 0.3720387816429138, "learning_rate": 4.699206907752274e-05, "loss": 0.3603, "step": 4445000 }, { "epoch": 30.082692724122996, "grad_norm": 0.4007198214530945, "learning_rate": 4.69917307275877e-05, "loss": 0.3588, "step": 4445500 }, { "epoch": 30.086076223473366, "grad_norm": 0.40312501788139343, "learning_rate": 4.6991392377652665e-05, "loss": 0.3594, "step": 4446000 }, { "epoch": 30.089459722823733, "grad_norm": 0.3461494445800781, "learning_rate": 4.699105402771763e-05, "loss": 0.3583, "step": 4446500 }, { "epoch": 30.0928432221741, "grad_norm": 0.36141762137413025, "learning_rate": 4.6990715677782596e-05, "loss": 0.3594, "step": 4447000 }, { "epoch": 30.09622672152447, "grad_norm": 0.39119812846183777, "learning_rate": 4.699037732784756e-05, "loss": 0.359, "step": 4447500 }, { "epoch": 30.099610220874837, "grad_norm": 0.41577664017677307, "learning_rate": 4.699003897791252e-05, "loss": 0.3598, "step": 4448000 }, { "epoch": 30.102993720225207, "grad_norm": 0.35464170575141907, "learning_rate": 4.698970062797748e-05, "loss": 0.3604, "step": 4448500 }, { "epoch": 30.106377219575574, "grad_norm": 0.37684959173202515, "learning_rate": 4.6989362278042445e-05, "loss": 0.3603, "step": 4449000 }, { "epoch": 30.10976071892594, "grad_norm": 0.38398608565330505, "learning_rate": 4.698902392810741e-05, "loss": 0.3597, "step": 4449500 }, { "epoch": 30.11314421827631, "grad_norm": 0.3568044900894165, "learning_rate": 4.698868557817237e-05, "loss": 0.3612, "step": 4450000 }, { "epoch": 30.116527717626678, "grad_norm": 0.3571385145187378, "learning_rate": 4.698834722823733e-05, "loss": 0.3606, "step": 4450500 }, { "epoch": 30.119911216977048, "grad_norm": 0.3913380801677704, "learning_rate": 4.69880088783023e-05, "loss": 0.3597, "step": 4451000 }, { "epoch": 30.123294716327415, "grad_norm": 0.390293151140213, "learning_rate": 4.698767052836726e-05, "loss": 0.3595, "step": 4451500 }, { "epoch": 30.12667821567778, "grad_norm": 0.40033870935440063, "learning_rate": 4.6987332178432224e-05, "loss": 0.3605, "step": 4452000 }, { "epoch": 30.130061715028152, "grad_norm": 0.3905723989009857, "learning_rate": 4.6986993828497186e-05, "loss": 0.3589, "step": 4452500 }, { "epoch": 30.13344521437852, "grad_norm": 0.37469837069511414, "learning_rate": 4.6986655478562155e-05, "loss": 0.3591, "step": 4453000 }, { "epoch": 30.136828713728885, "grad_norm": 0.36965039372444153, "learning_rate": 4.698631712862711e-05, "loss": 0.3595, "step": 4453500 }, { "epoch": 30.140212213079256, "grad_norm": 0.36205413937568665, "learning_rate": 4.698597877869207e-05, "loss": 0.3599, "step": 4454000 }, { "epoch": 30.143595712429622, "grad_norm": 0.4121004045009613, "learning_rate": 4.698564042875704e-05, "loss": 0.3597, "step": 4454500 }, { "epoch": 30.146979211779993, "grad_norm": 0.3439054489135742, "learning_rate": 4.6985302078822004e-05, "loss": 0.3606, "step": 4455000 }, { "epoch": 30.15036271113036, "grad_norm": 0.3691730499267578, "learning_rate": 4.6984963728886966e-05, "loss": 0.3594, "step": 4455500 }, { "epoch": 30.153746210480726, "grad_norm": 0.39207255840301514, "learning_rate": 4.698462537895193e-05, "loss": 0.3602, "step": 4456000 }, { "epoch": 30.157129709831096, "grad_norm": 0.4125577509403229, "learning_rate": 4.698428702901689e-05, "loss": 0.3615, "step": 4456500 }, { "epoch": 30.160513209181463, "grad_norm": 0.371836394071579, "learning_rate": 4.698394867908186e-05, "loss": 0.359, "step": 4457000 }, { "epoch": 30.163896708531833, "grad_norm": 0.38201549649238586, "learning_rate": 4.698361032914682e-05, "loss": 0.3606, "step": 4457500 }, { "epoch": 30.1672802078822, "grad_norm": 0.3787565529346466, "learning_rate": 4.698327197921178e-05, "loss": 0.3619, "step": 4458000 }, { "epoch": 30.170663707232567, "grad_norm": 0.3717700242996216, "learning_rate": 4.6982933629276745e-05, "loss": 0.3608, "step": 4458500 }, { "epoch": 30.174047206582937, "grad_norm": 0.3638845384120941, "learning_rate": 4.698259527934171e-05, "loss": 0.3593, "step": 4459000 }, { "epoch": 30.177430705933304, "grad_norm": 0.3781699538230896, "learning_rate": 4.698225692940667e-05, "loss": 0.3599, "step": 4459500 }, { "epoch": 30.180814205283674, "grad_norm": 0.38198304176330566, "learning_rate": 4.698191857947163e-05, "loss": 0.3609, "step": 4460000 }, { "epoch": 30.18419770463404, "grad_norm": 0.382424533367157, "learning_rate": 4.69815802295366e-05, "loss": 0.3601, "step": 4460500 }, { "epoch": 30.187581203984408, "grad_norm": 0.37011992931365967, "learning_rate": 4.698124187960156e-05, "loss": 0.3611, "step": 4461000 }, { "epoch": 30.190964703334778, "grad_norm": 0.4030992090702057, "learning_rate": 4.6980903529666525e-05, "loss": 0.3599, "step": 4461500 }, { "epoch": 30.194348202685145, "grad_norm": 0.380354642868042, "learning_rate": 4.698056517973149e-05, "loss": 0.3594, "step": 4462000 }, { "epoch": 30.19773170203551, "grad_norm": 0.36242347955703735, "learning_rate": 4.6980226829796456e-05, "loss": 0.3588, "step": 4462500 }, { "epoch": 30.201115201385882, "grad_norm": 0.4033394753932953, "learning_rate": 4.697988847986141e-05, "loss": 0.3617, "step": 4463000 }, { "epoch": 30.20449870073625, "grad_norm": 0.39557284116744995, "learning_rate": 4.6979550129926373e-05, "loss": 0.3606, "step": 4463500 }, { "epoch": 30.20788220008662, "grad_norm": 0.3634182810783386, "learning_rate": 4.6979211779991336e-05, "loss": 0.3591, "step": 4464000 }, { "epoch": 30.211265699436986, "grad_norm": 0.39265722036361694, "learning_rate": 4.6978873430056304e-05, "loss": 0.3611, "step": 4464500 }, { "epoch": 30.214649198787352, "grad_norm": 0.40822023153305054, "learning_rate": 4.6978535080121267e-05, "loss": 0.361, "step": 4465000 }, { "epoch": 30.218032698137723, "grad_norm": 0.41366010904312134, "learning_rate": 4.697819673018623e-05, "loss": 0.3597, "step": 4465500 }, { "epoch": 30.22141619748809, "grad_norm": 0.40065518021583557, "learning_rate": 4.697785838025119e-05, "loss": 0.3619, "step": 4466000 }, { "epoch": 30.22479969683846, "grad_norm": 0.39153268933296204, "learning_rate": 4.697752003031616e-05, "loss": 0.3602, "step": 4466500 }, { "epoch": 30.228183196188827, "grad_norm": 0.3661727011203766, "learning_rate": 4.697718168038112e-05, "loss": 0.3608, "step": 4467000 }, { "epoch": 30.231566695539193, "grad_norm": 0.3721162974834442, "learning_rate": 4.6976843330446084e-05, "loss": 0.3611, "step": 4467500 }, { "epoch": 30.234950194889564, "grad_norm": 0.35852909088134766, "learning_rate": 4.6976504980511046e-05, "loss": 0.3603, "step": 4468000 }, { "epoch": 30.23833369423993, "grad_norm": 0.37085092067718506, "learning_rate": 4.697616663057601e-05, "loss": 0.3613, "step": 4468500 }, { "epoch": 30.241717193590297, "grad_norm": 0.3549633324146271, "learning_rate": 4.697582828064097e-05, "loss": 0.3589, "step": 4469000 }, { "epoch": 30.245100692940667, "grad_norm": 0.423056036233902, "learning_rate": 4.697548993070593e-05, "loss": 0.3593, "step": 4469500 }, { "epoch": 30.248484192291034, "grad_norm": 0.37763503193855286, "learning_rate": 4.69751515807709e-05, "loss": 0.3592, "step": 4470000 }, { "epoch": 30.251867691641404, "grad_norm": 0.3798384964466095, "learning_rate": 4.6974813230835863e-05, "loss": 0.3593, "step": 4470500 }, { "epoch": 30.25525119099177, "grad_norm": 0.42569223046302795, "learning_rate": 4.6974474880900826e-05, "loss": 0.3603, "step": 4471000 }, { "epoch": 30.258634690342138, "grad_norm": 0.3724420666694641, "learning_rate": 4.697413653096579e-05, "loss": 0.3614, "step": 4471500 }, { "epoch": 30.26201818969251, "grad_norm": 0.3679376244544983, "learning_rate": 4.697379818103076e-05, "loss": 0.3603, "step": 4472000 }, { "epoch": 30.265401689042875, "grad_norm": 0.34181925654411316, "learning_rate": 4.697345983109571e-05, "loss": 0.3612, "step": 4472500 }, { "epoch": 30.268785188393245, "grad_norm": 0.4061616063117981, "learning_rate": 4.6973121481160674e-05, "loss": 0.3599, "step": 4473000 }, { "epoch": 30.272168687743612, "grad_norm": 0.4007011950016022, "learning_rate": 4.6972783131225636e-05, "loss": 0.3615, "step": 4473500 }, { "epoch": 30.27555218709398, "grad_norm": 0.31149446964263916, "learning_rate": 4.6972444781290605e-05, "loss": 0.3598, "step": 4474000 }, { "epoch": 30.27893568644435, "grad_norm": 0.3719044029712677, "learning_rate": 4.697210643135557e-05, "loss": 0.3603, "step": 4474500 }, { "epoch": 30.282319185794716, "grad_norm": 0.35385191440582275, "learning_rate": 4.697176808142053e-05, "loss": 0.36, "step": 4475000 }, { "epoch": 30.285702685145086, "grad_norm": 0.3812853693962097, "learning_rate": 4.697142973148549e-05, "loss": 0.3617, "step": 4475500 }, { "epoch": 30.289086184495453, "grad_norm": 0.3912808299064636, "learning_rate": 4.697109138155046e-05, "loss": 0.3614, "step": 4476000 }, { "epoch": 30.29246968384582, "grad_norm": 0.3804853856563568, "learning_rate": 4.697075303161542e-05, "loss": 0.3616, "step": 4476500 }, { "epoch": 30.29585318319619, "grad_norm": 0.32981404662132263, "learning_rate": 4.6970414681680385e-05, "loss": 0.3598, "step": 4477000 }, { "epoch": 30.299236682546557, "grad_norm": 0.3632580637931824, "learning_rate": 4.697007633174535e-05, "loss": 0.3613, "step": 4477500 }, { "epoch": 30.302620181896923, "grad_norm": 0.36564409732818604, "learning_rate": 4.696973798181031e-05, "loss": 0.3622, "step": 4478000 }, { "epoch": 30.306003681247294, "grad_norm": 0.4020627439022064, "learning_rate": 4.696939963187527e-05, "loss": 0.3601, "step": 4478500 }, { "epoch": 30.30938718059766, "grad_norm": 0.37020233273506165, "learning_rate": 4.696906128194023e-05, "loss": 0.3609, "step": 4479000 }, { "epoch": 30.31277067994803, "grad_norm": 0.38797226548194885, "learning_rate": 4.69687229320052e-05, "loss": 0.3603, "step": 4479500 }, { "epoch": 30.316154179298398, "grad_norm": 0.37550729513168335, "learning_rate": 4.6968384582070164e-05, "loss": 0.3603, "step": 4480000 }, { "epoch": 30.319537678648764, "grad_norm": 0.34766310453414917, "learning_rate": 4.6968046232135126e-05, "loss": 0.3604, "step": 4480500 }, { "epoch": 30.322921177999135, "grad_norm": 0.4076807498931885, "learning_rate": 4.696770788220009e-05, "loss": 0.36, "step": 4481000 }, { "epoch": 30.3263046773495, "grad_norm": 0.418477863073349, "learning_rate": 4.696736953226506e-05, "loss": 0.3617, "step": 4481500 }, { "epoch": 30.32968817669987, "grad_norm": 0.38338854908943176, "learning_rate": 4.696703118233002e-05, "loss": 0.3595, "step": 4482000 }, { "epoch": 30.33307167605024, "grad_norm": 0.37635135650634766, "learning_rate": 4.6966692832394975e-05, "loss": 0.3584, "step": 4482500 }, { "epoch": 30.336455175400605, "grad_norm": 0.38679662346839905, "learning_rate": 4.696635448245994e-05, "loss": 0.3599, "step": 4483000 }, { "epoch": 30.339838674750975, "grad_norm": 0.377916157245636, "learning_rate": 4.6966016132524906e-05, "loss": 0.3602, "step": 4483500 }, { "epoch": 30.343222174101342, "grad_norm": 0.38286900520324707, "learning_rate": 4.696567778258987e-05, "loss": 0.361, "step": 4484000 }, { "epoch": 30.346605673451712, "grad_norm": 0.372231662273407, "learning_rate": 4.696533943265483e-05, "loss": 0.3596, "step": 4484500 }, { "epoch": 30.34998917280208, "grad_norm": 0.3833298087120056, "learning_rate": 4.696500108271979e-05, "loss": 0.3601, "step": 4485000 }, { "epoch": 30.353372672152446, "grad_norm": 0.4139886498451233, "learning_rate": 4.696466273278476e-05, "loss": 0.3605, "step": 4485500 }, { "epoch": 30.356756171502816, "grad_norm": 0.34204918146133423, "learning_rate": 4.696432438284972e-05, "loss": 0.3613, "step": 4486000 }, { "epoch": 30.360139670853183, "grad_norm": 0.38752761483192444, "learning_rate": 4.6963986032914685e-05, "loss": 0.3615, "step": 4486500 }, { "epoch": 30.36352317020355, "grad_norm": 0.3860863447189331, "learning_rate": 4.696364768297965e-05, "loss": 0.3607, "step": 4487000 }, { "epoch": 30.36690666955392, "grad_norm": 0.4313901960849762, "learning_rate": 4.696330933304461e-05, "loss": 0.3608, "step": 4487500 }, { "epoch": 30.370290168904287, "grad_norm": 0.39515420794487, "learning_rate": 4.696297098310957e-05, "loss": 0.3607, "step": 4488000 }, { "epoch": 30.373673668254657, "grad_norm": 0.39293739199638367, "learning_rate": 4.6962632633174534e-05, "loss": 0.3601, "step": 4488500 }, { "epoch": 30.377057167605024, "grad_norm": 0.37145382165908813, "learning_rate": 4.69622942832395e-05, "loss": 0.3606, "step": 4489000 }, { "epoch": 30.38044066695539, "grad_norm": 0.4163252115249634, "learning_rate": 4.6961955933304465e-05, "loss": 0.3613, "step": 4489500 }, { "epoch": 30.38382416630576, "grad_norm": 0.34504857659339905, "learning_rate": 4.696161758336943e-05, "loss": 0.3605, "step": 4490000 }, { "epoch": 30.387207665656128, "grad_norm": 0.34117037057876587, "learning_rate": 4.696127923343439e-05, "loss": 0.3623, "step": 4490500 }, { "epoch": 30.390591165006498, "grad_norm": 0.40235698223114014, "learning_rate": 4.696094088349936e-05, "loss": 0.3616, "step": 4491000 }, { "epoch": 30.393974664356865, "grad_norm": 0.38580021262168884, "learning_rate": 4.696060253356432e-05, "loss": 0.3604, "step": 4491500 }, { "epoch": 30.39735816370723, "grad_norm": 0.36691147089004517, "learning_rate": 4.6960264183629275e-05, "loss": 0.3599, "step": 4492000 }, { "epoch": 30.4007416630576, "grad_norm": 0.39179420471191406, "learning_rate": 4.695992583369424e-05, "loss": 0.3618, "step": 4492500 }, { "epoch": 30.40412516240797, "grad_norm": 0.3824407756328583, "learning_rate": 4.6959587483759206e-05, "loss": 0.3598, "step": 4493000 }, { "epoch": 30.407508661758335, "grad_norm": 0.394069641828537, "learning_rate": 4.695924913382417e-05, "loss": 0.36, "step": 4493500 }, { "epoch": 30.410892161108706, "grad_norm": 0.3963245153427124, "learning_rate": 4.695891078388913e-05, "loss": 0.3608, "step": 4494000 }, { "epoch": 30.414275660459072, "grad_norm": 0.35123634338378906, "learning_rate": 4.695857243395409e-05, "loss": 0.3611, "step": 4494500 }, { "epoch": 30.417659159809443, "grad_norm": 0.39586952328681946, "learning_rate": 4.695823408401906e-05, "loss": 0.3626, "step": 4495000 }, { "epoch": 30.42104265915981, "grad_norm": 0.35105255246162415, "learning_rate": 4.6957895734084024e-05, "loss": 0.3599, "step": 4495500 }, { "epoch": 30.424426158510176, "grad_norm": 0.342883825302124, "learning_rate": 4.6957557384148986e-05, "loss": 0.36, "step": 4496000 }, { "epoch": 30.427809657860546, "grad_norm": 0.3733304440975189, "learning_rate": 4.695721903421395e-05, "loss": 0.3599, "step": 4496500 }, { "epoch": 30.431193157210913, "grad_norm": 0.3445548415184021, "learning_rate": 4.695688068427891e-05, "loss": 0.3603, "step": 4497000 }, { "epoch": 30.434576656561283, "grad_norm": 0.3503607213497162, "learning_rate": 4.695654233434387e-05, "loss": 0.3599, "step": 4497500 }, { "epoch": 30.43796015591165, "grad_norm": 0.3949844241142273, "learning_rate": 4.6956203984408834e-05, "loss": 0.3617, "step": 4498000 }, { "epoch": 30.441343655262017, "grad_norm": 0.40511298179626465, "learning_rate": 4.69558656344738e-05, "loss": 0.3609, "step": 4498500 }, { "epoch": 30.444727154612387, "grad_norm": 0.37571224570274353, "learning_rate": 4.6955527284538765e-05, "loss": 0.3585, "step": 4499000 }, { "epoch": 30.448110653962754, "grad_norm": 0.37550434470176697, "learning_rate": 4.695518893460373e-05, "loss": 0.3615, "step": 4499500 }, { "epoch": 30.451494153313124, "grad_norm": 0.4065387547016144, "learning_rate": 4.695485058466869e-05, "loss": 0.3602, "step": 4500000 }, { "epoch": 30.45487765266349, "grad_norm": 0.3657798171043396, "learning_rate": 4.695451223473366e-05, "loss": 0.3628, "step": 4500500 }, { "epoch": 30.458261152013858, "grad_norm": 0.402597039937973, "learning_rate": 4.695417388479862e-05, "loss": 0.3611, "step": 4501000 }, { "epoch": 30.461644651364228, "grad_norm": 0.3700020909309387, "learning_rate": 4.6953835534863576e-05, "loss": 0.3609, "step": 4501500 }, { "epoch": 30.465028150714595, "grad_norm": 0.37999027967453003, "learning_rate": 4.695349718492854e-05, "loss": 0.3611, "step": 4502000 }, { "epoch": 30.46841165006496, "grad_norm": 0.4107026755809784, "learning_rate": 4.695315883499351e-05, "loss": 0.3602, "step": 4502500 }, { "epoch": 30.471795149415332, "grad_norm": 0.3572571277618408, "learning_rate": 4.695282048505847e-05, "loss": 0.3605, "step": 4503000 }, { "epoch": 30.4751786487657, "grad_norm": 0.3318372368812561, "learning_rate": 4.695248213512343e-05, "loss": 0.3606, "step": 4503500 }, { "epoch": 30.47856214811607, "grad_norm": 0.38756465911865234, "learning_rate": 4.695214378518839e-05, "loss": 0.3615, "step": 4504000 }, { "epoch": 30.481945647466436, "grad_norm": 0.38692402839660645, "learning_rate": 4.695180543525336e-05, "loss": 0.3605, "step": 4504500 }, { "epoch": 30.485329146816802, "grad_norm": 0.3692699670791626, "learning_rate": 4.6951467085318324e-05, "loss": 0.3595, "step": 4505000 }, { "epoch": 30.488712646167173, "grad_norm": 0.3602524697780609, "learning_rate": 4.6951128735383287e-05, "loss": 0.3608, "step": 4505500 }, { "epoch": 30.49209614551754, "grad_norm": 0.3955193758010864, "learning_rate": 4.695079038544825e-05, "loss": 0.36, "step": 4506000 }, { "epoch": 30.49547964486791, "grad_norm": 0.37169477343559265, "learning_rate": 4.695045203551321e-05, "loss": 0.3605, "step": 4506500 }, { "epoch": 30.498863144218276, "grad_norm": 0.37480148673057556, "learning_rate": 4.695011368557817e-05, "loss": 0.3618, "step": 4507000 }, { "epoch": 30.502246643568643, "grad_norm": 0.36751168966293335, "learning_rate": 4.6949775335643135e-05, "loss": 0.3608, "step": 4507500 }, { "epoch": 30.505630142919014, "grad_norm": 0.311517596244812, "learning_rate": 4.6949436985708104e-05, "loss": 0.3602, "step": 4508000 }, { "epoch": 30.50901364226938, "grad_norm": 0.3592129349708557, "learning_rate": 4.6949098635773066e-05, "loss": 0.3604, "step": 4508500 }, { "epoch": 30.51239714161975, "grad_norm": 0.3845657706260681, "learning_rate": 4.694876028583803e-05, "loss": 0.3602, "step": 4509000 }, { "epoch": 30.515780640970117, "grad_norm": 0.37815576791763306, "learning_rate": 4.694842193590299e-05, "loss": 0.3603, "step": 4509500 }, { "epoch": 30.519164140320484, "grad_norm": 0.36821088194847107, "learning_rate": 4.694808358596795e-05, "loss": 0.3617, "step": 4510000 }, { "epoch": 30.522547639670854, "grad_norm": 0.3512398600578308, "learning_rate": 4.694774523603292e-05, "loss": 0.3624, "step": 4510500 }, { "epoch": 30.52593113902122, "grad_norm": 0.30272915959358215, "learning_rate": 4.694740688609788e-05, "loss": 0.3631, "step": 4511000 }, { "epoch": 30.529314638371588, "grad_norm": 0.37545034289360046, "learning_rate": 4.694706853616284e-05, "loss": 0.36, "step": 4511500 }, { "epoch": 30.532698137721958, "grad_norm": 0.3488900363445282, "learning_rate": 4.694673018622781e-05, "loss": 0.361, "step": 4512000 }, { "epoch": 30.536081637072325, "grad_norm": 0.3494974672794342, "learning_rate": 4.694639183629277e-05, "loss": 0.3613, "step": 4512500 }, { "epoch": 30.539465136422695, "grad_norm": 0.37769636511802673, "learning_rate": 4.694605348635773e-05, "loss": 0.359, "step": 4513000 }, { "epoch": 30.542848635773062, "grad_norm": 0.3510251045227051, "learning_rate": 4.6945715136422694e-05, "loss": 0.3607, "step": 4513500 }, { "epoch": 30.54623213512343, "grad_norm": 0.43237999081611633, "learning_rate": 4.694537678648766e-05, "loss": 0.3607, "step": 4514000 }, { "epoch": 30.5496156344738, "grad_norm": 0.40540429949760437, "learning_rate": 4.6945038436552625e-05, "loss": 0.3607, "step": 4514500 }, { "epoch": 30.552999133824166, "grad_norm": 0.392507940530777, "learning_rate": 4.694470008661759e-05, "loss": 0.3608, "step": 4515000 }, { "epoch": 30.556382633174536, "grad_norm": 0.403692364692688, "learning_rate": 4.694436173668255e-05, "loss": 0.3608, "step": 4515500 }, { "epoch": 30.559766132524903, "grad_norm": 0.3994150459766388, "learning_rate": 4.694402338674751e-05, "loss": 0.3625, "step": 4516000 }, { "epoch": 30.56314963187527, "grad_norm": 0.3970039486885071, "learning_rate": 4.6943685036812474e-05, "loss": 0.3603, "step": 4516500 }, { "epoch": 30.56653313122564, "grad_norm": 0.35074129700660706, "learning_rate": 4.6943346686877436e-05, "loss": 0.3609, "step": 4517000 }, { "epoch": 30.569916630576007, "grad_norm": 0.3436708152294159, "learning_rate": 4.6943008336942405e-05, "loss": 0.3615, "step": 4517500 }, { "epoch": 30.573300129926373, "grad_norm": 0.3597992956638336, "learning_rate": 4.694266998700737e-05, "loss": 0.3614, "step": 4518000 }, { "epoch": 30.576683629276744, "grad_norm": 0.40081173181533813, "learning_rate": 4.694233163707233e-05, "loss": 0.362, "step": 4518500 }, { "epoch": 30.58006712862711, "grad_norm": 0.41162216663360596, "learning_rate": 4.694199328713729e-05, "loss": 0.3607, "step": 4519000 }, { "epoch": 30.58345062797748, "grad_norm": 0.3673850893974304, "learning_rate": 4.694165493720225e-05, "loss": 0.3613, "step": 4519500 }, { "epoch": 30.586834127327847, "grad_norm": 0.3754594326019287, "learning_rate": 4.694131658726722e-05, "loss": 0.3604, "step": 4520000 }, { "epoch": 30.590217626678214, "grad_norm": 0.3868243992328644, "learning_rate": 4.694097823733218e-05, "loss": 0.3605, "step": 4520500 }, { "epoch": 30.593601126028585, "grad_norm": 0.3701590299606323, "learning_rate": 4.694063988739714e-05, "loss": 0.3587, "step": 4521000 }, { "epoch": 30.59698462537895, "grad_norm": 0.3930908739566803, "learning_rate": 4.694030153746211e-05, "loss": 0.3594, "step": 4521500 }, { "epoch": 30.60036812472932, "grad_norm": 0.39562007784843445, "learning_rate": 4.693996318752707e-05, "loss": 0.3603, "step": 4522000 }, { "epoch": 30.60375162407969, "grad_norm": 0.3444449305534363, "learning_rate": 4.693962483759203e-05, "loss": 0.3606, "step": 4522500 }, { "epoch": 30.607135123430055, "grad_norm": 0.38593870401382446, "learning_rate": 4.6939286487656995e-05, "loss": 0.3598, "step": 4523000 }, { "epoch": 30.610518622780425, "grad_norm": 0.3739345967769623, "learning_rate": 4.6938948137721964e-05, "loss": 0.3602, "step": 4523500 }, { "epoch": 30.613902122130792, "grad_norm": 0.3817853629589081, "learning_rate": 4.6938609787786926e-05, "loss": 0.3609, "step": 4524000 }, { "epoch": 30.617285621481162, "grad_norm": 0.39146387577056885, "learning_rate": 4.693827143785189e-05, "loss": 0.3619, "step": 4524500 }, { "epoch": 30.62066912083153, "grad_norm": 0.39248788356781006, "learning_rate": 4.693793308791685e-05, "loss": 0.3594, "step": 4525000 }, { "epoch": 30.624052620181896, "grad_norm": 0.4058881103992462, "learning_rate": 4.693759473798181e-05, "loss": 0.3597, "step": 4525500 }, { "epoch": 30.627436119532266, "grad_norm": 0.39480721950531006, "learning_rate": 4.6937256388046774e-05, "loss": 0.3609, "step": 4526000 }, { "epoch": 30.630819618882633, "grad_norm": 0.3836055099964142, "learning_rate": 4.6936918038111736e-05, "loss": 0.3614, "step": 4526500 }, { "epoch": 30.634203118233, "grad_norm": 0.42320966720581055, "learning_rate": 4.69365796881767e-05, "loss": 0.3606, "step": 4527000 }, { "epoch": 30.63758661758337, "grad_norm": 0.38261085748672485, "learning_rate": 4.693624133824167e-05, "loss": 0.3593, "step": 4527500 }, { "epoch": 30.640970116933737, "grad_norm": 0.371084600687027, "learning_rate": 4.693590298830663e-05, "loss": 0.3598, "step": 4528000 }, { "epoch": 30.644353616284107, "grad_norm": 0.3737763464450836, "learning_rate": 4.693556463837159e-05, "loss": 0.3616, "step": 4528500 }, { "epoch": 30.647737115634474, "grad_norm": 0.34043997526168823, "learning_rate": 4.6935226288436554e-05, "loss": 0.3604, "step": 4529000 }, { "epoch": 30.65112061498484, "grad_norm": 0.37720245122909546, "learning_rate": 4.693488793850152e-05, "loss": 0.3594, "step": 4529500 }, { "epoch": 30.65450411433521, "grad_norm": 0.41709980368614197, "learning_rate": 4.693454958856648e-05, "loss": 0.362, "step": 4530000 }, { "epoch": 30.657887613685578, "grad_norm": 0.336330771446228, "learning_rate": 4.693421123863144e-05, "loss": 0.3612, "step": 4530500 }, { "epoch": 30.661271113035948, "grad_norm": 0.36730971932411194, "learning_rate": 4.693387288869641e-05, "loss": 0.3578, "step": 4531000 }, { "epoch": 30.664654612386315, "grad_norm": 0.3970741629600525, "learning_rate": 4.693353453876137e-05, "loss": 0.3613, "step": 4531500 }, { "epoch": 30.66803811173668, "grad_norm": 0.4086032509803772, "learning_rate": 4.693319618882633e-05, "loss": 0.3595, "step": 4532000 }, { "epoch": 30.67142161108705, "grad_norm": 0.4054871201515198, "learning_rate": 4.6932857838891295e-05, "loss": 0.3606, "step": 4532500 }, { "epoch": 30.67480511043742, "grad_norm": 0.3702140152454376, "learning_rate": 4.6932519488956264e-05, "loss": 0.3601, "step": 4533000 }, { "epoch": 30.67818860978779, "grad_norm": 0.3790774941444397, "learning_rate": 4.6932181139021226e-05, "loss": 0.3623, "step": 4533500 }, { "epoch": 30.681572109138155, "grad_norm": 0.3950534760951996, "learning_rate": 4.693184278908619e-05, "loss": 0.3607, "step": 4534000 }, { "epoch": 30.684955608488522, "grad_norm": 0.3758167326450348, "learning_rate": 4.693150443915115e-05, "loss": 0.3599, "step": 4534500 }, { "epoch": 30.688339107838893, "grad_norm": 0.41029444336891174, "learning_rate": 4.693116608921611e-05, "loss": 0.3598, "step": 4535000 }, { "epoch": 30.69172260718926, "grad_norm": 0.39428913593292236, "learning_rate": 4.6930827739281075e-05, "loss": 0.36, "step": 4535500 }, { "epoch": 30.695106106539626, "grad_norm": 0.33379751443862915, "learning_rate": 4.693048938934604e-05, "loss": 0.36, "step": 4536000 }, { "epoch": 30.698489605889996, "grad_norm": 0.38447538018226624, "learning_rate": 4.6930151039411e-05, "loss": 0.3615, "step": 4536500 }, { "epoch": 30.701873105240363, "grad_norm": 0.3635065257549286, "learning_rate": 4.692981268947597e-05, "loss": 0.361, "step": 4537000 }, { "epoch": 30.705256604590733, "grad_norm": 0.3788832724094391, "learning_rate": 4.692947433954093e-05, "loss": 0.3614, "step": 4537500 }, { "epoch": 30.7086401039411, "grad_norm": 0.3611948788166046, "learning_rate": 4.692913598960589e-05, "loss": 0.362, "step": 4538000 }, { "epoch": 30.712023603291467, "grad_norm": 0.3582962155342102, "learning_rate": 4.6928797639670854e-05, "loss": 0.3599, "step": 4538500 }, { "epoch": 30.715407102641837, "grad_norm": 0.3849903643131256, "learning_rate": 4.692845928973582e-05, "loss": 0.3603, "step": 4539000 }, { "epoch": 30.718790601992204, "grad_norm": 0.38260653614997864, "learning_rate": 4.692812093980078e-05, "loss": 0.3601, "step": 4539500 }, { "epoch": 30.722174101342574, "grad_norm": 0.3879907727241516, "learning_rate": 4.692778258986574e-05, "loss": 0.3619, "step": 4540000 }, { "epoch": 30.72555760069294, "grad_norm": 0.3873540163040161, "learning_rate": 4.692744423993071e-05, "loss": 0.3615, "step": 4540500 }, { "epoch": 30.728941100043308, "grad_norm": 0.3742757737636566, "learning_rate": 4.692710588999567e-05, "loss": 0.3597, "step": 4541000 }, { "epoch": 30.732324599393678, "grad_norm": 0.39747047424316406, "learning_rate": 4.6926767540060634e-05, "loss": 0.3603, "step": 4541500 }, { "epoch": 30.735708098744045, "grad_norm": 0.3961320221424103, "learning_rate": 4.6926429190125596e-05, "loss": 0.3619, "step": 4542000 }, { "epoch": 30.73909159809441, "grad_norm": 0.38149750232696533, "learning_rate": 4.6926090840190565e-05, "loss": 0.3608, "step": 4542500 }, { "epoch": 30.742475097444782, "grad_norm": 0.4243999719619751, "learning_rate": 4.692575249025553e-05, "loss": 0.36, "step": 4543000 }, { "epoch": 30.74585859679515, "grad_norm": 0.36307740211486816, "learning_rate": 4.692541414032049e-05, "loss": 0.3607, "step": 4543500 }, { "epoch": 30.74924209614552, "grad_norm": 0.3696213960647583, "learning_rate": 4.692507579038545e-05, "loss": 0.3623, "step": 4544000 }, { "epoch": 30.752625595495886, "grad_norm": 0.34193122386932373, "learning_rate": 4.692473744045041e-05, "loss": 0.3615, "step": 4544500 }, { "epoch": 30.756009094846252, "grad_norm": 0.3728967607021332, "learning_rate": 4.6924399090515375e-05, "loss": 0.3598, "step": 4545000 }, { "epoch": 30.759392594196623, "grad_norm": 0.3507138788700104, "learning_rate": 4.692406074058034e-05, "loss": 0.3605, "step": 4545500 }, { "epoch": 30.76277609354699, "grad_norm": 0.36319103837013245, "learning_rate": 4.69237223906453e-05, "loss": 0.3617, "step": 4546000 }, { "epoch": 30.76615959289736, "grad_norm": 0.3437182307243347, "learning_rate": 4.692338404071027e-05, "loss": 0.362, "step": 4546500 }, { "epoch": 30.769543092247726, "grad_norm": 0.3841271996498108, "learning_rate": 4.692304569077523e-05, "loss": 0.3605, "step": 4547000 }, { "epoch": 30.772926591598093, "grad_norm": 0.3654472529888153, "learning_rate": 4.692270734084019e-05, "loss": 0.3614, "step": 4547500 }, { "epoch": 30.776310090948463, "grad_norm": 0.3569715917110443, "learning_rate": 4.6922368990905155e-05, "loss": 0.3613, "step": 4548000 }, { "epoch": 30.77969359029883, "grad_norm": 0.37520235776901245, "learning_rate": 4.6922030640970124e-05, "loss": 0.3602, "step": 4548500 }, { "epoch": 30.783077089649197, "grad_norm": 0.3281329870223999, "learning_rate": 4.692169229103508e-05, "loss": 0.3602, "step": 4549000 }, { "epoch": 30.786460588999567, "grad_norm": 0.4030281603336334, "learning_rate": 4.692135394110004e-05, "loss": 0.3609, "step": 4549500 }, { "epoch": 30.789844088349934, "grad_norm": 0.40497538447380066, "learning_rate": 4.692101559116501e-05, "loss": 0.362, "step": 4550000 }, { "epoch": 30.793227587700304, "grad_norm": 0.3655402362346649, "learning_rate": 4.692067724122997e-05, "loss": 0.3611, "step": 4550500 }, { "epoch": 30.79661108705067, "grad_norm": 0.37955793738365173, "learning_rate": 4.6920338891294934e-05, "loss": 0.3622, "step": 4551000 }, { "epoch": 30.799994586401038, "grad_norm": 0.36361464858055115, "learning_rate": 4.6920000541359897e-05, "loss": 0.3599, "step": 4551500 }, { "epoch": 30.803378085751408, "grad_norm": 0.3851824998855591, "learning_rate": 4.6919662191424865e-05, "loss": 0.3619, "step": 4552000 }, { "epoch": 30.806761585101775, "grad_norm": 0.3443010449409485, "learning_rate": 4.691932384148983e-05, "loss": 0.3598, "step": 4552500 }, { "epoch": 30.810145084452145, "grad_norm": 0.35350069403648376, "learning_rate": 4.691898549155479e-05, "loss": 0.3599, "step": 4553000 }, { "epoch": 30.813528583802512, "grad_norm": 0.36163684725761414, "learning_rate": 4.691864714161975e-05, "loss": 0.362, "step": 4553500 }, { "epoch": 30.81691208315288, "grad_norm": 0.3298378586769104, "learning_rate": 4.6918308791684714e-05, "loss": 0.3609, "step": 4554000 }, { "epoch": 30.82029558250325, "grad_norm": 0.4053889811038971, "learning_rate": 4.6917970441749676e-05, "loss": 0.3603, "step": 4554500 }, { "epoch": 30.823679081853616, "grad_norm": 0.40790021419525146, "learning_rate": 4.691763209181464e-05, "loss": 0.3612, "step": 4555000 }, { "epoch": 30.827062581203986, "grad_norm": 0.373309850692749, "learning_rate": 4.69172937418796e-05, "loss": 0.3604, "step": 4555500 }, { "epoch": 30.830446080554353, "grad_norm": 0.3495636582374573, "learning_rate": 4.691695539194457e-05, "loss": 0.3601, "step": 4556000 }, { "epoch": 30.83382957990472, "grad_norm": 0.4138449728488922, "learning_rate": 4.691661704200953e-05, "loss": 0.3625, "step": 4556500 }, { "epoch": 30.83721307925509, "grad_norm": 0.4141077995300293, "learning_rate": 4.6916278692074493e-05, "loss": 0.361, "step": 4557000 }, { "epoch": 30.840596578605457, "grad_norm": 0.42484527826309204, "learning_rate": 4.6915940342139456e-05, "loss": 0.3623, "step": 4557500 }, { "epoch": 30.843980077955827, "grad_norm": 0.3764879107475281, "learning_rate": 4.6915601992204424e-05, "loss": 0.3611, "step": 4558000 }, { "epoch": 30.847363577306194, "grad_norm": 0.39300256967544556, "learning_rate": 4.691526364226938e-05, "loss": 0.3597, "step": 4558500 }, { "epoch": 30.85074707665656, "grad_norm": 0.34704139828681946, "learning_rate": 4.691492529233434e-05, "loss": 0.3596, "step": 4559000 }, { "epoch": 30.85413057600693, "grad_norm": 0.3790600895881653, "learning_rate": 4.691458694239931e-05, "loss": 0.36, "step": 4559500 }, { "epoch": 30.857514075357297, "grad_norm": 0.3805277347564697, "learning_rate": 4.691424859246427e-05, "loss": 0.3623, "step": 4560000 }, { "epoch": 30.860897574707664, "grad_norm": 0.4104638993740082, "learning_rate": 4.6913910242529235e-05, "loss": 0.3611, "step": 4560500 }, { "epoch": 30.864281074058034, "grad_norm": 0.40832528471946716, "learning_rate": 4.69135718925942e-05, "loss": 0.3614, "step": 4561000 }, { "epoch": 30.8676645734084, "grad_norm": 0.362267404794693, "learning_rate": 4.6913233542659166e-05, "loss": 0.3608, "step": 4561500 }, { "epoch": 30.87104807275877, "grad_norm": 0.34720054268836975, "learning_rate": 4.691289519272413e-05, "loss": 0.3592, "step": 4562000 }, { "epoch": 30.87443157210914, "grad_norm": 0.4220869541168213, "learning_rate": 4.691255684278909e-05, "loss": 0.3618, "step": 4562500 }, { "epoch": 30.877815071459505, "grad_norm": 0.3882518708705902, "learning_rate": 4.691221849285405e-05, "loss": 0.3594, "step": 4563000 }, { "epoch": 30.881198570809875, "grad_norm": 0.39041945338249207, "learning_rate": 4.6911880142919015e-05, "loss": 0.3616, "step": 4563500 }, { "epoch": 30.884582070160242, "grad_norm": 0.40342891216278076, "learning_rate": 4.691154179298398e-05, "loss": 0.3609, "step": 4564000 }, { "epoch": 30.887965569510612, "grad_norm": 0.3788714110851288, "learning_rate": 4.691120344304894e-05, "loss": 0.3605, "step": 4564500 }, { "epoch": 30.89134906886098, "grad_norm": 0.37482404708862305, "learning_rate": 4.69108650931139e-05, "loss": 0.3606, "step": 4565000 }, { "epoch": 30.894732568211346, "grad_norm": 0.3604934513568878, "learning_rate": 4.691052674317887e-05, "loss": 0.3619, "step": 4565500 }, { "epoch": 30.898116067561716, "grad_norm": 0.38737213611602783, "learning_rate": 4.691018839324383e-05, "loss": 0.3614, "step": 4566000 }, { "epoch": 30.901499566912083, "grad_norm": 0.37785857915878296, "learning_rate": 4.6909850043308794e-05, "loss": 0.3608, "step": 4566500 }, { "epoch": 30.90488306626245, "grad_norm": 0.39249515533447266, "learning_rate": 4.6909511693373756e-05, "loss": 0.3614, "step": 4567000 }, { "epoch": 30.90826656561282, "grad_norm": 0.3591851592063904, "learning_rate": 4.6909173343438725e-05, "loss": 0.3613, "step": 4567500 }, { "epoch": 30.911650064963187, "grad_norm": 0.3635176420211792, "learning_rate": 4.690883499350368e-05, "loss": 0.3601, "step": 4568000 }, { "epoch": 30.915033564313557, "grad_norm": 0.3742623031139374, "learning_rate": 4.690849664356864e-05, "loss": 0.3604, "step": 4568500 }, { "epoch": 30.918417063663924, "grad_norm": 0.3985554277896881, "learning_rate": 4.690815829363361e-05, "loss": 0.3606, "step": 4569000 }, { "epoch": 30.92180056301429, "grad_norm": 0.3568021059036255, "learning_rate": 4.6907819943698574e-05, "loss": 0.3616, "step": 4569500 }, { "epoch": 30.92518406236466, "grad_norm": 0.4350185692310333, "learning_rate": 4.6907481593763536e-05, "loss": 0.3606, "step": 4570000 }, { "epoch": 30.928567561715028, "grad_norm": 0.39672911167144775, "learning_rate": 4.69071432438285e-05, "loss": 0.3609, "step": 4570500 }, { "epoch": 30.931951061065398, "grad_norm": 0.34133613109588623, "learning_rate": 4.690680489389347e-05, "loss": 0.361, "step": 4571000 }, { "epoch": 30.935334560415765, "grad_norm": 0.37614962458610535, "learning_rate": 4.690646654395843e-05, "loss": 0.3605, "step": 4571500 }, { "epoch": 30.93871805976613, "grad_norm": 0.3928428888320923, "learning_rate": 4.690612819402339e-05, "loss": 0.3598, "step": 4572000 }, { "epoch": 30.9421015591165, "grad_norm": 0.37217843532562256, "learning_rate": 4.690578984408835e-05, "loss": 0.3615, "step": 4572500 }, { "epoch": 30.94548505846687, "grad_norm": 0.3554867208003998, "learning_rate": 4.6905451494153315e-05, "loss": 0.3617, "step": 4573000 }, { "epoch": 30.948868557817235, "grad_norm": 0.3619111180305481, "learning_rate": 4.690511314421828e-05, "loss": 0.3578, "step": 4573500 }, { "epoch": 30.952252057167605, "grad_norm": 0.3643021583557129, "learning_rate": 4.690477479428324e-05, "loss": 0.3602, "step": 4574000 }, { "epoch": 30.955635556517972, "grad_norm": 0.3516826927661896, "learning_rate": 4.69044364443482e-05, "loss": 0.3608, "step": 4574500 }, { "epoch": 30.959019055868342, "grad_norm": 0.40577948093414307, "learning_rate": 4.690409809441317e-05, "loss": 0.361, "step": 4575000 }, { "epoch": 30.96240255521871, "grad_norm": 0.382333368062973, "learning_rate": 4.690375974447813e-05, "loss": 0.3602, "step": 4575500 }, { "epoch": 30.965786054569076, "grad_norm": 0.36798179149627686, "learning_rate": 4.6903421394543095e-05, "loss": 0.3614, "step": 4576000 }, { "epoch": 30.969169553919446, "grad_norm": 0.3688673973083496, "learning_rate": 4.690308304460806e-05, "loss": 0.3603, "step": 4576500 }, { "epoch": 30.972553053269813, "grad_norm": 0.3547976016998291, "learning_rate": 4.6902744694673026e-05, "loss": 0.3595, "step": 4577000 }, { "epoch": 30.975936552620183, "grad_norm": 0.3677893280982971, "learning_rate": 4.690240634473798e-05, "loss": 0.3605, "step": 4577500 }, { "epoch": 30.97932005197055, "grad_norm": 0.3358895480632782, "learning_rate": 4.690206799480294e-05, "loss": 0.3603, "step": 4578000 }, { "epoch": 30.982703551320917, "grad_norm": 0.3861973285675049, "learning_rate": 4.690172964486791e-05, "loss": 0.3606, "step": 4578500 }, { "epoch": 30.986087050671287, "grad_norm": 0.43521299958229065, "learning_rate": 4.6901391294932874e-05, "loss": 0.3597, "step": 4579000 }, { "epoch": 30.989470550021654, "grad_norm": 0.3649062216281891, "learning_rate": 4.6901052944997836e-05, "loss": 0.3608, "step": 4579500 }, { "epoch": 30.992854049372024, "grad_norm": 0.3704669177532196, "learning_rate": 4.69007145950628e-05, "loss": 0.3602, "step": 4580000 }, { "epoch": 30.99623754872239, "grad_norm": 0.3746252655982971, "learning_rate": 4.690037624512777e-05, "loss": 0.3623, "step": 4580500 }, { "epoch": 30.999621048072758, "grad_norm": 0.38075169920921326, "learning_rate": 4.690003789519273e-05, "loss": 0.36, "step": 4581000 }, { "epoch": 31.0, "eval_accuracy": 0.8624102377023692, "eval_loss": 0.5580710172653198, "eval_runtime": 3344.7912, "eval_samples_per_second": 86.924, "eval_steps_per_second": 5.433, "step": 4581056 }, { "epoch": 31.003004547423128, "grad_norm": 0.3490939438343048, "learning_rate": 4.689969954525769e-05, "loss": 0.3585, "step": 4581500 }, { "epoch": 31.006388046773495, "grad_norm": 0.34328708052635193, "learning_rate": 4.6899361195322654e-05, "loss": 0.3583, "step": 4582000 }, { "epoch": 31.00977154612386, "grad_norm": 0.37582507729530334, "learning_rate": 4.6899022845387616e-05, "loss": 0.3595, "step": 4582500 }, { "epoch": 31.01315504547423, "grad_norm": 0.36721697449684143, "learning_rate": 4.689868449545258e-05, "loss": 0.3587, "step": 4583000 }, { "epoch": 31.0165385448246, "grad_norm": 0.3577655255794525, "learning_rate": 4.689834614551754e-05, "loss": 0.3561, "step": 4583500 }, { "epoch": 31.01992204417497, "grad_norm": 0.37689751386642456, "learning_rate": 4.68980077955825e-05, "loss": 0.3586, "step": 4584000 }, { "epoch": 31.023305543525336, "grad_norm": 0.3436678349971771, "learning_rate": 4.689766944564747e-05, "loss": 0.3581, "step": 4584500 }, { "epoch": 31.026689042875702, "grad_norm": 0.37823644280433655, "learning_rate": 4.689733109571243e-05, "loss": 0.3589, "step": 4585000 }, { "epoch": 31.030072542226073, "grad_norm": 0.4318449795246124, "learning_rate": 4.6896992745777395e-05, "loss": 0.3594, "step": 4585500 }, { "epoch": 31.03345604157644, "grad_norm": 0.38328883051872253, "learning_rate": 4.689665439584236e-05, "loss": 0.3596, "step": 4586000 }, { "epoch": 31.03683954092681, "grad_norm": 0.3845657706260681, "learning_rate": 4.6896316045907326e-05, "loss": 0.3585, "step": 4586500 }, { "epoch": 31.040223040277176, "grad_norm": 0.4173922538757324, "learning_rate": 4.689597769597228e-05, "loss": 0.3592, "step": 4587000 }, { "epoch": 31.043606539627543, "grad_norm": 0.4044972360134125, "learning_rate": 4.6895639346037244e-05, "loss": 0.3597, "step": 4587500 }, { "epoch": 31.046990038977913, "grad_norm": 0.3396126925945282, "learning_rate": 4.689530099610221e-05, "loss": 0.3581, "step": 4588000 }, { "epoch": 31.05037353832828, "grad_norm": 0.3836978077888489, "learning_rate": 4.6894962646167175e-05, "loss": 0.3594, "step": 4588500 }, { "epoch": 31.05375703767865, "grad_norm": 0.3947596251964569, "learning_rate": 4.689462429623214e-05, "loss": 0.359, "step": 4589000 }, { "epoch": 31.057140537029017, "grad_norm": 0.3396083414554596, "learning_rate": 4.68942859462971e-05, "loss": 0.3609, "step": 4589500 }, { "epoch": 31.060524036379384, "grad_norm": 0.3815324902534485, "learning_rate": 4.689394759636206e-05, "loss": 0.359, "step": 4590000 }, { "epoch": 31.063907535729754, "grad_norm": 0.40731295943260193, "learning_rate": 4.689360924642703e-05, "loss": 0.3594, "step": 4590500 }, { "epoch": 31.06729103508012, "grad_norm": 0.40843304991722107, "learning_rate": 4.689327089649199e-05, "loss": 0.3592, "step": 4591000 }, { "epoch": 31.070674534430488, "grad_norm": 0.39855343103408813, "learning_rate": 4.6892932546556954e-05, "loss": 0.359, "step": 4591500 }, { "epoch": 31.074058033780858, "grad_norm": 0.4003579318523407, "learning_rate": 4.6892594196621916e-05, "loss": 0.3581, "step": 4592000 }, { "epoch": 31.077441533131225, "grad_norm": 0.4000629484653473, "learning_rate": 4.689225584668688e-05, "loss": 0.3591, "step": 4592500 }, { "epoch": 31.080825032481595, "grad_norm": 0.40287065505981445, "learning_rate": 4.689191749675184e-05, "loss": 0.3597, "step": 4593000 }, { "epoch": 31.084208531831962, "grad_norm": 0.3625352680683136, "learning_rate": 4.68915791468168e-05, "loss": 0.36, "step": 4593500 }, { "epoch": 31.08759203118233, "grad_norm": 0.40282487869262695, "learning_rate": 4.689124079688177e-05, "loss": 0.359, "step": 4594000 }, { "epoch": 31.0909755305327, "grad_norm": 0.37409570813179016, "learning_rate": 4.6890902446946734e-05, "loss": 0.3595, "step": 4594500 }, { "epoch": 31.094359029883066, "grad_norm": 0.33406898379325867, "learning_rate": 4.6890564097011696e-05, "loss": 0.3591, "step": 4595000 }, { "epoch": 31.097742529233436, "grad_norm": 0.3854425251483917, "learning_rate": 4.689022574707666e-05, "loss": 0.3594, "step": 4595500 }, { "epoch": 31.101126028583803, "grad_norm": 0.402474582195282, "learning_rate": 4.688988739714163e-05, "loss": 0.3604, "step": 4596000 }, { "epoch": 31.10450952793417, "grad_norm": 0.38899245858192444, "learning_rate": 4.688954904720659e-05, "loss": 0.3606, "step": 4596500 }, { "epoch": 31.10789302728454, "grad_norm": 0.4109959304332733, "learning_rate": 4.6889210697271544e-05, "loss": 0.3592, "step": 4597000 }, { "epoch": 31.111276526634907, "grad_norm": 0.3785685896873474, "learning_rate": 4.6888872347336507e-05, "loss": 0.3604, "step": 4597500 }, { "epoch": 31.114660025985273, "grad_norm": 0.3987598121166229, "learning_rate": 4.6888533997401475e-05, "loss": 0.3601, "step": 4598000 }, { "epoch": 31.118043525335644, "grad_norm": 0.38963502645492554, "learning_rate": 4.688819564746644e-05, "loss": 0.3597, "step": 4598500 }, { "epoch": 31.12142702468601, "grad_norm": 0.38518026471138, "learning_rate": 4.68878572975314e-05, "loss": 0.3609, "step": 4599000 }, { "epoch": 31.12481052403638, "grad_norm": 0.4085533022880554, "learning_rate": 4.688751894759636e-05, "loss": 0.3602, "step": 4599500 }, { "epoch": 31.128194023386747, "grad_norm": 0.35115405917167664, "learning_rate": 4.688718059766133e-05, "loss": 0.3591, "step": 4600000 }, { "epoch": 31.131577522737114, "grad_norm": 0.3623368442058563, "learning_rate": 4.688684224772629e-05, "loss": 0.36, "step": 4600500 }, { "epoch": 31.134961022087484, "grad_norm": 0.36518803238868713, "learning_rate": 4.6886503897791255e-05, "loss": 0.3588, "step": 4601000 }, { "epoch": 31.13834452143785, "grad_norm": 0.40113335847854614, "learning_rate": 4.688616554785622e-05, "loss": 0.3592, "step": 4601500 }, { "epoch": 31.14172802078822, "grad_norm": 0.35938259959220886, "learning_rate": 4.688582719792118e-05, "loss": 0.3597, "step": 4602000 }, { "epoch": 31.145111520138588, "grad_norm": 0.3931177258491516, "learning_rate": 4.688548884798614e-05, "loss": 0.3592, "step": 4602500 }, { "epoch": 31.148495019488955, "grad_norm": 0.41567766666412354, "learning_rate": 4.6885150498051103e-05, "loss": 0.3603, "step": 4603000 }, { "epoch": 31.151878518839325, "grad_norm": 0.4345746338367462, "learning_rate": 4.688481214811607e-05, "loss": 0.3614, "step": 4603500 }, { "epoch": 31.155262018189692, "grad_norm": 0.36789408326148987, "learning_rate": 4.6884473798181034e-05, "loss": 0.3604, "step": 4604000 }, { "epoch": 31.158645517540062, "grad_norm": 0.3862924575805664, "learning_rate": 4.6884135448245997e-05, "loss": 0.3582, "step": 4604500 }, { "epoch": 31.16202901689043, "grad_norm": 0.384548544883728, "learning_rate": 4.688379709831096e-05, "loss": 0.3579, "step": 4605000 }, { "epoch": 31.165412516240796, "grad_norm": 0.3928464353084564, "learning_rate": 4.688345874837593e-05, "loss": 0.3612, "step": 4605500 }, { "epoch": 31.168796015591166, "grad_norm": 0.36854758858680725, "learning_rate": 4.688312039844089e-05, "loss": 0.3608, "step": 4606000 }, { "epoch": 31.172179514941533, "grad_norm": 0.36410751938819885, "learning_rate": 4.6882782048505845e-05, "loss": 0.3594, "step": 4606500 }, { "epoch": 31.1755630142919, "grad_norm": 0.3804601728916168, "learning_rate": 4.688244369857081e-05, "loss": 0.3597, "step": 4607000 }, { "epoch": 31.17894651364227, "grad_norm": 0.36085045337677, "learning_rate": 4.6882105348635776e-05, "loss": 0.3607, "step": 4607500 }, { "epoch": 31.182330012992637, "grad_norm": 0.3745473325252533, "learning_rate": 4.688176699870074e-05, "loss": 0.3591, "step": 4608000 }, { "epoch": 31.185713512343007, "grad_norm": 0.39781248569488525, "learning_rate": 4.68814286487657e-05, "loss": 0.3596, "step": 4608500 }, { "epoch": 31.189097011693374, "grad_norm": 0.39101073145866394, "learning_rate": 4.688109029883066e-05, "loss": 0.3619, "step": 4609000 }, { "epoch": 31.19248051104374, "grad_norm": 0.37476101517677307, "learning_rate": 4.688075194889563e-05, "loss": 0.3585, "step": 4609500 }, { "epoch": 31.19586401039411, "grad_norm": 0.39833685755729675, "learning_rate": 4.6880413598960593e-05, "loss": 0.36, "step": 4610000 }, { "epoch": 31.199247509744477, "grad_norm": 0.39467480778694153, "learning_rate": 4.6880075249025556e-05, "loss": 0.359, "step": 4610500 }, { "epoch": 31.202631009094848, "grad_norm": 0.3747001588344574, "learning_rate": 4.687973689909052e-05, "loss": 0.3619, "step": 4611000 }, { "epoch": 31.206014508445215, "grad_norm": 0.4207741916179657, "learning_rate": 4.687939854915548e-05, "loss": 0.3614, "step": 4611500 }, { "epoch": 31.20939800779558, "grad_norm": 0.3837501108646393, "learning_rate": 4.687906019922044e-05, "loss": 0.3598, "step": 4612000 }, { "epoch": 31.21278150714595, "grad_norm": 0.36298200488090515, "learning_rate": 4.6878721849285404e-05, "loss": 0.3595, "step": 4612500 }, { "epoch": 31.21616500649632, "grad_norm": 0.4514963626861572, "learning_rate": 4.687838349935037e-05, "loss": 0.3576, "step": 4613000 }, { "epoch": 31.21954850584669, "grad_norm": 0.4059322774410248, "learning_rate": 4.6878045149415335e-05, "loss": 0.3601, "step": 4613500 }, { "epoch": 31.222932005197055, "grad_norm": 0.39534878730773926, "learning_rate": 4.68777067994803e-05, "loss": 0.3604, "step": 4614000 }, { "epoch": 31.226315504547422, "grad_norm": 0.38197407126426697, "learning_rate": 4.687736844954526e-05, "loss": 0.3593, "step": 4614500 }, { "epoch": 31.229699003897792, "grad_norm": 0.3725822865962982, "learning_rate": 4.687703009961023e-05, "loss": 0.3613, "step": 4615000 }, { "epoch": 31.23308250324816, "grad_norm": 0.36064666509628296, "learning_rate": 4.687669174967519e-05, "loss": 0.3612, "step": 4615500 }, { "epoch": 31.236466002598526, "grad_norm": 0.361724317073822, "learning_rate": 4.6876353399740146e-05, "loss": 0.3586, "step": 4616000 }, { "epoch": 31.239849501948896, "grad_norm": 0.37389740347862244, "learning_rate": 4.687601504980511e-05, "loss": 0.3596, "step": 4616500 }, { "epoch": 31.243233001299263, "grad_norm": 0.3896735906600952, "learning_rate": 4.687567669987008e-05, "loss": 0.3614, "step": 4617000 }, { "epoch": 31.246616500649633, "grad_norm": 0.36957865953445435, "learning_rate": 4.687533834993504e-05, "loss": 0.3581, "step": 4617500 }, { "epoch": 31.25, "grad_norm": 0.37003257870674133, "learning_rate": 4.6875e-05, "loss": 0.3604, "step": 4618000 }, { "epoch": 31.253383499350367, "grad_norm": 0.3659612834453583, "learning_rate": 4.687466165006496e-05, "loss": 0.3609, "step": 4618500 }, { "epoch": 31.256766998700737, "grad_norm": 0.3906296491622925, "learning_rate": 4.687432330012993e-05, "loss": 0.3606, "step": 4619000 }, { "epoch": 31.260150498051104, "grad_norm": 0.37453439831733704, "learning_rate": 4.6873984950194894e-05, "loss": 0.3604, "step": 4619500 }, { "epoch": 31.263533997401474, "grad_norm": 0.40176907181739807, "learning_rate": 4.6873646600259856e-05, "loss": 0.3591, "step": 4620000 }, { "epoch": 31.26691749675184, "grad_norm": 0.3883937895298004, "learning_rate": 4.687330825032482e-05, "loss": 0.3588, "step": 4620500 }, { "epoch": 31.270300996102208, "grad_norm": 0.3804904818534851, "learning_rate": 4.687296990038978e-05, "loss": 0.3596, "step": 4621000 }, { "epoch": 31.273684495452578, "grad_norm": 0.3401568531990051, "learning_rate": 4.687263155045474e-05, "loss": 0.3607, "step": 4621500 }, { "epoch": 31.277067994802945, "grad_norm": 0.397629052400589, "learning_rate": 4.6872293200519705e-05, "loss": 0.3597, "step": 4622000 }, { "epoch": 31.28045149415331, "grad_norm": 0.40658038854599, "learning_rate": 4.6871954850584674e-05, "loss": 0.3608, "step": 4622500 }, { "epoch": 31.28383499350368, "grad_norm": 0.38460561633110046, "learning_rate": 4.6871616500649636e-05, "loss": 0.3596, "step": 4623000 }, { "epoch": 31.28721849285405, "grad_norm": 0.4182147979736328, "learning_rate": 4.68712781507146e-05, "loss": 0.3604, "step": 4623500 }, { "epoch": 31.29060199220442, "grad_norm": 0.38921990990638733, "learning_rate": 4.687093980077956e-05, "loss": 0.3601, "step": 4624000 }, { "epoch": 31.293985491554785, "grad_norm": 0.40398287773132324, "learning_rate": 4.687060145084453e-05, "loss": 0.3618, "step": 4624500 }, { "epoch": 31.297368990905152, "grad_norm": 0.38871896266937256, "learning_rate": 4.687026310090949e-05, "loss": 0.3617, "step": 4625000 }, { "epoch": 31.300752490255523, "grad_norm": 0.4172930419445038, "learning_rate": 4.6869924750974446e-05, "loss": 0.3603, "step": 4625500 }, { "epoch": 31.30413598960589, "grad_norm": 0.38299837708473206, "learning_rate": 4.686958640103941e-05, "loss": 0.3602, "step": 4626000 }, { "epoch": 31.30751948895626, "grad_norm": 0.3578115999698639, "learning_rate": 4.686924805110438e-05, "loss": 0.3597, "step": 4626500 }, { "epoch": 31.310902988306626, "grad_norm": 0.3577576279640198, "learning_rate": 4.686890970116934e-05, "loss": 0.3599, "step": 4627000 }, { "epoch": 31.314286487656993, "grad_norm": 0.35940787196159363, "learning_rate": 4.68685713512343e-05, "loss": 0.3588, "step": 4627500 }, { "epoch": 31.317669987007363, "grad_norm": 0.37951037287712097, "learning_rate": 4.6868233001299264e-05, "loss": 0.3605, "step": 4628000 }, { "epoch": 31.32105348635773, "grad_norm": 0.4084251821041107, "learning_rate": 4.686789465136423e-05, "loss": 0.3606, "step": 4628500 }, { "epoch": 31.3244369857081, "grad_norm": 0.3893352150917053, "learning_rate": 4.6867556301429195e-05, "loss": 0.3592, "step": 4629000 }, { "epoch": 31.327820485058467, "grad_norm": 0.37116125226020813, "learning_rate": 4.686721795149416e-05, "loss": 0.3598, "step": 4629500 }, { "epoch": 31.331203984408834, "grad_norm": 0.4443565905094147, "learning_rate": 4.686687960155912e-05, "loss": 0.3601, "step": 4630000 }, { "epoch": 31.334587483759204, "grad_norm": 0.3604671061038971, "learning_rate": 4.686654125162408e-05, "loss": 0.3599, "step": 4630500 }, { "epoch": 31.33797098310957, "grad_norm": 0.35667118430137634, "learning_rate": 4.686620290168904e-05, "loss": 0.3591, "step": 4631000 }, { "epoch": 31.341354482459938, "grad_norm": 0.3471163213253021, "learning_rate": 4.6865864551754005e-05, "loss": 0.3587, "step": 4631500 }, { "epoch": 31.344737981810308, "grad_norm": 0.3652501106262207, "learning_rate": 4.6865526201818974e-05, "loss": 0.3599, "step": 4632000 }, { "epoch": 31.348121481160675, "grad_norm": 0.3757786154747009, "learning_rate": 4.6865187851883936e-05, "loss": 0.3615, "step": 4632500 }, { "epoch": 31.351504980511045, "grad_norm": 0.3931577801704407, "learning_rate": 4.68648495019489e-05, "loss": 0.3613, "step": 4633000 }, { "epoch": 31.354888479861412, "grad_norm": 0.4004794657230377, "learning_rate": 4.686451115201386e-05, "loss": 0.3608, "step": 4633500 }, { "epoch": 31.35827197921178, "grad_norm": 0.42484724521636963, "learning_rate": 4.686417280207883e-05, "loss": 0.3614, "step": 4634000 }, { "epoch": 31.36165547856215, "grad_norm": 0.4051852226257324, "learning_rate": 4.686383445214379e-05, "loss": 0.3596, "step": 4634500 }, { "epoch": 31.365038977912516, "grad_norm": 0.3806665539741516, "learning_rate": 4.686349610220875e-05, "loss": 0.3601, "step": 4635000 }, { "epoch": 31.368422477262886, "grad_norm": 0.3938251733779907, "learning_rate": 4.686315775227371e-05, "loss": 0.3603, "step": 4635500 }, { "epoch": 31.371805976613253, "grad_norm": 0.36172622442245483, "learning_rate": 4.686281940233868e-05, "loss": 0.3614, "step": 4636000 }, { "epoch": 31.37518947596362, "grad_norm": 0.40758925676345825, "learning_rate": 4.686248105240364e-05, "loss": 0.3592, "step": 4636500 }, { "epoch": 31.37857297531399, "grad_norm": 0.3601266145706177, "learning_rate": 4.68621427024686e-05, "loss": 0.3614, "step": 4637000 }, { "epoch": 31.381956474664356, "grad_norm": 0.38919326663017273, "learning_rate": 4.6861804352533564e-05, "loss": 0.3603, "step": 4637500 }, { "epoch": 31.385339974014727, "grad_norm": 0.35700440406799316, "learning_rate": 4.686146600259853e-05, "loss": 0.3607, "step": 4638000 }, { "epoch": 31.388723473365093, "grad_norm": 0.3763803541660309, "learning_rate": 4.6861127652663495e-05, "loss": 0.3595, "step": 4638500 }, { "epoch": 31.39210697271546, "grad_norm": 0.4129246175289154, "learning_rate": 4.686078930272846e-05, "loss": 0.3612, "step": 4639000 }, { "epoch": 31.39549047206583, "grad_norm": 0.38882720470428467, "learning_rate": 4.686045095279342e-05, "loss": 0.3608, "step": 4639500 }, { "epoch": 31.398873971416197, "grad_norm": 0.3492278456687927, "learning_rate": 4.686011260285838e-05, "loss": 0.361, "step": 4640000 }, { "epoch": 31.402257470766564, "grad_norm": 0.36681997776031494, "learning_rate": 4.6859774252923344e-05, "loss": 0.3616, "step": 4640500 }, { "epoch": 31.405640970116934, "grad_norm": 0.3288574516773224, "learning_rate": 4.6859435902988306e-05, "loss": 0.3616, "step": 4641000 }, { "epoch": 31.4090244694673, "grad_norm": 0.3875277042388916, "learning_rate": 4.6859097553053275e-05, "loss": 0.3604, "step": 4641500 }, { "epoch": 31.41240796881767, "grad_norm": 0.3879646956920624, "learning_rate": 4.685875920311824e-05, "loss": 0.3618, "step": 4642000 }, { "epoch": 31.415791468168038, "grad_norm": 0.367970734834671, "learning_rate": 4.68584208531832e-05, "loss": 0.36, "step": 4642500 }, { "epoch": 31.419174967518405, "grad_norm": 0.3683345913887024, "learning_rate": 4.685808250324816e-05, "loss": 0.3608, "step": 4643000 }, { "epoch": 31.422558466868775, "grad_norm": 0.3478187322616577, "learning_rate": 4.685774415331312e-05, "loss": 0.3597, "step": 4643500 }, { "epoch": 31.425941966219142, "grad_norm": 0.37414970993995667, "learning_rate": 4.685740580337809e-05, "loss": 0.3599, "step": 4644000 }, { "epoch": 31.429325465569512, "grad_norm": 0.3836267590522766, "learning_rate": 4.685706745344305e-05, "loss": 0.3589, "step": 4644500 }, { "epoch": 31.43270896491988, "grad_norm": 0.3911152482032776, "learning_rate": 4.685672910350801e-05, "loss": 0.361, "step": 4645000 }, { "epoch": 31.436092464270246, "grad_norm": 0.3513597249984741, "learning_rate": 4.685639075357298e-05, "loss": 0.3602, "step": 4645500 }, { "epoch": 31.439475963620616, "grad_norm": 0.42742282152175903, "learning_rate": 4.685605240363794e-05, "loss": 0.3618, "step": 4646000 }, { "epoch": 31.442859462970983, "grad_norm": 0.4075610637664795, "learning_rate": 4.68557140537029e-05, "loss": 0.3613, "step": 4646500 }, { "epoch": 31.44624296232135, "grad_norm": 0.36794495582580566, "learning_rate": 4.6855375703767865e-05, "loss": 0.3609, "step": 4647000 }, { "epoch": 31.44962646167172, "grad_norm": 0.38682469725608826, "learning_rate": 4.6855037353832834e-05, "loss": 0.3623, "step": 4647500 }, { "epoch": 31.453009961022087, "grad_norm": 0.39508524537086487, "learning_rate": 4.6854699003897796e-05, "loss": 0.3607, "step": 4648000 }, { "epoch": 31.456393460372457, "grad_norm": 0.3804340362548828, "learning_rate": 4.685436065396276e-05, "loss": 0.3601, "step": 4648500 }, { "epoch": 31.459776959722824, "grad_norm": 0.39548158645629883, "learning_rate": 4.685402230402772e-05, "loss": 0.3597, "step": 4649000 }, { "epoch": 31.46316045907319, "grad_norm": 0.3658633232116699, "learning_rate": 4.685368395409268e-05, "loss": 0.3596, "step": 4649500 }, { "epoch": 31.46654395842356, "grad_norm": 0.382374107837677, "learning_rate": 4.6853345604157645e-05, "loss": 0.3595, "step": 4650000 }, { "epoch": 31.469927457773927, "grad_norm": 0.40996837615966797, "learning_rate": 4.685300725422261e-05, "loss": 0.3614, "step": 4650500 }, { "epoch": 31.473310957124298, "grad_norm": 0.4142885208129883, "learning_rate": 4.6852668904287576e-05, "loss": 0.3584, "step": 4651000 }, { "epoch": 31.476694456474664, "grad_norm": 0.4049195647239685, "learning_rate": 4.685233055435254e-05, "loss": 0.3593, "step": 4651500 }, { "epoch": 31.48007795582503, "grad_norm": 0.4016236662864685, "learning_rate": 4.68519922044175e-05, "loss": 0.3615, "step": 4652000 }, { "epoch": 31.4834614551754, "grad_norm": 0.39193233847618103, "learning_rate": 4.685165385448246e-05, "loss": 0.36, "step": 4652500 }, { "epoch": 31.48684495452577, "grad_norm": 0.33804407715797424, "learning_rate": 4.6851315504547424e-05, "loss": 0.3608, "step": 4653000 }, { "epoch": 31.49022845387614, "grad_norm": 0.40392035245895386, "learning_rate": 4.685097715461239e-05, "loss": 0.3597, "step": 4653500 }, { "epoch": 31.493611953226505, "grad_norm": 0.37518763542175293, "learning_rate": 4.685063880467735e-05, "loss": 0.3611, "step": 4654000 }, { "epoch": 31.496995452576872, "grad_norm": 0.3940230906009674, "learning_rate": 4.685030045474231e-05, "loss": 0.3596, "step": 4654500 }, { "epoch": 31.500378951927242, "grad_norm": 0.35870859026908875, "learning_rate": 4.684996210480728e-05, "loss": 0.3611, "step": 4655000 }, { "epoch": 31.50376245127761, "grad_norm": 0.3652559518814087, "learning_rate": 4.684962375487224e-05, "loss": 0.3584, "step": 4655500 }, { "epoch": 31.507145950627976, "grad_norm": 0.3551163375377655, "learning_rate": 4.6849285404937204e-05, "loss": 0.3606, "step": 4656000 }, { "epoch": 31.510529449978346, "grad_norm": 0.3690461814403534, "learning_rate": 4.6848947055002166e-05, "loss": 0.3606, "step": 4656500 }, { "epoch": 31.513912949328713, "grad_norm": 0.3778305947780609, "learning_rate": 4.6848608705067135e-05, "loss": 0.3603, "step": 4657000 }, { "epoch": 31.517296448679083, "grad_norm": 0.359300434589386, "learning_rate": 4.68482703551321e-05, "loss": 0.36, "step": 4657500 }, { "epoch": 31.52067994802945, "grad_norm": 0.4256763756275177, "learning_rate": 4.684793200519706e-05, "loss": 0.3606, "step": 4658000 }, { "epoch": 31.524063447379817, "grad_norm": 0.37461280822753906, "learning_rate": 4.684759365526202e-05, "loss": 0.36, "step": 4658500 }, { "epoch": 31.527446946730187, "grad_norm": 0.40624773502349854, "learning_rate": 4.684725530532698e-05, "loss": 0.3599, "step": 4659000 }, { "epoch": 31.530830446080554, "grad_norm": 0.3790808916091919, "learning_rate": 4.6846916955391945e-05, "loss": 0.3609, "step": 4659500 }, { "epoch": 31.534213945430924, "grad_norm": 0.4063674807548523, "learning_rate": 4.684657860545691e-05, "loss": 0.3604, "step": 4660000 }, { "epoch": 31.53759744478129, "grad_norm": 0.3722701966762543, "learning_rate": 4.684624025552187e-05, "loss": 0.3596, "step": 4660500 }, { "epoch": 31.540980944131658, "grad_norm": 0.4062025249004364, "learning_rate": 4.684590190558684e-05, "loss": 0.3611, "step": 4661000 }, { "epoch": 31.544364443482028, "grad_norm": 0.3837735056877136, "learning_rate": 4.68455635556518e-05, "loss": 0.3603, "step": 4661500 }, { "epoch": 31.547747942832395, "grad_norm": 0.3805675208568573, "learning_rate": 4.684522520571676e-05, "loss": 0.3596, "step": 4662000 }, { "epoch": 31.551131442182765, "grad_norm": 0.3643147051334381, "learning_rate": 4.6844886855781725e-05, "loss": 0.3609, "step": 4662500 }, { "epoch": 31.55451494153313, "grad_norm": 0.3622497022151947, "learning_rate": 4.6844548505846694e-05, "loss": 0.3583, "step": 4663000 }, { "epoch": 31.5578984408835, "grad_norm": 0.3624139428138733, "learning_rate": 4.684421015591165e-05, "loss": 0.3609, "step": 4663500 }, { "epoch": 31.56128194023387, "grad_norm": 0.3771470785140991, "learning_rate": 4.684387180597661e-05, "loss": 0.3601, "step": 4664000 }, { "epoch": 31.564665439584235, "grad_norm": 0.3783354163169861, "learning_rate": 4.684353345604158e-05, "loss": 0.3622, "step": 4664500 }, { "epoch": 31.568048938934602, "grad_norm": 0.3971211314201355, "learning_rate": 4.684319510610654e-05, "loss": 0.3596, "step": 4665000 }, { "epoch": 31.571432438284972, "grad_norm": 0.3950501084327698, "learning_rate": 4.6842856756171504e-05, "loss": 0.361, "step": 4665500 }, { "epoch": 31.57481593763534, "grad_norm": 0.3610475957393646, "learning_rate": 4.6842518406236466e-05, "loss": 0.3603, "step": 4666000 }, { "epoch": 31.57819943698571, "grad_norm": 0.3884837031364441, "learning_rate": 4.6842180056301435e-05, "loss": 0.3606, "step": 4666500 }, { "epoch": 31.581582936336076, "grad_norm": 0.3898885250091553, "learning_rate": 4.68418417063664e-05, "loss": 0.3604, "step": 4667000 }, { "epoch": 31.584966435686443, "grad_norm": 0.38525745272636414, "learning_rate": 4.684150335643136e-05, "loss": 0.3623, "step": 4667500 }, { "epoch": 31.588349935036813, "grad_norm": 0.40762364864349365, "learning_rate": 4.684116500649632e-05, "loss": 0.3597, "step": 4668000 }, { "epoch": 31.59173343438718, "grad_norm": 0.37447842955589294, "learning_rate": 4.6840826656561284e-05, "loss": 0.3588, "step": 4668500 }, { "epoch": 31.59511693373755, "grad_norm": 0.40254485607147217, "learning_rate": 4.6840488306626246e-05, "loss": 0.3594, "step": 4669000 }, { "epoch": 31.598500433087917, "grad_norm": 0.39968255162239075, "learning_rate": 4.684014995669121e-05, "loss": 0.361, "step": 4669500 }, { "epoch": 31.601883932438284, "grad_norm": 0.33560580015182495, "learning_rate": 4.683981160675617e-05, "loss": 0.3596, "step": 4670000 }, { "epoch": 31.605267431788654, "grad_norm": 0.39426931738853455, "learning_rate": 4.683947325682114e-05, "loss": 0.3596, "step": 4670500 }, { "epoch": 31.60865093113902, "grad_norm": 0.4080418646335602, "learning_rate": 4.68391349068861e-05, "loss": 0.3581, "step": 4671000 }, { "epoch": 31.612034430489388, "grad_norm": 0.33502811193466187, "learning_rate": 4.683879655695106e-05, "loss": 0.3599, "step": 4671500 }, { "epoch": 31.615417929839758, "grad_norm": 0.36781832575798035, "learning_rate": 4.6838458207016025e-05, "loss": 0.3623, "step": 4672000 }, { "epoch": 31.618801429190125, "grad_norm": 0.42620423436164856, "learning_rate": 4.6838119857080994e-05, "loss": 0.3588, "step": 4672500 }, { "epoch": 31.622184928540495, "grad_norm": 0.3697637617588043, "learning_rate": 4.683778150714595e-05, "loss": 0.3604, "step": 4673000 }, { "epoch": 31.62556842789086, "grad_norm": 0.3921816945075989, "learning_rate": 4.683744315721091e-05, "loss": 0.3601, "step": 4673500 }, { "epoch": 31.62895192724123, "grad_norm": 0.38950610160827637, "learning_rate": 4.683710480727588e-05, "loss": 0.3595, "step": 4674000 }, { "epoch": 31.6323354265916, "grad_norm": 0.3828791677951813, "learning_rate": 4.683676645734084e-05, "loss": 0.3605, "step": 4674500 }, { "epoch": 31.635718925941966, "grad_norm": 0.3760644495487213, "learning_rate": 4.6836428107405805e-05, "loss": 0.3605, "step": 4675000 }, { "epoch": 31.639102425292336, "grad_norm": 0.3590831458568573, "learning_rate": 4.683608975747077e-05, "loss": 0.3611, "step": 4675500 }, { "epoch": 31.642485924642703, "grad_norm": 0.38537073135375977, "learning_rate": 4.6835751407535736e-05, "loss": 0.361, "step": 4676000 }, { "epoch": 31.64586942399307, "grad_norm": 0.4026997685432434, "learning_rate": 4.68354130576007e-05, "loss": 0.36, "step": 4676500 }, { "epoch": 31.64925292334344, "grad_norm": 0.4079139530658722, "learning_rate": 4.683507470766566e-05, "loss": 0.3604, "step": 4677000 }, { "epoch": 31.652636422693806, "grad_norm": 0.3519248068332672, "learning_rate": 4.683473635773062e-05, "loss": 0.3607, "step": 4677500 }, { "epoch": 31.656019922044173, "grad_norm": 0.422990620136261, "learning_rate": 4.6834398007795584e-05, "loss": 0.3605, "step": 4678000 }, { "epoch": 31.659403421394543, "grad_norm": 0.36565840244293213, "learning_rate": 4.6834059657860546e-05, "loss": 0.3603, "step": 4678500 }, { "epoch": 31.66278692074491, "grad_norm": 0.39964473247528076, "learning_rate": 4.683372130792551e-05, "loss": 0.3616, "step": 4679000 }, { "epoch": 31.66617042009528, "grad_norm": 0.3694780170917511, "learning_rate": 4.683338295799047e-05, "loss": 0.3601, "step": 4679500 }, { "epoch": 31.669553919445647, "grad_norm": 0.34623003005981445, "learning_rate": 4.683304460805544e-05, "loss": 0.3626, "step": 4680000 }, { "epoch": 31.672937418796014, "grad_norm": 0.3790321946144104, "learning_rate": 4.68327062581204e-05, "loss": 0.3618, "step": 4680500 }, { "epoch": 31.676320918146384, "grad_norm": 0.34973055124282837, "learning_rate": 4.6832367908185364e-05, "loss": 0.3612, "step": 4681000 }, { "epoch": 31.67970441749675, "grad_norm": 0.3752252459526062, "learning_rate": 4.6832029558250326e-05, "loss": 0.3584, "step": 4681500 }, { "epoch": 31.68308791684712, "grad_norm": 0.364828884601593, "learning_rate": 4.6831691208315295e-05, "loss": 0.3619, "step": 4682000 }, { "epoch": 31.686471416197488, "grad_norm": 0.4167463183403015, "learning_rate": 4.683135285838025e-05, "loss": 0.3609, "step": 4682500 }, { "epoch": 31.689854915547855, "grad_norm": 0.3400552272796631, "learning_rate": 4.683101450844521e-05, "loss": 0.3607, "step": 4683000 }, { "epoch": 31.693238414898225, "grad_norm": 0.3850339651107788, "learning_rate": 4.683067615851018e-05, "loss": 0.3591, "step": 4683500 }, { "epoch": 31.696621914248592, "grad_norm": 0.3828616142272949, "learning_rate": 4.683033780857514e-05, "loss": 0.361, "step": 4684000 }, { "epoch": 31.700005413598962, "grad_norm": 0.3609130084514618, "learning_rate": 4.6829999458640105e-05, "loss": 0.3619, "step": 4684500 }, { "epoch": 31.70338891294933, "grad_norm": 0.3726107180118561, "learning_rate": 4.682966110870507e-05, "loss": 0.3617, "step": 4685000 }, { "epoch": 31.706772412299696, "grad_norm": 0.4102441668510437, "learning_rate": 4.6829322758770036e-05, "loss": 0.36, "step": 4685500 }, { "epoch": 31.710155911650066, "grad_norm": 0.3822396397590637, "learning_rate": 4.6828984408835e-05, "loss": 0.3606, "step": 4686000 }, { "epoch": 31.713539411000433, "grad_norm": 0.40908774733543396, "learning_rate": 4.682864605889996e-05, "loss": 0.3601, "step": 4686500 }, { "epoch": 31.716922910350803, "grad_norm": 0.34824812412261963, "learning_rate": 4.682830770896492e-05, "loss": 0.3598, "step": 4687000 }, { "epoch": 31.72030640970117, "grad_norm": 0.3778131306171417, "learning_rate": 4.6827969359029885e-05, "loss": 0.3619, "step": 4687500 }, { "epoch": 31.723689909051537, "grad_norm": 0.34503281116485596, "learning_rate": 4.682763100909485e-05, "loss": 0.3607, "step": 4688000 }, { "epoch": 31.727073408401907, "grad_norm": 0.38387107849121094, "learning_rate": 4.682729265915981e-05, "loss": 0.3619, "step": 4688500 }, { "epoch": 31.730456907752274, "grad_norm": 0.379212886095047, "learning_rate": 4.682695430922477e-05, "loss": 0.3605, "step": 4689000 }, { "epoch": 31.73384040710264, "grad_norm": 0.37629860639572144, "learning_rate": 4.682661595928974e-05, "loss": 0.361, "step": 4689500 }, { "epoch": 31.73722390645301, "grad_norm": 0.332742303609848, "learning_rate": 4.68262776093547e-05, "loss": 0.3589, "step": 4690000 }, { "epoch": 31.740607405803377, "grad_norm": 0.3826378285884857, "learning_rate": 4.6825939259419664e-05, "loss": 0.3604, "step": 4690500 }, { "epoch": 31.743990905153748, "grad_norm": 0.3938352167606354, "learning_rate": 4.6825600909484627e-05, "loss": 0.3603, "step": 4691000 }, { "epoch": 31.747374404504114, "grad_norm": 0.45102402567863464, "learning_rate": 4.6825262559549595e-05, "loss": 0.3603, "step": 4691500 }, { "epoch": 31.75075790385448, "grad_norm": 0.3734528422355652, "learning_rate": 4.682492420961455e-05, "loss": 0.3608, "step": 4692000 }, { "epoch": 31.75414140320485, "grad_norm": 0.379263311624527, "learning_rate": 4.682458585967951e-05, "loss": 0.3602, "step": 4692500 }, { "epoch": 31.757524902555218, "grad_norm": 0.40262335538864136, "learning_rate": 4.682424750974448e-05, "loss": 0.3591, "step": 4693000 }, { "epoch": 31.76090840190559, "grad_norm": 0.38411325216293335, "learning_rate": 4.6823909159809444e-05, "loss": 0.3609, "step": 4693500 }, { "epoch": 31.764291901255955, "grad_norm": 0.34137555956840515, "learning_rate": 4.6823570809874406e-05, "loss": 0.3612, "step": 4694000 }, { "epoch": 31.767675400606322, "grad_norm": 0.4293920695781708, "learning_rate": 4.682323245993937e-05, "loss": 0.3608, "step": 4694500 }, { "epoch": 31.771058899956692, "grad_norm": 0.4304783344268799, "learning_rate": 4.682289411000434e-05, "loss": 0.3611, "step": 4695000 }, { "epoch": 31.77444239930706, "grad_norm": 0.3868955075740814, "learning_rate": 4.68225557600693e-05, "loss": 0.3601, "step": 4695500 }, { "epoch": 31.777825898657426, "grad_norm": 0.35239407420158386, "learning_rate": 4.682221741013426e-05, "loss": 0.3616, "step": 4696000 }, { "epoch": 31.781209398007796, "grad_norm": 0.35901278257369995, "learning_rate": 4.6821879060199223e-05, "loss": 0.361, "step": 4696500 }, { "epoch": 31.784592897358163, "grad_norm": 0.3940626084804535, "learning_rate": 4.6821540710264186e-05, "loss": 0.361, "step": 4697000 }, { "epoch": 31.787976396708533, "grad_norm": 0.4088931977748871, "learning_rate": 4.682120236032915e-05, "loss": 0.3593, "step": 4697500 }, { "epoch": 31.7913598960589, "grad_norm": 0.38752543926239014, "learning_rate": 4.682086401039411e-05, "loss": 0.3604, "step": 4698000 }, { "epoch": 31.794743395409267, "grad_norm": 0.3893696665763855, "learning_rate": 4.682052566045907e-05, "loss": 0.3588, "step": 4698500 }, { "epoch": 31.798126894759637, "grad_norm": 0.34849750995635986, "learning_rate": 4.682018731052404e-05, "loss": 0.3597, "step": 4699000 }, { "epoch": 31.801510394110004, "grad_norm": 0.32492223381996155, "learning_rate": 4.6819848960589e-05, "loss": 0.3603, "step": 4699500 }, { "epoch": 31.804893893460374, "grad_norm": 0.38688844442367554, "learning_rate": 4.6819510610653965e-05, "loss": 0.3614, "step": 4700000 }, { "epoch": 31.80827739281074, "grad_norm": 0.3697317838668823, "learning_rate": 4.681917226071893e-05, "loss": 0.3595, "step": 4700500 }, { "epoch": 31.811660892161107, "grad_norm": 0.359293133020401, "learning_rate": 4.6818833910783896e-05, "loss": 0.36, "step": 4701000 }, { "epoch": 31.815044391511478, "grad_norm": 0.39656931161880493, "learning_rate": 4.681849556084885e-05, "loss": 0.3615, "step": 4701500 }, { "epoch": 31.818427890861845, "grad_norm": 0.3779048025608063, "learning_rate": 4.6818157210913814e-05, "loss": 0.36, "step": 4702000 }, { "epoch": 31.82181139021221, "grad_norm": 0.42255547642707825, "learning_rate": 4.681781886097878e-05, "loss": 0.3608, "step": 4702500 }, { "epoch": 31.82519488956258, "grad_norm": 0.36566025018692017, "learning_rate": 4.6817480511043745e-05, "loss": 0.3613, "step": 4703000 }, { "epoch": 31.82857838891295, "grad_norm": 0.3470553755760193, "learning_rate": 4.681714216110871e-05, "loss": 0.361, "step": 4703500 }, { "epoch": 31.83196188826332, "grad_norm": 0.37342017889022827, "learning_rate": 4.681680381117367e-05, "loss": 0.3595, "step": 4704000 }, { "epoch": 31.835345387613685, "grad_norm": 0.3994550108909607, "learning_rate": 4.681646546123864e-05, "loss": 0.3606, "step": 4704500 }, { "epoch": 31.838728886964052, "grad_norm": 0.36774635314941406, "learning_rate": 4.68161271113036e-05, "loss": 0.3609, "step": 4705000 }, { "epoch": 31.842112386314422, "grad_norm": 0.37347596883773804, "learning_rate": 4.681578876136856e-05, "loss": 0.3613, "step": 4705500 }, { "epoch": 31.84549588566479, "grad_norm": 0.36244910955429077, "learning_rate": 4.6815450411433524e-05, "loss": 0.36, "step": 4706000 }, { "epoch": 31.84887938501516, "grad_norm": 0.3883986175060272, "learning_rate": 4.6815112061498486e-05, "loss": 0.3593, "step": 4706500 }, { "epoch": 31.852262884365526, "grad_norm": 0.4014483094215393, "learning_rate": 4.681477371156345e-05, "loss": 0.3617, "step": 4707000 }, { "epoch": 31.855646383715893, "grad_norm": 0.3901611268520355, "learning_rate": 4.681443536162841e-05, "loss": 0.3597, "step": 4707500 }, { "epoch": 31.859029883066263, "grad_norm": 0.3857150375843048, "learning_rate": 4.681409701169337e-05, "loss": 0.3604, "step": 4708000 }, { "epoch": 31.86241338241663, "grad_norm": 0.39421144127845764, "learning_rate": 4.681375866175834e-05, "loss": 0.3607, "step": 4708500 }, { "epoch": 31.865796881767, "grad_norm": 0.3842255771160126, "learning_rate": 4.6813420311823304e-05, "loss": 0.3608, "step": 4709000 }, { "epoch": 31.869180381117367, "grad_norm": 0.3913736045360565, "learning_rate": 4.6813081961888266e-05, "loss": 0.3614, "step": 4709500 }, { "epoch": 31.872563880467734, "grad_norm": 0.328275591135025, "learning_rate": 4.681274361195323e-05, "loss": 0.3608, "step": 4710000 }, { "epoch": 31.875947379818104, "grad_norm": 0.3807612359523773, "learning_rate": 4.68124052620182e-05, "loss": 0.3601, "step": 4710500 }, { "epoch": 31.87933087916847, "grad_norm": 0.35134157538414, "learning_rate": 4.681206691208316e-05, "loss": 0.3597, "step": 4711000 }, { "epoch": 31.882714378518838, "grad_norm": 0.34368059039115906, "learning_rate": 4.6811728562148114e-05, "loss": 0.3611, "step": 4711500 }, { "epoch": 31.886097877869208, "grad_norm": 0.3971202075481415, "learning_rate": 4.681139021221308e-05, "loss": 0.3606, "step": 4712000 }, { "epoch": 31.889481377219575, "grad_norm": 0.3558181822299957, "learning_rate": 4.6811051862278045e-05, "loss": 0.3599, "step": 4712500 }, { "epoch": 31.892864876569945, "grad_norm": 0.3947175145149231, "learning_rate": 4.681071351234301e-05, "loss": 0.3594, "step": 4713000 }, { "epoch": 31.89624837592031, "grad_norm": 0.4031308889389038, "learning_rate": 4.681037516240797e-05, "loss": 0.3608, "step": 4713500 }, { "epoch": 31.89963187527068, "grad_norm": 0.3834698498249054, "learning_rate": 4.681003681247293e-05, "loss": 0.3594, "step": 4714000 }, { "epoch": 31.90301537462105, "grad_norm": 0.35495537519454956, "learning_rate": 4.68096984625379e-05, "loss": 0.3596, "step": 4714500 }, { "epoch": 31.906398873971415, "grad_norm": 0.3593955338001251, "learning_rate": 4.680936011260286e-05, "loss": 0.361, "step": 4715000 }, { "epoch": 31.909782373321786, "grad_norm": 0.37075990438461304, "learning_rate": 4.6809021762667825e-05, "loss": 0.3607, "step": 4715500 }, { "epoch": 31.913165872672153, "grad_norm": 0.3548746109008789, "learning_rate": 4.680868341273279e-05, "loss": 0.359, "step": 4716000 }, { "epoch": 31.91654937202252, "grad_norm": 0.36042797565460205, "learning_rate": 4.680834506279775e-05, "loss": 0.3597, "step": 4716500 }, { "epoch": 31.91993287137289, "grad_norm": 0.40099388360977173, "learning_rate": 4.680800671286271e-05, "loss": 0.3614, "step": 4717000 }, { "epoch": 31.923316370723256, "grad_norm": 0.4233962893486023, "learning_rate": 4.680766836292767e-05, "loss": 0.3597, "step": 4717500 }, { "epoch": 31.926699870073627, "grad_norm": 0.36417707800865173, "learning_rate": 4.680733001299264e-05, "loss": 0.3604, "step": 4718000 }, { "epoch": 31.930083369423993, "grad_norm": 0.3779143989086151, "learning_rate": 4.6806991663057604e-05, "loss": 0.3596, "step": 4718500 }, { "epoch": 31.93346686877436, "grad_norm": 0.35195863246917725, "learning_rate": 4.6806653313122566e-05, "loss": 0.36, "step": 4719000 }, { "epoch": 31.93685036812473, "grad_norm": 0.39064928889274597, "learning_rate": 4.680631496318753e-05, "loss": 0.3608, "step": 4719500 }, { "epoch": 31.940233867475097, "grad_norm": 0.4043065309524536, "learning_rate": 4.68059766132525e-05, "loss": 0.3606, "step": 4720000 }, { "epoch": 31.943617366825464, "grad_norm": 0.391126811504364, "learning_rate": 4.680563826331746e-05, "loss": 0.3608, "step": 4720500 }, { "epoch": 31.947000866175834, "grad_norm": 0.3295779526233673, "learning_rate": 4.6805299913382415e-05, "loss": 0.3599, "step": 4721000 }, { "epoch": 31.9503843655262, "grad_norm": 0.3799513876438141, "learning_rate": 4.6804961563447384e-05, "loss": 0.3609, "step": 4721500 }, { "epoch": 31.95376786487657, "grad_norm": 0.3539388179779053, "learning_rate": 4.6804623213512346e-05, "loss": 0.3592, "step": 4722000 }, { "epoch": 31.957151364226938, "grad_norm": 0.35162782669067383, "learning_rate": 4.680428486357731e-05, "loss": 0.3612, "step": 4722500 }, { "epoch": 31.960534863577305, "grad_norm": 0.35630112886428833, "learning_rate": 4.680394651364227e-05, "loss": 0.3607, "step": 4723000 }, { "epoch": 31.963918362927675, "grad_norm": 0.3836091458797455, "learning_rate": 4.680360816370723e-05, "loss": 0.3619, "step": 4723500 }, { "epoch": 31.967301862278042, "grad_norm": 0.3872407078742981, "learning_rate": 4.68032698137722e-05, "loss": 0.3602, "step": 4724000 }, { "epoch": 31.970685361628412, "grad_norm": 0.3915506601333618, "learning_rate": 4.680293146383716e-05, "loss": 0.3602, "step": 4724500 }, { "epoch": 31.97406886097878, "grad_norm": 0.36311158537864685, "learning_rate": 4.6802593113902125e-05, "loss": 0.3592, "step": 4725000 }, { "epoch": 31.977452360329146, "grad_norm": 0.31301188468933105, "learning_rate": 4.680225476396709e-05, "loss": 0.362, "step": 4725500 }, { "epoch": 31.980835859679516, "grad_norm": 0.37805527448654175, "learning_rate": 4.680191641403205e-05, "loss": 0.3605, "step": 4726000 }, { "epoch": 31.984219359029883, "grad_norm": 0.40406933426856995, "learning_rate": 4.680157806409701e-05, "loss": 0.3609, "step": 4726500 }, { "epoch": 31.98760285838025, "grad_norm": 0.3633565604686737, "learning_rate": 4.6801239714161974e-05, "loss": 0.3615, "step": 4727000 }, { "epoch": 31.99098635773062, "grad_norm": 0.37993454933166504, "learning_rate": 4.680090136422694e-05, "loss": 0.3605, "step": 4727500 }, { "epoch": 31.994369857080986, "grad_norm": 0.40591785311698914, "learning_rate": 4.6800563014291905e-05, "loss": 0.3599, "step": 4728000 }, { "epoch": 31.997753356431357, "grad_norm": 0.3888760805130005, "learning_rate": 4.680022466435687e-05, "loss": 0.3603, "step": 4728500 }, { "epoch": 32.0, "eval_accuracy": 0.8626005641265935, "eval_loss": 0.5565376281738281, "eval_runtime": 3342.5449, "eval_samples_per_second": 86.983, "eval_steps_per_second": 5.437, "step": 4728832 }, { "epoch": 32.00113685578172, "grad_norm": 0.39983639121055603, "learning_rate": 4.679988631442183e-05, "loss": 0.3589, "step": 4729000 }, { "epoch": 32.004520355132094, "grad_norm": 0.34574589133262634, "learning_rate": 4.67995479644868e-05, "loss": 0.3593, "step": 4729500 }, { "epoch": 32.00790385448246, "grad_norm": 0.37997013330459595, "learning_rate": 4.679920961455176e-05, "loss": 0.3575, "step": 4730000 }, { "epoch": 32.01128735383283, "grad_norm": 0.39169222116470337, "learning_rate": 4.6798871264616715e-05, "loss": 0.3577, "step": 4730500 }, { "epoch": 32.0146708531832, "grad_norm": 0.36011025309562683, "learning_rate": 4.679853291468168e-05, "loss": 0.3582, "step": 4731000 }, { "epoch": 32.01805435253356, "grad_norm": 0.3974354565143585, "learning_rate": 4.6798194564746646e-05, "loss": 0.3589, "step": 4731500 }, { "epoch": 32.02143785188393, "grad_norm": 0.38111263513565063, "learning_rate": 4.679785621481161e-05, "loss": 0.3583, "step": 4732000 }, { "epoch": 32.0248213512343, "grad_norm": 0.3499109148979187, "learning_rate": 4.679751786487657e-05, "loss": 0.3584, "step": 4732500 }, { "epoch": 32.02820485058467, "grad_norm": 0.4800916612148285, "learning_rate": 4.679717951494153e-05, "loss": 0.3591, "step": 4733000 }, { "epoch": 32.031588349935035, "grad_norm": 0.3772190809249878, "learning_rate": 4.67968411650065e-05, "loss": 0.359, "step": 4733500 }, { "epoch": 32.034971849285405, "grad_norm": 0.3727220892906189, "learning_rate": 4.6796502815071464e-05, "loss": 0.3599, "step": 4734000 }, { "epoch": 32.038355348635775, "grad_norm": 0.41529178619384766, "learning_rate": 4.6796164465136426e-05, "loss": 0.3586, "step": 4734500 }, { "epoch": 32.04173884798614, "grad_norm": 0.4112264811992645, "learning_rate": 4.679582611520139e-05, "loss": 0.3594, "step": 4735000 }, { "epoch": 32.04512234733651, "grad_norm": 0.3342491686344147, "learning_rate": 4.679548776526635e-05, "loss": 0.3589, "step": 4735500 }, { "epoch": 32.04850584668688, "grad_norm": 0.3887941241264343, "learning_rate": 4.679514941533131e-05, "loss": 0.3588, "step": 4736000 }, { "epoch": 32.05188934603724, "grad_norm": 0.37229710817337036, "learning_rate": 4.6794811065396274e-05, "loss": 0.3602, "step": 4736500 }, { "epoch": 32.05527284538761, "grad_norm": 0.46064630150794983, "learning_rate": 4.679447271546124e-05, "loss": 0.3597, "step": 4737000 }, { "epoch": 32.05865634473798, "grad_norm": 0.39698526263237, "learning_rate": 4.6794134365526205e-05, "loss": 0.3597, "step": 4737500 }, { "epoch": 32.06203984408835, "grad_norm": 0.3707379996776581, "learning_rate": 4.679379601559117e-05, "loss": 0.3585, "step": 4738000 }, { "epoch": 32.06542334343872, "grad_norm": 0.4082753658294678, "learning_rate": 4.679345766565613e-05, "loss": 0.3571, "step": 4738500 }, { "epoch": 32.06880684278909, "grad_norm": 0.3864918649196625, "learning_rate": 4.67931193157211e-05, "loss": 0.3588, "step": 4739000 }, { "epoch": 32.07219034213946, "grad_norm": 0.36663374304771423, "learning_rate": 4.679278096578606e-05, "loss": 0.3582, "step": 4739500 }, { "epoch": 32.07557384148982, "grad_norm": 0.37536200881004333, "learning_rate": 4.6792442615851016e-05, "loss": 0.3581, "step": 4740000 }, { "epoch": 32.07895734084019, "grad_norm": 0.37068191170692444, "learning_rate": 4.679210426591598e-05, "loss": 0.3593, "step": 4740500 }, { "epoch": 32.08234084019056, "grad_norm": 0.41226398944854736, "learning_rate": 4.679176591598095e-05, "loss": 0.3582, "step": 4741000 }, { "epoch": 32.085724339540924, "grad_norm": 0.35647261142730713, "learning_rate": 4.679142756604591e-05, "loss": 0.3607, "step": 4741500 }, { "epoch": 32.089107838891294, "grad_norm": 0.37242090702056885, "learning_rate": 4.679108921611087e-05, "loss": 0.3582, "step": 4742000 }, { "epoch": 32.092491338241665, "grad_norm": 0.39995208382606506, "learning_rate": 4.6790750866175833e-05, "loss": 0.3591, "step": 4742500 }, { "epoch": 32.09587483759203, "grad_norm": 0.3676217198371887, "learning_rate": 4.67904125162408e-05, "loss": 0.3603, "step": 4743000 }, { "epoch": 32.0992583369424, "grad_norm": 0.37295204401016235, "learning_rate": 4.6790074166305764e-05, "loss": 0.3582, "step": 4743500 }, { "epoch": 32.10264183629277, "grad_norm": 0.38864821195602417, "learning_rate": 4.6789735816370727e-05, "loss": 0.3596, "step": 4744000 }, { "epoch": 32.10602533564314, "grad_norm": 0.3998364210128784, "learning_rate": 4.678939746643569e-05, "loss": 0.3579, "step": 4744500 }, { "epoch": 32.1094088349935, "grad_norm": 0.44700887799263, "learning_rate": 4.678905911650065e-05, "loss": 0.3596, "step": 4745000 }, { "epoch": 32.11279233434387, "grad_norm": 0.34449413418769836, "learning_rate": 4.678872076656561e-05, "loss": 0.3592, "step": 4745500 }, { "epoch": 32.11617583369424, "grad_norm": 0.3522138297557831, "learning_rate": 4.6788382416630575e-05, "loss": 0.3591, "step": 4746000 }, { "epoch": 32.119559333044606, "grad_norm": 0.4162333607673645, "learning_rate": 4.6788044066695544e-05, "loss": 0.3589, "step": 4746500 }, { "epoch": 32.122942832394976, "grad_norm": 0.42545434832572937, "learning_rate": 4.6787705716760506e-05, "loss": 0.3604, "step": 4747000 }, { "epoch": 32.12632633174535, "grad_norm": 0.3827410042285919, "learning_rate": 4.678736736682547e-05, "loss": 0.3586, "step": 4747500 }, { "epoch": 32.12970983109571, "grad_norm": 0.3557436168193817, "learning_rate": 4.678702901689043e-05, "loss": 0.3601, "step": 4748000 }, { "epoch": 32.13309333044608, "grad_norm": 0.3585735857486725, "learning_rate": 4.67866906669554e-05, "loss": 0.3587, "step": 4748500 }, { "epoch": 32.13647682979645, "grad_norm": 0.37218043208122253, "learning_rate": 4.678635231702036e-05, "loss": 0.3584, "step": 4749000 }, { "epoch": 32.13986032914681, "grad_norm": 0.37905794382095337, "learning_rate": 4.678601396708532e-05, "loss": 0.3594, "step": 4749500 }, { "epoch": 32.143243828497184, "grad_norm": 0.34577351808547974, "learning_rate": 4.678567561715028e-05, "loss": 0.3602, "step": 4750000 }, { "epoch": 32.146627327847554, "grad_norm": 0.3684971034526825, "learning_rate": 4.678533726721525e-05, "loss": 0.3599, "step": 4750500 }, { "epoch": 32.150010827197924, "grad_norm": 0.3612573444843292, "learning_rate": 4.678499891728021e-05, "loss": 0.3586, "step": 4751000 }, { "epoch": 32.15339432654829, "grad_norm": 0.3678892254829407, "learning_rate": 4.678466056734517e-05, "loss": 0.3591, "step": 4751500 }, { "epoch": 32.15677782589866, "grad_norm": 0.404162734746933, "learning_rate": 4.6784322217410134e-05, "loss": 0.3605, "step": 4752000 }, { "epoch": 32.16016132524903, "grad_norm": 0.41183698177337646, "learning_rate": 4.67839838674751e-05, "loss": 0.3615, "step": 4752500 }, { "epoch": 32.16354482459939, "grad_norm": 0.3945522606372833, "learning_rate": 4.6783645517540065e-05, "loss": 0.3611, "step": 4753000 }, { "epoch": 32.16692832394976, "grad_norm": 0.3847481906414032, "learning_rate": 4.678330716760503e-05, "loss": 0.3587, "step": 4753500 }, { "epoch": 32.17031182330013, "grad_norm": 0.37639281153678894, "learning_rate": 4.678296881766999e-05, "loss": 0.3596, "step": 4754000 }, { "epoch": 32.173695322650495, "grad_norm": 0.3757501542568207, "learning_rate": 4.678263046773495e-05, "loss": 0.3607, "step": 4754500 }, { "epoch": 32.177078822000865, "grad_norm": 0.3711193799972534, "learning_rate": 4.6782292117799914e-05, "loss": 0.3588, "step": 4755000 }, { "epoch": 32.180462321351236, "grad_norm": 0.35680362582206726, "learning_rate": 4.6781953767864876e-05, "loss": 0.3611, "step": 4755500 }, { "epoch": 32.1838458207016, "grad_norm": 0.3715004026889801, "learning_rate": 4.6781615417929845e-05, "loss": 0.3595, "step": 4756000 }, { "epoch": 32.18722932005197, "grad_norm": 0.3905080556869507, "learning_rate": 4.678127706799481e-05, "loss": 0.36, "step": 4756500 }, { "epoch": 32.19061281940234, "grad_norm": 0.3584302067756653, "learning_rate": 4.678093871805977e-05, "loss": 0.359, "step": 4757000 }, { "epoch": 32.19399631875271, "grad_norm": 0.37233132123947144, "learning_rate": 4.678060036812473e-05, "loss": 0.3586, "step": 4757500 }, { "epoch": 32.19737981810307, "grad_norm": 0.3986111879348755, "learning_rate": 4.67802620181897e-05, "loss": 0.3601, "step": 4758000 }, { "epoch": 32.20076331745344, "grad_norm": 0.39952966570854187, "learning_rate": 4.677992366825466e-05, "loss": 0.3602, "step": 4758500 }, { "epoch": 32.204146816803814, "grad_norm": 0.3870437443256378, "learning_rate": 4.677958531831962e-05, "loss": 0.36, "step": 4759000 }, { "epoch": 32.20753031615418, "grad_norm": 0.4017411172389984, "learning_rate": 4.677924696838458e-05, "loss": 0.3598, "step": 4759500 }, { "epoch": 32.21091381550455, "grad_norm": 0.3741097152233124, "learning_rate": 4.677890861844955e-05, "loss": 0.359, "step": 4760000 }, { "epoch": 32.21429731485492, "grad_norm": 0.34826579689979553, "learning_rate": 4.677857026851451e-05, "loss": 0.3603, "step": 4760500 }, { "epoch": 32.21768081420528, "grad_norm": 0.3822405934333801, "learning_rate": 4.677823191857947e-05, "loss": 0.3602, "step": 4761000 }, { "epoch": 32.22106431355565, "grad_norm": 0.42045676708221436, "learning_rate": 4.6777893568644435e-05, "loss": 0.3608, "step": 4761500 }, { "epoch": 32.22444781290602, "grad_norm": 0.3952271044254303, "learning_rate": 4.6777555218709404e-05, "loss": 0.3608, "step": 4762000 }, { "epoch": 32.22783131225639, "grad_norm": 0.34460505843162537, "learning_rate": 4.6777216868774366e-05, "loss": 0.3583, "step": 4762500 }, { "epoch": 32.231214811606755, "grad_norm": 0.4037352204322815, "learning_rate": 4.677687851883933e-05, "loss": 0.3601, "step": 4763000 }, { "epoch": 32.234598310957125, "grad_norm": 0.35192182660102844, "learning_rate": 4.677654016890429e-05, "loss": 0.3587, "step": 4763500 }, { "epoch": 32.237981810307495, "grad_norm": 0.3796621561050415, "learning_rate": 4.677620181896925e-05, "loss": 0.36, "step": 4764000 }, { "epoch": 32.24136530965786, "grad_norm": 0.4029964208602905, "learning_rate": 4.6775863469034214e-05, "loss": 0.3596, "step": 4764500 }, { "epoch": 32.24474880900823, "grad_norm": 0.3702819049358368, "learning_rate": 4.6775525119099176e-05, "loss": 0.3596, "step": 4765000 }, { "epoch": 32.2481323083586, "grad_norm": 0.3341525197029114, "learning_rate": 4.6775186769164145e-05, "loss": 0.3597, "step": 4765500 }, { "epoch": 32.25151580770896, "grad_norm": 0.37364503741264343, "learning_rate": 4.677484841922911e-05, "loss": 0.3597, "step": 4766000 }, { "epoch": 32.25489930705933, "grad_norm": 0.3837050795555115, "learning_rate": 4.677451006929407e-05, "loss": 0.3593, "step": 4766500 }, { "epoch": 32.2582828064097, "grad_norm": 0.390402227640152, "learning_rate": 4.677417171935903e-05, "loss": 0.359, "step": 4767000 }, { "epoch": 32.261666305760066, "grad_norm": 0.37794673442840576, "learning_rate": 4.6773833369424e-05, "loss": 0.359, "step": 4767500 }, { "epoch": 32.265049805110436, "grad_norm": 0.38157832622528076, "learning_rate": 4.677349501948896e-05, "loss": 0.3594, "step": 4768000 }, { "epoch": 32.26843330446081, "grad_norm": 0.3674872815608978, "learning_rate": 4.677315666955392e-05, "loss": 0.3611, "step": 4768500 }, { "epoch": 32.27181680381118, "grad_norm": 0.3956109881401062, "learning_rate": 4.677281831961888e-05, "loss": 0.3602, "step": 4769000 }, { "epoch": 32.27520030316154, "grad_norm": 0.37928852438926697, "learning_rate": 4.677247996968385e-05, "loss": 0.3594, "step": 4769500 }, { "epoch": 32.27858380251191, "grad_norm": 0.38557955622673035, "learning_rate": 4.677214161974881e-05, "loss": 0.3595, "step": 4770000 }, { "epoch": 32.28196730186228, "grad_norm": 0.38661181926727295, "learning_rate": 4.677180326981377e-05, "loss": 0.3581, "step": 4770500 }, { "epoch": 32.285350801212644, "grad_norm": 0.38425928354263306, "learning_rate": 4.6771464919878735e-05, "loss": 0.3598, "step": 4771000 }, { "epoch": 32.288734300563014, "grad_norm": 0.3826102614402771, "learning_rate": 4.6771126569943704e-05, "loss": 0.36, "step": 4771500 }, { "epoch": 32.292117799913385, "grad_norm": 0.40827110409736633, "learning_rate": 4.6770788220008666e-05, "loss": 0.3592, "step": 4772000 }, { "epoch": 32.29550129926375, "grad_norm": 0.3643607497215271, "learning_rate": 4.677044987007363e-05, "loss": 0.3614, "step": 4772500 }, { "epoch": 32.29888479861412, "grad_norm": 0.3747618496417999, "learning_rate": 4.677011152013859e-05, "loss": 0.3588, "step": 4773000 }, { "epoch": 32.30226829796449, "grad_norm": 0.35724756121635437, "learning_rate": 4.676977317020355e-05, "loss": 0.3606, "step": 4773500 }, { "epoch": 32.30565179731485, "grad_norm": 0.3933773338794708, "learning_rate": 4.6769434820268515e-05, "loss": 0.3594, "step": 4774000 }, { "epoch": 32.30903529666522, "grad_norm": 0.36762532591819763, "learning_rate": 4.676909647033348e-05, "loss": 0.3599, "step": 4774500 }, { "epoch": 32.31241879601559, "grad_norm": 0.37173232436180115, "learning_rate": 4.6768758120398446e-05, "loss": 0.3596, "step": 4775000 }, { "epoch": 32.31580229536596, "grad_norm": 0.39160171151161194, "learning_rate": 4.676841977046341e-05, "loss": 0.3604, "step": 4775500 }, { "epoch": 32.319185794716326, "grad_norm": 0.3913881182670593, "learning_rate": 4.676808142052837e-05, "loss": 0.3596, "step": 4776000 }, { "epoch": 32.322569294066696, "grad_norm": 0.37087172269821167, "learning_rate": 4.676774307059333e-05, "loss": 0.3595, "step": 4776500 }, { "epoch": 32.325952793417066, "grad_norm": 0.4002458155155182, "learning_rate": 4.6767404720658294e-05, "loss": 0.3597, "step": 4777000 }, { "epoch": 32.32933629276743, "grad_norm": 0.392475962638855, "learning_rate": 4.676706637072326e-05, "loss": 0.3601, "step": 4777500 }, { "epoch": 32.3327197921178, "grad_norm": 0.37204068899154663, "learning_rate": 4.676672802078822e-05, "loss": 0.3593, "step": 4778000 }, { "epoch": 32.33610329146817, "grad_norm": 0.35469821095466614, "learning_rate": 4.676638967085318e-05, "loss": 0.3597, "step": 4778500 }, { "epoch": 32.33948679081853, "grad_norm": 0.4092603027820587, "learning_rate": 4.676605132091815e-05, "loss": 0.3582, "step": 4779000 }, { "epoch": 32.342870290168904, "grad_norm": 0.36772671341896057, "learning_rate": 4.676571297098311e-05, "loss": 0.359, "step": 4779500 }, { "epoch": 32.346253789519274, "grad_norm": 0.3744930922985077, "learning_rate": 4.6765374621048074e-05, "loss": 0.3605, "step": 4780000 }, { "epoch": 32.34963728886964, "grad_norm": 0.4097921550273895, "learning_rate": 4.6765036271113036e-05, "loss": 0.3601, "step": 4780500 }, { "epoch": 32.35302078822001, "grad_norm": 0.3940643072128296, "learning_rate": 4.6764697921178005e-05, "loss": 0.36, "step": 4781000 }, { "epoch": 32.35640428757038, "grad_norm": 0.37415140867233276, "learning_rate": 4.676435957124297e-05, "loss": 0.3594, "step": 4781500 }, { "epoch": 32.35978778692075, "grad_norm": 0.3879052400588989, "learning_rate": 4.676402122130793e-05, "loss": 0.3597, "step": 4782000 }, { "epoch": 32.36317128627111, "grad_norm": 0.4095955193042755, "learning_rate": 4.676368287137289e-05, "loss": 0.3587, "step": 4782500 }, { "epoch": 32.36655478562148, "grad_norm": 0.3983441889286041, "learning_rate": 4.676334452143785e-05, "loss": 0.36, "step": 4783000 }, { "epoch": 32.36993828497185, "grad_norm": 0.38272032141685486, "learning_rate": 4.6763006171502816e-05, "loss": 0.3613, "step": 4783500 }, { "epoch": 32.373321784322215, "grad_norm": 0.3874930143356323, "learning_rate": 4.676266782156778e-05, "loss": 0.3589, "step": 4784000 }, { "epoch": 32.376705283672585, "grad_norm": 0.3716520369052887, "learning_rate": 4.6762329471632747e-05, "loss": 0.3599, "step": 4784500 }, { "epoch": 32.380088783022956, "grad_norm": 0.3528953492641449, "learning_rate": 4.676199112169771e-05, "loss": 0.3598, "step": 4785000 }, { "epoch": 32.38347228237332, "grad_norm": 0.3771120309829712, "learning_rate": 4.676165277176267e-05, "loss": 0.3577, "step": 4785500 }, { "epoch": 32.38685578172369, "grad_norm": 0.37628158926963806, "learning_rate": 4.676131442182763e-05, "loss": 0.3605, "step": 4786000 }, { "epoch": 32.39023928107406, "grad_norm": 0.37140750885009766, "learning_rate": 4.6760976071892595e-05, "loss": 0.3598, "step": 4786500 }, { "epoch": 32.39362278042443, "grad_norm": 0.40377941727638245, "learning_rate": 4.6760637721957564e-05, "loss": 0.3595, "step": 4787000 }, { "epoch": 32.39700627977479, "grad_norm": 0.3502059578895569, "learning_rate": 4.676029937202252e-05, "loss": 0.3597, "step": 4787500 }, { "epoch": 32.40038977912516, "grad_norm": 0.3411078155040741, "learning_rate": 4.675996102208748e-05, "loss": 0.3597, "step": 4788000 }, { "epoch": 32.40377327847553, "grad_norm": 0.3946812152862549, "learning_rate": 4.675962267215245e-05, "loss": 0.36, "step": 4788500 }, { "epoch": 32.4071567778259, "grad_norm": 0.39661461114883423, "learning_rate": 4.675928432221741e-05, "loss": 0.3618, "step": 4789000 }, { "epoch": 32.41054027717627, "grad_norm": 0.40929490327835083, "learning_rate": 4.6758945972282375e-05, "loss": 0.3597, "step": 4789500 }, { "epoch": 32.41392377652664, "grad_norm": 0.37436795234680176, "learning_rate": 4.675860762234734e-05, "loss": 0.3599, "step": 4790000 }, { "epoch": 32.417307275877, "grad_norm": 0.3523145914077759, "learning_rate": 4.6758269272412306e-05, "loss": 0.3591, "step": 4790500 }, { "epoch": 32.42069077522737, "grad_norm": 0.3629417419433594, "learning_rate": 4.675793092247727e-05, "loss": 0.3609, "step": 4791000 }, { "epoch": 32.42407427457774, "grad_norm": 0.42778724431991577, "learning_rate": 4.675759257254223e-05, "loss": 0.3592, "step": 4791500 }, { "epoch": 32.427457773928104, "grad_norm": 0.3381754457950592, "learning_rate": 4.675725422260719e-05, "loss": 0.3603, "step": 4792000 }, { "epoch": 32.430841273278475, "grad_norm": 0.3926127851009369, "learning_rate": 4.6756915872672154e-05, "loss": 0.3599, "step": 4792500 }, { "epoch": 32.434224772628845, "grad_norm": 0.4157543182373047, "learning_rate": 4.6756577522737116e-05, "loss": 0.3611, "step": 4793000 }, { "epoch": 32.437608271979215, "grad_norm": 0.3517511785030365, "learning_rate": 4.675623917280208e-05, "loss": 0.3607, "step": 4793500 }, { "epoch": 32.44099177132958, "grad_norm": 0.377532035112381, "learning_rate": 4.675590082286704e-05, "loss": 0.3587, "step": 4794000 }, { "epoch": 32.44437527067995, "grad_norm": 0.45183494687080383, "learning_rate": 4.675556247293201e-05, "loss": 0.3605, "step": 4794500 }, { "epoch": 32.44775877003032, "grad_norm": 0.4093371331691742, "learning_rate": 4.675522412299697e-05, "loss": 0.3611, "step": 4795000 }, { "epoch": 32.45114226938068, "grad_norm": 0.38602185249328613, "learning_rate": 4.6754885773061934e-05, "loss": 0.3602, "step": 4795500 }, { "epoch": 32.45452576873105, "grad_norm": 0.3903023600578308, "learning_rate": 4.6754547423126896e-05, "loss": 0.3591, "step": 4796000 }, { "epoch": 32.45790926808142, "grad_norm": 0.3937007486820221, "learning_rate": 4.6754209073191865e-05, "loss": 0.3576, "step": 4796500 }, { "epoch": 32.461292767431786, "grad_norm": 0.37577611207962036, "learning_rate": 4.675387072325682e-05, "loss": 0.3592, "step": 4797000 }, { "epoch": 32.464676266782156, "grad_norm": 0.3842858076095581, "learning_rate": 4.675353237332178e-05, "loss": 0.3596, "step": 4797500 }, { "epoch": 32.46805976613253, "grad_norm": 0.3671128451824188, "learning_rate": 4.675319402338675e-05, "loss": 0.3595, "step": 4798000 }, { "epoch": 32.47144326548289, "grad_norm": 0.40977221727371216, "learning_rate": 4.675285567345171e-05, "loss": 0.3591, "step": 4798500 }, { "epoch": 32.47482676483326, "grad_norm": 0.3818470239639282, "learning_rate": 4.6752517323516675e-05, "loss": 0.3584, "step": 4799000 }, { "epoch": 32.47821026418363, "grad_norm": 0.429762065410614, "learning_rate": 4.675217897358164e-05, "loss": 0.3608, "step": 4799500 }, { "epoch": 32.481593763534, "grad_norm": 0.3330877125263214, "learning_rate": 4.6751840623646606e-05, "loss": 0.3603, "step": 4800000 }, { "epoch": 32.484977262884364, "grad_norm": 0.3307543396949768, "learning_rate": 4.675150227371157e-05, "loss": 0.3603, "step": 4800500 }, { "epoch": 32.488360762234734, "grad_norm": 0.4174194037914276, "learning_rate": 4.675116392377653e-05, "loss": 0.3612, "step": 4801000 }, { "epoch": 32.491744261585104, "grad_norm": 0.39622727036476135, "learning_rate": 4.675082557384149e-05, "loss": 0.3587, "step": 4801500 }, { "epoch": 32.49512776093547, "grad_norm": 0.38315004110336304, "learning_rate": 4.6750487223906455e-05, "loss": 0.3598, "step": 4802000 }, { "epoch": 32.49851126028584, "grad_norm": 0.3557627201080322, "learning_rate": 4.675014887397142e-05, "loss": 0.3595, "step": 4802500 }, { "epoch": 32.50189475963621, "grad_norm": 0.3378044366836548, "learning_rate": 4.674981052403638e-05, "loss": 0.3598, "step": 4803000 }, { "epoch": 32.50527825898657, "grad_norm": 0.3784140348434448, "learning_rate": 4.674947217410134e-05, "loss": 0.3594, "step": 4803500 }, { "epoch": 32.50866175833694, "grad_norm": 0.3857872188091278, "learning_rate": 4.674913382416631e-05, "loss": 0.3604, "step": 4804000 }, { "epoch": 32.51204525768731, "grad_norm": 0.37597906589508057, "learning_rate": 4.674879547423127e-05, "loss": 0.3583, "step": 4804500 }, { "epoch": 32.515428757037675, "grad_norm": 0.3755928575992584, "learning_rate": 4.6748457124296234e-05, "loss": 0.3581, "step": 4805000 }, { "epoch": 32.518812256388046, "grad_norm": 0.38332635164260864, "learning_rate": 4.6748118774361196e-05, "loss": 0.3601, "step": 4805500 }, { "epoch": 32.522195755738416, "grad_norm": 0.40942707657814026, "learning_rate": 4.6747780424426165e-05, "loss": 0.361, "step": 4806000 }, { "epoch": 32.525579255088786, "grad_norm": 0.3584476709365845, "learning_rate": 4.674744207449112e-05, "loss": 0.3581, "step": 4806500 }, { "epoch": 32.52896275443915, "grad_norm": 0.363699734210968, "learning_rate": 4.674710372455608e-05, "loss": 0.3595, "step": 4807000 }, { "epoch": 32.53234625378952, "grad_norm": 0.38536036014556885, "learning_rate": 4.674676537462105e-05, "loss": 0.3594, "step": 4807500 }, { "epoch": 32.53572975313989, "grad_norm": 0.36360061168670654, "learning_rate": 4.6746427024686014e-05, "loss": 0.3588, "step": 4808000 }, { "epoch": 32.53911325249025, "grad_norm": 0.3698710799217224, "learning_rate": 4.6746088674750976e-05, "loss": 0.3603, "step": 4808500 }, { "epoch": 32.54249675184062, "grad_norm": 0.4059138596057892, "learning_rate": 4.674575032481594e-05, "loss": 0.36, "step": 4809000 }, { "epoch": 32.545880251190994, "grad_norm": 0.4048005938529968, "learning_rate": 4.674541197488091e-05, "loss": 0.3596, "step": 4809500 }, { "epoch": 32.54926375054136, "grad_norm": 0.37766924500465393, "learning_rate": 4.674507362494587e-05, "loss": 0.3606, "step": 4810000 }, { "epoch": 32.55264724989173, "grad_norm": 0.3820647597312927, "learning_rate": 4.674473527501083e-05, "loss": 0.361, "step": 4810500 }, { "epoch": 32.5560307492421, "grad_norm": 0.338340163230896, "learning_rate": 4.674439692507579e-05, "loss": 0.3601, "step": 4811000 }, { "epoch": 32.55941424859246, "grad_norm": 0.36598479747772217, "learning_rate": 4.6744058575140755e-05, "loss": 0.3583, "step": 4811500 }, { "epoch": 32.56279774794283, "grad_norm": 0.34915420413017273, "learning_rate": 4.674372022520572e-05, "loss": 0.3599, "step": 4812000 }, { "epoch": 32.5661812472932, "grad_norm": 0.3632654845714569, "learning_rate": 4.674338187527068e-05, "loss": 0.3617, "step": 4812500 }, { "epoch": 32.56956474664357, "grad_norm": 0.40251436829566956, "learning_rate": 4.674304352533564e-05, "loss": 0.3595, "step": 4813000 }, { "epoch": 32.572948245993935, "grad_norm": 0.39226406812667847, "learning_rate": 4.674270517540061e-05, "loss": 0.3579, "step": 4813500 }, { "epoch": 32.576331745344305, "grad_norm": 0.3955608606338501, "learning_rate": 4.674236682546557e-05, "loss": 0.3601, "step": 4814000 }, { "epoch": 32.579715244694675, "grad_norm": 0.3937000632286072, "learning_rate": 4.6742028475530535e-05, "loss": 0.3597, "step": 4814500 }, { "epoch": 32.58309874404504, "grad_norm": 0.38861674070358276, "learning_rate": 4.67416901255955e-05, "loss": 0.3599, "step": 4815000 }, { "epoch": 32.58648224339541, "grad_norm": 0.40989431738853455, "learning_rate": 4.6741351775660466e-05, "loss": 0.3584, "step": 4815500 }, { "epoch": 32.58986574274578, "grad_norm": 0.3231464624404907, "learning_rate": 4.674101342572542e-05, "loss": 0.3611, "step": 4816000 }, { "epoch": 32.59324924209614, "grad_norm": 0.3813242018222809, "learning_rate": 4.674067507579038e-05, "loss": 0.3609, "step": 4816500 }, { "epoch": 32.59663274144651, "grad_norm": 0.3899689316749573, "learning_rate": 4.674033672585535e-05, "loss": 0.36, "step": 4817000 }, { "epoch": 32.60001624079688, "grad_norm": 0.36689555644989014, "learning_rate": 4.6739998375920314e-05, "loss": 0.361, "step": 4817500 }, { "epoch": 32.60339974014725, "grad_norm": 0.40323251485824585, "learning_rate": 4.6739660025985276e-05, "loss": 0.3602, "step": 4818000 }, { "epoch": 32.60678323949762, "grad_norm": 0.388261079788208, "learning_rate": 4.673932167605024e-05, "loss": 0.3607, "step": 4818500 }, { "epoch": 32.61016673884799, "grad_norm": 0.3827062249183655, "learning_rate": 4.673898332611521e-05, "loss": 0.3599, "step": 4819000 }, { "epoch": 32.61355023819836, "grad_norm": 0.37925106287002563, "learning_rate": 4.673864497618017e-05, "loss": 0.362, "step": 4819500 }, { "epoch": 32.61693373754872, "grad_norm": 0.35628315806388855, "learning_rate": 4.673830662624513e-05, "loss": 0.3597, "step": 4820000 }, { "epoch": 32.62031723689909, "grad_norm": 0.37639716267585754, "learning_rate": 4.6737968276310094e-05, "loss": 0.3591, "step": 4820500 }, { "epoch": 32.62370073624946, "grad_norm": 0.37759920954704285, "learning_rate": 4.6737629926375056e-05, "loss": 0.3618, "step": 4821000 }, { "epoch": 32.627084235599824, "grad_norm": 0.3671201765537262, "learning_rate": 4.673729157644002e-05, "loss": 0.3609, "step": 4821500 }, { "epoch": 32.630467734950194, "grad_norm": 0.32618698477745056, "learning_rate": 4.673695322650498e-05, "loss": 0.36, "step": 4822000 }, { "epoch": 32.633851234300565, "grad_norm": 0.3773176074028015, "learning_rate": 4.673661487656994e-05, "loss": 0.3591, "step": 4822500 }, { "epoch": 32.63723473365093, "grad_norm": 0.39158689975738525, "learning_rate": 4.673627652663491e-05, "loss": 0.3604, "step": 4823000 }, { "epoch": 32.6406182330013, "grad_norm": 0.38822489976882935, "learning_rate": 4.673593817669987e-05, "loss": 0.3607, "step": 4823500 }, { "epoch": 32.64400173235167, "grad_norm": 0.36614277958869934, "learning_rate": 4.6735599826764835e-05, "loss": 0.3594, "step": 4824000 }, { "epoch": 32.64738523170204, "grad_norm": 0.39221155643463135, "learning_rate": 4.67352614768298e-05, "loss": 0.3608, "step": 4824500 }, { "epoch": 32.6507687310524, "grad_norm": 0.33508527278900146, "learning_rate": 4.6734923126894766e-05, "loss": 0.3603, "step": 4825000 }, { "epoch": 32.65415223040277, "grad_norm": 0.3844468295574188, "learning_rate": 4.673458477695973e-05, "loss": 0.3604, "step": 4825500 }, { "epoch": 32.65753572975314, "grad_norm": 0.35471558570861816, "learning_rate": 4.6734246427024684e-05, "loss": 0.3601, "step": 4826000 }, { "epoch": 32.660919229103506, "grad_norm": 0.3607887029647827, "learning_rate": 4.673390807708965e-05, "loss": 0.3592, "step": 4826500 }, { "epoch": 32.664302728453876, "grad_norm": 0.3825249671936035, "learning_rate": 4.6733569727154615e-05, "loss": 0.3613, "step": 4827000 }, { "epoch": 32.667686227804246, "grad_norm": 0.3158179819583893, "learning_rate": 4.673323137721958e-05, "loss": 0.3599, "step": 4827500 }, { "epoch": 32.67106972715461, "grad_norm": 0.37185347080230713, "learning_rate": 4.673289302728454e-05, "loss": 0.3614, "step": 4828000 }, { "epoch": 32.67445322650498, "grad_norm": 0.3913642168045044, "learning_rate": 4.673255467734951e-05, "loss": 0.3598, "step": 4828500 }, { "epoch": 32.67783672585535, "grad_norm": 0.36755040287971497, "learning_rate": 4.673221632741447e-05, "loss": 0.358, "step": 4829000 }, { "epoch": 32.68122022520571, "grad_norm": 0.3860124349594116, "learning_rate": 4.673187797747943e-05, "loss": 0.36, "step": 4829500 }, { "epoch": 32.684603724556084, "grad_norm": 0.4249396026134491, "learning_rate": 4.6731539627544394e-05, "loss": 0.3605, "step": 4830000 }, { "epoch": 32.687987223906454, "grad_norm": 0.39165258407592773, "learning_rate": 4.6731201277609357e-05, "loss": 0.3593, "step": 4830500 }, { "epoch": 32.691370723256824, "grad_norm": 0.39184510707855225, "learning_rate": 4.673086292767432e-05, "loss": 0.3595, "step": 4831000 }, { "epoch": 32.69475422260719, "grad_norm": 0.3789173662662506, "learning_rate": 4.673052457773928e-05, "loss": 0.3587, "step": 4831500 }, { "epoch": 32.69813772195756, "grad_norm": 0.3882516920566559, "learning_rate": 4.673018622780424e-05, "loss": 0.3605, "step": 4832000 }, { "epoch": 32.70152122130793, "grad_norm": 0.39031362533569336, "learning_rate": 4.672984787786921e-05, "loss": 0.36, "step": 4832500 }, { "epoch": 32.70490472065829, "grad_norm": 0.3862682282924652, "learning_rate": 4.6729509527934174e-05, "loss": 0.3595, "step": 4833000 }, { "epoch": 32.70828822000866, "grad_norm": 0.3593071699142456, "learning_rate": 4.6729171177999136e-05, "loss": 0.36, "step": 4833500 }, { "epoch": 32.71167171935903, "grad_norm": 0.36724409461021423, "learning_rate": 4.67288328280641e-05, "loss": 0.3619, "step": 4834000 }, { "epoch": 32.715055218709395, "grad_norm": 0.36828306317329407, "learning_rate": 4.672849447812907e-05, "loss": 0.3615, "step": 4834500 }, { "epoch": 32.718438718059765, "grad_norm": 0.36529555916786194, "learning_rate": 4.672815612819403e-05, "loss": 0.3605, "step": 4835000 }, { "epoch": 32.721822217410136, "grad_norm": 0.36009782552719116, "learning_rate": 4.6727817778258985e-05, "loss": 0.3617, "step": 4835500 }, { "epoch": 32.7252057167605, "grad_norm": 0.3862138092517853, "learning_rate": 4.6727479428323953e-05, "loss": 0.36, "step": 4836000 }, { "epoch": 32.72858921611087, "grad_norm": 0.45396357774734497, "learning_rate": 4.6727141078388916e-05, "loss": 0.3611, "step": 4836500 }, { "epoch": 32.73197271546124, "grad_norm": 0.3386971354484558, "learning_rate": 4.672680272845388e-05, "loss": 0.3605, "step": 4837000 }, { "epoch": 32.73535621481161, "grad_norm": 0.39177754521369934, "learning_rate": 4.672646437851884e-05, "loss": 0.3595, "step": 4837500 }, { "epoch": 32.73873971416197, "grad_norm": 0.3584165871143341, "learning_rate": 4.672612602858381e-05, "loss": 0.3598, "step": 4838000 }, { "epoch": 32.74212321351234, "grad_norm": 0.3619323968887329, "learning_rate": 4.672578767864877e-05, "loss": 0.3605, "step": 4838500 }, { "epoch": 32.74550671286271, "grad_norm": 0.4178721010684967, "learning_rate": 4.672544932871373e-05, "loss": 0.3603, "step": 4839000 }, { "epoch": 32.74889021221308, "grad_norm": 0.34086382389068604, "learning_rate": 4.6725110978778695e-05, "loss": 0.3589, "step": 4839500 }, { "epoch": 32.75227371156345, "grad_norm": 0.36982759833335876, "learning_rate": 4.672477262884366e-05, "loss": 0.3608, "step": 4840000 }, { "epoch": 32.75565721091382, "grad_norm": 0.34839120507240295, "learning_rate": 4.672443427890862e-05, "loss": 0.361, "step": 4840500 }, { "epoch": 32.75904071026418, "grad_norm": 0.39314019680023193, "learning_rate": 4.672409592897358e-05, "loss": 0.3602, "step": 4841000 }, { "epoch": 32.76242420961455, "grad_norm": 0.3656735122203827, "learning_rate": 4.6723757579038544e-05, "loss": 0.3606, "step": 4841500 }, { "epoch": 32.76580770896492, "grad_norm": 0.3888089954853058, "learning_rate": 4.672341922910351e-05, "loss": 0.3609, "step": 4842000 }, { "epoch": 32.76919120831529, "grad_norm": 0.3699166476726532, "learning_rate": 4.6723080879168475e-05, "loss": 0.3597, "step": 4842500 }, { "epoch": 32.772574707665655, "grad_norm": 0.3918304443359375, "learning_rate": 4.672274252923344e-05, "loss": 0.3619, "step": 4843000 }, { "epoch": 32.775958207016025, "grad_norm": 0.3193145990371704, "learning_rate": 4.67224041792984e-05, "loss": 0.3607, "step": 4843500 }, { "epoch": 32.779341706366395, "grad_norm": 0.38491323590278625, "learning_rate": 4.672206582936337e-05, "loss": 0.3613, "step": 4844000 }, { "epoch": 32.78272520571676, "grad_norm": 0.3670559823513031, "learning_rate": 4.672172747942833e-05, "loss": 0.36, "step": 4844500 }, { "epoch": 32.78610870506713, "grad_norm": 0.35742372274398804, "learning_rate": 4.6721389129493285e-05, "loss": 0.3591, "step": 4845000 }, { "epoch": 32.7894922044175, "grad_norm": 0.39176833629608154, "learning_rate": 4.6721050779558254e-05, "loss": 0.3599, "step": 4845500 }, { "epoch": 32.79287570376786, "grad_norm": 0.3562770187854767, "learning_rate": 4.6720712429623216e-05, "loss": 0.3596, "step": 4846000 }, { "epoch": 32.79625920311823, "grad_norm": 0.40307995676994324, "learning_rate": 4.672037407968818e-05, "loss": 0.3599, "step": 4846500 }, { "epoch": 32.7996427024686, "grad_norm": 0.3937683701515198, "learning_rate": 4.672003572975314e-05, "loss": 0.3609, "step": 4847000 }, { "epoch": 32.803026201818966, "grad_norm": 0.3605369031429291, "learning_rate": 4.67196973798181e-05, "loss": 0.3599, "step": 4847500 }, { "epoch": 32.806409701169336, "grad_norm": 0.3692830801010132, "learning_rate": 4.671935902988307e-05, "loss": 0.3589, "step": 4848000 }, { "epoch": 32.80979320051971, "grad_norm": 0.3484083116054535, "learning_rate": 4.6719020679948034e-05, "loss": 0.3594, "step": 4848500 }, { "epoch": 32.81317669987008, "grad_norm": 0.35944950580596924, "learning_rate": 4.6718682330012996e-05, "loss": 0.3596, "step": 4849000 }, { "epoch": 32.81656019922044, "grad_norm": 0.374080091714859, "learning_rate": 4.671834398007796e-05, "loss": 0.3611, "step": 4849500 }, { "epoch": 32.81994369857081, "grad_norm": 0.41749081015586853, "learning_rate": 4.671800563014292e-05, "loss": 0.3601, "step": 4850000 }, { "epoch": 32.82332719792118, "grad_norm": 0.3499038517475128, "learning_rate": 4.671766728020788e-05, "loss": 0.3597, "step": 4850500 }, { "epoch": 32.826710697271544, "grad_norm": 0.3616945743560791, "learning_rate": 4.6717328930272844e-05, "loss": 0.3604, "step": 4851000 }, { "epoch": 32.830094196621914, "grad_norm": 0.3701276183128357, "learning_rate": 4.671699058033781e-05, "loss": 0.3589, "step": 4851500 }, { "epoch": 32.833477695972284, "grad_norm": 0.4054557979106903, "learning_rate": 4.6716652230402775e-05, "loss": 0.3605, "step": 4852000 }, { "epoch": 32.83686119532265, "grad_norm": 0.3850688338279724, "learning_rate": 4.671631388046774e-05, "loss": 0.3607, "step": 4852500 }, { "epoch": 32.84024469467302, "grad_norm": 0.3332974314689636, "learning_rate": 4.67159755305327e-05, "loss": 0.361, "step": 4853000 }, { "epoch": 32.84362819402339, "grad_norm": 0.40008342266082764, "learning_rate": 4.671563718059767e-05, "loss": 0.3588, "step": 4853500 }, { "epoch": 32.84701169337375, "grad_norm": 0.3763919174671173, "learning_rate": 4.671529883066263e-05, "loss": 0.3595, "step": 4854000 }, { "epoch": 32.85039519272412, "grad_norm": 0.3576953113079071, "learning_rate": 4.6714960480727586e-05, "loss": 0.3602, "step": 4854500 }, { "epoch": 32.85377869207449, "grad_norm": 0.38050130009651184, "learning_rate": 4.6714622130792555e-05, "loss": 0.3601, "step": 4855000 }, { "epoch": 32.85716219142486, "grad_norm": 0.35217756032943726, "learning_rate": 4.671428378085752e-05, "loss": 0.3594, "step": 4855500 }, { "epoch": 32.860545690775226, "grad_norm": 0.33961227536201477, "learning_rate": 4.671394543092248e-05, "loss": 0.36, "step": 4856000 }, { "epoch": 32.863929190125596, "grad_norm": 0.3544408977031708, "learning_rate": 4.671360708098744e-05, "loss": 0.361, "step": 4856500 }, { "epoch": 32.867312689475966, "grad_norm": 0.431446373462677, "learning_rate": 4.67132687310524e-05, "loss": 0.3598, "step": 4857000 }, { "epoch": 32.87069618882633, "grad_norm": 0.39491701126098633, "learning_rate": 4.671293038111737e-05, "loss": 0.3601, "step": 4857500 }, { "epoch": 32.8740796881767, "grad_norm": 0.36959463357925415, "learning_rate": 4.6712592031182334e-05, "loss": 0.3603, "step": 4858000 }, { "epoch": 32.87746318752707, "grad_norm": 0.37260931730270386, "learning_rate": 4.6712253681247296e-05, "loss": 0.3602, "step": 4858500 }, { "epoch": 32.88084668687743, "grad_norm": 0.3644866347312927, "learning_rate": 4.671191533131226e-05, "loss": 0.3607, "step": 4859000 }, { "epoch": 32.8842301862278, "grad_norm": 0.3580959439277649, "learning_rate": 4.671157698137722e-05, "loss": 0.3599, "step": 4859500 }, { "epoch": 32.887613685578174, "grad_norm": 0.37152689695358276, "learning_rate": 4.671123863144218e-05, "loss": 0.3601, "step": 4860000 }, { "epoch": 32.89099718492854, "grad_norm": 0.3975582420825958, "learning_rate": 4.6710900281507145e-05, "loss": 0.3601, "step": 4860500 }, { "epoch": 32.89438068427891, "grad_norm": 0.35651257634162903, "learning_rate": 4.6710561931572114e-05, "loss": 0.3608, "step": 4861000 }, { "epoch": 32.89776418362928, "grad_norm": 0.36524707078933716, "learning_rate": 4.6710223581637076e-05, "loss": 0.3594, "step": 4861500 }, { "epoch": 32.90114768297965, "grad_norm": 0.3711818754673004, "learning_rate": 4.670988523170204e-05, "loss": 0.3584, "step": 4862000 }, { "epoch": 32.90453118233001, "grad_norm": 0.39311543107032776, "learning_rate": 4.6709546881767e-05, "loss": 0.3595, "step": 4862500 }, { "epoch": 32.90791468168038, "grad_norm": 0.3769167363643646, "learning_rate": 4.670920853183197e-05, "loss": 0.3623, "step": 4863000 }, { "epoch": 32.91129818103075, "grad_norm": 0.40175095200538635, "learning_rate": 4.670887018189693e-05, "loss": 0.3613, "step": 4863500 }, { "epoch": 32.914681680381115, "grad_norm": 0.38338661193847656, "learning_rate": 4.6708531831961886e-05, "loss": 0.3593, "step": 4864000 }, { "epoch": 32.918065179731485, "grad_norm": 0.37028419971466064, "learning_rate": 4.670819348202685e-05, "loss": 0.3617, "step": 4864500 }, { "epoch": 32.921448679081855, "grad_norm": 0.3657394349575043, "learning_rate": 4.670785513209182e-05, "loss": 0.3597, "step": 4865000 }, { "epoch": 32.92483217843222, "grad_norm": 0.3838844299316406, "learning_rate": 4.670751678215678e-05, "loss": 0.3596, "step": 4865500 }, { "epoch": 32.92821567778259, "grad_norm": 0.3631505072116852, "learning_rate": 4.670717843222174e-05, "loss": 0.3587, "step": 4866000 }, { "epoch": 32.93159917713296, "grad_norm": 0.3473980128765106, "learning_rate": 4.6706840082286704e-05, "loss": 0.3604, "step": 4866500 }, { "epoch": 32.93498267648333, "grad_norm": 0.3915610611438751, "learning_rate": 4.670650173235167e-05, "loss": 0.3594, "step": 4867000 }, { "epoch": 32.93836617583369, "grad_norm": 0.3296775221824646, "learning_rate": 4.6706163382416635e-05, "loss": 0.3595, "step": 4867500 }, { "epoch": 32.94174967518406, "grad_norm": 0.35316115617752075, "learning_rate": 4.67058250324816e-05, "loss": 0.3616, "step": 4868000 }, { "epoch": 32.94513317453443, "grad_norm": 0.3577609062194824, "learning_rate": 4.670548668254656e-05, "loss": 0.3616, "step": 4868500 }, { "epoch": 32.9485166738848, "grad_norm": 0.3541504740715027, "learning_rate": 4.670514833261152e-05, "loss": 0.3601, "step": 4869000 }, { "epoch": 32.95190017323517, "grad_norm": 0.3470286726951599, "learning_rate": 4.670480998267648e-05, "loss": 0.3599, "step": 4869500 }, { "epoch": 32.95528367258554, "grad_norm": 0.3620125651359558, "learning_rate": 4.6704471632741445e-05, "loss": 0.3595, "step": 4870000 }, { "epoch": 32.9586671719359, "grad_norm": 0.371470183134079, "learning_rate": 4.6704133282806414e-05, "loss": 0.3622, "step": 4870500 }, { "epoch": 32.96205067128627, "grad_norm": 0.3785405457019806, "learning_rate": 4.6703794932871376e-05, "loss": 0.3598, "step": 4871000 }, { "epoch": 32.96543417063664, "grad_norm": 0.372321218252182, "learning_rate": 4.670345658293634e-05, "loss": 0.3601, "step": 4871500 }, { "epoch": 32.968817669987004, "grad_norm": 0.36906686425209045, "learning_rate": 4.67031182330013e-05, "loss": 0.3588, "step": 4872000 }, { "epoch": 32.972201169337374, "grad_norm": 0.4017215371131897, "learning_rate": 4.670277988306627e-05, "loss": 0.36, "step": 4872500 }, { "epoch": 32.975584668687745, "grad_norm": 0.3841962516307831, "learning_rate": 4.670244153313123e-05, "loss": 0.3593, "step": 4873000 }, { "epoch": 32.978968168038115, "grad_norm": 0.3314242362976074, "learning_rate": 4.670210318319619e-05, "loss": 0.3613, "step": 4873500 }, { "epoch": 32.98235166738848, "grad_norm": 0.37515687942504883, "learning_rate": 4.670176483326115e-05, "loss": 0.3598, "step": 4874000 }, { "epoch": 32.98573516673885, "grad_norm": 0.37607279419898987, "learning_rate": 4.670142648332612e-05, "loss": 0.361, "step": 4874500 }, { "epoch": 32.98911866608922, "grad_norm": 0.39258554577827454, "learning_rate": 4.670108813339108e-05, "loss": 0.3606, "step": 4875000 }, { "epoch": 32.99250216543958, "grad_norm": 0.39667394757270813, "learning_rate": 4.670074978345604e-05, "loss": 0.3608, "step": 4875500 }, { "epoch": 32.99588566478995, "grad_norm": 0.3682968318462372, "learning_rate": 4.6700411433521004e-05, "loss": 0.3602, "step": 4876000 }, { "epoch": 32.99926916414032, "grad_norm": 0.3392215967178345, "learning_rate": 4.670007308358597e-05, "loss": 0.3602, "step": 4876500 }, { "epoch": 33.0, "eval_accuracy": 0.862800375099966, "eval_loss": 0.5578371286392212, "eval_runtime": 3360.5352, "eval_samples_per_second": 86.517, "eval_steps_per_second": 5.407, "step": 4876608 }, { "epoch": 33.002652663490686, "grad_norm": 0.40107080340385437, "learning_rate": 4.6699734733650935e-05, "loss": 0.3596, "step": 4877000 }, { "epoch": 33.006036162841056, "grad_norm": 0.366609126329422, "learning_rate": 4.66993963837159e-05, "loss": 0.3588, "step": 4877500 }, { "epoch": 33.009419662191426, "grad_norm": 0.37344497442245483, "learning_rate": 4.669905803378086e-05, "loss": 0.358, "step": 4878000 }, { "epoch": 33.01280316154179, "grad_norm": 0.3847726285457611, "learning_rate": 4.669871968384582e-05, "loss": 0.3586, "step": 4878500 }, { "epoch": 33.01618666089216, "grad_norm": 0.3856114447116852, "learning_rate": 4.6698381333910784e-05, "loss": 0.3557, "step": 4879000 }, { "epoch": 33.01957016024253, "grad_norm": 0.40235546231269836, "learning_rate": 4.6698042983975746e-05, "loss": 0.3593, "step": 4879500 }, { "epoch": 33.0229536595929, "grad_norm": 0.37816309928894043, "learning_rate": 4.6697704634040715e-05, "loss": 0.3573, "step": 4880000 }, { "epoch": 33.026337158943264, "grad_norm": 0.3726678192615509, "learning_rate": 4.669736628410568e-05, "loss": 0.3558, "step": 4880500 }, { "epoch": 33.029720658293634, "grad_norm": 0.3437913954257965, "learning_rate": 4.669702793417064e-05, "loss": 0.3588, "step": 4881000 }, { "epoch": 33.033104157644004, "grad_norm": 0.40929800271987915, "learning_rate": 4.66966895842356e-05, "loss": 0.3574, "step": 4881500 }, { "epoch": 33.03648765699437, "grad_norm": 0.3646358251571655, "learning_rate": 4.669635123430057e-05, "loss": 0.3566, "step": 4882000 }, { "epoch": 33.03987115634474, "grad_norm": 0.38432562351226807, "learning_rate": 4.669601288436553e-05, "loss": 0.3575, "step": 4882500 }, { "epoch": 33.04325465569511, "grad_norm": 0.3522011935710907, "learning_rate": 4.669567453443049e-05, "loss": 0.3586, "step": 4883000 }, { "epoch": 33.04663815504547, "grad_norm": 0.3728903830051422, "learning_rate": 4.669533618449545e-05, "loss": 0.3585, "step": 4883500 }, { "epoch": 33.05002165439584, "grad_norm": 0.366802453994751, "learning_rate": 4.669499783456042e-05, "loss": 0.3586, "step": 4884000 }, { "epoch": 33.05340515374621, "grad_norm": 0.3676486611366272, "learning_rate": 4.669465948462538e-05, "loss": 0.3582, "step": 4884500 }, { "epoch": 33.056788653096575, "grad_norm": 0.38962480425834656, "learning_rate": 4.669432113469034e-05, "loss": 0.3586, "step": 4885000 }, { "epoch": 33.060172152446945, "grad_norm": 0.4253799319267273, "learning_rate": 4.6693982784755305e-05, "loss": 0.3583, "step": 4885500 }, { "epoch": 33.063555651797316, "grad_norm": 0.3932543992996216, "learning_rate": 4.6693644434820274e-05, "loss": 0.3582, "step": 4886000 }, { "epoch": 33.066939151147686, "grad_norm": 0.39341434836387634, "learning_rate": 4.6693306084885236e-05, "loss": 0.358, "step": 4886500 }, { "epoch": 33.07032265049805, "grad_norm": 0.38702037930488586, "learning_rate": 4.66929677349502e-05, "loss": 0.3588, "step": 4887000 }, { "epoch": 33.07370614984842, "grad_norm": 0.3913222849369049, "learning_rate": 4.669262938501516e-05, "loss": 0.3594, "step": 4887500 }, { "epoch": 33.07708964919879, "grad_norm": 0.3729381561279297, "learning_rate": 4.669229103508012e-05, "loss": 0.3587, "step": 4888000 }, { "epoch": 33.08047314854915, "grad_norm": 0.3916638493537903, "learning_rate": 4.6691952685145085e-05, "loss": 0.3596, "step": 4888500 }, { "epoch": 33.08385664789952, "grad_norm": 0.3831269145011902, "learning_rate": 4.669161433521005e-05, "loss": 0.3584, "step": 4889000 }, { "epoch": 33.087240147249894, "grad_norm": 0.3752813935279846, "learning_rate": 4.6691275985275016e-05, "loss": 0.3596, "step": 4889500 }, { "epoch": 33.09062364660026, "grad_norm": 0.37731239199638367, "learning_rate": 4.669093763533998e-05, "loss": 0.3589, "step": 4890000 }, { "epoch": 33.09400714595063, "grad_norm": 0.40924587845802307, "learning_rate": 4.669059928540494e-05, "loss": 0.3581, "step": 4890500 }, { "epoch": 33.097390645301, "grad_norm": 0.3785557150840759, "learning_rate": 4.66902609354699e-05, "loss": 0.3586, "step": 4891000 }, { "epoch": 33.10077414465137, "grad_norm": 0.39560848474502563, "learning_rate": 4.668992258553487e-05, "loss": 0.3566, "step": 4891500 }, { "epoch": 33.10415764400173, "grad_norm": 0.4397170841693878, "learning_rate": 4.668958423559983e-05, "loss": 0.3584, "step": 4892000 }, { "epoch": 33.1075411433521, "grad_norm": 0.4220854640007019, "learning_rate": 4.668924588566479e-05, "loss": 0.3603, "step": 4892500 }, { "epoch": 33.11092464270247, "grad_norm": 0.4177074730396271, "learning_rate": 4.668890753572975e-05, "loss": 0.3579, "step": 4893000 }, { "epoch": 33.114308142052835, "grad_norm": 0.35741767287254333, "learning_rate": 4.668856918579472e-05, "loss": 0.3576, "step": 4893500 }, { "epoch": 33.117691641403205, "grad_norm": 0.36441221833229065, "learning_rate": 4.668823083585968e-05, "loss": 0.3579, "step": 4894000 }, { "epoch": 33.121075140753575, "grad_norm": 0.3814169466495514, "learning_rate": 4.6687892485924644e-05, "loss": 0.359, "step": 4894500 }, { "epoch": 33.12445864010394, "grad_norm": 0.39792677760124207, "learning_rate": 4.6687554135989606e-05, "loss": 0.3582, "step": 4895000 }, { "epoch": 33.12784213945431, "grad_norm": 0.38604456186294556, "learning_rate": 4.6687215786054575e-05, "loss": 0.3575, "step": 4895500 }, { "epoch": 33.13122563880468, "grad_norm": 0.3650904595851898, "learning_rate": 4.668687743611954e-05, "loss": 0.3598, "step": 4896000 }, { "epoch": 33.13460913815504, "grad_norm": 0.41717758774757385, "learning_rate": 4.66865390861845e-05, "loss": 0.3583, "step": 4896500 }, { "epoch": 33.13799263750541, "grad_norm": 0.3828420639038086, "learning_rate": 4.668620073624946e-05, "loss": 0.3593, "step": 4897000 }, { "epoch": 33.14137613685578, "grad_norm": 0.41209474205970764, "learning_rate": 4.668586238631442e-05, "loss": 0.3582, "step": 4897500 }, { "epoch": 33.14475963620615, "grad_norm": 0.3780669867992401, "learning_rate": 4.6685524036379385e-05, "loss": 0.3591, "step": 4898000 }, { "epoch": 33.148143135556516, "grad_norm": 0.35185787081718445, "learning_rate": 4.668518568644435e-05, "loss": 0.3577, "step": 4898500 }, { "epoch": 33.15152663490689, "grad_norm": 0.3798390030860901, "learning_rate": 4.6684847336509316e-05, "loss": 0.3575, "step": 4899000 }, { "epoch": 33.15491013425726, "grad_norm": 0.38691797852516174, "learning_rate": 4.668450898657428e-05, "loss": 0.359, "step": 4899500 }, { "epoch": 33.15829363360762, "grad_norm": 0.34414389729499817, "learning_rate": 4.668417063663924e-05, "loss": 0.3597, "step": 4900000 }, { "epoch": 33.16167713295799, "grad_norm": 0.3660948574542999, "learning_rate": 4.66838322867042e-05, "loss": 0.3598, "step": 4900500 }, { "epoch": 33.16506063230836, "grad_norm": 0.3499019145965576, "learning_rate": 4.668349393676917e-05, "loss": 0.3602, "step": 4901000 }, { "epoch": 33.168444131658724, "grad_norm": 0.3914853036403656, "learning_rate": 4.6683155586834134e-05, "loss": 0.3587, "step": 4901500 }, { "epoch": 33.171827631009094, "grad_norm": 0.37145745754241943, "learning_rate": 4.668281723689909e-05, "loss": 0.3585, "step": 4902000 }, { "epoch": 33.175211130359465, "grad_norm": 0.3621034622192383, "learning_rate": 4.668247888696405e-05, "loss": 0.3594, "step": 4902500 }, { "epoch": 33.17859462970983, "grad_norm": 0.3783204257488251, "learning_rate": 4.668214053702902e-05, "loss": 0.3592, "step": 4903000 }, { "epoch": 33.1819781290602, "grad_norm": 0.3723049759864807, "learning_rate": 4.668180218709398e-05, "loss": 0.3578, "step": 4903500 }, { "epoch": 33.18536162841057, "grad_norm": 0.4073820114135742, "learning_rate": 4.6681463837158944e-05, "loss": 0.3595, "step": 4904000 }, { "epoch": 33.18874512776094, "grad_norm": 0.36406978964805603, "learning_rate": 4.6681125487223906e-05, "loss": 0.3605, "step": 4904500 }, { "epoch": 33.1921286271113, "grad_norm": 0.3732823133468628, "learning_rate": 4.6680787137288875e-05, "loss": 0.3593, "step": 4905000 }, { "epoch": 33.19551212646167, "grad_norm": 0.3324179947376251, "learning_rate": 4.668044878735384e-05, "loss": 0.3593, "step": 4905500 }, { "epoch": 33.19889562581204, "grad_norm": 0.3814818561077118, "learning_rate": 4.66801104374188e-05, "loss": 0.3595, "step": 4906000 }, { "epoch": 33.202279125162406, "grad_norm": 0.3528575897216797, "learning_rate": 4.667977208748376e-05, "loss": 0.3589, "step": 4906500 }, { "epoch": 33.205662624512776, "grad_norm": 0.4369259178638458, "learning_rate": 4.6679433737548724e-05, "loss": 0.3591, "step": 4907000 }, { "epoch": 33.209046123863146, "grad_norm": 0.3980875015258789, "learning_rate": 4.6679095387613686e-05, "loss": 0.3606, "step": 4907500 }, { "epoch": 33.21242962321351, "grad_norm": 0.3704988956451416, "learning_rate": 4.667875703767865e-05, "loss": 0.3599, "step": 4908000 }, { "epoch": 33.21581312256388, "grad_norm": 0.3260713219642639, "learning_rate": 4.667841868774362e-05, "loss": 0.3594, "step": 4908500 }, { "epoch": 33.21919662191425, "grad_norm": 0.3785054683685303, "learning_rate": 4.667808033780858e-05, "loss": 0.3585, "step": 4909000 }, { "epoch": 33.22258012126461, "grad_norm": 0.39522457122802734, "learning_rate": 4.667774198787354e-05, "loss": 0.3599, "step": 4909500 }, { "epoch": 33.22596362061498, "grad_norm": 0.3919886350631714, "learning_rate": 4.66774036379385e-05, "loss": 0.3592, "step": 4910000 }, { "epoch": 33.229347119965354, "grad_norm": 0.35814350843429565, "learning_rate": 4.6677065288003465e-05, "loss": 0.3585, "step": 4910500 }, { "epoch": 33.232730619315724, "grad_norm": 0.37814611196517944, "learning_rate": 4.6676726938068434e-05, "loss": 0.3595, "step": 4911000 }, { "epoch": 33.23611411866609, "grad_norm": 0.3492003381252289, "learning_rate": 4.667638858813339e-05, "loss": 0.3581, "step": 4911500 }, { "epoch": 33.23949761801646, "grad_norm": 0.3310892581939697, "learning_rate": 4.667605023819835e-05, "loss": 0.3574, "step": 4912000 }, { "epoch": 33.24288111736683, "grad_norm": 0.32191982865333557, "learning_rate": 4.667571188826332e-05, "loss": 0.3585, "step": 4912500 }, { "epoch": 33.24626461671719, "grad_norm": 0.37318795919418335, "learning_rate": 4.667537353832828e-05, "loss": 0.3596, "step": 4913000 }, { "epoch": 33.24964811606756, "grad_norm": 0.41358447074890137, "learning_rate": 4.6675035188393245e-05, "loss": 0.3585, "step": 4913500 }, { "epoch": 33.25303161541793, "grad_norm": 0.3911883533000946, "learning_rate": 4.667469683845821e-05, "loss": 0.3601, "step": 4914000 }, { "epoch": 33.256415114768295, "grad_norm": 0.38080286979675293, "learning_rate": 4.6674358488523176e-05, "loss": 0.3594, "step": 4914500 }, { "epoch": 33.259798614118665, "grad_norm": 0.3826783001422882, "learning_rate": 4.667402013858814e-05, "loss": 0.3603, "step": 4915000 }, { "epoch": 33.263182113469036, "grad_norm": 0.36232760548591614, "learning_rate": 4.66736817886531e-05, "loss": 0.3589, "step": 4915500 }, { "epoch": 33.266565612819406, "grad_norm": 0.42275017499923706, "learning_rate": 4.667334343871806e-05, "loss": 0.3592, "step": 4916000 }, { "epoch": 33.26994911216977, "grad_norm": 0.3786555230617523, "learning_rate": 4.6673005088783024e-05, "loss": 0.3602, "step": 4916500 }, { "epoch": 33.27333261152014, "grad_norm": 0.36606207489967346, "learning_rate": 4.6672666738847987e-05, "loss": 0.3587, "step": 4917000 }, { "epoch": 33.27671611087051, "grad_norm": 0.4194417893886566, "learning_rate": 4.667232838891295e-05, "loss": 0.358, "step": 4917500 }, { "epoch": 33.28009961022087, "grad_norm": 0.3580016493797302, "learning_rate": 4.667199003897791e-05, "loss": 0.3582, "step": 4918000 }, { "epoch": 33.28348310957124, "grad_norm": 0.3689776659011841, "learning_rate": 4.667165168904288e-05, "loss": 0.3606, "step": 4918500 }, { "epoch": 33.28686660892161, "grad_norm": 0.41763901710510254, "learning_rate": 4.667131333910784e-05, "loss": 0.3585, "step": 4919000 }, { "epoch": 33.29025010827198, "grad_norm": 0.33973315358161926, "learning_rate": 4.6670974989172804e-05, "loss": 0.3587, "step": 4919500 }, { "epoch": 33.29363360762235, "grad_norm": 0.3305444121360779, "learning_rate": 4.6670636639237766e-05, "loss": 0.3598, "step": 4920000 }, { "epoch": 33.29701710697272, "grad_norm": 0.3761975169181824, "learning_rate": 4.6670298289302735e-05, "loss": 0.3592, "step": 4920500 }, { "epoch": 33.30040060632308, "grad_norm": 0.3960320055484772, "learning_rate": 4.666995993936769e-05, "loss": 0.3583, "step": 4921000 }, { "epoch": 33.30378410567345, "grad_norm": 0.3718136250972748, "learning_rate": 4.666962158943265e-05, "loss": 0.3572, "step": 4921500 }, { "epoch": 33.30716760502382, "grad_norm": 0.384726345539093, "learning_rate": 4.666928323949762e-05, "loss": 0.3594, "step": 4922000 }, { "epoch": 33.31055110437419, "grad_norm": 0.3793955147266388, "learning_rate": 4.6668944889562583e-05, "loss": 0.3598, "step": 4922500 }, { "epoch": 33.313934603724555, "grad_norm": 0.3863762319087982, "learning_rate": 4.6668606539627546e-05, "loss": 0.3601, "step": 4923000 }, { "epoch": 33.317318103074925, "grad_norm": 0.3898181617259979, "learning_rate": 4.666826818969251e-05, "loss": 0.3599, "step": 4923500 }, { "epoch": 33.320701602425295, "grad_norm": 0.37115055322647095, "learning_rate": 4.6667929839757477e-05, "loss": 0.3596, "step": 4924000 }, { "epoch": 33.32408510177566, "grad_norm": 0.3558864891529083, "learning_rate": 4.666759148982244e-05, "loss": 0.3576, "step": 4924500 }, { "epoch": 33.32746860112603, "grad_norm": 0.3862520754337311, "learning_rate": 4.66672531398874e-05, "loss": 0.3589, "step": 4925000 }, { "epoch": 33.3308521004764, "grad_norm": 0.3589293956756592, "learning_rate": 4.666691478995236e-05, "loss": 0.3601, "step": 4925500 }, { "epoch": 33.33423559982676, "grad_norm": 0.35759809613227844, "learning_rate": 4.6666576440017325e-05, "loss": 0.3595, "step": 4926000 }, { "epoch": 33.33761909917713, "grad_norm": 0.3903287947177887, "learning_rate": 4.666623809008229e-05, "loss": 0.3604, "step": 4926500 }, { "epoch": 33.3410025985275, "grad_norm": 0.3714323341846466, "learning_rate": 4.666589974014725e-05, "loss": 0.3607, "step": 4927000 }, { "epoch": 33.344386097877866, "grad_norm": 0.4133627712726593, "learning_rate": 4.666556139021221e-05, "loss": 0.3599, "step": 4927500 }, { "epoch": 33.347769597228236, "grad_norm": 0.3589901626110077, "learning_rate": 4.666522304027718e-05, "loss": 0.3596, "step": 4928000 }, { "epoch": 33.35115309657861, "grad_norm": 0.40413162112236023, "learning_rate": 4.666488469034214e-05, "loss": 0.3586, "step": 4928500 }, { "epoch": 33.35453659592898, "grad_norm": 0.4117417633533478, "learning_rate": 4.6664546340407105e-05, "loss": 0.3605, "step": 4929000 }, { "epoch": 33.35792009527934, "grad_norm": 0.4197719693183899, "learning_rate": 4.666420799047207e-05, "loss": 0.3595, "step": 4929500 }, { "epoch": 33.36130359462971, "grad_norm": 0.4158882200717926, "learning_rate": 4.6663869640537036e-05, "loss": 0.3589, "step": 4930000 }, { "epoch": 33.36468709398008, "grad_norm": 0.3937288820743561, "learning_rate": 4.666353129060199e-05, "loss": 0.3591, "step": 4930500 }, { "epoch": 33.368070593330444, "grad_norm": 0.4039243459701538, "learning_rate": 4.666319294066695e-05, "loss": 0.3587, "step": 4931000 }, { "epoch": 33.371454092680814, "grad_norm": 0.3809860348701477, "learning_rate": 4.666285459073192e-05, "loss": 0.3596, "step": 4931500 }, { "epoch": 33.374837592031184, "grad_norm": 0.39591655135154724, "learning_rate": 4.6662516240796884e-05, "loss": 0.359, "step": 4932000 }, { "epoch": 33.37822109138155, "grad_norm": 0.36925196647644043, "learning_rate": 4.6662177890861846e-05, "loss": 0.3579, "step": 4932500 }, { "epoch": 33.38160459073192, "grad_norm": 0.3570270836353302, "learning_rate": 4.666183954092681e-05, "loss": 0.3594, "step": 4933000 }, { "epoch": 33.38498809008229, "grad_norm": 0.4062257409095764, "learning_rate": 4.666150119099178e-05, "loss": 0.3596, "step": 4933500 }, { "epoch": 33.38837158943265, "grad_norm": 0.3464130461215973, "learning_rate": 4.666116284105674e-05, "loss": 0.3612, "step": 4934000 }, { "epoch": 33.39175508878302, "grad_norm": 0.3435489237308502, "learning_rate": 4.66608244911217e-05, "loss": 0.3601, "step": 4934500 }, { "epoch": 33.39513858813339, "grad_norm": 0.37780943512916565, "learning_rate": 4.6660486141186664e-05, "loss": 0.3589, "step": 4935000 }, { "epoch": 33.39852208748376, "grad_norm": 0.373794823884964, "learning_rate": 4.6660147791251626e-05, "loss": 0.3614, "step": 4935500 }, { "epoch": 33.401905586834125, "grad_norm": 0.43545180559158325, "learning_rate": 4.665980944131659e-05, "loss": 0.3608, "step": 4936000 }, { "epoch": 33.405289086184496, "grad_norm": 0.3868274986743927, "learning_rate": 4.665947109138155e-05, "loss": 0.3588, "step": 4936500 }, { "epoch": 33.408672585534866, "grad_norm": 0.3833744525909424, "learning_rate": 4.665913274144651e-05, "loss": 0.3587, "step": 4937000 }, { "epoch": 33.41205608488523, "grad_norm": 0.3601362109184265, "learning_rate": 4.665879439151148e-05, "loss": 0.3605, "step": 4937500 }, { "epoch": 33.4154395842356, "grad_norm": 0.36071741580963135, "learning_rate": 4.665845604157644e-05, "loss": 0.3581, "step": 4938000 }, { "epoch": 33.41882308358597, "grad_norm": 0.3734799027442932, "learning_rate": 4.6658117691641405e-05, "loss": 0.3596, "step": 4938500 }, { "epoch": 33.42220658293633, "grad_norm": 0.35517418384552, "learning_rate": 4.665777934170637e-05, "loss": 0.3606, "step": 4939000 }, { "epoch": 33.4255900822867, "grad_norm": 0.37884628772735596, "learning_rate": 4.6657440991771336e-05, "loss": 0.3585, "step": 4939500 }, { "epoch": 33.428973581637074, "grad_norm": 0.38768133521080017, "learning_rate": 4.66571026418363e-05, "loss": 0.3589, "step": 4940000 }, { "epoch": 33.432357080987444, "grad_norm": 0.3687038719654083, "learning_rate": 4.6656764291901254e-05, "loss": 0.3599, "step": 4940500 }, { "epoch": 33.43574058033781, "grad_norm": 0.3995122015476227, "learning_rate": 4.665642594196622e-05, "loss": 0.3594, "step": 4941000 }, { "epoch": 33.43912407968818, "grad_norm": 0.40393587946891785, "learning_rate": 4.6656087592031185e-05, "loss": 0.3598, "step": 4941500 }, { "epoch": 33.44250757903855, "grad_norm": 0.4265615940093994, "learning_rate": 4.665574924209615e-05, "loss": 0.3585, "step": 4942000 }, { "epoch": 33.44589107838891, "grad_norm": 0.3678419888019562, "learning_rate": 4.665541089216111e-05, "loss": 0.362, "step": 4942500 }, { "epoch": 33.44927457773928, "grad_norm": 0.344463586807251, "learning_rate": 4.665507254222608e-05, "loss": 0.3591, "step": 4943000 }, { "epoch": 33.45265807708965, "grad_norm": 0.3779900372028351, "learning_rate": 4.665473419229104e-05, "loss": 0.3601, "step": 4943500 }, { "epoch": 33.456041576440015, "grad_norm": 0.384981632232666, "learning_rate": 4.6654395842356e-05, "loss": 0.3588, "step": 4944000 }, { "epoch": 33.459425075790385, "grad_norm": 0.3845674693584442, "learning_rate": 4.6654057492420964e-05, "loss": 0.3592, "step": 4944500 }, { "epoch": 33.462808575140755, "grad_norm": 0.39477214217185974, "learning_rate": 4.6653719142485926e-05, "loss": 0.3583, "step": 4945000 }, { "epoch": 33.46619207449112, "grad_norm": 0.3745117485523224, "learning_rate": 4.665338079255089e-05, "loss": 0.3588, "step": 4945500 }, { "epoch": 33.46957557384149, "grad_norm": 0.39083191752433777, "learning_rate": 4.665304244261585e-05, "loss": 0.3603, "step": 4946000 }, { "epoch": 33.47295907319186, "grad_norm": 0.3955481946468353, "learning_rate": 4.665270409268081e-05, "loss": 0.3603, "step": 4946500 }, { "epoch": 33.47634257254223, "grad_norm": 0.3859942853450775, "learning_rate": 4.665236574274578e-05, "loss": 0.3596, "step": 4947000 }, { "epoch": 33.47972607189259, "grad_norm": 0.41049662232398987, "learning_rate": 4.6652027392810744e-05, "loss": 0.3597, "step": 4947500 }, { "epoch": 33.48310957124296, "grad_norm": 0.38303178548812866, "learning_rate": 4.6651689042875706e-05, "loss": 0.3583, "step": 4948000 }, { "epoch": 33.48649307059333, "grad_norm": 0.37434321641921997, "learning_rate": 4.665135069294067e-05, "loss": 0.3591, "step": 4948500 }, { "epoch": 33.489876569943696, "grad_norm": 0.35679763555526733, "learning_rate": 4.665101234300564e-05, "loss": 0.3599, "step": 4949000 }, { "epoch": 33.49326006929407, "grad_norm": 0.38214111328125, "learning_rate": 4.66506739930706e-05, "loss": 0.3611, "step": 4949500 }, { "epoch": 33.49664356864444, "grad_norm": 0.3885679543018341, "learning_rate": 4.6650335643135554e-05, "loss": 0.3592, "step": 4950000 }, { "epoch": 33.5000270679948, "grad_norm": 0.3565627336502075, "learning_rate": 4.664999729320052e-05, "loss": 0.3595, "step": 4950500 }, { "epoch": 33.50341056734517, "grad_norm": 0.4018453359603882, "learning_rate": 4.6649658943265485e-05, "loss": 0.3582, "step": 4951000 }, { "epoch": 33.50679406669554, "grad_norm": 0.3506716787815094, "learning_rate": 4.664932059333045e-05, "loss": 0.3605, "step": 4951500 }, { "epoch": 33.510177566045904, "grad_norm": 0.34621524810791016, "learning_rate": 4.664898224339541e-05, "loss": 0.3589, "step": 4952000 }, { "epoch": 33.513561065396274, "grad_norm": 0.3600829541683197, "learning_rate": 4.664864389346038e-05, "loss": 0.3601, "step": 4952500 }, { "epoch": 33.516944564746645, "grad_norm": 0.4031408429145813, "learning_rate": 4.664830554352534e-05, "loss": 0.3608, "step": 4953000 }, { "epoch": 33.520328064097015, "grad_norm": 0.38953065872192383, "learning_rate": 4.66479671935903e-05, "loss": 0.3597, "step": 4953500 }, { "epoch": 33.52371156344738, "grad_norm": 0.41566869616508484, "learning_rate": 4.6647628843655265e-05, "loss": 0.3585, "step": 4954000 }, { "epoch": 33.52709506279775, "grad_norm": 0.4138035178184509, "learning_rate": 4.664729049372023e-05, "loss": 0.3581, "step": 4954500 }, { "epoch": 33.53047856214812, "grad_norm": 0.4077489674091339, "learning_rate": 4.664695214378519e-05, "loss": 0.3591, "step": 4955000 }, { "epoch": 33.53386206149848, "grad_norm": 0.38625746965408325, "learning_rate": 4.664661379385015e-05, "loss": 0.3607, "step": 4955500 }, { "epoch": 33.53724556084885, "grad_norm": 0.4039769172668457, "learning_rate": 4.664627544391511e-05, "loss": 0.3623, "step": 4956000 }, { "epoch": 33.54062906019922, "grad_norm": 0.4044813811779022, "learning_rate": 4.664593709398008e-05, "loss": 0.358, "step": 4956500 }, { "epoch": 33.544012559549586, "grad_norm": 0.3929738402366638, "learning_rate": 4.6645598744045044e-05, "loss": 0.3588, "step": 4957000 }, { "epoch": 33.547396058899956, "grad_norm": 0.3623534142971039, "learning_rate": 4.6645260394110006e-05, "loss": 0.3588, "step": 4957500 }, { "epoch": 33.550779558250326, "grad_norm": 0.3804514706134796, "learning_rate": 4.664492204417497e-05, "loss": 0.3585, "step": 4958000 }, { "epoch": 33.55416305760069, "grad_norm": 0.39171263575553894, "learning_rate": 4.664458369423994e-05, "loss": 0.3584, "step": 4958500 }, { "epoch": 33.55754655695106, "grad_norm": 0.3710254728794098, "learning_rate": 4.66442453443049e-05, "loss": 0.3606, "step": 4959000 }, { "epoch": 33.56093005630143, "grad_norm": 0.37038999795913696, "learning_rate": 4.6643906994369855e-05, "loss": 0.3587, "step": 4959500 }, { "epoch": 33.5643135556518, "grad_norm": 0.3657047748565674, "learning_rate": 4.6643568644434824e-05, "loss": 0.362, "step": 4960000 }, { "epoch": 33.567697055002164, "grad_norm": 0.3724454939365387, "learning_rate": 4.6643230294499786e-05, "loss": 0.3581, "step": 4960500 }, { "epoch": 33.571080554352534, "grad_norm": 0.415170282125473, "learning_rate": 4.664289194456475e-05, "loss": 0.3601, "step": 4961000 }, { "epoch": 33.574464053702904, "grad_norm": 0.37282267212867737, "learning_rate": 4.664255359462971e-05, "loss": 0.3608, "step": 4961500 }, { "epoch": 33.57784755305327, "grad_norm": 0.4100336730480194, "learning_rate": 4.664221524469468e-05, "loss": 0.3605, "step": 4962000 }, { "epoch": 33.58123105240364, "grad_norm": 0.38508278131484985, "learning_rate": 4.664187689475964e-05, "loss": 0.3614, "step": 4962500 }, { "epoch": 33.58461455175401, "grad_norm": 0.3460726737976074, "learning_rate": 4.66415385448246e-05, "loss": 0.3576, "step": 4963000 }, { "epoch": 33.58799805110437, "grad_norm": 0.398885577917099, "learning_rate": 4.6641200194889565e-05, "loss": 0.3598, "step": 4963500 }, { "epoch": 33.59138155045474, "grad_norm": 0.40901660919189453, "learning_rate": 4.664086184495453e-05, "loss": 0.3596, "step": 4964000 }, { "epoch": 33.59476504980511, "grad_norm": 0.3406643867492676, "learning_rate": 4.664052349501949e-05, "loss": 0.3584, "step": 4964500 }, { "epoch": 33.59814854915548, "grad_norm": 0.40784019231796265, "learning_rate": 4.664018514508445e-05, "loss": 0.3599, "step": 4965000 }, { "epoch": 33.601532048505845, "grad_norm": 0.4412790536880493, "learning_rate": 4.6639846795149414e-05, "loss": 0.3595, "step": 4965500 }, { "epoch": 33.604915547856216, "grad_norm": 0.3802646994590759, "learning_rate": 4.663950844521438e-05, "loss": 0.3606, "step": 4966000 }, { "epoch": 33.608299047206586, "grad_norm": 0.37169966101646423, "learning_rate": 4.6639170095279345e-05, "loss": 0.3592, "step": 4966500 }, { "epoch": 33.61168254655695, "grad_norm": 0.3817939758300781, "learning_rate": 4.663883174534431e-05, "loss": 0.3603, "step": 4967000 }, { "epoch": 33.61506604590732, "grad_norm": 0.34958896040916443, "learning_rate": 4.663849339540927e-05, "loss": 0.3581, "step": 4967500 }, { "epoch": 33.61844954525769, "grad_norm": 0.3952915072441101, "learning_rate": 4.663815504547424e-05, "loss": 0.3605, "step": 4968000 }, { "epoch": 33.62183304460805, "grad_norm": 0.36356744170188904, "learning_rate": 4.66378166955392e-05, "loss": 0.3617, "step": 4968500 }, { "epoch": 33.62521654395842, "grad_norm": 0.4046655297279358, "learning_rate": 4.6637478345604156e-05, "loss": 0.3585, "step": 4969000 }, { "epoch": 33.62860004330879, "grad_norm": 0.3863518536090851, "learning_rate": 4.6637139995669124e-05, "loss": 0.3585, "step": 4969500 }, { "epoch": 33.63198354265916, "grad_norm": 0.35415220260620117, "learning_rate": 4.6636801645734087e-05, "loss": 0.3596, "step": 4970000 }, { "epoch": 33.63536704200953, "grad_norm": 0.3962274491786957, "learning_rate": 4.663646329579905e-05, "loss": 0.3593, "step": 4970500 }, { "epoch": 33.6387505413599, "grad_norm": 0.3565498888492584, "learning_rate": 4.663612494586401e-05, "loss": 0.3596, "step": 4971000 }, { "epoch": 33.64213404071027, "grad_norm": 0.3514151871204376, "learning_rate": 4.663578659592898e-05, "loss": 0.359, "step": 4971500 }, { "epoch": 33.64551754006063, "grad_norm": 0.35873162746429443, "learning_rate": 4.663544824599394e-05, "loss": 0.3597, "step": 4972000 }, { "epoch": 33.648901039411, "grad_norm": 0.38362917304039, "learning_rate": 4.6635109896058904e-05, "loss": 0.3594, "step": 4972500 }, { "epoch": 33.65228453876137, "grad_norm": 0.3515843152999878, "learning_rate": 4.6634771546123866e-05, "loss": 0.361, "step": 4973000 }, { "epoch": 33.655668038111735, "grad_norm": 0.3710431754589081, "learning_rate": 4.663443319618883e-05, "loss": 0.3594, "step": 4973500 }, { "epoch": 33.659051537462105, "grad_norm": 0.4550579786300659, "learning_rate": 4.663409484625379e-05, "loss": 0.3588, "step": 4974000 }, { "epoch": 33.662435036812475, "grad_norm": 0.36737725138664246, "learning_rate": 4.663375649631875e-05, "loss": 0.3596, "step": 4974500 }, { "epoch": 33.66581853616284, "grad_norm": 0.40225350856781006, "learning_rate": 4.6633418146383715e-05, "loss": 0.3579, "step": 4975000 }, { "epoch": 33.66920203551321, "grad_norm": 0.39821138978004456, "learning_rate": 4.6633079796448683e-05, "loss": 0.3605, "step": 4975500 }, { "epoch": 33.67258553486358, "grad_norm": 0.36197197437286377, "learning_rate": 4.6632741446513646e-05, "loss": 0.3588, "step": 4976000 }, { "epoch": 33.67596903421394, "grad_norm": 0.3673679828643799, "learning_rate": 4.663240309657861e-05, "loss": 0.3602, "step": 4976500 }, { "epoch": 33.67935253356431, "grad_norm": 0.38268110156059265, "learning_rate": 4.663206474664357e-05, "loss": 0.3612, "step": 4977000 }, { "epoch": 33.68273603291468, "grad_norm": 0.36730483174324036, "learning_rate": 4.663172639670854e-05, "loss": 0.3618, "step": 4977500 }, { "epoch": 33.68611953226505, "grad_norm": 0.39497968554496765, "learning_rate": 4.66313880467735e-05, "loss": 0.3596, "step": 4978000 }, { "epoch": 33.689503031615416, "grad_norm": 0.36848193407058716, "learning_rate": 4.6631049696838456e-05, "loss": 0.3604, "step": 4978500 }, { "epoch": 33.69288653096579, "grad_norm": 0.3875355124473572, "learning_rate": 4.6630711346903425e-05, "loss": 0.3596, "step": 4979000 }, { "epoch": 33.69627003031616, "grad_norm": 0.38768208026885986, "learning_rate": 4.663037299696839e-05, "loss": 0.3592, "step": 4979500 }, { "epoch": 33.69965352966652, "grad_norm": 0.34618815779685974, "learning_rate": 4.663003464703335e-05, "loss": 0.3598, "step": 4980000 }, { "epoch": 33.70303702901689, "grad_norm": 0.377130389213562, "learning_rate": 4.662969629709831e-05, "loss": 0.3604, "step": 4980500 }, { "epoch": 33.70642052836726, "grad_norm": 0.35038310289382935, "learning_rate": 4.6629357947163274e-05, "loss": 0.3608, "step": 4981000 }, { "epoch": 33.709804027717624, "grad_norm": 0.36757394671440125, "learning_rate": 4.662901959722824e-05, "loss": 0.3602, "step": 4981500 }, { "epoch": 33.713187527067994, "grad_norm": 0.3615155816078186, "learning_rate": 4.6628681247293205e-05, "loss": 0.3607, "step": 4982000 }, { "epoch": 33.716571026418364, "grad_norm": 0.37312600016593933, "learning_rate": 4.662834289735817e-05, "loss": 0.3587, "step": 4982500 }, { "epoch": 33.71995452576873, "grad_norm": 0.3604985475540161, "learning_rate": 4.662800454742313e-05, "loss": 0.3593, "step": 4983000 }, { "epoch": 33.7233380251191, "grad_norm": 0.38975080847740173, "learning_rate": 4.662766619748809e-05, "loss": 0.3603, "step": 4983500 }, { "epoch": 33.72672152446947, "grad_norm": 0.36984366178512573, "learning_rate": 4.662732784755305e-05, "loss": 0.3598, "step": 4984000 }, { "epoch": 33.73010502381984, "grad_norm": 0.3986571729183197, "learning_rate": 4.6626989497618015e-05, "loss": 0.3606, "step": 4984500 }, { "epoch": 33.7334885231702, "grad_norm": 0.41753271222114563, "learning_rate": 4.6626651147682984e-05, "loss": 0.3604, "step": 4985000 }, { "epoch": 33.73687202252057, "grad_norm": 0.35058414936065674, "learning_rate": 4.6626312797747946e-05, "loss": 0.3602, "step": 4985500 }, { "epoch": 33.74025552187094, "grad_norm": 0.3672487139701843, "learning_rate": 4.662597444781291e-05, "loss": 0.3601, "step": 4986000 }, { "epoch": 33.743639021221306, "grad_norm": 0.43293723464012146, "learning_rate": 4.662563609787787e-05, "loss": 0.3594, "step": 4986500 }, { "epoch": 33.747022520571676, "grad_norm": 0.3658154308795929, "learning_rate": 4.662529774794284e-05, "loss": 0.3609, "step": 4987000 }, { "epoch": 33.750406019922046, "grad_norm": 0.3655095100402832, "learning_rate": 4.66249593980078e-05, "loss": 0.361, "step": 4987500 }, { "epoch": 33.75378951927241, "grad_norm": 0.40919873118400574, "learning_rate": 4.662462104807276e-05, "loss": 0.3618, "step": 4988000 }, { "epoch": 33.75717301862278, "grad_norm": 0.3246718943119049, "learning_rate": 4.662428269813772e-05, "loss": 0.3581, "step": 4988500 }, { "epoch": 33.76055651797315, "grad_norm": 0.37892502546310425, "learning_rate": 4.662394434820269e-05, "loss": 0.3603, "step": 4989000 }, { "epoch": 33.76394001732352, "grad_norm": 0.3403722047805786, "learning_rate": 4.662360599826765e-05, "loss": 0.3582, "step": 4989500 }, { "epoch": 33.76732351667388, "grad_norm": 0.352461040019989, "learning_rate": 4.662326764833261e-05, "loss": 0.3607, "step": 4990000 }, { "epoch": 33.770707016024254, "grad_norm": 0.38694116473197937, "learning_rate": 4.6622929298397574e-05, "loss": 0.3601, "step": 4990500 }, { "epoch": 33.774090515374624, "grad_norm": 0.3797401189804077, "learning_rate": 4.662259094846254e-05, "loss": 0.3607, "step": 4991000 }, { "epoch": 33.77747401472499, "grad_norm": 0.3736964464187622, "learning_rate": 4.6622252598527505e-05, "loss": 0.3601, "step": 4991500 }, { "epoch": 33.78085751407536, "grad_norm": 0.3719107508659363, "learning_rate": 4.662191424859247e-05, "loss": 0.3607, "step": 4992000 }, { "epoch": 33.78424101342573, "grad_norm": 0.410574734210968, "learning_rate": 4.662157589865743e-05, "loss": 0.3598, "step": 4992500 }, { "epoch": 33.78762451277609, "grad_norm": 0.3776076138019562, "learning_rate": 4.662123754872239e-05, "loss": 0.3598, "step": 4993000 }, { "epoch": 33.79100801212646, "grad_norm": 0.3880300521850586, "learning_rate": 4.6620899198787354e-05, "loss": 0.359, "step": 4993500 }, { "epoch": 33.79439151147683, "grad_norm": 0.36968597769737244, "learning_rate": 4.6620560848852316e-05, "loss": 0.3591, "step": 4994000 }, { "epoch": 33.797775010827195, "grad_norm": 0.357287734746933, "learning_rate": 4.6620222498917285e-05, "loss": 0.359, "step": 4994500 }, { "epoch": 33.801158510177565, "grad_norm": 0.36389797925949097, "learning_rate": 4.661988414898225e-05, "loss": 0.3608, "step": 4995000 }, { "epoch": 33.804542009527935, "grad_norm": 0.3828994631767273, "learning_rate": 4.661954579904721e-05, "loss": 0.3591, "step": 4995500 }, { "epoch": 33.807925508878306, "grad_norm": 0.3847697973251343, "learning_rate": 4.661920744911217e-05, "loss": 0.3595, "step": 4996000 }, { "epoch": 33.81130900822867, "grad_norm": 0.3801291584968567, "learning_rate": 4.661886909917714e-05, "loss": 0.3595, "step": 4996500 }, { "epoch": 33.81469250757904, "grad_norm": 0.3958672285079956, "learning_rate": 4.66185307492421e-05, "loss": 0.3608, "step": 4997000 }, { "epoch": 33.81807600692941, "grad_norm": 0.36948850750923157, "learning_rate": 4.661819239930706e-05, "loss": 0.3622, "step": 4997500 }, { "epoch": 33.82145950627977, "grad_norm": 0.3512585461139679, "learning_rate": 4.661785404937202e-05, "loss": 0.3595, "step": 4998000 }, { "epoch": 33.82484300563014, "grad_norm": 0.38217779994010925, "learning_rate": 4.661751569943699e-05, "loss": 0.3593, "step": 4998500 }, { "epoch": 33.82822650498051, "grad_norm": 0.35705459117889404, "learning_rate": 4.661717734950195e-05, "loss": 0.3605, "step": 4999000 }, { "epoch": 33.83161000433088, "grad_norm": 0.4054109752178192, "learning_rate": 4.661683899956691e-05, "loss": 0.3606, "step": 4999500 }, { "epoch": 33.83499350368125, "grad_norm": 0.37336549162864685, "learning_rate": 4.6616500649631875e-05, "loss": 0.3588, "step": 5000000 }, { "epoch": 33.83837700303162, "grad_norm": 0.4157043993473053, "learning_rate": 4.6616162299696844e-05, "loss": 0.3609, "step": 5000500 }, { "epoch": 33.84176050238198, "grad_norm": 0.3988616466522217, "learning_rate": 4.6615823949761806e-05, "loss": 0.3601, "step": 5001000 }, { "epoch": 33.84514400173235, "grad_norm": 0.3774484395980835, "learning_rate": 4.661548559982677e-05, "loss": 0.3596, "step": 5001500 }, { "epoch": 33.84852750108272, "grad_norm": 0.37425515055656433, "learning_rate": 4.661514724989173e-05, "loss": 0.3598, "step": 5002000 }, { "epoch": 33.85191100043309, "grad_norm": 0.40442442893981934, "learning_rate": 4.661480889995669e-05, "loss": 0.3613, "step": 5002500 }, { "epoch": 33.855294499783454, "grad_norm": 0.40121543407440186, "learning_rate": 4.6614470550021654e-05, "loss": 0.3605, "step": 5003000 }, { "epoch": 33.858677999133825, "grad_norm": 0.3543761968612671, "learning_rate": 4.6614132200086616e-05, "loss": 0.3587, "step": 5003500 }, { "epoch": 33.862061498484195, "grad_norm": 0.4035884141921997, "learning_rate": 4.6613793850151585e-05, "loss": 0.3598, "step": 5004000 }, { "epoch": 33.86544499783456, "grad_norm": 0.40483883023262024, "learning_rate": 4.661345550021655e-05, "loss": 0.3595, "step": 5004500 }, { "epoch": 33.86882849718493, "grad_norm": 0.36776548624038696, "learning_rate": 4.661311715028151e-05, "loss": 0.3595, "step": 5005000 }, { "epoch": 33.8722119965353, "grad_norm": 0.35366418957710266, "learning_rate": 4.661277880034647e-05, "loss": 0.3597, "step": 5005500 }, { "epoch": 33.87559549588566, "grad_norm": 0.36323100328445435, "learning_rate": 4.661244045041144e-05, "loss": 0.3592, "step": 5006000 }, { "epoch": 33.87897899523603, "grad_norm": 0.3393199145793915, "learning_rate": 4.66121021004764e-05, "loss": 0.3605, "step": 5006500 }, { "epoch": 33.8823624945864, "grad_norm": 0.3459985852241516, "learning_rate": 4.661176375054136e-05, "loss": 0.3597, "step": 5007000 }, { "epoch": 33.885745993936766, "grad_norm": 0.37933433055877686, "learning_rate": 4.661142540060632e-05, "loss": 0.3595, "step": 5007500 }, { "epoch": 33.889129493287136, "grad_norm": 0.39216548204421997, "learning_rate": 4.661108705067129e-05, "loss": 0.3596, "step": 5008000 }, { "epoch": 33.892512992637506, "grad_norm": 0.363120436668396, "learning_rate": 4.661074870073625e-05, "loss": 0.3596, "step": 5008500 }, { "epoch": 33.89589649198788, "grad_norm": 0.4088318347930908, "learning_rate": 4.661041035080121e-05, "loss": 0.3598, "step": 5009000 }, { "epoch": 33.89927999133824, "grad_norm": 0.3933384418487549, "learning_rate": 4.6610072000866175e-05, "loss": 0.3586, "step": 5009500 }, { "epoch": 33.90266349068861, "grad_norm": 0.3848974406719208, "learning_rate": 4.6609733650931144e-05, "loss": 0.3586, "step": 5010000 }, { "epoch": 33.90604699003898, "grad_norm": 0.36713850498199463, "learning_rate": 4.6609395300996106e-05, "loss": 0.3624, "step": 5010500 }, { "epoch": 33.909430489389344, "grad_norm": 0.36642223596572876, "learning_rate": 4.660905695106107e-05, "loss": 0.3597, "step": 5011000 }, { "epoch": 33.912813988739714, "grad_norm": 0.40728092193603516, "learning_rate": 4.660871860112603e-05, "loss": 0.3602, "step": 5011500 }, { "epoch": 33.916197488090084, "grad_norm": 0.3996118903160095, "learning_rate": 4.660838025119099e-05, "loss": 0.3616, "step": 5012000 }, { "epoch": 33.91958098744045, "grad_norm": 0.41957518458366394, "learning_rate": 4.6608041901255955e-05, "loss": 0.3601, "step": 5012500 }, { "epoch": 33.92296448679082, "grad_norm": 0.35961875319480896, "learning_rate": 4.660770355132092e-05, "loss": 0.36, "step": 5013000 }, { "epoch": 33.92634798614119, "grad_norm": 0.42392638325691223, "learning_rate": 4.6607365201385886e-05, "loss": 0.3596, "step": 5013500 }, { "epoch": 33.92973148549156, "grad_norm": 0.3994753658771515, "learning_rate": 4.660702685145085e-05, "loss": 0.36, "step": 5014000 }, { "epoch": 33.93311498484192, "grad_norm": 0.376228392124176, "learning_rate": 4.660668850151581e-05, "loss": 0.3597, "step": 5014500 }, { "epoch": 33.93649848419229, "grad_norm": 0.3853345513343811, "learning_rate": 4.660635015158077e-05, "loss": 0.3607, "step": 5015000 }, { "epoch": 33.93988198354266, "grad_norm": 0.36617282032966614, "learning_rate": 4.660601180164574e-05, "loss": 0.3604, "step": 5015500 }, { "epoch": 33.943265482893025, "grad_norm": 0.37868422269821167, "learning_rate": 4.66056734517107e-05, "loss": 0.3613, "step": 5016000 }, { "epoch": 33.946648982243396, "grad_norm": 0.34545958042144775, "learning_rate": 4.660533510177566e-05, "loss": 0.3599, "step": 5016500 }, { "epoch": 33.950032481593766, "grad_norm": 0.3555082380771637, "learning_rate": 4.660499675184062e-05, "loss": 0.3605, "step": 5017000 }, { "epoch": 33.95341598094413, "grad_norm": 0.3674757182598114, "learning_rate": 4.660465840190559e-05, "loss": 0.3597, "step": 5017500 }, { "epoch": 33.9567994802945, "grad_norm": 0.371110200881958, "learning_rate": 4.660432005197055e-05, "loss": 0.3596, "step": 5018000 }, { "epoch": 33.96018297964487, "grad_norm": 0.3983413875102997, "learning_rate": 4.6603981702035514e-05, "loss": 0.3594, "step": 5018500 }, { "epoch": 33.96356647899523, "grad_norm": 0.37326234579086304, "learning_rate": 4.6603643352100476e-05, "loss": 0.3581, "step": 5019000 }, { "epoch": 33.9669499783456, "grad_norm": 0.3510463237762451, "learning_rate": 4.6603305002165445e-05, "loss": 0.3601, "step": 5019500 }, { "epoch": 33.970333477695974, "grad_norm": 0.3734467923641205, "learning_rate": 4.660296665223041e-05, "loss": 0.3597, "step": 5020000 }, { "epoch": 33.973716977046344, "grad_norm": 0.3781816363334656, "learning_rate": 4.660262830229537e-05, "loss": 0.3592, "step": 5020500 }, { "epoch": 33.97710047639671, "grad_norm": 0.35887932777404785, "learning_rate": 4.660228995236033e-05, "loss": 0.3604, "step": 5021000 }, { "epoch": 33.98048397574708, "grad_norm": 0.38643041253089905, "learning_rate": 4.6601951602425293e-05, "loss": 0.3591, "step": 5021500 }, { "epoch": 33.98386747509745, "grad_norm": 0.3732677102088928, "learning_rate": 4.6601613252490256e-05, "loss": 0.3607, "step": 5022000 }, { "epoch": 33.98725097444781, "grad_norm": 0.392314612865448, "learning_rate": 4.660127490255522e-05, "loss": 0.3609, "step": 5022500 }, { "epoch": 33.99063447379818, "grad_norm": 0.3450537323951721, "learning_rate": 4.660093655262019e-05, "loss": 0.3605, "step": 5023000 }, { "epoch": 33.99401797314855, "grad_norm": 0.3555243909358978, "learning_rate": 4.660059820268515e-05, "loss": 0.3595, "step": 5023500 }, { "epoch": 33.997401472498915, "grad_norm": 0.3801630437374115, "learning_rate": 4.660025985275011e-05, "loss": 0.3599, "step": 5024000 }, { "epoch": 34.0, "eval_accuracy": 0.862768993043337, "eval_loss": 0.556894838809967, "eval_runtime": 3349.7517, "eval_samples_per_second": 86.796, "eval_steps_per_second": 5.425, "step": 5024384 }, { "epoch": 34.000784971849285, "grad_norm": 0.3564847707748413, "learning_rate": 4.659992150281507e-05, "loss": 0.3599, "step": 5024500 }, { "epoch": 34.004168471199655, "grad_norm": 0.38413703441619873, "learning_rate": 4.659958315288004e-05, "loss": 0.3566, "step": 5025000 }, { "epoch": 34.00755197055002, "grad_norm": 0.39659664034843445, "learning_rate": 4.6599244802945004e-05, "loss": 0.3583, "step": 5025500 }, { "epoch": 34.01093546990039, "grad_norm": 0.3901703357696533, "learning_rate": 4.659890645300996e-05, "loss": 0.3584, "step": 5026000 }, { "epoch": 34.01431896925076, "grad_norm": 0.3614691197872162, "learning_rate": 4.659856810307492e-05, "loss": 0.3565, "step": 5026500 }, { "epoch": 34.01770246860113, "grad_norm": 0.35379400849342346, "learning_rate": 4.659822975313989e-05, "loss": 0.3554, "step": 5027000 }, { "epoch": 34.02108596795149, "grad_norm": 0.37780022621154785, "learning_rate": 4.659789140320485e-05, "loss": 0.359, "step": 5027500 }, { "epoch": 34.02446946730186, "grad_norm": 0.4037107229232788, "learning_rate": 4.6597553053269815e-05, "loss": 0.3585, "step": 5028000 }, { "epoch": 34.02785296665223, "grad_norm": 0.3732971251010895, "learning_rate": 4.659721470333478e-05, "loss": 0.3581, "step": 5028500 }, { "epoch": 34.031236466002596, "grad_norm": 0.4241940379142761, "learning_rate": 4.6596876353399746e-05, "loss": 0.3581, "step": 5029000 }, { "epoch": 34.03461996535297, "grad_norm": 0.4099807143211365, "learning_rate": 4.659653800346471e-05, "loss": 0.3557, "step": 5029500 }, { "epoch": 34.03800346470334, "grad_norm": 0.3990079462528229, "learning_rate": 4.659619965352967e-05, "loss": 0.3582, "step": 5030000 }, { "epoch": 34.0413869640537, "grad_norm": 0.38755473494529724, "learning_rate": 4.659586130359463e-05, "loss": 0.3583, "step": 5030500 }, { "epoch": 34.04477046340407, "grad_norm": 0.376600056886673, "learning_rate": 4.6595522953659594e-05, "loss": 0.3573, "step": 5031000 }, { "epoch": 34.04815396275444, "grad_norm": 0.39983510971069336, "learning_rate": 4.6595184603724556e-05, "loss": 0.3579, "step": 5031500 }, { "epoch": 34.051537462104804, "grad_norm": 0.4000255763530731, "learning_rate": 4.659484625378952e-05, "loss": 0.3584, "step": 5032000 }, { "epoch": 34.054920961455174, "grad_norm": 0.39626163244247437, "learning_rate": 4.659450790385449e-05, "loss": 0.359, "step": 5032500 }, { "epoch": 34.058304460805545, "grad_norm": 0.37869513034820557, "learning_rate": 4.659416955391945e-05, "loss": 0.358, "step": 5033000 }, { "epoch": 34.061687960155915, "grad_norm": 0.33970946073532104, "learning_rate": 4.659383120398441e-05, "loss": 0.3576, "step": 5033500 }, { "epoch": 34.06507145950628, "grad_norm": 0.40400931239128113, "learning_rate": 4.6593492854049374e-05, "loss": 0.3565, "step": 5034000 }, { "epoch": 34.06845495885665, "grad_norm": 0.38513997197151184, "learning_rate": 4.659315450411434e-05, "loss": 0.3576, "step": 5034500 }, { "epoch": 34.07183845820702, "grad_norm": 0.376578152179718, "learning_rate": 4.6592816154179305e-05, "loss": 0.3588, "step": 5035000 }, { "epoch": 34.07522195755738, "grad_norm": 0.4014889895915985, "learning_rate": 4.659247780424426e-05, "loss": 0.3593, "step": 5035500 }, { "epoch": 34.07860545690775, "grad_norm": 0.3878783583641052, "learning_rate": 4.659213945430922e-05, "loss": 0.359, "step": 5036000 }, { "epoch": 34.08198895625812, "grad_norm": 0.3966333270072937, "learning_rate": 4.659180110437419e-05, "loss": 0.3586, "step": 5036500 }, { "epoch": 34.085372455608486, "grad_norm": 0.356948584318161, "learning_rate": 4.659146275443915e-05, "loss": 0.3594, "step": 5037000 }, { "epoch": 34.088755954958856, "grad_norm": 0.43688029050827026, "learning_rate": 4.6591124404504115e-05, "loss": 0.3598, "step": 5037500 }, { "epoch": 34.092139454309226, "grad_norm": 0.34125515818595886, "learning_rate": 4.659078605456908e-05, "loss": 0.358, "step": 5038000 }, { "epoch": 34.09552295365959, "grad_norm": 0.403870165348053, "learning_rate": 4.6590447704634046e-05, "loss": 0.3578, "step": 5038500 }, { "epoch": 34.09890645300996, "grad_norm": 0.42687666416168213, "learning_rate": 4.659010935469901e-05, "loss": 0.3579, "step": 5039000 }, { "epoch": 34.10228995236033, "grad_norm": 0.37997591495513916, "learning_rate": 4.658977100476397e-05, "loss": 0.3583, "step": 5039500 }, { "epoch": 34.1056734517107, "grad_norm": 0.37498998641967773, "learning_rate": 4.658943265482893e-05, "loss": 0.3586, "step": 5040000 }, { "epoch": 34.10905695106106, "grad_norm": 0.39867478609085083, "learning_rate": 4.6589094304893895e-05, "loss": 0.3583, "step": 5040500 }, { "epoch": 34.112440450411434, "grad_norm": 0.38414129614830017, "learning_rate": 4.658875595495886e-05, "loss": 0.3578, "step": 5041000 }, { "epoch": 34.115823949761804, "grad_norm": 0.3924691379070282, "learning_rate": 4.658841760502382e-05, "loss": 0.3592, "step": 5041500 }, { "epoch": 34.11920744911217, "grad_norm": 0.3739098608493805, "learning_rate": 4.658807925508879e-05, "loss": 0.3579, "step": 5042000 }, { "epoch": 34.12259094846254, "grad_norm": 0.3583349883556366, "learning_rate": 4.658774090515375e-05, "loss": 0.3591, "step": 5042500 }, { "epoch": 34.12597444781291, "grad_norm": 0.40906092524528503, "learning_rate": 4.658740255521871e-05, "loss": 0.3594, "step": 5043000 }, { "epoch": 34.12935794716327, "grad_norm": 0.33160528540611267, "learning_rate": 4.6587064205283674e-05, "loss": 0.3577, "step": 5043500 }, { "epoch": 34.13274144651364, "grad_norm": 0.41338586807250977, "learning_rate": 4.6586725855348636e-05, "loss": 0.3586, "step": 5044000 }, { "epoch": 34.13612494586401, "grad_norm": 0.3884548246860504, "learning_rate": 4.6586387505413605e-05, "loss": 0.3604, "step": 5044500 }, { "epoch": 34.13950844521438, "grad_norm": 0.3763774037361145, "learning_rate": 4.658604915547856e-05, "loss": 0.357, "step": 5045000 }, { "epoch": 34.142891944564745, "grad_norm": 0.381849080324173, "learning_rate": 4.658571080554352e-05, "loss": 0.3595, "step": 5045500 }, { "epoch": 34.146275443915115, "grad_norm": 0.39528530836105347, "learning_rate": 4.658537245560849e-05, "loss": 0.3591, "step": 5046000 }, { "epoch": 34.149658943265486, "grad_norm": 0.36974236369132996, "learning_rate": 4.6585034105673454e-05, "loss": 0.359, "step": 5046500 }, { "epoch": 34.15304244261585, "grad_norm": 0.3644038736820221, "learning_rate": 4.6584695755738416e-05, "loss": 0.36, "step": 5047000 }, { "epoch": 34.15642594196622, "grad_norm": 0.3983452618122101, "learning_rate": 4.658435740580338e-05, "loss": 0.3585, "step": 5047500 }, { "epoch": 34.15980944131659, "grad_norm": 0.3650473952293396, "learning_rate": 4.658401905586835e-05, "loss": 0.3591, "step": 5048000 }, { "epoch": 34.16319294066695, "grad_norm": 0.37225234508514404, "learning_rate": 4.658368070593331e-05, "loss": 0.3594, "step": 5048500 }, { "epoch": 34.16657644001732, "grad_norm": 0.41506242752075195, "learning_rate": 4.658334235599827e-05, "loss": 0.3586, "step": 5049000 }, { "epoch": 34.16995993936769, "grad_norm": 0.3849523067474365, "learning_rate": 4.658300400606323e-05, "loss": 0.3576, "step": 5049500 }, { "epoch": 34.17334343871806, "grad_norm": 0.4018872380256653, "learning_rate": 4.6582665656128195e-05, "loss": 0.3579, "step": 5050000 }, { "epoch": 34.17672693806843, "grad_norm": 0.3655732274055481, "learning_rate": 4.658232730619316e-05, "loss": 0.3578, "step": 5050500 }, { "epoch": 34.1801104374188, "grad_norm": 0.3754114508628845, "learning_rate": 4.658198895625812e-05, "loss": 0.3585, "step": 5051000 }, { "epoch": 34.18349393676917, "grad_norm": 0.37415942549705505, "learning_rate": 4.658165060632308e-05, "loss": 0.3587, "step": 5051500 }, { "epoch": 34.18687743611953, "grad_norm": 0.35716935992240906, "learning_rate": 4.658131225638805e-05, "loss": 0.3593, "step": 5052000 }, { "epoch": 34.1902609354699, "grad_norm": 0.3869902789592743, "learning_rate": 4.658097390645301e-05, "loss": 0.36, "step": 5052500 }, { "epoch": 34.19364443482027, "grad_norm": 0.355259507894516, "learning_rate": 4.6580635556517975e-05, "loss": 0.3597, "step": 5053000 }, { "epoch": 34.197027934170634, "grad_norm": 0.3885345458984375, "learning_rate": 4.658029720658294e-05, "loss": 0.3578, "step": 5053500 }, { "epoch": 34.200411433521005, "grad_norm": 0.37546995282173157, "learning_rate": 4.6579958856647906e-05, "loss": 0.3601, "step": 5054000 }, { "epoch": 34.203794932871375, "grad_norm": 0.39922034740448, "learning_rate": 4.657962050671287e-05, "loss": 0.3573, "step": 5054500 }, { "epoch": 34.20717843222174, "grad_norm": 0.37996113300323486, "learning_rate": 4.657928215677782e-05, "loss": 0.3583, "step": 5055000 }, { "epoch": 34.21056193157211, "grad_norm": 0.39103591442108154, "learning_rate": 4.657894380684279e-05, "loss": 0.3586, "step": 5055500 }, { "epoch": 34.21394543092248, "grad_norm": 0.3796161115169525, "learning_rate": 4.6578605456907754e-05, "loss": 0.3595, "step": 5056000 }, { "epoch": 34.21732893027284, "grad_norm": 0.3596300184726715, "learning_rate": 4.6578267106972717e-05, "loss": 0.3579, "step": 5056500 }, { "epoch": 34.22071242962321, "grad_norm": 0.38669246435165405, "learning_rate": 4.657792875703768e-05, "loss": 0.3594, "step": 5057000 }, { "epoch": 34.22409592897358, "grad_norm": 0.419877290725708, "learning_rate": 4.657759040710265e-05, "loss": 0.3579, "step": 5057500 }, { "epoch": 34.22747942832395, "grad_norm": 0.40100720524787903, "learning_rate": 4.657725205716761e-05, "loss": 0.359, "step": 5058000 }, { "epoch": 34.230862927674316, "grad_norm": 0.4499681293964386, "learning_rate": 4.657691370723257e-05, "loss": 0.359, "step": 5058500 }, { "epoch": 34.23424642702469, "grad_norm": 0.3997453451156616, "learning_rate": 4.6576575357297534e-05, "loss": 0.3589, "step": 5059000 }, { "epoch": 34.23762992637506, "grad_norm": 0.3672904968261719, "learning_rate": 4.6576237007362496e-05, "loss": 0.3582, "step": 5059500 }, { "epoch": 34.24101342572542, "grad_norm": 0.39281371235847473, "learning_rate": 4.657589865742746e-05, "loss": 0.3597, "step": 5060000 }, { "epoch": 34.24439692507579, "grad_norm": 0.3908872902393341, "learning_rate": 4.657556030749242e-05, "loss": 0.3591, "step": 5060500 }, { "epoch": 34.24778042442616, "grad_norm": 0.3856948912143707, "learning_rate": 4.657522195755738e-05, "loss": 0.3574, "step": 5061000 }, { "epoch": 34.251163923776524, "grad_norm": 0.389735609292984, "learning_rate": 4.657488360762235e-05, "loss": 0.3579, "step": 5061500 }, { "epoch": 34.254547423126894, "grad_norm": 0.3987472653388977, "learning_rate": 4.6574545257687313e-05, "loss": 0.36, "step": 5062000 }, { "epoch": 34.257930922477264, "grad_norm": 0.38841792941093445, "learning_rate": 4.6574206907752276e-05, "loss": 0.3576, "step": 5062500 }, { "epoch": 34.26131442182763, "grad_norm": 0.3690048158168793, "learning_rate": 4.657386855781724e-05, "loss": 0.3586, "step": 5063000 }, { "epoch": 34.264697921178, "grad_norm": 0.41382020711898804, "learning_rate": 4.6573530207882207e-05, "loss": 0.36, "step": 5063500 }, { "epoch": 34.26808142052837, "grad_norm": 0.39931240677833557, "learning_rate": 4.657319185794717e-05, "loss": 0.3598, "step": 5064000 }, { "epoch": 34.27146491987874, "grad_norm": 0.4021117687225342, "learning_rate": 4.6572853508012124e-05, "loss": 0.3575, "step": 5064500 }, { "epoch": 34.2748484192291, "grad_norm": 0.3797292411327362, "learning_rate": 4.657251515807709e-05, "loss": 0.3583, "step": 5065000 }, { "epoch": 34.27823191857947, "grad_norm": 0.39234450459480286, "learning_rate": 4.6572176808142055e-05, "loss": 0.3586, "step": 5065500 }, { "epoch": 34.28161541792984, "grad_norm": 0.4292556643486023, "learning_rate": 4.657183845820702e-05, "loss": 0.3583, "step": 5066000 }, { "epoch": 34.284998917280205, "grad_norm": 0.3685343861579895, "learning_rate": 4.657150010827198e-05, "loss": 0.3599, "step": 5066500 }, { "epoch": 34.288382416630576, "grad_norm": 0.40842705965042114, "learning_rate": 4.657116175833695e-05, "loss": 0.3594, "step": 5067000 }, { "epoch": 34.291765915980946, "grad_norm": 0.3621181547641754, "learning_rate": 4.657082340840191e-05, "loss": 0.359, "step": 5067500 }, { "epoch": 34.29514941533131, "grad_norm": 0.3603003919124603, "learning_rate": 4.657048505846687e-05, "loss": 0.3597, "step": 5068000 }, { "epoch": 34.29853291468168, "grad_norm": 0.3800334930419922, "learning_rate": 4.6570146708531835e-05, "loss": 0.3617, "step": 5068500 }, { "epoch": 34.30191641403205, "grad_norm": 0.379797101020813, "learning_rate": 4.65698083585968e-05, "loss": 0.3588, "step": 5069000 }, { "epoch": 34.30529991338242, "grad_norm": 0.3685661852359772, "learning_rate": 4.656947000866176e-05, "loss": 0.3595, "step": 5069500 }, { "epoch": 34.30868341273278, "grad_norm": 0.3684749901294708, "learning_rate": 4.656913165872672e-05, "loss": 0.3574, "step": 5070000 }, { "epoch": 34.312066912083154, "grad_norm": 0.3760967552661896, "learning_rate": 4.656879330879168e-05, "loss": 0.3594, "step": 5070500 }, { "epoch": 34.315450411433524, "grad_norm": 0.3654816150665283, "learning_rate": 4.656845495885665e-05, "loss": 0.3593, "step": 5071000 }, { "epoch": 34.31883391078389, "grad_norm": 0.3774130642414093, "learning_rate": 4.6568116608921614e-05, "loss": 0.3585, "step": 5071500 }, { "epoch": 34.32221741013426, "grad_norm": 0.35825785994529724, "learning_rate": 4.6567778258986576e-05, "loss": 0.359, "step": 5072000 }, { "epoch": 34.32560090948463, "grad_norm": 0.41758641600608826, "learning_rate": 4.656743990905154e-05, "loss": 0.3589, "step": 5072500 }, { "epoch": 34.32898440883499, "grad_norm": 0.3906160295009613, "learning_rate": 4.656710155911651e-05, "loss": 0.3591, "step": 5073000 }, { "epoch": 34.33236790818536, "grad_norm": 0.3692469894886017, "learning_rate": 4.656676320918147e-05, "loss": 0.3586, "step": 5073500 }, { "epoch": 34.33575140753573, "grad_norm": 0.35179731249809265, "learning_rate": 4.6566424859246425e-05, "loss": 0.3596, "step": 5074000 }, { "epoch": 34.339134906886095, "grad_norm": 0.3839186429977417, "learning_rate": 4.6566086509311394e-05, "loss": 0.3586, "step": 5074500 }, { "epoch": 34.342518406236465, "grad_norm": 0.34909674525260925, "learning_rate": 4.6565748159376356e-05, "loss": 0.3595, "step": 5075000 }, { "epoch": 34.345901905586835, "grad_norm": 0.3728690445423126, "learning_rate": 4.656540980944132e-05, "loss": 0.3568, "step": 5075500 }, { "epoch": 34.349285404937206, "grad_norm": 0.3859647810459137, "learning_rate": 4.656507145950628e-05, "loss": 0.359, "step": 5076000 }, { "epoch": 34.35266890428757, "grad_norm": 0.38348260521888733, "learning_rate": 4.656473310957125e-05, "loss": 0.3566, "step": 5076500 }, { "epoch": 34.35605240363794, "grad_norm": 0.37911906838417053, "learning_rate": 4.656439475963621e-05, "loss": 0.3583, "step": 5077000 }, { "epoch": 34.35943590298831, "grad_norm": 0.36353886127471924, "learning_rate": 4.656405640970117e-05, "loss": 0.3586, "step": 5077500 }, { "epoch": 34.36281940233867, "grad_norm": 0.4255317449569702, "learning_rate": 4.6563718059766135e-05, "loss": 0.36, "step": 5078000 }, { "epoch": 34.36620290168904, "grad_norm": 0.36470532417297363, "learning_rate": 4.65633797098311e-05, "loss": 0.3596, "step": 5078500 }, { "epoch": 34.36958640103941, "grad_norm": 0.3753197193145752, "learning_rate": 4.656304135989606e-05, "loss": 0.3584, "step": 5079000 }, { "epoch": 34.372969900389776, "grad_norm": 0.40687665343284607, "learning_rate": 4.656270300996102e-05, "loss": 0.3584, "step": 5079500 }, { "epoch": 34.37635339974015, "grad_norm": 0.37256020307540894, "learning_rate": 4.6562364660025984e-05, "loss": 0.3579, "step": 5080000 }, { "epoch": 34.37973689909052, "grad_norm": 0.41435477137565613, "learning_rate": 4.656202631009095e-05, "loss": 0.3594, "step": 5080500 }, { "epoch": 34.38312039844088, "grad_norm": 0.36371898651123047, "learning_rate": 4.6561687960155915e-05, "loss": 0.3597, "step": 5081000 }, { "epoch": 34.38650389779125, "grad_norm": 0.4099665582180023, "learning_rate": 4.656134961022088e-05, "loss": 0.3594, "step": 5081500 }, { "epoch": 34.38988739714162, "grad_norm": 0.37060919404029846, "learning_rate": 4.656101126028584e-05, "loss": 0.3596, "step": 5082000 }, { "epoch": 34.39327089649199, "grad_norm": 0.38851264119148254, "learning_rate": 4.656067291035081e-05, "loss": 0.358, "step": 5082500 }, { "epoch": 34.396654395842354, "grad_norm": 0.3856896162033081, "learning_rate": 4.656033456041577e-05, "loss": 0.3598, "step": 5083000 }, { "epoch": 34.400037895192725, "grad_norm": 0.3771379590034485, "learning_rate": 4.6559996210480725e-05, "loss": 0.3606, "step": 5083500 }, { "epoch": 34.403421394543095, "grad_norm": 0.3658657670021057, "learning_rate": 4.6559657860545694e-05, "loss": 0.3586, "step": 5084000 }, { "epoch": 34.40680489389346, "grad_norm": 0.3816579580307007, "learning_rate": 4.6559319510610656e-05, "loss": 0.3591, "step": 5084500 }, { "epoch": 34.41018839324383, "grad_norm": 0.3742064833641052, "learning_rate": 4.655898116067562e-05, "loss": 0.3585, "step": 5085000 }, { "epoch": 34.4135718925942, "grad_norm": 0.3874446153640747, "learning_rate": 4.655864281074058e-05, "loss": 0.3593, "step": 5085500 }, { "epoch": 34.41695539194456, "grad_norm": 0.377986341714859, "learning_rate": 4.655830446080555e-05, "loss": 0.3608, "step": 5086000 }, { "epoch": 34.42033889129493, "grad_norm": 0.39694201946258545, "learning_rate": 4.655796611087051e-05, "loss": 0.3593, "step": 5086500 }, { "epoch": 34.4237223906453, "grad_norm": 0.3425546884536743, "learning_rate": 4.6557627760935474e-05, "loss": 0.3592, "step": 5087000 }, { "epoch": 34.427105889995666, "grad_norm": 0.3719537854194641, "learning_rate": 4.6557289411000436e-05, "loss": 0.3599, "step": 5087500 }, { "epoch": 34.430489389346036, "grad_norm": 0.37190452218055725, "learning_rate": 4.65569510610654e-05, "loss": 0.3595, "step": 5088000 }, { "epoch": 34.433872888696406, "grad_norm": 0.4227867126464844, "learning_rate": 4.655661271113036e-05, "loss": 0.3588, "step": 5088500 }, { "epoch": 34.43725638804678, "grad_norm": 0.3775515854358673, "learning_rate": 4.655627436119532e-05, "loss": 0.3596, "step": 5089000 }, { "epoch": 34.44063988739714, "grad_norm": 0.3955373764038086, "learning_rate": 4.6555936011260284e-05, "loss": 0.3596, "step": 5089500 }, { "epoch": 34.44402338674751, "grad_norm": 0.3818521201610565, "learning_rate": 4.655559766132525e-05, "loss": 0.3588, "step": 5090000 }, { "epoch": 34.44740688609788, "grad_norm": 0.3896339535713196, "learning_rate": 4.6555259311390215e-05, "loss": 0.3575, "step": 5090500 }, { "epoch": 34.450790385448244, "grad_norm": 0.34918251633644104, "learning_rate": 4.655492096145518e-05, "loss": 0.3599, "step": 5091000 }, { "epoch": 34.454173884798614, "grad_norm": 0.3901742696762085, "learning_rate": 4.655458261152014e-05, "loss": 0.36, "step": 5091500 }, { "epoch": 34.457557384148984, "grad_norm": 0.3786981701850891, "learning_rate": 4.655424426158511e-05, "loss": 0.3584, "step": 5092000 }, { "epoch": 34.46094088349935, "grad_norm": 0.420971155166626, "learning_rate": 4.655390591165007e-05, "loss": 0.3582, "step": 5092500 }, { "epoch": 34.46432438284972, "grad_norm": 0.3856641352176666, "learning_rate": 4.6553567561715026e-05, "loss": 0.3595, "step": 5093000 }, { "epoch": 34.46770788220009, "grad_norm": 0.38781335949897766, "learning_rate": 4.6553229211779995e-05, "loss": 0.3597, "step": 5093500 }, { "epoch": 34.47109138155046, "grad_norm": 0.3730041980743408, "learning_rate": 4.655289086184496e-05, "loss": 0.358, "step": 5094000 }, { "epoch": 34.47447488090082, "grad_norm": 0.4017457067966461, "learning_rate": 4.655255251190992e-05, "loss": 0.3584, "step": 5094500 }, { "epoch": 34.47785838025119, "grad_norm": 0.38262927532196045, "learning_rate": 4.655221416197488e-05, "loss": 0.3596, "step": 5095000 }, { "epoch": 34.48124187960156, "grad_norm": 0.34982404112815857, "learning_rate": 4.655187581203985e-05, "loss": 0.36, "step": 5095500 }, { "epoch": 34.484625378951925, "grad_norm": 0.39245760440826416, "learning_rate": 4.655153746210481e-05, "loss": 0.3588, "step": 5096000 }, { "epoch": 34.488008878302296, "grad_norm": 0.4110981523990631, "learning_rate": 4.6551199112169774e-05, "loss": 0.3585, "step": 5096500 }, { "epoch": 34.491392377652666, "grad_norm": 0.34749066829681396, "learning_rate": 4.6550860762234736e-05, "loss": 0.3589, "step": 5097000 }, { "epoch": 34.49477587700303, "grad_norm": 0.3863259553909302, "learning_rate": 4.65505224122997e-05, "loss": 0.3609, "step": 5097500 }, { "epoch": 34.4981593763534, "grad_norm": 0.35212135314941406, "learning_rate": 4.655018406236466e-05, "loss": 0.3597, "step": 5098000 }, { "epoch": 34.50154287570377, "grad_norm": 0.38584738969802856, "learning_rate": 4.654984571242962e-05, "loss": 0.3578, "step": 5098500 }, { "epoch": 34.50492637505413, "grad_norm": 0.35282817482948303, "learning_rate": 4.6549507362494585e-05, "loss": 0.3592, "step": 5099000 }, { "epoch": 34.5083098744045, "grad_norm": 0.3928869068622589, "learning_rate": 4.6549169012559554e-05, "loss": 0.3589, "step": 5099500 }, { "epoch": 34.51169337375487, "grad_norm": 0.394925594329834, "learning_rate": 4.6548830662624516e-05, "loss": 0.358, "step": 5100000 }, { "epoch": 34.515076873105244, "grad_norm": 0.379443883895874, "learning_rate": 4.654849231268948e-05, "loss": 0.3578, "step": 5100500 }, { "epoch": 34.51846037245561, "grad_norm": 0.37842902541160583, "learning_rate": 4.654815396275444e-05, "loss": 0.359, "step": 5101000 }, { "epoch": 34.52184387180598, "grad_norm": 0.3890233635902405, "learning_rate": 4.654781561281941e-05, "loss": 0.3584, "step": 5101500 }, { "epoch": 34.52522737115635, "grad_norm": 0.3421824872493744, "learning_rate": 4.654747726288437e-05, "loss": 0.3597, "step": 5102000 }, { "epoch": 34.52861087050671, "grad_norm": 0.3568973243236542, "learning_rate": 4.6547138912949327e-05, "loss": 0.3594, "step": 5102500 }, { "epoch": 34.53199436985708, "grad_norm": 0.3865303695201874, "learning_rate": 4.6546800563014295e-05, "loss": 0.3604, "step": 5103000 }, { "epoch": 34.53537786920745, "grad_norm": 0.34883520007133484, "learning_rate": 4.654646221307926e-05, "loss": 0.3602, "step": 5103500 }, { "epoch": 34.538761368557815, "grad_norm": 0.4210364818572998, "learning_rate": 4.654612386314422e-05, "loss": 0.3604, "step": 5104000 }, { "epoch": 34.542144867908185, "grad_norm": 0.41963812708854675, "learning_rate": 4.654578551320918e-05, "loss": 0.3609, "step": 5104500 }, { "epoch": 34.545528367258555, "grad_norm": 0.40737205743789673, "learning_rate": 4.654544716327415e-05, "loss": 0.3581, "step": 5105000 }, { "epoch": 34.54891186660892, "grad_norm": 0.35566970705986023, "learning_rate": 4.654510881333911e-05, "loss": 0.3601, "step": 5105500 }, { "epoch": 34.55229536595929, "grad_norm": 0.3322935402393341, "learning_rate": 4.6544770463404075e-05, "loss": 0.3584, "step": 5106000 }, { "epoch": 34.55567886530966, "grad_norm": 0.3786003887653351, "learning_rate": 4.654443211346904e-05, "loss": 0.3599, "step": 5106500 }, { "epoch": 34.55906236466003, "grad_norm": 0.36978060007095337, "learning_rate": 4.6544093763534e-05, "loss": 0.3592, "step": 5107000 }, { "epoch": 34.56244586401039, "grad_norm": 0.3569822609424591, "learning_rate": 4.654375541359896e-05, "loss": 0.3575, "step": 5107500 }, { "epoch": 34.56582936336076, "grad_norm": 0.3197338581085205, "learning_rate": 4.6543417063663923e-05, "loss": 0.3605, "step": 5108000 }, { "epoch": 34.56921286271113, "grad_norm": 0.36754342913627625, "learning_rate": 4.6543078713728886e-05, "loss": 0.3599, "step": 5108500 }, { "epoch": 34.572596362061496, "grad_norm": 0.42086338996887207, "learning_rate": 4.6542740363793854e-05, "loss": 0.3592, "step": 5109000 }, { "epoch": 34.57597986141187, "grad_norm": 0.3892146050930023, "learning_rate": 4.6542402013858817e-05, "loss": 0.36, "step": 5109500 }, { "epoch": 34.57936336076224, "grad_norm": 0.3757900297641754, "learning_rate": 4.654206366392378e-05, "loss": 0.3587, "step": 5110000 }, { "epoch": 34.5827468601126, "grad_norm": 0.3483287990093231, "learning_rate": 4.654172531398874e-05, "loss": 0.359, "step": 5110500 }, { "epoch": 34.58613035946297, "grad_norm": 0.3535486161708832, "learning_rate": 4.654138696405371e-05, "loss": 0.3602, "step": 5111000 }, { "epoch": 34.58951385881334, "grad_norm": 0.387491911649704, "learning_rate": 4.654104861411867e-05, "loss": 0.3587, "step": 5111500 }, { "epoch": 34.592897358163704, "grad_norm": 0.39275237917900085, "learning_rate": 4.654071026418363e-05, "loss": 0.3606, "step": 5112000 }, { "epoch": 34.596280857514074, "grad_norm": 0.4202045202255249, "learning_rate": 4.6540371914248596e-05, "loss": 0.3603, "step": 5112500 }, { "epoch": 34.599664356864444, "grad_norm": 0.3850978910923004, "learning_rate": 4.654003356431356e-05, "loss": 0.3591, "step": 5113000 }, { "epoch": 34.603047856214815, "grad_norm": 0.4012092351913452, "learning_rate": 4.653969521437852e-05, "loss": 0.3584, "step": 5113500 }, { "epoch": 34.60643135556518, "grad_norm": 0.37787535786628723, "learning_rate": 4.653935686444348e-05, "loss": 0.3593, "step": 5114000 }, { "epoch": 34.60981485491555, "grad_norm": 0.3980405032634735, "learning_rate": 4.6539018514508445e-05, "loss": 0.3604, "step": 5114500 }, { "epoch": 34.61319835426592, "grad_norm": 0.39192214608192444, "learning_rate": 4.6538680164573413e-05, "loss": 0.36, "step": 5115000 }, { "epoch": 34.61658185361628, "grad_norm": 0.3803424537181854, "learning_rate": 4.6538341814638376e-05, "loss": 0.3605, "step": 5115500 }, { "epoch": 34.61996535296665, "grad_norm": 0.32273218035697937, "learning_rate": 4.653800346470334e-05, "loss": 0.3601, "step": 5116000 }, { "epoch": 34.62334885231702, "grad_norm": 0.36341169476509094, "learning_rate": 4.65376651147683e-05, "loss": 0.3602, "step": 5116500 }, { "epoch": 34.626732351667386, "grad_norm": 0.3712911605834961, "learning_rate": 4.653732676483326e-05, "loss": 0.3602, "step": 5117000 }, { "epoch": 34.630115851017756, "grad_norm": 0.35886746644973755, "learning_rate": 4.6536988414898224e-05, "loss": 0.3602, "step": 5117500 }, { "epoch": 34.633499350368126, "grad_norm": 0.433685302734375, "learning_rate": 4.6536650064963186e-05, "loss": 0.3584, "step": 5118000 }, { "epoch": 34.63688284971849, "grad_norm": 0.37122201919555664, "learning_rate": 4.6536311715028155e-05, "loss": 0.3593, "step": 5118500 }, { "epoch": 34.64026634906886, "grad_norm": 0.35382482409477234, "learning_rate": 4.653597336509312e-05, "loss": 0.3606, "step": 5119000 }, { "epoch": 34.64364984841923, "grad_norm": 0.3513053357601166, "learning_rate": 4.653563501515808e-05, "loss": 0.3602, "step": 5119500 }, { "epoch": 34.6470333477696, "grad_norm": 0.3879770338535309, "learning_rate": 4.653529666522304e-05, "loss": 0.3596, "step": 5120000 }, { "epoch": 34.65041684711996, "grad_norm": 0.3754001259803772, "learning_rate": 4.653495831528801e-05, "loss": 0.3591, "step": 5120500 }, { "epoch": 34.653800346470334, "grad_norm": 0.3843109905719757, "learning_rate": 4.653461996535297e-05, "loss": 0.3601, "step": 5121000 }, { "epoch": 34.657183845820704, "grad_norm": 0.38966867327690125, "learning_rate": 4.653428161541793e-05, "loss": 0.3589, "step": 5121500 }, { "epoch": 34.66056734517107, "grad_norm": 0.3803916871547699, "learning_rate": 4.653394326548289e-05, "loss": 0.3589, "step": 5122000 }, { "epoch": 34.66395084452144, "grad_norm": 0.3781992197036743, "learning_rate": 4.653360491554786e-05, "loss": 0.3585, "step": 5122500 }, { "epoch": 34.66733434387181, "grad_norm": 0.3852325975894928, "learning_rate": 4.653326656561282e-05, "loss": 0.3599, "step": 5123000 }, { "epoch": 34.67071784322217, "grad_norm": 0.37221723794937134, "learning_rate": 4.653292821567778e-05, "loss": 0.361, "step": 5123500 }, { "epoch": 34.67410134257254, "grad_norm": 0.3999648094177246, "learning_rate": 4.6532589865742745e-05, "loss": 0.3607, "step": 5124000 }, { "epoch": 34.67748484192291, "grad_norm": 0.3421521484851837, "learning_rate": 4.6532251515807714e-05, "loss": 0.3577, "step": 5124500 }, { "epoch": 34.68086834127328, "grad_norm": 0.37870514392852783, "learning_rate": 4.6531913165872676e-05, "loss": 0.359, "step": 5125000 }, { "epoch": 34.684251840623645, "grad_norm": 0.3588518500328064, "learning_rate": 4.653157481593764e-05, "loss": 0.3597, "step": 5125500 }, { "epoch": 34.687635339974015, "grad_norm": 0.390232652425766, "learning_rate": 4.65312364660026e-05, "loss": 0.3595, "step": 5126000 }, { "epoch": 34.691018839324386, "grad_norm": 0.38923120498657227, "learning_rate": 4.653089811606756e-05, "loss": 0.36, "step": 5126500 }, { "epoch": 34.69440233867475, "grad_norm": 0.39458635449409485, "learning_rate": 4.6530559766132525e-05, "loss": 0.3603, "step": 5127000 }, { "epoch": 34.69778583802512, "grad_norm": 0.4159116744995117, "learning_rate": 4.653022141619749e-05, "loss": 0.3592, "step": 5127500 }, { "epoch": 34.70116933737549, "grad_norm": 0.3766118586063385, "learning_rate": 4.6529883066262456e-05, "loss": 0.3599, "step": 5128000 }, { "epoch": 34.70455283672585, "grad_norm": 0.3492608964443207, "learning_rate": 4.652954471632742e-05, "loss": 0.359, "step": 5128500 }, { "epoch": 34.70793633607622, "grad_norm": 0.4123823642730713, "learning_rate": 4.652920636639238e-05, "loss": 0.3585, "step": 5129000 }, { "epoch": 34.71131983542659, "grad_norm": 0.3934713304042816, "learning_rate": 4.652886801645734e-05, "loss": 0.3584, "step": 5129500 }, { "epoch": 34.71470333477696, "grad_norm": 0.34409138560295105, "learning_rate": 4.652852966652231e-05, "loss": 0.3589, "step": 5130000 }, { "epoch": 34.71808683412733, "grad_norm": 0.3865695595741272, "learning_rate": 4.652819131658727e-05, "loss": 0.359, "step": 5130500 }, { "epoch": 34.7214703334777, "grad_norm": 0.366634339094162, "learning_rate": 4.652785296665223e-05, "loss": 0.3604, "step": 5131000 }, { "epoch": 34.72485383282807, "grad_norm": 0.4457561671733856, "learning_rate": 4.652751461671719e-05, "loss": 0.3595, "step": 5131500 }, { "epoch": 34.72823733217843, "grad_norm": 0.36967164278030396, "learning_rate": 4.652717626678216e-05, "loss": 0.3612, "step": 5132000 }, { "epoch": 34.7316208315288, "grad_norm": 0.37795010209083557, "learning_rate": 4.652683791684712e-05, "loss": 0.3599, "step": 5132500 }, { "epoch": 34.73500433087917, "grad_norm": 0.38283535838127136, "learning_rate": 4.6526499566912084e-05, "loss": 0.361, "step": 5133000 }, { "epoch": 34.738387830229534, "grad_norm": 0.40486639738082886, "learning_rate": 4.6526161216977046e-05, "loss": 0.3584, "step": 5133500 }, { "epoch": 34.741771329579905, "grad_norm": 0.41568008065223694, "learning_rate": 4.6525822867042015e-05, "loss": 0.3585, "step": 5134000 }, { "epoch": 34.745154828930275, "grad_norm": 0.3480258584022522, "learning_rate": 4.652548451710698e-05, "loss": 0.3591, "step": 5134500 }, { "epoch": 34.74853832828064, "grad_norm": 0.3713112473487854, "learning_rate": 4.652514616717194e-05, "loss": 0.3602, "step": 5135000 }, { "epoch": 34.75192182763101, "grad_norm": 0.36227697134017944, "learning_rate": 4.65248078172369e-05, "loss": 0.3595, "step": 5135500 }, { "epoch": 34.75530532698138, "grad_norm": 0.3820270597934723, "learning_rate": 4.652446946730186e-05, "loss": 0.3595, "step": 5136000 }, { "epoch": 34.75868882633174, "grad_norm": 0.3776843547821045, "learning_rate": 4.6524131117366825e-05, "loss": 0.3598, "step": 5136500 }, { "epoch": 34.76207232568211, "grad_norm": 0.4049762189388275, "learning_rate": 4.652379276743179e-05, "loss": 0.3593, "step": 5137000 }, { "epoch": 34.76545582503248, "grad_norm": 0.3922647535800934, "learning_rate": 4.6523454417496756e-05, "loss": 0.3593, "step": 5137500 }, { "epoch": 34.76883932438285, "grad_norm": 0.3500220477581024, "learning_rate": 4.652311606756172e-05, "loss": 0.3594, "step": 5138000 }, { "epoch": 34.772222823733216, "grad_norm": 0.36333972215652466, "learning_rate": 4.652277771762668e-05, "loss": 0.3601, "step": 5138500 }, { "epoch": 34.775606323083586, "grad_norm": 0.3815446197986603, "learning_rate": 4.652243936769164e-05, "loss": 0.3593, "step": 5139000 }, { "epoch": 34.77898982243396, "grad_norm": 0.387391060590744, "learning_rate": 4.652210101775661e-05, "loss": 0.3597, "step": 5139500 }, { "epoch": 34.78237332178432, "grad_norm": 0.37375015020370483, "learning_rate": 4.6521762667821574e-05, "loss": 0.3584, "step": 5140000 }, { "epoch": 34.78575682113469, "grad_norm": 0.335807204246521, "learning_rate": 4.652142431788653e-05, "loss": 0.3602, "step": 5140500 }, { "epoch": 34.78914032048506, "grad_norm": 0.3519132137298584, "learning_rate": 4.652108596795149e-05, "loss": 0.3585, "step": 5141000 }, { "epoch": 34.792523819835424, "grad_norm": 0.38648733496665955, "learning_rate": 4.652074761801646e-05, "loss": 0.3596, "step": 5141500 }, { "epoch": 34.795907319185794, "grad_norm": 0.3896413743495941, "learning_rate": 4.652040926808142e-05, "loss": 0.3598, "step": 5142000 }, { "epoch": 34.799290818536164, "grad_norm": 0.38711005449295044, "learning_rate": 4.6520070918146384e-05, "loss": 0.3593, "step": 5142500 }, { "epoch": 34.80267431788653, "grad_norm": 0.41289591789245605, "learning_rate": 4.6519732568211346e-05, "loss": 0.3612, "step": 5143000 }, { "epoch": 34.8060578172369, "grad_norm": 0.42670756578445435, "learning_rate": 4.6519394218276315e-05, "loss": 0.3585, "step": 5143500 }, { "epoch": 34.80944131658727, "grad_norm": 0.36375993490219116, "learning_rate": 4.651905586834128e-05, "loss": 0.3629, "step": 5144000 }, { "epoch": 34.81282481593764, "grad_norm": 0.348468154668808, "learning_rate": 4.651871751840624e-05, "loss": 0.3595, "step": 5144500 }, { "epoch": 34.816208315288, "grad_norm": 0.37443798780441284, "learning_rate": 4.65183791684712e-05, "loss": 0.3594, "step": 5145000 }, { "epoch": 34.81959181463837, "grad_norm": 0.3582093119621277, "learning_rate": 4.6518040818536164e-05, "loss": 0.3587, "step": 5145500 }, { "epoch": 34.82297531398874, "grad_norm": 0.3682185411453247, "learning_rate": 4.6517702468601126e-05, "loss": 0.3602, "step": 5146000 }, { "epoch": 34.826358813339105, "grad_norm": 0.33949360251426697, "learning_rate": 4.651736411866609e-05, "loss": 0.3597, "step": 5146500 }, { "epoch": 34.829742312689476, "grad_norm": 0.3803558051586151, "learning_rate": 4.651702576873106e-05, "loss": 0.3611, "step": 5147000 }, { "epoch": 34.833125812039846, "grad_norm": 0.4080631136894226, "learning_rate": 4.651668741879602e-05, "loss": 0.3607, "step": 5147500 }, { "epoch": 34.83650931139021, "grad_norm": 0.3974284827709198, "learning_rate": 4.651634906886098e-05, "loss": 0.3599, "step": 5148000 }, { "epoch": 34.83989281074058, "grad_norm": 0.38302257657051086, "learning_rate": 4.651601071892594e-05, "loss": 0.3583, "step": 5148500 }, { "epoch": 34.84327631009095, "grad_norm": 0.3776800334453583, "learning_rate": 4.651567236899091e-05, "loss": 0.3596, "step": 5149000 }, { "epoch": 34.84665980944132, "grad_norm": 0.44061967730522156, "learning_rate": 4.6515334019055874e-05, "loss": 0.3604, "step": 5149500 }, { "epoch": 34.85004330879168, "grad_norm": 0.3747578561306, "learning_rate": 4.651499566912083e-05, "loss": 0.3595, "step": 5150000 }, { "epoch": 34.85342680814205, "grad_norm": 0.36438217759132385, "learning_rate": 4.651465731918579e-05, "loss": 0.3591, "step": 5150500 }, { "epoch": 34.856810307492424, "grad_norm": 0.41121163964271545, "learning_rate": 4.651431896925076e-05, "loss": 0.358, "step": 5151000 }, { "epoch": 34.86019380684279, "grad_norm": 0.38191524147987366, "learning_rate": 4.651398061931572e-05, "loss": 0.3601, "step": 5151500 }, { "epoch": 34.86357730619316, "grad_norm": 0.40387558937072754, "learning_rate": 4.6513642269380685e-05, "loss": 0.3595, "step": 5152000 }, { "epoch": 34.86696080554353, "grad_norm": 0.35598960518836975, "learning_rate": 4.651330391944565e-05, "loss": 0.3591, "step": 5152500 }, { "epoch": 34.87034430489389, "grad_norm": 0.33690690994262695, "learning_rate": 4.6512965569510616e-05, "loss": 0.36, "step": 5153000 }, { "epoch": 34.87372780424426, "grad_norm": 0.3854426443576813, "learning_rate": 4.651262721957558e-05, "loss": 0.3597, "step": 5153500 }, { "epoch": 34.87711130359463, "grad_norm": 0.3753717541694641, "learning_rate": 4.651228886964054e-05, "loss": 0.3597, "step": 5154000 }, { "epoch": 34.880494802944995, "grad_norm": 0.35593491792678833, "learning_rate": 4.65119505197055e-05, "loss": 0.3592, "step": 5154500 }, { "epoch": 34.883878302295365, "grad_norm": 0.37082576751708984, "learning_rate": 4.6511612169770464e-05, "loss": 0.3605, "step": 5155000 }, { "epoch": 34.887261801645735, "grad_norm": 0.41517174243927, "learning_rate": 4.6511273819835427e-05, "loss": 0.3592, "step": 5155500 }, { "epoch": 34.890645300996106, "grad_norm": 0.4043850600719452, "learning_rate": 4.651093546990039e-05, "loss": 0.3596, "step": 5156000 }, { "epoch": 34.89402880034647, "grad_norm": 0.4053369462490082, "learning_rate": 4.651059711996536e-05, "loss": 0.359, "step": 5156500 }, { "epoch": 34.89741229969684, "grad_norm": 0.40419548749923706, "learning_rate": 4.651025877003032e-05, "loss": 0.3585, "step": 5157000 }, { "epoch": 34.90079579904721, "grad_norm": 0.3818973898887634, "learning_rate": 4.650992042009528e-05, "loss": 0.3586, "step": 5157500 }, { "epoch": 34.90417929839757, "grad_norm": 0.41882139444351196, "learning_rate": 4.6509582070160244e-05, "loss": 0.3599, "step": 5158000 }, { "epoch": 34.90756279774794, "grad_norm": 0.38736435770988464, "learning_rate": 4.650924372022521e-05, "loss": 0.3589, "step": 5158500 }, { "epoch": 34.91094629709831, "grad_norm": 0.33506953716278076, "learning_rate": 4.6508905370290175e-05, "loss": 0.3598, "step": 5159000 }, { "epoch": 34.914329796448676, "grad_norm": 0.3672519624233246, "learning_rate": 4.650856702035513e-05, "loss": 0.3604, "step": 5159500 }, { "epoch": 34.91771329579905, "grad_norm": 0.3793613016605377, "learning_rate": 4.650822867042009e-05, "loss": 0.3593, "step": 5160000 }, { "epoch": 34.92109679514942, "grad_norm": 0.3812962472438812, "learning_rate": 4.650789032048506e-05, "loss": 0.3604, "step": 5160500 }, { "epoch": 34.92448029449978, "grad_norm": 0.4058850109577179, "learning_rate": 4.6507551970550023e-05, "loss": 0.3597, "step": 5161000 }, { "epoch": 34.92786379385015, "grad_norm": 0.37779462337493896, "learning_rate": 4.6507213620614986e-05, "loss": 0.3606, "step": 5161500 }, { "epoch": 34.93124729320052, "grad_norm": 0.4026508927345276, "learning_rate": 4.650687527067995e-05, "loss": 0.3602, "step": 5162000 }, { "epoch": 34.93463079255089, "grad_norm": 0.40791353583335876, "learning_rate": 4.650653692074492e-05, "loss": 0.3592, "step": 5162500 }, { "epoch": 34.938014291901254, "grad_norm": 0.3928754925727844, "learning_rate": 4.650619857080988e-05, "loss": 0.3604, "step": 5163000 }, { "epoch": 34.941397791251624, "grad_norm": 0.37801092863082886, "learning_rate": 4.650586022087484e-05, "loss": 0.3593, "step": 5163500 }, { "epoch": 34.944781290601995, "grad_norm": 0.34316352009773254, "learning_rate": 4.65055218709398e-05, "loss": 0.3584, "step": 5164000 }, { "epoch": 34.94816478995236, "grad_norm": 0.39071527123451233, "learning_rate": 4.6505183521004765e-05, "loss": 0.36, "step": 5164500 }, { "epoch": 34.95154828930273, "grad_norm": 0.34213265776634216, "learning_rate": 4.650484517106973e-05, "loss": 0.3603, "step": 5165000 }, { "epoch": 34.9549317886531, "grad_norm": 0.3949839174747467, "learning_rate": 4.650450682113469e-05, "loss": 0.3585, "step": 5165500 }, { "epoch": 34.95831528800346, "grad_norm": 0.384823203086853, "learning_rate": 4.650416847119966e-05, "loss": 0.3588, "step": 5166000 }, { "epoch": 34.96169878735383, "grad_norm": 0.4190446734428406, "learning_rate": 4.650383012126462e-05, "loss": 0.3591, "step": 5166500 }, { "epoch": 34.9650822867042, "grad_norm": 0.3400190472602844, "learning_rate": 4.650349177132958e-05, "loss": 0.3609, "step": 5167000 }, { "epoch": 34.968465786054566, "grad_norm": 0.39202719926834106, "learning_rate": 4.6503153421394545e-05, "loss": 0.3596, "step": 5167500 }, { "epoch": 34.971849285404936, "grad_norm": 0.3868553936481476, "learning_rate": 4.650281507145951e-05, "loss": 0.3588, "step": 5168000 }, { "epoch": 34.975232784755306, "grad_norm": 0.3931337594985962, "learning_rate": 4.6502476721524476e-05, "loss": 0.3608, "step": 5168500 }, { "epoch": 34.97861628410568, "grad_norm": 0.4102155268192291, "learning_rate": 4.650213837158944e-05, "loss": 0.3594, "step": 5169000 }, { "epoch": 34.98199978345604, "grad_norm": 0.36841264367103577, "learning_rate": 4.650180002165439e-05, "loss": 0.3594, "step": 5169500 }, { "epoch": 34.98538328280641, "grad_norm": 0.3891858458518982, "learning_rate": 4.650146167171936e-05, "loss": 0.3609, "step": 5170000 }, { "epoch": 34.98876678215678, "grad_norm": 0.39834773540496826, "learning_rate": 4.6501123321784324e-05, "loss": 0.3601, "step": 5170500 }, { "epoch": 34.99215028150714, "grad_norm": 0.36702215671539307, "learning_rate": 4.6500784971849286e-05, "loss": 0.3602, "step": 5171000 }, { "epoch": 34.995533780857514, "grad_norm": 0.4127750098705292, "learning_rate": 4.650044662191425e-05, "loss": 0.3593, "step": 5171500 }, { "epoch": 34.998917280207884, "grad_norm": 0.41186824440956116, "learning_rate": 4.650010827197922e-05, "loss": 0.3586, "step": 5172000 }, { "epoch": 35.0, "eval_accuracy": 0.8629421222081165, "eval_loss": 0.5557395219802856, "eval_runtime": 3356.599, "eval_samples_per_second": 86.619, "eval_steps_per_second": 5.414, "step": 5172160 }, { "epoch": 35.00230077955825, "grad_norm": 0.3806530833244324, "learning_rate": 4.649976992204418e-05, "loss": 0.3571, "step": 5172500 }, { "epoch": 35.00568427890862, "grad_norm": 0.3965018093585968, "learning_rate": 4.649943157210914e-05, "loss": 0.3583, "step": 5173000 }, { "epoch": 35.00906777825899, "grad_norm": 0.36404433846473694, "learning_rate": 4.6499093222174104e-05, "loss": 0.3569, "step": 5173500 }, { "epoch": 35.01245127760936, "grad_norm": 0.38004666566848755, "learning_rate": 4.6498754872239066e-05, "loss": 0.3575, "step": 5174000 }, { "epoch": 35.01583477695972, "grad_norm": 0.3895862400531769, "learning_rate": 4.649841652230403e-05, "loss": 0.3577, "step": 5174500 }, { "epoch": 35.01921827631009, "grad_norm": 0.37043413519859314, "learning_rate": 4.649807817236899e-05, "loss": 0.3569, "step": 5175000 }, { "epoch": 35.02260177566046, "grad_norm": 0.3911413550376892, "learning_rate": 4.649773982243396e-05, "loss": 0.3577, "step": 5175500 }, { "epoch": 35.025985275010825, "grad_norm": 0.39181697368621826, "learning_rate": 4.649740147249892e-05, "loss": 0.3573, "step": 5176000 }, { "epoch": 35.029368774361195, "grad_norm": 0.3618784546852112, "learning_rate": 4.649706312256388e-05, "loss": 0.3567, "step": 5176500 }, { "epoch": 35.032752273711566, "grad_norm": 0.42894452810287476, "learning_rate": 4.6496724772628845e-05, "loss": 0.3589, "step": 5177000 }, { "epoch": 35.03613577306193, "grad_norm": 0.41768378019332886, "learning_rate": 4.649638642269381e-05, "loss": 0.3573, "step": 5177500 }, { "epoch": 35.0395192724123, "grad_norm": 0.4142279624938965, "learning_rate": 4.6496048072758776e-05, "loss": 0.3572, "step": 5178000 }, { "epoch": 35.04290277176267, "grad_norm": 0.40129706263542175, "learning_rate": 4.649570972282374e-05, "loss": 0.3562, "step": 5178500 }, { "epoch": 35.04628627111303, "grad_norm": 0.38917475938796997, "learning_rate": 4.6495371372888694e-05, "loss": 0.3571, "step": 5179000 }, { "epoch": 35.0496697704634, "grad_norm": 0.3783824145793915, "learning_rate": 4.649503302295366e-05, "loss": 0.358, "step": 5179500 }, { "epoch": 35.05305326981377, "grad_norm": 0.42363739013671875, "learning_rate": 4.6494694673018625e-05, "loss": 0.359, "step": 5180000 }, { "epoch": 35.056436769164144, "grad_norm": 0.43193498253822327, "learning_rate": 4.649435632308359e-05, "loss": 0.357, "step": 5180500 }, { "epoch": 35.05982026851451, "grad_norm": 0.366696834564209, "learning_rate": 4.649401797314855e-05, "loss": 0.3583, "step": 5181000 }, { "epoch": 35.06320376786488, "grad_norm": 0.37471070885658264, "learning_rate": 4.649367962321352e-05, "loss": 0.3582, "step": 5181500 }, { "epoch": 35.06658726721525, "grad_norm": 0.36143651604652405, "learning_rate": 4.649334127327848e-05, "loss": 0.3589, "step": 5182000 }, { "epoch": 35.06997076656561, "grad_norm": 0.41785821318626404, "learning_rate": 4.649300292334344e-05, "loss": 0.3571, "step": 5182500 }, { "epoch": 35.07335426591598, "grad_norm": 0.38460901379585266, "learning_rate": 4.6492664573408404e-05, "loss": 0.3568, "step": 5183000 }, { "epoch": 35.07673776526635, "grad_norm": 0.3540903925895691, "learning_rate": 4.6492326223473366e-05, "loss": 0.3572, "step": 5183500 }, { "epoch": 35.080121264616714, "grad_norm": 0.37059855461120605, "learning_rate": 4.649198787353833e-05, "loss": 0.3576, "step": 5184000 }, { "epoch": 35.083504763967085, "grad_norm": 0.3962647616863251, "learning_rate": 4.649164952360329e-05, "loss": 0.3572, "step": 5184500 }, { "epoch": 35.086888263317455, "grad_norm": 0.42337074875831604, "learning_rate": 4.649131117366825e-05, "loss": 0.359, "step": 5185000 }, { "epoch": 35.09027176266782, "grad_norm": 0.36690419912338257, "learning_rate": 4.649097282373322e-05, "loss": 0.3571, "step": 5185500 }, { "epoch": 35.09365526201819, "grad_norm": 0.3772687017917633, "learning_rate": 4.6490634473798184e-05, "loss": 0.3585, "step": 5186000 }, { "epoch": 35.09703876136856, "grad_norm": 0.3888028860092163, "learning_rate": 4.6490296123863146e-05, "loss": 0.3575, "step": 5186500 }, { "epoch": 35.10042226071893, "grad_norm": 0.38842588663101196, "learning_rate": 4.648995777392811e-05, "loss": 0.3587, "step": 5187000 }, { "epoch": 35.10380576006929, "grad_norm": 0.40151742100715637, "learning_rate": 4.648961942399308e-05, "loss": 0.3597, "step": 5187500 }, { "epoch": 35.10718925941966, "grad_norm": 0.3653225004673004, "learning_rate": 4.648928107405804e-05, "loss": 0.3588, "step": 5188000 }, { "epoch": 35.11057275877003, "grad_norm": 0.42274415493011475, "learning_rate": 4.6488942724122994e-05, "loss": 0.3582, "step": 5188500 }, { "epoch": 35.113956258120396, "grad_norm": 0.43063274025917053, "learning_rate": 4.648860437418796e-05, "loss": 0.3589, "step": 5189000 }, { "epoch": 35.117339757470766, "grad_norm": 0.38887819647789, "learning_rate": 4.6488266024252925e-05, "loss": 0.3581, "step": 5189500 }, { "epoch": 35.12072325682114, "grad_norm": 0.3567529022693634, "learning_rate": 4.648792767431789e-05, "loss": 0.3585, "step": 5190000 }, { "epoch": 35.1241067561715, "grad_norm": 0.33756041526794434, "learning_rate": 4.648758932438285e-05, "loss": 0.3591, "step": 5190500 }, { "epoch": 35.12749025552187, "grad_norm": 0.39104586839675903, "learning_rate": 4.648725097444782e-05, "loss": 0.3587, "step": 5191000 }, { "epoch": 35.13087375487224, "grad_norm": 0.3683955669403076, "learning_rate": 4.648691262451278e-05, "loss": 0.3575, "step": 5191500 }, { "epoch": 35.134257254222604, "grad_norm": 0.39823782444000244, "learning_rate": 4.648657427457774e-05, "loss": 0.3589, "step": 5192000 }, { "epoch": 35.137640753572974, "grad_norm": 0.41600486636161804, "learning_rate": 4.6486235924642705e-05, "loss": 0.3577, "step": 5192500 }, { "epoch": 35.141024252923344, "grad_norm": 0.37839174270629883, "learning_rate": 4.648589757470767e-05, "loss": 0.3596, "step": 5193000 }, { "epoch": 35.144407752273715, "grad_norm": 0.3842908442020416, "learning_rate": 4.648555922477263e-05, "loss": 0.3592, "step": 5193500 }, { "epoch": 35.14779125162408, "grad_norm": 0.4035647213459015, "learning_rate": 4.648522087483759e-05, "loss": 0.3581, "step": 5194000 }, { "epoch": 35.15117475097445, "grad_norm": 0.43002358078956604, "learning_rate": 4.648488252490255e-05, "loss": 0.3577, "step": 5194500 }, { "epoch": 35.15455825032482, "grad_norm": 0.4105346202850342, "learning_rate": 4.648454417496752e-05, "loss": 0.3588, "step": 5195000 }, { "epoch": 35.15794174967518, "grad_norm": 0.36859622597694397, "learning_rate": 4.6484205825032484e-05, "loss": 0.3588, "step": 5195500 }, { "epoch": 35.16132524902555, "grad_norm": 0.4235802888870239, "learning_rate": 4.6483867475097447e-05, "loss": 0.3587, "step": 5196000 }, { "epoch": 35.16470874837592, "grad_norm": 0.3733595609664917, "learning_rate": 4.648352912516241e-05, "loss": 0.3573, "step": 5196500 }, { "epoch": 35.168092247726285, "grad_norm": 0.38341930508613586, "learning_rate": 4.648319077522738e-05, "loss": 0.3582, "step": 5197000 }, { "epoch": 35.171475747076656, "grad_norm": 0.4134353995323181, "learning_rate": 4.648285242529234e-05, "loss": 0.359, "step": 5197500 }, { "epoch": 35.174859246427026, "grad_norm": 0.42130154371261597, "learning_rate": 4.6482514075357295e-05, "loss": 0.3572, "step": 5198000 }, { "epoch": 35.178242745777396, "grad_norm": 0.4000984728336334, "learning_rate": 4.6482175725422264e-05, "loss": 0.3576, "step": 5198500 }, { "epoch": 35.18162624512776, "grad_norm": 0.3766460418701172, "learning_rate": 4.6481837375487226e-05, "loss": 0.3585, "step": 5199000 }, { "epoch": 35.18500974447813, "grad_norm": 0.39292386174201965, "learning_rate": 4.648149902555219e-05, "loss": 0.3575, "step": 5199500 }, { "epoch": 35.1883932438285, "grad_norm": 0.39176398515701294, "learning_rate": 4.648116067561715e-05, "loss": 0.3589, "step": 5200000 }, { "epoch": 35.19177674317886, "grad_norm": 0.3988284468650818, "learning_rate": 4.648082232568212e-05, "loss": 0.3579, "step": 5200500 }, { "epoch": 35.195160242529234, "grad_norm": 0.3713776469230652, "learning_rate": 4.648048397574708e-05, "loss": 0.3584, "step": 5201000 }, { "epoch": 35.198543741879604, "grad_norm": 0.3763924837112427, "learning_rate": 4.6480145625812043e-05, "loss": 0.3579, "step": 5201500 }, { "epoch": 35.20192724122997, "grad_norm": 0.3911839425563812, "learning_rate": 4.6479807275877006e-05, "loss": 0.3583, "step": 5202000 }, { "epoch": 35.20531074058034, "grad_norm": 0.4037778973579407, "learning_rate": 4.647946892594197e-05, "loss": 0.3601, "step": 5202500 }, { "epoch": 35.20869423993071, "grad_norm": 0.37728357315063477, "learning_rate": 4.647913057600693e-05, "loss": 0.358, "step": 5203000 }, { "epoch": 35.21207773928107, "grad_norm": 0.36691904067993164, "learning_rate": 4.647879222607189e-05, "loss": 0.3566, "step": 5203500 }, { "epoch": 35.21546123863144, "grad_norm": 0.38442808389663696, "learning_rate": 4.6478453876136854e-05, "loss": 0.3593, "step": 5204000 }, { "epoch": 35.21884473798181, "grad_norm": 0.41736212372779846, "learning_rate": 4.647811552620182e-05, "loss": 0.3584, "step": 5204500 }, { "epoch": 35.22222823733218, "grad_norm": 0.3651248812675476, "learning_rate": 4.6477777176266785e-05, "loss": 0.3598, "step": 5205000 }, { "epoch": 35.225611736682545, "grad_norm": 0.365179181098938, "learning_rate": 4.647743882633175e-05, "loss": 0.3592, "step": 5205500 }, { "epoch": 35.228995236032915, "grad_norm": 0.3637058138847351, "learning_rate": 4.647710047639671e-05, "loss": 0.3596, "step": 5206000 }, { "epoch": 35.232378735383286, "grad_norm": 0.3909095525741577, "learning_rate": 4.647676212646168e-05, "loss": 0.3596, "step": 5206500 }, { "epoch": 35.23576223473365, "grad_norm": 0.390669584274292, "learning_rate": 4.647642377652664e-05, "loss": 0.3592, "step": 5207000 }, { "epoch": 35.23914573408402, "grad_norm": 0.40053826570510864, "learning_rate": 4.6476085426591596e-05, "loss": 0.3594, "step": 5207500 }, { "epoch": 35.24252923343439, "grad_norm": 0.4064285457134247, "learning_rate": 4.6475747076656565e-05, "loss": 0.3589, "step": 5208000 }, { "epoch": 35.24591273278475, "grad_norm": 0.3962462246417999, "learning_rate": 4.647540872672153e-05, "loss": 0.3599, "step": 5208500 }, { "epoch": 35.24929623213512, "grad_norm": 0.3796030282974243, "learning_rate": 4.647507037678649e-05, "loss": 0.3597, "step": 5209000 }, { "epoch": 35.25267973148549, "grad_norm": 0.3501536250114441, "learning_rate": 4.647473202685145e-05, "loss": 0.3602, "step": 5209500 }, { "epoch": 35.256063230835856, "grad_norm": 0.35535991191864014, "learning_rate": 4.647439367691642e-05, "loss": 0.3577, "step": 5210000 }, { "epoch": 35.25944673018623, "grad_norm": 0.3918737769126892, "learning_rate": 4.647405532698138e-05, "loss": 0.3584, "step": 5210500 }, { "epoch": 35.2628302295366, "grad_norm": 0.3632281422615051, "learning_rate": 4.6473716977046344e-05, "loss": 0.3585, "step": 5211000 }, { "epoch": 35.26621372888697, "grad_norm": 0.3752191364765167, "learning_rate": 4.6473378627111306e-05, "loss": 0.3585, "step": 5211500 }, { "epoch": 35.26959722823733, "grad_norm": 0.4171627461910248, "learning_rate": 4.647304027717627e-05, "loss": 0.358, "step": 5212000 }, { "epoch": 35.2729807275877, "grad_norm": 0.3874201774597168, "learning_rate": 4.647270192724123e-05, "loss": 0.3589, "step": 5212500 }, { "epoch": 35.27636422693807, "grad_norm": 0.36945509910583496, "learning_rate": 4.647236357730619e-05, "loss": 0.3597, "step": 5213000 }, { "epoch": 35.279747726288434, "grad_norm": 0.38861581683158875, "learning_rate": 4.6472025227371155e-05, "loss": 0.3591, "step": 5213500 }, { "epoch": 35.283131225638805, "grad_norm": 0.3578372895717621, "learning_rate": 4.6471686877436124e-05, "loss": 0.3587, "step": 5214000 }, { "epoch": 35.286514724989175, "grad_norm": 0.3561571538448334, "learning_rate": 4.6471348527501086e-05, "loss": 0.358, "step": 5214500 }, { "epoch": 35.28989822433954, "grad_norm": 0.39968201518058777, "learning_rate": 4.647101017756605e-05, "loss": 0.3588, "step": 5215000 }, { "epoch": 35.29328172368991, "grad_norm": 0.42220667004585266, "learning_rate": 4.647067182763101e-05, "loss": 0.3607, "step": 5215500 }, { "epoch": 35.29666522304028, "grad_norm": 0.41770124435424805, "learning_rate": 4.647033347769598e-05, "loss": 0.359, "step": 5216000 }, { "epoch": 35.30004872239064, "grad_norm": 0.3415060043334961, "learning_rate": 4.646999512776094e-05, "loss": 0.3597, "step": 5216500 }, { "epoch": 35.30343222174101, "grad_norm": 0.3768995404243469, "learning_rate": 4.6469656777825896e-05, "loss": 0.3587, "step": 5217000 }, { "epoch": 35.30681572109138, "grad_norm": 0.368702232837677, "learning_rate": 4.6469318427890865e-05, "loss": 0.3599, "step": 5217500 }, { "epoch": 35.31019922044175, "grad_norm": 0.3856721520423889, "learning_rate": 4.646898007795583e-05, "loss": 0.3584, "step": 5218000 }, { "epoch": 35.313582719792116, "grad_norm": 0.3629095256328583, "learning_rate": 4.646864172802079e-05, "loss": 0.3582, "step": 5218500 }, { "epoch": 35.316966219142486, "grad_norm": 0.3303627669811249, "learning_rate": 4.646830337808575e-05, "loss": 0.3582, "step": 5219000 }, { "epoch": 35.32034971849286, "grad_norm": 0.4104846715927124, "learning_rate": 4.646796502815072e-05, "loss": 0.3597, "step": 5219500 }, { "epoch": 35.32373321784322, "grad_norm": 0.3870714604854584, "learning_rate": 4.646762667821568e-05, "loss": 0.3591, "step": 5220000 }, { "epoch": 35.32711671719359, "grad_norm": 0.38889339566230774, "learning_rate": 4.6467288328280645e-05, "loss": 0.3587, "step": 5220500 }, { "epoch": 35.33050021654396, "grad_norm": 0.38608211278915405, "learning_rate": 4.646694997834561e-05, "loss": 0.36, "step": 5221000 }, { "epoch": 35.33388371589432, "grad_norm": 0.38808754086494446, "learning_rate": 4.646661162841057e-05, "loss": 0.3586, "step": 5221500 }, { "epoch": 35.337267215244694, "grad_norm": 0.39795616269111633, "learning_rate": 4.646627327847553e-05, "loss": 0.3598, "step": 5222000 }, { "epoch": 35.340650714595064, "grad_norm": 0.4353151023387909, "learning_rate": 4.646593492854049e-05, "loss": 0.3586, "step": 5222500 }, { "epoch": 35.344034213945434, "grad_norm": 0.36362794041633606, "learning_rate": 4.6465596578605455e-05, "loss": 0.3607, "step": 5223000 }, { "epoch": 35.3474177132958, "grad_norm": 0.4323992431163788, "learning_rate": 4.6465258228670424e-05, "loss": 0.3567, "step": 5223500 }, { "epoch": 35.35080121264617, "grad_norm": 0.3503703773021698, "learning_rate": 4.6464919878735386e-05, "loss": 0.3584, "step": 5224000 }, { "epoch": 35.35418471199654, "grad_norm": 0.4126063585281372, "learning_rate": 4.646458152880035e-05, "loss": 0.3594, "step": 5224500 }, { "epoch": 35.3575682113469, "grad_norm": 0.3662908375263214, "learning_rate": 4.646424317886531e-05, "loss": 0.3593, "step": 5225000 }, { "epoch": 35.36095171069727, "grad_norm": 0.37647882103919983, "learning_rate": 4.646390482893028e-05, "loss": 0.3601, "step": 5225500 }, { "epoch": 35.36433521004764, "grad_norm": 0.3930198550224304, "learning_rate": 4.646356647899524e-05, "loss": 0.3596, "step": 5226000 }, { "epoch": 35.367718709398005, "grad_norm": 0.36636117100715637, "learning_rate": 4.64632281290602e-05, "loss": 0.359, "step": 5226500 }, { "epoch": 35.371102208748376, "grad_norm": 0.37644514441490173, "learning_rate": 4.6462889779125166e-05, "loss": 0.3588, "step": 5227000 }, { "epoch": 35.374485708098746, "grad_norm": 0.41436415910720825, "learning_rate": 4.646255142919013e-05, "loss": 0.3592, "step": 5227500 }, { "epoch": 35.37786920744911, "grad_norm": 0.3898829221725464, "learning_rate": 4.646221307925509e-05, "loss": 0.3584, "step": 5228000 }, { "epoch": 35.38125270679948, "grad_norm": 0.3516451120376587, "learning_rate": 4.646187472932005e-05, "loss": 0.36, "step": 5228500 }, { "epoch": 35.38463620614985, "grad_norm": 0.35585659742355347, "learning_rate": 4.646153637938502e-05, "loss": 0.359, "step": 5229000 }, { "epoch": 35.38801970550022, "grad_norm": 0.3721868097782135, "learning_rate": 4.646119802944998e-05, "loss": 0.3583, "step": 5229500 }, { "epoch": 35.39140320485058, "grad_norm": 0.3831140100955963, "learning_rate": 4.6460859679514945e-05, "loss": 0.3576, "step": 5230000 }, { "epoch": 35.39478670420095, "grad_norm": 0.36397331953048706, "learning_rate": 4.646052132957991e-05, "loss": 0.3586, "step": 5230500 }, { "epoch": 35.398170203551324, "grad_norm": 0.40902072191238403, "learning_rate": 4.646018297964487e-05, "loss": 0.3595, "step": 5231000 }, { "epoch": 35.40155370290169, "grad_norm": 0.38793542981147766, "learning_rate": 4.645984462970983e-05, "loss": 0.3594, "step": 5231500 }, { "epoch": 35.40493720225206, "grad_norm": 0.37250202894210815, "learning_rate": 4.6459506279774794e-05, "loss": 0.3598, "step": 5232000 }, { "epoch": 35.40832070160243, "grad_norm": 0.3919159471988678, "learning_rate": 4.6459167929839756e-05, "loss": 0.3595, "step": 5232500 }, { "epoch": 35.41170420095279, "grad_norm": 0.4006628394126892, "learning_rate": 4.6458829579904725e-05, "loss": 0.3593, "step": 5233000 }, { "epoch": 35.41508770030316, "grad_norm": 0.381758451461792, "learning_rate": 4.645849122996969e-05, "loss": 0.3578, "step": 5233500 }, { "epoch": 35.41847119965353, "grad_norm": 0.3751373291015625, "learning_rate": 4.645815288003465e-05, "loss": 0.3591, "step": 5234000 }, { "epoch": 35.421854699003894, "grad_norm": 0.3407689929008484, "learning_rate": 4.645781453009961e-05, "loss": 0.3588, "step": 5234500 }, { "epoch": 35.425238198354265, "grad_norm": 0.4218127727508545, "learning_rate": 4.645747618016458e-05, "loss": 0.3574, "step": 5235000 }, { "epoch": 35.428621697704635, "grad_norm": 0.4281761348247528, "learning_rate": 4.645713783022954e-05, "loss": 0.3586, "step": 5235500 }, { "epoch": 35.432005197055005, "grad_norm": 0.41469109058380127, "learning_rate": 4.64567994802945e-05, "loss": 0.3564, "step": 5236000 }, { "epoch": 35.43538869640537, "grad_norm": 0.38898664712905884, "learning_rate": 4.6456461130359466e-05, "loss": 0.3603, "step": 5236500 }, { "epoch": 35.43877219575574, "grad_norm": 0.3658848702907562, "learning_rate": 4.645612278042443e-05, "loss": 0.3579, "step": 5237000 }, { "epoch": 35.44215569510611, "grad_norm": 0.41348376870155334, "learning_rate": 4.645578443048939e-05, "loss": 0.3591, "step": 5237500 }, { "epoch": 35.44553919445647, "grad_norm": 0.40406206250190735, "learning_rate": 4.645544608055435e-05, "loss": 0.3574, "step": 5238000 }, { "epoch": 35.44892269380684, "grad_norm": 0.39499127864837646, "learning_rate": 4.6455107730619315e-05, "loss": 0.3583, "step": 5238500 }, { "epoch": 35.45230619315721, "grad_norm": 0.36872249841690063, "learning_rate": 4.6454769380684284e-05, "loss": 0.3603, "step": 5239000 }, { "epoch": 35.455689692507576, "grad_norm": 0.37547048926353455, "learning_rate": 4.6454431030749246e-05, "loss": 0.3595, "step": 5239500 }, { "epoch": 35.45907319185795, "grad_norm": 0.4177234172821045, "learning_rate": 4.645409268081421e-05, "loss": 0.3585, "step": 5240000 }, { "epoch": 35.46245669120832, "grad_norm": 0.3927750587463379, "learning_rate": 4.645375433087917e-05, "loss": 0.3606, "step": 5240500 }, { "epoch": 35.46584019055868, "grad_norm": 0.3455972969532013, "learning_rate": 4.645341598094413e-05, "loss": 0.3582, "step": 5241000 }, { "epoch": 35.46922368990905, "grad_norm": 0.3680725395679474, "learning_rate": 4.6453077631009094e-05, "loss": 0.3592, "step": 5241500 }, { "epoch": 35.47260718925942, "grad_norm": 0.41897982358932495, "learning_rate": 4.6452739281074057e-05, "loss": 0.3595, "step": 5242000 }, { "epoch": 35.47599068860979, "grad_norm": 0.3519628345966339, "learning_rate": 4.6452400931139025e-05, "loss": 0.3597, "step": 5242500 }, { "epoch": 35.479374187960154, "grad_norm": 0.39874663949012756, "learning_rate": 4.645206258120399e-05, "loss": 0.3591, "step": 5243000 }, { "epoch": 35.482757687310524, "grad_norm": 0.41478675603866577, "learning_rate": 4.645172423126895e-05, "loss": 0.3575, "step": 5243500 }, { "epoch": 35.486141186660895, "grad_norm": 0.36475494503974915, "learning_rate": 4.645138588133391e-05, "loss": 0.3595, "step": 5244000 }, { "epoch": 35.48952468601126, "grad_norm": 0.41342654824256897, "learning_rate": 4.645104753139888e-05, "loss": 0.3568, "step": 5244500 }, { "epoch": 35.49290818536163, "grad_norm": 0.3389993906021118, "learning_rate": 4.645070918146384e-05, "loss": 0.3574, "step": 5245000 }, { "epoch": 35.496291684712, "grad_norm": 0.3895016312599182, "learning_rate": 4.64503708315288e-05, "loss": 0.3582, "step": 5245500 }, { "epoch": 35.49967518406236, "grad_norm": 0.4148198962211609, "learning_rate": 4.645003248159377e-05, "loss": 0.3615, "step": 5246000 }, { "epoch": 35.50305868341273, "grad_norm": 0.36932939291000366, "learning_rate": 4.644969413165873e-05, "loss": 0.3585, "step": 5246500 }, { "epoch": 35.5064421827631, "grad_norm": 0.38193395733833313, "learning_rate": 4.644935578172369e-05, "loss": 0.3591, "step": 5247000 }, { "epoch": 35.50982568211347, "grad_norm": 0.3812614977359772, "learning_rate": 4.6449017431788653e-05, "loss": 0.358, "step": 5247500 }, { "epoch": 35.513209181463836, "grad_norm": 0.38990533351898193, "learning_rate": 4.6448679081853616e-05, "loss": 0.3592, "step": 5248000 }, { "epoch": 35.516592680814206, "grad_norm": 0.3911738693714142, "learning_rate": 4.6448340731918584e-05, "loss": 0.3587, "step": 5248500 }, { "epoch": 35.519976180164576, "grad_norm": 0.3662065863609314, "learning_rate": 4.6448002381983547e-05, "loss": 0.3584, "step": 5249000 }, { "epoch": 35.52335967951494, "grad_norm": 0.4016791880130768, "learning_rate": 4.644766403204851e-05, "loss": 0.3604, "step": 5249500 }, { "epoch": 35.52674317886531, "grad_norm": 0.39107978343963623, "learning_rate": 4.644732568211347e-05, "loss": 0.3589, "step": 5250000 }, { "epoch": 35.53012667821568, "grad_norm": 0.3511251211166382, "learning_rate": 4.644698733217843e-05, "loss": 0.359, "step": 5250500 }, { "epoch": 35.53351017756604, "grad_norm": 0.34765365719795227, "learning_rate": 4.6446648982243395e-05, "loss": 0.3596, "step": 5251000 }, { "epoch": 35.536893676916414, "grad_norm": 0.409365177154541, "learning_rate": 4.644631063230836e-05, "loss": 0.36, "step": 5251500 }, { "epoch": 35.540277176266784, "grad_norm": 0.3806619644165039, "learning_rate": 4.6445972282373326e-05, "loss": 0.3586, "step": 5252000 }, { "epoch": 35.54366067561715, "grad_norm": 0.3907391428947449, "learning_rate": 4.644563393243829e-05, "loss": 0.3592, "step": 5252500 }, { "epoch": 35.54704417496752, "grad_norm": 0.3842097222805023, "learning_rate": 4.644529558250325e-05, "loss": 0.3582, "step": 5253000 }, { "epoch": 35.55042767431789, "grad_norm": 0.35678067803382874, "learning_rate": 4.644495723256821e-05, "loss": 0.3587, "step": 5253500 }, { "epoch": 35.55381117366826, "grad_norm": 0.4046052098274231, "learning_rate": 4.644461888263318e-05, "loss": 0.3587, "step": 5254000 }, { "epoch": 35.55719467301862, "grad_norm": 0.369022011756897, "learning_rate": 4.6444280532698143e-05, "loss": 0.3589, "step": 5254500 }, { "epoch": 35.56057817236899, "grad_norm": 0.44715407490730286, "learning_rate": 4.64439421827631e-05, "loss": 0.3593, "step": 5255000 }, { "epoch": 35.56396167171936, "grad_norm": 0.4040326476097107, "learning_rate": 4.644360383282806e-05, "loss": 0.3587, "step": 5255500 }, { "epoch": 35.567345171069725, "grad_norm": 0.41204652190208435, "learning_rate": 4.644326548289303e-05, "loss": 0.358, "step": 5256000 }, { "epoch": 35.570728670420095, "grad_norm": 0.37181586027145386, "learning_rate": 4.644292713295799e-05, "loss": 0.3605, "step": 5256500 }, { "epoch": 35.574112169770466, "grad_norm": 0.3742663562297821, "learning_rate": 4.6442588783022954e-05, "loss": 0.3584, "step": 5257000 }, { "epoch": 35.57749566912083, "grad_norm": 0.3441641926765442, "learning_rate": 4.6442250433087916e-05, "loss": 0.3592, "step": 5257500 }, { "epoch": 35.5808791684712, "grad_norm": 0.3913019895553589, "learning_rate": 4.6441912083152885e-05, "loss": 0.359, "step": 5258000 }, { "epoch": 35.58426266782157, "grad_norm": 0.36842718720436096, "learning_rate": 4.644157373321785e-05, "loss": 0.3594, "step": 5258500 }, { "epoch": 35.58764616717193, "grad_norm": 0.3861198425292969, "learning_rate": 4.644123538328281e-05, "loss": 0.3593, "step": 5259000 }, { "epoch": 35.5910296665223, "grad_norm": 0.3628122806549072, "learning_rate": 4.644089703334777e-05, "loss": 0.359, "step": 5259500 }, { "epoch": 35.59441316587267, "grad_norm": 0.3801637589931488, "learning_rate": 4.6440558683412734e-05, "loss": 0.3593, "step": 5260000 }, { "epoch": 35.59779666522304, "grad_norm": 0.397336483001709, "learning_rate": 4.6440220333477696e-05, "loss": 0.3588, "step": 5260500 }, { "epoch": 35.60118016457341, "grad_norm": 0.3856032192707062, "learning_rate": 4.643988198354266e-05, "loss": 0.3579, "step": 5261000 }, { "epoch": 35.60456366392378, "grad_norm": 0.39578643441200256, "learning_rate": 4.643954363360763e-05, "loss": 0.3597, "step": 5261500 }, { "epoch": 35.60794716327415, "grad_norm": 0.3347266614437103, "learning_rate": 4.643920528367259e-05, "loss": 0.3588, "step": 5262000 }, { "epoch": 35.61133066262451, "grad_norm": 0.40181776881217957, "learning_rate": 4.643886693373755e-05, "loss": 0.3587, "step": 5262500 }, { "epoch": 35.61471416197488, "grad_norm": 0.3796705901622772, "learning_rate": 4.643852858380251e-05, "loss": 0.3592, "step": 5263000 }, { "epoch": 35.61809766132525, "grad_norm": 0.44177448749542236, "learning_rate": 4.643819023386748e-05, "loss": 0.3579, "step": 5263500 }, { "epoch": 35.621481160675614, "grad_norm": 0.3485350012779236, "learning_rate": 4.6437851883932444e-05, "loss": 0.3593, "step": 5264000 }, { "epoch": 35.624864660025985, "grad_norm": 0.37154629826545715, "learning_rate": 4.64375135339974e-05, "loss": 0.3594, "step": 5264500 }, { "epoch": 35.628248159376355, "grad_norm": 0.3512772023677826, "learning_rate": 4.643717518406236e-05, "loss": 0.3592, "step": 5265000 }, { "epoch": 35.63163165872672, "grad_norm": 0.38935092091560364, "learning_rate": 4.643683683412733e-05, "loss": 0.3589, "step": 5265500 }, { "epoch": 35.63501515807709, "grad_norm": 0.37523671984672546, "learning_rate": 4.643649848419229e-05, "loss": 0.3604, "step": 5266000 }, { "epoch": 35.63839865742746, "grad_norm": 0.380526065826416, "learning_rate": 4.6436160134257255e-05, "loss": 0.3591, "step": 5266500 }, { "epoch": 35.64178215677783, "grad_norm": 0.39485201239585876, "learning_rate": 4.643582178432222e-05, "loss": 0.3587, "step": 5267000 }, { "epoch": 35.64516565612819, "grad_norm": 0.3372666537761688, "learning_rate": 4.6435483434387186e-05, "loss": 0.359, "step": 5267500 }, { "epoch": 35.64854915547856, "grad_norm": 0.4184510409832001, "learning_rate": 4.643514508445215e-05, "loss": 0.3593, "step": 5268000 }, { "epoch": 35.65193265482893, "grad_norm": 0.4210392236709595, "learning_rate": 4.643480673451711e-05, "loss": 0.3602, "step": 5268500 }, { "epoch": 35.655316154179296, "grad_norm": 0.361260324716568, "learning_rate": 4.643446838458207e-05, "loss": 0.3608, "step": 5269000 }, { "epoch": 35.658699653529666, "grad_norm": 0.39221853017807007, "learning_rate": 4.6434130034647034e-05, "loss": 0.3585, "step": 5269500 }, { "epoch": 35.66208315288004, "grad_norm": 0.39438000321388245, "learning_rate": 4.6433791684711996e-05, "loss": 0.3592, "step": 5270000 }, { "epoch": 35.6654666522304, "grad_norm": 0.38537243008613586, "learning_rate": 4.643345333477696e-05, "loss": 0.3579, "step": 5270500 }, { "epoch": 35.66885015158077, "grad_norm": 0.36047154664993286, "learning_rate": 4.643311498484193e-05, "loss": 0.359, "step": 5271000 }, { "epoch": 35.67223365093114, "grad_norm": 0.37812140583992004, "learning_rate": 4.643277663490689e-05, "loss": 0.36, "step": 5271500 }, { "epoch": 35.67561715028151, "grad_norm": 0.3501070439815521, "learning_rate": 4.643243828497185e-05, "loss": 0.3592, "step": 5272000 }, { "epoch": 35.679000649631874, "grad_norm": 0.4167449474334717, "learning_rate": 4.6432099935036814e-05, "loss": 0.3594, "step": 5272500 }, { "epoch": 35.682384148982244, "grad_norm": 0.3701885938644409, "learning_rate": 4.643176158510178e-05, "loss": 0.3586, "step": 5273000 }, { "epoch": 35.685767648332614, "grad_norm": 0.3991261124610901, "learning_rate": 4.6431423235166745e-05, "loss": 0.3577, "step": 5273500 }, { "epoch": 35.68915114768298, "grad_norm": 0.37794020771980286, "learning_rate": 4.64310848852317e-05, "loss": 0.3587, "step": 5274000 }, { "epoch": 35.69253464703335, "grad_norm": 0.3513682782649994, "learning_rate": 4.643074653529666e-05, "loss": 0.3592, "step": 5274500 }, { "epoch": 35.69591814638372, "grad_norm": 0.3790803551673889, "learning_rate": 4.643040818536163e-05, "loss": 0.359, "step": 5275000 }, { "epoch": 35.69930164573408, "grad_norm": 0.40094253420829773, "learning_rate": 4.643006983542659e-05, "loss": 0.3589, "step": 5275500 }, { "epoch": 35.70268514508445, "grad_norm": 0.3767148554325104, "learning_rate": 4.6429731485491555e-05, "loss": 0.3577, "step": 5276000 }, { "epoch": 35.70606864443482, "grad_norm": 0.42464911937713623, "learning_rate": 4.642939313555652e-05, "loss": 0.3596, "step": 5276500 }, { "epoch": 35.709452143785185, "grad_norm": 0.36401018500328064, "learning_rate": 4.6429054785621486e-05, "loss": 0.3598, "step": 5277000 }, { "epoch": 35.712835643135556, "grad_norm": 0.3809833526611328, "learning_rate": 4.642871643568645e-05, "loss": 0.359, "step": 5277500 }, { "epoch": 35.716219142485926, "grad_norm": 0.41787412762641907, "learning_rate": 4.642837808575141e-05, "loss": 0.3604, "step": 5278000 }, { "epoch": 35.719602641836296, "grad_norm": 0.4240560233592987, "learning_rate": 4.642803973581637e-05, "loss": 0.3591, "step": 5278500 }, { "epoch": 35.72298614118666, "grad_norm": 0.3682841658592224, "learning_rate": 4.6427701385881335e-05, "loss": 0.3585, "step": 5279000 }, { "epoch": 35.72636964053703, "grad_norm": 0.34984299540519714, "learning_rate": 4.64273630359463e-05, "loss": 0.3585, "step": 5279500 }, { "epoch": 35.7297531398874, "grad_norm": 0.3614426255226135, "learning_rate": 4.642702468601126e-05, "loss": 0.3577, "step": 5280000 }, { "epoch": 35.73313663923776, "grad_norm": 0.3550345301628113, "learning_rate": 4.642668633607623e-05, "loss": 0.3587, "step": 5280500 }, { "epoch": 35.73652013858813, "grad_norm": 0.43409860134124756, "learning_rate": 4.642634798614119e-05, "loss": 0.359, "step": 5281000 }, { "epoch": 35.739903637938504, "grad_norm": 0.42521485686302185, "learning_rate": 4.642600963620615e-05, "loss": 0.3603, "step": 5281500 }, { "epoch": 35.74328713728887, "grad_norm": 0.4124901592731476, "learning_rate": 4.6425671286271114e-05, "loss": 0.3591, "step": 5282000 }, { "epoch": 35.74667063663924, "grad_norm": 0.38093918561935425, "learning_rate": 4.642533293633608e-05, "loss": 0.36, "step": 5282500 }, { "epoch": 35.75005413598961, "grad_norm": 0.3848240077495575, "learning_rate": 4.6424994586401045e-05, "loss": 0.3587, "step": 5283000 }, { "epoch": 35.75343763533997, "grad_norm": 0.3985755145549774, "learning_rate": 4.642465623646601e-05, "loss": 0.3608, "step": 5283500 }, { "epoch": 35.75682113469034, "grad_norm": 0.37025782465934753, "learning_rate": 4.642431788653096e-05, "loss": 0.3596, "step": 5284000 }, { "epoch": 35.76020463404071, "grad_norm": 0.33689063787460327, "learning_rate": 4.642397953659593e-05, "loss": 0.3589, "step": 5284500 }, { "epoch": 35.76358813339108, "grad_norm": 0.3519691228866577, "learning_rate": 4.6423641186660894e-05, "loss": 0.3598, "step": 5285000 }, { "epoch": 35.766971632741445, "grad_norm": 0.37017616629600525, "learning_rate": 4.6423302836725856e-05, "loss": 0.3591, "step": 5285500 }, { "epoch": 35.770355132091815, "grad_norm": 0.3611086308956146, "learning_rate": 4.642296448679082e-05, "loss": 0.3571, "step": 5286000 }, { "epoch": 35.773738631442185, "grad_norm": 0.3775934875011444, "learning_rate": 4.642262613685579e-05, "loss": 0.3588, "step": 5286500 }, { "epoch": 35.77712213079255, "grad_norm": 0.4065438508987427, "learning_rate": 4.642228778692075e-05, "loss": 0.36, "step": 5287000 }, { "epoch": 35.78050563014292, "grad_norm": 0.4096687436103821, "learning_rate": 4.642194943698571e-05, "loss": 0.3588, "step": 5287500 }, { "epoch": 35.78388912949329, "grad_norm": 0.4001534879207611, "learning_rate": 4.642161108705067e-05, "loss": 0.3582, "step": 5288000 }, { "epoch": 35.78727262884365, "grad_norm": 0.35695651173591614, "learning_rate": 4.6421272737115635e-05, "loss": 0.3597, "step": 5288500 }, { "epoch": 35.79065612819402, "grad_norm": 0.3913804590702057, "learning_rate": 4.64209343871806e-05, "loss": 0.3607, "step": 5289000 }, { "epoch": 35.79403962754439, "grad_norm": 0.36949577927589417, "learning_rate": 4.642059603724556e-05, "loss": 0.3586, "step": 5289500 }, { "epoch": 35.797423126894756, "grad_norm": 0.3722244203090668, "learning_rate": 4.642025768731053e-05, "loss": 0.3581, "step": 5290000 }, { "epoch": 35.80080662624513, "grad_norm": 0.37984737753868103, "learning_rate": 4.641991933737549e-05, "loss": 0.3597, "step": 5290500 }, { "epoch": 35.8041901255955, "grad_norm": 0.4076242744922638, "learning_rate": 4.641958098744045e-05, "loss": 0.3593, "step": 5291000 }, { "epoch": 35.80757362494587, "grad_norm": 0.4038795530796051, "learning_rate": 4.6419242637505415e-05, "loss": 0.3595, "step": 5291500 }, { "epoch": 35.81095712429623, "grad_norm": 0.3721678853034973, "learning_rate": 4.6418904287570384e-05, "loss": 0.3597, "step": 5292000 }, { "epoch": 35.8143406236466, "grad_norm": 0.4363827407360077, "learning_rate": 4.6418565937635346e-05, "loss": 0.3588, "step": 5292500 }, { "epoch": 35.81772412299697, "grad_norm": 0.3978166878223419, "learning_rate": 4.641822758770031e-05, "loss": 0.358, "step": 5293000 }, { "epoch": 35.821107622347334, "grad_norm": 0.39794260263442993, "learning_rate": 4.6417889237765263e-05, "loss": 0.3594, "step": 5293500 }, { "epoch": 35.824491121697704, "grad_norm": 0.35563987493515015, "learning_rate": 4.641755088783023e-05, "loss": 0.3593, "step": 5294000 }, { "epoch": 35.827874621048075, "grad_norm": 0.3575843572616577, "learning_rate": 4.6417212537895194e-05, "loss": 0.3598, "step": 5294500 }, { "epoch": 35.83125812039844, "grad_norm": 0.42469942569732666, "learning_rate": 4.6416874187960157e-05, "loss": 0.3583, "step": 5295000 }, { "epoch": 35.83464161974881, "grad_norm": 0.34596970677375793, "learning_rate": 4.641653583802512e-05, "loss": 0.3593, "step": 5295500 }, { "epoch": 35.83802511909918, "grad_norm": 0.40347158908843994, "learning_rate": 4.641619748809009e-05, "loss": 0.3588, "step": 5296000 }, { "epoch": 35.84140861844955, "grad_norm": 0.3695710599422455, "learning_rate": 4.641585913815505e-05, "loss": 0.3583, "step": 5296500 }, { "epoch": 35.84479211779991, "grad_norm": 0.3977468013763428, "learning_rate": 4.641552078822001e-05, "loss": 0.3582, "step": 5297000 }, { "epoch": 35.84817561715028, "grad_norm": 0.39114922285079956, "learning_rate": 4.6415182438284974e-05, "loss": 0.3597, "step": 5297500 }, { "epoch": 35.85155911650065, "grad_norm": 0.38786453008651733, "learning_rate": 4.6414844088349936e-05, "loss": 0.3584, "step": 5298000 }, { "epoch": 35.854942615851016, "grad_norm": 0.35438981652259827, "learning_rate": 4.64145057384149e-05, "loss": 0.3589, "step": 5298500 }, { "epoch": 35.858326115201386, "grad_norm": 0.36184030771255493, "learning_rate": 4.641416738847986e-05, "loss": 0.3601, "step": 5299000 }, { "epoch": 35.861709614551756, "grad_norm": 0.4410672187805176, "learning_rate": 4.641382903854483e-05, "loss": 0.3593, "step": 5299500 }, { "epoch": 35.86509311390212, "grad_norm": 0.40411484241485596, "learning_rate": 4.641349068860979e-05, "loss": 0.3589, "step": 5300000 }, { "epoch": 35.86847661325249, "grad_norm": 0.3632428050041199, "learning_rate": 4.6413152338674753e-05, "loss": 0.3584, "step": 5300500 }, { "epoch": 35.87186011260286, "grad_norm": 0.39821600914001465, "learning_rate": 4.6412813988739716e-05, "loss": 0.3572, "step": 5301000 }, { "epoch": 35.87524361195322, "grad_norm": 0.40248432755470276, "learning_rate": 4.641247563880468e-05, "loss": 0.3601, "step": 5301500 }, { "epoch": 35.878627111303594, "grad_norm": 0.3401845097541809, "learning_rate": 4.641213728886965e-05, "loss": 0.3577, "step": 5302000 }, { "epoch": 35.882010610653964, "grad_norm": 0.36157751083374023, "learning_rate": 4.641179893893461e-05, "loss": 0.3606, "step": 5302500 }, { "epoch": 35.885394110004334, "grad_norm": 0.40573593974113464, "learning_rate": 4.6411460588999564e-05, "loss": 0.3584, "step": 5303000 }, { "epoch": 35.8887776093547, "grad_norm": 0.38465699553489685, "learning_rate": 4.641112223906453e-05, "loss": 0.3598, "step": 5303500 }, { "epoch": 35.89216110870507, "grad_norm": 0.38376665115356445, "learning_rate": 4.6410783889129495e-05, "loss": 0.3604, "step": 5304000 }, { "epoch": 35.89554460805544, "grad_norm": 0.3362913727760315, "learning_rate": 4.641044553919446e-05, "loss": 0.3582, "step": 5304500 }, { "epoch": 35.8989281074058, "grad_norm": 0.3604910373687744, "learning_rate": 4.641010718925942e-05, "loss": 0.3593, "step": 5305000 }, { "epoch": 35.90231160675617, "grad_norm": 0.35981470346450806, "learning_rate": 4.640976883932439e-05, "loss": 0.3596, "step": 5305500 }, { "epoch": 35.90569510610654, "grad_norm": 0.35935699939727783, "learning_rate": 4.640943048938935e-05, "loss": 0.3608, "step": 5306000 }, { "epoch": 35.909078605456905, "grad_norm": 0.3535047173500061, "learning_rate": 4.640909213945431e-05, "loss": 0.3597, "step": 5306500 }, { "epoch": 35.912462104807275, "grad_norm": 0.42171168327331543, "learning_rate": 4.6408753789519275e-05, "loss": 0.3607, "step": 5307000 }, { "epoch": 35.915845604157646, "grad_norm": 0.4060705006122589, "learning_rate": 4.640841543958424e-05, "loss": 0.3605, "step": 5307500 }, { "epoch": 35.91922910350801, "grad_norm": 0.3892277777194977, "learning_rate": 4.64080770896492e-05, "loss": 0.3601, "step": 5308000 }, { "epoch": 35.92261260285838, "grad_norm": 0.43026089668273926, "learning_rate": 4.640773873971416e-05, "loss": 0.3593, "step": 5308500 }, { "epoch": 35.92599610220875, "grad_norm": 0.3809777796268463, "learning_rate": 4.640740038977913e-05, "loss": 0.3587, "step": 5309000 }, { "epoch": 35.92937960155912, "grad_norm": 0.36120370030403137, "learning_rate": 4.640706203984409e-05, "loss": 0.3588, "step": 5309500 }, { "epoch": 35.93276310090948, "grad_norm": 0.376757949590683, "learning_rate": 4.6406723689909054e-05, "loss": 0.3593, "step": 5310000 }, { "epoch": 35.93614660025985, "grad_norm": 0.329667866230011, "learning_rate": 4.6406385339974016e-05, "loss": 0.3583, "step": 5310500 }, { "epoch": 35.939530099610224, "grad_norm": 0.3501999080181122, "learning_rate": 4.640604699003898e-05, "loss": 0.3589, "step": 5311000 }, { "epoch": 35.94291359896059, "grad_norm": 0.3717506527900696, "learning_rate": 4.640570864010395e-05, "loss": 0.3592, "step": 5311500 }, { "epoch": 35.94629709831096, "grad_norm": 0.39163535833358765, "learning_rate": 4.640537029016891e-05, "loss": 0.3596, "step": 5312000 }, { "epoch": 35.94968059766133, "grad_norm": 0.3506573736667633, "learning_rate": 4.6405031940233865e-05, "loss": 0.3594, "step": 5312500 }, { "epoch": 35.95306409701169, "grad_norm": 0.3836919367313385, "learning_rate": 4.6404693590298834e-05, "loss": 0.3584, "step": 5313000 }, { "epoch": 35.95644759636206, "grad_norm": 0.41102489829063416, "learning_rate": 4.6404355240363796e-05, "loss": 0.3593, "step": 5313500 }, { "epoch": 35.95983109571243, "grad_norm": 0.3786611557006836, "learning_rate": 4.640401689042876e-05, "loss": 0.3595, "step": 5314000 }, { "epoch": 35.963214595062794, "grad_norm": 0.38707447052001953, "learning_rate": 4.640367854049372e-05, "loss": 0.3592, "step": 5314500 }, { "epoch": 35.966598094413165, "grad_norm": 0.3671312928199768, "learning_rate": 4.640334019055869e-05, "loss": 0.3592, "step": 5315000 }, { "epoch": 35.969981593763535, "grad_norm": 0.3961714208126068, "learning_rate": 4.640300184062365e-05, "loss": 0.361, "step": 5315500 }, { "epoch": 35.973365093113905, "grad_norm": 0.38784390687942505, "learning_rate": 4.640266349068861e-05, "loss": 0.3603, "step": 5316000 }, { "epoch": 35.97674859246427, "grad_norm": 0.3855289816856384, "learning_rate": 4.6402325140753575e-05, "loss": 0.3598, "step": 5316500 }, { "epoch": 35.98013209181464, "grad_norm": 0.3643551170825958, "learning_rate": 4.640198679081854e-05, "loss": 0.3586, "step": 5317000 }, { "epoch": 35.98351559116501, "grad_norm": 0.3573073744773865, "learning_rate": 4.64016484408835e-05, "loss": 0.3586, "step": 5317500 }, { "epoch": 35.98689909051537, "grad_norm": 0.4077359139919281, "learning_rate": 4.640131009094846e-05, "loss": 0.3594, "step": 5318000 }, { "epoch": 35.99028258986574, "grad_norm": 0.35591912269592285, "learning_rate": 4.6400971741013424e-05, "loss": 0.3591, "step": 5318500 }, { "epoch": 35.99366608921611, "grad_norm": 0.3844730854034424, "learning_rate": 4.640063339107839e-05, "loss": 0.36, "step": 5319000 }, { "epoch": 35.997049588566476, "grad_norm": 0.407355934381485, "learning_rate": 4.6400295041143355e-05, "loss": 0.3602, "step": 5319500 }, { "epoch": 36.0, "eval_accuracy": 0.8629955463435177, "eval_loss": 0.5554865002632141, "eval_runtime": 3359.5803, "eval_samples_per_second": 86.542, "eval_steps_per_second": 5.409, "step": 5319936 }, { "epoch": 36.000433087916846, "grad_norm": 0.3714601695537567, "learning_rate": 4.639995669120832e-05, "loss": 0.359, "step": 5320000 }, { "epoch": 36.00381658726722, "grad_norm": 0.3737226128578186, "learning_rate": 4.639961834127328e-05, "loss": 0.3562, "step": 5320500 }, { "epoch": 36.00720008661758, "grad_norm": 0.37579116225242615, "learning_rate": 4.639927999133825e-05, "loss": 0.3573, "step": 5321000 }, { "epoch": 36.01058358596795, "grad_norm": 0.3803044855594635, "learning_rate": 4.639894164140321e-05, "loss": 0.3565, "step": 5321500 }, { "epoch": 36.01396708531832, "grad_norm": 0.34731969237327576, "learning_rate": 4.6398603291468165e-05, "loss": 0.358, "step": 5322000 }, { "epoch": 36.01735058466869, "grad_norm": 0.35496339201927185, "learning_rate": 4.6398264941533134e-05, "loss": 0.3559, "step": 5322500 }, { "epoch": 36.020734084019054, "grad_norm": 0.3641916513442993, "learning_rate": 4.6397926591598096e-05, "loss": 0.3561, "step": 5323000 }, { "epoch": 36.024117583369424, "grad_norm": 0.40890011191368103, "learning_rate": 4.639758824166306e-05, "loss": 0.3579, "step": 5323500 }, { "epoch": 36.027501082719795, "grad_norm": 0.36843806505203247, "learning_rate": 4.639724989172802e-05, "loss": 0.3576, "step": 5324000 }, { "epoch": 36.03088458207016, "grad_norm": 0.35895708203315735, "learning_rate": 4.639691154179299e-05, "loss": 0.3556, "step": 5324500 }, { "epoch": 36.03426808142053, "grad_norm": 0.42412081360816956, "learning_rate": 4.639657319185795e-05, "loss": 0.3579, "step": 5325000 }, { "epoch": 36.0376515807709, "grad_norm": 0.3796720802783966, "learning_rate": 4.6396234841922914e-05, "loss": 0.3576, "step": 5325500 }, { "epoch": 36.04103508012126, "grad_norm": 0.3830703794956207, "learning_rate": 4.6395896491987876e-05, "loss": 0.3578, "step": 5326000 }, { "epoch": 36.04441857947163, "grad_norm": 0.3955300450325012, "learning_rate": 4.639555814205284e-05, "loss": 0.3571, "step": 5326500 }, { "epoch": 36.047802078822, "grad_norm": 0.3940010964870453, "learning_rate": 4.63952197921178e-05, "loss": 0.3575, "step": 5327000 }, { "epoch": 36.05118557817237, "grad_norm": 0.37831932306289673, "learning_rate": 4.639488144218276e-05, "loss": 0.3572, "step": 5327500 }, { "epoch": 36.054569077522736, "grad_norm": 0.3522729277610779, "learning_rate": 4.6394543092247724e-05, "loss": 0.3579, "step": 5328000 }, { "epoch": 36.057952576873106, "grad_norm": 0.37316861748695374, "learning_rate": 4.639420474231269e-05, "loss": 0.3567, "step": 5328500 }, { "epoch": 36.061336076223476, "grad_norm": 0.3755626380443573, "learning_rate": 4.6393866392377655e-05, "loss": 0.3572, "step": 5329000 }, { "epoch": 36.06471957557384, "grad_norm": 0.37687817215919495, "learning_rate": 4.639352804244262e-05, "loss": 0.3582, "step": 5329500 }, { "epoch": 36.06810307492421, "grad_norm": 0.36007440090179443, "learning_rate": 4.639318969250758e-05, "loss": 0.3593, "step": 5330000 }, { "epoch": 36.07148657427458, "grad_norm": 0.36718013882637024, "learning_rate": 4.639285134257255e-05, "loss": 0.3583, "step": 5330500 }, { "epoch": 36.07487007362494, "grad_norm": 0.3536832928657532, "learning_rate": 4.639251299263751e-05, "loss": 0.3578, "step": 5331000 }, { "epoch": 36.07825357297531, "grad_norm": 0.3785749673843384, "learning_rate": 4.6392174642702466e-05, "loss": 0.3584, "step": 5331500 }, { "epoch": 36.081637072325684, "grad_norm": 0.366966187953949, "learning_rate": 4.6391836292767435e-05, "loss": 0.359, "step": 5332000 }, { "epoch": 36.08502057167605, "grad_norm": 0.4064937233924866, "learning_rate": 4.63914979428324e-05, "loss": 0.3577, "step": 5332500 }, { "epoch": 36.08840407102642, "grad_norm": 0.381213515996933, "learning_rate": 4.639115959289736e-05, "loss": 0.3583, "step": 5333000 }, { "epoch": 36.09178757037679, "grad_norm": 0.39870741963386536, "learning_rate": 4.639082124296232e-05, "loss": 0.3579, "step": 5333500 }, { "epoch": 36.09517106972716, "grad_norm": 0.37169355154037476, "learning_rate": 4.639048289302729e-05, "loss": 0.3597, "step": 5334000 }, { "epoch": 36.09855456907752, "grad_norm": 0.4086398482322693, "learning_rate": 4.639014454309225e-05, "loss": 0.3578, "step": 5334500 }, { "epoch": 36.10193806842789, "grad_norm": 0.4269033670425415, "learning_rate": 4.6389806193157214e-05, "loss": 0.3579, "step": 5335000 }, { "epoch": 36.10532156777826, "grad_norm": 0.38613566756248474, "learning_rate": 4.6389467843222177e-05, "loss": 0.3586, "step": 5335500 }, { "epoch": 36.108705067128625, "grad_norm": 0.3657166361808777, "learning_rate": 4.638912949328714e-05, "loss": 0.3575, "step": 5336000 }, { "epoch": 36.112088566478995, "grad_norm": 0.4097485840320587, "learning_rate": 4.63887911433521e-05, "loss": 0.3568, "step": 5336500 }, { "epoch": 36.115472065829366, "grad_norm": 0.3976365327835083, "learning_rate": 4.638845279341706e-05, "loss": 0.3592, "step": 5337000 }, { "epoch": 36.11885556517973, "grad_norm": 0.3897380530834198, "learning_rate": 4.6388114443482025e-05, "loss": 0.3595, "step": 5337500 }, { "epoch": 36.1222390645301, "grad_norm": 0.35844093561172485, "learning_rate": 4.6387776093546994e-05, "loss": 0.3566, "step": 5338000 }, { "epoch": 36.12562256388047, "grad_norm": 0.4127728044986725, "learning_rate": 4.6387437743611956e-05, "loss": 0.3583, "step": 5338500 }, { "epoch": 36.12900606323083, "grad_norm": 0.37532439827919006, "learning_rate": 4.638709939367692e-05, "loss": 0.3593, "step": 5339000 }, { "epoch": 36.1323895625812, "grad_norm": 0.3638269901275635, "learning_rate": 4.638676104374188e-05, "loss": 0.3581, "step": 5339500 }, { "epoch": 36.13577306193157, "grad_norm": 0.40364179015159607, "learning_rate": 4.638642269380685e-05, "loss": 0.3586, "step": 5340000 }, { "epoch": 36.13915656128194, "grad_norm": 0.38234543800354004, "learning_rate": 4.638608434387181e-05, "loss": 0.3577, "step": 5340500 }, { "epoch": 36.14254006063231, "grad_norm": 0.4303121566772461, "learning_rate": 4.638574599393677e-05, "loss": 0.3589, "step": 5341000 }, { "epoch": 36.14592355998268, "grad_norm": 0.39878717064857483, "learning_rate": 4.6385407644001736e-05, "loss": 0.358, "step": 5341500 }, { "epoch": 36.14930705933305, "grad_norm": 0.3977298438549042, "learning_rate": 4.63850692940667e-05, "loss": 0.358, "step": 5342000 }, { "epoch": 36.15269055868341, "grad_norm": 0.3616574704647064, "learning_rate": 4.638473094413166e-05, "loss": 0.3591, "step": 5342500 }, { "epoch": 36.15607405803378, "grad_norm": 0.34833091497421265, "learning_rate": 4.638439259419662e-05, "loss": 0.3583, "step": 5343000 }, { "epoch": 36.15945755738415, "grad_norm": 0.415879487991333, "learning_rate": 4.638405424426159e-05, "loss": 0.3586, "step": 5343500 }, { "epoch": 36.162841056734514, "grad_norm": 0.3834993541240692, "learning_rate": 4.638371589432655e-05, "loss": 0.3585, "step": 5344000 }, { "epoch": 36.166224556084885, "grad_norm": 0.40970712900161743, "learning_rate": 4.6383377544391515e-05, "loss": 0.3573, "step": 5344500 }, { "epoch": 36.169608055435255, "grad_norm": 0.3846546411514282, "learning_rate": 4.638303919445648e-05, "loss": 0.3588, "step": 5345000 }, { "epoch": 36.17299155478562, "grad_norm": 0.39750853180885315, "learning_rate": 4.6382700844521446e-05, "loss": 0.3573, "step": 5345500 }, { "epoch": 36.17637505413599, "grad_norm": 0.34784793853759766, "learning_rate": 4.63823624945864e-05, "loss": 0.3575, "step": 5346000 }, { "epoch": 36.17975855348636, "grad_norm": 0.3635327219963074, "learning_rate": 4.6382024144651364e-05, "loss": 0.3559, "step": 5346500 }, { "epoch": 36.18314205283673, "grad_norm": 0.3809336721897125, "learning_rate": 4.6381685794716326e-05, "loss": 0.3583, "step": 5347000 }, { "epoch": 36.18652555218709, "grad_norm": 0.350963294506073, "learning_rate": 4.6381347444781295e-05, "loss": 0.3562, "step": 5347500 }, { "epoch": 36.18990905153746, "grad_norm": 0.40162286162376404, "learning_rate": 4.638100909484626e-05, "loss": 0.3587, "step": 5348000 }, { "epoch": 36.19329255088783, "grad_norm": 0.4086779057979584, "learning_rate": 4.638067074491122e-05, "loss": 0.3576, "step": 5348500 }, { "epoch": 36.196676050238196, "grad_norm": 0.395887166261673, "learning_rate": 4.638033239497618e-05, "loss": 0.3594, "step": 5349000 }, { "epoch": 36.200059549588566, "grad_norm": 0.36396047472953796, "learning_rate": 4.637999404504115e-05, "loss": 0.357, "step": 5349500 }, { "epoch": 36.20344304893894, "grad_norm": 0.34778252243995667, "learning_rate": 4.637965569510611e-05, "loss": 0.3579, "step": 5350000 }, { "epoch": 36.2068265482893, "grad_norm": 0.3774062395095825, "learning_rate": 4.637931734517107e-05, "loss": 0.3584, "step": 5350500 }, { "epoch": 36.21021004763967, "grad_norm": 0.4170990586280823, "learning_rate": 4.6378978995236036e-05, "loss": 0.3574, "step": 5351000 }, { "epoch": 36.21359354699004, "grad_norm": 0.3895938992500305, "learning_rate": 4.6378640645301e-05, "loss": 0.3597, "step": 5351500 }, { "epoch": 36.21697704634041, "grad_norm": 0.38899892568588257, "learning_rate": 4.637830229536596e-05, "loss": 0.3598, "step": 5352000 }, { "epoch": 36.220360545690774, "grad_norm": 0.40833351016044617, "learning_rate": 4.637796394543092e-05, "loss": 0.3577, "step": 5352500 }, { "epoch": 36.223744045041144, "grad_norm": 0.4181547462940216, "learning_rate": 4.637762559549589e-05, "loss": 0.3593, "step": 5353000 }, { "epoch": 36.227127544391514, "grad_norm": 0.3875177502632141, "learning_rate": 4.6377287245560854e-05, "loss": 0.3569, "step": 5353500 }, { "epoch": 36.23051104374188, "grad_norm": 0.36812880635261536, "learning_rate": 4.6376948895625816e-05, "loss": 0.3586, "step": 5354000 }, { "epoch": 36.23389454309225, "grad_norm": 0.42248624563217163, "learning_rate": 4.637661054569078e-05, "loss": 0.3568, "step": 5354500 }, { "epoch": 36.23727804244262, "grad_norm": 0.4050443470478058, "learning_rate": 4.637627219575575e-05, "loss": 0.3587, "step": 5355000 }, { "epoch": 36.24066154179298, "grad_norm": 0.41615429520606995, "learning_rate": 4.63759338458207e-05, "loss": 0.3582, "step": 5355500 }, { "epoch": 36.24404504114335, "grad_norm": 0.36678463220596313, "learning_rate": 4.6375595495885664e-05, "loss": 0.3587, "step": 5356000 }, { "epoch": 36.24742854049372, "grad_norm": 0.3542846739292145, "learning_rate": 4.6375257145950626e-05, "loss": 0.3577, "step": 5356500 }, { "epoch": 36.250812039844085, "grad_norm": 0.42447608709335327, "learning_rate": 4.6374918796015595e-05, "loss": 0.3597, "step": 5357000 }, { "epoch": 36.254195539194455, "grad_norm": 0.41700679063796997, "learning_rate": 4.637458044608056e-05, "loss": 0.3592, "step": 5357500 }, { "epoch": 36.257579038544826, "grad_norm": 0.371595174074173, "learning_rate": 4.637424209614552e-05, "loss": 0.36, "step": 5358000 }, { "epoch": 36.260962537895196, "grad_norm": 0.38792935013771057, "learning_rate": 4.637390374621048e-05, "loss": 0.3598, "step": 5358500 }, { "epoch": 36.26434603724556, "grad_norm": 0.4075170159339905, "learning_rate": 4.637356539627545e-05, "loss": 0.3567, "step": 5359000 }, { "epoch": 36.26772953659593, "grad_norm": 0.39475739002227783, "learning_rate": 4.637322704634041e-05, "loss": 0.3586, "step": 5359500 }, { "epoch": 36.2711130359463, "grad_norm": 0.36673352122306824, "learning_rate": 4.637288869640537e-05, "loss": 0.3578, "step": 5360000 }, { "epoch": 36.27449653529666, "grad_norm": 0.331930935382843, "learning_rate": 4.637255034647034e-05, "loss": 0.3584, "step": 5360500 }, { "epoch": 36.27788003464703, "grad_norm": 0.3997202515602112, "learning_rate": 4.63722119965353e-05, "loss": 0.359, "step": 5361000 }, { "epoch": 36.281263533997404, "grad_norm": 0.41310784220695496, "learning_rate": 4.637187364660026e-05, "loss": 0.3585, "step": 5361500 }, { "epoch": 36.28464703334777, "grad_norm": 0.39765745401382446, "learning_rate": 4.637153529666522e-05, "loss": 0.3573, "step": 5362000 }, { "epoch": 36.28803053269814, "grad_norm": 0.39464566111564636, "learning_rate": 4.637119694673019e-05, "loss": 0.3583, "step": 5362500 }, { "epoch": 36.29141403204851, "grad_norm": 0.418169230222702, "learning_rate": 4.6370858596795154e-05, "loss": 0.3587, "step": 5363000 }, { "epoch": 36.29479753139887, "grad_norm": 0.3595762848854065, "learning_rate": 4.6370520246860116e-05, "loss": 0.3591, "step": 5363500 }, { "epoch": 36.29818103074924, "grad_norm": 0.3601153790950775, "learning_rate": 4.637018189692508e-05, "loss": 0.3592, "step": 5364000 }, { "epoch": 36.30156453009961, "grad_norm": 0.3526724576950073, "learning_rate": 4.636984354699004e-05, "loss": 0.3587, "step": 5364500 }, { "epoch": 36.30494802944998, "grad_norm": 0.38247859477996826, "learning_rate": 4.6369505197055e-05, "loss": 0.3593, "step": 5365000 }, { "epoch": 36.308331528800345, "grad_norm": 0.3702358901500702, "learning_rate": 4.6369166847119965e-05, "loss": 0.3589, "step": 5365500 }, { "epoch": 36.311715028150715, "grad_norm": 0.40158194303512573, "learning_rate": 4.636882849718493e-05, "loss": 0.3609, "step": 5366000 }, { "epoch": 36.315098527501085, "grad_norm": 0.37560826539993286, "learning_rate": 4.6368490147249896e-05, "loss": 0.3591, "step": 5366500 }, { "epoch": 36.31848202685145, "grad_norm": 0.3786207139492035, "learning_rate": 4.636815179731486e-05, "loss": 0.3595, "step": 5367000 }, { "epoch": 36.32186552620182, "grad_norm": 0.4172016978263855, "learning_rate": 4.636781344737982e-05, "loss": 0.3574, "step": 5367500 }, { "epoch": 36.32524902555219, "grad_norm": 0.40235501527786255, "learning_rate": 4.636747509744478e-05, "loss": 0.3575, "step": 5368000 }, { "epoch": 36.32863252490255, "grad_norm": 0.3735620975494385, "learning_rate": 4.636713674750975e-05, "loss": 0.3586, "step": 5368500 }, { "epoch": 36.33201602425292, "grad_norm": 0.38861343264579773, "learning_rate": 4.636679839757471e-05, "loss": 0.3595, "step": 5369000 }, { "epoch": 36.33539952360329, "grad_norm": 0.3941441774368286, "learning_rate": 4.636646004763967e-05, "loss": 0.3569, "step": 5369500 }, { "epoch": 36.338783022953656, "grad_norm": 0.36925792694091797, "learning_rate": 4.636612169770464e-05, "loss": 0.358, "step": 5370000 }, { "epoch": 36.342166522304026, "grad_norm": 0.35760292410850525, "learning_rate": 4.63657833477696e-05, "loss": 0.3567, "step": 5370500 }, { "epoch": 36.3455500216544, "grad_norm": 0.41832560300827026, "learning_rate": 4.636544499783456e-05, "loss": 0.3587, "step": 5371000 }, { "epoch": 36.34893352100477, "grad_norm": 0.4099883735179901, "learning_rate": 4.6365106647899524e-05, "loss": 0.3595, "step": 5371500 }, { "epoch": 36.35231702035513, "grad_norm": 0.37353208661079407, "learning_rate": 4.6364768297964486e-05, "loss": 0.3594, "step": 5372000 }, { "epoch": 36.3557005197055, "grad_norm": 0.40817371010780334, "learning_rate": 4.6364429948029455e-05, "loss": 0.3593, "step": 5372500 }, { "epoch": 36.35908401905587, "grad_norm": 0.39321252703666687, "learning_rate": 4.636409159809442e-05, "loss": 0.3587, "step": 5373000 }, { "epoch": 36.362467518406234, "grad_norm": 0.3826882243156433, "learning_rate": 4.636375324815938e-05, "loss": 0.3596, "step": 5373500 }, { "epoch": 36.365851017756604, "grad_norm": 0.3495653569698334, "learning_rate": 4.636341489822434e-05, "loss": 0.359, "step": 5374000 }, { "epoch": 36.369234517106975, "grad_norm": 0.4428384304046631, "learning_rate": 4.63630765482893e-05, "loss": 0.359, "step": 5374500 }, { "epoch": 36.37261801645734, "grad_norm": 0.39965730905532837, "learning_rate": 4.6362738198354265e-05, "loss": 0.3575, "step": 5375000 }, { "epoch": 36.37600151580771, "grad_norm": 0.3903505802154541, "learning_rate": 4.636239984841923e-05, "loss": 0.3573, "step": 5375500 }, { "epoch": 36.37938501515808, "grad_norm": 0.359244704246521, "learning_rate": 4.6362061498484196e-05, "loss": 0.3582, "step": 5376000 }, { "epoch": 36.38276851450845, "grad_norm": 0.39628976583480835, "learning_rate": 4.636172314854916e-05, "loss": 0.3588, "step": 5376500 }, { "epoch": 36.38615201385881, "grad_norm": 0.38931041955947876, "learning_rate": 4.636138479861412e-05, "loss": 0.357, "step": 5377000 }, { "epoch": 36.38953551320918, "grad_norm": 0.3588677644729614, "learning_rate": 4.636104644867908e-05, "loss": 0.3578, "step": 5377500 }, { "epoch": 36.39291901255955, "grad_norm": 0.369165301322937, "learning_rate": 4.636070809874405e-05, "loss": 0.3599, "step": 5378000 }, { "epoch": 36.396302511909916, "grad_norm": 0.36009520292282104, "learning_rate": 4.6360369748809014e-05, "loss": 0.3589, "step": 5378500 }, { "epoch": 36.399686011260286, "grad_norm": 0.37979286909103394, "learning_rate": 4.636003139887397e-05, "loss": 0.3572, "step": 5379000 }, { "epoch": 36.403069510610656, "grad_norm": 0.3631051182746887, "learning_rate": 4.635969304893894e-05, "loss": 0.359, "step": 5379500 }, { "epoch": 36.40645300996102, "grad_norm": 0.4344273507595062, "learning_rate": 4.63593546990039e-05, "loss": 0.3582, "step": 5380000 }, { "epoch": 36.40983650931139, "grad_norm": 0.3770763874053955, "learning_rate": 4.635901634906886e-05, "loss": 0.3581, "step": 5380500 }, { "epoch": 36.41322000866176, "grad_norm": 0.3626430332660675, "learning_rate": 4.6358677999133824e-05, "loss": 0.3589, "step": 5381000 }, { "epoch": 36.41660350801212, "grad_norm": 0.3731286823749542, "learning_rate": 4.6358339649198787e-05, "loss": 0.3587, "step": 5381500 }, { "epoch": 36.419987007362494, "grad_norm": 0.3749610185623169, "learning_rate": 4.6358001299263755e-05, "loss": 0.3599, "step": 5382000 }, { "epoch": 36.423370506712864, "grad_norm": 0.37450677156448364, "learning_rate": 4.635766294932872e-05, "loss": 0.357, "step": 5382500 }, { "epoch": 36.426754006063234, "grad_norm": 0.412520170211792, "learning_rate": 4.635732459939368e-05, "loss": 0.3576, "step": 5383000 }, { "epoch": 36.4301375054136, "grad_norm": 0.3579409122467041, "learning_rate": 4.635698624945864e-05, "loss": 0.3577, "step": 5383500 }, { "epoch": 36.43352100476397, "grad_norm": 0.3862817883491516, "learning_rate": 4.6356647899523604e-05, "loss": 0.3593, "step": 5384000 }, { "epoch": 36.43690450411434, "grad_norm": 0.3743864893913269, "learning_rate": 4.6356309549588566e-05, "loss": 0.3587, "step": 5384500 }, { "epoch": 36.4402880034647, "grad_norm": 0.40367433428764343, "learning_rate": 4.635597119965353e-05, "loss": 0.3584, "step": 5385000 }, { "epoch": 36.44367150281507, "grad_norm": 0.3912498354911804, "learning_rate": 4.63556328497185e-05, "loss": 0.3595, "step": 5385500 }, { "epoch": 36.44705500216544, "grad_norm": 0.394877552986145, "learning_rate": 4.635529449978346e-05, "loss": 0.359, "step": 5386000 }, { "epoch": 36.450438501515805, "grad_norm": 0.4186530113220215, "learning_rate": 4.635495614984842e-05, "loss": 0.3586, "step": 5386500 }, { "epoch": 36.453822000866175, "grad_norm": 0.42265915870666504, "learning_rate": 4.6354617799913383e-05, "loss": 0.3582, "step": 5387000 }, { "epoch": 36.457205500216546, "grad_norm": 0.3434792757034302, "learning_rate": 4.635427944997835e-05, "loss": 0.3571, "step": 5387500 }, { "epoch": 36.46058899956691, "grad_norm": 0.36982664465904236, "learning_rate": 4.6353941100043314e-05, "loss": 0.3619, "step": 5388000 }, { "epoch": 36.46397249891728, "grad_norm": 0.3465140461921692, "learning_rate": 4.635360275010827e-05, "loss": 0.3572, "step": 5388500 }, { "epoch": 36.46735599826765, "grad_norm": 0.3905229866504669, "learning_rate": 4.635326440017323e-05, "loss": 0.3587, "step": 5389000 }, { "epoch": 36.47073949761802, "grad_norm": 0.4007827639579773, "learning_rate": 4.63529260502382e-05, "loss": 0.3587, "step": 5389500 }, { "epoch": 36.47412299696838, "grad_norm": 0.3601161241531372, "learning_rate": 4.635258770030316e-05, "loss": 0.3579, "step": 5390000 }, { "epoch": 36.47750649631875, "grad_norm": 0.3981015086174011, "learning_rate": 4.6352249350368125e-05, "loss": 0.3593, "step": 5390500 }, { "epoch": 36.48088999566912, "grad_norm": 0.3994007706642151, "learning_rate": 4.635191100043309e-05, "loss": 0.358, "step": 5391000 }, { "epoch": 36.48427349501949, "grad_norm": 0.3872138559818268, "learning_rate": 4.6351572650498056e-05, "loss": 0.3585, "step": 5391500 }, { "epoch": 36.48765699436986, "grad_norm": 0.38818129897117615, "learning_rate": 4.635123430056302e-05, "loss": 0.3605, "step": 5392000 }, { "epoch": 36.49104049372023, "grad_norm": 0.3747335970401764, "learning_rate": 4.635089595062798e-05, "loss": 0.3589, "step": 5392500 }, { "epoch": 36.49442399307059, "grad_norm": 0.36242780089378357, "learning_rate": 4.635055760069294e-05, "loss": 0.3595, "step": 5393000 }, { "epoch": 36.49780749242096, "grad_norm": 0.41383475065231323, "learning_rate": 4.6350219250757905e-05, "loss": 0.3586, "step": 5393500 }, { "epoch": 36.50119099177133, "grad_norm": 0.3369709253311157, "learning_rate": 4.634988090082287e-05, "loss": 0.3588, "step": 5394000 }, { "epoch": 36.504574491121694, "grad_norm": 0.3563244044780731, "learning_rate": 4.634954255088783e-05, "loss": 0.3571, "step": 5394500 }, { "epoch": 36.507957990472065, "grad_norm": 0.3913503885269165, "learning_rate": 4.63492042009528e-05, "loss": 0.3586, "step": 5395000 }, { "epoch": 36.511341489822435, "grad_norm": 0.368977814912796, "learning_rate": 4.634886585101776e-05, "loss": 0.3586, "step": 5395500 }, { "epoch": 36.514724989172805, "grad_norm": 0.3941846489906311, "learning_rate": 4.634852750108272e-05, "loss": 0.3584, "step": 5396000 }, { "epoch": 36.51810848852317, "grad_norm": 0.4536517858505249, "learning_rate": 4.6348189151147684e-05, "loss": 0.3577, "step": 5396500 }, { "epoch": 36.52149198787354, "grad_norm": 0.3874954879283905, "learning_rate": 4.634785080121265e-05, "loss": 0.3591, "step": 5397000 }, { "epoch": 36.52487548722391, "grad_norm": 0.4522276818752289, "learning_rate": 4.6347512451277615e-05, "loss": 0.3594, "step": 5397500 }, { "epoch": 36.52825898657427, "grad_norm": 0.39889538288116455, "learning_rate": 4.634717410134258e-05, "loss": 0.3586, "step": 5398000 }, { "epoch": 36.53164248592464, "grad_norm": 0.36961063742637634, "learning_rate": 4.634683575140753e-05, "loss": 0.3576, "step": 5398500 }, { "epoch": 36.53502598527501, "grad_norm": 0.3706519901752472, "learning_rate": 4.63464974014725e-05, "loss": 0.3564, "step": 5399000 }, { "epoch": 36.538409484625376, "grad_norm": 0.3952397108078003, "learning_rate": 4.6346159051537464e-05, "loss": 0.3595, "step": 5399500 }, { "epoch": 36.541792983975746, "grad_norm": 0.37313124537467957, "learning_rate": 4.6345820701602426e-05, "loss": 0.3592, "step": 5400000 }, { "epoch": 36.54517648332612, "grad_norm": 0.37953048944473267, "learning_rate": 4.634548235166739e-05, "loss": 0.3592, "step": 5400500 }, { "epoch": 36.54855998267648, "grad_norm": 0.3729240596294403, "learning_rate": 4.634514400173236e-05, "loss": 0.3586, "step": 5401000 }, { "epoch": 36.55194348202685, "grad_norm": 0.3639374077320099, "learning_rate": 4.634480565179732e-05, "loss": 0.3595, "step": 5401500 }, { "epoch": 36.55532698137722, "grad_norm": 0.37958860397338867, "learning_rate": 4.634446730186228e-05, "loss": 0.3595, "step": 5402000 }, { "epoch": 36.55871048072759, "grad_norm": 0.3735257685184479, "learning_rate": 4.634412895192724e-05, "loss": 0.3598, "step": 5402500 }, { "epoch": 36.562093980077954, "grad_norm": 0.38983970880508423, "learning_rate": 4.6343790601992205e-05, "loss": 0.3574, "step": 5403000 }, { "epoch": 36.565477479428324, "grad_norm": 0.3864821791648865, "learning_rate": 4.634345225205717e-05, "loss": 0.3597, "step": 5403500 }, { "epoch": 36.568860978778694, "grad_norm": 0.37265393137931824, "learning_rate": 4.634311390212213e-05, "loss": 0.3581, "step": 5404000 }, { "epoch": 36.57224447812906, "grad_norm": 0.3908233344554901, "learning_rate": 4.63427755521871e-05, "loss": 0.3593, "step": 5404500 }, { "epoch": 36.57562797747943, "grad_norm": 0.3419293165206909, "learning_rate": 4.634243720225206e-05, "loss": 0.3585, "step": 5405000 }, { "epoch": 36.5790114768298, "grad_norm": 0.36108455061912537, "learning_rate": 4.634209885231702e-05, "loss": 0.358, "step": 5405500 }, { "epoch": 36.58239497618016, "grad_norm": 0.37847715616226196, "learning_rate": 4.6341760502381985e-05, "loss": 0.3591, "step": 5406000 }, { "epoch": 36.58577847553053, "grad_norm": 0.3705810308456421, "learning_rate": 4.6341422152446954e-05, "loss": 0.3582, "step": 5406500 }, { "epoch": 36.5891619748809, "grad_norm": 0.38005268573760986, "learning_rate": 4.6341083802511916e-05, "loss": 0.3562, "step": 5407000 }, { "epoch": 36.59254547423127, "grad_norm": 0.3952135443687439, "learning_rate": 4.634074545257688e-05, "loss": 0.3586, "step": 5407500 }, { "epoch": 36.595928973581636, "grad_norm": 0.40303125977516174, "learning_rate": 4.634040710264183e-05, "loss": 0.3577, "step": 5408000 }, { "epoch": 36.599312472932006, "grad_norm": 0.38825419545173645, "learning_rate": 4.63400687527068e-05, "loss": 0.3588, "step": 5408500 }, { "epoch": 36.602695972282376, "grad_norm": 0.37603962421417236, "learning_rate": 4.6339730402771764e-05, "loss": 0.3586, "step": 5409000 }, { "epoch": 36.60607947163274, "grad_norm": 0.39547544717788696, "learning_rate": 4.6339392052836726e-05, "loss": 0.3579, "step": 5409500 }, { "epoch": 36.60946297098311, "grad_norm": 0.38806581497192383, "learning_rate": 4.633905370290169e-05, "loss": 0.3591, "step": 5410000 }, { "epoch": 36.61284647033348, "grad_norm": 0.39700913429260254, "learning_rate": 4.633871535296666e-05, "loss": 0.3583, "step": 5410500 }, { "epoch": 36.61622996968384, "grad_norm": 0.39626410603523254, "learning_rate": 4.633837700303162e-05, "loss": 0.3579, "step": 5411000 }, { "epoch": 36.61961346903421, "grad_norm": 0.3509100675582886, "learning_rate": 4.633803865309658e-05, "loss": 0.3596, "step": 5411500 }, { "epoch": 36.622996968384584, "grad_norm": 0.3858174979686737, "learning_rate": 4.6337700303161544e-05, "loss": 0.3593, "step": 5412000 }, { "epoch": 36.62638046773495, "grad_norm": 0.39701759815216064, "learning_rate": 4.6337361953226506e-05, "loss": 0.3611, "step": 5412500 }, { "epoch": 36.62976396708532, "grad_norm": 0.362775593996048, "learning_rate": 4.633702360329147e-05, "loss": 0.3596, "step": 5413000 }, { "epoch": 36.63314746643569, "grad_norm": 0.39234185218811035, "learning_rate": 4.633668525335643e-05, "loss": 0.3569, "step": 5413500 }, { "epoch": 36.63653096578606, "grad_norm": 0.4118044078350067, "learning_rate": 4.63363469034214e-05, "loss": 0.3584, "step": 5414000 }, { "epoch": 36.63991446513642, "grad_norm": 0.38820743560791016, "learning_rate": 4.633600855348636e-05, "loss": 0.3596, "step": 5414500 }, { "epoch": 36.64329796448679, "grad_norm": 0.37806764245033264, "learning_rate": 4.633567020355132e-05, "loss": 0.3599, "step": 5415000 }, { "epoch": 36.64668146383716, "grad_norm": 0.3902973532676697, "learning_rate": 4.6335331853616285e-05, "loss": 0.3592, "step": 5415500 }, { "epoch": 36.650064963187525, "grad_norm": 0.3700363039970398, "learning_rate": 4.6334993503681254e-05, "loss": 0.3604, "step": 5416000 }, { "epoch": 36.653448462537895, "grad_norm": 0.38539978861808777, "learning_rate": 4.6334655153746216e-05, "loss": 0.3608, "step": 5416500 }, { "epoch": 36.656831961888265, "grad_norm": 0.37120288610458374, "learning_rate": 4.633431680381118e-05, "loss": 0.3595, "step": 5417000 }, { "epoch": 36.66021546123863, "grad_norm": 0.37381964921951294, "learning_rate": 4.6333978453876134e-05, "loss": 0.3591, "step": 5417500 }, { "epoch": 36.663598960589, "grad_norm": 0.36754944920539856, "learning_rate": 4.63336401039411e-05, "loss": 0.3588, "step": 5418000 }, { "epoch": 36.66698245993937, "grad_norm": 0.4037177860736847, "learning_rate": 4.6333301754006065e-05, "loss": 0.3587, "step": 5418500 }, { "epoch": 36.67036595928973, "grad_norm": 0.3826404809951782, "learning_rate": 4.633296340407103e-05, "loss": 0.3592, "step": 5419000 }, { "epoch": 36.6737494586401, "grad_norm": 0.4238000214099884, "learning_rate": 4.633262505413599e-05, "loss": 0.3593, "step": 5419500 }, { "epoch": 36.67713295799047, "grad_norm": 0.40022116899490356, "learning_rate": 4.633228670420096e-05, "loss": 0.3598, "step": 5420000 }, { "epoch": 36.68051645734084, "grad_norm": 0.3914811909198761, "learning_rate": 4.633194835426592e-05, "loss": 0.3589, "step": 5420500 }, { "epoch": 36.68389995669121, "grad_norm": 0.37830302119255066, "learning_rate": 4.633161000433088e-05, "loss": 0.3586, "step": 5421000 }, { "epoch": 36.68728345604158, "grad_norm": 0.3562361001968384, "learning_rate": 4.6331271654395844e-05, "loss": 0.3587, "step": 5421500 }, { "epoch": 36.69066695539195, "grad_norm": 0.3969062566757202, "learning_rate": 4.6330933304460806e-05, "loss": 0.3602, "step": 5422000 }, { "epoch": 36.69405045474231, "grad_norm": 0.37047722935676575, "learning_rate": 4.633059495452577e-05, "loss": 0.3597, "step": 5422500 }, { "epoch": 36.69743395409268, "grad_norm": 0.4213216006755829, "learning_rate": 4.633025660459073e-05, "loss": 0.3569, "step": 5423000 }, { "epoch": 36.70081745344305, "grad_norm": 0.3517361879348755, "learning_rate": 4.63299182546557e-05, "loss": 0.3575, "step": 5423500 }, { "epoch": 36.704200952793414, "grad_norm": 0.3819379210472107, "learning_rate": 4.632957990472066e-05, "loss": 0.3592, "step": 5424000 }, { "epoch": 36.707584452143784, "grad_norm": 0.40440550446510315, "learning_rate": 4.6329241554785624e-05, "loss": 0.3581, "step": 5424500 }, { "epoch": 36.710967951494155, "grad_norm": 0.38853567838668823, "learning_rate": 4.6328903204850586e-05, "loss": 0.3603, "step": 5425000 }, { "epoch": 36.71435145084452, "grad_norm": 0.38550257682800293, "learning_rate": 4.6328564854915555e-05, "loss": 0.3599, "step": 5425500 }, { "epoch": 36.71773495019489, "grad_norm": 0.379503071308136, "learning_rate": 4.632822650498052e-05, "loss": 0.3592, "step": 5426000 }, { "epoch": 36.72111844954526, "grad_norm": 0.3605881929397583, "learning_rate": 4.632788815504548e-05, "loss": 0.3586, "step": 5426500 }, { "epoch": 36.72450194889563, "grad_norm": 0.3627432584762573, "learning_rate": 4.6327549805110434e-05, "loss": 0.3591, "step": 5427000 }, { "epoch": 36.72788544824599, "grad_norm": 0.4143041968345642, "learning_rate": 4.63272114551754e-05, "loss": 0.3593, "step": 5427500 }, { "epoch": 36.73126894759636, "grad_norm": 0.4168412685394287, "learning_rate": 4.6326873105240365e-05, "loss": 0.3584, "step": 5428000 }, { "epoch": 36.73465244694673, "grad_norm": 0.3902731239795685, "learning_rate": 4.632653475530533e-05, "loss": 0.3595, "step": 5428500 }, { "epoch": 36.738035946297096, "grad_norm": 0.40431979298591614, "learning_rate": 4.632619640537029e-05, "loss": 0.3598, "step": 5429000 }, { "epoch": 36.741419445647466, "grad_norm": 0.3976406753063202, "learning_rate": 4.632585805543526e-05, "loss": 0.3582, "step": 5429500 }, { "epoch": 36.744802944997836, "grad_norm": 0.3488816022872925, "learning_rate": 4.632551970550022e-05, "loss": 0.3588, "step": 5430000 }, { "epoch": 36.7481864443482, "grad_norm": 0.36659806966781616, "learning_rate": 4.632518135556518e-05, "loss": 0.357, "step": 5430500 }, { "epoch": 36.75156994369857, "grad_norm": 0.39805367588996887, "learning_rate": 4.6324843005630145e-05, "loss": 0.3597, "step": 5431000 }, { "epoch": 36.75495344304894, "grad_norm": 0.365629106760025, "learning_rate": 4.632450465569511e-05, "loss": 0.3589, "step": 5431500 }, { "epoch": 36.75833694239931, "grad_norm": 0.3875758647918701, "learning_rate": 4.632416630576007e-05, "loss": 0.3597, "step": 5432000 }, { "epoch": 36.761720441749674, "grad_norm": 0.4101535379886627, "learning_rate": 4.632382795582503e-05, "loss": 0.3589, "step": 5432500 }, { "epoch": 36.765103941100044, "grad_norm": 0.37793266773223877, "learning_rate": 4.632348960589e-05, "loss": 0.3584, "step": 5433000 }, { "epoch": 36.768487440450414, "grad_norm": 0.35704052448272705, "learning_rate": 4.632315125595496e-05, "loss": 0.3581, "step": 5433500 }, { "epoch": 36.77187093980078, "grad_norm": 0.4402250647544861, "learning_rate": 4.6322812906019924e-05, "loss": 0.3594, "step": 5434000 }, { "epoch": 36.77525443915115, "grad_norm": 0.4011518359184265, "learning_rate": 4.632247455608489e-05, "loss": 0.3583, "step": 5434500 }, { "epoch": 36.77863793850152, "grad_norm": 0.3994705379009247, "learning_rate": 4.632213620614985e-05, "loss": 0.3592, "step": 5435000 }, { "epoch": 36.78202143785188, "grad_norm": 0.3601782023906708, "learning_rate": 4.632179785621482e-05, "loss": 0.3581, "step": 5435500 }, { "epoch": 36.78540493720225, "grad_norm": 0.3508903682231903, "learning_rate": 4.632145950627978e-05, "loss": 0.3599, "step": 5436000 }, { "epoch": 36.78878843655262, "grad_norm": 0.3926507234573364, "learning_rate": 4.6321121156344735e-05, "loss": 0.3596, "step": 5436500 }, { "epoch": 36.792171935902985, "grad_norm": 0.39061203598976135, "learning_rate": 4.6320782806409704e-05, "loss": 0.3604, "step": 5437000 }, { "epoch": 36.795555435253355, "grad_norm": 0.37601664662361145, "learning_rate": 4.6320444456474666e-05, "loss": 0.3589, "step": 5437500 }, { "epoch": 36.798938934603726, "grad_norm": 0.3568149209022522, "learning_rate": 4.632010610653963e-05, "loss": 0.3593, "step": 5438000 }, { "epoch": 36.802322433954096, "grad_norm": 0.406227707862854, "learning_rate": 4.631976775660459e-05, "loss": 0.3595, "step": 5438500 }, { "epoch": 36.80570593330446, "grad_norm": 0.3645710051059723, "learning_rate": 4.631942940666956e-05, "loss": 0.358, "step": 5439000 }, { "epoch": 36.80908943265483, "grad_norm": 0.36030206084251404, "learning_rate": 4.631909105673452e-05, "loss": 0.3586, "step": 5439500 }, { "epoch": 36.8124729320052, "grad_norm": 0.38765445351600647, "learning_rate": 4.6318752706799484e-05, "loss": 0.3591, "step": 5440000 }, { "epoch": 36.81585643135556, "grad_norm": 0.35829150676727295, "learning_rate": 4.6318414356864446e-05, "loss": 0.359, "step": 5440500 }, { "epoch": 36.81923993070593, "grad_norm": 0.3841919004917145, "learning_rate": 4.631807600692941e-05, "loss": 0.3587, "step": 5441000 }, { "epoch": 36.822623430056304, "grad_norm": 0.39539119601249695, "learning_rate": 4.631773765699437e-05, "loss": 0.3586, "step": 5441500 }, { "epoch": 36.82600692940667, "grad_norm": 0.34584274888038635, "learning_rate": 4.631739930705933e-05, "loss": 0.3596, "step": 5442000 }, { "epoch": 36.82939042875704, "grad_norm": 0.3979288935661316, "learning_rate": 4.6317060957124294e-05, "loss": 0.3601, "step": 5442500 }, { "epoch": 36.83277392810741, "grad_norm": 0.40058383345603943, "learning_rate": 4.631672260718926e-05, "loss": 0.3591, "step": 5443000 }, { "epoch": 36.83615742745777, "grad_norm": 0.35918498039245605, "learning_rate": 4.6316384257254225e-05, "loss": 0.3582, "step": 5443500 }, { "epoch": 36.83954092680814, "grad_norm": 0.3729562759399414, "learning_rate": 4.631604590731919e-05, "loss": 0.3588, "step": 5444000 }, { "epoch": 36.84292442615851, "grad_norm": 0.3985956907272339, "learning_rate": 4.631570755738415e-05, "loss": 0.3601, "step": 5444500 }, { "epoch": 36.84630792550888, "grad_norm": 0.3987262547016144, "learning_rate": 4.631536920744912e-05, "loss": 0.3597, "step": 5445000 }, { "epoch": 36.849691424859245, "grad_norm": 0.36801108717918396, "learning_rate": 4.631503085751408e-05, "loss": 0.3582, "step": 5445500 }, { "epoch": 36.853074924209615, "grad_norm": 0.38933831453323364, "learning_rate": 4.6314692507579036e-05, "loss": 0.3597, "step": 5446000 }, { "epoch": 36.856458423559985, "grad_norm": 0.38778480887413025, "learning_rate": 4.6314354157644005e-05, "loss": 0.3585, "step": 5446500 }, { "epoch": 36.85984192291035, "grad_norm": 0.3846574127674103, "learning_rate": 4.631401580770897e-05, "loss": 0.357, "step": 5447000 }, { "epoch": 36.86322542226072, "grad_norm": 0.41694244742393494, "learning_rate": 4.631367745777393e-05, "loss": 0.3589, "step": 5447500 }, { "epoch": 36.86660892161109, "grad_norm": 0.37676724791526794, "learning_rate": 4.631333910783889e-05, "loss": 0.3574, "step": 5448000 }, { "epoch": 36.86999242096145, "grad_norm": 0.38469696044921875, "learning_rate": 4.631300075790386e-05, "loss": 0.3591, "step": 5448500 }, { "epoch": 36.87337592031182, "grad_norm": 0.35767194628715515, "learning_rate": 4.631266240796882e-05, "loss": 0.3599, "step": 5449000 }, { "epoch": 36.87675941966219, "grad_norm": 0.35038360953330994, "learning_rate": 4.6312324058033784e-05, "loss": 0.3594, "step": 5449500 }, { "epoch": 36.880142919012556, "grad_norm": 0.3759296238422394, "learning_rate": 4.6311985708098746e-05, "loss": 0.3597, "step": 5450000 }, { "epoch": 36.883526418362926, "grad_norm": 0.3757367432117462, "learning_rate": 4.631164735816371e-05, "loss": 0.3586, "step": 5450500 }, { "epoch": 36.8869099177133, "grad_norm": 0.3534103333950043, "learning_rate": 4.631130900822867e-05, "loss": 0.3597, "step": 5451000 }, { "epoch": 36.89029341706367, "grad_norm": 0.4127538204193115, "learning_rate": 4.631097065829363e-05, "loss": 0.3597, "step": 5451500 }, { "epoch": 36.89367691641403, "grad_norm": 0.4183761775493622, "learning_rate": 4.6310632308358595e-05, "loss": 0.3601, "step": 5452000 }, { "epoch": 36.8970604157644, "grad_norm": 0.3787117004394531, "learning_rate": 4.6310293958423564e-05, "loss": 0.3577, "step": 5452500 }, { "epoch": 36.90044391511477, "grad_norm": 0.38860374689102173, "learning_rate": 4.6309955608488526e-05, "loss": 0.3583, "step": 5453000 }, { "epoch": 36.903827414465134, "grad_norm": 0.37418508529663086, "learning_rate": 4.630961725855349e-05, "loss": 0.3603, "step": 5453500 }, { "epoch": 36.907210913815504, "grad_norm": 0.37635117769241333, "learning_rate": 4.630927890861845e-05, "loss": 0.3601, "step": 5454000 }, { "epoch": 36.910594413165875, "grad_norm": 0.3899887204170227, "learning_rate": 4.630894055868342e-05, "loss": 0.3597, "step": 5454500 }, { "epoch": 36.91397791251624, "grad_norm": 0.3659631013870239, "learning_rate": 4.630860220874838e-05, "loss": 0.3581, "step": 5455000 }, { "epoch": 36.91736141186661, "grad_norm": 0.36870715022087097, "learning_rate": 4.6308263858813336e-05, "loss": 0.3607, "step": 5455500 }, { "epoch": 36.92074491121698, "grad_norm": 0.37624937295913696, "learning_rate": 4.6307925508878305e-05, "loss": 0.3574, "step": 5456000 }, { "epoch": 36.92412841056735, "grad_norm": 0.40849369764328003, "learning_rate": 4.630758715894327e-05, "loss": 0.3578, "step": 5456500 }, { "epoch": 36.92751190991771, "grad_norm": 0.37514176964759827, "learning_rate": 4.630724880900823e-05, "loss": 0.359, "step": 5457000 }, { "epoch": 36.93089540926808, "grad_norm": 0.358591228723526, "learning_rate": 4.630691045907319e-05, "loss": 0.359, "step": 5457500 }, { "epoch": 36.93427890861845, "grad_norm": 0.37737929821014404, "learning_rate": 4.630657210913816e-05, "loss": 0.3594, "step": 5458000 }, { "epoch": 36.937662407968816, "grad_norm": 0.3692334294319153, "learning_rate": 4.630623375920312e-05, "loss": 0.3587, "step": 5458500 }, { "epoch": 36.941045907319186, "grad_norm": 0.3860073685646057, "learning_rate": 4.6305895409268085e-05, "loss": 0.3592, "step": 5459000 }, { "epoch": 36.944429406669556, "grad_norm": 0.3795725405216217, "learning_rate": 4.630555705933305e-05, "loss": 0.3586, "step": 5459500 }, { "epoch": 36.94781290601992, "grad_norm": 0.3706900477409363, "learning_rate": 4.6305218709398016e-05, "loss": 0.3596, "step": 5460000 }, { "epoch": 36.95119640537029, "grad_norm": 0.40297240018844604, "learning_rate": 4.630488035946297e-05, "loss": 0.3581, "step": 5460500 }, { "epoch": 36.95457990472066, "grad_norm": 0.3621428608894348, "learning_rate": 4.630454200952793e-05, "loss": 0.3606, "step": 5461000 }, { "epoch": 36.95796340407102, "grad_norm": 0.42571696639060974, "learning_rate": 4.6304203659592895e-05, "loss": 0.3589, "step": 5461500 }, { "epoch": 36.96134690342139, "grad_norm": 0.3712451457977295, "learning_rate": 4.6303865309657864e-05, "loss": 0.3579, "step": 5462000 }, { "epoch": 36.964730402771764, "grad_norm": 0.3637605309486389, "learning_rate": 4.6303526959722826e-05, "loss": 0.358, "step": 5462500 }, { "epoch": 36.968113902122134, "grad_norm": 0.40731701254844666, "learning_rate": 4.630318860978779e-05, "loss": 0.359, "step": 5463000 }, { "epoch": 36.9714974014725, "grad_norm": 0.3971612751483917, "learning_rate": 4.630285025985275e-05, "loss": 0.3582, "step": 5463500 }, { "epoch": 36.97488090082287, "grad_norm": 0.3789381682872772, "learning_rate": 4.630251190991772e-05, "loss": 0.3588, "step": 5464000 }, { "epoch": 36.97826440017324, "grad_norm": 0.39527857303619385, "learning_rate": 4.630217355998268e-05, "loss": 0.3584, "step": 5464500 }, { "epoch": 36.9816478995236, "grad_norm": 0.386687695980072, "learning_rate": 4.630183521004764e-05, "loss": 0.3584, "step": 5465000 }, { "epoch": 36.98503139887397, "grad_norm": 0.35649698972702026, "learning_rate": 4.6301496860112606e-05, "loss": 0.3588, "step": 5465500 }, { "epoch": 36.98841489822434, "grad_norm": 0.3558090329170227, "learning_rate": 4.630115851017757e-05, "loss": 0.3588, "step": 5466000 }, { "epoch": 36.991798397574705, "grad_norm": 0.3773846924304962, "learning_rate": 4.630082016024253e-05, "loss": 0.3578, "step": 5466500 }, { "epoch": 36.995181896925075, "grad_norm": 0.4010309875011444, "learning_rate": 4.630048181030749e-05, "loss": 0.3609, "step": 5467000 }, { "epoch": 36.998565396275445, "grad_norm": 0.39403846859931946, "learning_rate": 4.630014346037246e-05, "loss": 0.3594, "step": 5467500 }, { "epoch": 37.0, "eval_accuracy": 0.8629837217567206, "eval_loss": 0.5562915802001953, "eval_runtime": 3362.5317, "eval_samples_per_second": 86.466, "eval_steps_per_second": 5.404, "step": 5467712 }, { "epoch": 37.00194889562581, "grad_norm": 0.3730051815509796, "learning_rate": 4.629980511043742e-05, "loss": 0.3589, "step": 5468000 }, { "epoch": 37.00533239497618, "grad_norm": 0.38916271924972534, "learning_rate": 4.6299466760502385e-05, "loss": 0.3568, "step": 5468500 }, { "epoch": 37.00871589432655, "grad_norm": 0.40832391381263733, "learning_rate": 4.629912841056735e-05, "loss": 0.3564, "step": 5469000 }, { "epoch": 37.01209939367692, "grad_norm": 0.37710896134376526, "learning_rate": 4.6298790060632316e-05, "loss": 0.3557, "step": 5469500 }, { "epoch": 37.01548289302728, "grad_norm": 0.3445221185684204, "learning_rate": 4.629845171069727e-05, "loss": 0.3573, "step": 5470000 }, { "epoch": 37.01886639237765, "grad_norm": 0.37882137298583984, "learning_rate": 4.6298113360762234e-05, "loss": 0.3558, "step": 5470500 }, { "epoch": 37.02224989172802, "grad_norm": 0.3661237061023712, "learning_rate": 4.6297775010827196e-05, "loss": 0.357, "step": 5471000 }, { "epoch": 37.02563339107839, "grad_norm": 0.36156991124153137, "learning_rate": 4.6297436660892165e-05, "loss": 0.3575, "step": 5471500 }, { "epoch": 37.02901689042876, "grad_norm": 0.4158290922641754, "learning_rate": 4.629709831095713e-05, "loss": 0.3583, "step": 5472000 }, { "epoch": 37.03240038977913, "grad_norm": 0.35936787724494934, "learning_rate": 4.629675996102209e-05, "loss": 0.3563, "step": 5472500 }, { "epoch": 37.03578388912949, "grad_norm": 0.3601103723049164, "learning_rate": 4.629642161108705e-05, "loss": 0.3584, "step": 5473000 }, { "epoch": 37.03916738847986, "grad_norm": 0.39076584577560425, "learning_rate": 4.629608326115202e-05, "loss": 0.3581, "step": 5473500 }, { "epoch": 37.04255088783023, "grad_norm": 0.38857847452163696, "learning_rate": 4.629574491121698e-05, "loss": 0.357, "step": 5474000 }, { "epoch": 37.045934387180594, "grad_norm": 0.42331647872924805, "learning_rate": 4.629540656128194e-05, "loss": 0.3558, "step": 5474500 }, { "epoch": 37.049317886530964, "grad_norm": 0.3687676191329956, "learning_rate": 4.6295068211346907e-05, "loss": 0.3574, "step": 5475000 }, { "epoch": 37.052701385881335, "grad_norm": 0.41535496711730957, "learning_rate": 4.629472986141187e-05, "loss": 0.3568, "step": 5475500 }, { "epoch": 37.056084885231705, "grad_norm": 0.43465059995651245, "learning_rate": 4.629439151147683e-05, "loss": 0.357, "step": 5476000 }, { "epoch": 37.05946838458207, "grad_norm": 0.4017110764980316, "learning_rate": 4.629405316154179e-05, "loss": 0.3567, "step": 5476500 }, { "epoch": 37.06285188393244, "grad_norm": 0.3725665211677551, "learning_rate": 4.629371481160676e-05, "loss": 0.357, "step": 5477000 }, { "epoch": 37.06623538328281, "grad_norm": 0.408597469329834, "learning_rate": 4.6293376461671724e-05, "loss": 0.3583, "step": 5477500 }, { "epoch": 37.06961888263317, "grad_norm": 0.3889630138874054, "learning_rate": 4.6293038111736686e-05, "loss": 0.3574, "step": 5478000 }, { "epoch": 37.07300238198354, "grad_norm": 0.4078400135040283, "learning_rate": 4.629269976180165e-05, "loss": 0.3574, "step": 5478500 }, { "epoch": 37.07638588133391, "grad_norm": 0.38494017720222473, "learning_rate": 4.629236141186662e-05, "loss": 0.3595, "step": 5479000 }, { "epoch": 37.079769380684276, "grad_norm": 0.38983720541000366, "learning_rate": 4.629202306193157e-05, "loss": 0.357, "step": 5479500 }, { "epoch": 37.083152880034646, "grad_norm": 0.3703463077545166, "learning_rate": 4.6291684711996535e-05, "loss": 0.3566, "step": 5480000 }, { "epoch": 37.08653637938502, "grad_norm": 0.42000865936279297, "learning_rate": 4.62913463620615e-05, "loss": 0.3565, "step": 5480500 }, { "epoch": 37.08991987873539, "grad_norm": 0.3849279284477234, "learning_rate": 4.6291008012126466e-05, "loss": 0.357, "step": 5481000 }, { "epoch": 37.09330337808575, "grad_norm": 0.37425053119659424, "learning_rate": 4.629066966219143e-05, "loss": 0.3573, "step": 5481500 }, { "epoch": 37.09668687743612, "grad_norm": 0.4025816023349762, "learning_rate": 4.629033131225639e-05, "loss": 0.3576, "step": 5482000 }, { "epoch": 37.10007037678649, "grad_norm": 0.3557578921318054, "learning_rate": 4.628999296232135e-05, "loss": 0.3579, "step": 5482500 }, { "epoch": 37.103453876136854, "grad_norm": 0.3866497278213501, "learning_rate": 4.628965461238632e-05, "loss": 0.3591, "step": 5483000 }, { "epoch": 37.106837375487224, "grad_norm": 0.3914271295070648, "learning_rate": 4.628931626245128e-05, "loss": 0.3577, "step": 5483500 }, { "epoch": 37.110220874837594, "grad_norm": 0.3865836560726166, "learning_rate": 4.628897791251624e-05, "loss": 0.3564, "step": 5484000 }, { "epoch": 37.11360437418796, "grad_norm": 0.4411778450012207, "learning_rate": 4.628863956258121e-05, "loss": 0.3573, "step": 5484500 }, { "epoch": 37.11698787353833, "grad_norm": 0.40927067399024963, "learning_rate": 4.628830121264617e-05, "loss": 0.3588, "step": 5485000 }, { "epoch": 37.1203713728887, "grad_norm": 0.39079582691192627, "learning_rate": 4.628796286271113e-05, "loss": 0.3572, "step": 5485500 }, { "epoch": 37.12375487223906, "grad_norm": 0.4265403747558594, "learning_rate": 4.6287624512776094e-05, "loss": 0.3567, "step": 5486000 }, { "epoch": 37.12713837158943, "grad_norm": 0.34536078572273254, "learning_rate": 4.628728616284106e-05, "loss": 0.3578, "step": 5486500 }, { "epoch": 37.1305218709398, "grad_norm": 0.4132966697216034, "learning_rate": 4.6286947812906025e-05, "loss": 0.3572, "step": 5487000 }, { "epoch": 37.13390537029017, "grad_norm": 0.3569653630256653, "learning_rate": 4.628660946297099e-05, "loss": 0.3575, "step": 5487500 }, { "epoch": 37.137288869640535, "grad_norm": 0.3779239058494568, "learning_rate": 4.628627111303595e-05, "loss": 0.3574, "step": 5488000 }, { "epoch": 37.140672368990906, "grad_norm": 0.3855503797531128, "learning_rate": 4.628593276310092e-05, "loss": 0.3577, "step": 5488500 }, { "epoch": 37.144055868341276, "grad_norm": 0.3920680284500122, "learning_rate": 4.628559441316587e-05, "loss": 0.3575, "step": 5489000 }, { "epoch": 37.14743936769164, "grad_norm": 0.40385711193084717, "learning_rate": 4.6285256063230835e-05, "loss": 0.358, "step": 5489500 }, { "epoch": 37.15082286704201, "grad_norm": 0.3753313720226288, "learning_rate": 4.62849177132958e-05, "loss": 0.3591, "step": 5490000 }, { "epoch": 37.15420636639238, "grad_norm": 0.38031160831451416, "learning_rate": 4.6284579363360766e-05, "loss": 0.3584, "step": 5490500 }, { "epoch": 37.15758986574274, "grad_norm": 0.35980871319770813, "learning_rate": 4.628424101342573e-05, "loss": 0.3582, "step": 5491000 }, { "epoch": 37.16097336509311, "grad_norm": 0.3997298777103424, "learning_rate": 4.628390266349069e-05, "loss": 0.3572, "step": 5491500 }, { "epoch": 37.164356864443484, "grad_norm": 0.4069244861602783, "learning_rate": 4.628356431355565e-05, "loss": 0.3574, "step": 5492000 }, { "epoch": 37.16774036379385, "grad_norm": 0.37065088748931885, "learning_rate": 4.628322596362062e-05, "loss": 0.3573, "step": 5492500 }, { "epoch": 37.17112386314422, "grad_norm": 0.40082502365112305, "learning_rate": 4.6282887613685584e-05, "loss": 0.3579, "step": 5493000 }, { "epoch": 37.17450736249459, "grad_norm": 0.3844318687915802, "learning_rate": 4.628254926375054e-05, "loss": 0.3588, "step": 5493500 }, { "epoch": 37.17789086184496, "grad_norm": 0.3889318108558655, "learning_rate": 4.628221091381551e-05, "loss": 0.36, "step": 5494000 }, { "epoch": 37.18127436119532, "grad_norm": 0.34067457914352417, "learning_rate": 4.628187256388047e-05, "loss": 0.3578, "step": 5494500 }, { "epoch": 37.18465786054569, "grad_norm": 0.3923149108886719, "learning_rate": 4.628153421394543e-05, "loss": 0.3589, "step": 5495000 }, { "epoch": 37.18804135989606, "grad_norm": 0.420271098613739, "learning_rate": 4.6281195864010394e-05, "loss": 0.3592, "step": 5495500 }, { "epoch": 37.191424859246425, "grad_norm": 0.42455965280532837, "learning_rate": 4.628085751407536e-05, "loss": 0.3601, "step": 5496000 }, { "epoch": 37.194808358596795, "grad_norm": 0.4452683925628662, "learning_rate": 4.6280519164140325e-05, "loss": 0.3582, "step": 5496500 }, { "epoch": 37.198191857947165, "grad_norm": 0.41060131788253784, "learning_rate": 4.628018081420529e-05, "loss": 0.3581, "step": 5497000 }, { "epoch": 37.20157535729753, "grad_norm": 0.396625280380249, "learning_rate": 4.627984246427025e-05, "loss": 0.3594, "step": 5497500 }, { "epoch": 37.2049588566479, "grad_norm": 0.4013853073120117, "learning_rate": 4.627950411433521e-05, "loss": 0.3585, "step": 5498000 }, { "epoch": 37.20834235599827, "grad_norm": 0.3770159184932709, "learning_rate": 4.6279165764400174e-05, "loss": 0.3554, "step": 5498500 }, { "epoch": 37.21172585534863, "grad_norm": 0.38657766580581665, "learning_rate": 4.6278827414465136e-05, "loss": 0.3588, "step": 5499000 }, { "epoch": 37.215109354699, "grad_norm": 0.3623330593109131, "learning_rate": 4.62784890645301e-05, "loss": 0.3573, "step": 5499500 }, { "epoch": 37.21849285404937, "grad_norm": 0.35294994711875916, "learning_rate": 4.627815071459507e-05, "loss": 0.3589, "step": 5500000 }, { "epoch": 37.22187635339974, "grad_norm": 0.408532977104187, "learning_rate": 4.627781236466003e-05, "loss": 0.3588, "step": 5500500 }, { "epoch": 37.225259852750106, "grad_norm": 0.38643956184387207, "learning_rate": 4.627747401472499e-05, "loss": 0.3575, "step": 5501000 }, { "epoch": 37.22864335210048, "grad_norm": 0.4474964737892151, "learning_rate": 4.627713566478995e-05, "loss": 0.3568, "step": 5501500 }, { "epoch": 37.23202685145085, "grad_norm": 0.3891274631023407, "learning_rate": 4.627679731485492e-05, "loss": 0.3588, "step": 5502000 }, { "epoch": 37.23541035080121, "grad_norm": 0.3855583369731903, "learning_rate": 4.6276458964919884e-05, "loss": 0.3585, "step": 5502500 }, { "epoch": 37.23879385015158, "grad_norm": 0.3802156448364258, "learning_rate": 4.627612061498484e-05, "loss": 0.3568, "step": 5503000 }, { "epoch": 37.24217734950195, "grad_norm": 0.3620854616165161, "learning_rate": 4.627578226504981e-05, "loss": 0.3593, "step": 5503500 }, { "epoch": 37.245560848852314, "grad_norm": 0.36845290660858154, "learning_rate": 4.627544391511477e-05, "loss": 0.3579, "step": 5504000 }, { "epoch": 37.248944348202684, "grad_norm": 0.417395681142807, "learning_rate": 4.627510556517973e-05, "loss": 0.3588, "step": 5504500 }, { "epoch": 37.252327847553055, "grad_norm": 0.35247254371643066, "learning_rate": 4.6274767215244695e-05, "loss": 0.3569, "step": 5505000 }, { "epoch": 37.255711346903425, "grad_norm": 0.3863687813282013, "learning_rate": 4.627442886530966e-05, "loss": 0.3572, "step": 5505500 }, { "epoch": 37.25909484625379, "grad_norm": 0.4075227379798889, "learning_rate": 4.6274090515374626e-05, "loss": 0.3567, "step": 5506000 }, { "epoch": 37.26247834560416, "grad_norm": 0.3664039075374603, "learning_rate": 4.627375216543959e-05, "loss": 0.3578, "step": 5506500 }, { "epoch": 37.26586184495453, "grad_norm": 0.4030807316303253, "learning_rate": 4.627341381550455e-05, "loss": 0.3579, "step": 5507000 }, { "epoch": 37.26924534430489, "grad_norm": 0.37312668561935425, "learning_rate": 4.627307546556951e-05, "loss": 0.3569, "step": 5507500 }, { "epoch": 37.27262884365526, "grad_norm": 0.4228224456310272, "learning_rate": 4.6272737115634474e-05, "loss": 0.3574, "step": 5508000 }, { "epoch": 37.27601234300563, "grad_norm": 0.36767372488975525, "learning_rate": 4.6272398765699436e-05, "loss": 0.3588, "step": 5508500 }, { "epoch": 37.279395842355996, "grad_norm": 0.399533212184906, "learning_rate": 4.62720604157644e-05, "loss": 0.3591, "step": 5509000 }, { "epoch": 37.282779341706366, "grad_norm": 0.37349948287010193, "learning_rate": 4.627172206582937e-05, "loss": 0.3564, "step": 5509500 }, { "epoch": 37.286162841056736, "grad_norm": 0.3685149550437927, "learning_rate": 4.627138371589433e-05, "loss": 0.3582, "step": 5510000 }, { "epoch": 37.2895463404071, "grad_norm": 0.3562529683113098, "learning_rate": 4.627104536595929e-05, "loss": 0.3584, "step": 5510500 }, { "epoch": 37.29292983975747, "grad_norm": 0.39345937967300415, "learning_rate": 4.6270707016024254e-05, "loss": 0.3565, "step": 5511000 }, { "epoch": 37.29631333910784, "grad_norm": 0.3297092020511627, "learning_rate": 4.627036866608922e-05, "loss": 0.3571, "step": 5511500 }, { "epoch": 37.29969683845821, "grad_norm": 0.3960650563240051, "learning_rate": 4.6270030316154185e-05, "loss": 0.3594, "step": 5512000 }, { "epoch": 37.303080337808574, "grad_norm": 0.3693292438983917, "learning_rate": 4.626969196621914e-05, "loss": 0.359, "step": 5512500 }, { "epoch": 37.306463837158944, "grad_norm": 0.38844481110572815, "learning_rate": 4.626935361628411e-05, "loss": 0.3588, "step": 5513000 }, { "epoch": 37.309847336509314, "grad_norm": 0.3629488945007324, "learning_rate": 4.626901526634907e-05, "loss": 0.3596, "step": 5513500 }, { "epoch": 37.31323083585968, "grad_norm": 0.3812672197818756, "learning_rate": 4.626867691641403e-05, "loss": 0.3587, "step": 5514000 }, { "epoch": 37.31661433521005, "grad_norm": 0.393673837184906, "learning_rate": 4.6268338566478995e-05, "loss": 0.3588, "step": 5514500 }, { "epoch": 37.31999783456042, "grad_norm": 0.39718562364578247, "learning_rate": 4.626800021654396e-05, "loss": 0.3595, "step": 5515000 }, { "epoch": 37.32338133391078, "grad_norm": 0.3568868637084961, "learning_rate": 4.6267661866608926e-05, "loss": 0.3595, "step": 5515500 }, { "epoch": 37.32676483326115, "grad_norm": 0.3998781740665436, "learning_rate": 4.626732351667389e-05, "loss": 0.3572, "step": 5516000 }, { "epoch": 37.33014833261152, "grad_norm": 0.3915063142776489, "learning_rate": 4.626698516673885e-05, "loss": 0.3596, "step": 5516500 }, { "epoch": 37.333531831961885, "grad_norm": 0.39350441098213196, "learning_rate": 4.626664681680381e-05, "loss": 0.3588, "step": 5517000 }, { "epoch": 37.336915331312255, "grad_norm": 0.3588891625404358, "learning_rate": 4.6266308466868775e-05, "loss": 0.3593, "step": 5517500 }, { "epoch": 37.340298830662626, "grad_norm": 0.44149383902549744, "learning_rate": 4.626597011693374e-05, "loss": 0.3585, "step": 5518000 }, { "epoch": 37.343682330012996, "grad_norm": 0.3888978958129883, "learning_rate": 4.62656317669987e-05, "loss": 0.3582, "step": 5518500 }, { "epoch": 37.34706582936336, "grad_norm": 0.35463154315948486, "learning_rate": 4.626529341706367e-05, "loss": 0.3573, "step": 5519000 }, { "epoch": 37.35044932871373, "grad_norm": 0.4043683409690857, "learning_rate": 4.626495506712863e-05, "loss": 0.3585, "step": 5519500 }, { "epoch": 37.3538328280641, "grad_norm": 0.3934733271598816, "learning_rate": 4.626461671719359e-05, "loss": 0.359, "step": 5520000 }, { "epoch": 37.35721632741446, "grad_norm": 0.41046079993247986, "learning_rate": 4.6264278367258554e-05, "loss": 0.3547, "step": 5520500 }, { "epoch": 37.36059982676483, "grad_norm": 0.3889351487159729, "learning_rate": 4.626394001732352e-05, "loss": 0.3581, "step": 5521000 }, { "epoch": 37.3639833261152, "grad_norm": 0.38513049483299255, "learning_rate": 4.6263601667388485e-05, "loss": 0.3601, "step": 5521500 }, { "epoch": 37.36736682546557, "grad_norm": 0.37006130814552307, "learning_rate": 4.626326331745345e-05, "loss": 0.3587, "step": 5522000 }, { "epoch": 37.37075032481594, "grad_norm": 0.4113992154598236, "learning_rate": 4.62629249675184e-05, "loss": 0.3586, "step": 5522500 }, { "epoch": 37.37413382416631, "grad_norm": 0.39319366216659546, "learning_rate": 4.626258661758337e-05, "loss": 0.3593, "step": 5523000 }, { "epoch": 37.37751732351667, "grad_norm": 0.38613831996917725, "learning_rate": 4.6262248267648334e-05, "loss": 0.358, "step": 5523500 }, { "epoch": 37.38090082286704, "grad_norm": 0.38404855132102966, "learning_rate": 4.6261909917713296e-05, "loss": 0.3591, "step": 5524000 }, { "epoch": 37.38428432221741, "grad_norm": 0.34263017773628235, "learning_rate": 4.626157156777826e-05, "loss": 0.3576, "step": 5524500 }, { "epoch": 37.38766782156778, "grad_norm": 0.3522478938102722, "learning_rate": 4.626123321784323e-05, "loss": 0.3589, "step": 5525000 }, { "epoch": 37.391051320918145, "grad_norm": 0.3875865936279297, "learning_rate": 4.626089486790819e-05, "loss": 0.3578, "step": 5525500 }, { "epoch": 37.394434820268515, "grad_norm": 0.38345375657081604, "learning_rate": 4.626055651797315e-05, "loss": 0.3596, "step": 5526000 }, { "epoch": 37.397818319618885, "grad_norm": 0.3821273744106293, "learning_rate": 4.6260218168038113e-05, "loss": 0.3584, "step": 5526500 }, { "epoch": 37.40120181896925, "grad_norm": 0.35929813981056213, "learning_rate": 4.6259879818103076e-05, "loss": 0.359, "step": 5527000 }, { "epoch": 37.40458531831962, "grad_norm": 0.4246380627155304, "learning_rate": 4.625954146816804e-05, "loss": 0.3601, "step": 5527500 }, { "epoch": 37.40796881766999, "grad_norm": 0.36928990483283997, "learning_rate": 4.6259203118233e-05, "loss": 0.3584, "step": 5528000 }, { "epoch": 37.41135231702035, "grad_norm": 0.3711598515510559, "learning_rate": 4.625886476829797e-05, "loss": 0.3566, "step": 5528500 }, { "epoch": 37.41473581637072, "grad_norm": 0.36917001008987427, "learning_rate": 4.625852641836293e-05, "loss": 0.3585, "step": 5529000 }, { "epoch": 37.41811931572109, "grad_norm": 0.4055008590221405, "learning_rate": 4.625818806842789e-05, "loss": 0.3576, "step": 5529500 }, { "epoch": 37.42150281507146, "grad_norm": 0.3711182773113251, "learning_rate": 4.6257849718492855e-05, "loss": 0.3583, "step": 5530000 }, { "epoch": 37.424886314421826, "grad_norm": 0.354253888130188, "learning_rate": 4.6257511368557824e-05, "loss": 0.3593, "step": 5530500 }, { "epoch": 37.4282698137722, "grad_norm": 0.42840898036956787, "learning_rate": 4.6257173018622786e-05, "loss": 0.3596, "step": 5531000 }, { "epoch": 37.43165331312257, "grad_norm": 0.3781728744506836, "learning_rate": 4.625683466868775e-05, "loss": 0.3582, "step": 5531500 }, { "epoch": 37.43503681247293, "grad_norm": 0.3387506604194641, "learning_rate": 4.6256496318752704e-05, "loss": 0.3588, "step": 5532000 }, { "epoch": 37.4384203118233, "grad_norm": 0.41057971119880676, "learning_rate": 4.625615796881767e-05, "loss": 0.3578, "step": 5532500 }, { "epoch": 37.44180381117367, "grad_norm": 0.42533671855926514, "learning_rate": 4.6255819618882635e-05, "loss": 0.3574, "step": 5533000 }, { "epoch": 37.445187310524034, "grad_norm": 0.40534380078315735, "learning_rate": 4.62554812689476e-05, "loss": 0.3592, "step": 5533500 }, { "epoch": 37.448570809874404, "grad_norm": 0.383487343788147, "learning_rate": 4.625514291901256e-05, "loss": 0.3575, "step": 5534000 }, { "epoch": 37.451954309224774, "grad_norm": 0.3784342110157013, "learning_rate": 4.625480456907753e-05, "loss": 0.3605, "step": 5534500 }, { "epoch": 37.45533780857514, "grad_norm": 0.43512246012687683, "learning_rate": 4.625446621914249e-05, "loss": 0.3586, "step": 5535000 }, { "epoch": 37.45872130792551, "grad_norm": 0.3840678036212921, "learning_rate": 4.625412786920745e-05, "loss": 0.3586, "step": 5535500 }, { "epoch": 37.46210480727588, "grad_norm": 0.3802643120288849, "learning_rate": 4.6253789519272414e-05, "loss": 0.3581, "step": 5536000 }, { "epoch": 37.46548830662625, "grad_norm": 0.3913117051124573, "learning_rate": 4.6253451169337376e-05, "loss": 0.3592, "step": 5536500 }, { "epoch": 37.46887180597661, "grad_norm": 0.3788554072380066, "learning_rate": 4.625311281940234e-05, "loss": 0.3595, "step": 5537000 }, { "epoch": 37.47225530532698, "grad_norm": 0.36486807465553284, "learning_rate": 4.62527744694673e-05, "loss": 0.3591, "step": 5537500 }, { "epoch": 37.47563880467735, "grad_norm": 0.40812861919403076, "learning_rate": 4.625243611953227e-05, "loss": 0.3581, "step": 5538000 }, { "epoch": 37.479022304027716, "grad_norm": 0.4144463837146759, "learning_rate": 4.625209776959723e-05, "loss": 0.3577, "step": 5538500 }, { "epoch": 37.482405803378086, "grad_norm": 0.3697092831134796, "learning_rate": 4.6251759419662194e-05, "loss": 0.3579, "step": 5539000 }, { "epoch": 37.485789302728456, "grad_norm": 0.35993775725364685, "learning_rate": 4.6251421069727156e-05, "loss": 0.3603, "step": 5539500 }, { "epoch": 37.48917280207882, "grad_norm": 0.4362202286720276, "learning_rate": 4.6251082719792125e-05, "loss": 0.3577, "step": 5540000 }, { "epoch": 37.49255630142919, "grad_norm": 0.3813544511795044, "learning_rate": 4.625074436985709e-05, "loss": 0.3581, "step": 5540500 }, { "epoch": 37.49593980077956, "grad_norm": 0.42652133107185364, "learning_rate": 4.625040601992205e-05, "loss": 0.3588, "step": 5541000 }, { "epoch": 37.49932330012992, "grad_norm": 0.38198423385620117, "learning_rate": 4.6250067669987004e-05, "loss": 0.3574, "step": 5541500 }, { "epoch": 37.50270679948029, "grad_norm": 0.36867615580558777, "learning_rate": 4.624972932005197e-05, "loss": 0.3584, "step": 5542000 }, { "epoch": 37.506090298830664, "grad_norm": 0.39017850160598755, "learning_rate": 4.6249390970116935e-05, "loss": 0.3577, "step": 5542500 }, { "epoch": 37.509473798181034, "grad_norm": 0.39716485142707825, "learning_rate": 4.62490526201819e-05, "loss": 0.3582, "step": 5543000 }, { "epoch": 37.5128572975314, "grad_norm": 0.3819413185119629, "learning_rate": 4.624871427024686e-05, "loss": 0.3577, "step": 5543500 }, { "epoch": 37.51624079688177, "grad_norm": 0.38019052147865295, "learning_rate": 4.624837592031183e-05, "loss": 0.3591, "step": 5544000 }, { "epoch": 37.51962429623214, "grad_norm": 0.4178309440612793, "learning_rate": 4.624803757037679e-05, "loss": 0.3591, "step": 5544500 }, { "epoch": 37.5230077955825, "grad_norm": 0.40726998448371887, "learning_rate": 4.624769922044175e-05, "loss": 0.3598, "step": 5545000 }, { "epoch": 37.52639129493287, "grad_norm": 0.3306938707828522, "learning_rate": 4.6247360870506715e-05, "loss": 0.3591, "step": 5545500 }, { "epoch": 37.52977479428324, "grad_norm": 0.412689208984375, "learning_rate": 4.624702252057168e-05, "loss": 0.3584, "step": 5546000 }, { "epoch": 37.533158293633605, "grad_norm": 0.38152435421943665, "learning_rate": 4.624668417063664e-05, "loss": 0.3586, "step": 5546500 }, { "epoch": 37.536541792983975, "grad_norm": 0.3947616219520569, "learning_rate": 4.62463458207016e-05, "loss": 0.3595, "step": 5547000 }, { "epoch": 37.539925292334345, "grad_norm": 0.3962426483631134, "learning_rate": 4.624600747076657e-05, "loss": 0.3577, "step": 5547500 }, { "epoch": 37.54330879168471, "grad_norm": 0.3332233130931854, "learning_rate": 4.624566912083153e-05, "loss": 0.3577, "step": 5548000 }, { "epoch": 37.54669229103508, "grad_norm": 0.37939968705177307, "learning_rate": 4.6245330770896494e-05, "loss": 0.3598, "step": 5548500 }, { "epoch": 37.55007579038545, "grad_norm": 0.3684322237968445, "learning_rate": 4.6244992420961456e-05, "loss": 0.3572, "step": 5549000 }, { "epoch": 37.55345928973582, "grad_norm": 0.38885024189949036, "learning_rate": 4.6244654071026425e-05, "loss": 0.3599, "step": 5549500 }, { "epoch": 37.55684278908618, "grad_norm": 0.38561055064201355, "learning_rate": 4.624431572109139e-05, "loss": 0.3592, "step": 5550000 }, { "epoch": 37.56022628843655, "grad_norm": 0.3819288909435272, "learning_rate": 4.624397737115635e-05, "loss": 0.356, "step": 5550500 }, { "epoch": 37.56360978778692, "grad_norm": 0.4337511658668518, "learning_rate": 4.6243639021221305e-05, "loss": 0.3597, "step": 5551000 }, { "epoch": 37.56699328713729, "grad_norm": 0.33733001351356506, "learning_rate": 4.6243300671286274e-05, "loss": 0.3588, "step": 5551500 }, { "epoch": 37.57037678648766, "grad_norm": 0.42798560857772827, "learning_rate": 4.6242962321351236e-05, "loss": 0.3576, "step": 5552000 }, { "epoch": 37.57376028583803, "grad_norm": 0.3651011288166046, "learning_rate": 4.62426239714162e-05, "loss": 0.3592, "step": 5552500 }, { "epoch": 37.57714378518839, "grad_norm": 0.36789029836654663, "learning_rate": 4.624228562148116e-05, "loss": 0.3594, "step": 5553000 }, { "epoch": 37.58052728453876, "grad_norm": 0.36014118790626526, "learning_rate": 4.624194727154613e-05, "loss": 0.3586, "step": 5553500 }, { "epoch": 37.58391078388913, "grad_norm": 0.4079189598560333, "learning_rate": 4.624160892161109e-05, "loss": 0.3582, "step": 5554000 }, { "epoch": 37.5872942832395, "grad_norm": 0.3701742887496948, "learning_rate": 4.624127057167605e-05, "loss": 0.3583, "step": 5554500 }, { "epoch": 37.590677782589864, "grad_norm": 0.39185646176338196, "learning_rate": 4.6240932221741015e-05, "loss": 0.3589, "step": 5555000 }, { "epoch": 37.594061281940235, "grad_norm": 0.3799331486225128, "learning_rate": 4.624059387180598e-05, "loss": 0.3592, "step": 5555500 }, { "epoch": 37.597444781290605, "grad_norm": 0.4082535207271576, "learning_rate": 4.624025552187094e-05, "loss": 0.3575, "step": 5556000 }, { "epoch": 37.60082828064097, "grad_norm": 0.40764573216438293, "learning_rate": 4.62399171719359e-05, "loss": 0.3592, "step": 5556500 }, { "epoch": 37.60421177999134, "grad_norm": 0.41705572605133057, "learning_rate": 4.623957882200087e-05, "loss": 0.3583, "step": 5557000 }, { "epoch": 37.60759527934171, "grad_norm": 0.35448992252349854, "learning_rate": 4.623924047206583e-05, "loss": 0.358, "step": 5557500 }, { "epoch": 37.61097877869207, "grad_norm": 0.4184199869632721, "learning_rate": 4.6238902122130795e-05, "loss": 0.3587, "step": 5558000 }, { "epoch": 37.61436227804244, "grad_norm": 0.3703767657279968, "learning_rate": 4.623856377219576e-05, "loss": 0.3581, "step": 5558500 }, { "epoch": 37.61774577739281, "grad_norm": 0.3824697732925415, "learning_rate": 4.6238225422260726e-05, "loss": 0.3578, "step": 5559000 }, { "epoch": 37.621129276743176, "grad_norm": 0.41411226987838745, "learning_rate": 4.623788707232569e-05, "loss": 0.3595, "step": 5559500 }, { "epoch": 37.624512776093546, "grad_norm": 0.3943541944026947, "learning_rate": 4.623754872239065e-05, "loss": 0.3596, "step": 5560000 }, { "epoch": 37.627896275443916, "grad_norm": 0.37810245156288147, "learning_rate": 4.6237210372455605e-05, "loss": 0.3583, "step": 5560500 }, { "epoch": 37.63127977479429, "grad_norm": 0.40703824162483215, "learning_rate": 4.6236872022520574e-05, "loss": 0.359, "step": 5561000 }, { "epoch": 37.63466327414465, "grad_norm": 0.3871955871582031, "learning_rate": 4.6236533672585536e-05, "loss": 0.3562, "step": 5561500 }, { "epoch": 37.63804677349502, "grad_norm": 0.3976595103740692, "learning_rate": 4.62361953226505e-05, "loss": 0.3599, "step": 5562000 }, { "epoch": 37.64143027284539, "grad_norm": 0.33061501383781433, "learning_rate": 4.623585697271546e-05, "loss": 0.3575, "step": 5562500 }, { "epoch": 37.644813772195754, "grad_norm": 0.36976704001426697, "learning_rate": 4.623551862278043e-05, "loss": 0.3592, "step": 5563000 }, { "epoch": 37.648197271546124, "grad_norm": 0.3802747428417206, "learning_rate": 4.623518027284539e-05, "loss": 0.3567, "step": 5563500 }, { "epoch": 37.651580770896494, "grad_norm": 0.4013938009738922, "learning_rate": 4.6234841922910354e-05, "loss": 0.3573, "step": 5564000 }, { "epoch": 37.65496427024686, "grad_norm": 0.4191991686820984, "learning_rate": 4.6234503572975316e-05, "loss": 0.3586, "step": 5564500 }, { "epoch": 37.65834776959723, "grad_norm": 0.3883405327796936, "learning_rate": 4.623416522304028e-05, "loss": 0.3596, "step": 5565000 }, { "epoch": 37.6617312689476, "grad_norm": 0.41135701537132263, "learning_rate": 4.623382687310524e-05, "loss": 0.3597, "step": 5565500 }, { "epoch": 37.66511476829796, "grad_norm": 0.41621407866477966, "learning_rate": 4.62334885231702e-05, "loss": 0.3583, "step": 5566000 }, { "epoch": 37.66849826764833, "grad_norm": 0.3869550824165344, "learning_rate": 4.623315017323517e-05, "loss": 0.3587, "step": 5566500 }, { "epoch": 37.6718817669987, "grad_norm": 0.39791762828826904, "learning_rate": 4.623281182330013e-05, "loss": 0.3586, "step": 5567000 }, { "epoch": 37.67526526634907, "grad_norm": 0.3453007638454437, "learning_rate": 4.6232473473365095e-05, "loss": 0.3584, "step": 5567500 }, { "epoch": 37.678648765699435, "grad_norm": 0.4014468193054199, "learning_rate": 4.623213512343006e-05, "loss": 0.3577, "step": 5568000 }, { "epoch": 37.682032265049806, "grad_norm": 0.32720428705215454, "learning_rate": 4.623179677349502e-05, "loss": 0.3581, "step": 5568500 }, { "epoch": 37.685415764400176, "grad_norm": 0.4049336612224579, "learning_rate": 4.623145842355999e-05, "loss": 0.3575, "step": 5569000 }, { "epoch": 37.68879926375054, "grad_norm": 0.3947456479072571, "learning_rate": 4.623112007362495e-05, "loss": 0.3583, "step": 5569500 }, { "epoch": 37.69218276310091, "grad_norm": 0.403266966342926, "learning_rate": 4.6230781723689906e-05, "loss": 0.3585, "step": 5570000 }, { "epoch": 37.69556626245128, "grad_norm": 0.36558789014816284, "learning_rate": 4.6230443373754875e-05, "loss": 0.3574, "step": 5570500 }, { "epoch": 37.69894976180164, "grad_norm": 0.3818999230861664, "learning_rate": 4.623010502381984e-05, "loss": 0.3587, "step": 5571000 }, { "epoch": 37.70233326115201, "grad_norm": 0.4195767641067505, "learning_rate": 4.62297666738848e-05, "loss": 0.3588, "step": 5571500 }, { "epoch": 37.70571676050238, "grad_norm": 0.37867316603660583, "learning_rate": 4.622942832394976e-05, "loss": 0.3558, "step": 5572000 }, { "epoch": 37.70910025985275, "grad_norm": 0.38056012988090515, "learning_rate": 4.622908997401473e-05, "loss": 0.3581, "step": 5572500 }, { "epoch": 37.71248375920312, "grad_norm": 0.3594655990600586, "learning_rate": 4.622875162407969e-05, "loss": 0.358, "step": 5573000 }, { "epoch": 37.71586725855349, "grad_norm": 0.3999258279800415, "learning_rate": 4.6228413274144655e-05, "loss": 0.3604, "step": 5573500 }, { "epoch": 37.71925075790386, "grad_norm": 0.4410228729248047, "learning_rate": 4.622807492420962e-05, "loss": 0.359, "step": 5574000 }, { "epoch": 37.72263425725422, "grad_norm": 0.3825768232345581, "learning_rate": 4.6227736574274586e-05, "loss": 0.3585, "step": 5574500 }, { "epoch": 37.72601775660459, "grad_norm": 0.3880165219306946, "learning_rate": 4.622739822433954e-05, "loss": 0.3581, "step": 5575000 }, { "epoch": 37.72940125595496, "grad_norm": 0.4235299229621887, "learning_rate": 4.62270598744045e-05, "loss": 0.3585, "step": 5575500 }, { "epoch": 37.732784755305325, "grad_norm": 0.376895934343338, "learning_rate": 4.6226721524469465e-05, "loss": 0.3585, "step": 5576000 }, { "epoch": 37.736168254655695, "grad_norm": 0.4097041189670563, "learning_rate": 4.6226383174534434e-05, "loss": 0.3581, "step": 5576500 }, { "epoch": 37.739551754006065, "grad_norm": 0.4131355285644531, "learning_rate": 4.6226044824599396e-05, "loss": 0.3591, "step": 5577000 }, { "epoch": 37.74293525335643, "grad_norm": 0.42813947796821594, "learning_rate": 4.622570647466436e-05, "loss": 0.3601, "step": 5577500 }, { "epoch": 37.7463187527068, "grad_norm": 0.38428157567977905, "learning_rate": 4.622536812472932e-05, "loss": 0.3588, "step": 5578000 }, { "epoch": 37.74970225205717, "grad_norm": 0.3647453486919403, "learning_rate": 4.622502977479429e-05, "loss": 0.3563, "step": 5578500 }, { "epoch": 37.75308575140754, "grad_norm": 0.3345412611961365, "learning_rate": 4.622469142485925e-05, "loss": 0.3594, "step": 5579000 }, { "epoch": 37.7564692507579, "grad_norm": 0.4206676483154297, "learning_rate": 4.622435307492421e-05, "loss": 0.3593, "step": 5579500 }, { "epoch": 37.75985275010827, "grad_norm": 0.38542622327804565, "learning_rate": 4.6224014724989176e-05, "loss": 0.3584, "step": 5580000 }, { "epoch": 37.76323624945864, "grad_norm": 0.34646499156951904, "learning_rate": 4.622367637505414e-05, "loss": 0.3587, "step": 5580500 }, { "epoch": 37.766619748809006, "grad_norm": 0.40047022700309753, "learning_rate": 4.62233380251191e-05, "loss": 0.3575, "step": 5581000 }, { "epoch": 37.77000324815938, "grad_norm": 0.4088267385959625, "learning_rate": 4.622299967518406e-05, "loss": 0.3586, "step": 5581500 }, { "epoch": 37.77338674750975, "grad_norm": 0.36530613899230957, "learning_rate": 4.622266132524903e-05, "loss": 0.3598, "step": 5582000 }, { "epoch": 37.77677024686011, "grad_norm": 0.36611422896385193, "learning_rate": 4.622232297531399e-05, "loss": 0.3572, "step": 5582500 }, { "epoch": 37.78015374621048, "grad_norm": 0.4519994854927063, "learning_rate": 4.6221984625378955e-05, "loss": 0.3588, "step": 5583000 }, { "epoch": 37.78353724556085, "grad_norm": 0.37126612663269043, "learning_rate": 4.622164627544392e-05, "loss": 0.3577, "step": 5583500 }, { "epoch": 37.786920744911214, "grad_norm": 0.44382140040397644, "learning_rate": 4.6221307925508886e-05, "loss": 0.3591, "step": 5584000 }, { "epoch": 37.790304244261584, "grad_norm": 0.3470163345336914, "learning_rate": 4.622096957557384e-05, "loss": 0.3584, "step": 5584500 }, { "epoch": 37.793687743611954, "grad_norm": 0.4182018041610718, "learning_rate": 4.6220631225638804e-05, "loss": 0.3598, "step": 5585000 }, { "epoch": 37.797071242962325, "grad_norm": 0.3888076841831207, "learning_rate": 4.6220292875703766e-05, "loss": 0.3588, "step": 5585500 }, { "epoch": 37.80045474231269, "grad_norm": 0.36507245898246765, "learning_rate": 4.6219954525768735e-05, "loss": 0.3601, "step": 5586000 }, { "epoch": 37.80383824166306, "grad_norm": 0.3805876672267914, "learning_rate": 4.62196161758337e-05, "loss": 0.3605, "step": 5586500 }, { "epoch": 37.80722174101343, "grad_norm": 0.42710334062576294, "learning_rate": 4.621927782589866e-05, "loss": 0.3604, "step": 5587000 }, { "epoch": 37.81060524036379, "grad_norm": 0.38477030396461487, "learning_rate": 4.621893947596362e-05, "loss": 0.3581, "step": 5587500 }, { "epoch": 37.81398873971416, "grad_norm": 0.3720863461494446, "learning_rate": 4.621860112602859e-05, "loss": 0.3572, "step": 5588000 }, { "epoch": 37.81737223906453, "grad_norm": 0.3941510319709778, "learning_rate": 4.621826277609355e-05, "loss": 0.3594, "step": 5588500 }, { "epoch": 37.820755738414896, "grad_norm": 0.4163818955421448, "learning_rate": 4.621792442615851e-05, "loss": 0.3605, "step": 5589000 }, { "epoch": 37.824139237765266, "grad_norm": 0.39274489879608154, "learning_rate": 4.6217586076223476e-05, "loss": 0.358, "step": 5589500 }, { "epoch": 37.827522737115636, "grad_norm": 0.3928956687450409, "learning_rate": 4.621724772628844e-05, "loss": 0.3596, "step": 5590000 }, { "epoch": 37.830906236466, "grad_norm": 0.3751726448535919, "learning_rate": 4.62169093763534e-05, "loss": 0.3608, "step": 5590500 }, { "epoch": 37.83428973581637, "grad_norm": 0.35046321153640747, "learning_rate": 4.621657102641836e-05, "loss": 0.3591, "step": 5591000 }, { "epoch": 37.83767323516674, "grad_norm": 0.401225745677948, "learning_rate": 4.621623267648333e-05, "loss": 0.3592, "step": 5591500 }, { "epoch": 37.84105673451711, "grad_norm": 0.42670589685440063, "learning_rate": 4.6215894326548294e-05, "loss": 0.3587, "step": 5592000 }, { "epoch": 37.84444023386747, "grad_norm": 0.3757880628108978, "learning_rate": 4.6215555976613256e-05, "loss": 0.3582, "step": 5592500 }, { "epoch": 37.847823733217844, "grad_norm": 0.3731699287891388, "learning_rate": 4.621521762667822e-05, "loss": 0.3593, "step": 5593000 }, { "epoch": 37.851207232568214, "grad_norm": 0.38897013664245605, "learning_rate": 4.621487927674319e-05, "loss": 0.3591, "step": 5593500 }, { "epoch": 37.85459073191858, "grad_norm": 0.38821354508399963, "learning_rate": 4.621454092680814e-05, "loss": 0.3593, "step": 5594000 }, { "epoch": 37.85797423126895, "grad_norm": 0.3750055134296417, "learning_rate": 4.6214202576873104e-05, "loss": 0.3611, "step": 5594500 }, { "epoch": 37.86135773061932, "grad_norm": 0.4040793180465698, "learning_rate": 4.6213864226938066e-05, "loss": 0.3577, "step": 5595000 }, { "epoch": 37.86474122996968, "grad_norm": 0.3993184566497803, "learning_rate": 4.6213525877003035e-05, "loss": 0.3609, "step": 5595500 }, { "epoch": 37.86812472932005, "grad_norm": 0.39517033100128174, "learning_rate": 4.6213187527068e-05, "loss": 0.359, "step": 5596000 }, { "epoch": 37.87150822867042, "grad_norm": 0.38171622157096863, "learning_rate": 4.621284917713296e-05, "loss": 0.3583, "step": 5596500 }, { "epoch": 37.874891728020785, "grad_norm": 0.39077261090278625, "learning_rate": 4.621251082719792e-05, "loss": 0.3592, "step": 5597000 }, { "epoch": 37.878275227371155, "grad_norm": 0.3833240270614624, "learning_rate": 4.621217247726289e-05, "loss": 0.3582, "step": 5597500 }, { "epoch": 37.881658726721525, "grad_norm": 0.37280595302581787, "learning_rate": 4.621183412732785e-05, "loss": 0.3586, "step": 5598000 }, { "epoch": 37.885042226071896, "grad_norm": 0.3744910657405853, "learning_rate": 4.621149577739281e-05, "loss": 0.3582, "step": 5598500 }, { "epoch": 37.88842572542226, "grad_norm": 0.38365522027015686, "learning_rate": 4.621115742745778e-05, "loss": 0.3587, "step": 5599000 }, { "epoch": 37.89180922477263, "grad_norm": 0.3723243772983551, "learning_rate": 4.621081907752274e-05, "loss": 0.3587, "step": 5599500 }, { "epoch": 37.895192724123, "grad_norm": 0.39694342017173767, "learning_rate": 4.62104807275877e-05, "loss": 0.3589, "step": 5600000 }, { "epoch": 37.89857622347336, "grad_norm": 0.4003523290157318, "learning_rate": 4.621014237765266e-05, "loss": 0.3595, "step": 5600500 }, { "epoch": 37.90195972282373, "grad_norm": 0.3916511535644531, "learning_rate": 4.620980402771763e-05, "loss": 0.3588, "step": 5601000 }, { "epoch": 37.9053432221741, "grad_norm": 0.39069026708602905, "learning_rate": 4.6209465677782594e-05, "loss": 0.3605, "step": 5601500 }, { "epoch": 37.90872672152447, "grad_norm": 0.3300125300884247, "learning_rate": 4.6209127327847556e-05, "loss": 0.3592, "step": 5602000 }, { "epoch": 37.91211022087484, "grad_norm": 0.3769150972366333, "learning_rate": 4.620878897791252e-05, "loss": 0.358, "step": 5602500 }, { "epoch": 37.91549372022521, "grad_norm": 0.406015008687973, "learning_rate": 4.620845062797749e-05, "loss": 0.3595, "step": 5603000 }, { "epoch": 37.91887721957558, "grad_norm": 0.35252100229263306, "learning_rate": 4.620811227804244e-05, "loss": 0.358, "step": 5603500 }, { "epoch": 37.92226071892594, "grad_norm": 0.4023178219795227, "learning_rate": 4.6207773928107405e-05, "loss": 0.3584, "step": 5604000 }, { "epoch": 37.92564421827631, "grad_norm": 0.3610759675502777, "learning_rate": 4.620743557817237e-05, "loss": 0.3579, "step": 5604500 }, { "epoch": 37.92902771762668, "grad_norm": 0.3983677923679352, "learning_rate": 4.6207097228237336e-05, "loss": 0.36, "step": 5605000 }, { "epoch": 37.932411216977044, "grad_norm": 0.40776327252388, "learning_rate": 4.62067588783023e-05, "loss": 0.3603, "step": 5605500 }, { "epoch": 37.935794716327415, "grad_norm": 0.3881959021091461, "learning_rate": 4.620642052836726e-05, "loss": 0.359, "step": 5606000 }, { "epoch": 37.939178215677785, "grad_norm": 0.5547685623168945, "learning_rate": 4.620608217843222e-05, "loss": 0.3591, "step": 5606500 }, { "epoch": 37.94256171502815, "grad_norm": 0.36712491512298584, "learning_rate": 4.620574382849719e-05, "loss": 0.3586, "step": 5607000 }, { "epoch": 37.94594521437852, "grad_norm": 0.403956800699234, "learning_rate": 4.620540547856215e-05, "loss": 0.359, "step": 5607500 }, { "epoch": 37.94932871372889, "grad_norm": 0.38546839356422424, "learning_rate": 4.620506712862711e-05, "loss": 0.3583, "step": 5608000 }, { "epoch": 37.95271221307925, "grad_norm": 0.40115487575531006, "learning_rate": 4.620472877869208e-05, "loss": 0.3576, "step": 5608500 }, { "epoch": 37.95609571242962, "grad_norm": 0.3921334445476532, "learning_rate": 4.620439042875704e-05, "loss": 0.3575, "step": 5609000 }, { "epoch": 37.95947921177999, "grad_norm": 0.3957189917564392, "learning_rate": 4.6204052078822e-05, "loss": 0.3595, "step": 5609500 }, { "epoch": 37.96286271113036, "grad_norm": 0.40347981452941895, "learning_rate": 4.6203713728886964e-05, "loss": 0.3575, "step": 5610000 }, { "epoch": 37.966246210480726, "grad_norm": 0.3771745562553406, "learning_rate": 4.620337537895193e-05, "loss": 0.3584, "step": 5610500 }, { "epoch": 37.969629709831096, "grad_norm": 0.36563006043434143, "learning_rate": 4.6203037029016895e-05, "loss": 0.3587, "step": 5611000 }, { "epoch": 37.97301320918147, "grad_norm": 0.38988545536994934, "learning_rate": 4.620269867908186e-05, "loss": 0.359, "step": 5611500 }, { "epoch": 37.97639670853183, "grad_norm": 0.39158713817596436, "learning_rate": 4.620236032914682e-05, "loss": 0.3576, "step": 5612000 }, { "epoch": 37.9797802078822, "grad_norm": 0.4202806353569031, "learning_rate": 4.620202197921179e-05, "loss": 0.3583, "step": 5612500 }, { "epoch": 37.98316370723257, "grad_norm": 0.3770996332168579, "learning_rate": 4.6201683629276743e-05, "loss": 0.3593, "step": 5613000 }, { "epoch": 37.986547206582934, "grad_norm": 0.3654348850250244, "learning_rate": 4.6201345279341706e-05, "loss": 0.3579, "step": 5613500 }, { "epoch": 37.989930705933304, "grad_norm": 0.3695790469646454, "learning_rate": 4.620100692940667e-05, "loss": 0.3588, "step": 5614000 }, { "epoch": 37.993314205283674, "grad_norm": 0.4236592650413513, "learning_rate": 4.6200668579471637e-05, "loss": 0.3594, "step": 5614500 }, { "epoch": 37.99669770463404, "grad_norm": 0.3716669976711273, "learning_rate": 4.62003302295366e-05, "loss": 0.3599, "step": 5615000 }, { "epoch": 38.0, "eval_accuracy": 0.8631016885684777, "eval_loss": 0.555016815662384, "eval_runtime": 3366.5592, "eval_samples_per_second": 86.362, "eval_steps_per_second": 5.398, "step": 5615488 }, { "epoch": 38.00008120398441, "grad_norm": 0.3898286521434784, "learning_rate": 4.619999187960156e-05, "loss": 0.3583, "step": 5615500 }, { "epoch": 38.00346470333478, "grad_norm": 0.40544092655181885, "learning_rate": 4.619965352966652e-05, "loss": 0.3573, "step": 5616000 }, { "epoch": 38.00684820268515, "grad_norm": 0.4436746835708618, "learning_rate": 4.619931517973149e-05, "loss": 0.3564, "step": 5616500 }, { "epoch": 38.01023170203551, "grad_norm": 0.36663416028022766, "learning_rate": 4.6198976829796454e-05, "loss": 0.3571, "step": 5617000 }, { "epoch": 38.01361520138588, "grad_norm": 0.36764028668403625, "learning_rate": 4.619863847986141e-05, "loss": 0.3577, "step": 5617500 }, { "epoch": 38.01699870073625, "grad_norm": 0.40741071105003357, "learning_rate": 4.619830012992638e-05, "loss": 0.3573, "step": 5618000 }, { "epoch": 38.020382200086615, "grad_norm": 0.40826502442359924, "learning_rate": 4.619796177999134e-05, "loss": 0.3565, "step": 5618500 }, { "epoch": 38.023765699436986, "grad_norm": 0.4195602536201477, "learning_rate": 4.61976234300563e-05, "loss": 0.3555, "step": 5619000 }, { "epoch": 38.027149198787356, "grad_norm": 0.37044769525527954, "learning_rate": 4.6197285080121265e-05, "loss": 0.3564, "step": 5619500 }, { "epoch": 38.03053269813772, "grad_norm": 0.41229209303855896, "learning_rate": 4.6196946730186233e-05, "loss": 0.3561, "step": 5620000 }, { "epoch": 38.03391619748809, "grad_norm": 0.38138049840927124, "learning_rate": 4.6196608380251196e-05, "loss": 0.3558, "step": 5620500 }, { "epoch": 38.03729969683846, "grad_norm": 0.3792511522769928, "learning_rate": 4.619627003031616e-05, "loss": 0.3556, "step": 5621000 }, { "epoch": 38.04068319618882, "grad_norm": 0.3787059783935547, "learning_rate": 4.619593168038112e-05, "loss": 0.3558, "step": 5621500 }, { "epoch": 38.04406669553919, "grad_norm": 0.4135395288467407, "learning_rate": 4.619559333044608e-05, "loss": 0.356, "step": 5622000 }, { "epoch": 38.047450194889564, "grad_norm": 0.36780989170074463, "learning_rate": 4.6195254980511044e-05, "loss": 0.3572, "step": 5622500 }, { "epoch": 38.050833694239934, "grad_norm": 0.41230207681655884, "learning_rate": 4.6194916630576006e-05, "loss": 0.3574, "step": 5623000 }, { "epoch": 38.0542171935903, "grad_norm": 0.3845711350440979, "learning_rate": 4.619457828064097e-05, "loss": 0.3574, "step": 5623500 }, { "epoch": 38.05760069294067, "grad_norm": 0.3832910358905792, "learning_rate": 4.619423993070594e-05, "loss": 0.3569, "step": 5624000 }, { "epoch": 38.06098419229104, "grad_norm": 0.4200763702392578, "learning_rate": 4.61939015807709e-05, "loss": 0.3566, "step": 5624500 }, { "epoch": 38.0643676916414, "grad_norm": 0.38283076882362366, "learning_rate": 4.619356323083586e-05, "loss": 0.3561, "step": 5625000 }, { "epoch": 38.06775119099177, "grad_norm": 0.4039447605609894, "learning_rate": 4.6193224880900824e-05, "loss": 0.3575, "step": 5625500 }, { "epoch": 38.07113469034214, "grad_norm": 0.38887515664100647, "learning_rate": 4.619288653096579e-05, "loss": 0.356, "step": 5626000 }, { "epoch": 38.074518189692505, "grad_norm": 0.3814372420310974, "learning_rate": 4.6192548181030755e-05, "loss": 0.3568, "step": 5626500 }, { "epoch": 38.077901689042875, "grad_norm": 0.38250312209129333, "learning_rate": 4.619220983109571e-05, "loss": 0.3583, "step": 5627000 }, { "epoch": 38.081285188393245, "grad_norm": 0.38772666454315186, "learning_rate": 4.619187148116068e-05, "loss": 0.3569, "step": 5627500 }, { "epoch": 38.08466868774361, "grad_norm": 0.3952089250087738, "learning_rate": 4.619153313122564e-05, "loss": 0.3574, "step": 5628000 }, { "epoch": 38.08805218709398, "grad_norm": 0.3905099630355835, "learning_rate": 4.61911947812906e-05, "loss": 0.357, "step": 5628500 }, { "epoch": 38.09143568644435, "grad_norm": 0.3963543176651001, "learning_rate": 4.6190856431355565e-05, "loss": 0.3582, "step": 5629000 }, { "epoch": 38.09481918579472, "grad_norm": 0.35863327980041504, "learning_rate": 4.6190518081420534e-05, "loss": 0.357, "step": 5629500 }, { "epoch": 38.09820268514508, "grad_norm": 0.40665847063064575, "learning_rate": 4.6190179731485496e-05, "loss": 0.3588, "step": 5630000 }, { "epoch": 38.10158618449545, "grad_norm": 0.4140339195728302, "learning_rate": 4.618984138155046e-05, "loss": 0.3569, "step": 5630500 }, { "epoch": 38.10496968384582, "grad_norm": 0.3830562233924866, "learning_rate": 4.618950303161542e-05, "loss": 0.3578, "step": 5631000 }, { "epoch": 38.108353183196186, "grad_norm": 0.40696558356285095, "learning_rate": 4.618916468168038e-05, "loss": 0.357, "step": 5631500 }, { "epoch": 38.11173668254656, "grad_norm": 0.36331936717033386, "learning_rate": 4.6188826331745345e-05, "loss": 0.3578, "step": 5632000 }, { "epoch": 38.11512018189693, "grad_norm": 0.3589300215244293, "learning_rate": 4.618848798181031e-05, "loss": 0.3576, "step": 5632500 }, { "epoch": 38.11850368124729, "grad_norm": 0.4246826469898224, "learning_rate": 4.618814963187527e-05, "loss": 0.3567, "step": 5633000 }, { "epoch": 38.12188718059766, "grad_norm": 0.3818008601665497, "learning_rate": 4.618781128194024e-05, "loss": 0.3595, "step": 5633500 }, { "epoch": 38.12527067994803, "grad_norm": 0.35223817825317383, "learning_rate": 4.61874729320052e-05, "loss": 0.3587, "step": 5634000 }, { "epoch": 38.1286541792984, "grad_norm": 0.38550788164138794, "learning_rate": 4.618713458207016e-05, "loss": 0.3569, "step": 5634500 }, { "epoch": 38.132037678648764, "grad_norm": 0.41353002190589905, "learning_rate": 4.6186796232135124e-05, "loss": 0.3584, "step": 5635000 }, { "epoch": 38.135421177999135, "grad_norm": 0.38820987939834595, "learning_rate": 4.618645788220009e-05, "loss": 0.3587, "step": 5635500 }, { "epoch": 38.138804677349505, "grad_norm": 0.37804484367370605, "learning_rate": 4.6186119532265055e-05, "loss": 0.3543, "step": 5636000 }, { "epoch": 38.14218817669987, "grad_norm": 0.417277067899704, "learning_rate": 4.618578118233002e-05, "loss": 0.3578, "step": 5636500 }, { "epoch": 38.14557167605024, "grad_norm": 0.3772396147251129, "learning_rate": 4.618544283239498e-05, "loss": 0.358, "step": 5637000 }, { "epoch": 38.14895517540061, "grad_norm": 0.37377408146858215, "learning_rate": 4.618510448245994e-05, "loss": 0.3583, "step": 5637500 }, { "epoch": 38.15233867475097, "grad_norm": 0.3786788284778595, "learning_rate": 4.6184766132524904e-05, "loss": 0.3574, "step": 5638000 }, { "epoch": 38.15572217410134, "grad_norm": 0.37169507145881653, "learning_rate": 4.6184427782589866e-05, "loss": 0.3566, "step": 5638500 }, { "epoch": 38.15910567345171, "grad_norm": 0.4147394597530365, "learning_rate": 4.618408943265483e-05, "loss": 0.3591, "step": 5639000 }, { "epoch": 38.162489172802076, "grad_norm": 0.41069290041923523, "learning_rate": 4.61837510827198e-05, "loss": 0.3589, "step": 5639500 }, { "epoch": 38.165872672152446, "grad_norm": 0.41666343808174133, "learning_rate": 4.618341273278476e-05, "loss": 0.357, "step": 5640000 }, { "epoch": 38.169256171502816, "grad_norm": 0.40683019161224365, "learning_rate": 4.618307438284972e-05, "loss": 0.3577, "step": 5640500 }, { "epoch": 38.17263967085319, "grad_norm": 0.4103277921676636, "learning_rate": 4.618273603291468e-05, "loss": 0.3571, "step": 5641000 }, { "epoch": 38.17602317020355, "grad_norm": 0.43209654092788696, "learning_rate": 4.6182397682979645e-05, "loss": 0.3574, "step": 5641500 }, { "epoch": 38.17940666955392, "grad_norm": 0.3668835461139679, "learning_rate": 4.618205933304461e-05, "loss": 0.3572, "step": 5642000 }, { "epoch": 38.18279016890429, "grad_norm": 0.39525651931762695, "learning_rate": 4.618172098310957e-05, "loss": 0.3568, "step": 5642500 }, { "epoch": 38.18617366825465, "grad_norm": 0.42401039600372314, "learning_rate": 4.618138263317454e-05, "loss": 0.3569, "step": 5643000 }, { "epoch": 38.189557167605024, "grad_norm": 0.38974061608314514, "learning_rate": 4.61810442832395e-05, "loss": 0.3587, "step": 5643500 }, { "epoch": 38.192940666955394, "grad_norm": 0.39442285895347595, "learning_rate": 4.618070593330446e-05, "loss": 0.3578, "step": 5644000 }, { "epoch": 38.19632416630576, "grad_norm": 0.38561609387397766, "learning_rate": 4.6180367583369425e-05, "loss": 0.3584, "step": 5644500 }, { "epoch": 38.19970766565613, "grad_norm": 0.34974971413612366, "learning_rate": 4.6180029233434394e-05, "loss": 0.3586, "step": 5645000 }, { "epoch": 38.2030911650065, "grad_norm": 0.418800950050354, "learning_rate": 4.6179690883499356e-05, "loss": 0.3595, "step": 5645500 }, { "epoch": 38.20647466435686, "grad_norm": 0.3942648470401764, "learning_rate": 4.617935253356432e-05, "loss": 0.3571, "step": 5646000 }, { "epoch": 38.20985816370723, "grad_norm": 0.4242751896381378, "learning_rate": 4.617901418362927e-05, "loss": 0.357, "step": 5646500 }, { "epoch": 38.2132416630576, "grad_norm": 0.3806268572807312, "learning_rate": 4.617867583369424e-05, "loss": 0.3572, "step": 5647000 }, { "epoch": 38.21662516240797, "grad_norm": 0.3610764443874359, "learning_rate": 4.6178337483759204e-05, "loss": 0.3581, "step": 5647500 }, { "epoch": 38.220008661758335, "grad_norm": 0.41190919280052185, "learning_rate": 4.6177999133824166e-05, "loss": 0.3581, "step": 5648000 }, { "epoch": 38.223392161108706, "grad_norm": 0.3944419026374817, "learning_rate": 4.617766078388913e-05, "loss": 0.3578, "step": 5648500 }, { "epoch": 38.226775660459076, "grad_norm": 0.3749035596847534, "learning_rate": 4.61773224339541e-05, "loss": 0.358, "step": 5649000 }, { "epoch": 38.23015915980944, "grad_norm": 0.3959389925003052, "learning_rate": 4.617698408401906e-05, "loss": 0.3575, "step": 5649500 }, { "epoch": 38.23354265915981, "grad_norm": 0.3524419367313385, "learning_rate": 4.617664573408402e-05, "loss": 0.358, "step": 5650000 }, { "epoch": 38.23692615851018, "grad_norm": 0.3562764525413513, "learning_rate": 4.6176307384148984e-05, "loss": 0.3579, "step": 5650500 }, { "epoch": 38.24030965786054, "grad_norm": 0.3809186816215515, "learning_rate": 4.6175969034213946e-05, "loss": 0.3574, "step": 5651000 }, { "epoch": 38.24369315721091, "grad_norm": 0.35034775733947754, "learning_rate": 4.617563068427891e-05, "loss": 0.3579, "step": 5651500 }, { "epoch": 38.24707665656128, "grad_norm": 0.4001205563545227, "learning_rate": 4.617529233434387e-05, "loss": 0.3593, "step": 5652000 }, { "epoch": 38.25046015591165, "grad_norm": 0.39731815457344055, "learning_rate": 4.617495398440884e-05, "loss": 0.3583, "step": 5652500 }, { "epoch": 38.25384365526202, "grad_norm": 0.4180459976196289, "learning_rate": 4.61746156344738e-05, "loss": 0.356, "step": 5653000 }, { "epoch": 38.25722715461239, "grad_norm": 0.37674039602279663, "learning_rate": 4.617427728453876e-05, "loss": 0.3577, "step": 5653500 }, { "epoch": 38.26061065396276, "grad_norm": 0.39728453755378723, "learning_rate": 4.6173938934603725e-05, "loss": 0.357, "step": 5654000 }, { "epoch": 38.26399415331312, "grad_norm": 0.3983825147151947, "learning_rate": 4.6173600584668694e-05, "loss": 0.3581, "step": 5654500 }, { "epoch": 38.26737765266349, "grad_norm": 0.4299536943435669, "learning_rate": 4.6173262234733656e-05, "loss": 0.3582, "step": 5655000 }, { "epoch": 38.27076115201386, "grad_norm": 0.3940950632095337, "learning_rate": 4.617292388479862e-05, "loss": 0.3586, "step": 5655500 }, { "epoch": 38.274144651364225, "grad_norm": 0.3991442620754242, "learning_rate": 4.6172585534863574e-05, "loss": 0.3569, "step": 5656000 }, { "epoch": 38.277528150714595, "grad_norm": 0.38127321004867554, "learning_rate": 4.617224718492854e-05, "loss": 0.3594, "step": 5656500 }, { "epoch": 38.280911650064965, "grad_norm": 0.3947940766811371, "learning_rate": 4.6171908834993505e-05, "loss": 0.3593, "step": 5657000 }, { "epoch": 38.28429514941533, "grad_norm": 0.3717426359653473, "learning_rate": 4.617157048505847e-05, "loss": 0.3558, "step": 5657500 }, { "epoch": 38.2876786487657, "grad_norm": 0.3773467540740967, "learning_rate": 4.617123213512343e-05, "loss": 0.3566, "step": 5658000 }, { "epoch": 38.29106214811607, "grad_norm": 0.4220793843269348, "learning_rate": 4.61708937851884e-05, "loss": 0.3576, "step": 5658500 }, { "epoch": 38.29444564746644, "grad_norm": 0.35687562823295593, "learning_rate": 4.617055543525336e-05, "loss": 0.3586, "step": 5659000 }, { "epoch": 38.2978291468168, "grad_norm": 0.3728513717651367, "learning_rate": 4.617021708531832e-05, "loss": 0.3564, "step": 5659500 }, { "epoch": 38.30121264616717, "grad_norm": 0.38042473793029785, "learning_rate": 4.6169878735383284e-05, "loss": 0.3594, "step": 5660000 }, { "epoch": 38.30459614551754, "grad_norm": 0.37338787317276, "learning_rate": 4.6169540385448247e-05, "loss": 0.3576, "step": 5660500 }, { "epoch": 38.307979644867906, "grad_norm": 0.415351539850235, "learning_rate": 4.616920203551321e-05, "loss": 0.3584, "step": 5661000 }, { "epoch": 38.31136314421828, "grad_norm": 0.37263545393943787, "learning_rate": 4.616886368557817e-05, "loss": 0.3596, "step": 5661500 }, { "epoch": 38.31474664356865, "grad_norm": 0.38477614521980286, "learning_rate": 4.616852533564314e-05, "loss": 0.3592, "step": 5662000 }, { "epoch": 38.31813014291901, "grad_norm": 0.4276152551174164, "learning_rate": 4.61681869857081e-05, "loss": 0.3585, "step": 5662500 }, { "epoch": 38.32151364226938, "grad_norm": 0.39438101649284363, "learning_rate": 4.6167848635773064e-05, "loss": 0.3593, "step": 5663000 }, { "epoch": 38.32489714161975, "grad_norm": 0.37863606214523315, "learning_rate": 4.6167510285838026e-05, "loss": 0.3587, "step": 5663500 }, { "epoch": 38.328280640970114, "grad_norm": 0.3537881672382355, "learning_rate": 4.6167171935902995e-05, "loss": 0.3571, "step": 5664000 }, { "epoch": 38.331664140320484, "grad_norm": 0.3838760256767273, "learning_rate": 4.616683358596796e-05, "loss": 0.3577, "step": 5664500 }, { "epoch": 38.335047639670854, "grad_norm": 0.37645915150642395, "learning_rate": 4.616649523603292e-05, "loss": 0.3585, "step": 5665000 }, { "epoch": 38.338431139021225, "grad_norm": 0.4026859700679779, "learning_rate": 4.6166156886097875e-05, "loss": 0.3574, "step": 5665500 }, { "epoch": 38.34181463837159, "grad_norm": 0.3878239393234253, "learning_rate": 4.6165818536162843e-05, "loss": 0.3569, "step": 5666000 }, { "epoch": 38.34519813772196, "grad_norm": 0.39249539375305176, "learning_rate": 4.6165480186227806e-05, "loss": 0.3577, "step": 5666500 }, { "epoch": 38.34858163707233, "grad_norm": 0.3605285584926605, "learning_rate": 4.616514183629277e-05, "loss": 0.3588, "step": 5667000 }, { "epoch": 38.35196513642269, "grad_norm": 0.3994273841381073, "learning_rate": 4.616480348635773e-05, "loss": 0.357, "step": 5667500 }, { "epoch": 38.35534863577306, "grad_norm": 0.37711766362190247, "learning_rate": 4.61644651364227e-05, "loss": 0.3578, "step": 5668000 }, { "epoch": 38.35873213512343, "grad_norm": 0.40630966424942017, "learning_rate": 4.616412678648766e-05, "loss": 0.3572, "step": 5668500 }, { "epoch": 38.362115634473795, "grad_norm": 0.3620378375053406, "learning_rate": 4.616378843655262e-05, "loss": 0.3581, "step": 5669000 }, { "epoch": 38.365499133824166, "grad_norm": 0.393038809299469, "learning_rate": 4.6163450086617585e-05, "loss": 0.3582, "step": 5669500 }, { "epoch": 38.368882633174536, "grad_norm": 0.39178842306137085, "learning_rate": 4.616311173668255e-05, "loss": 0.3585, "step": 5670000 }, { "epoch": 38.3722661325249, "grad_norm": 0.34970539808273315, "learning_rate": 4.616277338674751e-05, "loss": 0.3586, "step": 5670500 }, { "epoch": 38.37564963187527, "grad_norm": 0.36897364258766174, "learning_rate": 4.616243503681247e-05, "loss": 0.3582, "step": 5671000 }, { "epoch": 38.37903313122564, "grad_norm": 0.41955244541168213, "learning_rate": 4.616209668687744e-05, "loss": 0.3583, "step": 5671500 }, { "epoch": 38.38241663057601, "grad_norm": 0.42534422874450684, "learning_rate": 4.61617583369424e-05, "loss": 0.358, "step": 5672000 }, { "epoch": 38.38580012992637, "grad_norm": 0.3807324469089508, "learning_rate": 4.6161419987007365e-05, "loss": 0.3576, "step": 5672500 }, { "epoch": 38.389183629276744, "grad_norm": 0.3953002691268921, "learning_rate": 4.616108163707233e-05, "loss": 0.3572, "step": 5673000 }, { "epoch": 38.392567128627114, "grad_norm": 0.40438657999038696, "learning_rate": 4.6160743287137296e-05, "loss": 0.3572, "step": 5673500 }, { "epoch": 38.39595062797748, "grad_norm": 0.3939213454723358, "learning_rate": 4.616040493720226e-05, "loss": 0.3587, "step": 5674000 }, { "epoch": 38.39933412732785, "grad_norm": 0.4456295669078827, "learning_rate": 4.616006658726722e-05, "loss": 0.3577, "step": 5674500 }, { "epoch": 38.40271762667822, "grad_norm": 0.40989065170288086, "learning_rate": 4.6159728237332175e-05, "loss": 0.3587, "step": 5675000 }, { "epoch": 38.40610112602858, "grad_norm": 0.38084176182746887, "learning_rate": 4.6159389887397144e-05, "loss": 0.3581, "step": 5675500 }, { "epoch": 38.40948462537895, "grad_norm": 0.4279657006263733, "learning_rate": 4.6159051537462106e-05, "loss": 0.3579, "step": 5676000 }, { "epoch": 38.41286812472932, "grad_norm": 0.3909102976322174, "learning_rate": 4.615871318752707e-05, "loss": 0.3588, "step": 5676500 }, { "epoch": 38.416251624079685, "grad_norm": 0.3725705146789551, "learning_rate": 4.615837483759203e-05, "loss": 0.3583, "step": 5677000 }, { "epoch": 38.419635123430055, "grad_norm": 0.35240438580513, "learning_rate": 4.6158036487657e-05, "loss": 0.3591, "step": 5677500 }, { "epoch": 38.423018622780425, "grad_norm": 0.36960190534591675, "learning_rate": 4.615769813772196e-05, "loss": 0.3589, "step": 5678000 }, { "epoch": 38.426402122130796, "grad_norm": 0.3883328437805176, "learning_rate": 4.6157359787786924e-05, "loss": 0.356, "step": 5678500 }, { "epoch": 38.42978562148116, "grad_norm": 0.3890812397003174, "learning_rate": 4.6157021437851886e-05, "loss": 0.3582, "step": 5679000 }, { "epoch": 38.43316912083153, "grad_norm": 0.39880093932151794, "learning_rate": 4.615668308791685e-05, "loss": 0.3573, "step": 5679500 }, { "epoch": 38.4365526201819, "grad_norm": 0.41489291191101074, "learning_rate": 4.615634473798181e-05, "loss": 0.3578, "step": 5680000 }, { "epoch": 38.43993611953226, "grad_norm": 0.3967892527580261, "learning_rate": 4.615600638804677e-05, "loss": 0.3572, "step": 5680500 }, { "epoch": 38.44331961888263, "grad_norm": 0.40296727418899536, "learning_rate": 4.615566803811174e-05, "loss": 0.3582, "step": 5681000 }, { "epoch": 38.446703118233, "grad_norm": 0.41405507922172546, "learning_rate": 4.61553296881767e-05, "loss": 0.3592, "step": 5681500 }, { "epoch": 38.450086617583366, "grad_norm": 0.4172917604446411, "learning_rate": 4.6154991338241665e-05, "loss": 0.3579, "step": 5682000 }, { "epoch": 38.45347011693374, "grad_norm": 0.4117705821990967, "learning_rate": 4.615465298830663e-05, "loss": 0.3582, "step": 5682500 }, { "epoch": 38.45685361628411, "grad_norm": 0.33144837617874146, "learning_rate": 4.6154314638371596e-05, "loss": 0.3583, "step": 5683000 }, { "epoch": 38.46023711563447, "grad_norm": 0.3817470371723175, "learning_rate": 4.615397628843656e-05, "loss": 0.3581, "step": 5683500 }, { "epoch": 38.46362061498484, "grad_norm": 0.3916853964328766, "learning_rate": 4.615363793850152e-05, "loss": 0.3586, "step": 5684000 }, { "epoch": 38.46700411433521, "grad_norm": 0.3973696827888489, "learning_rate": 4.6153299588566476e-05, "loss": 0.3584, "step": 5684500 }, { "epoch": 38.47038761368558, "grad_norm": 0.34616851806640625, "learning_rate": 4.6152961238631445e-05, "loss": 0.3589, "step": 5685000 }, { "epoch": 38.473771113035944, "grad_norm": 0.41244304180145264, "learning_rate": 4.615262288869641e-05, "loss": 0.3575, "step": 5685500 }, { "epoch": 38.477154612386315, "grad_norm": 0.37640896439552307, "learning_rate": 4.615228453876137e-05, "loss": 0.357, "step": 5686000 }, { "epoch": 38.480538111736685, "grad_norm": 0.3661516606807709, "learning_rate": 4.615194618882633e-05, "loss": 0.3585, "step": 5686500 }, { "epoch": 38.48392161108705, "grad_norm": 0.42485180497169495, "learning_rate": 4.61516078388913e-05, "loss": 0.3578, "step": 5687000 }, { "epoch": 38.48730511043742, "grad_norm": 0.42335501313209534, "learning_rate": 4.615126948895626e-05, "loss": 0.3579, "step": 5687500 }, { "epoch": 38.49068860978779, "grad_norm": 0.4134356677532196, "learning_rate": 4.6150931139021224e-05, "loss": 0.3581, "step": 5688000 }, { "epoch": 38.49407210913815, "grad_norm": 0.40912774205207825, "learning_rate": 4.6150592789086186e-05, "loss": 0.3569, "step": 5688500 }, { "epoch": 38.49745560848852, "grad_norm": 0.36274468898773193, "learning_rate": 4.6150254439151155e-05, "loss": 0.358, "step": 5689000 }, { "epoch": 38.50083910783889, "grad_norm": 0.3722778558731079, "learning_rate": 4.614991608921611e-05, "loss": 0.3583, "step": 5689500 }, { "epoch": 38.50422260718926, "grad_norm": 0.3648729622364044, "learning_rate": 4.614957773928107e-05, "loss": 0.3582, "step": 5690000 }, { "epoch": 38.507606106539626, "grad_norm": 0.3988613486289978, "learning_rate": 4.614923938934604e-05, "loss": 0.3578, "step": 5690500 }, { "epoch": 38.510989605889996, "grad_norm": 0.381133109331131, "learning_rate": 4.6148901039411004e-05, "loss": 0.3601, "step": 5691000 }, { "epoch": 38.51437310524037, "grad_norm": 0.3979582190513611, "learning_rate": 4.6148562689475966e-05, "loss": 0.3596, "step": 5691500 }, { "epoch": 38.51775660459073, "grad_norm": 0.388432115316391, "learning_rate": 4.614822433954093e-05, "loss": 0.3594, "step": 5692000 }, { "epoch": 38.5211401039411, "grad_norm": 0.33916813135147095, "learning_rate": 4.614788598960589e-05, "loss": 0.3587, "step": 5692500 }, { "epoch": 38.52452360329147, "grad_norm": 0.3934556841850281, "learning_rate": 4.614754763967086e-05, "loss": 0.3562, "step": 5693000 }, { "epoch": 38.527907102641834, "grad_norm": 0.35447025299072266, "learning_rate": 4.614720928973582e-05, "loss": 0.3577, "step": 5693500 }, { "epoch": 38.531290601992204, "grad_norm": 0.3884115517139435, "learning_rate": 4.6146870939800776e-05, "loss": 0.3573, "step": 5694000 }, { "epoch": 38.534674101342574, "grad_norm": 0.4255782961845398, "learning_rate": 4.6146532589865745e-05, "loss": 0.3586, "step": 5694500 }, { "epoch": 38.53805760069294, "grad_norm": 0.3719644546508789, "learning_rate": 4.614619423993071e-05, "loss": 0.359, "step": 5695000 }, { "epoch": 38.54144110004331, "grad_norm": 0.3874768316745758, "learning_rate": 4.614585588999567e-05, "loss": 0.3575, "step": 5695500 }, { "epoch": 38.54482459939368, "grad_norm": 0.368914395570755, "learning_rate": 4.614551754006063e-05, "loss": 0.3582, "step": 5696000 }, { "epoch": 38.54820809874405, "grad_norm": 0.37411215901374817, "learning_rate": 4.61451791901256e-05, "loss": 0.3602, "step": 5696500 }, { "epoch": 38.55159159809441, "grad_norm": 0.36497390270233154, "learning_rate": 4.614484084019056e-05, "loss": 0.3589, "step": 5697000 }, { "epoch": 38.55497509744478, "grad_norm": 0.37230151891708374, "learning_rate": 4.6144502490255525e-05, "loss": 0.3587, "step": 5697500 }, { "epoch": 38.55835859679515, "grad_norm": 0.4010688066482544, "learning_rate": 4.614416414032049e-05, "loss": 0.357, "step": 5698000 }, { "epoch": 38.561742096145515, "grad_norm": 0.4175252914428711, "learning_rate": 4.6143825790385456e-05, "loss": 0.3591, "step": 5698500 }, { "epoch": 38.565125595495886, "grad_norm": 0.3658173680305481, "learning_rate": 4.614348744045041e-05, "loss": 0.3575, "step": 5699000 }, { "epoch": 38.568509094846256, "grad_norm": 0.4055701792240143, "learning_rate": 4.614314909051537e-05, "loss": 0.3583, "step": 5699500 }, { "epoch": 38.57189259419662, "grad_norm": 0.414473295211792, "learning_rate": 4.614281074058034e-05, "loss": 0.3573, "step": 5700000 }, { "epoch": 38.57527609354699, "grad_norm": 0.373854398727417, "learning_rate": 4.6142472390645304e-05, "loss": 0.3588, "step": 5700500 }, { "epoch": 38.57865959289736, "grad_norm": 0.3989484906196594, "learning_rate": 4.6142134040710266e-05, "loss": 0.3584, "step": 5701000 }, { "epoch": 38.58204309224772, "grad_norm": 0.41586586833000183, "learning_rate": 4.614179569077523e-05, "loss": 0.3563, "step": 5701500 }, { "epoch": 38.58542659159809, "grad_norm": 0.3961929678916931, "learning_rate": 4.614145734084019e-05, "loss": 0.3586, "step": 5702000 }, { "epoch": 38.58881009094846, "grad_norm": 0.39267802238464355, "learning_rate": 4.614111899090516e-05, "loss": 0.3565, "step": 5702500 }, { "epoch": 38.592193590298834, "grad_norm": 0.3943745791912079, "learning_rate": 4.614078064097012e-05, "loss": 0.3583, "step": 5703000 }, { "epoch": 38.5955770896492, "grad_norm": 0.4008505344390869, "learning_rate": 4.614044229103508e-05, "loss": 0.3589, "step": 5703500 }, { "epoch": 38.59896058899957, "grad_norm": 0.382214218378067, "learning_rate": 4.6140103941100046e-05, "loss": 0.359, "step": 5704000 }, { "epoch": 38.60234408834994, "grad_norm": 0.38153693079948425, "learning_rate": 4.613976559116501e-05, "loss": 0.3587, "step": 5704500 }, { "epoch": 38.6057275877003, "grad_norm": 0.3921079635620117, "learning_rate": 4.613942724122997e-05, "loss": 0.3576, "step": 5705000 }, { "epoch": 38.60911108705067, "grad_norm": 0.36310869455337524, "learning_rate": 4.613908889129493e-05, "loss": 0.3576, "step": 5705500 }, { "epoch": 38.61249458640104, "grad_norm": 0.4206840395927429, "learning_rate": 4.61387505413599e-05, "loss": 0.3601, "step": 5706000 }, { "epoch": 38.615878085751405, "grad_norm": 0.3657309412956238, "learning_rate": 4.613841219142486e-05, "loss": 0.3588, "step": 5706500 }, { "epoch": 38.619261585101775, "grad_norm": 0.3499012291431427, "learning_rate": 4.6138073841489826e-05, "loss": 0.359, "step": 5707000 }, { "epoch": 38.622645084452145, "grad_norm": 0.38575655221939087, "learning_rate": 4.613773549155479e-05, "loss": 0.3586, "step": 5707500 }, { "epoch": 38.62602858380251, "grad_norm": 0.42723017930984497, "learning_rate": 4.6137397141619757e-05, "loss": 0.3573, "step": 5708000 }, { "epoch": 38.62941208315288, "grad_norm": 0.4009384214878082, "learning_rate": 4.613705879168471e-05, "loss": 0.3576, "step": 5708500 }, { "epoch": 38.63279558250325, "grad_norm": 0.38194090127944946, "learning_rate": 4.6136720441749674e-05, "loss": 0.36, "step": 5709000 }, { "epoch": 38.63617908185362, "grad_norm": 0.4206542670726776, "learning_rate": 4.6136382091814636e-05, "loss": 0.3581, "step": 5709500 }, { "epoch": 38.63956258120398, "grad_norm": 0.38274261355400085, "learning_rate": 4.6136043741879605e-05, "loss": 0.3574, "step": 5710000 }, { "epoch": 38.64294608055435, "grad_norm": 0.36390283703804016, "learning_rate": 4.613570539194457e-05, "loss": 0.3581, "step": 5710500 }, { "epoch": 38.64632957990472, "grad_norm": 0.35540375113487244, "learning_rate": 4.613536704200953e-05, "loss": 0.3573, "step": 5711000 }, { "epoch": 38.649713079255086, "grad_norm": 0.39186784625053406, "learning_rate": 4.613502869207449e-05, "loss": 0.3583, "step": 5711500 }, { "epoch": 38.65309657860546, "grad_norm": 0.40506142377853394, "learning_rate": 4.613469034213946e-05, "loss": 0.3592, "step": 5712000 }, { "epoch": 38.65648007795583, "grad_norm": 0.4294401705265045, "learning_rate": 4.613435199220442e-05, "loss": 0.3582, "step": 5712500 }, { "epoch": 38.65986357730619, "grad_norm": 0.35989508032798767, "learning_rate": 4.613401364226938e-05, "loss": 0.3594, "step": 5713000 }, { "epoch": 38.66324707665656, "grad_norm": 0.38508445024490356, "learning_rate": 4.613367529233435e-05, "loss": 0.3596, "step": 5713500 }, { "epoch": 38.66663057600693, "grad_norm": 0.3540355861186981, "learning_rate": 4.613333694239931e-05, "loss": 0.3583, "step": 5714000 }, { "epoch": 38.6700140753573, "grad_norm": 0.39032605290412903, "learning_rate": 4.613299859246427e-05, "loss": 0.3589, "step": 5714500 }, { "epoch": 38.673397574707664, "grad_norm": 0.38501599431037903, "learning_rate": 4.613266024252923e-05, "loss": 0.359, "step": 5715000 }, { "epoch": 38.676781074058034, "grad_norm": 0.3840806484222412, "learning_rate": 4.61323218925942e-05, "loss": 0.3594, "step": 5715500 }, { "epoch": 38.680164573408405, "grad_norm": 0.39163729548454285, "learning_rate": 4.6131983542659164e-05, "loss": 0.3584, "step": 5716000 }, { "epoch": 38.68354807275877, "grad_norm": 0.36240720748901367, "learning_rate": 4.6131645192724126e-05, "loss": 0.3586, "step": 5716500 }, { "epoch": 38.68693157210914, "grad_norm": 0.38391172885894775, "learning_rate": 4.613130684278909e-05, "loss": 0.3573, "step": 5717000 }, { "epoch": 38.69031507145951, "grad_norm": 0.3868270218372345, "learning_rate": 4.613096849285406e-05, "loss": 0.3579, "step": 5717500 }, { "epoch": 38.69369857080987, "grad_norm": 0.38391754031181335, "learning_rate": 4.613063014291901e-05, "loss": 0.3573, "step": 5718000 }, { "epoch": 38.69708207016024, "grad_norm": 0.4172001779079437, "learning_rate": 4.6130291792983975e-05, "loss": 0.3598, "step": 5718500 }, { "epoch": 38.70046556951061, "grad_norm": 0.3733766973018646, "learning_rate": 4.612995344304894e-05, "loss": 0.3575, "step": 5719000 }, { "epoch": 38.703849068860976, "grad_norm": 0.39495787024497986, "learning_rate": 4.6129615093113906e-05, "loss": 0.36, "step": 5719500 }, { "epoch": 38.707232568211346, "grad_norm": 0.42287978529930115, "learning_rate": 4.612927674317887e-05, "loss": 0.359, "step": 5720000 }, { "epoch": 38.710616067561716, "grad_norm": 0.40965384244918823, "learning_rate": 4.612893839324383e-05, "loss": 0.3586, "step": 5720500 }, { "epoch": 38.713999566912086, "grad_norm": 0.370764821767807, "learning_rate": 4.612860004330879e-05, "loss": 0.3592, "step": 5721000 }, { "epoch": 38.71738306626245, "grad_norm": 0.3937366008758545, "learning_rate": 4.612826169337376e-05, "loss": 0.359, "step": 5721500 }, { "epoch": 38.72076656561282, "grad_norm": 0.3713039457798004, "learning_rate": 4.612792334343872e-05, "loss": 0.3586, "step": 5722000 }, { "epoch": 38.72415006496319, "grad_norm": 0.4095219075679779, "learning_rate": 4.612758499350368e-05, "loss": 0.3594, "step": 5722500 }, { "epoch": 38.72753356431355, "grad_norm": 0.3711714744567871, "learning_rate": 4.612724664356865e-05, "loss": 0.359, "step": 5723000 }, { "epoch": 38.730917063663924, "grad_norm": 0.3650723695755005, "learning_rate": 4.612690829363361e-05, "loss": 0.3575, "step": 5723500 }, { "epoch": 38.734300563014294, "grad_norm": 0.3415977954864502, "learning_rate": 4.612656994369857e-05, "loss": 0.3586, "step": 5724000 }, { "epoch": 38.73768406236466, "grad_norm": 0.4030567407608032, "learning_rate": 4.6126231593763534e-05, "loss": 0.3585, "step": 5724500 }, { "epoch": 38.74106756171503, "grad_norm": 0.41347163915634155, "learning_rate": 4.61258932438285e-05, "loss": 0.3577, "step": 5725000 }, { "epoch": 38.7444510610654, "grad_norm": 0.37313735485076904, "learning_rate": 4.6125554893893465e-05, "loss": 0.3579, "step": 5725500 }, { "epoch": 38.74783456041576, "grad_norm": 0.37139520049095154, "learning_rate": 4.612521654395843e-05, "loss": 0.3573, "step": 5726000 }, { "epoch": 38.75121805976613, "grad_norm": 0.41949698328971863, "learning_rate": 4.612487819402339e-05, "loss": 0.3581, "step": 5726500 }, { "epoch": 38.7546015591165, "grad_norm": 0.3581236004829407, "learning_rate": 4.612453984408836e-05, "loss": 0.3581, "step": 5727000 }, { "epoch": 38.75798505846687, "grad_norm": 0.37782368063926697, "learning_rate": 4.612420149415331e-05, "loss": 0.3572, "step": 5727500 }, { "epoch": 38.761368557817235, "grad_norm": 0.36826446652412415, "learning_rate": 4.6123863144218275e-05, "loss": 0.3588, "step": 5728000 }, { "epoch": 38.764752057167605, "grad_norm": 0.3888064920902252, "learning_rate": 4.612352479428324e-05, "loss": 0.3573, "step": 5728500 }, { "epoch": 38.768135556517976, "grad_norm": 0.3726211488246918, "learning_rate": 4.6123186444348206e-05, "loss": 0.359, "step": 5729000 }, { "epoch": 38.77151905586834, "grad_norm": 0.3911852240562439, "learning_rate": 4.612284809441317e-05, "loss": 0.3588, "step": 5729500 }, { "epoch": 38.77490255521871, "grad_norm": 0.386538028717041, "learning_rate": 4.612250974447813e-05, "loss": 0.3571, "step": 5730000 }, { "epoch": 38.77828605456908, "grad_norm": 0.35132524371147156, "learning_rate": 4.612217139454309e-05, "loss": 0.36, "step": 5730500 }, { "epoch": 38.78166955391944, "grad_norm": 0.33423352241516113, "learning_rate": 4.612183304460806e-05, "loss": 0.3577, "step": 5731000 }, { "epoch": 38.78505305326981, "grad_norm": 0.40488508343696594, "learning_rate": 4.6121494694673024e-05, "loss": 0.3598, "step": 5731500 }, { "epoch": 38.78843655262018, "grad_norm": 0.35855478048324585, "learning_rate": 4.612115634473798e-05, "loss": 0.3574, "step": 5732000 }, { "epoch": 38.79182005197055, "grad_norm": 0.4208734333515167, "learning_rate": 4.612081799480295e-05, "loss": 0.3575, "step": 5732500 }, { "epoch": 38.79520355132092, "grad_norm": 0.3655511140823364, "learning_rate": 4.612047964486791e-05, "loss": 0.3593, "step": 5733000 }, { "epoch": 38.79858705067129, "grad_norm": 0.43602752685546875, "learning_rate": 4.612014129493287e-05, "loss": 0.3574, "step": 5733500 }, { "epoch": 38.80197055002166, "grad_norm": 0.4224923849105835, "learning_rate": 4.6119802944997834e-05, "loss": 0.359, "step": 5734000 }, { "epoch": 38.80535404937202, "grad_norm": 0.3702527582645416, "learning_rate": 4.61194645950628e-05, "loss": 0.3573, "step": 5734500 }, { "epoch": 38.80873754872239, "grad_norm": 0.4083997905254364, "learning_rate": 4.6119126245127765e-05, "loss": 0.3607, "step": 5735000 }, { "epoch": 38.81212104807276, "grad_norm": 0.37937113642692566, "learning_rate": 4.611878789519273e-05, "loss": 0.3583, "step": 5735500 }, { "epoch": 38.815504547423124, "grad_norm": 0.39841771125793457, "learning_rate": 4.611844954525769e-05, "loss": 0.3573, "step": 5736000 }, { "epoch": 38.818888046773495, "grad_norm": 0.38441547751426697, "learning_rate": 4.611811119532266e-05, "loss": 0.3579, "step": 5736500 }, { "epoch": 38.822271546123865, "grad_norm": 0.4236910939216614, "learning_rate": 4.6117772845387614e-05, "loss": 0.3577, "step": 5737000 }, { "epoch": 38.82565504547423, "grad_norm": 0.38515132665634155, "learning_rate": 4.6117434495452576e-05, "loss": 0.3575, "step": 5737500 }, { "epoch": 38.8290385448246, "grad_norm": 0.39090225100517273, "learning_rate": 4.611709614551754e-05, "loss": 0.3588, "step": 5738000 }, { "epoch": 38.83242204417497, "grad_norm": 0.3756837844848633, "learning_rate": 4.611675779558251e-05, "loss": 0.3586, "step": 5738500 }, { "epoch": 38.83580554352534, "grad_norm": 0.37347716093063354, "learning_rate": 4.611641944564747e-05, "loss": 0.3578, "step": 5739000 }, { "epoch": 38.8391890428757, "grad_norm": 0.3697563707828522, "learning_rate": 4.611608109571243e-05, "loss": 0.3582, "step": 5739500 }, { "epoch": 38.84257254222607, "grad_norm": 0.41285571455955505, "learning_rate": 4.611574274577739e-05, "loss": 0.3592, "step": 5740000 }, { "epoch": 38.84595604157644, "grad_norm": 0.39796262979507446, "learning_rate": 4.611540439584236e-05, "loss": 0.3592, "step": 5740500 }, { "epoch": 38.849339540926806, "grad_norm": 0.3566901981830597, "learning_rate": 4.6115066045907324e-05, "loss": 0.3574, "step": 5741000 }, { "epoch": 38.852723040277176, "grad_norm": 0.3584422469139099, "learning_rate": 4.611472769597228e-05, "loss": 0.3589, "step": 5741500 }, { "epoch": 38.85610653962755, "grad_norm": 0.3689061999320984, "learning_rate": 4.611438934603725e-05, "loss": 0.3588, "step": 5742000 }, { "epoch": 38.85949003897791, "grad_norm": 0.3805975914001465, "learning_rate": 4.611405099610221e-05, "loss": 0.3581, "step": 5742500 }, { "epoch": 38.86287353832828, "grad_norm": 0.3651244342327118, "learning_rate": 4.611371264616717e-05, "loss": 0.3589, "step": 5743000 }, { "epoch": 38.86625703767865, "grad_norm": 0.3986314833164215, "learning_rate": 4.6113374296232135e-05, "loss": 0.3586, "step": 5743500 }, { "epoch": 38.869640537029014, "grad_norm": 0.352322518825531, "learning_rate": 4.6113035946297104e-05, "loss": 0.3601, "step": 5744000 }, { "epoch": 38.873024036379384, "grad_norm": 0.4206475615501404, "learning_rate": 4.6112697596362066e-05, "loss": 0.3575, "step": 5744500 }, { "epoch": 38.876407535729754, "grad_norm": 0.4022883474826813, "learning_rate": 4.611235924642703e-05, "loss": 0.3598, "step": 5745000 }, { "epoch": 38.879791035080125, "grad_norm": 0.40589722990989685, "learning_rate": 4.611202089649199e-05, "loss": 0.3579, "step": 5745500 }, { "epoch": 38.88317453443049, "grad_norm": 0.35741114616394043, "learning_rate": 4.611168254655696e-05, "loss": 0.3579, "step": 5746000 }, { "epoch": 38.88655803378086, "grad_norm": 0.41959348320961, "learning_rate": 4.6111344196621914e-05, "loss": 0.3571, "step": 5746500 }, { "epoch": 38.88994153313123, "grad_norm": 0.37600529193878174, "learning_rate": 4.6111005846686877e-05, "loss": 0.3576, "step": 5747000 }, { "epoch": 38.89332503248159, "grad_norm": 0.40771156549453735, "learning_rate": 4.611066749675184e-05, "loss": 0.3597, "step": 5747500 }, { "epoch": 38.89670853183196, "grad_norm": 0.3621875047683716, "learning_rate": 4.611032914681681e-05, "loss": 0.3575, "step": 5748000 }, { "epoch": 38.90009203118233, "grad_norm": 0.38153213262557983, "learning_rate": 4.610999079688177e-05, "loss": 0.3581, "step": 5748500 }, { "epoch": 38.903475530532695, "grad_norm": 0.36397770047187805, "learning_rate": 4.610965244694673e-05, "loss": 0.3581, "step": 5749000 }, { "epoch": 38.906859029883066, "grad_norm": 0.3855028748512268, "learning_rate": 4.6109314097011694e-05, "loss": 0.3587, "step": 5749500 }, { "epoch": 38.910242529233436, "grad_norm": 0.41015130281448364, "learning_rate": 4.610897574707666e-05, "loss": 0.3593, "step": 5750000 }, { "epoch": 38.9136260285838, "grad_norm": 0.3674156069755554, "learning_rate": 4.6108637397141625e-05, "loss": 0.3582, "step": 5750500 }, { "epoch": 38.91700952793417, "grad_norm": 0.35390201210975647, "learning_rate": 4.610829904720659e-05, "loss": 0.3593, "step": 5751000 }, { "epoch": 38.92039302728454, "grad_norm": 0.3748086094856262, "learning_rate": 4.610796069727155e-05, "loss": 0.3585, "step": 5751500 }, { "epoch": 38.92377652663491, "grad_norm": 0.4166073501110077, "learning_rate": 4.610762234733651e-05, "loss": 0.3599, "step": 5752000 }, { "epoch": 38.92716002598527, "grad_norm": 0.3876727223396301, "learning_rate": 4.6107283997401473e-05, "loss": 0.3587, "step": 5752500 }, { "epoch": 38.93054352533564, "grad_norm": 0.34974271059036255, "learning_rate": 4.6106945647466436e-05, "loss": 0.3578, "step": 5753000 }, { "epoch": 38.933927024686014, "grad_norm": 0.3858926594257355, "learning_rate": 4.6106607297531404e-05, "loss": 0.3587, "step": 5753500 }, { "epoch": 38.93731052403638, "grad_norm": 0.38629648089408875, "learning_rate": 4.6106268947596367e-05, "loss": 0.3597, "step": 5754000 }, { "epoch": 38.94069402338675, "grad_norm": 0.3409755229949951, "learning_rate": 4.610593059766133e-05, "loss": 0.3584, "step": 5754500 }, { "epoch": 38.94407752273712, "grad_norm": 0.3903469443321228, "learning_rate": 4.610559224772629e-05, "loss": 0.3588, "step": 5755000 }, { "epoch": 38.94746102208748, "grad_norm": 0.36905232071876526, "learning_rate": 4.610525389779125e-05, "loss": 0.3577, "step": 5755500 }, { "epoch": 38.95084452143785, "grad_norm": 0.3966832458972931, "learning_rate": 4.6104915547856215e-05, "loss": 0.359, "step": 5756000 }, { "epoch": 38.95422802078822, "grad_norm": 0.3584676682949066, "learning_rate": 4.610457719792118e-05, "loss": 0.3601, "step": 5756500 }, { "epoch": 38.957611520138585, "grad_norm": 0.4345291554927826, "learning_rate": 4.610423884798614e-05, "loss": 0.3582, "step": 5757000 }, { "epoch": 38.960995019488955, "grad_norm": 0.4365847706794739, "learning_rate": 4.610390049805111e-05, "loss": 0.3587, "step": 5757500 }, { "epoch": 38.964378518839325, "grad_norm": 0.38714271783828735, "learning_rate": 4.610356214811607e-05, "loss": 0.3589, "step": 5758000 }, { "epoch": 38.967762018189696, "grad_norm": 0.39306461811065674, "learning_rate": 4.610322379818103e-05, "loss": 0.3588, "step": 5758500 }, { "epoch": 38.97114551754006, "grad_norm": 0.37263697385787964, "learning_rate": 4.6102885448245995e-05, "loss": 0.3599, "step": 5759000 }, { "epoch": 38.97452901689043, "grad_norm": 0.40572571754455566, "learning_rate": 4.6102547098310963e-05, "loss": 0.3576, "step": 5759500 }, { "epoch": 38.9779125162408, "grad_norm": 0.33693259954452515, "learning_rate": 4.6102208748375926e-05, "loss": 0.3587, "step": 5760000 }, { "epoch": 38.98129601559116, "grad_norm": 0.39590874314308167, "learning_rate": 4.610187039844089e-05, "loss": 0.3579, "step": 5760500 }, { "epoch": 38.98467951494153, "grad_norm": 0.3590654730796814, "learning_rate": 4.610153204850585e-05, "loss": 0.3581, "step": 5761000 }, { "epoch": 38.9880630142919, "grad_norm": 0.41421273350715637, "learning_rate": 4.610119369857081e-05, "loss": 0.3576, "step": 5761500 }, { "epoch": 38.991446513642266, "grad_norm": 0.3868899345397949, "learning_rate": 4.6100855348635774e-05, "loss": 0.3597, "step": 5762000 }, { "epoch": 38.99483001299264, "grad_norm": 0.38397252559661865, "learning_rate": 4.6100516998700736e-05, "loss": 0.359, "step": 5762500 }, { "epoch": 38.99821351234301, "grad_norm": 0.36752620339393616, "learning_rate": 4.6100178648765705e-05, "loss": 0.3576, "step": 5763000 }, { "epoch": 39.0, "eval_accuracy": 0.8632947080532718, "eval_loss": 0.5549707412719727, "eval_runtime": 3389.5211, "eval_samples_per_second": 85.777, "eval_steps_per_second": 5.361, "step": 5763264 }, { "epoch": 39.00159701169338, "grad_norm": 0.41049081087112427, "learning_rate": 4.609984029883067e-05, "loss": 0.3554, "step": 5763500 }, { "epoch": 39.00498051104374, "grad_norm": 0.38443803787231445, "learning_rate": 4.609950194889563e-05, "loss": 0.3558, "step": 5764000 }, { "epoch": 39.00836401039411, "grad_norm": 0.40071505308151245, "learning_rate": 4.609916359896059e-05, "loss": 0.354, "step": 5764500 }, { "epoch": 39.01174750974448, "grad_norm": 0.3943445682525635, "learning_rate": 4.6098825249025554e-05, "loss": 0.3558, "step": 5765000 }, { "epoch": 39.015131009094844, "grad_norm": 0.3997240662574768, "learning_rate": 4.6098486899090516e-05, "loss": 0.3554, "step": 5765500 }, { "epoch": 39.018514508445215, "grad_norm": 0.40480175614356995, "learning_rate": 4.609814854915548e-05, "loss": 0.3559, "step": 5766000 }, { "epoch": 39.021898007795585, "grad_norm": 0.3974194824695587, "learning_rate": 4.609781019922044e-05, "loss": 0.3558, "step": 5766500 }, { "epoch": 39.02528150714595, "grad_norm": 0.3885852098464966, "learning_rate": 4.609747184928541e-05, "loss": 0.3565, "step": 5767000 }, { "epoch": 39.02866500649632, "grad_norm": 0.3391273617744446, "learning_rate": 4.609713349935037e-05, "loss": 0.3549, "step": 5767500 }, { "epoch": 39.03204850584669, "grad_norm": 0.3764244616031647, "learning_rate": 4.609679514941533e-05, "loss": 0.3564, "step": 5768000 }, { "epoch": 39.03543200519705, "grad_norm": 0.3896447420120239, "learning_rate": 4.6096456799480295e-05, "loss": 0.356, "step": 5768500 }, { "epoch": 39.03881550454742, "grad_norm": 0.3836119472980499, "learning_rate": 4.6096118449545264e-05, "loss": 0.3571, "step": 5769000 }, { "epoch": 39.04219900389779, "grad_norm": 0.37785929441452026, "learning_rate": 4.6095780099610226e-05, "loss": 0.3555, "step": 5769500 }, { "epoch": 39.04558250324816, "grad_norm": 0.3991580605506897, "learning_rate": 4.609544174967519e-05, "loss": 0.3567, "step": 5770000 }, { "epoch": 39.048966002598526, "grad_norm": 0.3892557919025421, "learning_rate": 4.609510339974015e-05, "loss": 0.3566, "step": 5770500 }, { "epoch": 39.052349501948896, "grad_norm": 0.3755117356777191, "learning_rate": 4.609476504980511e-05, "loss": 0.3554, "step": 5771000 }, { "epoch": 39.05573300129927, "grad_norm": 0.3512061834335327, "learning_rate": 4.6094426699870075e-05, "loss": 0.3573, "step": 5771500 }, { "epoch": 39.05911650064963, "grad_norm": 0.3926057517528534, "learning_rate": 4.609408834993504e-05, "loss": 0.3567, "step": 5772000 }, { "epoch": 39.0625, "grad_norm": 0.37486153841018677, "learning_rate": 4.609375e-05, "loss": 0.356, "step": 5772500 }, { "epoch": 39.06588349935037, "grad_norm": 0.37479740381240845, "learning_rate": 4.609341165006497e-05, "loss": 0.3582, "step": 5773000 }, { "epoch": 39.06926699870073, "grad_norm": 0.3782919645309448, "learning_rate": 4.609307330012993e-05, "loss": 0.3559, "step": 5773500 }, { "epoch": 39.072650498051104, "grad_norm": 0.3841499090194702, "learning_rate": 4.609273495019489e-05, "loss": 0.358, "step": 5774000 }, { "epoch": 39.076033997401474, "grad_norm": 0.40607038140296936, "learning_rate": 4.6092396600259854e-05, "loss": 0.3562, "step": 5774500 }, { "epoch": 39.07941749675184, "grad_norm": 0.4377508759498596, "learning_rate": 4.6092058250324816e-05, "loss": 0.3588, "step": 5775000 }, { "epoch": 39.08280099610221, "grad_norm": 0.3485512435436249, "learning_rate": 4.609171990038978e-05, "loss": 0.3562, "step": 5775500 }, { "epoch": 39.08618449545258, "grad_norm": 0.39141741394996643, "learning_rate": 4.609138155045474e-05, "loss": 0.3563, "step": 5776000 }, { "epoch": 39.08956799480295, "grad_norm": 0.3922073245048523, "learning_rate": 4.609104320051971e-05, "loss": 0.3566, "step": 5776500 }, { "epoch": 39.09295149415331, "grad_norm": 0.3608970642089844, "learning_rate": 4.609070485058467e-05, "loss": 0.3575, "step": 5777000 }, { "epoch": 39.09633499350368, "grad_norm": 0.4330408275127411, "learning_rate": 4.6090366500649634e-05, "loss": 0.3566, "step": 5777500 }, { "epoch": 39.09971849285405, "grad_norm": 0.3976117670536041, "learning_rate": 4.6090028150714596e-05, "loss": 0.3569, "step": 5778000 }, { "epoch": 39.103101992204415, "grad_norm": 0.4137636721134186, "learning_rate": 4.6089689800779565e-05, "loss": 0.3559, "step": 5778500 }, { "epoch": 39.106485491554785, "grad_norm": 0.39320191740989685, "learning_rate": 4.608935145084453e-05, "loss": 0.3584, "step": 5779000 }, { "epoch": 39.109868990905156, "grad_norm": 0.3872518539428711, "learning_rate": 4.608901310090949e-05, "loss": 0.3577, "step": 5779500 }, { "epoch": 39.11325249025552, "grad_norm": 0.38965508341789246, "learning_rate": 4.6088674750974444e-05, "loss": 0.3557, "step": 5780000 }, { "epoch": 39.11663598960589, "grad_norm": 0.4170781672000885, "learning_rate": 4.608833640103941e-05, "loss": 0.3561, "step": 5780500 }, { "epoch": 39.12001948895626, "grad_norm": 0.40871477127075195, "learning_rate": 4.6087998051104375e-05, "loss": 0.356, "step": 5781000 }, { "epoch": 39.12340298830662, "grad_norm": 0.36769962310791016, "learning_rate": 4.608765970116934e-05, "loss": 0.3587, "step": 5781500 }, { "epoch": 39.12678648765699, "grad_norm": 0.3955308794975281, "learning_rate": 4.60873213512343e-05, "loss": 0.3587, "step": 5782000 }, { "epoch": 39.13016998700736, "grad_norm": 0.4196022152900696, "learning_rate": 4.608698300129927e-05, "loss": 0.3581, "step": 5782500 }, { "epoch": 39.133553486357734, "grad_norm": 0.37169861793518066, "learning_rate": 4.608664465136423e-05, "loss": 0.3588, "step": 5783000 }, { "epoch": 39.1369369857081, "grad_norm": 0.3570674657821655, "learning_rate": 4.608630630142919e-05, "loss": 0.3591, "step": 5783500 }, { "epoch": 39.14032048505847, "grad_norm": 0.37060561776161194, "learning_rate": 4.6085967951494155e-05, "loss": 0.3563, "step": 5784000 }, { "epoch": 39.14370398440884, "grad_norm": 0.36501386761665344, "learning_rate": 4.608562960155912e-05, "loss": 0.356, "step": 5784500 }, { "epoch": 39.1470874837592, "grad_norm": 0.4009703993797302, "learning_rate": 4.608529125162408e-05, "loss": 0.3573, "step": 5785000 }, { "epoch": 39.15047098310957, "grad_norm": 0.38712480664253235, "learning_rate": 4.608495290168904e-05, "loss": 0.3549, "step": 5785500 }, { "epoch": 39.15385448245994, "grad_norm": 0.3608624041080475, "learning_rate": 4.608461455175401e-05, "loss": 0.3567, "step": 5786000 }, { "epoch": 39.157237981810304, "grad_norm": 0.41465529799461365, "learning_rate": 4.608427620181897e-05, "loss": 0.3581, "step": 5786500 }, { "epoch": 39.160621481160675, "grad_norm": 0.38023123145103455, "learning_rate": 4.6083937851883934e-05, "loss": 0.3575, "step": 5787000 }, { "epoch": 39.164004980511045, "grad_norm": 0.3878611922264099, "learning_rate": 4.6083599501948896e-05, "loss": 0.3588, "step": 5787500 }, { "epoch": 39.167388479861415, "grad_norm": 0.3601565361022949, "learning_rate": 4.6083261152013865e-05, "loss": 0.3565, "step": 5788000 }, { "epoch": 39.17077197921178, "grad_norm": 0.4064596891403198, "learning_rate": 4.608292280207883e-05, "loss": 0.3578, "step": 5788500 }, { "epoch": 39.17415547856215, "grad_norm": 0.35656580328941345, "learning_rate": 4.608258445214379e-05, "loss": 0.3582, "step": 5789000 }, { "epoch": 39.17753897791252, "grad_norm": 0.42319318652153015, "learning_rate": 4.6082246102208745e-05, "loss": 0.3579, "step": 5789500 }, { "epoch": 39.18092247726288, "grad_norm": 0.3610454499721527, "learning_rate": 4.6081907752273714e-05, "loss": 0.3564, "step": 5790000 }, { "epoch": 39.18430597661325, "grad_norm": 0.38486212491989136, "learning_rate": 4.6081569402338676e-05, "loss": 0.3584, "step": 5790500 }, { "epoch": 39.18768947596362, "grad_norm": 0.4174489378929138, "learning_rate": 4.608123105240364e-05, "loss": 0.3564, "step": 5791000 }, { "epoch": 39.191072975313986, "grad_norm": 0.42792096734046936, "learning_rate": 4.60808927024686e-05, "loss": 0.3574, "step": 5791500 }, { "epoch": 39.19445647466436, "grad_norm": 0.3785710632801056, "learning_rate": 4.608055435253357e-05, "loss": 0.3582, "step": 5792000 }, { "epoch": 39.19783997401473, "grad_norm": 0.41874465346336365, "learning_rate": 4.608021600259853e-05, "loss": 0.3571, "step": 5792500 }, { "epoch": 39.20122347336509, "grad_norm": 0.4343903660774231, "learning_rate": 4.607987765266349e-05, "loss": 0.3566, "step": 5793000 }, { "epoch": 39.20460697271546, "grad_norm": 0.37548828125, "learning_rate": 4.6079539302728455e-05, "loss": 0.3576, "step": 5793500 }, { "epoch": 39.20799047206583, "grad_norm": 0.3644872009754181, "learning_rate": 4.607920095279342e-05, "loss": 0.3578, "step": 5794000 }, { "epoch": 39.2113739714162, "grad_norm": 0.4181846082210541, "learning_rate": 4.607886260285838e-05, "loss": 0.3579, "step": 5794500 }, { "epoch": 39.214757470766564, "grad_norm": 0.43744346499443054, "learning_rate": 4.607852425292334e-05, "loss": 0.3556, "step": 5795000 }, { "epoch": 39.218140970116934, "grad_norm": 0.38154008984565735, "learning_rate": 4.607818590298831e-05, "loss": 0.3584, "step": 5795500 }, { "epoch": 39.221524469467305, "grad_norm": 0.42544493079185486, "learning_rate": 4.607784755305327e-05, "loss": 0.3571, "step": 5796000 }, { "epoch": 39.22490796881767, "grad_norm": 0.3827608525753021, "learning_rate": 4.6077509203118235e-05, "loss": 0.3581, "step": 5796500 }, { "epoch": 39.22829146816804, "grad_norm": 0.3791741728782654, "learning_rate": 4.60771708531832e-05, "loss": 0.3574, "step": 5797000 }, { "epoch": 39.23167496751841, "grad_norm": 0.37400010228157043, "learning_rate": 4.6076832503248166e-05, "loss": 0.3568, "step": 5797500 }, { "epoch": 39.23505846686877, "grad_norm": 0.399890273809433, "learning_rate": 4.607649415331313e-05, "loss": 0.3587, "step": 5798000 }, { "epoch": 39.23844196621914, "grad_norm": 0.3848799467086792, "learning_rate": 4.607615580337809e-05, "loss": 0.3586, "step": 5798500 }, { "epoch": 39.24182546556951, "grad_norm": 0.4184007942676544, "learning_rate": 4.6075817453443046e-05, "loss": 0.3565, "step": 5799000 }, { "epoch": 39.245208964919875, "grad_norm": 0.350588858127594, "learning_rate": 4.6075479103508014e-05, "loss": 0.3583, "step": 5799500 }, { "epoch": 39.248592464270246, "grad_norm": 0.4084034860134125, "learning_rate": 4.6075140753572977e-05, "loss": 0.3575, "step": 5800000 }, { "epoch": 39.251975963620616, "grad_norm": 0.4222847819328308, "learning_rate": 4.607480240363794e-05, "loss": 0.3574, "step": 5800500 }, { "epoch": 39.255359462970986, "grad_norm": 0.3579876124858856, "learning_rate": 4.60744640537029e-05, "loss": 0.3587, "step": 5801000 }, { "epoch": 39.25874296232135, "grad_norm": 0.3947008550167084, "learning_rate": 4.607412570376787e-05, "loss": 0.3569, "step": 5801500 }, { "epoch": 39.26212646167172, "grad_norm": 0.3536688983440399, "learning_rate": 4.607378735383283e-05, "loss": 0.3588, "step": 5802000 }, { "epoch": 39.26550996102209, "grad_norm": 0.39786744117736816, "learning_rate": 4.6073449003897794e-05, "loss": 0.359, "step": 5802500 }, { "epoch": 39.26889346037245, "grad_norm": 0.3595603406429291, "learning_rate": 4.6073110653962756e-05, "loss": 0.3583, "step": 5803000 }, { "epoch": 39.272276959722824, "grad_norm": 0.4054912030696869, "learning_rate": 4.6072772304027725e-05, "loss": 0.3577, "step": 5803500 }, { "epoch": 39.275660459073194, "grad_norm": 0.4205889105796814, "learning_rate": 4.607243395409268e-05, "loss": 0.3578, "step": 5804000 }, { "epoch": 39.27904395842356, "grad_norm": 0.40288498997688293, "learning_rate": 4.607209560415764e-05, "loss": 0.357, "step": 5804500 }, { "epoch": 39.28242745777393, "grad_norm": 0.38287627696990967, "learning_rate": 4.607175725422261e-05, "loss": 0.3571, "step": 5805000 }, { "epoch": 39.2858109571243, "grad_norm": 0.3729548752307892, "learning_rate": 4.6071418904287573e-05, "loss": 0.3582, "step": 5805500 }, { "epoch": 39.28919445647466, "grad_norm": 0.3424685299396515, "learning_rate": 4.6071080554352536e-05, "loss": 0.3589, "step": 5806000 }, { "epoch": 39.29257795582503, "grad_norm": 0.37398502230644226, "learning_rate": 4.60707422044175e-05, "loss": 0.3584, "step": 5806500 }, { "epoch": 39.2959614551754, "grad_norm": 0.3575172424316406, "learning_rate": 4.607040385448247e-05, "loss": 0.3567, "step": 5807000 }, { "epoch": 39.29934495452577, "grad_norm": 0.38726699352264404, "learning_rate": 4.607006550454743e-05, "loss": 0.3593, "step": 5807500 }, { "epoch": 39.302728453876135, "grad_norm": 0.401279091835022, "learning_rate": 4.606972715461239e-05, "loss": 0.3595, "step": 5808000 }, { "epoch": 39.306111953226505, "grad_norm": 0.39637458324432373, "learning_rate": 4.6069388804677346e-05, "loss": 0.3583, "step": 5808500 }, { "epoch": 39.309495452576876, "grad_norm": 0.3764081597328186, "learning_rate": 4.6069050454742315e-05, "loss": 0.3587, "step": 5809000 }, { "epoch": 39.31287895192724, "grad_norm": 0.3873443007469177, "learning_rate": 4.606871210480728e-05, "loss": 0.3574, "step": 5809500 }, { "epoch": 39.31626245127761, "grad_norm": 0.40507134795188904, "learning_rate": 4.606837375487224e-05, "loss": 0.3576, "step": 5810000 }, { "epoch": 39.31964595062798, "grad_norm": 0.43546003103256226, "learning_rate": 4.60680354049372e-05, "loss": 0.359, "step": 5810500 }, { "epoch": 39.32302944997834, "grad_norm": 0.41429951786994934, "learning_rate": 4.606769705500217e-05, "loss": 0.3571, "step": 5811000 }, { "epoch": 39.32641294932871, "grad_norm": 0.4261474013328552, "learning_rate": 4.606735870506713e-05, "loss": 0.3575, "step": 5811500 }, { "epoch": 39.32979644867908, "grad_norm": 0.3997849225997925, "learning_rate": 4.6067020355132095e-05, "loss": 0.3575, "step": 5812000 }, { "epoch": 39.33317994802945, "grad_norm": 0.35773953795433044, "learning_rate": 4.606668200519706e-05, "loss": 0.3589, "step": 5812500 }, { "epoch": 39.33656344737982, "grad_norm": 0.38859155774116516, "learning_rate": 4.6066343655262026e-05, "loss": 0.3576, "step": 5813000 }, { "epoch": 39.33994694673019, "grad_norm": 0.37268543243408203, "learning_rate": 4.606600530532698e-05, "loss": 0.3561, "step": 5813500 }, { "epoch": 39.34333044608056, "grad_norm": 0.40817004442214966, "learning_rate": 4.606566695539194e-05, "loss": 0.3598, "step": 5814000 }, { "epoch": 39.34671394543092, "grad_norm": 0.34740012884140015, "learning_rate": 4.606532860545691e-05, "loss": 0.3596, "step": 5814500 }, { "epoch": 39.35009744478129, "grad_norm": 0.42565348744392395, "learning_rate": 4.6064990255521874e-05, "loss": 0.3594, "step": 5815000 }, { "epoch": 39.35348094413166, "grad_norm": 0.40188512206077576, "learning_rate": 4.6064651905586836e-05, "loss": 0.3578, "step": 5815500 }, { "epoch": 39.356864443482024, "grad_norm": 0.4051068127155304, "learning_rate": 4.60643135556518e-05, "loss": 0.3587, "step": 5816000 }, { "epoch": 39.360247942832395, "grad_norm": 0.3960541784763336, "learning_rate": 4.606397520571677e-05, "loss": 0.3595, "step": 5816500 }, { "epoch": 39.363631442182765, "grad_norm": 0.41114065051078796, "learning_rate": 4.606363685578173e-05, "loss": 0.3588, "step": 5817000 }, { "epoch": 39.36701494153313, "grad_norm": 0.3999212384223938, "learning_rate": 4.606329850584669e-05, "loss": 0.3571, "step": 5817500 }, { "epoch": 39.3703984408835, "grad_norm": 0.34364742040634155, "learning_rate": 4.606296015591165e-05, "loss": 0.3587, "step": 5818000 }, { "epoch": 39.37378194023387, "grad_norm": 0.3538402020931244, "learning_rate": 4.6062621805976616e-05, "loss": 0.3583, "step": 5818500 }, { "epoch": 39.37716543958424, "grad_norm": 0.3612450957298279, "learning_rate": 4.606228345604158e-05, "loss": 0.3591, "step": 5819000 }, { "epoch": 39.3805489389346, "grad_norm": 0.366875022649765, "learning_rate": 4.606194510610654e-05, "loss": 0.3573, "step": 5819500 }, { "epoch": 39.38393243828497, "grad_norm": 0.39192113280296326, "learning_rate": 4.60616067561715e-05, "loss": 0.3569, "step": 5820000 }, { "epoch": 39.38731593763534, "grad_norm": 0.37817490100860596, "learning_rate": 4.606126840623647e-05, "loss": 0.3604, "step": 5820500 }, { "epoch": 39.390699436985706, "grad_norm": 0.35173866152763367, "learning_rate": 4.606093005630143e-05, "loss": 0.3584, "step": 5821000 }, { "epoch": 39.394082936336076, "grad_norm": 0.37890276312828064, "learning_rate": 4.6060591706366395e-05, "loss": 0.3584, "step": 5821500 }, { "epoch": 39.39746643568645, "grad_norm": 0.3983881175518036, "learning_rate": 4.606025335643136e-05, "loss": 0.3563, "step": 5822000 }, { "epoch": 39.40084993503681, "grad_norm": 0.3656408190727234, "learning_rate": 4.6059915006496326e-05, "loss": 0.3579, "step": 5822500 }, { "epoch": 39.40423343438718, "grad_norm": 0.40919533371925354, "learning_rate": 4.605957665656128e-05, "loss": 0.3581, "step": 5823000 }, { "epoch": 39.40761693373755, "grad_norm": 0.3984890580177307, "learning_rate": 4.6059238306626244e-05, "loss": 0.3574, "step": 5823500 }, { "epoch": 39.411000433087914, "grad_norm": 0.3947007954120636, "learning_rate": 4.605889995669121e-05, "loss": 0.3586, "step": 5824000 }, { "epoch": 39.414383932438284, "grad_norm": 0.4393731653690338, "learning_rate": 4.6058561606756175e-05, "loss": 0.3568, "step": 5824500 }, { "epoch": 39.417767431788654, "grad_norm": 0.36653101444244385, "learning_rate": 4.605822325682114e-05, "loss": 0.3573, "step": 5825000 }, { "epoch": 39.421150931139024, "grad_norm": 0.37711697816848755, "learning_rate": 4.60578849068861e-05, "loss": 0.3583, "step": 5825500 }, { "epoch": 39.42453443048939, "grad_norm": 0.40406227111816406, "learning_rate": 4.605754655695106e-05, "loss": 0.3575, "step": 5826000 }, { "epoch": 39.42791792983976, "grad_norm": 0.43455639481544495, "learning_rate": 4.605720820701603e-05, "loss": 0.3598, "step": 5826500 }, { "epoch": 39.43130142919013, "grad_norm": 0.3846614360809326, "learning_rate": 4.605686985708099e-05, "loss": 0.3564, "step": 5827000 }, { "epoch": 39.43468492854049, "grad_norm": 0.3586784601211548, "learning_rate": 4.605653150714595e-05, "loss": 0.3582, "step": 5827500 }, { "epoch": 39.43806842789086, "grad_norm": 0.4095541536808014, "learning_rate": 4.6056193157210916e-05, "loss": 0.3582, "step": 5828000 }, { "epoch": 39.44145192724123, "grad_norm": 0.390663206577301, "learning_rate": 4.605585480727588e-05, "loss": 0.3564, "step": 5828500 }, { "epoch": 39.444835426591595, "grad_norm": 0.3315221667289734, "learning_rate": 4.605551645734084e-05, "loss": 0.3577, "step": 5829000 }, { "epoch": 39.448218925941966, "grad_norm": 0.4073394238948822, "learning_rate": 4.60551781074058e-05, "loss": 0.3575, "step": 5829500 }, { "epoch": 39.451602425292336, "grad_norm": 0.41086679697036743, "learning_rate": 4.605483975747077e-05, "loss": 0.3581, "step": 5830000 }, { "epoch": 39.4549859246427, "grad_norm": 0.37248337268829346, "learning_rate": 4.6054501407535734e-05, "loss": 0.357, "step": 5830500 }, { "epoch": 39.45836942399307, "grad_norm": 0.42306119203567505, "learning_rate": 4.6054163057600696e-05, "loss": 0.3563, "step": 5831000 }, { "epoch": 39.46175292334344, "grad_norm": 0.3646513521671295, "learning_rate": 4.605382470766566e-05, "loss": 0.3583, "step": 5831500 }, { "epoch": 39.46513642269381, "grad_norm": 0.33797353506088257, "learning_rate": 4.605348635773063e-05, "loss": 0.3594, "step": 5832000 }, { "epoch": 39.46851992204417, "grad_norm": 0.3744948208332062, "learning_rate": 4.605314800779558e-05, "loss": 0.3579, "step": 5832500 }, { "epoch": 39.47190342139454, "grad_norm": 0.35115155577659607, "learning_rate": 4.6052809657860544e-05, "loss": 0.3574, "step": 5833000 }, { "epoch": 39.475286920744914, "grad_norm": 0.39874720573425293, "learning_rate": 4.605247130792551e-05, "loss": 0.3583, "step": 5833500 }, { "epoch": 39.47867042009528, "grad_norm": 0.3970976769924164, "learning_rate": 4.6052132957990475e-05, "loss": 0.3576, "step": 5834000 }, { "epoch": 39.48205391944565, "grad_norm": 0.4019801914691925, "learning_rate": 4.605179460805544e-05, "loss": 0.3562, "step": 5834500 }, { "epoch": 39.48543741879602, "grad_norm": 0.39601776003837585, "learning_rate": 4.60514562581204e-05, "loss": 0.3571, "step": 5835000 }, { "epoch": 39.48882091814638, "grad_norm": 0.36481037735939026, "learning_rate": 4.605111790818536e-05, "loss": 0.3581, "step": 5835500 }, { "epoch": 39.49220441749675, "grad_norm": 0.39808598160743713, "learning_rate": 4.605077955825033e-05, "loss": 0.3582, "step": 5836000 }, { "epoch": 39.49558791684712, "grad_norm": 0.3782365918159485, "learning_rate": 4.605044120831529e-05, "loss": 0.3591, "step": 5836500 }, { "epoch": 39.49897141619749, "grad_norm": 0.4156523644924164, "learning_rate": 4.605010285838025e-05, "loss": 0.3588, "step": 5837000 }, { "epoch": 39.502354915547855, "grad_norm": 0.33446115255355835, "learning_rate": 4.604976450844522e-05, "loss": 0.3572, "step": 5837500 }, { "epoch": 39.505738414898225, "grad_norm": 0.38550758361816406, "learning_rate": 4.604942615851018e-05, "loss": 0.3572, "step": 5838000 }, { "epoch": 39.509121914248595, "grad_norm": 0.37014317512512207, "learning_rate": 4.604908780857514e-05, "loss": 0.3572, "step": 5838500 }, { "epoch": 39.51250541359896, "grad_norm": 0.39192500710487366, "learning_rate": 4.60487494586401e-05, "loss": 0.3587, "step": 5839000 }, { "epoch": 39.51588891294933, "grad_norm": 0.36324289441108704, "learning_rate": 4.604841110870507e-05, "loss": 0.3582, "step": 5839500 }, { "epoch": 39.5192724122997, "grad_norm": 0.3271288275718689, "learning_rate": 4.6048072758770034e-05, "loss": 0.3581, "step": 5840000 }, { "epoch": 39.52265591165006, "grad_norm": 0.41794437170028687, "learning_rate": 4.6047734408834997e-05, "loss": 0.358, "step": 5840500 }, { "epoch": 39.52603941100043, "grad_norm": 0.391206294298172, "learning_rate": 4.604739605889996e-05, "loss": 0.3582, "step": 5841000 }, { "epoch": 39.5294229103508, "grad_norm": 0.40756654739379883, "learning_rate": 4.604705770896493e-05, "loss": 0.356, "step": 5841500 }, { "epoch": 39.532806409701166, "grad_norm": 0.4058922231197357, "learning_rate": 4.604671935902988e-05, "loss": 0.3588, "step": 5842000 }, { "epoch": 39.53618990905154, "grad_norm": 0.39431220293045044, "learning_rate": 4.6046381009094845e-05, "loss": 0.3574, "step": 5842500 }, { "epoch": 39.53957340840191, "grad_norm": 0.413131445646286, "learning_rate": 4.604604265915981e-05, "loss": 0.3579, "step": 5843000 }, { "epoch": 39.54295690775228, "grad_norm": 0.40306076407432556, "learning_rate": 4.6045704309224776e-05, "loss": 0.3579, "step": 5843500 }, { "epoch": 39.54634040710264, "grad_norm": 0.3776503801345825, "learning_rate": 4.604536595928974e-05, "loss": 0.3572, "step": 5844000 }, { "epoch": 39.54972390645301, "grad_norm": 0.39396798610687256, "learning_rate": 4.60450276093547e-05, "loss": 0.3574, "step": 5844500 }, { "epoch": 39.55310740580338, "grad_norm": 0.38373348116874695, "learning_rate": 4.604468925941966e-05, "loss": 0.359, "step": 5845000 }, { "epoch": 39.556490905153744, "grad_norm": 0.38188958168029785, "learning_rate": 4.604435090948463e-05, "loss": 0.3573, "step": 5845500 }, { "epoch": 39.559874404504114, "grad_norm": 0.36791303753852844, "learning_rate": 4.6044012559549593e-05, "loss": 0.3589, "step": 5846000 }, { "epoch": 39.563257903854485, "grad_norm": 0.39912042021751404, "learning_rate": 4.604367420961455e-05, "loss": 0.3586, "step": 5846500 }, { "epoch": 39.56664140320485, "grad_norm": 0.4015141427516937, "learning_rate": 4.604333585967952e-05, "loss": 0.3575, "step": 5847000 }, { "epoch": 39.57002490255522, "grad_norm": 0.3737492561340332, "learning_rate": 4.604299750974448e-05, "loss": 0.3585, "step": 5847500 }, { "epoch": 39.57340840190559, "grad_norm": 0.3960467278957367, "learning_rate": 4.604265915980944e-05, "loss": 0.3572, "step": 5848000 }, { "epoch": 39.57679190125595, "grad_norm": 0.3899024426937103, "learning_rate": 4.6042320809874404e-05, "loss": 0.359, "step": 5848500 }, { "epoch": 39.58017540060632, "grad_norm": 0.3939670920372009, "learning_rate": 4.604198245993937e-05, "loss": 0.3588, "step": 5849000 }, { "epoch": 39.58355889995669, "grad_norm": 0.3528555631637573, "learning_rate": 4.6041644110004335e-05, "loss": 0.3569, "step": 5849500 }, { "epoch": 39.58694239930706, "grad_norm": 0.3714154362678528, "learning_rate": 4.60413057600693e-05, "loss": 0.3577, "step": 5850000 }, { "epoch": 39.590325898657426, "grad_norm": 0.3541982173919678, "learning_rate": 4.604096741013426e-05, "loss": 0.3575, "step": 5850500 }, { "epoch": 39.593709398007796, "grad_norm": 0.37014955282211304, "learning_rate": 4.604062906019923e-05, "loss": 0.3589, "step": 5851000 }, { "epoch": 39.597092897358166, "grad_norm": 0.3928954303264618, "learning_rate": 4.6040290710264184e-05, "loss": 0.3574, "step": 5851500 }, { "epoch": 39.60047639670853, "grad_norm": 0.42399078607559204, "learning_rate": 4.6039952360329146e-05, "loss": 0.3581, "step": 5852000 }, { "epoch": 39.6038598960589, "grad_norm": 0.42243319749832153, "learning_rate": 4.603961401039411e-05, "loss": 0.3593, "step": 5852500 }, { "epoch": 39.60724339540927, "grad_norm": 0.33981195092201233, "learning_rate": 4.603927566045908e-05, "loss": 0.3581, "step": 5853000 }, { "epoch": 39.61062689475963, "grad_norm": 0.3757496178150177, "learning_rate": 4.603893731052404e-05, "loss": 0.359, "step": 5853500 }, { "epoch": 39.614010394110004, "grad_norm": 0.3754291534423828, "learning_rate": 4.6038598960589e-05, "loss": 0.3585, "step": 5854000 }, { "epoch": 39.617393893460374, "grad_norm": 0.3909532427787781, "learning_rate": 4.603826061065396e-05, "loss": 0.3572, "step": 5854500 }, { "epoch": 39.62077739281074, "grad_norm": 0.39325129985809326, "learning_rate": 4.603792226071893e-05, "loss": 0.3589, "step": 5855000 }, { "epoch": 39.62416089216111, "grad_norm": 0.36456966400146484, "learning_rate": 4.6037583910783894e-05, "loss": 0.3593, "step": 5855500 }, { "epoch": 39.62754439151148, "grad_norm": 0.3960898518562317, "learning_rate": 4.603724556084885e-05, "loss": 0.3565, "step": 5856000 }, { "epoch": 39.63092789086185, "grad_norm": 0.38639023900032043, "learning_rate": 4.603690721091382e-05, "loss": 0.3591, "step": 5856500 }, { "epoch": 39.63431139021221, "grad_norm": 0.35006192326545715, "learning_rate": 4.603656886097878e-05, "loss": 0.3584, "step": 5857000 }, { "epoch": 39.63769488956258, "grad_norm": 0.3980305790901184, "learning_rate": 4.603623051104374e-05, "loss": 0.3571, "step": 5857500 }, { "epoch": 39.64107838891295, "grad_norm": 0.34487035870552063, "learning_rate": 4.6035892161108705e-05, "loss": 0.3575, "step": 5858000 }, { "epoch": 39.644461888263315, "grad_norm": 0.3834744691848755, "learning_rate": 4.6035553811173674e-05, "loss": 0.3568, "step": 5858500 }, { "epoch": 39.647845387613685, "grad_norm": 0.38060876727104187, "learning_rate": 4.6035215461238636e-05, "loss": 0.3582, "step": 5859000 }, { "epoch": 39.651228886964056, "grad_norm": 0.41033247113227844, "learning_rate": 4.60348771113036e-05, "loss": 0.3558, "step": 5859500 }, { "epoch": 39.65461238631442, "grad_norm": 0.37758803367614746, "learning_rate": 4.603453876136856e-05, "loss": 0.3577, "step": 5860000 }, { "epoch": 39.65799588566479, "grad_norm": 0.40897002816200256, "learning_rate": 4.603420041143353e-05, "loss": 0.3582, "step": 5860500 }, { "epoch": 39.66137938501516, "grad_norm": 0.4154495298862457, "learning_rate": 4.6033862061498484e-05, "loss": 0.3566, "step": 5861000 }, { "epoch": 39.66476288436553, "grad_norm": 0.3547229766845703, "learning_rate": 4.6033523711563446e-05, "loss": 0.3573, "step": 5861500 }, { "epoch": 39.66814638371589, "grad_norm": 0.3655273914337158, "learning_rate": 4.603318536162841e-05, "loss": 0.3564, "step": 5862000 }, { "epoch": 39.67152988306626, "grad_norm": 0.36885225772857666, "learning_rate": 4.603284701169338e-05, "loss": 0.3578, "step": 5862500 }, { "epoch": 39.674913382416634, "grad_norm": 0.40368983149528503, "learning_rate": 4.603250866175834e-05, "loss": 0.3582, "step": 5863000 }, { "epoch": 39.678296881767, "grad_norm": 0.4202343225479126, "learning_rate": 4.60321703118233e-05, "loss": 0.3591, "step": 5863500 }, { "epoch": 39.68168038111737, "grad_norm": 0.43563011288642883, "learning_rate": 4.6031831961888264e-05, "loss": 0.3589, "step": 5864000 }, { "epoch": 39.68506388046774, "grad_norm": 0.3813161551952362, "learning_rate": 4.603149361195323e-05, "loss": 0.3573, "step": 5864500 }, { "epoch": 39.6884473798181, "grad_norm": 0.38616743683815, "learning_rate": 4.6031155262018195e-05, "loss": 0.3568, "step": 5865000 }, { "epoch": 39.69183087916847, "grad_norm": 0.4203009605407715, "learning_rate": 4.603081691208316e-05, "loss": 0.3577, "step": 5865500 }, { "epoch": 39.69521437851884, "grad_norm": 0.3902522027492523, "learning_rate": 4.603047856214812e-05, "loss": 0.3584, "step": 5866000 }, { "epoch": 39.698597877869204, "grad_norm": 0.3417580723762512, "learning_rate": 4.603014021221308e-05, "loss": 0.357, "step": 5866500 }, { "epoch": 39.701981377219575, "grad_norm": 0.3956816792488098, "learning_rate": 4.602980186227804e-05, "loss": 0.3575, "step": 5867000 }, { "epoch": 39.705364876569945, "grad_norm": 0.35489580035209656, "learning_rate": 4.6029463512343005e-05, "loss": 0.3579, "step": 5867500 }, { "epoch": 39.708748375920315, "grad_norm": 0.4248609244823456, "learning_rate": 4.6029125162407974e-05, "loss": 0.3583, "step": 5868000 }, { "epoch": 39.71213187527068, "grad_norm": 0.4011983871459961, "learning_rate": 4.6028786812472936e-05, "loss": 0.3573, "step": 5868500 }, { "epoch": 39.71551537462105, "grad_norm": 0.35149234533309937, "learning_rate": 4.60284484625379e-05, "loss": 0.3587, "step": 5869000 }, { "epoch": 39.71889887397142, "grad_norm": 0.4021625220775604, "learning_rate": 4.602811011260286e-05, "loss": 0.3574, "step": 5869500 }, { "epoch": 39.72228237332178, "grad_norm": 0.36163243651390076, "learning_rate": 4.602777176266783e-05, "loss": 0.3574, "step": 5870000 }, { "epoch": 39.72566587267215, "grad_norm": 0.35306787490844727, "learning_rate": 4.6027433412732785e-05, "loss": 0.3554, "step": 5870500 }, { "epoch": 39.72904937202252, "grad_norm": 0.39128831028938293, "learning_rate": 4.602709506279775e-05, "loss": 0.3591, "step": 5871000 }, { "epoch": 39.732432871372886, "grad_norm": 0.4118329882621765, "learning_rate": 4.602675671286271e-05, "loss": 0.3592, "step": 5871500 }, { "epoch": 39.735816370723256, "grad_norm": 0.3582545220851898, "learning_rate": 4.602641836292768e-05, "loss": 0.357, "step": 5872000 }, { "epoch": 39.73919987007363, "grad_norm": 0.38006383180618286, "learning_rate": 4.602608001299264e-05, "loss": 0.3593, "step": 5872500 }, { "epoch": 39.74258336942399, "grad_norm": 0.3745099604129791, "learning_rate": 4.60257416630576e-05, "loss": 0.3572, "step": 5873000 }, { "epoch": 39.74596686877436, "grad_norm": 0.4437626898288727, "learning_rate": 4.6025403313122564e-05, "loss": 0.3579, "step": 5873500 }, { "epoch": 39.74935036812473, "grad_norm": 0.4171718657016754, "learning_rate": 4.602506496318753e-05, "loss": 0.3578, "step": 5874000 }, { "epoch": 39.7527338674751, "grad_norm": 0.3632790446281433, "learning_rate": 4.6024726613252495e-05, "loss": 0.359, "step": 5874500 }, { "epoch": 39.756117366825464, "grad_norm": 0.3919455409049988, "learning_rate": 4.602438826331746e-05, "loss": 0.3597, "step": 5875000 }, { "epoch": 39.759500866175834, "grad_norm": 0.3734685778617859, "learning_rate": 4.602404991338242e-05, "loss": 0.3582, "step": 5875500 }, { "epoch": 39.762884365526205, "grad_norm": 0.39074575901031494, "learning_rate": 4.602371156344738e-05, "loss": 0.3575, "step": 5876000 }, { "epoch": 39.76626786487657, "grad_norm": 0.40093353390693665, "learning_rate": 4.6023373213512344e-05, "loss": 0.3578, "step": 5876500 }, { "epoch": 39.76965136422694, "grad_norm": 0.36939117312431335, "learning_rate": 4.6023034863577306e-05, "loss": 0.3587, "step": 5877000 }, { "epoch": 39.77303486357731, "grad_norm": 0.38103702664375305, "learning_rate": 4.6022696513642275e-05, "loss": 0.3577, "step": 5877500 }, { "epoch": 39.77641836292767, "grad_norm": 0.41622504591941833, "learning_rate": 4.602235816370724e-05, "loss": 0.3577, "step": 5878000 }, { "epoch": 39.77980186227804, "grad_norm": 0.3693433403968811, "learning_rate": 4.60220198137722e-05, "loss": 0.3567, "step": 5878500 }, { "epoch": 39.78318536162841, "grad_norm": 0.38066574931144714, "learning_rate": 4.602168146383716e-05, "loss": 0.3599, "step": 5879000 }, { "epoch": 39.786568860978775, "grad_norm": 0.3881836533546448, "learning_rate": 4.602134311390213e-05, "loss": 0.3581, "step": 5879500 }, { "epoch": 39.789952360329146, "grad_norm": 0.4086085855960846, "learning_rate": 4.6021004763967085e-05, "loss": 0.3588, "step": 5880000 }, { "epoch": 39.793335859679516, "grad_norm": 0.3807792663574219, "learning_rate": 4.602066641403205e-05, "loss": 0.357, "step": 5880500 }, { "epoch": 39.796719359029886, "grad_norm": 0.4019613265991211, "learning_rate": 4.602032806409701e-05, "loss": 0.3576, "step": 5881000 }, { "epoch": 39.80010285838025, "grad_norm": 0.3932480812072754, "learning_rate": 4.601998971416198e-05, "loss": 0.3586, "step": 5881500 }, { "epoch": 39.80348635773062, "grad_norm": 0.37599310278892517, "learning_rate": 4.601965136422694e-05, "loss": 0.3572, "step": 5882000 }, { "epoch": 39.80686985708099, "grad_norm": 0.39159277081489563, "learning_rate": 4.60193130142919e-05, "loss": 0.3583, "step": 5882500 }, { "epoch": 39.81025335643135, "grad_norm": 0.4587567448616028, "learning_rate": 4.6018974664356865e-05, "loss": 0.3584, "step": 5883000 }, { "epoch": 39.81363685578172, "grad_norm": 0.39030855894088745, "learning_rate": 4.6018636314421834e-05, "loss": 0.3587, "step": 5883500 }, { "epoch": 39.817020355132094, "grad_norm": 0.4061824381351471, "learning_rate": 4.6018297964486796e-05, "loss": 0.3577, "step": 5884000 }, { "epoch": 39.82040385448246, "grad_norm": 0.42569318413734436, "learning_rate": 4.601795961455176e-05, "loss": 0.3584, "step": 5884500 }, { "epoch": 39.82378735383283, "grad_norm": 0.3514672517776489, "learning_rate": 4.601762126461672e-05, "loss": 0.3574, "step": 5885000 }, { "epoch": 39.8271708531832, "grad_norm": 0.3994062542915344, "learning_rate": 4.601728291468168e-05, "loss": 0.3577, "step": 5885500 }, { "epoch": 39.83055435253357, "grad_norm": 0.4034961760044098, "learning_rate": 4.6016944564746644e-05, "loss": 0.3597, "step": 5886000 }, { "epoch": 39.83393785188393, "grad_norm": 0.3779377043247223, "learning_rate": 4.6016606214811607e-05, "loss": 0.3581, "step": 5886500 }, { "epoch": 39.8373213512343, "grad_norm": 0.3766578435897827, "learning_rate": 4.6016267864876575e-05, "loss": 0.3566, "step": 5887000 }, { "epoch": 39.84070485058467, "grad_norm": 0.3871261775493622, "learning_rate": 4.601592951494154e-05, "loss": 0.358, "step": 5887500 }, { "epoch": 39.844088349935035, "grad_norm": 0.3668980896472931, "learning_rate": 4.60155911650065e-05, "loss": 0.3587, "step": 5888000 }, { "epoch": 39.847471849285405, "grad_norm": 0.3791254460811615, "learning_rate": 4.601525281507146e-05, "loss": 0.3595, "step": 5888500 }, { "epoch": 39.850855348635775, "grad_norm": 0.36987370252609253, "learning_rate": 4.6014914465136424e-05, "loss": 0.3596, "step": 5889000 }, { "epoch": 39.85423884798614, "grad_norm": 0.40789470076560974, "learning_rate": 4.6014576115201386e-05, "loss": 0.359, "step": 5889500 }, { "epoch": 39.85762234733651, "grad_norm": 0.37992024421691895, "learning_rate": 4.601423776526635e-05, "loss": 0.3591, "step": 5890000 }, { "epoch": 39.86100584668688, "grad_norm": 0.4102227985858917, "learning_rate": 4.601389941533131e-05, "loss": 0.3583, "step": 5890500 }, { "epoch": 39.86438934603724, "grad_norm": 0.3742344379425049, "learning_rate": 4.601356106539628e-05, "loss": 0.3578, "step": 5891000 }, { "epoch": 39.86777284538761, "grad_norm": 0.3808158040046692, "learning_rate": 4.601322271546124e-05, "loss": 0.3584, "step": 5891500 }, { "epoch": 39.87115634473798, "grad_norm": 0.4250912368297577, "learning_rate": 4.6012884365526203e-05, "loss": 0.3565, "step": 5892000 }, { "epoch": 39.87453984408835, "grad_norm": 0.4150397777557373, "learning_rate": 4.6012546015591166e-05, "loss": 0.3582, "step": 5892500 }, { "epoch": 39.87792334343872, "grad_norm": 0.3577214181423187, "learning_rate": 4.6012207665656134e-05, "loss": 0.3573, "step": 5893000 }, { "epoch": 39.88130684278909, "grad_norm": 0.34287115931510925, "learning_rate": 4.6011869315721097e-05, "loss": 0.3598, "step": 5893500 }, { "epoch": 39.88469034213946, "grad_norm": 0.376375287771225, "learning_rate": 4.601153096578606e-05, "loss": 0.3564, "step": 5894000 }, { "epoch": 39.88807384148982, "grad_norm": 0.41835808753967285, "learning_rate": 4.601119261585102e-05, "loss": 0.3583, "step": 5894500 }, { "epoch": 39.89145734084019, "grad_norm": 0.41685935854911804, "learning_rate": 4.601085426591598e-05, "loss": 0.3583, "step": 5895000 }, { "epoch": 39.89484084019056, "grad_norm": 0.40916863083839417, "learning_rate": 4.6010515915980945e-05, "loss": 0.3566, "step": 5895500 }, { "epoch": 39.898224339540924, "grad_norm": 0.3984031677246094, "learning_rate": 4.601017756604591e-05, "loss": 0.3588, "step": 5896000 }, { "epoch": 39.901607838891294, "grad_norm": 0.40788960456848145, "learning_rate": 4.600983921611087e-05, "loss": 0.3585, "step": 5896500 }, { "epoch": 39.904991338241665, "grad_norm": 0.37437763810157776, "learning_rate": 4.600950086617584e-05, "loss": 0.3578, "step": 5897000 }, { "epoch": 39.90837483759203, "grad_norm": 0.4370589554309845, "learning_rate": 4.60091625162408e-05, "loss": 0.3579, "step": 5897500 }, { "epoch": 39.9117583369424, "grad_norm": 0.38210275769233704, "learning_rate": 4.600882416630576e-05, "loss": 0.3576, "step": 5898000 }, { "epoch": 39.91514183629277, "grad_norm": 0.37423208355903625, "learning_rate": 4.6008485816370725e-05, "loss": 0.3585, "step": 5898500 }, { "epoch": 39.91852533564314, "grad_norm": 0.42241284251213074, "learning_rate": 4.600814746643569e-05, "loss": 0.3578, "step": 5899000 }, { "epoch": 39.9219088349935, "grad_norm": 0.39159727096557617, "learning_rate": 4.600780911650065e-05, "loss": 0.3573, "step": 5899500 }, { "epoch": 39.92529233434387, "grad_norm": 0.3790798485279083, "learning_rate": 4.600747076656561e-05, "loss": 0.3588, "step": 5900000 }, { "epoch": 39.92867583369424, "grad_norm": 0.3681636452674866, "learning_rate": 4.600713241663058e-05, "loss": 0.359, "step": 5900500 }, { "epoch": 39.932059333044606, "grad_norm": 0.38505762815475464, "learning_rate": 4.600679406669554e-05, "loss": 0.3593, "step": 5901000 }, { "epoch": 39.935442832394976, "grad_norm": 0.35872504115104675, "learning_rate": 4.6006455716760504e-05, "loss": 0.3578, "step": 5901500 }, { "epoch": 39.93882633174535, "grad_norm": 0.415998250246048, "learning_rate": 4.6006117366825466e-05, "loss": 0.3584, "step": 5902000 }, { "epoch": 39.94220983109571, "grad_norm": 0.33244240283966064, "learning_rate": 4.6005779016890435e-05, "loss": 0.3567, "step": 5902500 }, { "epoch": 39.94559333044608, "grad_norm": 0.36787545680999756, "learning_rate": 4.60054406669554e-05, "loss": 0.3571, "step": 5903000 }, { "epoch": 39.94897682979645, "grad_norm": 0.4008660614490509, "learning_rate": 4.600510231702036e-05, "loss": 0.3595, "step": 5903500 }, { "epoch": 39.95236032914681, "grad_norm": 0.36828315258026123, "learning_rate": 4.600476396708532e-05, "loss": 0.359, "step": 5904000 }, { "epoch": 39.955743828497184, "grad_norm": 0.4211144745349884, "learning_rate": 4.6004425617150284e-05, "loss": 0.3586, "step": 5904500 }, { "epoch": 39.959127327847554, "grad_norm": 0.39619413018226624, "learning_rate": 4.6004087267215246e-05, "loss": 0.3592, "step": 5905000 }, { "epoch": 39.962510827197924, "grad_norm": 0.39225244522094727, "learning_rate": 4.600374891728021e-05, "loss": 0.3595, "step": 5905500 }, { "epoch": 39.96589432654829, "grad_norm": 0.3748459815979004, "learning_rate": 4.600341056734517e-05, "loss": 0.3582, "step": 5906000 }, { "epoch": 39.96927782589866, "grad_norm": 0.427299439907074, "learning_rate": 4.600307221741014e-05, "loss": 0.3594, "step": 5906500 }, { "epoch": 39.97266132524903, "grad_norm": 0.3732565641403198, "learning_rate": 4.60027338674751e-05, "loss": 0.3567, "step": 5907000 }, { "epoch": 39.97604482459939, "grad_norm": 0.4486788809299469, "learning_rate": 4.600239551754006e-05, "loss": 0.3587, "step": 5907500 }, { "epoch": 39.97942832394976, "grad_norm": 0.40454939007759094, "learning_rate": 4.6002057167605025e-05, "loss": 0.3593, "step": 5908000 }, { "epoch": 39.98281182330013, "grad_norm": 0.4098987579345703, "learning_rate": 4.600171881766999e-05, "loss": 0.3564, "step": 5908500 }, { "epoch": 39.986195322650495, "grad_norm": 0.4133622348308563, "learning_rate": 4.600138046773495e-05, "loss": 0.3586, "step": 5909000 }, { "epoch": 39.989578822000865, "grad_norm": 0.3992837369441986, "learning_rate": 4.600104211779991e-05, "loss": 0.3566, "step": 5909500 }, { "epoch": 39.992962321351236, "grad_norm": 0.35246866941452026, "learning_rate": 4.600070376786488e-05, "loss": 0.3573, "step": 5910000 }, { "epoch": 39.9963458207016, "grad_norm": 0.3770292103290558, "learning_rate": 4.600036541792984e-05, "loss": 0.359, "step": 5910500 }, { "epoch": 39.99972932005197, "grad_norm": 0.39107683300971985, "learning_rate": 4.6000027067994805e-05, "loss": 0.3583, "step": 5911000 }, { "epoch": 40.0, "eval_accuracy": 0.8631902569762103, "eval_loss": 0.5551746487617493, "eval_runtime": 3370.1943, "eval_samples_per_second": 86.269, "eval_steps_per_second": 5.392, "step": 5911040 }, { "epoch": 40.00311281940234, "grad_norm": 0.3805144429206848, "learning_rate": 4.599968871805977e-05, "loss": 0.3567, "step": 5911500 }, { "epoch": 40.00649631875271, "grad_norm": 0.3924298882484436, "learning_rate": 4.5999350368124736e-05, "loss": 0.3553, "step": 5912000 }, { "epoch": 40.00987981810307, "grad_norm": 0.39365607500076294, "learning_rate": 4.59990120181897e-05, "loss": 0.3537, "step": 5912500 }, { "epoch": 40.01326331745344, "grad_norm": 0.4074643850326538, "learning_rate": 4.599867366825466e-05, "loss": 0.3564, "step": 5913000 }, { "epoch": 40.016646816803814, "grad_norm": 0.41037800908088684, "learning_rate": 4.5998335318319615e-05, "loss": 0.3563, "step": 5913500 }, { "epoch": 40.02003031615418, "grad_norm": 0.4109076261520386, "learning_rate": 4.5997996968384584e-05, "loss": 0.3548, "step": 5914000 }, { "epoch": 40.02341381550455, "grad_norm": 0.40716245770454407, "learning_rate": 4.5997658618449546e-05, "loss": 0.3569, "step": 5914500 }, { "epoch": 40.02679731485492, "grad_norm": 0.38791415095329285, "learning_rate": 4.599732026851451e-05, "loss": 0.3559, "step": 5915000 }, { "epoch": 40.03018081420528, "grad_norm": 0.37908267974853516, "learning_rate": 4.599698191857947e-05, "loss": 0.3556, "step": 5915500 }, { "epoch": 40.03356431355565, "grad_norm": 0.35474398732185364, "learning_rate": 4.599664356864444e-05, "loss": 0.357, "step": 5916000 }, { "epoch": 40.03694781290602, "grad_norm": 0.3792150616645813, "learning_rate": 4.59963052187094e-05, "loss": 0.3563, "step": 5916500 }, { "epoch": 40.04033131225639, "grad_norm": 0.41410353779792786, "learning_rate": 4.5995966868774364e-05, "loss": 0.3554, "step": 5917000 }, { "epoch": 40.043714811606755, "grad_norm": 0.3760049641132355, "learning_rate": 4.5995628518839326e-05, "loss": 0.3557, "step": 5917500 }, { "epoch": 40.047098310957125, "grad_norm": 0.3935118317604065, "learning_rate": 4.5995290168904295e-05, "loss": 0.3557, "step": 5918000 }, { "epoch": 40.050481810307495, "grad_norm": 0.40363773703575134, "learning_rate": 4.599495181896925e-05, "loss": 0.3558, "step": 5918500 }, { "epoch": 40.05386530965786, "grad_norm": 0.39092475175857544, "learning_rate": 4.599461346903421e-05, "loss": 0.3575, "step": 5919000 }, { "epoch": 40.05724880900823, "grad_norm": 0.41542938351631165, "learning_rate": 4.599427511909918e-05, "loss": 0.3577, "step": 5919500 }, { "epoch": 40.0606323083586, "grad_norm": 0.40621158480644226, "learning_rate": 4.599393676916414e-05, "loss": 0.3566, "step": 5920000 }, { "epoch": 40.06401580770896, "grad_norm": 0.3823866844177246, "learning_rate": 4.5993598419229105e-05, "loss": 0.3558, "step": 5920500 }, { "epoch": 40.06739930705933, "grad_norm": 0.38658443093299866, "learning_rate": 4.599326006929407e-05, "loss": 0.3573, "step": 5921000 }, { "epoch": 40.0707828064097, "grad_norm": 0.45442578196525574, "learning_rate": 4.5992921719359036e-05, "loss": 0.3562, "step": 5921500 }, { "epoch": 40.074166305760066, "grad_norm": 0.3641699552536011, "learning_rate": 4.5992583369424e-05, "loss": 0.3565, "step": 5922000 }, { "epoch": 40.077549805110436, "grad_norm": 0.41012468934059143, "learning_rate": 4.599224501948896e-05, "loss": 0.3564, "step": 5922500 }, { "epoch": 40.08093330446081, "grad_norm": 0.37911295890808105, "learning_rate": 4.5991906669553916e-05, "loss": 0.3568, "step": 5923000 }, { "epoch": 40.08431680381118, "grad_norm": 0.36601924896240234, "learning_rate": 4.5991568319618885e-05, "loss": 0.356, "step": 5923500 }, { "epoch": 40.08770030316154, "grad_norm": 0.3938429057598114, "learning_rate": 4.599122996968385e-05, "loss": 0.3571, "step": 5924000 }, { "epoch": 40.09108380251191, "grad_norm": 0.3910832703113556, "learning_rate": 4.599089161974881e-05, "loss": 0.3561, "step": 5924500 }, { "epoch": 40.09446730186228, "grad_norm": 0.41138482093811035, "learning_rate": 4.599055326981377e-05, "loss": 0.3568, "step": 5925000 }, { "epoch": 40.097850801212644, "grad_norm": 0.3618259131908417, "learning_rate": 4.599021491987874e-05, "loss": 0.3556, "step": 5925500 }, { "epoch": 40.101234300563014, "grad_norm": 0.39352527260780334, "learning_rate": 4.59898765699437e-05, "loss": 0.3564, "step": 5926000 }, { "epoch": 40.104617799913385, "grad_norm": 0.3567906320095062, "learning_rate": 4.5989538220008664e-05, "loss": 0.3577, "step": 5926500 }, { "epoch": 40.10800129926375, "grad_norm": 0.3813663423061371, "learning_rate": 4.5989199870073626e-05, "loss": 0.3568, "step": 5927000 }, { "epoch": 40.11138479861412, "grad_norm": 0.3681528866291046, "learning_rate": 4.5988861520138595e-05, "loss": 0.3551, "step": 5927500 }, { "epoch": 40.11476829796449, "grad_norm": 0.3969293534755707, "learning_rate": 4.598852317020355e-05, "loss": 0.3569, "step": 5928000 }, { "epoch": 40.11815179731485, "grad_norm": 0.38515737652778625, "learning_rate": 4.598818482026851e-05, "loss": 0.3562, "step": 5928500 }, { "epoch": 40.12153529666522, "grad_norm": 0.4067007601261139, "learning_rate": 4.598784647033348e-05, "loss": 0.3563, "step": 5929000 }, { "epoch": 40.12491879601559, "grad_norm": 0.36965522170066833, "learning_rate": 4.5987508120398444e-05, "loss": 0.3554, "step": 5929500 }, { "epoch": 40.12830229536596, "grad_norm": 0.40654975175857544, "learning_rate": 4.5987169770463406e-05, "loss": 0.3548, "step": 5930000 }, { "epoch": 40.131685794716326, "grad_norm": 0.4154300391674042, "learning_rate": 4.598683142052837e-05, "loss": 0.3564, "step": 5930500 }, { "epoch": 40.135069294066696, "grad_norm": 0.37950339913368225, "learning_rate": 4.598649307059334e-05, "loss": 0.357, "step": 5931000 }, { "epoch": 40.138452793417066, "grad_norm": 0.36879271268844604, "learning_rate": 4.59861547206583e-05, "loss": 0.3584, "step": 5931500 }, { "epoch": 40.14183629276743, "grad_norm": 0.4255102574825287, "learning_rate": 4.598581637072326e-05, "loss": 0.3574, "step": 5932000 }, { "epoch": 40.1452197921178, "grad_norm": 0.362420916557312, "learning_rate": 4.5985478020788217e-05, "loss": 0.3572, "step": 5932500 }, { "epoch": 40.14860329146817, "grad_norm": 0.4359753131866455, "learning_rate": 4.5985139670853185e-05, "loss": 0.3557, "step": 5933000 }, { "epoch": 40.15198679081853, "grad_norm": 0.41759398579597473, "learning_rate": 4.598480132091815e-05, "loss": 0.3578, "step": 5933500 }, { "epoch": 40.155370290168904, "grad_norm": 0.3773077726364136, "learning_rate": 4.598446297098311e-05, "loss": 0.3557, "step": 5934000 }, { "epoch": 40.158753789519274, "grad_norm": 0.3422764539718628, "learning_rate": 4.598412462104807e-05, "loss": 0.3578, "step": 5934500 }, { "epoch": 40.16213728886964, "grad_norm": 0.3767971992492676, "learning_rate": 4.598378627111304e-05, "loss": 0.3569, "step": 5935000 }, { "epoch": 40.16552078822001, "grad_norm": 0.3736751675605774, "learning_rate": 4.5983447921178e-05, "loss": 0.3577, "step": 5935500 }, { "epoch": 40.16890428757038, "grad_norm": 0.4079616367816925, "learning_rate": 4.5983109571242965e-05, "loss": 0.3569, "step": 5936000 }, { "epoch": 40.17228778692075, "grad_norm": 0.4015907645225525, "learning_rate": 4.598277122130793e-05, "loss": 0.3581, "step": 5936500 }, { "epoch": 40.17567128627111, "grad_norm": 0.3792710602283478, "learning_rate": 4.5982432871372896e-05, "loss": 0.3583, "step": 5937000 }, { "epoch": 40.17905478562148, "grad_norm": 0.44401755928993225, "learning_rate": 4.598209452143785e-05, "loss": 0.3567, "step": 5937500 }, { "epoch": 40.18243828497185, "grad_norm": 0.404691219329834, "learning_rate": 4.5981756171502813e-05, "loss": 0.3573, "step": 5938000 }, { "epoch": 40.185821784322215, "grad_norm": 0.3993310034275055, "learning_rate": 4.598141782156778e-05, "loss": 0.3577, "step": 5938500 }, { "epoch": 40.189205283672585, "grad_norm": 0.3728111684322357, "learning_rate": 4.5981079471632744e-05, "loss": 0.356, "step": 5939000 }, { "epoch": 40.192588783022956, "grad_norm": 0.37582162022590637, "learning_rate": 4.5980741121697707e-05, "loss": 0.3568, "step": 5939500 }, { "epoch": 40.19597228237332, "grad_norm": 0.3851401209831238, "learning_rate": 4.598040277176267e-05, "loss": 0.3568, "step": 5940000 }, { "epoch": 40.19935578172369, "grad_norm": 0.3895336985588074, "learning_rate": 4.598006442182764e-05, "loss": 0.3581, "step": 5940500 }, { "epoch": 40.20273928107406, "grad_norm": 0.39405518770217896, "learning_rate": 4.59797260718926e-05, "loss": 0.3572, "step": 5941000 }, { "epoch": 40.20612278042443, "grad_norm": 0.4513786733150482, "learning_rate": 4.597938772195756e-05, "loss": 0.3574, "step": 5941500 }, { "epoch": 40.20950627977479, "grad_norm": 0.36259347200393677, "learning_rate": 4.597904937202252e-05, "loss": 0.3565, "step": 5942000 }, { "epoch": 40.21288977912516, "grad_norm": 0.4183095693588257, "learning_rate": 4.5978711022087486e-05, "loss": 0.3562, "step": 5942500 }, { "epoch": 40.21627327847553, "grad_norm": 0.40985429286956787, "learning_rate": 4.597837267215245e-05, "loss": 0.3575, "step": 5943000 }, { "epoch": 40.2196567778259, "grad_norm": 0.3730085790157318, "learning_rate": 4.597803432221741e-05, "loss": 0.3571, "step": 5943500 }, { "epoch": 40.22304027717627, "grad_norm": 0.3655538260936737, "learning_rate": 4.597769597228237e-05, "loss": 0.3563, "step": 5944000 }, { "epoch": 40.22642377652664, "grad_norm": 0.3684774935245514, "learning_rate": 4.597735762234734e-05, "loss": 0.3582, "step": 5944500 }, { "epoch": 40.229807275877, "grad_norm": 0.3919091522693634, "learning_rate": 4.5977019272412303e-05, "loss": 0.3575, "step": 5945000 }, { "epoch": 40.23319077522737, "grad_norm": 0.37911686301231384, "learning_rate": 4.5976680922477266e-05, "loss": 0.3576, "step": 5945500 }, { "epoch": 40.23657427457774, "grad_norm": 0.36616331338882446, "learning_rate": 4.597634257254223e-05, "loss": 0.3588, "step": 5946000 }, { "epoch": 40.239957773928104, "grad_norm": 0.38871854543685913, "learning_rate": 4.59760042226072e-05, "loss": 0.3574, "step": 5946500 }, { "epoch": 40.243341273278475, "grad_norm": 0.40605083107948303, "learning_rate": 4.597566587267215e-05, "loss": 0.3576, "step": 5947000 }, { "epoch": 40.246724772628845, "grad_norm": 0.3829292058944702, "learning_rate": 4.5975327522737114e-05, "loss": 0.3564, "step": 5947500 }, { "epoch": 40.250108271979215, "grad_norm": 0.3808296322822571, "learning_rate": 4.597498917280208e-05, "loss": 0.3577, "step": 5948000 }, { "epoch": 40.25349177132958, "grad_norm": 0.3695087134838104, "learning_rate": 4.5974650822867045e-05, "loss": 0.3587, "step": 5948500 }, { "epoch": 40.25687527067995, "grad_norm": 0.3775908946990967, "learning_rate": 4.597431247293201e-05, "loss": 0.3573, "step": 5949000 }, { "epoch": 40.26025877003032, "grad_norm": 0.3715585172176361, "learning_rate": 4.597397412299697e-05, "loss": 0.3583, "step": 5949500 }, { "epoch": 40.26364226938068, "grad_norm": 0.3579839766025543, "learning_rate": 4.597363577306194e-05, "loss": 0.3568, "step": 5950000 }, { "epoch": 40.26702576873105, "grad_norm": 0.39917808771133423, "learning_rate": 4.59732974231269e-05, "loss": 0.358, "step": 5950500 }, { "epoch": 40.27040926808142, "grad_norm": 0.37289804220199585, "learning_rate": 4.597295907319186e-05, "loss": 0.3562, "step": 5951000 }, { "epoch": 40.273792767431786, "grad_norm": 0.39005935192108154, "learning_rate": 4.597262072325682e-05, "loss": 0.3594, "step": 5951500 }, { "epoch": 40.277176266782156, "grad_norm": 0.41513052582740784, "learning_rate": 4.597228237332179e-05, "loss": 0.3579, "step": 5952000 }, { "epoch": 40.28055976613253, "grad_norm": 0.35334670543670654, "learning_rate": 4.597194402338675e-05, "loss": 0.3571, "step": 5952500 }, { "epoch": 40.28394326548289, "grad_norm": 0.4273497760295868, "learning_rate": 4.597160567345171e-05, "loss": 0.3568, "step": 5953000 }, { "epoch": 40.28732676483326, "grad_norm": 0.35260331630706787, "learning_rate": 4.597126732351667e-05, "loss": 0.3584, "step": 5953500 }, { "epoch": 40.29071026418363, "grad_norm": 0.4094509482383728, "learning_rate": 4.597092897358164e-05, "loss": 0.357, "step": 5954000 }, { "epoch": 40.294093763534, "grad_norm": 0.3877350389957428, "learning_rate": 4.5970590623646604e-05, "loss": 0.3589, "step": 5954500 }, { "epoch": 40.297477262884364, "grad_norm": 0.35621377825737, "learning_rate": 4.5970252273711566e-05, "loss": 0.3567, "step": 5955000 }, { "epoch": 40.300860762234734, "grad_norm": 0.4190492331981659, "learning_rate": 4.596991392377653e-05, "loss": 0.358, "step": 5955500 }, { "epoch": 40.304244261585104, "grad_norm": 0.3819047212600708, "learning_rate": 4.59695755738415e-05, "loss": 0.3559, "step": 5956000 }, { "epoch": 40.30762776093547, "grad_norm": 0.3595086634159088, "learning_rate": 4.596923722390645e-05, "loss": 0.3569, "step": 5956500 }, { "epoch": 40.31101126028584, "grad_norm": 0.41916951537132263, "learning_rate": 4.5968898873971415e-05, "loss": 0.3573, "step": 5957000 }, { "epoch": 40.31439475963621, "grad_norm": 0.41138339042663574, "learning_rate": 4.5968560524036384e-05, "loss": 0.3587, "step": 5957500 }, { "epoch": 40.31777825898657, "grad_norm": 0.4318738579750061, "learning_rate": 4.5968222174101346e-05, "loss": 0.3554, "step": 5958000 }, { "epoch": 40.32116175833694, "grad_norm": 0.36461934447288513, "learning_rate": 4.596788382416631e-05, "loss": 0.3569, "step": 5958500 }, { "epoch": 40.32454525768731, "grad_norm": 0.3716467022895813, "learning_rate": 4.596754547423127e-05, "loss": 0.3572, "step": 5959000 }, { "epoch": 40.327928757037675, "grad_norm": 0.3580436408519745, "learning_rate": 4.596720712429623e-05, "loss": 0.3573, "step": 5959500 }, { "epoch": 40.331312256388046, "grad_norm": 0.414146363735199, "learning_rate": 4.59668687743612e-05, "loss": 0.3581, "step": 5960000 }, { "epoch": 40.334695755738416, "grad_norm": 0.42792972922325134, "learning_rate": 4.596653042442616e-05, "loss": 0.3579, "step": 5960500 }, { "epoch": 40.338079255088786, "grad_norm": 0.36569127440452576, "learning_rate": 4.596619207449112e-05, "loss": 0.3567, "step": 5961000 }, { "epoch": 40.34146275443915, "grad_norm": 0.3546809256076813, "learning_rate": 4.596585372455609e-05, "loss": 0.3582, "step": 5961500 }, { "epoch": 40.34484625378952, "grad_norm": 0.35243067145347595, "learning_rate": 4.596551537462105e-05, "loss": 0.3569, "step": 5962000 }, { "epoch": 40.34822975313989, "grad_norm": 0.39889785647392273, "learning_rate": 4.596517702468601e-05, "loss": 0.3569, "step": 5962500 }, { "epoch": 40.35161325249025, "grad_norm": 0.3862617611885071, "learning_rate": 4.5964838674750974e-05, "loss": 0.3576, "step": 5963000 }, { "epoch": 40.35499675184062, "grad_norm": 0.4139243960380554, "learning_rate": 4.596450032481594e-05, "loss": 0.3552, "step": 5963500 }, { "epoch": 40.358380251190994, "grad_norm": 0.414829283952713, "learning_rate": 4.5964161974880905e-05, "loss": 0.3565, "step": 5964000 }, { "epoch": 40.36176375054136, "grad_norm": 0.3791615068912506, "learning_rate": 4.596382362494587e-05, "loss": 0.356, "step": 5964500 }, { "epoch": 40.36514724989173, "grad_norm": 0.36732369661331177, "learning_rate": 4.596348527501083e-05, "loss": 0.3587, "step": 5965000 }, { "epoch": 40.3685307492421, "grad_norm": 0.4129006862640381, "learning_rate": 4.59631469250758e-05, "loss": 0.3588, "step": 5965500 }, { "epoch": 40.37191424859246, "grad_norm": 0.3940274715423584, "learning_rate": 4.596280857514075e-05, "loss": 0.3575, "step": 5966000 }, { "epoch": 40.37529774794283, "grad_norm": 0.3739898204803467, "learning_rate": 4.5962470225205715e-05, "loss": 0.3587, "step": 5966500 }, { "epoch": 40.3786812472932, "grad_norm": 0.39497530460357666, "learning_rate": 4.5962131875270684e-05, "loss": 0.3559, "step": 5967000 }, { "epoch": 40.38206474664357, "grad_norm": 0.40286335349082947, "learning_rate": 4.5961793525335646e-05, "loss": 0.3584, "step": 5967500 }, { "epoch": 40.385448245993935, "grad_norm": 0.36392825841903687, "learning_rate": 4.596145517540061e-05, "loss": 0.3581, "step": 5968000 }, { "epoch": 40.388831745344305, "grad_norm": 0.37698015570640564, "learning_rate": 4.596111682546557e-05, "loss": 0.3574, "step": 5968500 }, { "epoch": 40.392215244694675, "grad_norm": 0.4125801622867584, "learning_rate": 4.596077847553053e-05, "loss": 0.3584, "step": 5969000 }, { "epoch": 40.39559874404504, "grad_norm": 0.39844027161598206, "learning_rate": 4.59604401255955e-05, "loss": 0.3559, "step": 5969500 }, { "epoch": 40.39898224339541, "grad_norm": 0.42070603370666504, "learning_rate": 4.5960101775660464e-05, "loss": 0.3569, "step": 5970000 }, { "epoch": 40.40236574274578, "grad_norm": 0.38819003105163574, "learning_rate": 4.595976342572542e-05, "loss": 0.3571, "step": 5970500 }, { "epoch": 40.40574924209614, "grad_norm": 0.370553582906723, "learning_rate": 4.595942507579039e-05, "loss": 0.358, "step": 5971000 }, { "epoch": 40.40913274144651, "grad_norm": 0.4015754461288452, "learning_rate": 4.595908672585535e-05, "loss": 0.3579, "step": 5971500 }, { "epoch": 40.41251624079688, "grad_norm": 0.399566650390625, "learning_rate": 4.595874837592031e-05, "loss": 0.3571, "step": 5972000 }, { "epoch": 40.41589974014725, "grad_norm": 0.3622363805770874, "learning_rate": 4.5958410025985274e-05, "loss": 0.3583, "step": 5972500 }, { "epoch": 40.41928323949762, "grad_norm": 0.39378848671913147, "learning_rate": 4.595807167605024e-05, "loss": 0.3576, "step": 5973000 }, { "epoch": 40.42266673884799, "grad_norm": 0.39654675126075745, "learning_rate": 4.5957733326115205e-05, "loss": 0.3556, "step": 5973500 }, { "epoch": 40.42605023819836, "grad_norm": 0.404291570186615, "learning_rate": 4.595739497618017e-05, "loss": 0.3596, "step": 5974000 }, { "epoch": 40.42943373754872, "grad_norm": 0.39705920219421387, "learning_rate": 4.595705662624513e-05, "loss": 0.3579, "step": 5974500 }, { "epoch": 40.43281723689909, "grad_norm": 0.3815435767173767, "learning_rate": 4.59567182763101e-05, "loss": 0.3597, "step": 5975000 }, { "epoch": 40.43620073624946, "grad_norm": 0.3721045255661011, "learning_rate": 4.5956379926375054e-05, "loss": 0.358, "step": 5975500 }, { "epoch": 40.439584235599824, "grad_norm": 0.4162366986274719, "learning_rate": 4.5956041576440016e-05, "loss": 0.359, "step": 5976000 }, { "epoch": 40.442967734950194, "grad_norm": 0.4168601930141449, "learning_rate": 4.595570322650498e-05, "loss": 0.3573, "step": 5976500 }, { "epoch": 40.446351234300565, "grad_norm": 0.4040071666240692, "learning_rate": 4.595536487656995e-05, "loss": 0.3572, "step": 5977000 }, { "epoch": 40.44973473365093, "grad_norm": 0.3881406784057617, "learning_rate": 4.595502652663491e-05, "loss": 0.3575, "step": 5977500 }, { "epoch": 40.4531182330013, "grad_norm": 0.3883194029331207, "learning_rate": 4.595468817669987e-05, "loss": 0.3575, "step": 5978000 }, { "epoch": 40.45650173235167, "grad_norm": 0.3823226988315582, "learning_rate": 4.595434982676483e-05, "loss": 0.3573, "step": 5978500 }, { "epoch": 40.45988523170204, "grad_norm": 0.42660027742385864, "learning_rate": 4.59540114768298e-05, "loss": 0.3583, "step": 5979000 }, { "epoch": 40.4632687310524, "grad_norm": 0.3672007620334625, "learning_rate": 4.5953673126894764e-05, "loss": 0.3589, "step": 5979500 }, { "epoch": 40.46665223040277, "grad_norm": 0.4127280116081238, "learning_rate": 4.5953334776959727e-05, "loss": 0.3568, "step": 5980000 }, { "epoch": 40.47003572975314, "grad_norm": 0.39377361536026, "learning_rate": 4.595299642702469e-05, "loss": 0.3573, "step": 5980500 }, { "epoch": 40.473419229103506, "grad_norm": 0.3723832964897156, "learning_rate": 4.595265807708965e-05, "loss": 0.3572, "step": 5981000 }, { "epoch": 40.476802728453876, "grad_norm": 0.3649671673774719, "learning_rate": 4.595231972715461e-05, "loss": 0.3591, "step": 5981500 }, { "epoch": 40.480186227804246, "grad_norm": 0.38867637515068054, "learning_rate": 4.5951981377219575e-05, "loss": 0.3578, "step": 5982000 }, { "epoch": 40.48356972715461, "grad_norm": 0.4012649953365326, "learning_rate": 4.5951643027284544e-05, "loss": 0.3571, "step": 5982500 }, { "epoch": 40.48695322650498, "grad_norm": 0.38399559259414673, "learning_rate": 4.5951304677349506e-05, "loss": 0.3583, "step": 5983000 }, { "epoch": 40.49033672585535, "grad_norm": 0.4250212013721466, "learning_rate": 4.595096632741447e-05, "loss": 0.3578, "step": 5983500 }, { "epoch": 40.49372022520571, "grad_norm": 0.3713824152946472, "learning_rate": 4.595062797747943e-05, "loss": 0.3585, "step": 5984000 }, { "epoch": 40.497103724556084, "grad_norm": 0.4231489300727844, "learning_rate": 4.59502896275444e-05, "loss": 0.3586, "step": 5984500 }, { "epoch": 40.500487223906454, "grad_norm": 0.3801126778125763, "learning_rate": 4.5949951277609355e-05, "loss": 0.3569, "step": 5985000 }, { "epoch": 40.503870723256824, "grad_norm": 0.42874547839164734, "learning_rate": 4.594961292767432e-05, "loss": 0.3587, "step": 5985500 }, { "epoch": 40.50725422260719, "grad_norm": 0.3551200032234192, "learning_rate": 4.594927457773928e-05, "loss": 0.3578, "step": 5986000 }, { "epoch": 40.51063772195756, "grad_norm": 0.3460412323474884, "learning_rate": 4.594893622780425e-05, "loss": 0.3579, "step": 5986500 }, { "epoch": 40.51402122130793, "grad_norm": 0.3623094856739044, "learning_rate": 4.594859787786921e-05, "loss": 0.3586, "step": 5987000 }, { "epoch": 40.51740472065829, "grad_norm": 0.38421979546546936, "learning_rate": 4.594825952793417e-05, "loss": 0.3576, "step": 5987500 }, { "epoch": 40.52078822000866, "grad_norm": 0.36018890142440796, "learning_rate": 4.5947921177999134e-05, "loss": 0.3579, "step": 5988000 }, { "epoch": 40.52417171935903, "grad_norm": 0.4077075719833374, "learning_rate": 4.59475828280641e-05, "loss": 0.3588, "step": 5988500 }, { "epoch": 40.527555218709395, "grad_norm": 0.3461393415927887, "learning_rate": 4.5947244478129065e-05, "loss": 0.3592, "step": 5989000 }, { "epoch": 40.530938718059765, "grad_norm": 0.3685201108455658, "learning_rate": 4.594690612819403e-05, "loss": 0.3584, "step": 5989500 }, { "epoch": 40.534322217410136, "grad_norm": 0.37963569164276123, "learning_rate": 4.594656777825899e-05, "loss": 0.3567, "step": 5990000 }, { "epoch": 40.5377057167605, "grad_norm": 0.39100098609924316, "learning_rate": 4.594622942832395e-05, "loss": 0.3591, "step": 5990500 }, { "epoch": 40.54108921611087, "grad_norm": 0.40807783603668213, "learning_rate": 4.5945891078388914e-05, "loss": 0.3596, "step": 5991000 }, { "epoch": 40.54447271546124, "grad_norm": 0.3754607141017914, "learning_rate": 4.5945552728453876e-05, "loss": 0.3566, "step": 5991500 }, { "epoch": 40.54785621481161, "grad_norm": 0.38255998492240906, "learning_rate": 4.5945214378518845e-05, "loss": 0.3584, "step": 5992000 }, { "epoch": 40.55123971416197, "grad_norm": 0.3817797899246216, "learning_rate": 4.594487602858381e-05, "loss": 0.3567, "step": 5992500 }, { "epoch": 40.55462321351234, "grad_norm": 0.3684269189834595, "learning_rate": 4.594453767864877e-05, "loss": 0.3593, "step": 5993000 }, { "epoch": 40.55800671286271, "grad_norm": 0.3826868236064911, "learning_rate": 4.594419932871373e-05, "loss": 0.3575, "step": 5993500 }, { "epoch": 40.56139021221308, "grad_norm": 0.4091789126396179, "learning_rate": 4.59438609787787e-05, "loss": 0.3576, "step": 5994000 }, { "epoch": 40.56477371156345, "grad_norm": 0.41384512186050415, "learning_rate": 4.5943522628843655e-05, "loss": 0.3585, "step": 5994500 }, { "epoch": 40.56815721091382, "grad_norm": 0.38715091347694397, "learning_rate": 4.594318427890862e-05, "loss": 0.3583, "step": 5995000 }, { "epoch": 40.57154071026418, "grad_norm": 0.3721652925014496, "learning_rate": 4.594284592897358e-05, "loss": 0.3581, "step": 5995500 }, { "epoch": 40.57492420961455, "grad_norm": 0.4144269824028015, "learning_rate": 4.594250757903855e-05, "loss": 0.3577, "step": 5996000 }, { "epoch": 40.57830770896492, "grad_norm": 0.36702612042427063, "learning_rate": 4.594216922910351e-05, "loss": 0.3584, "step": 5996500 }, { "epoch": 40.58169120831529, "grad_norm": 0.41641584038734436, "learning_rate": 4.594183087916847e-05, "loss": 0.3571, "step": 5997000 }, { "epoch": 40.585074707665655, "grad_norm": 0.3495043218135834, "learning_rate": 4.5941492529233435e-05, "loss": 0.3571, "step": 5997500 }, { "epoch": 40.588458207016025, "grad_norm": 0.40029117465019226, "learning_rate": 4.5941154179298404e-05, "loss": 0.3572, "step": 5998000 }, { "epoch": 40.591841706366395, "grad_norm": 0.3930448293685913, "learning_rate": 4.5940815829363366e-05, "loss": 0.3572, "step": 5998500 }, { "epoch": 40.59522520571676, "grad_norm": 0.3824836015701294, "learning_rate": 4.594047747942833e-05, "loss": 0.3583, "step": 5999000 }, { "epoch": 40.59860870506713, "grad_norm": 0.36394089460372925, "learning_rate": 4.594013912949329e-05, "loss": 0.3584, "step": 5999500 }, { "epoch": 40.6019922044175, "grad_norm": 0.3454444408416748, "learning_rate": 4.593980077955825e-05, "loss": 0.3585, "step": 6000000 }, { "epoch": 40.60537570376786, "grad_norm": 0.3821978271007538, "learning_rate": 4.5939462429623214e-05, "loss": 0.3583, "step": 6000500 }, { "epoch": 40.60875920311823, "grad_norm": 0.38868358731269836, "learning_rate": 4.5939124079688176e-05, "loss": 0.358, "step": 6001000 }, { "epoch": 40.6121427024686, "grad_norm": 0.4017002582550049, "learning_rate": 4.5938785729753145e-05, "loss": 0.3577, "step": 6001500 }, { "epoch": 40.615526201818966, "grad_norm": 0.3536180853843689, "learning_rate": 4.593844737981811e-05, "loss": 0.358, "step": 6002000 }, { "epoch": 40.618909701169336, "grad_norm": 0.4217749238014221, "learning_rate": 4.593810902988307e-05, "loss": 0.3569, "step": 6002500 }, { "epoch": 40.62229320051971, "grad_norm": 0.3951268792152405, "learning_rate": 4.593777067994803e-05, "loss": 0.3565, "step": 6003000 }, { "epoch": 40.62567669987008, "grad_norm": 0.4119824469089508, "learning_rate": 4.5937432330013e-05, "loss": 0.3592, "step": 6003500 }, { "epoch": 40.62906019922044, "grad_norm": 0.3959159255027771, "learning_rate": 4.5937093980077956e-05, "loss": 0.3564, "step": 6004000 }, { "epoch": 40.63244369857081, "grad_norm": 0.41242820024490356, "learning_rate": 4.593675563014292e-05, "loss": 0.3597, "step": 6004500 }, { "epoch": 40.63582719792118, "grad_norm": 0.3618526756763458, "learning_rate": 4.593641728020788e-05, "loss": 0.3582, "step": 6005000 }, { "epoch": 40.639210697271544, "grad_norm": 0.3952378034591675, "learning_rate": 4.593607893027285e-05, "loss": 0.3558, "step": 6005500 }, { "epoch": 40.642594196621914, "grad_norm": 0.3555503785610199, "learning_rate": 4.593574058033781e-05, "loss": 0.3584, "step": 6006000 }, { "epoch": 40.645977695972284, "grad_norm": 0.41898027062416077, "learning_rate": 4.593540223040277e-05, "loss": 0.358, "step": 6006500 }, { "epoch": 40.64936119532265, "grad_norm": 0.3822486400604248, "learning_rate": 4.5935063880467735e-05, "loss": 0.3584, "step": 6007000 }, { "epoch": 40.65274469467302, "grad_norm": 0.42330479621887207, "learning_rate": 4.5934725530532704e-05, "loss": 0.3572, "step": 6007500 }, { "epoch": 40.65612819402339, "grad_norm": 0.42337852716445923, "learning_rate": 4.5934387180597666e-05, "loss": 0.358, "step": 6008000 }, { "epoch": 40.65951169337375, "grad_norm": 0.3900694251060486, "learning_rate": 4.593404883066263e-05, "loss": 0.3602, "step": 6008500 }, { "epoch": 40.66289519272412, "grad_norm": 0.38055336475372314, "learning_rate": 4.593371048072759e-05, "loss": 0.3565, "step": 6009000 }, { "epoch": 40.66627869207449, "grad_norm": 0.406038761138916, "learning_rate": 4.593337213079255e-05, "loss": 0.3581, "step": 6009500 }, { "epoch": 40.66966219142486, "grad_norm": 0.34239327907562256, "learning_rate": 4.5933033780857515e-05, "loss": 0.3581, "step": 6010000 }, { "epoch": 40.673045690775226, "grad_norm": 0.374510794878006, "learning_rate": 4.593269543092248e-05, "loss": 0.3576, "step": 6010500 }, { "epoch": 40.676429190125596, "grad_norm": 0.38726770877838135, "learning_rate": 4.5932357080987446e-05, "loss": 0.3565, "step": 6011000 }, { "epoch": 40.679812689475966, "grad_norm": 0.4068380296230316, "learning_rate": 4.593201873105241e-05, "loss": 0.3567, "step": 6011500 }, { "epoch": 40.68319618882633, "grad_norm": 0.42458081245422363, "learning_rate": 4.593168038111737e-05, "loss": 0.3585, "step": 6012000 }, { "epoch": 40.6865796881767, "grad_norm": 0.45075562596321106, "learning_rate": 4.593134203118233e-05, "loss": 0.3572, "step": 6012500 }, { "epoch": 40.68996318752707, "grad_norm": 0.39937084913253784, "learning_rate": 4.59310036812473e-05, "loss": 0.3569, "step": 6013000 }, { "epoch": 40.69334668687743, "grad_norm": 0.372530996799469, "learning_rate": 4.5930665331312256e-05, "loss": 0.3583, "step": 6013500 }, { "epoch": 40.6967301862278, "grad_norm": 0.3779999315738678, "learning_rate": 4.593032698137722e-05, "loss": 0.3575, "step": 6014000 }, { "epoch": 40.700113685578174, "grad_norm": 0.3948569595813751, "learning_rate": 4.592998863144218e-05, "loss": 0.3567, "step": 6014500 }, { "epoch": 40.70349718492854, "grad_norm": 0.4100547134876251, "learning_rate": 4.592965028150715e-05, "loss": 0.3575, "step": 6015000 }, { "epoch": 40.70688068427891, "grad_norm": 0.3982258439064026, "learning_rate": 4.592931193157211e-05, "loss": 0.3559, "step": 6015500 }, { "epoch": 40.71026418362928, "grad_norm": 0.41315528750419617, "learning_rate": 4.5928973581637074e-05, "loss": 0.3591, "step": 6016000 }, { "epoch": 40.71364768297965, "grad_norm": 0.3640074133872986, "learning_rate": 4.5928635231702036e-05, "loss": 0.3586, "step": 6016500 }, { "epoch": 40.71703118233001, "grad_norm": 0.3865445852279663, "learning_rate": 4.5928296881767005e-05, "loss": 0.3578, "step": 6017000 }, { "epoch": 40.72041468168038, "grad_norm": 0.40857845544815063, "learning_rate": 4.592795853183197e-05, "loss": 0.358, "step": 6017500 }, { "epoch": 40.72379818103075, "grad_norm": 0.45894935727119446, "learning_rate": 4.592762018189693e-05, "loss": 0.3594, "step": 6018000 }, { "epoch": 40.727181680381115, "grad_norm": 0.4257006347179413, "learning_rate": 4.592728183196189e-05, "loss": 0.357, "step": 6018500 }, { "epoch": 40.730565179731485, "grad_norm": 0.37430503964424133, "learning_rate": 4.592694348202685e-05, "loss": 0.3586, "step": 6019000 }, { "epoch": 40.733948679081855, "grad_norm": 0.3944646716117859, "learning_rate": 4.5926605132091815e-05, "loss": 0.3561, "step": 6019500 }, { "epoch": 40.73733217843222, "grad_norm": 0.3656405508518219, "learning_rate": 4.592626678215678e-05, "loss": 0.3596, "step": 6020000 }, { "epoch": 40.74071567778259, "grad_norm": 0.41020286083221436, "learning_rate": 4.5925928432221746e-05, "loss": 0.3579, "step": 6020500 }, { "epoch": 40.74409917713296, "grad_norm": 0.38027000427246094, "learning_rate": 4.592559008228671e-05, "loss": 0.36, "step": 6021000 }, { "epoch": 40.74748267648333, "grad_norm": 0.36818185448646545, "learning_rate": 4.592525173235167e-05, "loss": 0.3573, "step": 6021500 }, { "epoch": 40.75086617583369, "grad_norm": 0.38438427448272705, "learning_rate": 4.592491338241663e-05, "loss": 0.3564, "step": 6022000 }, { "epoch": 40.75424967518406, "grad_norm": 0.3644617795944214, "learning_rate": 4.5924575032481595e-05, "loss": 0.3586, "step": 6022500 }, { "epoch": 40.75763317453443, "grad_norm": 0.37247687578201294, "learning_rate": 4.592423668254656e-05, "loss": 0.3558, "step": 6023000 }, { "epoch": 40.7610166738848, "grad_norm": 0.35467490553855896, "learning_rate": 4.592389833261152e-05, "loss": 0.3568, "step": 6023500 }, { "epoch": 40.76440017323517, "grad_norm": 0.37191349267959595, "learning_rate": 4.592355998267648e-05, "loss": 0.3574, "step": 6024000 }, { "epoch": 40.76778367258554, "grad_norm": 0.37874293327331543, "learning_rate": 4.592322163274145e-05, "loss": 0.356, "step": 6024500 }, { "epoch": 40.7711671719359, "grad_norm": 0.37583380937576294, "learning_rate": 4.592288328280641e-05, "loss": 0.3575, "step": 6025000 }, { "epoch": 40.77455067128627, "grad_norm": 0.37207546830177307, "learning_rate": 4.5922544932871374e-05, "loss": 0.3574, "step": 6025500 }, { "epoch": 40.77793417063664, "grad_norm": 0.3862629532814026, "learning_rate": 4.5922206582936337e-05, "loss": 0.3574, "step": 6026000 }, { "epoch": 40.781317669987004, "grad_norm": 0.36246976256370544, "learning_rate": 4.5921868233001305e-05, "loss": 0.3579, "step": 6026500 }, { "epoch": 40.784701169337374, "grad_norm": 0.3949558734893799, "learning_rate": 4.592152988306627e-05, "loss": 0.3584, "step": 6027000 }, { "epoch": 40.788084668687745, "grad_norm": 0.4112015664577484, "learning_rate": 4.592119153313123e-05, "loss": 0.3582, "step": 6027500 }, { "epoch": 40.791468168038115, "grad_norm": 0.3892301917076111, "learning_rate": 4.592085318319619e-05, "loss": 0.3593, "step": 6028000 }, { "epoch": 40.79485166738848, "grad_norm": 0.3758805990219116, "learning_rate": 4.5920514833261154e-05, "loss": 0.3579, "step": 6028500 }, { "epoch": 40.79823516673885, "grad_norm": 0.40026286244392395, "learning_rate": 4.5920176483326116e-05, "loss": 0.3591, "step": 6029000 }, { "epoch": 40.80161866608922, "grad_norm": 0.37214037775993347, "learning_rate": 4.591983813339108e-05, "loss": 0.3591, "step": 6029500 }, { "epoch": 40.80500216543958, "grad_norm": 0.4223296642303467, "learning_rate": 4.591949978345604e-05, "loss": 0.3576, "step": 6030000 }, { "epoch": 40.80838566478995, "grad_norm": 0.3842328190803528, "learning_rate": 4.591916143352101e-05, "loss": 0.3572, "step": 6030500 }, { "epoch": 40.81176916414032, "grad_norm": 0.43461838364601135, "learning_rate": 4.591882308358597e-05, "loss": 0.358, "step": 6031000 }, { "epoch": 40.815152663490686, "grad_norm": 0.35343775153160095, "learning_rate": 4.5918484733650933e-05, "loss": 0.3571, "step": 6031500 }, { "epoch": 40.818536162841056, "grad_norm": 0.3892674744129181, "learning_rate": 4.5918146383715896e-05, "loss": 0.3584, "step": 6032000 }, { "epoch": 40.821919662191426, "grad_norm": 0.45603224635124207, "learning_rate": 4.5917808033780864e-05, "loss": 0.3582, "step": 6032500 }, { "epoch": 40.82530316154179, "grad_norm": 0.4169098138809204, "learning_rate": 4.591746968384582e-05, "loss": 0.3585, "step": 6033000 }, { "epoch": 40.82868666089216, "grad_norm": 0.3868551254272461, "learning_rate": 4.591713133391078e-05, "loss": 0.358, "step": 6033500 }, { "epoch": 40.83207016024253, "grad_norm": 0.40169456601142883, "learning_rate": 4.591679298397575e-05, "loss": 0.3567, "step": 6034000 }, { "epoch": 40.8354536595929, "grad_norm": 0.35913214087486267, "learning_rate": 4.591645463404071e-05, "loss": 0.3591, "step": 6034500 }, { "epoch": 40.838837158943264, "grad_norm": 0.38202446699142456, "learning_rate": 4.5916116284105675e-05, "loss": 0.3574, "step": 6035000 }, { "epoch": 40.842220658293634, "grad_norm": 0.3530464470386505, "learning_rate": 4.591577793417064e-05, "loss": 0.3592, "step": 6035500 }, { "epoch": 40.845604157644004, "grad_norm": 0.37851080298423767, "learning_rate": 4.5915439584235606e-05, "loss": 0.3578, "step": 6036000 }, { "epoch": 40.84898765699437, "grad_norm": 0.38370686769485474, "learning_rate": 4.591510123430057e-05, "loss": 0.3588, "step": 6036500 }, { "epoch": 40.85237115634474, "grad_norm": 0.382432758808136, "learning_rate": 4.591476288436553e-05, "loss": 0.3584, "step": 6037000 }, { "epoch": 40.85575465569511, "grad_norm": 0.37672939896583557, "learning_rate": 4.591442453443049e-05, "loss": 0.3587, "step": 6037500 }, { "epoch": 40.85913815504547, "grad_norm": 0.3865501582622528, "learning_rate": 4.5914086184495455e-05, "loss": 0.3568, "step": 6038000 }, { "epoch": 40.86252165439584, "grad_norm": 0.36493822932243347, "learning_rate": 4.591374783456042e-05, "loss": 0.3576, "step": 6038500 }, { "epoch": 40.86590515374621, "grad_norm": 0.38755717873573303, "learning_rate": 4.591340948462538e-05, "loss": 0.3582, "step": 6039000 }, { "epoch": 40.869288653096575, "grad_norm": 0.420865923166275, "learning_rate": 4.591307113469034e-05, "loss": 0.3578, "step": 6039500 }, { "epoch": 40.872672152446945, "grad_norm": 0.39818307757377625, "learning_rate": 4.591273278475531e-05, "loss": 0.3582, "step": 6040000 }, { "epoch": 40.876055651797316, "grad_norm": 0.39191457629203796, "learning_rate": 4.591239443482027e-05, "loss": 0.3581, "step": 6040500 }, { "epoch": 40.879439151147686, "grad_norm": 0.4268248975276947, "learning_rate": 4.5912056084885234e-05, "loss": 0.3585, "step": 6041000 }, { "epoch": 40.88282265049805, "grad_norm": 0.4153226315975189, "learning_rate": 4.5911717734950196e-05, "loss": 0.3571, "step": 6041500 }, { "epoch": 40.88620614984842, "grad_norm": 0.37966054677963257, "learning_rate": 4.5911379385015165e-05, "loss": 0.3587, "step": 6042000 }, { "epoch": 40.88958964919879, "grad_norm": 0.38498005270957947, "learning_rate": 4.591104103508012e-05, "loss": 0.3571, "step": 6042500 }, { "epoch": 40.89297314854915, "grad_norm": 0.3869180679321289, "learning_rate": 4.591070268514508e-05, "loss": 0.3583, "step": 6043000 }, { "epoch": 40.89635664789952, "grad_norm": 0.3785061836242676, "learning_rate": 4.591036433521005e-05, "loss": 0.3583, "step": 6043500 }, { "epoch": 40.899740147249894, "grad_norm": 0.42890506982803345, "learning_rate": 4.5910025985275014e-05, "loss": 0.3581, "step": 6044000 }, { "epoch": 40.90312364660026, "grad_norm": 0.3925100564956665, "learning_rate": 4.5909687635339976e-05, "loss": 0.3582, "step": 6044500 }, { "epoch": 40.90650714595063, "grad_norm": 0.35613325238227844, "learning_rate": 4.590934928540494e-05, "loss": 0.3583, "step": 6045000 }, { "epoch": 40.909890645301, "grad_norm": 0.3742819130420685, "learning_rate": 4.590901093546991e-05, "loss": 0.3591, "step": 6045500 }, { "epoch": 40.91327414465137, "grad_norm": 0.38186126947402954, "learning_rate": 4.590867258553487e-05, "loss": 0.358, "step": 6046000 }, { "epoch": 40.91665764400173, "grad_norm": 0.3733210861682892, "learning_rate": 4.590833423559983e-05, "loss": 0.3575, "step": 6046500 }, { "epoch": 40.9200411433521, "grad_norm": 0.36450719833374023, "learning_rate": 4.5907995885664786e-05, "loss": 0.3583, "step": 6047000 }, { "epoch": 40.92342464270247, "grad_norm": 0.34866660833358765, "learning_rate": 4.5907657535729755e-05, "loss": 0.3581, "step": 6047500 }, { "epoch": 40.926808142052835, "grad_norm": 0.42558640241622925, "learning_rate": 4.590731918579472e-05, "loss": 0.3573, "step": 6048000 }, { "epoch": 40.930191641403205, "grad_norm": 0.4213411211967468, "learning_rate": 4.590698083585968e-05, "loss": 0.3578, "step": 6048500 }, { "epoch": 40.933575140753575, "grad_norm": 0.4106290638446808, "learning_rate": 4.590664248592464e-05, "loss": 0.3568, "step": 6049000 }, { "epoch": 40.93695864010394, "grad_norm": 0.39277195930480957, "learning_rate": 4.590630413598961e-05, "loss": 0.3575, "step": 6049500 }, { "epoch": 40.94034213945431, "grad_norm": 0.42475879192352295, "learning_rate": 4.590596578605457e-05, "loss": 0.3567, "step": 6050000 }, { "epoch": 40.94372563880468, "grad_norm": 0.40933966636657715, "learning_rate": 4.5905627436119535e-05, "loss": 0.3582, "step": 6050500 }, { "epoch": 40.94710913815504, "grad_norm": 0.391244500875473, "learning_rate": 4.59052890861845e-05, "loss": 0.3585, "step": 6051000 }, { "epoch": 40.95049263750541, "grad_norm": 0.38254618644714355, "learning_rate": 4.5904950736249466e-05, "loss": 0.3585, "step": 6051500 }, { "epoch": 40.95387613685578, "grad_norm": 0.3931604027748108, "learning_rate": 4.590461238631442e-05, "loss": 0.3587, "step": 6052000 }, { "epoch": 40.95725963620615, "grad_norm": 0.42496395111083984, "learning_rate": 4.590427403637938e-05, "loss": 0.3561, "step": 6052500 }, { "epoch": 40.960643135556516, "grad_norm": 0.4158755838871002, "learning_rate": 4.590393568644435e-05, "loss": 0.3579, "step": 6053000 }, { "epoch": 40.96402663490689, "grad_norm": 0.36279532313346863, "learning_rate": 4.5903597336509314e-05, "loss": 0.3579, "step": 6053500 }, { "epoch": 40.96741013425726, "grad_norm": 0.3945808708667755, "learning_rate": 4.5903258986574276e-05, "loss": 0.3572, "step": 6054000 }, { "epoch": 40.97079363360762, "grad_norm": 0.3513178527355194, "learning_rate": 4.590292063663924e-05, "loss": 0.3591, "step": 6054500 }, { "epoch": 40.97417713295799, "grad_norm": 0.38702842593193054, "learning_rate": 4.590258228670421e-05, "loss": 0.3578, "step": 6055000 }, { "epoch": 40.97756063230836, "grad_norm": 0.40155449509620667, "learning_rate": 4.590224393676917e-05, "loss": 0.3575, "step": 6055500 }, { "epoch": 40.980944131658724, "grad_norm": 0.3826092481613159, "learning_rate": 4.590190558683413e-05, "loss": 0.3585, "step": 6056000 }, { "epoch": 40.984327631009094, "grad_norm": 0.3890673816204071, "learning_rate": 4.590156723689909e-05, "loss": 0.3573, "step": 6056500 }, { "epoch": 40.987711130359465, "grad_norm": 0.4273022711277008, "learning_rate": 4.5901228886964056e-05, "loss": 0.359, "step": 6057000 }, { "epoch": 40.99109462970983, "grad_norm": 0.35231855511665344, "learning_rate": 4.590089053702902e-05, "loss": 0.3567, "step": 6057500 }, { "epoch": 40.9944781290602, "grad_norm": 0.3643796443939209, "learning_rate": 4.590055218709398e-05, "loss": 0.3573, "step": 6058000 }, { "epoch": 40.99786162841057, "grad_norm": 0.39727750420570374, "learning_rate": 4.590021383715894e-05, "loss": 0.3579, "step": 6058500 }, { "epoch": 41.0, "eval_accuracy": 0.8634818976166824, "eval_loss": 0.5537328124046326, "eval_runtime": 3396.4168, "eval_samples_per_second": 85.603, "eval_steps_per_second": 5.35, "step": 6058816 } ], "logging_steps": 500, "max_steps": 73888000, "num_input_tokens_seen": 0, "num_train_epochs": 500, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.065980816103834e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }